/
main.py
104 lines (92 loc) · 3.62 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
__author__ = 'Ariel'
import time
import argparse
import readHelper
import writeHelper
import model
import memory
import BiCluster
import BiCF
import pickle
def main():
    """Command-line driver for the collaborative-filtering experiments.

    With ``-t`` the fixed configuration in ``runTest()`` is executed and every
    other option is ignored.  Otherwise one experiment is run on
    ``train.csv`` / ``dev.csv`` and its predictions are written to a file
    named after the chosen options, e.g. ``memory-10-dotp-mean-pcc.txt``.

    Options:
        -t  run ``runTest()`` instead of an experiment.
        -m  ``memory`` or ``model``.
        -k  neighbourhood size: 10, 100 or 500.
        -s  similarity metric: ``dotp`` or ``cosine``.
        -w  how neighbour ratings are combined: ``mean`` or ``weight``.
        -p  use the ``pcc*`` predictor variants (adds ``-pcc`` to the name).
        -b  use the ``BiCF`` predictors (adds ``-bi`` to the name).

    ``-m``, ``-k``, ``-s`` and ``-w`` are all required unless ``-t`` is given;
    a missing one ends the program with an argparse usage error (exit status
    2) rather than a ``TypeError`` while building the output file name.
    """
    start_time = time.time()

    parser = argparse.ArgumentParser(description="collaborative filtering")
    parser.add_argument("-t", help="if running test", action='store_true', default=False)
    parser.add_argument("-m", help="memory-based or model collaborative filtering", choices=['memory', 'model'])
    parser.add_argument("-k", help="number of k nearest neighborhood", type=int, choices=[10, 100, 500])
    parser.add_argument("-s", help="similarity metric used for knn ", choices=['dotp', 'cosine'])
    parser.add_argument("-w", help="approach for combining prediction given knn", choices=['mean', 'weight'])
    parser.add_argument("-p", help="if standardization used", action='store_true', default=False)
    parser.add_argument("-b", help="if bipartite clustering userd", action="store_true", default=False)
    args = parser.parse_args()
    # Single-argument print(...) produces the same output on Python 2 and 3.
    print(args)

    if args.t:
        runTest()
        return

    # These options have no defaults; fail early with a readable message
    # instead of crashing on None further down.
    supplied = (('-m', args.m), ('-k', args.k), ('-s', args.s), ('-w', args.w))
    missing = [flag for flag, value in supplied if value is None]
    if missing:
        parser.error('%s required unless -t is given' % ', '.join(missing))

    output = "-".join([args.m, str(args.k), args.s, args.w])
    if args.p:
        output += '-pcc'
    if args.b:
        output += '-bi'
    output += '.txt'

    train = 'train.csv'
    dev = 'dev.csv'

    if args.b:
        # Bipartite mode: -m only selects the item- or user-side predictor;
        # -p affects nothing here except the output file name.
        M = readHelper.readTrainModel(train)
        tuples = readHelper.readQueryModel(dev)
        predict = BiCF.bi_item if args.m == 'model' else BiCF.bi_user
        pred = predict(M, tuples, args.k, args.s, args.w)
        writeHelper.writePredModel(output, pred)
    elif args.m == 'memory':
        M = readHelper.readTrainMemory(train)
        query, tuples = readHelper.readQueryMemory(dev)
        predict = memory.pccMemoryCF if args.p else memory.memoryCF
        pred = predict(M, query, args.k, args.s, args.w)
        writeHelper.writePredMemory(output, pred, tuples)
    else:
        # args.m == 'model' (argparse restricts -m to the two choices).
        M = readHelper.readTrainModel(train)
        tuples = readHelper.readQueryModel(dev)
        predict = model.pccModelCF if args.p else model.modelCF
        pred = predict(M, tuples, args.k, args.s, args.w)
        writeHelper.writePredModel(output, pred)

    print('time: %s seconds' % (time.time() - start_time))
def getBiCluster(k1, k2):
    """Run ``BiCluster.BiClustering`` on ``train.csv`` and pickle its results.

    The training data is loaded with ``readHelper.readTrainModel`` and the
    four values returned by ``BiClustering`` are pickled to ``u2U.txt``,
    ``m2M.txt``, ``u2Mcluster.txt`` and ``m2Ucluster.txt`` in the current
    directory.

    Args:
        k1: forwarded unchanged as the second argument of ``BiClustering``.
        k2: forwarded unchanged as the third argument of ``BiClustering``.
            (Presumably the two cluster counts -- confirm against BiCluster.)

    Returns:
        Seconds spent inside the ``BiClustering`` call, excluding data
        loading and file writing.
    """
    train = 'train.csv'
    M = readHelper.readTrainModel(train)

    start_time = time.time()
    u2U, m2M, u2Mcluster, m2Ucluster = BiCluster.BiClustering(M, k1, k2)
    ClusterTime = time.time() - start_time
    print('time: %s seconds' % ClusterTime)

    # Pickle streams are bytes: open the files in binary mode so the dump
    # works on Python 3 and is not altered by newline translation.
    results = (
        ('u2U.txt', u2U),
        ('m2M.txt', m2M),
        ('u2Mcluster.txt', u2Mcluster),
        ('m2Ucluster.txt', m2Ucluster),
    )
    for filename, obj in results:
        with open(filename, 'wb') as f:
            pickle.dump(obj, f)

    # Second timing includes the time spent writing the files.
    print('time: %s seconds' % (time.time() - start_time))
    print(ClusterTime)
    return ClusterTime
def runTest():
    """Produce predictions for ``test.csv`` with one fixed configuration.

    Uses ``memory.pccMemoryCF`` with k = 100, the ``dotp`` similarity and
    ``mean`` combination, reading ``train.csv`` and writing the result to
    ``prediciton.txt``.  Returns ``None``.
    """
    neighbours = 100
    similarity = 'dotp'
    combine = 'mean'

    ratings = readHelper.readTrainMemory('train.csv')
    query, tuples = readHelper.readQueryMemory('test.csv')
    predictions = memory.pccMemoryCF(ratings, query, neighbours, similarity, combine)
    writeHelper.writePredMemory('prediciton.txt', predictions, tuples)
# Run the command-line driver only when this file is executed as a script.
if __name__ == "__main__":
    main()
    # One-off alternative that precomputes and pickles the bi-clusters:
    # getBiCluster(3000, 1500)