-
Notifications
You must be signed in to change notification settings - Fork 0
/
calcsim.py
82 lines (73 loc) · 2.06 KB
/
calcsim.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
def main(model_, bnum_, tnum_, train_, pas_):
    """Project each row's text into LSI topic space and write one CSV row per input row.

    Loads a saved gensim dictionary and LSI model from fixed locations under
    ``pas_``, reads ``ks/NVrevrawsent.csv`` (header row skipped), and writes
    ``model/hoge.csv`` containing six metadata columns followed by ``tnum_``
    topic-weight columns.

    Args:
        model_: Accepted for interface compatibility; currently unused (the
            LSI model path is hard-coded).
        bnum_: Accepted for interface compatibility; currently unused.
        tnum_: Number of topic columns to emit; converted with ``int``.
        train_: Accepted for interface compatibility; currently unused.
        pas_: Base directory prefix, concatenated as-is with the relative
            paths (so it must end with a path separator).

    Raises:
        IndexError: If an input row has fewer than 7 columns, or the model
            returns a topic id that is not below ``tnum_``.

    Note:
        Written for Python 2: the output CSV is opened in ``"wb"`` mode as
        the Python 2 ``csv`` module requires.
    """
    from gensim import corpora, models
    import csv
    import textedit
    import time

    pas = str(pas_)
    tnum = int(tnum_)

    print("start %s" % time.ctime())

    # Dictionary and model locations are fixed; only the base directory varies.
    dictionary = corpora.Dictionary.load(pas + "over4/corpus/nNVrev_o4b6.dict")
    lsi = models.LsiModel.load(pas + "over4/model/nNVrevo4b6_t300.lsi")

    # Topic columns are named t0, t1, ... The original zero-padding width
    # (len(str(tnum)) / 10) always evaluated to 0 under Python 2 integer
    # division, so names were never padded; they stay unpadded here to keep
    # the CSV header stable for existing consumers.
    header = ["rev_id", "user_id", "bus_id", "stars", "sentnum", "date"]
    header.extend("t" + str(num) for num in range(tnum))

    # Context managers guarantee both files are flushed and closed even when
    # a malformed row raises part-way through the run.
    with open(pas + "model/hoge.csv", "wb") as wfile, \
            open(pas + "ks/NVrevrawsent.csv", "r") as ifile:
        writer = csv.writer(wfile)
        writer.writerow(header)

        rows = csv.reader(ifile)
        # Skip the input header; the default avoids StopIteration on an
        # empty file.
        next(rows, None)

        # Input layout follows NVreview.csv:
        # [review_id, user_id, business_id, stars, date, texts], with the
        # raw-sentence file carrying one extra column at index 6, which is
        # written under the "sentnum" header.
        for count, line in enumerate(rows, 1):
            meta = [line[0], line[1], line[2], line[3], line[6], line[4]]

            # textedit.textedit is a project helper; presumably it
            # normalises the raw text - confirm against that module.
            doc = textedit.textedit(line[5])
            vec_bow = dictionary.doc2bow(doc.lower().split())

            # The model returns sparse (topic_id, weight) pairs; expand them
            # to a dense row so every CSV line has the same width.
            weights = [0] * tnum
            for topic_id, weight in lsi[vec_bow]:
                weights[topic_id] = weight

            writer.writerow(meta + weights)

            # Progress heartbeat every 1000 rows.
            if count % 1000 == 0:
                print("%s %s" % (count, time.ctime()))

    print("fin %s" % time.ctime())
if __name__ == '__main__':
    # Hard-coded run configuration, listed in main()'s positional order.
    # Each parameter's label is echoed to stdout before main() is invoked.
    run_settings = (
        ("model", "lsi"),
        ("bnum", 6),
        ("tnum", 300),
        ("train", "nNVreview"),
        ("pas", "D:/Lresult/"),
    )
    for label, _unused in run_settings:
        print(label)
    main(*[value for _label, value in run_settings])