def main(): from gensim import corpora, models, similarities import csv import textedit import time import collections import glob stopfile = open("D:/Lresult/NV_s5/subrev_1000.csv", "r") stopdata = csv.reader(stopfile) stoplist = collections.Counter() for line in stopdata: stoplist[line[0]] = 1 stopfile.close() buslist = glob.glob("D:/Lresult/NVbus/*") print len(buslist) ##make document bow for bus in buslist: dfile = open(bus, "r") ddata = csv.reader(dfile) dlist = [] busname = "" for line in ddata: busname = line[2] if (line[0] not in stoplist): te = line[5] doc = textedit.textedit(te) dlist.append(doc) dfile.close() texts = [[word for word in document.lower().split()] for document in dlist] dictionary = corpora.Dictionary(texts) dictionary.save_as_text("D:/Lresult/NVbusdict/" + busname + ".dict")
def main(pas_): import collections import csv import textedit import time pas = str(pas_) print "mkcorpus_start", time.ctime() ##make documents dnum = 0 subfile = open(pas + "subrev_1000.csv", "r") subdata = csv.reader(subfile) subdata.next() for line in subdata: te = line[5] doc = textedit.textedit(te) dlist = doc.split() wlist = collections.Counter() for t in dlist: wlist[t] = wlist[t] + 1 wfile = open(pas + "subrevbow/" + line[0] + ".csv", "wb") wri = csv.writer(wfile) wri.writerow(["word", "num"]) wri.writerows(wlist.items()) wfile.close() subfile.close()
def main(): from gensim import corpora, models, similarities import csv import textedit import time import collections import glob stopfile=open("D:/Lresult/NV_s5/subrev_1000.csv","r") stopdata=csv.reader(stopfile) stoplist=collections.Counter() for line in stopdata: stoplist[line[0]]=1 stopfile.close() buslist=glob.glob("D:/Lresult/NVbus/*") print len(buslist) ##make document bow for bus in buslist: dfile=open(bus,"r") ddata=csv.reader(dfile) dlist=[] busname="" for line in ddata: busname=line[2] if(line[0] not in stoplist): te=line[5] doc=textedit.textedit(te) dlist.append(doc) dfile.close() texts = [[word for word in document.lower().split()] for document in dlist] dictionary = corpora.Dictionary(texts) dictionary.save_as_text("D:/Lresult/NVbusdict/"+busname+".dict")
def main(pas_): import collections import csv import textedit import time pas=str(pas_) print "mkcorpus_start",time.ctime() ##make documents dnum=0 subfile=open(pas+"subrev_1000.csv","r") subdata=csv.reader(subfile) subdata.next() for line in subdata: te=line[5] doc=textedit.textedit(te) dlist=doc.split() wlist=collections.Counter() for t in dlist: wlist[t]=wlist[t]+1 wfile=open(pas+"subrevbow/"+line[0]+".csv","wb") wri=csv.writer(wfile) wri.writerow(["word","num"]) wri.writerows(wlist.items()) wfile.close() subfile.close()
def main(model_,bnum_,tnum_,train_): from gensim import corpora, models, similarities import csv import textedit import time pas="" print "mkcorpus_start",time.ctime() # remove common words and tokenize #stopfile=open("stopwords_en.csv","r") #stopdata=csv.reader(stopfile) #stoplist=[] #for line in stopdata: # stoplist.append(line[0]) #stopfile.close() #print stoplist ##make documents dlist=[] dfile=open("notNVreview.csv","r") #dfile=open("testrev.csv") ddata=csv.reader(dfile) dnum=0 for line in ddata: te=line[0] doc=textedit.textedit(te) dlist.append(doc) dnum=dnum+1 if(dnum%10000==0): print dnum, dfile.close() #print dlist print "dfile fin",time.ctime() texts = [[word for word in document.lower().split()] for document in dlist] print "text fin",time.ctime() # remove words that appear only once #all_tokens = sum(texts, []) #tokens_once = set(word for word in set(all_tokens) if all_tokens.count(word) == 1) #texts = [[word for word in text if word not in tokens_once] for text in texts] #print(texts) #print("texts fin") dictionary = corpora.Dictionary(texts) print "dictionary fin",time.ctime(),len(dictionary.token2id) dictionary.filter_extremes(no_below=10,no_above=0.5) print "dictionary cut fin",time.ctime(),len(dictionary.token2id) dictionary.save("nNVreview.dict") dictionary.save_as_text(pas+"nNVreview_text.dict") corpus=[dictionary.doc2bow(text) for text in texts] corpora.MmCorpus.serialize(pas+"nNVreview.mm", corpus) #print corpus print "mk_corpus fin",time.ctime()
def main(clus_): from gensim import corpora, models, similarities import csv import textedit import time pas = "******" clus = str(clus_) print "mkcorpus_start", time.ctime() # remove common words and tokenize #stopfile=open("stopwords_en.csv","r") #stopdata=csv.reader(stopfile) #stoplist=[] #for line in stopdata: # stoplist.append(line[0]) #stopfile.close() #print stoplist ##make documents for clus in range(0, 10): clus = str(clus) dlist = [] dfile = open( "D:/Lresult/ks/o4b6t500LDAbus_p/4bEjOyTaDG24SY5TxsaUNQ.csv", "r") #dfile=open("testrev.csv") ddata = csv.reader(dfile) ddata.next() dnum = 0 for line in ddata: if (line[6] == clus): te = line[5] doc = textedit.textedit(te) dlist.append(doc) dfile.close() print "dfile fin", time.ctime() texts = [[word for word in document.lower().split()] for document in dlist] print "text fin", time.ctime() # remove words that appear only once #all_tokens = sum(texts, []) #tokens_once = set(word for word in set(all_tokens) if all_tokens.count(word) == 1) #texts = [[word for word in text if word not in tokens_once] for text in texts] #print(texts) #print("texts fin") dictionary = corpora.Dictionary(texts) print "dictionary fin", time.ctime(), len(dictionary.token2id) #dictionary.filter_extremes(no_below=10,no_above=0.5) #print "dictionary cut fin",time.ctime(),len(dictionary.token2id) #dictionary.save("clus"+clus+".dict") dictionary.save_as_text(pas + "clus" + clus + "_text.dict")
def main(clus_): from gensim import corpora, models, similarities import csv import textedit import time pas="******" clus=str(clus_) print "mkcorpus_start",time.ctime() # remove common words and tokenize #stopfile=open("stopwords_en.csv","r") #stopdata=csv.reader(stopfile) #stoplist=[] #for line in stopdata: # stoplist.append(line[0]) #stopfile.close() #print stoplist ##make documents for clus in range(0,10): clus=str(clus) dlist=[] dfile=open("D:/Lresult/ks/o4b6t500LDAbus_p/4bEjOyTaDG24SY5TxsaUNQ.csv","r") #dfile=open("testrev.csv") ddata=csv.reader(dfile) ddata.next() dnum=0 for line in ddata: if(line[6]==clus): te=line[5] doc=textedit.textedit(te) dlist.append(doc) dfile.close() print "dfile fin",time.ctime() texts = [[word for word in document.lower().split()] for document in dlist] print "text fin",time.ctime() # remove words that appear only once #all_tokens = sum(texts, []) #tokens_once = set(word for word in set(all_tokens) if all_tokens.count(word) == 1) #texts = [[word for word in text if word not in tokens_once] for text in texts] #print(texts) #print("texts fin") dictionary = corpora.Dictionary(texts) print "dictionary fin",time.ctime(),len(dictionary.token2id) #dictionary.filter_extremes(no_below=10,no_above=0.5) #print "dictionary cut fin",time.ctime(),len(dictionary.token2id) #dictionary.save("clus"+clus+".dict") dictionary.save_as_text(pas+"clus"+clus+"_text.dict")
def main():
    """Save a one-document gensim dictionary for every review row of
    subrev_1000.csv, keyed by review id (column 0).

    Bug fixed: the original referenced an undefined name ``document``
    (NameError at runtime) and passed a flat token list to
    corpora.Dictionary, which would have treated each word as a document
    and split it into characters. The tokenized review is now wrapped as a
    single-document corpus.
    """
    from gensim import corpora
    import csv
    import textedit

    stopfile = open("D:/Lresult/NV_s5/subrev_1000.csv", "r")
    stopdata = csv.reader(stopfile)
    for line in stopdata:
        doc = textedit.textedit(line[5])
        # Dictionary expects an iterable of token lists: one document here.
        texts = [doc.lower().split()]
        dictionary = corpora.Dictionary(texts)
        dictionary.save_as_text("D:/Lresult/NV_s5/subrevdict/" + line[0] + ".dict")
    stopfile.close()
def main(): import csv import textedit import time import collections import glob stopfile = open("D:/Lresult/NV_s5/subrev_1000.csv", "r") stopdata = csv.reader(stopfile) stoplist = collections.Counter() for line in stopdata: stoplist[line[0]] = 1 stopfile.close() buslist = glob.glob("D:/Lresult/NVbus/*") print len(buslist) ##make document bow for bus in buslist: dfile = open(bus, "r") ddata = csv.reader(dfile) busname = "" wlist = collections.Counter() for line in ddata: busname = line[2] if (line[0] not in stoplist): te = line[5] doc = textedit.textedit(te) dlist = doc.split() for t in dlist: wlist[t] = wlist[t] + 1 wfile = open("D:/Lresult/NVbusbow/" + busname + ".csv", "wb") writer = csv.writer(wfile) writer.writerow(["word", "num"]) writer.writerows(wlist.items()) wfile.close() dfile.close()
def main(): import csv import textedit import time import collections import glob stopfile=open("D:/Lresult/NV_s5/subrev_1000.csv","r") stopdata=csv.reader(stopfile) stoplist=collections.Counter() for line in stopdata: stoplist[line[0]]=1 stopfile.close() buslist=glob.glob("D:/Lresult/NVbus/*") print len(buslist) ##make document bow for bus in buslist: dfile=open(bus,"r") ddata=csv.reader(dfile) busname="" wlist=collections.Counter() for line in ddata: busname=line[2] if(line[0] not in stoplist): te=line[5] doc=textedit.textedit(te) dlist=doc.split() for t in dlist: wlist[t]=wlist[t]+1 wfile=open("D:/Lresult/NVbusbow/"+busname+".csv","wb") writer=csv.writer(wfile) writer.writerow(["word","num"]) writer.writerows(wlist.items()) wfile.close() dfile.close()
# ---- top-level script: loads reviews, tokenizes them and loads a saved
# dictionary plus a stopword list. The script is truncated at the end of
# this view (slist is populated below it). ----
from gensim import corpora, models, similarities
import csv
import textedit
import time

# NOTE(review): looks like a redacted base-path placeholder — confirm.
pas="******"
print "start",time.ctime()
dlist=[]
dfile=open(pas+"nNVreview.csv","r")
ddata=csv.reader(dfile)
dnum=0
for line in ddata:
    te=line[0]
    doc=textedit.textedit(te)
    dlist.append(doc)
    dnum=dnum+1
    # Progress marker every 100k rows (trailing comma: same output line).
    if(dnum%100000==0):
        print dnum,
dfile.close()
#print dlist
print "dfile fin",time.ctime()
texts = [[word for word in document.lower().split()] for document in dlist]
print "text fin",len(dlist),time.ctime()
##dictionary_load
dictionary=corpora.Dictionary.load(pas+"/corpus_pl/nNVreviewpl.dict")
####stopword_load
sfile=open(pas+"stopwords/over4word.csv","r")
sdata=csv.reader(sfile)
slist=[]
# NOTE(review): script continues beyond this view; slist is filled later.
def main(cluster,fwords): import csv import os import collections import textedit import numpy busname="4bEjOyTaDG24SY5TxsaUNQ" pas="******" pas2="//kaede/PPTShare/masafumi/musc/20151117" fewords=int(fwords)####feature word num cnum=int(cluster)###cluster_num ifile=open(pas+"/ks/busclus/"+busname+".csv","r") idata=csv.reader(ifile) idata.next() cluslist=collections.Counter() for line in idata: ####revid,sentiment_num,clus cluslist[line[0],line[1]]=int(line[2]) ifile.close() ifile=open(pas2+"/ks/bussent/"+busname+".csv","r") idata=csv.reader(ifile) wlist=collections.Counter()####word dictionary for each cluster for num in range(0,cnum): wlist[num]=collections.Counter() for line in idata: doc=textedit.textedit(line[5]) doc=doc.split() for t in doc: wlist[cluslist[line[0],line[6]]][t]=wlist[cluslist[line[0],line[6]]][t]+1 ifile.close() tflist=collections.Counter() wordlen=[] wordsum=collections.Counter() for t in range(0,cnum): wordlen.append(len(wlist[t])) wordsum[t]=sum(wlist[t].values()) for w in wlist[t]: tflist[w]=tflist[w]+1 print wordlen,max(wordlen) wfile=open(pas+"/ks/busclusword/"+busname+".csv","wb") writer=csv.writer(wfile) header=[] for num in map(str,range(0,cnum)): header=header+["c"+num+"word","c"+num+"num","c"+num+"tfidf"] writer.writerow(header) for num in range(0,max(wordlen)): wwlist=[] for t in range(0,10): if(len(wlist[t])<=num): wwlist=wwlist+["_",0,0] else: tmp=wlist[t].items() wwlist.append(tmp[num][0]) wwlist.append(tmp[num][1]) wwlist.append(1.0*tmp[num][1]/wordsum[t]*numpy.log(1+10.0/tflist[tmp[num][0]])) writer.writerow(wwlist) wfile.close()
def main(model_, bnum_, tnum_, train_, pas_): from gensim import corpora, models, similarities import csv import textedit import time import glob model = str(model_) bnum = int(bnum_) tnum = int(tnum_) train = str(train_) pas = str(pas_) print "mkcorpus_start", time.ctime() #remove subrev subfile = open(pas + "subrev_1000.csv", "r") subdata = csv.reader(subfile) sublist = [] for line in subdata: sublist.append(line[0]) subfile.close() ##remove stoplist stopfile = open(pas + "stopwords/over4word.csv", "r") stopdata = csv.reader(stopfile) stoplist = [] for line in stopdata: stoplist.append(line[0]) stopfile.close() stopset = set(stoplist) ##make documents st = "D:/Lresult/NVbus/*" dlist = glob.glob(st) print len(dlist), "star" dnum = 0 for bus in dlist: dfile = open(bus, "r") ddata = csv.reader(dfile) dlist = [] for line in ddata: if (line[0] not in sublist): te = line[5] doc = textedit.textedit(te) dlist.append(doc) dnum = dnum + 1 if (dnum % 1000 == 0): print dnum, dfile.close() texts = [[word for word in document.lower().split()] for document in dlist] #print "text fin",time.ctime(),texts texts = [[word for word in text if word not in stopset] for text in texts] dictionary = corpora.Dictionary(texts) #print "dictionary fin",time.ctime(),len(dictionary.token2id) #dictionary.filter_extremes(no_below=bnum) #print "dictionary cut fin",time.ctime(),len(dictionary.token2id) #dictionary.save(pas+train+"_o4b"+str(bnum)+"t"+str(tnum)+".dict") dictionary.save_as_text(pas + "NVbus_o4b6_bow/" + bus[17:-4] + ".tsv")
# ---- script fragment: begins and ends mid-stream. `dfile`, `stoplist`,
# `dim`, `collections`, `numpy` and `textedit` are defined above this view,
# and the body of the final loop continues below it. ----
ddata = csv.reader(dfile)
ddic = collections.Counter()
for line in ddata:
    # word -> vector of floats; presumably one embedding row per word — confirm.
    ddic[line[0]] = map(float, line[1:])
dfile.close()
print "dic fin", len(ddic), len(ddic["we"]), time.ctime()
##base data
revfile = open("D:/Lresult/NV_s5/subrev_1000.csv", "r")
revdata = csv.reader(revfile)
revdata.next()
revid = collections.Counter()
revvec = collections.Counter()
for line in revdata:
    revid[line[0]] = line[2]  ##bus_id
    text = textedit.textedit(line[5])
    # Start each review vector at the zero vector of length `dim`.
    revvec[line[0]] = numpy.array([0] * int(dim))
    for num in range(0, len(text)):
        # NOTE(review): this tests the position index `num` against
        # `stoplist`, not the token itself — looks suspicious; confirm intent.
        if (num not in stoplist):
            revvec[line[0]] = revvec[line[0]] + numpy.array(ddic[text[num]])
revfile.close()
print "rev fin", len(revvec), time.ctime()
####comp data
sfile = open("D:/Lresult/NVreview.csv", "r")
sdata = csv.reader(sfile)
sdata.next()
svec = collections.Counter()
lnum = 0
slist = []
for line in sdata:
    # (loop body continues beyond this view)
def main(model_,bnum_,tnum_,train_,pas_): from gensim import corpora, models, similarities import csv import textedit import time import glob model=str(model_) bnum=int(bnum_) tnum=int(tnum_) train=str(train_) pas=str(pas_) print "mkcorpus_start",time.ctime() #remove subrev subfile=open(pas+"subrev_1000.csv","r") subdata=csv.reader(subfile) sublist=[] for line in subdata: sublist.append(line[0]) subfile.close() ##remove stoplist stopfile=open(pas+"stopwords/over4word.csv","r") stopdata=csv.reader(stopfile) stoplist=[] for line in stopdata: stoplist.append(line[0]) stopfile.close() stopset=set(stoplist) ##make documents st="D:/Lresult/NVbus/*" dlist=glob.glob(st) print len(dlist),"star" dnum=0 for bus in dlist: dfile=open(bus,"r") ddata=csv.reader(dfile) dlist=[] for line in ddata: if(line[0] not in sublist): te=line[5] doc=textedit.textedit(te) dlist.append(doc) dnum=dnum+1 if(dnum%1000==0): print dnum, dfile.close() texts = [[word for word in document.lower().split()] for document in dlist] #print "text fin",time.ctime(),texts texts = [[word for word in text if word not in stopset] for text in texts] dictionary = corpora.Dictionary(texts) #print "dictionary fin",time.ctime(),len(dictionary.token2id) #dictionary.filter_extremes(no_below=bnum) #print "dictionary cut fin",time.ctime(),len(dictionary.token2id) #dictionary.save(pas+train+"_o4b"+str(bnum)+"t"+str(tnum)+".dict") dictionary.save_as_text(pas+"NVbus_o4b6_bow/"+bus[17:-4]+".tsv")
#calc topic sim
# ---- script fragment: `dictionary`, `lda`, `csv` and `textedit` come from
# code outside this view. Writes a dense 50-topic vector per review. ----
header = []
header.append("rev_id")
header.append("bus_id")
for num in range(1, 51):
    header.append(str(num))
wfile = open("subrevtopic.csv", "wb")
# Bug fixed: the original built the output handle with csv.reader(wfile),
# which has no writerow() and would raise AttributeError on first use.
writer = csv.writer(wfile)
writer.writerow(header)
ifile = open("subrev.csv", "r")
idata = csv.reader(ifile)
idata.next()  # skip header
for line in idata:
    wlist = []
    wlist.append(line[0])
    wlist.append(line[1])
    doc = textedit.textedit(line[2])
    vec_bow = dictionary.doc2bow(doc.lower().split())
    vec_lda = lda[vec_bow]
    # Scatter the sparse (topic_id, weight) pairs into a dense 50-slot list.
    slist = [0] * 50
    for num in range(0, len(vec_lda)):
        slist[vec_lda[num][0]] = vec_lda[num][1]
    wlist = wlist + slist
    writer.writerow(wlist)
ifile.close()
wfile.close()
def main(model_,bnum_,tnum_,train_,pas_): from gensim import corpora, models, similarities import csv import textedit import time model=str(model_) bnum=int(bnum_) tnum=int(tnum_) train=str(train_) pas=str(pas_) print "start",time.ctime() #dictionary = corpora.Dictionary.load(pas+train+"_o4b"+str(bnum)+"t"+str(tnum)+".dict") #corpus = corpora.MmCorpus(pas+train+"_o4b"+str(bnum)+"t"+str(tnum)+".mm") dictionary = corpora.Dictionary.load(pas+"over4/corpus/nNVrev_o4b6.dict") #use LSI #lsi = models.LsiModel.load(pas+train+"_o4b"+str(bnum)+"t"+str(tnum)+".lsi") #if(model=="lda"): # lsi=models.LdaModel.load(pas+train+"_o4b"+str(bnum)+"t"+str(tnum)+".lda") lsi = models.LsiModel.load(pas+"over4/model/nNVrevo4b6_t300.lsi") #calc topic sim header=[] header.append("rev_id") header.append("user_id") header.append("bus_id") header.append("stars") header.append("sentnum") header.append("date") for num in range(0,int(tnum)): header.append("t"+str(num).zfill(len(str(tnum))/10)) #wfile=open(pas+train+model+"_o4b"+str(bnum)+"t"+str(tnum)+".csv","wb") wfile=open(pas+"model/hoge.csv","wb") writer=csv.writer(wfile) writer.writerow(header) "NVreview.csv:[review_id,user_id,business_id,stars,date,texts]" #test file ifile=open(pas+"ks/NVrevrawsent.csv","r") idata=csv.reader(ifile) idata.next() k=0 for line in idata: wlist=[] wlist.append(line[0]) wlist.append(line[1]) wlist.append(line[2]) wlist.append(line[3]) wlist.append(line[6])###for revraw only wlist.append(line[4]) doc=textedit.textedit(line[5]) vec_bow = dictionary.doc2bow(doc.lower().split()) vec_lsi = lsi[vec_bow] slist=[0]*int(tnum) for num in range(0,len(vec_lsi)): slist[vec_lsi[num][0]]=vec_lsi[num][1] wlist=wlist+slist writer.writerow(wlist) k=k+1 if(k%1000==0): print k,time.ctime() ifile.close() wfile.close() print "fin",time.ctime()
def main(model_,bnum_,tnum_,train_,pas_): from gensim import corpora, models, similarities import csv import textedit import time model=str(model_) bnum=int(bnum_) tnum=int(tnum_) train=str(train_) pas=str(pas_) print "mkcorpus_start",time.ctime() #remove subrev subfile=open(pas+"subrev_1000.csv","r") subdata=csv.reader(subfile) sublist=[] for line in subdata: stoplist.append(line[0]) subfile.close() ##remove stoplist stopfile=open(pas+"stopwords/over4word.csv","r") stopdata=csv.reader(stopfile) stoplist=[] for line in stopdata: stoplist.append(line[0]) stopfile.close() stopset=set(stoplist) ##make documents dfile=open(pas+train+".csv","r") ddata=csv.reader(dfile) ddata.next() dnum=0 dlist=[] for line in ddata: if(line[0] not in sublist): te=line[5] doc=textedit.textedit(te) dlist.append(doc) dnum=dnum+1 if(dnum%10000==0): print dnum, dfile.close() print "dfile fin",time.ctime() texts = [[word for word in document.lower().split()] for document in dlist] print "text fin",time.ctime() texts = [[word for word in text if word not in stopset] for text in texts] dictionary = corpora.Dictionary(texts) print "dictionary fin",time.ctime(),len(dictionary.token2id) dictionary.filter_extremes(no_below=bnum) print "dictionary cut fin",time.ctime(),len(dictionary.token2id) dictionary.save(pas+train+"_o4b"+str(bnum)+"t"+str(tnum)+".dict") dictionary.save_as_text(pas+train+"_o4b"+str(bnum)+"t"+str(tnum)+"_text.dict") corpus=[dictionary.doc2bow(text) for text in texts] corpora.MmCorpus.serialize(pas+train+"_o4b"+str(bnum)+"t"+str(tnum)+".mm", corpus) print "mk_corpus fin",time.ctime()
# ---- script fragment: begins mid-stream. `header`, `num`, `pas`,
# `topic_num`, `dictionary`, `lsi`, `csv`, `time` and `textedit` are
# defined above this view; the first line below is the tail of a
# header-building loop over topic indices. ----
header.append("t"+str(num).zfill(int(topic_num)/10))
# NOTE(review): zfill(int(topic_num)/10) pads to topic_num/10 digits (30 for
# topic_num=300) — looks odd; confirm len(str(topic_num)) wasn't intended.
wfile=open(pas+"nNVrev_LSI_t"+str(topic_num)+".csv","wb")
writer=csv.writer(wfile)
writer.writerow(header)
# The bare string below is a no-op layout reminder kept from the original.
"NVreview.csv:[review_id,user_id,business_id,stars,date,texts]"
ifile=open("D:/Lresult/NVreview.csv","r")
idata=csv.reader(ifile)
idata.next()
k=0
for line in idata:
    wlist=[]
    wlist.append(line[0])
    wlist.append(line[2])
    doc=textedit.textedit(line[5])
    vec_bow = dictionary.doc2bow(doc.lower().split())
    vec_lsi = lsi[vec_bow]
    ###
    # Scatter sparse (topic, weight) pairs into a dense vector.
    slist=[0]*int(topic_num)
    for num in range(0,len(vec_lsi)):
        slist[vec_lsi[num][0]]=vec_lsi[num][1]
    wlist=wlist+slist
    writer.writerow(wlist)
    k=k+1
    if(k%1000==0):
        print k,time.ctime()
ifile.close()
wfile.close()
print "fin",time.ctime()