def getSimofSens(s1, s2, modify=False): wl1 = delpunc(" ".join(jieba.cut(s1.lower()))).split() # make sure is a utf-8 str wl2 = delpunc(" ".join(jieba.cut(s2.lower()))).split() # make sure is a utf-8 str printSimofWordList(wl1, wl2) sim0 = getSimofWordListTopWeight(wl1, wl2) sim1 = getSimofWordListTopAve(wl1, wl2) sim2 = getSimofWordListPairMatch(wl1, wl2) sim3 = getSimofWordListVecSum(wl1, wl2) print "weight:%.3f,top:%.3f,pair:%.3f,vec:%.3f" % (sim0, sim1, sim2, sim3) return sim2
def getSimofSens(s1,s2,modify=False): wl1=delpunc(' '.join(jieba.cut(s1.lower()))).split()# make sure is a utf-8 str wl2=delpunc(' '.join(jieba.cut(s2.lower()))).split()# make sure is a utf-8 str printSimofWordList(wl1,wl2) #getSimofWordListTopWeight(wl1, wl2) sim1=getSimofWordListTopMod(wl1,wl2,modify) sim2=getSimofWordListPairMod(wl1,wl2,modify) sim3=getSimofWordListVec(wl1,wl2) print 'sim1:%.3f,sim2:%.3f,sim3:%.3f'%(sim1,sim2,sim3) return sim2
def getMergeNews(): print tablemerge.getAllCount(dbconfig.mergetable); rows=tablemerge.getTitleSummary(dbconfig.mergetable) if rows !=-1: with open(news_file,'w') as fout: count=0 for row in rows: # id,title,summary,ctime,source count+=1 mtid,title,summary,ctime=row[0],row[1].strip(),re.sub('\s+','',row[2]),row[3] title=delpunc(' '.join(jieba.cut(title)).lower()).encode('utf-8') summary=delpunc(' '.join(jieba.cut(summary)).lower()).encode('utf-8') #timeStr=time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(ctime)) msg_ts='%s %s %s'%(count,title,summary) fout.write(msg_ts+'\n')
def removeDuplicationId(in_file): corpus=list(open(news_file,'r')) fout_lsa=open(rmdup_idlsa_file,'w') fout_esa=open(rmdup_idesa_file,'w') fout_w2v=open(rmdup_idw2v_file,'w') fout=open(rmdup_id_file,'w') with open(in_file,'r') as fin: for line in fin: line=line.split() newStr=' '.join(line[4:]) #print >> fout,line[1],newStr wordStr=delpunc(' '.join(jieba.cut(newStr)).lower()).encode('utf-8') #print >> fout,line[1],wordStr sims=_getRelatedNews(wordStr) if not sims: print 'sims is none',wordStr continue sim_dic=_vote2remove(sims) for indx,_dic in sim_dic.iteritems(): line_mtid=corpus[indx].split()[0] if line_mtid==line[1]: # skip the news itself continue if len(_dic)>=2 and 'w2v' in _dic: #if len(_dic)>=2: print >> fout,line[1],line_mtid if 'lsa' in _dic: print >> fout_lsa,line[1],line_mtid if 'esa' in _dic: print >> fout_esa,line[1],line_mtid if 'w2v' in _dic: print >> fout_w2v,line[1],line_mtid fout.close() fout_lsa.close() fout_esa.close() fout_w2v.close()
def getSimTfIdfFilter(s1, s2, top_num=5): wl1 = delpunc(" ".join(jieba.cut(s1.lower()))).split() # make sure is a utf-8 str wl2 = delpunc(" ".join(jieba.cut(s2.lower()))).split() wordStr = " ".join(wl1) if isinstance(wordStr, unicode): # make sure word is utf-8 str type wl1 = wordStr.encode("utf-8").split() wordStr = " ".join(wl2) if isinstance(wordStr, unicode): # make sure word is utf-8 str type wl2 = wordStr.encode("utf-8").split() wl1 = _getTfIdfWordList(wl1)[:top_num] wl2 = _getTfIdfWordList(wl2)[:top_num] sim0 = getSimofWordListTopWeight(wl1, wl2) sim1 = getSimofWordListTopAve(wl1, wl2) sim2 = getSimofWordListPairMatch(wl1, wl2) sim3 = getSimofWordListVecSum(wl1, wl2) print "TfIdf weight:%.3f,top:%.3f,pair:%.3f,vec:%.3f" % (sim0, sim1, sim2, sim3)
def getRelatedNewsBat(in_file, out_file):
    """For every news line of `in_file`, rank the whole corpus in
    news_file by w2v word-list similarity and write the segmented query
    followed by its top candidates (up to 11, minus itself) to `out_file`.

    Fixes: file handles are closed via with-statements even on error, and
    the corpus is tokenized once up front instead of being re-split for
    every input line (it was O(lines * corpus) redundant work).
    """
    with open(news_file, 'r') as f:
        corpus = f.readlines()
    # pre-tokenize: entry i is the word list of corpus[i] minus the leading id
    corpus_words = [c.split()[1:] for c in corpus]
    with open(out_file, 'w') as fout, open(in_file, 'r') as fin:
        for line in fin:
            line = line.split()
            newStr = ' '.join(line[4:])
            wordStr = delpunc(' '.join(jieba.cut(newStr)).lower()).encode('utf-8')
            wl1 = wordStr.split()
            sims = [(i, w2vword2sim.getSimofWordList(wl1, wl2))
                    for i, wl2 in enumerate(corpus_words)]
            sims.sort(key=lambda x: x[1], reverse=True)
            print >> fout, line[1], wordStr
            for indx, sim in sims[:11]:
                line_mtid = corpus[indx].split()[0]
                if line_mtid == line[1]:  # skip the news itself
                    continue
                # trailing comma: corpus line already carries its newline
                print >> fout, ' %.4f,%s' % (sim, corpus[indx]),
def removeDuplicationId(in_file): corpus=list(open(news_file,'r')) fout_w2v=open(rmdup_idw2v_file,'w') with open(in_file,'r') as fin: for line in fin: line=line.split() newStr=' '.join(line[4:]) print line[1],newStr #print >> fout,line[1],newStr wordStr=delpunc(' '.join(jieba.cut(newStr)).lower()).encode('utf-8') wl1=wordStr.split() sims=[] for i in xrange(len(corpus)): news=corpus[i].split() wl2=news[1:] (_sq,sim)=i,w2vword2sim.getSimofWordList(wl1, wl2) sims.append((_sq,sim)) sims=sorted(sims,key=lambda x:x[1],reverse=True) for indx,sim in sims[:11]: line_mtid=corpus[indx].split()[0] if sim<0.7: break if line_mtid==line[1]: # skip the news itself continue print >> fout_w2v,line[1],line_mtid fout_w2v.close()
def dealRepFile(rep_pre_file, rep_file):
    """Rewrite `rep_pre_file` into `rep_file` as `mtid segmented_title`
    lines (text from field 4 on, jieba-segmented, punctuation removed,
    utf-8 encoded).

    Fix: the input file is now opened with a with-statement so its handle
    is closed deterministically (the original left it to the GC).
    """
    with open(rep_file, 'w') as fout, open(rep_pre_file, 'r') as fin:
        for raw in fin:
            parts = raw.split()
            title = delpunc(' '.join(jieba.cut(' '.join(parts[4:]).lower()))).encode('utf-8')
            print >> fout, '%s %s' % (parts[1], title)
def get_records_dayago(dayago=30): rows = tablemerge2.getTitleBriefRecords(dbconfig.mergetable2, dayago) if rows == -1: print "error tablemerge2 getTitleBriefRecords" return docs = {} for row in rows: # newsid,title,ctime,source,using newsid for convenience to add related news title = row[1].strip() docs[Doc(row[0], row[2], row[3])] = delpunc(" ".join(jieba.cut(title)).lower()).split() return docs
def getSecondRmDuplicationResult(split=True): rows=tablemerge2.getTitleSummary(dbconfig.mergetable2) if rows !=-1: f_summary=open(merge2_summary_file,'w') with open(merge2_title_file,'w') as fout: count=0 for row in rows: # title,summary,ctime,source count+=1 mtid,title,summary,ctime=row[0],row[1].strip(),row[2].strip(),row[3] if split: title=' '.join(jieba.cut(delpunc(title.lower()))).encode('utf-8') summary=' '.join(jieba.cut(delpunc(summary.lower()))).encode('utf-8') timeStr=time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(ctime)) msg_t='%s %s %s %s'%(count,mtid,timeStr,title) msg_s='%s %s %s %s'%(count,mtid,timeStr,summary) print msg_t print msg_s fout.write(msg_t+'\n') f_summary.write(msg_s+'\n') f_summary.close()
def getMerge2Title(): rows=tablemerge2.getTitleSummary(dbconfig.mergetable2) if rows !=-1: f_summary=open(merge2_summary_file,'w') with open(merge2_title_file,'w') as fout: count=0 for row in rows: # title,summary,ctime,source count+=1 mtid,title,summary,ctime=row[0],row[1].strip(),re.sub('\s+','',row[2]),row[3] title=delpunc(' '.join(jieba.cut(title)).lower()).encode('utf-8') summary=delpunc(' '.join(jieba.cut(summary)).lower()).encode('utf-8') if len(summary)<len(title): summary=title #timeStr=time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(ctime)) msg_t='%s %s %s %s'%(count,mtid,ctime,title) msg_s='%s %s %s %s'%(count,mtid,ctime,summary) print msg_t print msg_s fout.write(msg_t+'\n') f_summary.write(msg_s+'\n') f_summary.close()
def get_records_dayago(tablename,dayago=30): if dbconfig.mergetable == tablename: rows=tablemerge.getBriefRecords(tablename, dayago) else: rows=table.getBriefRecords(tablename,dayago) if rows== -1: print 'error table getBriefRecords' return docs={} for row in rows: # id,title,summary,loadtime,web summary=row[1].strip() docs[Doc(row[0],row[3],row[4])]=delpunc(' '.join(jieba.cut(summary)).lower()).split() return docs
def getMergeTitle(): rows=tablemerge.getTitles(dbconfig.mergetable, limit=5000) if rows !=-1: f_title_ori=open(merge1_title_file_ori,'w') with open(merge1_title_file,'w') as fout: for row in rows: # id,title,summary,ctime,source mtid,title,summary,ctime=row[0],row[1].strip(),re.sub('\s+','',row[2]),row[3] msg_t_ori='%s %s'%(mtid,title) title=delpunc(' '.join(jieba.cut(title)).lower()).encode('utf-8') msg_t='%s %s'%(mtid,title) print msg_t_ori fout.write(msg_t+'\n') f_title_ori.write(msg_t_ori+'\n') f_title_ori.close()
def statisticDuplicationId(): corpus=list(open(news_file,'r')) fout=open(sim_news_file,'w') with open(rep_file,'r') as fin: for line in fin: newStr=''.join(line.split()[4:]) wordStr=delpunc(' '.join(jieba.cut(newStr)).lower()).encode('utf-8') sims=_getRelatedNews(wordStr) if not sims: print 'sims is none' continue for alg in ['lsa','esa','w2v']: print >>fout,alg,':' for indx,sim in sims[alg]: print >>fout,sim,corpus[indx], fout.close()
def get_records_newadded(): m2_maxid = tablemerge2.getMaxMtId(dbconfig.mergetable2) if not m2_maxid: m2_maxid = -1 m_maxid = tablemerge.getMaxId(dbconfig.mergetable) docs = {} if m_maxid > m2_maxid: rows = tablemerge.getTitleBriefRecordsBiggerId(dbconfig.mergetable, m2_maxid) if rows == -1: print "error tablemerge getTitleBriefRecordsBiggerId" return if len(rows[0]) > 0: # the first element is not null for row in rows: # newsid,title,ctime,source title = row[1].strip() if title: docs[Doc(row[0], row[2], row[3])] = delpunc(" ".join(jieba.cut(title)).lower()).split() return docs
def get_records_newadded(web): m_maxid=tablemerge.getMaxWebId(dbconfig.mergetable, web) if not m_maxid: m_maxid=-1 w_maxid=table.getMaxId(web) docs={} if w_maxid>m_maxid: rows=table.getBriefRecordsBiggerId(web, m_maxid) if rows==-1: print 'error table getBriefRecordsBiggerId' return if len(rows[0])>0: # the first element is not null for row in rows: # id,title,summary,ctime,source summary=row[1].strip() if summary: docs[Doc(row[0],row[3],row[4])]=delpunc(' '.join(jieba.cut(summary)).lower()).split() return docs
def removeDuplication(in_file,out_file): # in_file:the file with news to be detected,out_file:the duplication result corpus=list(open(news_file,'r')) fout=open(out_file,'w') with open(in_file,'r') as fin: for line in fin: line=line.split() newStr=' '.join(line[4:]) #print >> fout,line[1],newStr wordStr=delpunc(' '.join(jieba.cut(newStr)).lower()).encode('utf-8') print >> fout,line[1],wordStr sims=_getRelatedNews(wordStr) if not sims: print 'sims is none' continue sim_dic=_vote2remove(sims) for indx,_dic in sim_dic.iteritems(): line_mtid=corpus[indx].split()[0] if line_mtid==line[1]: # skip the news itself continue if len(_dic)>=2 and 'w2v' in _dic: #if len(_dic)>=2: print >> fout,' ',_getDictStr(_dic),corpus[indx], fout.close()
def get_vec(doc):
    """Segment `doc` with jieba, strip punctuation, and map the resulting
    word list into the concept space."""
    cleaned = delpunc(' '.join(jieba.cut(doc.lower())))  # make sure is a utf-8 str
    return _get_concept_vec(cleaned.split())
for docid,weight in word2doc_mat[wordid]: value=dic_tfidf.get(docid,0) value+=tfidf_v*weight dic_tfidf[docid]=value #print 'tfidf:',vec_tfidf #print 'bow:',vec_bow if not dic_tfidf.values(): print 'Document not recruited:',' '.join(wordList) limit_low=prune_at*max(dic_tfidf.iteritems(),key=lambda i:i[1])[1] concept_vec=[] for item in dic_tfidf.iteritems(): if item[1]>=limit_low: concept_vec.append(item) return sorted(concept_vec) oldtime=time.time() rows=tablemerge.getTopRecords(dbconfig.mergetable, 10) title=rows[3][2] doc=delpunc(' '.join(jieba.cut(title)).lower()) vec_tfidf = _get_concept_vec_prune(doc.split())# convert the query to concept space vec_pca=esa_pca[vec_tfidf] # print vec_pca sims = index[vec_pca] # perform a similarity query against the corpus # print sims # print (document_number, document_similarity) 2-tuples print doc doc_list=list(open(news_file)) for sim in sims: print sim[1],' '.join(doc_list[sim[0]].strip().split()[1:]) print 'time cost:%s' % str(time.time()-oldtime)