def getSimofSens(s1, s2, modify=False):
    # segment with jieba, lowercase and strip punctuation; s1/s2 should be utf-8 strs
    wl1 = delpunc(" ".join(jieba.cut(s1.lower()))).split()
    wl2 = delpunc(" ".join(jieba.cut(s2.lower()))).split()
    printSimofWordList(wl1, wl2)
    # print all four word-list similarities, return the pair-match one
    # (modify is unused in this variant; see Example #2)
    sim0 = getSimofWordListTopWeight(wl1, wl2)
    sim1 = getSimofWordListTopAve(wl1, wl2)
    sim2 = getSimofWordListPairMatch(wl1, wl2)
    sim3 = getSimofWordListVecSum(wl1, wl2)
    print "weight:%.3f,top:%.3f,pair:%.3f,vec:%.3f" % (sim0, sim1, sim2, sim3)
    return sim2
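
delpunc is used by every example below but never defined in this listing. A minimal sketch of what it appears to do (an assumption): strip punctuation from the space-joined jieba output.

import re

_PUNC_RE = re.compile(ur'[^\w\s]+', re.UNICODE)  # hypothetical punctuation class

def delpunc(sentence):
    # drop punctuation characters, collapse the leftover whitespace
    return ' '.join(_PUNC_RE.sub(' ', sentence).split())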
Example #2
def getSimofSens(s1, s2, modify=False):
    # same preprocessing as above; s1/s2 should be utf-8 strs
    wl1 = delpunc(' '.join(jieba.cut(s1.lower()))).split()
    wl2 = delpunc(' '.join(jieba.cut(s2.lower()))).split()
    printSimofWordList(wl1, wl2)
    #getSimofWordListTopWeight(wl1, wl2)
    sim1 = getSimofWordListTopMod(wl1, wl2, modify)
    sim2 = getSimofWordListPairMod(wl1, wl2, modify)
    sim3 = getSimofWordListVec(wl1, wl2)
    print 'sim1:%.3f,sim2:%.3f,sim3:%.3f' % (sim1, sim2, sim3)
    return sim2
Example #3
def getMergeNews():
    # dump a segmented "count title summary" line per merged news item to news_file
    print tablemerge.getAllCount(dbconfig.mergetable)
    rows = tablemerge.getTitleSummary(dbconfig.mergetable)
    if rows != -1:
        with open(news_file, 'w') as fout:
            count = 0
            for row in rows:
                # row layout: id, title, summary, ctime, source
                count += 1
                mtid, title, summary, ctime = row[0], row[1].strip(), re.sub(r'\s+', '', row[2]), row[3]
                title = delpunc(' '.join(jieba.cut(title)).lower()).encode('utf-8')
                summary = delpunc(' '.join(jieba.cut(summary)).lower()).encode('utf-8')
                msg_ts = '%s %s %s' % (count, title, summary)
                fout.write(msg_ts + '\n')
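
The lines written here are exactly what the deduplication functions below re-parse. A short sketch of reading news_file back, assuming the count-title-summary layout just written:

# first token identifies the news item, the rest are segmented words
corpus = [l.split() for l in open(news_file)]
for tokens in corpus[:3]:
    print tokens[0], len(tokens) - 1  # id and word count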
Example #4
def removeDuplicationId(in_file):
    # vote-based duplicate detection: an id pair goes to fout when at least two
    # algorithms agree and one of them is w2v; per-algorithm pairs go to side files
    corpus = list(open(news_file, 'r'))
    fout_lsa = open(rmdup_idlsa_file, 'w')
    fout_esa = open(rmdup_idesa_file, 'w')
    fout_w2v = open(rmdup_idw2v_file, 'w')
    fout = open(rmdup_id_file, 'w')
    with open(in_file, 'r') as fin:
        for line in fin:
            # line layout: count, mtid, date, time, text...
            line = line.split()
            newStr = ' '.join(line[4:])
            wordStr = delpunc(' '.join(jieba.cut(newStr)).lower()).encode('utf-8')
            sims = _getRelatedNews(wordStr)
            if not sims:
                print 'sims is none', wordStr
                continue
            sim_dic = _vote2remove(sims)
            for indx, _dic in sim_dic.iteritems():
                line_mtid = corpus[indx].split()[0]
                if line_mtid == line[1]:  # skip the news itself
                    continue
                #alternative condition: if len(_dic) >= 2:
                if len(_dic) >= 2 and 'w2v' in _dic:
                    print >> fout, line[1], line_mtid
                if 'lsa' in _dic:
                    print >> fout_lsa, line[1], line_mtid
                if 'esa' in _dic:
                    print >> fout_esa, line[1], line_mtid
                if 'w2v' in _dic:
                    print >> fout_w2v, line[1], line_mtid
    fout.close()
    fout_lsa.close()
    fout_esa.close()
    fout_w2v.close()
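
_vote2remove is not shown in this listing. A plausible reconstruction, assuming sims maps each algorithm name to (corpus_index, similarity) pairs as in statisticDuplicationId below, with a hypothetical 0.7 cut-off:

def _vote2remove(sims, threshold=0.7):
    # collect, per corpus index, which algorithms consider it a duplicate
    sim_dic = {}
    for alg, pairs in sims.iteritems():
        for indx, sim in pairs:
            if sim >= threshold:
                sim_dic.setdefault(indx, {})[alg] = sim
    return sim_dic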
Example #5
def getSimTfIdfFilter(s1, s2, top_num=5):
    # keep only the top_num highest tf-idf words of each sentence, then compare
    wl1 = delpunc(" ".join(jieba.cut(s1.lower()))).split()
    wl2 = delpunc(" ".join(jieba.cut(s2.lower()))).split()
    wordStr = " ".join(wl1)
    if isinstance(wordStr, unicode):  # make sure the words are utf-8 strs
        wl1 = wordStr.encode("utf-8").split()
    wordStr = " ".join(wl2)
    if isinstance(wordStr, unicode):
        wl2 = wordStr.encode("utf-8").split()
    wl1 = _getTfIdfWordList(wl1)[:top_num]
    wl2 = _getTfIdfWordList(wl2)[:top_num]
    sim0 = getSimofWordListTopWeight(wl1, wl2)
    sim1 = getSimofWordListTopAve(wl1, wl2)
    sim2 = getSimofWordListPairMatch(wl1, wl2)
    sim3 = getSimofWordListVecSum(wl1, wl2)
    print "TfIdf weight:%.3f,top:%.3f,pair:%.3f,vec:%.3f" % (sim0, sim1, sim2, sim3)
Example #6
def getRelatedNewsBat(in_file, out_file):
    # for each news item in in_file, rank the whole corpus by w2v similarity
    # and write the ten closest items to out_file
    corpus = list(open(news_file, 'r'))
    fout = open(out_file, 'w')
    with open(in_file, 'r') as fin:
        for line in fin:
            line = line.split()
            newStr = ' '.join(line[4:])
            wordStr = delpunc(' '.join(jieba.cut(newStr)).lower()).encode('utf-8')
            wl1 = wordStr.split()
            sims = []
            for i in xrange(len(corpus)):
                wl2 = corpus[i].split()[1:]
                sims.append((i, w2vword2sim.getSimofWordList(wl1, wl2)))
            sims = sorted(sims, key=lambda x: x[1], reverse=True)
            print >> fout, line[1], wordStr
            for indx, sim in sims[:11]:
                line_mtid = corpus[indx].split()[0]
                if line_mtid == line[1]:  # skip the news itself
                    continue
                # trailing comma: corpus lines already end with '\n'
                print >> fout, '    %.4f,%s' % (sim, corpus[indx]),
    fout.close()
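
w2vword2sim.getSimofWordList is an external helper. One common way to implement such a word-list similarity (an assumption, not necessarily this project's): average the word vectors of each list and take the cosine of the two means.

import numpy as np

def getSimofWordList_sketch(wl1, wl2, model):
    # model is a hypothetical word -> vector lookup, e.g. gensim KeyedVectors
    vecs1 = [model[w] for w in wl1 if w in model]
    vecs2 = [model[w] for w in wl2 if w in model]
    if not vecs1 or not vecs2:
        return 0.0
    v1, v2 = np.mean(vecs1, axis=0), np.mean(vecs2, axis=0)
    return float(np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2)))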
Example #7
def removeDuplicationId(in_file):
    # w2v-only variant: keep neighbours with similarity >= 0.7
    corpus = list(open(news_file, 'r'))
    fout_w2v = open(rmdup_idw2v_file, 'w')
    with open(in_file, 'r') as fin:
        for line in fin:
            line = line.split()
            newStr = ' '.join(line[4:])
            print line[1], newStr
            wordStr = delpunc(' '.join(jieba.cut(newStr)).lower()).encode('utf-8')
            wl1 = wordStr.split()
            sims = []
            for i in xrange(len(corpus)):
                wl2 = corpus[i].split()[1:]
                sims.append((i, w2vword2sim.getSimofWordList(wl1, wl2)))
            sims = sorted(sims, key=lambda x: x[1], reverse=True)
            for indx, sim in sims[:11]:
                if sim < 0.7:  # the list is sorted, so stop at the threshold
                    break
                line_mtid = corpus[indx].split()[0]
                if line_mtid == line[1]:  # skip the news itself
                    continue
                print >> fout_w2v, line[1], line_mtid
    fout_w2v.close()
Example #8
def dealRepFile(rep_pre_file, rep_file):
    # rewrite each line as: mtid plus segmented, punctuation-free text
    with open(rep_file, 'w') as fout:
        for line in open(rep_pre_file):
            line = line.split()
            title = delpunc(' '.join(jieba.cut(' '.join(line[4:]).lower()))).encode('utf-8')
            print >> fout, '%s %s' % (line[1], title)
Example #9
def get_records_dayago(dayago=30):
    rows = tablemerge2.getTitleBriefRecords(dbconfig.mergetable2, dayago)
    if rows == -1:
        print "error tablemerge2 getTitleBriefRecords"
        return
    docs = {}
    for row in rows:
        # row layout: newsid, title, ctime, source; newsid is kept so related news can be attached later
        title = row[1].strip()
        docs[Doc(row[0], row[2], row[3])] = delpunc(" ".join(jieba.cut(title)).lower()).split()
    return docs
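
Doc instances are used as dictionary keys above, so the class must be hashable. A minimal sketch consistent with the Doc(newsid, ctime, source) call sites:

class Doc(object):
    def __init__(self, newsid, ctime, source):
        self.newsid, self.ctime, self.source = newsid, ctime, source
    def __hash__(self):
        return hash(self.newsid)
    def __eq__(self, other):
        return isinstance(other, Doc) and self.newsid == other.newsid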
Example #10
def getSecondRmDuplicationResult(split=True):
    # dump "count mtid time text" title and summary lines for the second
    # deduplication pass; split=True segments the text with jieba
    rows = tablemerge2.getTitleSummary(dbconfig.mergetable2)
    if rows != -1:
        f_summary = open(merge2_summary_file, 'w')
        with open(merge2_title_file, 'w') as fout:
            count = 0
            for row in rows:
                # row layout: mtid, title, summary, ctime
                count += 1
                mtid, title, summary, ctime = row[0], row[1].strip(), row[2].strip(), row[3]
                if split:
                    title = ' '.join(jieba.cut(delpunc(title.lower()))).encode('utf-8')
                    summary = ' '.join(jieba.cut(delpunc(summary.lower()))).encode('utf-8')
                timeStr = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(ctime))
                msg_t = '%s %s %s %s' % (count, mtid, timeStr, title)
                msg_s = '%s %s %s %s' % (count, mtid, timeStr, summary)
                print msg_t
                print msg_s
                fout.write(msg_t + '\n')
                f_summary.write(msg_s + '\n')
        f_summary.close()
Example #11
def getMerge2Title():
    rows = tablemerge2.getTitleSummary(dbconfig.mergetable2)
    if rows != -1:
        f_summary = open(merge2_summary_file, 'w')
        with open(merge2_title_file, 'w') as fout:
            count = 0
            for row in rows:
                # row layout: mtid, title, summary, ctime
                count += 1
                mtid, title, summary, ctime = row[0], row[1].strip(), re.sub(r'\s+', '', row[2]), row[3]
                title = delpunc(' '.join(jieba.cut(title)).lower()).encode('utf-8')
                summary = delpunc(' '.join(jieba.cut(summary)).lower()).encode('utf-8')
                if len(summary) < len(title):  # fall back to the title for short summaries
                    summary = title
                msg_t = '%s %s %s %s' % (count, mtid, ctime, title)
                msg_s = '%s %s %s %s' % (count, mtid, ctime, summary)
                print msg_t
                print msg_s
                fout.write(msg_t + '\n')
                f_summary.write(msg_s + '\n')
        f_summary.close()
Example #12
def get_records_dayago(tablename, dayago=30):
    if dbconfig.mergetable == tablename:
        rows = tablemerge.getBriefRecords(tablename, dayago)
    else:
        rows = table.getBriefRecords(tablename, dayago)
    if rows == -1:
        print 'error table getBriefRecords'
        return
    docs = {}
    for row in rows:
        # id,title,summary,loadtime,web
        summary = row[1].strip()
        docs[Doc(row[0], row[3], row[4])] = delpunc(' '.join(jieba.cut(summary)).lower()).split()
    return docs
Example #13
def getMergeTitle():
    rows = tablemerge.getTitles(dbconfig.mergetable, limit=5000)
    if rows != -1:
        f_title_ori = open(merge1_title_file_ori, 'w')
        with open(merge1_title_file, 'w') as fout:
            for row in rows:
                # id,title,summary,ctime,source
                mtid, title, summary, ctime = row[0], row[1].strip(), re.sub(r'\s+', '', row[2]), row[3]
                msg_t_ori = '%s %s' % (mtid, title)
                title = delpunc(' '.join(jieba.cut(title)).lower()).encode('utf-8')
                msg_t = '%s %s' % (mtid, title)
                print msg_t_ori
                fout.write(msg_t + '\n')
                f_title_ori.write(msg_t_ori + '\n')
        f_title_ori.close()
Example #14
def statisticDuplicationId():
    # for every line in rep_file, write each algorithm's related-news hits
    corpus = list(open(news_file, 'r'))
    fout = open(sim_news_file, 'w')
    with open(rep_file, 'r') as fin:
        for line in fin:
            newStr = ''.join(line.split()[4:])
            wordStr = delpunc(' '.join(jieba.cut(newStr)).lower()).encode('utf-8')
            sims = _getRelatedNews(wordStr)
            if not sims:
                print 'sims is none'
                continue
            for alg in ['lsa', 'esa', 'w2v']:
                print >> fout, alg, ':'
                for indx, sim in sims[alg]:
                    # trailing comma: corpus lines already end with '\n'
                    print >> fout, sim, corpus[indx],
    fout.close()
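
_getRelatedNews is not shown either. Its call sites imply it returns a dict mapping each algorithm name ('lsa', 'esa', 'w2v') to its top-ranked (corpus_index, similarity) pairs. A sketch under that assumption, with hypothetical per-algorithm query helpers _queryLsa, _queryEsa and _queryW2v:

def _getRelatedNews(wordStr, topn=11):
    wl = wordStr.split()
    sims = {}
    for alg, query in (('lsa', _queryLsa), ('esa', _queryEsa), ('w2v', _queryW2v)):
        ranked = query(wl)  # hypothetical: returns [(corpus_index, sim), ...]
        sims[alg] = sorted(ranked, key=lambda x: x[1], reverse=True)[:topn]
    return sims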
Example #15
def get_records_newadded():
    m2_maxid = tablemerge2.getMaxMtId(dbconfig.mergetable2)
    if not m2_maxid:
        m2_maxid = -1
    m_maxid = tablemerge.getMaxId(dbconfig.mergetable)
    docs = {}
    if m_maxid > m2_maxid:
        rows = tablemerge.getTitleBriefRecordsBiggerId(dbconfig.mergetable, m2_maxid)
        if rows == -1:
            print "error tablemerge getTitleBriefRecordsBiggerId"
            return
        if len(rows[0]) > 0:  # skip when the first returned row is empty
            for row in rows:
                # newsid,title,ctime,source
                title = row[1].strip()
                if title:
                    docs[Doc(row[0], row[2], row[3])] = delpunc(" ".join(jieba.cut(title)).lower()).split()
    return docs
Example #16
def get_records_newadded(web):
    m_maxid = tablemerge.getMaxWebId(dbconfig.mergetable, web)
    if not m_maxid:
        m_maxid = -1
    w_maxid = table.getMaxId(web)
    docs = {}
    if w_maxid > m_maxid:
        rows = table.getBriefRecordsBiggerId(web, m_maxid)
        if rows == -1:
            print 'error table getBriefRecordsBiggerId'
            return
        if len(rows[0]) > 0:  # skip when the first returned row is empty
            for row in rows:
                # id,title,summary,ctime,source
                summary = row[1].strip()
                if summary:
                    docs[Doc(row[0], row[3], row[4])] = delpunc(' '.join(jieba.cut(summary)).lower()).split()
    return docs
Example #17
def removeDuplication(in_file, out_file):
    # in_file: the news to be checked; out_file: the duplication report
    corpus = list(open(news_file, 'r'))
    fout = open(out_file, 'w')
    with open(in_file, 'r') as fin:
        for line in fin:
            line = line.split()
            newStr = ' '.join(line[4:])
            wordStr = delpunc(' '.join(jieba.cut(newStr)).lower()).encode('utf-8')
            print >> fout, line[1], wordStr
            sims = _getRelatedNews(wordStr)
            if not sims:
                print 'sims is none'
                continue
            sim_dic = _vote2remove(sims)
            for indx, _dic in sim_dic.iteritems():
                line_mtid = corpus[indx].split()[0]
                if line_mtid == line[1]:  # skip the news itself
                    continue
                # report when at least two algorithms agree, one of them w2v
                #alternative condition: if len(_dic) >= 2:
                if len(_dic) >= 2 and 'w2v' in _dic:
                    print >> fout, '    ', _getDictStr(_dic), corpus[indx],
    fout.close()
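
_getDictStr only needs to turn the vote dict into a printable summary. A one-line sketch, assuming the dict maps algorithm name to similarity as in the _vote2remove sketch above:

def _getDictStr(_dic):
    # e.g. {'w2v': 0.83, 'lsa': 0.75} -> 'lsa:0.750,w2v:0.830'
    return ','.join('%s:%.3f' % item for item in sorted(_dic.iteritems()))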
Example #18
def get_vec(doc):
    # segment, strip punctuation, then project into the ESA concept space
    wordList = delpunc(' '.join(jieba.cut(doc.lower()))).split()
    return _get_concept_vec(wordList)
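
A usage sketch for get_vec; the input string is an arbitrary example, and the printed slice assumes the concept vector is a sorted list of (concept_id, weight) pairs as in _get_concept_vec_prune below:

vec = get_vec(u'今日 股市 行情')  # any raw document string
print vec[:5]                     # first few (concept_id, weight) pairs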
Example #19
        for docid, weight in word2doc_mat[wordid]:
            # accumulate tf-idf weighted concept (document) scores
            value = dic_tfidf.get(docid, 0)
            value += tfidf_v * weight
            dic_tfidf[docid] = value
    if not dic_tfidf:
        print 'Document not recruited:', ' '.join(wordList)
        return []  # avoid max() on an empty dict below
    # prune concept dimensions weaker than prune_at times the strongest one
    limit_low = prune_at * max(dic_tfidf.itervalues())
    concept_vec = [item for item in dic_tfidf.iteritems() if item[1] >= limit_low]
    return sorted(concept_vec)

oldtime = time.time()
rows = tablemerge.getTopRecords(dbconfig.mergetable, 10)
title = rows[3][2]
doc = delpunc(' '.join(jieba.cut(title)).lower())
vec_tfidf = _get_concept_vec_prune(doc.split())  # convert the query to concept space
vec_pca = esa_pca[vec_tfidf]
sims = index[vec_pca]  # similarity query against the corpus: (doc_number, similarity) pairs

print doc
doc_list = list(open(news_file))
for indx, sim in sims:
    print sim, ' '.join(doc_list[indx].strip().split()[1:])

print 'time cost:%.2f' % (time.time() - oldtime)
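
The script above assumes esa_pca and index already exist. A sketch of how such objects are commonly built with gensim (an assumption; their construction is not part of this listing, and concept_corpus is a hypothetical iterable of sparse concept vectors like vec_tfidf):

from gensim import models, similarities

esa_pca = models.LsiModel(concept_corpus, num_topics=300)  # PCA-like projection
index = similarities.MatrixSimilarity(esa_pca[concept_corpus])
index.num_best = 10  # make index[query] return (doc_number, similarity) pairs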