Ejemplo n.º 1
0
def getTopRecords(web,vid,loadtime='0',topnum=10,mtype=None,click=0):
    """Build NewsInfo objects from the top records for a merge type.

    Only ``topnum`` and ``mtype`` are read by this body; ``web``, ``vid``,
    ``loadtime`` and ``click`` are accepted but not used here.

    The query is chosen by ``mtype``:
      * ``type_new`` -> ``tablemerge.getTopRecords(ctable, topnum)``
      * ``type_hot`` -> ``tablemerge.getTopClickRecords(ctable, topnum)``
      * otherwise   -> ``tablemerge.getTopRecords(ctable, topnum, mtype)``

    Returns a list of NewsInfo objects. The list is empty when ``mtype`` is
    not in ``merge_cn``, or when the query returns ``-1`` (presumably a
    failure sentinel -- confirm against ``tablemerge``) or no rows.
    """
    if mtype not in merge_cn:
        return []

    if mtype == type_new:
        rows = tablemerge.getTopRecords(ctable, topnum)
    elif mtype == type_hot:
        rows = tablemerge.getTopClickRecords(ctable, topnum)
    else:
        rows = tablemerge.getTopRecords(ctable, topnum, mtype)

    if rows == -1 or len(rows) == 0:
        return []

    # Row layout, as noted by the original author:
    #   0 id, 1 vid, 2 title, 3 url, 4 thumb, 5 summary, 6 keywords,
    #   7 newsid, 8 vtype, 9 source, 10 related, 11 loadtime, 12 duration,
    #   13 web, 14 mvid, 15 mtype, 16 click
    # NewsInfo receives, in order:
    #   vid, title, url, thumb, brief, source, loadtime, duration, web,
    #   mtype, click
    return [
        NewsInfo(row[1], row[2], row[3], row[4], row[5], row[9], row[11],
                 row[12], row[13], row[15], row[16])
        for row in rows
    ]
Ejemplo n.º 2
0
def getTopRecords(web,mvid,loadtime='0',topnum=10,mtype=None,click=0):
    """Build NewsInfo objects from the top records for a merge type.

    Only ``topnum`` and ``mtype`` are read by this body; ``web``, ``mvid``,
    ``loadtime`` and ``click`` are accepted but not used here.

    The query is chosen by ``mtype``:
      * ``type_new`` -> ``tablemerge.getTopRecords(ctable, topnum)``
      * ``type_hot`` -> ``tablemerge.getTopClickRecords(ctable, topnum)``
      * otherwise   -> ``tablemerge.getTopRecords(ctable, topnum, mtype)``

    Returns a list of NewsInfo objects. The list is empty when ``mtype`` is
    not in ``merge_cn``, or when the query returns ``-1`` (presumably a
    failure sentinel -- confirm against ``tablemerge``) or no rows.
    """
    if mtype not in merge_cn:
        return []

    if mtype == type_new:
        rows = tablemerge.getTopRecords(ctable, topnum)
    elif mtype == type_hot:
        rows = tablemerge.getTopClickRecords(ctable, topnum)
    else:
        rows = tablemerge.getTopRecords(ctable, topnum, mtype)

    if rows == -1 or len(rows) == 0:
        return []

    # Row layout, as noted by the original author:
    #   0 id, 1 webid, 2 vid, 3 title, 4 url, 5 thumb, 6 summary,
    #   7 keywords, 8 newsid, 9 vtype, 10 source, 11 related, 12 loadtime,
    #   13 duration, 14 web, 15 mvid, 16 mtype, 17 click
    # NewsInfo receives, in order:
    #   vid, title, url, thumb, brief, source, loadtime, duration, web,
    #   mvid, mtype, click
    return [
        NewsInfo(row[2], row[3], row[4], row[5], row[6], row[10],
                 row[12], row[13], row[14], row[15], row[16], row[17])
        for row in rows
    ]
Ejemplo n.º 3
0
        for docid,weight in word2doc_mat[wordid]:
            value=dic_tfidf.get(docid,0)
            value+=tfidf_v*weight
            dic_tfidf[docid]=value
    #print 'tfidf:',vec_tfidf
    #print 'bow:',vec_bow
    if not dic_tfidf.values():        
        print 'Document not recruited:',' '.join(wordList)
    limit_low=prune_at*max(dic_tfidf.iteritems(),key=lambda i:i[1])[1]
    concept_vec=[]
    for item in dic_tfidf.iteritems():
        if item[1]>=limit_low:
            concept_vec.append(item)
    return sorted(concept_vec)    

oldtime=time.time()
rows=tablemerge.getTopRecords(dbconfig.mergetable, 10)
title=rows[3][2]
doc=delpunc(' '.join(jieba.cut(title)).lower())
vec_tfidf = _get_concept_vec_prune(doc.split())# convert the query to concept space
vec_pca=esa_pca[vec_tfidf]
# print vec_pca
sims = index[vec_pca] # perform a similarity query against the corpus
# print sims # print (document_number, document_similarity) 2-tuples

print doc
doc_list=list(open(news_file))
for sim in sims:
    print sim[1],' '.join(doc_list[sim[0]].strip().split()[1:])

print 'time cost:%s' % str(time.time()-oldtime)