def getTopRecords(web, vid, loadtime='0', topnum=10, mtype=None, click=0):
    """Fetch the top merged records for *mtype* and wrap them as NewsInfo items.

    Returns an empty list when mtype is not a known merge category, when the
    table query reports failure (-1), or when it yields no rows.
    """
    results = []
    if mtype not in merge_cn:
        return results
    # Dispatch on merge type: newest, most-clicked, or filtered by mtype.
    if mtype == type_new:
        rows = tablemerge.getTopRecords(ctable, topnum)
    elif mtype == type_hot:
        rows = tablemerge.getTopClickRecords(ctable, topnum)
    else:
        rows = tablemerge.getTopRecords(ctable, topnum, mtype)
    # tablemerge signals failure with -1; guard before taking len()/iterating.
    if rows == -1 or len(rows) == 0:
        return results
    # Column layout of a merge row:
    # 0 id, 1 vid, 2 title, 3 url, 4 thumb, 5 summary, 6 keywords, 7 newsid,
    # 8 vtype, 9 source, 10 related, 11 loadtime, 12 duration, 13 web,
    # 14 mvid, 15 mtype, 16 click
    for row in rows:
        # NewsInfo(vid, title, url, thumb, brief, source, loadtime,
        #          duration, web, mtype, click)
        results.append(NewsInfo(row[1], row[2], row[3], row[4], row[5],
                                row[9], row[11], row[12], row[13],
                                row[15], row[16]))
    return results
def getTopRecords(web, mvid, loadtime='0', topnum=10, mtype=None, click=0):
    """Return NewsInfo wrappers for the top merged records of *mtype*.

    An unknown mtype, a failed query (-1), or an empty result all yield [].

    NOTE(review): same name as the 17-column variant elsewhere in this file —
    presumably they live in (or are destined for) different modules; confirm.
    """
    if mtype not in merge_cn:
        return []
    # Pick the query that matches the requested merge type.
    if mtype == type_new:
        records = tablemerge.getTopRecords(ctable, topnum)
    elif mtype == type_hot:
        records = tablemerge.getTopClickRecords(ctable, topnum)
    else:
        records = tablemerge.getTopRecords(ctable, topnum, mtype)
    # -1 marks a failed query; len() guards the empty result.
    if records == -1 or len(records) == 0:
        return []
    # Column layout of a merge row:
    # 0 id, 1 webid, 2 vid, 3 title, 4 url, 5 thumb, 6 summary, 7 keywords,
    # 8 newsid, 9 vtype, 10 source, 11 related, 12 loadtime, 13 duration,
    # 14 web, 15 mvid, 16 mtype, 17 click
    # NewsInfo(vid, title, url, thumb, brief, source, loadtime, duration,
    #          web, mvid, mtype, click)
    return [NewsInfo(r[2], r[3], r[4], r[5], r[6], r[10],
                     r[12], r[13], r[14], r[15], r[16], r[17])
            for r in records]
    # NOTE(review): fragment — this is the tail of _get_concept_vec_prune;
    # its `def` line and the outer loop binding wordid/tfidf_v (plus
    # dic_tfidf, wordList, prune_at, word2doc_mat) sit outside this chunk.
    # Indentation below is reconstructed — confirm against the full file.
        # Spread this word's tf-idf weight onto every document it maps to,
        # accumulating per-document scores in dic_tfidf.
        for docid, weight in word2doc_mat[wordid]:
            value = dic_tfidf.get(docid, 0)
            value += tfidf_v * weight
            dic_tfidf[docid] = value
    #print 'tfidf:',vec_tfidf
    #print 'bow:',vec_bow
    # Warn when no document accumulated any weight for this query.
    if not dic_tfidf.values():
        print 'Document not recruited:', ' '.join(wordList)
    # Prune: keep only documents scoring within prune_at of the best score.
    # NOTE(review): max() raises ValueError on an empty dict — presumably the
    # warning branch above was meant to return early as well; confirm.
    limit_low = prune_at * max(dic_tfidf.iteritems(), key=lambda i: i[1])[1]
    concept_vec = []
    for item in dic_tfidf.iteritems():
        if item[1] >= limit_low:
            concept_vec.append(item)
    # Sorted by docid so the vector is in gensim's expected (id, weight) order.
    return sorted(concept_vec)

# --- ad-hoc demo / timing script: similarity query for one record's title ---
oldtime = time.time()
rows = tablemerge.getTopRecords(dbconfig.mergetable, 10)
# Take the title column of the 4th row as the query text.
title = rows[3][2]
# Tokenize with jieba, lowercase, strip punctuation via delpunc.
doc = delpunc(' '.join(jieba.cut(title)).lower())
vec_tfidf = _get_concept_vec_prune(doc.split())  # convert the query to concept space
# Project into the PCA-reduced ESA space, then query the similarity index.
vec_pca = esa_pca[vec_tfidf]
# print vec_pca
sims = index[vec_pca]  # perform a similarity query against the corpus
# print sims
# print (document_number, document_similarity) 2-tuples
print doc
# news_file: one document per line, first token presumably an id — verify.
doc_list = list(open(news_file))
for sim in sims:
    print sim[1], ' '.join(doc_list[sim[0]].strip().split()[1:])
print 'time cost:%s' % str(time.time() - oldtime)