def get_tfidf_of_article_all_corpus(self, title, which_tf='l', which_idf='t'):
    """Return the TF-IDF scores of the article identified by ``title``.

    The article's tokens are taken from the second value returned by
    ``self.get_all_article_token_list()``, which is indexed by title. They
    are scored by a ``score.ScoreAlgorithm`` built from ``self.token2id`` and
    the selected TF / IDF functions, using ``self.idf_dict[which_idf]`` as
    the IDF data.

    Args:
        title: Title of the article to score.
        which_tf: Selector passed to ``score.get_tf_func``.
        which_idf: Selector passed to ``score.get_idf_func``; it is also the
            key used to index ``self.idf_dict``.

    Returns:
        The result of ``ScoreAlgorithm.calculateOneWithIdfData`` for the
        article's tokens.

    Raises:
        AssertionError: If ``title`` is not present in the title index.
    """
    _, title2document = self.get_all_article_token_list()
    if title not in title2document:
        # Raised explicitly instead of ``assert False``: assert statements
        # are removed under ``python -O``, which would let execution fall
        # through with no tokens bound. The exception type is kept so that
        # existing callers observe the same error.
        raise AssertionError("unknown article title: {!r}".format(title))
    tokens = title2document[title]

    strategy = score.ScoreAlgorithm(
        self.token2id,
        score.get_tf_func(which_tf),
        score.get_idf_func(which_idf),
    )
    return strategy.calculateOneWithIdfData(self.idf_dict[which_idf], tokens)
def get_tfidf_of_articles(self, which_tf='l', which_idf='t'):
    """Build a mapping from article title to that article's TF-IDF scores.

    Every article returned by ``self.get_all_articles()`` is scored with a
    ``score.ScoreAlgorithm`` built from ``self.token2id`` and the selected
    TF / IDF functions, using ``self.idf_dict[which_idf]`` as the IDF data.

    Args:
        which_tf: Selector passed to ``score.get_tf_func``.
        which_idf: Selector passed to ``score.get_idf_func``; it is also the
            key used to index ``self.idf_dict``.

    Returns:
        dict keyed by ``article.getTitle()`` whose values are the results of
        ``ScoreAlgorithm.calculateOneWithIdfData`` for ``article.getTokens()``.
        If two articles share a title, the later one overwrites the earlier.
    """
    # The returned values are not used here; the call itself is kept.
    # NOTE(review): presumably it has a side effect (e.g. warming a cache) -
    # confirm before removing.
    self.get_all_article_token_list()

    from backend import score

    scorer = score.ScoreAlgorithm(
        self.token2id,
        score.get_tf_func(which_tf),
        score.get_idf_func(which_idf),
    )

    scores_by_title = {}
    for doc in self.get_all_articles():
        doc_scores = scorer.calculateOneWithIdfData(
            self.idf_dict[which_idf], doc.getTokens()
        )
        scores_by_title[doc.getTitle()] = doc_scores
    return scores_by_title
def make_query_order_by_tfidf(self, query, top_k, which_tf='l', which_idf='t', preview_len=200):
    """Run ``query`` against every queryer and rank the hits by TF-IDF similarity.

    Each queryer in ``self.queryers`` is asked for the files matching the
    query. Every resulting article is scored by the cosine similarity
    (``score.cosine_sim``) between its TF-IDF vector and the query's TF-IDF
    vector, and up to ``top_k`` of the best-scoring articles are returned.

    Args:
        query: Query text; tokenized with ``self.tokenizer``.
        top_k: Maximum number of ranked entries to take from the ranking.
        which_tf: Selector passed to ``score.get_tf_func``.
        which_idf: Selector passed to ``score.get_idf_func``; it is also the
            key used to index ``self.idf_dict``.
        preview_len: Maximum length of each returned preview text.

    Returns:
        A 6-tuple ``(titles, previews, corpus_by_title, match_total_by_title,
        match_details_by_title, tokens)``. ``titles`` and ``previews`` are
        parallel lists in ranking order. The three dicts are restricted to
        the returned titles. ``tokens`` is the token value produced by the
        last queryer's ``query_files_by_sentence`` call.
    """
    import heapq

    article_corpusname = {}
    article_match_total = {}
    article_token_matchtimes = {}

    # Min-heap of (negated similarity, title, preview text). Negating the
    # similarity makes the most similar article pop first.
    ranked = []

    # Bound up front so the final ``return`` works when there are no
    # queryers at all.
    # NOTE(review): an empty list is an assumed "no tokens" value - confirm
    # against what query_files_by_sentence returns.
    tokens = []

    score_algorithm = score.ScoreAlgorithm(
        self.token2id,
        tf_func=score.get_tf_func(which_tf),
        idf_func=score.get_idf_func(which_idf),
    )
    idf_data = self.idf_dict[which_idf]
    query_tfidf = score_algorithm.calculateOneWithIdfData(
        idf_data, self.tokenizer.tokenize(query)
    )

    for corpus_name, queryer in self.queryers.items():
        files, tokens = queryer.query_files_by_sentence(
            query, self.tokenizer, error_rate=0.0, flag_spell_check=False
        )
        for article in queryer.indexer.corpus.getArticlesByPaths(files):
            title = article.getTitle()
            match_times, match_details = self.count_matches_in_article(article, tokens)
            article_match_total[title] = match_times
            article_corpusname[title] = corpus_name
            article_token_matchtimes[title] = match_details

            article_type = article.getType()
            if article_type == 'pubmed':
                content = article.abstract_text[0]
            elif article_type == 'twitter':
                content = article.text
            else:
                # Without this branch an unrecognised type either raised
                # UnboundLocalError or silently reused the previous
                # article's text as this article's preview.
                content = ''

            article_tfidf = score_algorithm.calculateOneWithIdfData(
                idf_data, article.getTokens()
            )
            heapq.heappush(
                ranked,
                (-1 * score.cosine_sim(article_tfidf, query_tfidf), title, content),
            )

    ret_article_titles, ret_article_abstracts = [], []
    for _ in range(min(top_k, len(ranked))):
        neg_similarity, title, abstract = heapq.heappop(ranked)
        if neg_similarity > 0:
            # A positive key means a negative similarity; such hits are dropped.
            continue
        # Debug trace kept exactly as emitted by the original implementation.
        print('queue')
        print((neg_similarity, title))
        ret_article_titles.append(title)
        ret_article_abstracts.append(abstract[:preview_len])

    # Set membership keeps the three filters below linear in the number of
    # matched articles.
    returned_titles = set(ret_article_titles)
    k_article_corpusname = {
        k: v for k, v in article_corpusname.items() if k in returned_titles
    }
    k_article_match_total = {
        k: v for k, v in article_match_total.items() if k in returned_titles
    }
    k_article_token_matchtimes = {
        k: v for k, v in article_token_matchtimes.items() if k in returned_titles
    }
    return (
        ret_article_titles,
        ret_article_abstracts,
        k_article_corpusname,
        k_article_match_total,
        k_article_token_matchtimes,
        tokens,
    )
def tfidf_compare(token_algorithm, article_title, which_tf, which_idf):
    """Render the TF-IDF comparison page for one article.

    For every corpus named in the selected IR system's ``corpus_names``, the
    article's TF-IDF vector is computed with that corpus's ``token2id`` and
    IDF data, and compared against every other article of the corpus using
    ``score.cosine_sim_rank``. The first and last entries of that ranking are
    reported as the least and most similar articles respectively.

    Args:
        token_algorithm: ``'porter'`` selects ``ir_sys_porter``; any other
            value selects ``ir_sys``.
        article_title: Title of the article to compare.
        which_tf: Selector passed to ``score.get_tf_func``.
        which_idf: Selector passed to ``score.get_idf_func``; it is also the
            key used to index each corpus's ``idf_dict``.

    Returns:
        The result of rendering ``tfidf/compare.html``.
    """
    top_n = 50
    ir_sys_compared = ir_sys_porter if token_algorithm == 'porter' else ir_sys
    article, _ = ir_sys_compared.findArticleByTitle(article_title)

    # res maps corpus name -> ((most similar title, its sorted tf-idf),
    #                          (least similar title, its sorted tf-idf)).
    res = {}
    corpusName_tfidfWithTile_dict = {}

    for corpus_name_list in ir_sys_compared.corpus_names.values():
        for corpus_name in corpus_name_list:
            corpus = ir_sys_compared.get_corpus_by_name(corpus_name)
            title_tfidf_dict = corpus.get_tfidf_of_articles(
                which_tf=which_tf, which_idf=which_idf
            )
            # The article must not be compared with itself.
            title_tfidf_dict.pop(article_title, None)
            if not title_tfidf_dict:
                # Nothing left to compare against in this corpus. Unpacking
                # zip(*items) on an empty list used to raise ValueError here,
                # so the corpus is skipped instead.
                continue

            items = list(title_tfidf_dict.items())
            tfidf_list = tuple(tfidf for _, tfidf in items)

            article_tfidf = score.ScoreAlgorithm(
                corpus.token2id,
                tf_func=score.get_tf_func(which_tf),
                idf_func=score.get_idf_func(which_idf),
            ).calculateOneWithIdfData(corpus.idf_dict[which_idf], article.getTokens())

            order_inds = score.cosine_sim_rank(article_tfidf, tfidf_list)
            title_least_sim, least_tfidf = items[order_inds[0]]
            title_most_sim, most_tfidf = items[order_inds[-1]]

            corpusName_tfidfWithTile_dict[corpus_name] = score.sort_tfidf_of_document(
                article_tfidf, top_n, corpus.id2token
            )
            res[corpus_name] = (
                (title_most_sim,
                 score.sort_tfidf_of_document(most_tfidf, top_n, corpus.id2token)),
                (title_least_sim,
                 score.sort_tfidf_of_document(least_tfidf, top_n, corpus.id2token)),
            )

    return render_template(
        'tfidf/compare.html',
        article_title=article_title,
        corpusName_tfidfWithTile_dict=corpusName_tfidfWithTile_dict,
        res=res,
        which_tf=which_tf,
        which_idf=which_idf,
    )