Beispiel #1
0
    def get_tfidf_of_article_all_corpus(self,title,which_tf='l',which_idf='t'):
        """Compute the tf-idf scores of the article identified by ``title``.

        The entry for ``title`` is looked up in the title -> document mapping
        returned by ``self.get_all_article_token_list()`` and scored against
        the idf data stored in ``self.idf_dict[which_idf]``.

        Args:
            title: Title of the article to score.
            which_tf: Key handed to ``score.get_tf_func`` to select the tf function.
            which_idf: Key handed to ``score.get_idf_func``; it also selects
                which entry of ``self.idf_dict`` is used.

        Returns:
            Whatever ``ScoreAlgorithm.calculateOneWithIdfData`` returns for
            the article.

        Raises:
            AssertionError: If ``title`` is not present in the mapping.
        """
        # Only the title -> document mapping is needed; the first element of
        # the returned pair is not used by this method.
        _, title2Document = self.get_all_article_token_list()

        # Raised explicitly rather than via ``assert False``: assert statements
        # are removed under ``python -O``, which would let execution continue
        # and fail later with an UnboundLocalError on ``tokens``.
        if title not in title2Document:
            raise AssertionError('unknown article title: {!r}'.format(title))
        tokens = title2Document[title]

        strategy = score.ScoreAlgorithm(self.token2id,score.get_tf_func(which_tf),score.get_idf_func(which_idf))
        return strategy.calculateOneWithIdfData(self.idf_dict[which_idf],tokens)
Beispiel #2
0
    def get_tfidf_of_articles(self,which_tf='l',which_idf='t'):
        """Build a mapping from article title to that article's tf-idf scores.

        Every article returned by ``self.get_all_articles()`` is scored with
        ``ScoreAlgorithm.calculateOneWithIdfData`` using the idf data stored
        in ``self.idf_dict[which_idf]``.

        Args:
            which_tf: Key handed to ``score.get_tf_func`` to select the tf function.
            which_idf: Key handed to ``score.get_idf_func``; it also selects
                which entry of ``self.idf_dict`` is used.

        Returns:
            dict: ``{title: tfidf_scores}``. Articles sharing a title overwrite
            each other; the last one processed wins.
        """
        # The return value is not used here; the call is kept in case it has
        # side effects that are not visible from this method.
        self.get_all_article_token_list()

        from backend import score
        scorer = score.ScoreAlgorithm(
            self.token2id,
            score.get_tf_func(which_tf),
            score.get_idf_func(which_idf),
        )

        scores_by_title = {}
        for doc in self.get_all_articles():
            doc_scores = scorer.calculateOneWithIdfData(self.idf_dict[which_idf], doc.getTokens())
            scores_by_title[doc.getTitle()] = doc_scores
        return scores_by_title
Beispiel #3
0
    def make_query_order_by_tfidf(self,query,top_k,which_tf='l',which_idf='t',preview_len=200):
        """Query every corpus and rank the matching articles by tf-idf cosine similarity.

        Each queryer in ``self.queryers`` is asked for the files matching
        ``query``. Every matching article is scored by the cosine similarity
        between its tf-idf scores and those of the query, and the best
        ``top_k`` candidates are returned together with per-article match
        statistics.

        Args:
            query: The query sentence.
            top_k: Maximum number of ranked candidates to consider.
            which_tf: Key handed to ``score.get_tf_func`` to select the tf function.
            which_idf: Key handed to ``score.get_idf_func``; it also selects
                which entry of ``self.idf_dict`` is used.
            preview_len: Maximum length of the content preview per article.

        Returns:
            A 6-tuple ``(titles, previews, corpus_by_title, match_total_by_title,
            match_details_by_title, tokens)``. ``titles`` and ``previews`` are
            parallel lists in ranking order. The three dicts are keyed by
            title and restricted to the returned titles. ``tokens`` is the
            token value returned by the last queryer, or an empty list when
            there are no queryers.
        """
        import heapq

        # All three dicts are keyed by title, so an article title that occurs
        # in several corpora keeps only the entry written last.
        article_corpusname = {}
        article_match_total = {}
        article_token_matchtimes = {}

        scorer = score.ScoreAlgorithm(self.token2id, tf_func=score.get_tf_func(which_tf),
                                      idf_func=score.get_idf_func(which_idf))
        idf_data = self.idf_dict[which_idf]
        query_tfidf = scorer.calculateOneWithIdfData(idf_data, self.tokenizer.tokenize(query))

        # Bound up front so the return statement is valid even when
        # ``self.queryers`` is empty and the loop body never runs.
        tokens = []
        # Entries are (negated similarity, title, content): ascending tuple
        # order therefore puts the most similar article first.
        candidates = []
        for corpus_name, queryer in self.queryers.items():
            files, tokens = queryer.query_files_by_sentence(query, self.tokenizer, error_rate=0.0, flag_spell_check=False)
            for article in queryer.indexer.corpus.getArticlesByPaths(files):
                title = article.getTitle()
                match_times, match_details = self.count_matches_in_article(article, tokens)
                article_match_total[title] = match_times
                article_corpusname[title] = corpus_name
                article_token_matchtimes[title] = match_details

                article_type = article.getType()
                if article_type == 'pubmed':
                    content = article.abstract_text[0]
                elif article_type == 'twitter':
                    content = article.text
                else:
                    # Unknown article types get an empty preview instead of an
                    # unbound name or the previous article's content.
                    content = ''

                article_tfidf = scorer.calculateOneWithIdfData(idf_data, article.getTokens())
                candidates.append((-1 * score.cosine_sim(article_tfidf, query_tfidf), title, content))

        ret_article_titles, ret_article_abstracts = [], []
        # nsmallest returns the candidates in ascending order, the same order
        # in which a priority queue would hand them out.
        for neg_similarity, title, preview in heapq.nsmallest(top_k, candidates):
            if neg_similarity > 0:
                # The score is negated, so a positive value means the cosine
                # similarity itself was below zero.
                continue
            # NOTE(review): debug output on stdout; consider routing it through logging.
            print('queue')
            print((neg_similarity, title))
            ret_article_titles.append(title)
            ret_article_abstracts.append(preview[:preview_len])

        # A set keeps the membership tests below constant-time.
        selected_titles = set(ret_article_titles)
        k_article_corpusname = {k: v for k, v in article_corpusname.items() if k in selected_titles}
        k_article_match_total = {k: v for k, v in article_match_total.items() if k in selected_titles}
        k_article_token_matchtimes = {k: v for k, v in article_token_matchtimes.items() if k in selected_titles}

        return  ret_article_titles,ret_article_abstracts,k_article_corpusname,k_article_match_total, k_article_token_matchtimes,tokens
Beispiel #4
0
def tfidf_compare(token_algorithm,article_title,which_tf,which_idf):
    """Render a tf-idf comparison of one article against every corpus.

    For each corpus the article's tf-idf scores are computed with that
    corpus's vocabulary and idf data. The other articles of the corpus are
    ranked against it with ``score.cosine_sim_rank``. The articles at the
    first and last rank positions are reported as least and most similar.

    Args:
        token_algorithm: ``'porter'`` selects ``ir_sys_porter``; any other
            value selects ``ir_sys``.
        article_title: Title of the article to compare.
        which_tf: Key handed to ``score.get_tf_func`` to select the tf function.
        which_idf: Key handed to ``score.get_idf_func``; it also selects
            which entry of each corpus's ``idf_dict`` is used.

    Returns:
        The result of rendering ``tfidf/compare.html``.
    """
    top_n = 50
    ir_sys_compared = ir_sys_porter if token_algorithm == 'porter' else ir_sys
    # The second element of the pair is not needed by this view.
    article, _query_article_cpname = ir_sys_compared.findArticleByTitle(article_title)

    # res: corpus name -> ((most similar title, its sorted tf-idf),
    #                      (least similar title, its sorted tf-idf))
    res = {}
    corpusName_tfidfWithTile_dict = {}

    for corpus_name_list in ir_sys_compared.corpus_names.values():
        for corpus_name in corpus_name_list:
            corpus = ir_sys_compared.get_corpus_by_name(corpus_name)
            title_tfidf_dict = corpus.get_tfidf_of_articles(which_tf=which_tf,which_idf=which_idf)

            # The article must not be compared with itself.
            title_tfidf_dict.pop(article_title, None)

            items = list(title_tfidf_dict.items())
            if not items:
                # Nothing to compare against in this corpus. Skipping it
                # avoids the ValueError that unpacking an empty zip raises
                # and keeps both result dicts keyed by the same corpora.
                continue
            tfidf_list = tuple(tfidf for _, tfidf in items)

            article_tfidf = score.ScoreAlgorithm(corpus.token2id,tf_func=score.get_tf_func(which_tf),
                idf_func=score.get_idf_func(which_idf)).calculateOneWithIdfData(corpus.idf_dict[which_idf],article.getTokens())

            # Indices into ``items``; presumably ordered by ascending
            # similarity, since the first is used as "least" and the last
            # as "most" similar (verify against score.cosine_sim_rank).
            order_inds = score.cosine_sim_rank(article_tfidf,tfidf_list)

            title_least_sim, least_tfidf = items[order_inds[0]]
            title_most_sim, most_tfidf = items[order_inds[-1]]

            corpusName_tfidfWithTile_dict[corpus_name] = score.sort_tfidf_of_document(article_tfidf,top_n,corpus.id2token)

            res[corpus_name] = ((title_most_sim,score.sort_tfidf_of_document(most_tfidf,top_n,corpus.id2token)),
                                (title_least_sim,score.sort_tfidf_of_document(least_tfidf,top_n,corpus.id2token)))

    return render_template('tfidf/compare.html',article_title=article_title , 
        corpusName_tfidfWithTile_dict = corpusName_tfidfWithTile_dict,res=res,which_tf=which_tf,which_idf=which_idf)