Example #1
0
class GetTag(object):
    """Suggest topic tags for a piece of text.

    Words of the text are weighted with ``self.idf.tf_idf``; every word whose
    weight is above the average contributes ``tfidf * bayes`` to each topic
    stored for that word in ``self.db``.  The best-scoring topics that also
    occur in the text are then yielded by :meth:`get_tag`.
    """

    # At most this many of the top-ranked topics are examined.
    MAX_TOPICS = 50
    # Scanning stops after the first topic whose score divided by the mean
    # topic score drops below this ratio.
    MIN_RANK_RATIO = 6

    def __init__(self):
        self.idf = idf_zhihu()
        self.db = DbKyoto('bayes.kch')

    def get_tag(self, txt):
        """Yield ``(topic, relative_rank)`` pairs for ``txt``.

        ``relative_rank`` is the topic's accumulated score divided by the
        mean score of all candidate topics.  Topics are visited in
        descending score order.  Nothing is yielded when ``txt`` produces no
        TF-IDF terms or no topic is found for its words.

        NOTE(review): the threshold check runs *after* the yield, so the
        first topic below ``MIN_RANK_RATIO`` is still emitted if it occurs
        in the text.  Kept as in the original; confirm this is intended.
        """
        tfidf_list = sorted(
            self.idf.tf_idf(txt), key=lambda x: x[1], reverse=True
        )
        if not tfidf_list:
            # No terms at all: bail out instead of dividing by zero below.
            return

        average_tfidf = (
            sum(item[1] for item in tfidf_list) / float(len(tfidf_list))
        )
        # Only words weighted above the average take part in the ranking.
        keywords = [item for item in tfidf_list if item[1] > average_tfidf]

        topic_rank = defaultdict(float)
        word_ids = WORD2ID.id_list_by_word_list(item[0] for item in keywords)
        for (_word, word_tfidf), word_id in zip(keywords, word_ids):
            # Iterated below as (topic_id, bayes) pairs.
            topic_items = self.db.get(word_id)
            if not topic_items:
                continue
            for topic_id, bayes in topic_items:
                topic_rank[topic_id] += word_tfidf * bayes

        if not topic_rank:
            return

        # .items() works on both Python 2 and 3 (iteritems() is 2-only).
        ranked = sorted(topic_rank.items(), key=lambda x: x[1], reverse=True)
        rank_avg = float(sum(score for _, score in ranked)) / len(ranked)
        txt = txt.lower()

        for topic_id, rank in ranked[:self.MAX_TOPICS]:
            topic = ID2TAG[topic_id]
            rank_t = rank / rank_avg
            # Per the original note: the suggested topic is split into
            # bigrams (sp_txt); if none of them appears in the text, the
            # topic is dropped.
            if any(seg in txt for seg in sp_txt(topic)):
                yield topic, rank_t

            if rank_t < self.MIN_RANK_RATIO:
                break
Example #2
0
 def __init__(self):
     """Set up the two collaborators held by the instance.

     ``self.idf`` receives the object returned by ``idf_zhihu()`` and
     ``self.db`` a ``DbKyoto`` opened on ``'bayes.kch'``.
     NOTE(review): presumably a TF-IDF model and a Kyoto Cabinet store of
     Bayes weights -- confirm against their definitions.
     """
     idf_model = idf_zhihu()
     self.idf = idf_model
     bayes_store = DbKyoto('bayes.kch')
     self.db = bayes_store