class GetTag(object):
    """Suggest topic tags for a text.

    Word weights from ``self.idf.tf_idf`` are combined with the per-word
    ``(topic_id, bayes)`` entries stored in ``self.db`` to rank topics; the
    best-ranked topics whose name actually occurs in the text are yielded.
    """

    def __init__(self):
        # Object whose ``tf_idf(txt)`` result is iterated as (word, weight)
        # pairs by this class.
        self.idf = idf_zhihu()
        # Store queried with ``get(word_id)``; a truthy result is iterated as
        # (topic_id, bayes) pairs.
        self.db = DbKyoto('bayes.kch')

    def _rank_topics(self, txt):
        """Return ``(topic_id, score)`` pairs sorted by descending score.

        Returns an empty list when ``tf_idf`` produces no terms for *txt*.
        """
        weighted = sorted(
            self.idf.tf_idf(txt), key=lambda pair: pair[1], reverse=True
        )
        if not weighted:
            # Nothing to average over; the original divided by zero here.
            return []

        average = sum(pair[1] for pair in weighted) / float(len(weighted))
        # Keep only terms strictly above the mean weight.
        # NOTE(review): because the comparison is strict, a text whose terms
        # all carry the same weight keeps no term at all (barring float
        # rounding) -- confirm this is intended.
        keywords = [pair for pair in weighted if pair[1] > average]

        word_ids = WORD2ID.id_list_by_word_list(pair[0] for pair in keywords)
        scores = defaultdict(float)
        for (_word, word_tfidf), word_id in zip(keywords, word_ids):
            topic_items = self.db.get(word_id)
            if topic_items:
                for topic_id, bayes in topic_items:
                    scores[topic_id] += word_tfidf * bayes

        # ``items()`` instead of ``iteritems()`` so this runs on Python 2 and 3.
        return sorted(scores.items(), key=lambda pair: pair[1], reverse=True)

    def get_tag(self, txt, max_topics=50, min_rank=6):
        """Yield ``(topic, relative_rank)`` for topics suggested for *txt*.

        Args:
            txt: The text to tag.
            max_topics: How many of the top-ranked topics to consider.
            min_rank: Iteration stops after the first considered topic whose
                relative rank (score divided by the mean score of all ranked
                topics) is below this value.

        Yields:
            ``(topic, relative_rank)`` tuples in descending rank order, where
            ``topic`` is ``ID2TAG[topic_id]``. Nothing is yielded when no
            topic can be ranked.
        """
        topic_rank = self._rank_topics(txt)
        if not topic_rank:
            return

        rank_avg = float(sum(pair[1] for pair in topic_rank)) / len(topic_rank)
        if not rank_avg:
            # A zero mean cannot be used to normalise scores; the original
            # raised ZeroDivisionError here.
            return

        txt = txt.lower()
        for topic_id, rank in topic_rank[:max_topics]:
            topic = ID2TAG[topic_id]
            rank_t = rank / rank_avg
            # Split the candidate topic into segments with ``sp_txt``; drop the
            # topic unless at least one segment appears in the lower-cased text.
            if any(seg in txt for seg in sp_txt(topic)):
                yield topic, rank_t
            # Scores are sorted in descending order, so later topics rank no
            # higher than this one.
            # NOTE(review): the check runs after the yield, so the first topic
            # below ``min_rank`` can still be emitted -- confirm intended.
            if rank_t < min_rank:
                break
def __init__(self):
    """Initialise ``self.idf`` and ``self.db``.

    ``self.idf`` receives the result of ``idf_zhihu()`` and ``self.db`` a
    ``DbKyoto`` opened on ``'bayes.kch'``.

    NOTE(review): this repeats the body of ``GetTag.__init__`` and appears
    outside the class in this view -- confirm whether it is a leftover
    duplicate.
    """
    self.idf = idf_zhihu()
    self.db = DbKyoto('bayes.kch')