def tag_id_rank_list_by_txt(txt):
    """Score tags for a piece of text and keep those literally mentioned in it.

    The text is lowercased first. Every ``(word, rank)`` pair produced by
    ``tf_idf_seg_txt`` is looked up in ``db_tag_bayes``; a truthy hit is
    loaded into an unsigned-int array and read as ``(tag_id, bayes)`` pairs
    (presumably consecutive pairs -- confirm against ``chunkiter``).  Each
    pair adds ``bayes * rank`` to that tag's running score.

    Tags are then walked from highest to lowest score.  A tag is kept when
    its id is in ``ID2NAME`` and at least one of its names is found in the
    lowercased text:

    * if ``sp_txt(name)`` yields pieces, any piece (via ``str``) being a
      substring of the text counts as a match;
    * otherwise the whole name must be a substring of the text.

    NOTE(review): names are compared against lowercased text without being
    lowercased themselves -- presumably ID2NAME already stores lowercase
    names; confirm.

    Returns a list of ``(tag_id, score)`` tuples, highest score first.
    """
    txt = txt.lower()

    # Phase 1: accumulate a weighted score per tag id.
    score_by_tag = defaultdict(int)
    for word, weight in tf_idf_seg_txt(txt):
        packed = db_tag_bayes.get(word)
        if not packed:
            continue
        pairs = array('I')
        pairs.fromstring(packed)
        for tag_id, bayes in chunkiter(pairs, 2):
            score_by_tag[tag_id] += bayes * weight

    def _mentioned(name):
        # True when the name (or one of its sp_txt pieces) occurs in txt.
        pieces = list(sp_txt(name))
        if pieces:
            return any(str(piece) in txt for piece in pieces)
        return name in txt

    # Phase 2: best-first, drop unknown ids and tags the text never mentions.
    ranked = sorted(score_by_tag.iteritems(), key=itemgetter(1), reverse=True)
    return [
        (tag_id, score)
        for tag_id, score in ranked
        if tag_id in ID2NAME
        and any(_mentioned(name) for name in ID2NAME[tag_id])
    ]
def tag_id_rank_list_by_txt(txt):
    """Return ``(tag_id, score)`` pairs for tags that appear in ``txt``.

    Steps, all performed on the lowercased text:

    1. For each ``(term, weight)`` from ``tf_idf_seg_txt``, fetch the entry
       stored under that term in ``db_tag_bayes``.  A truthy entry is
       decoded into an ``array("I")`` and consumed two items at a time as
       ``(tag_id, bayes)``; ``bayes * weight`` is added to the tag's total.
    2. Tags are sorted by total, largest first.
    3. A tag survives only if its id is a key of ``ID2NAME`` and one of the
       names listed there is present in the text.  When ``sp_txt(name)``
       produces fragments, presence means some fragment (converted with
       ``str``) is a substring of the text; when it produces none, the
       name itself must be a substring.

    The result preserves the descending-score order from step 2.
    """
    txt = txt.lower()

    totals = defaultdict(int)
    for term, weight in tf_idf_seg_txt(txt):
        raw = db_tag_bayes.get(term)
        if raw:
            flat = array("I")
            flat.fromstring(raw)
            for tag_id, bayes in chunkiter(flat, 2):
                totals[tag_id] += bayes * weight

    ordered = sorted(totals.iteritems(), key=itemgetter(1), reverse=True)

    matched = []
    for tag_id, total in ordered:
        if tag_id not in ID2NAME:
            continue

        for alias in ID2NAME[tag_id]:
            fragments = list(sp_txt(alias))
            if fragments:
                # Fragmented name: one fragment in the text is enough.
                hit = False
                for fragment in fragments:
                    if str(fragment) in txt:
                        hit = True
                        break
            else:
                # No fragments: fall back to the whole name.
                hit = alias in txt

            if hit:
                # First matching name settles it; skip the remaining names.
                matched.append((tag_id, total))
                break

    return matched
#coding:utf-8
import _env
from name2id import NAME2ID
from zkit.txt_cleanup import sp_txt
from collections import defaultdict
from zkit.pprint import pprint

# Fragment index: each piece yielded by sp_txt(name) maps to the list of
# tag names that produced it.  Despite the variable's name, the values
# stored here are names, not ids.
sp2id = defaultdict(list)
for name, _ in NAME2ID.iteritems():
    for piece in sp_txt(name):
        sp2id[piece].append(name)

# For every name, look at the other names sharing one of its pieces.  When
# a different name contains this one as a substring, record this name's id
# in the set kept under the containing name's id.
word_parent = defaultdict(set)
for name, _ in NAME2ID.iteritems():
    for piece in sp_txt(name):
        for other in sp2id[piece]:
            if other == name or name not in other:
                continue
            word_parent[NAME2ID[other]].add(NAME2ID[name])

# Reverse lookup (id -> name); handy for dumping word_parent in readable
# form while debugging.
id2name = dict((tag_id, name) for name, tag_id in NAME2ID.iteritems())

# Freeze the result: plain dict, with each id set turned into a tuple.
word_parent = dict(
    (tag_id, tuple(contained_ids))
    for tag_id, contained_ids in word_parent.iteritems()
)