def rank_candidates_for_entity(self, entity_str, username, candidates, incorporate_editnum=False): # create map of candidate -> frequency appears as top ranked candidate top_cand_freq = {} # get list of strings of user's edited articles user_doc = self.get_user_doc(username) print "Successfully loaded or created user model based on text in user's edited articles" # initialize map with entity's candidate resources for candidate_title in candidates: top_cand_freq[candidate_title] = 0 for article_text in user_doc: try: clean_entity_str = text_util.get_clean_doc(entity_str) clean_article_text = text_util.get_clean_doc(article_text) if not clean_entity_str in clean_article_text: continue # entity not in this article if incorporate_editnum: factor = user_doc[article_text] else: factor = 1 # have to do sentence at a time or else breaks toolkits sentences = text_util.get_sentences(article_text) for sentence in sentences: clean_sentence = text_util.get_clean_doc(sentence) if not clean_entity_str in clean_sentence: continue # entity not in this sentence cands = __find_best_candidates_for_entity__(entity_str, clean_entity_str, sentence) if cands is None or len(cands)==0: # tookits unable to detect entity in this sentence at all # or couldn't find any candidates for the entity continue # for each detected entity that matches the one we're searching for, # get its top candidate and update that candidate's entry in the map ranked_cands = Ambiguous_Entity.sort_CandidateResources(cands.values()) for rc in ranked_cands: cand_title = rc.title if cand_title in top_cand_freq: # this is the highest ranked of the candidates we need to resolve # (might be that ambiguous entity goes to candidates -> c1, c2, c3 # and user doc disambiguation maps entity to -> c4, c3, c2, in which case # we need to keep iterating through ranked_cands until encounter one of # the ambiguous entities candidates, ie here c3) top_cand_freq[cand_title] = top_cand_freq[cand_title] + factor break except Exception as 
e: raise print "Unexpected exception ", e continue return top_cand_freq
def build_model(cache=True):
    """Train a Word2Vec model over the crawled reports.

    Parameters:
        cache - when True, load a previously saved model from the cache
            directory if one exists, and save the newly trained model there
    Returns:
        a Word2Vec model instance
    """
    if cache:
        model_path = "%s/word2vec.model" % cache_dir()
        # reuse a previously trained model when one is on disk
        if os.path.isfile(model_path):
            return Word2Vec.load(model_path)
    # build the training corpus: one tokenized sentence per entry
    corpus = []
    for url in crawl_report_list():
        html = get(url)
        enc, time, title, text = ce.parse(url, html)
        for sentence in text_util.get_sentences(text):
            corpus.append([token for token in jieba.cut(sentence)])
    model = Word2Vec(corpus)
    if cache:
        model.save(model_path)
    return model
def tf(cache=True, force=False):
    """Compute term frequencies over all crawled reports and plans.

    Tokenizes every page's text with jieba, counts word occurrences, and
    writes the counts (word<TAB>count, descending by count, UTF-8 encoded)
    to tf.txt in the cache directory.

    Parameters:
        cache - when True, write the result file; combined with not-force,
            also skip recomputation when the file already exists
        force - when True, recompute even if the cached file exists
    Returns:
        True
    """
    f = "%s/tf.txt" % cache_dir()
    if cache and not force:
        # cached result already on disk; nothing to do
        if os.path.isfile(f):
            return True
    d = defaultdict(int)
    for url in (crawl_report_list() + crawl_plan_list()):
        html = get(url)
        enc, time, title, text = ce.parse(url, html)
        sentences = text_util.get_sentences(text)
        for s in sentences:
            for w in jieba.cut(s):
                d[w] += 1
    r = sorted(d.items(), key=lambda x: x[1], reverse=True)
    if cache:
        # BUG FIX: the file was opened without a with/try-finally, so an
        # exception mid-write leaked the handle and could drop buffered
        # output; the with-statement guarantees close on every path.
        with open(f, "w") as out:
            for k, v in r:
                out.write(("%s\t%s\n" % (k, v)).encode('utf-8', 'ignore'))
    return True