Example #1
    def rank_candidates_for_entity(self, entity_str, username, candidates, incorporate_editnum=False):
        
        # map of candidate -> how often it appears as the top-ranked candidate
        top_cand_freq = {}
        
        # get the user's edited articles (maps each article's text to its edit count)
        user_doc = self.get_user_doc(username)
        print "Successfully loaded or created user model based on text in user's edited articles"
        
        # initialize map with entity's candidate resources
        for candidate_title in candidates:
            top_cand_freq[candidate_title] = 0
        
        for article_text in user_doc:
            try:
                clean_entity_str = text_util.get_clean_doc(entity_str)
                clean_article_text = text_util.get_clean_doc(article_text)
                if clean_entity_str not in clean_article_text:
                    continue  # entity not in this article
                
                if incorporate_editnum:
                    factor = user_doc[article_text]
                else:
                    factor = 1
                
                # process one sentence at a time, otherwise the toolkits break
                sentences = text_util.get_sentences(article_text)
                for sentence in sentences:
                    clean_sentence = text_util.get_clean_doc(sentence)
                    if clean_entity_str not in clean_sentence:
                        continue  # entity not in this sentence
 
                    cands = __find_best_candidates_for_entity__(entity_str, clean_entity_str, sentence)
                    if not cands:
                        # toolkits were unable to detect the entity in this sentence,
                        # or couldn't find any candidates for it
                        continue
                    
                    # For each detected entity that matches the one we're searching for,
                    # get its top candidate and update that candidate's entry in the map.
                    ranked_cands = Ambiguous_Entity.sort_CandidateResources(list(cands.values()))
                    for rc in ranked_cands:
                        cand_title = rc.title
                        if cand_title in top_cand_freq:
                            # This is the highest-ranked candidate among those we need to
                            # resolve. (The ambiguous entity may map to candidates c1, c2, c3
                            # while the user-doc disambiguation ranks the entity as c4, c3, c2;
                            # in that case we keep iterating through ranked_cands until we
                            # encounter one of the ambiguous entity's candidates, i.e. c3 here.)
                            top_cand_freq[cand_title] += factor
                            break
            except Exception as e:
                # log and skip this article rather than aborting the whole ranking
                print("Unexpected exception:", e)
                continue
        return top_cand_freq
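The returned map counts, over the user's edit history, how often each candidate comes out top-ranked. A minimal usage sketch follows; UserModelRanker, the entity string, and the candidate titles are all hypothetical placeholders, not names from the project:

# Hypothetical usage; UserModelRanker and all values below are illustrative.
ranker = UserModelRanker()  # assumed class exposing rank_candidates_for_entity
freqs = ranker.rank_candidates_for_entity(
    "jaguar", "some_user",
    ["Jaguar (animal)", "Jaguar Cars", "Jacksonville Jaguars"],
    incorporate_editnum=True)
best = max(freqs, key=freqs.get)  # candidate the user's history supports most
print(best, freqs[best])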
Example #2
import os

import jieba
from gensim.models import Word2Vec

# cache_dir, crawl_report_list, get, ce and text_util are project-local
# helpers (not shown here).


def build_model(cache=True):
    f = "%s/word2vec.model" % cache_dir()
    # reuse a previously trained model if one is cached on disk
    if cache and os.path.isfile(f):
        return Word2Vec.load(f)
    texts = []
    for url in crawl_report_list():
        html = get(url)
        enc, time, title, text = ce.parse(url, html)
        sentences = text_util.get_sentences(text)
        for s in sentences:
            # segment each Chinese sentence into a list of tokens
            texts.append(list(jieba.cut(s)))
    b = Word2Vec(texts)
    if cache:
        b.save(f)
    return b
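Once trained (or loaded from cache), the result is a regular gensim Word2Vec instance and can be queried through gensim's standard API; the query token below is only illustrative:

model = build_model()
# nearest neighbours of a token in the learned embedding space
for word, score in model.wv.most_similar("经济", topn=5):
    print(word, score)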
Example #3
File: cn.py Project: liuzl/nlp4econ
import os
from collections import defaultdict

import jieba

# cache_dir, crawl_report_list, crawl_plan_list, get, ce and text_util are
# project-local helpers defined elsewhere in cn.py.


def tf(cache=True, force=False):
    f = "%s/tf.txt" % cache_dir()
    # reuse the cached frequency table unless a recount is forced
    if cache and not force and os.path.isfile(f):
        return True
    d = defaultdict(int)
    for url in (crawl_report_list() + crawl_plan_list()):
        html = get(url)
        enc, time, title, text = ce.parse(url, html)
        sentences = text_util.get_sentences(text)
        for s in sentences:
            # count term frequencies over the segmented tokens
            for w in jieba.cut(s):
                d[w] += 1
    # sort terms by descending frequency
    r = sorted(d.items(), key=lambda x: x[1], reverse=True)
    if cache:
        with open(f, "w", encoding="utf-8") as out:
            for k, v in r:
                out.write("%s\t%s\n" % (k, v))
    return True
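A minimal usage sketch: force a recount, then read the ten most frequent terms back from the cache file (the path simply mirrors the one built inside tf, using the project's cache_dir helper):

tf(force=True)  # recompute and rewrite the cached frequency table
with open("%s/tf.txt" % cache_dir(), encoding="utf-8") as fh:
    for line in list(fh)[:10]:
        term, count = line.rstrip("\n").split("\t")
        print(term, count)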