Beispiel #1
0
def get_article_distance(article1_name, article2_name, num_lda_topics=10):
    """Return the Hellinger distance between two articles' LDA topic vectors.

    Only the LDA/Hellinger feature is computed. Earlier experimental
    features (count of shared non-stop words, Jaccard similarity, tf-idf
    cosine similarity) were disabled in the original code and have been
    removed here.

    Args:
        article1_name: name of the first article, passed to
            ``wiki_index.get_article``.
        article2_name: name of the second article, passed to
            ``wiki_index.get_article``.
        num_lda_topics: topic count handed to both
            ``lda.get_topics_for_article_text`` and ``lda.get_hellinger``.
            Defaults to 10, the value that used to be hard-coded.

    Returns:
        Whatever ``lda.get_hellinger`` returns for the two topic vectors.
    """
    article1_text = wiki_index.get_article(article1_name)
    article2_text = wiki_index.get_article(article2_name)

    # The same topic count must be used for both vectors and for the
    # distance itself, so it is threaded through from a single parameter.
    topics1 = lda.get_topics_for_article_text(article1_text, num_lda_topics)
    topics2 = lda.get_topics_for_article_text(article2_text, num_lda_topics)
    return lda.get_hellinger(topics1, topics2, num_lda_topics)
Beispiel #2
0
def extract_nlp_features(article1_name, article2_name, num_lda_topics, name_to_type, type_to_depth, type_to_node):
    """Fetch the text of both articles and delegate to ``get_features``.

    Both article names are resolved through ``wiki_index.get_article``; the
    names, the fetched texts and all remaining arguments are forwarded
    unchanged, in order, to ``get_features``, whose result is returned.
    """
    # The mapping returned here is not used by this function; the call is
    # kept in case it has side effects (e.g. loading) -- TODO confirm.
    wiki_index.get_article_name_to_linenum()

    # Per the original comment these are lists of words.
    first_words = wiki_index.get_article(article1_name)
    second_words = wiki_index.get_article(article2_name)

    return get_features(
        article1_name, article2_name,
        first_words, second_words,
        num_lda_topics, name_to_type, type_to_depth, type_to_node
    )
Beispiel #3
0
def check_adjlist_articles(adj_list_arg):
    """Report how many adjacency-list articles are missing from the index.

    Every key of ``adj_list_arg`` and every value listed under that key is
    converted to a title via ``linenum_to_title[str(...)]`` and looked up
    with ``wiki_index.get_article``. A ``KeyError`` raised by that lookup is
    counted as an error. The error count and the total number of lookups
    are printed; nothing is returned.

    Args:
        adj_list_arg: mapping from a node to an iterable of neighbour nodes.
            Presumably nodes are line numbers -- verify against the caller.

    Raises:
        KeyError: if a node has no entry in ``linenum_to_title``; only the
            article lookup itself is guarded.
    """
    # Single-argument print(...) behaves identically on Python 2 and 3.
    print("Checking adj list...")
    errors = 0
    total = 0
    for key in adj_list_arg:
        # Probe the node itself first, then each of its neighbours.
        for linenum in [key] + list(adj_list_arg[key]):
            name = linenum_to_title[str(linenum)]
            total += 1
            try:
                wiki_index.get_article(name)
            except KeyError:
                errors += 1
    print("Number of errors = %d; total = %d" % (errors, total))
Beispiel #4
0
def get_topics_for_article_name(article_name, num_topics):
    """Return the topic output of the model matching ``num_topics``.

    The article is fetched with ``wiki_index.get_article``, converted to a
    bag-of-words with ``dictionary.doc2bow`` and used to index the model
    chosen for ``num_topics`` (one of ``lda_10``, ``lda_30``, ``lda_60``,
    ``lda_120``).

    Raises:
        ValueError: if ``num_topics`` is not 10, 30, 60 or 120. This is
            raised before the article is fetched.
    """
    # Thunks keep the lookup lazy: only the selected model name is touched.
    candidates = (
        (10, lambda: lda_10),
        (30, lambda: lda_30),
        (60, lambda: lda_60),
        (120, lambda: lda_120),
    )
    for size, fetch_model in candidates:
        if num_topics == size:
            topic_model = fetch_model()
            break
    else:
        # No supported size matched.
        raise ValueError("bad number of topics")

    tokens = wiki_index.get_article(article_name)
    bag_of_words = dictionary.doc2bow(tokens)
    return topic_model[bag_of_words]
Beispiel #5
0
def get_tfidf_for_article_name(article_name):
    """Return the result of indexing ``tfidf`` with the article's bag-of-words.

    The article is fetched with ``wiki_index.get_article`` and converted by
    ``dictionary.doc2bow`` before being used to index ``tfidf``.
    """
    tokens = wiki_index.get_article(article_name)
    bag_of_words = dictionary.doc2bow(tokens)
    return tfidf[bag_of_words]