Example #1
import json
import math
from collections import defaultdict

# load_articles() and data_path() are helpers defined elsewhere in the project.
def generate_idf_model():
    # Document frequencies: in how many articles each token / bigram occurs.
    tokens_global = defaultdict(int)
    bigrams_global = defaultdict(int)
    print("Extracting...")
    articles = 0
    for i, article in enumerate(load_articles(50000)):
        # set() ensures each term is counted at most once per article.
        for token in set(article['tokens']):
            tokens_global[token] += 1
        for bigram in set(article['bigrams']):
            bigrams_global[bigram] += 1
        if i % 100 == 0:
            print("Done: %s" % i)
        articles += 1

    print("Calculating IDF...")
    for file_name, terms in (('tokens', tokens_global),
                             ('bigrams', bigrams_global)):
        data = {'articles': articles, 'terms': {}}
        for term, count in terms.items():
            # IDF = log(N / (1 + document frequency)), using true division.
            idf = math.log(articles / (1 + count))
            data['terms'][term] = idf

        with open(data_path('idf_%s.json' % file_name), 'w') as fh:
            json.dump(data, fh)
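For context, a token that occurs in 49 of the 50,000 articles would get idf = log(50000 / (1 + 49)) = log(1000) ≈ 6.91, while a near-ubiquitous token scores close to zero. The sketch below shows one way such a model could be used to rank an article's tokens by TF-IDF; score_article() is hypothetical and assumes only the JSON layout written above:

import math
from collections import Counter

def score_article(model, tokens):
    # Hypothetical helper: rank tokens by term frequency times stored IDF.
    counts = Counter(tokens)
    default_idf = math.log(model['articles'])  # fallback for tokens unseen at build time
    scored = [(t, c * model['terms'].get(t, default_idf))
              for t, c in counts.items()]
    return sorted(scored, key=lambda item: item[1], reverse=True)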
Example #2
import networkx as nx
from collections import defaultdict
from itertools import combinations

# articles, load_idf_model(), article_terms() and data_path() come from the project.
def topic_graph():
    print("Loading IDF model...")
    model = load_idf_model('tokens')
    print("Making a graph...")
    edges = defaultdict(float)
    for article in articles.find(_limit=10000):
        # Skip the English-language "international" section.
        if 'spiegel.de/international' in article['article_url']:
            continue
        # Top 15 scored terms for this article.
        terms = article_terms(model, article)[:15]
        for (term1, score1), (term2, score2) in combinations(terms, 2):
            # Order the pair so (a, b) and (b, a) map to the same edge key.
            key = max(term1, term2), min(term1, term2)
            edges[key] += score1 * score2

    G = nx.Graph()
    # Keep only the 20,000 heaviest co-occurrence edges.
    for (s, d), w in sorted(edges.items(), key=lambda item: item[1],
                            reverse=True)[:20000]:
        G.add_edge(s, d, weight=w)
    nx.write_gexf(G, data_path('topic_graph_abridged.gexf'))
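A quick way to inspect the result, assuming only that topic_graph_abridged.gexf was written as above (the literal path stands in for data_path()):

import networkx as nx

G = nx.read_gexf('topic_graph_abridged.gexf')
# The heaviest edges correspond to the most strongly co-occurring term pairs.
top = sorted(G.edges(data='weight'), key=lambda e: e[2], reverse=True)[:10]
for source, target, weight in top:
    print(source, target, round(weight, 2))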
Example #3
import json

def load_idf_model(type_):
    # type_ is 'tokens' or 'bigrams', matching the files written by generate_idf_model().
    with open(data_path('idf_%s.json' % type_)) as fh:
        return json.load(fh)
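Usage is then just a lookup, assuming the files from Example #1 have been generated:

model = load_idf_model('tokens')
print(model['articles'])    # number of articles the model was built from
print(len(model['terms']))  # number of distinct tokens with an IDF score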