コード例 #1
0
ファイル: tfidf.py プロジェクト: pudo-attic/spon-scraper
def parse_articles(field):
    """Print the top TF-IDF terms for every article in the collection.

    :param field: name of the IDF model field to load (e.g. ``'tokens'``).

    Side effects: iterates the module-level ``articles`` table (first
    400 rows) and prints each article's URL followed by its five
    highest-scoring ``(term, score)`` pairs.
    """
    model = load_idf_model(field)
    # NOTE: Python 2 `print` statements are syntax errors under Python 3;
    # converted to the print() function.
    print("Extracting...")
    for article in articles.find(_limit=400):
        print("\n\nDOCUMENT", article['article_url'])
        most = article_terms(model, article)
        pprint(most[:5])
コード例 #2
0
ファイル: tfidf.py プロジェクト: pombredanne/spon-scraper
def parse_articles(field):
    """Print the top TF-IDF terms for every article in the collection.

    :param field: name of the IDF model field to load (e.g. ``'tokens'``).

    Side effects: iterates the module-level ``articles`` table (first
    400 rows) and prints each article's URL followed by its five
    highest-scoring ``(term, score)`` pairs.
    """
    model = load_idf_model(field)
    # NOTE: Python 2 `print` statements are syntax errors under Python 3;
    # converted to the print() function.
    print("Extracting...")
    for article in articles.find(_limit=400):
        print("\n\nDOCUMENT", article['article_url'])
        most = article_terms(model, article)
        pprint(most[:5])
コード例 #3
0
ファイル: tfidf.py プロジェクト: pudo-attic/spon-scraper
def load_articles(limit):
    """Yield article dicts with pre-tokenized text.

    :param limit: maximum number of rows to fetch from the module-level
        ``articles`` table.

    Articles from the international edition are skipped.  Each yielded
    dict carries the URL, the raw body text, and its bigram and token
    lists.
    """
    for row in articles.find(_limit=limit):
        url = row['article_url']
        # Skip the English-language international edition.
        if 'spiegel.de/international' in url:
            continue
        body = row['body_text']
        yield {
            'url': url,
            'text': body,
            'bigrams': list(make_bigrams(body)),
            'tokens': list(tokenize(body)),
        }
コード例 #4
0
ファイル: tfidf.py プロジェクト: pombredanne/spon-scraper
def load_articles(limit):
    """Generate article records ready for TF-IDF processing.

    :param limit: cap on the number of rows read from the module-level
        ``articles`` table.

    Skips international-edition articles; each record contains the
    article URL, its body text, and derived bigram/token lists.
    """
    for record in articles.find(_limit=limit):
        address = record['article_url']
        if 'spiegel.de/international' in address:
            # International (English) edition is excluded.
            continue
        text = record['body_text']
        bigrams = list(make_bigrams(text))
        tokens = list(tokenize(text))
        yield {'url': address,
               'text': text,
               'bigrams': bigrams,
               'tokens': tokens}
コード例 #5
0
ファイル: graph.py プロジェクト: pudo-attic/spon-scraper
def topic_graph():
    """Build and save a term co-occurrence graph weighted by TF-IDF.

    For each article (up to 10,000 rows, skipping the international
    edition), takes the 15 highest-scoring terms and adds the product
    of each term pair's scores to that pair's edge weight.  Only the
    20,000 heaviest edges are kept; the result is written to
    ``topic_graph_abridged.gexf``.
    """
    # NOTE: Python 2 `print` statements and the tuple-parameter lambda
    # `lambda (a, b): b` are syntax errors in Python 3; both fixed below.
    print("Loading IDF model...")
    model = load_idf_model('tokens')
    print("Making a graph...")
    edges = defaultdict(int)
    for article in articles.find(_limit=10000):
        if 'spiegel.de/international' in article['article_url']:
            continue
        terms = article_terms(model, article)[:15]
        for (term1, score1), (term2, score2) in combinations(terms, 2):
            # Order the pair canonically so (a, b) and (b, a) share one edge.
            key = max(term1, term2), min(term1, term2)
            edges[key] += score1 * score2

    G = nx.Graph()
    # Keep only the 20,000 heaviest edges so the GEXF stays manageable.
    heaviest = sorted(edges.items(), key=lambda item: item[1], reverse=True)
    for (s, d), w in heaviest[:20000]:
        G.add_edge(s, d, weight=w)
    nx.write_gexf(G, data_path('topic_graph_abridged.gexf'))
コード例 #6
0
ファイル: graph.py プロジェクト: pombredanne/spon-scraper
def topic_graph():
    """Build and save a term co-occurrence graph weighted by TF-IDF.

    For each article (up to 10,000 rows, skipping the international
    edition), takes the 15 highest-scoring terms and adds the product
    of each term pair's scores to that pair's edge weight.  Only the
    20,000 heaviest edges are kept; the result is written to
    ``topic_graph_abridged.gexf``.
    """
    # NOTE: Python 2 `print` statements and the tuple-parameter lambda
    # `lambda (a, b): b` are syntax errors in Python 3; both fixed below.
    print("Loading IDF model...")
    model = load_idf_model('tokens')
    print("Making a graph...")
    edges = defaultdict(int)
    for article in articles.find(_limit=10000):
        if 'spiegel.de/international' in article['article_url']:
            continue
        terms = article_terms(model, article)[:15]
        for (term1, score1), (term2, score2) in combinations(terms, 2):
            # Order the pair canonically so (a, b) and (b, a) share one edge.
            key = max(term1, term2), min(term1, term2)
            edges[key] += score1 * score2

    G = nx.Graph()
    # Keep only the 20,000 heaviest edges so the GEXF stays manageable.
    heaviest = sorted(edges.items(), key=lambda item: item[1], reverse=True)
    for (s, d), w in heaviest[:20000]:
        G.add_edge(s, d, weight=w)
    nx.write_gexf(G, data_path('topic_graph_abridged.gexf'))