Ejemplo n.º 1
0
def load_articles(limit):
    """Yield one feature dict per stored article, skipping the English edition.

    Articles come from ``articles.find(_limit=limit)``; ``limit`` is passed
    through unchanged as the ``_limit`` keyword. Any article whose
    ``article_url`` contains ``spiegel.de/international`` is skipped.

    Each yielded dict holds:
        url     -- the article's ``article_url``
        text    -- the article's ``body_text``
        bigrams -- ``make_bigrams(body_text)`` materialized as a list
        tokens  -- ``tokenize(body_text)`` materialized as a list
    """
    for row in articles.find(_limit=limit):
        url = row['article_url']
        if 'spiegel.de/international' not in url:
            body = row['body_text']
            yield {
                'url': url,
                'text': body,
                'bigrams': list(make_bigrams(body)),
                'tokens': list(tokenize(body)),
            }
Ejemplo n.º 2
0
def load_articles(limit):
    """Generate feature dicts for the articles returned by ``articles.find``.

    ``limit`` is forwarded as the ``_limit`` keyword of ``articles.find``.
    Articles whose ``article_url`` contains ``spiegel.de/international``
    are left out of the output.

    Every yielded dict has the keys ``url`` and ``text`` (copied from the
    article's ``article_url`` and ``body_text``), plus ``bigrams`` and
    ``tokens`` (lists built from ``make_bigrams`` and ``tokenize`` applied
    to the body text).
    """
    excluded_fragment = 'spiegel.de/international'
    for record in articles.find(_limit=limit):
        address = record['article_url']
        if excluded_fragment in address:
            continue
        content = record['body_text']
        bigram_list = list(make_bigrams(content))
        token_list = list(tokenize(content))
        yield {
            'url': address,
            'text': content,
            'bigrams': bigram_list,
            'tokens': token_list,
        }
Ejemplo n.º 3
0
def article_terms(model, article):
    """Rank the tokens of an article by a TF-IDF style score.

    The article's ``body_text`` is split with ``tokenize`` and each distinct
    token is counted. A token's term frequency is the augmented form
    ``0.5 + 0.5 * f / max_f``, where ``f`` is the token's share of all
    tokens and ``max_f`` is the share of the most frequent token. That
    value is multiplied by the token's entry in ``model['terms']``
    (presumably an IDF weight -- confirm against the code that builds the
    model); tokens missing from the model get a weight of 0.

    Args:
        model: mapping with a ``'terms'`` entry that maps token -> weight.
        article: mapping with a ``'body_text'`` entry.

    Returns:
        A list of ``(token, score)`` tuples sorted by descending score, or
        an empty list when the article yields no tokens.
    """
    terms = defaultdict(int)
    for token in tokenize(article['body_text']):
        terms[token] += 1

    # float() keeps the divisions below true divisions on Python 2 as well.
    total = float(sum(terms.values()))
    if total == 0:
        return []
    max_f = max(terms.values()) / total

    weights = model['terms']
    tf_idfs = {}
    for term, count in terms.items():
        tf = 0.5 + ((0.5 * (count / total)) / max_f)
        tf_idfs[term] = tf * weights.get(term, 0)

    # Index-based key instead of ``lambda (a, b): b``: tuple parameters were
    # removed in Python 3 (PEP 3113), and this form works on both versions.
    return sorted(tf_idfs.items(), key=lambda item: item[1], reverse=True)
Ejemplo n.º 4
0
def article_terms(model, article):
    """Score every distinct token of an article and return them best-first.

    Tokens come from ``tokenize(article['body_text'])``. For each distinct
    token the augmented term frequency ``0.5 + 0.5 * f / max_f`` is
    computed (``f`` is the token's relative frequency, ``max_f`` the
    relative frequency of the most common token) and multiplied by
    ``model['terms'].get(token, 0)``. That weight is presumably an inverse
    document frequency -- confirm against the code that builds the model.

    Args:
        model: mapping whose ``'terms'`` entry maps token -> weight.
        article: mapping providing the text under ``'body_text'``.

    Returns:
        List of ``(token, score)`` pairs ordered from highest to lowest
        score; ``[]`` if tokenization produces nothing.
    """
    terms = defaultdict(int)
    for token in tokenize(article['body_text']):
        terms[token] += 1

    # float() forces true division below even under Python 2 semantics.
    total = float(sum(terms.values()))
    if total == 0:
        return []
    max_f = max(terms.values()) / total

    weights = model['terms']
    tf_idfs = {}
    for term, count in terms.items():
        tf = 0.5 + ((0.5 * (count / total)) / max_f)
        tf_idfs[term] = tf * weights.get(term, 0)

    # ``lambda (a, b): b`` is a SyntaxError on Python 3 (PEP 3113 removed
    # tuple parameters); indexing the pair is valid on Python 2 and 3.
    return sorted(tf_idfs.items(), key=lambda pair: pair[1], reverse=True)