def load_articles(limit):
    """Yield up to *limit* prepared article records from the `articles` store.

    Records whose URL points at Spiegel International are skipped. Each
    yielded dict carries the raw URL and body text plus precomputed token
    and bigram lists (via the module-level `tokenize` / `make_bigrams`).
    """
    for record in articles.find(_limit=limit):
        url = record['article_url']
        # Skip the English-language Spiegel International section.
        if 'spiegel.de/international' in url:
            continue
        body = record['body_text']
        yield {
            'url': url,
            'text': body,
            'bigrams': list(make_bigrams(body)),
            'tokens': list(tokenize(body)),
        }
def article_terms(model, article):
    """Score every token of *article* by augmented TF-IDF against *model*.

    Term frequency uses the double-normalized ("augmented") form
    ``0.5 + 0.5 * count / max_count``, which damps the bias toward long
    documents. Each TF is multiplied by the term's IDF weight looked up in
    ``model['terms']`` (0 for unseen terms).

    :param model: dict with a ``'terms'`` mapping of term -> IDF weight.
    :param article: dict with a ``'body_text'`` string to tokenize.
    :returns: list of ``(term, score)`` pairs sorted by score, descending;
        empty list if the article body yields no tokens.
    """
    terms = defaultdict(int)
    for token in tokenize(article['body_text']):
        terms[token] += 1
    total = float(sum(terms.values()))
    if total == 0:
        # No tokens at all -- nothing to score.
        return []
    max_f = max(terms.values()) / total
    tf_idfs = {}
    for term, count in terms.items():
        # (count/total)/max_f reduces to count/max_count: augmented TF.
        tf = 0.5 + ((0.5 * (count / total)) / max_f)
        # Unknown terms get IDF 0 and therefore score 0.
        tf_idfs[term] = tf * model['terms'].get(term, 0)
    # Python-3 compatible sort key (the original used Python-2-only
    # tuple-parameter lambda syntax, a SyntaxError on Python 3).
    return sorted(tf_idfs.items(), key=lambda kv: kv[1], reverse=True)
# NOTE(review): this redefines article_terms and shadows the earlier,
# whitespace-identical definition above -- one of the two copies should
# almost certainly be deleted.
def article_terms(model, article):
    """Score every token of *article* by augmented TF-IDF against *model*.

    Term frequency uses the double-normalized ("augmented") form
    ``0.5 + 0.5 * count / max_count``. Each TF is multiplied by the term's
    IDF weight from ``model['terms']`` (0 for unseen terms).

    :param model: dict with a ``'terms'`` mapping of term -> IDF weight.
    :param article: dict with a ``'body_text'`` string to tokenize.
    :returns: list of ``(term, score)`` pairs sorted by score, descending;
        empty list if the article body yields no tokens.
    """
    counts = defaultdict(int)
    for token in tokenize(article['body_text']):
        counts[token] += 1
    total = float(sum(counts.values()))
    if total == 0:
        # Empty body: nothing to score.
        return []
    max_f = max(counts.values()) / total
    scores = {}
    for term, count in counts.items():
        # (count/total)/max_f == count/max_count: augmented TF.
        tf = 0.5 + ((0.5 * (count / total)) / max_f)
        scores[term] = tf * model['terms'].get(term, 0)
    # Fixed: the original `key=lambda (a, b): b` is Python-2-only syntax
    # (tuple parameter unpacking) and fails to parse on Python 3.
    return sorted(scores.items(), key=lambda kv: kv[1], reverse=True)