Python tokenize Examples

Programming Language: Python

Namespace/Package Name: spon.extract.tokenize

Method/Function: tokenize

Examples at hotexamples.com: 4

Python tokenize - 4 examples found. These are the top-rated real-world Python examples of spon.extract.tokenize.tokenize, extracted from open-source projects. You can rate the examples to help us improve their quality.

Example #1

Show file

File: tfidf.py Project: pudo-attic/spon-scraper

def load_articles(limit):
    """Yield a feature record for each stored article.

    Rows come from ``articles.find(_limit=limit)``; any row whose
    ``article_url`` contains ``spiegel.de/international`` is skipped.

    Each yielded dict carries:
        url     -- the row's ``article_url``
        text    -- the row's ``body_text``
        bigrams -- ``make_bigrams(body_text)`` materialised as a list
        tokens  -- ``tokenize(body_text)`` materialised as a list
    """
    for row in articles.find(_limit=limit):
        url = row['article_url']
        if 'spiegel.de/international' not in url:
            body = row['body_text']
            bigrams = list(make_bigrams(body))
            tokens = list(tokenize(body))
            yield {
                'url': url,
                'text': body,
                'bigrams': bigrams,
                'tokens': tokens,
            }

Example #2

Show file

File: tfidf.py Project: pombredanne/spon-scraper

def load_articles(limit):
    """Generate one dict per article returned by ``articles.find``.

    At most ``limit`` rows are requested (passed through as ``_limit``).
    Rows whose ``article_url`` contains ``spiegel.de/international`` are
    left out. Every other row is turned into a dict holding its URL, its
    body text, and the list forms of ``make_bigrams`` and ``tokenize``
    applied to that body text.
    """
    skip_marker = 'spiegel.de/international'
    for record in articles.find(_limit=limit):
        link = record['article_url']
        if skip_marker in link:
            continue
        content = record['body_text']
        entry = {'url': link, 'text': content}
        entry['bigrams'] = list(make_bigrams(content))
        entry['tokens'] = list(tokenize(content))
        yield entry

Example #3

Show file

File: tfidf.py Project: pudo-attic/spon-scraper

def article_terms(model, article):
    """Rank the terms of one article by a TF-IDF style score.

    Args:
        model: Mapping with a ``'terms'`` entry that maps a term to its
            weight. Terms missing from it are weighted 0.
            NOTE(review): presumably the weight is an inverse document
            frequency -- confirm against the code that builds the model.
        article: Mapping with a ``'body_text'`` entry; that text is split
            into terms by ``tokenize``.

    Returns:
        A list of ``(term, score)`` tuples sorted by descending score, or
        an empty list when the text yields no tokens.
    """
    terms = defaultdict(int)
    for token in tokenize(article['body_text']):
        terms[token] += 1

    total = float(sum(terms.values()))
    if total == 0:
        # No tokens at all: return before dividing by zero below.
        return []
    # Relative frequency of the most common term; used to normalise tf.
    max_f = max(terms.values()) / total
    tf_idfs = {}
    for term, count in terms.items():
        # Augmented term frequency: above 0.5 for every term, exactly 1.0
        # for the most frequent one.
        tf = 0.5 + ((0.5 * (count / total)) / max_f)
        tf_idfs[term] = tf * model['terms'].get(term, 0)

    # Index into the pair instead of unpacking it in the lambda signature:
    # tuple parameters were removed in Python 3 (PEP 3113), so the old
    # ``lambda (a, b): b`` form is a SyntaxError there.
    return sorted(tf_idfs.items(), key=lambda item: item[1], reverse=True)

Example #4

Show file

File: tfidf.py Project: pombredanne/spon-scraper

def article_terms(model, article):
    """Score every term of an article and return the terms best-first.

    Args:
        model: Mapping whose ``'terms'`` entry maps a term to a numeric
            weight; a term absent from it contributes a score of 0.
            NOTE(review): the weight looks like an IDF value -- verify
            against wherever the model is produced.
        article: Mapping providing ``'body_text'``, which is passed to
            ``tokenize`` to obtain the terms.

    Returns:
        ``(term, score)`` tuples in descending score order. An empty list
        is returned when ``tokenize`` produces nothing.
    """
    terms = defaultdict(int)
    for token in tokenize(article['body_text']):
        terms[token] += 1

    total = float(sum(terms.values()))
    if total == 0:
        # Nothing was tokenized; avoid the division by zero further down.
        return []
    # Share of all tokens taken by the single most frequent term.
    max_f = max(terms.values()) / total
    tf_idfs = {}
    for term, count in terms.items():
        # Term frequency scaled against the most frequent term and offset
        # by 0.5, so it lies in (0.5, 1.0].
        tf = 0.5 + ((0.5 * (count / total)) / max_f)
        tf_idfs[term] = tf * model['terms'].get(term, 0)

    # ``lambda (a, b): b`` only parses on Python 2; tuple parameters were
    # removed by PEP 3113. Selecting element 1 of the pair works on both.
    return sorted(tf_idfs.items(), key=lambda pair: pair[1], reverse=True)