Example #1
0
def classifier_for_training_set(positive, negative, blacklist=None):
    """
    Returns a Bayesian classifier for the given positive and negative sentences.

    positive / negative: iterables of sentence strings used as labelled
    training data.
    blacklist: optional collection of tokens excluded during tokenization
    (defaults to no exclusions).
    """
    # Default to a fresh list each call; a `blacklist=[]` default would be
    # shared across all invocations (mutable-default pitfall).
    if blacklist is None:
        blacklist = []

    # Pair each sentence's distinct tokens with its sentiment label.  List
    # comprehensions (rather than `map` + `lambda`) produce real lists, so
    # the `+` concatenation below works on both Python 2 and Python 3.
    positive_feedback = [
        (FreqDist(tokenize(s, blacklist)).keys(), 'positive')
        for s in positive
    ]
    negative_feedback = [
        (FreqDist(tokenize(s, blacklist)).keys(), 'negative')
        for s in negative
    ]

    training_set = apply_features(_classifier_features_for_document,
                                  positive_feedback + negative_feedback)
    return nltk.classify.NaiveBayesClassifier.train(training_set)
Example #2
0
def positive_sentiment_for_sentences(classifier, sentences, blacklist=None):
    """
    Returns the ratio of positive sentiment for the given list of sentences
    between 0 and 1 inclusive.

    classifier: a trained classifier exposing `classify(features)`.
    sentences: iterable of sentence strings to classify.
    blacklist: optional collection of tokens excluded during tokenization
    (defaults to no exclusions).
    """
    # Default to a fresh list each call; a `blacklist=[]` default would be
    # shared across all invocations (mutable-default pitfall).
    if blacklist is None:
        blacklist = []

    statistics = defaultdict(int)

    for sentence in sentences:
        tokens = tokenize(sentence, blacklist)
        freq_dist = nltk.FreqDist(tokens)

        classification = classifier.classify(
            _classifier_features_for_document(freq_dist))
        statistics[classification] += 1

    # No sentences were classified: report 0 rather than dividing by zero.
    if not statistics:
        return 0

    positive = statistics['positive']
    negative = statistics['negative']

    # float() cast keeps true division under Python 2.
    return positive / float(positive + negative)
Example #3
0
    mkdir(data_dir)

mailbox = mbox(argv[1])
for index, message in enumerate(mailbox):
    # Skip messages with no extractable body (e.g. multipart containers,
    # where get_payload(decode=True) returns None).
    payload = message.get_payload(decode=True)
    if not payload:
        continue

    # Decode the raw payload using the message's declared charset.  With
    # errors='replace' the decode itself should not raise, but an unknown
    # charset name raises LookupError and some code paths can raise
    # UnicodeDecodeError; catch those alongside the originally-handled
    # UnicodeEncodeError so a single bad message cannot abort the run.
    try:
        body = unicode(payload, get_charset(message), 'replace')
    except (UnicodeDecodeError, UnicodeEncodeError, LookupError):
        continue

    if not body:
        continue

    clean_msg = preprocess(body)

    sentences = split_sentences(clean_msg)
    tokens = tokenize(clean_msg, blacklist)
    bigrams = bigrams_for_message(sentences)

    # Persist one JSON artefact per message, numbered from 1 in mailbox
    # order; the payload is UTF-8 encoded before writing (Python 2 file).
    with open(path.join(data_dir, "preprocess-%0.4d.json" % (index + 1)),
              'w') as f:
        f.write(
            json.dumps({
                'sentences': sentences,
                'tokens': tokens,
                'bigrams': bigrams
            }).encode('UTF-8'))
Example #4
0
# Ensure the output directory exists before processing the mailbox.
data_dir = path.abspath('./data')
if not path.isdir(data_dir):
    mkdir(data_dir)

mailbox = mbox(argv[1])
for index, message in enumerate(mailbox):
    # Skip messages with no extractable body (e.g. multipart containers,
    # where get_payload(decode=True) returns None).
    payload = message.get_payload(decode=True)
    if not payload:
        continue  # no stray semicolon; plain statement per PEP 8

    # Decode the raw payload using the message's declared charset.  With
    # errors='replace' the decode itself should not raise, but an unknown
    # charset name raises LookupError and some code paths can raise
    # UnicodeDecodeError; catch those alongside the originally-handled
    # UnicodeEncodeError so a single bad message cannot abort the run.
    try:
        body = unicode(payload, get_charset(message), 'replace')
    except (UnicodeDecodeError, UnicodeEncodeError, LookupError):
        continue

    if not body:
        continue

    clean_msg = preprocess(body)

    sentences = split_sentences(clean_msg)
    tokens = tokenize(clean_msg, blacklist)
    bigrams = bigrams_for_message(sentences)

    # Persist one JSON artefact per message, numbered from 1 in mailbox
    # order; the payload is UTF-8 encoded before writing (Python 2 file).
    with open(path.join(data_dir, "preprocess-%0.4d.json" % (index + 1)), 'w') as f:
        f.write(json.dumps({
            'sentences': sentences,
            'tokens': tokens,
            'bigrams': bigrams
        }).encode('UTF-8'))