Example #1
0
def classifier_for_training_set(positive, negative, blacklist=None):
    """
    Returns a Bayesian classifier for the given positive and negative sentences.

    positive / negative: iterables of sentence strings used as labelled
    training data.
    blacklist: optional collection of tokens excluded during tokenization
    (defaults to no exclusions).
    """
    # Default to a fresh list each call; a `blacklist=[]` default would be
    # shared across all invocations (mutable-default pitfall).
    if blacklist is None:
        blacklist = []

    # Pair each sentence's distinct tokens with its sentiment label.  List
    # comprehensions (rather than `map` + `lambda`) produce real lists, so
    # the `+` concatenation below works on both Python 2 and Python 3.
    positive_feedback = [
        (FreqDist(tokenize(s, blacklist)).keys(), 'positive')
        for s in positive
    ]
    negative_feedback = [
        (FreqDist(tokenize(s, blacklist)).keys(), 'negative')
        for s in negative
    ]

    training_set = apply_features(_classifier_features_for_document,
                                  positive_feedback + negative_feedback)
    return nltk.classify.NaiveBayesClassifier.train(training_set)
Example #2
0
def positive_sentiment_for_sentences(classifier, sentences, blacklist=None):
    """
    Returns the ratio of positive sentiment for the given list of sentences
    between 0 and 1 inclusive.

    classifier: a trained classifier exposing `classify(features)`.
    sentences: iterable of sentence strings to classify.
    blacklist: optional collection of tokens excluded during tokenization
    (defaults to no exclusions).
    """
    # Default to a fresh list each call; a `blacklist=[]` default would be
    # shared across all invocations (mutable-default pitfall).
    if blacklist is None:
        blacklist = []

    statistics = defaultdict(int)

    for sentence in sentences:
        tokens = tokenize(sentence, blacklist)
        freq_dist = nltk.FreqDist(tokens)

        classification = classifier.classify(
            _classifier_features_for_document(freq_dist))
        statistics[classification] += 1

    # No sentences were classified: report 0 rather than dividing by zero.
    if not statistics:
        return 0

    positive = statistics['positive']
    negative = statistics['negative']

    # float() cast keeps true division under Python 2.
    return positive / float(positive + negative)
Example #3
0
    mkdir(data_dir)

mailbox = mbox(argv[1])
for index, message in enumerate(mailbox):
    # Skip messages with no extractable body (e.g. multipart containers,
    # where get_payload(decode=True) returns None).
    payload = message.get_payload(decode=True)
    if not payload:
        continue

    # Decode the raw payload using the message's declared charset.  With
    # errors='replace' the decode itself should not raise, but an unknown
    # charset name raises LookupError and some code paths can raise
    # UnicodeDecodeError; catch those alongside the originally-handled
    # UnicodeEncodeError so a single bad message cannot abort the run.
    try:
        body = unicode(payload, get_charset(message), 'replace')
    except (UnicodeDecodeError, UnicodeEncodeError, LookupError):
        continue

    if not body:
        continue

    clean_msg = preprocess(body)

    sentences = split_sentences(clean_msg)
    tokens = tokenize(clean_msg, blacklist)
    bigrams = bigrams_for_message(sentences)

    # Persist one JSON artefact per message, numbered from 1 in mailbox
    # order; the payload is UTF-8 encoded before writing (Python 2 file).
    with open(path.join(data_dir, "preprocess-%0.4d.json" % (index + 1)),
              'w') as f:
        f.write(
            json.dumps({
                'sentences': sentences,
                'tokens': tokens,
                'bigrams': bigrams
            }).encode('UTF-8'))
Example #4
0
# Ensure the output directory exists before processing the mailbox.
data_dir = path.abspath('./data')
if not path.isdir(data_dir):
    mkdir(data_dir)

mailbox = mbox(argv[1])
for index, message in enumerate(mailbox):
    # Skip messages with no extractable body (e.g. multipart containers,
    # where get_payload(decode=True) returns None).
    payload = message.get_payload(decode=True)
    if not payload:
        continue  # no stray semicolon; plain statement per PEP 8

    # Decode the raw payload using the message's declared charset.  With
    # errors='replace' the decode itself should not raise, but an unknown
    # charset name raises LookupError and some code paths can raise
    # UnicodeDecodeError; catch those alongside the originally-handled
    # UnicodeEncodeError so a single bad message cannot abort the run.
    try:
        body = unicode(payload, get_charset(message), 'replace')
    except (UnicodeDecodeError, UnicodeEncodeError, LookupError):
        continue

    if not body:
        continue

    clean_msg = preprocess(body)

    sentences = split_sentences(clean_msg)
    tokens = tokenize(clean_msg, blacklist)
    bigrams = bigrams_for_message(sentences)

    # Persist one JSON artefact per message, numbered from 1 in mailbox
    # order; the payload is UTF-8 encoded before writing (Python 2 file).
    with open(path.join(data_dir, "preprocess-%0.4d.json" % (index + 1)), 'w') as f:
        f.write(json.dumps({
            'sentences': sentences,
            'tokens': tokens,
            'bigrams': bigrams
        }).encode('UTF-8'))