import nltk
from nltk import FreqDist
from nltk.classify import apply_features


def classifier_for_training_set(positive, negative, blacklist=[]):
    """
    Returns a Bayesian classifier for the given positive and negative
    sentences.
    """
    # Pair each sentence's unique tokens with its sentiment label.
    positive_feedback = map(
        lambda s: (FreqDist(tokenize(s, blacklist)).keys(), 'positive'),
        positive)
    negative_feedback = map(
        lambda s: (FreqDist(tokenize(s, blacklist)).keys(), 'negative'),
        negative)
    # apply_features builds the feature dicts lazily, which keeps memory
    # usage down for large training sets.
    training_set = apply_features(_classifier_features_for_document,
                                  positive_feedback + negative_feedback)
    return nltk.classify.NaiveBayesClassifier.train(training_set)
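The feature extractor `_classifier_features_for_document` is referenced but not defined in this listing. A minimal sketch, assuming the usual NLTK bag-of-words encoding (the `contains(...)` feature names are illustrative, not taken from the original):

def _classifier_features_for_document(document):
    # One boolean feature per token. `document` may be a token list or a
    # FreqDist; iterating either yields the tokens.
    return dict(('contains(%s)' % token, True) for token in document)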
from collections import defaultdict


def positive_sentiment_for_sentences(classifier, sentences, blacklist=[]):
    """
    Returns the ratio of positive sentiment for the given list of
    sentences, between 0 and 1 inclusive.
    """
    # Tally how many sentences the classifier labels with each class.
    statistics = defaultdict(int)
    for sentence in sentences:
        tokens = tokenize(sentence, blacklist)
        freq_dist = nltk.FreqDist(tokens)
        classification = classifier.classify(
            _classifier_features_for_document(freq_dist))
        statistics[classification] += 1
    if len(statistics) == 0:
        return 0
    positive = statistics['positive']
    negative = statistics['negative']
    return positive / float(positive + negative)
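Putting the two functions together — a quick sketch, assuming `tokenize` and the feature extractor above are in scope; the training sentences here are illustrative only:

positive = ["I love this, it works great",
            "Fantastic support, very happy"]
negative = ["This is terrible and I want a refund",
            "Worst experience I have ever had"]

classifier = classifier_for_training_set(positive, negative)
ratio = positive_sentiment_for_sentences(
    classifier, ["The support team was fantastic"])
print "positive sentiment: %.2f" % ratio  # a value between 0 and 1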
import json
from mailbox import mbox
from os import mkdir, path
from sys import argv

data_dir = path.abspath('./data')
if not path.isdir(data_dir):
    mkdir(data_dir)

mailbox = mbox(argv[1])
for index, message in enumerate(mailbox):
    payload = message.get_payload(decode=True)
    if not payload:
        continue
    try:
        body = unicode(payload, get_charset(message), 'replace')
    except (LookupError, UnicodeError):
        # An unknown charset raises LookupError; the 'replace' error
        # handler already papers over most decoding problems.
        continue
    if not body:
        continue
    clean_msg = preprocess(body)
    sentences = split_sentences(clean_msg)
    tokens = tokenize(clean_msg, blacklist)
    bigrams = bigrams_for_message(sentences)
    # Write one JSON file per message: preprocess-0001.json, ...
    with open(path.join(data_dir, "preprocess-%0.4d.json" % (index + 1)), 'w') as f:
        f.write(json.dumps({
            'sentences': sentences,
            'tokens': tokens,
            'bigrams': bigrams
        }).encode('UTF-8'))
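The `get_charset` helper is also assumed rather than shown; a minimal sketch that falls back to UTF-8 when a message declares no charset:

def get_charset(message):
    # Prefer the charset declared in the message's Content-Type header.
    return message.get_content_charset() or 'UTF-8'

The remaining helpers (`preprocess`, `split_sentences`, `tokenize`, `bigrams_for_message`) and the `blacklist` word list are presumably defined elsewhere in the full script.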