forked from AlasdairNorton/ClusterCloudAssg2
-
Notifications
You must be signed in to change notification settings - Fork 0
/
trainer.py
71 lines (57 loc) · 2.3 KB
/
trainer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
# coding=UTF-8
import csv, codecs, string
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import *
from nltk.classify import NaiveBayesClassifier, MaxentClassifier, DecisionTreeClassifier
from nltk.tokenize import TweetTokenizer
from nltk.probability import FreqDist, ConditionalFreqDist
from nltk.collocations import BigramAssocMeasures
def read_input(infile, NUM_TRAIN, NUM_TEST):
    """Split a labelled tweet CSV into tokenized training and test samples.

    Column 0 of every row is read as the polarity label ("0" is mapped to
    "Negative", "4" to "Positive") and column 5 as the tweet text.  Rows
    carrying any other label, and rows with fewer than six columns, are
    skipped.  Within each class, rows are taken in file order: the first
    NUM_TRAIN go to the training split and the following NUM_TEST go to the
    test split.

    Args:
        infile: Path of the CSV file to read (UTF-8 encoded text).
        NUM_TRAIN: Number of training samples to collect per class.
        NUM_TEST: Number of test samples to collect per class.

    Returns:
        A ``(test, train)`` tuple -- note the order.  Each element is a list
        of ``(tokens, label)`` pairs, where ``tokens`` is the list produced
        by ``TweetTokenizer`` with U+FFFD replacement characters removed and
        ``label`` is "Negative" or "Positive".
    """
    import sys

    labels = {"0": "Negative", "4": "Positive"}
    seen = {"Negative": 0, "Positive": 0}
    per_class = NUM_TRAIN + NUM_TEST
    tokenizer = TweetTokenizer(preserve_case=True)
    train = []
    test = []

    # The csv module wants text rows on Python 3 and byte rows on Python 2.
    if sys.version_info[0] >= 3:
        csvfile = open(infile, "r", encoding="utf-8", newline="")
    else:
        csvfile = open(infile, "rb")

    with csvfile:
        for row in csv.reader(csvfile):
            # Blank or truncated rows have no label/text columns to index.
            if len(row) < 6:
                continue
            sent = labels.get(row[0])
            if sent is None:
                continue

            # Zero-based position of this row within its class; comparing the
            # pre-increment value yields exactly NUM_TRAIN training samples.
            position = seen[sent]
            seen[sent] = position + 1
            if position < NUM_TRAIN:
                bucket = train
            elif position < per_class:
                bucket = test
            else:
                # This class is full; stop scanning once every class is.
                if all(count >= per_class for count in seen.values()):
                    break
                continue

            raw = row[5]
            if isinstance(raw, bytes):
                # Python 2 csv yields byte strings; tokenize unicode text.
                raw = raw.decode("utf-8")
            tokens = [tok for tok in tokenizer.tokenize(raw) if tok != u'\ufffd']
            bucket.append((tokens, sent))

    return test, train
# Load the annotated tweets; the two sizes are the per-class quotas handed to
# read_input, which returns the test split first and the training split second.
NUM_TRAIN = 10000
NUM_TEST = 2500
test, train = read_input("train.csv", NUM_TRAIN, NUM_TEST)

# Build the unigram vocabulary from the training documents only.
# NOTE: wrapping each document in mark_negation() before collecting words is
# an alternative that is intentionally not applied here.
sentiment_analyzer = SentimentAnalyzer()
all_words = sentiment_analyzer.all_words([tokens for tokens, _ in train])
unigrams = sentiment_analyzer.unigram_word_feats(all_words, min_freq=4)
sentiment_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigrams)

# Turn both splits into feature sets using the registered extractor.
training_set = sentiment_analyzer.apply_features(train)
test_set = sentiment_analyzer.apply_features(test)

# Fit a Naive Bayes model through the analyzer, then store the analyzer on disk.
trainer = NaiveBayesClassifier.train
classifier = sentiment_analyzer.train(trainer, training_set)
save_file(sentiment_analyzer, "sentiment_classifier.pkl")

# Report every evaluation metric on the held-out split, ordered by metric name.
metrics = sentiment_analyzer.evaluate(test_set)
for key, value in sorted(metrics.items()):
    print("{0}: {1}".format(key, value))