#!/usr/bin/python
import pandas as pd
import numpy as np
import re
import string
import functions as fn
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.externals import joblib
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_curve, auc
from sklearn import cross_validation
stop = set(stopwords.words('english'))
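# NOTE: `stop` is defined here but not referenced below; stop-word removal is
# assumed to happen inside the helpers in functions.py (imported as fn).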
class Preprocess:

    def frequentist_score(self, profile):
        # Replicate Epidemico's "first pass" by including
        # profiles with dr/doctor/md
        words = profile.split()
        score = 0
        if len(words) > 0:
            for word in words:
                if word in ['dr', 'doctor', 'md']:
                    score += 1
        return score
    def clean_profile(self, subset):
        # Returns a list of profiles that have been "cleaned" of
        # stop words, urls, punctuation, and capital letters
        clean_profiles = []
        for i in xrange(0, len(subset)):
            clean_profiles.append(str(fn.profile_to_words(subset[i])))
        return clean_profiles
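    # NOTE: the actual cleaning in clean_profile above (lower-casing and
    # removal of stop words, urls, and punctuation) is assumed to live in
    # functions.profile_to_words, which is not part of this file.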
    def vectorize_words(self, clean_profiles, max_features=500):
        # Vectorize the words in the cleaned profiles using
        # term frequency/inverse document frequency (TF-IDF)
        print "Creating the bag of words...\n"
        # Initialize the TfidfVectorizer object, scikit-learn's
        # TF-IDF-weighted bag of words tool
        vectorizer = TfidfVectorizer(min_df=1, max_features=max_features)
        # Workaround occasionally needed in older scikit-learn versions so
        # that get_feature_names() does not complain about the vocabulary
        vectorizer._validate_vocabulary()
        # fit_transform() does two things: first, it fits the model
        # and learns the vocabulary; second, it transforms our training data
        # into feature vectors. The input to fit_transform should be a list of
        # strings.
        data_features = vectorizer.fit_transform(clean_profiles)
        # Numpy arrays are easy to work with, so convert the sparse result
        # to an array
        data_features = data_features.toarray()
        print data_features.shape
        vocab = vectorizer.get_feature_names()
        # Sum the TF-IDF weight of each vocabulary word over all profiles
        # (a proxy for how prominent the word is in the training set)
        dist = np.sum(data_features, axis=0)
        return vectorizer, data_features, vocab, dist
    def train_test(self, data, f_train=0.75):
        # Break list of tweeters into training and testing sets
        n_entries = data.shape[0]
        n_train = int(f_train*n_entries)
        train = data[:n_train]
        test = data[n_train:]
        print train.shape[0], 'profiles for training'
        print test.shape[0], 'profiles for testing'
        return train, test
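    # NOTE: train_test above is a simple positional split; it only gives a
    # fair test set because the input file is assumed to be pre-shuffled
    # (see ReadWrite.read_tweeters below).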
class Modelling:

    def train_random_forest(self, vectorizer, train_data_features, train_class):
        # Train the model using a random forest
        print "Training the random forest..."
        # Initialize a Random Forest classifier with 100 trees
        forest = RandomForestClassifier(n_estimators=100)
        # Fit the forest to the training set, using the bag of words as
        # features and the medical/non-medical labels as the response variable
        #
        # This may take a few minutes to run
        forest = forest.fit(train_data_features, train_class)
        # Bundle the fitted vectorizer and forest into a pipeline and save
        # them for later (fitting first so the pickled forest is trained)
        vec_forest = Pipeline([('vec', vectorizer), ('rf', forest)])
        joblib.dump(vec_forest, 'forest/forest.pkl')
        importances = forest.feature_importances_
        scores = cross_validation.cross_val_score(forest,
            train_data_features, train_class, cv=5)
        print 'Cross validation scores:', scores.mean(), '+/-', scores.std()
        return forest, importances
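    # The pipeline saved above can later be reloaded to classify cleaned
    # profile strings directly; an illustrative sketch (path as above):
    #   clf = joblib.load('forest/forest.pkl')
    #   predicted_labels = clf.predict(list_of_cleaned_profiles)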
    def test_set(self, vectorizer, forest, clean_test_profiles):
        print 'Running model on test set...'
        # run forest model on the test set to evaluate the model
        # and find first pass scores
        # Get a bag of words for the test set, and convert to a numpy array
        test_data_features = vectorizer.transform(clean_test_profiles)
        test_data_features = test_data_features.toarray()
        # Use the random forest to make class label predictions; column 1 of
        # predict_proba is the probability of the positive class (assuming
        # 0/1 labels)
        model_class = forest.predict(test_data_features)
        probs = forest.predict_proba(test_data_features)[:,1]
        return model_class, probs
class ReadWrite:

    def read_tweeters(self, file):
        # Read in the list of profiles, which has been randomly shuffled
        # and marked if a medical professional or not (1 or 0, respectively)
        data = pd.read_csv(file, header=0, delimiter="\t")
        print data.shape
        print data.columns.values
        return data
    def write_words(self, file, vocab, amount):
        # For each vocabulary word, write the word and its associated
        # weight (TF-IDF sum or feature importance) to file
        f = open(file, 'w')
        for tag, count in zip(vocab, amount):
            f.write('{0:f}'.format(count) + ',' + tag + '\n')
        f.close()
        return
    def write_to_file(self, test_class, model_class, freq_scores, probs, profiles):
        # Copy the test set to a pandas dataframe including the actual class,
        # the model's predicted class, the "first pass" frequentist score,
        # the probability (used to plot the ROC curve), and the original
        # profile
        output = pd.DataFrame(data={"actual_sentiment": test_class,
                                    "model_sentiment": model_class,
                                    "freq_score": freq_scores,
                                    "probability": probs,
                                    "profile": profiles})
        # Use pandas to write the comma-separated output file
        output.to_csv("Bag_of_Words_model.csv", index=False)  # , quoting=3
        return
if __name__ == '__main__':
    rw = ReadWrite()
    mod = Modelling()
    pre = Preprocess()
    # Load the tweeters
    file = 'shuffled_tweeters.tsv'
    data = rw.read_tweeters(file)
    # Break tweeters into train/test sets of f_train, 1-f_train sizes
    train, test = pre.train_test(data, f_train=0.75)
    # Save df columns as arrays (easier to deal with later)
    train_class = train["sentiment"].values
    train_profiles = train["profile"].values
    test_class = test["sentiment"].values
    test_profiles = test["profile"].values
    # Clean training profiles - take out stop words, punctuation, etc.
    clean_train_profiles = pre.clean_profile(train_profiles)
    # Vectorize training profiles using TF-IDF
    vectorizer, train_data_features, vocab, dist = \
        pre.vectorize_words(clean_train_profiles, max_features=500)
    # Save word weights to file (not strictly necessary, but interesting)
    rw.write_words('word_counts.txt', vocab, dist)
    # Train the model!
    forest, importances = mod.train_random_forest(vectorizer,
                                                  train_data_features, train_class)
    # Write word importances to file
    rw.write_words('word_importance.txt', vocab, importances)
    # Preprocess test set of profiles
    clean_test_profiles = pre.clean_profile(test_profiles)
    # Run test set through model
    model_class, probs = mod.test_set(vectorizer, forest, clean_test_profiles)
    # Find frequentist score for test profiles (Epidemico first pass)
    freq_scores = [pre.frequentist_score(profile) for profile in clean_test_profiles]
    # Write test set results to file
    rw.write_to_file(test_class, model_class, freq_scores, probs, test_profiles)
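    # roc_curve and auc are imported above but never used in the original
    # flow; a minimal sketch of how the test-set ROC AUC could be computed
    # from the predicted probabilities (assumes test_class holds 0/1 labels):
    fpr, tpr, thresholds = roc_curve(test_class, probs)
    print 'Test-set ROC AUC:', auc(fpr, tpr)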