-
Notifications
You must be signed in to change notification settings - Fork 0
/
classification2.py
410 lines (328 loc) · 14.1 KB
/
classification2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
import csv # used for import csv tweets
import random # used for train/test set creation
import os # used for calling Frog in new terminal
import operator
import time # used for timer
#from pynlpl.clients.frogclient import FrogClient # used for Frog client
#import subprocess # used for calling Frog in new terminal
#import signal # used for calling Frog in new terminal
#import math
#import numpy as np
from sklearn.dummy import DummyClassifier
from sklearn.metrics import precision_recall_fscore_support
import helpers
from start_svm import Start_SVM
from start_nb import Start_NB
from preprocessing import Preprocessing
from bagofwords import BagOfWords
class Main(object):
    """Drive tweet classification experiments (activity vs. non-activity).

    The class reads a tab-separated, annotated tweet file (TRAININGFILE),
    preprocesses the tweets through ``Preprocessing``, builds bag-of-words
    input through ``BagOfWords`` and runs SVM / Naive Bayes evaluation or
    classification through ``Start_SVM`` / ``Start_NB``.  Evaluation results
    are appended to RESULTFILE via ``helpers.write_to_csv``.

    ``print`` is always called with a single parenthesised argument, which
    prints the same text under the Python 2 print statement.
    """

    # Input file reader
    DELIMITER = "\t"

    # Maps the annotation letter (column 5 of the input) to a numeric label.
    # NOTE(review): the earlier comment said "unknown" is classified as
    # non-activity, but "U" maps to 0 -- the same label as "Y", which
    # count_classes reports as *activity*.  Confirm which was intended.
    class_dict = {"Y": 0, "N": 1, "U": 0}

    # Class-level defaults; __init__ replaces both with per-instance dicts.
    tweets = {}
    tweet_class = {}

    # Processed arrays (replaced, never mutated in place, by preprocess_tweets)
    stemmed_tweets_array = []
    tokenized_tweets_array = []
    pos_tweets_array = []
    lemmatized_tweets_array = []

    # File for classification results
    RESULTFILE = 'results_classification_26juni_rest.csv'
    TRAININGFILE = "2000test_annotated_v3.csv"

    # Arrays for classification (only read by compare_dummy_classification;
    # nothing in this class fills them)
    train_tweetclasses = []
    train_vectors = []
    test_tweetclasses = []
    test_vectors = []

    # Cross validation folds
    CROSS_VALIDATION = 10

    # Preprocessing names recognised inside a mode string, in the order the
    # legacy substring fallback of get_preprocessed_array checks them.
    _PREPROCESS_NAMES = ("stem", "token", "pos", "lemma")

    def __init__(self, testingmode, mode):
        """Load TRAININGFILE, preprocess its tweets and print class counts.

        Args:
            testingmode: when truthy, class labels are stored while loading
                and class statistics are printed afterwards.
            mode: preprocessing mode string handed to ``Preprocessing``.
        """
        self.mode = mode
        # The "--debug" / "--write" flags in ``mode`` are currently ignored;
        # both switches are hard-coded.
        self.debug = True
        self.dump = False
        self.testingmode = testingmode

        # Fresh dicts per instance: the class-level ones would otherwise be
        # shared (and keep growing) across every Main object.
        self.tweets = {}
        self.tweet_class = {}

        training_filename = self.TRAININGFILE.split('.')[0]
        self.initialize(self.TRAININGFILE)
        self.preprocess_tweets(self.mode, self.tweets, training_filename)

        if self.testingmode:
            # list(...) so that .count() also works on a dict view.
            self.count_classes(list(self.tweet_class.values()))

    def initialize(self, filename):
        """Fill ``self.tweets`` (and ``self.tweet_class``) from ``filename``.

        The first row is treated as a header.  A row is kept only when the
        upper-cased value of column 5 is a key of ``class_dict``; the tweet
        text is taken from column 3.  Entries are keyed by row number minus
        one, so skipped rows leave gaps in the keys.
        """
        print("** Initialize..")
        with open(filename, 'rU') as handle:
            data = csv.reader(handle, delimiter=self.DELIMITER)
            for i, row in enumerate(data):
                if i == 0:  # Ignore header
                    continue
                label = self.class_dict.get(row[5].upper())
                if label is None:
                    continue
                self.tweets[i - 1] = row[3]
                if self.testingmode:
                    self.tweet_class[i - 1] = label

    def _build_classifiers(self, mode, ngrambow, nr):
        """Create the classifier objects that ``mode`` asks for.

        Returns:
            Tuple ``(svm_object, nb_object)``; an entry is None when the
            matching keyword ('svm' / 'nb') does not occur in ``mode``.

        Raises:
            ValueError: if ``mode`` names neither 'svm' nor 'nb'.
        """
        if 'svm' not in mode and 'nb' not in mode:
            raise ValueError("mode must contain 'svm' or 'nb': %r" % (mode,))
        (array, tuplebows) = self.setup_input_classification(mode, ngrambow, 0, 0, nr)
        svm_object = None
        nb_object = None
        if 'svm' in mode:
            svm_object = Start_SVM(array, mode, self.tweet_class, True,
                                   tuplebows, self.CROSS_VALIDATION)
        if 'nb' in mode:
            nb_object = Start_NB(array, mode, self.tweet_class, True,
                                 tuplebows, ngrambow, self.CROSS_VALIDATION)
        return svm_object, nb_object

    def _classify(self, svm_object, nb_object, mode, array, loadclassifier):
        """Run the available classifier(s) on ``array`` and return the prediction.

        When both objects are present the Naive Bayes prediction is the one
        returned, because it is computed last.
        """
        prediction = None
        if svm_object is not None:
            # 0.001 and 10 are presumably gamma and C -- TODO confirm
            # against Start_SVM.start_classification.
            prediction = svm_object.start_classification(mode, array, loadclassifier, 0.001, 10)
        if nb_object is not None:
            prediction = nb_object.start_classification(mode, array, False, loadclassifier)
        return prediction

    def get_activity_tweets(self, inputfile, mode, ngrambow, nr, loadclassifier):
        """Classify the unlabelled tweets of ``inputfile`` and write the result.

        A row counts as unlabelled when column 5 is empty and column 0 is
        all digits.  Predictions are written to ``<inputfile stem>_class.csv``
        through ``helpers.write_classification_to_tweetfile``.
        """
        # Create classifier on trainingdata of class
        svm_object, nb_object = self._build_classifiers(mode, ngrambow, nr)

        # Get tweets of new data
        new_tweets = {}
        with open(inputfile, 'rU') as handle:
            for row in csv.reader(handle, delimiter=self.DELIMITER):
                if row[5] == '' and row[0].isdigit():
                    new_tweets[len(new_tweets)] = row[3]

        # Preprocess new data
        inputfile_filename = inputfile.split('.')[0]
        self.preprocess_tweets(mode, new_tweets, inputfile_filename)
        array = self.get_preprocessed_array(mode)

        # Classify new data
        prediction = self._classify(svm_object, nb_object, mode, array, loadclassifier)

        # Print statistics and write to file
        self.count_classes(prediction.tolist())
        classification_filename = inputfile_filename + "_class.csv"
        helpers.write_classification_to_tweetfile(prediction, 0, 5, inputfile, classification_filename)

    def analysis_classification(self, mode, ngrambow, nr, loadclassifier):
        """Classify the training tweets themselves and write the result.

        Predictions are written to ``<TRAININGFILE stem>_class.csv``.
        """
        # Get tweets
        # NOTE(review): all_tweets (every non-header row) is collected but
        # never used; the preprocessing step below passes self.tweets, which
        # only holds rows with a known class.  Confirm which was intended.
        all_tweets = {}
        with open(self.TRAININGFILE, 'rU') as handle:
            data = csv.reader(handle, delimiter=self.DELIMITER)
            for i, row in enumerate(data):
                if i == 0:
                    continue
                all_tweets[len(all_tweets)] = row[3]

        # Create classifier on trainingdata of class
        svm_object, nb_object = self._build_classifiers(mode, ngrambow, nr)

        print("preprocess new data")
        # Preprocess new data
        training_filename = self.TRAININGFILE.split('.')[0]
        self.preprocess_tweets(mode, self.tweets, training_filename)
        array = self.get_preprocessed_array(mode)

        # Classify tweets
        prediction = self._classify(svm_object, nb_object, mode, array, loadclassifier)

        self.count_classes(prediction.tolist())
        classification_filename = training_filename + "_class.csv"
        helpers.write_classification_to_tweetfile(prediction, 1, 7, self.TRAININGFILE, classification_filename)

    def start_svm_evaluation(self, array, mode, svmtype, ngrambow, minborder, maxborder, nr, tuplebows):
        """Start SVM classification learning.

        Returns whatever ``Start_SVM.start_svm_evaluation`` returns; the
        caller in this class unpacks it as (resultscores_tuple, gamma, c).
        ``ngrambow`` is accepted but not used here.
        """
        svmObject = Start_SVM(array, mode, self.tweet_class, True, tuplebows, self.CROSS_VALIDATION)
        return svmObject.start_svm_evaluation(mode, svmtype, minborder, maxborder, nr, tuplebows)

    def start_naivebayes_classification(self, array, mode, ngrambow, minborder, maxborder, nr, tuplebows):
        """Start Naive Bayes classification learning.

        Returns whatever ``Start_NB.start_naivebayes_evaluation`` returns;
        the caller in this class unpacks it as a 3-tuple.
        """
        nbObject = Start_NB(array, mode, self.tweet_class, True, tuplebows, ngrambow, self.CROSS_VALIDATION)
        return nbObject.start_naivebayes_evaluation(mode, minborder, maxborder, nr)

    def setup_input_classification(self, mode, ngrambow, minborder, maxborder, nr):
        """Set up the preprocessed array and the bags of words for ``mode``.

        Returns:
            Tuple ``(array, (negbow, posbow))``.
        """
        array = self.get_preprocessed_array(mode)
        # Each bag is asked for nr // 2 entries (explicit floor division, so
        # the value stays an int regardless of the division semantics).
        tuplebows = self.collect_bow(array, ngrambow, minborder, maxborder, nr // 2)
        return (array, tuplebows)

    def compare_dummy_classification(self):
        """Score three DummyClassifier baselines on the train/test vectors.

        Returns:
            List of three tuples
            ``(name, 'N.A.', 'N.A.', 'N.A.', 'N.A.', scores)`` for the
            'most_frequent', 'stratified' and 'uniform' strategies, where
            ``scores`` is the result of ``precision_recall_fscore_support``.

        NOTE(review): ``scores`` has four elements, while
        write_results_to_file unpacks a pair (f1_avg, f1_array) -- the two
        do not fit together as written.
        """
        baselines = (
            ('dummy freq', 'most_frequent'),
            ('dummy strat', 'stratified'),
            ('dummy uni', 'uniform'),
        )
        dummy_results = []
        for name, strategy in baselines:
            dummy = DummyClassifier(strategy=strategy, random_state=0)
            dummy.fit(self.train_vectors, self.train_tweetclasses)
            predicted = dummy.predict(self.test_vectors)
            scores = precision_recall_fscore_support(self.test_tweetclasses, predicted)
            dummy_results.append((name, 'N.A.', 'N.A.', 'N.A.', 'N.A.', scores))
        return dummy_results

    def collect_bow(self, array, ngram_types_array, posborder, negborder, nr):
        """Collect bags of words of ``array`` for the given n-gram types.

        For every entry of ``ngram_types_array`` a corpus is created and two
        partial bags are merged into the running dictionaries.

        Returns:
            Tuple ``(negbow, posbow)`` -- negative bag first.
        """
        bowObject = BagOfWords(array, self.tweet_class)
        negbow = {}
        posbow = {}
        # Create positive and negative bag of words
        for item in ngram_types_array:
            bowObject.create_corpus(item)
            posbow.update(bowObject.bow_partial(max_border=0 + posborder, min_border=-1, nr=nr))
            negbow.update(bowObject.bow_partial(max_border=1, min_border=0 + negborder, nr=nr))
        return (negbow, posbow)

    def get_preprocessed_array(self, arrayname):
        """Return the preprocessed array named in the mode string ``arrayname``.

        The first whitespace-separated token that is exactly one of
        'stem', 'token', 'pos' or 'lemma' selects the array.  This makes
        both 4-token modes ('svm rbf stem posneg') and 3-token modes
        ('nb lemma posneg') resolve correctly; looking only at the third
        token would turn 'posneg' into a match for 'pos'.

        If no token matches exactly, the third token is checked with the
        legacy substring test.  An empty list is returned when nothing
        matches (including mode strings with fewer than three tokens).
        """
        arrays = {
            "stem": self.stemmed_tweets_array,
            "token": self.tokenized_tweets_array,
            "pos": self.pos_tweets_array,
            "lemma": self.lemmatized_tweets_array,
        }
        tokens = arrayname.split()

        for token in tokens:
            if token in arrays:
                return arrays[token]

        # Legacy fallback: substring match on the third token only.
        if len(tokens) > 2:
            for name in self._PREPROCESS_NAMES:
                if name in tokens[2]:
                    return arrays[name]
        return []

    def preprocess_tweets(self, mode, tweets_dict, filename):
        """Preprocess ``tweets_dict`` and store the arrays that ``mode`` names.

        The checks are substring tests on the whole mode string, so e.g.
        'posneg' also satisfies the "pos" test.
        """
        processObject = Preprocessing(mode, tweets_dict, filename)
        processObject.preprocess_tweets()

        if "stem" in mode:
            self.stemmed_tweets_array = processObject.stemmed_tweets_array
        if "token" in mode:
            self.tokenized_tweets_array = processObject.tokenized_tweets_array
        if "pos" in mode:
            self.pos_tweets_array = processObject.pos_tweets_array
        if "lemma" in mode:
            self.lemmatized_tweets_array = processObject.lemmatized_tweets_array

    def count_classes(self, tweet_class_array):
        """Print how often each class label occurs in ``tweet_class_array``.

        Label 0 is reported as activity, label 1 as non-activity.  The
        argument must support ``len()`` and ``.count()`` (e.g. a list).
        """
        total = len(tweet_class_array)
        activity_count = tweet_class_array.count(0)
        nonactivity_count = tweet_class_array.count(1)

        print(">> Statistics:")
        print("Total number of tweets: %i" % total)
        print("Total activity tweets: %i" % activity_count)
        print("Total non-activity tweets: %i" % nonactivity_count)

    def string_metrics(self, f1_array):
        """Return the values of ``f1_array`` formatted as '%.4f' strings."""
        return ["%.4f" % item for item in f1_array]

    def write_results_to_file(self, results):
        """Append evaluation results to RESULTFILE as CSV rows.

        Args:
            results: iterable of ``(mode, gamma, c, ngram, bow, tuples)``
                with ``tuples == (f1_avg, f1_array)``.  Float gamma / c are
                formatted; other values (e.g. 'N.A.') are written as is.

        A TypeError while building rows (for example ``results`` being
        None) is reported on stdout; the rows built so far are still
        written.
        """
        rows = []
        try:
            for item in results:
                mode, gamma, c, ngram, bow, tuples = item
                f1_avg, f1_array = tuples
                if isinstance(gamma, float):
                    gamma = "%.4f" % gamma
                if isinstance(c, float):
                    c = "%.0f" % c
                row = [mode, gamma, c, ngram, bow, "%.4f" % f1_avg]
                row += self.string_metrics(f1_array)
                rows.append(row)
        except TypeError:
            print("Error: Type of parameter result")
            print(results)
        helpers.write_to_csv(self.RESULTFILE, "a", rows)

    def write_begin(self):
        """Write the header row to RESULTFILE (opened with mode "wb")."""
        # One "Round N" column per cross-validation fold
        round_headers = ["Round %i" % i for i in range(1, self.CROSS_VALIDATION + 1)]
        header = ["MODE", "GAMMA", "C", "NGRAM", "BOW", "F1 AVG"] + round_headers
        helpers.write_to_csv(self.RESULTFILE, "wb", [header])

    def run_classification_evaluation(self, modes, ngramarray, lenbows):
        """Evaluate every (mode, ngram, lenbow) combination and log the results.

        Args:
            modes: mode strings; 'svm' modes carry the SVM type as their
                second token (e.g. 'svm rbf stem posneg').
            ngramarray: list of n-gram size lists, e.g. ``[[1], [1, 2]]``.
            lenbows: bag-of-words sizes to try.

        Each combination's result is appended to RESULTFILE right away; the
        total running time is printed at the end.
        """
        begin = time.time()

        for mode in modes:
            print("-- RUN NEW MODE: %s.." % mode)
            for ngram in ngramarray:
                print("-- RUN NEW NGRAM: %s.." % str(ngram))
                for lenbow in lenbows:
                    resulttuple = None
                    array, tuplebow = self.setup_input_classification(mode, ngram, 0, 0, lenbow)

                    if 'svm' in mode:
                        svmtype = mode.split()[1]
                        (result, gamma, c) = self.start_svm_evaluation(
                            array, mode, svmtype, ngram, 0, 0, lenbow, tuplebow)
                        resulttuple = [(mode, gamma, c, ngram, lenbow, result)]
                    if 'nb' in mode:
                        (result, gamma, c) = self.start_naivebayes_classification(
                            array, mode, ngram, 0, 0, lenbow, tuplebow)
                        resulttuple = [(mode, gamma, c, ngram, lenbow, result)]

                    self.write_results_to_file(resulttuple)

        # Run dummy classification
        #dummy_result_array = self.compare_dummy_classification()
        #self.write_results_to_file(dummy_result_array)
        print("TIME TAKEN: %f seconds" % (time.time() - begin))
# Driver script: build Main on the training data, then run a series of
# evaluation batches.  Every batch appends its rows to Main.RESULTFILE.
m = Main(True, "frog lemma pos stem token --debug")
#m.get_activity_tweets('day_saturday.csv','svm lemma posneg --debug --write', [1,2], 100, True)
#m.write_begin()
#m.analysis_classification('nb lemma posneg --debug', [1,2], 100, False)
#Dmodes = ['nb token posneg', 'nb token pos1', 'nb token neg1',
#D 'nb stem posneg', 'nb stem pos1', 'nb stem neg1',
#D 'nb lemma posneg', 'nb lemma pos1', 'nb lemma neg1',
#D 'nb pos posneg', 'nb pos pos1', 'nb pos neg1']
#modes = ['svm token posneg', 'svm token pos1', 'svm token neg1','svm token freq']
#DONE 'svm token pn-neutral' , 'svm token posneg', 'svm token pos1', 'svm token neg1','svm token freq'
#DONE 'svm stem pn-neutral' , 'svm stem posneg', 'svm stem pos1', 'svm stem neg1','svm stem freq',
#modes= ['svm lemma pn-neutral' , 'svm lemma posneg', 'svm lemma pos1', 'svm lemma neg1','svm lemma freq',
# 'svm pos posneg', 'svm pos pos1', 'svm pos neg1','svm pos freq']
#DONE ['svm token freq', 'svm stem freq', 'svm lemma freq','svm pos freq']
#TODO: modes = ['svm lemma pn-neutral', svm pos pn-neutral
"""
ALL POSSIBLE MODES:
modes = {svm ln, svm rb} x {token, stem, lemma, pos} x{posneg, pos1, neg1, freq} x {[1], [1,2], [1,2,3], [2,3]} x {50, 74, 100, 124, 150, 174, 200}
+ {nb} x {token, stem, lemma, pos} x {posneg, pos1, neg1} x {[1], [1,2], [1,2,3], [2,3]} x {50, 74, 100, 124, 150, 174, 200}
= 896+336
['svm token posneg', 'svm token pos1', 'svm token neg1','svm token freq',
'svm stem posneg', 'svm stem pos1', 'svm stem neg1','svm stem freq',
'svm lemma posneg', 'svm lemma pos1', 'svm lemma neg1','svm lemma freq',
'svm pos posneg', 'svm pos pos1', 'svm pos neg1','svm pos freq']
"""

# Batch 1: RBF SVM on stemmed tweets, all n-gram sets, all bag sizes.
modes = ['svm rbf stem posneg', 'svm rbf stem pos1', 'svm rbf stem neg1']
ngramarray = [[1], [1, 2], [1, 2, 3], [2, 3]]
lenbows = [50, 74, 100, 124, 150, 174, 200]
m.run_classification_evaluation(modes, ngramarray, lenbows)

# Batch 2: RBF SVM on tokens, 1-3 grams, only the three largest bag sizes.
# (This list used to be assigned to a misspelled name, ``lenbow``, so the
# call silently reused the full list from batch 1.)
modes = ['svm rbf token posneg']
ngramarray = [[1, 2, 3]]
lenbows = [150, 174, 200]
m.run_classification_evaluation(modes, ngramarray, lenbows)

# Batch 3: same mode, 2-3 grams, all bag sizes again.
ngramarray = [[2, 3]]
lenbows = [50, 74, 100, 124, 150, 174, 200]
m.run_classification_evaluation(modes, ngramarray, lenbows)

#m.write_begin()
# Batch 4: linear SVM on POS tags, all n-gram sets, all bag sizes.
# (The last entry used to read 'svm lnpos freq' -- missing space -- which
# yields neither a valid SVM type nor a preprocessing name.)
modes = ['svm ln pos posneg', 'svm ln pos pos1', 'svm ln pos neg1', 'svm ln pos freq']
ngramarray = [[1], [1, 2], [1, 2, 3], [2, 3]]
m.run_classification_evaluation(modes, ngramarray, lenbows)