detection_offensive.py
import pandas as pd
from pandas import DataFrame
import numpy as np
from pathlib import Path, PureWindowsPath
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras import Sequential
from keras import layers
from keras import utils
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold
#word2vec
import gensim
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from gensim.models.keyedvectors import KeyedVectors
from keras_preprocessing.text import text_to_word_sequence
import string
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
import re
#from spellchecker import SpellChecker
from emot import EMOTICONS
from emot import UNICODE_EMO
import nltk
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus.reader import wordnet
def set_up_model(vocab_size, input_length, embedding=np.zeros((1,0))):
"""embedding optional"""
model = Sequential()
if(embedding.size):
print("Embedding layer using weights calculated in embedding matrix ")
        model.add(layers.Embedding(vocab_size, embedding_length, input_length=input_length, weights=[embedding], trainable=False))
else:
print("Embedding layer with random intiialized word vectors")
model.add(layers.Embedding(vocab_size,embedding_length , input_length=input_length, trainable=False))
    model.add(layers.Conv1D(filter_n, filter_height, strides=strides, padding='valid', activation='elu'))
model.add(layers.GlobalMaxPooling1D())
model.add(layers.Dropout(0.5))
    #model.add(layers.Dense(10, activation='relu'))
    model.add(layers.Dense(3, activation='softmax'))#softmax for single-label multiclass; sigmoid would be for multilabel
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()
return model
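#Usage sketch (illustrative, not part of the original pipeline): the optional embedding must already be
#shaped (vocab_size, embedding_length); the zero matrix below is only a placeholder for real weights.
#   m = set_up_model(vocab_size=20000, input_length=max_len,
#                    embedding=np.zeros((20000, embedding_length)))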
def get_info_labels(labels, axis=None):
classes, classes_c = np.unique(labels, return_counts=True, axis=axis)
print("number of labels: " + str(len(classes)))
print("labels available and occurrences: ")
for i, v in enumerate(classes):
print(v, end='')
print(" : ", end=' ')
print(classes_c[i])
return classes_c, classes
def calculate_confusion_matrix(correct_labels, prediction_labels, correct1hot=True, prediction1hot=False, names=None):
"""Description: print to screen confusion matrix of predicted labels compared to correct ones. Correct labels can be 1hotencoded """
if(correct1hot):
correct_labels = np.argmax(correct_labels, axis=1)
if(prediction1hot):
prediction_labels = np.argmax(prediction_labels, axis=1)
    #Predicted0|Predicted1|Predicted2|
#Actual0| | | |
#Actual1| | | |
#Actual2| | | |
c_matrix = confusion_matrix(correct_labels, prediction_labels, labels=names)
print(c_matrix)
print("Accuracy score: ", accuracy_score(correct_labels, prediction_labels))
print("Report")
print(classification_report(correct_labels, prediction_labels))
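#Usage example (mirrors how run_model calls it below): the correct labels are one-hot encoded
#(the default correct1hot=True) while the predictions are plain class ids, e.g.
#   calculate_confusion_matrix(label_test, model.predict_classes(data_test_seq, batch_size))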
def run_kfold(splits, data, labels, word2vec=True, pre_embedding=False):
kf = KFold(n_splits=splits, shuffle=True)
encoder = LabelEncoder()
labels_int = encoder.fit(labels).transform(labels)
print(encoder.classes_)
label_names = encoder.classes_
#1hot encode the labels - [1,0,0] : 0; [0,1,0]: 1; [0,0,1]: 2
labels_encoded = utils.to_categorical(labels_int)
history = list()
accuracies_training = list()
accuracies_testing = list()
for train_index, test_index in kf.split(data):
        #data is split based on the fold's train/test indices
data_train , data_test = data[train_index], data[test_index]
label_train , label_test = labels_encoded[train_index], labels_encoded[test_index]
#resetting the model every iteration
history_temp, a_tr, a_te = run_model(data_train, data_test, label_train, label_test, word2vec, pre_embedding)
accuracies_training.append(a_tr)
accuracies_testing.append(a_te)
history.append(history_temp)
print("KFold ended")
print("Accuracy training average: {}".format(str(average_accuracies(accuracies_training)[0])))
print("Accuracy testing average: {}".format(str(average_accuracies(accuracies_testing)[0])))
return history, accuracies_training, accuracies_testing
def average_accuracies(accuracies):
"""Assumes that accuracies is a list of tuples (accuracy, loss)"""
average_accuracy = np.average(list(list(zip(*accuracies))[0]))
average_loss = np.average(list(list(zip(*accuracies))[1]))
return average_accuracy, average_loss
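#Worked example (illustrative): average_accuracies([(0.90, 0.30), (0.80, 0.50)]) returns (0.85, 0.40):
#the first element averages the accuracies, the second averages the losses.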
def plot_history(training):
acc = training.history['accuracy']
val_acc = training.history['val_accuracy']
loss = training.history['loss']
val_loss = training.history['val_loss']
x = range(1, len(acc) + 1)
plt.figure(figsize=(14, 8))
    plt.subplot(1, 2, 1)
plt.plot(x, acc, 'b', label='Training acc', color="blue")
plt.plot(x, val_acc, 'r', label='Validation acc', color="green")
plt.title('Training and validation accuracy')
plt.legend()
plt.subplot(1, 2, 2)
plt.plot(x, loss, 'b', label='Training loss', color="pink")
plt.plot(x, val_loss, 'r', label='Validation loss', color="red")
plt.title('Training and validation loss')
plt.legend()
plt.show()
#file_path = PureWindowsPath("E:\Users\User\Documents\SCHOOL\\5thYear\data_wrangling\cw-partB\short.csv")
path_full = "E:\\Users\\User\\Documents\\SCHOOL\\5thYear\\data_wrangling\\cw-partB\\train.csv"
directory = "E:\\Users\\User\\Documents\\SCHOOL\\5thYear\\data_wrangling\\cw-partB\\"
path_sampled = "E:\\Users\\User\\Documents\\SCHOOL\\5thYear\\data_wrangling\\cw-partB\\sampled.csv"
path_sampled5 = "E:\\Users\\User\\Documents\\SCHOOL\\5thYear\\data_wrangling\\cw-partB\\sampled5.csv"
path_full_no_emojis = ".\\datasets\\full_noemojis.csv"
# if file_path.exists():
# print("ok")
# else:
# raise FileNotFoundError("the file specified doesn't exist!")
#hyperparameters
max_len = 100 #maximum number of words per sentence; shorter sentences are padded with zero vectors
embedding_length = 300 #number of elements in each word vector
filter_n = 100
filter_height = 3
strides = 1
batch_size = 32
epochs = 10
#
#path_model_wc = 'E:\\Users\\User\\Documents\\SCHOOL\\5thYear\\data_wrangling\\cw-partB\\models\\init_50.wv' #model to be used to load a pretrained wc
#path_model_wc = 'C:\\Users\\Alessio\\gensim-data\\glove-twitter-25\\glove-twitter-25.gz'
#path_model_wc = 'C:\\Users\\Alessio\\gensim-data\\glove-wiki-gigaword-50\\glove-wiki-gigaword-50.gz'
path_model_wc = 'E:\\Users\\User\\Documents\\SCHOOL\\5thYear\\data_wrangling\\cw-partB\\models\\cleaned_noemoj.wv'
def word2vec_embedding(vocab_size, word_index, pre_embedding=False, data_embedding=np.zeros((1,0))):
"""data_embedding: optional, only needed if pre_embedding is not used. pre_embedding states that a preembedding generated model will be loaded from
global variable path_model_wc (bad practice fix). data_embedding used to pass the data to train the word embedding"""
#load pretrained model
print("Fitting the embedding matrix")
word_vectors : KeyedVectors
if not pre_embedding:
print("Calculating the words vector from datasetfile")
if not data_embedding.size:
#list is empty and should not be: throw error
raise AttributeError("data embedding parameter should not be empty if pre_embedding is set to False")
word_vectors = get_word2vec_embedding(data_embedding)
else:
print("Loading preembedded word vector from {}".format(path_model_wc))
word_vectors = KeyedVectors.load_word2vec_format(path_model_wc, binary=False)
embedding_matrix = np.zeros((vocab_size, embedding_length))
c_not_present = 0
for word, position in word_index.items():
#vocab_size is the total number of the whole available words
        if position >= vocab_size: #word_index contains every word seen, but the tokenizer's sequences only use the n most common words (n is set when creating the tokenizer), so skip anything outside the vocabulary
continue
try:
embedding_word = word_vectors[word]
embedding_matrix[position] = embedding_word
except KeyError:
            #create a random word vector - could compare random vs. zero initialization
embedding_matrix[position] = np.random.normal(scale = 0.1,size = (embedding_length, ))
c_not_present += 1
print("Words not present, total intialized at random: {} %".format(str(c_not_present * 100 / len(embedding_matrix))))
return embedding_matrix
def run_model(data_train, data_test, label_train, label_test, word2vec=True, pre_embedding=False):
"""Return: history: of training, evaluation_training: (accuracy, loss), evaluation_testing : (accuracy, loss). If embedding is set to true, prembedding words matrix is calculated """
print("Training labels")
get_info_labels(label_train,0)
print("Testing labels")
get_info_labels(label_test,0)
tokenizer = Tokenizer(lower=False, filters=[])
tokenizer.fit_on_texts(data_train)
    #debug: count how many words occur only once in the training data
    c = 0
    for word, position in tokenizer.word_index.items():
        #print(word + str(tokenizer.word_counts[word]))
        if(tokenizer.word_counts[word] == 1):
            c += 1
    print("words occurring only once:", c)
    #end debug
x_train = tokenizer.texts_to_sequences(data_train)
x_test = tokenizer.texts_to_sequences(data_test)
vocab_size = len(tokenizer.word_index) + 1
print(data_train[0])
print(x_train[0])
data_train_seq = sequence.pad_sequences(x_train, maxlen=max_len)
data_test_seq = sequence.pad_sequences(x_test, maxlen=max_len)
print(data_train_seq[0])
print(data_train_seq.shape[1])
print(len(data_train_seq[0]))
print(len(data_train_seq))
    #default to an empty matrix; it is only filled if word2vec=True
embedding_matrix = np.zeros((1,0))
#check if user prefers that embedded model is loaded with a calculated matrix weights
if(word2vec):
embedding_matrix = word2vec_embedding(vocab_size, tokenizer.word_index, pre_embedding, np.concatenate((data_train,data_test)))
#embedding_matrix[0] = np.array([1] * embedding_length)
model = set_up_model(vocab_size, data_train_seq.shape[1], embedding_matrix)
training = model.fit(data_train_seq, label_train, batch_size=batch_size, epochs=epochs, validation_data=(data_test_seq, label_test))
loss_train, accuracy_train = model.evaluate(data_train_seq, label_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy_train))
loss_test, accuracy_test = model.evaluate(data_test_seq, label_test, verbose=True)
print("Testing Accuracy: {:.4f}".format(accuracy_test))
# print(data_test.shape)
#debug trying to predict specific ones
predictions_class = model.predict_classes(data_train_seq, batch_size)#, len(data_train_seq) // batch_size + 1 )
print("predictions:", pd.unique(predictions_class))
calculate_confusion_matrix(label_train, predictions_class)
predictions = model.predict_classes(data_test_seq, batch_size)
print("predictions:", pd.unique(predictions))
calculate_confusion_matrix(label_test, predictions)
return training, (accuracy_train, loss_train), (accuracy_test, loss_test)
def run_single_model(data, labels, word2vec=True, pre_embedding=False, downsample=False):
#model = set_up_model()
encoder = LabelEncoder()
labels_int = encoder.fit(labels).transform(labels)
print(encoder.classes_)
label_names = encoder.classes_
#1hot encode the labels - [1,0,0] : 0; [0,1,0]: 1; [0,0,1]: 2
labels_encoded = utils.to_categorical(labels_int)
data_train, data_test, label_train, label_test = train_test_split(data, labels_encoded, test_size=0.20)
if(downsample):
data_train, label_train = downsample_frequent_labels(data_train, label_train)
return run_model(data_train, data_test, label_train, label_test, word2vec=word2vec, pre_embedding=pre_embedding)
def get_list_index_labels(data_labels, labels_type):
"""returns a list of lists. Each position will contain all the indexes of the same type of elements that are in the jumbled list data provided
labels_type is the unique type of labels in the data"""
sorted_by_type = [[] for x in range(len(labels_type) + 1)] #np.empty((len(labels_type) + 1,0))
for index, label in enumerate(data_labels):
for i, label_t in enumerate(labels_type):
if(np.array_equal(label,label_t)):
sorted_by_type[i].append(index)
return sorted_by_type
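#Worked example (illustrative): with one-hot label rows,
#   get_list_index_labels(np.array([[1,0],[0,1],[1,0]]), np.array([[1,0],[0,1]]))
#returns [[0, 2], [1], []] - indexes grouped per label type, plus one trailing empty slot
#coming from the len(labels_type) + 1 sizing above.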
def oversample_rare_labels(data, labels):
    """function used to give more representation to rare labels by duplicating instances of rare label classes"""
def downsample_frequent_labels(data, labels):
"""function used to give more representation to more rare labels by removing instances of frequent label classes"""
labels_c, labels_u = get_info_labels(labels, 0)#unique and count labels returned
min_index = np.argmin(labels_c)
min_n = labels_c[min_index]
l_indexes = get_list_index_labels(labels, labels_u)
l_chosen = []
    #pick random positions to be removed from l_indexes - the simpler "keep only the chosen" approach was not used because it would create a list ordered by label type, which is not good for the CNN
for index in range(len(labels_c)):
l_chosen.append(np.random.choice(l_indexes[index], replace=False, size=labels_c[index] - min_n))
#remove all the elements using reversed loop sorted
flattened = [item for sublist in l_chosen for item in sublist]
data = np.delete(data, flattened)
    labels = np.delete(labels, flattened, axis=0) #axis=0 removes whole one-hot rows
return data, labels
# output_data = data[ [item for sublist in l_chosen for item in sublist]] #have to flatten the l_chosen (shape 3,0)
# output_labels = labels[ [item for sublist in l_chosen for item in sublist]]
# down_sampled = np.chararray(())
# for index in range(len(labels_c)): #pick one item from each places
# for i in range(labels_c[index] - min_n):#pop as many items as the number of minimum label class
# random_position = np.random.randint(0, len(l_indexes[index])) #used to pick a random position in the l_indexes
# #pop element from either labels and from the data array
# data.pop(l_indexes[random_position])
# labels.pop(l_indexes[random_position])
# #have to pop also from the l_indexes to reflect changes in the arrays
# l_indexes.pop(random_position)
# get_info_labels(output_labels, 0)
# return output_data, output_labels
###helper functions###
def save_model_from_pretrained(sentences, pretrained_path, model_name, model_outpath, embedding_length):
"""will create a full model file and a word2vec file using only the words provided. This creates a smaller model than the very big sized pretrained provided
words: list of strings"""
#embedding size must be the same as the pretrained embedding word vector length
t = Tokenizer()
t.fit_on_texts(sentences)
model = Word2Vec(size=embedding_length, min_count=1)
words = t.index_word.values()
model.build_vocab([words], update=False)
model.intersect_word2vec_format(pretrained_path, binary=False)
model.save(model_outpath + model_name + ".model")
model.wv.save_word2vec_format(model_outpath + model_name + ".wv")
print("Model and word vectors have been saved to: {}".format(model_outpath))
def get_max_length_sentences(file_path, n_elements):
"""specific for the cw dataset. n_elements will print give out the n_longest sentences"""
df = pd.read_csv(file_path, names=["id", "text", "label"], sep=",");
data = df["text"].values
data = np.array(data)
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data)
series = pd.Series(data)
splitted = series.str.split(' ')
max_ = list((x) for x in splitted._values)
max_length = dict((index, len(x)) for index, x in enumerate(splitted._values))
#max_length.sort(reverse=True)
s = sorted(max_length.items(), key=lambda x: x[1], reverse=True)
# for i in range(10):
# print(data[s[i][0]])
# print('-*10')
return s[0:n_elements]
def get_word2vec_embedding(data):
sentences_words = list(text_to_word_sequence(x, filters=[], lower=False) for x in data)
model = Word2Vec(sentences_words, size=embedding_length, workers=4, min_count=1 , sg=1, hs=1, iter=5)
print("Number of word vectors: {}".format(len(model.wv.vocab)))
return model.wv
def write_df_to_file(df, pathout):
print("writing df to {}".format(pathout))
df.to_csv(pathout, header=False, index=False)
def convert_emoticons(text):
for emot in EMOTICONS:
        #escape the emoticon because many contain regex metacharacters such as ) or (
        text = re.sub(u'(' + re.escape(emot) + ')', " ".join(EMOTICONS[emot].replace(",","").split()), text)
return text
def convert_emojis(text):
for emot in UNICODE_EMO:
text = text.replace(emot, " ".join(UNICODE_EMO[emot].replace(",","").replace(":","").split()) + " ")
return text
def lemmatize_words(text):
lemmatizer = WordNetLemmatizer()
wordnet_map = {"N":wordnet.NOUN, "V":wordnet.VERB, "J":wordnet.ADJ, "R":wordnet.ADV} # Pos tag, used Noun, Verb, Adjective and Adverb
pos_tagged_text = nltk.pos_tag(text.split())
return " ".join([lemmatizer.lemmatize(word, wordnet_map.get(pos[0], wordnet.NOUN)) for word, pos in pos_tagged_text])
def write_convert_df_to_text_emojis(df, column, pathout):
df[column] = df[column].apply(convert_emoticons)
df[column] = df[column].apply(convert_emojis)
write_df_to_file(df, pathout)
def clean_dict(df : DataFrame, column):
print("Preprocessing data text - cleanining")
#drop duplicates
d = df.duplicated(column, keep='first')
print("removing duplicate sentences. Duplicates = {} senteces".format(len(df[d][column])))
#lowercase
df[column] = df[column].str.lower()
print("len before removing: {}".format(len(df[column])))
df.drop_duplicates(subset=column, inplace=True, keep='first')
print("len after removing: {}".format(len(df[column])))
stop = stopwords.words('english')
for i, row in df.iterrows():
sentence = row[column]
#remove http url
sentence = re.sub(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', '', sentence)
#handle emojis before html
sentence = convert_emojis(sentence)
sentence = convert_emoticons(sentence)
#remove html
soup = BeautifulSoup(sentence, "html.parser")
sentence = soup.get_text()
        #remove common stop words - might have to run this earlier because some stop words contain symbols, e.g. isn't
sentence = " ".join(x for x in sentence.split() if x not in stop)
#remove punctuation
sentence = sentence.translate(str.maketrans('', '', string.punctuation))
#stemming?lemmatize?
sentence = lemmatize_words(sentence)
        #spelling correction? textblob or pyspellchecker - problem: slang and non-English words would be turned into other words.
df.at[i, column] = sentence
print(df[column].head())
return df
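#Usage sketch: clean the raw training csv once and persist it, so main() can load the pre-cleaned
#file directly (this mirrors the commented-out clean_dict call in main below).
#   raw = pd.read_csv(path_full, names=["id", "text", "label"], sep=",")
#   write_df_to_file(clean_dict(raw, "text"), path_full_no_emojis)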
def sample_file_percentage(pathin, directory, percentage):
""" Description:
Returns a random sample of length file path * percentage % without replacement by opening the file in the path
"""
df = pd.read_csv(pathin, names=["id", "text", "label"], sep=",");
elements_n = int(len(df.values) * percentage / 100)
dff = pd.DataFrame(df.values)
sampled = dff.sample(n=elements_n, replace=False)
print(len(sampled))
    #write to file - the column names still have to be cleaned manually!
sampled.to_csv(directory + "sampled" + str(percentage) + ".csv", index=False, index_label=False)
##end helper functions
def main():
file_path = ".\\datasets\\full_clean_noemojis_spacesbetween.csv"#"E:\\Users\\User\\Documents\\SCHOOL\\5thYear\\data_wrangling\\cw-partB\\train_no_dup.csv"
df = pd.read_csv(file_path, names=["id", "text", "label"], sep=",")
#df = clean_dict(df, "text")
data = df["text"].fillna("NAN_sentence").values
labels = df["label"].values
print("total labels")
labels_n, __ = get_info_labels(labels)
    #word2vec=True builds the embedding matrix; otherwise the word vectors are initialized randomly
training, accuracies_training , accuracies_testing = run_single_model(data, labels, word2vec=True, pre_embedding=False, downsample=False)
#history, accuracies_training, accuracies_testing = run_kfold(5, data, labels, word2vec=True, pre_embedding=True)
print("Training")
print(accuracies_training)
print("Testing")
print(accuracies_testing)
#print hyperparameters used
print("max_len: {}, embedding_length : {}, filter_n : {}, filter_high : {}, strides : {}, batch_size : {}, epochs : {}".format(max_len, embedding_length, filter_n, filter_heigth, strides, batch_size, epochs))
# plt.style.use('ggplot')
# for training in history:
# plot_history(training)
if __name__ == "__main__":
main()