def process(self, json_filename, h5_filename, plot=False, epochs=100, vect_dimensions=100):
    # open the file with tweets
    X_all = []
    Y_all = []
    All = []
    #with open(self.labeled_tweets_filename, "r", encoding="ISO-8859-1") as f:
    with open(self.labeled_tweets_filename, "r", encoding="utf-8") as f:
        i = 0
        csv_file = csv.reader(f, delimiter=',')
        ones_count = 0
        for r in csv_file:
            if i != 0:
                All.append(r)  # skip the header row
            i = i + 1
    np.random.shuffle(All)
    ones_count = 0
    two_count = 0
    zero_count = 0
    for r in All:
        tweet = r[0]
        label = int(r[1])
        if label == 0:
            zero_count += 1
        elif label == 1:
            ones_count += 1
        else:
            two_count += 1
        X_all.append(tweet)
        Y_all.append(label)
    print("len(Y_all): ", len(Y_all))
    class_weight_val = class_weight.compute_class_weight('balanced', np.unique(Y_all), Y_all)
    print("classes: ", np.unique(Y_all))
    print("counts for 0, 1, 2: ", zero_count, ones_count, two_count)
    print("class weight_val: ", class_weight_val)
    class_weight_dictionary = {
        0: class_weight_val[0],
        1: class_weight_val[1],
        2: class_weight_val[2]
    }
    print("dict: ", class_weight_dictionary)
    print("Data Ingested")
    # divide the data into training and test
    num_data = len(X_all)
    limit = math.ceil(num_data * 0.80)
    X_train_sentences = X_all
    Y_train = Y_all
    G = GloveEmbedding(self.embedding_filename, dimensions=100)
    word_to_idx, idx_to_word, embedding = G.read_embedding()
    # Commented-out preprocessing that built the (N, 75, 100) tensor saved in data/array.npy:
    # S = SentenceToEmbedding(word_to_idx, idx_to_word, embedding)
    # edata = []
    # padding_vect = [0] * 100
    # n = 0
    # for i in X_train_sentences:
    #     m = S.map_sentence(i)
    #     if m.shape[0] < 75:
    #         m = np.vstack((m, np.zeros((75 - m.shape[0], 100))))
    #         # for when this has to be coded "efficiently":
    #         # while m.shape[0] < 75:
    #         #     m = np.vstack((m, np.array(padding_vect)))
    #     else:
    #         if m.shape[0] == 100:
    #             m = np.array([m])
    #             m = np.vstack((m, np.zeros((75 - m.shape[0], 100))))
    #     p = np.array([m])
    #     if n > 0:
    #         edata = np.vstack((edata, p))
    #     else:
    #         edata = p
    #     n = n + 1
    # np.save("array", edata)
    # Load the precomputed (N, 75, 100) sentence-embedding tensor built by the block above.
    X_train = np.load("data/array.npy")
    print("----------------------------------->" + str(X_train.shape))
    Y_train = np.array(Y_train)
    ones_count = np.count_nonzero(Y_train)
    zeros_count = len(Y_train) - ones_count
    print("ones count: ", ones_count)
    print("zeros count: ", zeros_count)
    print("two count: ", two_count)
    Y_train_old = Y_train
    Y_train = to_categorical(Y_train, num_classes=3)
    # plt.imshow(X_train[0])
    # plt.show()
    # Divide the data
    X_test_text = X_all[limit:]
    X_test = X_train[limit:]
    Y_test = Y_train[limit:]
    X_train = X_train[0:limit]
    Y_train = Y_train[0:limit]
    print("----------------------------------->" + str(X_train.shape))
    print("I understand this is the data used for training: ",
          len(X_train), len(X_train[0]), len(X_train[0][0]), " Y_train ", len(Y_train))
    print("Train data converted to numpy arrays")
    NN = KerasInceptionCNN(0, G)
    print("model created")
    kernel_regularizer = l2(0.001)  # currently unused; not passed to build()
    NN.build(filters=11, first_dropout=0, second_dropout=0.05, padding='valid', dense_units=16)
    print("model built")
    NN.summary()
    # alternative optimizers tried during experiments; the compile call below uses 'adam'
    sgd = SGD(lr=0.03, momentum=0.009, decay=0.001, nesterov=True)
    rmsprop = RMSprop(decay=0.003)
    adam = Adam(lr=0.1, decay=0.05)
    sgd = SGD(lr=0.05)
    NN.compile(optimizer='adam', loss="categorical_crossentropy",
               metrics=['accuracy', precision, recall, f1, fprate])
    print("model compiled")
    print("Begin training")
    #class_weight = {0: 0.67, 1: 0.33}
    #class_weight = None
    history = NN.fit(X_train, Y_train, epochs=epochs, batch_size=3,
                     class_weight=class_weight_dictionary)
    print("Model trained")
    print("Predicting")
    print("len(X_test): ", len(X_test))
    preds = NN.predict(X_test)
    print("len(preds): ", len(preds))
    print("type preds: ", type(preds))
    print("preds before: ", preds)
    preds = np.argmax(preds, axis=1)
    print("preds: ", preds)
    print("len(preds): ", len(preds))
    Y_test = Y_train_old[limit:]
    print("Y test: ", Y_test)
    c_matrix = confusion_matrix(Y_test, preds)
    print("matrix: ", c_matrix)
    print("Storing Errors: ")
    ErrorAnalysis.store_errors(X_test_text, Y_test, preds, "errorcnn.csv")
    print("Errors stored")
    print("Confusion matrix: ")
    prec_1, recall_1, f1_1, spec_1, t = calculate_cm_metrics(c_matrix, '')
    print("C1-> precision, recall, F1: ", prec_1, recall_1, f1_1)
    # X_test_indices, max_len = S.map_sentence_list(X_test_sentences)
    # print("Test data mapped")
    # X_test_pad = P.pad_list(X_test_indices)
    # print("Test data padded")
    # X_test = np.array(X_test_pad)
    # Y_test = np.array(Y_test)
    # print("Test data converted to numpy arrays")
    # loss, acc = NN.evaluate(X_test, Y_test, callbacks=[callback])
    # print("accuracy: ", acc)
    T = "I have a bad case of vomit"
    X_Predict = [
        "my zika is bad", "i love colombia", "my has been tested for ebola",
        "there is a diarrhea outbreak in the city"
    ]
    # Map the ad-hoc sentences to the same (75, 100) embedding matrices used for training;
    # the index-based S/P/Trim helpers are not defined in this variant.
    S = SentenceToEmbedding(word_to_idx, idx_to_word, embedding)
    X_Predict_Final = []
    for i, sentence in enumerate(X_Predict):
        m = S.map_sentence(sentence)
        if m.shape[0] < 75:
            m = np.vstack((m, np.zeros((75 - m.shape[0], 100))))
        else:
            m = m[:75]
        print(str(i) + ": ", m.shape)
        X_Predict_Final.append(m)
    print(X_Predict)
    X_Predict_Final = np.array(X_Predict_Final)
    print("Predict: ", np.argmax(NN.predict(X_Predict_Final), axis=1))
    print("Storing model and weights")
    NN.save_model(json_filename, h5_filename)
    if plot:
        print("Plotting")
        self.plot(history)
    print("Done!")
def process(self, json_filename, h5_filename, plot=False, epochs=100, vect_dimensions=50):
    # open the file with tweets
    X_all = []
    Y_all = []
    All = []
    #with open(self.labeled_tweets_filename, "r", encoding="ISO-8859-1") as f:
    with open(self.labeled_tweets_filename, "r") as f:
        i = 0
        csv_file = csv.reader(f, delimiter=',')
        ones_count = 0
        for r in csv_file:
            if i != 0:
                All.append(r)  # skip the header row
            i = i + 1
    np.random.shuffle(All)
    ones_count = 0
    two_count = 0
    zero_count = 0
    for r in All:
        tweet = r[0]
        label = int(r[1])
        if label == 0:
            zero_count += 1
        elif label == 1:
            ones_count += 1
        else:
            two_count += 1
        # Earlier undersampling experiment, kept for reference:
        # if (label == 2):
        #     label = 0
        # if (label == 1) and (ones_count <= 4611):
        #     X_all.append(tweet)
        #     Y_all.append(label)
        #     ones_count += 1
        # elif (label == 0):
        X_all.append(tweet)
        Y_all.append(label)
    print("len(Y_all): ", len(Y_all))
    class_weight_val = class_weight.compute_class_weight('balanced', np.unique(Y_all), Y_all)
    print("classes: ", np.unique(Y_all))
    print("counts for 0, 1, 2: ", zero_count, ones_count, two_count)
    print("class weight_val: ", class_weight_val)
    class_weight_dictionary = {
        0: class_weight_val[0],
        1: class_weight_val[1],
        2: class_weight_val[2]
    }
    print("dict: ", class_weight_dictionary)
    print("Data Ingested")
    # divide the data into training and test
    num_data = len(X_all)
    limit = math.ceil(num_data * 0.80)
    X_train_sentences = X_all
    Y_train = Y_all
    # Divide after conversions
    # divide the data into X_train, Y_train, X_test, Y_test
    #X_train_sentences = X_all[0: limit]
    #Y_train = Y_all[0: limit]
    #X_test_sentences = X_all[limit:]
    #Y_test = Y_all[limit:]
    #print("Data Divided")
    # Get embedding
    #G = Word2VecEmbedding(self.embedding_filename, dimensions=vect_dimensions)
    G = GloveEmbedding(self.embedding_filename, dimensions=vect_dimensions)
    word_to_idx, idx_to_word, embedding = G.read_embedding()
    S = SentenceToIndices(word_to_idx)
    X_train_indices, max_len = S.map_sentence_list(X_train_sentences)
    print("Train data mapped to indices")
    if max_len % 2 != 0:
        max_len = max_len + 1
    P = PadSentences(max_len)
    X_train_pad = P.pad_list(X_train_indices)
    print("Train data padded")
    # TRIM
    trim_size = max_len
    #trim_size = 33
    Trim = TrimSentences(trim_size)
    X_train_pad = Trim.trim_list(X_train_pad)
    print("X[0], ", X_train_pad[0])
    # convert to numpy arrays
    X_train = np.array(X_train_pad)
    Y_train = np.array(Y_train)
    ones_count = np.count_nonzero(Y_train)
    zeros_count = len(Y_train) - ones_count
    print("ones count: ", ones_count)
    print("zeros count: ", zeros_count)
    print("two count: ", two_count)
    Y_train_old = Y_train
    Y_train = to_categorical(Y_train, num_classes=3)
    # Divide the data
    X_test_text = X_all[limit:]
    X_test = X_train[limit:]
    Y_test = Y_train[limit:]
    X_train = X_train[0:limit]
    Y_train = Y_train[0:limit]
    print("data divided on value: ", limit)
    print("lengths X_train, Y_train: ", len(X_train), len(Y_train))
    print("lengths X_test, Y_test: ", len(X_test), len(Y_test))
    print("Train data converted to numpy arrays")
    # Alternative architectures tried during experiments:
    #NN = TweetSentiment2DCNN(trim_size, G)
    #NN = TweetSentiment2LSTM2Dense(trim_size, G)
    #NN = TweetSentiment2LSTM2Dense3Layer(trim_size, G)
    #NN = TweetSentiment2LSTM2Dense4Layer(trim_size, G)
    #NN = TweetSentimentCNN(trim_size, G)
    #print("Build GRU")
    #NN = TweetSentimentGRUSM(max_len, G)
    NN = TweetSentiment1D(trim_size, G)
    #NN = TweetSentiment1DRev(trim_size, G)
    print("model created")
    kernel_regularizer = l2(0.001)  # currently unused; not passed to build()
    #kernel_regularizer = None
    NN.build(filters=11, first_dropout=0, second_dropout=0.05, padding='valid', dense_units=16)
    #NN.build(first_layer_units=max_len, second_layer_units=max_len,
    #         relu_dense_layer=16, dense_layer_units=3,
    #         first_layer_dropout=0, second_layer_dropout=0, third_layer_dropout=0)
    print("model built")
    NN.summary()
    # alternative optimizers tried during experiments; the compile call below uses `adam`
    sgd = SGD(lr=0.03, momentum=0.009, decay=0.001, nesterov=True)
    rmsprop = RMSprop(decay=0.003)
    adam = Adam(lr=0.0003, decay=0.001)
    sgd = SGD(lr=0.05)
    NN.compile(optimizer=adam, loss="categorical_crossentropy",
               metrics=['accuracy', precision, recall, f1, fprate])
    print("model compiled")
    print("Begin training")
    #callback = TensorBoard(log_dir="/tmp/logs")
    #class_weight = {0: 0.67, 1: 0.33}
    #class_weight = None
    #history = NN.fit(X_train, Y_train, epochs=epochs, batch_size=32, callbacks=[callback],
    #                 class_weight=class_weight_dictionary)
    history = NN.fit(X_train, Y_train, epochs=epochs, batch_size=64,
                     class_weight=class_weight_dictionary, validation_split=0.2)
    print("Model trained")
    print("Predicting")
    print("len(X_test): ", len(X_test))
    preds = NN.predict(X_test)
    print("len(preds): ", len(preds))
    print("type preds: ", type(preds))
    print("preds before: ", preds)
    preds = np.argmax(preds, axis=1)
    print("preds: ", preds)
    print("len(preds): ", len(preds))
    Y_test = Y_train_old[limit:]
    print("Y test: ", Y_test)
    c_matrix = confusion_matrix(Y_test, preds)
    print("matrix: ", c_matrix)
    print("Storing Errors: ")
    ErrorAnalysis.store_errors(X_test_text, Y_test, preds, "errorcnn.csv")
    print("Errors stored")
    print("Confusion matrix: ")
    prec_1, recall_1, f1_1, spec_1, t = calculate_cm_metrics(c_matrix, '')
    print("C1-> precision, recall, F1: ", prec_1, recall_1, f1_1)
    # X_test_indices, max_len = S.map_sentence_list(X_test_sentences)
    # print("Test data mapped")
    # X_test_pad = P.pad_list(X_test_indices)
    # print("Test data padded")
    # X_test = np.array(X_test_pad)
    # Y_test = np.array(Y_test)
    # print("Test data converted to numpy arrays")
    # loss, acc = NN.evaluate(X_test, Y_test, callbacks=[callback])
    # print("accuracy: ", acc)
    T = "I have a bad case of vomit"
    X_Predict = [
        "my zika is bad", "i love colombia", "my has been tested for ebola",
        "there is a diarrhea outbreak in the city"
    ]
    X_Predict_Idx, max_len2 = S.map_sentence_list(X_Predict)
    i = 0
    for s in X_Predict_Idx:
        print(str(i) + ": ", s)
        i = i + 1
    print(X_Predict)
    X_Predict_Final = P.pad_list(X_Predict_Idx)
    X_Predict_Final = Trim.trim_list(X_Predict_Final)
    #X_Predict = [X_Predict]
    X_Predict_Final = np.array(X_Predict_Final)
    print("Predict: ", np.argmax(NN.predict(X_Predict_Final), axis=1))
    print("Storing model and weights")
    NN.save_model(json_filename, h5_filename)
    if plot:
        print("Plotting")
        self.plot(history)
    print("Done!")
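# The PadSentences and TrimSentences helpers are used above but not shown in this listing.
# The sketches below are assumptions about their behavior, inferred only from how
# pad_list/trim_list are called: pad every index list with zeros up to a fixed length,
# then cut every list down to a fixed length. The *Sketch class names are illustrative.
class PadSentencesSketch:
    def __init__(self, max_len):
        self.max_len = max_len

    def pad_list(self, sentences):
        # extend each list of word indices with 0s until it reaches max_len
        return [s + [0] * (self.max_len - len(s)) for s in sentences]

class TrimSentencesSketch:
    def __init__(self, trim_size):
        self.trim_size = trim_size

    def trim_list(self, sentences):
        # keep only the first trim_size indices of each sentence
        return [s[:self.trim_size] for s in sentences]

# Usage (hypothetical):
#   P = PadSentencesSketch(max_len)
#   Trim = TrimSentencesSketch(max_len)
#   padded = Trim.trim_list(P.pad_list(X_train_indices))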
def process(self, json_filename, h5_filename, plot=False, epochs=100, vect_dimensions=100):
    # open the file with tweets
    X_all = []
    Y_all = []
    All = []
    #with open(self.labeled_tweets_filename, "r", encoding="ISO-8859-1") as f:
    with open(self.labeled_tweets_filename, "r", encoding="utf-8") as f:
        i = 0
        csv_file = csv.reader(f, delimiter=',')
        ones_count = 0
        for r in csv_file:
            if i != 0:
                All.append(r)  # skip the header row
            i = i + 1
    np.random.shuffle(All)
    ones_count = 0
    two_count = 0
    zero_count = 0
    for r in All:
        tweet = r[0]
        label = int(r[1])
        if label == 0:
            zero_count += 1
        elif label == 1:
            ones_count += 1
        else:
            two_count += 1
        X_all.append(tweet)
        Y_all.append(label)
    print("len(Y_all): ", len(Y_all))
    class_weight_val = class_weight.compute_class_weight('balanced', np.unique(Y_all), Y_all)
    print("classes: ", np.unique(Y_all))
    print("counts for 0, 1, 2: ", zero_count, ones_count, two_count)
    print("class weight_val: ", class_weight_val)
    class_weight_dictionary = {
        0: class_weight_val[0],
        1: class_weight_val[1],
        2: class_weight_val[2]
    }
    print("dict: ", class_weight_dictionary)
    print("Data Ingested")
    # divide the data into training and test
    num_data = len(X_all)
    limit = math.ceil(num_data * 0.80)
    X_train_sentences = X_all
    Y_train = Y_all
    G = GloveEmbedding(self.embedding_filename, dimensions=100)
    word_to_idx, idx_to_word, embedding = G.read_embedding()
    print("hello", embedding[47])
    print("hello", embedding[9876])
    S = SentenceToEmbedding(word_to_idx, idx_to_word, embedding)
    #X_train_matrixes = S.map_sentence(c)
    # Map every tweet to a (75, 100) embedding matrix, padding short sentences with zero rows.
    edata = []
    padding_vect = [0] * 100
    for i in X_train_sentences:
        m = S.map_sentence(i)
        while len(m) < 75:
            m = np.vstack((m, padding_vect))
        m = m[:75]  # trim longer sentences so every matrix has the same shape
        edata.append(m)
    print("hello", edata[1])
    print("len", len(edata[1]), " ", len(edata[1][1]), " ")
    # Reference snippet from the padding helper:
    # padding_len = self.max_len - len(sentence)
    # if padding_len > 0:
    #     padding = []
    #     for _ in range(0, padding_len):
    #         padding.append(0)
    #     return sentence + padding
    # Earlier index-based pipeline, kept for reference:
    # print("Train data mapped to indices")
    # if max_len % 2 != 0:
    #     max_len = max_len + 1
    # P = PadSentences(max_len)
    # X_train_pad = P.pad_list(X_train_indices)
    # print("Train data padded")
    # # TRIM
    # trim_size = max_len
    # #trim_size = 33
    # Trim = TrimSentences(trim_size)
    # X_train_pad = Trim.trim_list(X_train_pad)
    # print("X[0], ", X_train_pad[0])
    # convert to numpy arrays
    X_train = np.array(edata)
    Y_train = np.array(Y_train)
    ones_count = np.count_nonzero(Y_train)
    zeros_count = len(Y_train) - ones_count
    print("ones count: ", ones_count)
    print("zeros count: ", zeros_count)
    print("two count: ", two_count)
    Y_train_old = Y_train
    Y_train = to_categorical(Y_train, num_classes=3)
    # Divide the data
    X_test_text = X_all[limit:]
    X_test = X_train[limit:]
    Y_test = Y_train[limit:]
    X_train = X_train[0:limit]
    Y_train = Y_train[0:limit]
    print("I understand this is the data used for training: ", X_train)
    # print("data divided on value: ", limit)
    # print("lengths X_train, Y_train: ", len(X_train), len(Y_train))
    # print("lengths X_test, Y_test: ", len(X_test), len(Y_test))
    print("Train data converted to numpy arrays")
    NN = KerasInceptionCNN(0, G)
    print("model created")
    kernel_regularizer = l2(0.001)  # currently unused; not passed to build()
    NN.build(filters=11, first_dropout=0, second_dropout=0.05, padding='valid', dense_units=16)
    print("model built")
    NN.summary()
    # alternative optimizers tried during experiments; the compile call below uses 'adam'
    sgd = SGD(lr=0.03, momentum=0.009, decay=0.001, nesterov=True)
    rmsprop = RMSprop(decay=0.003)
    adam = Adam(lr=0.1, decay=0.05)
    sgd = SGD(lr=0.05)
    NN.compile(optimizer='adam', loss="categorical_crossentropy",
               metrics=['accuracy', precision, recall, f1, fprate])
    print("model compiled")
    print("Begin training")
    callback = TensorBoard(log_dir="/tmp/logs")
    #class_weight = {0: 0.67, 1: 0.33}
    #class_weight = None
    history = NN.fit(X_train, Y_train, epochs=epochs, batch_size=32, callbacks=[callback],
                     class_weight=class_weight_dictionary)
    print("Model trained")
    print("Predicting")
    print("len(X_test): ", len(X_test))
    preds = NN.predict(X_test)
    print("len(preds): ", len(preds))
    print("type preds: ", type(preds))
    print("preds before: ", preds)
    preds = np.argmax(preds, axis=1)
    print("preds: ", preds)
    print("len(preds): ", len(preds))
    Y_test = Y_train_old[limit:]
    print("Y test: ", Y_test)
    c_matrix = confusion_matrix(Y_test, preds)
    print("matrix: ", c_matrix)
    print("Storing Errors: ")
    ErrorAnalysis.store_errors(X_test_text, Y_test, preds, "errorcnn.csv")
    print("Errors stored")
    print("Confusion matrix: ")
    prec_1, recall_1, f1_1, spec_1, t = calculate_cm_metrics(c_matrix, '')
    print("C1-> precision, recall, F1: ", prec_1, recall_1, f1_1)
    # X_test_indices, max_len = S.map_sentence_list(X_test_sentences)
    # print("Test data mapped")
    # X_test_pad = P.pad_list(X_test_indices)
    # print("Test data padded")
    # X_test = np.array(X_test_pad)
    # Y_test = np.array(Y_test)
    # print("Test data converted to numpy arrays")
    # loss, acc = NN.evaluate(X_test, Y_test, callbacks=[callback])
    # print("accuracy: ", acc)
    T = "I have a bad case of vomit"
    X_Predict = [
        "my zika is bad", "i love colombia", "my has been tested for ebola",
        "there is a diarrhea outbreak in the city"
    ]
    # Map the ad-hoc sentences with the same SentenceToEmbedding padding used for the
    # training data above; the index-based P/Trim helpers are not defined in this variant.
    X_Predict_Final = []
    for i, sentence in enumerate(X_Predict):
        m = S.map_sentence(sentence)
        while len(m) < 75:
            m = np.vstack((m, padding_vect))
        m = m[:75]
        print(str(i) + ": ", len(m))
        X_Predict_Final.append(m)
    print(X_Predict)
    X_Predict_Final = np.array(X_Predict_Final)
    print("Predict: ", np.argmax(NN.predict(X_Predict_Final), axis=1))
    print("Storing model and weights")
    NN.save_model(json_filename, h5_filename)
    if plot:
        print("Plotting")
        self.plot(history)
    print("Done!")