def training(parameters, alg, class_num=2):
    """Grid-search a classifier inside a TF-IDF pipeline on the SerbMR movie corpus.

    Parameters
    ----------
    parameters : dict
        Hyperparameter grid for ``GridSearchCV`` (keys use the ``tfidf__`` /
        ``alg__`` pipeline prefixes).
    alg : estimator
        Any scikit-learn compatible classifier placed at the end of the pipeline.
    class_num : int, optional
        ``2`` selects the binary corpus (SerbMR-2C); any other value selects the
        three-class corpus (SerbMR-3C).

    Returns
    -------
    float
        Accuracy of the best grid-search model on the held-out 20% test split.
    """
    # Pick the corpus once instead of duplicating the read_csv call per branch.
    corpus = 'SerbMR-2C.csv' if class_num == 2 else 'SerbMR-3C.csv'
    train_data = pd.read_csv('../movie_dataset/' + corpus)

    train_data_X = train_data['Text']
    train_data_y = train_data['class-att']
    X_train, X_test, y_train, y_test = train_test_split(
        train_data_X, train_data_y, test_size=0.2, random_state=7,
        stratify=train_data_y)

    # TF-IDF features: unigrams + bigrams, terms seen in at least 3 documents,
    # tokenized with the project tokenizer.
    text_clf = Pipeline([
        ('tfidf', TfidfVectorizer(min_df=3, ngram_range=(1, 2),
                                  tokenizer=tokenizer.text_to_tokens)),
        ('alg', alg),
    ])

    # 5-fold CV grid search; the returned object is refit on the full training
    # split and can be used directly for prediction.
    gs_clf = GridSearchCV(text_clf, parameters, cv=5, n_jobs=-1)
    gs_clf = gs_clf.fit(X_train, y_train)

    print(gs_clf.best_score_)
    for param_name in sorted(parameters.keys()):
        print("%s: %r" % (param_name, gs_clf.best_params_[param_name]))
    print(gs_clf.cv_results_)

    y_pred = gs_clf.predict(X_test)
    # The original had an if/else whose two branches called the same helper with
    # literal 2 or 3 — collapsed into a single call.
    plotting.calculate_normalized_confusion_matrix(
        y_test, y_pred, 2 if class_num == 2 else 3)
    plotting.show_confusion_matrix()

    return accuracy_score(y_test, y_pred)
# NOTE(review): fragment of a dictionary-based sentiment evaluation routine whose
# enclosing `def` is not visible in this chunk — presumably it scores summed
# lexicon weights (`summ`) against gold labels (`list_out`) and plots a
# confusion matrix; confirm against the full file before editing.
# NOTE(review): `classes_num is 3` compares by identity, not equality — it
# happens to work for small CPython ints but should be `classes_num == 3`;
# cannot be fixed here without the full function in view.
# NOTE(review): thresholding looks like: score >= 0.595 -> positive (1),
# score in (-0.595, 0.595) -> neutral (0), otherwise negative (-1) — TODO confirm.
y_true.append(-1) # Three classes boundary_both = 0.595 if classes_num is 3: for y in summ: if y >= boundary_both: y_both.append(1) elif y > (-1) * boundary_both: y_both.append(0) else: y_both.append(-1) for y in list_out: if y == 'POSITIVE': y_true.append(1) elif y == 'NEUTRAL': y_true.append(0) else: y_true.append(-1) cm3 = plotting.calculate_normalized_confusion_matrix( y_true, y_both, classes_num, title=preprocessed_name + ", negation: " + str(negation) + ", Levenshtein's distance: " + str(leven_num)) plotting.show_confusion_matrix() print(accuracy_score(y_true, y_both)) preprocessed_name = "Preprocessed dictionary"
def keras_mlp_loop_all(classes_num):
    """Exhaustively train MLPs over reduction/order/architecture combinations.

    For every combination of dimensionality reduction ("PCA"/"TruncatedSVD"),
    reduction order ("reduce_first"/"reduce_last"), hidden-layer width,
    optional second hidden layer and its activation, trains a model with
    early stopping and tracks the one with the best validation accuracy.
    The best model is finally evaluated on its held-out test split and a
    normalized confusion matrix is plotted.

    Parameters
    ----------
    classes_num : int
        Number of target classes (2 or 3); also selects the precomputed
        feature-matrix JSON file to load.

    Returns
    -------
    None
        Results are printed and plotted.
    """
    layer2_num = [10, 20, 50]
    layer3_num = [0, 10]               # 0 means "no second hidden layer"
    layer3_activation = ["relu", "sigmoid"]
    best_acc = -3                      # sentinel below any possible accuracy
    curr_y_test = []
    curr_x_test = []
    curr_order = ""
    curr_reduction = ""
    curr_num_of_features = 0
    curr_l2num = 0
    curr_l3num = 0                     # was misnamed curr_13num (digit 1 vs letter l)
    curr_l3act = 0
    best_model = None
    orders = ["reduce_first", "reduce_last"]
    reductions = ["PCA", "TruncatedSVD"]

    for reduction in reductions:
        for order in orders:
            print("###########")
            print("Class of models: " + order + " " + reduction)
            print("###########")
            # Three-class matrices are too large except for one combination.
            if classes_num == 3 and not (order == "reduce_last"
                                         and reduction == "TruncatedSVD"):
                print("Not enough RAM memory to support " + order + " " + reduction)
                continue

            with open("../movie_dataset/mlp_matrix_" + str(classes_num) + "_"
                      + order + "_" + reduction + ".json", "r",
                      encoding='utf-8') as f:
                results = json.load(f)
            x_train = results["x_train_fit"]
            y_train = results["y_train"]

            # 85/15 train/test, then 80/20 train/validation inside the train part.
            x_train, x_test, y_train, y_test = train_test_split(
                x_train, y_train, test_size=0.15, stratify=y_train,
                random_state=7)
            x_train_60, x_validate, y_train_60, y_validate = train_test_split(
                x_train, y_train, test_size=0.20, stratify=y_train,
                random_state=7)

            # One-hot encoding (fit the encoder on the full training labels so
            # all splits share the same class -> index mapping).
            encoder = LabelEncoder()
            encoder.fit(y_train)
            encoded_Y_60 = encoder.transform(y_train_60)
            encoded_y_validate = encoder.transform(y_validate)
            encoded_y_test = encoder.transform(y_test)
            # convert integers to dummy variables (i.e. one hot encoded)
            y_train_60 = np_utils.to_categorical(encoded_Y_60)
            y_validate = np_utils.to_categorical(encoded_y_validate)
            y_test = np_utils.to_categorical(encoded_y_test)

            number_of_features = len(x_train[0])
            for l2num in layer2_num:
                for l3num in layer3_num:
                    for l3act in layer3_activation:
                        print("")
                        print("Model description: ")
                        print("Input layer: " + str(number_of_features) + " neurons")
                        print("First hidden layer: " + str(l2num) + " neurons")
                        if not l3num == 0:
                            print("Second hidden layer: " + str(l3num)
                                  + " neurons, " + l3act + " activations")
                        else:
                            print("No second hidden layer")
                        print("------------------------")

                        model = build_mlp(number_of_features, l2num, l3num,
                                          l3act, classes_num)
                        x_train_60 = np.array(x_train_60)
                        y_train_60 = np.array(y_train_60)
                        x_validate = np.array(x_validate)
                        y_validate = np.array(y_validate)

                        # BUG FIX: without restore_best_weights the model kept
                        # the weights of the LAST epoch (10 epochs past the
                        # best), so "best model" selection compared stale
                        # models. This also resolves the former ToDo: the
                        # unused ModelCheckpoint instance is no longer needed.
                        es = EarlyStopping(monitor='val_loss', mode='min',
                                           verbose=1, patience=10,
                                           restore_best_weights=True)
                        model.fit(x_train_60, y_train_60,
                                  validation_data=(x_validate, y_validate),
                                  verbose=1, epochs=100, callbacks=[es])

                        _, validation_accuracy = model.evaluate(
                            x_validate, y_validate)
                        _, train_accuracy = model.evaluate(
                            x_train_60, y_train_60)
                        print('Train: %.3f, Test: %.3f'
                              % (train_accuracy, validation_accuracy))
                        print("------------------------")

                        # Keep the model (and its matching test split) with the
                        # best validation accuracy seen so far.
                        if validation_accuracy > best_acc:
                            curr_y_test = y_test
                            curr_x_test = x_test
                            best_model = model
                            best_acc = validation_accuracy
                            curr_order = order
                            curr_reduction = reduction
                            curr_num_of_features = number_of_features
                            curr_l2num = l2num
                            curr_l3num = l3num
                            curr_l3act = l3act

    print("###################")
    print("Final evaluation: ")
    print("Best validation acc: " + str(best_acc))
    print("Class of models: " + curr_order + " " + curr_reduction)
    print("Input layer: " + str(curr_num_of_features) + " neurons")
    print("First hidden layer: " + str(curr_l2num) + " neurons")
    if not curr_l3num == 0:
        print("Second hidden layer: " + str(curr_l3num) + " neurons, "
              + curr_l3act + " activations")
    else:
        print("No second hidden layer")
    print("------------------------")

    x_test = curr_x_test
    y_test = curr_y_test
    print("Testing best model: ")
    _, test_accuracy = best_model.evaluate(np.array(x_test), np.array(y_test))
    print('Accuracy on test set: %.3f' % test_accuracy)
    print("------------------------")

    # Convert one-hot predictions and labels back to class indices for the
    # confusion matrix.
    y_pred = best_model.predict(np.array(x_test))
    y_pred = np.array([np.argmax(row) for row in y_pred])
    y_test_old = np.array([np.argmax(row) for row in y_test])

    plotting.calculate_normalized_confusion_matrix(
        y_test_old, y_pred, class_num=classes_num,
        title="Best hyperparameters combination")
    plotting.show_confusion_matrix()
def keras_1_layer_perceptron(data_set_json, classes_num):
    """Cross-validate a one-layer perceptron on bilingual lexicon features.

    Each comment is mapped to a 3-feature vector: summed English lexicon
    weight, summed German lexicon weight, and a constant bias input. Features
    are standardized, labels one-hot encoded, and a fresh Keras perceptron is
    trained and evaluated in each stratified fold. Per-fold confusion matrices
    are plotted at the end.

    Parameters
    ----------
    data_set_json : iterable of dict
        Items with keys 'class_att', 'tokens_original', 'tokens_stemmed'.
    classes_num : int
        2 or 3; selects the binary or three-class perceptron builder.

    Returns
    -------
    numpy.ndarray
        Per-fold accuracies in percent.
    """
    _, engDict = build_english()  # swap the dict if needed
    engDictStemmed = stemmer.stem_dictionary(engDict)
    _, gerDict = build_german()   # swap the dict if needed
    gerDictStemmed = stemmer.stem_dictionary(gerDict)

    splits = 5
    seed = 7
    np.random.seed(seed)
    kf = StratifiedKFold(n_splits=splits, shuffle=True, random_state=seed)

    # Build the feature matrix: [english score, german score, bias].
    x = []
    y = []
    for data in data_set_json:
        sentiment_class = data['class_att']
        tokens_original = data['tokens_original']
        tokens_stemmed = data['tokens_stemmed']
        summ_eng = comment_weight_calculation(
            engDictStemmed, "English", tokens_original, tokens_stemmed, 5,
            modification_use=False, amplification_use=False)
        summ_ger = comment_weight_calculation(
            gerDictStemmed, "German", tokens_original, tokens_stemmed, 5,
            modification_use=False, amplification_use=False)
        x.append([summ_eng, summ_ger, 1])
        y.append(class_encode(sentiment_class))
    x = np.array(x)

    # Feature Scaling.
    # BUG FIX: the original called fit_transform() and then transform() again,
    # re-applying the raw-data mean/scale to already-standardized data and
    # corrupting the features; a single fit_transform is correct.
    from sklearn.preprocessing import StandardScaler
    sc = StandardScaler()
    x = sc.fit_transform(x)

    y = np.array(y)
    old_y = y  # keep integer labels for stratified splitting

    # One-hot encoding
    encoder = LabelEncoder()
    encoder.fit(y)
    encoded_Y = encoder.transform(y)
    # convert integers to dummy variables (i.e. one hot encoded)
    y = np_utils.to_categorical(encoded_Y)

    # Version with our cross-validation:
    cvscores = []
    cms = []
    cmdata = []
    for train, test in kf.split(x, old_y):
        # BUG FIX: a fresh model per fold. The original built one model before
        # the loop and kept fitting it, so every later fold was evaluated by a
        # model already trained on (part of) its test data — leakage that
        # inflates the CV scores.
        if classes_num == 2:
            model = build_1L_2C_perceptron()
        else:
            model = build_1L_3C_perceptron()

        # Fit the model
        model.fit(x[train], y[train], epochs=100, batch_size=10, verbose=0)
        # evaluate the model
        scores = model.evaluate(x[test], y[test], verbose=0)

        # Decode one-hot predictions/labels back to class indices.
        y_pred = np.array([np.argmax(row) for row in model.predict(x[test])])
        y_test = np.array([np.argmax(row) for row in y[test]])

        cm = confusion_matrix(y_test, y_pred)
        cmdata.append([y_test, y_pred])
        cms.append(cm)
        print("%s: %.2f%%" % (model.metrics_names[1], scores[1] * 100))
        cvscores.append(scores[1] * 100)

    print("%.2f%% (+/- %.2f%%)" % (np.mean(cvscores), np.std(cvscores)))

    # Plot one normalized confusion matrix per fold.
    cnt = 1
    for cmpair in cmdata:
        plotting.calculate_normalized_confusion_matrix(
            cmpair[0], cmpair[1], classes_num,
            title="Fold " + str(cnt) + ", accuracy: " + str(cvscores[cnt - 1]))
        cnt += 1
    plotting.show_confusion_matrix()

    return np.array(cvscores)