Example no. 1
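# Imports assumed by this listing (Keras 2.x API). The modules `plotting`,
# `tokenizer`, and `stemmer`, and the helpers `build_english`, `build_german`,
# `comment_weight_calculation`, `build_mlp`, `build_1L_2C_perceptron`,
# `build_1L_3C_perceptron`, and `class_encode` are project-local; hedged
# sketches of some of them are given further below.
import json

import numpy as np
import pandas as pd
from keras.callbacks import EarlyStopping
from keras.utils import np_utils
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import (GridSearchCV, StratifiedKFold,
                                     train_test_split)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
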
def training(parameters, alg, class_num=2):
    if class_num == 2:
        train_data = pd.read_csv('../movie_dataset/SerbMR-2C.csv')
    else:
        train_data = pd.read_csv('../movie_dataset/SerbMR-3C.csv')
        # train_data = pd.read_csv('E:/Faks/M/OPJ/Projekat/bbc-text.csv')

    train_data_X = train_data['Text']
    train_data_y = train_data['class-att']

    X_train, X_test, y_train, y_test = train_test_split(train_data_X,
                                                        train_data_y,
                                                        test_size=0.2,
                                                        random_state=7,
                                                        stratify=train_data_y)

    text_clf = Pipeline([
        # ('vect', CountVectorizer(tokenizer=tokenizer.text_to_tokens, min_df=3, ngram_range=(1, 2))),
        # ('tfidf', TfidfTransformer()),
        ('tfidf',
         TfidfVectorizer(min_df=3,
                         ngram_range=(1, 2),
                         tokenizer=tokenizer.text_to_tokens)),
        ('alg', alg),
    ])

    gs_clf = GridSearchCV(text_clf, parameters, cv=5, n_jobs=-1)

    # fit returns the optimized classifier, which we then use to predict
    gs_clf = gs_clf.fit(X_train, y_train)
    print(gs_clf.best_score_)
    for param_name in sorted(parameters.keys()):
        print("%s: %r" % (param_name, gs_clf.best_params_[param_name]))
    print(gs_clf.cv_results_)

    y_pred = gs_clf.predict(X_test)

    plotting.calculate_normalized_confusion_matrix(y_test, y_pred,
                                                   2 if class_num == 2 else 3)
    plotting.show_confusion_matrix()

    return accuracy_score(y_test, y_pred)
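
# Usage sketch (an assumption, not from the original source): grid-search a
# multinomial Naive Bayes over its smoothing parameter. 'tfidf' and 'alg' are
# the Pipeline step names defined in training() above.
#
#     from sklearn.naive_bayes import MultinomialNB
#     nb_params = {'alg__alpha': (0.1, 0.5, 1.0),
#                  'tfidf__use_idf': (True, False)}
#     print(training(nb_params, MultinomialNB(), class_num=2))
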
                        y_true.append(-1)

            # Three classes
            boundary_both = 0.595

            if classes_num == 3:
                for y in summ:
                    if y >= boundary_both:
                        y_both.append(1)
                    elif y > (-1) * boundary_both:
                        y_both.append(0)
                    else:
                        y_both.append(-1)
                for y in list_out:
                    if y == 'POSITIVE':
                        y_true.append(1)
                    elif y == 'NEUTRAL':
                        y_true.append(0)
                    else:
                        y_true.append(-1)

            cm3 = plotting.calculate_normalized_confusion_matrix(
                y_true,
                y_both,
                classes_num,
                title=preprocessed_name + ", negation: " + str(negation) +
                ", Levenshtein's distance: " + str(leven_num))
            plotting.show_confusion_matrix()
            print(accuracy_score(y_true, y_both))
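
# The two-class branch of the fragment above is truncated in this listing
# (only its final `y_true.append(-1)` survives). By symmetry with the
# three-class branch, it plausibly looked like the following sketch; the
# two-class boundary value `boundary_2c` is an assumption, not from the
# source:
#
#     if classes_num == 2:
#         for y in summ:
#             y_both.append(1 if y >= boundary_2c else -1)
#         for y in list_out:
#             y_true.append(1 if y == 'POSITIVE' else -1)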
preprocessed_name = "Preprocessed dictionary"

def keras_mlp_loop_all(classes_num):
    layer2_num = [10, 20, 50]
    layer3_num = [0, 10]
    layer3_activation = ["relu", "sigmoid"]

    best_acc = -3  # sentinel below any possible accuracy
    curr_y_test = []
    curr_x_test = []
    curr_order = ""
    curr_reduction = ""
    curr_num_of_features = 0
    curr_l2num = 0
    curr_l3num = 0
    curr_l3act = ""
    best_model = None
    orders = ["reduce_first", "reduce_last"]
    reductions = ["PCA", "TruncatedSVD"]
    for reduction in reductions:
        for order in orders:
            print("###########")
            print("Class of models: " + order + " " + reduction)
            print("###########")
            if classes_num == 3 and not (order == "reduce_last"
                                         and reduction == "TruncatedSVD"):
                print("Not enough RAM memory to support " + order + " " +
                      reduction)
                continue

            with open("../movie_dataset/mlp_matrix_" + str(classes_num) + "_" +
                      order + "_" + reduction + ".json",
                      "r",
                      encoding='utf-8') as f:
                results = json.load(f)
                x_train = results["x_train_fit"]
                # x_test = results["x_test_fit"]
                y_train = results["y_train"]
                # y_test = results["y_test"]

                # Hold out 15% as the final test set, then 20% of the
                # remainder as a validation set for model selection.
                x_train, x_test, y_train, y_test = train_test_split(
                    x_train,
                    y_train,
                    test_size=0.15,
                    stratify=y_train,
                    random_state=7)
                x_train_60, x_validate, y_train_60, y_validate = train_test_split(
                    x_train,
                    y_train,
                    test_size=0.20,
                    stratify=y_train,
                    random_state=7)

                # One-hot encoding
                encoder = LabelEncoder()
                encoder.fit(y_train)
                encoded_Y_60 = encoder.transform(y_train_60)
                encoded_y_validate = encoder.transform(y_validate)
                encoded_y_test = encoder.transform(y_test)
                # convert integers to dummy variables (i.e. one hot encoded)
                y_train_60 = np_utils.to_categorical(encoded_Y_60)
                y_validate = np_utils.to_categorical(encoded_y_validate)
                y_test = np_utils.to_categorical(encoded_y_test)

                number_of_features = len(x_train[0])

                for l2num in layer2_num:
                    for l3num in layer3_num:
                        for l3act in layer3_activation:
                            print("")
                            print("Model description: ")
                            print("Input layer: " + str(number_of_features) +
                                  " neurons")
                            print("First hidden layer: " + str(l2num) +
                                  " neurons")
                            if l3num != 0:
                                print("Second hidden layer: " + str(l3num) +
                                      " neurons, " + l3act + " activations")
                            else:
                                print("No second hidden layer")
                            print("------------------------")
                            model = build_mlp(number_of_features, l2num, l3num,
                                              l3act, classes_num)

                            x_train_60 = np.array(x_train_60)
                            y_train_60 = np.array(y_train_60)
                            x_validate = np.array(x_validate)
                            y_validate = np.array(y_validate)
                            es = EarlyStopping(monitor='val_loss',
                                               mode='min',
                                               verbose=1,
                                               patience=10)
                            # ToDo Checkpointing mc = ModelCheckpoint('best_model.h5', monitor='val_loss', mode='min', verbose=1)
                            history = model.fit(x_train_60,
                                                y_train_60,
                                                validation_data=(x_validate,
                                                                 y_validate),
                                                verbose=1,
                                                epochs=100,
                                                callbacks=[es])
                            _, validation_accuracy = model.evaluate(
                                x_validate, y_validate)
                            _, train_accuracy = model.evaluate(
                                x_train_60, y_train_60)
                            print('Train: %.3f, Validation: %.3f' %
                                  (train_accuracy, validation_accuracy))
                            print("------------------------")
                            if validation_accuracy > best_acc:
                                curr_y_test = y_test
                                curr_x_test = x_test
                                best_model = model
                                best_acc = validation_accuracy
                                curr_order = order
                                curr_reduction = reduction
                                curr_num_of_features = number_of_features
                                curr_l2num = l2num
                                curr_l3num = l3num
                                curr_l3act = l3act

    print("###################")
    print("Final evaluation: ")
    print("Best validation acc: " + str(best_acc))
    print("Class of models: " + curr_order + " " + curr_reduction)
    print("Input layer: " + str(curr_num_of_featchures) + " neurons")
    print("First hidden layer: " + str(curr_l2num) + " neurons")
    if not curr_13num == 0:
        print("Second hidden layer: " + str(curr_13num) + " neurons, " +
              curr_l3act + " activations")
    else:
        print("No second hidden layer")
    print("------------------------")
    x_test = curr_x_test
    y_test = curr_y_test

    print("Testing best model: ")
    _, test_accuracy = best_model.evaluate(np.array(x_test), np.array(y_test))
    print('Accuracy on test set: %.3f' % test_accuracy)
    print("------------------------")

    # Convert softmax outputs and one-hot labels back to class indices
    y_pred = np.argmax(best_model.predict(np.array(x_test)), axis=1)
    y_test_old = np.argmax(np.array(y_test), axis=1)

    plotting.calculate_normalized_confusion_matrix(
        y_test_old,
        y_pred,
        class_num=classes_num,
        title="Best hyperparameters combination")
    plotting.show_confusion_matrix()
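
# `build_mlp` is not part of this listing. The sketch below is inferred from
# its call sites above (input size, first hidden layer size, optional second
# hidden layer with a configurable activation, softmax over `classes_num`
# one-hot classes); the exact architecture and optimizer are assumptions, not
# the original implementation.
from keras.models import Sequential
from keras.layers import Dense


def build_mlp(input_dim, l2num, l3num, l3act, classes_num):
    model = Sequential()
    # First hidden layer (the relu activation is an assumption)
    model.add(Dense(l2num, input_dim=input_dim, activation='relu'))
    if l3num != 0:
        # Optional second hidden layer with the requested activation
        model.add(Dense(l3num, activation=l3act))
    # Softmax output over the one-hot encoded classes
    model.add(Dense(classes_num, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam',
                  metrics=['accuracy'])
    return model
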
def keras_1_layer_perceptron(data_set_json, classes_num):
    _, engDict = build_english()  # swap the dict if needed
    engDictStemmed = stemmer.stem_dictionary(engDict)
    _, gerDict = build_german()  # swap the dict if needed
    gerDictStemmed = stemmer.stem_dictionary(gerDict)

    # `estimator` is used only by the commented-out cross_val_score variant
    # near the end of this function.
    if classes_num == 2:
        estimator = KerasClassifier(build_fn=build_1L_2C_perceptron,
                                    epochs=200,
                                    batch_size=5)
    else:
        estimator = KerasClassifier(build_fn=build_1L_3C_perceptron,
                                    epochs=200,
                                    batch_size=5)

    splits = 5
    seed = 7
    np.random.seed(seed)
    kf = StratifiedKFold(n_splits=splits, shuffle=True, random_state=seed)

    x = []
    y = []
    for data in data_set_json:
        sentiment_class = data['class_att']
        tokens_original = data['tokens_original']
        tokens_stemmed = data['tokens_stemmed']
        summ_eng = comment_weight_calculation(engDictStemmed,
                                              "English",
                                              tokens_original,
                                              tokens_stemmed,
                                              5,
                                              modification_use=False,
                                              amplification_use=False)
        summ_ger = comment_weight_calculation(gerDictStemmed,
                                              "German",
                                              tokens_original,
                                              tokens_stemmed,
                                              5,
                                              modification_use=False,
                                              amplification_use=False)

        one_x = [summ_eng, summ_ger, 1]  # two lexicon scores plus a constant feature
        x.append(one_x)
        y.append(class_encode(sentiment_class))

    x = np.array(x)
    # Feature scaling
    from sklearn.preprocessing import StandardScaler
    sc = StandardScaler()
    x = sc.fit_transform(x)  # fit_transform already returns the scaled data

    y = np.array(y)
    old_y = y
    # One-hot encoding
    encoder = LabelEncoder()
    encoder.fit(y)
    encoded_Y = encoder.transform(y)
    # convert integers to dummy variables (i.e. one hot encoded)
    y = np_utils.to_categorical(encoded_Y)

    # Version with our cross-validation:
    cvscores = []
    cms = []
    cmdata = []
    for train, test in kf.split(x, old_y):
        # Build a fresh model for each fold so that weights do not carry
        # over between folds.
        if classes_num == 2:
            model = build_1L_2C_perceptron()
        else:
            model = build_1L_3C_perceptron()

        # Fit the model
        model.fit(x[train], y[train], epochs=100, batch_size=10, verbose=0)
        # evaluate the model
        scores = model.evaluate(x[test], y[test], verbose=0)

        # Convert softmax outputs and one-hot labels back to class indices
        y_pred = np.argmax(model.predict(x[test]), axis=1)
        y_test = np.argmax(y[test], axis=1)

        cm = confusion_matrix(y_test, y_pred)

        cmdata.append([y_test, y_pred])
        cms.append(cm)

        print("%s: %.2f%%" % (model.metrics_names[1], scores[1] * 100))
        cvscores.append(scores[1] * 100)

    print("%.2f%% (+/- %.2f%%)" % (np.mean(cvscores), np.std(cvscores)))

    # results = cross_val_score(estimator, x, y, cv=kf)
    for cnt, cmpair in enumerate(cmdata, start=1):
        plotting.calculate_normalized_confusion_matrix(
            cmpair[0],
            cmpair[1],
            classes_num,
            title="Fold " + str(cnt) + ", accuracy: " + str(cvscores[cnt - 1]))
        plotting.show_confusion_matrix()

    return np.array(cvscores)
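
# `build_1L_2C_perceptron`, `build_1L_3C_perceptron`, and `class_encode` are
# also project-local. Minimal sketches inferred from the call sites above
# (three input features, one-hot targets, softmax output, accuracy metric);
# the details are assumptions, not the original implementations.
def build_1L_2C_perceptron():
    # Single softmax layer over the two classes
    model = Sequential()
    model.add(Dense(2, input_dim=3, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam',
                  metrics=['accuracy'])
    return model


def build_1L_3C_perceptron():
    # Single softmax layer over the three classes
    model = Sequential()
    model.add(Dense(3, input_dim=3, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam',
                  metrics=['accuracy'])
    return model


def class_encode(sentiment_class):
    # Assumed mapping, mirroring the POSITIVE/NEUTRAL/NEGATIVE labels used
    # elsewhere in this listing.
    return {'POSITIVE': 1, 'NEUTRAL': 0, 'NEGATIVE': -1}[sentiment_class]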