def main():
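    # Build CV, TFIDF, and word2vec feature matrices for the training text,
    # run outlier detection on each, and graph the outlier percentages.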

    start = time()

    dfTrain = readdata.read_clean_data(readdata.TRAINFILEPATH, nolabel=False)
    Y_train = dfTrain["label"].to_numpy()

    training_text = list(dfTrain["text"].values)

    X_train_TFIDF = TFIDF(training_text)
    X_train_CV = CV(training_text)
    X_train_WV = word2vec(training_text)

    X_train_TFIDF = np.array(X_train_TFIDF)
    X_train_CV = np.array(X_train_CV)
    X_train_WV = np.array(X_train_WV)

    print("\nFor W2V\n")
    print(X_train_WV.shape, "is the X_train shape before outlier removal")

    print("\nFor TFIDF\n")
    print(X_train_TFIDF.shape, "is the X_train shape before outlier removal")

    print("\nFor CV\n")
    print(X_train_CV.shape, "is the X_train shape before outlier removal")


    # outlierDetection presumably returns the detection result and the
    # percentage of outliers; only the percentages are graphed here
    result_cv, perCVtrain = outlierDetection(X_train_CV, "CV train")
    result_tfidf, perTFIDFtrain = outlierDetection(X_train_TFIDF, "TFIDF train")
    result_wv, perWVtrain = outlierDetection(X_train_WV, "WV train")

    trainOutliers = [perCVtrain, perTFIDFtrain, perWVtrain]
    graphOutliers(trainOutliers)
  

    end = time()
    taken = (end - start) / 60.0
    print("Time taken: %f minutes" % taken)
Example #2
def main():
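    # Train a linear SVM on the chosen feature set (sys.argv[1]: cv, tfidf,
    # or word2vec); sys.argv[2] == 0 runs 3-fold CV, otherwise a grid search.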
    dfTrain = readdata.read_clean_data(readdata.TRAINFILEPATH)

    X = dfTrain['text'].to_numpy()
    y = dfTrain['label'].to_numpy()

    if sys.argv[1] == "cv":
        X = CV(X)  # train shape: (17973, 10000)
        X, y = getRemovedVals(X=X, Y=y, Ftype="CV_Train")

    elif sys.argv[1] == 'tfidf':
        X = TFIDF(X)  # shape: (17973, 10000)
        X, y = getRemovedVals(X=X, Y=y, Ftype="TFIDF_Train")

    elif sys.argv[1] == 'word2vec':
        X = word2vec(X)
        X, y = getRemovedVals(X=X, Y=y, Ftype="W2V_Train")

    else:
        print("Error")
        return

    if int(sys.argv[2]) == 0:  # actual run
        # after k-fold and run support vector machine
        kf = KFold(n_splits=3, shuffle=True, random_state=1)  # shuffle is required when random_state is set
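        # C=0.25 with a linear kernel is presumably the best combination
        # found by the grid search below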
        svm = SVC(C=0.25, kernel='linear')
        acc_list = []
        for train_index, test_index in kf.split(X):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            svm.fit(X_train, y_train)
            print("----Start Evaluating----")
            acc = svm.score(X_test, y_test)
            acc_list.append(acc)
            print("Testing Accuracy:", acc)
        print("Mean testing accuracy:", sum(acc_list) / len(acc_list))

        # note: predictions come from the model fitted on the last CV fold
        y_pred = svm.predict(X_test)

        # Store y_pred vector
        save_y(sys.argv[1], "svm_y_pred", y_pred)

    else:  # grid search
        print("Performing Grid Search on SVM...")
        svm = SVC()
        parameters = {'kernel': ('linear', 'rbf'), 'C': (0.25, 0.5, 0.75)}
        grid = GridSearchCV(estimator=svm,
                            param_grid=parameters,
                            n_jobs=-1,
                            cv=3,
                            verbose=1)
        grid_result = grid.fit(X, y)
        means = grid_result.cv_results_['mean_test_score']
        stds = grid_result.cv_results_['std_test_score']
        params = grid_result.cv_results_['params']
        for mean, stdev, param in zip(means, stds, params):
            print("%f (%f) with: %r" % (mean, stdev, param))
Example #3
def main():
    # PCA feature reduction on cv, tfidf, word2vec
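    # sys.argv[1] selects the feature model; sys.argv[2] == 1 standardizes
    # the features before PCA.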
    dfTrain = readdata.read_clean_data(readdata.TRAINFILEPATH, nolabel=False)
    X = dfTrain['text'].to_numpy()
    if sys.argv[1] == 'cv':
        model_name = 'Count Vectorizer'
        X = CV(X)
    elif sys.argv[1] == 'tfidf':
        model_name = 'TFIDF'
        X = TFIDF(X)
    elif sys.argv[1] == 'word2vec':
        model_name = 'word2vec'
        X = word2vec(X)
    else:
        print("Error")
        return

    pca = PCA(n_components=2)
    if int(sys.argv[2]) == 1:
        X = StandardScaler().fit_transform(X)
    comp = pca.fit_transform(X)
    xdf = pd.DataFrame(data=comp, columns=['a', 'b'])
    finaldf = pd.concat([xdf, dfTrain[['label']]], axis=1)
    plot(finaldf, model_name)
Example #4
def main():
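    # Train an LSTM-style network on the chosen feature set with 3-fold CV
    # (sys.argv[2] == 0) or grid-search its hyperparameters (otherwise).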
    dfTrain = readdata.read_clean_data(readdata.TRAINFILEPATH)
    X = dfTrain['text'].to_numpy()
    y = dfTrain['label'].to_numpy()

    if sys.argv[1] == "cv":
        X = CV(X)
        X, y = getRemovedVals(X=X, Y=y, Ftype="CV_Train")
        look_back = 1

    elif sys.argv[1] == 'tfidf':
        X = TFIDF(X)
        X, y = getRemovedVals(X=X, Y=y, Ftype="TFIDF_Train")
        look_back = 1

    elif sys.argv[1] == 'word2vec':
        # For true recurrence, call word2vec with lstm=True and
        # un-comment the look_back line below.
        X = word2vec(X, lstm=False)
        X, y = getRemovedVals(X=X, Y=y, Ftype="W2V_Train")
        look_back = 1
        # look_back = X.shape[1]  # un-comment for true recurrence

    else:
        print("Error")
        return

    num_samples = X.shape[0]

    if look_back == 1:
        # reshape input to be [samples, time steps, features]
        num_features = X.shape[1]
        X = np.reshape(np.array(X), (num_samples, look_back, num_features))
    else:
        num_features = X.shape[2]

    batch_size = 256

    if int(sys.argv[2]) == 0:  # actual run
        epochs = 25  # can change this
        kf = KFold(n_splits=3, shuffle=True, random_state=1)  # shuffle is required when random_state is set
        acc_list = []
        X_train = None  # init
        X_test = None  # init
        y_test = None  # init

        # Doing cross validation testing
        for train_index, test_index in kf.split(X):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            model = create_model(look_back=look_back, input_nodes=num_features)
            history = model.fit(X_train,
                                y_train,
                                validation_data=(X_test, y_test),
                                epochs=epochs,
                                batch_size=batch_size)
            print("----Start Evaluating----")
            _, acc = model.evaluate(X_test, y_test, verbose=1)
            acc_list.append(acc)
            print("Testing Accuracy:", acc)
        print("Mean testing accuracy:", sum(acc_list) / len(acc_list))

        # learning curves from the model fitted on the last CV fold
        loss = history.history['loss']
        val_loss = history.history['val_loss']
        accuracy = history.history['accuracy']
        val_accuracy = history.history['val_accuracy']
        graphs_nn(loss, val_loss, accuracy, val_accuracy)

        y_pred = model.predict(X_test)

        # Store y_pred vector
        save_y(sys.argv[1], "lstm_y_pred", y_pred)

        # Store y_true vector (only one script needs this)
        y_true_file = Path("./model_Ys/true/y_true_" + sys.argv[1] + ".npy")
        if not y_true_file.is_file():
            # assuming save_y appends the .npy extension, as in the calls above
            save_y("true", "y_true_" + sys.argv[1], y_test)

    else:  # doing grid search
        epochs = 20
        model = KerasClassifier(build_fn=create_model,
                                look_back=look_back,
                                input_nodes=num_features,
                                epochs=epochs,
                                batch_size=batch_size,
                                verbose=1)
        param_grid = get_param_grid()

        grid = GridSearchCV(estimator=model, param_grid=param_grid, cv=3)
        grid_result = grid.fit(X, y)
        evaluate(grid_result)
Example #5
def main():
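    # Train a feed-forward ANN on the chosen feature set with 3-fold CV
    # (sys.argv[2] == 0) or grid-search its hyperparameters (otherwise).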

    dfTrain = readdata.read_clean_data(readdata.TRAINFILEPATH)

    X = dfTrain['text'].to_numpy()
    y = dfTrain['label'].to_numpy()

    if sys.argv[1] == "cv":
        X = CV(X)  # train shape: (17973, 10000)
        X, y = getRemovedVals(X=X, Y=y, Ftype="CV_Train")

    elif sys.argv[1] == 'tfidf':
        X = TFIDF(X)  # train shape: (17973, 10000)
        X, y = getRemovedVals(X=X, Y=y, Ftype="TFIDF_Train")

    elif sys.argv[1] == 'word2vec':
        X = word2vec(X)
        X, y = getRemovedVals(X=X, Y=y, Ftype="W2V_Train")

    else:
        print("Error")
        return

    num_samples = X.shape[0]
    num_features = X.shape[1]

    if int(sys.argv[2]) == 0:
        kf = KFold(n_splits=3, shuffle=True, random_state=1)  # shuffle is required when random_state is set
        acc_list = []
        X_train = None  # init
        X_test = None  # init
        for train_index, test_index in kf.split(X):
            # Doing cross validation testing
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            # build the ANN; populate with the best hyperparameters from grid search
            model = ANN(input_dim=num_features)
            history = model.fit(X_train,
                                y_train,
                                validation_data=(X_test, y_test),
                                epochs=EPOCHS,
                                batch_size=BATCH_SIZE)
            print("----Start Evaluating----")
            _, acc = model.evaluate(X_test, y_test, verbose=1)
            acc_list.append(acc)
            print("Testing Accuracy:", acc)

        print("Mean testing accuracy:", sum(acc_list) / len(acc_list))

        loss = history.history['loss']
        val_loss = history.history['val_loss']
        accuracy = history.history['acc']
        val_accuracy = history.history['val_acc']
        graphs_nn(loss, val_loss, accuracy, val_accuracy)

        y_pred = model.predict(X_test)

        # Store y_pred vector
        save_y(sys.argv[1], "ann_y_pred", y_pred)

    else:

        model = KerasClassifier(build_fn=ANN,
                                input_dim=num_features,
                                epochs=EPOCHS,
                                batch_size=BATCH_SIZE,
                                verbose=1,
                                activation="relu",
                                optimizer="Adam")

        # grid search on ann hyperparameters
        param_grid = get_param_grid()
        grid = GridSearchCV(estimator=model, param_grid=param_grid, cv=3)

        grid_result = grid.fit(X, y)
        print("Best: %f using %s" %
              (grid_result.best_score_, grid_result.best_params_))
        means = grid_result.cv_results_['mean_test_score']
        stds = grid_result.cv_results_['std_test_score']
        params = grid_result.cv_results_['params']
        for mean, stdev, param in zip(means, stds, params):
            print("%f (%f) with: %r" % (mean, stdev, param))
Example #6
def main():
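    # Train a random forest on the chosen feature set with 3-fold CV
    # (sys.argv[2] == 0) or grid-search its hyperparameters (sys.argv[2] == 1).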
    dfTrain = readdata.read_clean_data(readdata.TRAINFILEPATH)

    X = dfTrain['text'].to_numpy()
    y = dfTrain['label'].to_numpy()

    if sys.argv[1] == "cv":
        X = CV(X)  # train shape: (17973, 10000)
        X, y = getRemovedVals(X=X, Y=y, Ftype="CV_Train")

    elif sys.argv[1] == 'tfidf':
        X = TFIDF(X)  # train shape: (17973, 10000)
        X, y = getRemovedVals(X=X, Y=y, Ftype="TFIDF_Train")

    elif sys.argv[1] == 'word2vec':
        X = word2vec(X)
        X, y = getRemovedVals(X=X, Y=y, Ftype="W2V_Train")

    else:
        print("Error")
        return

    if int(sys.argv[2]) == 0:  # actual run
        # These are the best hyperparameters from the grid-search results
        max_depth = 9
        min_samples_leaf = 4
        min_samples_split = 5
        n_estimators = 400
        acc_list = []
        X_train = None  # init
        X_test = None  # init

        kf = KFold(n_splits=3, shuffle=True, random_state=1)  # shuffle is required when random_state is set

        for train_index, test_index in kf.split(X):
            # Doing cross validation testing
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            model = RandomForestClassifier(max_depth=max_depth,
                                           min_samples_leaf=min_samples_leaf,
                                           min_samples_split=min_samples_split,
                                           n_estimators=n_estimators,
                                           n_jobs=-1)
            model.fit(X_train, y_train)
            print("----Start Evaluating----")
            acc = model.score(X_test, y_test)
            acc_list.append(acc)
            print("Testing Accuracy:", acc)
        print("Mean testing accuracy:", sum(acc_list) / len(acc_list))

        y_pred = model.predict(X_test)
        # Store y_pred vector
        save_y(sys.argv[1], "random_forest_y_pred", y_pred)

    elif int(sys.argv[2]) == 1:  # grid search
        # below are the hyperparameters to be grid-searched on
        n_estimators = [200, 400, 800]
        max_depth = [1, 5, 9]
        min_samples_leaf = [2, 4]
        min_samples_split = [5, 10]
        param_grid = dict(n_estimators=n_estimators,
                          max_depth=max_depth,
                          min_samples_leaf=min_samples_leaf,
                          min_samples_split=min_samples_split)

        model = RandomForestClassifier()
        grid = GridSearchCV(estimator=model,
                            param_grid=param_grid,
                            n_jobs=-1,
                            cv=3)
        grid_result = grid.fit(X, y)
        means = grid_result.cv_results_['mean_test_score']
        stds = grid_result.cv_results_['std_test_score']
        params = grid_result.cv_results_['params']
        for mean, stdev, param in zip(means, stds, params):
            print("%f (%f) with: %r" % (mean, stdev, param))
    else:
        print("Error")
        return
Example #7
def main():
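    # Train a logistic-regression classifier on the chosen feature set with
    # 3-fold CV (sys.argv[2] == 0) or grid-search C and the solver (otherwise).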
    dfTrain = readdata.read_clean_data(readdata.TRAINFILEPATH)

    X = dfTrain['text'].to_numpy()
    y = dfTrain['label'].to_numpy()

    if sys.argv[1] == "cv":
        X = CV(X)  # train shape: (17973, 10000)
        X, y = getRemovedVals(X=X, Y=y, Ftype="CV_Train")

    elif sys.argv[1] == 'tfidf':
        X = TFIDF(X)  # train shape: (17973, 10000)
        X, y = getRemovedVals(X=X, Y=y, Ftype="TFIDF_Train")

    elif sys.argv[1] == 'word2vec':
        X = word2vec(X)
        X, y = getRemovedVals(X=X, Y=y, Ftype="W2V_Train")

    else:
        print("Error")
        return

    # reshape input to be [samples, features]
    num_samples = X.shape[0]
    num_features = X.shape[1]
    X = np.reshape(np.array(X), (num_samples, num_features))
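    # (assuming CV/TFIDF/word2vec return 2-D matrices, this mainly ensures
    # X is a NumPy array of the expected [samples, features] shape)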

    if int(sys.argv[2]) == 0:  # actual run
        # Placeholders: set these to the best hyperparameters from the grid
        # search (sklearn defaults are used here so the script runs as-is)
        C = 1.0
        solver = 'lbfgs'
        kf = KFold(n_splits=3, shuffle=True, random_state=1)  # shuffle is required when random_state is set
        logistic = LogisticRegression(max_iter=500, C=C, solver=solver)
        acc_list = []
        # Doing cross validation testing
        for train_index, test_index in kf.split(X):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            logistic.fit(X_train, y_train)
            print("----Start Evaluating----")
            acc = logistic.score(X_test, y_test)
            acc_list.append(acc)
            print("Testing Accuracy:", acc)
        print("Mean testing accuracy:", sum(acc_list) / len(acc_list))

        y_pred = logistic.predict(X_test)

        # Store y_pred vector
        save_y(sys.argv[1], "logreg_y_pred", y_pred)

    else:  # grid search
        # log-spaced grid for the regularization strength C
        c = np.logspace(0, 4, 10)
        # solvers that support L2 regularization
        solver = ['newton-cg', 'sag', 'lbfgs', 'liblinear']
        param_grid = dict(C=c, solver=solver)
        logistic = LogisticRegression(max_iter=500)
        grid = GridSearchCV(logistic, param_grid=param_grid, cv=3, verbose=1)
        grid_result = grid.fit(X, y)
        means = grid_result.cv_results_['mean_test_score']
        stds = grid_result.cv_results_['std_test_score']
        params = grid_result.cv_results_['params']
        for mean, stdev, param in zip(means, stds, params):
            print("%f (%f) with: %r" % (mean, stdev, param))