def main():
    dfTrain = readdata.read_clean_data(readdata.TRAINFILEPATH)
    X = dfTrain['text'].to_numpy()
    y = dfTrain['label'].to_numpy()

    if sys.argv[1] == "cv":
        X = CV(X)  # train shape: (17973, 10000)
        X, y = getRemovedVals(X=X, Y=y, Ftype="CV_Train")
    elif sys.argv[1] == 'tfidf':
        X = TFIDF(X)  # train shape: (17973, 10000)
        X, y = getRemovedVals(X=X, Y=y, Ftype="TFIDF_Train")
    elif sys.argv[1] == 'word2vec':
        X = word2vec(X)
        X, y = getRemovedVals(X=X, Y=y, Ftype="W2V_Train")
    else:
        print("Error")
        return

    if int(sys.argv[2]) == 0:  # actual run
        # k-fold cross-validation, training a support vector machine on each fold
        kf = KFold(n_splits=3, shuffle=True, random_state=1)  # random_state requires shuffle=True
        svm = SVC(C=0.25, kernel='linear')
        acc_list = []

        for train_index, test_index in kf.split(X):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            svm.fit(X_train, y_train)
            print("----Start Evaluating----")
            acc = svm.score(X_test, y_test)
            acc_list.append(acc)
            print("Testing Accuracy:", acc)

        print("Mean testing accuracy:", sum(acc_list) / len(acc_list))

        y_pred = svm.predict(X_test)
        # Store y_pred vector
        save_y(sys.argv[1], "svm_y_pred", y_pred)
    else:  # grid search
        print("Performing Grid Search on SVM...")
        svm = SVC()
        parameters = {'kernel': ('linear', 'rbf'), 'C': (0.25, 0.5, 0.75)}
        grid = GridSearchCV(estimator=svm, param_grid=parameters, n_jobs=-1, cv=3, verbose=1)
        grid_result = grid.fit(X, y)
        means = grid_result.cv_results_['mean_test_score']
        stds = grid_result.cv_results_['std_test_score']
        params = grid_result.cv_results_['params']
        for mean, stdev, param in zip(means, stds, params):
            print("%f (%f) with: %r" % (mean, stdev, param))
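
# `save_y` is a project helper that is not shown in this listing. Below is a
# minimal sketch of what it might look like, assuming it simply persists a
# prediction/label vector as a .npy file under ./model_Ys/<feature>/ (the
# directory layout is inferred from the y_true_file path used in the LSTM
# script); the exact signature and storage layout are assumptions.
import os
import numpy as np

def save_y(feature, name, y):
    """Save a prediction (or ground-truth) vector for later model comparison."""
    out_dir = os.path.join(".", "model_Ys", feature)
    os.makedirs(out_dir, exist_ok=True)  # create ./model_Ys/<feature>/ if missing
    out_path = os.path.join(out_dir, name if name.endswith(".npy") else name + ".npy")
    np.save(out_path, np.asarray(y))
    print("Saved", out_path)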
def main():
    start = time()

    dfTrain = readdata.read_clean_data(readdata.TRAINFILEPATH, nolabel=False)
    Y_train = dfTrain["label"].to_numpy()

    lines_length = len(dfTrain.values)
    trainVal = dfTrain["text"].values
    training_text = [trainVal[i] for i in range(lines_length)]

    X_train_TFIDF = TFIDF(training_text)
    X_train_CV = CV(training_text)
    X_train_WV = word2vec(training_text)

    X_train_TFIDF = np.array(X_train_TFIDF)
    X_train_CV = np.array(X_train_CV)
    X_train_WV = np.array(X_train_WV)

    print("\nFor W2V\n")
    print(X_train_WV.shape, " is the X_train shape before removal")
    print("\nFor TFIDF\n")
    print(X_train_TFIDF.shape, " is the X_train shape before removal")
    print("\nFor CV\n")
    print(X_train_CV.shape, " is the X_train shape before removal")

    result1, perCVtrain = outlierDetection(X_train_CV, "CV train")
    result3, perTFIDFtrain = outlierDetection(X_train_TFIDF, "TFIDF train")
    result5, perWVtrain = outlierDetection(X_train_WV, "WV train")

    trainOutliers = [perCVtrain, perTFIDFtrain, perWVtrain]
    graphOutliers(trainOutliers)

    end = time()
    taken = (end - start) / 60.00
    print("Time taken: %f minutes" % taken)
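
# `outlierDetection` and `graphOutliers` are project helpers not shown here.
# The sketch below is one plausible implementation, assuming IsolationForest
# as the detector and that the helper returns (per-sample labels, percentage
# of outliers); the detector choice, return values, and plot styling are
# assumptions, not the project's confirmed method.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import IsolationForest

def outlierDetection(X, name):
    """Flag outliers in a feature matrix and report what fraction they make up."""
    detector = IsolationForest(random_state=1)
    labels = detector.fit_predict(X)  # +1 = inlier, -1 = outlier
    percent = 100.0 * np.sum(labels == -1) / len(labels)
    print("%s: %.2f%% flagged as outliers" % (name, percent))
    return labels, percent

def graphOutliers(percentages, names=("CV", "TFIDF", "W2V")):
    """Bar chart of the outlier percentage per feature representation."""
    plt.bar(names, percentages)
    plt.ylabel("Outliers (%)")
    plt.title("Outlier percentage per representation")
    plt.savefig("outliers.png")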
def main():
    # PCA feature reduction on cv, tfidf, word2vec
    dfTrain = readdata.read_clean_data(readdata.TRAINFILEPATH, nolabel=False)
    X = dfTrain['text'].to_numpy()

    if sys.argv[1] == 'cv':
        model_name = 'Count Vectorizer'
        X = CV(X)
    elif sys.argv[1] == 'tfidf':
        model_name = 'TFIDF'
        X = TFIDF(X)
    elif sys.argv[1] == 'word2vec':
        model_name = 'word2vec'
        X = word2vec(X)
    else:
        print("Error")
        return

    pca = PCA(n_components=2)
    if int(sys.argv[2]) == 1:
        X = StandardScaler().fit_transform(X)

    comp = pca.fit_transform(X)
    xdf = pd.DataFrame(data=comp, columns=['a', 'b'])
    finaldf = pd.concat([xdf, dfTrain[['label']]], axis=1)
    plot(finaldf, model_name)
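
# `plot(finaldf, model_name)` is not shown in this listing. A minimal sketch,
# assuming it scatter-plots the two PCA components colored by class label
# (the column names 'a', 'b', and 'label' match the DataFrame built above);
# the styling and output file name are assumptions.
import matplotlib.pyplot as plt

def plot(finaldf, model_name):
    """Scatter the two principal components, one color per label."""
    fig, ax = plt.subplots()
    for label, group in finaldf.groupby("label"):
        ax.scatter(group["a"], group["b"], s=5, label=str(label))
    ax.set_xlabel("Principal component 1")
    ax.set_ylabel("Principal component 2")
    ax.set_title("2-component PCA (%s)" % model_name)
    ax.legend()
    plt.savefig("pca_" + model_name.replace(" ", "_") + ".png")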
def main(): dfTrain = readdata.read_clean_data(readdata.TRAINFILEPATH) X = dfTrain['text'].to_numpy() y = dfTrain['label'].to_numpy() if sys.argv[1] == "cv": X = CV(X) X, y = getRemovedVals(X=X, Y=y, Ftype="CV_Train") look_back = 1 elif sys.argv[1] == 'tfidf': X = TFIDF(X) X, y = getRemovedVals(X=X, Y=y, Ftype="TFIDF_Train") look_back = 1 elif sys.argv[1] == 'word2vec': # if you were to run real recurrence, use lstm=True # And un-comment the below lookback X = word2vec(X, lstm=False) X, y = getRemovedVals(X=X, Y=y, Ftype="W2V_Train") look_back = 1 # look_back = X.shape[1] # un-comment to be used for real recurrence else: print("Error") return num_samples = X.shape[0] if look_back == 1: # reshape input to be [samples, time steps, features] num_features = X.shape[1] X = np.reshape(np.array(X), (num_samples, look_back, num_features)) else: num_features = X.shape[2] batch_size = 256 if int(sys.argv[2]) == 0: # actual run epochs = 25 # can change this kf = KFold(n_splits=3, random_state=1) acc_list = [] X_train = None # init X_test = None # init y_test = None #init # Doing cross validation testing for train_index, test_index in kf.split(X): X_train, X_test = X[train_index], X[test_index] y_train, y_test = y[train_index], y[test_index] model = create_model(look_back=look_back, input_nodes=num_features) history = model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=epochs, batch_size=batch_size) print("----Start Evaluating----") _, acc = model.evaluate(X_test, y_test, verbose=1) acc_list.append(acc) print("Testing Accuracy:", acc) print("Mean testing accuracy:", sum(acc_list) / len(acc_list)) loss = history.history['loss'] val_loss = history.history['val_loss'] accuracy = history.history['accuracy'] val_accuracy = history.history['val_accuracy'] graphs_nn(loss, val_loss, accuracy, val_accuracy) y_pred = model.predict(X_test) # Store y_pred vector save_y(sys.argv[1], "lstm_y_pred", y_pred) # Store y_true vector (Only one script needs this) y_true_file = Path("./model_Ys/true/y_true_" + sys.argv[1] + ".npy") if not y_true_file.is_file(): save_y("true", "y_true_" + sys.argv[1] + ".npy", y_test) else: # doing grid search epochs = 20 model = KerasClassifier(build_fn=create_model, look_back=look_back, input_nodes=num_features, epochs=epochs, batch_size=batch_size, verbose=1) param_grid = get_param_grid() grid = GridSearchCV(estimator=model, param_grid=param_grid, cv=3) grid_result = grid.fit(X, y) evaluate(grid_result)
def main(): dfTrain = readdata.read_clean_data(readdata.TRAINFILEPATH) X = dfTrain['text'].to_numpy() y = dfTrain['label'].to_numpy() if sys.argv[1] == "cv": X = CV(X) # train shape: (17973, 10000) X, y = getRemovedVals(X=X, Y=y, Ftype="CV_Train") elif sys.argv[1] == 'tfidf': X = TFIDF(X) # train shape: (17973, 10000) X, y = getRemovedVals(X=X, Y=y, Ftype="TFIDF_Train") elif sys.argv[1] == 'word2vec': X = word2vec(X) X, y = getRemovedVals(X=X, Y=y, Ftype="W2V_Train") else: print("Error") return num_samples = X.shape[0] num_features = X.shape[1] if int(sys.argv[2]) == 0: kf = KFold(n_splits=3, random_state=1) model = ANN( ) #need to populate this with best hyperparameters after all Grid search acc_list = [] X_train = None # init X_test = None # init for train_index, test_index in kf.split(X): # Doing cross validation testing X_train, X_test = X[train_index], X[test_index] y_train, y_test = y[train_index], y[test_index] model = ANN(input_dim=num_features) history = model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=EPOCHS, batch_size=BATCH_SIZE) print("----Start Evaluating----") _, acc = model.evaluate(X_test, y_test, verbose=1) acc_list.append(acc) print("Testing Accuracy:", acc) print("Mean testing accuracy:", sum(acc_list) / len(acc_list)) loss = history.history['loss'] val_loss = history.history['val_loss'] accuracy = history.history['acc'] val_accuracy = history.history['val_acc'] graphs_nn(loss, val_loss, accuracy, val_accuracy) y_pred = model.predict(X_test) # Store y_pred vector save_y(sys.argv[1], "ann_y_pred", y_pred) else: model = KerasClassifier(build_fn=ANN, input_dim=num_features, epochs=EPOCHS, batch_size=BATCH_SIZE, verbose=1, activation="relu", optimizer="Adam") # grid search on ann hyperparameters param_grid = get_param_grid() grid = GridSearchCV(estimator=model, param_grid=param_grid, cv=3) grid_result = grid.fit(X, y) print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_)) means = grid_result.cv_results_['mean_test_score'] stds = grid_result.cv_results_['std_test_score'] params = grid_result.cv_results_['params'] for mean, stdev, param in zip(means, stds, params): print("%f (%f) with: %r" % (mean, stdev, param))
def main(): dfTrain = readdata.read_clean_data(readdata.TRAINFILEPATH) X = dfTrain['text'].to_numpy() y = dfTrain['label'].to_numpy() if sys.argv[1] == "cv": X = CV(X) # train shape: (17973, 10000) X, y = getRemovedVals(X=X, Y=y, Ftype="CV_Train") elif sys.argv[1] == 'tfidf': X = TFIDF(X) # train shape: (17973, 10000) X, y = getRemovedVals(X=X, Y=y, Ftype="TFIDF_Train") elif sys.argv[1] == 'word2vec': X = word2vec(X) X, y = getRemovedVals(X=X, Y=y, Ftype="W2V_Train") else: print("Error") return if int(sys.argv[2]) == 0: # actual run X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, test_size=0.34) # These are the best hyper-para from the results of grid search max_depth = 9 min_samples_leaf = 4 min_samples_split = 5 n_estimators = 400 acc_list = [] X_train = None # init X_test = None # init kf = KFold(n_splits=3, random_state=1) for train_index, test_index in kf.split(X): # Doing cross validation testing X_train, X_test = X[train_index], X[test_index] y_train, y_test = y[train_index], y[test_index] model = RandomForestClassifier(max_depth=max_depth, min_samples_leaf=min_samples_leaf, min_samples_split=min_samples_split, n_estimators=n_estimators, n_jobs=-1) model.fit(X_train, y_train) print("----Start Evaluating----") acc = model.score(X_test, y_test) acc_list.append(acc) print("Testing Accuracy:", acc) print("Mean testing accuracy:", sum(acc_list) / len(acc_list)) y_pred = model.predict(X_test) # Store y_pred vector save_y(sys.argv[1], "random_forest_y_pred", y_pred) elif int(sys.argv[2]) == 1: # grid search # below are the hyperparameters to be grid-searched on n_estimators = [200, 400, 800] max_depth = [1, 5, 9] min_samples_leaf = [2, 4] min_samples_split = [5, 10] param_grid = dict(n_estimators=n_estimators, max_depth=max_depth, min_samples_leaf=min_samples_leaf, min_samples_split=min_samples_split) model = RandomForestClassifier() grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=3) grid_result = grid.fit(X, y) means = grid_result.cv_results_['mean_test_score'] stds = grid_result.cv_results_['std_test_score'] params = grid_result.cv_results_['params'] for mean, stdev, param in zip(means, stds, params): print("%f (%f) with: %r" % (mean, stdev, param)) else: print("Error") return
def main(): dfTrain = readdata.read_clean_data(readdata.TRAINFILEPATH) X = dfTrain['text'].to_numpy() y = dfTrain['label'].to_numpy() if sys.argv[1] == "cv": X = CV(X) # train shape: (17973, 10000) X, y = getRemovedVals(X=X, Y=y, Ftype="CV_Train") elif sys.argv[1] == 'tfidf': X = TFIDF(X) # train shape: (17973, 10000) X, y = getRemovedVals(X=X, Y=y, Ftype="TFIDF_Train") elif sys.argv[1] == 'word2vec': X = word2vec(X) X, y = getRemovedVals(X=X, Y=y, Ftype="W2V_Train") else: print("Error") return # reshape input to be [samples, features] num_samples = X.shape[0] num_features = X.shape[1] X = np.reshape(np.array(X), (num_samples, num_features)) if int(sys.argv[2]) == 0: # actual run C = None # to be set to the best hyperpara solver = None # to be set to the best hyperpara kf = KFold(n_splits=3, random_state=1) logistic = LogisticRegression(max_iter=500, C=C, solver=solver) acc_list = [] # Doing cross validation testing for train_index, test_index in kf.split(X): X_train, X_test = X[train_index], X[test_index] y_train, y_test = y[train_index], y[test_index] logistic.fit(X_train, y_train) print("----Start Evaluating----") acc = logistic.score(X_test, y_test) acc_list.append(acc) print("Testing Accuracy:", acc) print("Mean testing accuracy:", sum(acc_list) / len(acc_list)) y_pred = logistic.predict(X_test) # Store y_pred vector save_y(sys.argv[1], "logreg_y_pred", y_pred) else: # grid search # creating space for constant C c = np.logspace(0, 4, 10) # various solvers - only used solvers that supported L2 solver = ['newton-cg', 'sag', 'lbfgs', 'liblinear'] param_grid = dict(C=c, solver=solver) logistic = LogisticRegression(max_iter=500) grid = GridSearchCV(logistic, param_grid=param_grid, cv=3, verbose=1) grid_result = grid.fit(X, y) means = grid_result.cv_results_['mean_test_score'] stds = grid_result.cv_results_['std_test_score'] params = grid_result.cv_results_['params'] for mean, stdev, param in zip(means, stds, params): print("%f (%f) with: %r" % (mean, stdev, param))