# Module-level imports needed by this function
import numpy as np
import ClassRead  # Provides the Reader that builds the feature encodings
from sklearn.model_selection import KFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import (roc_auc_score, confusion_matrix, accuracy_score,
                             precision_recall_fscore_support)


def Multinomial_NB(train_A, words_of_tweets, extra_features, feature_selection, encoding, print_file):
    reading = ClassRead.Reader()  # Instantiate the ClassRead Reader to get the encoding
    x = np.array(words_of_tweets)
    y = train_A['label']

    # Initialize the running total of the ROC-AUC score, a counter for the
    # fold number, and the metrics whose averages are printed at the end
    av_roc = 0.
    count = 0
    precision = 0
    accuracy = 0
    recall = 0
    f1score = 0

    # Initialize 10-fold cross-validation; shuffle=True randomizes the splits
    # of the training data
    kf = KFold(n_splits=10, random_state=41, shuffle=True)

    # Run one iteration per fold
    for train_index, test_index in kf.split(x):
        count += 1
        print('Fold #: ', count)
        with open(print_file, "a") as myfile:  # Write the above print into the output file
            myfile.write('Fold #: ' + str(count) + '\n')

        # Index the train and test data for this fold; the rows are in random
        # order because shuffle=True
        x_train, x_test = reading.get_enc(x[train_index], 1, y[train_index], train_index, extra_features,
                                          feature_selection, encoding, print_file), \
                          reading.get_enc(x[test_index], 0, y[test_index], test_index, extra_features,
                                          feature_selection, encoding, print_file)
        y_train, y_test = y[train_index], y[test_index]

        ###############################################################################

        model = MultinomialNB()

        # Fit Multinomial Naive Bayes on the training fold, then predict on the test fold.
        # x: array-like, shape (n_samples, n_features) - training vectors.
        # y: array-like, shape (n_samples,) - target values.
        model.fit(x_train, y_train)
        y_pred = model.predict(x_test)

        ###############################################################################

        # The model is fit; evaluate its predictions on the held-out fold
        print("Evaluating model...")
        with open(print_file, "a") as myfile:  # Write the above print into the output file
            myfile.write("Evaluating model..." + '\n')

        roc = roc_auc_score(y_test, y_pred)

        # Print the ROC-AUC score for this fold and the running average
        print('ROC: ', roc)
        av_roc += roc
        print('Continued Avg: ', av_roc / count)
        with open(print_file, "a") as myfile:
            myfile.write('ROC: ' + str(roc) + '\n' + 'Continued Avg: ' + str(av_roc / count) + '\n')

        ###############################################################################

        y_pred = (y_pred > 0.5)  # Binarize the predictions (a no-op for 0/1 class labels)

        # Create the confusion matrix
        cm = confusion_matrix(y_test, y_pred)
        print(cm)
        with open(print_file, "a") as myfile:
            myfile.write(str(cm) + '\n')

        temp_accuracy = accuracy_score(y_test, y_pred)
        temp_precision, temp_recall, temp_f1_score, _ = precision_recall_fscore_support(y_test, y_pred,
                                                                                        average='binary')
        accuracy += temp_accuracy
        precision += temp_precision
        recall += temp_recall
        f1score += temp_f1_score

        print("Accuracy: ", temp_accuracy)
        print("Precision: ", temp_precision)
        print("Recall: ", temp_recall)
        print("F1 score: ", temp_f1_score)

    # Print the average of each metric over the 10 folds
    print("Average Precision: ", precision / 10)
    print("Average Accuracy: ", accuracy / 10)
    print("Average Recall: ", recall / 10)
    print("Average F1-score: ", f1score / 10)

    # Print the final average ROC-AUC score
    print('Average ROC:', av_roc / 10)
    with open(print_file, "a") as myfile:
        myfile.write("Average Precision: " + str(precision / 10) + '\n' +
                     "Average Accuracy: " + str(accuracy / 10) + '\n' +
                     "Average Recall: " + str(recall / 10) + '\n' +
                     "Average F1-score: " + str(f1score / 10) + '\n' +
                     'Average ROC:' + str(av_roc / 10) + '\n')
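# -----------------------------------------------------------------------------
# A minimal, self-contained sketch (separate from the pipeline above) of what
# MultinomialNB does with bag-of-words count features: it learns per-class
# feature log-probabilities and predicts the class with the highest posterior.
# The tiny corpus and labels below are invented purely for illustration.
def _multinomial_nb_demo():
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.naive_bayes import MultinomialNB

    docs = ["good fun good", "bad sad bad", "fun and good", "so sad and bad"]
    labels = [0, 1, 0, 1]  # toy binary labels

    counts = CountVectorizer().fit_transform(docs)  # sparse term-count matrix
    clf = MultinomialNB().fit(counts, labels)
    print(clf.predict(counts))        # hard 0/1 labels, as fed to roc_auc_score above
    print(clf.predict_proba(counts))  # class posteriors, an alternative ROC input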
# Modules implementing the individual classification approaches
import KNeighbors        # k-nearest-neighbours classification
import MultinomialNB     # Multinomial Naive Bayes classification
import VotingEnsembles   # voting-ensemble classification
import LSTM              # LSTM classification
import Conv1D            # 1-D convolutional classification
import ClassRead         # Reads the input and the training sets
import os

##############################################################################################################################################################
##############################################################################################################################################################
# Main
##############################################################################################################################################################
##############################################################################################################################################################

reading = ClassRead.Reader()  # Instantiate the Reader that loads the input and the training sets
dir = os.getcwd()  # Current working directory (note: this name shadows the built-in dir)

##############################################################################################################################################################
# Read the input and training files; optionally check whether the dataset is imbalanced
##############################################################################################################################################################

reading.readTrain()
# reading.checkImbalance()

##############################################################################################################################################################
# Call all algorithms with different combinations of feature selection and
# encoding (a hypothetical sketch of these calls is given below)
##############################################################################################################################################################
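# -----------------------------------------------------------------------------
# Hypothetical sketch of the calls announced above. The actual call lines are
# not shown in this excerpt, so everything below is an assumption: the numeric
# codes for feature_selection/encoding, the output-file naming, and the Reader
# attribute names (train_A, words_of_tweets, extra_features).
for feature_selection in (0, 1):      # assumed flag: 0 = off, 1 = on
    for encoding in (0, 1, 2):        # assumed codes, e.g. BoW / TF-IDF / embeddings
        out = os.path.join(dir, 'results_fs%d_enc%d.txt' % (feature_selection, encoding))
        MultinomialNB.Multinomial_NB(reading.train_A, reading.words_of_tweets,
                                     reading.extra_features, feature_selection,
                                     encoding, out)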
# Module-level imports needed by this function
import numpy as np
import ClassRead  # Provides the Reader that builds the feature encodings
from sklearn.model_selection import KFold
from sklearn.metrics import (roc_auc_score, confusion_matrix, accuracy_score,
                             precision_recall_fscore_support)
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM
from keras.optimizers import RMSprop
from keras.constraints import maxnorm
from keras import callbacks


def lstm(train_A, words_of_tweets, extra_features, feature_selection, encoding, print_file):
    reading = ClassRead.Reader()  # Instantiate the ClassRead Reader to get the encoding
    x = np.array(words_of_tweets)
    y = train_A['label']

    # -------------------------------------------------------------------------
    # Initialize the running total of the ROC-AUC score, a counter for the
    # fold number, and the metrics whose averages are printed at the end
    av_roc = 0.
    count = 0
    precision = 0
    accuracy = 0
    recall = 0
    f1score = 0

    # The three variables below are used for the ROC-AUC curve
    tprs = []
    aucs = []
    mean_fpr = np.linspace(0, 1, 100)

    # Fix the random seed for reproducibility
    np.random.seed(7)

    # -------------------------------------------------------------------------
    # Initialize 10-fold cross-validation; shuffle=True randomizes the splits
    kf = KFold(n_splits=10, random_state=41, shuffle=True)

    # Run one iteration per fold
    for train_index, test_index in kf.split(x):
        count += 1
        print('Fold #: ', count)
        with open(print_file, "a") as myfile:  # Write the above print into the output file
            myfile.write('Fold #: ' + str(count) + '\n')

        # Index the train and test data for this fold; rows are in random order because shuffle=True
        x_train, x_test = reading.get_enc(x[train_index], 1, y[train_index], train_index, extra_features,
                                          feature_selection, encoding, print_file), \
                          reading.get_enc(x[test_index], 0, y[test_index], test_index, extra_features,
                                          feature_selection, encoding, print_file)
        y_train, y_test = y[train_index], y[test_index]

        # -------------------------------------------------------------------------
        # Initialize the neural network
        classifier = Sequential()

        print(x_train.shape[0], ' ', x_train.shape[1])
        print(x_test.shape[0], ' ', x_test.shape[1])

        # Reshape to (samples, timesteps, features), the 3-D input expected by the LSTM layer
        x_train = x_train.reshape(x_train.shape[0], 1, x_train.shape[1])
        x_test = x_test.reshape(x_test.shape[0], 1, x_test.shape[1])

        classifier.add(LSTM(10, input_shape=(1, x_train.shape[2]), return_sequences=True,
                            activation='softplus'))
        classifier.add(Dropout(0.2))
        classifier.add(LSTM(20, activation='softplus'))
        classifier.add(Dropout(0.2))
        classifier.add(Dense(500, kernel_initializer='glorot_uniform', activation='softsign',
                             kernel_constraint=maxnorm(2)))

        # Add the output layer with a single sigmoid unit
        classifier.add(Dense(1, kernel_initializer='glorot_uniform', activation='sigmoid'))

        optimizer = RMSprop(lr=0.001)

        # Compile the neural network
        classifier.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

        # -------------------------------------------------------------------------
        # Early stopping only takes effect when passed to fit(), and it needs a
        # validation set to monitor val_loss
        early_stopping = callbacks.EarlyStopping(monitor='val_loss', min_delta=0, patience=2,
                                                 verbose=0, mode='auto')

        # Fit the model
        classifier.fit(x_train, y_train, batch_size=20, epochs=50,
                       validation_split=0.1, callbacks=[early_stopping])

        # -------------------------------------------------------------------------
        # The model is fit; evaluate its predictions on the held-out fold
        print("Evaluating model...")
        with open(print_file, "a") as myfile:  # Write the above print into the output file
            myfile.write("Evaluating model..." + '\n')

        test_preds = classifier.predict_proba(x_test, verbose=0)
        roc = roc_auc_score(y_test, test_preds)
        scores = classifier.evaluate(x_test, y_test)
        print(scores)

        # Capture the model summary as a string (summary() prints and returns None)
        summary_lines = []
        classifier.summary(print_fn=summary_lines.append)
        summary_str = '\n'.join(summary_lines)
        print(summary_str)

        # Print the ROC-AUC score for this fold and the running average
        print('ROC: ', roc)
        av_roc += roc
        print('Continued Avg: ', av_roc / count)
        with open(print_file, "a") as myfile:
            myfile.write('Scores: ' + str(scores) + '\n' +
                         'Classifier summary: ' + summary_str + '\n' +
                         'ROC: ' + str(roc) + '\n' +
                         'Continued Avg: ' + str(av_roc / count) + '\n')

        # -------------------------------------------------------------------------
        '''
        # Compute the ROC curve and the area under it
        fpr, tpr, thresholds = roc_curve(y_test, test_preds)
        tprs.append(interp(mean_fpr, fpr, tpr))
        tprs[-1][0] = 0.0
        roc_auc = auc(fpr, tpr)
        aucs.append(roc_auc)
        plt.plot(fpr, tpr, lw=1, alpha=0.3, label='ROC fold %d (AUC = %0.2f)' % (count - 1, roc_auc))
        '''
        # -------------------------------------------------------------------------
        # Predict the test-set results and threshold the probabilities at 0.5
        y_pred = classifier.predict(x_test)
        y_pred = (y_pred > 0.5)

        # Create the confusion matrix
        cm = confusion_matrix(y_test, y_pred)
        print(cm)
        with open(print_file, "a") as myfile:
            myfile.write(str(cm) + '\n')

        temp_accuracy = accuracy_score(y_test, y_pred)
        temp_precision, temp_recall, temp_f1_score, _ = precision_recall_fscore_support(y_test, y_pred,
                                                                                        average='binary')
        accuracy += temp_accuracy
        precision += temp_precision
        recall += temp_recall
        f1score += temp_f1_score

        print("Accuracy: ", temp_accuracy)
        print("Precision: ", temp_precision)
        print("Recall: ", temp_recall)
        print("F1 score: ", temp_f1_score)

    # Create the ROC-AUC curve
    # compute_ROC_Curve(tprs, mean_fpr, aucs)

    # Print the average of each metric over the 10 folds
    print("Average Precision: ", precision / 10)
    print("Average Accuracy: ", accuracy / 10)
    print("Average Recall: ", recall / 10)
    print("Average F1-score: ", f1score / 10)

    # Print the final average ROC-AUC score
    print('Average ROC:', av_roc / 10)
    with open(print_file, "a") as myfile:
        myfile.write("Average Precision: " + str(precision / 10) + '\n' +
                     "Average Accuracy: " + str(accuracy / 10) + '\n' +
                     "Average Recall: " + str(recall / 10) + '\n' +
                     "Average F1-score: " + str(f1score / 10) + '\n' +
                     'Average ROC:' + str(av_roc / 10) + '\n')
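# -----------------------------------------------------------------------------
# A minimal sketch (separate from the function above) of the reshape the LSTM
# input requires: Keras recurrent layers consume 3-D tensors of shape
# (samples, timesteps, features). With a single timestep, as above, each tweet's
# feature vector becomes a length-1 "sequence". The array sizes are made up.
def _lstm_shape_demo():
    import numpy as np
    flat = np.zeros((32, 500))      # 32 samples, 500 encoded features each
    seq = flat.reshape(32, 1, 500)  # -> (samples=32, timesteps=1, features=500)
    print(seq.shape)                # (32, 1, 500)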
# Module-level imports needed by this function
import numpy as np
import ClassRead  # Provides the Reader that builds the feature encodings
from sklearn import svm
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import (roc_auc_score, confusion_matrix, accuracy_score,
                             precision_recall_fscore_support)


def Voting_Ensembles(train_A, words_of_tweets, extra_features, feature_selection, encoding, print_file):
    reading = ClassRead.Reader()  # Instantiate the ClassRead Reader to get the encoding
    x = np.array(words_of_tweets)
    y = train_A['label']

    # Initialize the running total of the ROC-AUC score, a counter for the
    # fold number, and the metrics whose averages are printed at the end
    av_roc = 0.
    count = 0
    precision = 0
    accuracy = 0
    recall = 0
    f1score = 0

    # The three variables below are used for the ROC-AUC curve
    tprs = []
    aucs = []
    mean_fpr = np.linspace(0, 1, 100)

    # Initialize 10-fold cross-validation; shuffle=True randomizes the splits
    kf = KFold(n_splits=10, random_state=41, shuffle=True)

    # Run one iteration per fold
    for train_index, test_index in kf.split(x):
        count += 1
        print('Fold #: ', count)
        with open(print_file, "a") as myfile:  # Write the above print into the output file
            myfile.write('Fold #: ' + str(count) + '\n')

        # Index the train and test data for this fold; rows are in random order because shuffle=True
        x_train, x_test = reading.get_enc(x[train_index], 1, y[train_index], train_index, extra_features,
                                          feature_selection, encoding, print_file), \
                          reading.get_enc(x[test_index], 0, y[test_index], test_index, extra_features,
                                          feature_selection, encoding, print_file)
        y_train, y_test = y[train_index], y[test_index]

        ###############################################################################

        # Base estimators: four RBF SVMs with decreasing C, a k-NN classifier,
        # and a Bernoulli Naive Bayes classifier, combined by majority (hard) vote
        class1 = svm.SVC(kernel='rbf', C=10000, gamma=0.1)
        class2 = svm.SVC(kernel='rbf', C=1000, gamma=0.1)
        class3 = svm.SVC(kernel='rbf', C=100, gamma=0.1)
        class4 = svm.SVC(kernel='rbf', C=10, gamma=0.1)
        class5 = KNeighborsClassifier(n_neighbors=140)
        class6 = BernoulliNB()
        model = VotingClassifier(
            estimators=[('svm1', class1), ('svm2', class2), ('svm3', class3), ('svm4', class4),
                        ('kneigh', class5), ('bern', class6)],
            voting='hard')

        ###############################################################################

        model.fit(x_train, y_train)
        y_pred = model.predict(x_test)

        ###############################################################################

        # The model is fit; evaluate its predictions on the held-out fold
        print("Evaluating model...")
        with open(print_file, "a") as myfile:  # Write the above print into the output file
            myfile.write("Evaluating model..." + '\n')

        # With hard voting the ensemble exposes only class labels, so the
        # ROC-AUC is computed from the hard predictions
        roc = roc_auc_score(y_test, y_pred)

        # Print the ROC-AUC score for this fold and the running average
        print('ROC: ', roc)
        av_roc += roc
        print('Continued Avg: ', av_roc / count)
        with open(print_file, "a") as myfile:
            myfile.write('ROC: ' + str(roc) + '\n' + 'Continued Avg: ' + str(av_roc / count) + '\n')

        y_pred = (y_pred > 0.5)  # Binarize the predictions (a no-op for 0/1 class labels)

        # -------------------------------------------------------------------------
        '''
        # Compute the ROC curve and the area under it
        fpr, tpr, thresholds = roc_curve(y_test, y_pred)
        tprs.append(interp(mean_fpr, fpr, tpr))
        tprs[-1][0] = 0.0
        roc_auc = auc(fpr, tpr)
        aucs.append(roc_auc)
        plt.plot(fpr, tpr, lw=1, alpha=0.3, label='ROC fold %d (AUC = %0.2f)' % (count - 1, roc_auc))
        '''
        # -------------------------------------------------------------------------
        # Create the confusion matrix
        cm = confusion_matrix(y_test, y_pred)
        print(cm)
        with open(print_file, "a") as myfile:
            myfile.write(str(cm) + '\n')

        temp_accuracy = accuracy_score(y_test, y_pred)
        temp_precision, temp_recall, temp_f1_score, _ = precision_recall_fscore_support(y_test, y_pred,
                                                                                        average='binary')
        accuracy += temp_accuracy
        precision += temp_precision
        recall += temp_recall
        f1score += temp_f1_score

        print("Accuracy: ", temp_accuracy)
        print("Precision: ", temp_precision)
        print("Recall: ", temp_recall)
        print("F1 score: ", temp_f1_score)

    # Create the ROC-AUC curve
    # compute_ROC_Curve(tprs, mean_fpr, aucs)

    # Print the average of each metric over the 10 folds
    print("Average Precision: ", precision / 10)
    print("Average Accuracy: ", accuracy / 10)
    print("Average Recall: ", recall / 10)
    print("Average F1-score: ", f1score / 10)

    # Print the final average ROC-AUC score
    print('Average ROC:', av_roc / 10)
    with open(print_file, "a") as myfile:
        myfile.write("Average Precision: " + str(precision / 10) + '\n' +
                     "Average Accuracy: " + str(accuracy / 10) + '\n' +
                     "Average Recall: " + str(recall / 10) + '\n' +
                     "Average F1-score: " + str(f1score / 10) + '\n' +
                     'Average ROC:' + str(av_roc / 10) + '\n')
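# -----------------------------------------------------------------------------
# A minimal sketch of the hard- vs soft-voting distinction used above. Hard
# voting takes the majority of predicted labels; soft voting averages class
# probabilities, which requires every estimator to expose predict_proba
# (e.g. SVC(probability=True)). The toy data below is invented for illustration.
def _voting_demo():
    from sklearn.datasets import make_classification
    from sklearn.ensemble import VotingClassifier
    from sklearn.naive_bayes import BernoulliNB
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.svm import SVC

    X, y = make_classification(n_samples=200, random_state=41)
    hard = VotingClassifier([('svm', SVC()), ('knn', KNeighborsClassifier()),
                             ('bnb', BernoulliNB())], voting='hard').fit(X, y)
    soft = VotingClassifier([('svm', SVC(probability=True)), ('knn', KNeighborsClassifier()),
                             ('bnb', BernoulliNB())], voting='soft').fit(X, y)
    print(hard.predict(X[:5]))                # majority of hard labels
    print(soft.predict_proba(X[:5])[:, 1])    # averaged class-1 probabilities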
# Module-level imports needed by this function
import numpy as np
import ClassRead  # Provides the Reader that builds the feature encodings
from sklearn import svm
from sklearn.model_selection import KFold
from sklearn.metrics import (roc_auc_score, confusion_matrix, accuracy_score,
                             precision_recall_fscore_support)


def svm_func(train_A, words_of_tweets, extra_features, feature_selection, encoding, print_file):
    reading = ClassRead.Reader()  # Instantiate the ClassRead Reader to get the encoding
    x = np.array(words_of_tweets)
    y = train_A['label']

    # Initialize the running total of the ROC-AUC score, a counter for the
    # fold number, and the metrics whose averages are printed at the end
    av_roc = 0.
    count = 0
    precision = 0
    accuracy = 0
    recall = 0
    f1score = 0

    # The three variables below are used for the ROC-AUC curve
    tprs = []
    aucs = []
    mean_fpr = np.linspace(0, 1, 100)

    # Initialize 10-fold cross-validation; shuffle=True randomizes the splits
    kf = KFold(n_splits=10, random_state=41, shuffle=True)

    # Run one iteration per fold
    for train_index, test_index in kf.split(x):
        count += 1
        print('Fold #: ', count)
        with open(print_file, "a") as myfile:  # Write the above print into the output file
            myfile.write('Fold #: ' + str(count) + '\n')

        # Index the train and test data for this fold; rows are in random order because shuffle=True
        x_train, x_test = reading.get_enc(x[train_index], 1, y[train_index], train_index, extra_features,
                                          feature_selection, encoding, print_file), \
                          reading.get_enc(x[test_index], 0, y[test_index], test_index, extra_features,
                                          feature_selection, encoding, print_file)
        y_train, y_test = y[train_index], y[test_index]

        # x_train (predictors) and y_train (targets) come from the training fold;
        # x_test (predictors) comes from the test fold.
        # Create the SVM classification object.
        # For very large C the margin is hard and no points may lie inside it;
        # for smaller C the margin is softer and can grow to encompass some points.
        # gamma: the higher its value, the more exactly the model tries to fit the
        # training set, which raises the generalization error and risks over-fitting.
        model = svm.SVC(kernel='rbf', C=100, gamma=0.1)

        ###############################################################################
        # Feature scaling (currently disabled)
        # sc = StandardScaler()
        # x_train = sc.fit_transform(x_train)
        # x_test = sc.transform(x_test)
        ###############################################################################

        model.fit(x_train, y_train)
        model.score(x_train, y_train)  # Training accuracy (value currently unused)

        # Predict the output
        y_pred = model.predict(x_test)

        ###############################################################################

        # The model is fit; evaluate its predictions on the held-out fold
        print("Evaluating model...")
        with open(print_file, "a") as myfile:  # Write the above print into the output file
            myfile.write("Evaluating model..." + '\n')

        roc = roc_auc_score(y_test, y_pred)

        # Print the ROC-AUC score for this fold and the running average
        print('ROC: ', roc)
        av_roc += roc
        print('Continued Avg: ', av_roc / count)
        with open(print_file, "a") as myfile:
            myfile.write('ROC: ' + str(roc) + '\n' + 'Continued Avg: ' + str(av_roc / count) + '\n')

        y_pred = (y_pred > 0.5)  # Binarize the predictions (a no-op for 0/1 class labels)

        # -------------------------------------------------------------------------
        '''
        # Compute the ROC curve and the area under it
        fpr, tpr, thresholds = roc_curve(y_test, y_pred)
        tprs.append(interp(mean_fpr, fpr, tpr))
        tprs[-1][0] = 0.0
        roc_auc = auc(fpr, tpr)
        aucs.append(roc_auc)
        plt.plot(fpr, tpr, lw=1, alpha=0.3, label='ROC fold %d (AUC = %0.2f)' % (count - 1, roc_auc))
        '''
        # -------------------------------------------------------------------------
        # Create the confusion matrix
        cm = confusion_matrix(y_test, y_pred)
        print(cm)
        with open(print_file, "a") as myfile:
            myfile.write(str(cm) + '\n')

        temp_accuracy = accuracy_score(y_test, y_pred)
        temp_precision, temp_recall, temp_f1_score, _ = precision_recall_fscore_support(y_test, y_pred,
                                                                                        average='binary')
        accuracy += temp_accuracy
        precision += temp_precision
        recall += temp_recall
        f1score += temp_f1_score

        print("Accuracy: ", temp_accuracy)
        print("Precision: ", temp_precision)
        print("Recall: ", temp_recall)
        print("F1 score: ", temp_f1_score)

    # Create the ROC-AUC curve
    # compute_ROC_Curve(tprs, mean_fpr, aucs)

    ##########################################################################################################################
    # Print the average of each metric over the 10 folds
    print("Average Precision: ", precision / 10)
    print("Average Accuracy: ", accuracy / 10)
    print("Average Recall: ", recall / 10)
    print("Average F1-score: ", f1score / 10)

    # Print the final average ROC-AUC score
    print('Average ROC:', av_roc / 10)
    with open(print_file, "a") as myfile:
        myfile.write("Average Precision: " + str(precision / 10) + '\n' +
                     "Average Accuracy: " + str(accuracy / 10) + '\n' +
                     "Average Recall: " + str(recall / 10) + '\n' +
                     "Average F1-score: " + str(f1score / 10) + '\n' +
                     'Average ROC:' + str(av_roc / 10) + '\n')
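# -----------------------------------------------------------------------------
# A minimal sketch of how hard-coded values such as C=100 and gamma=0.1 could
# instead be chosen by a grid search (this is an illustration, not how the
# values above were actually obtained). Toy data is generated for the example.
def _svm_grid_demo():
    from sklearn.datasets import make_classification
    from sklearn.model_selection import GridSearchCV
    from sklearn.svm import SVC

    X, y = make_classification(n_samples=200, random_state=41)
    grid = GridSearchCV(SVC(kernel='rbf'),
                        param_grid={'C': [10, 100, 1000, 10000], 'gamma': [0.01, 0.1, 1]},
                        cv=5)
    grid.fit(X, y)
    print(grid.best_params_, grid.best_score_)  # best (C, gamma) pair and its CV accuracy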
# Module-level imports needed by this function
import numpy as np
import ClassRead  # Provides the Reader that builds the feature encodings
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import Normalizer
from sklearn.metrics import (roc_auc_score, confusion_matrix, accuracy_score,
                             precision_recall_fscore_support, classification_report)


def K_Neighbors(train_A, words_of_tweets, extra_features, feature_selection, encoding, print_file):
    reading = ClassRead.Reader()  # Instantiate the ClassRead Reader to get the encoding
    x = np.array(words_of_tweets)
    y = train_A['label']

    # Initialize the running total of the ROC-AUC score, a counter for the
    # fold number, and the metrics whose averages are printed at the end
    av_roc = 0.
    count = 0
    precision = 0
    accuracy = 0
    recall = 0
    f1score = 0

    # The three variables below are used for the ROC-AUC curve
    tprs = []
    aucs = []
    mean_fpr = np.linspace(0, 1, 100)

    # Initialize 10-fold cross-validation; shuffle=True randomizes the splits
    kf = KFold(n_splits=10, random_state=41, shuffle=True)

    # Run one iteration per fold
    for train_index, test_index in kf.split(x):
        count += 1
        print('Fold #: ', count)
        with open(print_file, "a") as myfile:  # Write the above print into the output file
            myfile.write('Fold #: ' + str(count) + '\n')

        # Index the train and test data for this fold; rows are in random order because shuffle=True
        x_train, x_test = reading.get_enc(x[train_index], 1, y[train_index], train_index, extra_features,
                                          feature_selection, encoding, print_file), \
                          reading.get_enc(x[test_index], 0, y[test_index], test_index, extra_features,
                                          feature_selection, encoding, print_file)
        y_train, y_test = y[train_index], y[test_index]

        ###############################################################################
        # KNeighborsClassifier parameters:
        # leaf_size: int, optional (default=30)
        # p: integer, optional (default=2)
        #   p = 1 is equivalent to the manhattan distance (l1), p = 2 to the
        #   euclidean distance (l2); for arbitrary p the minkowski distance (l_p) is used.
        # algorithm: {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional
        #   Algorithm used to compute the nearest neighbors:
        #   'ball_tree' uses BallTree, 'kd_tree' uses KDTree, 'brute' uses a
        #   brute-force search, and 'auto' attempts to pick the most appropriate
        #   algorithm from the values passed to fit.
        # weights: str or callable, optional (default='uniform')
        #   Weight function used in prediction. Possible values:
        #   'uniform': all points in each neighborhood are weighted equally;
        #   'distance': weight points by the inverse of their distance, so closer
        #   neighbors of a query point have greater influence than more distant ones.

        # Scale each sample to unit norm, then fit k-NN with 140 neighbors
        scaler = Normalizer()
        x_train = scaler.fit_transform(x_train)
        x_test = scaler.transform(x_test)
        classifier = KNeighborsClassifier(n_neighbors=140)
        classifier.fit(x_train, y_train)
        y_pred = classifier.predict(x_test)

        ###############################################################################

        # The model is fit; evaluate its predictions on the held-out fold
        print("Evaluating model...")
        with open(print_file, "a") as myfile:  # Write the above print into the output file
            myfile.write("Evaluating model..." + '\n')

        roc = roc_auc_score(y_test, y_pred)

        # Print the ROC-AUC score for this fold and the running average
        print('ROC: ', roc)
        av_roc += roc
        print('Continued Avg: ', av_roc / count)
        with open(print_file, "a") as myfile:
            myfile.write('ROC: ' + str(roc) + '\n' + 'Continued Avg: ' + str(av_roc / count) + '\n')

        # -------------------------------------------------------------------------
        '''
        # Compute the ROC curve and the area under it
        fpr, tpr, thresholds = roc_curve(y_test, y_pred)
        tprs.append(interp(mean_fpr, fpr, tpr))
        tprs[-1][0] = 0.0
        roc_auc = auc(fpr, tpr)
        aucs.append(roc_auc)
        plt.plot(fpr, tpr, lw=1, alpha=0.3, label='ROC fold %d (AUC = %0.2f)' % (count - 1, roc_auc))
        '''
        # -------------------------------------------------------------------------

        y_pred = (y_pred > 0.5)  # Binarize the predictions (a no-op for 0/1 class labels)

        # Create the confusion matrix
        cm = confusion_matrix(y_test, y_pred)
        print(cm)
        with open(print_file, "a") as myfile:
            myfile.write(str(cm) + '\n')

        report = classification_report(y_test, y_pred)
        print(report)

        temp_accuracy = accuracy_score(y_test, y_pred)
        temp_precision, temp_recall, temp_f1_score, _ = precision_recall_fscore_support(y_test, y_pred,
                                                                                        average='binary')
        accuracy += temp_accuracy
        precision += temp_precision
        recall += temp_recall
        f1score += temp_f1_score

        print("Accuracy: ", temp_accuracy)
        print("Precision: ", temp_precision)
        print("Recall: ", temp_recall)
        print("F1 score: ", temp_f1_score)

    # Create the ROC-AUC curve
    # compute_ROC_Curve(tprs, mean_fpr, aucs)

    # Print the average of each metric over the 10 folds
    print("Average Precision: ", precision / 10)
    print("Average Accuracy: ", accuracy / 10)
    print("Average Recall: ", recall / 10)
    print("Average F1-score: ", f1score / 10)

    # Print the final average ROC-AUC score
    print('Average ROC:', av_roc / 10)
    with open(print_file, "a") as myfile:
        myfile.write("Average Precision: " + str(precision / 10) + '\n' +
                     "Average Accuracy: " + str(accuracy / 10) + '\n' +
                     "Average Recall: " + str(recall / 10) + '\n' +
                     "Average F1-score: " + str(f1score / 10) + '\n' +
                     'Average ROC:' + str(av_roc / 10) + '\n')
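# -----------------------------------------------------------------------------
# A minimal sketch of how a neighbour count such as n_neighbors=140 could be
# selected by cross-validation (illustrative only; not how the value above was
# derived). Note that Normalizer scales each sample to unit norm, whereas
# StandardScaler standardizes each feature; for cosine-like distances on sparse
# text features, per-sample normalization is the usual choice.
def _knn_k_demo():
    from sklearn.datasets import make_classification
    from sklearn.model_selection import cross_val_score
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.preprocessing import Normalizer

    X, y = make_classification(n_samples=300, random_state=41)
    X = Normalizer().fit_transform(X)
    for k in (5, 20, 80, 140):
        score = cross_val_score(KNeighborsClassifier(n_neighbors=k), X, y, cv=5).mean()
        print(k, round(score, 3))  # mean CV accuracy per candidate k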
# Module-level imports needed by this function
import numpy as np
import ClassRead  # Provides the Reader that builds the feature encodings
from sklearn.model_selection import KFold
from sklearn.metrics import (roc_auc_score, confusion_matrix, accuracy_score,
                             precision_recall_fscore_support)
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import RMSprop
from keras.constraints import maxnorm
from keras import callbacks


def neural(train_A, words_of_tweets, extra_features, feature_selection, encoding, print_file):
    reading = ClassRead.Reader()  # Instantiate the ClassRead Reader to get the encoding
    x = np.array(words_of_tweets)
    y = train_A['label']

    # -------------------------------------------------------------------------
    # Initialize the running total of the ROC-AUC score, a counter for the
    # fold number, and the metrics whose averages are printed at the end
    av_roc = 0.
    count = 0
    precision = 0
    accuracy = 0
    recall = 0
    f1score = 0

    # The three variables below are used for the ROC-AUC curve
    tprs = []
    aucs = []
    mean_fpr = np.linspace(0, 1, 100)

    # -------------------------------------------------------------------------
    # Initialize 10-fold cross-validation; shuffle=True randomizes the splits
    kf = KFold(n_splits=10, random_state=41, shuffle=True)

    # Run one iteration per fold
    for train_index, test_index in kf.split(x):
        count += 1
        print('Fold #: ', count)
        with open(print_file, "a") as myfile:  # Write the above print into the output file
            myfile.write('Fold #: ' + str(count) + '\n')

        # Index the train and test data for this fold; rows are in random order because shuffle=True
        x_train, x_test = reading.get_enc(x[train_index], 1, y[train_index], train_index, extra_features,
                                          feature_selection, encoding, print_file), \
                          reading.get_enc(x[test_index], 0, y[test_index], test_index, extra_features,
                                          feature_selection, encoding, print_file)
        y_train, y_test = y[train_index], y[test_index]

        # -------------------------------------------------------------------------
        # Initialize the neural network
        classifier = Sequential()

        feature_dimensions = x_train.shape[1]
        print("second dimension (feature dimension): ", x_train.shape[1])

        # Add the input layer and the first hidden layer (20 neurons)
        classifier.add(Dense(20, kernel_initializer='glorot_uniform', activation='softsign',
                             input_dim=feature_dimensions, kernel_constraint=maxnorm(2)))
        classifier.add(Dropout(0.2))

        # Add the second hidden layer (10 neurons)
        classifier.add(Dense(10, kernel_initializer='glorot_uniform', activation='softsign',
                             kernel_constraint=maxnorm(2)))
        classifier.add(Dropout(0.2))

        # Add the output layer with a single sigmoid unit
        classifier.add(Dense(1, kernel_initializer='glorot_uniform', activation='sigmoid'))

        optimizer = RMSprop(lr=0.001)

        # Compile the neural network
        classifier.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

        # -------------------------------------------------------------------------
        # Hyper-parameter tuning experiments, kept for reference (a hypothetical
        # create_model builder for these is sketched after this function)
        '''
        # How to tune batch size and number of epochs
        # create model
        model = KerasClassifier(build_fn=create_model, verbose=0)
        # define the grid search parameters
        batch_size = [10, 20, 40, 60, 80, 100]
        epochs = [10, 20, 40]
        param_grid = dict(batch_size=batch_size, epochs=epochs)
        '''
        '''
        # create model
        model = KerasClassifier(build_fn=create_model, epochs=20, batch_size=20, verbose=0)

        # How to tune the training optimization algorithm
        # define the grid search parameters
        optimizer = ['SGD', 'RMSprop', 'Adagrad', 'Adadelta', 'Adam', 'Adamax', 'Nadam']
        param_grid = dict(optimizer=optimizer)

        # How to tune learning rate and momentum
        # define the grid search parameters
        learn_rate = [0.001, 0.01, 0.1, 0.2, 0.3]
        # momentum = [0.0, 0.2, 0.4, 0.6, 0.8, 0.9]
        param_grid = dict(learn_rate=learn_rate)

        # How to tune network weight initialization
        # define the grid search parameters
        init_mode = ['uniform', 'lecun_uniform', 'normal', 'zero', 'glorot_normal', 'glorot_uniform',
                     'he_normal', 'he_uniform']
        param_grid = dict(init_mode=init_mode)

        # How to tune the neuron activation function
        # define the grid search parameters
        activation = ['softmax', 'softplus', 'softsign', 'relu', 'tanh', 'sigmoid', 'hard_sigmoid', 'linear']
        param_grid = dict(activation=activation)

        # How to tune dropout regularization
        # define the grid search parameters
        weight_constraint = [1, 2, 3, 4, 5]
        dropout_rate = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
        param_grid = dict(dropout_rate=dropout_rate, weight_constraint=weight_constraint)

        # How to tune the number of neurons in the hidden layer
        # define the grid search parameters
        neurons = [1, 5, 10, 15, 20, 25, 30, 35, 40]
        param_grid = dict(neurons=neurons)
        '''
        # -------------------------------------------------------------------------
        '''
        grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=1)
        # Use only the training set (the whole data set cannot be used because it is not encoded)
        grid_result = grid.fit(x_train, y_train)

        # summarize results
        print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
        means = grid_result.cv_results_['mean_test_score']
        stds = grid_result.cv_results_['std_test_score']
        params = grid_result.cv_results_['params']
        for mean, stdev, param in zip(means, stds, params):
            print("%f (%f) with: %r" % (mean, stdev, param))
        '''
        # -------------------------------------------------------------------------
        # classifier = model
        # classifier = create_model()

        # Early stopping only takes effect when passed to fit(), and it needs a
        # validation set to monitor val_loss
        early_stopping = callbacks.EarlyStopping(monitor='val_loss', min_delta=0, patience=2,
                                                 verbose=0, mode='auto')

        # Fit the model
        classifier.fit(x_train, y_train, batch_size=20, epochs=50,
                       validation_split=0.1, callbacks=[early_stopping])

        # -------------------------------------------------------------------------
        # The model is fit; evaluate its predictions on the held-out fold
        print("Evaluating model...")
        with open(print_file, "a") as myfile:  # Write the above print into the output file
            myfile.write("Evaluating model..." + '\n')

        test_preds = classifier.predict_proba(x_test, verbose=0)
        roc = roc_auc_score(y_test, test_preds)
        scores = classifier.evaluate(x_test, y_test)
        print(scores)

        # Capture the model summary as a string (summary() prints and returns None)
        summary_lines = []
        classifier.summary(print_fn=summary_lines.append)
        summary_str = '\n'.join(summary_lines)
        print(summary_str)

        # Print the ROC-AUC score for this fold and the running average
        print('ROC: ', roc)
        av_roc += roc
        print('Continued Avg: ', av_roc / count)
        with open(print_file, "a") as myfile:
            myfile.write('Scores: ' + str(scores) + '\n' +
                         'Classifier summary: ' + summary_str + '\n' +
                         'ROC: ' + str(roc) + '\n' +
                         'Continued Avg: ' + str(av_roc / count) + '\n')

        # -------------------------------------------------------------------------
        '''
        # Compute the ROC curve and the area under it
        fpr, tpr, thresholds = roc_curve(y_test, test_preds)
        tprs.append(interp(mean_fpr, fpr, tpr))
        tprs[-1][0] = 0.0
        roc_auc = auc(fpr, tpr)
        aucs.append(roc_auc)
        plt.plot(fpr, tpr, lw=1, alpha=0.3, label='ROC fold %d (AUC = %0.2f)' % (count - 1, roc_auc))
        '''
        # -------------------------------------------------------------------------
        # Predict the test-set results and threshold the probabilities at 0.5
        y_pred = classifier.predict(x_test)
        y_pred = (y_pred > 0.5)

        # Create the confusion matrix
        cm = confusion_matrix(y_test, y_pred)
        print(cm)
        with open(print_file, "a") as myfile:
            myfile.write(str(cm) + '\n')

        temp_accuracy = accuracy_score(y_test, y_pred)
        temp_precision, temp_recall, temp_f1_score, _ = precision_recall_fscore_support(y_test, y_pred,
                                                                                        average='binary')
        accuracy += temp_accuracy
        precision += temp_precision
        recall += temp_recall
        f1score += temp_f1_score

        print("Accuracy: ", temp_accuracy)
        print("Precision: ", temp_precision)
        print("Recall: ", temp_recall)
        print("F1 score: ", temp_f1_score)

    # Create the ROC-AUC curve
    # compute_ROC_Curve(tprs, mean_fpr, aucs)

    # Print the average of each metric over the 10 folds
    print("Average Precision: ", precision / 10)
    print("Average Accuracy: ", accuracy / 10)
    print("Average Recall: ", recall / 10)
    print("Average F1-score: ", f1score / 10)

    # Print the final average ROC-AUC score
    print('Average ROC:', av_roc / 10)
    with open(print_file, "a") as myfile:
        myfile.write("Average Precision: " + str(precision / 10) + '\n' +
                     "Average Accuracy: " + str(accuracy / 10) + '\n' +
                     "Average Recall: " + str(recall / 10) + '\n' +
                     "Average F1-score: " + str(f1score / 10) + '\n' +
                     'Average ROC:' + str(av_roc / 10) + '\n')
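# -----------------------------------------------------------------------------
# The commented-out grid-search blocks above assume a model-builder function
# named create_model, which is not shown in this excerpt. Below is a minimal
# sketch of what such a builder could look like for KerasClassifier; the
# parameter defaults mirror the hand-tuned network above, and input_dim=500 is
# a placeholder (in practice, pass the fold's x_train.shape[1]).
def create_model(input_dim=500, neurons=20, init_mode='glorot_uniform', activation='softsign',
                 dropout_rate=0.2, weight_constraint=2, learn_rate=0.001):
    from keras.models import Sequential
    from keras.layers import Dense, Dropout
    from keras.optimizers import RMSprop
    from keras.constraints import maxnorm

    model = Sequential()
    model.add(Dense(neurons, kernel_initializer=init_mode, activation=activation,
                    input_dim=input_dim, kernel_constraint=maxnorm(weight_constraint)))
    model.add(Dropout(dropout_rate))
    model.add(Dense(1, kernel_initializer=init_mode, activation='sigmoid'))
    model.compile(optimizer=RMSprop(lr=learn_rate), loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model
# KerasClassifier forwards matching keyword arguments to the builder, e.g.:
# model = KerasClassifier(build_fn=create_model, input_dim=x_train.shape[1], verbose=0)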