def svm_func(train_A, words_of_tweets, extra_features, feature_selection, encoding, print_file):
    reading = Twitter_Depression_Detection.Reader()  # Instantiate the Reader from Twitter_Depression_Detection.py to build the feature encodings
    print(words_of_tweets)  # Debug: inspect the tokenized tweets
    x = np.array(words_of_tweets)
    y = train_A['label']

    # Initialize the running ROC-AUC score average
    # Initialize a count to print the number of folds
    # Initialize metrics to print their average
    av_roc = 0.
    count = 0
    precision = 0
    accuracy = 0
    recall = 0
    f1score = 0
    # Below 3 variables are used for ROC-AUC curve
    tprs = []
    aucs = []
    mean_fpr = np.linspace(0, 1, 100)

    # Initialize the 10-fold cross-validation
    # Set shuffle=True to randomize the splits of the training data
    kf = KFold(n_splits=10, random_state=41, shuffle=True)

    # Loop over the folds produced by the cross-validation split
    for train_index, test_index in kf.split(x):
        count += 1
        print('Fold #: ', count)


        with open(print_file, "a") as myfile: # Write above print into output file
            myfile.write('Fold #: ' + str(count) + '\n')

        # This indexes the train and test data for this cross-validation fold; the row order is randomized because shuffle=True
        x_train, x_test = reading.get_enc(x[train_index], 1, y[train_index], train_index, extra_features, feature_selection, encoding, print_file), reading.get_enc(x[test_index], 0, y[test_index], test_index, extra_features, feature_selection, encoding, print_file)
        y_train, y_test = y[train_index], y[test_index]

        # x_train / y_train are the training-fold predictors and targets; x_test holds the test-fold predictors
        # Create the Gaussian Naive Bayes classification object
        model = naive_bayes.GaussianNB()
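        # Despite the function's name, the base classifier here is Gaussian Naive Bayes; it assumes
        # continuous features with a per-class Gaussian likelihood and has no C/gamma-style
        # hyperparameters to tune. The fitted model appears to be reused as the base model of the
        # stacked SVM in svm_func2 (an assumption based on how model2.predict_proba is used there).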


#######################################################################################################################
        # Feature Scaling
        minMaxScaler = MinMaxScaler(feature_range=(0, 1))
        # Get points and discard classification labels
        #x_train = minMaxScaler.fit_transform(x_train)
        #x_test = minMaxScaler.transform(x_test)
#######################################################################################################################
        oversample = SMOTE(sampling_strategy='minority', k_neighbors=10, random_state=0)
        model.fit(x_train, y_train)
        # Return the model fitted on this first fold; the oversampling, prediction and
        # evaluation code below this return statement is never reached.
        return model
#######################################################################################################################
        # Visualization of normal and oversampled data

        '''visualize_data(x_train, y_train, "Normal Dataset")'''

        # 'minority': resample only the minority class;
        x_train, y_train = oversample.fit_resample(x_train, y_train)
        '''visualize_data(x_train, y_train, "Oversampled Dataset")'''

#######################################################################################################################

        model.score(x_train, y_train)
        # Predict Output
        y_pred = model.predict(x_test)
        #return model
#######################################################################################################################

        # The model is fitted; predict on the test fold and evaluate
        print("Evaluating model...")

        with open(print_file, "a") as myfile: # Write above print into output file
            myfile.write("Evaluating model..." + '\n')

        #roc = roc_auc_score(y_test, y_pred)

        # Print your ROC-AUC score for your kfold, and the running score average
        #print('ROC: ', roc)
        #av_roc += roc
        #print('Continued Avg: ', av_roc / count)

        #with open(print_file, "a") as myfile: # Write above print into output file
            #myfile.write('ROC: ' + str(roc) + '\n' + 'Continued Avg: ' + str(av_roc / count) + '\n')

        #y_pred = (y_pred > 0.5)

# -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
        '''
        # Compute ROC curve and area the curve

        fpr, tpr, thresholds = roc_curve(y_test, y_pred)
        tprs.append(interp(mean_fpr, fpr, tpr))
        tprs[-1][0] = 0.0
        roc_auc = auc(fpr, tpr)
        aucs.append(roc_auc)
        plt.plot(fpr, tpr, lw=1, alpha=0.3, label='ROC fold %d (AUC = %0.2f)' % (count - 1, roc_auc))
        '''
# -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

        '''
        # Creating the Confusion Matrix
        cm = confusion_matrix(y_test, y_pred)
        print(cm)


        with open(print_file, "a") as myfile: # Write above print into output file
            myfile.write(str(cm) + '\n')
        '''
        print(y_pred)
        temp_accuracy = accuracy_score(y_test, y_pred)
        temp_precision, temp_recall, temp_f1_score, _ = precision_recall_fscore_support(y_test, y_pred,
                                                                                        average='macro')

        accuracy += temp_accuracy
        precision += temp_precision
        recall += temp_recall
        f1score += temp_f1_score

        print("Accuracy: ", temp_accuracy)
        print("Precision: ", temp_precision)
        print("Recall: ", temp_recall)
        print("F1 score: ", temp_f1_score)

        print(metrics.classification_report(y_test,y_pred))


    # Create ROC-AUC curve
#    compute_ROC_Curve(tprs, mean_fpr, aucs)
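    # The helper compute_ROC_Curve is not defined in this section. A minimal, hypothetical
    # sketch of such a plot (a mean ROC curve built from the per-fold tprs/aucs that the
    # commented-out block above would collect) is quoted below; the names and styling are
    # assumptions, not the original implementation.
    '''
    def compute_ROC_Curve(tprs, mean_fpr, aucs):
        mean_tpr = np.mean(tprs, axis=0)
        mean_tpr[-1] = 1.0
        mean_auc = auc(mean_fpr, mean_tpr)
        plt.plot(mean_fpr, mean_tpr, color='b',
                 label='Mean ROC (AUC = %0.2f)' % mean_auc, lw=2, alpha=0.8)
        plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r', alpha=0.8)  # chance line
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.legend(loc='lower right')
        plt.show()
    '''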


##########################################################################################################################


    # Print average of metrics
    print("Average Precision: ", precision / 10)
    print("Average Accuracy: ", accuracy / 10)
    print("Average Recall: ", recall / 10)
    print("Average F1-score: ", f1score / 10)

    # Print the final average ROC-AUC score
    #print('Average ROC:', av_roc / 10)

    with open(print_file, "a") as myfile:  # Write above print into output file
        myfile.write("Average Precision: " + str(precision / 10) + '\n' + "Average Accuracy: " + str(accuracy / 10) + '\n' + "Average Recall: " + str(recall / 10) + '\n' + "Average F1-score: " + str(f1score / 10) + '\n' + 'Average ROC:' + str(av_roc / 10) + '\n')
def svm_func2(model2, train_A, words_of_tweets, extra_features, feature_selection, encoding, print_file):
    reading = Twitter_Depression_Detection.Reader()  # Instantiate the Reader from Twitter_Depression_Detection.py to build the feature encodings

    x = np.array(words_of_tweets)
    y = train_A['label']

    # Initialize the running ROC-AUC score average
    # Initialize a count to print the number of folds
    # Initialize metrics to print their average
    av_roc = 0.
    count = 0
    precision = 0
    accuracy = 0
    recall = 0
    f1score = 0
    # Below 3 variables are used for ROC-AUC curve
    tprs = []
    aucs = []
    mean_fpr = np.linspace(0, 1, 100)

    # Initialize the 10-fold cross-validation
    # Set shuffle=True to randomize the splits of the training data
    kf = KFold(n_splits=10, random_state=7, shuffle=True)
    print(x.size)  # Debug: size of the tweet array
    # Loop over the folds produced by the cross-validation split
    for train_index, test_index in kf.split(x):
        count += 1
        print('Fold #: ', count)
        print(train_index)
        print(test_index)


        with open(print_file, "a") as myfile: # Write above print into output file
            myfile.write('Fold #: ' + str(count) + '\n')

        # This indexes the train and test data for this cross-validation fold; the row order is randomized because shuffle=True
        x_train, x_test = reading.get_enc(x[train_index], 1, y[train_index], train_index, extra_features, feature_selection, encoding, print_file), reading.get_enc(x[test_index], 0, y[test_index], test_index, extra_features, feature_selection, encoding, print_file)
        y_train, y_test = y[train_index], y[test_index]
        x_train = model2.predict_proba(x_train)
        x_test = model2.predict_proba(x_test)
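        # Stacking step: the per-class probability outputs of the already-fitted base model
        # (model2) replace the raw feature encodings, so the SVM below is trained on these
        # meta-features rather than on the original representation.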
        # x_train / y_train are the training-fold predictors and targets; x_test holds the test-fold predictors
        # Create the SVM classification object
        # For very large C the margin is hard and points cannot lie inside it; for smaller C the margin is softer and can grow to encompass some points
        # gamma: the higher the value of gamma, the more closely the model fits the training set, which raises generalization error and can cause over-fitting
        model = svm.SVC(kernel='rbf', C=100, gamma=0.1)
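        # A minimal, hypothetical sketch (quoted out, not part of the original pipeline) of how
        # C and gamma could be tuned with scikit-learn's GridSearchCV instead of the fixed values
        # above; the parameter grid and scoring choice are illustrative assumptions only.
        '''
        from sklearn.model_selection import GridSearchCV
        param_grid = {'C': [1, 10, 100], 'gamma': [0.01, 0.1, 1]}
        search = GridSearchCV(svm.SVC(kernel='rbf'), param_grid, cv=5, scoring='f1_macro')
        search.fit(x_train, y_train)
        model = search.best_estimator_
        '''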


#######################################################################################################################
        # Feature Scaling
        minMaxScaler = MinMaxScaler(feature_range=(0, 1))
        # Get points and discard classification labels
        x_train = minMaxScaler.fit_transform(x_train)
        x_test = minMaxScaler.transform(x_test)
#######################################################################################################################

        model.fit(x_train, y_train)


        model.score(x_train, y_train)
        # Predict Output
        y_pred = model.predict(x_test)
        #return model
#######################################################################################################################

        # The model is fitted; predict on the test fold and evaluate
        print("Evaluating model...")

        with open(print_file, "a") as myfile: # Write above print into output file
            myfile.write("Evaluating model..." + '\n')

        #roc = roc_auc_score(y_test, y_pred)

        # Print your ROC-AUC score for your kfold, and the running score average
        #print('ROC: ', roc)
        #av_roc += roc
        #print('Continued Avg: ', av_roc / count)

        #with open(print_file, "a") as myfile: # Write above print into output file
            #myfile.write('ROC: ' + str(roc) + '\n' + 'Continued Avg: ' + str(av_roc / count) + '\n')

        #y_pred = (y_pred > 0.5)

# -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
        '''
        # Compute ROC curve and area the curve

        fpr, tpr, thresholds = roc_curve(y_test, y_pred)
        tprs.append(interp(mean_fpr, fpr, tpr))
        tprs[-1][0] = 0.0
        roc_auc = auc(fpr, tpr)
        aucs.append(roc_auc)
        plt.plot(fpr, tpr, lw=1, alpha=0.3, label='ROC fold %d (AUC = %0.2f)' % (count - 1, roc_auc))
        '''
# -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

        '''
        # Creating the Confusion Matrix
        cm = confusion_matrix(y_test, y_pred)
        print(cm)

        with open(print_file, "a") as myfile: # Write above print into output file
            myfile.write(str(cm) + '\n')
        '''
        print(y_pred)
        temp_accuracy = accuracy_score(y_test, y_pred)
        temp_precision, temp_recall, temp_f1_score, _ = precision_recall_fscore_support(y_test, y_pred,
                                                                                        average='macro')

        accuracy += temp_accuracy
        precision += temp_precision
        recall += temp_recall
        f1score += temp_f1_score

        print("Accuracy: ", temp_accuracy)
        print("Precision: ", temp_precision)
        print("Recall: ", temp_recall)
        print("F1 score: ", temp_f1_score)


# =============================================================================

    # Plot HEATMAP

# =============================================================================

        '''plt.title('SVM - Confusion Matrix '

                  '\n[Accuracy = %0.2f, Recall = %0.2f, Precision = %0.2f, F1-Score = %0.2f] '
                  '\nTrue Positive = %d, False Positive = %d '
                  '\nFalse Negative = %d, True Negative = %d]' % (
            temp_accuracy * 100, temp_recall * 100, temp_precision * 100, temp_f1_score * 100, cm[0][0], cm[0][1], cm[1][0], cm[1][1]))

        sns.heatmap(cm, cmap='Oranges',  # Color of heatmap
                    annot=True, fmt="d",
                    # Enables values inside the heatmap boxes and sets that are integer values with fmt="d"
                    cbar=False,  # Delete the heat bar (shows the numbers corresponding to colors)
                    xticklabels=["depression", "no depression"], yticklabels=["depression", "no depression"]
                    # Name the x and y value labels
                    ).tick_params(left=False, bottom=False)  # Used to delete dash from name values of axis x and y

        # Fix a bug where heatmap top and bottom boxes are cut off
        b, t = plt.ylim()  # discover the values for bottom and top
        b += 0.5  # Add 0.5 to the bottom
        t -= 0.5  # Subtract 0.5 from the top
        plt.ylim(b, t)  # update the ylim(bottom, top) values

        plt.xlabel('Predicted output')
        plt.ylabel('True output')
        plt.show()
        '''
# =============================================================================


    # Create ROC-AUC curve
#    compute_ROC_Curve(tprs, mean_fpr, aucs)


##########################################################################################################################


    # Print average of metrics
    print("Average Precision: ", precision / 10)
    print("Average Accuracy: ", accuracy / 10)
    print("Average Recall: ", recall / 10)
    print("Average F1-score: ", f1score / 10)

    # Print the final average ROC-AUC score
    #print('Average ROC:', av_roc / 10)

    with open(print_file, "a") as myfile:  # Write above print into output file
        myfile.write("Average Precision: " + str(precision / 10) + '\n' + "Average Accuracy: " + str(accuracy / 10) + '\n' + "Average Recall: " + str(recall / 10) + '\n' + "Average F1-score: " + str(f1score / 10) + '\n' + 'Average ROC:' + str(av_roc / 10) + '\n')
def Bayes(train_A, words_of_tweets, extra_features, feature_selection,
          encoding, print_file):
    reading = Twitter_Depression_Detection.Reader(
    )  # Instantiate the Reader from Twitter_Depression_Detection.py to build the feature encodings

    x = np.array(words_of_tweets)
    y = train_A['label']

    # Initialize the running ROC-AUC score average
    # Initialize a count to print the number of folds
    # Initialize metrics to print their average
    av_roc = 0.
    count = 0
    precision = 0
    accuracy = 0
    recall = 0
    f1score = 0
    # Below 3 variables are used for ROC-AUC curve
    tprs = []
    aucs = []
    mean_fpr = np.linspace(0, 1, 100)

    # Initialize the 10-fold cross-validation
    # Set shuffle=True to randomize the splits of the training data
    kf = KFold(n_splits=10, random_state=41, shuffle=True)

    # Loop over the folds produced by the cross-validation split
    for train_index, test_index in kf.split(x):
        count += 1
        print('Fold #: ', count)

        with open(print_file,
                  "a") as myfile:  # Write above print into output file
            myfile.write('Fold #: ' + str(count) + '\n')

        # This indexes the train and test data for this cross-validation fold; the row order is randomized because shuffle=True
        x_train, x_test = reading.get_enc(x[train_index], 1, y[train_index],
                                          train_index, extra_features,
                                          feature_selection, encoding,
                                          print_file), reading.get_enc(
                                              x[test_index], 0, y[test_index],
                                              test_index, extra_features,
                                              feature_selection, encoding,
                                              print_file)
        y_train, y_test = y[train_index], y[test_index]

        #######################################################################################################################

        model = GaussianNB()

        #######################################################################################################################
        # 'minority': resample only the minority class;
        oversample = SMOTE(sampling_strategy='minority',
                           k_neighbors=10,
                           random_state=0)
        x_train, y_train = oversample.fit_resample(x_train, y_train)
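        # A minimal sketch (quoted out, not part of the original flow) of how to verify that
        # SMOTE balanced the classes, using collections.Counter on the labels before and after
        # resampling; the printed wording is an illustrative assumption.
        '''
        from collections import Counter
        print('Before SMOTE:', Counter(y[train_index]))
        print('After SMOTE: ', Counter(y_train))
        '''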

        # Fit Gaussian Naive Bayes according to x, y
        # Make a prediction using the Naive Bayes Model
        model.fit(
            x_train, y_train
        )  # x : array-like, shape (n_samples, n_features)   Training vectors, where n_samples is the number of samples and n_features is the number of features.
        # y : array-like, shape (n_samples,)   Target values.

        y_pred = model.predict(x_test)

        #######################################################################################################################

        # The model is fitted; predict on the test fold and evaluate
        print("Evaluating model...")

        with open(print_file,
                  "a") as myfile:  # Write above print into output file
            myfile.write("Evaluating model..." + '\n')

        roc = roc_auc_score(y_test, y_pred)

        # Print your ROC-AUC score for your kfold, and the running score average
        print('ROC: ', roc)
        av_roc += roc
        print('Continued Avg: ', av_roc / count)

        with open(print_file,
                  "a") as myfile:  # Write above print into output file
            myfile.write('ROC: ' + str(roc) + '\n' + 'Continued Avg: ' +
                         str(av_roc / count) + '\n')

        y_pred = (y_pred > 0.5)  # Note: predict() already returns 0/1 class labels, so this only casts them to booleans
        # -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
        '''
        
        # Compute ROC curve and area the curve

        fpr, tpr, thresholds = roc_curve(y_test, y_pred)
        tprs.append(interp(mean_fpr, fpr, tpr))
        tprs[-1][0] = 0.0
        roc_auc = auc(fpr, tpr)
        aucs.append(roc_auc)
        plt.plot(fpr, tpr, lw=1, alpha=0.3, label='ROC fold %d (AUC = %0.2f)' % (count - 1, roc_auc))
        
        '''
        # -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

        # Creating the Confusion Matrix
        cm = confusion_matrix(y_test, y_pred)
        print(cm)

        with open(print_file,
                  "a") as myfile:  # Write above print into output file
            myfile.write(str(cm) + '\n')

        temp_accuracy = accuracy_score(y_test, y_pred)
        temp_precision, temp_recall, temp_f1_score, _ = precision_recall_fscore_support(
            y_test, y_pred, average='binary')

        accuracy += temp_accuracy
        precision += temp_precision
        recall += temp_recall
        f1score += temp_f1_score

        print("Accuracy: ", temp_accuracy)
        print("Precision: ", temp_precision)
        print("Recall: ", temp_recall)
        print("F1 score: ", temp_f1_score)

    # Create ROC-AUC curve
#    compute_ROC_Curve(tprs, mean_fpr, aucs)

    # Print average of metrics
    print("Average Precision: ", precision / 10)
    print("Average Accuracy: ", accuracy / 10)
    print("Average Recall: ", recall / 10)
    print("Average F1-score: ", f1score / 10)

    # Print the final average ROC-AUC score
    print('Average ROC:', av_roc / 10)

    with open(print_file, "a") as myfile:  # Write above print into output file
        myfile.write("Average Precision: " + str(precision / 10) + '\n' +
                     "Average Accuracy: " + str(accuracy / 10) + '\n' +
                     "Average Recall: " + str(recall / 10) + '\n' +
                     "Average F1-score: " + str(f1score / 10) + '\n' +
                     'Average ROC:' + str(av_roc / 10) + '\n')
def conv1d_class(train_A, words_of_tweets, extra_features, feature_selection,
                 encoding, print_file):
    reading = Twitter_Depression_Detection.Reader(
    )  # Instantiate the Reader from Twitter_Depression_Detection.py to build the feature encodings

    # fix random seed for reproducibility
    np.random.seed(7)

    x = np.array(words_of_tweets)
    y = train_A['label']

    # -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
    # -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

    # Initialize the running ROC-AUC score average
    # Initialize a count to print the number of folds
    # Initialize metrics to print their average
    av_roc = 0.
    count = 0
    precision = 0
    accuracy = 0
    recall = 0
    f1score = 0

    # -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
    # -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

    # Initialize the 10-fold cross-validation
    # Set shuffle=True to randomize the splits of the training data
    kf = KFold(n_splits=10, random_state=41, shuffle=True)

    # Loop over the folds produced by the cross-validation split
    for train_index, test_index in kf.split(x):
        count += 1
        print('Fold #: ', count)

        with open(print_file,
                  "a") as myfile:  # Write above print into output file
            myfile.write('Fold #: ' + str(count) + '\n')

        # This indexes the train and test data for this cross-validation fold; the row order is randomized because shuffle=True
        x_train, x_test = reading.get_enc(x[train_index], 1, y[train_index],
                                          train_index, extra_features,
                                          feature_selection, encoding,
                                          print_file), reading.get_enc(
                                              x[test_index], 0, y[test_index],
                                              test_index, extra_features,
                                              feature_selection, encoding,
                                              print_file)
        y_train, y_test = y[train_index], y[test_index]

        # -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
        # -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

        # Initializing Neural Network
        classifier = Sequential()

        # -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
        # 'minority': resample only the minority class;
        oversample = SMOTE(sampling_strategy='minority',
                           k_neighbors=10,
                           random_state=0)
        x_train, y_train = oversample.fit_resample(x_train, y_train)
        # -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

        print(x_train.shape[0], ' ', x_train.shape[1])
        print(x_test.shape[0], ' ', x_test.shape[1])
        x_train = x_train.reshape(x_train.shape[0], 1, x_train.shape[1])
        x_test = x_test.reshape(x_test.shape[0], 1, x_test.shape[1])
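        # The encoded matrices are reshaped from (samples, features) to (samples, 1, features):
        # Conv1D expects 3-D input of shape (samples, timesteps, features), so each tweet is
        # treated as a single timestep carrying all of its features.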

        classifier.add(
            Dense(20,
                  kernel_initializer='glorot_uniform',
                  activation='softsign',
                  kernel_constraint=maxnorm(2),
                  input_shape=(1, x_train.shape[2])))

        classifier.add(
            Conv1D(filters=32,
                   kernel_size=3,
                   padding='same',
                   activation='relu'))
        classifier.add(Dropout(0.2))

        classifier.add(
            Conv1D(filters=32,
                   kernel_size=3,
                   padding='same',
                   activation='relu'))

        classifier.add(
            Conv1D(filters=32,
                   kernel_size=3,
                   padding='same',
                   activation='relu'))

        classifier.add(
            Conv1D(filters=32,
                   kernel_size=3,
                   padding='same',
                   activation='relu'))

        classifier.add(
            Conv1D(filters=32,
                   kernel_size=3,
                   padding='same',
                   activation='relu'))
        classifier.add(Dropout(0.2))

        classifier.add(GlobalAveragePooling1D())

        classifier.add(
            Dense(500,
                  kernel_initializer='glorot_uniform',
                  activation='softsign',
                  kernel_constraint=maxnorm(2)))

        # Adding the output layer with 1 output
        classifier.add(
            Dense(1, kernel_initializer='glorot_uniform',
                  activation='sigmoid'))

        optimizer = RMSprop(lr=0.001)

        # Compiling Neural Network
        classifier.compile(optimizer=optimizer,
                           loss='binary_crossentropy',
                           metrics=['accuracy'])

        # -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
        # -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

        # The callback must be passed to fit() to take effect, and monitoring 'val_loss'
        # requires validation data, so a 10% validation split is assumed here.
        early_stopping = callbacks.EarlyStopping(monitor='val_loss',
                                                 min_delta=0,
                                                 patience=2,
                                                 verbose=0,
                                                 mode='auto')

        # Fitting our model
        classifier.fit(x_train, y_train,
                       batch_size=20,
                       epochs=50,
                       validation_split=0.1,
                       callbacks=[early_stopping])

        # -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
        # -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

        # The model is fitted; predict on the test fold and evaluate
        print("Evaluating model...")

        with open(print_file,
                  "a") as myfile:  # Write above print into output file
            myfile.write("Evaluating model..." + '\n')

        test_preds = classifier.predict_proba(x_test, verbose=0)
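        # Note: Sequential.predict_proba has been removed in newer Keras/TensorFlow releases;
        # with a sigmoid output layer, classifier.predict(x_test) returns the same probabilities.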

        roc = roc_auc_score(y_test, test_preds)
        scores = classifier.evaluate(x_test, y_test)
        print(scores)

        # Print your model summary
        print(classifier.summary())

        # Print your ROC-AUC score for your kfold, and the running score average
        print('ROC: ', roc)
        av_roc += roc
        print('Continued Avg: ', av_roc / count)

        with open(print_file,
                  "a") as myfile:  # Write above print into output file
            myfile.write('Scores: ' + str(scores) + '\n' +
                         'Classifier summary: ' + str(classifier.summary()) +
                         '\n' + 'ROC: ' + str(roc) + '\n' + 'Continued Avg: ' +
                         str(av_roc / count) + '\n')

        # -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
        # -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

        # Predicting the Test set results
        y_pred = classifier.predict(x_test)
        y_pred = (y_pred > 0.5)

        # Creating the Confusion Matrix
        cm = confusion_matrix(y_test, y_pred)
        print(cm)

        with open(print_file,
                  "a") as myfile:  # Write above print into output file
            myfile.write(str(cm) + '\n')

        temp_accuracy = accuracy_score(y_test, y_pred)
        temp_precision, temp_recall, temp_f1_score, _ = precision_recall_fscore_support(
            y_test, y_pred, average='binary')

        accuracy += temp_accuracy
        precision += temp_precision
        recall += temp_recall
        f1score += temp_f1_score

        print("Accuracy: ", temp_accuracy)
        print("Precision: ", temp_precision)
        print("Recall: ", temp_recall)
        print("F1 score: ", temp_f1_score)

    # Print average of metrics
    print("Average Precision: ", precision / 10)
    print("Average Accuracy: ", accuracy / 10)
    print("Average Recall: ", recall / 10)
    print("Average F1-score: ", f1score / 10)

    # Print the final average ROC-AUC score
    print('Average ROC:', av_roc / 10)

    with open(print_file, "a") as myfile:  # Write above print into output file
        myfile.write("Average Precision: " + str(precision / 10) + '\n' +
                     "Average Accuracy: " + str(accuracy / 10) + '\n' +
                     "Average Recall: " + str(recall / 10) + '\n' +
                     "Average F1-score: " + str(f1score / 10) + '\n' +
                     'Average ROC:' + str(av_roc / 10) + '\n')
def K_Neighbors(train_A, words_of_tweets, extra_features, feature_selection,
                encoding, print_file):
    reading = Twitter_Depression_Detection.Reader(
    )  # Instantiate the Reader from Twitter_Depression_Detection.py to build the feature encodings

    x = np.array(words_of_tweets)
    y = train_A['label']

    # Initialize the running ROC-AUC score average
    # Initialize a count to print the number of folds
    # Initialize metrics to print their average
    av_roc = 0.
    count = 0
    precision = 0
    accuracy = 0
    recall = 0
    f1score = 0
    # Below 3 variables are used for ROC-AUC curve
    tprs = []
    aucs = []
    mean_fpr = np.linspace(0, 1, 100)

    # Initialize the 10-fold cross-validation
    # Set shuffle=True to randomize the splits of the training data
    kf = KFold(n_splits=10, random_state=41, shuffle=True)

    # Loop over the folds produced by the cross-validation split
    for train_index, test_index in kf.split(x):
        count += 1
        print('Fold #: ', count)

        with open(print_file,
                  "a") as myfile:  # Write above print into output file
            myfile.write('Fold #: ' + str(count) + '\n')

        # This indexes the train and test data for this cross-validation fold; the row order is randomized because shuffle=True
        x_train, x_test = reading.get_enc(x[train_index], 1, y[train_index],
                                          train_index, extra_features,
                                          feature_selection, encoding,
                                          print_file), reading.get_enc(
                                              x[test_index], 0, y[test_index],
                                              test_index, extra_features,
                                              feature_selection, encoding,
                                              print_file)
        y_train, y_test = y[train_index], y[test_index]

        #######################################################################################################################

        # leaf_size : int, optional (default = 30)  Leaf size passed to BallTree or KDTree; affects the speed and memory of tree queries.

        # p : integer, optional (default = 2)
        # When p = 1, this is equivalent to using manhattan_distance (l1),
        # and euclidean_distance (l2) for p = 2.
        # For arbitrary p, minkowski_distance (l_p) is used.

        # algorithm : {‘auto’, ‘ball_tree’, ‘kd_tree’, ‘brute’}, optional Algorithm used to compute the nearest neighbors:
        # ‘ball_tree’ will use BallTree
        # ‘kd_tree’ will use KDTree
        # ‘brute’ will use a brute-force search.
        # ‘auto’ will attempt to decide the most appropriate algorithm based on the values passed to fit method.

        # weights : str or callable, optional (default = ‘uniform’) weight function used in prediction. Possible values:
        # ‘uniform’ : uniform weights. All points in each neighborhood are weighted equally.
        # ‘distance’ : weight points by the inverse of their distance. in this case, closer neighbors of a query point will have a greater influence than neighbors which are further away.

        scaler = Normalizer()
        x_train = scaler.fit_transform(x_train)
        x_test = scaler.transform(x_test)

        classifier = KNeighborsClassifier(n_neighbors=40)
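        # A quoted-out sketch showing how the parameters described in the comments above could be
        # set explicitly instead of relying on the defaults; the specific values are assumptions
        # for illustration only, not the original configuration.
        '''
        classifier = KNeighborsClassifier(n_neighbors=40,
                                          weights='distance',
                                          algorithm='auto',
                                          leaf_size=30,
                                          p=2)
        '''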

        # 'minority': resample only the minority class;
        oversample = SMOTE(sampling_strategy='minority',
                           k_neighbors=10,
                           random_state=0)
        x_train, y_train = oversample.fit_resample(x_train, y_train)

        classifier.fit(x_train, y_train)
        y_pred = classifier.predict(x_test)

        #######################################################################################################################

        # The model is fitted; predict on the test fold and evaluate
        print("Evaluating model...")

        with open(print_file,
                  "a") as myfile:  # Write above print into output file
            myfile.write("Evaluating model..." + '\n')

        roc = roc_auc_score(y_test, y_pred)

        # Print your ROC-AUC score for your kfold, and the running score average
        print('ROC: ', roc)
        av_roc += roc
        print('Continued Avg: ', av_roc / count)

        with open(print_file,
                  "a") as myfile:  # Write above print into output file
            myfile.write('ROC: ' + str(roc) + '\n' + 'Continued Avg: ' +
                         str(av_roc / count) + '\n')

# -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
        '''
        # Compute ROC curve and area the curve

        fpr, tpr, thresholds = roc_curve(y_test, y_pred)
        tprs.append(interp(mean_fpr, fpr, tpr))
        tprs[-1][0] = 0.0
        roc_auc = auc(fpr, tpr)
        aucs.append(roc_auc)
        plt.plot(fpr, tpr, lw=1, alpha=0.3, label='ROC fold %d (AUC = %0.2f)' % (count - 1, roc_auc))
        '''
        # -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

        y_pred = (y_pred > 0.5)  # Note: predict() already returns 0/1 class labels, so this only casts them to booleans

        # Creating the Confusion Matrix
        cm = confusion_matrix(y_test, y_pred)
        print(cm)

        with open(print_file,
                  "a") as myfile:  # Write above print into output file
            myfile.write(str(cm) + '\n')

        report = classification_report(y_test, y_pred)
        print(report)

        temp_accuracy = accuracy_score(y_test, y_pred)
        temp_precision, temp_recall, temp_f1_score, _ = precision_recall_fscore_support(
            y_test, y_pred, average='binary')

        accuracy += temp_accuracy
        precision += temp_precision
        recall += temp_recall
        f1score += temp_f1_score

        print("Accuracy: ", temp_accuracy)
        print("Precision: ", temp_precision)
        print("Recall: ", temp_recall)
        print("F1 score: ", temp_f1_score)

    # Create ROC-AUC curve
#    compute_ROC_Curve(tprs, mean_fpr, aucs)

    # Print average of metrics
    print("Average Precision: ", precision / 10)
    print("Average Accuracy: ", accuracy / 10)
    print("Average Recall: ", recall / 10)
    print("Average F1-score: ", f1score / 10)

    # Print the final average ROC-AUC score
    print('Average ROC:', av_roc / 10)

    with open(print_file, "a") as myfile:  # Write above print into output file
        myfile.write("Average Precision: " + str(precision / 10) + '\n' +
                     "Average Accuracy: " + str(accuracy / 10) + '\n' +
                     "Average Recall: " + str(recall / 10) + '\n' +
                     "Average F1-score: " + str(f1score / 10) + '\n' +
                     'Average ROC:' + str(av_roc / 10) + '\n')