def evaluate_classifier(predicted_output, y_test):
    """
    Score a set of predictions against the true labels.

    :param predicted_output: The labels predicted by the classifier
    :param y_test: The true labels; must expose an ``.array`` attribute,
        which is what gets handed to the metric functions
        (presumably a pandas Series -- verify against the caller)
    :return: The tuple (accuracy, precision, recall, f1_score)
    """
    actual_labels = y_test.array
    # Metrics are computed in the same order as they are returned; the
    # F1 score is derived from the recall and precision computed here.
    acc = evaluation_metrics.accuracy(actual_labels, predicted_output)
    prec = evaluation_metrics.precision(actual_labels, predicted_output)
    rec = evaluation_metrics.recall(actual_labels, predicted_output)
    f1 = evaluation_metrics.f1_score(rec, prec)
    return acc, prec, rec, f1
def evaluate_MCAP_bernoulli_model(dataset_name):
    """
    Evaluate MCAP logistic regression on the Bernoulli representation of
    a particular dataset.

    The training split is loaded, divided into training and validation
    parts to pick the lambda value, merged again to learn the final
    weights, and the resulting model is scored on the test split.

    :param dataset_name: This is the given dataset name
    :return: The tuple (accuracy, precision, recall, f1_score,
        lambda_parameter), where lambda_parameter is the value chosen by
        MCAP_logistic_regression.mcap_validation
    :raises SystemExit: with code -1 when the training data cannot be loaded
    """
    # We first import the training data (second argument True). Only the
    # per-class documents and the dictionary are needed by this method.
    try:
        (spam_train_documents, ham_train_documents, _, _, _, _, _,
         total_file_dictionary) = bernoulli_model.convert_to_bernoulli_model(
            dataset_name, True)
    except Exception:
        # `except Exception` (not a bare except) so that Ctrl-C and
        # SystemExit are never mistaken for a wrong dataset name.
        print("You have given wrong file name, please check and run again")
        raise SystemExit(-1)

    # Firstly we divide the training data into training and validation data
    train_data, validation_data = (
        MCAP_logistic_regression.divide_into_validation_and_train(
            spam_train_documents, ham_train_documents))

    # Now we find the lambda value (a grid search, per the original
    # author's note)
    lambda_parameter = MCAP_logistic_regression.mcap_validation(
        train_data, validation_data, total_file_dictionary)
    alpha_value = 0.01  # presumably the learning rate -- TODO confirm

    # Merge the training and validation data again so the final weights
    # are learnt from every available training document
    train_data = train_data + validation_data

    # In this step the algorithm learns the weights
    # (500 is presumably the iteration count -- TODO confirm)
    weights = MCAP_logistic_regression.mcap_logistic_regression_train(
        train_data, total_file_dictionary, alpha_value, lambda_parameter, 500)

    # We now import the data for testing (second argument False)
    (spam_test_documents, ham_test_documents,
     _, _, _, _, _, _) = bernoulli_model.convert_to_bernoulli_model(
        dataset_name, False)

    # Predict every spam document first and then every ham document
    classify = MCAP_logistic_regression.mcap_logistic_regression_test
    spam_predict = [classify(each_document, weights)
                    for each_document in spam_test_documents]
    ham_predict = [classify(each_document, weights)
                   for each_document in ham_test_documents]

    # We are taking spam as 1 and ham as 0
    total_actual = [1] * len(spam_predict) + [0] * len(ham_predict)
    total_predict = spam_predict + ham_predict

    # Now we find the evaluation metrics for the method
    accuracy = evaluation_metrics.accuracy(total_actual, total_predict)
    precision = evaluation_metrics.precision(total_actual, total_predict)
    recall = evaluation_metrics.recall(total_actual, total_predict)
    f1_score = evaluation_metrics.f1_score(recall, precision)
    return accuracy, precision, recall, f1_score, lambda_parameter


# Example calls:
# evaluate_MCAP_bag_of_words(dataset_name)  # for bag of words
# evaluate_MCAP_bernoulli_model(dataset_name)  # for bernoulli_model
def evaluate_SGD_bernoulli_model(dataset_name):
    """
    Evaluate the SGD classifier on the Bernoulli representation of a
    particular dataset.

    The training split is divided into training and validation parts;
    the validation part is used for parameter tuning, the training part
    for fitting, and the test split for scoring.

    :param dataset_name: This is the given dataset name
    :return: The tuple (accuracy, precision, recall, f1_score)
    :raises SystemExit: with code -1 when either split cannot be loaded
    """
    # We import the training split (second argument True) and the test
    # split (second argument False). Only the per-class documents are
    # needed by this method.
    try:
        (spam_train_documents, ham_train_documents,
         _, _, _, _, _, _) = bernoulli_model.convert_to_bernoulli_model(
            dataset_name, True)
        (spam_test_documents, ham_test_documents,
         _, _, _, _, _, _) = bernoulli_model.convert_to_bernoulli_model(
            dataset_name, False)
    except Exception:
        # `except Exception` (not a bare except) so that Ctrl-C and
        # SystemExit are never mistaken for a wrong dataset name.
        print("You have given wrong file name, please check and run again")
        raise SystemExit(-1)

    train_data, validation_data = (
        SGDClassifier.divide_into_validation_and_train(
            spam_train_documents, ham_train_documents))
    test_data = SGDClassifier.get_data_from_given_model(
        spam_test_documents, ham_test_documents)

    # The first training record fixes the word list that is shared by all
    # three conversions below.
    # NOTE(review): assumes iterating a record yields its words -- confirm.
    words_list = list(train_data[0])

    # We build the train, test and validation datasets
    train_x, train_y = SGDClassifier.convert_data_for_SGD_classifier(
        train_data, words_list)
    test_x, test_y = SGDClassifier.convert_data_for_SGD_classifier(
        test_data, words_list)
    valid_x, valid_y = SGDClassifier.convert_data_for_SGD_classifier(
        validation_data, words_list)

    # In this step we get the best parameters for the SGD classifier,
    # tuned on the validation data
    classifier_model = SGDClassifier.parameter_tuning(valid_x, valid_y)

    # In this step the classifier model is trained on the training dataset
    trained_classifier_model = SGDClassifier.train_SGD(
        train_x, train_y, classifier_model)

    # In this step we find the output of the classifier on the test data
    predicted_y, actual_y = SGDClassifier.test_SGD(
        trained_classifier_model, test_x, test_y)

    # Now calculate the evaluation metrics
    accuracy = evaluation_metrics.accuracy(actual_y, predicted_y)
    precision = evaluation_metrics.precision(actual_y, predicted_y)
    recall = evaluation_metrics.recall(actual_y, predicted_y)
    f1_score = evaluation_metrics.f1_score(recall, precision)
    return accuracy, precision, recall, f1_score


# Example calls:
# evaluate_SGD_bag_of_words(dataset_name)  # for bow
# evaluate_SGD_bernoulli_model(dataset_name)  # for bm
def evaluate_multinomial_NB(dataset_name):
    """
    Evaluate multinomial Naive Bayes on the bag-of-words representation
    of a particular dataset.

    :param dataset_name: This is the given dataset name
    :return: The tuple (accuracy, precision, recall, f1_score) for the
        given dataset
    :raises SystemExit: with code -1 when the training data cannot be loaded
    """
    # We first import the training data (second argument True)
    try:
        (spam_email_bag_of_words, ham_email_bag_of_words,
         text_in_all_document, spam_mail_in_all_documents,
         ham_mail_in_all_documents, size_of_total_dataset,
         size_of_spam_dataset, size_of_ham_dataset,
         total_file_dictionary) = bag_of_words.convert_to_bag_of_words(
            dataset_name, True)
    except Exception:
        # `except Exception` (not a bare except) so that Ctrl-C and
        # SystemExit are never mistaken for a wrong dataset name.
        print("You have given wrong file name, please check and run again")
        raise SystemExit(-1)

    # Everything loaded for the training split is handed to the trainer
    (prior, conditional_probability,
     conditional_probability_of_non_occurring_word) = (
        multi_nomial_naive_bayes.train_multinomial_NB(
            spam_email_bag_of_words, ham_email_bag_of_words,
            text_in_all_document, spam_mail_in_all_documents,
            ham_mail_in_all_documents, size_of_total_dataset,
            size_of_spam_dataset, size_of_ham_dataset,
            total_file_dictionary))

    # We now import the data for testing (second argument False). It is
    # bound to its own names so the training values are not overwritten;
    # only the per-class documents are needed for prediction.
    (spam_test_documents, ham_test_documents,
     _, _, _, _, _, _, _) = bag_of_words.convert_to_bag_of_words(
        dataset_name, False)

    def classify(each_document):
        # Predict one document with the parameters learnt above
        return multi_nomial_naive_bayes.test_multinomial_naive_bayes(
            prior, conditional_probability,
            conditional_probability_of_non_occurring_word, each_document)

    # Here we first predict for the spam class and then the ham class
    spam_predict = [classify(each_document)
                    for each_document in spam_test_documents]
    ham_predict = [classify(each_document)
                   for each_document in ham_test_documents]

    # We are taking spam as 1 and ham as 0
    total_actual = [1] * len(spam_predict) + [0] * len(ham_predict)
    total_predict = spam_predict + ham_predict

    # Now we find the evaluation metrics for the method
    accuracy = evaluation_metrics.accuracy(total_actual, total_predict)
    precision = evaluation_metrics.precision(total_actual, total_predict)
    recall = evaluation_metrics.recall(total_actual, total_predict)
    f1_score = evaluation_metrics.f1_score(recall, precision)
    return accuracy, precision, recall, f1_score