def binary_naive_bayes():
    """Train a binary-feature Naive Bayes model on a 70/30 split and print
    accuracy statistics for the held-out 30% test portion."""
    model = nb.NaiveBayesModel()
    cleaner = cn.DataCLean()
    vectorizer = dv.DocumentVector()

    # Clean the corpus once, then carve off the first 30% of rows as the test set.
    df_clean, uniqueWords = cleaner.Clean()
    test_size = int(.3 * (df_clean['class'].count()))
    df_clean_test, df_clean_train = split(df_clean, 0, test_size)

    # Binary (presence/absence) document vectors for the training rows.
    docVector = vectorizer.binary_docvector(df_clean_train, uniqueWords)

    (df_WordGivenPI, df_WordGivenNoPi, Prob_PI, Prob_NoPI,
     numWordsInPI, numWordsInNoPI) = model.TrainModel(docVector, uniqueWords)

    predict_df, test_data = model.predict(Prob_PI, Prob_NoPI, uniqueWords,
                                          df_WordGivenPI, df_WordGivenNoPi,
                                          numWordsInPI, numWordsInNoPI,
                                          df_clean_test, cleaner)

    print(
        "--------------Binary Naive Bayes Accuracy Stats---------------------------"
    )
    stats = em.Evaluate()
    TP, FN, TN, FP = stats.confusion_matrix(test_data, predict_df)
    print("Accuracy = ", stats.Accuracy(TP, TN, FP, FN))
    print("Precision = ", stats.Precision(TP, FP))
    print("Recall = ", stats.Recall(TP, FN))
    print("fScore = ", stats.fScore(TP, FN, FP))
    print("True Negative = ", stats.TrueNegative(TN, FP))
    print(
        "---------------------------------------------------------------------"
    )
def binary_naive_bayes_kfold():
    """Evaluate the binary Naive Bayes model with 5-fold validation.

    Each fold holds out a consecutive 200-row slice as the test set (rows
    0-200, 200-400, ..., 800-1000), trains on the remainder, and records
    accuracy, precision, recall, f-score and true-negative rate; the
    per-fold lists and their averages are printed at the end.
    """
    model = nb.NaiveBayesModel()
    clean = cn.DataCLean()
    doc_vector = dv.DocumentVector()
    stats = em.Evaluate()

    final_df, df = clean.extract(pathData)

    fold_size = 200
    accuracy = []
    precision = []
    recall = []
    fscore = []
    true_neg = []

    for fold in range(5):
        # Compute the fold window directly instead of mutating running
        # counters; bounds are identical to the original (0-200, 200-400, ...).
        start = fold * fold_size
        end = start + fold_size
        df_test, df_train = split(final_df, start, end)

        # Re-clean and re-vectorize per fold since the vocabulary depends
        # on the training rows of that fold.
        li_clean_text, df_clean = clean.clean_data(df_train)
        uniqueWords = clean.make_unique_li(li_clean_text)
        docVector = doc_vector.binary_docvector(df_clean, uniqueWords)

        (df_WordGivenPI, df_WordGivenNoPi, Prob_PI, Prob_NoPI,
         numWordsInPI, numWordsInNoPI) = model.TrainModel(docVector, uniqueWords)

        predict_df, punc_df = model.predict(Prob_PI, Prob_NoPI, uniqueWords,
                                            df_WordGivenPI, df_WordGivenNoPi,
                                            numWordsInPI, numWordsInNoPI,
                                            df_test, clean)

        TP, FN, TN, FP = stats.confusion_matrix(punc_df, predict_df)
        accuracy.append(stats.Accuracy(TP, TN, FP, FN))
        precision.append(stats.Precision(TP, FP))
        recall.append(stats.Recall(TP, FN))
        fscore.append(stats.fScore(TP, FN, FP))
        true_neg.append(stats.TrueNegative(TN, FP))

    print(
        "---------------------------------------------------------------------"
    )
    # Typo fixes in output labels: "wit" -> "with", "precison" -> "precision".
    print("Binary Naive Bayes with k-fold Accuracy Stats")
    print("accuracy = ", accuracy)
    print("precision = ", precision)
    print("recall = ", recall)
    print("f-score = ", fscore)
    print("True Negative = ", true_neg)
    print("accuracy = ", Average(accuracy))
    print("precision = ", Average(precision))
    print("recall = ", Average(recall))
    print("f-score = ", Average(fscore))
    print("true negative = ", Average(true_neg))
def generatingTrainSet():
    """Clean the corpus, build count document vectors, and return the
    training matrices as a ``(X_train, Y_train)`` tuple, where the last
    column of the vector table is the label converted to 0/1."""
    cleaner = cl.DataCLean()
    final_df, uniqueWords = cleaner.Clean()

    vectorizer = dv.DocumentVector()
    # Count-based document vectors. Alternative featurizations kept for
    # reference:
    #   vectorizer.tf_idf(final_df, uniqueWords)
    #   vectorizer.binary_docvector(final_df, uniqueWords)
    docVector = vectorizer.DocVector(final_df, uniqueWords)
    # -------------------------------------------------------------------------
    # Optional TextBlob polarity-dictionary features:
    #   import NaiveBayesTextBlob as tb
    #   docVector = tb.text_blob(docVector, uniqueWords)
    # -------------------------------------------------------------------------

    matrix = docVector.values
    # Features are every column but the last; the last column is the label.
    X_train, Y = matrix[:, :-1], matrix[:, -1]
    Y_train = convert_to_0_or_1(Y)
    return (X_train, Y_train)