print("Load training first") else: path = command.split(" ") if (len(path) < 2): print("Please enter filepath") else: path = path[1] path = Path('.').joinpath(path) text = "" try: text = path.open('r', encoding='utf-8').read() except OSError as e: print("File doesn't exist/Invalid path") print(text) text = pp.clean_text(text) pos, neg = classifier.test(text) print("CLASS: ", end='') if pos == classifier.pos_prior or neg == classifier.neg_prior: print("SOMETHING WENT WRONG") elif pos >= neg: print("POSITIVE") else: print("NEGATIVE") #Display stats clause elif command.startswith('d'): scores = pp.load_stats() pp.print_stats(scores) #Refresh menu clause elif command.startswith('m'): print_menu()
# Benchmark the Bayes classifier on the leaf dataset: for every generated
# dataset variant, sweep the training-set percentage and plot the resulting
# accuracy curve to 'plot'.
import pandas
from matplotlib import pyplot
from bayes import Bayes
from utils import generate_datasets

# Training-set sizes to evaluate, in percent.
TRAIN_PERCENTS = range(60, 91, 5)

data = pandas.read_csv('./datasets/leaf.csv')
labels = data["species"]
# Strip the label column so only features remain in `data`.
data.drop(data.columns[-1], axis=1, inplace=True)
print(data.index)

for dataset in generate_datasets(data, labels):
    print('\n' + dataset.name)
    for training_percent in TRAIN_PERCENTS:
        classifier = Bayes(dataset.data, labels, training_percent)
        classifier.train()
        classifier.test()
        accuracy = classifier.get_accuracy()
        dataset.result.append(accuracy)
        print('Training percent: ' + str(training_percent) +
              '%, accuracy: ' + str(accuracy))
    # One accuracy-vs-percent line per dataset variant.
    pyplot.plot(TRAIN_PERCENTS, dataset.result, label=dataset.name)

pyplot.xlabel('Training percent')
pyplot.ylabel('Accuracy')
pyplot.legend()
pyplot.savefig('plot', dpi=200, bbox_inches='tight')
# Split the standardized feature matrix into an 80/20 train/test split.
# NOTE(review): labels are taken from the last column of `data`;
# `.values` replaces DataFrame.as_matrix(), which was removed in pandas 1.0.
train_data, test_data, train_labels, test_labels = train_test_split(
    data_standard, data.values[:, -1],
    test_size=0.2, random_state=int(time.time()))

# --- Naive-Bayes recognition ---
print("---------------------------贝叶斯----------------------------------")
# time.perf_counter() replaces time.clock(), removed in Python 3.8.
start = time.perf_counter()
by = Bayes()
by.train(list(train_data), list(train_labels))
test_data_size = test_data.shape[0]
error_count = 0
for index, td in enumerate(test_data):
    this_label = by.test(td)
    # "predicted class: {0}, true class: {1}"
    print("预测类别:{0},真实类别:{1}".format(this_label, test_labels[index]))
    if this_label != test_labels[index]:
        error_count += 1
end = time.perf_counter()
error_rate = (error_count / test_data_size) * 100
time_consuming = end - start
print("错误率为:{0:.2f}%".format(error_rate))   # error rate
print("耗时:{0:.4f}s".format(time_consuming))   # elapsed time

# --- k-nearest-neighbour recognition (loop body continues past this chunk) ---
print("---------------------------knn----------------------------------")
start = time.perf_counter()
knn = Knn()
test_data_size = test_data.shape[0]
def main(): #Making list of .txt-files (per sentiment) print("\tLOADING FILES") path = Path('..').joinpath('Data') test_ = path.joinpath('test') train = path.joinpath('train') tp_reviews = txtToList(test_.joinpath('pos')) tn_reviews = txtToList(test_.joinpath("neg")) pos_reviews = txtToList(train.joinpath("pos")) neg_reviews = txtToList(train.joinpath("neg")) print("\tFILES LOADED") #Cleaning reviews reviews = [pos_reviews, neg_reviews, tp_reviews, tn_reviews] print("\tCLEANING REVIEWS") for list_ in reviews: for i, review in enumerate(list_): list_[i] = clean_text(review) #Joining the reviews into one string (per sentiment) pos_string = "".join([string for string in pos_reviews]) neg_string = "".join([string for string in neg_reviews]) #Counting the frequency of words (per sentiment and total) posCounter = Counter(pos_string.split()) negCounter = Counter(neg_string.split()) vocabCounter = Counter(pos_string.split() + neg_string.split()) for term in list(posCounter): if (posCounter[term] == 1): del posCounter[term] for term in list(negCounter): if (negCounter[term] == 1): del negCounter[term] classifier = Bayes(vocab_counts=vocabCounter) classifier.train(posCounter, negCounter) testSets = [tp_reviews, tn_reviews] n_pos_tp, n_neg_tp = 0, 0 n_pos_tn, n_neg_tn = 0, 0 for i, testSet in enumerate(testSets): print("_" * 15 + "RESULTS" + "_" * 15) n_pos, n_neg = 0, 0 for review in testSet: pos, neg = classifier.test(review) if (pos >= neg): n_pos += 1 else: n_neg += 1 if (i == 0): print("Positive Testset: ") n_pos_tp, n_neg_tp = n_pos, n_neg else: print("Negative Testset: ") n_pos_tn, n_neg_tn = n_pos, n_neg print("Positive reviews: {}".format(n_pos)) print("Negative reviews: {}".format(n_neg)) pos_prec = n_pos_tp / (n_pos_tp + len(tn_reviews) - n_neg_tn) pos_rec = n_pos_tp / len(tp_reviews) pos_f1 = 2 * ((pos_prec * pos_rec) / (pos_prec + pos_rec)) neg_prec = n_neg_tn / (n_neg_tn + len(tp_reviews) - n_pos_tp) neg_rec = n_neg_tn / len(tn_reviews) neg_f1 = 2 * ((neg_prec * 
neg_rec) / (neg_prec + neg_rec)) scores = [pos_prec, pos_rec, pos_f1, neg_prec, neg_rec, neg_f1] save_stats(scores) print_stats(scores) return classifier