def main():
    """Train a Naive Bayes spam classifier on the SMS corpus and label the input messages."""
    # Load the labelled SMS corpus (message data + spam/ham target).
    data, target = DataSetFileReader.read_dataset_file('data/SMSSpamCollection')

    model = NaiveBayes()
    model.fit(data, target)

    # Classify the unlabelled messages and echo each prediction next to its text.
    messages = DataSetFileReader.read_input_data_file('data/inputdata')
    predictions = model.predict(messages)
    for label, message in zip(predictions, messages):
        print('{0} -> {1}'.format(label.upper(), message))
def init():
    """Train and evaluate Logistic Regression, SVM and Naive Bayes classifiers.

    Command-line flags:
        -p / --preprocess : re-extract features from the raw emails before training
        -f / --figure     : plot training/validation curves (enables validation)
    """

    def plot(arg):
        # Label, show, then reset the current matplotlib figure.
        plt.xlabel('Iterations')
        plt.ylabel(arg)
        plt.legend()
        plt.show()
        plt.clf()

    # Arguments Parser Structure
    parser = argparse.ArgumentParser(
        description='Classification models: Logistic Regression, SVM, Naive Bayes Training.')
    parser.add_argument('-p', '--preprocess', action='store_true',
                        help='perform preprocessing of emails')
    parser.add_argument('-f', '--figure', action='store_true',
                        help='plot training figures (performs validation)')

    # Parse Arguments
    parsed = parser.parse_args()
    preprocess = parsed.preprocess
    figure = parsed.figure

    dataHandler = DataHandler()
    if preprocess:
        print('Extracting Features ............. ', end='', flush=True)
        start = time.time()
        dataHandler.saveFeatures()
        print('done -- ' + str(round(time.time()-start, 3)) + 's')

    print('Loading Data .................... ')
    x_train, y_train, x_test, y_test = dataHandler.loadTrainingData()
    # Validation data is a copy of the test set so models can report validation
    # curves without touching the final test evaluation arrays in place.
    x_val = deepcopy(x_test)
    y_val = deepcopy(y_test)
    # NOTE: dead `start = time.time()` assignments before each fit (never read
    # afterwards) were removed.

    # Logistic Regression
    logistic = LogisticRegression(lr=0.2, num_iter=1000, val=figure)
    train_history, val_history = logistic.fit(x_train, y_train, x_val, y_val)
    plt.plot(range(len(train_history)), train_history, label='Training Loss')
    plt.plot(range(len(val_history)), val_history, label='Validation Loss')
    accuracy = logistic.test(x_test, y_test)
    # BUG FIX: printed message said 'Accurarcy' (typo) -> 'Accuracy'.
    print('Test Accuracy: {}%'.format(round(100*accuracy, 2)))
    # Plot
    if figure:
        plot("Loss")

    # SVM Training
    svm = SVM(lr=0.1, num_iter=420, val=figure)
    train_history, val_history = svm.fit(x_train, y_train, x_val, y_val)
    # BUG FIX: validation label said 'Missclassification' -> 'Misclassification'
    # (now consistent with the training label).
    plt.plot(range(len(train_history)), train_history,
             label='Training Misclassification Ratio')
    plt.plot(range(len(val_history)), val_history,
             label='Validation Misclassification Ratio')
    accuracy = svm.test(x_test, y_test)
    print('Test Accuracy: {}%'.format(round(100*accuracy, 2)))
    # Plot
    if figure:
        plot("Misclassification Ratio")

    # Naive Bayes
    bayes = NaiveBayes()
    bayes.fit(x_train, y_train)
    accuracy = bayes.test(x_test, y_test)
    print('Test Accuracy: {}%'.format(round(100*accuracy, 2)))
data = pd.read_csv('merged.csv') features = data['text'] labels = data['is_trump'] x_train = features.loc[:int(.7 * features.shape[0])] y_train = labels.loc[:int(.7 * labels.shape[0])] x_test = features.loc[int(.7 * features.shape[0]) + 1:].reset_index(drop = True) y_test = labels.loc[int(.7 * labels.shape[0]) + 1:].reset_index(drop = True) accuracy = lambda y, y_hat: np.mean(y == y_hat['Class']) nb = NaiveBayes() print("Training") nb.fit(x_train, y_train) print("Training accuracy: {}".format(accuracy(y_train, nb.predict(x_train)))) print("Testing accuracy: {}".format(accuracy(y_test, nb.predict(x_test)))) print('Saving NB Classifier') def key_to_string(d): res = {} res['True'] = d[True] res['False'] = d[False] return res
x_axis = np.zeros(1001) accuracy = np.zeros(1001) precision = np.zeros(1001) # str = 0.45 # end = 0.50 # for i in range(0, 1001): # para = str + (end - str) * 0.001 * i # print("para = %f" %(para)) # x_axis[i] = para para = 0.4743 NB = NaiveBayes() NB.fit(X_train, Y_train, para) NB.save() ''' count = 0 PSpam = 0 TSpam = 0 for j in range(0, len(Y_test)): tag = NB.predict(X_test[j]) if Y_test[j] == tag: count += 1 # 1为spam, 0为ham,与标签一致 if tag == 1: PSpam += 1 if Y_test[j] == 1:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import datasets
import matplotlib.pyplot as plt
from NaiveBayes import NaiveBayes


def accuracy(y_true, y_pred):
    """Fraction of predictions that match the true labels."""
    return np.sum(y_true == y_pred) / len(y_true)


# Synthetic binary classification problem: 1000 samples, 10 features.
X, y = datasets.make_classification(n_samples=1000, n_features=10,
                                    n_classes=2, random_state=123)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=123)

nb = NaiveBayes()
nb.fit(X_train, y_train)
predictions = nb.predict(X_test)
print("Naive Bayes classification accuracy", accuracy(y_test, predictions))
'Cars': 'https://raw.githubusercontent.com/JadeMaveric/GEC/main/ML/data/data.tutorial.3.csv', 'Customers': 'https://raw.githubusercontent.com/JadeMaveric/GEC/main/ML/data/data.tutorial.6.csv' } dataset = st.selectbox('Dataset', ['None'] + list(demosets.keys())) if dataset != 'None': df = pd.read_csv(demosets[dataset], sep='\s+') else: df = None if df is not None: nb = NaiveBayes() nb.fit(df, probability_selector) with st.beta_expander("Dataset", expanded=True): st.header("Dataset") st.write(df) with st.beta_expander("Probabilities", expanded=False): st.subheader("Prior Probabilities") st.write(nb.prior) st.subheader("Likelihood") st.write(nb.likelihood) st.subheader("Evidence") st.write(nb.evidence) st.header("Classify a record") with st.form(key='test_record'):
# Vectorise the held-out documents with the already-fitted vectorizer.
# NOTE(review): `training_x`, `num_feature`, `train_x`, `test_x`, `test_y` and
# `vectorizer` are defined earlier in this file (outside this excerpt).
# NOTE(review): calling fit_transform on train_y (the labels) looks suspicious
# for a label vector — confirm this is the intended use of the vectorizer.
testing_x = vectorizer.transform(test_x).toarray()
training_y = vectorizer.fit_transform(train_y)

num_class = 20          # number of target classes
num_sample = len(train_x)

feature_name = vectorizer.get_feature_names()
# Debug output: sample vocabulary entries, corpus sizes, one feature value.
print(feature_name[0])
print(len(vectorizer.get_feature_names()))
print(test_x.shape[0])
print(testing_x[1][6])
print(feature_name)

# Naive Bayes with zero-initialised parameter arrays:
#   theta_k   -- shape (num_class, 1)
#   theta_j_k -- shape (num_feature, num_class)
model_bayes = NaiveBayes(training_x=training_x, training_y=training_y,
                         num_class=num_class,
                         theta_k=np.full((num_class, 1), 0.0),
                         theta_j_k=np.full((num_feature, num_class), 0.0),
                         num_feature=num_feature, num_sample=num_sample)

# Time the training run.
start_time = time.time()
model_bayes.fit()
print("--- %s runtime in seconds ---" % (time.time() - start_time))

# Evaluate on a 99-sample slice of the test set and print sklearn metrics.
pred_y = model_bayes.predict(testing_x[1:100], feature_name)
print(pred_y)
print(test_y[1:100])
print(metrics.accuracy_score(test_y[1:100], pred_y))
print(metrics.classification_report(test_y[1:100], pred_y))
# accuracy[t] = test_result
# Evaluate the classifier at each train/test split size in `test_sizes`
# (defined earlier in the file, outside this excerpt).
accuracy = []
for t in test_sizes:
    test_result = []
    nb = NaiveBayes(5)
    files, labels = get_files("data_processed")
    # np.unique returns (unique, index, inverse) in that fixed order, so
    # `labels` becomes the integer-coded version of the string labels.
    x, _, labels = np.unique(labels, return_inverse=True, return_index=True)
    train_files, test_files, train_labels, test_labels = train_test_split(
        files, labels, test_size=t, stratify=labels, random_state=7)
    nb.fit(train_files, train_labels, feature_select=False, feature_percent=1)
    print("score : test size : ", str(t))
    # BUG FIX: score() was previously called twice (the first result was
    # discarded), doubling the evaluation cost; compute it once and reuse.
    score = nb.score(test_files, test_labels)
    test_result.append(score)
    # NOTE(review): `pr` is never used afterwards in this excerpt — confirm
    # whether this predict() call is still needed.
    pr = nb.predict(test_files)
    accuracy.append(test_result)
print(accuracy)
# test_sizes = [0.1, 0.2, 0.3, 0.5]
# feat_percent = [0.1, 0.3, 0.5, 0.7, 0.9]
# results = np.array([[0.664, 0.848, 0.91, 0.95, 0.948],
#                     [0.617, 0.854, 0.924, 0.949, 0.956],
#                     [0.534, 0.9146, 0.922, 0.95733, 0.9614],
def callcustom():
    """Smoke-test the custom NaiveBayes: fit and predict on random integer features."""
    features = np.random.randint(5, size=(6, 100))
    targets = np.array([1, 2, 3, 4, 5, 6])
    model = NaiveBayes()
    model.fit(features, targets)
    model.predict(features)
clf.predict(X)  # NOTE(review): stray statement — appears to be the tail of a definition outside this excerpt.


def callcustom():
    # Smoke-test the custom NaiveBayes on random integer features.
    X = np.random.randint(5, size=(6, 100))
    y = np.array([1, 2, 3, 4, 5, 6])
    clf = NaiveBayes()
    clf.fit(X, y)
    clf.predict(X)


def parsecall():
    # Parse the ./train corpus (space-delimited) via ParseText.
    parser = ParseText()
    parser.fit(r'./train', delimiter=' ')


if __name__ == '__main__':
    # print(timeit.timeit(calldef, number=500))
    # print(timeit.timeit(callcustom, number=5000))
    # print(timeit.timeit(parsecall, number=1))
    # Train on ./train, then evaluate predictions against the ./test labels.
    parser = ParseText()
    X, y = parser.fit(r'./train', delimiter=' ')
    clf = NaiveBayes()
    clf.fit(X, y)
    X, ya = parser.vectorize(r'./test', delimiter=' ')
    yp = clf.predict(X)
    # print(ya, yp)
    from sklearn.metrics import confusion_matrix, accuracy_score
    print(confusion_matrix(ya, yp), accuracy_score(ya, yp))
from NaiveBayes import NaiveBayes
import pandas as pd

if __name__ == '__main__':
    # Rank words by how much more frequent they are in the Trump corpus than
    # in the control corpus, and dump the ranking to relative_probs.csv.
    data = pd.read_csv('merged.csv')
    nb = NaiveBayes()
    nb.fit(data['text'], data['is_trump'])

    supports = nb.freqs        # per-class word counts
    tc = nb.total_counts       # per-class total counts
    trump_supports = supports[True]
    control_supports = supports[False]

    words = set(trump_supports.keys()) | set(control_supports.keys())
    relative_probs = []
    for word in words:
        # Only words present in both corpora can be compared directly.
        if word in trump_supports and word in control_supports:
            trump_prob = trump_supports[word] / tc[True]
            # BUG FIX: the control frequency was normalised by the Trump
            # total (tc[True]); it must use the control total (tc[False]).
            control_prob = control_supports[word] / tc[False]
            relative_probs.append([word, trump_prob - control_prob])

    # Most Trump-distinctive words first.
    sort = reversed(sorted(relative_probs, key=lambda x: x[1]))
    df = pd.DataFrame(sort, columns=['Word', 'RelativeProb'])
    df.to_csv('relative_probs.csv')
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import datasets
from sklearn.metrics import accuracy_score, f1_score
import matplotlib.pyplot as plt
from NaiveBayes import NaiveBayes

# Benchmark the custom NaiveBayes on a synthetic binary problem
# (1000 samples, 10 features, fixed seed for reproducibility).
X, y = datasets.make_classification(n_samples=1000, n_features=10,
                                    n_classes=2, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=42)

nbc = NaiveBayes()
nbc.fit(X_train, y_train)
y_pred = nbc.predict(X_test)

# Report accuracy and F1 on the held-out 20% split.
acc = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
print("accuracy_score:", acc)
print("f1 score:", f1)