Example #1
def main():
    data, target = DataSetFileReader.read_dataset_file(
        'data/SMSSpamCollection')

    classifier = NaiveBayes()
    classifier.fit(data, target)

    input_data = DataSetFileReader.read_input_data_file('data/inputdata')

    result = classifier.predict(input_data)

    for pred, msg in zip(result, input_data):
        print('{0} -> {1}'.format(pred.upper(), msg))
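
DataSetFileReader is the author's own helper and is not shown. A minimal sketch of what read_dataset_file might do, assuming the UCI SMS Spam Collection's tab-separated "label<TAB>message" layout (the class name and the (data, target) return order are taken from the call above):

class DataSetFileReader:
    @staticmethod
    def read_dataset_file(path):
        data, target = [], []
        with open(path, encoding='utf-8') as f:
            for line in f:
                # Each line of the UCI file is "<label>\t<message>".
                label, _, message = line.rstrip('\n').partition('\t')
                target.append(label)
                data.append(message)
        return data, target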
Example #2
def init():

    def plot(arg):
        plt.xlabel('Iterations')
        plt.ylabel(arg)
        plt.legend()
        plt.show()
        plt.clf()
		
    # Argument parser structure
    parser = argparse.ArgumentParser(description='Classification models: Logistic Regression, SVM, Naive Bayes Training.')
    parser.add_argument('-p', '--preprocess', action='store_true', help='perform preprocessing of emails')
    parser.add_argument('-f', '--figure', action='store_true', help='plot training figures (performs validation)')

    # Parse arguments
    parsed = parser.parse_args()
    preprocess = parsed.preprocess
    figure = parsed.figure
    
    dataHandler = DataHandler()
	
    if preprocess:
        print('Extracting Features ............. ', end='', flush=True)
        start = time.time()
        dataHandler.saveFeatures()
        print('done -- ' + str(round(time.time()-start, 3)) + 's')

    print('Loading Data .................... ', end='', flush=True)
    start = time.time()
    x_train, y_train, x_test, y_test = dataHandler.loadTrainingData()
    print('done -- ' + str(round(time.time()-start, 3)) + 's')
    x_val = deepcopy(x_test)
    y_val = deepcopy(y_test)
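    # Note: the "validation" split is a copy of the test split, so the validation curves track the test data.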
    
    # Logistic regression
    logistic = LogisticRegression(lr=0.2, num_iter=1000, val=figure)
    start = time.time()
    train_history, val_history = logistic.fit(x_train, y_train, x_val, y_val)
    plt.plot(range(len(train_history)), train_history, label='Training Loss')
    plt.plot(range(len(val_history)), val_history, label='Validation Loss')
    
    accuracy = logistic.test(x_test, y_test)
    print('Test Accuracy: {}%'.format(round(100*accuracy, 2)))

    # Plot
    if figure: plot("Loss")
    
    # SVM training
    svm = SVM(lr=0.1, num_iter=420, val=figure)
    start = time.time()
    train_history, val_history = svm.fit(x_train, y_train, x_val, y_val)
    plt.plot(range(len(train_history)), train_history, label='Training Misclassification Ratio')
    plt.plot(range(len(val_history)), val_history, label='Validation Misclassification Ratio')
    accuracy = svm.test(x_test, y_test)
    print('Test Accuracy: {}%'.format(round(100*accuracy, 2)))
    
    # Plot
    if figure: plot("Missclassification Ratio")

    # Naive Bayes
    bayes = NaiveBayes()
    start = time.time()
    bayes.fit(x_train, y_train)
    accuracy = bayes.test(x_test, y_test)
    print('Test Accuracy: {}%'.format(round(100*accuracy, 2)))
Example #3
    data = pd.read_csv('merged.csv')

    features = data['text']
    labels = data['is_trump']

    x_train = features.loc[:int(.7 * features.shape[0])]
    y_train = labels.loc[:int(.7 * labels.shape[0])]
    x_test = features.loc[int(.7 * features.shape[0]) + 1:].reset_index(drop = True)
    y_test = labels.loc[int(.7 * labels.shape[0]) + 1:].reset_index(drop = True)

    accuracy = lambda y, y_hat: np.mean(y == y_hat['Class'])
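    # nb.predict evidently returns a frame with a 'Class' column holding the predicted labels.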

    nb = NaiveBayes()

    print("Training")
    nb.fit(x_train, y_train)

    print("Training accuracy: {}".format(accuracy(y_train, nb.predict(x_train))))
    print("Testing accuracy:  {}".format(accuracy(y_test, nb.predict(x_test))))


    print('Saving NB Classifier')

    def key_to_string(d):
        res = {}
        res['True'] = d[True]
        res['False'] = d[False]

        return res
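
The snippet stops before key_to_string is used. A plausible continuation (the freqs and total_counts attribute names match Example #11; the filename is an assumption) serializes the boolean-keyed tables, since json.dump would otherwise coerce True/False keys to lowercase "true"/"false":

    import json

    # Hypothetical: persist the per-class word counts and totals as JSON.
    with open('nb_classifier.json', 'w') as f:
        json.dump({'freqs': key_to_string(nb.freqs),
                   'total_counts': key_to_string(nb.total_counts)}, f)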

Example #4
x_axis = np.zeros(1001)
accuracy = np.zeros(1001)
precision = np.zeros(1001)

# str = 0.45
# end = 0.50

# for i in range(0, 1001):
#     para = str + (end - str) * 0.001 * i
#     print("para = %f" % (para))
#     x_axis[i] = para
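# The fixed para below appears to be the best value found by this sweep; its exact role is defined by NaiveBayes.fit().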

para = 0.4743
NB = NaiveBayes()
NB.fit(X_train, Y_train, para)

NB.save()
'''
count = 0 
PSpam = 0
TSpam = 0

for j in range(0, len(Y_test)):
  tag = NB.predict(X_test[j])
  if Y_test[j] == tag:
    count += 1
  # 1 = spam, 0 = ham, consistent with the labels
  if tag == 1:
    PSpam += 1
    if Y_test[j] == 1:
Example #5
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import datasets
import matplotlib.pyplot as plt

from NaiveBayes import NaiveBayes


def accuracy(y_true, y_pred):
    accuracy = np.sum(y_true == y_pred) / len(y_true)
    return accuracy


X, y = datasets.make_classification(n_samples=1000,
                                    n_features=10,
                                    n_classes=2,
                                    random_state=123)
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=123)

nb = NaiveBayes()
nb.fit(X_train, y_train)
predictions = nb.predict(X_test)

print("Naive Bayes classification accuracy", accuracy(y_test, predictions))
Example #6
    demosets = {
        'Cars':
        'https://raw.githubusercontent.com/JadeMaveric/GEC/main/ML/data/data.tutorial.3.csv',
        'Customers':
        'https://raw.githubusercontent.com/JadeMaveric/GEC/main/ML/data/data.tutorial.6.csv'
    }

    dataset = st.selectbox('Dataset', ['None'] + list(demosets.keys()))

    if dataset != 'None':
        df = pd.read_csv(demosets[dataset], sep=r'\s+')
    else:
        df = None

if df is not None:
    nb = NaiveBayes()
    nb.fit(df, probability_selector)

    with st.beta_expander("Dataset", expanded=True):
        st.header("Dataset")
        st.write(df)

    with st.beta_expander("Probabilities", expanded=False):
        st.subheader("Prior Probabilities")
        st.write(nb.prior)
        st.subheader("Likelihood")
        st.write(nb.likelihood)
        st.subheader("Evidence")
        st.write(nb.evidence)

    st.header("Classify a record")
    with st.form(key='test_record'):
Example #7
    testing_x = vectorizer.transform(test_x).toarray()
    training_y = vectorizer.fit_transform(train_y)
    num_class = 20
    num_sample = len(train_x)
    feature_name = vectorizer.get_feature_names()
    print(feature_name[0])
    print(len(vectorizer.get_feature_names()))

    print(test_x.shape[0])
    print(testing_x[1][6])
    print(feature_name)
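    # theta_k and theta_j_k below are pre-allocated parameter tables (per-class
    # priors and per-feature, per-class conditionals), presumably filled in by fit().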

    model_bayes = NaiveBayes(training_x=training_x,
                             training_y=training_y,
                             num_class=num_class,
                             theta_k=np.full((num_class, 1), 0.0),
                             theta_j_k=np.full((num_feature, num_class), 0.0),
                             num_feature=num_feature,
                             num_sample=num_sample)

    start_time = time.time()
    model_bayes.fit()
    print("--- %s runtime in seconds ---" % (time.time() - start_time))

    pred_y = model_bayes.predict(testing_x[1:100], feature_name)
    print(pred_y)
    print(test_y[1:100])
    print(metrics.accuracy_score(test_y[1:100], pred_y))
    print(metrics.classification_report(test_y[1:100], pred_y))
Example #8
#     accuracy[t] = test_result

accuracy = []


for t in test_sizes:
    test_result = []

    nb = NaiveBayes(5)
    files, labels = get_files("data_processed")
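    # Encode the string labels as integer codes (np.unique's return_inverse output).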
    x, _, labels = np.unique(labels, return_inverse=True, return_index=True)
    train_files, test_files, train_labels, test_labels = train_test_split(files, labels, test_size=t,
                                                                          stratify=labels,
                                                                          random_state=7)

    nb.fit(train_files, train_labels, feature_select=False, feature_percent=1)
    print("score : test size : ", str(t))
    test_result.append(nb.score(test_files, test_labels))
    pr = nb.predict(test_files)

    accuracy.append(test_result)

print(accuracy)

# test_sizes = [0.1, 0.2, 0.3, 0.5]
# feat_percent = [0.1, 0.3, 0.5, 0.7, 0.9]

# results = np.array([[0.664, 0.848, 0.91, 0.95, 0.948],
#            [0.617, 0.854, 0.924, 0.949, 0.956],
#            [0.534, 0.9146, 0.922, 0.95733, 0.9614],
Example #9
def callcustom():
    X = np.random.randint(5, size=(6, 100))
    y = np.array([1, 2, 3, 4, 5, 6])
    clf = NaiveBayes()
    clf.fit(X, y)
    clf.predict(X)
Example #10
    clf.predict(X)


def callcustom():
    X = np.random.randint(5, size=(6, 100))
    y = np.array([1, 2, 3, 4, 5, 6])
    clf = NaiveBayes()
    clf.fit(X, y)
    clf.predict(X)


def parsecall():
    parser = ParseText()
    parser.fit(r'./train', delimiter=' ')


if __name__ == '__main__':
    # print(timeit.timeit(calldef, number=500))
    # print(timeit.timeit(callcustom, number=5000))
    # print(timeit.timeit(parsecall, number=1))
    parser = ParseText()
    X, y = parser.fit(r'./train', delimiter=' ')
    clf = NaiveBayes()
    clf.fit(X, y)

    X, ya = parser.vectorize(r'./test', delimiter=' ')
    yp = clf.predict(X)
    # print(ya, yp)
    from sklearn.metrics import confusion_matrix, accuracy_score
    print(confusion_matrix(ya, yp), accuracy_score(ya, yp))
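
The commented timeit lines reference a calldef function the snippet omits. Judging by callcustom, it is presumably a scikit-learn baseline; this reconstruction mirrors the MultinomialNB docstring example and is an assumption:

def calldef():
    # Hypothetical sklearn counterpart to callcustom, for the timeit comparison.
    from sklearn.naive_bayes import MultinomialNB
    X = np.random.randint(5, size=(6, 100))
    y = np.array([1, 2, 3, 4, 5, 6])
    clf = MultinomialNB()
    clf.fit(X, y)
    clf.predict(X)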
Example #11
from NaiveBayes import NaiveBayes
import pandas as pd

if __name__ == '__main__':
    data = pd.read_csv('merged.csv')

    nb = NaiveBayes()
    nb.fit(data['text'], data['is_trump'])

    supports = nb.freqs
    tc = nb.total_counts

    trump_supports = supports[True]
    control_supports = supports[False]

    words = set(trump_supports.keys()) | set(control_supports.keys())

    relative_probs = []
    for word in words:
        if word in trump_supports and word in control_supports:
            trump_prob = trump_supports[word] / tc[True]
            control_prob = control_supports[word] / tc[False]  # normalize by the control total, not the Trump total

            relative_probs.append([word, trump_prob - control_prob])

    sort = sorted(relative_probs, key=lambda x: x[1], reverse=True)
    df = pd.DataFrame(sort, columns=['Word', 'RelativeProb'])
    df.to_csv('relative_probs.csv')
Example #12
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import datasets
from sklearn.metrics import accuracy_score, f1_score
import matplotlib.pyplot as plt

from NaiveBayes import NaiveBayes

X, y = datasets.make_classification(n_samples=1000, n_features=10, n_classes=2, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

nbc = NaiveBayes()
nbc.fit(X_train, y_train)
y_pred = nbc.predict(X_test)

acc = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("accuracy_score:", acc)
print("f1 score:", f1)