Example no. 1
def cross_validate(dataset, scramBool):
    global num_classes
    backup_data = copy.copy(dataset)
    test_results = []
    stats = []
    full_set_stats = []
    if scramBool:
        dataset = ten_percent_scrambler(dataset)
    backup_data = splitter(backup_data)
    for i in range(10): # iterate over the 10 folds of the (optionally scrambled) split dataset
        nb.freqTable = []
        to_learn = copy.copy(backup_data) # work on a fresh copy each pass, since pop() below removes one fold
        to_test = make_test_set(to_learn.pop(i)) # hold out fold i for testing
        to_learn = flatten_list(to_learn) # merge the remaining nine folds into one training set
        nb.train(to_learn)
        to_test = nb.classify(to_test)
        test_results.append(to_test)
        stats.append(analyze(backup_data[i], to_test, num_classes)) # per-fold statistics against the held-out fold
    full_set_stats = analyze(flatten_list(backup_data), flatten_list(test_results), num_classes) # analyze the full classified set against the original data
    array_printer_2d(full_set_stats)
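Example no. 1 leans on several project helpers that are not shown here: ten_percent_scrambler, splitter, make_test_set, flatten_list, analyze, and the nb module. Purely to make the control flow easier to follow, a minimal sketch of what two of them might look like is below; the round-robin split and both bodies are assumptions, not the original implementations.

def splitter(rows, n_folds=10):
    # Hypothetical helper: deal rows into n_folds roughly equal folds.
    folds = [[] for _ in range(n_folds)]
    for i, row in enumerate(rows):
        folds[i % n_folds].append(row)
    return folds

def flatten_list(folds):
    # Hypothetical helper: merge a list of folds back into one flat list of rows.
    return [row for fold in folds for row in fold]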
Example no. 2
def naive_bayes(train_docs, train_keys, test_docs, test_keys, model_file, N):
    X_train, y_train, phrase_list_train, idf_vec = extract_features(train_docs, train_keys)
    #X_test, y_test, fl_test, junk = extract_features(test_docs, test_keys)
    #print y_train
    print "--Feature matrices calculated, NB now training..."
    clf = NB.train(X_train, y_train)
    print "--Saving model..."
    with open(model_file, 'w') as f:
        pickle.dump(clf, f)
    with open(model_file+'.phrase_list', 'w') as f:
        pickle.dump(phrase_list_train, f)
    with open(model_file+'.idf_vec', 'w') as f:
        pickle.dump(idf_vec, f)
    with open(model_file+'.training_size', 'w') as f:
        pickle.dump(len(train_docs), f)
    print "--NB trained, NB now testing..."
    #accuracy = NB.score(clf, X_test, y_test)
    accuracy = 0

    precisions = []
    recalls = []
    for doc, true_keys in zip(test_docs, test_keys):
        candidates, features = extract_candidates_doc(doc, phrase_list_train, idf_vec, len(train_docs))
        precision, recall = evaluate_one_doc('NB', clf, candidates, features, true_keys, N)
        precisions.append(precision)
        recalls.append(recall)
    avg_precision = sum(precisions) / len(precisions)
    avg_recall = sum(recalls) / len(recalls)
    #features_doc, labels_doc, phrase_idx_doc, phrase_list = extract_features_test(test_docs, test_keys)
    #avg_precision, avg_recall = evaluate_on_each_doc('NB', clf, features_doc, labels_doc, phrase_idx_doc, phrase_list, test_keys, 10)
    return {'accuracy': accuracy,
            'recall': avg_recall,
            'precision': avg_precision}
Example no. 3
 def rebuild_models(self, for_eval=False):
     ''' Rebuilds all models over the current labeled datasets. '''
     datasets = self.labeled_datasets
     if self.undersample_before_eval and for_eval:
         print "undersampling before building models.."
         datasets = self.undersample_function()
         
     all_train_sets, labels = self._datasets_to_matrices(datasets)
     self.models = [NB_Model(naive_bayes.train(training_set, labels)) for training_set in all_train_sets]
Example no. 4
 def rebuild_models(self, for_eval=False):
     ''' Rebuilds all models over the current labeled datasets. '''
     datasets = self.labeled_datasets
     if self.undersample_before_eval and for_eval:
         print "undersampling before building models.."
         datasets = self.undersample_function()
         
     all_train_sets, labels = self._datasets_to_matrices(datasets)
     self.models = [NB_Model(naive_bayes.train(training_set, labels)) for training_set in all_train_sets]
Example no. 5
def main():
    training = read_data(argv[1])
    test = read_data(argv[2])

    classifiers_unfiltered = naive_bayes.train(training, False)
    accuracy_unfiltered_nb = naive_bayes.test(test, False,
                                              classifiers_unfiltered)
    print("Naive Bayes is", "{0:.6f}".format(accuracy_unfiltered_nb),
          "accurate with stop words unfiltered")

    classifiers_filtered = naive_bayes.train(training, True)
    accuracy_filtered_nb = naive_bayes.test(test, True, classifiers_filtered)
    print("Naive Bayes is", "{0:.6f}".format(accuracy_filtered_nb),
          "accurate with stop words filtered")

    for i in range(3, len(argv)):
        print()
        lambda_constant = float(argv[i])

        weights_unfiltered = logistic_regression.train(training, False, 25,
                                                       lambda_constant)
        accuracy_unfiltered_lr = logistic_regression.test(
            test, False, weights_unfiltered)
        print(
            "Logistic Regression is", "{0:.6f}".format(accuracy_unfiltered_lr),
            "accurate with stop words unfiltered and lambda constant equal to",
            lambda_constant)

        weights_filtered = logistic_regression.train(training, True, 25,
                                                     lambda_constant)
        accuracy_filtered_lr = logistic_regression.test(
            test, True, weights_filtered)
        print(
            "Logistic Regression is", "{0:.6f}".format(accuracy_filtered_lr),
            "accurate with stop words filtered and lambda constant equal to",
            lambda_constant)
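In this main(), argv[1] names the training file, argv[2] the test file, and every later argument is a lambda constant for logistic regression; read_data, naive_bayes, and logistic_regression are project modules not shown, and `from sys import argv` is assumed to appear elsewhere in the file. A guard the excerpt omits (assuming the script is meant to be run directly) would be:

if __name__ == '__main__':
    # Hypothetical invocation: python main.py <training file> <test file> 0.01 0.1 1.0
    main()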
Example no. 6
def main():
    args_parser = build_args_parser()
    args = args_parser.parse_args()

    results_dir_path = 'results'
    raw_data_dir_path = 'data'

    if not os.path.exists(results_dir_path):
        os.makedirs(results_dir_path)

    for file_path in glob.glob(raw_data_dir_path + '/*.csv'):
        file_name = os.path.basename(file_path)
        file_name = file_name.replace(pathlib.Path(file_name).suffix, "")

        df = pd.read_csv(file_path)
        train_sample, test_sample = train_test_split(df, test_size=0.2)

        model = None

        if args.option == 1:
            model = nb.train(train_sample)
        if args.option == 2:
            model = knn.train(train_sample)

        if model is not None:
            predicted = model.predict(test_sample['conteudo'])
            # Fraction of correct predictions; despite the "Precisão" label in the
            # report below, this is overall accuracy rather than precision.
            precision = np.mean(predicted == test_sample['saida'])

            from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

            with open(results_dir_path + "/" + file_name + "_classification.txt", "w") as report:
                report.write("Quantidade de entradas para treino: " + str(len(train_sample.index)) + "\n")
                report.write("Quantidade de entradas para teste: " + str(len(test_sample.index)) + "\n")
                report.write("Precisão: " + str(precision) + "\n")
                report.write(str(confusion_matrix(test_sample['saida'], predicted)) + "\n")
                report.write(str(classification_report(test_sample['saida'], predicted)) + "\n")
                report.write(str(accuracy_score(test_sample['saida'], predicted)) + "\n")
                # report.write("Recall: %0.2f (+/- %0.2f)" % (scores['test_recall_macro'].mean(), scores['test_recall_macro'].std() * 2))
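nb.train and knn.train above are project helpers that are not shown; since the returned model exposes predict() over the 'conteudo' text column, the naive Bayes helper plausibly wraps a scikit-learn pipeline. A hedged sketch of such a helper, offered as an assumption rather than the repository's actual code:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

def train(train_sample):
    # Hypothetical nb.train: fit a text-classification pipeline on the
    # 'conteudo' (text) and 'saida' (label) columns of a pandas DataFrame.
    model = Pipeline([('tfidf', TfidfVectorizer()),
                      ('clf', MultinomialNB())])
    model.fit(train_sample['conteudo'], train_sample['saida'])
    return model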
Example no. 7
def naive_bayes(train_docs, train_keys, test_docs, test_keys, model_file, N):
    X_train, y_train, phrase_list_train, idf_vec = extract_features(
        train_docs, train_keys)
    #X_test, y_test, fl_test, junk = extract_features(test_docs, test_keys)
    #print y_train
    print "--Feature matrices calculated, NB now training..."
    clf = NB.train(X_train, y_train)
    print "--Saving model..."
    with open(model_file, 'w') as f:
        pickle.dump(clf, f)
    with open(model_file + '.phrase_list', 'w') as f:
        pickle.dump(phrase_list_train, f)
    with open(model_file + '.idf_vec', 'w') as f:
        pickle.dump(idf_vec, f)
    with open(model_file + '.training_size', 'w') as f:
        pickle.dump(len(train_docs), f)
    print "--NB trained, NB now testing..."
    #accuracy = NB.score(clf, X_test, y_test)
    accuracy = 0

    precisions = []
    recalls = []
    for doc, true_keys in zip(test_docs, test_keys):
        candidates, features = extract_candidates_doc(doc, phrase_list_train,
                                                      idf_vec, len(train_docs))
        precision, recall = evaluate_one_doc('NB', clf, candidates, features,
                                             true_keys, N)
        precisions.append(precision)
        recalls.append(recall)
    avg_precision = sum(precisions) / len(precisions)
    avg_recall = sum(recalls) / len(recalls)
    #features_doc, labels_doc, phrase_idx_doc, phrase_list = extract_features_test(test_docs, test_keys)
    #avg_precision, avg_recall = evaluate_on_each_doc('NB', clf, features_doc, labels_doc, phrase_idx_doc, phrase_list, test_keys, 10)
    return {
        'accuracy': accuracy,
        'recall': avg_recall,
        'precision': avg_precision
    }
Example no. 8
#!/usr/bin/env python
INPUT_FILE = 'SMSSpamCollection'
TEST_FILE = 'TestCollection'
import csv
import naive_bayes

dataset = naive_bayes.load_dataset(INPUT_FILE)
model = naive_bayes.train(dataset)
total = {'spam': 0, 'ham': 0}
correct = {'spam': 0, 'ham': 0}
with open(TEST_FILE) as f:
    reader = csv.reader(f, delimiter='\t')
    for line in reader:
        result = line[0]
        #print result,
        #print ': '
        prediction = naive_bayes.predict(line[1], model)
        if (result == 'ham'):
            if prediction == result:
                correct['ham'] += 1
            total['ham'] += 1
        else:
            if prediction == result:
                correct['spam'] += 1

            total['spam'] += 1

print 'ham accuracy: {}%'.format(float(correct['ham']) * 100 / total['ham'])
print 'spam accuracy: {}%'.format(float(correct['spam']) * 100 / total['spam'])
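A note on this script (and its near-duplicate in Example no. 11): the two printed figures are per-class accuracies, i.e. recall for the ham and spam classes respectively. If a single overall accuracy is also wanted, it can be derived from the same tallies; a small addition that is not part of the original:

overall = float(correct['ham'] + correct['spam']) / (total['ham'] + total['spam'])
print('overall accuracy: {}%'.format(overall * 100))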
Example no. 9
    # naive_bayes_predictions

    # pdb.set_trace()

    X_train_bal, y_train_bal, start_index_rich, start_index_non = get_twenty_twenty(
        tweets, binary_labels, start_index_rich, start_index_non)

    # array = np.array([])
    # for line in tweets:
    #             array = np.append(array,  re.sub(r'[^\w\'] ', " ",  line).split() )

    #  vectorizer = CountVectorizer(tokenizer=tokenize, analyzer='word', )

    # pdb.set_trace()

    nb_classifier, nb_grams, nb_features = naive_bayes.train(
        X_train_bal, y_train_bal)

    # randForest_classifier = RandomForestClassifier()
    # randForest_classifier.fit(X_train_bal, y_train_bal)

    # show_most_informative_features(vectorizer, nb_classifier, 20)

    # Note: feature_importances_ and estimators_ are attributes of tree ensembles
    # (e.g. RandomForestClassifier), not of a typical naive Bayes model, so this
    # block likely carries over from the commented-out random-forest experiment above.
    importances = nb_classifier.feature_importances_
    std = np.std(
        [tree.feature_importances_ for tree in nb_classifier.estimators_],
        axis=0)
    indices = np.argsort(importances)[::-1]

    feature_importance = []
    i = 0
    for gram in nb_grams:
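The fragment breaks off here. If nb_classifier really were a scikit-learn naive Bayes model (a MultinomialNB, say), per-feature weights would normally be read from feature_log_prob_ rather than feature_importances_. A hedged sketch under that assumption, reusing the nb_grams feature names from the fragment:

import numpy as np

# Assumes nb_classifier is a fitted sklearn.naive_bayes.MultinomialNB;
# feature_log_prob_ has shape (n_classes, n_features).
log_probs = nb_classifier.feature_log_prob_
top = np.argsort(log_probs[1])[::-1][:20]  # 20 strongest features for class index 1
for idx in top:
    print("{}: {:.3f}".format(nb_grams[idx], log_probs[1][idx]))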
Example no. 10
import pickle

from naive_bayes import extract_features, train, predict
# load_dataset and checkpoint_path are assumed to be defined elsewhere in this script.


# read the data text files in unicode; the train/test split happens below
print("\t-------Loading Dataset-------")
X, y = load_dataset()  # load the full dataset
print("Length of Dataset:", len(X))

# tf-idf vectorization and split into train and test sets
print("\t-------Extracting Features and Splitting Dataset-------")
train_x, test_x, train_y, test_y = extract_features(X, y)  # generate the train/test split
print("Length of Training set:", len(train_x))
print("Length of Test set:", len(test_x))

print("\t-------Start Training------")
classifier = train(train_x, train_y)
f = open(checkpoint_path, 'wb')
pickle.dump(classifier, f)
f.close()
print("Model saved:", checkpoint_path)    
print("\t-------End Training-------")

print("\t-------Start Testing------")
f = open(checkpoint_path, 'rb')
classifier = pickle.load(f)
f.close()

accuracy, confusion_matrix = predict(classifier,test_x,test_y)
print("Accuracy :", accuracy * 100)
print("\nConfusion Matrix:")
print(confusion_matrix)
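If the confusion matrix returned by predict is a square array indexed as [true class, predicted class] (an assumption; the helper's exact contract is not shown), per-class recall and precision can be read off its diagonal:

import numpy as np

cm = np.asarray(confusion_matrix, dtype=float)
recall_per_class = cm.diagonal() / cm.sum(axis=1)     # divide by true-class row totals
precision_per_class = cm.diagonal() / cm.sum(axis=0)  # divide by predicted-column totals
print("Per-class recall:", recall_per_class)
print("Per-class precision:", precision_per_class)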
Example no. 11
#!/usr/bin/env python
INPUT_FILE = 'SMSSpamCollection'
TEST_FILE = 'TestCollection'
import csv
import naive_bayes

dataset = naive_bayes.load_dataset(INPUT_FILE)
model = naive_bayes.train(dataset)
total = {'spam': 0, 'ham': 0}
correct = {'spam': 0, 'ham': 0}
with open(TEST_FILE) as f:
   reader = csv.reader(f, delimiter='\t')
   for line in reader:
      result = line[0]
      #print result,
      #print ': '
      prediction = naive_bayes.predict(line[1], model)
      if(result == 'ham'):
         if prediction == result :
            correct['ham'] += 1
         total['ham'] += 1
      else:
         if prediction == result :
            correct['spam'] += 1
         
         total['spam'] += 1
      

print 'ham accuracy: {}%'.format( float(correct['ham'])*100/total['ham'] )
print 'spam accuracy: {}%'.format(float(correct['spam'])*100/total['spam'] )
Example no. 12
import naive_bayes
import scipy.io

# 1. TRAIN
# 1.1. Load training data
print 'Load training data ...'
training_data = scipy.io.loadmat('spamTrain.mat')
#print training_data
X = training_data['X']
y = training_data['y']
print 'X.shape =', X.shape
print 'y.shape =', y.shape

# 1.2. Train Naive Bayes classifier
# (phi is presumably the class prior, phi0/phi1 the per-feature conditional
#  probabilities for each class; naive_bayes.train is project code not shown here)
print 'Train Naive Bayes classifier ...'
phi, phi0, phi1 = naive_bayes.train(X, y)
print 'phi =', phi
print 'phi0[0:10] =', phi0[0:10]
print 'phi1[0:10] =', phi1[0:10]

# 2. TEST
# 2.1. Load test data
print 'Load test data ...'
test_data = scipy.io.loadmat('spamTest.mat')
X_test = test_data['Xtest']
y_test = test_data['ytest']
print 'X_test.shape =', X_test.shape
print 'y_test.shape =', y_test.shape

# 2.2. Test Naive Bayes classifier
print 'Test Naive Bayes classifier ...'