Example #1
import os

class Trainer:
    def __init__(self,
                 directory=os.path.abspath(os.path.join(
                     '.', 'data', 'corpus1')),
                 spam='spam',
                 ham='ham',
                 limit=1500):
        """
        :param self: Trainer object
        :param directory: location of the training dataset
        :param spam: the subdirectory inside 'directory' that holds spam
        :param ham: the subdirectory inside 'directory' that holds ham
        :param limit: the maximum number of emails the classifier should
                      be trained on
        """

        self.spamdir = os.path.join(directory, spam)
        self.hamdir = os.path.join(directory, ham)
        self.limit = limit

        # NaiveBayesClassifier comes from the project's own module; its
        # import is not shown in this snippet
        self.classifier = NaiveBayesClassifier()
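
A minimal usage sketch of this constructor, assuming the surrounding Trainer class; the argument values below are illustrative, not from the original:

# hypothetical instantiation; corpus location and limit are examples only
trainer = Trainer(directory=os.path.abspath(os.path.join('.', 'data', 'corpus1')),
                  spam='spam',
                  ham='ham',
                  limit=1000)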
Example #2
__author__ = 'Nestor Bermudez'
__email__ = '[email protected], [email protected]'

from classifier import NaiveBayesClassifier
from averageVectorFeatureExtractor import AverageVectorFeatureExtractor
from parser import Parser
from util import Util

if __name__ == '__main__':
    import time

    # time.clock() was removed in Python 3.8; perf_counter() is the
    # modern equivalent for wall-clock timing
    start = time.perf_counter()
    parser = Parser('part1data/yes_train.txt', 'part1data/no_train.txt')
    extractor = AverageVectorFeatureExtractor()
    classifier = NaiveBayesClassifier(smoothing=0.25)
    classifier.train(extractor.items(parser.items()))
    print('Training time: ' + str((time.perf_counter() - start) * 1000) + 'ms')

    evaluationData = Parser('part1data/yes_test.txt', 'part1data/no_test.txt')
    confusion_matrix, acc = classifier.evaluate(
        extractor.items(evaluationData.items()))
    Util.print_confusion_matrix(confusion_matrix, 2, 2)
    print('Overall accuracy: ', round(acc * 100, 2))

    labels = sorted(list(classifier.highest_likely_examples.keys()))
    for label in labels:
        features, _ = classifier.highest_likely_examples[label]
        print('Highest likelihood for class: ', label)
        Util.print_as_string(features, 25, 10)
        print('\n')
Example #3
import csv
import string

# the original snippet reads the tab-separated dataset from an already
# opened file object `f`; the path below is a placeholder
with open('dataset.tsv', encoding='utf-8') as f:
    data = list(csv.reader(f, delimiter="\t"))


def clean(s):
    translator = str.maketrans("", "", string.punctuation)
    return s.translate(translator)


def normalize_string(text):
    # lowercase and strip common punctuation; the parameter name avoids
    # shadowing the stdlib `string` module used by clean() above
    litter = ['.', ',', '!', '"', '\'', ':', ' -', ' —', '(', ')']
    clear_string = text.lower()

    for symbol in litter:
        clear_string = clear_string.replace(symbol, '')

    return clear_string


X, y = [], []

for target, msg in data:
    X.append(msg)
    y.append(target)

X = [normalize_string(x) for x in X]
X_train, y_train, X_test, y_test = X[:3900], y[:3900], X[3900:], y[3900:]

model = NaiveBayesClassifier(1)
model.fit(X_train, y_train)
print(model.score(X_test, y_test))
Example #4
__author__ = 'Nestor Bermudez'
__email__ = '[email protected], [email protected]'

from classifier import NaiveBayesClassifier
from singleBlockFeatureExtractor import SingleBlockFeatureExtractor
from blockGroupFeatureExtractor import BlockGroupFeatureExtractor
from unsegmentedDataParser import UnsegmentedDataParser
from parser import Parser
from util import Util

if __name__ == '__main__':
    import time

    start = time.perf_counter()
    parser = UnsegmentedDataParser('extradata')
    extractor = BlockGroupFeatureExtractor()
    classifier = NaiveBayesClassifier(smoothing=1.5)
    classifier.train(extractor.items(parser.items()))
    print('Training time: ' + str((time.perf_counter() - start) * 1000) + 'ms')

    evaluationData = Parser('part1data/yes_test.txt', 'part1data/no_test.txt')
    confusion_matrix, acc = classifier.evaluate(
        extractor.items(evaluationData.items()))
    Util.print_confusion_matrix(confusion_matrix, 2, 2)
    print('Overall accuracy: ', round(acc * 100, 2))
Example #5
    return cnf_mat

# imports assumed by this snippet (not shown in the original);
# NaiveBayesClassifier is the project's own implementation
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB


def split_dataset(dataset: pd.DataFrame, train_frac):
    """Split into X_train, X_test, y_train, y_test via a seeded random sample."""
    train = dataset.sample(frac=train_frac, random_state=300660)
    test = dataset.drop(train.index)
    return train.drop(columns='class'), test.drop(columns='class'), \
           train['class'], test['class']


# reading clean dataset
main_df = pd.read_csv(r'seeds_dataset_clean.txt', header=None, sep='\t')
main_df.columns = ['area', 'perimeter', 'compactness', 'kernel length',
                    'kernel width', 'asymmetry coef.', 'groove length', 'class']


nbc = NaiveBayesClassifier()
gnb = GaussianNB()


# finding best train/(train+test) ratio
train_fractions = np.linspace(start=0.1, stop=0.9, num=17)

nbc_prediction_accuracies = np.zeros((17, 1))

for idx, train_frac in enumerate(train_fractions):
    X_train, X_test, y_train, y_test = split_dataset(main_df, train_frac=train_frac)
    # alternatively sklearn.model_selection.train_test_split can be used
    nbc.fit(X_train, y_train)
    predictions = nbc.predict(X_test)
    nbc_prediction_accuracies[idx] = accuracy_score(y_test, predictions)
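
As the comment inside the loop notes, scikit-learn's train_test_split can replace the seeded pandas sample. A minimal sketch of that alternative, assuming the same main_df and train_frac (the two splits are equivalent up to shuffling details):

from sklearn.model_selection import train_test_split

X = main_df.drop(columns='class')
y = main_df['class']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=train_frac, random_state=300660)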
Example #6
    corpus_tokens = []
    corpus_labels = []
    for category in corpus.category_list:
        content = Tokenizer.load_category(category)
        if content:
            corpus_tokens.extend(content)
            corpus_labels.extend([corpus.category_list.index(category)] *
                                 len(content))
    feature = Feature()
    feature.make_vsm(corpus_tokens)
    # feature.print_vsm()
    # reduce features; k == 0 means auto-detect
    # feature.reducex(corpus_labels, cate_list=corpus.category_list)
    feature.reduce_feature(corpus_labels, k=0)
    feature_id = "feature.txt"
    feature.store(feature_id)

    # classify
    # LibSVM
    classifier = LibSvmClassifier(feature_id)
    y_actual, y_predict = classifier.do_classify()
    Classifier.predict_info("Lib SVM", y_actual, y_predict)
    # scikit-learn SVM
    classifier = SvmClassifier(feature.feature_vec, feature.feature_label)
    y_actual, y_predict = classifier.do_classify()
    Classifier.predict_info("Sklearn SVM", y_actual, y_predict)
    # naive Bayes
    classifier = NaiveBayesClassifier(feature.feature_vec,
                                      feature.feature_label)
    y_actual, y_predict = classifier.do_classify()
    Classifier.predict_info("Naive Bayes", y_actual, y_predict)
Example #7
# -*- coding: utf-8 -*-
from classifier import NaiveBayesClassifier

nbc = NaiveBayesClassifier(
    "iris-treinamento.txt",
    ['Sepal Length', 'Sepal Width', 'Petal Length', 'Petal Width'])

vars_combinations = [['Sepal Length', 'Sepal Width'],
                     ['Sepal Length', 'Petal Width'],
                     ['Sepal Length', 'Petal Length'],
                     ['Petal Length', 'Petal Width'],
                     ['Petal Length', 'Sepal Width'],
                     ['Petal Width', 'Sepal Width']]
# plot the fitted two-variable normal distribution for each feature pair
for vars_combination in vars_combinations:
    nbc.plot_two_var_normal(vars_combination)
Example #8
    def load(self, data):
        # rebuild the classifier on top of an in-memory storage backend
        # wrapping the supplied data
        storage_backend = MemoryBackend(data)
        self.classifier = NaiveBayesClassifier(storage_backend)
Example #9
    X_train_val, X_test, y_train_val, y_test, ted_ids, X_ted = build_X(
        datapath)
    print("X_train_val shape: {}, X_test shape: {}".format(
        X_train_val.shape, X_test.shape))
    print("y_train_val shape: {}, y_test shape: {}".format(
        y_train_val.shape, y_test.shape))
    X_train, X_val, y_train, y_val = train_test_split(X_train_val,
                                                      y_train_val,
                                                      test_size=0.1,
                                                      random_state=42)
    print("X_train shape: {}, X_val shape: {}".format(X_train.shape,
                                                      X_val.shape))
    print("y_train shape: {}, y_val shape: {}".format(y_train.shape,
                                                      y_val.shape))

    nb_clf = NaiveBayesClassifier()
    nb_clf.fit(X_train, y_train)
    y_pred_val = nb_clf.predict(X_val)
    y_pred_test = nb_clf.predict(X_test)
    print('NB validation acc: {}'.format((y_pred_val == y_val).mean()))
    evaluate(y_test, y_pred_test)

    for k in [1, 5, 9]:
        knn_clf = KNNClassifier(k)
        knn_clf.fit(X_train, y_train)
        y_pred_val = knn_clf.predict(X_val)
        y_pred_test = knn_clf.predict(X_test)
        print('{}-nn validation acc: {}'.format(k,
                                                (y_pred_val == y_val).mean()))
        evaluate(y_test, y_pred_test)