Python NaiveBayesClassifier 예제들, classifier.NaiveBayesClassifier Python 예제들

예제 #1

0

파일 보기

    def __init__(self,
                 directory=os.path.abspath(os.path.join(
                     '.', 'data', 'corpus1')),
                 spam='spam',
                 ham='ham',
                 limit=1500):
        """
        Remember where the corpus lives and create an empty classifier.

        :param self: Trainer object
        :param directory: root folder of the training dataset
        :param spam: name of the sub directory of 'directory' holding spam
        :param ham: name of the sub directory of 'directory' holding ham
        :param limit: maximum number of mails the classifier should be
                      trained with
        """
        # Both corpus folders are resolved relative to the same root.
        self.spamdir, self.hamdir = [
            os.path.join(directory, sub_dir) for sub_dir in (spam, ham)
        ]
        self.limit = limit

        # Starts untrained; nothing is read from disk here.
        self.classifier = NaiveBayesClassifier()

예제 #2

0

파일 보기

파일: teachme.py 프로젝트: soulaklabs/tofbot

class PluginTeachMe(Plugin):
    """Plugin that learns which canned joke to answer to a chat message.

    Every incoming message is classified; when the classifier proposes a
    joke it is said aloud and remembered so that later feedback messages
    ('GG' / 'TG') can reinforce or cancel it.
    """

    def __init__(self, *args):
        Plugin.__init__(self, *args)
        self.classifier = None
        self.load({})  # start from an empty storage
        self.curr_msg = ''
        self.last_msg = ''
        self.last_joke = ()  # (tokens, joke) of the last joke said, if any
        self.just_joked = False

    def load(self, data):
        """Rebuild the classifier on top of a MemoryBackend wrapping *data*."""
        self.classifier = NaiveBayesClassifier(MemoryBackend(data))

    def save(self):
        """Return the data held by the classifier's storage."""
        return self.classifier.storage.data

    def get_what_to_learn(self):
        """Map the current message to the label to learn ('None' if none)."""
        labels = {
            'CMB': 'CMB',
            'cmb': 'CMB',
            'CTB': 'CTB',
            'ctb': 'CTB',
            'TWSS': "That's what she said!",
            'twss': "That's what she said!",
        }
        return labels.get(self.curr_msg, 'None')

    def got_congratulated(self):
        """Tell whether the current message is positive feedback."""
        praise = frozenset(['GG', 'gg', 'GG Tofbot', 'gg Tofbot'])
        return self.curr_msg in praise

    def did_bad_joke(self):
        """Tell whether the current message is negative feedback."""
        blame = frozenset(['TG', 'tg', 'TG Tofbot', 'tg Tofbot'])
        return self.curr_msg in blame

    def handle_msg(self, msg_text, chan, nick):
        """Process one chat message: take feedback, joke, or learn.

        *chan* and *nick* are accepted but not used by this handler.
        """
        joked_before = self.just_joked
        self.just_joked = False
        self.last_msg = self.curr_msg
        self.curr_msg = msg_text.strip()

        # Positive feedback: reinforce the last joke for its trigger words.
        if self.got_congratulated():
            if self.last_joke:
                self.classifier.train(*self.last_joke)
            return

        # Negative feedback: teach that those words deserve no joke.
        if self.did_bad_joke():
            if self.last_joke:
                self.classifier.train(self.last_joke[0], 'None')
            return

        scores = self.classifier.classify(self.curr_msg.split())
        joke = scores[0][0] if scores else 'None'

        if joke != 'None':
            self.say(joke)
            self.last_joke = (self.curr_msg.split(), joke)
            return

        # No joke proposed: learn from the previous message, labelled by
        # what the current message says.
        if not joked_before:
            self.classifier.train(self.last_msg.split(),
                                  self.get_what_to_learn())

예제 #3

0

파일 보기

파일: teachme.py 프로젝트: chmduquesne/tofbot

class PluginTeachMe(Plugin):
    """Plugin that learns which canned joke to answer to a chat message.

    Every incoming message is classified; when the classifier proposes a
    joke it is said aloud and remembered so that later feedback messages
    ("GG" / "TG") can reinforce or cancel it.
    """

    def __init__(self, *args):
        Plugin.__init__(self, *args)
        self.classifier = None
        self.load({})  # start from an empty storage
        self.curr_msg = ""
        self.last_msg = ""
        self.last_joke = ()  # (tokens, joke) of the last joke said, if any
        self.just_joked = False

    def load(self, data):
        """Rebuild the classifier on top of a MemoryBackend wrapping *data*."""
        self.classifier = NaiveBayesClassifier(MemoryBackend(data))

    def save(self):
        """Return the data held by the classifier's storage."""
        return self.classifier.storage.data

    def get_what_to_learn(self):
        """Map the current message to the label to learn ("None" if none)."""
        labels = {
            "CMB": "CMB",
            "cmb": "CMB",
            "CTB": "CTB",
            "ctb": "CTB",
            "TWSS": "That's what she said!",
            "twss": "That's what she said!",
        }
        return labels.get(self.curr_msg, "None")

    def got_congratulated(self):
        """Tell whether the current message is positive feedback."""
        praise = frozenset(["GG", "gg", "GG Tofbot", "gg Tofbot"])
        return self.curr_msg in praise

    def did_bad_joke(self):
        """Tell whether the current message is negative feedback."""
        blame = frozenset(["TG", "tg", "TG Tofbot", "tg Tofbot"])
        return self.curr_msg in blame

    def handle_msg(self, msg_text, chan, nick):
        """Process one chat message: take feedback, joke, or learn.

        *chan* and *nick* are accepted but not used by this handler.
        """
        joked_before = self.just_joked
        self.just_joked = False
        self.last_msg = self.curr_msg
        self.curr_msg = msg_text.strip()

        # Positive feedback: reinforce the last joke for its trigger words.
        if self.got_congratulated():
            if self.last_joke:
                self.classifier.train(*self.last_joke)
            return

        # Negative feedback: teach that those words deserve no joke.
        if self.did_bad_joke():
            if self.last_joke:
                self.classifier.train(self.last_joke[0], "None")
            return

        scores = self.classifier.classify(self.curr_msg.split())
        joke = scores[0][0] if scores else "None"

        if joke != "None":
            self.say(joke)
            self.last_joke = (self.curr_msg.split(), joke)
            return

        # No joke proposed: learn from the previous message, labelled by
        # what the current message says.
        if not joked_before:
            self.classifier.train(self.last_msg.split(), self.get_what_to_learn())

예제 #4

0

파일 보기

파일: train.py 프로젝트: BikerDroid/plino

    def __init__(self,
                 directory=os.path.abspath(
                     os.path.join('.', 'data', 'corpus1')),
                 spam='spam',
                 ham='ham',
                 limit=1500
                 ):
        """
        Remember where the corpus lives and create an empty classifier.

        :param self: Trainer object
        :param directory: root folder of the training dataset
        :param spam: name of the sub directory of 'directory' holding spam
        :param ham: name of the sub directory of 'directory' holding ham
        :param limit: maximum number of mails the classifier should be
                      trained with
        """
        # Both corpus folders are resolved relative to the same root.
        self.spamdir, self.hamdir = [
            os.path.join(directory, sub_dir) for sub_dir in (spam, ham)
        ]
        self.limit = limit

        # Starts untrained; nothing is read from disk here.
        self.classifier = NaiveBayesClassifier()

예제 #5

0

파일 보기

파일: extra3.py 프로젝트: nbermudezs/UIUC_CS440

__author__ = 'Nestor Bermudez'
__email__ = '[email protected], [email protected]'

from classifier import NaiveBayesClassifier
from averageVectorFeatureExtractor import AverageVectorFeatureExtractor
from parser import Parser
from util import Util

if __name__ == '__main__':
    import time

    # time.clock() was removed in Python 3.8; perf_counter() is the
    # documented replacement for measuring elapsed intervals.
    start = time.perf_counter()
    parser = Parser('part1data/yes_train.txt', 'part1data/no_train.txt')
    extractor = AverageVectorFeatureExtractor()
    classifier = NaiveBayesClassifier(smoothing=0.25)
    classifier.train(extractor.items(parser.items()))
    print('Training time: ' + str((time.perf_counter() - start) * 1000) + 'ms')

    # Evaluate on the held-out yes/no test files.
    evaluationData = Parser('part1data/yes_test.txt', 'part1data/no_test.txt')
    confusion_matrix, acc = classifier.evaluate(
        extractor.items(evaluationData.items()))
    Util.print_confusion_matrix(confusion_matrix, 2, 2)
    print('Overall accuracy: ', round(acc * 100, 2))

    # Show, per class label (in sorted order), the features stored as the
    # highest-likelihood example for that label.
    labels = sorted(classifier.highest_likely_examples.keys())
    for label in labels:
        features, _ = classifier.highest_likely_examples[label]
        print('Highest likelihood for class: ', label)
        Util.print_as_string(features, 25, 10)
        print('\n')

예제 #6

0

파일 보기

파일: train.py 프로젝트: see0/plino

class Trainer(object):

    """
    Trains a NaiveBayesClassifier on two directories of emails, one
    labelled 'spam' and one labelled 'ham'.
    """

    def __init__(self,
                 directory=os.path.abspath(
                     os.path.join('.', 'data', 'corpus1')),
                 spam='spam',
                 ham='ham',
                 limit=1500
                 ):
        """
        :param self: Trainer object
        :param directory: location of the training dataset
        :param spam: the sub directory inside the 'directory' which has spam
        :param ham: the sub directory inside the 'directory' which has ham
        :param limit: The maximum number of mails, per class, the classifier
                      should be trained with
        """

        self.spamdir = os.path.join(directory, spam)
        self.hamdir = os.path.join(directory, ham)
        self.limit = limit

        self.classifier = NaiveBayesClassifier()

    def train_classifier(self, path, label, verbose):
        """
        Train the classifier on the emails stored in one directory.

        Files are opened through their full path, so the working directory
        of the process is left untouched and relative paths work.

        :param self: Trainer object
        :param path: The path of the data directory
        :param label: The label under which the data directory is trained
        :param verbose: Decides the verbosity of the messages to be shown;
                        any true value prints a summary, a value greater
                        than 1 also prints every processed file
        """

        # List the directory once; at most `self.limit` files are used.
        emails = os.listdir(path)[:self.limit]
        limit = len(emails)

        if verbose:
            message = "Training {0} emails in {1} class".format(limit, label)
            # print(...) with a single argument behaves the same on
            # Python 2 and Python 3.
            print(colored(message, 'green'))
            logging.debug(message)

        for email in emails:
            if verbose and verbose > 1:
                print(colored("Processing file: {0}".format(email), 'green'))
                logging.info("Processing file: {0}".format(email))

            # The context manager closes the file even when the email is
            # skipped below because of a decoding problem.
            with open(os.path.join(path, email), 'r') as email_file:
                email_text = email_file.read()

            # bs4's UnicodeDammit repairs mixed Windows-1252/UTF-8 input
            # before decoding. Thanks to Leonard Richardson for the module.
            try:
                email_text = bs4.UnicodeDammit.detwingle(
                    email_text).decode('utf-8')
            except Exception:
                # Best effort: an undecodable mail is reported and skipped,
                # but KeyboardInterrupt/SystemExit are no longer swallowed.
                print(colored(
                    "Skipping file {0} due to bad encoding".format(email),
                    'red'))
                logging.error("Skipping file {0} due to bad encoding".format(
                    os.path.join(path, email)
                ))
                continue

            email_text = email_text.encode("ascii", "ignore")

            # Extracting the features from the text
            features = self.extract_features(email_text)

            # Training the classifier
            self.classifier.train(features, label)

        # Prints whatever the classifier's __str__ returns.
        print(self.classifier)

    def train(self, verbose=False):
        """
        Train on the spam directory, then on the ham directory.

        :param self: Trainer object
        :param verbose: Print more details while training.
                        Defaults to False
        :returns: the trained classifier
        """
        self.train_classifier(self.spamdir, 'spam', verbose)
        self.train_classifier(self.hamdir, 'ham', verbose)

        return self.classifier

    def extract_features(self, text):
        """
        Will convert the document into tokens and extract the features.

        Features emitted
        - "ATTACHMENT" for a token that is a known file extension
        - "LINK" for a token that looks like an http(s) URL
        - "ALL_CAPS" (plus the stemmed word) for a token without lowercase
        - "NUMBER" for a token made of digits only
        - the stemmed, lower-cased word for everything else

        Stopwords and tokens shorter than three characters once punctuation
        is removed are ignored.

        NOTE(review): `translate(None, ...)` is the Python 2 byte-string
        form, so `text` is presumably a Python 2 `str` - confirm before
        running under Python 3.

        :param self: Trainer object
        :param text: Email text from which we will extract features
        :returns: A list which contains the feature set
        """
        features = []
        # Raw string: the same pattern as before, without relying on
        # invalid escape sequences in a normal string literal.
        link = re.compile(
            r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|'
            r'(?:%[0-9a-fA-F][0-9a-fA-F]))+')
        # ^ for detecting whether the string is a link

        # Will use PorterStemmer() for stemming
        porterStemmer = stem.porter.PorterStemmer()

        # Load the stopword list once instead of once per token, and use a
        # set so each membership test is O(1).
        english_stopwords = set(stopwords.words("english"))
        tokens = [
            token for token in text.split()
            if token not in english_stopwords
        ]

        for token in tokens:
            bare_token = token.translate(None, string.punctuation)
            if len(bare_token) < 3:
                continue
            if token.isdigit():
                features.append("NUMBER")
            elif "." + token in mimetypes.types_map:
                # mimetypes.types_map is keyed by extensions such as
                # '.pdf', '.zip', '.exe', '.jpg', '.html'.
                features.append("ATTACHMENT")
            elif token.upper() == token:
                features.append("ALL_CAPS")
                features.append(porterStemmer.stem(bare_token).lower())
            elif link.match(token):
                features.append("LINK")
            else:
                features.append(porterStemmer.stem(bare_token).lower())

        return features

예제 #7

0

파일 보기

파일: train.py 프로젝트: BikerDroid/plino

class Trainer(object):

    """
    Trains a NaiveBayesClassifier on two directories of emails, one
    labelled 'spam' and one labelled 'ham'.
    """

    def __init__(self,
                 directory=os.path.abspath(
                     os.path.join('.', 'data', 'corpus1')),
                 spam='spam',
                 ham='ham',
                 limit=1500
                 ):
        """
        :param self: Trainer object
        :param directory: location of the training dataset
        :param spam: the sub directory inside the 'directory' which has spam
        :param ham: the sub directory inside the 'directory' which has ham
        :param limit: The maximum number of mails, per class, the classifier
                      should be trained with
        """

        self.spamdir = os.path.join(directory, spam)
        self.hamdir = os.path.join(directory, ham)
        self.limit = limit

        self.classifier = NaiveBayesClassifier()

    def train_classifier(self, path, label, verbose):
        """
        Train the classifier on the emails stored in one directory.

        Files are opened through their full path, so the working directory
        of the process is left untouched and relative paths work.

        :param self: Trainer object
        :param path: The path of the data directory
        :param label: The label under which the data directory is trained
        :param verbose: Decides the verbosity of the messages to be shown;
                        any true value prints a summary, a value greater
                        than 1 also prints every processed file
        """

        # List the directory once; at most `self.limit` files are used.
        emails = os.listdir(path)[:self.limit]
        limit = len(emails)

        if verbose:
            message = "Training {0} emails in {1} class".format(limit, label)
            # print(...) with a single argument behaves the same on
            # Python 2 and Python 3.
            print(colored(message, 'green'))
            logging.debug(message)

        for email in emails:
            if verbose and verbose > 1:
                print(colored("Processing file: {0}".format(email), 'green'))
                logging.info("Processing file: {0}".format(email))

            # The context manager closes the file even when the email is
            # skipped below because of a decoding problem.
            with open(os.path.join(path, email), 'r') as email_file:
                email_text = email_file.read()

            # bs4's UnicodeDammit repairs mixed Windows-1252/UTF-8 input
            # before decoding. Thanks to Leonard Richardson for the module.
            try:
                email_text = bs4.UnicodeDammit.detwingle(
                    email_text).decode('utf-8')
            except Exception:
                # Best effort: an undecodable mail is reported and skipped,
                # but KeyboardInterrupt/SystemExit are no longer swallowed.
                print(colored(
                    "Skipping file {0} due to bad encoding".format(email),
                    'red'))
                logging.error("Skipping file {0} due to bad encoding".format(
                    os.path.join(path, email)
                ))
                continue

            email_text = email_text.encode("ascii", "ignore")

            # Extracting the features from the text
            features = self.extract_features(email_text)

            # Training the classifier
            self.classifier.train(features, label)

        # Prints whatever the classifier's __str__ returns.
        print(self.classifier)

    def train(self, verbose=False):
        """
        Train on the spam directory, then on the ham directory.

        :param self: Trainer object
        :param verbose: Print more details while training.
                        Defaults to False
        :returns: the trained classifier
        """
        self.train_classifier(self.spamdir, 'spam', verbose)
        self.train_classifier(self.hamdir, 'ham', verbose)

        return self.classifier

    def extract_features(self, text):
        """
        Will convert the document into tokens and extract the features.

        Features emitted
        - "ATTACHMENT" for a token that is a known file extension
        - "LINK" for a token that looks like an http(s) URL
        - "ALL_CAPS" (plus the stemmed word) for a token without lowercase
        - "NUMBER" for a token made of digits only
        - the stemmed, lower-cased word for everything else

        Stopwords and tokens shorter than three characters once punctuation
        is removed are ignored.

        NOTE(review): `translate(None, ...)` is the Python 2 byte-string
        form, so `text` is presumably a Python 2 `str` - confirm before
        running under Python 3.

        :param self: Trainer object
        :param text: Email text from which we will extract features
        :returns: A list which contains the feature set
        """
        features = []
        # Raw string: the same pattern as before, without relying on
        # invalid escape sequences in a normal string literal.
        link = re.compile(
            r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|'
            r'(?:%[0-9a-fA-F][0-9a-fA-F]))+')
        # ^ for detecting whether the string is a link

        # Will use PorterStemmer() for stemming
        porterStemmer = stem.porter.PorterStemmer()

        # Load the stopword list once instead of once per token, and use a
        # set so each membership test is O(1).
        english_stopwords = set(stopwords.words("english"))
        tokens = [
            token for token in text.split()
            if token not in english_stopwords
        ]

        for token in tokens:
            bare_token = token.translate(None, string.punctuation)
            if len(bare_token) < 3:
                continue
            if token.isdigit():
                features.append("NUMBER")
            elif "." + token in mimetypes.types_map:
                # mimetypes.types_map is keyed by extensions such as
                # '.pdf', '.zip', '.exe', '.jpg', '.html'.
                features.append("ATTACHMENT")
            elif token.upper() == token:
                features.append("ALL_CAPS")
                features.append(porterStemmer.stem(bare_token).lower())
            elif link.match(token):
                features.append("LINK")
            else:
                features.append(porterStemmer.stem(bare_token).lower())

        return features

예제 #8

0

파일 보기

    data = list(csv.reader(f, delimiter="\t"))


def clean(s):
    """Return *s* with every ASCII punctuation character removed."""
    # Mapping a character to None makes str.translate delete it.
    drop_punctuation = str.maketrans(dict.fromkeys(string.punctuation))
    return s.translate(drop_punctuation)


def normalize_string(string):
    """Lower-case *string* and strip a fixed set of punctuation fragments.

    The fragments are removed one after another in the listed order; the
    dash entries only match when preceded by a space.
    """
    cleaned = string.lower()
    for fragment in ('.', ',', '!', '"', "'", ':', ' -', ' —', '(', ')'):
        cleaned = cleaned.replace(fragment, '')
    return cleaned


# Every row of `data` is unpacked as a (target, message) pair; messages are
# normalised as they are collected.
y = [target for target, _ in data]
X = [normalize_string(msg) for _, msg in data]

# The first 3900 samples are used for fitting, the remainder for scoring.
split_at = 3900
X_train, X_test = X[:split_at], X[split_at:]
y_train, y_test = y[:split_at], y[split_at:]

model = NaiveBayesClassifier(1)
model.fit(X_train, y_train)
print(model.score(X_test, y_test))

예제 #9

0

파일 보기

파일: server.py 프로젝트: terdenan/cs102

    for row in rows:
        [prediction] = model.predict([row.normal_title])
        if prediction == 'good':
            news.append(row)

    return template('templates/news_recommendations', rows=news)


def get_training_data():
    """Return (titles, labels) for every News row whose label is set.

    Uses the module-level session `s`.
    """
    # `!= None` is deliberate: the comparison is evaluated on the mapped
    # column inside the query filter, not as a Python identity check.
    labelled_rows = s.query(News).filter(News.label != None).all()

    titles, labels = [], []
    for row in labelled_rows:
        titles.append(row.normal_title)
        labels.append(row.label)

    return titles, labels


if __name__ == '__main__':
    # `s` is assigned at module level on purpose: get_training_data() reads
    # it as a global, so the session must exist before that call.
    s = session()
    X_train, y_train = get_training_data()
    # `model` is likewise a module-level name (presumably the one used by
    # the request handlers of this file - verify against them).
    model = NaiveBayesClassifier(1)
    model.fit(X_train, y_train)
    # The server is started only after the model has been fitted.
    run(host='localhost', port=8080)

    # Disabled evaluation snippet: fits on the first `cnt` labelled rows
    # and scores on the remaining ones.
    # print(len(s.query(News).filter(News.label != None).all()))
    # cnt = 183
    # X, y = get_training_data()
    # X_train, y_train, X_test, y_test = X[:cnt], y[:cnt], X[cnt:], y[cnt:]
    # model = NaiveBayesClassifier(1)
    # model.fit(X_train, y_train)
    # print(model.score(X_test, y_test))

예제 #10

0

파일 보기

    return cnf_mat

def split_dataset(dataset: pd.DataFrame, train_frac):
    """Split *dataset* into train/test features and targets.

    A fraction *train_frac* of the rows is sampled with a fixed seed for
    training; all remaining rows form the test set. The 'class' column is
    the target and is removed from the feature frames.

    Returns ``(X_train, X_test, y_train, y_test)``.
    """
    train_rows = dataset.sample(frac=train_frac, random_state=300660)
    test_rows = dataset.drop(train_rows.index)

    features = [rows.drop(columns='class') for rows in (train_rows, test_rows)]
    targets = [rows['class'] for rows in (train_rows, test_rows)]
    return features[0], features[1], targets[0], targets[1]


# reading clean dataset (tab-separated, no header row in the file)
main_df = pd.read_csv(r'seeds_dataset_clean.txt', header=None, sep='\t')
# column names are assigned by hand; the last column, 'class', is the target
main_df.columns = ['area', 'perimeter', 'compactness', 'kernel length',
                    'kernel width', 'asymmetry coef.', 'groove length', 'class']


# two classifiers are created; only `nbc` is used in the loop below
nbc = NaiveBayesClassifier()
gnb = GaussianNB()


# finding best train/(train+test) ratio
# 17 evenly spaced fractions from 0.1 to 0.9, i.e. steps of 0.05
train_fractions = np.linspace(start=0.1, stop=0.9, num=17)

# one accuracy slot per fraction, stored as a (17, 1) column
nbc_prediction_accuracies = np.zeros((17, 1))

for idx, train_frac in enumerate(train_fractions):
    X_train, X_test, y_train, y_test = split_dataset(main_df, train_frac=train_frac)
    # alternatively sklearn.model_selection.train_test_split can be used
    # the same `nbc` object is fitted again on every iteration
    nbc.fit(X_train, y_train)
    predictions = nbc.predict(X_test)
    nbc_prediction_accuracies[idx] = accuracy_score(y_test, predictions)

예제 #11

0

파일 보기

파일: startup.py 프로젝트: mspenn/doc_manager

    corpus_tokens = []
    corpus_labels = []
    for category in corpus.category_list:
        content = Tokenizer.load_category(category)
        if content:
            corpus_tokens.extend(content)
            corpus_labels.extend([corpus.category_list.index(category)] *
                                 len(content))
    feature = Feature()
    feature.make_vsm(corpus_tokens)
    # feature.print_vsm()
    # reduce feature, k==0 means auto detect
    # feature.reducex(corpus_labels, cate_list=corpus.category_list)
    feature.reduce_feature(corpus_labels, k=0)
    feature_id = "feature.txt"
    feature.store(feature_id)

    # classify
    # lib svm
    classifier = LibSvmClassifier(feature_id)
    y_actual, y_predict = classifier.do_classify()
    Classifier.predict_info("Lib SVM", y_actual, y_predict)
    #  sklearn svm
    classifier = SvmClassifier(feature.feature_vec, feature.feature_label)
    y_actual, y_predict = classifier.do_classify()
    Classifier.predict_info("Sklearn SVM", y_actual, y_predict)
    # naive bayes
    classifier = NaiveBayesClassifier(feature.feature_vec,
                                      feature.feature_label)
    y_actual, y_predict = classifier.do_classify()
    Classifier.predict_info("Naive Bayes", y_actual, y_predict)

예제 #12

0

파일 보기

# -*- coding: utf-8 -*-
from classifier import NaiveBayesClassifier

# Build the classifier from the data file and the list of four measurement
# names (presumably the columns it should model - confirm in classifier.py).
nbc = NaiveBayesClassifier(
    "iris-treinamento.txt",
    ['Sepal Length', 'Sepal Width', 'Petal Length', 'Petal Width'],
)

# All six unordered pairs of the four measurements, in plotting order.
vars_combinations = [
    ['Sepal Length', 'Sepal Width'],
    ['Sepal Length', 'Petal Width'],
    ['Sepal Length', 'Petal Length'],
    ['Petal Length', 'Petal Width'],
    ['Petal Length', 'Sepal Width'],
    ['Petal Width', 'Sepal Width'],
]

# One plot_two_var_normal call per pair.
for vars_combination in vars_combinations:
    nbc.plot_two_var_normal(vars_combination)

예제 #13

0

파일 보기

파일: teachme.py 프로젝트: soulaklabs/tofbot

 def load(self, data):
     """Rebuild the classifier on top of a MemoryBackend wrapping *data*."""
     self.classifier = NaiveBayesClassifier(MemoryBackend(data))

예제 #14

0

파일 보기

파일: teachme.py 프로젝트: chmduquesne/tofbot

 def load(self, data):
     """Rebuild the classifier on top of a MemoryBackend wrapping *data*."""
     self.classifier = NaiveBayesClassifier(MemoryBackend(data))

예제 #15

0

파일 보기

    X_train_val, X_test, y_train_val, y_test, ted_ids, X_ted = build_X(
        datapath)
    print("X_train_val shape: {}, X_test shape: {}".format(
        X_train_val.shape, X_test.shape))
    print("y_train_val shape: {}, y_test shape: {}".format(
        y_train_val.shape, y_test.shape))
    X_train, X_val, y_train, y_val = train_test_split(X_train_val,
                                                      y_train_val,
                                                      test_size=0.1,
                                                      random_state=42)
    print("X_train shape: {}, X_val shape: {}".format(X_train.shape,
                                                      X_val.shape))
    print("y_train shape: {}, y_val shape: {}".format(y_train.shape,
                                                      y_val.shape))

    nb_clf = NaiveBayesClassifier()
    nb_clf.fit(X_train, y_train)
    y_pred_val = nb_clf.predict(X_val)
    y_pred_test = nb_clf.predict(X_test)
    print('NB validation acc: {}'.format((y_pred_val == y_val).mean()))
    evaluate(y_test, y_pred_test)

    for k in [1, 5, 9]:
        knn_clf = KNNClassifier(k)
        knn_clf.fit(X_train, y_train)
        y_pred_val = knn_clf.predict(X_val)
        y_pred_test = knn_clf.predict(X_test)
        print('{}-nn validation acc: {}'.format(k,
                                                (y_pred_val == y_val).mean()))
        evaluate(y_test, y_pred_test)

예제 #16

0

파일 보기

from train_test_split import get_train_test_split, get_label_lookup
from feature_extraction import features_from_file
from classifier import NaiveBayesClassifier
import os

ROOT_DIR = os.path.dirname(os.path.realpath(__file__))

if __name__ == '__main__':
    train, test = get_train_test_split(
        os.path.join(ROOT_DIR, 'data', 'emails'), 0.6)
    label_lookup = get_label_lookup(
        os.path.join(ROOT_DIR, 'data', 'labels.txt'))

    nb_classifier = NaiveBayesClassifier()
    training_data = [
        (label_lookup[x],
         features_from_file(os.path.join(ROOT_DIR, 'data', 'emails', x)))
        for x in train
    ]
    nb_classifier.train(training_data)

    true_positive = true_negative = false_positive = false_negative = 0
    for filename in test:
        predicted_label = nb_classifier.classify(
            features_from_file(
                os.path.join(ROOT_DIR, 'data', 'emails', filename)), 'spam',
            'not_spam')
        if predicted_label == 'spam' and label_lookup[filename] == 'spam':
            true_positive += 1
        if predicted_label == 'not_spam' and label_lookup[
                filename] == 'not_spam':