예제 #1
0
class PluginTeachMe(Plugin):
    """Plugin that learns short canned responses ('jokes') from chat.

    The bot replies when the classifier recognises a trigger message, and
    adjusts its training based on channel feedback ('GG' / 'TG').
    """

    def __init__(self, *args):
        Plugin.__init__(self, *args)
        self.classifier = None
        self.load({})
        self.curr_msg = ''   # message currently being handled
        self.last_msg = ''   # previous message seen on the channel
        self.last_joke = ()  # (trigger tokens, joke label) of the last joke told
        self.just_joked = False  # True right after the bot told a joke

    def load(self, data):
        """Rebuild the classifier from a persisted storage dict."""
        storage_backend = MemoryBackend(data)
        self.classifier = NaiveBayesClassifier(storage_backend)

    def save(self):
        """Return the classifier's raw storage dict for persistence."""
        return self.classifier.storage.data

    def get_what_to_learn(self):
        """Return the joke label named by the current message, or 'None'."""
        if self.curr_msg in ('CMB', 'cmb'):
            return 'CMB'
        if self.curr_msg in ('CTB', 'ctb'):
            return 'CTB'
        if self.curr_msg in ('TWSS', 'twss'):
            return "That's what she said!"
        return 'None'

    def got_congratulated(self):
        """True when the current message praises the last joke."""
        return self.curr_msg in ('GG', 'gg', 'GG Tofbot', 'gg Tofbot')

    def did_bad_joke(self):
        """True when the current message criticises the last joke."""
        return self.curr_msg in ('TG', 'tg', 'TG Tofbot', 'tg Tofbot')

    def handle_msg(self, msg_text, chan, nick):
        """Classify ``msg_text``, possibly tell a joke, and learn from feedback.

        :param msg_text: raw text of the incoming message
        :param chan: channel the message came from (unused here)
        :param nick: nick of the sender (unused here)
        """
        just_joked = self.just_joked
        self.just_joked = False
        self.last_msg = self.curr_msg
        self.curr_msg = msg_text.strip()
        if self.got_congratulated():
            # Positive feedback: reinforce the association that produced
            # the last joke.
            if self.last_joke:
                self.classifier.train(*self.last_joke)
        elif self.did_bad_joke():
            # Negative feedback: retrain the last trigger towards 'None'.
            if self.last_joke:
                self.classifier.train(self.last_joke[0], 'None')
        else:
            scores = self.classifier.classify(self.curr_msg.split())
            joke = 'None'
            if scores:
                joke = scores[0][0]
            if joke != 'None':
                self.say(joke)
                self.last_joke = (self.curr_msg.split(), joke)
                # Fix: mark that we just joked so the *next* message (most
                # likely a reaction to the joke) is not used as training
                # material. Previously this flag was never set to True,
                # making the `just_joked` guard below dead code.
                self.just_joked = True
            else:
                if not just_joked:
                    self.classifier.train(self.last_msg.split(),
                                          self.get_what_to_learn())
예제 #2
0
class PluginTeachMe(Plugin):
    """Plugin that learns short canned responses ("jokes") from chat.

    The bot replies when the classifier recognises a trigger message, and
    adjusts its training based on channel feedback ("GG" / "TG").
    """

    def __init__(self, *args):
        Plugin.__init__(self, *args)
        self.classifier = None
        self.load({})
        self.curr_msg = ""   # message currently being handled
        self.last_msg = ""   # previous message seen on the channel
        self.last_joke = ()  # (trigger tokens, joke label) of the last joke told
        self.just_joked = False  # True right after the bot told a joke

    def load(self, data):
        """Rebuild the classifier from a persisted storage dict."""
        storage_backend = MemoryBackend(data)
        self.classifier = NaiveBayesClassifier(storage_backend)

    def save(self):
        """Return the classifier's raw storage dict for persistence."""
        return self.classifier.storage.data

    def get_what_to_learn(self):
        """Return the joke label named by the current message, or "None"."""
        if self.curr_msg in ("CMB", "cmb"):
            return "CMB"
        if self.curr_msg in ("CTB", "ctb"):
            return "CTB"
        if self.curr_msg in ("TWSS", "twss"):
            return "That's what she said!"
        return "None"

    def got_congratulated(self):
        """True when the current message praises the last joke."""
        return self.curr_msg in ("GG", "gg", "GG Tofbot", "gg Tofbot")

    def did_bad_joke(self):
        """True when the current message criticises the last joke."""
        return self.curr_msg in ("TG", "tg", "TG Tofbot", "tg Tofbot")

    def handle_msg(self, msg_text, chan, nick):
        """Classify ``msg_text``, possibly tell a joke, and learn from feedback.

        :param msg_text: raw text of the incoming message
        :param chan: channel the message came from (unused here)
        :param nick: nick of the sender (unused here)
        """
        just_joked = self.just_joked
        self.just_joked = False
        self.last_msg = self.curr_msg
        self.curr_msg = msg_text.strip()
        if self.got_congratulated():
            # Positive feedback: reinforce the association that produced
            # the last joke.
            if self.last_joke:
                self.classifier.train(*self.last_joke)
        elif self.did_bad_joke():
            # Negative feedback: retrain the last trigger towards "None".
            if self.last_joke:
                self.classifier.train(self.last_joke[0], "None")
        else:
            scores = self.classifier.classify(self.curr_msg.split())
            joke = "None"
            if scores:
                joke = scores[0][0]
            if joke != "None":
                self.say(joke)
                self.last_joke = (self.curr_msg.split(), joke)
                # Fix: mark that we just joked so the *next* message (most
                # likely a reaction to the joke) is not used as training
                # material. Previously this flag was never set to True,
                # making the `just_joked` guard below dead code.
                self.just_joked = True
            else:
                if not just_joked:
                    self.classifier.train(self.last_msg.split(), self.get_what_to_learn())
예제 #3
0
# Contact addresses of the module authors (dunder metadata, no runtime effect).
__email__ = '[email protected], [email protected]'

from classifier import NaiveBayesClassifier
from averageVectorFeatureExtractor import AverageVectorFeatureExtractor
from parser import Parser
from util import Util

if __name__ == '__main__':
    # Train a Naive Bayes classifier on the yes/no corpus, evaluate it on
    # the held-out test split, and report per-class highest-likelihood
    # feature sets.
    import time

    # time.clock() was deprecated in 3.3 and removed in Python 3.8;
    # perf_counter() is the documented replacement for wall-clock timing.
    start = time.perf_counter()
    parser = Parser('part1data/yes_train.txt', 'part1data/no_train.txt')
    extractor = AverageVectorFeatureExtractor()
    classifier = NaiveBayesClassifier(smoothing=0.25)
    classifier.train(extractor.items(parser.items()))
    print('Training time: ' + str((time.perf_counter() - start) * 1000) + 'ms')

    evaluationData = Parser('part1data/yes_test.txt', 'part1data/no_test.txt')
    confusion_matrix, acc = classifier.evaluate(
        extractor.items(evaluationData.items()))
    Util.print_confusion_matrix(confusion_matrix, 2, 2)
    print('Overall accuracy: ', round(acc * 100, 2))

    # Sort labels so the report order is deterministic across runs.
    labels = sorted(classifier.highest_likely_examples.keys())
    for label in labels:
        features, _ = classifier.highest_likely_examples[label]
        print('Highest likelihood for class: ', label)
        Util.print_as_string(features, 25, 10)
        print('\n')
예제 #4
0
파일: train.py 프로젝트: see0/plino
class Trainer(object):

    """
    The trainer class
    """

    def __init__(self,
                 directory=os.path.abspath(
                     os.path.join('.', 'data', 'corpus1')),
                 spam='spam',
                 ham='ham',
                 limit=1500
                 ):
        """
        :param self: Trainer object
        :param directory: location of the training dataset
        :param spam: the sub directory inside the 'directory' which has spam
        :param ham: the sub directory inside the 'directory' which has ham
        :param limit: The maximum number of mails, the classifier should \
                      be trained over with
        """

        self.spamdir = os.path.join(directory, spam)
        self.hamdir = os.path.join(directory, ham)
        self.limit = limit

        self.classifier = NaiveBayesClassifier()

    def train_classifier(self, path, label, verbose):
        """
        The function doing the actual classification here.

        :param self: Trainer object
        :param path: The path of the data directory
        :param label: The label underwhich the data directory is
        :param verbose: Decides the verbosity of the messages to be shown
        """

        limit = len(os.listdir(path)) < self.limit and len(os.listdir(path)) \
            or self.limit

        if verbose:
            print colored("Training {0} emails in {1} class".format(
                limit, label
            ), 'green'
            )
            logging.debug("Training {0} emails in {1} class".format(
                limit, label
            )
            )

        # changing the path to that particular directory
        os.chdir(path)

        for email in os.listdir(path)[:self.limit]:
            if verbose and verbose > 1:
                print colored("Processing file: {0}".format(email), 'green')
                logging.info("Processing file: {0}".format(email))
            email_file = open(email, 'r')  # explicit better than implicit
            email_text = email_file.read()

            """
            Don't even get me started on the Unicode issues that I faced
            here. Thankfullly 'BeautifulSoup' was there to our rescue.

            Thanks to Leonard Richardson for this module
            """

            try:
                email_text = bs4.UnicodeDammit.detwingle(
                    email_text).decode('utf-8')
            except:
                print colored("Skipping file {0} due to bad encoding".format(email), 'red')
                logging.error("Skipping file {0} due to bad encoding".format(
                    os.path.join(path, email)
                )
                )
                continue

            email_file.close()
            email_text = email_text.encode("ascii", "ignore")

            # Extracting the features from the text
            features = self.extract_features(email_text)

            # Training the classifier
            self.classifier.train(features, label)

        """prints the __str__ overridden method in the class
        'NaiveBayesClassier'
        """
        print self.classifier

    def train(self, verbose=False):
        """
        :param self: Trainer object
        :param verbose: Printing more details when
                        Defaults to False
        """
        self.train_classifier(self.spamdir, 'spam', verbose)
        self.train_classifier(self.hamdir, 'ham', verbose)

        return self.classifier

    def extract_features(self, text):
        """
        Will convert the document into tokens and extract the features.

        Possible features
        - Attachments
        - Links in text
        - CAPSLOCK words
        - Numbers
        - Words in text

        So these are some possible features which would make an email a SPAM

        :param self: Trainer object
        :param text: Email text from which we will extract features
        :returns: A list which contains the feature set
        """
        features = []
        tokens = text.split()
        link = re.compile(
            'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
        # ^ for detecting whether the string is a link

        # Will use PorterStemmer() for stemming
        porterStemmer = stem.porter.PorterStemmer()

        # cleaning out the stopwords
        tokens = [
            token for token in tokens if token not in stopwords.words(
                "english"
            )
        ]

        for token in tokens:
            if len(token.translate(None, string.punctuation)) < 3:
                continue
            if token.isdigit():
                features.append("NUMBER")
            elif "." + token in mimetypes.types_map.keys():
                """
                >>> import mimetypes
                >>> mimetypes.types_map.keys()
                ['.obj', '.ra', '.wsdl', '.dll', '.ras', '.ram', '.bcpio',
                 '.sh', '.m1v', '.xwd', '.doc', '.bmp', '.shar', '.js',
                 '.src', '.dvi', '.aif', '.ksh', '.dot', '.mht', '.p12',
                 '.css', '.csh', '.pwz', '.pdf', '.cdf', '.pl', '.ai',
                 '.jpe', '.jpg', '.py', '.xml', '.jpeg', '.ps', '.gtar',
                 '.xpm', '.hdf', '.nws', '.tsv', '.xpdl', '.p7c', '.ico',
                 '.eps', '.ief', '.so', '.xlb', '.pbm', '.texinfo', '.xls',
                 '.tex', '.rtx', '.html', '.aiff', '.aifc', '.exe', '.sgm',
                 '.tif', '.mpeg', '.ustar', '.gif', '.ppt', '.pps', '.sgml',
                 '.ppm', '.latex', '.bat', '.mov', '.ppa', '.tr', '.rdf',
                 '.xsl', '.eml', '.nc', '.sv4cpio', '.bin', '.h', '.tcl',
                 '.wiz', '.o', '.a', '.c', '.wav', '.vcf', '.xbm', '.txt',
                 '.au', '.t', '.tiff', '.texi', '.oda', '.ms', '.rgb', '.me',
                 '.sv4crc', '.qt', '.mpa', '.mpg', '.mpe', '.avi', '.pgm',
                 '.pot', '.mif', '.roff', '.htm', '.man', '.etx', '.zip',
                 '.movie', '.pyc', '.png', '.pfx', '.mhtml', '.tar', '.pnm',
                 '.pyo', '.snd', '.cpio', '.swf', '.mp3', '.mp2', '.mp4']
                >>>
                """
                features.append("ATTACHMENT")
            elif token.upper() == token:
                features.append("ALL_CAPS")
                features.append(
                    porterStemmer.stem(
                        token.translate(None, string.punctuation)
                    ).lower()
                )
            elif link.match(token):
                features.append("LINK")
            else:
                features.append(
                    porterStemmer.stem(token.translate(
                        None, string.punctuation
                    )
                    ).lower()
                )

        return features
예제 #5
0
파일: train.py 프로젝트: BikerDroid/plino
class Trainer(object):

    """
    The trainer class
    """

    def __init__(self,
                 directory=os.path.abspath(
                     os.path.join('.', 'data', 'corpus1')),
                 spam='spam',
                 ham='ham',
                 limit=1500
                 ):
        """
        :param self: Trainer object
        :param directory: location of the training dataset
        :param spam: the sub directory inside the 'directory' which has spam
        :param ham: the sub directory inside the 'directory' which has ham
        :param limit: The maximum number of mails, the classifier should \
                      be trained over with
        """

        self.spamdir = os.path.join(directory, spam)
        self.hamdir = os.path.join(directory, ham)
        self.limit = limit

        self.classifier = NaiveBayesClassifier()

    def train_classifier(self, path, label, verbose):
        """
        The function doing the actual classification here.

        :param self: Trainer object
        :param path: The path of the data directory
        :param label: The label underwhich the data directory is
        :param verbose: Decides the verbosity of the messages to be shown
        """

        limit = len(os.listdir(path)) < self.limit and len(os.listdir(path)) \
            or self.limit

        if verbose:
            print colored("Training {0} emails in {1} class".format(
                limit, label
            ), 'green'
            )
            logging.debug("Training {0} emails in {1} class".format(
                limit, label
            )
            )

        # changing the path to that particular directory
        os.chdir(path)

        for email in os.listdir(path)[:self.limit]:
            if verbose and verbose > 1:
                print colored("Processing file: {0}".format(email), 'green')
                logging.info("Processing file: {0}".format(email))
            email_file = open(email, 'r')  # explicit better than implicit
            email_text = email_file.read()

            """
            Don't even get me started on the Unicode issues that I faced
            here. Thankfullly 'BeautifulSoup' was there to our rescue.

            Thanks to Leonard Richardson for this module
            """

            try:
                email_text = bs4.UnicodeDammit.detwingle(
                    email_text).decode('utf-8')
            except:
                print colored("Skipping file {0} due to bad encoding".format(email), 'red')
                logging.error("Skipping file {0} due to bad encoding".format(
                    os.path.join(path, email)
                )
                )
                continue

            email_file.close()
            email_text = email_text.encode("ascii", "ignore")

            # Extracting the features from the text
            features = self.extract_features(email_text)

            # Training the classifier
            self.classifier.train(features, label)

        """prints the __str__ overridden method in the class
        'NaiveBayesClassier'
        """
        print self.classifier

    def train(self, verbose=False):
        """
        :param self: Trainer object
        :param verbose: Printing more details when
                        Defaults to False
        """
        self.train_classifier(self.spamdir, 'spam', verbose)
        self.train_classifier(self.hamdir, 'ham', verbose)

        return self.classifier

    def extract_features(self, text):
        """
        Will convert the document into tokens and extract the features.

        Possible features
        - Attachments
        - Links in text
        - CAPSLOCK words
        - Numbers
        - Words in text

        So these are some possible features which would make an email a SPAM

        :param self: Trainer object
        :param text: Email text from which we will extract features
        :returns: A list which contains the feature set
        """
        features = []
        tokens = text.split()
        link = re.compile(
            'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
        # ^ for detecting whether the string is a link

        # Will use PorterStemmer() for stemming
        porterStemmer = stem.porter.PorterStemmer()

        # cleaning out the stopwords
        tokens = [
            token for token in tokens if token not in stopwords.words(
                "english"
            )
        ]

        for token in tokens:
            if len(token.translate(None, string.punctuation)) < 3:
                continue
            if token.isdigit():
                features.append("NUMBER")
            elif "." + token in mimetypes.types_map.keys():
                """
                >>> import mimetypes
                >>> mimetypes.types_map.keys()
                ['.obj', '.ra', '.wsdl', '.dll', '.ras', '.ram', '.bcpio',
                 '.sh', '.m1v', '.xwd', '.doc', '.bmp', '.shar', '.js',
                 '.src', '.dvi', '.aif', '.ksh', '.dot', '.mht', '.p12',
                 '.css', '.csh', '.pwz', '.pdf', '.cdf', '.pl', '.ai',
                 '.jpe', '.jpg', '.py', '.xml', '.jpeg', '.ps', '.gtar',
                 '.xpm', '.hdf', '.nws', '.tsv', '.xpdl', '.p7c', '.ico',
                 '.eps', '.ief', '.so', '.xlb', '.pbm', '.texinfo', '.xls',
                 '.tex', '.rtx', '.html', '.aiff', '.aifc', '.exe', '.sgm',
                 '.tif', '.mpeg', '.ustar', '.gif', '.ppt', '.pps', '.sgml',
                 '.ppm', '.latex', '.bat', '.mov', '.ppa', '.tr', '.rdf',
                 '.xsl', '.eml', '.nc', '.sv4cpio', '.bin', '.h', '.tcl',
                 '.wiz', '.o', '.a', '.c', '.wav', '.vcf', '.xbm', '.txt',
                 '.au', '.t', '.tiff', '.texi', '.oda', '.ms', '.rgb', '.me',
                 '.sv4crc', '.qt', '.mpa', '.mpg', '.mpe', '.avi', '.pgm',
                 '.pot', '.mif', '.roff', '.htm', '.man', '.etx', '.zip',
                 '.movie', '.pyc', '.png', '.pfx', '.mhtml', '.tar', '.pnm',
                 '.pyo', '.snd', '.cpio', '.swf', '.mp3', '.mp2', '.mp4']
                >>>
                """
                features.append("ATTACHMENT")
            elif token.upper() == token:
                features.append("ALL_CAPS")
                features.append(
                    porterStemmer.stem(
                        token.translate(None, string.punctuation)
                    ).lower()
                )
            elif link.match(token):
                features.append("LINK")
            else:
                features.append(
                    porterStemmer.stem(token.translate(
                        None, string.punctuation
                    )
                    ).lower()
                )

        return features
예제 #6
0
# Absolute directory containing this script; used below to build paths to
# the bundled data files regardless of the current working directory.
ROOT_DIR = os.path.dirname(os.path.realpath(__file__))

if __name__ == '__main__':
    train, test = get_train_test_split(
        os.path.join(ROOT_DIR, 'data', 'emails'), 0.6)
    label_lookup = get_label_lookup(
        os.path.join(ROOT_DIR, 'data', 'labels.txt'))

    nb_classifier = NaiveBayesClassifier()
    training_data = [
        (label_lookup[x],
         features_from_file(os.path.join(ROOT_DIR, 'data', 'emails', x)))
        for x in train
    ]
    nb_classifier.train(training_data)

    true_positive = true_negative = false_positive = false_negative = 0
    for filename in test:
        predicted_label = nb_classifier.classify(
            features_from_file(
                os.path.join(ROOT_DIR, 'data', 'emails', filename)), 'spam',
            'not_spam')
        if predicted_label == 'spam' and label_lookup[filename] == 'spam':
            true_positive += 1
        if predicted_label == 'not_spam' and label_lookup[
                filename] == 'not_spam':
            true_negative += 1
        if predicted_label == 'spam' and label_lookup[filename] == 'not_spam':
            false_positive += 1
        if predicted_label == 'not_spam' and label_lookup[filename] == 'spam':