Example #1
from spambayes import hammie
from spambayes.tokenizer import Tokenizer

class Hammie(hammie.Hammie):
    def __init__(self, bayes):
        hammie.Hammie.__init__(self, bayes)
        # Use one shared Tokenizer instance for scoring and training.
        self.tokenizer = Tokenizer()

    def _scoremsg(self, msg, evidence=False):
        # Tokenize with our own tokenizer rather than the module default.
        return self.bayes.spamprob(self.tokenizer.tokenize(msg), evidence)

    def train(self, msg, is_spam, add_header=False):
        self.bayes.learn(self.tokenizer.tokenize(msg), is_spam)
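For context, here is a minimal usage sketch. The PickledClassifier database path and the sample messages are made-up illustrations; the original example only defines the subclass.

from spambayes.storage import PickledClassifier

# Hypothetical usage - the database path and messages are invented.
bayes = PickledClassifier("hammie.db")
h = Hammie(bayes)
h.train("Subject: cheap pills\n\nBuy now!", True)      # train as spam
h.train("Subject: minutes\n\nNotes attached.", False)  # train as ham
print(h._scoremsg("Subject: pills\n\nBuy now!"))       # spam probability in [0, 1]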
Example #2
            message = self.urlCorpus.makeMessage(url_key, fake_message_string)
            self.urlCorpus.addMessage(message)
        else:
            fake_message_string = cached_message.as_string()

        msg = message_from_string(fake_message_string)

        # We don't want to do full header tokenising, as this is
        # optimised for messages, not webpages, so we just do the
        # basic stuff.
        bht = options["Tokenizer", "basic_header_tokenize"]
        bhto = options["Tokenizer", "basic_header_tokenize_only"]
        options["Tokenizer", "basic_header_tokenize"] = True
        options["Tokenizer", "basic_header_tokenize_only"] = True

        tokens = Tokenizer().tokenize(msg)
        pf = options["URLRetriever", "x-web_prefix"]
        tokens = ["%s%s" % (pf, tok) for tok in tokens]

        # Undo the changes (a try/finally variant is sketched below)
        options["Tokenizer", "basic_header_tokenize"] = bht
        options["Tokenizer", "basic_header_tokenize_only"] = bhto
        return tokens

    def _base_url(self, url):
        # To try to speed things up, and to avoid following
        # unique URLs, we convert the URL to as basic a form
        # as we can - so http://www.massey.ac.nz/~tameyer/index.html?you=me
        # would become http://massey.ac.nz and http://id.example.com
        # would become http://example.com
        url += '/'
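
The excerpt cuts off after url += '/'. As a rough idea only, here is a hedged reconstruction of the normalisation the comment describes; it is inferred from the comment's two examples, not taken from the project's code, and the short top-level-label heuristic (len(parts[-1]) < 3, so .ac.nz keeps three labels) is an assumption.

def _base_url(url):
    # Sketch reconstructed from the comment above - not the
    # project's actual implementation.
    if "://" in url:
        scheme, rest = url.split("://", 1)
    else:
        scheme, rest = "http", url
    # Drop the path and query, keeping only the host.
    host = rest.split("/", 1)[0].split("?", 1)[0]
    parts = host.split(".")
    if parts and parts[0] == "www":
        parts = parts[1:]
    # Keep the last two labels, or three when the top-level label is
    # short (e.g. .ac.nz), so massey.ac.nz survives intact.
    keep = 3 if len(parts) >= 3 and len(parts[-1]) < 3 else 2
    return "%s://%s" % (scheme, ".".join(parts[-keep:]))

Under these assumptions, http://www.massey.ac.nz/~tameyer/index.html?you=me maps to http://massey.ac.nz and http://id.example.com maps to http://example.com, matching the comment.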
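One more note on the tokenizing method above: the save/set/restore dance around the two Tokenizer options leaves them flipped if tokenize() raises. A hedged variant of the same code, using only the names from the excerpt and restoring the options in a finally block, looks like this:

# Same option juggling as above, but the options are restored even
# if tokenize() raises.
bht = options["Tokenizer", "basic_header_tokenize"]
bhto = options["Tokenizer", "basic_header_tokenize_only"]
options["Tokenizer", "basic_header_tokenize"] = True
options["Tokenizer", "basic_header_tokenize_only"] = True
try:
    tokens = Tokenizer().tokenize(msg)
finally:
    # Undo the changes whether or not tokenizing succeeded.
    options["Tokenizer", "basic_header_tokenize"] = bht
    options["Tokenizer", "basic_header_tokenize_only"] = bhto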