Code example #1 — file: shared_test.py (project: jahlives/fuglu)
class SuspectFilterTestCase(unittest.TestCase):
    """Tests for SuspectFilter: rule-file matching, field extraction and HTML stripping."""

    def setUp(self):
        # all tests share one filter built from the bundled rule file
        self.candidate = SuspectFilter(TESTDATADIR + '/headertest.regex')

    def tearDown(self):
        pass

    def test_sf_get_args(self):
        """Test SuspectFilter files"""
        suspect = Suspect('*****@*****.**',
                          '*****@*****.**',
                          TESTDATADIR + '/helloworld.eml')
        suspect.tags['testtag'] = 'testvalue'

        headermatches = self.candidate.get_args(suspect)
        # assertIn/assertNotIn give clearer failure output than
        # assertTrue('x' in y)
        self.assertIn('Sent to unittest domain!', headermatches,
                      "To_domain not found in headercheck")
        self.assertIn('Envelope sender is [email protected]', headermatches,
                      "Envelope Sender not matched in header check")
        self.assertIn('Mime Version is 1.0', headermatches,
                      "Standard header Mime Version not found")
        self.assertIn('A tag match', headermatches,
                      "Tag match did not work")
        self.assertIn('Globbing works', headermatches,
                      "header globbing failed")
        self.assertIn('body rule works', headermatches,
                      "decoded body rule failed")
        self.assertIn('full body rule works', headermatches,
                      "full body failed")
        self.assertIn('mime rule works', headermatches, "mime rule failed")
        self.assertNotIn('this should not match in a body rule', headermatches,
                         'decoded body rule matched raw body')

        # perl style advanced rules
        self.assertIn('perl-style /-notation works!', headermatches,
                      "new rule format failed: %s" % headermatches)
        self.assertIn('perl-style recipient match', headermatches,
                      "new rule format failed for to_domain: %s" % headermatches)
        self.assertNotIn('this should not match', headermatches,
                         "rule flag ignorecase was not detected")

        # TODO: raw body rules

    def test_sf_matches(self):
        """Test SuspectFilter extended matches"""

        suspect = Suspect('*****@*****.**',
                          '*****@*****.**',
                          TESTDATADIR + '/helloworld.eml')

        (match, info) = self.candidate.matches(suspect, extended=True)
        self.assertTrue(match, 'Match should return True')
        field, matchedvalue, arg, regex = info
        self.assertEqual(field, 'to_domain')
        self.assertEqual(matchedvalue, 'unittests.fuglu.org')
        self.assertEqual(arg, 'Sent to unittest domain!')
        # raw string: '\.' is an invalid escape sequence in a plain literal
        self.assertEqual(regex, r'unittests\.fuglu\.org')

    def test_sf_get_field(self):
        """Test SuspectFilter field extract"""
        suspect = Suspect('*****@*****.**',
                          '*****@*****.**',
                          TESTDATADIR + '/helloworld.eml')

        # additional field tests
        self.assertEqual(
            self.candidate.get_field(suspect, 'clienthelo')[0], 'helo1')
        self.assertEqual(
            self.candidate.get_field(suspect, 'clientip')[0], '10.0.0.1')
        self.assertEqual(
            self.candidate.get_field(suspect, 'clienthostname')[0], 'rdns1')

    def test_strip(self):
        """HTML stripping must drop tags/scripts and keep visible text, for both parsers."""
        html = """foo<a href="bar">bar</a><script language="JavaScript">echo('hello world');</script>baz"""

        declarationtest = """<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
    "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="de">
  <head>
    <title>greetings</title>
  </head>
  <body>
    <font color="red">well met!</font>
  </body>
</html>
"""
        # word generated empty message
        wordhtml = """<html xmlns:v=3D"urn:schemas-microsoft-com:vml"
xmlns:o=3D"urn:schemas-microsoft-com:office:office"
xmlns:w=3D"urn:schemas-microsoft-com:office:word"
xmlns:m=3D"http://schemas.microsoft.com/office/2004/12/omml"
xmlns=3D"http://www.w3.org/TR/REC-html40"><head><META
HTTP-EQUIV=3D"Content-Type" CONTENT=3D"text/html;
charset=3Dus-ascii"><meta name=3DGenerator content=3D"Microsoft Word 15
(filtered medium)"><style><!--
/* Font Definitions */
@font-face
	{font-family:"Cambria Math";
	panose-1:2 4 5 3 5 4 6 3 2 4;}
@font-face
	{font-family:Calibri;
	panose-1:2 15 5 2 2 2 4 3 2 4;}
/* Style Definitions */
p.MsoNormal, li.MsoNormal, div.MsoNormal
	{margin:0cm;
	margin-bottom:.0001pt;
	font-size:11.0pt;
	font-family:"Calibri",sans-serif;
	mso-fareast-language:EN-US;}
a:link, span.MsoHyperlink
	{mso-style-priority:99;
	color:#0563C1;
	text-decoration:underline;}
a:visited, span.MsoHyperlinkFollowed
	{mso-style-priority:99;
	color:#954F72;
	text-decoration:underline;}
span.E-MailFormatvorlage17
	{mso-style-type:personal-compose;
	font-family:"Calibri",sans-serif;
	color:windowtext;}
.MsoChpDefault
	{mso-style-type:export-only;
	font-family:"Calibri",sans-serif;
	mso-fareast-language:EN-US;}
@page WordSection1
	{size:612.0pt 792.0pt;
	margin:70.85pt 70.85pt 2.0cm 70.85pt;}
div.WordSection1
	{page:WordSection1;}
--></style><!--[if gte mso 9]><xml>
<o:shapedefaults v:ext=3D"edit" spidmax=3D"1026" />
</xml><![endif]--><!--[if gte mso 9]><xml>
<o:shapelayout v:ext=3D"edit">
<o:idmap v:ext=3D"edit" data=3D"1" />
</o:shapelayout></xml><![endif]--></head><body lang=3DDE-CH
link=3D"#0563C1" vlink=3D"#954F72"><div class=3DWordSection1><p
class=3DMsoNormal><o:p> </o:p></p></div></body></html>"""

        # exercise both the BeautifulSoup and the fallback stripper
        for use_bfs in [True, False]:
            stripped = self.candidate.strip_text(html, use_bfs=use_bfs)
            self.assertEqual(stripped, 'foobarbaz')

            docstripped = self.candidate.strip_text(declarationtest,
                                                    use_bfs=use_bfs)
            self.assertEqual(docstripped.split(),
                             ['greetings', 'well', 'met!'])

            wordhtmstripped = self.candidate.strip_text(wordhtml,
                                                        use_bfs=use_bfs)
            self.assertEqual(wordhtmstripped.strip(), '')
Code example #2
class RateLimitPlugin(ScannerPlugin):
    """This is a generic rolling window rate limiting plugin. It allows limiting the amount of accepted messages based on any combination of supported SuspectFilter fields.
    This means you could for example limit the number of similar subjects by sender domain to implement a simple bulk filter.

    Important notes:
        - This plugin is experimental and has not been tested in production
        - This plugin only makes sense in pre-queue mode.
        - The content filter stage is usually *not* the best place to implement rate-limiting.
          Faster options are postfix built-in rate limits or a policy access daemon
          which doesn't need to accept the full message to make a decision
        - the backends don't automatically perform global expiration of all events.
          Old entries are only cleared per event the next time the same event happens.
          Add a cron job for your backend to clear all old events from time to time.

    Supported backends:
        - memory: stores events in memory. Do not use this in production.
        - sqlalchemy: Stores events in a SQL database. Recommended for small/low-traffic setups
        - redis: stores events in a redis database. This is the fastest and therefore recommended backend.

    Configuration example for redis. Prerequisite: python redis module
        backendtype = redis
        backendconfig = localhost:6379:0

    Configuration example for mysql: Prerequisite: python sqlalchemy module. The database must exist. The table will be created automatically.
        backendtype = sqlalchemy
        backendconfig = mysql://root@localhost/fuglu

    ratelimit.conf format: (not final yet)

    Each limiter is defined by a line which must match the following format. Each limiter is evaluated in the order specified.

    limit name=**name** rate=**max**/**timeframe** fields=**fieldlist** [match=/**filter regex**/ [skip=**skiplist** ]] action=**action** message=**message**

        **name**        : a descriptive name for this filter, one word. Required to reference in skip lists
        **max**         : the maximum number of events that may occur in the specified timeframe before an action is limited.
                          Specify a negative value to indicate "no limit"
        **timeframe**   : Timeframe for the limit
        **fields**      : comma separated list of fields which should be used as unique values to limit
        **match** (optional): regular expression to apply to the actual values. The limiter is only applied if this regular expression matches.
                              If the limiter consists of multiple input fields,
                              The regex will be applied to the comma separated list of field values.
        **skip** (optional):  Comma separated list of subsequent limiter names, that should be skipped if this limiter's regex matched the input values.
                              Used for overrides.
        **action**      : Action that should be performed if the limit is exceeded. ( REJECT / DEFER / ... )
        **message**     : Message returned to the connecting client


    Examples:

    # no sending limit for our newsletter
    limit name=newsletter rate=-1/1 fields=from_address match=/^newsletter@example\.com$/ skip=fromaddr,serverhelo action=DUNNO message=OK

    # max 10 messages in 30 seconds per unique sender address:
    limit name=fromaddr rate=10/30 fields=from_address action=REJECT message=Too many messages from ${from_address}

    # max 100 messages with same subject per hour per server helo
    limit name=serverhelo rate=100/3600 fields=clienthelo,subject action=REJECT message=Bulk message detected

    """
    def __init__(self, config, section=None):
        ScannerPlugin.__init__(self, config, section)
        self.requiredvars = {
            'limiterfile': {
                'default': '/etc/fuglu/ratelimit.conf',
                'description': 'file based rate limits',
            },
            'backendtype': {
                'default':
                'memory',
                'description':
                'type of backend where the events are stored. memory is only recommended for low traffic standalone systems. alternatives are: redis, sqlalchemy'
            },
            'backendconfig': {
                'default':
                '',
                'description':
                'backend specific configuration. sqlalchemy: the database url, redis: hostname:port:db'
            }
        }

        self.logger = self._logger()
        # backend and limiter list are created lazily on the first examine()
        self.backend_instance = None
        self.limiters = None
        self.filter = SuspectFilter(None)

    #TODO: make action and message optional
    def load_limiter_config(self, text):
        """Parse limiter definitions from *text* (one definition per line).

        Lines that are empty or start with '#' are ignored; lines that do not
        match the expected format are logged and skipped. Returns a list of
        Limiter objects in the order they were defined.
        """
        patt = re.compile(
            r'^limit\s+name=(?P<name>[^\s]+)\s+rate=(?P<max>\-?\d{1,10})\/(?P<time>\d{1,10})\s+fields=(?P<fieldlist>[^\s]+)(\s+match=\/(?P<matchregex>.+)\/(\s+skip=(?P<skiplist>[^\s]+))?)?\s+action=(?P<action>[^\s]+)\s+message=(?P<message>.*)$'
        )
        limiters = []
        for lineno, line in enumerate(text.split('\n'), 1):
            line = line.strip()
            if not line or line.startswith('#'):
                continue
            match = patt.match(line)
            if match is None:
                self.logger.error('cannot parse limiter config line %s' %
                                  lineno)
                continue
            gdict = match.groupdict()
            limiter = Limiter()
            limiter.name = gdict['name']
            limiter.max = int(gdict['max'])
            limiter.timespan = int(gdict['time'])
            limiter.fields = gdict['fieldlist'].split(',')
            limiter.regex = gdict['matchregex']
            if gdict['skiplist'] is not None:
                limiter.skip = gdict['skiplist'].split(',')
            action = string_to_actioncode(gdict['action'])
            if action is None:
                # NOTE(review): the limiter is still appended with
                # action=None after an invalid action - preserved for
                # backward compatibility, but probably worth a 'continue'
                self.logger.error(
                    "Limiter config line %s : invalid action %s" %
                    (lineno, gdict['action']))
            limiter.action = action
            limiter.message = gdict['message']
            limiters.append(limiter)
        return limiters

    def examine(self, suspect):
        """Apply all configured limiters to *suspect* in definition order.

        Returns (action, message) for the first limiter whose count exceeds
        its maximum, or None if no limit was hit (or config is unusable).
        """
        if self.limiters is None:
            filename = self.config.get(self.section, 'limiterfile')
            if not os.path.exists(filename):
                self.logger.error("Limiter config file %s not found" %
                                  filename)
                return
            # close the file handle instead of leaking it
            with open(filename, 'r') as fp:
                limiterconfig = fp.read()
            limiters = self.load_limiter_config(limiterconfig)
            self.limiters = limiters
            self.logger.info("Found %s limiter configurations" %
                             (len(limiters)))

        if self.backend_instance is None:
            btype = self.config.get(self.section, 'backendtype')
            if btype not in AVAILABLE_RATELIMIT_BACKENDS:
                self.logger.error('ratelimit backend %s not available' %
                                  (btype))
                return
            self.backend_instance = AVAILABLE_RATELIMIT_BACKENDS[btype](
                self.config.get(self.section, 'backendconfig'))

        skiplist = []
        for limiter in self.limiters:
            if limiter.name in skiplist:  # check if this limiter is skipped by a previous one
                self.logger.debug('limiter %s skipped due to previous match' %
                                  limiter.name)
                continue

            # get field values; a limiter only applies when every field
            # it references is available on this suspect
            allfieldsavailable = True
            fieldvalues = []
            for fieldname in limiter.fields:
                values = self.filter.get_field(suspect, fieldname)
                if not values:
                    allfieldsavailable = False
                    self.logger.debug(
                        'Skipping limiter %s - field %s not available' %
                        (limiter.name, fieldname))
                    break
                fieldvalues.append(values[0])
            if not allfieldsavailable:  # rate limit can not be applied
                continue

            checkval = ','.join(fieldvalues)
            if limiter.regex is not None:
                if re.match(limiter.regex, checkval):
                    if limiter.skip is not None:
                        skiplist.extend(limiter.skip)
                else:  # no match, skip this limiter
                    self.logger.debug(
                        'Skipping limiter %s - regex does not match' %
                        (limiter.name))
                    continue
            eventname = limiter.name + checkval
            timespan = limiter.timespan
            maxcount = limiter.max  # renamed: don't shadow builtin max()
            if maxcount < 0:  # negative rate means "no limit"
                continue
            event_count = self.backend_instance.check_count(
                eventname, timespan)
            self.logger.debug("Limiter event %s  count: %s" %
                              (eventname, event_count))
            if event_count > maxcount:
                return limiter.action, apply_template(limiter.message, suspect)
Code example #3 — file: shared_test.py (project: steigr/fuglu)
class SuspectFilterTestCase(unittest.TestCase):

    """Tests for the header filter: rule matching, field extraction, HTML stripping."""

    def setUp(self):
        # all tests share one filter built from the bundled rule file
        self.candidate = SuspectFilter(TESTDATADIR + '/headertest.regex')

    def tearDown(self):
        pass

    def test_sf_get_args(self):
        """Test SuspectFilter files"""
        suspect = Suspect('*****@*****.**',
                          '*****@*****.**', TESTDATADIR + '/helloworld.eml')
        suspect.tags['testtag'] = 'testvalue'

        headermatches = self.candidate.get_args(suspect)
        # assertIn/assertNotIn give clearer failure output than
        # assertTrue('x' in y)
        self.assertIn('Sent to unittest domain!', headermatches,
                      "To_domain not found in headercheck")
        self.assertIn('Envelope sender is [email protected]' , headermatches,
                      "Envelope Sender not matched in header check")
        self.assertIn('Mime Version is 1.0', headermatches,
                      "Standard header Mime Version not found")
        self.assertIn('A tag match', headermatches, "Tag match did not work")
        self.assertIn('Globbing works', headermatches,
                      "header globbing failed")
        self.assertIn('body rule works', headermatches,
                      "decoded body rule failed")
        self.assertIn('full body rule works', headermatches,
                      "full body failed")
        self.assertIn('mime rule works', headermatches, "mime rule failed")
        self.assertNotIn('this should not match in a body rule', headermatches,
                         'decoded body rule matched raw body')

        # perl style advanced rules
        self.assertIn('perl-style /-notation works!', headermatches,
                      "new rule format failed: %s" % headermatches)
        self.assertIn('perl-style recipient match', headermatches,
                      "new rule format failed for to_domain: %s" % headermatches)
        self.assertNotIn('this should not match', headermatches,
                         "rule flag ignorecase was not detected")

        # TODO: raw body rules

    def test_sf_matches(self):
        """Test SuspectFilter extended matches"""

        suspect = Suspect('*****@*****.**',
                          '*****@*****.**', TESTDATADIR + '/helloworld.eml')

        (match, info) = self.candidate.matches(suspect, extended=True)
        self.assertTrue(match, 'Match should return True')
        field, matchedvalue, arg, regex = info
        self.assertEqual(field, 'to_domain')
        self.assertEqual(matchedvalue, 'unittests.fuglu.org')
        self.assertEqual(arg, 'Sent to unittest domain!')
        # raw string: '\.' is an invalid escape sequence in a plain literal
        self.assertEqual(regex, r'unittests\.fuglu\.org')

    def test_sf_get_field(self):
        """Test SuspectFilter field extract"""
        suspect = Suspect('*****@*****.**',
                          '*****@*****.**', TESTDATADIR + '/helloworld.eml')

        # additional field tests
        self.assertEqual(self.candidate.get_field(
            suspect, 'clienthelo')[0], 'helo1')
        self.assertEqual(self.candidate.get_field(
            suspect, 'clientip')[0], '10.0.0.1')
        self.assertEqual(self.candidate.get_field(
            suspect, 'clienthostname')[0], 'rdns1')

    def test_strip(self):
        """HTML stripping must drop tags/scripts and keep visible text, for both parsers."""
        html = """foo<a href="bar">bar</a><script language="JavaScript">echo('hello world');</script>baz"""

        declarationtest = """<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
    "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="de">
  <head>
    <title>greetings</title>
  </head>
  <body>
    <font color="red">well met!</font>
  </body>
</html>
"""
        # word generated empty message
        wordhtml = """<html xmlns:v=3D"urn:schemas-microsoft-com:vml"
xmlns:o=3D"urn:schemas-microsoft-com:office:office"
xmlns:w=3D"urn:schemas-microsoft-com:office:word"
xmlns:m=3D"http://schemas.microsoft.com/office/2004/12/omml"
xmlns=3D"http://www.w3.org/TR/REC-html40"><head><META
HTTP-EQUIV=3D"Content-Type" CONTENT=3D"text/html;
charset=3Dus-ascii"><meta name=3DGenerator content=3D"Microsoft Word 15
(filtered medium)"><style><!--
/* Font Definitions */
@font-face
	{font-family:"Cambria Math";
	panose-1:2 4 5 3 5 4 6 3 2 4;}
@font-face
	{font-family:Calibri;
	panose-1:2 15 5 2 2 2 4 3 2 4;}
/* Style Definitions */
p.MsoNormal, li.MsoNormal, div.MsoNormal
	{margin:0cm;
	margin-bottom:.0001pt;
	font-size:11.0pt;
	font-family:"Calibri",sans-serif;
	mso-fareast-language:EN-US;}
a:link, span.MsoHyperlink
	{mso-style-priority:99;
	color:#0563C1;
	text-decoration:underline;}
a:visited, span.MsoHyperlinkFollowed
	{mso-style-priority:99;
	color:#954F72;
	text-decoration:underline;}
span.E-MailFormatvorlage17
	{mso-style-type:personal-compose;
	font-family:"Calibri",sans-serif;
	color:windowtext;}
.MsoChpDefault
	{mso-style-type:export-only;
	font-family:"Calibri",sans-serif;
	mso-fareast-language:EN-US;}
@page WordSection1
	{size:612.0pt 792.0pt;
	margin:70.85pt 70.85pt 2.0cm 70.85pt;}
div.WordSection1
	{page:WordSection1;}
--></style><!--[if gte mso 9]><xml>
<o:shapedefaults v:ext=3D"edit" spidmax=3D"1026" />
</xml><![endif]--><!--[if gte mso 9]><xml>
<o:shapelayout v:ext=3D"edit">
<o:idmap v:ext=3D"edit" data=3D"1" />
</o:shapelayout></xml><![endif]--></head><body lang=3DDE-CH
link=3D"#0563C1" vlink=3D"#954F72"><div class=3DWordSection1><p
class=3DMsoNormal><o:p> </o:p></p></div></body></html>"""

        # exercise both the BeautifulSoup and the fallback stripper
        for use_bfs in [True, False]:
            stripped = self.candidate.strip_text(html, use_bfs=use_bfs)
            self.assertEqual(stripped, 'foobarbaz')

            docstripped = self.candidate.strip_text(
                declarationtest, use_bfs=use_bfs)
            self.assertEqual(
                docstripped.split(), ['greetings', 'well', 'met!'])

            wordhtmstripped = self.candidate.strip_text(
                wordhtml, use_bfs=use_bfs)
            self.assertEqual(wordhtmstripped.strip(), '')
Code example #4
class BayesPlugin(object):
    """Naive Bayes spam classifier backed by a pluggable token store."""

    def __init__(self):
        self.requiredvars = {
            'backendtype': {
                'default':
                'redis',
                'description':
                'Token store backend type. Allowed values are: sqlalchemy , redis',
            },
            'backendconfig': {
                'default':
                '',
                'description':
                'Backend configuration. Depends on backendtype, eg. sqlalchemy url, redis host:port:db',
            },
            'spambias': {
                'default':
                '0.5',
                'description':
                'overall spam bias. 0.5=no bias. 0.8=around 80% of scanned mail traffic is spam',
            },
            'minimum-token-occurence': {
                'default':
                '3',
                'description':
                "don't make assumptions on tokens seen less than this amount",
            },
            'maximum-tokens-per-message': {
                'default': '5000',
                'description': 'stop tokenizing after x tokens',
            },
            'minimum-ham': {
                'default': '10',
                'description': "minimum known hams for classification",
            },
            'minimum-spam': {
                'default': '10',
                'description': "minimum known spams for classification",
            },
        }
        # created lazily by init_backend()
        self.tokenstore = None
        # lower bound used to avoid division by zero and log(0)
        self.calc_minimum = 0.00000001

        self.logger = self._logger()
        self.filter = SuspectFilter(None)

    def init_backend(self):
        """Lazily instantiate the configured token store backend (idempotent)."""
        if self.tokenstore is not None:
            return
        backendtype = self.config.get(self.section, 'backendtype')
        if backendtype not in SUPPORTED_BACKENDS:
            # NOTE(review): execution continues and the lookup below will
            # raise KeyError for an unsupported backend - preserved behaviour
            self.logger.error(
                "Bayes tokenstore %s not supported, maybe misspelled or missing dependency"
                % backendtype)

        backend = SUPPORTED_BACKENDS[backendtype](self.config.get(
            self.section, 'backendconfig'))
        self.tokenstore = backend

    def single_token_spam_probability(self, token):
        """Compute the probability that a message containing a given token is spam
        ( "spamicity of a word" )

        Returns 0.5 (neutral) when the corpus is too small or the token too rare.
        """
        total_spam = self.tokenstore.get_total_spam_count()
        if total_spam < self.config.getint(self.section, 'minimum-spam'):
            self.logger.warning(
                "Not enough known spams for bayes classification")
            return 0.5

        total_ham = self.tokenstore.get_total_ham_count()
        if total_ham < self.config.getint(self.section, 'minimum-ham'):
            self.logger.warning(
                "Not enough known hams for bayes classification")
            return 0.5

        pr_s = self.config.getfloat(
            self.section,
            'spambias')  # probability that any given message is spam
        pr_h = 1 - pr_s  # probability that any given message is ham

        spam_count = self.tokenstore.get_spam_count(
            token)  # number of known spams containing this token
        ham_count = self.tokenstore.get_ham_count(
            token)  # number of known hams containing this token

        # "Dealing with rare words"
        # bugfix: use getint() - config.get() returns a string, and
        # comparing int < str raises TypeError on Python 3
        if spam_count + ham_count < self.config.getint(
                self.section, 'minimum-token-occurence'):
            pr_s_w = 0.5
        else:
            pr_w_s = float(
                spam_count
            ) / total_spam  # the probability that the token appears in spam messages
            pr_w_h = float(
                ham_count
            ) / total_ham  # the probability that the token appears in ham messages
            divisor = (pr_w_s * pr_s + pr_w_h * pr_h)
            if divisor < self.calc_minimum:
                divisor = self.calc_minimum
            pr_s_w = pr_w_s * pr_s / divisor
        return pr_s_w

    def spam_probability(self, suspect):
        """
        :param suspect: the suspect whose body text should be classified
        :return: the probability that the given text is spam. float value between 0.0 and 1.0
        """
        tokens = self.tokenize(suspect)
        self.logger.debug("Got %s tokens" % len(tokens))
        # combine per-token spamicities in log space to avoid underflow
        total = 0
        for t in tokens:
            spamicity = self.single_token_spam_probability(t)
            if spamicity < self.calc_minimum:
                spamicity = self.calc_minimum

            # make sure we get at least a very small amount
            x = 1 - spamicity
            if x < self.calc_minimum:
                x = self.calc_minimum
            n = math.log(x) - math.log(spamicity)
            total += n
        try:
            probability = 1.0 / (1 + math.pow(math.e, total))
        except OverflowError:
            return 0.0

        return round(probability, 4)

    def ngrams(self, sequence, n=3, maxnumber=None):
        """Return up to *maxnumber* character n-grams of *sequence* (all by default)."""
        sequence = list(sequence)
        count = max(0, len(sequence) - n + 1)
        if maxnumber is None:
            maxnumber = count
        return [
            "".join(sequence[i:i + n]) for i in range(min(count, maxnumber))
        ]

    def tokenize(self, suspect):
        """Extract stripped body text from *suspect* and return its 3-gram tokens."""
        visible_texts = self.filter.get_field(suspect, 'body:stripped')
        stripped = " ".join(
            [t.strip() for t in visible_texts if t.strip() != ''])
        maxtokens = self.config.getint(self.section,
                                       'maximum-tokens-per-message')
        if maxtokens == 0:  # 0 means unlimited
            maxtokens = None
        tokens = self.ngrams(stripped, n=3, maxnumber=maxtokens)
        return tokens
Code example #5 — file: bayes.py (project: gryphius/fuglu-extra-plugins)
class BayesPlugin(object):
    """Naive Bayes spam classifier backed by a pluggable token store."""

    def __init__(self):
        self.requiredvars = {
            "backendtype": {
                "default": "redis",
                "description": "Token store backend type. Allowed values are: sqlalchemy , redis",
            },
            "backendconfig": {
                "default": "",
                "description": "Backend configuration. Depends on backendtype, eg. sqlalchemy url, redis host:port:db",
            },
            "spambias": {
                "default": "0.5",
                "description": "overall spam bias. 0.5=no bias. 0.8=around 80% of scanned mail traffic is spam",
            },
            "minimum-token-occurence": {
                "default": "3",
                "description": "don't make assumptions on tokens seen less than this amount",
            },
            "maximum-tokens-per-message": {"default": "5000", "description": "stop tokenizing after x tokens"},
            "minimum-ham": {"default": "10", "description": "minimum known hams for classification"},
            "minimum-spam": {"default": "10", "description": "minimum known spams for classification"},
        }
        # created lazily by init_backend()
        self.tokenstore = None
        # lower bound used to avoid division by zero and log(0)
        self.calc_minimum = 0.00000001

        self.logger = self._logger()
        self.filter = SuspectFilter(None)

    def init_backend(self):
        """Lazily instantiate the configured token store backend (idempotent)."""
        if self.tokenstore is not None:
            return
        backendtype = self.config.get(self.section, "backendtype")
        if backendtype not in SUPPORTED_BACKENDS:
            # NOTE(review): execution continues and the lookup below will
            # raise KeyError for an unsupported backend - preserved behaviour
            self.logger.error("Bayes tokenstore %s not supported, maybe misspelled or missing dependency" % backendtype)

        backend = SUPPORTED_BACKENDS[backendtype](self.config.get(self.section, "backendconfig"))
        self.tokenstore = backend

    def single_token_spam_probability(self, token):
        """Compute the probability that a message containing a given token is spam
        ( "spamicity of a word" )

        Returns 0.5 (neutral) when the corpus is too small or the token too rare.
        """
        total_spam = self.tokenstore.get_total_spam_count()
        if total_spam < self.config.getint(self.section, "minimum-spam"):
            self.logger.warning("Not enough known spams for bayes classification")
            return 0.5

        total_ham = self.tokenstore.get_total_ham_count()
        if total_ham < self.config.getint(self.section, "minimum-ham"):
            self.logger.warning("Not enough known hams for bayes classification")
            return 0.5

        pr_s = self.config.getfloat(self.section, "spambias")  # probability that any given message is spam
        pr_h = 1 - pr_s  # probability that any given message is ham

        spam_count = self.tokenstore.get_spam_count(token)  # number of known spams containing this token
        ham_count = self.tokenstore.get_ham_count(token)  # number of known hams containing this token

        # "Dealing with rare words"
        # bugfix: use getint() - config.get() returns a string, and
        # comparing int < str raises TypeError on Python 3
        if spam_count + ham_count < self.config.getint(self.section, "minimum-token-occurence"):
            pr_s_w = 0.5
        else:
            pr_w_s = float(spam_count) / total_spam  # the probability that the token appears in spam messages
            pr_w_h = float(ham_count) / total_ham  # the probability that the token appears in ham messages
            divisor = pr_w_s * pr_s + pr_w_h * pr_h
            if divisor < self.calc_minimum:
                divisor = self.calc_minimum
            pr_s_w = pr_w_s * pr_s / divisor
        return pr_s_w

    def spam_probability(self, suspect):
        """
        :param suspect: the suspect whose body text should be classified
        :return: the probability that the given text is spam. float value between 0.0 and 1.0
        """
        tokens = self.tokenize(suspect)
        self.logger.debug("Got %s tokens" % len(tokens))
        # combine per-token spamicities in log space to avoid underflow
        total = 0
        for t in tokens:
            spamicity = self.single_token_spam_probability(t)
            if spamicity < self.calc_minimum:
                spamicity = self.calc_minimum

            # make sure we get at least a very small amount
            x = 1 - spamicity
            if x < self.calc_minimum:
                x = self.calc_minimum
            n = math.log(x) - math.log(spamicity)
            total += n
        try:
            probability = 1.0 / (1 + math.pow(math.e, total))
        except OverflowError:
            return 0.0

        return round(probability, 4)

    def ngrams(self, sequence, n=3, maxnumber=None):
        """Return up to *maxnumber* character n-grams of *sequence* (all by default)."""
        sequence = list(sequence)
        count = max(0, len(sequence) - n + 1)
        if maxnumber is None:
            maxnumber = count
        return ["".join(sequence[i : i + n]) for i in range(min(count, maxnumber))]

    def tokenize(self, suspect):
        """Extract stripped body text from *suspect* and return its 3-gram tokens."""
        visible_texts = self.filter.get_field(suspect, "body:stripped")
        stripped = " ".join([t.strip() for t in visible_texts if t.strip() != ""])
        maxtokens = self.config.getint(self.section, "maximum-tokens-per-message")
        if maxtokens == 0:  # 0 means unlimited
            maxtokens = None
        tokens = self.ngrams(stripped, n=3, maxnumber=maxtokens)
        return tokens