Python SuspectFilter.strip_textの例

プログラミング言語: Python

名前空間/パッケージ名: fuglu.shared

クラス/型: SuspectFilter

メソッド/関数: strip_text

hotexamples.comのコード掲載数: 4

Python SuspectFilter.strip_text - 4件のコード例が見つかりました。すべてオープンソースプロジェクトから抽出されたPythonのfuglu.shared.SuspectFilter.strip_textの実例で、最も評価が高いものを厳選しています。コード例の評価を行っていただくことで、より質の高いコード例が表示されるようになります。

よく使われるメソッド

表示非表示

SuspectFilter(13)

matches(6)

get_field(3)

get_args(2)

lint(2)

strip_text(2)

コード例 #1

ファイルを表示

class FuzorDigest(object):
    def __init__(self, msg):
        self.debug = []
        self.digest = None
        self.predigest = None
        self.bodytext_size = 0
        self.filter = SuspectFilter(None)
        self.logger = logging.getLogger('fuglu.plugins.fuzor.Digest')
        
        # digest config
        self.LONG_WORD_THRESHOLD = 10  # what is considered a long word
        self.REPLACE_LONG_WORD = '[LONG]'  # Replace long words in pre-digest with... None to disable
        self.REPLACE_EMAIL = '[EMAIL]'  # Replace email addrs in pre-digest with... None to disable
        self.REPLACE_URL = '[LINK]'  # Replace urls in pre-digest with... None to disable
        self.INCLUDE_ATTACHMENT_CONTENT = False  # should non-text attachment contents be included in digest (not recommended, there are better attachment hash systems)
        self.INCLUDE_ATTACHMENT_COUNT = True  # should the number of non-text-attachments be included in the digest
        self.MINIMUM_PREDIGEST_SIZE = 27  # if the predigest is smaller than this, ignore this message
        self.MINIMUM_UNMODIFIED_CONTENT = 27  # minimum unmodified content after stripping, eg. [SOMETHING] removed from the predigest (27>'von meinem Iphone gesendet')
        self.MINIMUM_BODYTEXT_SIZE = 27  # if the body text content is smaller than this, ignore this message
        self.STRIP_WHITESPACE = True  # remove all whitespace from the pre-digest
        self.STRIP_HTML_MARKUP = True  # remove html tags (but keep content)
        self.REMOVE_HTML_TAGS = [
            'script',
            'style']  # strip tags (including content)
        
        self.predigest = self._make_predigest(msg)
        self.digest = self._make_hash(self.predigest)
    
    
    
    def _make_hash(self, predigest):
        if self.bodytext_size < self.MINIMUM_BODYTEXT_SIZE:
            return None
        predigest = predigest.strip()
        if len(predigest) < self.MINIMUM_PREDIGEST_SIZE:
            return None
        unmodified = re.sub(r'\[[A-Z0-9:]+\]', '', predigest)
        if len(unmodified) < self.MINIMUM_UNMODIFIED_CONTENT:
            return None
        
        predigest = predigest.encode('utf-8', errors='ignore')
        return hashlib.sha1(predigest).hexdigest()
    
    
    
    def _handle_text_part(self, part):
        payload = part.get_payload(decode=True)
        charset = part.get_content_charset()
        errors = "ignore"
        if not charset:
            charset = "ascii"
        elif charset.lower().replace("_", "-") in ("quopri-codec", "quopri", "quoted-printable", "quotedprintable"):
            errors = "strict"
        
        try:
            payload = payload.decode(charset, errors)
        except (LookupError, UnicodeError, AssertionError):
            payload = payload.decode("ascii", "ignore")
        
        if self.STRIP_HTML_MARKUP:
            payload = self.filter.strip_text(
                payload,
                remove_tags=self.REMOVE_HTML_TAGS,
                use_bfs=True)
        
        if self.REPLACE_EMAIL is not None:
            payload = re.sub(r'\S{1,50}@\S{1,30}', self.REPLACE_EMAIL, payload)
        
        if self.REPLACE_URL is not None:
            payload = re.sub(r'[a-z]+:\S{1,100}', self.REPLACE_URL, payload)
        
        if self.REPLACE_LONG_WORD is not None:
            patt = r'\S{%s,}' % self.LONG_WORD_THRESHOLD
            payload = re.sub(patt, self.REPLACE_LONG_WORD, payload)
        
        if self.STRIP_WHITESPACE:
            payload = re.sub(r'\s', '', payload)
        payload = payload.strip()
        return payload
    
    
    
    def _make_predigest(self, msg):
        attachment_count = 0
        predigest = ''
        for part in msg.walk():
            if part.is_multipart():
                continue
            
            if part.get_content_maintype() == "text":
                try:
                    normalized_text_part = self._handle_text_part(part)
                    predigest += normalized_text_part
                    self.bodytext_size += len(normalized_text_part)
                except Exception as e:
                    self.logger.warn(e)
            else:
                attachment_count += 1
                if self.INCLUDE_ATTACHMENT_CONTENT:
                    predigest += "[ATTH:%s]" % hashlib.sha1(
                        part.get_payload()).hexdigest()
        
        if self.INCLUDE_ATTACHMENT_COUNT and attachment_count:
            predigest += "[ATTC:%s]" % attachment_count
        
        if self.STRIP_WHITESPACE:
            predigest = re.sub(r'\s', '', predigest)
        
        return predigest

コード例 #2

ファイルを表示

ファイル: fuzor.py プロジェクト: gryphius/fuglu-extra-plugins

class FuzorDigest(object):

    def __init__(self, msg):
        self.debug = []
        self.digest = None
        self.predigest = None
        self.bodytext_size = 0
        self.filter = SuspectFilter(None)
        self.logger = logging.getLogger('fuglu.plugins.fuzor.Digest')

        # digest config
        self.LONG_WORD_THRESHOLD = 10  # what is considered a long word
        self.REPLACE_LONG_WORD = '[LONG]'  # Replace long words in pre-digest with... None to disable
        self.REPLACE_EMAIL = '[EMAIL]'  # Replace email addrs in pre-digest with... None to disable
        self.REPLACE_URL = '[LINK]'  # Replace urls in pre-digest with... None to disable
        self.INCLUDE_ATTACHMENT_CONTENT = False  # should non-text attachment contents be included in digest (not recommended, there are better attachment hash systems)
        self.INCLUDE_ATTACHMENT_COUNT = True  # should the number of non-text-attachments be included in the digest
        self.MINIMUM_PREDIGEST_SIZE = 27  # if the predigest is smaller than this, ignore this message
        self.MINIMUM_UNMODIFIED_CONTENT = 27  # minimum unmodified content after stripping, eg. [SOMETHING] removed from the predigest (27>'von meinem Iphone gesendet')
        self.MINIMUM_BODYTEXT_SIZE = 27  # if the body text content is smaller than this, ignore this message
        self.STRIP_WHITESPACE = True  # remove all whitespace from the pre-digest
        self.STRIP_HTML_MARKUP = True  # remove html tags (but keep content)
        self.REMOVE_HTML_TAGS = [
            'script',
            'style']  # strip tags (including content)

        self.predigest = self._make_predigest(msg)
        self.digest = self._make_hash(self.predigest)



    def _make_hash(self, predigest):
        if self.bodytext_size < self.MINIMUM_BODYTEXT_SIZE:
            return None
        predigest = predigest.strip()
        if isinstance(predigest, unicode):
            predigest = predigest.encode('utf-8', 'ignore')
        if len(predigest) < self.MINIMUM_PREDIGEST_SIZE:
            return None
        unmodified = re.sub(r'\[[A-Z0-9:]+\]', '', predigest)
        if len(unmodified) < self.MINIMUM_UNMODIFIED_CONTENT:
            return None
        try:
            return hashlib.sha1(predigest).hexdigest()
        except:
            return None



    def _handle_text_part(self, part):
        payload = part.get_payload(decode=True)
        charset = part.get_content_charset()
        errors = "ignore"
        if not charset:
            charset = "ascii"
        elif (charset.lower().replace("_", "-") in ("quopri-codec",
              "quopri", "quoted-printable", "quotedprintable")):
            errors = "strict"

        try:
            payload = payload.decode(charset, errors)
        except (LookupError, UnicodeError, AssertionError):
            payload = payload.decode("ascii", "ignore")

        if self.STRIP_HTML_MARKUP:
            payload = self.filter.strip_text(
                payload,
                remove_tags=self.REMOVE_HTML_TAGS,
                use_bfs=True)

        if self.REPLACE_EMAIL is not None:
            payload = re.sub(r'\S{1,50}@\S{1,30}', self.REPLACE_EMAIL, payload)

        if self.REPLACE_URL is not None:
            payload = re.sub(r'[a-z]+:\S{1,100}', self.REPLACE_URL, payload)

        if self.REPLACE_LONG_WORD is not None:
            patt = r'\S{%s,}' % self.LONG_WORD_THRESHOLD
            payload = re.sub(patt, self.REPLACE_LONG_WORD, payload)

        if self.STRIP_WHITESPACE:
            payload = re.sub(r'\s', '', payload)
        payload = payload.strip()
        return payload



    def _make_predigest(self, msg):
        attachment_count = 0
        predigest = ''
        for part in msg.walk():
            if part.is_multipart():
                continue

            if part.get_content_maintype() == "text":
                try:
                    normalized_text_part = self._handle_text_part(part)
                    predigest += normalized_text_part
                    self.bodytext_size += len(normalized_text_part)
                except Exception as e:
                    self.logger.warn(e)
            else:
                attachment_count += 1
                if self.INCLUDE_ATTACHMENT_CONTENT:
                    predigest += "[ATTH:%s]" % hashlib.sha1(
                        part.get_payload()).hexdigest()

        if self.INCLUDE_ATTACHMENT_COUNT and attachment_count:
            predigest += "[ATTC:%s]" % attachment_count

        if self.STRIP_WHITESPACE:
            predigest = re.sub(r'\s', '', predigest)

        return predigest

コード例 #3

ファイルを表示

ファイル: shared_test.py プロジェクト: jahlives/fuglu

class SuspectFilterTestCase(unittest.TestCase):
    """Test Suspectfilter"""
    def setUp(self):
        self.candidate = SuspectFilter(TESTDATADIR + '/headertest.regex')

    def tearDown(self):
        pass

    def test_sf_get_args(self):
        """Test SuspectFilter files"""
        suspect = Suspect('*****@*****.**',
                          '*****@*****.**',
                          TESTDATADIR + '/helloworld.eml')
        suspect.tags['testtag'] = 'testvalue'

        headermatches = self.candidate.get_args(suspect)
        self.assertTrue('Sent to unittest domain!' in headermatches,
                        "To_domain not found in headercheck")
        self.assertTrue(
            'Envelope sender is [email protected]' in headermatches,
            "Envelope Sender not matched in header chekc")
        self.assertTrue('Mime Version is 1.0' in headermatches,
                        "Standard header Mime Version not found")
        self.assertTrue('A tag match' in headermatches,
                        "Tag match did not work")
        self.assertTrue('Globbing works' in headermatches,
                        "header globbing failed")
        self.assertTrue('body rule works' in headermatches,
                        "decoded body rule failed")
        self.assertTrue('full body rule works' in headermatches,
                        "full body failed")
        self.assertTrue('mime rule works' in headermatches, "mime rule failed")
        self.assertFalse(
            'this should not match in a body rule' in headermatches,
            'decoded body rule matched raw body')

        # perl style advanced rules
        self.assertTrue('perl-style /-notation works!' in headermatches,
                        "new rule format failed: %s" % headermatches)
        self.assertTrue(
            'perl-style recipient match' in headermatches,
            "new rule format failed for to_domain: %s" % headermatches)
        self.assertFalse('this should not match' in headermatches,
                         "rule flag ignorecase was not detected")

        # TODO: raw body rules

    def test_sf_matches(self):
        """Test SuspectFilter extended matches"""

        suspect = Suspect('*****@*****.**',
                          '*****@*****.**',
                          TESTDATADIR + '/helloworld.eml')

        (match, info) = self.candidate.matches(suspect, extended=True)
        self.assertTrue(match, 'Match should return True')
        field, matchedvalue, arg, regex = info
        self.assertTrue(field == 'to_domain')
        self.assertTrue(matchedvalue == 'unittests.fuglu.org')
        self.assertTrue(arg == 'Sent to unittest domain!')
        self.assertTrue(regex == 'unittests\.fuglu\.org')

    def test_sf_get_field(self):
        """Test SuspectFilter field extract"""
        suspect = Suspect('*****@*****.**',
                          '*****@*****.**',
                          TESTDATADIR + '/helloworld.eml')

        # additional field tests
        self.assertEqual(
            self.candidate.get_field(suspect, 'clienthelo')[0], 'helo1')
        self.assertEqual(
            self.candidate.get_field(suspect, 'clientip')[0], '10.0.0.1')
        self.assertEqual(
            self.candidate.get_field(suspect, 'clienthostname')[0], 'rdns1')

    def test_strip(self):
        html = """foo<a href="bar">bar</a><script language="JavaScript">echo('hello world');</script>baz"""

        declarationtest = """<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
    "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="de">
  <head>
    <title>greetings</title>
  </head>
  <body>
    <font color="red">well met!</font>
  </body>
</html>
"""
        # word generated empty message
        wordhtml = """<html xmlns:v=3D"urn:schemas-microsoft-com:vml"
xmlns:o=3D"urn:schemas-microsoft-com:office:office"
xmlns:w=3D"urn:schemas-microsoft-com:office:word"
xmlns:m=3D"http://schemas.microsoft.com/office/2004/12/omml"
xmlns=3D"http://www.w3.org/TR/REC-html40"><head><META
HTTP-EQUIV=3D"Content-Type" CONTENT=3D"text/html;
charset=3Dus-ascii"><meta name=3DGenerator content=3D"Microsoft Word 15
(filtered medium)"><style><!--
/* Font Definitions */
@font-face
	{font-family:"Cambria Math";
	panose-1:2 4 5 3 5 4 6 3 2 4;}
@font-face
	{font-family:Calibri;
	panose-1:2 15 5 2 2 2 4 3 2 4;}
/* Style Definitions */
p.MsoNormal, li.MsoNormal, div.MsoNormal
	{margin:0cm;
	margin-bottom:.0001pt;
	font-size:11.0pt;
	font-family:"Calibri",sans-serif;
	mso-fareast-language:EN-US;}
a:link, span.MsoHyperlink
	{mso-style-priority:99;
	color:#0563C1;
	text-decoration:underline;}
a:visited, span.MsoHyperlinkFollowed
	{mso-style-priority:99;
	color:#954F72;
	text-decoration:underline;}
span.E-MailFormatvorlage17
	{mso-style-type:personal-compose;
	font-family:"Calibri",sans-serif;
	color:windowtext;}
.MsoChpDefault
	{mso-style-type:export-only;
	font-family:"Calibri",sans-serif;
	mso-fareast-language:EN-US;}
@page WordSection1
	{size:612.0pt 792.0pt;
	margin:70.85pt 70.85pt 2.0cm 70.85pt;}
div.WordSection1
	{page:WordSection1;}
--></style><!--[if gte mso 9]><xml>
<o:shapedefaults v:ext=3D"edit" spidmax=3D"1026" />
</xml><![endif]--><!--[if gte mso 9]><xml>
<o:shapelayout v:ext=3D"edit">
<o:idmap v:ext=3D"edit" data=3D"1" />
</o:shapelayout></xml><![endif]--></head><body lang=3DDE-CH
link=3D"#0563C1" vlink=3D"#954F72"><div class=3DWordSection1><p
class=3DMsoNormal><o:p> </o:p></p></div></body></html>"""

        for use_bfs in [True, False]:
            stripped = self.candidate.strip_text(html, use_bfs=use_bfs)
            self.assertEqual(stripped, 'foobarbaz')

            docstripped = self.candidate.strip_text(declarationtest,
                                                    use_bfs=use_bfs)
            self.assertEqual(docstripped.split(),
                             ['greetings', 'well', 'met!'])

            wordhtmstripped = self.candidate.strip_text(wordhtml,
                                                        use_bfs=use_bfs)
            self.assertEqual(wordhtmstripped.strip(), '')

コード例 #4

ファイルを表示

ファイル: shared_test.py プロジェクト: steigr/fuglu

class SuspectFilterTestCase(unittest.TestCase):

    """Test Header Filter"""

    def setUp(self):
        self.candidate = SuspectFilter(TESTDATADIR + '/headertest.regex')

    def tearDown(self):
        pass

    def test_sf_get_args(self):
        """Test SuspectFilter files"""
        suspect = Suspect('*****@*****.**',
                          '*****@*****.**', TESTDATADIR + '/helloworld.eml')
        suspect.tags['testtag'] = 'testvalue'

        headermatches = self.candidate.get_args(suspect)
        self.assertTrue(
            'Sent to unittest domain!' in headermatches, "To_domain not found in headercheck")
        self.assertTrue('Envelope sender is [email protected]' in headermatches,
                        "Envelope Sender not matched in header chekc")
        self.assertTrue('Mime Version is 1.0' in headermatches,
                        "Standard header Mime Version not found")
        self.assertTrue(
            'A tag match' in headermatches, "Tag match did not work")
        self.assertTrue(
            'Globbing works' in headermatches, "header globbing failed")
        self.assertTrue(
            'body rule works' in headermatches, "decoded body rule failed")
        self.assertTrue(
            'full body rule works' in headermatches, "full body failed")
        self.assertTrue('mime rule works' in headermatches, "mime rule failed")
        self.assertFalse('this should not match in a body rule' in headermatches,
                         'decoded body rule matched raw body')

        # perl style advanced rules
        self.assertTrue('perl-style /-notation works!' in headermatches,
                        "new rule format failed: %s" % headermatches)
        self.assertTrue('perl-style recipient match' in headermatches,
                        "new rule format failed for to_domain: %s" % headermatches)
        self.assertFalse('this should not match' in headermatches,
                         "rule flag ignorecase was not detected")

        # TODO: raw body rules

    def test_sf_matches(self):
        """Test SuspectFilter extended matches"""

        suspect = Suspect('*****@*****.**',
                          '*****@*****.**', TESTDATADIR + '/helloworld.eml')

        (match, info) = self.candidate.matches(suspect, extended=True)
        self.assertTrue(match, 'Match should return True')
        field, matchedvalue, arg, regex = info
        self.assertTrue(field == 'to_domain')
        self.assertTrue(matchedvalue == 'unittests.fuglu.org')
        self.assertTrue(arg == 'Sent to unittest domain!')
        self.assertTrue(regex == 'unittests\.fuglu\.org')

    def test_sf_get_field(self):
        """Test SuspectFilter field extract"""
        suspect = Suspect('*****@*****.**',
                          '*****@*****.**', TESTDATADIR + '/helloworld.eml')

        # additional field tests
        self.assertEqual(self.candidate.get_field(
            suspect, 'clienthelo')[0], 'helo1')
        self.assertEqual(self.candidate.get_field(
            suspect, 'clientip')[0], '10.0.0.1')
        self.assertEqual(self.candidate.get_field(
            suspect, 'clienthostname')[0], 'rdns1')

    def test_strip(self):
        html = """foo<a href="bar">bar</a><script language="JavaScript">echo('hello world');</script>baz"""

        declarationtest = """<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
    "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="de">
  <head>
    <title>greetings</title>
  </head>
  <body>
    <font color="red">well met!</font>
  </body>
</html>
"""
        # word generated empty message
        wordhtml = """<html xmlns:v=3D"urn:schemas-microsoft-com:vml"
xmlns:o=3D"urn:schemas-microsoft-com:office:office"
xmlns:w=3D"urn:schemas-microsoft-com:office:word"
xmlns:m=3D"http://schemas.microsoft.com/office/2004/12/omml"
xmlns=3D"http://www.w3.org/TR/REC-html40"><head><META
HTTP-EQUIV=3D"Content-Type" CONTENT=3D"text/html;
charset=3Dus-ascii"><meta name=3DGenerator content=3D"Microsoft Word 15
(filtered medium)"><style><!--
/* Font Definitions */
@font-face
	{font-family:"Cambria Math";
	panose-1:2 4 5 3 5 4 6 3 2 4;}
@font-face
	{font-family:Calibri;
	panose-1:2 15 5 2 2 2 4 3 2 4;}
/* Style Definitions */
p.MsoNormal, li.MsoNormal, div.MsoNormal
	{margin:0cm;
	margin-bottom:.0001pt;
	font-size:11.0pt;
	font-family:"Calibri",sans-serif;
	mso-fareast-language:EN-US;}
a:link, span.MsoHyperlink
	{mso-style-priority:99;
	color:#0563C1;
	text-decoration:underline;}
a:visited, span.MsoHyperlinkFollowed
	{mso-style-priority:99;
	color:#954F72;
	text-decoration:underline;}
span.E-MailFormatvorlage17
	{mso-style-type:personal-compose;
	font-family:"Calibri",sans-serif;
	color:windowtext;}
.MsoChpDefault
	{mso-style-type:export-only;
	font-family:"Calibri",sans-serif;
	mso-fareast-language:EN-US;}
@page WordSection1
	{size:612.0pt 792.0pt;
	margin:70.85pt 70.85pt 2.0cm 70.85pt;}
div.WordSection1
	{page:WordSection1;}
--></style><!--[if gte mso 9]><xml>
<o:shapedefaults v:ext=3D"edit" spidmax=3D"1026" />
</xml><![endif]--><!--[if gte mso 9]><xml>
<o:shapelayout v:ext=3D"edit">
<o:idmap v:ext=3D"edit" data=3D"1" />
</o:shapelayout></xml><![endif]--></head><body lang=3DDE-CH
link=3D"#0563C1" vlink=3D"#954F72"><div class=3DWordSection1><p
class=3DMsoNormal><o:p> </o:p></p></div></body></html>"""

        for use_bfs in [True, False]:
            stripped = self.candidate.strip_text(html, use_bfs=use_bfs)
            self.assertEqual(stripped, 'foobarbaz')

            docstripped = self.candidate.strip_text(
                declarationtest, use_bfs=use_bfs)
            self.assertEqual(
                docstripped.split(), ['greetings', 'well', 'met!'])

            wordhtmstripped = self.candidate.strip_text(
                wordhtml, use_bfs=use_bfs)
            self.assertEqual(wordhtmstripped.strip(), '')