def testFilter0(self):
    """Verify that match() returns None when every filter.domain.N entry is empty.

    Sets the configuration keys filter.domain.0 through filter.domain.4 to
    the empty string, resets the cached filter state, and checks that
    neither an empty URI nor a plain http URI is matched.
    """
    blank_keys = (
        'filter.domain.0',
        'filter.domain.1',
        'filter.domain.2',
        'filter.domain.3',
        'filter.domain.4',
    )
    for key in blank_keys:
        testcfg.set(key, '')

    # force reload
    domain_filter.g_exdm = None

    for uri in ('', 'http://abc'):
        self.assertEqual(None, domain_filter.match(uri))
Ejemplo n.º 2
0
def preparse_filter(first_block, meta):
    """Filter a document by domain and by magic header before parsing.

    Args:
        first_block: Leading chunk of the document, passed to
            ``magic.guess_type``.
        meta: Mapping of document metadata; its ``'uri'`` entry (default
            ``''``) is checked against the domain filter.

    Returns:
        ``(EXDOMAIN, matched)`` when ``domain_filter.match`` returns a truthy
        value for the URI; ``(NON_HTML, guessed)`` when the guessed type is
        truthy and is neither ``'text/html'`` nor ``'text/plain'``;
        otherwise ``0``.
    """
    matched_domain = domain_filter.match(meta.get('uri', ''))
    if matched_domain:
        return EXDOMAIN, matched_domain

    content_type = magic.guess_type(first_block)
    if not content_type:
        # Nothing could be guessed: let the document through.
        return 0
    if content_type in ('text/html', 'text/plain'):
        return 0

    return NON_HTML, content_type
    def testFilter1(self):
        """Check exact-domain and suffix-domain results of domain_filter.match.

        Each case pairs a URL with the value match() is expected to return
        (None when the URL should not be filtered).
        """
        # force reload
        domain_filter.g_exdm = None

        cases = (
            ('', None),

            # exact domain match
            ('http://abc.com/', 'abc.com'),
            ('http://www.abc.com/', None),
            ('http://abc.com/index.html?a=b#c', 'abc.com'),
            ('http://*****:*****@abc.com/index.html?a=b#c', 'abc.com'),
            ('http://def/', 'def'),
            ('http://www.def.com/', None),

            # suffix domain match
            ('http://xyz.com/', None),
            ('http://www.xyz.com/', '.xyz.com'),
            ('http://www.xyz.com/index.html?a=b#c', '.xyz.com'),
            ('http://*****:*****@www.xyz.com/index.html?a=b#c', '.xyz.com'),
        )
        for url, expected in cases:
            self.assertEqual(expected, domain_filter.match(url))