Example #1
0
    def test_backward_filename_filter(self):
        '''Check BackwardFilenameFilter verdicts for a set of sample URLs.'''
        url_filter = BackwardFilenameFilter(
            accepted=['html', 'image.*.png'],
            rejected=['bmp', 'jp[eg]', 'image.123.png']
        )

        record = URLRecord()
        record.url = 'http://example.com/'

        # URLs the filter is expected to let through.
        passing_urls = (
            'http://example/index.html',
            'http://example/myimage.1003.png',
        )
        # URLs the filter is expected to turn down.
        failing_urls = (
            'http://example/myimage.123.png',
            'http://example/blah.png',
            'http://example/image.1003.png.bmp',
        )

        for url in passing_urls:
            self.assertTrue(url_filter.test(URLInfo.parse(url), record))

        for url in failing_urls:
            self.assertFalse(url_filter.test(URLInfo.parse(url), record))
Example #2
0
    def test_directory_filter(self):
        '''Check DirectoryFilter with no lists, an accept list, a reject list.
        '''
        record = URLRecord()
        record.url = 'http://example.com/blog/'

        def verdict(dir_filter, url):
            # Parse the URL and run it through the given filter.
            return dir_filter.test(URLInfo.parse(url), record)

        # No accepted or rejected directories configured.
        self.assertTrue(verdict(DirectoryFilter(), 'http://example.com'))

        accept_blog = DirectoryFilter(accepted=['/blog'])
        self.assertFalse(verdict(accept_blog, 'http://example.com'))
        self.assertTrue(verdict(accept_blog, 'http://example.com/blog/'))

        reject_cgi = DirectoryFilter(rejected=['/cgi-bin/'])
        self.assertTrue(verdict(reject_cgi, 'http://example.com/blog/'))
        self.assertFalse(verdict(reject_cgi, 'http://example.com/cgi-bin'))
Example #3
0
def new_mock_url_record():
    '''Build a level-0 URLRecord whose URL and parent URL are the same.'''
    mock_record = URLRecord()
    # Chained assignment sets url first, then parent_url.
    mock_record.url = mock_record.parent_url = 'http://example.com'
    mock_record.level = 0

    return mock_record
Example #4
0
    def test_recursive_filter_requisites(self):
        '''A level-0 record with inline_level 1 passes with page_requisites.
        '''
        url_filter = RecursiveFilter(page_requisites=True)

        record = URLRecord()
        record.level = 0
        record.inline_level = 1

        accepted = url_filter.test(None, record)
        self.assertTrue(accepted)
Example #5
0
    def test_regex_filter(self):
        '''Check RegexFilter with no pattern, an accept and a reject pattern.
        '''
        record = URLRecord()
        record.url = 'http://example.com/blog/'

        def run(regex_filter, expectations):
            # Each expectation is a (url, should_pass) pair, checked in order.
            for url, should_pass in expectations:
                result = regex_filter.test(URLInfo.parse(url), record)
                if should_pass:
                    self.assertTrue(result)
                else:
                    self.assertFalse(result)

        run(RegexFilter(), [
            ('http://example.net', True),
        ])
        run(RegexFilter(accepted=r'blo[a-z]/$'), [
            ('http://example.net/blob/', True),
            ('http://example.net/blob/123', False),
        ])
        run(RegexFilter(rejected=r'\.gif$'), [
            ('http://example.net/blob/', True),
            ('http://example.net/blob/123.gif', False),
        ])
Example #6
0
    def test_recursive_filter_on(self):
        '''With enabled=True, records at level 0 and level 1 both pass.'''
        url_filter = RecursiveFilter(enabled=True)
        record = URLRecord()

        for level in (0, 1):
            record.level = level
            self.assertTrue(url_filter.test(None, record))
Example #7
0
    def test_recursive_filter_off(self):
        '''With default arguments, level 0 passes and level 1 does not.'''
        url_filter = RecursiveFilter()
        record = URLRecord()

        record.level = 0
        top_level_ok = url_filter.test(None, record)
        self.assertTrue(top_level_ok)

        record.level = 1
        child_ok = url_filter.test(None, record)
        self.assertFalse(child_ok)
Example #8
0
    def test_add_referer_https_to_http(self):
        '''No Referer field is set when the parent is HTTPS and the URL is HTTP.
        '''
        request = Request()
        url_record = URLRecord()
        url_record.parent_url = 'https://example.com/'
        url_record.url = 'http://example.com/image.png'

        WebProcessorSession._add_referrer(request, url_record)

        # Use the canonical 'Referer' spelling, the same key test_add_referer
        # reads, so this check cannot pass vacuously should the field mapping
        # be case-sensitive.
        self.assertNotIn('Referer', request.fields)
Example #9
0
    def test_add_referer(self):
        '''The Referer field is set to the parent URL (both URLs are HTTP).'''
        child = URLRecord()
        child.parent_url = 'http://example.com/'
        child.url = 'http://example.com/image.png'
        request = Request()

        WebProcessorSession._add_referrer(request, child)

        referer = request.fields['Referer']
        self.assertEqual('http://example.com/', referer)
Example #10
0
    def _new_url_record(cls, request: Request) -> URLRecord:
        '''Create a URLRecord for the URL carried by ``request``.

        The record is marked in progress, with try count and level both 0.
        '''
        record = URLRecord()

        record.url = request.url_info.url
        record.status = Status.in_progress
        # Chained assignment sets try_count first, then level.
        record.try_count = record.level = 0

        return record
Example #11
0
    def test_tries_filter(self):
        '''Check TriesFilter with a limit of 0 and with a limit of 5.'''
        record = URLRecord()

        record.try_count = 4
        zero_limit = TriesFilter(0)
        self.assertTrue(zero_limit.test(None, record))

        limit_five = TriesFilter(5)

        record.try_count = 4
        below_limit = limit_five.test(None, record)
        self.assertTrue(below_limit)

        record.try_count = 5
        at_limit = limit_five.test(None, record)
        self.assertFalse(at_limit)
Example #12
0
    def test_parent_filter(self):
        '''Check ParentFilter verdicts against different root URLs.'''
        record = URLRecord()
        url_filter = ParentFilter()

        def passes(url):
            # Evaluate the URL against the record's current state.
            return url_filter.test(URLInfo.parse(url), record)

        record.root_url = 'http://example.com/blog/topic2/'
        self.assertTrue(passes('http://example.com/blog/topic2/'))

        record.root_url = 'http://example.com/blog/topic1/'

        # URLs below the root directory, over either scheme.
        for url in (
            'http://example.com/blog/topic1/blah.html',
            'https://example.com/blog/topic1/blah2.html',
        ):
            self.assertTrue(passes(url))

        # URLs above the root directory on the same host.
        for url in (
            'http://example.com/blog/',
            'https://example.com/blog/',
        ):
            self.assertFalse(passes(url))

        # URLs on a different host.
        for url in (
            'http://somewhere.com/',
            'https://somewhere.com/',
        ):
            self.assertTrue(passes(url))

        record.inline_level = 1
        self.assertTrue(passes('http://example.com/styles.css'))
Example #13
0
    def _new_url_record(cls, request: Request) -> URLRecord:
        '''Create a URLRecord for the URL carried by ``request``.

        The record is marked in progress, with try count and level both 0.
        '''
        record = URLRecord()

        record.url = request.url_info.url
        record.status = Status.in_progress
        # Chained assignment sets try_count first, then level.
        record.try_count = record.level = 0

        return record
Example #14
0
    def test_https_filter(self):
        '''Of the sample URLs, only the https:// one passes HTTPSOnlyFilter.
        '''
        record = URLRecord()
        url_filter = HTTPSOnlyFilter()

        # (url, should_pass) pairs, checked in order.
        expectations = (
            ('http://example.net', False),
            ('https://example.net', True),
            ('mailto:[email protected]', False),
            ("javascript:alert('hello!')", False),
        )

        for url, should_pass in expectations:
            result = url_filter.test(URLInfo.parse(url), record)
            if should_pass:
                self.assertTrue(result)
            else:
                self.assertFalse(result)
Example #15
0
    def test_span_hosts_filter(self):
        '''Check SpanHostsFilter under each of its keyword options.'''
        def make_filter(**options):
            # Every filter here is seeded with the hostname of the blog URL.
            hostnames = [
                URLInfo.parse('http://example.com/blog/').hostname,
            ]
            return SpanHostsFilter(hostnames, **options)

        def passes(span_filter, url, url_record):
            return span_filter.test(URLInfo.parse(url), url_record)

        same_host_url = 'http://example.com/blog/topic1/blah.html'
        other_host_url = 'http://hotdog.example/blog/topic1/blah.html'

        record = URLRecord()
        record.url = 'http://example.com'

        url_filter = make_filter(enabled=False)
        self.assertTrue(passes(url_filter, same_host_url, record))
        self.assertFalse(passes(url_filter, other_host_url, record))

        url_filter = make_filter(enabled=True)
        self.assertTrue(passes(url_filter, same_host_url, record))
        self.assertTrue(passes(url_filter, other_host_url, record))

        # An inline record on another host, with page_requisites on.
        url_filter = make_filter(page_requisites=True)
        record = URLRecord()
        record.url = 'http://1.example.com/'
        record.inline_level = 1
        self.assertTrue(passes(url_filter, 'http://1.example.com/', record))

        # A record on another host whose parent is on the seeded host.
        url_filter = make_filter(linked_pages=True)
        record = URLRecord()
        record.url = 'http://1.example.com/'
        record.parent_url = 'http://example.com/blog/'
        self.assertTrue(passes(url_filter, 'http://1.example.com/', record))

        # Same filter, but the parent is also on the other host.
        record = URLRecord()
        record.url = 'http://1.example.com/blah.html'
        record.parent_url = 'http://1.example.com/'
        self.assertFalse(
            passes(url_filter, 'http://1.example.com/blah.html', record)
        )
Example #16
0
    def test_follow_ftp_filter(self):
        '''Check FollowFTPFilter for several parent URLs and follow settings.
        '''
        non_ftp_urls = (
            'http://wolf.farts/1',
            'https://wolf.farts/1',
            'mailto:[email protected]',
        )
        ftp_url = 'ftp://wolf.farts/'

        def check(ftp_filter, url_record, ftp_should_pass):
            # The non-FTP URLs pass in every scenario; only the verdict for
            # the FTP URL varies.
            for url in non_ftp_urls:
                self.assertTrue(
                    ftp_filter.test(URLInfo.parse(url), url_record)
                )

            ftp_result = ftp_filter.test(URLInfo.parse(ftp_url), url_record)
            if ftp_should_pass:
                self.assertTrue(ftp_result)
            else:
                self.assertFalse(ftp_result)

        record = URLRecord()
        url_filter = FollowFTPFilter()

        # No parent URL set on the record.
        check(url_filter, record, True)

        record.parent_url = 'http://wolf.farts'
        check(url_filter, record, False)

        url_filter = FollowFTPFilter(follow=True)
        check(url_filter, record, True)

        record.parent_url = 'ftp://wolf.farts'
        check(url_filter, record, True)

        # NOTE(review): this builds the same filter as the previous one, so
        # the checks below repeat the scenario above - confirm whether a
        # different configuration was intended.
        url_filter = FollowFTPFilter(follow=True)
        check(url_filter, record, True)
Example #17
0
    def test_level_filter(self):
        '''Walk LevelFilter through a series of level and inline_level values.
        '''
        record = URLRecord()

        def expect(level_filter, attr, outcomes):
            # Set ``attr`` on the shared record to each value in turn and
            # compare the filter's verdict with the expected one.
            for value, should_pass in outcomes:
                setattr(record, attr, value)
                result = level_filter.test(None, record)
                if should_pass:
                    self.assertTrue(result)
                else:
                    self.assertFalse(result)

        expect(LevelFilter(0), 'level', [(4, True)])

        expect(LevelFilter(5), 'level', [(5, True), (6, False)])

        record.inline_level = 1
        expect(LevelFilter(5), 'level', [
            (5, True),
            (6, True),
            (7, True),
            (8, False),
        ])

        # record.level is still 8 from the previous step.
        expect(LevelFilter(0), 'inline_level', [
            (1, True),
            (2, True),
            (3, True),
            (4, True),
            (5, True),
            (6, False),
        ])

        record.level = 1

        expect(LevelFilter(0, inline_max_depth=0), 'inline_level', [
            (1000, True),
        ])

        expect(LevelFilter(5, inline_max_depth=1), 'inline_level', [
            (1, True),
            (2, False),
        ])
Example #18
0
    def to_plain(self) -> URLRecord:
        '''Copy this object's fields into a new URLRecord.

        ``status`` is wrapped in Status. ``link_type`` is wrapped in LinkType
        when it is truthy and set to None otherwise. All other fields are
        copied as they are.
        '''
        plain = URLRecord()

        plain.url = self.url
        plain.parent_url = self.parent_url
        plain.root_url = self.root_url
        plain.status = Status(self.status)
        plain.try_count = self.try_count
        plain.level = self.level
        plain.inline_level = self.inline_level

        if self.link_type:
            plain.link_type = LinkType(self.link_type)
        else:
            plain.link_type = None

        plain.priority = self.priority
        plain.post_data = self.post_data
        plain.status_code = self.status_code
        plain.filename = self.filename

        return plain
Example #19
0
    def child_url_record(self, url: str, inline: bool=False,
                         link_type: Optional[LinkType]=None,
                         post_data: Optional[str]=None,
                         level: Optional[int]=None):
        '''Build a URLRecord for ``url`` as a child of the current record.

        The record is only constructed and returned here, which makes this
        handy for running filters before the URL is added to the table.

        Args:
            url: URL of the child.
            inline: If true, the child's inline level is the current
                record's inline level (or 0 when that is falsy) plus one;
                otherwise it is 0.
            link_type: Stored on the child unchanged.
            post_data: Stored on the child unchanged.
            level: Level for the child. When None, the current record's
                level plus one is used.

        Returns:
            URLRecord: The new record, with status ``todo`` and a try
            count of 0.
        '''
        child = URLRecord()
        child.url = url
        child.status = Status.todo
        child.try_count = 0

        if level is None:
            child.level = self.url_record.level + 1
        else:
            child.level = level

        # Fall back to the current record's own URL when it has no root.
        child.root_url = self.url_record.root_url or self.url_record.url
        child.parent_url = self.url_record.url

        if inline:
            child.inline_level = (self.url_record.inline_level or 0) + 1
        else:
            child.inline_level = 0

        child.link_type = link_type
        child.post_data = post_data

        return child
Example #20
0
    def to_plain(self) -> URLRecord:
        '''Copy this object's fields into a new URLRecord.

        ``status`` is wrapped in Status. ``link_type`` is wrapped in LinkType
        when it is truthy and set to None otherwise. All other fields are
        copied as they are.
        '''
        plain = URLRecord()

        plain.url = self.url
        plain.parent_url = self.parent_url
        plain.root_url = self.root_url
        plain.status = Status(self.status)
        plain.try_count = self.try_count
        plain.level = self.level
        plain.inline_level = self.inline_level

        if self.link_type:
            plain.link_type = LinkType(self.link_type)
        else:
            plain.link_type = None

        plain.priority = self.priority
        plain.post_data = self.post_data
        plain.status_code = self.status_code
        plain.filename = self.filename

        return plain