def test_level_filter(self):
    '''Check LevelFilter verdicts for various level / inline_level pairs.'''
    rec = URLRecord()

    def verify(level_filter, expected):
        # Pick the matching unittest assertion for the expected verdict.
        assertion = self.assertTrue if expected else self.assertFalse
        assertion(level_filter.test(None, rec))

    # LevelFilter(0) accepts a record sitting at level 4.
    rec.level = 4
    verify(LevelFilter(0), True)

    # LevelFilter(5) with inline_level left untouched on the record.
    five_deep = LevelFilter(5)
    for level, expected in ((5, True), (6, False)):
        rec.level = level
        verify(five_deep, expected)

    # LevelFilter(5) again, now with the record marked as inline.
    five_deep = LevelFilter(5)
    rec.inline_level = 1
    for level, expected in ((5, True), (6, True), (7, True), (8, False)):
        rec.level = level
        verify(five_deep, expected)

    # LevelFilter(0) while stepping inline_level upwards (level stays 8).
    zero_deep = LevelFilter(0)
    for inline_level in range(1, 7):
        rec.inline_level = inline_level
        verify(zero_deep, inline_level != 6)

    # Explicit inline_max_depth values.
    rec.level = 1
    no_inline_cap = LevelFilter(0, inline_max_depth=0)
    rec.inline_level = 1000
    verify(no_inline_cap, True)

    inline_cap_one = LevelFilter(5, inline_max_depth=1)
    for inline_level, expected in ((1, True), (2, False)):
        rec.inline_level = inline_level
        verify(inline_cap_one, expected)
def test_recursive_filter_requisites(self):
    '''RecursiveFilter(page_requisites=True) passes a level-0 inline record.'''
    inline_record = URLRecord()
    inline_record.level = 0
    inline_record.inline_level = 1

    requisites_filter = RecursiveFilter(page_requisites=True)
    verdict = requisites_filter.test(None, inline_record)

    self.assertTrue(verdict)
def to_plain(self) -> URLRecord:
    '''Return a new URLRecord populated from this object's fields.

    Most fields are copied as they are; ``status`` is wrapped in ``Status``
    and a truthy ``link_type`` is wrapped in ``LinkType`` (otherwise the
    plain record's ``link_type`` is ``None``).
    '''
    plain = URLRecord()

    # Fields that are transferred without any conversion.
    copied_fields = (
        'url', 'parent_url', 'root_url', 'try_count', 'level',
        'inline_level', 'priority', 'post_data', 'status_code', 'filename',
    )
    for field_name in copied_fields:
        setattr(plain, field_name, getattr(self, field_name))

    # Fields that are converted to their enum-like wrappers.
    plain.status = Status(self.status)
    if self.link_type:
        plain.link_type = LinkType(self.link_type)
    else:
        plain.link_type = None

    return plain
def child_url_record(self, url: str, inline: bool=False,
                     link_type: Optional[LinkType]=None,
                     post_data: Optional[str]=None,
                     level: Optional[int]=None):
    '''Build and return a URLRecord that is a child of the current record.

    Handy for running filters against a candidate URL before it is added
    to the table.

    The child's level is ``level`` when given, otherwise the parent's
    level plus one. Its inline level is the parent's inline level (treated
    as 0 when falsy) plus one if ``inline`` is truthy, otherwise 0.
    '''
    parent = self.url_record
    child = URLRecord()

    child.url = url
    child.status = Status.todo
    child.try_count = 0

    if level is None:
        child.level = parent.level + 1
    else:
        child.level = level

    # Fall back to the parent's own URL when it has no root URL.
    child.root_url = parent.root_url or parent.url
    child.parent_url = parent.url

    if inline:
        child.inline_level = (parent.inline_level or 0) + 1
    else:
        child.inline_level = 0

    child.link_type = link_type
    child.post_data = post_data

    return child
def test_parent_filter(self):
    '''Check ParentFilter verdicts against a record's root_url.'''
    rec = URLRecord()
    parent_filter = ParentFilter()

    def verify(candidate_url, expected):
        # Pick the matching unittest assertion for the expected verdict.
        assertion = self.assertTrue if expected else self.assertFalse
        assertion(parent_filter.test(URLInfo.parse(candidate_url), rec))

    # The root URL itself passes.
    rec.root_url = 'http://example.com/blog/topic2/'
    verify('http://example.com/blog/topic2/', True)

    rec.root_url = 'http://example.com/blog/topic1/'
    expectations = (
        ('http://example.com/blog/topic1/blah.html', True),
        ('https://example.com/blog/topic1/blah2.html', True),
        ('http://example.com/blog/', False),
        ('https://example.com/blog/', False),
        ('http://somewhere.com/', True),
        ('https://somewhere.com/', True),
    )
    for candidate_url, expected in expectations:
        verify(candidate_url, expected)

    # With the record marked inline, a URL outside the root path passes.
    rec.inline_level = 1
    verify('http://example.com/styles.css', True)
def test_span_hosts_filter(self):
    '''Check SpanHostsFilter with each of its option flags.'''
    same_host_url = 'http://example.com/blog/topic1/blah.html'
    other_host_url = 'http://hotdog.example/blog/topic1/blah.html'

    def make_filter(**options):
        # Every filter in this test is seeded with the same single hostname.
        return SpanHostsFilter(
            [URLInfo.parse('http://example.com/blog/').hostname],
            **options
        )

    def verify(hosts_filter, candidate_url, rec, expected):
        # Pick the matching unittest assertion for the expected verdict.
        assertion = self.assertTrue if expected else self.assertFalse
        assertion(hosts_filter.test(URLInfo.parse(candidate_url), rec))

    rec = URLRecord()
    rec.url = 'http://example.com'

    # enabled=False: only the seeded host passes.
    hosts_filter = make_filter(enabled=False)
    verify(hosts_filter, same_host_url, rec, True)
    verify(hosts_filter, other_host_url, rec, False)

    # enabled=True: the other host passes as well.
    hosts_filter = make_filter(enabled=True)
    verify(hosts_filter, same_host_url, rec, True)
    verify(hosts_filter, other_host_url, rec, True)

    # page_requisites=True: an inline record on another host passes.
    hosts_filter = make_filter(page_requisites=True)
    rec = URLRecord()
    rec.url = 'http://1.example.com/'
    rec.inline_level = 1
    verify(hosts_filter, 'http://1.example.com/', rec, True)

    # linked_pages=True: passes when the parent is on the seeded host...
    hosts_filter = make_filter(linked_pages=True)
    rec = URLRecord()
    rec.url = 'http://1.example.com/'
    rec.parent_url = 'http://example.com/blog/'
    verify(hosts_filter, 'http://1.example.com/', rec, True)

    # ...but not when the parent is itself on the other host.
    rec = URLRecord()
    rec.url = 'http://1.example.com/blah.html'
    rec.parent_url = 'http://1.example.com/'
    verify(hosts_filter, 'http://1.example.com/blah.html', rec, False)