def new_mock_url_record():
    '''Return a ``URLRecord`` fixture at level 0.

    Both ``url`` and ``parent_url`` are set to ``http://example.com``.
    '''
    mock_url = 'http://example.com'

    record = URLRecord()
    record.url = mock_url
    record.parent_url = mock_url
    record.level = 0
    return record
def test_add_referer(self):
    '''Check that ``_add_referrer`` puts the record's parent URL into the
    request's ``Referer`` field when parent and child are both HTTP.
    '''
    parent_url = 'http://example.com/'

    outgoing = Request()
    child = URLRecord()
    child.parent_url = parent_url
    child.url = 'http://example.com/image.png'

    WebProcessorSession._add_referrer(outgoing, child)

    self.assertEqual(parent_url, outgoing.fields['Referer'])
def test_add_referer_https_to_http(self):
    '''Check that ``_add_referrer`` leaves the referrer field unset when
    the parent URL is HTTPS and the child URL is HTTP.
    '''
    outgoing = Request()
    child = URLRecord()
    child.parent_url = 'https://example.com/'
    child.url = 'http://example.com/image.png'

    WebProcessorSession._add_referrer(outgoing, child)

    # NOTE(review): the key is lower-case here while test_add_referer reads
    # 'Referer' -- presumably field lookup is case-insensitive; confirm,
    # otherwise this assertion passes vacuously.
    self.assertNotIn('referer', outgoing.fields)
def to_plain(self) -> URLRecord:
    '''Copy this object's fields into a new ``URLRecord`` and return it.

    ``status`` is wrapped in ``Status``. ``link_type`` is wrapped in
    ``LinkType`` when truthy and becomes ``None`` otherwise. Every other
    field is copied unchanged.
    '''
    verbatim_fields = (
        'url',
        'parent_url',
        'root_url',
        'try_count',
        'level',
        'inline_level',
        'priority',
        'post_data',
        'status_code',
        'filename',
    )

    plain = URLRecord()

    for field_name in verbatim_fields:
        setattr(plain, field_name, getattr(self, field_name))

    plain.status = Status(self.status)

    raw_link_type = self.link_type
    if raw_link_type:
        plain.link_type = LinkType(raw_link_type)
    else:
        plain.link_type = None

    return plain
def child_url_record(self, url: str, inline: bool=False,
                     link_type: Optional[LinkType]=None,
                     post_data: Optional[str]=None,
                     level: Optional[int]=None):
    '''Return a child URLRecord.

    This function is useful for testing filters before adding to table.

    Args:
        url: URL stored on the child record.
        inline: When true, the child's inline level is the parent's
            inline level (treated as 0 if falsy) plus one; otherwise 0.
        link_type: Stored on the child unchanged.
        post_data: Stored on the child unchanged.
        level: Explicit level for the child. When ``None``, the parent's
            level plus one is used.

    Returns:
        A new ``URLRecord`` with status ``Status.todo`` and a try count
        of 0, whose parent URL is the current record's URL.
    '''
    child = URLRecord()
    child.url = url
    child.status = Status.todo
    child.try_count = 0

    parent = self.url_record

    if level is None:
        child.level = parent.level + 1
    else:
        child.level = level

    # Inherit the root from the parent; a parent without a root is the root.
    child.root_url = parent.root_url or parent.url
    child.parent_url = parent.url

    if inline:
        child.inline_level = (parent.inline_level or 0) + 1
    else:
        child.inline_level = 0

    child.link_type = link_type
    child.post_data = post_data
    return child
def test_follow_ftp_filter(self):
    '''Exercise ``FollowFTPFilter`` against HTTP, HTTPS, mailto and FTP
    URLs, with and without ``follow=True``, for a record with no parent,
    an HTTP parent and an FTP parent.

    The only rejected case is an FTP URL reached from an HTTP parent with
    the default (non-following) filter.
    '''
    # NOTE(review): the mailto literal looks like an e-mail-obfuscation
    # placeholder rather than a real address -- confirm against upstream.
    candidate_urls = (
        'http://wolf.farts/1',
        'https://wolf.farts/1',
        'mailto:[email protected]',
        'ftp://wolf.farts/',
    )
    accept_all = (True, True, True, True)

    def check(active_filter, url_record, expectations):
        # One assertion per candidate URL, in the order listed above.
        for candidate, expected in zip(candidate_urls, expectations):
            verdict = active_filter.test(
                URLInfo.parse(candidate), url_record
            )
            if expected:
                self.assertTrue(verdict)
            else:
                self.assertFalse(verdict)

    record = URLRecord()

    # Default filter, record without a parent URL.
    url_filter = FollowFTPFilter()
    check(url_filter, record, accept_all)

    # Default filter, HTTP parent: only the FTP URL is rejected.
    record.parent_url = 'http://wolf.farts'
    check(url_filter, record, (True, True, True, False))

    # Following filter, HTTP parent.
    url_filter = FollowFTPFilter(follow=True)
    check(url_filter, record, accept_all)

    # Following filter, FTP parent.
    record.parent_url = 'ftp://wolf.farts'
    check(url_filter, record, accept_all)

    # Fresh following filter, FTP parent (same configuration as above).
    url_filter = FollowFTPFilter(follow=True)
    check(url_filter, record, accept_all)
def test_span_hosts_filter(self):
    '''Exercise ``SpanHostsFilter`` built from the ``example.com`` hostname
    with the ``enabled``, ``page_requisites`` and ``linked_pages`` options.
    '''
    def build_filter(**options):
        # A fresh hostname list is parsed for every filter instance.
        hostnames = [
            URLInfo.parse('http://example.com/blog/').hostname,
        ]
        return SpanHostsFilter(hostnames, **options)

    def verdict(active_filter, url, url_record):
        return active_filter.test(URLInfo.parse(url), url_record)

    same_host_url = 'http://example.com/blog/topic1/blah.html'
    other_host_url = 'http://hotdog.example/blog/topic1/blah.html'

    record = URLRecord()
    record.url = 'http://example.com'

    # Spanning disabled: only the listed host passes.
    url_filter = build_filter(enabled=False)
    self.assertTrue(verdict(url_filter, same_host_url, record))
    self.assertFalse(verdict(url_filter, other_host_url, record))

    # Spanning enabled: other hosts pass as well.
    url_filter = build_filter(enabled=True)
    self.assertTrue(verdict(url_filter, same_host_url, record))
    self.assertTrue(verdict(url_filter, other_host_url, record))

    # page_requisites: an inline record on another host passes.
    url_filter = build_filter(page_requisites=True)
    record = URLRecord()
    record.url = 'http://1.example.com/'
    record.inline_level = 1
    self.assertTrue(verdict(url_filter, 'http://1.example.com/', record))

    # linked_pages: passes when the parent is on the listed host ...
    url_filter = build_filter(linked_pages=True)
    record = URLRecord()
    record.url = 'http://1.example.com/'
    record.parent_url = 'http://example.com/blog/'
    self.assertTrue(verdict(url_filter, 'http://1.example.com/', record))

    # ... and is rejected when the parent is itself on the other host.
    record = URLRecord()
    record.url = 'http://1.example.com/blah.html'
    record.parent_url = 'http://1.example.com/'
    self.assertFalse(
        verdict(url_filter, 'http://1.example.com/blah.html', record)
    )