def test_fully_qualified_url_can_be_created_from_uri(self):
    """Test fully qualified url can be created from a uri and url."""
    base = Url.from_string('https://example.com/foo')
    # (child uri, expected fully qualified url)
    expectations = [
        ('bar', 'https://example.com/foo/bar'),
        ('/bar', 'https://example.com/bar'),
        ('//foo.com', 'https://foo.com'),
    ]
    for uri, expected in expectations:
        self.assertEqual(expected, base.create_child_url(uri).to_string())
    # a query string in the base url should not leak into the child url
    base = Url.from_string('https://example.com?foo=bar')
    self.assertEqual(
        'https://example.com/bar#foo',
        base.create_child_url('/bar#foo').to_string()
    )
def test_trailing_slash_is_removed_from_path(self):
    """Test trailing slash of path is treated the same as without slash."""
    expected = 'https://example.com/foo/bar'
    with_slash = Url.from_string('https://example.com/foo/bar/')
    without_slash = Url.from_string('https://example.com/foo/bar')
    # both variants should normalize to the slash-less form
    self.assertEqual(expected, with_slash.to_string())
    self.assertEqual(expected, without_slash.to_string())
def test_crawler_can_read_next_url_from_index(self):
    """Test crawler can read next url from source."""
    url = Url.from_string('https://example.com/foo')
    index = Index()
    index.remove_uncrawled_url = MagicMock()
    index.random_uncrawled_url = MagicMock(return_value=url)
    self.crawler = Crawler(self.path_to_url_source, index)
    next_url = self.crawler._next_url()
    self.assertEqual(
        Url.from_string('https://example.com/foo').to_string(),
        next_url.to_string())
    # the consumed url must be removed from the uncrawled index
    index.remove_uncrawled_url.assert_called_with(url.hash())
def test_filename_can_be_created(self):
    """Test filename can be created from url."""
    # (url, expected filename)
    cases = [
        ('https://example.com/foo/BAR-baz-bAt', 'bar-baz-bat.png'),
        ('https://example.com/', 'index.png'),
        ('https://example.com?someid=123&otherid=FOO',
         'someid-123-and-otherid-foo.png'),
        ('https://example.com?someid=123456#foobarbaz', 'foobarbaz.png'),
    ]
    for raw, expected in cases:
        self.assertEqual(expected, Url.from_string(raw).make_filename())
def test_directory_can_be_created(self):
    """Test directory can be created from url."""
    # (url, expected directory)
    cases = [
        ('https://example.com/foo/baz/BAR-123/index.html',
         '/foo/baz/bar-123/'),
        ('https://example.com', '/'),
        ('https://example.com/', '/'),
        ('https://example.com/foo?someid=123&otherid=FOO', '/foo/?/'),
        ('https://example.com/1/2?id=123&param#foo',
         '/1/2/?/id-123-and-param/#/'),
    ]
    for raw, expected in cases:
        self.assertEqual(expected, Url.from_string(raw).make_directory())
def test_index_can_store_photo(self):
    """Test index can store a photo.

    Freezes ``time.time`` for the duration of the test so the indexed
    timestamp can be asserted, and restores the original function
    afterwards so the patch does not leak into other tests.
    """
    self.index.es.index = MagicMock()
    original_time = time.time
    frozen_now = time.time()
    # freeze time so the 'timestamp' field is deterministic
    time.time = MagicMock(return_value=frozen_now)
    try:
        url = Url.from_string('http://example.com')
        path = PhotoPath(self.datadir)
        path.filesize = MagicMock(return_value=10000)
        photo = LoadingPhoto(
            url=url,
            path=path,
            refresh_rate=refresh.Hourly
        )
        self.index.save_photo(photo)
        self.index.es.index.assert_called_with(
            index='photos',
            doc_type='photo',
            id=path.uuid,
            body={
                'url_id': url.hash(),
                'refresh_rate': refresh.Hourly.lock_format(),
                'captured_at': refresh.Hourly().lock(),
                'filesize': photo.filesize(),
                'filename': photo.filename(),
                'directory': photo.directory(),
                'domain': photo.domain(),
                'timestamp': int(frozen_now)
            })
    finally:
        # undo the global monkey-patch
        time.time = original_time
def random_uncrawled_url(self) -> Url:
    """Get random uncrawled url.

    Returns:
        An uncrawled url
        Url

    Raises:
        EmptySearchResultException: if index is empty
    """
    # random_score shuffles the result so concurrent crawlers are
    # unlikely to pick the same document
    query = {
        'query': {
            'function_score': {
                'query': {
                    'bool': {
                        'must_not': {
                            'term': {
                                'crawled': True
                            }
                        }
                    }
                },
                'random_score': {}
            }
        }
    }
    res = self.es.search(index=Index.UNCRAWLED, size=1, body=query)
    if res['hits']['total'] == 0:
        raise EmptySearchResultException('uncrawled index is empty')
    return Url.from_string(res['hits']['hits'][0]['_source']['url'])
def _next_url(self):
    """Get next url to crawl.

    Pops the first line from the url source file and rewrites the file
    without it. Falls back to the index when the source is empty.

    Returns:
        A url to crawl, None if no url was found
        Url or None
    """
    self._open_source('r')
    lines = self.source.read().split('\n')
    # an empty source file (or an empty first line) yields '' here,
    # covering both empty-file cases with a single check
    if lines[0] == '':
        return self._next_url_in_index()
    # Rewrite the source without the consumed first line. Iterating
    # lines[1:] (instead of comparing each line against lines[0]) keeps
    # any later duplicates of the first url intact.
    self._open_source('w')
    for line in lines[1:]:
        if line.strip() != '':
            self.source.write(line + '\n')
    self._close_source()
    return Url.from_string(lines[0])
def test_url_can_be_hashed(self):
    """Test url can be hashed."""
    raw = 'https://example.com/foo/bar/baz?bax=bat&moo=loo#hello'
    # the url hash is expected to be the sha256 of the full url string
    expected = hashlib.sha256(raw.encode()).hexdigest()
    self.assertEqual(expected, Url.from_string(raw).hash())
def _route(self, url: Url):
    """Route camera to url.

    Args:
        url: A Url to route camera to
    """
    target = url.to_string()
    # give up on pages that take longer than 10s to load
    self.webdriver.set_page_load_timeout(10)
    self.webdriver.get(target)
def test_crawler_can_read_next_url_from_source(self):
    """Test crawler can read next url from source."""
    self.add_url_source('https://example.com')
    self.crawler = Crawler(self.path_to_url_source, Index())
    expected = Url.from_string('https://example.com').to_string()
    self.assertEqual(expected, self.crawler._next_url().to_string())
def test_loading_photo_can_be_saved_to_datadir(self):
    """Test a loading photo can be saved to data directory."""
    path = PhotoPath(self.datadir)
    photo = LoadingPhoto(
        url=Url.from_string('https://example.com'),
        path=path,
        refresh_rate=refresh.Hourly
    )
    photo.save_loading_text()
    # the placeholder file should exist and contain the loading marker
    self.assertTrue(isfile(path.full_path()))
    self.assertEqual('loading', photo.get_raw())
def test_url_can_be_created_from_string(self):
    """Test url can be created from string."""
    url = Url.from_string(
        'https://example.com/foo/bar/baz?bax=bat&moo=loo#hello')
    # (attribute, expected parsed value)
    expected_parts = [
        ('scheme', 'https'),
        ('domain', 'example.com'),
        ('path', '/foo/bar/baz'),
        ('query', 'bax=bat&moo=loo'),
        ('fragment', 'hello'),
    ]
    for attribute, expected in expected_parts:
        self.assertEqual(expected, getattr(url, attribute))
def does_url_checkout(self):
    """Test does checkout of url.

    Add mock to affected index methods.
    """
    mocked_url = Url.from_string('https://example.com')
    self.index.recently_crawled_url = MagicMock(return_value=mocked_url)
    self.index.crawled_urls_count = MagicMock(return_value=1)
    self.index.lock_crawled_url = MagicMock()
def get_page(url: Url) -> Page:
    """Get page.

    Fetch page at url

    Args:
        url: Url page is located at

    Returns:
        A page with status code set; on a 2xx response the content type
        and all parseable links are populated as well
        Page
    """
    parser = LinkParser()
    page = Page()
    try:
        with urllib.request.urlopen(url.to_string()) as response:
            # read status, body and headers while the response is
            # still open instead of after the with-block closes it
            html = response.read()
            page.status_code = response.getcode()
            headers = dict(response.headers)
    except HTTPError as error:
        page.status_code = error.getcode()
        return page
    for header, value in headers.items():
        # header names are case-insensitive per the HTTP spec
        if header.lower() == 'content-type':
            page.content_type = value
    links = parser.parse(str(html))
    for link in links:
        try:
            page.add_url(Url.from_string(link))
        except InvalidUrlException:
            # not an absolute url; retry as a child of the source url
            try:
                page.add_url(url.create_child_url(link))
            except InvalidUrlException:
                pass
    return page
def set_status_code_for_crawled_url(self, url: Url, status_code: int):
    """Set status code for a crawled url.

    Args:
        url: Url to set status code of
        status_code: the status code of the http request to the url
    """
    partial_doc = {'doc': {'status_code': status_code}}
    self.es.update(
        index=Index.CRAWLED,
        doc_type='url',
        id=url.hash(),
        retry_on_conflict=3,
        body=partial_doc)
def test_filesystem_can_translate_path_to_file_in_datadir(self):
    """Test filesystem can translate path to file in datadir."""
    datadir_path = PhotoPath(self.datadir)
    photo = Screenshot(
        Url.from_string('https://example.com/foo/bar'),
        datadir_path,
        self.refresh_rate
    )
    self.index.es.index = MagicMock()
    photo.path.filesize = MagicMock(return_value=10000)
    self.index.save_photo(photo)
    self.index.photos_file_exists = MagicMock(return_value=123000)
    self.index.photos_get_photo = MagicMock(return_value=photo)
    translated = self.filesystem._translate_path(
        '/example.com/2019-01-13H20:00/foo/bar.png')
    # the virtual path should resolve to the photo's real datadir path
    self.assertEqual(datadir_path.full_path(), translated)
def test_crawler_removes_urls_read_from_source(self):
    """Test crawler removes urls read from source."""
    sources = [
        'https://example.com',
        'https://example.com/foo',
        'https://example.com/bar',
    ]
    for source in sources:
        self.add_url_source(source)
    self.crawler = Crawler(self.path_to_url_source, Index())
    # each call should consume the current first line of the source,
    # so the urls come back in insertion order
    for expected in sources:
        self.assertEqual(
            Url.from_string(expected).to_string(),
            self.crawler._next_url().to_string())
    self.crawler = Crawler(self.path_to_url_source, Index())
def recently_crawled_url(self, refresh_rate=RefreshRate):
    """Get recently crawled url.

    Picks from the last 5 crawled to prevent two running photographers
    from fetching the same one.

    Args:
        refresh_rate: the refresh rate to search for and use
            to avoid locked urls

    Returns:
        A url that has been crawled with status code 200
        Url

    Raises:
        EmptySearchResultException: if no url was found
    """
    query = {
        'query': {
            'bool': {
                'must': {
                    'term': {
                        'status_code': 200,
                    }
                },
                # skip urls already locked for this refresh rate
                'must_not': [{
                    'term': {
                        'lock_value': refresh_rate().lock(),
                    }
                }]
            }
        },
        'sort': [{
            'timestamp': {
                'order': 'desc'
            }
        }]
    }
    res = self.es.search(index=Index.CRAWLED, size=5, body=query)
    if res['hits']['total'] == 0:
        raise EmptySearchResultException('no crawled url was found')
    hit = random.choice(res['hits']['hits'])
    return Url.from_string(hit['_source']['url'])
def test_lock_can_be_placed_on_crawled_url(self):
    """Test lock can be placed on crawled url."""
    url = Url.from_string('http://example.com')
    self.index.es.update = MagicMock()
    self.index.lock_crawled_url(url, refresh.Hourly)
    expected_body = {
        'doc': {
            'lock_format': refresh.Hourly.lock_format(),
            'lock_value': refresh.Hourly().lock(),
        }
    }
    self.index.es.update.assert_called_with(
        index='crawled',
        doc_type='url',
        id=url.hash(),
        retry_on_conflict=3,
        body=expected_body)
def lock_crawled_url(self, url: Url, refresh_rate: Type[RefreshRate]):
    """Lock a crawled url.

    Place a lock on a crawled url for a given refresh rate.

    Args:
        url: Url to lock
        refresh_rate: Refresh rate to use (Hourly, Daily, etc.)
    """
    lock_doc = {
        'doc': {
            'lock_format': refresh_rate.lock_format(),
            'lock_value': refresh_rate().lock(),
        }
    }
    self.es.update(
        index=Index.CRAWLED,
        doc_type='url',
        id=url.hash(),
        retry_on_conflict=3,
        body=lock_doc)
def setUp(self):
    """Set up test."""
    self.camera = Camera()
    # stub out the Firefox constructor so no real browser is launched
    webdriver.Firefox.__init__ = MagicMock(return_value=None)
    self.url = Url.from_string('https://example.com')
    self.datadir = DataDirectory(f'{dirname(__file__)}/datadir')
def test_url_can_be_converted_back_to_string(self):
    """Test url can be converted back to string."""
    raw = 'https://example.com/foo/bar/baz?bax=bat&moo=loo#hello'
    # parsing and serializing should round-trip to the same string
    self.assertEqual(raw, Url.from_string(raw).to_string())