Esempio n. 1
0
    def test_fully_qualified_url_can_be_created_from_uri(self):
        """Test fully qualified url can be created from a uri and url."""
        base = Url.from_string('https://example.com/foo')

        # (uri, expected fully qualified url) pairs resolved against base.
        cases = [
            ('bar', 'https://example.com/foo/bar'),
            ('/bar', 'https://example.com/bar'),
            ('//foo.com', 'https://foo.com'),
        ]
        for uri, expected in cases:
            self.assertEqual(expected, base.create_child_url(uri).to_string())

        # A base with a query string: child path replaces it, fragment kept.
        base = Url.from_string('https://example.com?foo=bar')
        self.assertEqual(
            'https://example.com/bar#foo',
            base.create_child_url('/bar#foo').to_string()
        )
Esempio n. 2
0
    def test_trailing_slash_is_removed_from_path(self):
        """Test trailing slash of path is treated the same as without slash."""
        expected = 'https://example.com/foo/bar'
        with_slash = Url.from_string('https://example.com/foo/bar/')
        without_slash = Url.from_string('https://example.com/foo/bar')

        # Both variants normalize to the slash-less form.
        self.assertEqual(expected, with_slash.to_string())
        self.assertEqual(expected, without_slash.to_string())
Esempio n. 3
0
    def test_crawler_can_read_next_url_from_index(self):
        """Test crawler can read next url from source."""
        expected = Url.from_string('https://example.com/foo')

        # Index hands out the url and must be asked to remove it afterwards.
        index = Index()
        index.remove_uncrawled_url = MagicMock()
        index.random_uncrawled_url = MagicMock(return_value=expected)

        self.crawler = Crawler(self.path_to_url_source, index)

        self.assertEqual(
            expected.to_string(),
            self.crawler._next_url().to_string())
        index.remove_uncrawled_url.assert_called_with(expected.hash())
Esempio n. 4
0
    def test_filename_can_be_created(self):
        """Test filename can be created from url."""
        # (url, expected filename) pairs covering path, root, query, fragment.
        cases = [
            ('https://example.com/foo/BAR-baz-bAt', 'bar-baz-bat.png'),
            ('https://example.com/', 'index.png'),
            ('https://example.com?someid=123&otherid=FOO',
             'someid-123-and-otherid-foo.png'),
            ('https://example.com?someid=123456#foobarbaz', 'foobarbaz.png'),
        ]
        for raw, expected in cases:
            self.assertEqual(expected, Url.from_string(raw).make_filename())
Esempio n. 5
0
    def test_directory_can_be_created(self):
        """Test directory can be created from url."""
        # (url, expected directory) pairs covering path, root, query, fragment.
        cases = [
            ('https://example.com/foo/baz/BAR-123/index.html',
             '/foo/baz/bar-123/'),
            ('https://example.com', '/'),
            ('https://example.com/', '/'),
            ('https://example.com/foo?someid=123&otherid=FOO', '/foo/?/'),
            ('https://example.com/1/2?id=123&param#foo',
             '/1/2/?/id-123-and-param/#/'),
        ]
        for raw, expected in cases:
            self.assertEqual(expected, Url.from_string(raw).make_directory())
Esempio n. 6
0
    def test_index_can_store_photo(self):
        """Test index can store a photo."""
        self.index.es.index = MagicMock()
        # Freeze time so the asserted timestamp matches the stored one.
        time.time = MagicMock(return_value=time.time())

        url = Url.from_string('http://example.com')
        path = PhotoPath(self.datadir)
        path.filesize = MagicMock(return_value=10000)
        photo = LoadingPhoto(url=url, path=path, refresh_rate=refresh.Hourly)

        self.index.save_photo(photo)

        expected_body = {
            'url_id': url.hash(),
            'refresh_rate': refresh.Hourly.lock_format(),
            'captured_at': refresh.Hourly().lock(),
            'filesize': photo.filesize(),
            'filename': photo.filename(),
            'directory': photo.directory(),
            'domain': photo.domain(),
            'timestamp': int(time.time())
        }
        self.index.es.index.assert_called_with(
            index='photos',
            doc_type='photo',
            id=path.uuid,
            body=expected_body)
Esempio n. 7
0
    def random_uncrawled_url(self) -> Url:
        """Get random uncrawled url.

        Returns:
            An uncrawled url
            Url

        Raises:
            EmptySearchResultException: if index is empty
        """
        # Randomly score documents not yet flagged as crawled so each
        # call can return a different url.
        query = {
            'query': {
                'function_score': {
                    'query': {
                        'bool': {
                            'must_not': {
                                'term': {
                                    'crawled': True
                                }
                            }
                        }
                    },
                    'random_score': {}
                }
            }
        }
        res = self.es.search(index=Index.UNCRAWLED, size=1, body=query)

        if res['hits']['total'] == 0:
            raise EmptySearchResultException('uncrawled index is empty')

        return Url.from_string(res['hits']['hits'][0]['_source']['url'])
Esempio n. 8
0
    def _next_url(self):
        """Get next url to crawl.

        Pops the first url off the source file and writes the remaining
        lines back; falls back to the index when the source is empty.

        Returns:
            A url to crawl, None if no url was found
            Url or None
        """
        # Single read replaces the old count-then-reread double pass.
        self._open_source('r')
        lines = self.source.read().split('\n')

        # split('\n') on an empty file yields [''], so this also covers
        # the previous "zero lines" check.
        if lines[0] == '':
            return self._next_url_in_index()

        # Write back everything AFTER the first line. The previous
        # filter (line != lines[0]) also dropped later lines that
        # happened to equal the first url, silently losing duplicates.
        self._open_source('w')
        for line in lines[1:]:
            if line.strip() != '':
                self.source.write(line + '\n')
        self._close_source()

        url = Url.from_string(lines[0])
        return url
Esempio n. 9
0
    def test_url_can_be_hashed(self):
        """Test url can be hashed."""
        raw = 'https://example.com/foo/bar/baz?bax=bat&moo=loo#hello'
        # Expected digest computed directly (avoids shadowing builtin hash).
        expected = hashlib.sha256(raw.encode()).hexdigest()

        self.assertEqual(expected, Url.from_string(raw).hash())
Esempio n. 10
0
    def _route(self, url: Url):
        """Route camera to url.

        Args:
            url: A Url to route camera to
        """
        driver = self.webdriver
        driver.set_page_load_timeout(10)  # seconds before giving up the load
        driver.get(url.to_string())
Esempio n. 11
0
    def test_crawler_can_read_next_url_from_source(self):
        """Test crawler can read next url from source."""
        self.add_url_source('https://example.com')
        self.crawler = Crawler(self.path_to_url_source, Index())

        expected = Url.from_string('https://example.com').to_string()
        self.assertEqual(expected, self.crawler._next_url().to_string())
Esempio n. 12
0
 def test_loading_photo_can_be_saved_to_datadir(self):
     """Test a loading photo can be saved to data directory."""
     target = PhotoPath(self.datadir)
     photo = LoadingPhoto(
         url=Url.from_string('https://example.com'),
         path=target,
         refresh_rate=refresh.Hourly)

     photo.save_loading_text()

     # File exists on disk and contains the placeholder text.
     self.assertTrue(isfile(target.full_path()))
     self.assertEqual('loading', photo.get_raw())
Esempio n. 13
0
 def test_url_can_be_created_from_string(self):
     """Test url can be created from string."""
     parsed = Url.from_string(
         'https://example.com/foo/bar/baz?bax=bat&moo=loo#hello')

     # Every component of the url is parsed out.
     self.assertEqual('https', parsed.scheme)
     self.assertEqual('example.com', parsed.domain)
     self.assertEqual('/foo/bar/baz', parsed.path)
     self.assertEqual('bax=bat&moo=loo', parsed.query)
     self.assertEqual('hello', parsed.fragment)
Esempio n. 14
0
    def does_url_checkout(self):
        """Test does checkout of url.

        Add mock to affected index methods.
        """
        mocked_url = Url.from_string('https://example.com')
        self.index.recently_crawled_url = MagicMock(return_value=mocked_url)
        self.index.crawled_urls_count = MagicMock(return_value=1)
        self.index.lock_crawled_url = MagicMock()
Esempio n. 15
0
    def get_page(url: Url) -> Page:
        """Get page.

        Fetch page at url

        Args:
            url: Url page is located at

        Returns:
            The requested page if response is 200 otherwise None
            Page
        """
        page = Page()
        try:
            with urllib.request.urlopen(url.to_string()) as response:
                # Read status, headers and body INSIDE the context manager;
                # previously they were accessed after the response was closed.
                html = response.read()
                page.status_code = response.getcode()
                # HTTPMessage.get is case-insensitive, replacing the manual
                # scan over all headers; only set the attribute when present
                # to keep the original behavior for missing headers.
                content_type = response.headers.get('content-type')
                if content_type is not None:
                    page.content_type = content_type
        except HTTPError as error:
            page.status_code = error.getcode()
            return page

        # Absolute links are added directly; anything that fails to parse
        # as a full url is retried as a child of the current url, and
        # links invalid either way are skipped.
        for link in LinkParser().parse(str(html)):
            try:
                page.add_url(Url.from_string(link))
            except InvalidUrlException:
                try:
                    page.add_url(url.create_child_url(link))
                except InvalidUrlException:
                    pass
        return page
Esempio n. 16
0
    def set_status_code_for_crawled_url(self, url: Url, status_code: int):
        """Set status code for a crawled url.

        Args:
            url: Url to set status code of
            status_code: the status code of the http request to the url
        """
        # Partial document update keyed on the url hash.
        partial_doc = {'doc': {'status_code': status_code}}
        self.es.update(index=Index.CRAWLED,
                       doc_type='url',
                       id=url.hash(),
                       retry_on_conflict=3,
                       body=partial_doc)
Esempio n. 17
0
    def test_filesystem_can_translate_path_to_file_in_datadir(self):
        """Test filesystem can translate path to file in datadir."""
        datadir_path = PhotoPath(self.datadir)
        photo = Screenshot(
            Url.from_string('https://example.com/foo/bar'),
            datadir_path,
            self.refresh_rate)

        self.index.es.index = MagicMock()
        photo.path.filesize = MagicMock(return_value=10000)
        self.index.save_photo(photo)

        # Index lookups are mocked so translation only exercises path logic.
        self.index.photos_file_exists = MagicMock(return_value=123000)
        self.index.photos_get_photo = MagicMock(return_value=photo)

        translated = self.filesystem._translate_path(
            '/example.com/2019-01-13H20:00/foo/bar.png')
        self.assertEqual(datadir_path.full_path(), translated)
Esempio n. 18
0
    def test_crawler_removes_urls_read_from_source(self):
        """Test crawler removes urls read from source."""
        urls = [
            'https://example.com',
            'https://example.com/foo',
            'https://example.com/bar',
        ]
        for url in urls:
            self.add_url_source(url)

        self.crawler = Crawler(self.path_to_url_source, Index())

        # Each call consumes the first line, so urls come back in
        # the order they were added.
        for url in urls:
            self.assertEqual(
                Url.from_string(url).to_string(),
                self.crawler._next_url().to_string())

        self.crawler = Crawler(self.path_to_url_source, Index())
Esempio n. 19
0
    def recently_crawled_url(self, refresh_rate=RefreshRate):
        """Get recently crawled url.

        Picks from the last 5 crawled to prevent two running
        photographers from fetching the same one.

        Args:
            refresh_rate: the refresh rate to search for and
                use to avoid locked urls

        Returns:
            A url that has been crawled with status code 200
            Url

        Raises:
            EmptySearchResultException: if no url was found
        """
        # Successful crawls (200) not locked for this refresh rate,
        # newest first.
        query = {
            'query': {
                'bool': {
                    'must': {
                        'term': {
                            'status_code': 200,
                        }
                    },
                    'must_not': [{
                        'term': {
                            'lock_value':
                            refresh_rate().lock(),
                        }
                    }]
                }
            },
            'sort': [{
                'timestamp': {
                    'order': 'desc'
                }
            }]
        }
        res = self.es.search(index=Index.CRAWLED, size=5, body=query)

        if res['hits']['total'] == 0:
            raise EmptySearchResultException('no crawled url was found')

        # Random pick within the newest 5 reduces photographer collisions.
        hit = random.choice(res['hits']['hits'])
        return Url.from_string(hit['_source']['url'])
Esempio n. 20
0
    def test_lock_can_be_placed_on_crawled_url(self):
        """Test lock can be placed on crawled url."""
        url = Url.from_string('http://example.com')
        self.index.es.update = MagicMock()

        self.index.lock_crawled_url(url, refresh.Hourly)

        expected_doc = {
            'doc': {
                'lock_format': refresh.Hourly.lock_format(),
                'lock_value': refresh.Hourly().lock(),
            }
        }
        self.index.es.update.assert_called_with(
            index='crawled',
            doc_type='url',
            id=url.hash(),
            retry_on_conflict=3,
            body=expected_doc)
Esempio n. 21
0
    def lock_crawled_url(self, url: Url, refresh_rate: Type[RefreshRate]):
        """Lock a crawled url.

        Place a lock on a crawled url for a given refresh rate.

        Args:
            url: Url to lock
            refresh_rate: Refresh rate to use (Hourly, Daily, etc.)
        """
        lock_doc = {
            'doc': {
                'lock_format': refresh_rate.lock_format(),
                'lock_value': refresh_rate().lock(),
            }
        }
        self.es.update(index=Index.CRAWLED,
                       doc_type='url',
                       id=url.hash(),
                       retry_on_conflict=3,
                       body=lock_doc)
Esempio n. 22
0
 def setUp(self):
     """Set up test fixtures: camera, mocked webdriver, url and datadir."""
     # NOTE(review): Camera() is constructed BEFORE Firefox.__init__ is
     # mocked — confirm this ordering is intentional; swapping the two
     # lines would change which constructor runs for real.
     self.camera = Camera()
     # Prevent later Firefox instantiations from launching a real browser.
     webdriver.Firefox.__init__ = MagicMock(return_value=None)
     self.url = Url.from_string('https://example.com')
     # Data directory lives next to this test file.
     self.datadir = DataDirectory(dirname(__file__) + '/datadir')
Esempio n. 23
0
 def test_url_can_be_converted_back_to_string(self):
     """Test url can be converted back to string."""
     raw = 'https://example.com/foo/bar/baz?bax=bat&moo=loo#hello'
     # Round trip through Url must be lossless.
     self.assertEqual(raw, Url.from_string(raw).to_string())