Ejemplo n.º 1
0
    def test_index_can_store_photo(self):
        """Test index can store a photo.

        Saves a ``LoadingPhoto`` through ``self.index.save_photo`` and checks
        that ``self.index.es.index`` (replaced by a mock) received the
        expected ``photos`` document.

        ``time.time`` is frozen with ``patch.object`` rather than by assigning
        to the module attribute, so the real clock is restored when the
        ``with`` block exits -- even if an assertion fails -- and the frozen
        clock cannot leak into other tests.
        """
        # Function-scope import: only ``MagicMock`` is visibly in scope here.
        from unittest.mock import patch

        self.index.es.index = MagicMock()
        frozen_now = time.time()

        with patch.object(time, 'time', return_value=frozen_now):
            url = Url.from_string('http://example.com')
            path = PhotoPath(self.datadir)
            # Stub ``path.filesize`` with a fixed value.
            path.filesize = MagicMock(return_value=10000)

            photo = LoadingPhoto(
                url=url, path=path, refresh_rate=refresh.Hourly)

            self.index.save_photo(photo)
            # Expected values are computed while the clock is still frozen,
            # exactly as the assertion did before.
            self.index.es.index.assert_called_with(
                index='photos',
                doc_type='photo',
                id=path.uuid,
                body={
                    'url_id': url.hash(),
                    'refresh_rate': refresh.Hourly.lock_format(),
                    'captured_at': refresh.Hourly().lock(),
                    'filesize': photo.filesize(),
                    'filename': photo.filename(),
                    'directory': photo.directory(),
                    'domain': photo.domain(),
                    'timestamp': int(time.time())
                })
Ejemplo n.º 2
0
    def test_directories_within_a_directory_can_be_fetched(self):
        """Test directories within a directory can be fetched.

        The search stub returns four directory buckets below ``/path/to/``;
        the index is expected to answer with ``['some', 'not', 'a']`` and to
        have issued the aggregation query asserted at the end.
        """
        lock_format = refresh.Hourly.lock_format()
        captured_at = refresh.Hourly().lock()

        bucket_keys = (
            '/path/to/some/dir/',
            '/path/to/some/other/dir/',
            '/path/to/not/same/other/dir/',
            '/path/to/a/dir/',
        )
        self.search_returns_aggregation(
            'photos', [{'key': key} for key in bucket_keys])

        found = self.index.photos_list_directories_in_directory(
            domain='example.com',
            captured_at=captured_at,
            directory='/path/to/',
            refresh_rate=refresh.Hourly)

        self.assertEqual(['some', 'not', 'a'], found)

        # Filters the query is expected to carry, in order.
        filters = [
            {'term': {'domain': 'example.com'}},
            {'term': {'refresh_rate': lock_format}},
            {'term': {'captured_at': captured_at}},
            {'wildcard': {'directory': '/path/to/*'}},
        ]
        expected_body = {
            'query': {'bool': {'must': filters}},
            'aggs': {
                'photos': {
                    'terms': {'field': 'directory', 'size': 10000},
                },
            },
        }
        self.index.es.search.assert_called_with(
            index='photos', size=0, body=expected_body)
Ejemplo n.º 3
0
    def test_photo_can_be_retrieved(self):
        """Test photo can be retrieved.

        The search stub returns a single photo document; the result of
        ``photos_get_photo`` must be a ``Photo`` whose path carries the
        document id, and the search must use the term filters asserted below.
        """
        lock_format = refresh.Hourly.lock_format()
        captured_at = refresh.Hourly().lock()

        stored_doc = {
            '_id': 'uuid-xxx...',
            '_source': {
                'url_id': 'xxx...',
                'refresh_rate': lock_format,
                'captured_at': captured_at,
                'filename': 'some-filename.png',
                'directory': '/some/path/',
                'domain': 'example.com',
                'filesize': 12300,
                'timestamp': time.time(),
            },
        }
        self.search_returns_doc(stored_doc)

        result = self.index.photos_get_photo(
            domain='example.com',
            captured_at=captured_at,
            full_filename='/some/path/some-filename.png',
            refresh_rate=refresh.Hourly)

        self.assertIsInstance(result, Photo)
        self.assertEqual('uuid-xxx...', result.path.uuid)

        # (field, value) pairs of the expected ``term`` filters, in order.
        term_pairs = [
            ('domain', 'example.com'),
            ('refresh_rate', lock_format),
            ('captured_at', captured_at),
            ('directory', '/some/path/'),
            ('filename', 'some-filename.png'),
        ]
        must_clauses = [
            {'term': {field: value}} for field, value in term_pairs
        ]
        self.index.es.search.assert_called_with(
            index='photos',
            size=1,
            body={'query': {'bool': {'must': must_clauses}}})
Ejemplo n.º 4
0
    def test_lock_can_be_placed_on_crawled_url(self):
        """Test lock can be placed on crawled url.

        Calls ``lock_crawled_url`` with an hourly refresh rate and checks the
        partial-document update sent to the mocked ``es.update``.
        """
        crawled = Url.from_string('http://example.com')
        self.index.es.update = MagicMock()

        self.index.lock_crawled_url(crawled, refresh.Hourly)

        # Keyword arguments the update call is expected to have received.
        expected_call = dict(
            index='crawled',
            doc_type='url',
            id=crawled.hash(),
            retry_on_conflict=3,
            body={
                'doc': {
                    'lock_format': refresh.Hourly.lock_format(),
                    'lock_value': refresh.Hourly().lock(),
                },
            },
        )
        self.index.es.update.assert_called_with(**expected_call)
Ejemplo n.º 5
0
    def test_recently_crawled_url_can_be_fetched(self):
        """Test recently crawled url can be fetched.

        The search stub returns one crawled-url document; the result must be
        a ``Url`` for that address, and the search must match the query and
        sort order asserted at the end.
        """
        hit = {
            '_id': 'xxx...',
            '_source': {
                'url': 'http://example.com',
                'timestamp': 1547229873.257901,
            },
        }
        self.search_returns_doc(hit)

        found = self.index.recently_crawled_url(refresh.Hourly)
        self.assertIsInstance(found, Url)
        self.assertEqual('http://example.com', found.to_string())

        expected_query = {
            'bool': {
                'must': {'term': {'status_code': 200}},
                'must_not': [
                    {'term': {'lock_value': refresh.Hourly().lock()}},
                ],
            },
        }
        expected_sort = [{'timestamp': {'order': 'desc'}}]
        self.index.es.search.assert_called_with(
            index='crawled',
            size=5,
            body={'query': expected_query, 'sort': expected_sort})