def setUp(self):
    """Build a Filesystem over a throwaway data directory and a mocked ES."""
    self.console = console
    self.refresh_rate = refresh.Hourly
    datadir_path = dirname(__file__) + '/datadir'
    self.datadir = DataDirectory(datadir_path)
    self.index = Index(self.datadir, MagicMock())
    self.filesystem = Filesystem(self.index, self.refresh_rate)
def test_next_url_returns_none_if_no_url_was_found(self):
    """_next_url() yields None when the index holds no uncrawled urls."""
    index = Index()
    # empty search result from the index means "nothing left to crawl"
    index.random_uncrawled_url = MagicMock(
        side_effect=EmptySearchResultException())
    self.crawler = Crawler(self.path_to_url_source, index)
    self.assertEqual(None, self.crawler._next_url())
def test_crawler_can_read_next_url_from_index(self):
    """Crawler fetches the next url from the index and unqueues it there."""
    expected = Url.from_string('https://example.com/foo')
    index = Index()
    index.remove_uncrawled_url = MagicMock()
    index.random_uncrawled_url = MagicMock(return_value=expected)
    self.crawler = Crawler(self.path_to_url_source, index)
    self.assertEqual(
        Url.from_string('https://example.com/foo').to_string(),
        self.crawler._next_url().to_string())
    # the consumed url must be removed from the uncrawled queue
    index.remove_uncrawled_url.assert_called_with(expected.hash())
def test_crawler_can_read_next_url_from_source(self):
    """Crawler falls back to reading the next url from the url source file."""
    self.add_url_source('https://example.com')
    self.crawler = Crawler(self.path_to_url_source, Index())
    next_url = self.crawler._next_url()
    self.assertEqual(
        Url.from_string('https://example.com').to_string(),
        next_url.to_string())
def throughput(index: Index, timeframe: int) -> int:
    """Get current throughput.

    Thin wrapper: delegates the calculation entirely to the index.

    Args:
        index: Index photos are stored in
        timeframe: timeframe in minutes

    Returns:
        number of photos stored in index during timeframe
        int
    """
    return index.calculate_throughput(timeframe)
def _update(domain: str, index: idx.Index, refresh_rate: Type[RefreshRate]):
    """Refresh the cached capture for a domain.

    Stores the most recent capture in the class-level cache and
    records when it was cached.

    Args:
        domain: domain to cache
        index: Index photos are stored in
        refresh_rate: refresh rate capture should be for
    """
    LastCapture.captures[domain] = index.photos_most_recent_capture_of_domain(
        domain, refresh_rate)
    LastCapture.cached_at[domain] = time.time()
def test_crawler_removes_urls_read_from_source(self):
    """Each _next_url() call consumes one line from the url source file."""
    self.add_url_source('https://example.com')
    self.add_url_source('https://example.com/foo')
    self.add_url_source('https://example.com/bar')
    self.crawler = Crawler(self.path_to_url_source, Index())
    # urls must come back in file order, the head line being removed
    # after each read
    for expected in [
        'https://example.com',
        'https://example.com/foo',
        'https://example.com/bar',
    ]:
        self.assertEqual(
            Url.from_string(expected).to_string(),
            self.crawler._next_url().to_string())
    self.crawler = Crawler(self.path_to_url_source, Index())
def _stats_thread(elasticsearch_host: str):
    """Stats thread.

    Prints system and saas statistics every 5th minute

    Args:
        elasticsearch_host: elasticsearch host
    """
    start = time.time()
    last_print = 1
    while Controller.SHOULD_RUN:
        time.sleep(1)
        mins = int(int(time.time() - start) / 60)
        # only report on 5-minute marks, and only once per mark
        if mins % 5 != 0 or mins <= last_print:
            continue
        index = Index(host=elasticsearch_host)
        last_print = mins
        # query each timeframe once and reuse the value for both report
        # lines (every throughput() call is an elasticsearch search)
        tp_5 = stats.throughput(index, 5)
        tp_15 = stats.throughput(index, 15)
        tp_30 = stats.throughput(index, 30)
        tp_60 = stats.throughput(index, 60)
        t = '[throughput] 5m: {}, 15m: {}, 30min: {}, 1h: {}'.format(
            tp_5,
            tp_15,
            tp_30,
            tp_60,
        )
        # averages are n/a until the process has run the full timeframe
        ta = '{} 5m: {}, 15m: {}, 30min: {}, 1h: {}'.format(
            '[throughput 1min avg]',
            round(tp_5 / 5, 2) if mins > 4 else 'n/a',
            round(tp_15 / 15, 2) if mins > 14 else 'n/a',
            round(tp_30 / 30, 2) if mins > 29 else 'n/a',
            round(tp_60 / 60, 2) if mins > 59 else 'n/a',
        )
        load = '[load avg] 1m: {}, 5m: {}, 15min: {}'.format(
            stats.load_avg(1),
            stats.load_avg(5),
            stats.load_avg(15),
        )
        cpu = f'[current cpu usage] {stats.cpu_usage(10)}%'
        mem = f'[memory usage] {stats.memory_usage(10)}%'
        for msg in [t, ta, load, cpu, mem]:
            console.p(msg)
def _crawler_thread(
    url_file: str,
    ignore_found_urls: bool,
    stay_at_domain: bool,
    elasticsearch_host: str,
    debug: bool,
    thread_id: str
):
    """Crawler thread.

    Args:
        url_file: path to url file
        ignore_found_urls: if crawler should ignore new urls
            found on pages it crawls
        stay_at_domain: if crawler should ignore urls from a
            different domain than the one it was found at
        elasticsearch_host: elasticsearch host
        debug: Display debugging information
        thread_id: id of thread
    """
    try:
        crawler = Crawler(
            url_file=url_file,
            index=Index(host=elasticsearch_host),
            ignore_found_urls=ignore_found_urls,
            stay_at_domain=stay_at_domain,
        )
        while Controller.SHOULD_RUN:
            crawler.tick()
    except UrlFileNotFoundError:
        console.p(f'ERROR: url_file was not found at \'{url_file}\'')
        time.sleep(2)
        # a missing url file is fatal for the whole service: mark this
        # thread stopped, then ask the controller to stop everything
        Controller.threads[thread_id]['running'] = False
        Controller.stop_all()
    except Exception as e:
        console.p(f'error occured in crawler thread {thread_id}: {e}')
        if debug:
            # bare raise re-raises with the original traceback intact
            # (raise e would append a new frame to it)
            raise
    finally:
        Controller.threads[thread_id]['running'] = False
def start_filesystem(
    mountpoint: str,
    datadir: DataDirectory,
    refresh_rate: Type[refresh.RefreshRate],
    elasticsearch_host: str
):
    """Start filesystem process.

    FUSE python library will kill the main process, forking main process
    and mounts the filesystem from that process instead.

    Args:
        mountpoint: where to mount filesystem
        datadir: Data directory to store pictures in
        refresh_rate: Which refresh rate filesystem should use for
            fetching photos
        elasticsearch_host: elasticsearch host

    Returns:
        True if main process, False if the forked process
        bool
    """
    console.p(f'mounting filesystem at: {real_path(mountpoint)}')
    pid = os.fork()
    if pid != 0:
        # parent process: remember the child's pid so the controller
        # can terminate the FUSE process later, and keep running
        Controller.FUSE_PID = pid
        return True
    # child process: mount the filesystem; Filesystem.mount is expected
    # to block for the lifetime of the mount — TODO confirm
    try:
        Filesystem.mount(
            mountpoint,
            Index(datadir, host=elasticsearch_host),
            refresh_rate
        )
    except RuntimeError as e:
        console.p(f'failed to mount FUSE filesystem: {e}')
    # returning False tells the caller this is the forked process,
    # which should not continue with normal startup
    return False
def _photographer_thread(
    refresh_rate: Type[refresh.RefreshRate],
    datadir: DataDirectory,
    viewport_width: int,
    viewport_height: int,
    viewport_max_height: Optional[int],
    elasticsearch_host: str,
    debug: bool,
    thread_id: str
):
    """Photographer thread.

    Args:
        refresh_rate: How often photographs should be refreshed
        datadir: Data directory to store pictures in
        viewport_width: width of camera viewport
        viewport_height: height of camera viewport
        viewport_max_height: max height of camera viewport
        elasticsearch_host: elasticsearch host
        debug: Display debugging information
        thread_id: id of thread
    """
    try:
        photographer = p.Photographer(
            Index(host=elasticsearch_host),
            refresh_rate,
            datadir,
            viewport_width,
            viewport_height,
            viewport_max_height
        )
        while Controller.SHOULD_RUN:
            photographer.tick()
    except Exception as e:
        console.p(f'error occured in photographer thread {thread_id}: {e}')
        if debug:
            # bare raise re-raises with the original traceback intact
            # (raise e would append a new frame to it)
            raise
    finally:
        Controller.threads[thread_id]['running'] = False
def setUp(self):
    """Wire a Photographer to a fresh index and a temp data directory."""
    self.index = Index()
    datadir_path = dirname(__file__) + '/datadir'
    self.datadir = DataDirectory(datadir_path)
    self.photographer = Photographer(
        self.index, refresh.Hourly, self.datadir)
def main():
    """Entry point for saas."""
    try:
        parser = arguments.get_argument_parser()
        args = parser.parse_args(sys.argv[1:])
        console.DEBUG = args.debug
        JavascriptSnippets.load()
        # elasticsearch must be reachable before anything is started
        index = Index(host=args.elasticsearch_host)
        if not index.ping():
            console.p('ERROR: failed to connect to elasticsearch')
            sys.exit()
        if not index.verify():
            # unconfigured ES is only acceptable if we're about to set
            # it up (or clear it) in this very run
            if not args.setup_elasticsearch and not args.clear_elasticsearch:
                console.p('ERROR: elasticsearch is not configured')
                console.p(' {} {}'.format(
                    'start saas with --setup-elasticsearch',
                    'to configure elasticsearch'))
                sys.exit()
        datadir = DataDirectory(args.data_dir, args.optimize_storage)
        # map the cli choice onto a RefreshRate class
        refresh_rate = {
            'day': refresh.Daily,
            'hour': refresh.Hourly,
            'minute': refresh.EveryMinute,
        }[args.refresh_rate]
        if args.setup_elasticsearch:
            index.create_indices()
        if args.clear_elasticsearch:
            index.clear()
            index.create_indices()
        if args.clear_data_dir:
            datadir.clear()
        # start_filesystem() returns False in the forked FUSE process,
        # which must not spawn worker threads of its own
        if not Controller.start_filesystem(
                mountpoint=args.mountpoint,
                datadir=datadir,
                refresh_rate=refresh_rate,
                elasticsearch_host=args.elasticsearch_host):
            sys.exit()
        Controller.start_stats(elasticsearch_host=args.elasticsearch_host)
        Controller.start_crawlers(
            amount=args.crawler_threads,
            url_file=args.url_file,
            ignore_found_urls=args.ignore_found_urls,
            stay_at_domain=args.stay_at_domain,
            elasticsearch_host=args.elasticsearch_host,
            debug=args.debug)
        Controller.start_photographers(
            amount=args.photographer_threads,
            refresh_rate=refresh_rate,
            datadir=datadir,
            viewport_width=args.viewport_width,
            viewport_height=args.viewport_height,
            viewport_max_height=args.viewport_max_height,
            elasticsearch_host=args.elasticsearch_host,
            debug=args.debug)
        # idle watchdog: stop saas when neither crawled urls nor photos
        # have been written for --stop-if-idle minutes (0 disables)
        while True:
            if args.stop_if_idle == 0:
                time.sleep(10)
                continue
            try:
                crawled = index.timestamp_of_most_recent_document(
                    index.CRAWLED)
                photos = index.timestamp_of_most_recent_document(index.PHOTOS)
                # idle time is measured from the most recent of the two
                timestamp = photos
                if crawled > timestamp:
                    timestamp = crawled
                seconds = int(time.time()) - timestamp
                mins = int(seconds / 60)
                if mins >= args.stop_if_idle:
                    console.p(f'was idle for {mins} minutes', end='')
                    raise StopIfIdleTimeoutExpired
            except EmptySearchResultException:
                # no documents at all yet -> treat as not idle
                pass
            finally:
                time.sleep(2)
    except (KeyboardInterrupt, StopIfIdleTimeoutExpired):
        console.p(' terminating.')
        Controller.stop_all()
        console.p('')
class TestFilesystem(unittest.TestCase):
    """Test filesystem class."""

    def setUp(self):
        """Set up test.

        Filesystem under test is backed by a real data directory on disk
        and an Index whose elasticsearch client is a MagicMock.
        """
        self.console = console
        self.refresh_rate = refresh.Hourly
        self.datadir = DataDirectory(dirname(__file__) + '/datadir')
        self.index = Index(self.datadir, MagicMock())
        self.filesystem = Filesystem(self.index, self.refresh_rate)

    def tearDown(self):
        """Tear down test."""
        self.datadir.remove_data_dir()

    def assertListOfFilesEqual(self, expected: list, actual: list):
        """Assert list of files equal.

        Compares length, then filename and concrete type (File vs
        Directory) entry by entry.

        Args:
            expected: Expected list of files
            actual: Actual list of files
        """
        msg = 'Failed asserting list of files where equal expected'
        self.assertEqual(len(expected), len(actual), msg=msg)
        for i, file in enumerate(expected):
            self.assertEqual(file.filename, actual[i].filename, msg=msg)
            self.assertIsInstance(cls=file.__class__, obj=actual[i], msg=msg)

    def test_filesystem_can_list_contents_of_root_directory(self):
        """Test filesystem can list root directory.

        Root lists '.', '..' and one directory per unique domain.
        """
        self.index.photos_unique_domains = MagicMock(
            return_value=['example.com', 'example.net'])
        files = self.filesystem._list('/')
        self.assertListOfFilesEqual([
            Directory('.'),
            Directory('..'),
            Directory('example.com'),
            Directory('example.net'),
        ], files)
        self.index.photos_unique_domains.assert_called_with(self.refresh_rate)

    def test_filesystem_can_list_contents_of_domain(self):
        """Test filesystem can list contents of domain.

        A domain lists its captures plus the virtual 'latest capture'
        entry; trailing slash in the path must not matter.
        """
        self.index.photos_unique_captures_of_domain = MagicMock(return_value=[
            '2019-01-13H20:00',
            '2019-01-13H21:00',
            '2019-01-13H22:00',
        ])
        expected = [
            Directory('.'),
            Directory('..'),
            Directory('2019-01-13H20:00'),
            Directory('2019-01-13H21:00'),
            Directory('2019-01-13H22:00'),
            Directory(LastCapture.FILENAME),
        ]
        files = self.filesystem._list('/example.com')
        self.assertListOfFilesEqual(expected, files)
        files = self.filesystem._list('/example.com/')
        self.assertListOfFilesEqual(expected, files)
        self.index.photos_unique_captures_of_domain.assert_called_with(
            'example.com', self.refresh_rate)

    def test_filesystem_can_list_contents_of_capture_at_given_path(self):
        """Test filesystem can list contents of capture at given path.

        Listing inside a capture merges files and sub-directories
        reported by the index; trailing slash must not matter.
        """
        self.index.photos_list_files_in_directory = MagicMock(return_value=[
            'index.png',
            'contact.png',
            'about.png',
        ])
        self.index.photos_list_directories_in_directory = MagicMock(
            return_value=[
                'sub_dir_1',
                'sub_dir_2',
            ])
        expected = [
            Directory('.'),
            Directory('..'),
            File('index.png'),
            File('contact.png'),
            File('about.png'),
            Directory('sub_dir_1'),
            Directory('sub_dir_2'),
        ]
        files = self.filesystem._list('/example.com/2019-01-13H20:00/')
        self.assertListOfFilesEqual(expected, files)
        files = self.filesystem._list('/example.com/2019-01-13H20:00')
        self.assertListOfFilesEqual(expected, files)
        files = self.filesystem._list('/example.com/2019-01-13H20:00/foo/bar/')
        self.assertListOfFilesEqual(expected, files)
        # both index methods should have been queried with the same
        # (domain, capture, directory, refresh_rate) triples
        calls = [
            call('example.com', '2019-01-13H20:00', '/', self.refresh_rate),
            call('example.com', '2019-01-13H20:00', '/', self.refresh_rate),
            call('example.com', '2019-01-13H20:00', '/foo/bar/',
                 self.refresh_rate),
        ]
        self.index.photos_list_files_in_directory.assert_has_calls(calls)
        self.index.photos_list_directories_in_directory.assert_has_calls(calls)

    def test_filesystem_can_get_attributes_of_directory(self):
        """Test filesystem can get attributes of directory.

        All directory levels share the same stat dict shape.
        """
        # freeze time.time so the expected stat times match exactly
        time.time = MagicMock(return_value=time.time())
        self.index.photos_directory_exists = MagicMock(return_value=True)
        self.index.photos_unique_domains = MagicMock(
            return_value=['example.com'])
        self.index.photos_unique_captures_of_domain = MagicMock(
            return_value=['2019-01-13H20:00'])
        expected = {
            'st_atime': time.time(),
            'st_ctime': time.time(),
            'st_gid': os.getgid(),
            'st_mode': Directory('').ST_MODE,
            'st_mtime': time.time(),
            'st_size': 0,
            'st_uid': os.getuid(),
        }
        attr = self.filesystem._attributes('/')
        self.assertEqual(expected, attr)
        attr = self.filesystem._attributes('/example.com/')
        self.assertEqual(expected, attr)
        attr = self.filesystem._attributes('/example.com/2019-01-13H20:00')
        self.assertEqual(expected, attr)
        attr = self.filesystem._attributes('/example.com/2019-01-13H20:00/')
        self.assertEqual(expected, attr)
        attr = self.filesystem._attributes(
            '/example.com/2019-01-13H20:00/foo/bar')
        self.assertEqual(expected, attr)
        # deep paths are resolved via photos_directory_exists with a
        # normalized (trailing-slash) directory
        self.index.photos_directory_exists.assert_called_with(
            domain='example.com',
            captured_at='2019-01-13H20:00',
            directory='/foo/bar/',
            refresh_rate=self.refresh_rate)

    def test_filesystem_can_get_attributes_of_file(self):
        """Test filesystem can get attributes of file.

        When the path is not a directory, photos_file_exists supplies
        the filesize used for st_size.
        """
        # freeze time.time so the expected stat times match exactly
        time.time = MagicMock(return_value=time.time())
        self.index.photos_directory_exists = MagicMock(return_value=False)
        self.index.photos_file_exists = MagicMock(
            return_value=123000  # returns filesize
        )
        expected = {
            'st_atime': time.time(),
            'st_ctime': time.time(),
            'st_gid': os.getgid(),
            'st_mode': File('').ST_MODE,
            'st_mtime': time.time(),
            'st_size': 123000,
            'st_uid': os.getuid(),
        }
        attr = self.filesystem._attributes(
            '/example.com/2019-01-13H20:00/index.png')
        self.assertEqual(expected, attr)
        self.index.photos_file_exists.assert_called_with(
            domain='example.com',
            captured_at='2019-01-13H20:00',
            full_filename='/index.png',
            refresh_rate=self.refresh_rate)

    def test_filesystem_can_translate_path_to_file_in_datadir(self):
        """Test filesystem can translate path to file in datadir.

        A mounted path must resolve to the photo's real location inside
        the data directory.
        """
        datadir_path = PhotoPath(self.datadir)
        url = Url.from_string('https://example.com/foo/bar')
        photo = Screenshot(url, datadir_path, self.refresh_rate)
        self.index.es.index = MagicMock()
        photo.path.filesize = MagicMock(return_value=10000)
        self.index.save_photo(photo)
        self.index.photos_file_exists = MagicMock(return_value=123000)
        self.index.photos_get_photo = MagicMock(return_value=photo)
        path = self.filesystem._translate_path(
            '/example.com/2019-01-13H20:00/foo/bar.png')
        self.assertEqual(datadir_path.full_path(), path)
def setUp(self):
    """Create an Index over a temp data directory with a mocked ES client."""
    datadir_path = dirname(__file__) + '/datadir'
    self.datadir = DataDirectory(datadir_path)
    self.index = Index(self.datadir, MagicMock())
class TestIndex(unittest.TestCase):
    """Test index class."""

    def setUp(self):
        """Set up test.

        Index under test stores to a real data directory and talks to
        a MagicMock standing in for the elasticsearch client.
        """
        self.datadir = DataDirectory(dirname(__file__) + '/datadir')
        self.index = Index(self.datadir, MagicMock())

    def tearDown(self):
        """Tear down test."""
        self.datadir.remove_data_dir()

    def search_returns_doc(self, doc: dict):
        """Search to elastic search returns doc.

        Mock search method of self.index.es to return given doc

        Args:
            doc: document or partial document to return
        """
        self.index.es.search = MagicMock(
            return_value={'hits': {
                'total': 1,
                'hits': [doc]
            }})

    def search_returns_aggregation(self, index: str, buckets: list):
        """Search to elastic search returns aggregation.

        Args:
            index: index being searched
            buckets: buckets that's returned
        """
        self.index.es.search = MagicMock(
            return_value={'aggregations': {
                index: {
                    'buckets': buckets
                }
            }})

    def test_recently_crawled_url_can_be_fetched(self):
        """Test recently crawled url can be fetched."""
        self.search_returns_doc({
            '_id': 'xxx...',
            '_source': {
                'url': 'http://example.com',
                'timestamp': 1547229873.257901
            }
        })
        url = self.index.recently_crawled_url(refresh.Hourly)
        self.assertIsInstance(cls=Url, obj=url)
        self.assertEqual('http://example.com', url.to_string())
        # the query must only consider successfully crawled (200) urls
        # that are not locked for the current refresh-rate window
        self.index.es.search.assert_called_with(
            index='crawled',
            size=5,
            body={
                'query': {
                    'bool': {
                        'must': {
                            'term': {
                                'status_code': 200,
                            }
                        },
                        'must_not': [{
                            'term': {
                                'lock_value': refresh.Hourly().lock(),
                            }
                        }]
                    }
                },
                'sort': [{
                    'timestamp': {
                        'order': 'desc'
                    }
                }]
            })

    def test_lock_can_be_placed_on_crawled_url(self):
        """Test lock can be placed on crawled url."""
        url = Url.from_string('http://example.com')
        self.index.es.update = MagicMock()
        self.index.lock_crawled_url(url, refresh.Hourly)
        self.index.es.update.assert_called_with(
            index='crawled',
            doc_type='url',
            id=url.hash(),
            retry_on_conflict=3,
            body={
                'doc': {
                    'lock_format': refresh.Hourly.lock_format(),
                    'lock_value': refresh.Hourly().lock(),
                }
            })

    def test_index_can_store_photo(self):
        """Test index can store a photo."""
        self.index.es.index = MagicMock()
        # freeze time.time so the asserted timestamp matches exactly
        time.time = MagicMock(return_value=time.time())
        url = Url.from_string('http://example.com')
        path = PhotoPath(self.datadir)
        path.filesize = MagicMock(return_value=10000)
        photo = LoadingPhoto(url=url, path=path, refresh_rate=refresh.Hourly)
        self.index.save_photo(photo)
        self.index.es.index.assert_called_with(
            index='photos',
            doc_type='photo',
            id=path.uuid,
            body={
                'url_id': url.hash(),
                'refresh_rate': refresh.Hourly.lock_format(),
                'captured_at': refresh.Hourly().lock(),
                'filesize': photo.filesize(),
                'filename': photo.filename(),
                'directory': photo.directory(),
                'domain': photo.domain(),
                'timestamp': int(time.time())
            })

    def test_index_can_list_unique_photo_domains(self):
        """Test index can list unique photos."""
        self.search_returns_aggregation('photos', [{
            'key': 'example.com',
        }, {
            'key': 'example.net',
        }])
        domains = self.index.photos_unique_domains(refresh.Hourly)
        self.assertEqual(['example.com', 'example.net'], domains)
        # domains come from a terms aggregation filtered on refresh rate
        self.index.es.search.assert_called_with(
            index='photos',
            size=0,
            body={
                'query': {
                    'bool': {
                        'must': {
                            'term': {
                                'refresh_rate': refresh.Hourly.lock_format(),
                            }
                        },
                    }
                },
                'aggs': {
                    'photos': {
                        'terms': {
                            'field': 'domain',
                            'size': 10000
                        }
                    }
                }
            })

    def test_index_can_list_unique_captures_of_domains(self):
        """Test index can list unique captures of domain."""
        self.search_returns_aggregation('photos', [{
            'key': '2019-01-13H20:00',
        }, {
            'key': '2019-01-13H21:00',
        }])
        domains = self.index.photos_unique_captures_of_domain(
            'example.com', refresh.Hourly)
        format = refresh.Hourly.lock_format()
        self.assertEqual(['2019-01-13H20:00', '2019-01-13H21:00'], domains)
        self.index.es.search.assert_called_with(
            index='photos',
            size=0,
            body={
                'query': {
                    'bool': {
                        'must': [{
                            'term': {
                                'domain': 'example.com',
                            }
                        }, {
                            'term': {
                                'refresh_rate': format,
                            }
                        }],
                    }
                },
                'aggs': {
                    'photos': {
                        'terms': {
                            'field': 'captured_at',
                            'size': 10000
                        }
                    }
                }
            })

    def test_photo_can_be_retrieved(self):
        """Test photo can be retrieved."""
        format = refresh.Hourly.lock_format()
        capture = refresh.Hourly().lock()
        self.search_returns_doc({
            '_id': 'uuid-xxx...',
            '_source': {
                'url_id': 'xxx...',
                'refresh_rate': format,
                'captured_at': capture,
                'filename': 'some-filename.png',
                'directory': '/some/path/',
                'domain': 'example.com',
                'filesize': 12300,
                'timestamp': time.time(),
            }
        })
        photo = self.index.photos_get_photo(
            domain='example.com',
            captured_at=capture,
            full_filename='/some/path/some-filename.png',
            refresh_rate=refresh.Hourly)
        self.assertIsInstance(cls=Photo, obj=photo)
        self.assertEqual('uuid-xxx...', photo.path.uuid)
        # full_filename should have been split into directory + filename
        # terms in the query
        self.index.es.search.assert_called_with(
            index='photos',
            size=1,
            body={
                'query': {
                    'bool': {
                        'must': [{
                            'term': {
                                'domain': 'example.com',
                            }
                        }, {
                            'term': {
                                'refresh_rate': format,
                            }
                        }, {
                            'term': {
                                'captured_at': capture,
                            }
                        }, {
                            'term': {
                                'directory': '/some/path/',
                            }
                        }, {
                            'term': {
                                'filename': 'some-filename.png',
                            }
                        }],
                    }
                }
            })

    def test_directories_within_a_directory_can_be_fetched(self):
        """Test directories within a directory can be fetched.

        Only the first path segment below the queried directory should
        be returned, deduplicated.
        """
        format = refresh.Hourly.lock_format()
        capture = refresh.Hourly().lock()
        self.search_returns_aggregation(
            'photos', [{
                'key': '/path/to/some/dir/',
            }, {
                'key': '/path/to/some/other/dir/',
            }, {
                'key': '/path/to/not/same/other/dir/',
            }, {
                'key': '/path/to/a/dir/',
            }])
        directories = self.index.photos_list_directories_in_directory(
            domain='example.com',
            captured_at=capture,
            directory='/path/to/',
            refresh_rate=refresh.Hourly)
        self.assertEqual(['some', 'not', 'a'], directories)
        # directories are matched with a wildcard on the directory field
        self.index.es.search.assert_called_with(
            index='photos',
            size=0,
            body={
                'query': {
                    'bool': {
                        'must': [{
                            'term': {
                                'domain': 'example.com',
                            }
                        }, {
                            'term': {
                                'refresh_rate': format,
                            }
                        }, {
                            'term': {
                                'captured_at': capture,
                            }
                        }, {
                            'wildcard': {
                                'directory': '/path/to/*',
                            }
                        }],
                    }
                },
                'aggs': {
                    'photos': {
                        'terms': {
                            'field': 'directory',
                            'size': 10000
                        }
                    }
                }
            })