def tick(self):
    """Tick.

    Check out a url from the index, take a photo of it, save the
    photo to the data directory and update the index with the
    photo metadata.
    """
    try:
        timer = time.time()
        url = self._checkout_url()
        console.dp(f'taking photo of {url.to_string()}')

        # Save a placeholder photo first so the url shows up in the
        # index while the real screenshot is being taken.
        path = PhotoPath(self.datadir)
        photo = LoadingPhoto(
            url=url,
            path=path,
            refresh_rate=self.refresh_rate
        )
        photo.save_loading_text()
        self.index.save_photo(photo)

        camera = c.Camera(
            viewport_width=self.viewport_width,
            viewport_height=self.viewport_height,
            viewport_max_height=self.viewport_max_height,
            addons={
                'IDCAC': Addons.IDCAC,
                'REFERER_HEADER': Addons.REFERER_HEADER,
                'UBLOCK_ORIGIN': Addons.UBLOCK_ORIGIN,
            }
        )

        # Replace the placeholder with the real screenshot.
        photo = camera.take_picture(url, path, self.refresh_rate)
        self.index.save_photo(photo)

        timer = int(time.time() - timer)
        console.p(
            f'photo was taken of {url.to_string()} took: {timer}s'
        )
    except EmptySearchResultException:
        # No url was available to check out, try again next tick.
        pass
    finally:
        time.sleep(1)
def start_photographers(
    amount: int,
    refresh_rate: Type[refresh.RefreshRate],
    datadir: DataDirectory,
    viewport_width: int,
    viewport_height: int,
    viewport_max_height: Optional[int],
    elasticsearch_host: str,
    debug: bool
):
    """Start photographer threads.

    Args:
        amount: number of photographer threads to start
        refresh_rate: how often photographs should be refreshed;
            more exactly, defines which lock is placed on crawled urls
        datadir: data directory to store pictures in
        viewport_width: width of camera viewport
        viewport_height: height of camera viewport
        viewport_max_height: max height of camera viewport
        elasticsearch_host: elasticsearch host
        debug: display debugging information
    """
    console.p(f'starting {amount} photographer threads')
    Controller.PHOTOGRAPHER_PROCESSES = amount
    while amount > 0:
        thread_id = str(uuid.uuid4())
        thread = Thread(target=_photographer_thread, args=(
            refresh_rate,
            datadir,
            viewport_width,
            viewport_height,
            viewport_max_height,
            elasticsearch_host,
            debug,
            thread_id
        ))
        thread.start()
        Controller.threads[thread_id] = {
            'running': True
        }
        amount -= 1
def _stats_thread(elasticsearch_host: str):
    """Stats thread.

    Prints system and saas statistics every five minutes.

    Args:
        elasticsearch_host: elasticsearch host
    """
    start = time.time()
    last_print = 1
    while Controller.SHOULD_RUN:
        time.sleep(1)
        mins = int(int(time.time() - start) / 60)
        # Only print once every five minutes.
        if mins % 5 != 0 or mins <= last_print:
            continue
        index = Index(host=elasticsearch_host)
        last_print = mins
        t = '[throughput] 5m: {}, 15m: {}, 30m: {}, 1h: {}'.format(
            stats.throughput(index, 5),
            stats.throughput(index, 15),
            stats.throughput(index, 30),
            stats.throughput(index, 60),
        )
        # Per-minute averages are 'n/a' until the full window has elapsed.
        ta = '{} 5m: {}, 15m: {}, 30m: {}, 1h: {}'.format(
            '[throughput 1min avg]',
            round(stats.throughput(index, 5) / 5, 2) if mins > 4 else 'n/a',
            round(stats.throughput(index, 15) / 15, 2) if mins > 14 else 'n/a',
            round(stats.throughput(index, 30) / 30, 2) if mins > 29 else 'n/a',
            round(stats.throughput(index, 60) / 60, 2) if mins > 59 else 'n/a',
        )
        load = '[load avg] 1m: {}, 5m: {}, 15m: {}'.format(
            stats.load_avg(1),
            stats.load_avg(5),
            stats.load_avg(15),
        )
        cpu = f'[current cpu usage] {stats.cpu_usage(10)}%'
        mem = f'[memory usage] {stats.memory_usage(10)}%'
        for msg in [t, ta, load, cpu, mem]:
            console.p(msg)
def _crawler_thread(
    url_file: str,
    ignore_found_urls: bool,
    stay_at_domain: bool,
    elasticsearch_host: str,
    debug: bool,
    thread_id: str
):
    """Crawler thread.

    Args:
        url_file: path to url file
        ignore_found_urls: if crawler should ignore new urls found
            on pages it crawls
        stay_at_domain: if crawler should ignore urls from a different
            domain than the one they were found at
        elasticsearch_host: elasticsearch host
        debug: display debugging information
        thread_id: id of thread
    """
    try:
        crawler = Crawler(
            url_file=url_file,
            index=Index(host=elasticsearch_host),
            ignore_found_urls=ignore_found_urls,
            stay_at_domain=stay_at_domain,
        )
        while Controller.SHOULD_RUN:
            crawler.tick()
    except UrlFileNotFoundError:
        console.p(f'ERROR: url_file was not found at \'{url_file}\'')
        time.sleep(2)
        Controller.threads[thread_id]['running'] = False
        Controller.stop_all()
    except Exception as e:
        console.p(f'error occurred in crawler thread {thread_id}: {e}')
        if debug:
            raise e
    finally:
        Controller.threads[thread_id]['running'] = False
def start_filesystem(
    mountpoint: str,
    datadir: DataDirectory,
    refresh_rate: Type[refresh.RefreshRate],
    elasticsearch_host: str
):
    """Start filesystem process.

    The FUSE python library would otherwise kill the main process,
    so the main process is forked and the filesystem is mounted
    from the forked process instead.

    Args:
        mountpoint: where to mount filesystem
        datadir: data directory to store pictures in
        refresh_rate: which refresh rate filesystem should use
            for fetching photos
        elasticsearch_host: elasticsearch host

    Returns:
        True if main process, False if the forked process
        bool
    """
    console.p(f'mounting filesystem at: {real_path(mountpoint)}')
    pid = os.fork()
    if pid != 0:
        # Parent process: remember the child's pid so it can be
        # terminated on shutdown, then continue with startup.
        Controller.FUSE_PID = pid
        return True
    try:
        # Forked process: blocks here for as long as the
        # filesystem stays mounted.
        Filesystem.mount(
            mountpoint,
            Index(datadir, host=elasticsearch_host),
            refresh_rate
        )
    except RuntimeError as e:
        console.p(f'failed to mount FUSE filesystem: {e}')
    return False
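# Usage sketch: because of the fork, a caller must stop the forked
# process from continuing with the rest of the startup sequence when
# start_filesystem returns False. The argument values below are
# illustrative only; see main() for the real call:
#
#   if not Controller.start_filesystem(
#           mountpoint='~/saas-fs',
#           datadir=datadir,
#           refresh_rate=refresh.Hourly,
#           elasticsearch_host='localhost:9200'):
#       sys.exit()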
def _photographer_thread(
    refresh_rate: Type[refresh.RefreshRate],
    datadir: DataDirectory,
    viewport_width: int,
    viewport_height: int,
    viewport_max_height: Optional[int],
    elasticsearch_host: str,
    debug: bool,
    thread_id: str
):
    """Photographer thread.

    Args:
        refresh_rate: How often photographs should be refreshed
        datadir: Data directory to store pictures in
        viewport_width: width of camera viewport
        viewport_height: height of camera viewport
        viewport_max_height: max height of camera viewport
        elasticsearch_host: elasticsearch host
        debug: Display debugging information
        thread_id: id of thread
    """
    try:
        photographer = p.Photographer(
            Index(host=elasticsearch_host),
            refresh_rate,
            datadir,
            viewport_width,
            viewport_height,
            viewport_max_height
        )
        while Controller.SHOULD_RUN:
            photographer.tick()
    except Exception as e:
        console.p(f'error occurred in photographer thread {thread_id}: {e}')
        if debug:
            raise e
    finally:
        Controller.threads[thread_id]['running'] = False
def stop_all():
    """Stop all threads."""
    try:
        Controller.SHOULD_RUN = False
        i = 0
        # Wait until every thread has marked itself as stopped.
        while Controller._any_thread_is_running():
            if i % 10 == 0:
                console.p('waiting for saas to stop')
            i += 1
            time.sleep(0.5)
        console.p('cleaning up')
        try:
            for pid in Controller.webdrivers:
                os.kill(pid, signal.SIGTERM)
            if Controller.FUSE_PID:
                os.kill(Controller.FUSE_PID, signal.SIGTERM)
        except ProcessLookupError:
            pass
    except KeyboardInterrupt:
        Controller.stop_all()
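# A minimal sketch of what _any_thread_is_running presumably does,
# assuming it scans the Controller.threads registry that each worker
# thread updates on exit; the actual implementation may differ:
#
#   @staticmethod
#   def _any_thread_is_running() -> bool:
#       return any(t['running'] for t in Controller.threads.values())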
def start_crawlers(
    amount: int,
    url_file: str,
    ignore_found_urls: bool,
    stay_at_domain: bool,
    elasticsearch_host: str,
    debug: bool
):
    """Start crawler threads.

    Args:
        amount: number of crawler threads to start
        url_file: path to urls file
        ignore_found_urls: if crawler should ignore new urls found
            on pages it crawls
        stay_at_domain: if crawler should ignore urls from a different
            domain than the one they were found at
        elasticsearch_host: elasticsearch host
        debug: display debugging information
    """
    console.p(f'starting {amount} crawler threads')
    while amount > 0:
        thread_id = str(uuid.uuid4())
        thread = Thread(target=_crawler_thread, args=(
            url_file,
            ignore_found_urls,
            stay_at_domain,
            elasticsearch_host,
            debug,
            thread_id
        ))
        thread.start()
        Controller.threads[thread_id] = {
            'running': True
        }
        amount -= 1
def create_indices(self):
    """Create indices in elasticsearch."""
    console.p('creating indices')
    try:
        self.es.indices.create(
            Index.UNCRAWLED,
            body={'mappings': Mappings.uncrawled}
        )
        self.es.indices.create(
            Index.CRAWLED,
            body={'mappings': Mappings.crawled}
        )
        self.es.indices.create(Index.PHOTOS, body={
            'mappings': Mappings.photos,
            'settings': Settings.photos,
        })
        console.p('done.')
    except RequestError:
        console.p('indices already exist, skipping.')
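# Usage sketch: the RequestError handler makes create_indices safe to
# call on every startup, since existing indices are simply skipped.
# The host value below is hypothetical; see main() for the real call:
#
#   index = Index(host='localhost:9200')
#   index.create_indices()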
def clear(self):
    """Delete all indices and their documents."""
    console.p('clearing all indices')
    self.es.indices.delete(index='_all', request_timeout=1000000)
    console.p('indices cleared')
def main():
    """Entry point for saas."""
    try:
        parser = arguments.get_argument_parser()
        args = parser.parse_args(sys.argv[1:])
        console.DEBUG = args.debug
        JavascriptSnippets.load()

        index = Index(host=args.elasticsearch_host)
        if not index.ping():
            console.p('ERROR: failed to connect to elasticsearch')
            sys.exit()

        if not index.verify():
            if not args.setup_elasticsearch and not args.clear_elasticsearch:
                console.p('ERROR: elasticsearch is not configured')
                console.p('    {} {}'.format(
                    'start saas with --setup-elasticsearch',
                    'to configure elasticsearch'))
                sys.exit()

        datadir = DataDirectory(args.data_dir, args.optimize_storage)
        refresh_rate = {
            'day': refresh.Daily,
            'hour': refresh.Hourly,
            'minute': refresh.EveryMinute,
        }[args.refresh_rate]

        if args.setup_elasticsearch:
            index.create_indices()

        if args.clear_elasticsearch:
            index.clear()
            index.create_indices()

        if args.clear_data_dir:
            datadir.clear()

        if not Controller.start_filesystem(
                mountpoint=args.mountpoint,
                datadir=datadir,
                refresh_rate=refresh_rate,
                elasticsearch_host=args.elasticsearch_host):
            # The forked FUSE process ends up here and must not
            # continue with the rest of the startup sequence.
            sys.exit()

        Controller.start_stats(elasticsearch_host=args.elasticsearch_host)

        Controller.start_crawlers(
            amount=args.crawler_threads,
            url_file=args.url_file,
            ignore_found_urls=args.ignore_found_urls,
            stay_at_domain=args.stay_at_domain,
            elasticsearch_host=args.elasticsearch_host,
            debug=args.debug)

        Controller.start_photographers(
            amount=args.photographer_threads,
            refresh_rate=refresh_rate,
            datadir=datadir,
            viewport_width=args.viewport_width,
            viewport_height=args.viewport_height,
            viewport_max_height=args.viewport_max_height,
            elasticsearch_host=args.elasticsearch_host,
            debug=args.debug)

        while True:
            if args.stop_if_idle == 0:
                time.sleep(10)
                continue
            try:
                # Idle time is measured from the most recent crawled
                # document or photo, whichever is newer.
                crawled = index.timestamp_of_most_recent_document(
                    index.CRAWLED)
                photos = index.timestamp_of_most_recent_document(
                    index.PHOTOS)
                timestamp = max(crawled, photos)
                seconds = int(time.time()) - timestamp
                mins = int(seconds / 60)
                if mins >= args.stop_if_idle:
                    console.p(f'was idle for {mins} minutes', end='')
                    raise StopIfIdleTimeoutExpired
            except EmptySearchResultException:
                pass
            finally:
                time.sleep(2)
    except (KeyboardInterrupt, StopIfIdleTimeoutExpired):
        console.p(' terminating.')
        Controller.stop_all()
        console.p('')
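# Entry-point guard: an assumption, since the original invocation
# mechanism (e.g. a console_scripts entry point) is not shown here.
if __name__ == '__main__':
    main()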
def clear(self):
    """Clear data directory."""
    console.p(f'clearing data directory at: {self.root}')
    # Remove the directory tree, then recreate an empty root.
    shutil.rmtree(self.root)
    create_dir(self.root)