def info_imagecrawler_desc(imagecrawler: str) -> int:
    """Print description, long description and config docs of a named ImageCrawler.

    :param imagecrawler: registered name of the ImageCrawler to describe.
    :return: exit code — 0 on success, 1 if the name is unknown.
    """
    from nichtparasoup._internals import _log
    from nichtparasoup.core.server import type_module_name_str
    from nichtparasoup.imagecrawler import get_imagecrawlers
    imagecrawler_class = get_imagecrawlers().get_class(imagecrawler)
    if not imagecrawler_class:
        _message_exception(ValueError('unknown ImageCrawler {!r}'.format(imagecrawler)))
        return 1
    linebreak = '\r\n'
    imagecrawler_info = imagecrawler_class.info()
    sections = [imagecrawler_info.description]
    if imagecrawler_info.long_description:
        sections.append(imagecrawler_info.long_description)
    if imagecrawler_info.config:
        bullet = linebreak + ' * '
        # pad every key to the longest key so the descriptions line up
        width = max(map(len, imagecrawler_info.config))
        config_items = bullet.join(
            '{key:{mlen}}: {desc}'.format(mlen=width, key=key, desc=desc)
            for key, desc in imagecrawler_info.config.items())
        sections.append('Config: ' + bullet + config_items)
    _message((linebreak * 2).join(sections))
    _log('debug', linebreak.join([
        linebreak,
        'DEBUG INFO',
        'Icon : {!r}'.format(imagecrawler_info.icon_url),
        'Class: {!r}'.format(type_module_name_str(imagecrawler_class)),
    ]))
    return 0
def _log_refill_crawler(crawler: Crawler, refilled: int) -> None:
    """Log how many images a crawler's refill produced; stays silent for zero.

    Signature must stay compatible with ``nichtparasoup.core._OnFill``.
    """
    if refilled <= 0:
        return
    _log('info', "refilled by {} via {!r}".format(refilled, crawler.imagecrawler))
def __init__(self, entries: Iterable[EntryPoint]) -> None:  # pragma: no cover
    """Seed the registry with the built-in imagecrawlers, then register entry points.

    An entry point that fails to load is logged and skipped — never fatal.
    """
    self._list = list(self._builtins().items())  # type: List[_Imagecrawler]
    for entry_point in entries:
        try:
            self._add(entry_point)
            _log('debug', 'Entry point added: {} from {!r}'.format(entry_point, entry_point.dist))
        except Exception as error:
            # deliberate best-effort: a broken third-party plugin must not break startup
            _log('debug',
                 'Entry point skipped: {} from {!r}\r\n\t{}'.format(entry_point, entry_point.dist, error),
                 exc_info=True)
def stop(self) -> None:
    """Signal the running thread to stop.

    :raises RuntimeError: if the thread is not alive.
    """
    # ``with`` guarantees the lock is released even when RuntimeError is raised,
    # replacing the former manual acquire()/try/finally/release() — and matches
    # the ``with self._locks.run`` style used elsewhere in this file.
    with self._run_lock:
        if not self.is_alive():
            raise RuntimeError('not running')
        _log('info', " * stopping {}".format(type(self).__name__))
        self._stop_event.set()
def stop(self) -> None:
    """Stop the server: shut down the periodic refiller and clear the running flag.

    :raises RuntimeError: if the server is not running.
    """
    with self._locks.run:
        if not self.__running:
            raise RuntimeError('not running')
        _log('info', "\r\n * stopping {}".format(type(self).__name__))
        refiller = self._refiller
        if refiller:
            refiller.stop()
            self._refiller = None
        self.__running = False
def start(self) -> None:
    """Start the thread.

    :raises RuntimeError: if it is already running.
    """
    # ``with`` guarantees the lock is released even when RuntimeError is raised,
    # replacing the former manual acquire()/try/finally/release() — and matches
    # the ``with self._locks.run`` style used elsewhere in this file.
    with self._run_lock:
        if self.is_alive():
            raise RuntimeError('already running')
        _log('info', " * starting {}".format(type(self).__name__))
        self._stop_event.clear()
        super().start()
def start(self) -> None:
    """Start the server: fill all crawlers, launch the periodic refiller, mark running.

    :raises RuntimeError: if the server is already running.
    """
    with self._locks.run:
        if self.__running:
            raise RuntimeError('already running')
        cls_name = type(self).__name__
        _log('info', " * starting {}".format(cls_name))
        _log('info', ' * fill all crawlers up to {}'.format(self.keep))
        self.refill()  # prime every crawler once before going live
        if not self._refiller:
            refiller = ServerRefiller(self, 1)
            refiller.start()  # periodic refill runs in its own thread
            self._refiller = refiller
        self._stats.time_started = int(time())
        self.__running = True
def run(self) -> None:
    """Periodically refill the server until stopped or the server is garbage-collected."""
    while not self._stop_event.is_set():
        server = self._server_wr()  # type: Optional[Server]
        if server:
            server.refill()
        else:
            # the weakly referenced server vanished -> nothing left to refill
            _log('info', " * server gone. stopping {}".format(type(self).__name__))
            self._stop_event.set()
        if not self._stop_event.is_set():
            # Event.wait() instead of time.sleep(): a concurrent stop() request
            # interrupts the pause immediately instead of after up to
            # self._sleep seconds.
            self._stop_event.wait(self._sleep)
def get_stream(self, uri: str) -> Tuple[Union[HTTPResponse, addinfourl], str]:
    """Open a remote *uri* and return the response stream plus the final URL.

    :param uri: remote URI; must satisfy ``self._valid_uri``.
    :return: tuple of (open response, actual URI after following redirects).
    :raises ValueError: if *uri* is not a remote URI.
    """
    if not self._valid_uri(uri):
        raise ValueError('not remote: ' + uri)
    _log(
        'debug',
        'fetch remote {!r} in {}s with {!r}'.format(
            uri, self._timeout, self._headers))
    request = Request(uri, headers=self._headers)
    try:
        response = urlopen(
            request,
            timeout=self._timeout)  # type: Union[HTTPResponse, addinfourl]
    except Exception:
        _log('debug', 'caught error on fetch remote {!r}'.format(uri), exc_info=True)
        # bare ``raise`` re-raises with the original traceback intact,
        # without adding a redundant re-raise frame like ``raise e`` does
        raise
    actual_uri = response.geturl()  # after following redirects ...
    return response, actual_uri
def crawl(self) -> ImageCollection:  # pragma: no cover
    """Run one crawl cycle under the crawl lock; never raises.

    A pending reset is applied first. On any error an empty collection
    is returned instead of propagating the exception.
    """
    with self._crawl_lock:
        try:
            if self._reset_before_next_crawl:
                _log('debug', 'crawler resetting {!r}'.format(self))
                self._reset()
                self._reset_before_next_crawl = False
            _log('debug', 'crawling started {!r}'.format(self))
            images = self._crawl()
            _log('debug', 'crawling finished {!r}'.format(self))
        except Exception:
            _log('exception', 'caught an error during crawling {!r}'.format(self))
            return ImageCollection()
        return images
def run(self) -> None:
    """Start the image server and serve the WSGI application until it terminates.

    The image server is always stopped on the way out, success or failure.
    """
    from werkzeug.serving import run_simple
    from nichtparasoup._internals import _log
    self.imageserver.start()
    try:
        _log('info', ' * starting {0} bound to {1.hostname} on port {1.port}'.format(type(self).__name__, self))
        run_simple(
            self.hostname, self.port,
            application=self,
            static_files={'/': self._STATIC_FILES},
            processes=1, threaded=True,
            use_reloader=False,
            use_debugger=False)
        _log('info', ' * stopped {0} bound to {1.hostname} on port {1.port}'.format(type(self).__name__, self))
    except Exception:
        _log('exception', ' * Error occurred. stopping everything')
        # bare ``raise`` keeps the original traceback without the extra
        # re-raise frame that ``raise e`` would add
        raise
    finally:
        self.imageserver.stop()
def reset(self) -> None:
    """Schedule a reset; it is performed at the start of the next crawl."""
    self._reset_before_next_crawl = True
    _log('debug', 'crawler reset planned for {!r}'.format(self))
def __init__(self, **config: Any) -> None:  # pragma: no cover
    """Validate and store the crawler config, then set up crawl state."""
    # validated config is intended to be immutable from now on
    self._config = self.check_config(config)
    self._reset_before_next_crawl = True
    self._crawl_lock = Lock()
    _log('debug', 'crawler initialized: {!r}'.format(self))