def _retry_connection(url, method="get", max_retries=3, **kwargs):
    """Repeats the request with increasing pauses until a response arrives.

    This should smooth over the 10054 error that Windows throws.

    Args:
        url (str): the destination url.
        method (str): a valid HTTP verb, defaults to "get".
        max_retries (int): the maximum number of retries.
        **kwargs (dict): keyword arguments for requests.

    Returns:
        `requests.Response`: the response from the website, or `None` if all
            retries failed.
    """
    retry = 0
    response = None
    # create a default dictionary for the request arguments.
    defaults = utility.DefaultDict({
        "headers": {"User-Agent": "Sherlock/0.0.1"}
    })

    while response is None and retry < max_retries:
        try:
            with requests.Session() as s:
                logger.debug(f"Try to {method.upper()} to '{url}'.")
                response = s.request(method, url, **(defaults.other(kwargs)))
        except requests.exceptions.ConnectionError as conn_err:
            # sleep for exponentially increasing time intervals.
            logger.error("Detected an error while connecting... "
                         f"retry ({retry}): {conn_err}")
            time.sleep(2 ** retry)
            retry += 1
    return response
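# Illustrative usage sketch (not part of the module): how _retry_connection
# might be called. The URL and query parameters are placeholders; additional
# keyword arguments such as `params` and `timeout` are simply passed through
# to `requests.Session.request`.
response = _retry_connection("https://example.com/api/items",
                             method="get",
                             max_retries=5,
                             params={"page": 1},
                             timeout=30)
if response is not None and response.ok:
    data = response.json()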
def __init__(self, elastic, crawler_dir="crawlers", crawler_args=None,
             **cron_defaults):
    """Initializes the scheduler by binding it to its Elasticsearch database.

    Args:
        elastic (elasticsearch.Elasticsearch): the es-client to save the
            crawling jobs in.
        crawler_dir (str): the directory where the crawlers can be found.
            Defaults to "crawlers".
        crawler_args (dict): a dictionary of keyword arguments that is
            passed on to the crawler job store (`InjectorJobStore`).
        **cron_defaults (dict): keyword arguments for the default cron
            trigger (hour, minute, second).

    Returns:
        Scheduler: a fresh Scheduler instance.
    """
    if crawler_args is None:
        crawler_args = {}

    jobstores = {
        "default": {"type": "memory"},
        "elastic": InjectorJobStore(kwargs=crawler_args, client=elastic)
    }

    executors = {
        "default": ThreadPoolExecutor(10),
        "processpool": ProcessPoolExecutor(10)
    }

    job_defaults = {
        "misfire_grace_time": 5 * 60,  # 5 min
        "coalesce": True,
    }

    self.cron_defaults = utility.DefaultDict(
        {
            # standard is every day at 00:00:00
            "hour": 0,
            "minute": 0,
            "second": 0
        },
        **cron_defaults)

    self.scheduler = BackgroundScheduler(jobstores=jobstores,
                                         executors=executors,
                                         job_defaults=job_defaults,
                                         timezone=utc)

    self.crawlers = _detect_crawlers()
    # set up the validator schema.
    self.job_validator = cerberus.Validator(
        SCHEMATA["job"]({"trigger_ids": list(self.TRIGGERS)}),
        allow_unknown=True)

    self.scheduler.start()
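# Illustrative usage sketch (assumed values): constructing the Scheduler with
# a plain Elasticsearch client and overriding the default cron trigger to run
# at 03:30 every day. Connection details are placeholders.
from elasticsearch import Elasticsearch

es_client = Elasticsearch(host="localhost", port=9200)
scheduler = Scheduler(es_client, crawler_dir="crawlers", hour=3, minute=30)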
def __init__(self, host="localhost", port=9000, auth=None, cert=None,
             **kwargs):
    """Initialize the elasticsearch client.

    Args:
        host (str): the host of the elasticsearch database.
            Defaults to `'localhost'`.
        port (int): the port of the elasticsearch database.
            Defaults to `9000`.
        auth (tuple): a tuple of username and password, defaults to `None`.
        cert (path): a path to a self-signed certificate, as used by
            IBM Cloud.
        **kwargs (dict): keyword arguments that update the defaults.

    Returns:
        Elastic: a new elasticsearch client.
    """
    self.defaults = utility.DefaultDict(dict({
        "seeds_index": "seeds",
        "search_index": "searches",
        "docs_index": "eurlex",
        "doc_type": "nutch",
        "seed_type": "seed",
        "search_type": "search",
        "size": 10
    }, **kwargs))

    context = None
    if cert:
        if not os.path.exists(cert):
            logger.error(f"Certificate file: {cert} does not exist!")
        else:
            context = ssl.create_default_context(cafile=cert)

    self.es = es.Elasticsearch(host=host, port=port, http_auth=auth,
                               use_ssl=True, ssl_context=context,
                               timeout=60)
    self.fs = filestore.FileStore(self.defaults.fs_dir(None))

    for script_id, script_body in self.SCRIPTS.items():
        self.es.put_script(id=script_id, body=script_body)

    # check whether the indices exist, if not create them.
    self._create_index(self.defaults.docs_index(),
                       self.defaults.doc_type(),
                       self.DOC_MAPPING)
    self._create_index(self.defaults.seeds_index(),
                       self.defaults.seed_type(),
                       self.SEED_MAPPING)
    self._create_index(self.defaults.search_index(),
                       self.defaults.search_type(),
                       self.SEARCH_MAPPING)

    # the document index is closed while its settings are updated.
    self.es.indices.close(index=self.defaults.docs_index())
    self.es.indices.put_settings(index=self.defaults.docs_index(),
                                 body=self.SETTINGS)
    self.es.indices.open(index=self.defaults.docs_index())
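# Illustrative usage sketch (assumed values): connecting to a secured
# Elasticsearch instance with basic auth and a self-signed certificate,
# and overriding the default document index. All hosts, credentials and
# paths are placeholders.
elastic = Elastic(host="example.databases.appdomain.cloud",
                  port=30733,
                  auth=("admin", "secret"),
                  cert="/etc/ssl/certs/es-selfsigned.pem",
                  docs_index="documents")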
def __init__(self, elastic, fetch_limit=None, initial=False, queue_size=100):
    """Initializes the plugin with an `elastic.Elastic` client.

    Args:
        elastic (elastic.Elastic): the elasticsearch wrapper to work with.
        fetch_limit (int): an optional limit for fetched entries,
            defaults to `None`.
        initial (bool): flag for an initial run, defaults to `False`.
        queue_size (int): the maximum size of the internal document queue,
            defaults to `100`.
    """
    super(BasePlugin, self).__init__()
    self.elastic = elastic
    self.defaults = utility.DefaultDict({
        "limit": fetch_limit,
        "initial": initial,
    })
    self.url_fetcher = _retry_connection
    self.entry_resource = []
    self.docq = queue.Queue(maxsize=queue_size)
def __init__(self, **kwargs):
    """Initializes the PDFConverter.

    Args:
        **kwargs (dict): keyword arguments for further options.

    Returns:
        PDFConverter: a new PDFConverter instance.
    """
    super().__init__()
    self.defaults = utility.DefaultDict(
        {"bin_path": utility.path_in_project("pdftotext", True)},
        **kwargs)
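# Illustrative usage sketch (assumed path): constructing the converter with
# an explicit path to the pdftotext binary instead of the project default.
converter = PDFConverter(bin_path="/usr/local/bin/pdftotext")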
def __init__(self, directory=None):
    """Initializes a file store in the given directory.

    Args:
        directory (str): an absolute path to the directory where the files
            should be saved. Defaults to the project's "uploads" folder.
    """
    self.defaults = utility.DefaultDict({"mode": "b"})
    self.dir = directory
    # fall back to the default upload path inside the project.
    if self.dir is None:
        self.dir = utility.path_in_project("uploads")
    if not _create_dir(self.dir):
        raise IOError(f"Couldn't create the upload folder: {self.dir}")
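# Illustrative usage sketch (assumed path): creating a FileStore in an
# explicit directory; omitting the argument falls back to the project's
# "uploads" folder.
store = FileStore("/var/data/sherlock/uploads")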