Example #1
def _retry_connection(url, method="get", max_retries=3, **kwargs):
    """Repeats the connection with increasing pauses until an answer arrives.

    This should ease out of the 10054 Error, that windows throws.

    Args:
        url (str): the destination url.
        method (str): a valid HTTP verb. Defaults to "get".
        max_retries (int): the maximum number of retries.
        **kwargs (dict): keyword arguments passed on to `requests`.

    Returns:
        `requests.Response`: the response from the website, or `None` if all
            retries failed.
    """
    retry = 0
    response = None
    # create a default dictionary for the request arguments.
    defaults = utility.DefaultDict({
        "headers": {"User-Agent": "Sherlock/0.0.1"}
    })

    while response is None and retry < max_retries:
        try:
            with requests.Session() as s:
                logger.debug(f"Try to {method.upper()} to '{url}'.")
                response = s.request(method, url, **(defaults.other(kwargs)))
        except requests.exceptions.ConnectionError as conn_err:
            # sleep with exponentially increasing time intervals
            logger.error("Detected an error while connecting... "
                         f"retry ({retry}): {conn_err}")
            time.sleep(2 ** retry)
            retry += 1
    return response
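
For reference, a self-contained sketch of the same back-off pattern, assuming only `requests` is installed; the project-specific `utility.DefaultDict` merge is replaced by a plain `setdefault` here:

import logging
import time

import requests

logger = logging.getLogger(__name__)

def fetch_with_retries(url, max_retries=3, **kwargs):
    kwargs.setdefault("headers", {"User-Agent": "Sherlock/0.0.1"})
    for retry in range(max_retries):
        try:
            return requests.get(url, **kwargs)
        except requests.exceptions.ConnectionError as err:
            logger.error(f"Connection error (retry {retry}): {err}")
            time.sleep(2 ** retry)  # back off 1s, 2s, 4s, ...
    return None
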
Example #2
    def __init__(self,
                 elastic,
                 crawler_dir="crawlers",
                 crawler_args=None,
                 **cron_defaults):
        """Initializes the scheduler by binding it to it's elasticsearch db.

        Args:
            elastic (elasticsearch.Elasticsearch): The es-client to save the
                crawling jobs in.
            crawler_dir (str): the directory, where the crawlers will be found.
                Defaults to "crawlers".
            crawler_args (dict): keyword arguments that are passed on to the
                crawlers through the job store.
            **cron_defaults (dict): default values for the cron triggers,
                e.g. hour, minute and second.

        Returns:
            Scheduler: a fresh Scheduler instance.
        """
        # guard against the mutable default-argument pitfall
        crawler_args = crawler_args or {}

        jobstores = {
            "default": {
                "type": "memory"
            },
            "elastic": InjectorJobStore(kwargs=crawler_args, client=elastic)
        }

        executors = {
            "default": ThreadPoolExecutor(10),
            "processpool": ProcessPoolExecutor(10)
        }

        job_defaults = {
            "misfire_grace_time": 5 * 60,  # 5min
            "coalesce": True,
        }

        self.cron_defaults = utility.DefaultDict(
            {
                # standard is every day at 00:00:00
                "hour": 0,
                "minute": 0,
                "second": 0
            },
            **cron_defaults)

        self.scheduler = BackgroundScheduler(jobstores=jobstores,
                                             executors=executors,
                                             job_defaults=job_defaults,
                                             timezone=utc)

        self.crawlers = _detect_crawlers()
        # set up the validator schema.
        self.job_validator = cerberus.Validator(
            SCHEMATA["job"]({"trigger_ids": list(self.TRIGGERS)}),
            allow_unknown=True)
        self.scheduler.start()
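
A stripped-down sketch of the same APScheduler wiring, assuming `apscheduler` and `pytz` are installed; the project-specific `InjectorJobStore` is left out:

from apscheduler.schedulers.background import BackgroundScheduler
from apscheduler.executors.pool import ThreadPoolExecutor, ProcessPoolExecutor
from pytz import utc

scheduler = BackgroundScheduler(
    jobstores={"default": {"type": "memory"}},
    executors={
        "default": ThreadPoolExecutor(10),
        "processpool": ProcessPoolExecutor(10),
    },
    job_defaults={"misfire_grace_time": 5 * 60, "coalesce": True},
    timezone=utc)
scheduler.start()

# the cron defaults above amount to a daily job at 00:00:00 UTC:
scheduler.add_job(lambda: print("tick"), "cron", hour=0, minute=0, second=0)
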
Example #3
    def __init__(self, host="localhost", port=9000, auth=None, cert=None,
                 **kwargs):
        """Initialize the elasticsearch client.

        Args:
            host (str): the host of the elasticsearch database.
                Defaults to `'localhost'`.
            port (int): the port of the elasticsearch database.
                Defaults to `9000`.
            auth (tuple): a tuple of username and password. Defaults to `None`.
            cert (str): a path to a self-signed certificate, as used by
                ibmcloud.
            **kwargs (dict): keyword arguments that update the defaults.

        Returns:
            Elastic: a new elasticsearch client.
        """
        self.defaults = utility.DefaultDict(dict({
            "seeds_index": "seeds",
            "search_index": "searches",
            "docs_index": "eurlex",
            "doc_type": "nutch",
            "seed_type": "seed",
            "search_type": "search",
            "size": 10
        }, **kwargs))

        context = None
        if cert:
            if not os.path.exists(cert):
                logger.error(f"Certificate file: {cert} does not exist!")
            else:
                context = ssl.create_default_context(cafile=cert)

        self.es = es.Elasticsearch(host=host, port=port, http_auth=auth,
                                   use_ssl=True, ssl_context=context,
                                   timeout=60)
        self.fs = filestore.FileStore(self.defaults.fs_dir(None))

        for script_id, script_body in self.SCRIPTS.items():
            self.es.put_script(id=script_id, body=script_body)

        # check whether the document index exists, if not create it.
        self._create_index(self.defaults.docs_index(),
                           self.defaults.doc_type(),
                           self.DOC_MAPPING)
        self._create_index(self.defaults.seeds_index(),
                           self.defaults.seed_type(),
                           self.SEED_MAPPING)
        self._create_index(self.defaults.search_index(),
                           self.defaults.search_type(),
                           self.SEARCH_MAPPING)
        self.es.indices.close(index=self.defaults.docs_index())
        self.es.indices.put_settings(index=self.defaults.docs_index(),
                                     body=self.SETTINGS)
        self.es.indices.open(index=self.defaults.docs_index())
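
A usage sketch for the client above, assuming it is importable as `elastic.Elastic` (the name used elsewhere in these examples); the host, credentials, certificate path and index name are illustrative:

from elastic import Elastic

# local instance with all DefaultDict defaults
client = Elastic()

# authenticated TLS instance with a self-signed certificate; any keyword
# argument (here docs_index) overrides the corresponding default
client = Elastic(host="es.example.com", port=9200,
                 auth=("user", "secret"), cert="/path/to/ca.pem",
                 docs_index="my_docs")
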
Example #4
    def __init__(self, elastic, fetch_limit=None, initial=False,
                 queue_size=100):
        """Initializes the plugin with an `elastic.Elastic` client.

        Args:
            elastic (elastic.Elastic): the client to save the documents in.
            fetch_limit (int): an optional limit of documents to fetch.
            initial (bool): flag for an initial run, kept in the defaults.
            queue_size (int): maximum size of the document queue.
        """
        super(BasePlugin, self).__init__()
        self.elastic = elastic
        self.defaults = utility.DefaultDict({
            "limit": fetch_limit,
            "initial": initial,
        })
        self.url_fetcher = _retry_connection
        self.entry_resource = []
        self.docq = queue.Queue(maxsize=queue_size)
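
A hypothetical subclass sketch showing how the base initializer is meant to be reused; `MyPlugin`, `elastic_client` and `document` are illustrative names:

class MyPlugin(BasePlugin):
    def __init__(self, elastic, **kwargs):
        super().__init__(elastic, fetch_limit=50, initial=True, **kwargs)

plugin = MyPlugin(elastic_client)
print(plugin.defaults.limit())  # -> 50, resolved via utility.DefaultDict
plugin.docq.put(document)       # blocks once 100 documents are queued
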
Example #5
    def __init__(self, **kwargs):
        """Initializes the PDFAnalyzer with an `elastic.Elastic` instance.

        Args:
            **kwargs (dict): keyword-arguments for further options.

        Returns:
            PDFConverter: a new PDFConverter instance.
        """
        super().__init__()
        self.defaults = utility.DefaultDict(
            {"bin_path": utility.path_in_project("pdftotext", True)}, **kwargs)
Example #6
    def __init__(self, directory=None):
        """Initialize a file-store to the given directory.

        Args:
            directory (str): an absolute path to the directory where the
                files should be saved. Defaults to the project's "uploads"
                directory.
        """
        self.defaults = utility.DefaultDict({"mode": "b"})
        self.dir = directory

        # set a default path
        if self.dir is None:
            self.dir = utility.path_in_project("uploads")

        if not _create_dir(self.dir):
            raise IOError(f"Couldn't create the upload folder: {self.dir}")
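
A usage sketch; the explicit directory is illustrative:

store = FileStore()                     # falls back to <project>/uploads
store = FileStore("/var/data/uploads")  # explicit absolute directory
print(store.defaults.mode())            # -> "b", files are handled in binary mode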