def startup(self):
        """"""
        super().startup()

        self.DATAPATH = self.config["Path"]
        self.REQUEST_TIMEOUT = float(
            self.config["RequestTimeoutFactor"]) * float(
                self.config["StopWaitSecs"])

        self.ua = UserAgent()

        self.db = DBInterface(config=self.config)
        self.db.connection_name = self.name

        self.request = Request(self.db)
        self.url = URLs(self.db)
        self.docs = Documents(self.db)

        self.logger.info("{} started".format(self.name))

        self.url_id, self.url_str = None, None

        self.headers = {
            "Accept":
            "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "Accept-Encoding": "gzip, deflate, br",
            "Accept-Language": "de-DE,de;q=0.9,en-US;q=0.8,en;q=0.7",
            "Dnt": "1",
            "Referer": "https://www.google.com",
        }
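The startup methods in these examples read their settings from self.config. A hypothetical configuration sketch covering the keys used across the examples, plus the timeout derivation from above; the section name "General" is borrowed from the stop_procs example that follows, and all values are made up:

# Hypothetical configuration sketch (not from the original project).
from configparser import ConfigParser

config = ConfigParser()
config.read_string("""
[General]
Path = ../data/
RequestTimeoutFactor = 0.8
StopWaitSecs = 10
PrefetchLimit = 50
ESConnection = http://localhost:9200
ESIndexname = documents
""")

section = config["General"]
# startup() above derives the per-request timeout like this:
request_timeout = float(section["RequestTimeoutFactor"]) * float(section["StopWaitSecs"])
print(request_timeout)  # 8.0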
Example 2
    def stop_procs(self):
        super(Context, self).stop_procs()
        temp_db = DBInterface(config=self.config["General"])

        docs = Documents(temp_db)
        self.logger.info("Resetting scheduled documents")
        docs.reset_enqueued()
Example 3
    def startup(self):
        """"""
        super().startup()

        self.db = DBInterface(config=self.config)
        self.db.connection_name = self.name

        self.docs = Documents(self.db)

        self.logger.info("{} started".format(self.name))
Example 4
class PostProcessingScheduler(ProcWorker):
    def init_args(self, args):
        (self.document_q, ) = args

    def startup(self):
        super().startup()

        self.PREFETCH_LIMIT = int(self.config["PrefetchLimit"])

        self.db = DBInterface(config=self.config)
        self.db.connection_name = self.name

        self.documents = Documents(self.db)
        self.todo_documents = []
        self.current_document = None
        self.logger.info("{} started".format(self.name))

    def shutdown(self):
        super().shutdown()

    def main_func(self):
        """
        Requests a batch of documents that have not been postprocessed yet from
        the database and enqueues them into the postprocessing queue.
        (A minimal sketch of the enqueue-and-retry pattern follows this example.)
        """

        if len(self.todo_documents) == 0:
            self.logger.debug("Requesting new documents")
            self.todo_documents = self.documents.get_unprocessed_documents(
                limit=self.PREFETCH_LIMIT)
            if len(self.todo_documents) == 0:
                time.sleep(self.DEFAULT_POLLING_TIMEOUT * 10)
                self.logger.debug("No new documents recieved")
            else:
                self.logger.debug("Recieved {} new documents".format(
                    len(self.todo_documents)))

            return

        if self.current_document is None:
            self.current_document = self.todo_documents.pop()
            self.documents.mark_as_enqueued(
                self.current_document["document"]["id"])

        try:
            self.logger.debug("Queueing up Document with id: {}".format(
                self.current_document["document"]["id"]))
            self.document_q.put(self.current_document,
                                timeout=self.DEFAULT_POLLING_TIMEOUT)
            self.logger.info("Queued up document with id: {}".format(
                self.current_document["document"]["id"]))
            self.current_document = None
        except Full:
            self.logger.debug("Queue full - retrying")
Example 5
    def startup(self):
        super().startup()

        self.PREFETCH_LIMIT = int(self.config["PrefetchLimit"])

        self.db = DBInterface(config=self.config)
        self.db.connection_name = self.name

        self.documents = Documents(self.db)
        self.todo_documents = []
        self.current_document = None
        self.logger.info("{} started".format(self.name))
Example 6
    def startup(self):
        """"""
        super().startup()

        self.es = Elasticsearch(self.config["ESConnection"])
        self.indexname = self.config["ESIndexname"]
        self.PREFETCH_LIMIT = int(self.config["PrefetchLimit"])

        self.db = DBInterface(config=self.config)
        self.db.connection_name = self.name
        self.docs = Documents(self.db)

        self.logger.info("{} started".format(self.name))
Example 7
class PostProcessingWorker(QueueProcWorker):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def startup(self):
        """"""
        super().startup()

        self.db = DBInterface(config=self.config)
        self.db.connection_name = self.name

        self.docs = Documents(self.db)

        self.logger.info("{} started".format(self.name))

    def shutdown(self):
        """"""
        super().shutdown()

    def main_func(self, document):
        """
        Applies the data extraction rule to the passed-in document.

        Args:
            document (dict): combined dict holding the rule information and the
                document record (a shape sketch follows this example).
        """
        try:
            metadata = self.docs.get_metadata(document["document"]["id"])

            document_data = None
            document_data = rule_registry.all[
                document["rule"]["name"]].extract_data(
                    document["document"]["filepath"])

            data = {**metadata, **document_data}

            self.logger.debug(
                "Extracted the following information {}".format(data))

            self.docs.set_data(document["document"]["id"], data)

            self.logger.info("Processed document {}".format(
                document["document"]["id"]))

        except NotImplementedError:
            self.logger.info(
                "Document {} not processed. No postprocessing rule implemented"
                .format(document["document"]["id"]))
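The shape of the document argument is only implied by the lookups in main_func above. A sketch of that assumed shape, together with a toy rule and registry that stand in for the project's rule_registry; every name and value below is illustrative, not taken from the original code:

# Assumed shape of the "document" dict, inferred from the lookups above.
document = {
    "document": {"id": 42, "filepath": "/tmp/example.pdf"},  # hypothetical values
    "rule": {"name": "toy_rule"},
}

class ToyRule:
    @staticmethod
    def extract_data(filepath):
        # A real rule would parse the file at `filepath`; rules without an
        # implementation raise NotImplementedError, which main_func logs and skips.
        raise NotImplementedError

toy_registry = {"toy_rule": ToyRule}  # stand-in for rule_registry.all

try:
    data = toy_registry[document["rule"]["name"]].extract_data(
        document["document"]["filepath"])
except NotImplementedError:
    print("Document {} not processed".format(document["document"]["id"]))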
Example 8
def indexing_unindex(ctx):
    """
    Function for ``eurocli indexing unindex``.
    Unindexes all documents that are marked for unindexing.
    (A hypothetical CLI wiring sketch follows this example.)
    """
    d = Documents(ctx.obj["db"])

    click.echo("Unindexing stale documents")
    documents = d.get_documents_to_unidex()

    current_index = get_current_index(ctx.obj["es"], ctx.obj["index"])

    successfull_ids = index_documents(
        ctx.obj["es"], d, "delete", current_index, documents, silent=True
    )

    click.echo(
        "Unindexed successfully {} documents out of {}".format(
            len(successfull_ids), len(documents)
        )
    )
    d.reset_unindex(successfull_ids)
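The docstring above ties this function to ``eurocli indexing unindex``. A hypothetical sketch of how such a command could be wired up with click; the group and command names, and the assumption that ctx.obj already carries "db", "es" and "index", are guesses based on the docstring and the ctx.obj lookups above:

# Hypothetical click wiring for ``eurocli indexing unindex`` (illustrative only).
import click

@click.group()
@click.pass_context
def indexing(ctx):
    # In the real CLI, ctx.obj would already hold the "db", "es" and "index"
    # objects that indexing_unindex() reads.
    ctx.ensure_object(dict)

@indexing.command(name="unindex")
@click.pass_context
def unindex(ctx):
    indexing_unindex(ctx)

# Invoking the group (e.g. as part of a larger eurocli entry point) would then
# dispatch "indexing unindex" to the function above.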
Example 9
def postprocessing_reset(ctx, rule, force):
    """
    Function for ``eurocli postprocessing reset [...]``

    Args:
        ctx (context): context object
        rule (int): id(s) of the rule(s) whose documents should be reset
        force (boolean): if true, unindexing failures are ignored

    (A hypothetical sketch of the matching click options follows this example.)
    """
    click.echo("Resetting postprocessing results")

    d = Documents(ctx.obj["db"])
    if rule:
        for ru in rule:
            try:
                d.reset_postprocessing_by_rule(ru)
            except Exception as e:
                click.echo(e)
    else:
        if force:
            click.echo("Resetting all postprocessing results")
            d.reset_all_postprocessing()
        else:
            click.echo("Force (-f) to reset all postprocessing results")

    documents = d.get_documents_to_unidex()

    current_index = get_current_index(ctx.obj["es"], ctx.obj["index"])

    successfull_ids = index_documents(
        ctx.obj["es"], d, "delete", current_index, documents, silent=True
    )
    click.echo(
        "Unindexed successfully {} documents out of {}".format(
            len(successfull_ids), len(documents)
        )
    )
    if force:
        click.echo("Force resetting all unindex flags")
        d.reset_unindex(documents)
    else:
        d.reset_unindex(successfull_ids)
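The rule and force parameters described above suggest a repeatable integer option and a boolean flag. A hypothetical declaration of those options with click; the option names, short flags and help texts are assumptions, not the project's actual CLI definition:

# Hypothetical option declarations for ``eurocli postprocessing reset``.
import click

@click.command(name="reset")
@click.option("--rule", "-r", type=int, multiple=True,
              help="id(s) of the rule(s) whose documents should be reset")
@click.option("--force", "-f", is_flag=True,
              help="reset everything and ignore unindexing failures")
@click.pass_context
def reset(ctx, rule, force):
    postprocessing_reset(ctx, rule, force)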
Example 10
class DocumentDownloader(QueueProcWorker):
    """
    Worker responsible for downloading documents
    """

    DATAPATH = "../data/"

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def init_args(self, args):
        (
            self.work_q,
            self.url_q,
        ) = args

    def startup(self):
        """"""
        super().startup()

        self.DATAPATH = self.config["Path"]
        self.REQUEST_TIMEOUT = float(
            self.config["RequestTimeoutFactor"]) * float(
                self.config["StopWaitSecs"])

        self.ua = UserAgent()

        self.db = DBInterface(config=self.config)
        self.db.connection_name = self.name

        self.request = Request(self.db)
        self.url = URLs(self.db)
        self.docs = Documents(self.db)

        self.logger.info("{} started".format(self.name))

        self.url_id, self.url_str = None, None

        self.headers = {
            "Accept":
            "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "Accept-Encoding": "gzip, deflate, br",
            "Accept-Language": "de-DE,de;q=0.9,en-US;q=0.8,en;q=0.7",
            "Dnt": "1",
            "Referer": "https://www.google.com",
        }

    def shutdown(self):
        """"""
        super().shutdown()

    def main_func(self, token):
        """
        Downloads documents.
        It is called whenever a new request token is provided by the throttling
        mechanism and then tries to get a new URL from the URL work queue; the
        token is returned if no work is available. Otherwise a new session with
        a random user agent is created, the download is triggered, the
        downloaded file is stored and the request is logged.
        (A token-bucket producer sketch follows this example.)

        Args:
            token (str): Request throttling token that is provided by the token bucket
        """
        # get url
        if not self.url_id:
            self.logger.debug("Getting new URL")
            self.url_id = self.url_q.safe_get()

            if self.url_id is None:
                self.work_q.safe_put(token)
                time.sleep(self.DEFAULT_POLLING_TIMEOUT)
                self.logger.debug("No work - returning")
                return

            url = self.url.get_url(id=self.url_id)
            self.url_str = url["url"]
            self.filetype = url["filetype"]

        try:

            self.logger.debug("Downloading: {}".format(self.url_str))

            with requests.Session() as ses:
                # update instead of assign so the shared default headers
                # dict is not mutated by the per-session User-Agent
                ses.headers.update(self.headers)
                ses.headers["User-Agent"] = self.ua.random
                resp = ses.get(
                    self.url_str,
                    allow_redirects=True,
                    timeout=self.REQUEST_TIMEOUT,
                )
            self.logger.debug("Response for: {} is {}".format(
                self.url_str, resp.status_code))

            doc_id = None
            # if successful, store the file
            if resp.status_code == 200:
                self.logger.debug("Storing file for {}".format(self.url_str))
                file_uuid = str(uuid.uuid4())
                filename = file_uuid + self.filetype
                abspath = os.path.abspath(self.DATAPATH)
                filepath = os.path.join(abspath, filename)

                with open(filepath, "wb") as file:
                    file.write(resp.content)

                doc_id = self.docs.register_document(filepath=filepath,
                                                     filename=file_uuid)

            self.request.mark_as_requested(
                self.url_id,
                status_code=resp.status_code,
                redirected_url=resp.url,
                document_id=doc_id,
            )

            self.logger.info("Crawled: {}".format(self.url_str))

            self.url_id, self.url_str, self.filetype = None, None, None

        except requests.ReadTimeout as e:

            self.logger.warn("Timeout for url: {}".format(self.url_str))
            self.logger.warn("Exception Message: {}".format(e))

            self.request.mark_as_requested(url_id=self.url_id,
                                           status_code=408,
                                           redirected_url=self.url_str)
            time.sleep(self.DEFAULT_POLLING_TIMEOUT)
            return

        except requests.RequestException as e:
            self.logger.warn("Request exception for url: {}".format(
                self.url_str))
            self.logger.warn("Exception Message: {}".format(e))
            self.request.mark_as_requested(url_id=self.url_id,
                                           status_code=460,
                                           redirected_url=self.url_str)
            time.sleep(self.DEFAULT_POLLING_TIMEOUT)
            return
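main_func above is driven by request tokens taken from work_q; the docstring attributes them to a token bucket. A minimal sketch of what such a token producer could look like; the rate, the token value and the use of a plain queue.Queue in place of the framework's work_q are assumptions:

# Minimal token-bucket style producer (illustrative stand-in for the throttle).
import queue
import time

def fill_tokens(work_q, rate_per_sec=1.0, n_tokens=5):
    # One token per request: the downloader consumes a token, downloads a URL,
    # and puts the token back only when no work was available.
    for _ in range(n_tokens):
        try:
            work_q.put("token", block=False)
        except queue.Full:
            pass  # bucket already full - drop the token
        time.sleep(1.0 / rate_per_sec)

work_q = queue.Queue(maxsize=2)
fill_tokens(work_q, rate_per_sec=10.0, n_tokens=3)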
Example 11
class Indexer(ProcWorker):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def startup(self):
        """"""
        super().startup()

        self.es = Elasticsearch(self.config["ESConnection"])
        self.indexname = self.config["ESIndexname"]
        self.PREFETCH_LIMIT = int(self.config["PrefetchLimit"])

        self.db = DBInterface(config=self.config)
        self.db.connection_name = self.name
        self.docs = Documents(self.db)

        self.logger.info("{} started".format(self.name))

    def shutdown(self):
        """"""
        super().shutdown()

    def main_func(self):
        """
        Gets unindexed documents from the database and indexes them in Elasticsearch.

        Documents that are already indexed are deleted from the index first and
        then reindexed, which avoids keeping multiple versions of a document in
        the index. Leftover documents can be caused by an unsuccessful
        postprocessing reset or by Elasticsearch timeouts.
        (A bulk delete-then-index sketch follows this example.)

        """

        try:
            documents = self.docs.get_unindexed_data(limit=self.PREFETCH_LIMIT)

            if len(documents) > 0:

                deleted_ids = index_documents(self.es,
                                              self.docs,
                                              "delete",
                                              self.indexname,
                                              documents,
                                              silent=True)

                if len(deleted_ids) > 0:
                    self.logger.warning(
                        "Deleted {} documents successfully out of {} documents in the batch"
                        .format(len(deleted_ids), len(documents)))

                successfull_ids = index_documents_data(self.es,
                                                       self.docs,
                                                       "index",
                                                       self.indexname,
                                                       documents,
                                                       silent=True)

                self.docs.set_indexed(successfull_ids)

                self.logger.info(
                    "Indexed {} documents successfully out of {} documents in the batch"
                    .format(len(successfull_ids), len(documents)))

        except Exception as e:
            self.logger.error(e)

        finally:
            time.sleep(self.DEFAULT_POLLING_TIMEOUT)
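The Indexer deletes already-indexed copies before reindexing so the index never holds two versions of the same document. A sketch of that delete-then-index pattern using elasticsearch.helpers.bulk; the connection string, index name and document payloads are made up, and the project's own index_documents/index_documents_data helpers are not reproduced here:

# Sketch of the delete-then-index pattern (assumes a reachable Elasticsearch cluster).
from elasticsearch import Elasticsearch, helpers

es = Elasticsearch("http://localhost:9200")  # assumed connection string
indexname = "documents"                      # assumed index name

batch = [{"id": 1, "data": {"title": "example"}}]  # made-up document batch

# First remove any previously indexed version of each document ...
delete_actions = [
    {"_op_type": "delete", "_index": indexname, "_id": doc["id"]}
    for doc in batch
]
helpers.bulk(es, delete_actions, raise_on_error=False)

# ... then index the current data under the same id.
index_actions = [
    {"_op_type": "index", "_index": indexname, "_id": doc["id"], "_source": doc["data"]}
    for doc in batch
]
helpers.bulk(es, index_actions, raise_on_error=False)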