Example 1
def _query_from_search(search):
    """Creates a query string and a tbs-value for the given search-dict."""
    s = ut.SDA(search)
    query = []
    query.append(s["keywords"])
    query.append(" OR ".join(
        [Q_FT_TEMPLATE.format(ft=ft) for ft in s["file_types"]]))
    query.append(" OR ".join(
        [Q_SRC_TEMPLATE.format(src=src) for src in s["sources"]]))

    tp = ""
    max_level = 0

    for cur_tp in s["time_periods"]:
        cur_level = Q_TYPE_HIERARCHY.get(cur_tp["type"], 0)
        if cur_level <= max_level:
            continue

        if cur_tp["type"] == "c":
            tp = Q_TF_TEMPLATE.format(**cur_tp)
        else:
            tp = Q_TP_TEMPLATE.format(**cur_tp)
        max_level = cur_level

    return {"query": quote_plus(" ".join(query)), "tp": quote_plus(tp)}
Example 2
 def analyze(self, doc, **options):
     doc = ut.SDA(doc)
     return {
         "date":
         ut.date_from_string(
             ut.try_keys(doc, [
                 "metadata.date", "metadata.ModDate",
                 "metadata.Last-Modified", "metadata.modified",
                 "metadata.CreationDate", "metadata.crawl_date"
             ], None)),
         "source": {
             "url": doc["metadata.url"],
             "name": doc["metadata.source"] or "inhouse"
         },
         "content_type":
         ut.try_keys(doc, [
             "content_type", "metadata.contentType", "metadata.mimetype",
             "metadata.Content-Type", "metadata.dc:format"
         ], "application/pdf").strip(),
         "document":
         ut.try_keys(doc, [
             "metadata.title", "metadata.Title", "metadata.dc:title",
             "metadata.filename"
         ], "No Title").strip()
     }
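ut.try_keys is used here to probe a list of dotted metadata keys in order and to fall back to a default when none of them resolves. Its real implementation is not shown in these snippets; a minimal equivalent that matches this call pattern might look like the following (illustration only):

def try_keys(doc, dotted_keys, default=None):
    # Illustrative stand-in for ut.try_keys: return the value of the first
    # dotted key that resolves to a non-None value on the SDA-wrapped
    # document, otherwise the given default.
    for key in dotted_keys:
        value = doc[key]
        if value is not None:
            return value
    return default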
Example 3
 def process_document(self, document, **kwargs):
     doc = ut.SDA(document)
     resp = self.url_fetcher(doc["metadata.detail_url"])
     tree = html.fromstring(resp.content)
     content = self.content_path(tree)
     doc["metadata.mentionned"] = [
         _make_resource_path(e, self.CWD)
         for e in self.connected_path(content)
     ]
     return doc.a_dict
Example 4
    def find_entries(self, page):
        docs = []
        for entry in self.entry_path(page):
            doc = ut.SDA({}, "N/A")
            doc["metadata.url"] = self.doc_path(entry)
            doc["metadata.date"] = self.date_path(entry)
            doc["metadata.title"] = self.title_path(entry)
            doc["metadata.crawl_date"] = ut.from_date()
            docs.append(doc.a_dict)

        return docs
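The entry_path, doc_path, title_path, and date_path attributes (like the other *_path callables in these examples) are invoked as callables on lxml elements; the page tree itself is built with html.fromstring, as in Example 3. The concrete selectors are plugin-specific and not part of these snippets; a hypothetical set built with lxml.etree.XPath, consistent with how they are called here, could look like this:

from lxml import etree

# Hypothetical selectors; the real XPath expressions are defined per plugin.
entry_path = etree.XPath("//div[@class='entry']")                # list of elements
doc_path = etree.XPath("string(.//a[@class='download']/@href)")  # string
title_path = etree.XPath("normalize-space(.//h2)")               # string
date_path = etree.XPath("string(.//time/@datetime)")             # string

Note that the same attribute names return different shapes elsewhere (Example 9's date_path yields a mapping of labeled dates), so each plugin defines its own callables.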
Example 5
 def process_document(self, document, **kwargs):
     doc = ut.SDA(document)
     resp = self.url_fetcher(doc["metadata.detail_url"])
     tree = html.fromstring(resp.content)
     for entry in self.meta_path(tree):
         key = self.key_path(entry)
         value = self.value_path(entry)
         doc[f"metadata.{key}"] = value
     # if there is more than one document included, use the HTML version.
     if self.num_of_docs_path(tree) > 1:
         doc["metadata.url"] = _get_html_version(doc["metadata.url"])
     return doc.a_dict
Example 6
    def find_entries(self, page):
        docs = []
        for entry in self.entry_path(page):
            doc = ut.SDA({}, "N/A")
            title = self.title_path(entry)
            if not title:
                continue
            doc["metadata.title"] = title
            logging.info(f"Found document: {title}.")
            date = self.date_path(entry)
            doc["metadata.date"] = date
            doc["metadata.url"] = self.doc_path(entry)
            docs.append(doc.a_dict)

        return docs
Example 7
    def find_entries(self, page):
        docs = []
        for entry in self.entry_path(page):
            doc = ut.SDA({}, "N/A")
            doc["metadata.date"] = self.date_path(entry)
            doc["metadata.title"] = self.title_path(entry)
            doc["metadata.detail_url"] = self.detail_path(entry)
            doc["metadata.url"] = self.doc_path(entry)
            if doc["metadata.url"] is None:
                doc["metadata.url"] = doc["metadata.detail_url"]
            doc["metadata.topic"] = self.topic_path(entry)
            doc["metadata.type"] = self.type_path(entry)
            docs.append(doc.a_dict)

        return docs
Example 8
    def upsert_job(self, job_dict, **runtime_args):
        """Adds or updates a job using the provided user_input.

        If an id field is present in the dict, the job is updated, otherwise
        a new one is created.

        Args:
            job_dict (dict): user input for a job, as defined in `SCHEMATA`.
            **runtime_args (dict): additional runtime arguments for the
                crawler.

        Returns:
            apscheduler.job.Job: the newly created or updated job.
        """
        if not self.job_validator.validate(job_dict):
            raise AssertionError(str(self.job_validator.errors))

        doc = utility.SDA(job_dict)

        job = self.crawlers.get(doc["crawler.id"], None)
        # default to the SearchPlugin and pass the search id as argument.
        if job is None:
            inst = {
                "args": ("SearchPlugin", runtime_args),
                "kwargs": dict(search_id=doc["crawler.id"])
            }
        else:
            inst = {"args": (doc["crawler.id"], runtime_args), "kwargs": {}}
        trigger = self._make_trigger(doc["schedule"])

        if doc["id"]:
            self.scheduler.modify_job(doc["id"],
                                      jobstore="elastic",
                                      func=_run_plugin,
                                      name=doc["name.name"],
                                      **inst)
            new_job = self.scheduler.reschedule_job(doc["id"],
                                                    jobstore="elastic",
                                                    trigger=trigger)
        else:
            # use the crawler id as name, when the job is created.
            new_job = self.scheduler.add_job(_run_plugin,
                                             jobstore="elastic",
                                             trigger=trigger,
                                             name=doc["crawler.id"],
                                             **inst)

        return new_job
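The fields upsert_job actually reads can be inferred from the dotted keys it accesses; the authoritative field definitions live in the project's SCHEMATA and are not shown here. A purely hypothetical job_dict consistent with those accesses:

# Hypothetical input; the field values are invented and the "schedule" shape
# is unknown (it is only passed through to self._make_trigger).
job_dict = {
    "id": None,                              # falsy -> a new job is added
    "name": {"name": "example job"},         # read as doc["name.name"]
    "crawler": {"id": "SomeCrawlerPlugin"},  # read as doc["crawler.id"]
    "schedule": {},                          # consumed by self._make_trigger
}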
Example 9
    def find_entries(self, page):
        docs = []
        for entry in self.entry_path(page):
            doc = ut.SDA({}, "N/A")
            title = self.title_path(entry)
            if not title:
                continue
            doc["metadata.title"] = title
            logging.info(f"Found document: {title}.")
            dates = self.date_path(entry)
            doc["metadata.date"] = dates.get("Last update", dt.datetime.now())
            doc["metadata.date_original"] = dates.get("Publication date",
                                                      dt.datetime.now())
            doc["metadata.url"] = self.doc_path(entry)
            docs.append(doc.a_dict)

        return docs
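In this variant date_path is expected to return a mapping of labeled dates rather than a single value; a hypothetical return value consistent with the keys the snippet reads:

import datetime as dt

# Hypothetical result of self.date_path(entry); only the two keys below are
# read, with dt.datetime.now() as the fallback for either.
dates = {
    "Publication date": dt.datetime(2020, 1, 15),
    "Last update": dt.datetime(2020, 3, 2),
}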
Example 10
def transform_output(results):
    """Transforms the results dictionary into an easy readable document list.

    Args:
        results (dict): the results of an elastic_search operation.

    Returns:
        list: a list of cleaned documents.
    """
    res = ut.SDA(results)

    if res["_source"]:
        return _transform_document(results)
    if res["hits.total"] > 0:
        return [_transform_document(doc) for doc in res["hits.hits"]]
    # otherwise
    return []
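A hedged usage sketch: only _source, hits.total, and hits.hits are actually read by the function; every other field below is assumed for illustration and mimics the classic Elasticsearch response layout.

get_result = {"_id": "1", "_source": {"metadata": {"title": "Report"}}}
search_result = {
    "hits": {
        "total": 2,
        "hits": [
            {"_id": "1", "_source": {"metadata": {"title": "A"}}},
            {"_id": "2", "_source": {"metadata": {"title": "B"}}},
        ],
    }
}

transform_output(get_result)     # single hit -> _transform_document(get_result)
transform_output(search_result)  # search response -> one cleaned doc per hit
transform_output({})             # nothing matched -> []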
Example 11
    def download_document(self, document, **kwargs):
        """Fetches the url of a document and sets the content of the document.

        Args:
            document (dict): the document that should be prepared; expects
                at least a "metadata.url" key.
            **kwargs (dict): additional keyword args, which are accepted but
                not used.

        Returns:
            dict: a document with added "content" field.
        """
        # fetch body
        doc_url = utility.SDA(document)["metadata.url"]
        if not doc_url:
            document["raw_content"] = None
            return document
        resp = self.url_fetcher(doc_url)
        content_type = resp.headers.get("content-type", None)
        document["content_type"] = content_type
        document["raw_content"] = resp.content
        return document
Example 12
    def get_documents(self, limit=None, initial=None, **kwargs):
        """Fetches new entries for the given resource and places them in a
        queue (`self.docq`).

        Args:
            limit (int): maximum number of entries to pull.
            initial (bool): whether an initial run should be done.
                An initial run does not halt when a document has already
                appeared previously.
            **kwargs (dict): additional keyword args, which are accepted but
                not used.

        Returns:
            BasePlugin: self.
        """
        limit = self.defaults.limit.also(limit)
        initial = self.defaults.initial.also(initial)
        has_unseen_docs = True
        doc_count = 0
        for page in self.entry_resource:
            # turn the page's entries into document dicts (tracking is checked below)
            cur_docs = self.find_entries(page, **kwargs)

            # log when a page yields no documents
            if len(cur_docs) == 0:
                logger.info(f"No documents found on page {page}!")

            for doc in cur_docs:
                doc = utility.SDA(doc)
                doc_url = doc["metadata.url"]
                # skip entries where no url is given
                if not doc_url:
                    logger.debug("Document contains no url. SKIP.")
                    continue
                exists = self.elastic.exist_document(source_url=doc_url)

                # handle existing documents: if their date field lies in the
                # past, stop the search.
                if exists:
                    logger.debug(f"Document for url '{doc_url}' does already "
                                 "exist. SKIP.")
                    doc_date = doc["metadata.date"]
                    today = utility.from_date()
                    if (not initial) and doc_date and doc_date < today:
                        logger.debug("Document's date lies in the past."
                                     "Stop search.")
                        has_unseen_docs = False
                        break
                    continue

                logger.info(f"Found document {doc_url}.")
                doc["metadata.source"] = self.source_name
                # enter documents to processing queue.
                self.docq.put((doc_count, doc.a_dict))
                doc_count += 1

                # break when the number of retrieved documents reaches the
                # limit
                if limit and doc_count >= limit:
                    has_unseen_docs = False
                    break

            # check whether there are still unseen documents, else do not
            # continue searching
            if has_unseen_docs is False:
                break
        return self
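self.defaults.limit.also(limit) reads as "use the explicitly passed value if one is given, otherwise fall back to the configured default". The project's defaults object is not shown in these snippets; a minimal stand-in consistent with that reading:

class Default:
    """Illustrative holder for a configured default value (assumption)."""

    def __init__(self, value):
        self.value = value

    def also(self, override):
        # prefer the explicit argument; fall back to the stored default
        # when the caller passed None.
        return self.value if override is None else override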