def _query_from_search(search):
    """Creates a query string and a tbs-value for the given search-dict."""
    s = ut.SDA(search)
    query = []
    query.append(s["keywords"])
    query.append(" OR ".join(
        [Q_FT_TEMPLATE.format(ft=ft) for ft in s["file_types"]]))
    query.append(" OR ".join(
        [Q_SRC_TEMPLATE.format(src=src) for src in s["sources"]]))

    tp = ""
    max_level = 0
    for cur_tp in s["time_periods"]:
        cur_level = Q_TYPE_HIERARCHY.get(cur_tp["type"], 0)
        if cur_level <= max_level:
            continue
        if cur_tp["type"] == "c":
            tp = Q_TF_TEMPLATE.format(**cur_tp)
        else:
            tp = Q_TP_TEMPLATE.format(**cur_tp)
        max_level = cur_level

    return {"query": quote_plus(" ".join(query)), "tp": quote_plus(tp)}
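# Hedged usage sketch for _query_from_search. The template constants and the
# exact search schema are not shown in this excerpt, so everything below is an
# assumption chosen only to illustrate the call, not the project's real values.
from urllib.parse import quote_plus

# Hypothetical template constants (Google-style search operators assumed):
Q_FT_TEMPLATE = "filetype:{ft}"
Q_SRC_TEMPLATE = "site:{src}"
Q_TP_TEMPLATE = "qdr:{type}"                         # relative period, e.g. last month
Q_TF_TEMPLATE = "cdr:1,cd_min:{min},cd_max:{max}"    # custom range for type "c"
Q_TYPE_HIERARCHY = {"d": 1, "w": 2, "m": 3, "y": 4, "c": 5}

search = {
    "keywords": "annual report",
    "file_types": ["pdf", "doc"],
    "sources": ["example.org"],
    "time_periods": [{"type": "m"}],
}
# With these assumptions, _query_from_search(search) would return roughly:
# {"query": "annual+report+filetype%3Apdf+OR+filetype%3Adoc+site%3Aexample.org",
#  "tp": "qdr%3Am"}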
def analyze(self, doc, **options):
    doc = ut.SDA(doc)
    return {
        "date": ut.date_from_string(
            ut.try_keys(doc, [
                "metadata.date",
                "metadata.ModDate",
                "metadata.Last-Modified",
                "metadata.modified",
                "metadata.CreationDate",
                "metadata.crawl_date"
            ], None)),
        "source": {
            "url": doc["metadata.url"],
            "name": doc["metadata.source"] or "inhouse"
        },
        "content_type": ut.try_keys(doc, [
            "content_type",
            "metadata.contentType",
            "metadata.mimetype",
            "metadata.Content-Type",
            "metadata.dc:format"
        ], "application/pdf").strip(),
        "document": ut.try_keys(doc, [
            "metadata.title",
            "metadata.Title",
            "metadata.dc:title",
            "metadata.filename"
        ], "No Title").strip()
    }
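# analyze() leans on ut.try_keys for its metadata fallbacks. Its implementation
# is not part of this excerpt; a minimal stand-in, assuming it simply returns
# the value of the first key that resolves to a non-None value, might look
# like this (illustrative only, not the project's code):
def try_keys_sketch(sda_doc, keys, default=None):
    """Return the first non-None value found under `keys`, else `default`."""
    for key in keys:
        value = sda_doc[key]
        if value is not None:
            return value
    return default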
def process_document(self, document, **kwargs):
    doc = ut.SDA(document)
    resp = self.url_fetcher(doc["metadata.detail_url"])
    tree = html.fromstring(resp.content)
    content = self.content_path(tree)
    doc["metadata.mentionned"] = [
        _make_resource_path(e, self.CWD) for e in self.connected_path(content)
    ]
    return doc.a_dict
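# _make_resource_path is not defined in this excerpt. Judging only from its
# arguments (a linked lxml element and self.CWD), a hypothetical version might
# resolve an element's href against the crawler's working directory. This is a
# guess for illustration, not the real helper:
from urllib.parse import urljoin

def _make_resource_path_sketch(element, cwd):
    """Illustrative guess: turn a linked element into an absolute resource path."""
    href = element.get("href", "")
    return urljoin(cwd + "/", href)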
def find_entries(self, page):
    docs = []
    for entry in self.entry_path(page):
        doc = ut.SDA({}, "N/A")
        doc["metadata.url"] = self.doc_path(entry)
        doc["metadata.date"] = self.date_path(entry)
        doc["metadata.title"] = self.title_path(entry)
        doc["metadata.crawl_date"] = ut.from_date()
        docs.append(doc.a_dict)
    return docs
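# ut.SDA, used throughout these find_entries variants, is not defined in this
# excerpt. From its usage it behaves like a "structured dict access" wrapper:
# dotted keys read and write nested dicts, the constructor's second argument is
# a default for missing values, and .a_dict exposes the plain dict. A minimal
# illustrative stand-in (assumptions only, not the real implementation):
class SDASketch:
    def __init__(self, data=None, default=None):
        self.a_dict = data if data is not None else {}
        self._default = default

    def __getitem__(self, dotted_key):
        node = self.a_dict
        for part in dotted_key.split("."):
            if not isinstance(node, dict) or part not in node:
                return self._default
            node = node[part]
        return node

    def __setitem__(self, dotted_key, value):
        parts = dotted_key.split(".")
        node = self.a_dict
        for part in parts[:-1]:
            node = node.setdefault(part, {})
        node[parts[-1]] = value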
def process_document(self, document, **kwargs):
    doc = ut.SDA(document)
    resp = self.url_fetcher(doc["metadata.detail_url"])
    tree = html.fromstring(resp.content)
    for entry in self.meta_path(tree):
        key = self.key_path(entry)
        value = self.value_path(entry)
        doc[f"metadata.{key}"] = value
    # if there is more than one document included, use the HTML version.
    if self.num_of_docs_path(tree) > 1:
        doc["metadata.url"] = _get_html_version(doc["metadata.url"])
    return doc.a_dict
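# The meta_path/key_path/value_path callables above are not shown in this
# excerpt; they behave like compiled XPath selectors applied to the detail
# page. A hypothetical setup for a plain key/value metadata table could look
# like this (an assumption about the selectors, shown at module level for
# brevity although the plugin keeps them as instance attributes):
from lxml import etree

meta_path = etree.XPath("//table[@class='metadata']//tr")
key_path = etree.XPath("string(./td[1])")
value_path = etree.XPath("string(./td[2])")
# With these, the loop in process_document would write one `metadata.<key>`
# entry onto the document per table row.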
def find_entries(self, page):
    docs = []
    for entry in self.entry_path(page):
        doc = ut.SDA({}, "N/A")
        title = self.title_path(entry)
        if not title:
            continue
        doc["metadata.title"] = title
        logging.info(f"Found document: {title}.")
        date = self.date_path(entry)
        doc["metadata.date"] = date
        doc["metadata.url"] = self.doc_path(entry)
        docs.append(doc.a_dict)
    return docs
def find_entries(self, page):
    docs = []
    for entry in self.entry_path(page):
        doc = ut.SDA({}, "N/A")
        doc["metadata.date"] = self.date_path(entry)
        doc["metadata.title"] = self.title_path(entry)
        doc["metadata.detail_url"] = self.detail_path(entry)
        doc["metadata.url"] = self.doc_path(entry)
        if doc["metadata.url"] is None:
            doc["metadata.url"] = doc["metadata.detail_url"]
        doc["metadata.topic"] = self.topic_path(entry)
        doc["metadata.type"] = self.type_path(entry)
        docs.append(doc.a_dict)
    return docs
def upsert_job(self, job_dict, **runtime_args):
    """Adds or updates a job using the provided user input.

    If an id field is present in the dict, the job is updated,
    otherwise a new one is created.

    Args:
        job_dict (dict): user input for a job, as defined in `SCHEMATA`.
        **runtime_args (dict): additional runtime arguments for the crawler.

    Returns:
        apscheduler.job.Job: the newly created or updated job.
    """
    if not self.job_validator.validate(job_dict):
        raise AssertionError(str(self.job_validator.errors))

    doc = utility.SDA(job_dict)
    job = self.crawlers.get(doc["crawler.id"], None)
    # default to the SearchPlugin and pass the search id as argument.
    if job is None:
        inst = {
            "args": ("SearchPlugin", runtime_args),
            "kwargs": dict(search_id=doc["crawler.id"])
        }
    else:
        inst = {"args": (doc["crawler.id"], runtime_args), "kwargs": {}}

    trigger = self._make_trigger(doc["schedule"])
    if doc["id"]:
        self.scheduler.modify_job(doc["id"],
                                  jobstore="elastic",
                                  func=_run_plugin,
                                  name=doc["name.name"],
                                  **inst)
        new_job = self.scheduler.reschedule_job(doc["id"],
                                                jobstore="elastic",
                                                trigger=trigger)
    else:
        # use the crawler id as name, when the job is created.
        new_job = self.scheduler.add_job(_run_plugin,
                                         jobstore="elastic",
                                         trigger=trigger,
                                         name=doc["crawler.id"],
                                         **inst)
    return new_job
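# Hedged usage sketch for upsert_job. The SCHEMATA definition is not part of
# this excerpt; the dict below only mirrors the keys the method actually reads
# ("crawler.id", "schedule", "id", "name.name") and is therefore an assumption
# about the schema, not its definition.
job_dict = {
    "name": {"name": "Example crawl"},   # read via doc["name.name"]
    "crawler": {"id": "ExamplePlugin"},  # read via doc["crawler.id"]
    "schedule": {},                      # shape depends on _make_trigger (not shown)
}
# scheduler.upsert_job(job_dict) would add a new job because no "id" is
# present; calling it again with "id" set to an existing job id would modify
# and reschedule that job instead of creating one.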
def find_entries(self, page):
    docs = []
    for entry in self.entry_path(page):
        doc = ut.SDA({}, "N/A")
        title = self.title_path(entry)
        if not title:
            continue
        doc["metadata.title"] = title
        logging.info(f"Found document: {title}.")
        dates = self.date_path(entry)
        doc["metadata.date"] = dates.get("Last update", dt.datetime.now())
        doc["metadata.date_original"] = dates.get("Publication date",
                                                  dt.datetime.now())
        doc["metadata.url"] = self.doc_path(entry)
        docs.append(doc.a_dict)
    return docs
def transform_output(results):
    """Transforms the results dictionary into an easily readable document list.

    Args:
        results (dict): the results of an elastic_search operation.

    Returns:
        a single cleaned document if the result holds a top-level `_source`,
        otherwise a list of cleaned documents (empty when there are no hits).
    """
    res = ut.SDA(results)
    if res["_source"]:
        return _transform_document(results)
    if res["hits.total"] > 0:
        return [_transform_document(doc) for doc in res["hits.hits"]]
    # otherwise
    return []
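# transform_output handles both a single-document response (with "_source")
# and a search response (with "hits"). A minimal search-style input in the
# usual Elasticsearch response layout, assuming the numeric hits.total the
# code compares against, might look like this sketch:
results = {
    "hits": {
        "total": 2,
        "hits": [
            {"_id": "1", "_source": {"metadata": {"title": "Doc A"}}},
            {"_id": "2", "_source": {"metadata": {"title": "Doc B"}}},
        ],
    }
}
# transform_output(results) maps _transform_document over both hits; a
# get-by-id style response carrying a top-level "_source" is transformed
# directly, and anything else yields an empty list.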
def download_document(self, document, **kwargs):
    """Fetches a document's url and stores the response as its raw content.

    Args:
        document (dict): the document that should be prepared, expects
            at least a "metadata.url" key.
        **kwargs (dict): additional keyword args, which are only consumed.

    Returns:
        dict: the document with added "content_type" and "raw_content" fields.
    """
    # fetch the document body
    doc_url = utility.SDA(document)["metadata.url"]
    if not doc_url:
        document["raw_content"] = None
        return document
    resp = self.url_fetcher(doc_url)
    content_type = resp.headers.get("content-type", None)
    document["content_type"] = content_type
    document["raw_content"] = resp.content
    return document
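# download_document only needs self.url_fetcher to return an object exposing
# `.headers` and `.content`, which is exactly what a requests.Response offers.
# A hedged sketch of wiring this up (the plugin's real constructor is not
# shown here, so the injection below is an assumption):
import requests

def simple_url_fetcher(url):
    """Minimal fetcher: a plain GET with a timeout."""
    return requests.get(url, timeout=30)

# plugin.url_fetcher = simple_url_fetcher
# plugin.download_document({"metadata": {"url": "https://example.org/a.pdf"}})
# -> returns the document with "content_type" and "raw_content" filled in,
#    or with raw_content set to None when no url is present.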
def get_documents(self, limit=None, initial=None, **kwargs):
    """Fetches new entries for the given resource and places them in a
    queue (`self.docq`).

    Args:
        limit (int): maximum number of entries to pull.
        initial (bool): whether this is an initial run. An initial run does
            not halt when a previously seen document appears.
        **kwargs (dict): additional keyword args, which are only consumed.

    Returns:
        BasePlugin: self.
    """
    limit = self.defaults.limit.also(limit)
    initial = self.defaults.initial.also(initial)

    has_unseen_docs = True
    doc_count = 0
    for page in self.entry_resource:
        # insert the entries into documents, if they aren't already tracked
        cur_docs = self.find_entries(page, **kwargs)
        # log pages that contain no documents at all
        if len(cur_docs) == 0:
            logger.info(f"No documents found on page {page}!")

        for doc in cur_docs:
            doc = utility.SDA(doc)
            doc_url = doc["metadata.url"]
            # skip entries where no url is given
            if not doc_url:
                logger.debug("Document contains no url. SKIP.")
                continue

            exists = self.elastic.exist_document(source_url=doc_url)
            # handle existing files: if they have a date field which lies
            # in the past, break the loop.
            if exists:
                logger.debug(f"Document for url '{doc_url}' already exists. "
                             "SKIP.")
                doc_date = doc["metadata.date"]
                today = utility.from_date()
                if (not initial) and doc_date and doc_date < today:
                    logger.debug("Document's date lies in the past. "
                                 "Stop search.")
                    has_unseen_docs = False
                    break
                continue

            logger.info(f"Found document {doc_url}.")
            doc["metadata.source"] = self.source_name
            # enter documents into the processing queue.
            self.docq.put((doc_count, doc.a_dict))
            doc_count += 1

            # break when the number of retrieved documents reaches the limit
            if limit and doc_count >= limit:
                has_unseen_docs = False
                break

        # check whether there are still unseen documents, else do not
        # continue searching
        if has_unseen_docs is False:
            break
    return self
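# get_documents only fills self.docq; a consumer still has to drain it. A
# hedged sketch of that hand-off, assuming docq is a standard queue.Queue of
# (position, document) tuples as the put() call above suggests:
import queue

def drain_docq(docq):
    """Pull every queued (position, document) tuple without blocking."""
    documents = []
    while True:
        try:
            position, document = docq.get_nowait()
        except queue.Empty:
            break
        documents.append(document)
        docq.task_done()
    return documents

# plugin.get_documents(limit=50)
# docs = drain_docq(plugin.docq)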