Example no. 1
0
 def _filter_row(self, row):
     """Validate, transform and sync one source row into Elasticsearch.

     Returns the sync-response dict produced by ``_update_doc``/``_add_doc``,
     or ``None`` when the row fails pre-validation and is skipped.
     Re-raises any unexpected exception after logging the offending row.
     """
     if not self._pre_validate_row(row):
         self._warn_once("rows are skipped because they failed pre validation")
         return None
     num_total_docs = self._stats["updated docs in ES"] + self._stats["added docs to ES"]
     # Progress log every 100 processed docs (also fires on the very first doc).
     if num_total_docs % 100 == 0:
         logging.info("updated {} docs, inserted {} docs. Last doc updated:".format(self._stats["updated docs in ES"],
                                                                                    self._stats["added docs to ES"]))
         logging.info("{source}:{collection},{id}@{version}".format(
             source=row.get("source"), collection=row.get("collection"),
             version=row.get("version"), id=row.get("id")))
     # Keep the caller's dict pristine for error reporting and work on a deep
     # copy, since ``source_doc`` is popped off the working row below.
     original_row = row
     row = deepcopy(row)
     try:
         source_doc = row.pop("source_doc")
         new_doc = self._initialize_new_doc(row, source_doc)
         self._populate_language_fields(new_doc, row)
         self._populate_related_documents(new_doc, row)
         self._add_title_related_fields(new_doc)
         self._validate_collection(new_doc)
         self._validate_slugs(new_doc)
         # Raise the log level to ERROR while probing ES for an existing doc;
         # a missing doc is expected and signalled via NotFoundError.
         with temp_loglevel(logging.ERROR):
             try:
                 old_doc = self._es.get(index=self._idx, id="{}_{}".format(
                     new_doc["source"], new_doc["source_id"]))["_source"]
             except NotFoundError:
                 old_doc = None
         if old_doc:
             return self._update_doc(new_doc, old_doc)
         else:
             return self._add_doc(new_doc)
     except Exception:
         logging.exception("unexpected exception, row={}".format(original_row))
         raise
Example no. 2
0
 def _ensure_slug_uniqueness(self, slug, doc):
     """Return *slug*, or a ``slug-<source_id>`` variant if another doc already owns it.

     Searches ES for documents carrying *slug*; if any hit belongs to a
     different document id, recurses with the source_id appended until the
     slug is unique.
     """
     own_id = "{}_{}".format(doc["source"], doc["source_id"])
     query = {"query": {"constant_score": {"filter": {"term": {"slugs": slug}}}}}
     # Raise the log level to ERROR for the duration of the ES search call.
     with temp_loglevel(logging.ERROR):
         response = self._es.search(index=self._idx,
                                    doc_type=constants.PIPELINES_ES_DOC_TYPE,
                                    body=query, ignore_unavailable=True)
     taken_by_other = any(hit["_id"] != own_id for hit in response["hits"]["hits"])
     if taken_by_other:
         return self._ensure_slug_uniqueness("{}-{}".format(slug, doc["source_id"]), doc)
     return slug
Example no. 3
0
 def _delete(self, id):
     """Delete document *id* from ES when it is known to exist.

     The deleted-docs counter is always incremented, whether or not the id
     was actually present in the index — both branches bump the same stat,
     so the increment is hoisted out of the conditional.
     NOTE(review): ``id`` shadows the builtin; kept for interface compatibility.
     """
     if id in self._all_es_ids:
         # Raise the log level to ERROR for the duration of the delete call.
         with temp_loglevel(logging.ERROR):
             self._es.delete(index=self._idx,
                             doc_type=constants.PIPELINES_ES_DOC_TYPE,
                             id=id)
     self._stats[self.STATS_DELETED] += 1
Example no. 4
0
 def _update_doc(self, new_doc, old_doc):
     """Re-index *new_doc* over the existing ES document and record the update.

     Warns (once) when the incoming version equals the stored version, merges
     slugs from the old document, then writes the new document under the
     same ``<source>_<source_id>`` id.
     """
     if new_doc["version"] == old_doc["version"]:
         self._warn_once("rows are updated even though version is the same")
     self._update_doc_slugs(new_doc, old_doc)
     es_id = "{}_{}".format(new_doc["source"], new_doc["source_id"])
     # Raise the log level to ERROR for the duration of the index call.
     with temp_loglevel(logging.ERROR):
         self._es.index(index=self._idx,
                        doc_type=constants.PIPELINES_ES_DOC_TYPE,
                        id=es_id, body=new_doc)
     self._stats["updated docs in ES"] += 1
     return self._get_sync_response(new_doc, "updated doc in ES")
Example no. 5
0
 def _get_all_es_ids(self):
     """Return a list of the ``_id`` of every document in the target index."""
     # TODO: optimize, this is really inefficient (but, done only once per pipeline run)
     # Raise the log level to ERROR while scanning the whole index.
     with temp_loglevel(logging.ERROR):
         scan_iter = elasticsearch.helpers.scan(self._es,
                                                index=self._idx,
                                                doc_type=constants.PIPELINES_ES_DOC_TYPE,
                                                scroll=u"3h")
         all_ids = [hit["_id"] for hit in scan_iter]
     return all_ids
Example no. 6
0
 def _add_doc(self, new_doc):
     """Index *new_doc* into ES under a ``<source>_<source_id>`` id and record the add."""
     doc_id = "{}_{}".format(new_doc["source"], new_doc["source_id"])
     # Raise the log level to ERROR for the duration of the index call.
     with temp_loglevel(logging.ERROR):
         self._es.index(index=self._idx, doc_type=constants.PIPELINES_ES_DOC_TYPE,
                        id=doc_id, body=new_doc)
     self._stats["added docs to ES"] += 1
     return self._get_sync_response(new_doc, "added to ES")