Exemple #1
0
    def import_json(self, in_lines, website_id: int):
        """Parse JSON lines into documents and index them in batches.

        Each line is decoded with ``ujson``. The decoded document's ``name``
        is split into a base name and an extension: ``name`` is replaced by
        the base name and ``ext`` receives the lower-cased extension without
        its leading dot (empty string when there is none). ``website_id`` is
        stored on every document.

        Lines that fail to decode or lack a ``name`` key are logged and
        skipped; they do not abort the import.

        :param in_lines: iterable of JSON-encoded lines
        :param website_id: id written to each document's ``website_id`` field
        """
        import_every = 10000  # flush to the index once this many docs are buffered
        cooldown_time = 0  # seconds to pause after each flushed batch

        docs = []

        for line in in_lines:
            try:
                doc = ujson.loads(line)
                name, ext = os.path.splitext(doc["name"])
                # splitext keeps the leading dot; slicing it off yields ""
                # for both "" and ".", so no extra length check is needed.
                doc["ext"] = ext[1:].lower()
                doc["name"] = name
                doc["website_id"] = website_id
                docs.append(doc)
            except Exception as e:
                # Lazy %-formatting: a non-str line (e.g. bytes) must not
                # raise a TypeError from inside the error handler.
                logger.error("Error in import_json: %s for line : \n%s", e, line)

            if len(docs) >= import_every:
                self._index(docs)
                docs.clear()
                time.sleep(cooldown_time)

        # Flush whatever is left over from the last partial batch.
        if docs:
            self._index(docs)
Exemple #2
0
    def delete_docs(self, website_id):
        """Delete every indexed document whose ``website_id`` matches.

        Matching documents are streamed with ``helpers.scan`` and handed to
        ``self._delete`` in batches. If anything raises, the error is logged,
        the method sleeps for 10 seconds and the whole scan starts over; it
        only returns once a full pass completes without an exception.

        :param website_id: value matched against the ``website_id`` field and
            also used as the routing value
        """
        batch_size = 10000
        finished = False

        while not finished:
            try:
                logger.debug("Deleting docs of " + str(website_id))

                term_query = {
                    "query": {
                        "term": {
                            "website_id": website_id
                        }
                    }
                }
                hits = helpers.scan(query=term_query, scroll="1m", client=self.es, index=self.index_name,
                                    request_timeout=120, routing=website_id)

                pending = []
                for hit in hits:
                    pending.append(hit)

                    # Flush as soon as a full batch has been collected.
                    if len(pending) >= batch_size:
                        self._delete(pending, website_id)
                        pending.clear()

                # Remaining partial batch, if any.
                if pending:
                    self._delete(pending, website_id)

                finished = True

            except Exception as e:
                logger.error("During delete: " + str(e))
                time.sleep(10)

        logger.debug("Done deleting for " + str(website_id))
Exemple #3
0
    def _delete(self, docs, website_id):
        bulk_string = self.create_bulk_delete_string(docs)
        result = self.es.bulk(body=bulk_string, index=self.index_name, doc_type="file", request_timeout=30,
                              routing=website_id)

        if result["errors"]:
            logger.error("Error in ES bulk delete: \n" + result["errors"])
            raise IndexingError
Exemple #4
0
 def _index(self, docs):
     while True:
         try:
             logger.debug("Indexing " + str(len(docs)) + " docs")
             bulk_string = ElasticSearchEngine.create_bulk_index_string(docs)
             self.es.bulk(body=bulk_string, index=self.index_name, doc_type="file", request_timeout=30,
                          routing=docs[0]["website_id"])
             break
         except Exception as e:
             logger.error("Error in _index: " + str(e) + ", retrying")
             time.sleep(10)