def bulk_builder(self, changes):
    """
    http://www.elasticsearch.org/guide/reference/api/bulk.html

    The bulk loader expects alternating action and source lines:
    { "index" : { "_index" : "test", "_type" : "type1", "_id" : "1" } }\n
    { "field1" : "value1" }\n
    """
    for change in changes:
        try:
            with lock_manager(self.change_trigger(change)) as t:
                if t is not None:
                    tr = self.change_transform(t)
                    if tr is not None:
                        self.change_transport(tr)
                        # yield the action line first, then the document source
                        yield {
                            "index": {
                                "_index": self.es_index,
                                "_type": self.es_type,
                                "_id": tr['_id'],
                            }
                        }
                        yield tr
        except Exception as ex:
            pillow_logging.error(
                "Error on change: %s, %s" % (change['id'], ex)
            )
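# For illustration: the generator above yields alternating action/source
# dicts, and the _bulk endpoint wants them as newline-delimited JSON with a
# trailing newline. A minimal serialization sketch; the helper name is ours,
# not from the source:
import json

def serialize_bulk(ops):
    return "".join(json.dumps(op) + "\n" for op in ops)

# e.g. body = serialize_bulk(pillow.bulk_builder(changes))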
def process_bulk_docs(self, docs):
    if len(docs) == 0:
        return True

    pillow_logging.info("Processing batch of %s docs", len(docs))

    changes = [self._doc_to_change(doc) for doc in docs]
    error_collector = ErrorCollector()

    bulk_changes = build_bulk_payload(self.index_info, changes, self.doc_transform, error_collector)

    for change, exception in error_collector.errors:
        pillow_logging.error("Error processing doc %s: %s", change.id, exception)

    max_payload_size = pow(10, 8)  # ~100 MB
    payloads = prepare_bulk_payloads(bulk_changes, max_payload_size)
    if len(payloads) > 1:
        pillow_logging.info("Payload split into %s parts" % len(payloads))

    for payload in payloads:
        success = self._send_payload_with_retries(payload)
        if not success:
            # stop the reindexer if we're unable to send a payload to ES
            return False

    return True
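# prepare_bulk_payloads is not defined in this section. A minimal sketch of
# the size-based splitting it implies, assuming each element of bulk_changes
# is one JSON-serializable bulk operation (the real helper may differ, e.g.
# by keeping action/source line pairs together). The ~100 MB cap matches
# Elasticsearch's default http.max_content_length.
import json

def prepare_bulk_payloads_sketch(bulk_changes, max_payload_size):
    payloads, current, size = [], [], 0
    for op in bulk_changes:
        line = json.dumps(op) + "\n"
        nbytes = len(line.encode("utf-8"))
        if current and size + nbytes > max_payload_size:
            # current payload is full; start a new one
            payloads.append("".join(current))
            current, size = [], 0
        current.append(line)
        size += nbytes
    if current:
        payloads.append("".join(current))
    return payloads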
def process_bulk_docs(self, docs, progress_logger):
    if len(docs) == 0:
        return True

    pillow_logging.info("Processing batch of %s docs", len(docs))

    changes = [
        self._doc_to_change(doc) for doc in docs
        if self.process_deletes or not is_deletion(doc.get('doc_type'))
    ]
    error_collector = ErrorCollector()
    bulk_changes = build_bulk_payload(self.index_info, changes, self.doc_transform, error_collector)

    for change, exception in error_collector.errors:
        pillow_logging.error("Error processing doc %s: %s (%s)", change.id, type(exception), exception)

    es_interface = ElasticsearchInterface(self.es)
    try:
        es_interface.bulk_ops(bulk_changes)
    except (ESBulkIndexError, ES2BulkIndexError, ES7BulkIndexError) as e:
        pillow_logging.error("Bulk index errors\n%s", e.errors)
    except Exception:
        pillow_logging.exception("\tException sending payload to ES")
        return False

    return True
def send_to_elasticsearch(path, es_getter, name, data=None, retries=MAX_RETRIES,
                          except_on_failure=False, update=False, delete=False):
    """
    More fault tolerant es.put method
    """
    data = data if data is not None else {}
    current_tries = 0
    while current_tries < retries:
        try:
            if delete:
                res = es_getter().delete(path=path)
            elif update:
                params = {'retry_on_conflict': 2}
                res = es_getter().post("%s/_update" % path, data={"doc": data}, params=params)
            else:
                res = es_getter().put(path, data=data)
            break
        except ConnectionError as ex:
            current_tries += 1
            pillow_logging.error("[%s] put_robust error %s attempt %d/%d" % (
                name, ex, current_tries, retries))
            time.sleep(math.pow(RETRY_INTERVAL, current_tries))

    if current_tries == retries:
        message = "[%s] Max retry error on %s" % (name, path)
        if except_on_failure:
            raise PillowtopIndexingError(message)
        else:
            pillow_logging.error(message)
        res = {}
def change_transport(self, doc_dict):
    """
    Save the document to ElasticSearch
    """
    try:
        if not self.bulk:
            doc_exists_val = doc_exists(self, doc_dict)

            if self.allow_updates:
                can_put = True
            else:
                can_put = not doc_exists_val

            if can_put and not self.bulk:
                self.send_robust(doc_dict, update=doc_exists_val)
    except Exception as ex:
        tb = traceback.format_exc()
        pillow_logging.error(
            "PillowTop [%(pillow_name)s]: Aliased Elastic Pillow transport change data doc_id: %(doc_id)s to elasticsearch error: %(error)s\ntraceback: %(tb)s\n" % {
                "pillow_name": self.get_name(),
                "doc_id": doc_dict['_id'],
                "error": ex,
                "tb": tb,
            }
        )
        return None
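# Note: the if/else on allow_updates above reduces to a single expression,
# e.g. `can_put = self.allow_updates or not doc_exists_val`.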
def change_transport(self, doc_dict):
    """
    Override the elastic transport to go to the index + the type being
    a string between the domain and case type
    """
    try:
        if not self.bulk:
            doc_path = self.get_doc_path_typed(doc_dict)

            doc_exists = self.doc_exists(doc_dict)

            if self.allow_updates:
                can_put = True
            else:
                can_put = not doc_exists

            if can_put and not self.bulk:
                res = self.send_robust(doc_path, data=doc_dict, update=doc_exists)
                return res
    except Exception as ex:
        tb = traceback.format_exc()
        pillow_logging.error(
            "PillowTop [%(pillow_name)s]: Aliased Elastic Pillow transport change data doc_id: %(doc_id)s to elasticsearch error: %(error)s\ntraceback: %(tb)s\n" % {
                "pillow_name": self.get_name(),
                "doc_id": doc_dict['_id'],
                "error": ex,
                "tb": tb,
            }
        )
        return None
def process_bulk_docs(self, docs):
    if len(docs) == 0:
        return True

    pillow_logging.info("Processing batch of %s docs", len(docs))

    changes = [self._doc_to_change(doc) for doc in docs]
    error_collector = ErrorCollector()

    bulk_changes = build_bulk_payload(self.index_info, changes, self.doc_transform, error_collector)

    for change, exception in error_collector.errors:
        pillow_logging.error("Error processing doc %s: %s", change.id, exception)

    payloads = prepare_bulk_payloads(bulk_changes, MAX_PAYLOAD_SIZE)
    if len(payloads) > 1:
        pillow_logging.info("Payload split into %s parts" % len(payloads))

    for payload in payloads:
        success = self._send_payload_with_retries(payload)
        if not success:
            # stop the reindexer if we're unable to send a payload to ES
            return False

    return True
def process_bulk_docs(self, docs, progress_logger):
    if not docs:
        return True

    pillow_logging.info("Processing batch of %s docs", len(docs))
    changes = []
    for doc in docs:
        change = self._doc_to_change(doc)  # de-dupe the is_deletion check
        if self.process_deletes or not change.deleted:
            changes.append(change)
    error_collector = ErrorCollector()
    bulk_changes = build_bulk_payload(changes, self.doc_transform, error_collector)

    for change, exception in error_collector.errors:
        pillow_logging.error("Error processing doc %s: %s (%s)", change.id, type(exception), exception)

    es_interface = ElasticsearchInterface(self.es)
    try:
        es_interface.bulk_ops(self.index_info.alias, self.index_info.type, bulk_changes)
    except BulkIndexError as e:
        pillow_logging.error("Bulk index errors\n%s", e.errors)
    except Exception as exc:
        pillow_logging.exception(
            "Error sending bulk payload to Elasticsearch: %s", exc)
        return False

    return True
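# ErrorCollector is referenced throughout but not defined in this section.
# A minimal sketch consistent with the `.errors` iteration above; the
# collection method name is an assumption, not taken from the source:
class ErrorCollector(object):

    def __init__(self):
        self.errors = []

    def add_error(self, change, exception):  # method name assumed
        self.errors.append((change, exception))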
def change_trigger(self, changes_dict):
    id = changes_dict['id']
    if changes_dict.get('deleted', False):
        try:
            if doc_exists(self, id):
                self.get_es_new().delete(self.es_index, self.es_type, id)
        except Exception as ex:
            pillow_logging.error(
                "ElasticPillow: error deleting route %s - ignoring: %s" % (
                    self.get_doc_path(changes_dict['id']),
                    ex,
                )
            )
        return None
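# For illustration, a deletion entry as it would arrive from a CouchDB-style
# changes feed; field names are inferred from the dict accesses above:
deletion_change = {"id": "case-abc123", "deleted": True}
# pillow.change_trigger(deletion_change)  # removes the doc from ES if present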
def send_to_elasticsearch(index, doc_type, doc_id, es_getter, name, data=None,
                          retries=MAX_RETRIES, except_on_failure=False, update=False,
                          delete=False):
    """
    More fault tolerant es.put method
    """
    data = data if data is not None else {}
    current_tries = 0
    while current_tries < retries:
        try:
            if delete:
                es_getter().delete(index, doc_type, doc_id)
            elif update:
                params = {'retry_on_conflict': 2}
                es_getter().index(index, doc_type, body=data, id=doc_id, params=params)
            else:
                es_getter().create(index, doc_type, body=data, id=doc_id)
            break
        except ConnectionError as ex:
            current_tries += 1
            pillow_logging.error("[%s] put_robust error %s attempt %d/%d" % (
                name, ex, current_tries, retries))

            if current_tries == retries:
                message = "[%s] Max retry error on %s/%s/%s" % (
                    name, index, doc_type, doc_id)
                if except_on_failure:
                    raise PillowtopIndexingError(message)
                else:
                    pillow_logging.error(message)

            time.sleep(math.pow(RETRY_INTERVAL, current_tries))
        except RequestError as ex:
            error_message = "Pillowtop put_robust error [%s]:\n%s\n\tpath: %s/%s/%s\n\t%s" % (
                name,
                ex.error or "No error message",
                index, doc_type, doc_id,
                data.keys())
            if except_on_failure:
                raise PillowtopIndexingError(error_message)
            else:
                pillow_logging.error(error_message)
            break
        except ConflictError:
            break  # ignore the error if a doc already exists when trying to create it in the index
        except NotFoundError:
            break
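# Hypothetical call site for the function above; the index/type names,
# case_doc, and get_es_new are illustrative stand-ins, not from the source:
send_to_elasticsearch(
    index="hqcases_2016-03-04",
    doc_type="case",
    doc_id=case_doc["_id"],
    es_getter=get_es_new,
    name="CasePillow",
    data=case_doc,
    update=True,  # overwrites via index() rather than failing on create()
)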
def process_bulk_docs(self, docs):
    if len(docs) == 0:
        return True

    pillow_logging.info("Processing batch of %s docs", len(docs))

    changes = [self._doc_to_change(doc) for doc in docs]
    error_collector = ErrorCollector()

    bulk_changes = build_bulk_payload(self.index_info, changes, self.doc_transform, error_collector)

    for change, exception in error_collector.errors:
        pillow_logging.error("Error processing doc %s: %s (%s)", change.id, type(exception), exception)

    es_interface = ElasticsearchInterface(self.es)
    try:
        es_interface.bulk_ops(bulk_changes)
    except Exception:
        pillow_logging.exception("\tException sending payload to ES")
        return False

    return True
def send_to_elasticsearch(index, doc_type, doc_id, es_getter, name, data=None,
                          retries=MAX_RETRIES, propagate_failure=settings.UNIT_TESTING,
                          update=False, delete=False, es_merge_update=False):
    """
    More fault tolerant es.put method

    kwargs:
        es_merge_update: Set to True to use Elasticsearch.update instead of
            Elasticsearch.index, which merges the existing ES doc with the
            current update. If False, the doc is replaced.
    """
    data = data if data is not None else {}
    current_tries = 0
    es_interface = ElasticsearchInterface(es_getter())
    retries = 1 if settings.UNIT_TESTING else retries
    while current_tries < retries:
        try:
            if delete:
                es_interface.delete_doc(index, doc_type, doc_id)
            elif update:
                params = {'retry_on_conflict': 2}
                if es_merge_update:
                    es_interface.update_doc_fields(index, doc_type, doc_id,
                                                   fields=data, params=params)
                else:
                    es_interface.update_doc(index, doc_type, doc_id, doc=data,
                                            params=params)
            else:
                es_interface.create_doc(index, doc_type, doc_id, doc=data)
            break
        except ConnectionError as ex:
            current_tries += 1
            pillow_logging.error("[{}] put_robust error {} attempt {}/{}".format(
                name, ex, current_tries, retries))

            if current_tries == retries:
                message = "[{}] Max retry error on {}/{}/{}:\n\n{}".format(
                    name, index, doc_type, doc_id, traceback.format_exc())
                if propagate_failure:
                    raise PillowtopIndexingError(message)
                else:
                    pillow_logging.error(message)

            time.sleep(math.pow(RETRY_INTERVAL, current_tries))
        except RequestError:
            error_message = (
                "Pillowtop put_robust error [{}]:\n\n{}\n\tpath: {}/{}/{}\n\t{}".format(
                    name, traceback.format_exc(), index, doc_type, doc_id, list(data))
            )
            if propagate_failure:
                raise PillowtopIndexingError(error_message)
            else:
                pillow_logging.error(error_message)
            break
        except ConflictError:
            break  # ignore the error if a doc already exists when trying to create it in the index
        except NotFoundError:
            break
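# The sleep between retries grows exponentially: math.pow(RETRY_INTERVAL,
# current_tries). With an assumed RETRY_INTERVAL of 2 seconds (the real
# constant is defined elsewhere), the waits are:
#     attempt 1 -> 2s, attempt 2 -> 4s, attempt 3 -> 8s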
def send_to_elasticsearch(index, doc_type, doc_id, es_getter, name, data=None,
                          retries=MAX_RETRIES, except_on_failure=False, update=False,
                          delete=False):
    """
    More fault tolerant es.put method
    """
    data = data if data is not None else {}
    current_tries = 0
    while current_tries < retries:
        try:
            if delete:
                es_getter().delete(index, doc_type, doc_id)
            elif update:
                params = {'retry_on_conflict': 2}
                es_getter().update(index, doc_type, doc_id, body={"doc": data}, params=params)
            else:
                es_getter().create(index, doc_type, body=data, id=doc_id)
            break
        except ConnectionError as ex:
            current_tries += 1
            pillow_logging.error("[%s] put_robust error %s attempt %d/%d" % (
                name, ex, current_tries, retries))

            if current_tries == retries:
                message = "[%s] Max retry error on %s/%s/%s" % (name, index, doc_type, doc_id)
                if except_on_failure:
                    raise PillowtopIndexingError(message)
                else:
                    pillow_logging.error(message)

            time.sleep(math.pow(RETRY_INTERVAL, current_tries))
        except RequestError as ex:
            error_message = "Pillowtop put_robust error [%s]:\n%s\n\tpath: %s/%s/%s\n\t%s" % (
                name,
                ex.error or "No error message",
                index, doc_type, doc_id,
                data.keys())
            if except_on_failure:
                raise PillowtopIndexingError(error_message)
            else:
                pillow_logging.error(error_message)
            break
        if except_on_failure:
            raise PillowtopIndexingError(message)
        else:
            pillow_logging.error(message)
        res = {}

    if res.get('status', 0) == 400:
        error_message = "Pillowtop put_robust error [%s]:\n%s\n\tpath: %s\n\t%s" % (
            name,
            res.get('error', "No error message"),
            path,
            data.keys())
        if except_on_failure:
            raise PillowtopIndexingError(error_message)
        else:
            pillow_logging.error(error_message)
    return res


class AliasedElasticPillow(BasicPillow):
    """
    A pillow that is alias-able: when you query it, you go through an alias
    rather than hitting the index directly. This can serve varying purposes -
    splitting an index out by certain metadata for performance/separation
    reasons, or, for our initial use case, versioning/updating the index
    mappings on the fly with minimal disruption.
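# A sketch of the zero-downtime mapping update the docstring describes,
# using the elasticsearch-py client; index and alias names are hypothetical:
from elasticsearch import Elasticsearch

client = Elasticsearch()
client.indices.update_aliases(body={
    "actions": [
        # atomically repoint readers from the old index to the new one
        {"remove": {"index": "hqcases_2015-01-01", "alias": "hqcases"}},
        {"add": {"index": "hqcases_2016-03-04", "alias": "hqcases"}},
    ]
})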