def ensure_matched_revisions(change, fetched_document):
    """
    Ensures that the document fetched from a change matches the revision
    at which it was pushed to Kafka.

    See http://manage.dimagi.com/default.asp?237983 for more details.

    :raises: DocumentMismatchError - Raised when the revisions of the fetched
        document and the change metadata do not match
    """
    change_has_rev = change.metadata and change.metadata.document_rev is not None
    doc_has_rev = fetched_document and '_rev' in fetched_document
    if doc_has_rev and change_has_rev:
        doc_rev = fetched_document['_rev']
        change_rev = change.metadata.document_rev
        if doc_rev != change_rev:
            fetched_rev = _convert_rev_to_int(doc_rev)
            stored_rev = _convert_rev_to_int(change_rev)
            if fetched_rev < stored_rev or stored_rev == -1:
                message = "Mismatched revs for {}: Cloudant rev {} vs. Changes feed rev {}".format(
                    change.id, doc_rev, change_rev)
                pillow_logging.warning(message)
                raise DocumentMismatchError(message)

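A hedged sketch of the _convert_rev_to_int helper referenced above, which is not shown in this excerpt. Assuming Couch/Cloudant-style revision strings of the form "<generation>-<hash>" (e.g. "3-a1b2c3"), the helper would extract the integer generation prefix and return -1 when the revision cannot be parsed, which is why stored_rev == -1 is treated as a mismatch:

def _convert_rev_to_int(rev):
    # assumed behavior: "3-a1b2c3" -> 3; the integer prefix is the
    # revision's generation number, -1 signals an unparseable rev
    try:
        return int(rev.split('-')[0])
    except (AttributeError, ValueError):
        return -1
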
def process_change(self, pillow_instance, change):
    if change.deleted and change.id:
        self._delete_doc_if_exists(change.id)
        return

    doc = change.get_document()
    if doc is None:
        pillow_logging.warning(
            "Unable to get document from change: {}".format(change))
        return

    if self.doc_filter_fn and self.doc_filter_fn(doc):
        return

    # prepare doc for es
    doc_ready_to_save = self.doc_transform_fn(doc)
    # send it across
    send_to_elasticsearch(
        index=self.index_info.index,
        doc_type=self.index_info.type,
        doc_id=change.id,
        es_getter=self.es_getter,
        name=pillow_instance.get_name(),
        data=doc_ready_to_save,
        update=self._doc_exists(change.id),
    )

def _send_payload_with_retries(self, payload):
    pillow_logging.info("Sending payload to ES")

    retries = 0
    bulk_start = datetime.utcnow()
    success = False
    while retries < MAX_TRIES:
        if retries:
            retry_time = (datetime.utcnow() - bulk_start).seconds + retries * RETRY_TIME_DELAY_FACTOR
            pillow_logging.warning("\tRetrying in %s seconds" % retry_time)
            time.sleep(retry_time)
            pillow_logging.warning("\tRetrying now ...")
            # reset timestamp when looping again
            bulk_start = datetime.utcnow()
        try:
            self.es.bulk(payload)
            success = True
            break
        except Exception:
            retries += 1
            pillow_logging.exception("\tException sending payload to ES")

    return success

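For illustration, the backoff above is linear: each retry sleeps for the elapsed time of the failed attempt plus retries * RETRY_TIME_DELAY_FACTOR, with bulk_start reset so each delay is measured from the most recent attempt. The constants below are assumed values, not taken from the source:

MAX_TRIES = 3  # assumed
RETRY_TIME_DELAY_FACTOR = 5  # assumed
elapsed = 2  # suppose each failed bulk attempt takes ~2 seconds

for retries in range(1, MAX_TRIES):
    print("retry {} sleeps {} seconds".format(
        retries, elapsed + retries * RETRY_TIME_DELAY_FACTOR))
# retry 1 sleeps 7 seconds
# retry 2 sleeps 12 seconds
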
def _filter_invalid_config(configs):
    """Return a list of configs that have been validated"""
    valid_configs = []
    for config in configs:
        try:
            config.validate()
            valid_configs.append(config)
        except Exception:
            pillow_logging.warning("Invalid config found during bootstrap: %s", config._id)
    return valid_configs

def ensure_document_exists(change):
    """
    Ensures that the document recorded in Kafka exists and is properly returned

    :raises: DocumentNotFoundError - Raised when the document is not found
    """
    doc = change.get_document()
    if doc is None:
        pillow_logging.warning("Unable to get document from change: {}".format(change))
        raise DocumentNotFoundError()  # force a retry

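DocumentNotFoundError is not defined in this excerpt. Judging from the "force a retry" comment, it is presumably a plain exception type that the pillow's error handler treats as retryable; a minimal sketch under that assumption:

class DocumentNotFoundError(Exception):
    """Raised when a change's document cannot be fetched, so the change is retried."""
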
def _do_bootstrap(self, configs=None):
    configs = self.get_filtered_configs(configs)
    if not configs:
        pillow_logging.warning("UCR pillow has no configs to process")

    self.table_adapters_by_domain = defaultdict(list)
    for config in configs:
        self.table_adapters_by_domain[config.domain].append(
            _get_indicator_adapter_for_pillow(config)
        )

def timed(*args, **kw):
    ts = datetime.now()
    result = method(*args, **kw)
    te = datetime.now()
    seconds = (te - ts).total_seconds()
    if seconds > LONG_UCR_LOGGING_THRESHOLD:
        table = args[2]
        doc = args[3]
        log_message = "UCR data source {} on doc_id {} took {} seconds to process".format(
            table.config._id, doc['_id'], seconds)
        pillow_logging.warning(log_message)
    return result

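timed reads as the inner wrapper of a timing decorator: method is a free variable closed over by the enclosing decorator, which is not shown in this excerpt. A minimal sketch of that enclosing shape; the decorator name, threshold value, and logger setup are assumptions for illustration:

import functools
import logging
from datetime import datetime

pillow_logging = logging.getLogger("pillowtop")  # assumed logger name
LONG_UCR_LOGGING_THRESHOLD = 0.5  # assumed value, in seconds

def time_ucr_process_change(method):  # hypothetical decorator name
    @functools.wraps(method)
    def timed(*args, **kw):
        ts = datetime.now()
        result = method(*args, **kw)
        seconds = (datetime.now() - ts).total_seconds()
        if seconds > LONG_UCR_LOGGING_THRESHOLD:
            pillow_logging.warning(
                "%s took %s seconds to process", method.__name__, seconds)
        return result
    return timed
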
def timed(*args, **kw):
    ts = datetime.now()
    result = method(*args, **kw)
    te = datetime.now()
    seconds = (te - ts).total_seconds()
    if seconds > 0.1:
        table = args[1]
        doc = args[2]
        message = u"UCR data source {} on doc_id {} took {} seconds to process".format(
            table.config._id, doc['_id'], seconds
        )
        pillow_logging.warning(message)
    return result

def bootstrap(self, configs=None):
    configs = self.get_filtered_configs(configs)
    if not configs:
        pillow_logging.warning("UCR pillow has no configs to process")

    self.table_adapters_by_domain = defaultdict(list)
    for config in configs:
        self.table_adapters_by_domain[config.domain].append(
            get_indicator_adapter(config, raise_errors=True)
        )
    self.rebuild_tables_if_necessary()
    self.bootstrapped = True
    self.last_bootstrapped = datetime.utcnow()

def build_bulk_payload(changes, doc_transform=None, error_collector=None):
    """Process a set of changes, returning a list of BulkActionItem objects.

    :param changes: iterable of changes to process in Elasticsearch
    :param doc_transform: optional function used to serialize documents into
        the correct format for indexing into Elasticsearch (documents are
        indexed as-is if not specified)
    :param error_collector: optional ``ErrorCollector`` instance used to store
        any document fetch or transform exceptions (exceptions are raised if
        not provided)
    :returns: ``list`` of BulkActionItem instances
    """
    # TODO: do not transform the docs to be indexed (DocumentAdapter will
    # perform this task in the future)
    from corehq.apps.change_feed.document_types import get_doc_meta_object_from_document
    if doc_transform is None:
        def doc_transform(doc):
            return doc

    payload = []

    def _is_deleted(change):
        doc = change.get_document()
        if doc and doc.get('doc_type'):
            return get_doc_meta_object_from_document(doc).is_deletion
        elif change.deleted:
            return bool(change.id)

    for change in changes:
        if _is_deleted(change):
            payload.append(BulkActionItem.delete_id(change.id))
        elif not change.deleted:
            try:
                doc = change.get_document()
                doc = doc_transform(doc)
                payload.append(BulkActionItem.index(doc))
            except Exception as e:
                if not error_collector:
                    raise
                error_collector.add_error(ChangeError(change, e))
        else:
            # See PR discussion: https://github.com/dimagi/commcare-hq/pull/31243
            #
            # Discarding changes when the deletion status is ambiguous feels
            # like guessing to me, which goes against the zen. Log a warning
            # for trackability.
            pillow_logging.warning("discarding ambiguous bulk change: %s", change)

    return payload

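A hedged usage sketch for build_bulk_payload; the collector and transform names here are placeholders, not confirmed by the source. Passing an error collector records fetch/transform failures instead of raising them, so one bad document does not abort the whole batch:

errors = ErrorCollector()  # hypothetical collector exposing .add_error(ChangeError)
actions = build_bulk_payload(
    changes,
    doc_transform=my_transform,  # hypothetical serializer for the target index
    error_collector=errors,
)
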
def timed(*args, **kw):
    ts = datetime.now()
    result = method(*args, **kw)
    te = datetime.now()
    seconds = (te - ts).total_seconds()
    if seconds > LONG_UCR_LOGGING_THRESHOLD:
        table = args[1]
        doc = args[2]
        log_message = u"UCR data source {} on doc_id {} took {} seconds to process".format(
            table.config._id, doc['_id'], seconds)
        pillow_logging.warning(log_message)
        if seconds > LONG_UCR_SOFT_ASSERT_THRESHOLD:
            email_message = u"UCR data source {} is taking too long to process".format(
                table.config._id)
            _slow_ucr_assert(False, email_message)
    return result

def process_change(self, pillow_instance, change, do_set_checkpoint):
    # todo: if deletion - delete
    # prepare doc for es
    doc = change.get_document()
    if doc is None:
        pillow_logging.warning("Unable to get document from change: {}".format(change))
        return

    doc_ready_to_save = self.doc_transform_fn(doc)
    # send it across
    send_to_elasticsearch(
        index=self.index_meta.index,
        doc_type=self.index_meta.type,
        doc_id=change.id,
        es_getter=self.es_getter,
        name=pillow_instance.get_name(),
        data=doc_ready_to_save,
        update=self.elasticsearch.exists(self.index_meta.index, self.index_meta.type, change.id),
    )

def process_change(self, pillow_instance, change):
    if change.deleted and change.id:
        self._delete_doc_if_exists(change.id)
        return

    # prepare doc for es
    doc = change.get_document()
    if doc is None:
        pillow_logging.warning("Unable to get document from change: {}".format(change))
        return

    doc_ready_to_save = self.doc_transform_fn(doc)
    # send it across
    send_to_elasticsearch(
        index=self.index_info.index,
        doc_type=self.index_info.type,
        doc_id=change.id,
        es_getter=self.es_getter,
        name=pillow_instance.get_name(),
        data=doc_ready_to_save,
        update=self._doc_exists(change.id),
    )
