Ejemplo n.º 1
0
def ensure_matched_revisions(change, fetched_document):
    """
    Verify that the document fetched for a change carries the same revision
    that was recorded on the change metadata when it was pushed to kafka.

    See http://manage.dimagi.com/default.asp?237983 for more details

    :raises: DocumentMismatchError - Raised when the revisions of the fetched document
        and the change metadata do not match
    """
    # Nothing to compare unless both sides actually carry a revision.
    if not (fetched_document and '_rev' in fetched_document):
        return
    if not (change.metadata and change.metadata.document_rev is not None):
        return

    doc_rev = fetched_document['_rev']
    change_rev = change.metadata.document_rev
    if doc_rev == change_rev:
        return

    # Compare the numeric rev prefixes; only complain when the fetched copy
    # is older than the change-feed copy (or the stored rev is unparseable).
    fetched_rev = _convert_rev_to_int(doc_rev)
    stored_rev = _convert_rev_to_int(change_rev)
    if stored_rev == -1 or fetched_rev < stored_rev:
        message = "Mismatched revs for {}: Cloudant rev {} vs. Changes feed rev {}".format(
            change.id, doc_rev, change_rev)
        pillow_logging.warning(message)
        raise DocumentMismatchError(message)
Ejemplo n.º 2
0
    def process_change(self, pillow_instance, change):
        """
        Route a single change to Elasticsearch: remove the indexed copy for
        deletions, otherwise filter, transform, and index the document.
        """
        # Deletions carry no payload; just drop any previously indexed doc.
        if change.deleted and change.id:
            self._delete_doc_if_exists(change.id)
            return

        doc = change.get_document()
        if doc is None:
            pillow_logging.warning(
                "Unable to get document from change: {}".format(change))
            return

        # An optional filter lets this pillow skip irrelevant documents.
        if self.doc_filter_fn and self.doc_filter_fn(doc):
            return

        # prepare doc for es
        doc_ready_to_save = self.doc_transform_fn(doc)
        # send it across
        send_to_elasticsearch(
            index=self.index_info.index,
            doc_type=self.index_info.type,
            doc_id=change.id,
            es_getter=self.es_getter,
            name=pillow_instance.get_name(),
            data=doc_ready_to_save,
            update=self._doc_exists(change.id),
        )
Ejemplo n.º 3
0
def ensure_matched_revisions(change, fetched_document):
    """
    Check that the revision on the fetched document agrees with the revision
    stamped on the change metadata when it was pushed to kafka.

    See http://manage.dimagi.com/default.asp?237983 for more details

    :raises: DocumentMismatchError - Raised when the revisions of the fetched document
        and the change metadata do not match
    """
    has_change_rev = bool(change.metadata) and change.metadata.document_rev is not None
    has_doc_rev = bool(fetched_document) and '_rev' in fetched_document
    if not (has_doc_rev and has_change_rev):
        # One side has no revision info, so there is nothing to verify.
        return

    doc_rev = fetched_document['_rev']
    change_rev = change.metadata.document_rev
    if doc_rev == change_rev:
        return

    fetched_rev = _convert_rev_to_int(doc_rev)
    stored_rev = _convert_rev_to_int(change_rev)
    # Only a stale fetched copy (or an unparseable stored rev) is an error.
    if fetched_rev < stored_rev or stored_rev == -1:
        message = "Mismatched revs for {}: Cloudant rev {} vs. Changes feed rev {}".format(
            change.id,
            doc_rev,
            change_rev
        )
        pillow_logging.warning(message)
        raise DocumentMismatchError(message)
Ejemplo n.º 4
0
    def _send_payload_with_retries(self, payload):
        """
        Submit a bulk payload to Elasticsearch, retrying with a growing
        delay after each failure.

        :param payload: the bulk request body passed to ``es.bulk``
        :return: True when the payload was accepted, False once all
            MAX_TRIES attempts have failed
        """
        pillow_logging.info("Sending payload to ES")

        bulk_start = datetime.utcnow()
        for attempt in range(MAX_TRIES):
            if attempt:
                # Back off: time spent so far plus a per-attempt delay.
                retry_time = (datetime.utcnow() - bulk_start
                              ).seconds + attempt * RETRY_TIME_DELAY_FACTOR
                pillow_logging.warning("\tRetrying in %s seconds" % retry_time)
                time.sleep(retry_time)
                pillow_logging.warning("\tRetrying now ...")
                # reset timestamp when looping again
                bulk_start = datetime.utcnow()

            try:
                self.es.bulk(payload)
                return True
            except Exception:
                pillow_logging.exception("\tException sending payload to ES")

        return False
Ejemplo n.º 5
0
def _filter_invalid_config(configs):
    """Return a list of configs that have been validated"""
    valid_configs = []
    for config in configs:
        try:
            config.validate()
            valid_configs.append(config)
        except Exception:
            pillow_logging.warning("Invalid config found during bootstrap: %s", config._id)
    return valid_configs
Ejemplo n.º 6
0
def ensure_document_exists(change):
    """
    Ensure the document referenced by a Kafka change can actually be fetched.

    :raises: DocumentNotFoundError - when the change yields no document, so
        that the framework retries the change later
    """
    if change.get_document() is None:
        pillow_logging.warning("Unable to get document from change: {}".format(change))
        raise DocumentNotFoundError()  # force a retry
Ejemplo n.º 7
0
    def _do_bootstrap(self, configs=None):
        """
        Build the per-domain mapping of indicator adapters from the given
        configs (all filtered configs when none are supplied).
        """
        configs = self.get_filtered_configs(configs)
        if not configs:
            pillow_logging.warning("UCR pillow has no configs to process")

        self.table_adapters_by_domain = defaultdict(list)
        for config in configs:
            adapter = _get_indicator_adapter_for_pillow(config)
            self.table_adapters_by_domain[config.domain].append(adapter)
Ejemplo n.º 8
0
 def timed(*args, **kw):
     """Wrapper that times ``method`` and warns about slow UCR processing."""
     started = datetime.now()
     result = method(*args, **kw)
     elapsed = (datetime.now() - started).total_seconds()
     if elapsed > LONG_UCR_LOGGING_THRESHOLD:
         # positional convention here: args[2] is the table, args[3] the doc
         table, doc = args[2], args[3]
         pillow_logging.warning(
             "UCR data source {} on doc_id {} took {} seconds to process".format(
                 table.config._id, doc['_id'], elapsed))
     return result
Ejemplo n.º 9
0
 def timed(*args, **kw):
     """Time ``method`` and log a warning when a UCR doc is slow to process."""
     start = datetime.now()
     result = method(*args, **kw)
     duration = (datetime.now() - start).total_seconds()
     if duration > LONG_UCR_LOGGING_THRESHOLD:
         table = args[2]
         doc = args[3]
         pillow_logging.warning(
             "UCR data source {} on doc_id {} took {} seconds to process".format(
                 table.config._id, doc['_id'], duration))
     return result
Ejemplo n.º 10
0
 def timed(*args, **kw):
     """Time ``method`` and warn when processing exceeds 0.1 seconds."""
     begin = datetime.now()
     result = method(*args, **kw)
     elapsed = (datetime.now() - begin).total_seconds()
     if elapsed > 0.1:
         # positional convention here: args[1] is the table, args[2] the doc
         table, doc = args[1], args[2]
         pillow_logging.warning(
             u"UCR data source {} on doc_id {} took {} seconds to process".format(
                 table.config._id, doc['_id'], elapsed))
     return result
Ejemplo n.º 11
0
    def bootstrap(self, configs=None):
        """
        (Re)build the per-domain indicator adapters, rebuild tables when
        necessary, and mark the pillow as bootstrapped.
        """
        configs = self.get_filtered_configs(configs)
        if not configs:
            pillow_logging.warning("UCR pillow has no configs to process")

        self.table_adapters_by_domain = defaultdict(list)
        for config in configs:
            adapter = get_indicator_adapter(config, raise_errors=True)
            self.table_adapters_by_domain[config.domain].append(adapter)

        self.rebuild_tables_if_necessary()
        self.bootstrapped = True
        self.last_bootstrapped = datetime.utcnow()
Ejemplo n.º 12
0
def build_bulk_payload(changes, doc_transform=None, error_collector=None):
    """Process a set of changes, returning a list of BulkActionItem objects.

    :param changes: iterable of changes to process in Elasticsearch
    :param doc_transform: optional function used to serialize documents into
                          the correct format for indexing into Elasticsearch
                          (documents are indexed as-is if not specified)
    :param error_collector: optional ``ErrorCollector`` instance used to
                            store any document fetch or transform exceptions
                            (exceptions raised if not provided)
    :returns: ``list`` of BulkActionItem instances
    """
    # TODO: do not transform the docs to be indexed (DocumentAdapter will
    #       perform this task in the future)
    from corehq.apps.change_feed.document_types import get_doc_meta_object_from_document

    def _is_deleted(change):
        # Prefer the document's own doc_type to decide deletion; fall back
        # to the change's deleted flag (trusted only when an id is present).
        doc = change.get_document()
        if doc and doc.get('doc_type'):
            return get_doc_meta_object_from_document(doc).is_deletion
        elif change.deleted:
            return bool(change.id)

    if doc_transform is None:
        transform = lambda doc: doc
    else:
        transform = doc_transform

    actions = []
    for change in changes:
        if _is_deleted(change):
            actions.append(BulkActionItem.delete_id(change.id))
        elif not change.deleted:
            try:
                actions.append(BulkActionItem.index(transform(change.get_document())))
            except Exception as exc:
                if not error_collector:
                    raise
                error_collector.add_error(ChangeError(change, exc))
        else:
            # See PR discussion: https://github.com/dimagi/commcare-hq/pull/31243
            #
            # Discarding changes when the deletion status is ambiguous feels
            # like guessing to me, which goes against the zen. Log a warning
            # for trackability.
            pillow_logging.warning("discarding ambiguous bulk change: %s", change)

    return actions
Ejemplo n.º 13
0
    def bootstrap(self, configs=None):
        """
        Load data source configs, build one indicator adapter per config
        grouped by domain, rebuild tables if needed, and record the
        bootstrap timestamp.
        """
        configs = self.get_filtered_configs(configs)
        if not configs:
            pillow_logging.warning("UCR pillow has no configs to process")

        self.table_adapters_by_domain = defaultdict(list)
        for config in configs:
            adapter = get_indicator_adapter(config, raise_errors=True)
            self.table_adapters_by_domain[config.domain].append(adapter)

        self.rebuild_tables_if_necessary()
        self.bootstrapped = True
        self.last_bootstrapped = datetime.utcnow()
Ejemplo n.º 14
0
 def timed(*args, **kw):
     """Time ``method``; warn on slow UCR processing and soft-assert (which
     sends an email) when it is very slow."""
     start = datetime.now()
     result = method(*args, **kw)
     elapsed = (datetime.now() - start).total_seconds()
     if elapsed > LONG_UCR_LOGGING_THRESHOLD:
         # positional convention here: args[1] is the table, args[2] the doc
         table, doc = args[1], args[2]
         pillow_logging.warning(
             u"UCR data source {} on doc_id {} took {} seconds to process".format(
                 table.config._id, doc['_id'], elapsed))
         if elapsed > LONG_UCR_SOFT_ASSERT_THRESHOLD:
             _slow_ucr_assert(
                 False,
                 u"UCR data source {} is taking too long to process".format(table.config._id))
     return result
Ejemplo n.º 15
0
    def process_change(self, pillow_instance, change, do_set_checkpoint):
        """
        Fetch the change's document, transform it, and index it into
        Elasticsearch (updating in place when the doc already exists).
        """
        # todo: if deletion - delete
        doc = change.get_document()
        if doc is None:
            pillow_logging.warning("Unable to get document from change: {}".format(change))
            return

        # prepare doc for es
        doc_ready_to_save = self.doc_transform_fn(doc)
        already_indexed = self.elasticsearch.exists(
            self.index_meta.index, self.index_meta.type, change.id)
        # send it across
        send_to_elasticsearch(
            index=self.index_meta.index,
            doc_type=self.index_meta.type,
            doc_id=change.id,
            es_getter=self.es_getter,
            name=pillow_instance.get_name(),
            data=doc_ready_to_save,
            update=already_indexed,
        )
Ejemplo n.º 16
0
    def process_change(self, pillow_instance, change):
        """
        Index a change into Elasticsearch, or drop the indexed copy when the
        change represents a deletion.
        """
        if change.deleted and change.id:
            # Deletions carry no payload; just remove any indexed copy.
            self._delete_doc_if_exists(change.id)
            return

        doc = change.get_document()
        if doc is None:
            pillow_logging.warning("Unable to get document from change: {}".format(change))
            return

        # Prepare the doc for ES, then ship it.
        prepared = self.doc_transform_fn(doc)
        send_to_elasticsearch(
            index=self.index_info.index,
            doc_type=self.index_info.type,
            doc_id=change.id,
            es_getter=self.es_getter,
            name=pillow_instance.get_name(),
            data=prepared,
            update=self._doc_exists(change.id),
        )
Ejemplo n.º 17
0
    def _send_payload_with_retries(self, payload):
        """
        Send a bulk payload to Elasticsearch, sleeping an increasing amount
        of time between failed attempts.

        :param payload: bulk request body handed to ``es.bulk``
        :return: True on success, False after MAX_TRIES failures
        """
        pillow_logging.info("Sending payload to ES")

        bulk_start = datetime.utcnow()
        for attempt in range(MAX_TRIES):
            if attempt:
                # Delay grows with elapsed time plus a per-attempt factor.
                retry_time = (datetime.utcnow() - bulk_start).seconds + attempt * RETRY_TIME_DELAY_FACTOR
                pillow_logging.warning("\tRetrying in %s seconds" % retry_time)
                time.sleep(retry_time)
                pillow_logging.warning("\tRetrying now ...")
                # reset timestamp when looping again
                bulk_start = datetime.utcnow()

            try:
                self.es.bulk(payload)
                return True
            except Exception:
                pillow_logging.exception("\tException sending payload to ES")

        return False