Example 1
def _save_document_helper(indicator, doc):
    eval_context = EvaluationContext(doc)
    something_failed = False
    for config_id in indicator.indicator_config_ids:
        adapter = None
        try:
            config = _get_config(config_id)
        except (ResourceNotFound, StaticDataSourceConfigurationNotFoundError):
            celery_task_logger.info(
                "{} no longer exists, skipping".format(config_id))
            continue
        except ESError:
            celery_task_logger.info(
                "ES errored when trying to retrieve config")
            something_failed = True
            return
        try:
            adapter = get_indicator_adapter(config, can_handle_laboratory=True)
            adapter.save(doc, eval_context)
            eval_context.reset_iteration()
        except (DatabaseError, ESError, InternalError, RequestError,
                ConnectionTimeout, ProtocolError, ReadTimeout):
            # a database had an issue; log it and stop processing this document
            celery_task_logger.info(
                "DB error when saving config: {}".format(config_id))
            something_failed = True
            return
        except Exception as e:
            # getting the config could fail before the adapter is set
            if adapter:
                adapter.handle_exception(doc, e)
            something_failed = True
            return

    return not something_failed
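A minimal caller sketch for the helper above, assuming a hypothetical wrapper around it; only _save_document_helper and the AsyncIndicator fields date_queued and unsuccessful_attempts appear elsewhere in these examples, everything else is illustrative.

def _process_queued_indicator(indicator, doc):
    # Hypothetical wrapper, not part of the examples; shown to illustrate how the
    # truthy/falsy return value of _save_document_helper could drive retry handling.
    if _save_document_helper(indicator, doc):
        # a truthy result means every config saved cleanly, so the queued
        # indicator can be removed
        indicator.delete()
    else:
        # a falsy result (False, or None from a bare return) means a database
        # error occurred; record the failure so the document is retried later
        indicator.date_queued = None
        indicator.unsuccessful_attempts += 1
        indicator.save()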
Example 2
    def process_change(self, pillow_instance, change):
        self.bootstrap_if_needed()
        if change.deleted:
            # we don't currently support hard-deletions at all.
            # we may want to change this at some later date but it seems ok for now.
            # see https://github.com/dimagi/commcare-hq/pull/6944 for rationale
            return

        domain = change.metadata.domain
        if not domain:
            # if no domain we won't save to any UCR table
            return

        async_tables = []
        doc = change.get_document()
        ensure_document_exists(change)
        ensure_matched_revisions(change)

        if doc is None:
            return

        eval_context = EvaluationContext(doc)
        for table in self.table_adapters_by_domain[domain]:
            if table.config.filter(doc):
                if table.run_asynchronous:
                    async_tables.append(table.config._id)
                else:
                    self._save_doc_to_table(table, doc, eval_context)
                    eval_context.reset_iteration()
            elif table.config.deleted_filter(doc):
                table.delete(doc)

        if async_tables:
            AsyncIndicator.update_indicators(change, async_tables)
Example 3
    def _process_chunk_for_domain(self, domain, changes_chunk):
        adapters = list(self.table_adapters_by_domain[domain])
        changes_by_id = {change.id: change for change in changes_chunk}
        to_delete_by_adapter = defaultdict(list)
        rows_to_save_by_adapter = defaultdict(list)
        async_configs_by_doc_id = defaultdict(list)
        to_update = {change for change in changes_chunk if not change.deleted}
        retry_changes, docs = self.get_docs_for_changes(to_update, domain)
        change_exceptions = []

        for doc in docs:
            eval_context = EvaluationContext(doc)
            for adapter in adapters:
                if adapter.config.filter(doc):
                    if adapter.run_asynchronous:
                        async_configs_by_doc_id[doc['_id']].append(
                            adapter.config._id)
                    else:
                        try:
                            rows_to_save_by_adapter[adapter].extend(
                                adapter.get_all_values(doc, eval_context))
                        except Exception as e:
                            change_exceptions.append(
                                (changes_by_id[doc["_id"]], e))
                        eval_context.reset_iteration()
                elif adapter.config.deleted_filter(doc) or adapter.doc_exists(
                        doc):
                    to_delete_by_adapter[adapter].append(doc['_id'])

        # bulk delete by adapter
        to_delete = [c.id for c in changes_chunk if c.deleted]
        for adapter in adapters:
            delete_ids = to_delete_by_adapter[adapter] + to_delete
            try:
                adapter.bulk_delete(delete_ids)
            except Exception as ex:
                notify_exception(
                    None, "Error in deleting changes chunk {ids}: {ex}".format(
                        ids=delete_ids, ex=ex))
                retry_changes.update(
                    [c for c in changes_chunk if c.id in delete_ids])
        # bulk update by adapter
        for adapter, rows in six.iteritems(rows_to_save_by_adapter):
            try:
                adapter.save_rows(rows)
            except Exception as ex:
                notify_exception(
                    None, "Error in saving changes chunk {ids}: {ex}".format(
                        ids=[c.id for c in to_update], ex=repr(ex)))
                retry_changes.update(to_update)
        if async_configs_by_doc_id:
            doc_type_by_id = {
                _id: changes_by_id[_id].metadata.document_type
                for _id in async_configs_by_doc_id.keys()
            }
            AsyncIndicator.bulk_update_records(async_configs_by_doc_id, domain,
                                               doc_type_by_id)

        return retry_changes, change_exceptions
Example 4
def _save_document_helper(indicator, doc):
    eval_context = EvaluationContext(doc)
    something_failed = False
    configs_to_remove = []
    configs = dict()
    for config_id in indicator.indicator_config_ids:
        try:
            configs[config_id] = _get_config(config_id)
        except (ResourceNotFound, StaticDataSourceConfigurationNotFoundError):
            celery_task_logger.info(
                "{} no longer exists, skipping".format(config_id))
            configs_to_remove.append(config_id)
            continue
        except ESError:
            celery_task_logger.info(
                "ES errored when trying to retrieve config")
            something_failed = True
            continue

    for config_id, config in six.iteritems(configs):
        adapter = None
        try:
            adapter = get_indicator_adapter(config, can_handle_laboratory=True)
            adapter.save(doc, eval_context)
            eval_context.reset_iteration()
        except (ProtocolError, ReadTimeout):
            celery_task_logger.info(
                "Riak error when saving config: {}".format(config_id))
            something_failed = True
        except RequestError:
            celery_task_logger.info(
                "Couch error when saving config: {}".format(config_id))
            something_failed = True
        except (ESError, ConnectionTimeout):
            # a database had an issue so log it and go on to the next document
            celery_task_logger.info(
                "ES error when saving config: {}".format(config_id))
            something_failed = True
        except (DatabaseError, InternalError):
            # a database had an issue so log it and go on to the next document
            celery_task_logger.info(
                "psql error when saving config: {}".format(config_id))
            something_failed = True
        except Exception as e:
            # getting the config could fail before the adapter is set
            if adapter:
                adapter.handle_exception(doc, e)
            something_failed = True
        else:
            configs_to_remove.append(config_id)

    rebuild_related_docs = any(config.icds_rebuild_related_docs
                               for config in six.itervalues(configs) if config)
    return (not something_failed, configs_to_remove, rebuild_related_docs)
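A minimal sketch, under the same hypothetical-caller assumption, of consuming the three-part return value of this version of the helper; only _save_document_helper, indicator_config_ids, and the return shape come from the example above.

# Hypothetical caller; the (success, configs_to_remove, rebuild_related_docs)
# shape is taken from the return statement above, the handling is illustrative.
success, configs_to_remove, rebuild_related_docs = _save_document_helper(indicator, doc)
if configs_to_remove:
    # prune config ids that were processed successfully or no longer exist
    indicator.indicator_config_ids = [
        config_id for config_id in indicator.indicator_config_ids
        if config_id not in configs_to_remove
    ]
    indicator.save()
if not success:
    # at least one config hit a database error; leave the indicator queued
    # so the document is retried later (retry mechanism not shown here)
    pass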
Example 5
def save_document(doc_ids):
    lock_keys = []
    for doc_id in doc_ids:
        lock_keys.append(get_async_indicator_modify_lock_key(doc_id))

    with CriticalSection(lock_keys):
        indicators = AsyncIndicator.objects.filter(doc_id__in=doc_ids)
        if not indicators:
            return

        first_indicator = indicators[0]
        processed_indicators = []
        failed_indicators = []

        for i in indicators:
            assert i.domain == first_indicator.domain
            assert i.doc_type == first_indicator.doc_type

        indicator_by_doc_id = {i.doc_id: i for i in indicators}
        doc_store = get_document_store(first_indicator.domain,
                                       first_indicator.doc_type)
        for doc in doc_store.iter_documents(doc_ids):
            indicator = indicator_by_doc_id[doc['_id']]

            eval_context = EvaluationContext(doc)
            for config_id in indicator.indicator_config_ids:
                adapter = None
                try:
                    config = _get_config(config_id)
                    adapter = get_indicator_adapter(config,
                                                    can_handle_laboratory=True)
                    adapter.save(doc, eval_context)
                    eval_context.reset_iteration()
                except (ESError, RequestError, ConnectionTimeout):
                    # couch or es had an issue so don't log it and go on to the next doc
                    failed_indicators.append(indicator.pk)
                    break
                except Exception as e:
                    # getting the config could fail before the adapter is set
                    if adapter:
                        adapter.handle_exception(doc, e)
                    failed_indicators.append(indicator.pk)
                    break
                else:
                    processed_indicators.append(indicator.pk)

        AsyncIndicator.objects.filter(pk__in=processed_indicators).delete()
        AsyncIndicator.objects.filter(pk__in=failed_indicators).update(
            date_queued=None,
            unsuccessful_attempts=F('unsuccessful_attempts') + 1)
Example 6
    def process_change(self, change):
        self.bootstrap_if_needed()

        domain = change.metadata.domain
        if not domain or domain not in self.table_adapters_by_domain:
            # if no domain we won't save to any UCR table
            return

        if change.deleted:
            adapters = list(self.table_adapters_by_domain[domain])
            for table in adapters:
                table.delete({'_id': change.metadata.document_id})

        async_tables = []
        doc = change.get_document()
        ensure_document_exists(change)
        ensure_matched_revisions(change, doc)

        if doc is None:
            return

        with TimingContext() as timer:
            eval_context = EvaluationContext(doc)
            # make copy to avoid modifying list during iteration
            adapters = list(self.table_adapters_by_domain[domain])
            doc_subtype = change.metadata.document_subtype
            for table in adapters:
                if table.config.filter(doc, eval_context):
                    if table.run_asynchronous:
                        async_tables.append(table.config._id)
                    else:
                        self._save_doc_to_table(domain, table, doc, eval_context)
                        eval_context.reset_iteration()
                elif (doc_subtype is None
                        or doc_subtype in table.config.get_case_type_or_xmlns_filter()):
                    table.delete(doc)

            if async_tables:
                AsyncIndicator.update_from_kafka_change(change, async_tables)

        self.domain_timing_context.update(**{
            domain: timer.duration
        })
Example 7
    def process_change(self, change):
        self.bootstrap_if_needed()

        domain = change.metadata.domain
        if not domain or domain not in self.table_adapters_by_domain:
            # if no domain we won't save to any UCR table
            return

        if change.deleted:
            adapters = list(self.table_adapters_by_domain[domain])
            for table in adapters:
                table.delete({'_id': change.metadata.document_id})

        async_tables = []
        doc = change.get_document()
        ensure_document_exists(change)
        ensure_matched_revisions(change, doc)

        if doc is None:
            return

        with TimingContext() as timer:
            eval_context = EvaluationContext(doc)
            # make copy to avoid modifying list during iteration
            adapters = list(self.table_adapters_by_domain[domain])
            for table in adapters:
                if table.config.filter(doc):
                    if table.run_asynchronous:
                        async_tables.append(table.config._id)
                    else:
                        self._save_doc_to_table(domain, table, doc, eval_context)
                        eval_context.reset_iteration()
                elif table.config.deleted_filter(doc) or table.doc_exists(doc):
                    table.delete(doc)

            if async_tables:
                AsyncIndicator.update_from_kafka_change(change, async_tables)

        self.domain_timing_context.update(**{
            domain: timer.duration
        })
Example 8
def build_async_indicators(indicator_doc_ids):
    # written to be used with _queue_indicators, indicator_doc_ids must
    #   be a chunk of 100
    memoizers = {'configs': {}, 'adapters': {}}
    assert len(indicator_doc_ids) <= ASYNC_INDICATOR_CHUNK_SIZE

    def handle_exception(exception, config_id, doc, adapter):
        metric = None
        if isinstance(exception, (ProtocolError, ReadTimeout)):
            metric = 'commcare.async_indicator.riak_error'
        elif isinstance(exception, (ESError, ConnectionTimeout)):
            # a database had an issue so log it and go on to the next document
            metric = 'commcare.async_indicator.es_error'
        elif isinstance(exception, (DatabaseError, InternalError)):
            # a database had an issue so log it and go on to the next document
            metric = 'commcare.async_indicator.psql_error'
        else:
            # getting the config could fail before the adapter is set
            if adapter:
                adapter.handle_exception(doc, exception)
        if metric:
            metrics_counter(metric, tags={'config_id': config_id})

    def doc_ids_from_rows(rows):
        formatted_rows = [
            {column.column.database_column_name.decode('utf-8'): column.value for column in row}
            for row in rows
        ]
        return set(row['doc_id'] for row in formatted_rows)

    def _get_config(config_id):
        config_by_id = memoizers['configs']
        if config_id in config_by_id:
            return config_by_id[config_id]
        else:
            config = _get_config_by_id(config_id)
            config_by_id[config_id] = config
            return config

    def _get_adapter(config):
        adapter_by_config = memoizers['adapters']
        if config._id in adapter_by_config:
            return adapter_by_config[config._id]
        else:
            adapter = get_indicator_adapter(config, load_source='build_async_indicators')
            adapter_by_config[config._id] = adapter
            return adapter

    def _metrics_timer(step, config_id=None):
        tags = {
            'action': step,
        }
        if config_id and settings.ENTERPRISE_MODE:
            tags['config_id'] = config_id
        else:
            # Prometheus requires consistent tags even if not available
            tags['config_id'] = None
        return metrics_histogram_timer(
            'commcare.async_indicator.timing',
            timing_buckets=(.03, .1, .3, 1, 3, 10), tags=tags
        )

    # tracks processed/deleted configs to be removed from each indicator
    configs_to_remove_by_indicator_id = defaultdict(list)

    def _mark_config_to_remove(config_id, indicator_ids):
        for _id in indicator_ids:
            configs_to_remove_by_indicator_id[_id].append(config_id)

    timer = TimingContext()
    lock_keys = [
        get_async_indicator_modify_lock_key(indicator_doc_id)
        for indicator_doc_id in indicator_doc_ids
    ]
    with CriticalSection(lock_keys):
        all_indicators = AsyncIndicator.objects.filter(
            doc_id__in=indicator_doc_ids
        )
        if not all_indicators:
            return

        doc_store = get_document_store_for_doc_type(
            all_indicators[0].domain, all_indicators[0].doc_type,
            load_source="build_async_indicators",
        )
        failed_indicators = set()

        rows_to_save_by_adapter = defaultdict(list)
        docs_to_delete_by_adapter = defaultdict(list)
        # there will always be one AsyncIndicator per doc id
        indicator_by_doc_id = {i.doc_id: i for i in all_indicators}
        config_ids = set()
        with timer:
            for doc in doc_store.iter_documents(list(indicator_by_doc_id.keys())):
                indicator = indicator_by_doc_id[doc['_id']]
                eval_context = EvaluationContext(doc)
                for config_id in indicator.indicator_config_ids:
                    with _metrics_timer('transform', config_id):
                        config_ids.add(config_id)
                        try:
                            config = _get_config(config_id)
                        except (ResourceNotFound, StaticDataSourceConfigurationNotFoundError):
                            celery_task_logger.info("{} no longer exists, skipping".format(config_id))
                            # remove because the config no longer exists
                            _mark_config_to_remove(config_id, [indicator.pk])
                            continue
                        except ESError:
                            celery_task_logger.info("ES errored when trying to retrieve config")
                            failed_indicators.add(indicator)
                            continue
                        adapter = None
                        try:
                            adapter = _get_adapter(config)
                            rows_to_save = adapter.get_all_values(doc, eval_context)
                            if rows_to_save:
                                rows_to_save_by_adapter[adapter].extend(rows_to_save)
                            else:
                                docs_to_delete_by_adapter[adapter].append(doc)
                            eval_context.reset_iteration()
                        except Exception as e:
                            failed_indicators.add(indicator)
                            handle_exception(e, config_id, doc, adapter)

            with _metrics_timer('single_batch_update'):
                for adapter, rows in rows_to_save_by_adapter.items():
                    doc_ids = doc_ids_from_rows(rows)
                    indicators = [indicator_by_doc_id[doc_id] for doc_id in doc_ids]
                    try:
                        with _metrics_timer('update', adapter.config._id):
                            adapter.save_rows(rows, use_shard_col=True)
                    except Exception as e:
                        failed_indicators.update(indicators)
                        message = str(e)
                        notify_exception(None, "Exception bulk saving async indicators:{}".format(message))
                    else:
                        # remove because it's successfully processed
                        _mark_config_to_remove(
                            adapter.config._id,
                            [i.pk for i in indicators]
                        )

            with _metrics_timer('single_batch_delete'):
                for adapter, docs in docs_to_delete_by_adapter.items():
                    with _metrics_timer('delete', adapter.config._id):
                        adapter.bulk_delete(docs)

        # delete fully processed indicators
        processed_indicators = set(all_indicators) - failed_indicators
        AsyncIndicator.objects.filter(pk__in=[i.pk for i in processed_indicators]).delete()

        # update failure for failed indicators
        with transaction.atomic():
            for indicator in failed_indicators:
                indicator.update_failure(
                    configs_to_remove_by_indicator_id.get(indicator.pk, [])
                )
                indicator.save()

        metrics_counter('commcare.async_indicator.processed_success', len(processed_indicators))
        metrics_counter('commcare.async_indicator.processed_fail', len(failed_indicators))
        metrics_counter(
            'commcare.async_indicator.processing_time', timer.duration,
            tags={'config_ids': config_ids}
        )
        metrics_counter(
            'commcare.async_indicator.processed_total', len(indicator_doc_ids),
            tags={'config_ids': config_ids}
        )
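A standalone illustration of what the doc_ids_from_rows helper defined inside the example above computes; the ColumnSpec and ColumnValue stand-ins are assumptions that only mimic the shape of the row objects returned by adapter.get_all_values().

from collections import namedtuple

# stand-ins for the real column objects (illustration only)
ColumnSpec = namedtuple('ColumnSpec', ['database_column_name'])
ColumnValue = namedtuple('ColumnValue', ['column', 'value'])

rows = [
    [ColumnValue(ColumnSpec(b'doc_id'), 'case-1'), ColumnValue(ColumnSpec(b'count'), 3)],
    [ColumnValue(ColumnSpec(b'doc_id'), 'case-2'), ColumnValue(ColumnSpec(b'count'), 5)],
]

# each row is flattened into {'doc_id': ..., 'count': ...} by decoding the bytes
# column names, then only the doc_id values are kept, so for these rows the
# helper would return {'case-1', 'case-2'}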
Example 9
def _build_async_indicators(indicator_doc_ids):
    def handle_exception(exception, config_id, doc, adapter):
        metric = None
        if isinstance(exception, (ProtocolError, ReadTimeout)):
            metric = 'commcare.async_indicator.riak_error'
        elif isinstance(exception, (ESError, ConnectionTimeout)):
            # a database had an issue so log it and go on to the next document
            metric = 'commcare.async_indicator.es_error'
        elif isinstance(exception, (DatabaseError, InternalError)):
            # a database had an issue so log it and go on to the next document
            metric = 'commcare.async_indicator.psql_error'
        else:
            # getting the config could fail before the adapter is set
            if adapter:
                adapter.handle_exception(doc, exception)
        if metric:
            datadog_counter(metric,
                            1,
                            tags={
                                'config_id': config_id,
                                'doc_id': doc['_id']
                            })

    def doc_ids_from_rows(rows):
        formatted_rows = [{
            column.column.database_column_name.decode('utf-8'): column.value
            for column in row
        } for row in rows]
        return set(row['doc_id'] for row in formatted_rows)

    # tracks processed/deleted configs to be removed from each indicator
    configs_to_remove_by_indicator_id = defaultdict(list)

    def _mark_config_to_remove(config_id, indicator_ids):
        for _id in indicator_ids:
            configs_to_remove_by_indicator_id[_id].append(config_id)

    timer = TimingContext()
    lock_keys = [
        get_async_indicator_modify_lock_key(indicator_id)
        for indicator_id in indicator_doc_ids
    ]
    with CriticalSection(lock_keys):
        all_indicators = AsyncIndicator.objects.filter(
            doc_id__in=indicator_doc_ids)
        if not all_indicators:
            return

        doc_store = get_document_store_for_doc_type(all_indicators[0].domain,
                                                    all_indicators[0].doc_type)
        failed_indicators = set()

        rows_to_save_by_adapter = defaultdict(list)
        indicator_by_doc_id = {i.doc_id: i for i in all_indicators}
        config_ids = set()
        with timer:
            for doc in doc_store.iter_documents(
                    list(indicator_by_doc_id.keys())):
                indicator = indicator_by_doc_id[doc['_id']]
                eval_context = EvaluationContext(doc)
                for config_id in indicator.indicator_config_ids:
                    config_ids.add(config_id)
                    try:
                        config = _get_config_by_id(config_id)
                    except (ResourceNotFound,
                            StaticDataSourceConfigurationNotFoundError):
                        celery_task_logger.info(
                            "{} no longer exists, skipping".format(config_id))
                        # remove because the config no longer exists
                        _mark_config_to_remove(config_id, [indicator.pk])
                        continue
                    except ESError:
                        celery_task_logger.info(
                            "ES errored when trying to retrieve config")
                        failed_indicators.add(indicator)
                        continue
                    adapter = None
                    try:
                        adapter = get_indicator_adapter(config)
                        rows_to_save_by_adapter[adapter].extend(
                            adapter.get_all_values(doc, eval_context))
                        eval_context.reset_iteration()
                    except Exception as e:
                        failed_indicators.add(indicator)
                        handle_exception(e, config_id, doc, adapter)

            for adapter, rows in six.iteritems(rows_to_save_by_adapter):
                doc_ids = doc_ids_from_rows(rows)
                indicators = [
                    indicator_by_doc_id[doc_id] for doc_id in doc_ids
                ]
                try:
                    adapter.save_rows(rows)
                except Exception as e:
                    failed_indicators.update(indicators)
                    message = six.text_type(e)
                    notify_exception(
                        None,
                        "Exception bulk saving async indicators:{}".format(
                            message))
                else:
                    # remove because it's successfully processed
                    _mark_config_to_remove(adapter.config._id,
                                           [i.pk for i in indicators])

        # delete fully processed indicators
        processed_indicators = set(all_indicators) - failed_indicators
        AsyncIndicator.objects.filter(
            pk__in=[i.pk for i in processed_indicators]).delete()

        # update failure for failed indicators
        with transaction.atomic():
            for indicator in failed_indicators:
                indicator.update_failure(
                    configs_to_remove_by_indicator_id.get(indicator.pk, []))
                indicator.save()

        datadog_counter('commcare.async_indicator.processed_success',
                        len(processed_indicators))
        datadog_counter('commcare.async_indicator.processed_fail',
                        len(failed_indicators))
        datadog_histogram('commcare.async_indicator.processing_time',
                          timer.duration / len(indicator_doc_ids),
                          tags=[
                              'config_ids:{}'.format(config_ids),
                          ])
Example 10
    def _process_chunk_for_domain(self, domain, changes_chunk):
        adapters = list(self.table_adapters_by_domain[domain])
        changes_by_id = {change.id: change for change in changes_chunk}
        to_delete_by_adapter = defaultdict(list)
        rows_to_save_by_adapter = defaultdict(list)
        async_configs_by_doc_id = defaultdict(list)
        to_update = {change for change in changes_chunk if not change.deleted}
        with self._datadog_timing('extract'):
            retry_changes, docs = self.get_docs_for_changes(to_update, domain)
        change_exceptions = []

        with self._datadog_timing('single_batch_transform'):
            for doc in docs:
                change = changes_by_id[doc['_id']]
                doc_subtype = change.metadata.document_subtype
                eval_context = EvaluationContext(doc)
                with self._datadog_timing('single_doc_transform'):
                    for adapter in adapters:
                        with self._datadog_timing('transform',
                                                  adapter.config._id):
                            if adapter.config.filter(doc, eval_context):
                                if adapter.run_asynchronous:
                                    async_configs_by_doc_id[doc['_id']].append(
                                        adapter.config._id)
                                else:
                                    try:
                                        rows_to_save_by_adapter[adapter].extend(
                                            adapter.get_all_values(doc, eval_context))
                                    except Exception as e:
                                        change_exceptions.append((change, e))
                                    eval_context.reset_iteration()
                            elif (doc_subtype is None
                                  or doc_subtype in adapter.config.get_case_type_or_xmlns_filter()):
                                # Delete if the subtype is unknown or
                                # if the subtype matches our filters, but the full filter no longer applies
                                to_delete_by_adapter[adapter].append(doc)

        with self._datadog_timing('single_batch_delete'):
            # bulk delete by adapter
            to_delete = [{'_id': c.id} for c in changes_chunk if c.deleted]
            for adapter in adapters:
                delete_docs = to_delete_by_adapter[adapter] + to_delete
                if not delete_docs:
                    continue
                with self._datadog_timing('delete', adapter.config._id):
                    try:
                        adapter.bulk_delete(delete_docs)
                    except Exception:
                        delete_ids = [doc['_id'] for doc in delete_docs]
                        retry_changes.update(
                            [c for c in changes_chunk if c.id in delete_ids])

        with self._datadog_timing('single_batch_load'):
            # bulk update by adapter
            for adapter, rows in rows_to_save_by_adapter.items():
                with self._datadog_timing('load', adapter.config._id):
                    try:
                        adapter.save_rows(rows)
                    except Exception:
                        retry_changes.update(to_update)

        if async_configs_by_doc_id:
            with self._datadog_timing('async_config_load'):
                doc_type_by_id = {
                    _id: changes_by_id[_id].metadata.document_type
                    for _id in async_configs_by_doc_id.keys()
                }
                AsyncIndicator.bulk_update_records(async_configs_by_doc_id,
                                                   domain, doc_type_by_id)

        return retry_changes, change_exceptions
Example 11
def _build_async_indicators(indicator_doc_ids):
    def handle_exception(exception, config_id, doc, adapter):
        metric = None
        if isinstance(exception, (ProtocolError, ReadTimeout)):
            metric = 'commcare.async_indicator.riak_error'
        elif isinstance(exception, (ESError, ConnectionTimeout)):
            # a database had an issue so log it and go on to the next document
            metric = 'commcare.async_indicator.es_error'
        elif isinstance(exception, (DatabaseError, InternalError)):
            # a database had an issue so log it and go on to the next document
            metric = 'commcare.async_indicator.psql_error'
        else:
            # getting the config could fail before the adapter is set
            if adapter:
                adapter.handle_exception(doc, exception)
        if metric:
            datadog_counter(metric, 1,
                tags={'config_id': config_id, 'doc_id': doc['_id']})

    def doc_ids_from_rows(rows):
        formatted_rows = [
            {column.column.database_column_name.decode('utf-8'): column.value for column in row}
            for row in rows
        ]
        return set(row['doc_id'] for row in formatted_rows)

    # tracks processed/deleted configs to be removed from each indicator
    configs_to_remove_by_indicator_id = defaultdict(list)

    def _mark_config_to_remove(config_id, indicator_ids):
        for _id in indicator_ids:
            configs_to_remove_by_indicator_id[_id].append(config_id)

    timer = TimingContext()
    lock_keys = [
        get_async_indicator_modify_lock_key(indicator_id)
        for indicator_id in indicator_doc_ids
    ]
    with CriticalSection(lock_keys):
        all_indicators = AsyncIndicator.objects.filter(
            doc_id__in=indicator_doc_ids
        )
        if not all_indicators:
            return

        doc_store = get_document_store_for_doc_type(
            all_indicators[0].domain, all_indicators[0].doc_type,
            load_source="build_async_indicators",
        )
        failed_indicators = set()

        rows_to_save_by_adapter = defaultdict(list)
        indicator_by_doc_id = {i.doc_id: i for i in all_indicators}
        config_ids = set()
        with timer:
            for doc in doc_store.iter_documents(list(indicator_by_doc_id.keys())):
                indicator = indicator_by_doc_id[doc['_id']]
                eval_context = EvaluationContext(doc)
                for config_id in indicator.indicator_config_ids:
                    config_ids.add(config_id)
                    try:
                        config = _get_config_by_id(config_id)
                    except (ResourceNotFound, StaticDataSourceConfigurationNotFoundError):
                        celery_task_logger.info("{} no longer exists, skipping".format(config_id))
                        # remove because the config no longer exists
                        _mark_config_to_remove(config_id, [indicator.pk])
                        continue
                    except ESError:
                        celery_task_logger.info("ES errored when trying to retrieve config")
                        failed_indicators.add(indicator)
                        continue
                    adapter = None
                    try:
                        adapter = get_indicator_adapter(config, load_source='build_async_indicators')
                        rows_to_save_by_adapter[adapter].extend(adapter.get_all_values(doc, eval_context))
                        eval_context.reset_iteration()
                    except Exception as e:
                        failed_indicators.add(indicator)
                        handle_exception(e, config_id, doc, adapter)

            for adapter, rows in six.iteritems(rows_to_save_by_adapter):
                doc_ids = doc_ids_from_rows(rows)
                indicators = [indicator_by_doc_id[doc_id] for doc_id in doc_ids]
                try:
                    adapter.save_rows(rows)
                except Exception as e:
                    failed_indicators.update(indicators)
                    message = six.text_type(e)
                    notify_exception(None,
                        "Exception bulk saving async indicators:{}".format(message))
                else:
                    # remove because it's successfully processed
                    _mark_config_to_remove(
                        adapter.config._id,
                        [i.pk for i in indicators]
                    )

        # delete fully processed indicators
        processed_indicators = set(all_indicators) - failed_indicators
        AsyncIndicator.objects.filter(pk__in=[i.pk for i in processed_indicators]).delete()

        # update failure for failed indicators
        with transaction.atomic():
            for indicator in failed_indicators:
                indicator.update_failure(
                    configs_to_remove_by_indicator_id.get(indicator.pk, [])
                )
                indicator.save()

        datadog_counter('commcare.async_indicator.processed_success', len(processed_indicators))
        datadog_counter('commcare.async_indicator.processed_fail', len(failed_indicators))
        datadog_histogram(
            'commcare.async_indicator.processing_time', timer.duration / len(indicator_doc_ids),
            tags=[
                'config_ids:{}'.format(config_ids),
            ]
        )