    def handle(self):
        indicator_config_id = 'static-ccs_record_cases'
        case_type_of_xmlns = 'ccs_record'
        config = _get_config_by_id(indicator_config_id)

        document_store = get_document_store_for_doc_type(
            config.domain, config.referenced_doc_type, case_type_or_xmlns=case_type_of_xmlns
        )

        current_month_start = datetime.date.today().replace(day=1)
        last_month_start = (current_month_start - datetime.timedelta(days=1)).replace(day=1)

        current_month_start = current_month_start.strftime('%Y-%m-%d')
        last_month_start = last_month_start.strftime('%Y-%m-%d')
        current_month_doc_ids = CcsRecordMonthly.objects.filter(pnc_complete=1,
                                                                month=current_month_start).values('case_id')
        docs_last_month = CcsRecordMonthly.objects.filter(pnc_complete=1,
                                                          month=last_month_start).values('case_id')

        # .values() querysets don't support `+`; materialize both as lists first
        doc_ids = list(current_month_doc_ids) + list(docs_last_month)
        doc_ids = {row['case_id'] for row in doc_ids}
        relevant_ids = []
        next_event = time.time() + 10
        for doc_id in doc_ids:
            relevant_ids.append(doc_id)
            if len(relevant_ids) >= ID_CHUNK_SIZE:
                _build_indicators(config, document_store, relevant_ids)
                relevant_ids = []

            if time.time() > next_event:
                print("processed till case %s" % (doc_id['case_id']))
                next_event = time.time() + 10

        if relevant_ids:
            _build_indicators(config, document_store, relevant_ids)
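The loop above uses a generic chunk-and-flush pattern: buffer ids, flush every ID_CHUNK_SIZE, then flush the remainder. A minimal standalone sketch of the same idea (process_in_chunks and its arguments are hypothetical names, not part of the command above):

def process_in_chunks(ids, process, chunk_size=100):
    # buffer ids and flush whenever the buffer reaches chunk_size
    buffer = []
    for id_ in ids:
        buffer.append(id_)
        if len(buffer) >= chunk_size:
            process(buffer)
            buffer = []
    if buffer:  # final partial chunk
        process(buffer)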
Example #2
    def handle(self, domain, data_source_id, *args, **kwargs):
        config, _ = get_datasource_config(data_source_id, domain)
        adapter = get_indicator_adapter(config)
        q = adapter.get_query_object()
        document_store = get_document_store_for_doc_type(domain, config.referenced_doc_type)
        bad_rows = []
        for row in with_progress_bar(q, length=q.count()):
            doc_id = row.doc_id
            doc = document_store.get_document(doc_id)

            current_rows = config.get_all_values(doc)
            if len(current_rows) > 1:
                raise ValueError("this command doesn't work for datasources returning multiple rows per doc")

            try:
                current_row = current_rows[0]
            except IndexError:  # no rows were computed for this doc
                continue

            # don't compare the 'inserted_at' columns
            current_row = [val for val in current_row if val.column.database_column_name != 'inserted_at']

            for val in current_row:
                try:
                    inserted_value = getattr(row, val.column.database_column_name)
                    if (inserted_value != val.value
                       or row.inserted_at.replace(tzinfo=pytz.utc) < parse_datetime(doc['server_modified_on'])):
                        bad_rows.append({
                            'doc_id': row.doc_id,
                            'column_name': val.column.database_column_name,
                            'inserted_at': row.inserted_at.isoformat(),
                            'server_modified_on': doc['server_modified_on'],
                            'stored_value': getattr(row, val.column.database_column_name),
                            'desired_value': val.value,
                            'message': ('column mismatch'
                                        if inserted_value != val.value else "modified date early"),
                        })
                except AttributeError:
                    bad_rows.append({
                        'doc_id': row.doc_id,
                        'column_name': val.column.database_column_name,
                        'inserted_at': 'missing',
                        'server_modified_on': doc['server_modified_on'],
                        'stored_value': 'missing',
                        'desired_value': val.value,
                        'message': 'doc missing',
                    })

        filename = 'datasource_mismatches_{}_{}.csv'.format(
            data_source_id[-8:],
            datetime.utcnow().strftime("%Y-%m-%d-%H-%M-%S")
        )
        with open(filename, 'w', encoding='utf-8') as f:
            headers = ['doc_id', 'column_name', 'inserted_at', 'server_modified_on',
                       'stored_value', 'desired_value', 'message']
            writer = csv.DictWriter(f, headers)
            writer.writeheader()
            writer.writerows(bad_rows)

        print("Found {} mismatches. Check {} for more details".format(len(bad_rows), filename))
Example #3
    def handle(self, domain, count, **options):
        sort_by = options['sort']
        indicators = AsyncIndicator.objects.filter(
            domain=domain).order_by('-date_created')[:count]
        print('processing {} indicators'.format(len(indicators)))

        # build up data source configs and docs
        configs = {}
        docs = {}
        for indicator in indicators:
            doc_store = get_document_store_for_doc_type(
                domain,
                indicator.doc_type,
                load_source="profile_async_indicators")
            docs[indicator.doc_id] = doc_store.get_document(indicator.doc_id)
            for config_id in indicator.indicator_config_ids:
                configs[config_id] = _get_config(config_id)

        local_variables = {
            '_simulate_indicator_saves': _simulate_indicator_saves,
            'indicators': indicators,
            'docs': docs,
            'configs': configs,
        }
        cProfile.runctx('_simulate_indicator_saves(indicators, docs, configs)',
                        {}, local_variables, 'async_ucr_stats.log')
        print_profile_stats('async_ucr_stats.log', sort_by)
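print_profile_stats is not shown in this listing; a plausible stand-in over the stdlib pstats module could look like this (the function name and its sort handling are assumptions):

import pstats

def print_profile_stats(filename, sort_by='cumulative'):
    # load the stats file written by cProfile.runctx and print the top entries
    stats = pstats.Stats(filename)
    stats.sort_stats(sort_by).print_stats(25)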
Example #4
def _get_document(related_doc_type, doc_id, context):
    document_store = get_document_store_for_doc_type(context.root_doc['domain'], related_doc_type)
    try:
        doc = document_store.get_document(doc_id)
    except DocumentNotFoundError:
        return None
    if context.root_doc['domain'] != doc.get('domain'):
        return None
    return doc
Example #5
def _iteratively_build_table(config,
                             resume_helper=None,
                             in_place=False,
                             limit=-1):
    resume_helper = resume_helper or DataSourceResumeHelper(config)
    indicator_config_id = config._id
    case_type_or_xmlns_list = config.get_case_type_or_xmlns_filter()
    completed_ct_xmlns = resume_helper.get_completed_case_type_or_xmlns()
    if completed_ct_xmlns:
        case_type_or_xmlns_list = [
            case_type_or_xmlns
            for case_type_or_xmlns in case_type_or_xmlns_list
            if case_type_or_xmlns not in completed_ct_xmlns
        ]

    for case_type_or_xmlns in case_type_or_xmlns_list:
        relevant_ids = []
        document_store = get_document_store_for_doc_type(
            config.domain,
            config.referenced_doc_type,
            case_type_or_xmlns=case_type_or_xmlns)

        for i, relevant_id in enumerate(document_store.iter_document_ids()):
            if i >= limit > -1:
                break
            relevant_ids.append(relevant_id)
            if len(relevant_ids) >= ID_CHUNK_SIZE:
                _build_indicators(config, document_store, relevant_ids)
                relevant_ids = []

        if relevant_ids:
            _build_indicators(config, document_store, relevant_ids)

        resume_helper.add_completed_case_type_or_xmlns(case_type_or_xmlns)

    resume_helper.clear_resume_info()
    if not id_is_static(indicator_config_id):
        if in_place:
            config.meta.build.finished_in_place = True
        else:
            config.meta.build.finished = True
        try:
            config.save()
        except ResourceConflict:
            current_config = DataSourceConfiguration.get(config._id)
            # check that a new build has not yet started
            if in_place:
                if config.meta.build.initiated_in_place == current_config.meta.build.initiated_in_place:
                    current_config.meta.build.finished_in_place = True
            else:
                if config.meta.build.initiated == current_config.meta.build.initiated:
                    current_config.meta.build.finished = True
            current_config.save()
        adapter = get_indicator_adapter(config,
                                        raise_errors=True,
                                        can_handle_laboratory=True)
        adapter.after_table_build()
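The ResourceConflict branch is an optimistic-concurrency retry: if another writer saved the config first, re-fetch the current revision and re-apply the finished flag only when no newer build was initiated in between. A generic, self-contained sketch of that shape (store and ConflictError are illustrative stand-ins, not CommCare APIs):

class ConflictError(Exception):
    """A concurrent writer saved a newer revision first."""

def mark_build_finished(config, store):
    # optimistic concurrency: save our copy; on conflict, re-fetch the
    # latest revision and re-apply the flag only if no newer build started
    config['finished'] = True
    try:
        store.save(config)
    except ConflictError:
        current = store.get(config['id'])
        if current['initiated'] == config['initiated']:
            current['finished'] = True
        store.save(current)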
Example #6
def _get_document(related_doc_type, doc_id, context):
    document_store = get_document_store_for_doc_type(context.root_doc['domain'], related_doc_type)
    try:
        doc = document_store.get_document(doc_id)
    except DocumentNotFoundError:
        return None
    if context.root_doc['domain'] != doc.get('domain'):
        return None
    return doc
Example #7
    def get_ucr_config_and_document_store(self, indicator_config_id,
                                          case_type_of_xmlns):
        config = _get_config_by_id(indicator_config_id)
        document_store = get_document_store_for_doc_type(
            config.domain,
            config.referenced_doc_type,
            case_type_or_xmlns=case_type_of_xmlns,
            load_source="build_ccs_record_ucr",
        )
        return config, document_store
Example #8
    def handle(self, domain, data_source_id, doc_id, **options):
        config, _ = get_datasource_config(data_source_id, domain)
        doc_type = config.referenced_doc_type
        doc_store = get_document_store_for_doc_type(domain, doc_type)
        doc = doc_store.get_document(doc_id)
        sort_by = options['sort']
        local_variables = {'config': config, 'doc': doc}

        cProfile.runctx('config.get_all_values(doc)', {}, local_variables, 'ucr_stats.log')
        print_profile_stats('ucr_stats.log', sort_by)
Example #9
def _iteratively_build_table(config,
                             resume_helper=None,
                             in_place=False,
                             limit=-1):
    resume_helper = resume_helper or DataSourceResumeHelper(config)
    indicator_config_id = config._id
    case_type_or_xmlns_list = config.get_case_type_or_xmlns_filter()
    domains = config.data_domains

    loop_iterations = list(itertools.product(domains, case_type_or_xmlns_list))
    completed_iterations = resume_helper.get_completed_iterations()
    if completed_iterations:
        loop_iterations = list(
            set(loop_iterations) - set(completed_iterations))

    for domain, case_type_or_xmlns in loop_iterations:
        relevant_ids = []
        document_store = get_document_store_for_doc_type(
            domain,
            config.referenced_doc_type,
            case_type_or_xmlns=case_type_or_xmlns,
            load_source="build_indicators",
        )

        for i, relevant_id in enumerate(document_store.iter_document_ids()):
            if i >= limit > -1:
                break
            relevant_ids.append(relevant_id)
            if len(relevant_ids) >= ID_CHUNK_SIZE:
                _build_indicators(config, document_store, relevant_ids)
                relevant_ids = []

        if relevant_ids:
            _build_indicators(config, document_store, relevant_ids)

        resume_helper.add_completed_iteration(domain, case_type_or_xmlns)

    resume_helper.clear_resume_info()
    if not id_is_static(indicator_config_id):
        if in_place:
            config.meta.build.finished_in_place = True
        else:
            config.meta.build.finished = True
        try:
            config.save()
        except ResourceConflict:
            current_config = get_ucr_datasource_config_by_id(config._id)
            # check that a new build has not yet started
            if in_place:
                if config.meta.build.initiated_in_place == current_config.meta.build.initiated_in_place:
                    current_config.meta.build.finished_in_place = True
            else:
                if config.meta.build.initiated == current_config.meta.build.initiated:
                    current_config.meta.build.finished = True
            current_config.save()
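The resume bookkeeping in this version reduces to a set difference over (domain, case_type_or_xmlns) pairs. A runnable sketch of just that part:

import itertools

def remaining_iterations(domains, case_types, completed):
    # every (domain, case_type) pair, minus the ones a previous run finished
    all_pairs = set(itertools.product(domains, case_types))
    return sorted(all_pairs - set(completed))

# e.g. remaining_iterations(['d1', 'd2'], ['case_a'], [('d1', 'case_a')])
# -> [('d2', 'case_a')]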
Example #10
    def handle(self, domain, data_source_id, doc_id, **options):
        config, _ = get_datasource_config(data_source_id, domain)
        doc_type = config.referenced_doc_type
        doc_store = get_document_store_for_doc_type(domain, doc_type)
        doc = doc_store.get_document(doc_id)
        sort_by = options['sort']
        local_variables = {'config': config, 'doc': doc}

        cProfile.runctx('config.get_all_values(doc)', {}, local_variables,
                        'ucr_stats.log')
        print_profile_stats('ucr_stats.log', sort_by)
Example #11
def _iteratively_build_table(config, resume_helper=None, in_place=False, limit=-1):
    resume_helper = resume_helper or DataSourceResumeHelper(config)
    indicator_config_id = config._id
    case_type_or_xmlns_list = config.get_case_type_or_xmlns_filter()
    completed_ct_xmlns = resume_helper.get_completed_case_type_or_xmlns()
    if completed_ct_xmlns:
        case_type_or_xmlns_list = [
            case_type_or_xmlns
            for case_type_or_xmlns in case_type_or_xmlns_list
            if case_type_or_xmlns not in completed_ct_xmlns
        ]

    for case_type_or_xmlns in case_type_or_xmlns_list:
        relevant_ids = []
        document_store = get_document_store_for_doc_type(
            config.domain, config.referenced_doc_type,
            case_type_or_xmlns=case_type_or_xmlns,
            load_source="build_indicators",
        )

        for i, relevant_id in enumerate(document_store.iter_document_ids()):
            if i >= limit > -1:
                break
            relevant_ids.append(relevant_id)
            if len(relevant_ids) >= ID_CHUNK_SIZE:
                _build_indicators(config, document_store, relevant_ids)
                relevant_ids = []

        if relevant_ids:
            _build_indicators(config, document_store, relevant_ids)

        resume_helper.add_completed_case_type_or_xmlns(case_type_or_xmlns)

    resume_helper.clear_resume_info()
    if not id_is_static(indicator_config_id):
        if in_place:
            config.meta.build.finished_in_place = True
        else:
            config.meta.build.finished = True
        try:
            config.save()
        except ResourceConflict:
            current_config = DataSourceConfiguration.get(config._id)
            # check that a new build has not yet started
            if in_place:
                if config.meta.build.initiated_in_place == current_config.meta.build.initiated_in_place:
                    current_config.meta.build.finished_in_place = True
            else:
                if config.meta.build.initiated == current_config.meta.build.initiated:
                    current_config.meta.build.finished = True
            current_config.save()
Example #12
    def handle(self):
        indicator_config_id = 'static-ccs_record_cases'
        case_type_of_xmlns = 'ccs_record'
        config = _get_config_by_id(indicator_config_id)

        document_store = get_document_store_for_doc_type(
            config.domain,
            config.referenced_doc_type,
            case_type_or_xmlns=case_type_of_xmlns,
            load_source="build_ccs_record_ucr",
        )

        current_month_start = datetime.date.today().replace(day=1)
        last_month_start = (current_month_start -
                            datetime.timedelta(days=1)).replace(day=1)

        current_month_start = current_month_start.strftime('%Y-%m-%d')
        last_month_start = last_month_start.strftime('%Y-%m-%d')
        current_month_doc_ids = CcsRecordMonthly.objects.filter(
            pnc_complete=1, month=current_month_start).values('case_id')
        docs_last_month = CcsRecordMonthly.objects.filter(
            pnc_complete=1, month=last_month_start).values('case_id')

        # .values() querysets don't support `+`; materialize both as lists first
        doc_ids = list(current_month_doc_ids) + list(docs_last_month)
        doc_ids = {row['case_id'] for row in doc_ids}
        relevant_ids = []
        next_event = time.time() + 10
        for doc_id in doc_ids:
            relevant_ids.append(doc_id)
            if len(relevant_ids) >= ID_CHUNK_SIZE:
                _build_indicators(config, document_store, relevant_ids)
                relevant_ids = []

            if time.time() > next_event:
                print("processed till case %s" % (doc_id['case_id']))
                next_event = time.time() + 10

        if relevant_ids:
            _build_indicators(config, document_store, relevant_ids)
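The time.time() + 10 bookkeeping above is a log-throttling idiom: report progress at most once every ten seconds no matter how fast the loop runs. In isolation (names hypothetical):

import time

def process_with_progress(items, handle, interval=10):
    # print progress at most once per `interval` seconds
    next_report = time.time() + interval
    for item in items:
        handle(item)
        if time.time() > next_report:
            print("processed up to %s" % item)
            next_report = time.time() + interval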
Example #13
    def handle(self, domain, count, **options):
        sort_by = options['sort']
        indicators = AsyncIndicator.objects.filter(domain=domain).order_by('-date_created')[:count]
        print('processing {} indicators'.format(len(indicators)))

        # build up data source configs and docs
        configs = {}
        docs = {}
        for indicator in indicators:
            doc_store = get_document_store_for_doc_type(domain, indicator.doc_type)
            docs[indicator.doc_id] = doc_store.get_document(indicator.doc_id)
            for config_id in indicator.indicator_config_ids:
                configs[config_id] = _get_config(config_id)

        local_variables = {
            '_simulate_indicator_saves': _simulate_indicator_saves,
            'indicators': indicators,
            'docs': docs,
            'configs': configs,
        }
        cProfile.runctx('_simulate_indicator_saves(indicators, docs, configs)', {},
                        local_variables, 'async_ucr_stats.log')
        print_profile_stats('async_ucr_stats.log', sort_by)
Example #14
    def handle(self, domain, data_source_id, *args, **kwargs):
        config, _ = get_datasource_config(data_source_id, domain)
        adapter = get_indicator_adapter(config, load_source='find_datasource_mismatches')
        q = adapter.get_query_object()
        document_store = get_document_store_for_doc_type(
            domain, config.referenced_doc_type, load_source="find_datasource_mismatches")
        bad_rows = []
        for row in with_progress_bar(q, length=q.count()):
            adapter.track_load()
            doc_id = row.doc_id
            doc = document_store.get_document(doc_id)

            current_rows = config.get_all_values(doc)
            if len(current_rows) > 1:
                raise ValueError("this command doesn't work for datasources returning multiple rows per doc")

            try:
                current_row = current_rows[0]
            except IndexError:  # no rows were computed for this doc
                continue

            # don't compare the 'inserted_at' columns
            current_row = [val for val in current_row if val.column.database_column_name != 'inserted_at']

            for val in current_row:
                try:
                    inserted_value = getattr(row, val.column.database_column_name)
                    if (inserted_value != val.value
                       or row.inserted_at.replace(tzinfo=pytz.utc) < parse_datetime(doc['server_modified_on'])):
                        bad_rows.append({
                            'doc_id': row.doc_id,
                            'column_name': val.column.database_column_name,
                            'inserted_at': row.inserted_at.isoformat(),
                            'server_modified_on': doc['server_modified_on'],
                            'stored_value': getattr(row, val.column.database_column_name),
                            'desired_value': val.value,
                            'message': ('column mismatch'
                                        if inserted_value != val.value else "modified date early"),
                        })
                except AttributeError:
                    bad_rows.append({
                        'doc_id': row.doc_id,
                        'column_name': val.column.database_column_name,
                        'inserted_at': 'missing',
                        'server_modified_on': doc['server_modified_on'],
                        'stored_value': 'missing',
                        'desired_value': val.value,
                        'message': 'doc missing',
                    })

        filename = 'datasource_mismatches_{}_{}.csv'.format(
            data_source_id[-8:],
            datetime.utcnow().strftime("%Y-%m-%d-%H-%M-%S")
        )
        with open(filename, 'w', encoding='utf-8') as f:
            headers = ['doc_id', 'column_name', 'inserted_at', 'server_modified_on',
                       'stored_value', 'desired_value', 'message']
            writer = csv.DictWriter(f, headers)
            writer.writeheader()
            writer.writerows(bad_rows)

        print("Found {} mismatches. Check {} for more details".format(len(bad_rows), filename))
Example #15
    def test_doc_store(self):
        doc_store = get_document_store_for_doc_type(self.domain, self.doc_type)
        self.assertSetEqual(set(self.all_doc_ids_domain),
                            set(doc_store.iter_document_ids()))
Example #16
def save_document(doc_ids):
    lock_keys = []
    for doc_id in doc_ids:
        lock_keys.append(get_async_indicator_modify_lock_key(doc_id))

    indicator_config_ids = None
    timer = TimingContext()
    with CriticalSection(lock_keys):
        indicators = AsyncIndicator.objects.filter(doc_id__in=doc_ids)
        if not indicators:
            return

        first_indicator = indicators[0]
        processed_indicators = []
        failed_indicators = []

        for i in indicators:
            assert i.domain == first_indicator.domain
            assert i.doc_type == first_indicator.doc_type

        indicator_by_doc_id = {i.doc_id: i for i in indicators}
        doc_store = get_document_store_for_doc_type(first_indicator.domain,
                                                    first_indicator.doc_type)
        indicator_config_ids = first_indicator.indicator_config_ids
        related_docs_to_rebuild = set()

        with timer:
            for doc in doc_store.iter_documents(
                    list(indicator_by_doc_id.keys())):
                indicator = indicator_by_doc_id[doc['_id']]
                successfully_processed, to_remove, rebuild_related_docs = _save_document_helper(
                    indicator, doc)
                if rebuild_related_docs:
                    related_docs_to_rebuild = related_docs_to_rebuild.union(
                        icds_get_related_docs_ids(doc['_id']))
                if successfully_processed:
                    processed_indicators.append(indicator.pk)
                else:
                    failed_indicators.append((indicator, to_remove))

        num_processed = len(processed_indicators)
        num_failed = len(failed_indicators)
        AsyncIndicator.objects.filter(pk__in=processed_indicators).delete()
        with transaction.atomic():
            for indicator, to_remove in failed_indicators:
                indicator.update_failure(to_remove)
                indicator.save()

    # remove any related docs that were just rebuilt
    related_docs_to_rebuild = related_docs_to_rebuild - set(doc_ids)
    # queue the docs that aren't already queued
    _queue_indicators(
        AsyncIndicator.objects.filter(doc_id__in=related_docs_to_rebuild,
                                      date_queued=None))

    datadog_counter('commcare.async_indicator.processed_success',
                    num_processed)
    datadog_counter('commcare.async_indicator.processed_fail', num_failed)
    datadog_histogram('commcare.async_indicator.processing_time',
                      timer.duration,
                      tags=['config_ids:{}'.format(indicator_config_ids)])
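save_document partitions indicators into processed and failed in a single pass, bulk-deletes the former, and records failures for the latter. The partition step by itself (process is a hypothetical callback returning truthy on success):

def partition_results(items, process):
    # single pass: collect successes and failures separately
    processed, failed = [], []
    for item in items:
        if process(item):
            processed.append(item)
        else:
            failed.append(item)
    return processed, failed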
Example #17
def _build_async_indicators(indicator_doc_ids):
    def handle_exception(exception, config_id, doc, adapter):
        metric = None
        if isinstance(exception, (ProtocolError, ReadTimeout)):
            metric = 'commcare.async_indicator.riak_error'
        elif isinstance(exception, (ESError, ConnectionTimeout)):
            # a database had an issue so log it and go on to the next document
            metric = 'commcare.async_indicator.es_error'
        elif isinstance(exception, (DatabaseError, InternalError)):
            # a database had an issue so log it and go on to the next document
            metric = 'commcare.async_indicator.psql_error'
        else:
            # getting the config could fail before the adapter is set
            if adapter:
                adapter.handle_exception(doc, exception)
        if metric:
            datadog_counter(metric,
                            1,
                            tags={
                                'config_id': config_id,
                                'doc_id': doc['_id']
                            })

    def doc_ids_from_rows(rows):
        formatted_rows = [{
            column.column.database_column_name.decode('utf-8'): column.value
            for column in row
        } for row in rows]
        return set(row['doc_id'] for row in formatted_rows)

    # tracks processed/deleted configs to be removed from each indicator
    configs_to_remove_by_indicator_id = defaultdict(list)

    def _mark_config_to_remove(config_id, indicator_ids):
        for _id in indicator_ids:
            configs_to_remove_by_indicator_id[_id].append(config_id)

    timer = TimingContext()
    lock_keys = [
        get_async_indicator_modify_lock_key(indicator_id)
        for indicator_id in indicator_doc_ids
    ]
    with CriticalSection(lock_keys):
        all_indicators = AsyncIndicator.objects.filter(
            doc_id__in=indicator_doc_ids)
        if not all_indicators:
            return

        doc_store = get_document_store_for_doc_type(all_indicators[0].domain,
                                                    all_indicators[0].doc_type)
        failed_indicators = set()

        rows_to_save_by_adapter = defaultdict(list)
        indicator_by_doc_id = {i.doc_id: i for i in all_indicators}
        config_ids = set()
        with timer:
            for doc in doc_store.iter_documents(
                    list(indicator_by_doc_id.keys())):
                indicator = indicator_by_doc_id[doc['_id']]
                eval_context = EvaluationContext(doc)
                for config_id in indicator.indicator_config_ids:
                    config_ids.add(config_id)
                    try:
                        config = _get_config_by_id(config_id)
                    except (ResourceNotFound,
                            StaticDataSourceConfigurationNotFoundError):
                        celery_task_logger.info(
                            "{} no longer exists, skipping".format(config_id))
                        # remove because the config no longer exists
                        _mark_config_to_remove(config_id, [indicator.pk])
                        continue
                    except ESError:
                        celery_task_logger.info(
                            "ES errored when trying to retrieve config")
                        failed_indicators.add(indicator)
                        continue
                    adapter = None
                    try:
                        adapter = get_indicator_adapter(config)
                        rows_to_save_by_adapter[adapter].extend(
                            adapter.get_all_values(doc, eval_context))
                        eval_context.reset_iteration()
                    except Exception as e:
                        failed_indicators.add(indicator)
                        handle_exception(e, config_id, doc, adapter)

            for adapter, rows in six.iteritems(rows_to_save_by_adapter):
                doc_ids = doc_ids_from_rows(rows)
                indicators = [
                    indicator_by_doc_id[doc_id] for doc_id in doc_ids
                ]
                try:
                    adapter.save_rows(rows)
                except Exception as e:
                    failed_indicators.update(indicators)  # union() returns a new set and would be discarded
                    message = six.text_type(e)
                    notify_exception(
                        None,
                        "Exception bulk saving async indicators:{}".format(
                            message))
                else:
                    # remove because it's successfully processed
                    # (use this adapter's config id, not the leftover loop variable)
                    _mark_config_to_remove(adapter.config._id,
                                           [i.pk for i in indicators])

        # delete fully processed indicators
        processed_indicators = set(all_indicators) - failed_indicators
        AsyncIndicator.objects.filter(
            pk__in=[i.pk for i in processed_indicators]).delete()

        # update failure for failed indicators
        with transaction.atomic():
            for indicator in failed_indicators:
                indicator.update_failure(
                    configs_to_remove_by_indicator_id.get(indicator.pk, []))
                indicator.save()

        datadog_counter('commcare.async_indicator.processed_success',
                        len(processed_indicators))
        datadog_counter('commcare.async_indicator.processed_fail',
                        len(failed_indicators))
        datadog_histogram('commcare.async_indicator.processing_time',
                          timer.duration / len(indicator_doc_ids),
                          tags=[
                              'config_ids:{}'.format(config_ids),
                          ])
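Grouping rows per adapter in a defaultdict(list) is what lets each table be written with one bulk save_rows call instead of row by row. The grouping idiom on its own:

from collections import defaultdict

def group_rows(rows, key):
    # bucket rows by their destination so each bucket can be saved in bulk
    grouped = defaultdict(list)
    for row in rows:
        grouped[key(row)].append(row)
    return grouped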
Example #18
def build_async_indicators(indicator_doc_ids):
    # written to be used with _queue_indicators, indicator_doc_ids must
    #   be a chunk of 100
    memoizers = {'configs': {}, 'adapters': {}}
    assert len(indicator_doc_ids) <= ASYNC_INDICATOR_CHUNK_SIZE

    def handle_exception(exception, config_id, doc, adapter):
        metric = None
        if isinstance(exception, (ProtocolError, ReadTimeout)):
            metric = 'commcare.async_indicator.riak_error'
        elif isinstance(exception, (ESError, ConnectionTimeout)):
            # a database had an issue so log it and go on to the next document
            metric = 'commcare.async_indicator.es_error'
        elif isinstance(exception, (DatabaseError, InternalError)):
            # a database had an issue so log it and go on to the next document
            metric = 'commcare.async_indicator.psql_error'
        else:
            # getting the config could fail before the adapter is set
            if adapter:
                adapter.handle_exception(doc, exception)
        if metric:
            metrics_counter(metric, tags={'config_id': config_id})

    def doc_ids_from_rows(rows):
        formatted_rows = [
            {column.column.database_column_name.decode('utf-8'): column.value for column in row}
            for row in rows
        ]
        return set(row['doc_id'] for row in formatted_rows)

    def _get_config(config_id):
        config_by_id = memoizers['configs']
        if config_id in config_by_id:
            return config_by_id[config_id]
        else:
            config = _get_config_by_id(config_id)
            config_by_id[config_id] = config
            return config

    def _get_adapter(config):
        adapter_by_config = memoizers['adapters']
        if config._id in adapter_by_config:
            return adapter_by_config[config._id]
        else:
            adapter = get_indicator_adapter(config, load_source='build_async_indicators')
            adapter_by_config[config._id] = adapter
            return adapter

    def _metrics_timer(step, config_id=None):
        tags = {
            'action': step,
        }
        if config_id and settings.ENTERPRISE_MODE:
            tags['config_id'] = config_id
        else:
            # Prometheus requires consistent tags even if not available
            tags['config_id'] = None
        return metrics_histogram_timer(
            'commcare.async_indicator.timing',
            timing_buckets=(.03, .1, .3, 1, 3, 10), tags=tags
        )

    # tracks processed/deleted configs to be removed from each indicator
    configs_to_remove_by_indicator_id = defaultdict(list)

    def _mark_config_to_remove(config_id, indicator_ids):
        for _id in indicator_ids:
            configs_to_remove_by_indicator_id[_id].append(config_id)

    timer = TimingContext()
    lock_keys = [
        get_async_indicator_modify_lock_key(indicator_doc_id)
        for indicator_doc_id in indicator_doc_ids
    ]
    with CriticalSection(lock_keys):
        all_indicators = AsyncIndicator.objects.filter(
            doc_id__in=indicator_doc_ids
        )
        if not all_indicators:
            return

        doc_store = get_document_store_for_doc_type(
            all_indicators[0].domain, all_indicators[0].doc_type,
            load_source="build_async_indicators",
        )
        failed_indicators = set()

        rows_to_save_by_adapter = defaultdict(list)
        docs_to_delete_by_adapter = defaultdict(list)
        # there will always be one AsyncIndicator per doc id
        indicator_by_doc_id = {i.doc_id: i for i in all_indicators}
        config_ids = set()
        with timer:
            for doc in doc_store.iter_documents(list(indicator_by_doc_id.keys())):
                indicator = indicator_by_doc_id[doc['_id']]
                eval_context = EvaluationContext(doc)
                for config_id in indicator.indicator_config_ids:
                    with _metrics_timer('transform', config_id):
                        config_ids.add(config_id)
                        try:
                            config = _get_config(config_id)
                        except (ResourceNotFound, StaticDataSourceConfigurationNotFoundError):
                            celery_task_logger.info("{} no longer exists, skipping".format(config_id))
                            # remove because the config no longer exists
                            _mark_config_to_remove(config_id, [indicator.pk])
                            continue
                        except ESError:
                            celery_task_logger.info("ES errored when trying to retrieve config")
                            failed_indicators.add(indicator)
                            continue
                        adapter = None
                        try:
                            adapter = _get_adapter(config)
                            rows_to_save = adapter.get_all_values(doc, eval_context)
                            if rows_to_save:
                                rows_to_save_by_adapter[adapter].extend(rows_to_save)
                            else:
                                docs_to_delete_by_adapter[adapter].append(doc)
                            eval_context.reset_iteration()
                        except Exception as e:
                            failed_indicators.add(indicator)
                            handle_exception(e, config_id, doc, adapter)

            with _metrics_timer('single_batch_update'):
                for adapter, rows in rows_to_save_by_adapter.items():
                    doc_ids = doc_ids_from_rows(rows)
                    indicators = [indicator_by_doc_id[doc_id] for doc_id in doc_ids]
                    try:
                        with _metrics_timer('update', adapter.config._id):
                            adapter.save_rows(rows, use_shard_col=True)
                    except Exception as e:
                        failed_indicators.update(indicators)  # union() returns a new set and would be discarded
                        message = str(e)
                        notify_exception(None, "Exception bulk saving async indicators:{}".format(message))
                    else:
                        # remove because it's successfully processed
                        # use this adapter's config id, not the leftover loop variable
                        _mark_config_to_remove(
                            adapter.config._id,
                            [i.pk for i in indicators]
                        )

            with _metrics_timer('single_batch_delete'):
                for adapter, docs in docs_to_delete_by_adapter.items():
                    with _metrics_timer('delete', adapter.config._id):
                        adapter.bulk_delete(docs)

        # delete fully processed indicators
        processed_indicators = set(all_indicators) - failed_indicators
        AsyncIndicator.objects.filter(pk__in=[i.pk for i in processed_indicators]).delete()

        # update failure for failed indicators
        with transaction.atomic():
            for indicator in failed_indicators:
                indicator.update_failure(
                    configs_to_remove_by_indicator_id.get(indicator.pk, [])
                )
                indicator.save()

        metrics_counter('commcare.async_indicator.processed_success', len(processed_indicators))
        metrics_counter('commcare.async_indicator.processed_fail', len(failed_indicators))
        metrics_counter(
            'commcare.async_indicator.processing_time', timer.duration,
            tags={'config_ids': config_ids}
        )
        metrics_counter(
            'commcare.async_indicator.processed_total', len(indicator_doc_ids),
            tags={'config_ids': config_ids}
        )
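The memoizers dict in this version caches configs and adapters so each config_id is fetched once per batch. The same idea as a reusable helper (functools.lru_cache would also serve when the loader's argument is hashable):

def memoized(loader):
    # cache loader results by key; each key is fetched at most once
    cache = {}
    def get(key):
        if key not in cache:
            cache[key] = loader(key)
        return cache[key]
    return get

# e.g. get_config = memoized(_get_config_by_id)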
Example #19
def _build_async_indicators(indicator_doc_ids):
    def handle_exception(exception, config_id, doc, adapter):
        metric = None
        if isinstance(exception, (ProtocolError, ReadTimeout)):
            metric = 'commcare.async_indicator.riak_error'
        elif isinstance(exception, (ESError, ConnectionTimeout)):
            # a database had an issue so log it and go on to the next document
            metric = 'commcare.async_indicator.es_error'
        elif isinstance(exception, (DatabaseError, InternalError)):
            # a database had an issue so log it and go on to the next document
            metric = 'commcare.async_indicator.psql_error'
        else:
            # getting the config could fail before the adapter is set
            if adapter:
                adapter.handle_exception(doc, exception)
        if metric:
            datadog_counter(metric, 1,
                tags={'config_id': config_id, 'doc_id': doc['_id']})

    def doc_ids_from_rows(rows):
        formatted_rows = [
            {column.column.database_column_name.decode('utf-8'): column.value for column in row}
            for row in rows
        ]
        return set(row['doc_id'] for row in formatted_rows)

    # tracks processed/deleted configs to be removed from each indicator
    configs_to_remove_by_indicator_id = defaultdict(list)

    def _mark_config_to_remove(config_id, indicator_ids):
        for _id in indicator_ids:
            configs_to_remove_by_indicator_id[_id].append(config_id)

    timer = TimingContext()
    lock_keys = [
        get_async_indicator_modify_lock_key(indicator_id)
        for indicator_id in indicator_doc_ids
    ]
    with CriticalSection(lock_keys):
        all_indicators = AsyncIndicator.objects.filter(
            doc_id__in=indicator_doc_ids
        )
        if not all_indicators:
            return

        doc_store = get_document_store_for_doc_type(
            all_indicators[0].domain, all_indicators[0].doc_type,
            load_source="build_async_indicators",
        )
        failed_indicators = set()

        rows_to_save_by_adapter = defaultdict(list)
        indicator_by_doc_id = {i.doc_id: i for i in all_indicators}
        config_ids = set()
        with timer:
            for doc in doc_store.iter_documents(list(indicator_by_doc_id.keys())):
                indicator = indicator_by_doc_id[doc['_id']]
                eval_context = EvaluationContext(doc)
                for config_id in indicator.indicator_config_ids:
                    config_ids.add(config_id)
                    try:
                        config = _get_config_by_id(config_id)
                    except (ResourceNotFound, StaticDataSourceConfigurationNotFoundError):
                        celery_task_logger.info("{} no longer exists, skipping".format(config_id))
                        # remove because the config no longer exists
                        _mark_config_to_remove(config_id, [indicator.pk])
                        continue
                    except ESError:
                        celery_task_logger.info("ES errored when trying to retrieve config")
                        failed_indicators.add(indicator)
                        continue
                    adapter = None
                    try:
                        adapter = get_indicator_adapter(config, load_source='build_async_indicators')
                        rows_to_save_by_adapter[adapter].extend(adapter.get_all_values(doc, eval_context))
                        eval_context.reset_iteration()
                    except Exception as e:
                        failed_indicators.add(indicator)
                        handle_exception(e, config_id, doc, adapter)

            for adapter, rows in six.iteritems(rows_to_save_by_adapter):
                doc_ids = doc_ids_from_rows(rows)
                indicators = [indicator_by_doc_id[doc_id] for doc_id in doc_ids]
                try:
                    adapter.save_rows(rows)
                except Exception as e:
                    failed_indicators.update(indicators)  # union() returns a new set and would be discarded
                    message = six.text_type(e)
                    notify_exception(None,
                        "Exception bulk saving async indicators:{}".format(message))
                else:
                    # remove because it's successfully processed
                    # (use this adapter's config id, not the leftover loop variable)
                    _mark_config_to_remove(
                        adapter.config._id,
                        [i.pk for i in indicators]
                    )

        # delete fully processed indicators
        processed_indicators = set(all_indicators) - failed_indicators
        AsyncIndicator.objects.filter(pk__in=[i.pk for i in processed_indicators]).delete()

        # update failure for failed indicators
        with transaction.atomic():
            for indicator in failed_indicators:
                indicator.update_failure(
                    configs_to_remove_by_indicator_id.get(indicator.pk, [])
                )
                indicator.save()

        datadog_counter('commcare.async_indicator.processed_success', len(processed_indicators))
        datadog_counter('commcare.async_indicator.processed_fail', len(failed_indicators))
        datadog_histogram(
            'commcare.async_indicator.processing_time', timer.duration / len(indicator_doc_ids),
            tags=[
                'config_ids:{}'.format(config_ids),
            ]
        )