Example #1
def reprocess_archive_stubs():
    # Check for archive stubs
    from corehq.form_processor.interfaces.dbaccessors import FormAccessors
    from couchforms.models import UnfinishedArchiveStub
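    # Only pick up stubs that have been retried fewer than 3 times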
    stubs = UnfinishedArchiveStub.objects.filter(attempts__lt=3)
    datadog_gauge('commcare.unfinished_archive_stubs', len(stubs))
    start = time.time()
    cutoff = start + timedelta(minutes=4).total_seconds()
    for stub in stubs:
        # Exit this task after 4 minutes so that tasks remain short
        if time.time() > cutoff:
            return
        try:
            xform = FormAccessors(stub.domain).get_form(form_id=stub.xform_id)
            # If the history wasn't updated the first time around, run the whole thing again.
            if not stub.history_updated:
                FormAccessors.do_archive(xform,
                                         stub.archive,
                                         stub.user_id,
                                         trigger_signals=True)

            # If the history was updated the first time around, just send the update to kafka
            else:
                FormAccessors.publish_archive_action_to_kafka(
                    xform, stub.user_id, stub.archive)
        except Exception:
            # Errors should not prevent processing other stubs
            notify_exception(None, "Error processing UnfinishedArchiveStub")
Example #2
    def __record_change_metric_in_datadog(self,
                                          metric,
                                          change,
                                          processor=None,
                                          processing_time=None,
                                          add_case_type_tag=False):
        if change.metadata is not None:
            common_tags = [
                'datasource:{}'.format(change.metadata.data_source_name),
                'is_deletion:{}'.format(change.metadata.is_deletion),
                'pillow_name:{}'.format(self.get_name()),
                'processor:{}'.format(processor.__class__.__name__
                                      if processor else "all_processors"),
            ]

            metric_tags = list(common_tags)
            if add_case_type_tag and settings.ENTERPRISE_MODE and change.metadata.document_type == 'CommCareCase':
                metric_tags.append('case_type:{}'.format(
                    change.metadata.document_subtype))

            datadog_counter(metric, tags=metric_tags)

            change_lag = (datetime.utcnow() -
                          change.metadata.publish_timestamp).total_seconds()
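            # Report how far behind the change feed this pillow is, in seconds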
            datadog_gauge('commcare.change_feed.change_lag',
                          change_lag,
                          tags=[
                              'pillow_name:{}'.format(self.get_name()),
                              _topic_for_ddog(change.topic),
                          ])

            if processing_time:
                datadog_histogram('commcare.change_feed.processing_time',
                                  processing_time,
                                  tags=common_tags)
Example #3
    def __record_change_metric_in_datadog(self,
                                          metric,
                                          change,
                                          processor=None,
                                          processing_time=None):
        if change.metadata is not None:
            tags = [
                'datasource:{}'.format(change.metadata.data_source_name),
                'is_deletion:{}'.format(change.metadata.is_deletion),
                'pillow_name:{}'.format(self.get_name()),
                'processor:{}'.format(processor.__class__.__name__
                                      if processor else "all_processors"),
            ]
            count = 1 if processor else len(self.processors)
            datadog_counter(metric, value=count, tags=tags)

            change_lag = (datetime.utcnow() -
                          change.metadata.publish_timestamp).total_seconds()
            datadog_gauge('commcare.change_feed.change_lag',
                          change_lag,
                          tags=[
                              'pillow_name:{}'.format(self.get_name()),
                              _topic_for_ddog(change.topic),
                          ])

            if processing_time:
                datadog_histogram('commcare.change_feed.processing_time',
                                  processing_time,
                                  tags=tags)
Example #4
    def _record_datadog_metrics(self, changes_chunk, processing_time):
        tags = ["pillow_name:{}".format(self.get_name()), "mode:chunked"]
        # Since success/fail count is tracked per processor, multiply by the
        #   number of processors to get a sense of the actual operations count
        count = len(changes_chunk) * len(self.processors)
        datadog_counter('commcare.change_feed.changes.count', count, tags=tags)

        max_change_lag = (
            datetime.utcnow() -
            changes_chunk[0].metadata.publish_timestamp).total_seconds()
        min_change_lag = (
            datetime.utcnow() -
            changes_chunk[-1].metadata.publish_timestamp).total_seconds()
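        # The chunk presumably arrives in publish order, so changes_chunk[0] is the
        # oldest change (max lag) and changes_chunk[-1] the newest (min lag)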
        datadog_gauge('commcare.change_feed.chunked.min_change_lag',
                      min_change_lag,
                      tags=tags)
        datadog_gauge('commcare.change_feed.chunked.max_change_lag',
                      max_change_lag,
                      tags=tags)

        datadog_histogram('commcare.change_feed.chunked.processing_time_total',
                          processing_time,
                          tags=tags +
                          ["chunk_size:{}".format(str(len(changes_chunk)))])

        if len(changes_chunk) == self.processor_chunk_size:
            # don't report offset chunks to ease up datadog calculations
            datadog_histogram('commcare.change_feed.processing_time',
                              processing_time / len(changes_chunk),
                              tags=tags +
                              ["chunk_size:".format(str(len(changes_chunk)))])
Example #5
def queue_async_indicators():
    start = datetime.utcnow()
    cutoff = start + ASYNC_INDICATOR_QUEUE_TIME
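    # Hold the critical-section lock for slightly less than the queue window,
    # presumably so the lock is released before the next scheduled run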
    time_for_crit_section = ASYNC_INDICATOR_QUEUE_TIME.seconds - 10

    oldest_indicator = AsyncIndicator.objects.order_by('date_queued').first()
    if oldest_indicator and oldest_indicator.date_queued:
        lag = (datetime.utcnow() -
               oldest_indicator.date_queued).total_seconds()
        datadog_gauge('commcare.async_indicator.oldest_queued_indicator', lag)

    with CriticalSection(['queue-async-indicators'],
                         timeout=time_for_crit_section):
        day_ago = datetime.utcnow() - timedelta(days=1)
        indicators = AsyncIndicator.objects.all()[:settings.ASYNC_INDICATORS_TO_QUEUE]
        if indicators:
            lag = (datetime.utcnow() -
                   indicators[0].date_created).total_seconds()
            datadog_gauge('commcare.async_indicator.oldest_created_indicator',
                          lag)
        indicators_by_domain_doc_type = defaultdict(list)
        for indicator in indicators:
            # don't requeue anything that's been queued in the past day
            if not indicator.date_queued or indicator.date_queued < day_ago:
                indicators_by_domain_doc_type[(
                    indicator.domain, indicator.doc_type)].append(indicator)

        for k, indicators in indicators_by_domain_doc_type.items():
            now = datetime.utcnow()
            if now > cutoff:
                break
            _queue_indicators(indicators)
Example #6
def async_indicators_metrics():
    oldest_indicator = AsyncIndicator.objects.order_by('date_queued').first()
    if oldest_indicator and oldest_indicator.date_queued:
        lag = (datetime.utcnow() -
               oldest_indicator.date_queued).total_seconds()
        datadog_gauge('commcare.async_indicator.oldest_queued_indicator', lag)

    indicator = AsyncIndicator.objects.first()
    if indicator:
        lag = (datetime.utcnow() - indicator.date_created).total_seconds()
        datadog_gauge('commcare.async_indicator.oldest_created_indicator', lag)

    for config_id, metrics in _indicator_metrics().iteritems():
        tags = ["config_id:{}".format(config_id)]
        datadog_gauge('commcare.async_indicator.indicator_count',
                      metrics['count'],
                      tags=tags)
        datadog_gauge('commcare.async_indicator.lag',
                      metrics['lag'],
                      tags=tags)

    # Don't use ORM summing because it would attempt to get every value in DB
    unsuccessful_attempts = sum(
        AsyncIndicator.objects.values_list('unsuccessful_attempts',
                                           flat=True).all()[:100])
    datadog_gauge('commcare.async_indicator.unsuccessful_attempts',
                  unsuccessful_attempts)
Example #7
    def fetch_all(initial_response):

        resp = initial_response
        scroll_id = resp.get('_scroll_id')
        if scroll_id is None:
            return
        iteration = 0

        while True:

            start = int(time.time() * 1000)
            resp = client.scroll(scroll_id, scroll=scroll)
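            # Report the scroll round-trip time in milliseconds, tagged by scroll iteration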
            datadog_gauge('commcare.es_scroll', (time.time() * 1000) - start,
                          tags=[
                              u'iteration:{}'.format(iteration),
                          ])

            for hit in resp['hits']['hits']:
                yield hit

            # check if we have any errors
            if resp["_shards"]["failed"]:
                logging.getLogger('elasticsearch.helpers').warning(
                    'Scroll request has failed on %d shards out of %d.',
                    resp['_shards']['failed'], resp['_shards']['total'])

            scroll_id = resp.get('_scroll_id')
            # end of scroll
            if scroll_id is None or not resp['hits']['hits']:
                break

            iteration += 1
Example #8
def record_pillow_error_queue_size():
    data = PillowError.objects.values('pillow').annotate(
        num_errors=Count('id'))
    for row in data:
        datadog_gauge('commcare.pillowtop.errors',
                      row['num_errors'],
                      tags=['pillow_name:%s' % row['pillow']])
Example #9
    def _record_datadog_metrics(self, changes_chunk, processing_time):
        tags = ["pillow_name:{}".format(self.get_name()), "mode:chunked"]
        change_count = len(changes_chunk)
        if settings.ENTERPRISE_MODE:
            type_counter = Counter([
                change.metadata.document_subtype
                for change in changes_chunk if change.metadata.document_type == 'CommCareCase'
            ])
            for case_type, type_count in type_counter.items():
                tags_with_type = tags + ['case_type:{}'.format(case_type)]
                datadog_counter('commcare.change_feed.changes.count', type_count, tags=tags_with_type)

            remainder = change_count - sum(type_counter.values())
            if remainder:
                datadog_counter('commcare.change_feed.changes.count', remainder, tags=tags)
        else:
            datadog_counter('commcare.change_feed.changes.count', change_count, tags=tags)

        max_change_lag = (datetime.utcnow() - changes_chunk[0].metadata.publish_timestamp).total_seconds()
        min_change_lag = (datetime.utcnow() - changes_chunk[-1].metadata.publish_timestamp).total_seconds()
        datadog_gauge('commcare.change_feed.chunked.min_change_lag', min_change_lag, tags=tags)
        datadog_gauge('commcare.change_feed.chunked.max_change_lag', max_change_lag, tags=tags)

        # processing_time per change
        datadog_histogram('commcare.change_feed.processing_time', processing_time / change_count, tags=tags)

        if change_count == self.processor_chunk_size:
            # don't report offset chunks to ease up datadog calculations
            datadog_histogram('commcare.change_feed.chunked.processing_time_total', processing_time,
                              tags=tags + ["chunk_size:{}".format(str(change_count))])
Example #10
def datadog_report_user_stats(metric_name, commcare_users_by_domain):
    commcare_users_by_domain = summarize_user_counts(commcare_users_by_domain,
                                                     n=50)
    for domain, user_count in commcare_users_by_domain.items():
        datadog_gauge(
            metric_name,
            user_count,
            tags=['domain:{}'.format('_other' if domain == () else domain)])
Example #11
 def heartbeat():
     try:
         datadog_gauge('commcare.celery.heartbeat.blockage_duration',
                       self.get_blockage_duration(),
                       tags=['celery_queue:{}'.format(self.queue)])
     except HeartbeatNeverRecorded:
         pass
     self.mark_seen()
Example #12
def _report_current_global_submission_thresholds():
    for window, value, threshold in global_submission_rate_limiter.iter_rates():
        datadog_gauge('commcare.xform_submissions.global_threshold', threshold, tags=[
            f'window:{window}'
        ])
        datadog_gauge('commcare.xform_submissions.global_usage', value, tags=[
            f'window:{window}'
        ])
Example #13
def _report_current_global_two_factor_setup_rate_limiter():
    for window, value, threshold in global_two_factor_setup_rate_limiter.iter_rates():
        datadog_gauge('commcare.two_factor.global_two_factor_setup_threshold',
                      threshold,
                      tags=['window:{}'.format(window)])
        datadog_gauge('commcare.two_factor.global_two_factor_setup_usage',
                      value,
                      tags=['window:{}'.format(window)])
Example #14
 def _record_checkpoint_in_datadog(self):
     datadog_counter('commcare.change_feed.change_feed.checkpoint', tags=[
         'pillow_name:{}'.format(self.get_name()),
     ])
     checkpoint_sequence = self._normalize_checkpoint_sequence()
     for topic, value in six.iteritems(checkpoint_sequence):
         datadog_gauge('commcare.change_feed.checkpoint_offsets', value, tags=[
             'pillow_name:{}'.format(self.get_name()),
             _topic_for_ddog(topic),
         ])
Example #15
 def _record_checkpoint_in_datadog(self):
     datadog_counter('commcare.change_feed.change_feed.checkpoint', tags=[
         'pillow_name:{}'.format(self.get_name()),
     ])
     checkpoint_sequence = self._normalize_checkpoint_sequence()
     for topic, value in six.iteritems(checkpoint_sequence):
         datadog_gauge('commcare.change_feed.checkpoint_offsets', value, tags=[
             'pillow_name:{}'.format(self.get_name()),
             _topic_for_ddog(topic),
         ])
Example #16
def celery_record_time_to_start(task_id=None, task=None, **kwargs):
    from corehq.util.datadog.gauges import datadog_gauge, datadog_counter
    time_to_start = TimeToStartTimer(task_id).stop_and_pop_timing()
    tags = [
        'celery_task_name:{}'.format(task.name),
        'celery_queue:{}'.format(task.queue),
    ]
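    # Fall back to a counter when no start timing could be recovered for this task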
    if time_to_start:
        datadog_gauge('commcare.celery.task.time_to_start', time_to_start.total_seconds(), tags=tags)
    else:
        datadog_counter('commcare.celery.task.time_to_start_unavailable', tags=tags)
Example #17
 def get_and_report_blockage_duration(self):
     blockage_duration = self.get_blockage_duration()
     datadog_gauge('commcare.celery.heartbeat.blockage_duration',
                   blockage_duration.total_seconds(),
                   tags=['celery_queue:{}'.format(self.queue)])
     if self.threshold:
         datadog_gauge(
             'commcare.celery.heartbeat.blockage_ok',
             1
             if blockage_duration.total_seconds() <= self.threshold else 0,
             tags=['celery_queue:{}'.format(self.queue)])
     return blockage_duration
Example #18
 def get_and_report_blockage_duration(self):
     blockage_duration = self.get_blockage_duration()
     datadog_gauge(
         'commcare.celery.heartbeat.blockage_duration',
         blockage_duration.total_seconds(),
         tags=['celery_queue:{}'.format(self.queue)]
     )
     if self.threshold:
         datadog_gauge(
             'commcare.celery.heartbeat.blockage_ok',
             1 if blockage_duration.total_seconds() <= self.threshold else 0,
             tags=['celery_queue:{}'.format(self.queue)]
         )
     return blockage_duration
Example #19
def pillow_datadog_metrics():
    def _is_couch(pillow):
        # text is couch, json is kafka
        return pillow['seq_format'] == 'text'

    pillow_meta = get_all_pillows_json()

    active_pillows = getattr(settings, 'ACTIVE_PILLOW_NAMES', None)
    if active_pillows:
        pillow_meta = [pillow for pillow in pillow_meta if pillow['name'] in active_pillows]

    for pillow in pillow_meta:
        tags = [
            'pillow_name:{}'.format(pillow['name']),
            'feed_type:{}'.format('couch' if _is_couch(pillow) else 'kafka')
        ]

        datadog_gauge(
            'commcare.change_feed.seconds_since_last_update',
            pillow['seconds_since_last'], tags=tags
        )

        for topic_name, offset in pillow['offsets'].items():
            if _is_couch(pillow):
                if not isinstance(pillow['seq'], int) or len(pillow['offsets']) != 1:
                    _assert(False, "Unexpected couch pillow format {}".format(pillow['name']))
                    continue
                tags_with_topic = tags + ['topic:{}'.format(topic_name)]
                processed_offset = pillow['seq']
            else:
                if not pillow['seq']:
                    # this pillow has never been initialized.
                    # (custom pillows on most environments)
                    continue
                if not isinstance(pillow['seq'], dict) or len(pillow['offsets']) != len(pillow['seq']):
                    _assert(False, "Unexpected kafka pillow format {}".format(pillow['name']))
                    continue
                topic, partition = topic_name.split(',')
                tags_with_topic = tags + ['topic:{}-{}'.format(topic, partition)]
                processed_offset = pillow['seq'][topic_name]

            if processed_offset == 0:
                # assume if nothing has been processed that this pillow is not
                # supposed to be running
                continue

            datadog_gauge(
                'commcare.change_feed.current_offsets',
                offset, tags=tags_with_topic
            )
            datadog_gauge(
                'commcare.change_feed.processed_offsets',
                processed_offset, tags=tags_with_topic
            )
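            # Backlog: changes published to the topic that this pillow has not yet processed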
            needs_processing = offset - processed_offset
            datadog_gauge(
                'commcare.change_feed.need_processing',
                needs_processing, tags=tags_with_topic
            )
Example #20
def pillow_datadog_metrics():
    def _is_couch(pillow):
        # text is couch, json is kafka
        return pillow['seq_format'] == 'text'

    pillow_meta = get_all_pillows_json()

    for pillow in pillow_meta:
        tags = [
            'pillow_name:{}'.format(pillow['name']),
            'feed_type:{}'.format('couch' if _is_couch(pillow) else 'kafka')
        ]

        datadog_gauge('commcare.change_feed.seconds_since_last_update',
                      pillow['seconds_since_last'],
                      tags=tags)

        for topic_name, offset in pillow['offsets'].items():
            if _is_couch(pillow):
                if not isinstance(pillow['seq'],
                                  int) or len(pillow['offsets']) != 1:
                    _assert(
                        False, "Unexpected couch pillow format {}".format(
                            pillow['name']))
                    continue
                tags_with_topic = tags + ['topic:{}'.format(topic_name)]
                processed_offset = pillow['seq']
            else:
                if not isinstance(pillow['seq'], dict) or len(
                        pillow['offsets']) != len(pillow['seq']):
                    _assert(
                        False, "Unexpected kafka pillow format {}".format(
                            pillow['name']))
                    continue
                if not pillow['seq']:
                    # this pillow has never been initialized.
                    # (custom pillows on most environments)
                    continue
                topic, partition = topic_name.split(',')
                tags_with_topic = tags + [
                    'topic:{}-{}'.format(topic, partition)
                ]
                processed_offset = pillow['seq'][topic_name]

            if processed_offset == 0:
                # assume if nothing has been processed that this pillow is not
                # supposed to be running
                continue

            datadog_gauge('commcare.change_feed.current_offsets',
                          offset,
                          tags=tags_with_topic)
            datadog_gauge('commcare.change_feed.processed_offsets',
                          processed_offset,
                          tags=tags_with_topic)
            needs_processing = offset - processed_offset
            datadog_gauge('commcare.change_feed.need_processing',
                          needs_processing,
                          tags=tags_with_topic)
Example #21
    def __record_change_metric_in_datadog(self, metric, change, timer=None):
        if change.metadata is not None:
            tags = [
                u'datasource:{}'.format(change.metadata.data_source_name),
                u'document_type:{}'.format(change.metadata.document_type),
                u'domain:{}'.format(change.metadata.domain),
                u'is_deletion:{}'.format(change.metadata.is_deletion),
                u'pillow_name:{}'.format(self.get_name())
            ]
            datadog_counter(metric, tags=tags)

            if timer:
                datadog_gauge('commcare.change_feed.processing_time',
                              timer.duration,
                              tags=tags)
Example #22
def pillow_datadog_metrics():
    def _is_couch(pillow):
        # text is couch, json is kafka
        return pillow['seq_format'] == 'text'

    pillow_meta = get_all_pillows_json()

    active_pillows = getattr(settings, 'ACTIVE_PILLOW_NAMES', None)
    if active_pillows:
        pillow_meta = [
            pillow for pillow in pillow_meta
            if pillow['name'] in active_pillows
        ]

    for pillow in pillow_meta:
        # The host and group tags are added here to ensure they remain constant
        # regardless of which celery worker the task gets executed on.
        # Without this the sum of the metrics gets inflated.
        tags = [
            'pillow_name:{}'.format(pillow['name']),
            'feed_type:{}'.format('couch' if _is_couch(pillow) else 'kafka'),
            'host:celery', 'group:celery'
        ]

        datadog_gauge('commcare.change_feed.seconds_since_last_update',
                      pillow['seconds_since_last'],
                      tags=tags)

        for topic_name, offset in pillow['offsets'].items():
            if _is_couch(pillow):
                tags_with_topic = tags + ['topic:{}'.format(topic_name)]
                processed_offset = pillow['seq']
            else:
                if not pillow['seq']:
                    # this pillow has never been initialized.
                    # (custom pillows on most environments)
                    continue
                topic, partition = topic_name.split(',')
                tags_with_topic = tags + [
                    'topic:{}-{}'.format(topic, partition)
                ]
                processed_offset = pillow['seq'][topic_name]

            if processed_offset == 0:
                # assume if nothing has been processed that this pillow is not
                # supposed to be running
                continue

            datadog_gauge('commcare.change_feed.current_offsets',
                          offset,
                          tags=tags_with_topic)
            datadog_gauge('commcare.change_feed.processed_offsets',
                          processed_offset,
                          tags=tags_with_topic)
            needs_processing = offset - processed_offset
            datadog_gauge('commcare.change_feed.need_processing',
                          needs_processing,
                          tags=tags_with_topic)
Example #23
def server_up(req):
    """
    Health check view which can be hooked into server monitoring tools like 'pingdom'

    Returns:
        HttpResponse("success", status_code=200)
        HttpResponse(error_message, status_code=500)

    Hit serverup.txt to check all the default enabled services (always_check=True)
    Hit serverup.txt?only={check_name} to only check a specific service
    Hit serverup.txt?{check_name} to include a non-default check (currently only ``heartbeat``)
    """
    only = req.GET.get('only', None)
    if only and only in CHECKS:
        checks_to_do = [only]
    else:
        checks_to_do = [
            check for check, check_info in CHECKS.items() if
            check_info['always_check'] or req.GET.get(check, None) is not None
        ]

    statuses = run_checks(checks_to_do)
    failed_checks = [(check, status) for check, status in statuses
                     if not status.success]

    for check_name, status in statuses:
        tags = [
            'status:{}'.format('failed' if not status.success else 'ok'),
            'check:{}'.format(check_name)
        ]
        datadog_gauge('commcare.serverup.check', status.duration, tags=tags)

    if failed_checks and not is_deploy_in_progress():
        status_messages = [
            html.linebreaks('<strong>{}</strong>: {}'.format(
                check, html.escape(status.msg)).strip())
            for check, status in failed_checks
        ]
        create_datadog_event(
            'Serverup check failed',
            '\n'.join(status_messages),
            alert_type='error',
            aggregation_key='serverup',
        )
        status_messages.insert(0, 'Failed Checks (%s):' % os.uname()[1])
        return HttpResponse(''.join(status_messages), status=500)
    else:
        return HttpResponse("success")
Example #24
def celery_record_time_to_start(task_id=None, task=None, **kwargs):
    from corehq.util.datadog.gauges import datadog_gauge, datadog_counter
    time_sent = cache.get('task.{}.time_sent'.format(task_id))
    tags = [
        'celery_task_name:{}'.format(task.name),
        'celery_queue:{}'.format(task.queue),
    ]
    if time_sent:
        time_to_start = (datetime.datetime.utcnow() -
                         time_sent).total_seconds()
        datadog_gauge('commcare.celery.task.time_to_start',
                      time_to_start,
                      tags=tags)
    else:
        datadog_counter('commcare.celery.task.time_to_start_unavailable',
                        tags=tags)
Example #25
    def __record_change_metric_in_datadog(self, metric, change, timer=None):
        if change.metadata is not None:
            tags = [
                'datasource:{}'.format(change.metadata.data_source_name),
                'is_deletion:{}'.format(change.metadata.is_deletion),
                'pillow_name:{}'.format(self.get_name()),
            ]
            datadog_counter(metric, tags=tags)

            change_lag = (datetime.utcnow() - change.metadata.publish_timestamp).seconds
            datadog_gauge('commcare.change_feed.change_lag', change_lag, tags=[
                'pillow_name:{}'.format(self.get_name()),
                _topic_for_ddog(change.topic),
            ])

            if timer:
                datadog_histogram('commcare.change_feed.processing_time', timer.duration, tags=tags)
Example #26
def server_up(req):
    """
    Health check view which can be hooked into server monitoring tools like 'pingdom'

    Returns:
        HttpResponse("success", status_code=200)
        HttpResponse(error_message, status_code=500)

    Hit serverup.txt to check all the default enabled services (always_check=True)
    Hit serverup.txt?only={check_name} to only check a specific service
    Hit serverup.txt?{check_name} to include a non-default check (currently only ``heartbeat``)
    """
    only = req.GET.get('only', None)
    if only and only in CHECKS:
        checks_to_do = [only]
    else:
        checks_to_do = [
            check
            for check, check_info in CHECKS.items()
            if check_info['always_check'] or req.GET.get(check, None) is not None
        ]

    statuses = run_checks(checks_to_do)
    failed_checks = [(check, status) for check, status in statuses if not status.success]

    for check_name, status in statuses:
        tags = [
            'status:{}'.format('failed' if not status.success else 'ok'),
            'check:{}'.format(check_name)
        ]
        datadog_gauge('commcare.serverup.check', status.duration, tags=tags)

    if failed_checks and not is_deploy_in_progress():
        status_messages = [
            html.linebreaks('<strong>{}</strong>: {}'.format(check, html.escape(status.msg)).strip())
            for check, status in failed_checks
        ]
        create_datadog_event(
            'Serverup check failed', '\n'.join(status_messages),
            alert_type='error', aggregation_key='serverup',
        )
        status_messages.insert(0, 'Failed Checks (%s):' % os.uname()[1])
        return HttpResponse(''.join(status_messages), status=500)
    else:
        return HttpResponse("success")
Example #27
    def __record_change_metric_in_datadog(self, metric, change, processor=None, processing_time=None):
        if change.metadata is not None:
            tags = [
                'datasource:{}'.format(change.metadata.data_source_name),
                'is_deletion:{}'.format(change.metadata.is_deletion),
                'pillow_name:{}'.format(self.get_name()),
                'processor:{}'.format(processor.__class__.__name__ if processor else "all_processors"),
            ]
            count = 1 if processor else len(self.processors)
            datadog_counter(metric, value=count, tags=tags)

            change_lag = (datetime.utcnow() - change.metadata.publish_timestamp).total_seconds()
            datadog_gauge('commcare.change_feed.change_lag', change_lag, tags=[
                'pillow_name:{}'.format(self.get_name()),
                _topic_for_ddog(change.topic),
            ])

            if processing_time:
                datadog_histogram('commcare.change_feed.processing_time', processing_time, tags=tags)
Example #28
def _record_metrics(tags, submission_type, response, timer=None, xform=None):
    if xform and xform.metadata:
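        # Lag between form completion (metadata.timeEnd) and receipt on the server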
        lag = xform.received_on - xform.metadata.timeEnd
        datadog_gauge('commcare.xform_submissions.lag',
                      int(lag.total_seconds()),
                      tags=tags)

    tags += [
        'submission_type:{}'.format(submission_type),
        'status_code:{}'.format(response.status_code)
    ]

    if response.status_code == 201 and timer:
        tags += [
            'duration:%s' % bucket_value(timer.duration,
                                         (1, 5, 20, 60, 120, 300, 600), 's'),
        ]

    datadog_counter('commcare.xform_submissions.count', tags=tags)
Example #29
def celery_record_time_to_start(task_id=None, task=None, **kwargs):
    from corehq.util.datadog.gauges import datadog_gauge, datadog_counter

    tags = [
        'celery_task_name:{}'.format(task.name),
        'celery_queue:{}'.format(task.queue),
    ]

    timer = TimeToStartTimer(task_id)
    try:
        time_to_start = timer.stop_and_pop_timing()
    except TimingNotAvailable:
        datadog_counter('commcare.celery.task.time_to_start_unavailable',
                        tags=tags)
    else:
        datadog_gauge('commcare.celery.task.time_to_start',
                      time_to_start.total_seconds(),
                      tags=tags)
        get_task_time_to_start.set_cached_value(task_id).to(time_to_start)

    TimeToRunTimer(task_id).start_timing()
Example #30
    def _record_change_in_datadog(self, change, timer):
        from corehq.apps.change_feed.consumer.feed import KafkaChangeFeed
        change_feed = self.get_change_feed()
        current_seq = self._normalize_sequence(
            change_feed.get_processed_offsets())
        current_offsets = change_feed.get_latest_offsets()

        tags = [
            'pillow_name:{}'.format(self.get_name()),
            'feed_type:{}'.format('kafka' if isinstance(
                change_feed, KafkaChangeFeed) else 'couch')
        ]
        for topic, value in current_seq.iteritems():
            tags_with_topic = tags + [
                _topic_for_ddog(topic),
            ]
            datadog_gauge('commcare.change_feed.processed_offsets',
                          value,
                          tags=tags_with_topic)
            if topic in current_offsets:
                needs_processing = current_offsets[topic] - value
                datadog_gauge('commcare.change_feed.need_processing',
                              needs_processing,
                              tags=tags_with_topic)

        for topic, offset in current_offsets.iteritems():
            tags_with_topic = tags + [
                _topic_for_ddog(topic),
            ]
            datadog_gauge('commcare.change_feed.current_offsets',
                          offset,
                          tags=tags_with_topic)

        self.__record_change_metric_in_datadog(
            'commcare.change_feed.changes.count', change, timer)
Example #31
    def _record_change_in_datadog(self, change, timer):
        change_feed = self.get_change_feed()
        sequence = self._normalize_checkpoint_sequence()
        current_offsets = change_feed.get_current_offsets()

        for topic, value in sequence.iteritems():
            datadog_gauge('commcare.change_feed.processed_offsets',
                          value,
                          tags=[
                              'pillow_name:{}'.format(self.get_name()),
                              'topic:{}'.format(topic),
                          ])
            if topic in current_offsets:
                datadog_gauge('commcare.change_feed.need_processing',
                              current_offsets[topic] - value,
                              tags=[
                                  'pillow_name:{}'.format(self.get_name()),
                                  'topic:{}'.format(topic),
                              ])

        for topic, offset in current_offsets.iteritems():
            datadog_gauge('commcare.change_feed.current_offsets',
                          offset,
                          tags=[
                              'pillow_name:{}'.format(self.get_name()),
                              'topic:{}'.format(topic),
                          ])

        self.__record_change_metric_in_datadog(
            'commcare.change_feed.changes.count', change, timer)
Example #32
def reprocess_archive_stubs():
    # Check for archive stubs
    from corehq.form_processor.interfaces.dbaccessors import FormAccessors
    from couchforms.models import UnfinishedArchiveStub
    stubs = UnfinishedArchiveStub.objects.filter()
    datadog_gauge('commcare.unfinished_archive_stubs', len(stubs))
    start = time.time()
    cutoff = start + timedelta(minutes=4).total_seconds()
    for stub in stubs:
        # Exit this task after 4 minutes so that the same stub isn't ever processed in multiple queues.
        if time.time() > cutoff:
            return
        xform = FormAccessors(stub.domain).get_form(form_id=stub.xform_id)
        # If the history wasn't updated the first time around, run the whole thing again.
        if not stub.history_updated:
            if stub.archive:
                xform.archive(user_id=stub.user_id)
            else:
                xform.unarchive(user_id=stub.user_id)
        # If the history was updated the first time around, just send the update to kafka
        else:
            xform.publish_archive_action_to_kafka(user_id=stub.user_id, archive=stub.archive)
Example #33
def reprocess_archive_stubs():
    # Check for archive stubs
    from corehq.form_processor.interfaces.dbaccessors import FormAccessors
    from couchforms.models import UnfinishedArchiveStub
    stubs = UnfinishedArchiveStub.objects.filter()
    datadog_gauge('commcare.unfinished_archive_stubs', len(stubs))
    start = time.time()
    cutoff = start + timedelta(minutes=4).total_seconds()
    for stub in stubs:
        # Exit this task after 4 minutes so that the same stub isn't ever processed in multiple queues.
        if time.time() > cutoff:
            return
        xform = FormAccessors(stub.domain).get_form(form_id=stub.xform_id)
        # If the history wasn't updated the first time around, run the whole thing again.
        if not stub.history_updated:
            if stub.archive:
                xform.archive(user_id=stub.user_id)
            else:
                xform.unarchive(user_id=stub.user_id)
        # If the history was updated the first time around, just send the update to kafka
        else:
            xform.publish_archive_action_to_kafka(user_id=stub.user_id, archive=stub.archive)
Example #34
    def _record_datadog_metrics(self, changes_chunk, processing_time):
        tags = ["pillow_name:{}".format(self.get_name()), "mode:chunked"]
        # Since success/fail count is tracked per processor, multiply by the
        #   number of processors to get a sense of the actual operations count
        count = len(changes_chunk) * len(self.processors)
        datadog_counter('commcare.change_feed.changes.count', count, tags=tags)

        max_change_lag = (datetime.utcnow() - changes_chunk[0].metadata.publish_timestamp).total_seconds()
        min_change_lag = (datetime.utcnow() - changes_chunk[-1].metadata.publish_timestamp).total_seconds()
        datadog_gauge('commcare.change_feed.chunked.min_change_lag', min_change_lag, tags=tags)
        datadog_gauge('commcare.change_feed.chunked.max_change_lag', max_change_lag, tags=tags)

        # processing_time per change
        datadog_histogram(
            'commcare.change_feed.processing_time',
            processing_time / len(changes_chunk),
            tags=tags + ["chunk_size:".format(str(len(changes_chunk)))]
        )

        if len(changes_chunk) == self.processor_chunk_size:
            # don't report offset chunks to ease up datadog calculations
            datadog_histogram('commcare.change_feed.chunked.processing_time_total', processing_time,
                tags=tags + ["chunk_size:{}".format(str(len(changes_chunk)))])
Example #35
def restore(request, domain, app_id=None):
    """
    We override restore because we have to supply our own
    user model (and have the domain in the url)
    """
    if toggles.ENIKSHAY.enabled(domain):
        update_device_id(request.couch_user, request.GET.get('device_id'))
    response, timing_context = get_restore_response(
        domain, request.couch_user, app_id, **get_restore_params(request))
    tags = [
        u'status_code:{}'.format(response.status_code),
    ]
    datadog_counter('commcare.restores.count', tags=tags)
    if timing_context is not None:
        for timer in timing_context.to_list(exclude_root=True):
            # Only record leaf nodes so we can sum to get the total
            if timer.is_leaf_node:
                datadog_gauge(
                    'commcare.restores.timings',
                    timer.duration,
                    tags=tags + [u'segment:{}'.format(timer.name)],
                )

    return response
Example #36
def couch_sql_migration_stats():
    result = (DomainES().filter(filters.term(
        'use_sql_backend', False)).remove_default_filters().aggregations([
            aggregations.SumAggregation('cases', 'cp_n_cases'),
            aggregations.SumAggregation('forms', 'cp_n_forms'),
        ]).size(0).run())

    datadog_gauge('commcare.couch_sql_migration.domains_remaining',
                  int(result.total))
    datadog_gauge('commcare.couch_sql_migration.forms_remaining',
                  int(result.aggregations.forms.value))
    datadog_gauge('commcare.couch_sql_migration.cases_remaining',
                  int(result.aggregations.cases.value))
Example #37
def couch_sql_migration_stats():
    result = (
        DomainES()
        .filter(filters.term('use_sql_backend', False))
        .remove_default_filters()
        .aggregations([
            aggregations.SumAggregation('cases', 'cp_n_cases'),
            aggregations.SumAggregation('forms', 'cp_n_forms'),
        ])
        .size(0).run()
    )

    datadog_gauge('commcare.couch_sql_migration.domains_remaining', int(result.total))
    datadog_gauge('commcare.couch_sql_migration.forms_remaining', int(result.aggregations.forms.value))
    datadog_gauge('commcare.couch_sql_migration.cases_remaining', int(result.aggregations.cases.value))
Example #38
def record_pillow_error_queue_size():
    data = PillowError.objects.values('pillow').annotate(num_errors=Count('id'))
    for row in data:
        datadog_gauge('commcare.pillowtop.error_queue', row['num_errors'], tags=[
            'pillow_name:%s' % row['pillow']
        ])
Example #39
def _record_datadog_metrics():
    count = UnfinishedSubmissionStub.objects.count()
    datadog_gauge('commcare.submission_reprocessing.queue_size', count)
Example #40
def async_indicators_metrics():
    now = datetime.utcnow()
    oldest_indicator = AsyncIndicator.objects.order_by('date_queued').first()
    if oldest_indicator and oldest_indicator.date_queued:
        lag = (now - oldest_indicator.date_queued).total_seconds()
        datadog_gauge('commcare.async_indicator.oldest_queued_indicator', lag)

    oldest_100_indicators = AsyncIndicator.objects.all()[:100]
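    # Only look at the first 100 indicators (relying on the model's default ordering)
    # so the lag stats don't require scanning the whole table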
    if oldest_100_indicators.exists():
        oldest_indicator = oldest_100_indicators[0]
        lag = (now - oldest_indicator.date_created).total_seconds()
        datadog_gauge('commcare.async_indicator.oldest_created_indicator', lag)

        lags = [
            (now - indicator.date_created).total_seconds()
            for indicator in oldest_100_indicators
        ]
        avg_lag = sum(lags) / len(lags)
        datadog_gauge('commcare.async_indicator.oldest_created_indicator_avg', avg_lag)

    for config_id, metrics in six.iteritems(_indicator_metrics()):
        tags = ["config_id:{}".format(config_id)]
        datadog_gauge('commcare.async_indicator.indicator_count', metrics['count'], tags=tags)
        datadog_gauge('commcare.async_indicator.lag', metrics['lag'], tags=tags)

    # Don't use ORM summing because it would attempt to get every value in DB
    unsuccessful_attempts = sum(AsyncIndicator.objects.values_list('unsuccessful_attempts', flat=True).all()[:100])
    datadog_gauge('commcare.async_indicator.unsuccessful_attempts', unsuccessful_attempts)
Example #41
def _process_form(request,
                  domain,
                  app_id,
                  user_id,
                  authenticated,
                  auth_cls=AuthContext):
    if should_ignore_submission(request):
        # silently ignore submission if it meets ignore-criteria
        return SubmissionPost.submission_ignored_response()

    if toggles.FORM_SUBMISSION_BLACKLIST.enabled(domain):
        return SubmissionPost.get_blacklisted_response()

    try:
        instance, attachments = couchforms.get_instance_and_attachment(request)
    except MultimediaBug as e:
        try:
            instance = request.FILES[MAGIC_PROPERTY].read()
            xform = convert_xform_to_json(instance)
            meta = xform.get("meta", {})
        except:
            meta = {}

        details = {
            "domain": domain,
            "app_id": app_id,
            "user_id": user_id,
            "authenticated": authenticated,
            "form_meta": meta,
        }
        log_counter(MULTIMEDIA_SUBMISSION_ERROR_COUNT, details)
        notify_exception(request, "Received a submission with POST.keys()",
                         details)
        return HttpResponseBadRequest(e.message)

    app_id, build_id = get_app_and_build_ids(domain, app_id)
    submission_post = SubmissionPost(
        instance=instance,
        attachments=attachments,
        domain=domain,
        app_id=app_id,
        build_id=build_id,
        auth_context=auth_cls(
            domain=domain,
            user_id=user_id,
            authenticated=authenticated,
        ),
        location=couchforms.get_location(request),
        received_on=couchforms.get_received_on(request),
        date_header=couchforms.get_date_header(request),
        path=couchforms.get_path(request),
        submit_ip=couchforms.get_submit_ip(request),
        last_sync_token=couchforms.get_last_sync_token(request),
        openrosa_headers=couchforms.get_openrosa_headers(request),
    )
    with TimingContext() as timer:
        result = submission_post.run()

    response = result.response

    tags = [
        'backend:sql' if should_use_sql_backend(domain) else 'backend:couch',
        u'domain:{}'.format(domain)
    ]
    datadog_counter('commcare.xform_submissions.count',
                    tags=tags +
                    ['status_code:{}'.format(response.status_code)])

    if response.status_code == 400:
        logging.error('Status code 400 for a form submission. '
                      'Response is: \n{0}\n')
    elif response.status_code == 201:

        datadog_gauge('commcare.xform_submissions.timings',
                      timer.duration,
                      tags=tags)
        # normalize over number of items (form or case) saved
        normalized_time = timer.duration / (1 + len(result.cases))
        datadog_gauge('commcare.xform_submissions.normalized_timings',
                      normalized_time,
                      tags=tags)
        datadog_counter('commcare.xform_submissions.case_count',
                        len(result.cases),
                        tags=tags)
        datadog_counter('commcare.xform_submissions.ledger_count',
                        len(result.ledgers),
                        tags=tags)

    return response