Example #1
def check_repeaters():
    start = datetime.utcnow()
    six_hours_sec = 6 * 60 * 60
    six_hours_later = start + timedelta(seconds=six_hours_sec)

    # Long timeout to allow all waiting repeat records to be iterated
    check_repeater_lock = get_redis_lock(
        CHECK_REPEATERS_KEY,
        timeout=six_hours_sec,
        name=CHECK_REPEATERS_KEY,
    )
    if not check_repeater_lock.acquire(blocking=False):
        datadog_counter("commcare.repeaters.check.locked_out")
        return

    try:
        with datadog_bucket_timer(
            "commcare.repeaters.check.processing",
            tags=[],
            timing_buckets=_check_repeaters_buckets,
        ):
            for record in iterate_repeat_records(start):
                if datetime.utcnow() > six_hours_later:
                    _soft_assert(False, "I've been iterating repeat records for six hours. I quit!")
                    break
                datadog_counter("commcare.repeaters.check.attempt_forward")
                record.attempt_forward_now()
    finally:
        check_repeater_lock.release()
Example #2
    def set_cached_payload_if_necessary(self, fileobj, duration, is_async):
        # only cache if the duration was longer than the threshold
        is_long_restore = duration > timedelta(seconds=INITIAL_SYNC_CACHE_THRESHOLD)
        if is_async or self.force_cache or is_long_restore:
            type_ = 'unknown'
            if is_async:
                type_ = 'async'
            elif self.force_cache:
                type_ = 'force'
            elif is_long_restore:
                type_ = 'long'

            tags = {
                'type:{}'.format(type_),
            }
            datadog_counter('commcare.restores.cache_writes', tags=tags)
            response = CachedResponse.save_for_later(
                fileobj,
                self.cache_timeout,
                self.domain,
                self.restore_user.user_id,
            )
            self.restore_payload_path_cache.set_value(response.name, self.cache_timeout)
            return response
        return None
Example #3
 def __del__(self):
     if self.track_unreleased and self.lock_timer.is_started():
         datadog_counter("commcare.lock.not_released", tags=self.tags)
     if self.lock_trace is not None:
         self.lock_trace.set_tag("deleted", "not_released")
         self.lock_trace.finish()
         self.lock_trace = None
Example #4
    def delete_old_images():
        start = datetime.utcnow()
        max_age = start - timedelta(days=90)
        db = get_blob_db()

        def _get_query(db_name, max_age=max_age):
            return BlobMeta.objects.using(db_name).filter(
                content_type='image/jpeg',
                type_code=CODES.form_attachment,
                domain='icds-cas',
                created_on__lt=max_age
            )

        run_again = False
        for db_name in get_db_aliases_for_partitioned_query():
            bytes_deleted = 0
            metas = list(_get_query(db_name)[:1000])
            if metas:
                for meta in metas:
                    bytes_deleted += meta.content_length or 0
                db.bulk_delete(metas=metas)
                datadog_counter('commcare.icds_images.bytes_deleted', value=bytes_deleted)
                datadog_counter('commcare.icds_images.count_deleted', value=len(metas))
                run_again = True

        if run_again:
            delete_old_images.delay()
Example #5
 def reconcile_actions_if_necessary(self, xform):
     if not self.check_action_order():
         datadog_counter("commcare.form_processor.couch.reconcile_actions")
         try:
             self.reconcile_actions(rebuild=True, xforms={xform.form_id: xform})
         except ReconciliationError:
             pass
Example #6
 def _record_batch_exception_in_datadog(self, processor):
     datadog_counter(
         "commcare.change_feed.batch_processor_exceptions",
         tags=[
             'pillow_name:{}'.format(self.get_name()),
             'processor:{}'.format(processor.__class__.__name__ if processor else "all_processors"),
         ])
Example #7
    def degraded(self):
        """Indicate that the lock has "degraded gracefully"

        The lock was not acquired, but processing continued as if it had
        been acquired.
        """
        datadog_counter("commcare.lock.degraded", tags=self.tags)
Example #8
def report_shard_failures(search_result):
    """Report es shard failures to datadog
    """
    if not isinstance(search_result, dict):
        return

    if search_result.get('_shards', {}).get('failed'):
        datadog_counter('commcare.es.partial_results', value=1)
Example #9
def reprocess_submission(submission_stub_id):
    with CriticalSection(['reprocess_submission_%s' % submission_stub_id]):
        try:
            stub = UnfinishedSubmissionStub.objects.get(id=submission_stub_id)
        except UnfinishedSubmissionStub.DoesNotExist:
            return

        reprocess_unfinished_stub(stub)
        datadog_counter('commcare.submission_reprocessing.count')
Example #10
@contextmanager
def maybe_not_found(throw=None):
    try:
        yield
    except ClientError as err:
        if not is_not_found(err):
            raise
        datadog_counter('commcare.blobdb.notfound')
        if throw is not None:
            raise throw
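
Example #42 below uses this context manager as a bare guard; here is a hedged sketch of a call site that also converts the error. The names key, path, and s3_bucket are assumptions for illustration, not taken from this snippet:

# Hypothetical call site: a missing S3 object increments
# commcare.blobdb.notfound and is re-raised as the given NotFound.
with maybe_not_found(throw=NotFound(key)):
    size = s3_bucket.Object(path).content_length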
Example #11
 def release(self):
     self.lock.release()
     if self.lock_timer.is_started():
         self.lock_timer.stop()
     if self.end_time and time.time() > self.end_time:
         datadog_counter("commcare.lock.released_after_timeout", tags=self.tags)
     if self.lock_trace is not None:
         self.lock_trace.finish()
         self.lock_trace = None
Example #12
 def _record_checkpoint_in_datadog(self):
     datadog_counter('commcare.change_feed.change_feed.checkpoint', tags=[
         'pillow_name:{}'.format(self.get_name()),
     ])
     checkpoint_sequence = self._normalize_checkpoint_sequence()
     for topic, value in six.iteritems(checkpoint_sequence):
         datadog_gauge('commcare.change_feed.checkpoint_offsets', value, tags=[
             'pillow_name:{}'.format(self.get_name()),
             _topic_for_ddog(topic),
         ])
Example #13
    def reconcile_transactions_if_necessary(self):
        if self.case.check_transaction_order():
            return False
        datadog_counter("commcare.form_processor.sql.reconciling_transactions")
        try:
            self.reconcile_transactions()
        except ReconciliationError as e:
            reconciliation_soft_assert(False, "ReconciliationError: %s" % e.message)

        return True
Example #14
def commit_migration(domain_name):
    domain_obj = Domain.get_by_name(domain_name, strict=True)
    domain_obj.use_sql_backend = True
    domain_obj.save()
    clear_local_domain_sql_backend_override(domain_name)
    if not should_use_sql_backend(domain_name):
        Domain.get_by_name.clear(Domain, domain_name)
        assert should_use_sql_backend(domain_name)
    datadog_counter("commcare.couch_sql_migration.total_committed")
    _logger.info("committed migration for {}".format(domain_name))
Example #15
    def _log_processed_docs_count(self, tags, throttled=False):
        if throttled and self.processed_docs < 100:
            return

        processed_docs = self.processed_docs
        self.processed_docs = 0

        datadog_counter("commcare.couchsqlmigration.processed_docs",
                        value=processed_docs,
                        tags=tags)
Example #16
    def get_context(self, record):
        from corehq.util.datadog.gauges import datadog_counter
        try:
            request = record.request
        except Exception:
            request = None

        request_repr = get_sanitized_request_repr(request)

        tb_list = []
        code = None
        if record.exc_info:
            etype, _value, tb = record.exc_info
            value = clean_exception(_value)
            tb_list = ['Traceback (most recent call first):\n']
            formatted_exception = traceback.format_exception_only(etype, value)
            tb_list.extend(formatted_exception)
            extracted_tb = list(reversed(traceback.extract_tb(tb)))
            code = self.get_code(extracted_tb)
            tb_list.extend(traceback.format_list(extracted_tb))
            stack_trace = '\n'.join(tb_list)
            subject = '%s: %s' % (record.levelname,
                                  formatted_exception[0].strip() if formatted_exception else record.getMessage())
        else:
            stack_trace = 'No stack trace available'
            subject = '%s: %s' % (
                record.levelname,
                record.getMessage()
            )
        context = defaultdict(lambda: '')
        context.update({
            'subject': self.format_subject(subject),
            'message': record.getMessage(),
            'details': getattr(record, 'details', None),
            'tb_list': tb_list,
            'request_repr': request_repr,
            'stack_trace': stack_trace,
            'code': code,
        })
        if request:
            sanitized_url = sanitize_url(request.build_absolute_uri())
            datadog_counter(ERROR_COUNT, tags=[
                'url:{}'.format(sanitized_url),
                'group:{}'.format(get_url_group(sanitized_url)),
                'domain:{}'.format(getattr(request, 'domain', DATADOG_UNKNOWN)),
            ])

            context.update({
                'get': list(request.GET.items()),
                'post': SafeExceptionReporterFilter().get_post_parameters(request),
                'method': request.method,
                'username': request.user.username if getattr(request, 'user', None) else "",
                'url': request.build_absolute_uri(),
            })
        return context
Example #17
        def _inner(*args, **kwargs):
            response = fn(*args, **kwargs)

            try:
                datadog_counter(
                    metric_name,
                    tags=['status_code:{}'.format(response.status_code)])
            except Exception:
                datadog_logger.exception('Unable to record Datadog stats')

            return response
Example #18
        def _inner(*args, **kwargs):
            response = fn(*args, **kwargs)

            try:
                datadog_counter(metric_name, tags=[
                    'status_code:{}'.format(response.status_code)
                ])
            except Exception:
                datadog_logger.exception('Unable to record Datadog stats')

            return response
Example #19
    def reconcile_transactions_if_necessary(self):
        if self.case.check_transaction_order():
            return False
        datadog_counter("commcare.form_processor.sql.reconcile_transactions")
        try:
            self.reconcile_transactions()
        except ReconciliationError as e:
            reconciliation_soft_assert(
                False, "ReconciliationError: %s" % six.text_type(e))

        return True
Example #20
 def _s3_bucket(self, create=False):
     if create and not self._s3_bucket_exists:
         try:
             self.db.meta.client.head_bucket(Bucket=self.s3_bucket_name)
         except ClientError as err:
             if not is_not_found(err):
                 datadog_counter('commcare.blobdb.notfound')
                 raise
             self.db.create_bucket(Bucket=self.s3_bucket_name)
         self._s3_bucket_exists = True
     return self.db.Bucket(self.s3_bucket_name)
Example #21
    def bulk_delete(self, metas):
        """Delete blob metadata in bulk

        :param metas: A list of `BlobMeta` objects.
        """
        if any(meta.id is None for meta in metas):
            raise ValueError("cannot delete unsaved BlobMeta")
        delete_blobs_sql = """
        WITH deleted AS (
            DELETE FROM blobs_blobmeta
            WHERE id IN %s
            RETURNING *
        ), ins AS (
            INSERT INTO blobs_deletedblobmeta (
                "id",
                "domain",
                "parent_id",
                "name",
                "key",
                "type_code",
                "created_on",
                "deleted_on"
            ) (
                SELECT
                    "id",
                    "domain",
                    "parent_id",
                    "name",
                    "key",
                    "type_code",
                    "created_on",
                    %s AS "deleted_on"
                FROM deleted
                WHERE expires_on IS NULL
            ) ON CONFLICT (id) DO UPDATE SET
                name = EXCLUDED.name,
                key = EXCLUDED.key,
                type_code = EXCLUDED.type_code,
                created_on = EXCLUDED.created_on,
                deleted_on = CLOCK_TIMESTAMP()
            WHERE blobs_deletedblobmeta.parent_id = EXCLUDED.parent_id and blobs_deletedblobmeta.key = EXCLUDED.key
        ) SELECT COUNT(*) FROM deleted;
        """
        now = _utcnow()
        parents = defaultdict(list)
        for meta in metas:
            parents[meta.parent_id].append(meta.id)
        for dbname, split_parent_ids in split_list_by_db_partition(parents):
            ids = tuple(m for p in split_parent_ids for m in parents[p])
            with BlobMeta.get_cursor_for_partition_db(dbname) as cursor:
                cursor.execute(delete_blobs_sql, [ids, now])
        deleted_bytes = sum(m.content_length for m in metas)
        datadog_counter('commcare.blobs.deleted.count', value=len(metas))
        datadog_counter('commcare.blobs.deleted.bytes', value=deleted_bytes)
Example #22
def commit_migration(domain_name):
    domain_obj = Domain.get_by_name(domain_name, strict=True)
    domain_obj.use_sql_backend = True
    domain_obj.save()
    clear_local_domain_sql_backend_override(domain_name)
    if not should_use_sql_backend(domain_name):
        Domain.get_by_name.clear(Domain, domain_name)
        assert should_use_sql_backend(domain_name), \
            "could not set use_sql_backend for domain %s (try again)" % domain_name
    datadog_counter("commcare.couch_sql_migration.total_committed")
    log.info("committed migration for {}".format(domain_name))
Example #23
def _rate_limit_exc(exc_info):
    exc_type, exc_value, tb = exc_info
    rate_limit_key = _get_rate_limit_key(exc_info)
    if not rate_limit_key:
        return False

    datadog_counter('commcare.sentry.errors.rate_limited',
                    tags=['service:{}'.format(rate_limit_key)])
    if is_pg_cancelled_query_exception(exc_value):
        datadog_counter('hq_custom.postgres.standby_query_cancellations')
    exponential_backoff_key = '{}_down'.format(rate_limit_key)
    return is_rate_limited(exponential_backoff_key)
Example #24
def _record_metrics(tags, submission_type, response, timer=None):
    tags += [
        'submission_type:{}'.format(submission_type),
        'status_code:{}'.format(response.status_code)
    ]

    if response.status_code == 201 and timer:
        tags += [
            'duration:%s' % bucket_value(timer.duration, (1, 5, 20, 60, 120, 300, 600), 's'),
        ]

    datadog_counter('commcare.xform_submissions.count', tags=tags)
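
The duration tag above comes from bucket_value, which maps a raw measurement onto a small fixed set of labels so the metric's tag cardinality stays bounded. A minimal sketch of such a helper, assuming this label format; the real corehq implementation may differ:

def bucket_value(value, buckets, unit=''):
    """Map a measurement to a coarse, low-cardinality tag label.

    e.g. bucket_value(42, (1, 5, 20, 60, 120, 300, 600), 's') -> 'lt_60s'
    """
    for cutoff in buckets:
        if value < cutoff:
            return 'lt_{}{}'.format(cutoff, unit)
    return 'over_{}{}'.format(buckets[-1], unit)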
Example #25
 def get(self, identifier=None, bucket=DEFAULT_BUCKET, key=None):
     if identifier is None and bucket == DEFAULT_BUCKET:
         path = self.get_path(key=key)
     else:
         # legacy: can be removed with old API
         assert key is None, key
         key = join(bucket, identifier)
         path = self.get_path(identifier, bucket)
     if not exists(path):
         datadog_counter('commcare.blobdb.notfound')
         raise NotFound(key)
     return open(path, "rb")
Example #26
 def _record_checkpoint_in_datadog(self):
     datadog_counter('commcare.change_feed.change_feed.checkpoint',
                     tags=[
                         'pillow_name:{}'.format(self.get_name()),
                     ])
     checkpoint_sequence = self._normalize_checkpoint_sequence()
     for topic, value in six.iteritems(checkpoint_sequence):
         datadog_gauge('commcare.change_feed.checkpoint_offsets',
                       value,
                       tags=[
                           'pillow_name:{}'.format(self.get_name()),
                           _topic_for_ddog(topic),
                       ])
Example #27
 def _s3_bucket(self, create=False):
     if create and not self._s3_bucket_exists:
         try:
             with self.report_timing('head_bucket', self.s3_bucket_name):
                 self.db.meta.client.head_bucket(Bucket=self.s3_bucket_name)
         except ClientError as err:
             if not is_not_found(err):
                 datadog_counter('commcare.blobdb.notfound')
                 raise
             with self.report_timing('create_bucket', self.s3_bucket_name):
                 self.db.create_bucket(Bucket=self.s3_bucket_name)
         self._s3_bucket_exists = True
     return self.db.Bucket(self.s3_bucket_name)
Example #28
def _rate_limit_submission(domain):

    allow_usage = submission_rate_limiter.allow_usage(domain)

    if allow_usage:
        submission_rate_limiter.report_usage(domain)
    else:
        datadog_counter('commcare.xform_submissions.rate_limited',
                        tags=[
                            'domain:{}'.format(domain),
                        ])

    return not allow_usage
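
A hypothetical caller for the helper above; returning HTTP 429 is an assumption about the view code, not taken from the source:

if _rate_limit_submission(domain):
    # the submission was counted as rate-limited above; reject it
    return HttpResponse(status=429)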
Example #29
    def delete(self, key, content_length):
        """Delete blob metadata

        Metadata for temporary blobs is deleted. Non-temporary metadata
        is retained to make it easier to track down missing blobs.

        :param key: Blob key string.
        :returns: The number of metadata rows deleted.
        """
        with get_cursor(BlobMeta) as cursor:
            cursor.execute('SELECT 1 FROM delete_blob_meta(%s)', [key])
        datadog_counter('commcare.blobs.deleted.count')
        datadog_counter('commcare.blobs.deleted.bytes', value=content_length)
Example #30
 def _send_timings(self, timing_context):
     metric_name_template = "commcare.%s.count"
     metric_name_template_normalized = "commcare.%s.count.normalized"
     for timing in timing_context.to_list():
         datadog_counter(
             metric_name_template % timing.full_name,
             tags=['duration:%s' % bucket_value(timing.duration, TIMING_BUCKETS)])
         normalize_denominator = getattr(timing, 'normalize_denominator', None)
         if normalize_denominator:
             datadog_counter(
                 metric_name_template_normalized % timing.full_name,
                 tags=['duration:%s' % bucket_value(timing.duration / normalize_denominator,
                                                    NORMALIZED_TIMING_BUCKETS)])
Example #31
def rate_limit_submission_by_delaying(domain, max_wait):
    if not submission_rate_limiter.allow_usage(domain):
        with TimingContext() as timer:
            acquired = submission_rate_limiter.wait(domain, timeout=max_wait)
        if acquired:
            duration_tag = bucket_value(timer.duration, [1, 5, 10, 15, 20], unit='s')
        else:
            duration_tag = 'timeout'
        datadog_counter('commcare.xform_submissions.rate_limited.test', tags=[
            'domain:{}'.format(domain),
            'duration:{}'.format(duration_tag)
        ])
    submission_rate_limiter.report_usage(domain)
Example #32
def _rate_limit_restore(domain):

    allow_usage = restore_rate_limiter.allow_usage(domain)

    if allow_usage:
        restore_rate_limiter.report_usage(domain)
    else:
        datadog_counter('commcare.restore.rate_limited',
                        tags=[
                            'domain:{}'.format(domain),
                        ])

    return not allow_usage
Example #33
 def _send_timings(self, timing_context):
     metric_name_template = "commcare.%s.count"
     metric_name_template_normalized = "commcare.%s.count.normalized"
     for timing in timing_context.to_list():
         datadog_counter(
             metric_name_template % timing.full_name,
             tags=['duration:%s' % bucket_value(timing.duration, TIMING_BUCKETS)])
         normalize_denominator = getattr(timing, 'normalize_denominator', None)
         if normalize_denominator:
             datadog_counter(
                 metric_name_template_normalized % timing.full_name,
                 tags=['duration:%s' % bucket_value(timing.duration / normalize_denominator,
                                                    NORMALIZED_TIMING_BUCKETS)])
Example #34
def rate_limit_two_factor_setup(device):
    """
    This holds attempts per user AND attempts per IP below limits

    given by two_factor_setup_rate_limiter.
    And keeps total requests below limits given by global_two_factor_setup_rate_limiter.

    Requests without an IP are rejected (unusual).
    If a device has no username attached or if it is not a PhoneDevice,
    then those requests are also rejected.

    """
    _status_rate_limited = 'rate_limited'
    _status_bad_request = 'bad_request'
    _status_accepted = 'accepted'

    def get_ip_address():
        request = get_request()
        if request:
            return get_ip(request)
        else:
            return None

    _report_current_global_two_factor_setup_rate_limiter()

    ip_address = get_ip_address()
    username = device.user.username
    method = device.method if isinstance(device, PhoneDevice) else None

    if ip_address and username and method:
        if two_factor_setup_rate_limiter.allow_usage('ip:{}'.format(ip_address)) \
                and two_factor_setup_rate_limiter.allow_usage('user:{}'.format(username)) \
                and global_two_factor_setup_rate_limiter.allow_usage():
            two_factor_setup_rate_limiter.report_usage(
                'ip:{}'.format(ip_address))
            two_factor_setup_rate_limiter.report_usage(
                'user:{}'.format(username))
            global_two_factor_setup_rate_limiter.report_usage()
            status = _status_accepted
        else:
            status = _status_rate_limited
    else:
        status = _status_bad_request

    datadog_counter('commcare.two_factor.setup_requests',
                    1,
                    tags=[
                        'status:{}'.format(status),
                        'method:{}'.format(method),
                    ])
    return status != _status_accepted
Example #35
@contextmanager
def silence_and_report_error(message, datadog_metric):
    """
    Prevent a piece of code from ever causing 500s if it errors

    Instead, report the issue to sentry and track the overall count on datadog
    """

    try:
        yield
    except Exception:
        notify_exception(None, message)
        datadog_counter(datadog_metric)
        if settings.UNIT_TESTING:
            raise
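
A hedged example of guarding a non-critical call with the context manager above; the wrapped function and the metric name are illustrative only:

with silence_and_report_error("Error updating analytics",
                              'commcare.analytics.update_errors'):
    update_analytics(request)  # hypothetical non-critical call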
Example #36
def _record_metrics(tags, submission_type, response, result=None, timer=None):
    tags += [
        'submission_type:{}'.format(submission_type),
        'status_code:{}'.format(response.status_code)
    ]

    if response.status_code == 201 and timer and result:
        tags += [
            'duration:%s' % bucket_value(timer.duration, (5, 10, 20), 's'),
            'case_count:%s' % bucket_value(len(result.cases), (2, 5, 10)),
            'ledger_count:%s' % bucket_value(len(result.ledgers), (2, 5, 10)),
        ]

    datadog_counter('commcare.xform_submissions.count', tags=tags)
Example #37
@contextmanager
def _commit_timing(queryset):
    # only send to datadog on initial query evaluation
    commit = queryset._mptt_set._result_cache is None
    try:
        yield
    finally:
        if commit and queryset._mptt_set._result_cache is not None:
            timing = queryset._timing
            for key in timing.timers:
                bucket = bucket_value(timing.duration(key), TIME_BUCKETS, "s")
                datadog_counter(
                    'commcare.locations.%s.%s.count' % (timing.name, key),
                    tags=['duration:%s' % bucket],
                )
Example #38
def _record_metrics(tags, submission_type, response, result=None, timer=None):
    tags += [
        'submission_type:{}'.format(submission_type),
        'status_code:{}'.format(response.status_code)
    ]

    if response.status_code == 201 and timer and result:
        tags += [
            'duration:%s' % bucket_value(timer.duration, (5, 10, 20), 's'),
            'case_count:%s' % bucket_value(len(result.cases), (2, 5, 10)),
            'ledger_count:%s' % bucket_value(len(result.ledgers), (2, 5, 10)),
        ]

    datadog_counter('commcare.xform_submissions.count', tags=tags)
Example #39
def save_document(doc_ids):
    lock_keys = []
    for doc_id in doc_ids:
        lock_keys.append(get_async_indicator_modify_lock_key(doc_id))

    indicator_config_ids = None
    timer = TimingContext()
    with CriticalSection(lock_keys):
        indicators = AsyncIndicator.objects.filter(doc_id__in=doc_ids)
        if not indicators:
            return

        first_indicator = indicators[0]
        processed_indicators = []
        failed_indicators = []

        for i in indicators:
            assert i.domain == first_indicator.domain
            assert i.doc_type == first_indicator.doc_type

        indicator_by_doc_id = {i.doc_id: i for i in indicators}
        doc_store = get_document_store(first_indicator.domain,
                                       first_indicator.doc_type)
        indicator_config_ids = first_indicator.indicator_config_ids

        with timer:
            for doc in doc_store.iter_documents(indicator_by_doc_id.keys()):
                indicator = indicator_by_doc_id[doc['_id']]
                successfully_processed, to_remove = _save_document_helper(
                    indicator, doc)
                if successfully_processed:
                    processed_indicators.append(indicator.pk)
                else:
                    failed_indicators.append((indicator, to_remove))

        num_processed = len(processed_indicators)
        num_failed = len(failed_indicators)
        AsyncIndicator.objects.filter(pk__in=processed_indicators).delete()
        with transaction.atomic():
            for indicator, to_remove in failed_indicators:
                indicator.update_failure(to_remove)
                indicator.save()

    datadog_counter('commcare.async_indicator.processed_success',
                    num_processed)
    datadog_counter('commcare.async_indicator.processed_fail', num_failed)
    datadog_histogram('commcare.async_indicator.processing_time',
                      timer.duration,
                      tags=[u'config_ids:{}'.format(indicator_config_ids)])
Example #40
def report_and_fail_on_shard_failures(search_result):
    """
    Raise an ESShardFailure if there are shard failures in an ES search result (JSON)

    and report to datadog.
    The commcare.es.partial_results metric counts 1 per ES request with any shard failure.
    """
    if not isinstance(search_result, dict):
        return

    if search_result.get('_shards', {}).get('failed'):
        datadog_counter('commcare.es.partial_results', value=1)
        # Example message:
        #   "_shards: {'successful': 4, 'failed': 1, 'total': 5}"
        raise ESShardFailure('_shards: {!r}'.format(search_result.get('_shards')))
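
A sketch of how a caller might apply this check. The es client call is an assumption (elasticsearch-py style); only report_and_fail_on_shard_failures and ESShardFailure come from the snippet above:

results = es.search(index='xforms', body=query)  # hypothetical ES call
report_and_fail_on_shard_failures(results)  # raises ESShardFailure on partial results
hits = results['hits']['hits']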
Example #41
def celery_record_time_to_start(task_id=None, task=None, **kwargs):
    from corehq.util.datadog.gauges import datadog_gauge, datadog_counter

    tags = [
        'celery_task_name:{}'.format(task.name),
        'celery_queue:{}'.format(task.queue),
    ]

    timer = TimeToStartTimer(task_id)
    try:
        time_to_start = timer.stop_and_pop_timing()
    except TimingNotAvailable:
        datadog_counter('commcare.celery.task.time_to_start_unavailable', tags=tags)
    else:
        datadog_gauge('commcare.celery.task.time_to_start', time_to_start.total_seconds(), tags=tags)
Example #42
 def bulk_delete(self, paths):
     success = True
     for chunk in chunked(paths, 1000):
         objects = [{"Key": path} for path in chunk]
         s3_bucket = self._s3_bucket()
         deleted_bytes = 0
         for path in chunk:
             with maybe_not_found():
                 deleted_bytes += s3_bucket.Object(path).content_length
         resp = s3_bucket.delete_objects(Delete={"Objects": objects})
         deleted = set(d["Key"] for d in resp.get("Deleted", []))
         success = success and all(o["Key"] in deleted for o in objects)
         datadog_counter('commcare.blobs.deleted.count', value=len(deleted))
         datadog_counter('commcare.blobs.deleted.bytes', value=deleted_bytes)
     return success
Example #43
 def bulk_delete(self, paths):
     success = True
     deleted_count = 0
     deleted_bytes = 0
     for path in paths:
         if not exists(path):
             success = False
         else:
             cs = _count_size(path)
             deleted_count += cs.count
             deleted_bytes += cs.size
             os.remove(path)
     datadog_counter('commcare.blobs.deleted.count', value=deleted_count)
     datadog_counter('commcare.blobs.deleted.bytes', value=deleted_bytes)
     return success
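
The _count_size helper used above (and again in Example #48) is not shown in this corpus. A minimal sketch consistent with both call sites, which pass either a single blob file or a whole bucket directory; the real helper may differ:

import os
from collections import namedtuple

Size = namedtuple('Size', 'count size')

def _count_size(path):
    # Count files and total bytes under `path`, which may be a
    # single blob file or a directory tree.
    if os.path.isfile(path):
        return Size(1, os.path.getsize(path))
    count = size = 0
    for root, _dirs, files in os.walk(path):
        for name in files:
            count += 1
            size += os.path.getsize(os.path.join(root, name))
    return Size(count, size)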
Example #44
    def should_capture(self, exc_info):
        ex_value = exc_info[1]
        capture = getattr(ex_value, 'sentry_capture', True)
        if not capture:
            return False

        if not super(HQSentryClient, self).should_capture(exc_info):
            return False

        rate_limit_key = _get_rate_limit_key(exc_info)
        if rate_limit_key:
            datadog_counter('commcare.sentry.errors.rate_limited',
                            tags=['service:{}'.format(rate_limit_key)])
            return not _is_rate_limited(rate_limit_key)
        return True
Example #45
 def _record_transient_bounce(self, aws_meta, uid):
     exists = TransientBounceEmail.objects.filter(
         email=aws_meta.email,
         timestamp=aws_meta.timestamp,
     ).exists()
     if not exists:
         TransientBounceEmail.objects.create(
             email=aws_meta.email,
             timestamp=aws_meta.timestamp,
             headers=aws_meta.headers,
         )
     if self.delete_processed_messages:
         self._delete_message_with_uid(uid)
     datadog_counter(
         'commcare.bounced_email_manager.transient_bounce_recorded')
Example #46
def report_and_fail_on_shard_failures(search_result):
    """
    Raise an ESShardFailure if there are shard failures in an ES search result (JSON)

    and report to datadog.
    The commcare.es.partial_results metric counts 1 per ES request with any shard failure.
    """
    if not isinstance(search_result, dict):
        return

    if search_result.get('_shards', {}).get('failed'):
        datadog_counter('commcare.es.partial_results', value=1)
        # Example message:
        #   "_shards: {'successful': 4, 'failed': 1, 'total': 5}"
        raise ESShardFailure('_shards: {!r}'.format(search_result.get('_shards')))
Example #47
def _delay_and_report_rate_limit_submission(domain, max_wait, datadog_metric):
    with TimingContext() as timer:
        acquired = submission_rate_limiter.wait(domain, timeout=max_wait)
    if acquired:
        duration_tag = bucket_value(timer.duration, [.5, 1, 5, 10, 15], unit='s')
    elif timer.duration < max_wait:
        duration_tag = 'quick_reject'
    else:
        duration_tag = 'delayed_reject'
    datadog_counter(datadog_metric, tags=[
        f'domain:{domain}',
        f'duration:{duration_tag}',
        f'throttle_method:{"delay" if acquired else "reject"}'
    ])
    return acquired
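
A hypothetical call site for the helper above; the metric name mirrors Example #28 and the 15-second wait is illustrative:

acquired = _delay_and_report_rate_limit_submission(
    domain, max_wait=15,
    datadog_metric='commcare.xform_submissions.rate_limited')
if not acquired:
    return HttpResponse(status=429)  # hypothetical rejection path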
Example #48
 def delete(self, *args, **kw):
     identifier, bucket = self.get_args_for_delete(*args, **kw)
     if identifier is None:
         path = safejoin(self.rootdir, bucket)
         remove = shutil.rmtree
     else:
         path = self.get_path(identifier, bucket)
         remove = os.remove
     if not exists(path):
         return False
     cs = _count_size(path)
     datadog_counter('commcare.blobs.deleted.count', value=cs.count)
     datadog_counter('commcare.blobs.deleted.bytes', value=cs.size)
     remove(path)
     return True
Example #49
    def __record_change_metric_in_datadog(self, metric, change, timer=None):
        if change.metadata is not None:
            tags = [
                u'datasource:{}'.format(change.metadata.data_source_name),
                u'document_type:{}'.format(change.metadata.document_type),
                u'domain:{}'.format(change.metadata.domain),
                u'is_deletion:{}'.format(change.metadata.is_deletion),
                u'pillow_name:{}'.format(self.get_name())
            ]
            datadog_counter(metric, tags=tags)

            if timer:
                datadog_gauge('commcare.change_feed.processing_time',
                              timer.duration,
                              tags=tags)
Example #50
    def bulk_delete(self, metas):
        """Delete blob metadata in bulk

        :param metas: A list of `BlobMeta` objects.
        """
        if any(meta.id is None for meta in metas):
            raise ValueError("cannot delete unsaved BlobMeta")
        parents = defaultdict(list)
        for meta in metas:
            parents[meta.parent_id].append(meta.id)
        for db_name, split_parent_ids in split_list_by_db_partition(parents):
            ids = chain.from_iterable(parents[x] for x in split_parent_ids)
            BlobMeta.objects.using(db_name).filter(id__in=list(ids)).delete()
        deleted_bytes = sum(m.content_length for m in metas)
        datadog_counter('commcare.blobs.deleted.count', value=len(metas))
        datadog_counter('commcare.blobs.deleted.bytes', value=deleted_bytes)
Example #51
def celery_record_time_to_start(task_id=None, task=None, **kwargs):
    from corehq.util.datadog.gauges import datadog_gauge, datadog_counter
    time_sent = cache.get('task.{}.time_sent'.format(task_id))
    tags = [
        'celery_task_name:{}'.format(task.name),
        'celery_queue:{}'.format(task.queue),
    ]
    if time_sent:
        time_to_start = (datetime.datetime.utcnow() -
                         time_sent).total_seconds()
        datadog_gauge('commcare.celery.task.time_to_start',
                      time_to_start,
                      tags=tags)
    else:
        datadog_counter('commcare.celery.task.time_to_start_unavailable',
                        tags=tags)
Example #52
    def bulk_delete(self, metas):
        """Delete blob metadata in bulk

        :param metas: A list of `BlobMeta` objects.
        """
        if any(meta.id is None for meta in metas):
            raise ValueError("cannot delete unsaved BlobMeta")
        delete_blobs_sql = """
        WITH deleted AS (
            DELETE FROM blobs_blobmeta
            WHERE id IN %s
            RETURNING *
        ), ins AS (
            INSERT INTO blobs_deletedblobmeta (
                "id",
                "domain",
                "parent_id",
                "name",
                "key",
                "type_code",
                "created_on",
                "deleted_on"
            )
            SELECT
                "id",
                "domain",
                "parent_id",
                "name",
                "key",
                "type_code",
                "created_on",
                %s AS "deleted_on"
            FROM deleted
            WHERE expires_on IS NULL
        ) SELECT COUNT(*) FROM deleted;
        """
        now = _utcnow()
        parents = defaultdict(list)
        for meta in metas:
            parents[meta.parent_id].append(meta.id)
        for dbname, split_parent_ids in split_list_by_db_partition(parents):
            ids = tuple(m for p in split_parent_ids for m in parents[p])
            with connections[dbname].cursor() as cursor:
                cursor.execute(delete_blobs_sql, [ids, now])
        deleted_bytes = sum(m.content_length for m in metas)
        datadog_counter('commcare.blobs.deleted.count', value=len(metas))
        datadog_counter('commcare.blobs.deleted.bytes', value=deleted_bytes)
Example #53
 def put(self, meta):
     """Save `BlobMeta` in the metadata database"""
     meta.save()
     length = meta.content_length
     datadog_counter('commcare.blobs.added.count')
     datadog_counter('commcare.blobs.added.bytes', value=length)
     if meta.expires_on is not None:
         datadog_counter('commcare.temp_blobs.count')
         datadog_counter('commcare.temp_blobs.bytes_added', value=length)
Example #54
 def handle_exception(exception, config_id, doc, adapter):
     metric = None
     if isinstance(exception, (ProtocolError, ReadTimeout)):
         metric = 'commcare.async_indicator.riak_error'
     elif isinstance(exception, (ESError, ConnectionTimeout)):
         # a database had an issue so log it and go on to the next document
         metric = 'commcare.async_indicator.es_error'
     elif isinstance(exception, (DatabaseError, InternalError)):
         # a database had an issue so log it and go on to the next document
         metric = 'commcare.async_indicator.psql_error'
     else:
         # getting the config could fail before the adapter is set
         if adapter:
             adapter.handle_exception(doc, exception)
     if metric:
         datadog_counter(metric, 1,
             tags=['config_id:{}'.format(config_id),
                   'doc_id:{}'.format(doc['_id'])])
Example #55
    def should_capture(self, exc_info):
        ex_value = exc_info[1]
        capture = getattr(ex_value, 'sentry_capture', True)
        if not capture:
            return False

        if not super(HQSentryClient, self).should_capture(exc_info):
            return False

        rate_limit_key = _get_rate_limit_key(exc_info)
        if rate_limit_key:
            datadog_counter('commcare.sentry.errors.rate_limited', tags=[
                'service:{}'.format(rate_limit_key)
            ])
            exponential_backoff_key = '{}_down'.format(rate_limit_key)
            return not is_rate_limited(exponential_backoff_key)
        return True
Example #56
def remove_from_queue(queued_sms):
    with transaction.atomic():
        sms = SMS()
        for field in sms._meta.fields:
            if field.name != 'id':
                setattr(sms, field.name, getattr(queued_sms, field.name))
        queued_sms.delete()
        sms.save()

    sms.publish_change()

    if sms.direction == OUTGOING and sms.processed and not sms.error:
        create_billable_for_sms(sms)
        datadog_counter('commcare.sms.outbound_succeeded')
    elif sms.direction == OUTGOING:
        datadog_counter('commcare.sms.outbound_failed')
    elif sms.direction == INCOMING and sms.domain and domain_has_privilege(sms.domain, privileges.INBOUND_SMS):
        create_billable_for_sms(sms)
Example #57
    def __record_change_metric_in_datadog(self, metric, change, processor=None, processing_time=None):
        if change.metadata is not None:
            tags = [
                'datasource:{}'.format(change.metadata.data_source_name),
                'is_deletion:{}'.format(change.metadata.is_deletion),
                'pillow_name:{}'.format(self.get_name()),
                'processor:{}'.format(processor.__class__.__name__ if processor else "all_processors"),
            ]
            count = 1 if processor else len(self.processors)
            datadog_counter(metric, value=count, tags=tags)

            change_lag = (datetime.utcnow() - change.metadata.publish_timestamp).total_seconds()
            datadog_gauge('commcare.change_feed.change_lag', change_lag, tags=[
                'pillow_name:{}'.format(self.get_name()),
                _topic_for_ddog(change.topic),
            ])

            if processing_time:
                datadog_histogram('commcare.change_feed.processing_time', processing_time, tags=tags)
Example #58
def handle_pillow_error(pillow, change, exception):
    from pillow_retry.models import PillowError

    pillow_logging.exception("[%s] Error on change: %s, %s" % (
        pillow.get_name(),
        change['id'],
        exception,
    ))

    datadog_counter('commcare.change_feed.changes.exceptions', tags=[
        'pillow_name:{}'.format(pillow.get_name()),
    ])

    # keep track of error attempt count
    change.increment_attempt_count()

    # always retry document missing errors, because the error is likely with couch
    if pillow.retry_errors or isinstance(exception, DocumentMissingError):
        error = PillowError.get_or_create(change, pillow)
        error.add_attempt(exception, sys.exc_info()[2], change.metadata)
        error.save()
Example #59
def _submission_error(request, message, count_metric, metric_tags,
        domain, app_id, user_id, authenticated, meta=None, status=400,
        notify=True):
    """Notify exception, datadog count, record metrics, construct response

    :param status: HTTP status code (default: 400).
    :returns: HTTP response object
    """
    details = [
        "domain:{}".format(domain),
        "app_id:{}".format(app_id),
        "user_id:{}".format(user_id),
        "authenticated:{}".format(authenticated),
        "form_meta:{}".format(meta or {}),
    ]
    datadog_counter(count_metric, tags=details)
    if notify:
        notify_exception(request, message, details)
    response = HttpResponseBadRequest(
        message, status=status, content_type="text/plain")
    _record_metrics(metric_tags, 'unknown', response)
    return response
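
All of the examples above assume the same counter helper. A minimal dogstatsd-backed shim with a compatible shape; the real helper lives in corehq.util.datadog.gauges and may differ:

from datadog import statsd

def datadog_counter(name, value=1, tags=None):
    # Minimal sketch: increment a dogstatsd counter, with tags given
    # as a list of 'key:value' strings as in the examples above.
    statsd.increment(name, value, tags=tags)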