def test_configurable_retry_count_resets(monkeypatch):
    logger = MockLogger()

    default_failing_function = FailingFunction(
        ValueError, max_executions=3, delay=.1)

    exc_callback = lambda: log_uncaught_errors(logger)

    retry(default_failing_function, exc_callback=exc_callback,
          max_count=3, backoff_delay=0)()

    assert logger.call_count == 2
    assert default_failing_function.call_count == 3

    logger = MockLogger()

    failing_function = FailingFunction(ValueError, max_executions=5, delay=.1)

    exc_callback = lambda: log_uncaught_errors(logger)

    retry(failing_function, exc_callback=exc_callback, max_count=5,
          backoff_delay=0)()

    assert logger.call_count == 4
    assert failing_function.call_count == 5
    def _run(self):
        with self.semaphore:
            log = logger.new(
                record_id=self.record_id, action_log_id=self.action_log_id,
                action=self.action_name, account_id=self.account_id,
                extra_args=self.extra_args)

            for _ in range(ACTION_MAX_NR_OF_RETRIES):
                try:
                    before_func = datetime.utcnow()
                    if self.extra_args:
                        self.func(self.account_id, self.record_id,
                                  self.extra_args)
                    else:
                        self.func(self.account_id, self.record_id)
                    after_func = datetime.utcnow()

                    with session_scope(self.account_id) as db_session:
                        action_log_entry = db_session.query(ActionLog).get(
                            self.action_log_id)
                        action_log_entry.status = 'successful'
                        db_session.commit()
                        latency = round((datetime.utcnow() -
                                         action_log_entry.created_at).
                                        total_seconds(), 2)
                        func_latency = round((after_func - before_func).
                                             total_seconds(), 2)
                        log.info('syncback action completed',
                                 action_id=self.action_log_id,
                                 latency=latency,
                                 process=self.parent_service.process_number,
                                 func_latency=func_latency)
                        self._log_to_statsd(action_log_entry.status, latency)
                        return
                except Exception:
                    log_uncaught_errors(log, account_id=self.account_id,
                                        provider=self.provider)
                    with session_scope(self.account_id) as db_session:
                        action_log_entry = db_session.query(ActionLog).get(
                            self.action_log_id)
                        action_log_entry.retries += 1
                        if (action_log_entry.retries ==
                                ACTION_MAX_NR_OF_RETRIES):
                            log.critical('Max retries reached, giving up.',
                                         exc_info=True)
                            action_log_entry.status = 'failed'
                            self._log_to_statsd(action_log_entry.status)
                            db_session.commit()
                            return
                        db_session.commit()

                # Wait before retrying
                log.info("Syncback worker retrying action after sleeping",
                         duration=self.retry_interval)

                # TODO(T6974): We might want to do some kind of exponential
                # backoff with jitter to avoid the thundering herd problem if a
                # provider suddenly starts having issues for a short period of
                # time.
                gevent.sleep(self.retry_interval)
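
The TODO above already names the fix: exponential backoff with jitter. A minimal sketch of what that could look like, assuming the same gevent-based retry loop; the jittered_delay helper is illustrative and not part of the original code.

import random

def jittered_delay(base_interval, attempt, cap=300):
    # Exponential backoff capped at `cap` seconds, with full jitter so that
    # many workers retrying against a struggling provider do not all wake up
    # at once (the thundering herd problem the TODO refers to).
    return random.uniform(0, min(cap, base_interval * (2 ** attempt)))

# Inside the retry loop, each iteration could then sleep for a jittered
# duration instead of the fixed interval, e.g.
#     gevent.sleep(jittered_delay(self.retry_interval, attempt))
# where `attempt` is the loop index.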
Example #3
    def poll(self, event):
        # Determine which accounts to sync
        start_accounts = self.account_ids_to_sync()
        statsd_client.gauge(
            'mailsync.account_counts.{}.mailsync-{}.count'.format(
                self.host, self.process_number), len(start_accounts))

        # Perform the appropriate action on each account
        for account_id in start_accounts:
            if account_id not in self.syncing_accounts:
                try:
                    self.start_sync(account_id)
                except OperationalError:
                    self.log.error('Database error starting account sync',
                                   exc_info=True)
                    log_uncaught_errors()

        stop_accounts = self.account_ids_owned() - set(start_accounts)
        for account_id in stop_accounts:
            self.log.info('sync service stopping sync',
                          account_id=account_id)
            try:
                self.stop_sync(account_id)
            except OperationalError:
                self.log.error('Database error stopping account sync',
                               exc_info=True)
                log_uncaught_errors()
Example #4
    def poll(self):
        if self.stealing_enabled:
            r = self.queue_client.claim_next(self.process_identifier)
            if r:
                self.log.info('Claimed new account sync', account_id=r)

        # Determine which accounts to sync
        start_accounts = self.accounts_to_sync()
        statsd_client.gauge(
            'accounts.{}.mailsync-{}.count'.format(self.host, self.cpu_id),
            len(start_accounts))

        # Perform the appropriate action on each account
        for account_id in start_accounts:
            if account_id not in self.syncing_accounts:
                try:
                    self.start_sync(account_id)
                except OperationalError:
                    self.log.error('Database error starting account sync',
                                   exc_info=True)
                    log_uncaught_errors()

        stop_accounts = self.syncing_accounts - set(start_accounts)
        for account_id in stop_accounts:
            self.log.info('sync service stopping sync',
                          account_id=account_id)
            try:
                self.stop_sync(account_id)
            except OperationalError:
                self.log.error('Database error stopping account sync',
                               exc_info=True)
                log_uncaught_errors()
Example #5
    def _run(self):
        """
        Index into CloudSearch the contacts of all namespaces.

        """
        try:
            for key in engine_manager.engines:
                with session_scope_by_shard_id(key) as db_session:
                    pointer = db_session.query(ContactSearchIndexCursor).first()
                    if pointer:
                        self.transaction_pointers[key] = pointer.transaction_id
                    else:
                        # Never start from 0; if the service hasn't
                        # run before start from the latest
                        # transaction, with the expectation that a
                        # backfill will be run separately.
                        latest_transaction = (
                            db_session.query(Transaction).order_by(desc(Transaction.created_at)).first()
                        )
                        if latest_transaction:
                            self.transaction_pointers[key] = latest_transaction.id
                        else:
                            self.transaction_pointers[key] = 0

            self.log.info("Starting contact-search-index service", transaction_pointers=self.transaction_pointers)

            while True:
                for key in engine_manager.engines:
                    with session_scope_by_shard_id(key) as db_session:
                        transactions = (
                            db_session.query(Transaction)
                            .filter(
                                Transaction.id > self.transaction_pointers[key], Transaction.object_type == "contact"
                            )
                            .with_hint(Transaction, "USE INDEX (ix_transaction_table_name)")
                            .order_by(asc(Transaction.id))
                            .limit(self.chunk_size)
                            .options(joinedload(Transaction.namespace))
                            .all()
                        )

                        # index up to chunk_size transactions
                        should_sleep = False
                        if transactions:
                            self.index(transactions, db_session)
                            oldest_transaction = min(transactions, key=lambda t: t.created_at)
                            current_timestamp = datetime.utcnow()
                            latency = (current_timestamp - oldest_transaction.created_at).seconds
                            self._report_transactions_latency(latency)
                            new_pointer = transactions[-1].id
                            self.update_pointer(new_pointer, key, db_session)
                            db_session.commit()
                        else:
                            should_sleep = True
                    if should_sleep:
                        log.info("sleeping")
                        sleep(self.poll_interval)
        except Exception:
            log_uncaught_errors(log)
Example #6
    def poll(self):
        # We really don't want to take on more load than we can bear, so we
        # need to check the CPU usage before accepting new accounts.
        # Note that we can't check this for the current core because the kernel
        # transparently moves programs across cores.
        usage_per_cpu = psutil.cpu_percent(percpu=True)
        self.rolling_cpu_counts.append(usage_per_cpu)

        cpu_averages = self._compute_cpu_average()

        cpus_over_nominal = all([cpu_usage > NOMINAL_THRESHOLD for cpu_usage in cpu_averages])

        # Conservatively, stop accepting accounts if the CPU usage is over
        # NOMINAL_THRESHOLD for every core, or if the total # of accounts
        # being synced by a single process exceeds the threshold. Excessive
        # concurrency per process can result in lowered database throughput
        # or availability problems, since many transactions may be held open
        # at the same time.
        if self.stealing_enabled and not cpus_over_nominal and \
                len(self.syncing_accounts) < MAX_ACCOUNTS_PER_PROCESS:
            r = self.queue_client.claim_next(self.process_identifier)
            if r:
                self.log.info('Claimed new account sync', account_id=r)
        else:
            if not self.stealing_enabled:
                reason = 'stealing disabled'
            elif cpus_over_nominal:
                reason = 'CPU too high'
            else:
                reason = 'reached max accounts for process'
            self.log.info('Not claiming new account sync', reason=reason)

        # Determine which accounts to sync
        start_accounts = self.accounts_to_sync()
        statsd_client.gauge(
            'accounts.{}.mailsync-{}.count'.format(
                self.host, self.process_number), len(start_accounts))

        # Perform the appropriate action on each account
        for account_id in start_accounts:
            if account_id not in self.syncing_accounts:
                try:
                    self.start_sync(account_id)
                except OperationalError:
                    self.log.error('Database error starting account sync',
                                   exc_info=True)
                    log_uncaught_errors()

        stop_accounts = self.syncing_accounts - set(start_accounts)
        for account_id in stop_accounts:
            self.log.info('sync service stopping sync',
                          account_id=account_id)
            try:
                self.stop_sync(account_id)
            except OperationalError:
                self.log.error('Database error stopping account sync',
                               exc_info=True)
                log_uncaught_errors()
Example #7
    def poll(self):
        # We really don't want to take on more load than we can bear, so we
        # need to check the CPU usage before accepting new accounts.
        # Note that we can't check this for the current core because the kernel
        # transparently moves programs across cores.
        usage_per_cpu = psutil.cpu_percent(percpu=True)
        self.rolling_cpu_counts.append(usage_per_cpu)

        cpu_averages = self._compute_cpu_average()

        has_overloaded_cpus = all([cpu_usage > 90.0 for cpu_usage in cpu_averages])
        cpus_over_nominal = all([cpu_usage > 85.0 for cpu_usage in cpu_averages])

        # Conservatively, stop accepting accounts if the CPU usage is over 85%
        # for every core.
        if self.stealing_enabled and not cpus_over_nominal:
            r = self.queue_client.claim_next(self.process_identifier)
            if r:
                self.log.info('Claimed new account sync', account_id=r)

        if has_overloaded_cpus:
            # Unload a single account.
            acc = self._pick_account()

            if acc is not None:
                self.log.info('Overloaded CPU, unloading account',
                              account_id=acc)
                self.stop_sync(acc)
            else:
                self.log.error("Couldn't find an account to unload!")

        # Determine which accounts to sync
        start_accounts = self.accounts_to_sync()
        statsd_client.gauge(
            'accounts.{}.mailsync-{}.count'.format(self.host, self.cpu_id),
            len(start_accounts))

        # Perform the appropriate action on each account
        for account_id in start_accounts:
            if account_id not in self.syncing_accounts:
                try:
                    self.start_sync(account_id)
                except OperationalError:
                    self.log.error('Database error starting account sync',
                                   exc_info=True)
                    log_uncaught_errors()

        stop_accounts = self.syncing_accounts - set(start_accounts)
        for account_id in stop_accounts:
            self.log.info('sync service stopping sync',
                          account_id=account_id)
            try:
                self.stop_sync(account_id)
            except OperationalError:
                self.log.error('Database error stopping account sync',
                               exc_info=True)
                log_uncaught_errors()
Example #8
    def callback(e):
        if isinstance(e, TRANSIENT_NETWORK_ERRS):
            occurrences[0] += 1
            if occurrences[0] < 20:
                return
        else:
            occurrences[0] = 1
        log_uncaught_errors(logger, account_id=account_id, provider=provider,
                            error_type=type(e).__name__, occurrences=occurrences[0])
def delete_marked_accounts(shard_id, throttle=False, dry_run=False):
    start = time.time()
    deleted_count = 0
    ids_to_delete = []

    with session_scope_by_shard_id(shard_id) as db_session:
        ids_to_delete = [(acc.id, acc.namespace.id) for acc
                         in db_session.query(Account) if acc.is_deleted]

    queue_size = len(ids_to_delete)
    for account_id, namespace_id in ids_to_delete:
        # queue_size = length of queue
        # deleted_count = number of accounts deleted during loop iteration
        # this is necessary because the length of ids_to_delete doesn't
        # change during loop iteration
        statsd_client.gauge('mailsync.{}.account_deletion.queue.length'
                            .format(shard_id),
                            queue_size - deleted_count)
        try:
            with session_scope(namespace_id) as db_session:
                account = db_session.query(Account).get(account_id)
                if not account:
                    log.critical('Account does not exist',
                                 account_id=account_id)
                    continue

                if account.sync_should_run or not account.is_deleted:
                    log.warn('Account NOT marked for deletion. '
                             'Will not delete', account_id=account_id)
                    continue

            log.info('Deleting account', account_id=account_id)
            start_time = time.time()
            # Delete data in database
            try:
                log.info('Deleting database data', account_id=account_id)
                delete_namespace(account_id, namespace_id, throttle=throttle,
                                 dry_run=dry_run)
            except Exception as e:
                log.critical('Database data deletion failed', error=e,
                             account_id=account_id)
                continue

            # Delete liveness data
            log.debug('Deleting liveness data', account_id=account_id)
            clear_heartbeat_status(account_id)
            deleted_count += 1
            statsd_client.incr('mailsync.account_deletion.queue.deleted', 1)
            statsd_client.timing('mailsync.account_deletion.queue.deleted',
                                 time.time() - start_time)
        except Exception:
            log_uncaught_errors(log, account_id=account_id)

    end = time.time()
    log.info('All data deleted successfully', shard_id=shard_id,
             time=end - start, count=deleted_count)
Example #10
    def execute_with_lock(self):
        self.log = logger.new(
            record_ids=self.record_ids, action_log_ids=self.action_log_ids,
            action=self.action_name, account_id=self.account_id,
            extra_args=self.extra_args)

        # Double-check that the action is still pending.
        # Although the task queue is populated based on pending actions, it's
        # possible that the processing of one action involved marking other
        # actions as failed.
        records_to_process, action_ids_to_process = self._get_records_and_actions_to_process()
        if len(action_ids_to_process) == 0:
            return

        for attempt in range(ACTION_MAX_NR_OF_RETRIES):
            self.log.debug("executing action", attempt=attempt)
            try:
                before, after = self._execute_timed_action(records_to_process)

                with session_scope(self.account_id) as db_session:
                    action_log_entries = db_session.query(ActionLog). \
                        filter(ActionLog.id.in_(action_ids_to_process))

                    for action_log_entry in action_log_entries:
                        self._mark_action_as_successful(action_log_entry, before, after, db_session)
                    return
            except:
                log_uncaught_errors(self.log, account_id=self.account_id,
                                    provider=self.provider)
                with session_scope(self.account_id) as db_session:
                    action_log_entries = db_session.query(ActionLog). \
                        filter(ActionLog.id.in_(action_ids_to_process))

                    marked_as_failed = False
                    for action_log_entry in action_log_entries:
                        action_log_entry.retries += 1
                        if action_log_entry.retries == ACTION_MAX_NR_OF_RETRIES:
                            marked_as_failed = True
                            self._mark_action_as_failed(action_log_entry, db_session)
                        # If we've merged SyncbackTasks then their corresponding
                        # actions should all fail at the same time.
                        assert (not marked_as_failed or
                                action_log_entry.retries == ACTION_MAX_NR_OF_RETRIES)
                        db_session.commit()
                    if marked_as_failed:
                        return

            # Wait before retrying
            self.log.info("Syncback task retrying action after sleeping",
                          duration=self.retry_interval)

            # TODO(T6974): We might want to do some kind of exponential
            # backoff with jitter to avoid the thundering herd problem if a
            # provider suddenly starts having issues for a short period of
            # time.
            gevent.sleep(self.retry_interval)
Example #11
    def mark_failure(self, actionlog_id):
        log_uncaught_errors(log, component='syncback', account_id=self.account_id,
                            action_id=actionlog_id, provider=self.provider)

        with session_scope(self.account_id) as db_session:
            actionlog = db_session.query(ActionLog).get(actionlog_id)
            actionlog.retries += 1

            if actionlog.retries == ACTION_MAX_NR_OF_RETRIES:
                log.critical('Max retries reached, giving up.', exc_info=True)
                actionlog.status = 'failed'
                db_session.commit()

                self._log_to_statsd(actionlog.status)
Example #12
def delete_marked_accounts(shard_id, throttle=False, dry_run=False):
    start = time.time()
    deleted_count = 0
    ids_to_delete = []

    with session_scope_by_shard_id(shard_id) as db_session:
        ids_to_delete = [(acc.id, acc.namespace.id) for acc
                         in db_session.query(Account) if acc.is_deleted]

    for account_id, namespace_id in ids_to_delete:
        try:
            with session_scope(namespace_id) as db_session:
                account = db_session.query(Account).get(account_id)
                if not account:
                    log.critical('Account does not exist',
                                 account_id=account_id)
                    continue

                if account.sync_should_run or not account.is_deleted:
                    log.warn('Account NOT marked for deletion. '
                             'Will not delete', account_id=account_id)
                    continue

            log.info('Deleting account', account_id=account_id)
            start_time = time.time()
            # Delete data in database
            try:
                log.info('Deleting database data', account_id=account_id)
                delete_namespace(account_id, namespace_id, throttle=throttle,
                                 dry_run=dry_run)
            except Exception as e:
                log.critical('Database data deletion failed', error=e,
                             account_id=account_id)
                continue

            # Delete liveness data
            log.debug('Deleting liveness data', account_id=account_id)
            clear_heartbeat_status(account_id)
            deleted_count += 1
            statsd_client.timing('mailsync.account_deletion.queue.deleted',
                                 time.time() - start_time)
            gevent.sleep(60)
        except Exception:
            log_uncaught_errors(log, account_id=account_id)

    end = time.time()
    log.info('All data deleted successfully', shard_id=shard_id,
             time=end - start, count=deleted_count)
Example #13
    def _run(self):
        """
        Index into CloudSearch the contacts of all namespaces.

        """
        try:
            self._set_transaction_pointers()

            self.log.info("Starting contact-search-index service", transaction_pointers=self.transaction_pointers)

            while True:
                self._publish_heartbeat()
                self._index_transactions()

        except Exception:
            log_uncaught_errors(log)
Example #14
    def _run(self):
        with self.semaphore:
            log = logger.new(
                record_id=self.record_id, action_log_id=self.action_log_id,
                action=self.action_name, account_id=self.account_id,
                extra_args=self.extra_args)

            for _ in range(ACTION_MAX_NR_OF_RETRIES):
                try:
                    if self.extra_args:
                        self.func(self.account_id, self.record_id,
                                  self.extra_args)
                    else:
                        self.func(self.account_id, self.record_id)

                    with session_scope(self.account_id) as db_session:
                        action_log_entry = db_session.query(ActionLog).get(
                            self.action_log_id)
                        action_log_entry.status = 'successful'
                        db_session.commit()
                        latency = round((datetime.utcnow() -
                                         action_log_entry.created_at).
                                        total_seconds(), 2)
                        log.info('syncback action completed',
                                 action_id=self.action_log_id,
                                 latency=latency)
                        self._log_to_statsd(action_log_entry.status, latency)
                        return

                except Exception:
                    log_uncaught_errors(log, account_id=self.account_id,
                                        provider=self.provider)
                    with session_scope(self.account_id) as db_session:
                        action_log_entry = db_session.query(ActionLog).get(
                            self.action_log_id)
                        action_log_entry.retries += 1
                        if (action_log_entry.retries ==
                                ACTION_MAX_NR_OF_RETRIES):
                            log.critical('Max retries reached, giving up.',
                                         exc_info=True)
                            action_log_entry.status = 'failed'
                            self._log_to_statsd(action_log_entry.status)
                        db_session.commit()

                # Wait before retrying
                gevent.sleep(self.retry_interval)
Example #15
        def f(session):
            if obj_state['sent_event']:
                return

            id = obj_state['id']
            sync_should_run = obj_state['sync_should_run']
            sync_host = obj_state['sync_host']
            desired_sync_host = obj_state['desired_sync_host']

            try:
                if sync_host is not None:
                    # Somebody is actively syncing this Account, so notify them if
                    # they should give up the Account.
                    if not sync_should_run or (sync_host != desired_sync_host and desired_sync_host is not None):
                        queue_name = SYNC_EVENT_QUEUE_NAME.format(sync_host)
                        log.info("Sending 'migrate_from' event for Account",
                                 account_id=id, queue_name=queue_name)
                        EventQueue(queue_name).send_event({'event': 'migrate_from', 'id': id})
                    return

                if not sync_should_run:
                    # We don't need to notify anybody because the Account is not
                    # actively being synced (sync_host is None) and sync_should_run is False,
                    # so just return early.
                    return

                if desired_sync_host is not None:
                    # Nobody is actively syncing the Account, and we have somebody
                    # who wants to sync this Account, so notify them.
                    queue_name = SYNC_EVENT_QUEUE_NAME.format(desired_sync_host)
                    log.info("Sending 'migrate_to' event for Account",
                             account_id=id, queue_name=queue_name)
                    EventQueue(queue_name).send_event({'event': 'migrate_to', 'id': id})
                    return

                # Nobody is actively syncing the Account, and nobody in particular
                # wants to sync the Account so notify the shared queue.
                shared_queue = shared_sync_event_queue_for_zone(config.get('ZONE'))
                log.info("Sending 'migrate' event for Account",
                         account_id=id, queue_name=shared_queue.queue_name)
                shared_queue.send_event({'event': 'migrate', 'id': id})
                obj_state['sent_event'] = True
            except:
                log_uncaught_errors(log, account_id=id, sync_host=sync_host,
                                    desired_sync_host=desired_sync_host)
Example #16
    def _run(self):
        with self.semaphore:
            log = logger.new(record_id=self.record_id,
                             action_log_id=self.action_log_id,
                             action=self.action_name,
                             account_id=self.account_id,
                             extra_args=self.extra_args)

            for _ in range(ACTION_MAX_NR_OF_RETRIES):
                with session_scope() as db_session:
                    try:
                        action_log_entry = db_session.query(ActionLog).get(
                            self.action_log_id)
                        if self.extra_args:
                            self.func(self.account_id, self.record_id,
                                      db_session, self.extra_args)
                        else:
                            self.func(self.account_id, self.record_id,
                                      db_session)
                        action_log_entry.status = 'successful'
                        db_session.commit()
                        latency = round(
                            (datetime.utcnow() -
                             action_log_entry.created_at).total_seconds(), 2)
                        log.info('syncback action completed',
                                 action_id=self.action_log_id,
                                 latency=latency)
                        self._log_to_statsd(action_log_entry.status, latency)
                        return

                    except Exception:
                        log_uncaught_errors(log, account_id=self.account_id)
                        with session_scope() as db_session:
                            action_log_entry.retries += 1
                            if (action_log_entry.retries ==
                                    ACTION_MAX_NR_OF_RETRIES):
                                log.critical('Max retries reached, giving up.',
                                             exc_info=True)
                                action_log_entry.status = 'failed'
                                self._log_to_statsd(action_log_entry.status)
                            db_session.commit()

                # Wait before retrying
                gevent.sleep(self.retry_interval)
    def callback(e):
        is_transient = isinstance(e, TRANSIENT_NETWORK_ERRS)
        mysql_error = None

        log = logger or get_logger()

        if isinstance(e, _mysql_exceptions.OperationalError):
            mysql_error = e
        elif isinstance(e, StatementError) and isinstance(
            e.orig, _mysql_exceptions.OperationalError
        ):
            mysql_error = e.orig

        if mysql_error:
            for msg in TRANSIENT_MYSQL_MESSAGES:
                if msg in mysql_error.message:
                    is_transient = True

        if is_transient:
            occurrences[0] += 1
            if occurrences[0] < 20:
                return
        else:
            occurrences[0] = 1

        if account_id:
            try:
                with session_scope(account_id) as db_session:
                    account = db_session.query(Account).get(account_id)
                    sync_error = account.sync_error
                    if not sync_error or isinstance(sync_error, basestring):
                        account.update_sync_error(e)
                        db_session.commit()
            except:
                log.error(
                    "Error saving sync_error to account object",
                    account_id=account_id,
                    **create_error_log_context(sys.exc_info())
                )

        log_uncaught_errors(
            logger, account_id=account_id, provider=provider, occurrences=occurrences[0]
        )
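
The callback above treats certain MySQL OperationalErrors as transient by substring-matching the error message against TRANSIENT_MYSQL_MESSAGES, which is defined elsewhere in the codebase. A self-contained sketch of that check in isolation; the messages listed are common MySQL error strings used purely as an illustration, not the actual contents of the constant.

# Illustrative only: the real TRANSIENT_MYSQL_MESSAGES lives elsewhere.
TRANSIENT_MYSQL_MESSAGES = (
    "try restarting transaction",
    "Lost connection to MySQL server",
    "MySQL server has gone away",
)

def is_transient_mysql_error(message):
    # True if the error message matches any known-transient pattern.
    return any(msg in message for msg in TRANSIENT_MYSQL_MESSAGES)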
Example #18
def delete_marked_accounts(shard_id, ids_to_delete, throttle=False, dry_run=False):
    start = time.time()

    deleted_count = 0
    for account_id, namespace_id in ids_to_delete:
        try:
            with session_scope(namespace_id) as db_session:
                account = db_session.query(Account).get(account_id)
                if not account:
                    log.critical('Account does not exist',
                                 account_id=account_id)
                    continue

                if account.sync_should_run or not account.is_deleted:
                    log.warn('Account NOT marked for deletion. '
                             'Will not delete', account_id=account_id)
                    continue

            log.info('Deleting account', account_id=account_id)
            start_time = time.time()
            # Delete data in database
            try:
                log.info('Deleting database data', account_id=account_id)
                delete_namespace(account_id, namespace_id, throttle=throttle,
                                 dry_run=dry_run)
            except Exception as e:
                log.critical('Database data deletion failed', error=e,
                             account_id=account_id)
                continue

            # Delete liveness data
            log.debug('Deleting liveness data', account_id=account_id)
            clear_heartbeat_status(account_id)

            deleted_count += 1
            statsd_client.timing('mailsync.account_deletion.queue.deleted',
                                 time.time() - start_time)
        except Exception:
            log_uncaught_errors(log, account_id=account_id)

    end = time.time()
    log.info('All data deleted successfully', shard_id=shard_id,
             time=end - start, count=deleted_count)
Example #19
    def _run(self):
        """
        Index into CloudSearch the contacts of all namespaces.

        """
        try:
            self._set_transaction_pointers()

            self.log.info(
                "Starting contact-search-index service",
                transaction_pointers=self.transaction_pointers,
            )

            while True:
                self._publish_heartbeat()
                self._index_transactions()

        except Exception:
            log_uncaught_errors(log)
def test_log_uncaught_errors(logfile):
    try:
        error_throwing_function()
    except:
        log_uncaught_errors()

    last_log_entry = json.loads(logfile.readlines()[-1])

    assert 'exception' in last_log_entry
    exc_info = last_log_entry['exception']

    assert 'ValueError' in exc_info
    assert 'GreenletExit' not in exc_info
    # Check that the traceback is logged. The traceback stored in
    # sys.exc_info() contains an extra entry for the test_log_uncaught_errors
    # frame, so just look for the rest of the traceback.
    tb = sys.exc_info()[2]
    for call in traceback.format_tb(tb)[1:]:
        assert call in exc_info
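
The test relies on an error_throwing_function helper and a logfile fixture defined elsewhere in the test suite. A hypothetical minimal stand-in for the helper, just to make the test's flow readable; the real helper lives in the test suite and may differ.

# Hypothetical stand-in for the helper used by the test above.
def error_throwing_function():
    raise ValueError("test error")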
Example #21
    def poll(self):
        # We really don't want to take on more load than we can bear, so we need
        # to check the CPU usage before accepting new accounts.
        # Note that we can't check this for the current core because the kernel
        # transparently moves programs across cores.
        usage_per_cpu = psutil.cpu_percent(percpu=True)

        # Conservatively, stop accepting accounts if the CPU usage is over 90%
        # for every core.
        overloaded_cpus = all(
            [cpu_usage > 90.0 for cpu_usage in usage_per_cpu])

        if self.stealing_enabled and not overloaded_cpus:
            r = self.queue_client.claim_next(self.process_identifier)
            if r:
                self.log.info('Claimed new account sync', account_id=r)

        # Determine which accounts to sync
        start_accounts = self.accounts_to_sync()
        statsd_client.gauge(
            'accounts.{}.mailsync-{}.count'.format(self.host, self.cpu_id),
            len(start_accounts))

        # Perform the appropriate action on each account
        for account_id in start_accounts:
            if account_id not in self.syncing_accounts:
                try:
                    self.start_sync(account_id)
                except OperationalError:
                    self.log.error('Database error starting account sync',
                                   exc_info=True)
                    log_uncaught_errors()

        stop_accounts = self.syncing_accounts - set(start_accounts)
        for account_id in stop_accounts:
            self.log.info('sync service stopping sync', account_id=account_id)
            try:
                self.stop_sync(account_id)
            except OperationalError:
                self.log.error('Database error stopping account sync',
                               exc_info=True)
                log_uncaught_errors()
Example #22
    def poll(self):
        # We really don't want to take on more load than we can bear, so we need
        # to check the CPU usage before accepting new accounts.
        # Note that we can't check this for the current core because the kernel
        # transparently moves programs across cores.
        usage_per_cpu = psutil.cpu_percent(percpu=True)

        # Conservatively, stop accepting accounts if the CPU usage is over 90%
        # for every core.
        overloaded_cpus = all([cpu_usage > 90.0 for cpu_usage in usage_per_cpu])

        if self.stealing_enabled and not overloaded_cpus:
            r = self.queue_client.claim_next(self.process_identifier)
            if r:
                self.log.info('Claimed new account sync', account_id=r)

        # Determine which accounts to sync
        start_accounts = self.accounts_to_sync()
        statsd_client.gauge(
            'accounts.{}.mailsync-{}.count'.format(self.host, self.cpu_id),
            len(start_accounts))

        # Perform the appropriate action on each account
        for account_id in start_accounts:
            if account_id not in self.syncing_accounts:
                try:
                    self.start_sync(account_id)
                except OperationalError:
                    self.log.error('Database error starting account sync',
                                   exc_info=True)
                    log_uncaught_errors()

        stop_accounts = self.syncing_accounts - set(start_accounts)
        for account_id in stop_accounts:
            self.log.info('sync service stopping sync',
                          account_id=account_id)
            try:
                self.stop_sync(account_id)
            except OperationalError:
                self.log.error('Database error stopping account sync',
                               exc_info=True)
                log_uncaught_errors()
Example #23
def batch_delete_namespaces(ids_to_delete, throttle=False, dry_run=False):

    start = time.time()

    deleted_count = 0
    for account_id, namespace_id in ids_to_delete:
        # try:
        try:
            delete_namespace(namespace_id, throttle=throttle, dry_run=dry_run)
        except AccountDeletionErrror as e:
            log.critical('AccountDeletionErrror', error_message=e.message)
        except Exception:
            log_uncaught_errors(log, account_id=account_id)

        deleted_count += 1

    end = time.time()
    log.info('All data deleted successfully for ids',
             ids_to_delete=ids_to_delete,
             time=end - start,
             count=deleted_count)
Example #24
def retry_with_logging(func,
                       logger=None,
                       retry_classes=None,
                       fail_classes=None,
                       account_id=None,
                       backoff_delay=BACKOFF_DELAY):
    callback = lambda: log_uncaught_errors(logger, account_id=account_id)
    return retry(func,
                 exc_callback=callback,
                 retry_classes=retry_classes,
                 fail_classes=fail_classes,
                 backoff_delay=backoff_delay)()
Example #25
    def callback(e):
        is_transient = isinstance(e, TRANSIENT_NETWORK_ERRS)
        mysql_error = None

        log = logger or get_logger()

        if isinstance(e, _mysql_exceptions.OperationalError):
            mysql_error = e
        elif isinstance(e, StatementError) and isinstance(e.orig, _mysql_exceptions.OperationalError):
            mysql_error = e.orig

        if mysql_error:
            for msg in TRANSIENT_MYSQL_MESSAGES:
                if msg in mysql_error.message:
                    is_transient = True

        if is_transient:
            occurrences[0] += 1
            if occurrences[0] < 20:
                return
        else:
            occurrences[0] = 1

        if account_id:
            try:
                with session_scope(account_id) as db_session:
                    account = db_session.query(Account).get(account_id)
                    sync_error = account.sync_error
                    if not sync_error or isinstance(sync_error, basestring):
                        account.update_sync_error(e)
                        db_session.commit()
            except:
                log.error(
                    "Error saving sync_error to account object",
                    account_id=account_id,
                    **create_error_log_context(sys.exc_info())
                )

        log_uncaught_errors(logger, account_id=account_id, provider=provider, occurrences=occurrences[0])
Example #26
    def callback(e):
        if isinstance(e, TRANSIENT_NETWORK_ERRS):
            occurrences[0] += 1
            if occurrences[0] < 20:
                return
        else:
            occurrences[0] = 1

        if account_id:
            try:
                with session_scope(account_id) as db_session:
                    account = db_session.query(Account).get(account_id)
                    sync_error = account.sync_error
                    if not sync_error or isinstance(sync_error, basestring):
                        account.update_sync_error(e)
                        db_session.commit()
            except:
                log.error('Error saving sync_error to account object',
                          account_id=account_id, exc_info=True)

        log_uncaught_errors(logger, account_id=account_id, provider=provider,
                            error_type=type(e).__name__, occurrences=occurrences[0])
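
The occurrences list above is the usual Python 2 workaround for mutating a counter from an enclosing scope inside a nested function (there is no nonlocal): the outer function defines occurrences = [0], transient errors are only logged once they have occurred 20 times in a row, and any other error resets the counter and is logged immediately. A self-contained sketch of the pattern, with illustrative names:

def make_rate_limited_reporter(threshold=20):
    # Single-element list used as mutable closure state (Python 2 friendly).
    occurrences = [0]

    def report(is_transient):
        if is_transient:
            occurrences[0] += 1
            if occurrences[0] < threshold:
                return False  # swallow the first threshold-1 occurrences
        else:
            occurrences[0] = 1
        return True  # caller should log this one

    return report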
Example #27
def retry_and_report_killed(func, account_id, folder_name=None, logger=None,
                            retry_classes=None, fail_classes=None,
                            exc_callback=None, fail_callback=None,
                            **reset_params):
    if not exc_callback:
        exc_callback = lambda: log_uncaught_errors(logger=logger,
                                                   account_id=account_id)
    if not fail_callback:
        fail_callback = lambda: report_killed(account_id, folder_name)

    return retry(func, exc_callback=exc_callback, fail_callback=fail_callback,
                 retry_classes=retry_classes, fail_classes=fail_classes,
                 **reset_params)()
Example #28
def batch_delete_namespaces(ids_to_delete, throttle=False,
                            dry_run=False):

    start = time.time()

    deleted_count = 0
    for account_id, namespace_id in ids_to_delete:
        # try:
        try:
            delete_namespace(namespace_id,
                             throttle=throttle,
                             dry_run=dry_run)
        except AccountDeletionErrror as e:
            log.critical('AccountDeletionErrror', error_message=e.message)
        except Exception:
            log_uncaught_errors(log, account_id=account_id)

        deleted_count += 1

    end = time.time()
    log.info('All data deleted successfully for ids',
             ids_to_delete=ids_to_delete,
             time=end - start, count=deleted_count)
Example #29
def delete_marked_accounts(shard_id, ids_to_delete, throttle=False, dry_run=False):
    start = time.time()

    deleted_count = 0
    for account_id, namespace_id in ids_to_delete:
        try:
            with session_scope(namespace_id) as db_session:
                account = db_session.query(Account).get(account_id)
                if not account:
                    log.critical("Account does not exist", account_id=account_id)
                    continue

                if account.sync_should_run or not account.is_deleted:
                    log.warn("Account NOT marked for deletion. " "Will not delete", account_id=account_id)
                    continue

            log.info("Deleting account", account_id=account_id)
            start_time = time.time()
            # Delete data in database
            try:
                log.info("Deleting database data", account_id=account_id)
                delete_namespace(account_id, namespace_id, throttle=throttle, dry_run=dry_run)
            except Exception as e:
                log.critical("Database data deletion failed", error=e, account_id=account_id)
                continue

            # Delete liveness data
            log.debug("Deleting liveness data", account_id=account_id)
            clear_heartbeat_status(account_id)

            deleted_count += 1
            statsd_client.timing("mailsync.account_deletion.queue.deleted", time.time() - start_time)
        except Exception:
            log_uncaught_errors(log, account_id=account_id)

    end = time.time()
    log.info("All data deleted successfully", shard_id=shard_id, time=end - start, count=deleted_count)
def test_retry_count_resets(monkeypatch):
    monkeypatch.setattr('inbox.util.concurrency._resettable_counter',
                        lambda x, y: _resettable_counter(max_count=3,
                                                         reset_interval=0))
    logger = MockLogger()

    failing_function = FailingFunction(ValueError, max_executions=6,
                                       delay=.001)

    exc_callback = lambda: log_uncaught_errors(logger)

    retry(failing_function, exc_callback=exc_callback, max_count=3,
          backoff_delay=0)()

    assert logger.call_count == 5
    assert failing_function.call_count == 6
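
The test monkeypatches inbox.util.concurrency._resettable_counter so that the retry budget resets immediately (reset_interval=0), which is why all six executions run even though max_count is 3. A rough, assumption-laden sketch of the kind of generator such a counter could be, purely to illustrate the interface implied by the test; the actual implementation may differ.

import time

def _resettable_counter(max_count, reset_interval):
    # Sketch only, not the real implementation: each iteration represents one
    # failure, and the count starts over if more than reset_interval seconds
    # have passed since the previous failure, so the consumer only stops
    # retrying after max_count failures in quick succession.
    count = 0
    last_failure = time.time()
    while count < max_count:
        yield
        if time.time() - last_failure > reset_interval:
            count = 1
        else:
            count += 1
        last_failure = time.time()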
Example #31
def delete_marked_accounts(shard_id, throttle=False, dry_run=False):
    start = time.time()
    deleted_count = 0
    ids_to_delete = []

    with session_scope_by_shard_id(shard_id) as db_session:
        ids_to_delete = [(acc.id, acc.namespace.id)
                         for acc in db_session.query(Account)
                         if acc.is_deleted]

    queue_size = len(ids_to_delete)
    for account_id, namespace_id in ids_to_delete:
        # queue_size = length of queue
        # deleted_count = number of accounts deleted during loop iteration
        # this is necessary because the length of ids_to_delete doesn't
        # change during loop iteration
        statsd_client.gauge(
            'mailsync.{}.account_deletion.queue.length'.format(shard_id),
            queue_size - deleted_count)
        try:
            with session_scope(namespace_id) as db_session:
                account = db_session.query(Account).get(account_id)
                if not account:
                    log.critical('Account does not exist',
                                 account_id=account_id)
                    continue

                if account.sync_should_run or not account.is_deleted:
                    log.warn(
                        'Account NOT marked for deletion. '
                        'Will not delete',
                        account_id=account_id)
                    continue

            log.info('Deleting account', account_id=account_id)
            start_time = time.time()
            # Delete data in database
            try:
                log.info('Deleting database data', account_id=account_id)
                delete_namespace(account_id,
                                 namespace_id,
                                 throttle=throttle,
                                 dry_run=dry_run)
            except Exception as e:
                log.critical('Database data deletion failed',
                             error=e,
                             account_id=account_id)
                continue

            # Delete liveness data
            log.debug('Deleting liveness data', account_id=account_id)
            clear_heartbeat_status(account_id)
            deleted_count += 1
            statsd_client.incr('mailsync.account_deletion.queue.deleted', 1)
            statsd_client.timing('mailsync.account_deletion.queue.deleted',
                                 time.time() - start_time)
        except Exception:
            log_uncaught_errors(log, account_id=account_id)

    end = time.time()
    log.info('All data deleted successfully',
             shard_id=shard_id,
             time=end - start,
             count=deleted_count)
Example #32
def retry_with_logging(func, logger=None, retry_classes=None,
                       fail_classes=None, account_id=None,
                       backoff_delay=BACKOFF_DELAY):
    callback = lambda: log_uncaught_errors(logger, account_id=account_id)
    return retry(func, exc_callback=callback, retry_classes=retry_classes,
                 fail_classes=fail_classes, backoff_delay=backoff_delay)()
Example #33
    def execute_with_lock(self):
        """
        Process a task and return whether it executed successfully.
        """
        self.log = logger.new(
            record_ids=list(set(self.record_ids)),
            action_log_ids=self.action_log_ids[:100],
            n_action_log_ids=len(self.action_log_ids),
            action=self.action_name,
            account_id=self.account_id,
            extra_args=self.extra_args,
        )

        # Double-check that the action is still pending.
        # Although the task queue is populated based on pending actions, it's
        # possible that the processing of one action involved marking other
        # actions as failed.
        (
            records_to_process,
            action_ids_to_process,
        ) = self._get_records_and_actions_to_process()
        if len(action_ids_to_process) == 0:
            return True

        try:
            before, after = self._execute_timed_action(records_to_process)
            self.log.debug("executing action",
                           action_log_ids=action_ids_to_process)

            with session_scope(self.account_id) as db_session:
                action_log_entries = db_session.query(ActionLog).filter(
                    ActionLog.id.in_(action_ids_to_process))

                max_latency = max_func_latency = 0
                for action_log_entry in action_log_entries:
                    latency, func_latency = self._mark_action_as_successful(
                        action_log_entry, before, after, db_session)
                    if latency > max_latency:
                        max_latency = latency
                    if func_latency > max_func_latency:
                        max_func_latency = func_latency
                self.log.info(
                    "syncback action completed",
                    latency=max_latency,
                    process=self.parent_service().process_number,
                    func_latency=max_func_latency,
                )
                return True
        except:
            log_uncaught_errors(self.log,
                                account_id=self.account_id,
                                provider=self.provider)
            with session_scope(self.account_id) as db_session:
                action_log_entries = db_session.query(ActionLog).filter(
                    ActionLog.id.in_(action_ids_to_process))

                marked_as_failed = False
                for action_log_entry in action_log_entries:
                    action_log_entry.retries += 1
                    if action_log_entry.retries == ACTION_MAX_NR_OF_RETRIES:
                        marked_as_failed = True

                if marked_as_failed:
                    self.log.debug(
                        "marking actions as failed",
                        action_log_ids=action_ids_to_process,
                    )
                    # If we merged actions, fail them all at the same time.
                    for action_log_entry in action_log_entries:
                        self._mark_action_as_failed(action_log_entry,
                                                    db_session)
                db_session.commit()

                return False
    def callback():
        log_uncaught_errors(logger, account_id=account_id, provider=provider)
Example #35
    def _run(self):
        """
        Index into CloudSearch the contacts of all namespaces.

        """
        try:
            for key in engine_manager.engines:
                with session_scope_by_shard_id(key) as db_session:
                    pointer = db_session.query(
                        ContactSearchIndexCursor).first()
                    if pointer:
                        self.transaction_pointers[key] = pointer.transaction_id
                    else:
                        # Never start from 0; if the service hasn't
                        # run before start from the latest
                        # transaction, with the expectation that a
                        # backfill will be run separately.
                        latest_transaction = db_session.query(Transaction). \
                            order_by(desc(Transaction.created_at)).first()
                        if latest_transaction:
                            self.transaction_pointers[
                                key] = latest_transaction.id
                        else:
                            self.transaction_pointers[key] = 0

            self.log.info('Starting contact-search-index service',
                          transaction_pointers=self.transaction_pointers)

            while True:
                for key in engine_manager.engines:
                    with session_scope_by_shard_id(key) as db_session:
                        transactions = db_session.query(Transaction).filter(
                            Transaction.id > self.transaction_pointers[key],
                            Transaction.object_type == 'contact') \
                            .with_hint(
                                Transaction,
                                "USE INDEX (ix_transaction_table_name)") \
                            .order_by(asc(Transaction.id)) \
                            .limit(self.chunk_size) \
                            .options(joinedload(Transaction.namespace)).all()

                        # index up to chunk_size transactions
                        should_sleep = False
                        if transactions:
                            self.index(transactions, db_session)
                            oldest_transaction = min(
                                transactions, key=lambda t: t.created_at)
                            current_timestamp = datetime.utcnow()
                            latency = (current_timestamp -
                                       oldest_transaction.created_at).seconds
                            self._report_transactions_latency(latency)
                            new_pointer = transactions[-1].id
                            self.update_pointer(new_pointer, key, db_session)
                            db_session.commit()
                        else:
                            should_sleep = True
                    if should_sleep:
                        log.info('sleeping')
                        sleep(self.poll_interval)
        except Exception:
            log_uncaught_errors(log)
Example #36
    def execute_with_lock(self):
        self.log = logger.new(record_ids=self.record_ids,
                              action_log_ids=self.action_log_ids,
                              action=self.action_name,
                              account_id=self.account_id,
                              extra_args=self.extra_args)

        # Double-check that the action is still pending.
        # Although the task queue is populated based on pending actions, it's
        # possible that the processing of one action involved marking other
        # actions as failed.
        records_to_process, action_ids_to_process = self._get_records_and_actions_to_process(
        )
        if len(action_ids_to_process) == 0:
            return

        for attempt in range(ACTION_MAX_NR_OF_RETRIES):
            self.log.debug("executing action", attempt=attempt)
            try:
                before, after = self._execute_timed_action(records_to_process)

                with session_scope(self.account_id) as db_session:
                    action_log_entries = db_session.query(ActionLog). \
                        filter(ActionLog.id.in_(action_ids_to_process))

                    for action_log_entry in action_log_entries:
                        self._mark_action_as_successful(
                            action_log_entry, before, after, db_session)
                    return
            except Exception:
                log_uncaught_errors(self.log,
                                    account_id=self.account_id,
                                    provider=self.provider)
                with session_scope(self.account_id) as db_session:
                    action_log_entries = db_session.query(ActionLog). \
                        filter(ActionLog.id.in_(action_ids_to_process))

                    marked_as_failed = False
                    for action_log_entry in action_log_entries:
                        action_log_entry.retries += 1
                        if action_log_entry.retries == ACTION_MAX_NR_OF_RETRIES:
                            marked_as_failed = True
                            self._mark_action_as_failed(
                                action_log_entry, db_session)
                        # If we've merged SyncbackTasks then their corresponding
                        # actions should all fail at the same time.
                        assert (not marked_as_failed
                                or action_log_entry.retries
                                == ACTION_MAX_NR_OF_RETRIES)
                        db_session.commit()
                    if marked_as_failed:
                        return

            # Wait before retrying
            self.log.info("Syncback task retrying action after sleeping",
                          duration=self.retry_interval)

            # TODO(T6974): We might want to do some kind of exponential
            # backoff with jitter to avoid the thundering herd problem if a
            # provider suddenly starts having issues for a short period of
            # time.
            gevent.sleep(self.retry_interval)
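The TODO(T6974) above points at exponential backoff with jitter instead of the fixed retry_interval. A minimal sketch of such a delay calculation, assuming an arbitrary base and cap (the function name and defaults are not part of the original code):

import random

def backoff_with_jitter(attempt, base=1.0, cap=60.0):
    # Exponential backoff capped at `cap`, with full jitter so concurrent
    # workers hitting the same provider don't retry in lockstep.
    return random.uniform(0, min(cap, base * (2 ** attempt)))

# e.g. gevent.sleep(backoff_with_jitter(attempt)) in place of the fixed sleep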
Example #37
def retry_with_logging(func, logger=None, retry_classes=None,
                       fail_classes=None, account_id=None, **reset_params):
    callback = lambda: log_uncaught_errors(logger, account_id=account_id)
    return retry(func, exc_callback=callback, fail_callback=callback,
                 retry_classes=retry_classes, fail_classes=fail_classes,
                 **reset_params)()
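A hypothetical caller might use retry_with_logging like this; sync_account, the logger and the exception class are illustrative assumptions, not taken from the original code:

def sync_account():
    ...  # flaky work that may raise transient provider errors

# Assuming retry_classes restricts retries to the listed exception types,
# this keeps calling sync_account and logs every uncaught error tagged
# with the given account id.
retry_with_logging(sync_account, logger=log, account_id=42,
                   retry_classes=[ConnectionError])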
Example #38
        def f(session):
            if obj_state["sent_event"]:
                return

            id = obj_state["id"]
            sync_should_run = obj_state["sync_should_run"]
            sync_host = obj_state["sync_host"]
            desired_sync_host = obj_state["desired_sync_host"]

            try:
                if sync_host is not None:
                    # Somebody is actively syncing this Account, so notify them if
                    # they should give up the Account.
                    if not sync_should_run or (sync_host != desired_sync_host
                                               and desired_sync_host
                                               is not None):
                        queue_name = SYNC_EVENT_QUEUE_NAME.format(sync_host)
                        log.info(
                            "Sending 'migrate_from' event for Account",
                            account_id=id,
                            queue_name=queue_name,
                        )
                        EventQueue(queue_name).send_event({
                            "event": "migrate_from",
                            "id": id
                        })
                    return

                if not sync_should_run:
                    # We don't need to notify anybody because the Account is not
                    # actively being synced (sync_host is None) and sync_should_run is False,
                    # so just return early.
                    return

                if desired_sync_host is not None:
                    # Nobody is actively syncing the Account, and we have somebody
                    # who wants to sync this Account, so notify them.
                    queue_name = SYNC_EVENT_QUEUE_NAME.format(
                        desired_sync_host)
                    log.info(
                        "Sending 'migrate_to' event for Account",
                        account_id=id,
                        queue_name=queue_name,
                    )
                    EventQueue(queue_name).send_event({
                        "event": "migrate_to",
                        "id": id
                    })
                    return

                # Nobody is actively syncing the Account, and nobody in particular
                # wants to sync the Account so notify the shared queue.
                shared_queue = shared_sync_event_queue_for_zone(
                    config.get("ZONE"))
                log.info(
                    "Sending 'migrate' event for Account",
                    account_id=id,
                    queue_name=shared_queue.queue_name,
                )
                shared_queue.send_event({"event": "migrate", "id": id})
                obj_state["sent_event"] = True
            except Exception:
                log_uncaught_errors(
                    log,
                    account_id=id,
                    sync_host=sync_host,
                    desired_sync_host=desired_sync_host,
                )
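The branches above reduce to a small routing decision; the helper below is an illustrative restatement only (its name and return convention are assumptions), separated from the queue sending:

def route_sync_event(sync_should_run, sync_host, desired_sync_host):
    # Returns (event_name, target) or None when no event should be sent.
    if sync_host is not None:
        # An active host exists; ask it to give up the account if it should.
        if not sync_should_run or (desired_sync_host is not None
                                   and sync_host != desired_sync_host):
            return ("migrate_from", sync_host)
        return None
    if not sync_should_run:
        # Not syncing and not supposed to sync: nothing to notify.
        return None
    if desired_sync_host is not None:
        return ("migrate_to", desired_sync_host)
    # No specific host wants it; hand it to the shared zone queue.
    return ("migrate", "shared-queue")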
Example #39
    def wrapped(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except exception_type:
            log_uncaught_errors()
            pdb.post_mortem()
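The wrapped function above reads like the inner function of a post-mortem debugging decorator. A self-contained sketch of such a decorator, assuming the name post_mortem_on (not taken from the original code):

import pdb
from functools import wraps

def post_mortem_on(exception_type):
    def decorator(func):
        @wraps(func)
        def wrapped(*args, **kwargs):
            try:
                return func(*args, **kwargs)
            except exception_type:
                # Log the traceback, then drop into the interactive
                # debugger at the point of failure.
                log_uncaught_errors()
                pdb.post_mortem()
        return wrapped
    return decorator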
Example #40
    def execute_with_lock(self):
        log = logger.new(record_id=self.record_id,
                         action_log_id=self.action_log_id,
                         action=self.action_name,
                         account_id=self.account_id,
                         extra_args=self.extra_args)

        for _ in range(ACTION_MAX_NR_OF_RETRIES):
            try:
                before_func = datetime.utcnow()
                func_args = [self.account_id, self.record_id]
                if self.extra_args:
                    func_args.append(self.extra_args)
                if self.uses_crispin_client():
                    assert self.crispin_client is not None
                    func_args.insert(0, self.crispin_client)
                self.func(*func_args)
                after_func = datetime.utcnow()

                with session_scope(self.account_id) as db_session:
                    action_log_entry = db_session.query(ActionLog).get(
                        self.action_log_id)
                    action_log_entry.status = 'successful'
                    db_session.commit()
                    latency = round(
                        (datetime.utcnow() -
                         action_log_entry.created_at).total_seconds(), 2)
                    func_latency = round(
                        (after_func - before_func).total_seconds(), 2)
                    log.info('syncback action completed',
                             action_id=self.action_log_id,
                             latency=latency,
                             process=self.parent_service().process_number,
                             func_latency=func_latency)
                    self._log_to_statsd(action_log_entry.status, latency)
                    return
            except Exception:
                log_uncaught_errors(log,
                                    account_id=self.account_id,
                                    provider=self.provider)
                with session_scope(self.account_id) as db_session:
                    action_log_entry = db_session.query(ActionLog).get(
                        self.action_log_id)
                    action_log_entry.retries += 1
                    if (action_log_entry.retries == ACTION_MAX_NR_OF_RETRIES):
                        log.critical('Max retries reached, giving up.',
                                     exc_info=True)
                        action_log_entry.status = 'failed'
                        self._log_to_statsd(action_log_entry.status)
                        db_session.commit()
                        return
                    db_session.commit()

            # Wait before retrying
            log.info("Syncback task retrying action after sleeping",
                     duration=self.retry_interval)

            # TODO(T6974): We might want to do some kind of exponential
            # backoff with jitter to avoid the thundering herd problem if a
            # provider suddenly starts having issues for a short period of
            # time.
            gevent.sleep(self.retry_interval)
Example #41
    def callback():
        log_uncaught_errors(logger, account_id=account_id, provider=provider)