def test_configurable_retry_count_resets(monkeypatch):
    logger = MockLogger()
    default_failing_function = FailingFunction(ValueError, max_executions=3,
                                               delay=.1)
    exc_callback = lambda: log_uncaught_errors(logger)

    retry(default_failing_function, exc_callback=exc_callback,
          max_count=3, backoff_delay=0)()

    assert logger.call_count == 2
    assert default_failing_function.call_count == 3

    logger = MockLogger()
    failing_function = FailingFunction(ValueError, max_executions=5, delay=.1)
    exc_callback = lambda: log_uncaught_errors(logger)

    retry(failing_function, exc_callback=exc_callback,
          max_count=5, backoff_delay=0)()

    assert logger.call_count == 4
    assert failing_function.call_count == 5
def _run(self):
    with self.semaphore:
        log = logger.new(
            record_id=self.record_id, action_log_id=self.action_log_id,
            action=self.action_name, account_id=self.account_id,
            extra_args=self.extra_args)
        for _ in range(ACTION_MAX_NR_OF_RETRIES):
            try:
                before_func = datetime.utcnow()
                if self.extra_args:
                    self.func(self.account_id, self.record_id,
                              self.extra_args)
                else:
                    self.func(self.account_id, self.record_id)
                after_func = datetime.utcnow()

                with session_scope(self.account_id) as db_session:
                    action_log_entry = db_session.query(ActionLog).get(
                        self.action_log_id)
                    action_log_entry.status = 'successful'
                    db_session.commit()
                    latency = round((datetime.utcnow() -
                                     action_log_entry.created_at).
                                    total_seconds(), 2)
                    func_latency = round((after_func - before_func).
                                         total_seconds(), 2)
                    log.info('syncback action completed',
                             action_id=self.action_log_id,
                             latency=latency,
                             process=self.parent_service.process_number,
                             func_latency=func_latency)
                    self._log_to_statsd(action_log_entry.status, latency)
                    return
            except Exception:
                log_uncaught_errors(log, account_id=self.account_id,
                                    provider=self.provider)
                with session_scope(self.account_id) as db_session:
                    action_log_entry = db_session.query(ActionLog).get(
                        self.action_log_id)
                    action_log_entry.retries += 1
                    if (action_log_entry.retries ==
                            ACTION_MAX_NR_OF_RETRIES):
                        log.critical('Max retries reached, giving up.',
                                     exc_info=True)
                        action_log_entry.status = 'failed'
                        self._log_to_statsd(action_log_entry.status)
                        db_session.commit()
                        return
                    db_session.commit()

                # Wait before retrying
                log.info("Syncback worker retrying action after sleeping",
                         duration=self.retry_interval)
                # TODO(T6974): We might want to do some kind of exponential
                # backoff with jitter to avoid the thundering herd problem
                # if a provider suddenly starts having issues for a short
                # period of time.
                gevent.sleep(self.retry_interval)
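# The TODO above mentions exponential backoff with jitter. A minimal sketch
# of the "full jitter" strategy (names here are illustrative, not part of
# this codebase): each retry sleeps a random duration between 0 and an
# exponentially growing cap, which spreads retries out and avoids the
# thundering herd.
import random

def full_jitter_delay(attempt, base=1.0, cap=60.0):
    """Return a sleep duration for the given retry attempt (0-based)."""
    return random.uniform(0, min(cap, base * (2 ** attempt)))

# Usage sketch inside the retry loop above:
#     gevent.sleep(full_jitter_delay(attempt))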
def poll(self, event):
    # Determine which accounts to sync
    start_accounts = self.account_ids_to_sync()
    statsd_client.gauge(
        'mailsync.account_counts.{}.mailsync-{}.count'.format(
            self.host, self.process_number),
        len(start_accounts))

    # Perform the appropriate action on each account
    for account_id in start_accounts:
        if account_id not in self.syncing_accounts:
            try:
                self.start_sync(account_id)
            except OperationalError:
                self.log.error('Database error starting account sync',
                               exc_info=True)
                log_uncaught_errors()

    stop_accounts = self.account_ids_owned() - set(start_accounts)
    for account_id in stop_accounts:
        self.log.info('sync service stopping sync',
                      account_id=account_id)
        try:
            self.stop_sync(account_id)
        except OperationalError:
            self.log.error('Database error stopping account sync',
                           exc_info=True)
            log_uncaught_errors()
def poll(self):
    if self.stealing_enabled:
        r = self.queue_client.claim_next(self.process_identifier)
        if r:
            self.log.info('Claimed new account sync', account_id=r)

    # Determine which accounts to sync
    start_accounts = self.accounts_to_sync()
    statsd_client.gauge(
        'accounts.{}.mailsync-{}.count'.format(self.host, self.cpu_id),
        len(start_accounts))

    # Perform the appropriate action on each account
    for account_id in start_accounts:
        if account_id not in self.syncing_accounts:
            try:
                self.start_sync(account_id)
            except OperationalError:
                self.log.error('Database error starting account sync',
                               exc_info=True)
                log_uncaught_errors()

    stop_accounts = self.syncing_accounts - set(start_accounts)
    for account_id in stop_accounts:
        self.log.info('sync service stopping sync',
                      account_id=account_id)
        try:
            self.stop_sync(account_id)
        except OperationalError:
            self.log.error('Database error stopping account sync',
                           exc_info=True)
            log_uncaught_errors()
def _run(self): """ Index into CloudSearch the contacts of all namespaces. """ try: for key in engine_manager.engines: with session_scope_by_shard_id(key) as db_session: pointer = db_session.query(ContactSearchIndexCursor).first() if pointer: self.transaction_pointers[key] = pointer.transaction_id else: # Never start from 0; if the service hasn't # run before start from the latest # transaction, with the expectation that a # backfill will be run separately. latest_transaction = ( db_session.query(Transaction).order_by(desc(Transaction.created_at)).first() ) if latest_transaction: self.transaction_pointers[key] = latest_transaction.id else: self.transaction_pointers[key] = 0 self.log.info("Starting contact-search-index service", transaction_pointers=self.transaction_pointers) while True: for key in engine_manager.engines: with session_scope_by_shard_id(key) as db_session: transactions = ( db_session.query(Transaction) .filter( Transaction.id > self.transaction_pointers[key], Transaction.object_type == "contact" ) .with_hint(Transaction, "USE INDEX (ix_transaction_table_name)") .order_by(asc(Transaction.id)) .limit(self.chunk_size) .options(joinedload(Transaction.namespace)) .all() ) # index up to chunk_size transactions should_sleep = False if transactions: self.index(transactions, db_session) oldest_transaction = min(transactions, key=lambda t: t.created_at) current_timestamp = datetime.utcnow() latency = (current_timestamp - oldest_transaction.created_at).seconds self._report_transactions_latency(latency) new_pointer = transactions[-1].id self.update_pointer(new_pointer, key, db_session) db_session.commit() else: should_sleep = True if should_sleep: log.info("sleeping") sleep(self.poll_interval) except Exception: log_uncaught_errors(log)
def poll(self):
    # We really don't want to take on more load than we can bear, so we
    # need to check the CPU usage before accepting new accounts.
    # Note that we can't check this for the current core because the kernel
    # transparently moves programs across cores.
    usage_per_cpu = psutil.cpu_percent(percpu=True)
    self.rolling_cpu_counts.append(usage_per_cpu)

    cpu_averages = self._compute_cpu_average()

    cpus_over_nominal = all([cpu_usage > NOMINAL_THRESHOLD
                             for cpu_usage in cpu_averages])

    # Conservatively, stop accepting accounts if the CPU usage is over
    # NOMINAL_THRESHOLD for every core, or if the total # of accounts
    # being synced by a single process exceeds the threshold. Excessive
    # concurrency per process can result in lowered database throughput
    # or availability problems, since many transactions may be held open
    # at the same time.
    if self.stealing_enabled and not cpus_over_nominal and \
            len(self.syncing_accounts) < MAX_ACCOUNTS_PER_PROCESS:
        r = self.queue_client.claim_next(self.process_identifier)
        if r:
            self.log.info('Claimed new account sync', account_id=r)
    else:
        if not self.stealing_enabled:
            reason = 'stealing disabled'
        elif cpus_over_nominal:
            reason = 'CPU too high'
        else:
            reason = 'reached max accounts for process'
        self.log.info('Not claiming new account sync', reason=reason)

    # Determine which accounts to sync
    start_accounts = self.accounts_to_sync()
    statsd_client.gauge(
        'accounts.{}.mailsync-{}.count'.format(
            self.host, self.process_number),
        len(start_accounts))

    # Perform the appropriate action on each account
    for account_id in start_accounts:
        if account_id not in self.syncing_accounts:
            try:
                self.start_sync(account_id)
            except OperationalError:
                self.log.error('Database error starting account sync',
                               exc_info=True)
                log_uncaught_errors()

    stop_accounts = self.syncing_accounts - set(start_accounts)
    for account_id in stop_accounts:
        self.log.info('sync service stopping sync',
                      account_id=account_id)
        try:
            self.stop_sync(account_id)
        except OperationalError:
            self.log.error('Database error stopping account sync',
                           exc_info=True)
            log_uncaught_errors()
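# `rolling_cpu_counts` above receives a list of per-core percentages on
# each poll, and `_compute_cpu_average` is not shown in this section. A
# plausible sketch of what it does (an assumption, not the codebase's
# implementation) is to average each core's usage across the retained
# samples:
from collections import deque

rolling_cpu_counts = deque(maxlen=10)  # keep the last 10 psutil samples

def compute_cpu_average(samples):
    # zip(*samples) regroups the per-poll lists into per-core series.
    return [sum(core) / float(len(core)) for core in zip(*samples)]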
def poll(self):
    # We really don't want to take on more load than we can bear, so we
    # need to check the CPU usage before accepting new accounts.
    # Note that we can't check this for the current core because the kernel
    # transparently moves programs across cores.
    usage_per_cpu = psutil.cpu_percent(percpu=True)
    self.rolling_cpu_counts.append(usage_per_cpu)

    cpu_averages = self._compute_cpu_average()

    has_overloaded_cpus = all([cpu_usage > 90.0
                               for cpu_usage in cpu_averages])
    cpus_over_nominal = all([cpu_usage > 85.0
                             for cpu_usage in cpu_averages])

    # Conservatively, stop accepting accounts if the CPU usage is over 85%
    # for every core.
    if self.stealing_enabled and not cpus_over_nominal:
        r = self.queue_client.claim_next(self.process_identifier)
        if r:
            self.log.info('Claimed new account sync', account_id=r)

    if has_overloaded_cpus:
        # Unload a single account.
        acc = self._pick_account()
        if acc is not None:
            self.log.info('Overloaded CPU, unloading account',
                          account_id=acc)
            self.stop_sync(acc)
        else:
            self.log.error("Couldn't find an account to unload!")

    # Determine which accounts to sync
    start_accounts = self.accounts_to_sync()
    statsd_client.gauge(
        'accounts.{}.mailsync-{}.count'.format(self.host, self.cpu_id),
        len(start_accounts))

    # Perform the appropriate action on each account
    for account_id in start_accounts:
        if account_id not in self.syncing_accounts:
            try:
                self.start_sync(account_id)
            except OperationalError:
                self.log.error('Database error starting account sync',
                               exc_info=True)
                log_uncaught_errors()

    stop_accounts = self.syncing_accounts - set(start_accounts)
    for account_id in stop_accounts:
        self.log.info('sync service stopping sync',
                      account_id=account_id)
        try:
            self.stop_sync(account_id)
        except OperationalError:
            self.log.error('Database error stopping account sync',
                           exc_info=True)
            log_uncaught_errors()
def callback(e):
    if isinstance(e, TRANSIENT_NETWORK_ERRS):
        occurrences[0] += 1
        if occurrences[0] < 20:
            return
    else:
        occurrences[0] = 1

    log_uncaught_errors(logger, account_id=account_id, provider=provider,
                        error_type=type(e).__name__,
                        occurrences=occurrences[0])
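# A side note on the pattern above: `occurrences` is a one-element list
# rather than a plain int because the nested callback cannot rebind a
# variable in its enclosing scope in Python 2 (there is no `nonlocal`),
# but it can mutate an object it closes over. A minimal illustration:
def make_counter():
    count = [0]  # mutable cell shared with the closure

    def bump():
        count[0] += 1
        return count[0]
    return bump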
def delete_marked_accounts(shard_id, throttle=False, dry_run=False):
    start = time.time()
    deleted_count = 0
    ids_to_delete = []

    with session_scope_by_shard_id(shard_id) as db_session:
        ids_to_delete = [(acc.id, acc.namespace.id) for acc
                         in db_session.query(Account) if acc.is_deleted]

    queue_size = len(ids_to_delete)
    for account_id, namespace_id in ids_to_delete:
        # queue_size = length of queue
        # deleted_count = number of accounts deleted during loop iteration
        # this is necessary because the length of ids_to_delete doesn't
        # change during loop iteration
        statsd_client.gauge('mailsync.{}.account_deletion.queue.length'
                            .format(shard_id),
                            queue_size - deleted_count)
        try:
            with session_scope(namespace_id) as db_session:
                account = db_session.query(Account).get(account_id)
                if not account:
                    log.critical('Account does not exist',
                                 account_id=account_id)
                    continue

                if account.sync_should_run or not account.is_deleted:
                    log.warn('Account NOT marked for deletion. '
                             'Will not delete', account_id=account_id)
                    continue

            log.info('Deleting account', account_id=account_id)
            start_time = time.time()
            # Delete data in database
            try:
                log.info('Deleting database data', account_id=account_id)
                delete_namespace(account_id, namespace_id,
                                 throttle=throttle, dry_run=dry_run)
            except Exception as e:
                log.critical('Database data deletion failed', error=e,
                             account_id=account_id)
                continue

            # Delete liveness data
            log.debug('Deleting liveness data', account_id=account_id)
            clear_heartbeat_status(account_id)

            deleted_count += 1
            statsd_client.incr('mailsync.account_deletion.queue.deleted', 1)
            statsd_client.timing('mailsync.account_deletion.queue.deleted',
                                 time.time() - start_time)
        except Exception:
            log_uncaught_errors(log, account_id=account_id)

    end = time.time()
    log.info('All data deleted successfully', shard_id=shard_id,
             time=end - start, count=deleted_count)
def execute_with_lock(self):
    self.log = logger.new(
        record_ids=self.record_ids, action_log_ids=self.action_log_ids,
        action=self.action_name, account_id=self.account_id,
        extra_args=self.extra_args)

    # Double-check that the action is still pending.
    # Although the task queue is populated based on pending actions, it's
    # possible that the processing of one action involved marking other
    # actions as failed.
    records_to_process, action_ids_to_process = \
        self._get_records_and_actions_to_process()
    if len(action_ids_to_process) == 0:
        return

    for attempt in range(ACTION_MAX_NR_OF_RETRIES):
        self.log.debug("executing action", attempt=attempt)
        try:
            before, after = self._execute_timed_action(records_to_process)

            with session_scope(self.account_id) as db_session:
                action_log_entries = db_session.query(ActionLog). \
                    filter(ActionLog.id.in_(action_ids_to_process))
                for action_log_entry in action_log_entries:
                    self._mark_action_as_successful(action_log_entry,
                                                    before, after,
                                                    db_session)
            return
        except:
            log_uncaught_errors(self.log, account_id=self.account_id,
                                provider=self.provider)
            with session_scope(self.account_id) as db_session:
                action_log_entries = db_session.query(ActionLog). \
                    filter(ActionLog.id.in_(action_ids_to_process))
                marked_as_failed = False
                for action_log_entry in action_log_entries:
                    action_log_entry.retries += 1
                    if action_log_entry.retries == ACTION_MAX_NR_OF_RETRIES:
                        marked_as_failed = True
                        self._mark_action_as_failed(action_log_entry,
                                                    db_session)
                    # If we've merged SyncbackTasks then their
                    # corresponding actions should all fail at the same
                    # time.
                    assert (not marked_as_failed or
                            action_log_entry.retries ==
                            ACTION_MAX_NR_OF_RETRIES)
                db_session.commit()

            if marked_as_failed:
                return

            # Wait before retrying
            self.log.info("Syncback task retrying action after sleeping",
                          duration=self.retry_interval)
            # TODO(T6974): We might want to do some kind of exponential
            # backoff with jitter to avoid the thundering herd problem if a
            # provider suddenly starts having issues for a short period of
            # time.
            gevent.sleep(self.retry_interval)
def mark_failure(self, actionlog_id):
    log_uncaught_errors(log, component='syncback',
                        account_id=self.account_id,
                        action_id=actionlog_id, provider=self.provider)
    with session_scope(self.account_id) as db_session:
        actionlog = db_session.query(ActionLog).get(actionlog_id)
        actionlog.retries += 1

        if actionlog.retries == ACTION_MAX_NR_OF_RETRIES:
            log.critical('Max retries reached, giving up.', exc_info=True)
            actionlog.status = 'failed'
            db_session.commit()
            self._log_to_statsd(actionlog.status)
def delete_marked_accounts(shard_id, throttle=False, dry_run=False):
    start = time.time()
    deleted_count = 0
    ids_to_delete = []

    with session_scope_by_shard_id(shard_id) as db_session:
        ids_to_delete = [(acc.id, acc.namespace.id) for acc
                         in db_session.query(Account) if acc.is_deleted]

    for account_id, namespace_id in ids_to_delete:
        try:
            with session_scope(namespace_id) as db_session:
                account = db_session.query(Account).get(account_id)
                if not account:
                    log.critical('Account does not exist',
                                 account_id=account_id)
                    continue

                if account.sync_should_run or not account.is_deleted:
                    log.warn('Account NOT marked for deletion. '
                             'Will not delete', account_id=account_id)
                    continue

            log.info('Deleting account', account_id=account_id)
            start_time = time.time()
            # Delete data in database
            try:
                log.info('Deleting database data', account_id=account_id)
                delete_namespace(account_id, namespace_id,
                                 throttle=throttle, dry_run=dry_run)
            except Exception as e:
                log.critical('Database data deletion failed', error=e,
                             account_id=account_id)
                continue

            # Delete liveness data
            log.debug('Deleting liveness data', account_id=account_id)
            clear_heartbeat_status(account_id)

            deleted_count += 1
            statsd_client.timing('mailsync.account_deletion.queue.deleted',
                                 time.time() - start_time)
            gevent.sleep(60)
        except Exception:
            log_uncaught_errors(log, account_id=account_id)

    end = time.time()
    log.info('All data deleted successfully', shard_id=shard_id,
             time=end - start, count=deleted_count)
def _run(self): """ Index into CloudSearch the contacts of all namespaces. """ try: self._set_transaction_pointers() self.log.info("Starting contact-search-index service", transaction_pointers=self.transaction_pointers) while True: self._publish_heartbeat() self._index_transactions() except Exception: log_uncaught_errors(log)
def _run(self):
    with self.semaphore:
        log = logger.new(
            record_id=self.record_id, action_log_id=self.action_log_id,
            action=self.action_name, account_id=self.account_id,
            extra_args=self.extra_args)
        for _ in range(ACTION_MAX_NR_OF_RETRIES):
            try:
                if self.extra_args:
                    self.func(self.account_id, self.record_id,
                              self.extra_args)
                else:
                    self.func(self.account_id, self.record_id)
                with session_scope(self.account_id) as db_session:
                    action_log_entry = db_session.query(ActionLog).get(
                        self.action_log_id)
                    action_log_entry.status = 'successful'
                    db_session.commit()
                    latency = round((datetime.utcnow() -
                                     action_log_entry.created_at).
                                    total_seconds(), 2)
                    log.info('syncback action completed',
                             action_id=self.action_log_id,
                             latency=latency)
                    self._log_to_statsd(action_log_entry.status, latency)
                    return
            except Exception:
                log_uncaught_errors(log, account_id=self.account_id,
                                    provider=self.provider)
                with session_scope(self.account_id) as db_session:
                    action_log_entry = db_session.query(ActionLog).get(
                        self.action_log_id)
                    action_log_entry.retries += 1
                    if (action_log_entry.retries ==
                            ACTION_MAX_NR_OF_RETRIES):
                        log.critical('Max retries reached, giving up.',
                                     exc_info=True)
                        action_log_entry.status = 'failed'
                        self._log_to_statsd(action_log_entry.status)
                    db_session.commit()

                # Wait before retrying
                gevent.sleep(self.retry_interval)
def f(session):
    if obj_state['sent_event']:
        return

    id = obj_state['id']
    sync_should_run = obj_state['sync_should_run']
    sync_host = obj_state['sync_host']
    desired_sync_host = obj_state['desired_sync_host']

    try:
        if sync_host is not None:
            # Somebody is actively syncing this Account, so notify them if
            # they should give up the Account.
            if not sync_should_run or (sync_host != desired_sync_host and
                                       desired_sync_host is not None):
                queue_name = SYNC_EVENT_QUEUE_NAME.format(sync_host)
                log.info("Sending 'migrate_from' event for Account",
                         account_id=id, queue_name=queue_name)
                EventQueue(queue_name).send_event({'event': 'migrate_from',
                                                   'id': id})
            return

        if not sync_should_run:
            # We don't need to notify anybody because the Account is not
            # actively being synced (sync_host is None) and
            # sync_should_run is False, so just return early.
            return

        if desired_sync_host is not None:
            # Nobody is actively syncing the Account, and we have somebody
            # who wants to sync this Account, so notify them.
            queue_name = SYNC_EVENT_QUEUE_NAME.format(desired_sync_host)
            log.info("Sending 'migrate_to' event for Account",
                     account_id=id, queue_name=queue_name)
            EventQueue(queue_name).send_event({'event': 'migrate_to',
                                               'id': id})
            return

        # Nobody is actively syncing the Account, and nobody in particular
        # wants to sync the Account so notify the shared queue.
        shared_queue = shared_sync_event_queue_for_zone(config.get('ZONE'))
        log.info("Sending 'migrate' event for Account",
                 account_id=id, queue_name=shared_queue.queue_name)
        shared_queue.send_event({'event': 'migrate', 'id': id})
        obj_state['sent_event'] = True
    except:
        log_uncaught_errors(log, account_id=id, sync_host=sync_host,
                            desired_sync_host=desired_sync_host)
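# Routing summary for the branches above:
#   sync_host set, and sync should stop or move -> 'migrate_from' to that
#       host's queue
#   sync_host None and sync_should_run False    -> no event
#   sync_host None, desired_sync_host set       -> 'migrate_to' to the
#       desired host's queue
#   otherwise                                   -> 'migrate' to the zone's
#       shared queue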
def _run(self):
    with self.semaphore:
        log = logger.new(record_id=self.record_id,
                         action_log_id=self.action_log_id,
                         action=self.action_name,
                         account_id=self.account_id,
                         extra_args=self.extra_args)
        for _ in range(ACTION_MAX_NR_OF_RETRIES):
            with session_scope() as db_session:
                try:
                    action_log_entry = db_session.query(ActionLog).get(
                        self.action_log_id)
                    if self.extra_args:
                        self.func(self.account_id, self.record_id,
                                  db_session, self.extra_args)
                    else:
                        self.func(self.account_id, self.record_id,
                                  db_session)
                    action_log_entry.status = 'successful'
                    db_session.commit()
                    latency = round(
                        (datetime.utcnow() -
                         action_log_entry.created_at).total_seconds(), 2)
                    log.info('syncback action completed',
                             action_id=self.action_log_id,
                             latency=latency)
                    self._log_to_statsd(action_log_entry.status, latency)
                    return
                except Exception:
                    log_uncaught_errors(log, account_id=self.account_id)
                    with session_scope() as db_session:
                        # Re-fetch the entry in the fresh session; the
                        # instance loaded in the failed session is stale.
                        action_log_entry = db_session.query(ActionLog).get(
                            self.action_log_id)
                        action_log_entry.retries += 1
                        if (action_log_entry.retries ==
                                ACTION_MAX_NR_OF_RETRIES):
                            log.critical('Max retries reached, giving up.',
                                         exc_info=True)
                            action_log_entry.status = 'failed'
                            self._log_to_statsd(action_log_entry.status)
                        db_session.commit()

                    # Wait before retrying
                    gevent.sleep(self.retry_interval)
def callback(e):
    is_transient = isinstance(e, TRANSIENT_NETWORK_ERRS)
    mysql_error = None

    log = logger or get_logger()

    if isinstance(e, _mysql_exceptions.OperationalError):
        mysql_error = e
    elif isinstance(e, StatementError) and isinstance(
        e.orig, _mysql_exceptions.OperationalError
    ):
        mysql_error = e.orig

    if mysql_error:
        for msg in TRANSIENT_MYSQL_MESSAGES:
            if msg in mysql_error.message:
                is_transient = True

    if is_transient:
        occurrences[0] += 1
        if occurrences[0] < 20:
            return
    else:
        occurrences[0] = 1

    if account_id:
        try:
            with session_scope(account_id) as db_session:
                account = db_session.query(Account).get(account_id)
                sync_error = account.sync_error
                if not sync_error or isinstance(sync_error, basestring):
                    account.update_sync_error(e)
                    db_session.commit()
        except:
            log.error(
                "Error saving sync_error to account object",
                account_id=account_id,
                **create_error_log_context(sys.exc_info())
            )

    log_uncaught_errors(
        logger, account_id=account_id, provider=provider,
        occurrences=occurrences[0]
    )
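# Why the callback checks `e.orig`: SQLAlchemy wraps driver-level (DBAPI)
# exceptions in StatementError and exposes the original exception as
# `.orig`, so a MySQL OperationalError can arrive either bare or wrapped;
# both paths above unwrap to the same underlying error before the
# transient-message match.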
def delete_marked_accounts(shard_id, ids_to_delete, throttle=False,
                           dry_run=False):
    start = time.time()
    deleted_count = 0

    for account_id, namespace_id in ids_to_delete:
        try:
            with session_scope(namespace_id) as db_session:
                account = db_session.query(Account).get(account_id)
                if not account:
                    log.critical('Account does not exist',
                                 account_id=account_id)
                    continue

                if account.sync_should_run or not account.is_deleted:
                    log.warn('Account NOT marked for deletion. '
                             'Will not delete', account_id=account_id)
                    continue

            log.info('Deleting account', account_id=account_id)
            start_time = time.time()
            # Delete data in database
            try:
                log.info('Deleting database data', account_id=account_id)
                delete_namespace(account_id, namespace_id,
                                 throttle=throttle, dry_run=dry_run)
            except Exception as e:
                log.critical('Database data deletion failed', error=e,
                             account_id=account_id)
                continue

            # Delete liveness data
            log.debug('Deleting liveness data', account_id=account_id)
            clear_heartbeat_status(account_id)

            deleted_count += 1
            statsd_client.timing('mailsync.account_deletion.queue.deleted',
                                 time.time() - start_time)
        except Exception:
            log_uncaught_errors(log, account_id=account_id)

    end = time.time()
    log.info('All data deleted successfully', shard_id=shard_id,
             time=end - start, count=deleted_count)
def _run(self): """ Index into CloudSearch the contacts of all namespaces. """ try: self._set_transaction_pointers() self.log.info( "Starting contact-search-index service", transaction_pointers=self.transaction_pointers, ) while True: self._publish_heartbeat() self._index_transactions() except Exception: log_uncaught_errors(log)
def test_log_uncaught_errors(logfile):
    try:
        error_throwing_function()
    except:
        log_uncaught_errors()

    last_log_entry = json.loads(logfile.readlines()[-1])

    assert 'exception' in last_log_entry
    exc_info = last_log_entry['exception']
    assert 'ValueError' in exc_info
    assert 'GreenletExit' not in exc_info
    # Check that the traceback is logged. The traceback stored in
    # sys.exc_info() contains an extra entry for the
    # test_log_uncaught_errors frame, so just look for the rest of the
    # traceback.
    tb = sys.exc_info()[2]
    for call in traceback.format_tb(tb)[1:]:
        assert call in exc_info
def poll(self):
    # We really don't want to take on more load than we can bear, so we
    # need to check the CPU usage before accepting new accounts.
    # Note that we can't check this for the current core because the kernel
    # transparently moves programs across cores.
    usage_per_cpu = psutil.cpu_percent(percpu=True)
    # Conservatively, stop accepting accounts if the CPU usage is over 90%
    # for every core.
    overloaded_cpus = all(
        [cpu_usage > 90.0 for cpu_usage in usage_per_cpu])

    if self.stealing_enabled and not overloaded_cpus:
        r = self.queue_client.claim_next(self.process_identifier)
        if r:
            self.log.info('Claimed new account sync', account_id=r)

    # Determine which accounts to sync
    start_accounts = self.accounts_to_sync()
    statsd_client.gauge(
        'accounts.{}.mailsync-{}.count'.format(self.host, self.cpu_id),
        len(start_accounts))

    # Perform the appropriate action on each account
    for account_id in start_accounts:
        if account_id not in self.syncing_accounts:
            try:
                self.start_sync(account_id)
            except OperationalError:
                self.log.error('Database error starting account sync',
                               exc_info=True)
                log_uncaught_errors()

    stop_accounts = self.syncing_accounts - set(start_accounts)
    for account_id in stop_accounts:
        self.log.info('sync service stopping sync',
                      account_id=account_id)
        try:
            self.stop_sync(account_id)
        except OperationalError:
            self.log.error('Database error stopping account sync',
                           exc_info=True)
            log_uncaught_errors()
def poll(self):
    # We really don't want to take on more load than we can bear, so we
    # need to check the CPU usage before accepting new accounts.
    # Note that we can't check this for the current core because the kernel
    # transparently moves programs across cores.
    usage_per_cpu = psutil.cpu_percent(percpu=True)
    # Conservatively, stop accepting accounts if the CPU usage is over 90%
    # for every core.
    overloaded_cpus = all([cpu_usage > 90.0
                           for cpu_usage in usage_per_cpu])

    if self.stealing_enabled and not overloaded_cpus:
        r = self.queue_client.claim_next(self.process_identifier)
        if r:
            self.log.info('Claimed new account sync', account_id=r)

    # Determine which accounts to sync
    start_accounts = self.accounts_to_sync()
    statsd_client.gauge(
        'accounts.{}.mailsync-{}.count'.format(self.host, self.cpu_id),
        len(start_accounts))

    # Perform the appropriate action on each account
    for account_id in start_accounts:
        if account_id not in self.syncing_accounts:
            try:
                self.start_sync(account_id)
            except OperationalError:
                self.log.error('Database error starting account sync',
                               exc_info=True)
                log_uncaught_errors()

    stop_accounts = self.syncing_accounts - set(start_accounts)
    for account_id in stop_accounts:
        self.log.info('sync service stopping sync',
                      account_id=account_id)
        try:
            self.stop_sync(account_id)
        except OperationalError:
            self.log.error('Database error stopping account sync',
                           exc_info=True)
            log_uncaught_errors()
def batch_delete_namespaces(ids_to_delete, throttle=False, dry_run=False):
    start = time.time()

    deleted_count = 0
    for account_id, namespace_id in ids_to_delete:
        try:
            delete_namespace(namespace_id, throttle=throttle,
                             dry_run=dry_run)
        except AccountDeletionErrror as e:
            log.critical('AccountDeletionErrror', error_message=e.message)
        except Exception:
            log_uncaught_errors(log, account_id=account_id)

        deleted_count += 1

    end = time.time()
    log.info('All data deleted successfully for ids',
             ids_to_delete=ids_to_delete,
             time=end - start, count=deleted_count)
def retry_with_logging(func, logger=None, retry_classes=None,
                       fail_classes=None, account_id=None,
                       backoff_delay=BACKOFF_DELAY):
    callback = lambda: log_uncaught_errors(logger, account_id=account_id)
    return retry(func, exc_callback=callback, retry_classes=retry_classes,
                 fail_classes=fail_classes, backoff_delay=backoff_delay)()
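# A typical call site (illustrative; `sync_loop`, `log` and `account` are
# placeholder names, not identifiers from this module):
#
#     retry_with_logging(sync_loop, logger=log, account_id=account.id)
#
# Each uncaught exception is logged via the callback, then `retry` sleeps
# for the backoff delay and invokes `func` again.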
def callback(e):
    is_transient = isinstance(e, TRANSIENT_NETWORK_ERRS)
    mysql_error = None

    log = logger or get_logger()

    if isinstance(e, _mysql_exceptions.OperationalError):
        mysql_error = e
    elif isinstance(e, StatementError) and isinstance(
            e.orig, _mysql_exceptions.OperationalError):
        mysql_error = e.orig

    if mysql_error:
        for msg in TRANSIENT_MYSQL_MESSAGES:
            if msg in mysql_error.message:
                is_transient = True

    if is_transient:
        occurrences[0] += 1
        if occurrences[0] < 20:
            return
    else:
        occurrences[0] = 1

    if account_id:
        try:
            with session_scope(account_id) as db_session:
                account = db_session.query(Account).get(account_id)
                sync_error = account.sync_error
                if not sync_error or isinstance(sync_error, basestring):
                    account.update_sync_error(e)
                    db_session.commit()
        except:
            log.error(
                "Error saving sync_error to account object",
                account_id=account_id,
                **create_error_log_context(sys.exc_info())
            )

    log_uncaught_errors(logger, account_id=account_id, provider=provider,
                        occurrences=occurrences[0])
def callback(e):
    if isinstance(e, TRANSIENT_NETWORK_ERRS):
        occurrences[0] += 1
        if occurrences[0] < 20:
            return
    else:
        occurrences[0] = 1

    if account_id:
        try:
            with session_scope(account_id) as db_session:
                account = db_session.query(Account).get(account_id)
                sync_error = account.sync_error
                if not sync_error or isinstance(sync_error, basestring):
                    account.update_sync_error(e)
                    db_session.commit()
        except:
            log.error('Error saving sync_error to account object',
                      account_id=account_id, exc_info=True)

    log_uncaught_errors(logger, account_id=account_id, provider=provider,
                        error_type=type(e).__name__,
                        occurrences=occurrences[0])
def retry_and_report_killed(func, account_id, folder_name=None,
                            logger=None, retry_classes=None,
                            fail_classes=None, exc_callback=None,
                            fail_callback=None, **reset_params):
    if not exc_callback:
        exc_callback = lambda: log_uncaught_errors(logger=logger,
                                                   account_id=account_id)
    if not fail_callback:
        fail_callback = lambda: report_killed(account_id, folder_name)

    return retry(func, exc_callback=exc_callback,
                 fail_callback=fail_callback, retry_classes=retry_classes,
                 fail_classes=fail_classes, **reset_params)()
def delete_marked_accounts(shard_id, ids_to_delete, throttle=False,
                           dry_run=False):
    start = time.time()
    deleted_count = 0

    for account_id, namespace_id in ids_to_delete:
        try:
            with session_scope(namespace_id) as db_session:
                account = db_session.query(Account).get(account_id)
                if not account:
                    log.critical("Account does not exist",
                                 account_id=account_id)
                    continue

                if account.sync_should_run or not account.is_deleted:
                    log.warn("Account NOT marked for deletion. "
                             "Will not delete", account_id=account_id)
                    continue

            log.info("Deleting account", account_id=account_id)
            start_time = time.time()
            # Delete data in database
            try:
                log.info("Deleting database data", account_id=account_id)
                delete_namespace(account_id, namespace_id,
                                 throttle=throttle, dry_run=dry_run)
            except Exception as e:
                log.critical("Database data deletion failed", error=e,
                             account_id=account_id)
                continue

            # Delete liveness data
            log.debug("Deleting liveness data", account_id=account_id)
            clear_heartbeat_status(account_id)

            deleted_count += 1
            statsd_client.timing("mailsync.account_deletion.queue.deleted",
                                 time.time() - start_time)
        except Exception:
            log_uncaught_errors(log, account_id=account_id)

    end = time.time()
    log.info("All data deleted successfully", shard_id=shard_id,
             time=end - start, count=deleted_count)
def test_retry_count_resets(monkeypatch):
    monkeypatch.setattr('inbox.util.concurrency._resettable_counter',
                        lambda x, y: _resettable_counter(max_count=3,
                                                         reset_interval=0))
    logger = MockLogger()

    failing_function = FailingFunction(ValueError, max_executions=6,
                                       delay=.001)

    exc_callback = lambda: log_uncaught_errors(logger)

    retry(failing_function, exc_callback=exc_callback,
          max_count=3, backoff_delay=0)()

    assert logger.call_count == 5
    assert failing_function.call_count == 6
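# `_resettable_counter` is monkeypatched above but not shown in this
# section. A sketch consistent with the test's behavior (an assumption
# about its semantics, not the library's actual code): count failures, but
# start the count over whenever `reset_interval` seconds have passed since
# the previous failure.
import time

def resettable_counter(max_count=3, reset_interval=300):
    count = 0
    last_failure = time.time()
    while True:
        if time.time() - last_failure > reset_interval:
            count = 0  # quiet period elapsed; forgive earlier failures
        count += 1
        last_failure = time.time()
        yield count <= max_count  # False once the retry budget is spent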
def delete_marked_accounts(shard_id, throttle=False, dry_run=False):
    start = time.time()
    deleted_count = 0
    ids_to_delete = []

    with session_scope_by_shard_id(shard_id) as db_session:
        ids_to_delete = [(acc.id, acc.namespace.id) for acc
                         in db_session.query(Account) if acc.is_deleted]

    queue_size = len(ids_to_delete)
    for account_id, namespace_id in ids_to_delete:
        # queue_size = length of queue
        # deleted_count = number of accounts deleted during loop iteration
        # this is necessary because the length of ids_to_delete doesn't
        # change during loop iteration
        statsd_client.gauge(
            'mailsync.{}.account_deletion.queue.length'.format(shard_id),
            queue_size - deleted_count)
        try:
            with session_scope(namespace_id) as db_session:
                account = db_session.query(Account).get(account_id)
                if not account:
                    log.critical('Account does not exist',
                                 account_id=account_id)
                    continue

                if account.sync_should_run or not account.is_deleted:
                    log.warn('Account NOT marked for deletion. '
                             'Will not delete', account_id=account_id)
                    continue

            log.info('Deleting account', account_id=account_id)
            start_time = time.time()
            # Delete data in database
            try:
                log.info('Deleting database data', account_id=account_id)
                delete_namespace(account_id, namespace_id,
                                 throttle=throttle, dry_run=dry_run)
            except Exception as e:
                log.critical('Database data deletion failed', error=e,
                             account_id=account_id)
                continue

            # Delete liveness data
            log.debug('Deleting liveness data', account_id=account_id)
            clear_heartbeat_status(account_id)

            deleted_count += 1
            statsd_client.incr('mailsync.account_deletion.queue.deleted', 1)
            statsd_client.timing('mailsync.account_deletion.queue.deleted',
                                 time.time() - start_time)
        except Exception:
            log_uncaught_errors(log, account_id=account_id)

    end = time.time()
    log.info('All data deleted successfully', shard_id=shard_id,
             time=end - start, count=deleted_count)
def execute_with_lock(self): """ Process a task and return whether it executed successfully. """ self.log = logger.new( record_ids=list(set(self.record_ids)), action_log_ids=self.action_log_ids[:100], n_action_log_ids=len(self.action_log_ids), action=self.action_name, account_id=self.account_id, extra_args=self.extra_args, ) # Double-check that the action is still pending. # Although the task queue is populated based on pending actions, it's # possible that the processing of one action involved marking other # actions as failed. ( records_to_process, action_ids_to_process, ) = self._get_records_and_actions_to_process() if len(action_ids_to_process) == 0: return True try: before, after = self._execute_timed_action(records_to_process) self.log.debug("executing action", action_log_ids=action_ids_to_process) with session_scope(self.account_id) as db_session: action_log_entries = db_session.query(ActionLog).filter( ActionLog.id.in_(action_ids_to_process)) max_latency = max_func_latency = 0 for action_log_entry in action_log_entries: latency, func_latency = self._mark_action_as_successful( action_log_entry, before, after, db_session) if latency > max_latency: max_latency = latency if func_latency > max_func_latency: max_func_latency = func_latency self.log.info( "syncback action completed", latency=max_latency, process=self.parent_service().process_number, func_latency=max_func_latency, ) return True except: log_uncaught_errors(self.log, account_id=self.account_id, provider=self.provider) with session_scope(self.account_id) as db_session: action_log_entries = db_session.query(ActionLog).filter( ActionLog.id.in_(action_ids_to_process)) marked_as_failed = False for action_log_entry in action_log_entries: action_log_entry.retries += 1 if action_log_entry.retries == ACTION_MAX_NR_OF_RETRIES: marked_as_failed = True if marked_as_failed: self.log.debug( "marking actions as failed", action_log_ids=action_ids_to_process, ) # If we merged actions, fail them all at the same time. for action_log_entry in action_log_entries: self._mark_action_as_failed(action_log_entry, db_session) db_session.commit() return False
def callback():
    log_uncaught_errors(logger, account_id=account_id, provider=provider)
def _run(self): """ Index into CloudSearch the contacts of all namespaces. """ try: for key in engine_manager.engines: with session_scope_by_shard_id(key) as db_session: pointer = db_session.query( ContactSearchIndexCursor).first() if pointer: self.transaction_pointers[key] = pointer.transaction_id else: # Never start from 0; if the service hasn't # run before start from the latest # transaction, with the expectation that a # backfill will be run separately. latest_transaction = db_session.query(Transaction). \ order_by(desc(Transaction.created_at)).first() if latest_transaction: self.transaction_pointers[ key] = latest_transaction.id else: self.transaction_pointers[key] = 0 self.log.info('Starting contact-search-index service', transaction_pointers=self.transaction_pointers) while True: for key in engine_manager.engines: with session_scope_by_shard_id(key) as db_session: transactions = db_session.query(Transaction). filter( Transaction.id > self.transaction_pointers[key], Transaction.object_type == 'contact') \ .with_hint( Transaction, "USE INDEX (ix_transaction_table_name)") \ .order_by(asc(Transaction.id)) \ .limit(self.chunk_size) \ .options(joinedload(Transaction.namespace)).all() # index up to chunk_size transactions should_sleep = False if transactions: self.index(transactions, db_session) oldest_transaction = min( transactions, key=lambda t: t.created_at) current_timestamp = datetime.utcnow() latency = (current_timestamp - oldest_transaction.created_at).seconds self._report_transactions_latency(latency) new_pointer = transactions[-1].id self.update_pointer(new_pointer, key, db_session) db_session.commit() else: should_sleep = True if should_sleep: log.info('sleeping') sleep(self.poll_interval) except Exception: log_uncaught_errors(log)
def execute_with_lock(self):
    self.log = logger.new(record_ids=self.record_ids,
                          action_log_ids=self.action_log_ids,
                          action=self.action_name,
                          account_id=self.account_id,
                          extra_args=self.extra_args)

    # Double-check that the action is still pending.
    # Although the task queue is populated based on pending actions, it's
    # possible that the processing of one action involved marking other
    # actions as failed.
    records_to_process, action_ids_to_process = \
        self._get_records_and_actions_to_process()
    if len(action_ids_to_process) == 0:
        return

    for attempt in range(ACTION_MAX_NR_OF_RETRIES):
        self.log.debug("executing action", attempt=attempt)
        try:
            before, after = self._execute_timed_action(records_to_process)

            with session_scope(self.account_id) as db_session:
                action_log_entries = db_session.query(ActionLog). \
                    filter(ActionLog.id.in_(action_ids_to_process))
                for action_log_entry in action_log_entries:
                    self._mark_action_as_successful(
                        action_log_entry, before, after, db_session)
            return
        except:
            log_uncaught_errors(self.log, account_id=self.account_id,
                                provider=self.provider)
            with session_scope(self.account_id) as db_session:
                action_log_entries = db_session.query(ActionLog). \
                    filter(ActionLog.id.in_(action_ids_to_process))
                marked_as_failed = False
                for action_log_entry in action_log_entries:
                    action_log_entry.retries += 1
                    if action_log_entry.retries == ACTION_MAX_NR_OF_RETRIES:
                        marked_as_failed = True
                        self._mark_action_as_failed(
                            action_log_entry, db_session)
                    # If we've merged SyncbackTasks then their
                    # corresponding actions should all fail at the same
                    # time.
                    assert (not marked_as_failed or
                            action_log_entry.retries ==
                            ACTION_MAX_NR_OF_RETRIES)
                db_session.commit()

            if marked_as_failed:
                return

            # Wait before retrying
            self.log.info("Syncback task retrying action after sleeping",
                          duration=self.retry_interval)
            # TODO(T6974): We might want to do some kind of exponential
            # backoff with jitter to avoid the thundering herd problem if a
            # provider suddenly starts having issues for a short period of
            # time.
            gevent.sleep(self.retry_interval)
def retry_with_logging(func, logger=None, retry_classes=None,
                       fail_classes=None, account_id=None, **reset_params):
    callback = lambda: log_uncaught_errors(logger, account_id=account_id)
    return retry(func, exc_callback=callback, fail_callback=callback,
                 retry_classes=retry_classes, fail_classes=fail_classes,
                 **reset_params)()
def f(session): if obj_state["sent_event"]: return id = obj_state["id"] sync_should_run = obj_state["sync_should_run"] sync_host = obj_state["sync_host"] desired_sync_host = obj_state["desired_sync_host"] try: if sync_host is not None: # Somebody is actively syncing this Account, so notify them if # they should give up the Account. if not sync_should_run or (sync_host != desired_sync_host and desired_sync_host is not None): queue_name = SYNC_EVENT_QUEUE_NAME.format(sync_host) log.info( "Sending 'migrate_from' event for Account", account_id=id, queue_name=queue_name, ) EventQueue(queue_name).send_event({ "event": "migrate_from", "id": id }) return if not sync_should_run: # We don't need to notify anybody because the Account is not # actively being synced (sync_host is None) and sync_should_run is False, # so just return early. return if desired_sync_host is not None: # Nobody is actively syncing the Account, and we have somebody # who wants to sync this Account, so notify them. queue_name = SYNC_EVENT_QUEUE_NAME.format( desired_sync_host) log.info( "Sending 'migrate_to' event for Account", account_id=id, queue_name=queue_name, ) EventQueue(queue_name).send_event({ "event": "migrate_to", "id": id }) return # Nobody is actively syncing the Account, and nobody in particular # wants to sync the Account so notify the shared queue. shared_queue = shared_sync_event_queue_for_zone( config.get("ZONE")) log.info( "Sending 'migrate' event for Account", account_id=id, queue_name=shared_queue.queue_name, ) shared_queue.send_event({"event": "migrate", "id": id}) obj_state["sent_event"] = True except: log_uncaught_errors( log, account_id=id, sync_host=sync_host, desired_sync_host=desired_sync_host, )
def wrapped(*args, **kwargs):
    try:
        return func(*args, **kwargs)
    except exception_type:
        log_uncaught_errors()
        pdb.post_mortem()
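# `wrapped` above closes over `func` and `exception_type`, so it almost
# certainly lives inside a decorator factory. A self-contained sketch of
# that shape (`post_mortem_on` is an illustrative name, not from this
# codebase):
import pdb
from functools import wraps

def post_mortem_on(exception_type=Exception):
    """Drop into a pdb post-mortem session when the wrapped func raises."""
    def decorator(func):
        @wraps(func)
        def wrapped(*args, **kwargs):
            try:
                return func(*args, **kwargs)
            except exception_type:
                # The original calls log_uncaught_errors() here first;
                # pdb then inspects the live traceback.
                pdb.post_mortem()
        return wrapped
    return decorator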
def execute_with_lock(self):
    log = logger.new(record_id=self.record_id,
                     action_log_id=self.action_log_id,
                     action=self.action_name,
                     account_id=self.account_id,
                     extra_args=self.extra_args)
    for _ in range(ACTION_MAX_NR_OF_RETRIES):
        try:
            before_func = datetime.utcnow()
            func_args = [self.account_id, self.record_id]
            if self.extra_args:
                func_args.append(self.extra_args)
            if self.uses_crispin_client():
                assert self.crispin_client is not None
                func_args.insert(0, self.crispin_client)
            self.func(*func_args)
            after_func = datetime.utcnow()

            with session_scope(self.account_id) as db_session:
                action_log_entry = db_session.query(ActionLog).get(
                    self.action_log_id)
                action_log_entry.status = 'successful'
                db_session.commit()
                latency = round(
                    (datetime.utcnow() -
                     action_log_entry.created_at).total_seconds(), 2)
                func_latency = round(
                    (after_func - before_func).total_seconds(), 2)
                log.info('syncback action completed',
                         action_id=self.action_log_id,
                         latency=latency,
                         process=self.parent_service().process_number,
                         func_latency=func_latency)
                self._log_to_statsd(action_log_entry.status, latency)
                return
        except Exception:
            log_uncaught_errors(log, account_id=self.account_id,
                                provider=self.provider)
            with session_scope(self.account_id) as db_session:
                action_log_entry = db_session.query(ActionLog).get(
                    self.action_log_id)
                action_log_entry.retries += 1
                if (action_log_entry.retries ==
                        ACTION_MAX_NR_OF_RETRIES):
                    log.critical('Max retries reached, giving up.',
                                 exc_info=True)
                    action_log_entry.status = 'failed'
                    self._log_to_statsd(action_log_entry.status)
                    db_session.commit()
                    return
                db_session.commit()

            # Wait before retrying
            log.info("Syncback task retrying action after sleeping",
                     duration=self.retry_interval)
            # TODO(T6974): We might want to do some kind of exponential
            # backoff with jitter to avoid the thundering herd problem if a
            # provider suddenly starts having issues for a short period of
            # time.
            gevent.sleep(self.retry_interval)