def distributed_send_message(message): # If I am the master, attempt sending my messages through my slaves. if coordinator.am_i_master(): try: if coordinator.slave_count and coordinator.slaves: for i, address in enumerate(coordinator.slaves): if i >= coordinator.slave_count: logger.error('Failed using all configured slaves; resorting to local send_message') break if rpc.send_message_to_slave(message, address): return True except StopIteration: logger.warning('No more slaves. Sending locally.') logger.info('Sending message (ID %s) locally', message.get('message_id', '?')) runtime = send_message(message) add_mode_stat(message['mode'], runtime) metrics_key = 'app_%(application)s_mode_%(mode)s_cnt' % message metrics.add_new_metrics({metrics_key: 0}) metrics.incr(metrics_key) if runtime is not None: return True raise Exception('Failed sending message')
def handle_api_notification_request(socket, address, req): notification = req['data'] notification['subject'] = '[%(application)s] %(subject)s' % notification role = notification.get('role') if not role: reject_api_request(socket, address, 'INVALID role') logger.warn('Dropping OOB message with invalid role "%s" from app %s', role, notification.get('application')) return target = notification.get('target') if not target: reject_api_request(socket, address, 'INVALID target') logger.warn('Dropping OOB message with invalid target "%s" from app %s', target, notification.get('application')) return expanded_targets = cache.targets_for_role(role, target) if not expanded_targets: reject_api_request(socket, address, 'INVALID role:target') logger.warn('Dropping OOB message with invalid role:target "%s:%s" from app %s', role, target, notification.get('application')) return # If we're rendering this using templates+context instead of body, fill in the # needed iris key. if notification.get('template') and notification.get('context'): notification['context']['iris'] = notification['context'].get('iris', {}) logger.info('-> %s OK, to %s:%s (%s:%s)', address, role, target, notification.get('application', '?'), notification.get('priority', notification.get('mode', '?'))) for _target in expanded_targets: temp_notification = notification.copy() temp_notification['target'] = _target send_queue.put(temp_notification) metrics.incr('notification_cnt') socket.sendall(msgpack.packb('OK'))
def distributed_send_message(message): if rpc.num_slaves and rpc.sender_slaves: for i, address in enumerate(rpc.sender_slaves): if i >= rpc.num_slaves: logger.error( 'Failed using all configured slaves; resorting to local send_message' ) break if rpc.send_message_to_slave(message, address): return True logger.info('Sending message (ID %s) locally', message.get('message_id', '?')) runtime = send_message(message) add_mode_stat(message['mode'], runtime) metrics_key = 'app_%(application)s_mode_%(mode)s_cnt' % message metrics.add_new_metrics({metrics_key: 0}) metrics.incr(metrics_key) if runtime is not None: return True raise Exception('Failed sending message')
def allow_send(self, message): application = message.get('application') if not application: return True # Purpose of quotas is to protect downstreams. If we're already going to drop this message, # don't let it account against quota. if message.get('mode') == 'drop': return True rate = self.rates.get(application) if not rate: return True hard_buckets, soft_buckets, hard_limit, soft_limit, wait_time, plan_name, target = rate # Increment both buckets for this minute hard_buckets[-1] += 1 soft_buckets[-1] += 1 # If hard limit breached, disallow sending this message and create incident hard_quota_usage = sum(hard_buckets) hard_usage_pct = 0 if hard_limit > 0: hard_usage_pct = (hard_quota_usage // hard_limit) * 100 metrics.set('app_%s_quota_hard_usage_pct' % application, hard_usage_pct) if hard_quota_usage > hard_limit: metrics.incr('quota_hard_exceed_cnt') if plan_name: with self.last_incidents_mutex: self.notify_incident(application, hard_limit, len(hard_buckets), plan_name, wait_time) return False # If soft limit breached, just notify owner and still send soft_quota_usage = sum(soft_buckets) soft_usage_pct = 0 if soft_limit > 0: soft_usage_pct = (soft_quota_usage // soft_limit) * 100 metrics.set('app_%s_quota_soft_usage_pct' % application, soft_usage_pct) if soft_quota_usage > soft_limit: metrics.incr('quota_soft_exceed_cnt') if target: with self.last_soft_quota_notification_time_mutex: self.notify_target(application, soft_limit, len(soft_buckets), *target) return True return True
def main(): boot_time = time.time() config = load_config() metrics.init(config, 'iris-owa-sync', default_metrics) owaconfig = config.get('owa') if not owaconfig: logger.critical('Missing OWA configs') sys.exit(1) api_host = owaconfig.get('api_host', 'http://localhost:16649') iris_client = IrisClient(api_host, 0, owaconfig['iris_app'], owaconfig['iris_app_key']) proxies = owaconfig.get('proxies') # only way to configure a proxy is to monkey-patch (http adapter) a monkey-patch (baseprotocol) :/ if proxies: UseProxyHttpAdapter._my_proxies = proxies exchangelib.protocol.BaseProtocol.HTTP_ADAPTER_CLS = UseProxyHttpAdapter creds = exchangelib.Credentials(**owaconfig['credentials']) try: nap_time = int(owaconfig.get('sleep_interval', 60)) except ValueError: nap_time = 60 while True: start_time = time.time() message_count = 0 try: config = exchangelib.Configuration(credentials=creds, **owaconfig['config']) account = exchangelib.Account(config=config, access_type=exchangelib.DELEGATE, **owaconfig['account']) except (exchangelib.errors.EWSError, requests.exceptions.RequestException): logger.exception('Failed authenticating to OWA365') metrics.incr('owa_api_failure_count') else: logger.info('Receiving mail on behalf of %s', owaconfig['account'].get('primary_smtp_address')) message_count = poll(account, iris_client) now = time.time() run_time = now - start_time logger.info( 'Last run took %2.f seconds and processed %s messages. Waiting %s seconds until next poll..', run_time, message_count, nap_time) metrics.set('uptime', now - boot_time) metrics.emit() sleep(nap_time)
def ldap_sync_loop(ldap_lists, engine, interval): sleep(60) while True: logger.info('Starting ldap sync...') try: run_start = time.time() sync_ldap_lists(ldap_lists, engine) logger.info('Ldap mailing list sync took %.2f seconds', time.time() - run_start) except Exception: metrics.incr('failed_tasks') logger.exception('Error syncing from ldap!') sleep(interval)
def oncall_sync_loop(config, engine, interval): while True: logger.info('Starting oncall sync...') try: run_start = time.time() sync_from_oncall(config, engine) logger.info('oncall sync took %.2f seconds', time.time() - run_start) except Exception: metrics.incr('failed_tasks') logger.exception('Error syncing from oncall!') sleep(interval)
def handle_slave_send(socket, address, req): message = req['data'] message_id = message.get('message_id', '?') try: runtime = send_funcs['send_message'](message) add_mode_stat(message['mode'], runtime) metrics_key = 'app_%(application)s_mode_%(mode)s_cnt' % message metrics.add_new_metrics({metrics_key: 0}) metrics.incr(metrics_key) if runtime is not None: response = 'OK' access_logger.info( 'Message (ID %s) from master %s sent successfully', message_id, address) metrics.incr('slave_message_send_success_cnt') else: response = 'FAIL' access_logger.error( 'Got falsy value from send_message for message (ID %s) from master %s: %s', message_id, address, runtime) metrics.incr('slave_message_send_fail_cnt') except Exception: response = 'FAIL' logger.exception('Sending message (ID %s) from master %s failed.') metrics.incr('slave_message_send_fail_cnt') socket.sendall(msgpack.packb(response))
def send_message_to_slave(message, address): try: payload = generate_msgpack_message_payload(message) except TypeError: logger.exception('Failed encoding message %s as msgpack', message) metrics.incr('rpc_message_pass_fail_cnt') return False pretty_address = '%s:%s' % address message_id = message.get('message_id', '?') try: s = socket.create_connection(address) s.send(payload) sender_resp = msgpack_unpack_msg_from_socket(s) s.close() except socket.error: logging.exception('Failed connecting to %s to send message (ID %s)', pretty_address, message_id) metrics.incr('rpc_message_pass_fail_cnt') return False if sender_resp == 'OK': access_logger.info( 'Successfully passed message (ID %s) to %s for sending', message_id, pretty_address) metrics.incr('rpc_message_pass_success_cnt') return True else: logger.error('Failed sending message (ID %s) through %s: %s', message_id, pretty_address, sender_resp) metrics.incr('rpc_message_pass_fail_cnt') return False
def prune_target(engine, target_name, target_type): if target_type == 'user': metrics.incr('users_purged') else: metrics.incr('others_purged') if target_type == 'team': try: # rename team to prevent namespace conflict but preserve the ability to reactivate it in the future new_name = str(uuid.uuid4()) engine.execute('''UPDATE `target` SET `active` = FALSE, `name` = %s WHERE `name` = %s AND `type_id` = (SELECT `id` FROM `target_type` WHERE `name` = %s)''', (new_name, target_name, target_type)) logger.info('Deleted inactive oncall team %s', target_name) except SQLAlchemyError as e: logger.error('Deleting oncall team %s failed: %s', target_name, e) metrics.incr('sql_errors') return try: engine.execute('''DELETE FROM `target` WHERE `name` = %s AND `type_id` = (SELECT `id` FROM `target_type` WHERE `name` = %s)''', (target_name, target_type)) logger.info('Deleted inactive target %s', target_name) # The user has messages or some other user data which should be preserved. # Just mark as inactive. except IntegrityError: logger.info('Marking target %s inactive', target_name) engine.execute('''UPDATE `target` SET `active` = FALSE WHERE `name` = %s AND `type_id` = (SELECT `id` FROM `target_type` WHERE `name` = %s)''', (target_name, target_type)) except SQLAlchemyError as e: logger.error('Deleting target %s failed: %s', target_name, e) metrics.incr('sql_errors') return
def stats_task(): connection = db.engine.raw_connection() cursor = connection.cursor() cursor.execute('SELECT `id`, `name` FROM `application`') applications = [{'id': row[0], 'name': row[1]} for row in cursor] for app in applications: try: stats = app_stats.calculate_app_stats(app, connection, cursor) set_app_stats(app, stats, connection, cursor) except Exception: logger.exception('App stats calculation failed for app %s', app['name']) metrics.incr('task_failure') cursor.close() connection.close()
def allow_send(self, message): application = message.get('application') if not application: return True rate = self.rates.get(application) if not rate: return True hard_buckets, soft_buckets, hard_limit, soft_limit, wait_time, plan_name, target = rate # Increment both buckets for this minute hard_buckets[-1] += 1 soft_buckets[-1] += 1 # If hard limit breached, disallow sending this message and create incident hard_quota_usage = sum(hard_buckets) hard_usage_pct = 0 if hard_limit > 0: hard_usage_pct = (hard_quota_usage / hard_limit) * 100 metrics.set('app_%s_quota_hard_usage_pct' % application, hard_usage_pct) if hard_quota_usage > hard_limit: metrics.incr('quota_hard_exceed_cnt') self.notify_incident(application, hard_limit, len(hard_buckets), plan_name, wait_time) return False # If soft limit breached, just notify owner and still send soft_quota_usage = sum(soft_buckets) soft_usage_pct = 0 if soft_limit > 0: soft_usage_pct = (soft_quota_usage / soft_limit) * 100 metrics.set('app_%s_quota_soft_usage_pct' % application, soft_usage_pct) if soft_quota_usage > soft_limit: metrics.incr('quota_soft_exceed_cnt') self.notify_target(application, soft_limit, len(soft_buckets), *target) return True return True
def prune_user(engine, username): metrics.incr('users_purged') try: engine.execute('''DELETE FROM `target` WHERE `name` = %s AND `type_id` = (SELECT `id` FROM `target_type` WHERE `name` = 'user')''', username) logger.info('Deleted inactive user %s', username) # The user has messages or some other user data which should be preserved. # Just mark as inactive. except IntegrityError: logger.info('Marking user %s inactive', username) engine.execute('''UPDATE `target` SET `active` = FALSE WHERE `name` = %s AND `type_id` = (SELECT `id` FROM `target_type` WHERE `name` = 'user')''', username) except SQLAlchemyError as e: logger.error('Deleting user %s failed: %s', username, e) metrics.incr('sql_errors')
def handle_slave_send(socket, address, req): message = req['data'] message_id = message.get('message_id', '?') message['to_slave'] = True try: runtime = send_funcs['message_send_enqueue'](message) response = 'OK' access_logger.info('Message (ID %s) from master %s queued successfully', message_id, address) except Exception: response = 'FAIL' logger.exception('Queueing message (ID %s) from master %s failed.') access_logger.error('Failed queueing message (ID %s) from master %s: %s', message_id, address, runtime) metrics.incr('slave_message_send_fail_cnt') socket.sendall(msgpack.packb(response))
def stats_task(connection, cursor): cursor.execute('SELECT `id`, `name` FROM `application`') applications = [{'id': row[0], 'name': row[1]} for row in cursor] # clean up old app stats cursor.execute('''DELETE FROM `application_stats`''') for app in applications: try: stats = app_stats.calculate_app_stats(app, connection, cursor) set_app_stats(app, stats, connection, cursor) except Exception: logger.exception('App stats calculation failed for app %s', app['name']) metrics.incr('task_failure') try: stats = app_stats.calculate_global_stats(connection, cursor) set_global_stats(stats, connection, cursor) except Exception: logger.exception('Global stats calculation failed') metrics.incr('task_failure')
def relay(message, iris_client): # Get headers into the format the iris expects from gmail headers = [{ 'name': header.name, 'value': header.value } for header in message.headers] # If this is a bulk message or an auto reply or something else, don't bother sending it to iris-api if is_pointless_message(headers): logger.info('Not relaying pointless message %s (from %s) to iris-api', message.message_id, message.sender.email_address) metrics.incr('message_ignore_count') return # To and From headers are strangely missing if message.to_recipients: headers.append({ 'name': 'To', 'value': message.to_recipients[0].email_address }) headers.append({'name': 'From', 'value': message.sender.email_address}) data = {'headers': headers, 'body': message.text_body.strip()} try: iris_client.post('response/email', json=data).raise_for_status() metrics.incr('message_relay_success_count') except requests.exceptions.RequestException: metrics.incr('message_relay_failure_count') logger.exception('Failed posting message %s (from %s) to iris-api', message.message_id, message.sender.email_address)
def handle_api_request(socket, address): metrics.incr('api_request_cnt') timeout = Timeout.start_new(rpc_timeout) try: req = msgpack_unpack_msg_from_socket(socket) if not req: logger.warning('Couldn\'t get msgpack data from %s', address) socket.close() return access_logger.info('%s %s', address, req['endpoint']) handler = api_request_handlers.get(req['endpoint']) if handler is not None: handler(socket, address, req) else: logger.info('-> %s unknown request', address) socket.sendall(msgpack.packb('UNKNOWN')) except Timeout: metrics.incr('api_request_timeout_cnt') logger.warning('-> %s timeout', address) socket.sendall(msgpack.packb('TIMEOUT')) finally: timeout.cancel() socket.close()
def poll(account, iris_client): try: metrics.set('total_inbox_count', account.inbox.total_count) metrics.set('unread_inbox_count', account.inbox.unread_count) except (exchangelib.errors.EWSError, requests.exceptions.RequestException): logger.exception('Failed to gather inbox counts from OWA API') metrics.incr('owa_api_failure_count') processed_messages = 0 messages_to_mark_read = [] try: for message in account.inbox.filter( is_read=False).order_by('-datetime_received'): processed_messages += 1 try: relay(message, iris_client) except Exception: logger.exception('Uncaught exception during message relaying') metrics.incr('message_relay_failure_count') # Mark it as read in bulk later. (This syntax isn't documented) message.is_read = True messages_to_mark_read.append((message, ('is_read', ))) except (exchangelib.errors.EWSError, requests.exceptions.RequestException): logger.exception('Failed to iterate through inbox') metrics.incr('owa_api_failure_count') if messages_to_mark_read: bulk_update_count = len(messages_to_mark_read) logger.info('will mark %s messages as read', bulk_update_count) try: account.bulk_update(items=messages_to_mark_read) except (exchangelib.errors.EWSError, requests.exceptions.RequestException): logger.exception( 'Failed to update read status on %s messages in bulk', bulk_update_count) metrics.incr('owa_api_failure_count') metrics.set('message_process_count', processed_messages) return processed_messages
def main(): global config config = load_config() is_master = config['sender'].get('is_master', False) logger.info('[-] bootstraping sender (master: %s)...', is_master) init_sender(config) init_plugins(config.get('plugins', {})) init_vendors(config.get('vendors', []), config.get('applications', [])) send_task = spawn(send) worker_tasks = [spawn(worker) for x in xrange(100)] if is_master: if should_mock_gwatch_renewer: spawn(mock_gwatch_renewer) else: spawn(gwatch_renewer) spawn(prune_old_audit_logs_worker) rpc.init(config['sender'], dict(send_message=send_message)) rpc.run(config['sender']) interval = 60 logger.info('[*] sender bootstrapped') while True: runtime = int(time.time()) logger.info('--> sender looop started.') cache.refresh() cache.purge() if is_master: try: escalate() deactivate() poll() aggregate(runtime) except Exception: metrics.incr('task_failure') logger.exception("Exception occured in main loop.") # check status for all background greenlets and respawn if necessary if not bool(send_task): logger.error("send task failed, %s", send_task.exception) metrics.incr('task_failure') send_task = spawn(send) bad_workers = [] for i, task in enumerate(worker_tasks): if not bool(task): logger.error("worker task failed, %s", task.exception) metrics.incr('task_failure') bad_workers.append(i) for i in bad_workers: worker_tasks[i] = spawn(worker) spawn(metrics.emit) now = time.time() elapsed_time = now - runtime nap_time = max(0, interval - elapsed_time) logger.info( '--> sender loop finished in %s seconds - sleeping %s seconds', elapsed_time, nap_time) sleep(nap_time)
def sync_ldap_lists(ldap_settings, engine): try: l = ldap.ldapobject.ReconnectLDAPObject( ldap_settings['connection']['url']) except Exception: logger.exception('Connecting to ldap to get our mailing lists failed.') return try: l.simple_bind_s(*ldap_settings['connection']['bind_args']) except Exception: logger.exception('binding to ldap to get our mailing lists failed.') return session = sessionmaker(bind=engine)() mailing_list_type_name = 'mailing-list' list_type_id = session.execute( 'SELECT `id` FROM `target_type` WHERE `name` = :name', { 'name': mailing_list_type_name }).scalar() if not list_type_id: try: list_type_id = session.execute( 'INSERT INTO `target_type` (`name`) VALUES (:name)', { 'name': mailing_list_type_name }).lastrowid session.commit() logger.info('Created target_type "%s" with id %s', mailing_list_type_name, list_type_id) except (IntegrityError, DataError): logger.exeption('Failed creating mailing-list type ID') return ldap_add_pause_interval = ldap_settings.get('user_add_pause_interval', None) ldap_add_pause_duration = ldap_settings.get('user_add_pause_duration', 1) ldap_lists = get_ldap_lists(l, ldap_settings['search_strings']) ldap_lists_count = len(ldap_lists) metrics.set('ldap_lists_found', ldap_lists_count) metrics.set('ldap_memberships_found', 0) logger.info('Found %s ldap lists', ldap_lists_count) existing_ldap_lists = { row[0] for row in session.execute( '''SELECT `name` FROM `target` WHERE `target`.`type_id` = :type_id''', {'type_id': list_type_id}) } kill_lists = existing_ldap_lists - {item[1] for item in ldap_lists} if kill_lists: metrics.incr('ldap_lists_removed', len(kill_lists)) for ldap_list in kill_lists: prune_target(engine, ldap_list, mailing_list_type_name) user_add_count = 0 for list_cn, list_name in ldap_lists: try: members = get_ldap_flat_membership(l, ldap_settings['search_strings'], list_cn, ldap_settings['max_depth'], 0, set()) except ldap.SERVER_DOWN: # reconnect and retry once metrics.incr('ldap_reconnects') logger.warning('LDAP server went away for list %s. Reconnecting', list_name) l.reconnect(ldap_settings['connection']['url']) members = get_ldap_flat_membership(l, ldap_settings['search_strings'], list_cn, ldap_settings['max_depth'], 0, set()) if not members: logger.info('Ignoring/pruning empty ldap list %s', list_name) continue num_members = len(members) metrics.incr('ldap_memberships_found', num_members) created = False list_id = session.execute( '''SELECT `mailing_list`.`target_id` FROM `mailing_list` JOIN `target` on `target`.`id` = `mailing_list`.`target_id` WHERE `target`.`name` = :name''', { 'name': list_name }).scalar() if not list_id: try: list_id = session.execute( '''INSERT INTO `target` (`type_id`, `name`) VALUES (:type_id, :name)''', { 'type_id': list_type_id, 'name': list_name }).lastrowid session.commit() except (IntegrityError, DataError): logger.exception( 'Failed adding row to target table for mailing list %s. Skipping this list.', list_name) metrics.incr('ldap_lists_failed_to_add') continue try: session.execute( '''INSERT INTO `mailing_list` (`target_id`, `count`) VALUES (:list_id, :count)''', { 'list_id': list_id, 'count': num_members }) session.commit() except (IntegrityError, DataError): logger.exception( 'Failed adding row to mailing_list table for mailing list %s (ID: %s). Skipping this list.', list_name, list_id) metrics.incr('ldap_lists_failed_to_add') continue logger.info('Created list %s with id %s', list_name, list_id) metrics.incr('ldap_lists_added') created = True if not created: session.execute( 'UPDATE `mailing_list` SET `count` = :count WHERE `target_id` = :list_id', { 'count': num_members, 'list_id': list_id }) session.commit() existing_members = { row[0] for row in session.execute( ''' SELECT `target_contact`.`destination` FROM `mailing_list_membership` JOIN `target_contact` ON `target_contact`.`target_id` = `mailing_list_membership`.`user_id` WHERE `mailing_list_membership`.`list_id` = :list_id AND `target_contact`.`mode_id` = (SELECT `id` FROM `mode` WHERE `name` = 'email') ''', {'list_id': list_id}) } add_members = members - existing_members kill_members = existing_members - members if add_members: metrics.incr('ldap_memberships_added', len(add_members)) for member in add_members: try: session.execute( '''INSERT IGNORE INTO `mailing_list_membership` (`list_id`, `user_id`) VALUES (:list_id, (SELECT `target_id` FROM `target_contact` JOIN `target` ON `target`.`id` = `target_id` WHERE `destination` = :name AND `mode_id` = (SELECT `id` FROM `mode` WHERE `name` = 'email') AND `target`.`type_id` = (SELECT `id` FROM `target_type` WHERE `name` = 'user'))) ''', { 'list_id': list_id, 'name': member }) logger.info('Added %s to list %s', member, list_name) except (IntegrityError, DataError): metrics.incr('ldap_memberships_failed_to_add') logger.warn('Failed adding %s to %s', member, list_name) user_add_count += 1 if (ldap_add_pause_interval is not None) and ( user_add_count % ldap_add_pause_interval) == 0: logger.info('Pausing for %s seconds every %s users.', ldap_add_pause_duration, ldap_add_pause_interval) time.sleep(ldap_add_pause_duration) if kill_members: metrics.incr('ldap_memberships_removed', len(kill_members)) batch_remove_ldap_memberships(session, list_id, kill_members) session.commit() session.close()
def sync_from_oncall(config, engine, purge_old_users=True): # users and teams present in our oncall database oncall_base_url = config.get('oncall-api') if not oncall_base_url: logger.error( 'Missing URL to oncall-api, which we use for user/team lookups. Bailing.' ) return oncall = oncallclient.OncallClient(config.get('oncall-app', ''), config.get('oncall-key', ''), oncall_base_url) oncall_users = fetch_users_from_oncall(oncall) if not oncall_users: logger.warning('No users found. Bailing.') return oncall_team_names = fetch_teams_from_oncall(oncall) if not oncall_team_names: logger.warning('We do not have a list of team names') oncall_team_names = set(oncall_team_names) session = sessionmaker(bind=engine)() # users present in iris' database iris_users = {} for row in engine.execute( '''SELECT `target`.`name` as `name`, `mode`.`name` as `mode`, `target_contact`.`destination` FROM `target` JOIN `user` on `target`.`id` = `user`.`target_id` LEFT OUTER JOIN `target_contact` ON `target`.`id` = `target_contact`.`target_id` LEFT OUTER JOIN `mode` ON `target_contact`.`mode_id` = `mode`.`id` WHERE `target`.`active` = TRUE ORDER BY `target`.`name`'''): contacts = iris_users.setdefault(row.name, {}) if row.mode is None or row.destination is None: continue contacts[row.mode] = row.destination iris_usernames = iris_users.viewkeys() # users from the oncall endpoints and config files metrics.set('users_found', len(oncall_users)) metrics.set('teams_found', len(oncall_team_names)) oncall_users.update(get_predefined_users(config)) oncall_usernames = oncall_users.viewkeys() # set of users not presently in iris users_to_insert = oncall_usernames - iris_usernames # set of existing iris users that are in the user oncall database users_to_update = iris_usernames & oncall_usernames users_to_mark_inactive = iris_usernames - oncall_usernames # get objects needed for insertion target_types = { name: id for name, id in session.execute( 'SELECT `name`, `id` FROM `target_type`') } # 'team' and 'user' modes = { name: id for name, id in session.execute('SELECT `name`, `id` FROM `mode`') } iris_team_names = { name for (name, ) in engine.execute( '''SELECT `name` FROM `target` WHERE `type_id` = %s''', target_types['team']) } target_add_sql = 'INSERT INTO `target` (`name`, `type_id`) VALUES (%s, %s) ON DUPLICATE KEY UPDATE `active` = TRUE' user_add_sql = 'INSERT IGNORE INTO `user` (`target_id`) VALUES (%s)' target_contact_add_sql = '''INSERT INTO `target_contact` (`target_id`, `mode_id`, `destination`) VALUES (%s, %s, %s) ON DUPLICATE KEY UPDATE `destination` = %s''' # insert users that need to be logger.info('Users to insert (%d)' % len(users_to_insert)) for username in users_to_insert: logger.info('Inserting %s' % username) try: target_id = engine.execute( target_add_sql, (username, target_types['user'])).lastrowid engine.execute(user_add_sql, (target_id, )) except SQLAlchemyError as e: metrics.incr('users_failed_to_add') metrics.incr('sql_errors') logger.exception('Failed to add user %s' % username) continue metrics.incr('users_added') for key, value in oncall_users[username].iteritems(): if value and key in modes: logger.info('%s: %s -> %s' % (username, key, value)) engine.execute(target_contact_add_sql, (target_id, modes[key], value, value)) # update users that need to be contact_update_sql = 'UPDATE target_contact SET destination = %s WHERE target_id = (SELECT id FROM target WHERE name = %s) AND mode_id = %s' contact_insert_sql = 'INSERT INTO target_contact (target_id, mode_id, destination) VALUES ((SELECT id FROM target WHERE name = %s), %s, %s)' contact_delete_sql = 'DELETE FROM target_contact WHERE target_id = (SELECT id FROM target WHERE name = %s) AND mode_id = %s' logger.info('Users to update (%d)' % len(users_to_update)) for username in users_to_update: try: db_contacts = iris_users[username] oncall_contacts = oncall_users[username] for mode in modes: if mode in oncall_contacts and oncall_contacts[mode]: if mode in db_contacts: if oncall_contacts[mode] != db_contacts[mode]: logger.info('%s: updating %s' % (username, mode)) metrics.incr('user_contacts_updated') engine.execute( contact_update_sql, (oncall_contacts[mode], username, modes[mode])) else: logger.info('%s: adding %s' % (username, mode)) metrics.incr('user_contacts_updated') engine.execute( contact_insert_sql, (username, modes[mode], oncall_contacts[mode])) elif mode in db_contacts: logger.info('%s: deleting %s' % (username, mode)) metrics.incr('user_contacts_updated') engine.execute(contact_delete_sql, (username, modes[mode])) else: logger.debug('%s: missing %s' % (username, mode)) except SQLAlchemyError as e: metrics.incr('users_failed_to_update') metrics.incr('sql_errors') logger.exception('Failed to update user %s' % username) continue # sync teams between iris and oncall teams_to_insert = oncall_team_names - iris_team_names teams_to_deactivate = iris_team_names - oncall_team_names logger.info('Teams to insert (%d)' % len(teams_to_insert)) for t in teams_to_insert: logger.info('Inserting %s' % t) try: target_id = engine.execute(target_add_sql, (t, target_types['team'])).lastrowid metrics.incr('teams_added') except SQLAlchemyError as e: logger.exception('Error inserting team %s: %s' % (t, e)) metrics.incr('teams_failed_to_add') continue session.commit() session.close() # mark users/teams inactive if purge_old_users: logger.info('Users to mark inactive (%d)' % len(users_to_mark_inactive)) for username in users_to_mark_inactive: prune_target(engine, username, 'user') for team in teams_to_deactivate: prune_target(engine, team, 'team')
def sync_from_oncall(config, engine, purge_old_users=True): # users and teams present in our oncall database oncall_base_url = config.get('oncall-api') if not oncall_base_url: logger.error( 'Missing URL to oncall-api, which we use for user/team lookups. Bailing.' ) return oncall = oncallclient.OncallClient(config.get('oncall-app', ''), config.get('oncall-key', ''), oncall_base_url) oncall_users = fetch_users_from_oncall(oncall) if not oncall_users: logger.warning('No users found. Bailing.') return # get teams from oncall-api and separate the list of tuples into two lists of name and ids oncall_teams_api_response = fetch_teams_from_oncall(oncall) if not oncall_teams_api_response: logger.warning('No teams found. Bailing.') return oncall_team_response = list(zip(*oncall_teams_api_response)) oncall_team_names = [name.lower() for name in oncall_team_response[0]] oncall_team_ids = oncall_team_response[1] oncall_response_dict_name_key = dict( zip(oncall_team_names, oncall_team_ids)) oncall_response_dict_id_key = dict(zip(oncall_team_ids, oncall_team_names)) oncall_case_sensitive_dict = { name.lower(): name for name in oncall_team_response[0] } if not oncall_team_names: logger.warning('We do not have a list of team names') oncall_team_names = set(oncall_team_names) oncall_team_ids = set(oncall_team_ids) session = sessionmaker(bind=engine)() # users present in iris' database iris_users = {} for row in engine.execute( '''SELECT `target`.`name` as `name`, `mode`.`name` as `mode`, `target_contact`.`destination` FROM `target` JOIN `user` on `target`.`id` = `user`.`target_id` LEFT OUTER JOIN `target_contact` ON `target`.`id` = `target_contact`.`target_id` LEFT OUTER JOIN `mode` ON `target_contact`.`mode_id` = `mode`.`id` WHERE `target`.`active` = TRUE ORDER BY `target`.`name`'''): contacts = iris_users.setdefault(row.name, {}) if row.mode is None or row.destination is None: continue contacts[row.mode] = row.destination iris_usernames = iris_users.keys() # users from the oncall endpoints and config files metrics.set('users_found', len(oncall_users)) metrics.set('teams_found', len(oncall_team_names)) oncall_users.update(get_predefined_users(config)) oncall_usernames = oncall_users.keys() # set of users not presently in iris users_to_insert = oncall_usernames - iris_usernames # set of existing iris users that are in the user oncall database users_to_update = iris_usernames & oncall_usernames users_to_mark_inactive = iris_usernames - oncall_usernames # get objects needed for insertion target_types = { name: target_id for name, target_id in session.execute( 'SELECT `name`, `id` FROM `target_type`') } # 'team' and 'user' modes = { name: mode_id for name, mode_id in session.execute('SELECT `name`, `id` FROM `mode`') } iris_team_names = { name.lower() for (name, ) in engine.execute( '''SELECT `name` FROM `target` WHERE `type_id` = %s''', target_types['team']) } target_add_sql = 'INSERT INTO `target` (`name`, `type_id`) VALUES (%s, %s) ON DUPLICATE KEY UPDATE `active` = TRUE' oncall_add_sql = 'INSERT INTO `oncall_team` (`target_id`, `oncall_team_id`) VALUES (%s, %s)' user_add_sql = 'INSERT IGNORE INTO `user` (`target_id`) VALUES (%s)' target_contact_add_sql = '''INSERT INTO `target_contact` (`target_id`, `mode_id`, `destination`) VALUES (%s, %s, %s) ON DUPLICATE KEY UPDATE `destination` = %s''' # insert users that need to be logger.info('Users to insert (%d)', len(users_to_insert)) for username in users_to_insert: sleep(update_sleep) logger.info('Inserting %s', username) try: target_id = engine.execute( target_add_sql, (username, target_types['user'])).lastrowid engine.execute(user_add_sql, (target_id, )) except SQLAlchemyError as e: metrics.incr('users_failed_to_add') metrics.incr('sql_errors') logger.exception('Failed to add user %s' % username) continue metrics.incr('users_added') for key, value in oncall_users[username].items(): if value and key in modes: logger.info('%s: %s -> %s', username, key, value) try: engine.execute(target_contact_add_sql, (target_id, modes[key], value, value)) except SQLAlchemyError as e: logger.exception('Error adding contact for target id: %s', target_id) metrics.incr('sql_errors') continue # update users that need to be contact_update_sql = 'UPDATE target_contact SET destination = %s WHERE target_id = (SELECT id FROM target WHERE name = %s AND type_id = %s) AND mode_id = %s' contact_insert_sql = 'INSERT INTO target_contact (target_id, mode_id, destination) VALUES ((SELECT id FROM target WHERE name = %s AND type_id = %s), %s, %s)' contact_delete_sql = 'DELETE FROM target_contact WHERE target_id = (SELECT id FROM target WHERE name = %s AND type_id = %s) AND mode_id = %s' logger.info('Users to update (%d)', len(users_to_update)) for username in users_to_update: sleep(update_sleep) try: db_contacts = iris_users[username] oncall_contacts = oncall_users[username] for mode in modes: if mode in oncall_contacts and oncall_contacts[mode]: if mode in db_contacts: if oncall_contacts[mode] != db_contacts[mode]: logger.info('%s: updating %s', username, mode) metrics.incr('user_contacts_updated') engine.execute(contact_update_sql, (oncall_contacts[mode], username, target_types['user'], modes[mode])) else: logger.info('%s: adding %s', username, mode) metrics.incr('user_contacts_updated') engine.execute(contact_insert_sql, (username, target_types['user'], modes[mode], oncall_contacts[mode])) elif mode in db_contacts: logger.info('%s: deleting %s', username, mode) metrics.incr('user_contacts_updated') engine.execute( contact_delete_sql, (username, target_types['user'], modes[mode])) else: logger.debug('%s: missing %s', username, mode) except SQLAlchemyError as e: metrics.incr('users_failed_to_update') metrics.incr('sql_errors') logger.exception('Failed to update user %s', username) continue # sync teams between iris and oncall # iris_db_oncall_team_ids (team_ids in the oncall_team table) # oncall_team_ids (team_ids from oncall api call) # oncall_team_names (names from oncall api call) # oncall_response_dict_name_key (key value pairs of oncall team names and ids from api call) # oncall_response_dict_id_key same as above but key value inverted # oncall_case_sensitive_dict maps the case insensitive oncall name to the original capitalization # iris_team_names (names from target table) # iris_target_name_id_dict dictionary of target name -> target_id mappings # iris_db_oncall_team_id_name_dict dictionary of oncall team_id -> oncall name mappings # get all incoming names that match a target check if that target has an entry in oncall table if not make one iris_target_name_id_dict = { name.lower(): target_id for name, target_id in engine.execute( '''SELECT `name`, `id` FROM `target` WHERE `type_id` = %s''', target_types['team']) } matching_target_names = iris_team_names.intersection(oncall_team_names) if matching_target_names: existing_up_to_date_oncall_teams = { name.lower() for (name, ) in session.execute( '''SELECT `target`.`name` FROM `target` JOIN `oncall_team` ON `oncall_team`.`target_id` = `target`.`id` WHERE `target`.`name` IN :matching_names''', {'matching_names': tuple(matching_target_names)}) } # up to date target names that don't have an entry in the oncall_team table yet matching_target_names_no_oncall_entry = matching_target_names - existing_up_to_date_oncall_teams for t in matching_target_names_no_oncall_entry: logger.info('Inserting existing team into oncall_team %s', t) sleep(update_sleep) try: engine.execute( '''UPDATE `target` SET `active` = TRUE WHERE `id` = %s''', iris_target_name_id_dict[t]) engine.execute(oncall_add_sql, (iris_target_name_id_dict[t], oncall_response_dict_name_key[t])) except SQLAlchemyError as e: logger.exception('Error inserting oncall_team %s: %s', t, e) metrics.incr('sql_errors') continue # rename all mismatching target names iris_db_oncall_team_id_name_dict = { team_id: name.lower() for name, team_id in engine.execute( '''SELECT target.name, oncall_team.oncall_team_id FROM `target` JOIN `oncall_team` ON oncall_team.target_id = target.id''' ) } iris_db_oncall_team_ids = { oncall_team_id for (oncall_team_id, ) in engine.execute( '''SELECT `oncall_team_id` FROM `oncall_team`''') } matching_oncall_ids = oncall_team_ids.intersection(iris_db_oncall_team_ids) name_swaps = {} # find teams in the iris database whose names have changed for oncall_id in matching_oncall_ids: current_name = iris_db_oncall_team_id_name_dict[oncall_id] new_name = oncall_response_dict_id_key[oncall_id] try: if current_name != new_name: # handle edge case of teams swapping names if not iris_target_name_id_dict.get(new_name, None): target_id_to_rename = iris_target_name_id_dict[ current_name] logger.info('Renaming team %s to %s', current_name, new_name) engine.execute( '''UPDATE `target` SET `name` = %s, `active` = TRUE WHERE `id` = %s''', (oncall_case_sensitive_dict[new_name], target_id_to_rename)) else: # there is a team swap so rename to a random name to prevent a violation of unique target name constraint new_name = str(uuid.uuid4()) target_id_to_rename = iris_target_name_id_dict[ current_name] name_swaps[oncall_id] = target_id_to_rename logger.info('Renaming team %s to %s', current_name, new_name) engine.execute( '''UPDATE `target` SET `name` = %s, `active` = TRUE WHERE `id` = %s''', (new_name, target_id_to_rename)) sleep(update_sleep) except SQLAlchemyError as e: logger.exception('Error changing team name of %s to %s', current_name, new_name) metrics.incr('sql_errors') # go back and rename name_swaps to correct value for oncall_id, target_id_to_rename in name_swaps.items(): new_name = oncall_response_dict_id_key[oncall_id] try: engine.execute( '''UPDATE `target` SET `name` = %s, `active` = TRUE WHERE `id` = %s''', (oncall_case_sensitive_dict[new_name], target_id_to_rename)) except SQLAlchemyError as e: logger.exception('Error renaming target: %s', new_name) metrics.incr('sql_errors') continue sleep(update_sleep) # create new entries for new teams # if the team_id doesn't exist in oncall_team at this point then it is a new team. new_team_ids = oncall_team_ids - iris_db_oncall_team_ids logger.info('Teams to insert (%d)' % len(new_team_ids)) for team_id in new_team_ids: t = oncall_case_sensitive_dict[oncall_response_dict_id_key[team_id]] new_target_id = None # add team to target table logger.info('Inserting team %s', t) sleep(update_sleep) try: new_target_id = engine.execute(target_add_sql, (t, target_types['team'])).lastrowid metrics.incr('teams_added') except SQLAlchemyError as e: logger.exception('Error inserting team %s: %s', t, e) metrics.incr('teams_failed_to_add') metrics.incr('sql_errors') continue # add team to oncall_team table if new_target_id: logger.info('Inserting new team into oncall_team %s', t) try: engine.execute(oncall_add_sql, (new_target_id, team_id)) except SQLAlchemyError as e: logger.exception('Error inserting oncall_team %s: %s', t, e) metrics.incr('sql_errors') continue session.commit() session.close() # mark users/teams inactive if purge_old_users: # find active teams that don't exist in oncall anymore updated_iris_team_names = { name.lower() for (name, ) in engine.execute( '''SELECT `name` FROM `target` WHERE `type_id` = %s AND `active` = TRUE''', target_types['team']) } teams_to_deactivate = updated_iris_team_names - oncall_team_names logger.info('Users to mark inactive (%d)' % len(users_to_mark_inactive)) logger.info('Teams to mark inactive (%d)' % len(teams_to_deactivate)) for username in users_to_mark_inactive: prune_target(engine, username, 'user') sleep(update_sleep) for team in teams_to_deactivate: prune_target(engine, team, 'team') sleep(update_sleep)
def process_retention(engine, max_days, batch_size, cooldown_time, archive_path): time_start = time.time() connection = engine.raw_connection() cursor = connection.cursor(engine.dialect.dbapi.cursors.SSCursor) deleted_incidents = 0 deleted_messages = 0 deleted_comments = 0 # First, archive/kill incidents and their messages while True: # Get incidents to archive and kill, in batches try: cursor.execute( ''' SELECT %s FROM `incident` LEFT JOIN `plan` on `plan`.`id` = `incident`.`plan_id` LEFT JOIN `application` on `application`.`id` = `incident`.`application_id` LEFT JOIN `target` ON `incident`.`owner_id` = `target`.`id` WHERE `incident`.`created` < (CURDATE() - INTERVAL %%s DAY) LIMIT %%s ''' % (', '.join(field[0] for field in incident_fields)), [max_days, batch_size]) except Exception: logger.exception('Failed getting incidents') try: cursor.close() except Exception: pass cursor = connection.cursor(engine.dialect.dbapi.cursors.SSCursor) break incident_ids = deque() for incident in cursor: archive_incident(incident, archive_path) incident_ids.append(incident[0]) if not incident_ids: break logger.info('Archived %d incidents', len(incident_ids)) # Then, Archive+Kill all comments in these incidents while True: try: cursor.execute( ''' SELECT %s FROM `comment` LEFT JOIN `target` ON `comment`.`user_id` = `target`.`id` WHERE `comment`.`incident_id` in %%s LIMIT %%s ''' % (', '.join(field[0] for field in comment_fields)), [tuple(incident_ids), batch_size]) except Exception: metrics.incr('sql_errors') logger.exception('Failed getting comments') try: cursor.close() except Exception: pass cursor = connection.cursor( engine.dialect.dbapi.cursors.SSCursor) break comment_ids = deque() for comment in cursor: archive_comment(comment, archive_path) comment_ids.append(comment[0]) if not comment_ids: break logger.info('Archived %d comments', len(comment_ids)) try: deleted_rows = cursor.execute( 'DELETE FROM `comment` WHERE `id` IN %s', [tuple(comment_ids)]) connection.commit() except Exception: metrics.incr('sql_errors') logger.exception('Failed deleting comments from incidents') try: cursor.close() except Exception: pass cursor = connection.cursor( engine.dialect.dbapi.cursors.SSCursor) break else: if deleted_rows: logger.info('Killed %d comments from %d incidents', deleted_rows, len(incident_ids)) deleted_comments += deleted_rows sleep(cooldown_time) else: break # Kill all dynamic plan maps associated with these incidents while True: try: deleted_rows = cursor.execute( 'DELETE FROM `dynamic_plan_map` WHERE `incident_id` IN %s', [tuple(incident_ids)]) connection.commit() except Exception: metrics.incr('sql_errors') logger.exception('Failed deleting dynamic plan maps') try: cursor.close() except Exception: pass cursor = connection.cursor( engine.dialect.dbapi.cursors.SSCursor) break else: if deleted_rows: logger.info('Killed %d dynamic plan maps', deleted_rows) deleted_messages += deleted_rows sleep(cooldown_time) else: break # Archive+Kill all messages in these incidents while True: try: cursor.execute( ''' SELECT %s FROM `message` JOIN `priority` on `priority`.`id` = `message`.`priority_id` LEFT JOIN `mode` on `mode`.`id` = `message`.`mode_id` LEFT JOIN `template` ON `message`.`template_id` = `template`.`id` LEFT JOIN `target` ON `message`.`target_id` = `target`.`id` WHERE `message`.`incident_id` in %%s LIMIT %%s ''' % (', '.join(field[0] for field in message_fields)), [tuple(incident_ids), batch_size]) except Exception: metrics.incr('sql_errors') logger.exception('Failed getting messages') try: cursor.close() except Exception: pass cursor = connection.cursor( engine.dialect.dbapi.cursors.SSCursor) break message_ids = deque() for message in cursor: archive_message(message, archive_path) message_ids.append(message[0]) if not message_ids: break logger.info('Archived %d messages', len(message_ids)) # explicitly delete all the extra message data try: cursor.execute( 'DELETE FROM `message_changelog` WHERE `message_id` IN %s', [tuple(message_ids)]) cursor.execute( 'DELETE FROM `response` WHERE `message_id` IN %s', [tuple(message_ids)]) cursor.execute( 'DELETE FROM `twilio_delivery_status` WHERE `message_id` IN %s', [tuple(message_ids)]) cursor.execute( 'DELETE FROM `twilio_retry` WHERE `message_id` IN %s', [tuple(message_ids)]) cursor.execute( 'DELETE FROM `generic_message_sent_status` WHERE `message_id` IN %s', [tuple(message_ids)]) connection.commit() except Exception: metrics.incr('sql_errors') logger.exception('Failed deleting message child') try: cursor.close() except Exception: pass cursor = connection.cursor( engine.dialect.dbapi.cursors.SSCursor) try: deleted_rows = cursor.execute( 'DELETE FROM `message` WHERE `id` IN %s', [tuple(message_ids)]) connection.commit() except Exception: metrics.incr('sql_errors') logger.exception('Failed deleting messages from incidents') try: cursor.close() except Exception: pass cursor = connection.cursor( engine.dialect.dbapi.cursors.SSCursor) # try deleting individually to directly identify any issues and prevent single error from stopping cleanup deleted_rows = 0 for msg_id in message_ids: try: deleted_rows += cursor.execute( 'DELETE FROM `message` WHERE `id`=%s', msg_id) connection.commit() except Exception: metrics.incr('sql_errors') logger.exception('Failed deleting message id: %s', msg_id) else: if deleted_rows: logger.info('Killed %d messages from %d incidents', deleted_rows, len(incident_ids)) deleted_messages += deleted_rows sleep(cooldown_time) else: break # Finally kill incidents try: deleted_rows = cursor.execute( 'DELETE FROM `incident` WHERE `id` IN %s', [tuple(incident_ids)]) connection.commit() except Exception: metrics.incr('sql_errors') logger.exception('Failed deleting incidents') try: cursor.close() except Exception: pass cursor = connection.cursor(engine.dialect.dbapi.cursors.SSCursor) # try deleting individually to directly identify any issues and prevent single error from stopping clean-up deleted_rows = 0 for inc_id in incident_ids: try: deleted_rows += cursor.execute( 'DELETE FROM `incident` WHERE `id`=%s', inc_id) connection.commit() except Exception: metrics.incr('sql_errors') logger.exception('Failed deleting incident id: %s', inc_id) logger.info('Deleted %s incidents', deleted_rows) deleted_incidents += deleted_rows sleep(cooldown_time) # Next, kill messages not tied to incidents, like quota notifs or incident tracking emails while True: try: deleted_rows = cursor.execute( 'DELETE FROM `message` WHERE `created` < (CURDATE() - INTERVAL %s DAY) AND `incident_id` IS NULL LIMIT %s', [max_days, batch_size]) connection.commit() except Exception: metrics.incr('sql_errors') logger.exception('Failed deleting messages') try: cursor.close() except Exception: pass break else: if deleted_rows: logger.info('Killed %d misc messages', deleted_rows) deleted_messages += deleted_rows sleep(cooldown_time) else: break try: cursor.close() except Exception: pass connection.close() logger.info( 'Run took %.2f seconds and deleted %d incidents and %d messages', time.time() - time_start, deleted_incidents, deleted_messages) metrics.set('deleted_messages', deleted_messages) metrics.set('deleted_incidents', deleted_incidents) metrics.set('deleted_comments', deleted_comments)
def handle_api_notification_request(socket, address, req): notification = req['data'] if 'application' not in notification: reject_api_request(socket, address, 'INVALID application') logger.warn('Dropping OOB message due to missing application key') return notification['subject'] = '[%s] %s' % (notification['application'], notification.get('subject', '')) role = notification.get('role') if not role: reject_api_request(socket, address, 'INVALID role') logger.warn('Dropping OOB message with invalid role "%s" from app %s', role, notification['application']) return target = notification.get('target') if not target: reject_api_request(socket, address, 'INVALID target') logger.warn( 'Dropping OOB message with invalid target "%s" from app %s', target, notification['application']) return expanded_targets = None # if role is literal_target skip unrolling if not notification.get('unexpanded'): try: expanded_targets = cache.targets_for_role(role, target) except IrisRoleLookupException: expanded_targets = None if not expanded_targets: reject_api_request(socket, address, 'INVALID role:target') logger.warn( 'Dropping OOB message with invalid role:target "%s:%s" from app %s', role, target, notification['application']) return sanitize_unicode_dict(notification) # If we're rendering this using templates+context instead of body, fill in the # needed iris key. if 'template' in notification: if 'context' not in notification: logger.warn( 'Dropping OOB message due to missing context from app %s', notification['application']) reject_api_request(socket, address, 'INVALID context') return else: # fill in dummy iris meta data notification['context']['iris'] = {} elif 'email_html' in notification: if not isinstance(notification['email_html'], basestring): logger.warn( 'Dropping OOB message with invalid email_html from app %s: %s', notification['application'], notification['email_html']) reject_api_request(socket, address, 'INVALID email_html') return elif 'body' not in notification: reject_api_request(socket, address, 'INVALID body') logger.warn('Dropping OOB message with invalid body from app %s', notification['application']) return access_logger.info( '-> %s OK, to %s:%s (%s:%s)', address, role, target, notification['application'], notification.get('priority', notification.get('mode', '?'))) if notification.get('unexpanded'): notification['destination'] = notification['target'] send_funcs['message_send_enqueue'](notification) else: for _target in expanded_targets: temp_notification = notification.copy() temp_notification['target'] = _target send_funcs['message_send_enqueue'](temp_notification) metrics.incr('notification_cnt') socket.sendall(msgpack.packb('OK'))
def handle_api_notification_request(socket, address, req): notification = req['data'] if 'application' not in notification: reject_api_request(socket, address, 'INVALID application') logger.warning('Dropping OOB message due to missing application key') return notification['subject'] = '[%s] %s' % (notification['application'], notification.get('subject', '')) target_list = notification.get('target_list') role = notification.get('role') if not role and not target_list: reject_api_request(socket, address, 'INVALID role') logger.warning( 'Dropping OOB message with invalid role "%s" from app %s', role, notification['application']) return target = notification.get('target') if not (target or target_list): reject_api_request(socket, address, 'INVALID target') logger.warning( 'Dropping OOB message with invalid target "%s" from app %s', target, notification['application']) return expanded_targets = None # if role is literal_target skip unrolling if not notification.get('unexpanded'): # For multi-recipient notifications, pre-populate destination with literal targets, # then expand the remaining has_literal_target = False if target_list: expanded_targets = [] notification['destination'] = [] notification['bcc_destination'] = [] for t in target_list: role = t['role'] target = t['target'] bcc = t.get('bcc') try: if role == 'literal_target': if bcc: notification['bcc_destination'].append(target) else: notification['destination'].append(target) has_literal_target = True else: expanded = cache.targets_for_role(role, target) expanded_targets += [{ 'target': e, 'bcc': bcc } for e in expanded] except IrisRoleLookupException: # Maintain best-effort delivery for remaining targets if one fails to resolve continue else: try: expanded_targets = cache.targets_for_role(role, target) except IrisRoleLookupException: expanded_targets = None if not expanded_targets and not has_literal_target: reject_api_request(socket, address, 'INVALID role:target') logger.warning( 'Dropping OOB message with invalid role:target "%s:%s" from app %s', role, target, notification['application']) return sanitize_unicode_dict(notification) # If we're rendering this using templates+context instead of body, fill in the # needed iris key. if 'template' in notification: if 'context' not in notification: logger.warning( 'Dropping OOB message due to missing context from app %s', notification['application']) reject_api_request(socket, address, 'INVALID context') return else: # fill in dummy iris meta data notification['context']['iris'] = {} elif 'email_html' in notification: if not isinstance(notification['email_html'], str): logger.warning( 'Dropping OOB message with invalid email_html from app %s: %s', notification['application'], notification['email_html']) reject_api_request(socket, address, 'INVALID email_html') return elif 'body' not in notification: reject_api_request(socket, address, 'INVALID body') logger.warning('Dropping OOB message with invalid body from app %s', notification['application']) return access_logger.info( '-> %s OK, to %s:%s (%s:%s)', address, role, target, notification['application'], notification.get('priority', notification.get('mode', '?'))) notification_count = 1 if notification.get('unexpanded'): notification['destination'] = notification['target'] send_funcs['message_send_enqueue'](notification) elif notification.get('multi-recipient'): notification['target'] = expanded_targets send_funcs['message_send_enqueue'](notification) notification_count = len(expanded_targets) else: for _target in expanded_targets: temp_notification = notification.copy() temp_notification['target'] = _target send_funcs['message_send_enqueue'](temp_notification) metrics.incr('notification_cnt', inc=notification_count) socket.sendall(msgpack.packb('OK'))
def process_retention(engine, max_days, batch_size, cooldown_time, archive_path): time_start = time.time() connection = engine.raw_connection() cursor = connection.cursor(engine.dialect.dbapi.cursors.SSCursor) deleted_incidents = 0 deleted_messages = 0 # First, archive/kill incidents and their messages while True: # Get incidents to archive and kill, in batches try: cursor.execute( ''' SELECT %s FROM `incident` LEFT JOIN `plan` on `plan`.`id` = `incident`.`plan_id` LEFT JOIN `application` on `application`.`id` = `incident`.`application_id` LEFT JOIN `target` ON `incident`.`owner_id` = `target`.`id` WHERE `incident`.`created` < (CURDATE() - INTERVAL %%s DAY) LIMIT %%s ''' % (', '.join(field[0] for field in incident_fields)), [max_days, batch_size]) except Exception: logger.exception('Failed getting incidents') try: cursor.close() except Exception: pass cursor = connection.cursor(engine.dialect.dbapi.cursors.SSCursor) break incident_ids = deque() for incident in cursor: archive_incident(incident, archive_path) incident_ids.append(incident[0]) if not incident_ids: break logger.info('Archived %d incidents', len(incident_ids)) # Archive+Kill all messages in these incidents while True: try: cursor.execute( ''' SELECT %s FROM `message` JOIN `priority` on `priority`.`id` = `message`.`priority_id` LEFT JOIN `mode` on `mode`.`id` = `message`.`mode_id` LEFT JOIN `template` ON `message`.`template_id` = `template`.`id` LEFT JOIN `target` ON `message`.`target_id` = `target`.`id` WHERE `message`.`incident_id` in %%s LIMIT %%s ''' % (', '.join(field[0] for field in message_fields)), [tuple(incident_ids), batch_size]) except Exception: metrics.incr('sql_errors') logger.exception('Failed getting messages') try: cursor.close() except Exception: pass cursor = connection.cursor(engine.dialect.dbapi.cursors.SSCursor) break message_ids = deque() for message in cursor: archive_message(message, archive_path) message_ids.append(message[0]) if not message_ids: break logger.info('Archived %d messages', len(message_ids)) try: deleted_rows = cursor.execute('DELETE FROM `message` WHERE `id` IN %s', [tuple(message_ids)]) connection.commit() except Exception: metrics.incr('sql_errors') logger.exception('Failed deleting messages from incidents') try: cursor.close() except Exception: pass cursor = connection.cursor(engine.dialect.dbapi.cursors.SSCursor) break else: if deleted_rows: logger.info('Killed %d messages from %d incidents', deleted_rows, len(incident_ids)) deleted_messages += deleted_rows sleep(cooldown_time) else: break # Finally kill incidents try: deleted_rows = cursor.execute('DELETE FROM `incident` WHERE `id` IN %s', [tuple(incident_ids)]) connection.commit() except Exception: metrics.incr('sql_errors') logger.exception('Failed deleting incidents') try: cursor.close() except Exception: pass cursor = connection.cursor(engine.dialect.dbapi.cursors.SSCursor) break else: logger.info('Deleted %s incidents', deleted_rows) deleted_incidents += deleted_rows sleep(cooldown_time) # Next, kill messages not tied to incidents, like quota notifs or incident tracking emails while True: try: deleted_rows = cursor.execute('DELETE FROM `message` WHERE `created` < (CURDATE() - INTERVAL %s DAY) AND `incident_id` IS NULL LIMIT %s', [max_days, batch_size]) connection.commit() except Exception: metrics.incr('sql_errors') logger.exception('Failed deleting messages') try: cursor.close() except Exception: pass break else: if deleted_rows: logger.info('Killed %d misc messages', deleted_rows) deleted_messages += deleted_rows sleep(cooldown_time) else: break try: cursor.close() except Exception: pass connection.close() logger.info('Run took %.2f seconds and deleted %d incidents and %d messages', time.time() - time_start, deleted_incidents, deleted_messages) metrics.set('deleted_messages', deleted_messages) metrics.set('deleted_incidents', deleted_incidents)
def relay(message, iris_client): if message.headers is None: logger.info('Ignoring message with no headers %s (from %s)', message.message_id, message.sender.email_address) metrics.incr('message_ignore_count') return # Get headers into the format the iris expects from gmail headers = [{ 'name': header.name, 'value': header.value } for header in message.headers] # If this is a bulk message or an auto reply or something else, don't bother sending it to iris-api if is_pointless_message(headers): logger.info('Not relaying pointless message %s (from %s) to iris-api', message.message_id, message.sender.email_address) metrics.incr('message_ignore_count') return # To and From headers are strangely missing if message.to_recipients: headers.append({ 'name': 'To', 'value': message.to_recipients[0].email_address }) headers.append({'name': 'From', 'value': message.sender.email_address}) data = {'headers': headers, 'body': message.text_body.strip()} try: req = iris_client.post('response/email', json=data) except requests.exceptions.RequestException: metrics.incr('message_relay_failure_count') logger.exception('Failed posting message %s (from %s) to iris-api', message.message_id, message.sender.email_address) return code_type = req.status_code // 100 if code_type == 5: metrics.incr('message_relay_failure_count') logger.error( 'Failed posting message %s (from %s) to iris-api. Got status code %s and response %s', message.message_id, message.sender.email_address, req.status_code, req.text) elif code_type == 4: metrics.incr('malformed_message_count') logger.error( 'Failed posting message %s (from %s) to iris-api. Message likely malformed. Status code: %s. Response: %s', message.message_id, message.sender.email_address, req.status_code, req.text) elif code_type == 2: metrics.incr('message_relay_success_count') # If we create an incident using an email, this header will be set and will be the numeric ID # of the created incident; otherwise, the header will not exist or it will be a textual # error message. incident_header = req.headers.get('X-IRIS-INCIDENT') if isinstance(incident_header, basestring) and incident_header.isdigit(): metrics.incr('incident_created_count') else: logger.error( 'Failed posting message %s (from %s) to iris-api. Message likely malformed. Got back strange status code: %s. Response: %s', message.message_id, message.sender.email_address, req.status_code, req.text)
def fetch_and_send_message(): message = send_queue.get() has_contact = set_target_contact(message) if not has_contact: mark_message_has_no_contact(message) return if 'message_id' not in message: message['message_id'] = None drop_mode_id = api_cache.modes.get('drop') # If this app breaches hard quota, drop message on floor, and update in UI if it has an ID if not quota.allow_send(message): logger.warn( 'Hard message quota exceeded; Dropping this message on floor: %s', message) if message['message_id']: spawn(auditlog.message_change, message['message_id'], auditlog.MODE_CHANGE, message.get('mode', '?'), 'drop', 'Dropping due to hard quota violation.') # If we know the ID for the mode drop, reflect that for the message if drop_mode_id: message['mode'] = 'drop' message['mode_id'] = drop_mode_id else: logger.error( 'Can\'t mark message %s as dropped as we don\'t know the mode ID for %s', message, 'drop') # Render, so we're able to populate the message table with the proper subject/etc as well as # information that it was dropped. render(message) mark_message_as_sent(message) return # If we're set to drop this message, no-op this before message gets sent to a vendor if message.get('mode') == 'drop': if message['message_id']: render(message) mark_message_as_sent(message) add_mode_stat('drop', 0) metrics_key = 'app_%(application)s_mode_drop_cnt' % message metrics.add_new_metrics({metrics_key: 0}) metrics.incr(metrics_key) return render(message) # Drop this message, and mark it as dropped, rather than sending it, if its body is too long and we were normally # going to send it anyway. body_length = len(message['body']) if body_length > MAX_MESSAGE_BODY_LENGTH: logger.warn( 'Message id %s has a ridiculously long body (%s chars). Dropping it.', message['message_id'], body_length) spawn( auditlog.message_change, message['message_id'], auditlog.MODE_CHANGE, message.get('mode', '?'), 'drop', 'Dropping due to excessive body length (%s > %s chars)' % (body_length, MAX_MESSAGE_BODY_LENGTH)) metrics.incr('msg_drop_length_cnt') # Truncate this here to avoid a duplicate log message in mark_message_as_sent(), as we still need to call # that to update the body/subject message['body'] = message['body'][:MAX_MESSAGE_BODY_LENGTH] if drop_mode_id: message['mode'] = 'drop' message['mode_id'] = drop_mode_id mark_message_as_sent(message) return success = None try: success = distributed_send_message(message) except Exception: logger.exception('Failed to send message: %s', message) if message['mode'] == 'email': logger.error( 'unable to send %(mode)s %(message_id)s %(application)s %(destination)s %(subject)s %(body)s', message) metrics.incr('task_failure') else: logger.error( 'reclassifying as email %(mode)s %(message_id)s %(application)s %(destination)s %(subject)s %(body)s', message) old_mode = message['mode'] if (set_target_fallback_mode(message)): update_message_mode(message) auditlog.message_change( message['message_id'], auditlog.MODE_CHANGE, old_mode, message['mode'], 'Changing mode due to original mode failure') render(message) try: success = distributed_send_message(message) # nope - log and bail except Exception: metrics.incr('task_failure') logger.error( 'unable to send %(mode)s %(message_id)s %(application)s %(destination)s %(subject)s %(body)s', message) if success: metrics.incr('message_send_cnt') if message['message_id']: mark_message_as_sent(message) if message['message_id']: update_message_sent_status(message, success)
def main(): global config config = load_config() start_time = time.time() logger.info('[-] bootstraping sender...') init_sender(config) init_plugins(config.get('plugins', {})) init_vendors(config.get('vendors', []), config.get('applications', [])) send_task = spawn(send) worker_tasks = [spawn(worker) for x in xrange(100)] rpc.init(config['sender'], dict(send_message=send_message)) rpc.run(config['sender']) spawn(coordinator.update_forever) gwatch_renewer_task = None prune_audit_logs_task = None interval = 60 logger.info('[*] sender bootstrapped') while True: runtime = int(time.time()) logger.info('--> sender looop started.') cache.refresh() cache.purge() # If we're currently a master, ensure our master-greenlets are running # and we're doing the master duties if coordinator.am_i_master(): if not bool(gwatch_renewer_task): if should_mock_gwatch_renewer: gwatch_renewer_task = spawn(mock_gwatch_renewer) else: gwatch_renewer_task = spawn(gwatch_renewer) if not bool(prune_audit_logs_task): prune_audit_logs_task = spawn(prune_old_audit_logs_worker) try: escalate() deactivate() poll() aggregate(runtime) except Exception: metrics.incr('task_failure') logger.exception("Exception occured in main loop.") # If we're not master, don't do the master tasks and make sure those other # greenlets are stopped if they're running else: logger.info('I am not the master so I am not doing master sender tasks.') # Stop these task greenlets if they're running. Technically this should # never happen because if we're the master, we'll likely only stop being the # master if our process exits, which would kill these greenlets anyway. if bool(gwatch_renewer_task): logger.info('I am not master anymore so stopping the gwatch renewer') gwatch_renewer_task.kill() if bool(prune_audit_logs_task): logger.info('I am not master anymore so stopping the audit logs worker') prune_audit_logs_task.kill() # check status for all background greenlets and respawn if necessary if not bool(send_task): logger.error("send task failed, %s", send_task.exception) metrics.incr('task_failure') send_task = spawn(send) bad_workers = [] for i, task in enumerate(worker_tasks): if not bool(task): logger.error("worker task failed, %s", task.exception) metrics.incr('task_failure') bad_workers.append(i) for i in bad_workers: worker_tasks[i] = spawn(worker) now = time.time() metrics.set('sender_uptime', int(now - start_time)) spawn(metrics.emit) elapsed_time = now - runtime nap_time = max(0, interval - elapsed_time) logger.info('--> sender loop finished in %s seconds - sleeping %s seconds', elapsed_time, nap_time) sleep(nap_time)
def create_messages(incident_id, plan_notification_id): application_id = cache.incidents[incident_id]['application_id'] plan_notification = cache.plan_notifications[plan_notification_id] role = cache.roles[plan_notification['role_id']]['name'] target = cache.targets[plan_notification['target_id']]['name'] # find role/priority from plan_notification_id names = cache.targets_for_role(role, target) priority_id = plan_notification['priority_id'] changed_target = False body = '' if not names: metrics.incr('role_target_lookup_error') # Try to get creator of the plan and nag them instead name = None try: name = cache.plans[plan_notification['plan_id']]['creator'] except (KeyError, TypeError): pass if not name: logger.error(( 'Failed to find targets for incident %s, plan_notification_id: %s, role: %s, target: %s, result: %s and failed looking ' 'up the plan\'s creator'), incident_id, plan_notification_id, role, target, names) return False try: priority_id = api_cache.priorities['low']['id'] except KeyError: logger.error(( 'Failed to find targets for incident %s, plan_notification_id: %s, role: %s, target: %s, result: %s and failed looking ' 'up ID for low priority'), incident_id, plan_notification_id, role, target, names) return False logger.error(( 'Failed to find targets for incident %s, plan_notification_id: %s, role: %s, target: %s, result: %s. ' 'Reaching out to %s instead and lowering priority to low (%s)'), incident_id, plan_notification_id, role, target, names, name, priority_id) body = 'You are receiving this as you created this plan and we can\'t resolve %s of %s at this time.\n\n' % ( role, target) names = [name] changed_target = True connection = db.engine.raw_connection() cursor = connection.cursor() for name in names: t = cache.target_names[name] if t: target_id = t['id'] cursor.execute( INSERT_MESSAGE_SQL, (plan_notification['plan_id'], plan_notification_id, incident_id, application_id, target_id, priority_id, body)) if changed_target: connection.commit( ) # needed for the lastrowid to exist in the DB to satsify the constraint auditlog.message_change( cursor.lastrowid, auditlog.TARGET_CHANGE, role + '|' + target, name, 'Changing target as we failed resolving original target') else: metrics.incr('target_not_found') logger.error('No target found: %s', name) connection.commit() cursor.close() connection.close() return True