Example #1
def main():
    config = load_config()
    metrics.init(config, 'iris-sync-targets', stats_reset)

    default_nap_time = 3600

    try:
        nap_time = int(config.get('sync_script_nap_time', default_nap_time))
    except ValueError:
        nap_time = default_nap_time

    engine = create_engine(
        config['db']['conn']['str'] % config['db']['conn']['kwargs'],
        **config['db']['kwargs'])

    # Initialize these to zero at the start of the app, and don't reset them at every
    # metrics interval
    metrics.set('users_found', 0)
    metrics.set('teams_found', 0)

    metrics_task = spawn(metrics.emit_forever)

    while True:
        if not bool(metrics_task):
            logger.error('metrics task failed, %s', metrics_task.exception)
            metrics_task = spawn(metrics.emit_forever)

        sync(config, engine)
        logger.info('Sleeping for %d seconds' % nap_time)
        sleep(nap_time)
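
These examples lean on iris's metrics helper (metrics.init, metrics.set, metrics.incr, metrics.emit, metrics.emit_forever) plus gevent-style spawn/sleep. A minimal in-memory stand-in along the following lines (purely an assumption for exercising the snippets in isolation, not iris's actual implementation) is enough to run them:

import time

class _StubMetrics(object):
    """Hypothetical stand-in for the iris metrics module used in these examples."""

    def __init__(self):
        self.app_name = None
        self.values = {}

    def init(self, config, app_name, default_metrics):
        # Remember the emitting application and seed default gauges/counters
        # (assumed here to be a dict of name -> starting value).
        self.app_name = app_name
        self.values.update(default_metrics or {})

    def set(self, key, value):
        self.values[key] = value

    def incr(self, key, inc=1):
        self.values[key] = self.values.get(key, 0) + inc

    def emit(self):
        # The real sender ships these to a stats backend; printing is enough here.
        print(self.app_name, self.values)

    def emit_forever(self, interval=60):
        while True:
            self.emit()
            time.sleep(interval)

metrics = _StubMetrics()
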
Example #2
    def allow_send(self, message):
        application = message.get('application')

        if not application:
            return True

        # Purpose of quotas is to protect downstreams. If we're already going to drop this message,
        # don't let it account against quota.
        if message.get('mode') == 'drop':
            return True

        rate = self.rates.get(application)

        if not rate:
            return True

        hard_buckets, soft_buckets, hard_limit, soft_limit, wait_time, plan_name, target = rate

        # Increment both buckets for this minute
        hard_buckets[-1] += 1
        soft_buckets[-1] += 1

        # If hard limit breached, disallow sending this message and create incident
        hard_quota_usage = sum(hard_buckets)

        hard_usage_pct = 0
        if hard_limit > 0:
            hard_usage_pct = (hard_quota_usage / hard_limit) * 100
        metrics.set('app_%s_quota_hard_usage_pct' % application,
                    hard_usage_pct)

        if hard_quota_usage > hard_limit:
            metrics.incr('quota_hard_exceed_cnt')
            if plan_name:
                with self.last_incidents_mutex:
                    self.notify_incident(application, hard_limit,
                                         len(hard_buckets), plan_name,
                                         wait_time)
            return False

        # If soft limit breached, just notify owner and still send
        soft_quota_usage = sum(soft_buckets)

        soft_usage_pct = 0
        if soft_limit > 0:
            soft_usage_pct = (soft_quota_usage / soft_limit) * 100
        metrics.set('app_%s_quota_soft_usage_pct' % application,
                    soft_usage_pct)

        if soft_quota_usage > soft_limit:
            metrics.incr('quota_soft_exceed_cnt')
            if target:
                with self.last_soft_quota_notification_time_mutex:
                    self.notify_target(application, soft_limit,
                                       len(soft_buckets), *target)
            return True

        return True
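
allow_send above assumes self.rates maps each application to a tuple whose first two elements are per-minute counter buckets, with the current minute at index -1, so that sum(buckets) gives usage across the whole window. A sketch of how such buckets could be built and rotated, with assumed window sizes (iris's real quota code may size and rotate them differently):

from collections import deque

# Assumed window sizes; real configuration would drive these per application.
HARD_WINDOW_MINUTES = 60
SOFT_WINDOW_MINUTES = 5

def make_rate(hard_limit, soft_limit, wait_time, plan_name, target):
    # One integer counter per minute; index -1 is always the current minute.
    hard_buckets = deque([0] * HARD_WINDOW_MINUTES, maxlen=HARD_WINDOW_MINUTES)
    soft_buckets = deque([0] * SOFT_WINDOW_MINUTES, maxlen=SOFT_WINDOW_MINUTES)
    return (hard_buckets, soft_buckets, hard_limit, soft_limit,
            wait_time, plan_name, target)

def roll_minute(rate):
    # Called once a minute: appending a fresh zero evicts the oldest bucket,
    # so sum(buckets) only ever covers the most recent window.
    hard_buckets, soft_buckets = rate[0], rate[1]
    hard_buckets.append(0)
    soft_buckets.append(0)
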
Example #3
def main():
    boot_time = time.time()
    config = load_config()

    metrics.init(config, 'iris-owa-sync', default_metrics)

    owaconfig = config.get('owa')

    if not owaconfig:
        logger.critical('Missing OWA configs')
        sys.exit(1)

    api_host = owaconfig.get('api_host', 'http://localhost:16649')
    iris_client = IrisClient(api_host, 0, owaconfig['iris_app'],
                             owaconfig['iris_app_key'])

    proxies = owaconfig.get('proxies')

    # only way to configure a proxy is to monkey-patch (http adapter) a monkey-patch (baseprotocol) :/
    if proxies:
        UseProxyHttpAdapter._my_proxies = proxies
        exchangelib.protocol.BaseProtocol.HTTP_ADAPTER_CLS = UseProxyHttpAdapter

    creds = exchangelib.Credentials(**owaconfig['credentials'])

    try:
        nap_time = int(owaconfig.get('sleep_interval', 60))
    except ValueError:
        nap_time = 60

    while True:
        start_time = time.time()
        message_count = 0

        try:
            config = exchangelib.Configuration(credentials=creds,
                                               **owaconfig['config'])
            account = exchangelib.Account(config=config,
                                          access_type=exchangelib.DELEGATE,
                                          **owaconfig['account'])
        except (exchangelib.errors.EWSError,
                requests.exceptions.RequestException):
            logger.exception('Failed authenticating to OWA365')
            metrics.incr('owa_api_failure_count')
        else:
            logger.info('Receiving mail on behalf of %s',
                        owaconfig['account'].get('primary_smtp_address'))
            message_count = poll(account, iris_client)

        now = time.time()
        run_time = now - start_time
        logger.info(
            'Last run took %.2f seconds and processed %s messages. Waiting %s seconds until next poll...',
            run_time, message_count, nap_time)
        metrics.set('uptime', now - boot_time)
        metrics.emit()
        sleep(nap_time)
Example #4
def deactivate():
    # deactivate incidents that have expired
    logger.info('[-] start deactivate task...')
    start_deactivation = time.time()

    connection = db.engine.raw_connection()
    cursor = connection.cursor()
    cursor.execute(INACTIVE_SQL)
    connection.commit()
    cursor.close()
    connection.close()

    metrics.set('deactivation', time.time() - start_deactivation)
    logger.info('[*] deactivate task finished')
Example #5
def aggregate(now):
    # see if it's time to send the batches
    logger.info('[-] start aggregate task - queued: %s', len(messages))
    start_aggregations = time.time()
    for key in list(queues.keys()):
        aggregation_window = cache.plans[key[0]]['aggregation_window']
        if now - sent.get(key, 0) >= aggregation_window:
            aggregated_message_ids = queues[key]

            connection = db.engine.raw_connection()
            cursor = connection.cursor()
            cursor.execute(
                'SELECT `id` FROM `message` WHERE active=1 AND `id` in %s',
                [aggregated_message_ids])
            active_message_ids = {r[0] for r in cursor}
            cursor.close()
            connection.close()

            inactive_message_ids = aggregated_message_ids - active_message_ids
            l = len(active_message_ids)
            logger.info(
                '[x] dropped %s messages from claimed incidents, %s remain for %r',
                len(inactive_message_ids), l, key)

            # remove inactive message from the queue
            for message_id in inactive_message_ids:
                del messages[message_id]

            if l == 1:
                m = messages.pop(next(iter(active_message_ids)))
                logger.info('aggregate - %(message_id)s pushing to send queue',
                            m)
                send_queue.put(m)
            elif l > 1:
                uuid = uuid4().hex
                m = messages[next(iter(active_message_ids))]
                logger.info('aggregate - %s pushing to send queue', uuid)
                m['batch_id'] = uuid

                # Cast from set to list, as sets are not msgpack serializable
                m['aggregated_ids'] = list(active_message_ids)
                send_queue.put(m)
                for message_id in active_message_ids:
                    del messages[message_id]
                logger.info('[-] purged %s from messages %s remaining',
                            active_message_ids, len(messages))
            del queues[key]
            sent[key] = now
    metrics.set('aggregations', time.time() - start_aggregations)
    logger.info('[*] aggregate task finished - queued: %s', len(messages))
Example #6
    def allow_send(self, message):
        application = message.get('application')

        if not application:
            return True

        rate = self.rates.get(application)

        if not rate:
            return True

        hard_buckets, soft_buckets, hard_limit, soft_limit, wait_time, plan_name, target = rate

        # Increment both buckets for this minute
        hard_buckets[-1] += 1
        soft_buckets[-1] += 1

        # If hard limit breached, disallow sending this message and create incident
        hard_quota_usage = sum(hard_buckets)

        hard_usage_pct = 0
        if hard_limit > 0:
            hard_usage_pct = (hard_quota_usage / hard_limit) * 100
        metrics.set('app_%s_quota_hard_usage_pct' % application,
                    hard_usage_pct)

        if hard_quota_usage > hard_limit:
            metrics.incr('quota_hard_exceed_cnt')
            self.notify_incident(application, hard_limit, len(hard_buckets),
                                 plan_name, wait_time)
            return False

        # If soft limit breached, just notify owner and still send
        soft_quota_usage = sum(soft_buckets)

        soft_usage_pct = 0
        if soft_limit > 0:
            soft_usage_pct = (soft_quota_usage / soft_limit) * 100
        metrics.set('app_%s_quota_soft_usage_pct' % application,
                    soft_usage_pct)

        if soft_quota_usage > soft_limit:
            metrics.incr('quota_soft_exceed_cnt')
            self.notify_target(application, soft_limit, len(soft_buckets),
                               *target)
            return True

        return True
Example #7
def main():
    global ldap_timeout
    config = load_config()
    metrics.init(config, 'iris-sync-targets', stats_reset)

    default_ldap_timeout = 20
    default_nap_time = 3600

    ldap_timeout = int(
        config.get('sync_script_ldap_timeout', default_ldap_timeout))
    try:
        nap_time = int(config.get('sync_script_nap_time', default_nap_time))
    except ValueError:
        nap_time = default_nap_time

    engine = create_engine(
        config['db']['conn']['str'] % config['db']['conn']['kwargs'],
        **config['db']['kwargs'])

    # Optionally, maintain an internal list of mailing lists from ldap that can also be
    # used as targets.
    ldap_lists = config.get('ldap_lists')

    # Initialize these to zero at the start of the app, and don't reset them at every
    # metrics interval
    metrics.set('users_found', 0)
    metrics.set('teams_found', 0)

    metrics.set('ldap_lists_found', 0)
    metrics.set('ldap_memberships_found', 0)

    metrics_task = spawn(metrics.emit_forever)

    while True:
        if not bool(metrics_task):
            logger.error('metrics task failed, %s', metrics_task.exception)
            metrics_task = spawn(metrics.emit_forever)

        sync_from_oncall(config, engine)

        # Do ldap mailing list sync *after* we do the normal sync, to ensure we have the users
        # which will be in ldap already populated.
        if ldap_lists:

            if 'ldap_cert_path' in ldap_lists:
                ldap_cert_path = ldap_lists['ldap_cert_path']
                if not os.access(ldap_cert_path, os.R_OK):
                    logger.error("Failed to read ldap_cert_path certificate")
                    raise IOError
                else:
                    ldap_lists['cert_path'] = ldap_cert_path
            list_run_start = time.time()
            sync_ldap_lists(ldap_lists, engine)
            logger.info('Ldap mailing list sync took %.2f seconds',
                        time.time() - list_run_start)

        logger.info('Sleeping for %d seconds' % nap_time)
        sleep(nap_time)
Example #8
    def leave_cluster(self):
        self.started_shutdown = True

        # cancel any attempts to acquire leader lock which could make us hang
        self.lock.cancel()

        if self.zk.state == KazooState.CONNECTED:
            if self.party and self.party.participating:
                logger.info('Leaving party')
                self.party.leave()
            if self.lock and self.lock.is_acquired:
                logger.info('Releasing lock')
                self.lock.release()

        # Make us not the leader
        self.is_leader = False

        # Avoid sending metrics that we are still the leader when we're not
        metrics.set('is_leader_sender', 0)
Example #9
def gwatch_renewer():
    gmail_config = config['gmail']
    gcli = Gmail(gmail_config, config.get('gmail_proxy'))
    while True:
        logger.info('[-] start gmail watcher loop...')
        logger.info('renewing gmail watcher...')
        re = gcli.watch(gmail_config['project'], gmail_config['topic'])
        try:
            history_id, expiration = (int(
                re['historyId']), int(re['expiration']) / 1000 - time.time())
        except KeyError:
            logger.exception(
                '[*] gmail watcher run failed. Skipping this run.')
        else:
            metrics.set('gmail_history_id', history_id)
            metrics.set('gmail_seconds_to_watch_expiration', expiration)
            logger.info('[*] gmail watcher loop finished')

        # only renew every 8 hours
        sleep(60 * 60 * 8)
Example #10
def poll():
    # poll unsent messages
    logger.info('[-] start send task...')
    start_send = time.time()

    connection = db.engine.raw_connection()
    cursor = connection.cursor(db.dict_cursor)
    if messages:
        cursor.execute(UNSENT_MESSAGES_SQL + ' AND `message`.`id` NOT IN %s',
                       [tuple(messages)])
    else:
        cursor.execute(UNSENT_MESSAGES_SQL)

    new_msg_count = cursor.rowcount
    queued_msg_cnt = len(messages)
    metrics.set('new_msg_count', new_msg_count)
    logger.info('%d new messages waiting in database - queued: %d',
                new_msg_count, queued_msg_cnt)

    for m in cursor:
        # iris's own email response does not have context since content and
        # subject are already set
        if m.get('context'):
            context = ujson.loads(m['context'])
            # inject meta variables
            context['iris'] = {k: m[k] for k in m if k != 'context'}
            m['context'] = context
        message_queue.put(m)

    metrics.set('poll', time.time() - start_send)
    metrics.set('queue', len(messages))
    logger.info('[*] send task finished')
    cursor.close()
    connection.close()
Example #11
    def update_forever(self):
        while True:
            if self.started_shutdown:
                return

            old_status = self.is_master
            self.update_status()
            new_status = self.is_master

            if old_status != new_status:
                log = logger.info
            else:
                log = logger.debug

            if self.is_master:
                log('I am the master sender')
            else:
                log('I am a slave sender')

            metrics.set('slave_instance_count', self.slave_count)
            metrics.set('is_master_sender', int(self.is_master is True))

            sleep(UPDATE_FREQUENCY)
Example #12
def poll(account, iris_client):

    try:
        metrics.set('total_inbox_count', account.inbox.total_count)
        metrics.set('unread_inbox_count', account.inbox.unread_count)
    except (exchangelib.errors.EWSError, requests.exceptions.RequestException):
        logger.exception('Failed to gather inbox counts from OWA API')
        metrics.incr('owa_api_failure_count')

    processed_messages = 0
    messages_to_mark_read = []

    try:
        for message in account.inbox.filter(
                is_read=False).order_by('-datetime_received'):
            processed_messages += 1

            try:
                relay(message, iris_client)
            except Exception:
                logger.exception('Uncaught exception during message relaying')
                metrics.incr('message_relay_failure_count')

            # Mark it as read in bulk later. (This syntax isn't documented)
            message.is_read = True
            messages_to_mark_read.append((message, ('is_read', )))

    except (exchangelib.errors.EWSError, requests.exceptions.RequestException):
        logger.exception('Failed to iterate through inbox')
        metrics.incr('owa_api_failure_count')

    if messages_to_mark_read:
        bulk_update_count = len(messages_to_mark_read)
        logger.info('will mark %s messages as read', bulk_update_count)
        try:
            account.bulk_update(items=messages_to_mark_read)
        except (exchangelib.errors.EWSError,
                requests.exceptions.RequestException):
            logger.exception(
                'Failed to update read status on %s messages in bulk',
                bulk_update_count)
            metrics.incr('owa_api_failure_count')

    metrics.set('message_process_count', processed_messages)
    return processed_messages
Example #13
def sync_ldap_lists(ldap_settings, engine):
    try:
        l = ldap.ldapobject.ReconnectLDAPObject(
            ldap_settings['connection']['url'])
    except Exception:
        logger.exception('Connecting to ldap to get our mailing lists failed.')
        return

    try:
        l.simple_bind_s(*ldap_settings['connection']['bind_args'])
    except Exception:
        logger.exception('binding to ldap to get our mailing lists failed.')
        return

    session = sessionmaker(bind=engine)()

    mailing_list_type_name = 'mailing-list'

    list_type_id = session.execute(
        'SELECT `id` FROM `target_type` WHERE `name` = :name', {
            'name': mailing_list_type_name
        }).scalar()
    if not list_type_id:
        try:
            list_type_id = session.execute(
                'INSERT INTO `target_type` (`name`) VALUES (:name)', {
                    'name': mailing_list_type_name
                }).lastrowid
            session.commit()
            logger.info('Created target_type "%s" with id %s',
                        mailing_list_type_name, list_type_id)
        except (IntegrityError, DataError):
            logger.exception('Failed creating mailing-list type ID')
            return

    ldap_add_pause_interval = ldap_settings.get('user_add_pause_interval',
                                                None)
    ldap_add_pause_duration = ldap_settings.get('user_add_pause_duration', 1)

    ldap_lists = get_ldap_lists(l, ldap_settings['search_strings'])
    ldap_lists_count = len(ldap_lists)
    metrics.set('ldap_lists_found', ldap_lists_count)
    metrics.set('ldap_memberships_found', 0)
    logger.info('Found %s ldap lists', ldap_lists_count)

    existing_ldap_lists = {
        row[0]
        for row in session.execute(
            '''SELECT `name` FROM `target` WHERE `target`.`type_id` = :type_id''',
            {'type_id': list_type_id})
    }
    kill_lists = existing_ldap_lists - {item[1] for item in ldap_lists}
    if kill_lists:
        metrics.incr('ldap_lists_removed', len(kill_lists))
        for ldap_list in kill_lists:
            prune_target(engine, ldap_list, mailing_list_type_name)

    user_add_count = 0

    for list_cn, list_name in ldap_lists:
        try:
            members = get_ldap_flat_membership(l,
                                               ldap_settings['search_strings'],
                                               list_cn,
                                               ldap_settings['max_depth'], 0,
                                               set())
        except ldap.SERVER_DOWN:
            # reconnect and retry once
            metrics.incr('ldap_reconnects')
            logger.warning('LDAP server went away for list %s. Reconnecting',
                           list_name)
            l.reconnect(ldap_settings['connection']['url'])
            members = get_ldap_flat_membership(l,
                                               ldap_settings['search_strings'],
                                               list_cn,
                                               ldap_settings['max_depth'], 0,
                                               set())

        if not members:
            logger.info('Ignoring/pruning empty ldap list %s', list_name)
            continue

        num_members = len(members)
        metrics.incr('ldap_memberships_found', num_members)

        created = False
        list_id = session.execute(
            '''SELECT `mailing_list`.`target_id`
                                     FROM `mailing_list`
                                     JOIN `target` on `target`.`id` = `mailing_list`.`target_id`
                                     WHERE `target`.`name` = :name''', {
                'name': list_name
            }).scalar()

        if not list_id:
            try:
                list_id = session.execute(
                    '''INSERT INTO `target` (`type_id`, `name`)
                                             VALUES (:type_id, :name)''', {
                        'type_id': list_type_id,
                        'name': list_name
                    }).lastrowid
                session.commit()
            except (IntegrityError, DataError):
                logger.exception(
                    'Failed adding row to target table for mailing list %s. Skipping this list.',
                    list_name)
                metrics.incr('ldap_lists_failed_to_add')
                continue

            try:
                session.execute(
                    '''INSERT INTO `mailing_list` (`target_id`, `count`) VALUES (:list_id, :count)''',
                    {
                        'list_id': list_id,
                        'count': num_members
                    })
                session.commit()
            except (IntegrityError, DataError):
                logger.exception(
                    'Failed adding row to mailing_list table for mailing list %s (ID: %s). Skipping this list.',
                    list_name, list_id)
                metrics.incr('ldap_lists_failed_to_add')
                continue

            logger.info('Created list %s with id %s', list_name, list_id)
            metrics.incr('ldap_lists_added')
            created = True

        if not created:
            session.execute(
                'UPDATE `mailing_list` SET `count` = :count WHERE `target_id` = :list_id',
                {
                    'count': num_members,
                    'list_id': list_id
                })
            session.commit()

        existing_members = {
            row[0]
            for row in session.execute(
                '''
                            SELECT `target_contact`.`destination`
                            FROM `mailing_list_membership`
                            JOIN `target_contact` ON `target_contact`.`target_id` = `mailing_list_membership`.`user_id`
                            WHERE `mailing_list_membership`.`list_id` = :list_id
                            AND `target_contact`.`mode_id` = (SELECT `id` FROM `mode` WHERE `name` = 'email')
                            ''', {'list_id': list_id})
        }

        add_members = members - existing_members
        kill_members = existing_members - members

        if add_members:
            metrics.incr('ldap_memberships_added', len(add_members))

            for member in add_members:
                try:
                    session.execute(
                        '''INSERT IGNORE INTO `mailing_list_membership`
                                       (`list_id`, `user_id`)
                                       VALUES (:list_id,
                                               (SELECT `target_id` FROM `target_contact`
                                                JOIN `target` ON `target`.`id` = `target_id`
                                                WHERE `destination` = :name
                                                AND `mode_id` = (SELECT `id` FROM `mode` WHERE `name` = 'email')
                                                AND `target`.`type_id` = (SELECT `id` FROM `target_type` WHERE `name` = 'user')))
                                    ''', {
                            'list_id': list_id,
                            'name': member
                        })
                    logger.info('Added %s to list %s', member, list_name)
                except (IntegrityError, DataError):
                    metrics.incr('ldap_memberships_failed_to_add')
                    logger.warning('Failed adding %s to %s', member, list_name)

                user_add_count += 1
                if (ldap_add_pause_interval is not None) and (
                        user_add_count % ldap_add_pause_interval) == 0:
                    logger.info('Pausing for %s seconds every %s users.',
                                ldap_add_pause_duration,
                                ldap_add_pause_interval)
                    time.sleep(ldap_add_pause_duration)

        if kill_members:
            metrics.incr('ldap_memberships_removed', len(kill_members))
            batch_remove_ldap_memberships(session, list_id, kill_members)

    session.commit()
    session.close()
Example #14
def sync_from_oncall(config, engine, purge_old_users=True):
    # users and teams present in our oncall database
    oncall_base_url = config.get('oncall-api')

    if not oncall_base_url:
        logger.error(
            'Missing URL to oncall-api, which we use for user/team lookups. Bailing.'
        )
        return

    oncall = oncallclient.OncallClient(config.get('oncall-app', ''),
                                       config.get('oncall-key', ''),
                                       oncall_base_url)
    oncall_users = fetch_users_from_oncall(oncall)

    if not oncall_users:
        logger.warning('No users found. Bailing.')
        return

    oncall_team_names = fetch_teams_from_oncall(oncall)

    if not oncall_team_names:
        logger.warning('We do not have a list of team names')

    oncall_team_names = set(oncall_team_names)

    session = sessionmaker(bind=engine)()

    # users present in iris' database
    iris_users = {}
    for row in engine.execute(
            '''SELECT `target`.`name` as `name`, `mode`.`name` as `mode`,
                                        `target_contact`.`destination`
                                 FROM `target`
                                 JOIN `user` on `target`.`id` = `user`.`target_id`
                                 LEFT OUTER JOIN `target_contact` ON `target`.`id` = `target_contact`.`target_id`
                                 LEFT OUTER JOIN `mode` ON `target_contact`.`mode_id` = `mode`.`id`
                                 WHERE `target`.`active` = TRUE
                                 ORDER BY `target`.`name`'''):
        contacts = iris_users.setdefault(row.name, {})
        if row.mode is None or row.destination is None:
            continue
        contacts[row.mode] = row.destination

    iris_usernames = iris_users.viewkeys()

    # users from the oncall endpoints and config files
    metrics.set('users_found', len(oncall_users))
    metrics.set('teams_found', len(oncall_team_names))
    oncall_users.update(get_predefined_users(config))
    oncall_usernames = oncall_users.viewkeys()

    # set of users not presently in iris
    users_to_insert = oncall_usernames - iris_usernames
    # set of existing iris users that are in the user oncall database
    users_to_update = iris_usernames & oncall_usernames
    users_to_mark_inactive = iris_usernames - oncall_usernames

    # get objects needed for insertion
    target_types = {
        name: id
        for name, id in session.execute(
            'SELECT `name`, `id` FROM `target_type`')
    }  # 'team' and 'user'
    modes = {
        name: id
        for name, id in session.execute('SELECT `name`, `id` FROM `mode`')
    }
    iris_team_names = {
        name
        for (name, ) in engine.execute(
            '''SELECT `name` FROM `target` WHERE `type_id` = %s''',
            target_types['team'])
    }

    target_add_sql = 'INSERT INTO `target` (`name`, `type_id`) VALUES (%s, %s) ON DUPLICATE KEY UPDATE `active` = TRUE'
    user_add_sql = 'INSERT IGNORE INTO `user` (`target_id`) VALUES (%s)'
    target_contact_add_sql = '''INSERT INTO `target_contact` (`target_id`, `mode_id`, `destination`)
                                VALUES (%s, %s, %s)
                                ON DUPLICATE KEY UPDATE `destination` = %s'''

    # insert users that need to be
    logger.info('Users to insert (%d)' % len(users_to_insert))
    for username in users_to_insert:
        logger.info('Inserting %s' % username)
        try:
            target_id = engine.execute(
                target_add_sql, (username, target_types['user'])).lastrowid
            engine.execute(user_add_sql, (target_id, ))
        except SQLAlchemyError as e:
            metrics.incr('users_failed_to_add')
            metrics.incr('sql_errors')
            logger.exception('Failed to add user %s' % username)
            continue
        metrics.incr('users_added')
        for key, value in oncall_users[username].iteritems():
            if value and key in modes:
                logger.info('%s: %s -> %s' % (username, key, value))
                engine.execute(target_contact_add_sql,
                               (target_id, modes[key], value, value))

    # update users that need to be
    contact_update_sql = 'UPDATE target_contact SET destination = %s WHERE target_id = (SELECT id FROM target WHERE name = %s) AND mode_id = %s'
    contact_insert_sql = 'INSERT INTO target_contact (target_id, mode_id, destination) VALUES ((SELECT id FROM target WHERE name = %s), %s, %s)'
    contact_delete_sql = 'DELETE FROM target_contact WHERE target_id = (SELECT id FROM target WHERE name = %s) AND mode_id = %s'

    logger.info('Users to update (%d)' % len(users_to_update))
    for username in users_to_update:
        try:
            db_contacts = iris_users[username]
            oncall_contacts = oncall_users[username]
            for mode in modes:
                if mode in oncall_contacts and oncall_contacts[mode]:
                    if mode in db_contacts:
                        if oncall_contacts[mode] != db_contacts[mode]:
                            logger.info('%s: updating %s' % (username, mode))
                            metrics.incr('user_contacts_updated')
                            engine.execute(
                                contact_update_sql,
                                (oncall_contacts[mode], username, modes[mode]))
                    else:
                        logger.info('%s: adding %s' % (username, mode))
                        metrics.incr('user_contacts_updated')
                        engine.execute(
                            contact_insert_sql,
                            (username, modes[mode], oncall_contacts[mode]))
                elif mode in db_contacts:
                    logger.info('%s: deleting %s' % (username, mode))
                    metrics.incr('user_contacts_updated')
                    engine.execute(contact_delete_sql, (username, modes[mode]))
                else:
                    logger.debug('%s: missing %s' % (username, mode))
        except SQLAlchemyError as e:
            metrics.incr('users_failed_to_update')
            metrics.incr('sql_errors')
            logger.exception('Failed to update user %s' % username)
            continue

    # sync teams between iris and oncall
    teams_to_insert = oncall_team_names - iris_team_names
    teams_to_deactivate = iris_team_names - oncall_team_names

    logger.info('Teams to insert (%d)' % len(teams_to_insert))
    for t in teams_to_insert:
        logger.info('Inserting %s' % t)
        try:
            target_id = engine.execute(target_add_sql,
                                       (t, target_types['team'])).lastrowid
            metrics.incr('teams_added')
        except SQLAlchemyError as e:
            logger.exception('Error inserting team %s: %s' % (t, e))
            metrics.incr('teams_failed_to_add')
            continue
    session.commit()
    session.close()

    # mark users/teams inactive
    if purge_old_users:
        logger.info('Users to mark inactive (%d)' %
                    len(users_to_mark_inactive))
        for username in users_to_mark_inactive:
            prune_target(engine, username, 'user')
        for team in teams_to_deactivate:
            prune_target(engine, team, 'team')
Example #15
def sync_from_oncall(config, engine, purge_old_users=True):
    # users and teams present in our oncall database
    oncall_base_url = config.get('oncall-api')

    if not oncall_base_url:
        logger.error(
            'Missing URL to oncall-api, which we use for user/team lookups. Bailing.'
        )
        return

    oncall = oncallclient.OncallClient(config.get('oncall-app', ''),
                                       config.get('oncall-key', ''),
                                       oncall_base_url)
    oncall_users = fetch_users_from_oncall(oncall)

    if not oncall_users:
        logger.warning('No users found. Bailing.')
        return

    # get teams from oncall-api and separate the list of tuples into two lists of name and ids
    oncall_teams_api_response = fetch_teams_from_oncall(oncall)
    if not oncall_teams_api_response:
        logger.warning('No teams found. Bailing.')
        return

    oncall_team_response = list(zip(*oncall_teams_api_response))
    oncall_team_names = [name.lower() for name in oncall_team_response[0]]
    oncall_team_ids = oncall_team_response[1]
    oncall_response_dict_name_key = dict(
        zip(oncall_team_names, oncall_team_ids))
    oncall_response_dict_id_key = dict(zip(oncall_team_ids, oncall_team_names))
    oncall_case_sensitive_dict = {
        name.lower(): name
        for name in oncall_team_response[0]
    }

    if not oncall_team_names:
        logger.warning('We do not have a list of team names')

    oncall_team_names = set(oncall_team_names)
    oncall_team_ids = set(oncall_team_ids)

    session = sessionmaker(bind=engine)()

    # users present in iris' database
    iris_users = {}
    for row in engine.execute(
            '''SELECT `target`.`name` as `name`, `mode`.`name` as `mode`,
                                        `target_contact`.`destination`
                                 FROM `target`
                                 JOIN `user` on `target`.`id` = `user`.`target_id`
                                 LEFT OUTER JOIN `target_contact` ON `target`.`id` = `target_contact`.`target_id`
                                 LEFT OUTER JOIN `mode` ON `target_contact`.`mode_id` = `mode`.`id`
                                 WHERE `target`.`active` = TRUE
                                 ORDER BY `target`.`name`'''):
        contacts = iris_users.setdefault(row.name, {})
        if row.mode is None or row.destination is None:
            continue
        contacts[row.mode] = row.destination

    iris_usernames = iris_users.keys()

    # users from the oncall endpoints and config files
    metrics.set('users_found', len(oncall_users))
    metrics.set('teams_found', len(oncall_team_names))
    oncall_users.update(get_predefined_users(config))
    oncall_usernames = oncall_users.keys()

    # set of users not presently in iris
    users_to_insert = oncall_usernames - iris_usernames
    # set of existing iris users that are in the user oncall database
    users_to_update = iris_usernames & oncall_usernames
    users_to_mark_inactive = iris_usernames - oncall_usernames

    # get objects needed for insertion
    target_types = {
        name: target_id
        for name, target_id in session.execute(
            'SELECT `name`, `id` FROM `target_type`')
    }  # 'team' and 'user'
    modes = {
        name: mode_id
        for name, mode_id in session.execute('SELECT `name`, `id` FROM `mode`')
    }
    iris_team_names = {
        name.lower()
        for (name, ) in engine.execute(
            '''SELECT `name` FROM `target` WHERE `type_id` = %s''',
            target_types['team'])
    }
    target_add_sql = 'INSERT INTO `target` (`name`, `type_id`) VALUES (%s, %s) ON DUPLICATE KEY UPDATE `active` = TRUE'
    oncall_add_sql = 'INSERT INTO `oncall_team` (`target_id`, `oncall_team_id`) VALUES (%s, %s)'
    user_add_sql = 'INSERT IGNORE INTO `user` (`target_id`) VALUES (%s)'
    target_contact_add_sql = '''INSERT INTO `target_contact` (`target_id`, `mode_id`, `destination`)
                                VALUES (%s, %s, %s)
                                ON DUPLICATE KEY UPDATE `destination` = %s'''

    # insert users that need to be
    logger.info('Users to insert (%d)', len(users_to_insert))
    for username in users_to_insert:
        sleep(update_sleep)
        logger.info('Inserting %s', username)
        try:
            target_id = engine.execute(
                target_add_sql, (username, target_types['user'])).lastrowid
            engine.execute(user_add_sql, (target_id, ))
        except SQLAlchemyError as e:
            metrics.incr('users_failed_to_add')
            metrics.incr('sql_errors')
            logger.exception('Failed to add user %s' % username)
            continue
        metrics.incr('users_added')
        for key, value in oncall_users[username].items():
            if value and key in modes:
                logger.info('%s: %s -> %s', username, key, value)
                try:
                    engine.execute(target_contact_add_sql,
                                   (target_id, modes[key], value, value))
                except SQLAlchemyError as e:
                    logger.exception('Error adding contact for target id: %s',
                                     target_id)
                    metrics.incr('sql_errors')
                    continue

    # update users that need to be
    contact_update_sql = 'UPDATE target_contact SET destination = %s WHERE target_id = (SELECT id FROM target WHERE name = %s AND type_id = %s) AND mode_id = %s'
    contact_insert_sql = 'INSERT INTO target_contact (target_id, mode_id, destination) VALUES ((SELECT id FROM target WHERE name = %s AND type_id = %s), %s, %s)'
    contact_delete_sql = 'DELETE FROM target_contact WHERE target_id = (SELECT id FROM target WHERE name = %s AND type_id = %s) AND mode_id = %s'

    logger.info('Users to update (%d)', len(users_to_update))
    for username in users_to_update:
        sleep(update_sleep)
        try:
            db_contacts = iris_users[username]
            oncall_contacts = oncall_users[username]
            for mode in modes:
                if mode in oncall_contacts and oncall_contacts[mode]:
                    if mode in db_contacts:
                        if oncall_contacts[mode] != db_contacts[mode]:
                            logger.info('%s: updating %s', username, mode)
                            metrics.incr('user_contacts_updated')
                            engine.execute(contact_update_sql,
                                           (oncall_contacts[mode], username,
                                            target_types['user'], modes[mode]))
                    else:
                        logger.info('%s: adding %s', username, mode)
                        metrics.incr('user_contacts_updated')
                        engine.execute(contact_insert_sql,
                                       (username, target_types['user'],
                                        modes[mode], oncall_contacts[mode]))
                elif mode in db_contacts:
                    logger.info('%s: deleting %s', username, mode)
                    metrics.incr('user_contacts_updated')
                    engine.execute(
                        contact_delete_sql,
                        (username, target_types['user'], modes[mode]))
                else:
                    logger.debug('%s: missing %s', username, mode)
        except SQLAlchemyError as e:
            metrics.incr('users_failed_to_update')
            metrics.incr('sql_errors')
            logger.exception('Failed to update user %s', username)
            continue

    # sync teams between iris and oncall

    # iris_db_oncall_team_ids (team_ids in the oncall_team table)
    # oncall_team_ids (team_ids from oncall api call)
    # oncall_team_names (names from oncall api call)
    # oncall_response_dict_name_key (key-value pairs of oncall team names and ids from api call)
    # oncall_response_dict_id_key (same as above, but with key and value inverted)
    # oncall_case_sensitive_dict (maps the case-insensitive oncall name to the original capitalization)
    # iris_team_names (names from target table)
    # iris_target_name_id_dict (dictionary of target name -> target_id mappings)
    # iris_db_oncall_team_id_name_dict (dictionary of oncall team_id -> oncall name mappings)

    # get all incoming names that match a target; check whether that target has an
    # entry in the oncall_team table, and if not, make one
    iris_target_name_id_dict = {
        name.lower(): target_id
        for name, target_id in engine.execute(
            '''SELECT `name`, `id` FROM `target` WHERE `type_id` = %s''',
            target_types['team'])
    }

    matching_target_names = iris_team_names.intersection(oncall_team_names)
    if matching_target_names:
        existing_up_to_date_oncall_teams = {
            name.lower()
            for (name, ) in session.execute(
                '''SELECT `target`.`name` FROM `target` JOIN `oncall_team` ON `oncall_team`.`target_id` = `target`.`id` WHERE `target`.`name` IN :matching_names''',
                {'matching_names': tuple(matching_target_names)})
        }
        # up to date target names that don't have an entry in the oncall_team table yet
        matching_target_names_no_oncall_entry = matching_target_names - existing_up_to_date_oncall_teams

        for t in matching_target_names_no_oncall_entry:
            logger.info('Inserting existing team into oncall_team %s', t)
            sleep(update_sleep)
            try:
                engine.execute(
                    '''UPDATE `target` SET `active` = TRUE WHERE `id` = %s''',
                    iris_target_name_id_dict[t])
                engine.execute(oncall_add_sql,
                               (iris_target_name_id_dict[t],
                                oncall_response_dict_name_key[t]))
            except SQLAlchemyError as e:
                logger.exception('Error inserting oncall_team %s: %s', t, e)
                metrics.incr('sql_errors')
                continue

    # rename all mismatching target names

    iris_db_oncall_team_id_name_dict = {
        team_id: name.lower()
        for name, team_id in engine.execute(
            '''SELECT target.name, oncall_team.oncall_team_id FROM `target` JOIN `oncall_team` ON oncall_team.target_id = target.id'''
        )
    }

    iris_db_oncall_team_ids = {
        oncall_team_id
        for (oncall_team_id, ) in engine.execute(
            '''SELECT `oncall_team_id` FROM `oncall_team`''')
    }
    matching_oncall_ids = oncall_team_ids.intersection(iris_db_oncall_team_ids)

    name_swaps = {}

    # find teams in the iris database whose names have changed
    for oncall_id in matching_oncall_ids:

        current_name = iris_db_oncall_team_id_name_dict[oncall_id]
        new_name = oncall_response_dict_id_key[oncall_id]
        try:
            if current_name != new_name:
                # handle edge case of teams swapping names
                if not iris_target_name_id_dict.get(new_name, None):
                    target_id_to_rename = iris_target_name_id_dict[
                        current_name]
                    logger.info('Renaming team %s to %s', current_name,
                                new_name)
                    engine.execute(
                        '''UPDATE `target` SET `name` = %s, `active` = TRUE WHERE `id` = %s''',
                        (oncall_case_sensitive_dict[new_name],
                         target_id_to_rename))
                else:
                    # there is a team swap so rename to a random name to prevent a violation of unique target name constraint
                    new_name = str(uuid.uuid4())
                    target_id_to_rename = iris_target_name_id_dict[
                        current_name]
                    name_swaps[oncall_id] = target_id_to_rename
                    logger.info('Renaming team %s to %s', current_name,
                                new_name)
                    engine.execute(
                        '''UPDATE `target` SET `name` = %s, `active` = TRUE WHERE `id` = %s''',
                        (new_name, target_id_to_rename))
                sleep(update_sleep)
        except SQLAlchemyError as e:
            logger.exception('Error changing team name of %s to %s',
                             current_name, new_name)
            metrics.incr('sql_errors')

    # go back and rename name_swaps to correct value
    for oncall_id, target_id_to_rename in name_swaps.items():
        new_name = oncall_response_dict_id_key[oncall_id]
        try:
            engine.execute(
                '''UPDATE `target` SET `name` = %s, `active` = TRUE WHERE `id` = %s''',
                (oncall_case_sensitive_dict[new_name], target_id_to_rename))
        except SQLAlchemyError as e:
            logger.exception('Error renaming target: %s', new_name)
            metrics.incr('sql_errors')
            continue
        sleep(update_sleep)


    # create new entries for new teams

    # if the team_id doesn't exist in oncall_team at this point then it is a new team.
    new_team_ids = oncall_team_ids - iris_db_oncall_team_ids
    logger.info('Teams to insert (%d)' % len(new_team_ids))

    for team_id in new_team_ids:
        t = oncall_case_sensitive_dict[oncall_response_dict_id_key[team_id]]
        new_target_id = None

        # add team to target table
        logger.info('Inserting team %s', t)
        sleep(update_sleep)
        try:
            new_target_id = engine.execute(target_add_sql,
                                           (t, target_types['team'])).lastrowid
            metrics.incr('teams_added')
        except SQLAlchemyError as e:
            logger.exception('Error inserting team %s: %s', t, e)
            metrics.incr('teams_failed_to_add')
            metrics.incr('sql_errors')
            continue

        # add team to oncall_team table
        if new_target_id:
            logger.info('Inserting new team into oncall_team %s', t)
            try:
                engine.execute(oncall_add_sql, (new_target_id, team_id))
            except SQLAlchemyError as e:
                logger.exception('Error inserting oncall_team %s: %s', t, e)
                metrics.incr('sql_errors')
                continue
    session.commit()
    session.close()

    # mark users/teams inactive
    if purge_old_users:
        # find active teams that don't exist in oncall anymore
        updated_iris_team_names = {
            name.lower()
            for (name, ) in engine.execute(
                '''SELECT `name` FROM `target` WHERE `type_id` = %s AND `active` = TRUE''',
                target_types['team'])
        }
        teams_to_deactivate = updated_iris_team_names - oncall_team_names

        logger.info('Users to mark inactive (%d)' %
                    len(users_to_mark_inactive))
        logger.info('Teams to mark inactive (%d)' % len(teams_to_deactivate))
        for username in users_to_mark_inactive:
            prune_target(engine, username, 'user')
            sleep(update_sleep)
        for team in teams_to_deactivate:
            prune_target(engine, team, 'team')
            sleep(update_sleep)
Example #16
def process_retention(engine, max_days, batch_size, cooldown_time,
                      archive_path):
    time_start = time.time()

    connection = engine.raw_connection()
    cursor = connection.cursor(engine.dialect.dbapi.cursors.SSCursor)

    deleted_incidents = 0
    deleted_messages = 0
    deleted_comments = 0

    # First, archive/kill incidents and their messages
    while True:

        # Get incidents to archive and kill, in batches
        try:
            cursor.execute(
                '''
                    SELECT
                        %s
                    FROM `incident`
                    LEFT JOIN `plan` on `plan`.`id` = `incident`.`plan_id`
                    LEFT JOIN `application` on `application`.`id` = `incident`.`application_id`
                    LEFT JOIN `target` ON `incident`.`owner_id` = `target`.`id`
                    WHERE `incident`.`created` < (CURDATE() - INTERVAL %%s DAY)
                    LIMIT %%s
                ''' % (', '.join(field[0] for field in incident_fields)),
                [max_days, batch_size])
        except Exception:
            logger.exception('Failed getting incidents')
            try:
                cursor.close()
            except Exception:
                pass
            cursor = connection.cursor(engine.dialect.dbapi.cursors.SSCursor)
            break

        incident_ids = deque()

        for incident in cursor:
            archive_incident(incident, archive_path)
            incident_ids.append(incident[0])

        if not incident_ids:
            break

        logger.info('Archived %d incidents', len(incident_ids))

        # Then, Archive+Kill all comments in these incidents
        while True:

            try:
                cursor.execute(
                    '''
                        SELECT
                          %s
                        FROM `comment`
                        LEFT JOIN `target` ON `comment`.`user_id` = `target`.`id`
                        WHERE `comment`.`incident_id` in %%s
                        LIMIT %%s
                    ''' % (', '.join(field[0] for field in comment_fields)),
                    [tuple(incident_ids), batch_size])

            except Exception:
                metrics.incr('sql_errors')
                logger.exception('Failed getting comments')
                try:
                    cursor.close()
                except Exception:
                    pass
                cursor = connection.cursor(
                    engine.dialect.dbapi.cursors.SSCursor)
                break

            comment_ids = deque()

            for comment in cursor:
                archive_comment(comment, archive_path)
                comment_ids.append(comment[0])

            if not comment_ids:
                break

            logger.info('Archived %d comments', len(comment_ids))

            try:
                deleted_rows = cursor.execute(
                    'DELETE FROM `comment` WHERE `id` IN %s',
                    [tuple(comment_ids)])
                connection.commit()
            except Exception:
                metrics.incr('sql_errors')
                logger.exception('Failed deleting comments from incidents')
                try:
                    cursor.close()
                except Exception:
                    pass
                cursor = connection.cursor(
                    engine.dialect.dbapi.cursors.SSCursor)
                break
            else:
                if deleted_rows:
                    logger.info('Killed %d comments from %d incidents',
                                deleted_rows, len(incident_ids))
                    deleted_comments += deleted_rows
                    sleep(cooldown_time)
                else:
                    break

        # Kill all dynamic plan maps associated with these incidents
        while True:
            try:
                deleted_rows = cursor.execute(
                    'DELETE FROM `dynamic_plan_map` WHERE `incident_id` IN %s',
                    [tuple(incident_ids)])
                connection.commit()
            except Exception:
                metrics.incr('sql_errors')
                logger.exception('Failed deleting dynamic plan maps')
                try:
                    cursor.close()
                except Exception:
                    pass
                cursor = connection.cursor(
                    engine.dialect.dbapi.cursors.SSCursor)
                break
            else:
                if deleted_rows:
                    logger.info('Killed %d dynamic plan maps', deleted_rows)
                    deleted_messages += deleted_rows
                    sleep(cooldown_time)
                else:
                    break

        # Archive+Kill all messages in these incidents
        while True:

            try:
                cursor.execute(
                    '''
                        SELECT
                          %s
                        FROM `message`
                        JOIN `priority` on `priority`.`id` = `message`.`priority_id`
                        LEFT JOIN `mode` on `mode`.`id` = `message`.`mode_id`
                        LEFT JOIN `template` ON `message`.`template_id` = `template`.`id`
                        LEFT JOIN `target` ON `message`.`target_id` = `target`.`id`
                        WHERE `message`.`incident_id` in %%s
                        LIMIT %%s
                    ''' % (', '.join(field[0] for field in message_fields)),
                    [tuple(incident_ids), batch_size])

            except Exception:
                metrics.incr('sql_errors')
                logger.exception('Failed getting messages')
                try:
                    cursor.close()
                except Exception:
                    pass
                cursor = connection.cursor(
                    engine.dialect.dbapi.cursors.SSCursor)
                break

            message_ids = deque()

            for message in cursor:
                archive_message(message, archive_path)
                message_ids.append(message[0])

            if not message_ids:
                break

            logger.info('Archived %d messages', len(message_ids))

            # explicitly delete all the extra message data
            try:
                cursor.execute(
                    'DELETE FROM `message_changelog` WHERE `message_id` IN %s',
                    [tuple(message_ids)])
                cursor.execute(
                    'DELETE FROM `response` WHERE `message_id` IN %s',
                    [tuple(message_ids)])
                cursor.execute(
                    'DELETE FROM `twilio_delivery_status` WHERE `message_id` IN %s',
                    [tuple(message_ids)])
                cursor.execute(
                    'DELETE FROM `twilio_retry` WHERE `message_id` IN %s',
                    [tuple(message_ids)])
                cursor.execute(
                    'DELETE FROM `generic_message_sent_status` WHERE `message_id` IN %s',
                    [tuple(message_ids)])
                connection.commit()
            except Exception:
                metrics.incr('sql_errors')
                logger.exception('Failed deleting message child')
                try:
                    cursor.close()
                except Exception:
                    pass
                cursor = connection.cursor(
                    engine.dialect.dbapi.cursors.SSCursor)

            try:
                deleted_rows = cursor.execute(
                    'DELETE FROM `message` WHERE `id` IN %s',
                    [tuple(message_ids)])
                connection.commit()
            except Exception:
                metrics.incr('sql_errors')
                logger.exception('Failed deleting messages from incidents')
                try:
                    cursor.close()
                except Exception:
                    pass
                cursor = connection.cursor(
                    engine.dialect.dbapi.cursors.SSCursor)
                # try deleting individually to directly identify any issues and prevent single error from stopping cleanup
                deleted_rows = 0
                for msg_id in message_ids:
                    try:
                        deleted_rows += cursor.execute(
                            'DELETE FROM `message` WHERE `id`=%s', msg_id)
                        connection.commit()
                    except Exception:
                        metrics.incr('sql_errors')
                        logger.exception('Failed deleting message id: %s',
                                         msg_id)
            else:
                if deleted_rows:
                    logger.info('Killed %d messages from %d incidents',
                                deleted_rows, len(incident_ids))
                    deleted_messages += deleted_rows
                    sleep(cooldown_time)
                else:
                    break

        # Finally kill incidents
        try:
            deleted_rows = cursor.execute(
                'DELETE FROM `incident` WHERE `id` IN %s',
                [tuple(incident_ids)])
            connection.commit()
        except Exception:
            metrics.incr('sql_errors')
            logger.exception('Failed deleting incidents')
            try:
                cursor.close()
            except Exception:
                pass
            cursor = connection.cursor(engine.dialect.dbapi.cursors.SSCursor)
            # try deleting individually to directly identify any issues and prevent single error from stopping clean-up
            deleted_rows = 0
            for inc_id in incident_ids:
                try:
                    deleted_rows += cursor.execute(
                        'DELETE FROM `incident` WHERE `id`=%s', inc_id)
                    connection.commit()
                except Exception:
                    metrics.incr('sql_errors')
                    logger.exception('Failed deleting incident id: %s', inc_id)
        logger.info('Deleted %s incidents', deleted_rows)
        deleted_incidents += deleted_rows
        sleep(cooldown_time)

    # Next, kill messages not tied to incidents, like quota notifs or incident tracking emails
    while True:
        try:
            deleted_rows = cursor.execute(
                'DELETE FROM `message` WHERE `created` < (CURDATE() - INTERVAL %s DAY) AND `incident_id` IS NULL LIMIT %s',
                [max_days, batch_size])
            connection.commit()
        except Exception:
            metrics.incr('sql_errors')
            logger.exception('Failed deleting messages')
            try:
                cursor.close()
            except Exception:
                pass
            break
        else:
            if deleted_rows:
                logger.info('Killed %d misc messages', deleted_rows)
                deleted_messages += deleted_rows
                sleep(cooldown_time)
            else:
                break

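    # All batches processed; close the cursor and connection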
    try:
        cursor.close()
    except Exception:
        pass
    connection.close()

    logger.info(
        'Run took %.2f seconds and deleted %d incidents and %d messages',
        time.time() - time_start, deleted_incidents, deleted_messages)
    metrics.set('deleted_messages', deleted_messages)
    metrics.set('deleted_incidents', deleted_incidents)
    metrics.set('deleted_comments', deleted_comments)
Beispiel #17
0
def process_retention(engine, max_days, batch_size, cooldown_time, archive_path):
    time_start = time.time()

    connection = engine.raw_connection()
    cursor = connection.cursor(engine.dialect.dbapi.cursors.SSCursor)

    deleted_incidents = 0
    deleted_messages = 0

    # First, archive/kill incidents and their messages
    while True:

        # Get incidents to archive and kill, in batches
        try:
            cursor.execute(
                '''
                    SELECT
                        %s
                    FROM `incident`
                    LEFT JOIN `plan` on `plan`.`id` = `incident`.`plan_id`
                    LEFT JOIN `application` on `application`.`id` = `incident`.`application_id`
                    LEFT JOIN `target` ON `incident`.`owner_id` = `target`.`id`
                    WHERE `incident`.`created` < (CURDATE() - INTERVAL %%s DAY)
                    LIMIT %%s
                ''' % (', '.join(field[0] for field in incident_fields)),
                [max_days, batch_size])
        except Exception:
            logger.exception('Failed getting incidents')
            try:
                cursor.close()
            except Exception:
                pass
            cursor = connection.cursor(engine.dialect.dbapi.cursors.SSCursor)
            break

        incident_ids = deque()

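        # Stream the batch from the server-side cursor, archiving each incident and collecting its id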
        for incident in cursor:
            archive_incident(incident, archive_path)
            incident_ids.append(incident[0])

        if not incident_ids:
            break

        logger.info('Archived %d incidents', len(incident_ids))

        # Archive+Kill all messages in these incidents
        while True:

            try:
                cursor.execute(
                    '''
                        SELECT
                          %s
                        FROM `message`
                        JOIN `priority` on `priority`.`id` = `message`.`priority_id`
                        LEFT JOIN `mode` on `mode`.`id` = `message`.`mode_id`
                        LEFT JOIN `template` ON `message`.`template_id` = `template`.`id`
                        LEFT JOIN `target` ON `message`.`target_id` = `target`.`id`
                        WHERE `message`.`incident_id` in %%s
                        LIMIT %%s
                    ''' % (', '.join(field[0] for field in message_fields)),
                    [tuple(incident_ids), batch_size])

            except Exception:
                metrics.incr('sql_errors')
                logger.exception('Failed getting messages')
                try:
                    cursor.close()
                except Exception:
                    pass
                cursor = connection.cursor(engine.dialect.dbapi.cursors.SSCursor)
                break

            message_ids = deque()

            for message in cursor:
                archive_message(message, archive_path)
                message_ids.append(message[0])

            if not message_ids:
                break

            logger.info('Archived %d messages', len(message_ids))

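            # Delete the archived messages in one batch; on failure, recreate the cursor and leave this inner loop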
            try:
                deleted_rows = cursor.execute('DELETE FROM `message` WHERE `id` IN %s', [tuple(message_ids)])
                connection.commit()
            except Exception:
                metrics.incr('sql_errors')
                logger.exception('Failed deleting messages from incidents')
                try:
                    cursor.close()
                except Exception:
                    pass
                cursor = connection.cursor(engine.dialect.dbapi.cursors.SSCursor)
                break
            else:
                if deleted_rows:
                    logger.info('Killed %d messages from %d incidents', deleted_rows, len(incident_ids))
                    deleted_messages += deleted_rows
                    sleep(cooldown_time)
                else:
                    break

        # Finally kill incidents
        try:
            deleted_rows = cursor.execute('DELETE FROM `incident` WHERE `id` IN %s', [tuple(incident_ids)])
            connection.commit()
        except Exception:
            metrics.incr('sql_errors')
            logger.exception('Failed deleting incidents')
            try:
                cursor.close()
            except Exception:
                pass
            cursor = connection.cursor(engine.dialect.dbapi.cursors.SSCursor)
            break
        else:
            logger.info('Deleted %s incidents', deleted_rows)
            deleted_incidents += deleted_rows
            sleep(cooldown_time)

    # Next, kill messages not tied to incidents, like quota notifs or incident tracking emails
    while True:
        try:
            deleted_rows = cursor.execute('DELETE FROM `message` WHERE `created` < (CURDATE() - INTERVAL %s DAY) AND `incident_id` IS NULL LIMIT %s', [max_days, batch_size])
            connection.commit()
        except Exception:
            metrics.incr('sql_errors')
            logger.exception('Failed deleting messages')
            try:
                cursor.close()
            except Exception:
                pass
            break
        else:
            if deleted_rows:
                logger.info('Killed %d misc messages', deleted_rows)
                deleted_messages += deleted_rows
                sleep(cooldown_time)
            else:
                break

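    # Done with all batches; release the cursor and connection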
    try:
        cursor.close()
    except Exception:
        pass
    connection.close()

    logger.info('Run took %.2f seconds and deleted %d incidents and %d messages', time.time() - time_start, deleted_incidents, deleted_messages)
    metrics.set('deleted_messages', deleted_messages)
    metrics.set('deleted_incidents', deleted_incidents)
Beispiel #18
0
def main():
    global ldap_timeout
    global ldap_pagination_size
    global update_sleep
    config = load_config()
    metrics.init(config, 'iris-sync-targets', stats_reset)

    default_ldap_timeout = 60
    default_ldap_pagination_size = 400
    default_update_sleep = 0
    default_ldap_nap_time = 3600
    default_oncall_nap_time = 60

    ldap_timeout = int(config.get('sync_script_ldap_timeout', default_ldap_timeout))
    ldap_pagination_size = int(config.get('sync_script_ldap_pagination_size', default_ldap_pagination_size))
    update_sleep = float(config.get('target_update_pause', default_update_sleep))
    try:
        ldap_nap_time = int(config.get('sync_script_ldap_nap_time', default_ldap_nap_time))
        oncall_nap_time = int(config.get('sync_script_oncall_nap_time', default_oncall_nap_time))
    except ValueError:
        ldap_nap_time = default_ldap_nap_time
        oncall_nap_time = default_oncall_nap_time

    # check if we are using special connection settings for this script
    if config.get('db_target_sync'):
        engine = create_engine(config['db_target_sync']['conn']['str'] % config['db_target_sync']['conn']['kwargs'],
                               **config['db_target_sync']['kwargs'])
    else:
        engine = create_engine(config['db']['conn']['str'] % config['db']['conn']['kwargs'],
                               **config['db']['kwargs'])

    # Optionally, maintain an internal list of mailing lists from ldap that can also be
    # used as targets.
    ldap_lists = config.get('ldap_lists')

    # Initialize these to zero at the start of the app, and don't reset them at every
    # metrics interval
    metrics.set('users_found', 0)
    metrics.set('teams_found', 0)

    metrics.set('ldap_lists_found', 0)
    metrics.set('ldap_memberships_found', 0)

    metrics_task = spawn(metrics.emit_forever)
    oncall_task = spawn(oncall_sync_loop, config, engine, oncall_nap_time)

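    # Only start the LDAP sync greenlet when ldap_lists is configured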
    if ldap_lists:
        if 'ldap_cert_path' in ldap_lists:
            ldap_cert_path = ldap_lists['ldap_cert_path']
            if not os.access(ldap_cert_path, os.R_OK):
                logger.error("Failed to read ldap_cert_path certificate")
                raise IOError
            else:
                ldap_lists['cert_path'] = ldap_cert_path
        ldap_task = spawn(ldap_sync_loop, ldap_lists, engine, ldap_nap_time)

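    # Supervisor loop: respawn any sync greenlet that has died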
    while True:
        if not bool(metrics_task):
            metrics.incr('failed_tasks')
            logger.error('metrics task failed, %s', metrics_task.exception)
            metrics_task = spawn(metrics.emit_forever)

        if not bool(oncall_task):
            metrics.incr('failed_tasks')
            logger.error('oncall task failed, %s', oncall_task.exception)
            oncall_task = spawn(oncall_sync_loop, config, engine, oncall_nap_time)

        if ldap_lists:
            if not bool(ldap_task):
                metrics.incr('failed_tasks')
                logger.error('ldap task failed, %s', ldap_task.exception)
                ldap_task = spawn(ldap_sync_loop, ldap_lists, engine, ldap_nap_time)

        sleep(10)
Beispiel #19
0
def main():
    global config
    config = load_config()

    start_time = time.time()

    logger.info('[-] bootstrapping sender...')
    init_sender(config)
    init_plugins(config.get('plugins', {}))
    init_vendors(config.get('vendors', []), config.get('applications', []))

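    # Spawn the send dispatcher and a pool of worker greenlets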
    send_task = spawn(send)
    worker_tasks = [spawn(worker) for x in xrange(100)]

    rpc.init(config['sender'], dict(send_message=send_message))
    rpc.run(config['sender'])

    spawn(coordinator.update_forever)

    gwatch_renewer_task = None
    prune_audit_logs_task = None

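    # Target period of the main sender loop, in seconds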
    interval = 60
    logger.info('[*] sender bootstrapped')
    while True:
        runtime = int(time.time())
        logger.info('--> sender loop started.')

        cache.refresh()
        cache.purge()

        # If we're currently a master, ensure our master-greenlets are running
        # and we're doing the master duties
        if coordinator.am_i_master():
            if not bool(gwatch_renewer_task):
                if should_mock_gwatch_renewer:
                    gwatch_renewer_task = spawn(mock_gwatch_renewer)
                else:
                    gwatch_renewer_task = spawn(gwatch_renewer)

            if not bool(prune_audit_logs_task):
                prune_audit_logs_task = spawn(prune_old_audit_logs_worker)

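            # Run the master-only tasks for this cycle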
            try:
                escalate()
                deactivate()
                poll()
                aggregate(runtime)
            except Exception:
                metrics.incr('task_failure')
                logger.exception("Exception occured in main loop.")

        # If we're not master, don't do the master tasks and make sure those other
        # greenlets are stopped if they're running
        else:
            logger.info('I am not the master so I am not doing master sender tasks.')

            # Stop these task greenlets if they're running. In practice this should
            # rarely happen: once we're the master, we usually only stop being the
            # master when our process exits, which would kill these greenlets anyway.
            if bool(gwatch_renewer_task):
                logger.info('I am not master anymore so stopping the gwatch renewer')
                gwatch_renewer_task.kill()

            if bool(prune_audit_logs_task):
                logger.info('I am not master anymore so stopping the audit logs worker')
                prune_audit_logs_task.kill()

        # check status for all background greenlets and respawn if necessary
        if not bool(send_task):
            logger.error("send task failed, %s", send_task.exception)
            metrics.incr('task_failure')
            send_task = spawn(send)
        bad_workers = []
        for i, task in enumerate(worker_tasks):
            if not bool(task):
                logger.error("worker task failed, %s", task.exception)
                metrics.incr('task_failure')
                bad_workers.append(i)
        for i in bad_workers:
            worker_tasks[i] = spawn(worker)

        now = time.time()
        metrics.set('sender_uptime', int(now - start_time))

        spawn(metrics.emit)

        elapsed_time = now - runtime
        nap_time = max(0, interval - elapsed_time)
        logger.info('--> sender loop finished in %s seconds - sleeping %s seconds',
                    elapsed_time, nap_time)
        sleep(nap_time)
Beispiel #20
0
                # no message created due to role look up failure, reset step to
                # 0 for retry
                step = 0
            cursor.execute(UPDATE_INCIDENT_SQL, (step, incident_id))
            msg_count += step_msg_cnt
        else:
            logger.error('plan id %d has no steps, incident id %d is invalid',
                         plan_id, incident_id)
            cursor.execute(INVALIDATE_INCIDENT, incident_id)
        connection.commit()
    cursor.close()
    connection.close()

    logger.info('[*] %s new messages', msg_count)
    logger.info('[*] escalate task finished')
    metrics.set('notifications', time.time() - start_notifications)


def aggregate(now):
    # see if it's time to send the batches
    logger.info('[-] start aggregate task - queued: %s', len(messages))
    start_aggregations = time.time()
    for key in queues.keys():
        aggregation_window = cache.plans[key[0]]['aggregation_window']
        if now - sent.get(key, 0) >= aggregation_window:
            aggregated_message_ids = queues[key]

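            # Look up which of the queued messages are still active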
            connection = db.engine.raw_connection()
            cursor = connection.cursor()
            cursor.execute(
                'SELECT `id` FROM `message` WHERE active=1 AND `id` in %s',