Example #1
import copy

from psycopg2 import sql

# hash_string_64bit is provided by the surrounding DIRBS module


def _do_final_cleanup(conn, logger, is_locked, tables_to_delete):
    """
    Function to perform final cleanup to remove intermediate tables and release locks.

    :param conn: database connection obj
    :param logger: dirbs logger obj
    :param is_locked: bool (to check if there is postgres advisory lock)
    :param tables_to_delete: list of tables to delete
    """
    if is_locked:
        with conn.cursor() as cursor:
            cursor.execute('SELECT pg_advisory_unlock(%s::BIGINT)',
                           [hash_string_64bit('dirbs-classify')])

    with conn.cursor() as cursor:
        remaining_tables_to_delete = copy.copy(tables_to_delete)
        for t in tables_to_delete:
            try:
                cursor.execute(
                    sql.SQL('DROP TABLE IF EXISTS {0} CASCADE').format(
                        sql.Identifier(t)))
                conn.commit()
                remaining_tables_to_delete.remove(t)
            except:  # noqa: E722
                for t_not_deleted in remaining_tables_to_delete:
                    logger.warning(
                        'Failed to drop table {0} due to exception. Please issue '
                        '\'DROP TABLE IF EXISTS {0}\' manually!'.format(
                            t_not_deleted))
                raise
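
All three examples in this listing derive their PostgreSQL advisory lock keys from hash_string_64bit, which maps a string such as 'dirbs-classify' onto PostgreSQL's signed 64-bit lock-key space. A minimal sketch of such a helper, assuming only that it must be deterministic and fit in a BIGINT (an illustrative stand-in, not the DIRBS implementation):

import hashlib


def hash_string_64bit(s):
    """Hypothetical stand-in: map a string to a stable signed 64-bit integer.

    pg_advisory_lock and friends take a BIGINT key, so we truncate a
    SHA-256 digest to 8 bytes and interpret them as a signed integer.
    """
    digest = hashlib.sha256(s.encode('utf-8')).digest()
    return int.from_bytes(digest[:8], 'big', signed=True)


# The same string always yields the same lock key, so independent
# processes contend for the same advisory lock
assert hash_string_64bit('dirbs-classify') == hash_string_64bit('dirbs-classify')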
Example #2
    def _import_lock_key(self):
        """String Key for the advisory lock to guard against multiple concurrent imports of the same type.

        Subclasses should override if they want to allow concurrent imports. For example,
        the operator data importer allows multiples to happen as long as they are for different operators.
        """
        return hash_string_64bit(self._import_type)
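
As the docstring notes, subclasses can override this to widen or narrow the lock's scope. A hedged sketch of what a per-operator override might look like, so that imports for different operators run concurrently while two imports for the same operator still contend (the _operator_id attribute is assumed for illustration; the real operator data importer may build its key differently):

    def _import_lock_key(self):
        """Hypothetical override: include the operator ID in the lock key."""
        # Same import type + same operator -> same key -> serialized;
        # different operators -> different keys -> free to run concurrently
        return hash_string_64bit('{0}-{1}'.format(self._import_type, self._operator_id))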
Example #3
import sys
from collections import defaultdict
from concurrent import futures

# metadata, Condition, hash_string_64bit and the Classify*Exception
# classes are provided by the surrounding DIRBS modules


def cli(ctx, config, statsd, logger, run_id, conn, metadata_conn, command,
        metrics_root, metrics_run_root, conditions, safety_check, curr_date,
        disable_sanity_checks):
    """
    DIRBS script to classify IMEIs.

    Iterates through all configured conditions and writes the results to the classification_state table.

    :param ctx: click command context
    :param config: dirbs config instance
    :param statsd: statsd instance
    :param logger: dirbs logger instance
    :param run_id: job run id
    :param conn: database connection
    :param metadata_conn: database connection for job metadata
    :param command: command name
    :param metrics_root: root prefix for this command's StatsD metric keys
    :param metrics_run_root: per-run prefix for StatsD metric keys (used for the per-condition matched IMEI gauges)
    :param conditions: list of user supplied conditions
    :param safety_check: bool (enable/disable safety check)
    :param curr_date: date to use for classification
    :param disable_sanity_checks: bool (enable/disable sanity checks)
    """
    _warn_about_curr_date(curr_date, logger)
    _warn_about_disabled_safety_check(safety_check, logger)

    # If we didn't specify a condition, use all configured conditions
    if conditions is None:
        conditions = config.conditions

    # Query the job metadata table for all successful classification runs
    successful_job_runs = metadata.query_for_command_runs(metadata_conn,
                                                          'dirbs-classify',
                                                          successful_only=True)
    if successful_job_runs and not disable_sanity_checks and not _perform_sanity_checks(
            config, successful_job_runs[0].extra_metadata):
        raise ClassifySanityCheckFailedException(
            'Sanity checks failed, configurations are not identical to the last successful classification'
        )

    logger.info('Classifying using conditions: {0}'.format(','.join(
        [c.label for c in conditions])))

    # Store metadata
    metadata.add_optional_job_metadata(
        metadata_conn,
        command,
        run_id,
        curr_date=curr_date.isoformat() if curr_date is not None else None,
        conditions=[c.as_dict() for c in conditions],
        operators=[op.as_dict() for op in config.region_config.operators],
        amnesty=config.amnesty_config.as_dict())

    # Per-condition intermediate tables
    intermediate_tables = []

    # Flag indicating whether any condition failed, so we can exit with an error code
    had_errored_condition = False

    try:
        locked = False
        with conn, conn.cursor() as cursor:
            # Lock to prevent multiple simultaneous classifications
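            # pg_try_advisory_lock is non-blocking: it returns a boolean
            # immediately rather than waiting, and the lock is held by this
            # database session until explicitly released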
            cursor.execute('SELECT pg_try_advisory_lock(%s::BIGINT)',
                           [hash_string_64bit('dirbs-classify')])
            locked = cursor.fetchone()[0]
            if not locked:
                raise ClassifyLockException(
                    'Could not acquire lock for classification. '
                    'Are there any other dirbs-classify instances running at the moment?'
                )

            # Calculate total IMEI count
            if safety_check:
                logger.info(
                    'Counting number of IMEIs in network_imeis for safety check...'
                )
                cursor.execute('SELECT COUNT(*) FROM network_imeis')
                total_imei_count = cursor.fetchone()[0]
                logger.info(
                    'Finished counting number of IMEIs in network_imeis for safety check'
                )
            else:
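                # Sentinel value: the safety check is disabled, so this
                # count is never compared against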
                total_imei_count = -1

        matched_imei_counts = {}
        nworkers = config.multiprocessing_config.max_db_connections
        condition_objs = [Condition(cond_config) for cond_config in conditions]

        with futures.ProcessPoolExecutor(max_workers=nworkers) as executor:
            logger.info(
                'Simultaneously classifying {0:d} dimensions using up to {1:d} workers...'
                .format(len(conditions), nworkers))

            calc_futures_to_condition = {}
            update_futures_to_condition = {}
            per_condition_state = defaultdict(
                lambda: dict(num_completed_calc_jobs=0,
                             num_total_calc_jobs=0,
                             num_completed_update_jobs=0,
                             num_total_update_jobs=0,
                             num_matched_imeis=0))
            for c in condition_objs:
                # Make sure we record all temporary tables so that we can clean up later
                intermediate_tables.append(c.intermediate_tbl_name(run_id))
                # Queue the condition calculations and keep track
                for f in c.queue_calc_imeis_jobs(executor, config, run_id,
                                                 curr_date):
                    calc_futures_to_condition[f] = c
                    per_condition_state[c.label]['num_total_calc_jobs'] += 1

            # Process calculation futures
            for condition, job_state in _completed_calc_jobs(
                    calc_futures_to_condition, per_condition_state, logger):
                max_ratio = condition.config.max_allowed_matching_ratio
                num_matched_imeis = job_state['num_matched_imeis']
                max_matched_imeis = max_ratio * total_imei_count
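                # e.g. with max_ratio = 0.1 and 1,000,000 IMEIs in
                # network_imeis, a condition matching more than 100,000
                # IMEIs is refused below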
                if safety_check and total_imei_count > 0 and num_matched_imeis > max_matched_imeis:
                    ratio = min(num_matched_imeis / total_imei_count, 1)
                    logger.error(
                        'Refusing to classify using condition \'{0}\': '
                        'This condition matches more than the maximum number of IMEIs allowed by the '
                        'condition\'s configuration '
                        '(matched_imeis={1:d}, ratio={2:f}, max_ratio={3:f})'.
                        format(condition.label, num_matched_imeis, ratio,
                               max_ratio))
                    had_errored_condition = True
                else:
                    # Queue the classification state updates and keep track
                    for f in condition.queue_update_classification_state_jobs(
                            executor, config, run_id, curr_date):
                        update_futures_to_condition[f] = condition
                        per_condition_state[
                            condition.label]['num_total_update_jobs'] += 1

            # Process update futures
            for condition, job_state in _completed_update_jobs(
                    update_futures_to_condition, per_condition_state, logger):
                # Update metadata about matched IMEI counts each time a condition finishes
                matched_imei_counts[
                    condition.label] = job_state['num_matched_imeis']
                metadata.add_optional_job_metadata(
                    metadata_conn,
                    command,
                    run_id,
                    matched_imei_counts=matched_imei_counts)
                # Output StatsD stats
                statsd.gauge(
                    '{0}matched_imeis.{1}'.format(metrics_run_root,
                                                  condition.label.lower()),
                    job_state['num_matched_imeis'])

    finally:
        _do_final_cleanup(conn, logger, locked, intermediate_tables)

        # If any condition errored, exit with a non-zero status code
        if had_errored_condition:
            sys.exit(1)
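
The two processing loops above consume _completed_calc_jobs and _completed_update_jobs, which are not shown in this listing. A minimal sketch of what the calc-job generator might look like, assuming it wraps futures.as_completed and yields each condition once with its aggregated job state (the per-future result shape is an assumption):

from concurrent import futures


def _completed_calc_jobs(futures_to_condition, per_condition_state, logger):
    """Hypothetical sketch: yield (condition, state) as conditions finish."""
    for f in futures.as_completed(futures_to_condition):
        condition = futures_to_condition[f]
        state = per_condition_state[condition.label]
        state['num_completed_calc_jobs'] += 1
        state['num_matched_imeis'] += f.result()  # assumed: each job returns a matched-IMEI count
        # Yield each condition exactly once, after its last calc job
        # completes, so the caller sees final per-condition totals
        if state['num_completed_calc_jobs'] == state['num_total_calc_jobs']:
            logger.info('Finished calculating matching IMEIs for condition \'{0}\''.format(condition.label))
            yield condition, state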