def import_data(self):
    """Main import function that checks files and validates the contents before importing to the database."""
    assert self._was_entered, 'Attempting to import with an importer outside of a with statement!'
    try:
        # Store time so that we can track metrics for total import time
        st = time.time()
        # Store initial metadata
        metadata.add_optional_job_metadata(self._metadata_conn, 'dirbs-import', self.import_id,
                                           **self._import_metadata)
        # Log initial message
        self._logger.info('Importing {0} data from file \'{1}\''.format(self._import_type, self._filename))
        # Init staging table (commit afterwards to ensure other processes can see table)
        with self._conn:
            self._time_component_perf('init_staging', self._init_staging_table)
            if self._supports_imei_shards:
                self._time_component_perf('init_staging_shards', self._init_staging_table_shards)
        # Compute MD5 hash
        self._time_component_perf('compute_md5', self._compute_md5_hash)
        # Now do extract -> split -> preprocess -> prevalidate -> upload pipeline
        self._time_component_perf('upload_pipeline', self._upload_pipeline)
        # ANALYZE staging table after upload
        self._time_component_perf('analyze_staging', self._analyze_staging_table)
        # Run binary (yes/no) validation checks that operate on "raw" data (prior to post-processing)
        self._time_component_perf('validation_binary_checks_raw', self._validate_binary_checks_raw)
        # Post-process staging table
        self._time_component_perf('postprocess_staging', self._postprocess_staging_data)
        # Run binary (yes/no) validation checks
        self._time_component_perf('validation_binary_checks', self._validate_binary_checks)
        # Run row threshold validation checks
        self._time_component_perf('validation_threshold_checks', self._validate_threshold_checks)
        # Run validation checks based on historic data
        self._time_component_perf('validation_historical_checks', self._validate_historical_checks)
        # Copy data from the staging table
        rows_before = -1  # Sentinel value
        if self._need_previous_count_for_stats:
            rows_before = self.row_count
        rows_inserted, rows_updated, rows_deleted = \
            self._time_component_perf('copy_from_staging', self._copy_staging_data)
        # Output import stats
        self._time_component_perf('output_stats', self._output_stats, rows_before, rows_inserted,
                                  rows_updated, rows_deleted)
    finally:
        dt = int((time.time() - st) * 1000)
        self._log_normalized_import_time_metrics(dt)

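# A minimal standalone sketch (not the DIRBS implementation) of what a _time_component_perf-style helper
# could look like: it runs one import sub-step, records its duration as a StatsD gauge under a
# per-component key and passes the sub-step's return value through to the caller. The statsd,
# metrics_root and logger parameters are illustrative assumptions only.
import time


def time_component_perf(component_name, fn, *args, statsd=None, metrics_root='', logger=None, **kwargs):
    """Run fn(*args, **kwargs), report its runtime under component_name and return fn's result."""
    start = time.time()
    try:
        return fn(*args, **kwargs)
    finally:
        duration_ms = int((time.time() - start) * 1000)
        if logger is not None:
            logger.debug('Component %s took %d ms', component_name, duration_ms)
        if statsd is not None:
            statsd.gauge('{0}runtime.{1}'.format(metrics_root, component_name), duration_ms)
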
def process(ctx, config, statsd, logger, run_id, conn, metadata_conn, command, metrics_root, metrics_run_root):
    """Start whitelist processing job."""
    logger.info('Initiating Whitelist processing job...')
    operator_config = config.broker_config.operators
    kafka_config = config.broker_config.kafka
    h_consumer = create_kafka_consumer(logger, config)
    h_producer = create_kafka_producer(logger, config)

    # Store metadata
    metadata.add_optional_job_metadata(metadata_conn, command, run_id,
                                       kafka={
                                           'host': kafka_config.hostname,
                                           'port': kafka_config.port,
                                           'topic': kafka_config.topic
                                       },
                                       operators=[{
                                           'operator': op.id,
                                           'topic': op.topic
                                       } for op in operator_config])
    whitelist_processing_job(consumer=h_consumer, producer=h_producer, operator_config=operator_config,
                             conn=conn, logger=logger)

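# A minimal sketch, assuming the kafka-python package, of what create_kafka_consumer / create_kafka_producer
# helpers like those used above might look like. The client library, security settings and serialization
# actually used by DIRBS are not shown in this section, so the bodies below are illustrative assumptions only.
import json

from kafka import KafkaConsumer, KafkaProducer


def create_kafka_consumer(logger, config):
    """Create a Kafka consumer subscribed to the configured topic."""
    kafka_config = config.broker_config.kafka
    bootstrap = '{0}:{1}'.format(kafka_config.hostname, kafka_config.port)
    logger.info('Creating Kafka consumer for topic %s on %s', kafka_config.topic, bootstrap)
    return KafkaConsumer(kafka_config.topic,
                         bootstrap_servers=bootstrap,
                         value_deserializer=lambda v: json.loads(v.decode('utf8')))


def create_kafka_producer(logger, config):
    """Create a Kafka producer for publishing per-operator messages."""
    kafka_config = config.broker_config.kafka
    bootstrap = '{0}:{1}'.format(kafka_config.hostname, kafka_config.port)
    logger.info('Creating Kafka producer on %s', bootstrap)
    return KafkaProducer(bootstrap_servers=bootstrap,
                         value_serializer=lambda v: json.dumps(v).encode('utf8'))
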
def _compute_md5_hash(self):
    """Method to compute the MD5 hash of the input file."""
    self._logger.info('Computing MD5 hash of the input file...')
    with open(self._filename, 'rb') as f:
        md5 = compute_md5_hash(f)
    self._logger.info('Computed MD5 hash of the input file')
    metadata.add_optional_job_metadata(self._metadata_conn, 'dirbs-import', self.import_id, input_file_md5=md5)

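# A minimal sketch of what the compute_md5_hash(f) utility called above could look like, assuming it accepts
# an open binary file object and returns a hex digest; reading in fixed-size chunks keeps memory usage flat
# for large import files. This is an assumption for illustration, not necessarily the DIRBS implementation.
import hashlib


def compute_md5_hash(file_obj, chunk_size=8 * 1024 * 1024):
    """Return the hex MD5 digest of an open binary file object, read in chunks."""
    md5 = hashlib.md5()
    for chunk in iter(lambda: file_obj.read(chunk_size), b''):
        md5.update(chunk)
    return md5.hexdigest()
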
def cli(ctx, config, statsd, logger, run_id, conn, metadata_conn, command, metrics_root, metrics_run_root):
    """
    DIRBS script to catalog data files received by DIRBS Core.

    :param ctx: click commands context object
    :param config: dirbs config
    :param statsd: statsd instance
    :param logger: logger instance
    :param run_id: current run id of the job
    :param conn: database connection
    :param metadata_conn: database connection to store metadata
    :param command: job command
    :param metrics_root: root object for the statsd metrics
    :param metrics_run_root: root object for the statsd run metrics
    """
    # Store metadata
    metadata.add_optional_job_metadata(metadata_conn, command, run_id,
                                       prospectors=config.catalog_config.prospectors,
                                       perform_prevalidation=config.catalog_config.perform_prevalidation)

    harvested_files = _harvest_files(config.catalog_config.prospectors, logger)

    logger.info('Fetching files in the existing data catalog...')
    cataloged_files = _fetch_catalog_files(config)
    logger.info('Found {0} file(s) in the existing catalog'.format(len(cataloged_files)))

    uncataloged_files = [x for x in harvested_files if x['file_properties'] not in cataloged_files]
    logger.info('Discovered {0} new or modified file(s)'.format(len(uncataloged_files)))

    if len(uncataloged_files) > 0:
        logger.info('Determining catalog attributes for the discovered files...')
        uncataloged_files = _populate_file_properties(config, uncataloged_files, run_id,
                                                      config.catalog_config.perform_prevalidation, logger)
        logger.info('Finished determining catalog attributes for the discovered files')
        logger.info('Updating data catalog with new or modified files...')
        _update_catalog(uncataloged_files, config)
        logger.info('Finished updating data catalog')
    else:
        logger.info('Data catalog is already up-to-date!')

def non_active_pairs(ctx: callable, config: callable, statsd: callable, logger: callable, run_id: int,
                     conn: callable, metadata_conn: callable, command: str, metrics_root: callable,
                     metrics_run_root: callable, output_dir: str, period: int) -> None:
    """Generate list of Non-Active pairs over specified period.

    Arguments:
        ctx: click context object
        config: DIRBS config object
        statsd: DIRBS statsd connection object
        logger: DIRBS custom logger object
        run_id: run id of the current job
        conn: DIRBS PostgreSQL connection object
        metadata_conn: DIRBS PostgreSQL metadata connection object
        command: name of the command
        metrics_root: root object for the statsd metrics
        metrics_run_root: root object for the statsd run metrics
        output_dir: output directory path
        period: period in days for a pair to be counted as non-active (not seen for this many days)
    Returns:
        None
    """
    metadata.add_optional_job_metadata(metadata_conn, command, run_id,
                                       report_schema_version=report_schema_version,
                                       output_dir=os.path.abspath(str(output_dir)))
    current_date = datetime.date.today()
    last_seen_date = datetime.date(current_date.year, current_date.month,
                                   current_date.day) - datetime.timedelta(period)
    logger.info('List of Non-Active pairs with last_seen less than {0} will be generated'.format(last_seen_date))
    report_dir = make_report_directory(ctx, output_dir, run_id, conn, config)

    with utils.CodeProfiler() as cp:
        report_metadata = write_non_active_pairs(conn, logger, report_dir, last_seen_date)
    statsd.gauge('{0}runtime.per_report.non_active_pairs'.format(metrics_run_root), cp.duration)
    metadata.add_optional_job_metadata(metadata_conn, command, run_id, report_outputs=report_metadata)

def stolen_violations(ctx: callable, config: callable, statsd: callable, logger: callable, run_id: int,
                      conn: callable, metadata_conn: callable, command: str, metrics_root: callable,
                      metrics_run_root: callable, output_dir: str, newer_than: str,
                      filter_by_conditions: list) -> None:
    """Generate per-MNO list of IMEIs seen on the network after they were reported stolen.

    Arguments:
        ctx: click context object
        config: DIRBS config object
        statsd: DIRBS statsd connection object
        logger: DIRBS custom logger object
        run_id: run id of the current job
        conn: DIRBS PostgreSQL connection object
        metadata_conn: DIRBS PostgreSQL metadata connection object
        command: name of the command
        metrics_root: root object for the statsd metrics
        metrics_run_root: root object for the statsd run metrics
        output_dir: output directory path
        newer_than: only report violations newer than this date
        filter_by_conditions: list of conditions to filter by
    Returns:
        None
    """
    operators_configured_check(config, logger)
    metadata.add_optional_job_metadata(metadata_conn, command, run_id,
                                       report_schema_version=report_schema_version,
                                       output_dir=os.path.abspath(str(output_dir)))
    report_dir = make_report_directory(ctx, output_dir, run_id, conn, config)

    with utils.CodeProfiler() as cp:
        report_metadata = write_stolen_violations(config, logger, report_dir, conn,
                                                  filter_by_conditions, newer_than)
    statsd.gauge('{0}runtime.per_report.blacklist_violations_stolen'.format(metrics_run_root), cp.duration)

    # Store metadata about the report data ID and classification run ID
    metadata.add_optional_job_metadata(metadata_conn, command, run_id, report_outputs=report_metadata)

def condition_imei_overlaps(ctx, config, statsd, logger, run_id, conn, metadata_conn, command, metrics_root,
                            metrics_run_root, force_refresh, disable_retention_check, disable_data_check,
                            debug_query_performance, month, year, output_dir):
    """
    Generate per-condition reports showing matched IMEIs seen on more than one MNO network.

    :param ctx: current cli context
    :param config: dirbs config obj
    :param statsd: statsd obj
    :param logger: dirbs logger obj
    :param run_id: job run id
    :param conn: database connection
    :param metadata_conn: database metadata connection
    :param command: command name
    :param metrics_root: root object for the statsd metrics
    :param metrics_run_root: root object for the statsd run metrics
    :param force_refresh: force refresh flag
    :param disable_retention_check: retention check flag
    :param disable_data_check: data check flag
    :param debug_query_performance: debug query performance flag
    :param month: data month
    :param year: data year
    :param output_dir: output directory path
    """
    _reports_validation_checks(disable_retention_check, year, month, logger, config, conn, disable_data_check)
    metadata.add_optional_job_metadata(metadata_conn, command, run_id,
                                       refreshed_data=force_refresh,
                                       month=month,
                                       year=year,
                                       report_schema_version=report_schema_version,
                                       output_dir=os.path.abspath(str(output_dir)))
    report_dir = _make_report_directory(ctx, output_dir, run_id, conn, config, year=year, month=month)
    report_metadata = []
    with utils.CodeProfiler() as cp:
        country_name = config.region_config.name
        logger.info('Generating country per-condition IMEI overlap reports (classified IMEIs seen on more than '
                    'one MNO\'s network this month)...')
        cond_names = [c.label for c in config.conditions]
        report_metadata.extend(_write_condition_imei_overlaps(conn, config, month, year, country_name,
                                                              report_dir, cond_names))
    statsd.gauge('{0}runtime.per_report.condition_imei_overlaps'.format(metrics_run_root), cp.duration)

    # Store metadata about the report data ID and classification run ID
    metadata.add_optional_job_metadata(metadata_conn, command, run_id, report_outputs=report_metadata)

def unregistered_subscribers(ctx: callable, config: callable, statsd: callable, logger: callable, run_id: int,
                             conn: callable, metadata_conn: callable, command: str, metrics_root: callable,
                             metrics_run_root: callable, output_dir: str, newer_than: str):
    """Generate per-MNO list of IMSIs that are not registered in subscribers list.

    Arguments:
        ctx: click context object
        config: DIRBS config object
        statsd: DIRBS statsd connection object
        logger: DIRBS custom logger object
        run_id: run id of the current job
        conn: DIRBS PostgreSQL connection object
        metadata_conn: DIRBS PostgreSQL metadata connection object
        command: name of the command
        metrics_root: root object for the statsd metrics
        metrics_run_root: root object for the statsd run metrics
        output_dir: output directory path
        newer_than: only report violations newer than this date
    Returns:
        None
    """
    operators_configured_check(config, logger)
    metadata.add_optional_job_metadata(metadata_conn, command, run_id,
                                       report_schema_version=report_schema_version,
                                       output_dir=os.path.abspath(str(output_dir)))
    report_dir = make_report_directory(ctx, output_dir, run_id, conn, config)

    with utils.CodeProfiler() as cp:
        report_metadata = write_un_registered_subscribers(logger, config, report_dir, conn, newer_than)
    statsd.gauge('{0}runtime.per_report.unregistered_subscribers'.format(metrics_run_root), cp.duration)

    # Store metadata
    metadata.add_optional_job_metadata(metadata_conn, command, run_id, report_outputs=report_metadata)

def top_duplicates(ctx, config, statsd, logger, run_id, conn, metadata_conn, command, metrics_root,
                   metrics_run_root, force_refresh, disable_retention_check, disable_data_check,
                   debug_query_performance, month, year, output_dir):
    """
    Generate report listing IMEIs seen with more than 5 IMSIs in a given month and year.

    :param ctx: current cli context
    :param config: dirbs config obj
    :param statsd: statsd obj
    :param logger: dirbs logger obj
    :param run_id: job run id
    :param conn: database connection
    :param metadata_conn: database metadata connection
    :param command: command name
    :param metrics_root: root object for the statsd metrics
    :param metrics_run_root: root object for the statsd run metrics
    :param force_refresh: force refresh flag
    :param disable_retention_check: retention check flag
    :param disable_data_check: data check flag
    :param debug_query_performance: debug query performance flag
    :param month: data month
    :param year: data year
    :param output_dir: output directory path
    """
    _reports_validation_checks(disable_retention_check, year, month, logger, config, conn, disable_data_check)
    metadata.add_optional_job_metadata(metadata_conn, command, run_id,
                                       refreshed_data=force_refresh,
                                       month=month,
                                       year=year,
                                       report_schema_version=report_schema_version,
                                       output_dir=os.path.abspath(str(output_dir)))
    report_metadata = []
    report_dir = _make_report_directory(ctx, output_dir, run_id, conn, config, year=year, month=month)
    with utils.CodeProfiler() as cp:
        imsi_min_limit = 5
        country_name = config.region_config.name
        logger.info('Generating country duplicate IMEI report (IMEIs seen with more than {0:d} IMSIs this '
                    'reporting month)...'.format(imsi_min_limit))
        report_metadata.extend(_write_country_duplicates_report(conn, config, month, year, country_name,
                                                                report_dir, imsi_min_limit=imsi_min_limit))
    statsd.gauge('{0}runtime.per_report.top_duplicates'.format(metrics_run_root), cp.duration)

    # Store metadata about the report data ID and classification run ID
    metadata.add_optional_job_metadata(metadata_conn, command, run_id, report_outputs=report_metadata)

def gsma_not_found(ctx, config, statsd, logger, run_id, conn, metadata_conn, command, metrics_root,
                   metrics_run_root, force_refresh, disable_retention_check, disable_data_check,
                   debug_query_performance, month, year, output_dir):
    """
    Generate report of all GSMA not found IMEIs.

    :param ctx: current cli context
    :param config: dirbs config obj
    :param statsd: statsd obj
    :param logger: dirbs logger obj
    :param run_id: job run id
    :param conn: database connection
    :param metadata_conn: database metadata connection
    :param command: command name
    :param metrics_root: root object for the statsd metrics
    :param metrics_run_root: root object for the statsd run metrics
    :param force_refresh: force refresh flag
    :param disable_retention_check: data retention check flag
    :param disable_data_check: data check flag
    :param debug_query_performance: query performance flag
    :param month: data month
    :param year: data year
    :param output_dir: output directory path
    """
    _reports_validation_checks(disable_retention_check, year, month, logger, config, conn, disable_data_check)
    metadata.add_optional_job_metadata(metadata_conn, command, run_id,
                                       refreshed_data=force_refresh,
                                       month=month,
                                       year=year,
                                       report_schema_version=report_schema_version,
                                       output_dir=os.path.abspath(str(output_dir)))
    report_dir = _make_report_directory(ctx, output_dir, run_id, conn, config, year=year, month=month)
    report_metadata = []
    with utils.CodeProfiler() as cp:
        logger.info('Generating country GSMA not found report...')
        country_name = config.region_config.name
        report_metadata.extend(_write_country_gsma_not_found_report(conn, config, month, year, country_name,
                                                                    report_dir))
    statsd.gauge('{0}runtime.per_report.gsma_not_found'.format(metrics_run_root), cp.duration)

    # Store metadata about the report data ID and classification run ID
    metadata.add_optional_job_metadata(metadata_conn, command, run_id, report_outputs=report_metadata)

def blacklist_violations(ctx: callable, config: callable, statsd: callable, logger: callable, run_id: int,
                         conn: callable, metadata_conn: callable, command: str, metrics_root: callable,
                         metrics_run_root: callable, output_dir: str, month: int, year: int) -> None:
    """Generate per-operator blacklist violations.

    Arguments:
        ctx: click context object
        config: DIRBS config object
        statsd: DIRBS statsd connection object
        logger: DIRBS custom logger object
        run_id: run id of the current job
        conn: DIRBS PostgreSQL connection object
        metadata_conn: DIRBS PostgreSQL metadata connection object
        command: name of the command
        metrics_root: root object for the statsd metrics
        metrics_run_root: root object for the statsd run metrics
        output_dir: output directory path
        month: reporting month
        year: reporting year
    Returns:
        None
    """
    metadata.add_optional_job_metadata(metadata_conn, command, run_id,
                                       report_schema_version=report_schema_version,
                                       output_dir=os.path.abspath(str(output_dir)))
    report_dir = make_report_directory(ctx, output_dir, run_id, conn, config)

    with utils.CodeProfiler() as cp:
        report_metadata = write_blacklist_violations(logger, config, report_dir, conn, month, year)
    statsd.gauge('{0}runtime.per_report.blacklist_violation'.format(metrics_run_root), cp.duration)
    metadata.add_optional_job_metadata(metadata_conn, command, run_id, report_outputs=report_metadata)

def classified_triplets(ctx: callable, config: callable, statsd: callable, logger: callable, run_id: int,
                        conn: callable, metadata_conn: callable, command: str, metrics_root: callable,
                        metrics_run_root: callable, output_dir: str, conditions: list) -> None:
    """Generate per-condition classified triplets list.

    Arguments:
        ctx: click context object
        config: DIRBS config object
        statsd: DIRBS statsd connection object
        logger: DIRBS custom logger object
        run_id: run id of the current job
        conn: DIRBS PostgreSQL connection object
        metadata_conn: DIRBS PostgreSQL metadata connection object
        command: name of the command
        metrics_root: root object for the statsd metrics
        metrics_run_root: root object for the statsd run metrics
        output_dir: output directory path
        conditions: list of conditions for classified triplets
    Returns:
        None
    """
    metadata.add_optional_job_metadata(metadata_conn, command, run_id,
                                       report_schema_version=report_schema_version,
                                       output_dir=os.path.abspath(str(output_dir)))
    report_dir = make_report_directory(ctx, output_dir, run_id, conn, config)

    with utils.CodeProfiler() as cp:
        report_metadata = write_classified_triplets(logger, conditions, report_dir, conn)
    statsd.gauge('{0}runtime.per_report.classified_triplets'.format(metrics_run_root), cp.duration)
    metadata.add_optional_job_metadata(metadata_conn, command, run_id, report_outputs=report_metadata)

def transient_msisdns(ctx: callable, config: callable, statsd: callable, logger: callable, run_id: int,
                      conn: callable, metadata_conn: callable, command: str, metrics_root: callable,
                      metrics_run_root: callable, output_dir: str, period: int, num_of_imeis: int,
                      current_date: str) -> None:
    """Generate list of MSISDNS used with possible transient IMEIs.

    Required Arguments:
        period: Analysis period in days (positive integer)
        num_of_imeis: Number of IMEIs a MSISDN must be seen with for analysis
    """
    metadata.add_optional_job_metadata(metadata_conn, command, run_id,
                                       report_schema_version=report_schema_version,
                                       output_dir=os.path.abspath(str(output_dir)))
    report_dir = make_report_directory(ctx, output_dir, run_id, conn, config)

    with utils.CodeProfiler() as cp:
        report_metadata = write_transient_msisdns(logger, period, report_dir, conn, config, num_of_imeis,
                                                  current_date=current_date)
    statsd.gauge('{0}runtime.per_report.transient_msisdns'.format(metrics_run_root), cp.duration)
    metadata.add_optional_job_metadata(metadata_conn, command, run_id, report_outputs=report_metadata)

def lists(ctx, config, statsd, logger, run_id, conn, metadata_conn, command, metrics_root, metrics_run_root):
    """Prune obsolete lists data."""
    curr_date = ctx.obj['CURR_DATE']

    # Store metadata
    metadata.add_optional_job_metadata(metadata_conn, command, run_id,
                                       retention_months=config.retention_config.months_retention)

    logger.info('Pruning lists tables to remove any obsolete data with end_time outside the retention window...')
    retention_months = config.retention_config.months_retention

    if curr_date is None:
        curr_date = datetime.date.today()

    first_month_to_drop = datetime.date(curr_date.year, curr_date.month, 1) - \
        relativedelta.relativedelta(months=retention_months)
    logger.info('Lists data with end_time earlier than {0} will be pruned'.format(first_month_to_drop))

    with utils.db_role_setter(conn, role_name='dirbs_core_power_user'), conn.cursor() as cursor:
        logger.debug('Calculating original number of rows in lists tables...')
        row_count_sql = sql.SQL("""SELECT blacklist_row_count, noft_lists_row_count, excp_lists_row_count
                                     FROM (SELECT COUNT(*) FROM blacklist) AS blacklist_row_count,
                                          (SELECT COUNT(*) FROM notifications_lists) AS noft_lists_row_count,
                                          (SELECT COUNT(*) FROM exceptions_lists) AS excp_lists_row_count""")
        cursor.execute(row_count_sql)
        rows_before = cursor.fetchone()
        blacklist_rows_before = int(rows_before.blacklist_row_count.strip('()'))
        notflist_rows_before = int(rows_before.noft_lists_row_count.strip('()'))
        excplist_rows_before = int(rows_before.excp_lists_row_count.strip('()'))
        rows_before = blacklist_rows_before + notflist_rows_before + excplist_rows_before
        logger.debug('Calculated original number of rows in lists tables')
        statsd.gauge('{0}blacklist_rows_before'.format(metrics_run_root), blacklist_rows_before)
        statsd.gauge('{0}notifications_lists_rows_before'.format(metrics_run_root), notflist_rows_before)
        statsd.gauge('{0}exceptions_lists_rows_before'.format(metrics_run_root), excplist_rows_before)
        metadata.add_optional_job_metadata(metadata_conn, command, run_id,
                                           blacklist_rows_before=blacklist_rows_before,
                                           notifications_lists_rows_before=notflist_rows_before,
                                           exceptions_lists_rows_before=excplist_rows_before)

        # Calculate number of rows in the lists tables outside the retention window
        job_metadata_filter_sql = """SELECT run_id
                                       FROM job_metadata
                                      WHERE command = 'dirbs-listgen'
                                        AND end_time < '{0}'""".format(first_month_to_drop)

        cursor.execute(sql.SQL("""SELECT COUNT(*)
                                    FROM blacklist
                                   WHERE start_run_id IN ({0})""".format(job_metadata_filter_sql)))
        total_bl_rows_out_window_to_prune = cursor.fetchone()[0]
        logger.info('Found {0:d} rows of blacklist table outside the retention window to prune'
                    .format(total_bl_rows_out_window_to_prune))

        cursor.execute(sql.SQL("""SELECT COUNT(*)
                                    FROM notifications_lists
                                   WHERE start_run_id IN ({0})""".format(job_metadata_filter_sql)))
        total_nl_rows_out_window_to_prune = cursor.fetchone()[0]
        logger.info('Found {0:d} rows of notifications lists table outside the retention window to prune'
                    .format(total_nl_rows_out_window_to_prune))

        cursor.execute(sql.SQL("""SELECT COUNT(*)
                                    FROM exceptions_lists
                                   WHERE start_run_id IN ({0})""".format(job_metadata_filter_sql)))
        total_el_rows_out_window_to_prune = cursor.fetchone()[0]
        logger.info('Found {0:d} rows of exceptions lists table outside the retention window to prune'
                    .format(total_el_rows_out_window_to_prune))

        # We re-partition the tables to re-create them, passing a filter condition so that pruned rows
        # are not copied into the new partitions
        logger.debug('Re-creating blacklist table...')
        num_phys_imei_shards = partition_utils.num_physical_imei_shards(conn)
        src_filter_sql = cursor.mogrify("""WHERE start_run_id NOT IN ({0})""".format(job_metadata_filter_sql))
        partition_utils.repartition_blacklist(conn, num_physical_shards=num_phys_imei_shards,
                                              src_filter_sql=str(src_filter_sql, encoding=conn.encoding))
        logger.debug('Re-created blacklist table')

        logger.debug('Re-creating notifications lists table...')
        partition_utils.repartition_notifications_lists(conn, num_physical_shards=num_phys_imei_shards,
                                                        src_filter_sql=str(src_filter_sql,
                                                                           encoding=conn.encoding))
        logger.debug('Re-created notifications lists table')

        logger.debug('Re-creating exceptions lists table...')
        partition_utils.repartition_exceptions_lists(conn, num_physical_shards=num_phys_imei_shards,
                                                     src_filter_sql=str(src_filter_sql, encoding=conn.encoding))
        logger.debug('Re-created exceptions lists table')

        logger.debug('Calculating new number of rows in lists tables...')
        cursor.execute(row_count_sql)
        rows_after = cursor.fetchone()
        blacklist_rows_after = int(rows_after.blacklist_row_count.strip('()'))
        notflist_rows_after = int(rows_after.noft_lists_row_count.strip('()'))
        excplist_rows_after = int(rows_after.excp_lists_row_count.strip('()'))
        rows_after = blacklist_rows_after + notflist_rows_after + excplist_rows_after
        logger.debug('Calculated new number of rows in lists tables')
        statsd.gauge('{0}blacklist_rows_after'.format(metrics_run_root), blacklist_rows_after)
        statsd.gauge('{0}notifications_lists_rows_after'.format(metrics_run_root), notflist_rows_after)
        statsd.gauge('{0}exceptions_lists_rows_after'.format(metrics_run_root), excplist_rows_after)
        metadata.add_optional_job_metadata(metadata_conn, command, run_id,
                                           blacklist_rows_after=blacklist_rows_after,
                                           notifications_lists_rows_after=notflist_rows_after,
                                           exceptions_lists_rows_after=excplist_rows_after)

    logger.info('Pruned {0:d} rows from lists tables'.format(rows_before - rows_after))

def classification_state(ctx, config, statsd, logger, run_id, conn, metadata_conn, command, metrics_root,
                         metrics_run_root):
    """Prune obsolete classification_state data."""
    curr_date = ctx.obj['CURR_DATE']

    # Store metadata
    metadata.add_optional_job_metadata(metadata_conn, command, run_id,
                                       curr_date=curr_date.isoformat() if curr_date is not None else None,
                                       retention_months=config.retention_config.months_retention)

    logger.info('Pruning classification_state table to remove any classification state data related to '
                'obsolete conditions and data with end_date outside the retention window...')
    cond_config_list = [c.label for c in config.conditions]
    retention_months = config.retention_config.months_retention

    if curr_date is None:
        curr_date = datetime.date.today()

    first_month_to_drop = datetime.date(curr_date.year, curr_date.month, 1) - \
        relativedelta.relativedelta(months=retention_months)
    logger.info('Classification state data with end_date earlier than {0} will be '
                'pruned'.format(first_month_to_drop))

    with utils.db_role_setter(conn, role_name='dirbs_core_power_user'), conn.cursor() as cursor:
        logger.debug('Calculating original number of rows in classification_state table...')
        cursor.execute('SELECT COUNT(*) FROM classification_state')
        rows_before = cursor.fetchone()[0]
        logger.debug('Calculated original number of rows in classification_state table')
        statsd.gauge('{0}rows_before'.format(metrics_run_root), rows_before)
        metadata.add_optional_job_metadata(metadata_conn, command, run_id, rows_before=rows_before)

        # Calculate number of rows in the classification table outside the retention window
        cursor.execute(sql.SQL("""SELECT COUNT(*)
                                    FROM classification_state
                                   WHERE end_date < %s"""), [first_month_to_drop])
        total_rows_out_window_to_prune = cursor.fetchone()[0]
        logger.info('Found {0:d} rows of classification_state table '
                    'with end_date outside the retention window to prune.'.format(total_rows_out_window_to_prune))

        # Calculate number of rows in the classification with conditions no longer existing
        cursor.execute(sql.SQL("""SELECT COUNT(*)
                                    FROM classification_state
                                   WHERE NOT starts_with_prefix(cond_name, %s)"""), [cond_config_list])
        total_rows_no_cond_to_prune = cursor.fetchone()[0]
        logger.info('Found {0:d} rows of classification_state table with conditions '
                    'no longer existing to prune.'.format(total_rows_no_cond_to_prune))

        logger.debug('Re-creating classification_state table...')
        # Basically, we just re-partition the classification_state table to re-create it, passing a
        # src_filter_sql parameter
        num_phys_imei_shards = partition_utils.num_physical_imei_shards(conn)
        src_filter_sql = cursor.mogrify("""WHERE (end_date > %s
                                              OR end_date IS NULL)
                                             AND cond_name LIKE ANY(%s)""",
                                        [first_month_to_drop, cond_config_list])
        partition_utils.repartition_classification_state(conn, num_physical_shards=num_phys_imei_shards,
                                                         src_filter_sql=str(src_filter_sql,
                                                                            encoding=conn.encoding))
        logger.debug('Re-created classification_state table')

        logger.debug('Calculating new number of rows in classification_state table...')
        cursor.execute('SELECT COUNT(*) FROM classification_state')
        rows_after = cursor.fetchone()[0]
        logger.debug('Calculated new number of rows in classification_state table')
        statsd.gauge('{0}rows_after'.format(metrics_run_root), rows_after)
        metadata.add_optional_job_metadata(metadata_conn, command, run_id, rows_after=rows_after)

    logger.info('Pruned {0:d} rows from classification_state table'.format(rows_before - rows_after))

def blacklist(ctx, config, statsd, logger, run_id, conn, metadata_conn, command, metrics_root,
              metrics_run_root, condition_name, prune_all):
    """Expire IMEIs outside the blacklist retention period from blacklist."""
    current_date = datetime.date.today()
    retention_days = config.retention_config.blacklist_retention

    if condition_name is None and prune_all is False:
        logger.info('Error: one of the arguments "condition_name" or "--prune-all" is required')
        metadata.add_optional_job_metadata(metadata_conn, command, run_id,
                                           curr_date=current_date.isoformat(),
                                           retention_days=retention_days,
                                           job_executed=False)
    elif condition_name is not None and prune_all is True:
        logger.info('Error: only one of the arguments "condition_name" or "--prune-all" is required')
        metadata.add_optional_job_metadata(metadata_conn, command, run_id,
                                           curr_date=current_date.isoformat(),
                                           retention_days=retention_days,
                                           job_executed=False)
    elif retention_days == 0:
        logger.info('Blacklist will not be pruned, as retention value is set to {0}'.format(retention_days))
        metadata.add_optional_job_metadata(metadata_conn, command, run_id,
                                           curr_date=current_date.isoformat(),
                                           retention_days=retention_days,
                                           job_executed=False)
    else:
        _warn_about_prune_all(prune_all, logger)
        logger.info('Pruning blacklist to remove any data related to specified condition '
                    'outside the retention window.')
        last_retention_date = datetime.date(current_date.year, current_date.month,
                                            current_date.day) - datetime.timedelta(retention_days)

        # Store metadata
        logger.info('Blacklist entries with start_date earlier than {0} will be pruned'
                    .format(last_retention_date))
        metadata.add_optional_job_metadata(metadata_conn, command, run_id,
                                           curr_date=current_date.isoformat(),
                                           retention_days=retention_days,
                                           job_executed=True,
                                           last_retention_date=last_retention_date.isoformat())

        with utils.db_role_setter(conn, role_name='dirbs_core_power_user'), conn.cursor() as cursor:
            logger.debug('Calculating original number of rows with block_date in classification_state table...')
            cursor.execute("""SELECT COUNT(*)
                                FROM classification_state
                               WHERE block_date IS NOT NULL
                                 AND end_date IS NULL""")
            rows_before = cursor.fetchone()[0]
            logger.debug('Calculated original number of rows (having block_date) in classification_state table')
            statsd.gauge('{0}rows_before'.format(metrics_run_root), rows_before)
            metadata.add_optional_job_metadata(metadata_conn, command, run_id, rows_before=rows_before)

            # If it is a condition-based pruning
            if not prune_all:
                cursor.execute(sql.SQL("""SELECT COUNT(*)
                                            FROM classification_state
                                           WHERE start_date < %s
                                             AND cond_name = %s
                                             AND end_date IS NULL
                                             AND block_date IS NOT NULL"""),
                               [last_retention_date, condition_name[0].label])
                total_rows_to_prune = cursor.fetchone()[0]
                logger.info('Found {0:d} rows of classification_state table '
                            'with start_date for {1} dimension outside the blacklist '
                            'retention window.'.format(total_rows_to_prune, condition_name[0].label))

                if total_rows_to_prune > 0:
                    cursor.execute(sql.SQL("""UPDATE classification_state
                                                 SET end_date = '{0}'
                                               WHERE start_date < '{1}'
                                                 AND cond_name = '{2}'
                                                 AND end_date IS NULL
                                                 AND block_date IS NOT NULL"""
                                           .format(current_date.isoformat(), last_retention_date,
                                                   condition_name[0].label)))
                logger.info('Pruned {0:d} rows from blacklist for {1} dimension'
                            .format(total_rows_to_prune, condition_name[0].label))
            # Prune without any condition
            else:
                cursor.execute(sql.SQL("""SELECT COUNT(*)
                                            FROM classification_state
                                           WHERE start_date < %s
                                             AND end_date IS NULL
                                             AND block_date IS NOT NULL"""),
                               [last_retention_date])
                total_rows_to_prune = cursor.fetchone()[0]
                logger.info('Found {0:d} rows of classification_state table '
                            'with start_date outside the blacklist retention window.'
                            .format(total_rows_to_prune))

                if total_rows_to_prune > 0:
                    cursor.execute(sql.SQL("""UPDATE classification_state
                                                 SET end_date = '{0}'
                                               WHERE start_date < '{1}'
                                                 AND end_date IS NULL
                                                 AND block_date IS NOT NULL"""
                                           .format(current_date.isoformat(), last_retention_date)))
                logger.info('Pruned {0:d} rows from blacklist'.format(total_rows_to_prune))

            logger.debug('Calculating remaining number of rows with block_date (end_date is null) '
                         'in classification_state table...')
            cursor.execute("""SELECT COUNT(*)
                                FROM classification_state
                               WHERE block_date IS NOT NULL
                                 AND end_date IS NULL""")
            rows_after = cursor.fetchone()[0]
            logger.debug('Calculated remaining number of rows (having block_date and end_date null) '
                         'in classification_state table')
            statsd.gauge('{0}rows_after'.format(metrics_run_root), rows_after)
            metadata.add_optional_job_metadata(metadata_conn, command, run_id, rows_after=rows_after)

def top_duplicates(ctx: callable, config: callable, statsd: callable, logger: callable, run_id: int,
                   conn: callable, metadata_conn: callable, command: str, metrics_root: callable,
                   metrics_run_root: callable, force_refresh: bool, disable_retention_check: bool,
                   disable_data_check: bool, debug_query_performance: bool, month: int, year: int,
                   output_dir: str) -> None:
    """Generate report listing IMEIs seen with more than 5 IMSIs in a given month and year.

    Arguments:
        ctx: click context object
        config: DIRBS config object
        statsd: DIRBS statsd connection object
        logger: DIRBS custom logger object
        run_id: run id of the current job
        conn: DIRBS PostgreSQL connection object
        metadata_conn: DIRBS PostgreSQL metadata connection object
        command: name of the command
        metrics_root: root object for the statsd metrics
        metrics_run_root: root object for the statsd run metrics
        force_refresh: bool to force writing/generating reports from scratch
        disable_retention_check: bool to disable data retention check
        disable_data_check: bool to disable data check
        debug_query_performance: bool to debug query performance
        month: reporting month
        year: reporting year
        output_dir: output directory path
    Returns:
        None
    """
    reports_validation_checks(disable_retention_check, year, month, logger, config, conn, disable_data_check)
    metadata.add_optional_job_metadata(metadata_conn, command, run_id,
                                       refreshed_data=force_refresh,
                                       month=month,
                                       year=year,
                                       report_schema_version=report_schema_version,
                                       output_dir=os.path.abspath(str(output_dir)))
    report_metadata = []
    report_dir = make_report_directory(ctx, output_dir, run_id, conn, config, year=year, month=month)

    with utils.CodeProfiler() as cp:
        imsi_min_limit = 5
        country_name = config.region_config.name
        logger.info('Generating country duplicate IMEI report (IMEIs seen with more than {0:d} IMSIs this '
                    'reporting month)...'.format(imsi_min_limit))
        report_metadata.extend(write_country_duplicates_report(conn, config, month, year, country_name,
                                                               report_dir, imsi_min_limit=imsi_min_limit))
    statsd.gauge('{0}runtime.per_report.top_duplicates'.format(metrics_run_root), cp.duration)

    # Store metadata about the report data ID and classification run ID
    metadata.add_optional_job_metadata(metadata_conn, command, run_id, report_outputs=report_metadata)

def condition_imei_overlaps(ctx: callable, config: callable, statsd: callable, logger: callable, run_id: int,
                            conn: callable, metadata_conn: callable, command: str, metrics_root: callable,
                            metrics_run_root: callable, force_refresh: bool, disable_retention_check: bool,
                            disable_data_check: bool, debug_query_performance: bool, month: int, year: int,
                            output_dir: str):
    """Generate per-condition reports showing matched IMEIs seen on more than one MNO network.

    Arguments:
        ctx: click context object
        config: DIRBS config object
        statsd: DIRBS statsd connection object
        logger: DIRBS custom logger object
        run_id: run id of the current job
        conn: DIRBS PostgreSQL connection object
        metadata_conn: DIRBS PostgreSQL metadata connection object
        command: name of the command
        metrics_root: root object for the statsd metrics
        metrics_run_root: root object for the statsd run metrics
        force_refresh: bool to force writing/generating reports from scratch
        disable_retention_check: bool to disable data retention check
        disable_data_check: bool to disable data check
        debug_query_performance: bool to debug query performance
        month: reporting month
        year: reporting year
        output_dir: output directory path
    Returns:
        None
    """
    reports_validation_checks(disable_retention_check, year, month, logger, config, conn, disable_data_check)
    metadata.add_optional_job_metadata(metadata_conn, command, run_id,
                                       refreshed_data=force_refresh,
                                       month=month,
                                       year=year,
                                       report_schema_version=report_schema_version,
                                       output_dir=os.path.abspath(str(output_dir)))
    report_dir = make_report_directory(ctx, output_dir, run_id, conn, config, year=year, month=month)
    report_metadata = []

    with utils.CodeProfiler() as cp:
        country_name = config.region_config.name
        logger.info('Generating country per-condition IMEI overlap reports (classified IMEIs seen on more than '
                    'one MNO\'s network this month)...')
        cond_names = [c.label for c in config.conditions]
        report_metadata.extend(write_condition_imei_overlaps(conn, config, month, year, country_name,
                                                             report_dir, cond_names))
    statsd.gauge('{0}runtime.per_report.condition_imei_overlaps'.format(metrics_run_root), cp.duration)

    # Store metadata about the report data ID and classification run ID
    metadata.add_optional_job_metadata(metadata_conn, command, run_id, report_outputs=report_metadata)

def stolen_violations(ctx, config, statsd, logger, run_id, conn, metadata_conn, command, metrics_root,
                      metrics_run_root, output_dir, newer_than, filter_by_conditions):
    """
    Generate per-MNO list of IMEIs seen on the network after they were reported stolen.

    :param ctx: current cli context
    :param config: dirbs config obj
    :param statsd: statsd obj
    :param logger: dirbs logger obj
    :param run_id: job run id
    :param conn: database connection
    :param metadata_conn: metadata database connection
    :param command: command name
    :param metrics_root: root object for the statsd metrics
    :param metrics_run_root: root object for the statsd run metrics
    :param output_dir: output directory path
    :param newer_than: only report violations with a last_seen newer than this date
    :param filter_by_conditions: list of conditions to filter by
    """
    _operators_configured_check(config, logger)
    metadata.add_optional_job_metadata(metadata_conn, command, run_id,
                                       report_schema_version=report_schema_version,
                                       output_dir=os.path.abspath(str(output_dir)))
    report_dir = _make_report_directory(ctx, output_dir, run_id, conn, config)

    with utils.CodeProfiler() as cp:
        logger.info('Generating per-MNO stolen list violations reports...')
        with contextlib.ExitStack() as stack:
            # Push files into exit stack so that they will all be closed
            operator_ids = [o.id for o in config.region_config.operators]
            filename_op_map = {'stolen_violations_{0}.csv'.format(o): o for o in operator_ids}
            opname_file_map = {o: stack.enter_context(open(os.path.join(report_dir, fn), 'w', encoding='utf8'))
                               for fn, o in filename_op_map.items()}
            # Create a map from operator name to csv writer
            opname_csvwriter_map = {o: csv.writer(opname_file_map[o]) for o in operator_ids}
            # Write the header to each csvwriter
            for _, writer in opname_csvwriter_map.items():
                writer.writerow(['imei_norm', 'last_seen', 'reporting_date'])

            # Run a query to find all the classified IMEIs seen on multiple operators
            blacklist_violations_grace_period_days = config.report_config.blacklist_violations_grace_period_days
            with conn.cursor() as cursor:
                query = sql.SQL("""SELECT imei_norm, last_seen, reporting_date, operator_id
                                     FROM (SELECT imei_norm, MIN(reporting_date) AS reporting_date
                                             FROM stolen_list
                                         GROUP BY imei_norm) AS stolen_imeis
                                     JOIN LATERAL (
                                           SELECT imei_norm, operator_id, MAX(last_seen) AS last_seen
                                             FROM monthly_network_triplets_per_mno_no_null_imeis nt
                                            WHERE imei_norm = stolen_imeis.imei_norm
                                              AND virt_imei_shard = calc_virt_imei_shard(stolen_imeis.imei_norm)
                                         GROUP BY imei_norm, operator_id) network_imeis
                                    USING (imei_norm)
                                    WHERE network_imeis.last_seen > stolen_imeis.reporting_date + %s
                                          {0}
                                          {1}""")

                if filter_by_conditions:
                    cond_filter_query = """AND EXISTS(SELECT 1
                                                        FROM classification_state
                                                       WHERE imei_norm = stolen_imeis.imei_norm
                                                         AND virt_imei_shard =
                                                                 calc_virt_imei_shard(stolen_imeis.imei_norm)
                                                         AND cond_name IN %s
                                                         AND end_date IS NULL)"""
                    sql_bytes = cursor.mogrify(cond_filter_query,
                                               [tuple([c.label for c in filter_by_conditions])])
                    conditions_filter_sql = sql.SQL(str(sql_bytes, conn.encoding))
                else:
                    conditions_filter_sql = sql.SQL('')

                if newer_than:
                    newer_than_query = 'AND last_seen > %s'
                    sql_bytes = cursor.mogrify(newer_than_query, [newer_than])
                    date_filter_sql = sql.SQL(str(sql_bytes, conn.encoding))
                else:
                    date_filter_sql = sql.SQL('')

                cursor.execute(query.format(conditions_filter_sql, date_filter_sql),
                               [blacklist_violations_grace_period_days])
                for res in cursor:
                    opname_csvwriter_map[res.operator_id].writerow([res.imei_norm,
                                                                    res.last_seen.strftime('%Y%m%d'),
                                                                    res.reporting_date.strftime('%Y%m%d')])

        report_metadata = _gen_metadata_for_reports(list(filename_op_map.keys()), report_dir)

    statsd.gauge('{0}runtime.per_report.blacklist_violations_stolen'.format(metrics_run_root), cp.duration)

    # Store metadata about the report data ID and classification run ID
    metadata.add_optional_job_metadata(metadata_conn, command, run_id, report_outputs=report_metadata)

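# A standalone sketch of the optional-filter pattern used above with psycopg2: parameters are bound safely via
# cursor.mogrify(), the resulting snippet is wrapped in sql.SQL() and then spliced into a query template
# through {0}-style placeholders. The helper names and the simplified query below are assumptions for
# illustration only; the table name is taken from the query above.
from psycopg2 import sql


def build_optional_date_filter(cursor, conn, newer_than):
    """Return a sql.SQL fragment filtering on last_seen, or an empty fragment if no date is given."""
    if newer_than:
        sql_bytes = cursor.mogrify('AND last_seen > %s', [newer_than])
        return sql.SQL(str(sql_bytes, conn.encoding))
    return sql.SQL('')


def fetch_recent_rows(conn, newer_than=None):
    """Run a query whose WHERE clause is extended with the optional date filter."""
    query = sql.SQL("""SELECT imei_norm, last_seen
                         FROM monthly_network_triplets_per_mno_no_null_imeis
                        WHERE imei_norm IS NOT NULL
                              {0}""")
    with conn.cursor() as cursor:
        date_filter_sql = build_optional_date_filter(cursor, conn, newer_than)
        cursor.execute(query.format(date_filter_sql))
        return cursor.fetchall()
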
def triplets(ctx, config, statsd, logger, run_id, conn, metadata_conn, command, metrics_root, metrics_run_root):
    """Prune old monthly_network_triplets data."""
    curr_date = ctx.obj['CURR_DATE']

    # Store metadata
    metadata.add_optional_job_metadata(metadata_conn, command, run_id,
                                       curr_date=curr_date.isoformat() if curr_date is not None else None,
                                       retention_months=config.retention_config.months_retention)

    if curr_date is None:
        curr_date = datetime.date.today()

    with conn.cursor() as cursor:
        logger.info('Pruning monthly_network_triplets data outside the retention window from database...')
        retention_months = config.retention_config.months_retention
        first_month_to_drop = datetime.date(curr_date.year, curr_date.month, 1) - \
            relativedelta.relativedelta(months=retention_months)
        logger.info('monthly_network_triplets partitions older than {0} will be pruned'
                    .format(first_month_to_drop))

        country_monthly_partitions = utils.child_table_names(conn, 'monthly_network_triplets_country')
        operator_partitions = utils.child_table_names(conn, 'monthly_network_triplets_per_mno')
        operator_monthly_partitions = []
        for op_partition in operator_partitions:
            operator_monthly_partitions.extend(utils.child_table_names(conn, op_partition))

        parent_tbl_names = ['monthly_network_triplets_country', 'monthly_network_triplets_per_mno']
        rows_before = {}
        for tbl in parent_tbl_names:
            logger.debug('Calculating original number of rows in {0} table...'.format(tbl))
            cursor.execute(sql.SQL('SELECT COUNT(*) FROM {0}'.format(tbl)))
            rows_before[tbl] = cursor.fetchone()[0]
            logger.debug('Calculated original number of rows in {0} table'.format(tbl))
            statsd.gauge('{0}.{1}.rows_before'.format(metrics_run_root, tbl), rows_before[tbl])

        metadata.add_optional_job_metadata(metadata_conn, command, run_id, rows_before=rows_before)

        total_rows_pruned = 0
        total_partitions = country_monthly_partitions + operator_monthly_partitions
        for tblname in total_partitions:
            invariants_list = utils.table_invariants_list(conn, [tblname], ['triplet_month', 'triplet_year'])
            assert len(invariants_list) <= 1
            if len(invariants_list) == 0:
                logger.warning('Found empty partition {0}. Dropping...'.format(tblname))
                cursor.execute(sql.SQL("""DROP TABLE {0} CASCADE""").format(sql.Identifier(tblname)))
            else:
                month, year = tuple(invariants_list[0])
                # Check if table year/month is outside the retention window
                if datetime.date(year, month, 1) < first_month_to_drop:
                    # Calculate number of rows in the partition table
                    cursor.execute(sql.SQL("""SELECT COUNT(*) FROM {0}""").format(sql.Identifier(tblname)))
                    partition_table_rows = cursor.fetchone()[0]
                    total_rows_pruned += partition_table_rows
                    logger.info('Dropping table {0} with {1} rows...'.format(tblname, partition_table_rows))
                    cursor.execute(sql.SQL("""DROP TABLE {0} CASCADE""").format(sql.Identifier(tblname)))
                    logger.info('Dropped table {0}'.format(tblname))

        rows_after = {}
        for tbl in parent_tbl_names:
            logger.debug('Calculating new number of rows in {0} table...'.format(tbl))
            cursor.execute(sql.SQL('SELECT COUNT(*) FROM {0}'.format(tbl)))
            rows_after[tbl] = cursor.fetchone()[0]
            logger.debug('Calculated new number of rows in {0} table'.format(tbl))
            statsd.gauge('{0}.{1}.rows_after'.format(metrics_run_root, tbl), rows_after[tbl])

        metadata.add_optional_job_metadata(metadata_conn, command, run_id, rows_after=rows_after)

        total_rows_before = sum(rows_before.values())
        total_rows_after = sum(rows_after.values())
        assert (total_rows_before - total_rows_after) == total_rows_pruned
        logger.info('Pruned {0:d} rows of monthly_network_triplets data outside the retention window '
                    'from database'.format(total_rows_pruned))

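# A minimal sketch of how a helper like utils.child_table_names(conn, parent) could discover partition
# (child) tables via the pg_inherits catalog. The DIRBS utility itself is not shown in this section, so
# this catalog-query version is an assumption for illustration only.
from psycopg2 import sql


def child_table_names(conn, parent_table_name):
    """Return the names of all direct child tables of parent_table_name."""
    query = sql.SQL("""SELECT c.relname
                         FROM pg_inherits
                         JOIN pg_class c ON c.oid = pg_inherits.inhrelid
                         JOIN pg_class p ON p.oid = pg_inherits.inhparent
                        WHERE p.relname = %s""")
    with conn.cursor() as cursor:
        cursor.execute(query, [parent_table_name])
        return [row[0] for row in cursor.fetchall()]
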
def standard(ctx: callable, config: callable, statsd: callable, logger: callable, run_id: int, conn: callable,
             metadata_conn: callable, command: str, metrics_root: callable, metrics_run_root: callable,
             force_refresh: bool, disable_retention_check: bool, disable_data_check: bool,
             debug_query_performance: bool, month: int, year: int, output_dir: str) -> None:
    """Generate standard monthly operator and country-level reports.

    Arguments:
        ctx: click context object
        config: DIRBS config object
        statsd: DIRBS statsd connection object
        logger: DIRBS custom logger object
        run_id: run id of the current job
        conn: DIRBS PostgreSQL connection object
        metadata_conn: DIRBS PostgreSQL metadata connection object
        command: name of the command
        metrics_root: root object for the statsd metrics
        metrics_run_root: root object for the statsd run metrics
        force_refresh: bool to force writing/generating reports from scratch
        disable_retention_check: bool to disable data retention check
        disable_data_check: bool to disable data check
        debug_query_performance: bool to debug query performance
        month: reporting month
        year: reporting year
        output_dir: output directory path
    Returns:
        None
    """
    # Store metadata
    metadata.add_optional_job_metadata(metadata_conn, command, run_id,
                                       refreshed_data=force_refresh,
                                       month=month,
                                       year=year,
                                       report_schema_version=report_schema_version,
                                       output_dir=os.path.abspath(str(output_dir)))
    reports_validation_checks(disable_retention_check, year, month, logger, config, conn, disable_data_check)

    # Next, generate all the report data so that report generation can happen very quickly
    data_id, class_run_id, per_tac_compliance_data = generate_monthly_report_stats(config, conn, month, year,
                                                                                   statsd, metrics_run_root,
                                                                                   run_id, force_refresh,
                                                                                   debug_query_performance)

    # Store metadata about the report data ID and classification run ID
    metadata.add_optional_job_metadata(metadata_conn, command, run_id, data_id=data_id,
                                       classification_run_id=class_run_id)

    report_dir = make_report_directory(ctx, output_dir, run_id, conn, config, class_run_id=class_run_id,
                                       year=year, month=month, data_id=data_id)

    # First, copy all the report JS/CSS files into the output directory in
    # cachebusted form and get the cachebusted filenames
    asset_map = {}
    report_assets = ['js/report.js', 'css/report.css']
    for fn in report_assets:
        logger.info('Copying required asset "%s" to report folder', fn)
        asset = pkgutil.get_data('dirbs', fn)
        name, ext = fn.split('/')[-1].split('.')
        filename = '{0}_{1}.{2}'.format(name, utils.cachebusted_filename_from_contents(asset), ext)
        asset_map[fn] = filename
        with open(os.path.join(report_dir, filename), 'wb') as of:
            of.write(asset)

    js_filename = asset_map['js/report.js']
    css_filename = asset_map['css/report.css']

    # Next, generate the country level report
    report_metadata = []
    with utils.CodeProfiler() as cp:
        logger.info('Generating country report...')
        country_name = config.region_config.name
        country_per_tac_compliance_data = None
        if per_tac_compliance_data is not None:
            country_per_tac_compliance_data = per_tac_compliance_data[OperatorConfig.COUNTRY_OPERATOR_NAME]
        report = CountryReport(conn, data_id, config, month, year, country_name,
                               has_compliance_data=country_per_tac_compliance_data is not None)
        report_metadata.extend(write_report(report, month, year, report_dir, country_name,
                                            css_filename, js_filename, country_per_tac_compliance_data))
    statsd.gauge('{0}runtime.per_report.country'.format(metrics_run_root), cp.duration)

    operators = config.region_config.operators
    # Finally, generate the operator reports
    for op in operators:
        with utils.CodeProfiler() as cp:
            logger.info('Generating operator report for operator ID %s...', op.id)
            operator_per_tac_compliance_data = None
            if per_tac_compliance_data is not None:
                operator_per_tac_compliance_data = per_tac_compliance_data.get(op.id)
            report = OperatorReport(conn, data_id, config, month, year, op,
                                    has_compliance_data=operator_per_tac_compliance_data is not None)
            report_prefix = '{0}_{1}'.format(country_name, op.id)
            report_metadata.extend(write_report(report, month, year, report_dir, report_prefix,
                                                css_filename, js_filename, operator_per_tac_compliance_data))
        statsd.gauge('{0}runtime.per_report.operators.{1}'.format(metrics_run_root, op.id), cp.duration)

    # Store per-report job metadata
    metadata.add_optional_job_metadata(metadata_conn, command, run_id, report_outputs=report_metadata)

def cli(ctx, config, statsd, logger, run_id, conn, metadata_conn, command, metrics_root, metrics_run_root,
        conditions, safety_check, curr_date, disable_sanity_checks):
    """
    DIRBS script to classify IMEIs.

    Iterates through all configured conditions and writes to the classification_state table.

    :param ctx: click command context
    :param config: dirbs config instance
    :param statsd: statsd instance
    :param logger: dirbs logger instance
    :param run_id: job run id
    :param conn: database connection
    :param metadata_conn: database connection for job metadata
    :param command: command name
    :param metrics_root: root object for the statsd metrics
    :param metrics_run_root: root object for the statsd run metrics
    :param conditions: list of user supplied conditions
    :param safety_check: bool (enable/disable safety check)
    :param curr_date: date to use for classification
    :param disable_sanity_checks: bool (enable/disable sanity checks)
    """
    _warn_about_curr_date(curr_date, logger)
    _warn_about_disabled_safety_check(safety_check, logger)

    # If we didn't specify a condition, use all configured conditions
    if conditions is None:
        conditions = config.conditions

    # Query the job metadata table for all successful classification runs
    successful_job_runs = metadata.query_for_command_runs(metadata_conn, 'dirbs-classify', successful_only=True)
    if successful_job_runs and not disable_sanity_checks and \
            not _perform_sanity_checks(config, successful_job_runs[0].extra_metadata):
        raise ClassifySanityCheckFailedException(
            'Sanity checks failed, configurations are not identical to the last successful classification')

    logger.info('Classifying using conditions: {0}'.format(','.join([c.label for c in conditions])))

    # Store metadata
    metadata.add_optional_job_metadata(metadata_conn, command, run_id,
                                       curr_date=curr_date.isoformat() if curr_date is not None else None,
                                       conditions=[c.as_dict() for c in conditions],
                                       operators=[op.as_dict() for op in config.region_config.operators],
                                       amnesty=config.amnesty_config.as_dict())

    # Per-condition intermediate tables
    intermediate_tables = []
    # Flag indicating whether we had a failure to change exit code
    had_errored_condition = False

    try:
        locked = False
        with conn, conn.cursor() as cursor:
            # Lock to prevent multiple simultaneous classifications
            cursor.execute('SELECT pg_try_advisory_lock(%s::BIGINT)', [hash_string_64bit('dirbs-classify')])
            locked = cursor.fetchone()[0]
            if not locked:
                raise ClassifyLockException('Could not acquire lock for classification. '
                                            'Are there any other dirbs-classify instances running at the moment?')

            # Calculate total IMEI count
            if safety_check:
                logger.info('Counting number of IMEIs in network_imeis for safety check...')
                cursor.execute('SELECT COUNT(*) FROM network_imeis')
                total_imei_count = cursor.fetchone()[0]
                logger.info('Finished counting number of IMEIs in network_imeis for safety check')
            else:
                total_imei_count = -1

            matched_imei_counts = {}
            nworkers = config.multiprocessing_config.max_db_connections
            condition_objs = [Condition(cond_config) for cond_config in conditions]

            with futures.ProcessPoolExecutor(max_workers=nworkers) as executor:
                logger.info('Simultaneously classifying {0:d} dimensions using up to {1:d} workers...'
                            .format(len(conditions), nworkers))

                calc_futures_to_condition = {}
                update_futures_to_condition = {}
                per_condition_state = defaultdict(lambda: dict(num_completed_calc_jobs=0,
                                                               num_total_calc_jobs=0,
                                                               num_completed_update_jobs=0,
                                                               num_total_update_jobs=0,
                                                               num_matched_imeis=0))
                for c in condition_objs:
                    # Make sure we record all temporary tables so that we can cleanup later
                    intermediate_tables.append(c.intermediate_tbl_name(run_id))
                    # Queue the condition calculations and keep track
                    for f in c.queue_calc_imeis_jobs(executor, config, run_id, curr_date):
                        calc_futures_to_condition[f] = c
                        per_condition_state[c.label]['num_total_calc_jobs'] += 1

                # Process calculation futures
                for condition, job_state in _completed_calc_jobs(calc_futures_to_condition,
                                                                 per_condition_state, logger):
                    max_ratio = condition.config.max_allowed_matching_ratio
                    num_matched_imeis = job_state['num_matched_imeis']
                    max_matched_imeis = max_ratio * total_imei_count
                    if safety_check and total_imei_count > 0 and num_matched_imeis > max_matched_imeis:
                        ratio = min(num_matched_imeis / total_imei_count, 1)
                        logger.error('Refusing to classify using condition \'{0}\': '
                                     'This condition matches more than the maximum number of IMEIs allowed by '
                                     'the condition\'s configuration '
                                     '(matched_imeis={1:d}, ratio={2:f}, max_ratio={3:f})'
                                     .format(condition.label, num_matched_imeis, ratio, max_ratio))
                        had_errored_condition = True
                    else:
                        # Queue the classification state updates and keep track
                        for f in condition.queue_update_classification_state_jobs(executor, config, run_id,
                                                                                  curr_date):
                            update_futures_to_condition[f] = condition
                            per_condition_state[condition.label]['num_total_update_jobs'] += 1

                # Process update futures
                for condition, job_state in _completed_update_jobs(update_futures_to_condition,
                                                                   per_condition_state, logger):
                    # Update metadata about matched IMEI counts every time each condition finishes
                    matched_imei_counts[condition.label] = job_state['num_matched_imeis']
                    metadata.add_optional_job_metadata(metadata_conn, command, run_id,
                                                       matched_imei_counts=matched_imei_counts)
                    # Output StatsD stats
                    statsd.gauge('{0}matched_imeis.{1}'.format(metrics_run_root, condition.label.lower()),
                                 job_state['num_matched_imeis'])
    finally:
        _do_final_cleanup(conn, logger, locked, intermediate_tables)

    # If we had an error condition, generate an error return code on exit
    if had_errored_condition:
        sys.exit(1)

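# A minimal standalone sketch of the advisory-lock pattern used above: derive a stable 64-bit key from a job
# name and try to take a session-level PostgreSQL advisory lock so that only one instance of the job runs at
# a time. This is not the DIRBS hash_string_64bit implementation; the hash function and the DSN below are
# assumptions for illustration only.
import hashlib

import psycopg2


def string_to_advisory_key(name):
    """Map a string to a signed 64-bit integer suitable for pg_try_advisory_lock."""
    digest = hashlib.sha256(name.encode('utf8')).digest()
    # Take the first 8 bytes and interpret them as a signed 64-bit integer
    return int.from_bytes(digest[:8], byteorder='big', signed=True)


def try_exclusive_job_lock(conn, job_name):
    """Return True if this connection acquired the advisory lock for job_name."""
    with conn.cursor() as cursor:
        cursor.execute('SELECT pg_try_advisory_lock(%s::BIGINT)', [string_to_advisory_key(job_name)])
        return cursor.fetchone()[0]


if __name__ == '__main__':
    conn = psycopg2.connect('dbname=dirbs')  # hypothetical DSN
    if not try_exclusive_job_lock(conn, 'dirbs-classify'):
        raise SystemExit('Another classification job appears to be running')
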
def standard(ctx, config, statsd, logger, run_id, conn, metadata_conn, command, metrics_root, metrics_run_root,
             force_refresh, disable_retention_check, disable_data_check, debug_query_performance,
             month, year, output_dir):
    """
    Generate standard monthly operator and country-level reports.

    :param ctx: current cli context
    :param config: dirbs config obj
    :param statsd: statsd obj
    :param logger: dirbs logger obj
    :param run_id: job run id
    :param conn: database connection
    :param metadata_conn: database metadata connection
    :param command: command name
    :param metrics_root: root object for the statsd metrics
    :param metrics_run_root: root object for the statsd run metrics
    :param force_refresh: force refresh flag
    :param disable_retention_check: retention check flag
    :param disable_data_check: data check flag
    :param debug_query_performance: query performance flag
    :param month: data month
    :param year: data year
    :param output_dir: output directory path
    """
    # Store metadata
    metadata.add_optional_job_metadata(metadata_conn, command, run_id,
                                       refreshed_data=force_refresh,
                                       month=month,
                                       year=year,
                                       report_schema_version=report_schema_version,
                                       output_dir=os.path.abspath(str(output_dir)))
    _reports_validation_checks(disable_retention_check, year, month, logger, config, conn, disable_data_check)

    # Next, generate all the report data so that report generation can happen very quickly
    data_id, class_run_id, per_tac_compliance_data = generate_monthly_report_stats(config, conn, month, year,
                                                                                   statsd, metrics_run_root,
                                                                                   run_id, force_refresh,
                                                                                   debug_query_performance)

    # Store metadata about the report data ID and classification run ID
    metadata.add_optional_job_metadata(metadata_conn, command, run_id, data_id=data_id,
                                       classification_run_id=class_run_id)

    report_dir = _make_report_directory(ctx, output_dir, run_id, conn, config, class_run_id=class_run_id,
                                        year=year, month=month, data_id=data_id)

    # First, copy all the report JS/CSS files into the output directory in
    # cachebusted form and get the cachebusted filenames
    asset_map = {}
    report_assets = [
        'js/report.js',
        'css/report.css'
    ]
    for fn in report_assets:
        logger.info('Copying required asset "%s" to report folder', fn)
        asset = pkgutil.get_data('dirbs', fn)
        name, ext = fn.split('/')[-1].split('.')
        filename = '{0}_{1}.{2}'.format(name, utils.cachebusted_filename_from_contents(asset), ext)
        asset_map[fn] = filename
        with open(os.path.join(report_dir, filename), 'wb') as of:
            of.write(asset)

    js_filename = asset_map['js/report.js']
    css_filename = asset_map['css/report.css']

    # Next, generate the country level report
    report_metadata = []
    with utils.CodeProfiler() as cp:
        logger.info('Generating country report...')
        country_name = config.region_config.name
        country_per_tac_compliance_data = None
        if per_tac_compliance_data is not None:
            country_per_tac_compliance_data = per_tac_compliance_data[OperatorConfig.COUNTRY_OPERATOR_NAME]
        report = CountryReport(conn, data_id, config, month, year, country_name,
                               has_compliance_data=country_per_tac_compliance_data is not None)
        report_metadata.extend(_write_report(report, month, year, report_dir, country_name,
                                             css_filename, js_filename, country_per_tac_compliance_data))
    statsd.gauge('{0}runtime.per_report.country'.format(metrics_run_root), cp.duration)

    operators = config.region_config.operators
    # Finally, generate the operator reports
    for op in operators:
        with utils.CodeProfiler() as cp:
            logger.info('Generating operator report for operator ID %s...', op.id)
            operator_per_tac_compliance_data = None
            if per_tac_compliance_data is not None:
                operator_per_tac_compliance_data = per_tac_compliance_data.get(op.id)
            report = OperatorReport(conn, data_id, config, month, year, op,
                                    has_compliance_data=operator_per_tac_compliance_data is not None)
            report_prefix = '{0}_{1}'.format(country_name, op.id)
            report_metadata.extend(_write_report(report, month, year, report_dir, report_prefix,
                                                 css_filename, js_filename, operator_per_tac_compliance_data))
        statsd.gauge('{0}runtime.per_report.operators.{1}'.format(metrics_run_root, op.id), cp.duration)

    # Store per-report job metadata
    metadata.add_optional_job_metadata(metadata_conn, command, run_id, report_outputs=report_metadata)