def create_per_mno_lists_partition(conn, *, operator_id, parent_tbl_name, tbl_name, num_physical_shards=None,
                                   unlogged=False, fillfactor=80):
    """Function to DRY out creation of a new operator partition for notifications_lists."""
    assert len(tbl_name) < 64
    if num_physical_shards is None:
        num_physical_shards = num_physical_imei_shards(conn)

    if unlogged:
        unlogged_sql = sql.SQL('UNLOGGED ')
    else:
        unlogged_sql = sql.SQL('')

    with conn.cursor() as cursor, utils.db_role_setter(conn, role_name='dirbs_core_listgen'):
        cursor.execute(
            sql.SQL(
                """CREATE {unlogged}TABLE {0} PARTITION OF {1}
                       FOR VALUES IN (%s) PARTITION BY RANGE (virt_imei_shard)
                """
            ).format(sql.Identifier(tbl_name), sql.Identifier(parent_tbl_name), unlogged=unlogged_sql),
            [operator_id]
        )
        _grant_perms_list(conn, part_name=tbl_name)
        # Create child partitions
        create_imei_shard_partitions(conn, tbl_name=tbl_name, num_physical_shards=num_physical_shards,
                                     perms_func=_grant_perms_list, fillfactor=fillfactor)
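# Illustrative usage sketch (added for clarity; not part of the original module). It shows how a caller
# might create a per-MNO notifications_lists partition for a single operator. The operator id, the shard
# count and the exact keyword signature of per_mno_lists_partition() are assumptions based on how these
# helpers are invoked elsewhere in this package.
def _example_create_notifications_partition(conn):  # pragma: no cover
    op_id = 'operator1'  # hypothetical operator id
    tbl_name = per_mno_lists_partition(operator_id=op_id, suffix='', list_type='notifications')
    create_per_mno_lists_partition(conn,
                                   operator_id=op_id,
                                   parent_tbl_name='notifications_lists',
                                   tbl_name=tbl_name,
                                   num_physical_shards=4)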
def upgrade(self, db_conn):
    """Overrides AbstractMigrator upgrade method."""
    logger = logging.getLogger('dirbs.db')
    with db_conn.cursor() as cursor, utils.db_role_setter(db_conn, role_name='dirbs_core_import_operator'):
        child_table_names_list = utils.child_table_names(db_conn, 'daily_per_mno_hll_sketches')
        for partition_name in child_table_names_list:
            logger.info('Copying partition {0} into daily_per_mno_hll_sketches table...'.format(partition_name))
            cursor.execute(sql.SQL("""INSERT INTO daily_per_mno_hll_sketches
                                           SELECT *
                                             FROM {partition_name_id}""")
                           .format(partition_name_id=sql.Identifier(partition_name)))
            logger.info('Copied partition {0} into daily_per_mno_hll_sketches table'.format(partition_name))
            logger.info('Dropping daily_per_mno_hll_sketches partition {0}...'.format(partition_name))
            cursor.execute(sql.SQL("""DROP TABLE {partition_name_id}""")
                           .format(partition_name_id=sql.Identifier(partition_name)))
            logger.info('Dropped partition {0}'.format(partition_name))
def _init_staging_table_shards(self):
    """Method to create IMEI shards in the staging table if supported by the importer."""
    assert self._supports_imei_shards
    with db_role_setter(self._conn, role_name=self._owner_role_name):
        tbl_name = self._staging_tbl_name
        partition_utils.create_imei_shard_partitions(self._conn, tbl_name=tbl_name, unlogged=True)
        for name, rstart, rend in partition_utils.physical_imei_shards(self._conn, tbl_name=tbl_name):
            self._on_staging_table_shard_creation(name, rstart, rend)
def partition_registration_list(self, conn, *, num_physical_shards): """Method to repartition registration_list for v47 upgrade.""" with conn.cursor() as cursor, utils.db_role_setter( conn, role_name='dirbs_core_power_user'): # Create parent partition cursor.execute("""CREATE TABLE historic_registration_list_new ( LIKE historic_registration_list INCLUDING DEFAULTS INCLUDING IDENTITY INCLUDING CONSTRAINTS INCLUDING STORAGE INCLUDING COMMENTS ) PARTITION BY RANGE (virt_imei_shard) """) part_utils._grant_perms_registration_list( conn, part_name='historic_registration_list_new') # Create child partitions part_utils.create_imei_shard_partitions( conn, tbl_name='historic_registration_list_new', num_physical_shards=num_physical_shards, perms_func=part_utils._grant_perms_registration_list, fillfactor=80) # Insert data from original partition cursor.execute("""INSERT INTO historic_registration_list_new SELECT * FROM historic_registration_list""") # Add in indexes to each partition idx_metadata = [ part_utils.IndexMetadatum(idx_cols=['imei_norm'], is_unique=True, partial_sql='WHERE end_date IS NULL') ] part_utils.add_indices(conn, tbl_name='historic_registration_list_new', idx_metadata=idx_metadata) # Drop old view + table, rename tables, indexes and constraints cursor.execute('DROP VIEW registration_list') cursor.execute('DROP TABLE historic_registration_list CASCADE') part_utils.rename_table_and_indices( conn, old_tbl_name='historic_registration_list_new', new_tbl_name='historic_registration_list', idx_metadata=idx_metadata) cursor.execute("""CREATE OR REPLACE VIEW registration_list AS SELECT imei_norm, make, model, status, virt_imei_shard FROM historic_registration_list WHERE end_date IS NULL WITH CHECK OPTION""" ) cursor.execute("""GRANT SELECT ON registration_list TO dirbs_core_classify, dirbs_core_api, dirbs_core_import_registration_list""" )
def install(ctx): """ Installs latest schema on clean DB instance. :param ctx: current cli context obj :return: status """ logger = logging.getLogger('dirbs.db') config = common.ensure_config(ctx) db_config = config.db_config with utils.create_db_connection(db_config) as conn, conn.cursor() as cur: logger.info('Creating initial base DB schema in DB %s on host %s', db_config.database, db_config.host) # Check if there is stuff already in there cur.execute("""SELECT COUNT(*) FROM pg_class c JOIN pg_namespace n ON n.oid = c.relnamespace WHERE n.nspname = current_schema()""") is_clean = (cur.fetchone()[0] == 0) if not is_clean: logger.error('Can\'t install latest schema into a non-clean DB') logger.error( 'Instead, use dirbs-db upgrade to upgrade the schema to the latest version' ) sys.exit(1) # Set our role here so that new objects get created with dirbs_core_power_user as owner by default with utils.db_role_setter(conn, role_name='dirbs_core_power_user'): # First we setup the schema, search path etc. sql = pkgutil.get_data('dirbs', 'sql/base/on_db_creation.sql') cur.execute(sql) # Install the base schema for v19 and set current version to 19 base_schema = 'sql/base/v19_schema.sql' logger.info('Restoring base v19 schema from SQL file: %s', base_schema) sql = pkgutil.get_data('dirbs', base_schema) cur.execute(sql) utils.set_db_schema_version(conn, min_schema_version) logger.info( 'Successfully created base v{0:d} schema. Scheduling dirbs-db upgrade...' .format(min_schema_version)) # Then we call upgrade to complete the process rv = 0 if code_db_schema_version > min_schema_version: rv = ctx.invoke(upgrade) else: # Can't do anything until we know the schema is the right version _store_job_metadata(config, 'install') return rv
def repartition_classification_state(conn, *, num_physical_shards, src_filter_sql=None): """Function to repartition the classification_state table.""" with conn.cursor() as cursor, utils.db_role_setter(conn, role_name='dirbs_core_power_user'): # Create parent partition cursor.execute( """CREATE TABLE classification_state_new ( LIKE classification_state INCLUDING DEFAULTS INCLUDING IDENTITY INCLUDING CONSTRAINTS INCLUDING STORAGE INCLUDING COMMENTS ) PARTITION BY RANGE (virt_imei_shard) """ ) _grant_perms_classification_state(conn, part_name='classification_state_new') # Create child partitions create_imei_shard_partitions(conn, tbl_name='classification_state_new', num_physical_shards=num_physical_shards, perms_func=_grant_perms_classification_state, fillfactor=80) # Insert data from original partition base_sql = sql.SQL("""INSERT INTO classification_state_new SELECT * FROM classification_state""") if src_filter_sql is not None: insert_sql = sql.SQL('{0} {1}').format(base_sql, sql.SQL(src_filter_sql)) else: insert_sql = base_sql cursor.execute(insert_sql) # Add in indexes to each partition idx_metadata = [ IndexMetadatum(idx_cols=cols, is_unique=is_uniq, partial_sql=partial) for cols, is_uniq, partial in [ (['row_id'], True, None), (['imei_norm', 'cond_name'], True, 'WHERE end_date IS NULL'), (['block_date'], False, 'WHERE end_date IS NULL'), (['cond_name'], False, 'WHERE end_date IS NULL') ] ] add_indices(conn, tbl_name='classification_state_new', idx_metadata=idx_metadata) # Drop old table, rename tables, indexes and constraints cursor.execute('ALTER SEQUENCE classification_state_row_id_seq OWNED BY classification_state_new.row_id') cursor.execute('DROP TABLE classification_state CASCADE') rename_table_and_indices(conn, old_tbl_name='classification_state_new', new_tbl_name='classification_state', idx_metadata=idx_metadata)
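# Illustrative usage sketch (not in the original source): the src_filter_sql argument lets callers copy
# only a subset of rows into the re-created table, which is how the pruning CLI further down drives this
# function. The WHERE clause and shard count below are assumptions chosen purely for the example.
def _example_repartition_classification_state(conn):  # pragma: no cover
    repartition_classification_state(conn,
                                     num_physical_shards=4,
                                     src_filter_sql='WHERE end_date IS NULL')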
def repartition_exceptions_lists(conn, *, num_physical_shards, src_filter_sql=None): """Function to repartition the exceptions_lists table.""" with conn.cursor() as cursor, utils.db_role_setter(conn, role_name='dirbs_core_listgen'): # Create parent partition cursor.execute( """CREATE TABLE exceptions_lists_new ( LIKE exceptions_lists INCLUDING DEFAULTS INCLUDING IDENTITY INCLUDING CONSTRAINTS INCLUDING STORAGE INCLUDING COMMENTS ) PARTITION BY LIST (operator_id) """ ) _grant_perms_list(conn, part_name='exceptions_lists_new') # Work out who the operators are imei_shard_names = utils.child_table_names(conn, 'exceptions_lists') operators = [x.operator_id for x in utils.table_invariants_list(conn, imei_shard_names, ['operator_id'])] # Create child partitions (operator at top level, then IMEI-sharded) for op_id in operators: tbl_name = per_mno_lists_partition(operator_id=op_id, suffix='_new', list_type='exceptions') create_per_mno_lists_partition(conn, parent_tbl_name='exceptions_lists_new', tbl_name=tbl_name, operator_id=op_id, num_physical_shards=num_physical_shards) # Insert data from original partition base_sql = sql.SQL("""INSERT INTO exceptions_lists_new SELECT * FROM exceptions_lists""") if src_filter_sql is not None: insert_sql = sql.SQL('{0} {1}').format(base_sql, sql.SQL(src_filter_sql)) else: insert_sql = base_sql cursor.execute(insert_sql) # Add in indexes to each partition add_indices(conn, tbl_name='exceptions_lists_new', idx_metadata=exceptions_lists_indices()) # Drop old table, after assigning sequence to new table cursor.execute('ALTER SEQUENCE exceptions_lists_row_id_seq OWNED BY exceptions_lists_new.row_id') cursor.execute('DROP TABLE exceptions_lists CASCADE') # Rename tables, indexes and constraints rename_table_and_indices(conn, old_tbl_name='exceptions_lists_new', new_tbl_name='exceptions_lists', idx_metadata=exceptions_lists_indices())
def repartition_pairing_list(conn, *, num_physical_shards): """Function to repartition the pairing_list table.""" with conn.cursor() as cursor, utils.db_role_setter(conn, role_name='dirbs_core_power_user'): cursor.execute( """CREATE TABLE historic_pairing_list_new ( LIKE historic_pairing_list INCLUDING DEFAULTS INCLUDING IDENTITY INCLUDING CONSTRAINTS INCLUDING STORAGE INCLUDING COMMENTS ) PARTITION BY RANGE (virt_imei_shard) """ ) _grant_perms_pairing_list(conn, part_name='historic_pairing_list_new') # Create child partitions create_imei_shard_partitions(conn, tbl_name='historic_pairing_list_new', num_physical_shards=num_physical_shards, perms_func=_grant_perms_pairing_list, fillfactor=80) # Insert data from original partition cursor.execute("""INSERT INTO historic_pairing_list_new SELECT * FROM historic_pairing_list""") # Add in indexes to each partition idx_metadata = [IndexMetadatum(idx_cols=['imei_norm', 'imsi'], is_unique=True, partial_sql='WHERE end_date IS NULL')] add_indices(conn, tbl_name='historic_pairing_list_new', idx_metadata=idx_metadata) # Drop old view + table, rename tables, indexes and constraints cursor.execute('DROP VIEW pairing_list') cursor.execute('DROP TABLE historic_pairing_list CASCADE') rename_table_and_indices(conn, old_tbl_name='historic_pairing_list_new', new_tbl_name='historic_pairing_list', idx_metadata=idx_metadata) cursor.execute("""CREATE VIEW pairing_list AS SELECT imei_norm, imsi, virt_imei_shard FROM historic_pairing_list WHERE end_date IS NULL WITH CHECK OPTION""") cursor.execute("""GRANT SELECT ON pairing_list TO dirbs_core_listgen, dirbs_core_report, dirbs_core_api, dirbs_core_import_pairing_list""")
def repartition_blacklist(conn, *, num_physical_shards, src_filter_sql=None):
    """Function to repartition the blacklist table."""
    with conn.cursor() as cursor, utils.db_role_setter(conn, role_name='dirbs_core_listgen'):
        # Create parent partition
        cursor.execute(
            """CREATE TABLE blacklist_new (
                   LIKE blacklist INCLUDING DEFAULTS
                                  INCLUDING IDENTITY
                                  INCLUDING CONSTRAINTS
                                  INCLUDING STORAGE
                                  INCLUDING COMMENTS
               ) PARTITION BY RANGE (virt_imei_shard)
            """
        )
        # Create child partitions
        create_imei_shard_partitions(conn, tbl_name='blacklist_new', num_physical_shards=num_physical_shards,
                                     perms_func=_grant_perms_list, fillfactor=80)

        # Insert data from original partition, optionally filtered. Callers such as the lists pruning code
        # pass a src_filter_sql WHERE clause so that only retained rows are copied across.
        base_sql = sql.SQL("""INSERT INTO blacklist_new
                                   SELECT *
                                     FROM blacklist""")
        if src_filter_sql is not None:
            insert_sql = sql.SQL('{0} {1}').format(base_sql, sql.SQL(src_filter_sql))
        else:
            insert_sql = base_sql
        cursor.execute(insert_sql)

        # Add in indexes to each partition
        idx_metadata = [
            IndexMetadatum(idx_cols=cols, is_unique=is_uniq, partial_sql=partial)
            for cols, is_uniq, partial in [
                (['imei_norm'], True, 'WHERE end_run_id IS NULL'),
                (['end_run_id'], False, None),
                (['start_run_id'], False, None)
            ]
        ]
        add_indices(conn, tbl_name='blacklist_new', idx_metadata=idx_metadata)

        # Drop old table, rename tables, indexes and constraints
        cursor.execute('ALTER SEQUENCE blacklist_row_id_seq OWNED BY blacklist_new.row_id')
        cursor.execute('DROP TABLE blacklist CASCADE')
        rename_table_and_indices(conn, old_tbl_name='blacklist_new', new_tbl_name='blacklist',
                                 idx_metadata=idx_metadata)
def repartition_network_imeis(conn, *, num_physical_shards): """Function to repartition the network_imeis table.""" with conn.cursor() as cursor, utils.db_role_setter(conn, role_name='dirbs_core_import_operator'): # Create parent partition cursor.execute( """CREATE TABLE network_imeis_new ( LIKE network_imeis INCLUDING DEFAULTS INCLUDING IDENTITY INCLUDING CONSTRAINTS INCLUDING STORAGE INCLUDING COMMENTS ) PARTITION BY RANGE (virt_imei_shard) """ ) _grant_perms_network_imeis(conn, part_name='network_imeis_new') # Create child partitions create_imei_shard_partitions(conn, tbl_name='network_imeis_new', num_physical_shards=num_physical_shards, perms_func=_grant_perms_network_imeis, fillfactor=80) # Insert data from original partition cursor.execute("""INSERT INTO network_imeis_new SELECT * FROM network_imeis""") # Add in indexes to each partition idx_metadata = [ IndexMetadatum(idx_cols=cols, is_unique=is_uniq, partial_sql=partial) for cols, is_uniq, partial in [ (['imei_norm'], True, None), (['first_seen'], False, None) ] ] add_indices(conn, tbl_name='network_imeis_new', idx_metadata=idx_metadata) # Drop old table, rename tables, indexes and constraints cursor.execute('DROP TABLE network_imeis CASCADE') rename_table_and_indices(conn, old_tbl_name='network_imeis_new', new_tbl_name='network_imeis', idx_metadata=idx_metadata)
def create_monthly_network_triplets_per_mno_partition(conn, *, operator_id, month, year, suffix='', num_physical_shards=None, fillfactor=45): """Function to DRY out creation of a new month/year partition for monthly_network_triplets_per_mno.""" if num_physical_shards is None: num_physical_shards = num_physical_imei_shards(conn) with conn.cursor() as cursor, utils.db_role_setter(conn, role_name='dirbs_core_import_operator'): parent_tbl_name = 'monthly_network_triplets_per_mno{0}'.format(suffix) op_part_name = monthly_network_triplets_per_mno_partition(operator_id=operator_id, month=None, year=None, suffix=suffix) assert len(op_part_name) < 64 cursor.execute( sql.SQL( """CREATE TABLE IF NOT EXISTS {0} PARTITION OF {1} FOR VALUES IN (%s) PARTITION BY RANGE (triplet_year, triplet_month) """ ).format(sql.Identifier(op_part_name), sql.Identifier(parent_tbl_name)), [operator_id] ) _grant_perms_monthly_network_triplets(conn, part_name=op_part_name) part_name = monthly_network_triplets_per_mno_partition(operator_id=operator_id, month=month, year=year, suffix=suffix) assert len(part_name) < 64 cursor.execute( sql.SQL( """CREATE TABLE {0} PARTITION OF {1} FOR VALUES FROM %s TO %s PARTITION BY RANGE (virt_imei_shard) """ ).format(sql.Identifier(part_name), sql.Identifier(op_part_name)), [(year, month), (year, month + 1)] ) _grant_perms_monthly_network_triplets(conn, part_name=part_name) # Create child partitions create_imei_shard_partitions(conn, tbl_name=part_name, num_physical_shards=num_physical_shards, perms_func=_grant_perms_monthly_network_triplets, fillfactor=fillfactor)
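# Illustrative usage sketch (not in the original source): creating the operator-level and month-level
# partition tree for one hypothetical operator and month. All literal values below are assumptions for
# the example only.
def _example_create_mno_month_partition(conn):  # pragma: no cover
    create_monthly_network_triplets_per_mno_partition(conn,
                                                      operator_id='operator1',
                                                      month=1,
                                                      year=2024,
                                                      num_physical_shards=4)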
def _repartition_exceptions_lists(self, conn, *, num_physical_shards): """Repartition the exceptions lists to support msisdn.""" with conn.cursor() as cursor, utils.db_role_setter( conn, role_name='dirbs_core_listgen'): cursor.execute("""CREATE TABLE exceptions_lists_new ( LIKE exceptions_lists INCLUDING DEFAULTS INCLUDING IDENTITY INCLUDING CONSTRAINTS INCLUDING STORAGE INCLUDING COMMENTS ) PARTITION BY LIST (operator_id); ALTER TABLE exceptions_lists_new ADD COLUMN msisdn TEXT NOT NULL; """) part_utils._grant_perms_list( conn, part_name='exceptions_lists_new') # grant relevant permissions imei_shard_names = utils.child_table_names( conn, 'exceptions_lists') # determine the child table names operators = [ o.operator_id for o in utils.table_invariants_list( conn, imei_shard_names, ['operator_id']) ] # workout who the operators are # create child partitions for new list (operator at top level, then IMEI sharded) for op_id in operators: tbl_name = part_utils.per_mno_lists_partition( operator_id=op_id, suffix='_new', list_type='exceptions') part_utils.create_per_mno_lists_partition( conn, parent_tbl_name='exceptions_lists_new', tbl_name=tbl_name, operator_id=op_id, num_physical_shards=num_physical_shards) # insert data into the new parent partition cursor.execute("""INSERT INTO exceptions_lists_new SELECT e.row_id, e.operator_id, e.imei_norm, e.imsi, e.start_run_id, e.end_run_id, e.delta_reason, e.virt_imei_shard, p.msisdn FROM exceptions_lists e INNER JOIN historic_pairing_list p ON e.imsi = p.imsi""" ) # add indexes in each partitions part_utils.add_indices( conn, tbl_name='exceptions_lists_new', idx_metadata=part_utils.exceptions_lists_indices()) # drop old table, after assigning sequence to new table cursor.execute( 'ALTER SEQUENCE exceptions_lists_row_id_seq OWNED BY exceptions_lists_new.row_id' ) cursor.execute('DROP TABLE exceptions_lists CASCADE') # rename table, indexes and constraints part_utils.rename_table_and_indices( conn, old_tbl_name='exceptions_lists_new', new_tbl_name='exceptions_lists', idx_metadata=part_utils.exceptions_lists_indices()) # recreating gen_exceptionlist function with utils.db_role_setter(conn, role_name='dirbs_core_power_user'): cursor.execute(""" DROP FUNCTION gen_exceptions_list(op_id TEXT, run_id BIGINT); -- -- Recreate function to generate a full exceptions_list for a given -- run_id and operator. -- A value of -1 means get the latest list. -- CREATE FUNCTION gen_exceptions_list(op_id TEXT, run_id BIGINT = -1) RETURNS TABLE ( imei_norm TEXT, virt_imei_shard SMALLINT, imsi TEXT, msisdn TEXT ) LANGUAGE plpgsql STRICT STABLE PARALLEL SAFE AS $$ BEGIN -- -- If we don't specify a run_id, just set to the maximum run_id which will always -- return all rows where end_run_id is NULL -- IF run_id = -1 THEN run_id := max_bigint(); END IF; RETURN QUERY SELECT el.imei_norm, el.virt_imei_shard, el.imsi, el.msisdn FROM exceptions_lists el WHERE el.operator_id = op_id AND el.delta_reason != 'removed' AND run_id >= el.start_run_id AND (run_id < el.end_run_id OR el.end_run_id IS NULL); END $$; DROP FUNCTION gen_delta_exceptions_list(op_id TEXT, base_run_id BIGINT, run_id BIGINT); -- -- Create function to generate a per-MNO delta exceptions list for a run_id, operator -- id and optional base_run_id. 
-- -- If no base_run_id is supplied, this function will use the maximum run_id found in -- the DB that is less than the supplied run_id -- CREATE FUNCTION gen_delta_exceptions_list(op_id TEXT, base_run_id BIGINT, run_id BIGINT = -1) RETURNS TABLE ( imei_norm TEXT, imsi TEXT, msisdn TEXT, delta_reason TEXT ) LANGUAGE plpgsql STRICT STABLE PARALLEL SAFE AS $$ BEGIN -- -- If we don't specify a run_id, just set to the maximum run_id -- IF run_id = -1 THEN run_id := max_bigint(); END IF; IF run_id < base_run_id THEN RAISE EXCEPTION 'Parameter base_run_id % greater than run_id %', base_run_id, run_id; END IF; RETURN QUERY SELECT * FROM (SELECT el.imei_norm, el.imsi, el.msisdn, overall_delta_reason(el.delta_reason ORDER BY start_run_id DESC) AS delta_reason FROM exceptions_lists el WHERE operator_id = op_id AND start_run_id > base_run_id AND start_run_id <= run_id GROUP BY el.imei_norm, el.imsi, el.msisdn) x WHERE x.delta_reason IS NOT NULL; END $$; """) # noqa: Q440, Q441
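# Illustrative usage (not part of the original migration): once the functions above have been recreated
# they can be queried directly, for example from psql. The operator id and run_ids are hypothetical
# values used only for the example.
#
#   SELECT imei_norm, imsi, msisdn FROM gen_exceptions_list('operator1');
#   SELECT imei_norm, imsi, msisdn, delta_reason
#     FROM gen_delta_exceptions_list('operator1', 10, 12);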
def upgrade(self, db_conn): # noqa: C901 """Overrides AbstractMigrator upgrade method.""" logger = logging.getLogger('dirbs.db') with db_conn.cursor() as cursor: cursor.execute( """CREATE FUNCTION calc_virt_imei_shard(imei TEXT) RETURNS SMALLINT LANGUAGE plpgsql IMMUTABLE PARALLEL SAFE AS $$ BEGIN RETURN SUBSTRING(COALESCE(imei, ''), 13, 2)::SMALLINT; EXCEPTION WHEN OTHERS THEN RETURN 0; END; $$""") # By default, create 4 shards num_initial_shards = 4 logger.info('Re-partitioning classification_state table...') cursor.execute( 'ALTER TABLE classification_state ADD COLUMN virt_imei_shard SMALLINT' ) cursor.execute( 'UPDATE classification_state SET virt_imei_shard = calc_virt_imei_shard(imei_norm)' ) cursor.execute( 'ALTER TABLE classification_state ALTER COLUMN virt_imei_shard SET NOT NULL' ) part_utils.repartition_classification_state( db_conn, num_physical_shards=num_initial_shards) logger.info('Re-partitioned classification_state table') logger.info('Re-partitioning registration_list table...') cursor.execute( 'ALTER TABLE historic_registration_list ADD COLUMN virt_imei_shard SMALLINT' ) cursor.execute( 'UPDATE historic_registration_list SET virt_imei_shard = calc_virt_imei_shard(imei_norm)' ) cursor.execute( 'ALTER TABLE historic_registration_list ALTER COLUMN virt_imei_shard SET NOT NULL' ) self.partition_registration_list( db_conn, num_physical_shards=num_initial_shards) logger.info('Re-partitioned registration_list table') logger.info('Re-partitioning pairing_list table...') cursor.execute( 'ALTER TABLE historic_pairing_list ADD COLUMN virt_imei_shard SMALLINT' ) cursor.execute( 'UPDATE historic_pairing_list SET virt_imei_shard = calc_virt_imei_shard(imei_norm)' ) cursor.execute( 'ALTER TABLE historic_pairing_list ALTER COLUMN virt_imei_shard SET NOT NULL' ) part_utils.repartition_pairing_list( db_conn, num_physical_shards=num_initial_shards) logger.info('Re-partitioned pairing_list table') logger.info('Re-partitioning blacklist table...') cursor.execute( 'ALTER TABLE blacklist ADD COLUMN virt_imei_shard SMALLINT') cursor.execute( 'UPDATE blacklist SET virt_imei_shard = calc_virt_imei_shard(imei_norm)' ) cursor.execute( 'ALTER TABLE blacklist ALTER COLUMN virt_imei_shard SET NOT NULL' ) part_utils.repartition_blacklist( db_conn, num_physical_shards=num_initial_shards) logger.info('Re-partitioned blacklist table') # Need to make sure owner of list tables is dirbs_core_listgen logger.info('Re-partitioning notifications_lists table...') # The original notifications_lists were not created with a single sequence for the IDs, so just do now with utils.db_role_setter(db_conn, role_name='dirbs_core_listgen'): cursor.execute( """CREATE UNLOGGED TABLE notifications_lists_new ( row_id BIGSERIAL NOT NULL, operator_id TEXT NOT NULL, imei_norm TEXT NOT NULL, imsi TEXT NOT NULL, msisdn TEXT NOT NULL, block_date DATE NOT NULL, reasons TEXT[] NOT NULL, amnesty_granted BOOLEAN DEFAULT FALSE NOT NULL, start_run_id BIGINT NOT NULL, end_run_id BIGINT, delta_reason TEXT NOT NULL CHECK (delta_reason IN ('new', 'resolved', 'blacklisted', 'no_longer_seen', 'changed')), virt_imei_shard SMALLINT NOT NULL ) PARTITION BY LIST (operator_id) """) # Work out who the operators are partitions = utils.child_table_names(db_conn, 'notifications_lists') # Make sure that they are owned by dirbs_core_listgen (they can be owner by dirbs_core_power_user) # due to bad previous migration scripts with utils.db_role_setter(db_conn, role_name='dirbs_core_power_user'): for p in partitions: cursor.execute( sql.SQL('ALTER TABLE {0} 
OWNER TO dirbs_core_listgen'). format(sql.Identifier(p))) operators = [ x.operator_id for x in utils.table_invariants_list( db_conn, partitions, ['operator_id']) ] # Create operator child partitions for op_id in operators: tbl_name = part_utils.per_mno_lists_partition( operator_id=op_id, suffix='_new', list_type='notifications') part_utils.create_per_mno_lists_partition( db_conn, operator_id=op_id, parent_tbl_name='notifications_lists_new', tbl_name=tbl_name, num_physical_shards=1, unlogged=True, fillfactor=100) cursor.execute( """INSERT INTO notifications_lists_new(operator_id, imei_norm, imsi, msisdn, block_date, reasons, start_run_id, end_run_id, delta_reason, virt_imei_shard) SELECT operator_id, imei_norm, imsi, msisdn, block_date, reasons, start_run_id, end_run_id, delta_reason, calc_virt_imei_shard(imei_norm) FROM notifications_lists """) # Drop old table, rename tables, indexes and constraints cursor.execute("""ALTER TABLE notifications_lists_new RENAME CONSTRAINT notifications_lists_new_delta_reason_check TO notifications_lists_delta_reason_check""") cursor.execute('DROP TABLE notifications_lists CASCADE') cursor.execute("""ALTER SEQUENCE notifications_lists_new_row_id_seq RENAME TO notifications_lists_row_id_seq""") part_utils.rename_table_and_indices( db_conn, old_tbl_name='notifications_lists_new', new_tbl_name='notifications_lists') part_utils.repartition_notifications_lists( db_conn, num_physical_shards=num_initial_shards) logger.info('Re-partitioned notifications_lists table') logger.info('Re-partitioning exceptions_lists table...') # The original exceptions_lists were not created with a single sequence for the IDs, so just do now with utils.db_role_setter(db_conn, role_name='dirbs_core_listgen'): cursor.execute("""CREATE UNLOGGED TABLE exceptions_lists_new ( row_id BIGSERIAL NOT NULL, operator_id TEXT NOT NULL, imei_norm TEXT NOT NULL, imsi TEXT NOT NULL, start_run_id BIGINT NOT NULL, end_run_id BIGINT, delta_reason TEXT NOT NULL CHECK (delta_reason IN ('added', 'removed')), virt_imei_shard SMALLINT NOT NULL ) PARTITION BY LIST (operator_id) """) # Work out who the operators are partitions = utils.child_table_names(db_conn, 'exceptions_lists') # Make sure that they are owned by dirbs_core_listgen (they can be owner by dirbs_core_power_user) # due to bad previous migration scripts with utils.db_role_setter(db_conn, role_name='dirbs_core_power_user'): for p in partitions: cursor.execute( sql.SQL('ALTER TABLE {0} OWNER TO dirbs_core_listgen'). 
format(sql.Identifier(p))) operators = [ x.operator_id for x in utils.table_invariants_list( db_conn, partitions, ['operator_id']) ] # Create operator child partitions for op_id in operators: tbl_name = part_utils.per_mno_lists_partition( operator_id=op_id, suffix='_new', list_type='exceptions') part_utils.create_per_mno_lists_partition( db_conn, operator_id=op_id, parent_tbl_name='exceptions_lists_new', tbl_name=tbl_name, num_physical_shards=1, unlogged=True, fillfactor=100) cursor.execute( """INSERT INTO exceptions_lists_new(operator_id, imei_norm, imsi, start_run_id, end_run_id, delta_reason, virt_imei_shard) SELECT operator_id, imei_norm, imsi, start_run_id, end_run_id, delta_reason, calc_virt_imei_shard(imei_norm) FROM exceptions_lists """) # Drop old table, rename tables, indexes and constraints cursor.execute("""ALTER TABLE exceptions_lists_new RENAME CONSTRAINT exceptions_lists_new_delta_reason_check TO exceptions_lists_delta_reason_check""") cursor.execute('DROP TABLE exceptions_lists CASCADE') cursor.execute( 'ALTER SEQUENCE exceptions_lists_new_row_id_seq RENAME TO exceptions_lists_row_id_seq' ) part_utils.rename_table_and_indices( db_conn, old_tbl_name='exceptions_lists_new', new_tbl_name='exceptions_lists') part_utils.repartition_exceptions_lists( db_conn, num_physical_shards=num_initial_shards) logger.info('Re-partitioned exceptions_lists table') logger.info('Re-partitioning seen_imeis (network_imeis) table') # First, just put everything in a temporary table so that we can call partutils with utils.db_role_setter(db_conn, role_name='dirbs_core_import_operator'): cursor.execute("""CREATE UNLOGGED TABLE network_imeis ( first_seen DATE NOT NULL, last_seen DATE NOT NULL, seen_rat_bitmask INTEGER, imei_norm TEXT NOT NULL, virt_imei_shard SMALLINT NOT NULL ) """) # # We disable index scans here as doing a merge append with index scans is much slower and involves # a lot of seeks which kills performance on non-SSD drives. 
Better to use an append plan and sort # the results by imei_norm # cursor.execute('SET enable_indexscan = false') cursor.execute("""INSERT INTO network_imeis SELECT MIN(first_seen), MAX(last_seen), bit_or(seen_rat_bitmask), imei_norm, calc_virt_imei_shard(imei_norm) FROM seen_imeis GROUP BY imei_norm """) cursor.execute('SET enable_indexscan = true') part_utils.repartition_network_imeis( db_conn, num_physical_shards=num_initial_shards) cursor.execute('DROP TABLE seen_imeis CASCADE') logger.info('Re-partitioned seen_imeis (network_imeis) table') # First, just put all country-level triplets in a temporary table so that we can call partition_utils with utils.db_role_setter(db_conn, role_name='dirbs_core_import_operator'): cursor.execute( """CREATE UNLOGGED TABLE monthly_network_triplets_country ( triplet_year SMALLINT NOT NULL, triplet_month SMALLINT NOT NULL, first_seen DATE NOT NULL, last_seen DATE NOT NULL, date_bitmask INTEGER NOT NULL, triplet_hash UUID NOT NULL, imei_norm TEXT, imsi TEXT, msisdn TEXT, virt_imei_shard SMALLINT NOT NULL, CHECK (last_seen >= first_seen), CHECK (EXTRACT(month FROM last_seen) = triplet_month AND EXTRACT(year FROM last_seen) = triplet_year), CHECK (EXTRACT(month FROM first_seen) = triplet_month AND EXTRACT(year FROM first_seen) = triplet_year) ) PARTITION BY RANGE (triplet_year, triplet_month) """) # Work out what partitions to create and create them partitions = utils.child_table_names(db_conn, 'seen_triplets') # Make sure that they are owned by dirbs_core_import_operator (they can be owner by dirbs_core_power_user) # due to bad previous migration scripts with utils.db_role_setter(db_conn, role_name='dirbs_core_power_user'): for p in partitions: cursor.execute( sql.SQL( 'ALTER TABLE {0} OWNER TO dirbs_core_import_operator' ).format(sql.Identifier(p))) year_month_tuples = { (x.triplet_year, x.triplet_month) for x in utils.table_invariants_list( db_conn, partitions, ['triplet_year', 'triplet_month']) } for year, month in year_month_tuples: part_utils.create_monthly_network_triplets_country_partition( db_conn, month=month, year=year, num_physical_shards=1) with utils.db_role_setter(db_conn, role_name='dirbs_core_import_operator'): cursor.execute( """CREATE UNLOGGED TABLE monthly_network_triplets_per_mno ( LIKE monthly_network_triplets_country INCLUDING ALL, operator_id TEXT NOT NULL ) PARTITION BY LIST (operator_id) """) # Work out what partitions to create and create them op_year_month_tuples = { (x.operator_id, x.triplet_year, x.triplet_month) for x in utils.table_invariants_list( db_conn, partitions, ['operator_id', 'triplet_year', 'triplet_month']) } # Create child partitions at per-MNO level for op, year, month in op_year_month_tuples: part_utils.create_monthly_network_triplets_per_mno_partition( db_conn, operator_id=op, month=month, year=year, num_physical_shards=1) # Create temporary monthly_network_triplets_per_mno table for year, month in year_month_tuples: logger.info( 'Generating temporary monthly_network_triplets_per_mno entries for {0:02d}/{1:d}...' 
.format(month, year)) cursor.execute( """INSERT INTO monthly_network_triplets_per_mno SELECT %(year)s, %(month)s, first_seen, last_seen, date_bitmask, triplet_hash, imei_norm, imsi, msisdn, calc_virt_imei_shard(imei_norm), operator_id FROM seen_triplets WHERE triplet_year = %(year)s AND triplet_month = %(month)s """, { 'year': year, 'month': month }) logger.info( 'Generated temporary monthly_network_triplets_per_mno entries for {0:02d}/{1:d}' .format(month, year)) # Create temporary monthly_network_triplets_country table. We need to do this monthly as we need # to aggregate by triplets on a monthly basis # # We disable index scans here as doing a merge append with index scans is much slower and involves # a lot of seeks which kills performance on non-SSD drives. Better to use an append plan and sort # the results by imei_norm # cursor.execute('SET enable_indexscan = false') for year, month in year_month_tuples: logger.info( 'Generating temporary monthly_network_triplets_country entries for {0:02d}/{1:d}...' .format(month, year)) cursor.execute( """INSERT INTO monthly_network_triplets_country SELECT %(year)s, %(month)s, MIN(first_seen), MAX(last_seen), bit_or(date_bitmask), triplet_hash, FIRST(imei_norm), FIRST(imsi), FIRST(msisdn), calc_virt_imei_shard(FIRST(imei_norm)) FROM seen_triplets WHERE triplet_year = %(year)s AND triplet_month = %(month)s GROUP BY triplet_hash """, { 'year': year, 'month': month }) logger.info( 'Generated temporary monthly_network_triplets_country entries for {0:02d}/{1:d}' .format(month, year)) cursor.execute('SET enable_indexscan = true') logger.info( 'Re-partitioning temporary monthly_network_triplets tables...') # Previously, the operator_data view was owned by dirbs_core_power_user but is now owned by the # dirbs_core_import_operator since it must be re-created with utils.db_role_setter(db_conn, role_name='dirbs_core_power_user'): cursor.execute( 'ALTER VIEW operator_data OWNER TO dirbs_core_import_operator' ) part_utils.repartition_monthly_network_triplets( db_conn, num_physical_shards=num_initial_shards) cursor.execute('DROP TABLE seen_triplets CASCADE') logger.info( 'Re-partitioned temporary monthly_network_triplets tables') # Replace list generation function to include virt_imei_shard cursor.execute(""" DROP FUNCTION gen_blacklist(run_id BIGINT); DROP FUNCTION gen_notifications_list(op_id TEXT, run_id BIGINT); DROP FUNCTION gen_exceptions_list(op_id TEXT, run_id BIGINT); -- -- Create function to generate a full blacklist for a given run_id. A value of -1 means get the latest -- list. -- CREATE FUNCTION gen_blacklist(run_id BIGINT = -1) RETURNS TABLE ( imei_norm TEXT, virt_imei_shard SMALLINT, block_date DATE, reasons TEXT[] ) LANGUAGE plpgsql STRICT STABLE PARALLEL SAFE AS $$ DECLARE query_run_id BIGINT; BEGIN -- -- If we don't specify a run_id, just set to the maximum run_id which will always return all rows -- where end_run_id is NULL -- IF run_id = -1 THEN run_id := max_bigint(); END IF; RETURN QUERY SELECT bl.imei_norm, bl.virt_imei_shard, bl.block_date, bl.reasons FROM blacklist bl WHERE bl.delta_reason != 'unblocked' AND run_id >= bl.start_run_id AND (run_id < bl.end_run_id OR bl.end_run_id IS NULL); END $$; -- -- Create function to generate a full notifications_list for a given run_id and operator ID. A value -- of -1 means get the latest list. 
-- CREATE FUNCTION gen_notifications_list(op_id TEXT, run_id BIGINT = -1) RETURNS TABLE ( imei_norm TEXT, virt_imei_shard SMALLINT, imsi TEXT, msisdn TEXT, block_date DATE, reasons TEXT[], amnesty_granted BOOLEAN ) LANGUAGE plpgsql STRICT STABLE PARALLEL SAFE AS $$ BEGIN -- -- If we don't specify a run_id, just set to the maximum run_id which will always return all rows -- where end_run_id is NULL -- IF run_id = -1 THEN run_id := max_bigint(); END IF; RETURN QUERY SELECT nl.imei_norm, nl.virt_imei_shard, nl.imsi, nl.msisdn, nl.block_date, nl.reasons, nl.amnesty_granted FROM notifications_lists nl WHERE nl.operator_id = op_id AND nl.delta_reason NOT IN ('resolved', 'blacklisted') AND run_id >= nl.start_run_id AND (run_id < nl.end_run_id OR nl.end_run_id IS NULL); END $$; -- -- Create function to generate a full exceptions_list for a given run_id and operator ID. A value -- of -1 means get the latest list. -- CREATE FUNCTION gen_exceptions_list(op_id TEXT, run_id BIGINT = -1) RETURNS TABLE ( imei_norm TEXT, virt_imei_shard SMALLINT, imsi TEXT ) LANGUAGE plpgsql STRICT STABLE PARALLEL SAFE AS $$ BEGIN -- -- If we don't specify a run_id, just set to the maximum run_id which will always return all -- rows where end_run_id is NULL -- IF run_id = -1 THEN run_id := max_bigint(); END IF; RETURN QUERY SELECT el.imei_norm, el.virt_imei_shard, el.imsi FROM exceptions_lists el WHERE el.operator_id = op_id AND el.delta_reason != 'removed' AND run_id >= el.start_run_id AND (run_id < el.end_run_id OR el.end_run_id IS NULL); END $$; """) # noqa: Q440, Q441 # Update schema metadata table cursor.execute( """ALTER TABLE schema_metadata ADD COLUMN phys_shards SMALLINT NOT NULL DEFAULT %s CHECK (phys_shards > 0 AND phys_shards <= 100)""", [num_initial_shards]) cursor.execute( 'ALTER TABLE schema_metadata ALTER COLUMN phys_shards DROP DEFAULT' ) # Drop obsolete columns cursor.execute( 'ALTER TABLE schema_metadata DROP COLUMN potential_whitespace_imsis_msisdns' ) cursor.execute( 'ALTER TABLE report_monthly_stats DROP COLUMN num_whitespace_imsi_records' ) cursor.execute( 'ALTER TABLE report_monthly_stats DROP COLUMN num_whitespace_msisdn_records' )
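# Illustrative sketch (not part of the original migration): a Python mirror of the calc_virt_imei_shard()
# SQL function created at the start of this migration, useful for reasoning about which virtual IMEI
# shard (0-99) a value falls into. The SQL function remains authoritative; this helper only documents the
# rule: digits 13-14 of the IMEI, or 0 if they cannot be parsed as an integer.
def _example_calc_virt_imei_shard(imei):
    try:
        return int((imei or '')[12:14])  # SQL SUBSTRING(imei, 13, 2) is 1-based
    except (TypeError, ValueError):
        return 0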
def lists(ctx, config, statsd, logger, run_id, conn, metadata_conn, command, metrics_root, metrics_run_root): """Prune obsolete lists data.""" curr_date = ctx.obj['CURR_DATE'] # store metadata metadata.add_optional_job_metadata( metadata_conn, command, run_id, retention_months=config.retention_config.months_retention) logger.info( 'Pruning lists tables to remove any obsolete data with end_time outside the retention window..' ) retention_months = config.retention_config.months_retention if curr_date is None: curr_date = datetime.date.today() first_month_to_drop = datetime.date( curr_date.year, curr_date.month, 1) - relativedelta.relativedelta(months=retention_months) logger.info( 'Lists data with end_time earlier than {0} will be pruned'.format( first_month_to_drop)) with utils.db_role_setter( conn, role_name='dirbs_core_power_user'), conn.cursor() as cursor: logger.debug('Calculating original number of rows in lists tables...') row_count_sql = sql.SQL( """SELECT blacklist_row_count, noft_lists_row_count, excp_lists_row_count FROM (SELECT COUNT(*) FROM blacklist) AS blacklist_row_count, (SELECT COUNT(*) FROM notifications_lists) AS noft_lists_row_count, (SELECT COUNT(*) FROM exceptions_lists) AS excp_lists_row_count""" ) cursor.execute(row_count_sql) rows_before = cursor.fetchone() blacklist_rows_before = int( rows_before.blacklist_row_count.strip('()')) notflist_rows_before = int( rows_before.noft_lists_row_count.strip('()')) excplist_rows_before = int( rows_before.excp_lists_row_count.strip('()')) rows_before = blacklist_rows_before + notflist_rows_before + excplist_rows_before logger.debug('Calculated original number of rows in lists tables...') statsd.gauge('{0}blacklist_rows_before'.format(metrics_run_root), blacklist_rows_before) statsd.gauge( '{0}notifications_lists_rows_before'.format(metrics_run_root), notflist_rows_before) statsd.gauge( '{0}exceptions_lists_rows_before'.format(metrics_run_root), excplist_rows_before) metadata.add_optional_job_metadata( metadata_conn, command, run_id, blacklist_rows_before=blacklist_rows_before, notifications_lists_rows_before=notflist_rows_before, exceptions_lists_rows_before=excplist_rows_before) # Calculate number of rows in the lists table outside the retention window job_metadata_filter_sql = """SELECT run_id FROM job_metadata WHERE command = 'dirbs-listgen' AND end_time < '{0}'""".format( first_month_to_drop) cursor.execute( sql.SQL("""SELECT COUNT(*) FROM blacklist WHERE start_run_id IN ({0})""".format( job_metadata_filter_sql))) total_bl_rows_out_window_to_prune = cursor.fetchone()[0] logger.info( 'Found {0:d} rows of blacklist table outside the retention window to prune' .format(total_bl_rows_out_window_to_prune)) cursor.execute( sql.SQL("""SELECT COUNT(*) FROM notifications_lists WHERE start_run_id IN ({0})""".format( job_metadata_filter_sql))) total_nl_rows_out_window_to_prune = cursor.fetchone()[0] logger.info( 'Found {0:d} rows of notifications lists table outside the retention window to prune' .format(total_nl_rows_out_window_to_prune)) cursor.execute( sql.SQL("""SELECT COUNT(*) FROM exceptions_lists WHERE start_run_id IN ({0})""".format( job_metadata_filter_sql))) total_nl_rows_out_window_to_prune = cursor.fetchone()[0] logger.info( 'Found {0:d} rows of exceptions lists table outside the retention window to prune' .format(total_nl_rows_out_window_to_prune)) # We repartition the tables to re-create them, passing a condition sql logger.debug('Re-creating blacklist table...') num_phys_imei_shards = 
partition_utils.num_physical_imei_shards(conn) src_filter_sql = cursor.mogrify( """WHERE start_run_id NOT IN ({0})""".format( job_metadata_filter_sql)) partition_utils.repartition_blacklist( conn, num_physical_shards=num_phys_imei_shards, src_filter_sql=str(src_filter_sql, encoding=conn.encoding)) logger.debug('Re-created blacklist table') logger.debug('Re-creating notifications lists table...') partition_utils.repartition_notifications_lists( conn, num_physical_shards=num_phys_imei_shards, src_filter_sql=str(src_filter_sql, encoding=conn.encoding)) logger.debug('Re-created notifications lists table') logger.debug('Re-creating exceptions lists table...') partition_utils.repartition_exceptions_lists( conn, num_physical_shards=num_phys_imei_shards, src_filter_sql=str(src_filter_sql, encoding=conn.encoding)) logger.debug('Re-created exceptions lists table') logger.debug('Calculating new number of rows in lists tables...') cursor.execute(row_count_sql) rows_after = cursor.fetchone() blacklist_rows_after = int(rows_after.blacklist_row_count.strip('()')) notflist_rows_after = int(rows_after.noft_lists_row_count.strip('()')) excplist_rows_after = int(rows_after.excp_lists_row_count.strip('()')) rows_after = blacklist_rows_after + notflist_rows_after + excplist_rows_after logger.debug('Calculated new number of rows in lists tables') statsd.gauge('{0}blacklist_rows_after'.format(metrics_run_root), blacklist_rows_after) statsd.gauge( '{0}notifications_lists_rows_after'.format(metrics_run_root), notflist_rows_after) statsd.gauge('{0}exceptions_lists_rows_after'.format(metrics_run_root), excplist_rows_after) metadata.add_optional_job_metadata( metadata_conn, command, run_id, blacklist_rows_after=blacklist_rows_after, notifications_lists_rows_after=notflist_rows_after, exceptions_lists_rows_after=excplist_rows_after) logger.info('Pruned {0:d} rows from lists tables'.format(rows_before - rows_after))
def blacklist(ctx, config, statsd, logger, run_id, conn, metadata_conn, command, metrics_root, metrics_run_root, condition_name, prune_all): """Expire IMEIs outside the blacklist retention period from blacklist.""" current_date = datetime.date.today() retention_days = config.retention_config.blacklist_retention if condition_name is None and prune_all is False: logger.info( 'Error: one of the arguments "condition_name" or "--prune-all" is required' ) metadata.add_optional_job_metadata(metadata_conn, command, run_id, curr_date=current_date.isoformat(), retention_days=retention_days, job_executed=False) elif condition_name is not None and prune_all is True: logger.info( 'Error: only one of the arguments "condition_name" or "--prune-all" is required' ) metadata.add_optional_job_metadata(metadata_conn, command, run_id, curr_date=current_date.isoformat(), retention_days=retention_days, job_executed=False) elif retention_days == 0: logger.info( 'Blacklist will not be pruned, as retention value is set to {0}'. format(retention_days)) metadata.add_optional_job_metadata(metadata_conn, command, run_id, curr_date=current_date.isoformat(), retention_days=retention_days, job_executed=False) else: _warn_about_prune_all(prune_all, logger) logger.info( 'Pruning blacklist to remove any data related to specified condition ' 'outside the retention window.') last_retention_date = datetime.date( current_date.year, current_date.month, current_date.day) - datetime.timedelta(retention_days) # store metadata logger.info( 'Blacklist entries with start_date earlier than {0} will be pruned' .format(last_retention_date)) metadata.add_optional_job_metadata( metadata_conn, command, run_id, curr_date=current_date.isoformat(), retention_days=retention_days, job_executed=True, last_retention_date=last_retention_date.isoformat()) with utils.db_role_setter( conn, role_name='dirbs_core_power_user'), conn.cursor() as cursor: logger.debug( 'Calculating original number of rows with block_date in classification_state table...' ) cursor.execute("""SELECT COUNT(*) FROM classification_state WHERE block_date IS NOT NULL AND end_date IS NULL""") rows_before = cursor.fetchone()[0] logger.debug( 'Calculated original number of rows (having block_date) in classification_state table' ) statsd.gauge('{0}rows_before'.format(metrics_run_root), rows_before) metadata.add_optional_job_metadata(metadata_conn, command, run_id, rows_before=rows_before) # if it's a condition-based pruning if not prune_all: cursor.execute( sql.SQL("""SELECT COUNT(*) FROM classification_state WHERE start_date < %s AND cond_name = %s AND end_date IS NULL AND block_date IS NOT NULL"""), [last_retention_date, condition_name[0].label]) total_rows_to_prune = cursor.fetchone()[0] logger.info( 'Found {0:d} rows of classification_state table ' 'with start_date for {1} dimension outside the blacklist ' 'retention window.'.format(total_rows_to_prune, condition_name[0].label)) if total_rows_to_prune > 0: cursor.execute( sql.SQL("""UPDATE classification_state SET end_date = '{0}' WHERE start_date < '{1}' AND cond_name = '{2}' AND end_date IS NULL AND block_date IS NOT NULL""". format(current_date.isoformat(), last_retention_date, condition_name[0].label))) logger.info( 'Pruned {0:d} rows from blacklist for {1} dimension'.
format(total_rows_to_prune, condition_name[0].label)) # prune without any condition else: cursor.execute( sql.SQL("""SELECT COUNT(*) FROM classification_state WHERE start_date < %s AND end_date IS NULL AND block_date IS NOT NULL"""), [last_retention_date]) total_rows_to_prune = cursor.fetchone()[0] logger.info( 'Found {0:d} rows of classification_state table ' 'with start_date outside the blacklist retention window.'. format(total_rows_to_prune)) if total_rows_to_prune > 0: cursor.execute( sql.SQL("""UPDATE classification_state SET end_date = '{0}' WHERE start_date < '{1}' AND end_date IS NULL AND block_date IS NOT NULL""". format(current_date.isoformat(), last_retention_date))) logger.info('Pruned {0:d} rows from blacklist'.format( total_rows_to_prune)) logger.debug( 'Calculating remaining number of rows with block_date (end_date is null) ' 'in classification_state table...') cursor.execute("""SELECT COUNT(*) FROM classification_state WHERE block_date IS NOT NULL AND end_date IS NULL""") rows_after = cursor.fetchone()[0] logger.debug( 'Calculated remaining number of rows (having block_date and end_date null) ' 'in classification_state table') statsd.gauge('{0}rows_after'.format(metrics_run_root), rows_after) metadata.add_optional_job_metadata(metadata_conn, command, run_id, rows_after=rows_after)
def classification_state(ctx, config, statsd, logger, run_id, conn, metadata_conn, command, metrics_root,
                         metrics_run_root):
    """Prune obsolete classification_state data."""
    curr_date = ctx.obj['CURR_DATE']

    # Store metadata
    metadata.add_optional_job_metadata(metadata_conn, command, run_id,
                                       curr_date=curr_date.isoformat() if curr_date is not None else None,
                                       retention_months=config.retention_config.months_retention)

    logger.info('Pruning classification_state table to remove any classification state data related to '
                'obsolete conditions and data with end_date outside the retention window...')
    cond_config_list = [c.label for c in config.conditions]
    retention_months = config.retention_config.months_retention

    if curr_date is None:
        curr_date = datetime.date.today()
    first_month_to_drop = datetime.date(curr_date.year, curr_date.month, 1) - \
        relativedelta.relativedelta(months=retention_months)
    logger.info('Classification state data with end_date earlier than {0} will be '
                'pruned'.format(first_month_to_drop))

    with utils.db_role_setter(conn, role_name='dirbs_core_power_user'), conn.cursor() as cursor:
        logger.debug('Calculating original number of rows in classification_state table...')
        cursor.execute('SELECT COUNT(*) FROM classification_state')
        rows_before = cursor.fetchone()[0]
        logger.debug('Calculated original number of rows in classification_state table')
        statsd.gauge('{0}rows_before'.format(metrics_run_root), rows_before)
        metadata.add_optional_job_metadata(metadata_conn, command, run_id, rows_before=rows_before)

        # Calculate number of rows in the classification table outside retention window
        cursor.execute(sql.SQL("""SELECT COUNT(*)
                                    FROM classification_state
                                   WHERE end_date < %s """), [first_month_to_drop])
        total_rows_out_window_to_prune = cursor.fetchone()[0]
        logger.info('Found {0:d} rows of classification_state table '
                    'with end_date outside the retention window to prune.'.format(total_rows_out_window_to_prune))

        # Calculate number of rows in the classification with conditions no longer existing
        cursor.execute(sql.SQL("""SELECT COUNT(*)
                                    FROM classification_state
                                   WHERE NOT starts_with_prefix(cond_name, %s)"""), [cond_config_list])
        total_rows_no_cond_to_prune = cursor.fetchone()[0]
        logger.info('Found {0:d} rows of classification_state table with conditions '
                    'no longer existing to prune.'.format(total_rows_no_cond_to_prune))

        logger.debug('Re-creating classification_state table...')
        # Basically, we just re-partition the classification_state table to re-create it, passing a
        # src_filter_sql parameter
        num_phys_imei_shards = partition_utils.num_physical_imei_shards(conn)
        src_filter_sql = cursor.mogrify("""WHERE (end_date > %s OR end_date IS NULL)
                                             AND cond_name LIKE ANY(%s)""",
                                        [first_month_to_drop, cond_config_list])
        partition_utils.repartition_classification_state(conn, num_physical_shards=num_phys_imei_shards,
                                                         src_filter_sql=str(src_filter_sql,
                                                                            encoding=conn.encoding))
        logger.debug('Re-created classification_state table')

        logger.debug('Calculating new number of rows in classification_state table...')
        cursor.execute('SELECT COUNT(*) FROM classification_state')
        rows_after = cursor.fetchone()[0]
        logger.debug('Calculated new number of rows in classification_state table')
        statsd.gauge('{0}rows_after'.format(metrics_run_root), rows_after)
        metadata.add_optional_job_metadata(metadata_conn, command, run_id, rows_after=rows_after)
        logger.info('Pruned {0:d} rows from classification_state table'.format(rows_before - rows_after))
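# Illustrative sketch (not in the original source): the retention cut-off computed above is the first day
# of the month, months_retention months before the current month. The values below are hypothetical and
# only demonstrate the arithmetic.
def _example_retention_cutoff():  # pragma: no cover
    import datetime
    from dateutil import relativedelta
    curr_date = datetime.date(2020, 7, 15)   # hypothetical "today"
    retention_months = 6                     # hypothetical config.retention_config.months_retention
    # -> datetime.date(2020, 1, 1)
    return datetime.date(curr_date.year, curr_date.month, 1) - \
        relativedelta.relativedelta(months=retention_months)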
def upgrade(self, db_conn): """Overrides AbstractMigrator upgrade method.""" logger = logging.getLogger('dirbs.db') with db_conn.cursor() as cursor: # Set search_path to include hll cursor.execute('SET search_path = core, hll;') cursor.execute(""" DO $$ DECLARE database_name TEXT; BEGIN SELECT current_database() INTO database_name; -- Set the search path of this database to "core" EXECUTE 'ALTER DATABASE ' || quote_ident(database_name) || ' SET search_path TO core, hll'; END $$;""") logger.info('Creating daily_per_mno_hll_sketches table...') with utils.db_role_setter(db_conn, role_name='dirbs_core_import_operator'): cursor.execute("""CREATE TABLE daily_per_mno_hll_sketches ( PRIMARY KEY (data_date, operator_id), data_date DATE NOT NULL, operator_id TEXT NOT NULL, creation_date DATE NOT NULL, triplet_hll HLL NOT NULL, imei_hll HLL NOT NULL, imsi_hll HLL NOT NULL, msisdn_hll HLL NOT NULL, imei_imsis_hll HLL NOT NULL, imei_msisdns_hll HLL NOT NULL, imsi_msisdns_hll HLL NOT NULL ) """) cursor.execute( 'GRANT SELECT ON daily_per_mno_hll_sketches TO dirbs_core_report' ) logger.info('Created daily_per_mno_hll_sketches table') logger.info( 'Populating daily_per_mno_hll_sketches from seen_triplets...' ) child_table_names_list = utils.child_table_names( db_conn, 'seen_triplets') # Make sure that seen_triplets partitions are owned by dirbs_core_import_operator (they are supposed # to be). Previously migration scripts failed to set ownership correctly when tables were re-written # and they were incorrectly owned by dirbs_core_power_user. with utils.db_role_setter(db_conn, role_name='dirbs_core_power_user'): for p in child_table_names_list: cursor.execute( sql.SQL( 'ALTER TABLE {0} OWNER TO dirbs_core_import_operator' ).format(sql.Identifier(p))) for partition_name in child_table_names_list: logger.info( 'Populating daily_per_mno_hll_sketches from partition {0}...' 
.format(partition_name)) cursor.execute( sql.SQL( 'SELECT triplet_year, triplet_month FROM {0} LIMIT 1' ).format(sql.Identifier(partition_name))) res = cursor.fetchone() if res is None: # Table is empty continue year = res.triplet_year month = res.triplet_month days_in_month = monthrange(year, month)[1] triplet_sql_list = [] imei_sql_list = [] imsi_sql_list = [] msisdn_sql_list = [] imei_imsis_sql_list = [] imei_msisdns_sql_list = [] imsi_msisdns_sql_list = [] final_select_sql_list = [] hll_partition_name = 'hll_{0}'.format(partition_name) cursor.execute( sql.SQL( """CREATE TABLE {0} (PRIMARY KEY (data_date, operator_id), LIKE daily_per_mno_hll_sketches) INHERITS (daily_per_mno_hll_sketches); ALTER TABLE {0} OWNER TO dirbs_core_import_operator """).format( sql.Identifier(hll_partition_name))) aggregated_data_temp_table = 'temp_{0}'.format( hll_partition_name) base_query = sql.SQL( """CREATE TEMP TABLE {aggregated_data_temp_table_id} AS SELECT {select_sql} FROM {partition_tbl_id}""") for day in range(1, days_in_month + 1): day_literal = sql.Literal(day) triplet_sql_list.append( sql.SQL( """hll_add_agg(hll_hash_text(triplet_hash::TEXT)) FILTER(WHERE (date_bitmask & (1 << ({day_literal} - 1))) <> 0 AND imei_norm IS NOT NULL AND imsi IS NOT NULL AND msisdn IS NOT NULL) AS triplet_day{day_literal}""" ).format(day_literal=day_literal)) imei_sql_list.append( sql.SQL("""hll_add_agg(hll_hash_text(imei_norm)) FILTER(WHERE (date_bitmask & (1 << ({day_literal} - 1))) <> 0 AND imei_norm IS NOT NULL) AS imei_day{day_literal}""" ).format(day_literal=day_literal)) imsi_sql_list.append( sql.SQL("""hll_add_agg(hll_hash_text(imsi)) FILTER(WHERE (date_bitmask & (1 << ({day_literal} - 1))) <> 0 AND imsi IS NOT NULL) AS imsi_day{day_literal}""" ).format(day_literal=day_literal)) msisdn_sql_list.append( sql.SQL("""hll_add_agg(hll_hash_text(msisdn)) FILTER(WHERE (date_bitmask & (1 << ({day_literal} - 1))) <> 0 AND msisdn IS NOT NULL) AS msisdn_day{day_literal}""" ).format(day_literal=day_literal)) imei_imsis_sql_list.append( sql.SQL( """hll_add_agg(hll_hash_text(imei_norm||'$'||imsi)) FILTER(WHERE (date_bitmask & (1 << ({day_literal} - 1))) <> 0 AND imei_norm IS NOT NULL AND imsi IS NOT NULL) AS imei_imsis_day{day_literal}""" ).format(day_literal=day_literal)) imei_msisdns_sql_list.append( sql.SQL( """hll_add_agg(hll_hash_text(imei_norm||'$'||msisdn)) FILTER(WHERE (date_bitmask & (1 << ({day_literal} - 1))) <> 0 AND imei_norm IS NOT NULL AND msisdn IS NOT NULL ) AS imei_msisdns_day{day_literal}""" ).format(day_literal=day_literal)) imsi_msisdns_sql_list.append( sql.SQL( """hll_add_agg(hll_hash_text(imsi||'$'||msisdn)) FILTER(WHERE (date_bitmask & (1 << ({day_literal} - 1))) <> 0 AND imsi IS NOT NULL AND msisdn IS NOT NULL) AS imsi_msisdns_day{day_literal}""" ).format(day_literal=day_literal)) for sql_list in [ triplet_sql_list, imei_sql_list, imsi_sql_list, msisdn_sql_list, imei_imsis_sql_list, imei_msisdns_sql_list, imsi_msisdns_sql_list ]: final_select_sql_list.extend(sql_list) final_query = base_query \ .format(aggregated_data_temp_table_id=sql.Identifier(aggregated_data_temp_table), select_sql=sql.SQL(', ').join(final_select_sql_list), partition_tbl_id=sql.Identifier(partition_name)) cursor.execute(final_query) for day in range(1, days_in_month + 1): str_split = partition_name.split('_') op = str_split[2] job_start_time = datetime.now() day_literal = sql.Literal(day) cursor.execute( sql.SQL( """INSERT INTO {0} (data_date, operator_id, creation_date, triplet_hll, imei_hll, imsi_hll, msisdn_hll, imei_imsis_hll, 
imei_msisdns_hll, imsi_msisdns_hll) SELECT make_date(%s, %s, {day_literal}) AS data_date, %s AS operator_id, %s AS creation_date, CASE WHEN triplet_day{day_literal} IS NULL THEN hll_empty() ELSE triplet_day{day_literal} END AS triplet_hll, CASE WHEN imei_day{day_literal} IS NULL THEN hll_empty() ELSE imei_day{day_literal} END AS imei_hll, CASE WHEN imsi_day{day_literal} IS NULL THEN hll_empty() ELSE imsi_day{day_literal} END AS imsi_hll, CASE WHEN msisdn_day{day_literal} IS NULL THEN hll_empty() ELSE msisdn_day{day_literal} END AS msisdn_hll, CASE WHEN imei_imsis_day{day_literal} IS NULL THEN hll_empty() ELSE imei_imsis_day{day_literal} END AS imei_imsis_hll, CASE WHEN imei_msisdns_day{day_literal} IS NULL THEN hll_empty() ELSE imei_msisdns_day{day_literal} END AS imei_msisdns_hll, CASE WHEN imsi_msisdns_day{day_literal} IS NULL THEN hll_empty() ELSE imsi_msisdns_day{day_literal} END AS imsi_msisdns_hll FROM {1}""").format( sql.Identifier(hll_partition_name), sql.Identifier(aggregated_data_temp_table), day_literal=day_literal), [year, month, op, job_start_time]) logger.info( 'Populated daily_per_mno_hll_sketches from partition {0}' .format(partition_name)) logger.info( 'Populated daily_per_mno_hll_sketches from seen_triplets')
def upgrade(ctx): """ Upgrades the current DB schema to the version supported by this code using migration scripts. :param ctx: current cli context obj """ logger = logging.getLogger('dirbs.db') config = common.ensure_config(ctx) db_config = config.db_config needs_analyze = False with utils.create_db_connection(db_config) as conn: logger.info('Querying DB schema version for DB %s on host %s', db_config.database, db_config.host) with conn.cursor() as cur: try: version = utils.query_db_schema_version(conn) except ProgrammingError: logger.warn( 'Could not determine current schema version. Assuming no version' ) version = None if version is None: logger.error( 'DB currently not installed or version number could not be determined. Can\'t upgrade' ) sys.exit(1) if version < min_schema_version: logger.error( 'Current DB schema is older than DIRBS 4.0.0. Can\'t upgrade' ) sys.exit(1) if version > code_db_schema_version: logger.error('DB schema newer than code. Can\'t upgrade') sys.exit(1) if version != code_db_schema_version: logger.info('Upgrading DB schema from version %d to %d', version, code_db_schema_version) # If we're upgrading, make sure we schedule a full ANALYZE outside the transaction later needs_analyze = True # Set our role here so that new objects get created with dirbs_core_power_user as owner by default with utils.db_role_setter(conn, role_name='dirbs_core_power_user'): for old_version in range(version, code_db_schema_version): new_version = old_version + 1 # Check if there is a special migration class, otherwise use standard SQL file try: module_name = 'dirbs.schema_migrators.v{0}_upgrade'.format( new_version) module = importlib.import_module(module_name) logger.info('Running Python migration script: %s', module_name) migrator = module.migrator() migrator.upgrade(conn) except ImportError as ex: script_name = 'sql/migration_scripts/v{0:d}_upgrade.sql'.format( new_version) logger.info('Running SQL migration script: %s', script_name) sql = pkgutil.get_data('dirbs', script_name) cur.execute(sql) # We commit after every version upgrade utils.set_db_schema_version(conn, new_version) conn.commit() logger.info( 'Successfully updated schema - DB schema version is now %d', code_db_schema_version) # Can't do anything until we know the schema is the right version _store_job_metadata(config, 'upgrade') else: logger.info('DB schema is already latest version') # Schedule a full ANALYZE at the end of an upgrade if needs_analyze: logger.info( 'Running ANALYZE of entire database after upgrade...') cur.execute('ANALYZE') logger.info( 'Finished running ANALYZE of entire database after upgrade' )
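# Illustrative note (not in the original source): for each schema version step the loop above first tries
# to import a Python migrator module and falls back to a plain SQL script shipped with the package. For a
# hypothetical upgrade step to version 48 the lookup order would be:
#
#   dirbs.schema_migrators.v48_upgrade            -> module.migrator().upgrade(conn)
#   dirbs/sql/migration_scripts/v48_upgrade.sql   -> cur.execute(sql)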
def _repartition_pairing_list(self, conn, *, num_physical_shards):
    """Repartition pairing list to implement change in structure."""
    with conn.cursor() as cursor, utils.db_role_setter(conn, role_name='dirbs_core_power_user'):
        cursor.execute("""CREATE TABLE historic_pairing_list_new (
                              LIKE historic_pairing_list INCLUDING DEFAULTS
                                                         INCLUDING IDENTITY
                                                         INCLUDING CONSTRAINTS
                                                         INCLUDING STORAGE
                                                         INCLUDING COMMENTS
                          )
                          PARTITION BY RANGE (virt_imei_shard)
                       """)
        cursor.execute("""ALTER TABLE historic_pairing_list_new ADD COLUMN msisdn TEXT NOT NULL""")

        # grant permissions
        part_utils._grant_perms_pairing_list(conn, part_name='historic_pairing_list_new')

        # create child partitions
        part_utils.create_imei_shard_partitions(conn, tbl_name='historic_pairing_list_new',
                                                num_physical_shards=num_physical_shards,
                                                perms_func=part_utils._grant_perms_pairing_list,
                                                fillfactor=80)

        # copy data from original partition
        cursor.execute("""INSERT INTO historic_pairing_list_new
                               SELECT p.imei_norm, p.imsi, p.start_date, p.end_date, p.virt_imei_shard, m.msisdn
                                 FROM historic_pairing_list p
                           INNER JOIN monthly_network_triplets_country m ON p.imsi = m.imsi""")

        # add indexes
        idx_metadata = [part_utils.IndexMetadatum(idx_cols=['imei_norm', 'imsi', 'msisdn'],
                                                  is_unique=True,
                                                  partial_sql='WHERE end_date IS NULL')]
        part_utils.add_indices(conn, tbl_name='historic_pairing_list_new', idx_metadata=idx_metadata)

        # drop old views, tables, indexes and constraints
        cursor.execute('DROP VIEW pairing_list')
        cursor.execute('DROP TABLE historic_pairing_list CASCADE')
        part_utils.rename_table_and_indices(conn, old_tbl_name='historic_pairing_list_new',
                                            new_tbl_name='historic_pairing_list', idx_metadata=idx_metadata)

        # create new view and grant permissions
        cursor.execute("""CREATE VIEW pairing_list AS
                               SELECT imei_norm, imsi, msisdn, virt_imei_shard
                                 FROM historic_pairing_list
                                WHERE end_date IS NULL WITH CHECK OPTION""")
        cursor.execute("""GRANT SELECT ON pairing_list
                          TO dirbs_core_listgen, dirbs_core_report, dirbs_core_api,
                             dirbs_core_import_pairing_list""")

        # drop and recreate staging data insert trigger
        cursor.execute("""
            DROP FUNCTION pairing_list_staging_data_insert_trigger_fn() CASCADE;
            CREATE FUNCTION pairing_list_staging_data_insert_trigger_fn() RETURNS trigger
                LANGUAGE plpgsql
            AS $$
            BEGIN
                -- Clean/normalize data before inserting
                NEW.imei_norm = normalize_imei(NULLIF(TRIM(NEW.imei), ''));
                NEW.imsi = NULLIF(TRIM(NEW.imsi), '');
                NEW.msisdn = NULLIF(TRIM(NEW.msisdn), '');
                RETURN NEW;
            END
            $$;
        """)
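# The final statement above drops pairing_list_staging_data_insert_trigger_fn() with CASCADE,
# which also removes any trigger that referenced it, before recreating the function with the
# new msisdn handling. The sketch below shows how such a trigger could later be reattached to
# a staging table; the trigger name and the staging table argument are assumptions made for
# illustration, not taken from the migration itself.
from psycopg2 import sql


def recreate_staging_insert_trigger(conn, *, staging_tbl_name):
    """Reattach the row-normalising insert trigger to a (hypothetical) staging table."""
    with conn.cursor() as cursor:
        cursor.execute(
            sql.SQL("""CREATE TRIGGER pairing_list_staging_data_insert_trigger
                           BEFORE INSERT ON {0}
                           FOR EACH ROW
                           EXECUTE PROCEDURE pairing_list_staging_data_insert_trigger_fn()""")
            .format(sql.Identifier(staging_tbl_name)))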
def repartition_monthly_network_triplets(conn, *, num_physical_shards):
    """Function to repartition the monthly_network_triplets_country and monthly_network_triplets_per_mno tables."""
    with conn.cursor() as cursor, utils.db_role_setter(conn, role_name='dirbs_core_import_operator'):
        # Create parent partitions
        cursor.execute(
            """CREATE TABLE monthly_network_triplets_country_new (
                   LIKE monthly_network_triplets_country INCLUDING DEFAULTS
                                                         INCLUDING IDENTITY
                                                         INCLUDING CONSTRAINTS
                                                         INCLUDING STORAGE
                                                         INCLUDING COMMENTS
               )
               PARTITION BY RANGE (triplet_year, triplet_month)
            """
        )
        _grant_perms_monthly_network_triplets(conn, part_name='monthly_network_triplets_country_new')

        cursor.execute(
            """CREATE TABLE monthly_network_triplets_per_mno_new (
                   LIKE monthly_network_triplets_per_mno INCLUDING DEFAULTS
                                                         INCLUDING IDENTITY
                                                         INCLUDING CONSTRAINTS
                                                         INCLUDING STORAGE
                                                         INCLUDING COMMENTS
               )
               PARTITION BY LIST (operator_id)
            """
        )
        _grant_perms_monthly_network_triplets(conn, part_name='monthly_network_triplets_per_mno_new')

        # Work out what year-month tuples we have
        country_monthly_partitions = utils.child_table_names(conn, 'monthly_network_triplets_country')
        country_year_month_tuples = [(x.triplet_year, x.triplet_month)
                                     for x in utils.table_invariants_list(conn, country_monthly_partitions,
                                                                          ['triplet_year', 'triplet_month'])]

        operator_partitions = utils.child_table_names(conn, 'monthly_network_triplets_per_mno')
        operator_monthly_partitions = set()
        for op_partition in operator_partitions:
            operator_monthly_partitions.update(utils.child_table_names(conn, op_partition))
        mno_year_month_tuples = [(x.operator_id, x.triplet_year, x.triplet_month)
                                 for x in utils.table_invariants_list(conn, operator_monthly_partitions,
                                                                      ['operator_id', 'triplet_year',
                                                                       'triplet_month'])]

        latest_year_month = None
        # Sort year month tuples and get the maximum year month combination
        country_year_month_tuples = sorted(country_year_month_tuples, key=lambda x: (x[0], x[1]), reverse=True)
        if len(country_year_month_tuples) > 0:
            latest_year_month = country_year_month_tuples[0]

        # Create child partitions at country level
        for year, month in country_year_month_tuples:
            # Fillfactor is 45 for most recent month since it will likely still be updated. For older months we
            # pack tightly to ensure optimal usage of disk space and optimal scan performance
            latest_year, latest_month = latest_year_month
            fillfactor = 45 if year == latest_year and month == latest_month else 100
            create_monthly_network_triplets_country_partition(conn, month=month, year=year, suffix='_new',
                                                              num_physical_shards=num_physical_shards,
                                                              fillfactor=fillfactor)

        # Create child partitions at per-MNO level
        for op, year, month in mno_year_month_tuples:
            # Fillfactor is 45 for most recent month since it will likely still be updated. For older months we
            # pack tightly to ensure optimal usage of disk space and optimal scan performance
            latest_year, latest_month = latest_year_month
            fillfactor = 45 if year == latest_year and month == latest_month else 100
            create_monthly_network_triplets_per_mno_partition(conn, operator_id=op, month=month, year=year,
                                                              suffix='_new',
                                                              num_physical_shards=num_physical_shards,
                                                              fillfactor=fillfactor)

        # Populate country-level table from old table
        cursor.execute("""INSERT INTO monthly_network_triplets_country_new
                               SELECT *
                                 FROM monthly_network_triplets_country""")

        # Populate per-MNO-level table from old table
        cursor.execute("""INSERT INTO monthly_network_triplets_per_mno_new
                               SELECT *
                                 FROM monthly_network_triplets_per_mno""")

        # Add in indexes
        add_indices(conn, tbl_name='monthly_network_triplets_country_new',
                    idx_metadata=monthly_network_triplets_country_indices())
        add_indices(conn, tbl_name='monthly_network_triplets_per_mno_new',
                    idx_metadata=monthly_network_triplets_per_mno_indices())

        # Drop old tables
        cursor.execute('DROP TABLE monthly_network_triplets_country CASCADE')
        cursor.execute('DROP TABLE monthly_network_triplets_per_mno CASCADE')

        # Rename tables
        rename_table_and_indices(conn, old_tbl_name='monthly_network_triplets_country_new',
                                 new_tbl_name='monthly_network_triplets_country',
                                 idx_metadata=monthly_network_triplets_country_indices())
        rename_table_and_indices(conn, old_tbl_name='monthly_network_triplets_per_mno_new',
                                 new_tbl_name='monthly_network_triplets_per_mno',
                                 idx_metadata=monthly_network_triplets_per_mno_indices())

        cursor.execute("""CREATE OR REPLACE VIEW operator_data AS
                               SELECT sq.connection_date, sq.imei_norm, sq.imsi, sq.msisdn, sq.operator_id
                                 FROM (SELECT make_date(nt.triplet_year::integer,
                                                        nt.triplet_month::integer,
                                                        dom.dom) AS connection_date,
                                              nt.imei_norm,
                                              nt.imsi,
                                              nt.msisdn,
                                              nt.operator_id
                                         FROM generate_series(1, 31) dom(dom),
                                              monthly_network_triplets_per_mno nt
                                        WHERE (nt.date_bitmask & (1 << (dom.dom - 1))) <> 0) sq""")

        cursor.execute("""CREATE VIEW monthly_network_triplets_country_no_null_imeis AS
                               SELECT *
                                 FROM monthly_network_triplets_country
                                WHERE imei_norm IS NOT NULL""")
        cursor.execute("""CREATE VIEW monthly_network_triplets_per_mno_no_null_imeis AS
                               SELECT *
                                 FROM monthly_network_triplets_per_mno
                                WHERE imei_norm IS NOT NULL""")

        cursor.execute(sql.SQL('GRANT SELECT ON operator_data TO dirbs_core_base'))
        for role in ['dirbs_core_listgen', 'dirbs_core_classify', 'dirbs_core_report', 'dirbs_core_api']:
            cursor.execute(sql.SQL("""GRANT SELECT ON monthly_network_triplets_country_no_null_imeis
                                      TO {0}""").format(sql.Identifier(role)))
            cursor.execute(sql.SQL("""GRANT SELECT ON monthly_network_triplets_per_mno_no_null_imeis
                                      TO {0}""").format(sql.Identifier(role)))

        cursor.execute("""CREATE VIEW monthly_network_triplets_with_invalid_data_flags AS
                               SELECT nt.*,
                                      nt.imei_norm IS NULL AS is_null_imei,
                                      is_unclean_imei(nt.imei_norm) AS is_unclean_imei,
                                      nt.imsi IS NULL AS is_null_imsi,
                                      is_unclean_imsi(nt.imsi) AS is_unclean_imsi,
                                      nt.msisdn IS NULL AS is_null_msisdn
                                 FROM monthly_network_triplets_per_mno nt""")
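# Both partition-creation loops above apply the same rule: the most recent (year, month)
# gets fillfactor 45 because its rows are still being updated, while older months are
# packed at fillfactor 100. A small sketch of that rule factored into a helper follows;
# the helper name is illustrative and does not exist in the code above.
def _fillfactor_for_month(year, month, latest_year_month):
    """Return the fillfactor to use for a monthly triplets partition."""
    if latest_year_month is None:
        # No existing partitions to compare against; pack tightly by default
        return 100
    latest_year, latest_month = latest_year_month
    return 45 if (year, month) == (latest_year, latest_month) else 100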