Ejemplo n.º 1
0
    def upgrade(self, db_conn):
        """Overrides AbstractMigrator upgrade method.

        For every table name returned by utils.child_table_names for
        daily_per_mno_hll_sketches, all rows of that table are inserted into
        daily_per_mno_hll_sketches and the table is then dropped. The work runs
        with the dirbs_core_import_operator role set via utils.db_role_setter.
        """
        log = logging.getLogger('dirbs.db')
        copy_stmt = sql.SQL("""INSERT INTO daily_per_mno_hll_sketches
                                               SELECT *
                                                 FROM {partition_name_id}""")
        drop_stmt = sql.SQL("""DROP TABLE {partition_name_id}""")

        with db_conn.cursor() as cursor, utils.db_role_setter(
                db_conn, role_name='dirbs_core_import_operator'):
            partitions = utils.child_table_names(db_conn, 'daily_per_mno_hll_sketches')
            for part in partitions:
                # Quote the table name once; it is reused by both statements below.
                part_ident = sql.Identifier(part)

                # Move the rows into the parent table first ...
                log.info('Copying partition {0} into daily_per_mno_hll_sketches table...'.format(part))
                cursor.execute(copy_stmt.format(partition_name_id=part_ident))
                log.info('Copied partition {0} into daily_per_mno_hll_sketches table'.format(part))

                # ... and only then remove the now-copied table.
                log.info('Dropping daily_per_mno_hll_sketches partition {0}...'.format(part))
                cursor.execute(drop_stmt.format(partition_name_id=part_ident))
                log.info('Dropped partition {0}'.format(part))
Ejemplo n.º 2
0
def add_indices(conn, *, tbl_name, idx_metadata, if_not_exists=False):
    """Add indices to a table that may or may not be partitioned.

    A partitioned table is walked recursively through its child tables; each
    non-partitioned table reached this way (or tbl_name itself, when it is not
    partitioned) gets the indices via _add_indices_to_single_shard.
    """
    if utils.is_table_partitioned(conn, tbl_name):
        # Descend: indices are only ever created on non-partitioned tables.
        for child in utils.child_table_names(conn, tbl_name):
            add_indices(conn, tbl_name=child, idx_metadata=idx_metadata, if_not_exists=if_not_exists)
        return

    _add_indices_to_single_shard(conn, part_name=tbl_name, idx_metadata=idx_metadata,
                                 if_not_exists=if_not_exists)
Ejemplo n.º 3
0
    def upgrade(self, db_conn):
        """Overrides AbstractMigrator upgrade method.

        Adds a last_seen date column to seen_imeis, fills it in for each child
        table (from seen_triplets_no_null_imeis where possible, otherwise from
        first_seen) and finally makes the column NOT NULL without a default.
        """
        log = logging.getLogger('dirbs.db')
        probe_stmt = sql.SQL("""SELECT operator_id FROM {0} LIMIT 1""")
        with db_conn.cursor() as cursor:
            # _create_seen_imeis_partition creates a table LIKE seen_imeis that needs to have the new col as well
            partitions = utils.child_table_names(db_conn, 'seen_imeis')
            log.info('Adding last_seen date to all partitions of seen_imeis table: '
                     '{0}'.format(', '.join(partitions)))

            cursor.execute("""ALTER TABLE seen_imeis ADD COLUMN last_seen date DEFAULT NULL""")

            for part in partitions:
                part_ident = sql.Identifier(part)
                log.info('Adding last_seen date value to table {0}...'.format(part))

                # Read one row to learn which operator_id this table holds; no row means the table is empty.
                cursor.execute(probe_stmt.format(part_ident))
                first_row = cursor.fetchone()

                if not first_row:
                    log.info('Skipped setting last_seen values in partition {0} as is empty'.format(part))
                else:
                    operator_id = first_row.operator_id
                    log.info('Setting last_seen value to max seen_date in partition {0}...'.format(part))

                    backfill_stmt = sql.SQL("""UPDATE {0} s1
                                                 SET last_seen = s2.max_seen
                                                FROM
                                                     (SELECT imei_norm, MAX(last_seen) AS max_seen
                                                        FROM seen_triplets_no_null_imeis
                                                       WHERE operator_id = %s
                                                    GROUP BY imei_norm)s2
                                               WHERE s1.imei_norm = s2.imei_norm""")
                    cursor.execute(backfill_stmt.format(part_ident), [operator_id])

                    log.info('Set last_seen value to max last_seen date')

                # Rows still without a value fall back to first_seen so that NOT NULL can be applied afterwards.
                log.info('Setting any last_seen values that are NULL to the first_seen value in partition {0}'
                         '...'.format(part))

                fallback_stmt = sql.SQL("""UPDATE {0}
                                             SET last_seen = first_seen
                                           WHERE last_seen IS NULL""")
                cursor.execute(fallback_stmt.format(part_ident))

                log.info('Set any last_seen values that are NULL to the first_seen value')
                log.info('Added last_seen date value to table {0}'.format(part))

            cursor.execute("""ALTER TABLE seen_imeis ALTER COLUMN last_seen DROP DEFAULT""")
            cursor.execute("""ALTER TABLE seen_imeis ALTER COLUMN last_seen SET NOT NULL""")
            log.info('Added last_seen date to all partitions of seen_imeis table')
Ejemplo n.º 4
0
    def upgrade(self, db_conn):
        """Overrides AbstractMigrator upgrade method.

        Creates an index on the msisdn column of every table returned by
        utils.child_table_names for seen_triplets. Each index is named after
        its table with a '_msisdn_idx' suffix.
        """
        log = logging.getLogger('dirbs.db')
        create_idx_stmt = sql.SQL("""CREATE INDEX {0} ON {1}(msisdn);""")
        with db_conn.cursor() as cursor:
            partitions = utils.child_table_names(db_conn, 'seen_triplets')
            log.info('Adding index to all partitions of seen_triplets table: '
                     '{0}'.format(', '.join(partitions)))

            for part in partitions:
                log.info('Adding MSISDN index to table {0}...'.format(part))
                idx_ident = sql.Identifier(part + '_msisdn_idx')
                tbl_ident = sql.Identifier(part)
                cursor.execute(create_idx_stmt.format(idx_ident, tbl_ident))
                log.info('Added MSISDN index to table {0}'.format(part))

            log.info('Added index to all partitions of seen_triplets table')
Ejemplo n.º 5
0
def repartition_exceptions_lists(conn, *, num_physical_shards, src_filter_sql=None):
    """Rebuild the exceptions_lists table under a new partition layout.

    A replacement table, exceptions_lists_new, is created LIKE the current one
    and partitioned by operator_id. It is populated from exceptions_lists,
    indexed, and then renamed over the original, which is dropped.

    Arguments:
        conn: database connection
        num_physical_shards: forwarded to create_per_mno_lists_partition for every operator
        src_filter_sql: optional raw SQL text appended after the copy statement
                        (presumably a WHERE clause -- confirm against callers)
    """
    with conn.cursor() as cursor, utils.db_role_setter(conn, role_name='dirbs_core_listgen'):
        # Replacement parent table
        cursor.execute(
            """CREATE TABLE exceptions_lists_new (
                   LIKE exceptions_lists INCLUDING DEFAULTS
                                         INCLUDING IDENTITY
                                         INCLUDING CONSTRAINTS
                                         INCLUDING STORAGE
                                         INCLUDING COMMENTS
               )
               PARTITION BY LIST (operator_id)
            """
        )
        _grant_perms_list(conn, part_name='exceptions_lists_new')

        # The operators are derived from the operator_id invariant of the existing child tables
        existing_partitions = utils.child_table_names(conn, 'exceptions_lists')
        invariant_rows = utils.table_invariants_list(conn, existing_partitions, ['operator_id'])
        operator_ids = [row.operator_id for row in invariant_rows]

        # One child partition per operator underneath the new parent
        for operator_id in operator_ids:
            new_part_name = per_mno_lists_partition(operator_id=operator_id, suffix='_new',
                                                    list_type='exceptions')
            create_per_mno_lists_partition(conn, parent_tbl_name='exceptions_lists_new',
                                           tbl_name=new_part_name, operator_id=operator_id,
                                           num_physical_shards=num_physical_shards)

        # Copy the rows across, optionally restricted by the caller-supplied SQL fragment
        copy_sql = sql.SQL("""INSERT INTO exceptions_lists_new
                                   SELECT *
                                     FROM exceptions_lists""")
        if src_filter_sql is not None:
            copy_sql = sql.SQL('{0} {1}').format(copy_sql, sql.SQL(src_filter_sql))
        cursor.execute(copy_sql)

        # Indices are added only after the data has been loaded
        add_indices(conn, tbl_name='exceptions_lists_new', idx_metadata=exceptions_lists_indices())

        # Hand the sequence over to the new table before the old table is dropped
        cursor.execute('ALTER SEQUENCE exceptions_lists_row_id_seq OWNED BY exceptions_lists_new.row_id')
        cursor.execute('DROP TABLE exceptions_lists CASCADE')

        # Give the new table (and its indices) the original names
        rename_table_and_indices(conn, old_tbl_name='exceptions_lists_new',
                                 new_tbl_name='exceptions_lists', idx_metadata=exceptions_lists_indices())
Ejemplo n.º 6
0
def _queue_add_indices_parallel_job(conn, executor, db_config, *, tbl_name, idx_metadata, if_not_exists=False):
    """Submit index-creation jobs to executor and return the resulting futures.

    For a non-partitioned table one _add_indices_parallel_single_job is
    submitted per entry in idx_metadata. For a partitioned table the function
    recurses into each child table and returns all futures collected there.
    """
    if utils.is_table_partitioned(conn, tbl_name):
        collected = []
        for child in utils.child_table_names(conn, tbl_name):
            collected += _queue_add_indices_parallel_job(conn,
                                                         executor,
                                                         db_config,
                                                         tbl_name=child,
                                                         idx_metadata=idx_metadata,
                                                         if_not_exists=if_not_exists)
        return collected

    # Leaf table: one job per index definition.
    return [
        executor.submit(_add_indices_parallel_single_job,
                        db_config,
                        tbl_name=tbl_name,
                        idx_metadatum=metadatum,
                        if_not_exists=if_not_exists)
        for metadatum in idx_metadata
    ]
Ejemplo n.º 7
0
def rename_table_and_indices(conn, *, old_tbl_name, new_tbl_name, idx_metadata=None):
    """Rename a table that may be partitioned, together with the indices on its leaf tables.

    The table itself is renamed first. If it is not partitioned, every index
    described by idx_metadata is renamed from its old-table name to its
    new-table name. Otherwise each child table is renamed recursively, with the
    old table-name prefix swapped for the new one.
    """
    if idx_metadata is None:
        idx_metadata = []

    rename_tbl = sql.SQL('ALTER TABLE {0} RENAME TO {1}')
    rename_idx = sql.SQL('ALTER INDEX {0} RENAME TO {1}')

    with conn.cursor() as cursor:
        cursor.execute(rename_tbl.format(sql.Identifier(old_tbl_name), sql.Identifier(new_tbl_name)))

        if utils.is_table_partitioned(conn, new_tbl_name):
            prefix_len = len(old_tbl_name)
            for child in utils.child_table_names(conn, new_tbl_name):
                # Child tables should start with the old table name
                assert child.startswith(old_tbl_name)
                renamed_child = new_tbl_name + child[prefix_len:]
                rename_table_and_indices(conn, old_tbl_name=child, new_tbl_name=renamed_child,
                                         idx_metadata=idx_metadata)
        else:
            for metadatum in idx_metadata:
                idx_before = metadatum.idx_name(old_tbl_name)
                idx_after = metadatum.idx_name(new_tbl_name)
                cursor.execute(rename_idx.format(sql.Identifier(idx_before), sql.Identifier(idx_after)))
Ejemplo n.º 8
0
def _validate_data_partitions(config: callable, conn: callable, month: int,
                              year: int, logger: callable,
                              disable_data_check: bool) -> None:
    """
    Check that partitions exist for exactly the configured operators.

    Three checks run in order: per-operator partitions of
    monthly_network_triplets_per_mno, the per-operator partitions for the
    requested year/month, and the existence of the country-level partition for
    that year/month. A failed check is logged as a warning when
    disable_data_check is set; otherwise it is logged as an error and raised.

    Arguments:
        config: DIRBS config object
        conn: DIRBS postgresql connection object
        month: data partition month
        year: data partition year
        logger: DIRBS logger object
        disable_data_check: boolean to disable data check
    Returns:
        None
    Raises:
        MissingOperatorDataException: if monthly_network_triplets_per_mno partition is missing for any operator
        ExtraOperatorDataException: if monthly_network_triplets_per_mno partition is detected for unconfigured mno,
                                    or if the monthly_network_triplets_country partition does not exist
    """
    def _report(message, exc_type):
        # Shared failure path: only warn when checks are disabled, otherwise log the error and raise.
        if disable_data_check:
            logger.warning(message)
        else:
            logger.error(message)
            raise exc_type(message)

    def _leading_fields(rows):
        # First field (the operator id) of each row, comma separated, for the messages.
        return ', '.join([row[0] for row in rows])

    operators = config.region_config.operators
    assert len(operators) > 0

    # --- Check 1: one per-MNO partition per configured operator, and no others ---
    operator_partitions = utils.child_table_names(
        conn, 'monthly_network_triplets_per_mno')
    observed_operator_ids = set(
        utils.table_invariants_list(conn, operator_partitions, ['operator_id']))
    required_operator_ids = {(op.id, ) for op in operators}

    missing_operator_ids = required_operator_ids - observed_operator_ids
    if missing_operator_ids:
        _report(
            'Missing monthly_network_triplets_per_mno partitions for operators: {0}'
            .format(_leading_fields(missing_operator_ids)),
            exceptions.MissingOperatorDataException)

    extra_operator_ids = observed_operator_ids - required_operator_ids
    if extra_operator_ids:
        _report(
            'Extra monthly_network_triplets_per_mno partitions detected for unconfigured operators: {0}'
            .format(_leading_fields(extra_operator_ids)),
            exceptions.ExtraOperatorDataException)

    # --- Check 2: the same comparison, restricted to the requested year/month ---
    operator_monthly_partitions = {
        child
        for op_partition in operator_partitions
        for child in utils.child_table_names(conn, op_partition)
    }
    observed_invariants = {
        row
        for row in utils.table_invariants_list(
            conn, operator_monthly_partitions,
            ['operator_id', 'triplet_year', 'triplet_month'])
        if row.triplet_year == year and row.triplet_month == month
    }
    required_invariants = {(op.id, year, month) for op in operators}

    missing_invariants = required_invariants - observed_invariants
    if missing_invariants:
        _report(
            'Missing monthly_network_triplets_per_mno partitions for the requested reporting '
            'month for the following configured operators: {0}'
            .format(_leading_fields(missing_invariants)),
            exceptions.MissingOperatorDataException)

    extra_invariants = observed_invariants - required_invariants
    if extra_invariants:
        _report(
            'Extra monthly_network_triplets_per_mno partitions detected for the requested '
            'reporting month for the following unconfigured operators: {0}'
            .format(_leading_fields(extra_invariants)),
            exceptions.ExtraOperatorDataException)

    # --- Check 3: the country-level partition for the month must exist ---
    country_imei_shard_name = partition_utils.monthly_network_triplets_country_partition(
        month=month, year=year)
    with conn.cursor() as cursor:
        cursor.execute(utils.table_exists_sql(), [country_imei_shard_name])
        partition_exists = cursor.fetchone()[0]
        if not partition_exists:
            # NOTE(review): a *missing* partition is reported with ExtraOperatorDataException here;
            # kept as-is since callers may depend on the type -- confirm whether this is intended.
            _report(
                'Missing monthly_network_triplets_country partition for year and month',
                exceptions.ExtraOperatorDataException)
Ejemplo n.º 9
0
    def upgrade(self, db_conn):  # noqa: C901
        """Overrides AbstractMigrator upgrade method."""
        logger = logging.getLogger('dirbs.db')
        with db_conn.cursor() as cursor:
            cursor.execute(
                """CREATE FUNCTION calc_virt_imei_shard(imei TEXT) RETURNS SMALLINT
                              LANGUAGE plpgsql IMMUTABLE PARALLEL SAFE
                              AS $$
                              BEGIN
                                  RETURN SUBSTRING(COALESCE(imei, ''), 13, 2)::SMALLINT;
                              EXCEPTION WHEN OTHERS THEN
                                  RETURN 0;
                              END;
                              $$""")

            # By default, create 4 shards
            num_initial_shards = 4

            logger.info('Re-partitioning classification_state table...')
            cursor.execute(
                'ALTER TABLE classification_state ADD COLUMN virt_imei_shard SMALLINT'
            )
            cursor.execute(
                'UPDATE classification_state SET virt_imei_shard = calc_virt_imei_shard(imei_norm)'
            )
            cursor.execute(
                'ALTER TABLE classification_state ALTER COLUMN virt_imei_shard SET NOT NULL'
            )
            part_utils.repartition_classification_state(
                db_conn, num_physical_shards=num_initial_shards)
            logger.info('Re-partitioned classification_state table')

            logger.info('Re-partitioning registration_list table...')
            cursor.execute(
                'ALTER TABLE historic_registration_list ADD COLUMN virt_imei_shard SMALLINT'
            )
            cursor.execute(
                'UPDATE historic_registration_list SET virt_imei_shard = calc_virt_imei_shard(imei_norm)'
            )
            cursor.execute(
                'ALTER TABLE historic_registration_list ALTER COLUMN virt_imei_shard SET NOT NULL'
            )
            self.partition_registration_list(
                db_conn, num_physical_shards=num_initial_shards)
            logger.info('Re-partitioned registration_list table')

            logger.info('Re-partitioning pairing_list table...')
            cursor.execute(
                'ALTER TABLE historic_pairing_list ADD COLUMN virt_imei_shard SMALLINT'
            )
            cursor.execute(
                'UPDATE historic_pairing_list SET virt_imei_shard = calc_virt_imei_shard(imei_norm)'
            )
            cursor.execute(
                'ALTER TABLE historic_pairing_list ALTER COLUMN virt_imei_shard SET NOT NULL'
            )
            part_utils.repartition_pairing_list(
                db_conn, num_physical_shards=num_initial_shards)
            logger.info('Re-partitioned pairing_list table')

            logger.info('Re-partitioning blacklist table...')
            cursor.execute(
                'ALTER TABLE blacklist ADD COLUMN virt_imei_shard SMALLINT')
            cursor.execute(
                'UPDATE blacklist SET virt_imei_shard = calc_virt_imei_shard(imei_norm)'
            )
            cursor.execute(
                'ALTER TABLE blacklist ALTER COLUMN virt_imei_shard SET NOT NULL'
            )
            part_utils.repartition_blacklist(
                db_conn, num_physical_shards=num_initial_shards)
            logger.info('Re-partitioned blacklist table')

            # Need to make sure owner of list tables is dirbs_core_listgen
            logger.info('Re-partitioning notifications_lists table...')
            # The original notifications_lists were not created with a single sequence for the IDs, so just do now
            with utils.db_role_setter(db_conn, role_name='dirbs_core_listgen'):
                cursor.execute(
                    """CREATE UNLOGGED TABLE notifications_lists_new (
                           row_id BIGSERIAL NOT NULL,
                           operator_id TEXT NOT NULL,
                           imei_norm TEXT NOT NULL,
                           imsi TEXT NOT NULL,
                           msisdn TEXT NOT NULL,
                           block_date DATE NOT NULL,
                           reasons TEXT[] NOT NULL,
                           amnesty_granted BOOLEAN DEFAULT FALSE NOT NULL,
                           start_run_id BIGINT NOT NULL,
                           end_run_id BIGINT,
                           delta_reason TEXT NOT NULL CHECK (delta_reason IN ('new', 'resolved', 'blacklisted',
                                                                              'no_longer_seen', 'changed')),
                           virt_imei_shard SMALLINT NOT NULL
                       ) PARTITION BY LIST (operator_id)
                    """)

            # Work out who the operators are
            partitions = utils.child_table_names(db_conn,
                                                 'notifications_lists')
            # Make sure that they are owned by dirbs_core_listgen (they can be owner by dirbs_core_power_user)
            # due to bad previous migration scripts
            with utils.db_role_setter(db_conn,
                                      role_name='dirbs_core_power_user'):
                for p in partitions:
                    cursor.execute(
                        sql.SQL('ALTER TABLE {0} OWNER TO dirbs_core_listgen').
                        format(sql.Identifier(p)))

            operators = [
                x.operator_id for x in utils.table_invariants_list(
                    db_conn, partitions, ['operator_id'])
            ]

            # Create operator child partitions
            for op_id in operators:
                tbl_name = part_utils.per_mno_lists_partition(
                    operator_id=op_id,
                    suffix='_new',
                    list_type='notifications')
                part_utils.create_per_mno_lists_partition(
                    db_conn,
                    operator_id=op_id,
                    parent_tbl_name='notifications_lists_new',
                    tbl_name=tbl_name,
                    num_physical_shards=1,
                    unlogged=True,
                    fillfactor=100)

            cursor.execute(
                """INSERT INTO notifications_lists_new(operator_id, imei_norm, imsi, msisdn, block_date,
                                                       reasons, start_run_id, end_run_id, delta_reason,
                                                       virt_imei_shard)
                        SELECT operator_id, imei_norm, imsi, msisdn, block_date,
                               reasons, start_run_id, end_run_id, delta_reason, calc_virt_imei_shard(imei_norm)
                          FROM notifications_lists
                """)
            # Drop old table, rename tables, indexes and constraints
            cursor.execute("""ALTER TABLE notifications_lists_new
                              RENAME CONSTRAINT notifications_lists_new_delta_reason_check
                              TO notifications_lists_delta_reason_check""")
            cursor.execute('DROP TABLE notifications_lists CASCADE')
            cursor.execute("""ALTER SEQUENCE notifications_lists_new_row_id_seq
                              RENAME TO notifications_lists_row_id_seq""")
            part_utils.rename_table_and_indices(
                db_conn,
                old_tbl_name='notifications_lists_new',
                new_tbl_name='notifications_lists')
            part_utils.repartition_notifications_lists(
                db_conn, num_physical_shards=num_initial_shards)
            logger.info('Re-partitioned notifications_lists table')

            logger.info('Re-partitioning exceptions_lists table...')
            # The original exceptions_lists were not created with a single sequence for the IDs, so just do now
            with utils.db_role_setter(db_conn, role_name='dirbs_core_listgen'):
                cursor.execute("""CREATE UNLOGGED TABLE exceptions_lists_new (
                           row_id BIGSERIAL NOT NULL,
                           operator_id TEXT NOT NULL,
                           imei_norm TEXT NOT NULL,
                           imsi TEXT NOT NULL,
                           start_run_id BIGINT NOT NULL,
                           end_run_id BIGINT,
                           delta_reason TEXT NOT NULL CHECK (delta_reason IN ('added', 'removed')),
                           virt_imei_shard SMALLINT NOT NULL
                       ) PARTITION BY LIST (operator_id)
                    """)
            # Work out who the operators are
            partitions = utils.child_table_names(db_conn, 'exceptions_lists')
            # Make sure that they are owned by dirbs_core_listgen (they can be owner by dirbs_core_power_user)
            # due to bad previous migration scripts
            with utils.db_role_setter(db_conn,
                                      role_name='dirbs_core_power_user'):
                for p in partitions:
                    cursor.execute(
                        sql.SQL('ALTER TABLE {0} OWNER TO dirbs_core_listgen').
                        format(sql.Identifier(p)))
            operators = [
                x.operator_id for x in utils.table_invariants_list(
                    db_conn, partitions, ['operator_id'])
            ]

            # Create operator child partitions
            for op_id in operators:
                tbl_name = part_utils.per_mno_lists_partition(
                    operator_id=op_id, suffix='_new', list_type='exceptions')
                part_utils.create_per_mno_lists_partition(
                    db_conn,
                    operator_id=op_id,
                    parent_tbl_name='exceptions_lists_new',
                    tbl_name=tbl_name,
                    num_physical_shards=1,
                    unlogged=True,
                    fillfactor=100)

            cursor.execute(
                """INSERT INTO exceptions_lists_new(operator_id, imei_norm, imsi, start_run_id,
                                                    end_run_id, delta_reason, virt_imei_shard)
                        SELECT operator_id, imei_norm, imsi, start_run_id, end_run_id, delta_reason,
                               calc_virt_imei_shard(imei_norm)
                          FROM exceptions_lists
                """)
            # Drop old table, rename tables, indexes and constraints
            cursor.execute("""ALTER TABLE exceptions_lists_new
                              RENAME CONSTRAINT exceptions_lists_new_delta_reason_check
                              TO exceptions_lists_delta_reason_check""")
            cursor.execute('DROP TABLE exceptions_lists CASCADE')
            cursor.execute(
                'ALTER SEQUENCE exceptions_lists_new_row_id_seq RENAME TO exceptions_lists_row_id_seq'
            )
            part_utils.rename_table_and_indices(
                db_conn,
                old_tbl_name='exceptions_lists_new',
                new_tbl_name='exceptions_lists')
            part_utils.repartition_exceptions_lists(
                db_conn, num_physical_shards=num_initial_shards)
            logger.info('Re-partitioned exceptions_lists table')

            logger.info('Re-partitioning seen_imeis (network_imeis) table')
            # First, just put everything in a temporary table so that we can call partutils
            with utils.db_role_setter(db_conn,
                                      role_name='dirbs_core_import_operator'):
                cursor.execute("""CREATE UNLOGGED TABLE network_imeis (
                           first_seen DATE NOT NULL,
                           last_seen DATE NOT NULL,
                           seen_rat_bitmask INTEGER,
                           imei_norm TEXT NOT NULL,
                           virt_imei_shard SMALLINT NOT NULL
                       )
                    """)
            #
            # We disable index scans here as doing a merge append with index scans is much slower and involves
            # a lot of seeks which kills performance on non-SSD drives. Better to use an append plan and sort
            # the results by imei_norm
            #
            cursor.execute('SET enable_indexscan = false')
            cursor.execute("""INSERT INTO network_imeis
                        SELECT MIN(first_seen),
                               MAX(last_seen),
                               bit_or(seen_rat_bitmask),
                               imei_norm,
                               calc_virt_imei_shard(imei_norm)
                          FROM seen_imeis
                      GROUP BY imei_norm
                """)
            cursor.execute('SET enable_indexscan = true')
            part_utils.repartition_network_imeis(
                db_conn, num_physical_shards=num_initial_shards)
            cursor.execute('DROP TABLE seen_imeis CASCADE')
            logger.info('Re-partitioned seen_imeis (network_imeis) table')

            # First, just put all country-level triplets in a temporary table so that we can call partition_utils
            with utils.db_role_setter(db_conn,
                                      role_name='dirbs_core_import_operator'):
                cursor.execute(
                    """CREATE UNLOGGED TABLE monthly_network_triplets_country (
                           triplet_year SMALLINT NOT NULL,
                           triplet_month SMALLINT NOT NULL,
                           first_seen DATE NOT NULL,
                           last_seen DATE NOT NULL,
                           date_bitmask INTEGER NOT NULL,
                           triplet_hash UUID NOT NULL,
                           imei_norm TEXT,
                           imsi TEXT,
                           msisdn TEXT,
                           virt_imei_shard SMALLINT NOT NULL,
                           CHECK (last_seen >= first_seen),
                           CHECK (EXTRACT(month FROM last_seen) = triplet_month AND
                                  EXTRACT(year FROM last_seen) = triplet_year),
                           CHECK (EXTRACT(month FROM first_seen) = triplet_month AND
                                  EXTRACT(year FROM first_seen) = triplet_year)
                       ) PARTITION BY RANGE (triplet_year, triplet_month)
                    """)

            # Work out what partitions to create and create them
            partitions = utils.child_table_names(db_conn, 'seen_triplets')
            # Make sure that they are owned by dirbs_core_import_operator (they can be owner by dirbs_core_power_user)
            # due to bad previous migration scripts
            with utils.db_role_setter(db_conn,
                                      role_name='dirbs_core_power_user'):
                for p in partitions:
                    cursor.execute(
                        sql.SQL(
                            'ALTER TABLE {0} OWNER TO dirbs_core_import_operator'
                        ).format(sql.Identifier(p)))

            year_month_tuples = {
                (x.triplet_year, x.triplet_month)
                for x in utils.table_invariants_list(
                    db_conn, partitions, ['triplet_year', 'triplet_month'])
            }
            for year, month in year_month_tuples:
                part_utils.create_monthly_network_triplets_country_partition(
                    db_conn, month=month, year=year, num_physical_shards=1)

            with utils.db_role_setter(db_conn,
                                      role_name='dirbs_core_import_operator'):
                cursor.execute(
                    """CREATE UNLOGGED TABLE monthly_network_triplets_per_mno (
                            LIKE monthly_network_triplets_country INCLUDING ALL,
                            operator_id TEXT NOT NULL
                       ) PARTITION BY LIST (operator_id)
                    """)

            # Work out what partitions to create and create them
            op_year_month_tuples = {
                (x.operator_id, x.triplet_year, x.triplet_month)
                for x in utils.table_invariants_list(
                    db_conn, partitions,
                    ['operator_id', 'triplet_year', 'triplet_month'])
            }
            # Create child partitions at per-MNO level
            for op, year, month in op_year_month_tuples:
                part_utils.create_monthly_network_triplets_per_mno_partition(
                    db_conn,
                    operator_id=op,
                    month=month,
                    year=year,
                    num_physical_shards=1)

            # Create temporary monthly_network_triplets_per_mno table
            for year, month in year_month_tuples:
                logger.info(
                    'Generating temporary monthly_network_triplets_per_mno entries for {0:02d}/{1:d}...'
                    .format(month, year))
                cursor.execute(
                    """INSERT INTO monthly_network_triplets_per_mno
                            SELECT %(year)s,
                                   %(month)s,
                                   first_seen,
                                   last_seen,
                                   date_bitmask,
                                   triplet_hash,
                                   imei_norm,
                                   imsi,
                                   msisdn,
                                   calc_virt_imei_shard(imei_norm),
                                   operator_id
                              FROM seen_triplets
                             WHERE triplet_year = %(year)s
                               AND triplet_month = %(month)s
                    """, {
                        'year': year,
                        'month': month
                    })
                logger.info(
                    'Generated temporary monthly_network_triplets_per_mno entries for {0:02d}/{1:d}'
                    .format(month, year))

            # Create temporary monthly_network_triplets_country table. We need to do this monthly as we need
            # to aggregate by triplets on a monthly basis
            #
            # We disable index scans here as doing a merge append with index scans is much slower and involves
            # a lot of seeks which kills performance on non-SSD drives. Better to use an append plan and sort
            # the results by imei_norm
            #
            cursor.execute('SET enable_indexscan = false')
            for year, month in year_month_tuples:
                logger.info(
                    'Generating temporary monthly_network_triplets_country entries for {0:02d}/{1:d}...'
                    .format(month, year))
                cursor.execute(
                    """INSERT INTO monthly_network_triplets_country
                            SELECT %(year)s,
                                   %(month)s,
                                   MIN(first_seen),
                                   MAX(last_seen),
                                   bit_or(date_bitmask),
                                   triplet_hash,
                                   FIRST(imei_norm),
                                   FIRST(imsi),
                                   FIRST(msisdn),
                                   calc_virt_imei_shard(FIRST(imei_norm))
                              FROM seen_triplets
                             WHERE triplet_year = %(year)s
                               AND triplet_month = %(month)s
                          GROUP BY triplet_hash
                    """, {
                        'year': year,
                        'month': month
                    })
                logger.info(
                    'Generated temporary monthly_network_triplets_country entries for {0:02d}/{1:d}'
                    .format(month, year))
            cursor.execute('SET enable_indexscan = true')

            logger.info(
                'Re-partitioning temporary monthly_network_triplets tables...')
            # Previously, the operator_data view was owned by dirbs_core_power_user but is now owned by the
            # dirbs_core_import_operator since it must be re-created
            with utils.db_role_setter(db_conn,
                                      role_name='dirbs_core_power_user'):
                cursor.execute(
                    'ALTER VIEW operator_data OWNER TO dirbs_core_import_operator'
                )
            part_utils.repartition_monthly_network_triplets(
                db_conn, num_physical_shards=num_initial_shards)
            cursor.execute('DROP TABLE seen_triplets CASCADE')
            logger.info(
                'Re-partitioned temporary monthly_network_triplets tables')

            # Replace list generation function to include virt_imei_shard
            cursor.execute("""
                DROP FUNCTION gen_blacklist(run_id BIGINT);
                DROP FUNCTION gen_notifications_list(op_id TEXT, run_id BIGINT);
                DROP FUNCTION gen_exceptions_list(op_id TEXT, run_id BIGINT);

                --
                -- Create function to generate a full blacklist for a given run_id. A value of -1 means get the latest
                -- list.
                --
                CREATE FUNCTION gen_blacklist(run_id BIGINT = -1)
                    RETURNS TABLE (
                        imei_norm       TEXT,
                        virt_imei_shard SMALLINT,
                        block_date      DATE,
                        reasons         TEXT[]
                    )
                    LANGUAGE plpgsql STRICT STABLE PARALLEL SAFE
                    AS $$
                DECLARE
                    query_run_id    BIGINT;
                BEGIN
                    --
                    -- If we don't specify a run_id, just set to the maximum run_id which will always return all rows
                    -- where end_run_id is NULL
                    --
                    IF run_id = -1 THEN
                        run_id := max_bigint();
                    END IF;

                    RETURN QUERY SELECT bl.imei_norm,
                                        bl.virt_imei_shard,
                                        bl.block_date,
                                        bl.reasons
                                   FROM blacklist bl
                                  WHERE bl.delta_reason != 'unblocked'
                                    AND run_id >= bl.start_run_id
                                    AND (run_id < bl.end_run_id OR bl.end_run_id IS NULL);
                END
                $$;

                --
                -- Create function to generate a full notifications_list for a given run_id and operator ID. A value
                -- of -1 means get the latest list.
                --
                CREATE FUNCTION gen_notifications_list(op_id TEXT, run_id BIGINT = -1)
                    RETURNS TABLE (
                        imei_norm       TEXT,
                        virt_imei_shard SMALLINT,
                        imsi            TEXT,
                        msisdn          TEXT,
                        block_date      DATE,
                        reasons         TEXT[],
                        amnesty_granted BOOLEAN
                    )
                    LANGUAGE plpgsql STRICT STABLE PARALLEL SAFE
                    AS $$
                BEGIN
                    --
                    -- If we don't specify a run_id, just set to the maximum run_id which will always return all rows
                    -- where end_run_id is NULL
                    --
                    IF run_id = -1 THEN
                        run_id := max_bigint();
                    END IF;

                    RETURN QUERY SELECT nl.imei_norm,
                                        nl.virt_imei_shard,
                                        nl.imsi,
                                        nl.msisdn,
                                        nl.block_date,
                                        nl.reasons,
                                        nl.amnesty_granted
                                   FROM notifications_lists nl
                                  WHERE nl.operator_id = op_id
                                    AND nl.delta_reason NOT IN ('resolved', 'blacklisted')
                                    AND run_id >= nl.start_run_id
                                    AND (run_id < nl.end_run_id OR nl.end_run_id IS NULL);
                END
                $$;

                --
                -- Create function to generate a full exceptions_list for a given run_id and operator ID. A value
                -- of -1 means get the latest list.
                --
                CREATE FUNCTION gen_exceptions_list(op_id TEXT, run_id BIGINT = -1)
                    RETURNS TABLE (
                        imei_norm       TEXT,
                        virt_imei_shard SMALLINT,
                        imsi            TEXT
                    )
                    LANGUAGE plpgsql STRICT STABLE PARALLEL SAFE
                    AS $$
                BEGIN
                    --
                    -- If we don't specify a run_id, just set to the maximum run_id which will always return all
                    -- rows where end_run_id is NULL
                    --
                    IF run_id = -1 THEN
                        run_id := max_bigint();
                    END IF;

                    RETURN QUERY SELECT el.imei_norm,
                                        el.virt_imei_shard,
                                        el.imsi
                                   FROM exceptions_lists el
                                  WHERE el.operator_id = op_id
                                    AND el.delta_reason != 'removed'
                                    AND run_id >= el.start_run_id
                                    AND (run_id < el.end_run_id OR el.end_run_id IS NULL);
                END
                $$;
            """)  # noqa: Q440, Q441

            # Update schema metadata table
            cursor.execute(
                """ALTER TABLE schema_metadata ADD COLUMN phys_shards SMALLINT NOT NULL
                              DEFAULT %s CHECK (phys_shards > 0 AND phys_shards <= 100)""",
                [num_initial_shards])
            cursor.execute(
                'ALTER TABLE schema_metadata ALTER COLUMN phys_shards DROP DEFAULT'
            )

            # Drop obsolete columns
            cursor.execute(
                'ALTER TABLE schema_metadata DROP COLUMN potential_whitespace_imsis_msisdns'
            )
            cursor.execute(
                'ALTER TABLE report_monthly_stats DROP COLUMN num_whitespace_imsi_records'
            )
            cursor.execute(
                'ALTER TABLE report_monthly_stats DROP COLUMN num_whitespace_msisdn_records'
            )
Ejemplo n.º 10
0
def triplets(ctx, config, statsd, logger, run_id, conn, metadata_conn, command,
             metrics_root, metrics_run_root):
    """Prune old monthly_network_triplets data.

    Drops every monthly partition of monthly_network_triplets_country and monthly_network_triplets_per_mno
    whose (triplet_year, triplet_month) is earlier than the first month of the retention window, plus any
    partition that turns out to be empty. Row counts of the two parent tables are recorded before and
    after pruning, both as statsd gauges and as optional job metadata.

    Arguments:
        ctx: context object; ctx.obj['CURR_DATE'] is the reference date (today's date is used if it is None)
        config: config object; config.retention_config.months_retention is the number of months retained
        statsd: client used to emit the rows_before / rows_after gauges
        logger: logger used for progress messages
        run_id: run id the job metadata is stored against
        conn: database connection used to inspect and drop partitions
        metadata_conn: database connection used to store job metadata
        command: command name the job metadata is stored against
        metrics_root: not used by this function; kept for interface compatibility
        metrics_run_root: prefix for the statsd gauge names

    Raises:
        AssertionError: if a partition holds more than one (month, year) combination, or if the drop in
            the parent tables' row counts does not match the number of rows counted in dropped partitions
    """
    curr_date = ctx.obj['CURR_DATE']
    retention_months = config.retention_config.months_retention

    # Store metadata (the date as supplied by the caller, before defaulting to today)
    metadata.add_optional_job_metadata(
        metadata_conn,
        command,
        run_id,
        curr_date=curr_date.isoformat() if curr_date is not None else None,
        retention_months=retention_months)

    if curr_date is None:
        curr_date = datetime.date.today()

    parent_tbl_names = [
        'monthly_network_triplets_country',
        'monthly_network_triplets_per_mno'
    ]

    def _count_parent_rows(cursor, adjective, metric_suffix):
        """Return {parent table: row count}, logging progress and emitting one statsd gauge per table."""
        counts = {}
        for tbl in parent_tbl_names:
            logger.debug('Calculating {0} number of rows in {1} table...'.format(adjective, tbl))
            # Compose the table name as an identifier rather than formatting it into the raw SQL string
            cursor.execute(sql.SQL('SELECT COUNT(*) FROM {0}').format(sql.Identifier(tbl)))
            counts[tbl] = cursor.fetchone()[0]
            logger.debug('Calculated {0} number of rows in {1} table'.format(adjective, tbl))
            statsd.gauge('{0}.{1}.{2}'.format(metrics_run_root, tbl, metric_suffix), counts[tbl])
        return counts

    with conn.cursor() as cursor:
        logger.info(
            'Pruning monthly_network_triplets data outside the retention window from database...'
        )
        # First day of the current month, moved back by the retention period
        first_month_to_drop = datetime.date(
            curr_date.year, curr_date.month,
            1) - relativedelta.relativedelta(months=retention_months)
        logger.info(
            'monthly_network_triplets partitions older than {0} will be pruned'
            .format(first_month_to_drop))

        # Country partitions are direct children; per-MNO partitions sit one level further down
        # (children of each operator partition)
        country_monthly_partitions = utils.child_table_names(
            conn, 'monthly_network_triplets_country')
        operator_monthly_partitions = []
        for op_partition in utils.child_table_names(conn, 'monthly_network_triplets_per_mno'):
            operator_monthly_partitions.extend(
                utils.child_table_names(conn, op_partition))

        rows_before = _count_parent_rows(cursor, 'original', 'rows_before')
        metadata.add_optional_job_metadata(metadata_conn,
                                           command,
                                           run_id,
                                           rows_before=rows_before)

        total_rows_pruned = 0
        for tblname in country_monthly_partitions + operator_monthly_partitions:
            invariants_list = utils.table_invariants_list(
                conn, [tblname], ['triplet_month', 'triplet_year'])
            # A monthly partition must not mix data from several months
            assert len(invariants_list) <= 1
            if len(invariants_list) == 0:
                logger.warning(
                    'Found empty partition {0}. Dropping...'.format(tblname))
                cursor.execute(
                    sql.SQL("""DROP TABLE {0} CASCADE""").format(
                        sql.Identifier(tblname)))
                continue

            month, year = tuple(invariants_list[0])
            if datetime.date(year, month, 1) >= first_month_to_drop:
                # Partition is still inside the retention window
                continue

            # Count the rows first so the total can be cross-checked against the parent tables below
            cursor.execute(
                sql.SQL("""SELECT COUNT(*) FROM {0}""").format(
                    sql.Identifier(tblname)))
            partition_table_rows = cursor.fetchone()[0]
            total_rows_pruned += partition_table_rows

            logger.info('Dropping table {0} with {1} rows...'.format(
                tblname, partition_table_rows))
            cursor.execute(
                sql.SQL("""DROP TABLE {0} CASCADE""").format(
                    sql.Identifier(tblname)))
            logger.info('Dropped table {0}'.format(tblname))

        rows_after = _count_parent_rows(cursor, 'new', 'rows_after')
        metadata.add_optional_job_metadata(metadata_conn,
                                           command,
                                           run_id,
                                           rows_after=rows_after)

        total_rows_before = sum(rows_before.values())
        total_rows_after = sum(rows_after.values())

        # Sanity check: the parent tables must have shrunk by exactly the number of rows that were dropped
        assert (total_rows_before - total_rows_after) == total_rows_pruned
        logger.info(
            'Pruned {0:d} rows of monthly_network_triplets data outside the retention window from database'
            .format(total_rows_pruned))
Ejemplo n.º 11
0
    def upgrade(self, db_conn):
        """Overrides AbstractMigrator upgrade method.

        Adds the hll schema to the search path (for this session and as the database default), creates
        the daily_per_mno_hll_sketches table and back-fills it from the existing seen_triplets partitions.

        For every non-empty seen_triplets partition, a child table named hll_<partition name> is created
        that inherits from daily_per_mno_hll_sketches. One row per day of the partition's month is
        inserted into it, holding HLL sketches for triplets, IMEIs, IMSIs, MSISDNs and the
        IMEI-IMSI, IMEI-MSISDN and IMSI-MSISDN pairs seen on that day.

        Arguments:
            db_conn: database connection the migration is run on
        """
        logger = logging.getLogger('dirbs.db')
        with db_conn.cursor() as cursor:
            # Set search_path to include hll
            cursor.execute('SET search_path = core, hll;')
            # Persist the same search_path as the default for the current database
            cursor.execute("""
                DO $$
                DECLARE
                    database_name TEXT;
                BEGIN
                    SELECT current_database() INTO database_name;
                    -- Set the search path of this database to "core"
                    EXECUTE 'ALTER DATABASE ' || quote_ident(database_name) || ' SET search_path TO core, hll';
                END $$;""")

            logger.info('Creating daily_per_mno_hll_sketches table...')
            # Everything below runs as dirbs_core_import_operator, except the ownership fix-up
            # which temporarily switches to dirbs_core_power_user
            with utils.db_role_setter(db_conn,
                                      role_name='dirbs_core_import_operator'):
                cursor.execute("""CREATE TABLE daily_per_mno_hll_sketches (
                                      PRIMARY KEY (data_date, operator_id),
                                      data_date         DATE  NOT NULL,
                                      operator_id       TEXT  NOT NULL,
                                      creation_date     DATE  NOT NULL,
                                      triplet_hll       HLL   NOT NULL,
                                      imei_hll          HLL   NOT NULL,
                                      imsi_hll          HLL   NOT NULL,
                                      msisdn_hll        HLL   NOT NULL,
                                      imei_imsis_hll    HLL   NOT NULL,
                                      imei_msisdns_hll  HLL   NOT NULL,
                                      imsi_msisdns_hll  HLL   NOT NULL
                                  )
                                """)
                cursor.execute(
                    'GRANT SELECT ON daily_per_mno_hll_sketches TO dirbs_core_report'
                )
                logger.info('Created daily_per_mno_hll_sketches table')

                logger.info(
                    'Populating daily_per_mno_hll_sketches from seen_triplets...'
                )
                child_table_names_list = utils.child_table_names(
                    db_conn, 'seen_triplets')

                # Make sure that seen_triplets partitions are owned by dirbs_core_import_operator (they are supposed
                # to be). Previously migration scripts failed to set ownership correctly when tables were re-written
                # and they were incorrectly owned by dirbs_core_power_user.
                with utils.db_role_setter(db_conn,
                                          role_name='dirbs_core_power_user'):
                    for p in child_table_names_list:
                        cursor.execute(
                            sql.SQL(
                                'ALTER TABLE {0} OWNER TO dirbs_core_import_operator'
                            ).format(sql.Identifier(p)))

                for partition_name in child_table_names_list:
                    logger.info(
                        'Populating daily_per_mno_hll_sketches from partition {0}...'
                        .format(partition_name))
                    # Read the partition's year/month from an arbitrary row
                    # NOTE(review): assumes all rows in a partition share one (year, month) - confirm
                    cursor.execute(
                        sql.SQL(
                            'SELECT triplet_year, triplet_month FROM {0} LIMIT 1'
                        ).format(sql.Identifier(partition_name)))
                    res = cursor.fetchone()
                    if res is None:
                        # Table is empty
                        continue

                    year = res.triplet_year
                    month = res.triplet_month
                    days_in_month = monthrange(year, month)[1]
                    # One list of aggregate expressions per sketch type; each gets one entry per day of the month
                    triplet_sql_list = []
                    imei_sql_list = []
                    imsi_sql_list = []
                    msisdn_sql_list = []
                    imei_imsis_sql_list = []
                    imei_msisdns_sql_list = []
                    imsi_msisdns_sql_list = []
                    final_select_sql_list = []

                    # Child table that will hold this partition's daily sketches
                    hll_partition_name = 'hll_{0}'.format(partition_name)
                    cursor.execute(
                        sql.SQL(
                            """CREATE TABLE {0} (PRIMARY KEY (data_date, operator_id),
                                                      LIKE daily_per_mno_hll_sketches)
                                                      INHERITS (daily_per_mno_hll_sketches);
                                                      ALTER TABLE {0} OWNER TO dirbs_core_import_operator
                                           """).format(
                                sql.Identifier(hll_partition_name)))

                    # Temp table receiving a single row with one HLL column per (sketch type, day)
                    aggregated_data_temp_table = 'temp_{0}'.format(
                        hll_partition_name)
                    base_query = sql.SQL(
                        """CREATE TEMP TABLE {aggregated_data_temp_table_id} AS
                                                SELECT {select_sql}
                                                  FROM {partition_tbl_id}""")

                    # Build the per-day aggregates. Each FILTER clause keeps only rows whose date_bitmask has
                    # bit (day - 1) set and whose relevant columns are all non-NULL
                    for day in range(1, days_in_month + 1):
                        day_literal = sql.Literal(day)
                        triplet_sql_list.append(
                            sql.SQL(
                                """hll_add_agg(hll_hash_text(triplet_hash::TEXT))
                                                           FILTER(WHERE (date_bitmask
                                                                         & (1 << ({day_literal} - 1))) <> 0
                                                              AND imei_norm IS NOT NULL
                                                              AND imsi IS NOT NULL
                                                              AND msisdn IS NOT NULL) AS triplet_day{day_literal}"""
                            ).format(day_literal=day_literal))

                        imei_sql_list.append(
                            sql.SQL("""hll_add_agg(hll_hash_text(imei_norm))
                                                            FILTER(WHERE (date_bitmask
                                                                          & (1 << ({day_literal} - 1))) <> 0
                                                                     AND imei_norm IS NOT NULL)
                                                            AS imei_day{day_literal}"""
                                    ).format(day_literal=day_literal))

                        imsi_sql_list.append(
                            sql.SQL("""hll_add_agg(hll_hash_text(imsi))
                                                        FILTER(WHERE (date_bitmask & (1 << ({day_literal} - 1))) <> 0
                                                                 AND imsi IS NOT NULL) AS imsi_day{day_literal}"""
                                    ).format(day_literal=day_literal))

                        msisdn_sql_list.append(
                            sql.SQL("""hll_add_agg(hll_hash_text(msisdn))
                                                              FILTER(WHERE (date_bitmask
                                                                            & (1 << ({day_literal} - 1))) <> 0
                                                                       AND msisdn IS NOT NULL)
                                                              AS msisdn_day{day_literal}"""
                                    ).format(day_literal=day_literal))

                        # Pair sketches hash the two values joined by a '$' separator
                        imei_imsis_sql_list.append(
                            sql.SQL(
                                """hll_add_agg(hll_hash_text(imei_norm||'$'||imsi))
                                                                  FILTER(WHERE (date_bitmask
                                                                                & (1 << ({day_literal} - 1))) <> 0
                                                                           AND imei_norm IS NOT NULL
                                                                           AND imsi IS NOT NULL)
                                                                  AS imei_imsis_day{day_literal}"""
                            ).format(day_literal=day_literal))

                        imei_msisdns_sql_list.append(
                            sql.SQL(
                                """hll_add_agg(hll_hash_text(imei_norm||'$'||msisdn))
                                                                    FILTER(WHERE (date_bitmask
                                                                                  & (1 << ({day_literal} - 1))) <> 0
                                                                             AND  imei_norm IS NOT NULL
                                                                             AND  msisdn IS NOT NULL
                                                                           ) AS imei_msisdns_day{day_literal}"""
                            ).format(day_literal=day_literal))

                        imsi_msisdns_sql_list.append(
                            sql.SQL(
                                """hll_add_agg(hll_hash_text(imsi||'$'||msisdn))
                                                                    FILTER(WHERE (date_bitmask
                                                                                  & (1 << ({day_literal} - 1))) <> 0
                                                                             AND  imsi IS NOT NULL
                                                                             AND  msisdn IS NOT NULL)
                                                                             AS imsi_msisdns_day{day_literal}"""
                            ).format(day_literal=day_literal))

                    # Flatten the per-type lists into one SELECT list
                    for sql_list in [
                            triplet_sql_list, imei_sql_list, imsi_sql_list,
                            msisdn_sql_list, imei_imsis_sql_list,
                            imei_msisdns_sql_list, imsi_msisdns_sql_list
                    ]:
                        final_select_sql_list.extend(sql_list)

                    # A single pass over the partition computes every sketch for every day of the month
                    final_query = base_query \
                        .format(aggregated_data_temp_table_id=sql.Identifier(aggregated_data_temp_table),
                                select_sql=sql.SQL(', ').join(final_select_sql_list),
                                partition_tbl_id=sql.Identifier(partition_name))

                    cursor.execute(final_query)

                    # Turn the wide temp-table row into one row per day in the hll_<partition> table
                    for day in range(1, days_in_month + 1):
                        # Operator id is the third '_'-separated token of the partition name
                        # NOTE(review): an operator id containing '_' would be truncated here - confirm naming
                        str_split = partition_name.split('_')
                        op = str_split[2]
                        # Timestamp supplied as creation_date (the column is declared as DATE)
                        job_start_time = datetime.now()
                        day_literal = sql.Literal(day)

                        # A NULL aggregate (no matching rows for that day) is replaced by hll_empty() since the
                        # sketch columns are declared NOT NULL
                        cursor.execute(
                            sql.SQL(
                                """INSERT INTO {0} (data_date, operator_id, creation_date, triplet_hll,
                                                                   imei_hll, imsi_hll, msisdn_hll, imei_imsis_hll,
                                                                   imei_msisdns_hll, imsi_msisdns_hll)
                                                       SELECT make_date(%s, %s, {day_literal}) AS data_date,
                                                              %s AS operator_id, %s AS creation_date,
                                                              CASE
                                                                  WHEN triplet_day{day_literal} IS NULL
                                                                  THEN hll_empty()
                                                                  ELSE triplet_day{day_literal}
                                                              END AS triplet_hll,
                                                              CASE
                                                                  WHEN imei_day{day_literal} IS NULL THEN hll_empty()
                                                                  ELSE imei_day{day_literal}
                                                              END AS imei_hll,
                                                              CASE
                                                                  WHEN imsi_day{day_literal} IS NULL THEN hll_empty()
                                                                  ELSE imsi_day{day_literal}
                                                              END AS imsi_hll,
                                                              CASE
                                                                  WHEN msisdn_day{day_literal} IS NULL THEN hll_empty()
                                                                  ELSE msisdn_day{day_literal}
                                                              END AS msisdn_hll,
                                                              CASE
                                                                  WHEN imei_imsis_day{day_literal} IS NULL
                                                                  THEN hll_empty()
                                                                  ELSE imei_imsis_day{day_literal}
                                                              END AS imei_imsis_hll,
                                                              CASE
                                                                  WHEN imei_msisdns_day{day_literal} IS NULL
                                                                  THEN hll_empty()
                                                                  ELSE imei_msisdns_day{day_literal}
                                                              END AS imei_msisdns_hll,
                                                              CASE
                                                                  WHEN imsi_msisdns_day{day_literal} IS NULL
                                                                  THEN hll_empty()
                                                                  ELSE imsi_msisdns_day{day_literal}
                                                              END AS imsi_msisdns_hll

                                                         FROM {1}""").format(
                                    sql.Identifier(hll_partition_name),
                                    sql.Identifier(aggregated_data_temp_table),
                                    day_literal=day_literal),
                            [year, month, op, job_start_time])

                    logger.info(
                        'Populated daily_per_mno_hll_sketches from partition {0}'
                        .format(partition_name))

            logger.info(
                'Populated daily_per_mno_hll_sketches from seen_triplets')
Ejemplo n.º 12
0
    def _repartition_exceptions_lists(self, conn, *, num_physical_shards):
        """Repartition the exceptions lists to support msisdn.

        Builds a replacement table ``exceptions_lists_new`` (list-partitioned by ``operator_id`` and carrying an
        extra ``msisdn TEXT NOT NULL`` column), copies the existing rows into it, then drops the old
        ``exceptions_lists`` table and renames the new table and its indexes into place. Finally the
        ``gen_exceptions_list`` and ``gen_delta_exceptions_list`` SQL functions are dropped and recreated so that
        their result sets include ``msisdn``.

        The table work runs under the ``dirbs_core_listgen`` role; the function re-creation runs under
        ``dirbs_core_power_user``.

        NOTE(review): the msisdn value is obtained with an INNER JOIN against ``historic_pairing_list`` on imsi, so
        exceptions rows whose imsi has no match there are not copied, and an imsi matching several pairing rows
        yields several copied rows -- confirm this is the intended migration behaviour.

        :param conn: database connection; a cursor is opened on it and it is passed to the partition helpers
        :param num_physical_shards: forwarded to ``part_utils.create_per_mno_lists_partition`` for each operator
        """
        with conn.cursor() as cursor, utils.db_role_setter(
                conn, role_name='dirbs_core_listgen'):
            # Create the new (empty) parent table with the same column definitions as the old one,
            # partitioned by operator, and add the new msisdn column to it.
            cursor.execute("""CREATE TABLE exceptions_lists_new (
                       LIKE exceptions_lists INCLUDING DEFAULTS
                                             INCLUDING IDENTITY
                                             INCLUDING CONSTRAINTS
                                             INCLUDING STORAGE
                                             INCLUDING COMMENTS
                   )
                   PARTITION BY LIST (operator_id);

                   ALTER TABLE exceptions_lists_new ADD COLUMN msisdn TEXT NOT NULL;
                """)

            part_utils._grant_perms_list(
                conn,
                part_name='exceptions_lists_new')  # grant relevant permissions
            imei_shard_names = utils.child_table_names(
                conn, 'exceptions_lists')  # determine the child table names
            operators = [
                o.operator_id for o in utils.table_invariants_list(
                    conn, imei_shard_names, ['operator_id'])
            ]  # work out who the operators are from the existing child tables

            # create child partitions for new list (operator at top level, then IMEI sharded)
            for op_id in operators:
                tbl_name = part_utils.per_mno_lists_partition(
                    operator_id=op_id, suffix='_new', list_type='exceptions')
                part_utils.create_per_mno_lists_partition(
                    conn,
                    parent_tbl_name='exceptions_lists_new',
                    tbl_name=tbl_name,
                    operator_id=op_id,
                    num_physical_shards=num_physical_shards)

            # insert data into the new parent partition; msisdn comes from historic_pairing_list, matched on imsi
            cursor.execute("""INSERT INTO exceptions_lists_new
                                   SELECT e.row_id, e.operator_id, e.imei_norm, e.imsi, e.start_run_id, e.end_run_id,
                                          e.delta_reason, e.virt_imei_shard, p.msisdn
                                     FROM exceptions_lists e
                               INNER JOIN historic_pairing_list p ON e.imsi = p.imsi"""
                           )

            # add indexes in each partition (done after the bulk insert above)
            part_utils.add_indices(
                conn,
                tbl_name='exceptions_lists_new',
                idx_metadata=part_utils.exceptions_lists_indices())

            # drop old table, after assigning sequence to new table
            # (the sequence is re-owned first so that it is no longer owned by a column of the table being dropped)
            cursor.execute(
                'ALTER SEQUENCE exceptions_lists_row_id_seq OWNED BY exceptions_lists_new.row_id'
            )
            cursor.execute('DROP TABLE exceptions_lists CASCADE')

            # rename table, indexes and constraints
            part_utils.rename_table_and_indices(
                conn,
                old_tbl_name='exceptions_lists_new',
                new_tbl_name='exceptions_lists',
                idx_metadata=part_utils.exceptions_lists_indices())

            # Recreate the gen_exceptions_list and gen_delta_exceptions_list functions so that their
            # RETURNS TABLE definitions include the msisdn column. This is done as dirbs_core_power_user
            # rather than the dirbs_core_listgen role used for the table work above.
            with utils.db_role_setter(conn, role_name='dirbs_core_power_user'):
                cursor.execute("""
                                DROP FUNCTION gen_exceptions_list(op_id TEXT, run_id BIGINT);

                                --
                                -- Recreate function to generate a full exceptions_list for a given
                                -- run_id and operator.
                                -- A value of -1 means get the latest list.
                                --
                                CREATE FUNCTION gen_exceptions_list(op_id TEXT, run_id BIGINT = -1)
                                    RETURNS TABLE (
                                        imei_norm       TEXT,
                                        virt_imei_shard SMALLINT,
                                        imsi            TEXT,
                                        msisdn          TEXT
                                    )
                                    LANGUAGE plpgsql STRICT STABLE PARALLEL SAFE
                                    AS $$
                                BEGIN
                                    --
                                    -- If we don't specify a run_id, just set to the maximum run_id which will always
                                    -- return all rows where end_run_id is NULL
                                    --
                                    IF run_id = -1 THEN
                                        run_id := max_bigint();
                                    END IF;

                                    RETURN QUERY SELECT el.imei_norm,
                                                        el.virt_imei_shard,
                                                        el.imsi,
                                                        el.msisdn
                                                   FROM exceptions_lists el
                                                  WHERE el.operator_id = op_id
                                                    AND el.delta_reason != 'removed'
                                                    AND run_id >= el.start_run_id
                                                    AND (run_id < el.end_run_id OR el.end_run_id IS NULL);
                                END
                                $$;

                                DROP FUNCTION gen_delta_exceptions_list(op_id TEXT, base_run_id BIGINT, run_id BIGINT);

                                --
                                -- Create function to generate a per-MNO delta exceptions list for a run_id, operator
                                -- id and optional base_run_id.
                                --
                                -- If not base_run_id is supplied, this function will use the maximum run_id found in
                                -- the DB that it less than than the supplied run_id
                                --
                                CREATE FUNCTION gen_delta_exceptions_list(op_id TEXT,
                                                                          base_run_id BIGINT,
                                                                          run_id BIGINT = -1)
                                    RETURNS TABLE (
                                        imei_norm       TEXT,
                                        imsi            TEXT,
                                        msisdn          TEXT,
                                        delta_reason    TEXT
                                    )
                                    LANGUAGE plpgsql STRICT STABLE PARALLEL SAFE
                                    AS $$
                                BEGIN
                                    --
                                    -- If we don't specify a run_id, just set to the maximum run_id
                                    --
                                    IF run_id = -1 THEN
                                        run_id := max_bigint();
                                    END IF;

                                    IF run_id < base_run_id THEN
                                      RAISE EXCEPTION 'Parameter base_run_id % greater than run_id %',
                                                      base_run_id, run_id;
                                    END IF;

                                    RETURN QUERY SELECT *
                                                   FROM (SELECT el.imei_norm,
                                                                el.imsi,
                                                                el.msisdn,
                                                                overall_delta_reason(el.delta_reason
                                                                        ORDER BY start_run_id DESC) AS delta_reason
                                                           FROM exceptions_lists el
                                                          WHERE operator_id = op_id
                                                            AND start_run_id > base_run_id
                                                            AND start_run_id <= run_id
                                                       GROUP BY el.imei_norm, el.imsi, el.msisdn) x
                                                  WHERE x.delta_reason IS NOT NULL;
                                END
                                $$;
                                """)  # noqa: Q440, Q441
Ejemplo n.º 13
0
def repartition_monthly_network_triplets(conn, *, num_physical_shards):
    """Repartition the monthly_network_triplets_country and monthly_network_triplets_per_mno tables.

    For each of the two tables a ``*_new`` parent table is created, child partitions are created for every
    year/month (and operator) combination found in the existing tables, the data is copied across, indexes are
    added, the old tables are dropped (CASCADE) and the new tables and indexes are renamed into place. The views
    built on top of these tables (``operator_data``, the two ``*_no_null_imeis`` views and
    ``monthly_network_triplets_with_invalid_data_flags``) are then (re)created and SELECT permissions granted.

    All work is done under the ``dirbs_core_import_operator`` role.

    :param conn: database connection; a cursor is opened on it and it is passed to the partition helpers
    :param num_physical_shards: number of physical shards, forwarded to the partition-creation helpers
    """
    with conn.cursor() as cursor, utils.db_role_setter(conn, role_name='dirbs_core_import_operator'):
        # Create parent partitions
        cursor.execute(
            """CREATE TABLE monthly_network_triplets_country_new (
                   LIKE monthly_network_triplets_country INCLUDING DEFAULTS
                                                         INCLUDING IDENTITY
                                                         INCLUDING CONSTRAINTS
                                                         INCLUDING STORAGE
                                                         INCLUDING COMMENTS
               ) PARTITION BY RANGE (triplet_year, triplet_month)
            """
        )
        _grant_perms_monthly_network_triplets(conn, part_name='monthly_network_triplets_country_new')

        cursor.execute(
            """CREATE TABLE monthly_network_triplets_per_mno_new (
                   LIKE monthly_network_triplets_per_mno INCLUDING DEFAULTS
                                                         INCLUDING IDENTITY
                                                         INCLUDING CONSTRAINTS
                                                         INCLUDING STORAGE
                                                         INCLUDING COMMENTS
               ) PARTITION BY LIST (operator_id)
            """
        )
        _grant_perms_monthly_network_triplets(conn, part_name='monthly_network_triplets_per_mno_new')

        # Work out what year-month tuples we have
        country_monthly_partitions = utils.child_table_names(conn, 'monthly_network_triplets_country')
        country_year_month_tuples = [(x.triplet_year, x.triplet_month)
                                     for x in utils.table_invariants_list(conn,
                                                                          country_monthly_partitions,
                                                                          ['triplet_year', 'triplet_month'])]

        # Per-MNO data is partitioned by operator first, then by month, so collect the grandchildren
        operator_partitions = utils.child_table_names(conn, 'monthly_network_triplets_per_mno')
        operator_monthly_partitions = set()
        for op_partition in operator_partitions:
            operator_monthly_partitions.update(utils.child_table_names(conn, op_partition))
        mno_year_month_tuples = [(x.operator_id, x.triplet_year, x.triplet_month)
                                 for x in utils.table_invariants_list(conn,
                                                                      operator_monthly_partitions,
                                                                      ['operator_id',
                                                                       'triplet_year',
                                                                       'triplet_month'])]

        # Sort year month tuples (most recent first) and get the maximum year month combination.
        country_year_month_tuples = sorted(country_year_month_tuples, key=lambda x: (x[0], x[1]), reverse=True)
        if country_year_month_tuples:
            latest_year_month = country_year_month_tuples[0]
        elif mno_year_month_tuples:
            # No country-level partitions exist: fall back to the most recent per-MNO month rather than
            # leaving latest_year_month as None (which previously made the per-MNO loop below raise TypeError).
            latest_year_month = max((year, month) for _, year, month in mno_year_month_tuples)
        else:
            latest_year_month = None

        def _fillfactor(year, month):
            """Return the fillfactor to use for the partition holding the given year and month."""
            # Fillfactor is 45 for most recent month since it will likely still be updated. For older months we
            # pack tightly to ensure optimal usage of disk space and optimal scan performance
            return 45 if (year, month) == latest_year_month else 100

        # Create child partitions at country level
        for year, month in country_year_month_tuples:
            create_monthly_network_triplets_country_partition(conn, month=month, year=year, suffix='_new',
                                                              num_physical_shards=num_physical_shards,
                                                              fillfactor=_fillfactor(year, month))

        # Create child partitions at per-MNO level
        for op, year, month in mno_year_month_tuples:
            create_monthly_network_triplets_per_mno_partition(conn, operator_id=op, month=month, year=year,
                                                              suffix='_new', num_physical_shards=num_physical_shards,
                                                              fillfactor=_fillfactor(year, month))

        # Populate country-level table from old table
        cursor.execute("""INSERT INTO monthly_network_triplets_country_new
                               SELECT *
                                 FROM monthly_network_triplets_country""")

        # Populate per-MNO-level table from old table
        cursor.execute("""INSERT INTO monthly_network_triplets_per_mno_new
                               SELECT *
                                 FROM monthly_network_triplets_per_mno""")

        # Add in indexes (after the bulk inserts above)
        add_indices(conn, tbl_name='monthly_network_triplets_country_new',
                    idx_metadata=monthly_network_triplets_country_indices())
        add_indices(conn, tbl_name='monthly_network_triplets_per_mno_new',
                    idx_metadata=monthly_network_triplets_per_mno_indices())

        # Drop old tables; CASCADE also drops dependent objects, hence the views are recreated further down
        cursor.execute('DROP TABLE monthly_network_triplets_country CASCADE')
        cursor.execute('DROP TABLE monthly_network_triplets_per_mno CASCADE')

        # Renames tables
        rename_table_and_indices(conn,
                                 old_tbl_name='monthly_network_triplets_country_new',
                                 new_tbl_name='monthly_network_triplets_country',
                                 idx_metadata=monthly_network_triplets_country_indices())
        rename_table_and_indices(conn,
                                 old_tbl_name='monthly_network_triplets_per_mno_new',
                                 new_tbl_name='monthly_network_triplets_per_mno',
                                 idx_metadata=monthly_network_triplets_per_mno_indices())

        # operator_data yields one row per triplet per day-of-month whose bit is set in date_bitmask
        cursor.execute("""CREATE OR REPLACE VIEW operator_data AS
                          SELECT sq.connection_date,
                                 sq.imei_norm,
                                 sq.imsi,
                                 sq.msisdn,
                                 sq.operator_id
                            FROM (SELECT make_date(nt.triplet_year::integer,
                                         nt.triplet_month::integer,
                                         dom.dom) AS connection_date,
                                         nt.imei_norm,
                                         nt.imsi,
                                         nt.msisdn,
                                         nt.operator_id
                                    FROM generate_series(1, 31) dom(dom),
                                         monthly_network_triplets_per_mno nt
                                   WHERE (nt.date_bitmask & (1 << (dom.dom - 1))) <> 0) sq""")
        cursor.execute("""CREATE VIEW monthly_network_triplets_country_no_null_imeis AS
                          SELECT *
                            FROM monthly_network_triplets_country
                           WHERE imei_norm IS NOT NULL""")
        cursor.execute("""CREATE VIEW monthly_network_triplets_per_mno_no_null_imeis AS
                          SELECT *
                            FROM monthly_network_triplets_per_mno
                           WHERE imei_norm IS NOT NULL""")

        cursor.execute(sql.SQL('GRANT SELECT ON operator_data TO dirbs_core_base'))
        for role in ['dirbs_core_listgen', 'dirbs_core_classify', 'dirbs_core_report', 'dirbs_core_api']:
            cursor.execute(sql.SQL("""GRANT SELECT ON monthly_network_triplets_country_no_null_imeis
                                      TO {0}""").format(sql.Identifier(role)))
            cursor.execute(sql.SQL("""GRANT SELECT ON monthly_network_triplets_per_mno_no_null_imeis
                                      TO {0}""").format(sql.Identifier(role)))

        cursor.execute("""CREATE VIEW monthly_network_triplets_with_invalid_data_flags AS
                               SELECT nt.*,
                                      nt.imei_norm IS NULL AS is_null_imei,
                                      is_unclean_imei(nt.imei_norm) AS is_unclean_imei,
                                      nt.imsi IS NULL AS is_null_imsi,
                                      is_unclean_imsi(nt.imsi) AS is_unclean_imsi,
                                      nt.msisdn IS NULL AS is_null_msisdn
                                 FROM monthly_network_triplets_per_mno nt""")
Ejemplo n.º 14
0
def _validate_data_partitions(config, conn, month, year, logger, disable_data_check):
    """
    Validate that data is present for all configured operators and only configured operators.

    Five checks are made, in order: per-operator partitions missing for a configured operator, per-operator
    partitions present for an unconfigured operator, the same two checks restricted to the requested year and
    month, and finally the existence of the country-level partition for that year and month. Each failed check
    is logged as a warning when ``disable_data_check`` is true; otherwise it is logged as an error and the
    corresponding exception is raised, so later checks are not reached.

    :param config: dirbs config obj
    :param conn: database connection
    :param month: data month
    :param year: data year
    :param logger: dirbs logger obj
    :param disable_data_check: data check flag; when true, failures are only logged as warnings
    :raises exceptions.MissingOperatorDataException: if partitions for a configured operator are missing
    :raises exceptions.ExtraOperatorDataException: if partitions exist for an unconfigured operator, or if the
                                                   country-level partition for the month is missing
    """
    operators = config.region_config.operators
    assert len(operators) > 0

    def _report(msg, exc_type):
        """Warn about a failed check when data checks are disabled, otherwise log an error and raise exc_type."""
        if disable_data_check:
            logger.warning(msg)
        else:
            logger.error(msg)
            raise exc_type(msg)

    def _operator_ids(invariants):
        """Return the operator ids (first element of each invariant) as a sorted, comma-separated string."""
        # Sorted so that the message does not depend on arbitrary set iteration order
        return ', '.join(sorted(x[0] for x in invariants))

    # Check 1 & 2: operators that have any per-MNO partition at all vs. the configured operators
    operator_partitions = utils.child_table_names(conn, 'monthly_network_triplets_per_mno')
    observed_operator_ids = set(utils.table_invariants_list(conn, operator_partitions, ['operator_id']))
    required_operator_ids = {(o.id,) for o in operators}
    missing_operator_ids = required_operator_ids - observed_operator_ids
    if missing_operator_ids:
        _report('Missing monthly_network_triplets_per_mno partitions for operators: {0}'
                .format(_operator_ids(missing_operator_ids)),
                exceptions.MissingOperatorDataException)

    extra_operator_ids = observed_operator_ids - required_operator_ids
    if extra_operator_ids:
        _report('Extra monthly_network_triplets_per_mno partitions detected for unconfigured operators: {0}'
                .format(_operator_ids(extra_operator_ids)),
                exceptions.ExtraOperatorDataException)

    # Check 3 & 4: same comparison, restricted to the monthly partitions for the requested year and month
    operator_monthly_partitions = set()
    for op_partition in operator_partitions:
        operator_monthly_partitions.update(utils.child_table_names(conn, op_partition))
    observed_invariants = {x for x in utils.table_invariants_list(conn,
                                                                  operator_monthly_partitions,
                                                                  ['operator_id', 'triplet_year', 'triplet_month'])
                           if x.triplet_year == year and x.triplet_month == month}
    required_invariants = {(o.id, year, month) for o in operators}
    missing_invariants = required_invariants - observed_invariants
    if missing_invariants:
        _report('Missing monthly_network_triplets_per_mno partitions for the requested reporting '
                'month for the following configured operators: {0}'
                .format(_operator_ids(missing_invariants)),
                exceptions.MissingOperatorDataException)

    extra_invariants = observed_invariants - required_invariants
    if extra_invariants:
        _report('Extra monthly_network_triplets_per_mno partitions detected for the requested '
                'reporting month for the following unconfigured operators: {0}'
                .format(_operator_ids(extra_invariants)),
                exceptions.ExtraOperatorDataException)

    # Check 5: the country-level partition for the requested month must exist
    country_imei_shard_name = partition_utils.monthly_network_triplets_country_partition(month=month, year=year)
    with conn.cursor() as cursor:
        cursor.execute(utils.table_exists_sql(), [country_imei_shard_name])
        partition_exists = cursor.fetchone()[0]
        if not partition_exists:
            # NOTE(review): a *missing* partition is reported with ExtraOperatorDataException. The type is kept
            # unchanged because callers may catch it; confirm whether MissingOperatorDataException was intended.
            _report('Missing monthly_network_triplets_country partition for year and month',
                    exceptions.ExtraOperatorDataException)