Beispiel #1
0
def create_per_mno_lists_partition(conn, *, operator_id, parent_tbl_name, tbl_name, num_physical_shards=None,
                                   unlogged=False, fillfactor=80):
    """Function to DRY out creation of a new operator partition for notifications_lists."""
    assert len(tbl_name) < 64

    if num_physical_shards is None:
        num_physical_shards = num_physical_imei_shards(conn)

    if unlogged:
        unlogged_sql = sql.SQL('UNLOGGED ')
    else:
        unlogged_sql = sql.SQL('')

    with conn.cursor() as cursor, utils.db_role_setter(conn, role_name='dirbs_core_listgen'):
        cursor.execute(
            sql.SQL(
                """CREATE {unlogged}TABLE {0} PARTITION OF {1}
                   FOR VALUES IN (%s) PARTITION BY RANGE (virt_imei_shard)
                """
            ).format(sql.Identifier(tbl_name), sql.Identifier(parent_tbl_name), unlogged=unlogged_sql),
            [operator_id]
        )
        _grant_perms_list(conn, part_name=tbl_name)

        # Create child partitions
        create_imei_shard_partitions(conn, tbl_name=tbl_name, num_physical_shards=num_physical_shards,
                                     perms_func=_grant_perms_list, fillfactor=fillfactor)
Beispiel #2
0
    def upgrade(self, db_conn):
        """Overrides AbstractMigrator upgrade method."""
        logger = logging.getLogger('dirbs.db')
        with db_conn.cursor() as cursor, utils.db_role_setter(
                db_conn, role_name='dirbs_core_import_operator'):
            child_table_names_list = utils.child_table_names(
                db_conn, 'daily_per_mno_hll_sketches')
            for partition_name in child_table_names_list:
                logger.info(
                    'Copying partition {0} into daily_per_mno_hll_sketches table...'
                    .format(partition_name))
                cursor.execute(
                    sql.SQL("""INSERT INTO daily_per_mno_hll_sketches
                                               SELECT *
                                                 FROM {partition_name_id}""").
                    format(partition_name_id=sql.Identifier(partition_name)))
                logger.info(
                    'Copied partition {0} into daily_per_mno_hll_sketches table'
                    .format(partition_name))

                logger.info(
                    'Dropping daily_per_mno_hll_sketches partition {0}...'.
                    format(partition_name))
                cursor.execute(
                    sql.SQL("""DROP TABLE {partition_name_id}""").format(
                        partition_name_id=sql.Identifier(partition_name)))
                logger.info('Dropped partition {0}'.format(partition_name))
Beispiel #3
0
 def _init_staging_table_shards(self):
     """Method to create IMEI shards in the staging table if supported by the importer."""
     assert self._supports_imei_shards
     with db_role_setter(self._conn, role_name=self._owner_role_name):
         tbl_name = self._staging_tbl_name
         partition_utils.create_imei_shard_partitions(self._conn,
                                                      tbl_name=tbl_name,
                                                      unlogged=True)
         for name, rstart, rend in partition_utils.physical_imei_shards(
                 self._conn, tbl_name=tbl_name):
             self._on_staging_table_shard_creation(name, rstart, rend)
Beispiel #4
0
    def partition_registration_list(self, conn, *, num_physical_shards):
        """Method to repartition registration_list for v47 upgrade."""
        with conn.cursor() as cursor, utils.db_role_setter(
                conn, role_name='dirbs_core_power_user'):
            # Create parent partition
            cursor.execute("""CREATE TABLE historic_registration_list_new (
                                   LIKE historic_registration_list INCLUDING DEFAULTS
                                                                   INCLUDING IDENTITY
                                                                   INCLUDING CONSTRAINTS
                                                                   INCLUDING STORAGE
                                                                   INCLUDING COMMENTS
                               )
                               PARTITION BY RANGE (virt_imei_shard)
                            """)
            part_utils._grant_perms_registration_list(
                conn, part_name='historic_registration_list_new')
            # Create child partitions
            part_utils.create_imei_shard_partitions(
                conn,
                tbl_name='historic_registration_list_new',
                num_physical_shards=num_physical_shards,
                perms_func=part_utils._grant_perms_registration_list,
                fillfactor=80)
            # Insert data from original partition
            cursor.execute("""INSERT INTO historic_registration_list_new
                                   SELECT *
                                     FROM historic_registration_list""")

            # Add in indexes to each partition
            idx_metadata = [
                part_utils.IndexMetadatum(idx_cols=['imei_norm'],
                                          is_unique=True,
                                          partial_sql='WHERE end_date IS NULL')
            ]
            part_utils.add_indices(conn,
                                   tbl_name='historic_registration_list_new',
                                   idx_metadata=idx_metadata)

            # Drop old view + table, rename tables, indexes and constraints
            cursor.execute('DROP VIEW registration_list')
            cursor.execute('DROP TABLE historic_registration_list CASCADE')
            part_utils.rename_table_and_indices(
                conn,
                old_tbl_name='historic_registration_list_new',
                new_tbl_name='historic_registration_list',
                idx_metadata=idx_metadata)
            cursor.execute("""CREATE OR REPLACE VIEW registration_list AS
                                   SELECT imei_norm, make, model, status, virt_imei_shard
                                     FROM historic_registration_list
                                    WHERE end_date IS NULL WITH CHECK OPTION"""
                           )
            cursor.execute("""GRANT SELECT ON registration_list
                                      TO dirbs_core_classify, dirbs_core_api, dirbs_core_import_registration_list"""
                           )
Beispiel #5
0
def install(ctx):
    """
    Installs latest schema on clean DB instance.

    :param ctx: current cli context obj
    :return: status
    """
    logger = logging.getLogger('dirbs.db')
    config = common.ensure_config(ctx)
    db_config = config.db_config
    with utils.create_db_connection(db_config) as conn, conn.cursor() as cur:
        logger.info('Creating initial base DB schema in DB %s on host %s',
                    db_config.database, db_config.host)

        # Check if there is stuff already in there
        cur.execute("""SELECT COUNT(*)
                         FROM pg_class c
                         JOIN pg_namespace n ON n.oid = c.relnamespace
                        WHERE n.nspname = current_schema()""")
        is_clean = (cur.fetchone()[0] == 0)
        if not is_clean:
            logger.error('Can\'t install latest schema into a non-clean DB')
            logger.error(
                'Instead, use dirbs-db upgrade to upgrade the schema to the latest version'
            )
            sys.exit(1)

        # Set our role here so that new objects get created with dirbs_core_power_user as owner by default
        with utils.db_role_setter(conn, role_name='dirbs_core_power_user'):
            # First we setup the schema, search path etc.
            sql = pkgutil.get_data('dirbs', 'sql/base/on_db_creation.sql')
            cur.execute(sql)

            # Install the base schema for v19 and set current version to 19
            base_schema = 'sql/base/v19_schema.sql'
            logger.info('Restoring base v19 schema from SQL file: %s',
                        base_schema)
            sql = pkgutil.get_data('dirbs', base_schema)
            cur.execute(sql)
            utils.set_db_schema_version(conn, min_schema_version)
            logger.info(
                'Successfully created base v{0:d} schema. Scheduling dirbs-db upgrade...'
                .format(min_schema_version))

    # Then we call upgrade to complete the process
    rv = 0
    if code_db_schema_version > min_schema_version:
        rv = ctx.invoke(upgrade)
    else:
        # Can't do anything until we know the schema is the right version
        _store_job_metadata(config, 'install')

    return rv
Beispiel #6
0
def repartition_classification_state(conn, *, num_physical_shards, src_filter_sql=None):
    """Function to repartition the classification_state table."""
    with conn.cursor() as cursor, utils.db_role_setter(conn, role_name='dirbs_core_power_user'):
        # Create parent partition
        cursor.execute(
            """CREATE TABLE classification_state_new (
                   LIKE classification_state INCLUDING DEFAULTS
                                             INCLUDING IDENTITY
                                             INCLUDING CONSTRAINTS
                                             INCLUDING STORAGE
                                             INCLUDING COMMENTS
               )
               PARTITION BY RANGE (virt_imei_shard)
            """
        )
        _grant_perms_classification_state(conn, part_name='classification_state_new')

        # Create child partitions
        create_imei_shard_partitions(conn, tbl_name='classification_state_new',
                                     num_physical_shards=num_physical_shards,
                                     perms_func=_grant_perms_classification_state, fillfactor=80)

        # Insert data from original partition
        base_sql = sql.SQL("""INSERT INTO classification_state_new
                                   SELECT *
                                     FROM classification_state""")
        if src_filter_sql is not None:
            insert_sql = sql.SQL('{0} {1}').format(base_sql, sql.SQL(src_filter_sql))
        else:
            insert_sql = base_sql

        cursor.execute(insert_sql)

        # Add in indexes to each partition
        idx_metadata = [
            IndexMetadatum(idx_cols=cols, is_unique=is_uniq, partial_sql=partial)
            for cols, is_uniq, partial in [
                (['row_id'], True, None),
                (['imei_norm', 'cond_name'], True, 'WHERE end_date IS NULL'),
                (['block_date'], False, 'WHERE end_date IS NULL'),
                (['cond_name'], False, 'WHERE end_date IS NULL')
            ]
        ]
        add_indices(conn, tbl_name='classification_state_new', idx_metadata=idx_metadata)

        # Drop old table, rename tables, indexes and constraints
        cursor.execute('ALTER SEQUENCE classification_state_row_id_seq OWNED BY classification_state_new.row_id')
        cursor.execute('DROP TABLE classification_state CASCADE')
        rename_table_and_indices(conn, old_tbl_name='classification_state_new',
                                 new_tbl_name='classification_state', idx_metadata=idx_metadata)
Beispiel #7
0
def repartition_exceptions_lists(conn, *, num_physical_shards, src_filter_sql=None):
    """Function to repartition the exceptions_lists table."""
    with conn.cursor() as cursor, utils.db_role_setter(conn, role_name='dirbs_core_listgen'):
        # Create parent partition
        cursor.execute(
            """CREATE TABLE exceptions_lists_new (
                   LIKE exceptions_lists INCLUDING DEFAULTS
                                         INCLUDING IDENTITY
                                         INCLUDING CONSTRAINTS
                                         INCLUDING STORAGE
                                         INCLUDING COMMENTS
               )
               PARTITION BY LIST (operator_id)
            """
        )
        _grant_perms_list(conn, part_name='exceptions_lists_new')

        # Work out who the operators are
        imei_shard_names = utils.child_table_names(conn, 'exceptions_lists')
        operators = [x.operator_id for x in utils.table_invariants_list(conn, imei_shard_names, ['operator_id'])]

        # Create child partitions (operator at top level, then IMEI-sharded)
        for op_id in operators:
            tbl_name = per_mno_lists_partition(operator_id=op_id, suffix='_new', list_type='exceptions')
            create_per_mno_lists_partition(conn, parent_tbl_name='exceptions_lists_new', tbl_name=tbl_name,
                                           operator_id=op_id, num_physical_shards=num_physical_shards)

        # Insert data from original partition
        base_sql = sql.SQL("""INSERT INTO exceptions_lists_new
                                   SELECT *
                                     FROM exceptions_lists""")
        if src_filter_sql is not None:
            insert_sql = sql.SQL('{0} {1}').format(base_sql, sql.SQL(src_filter_sql))
        else:
            insert_sql = base_sql
        cursor.execute(insert_sql)

        # Add in indexes to each partition
        add_indices(conn, tbl_name='exceptions_lists_new', idx_metadata=exceptions_lists_indices())

        # Drop old table, after assigning sequence to new table
        cursor.execute('ALTER SEQUENCE exceptions_lists_row_id_seq OWNED BY exceptions_lists_new.row_id')
        cursor.execute('DROP TABLE exceptions_lists CASCADE')

        #  Rename tables, indexes and constraints
        rename_table_and_indices(conn, old_tbl_name='exceptions_lists_new',
                                 new_tbl_name='exceptions_lists', idx_metadata=exceptions_lists_indices())
Beispiel #8
0
def repartition_pairing_list(conn, *, num_physical_shards):
    """Function to repartition the pairing_list table."""
    with conn.cursor() as cursor, utils.db_role_setter(conn, role_name='dirbs_core_power_user'):
        cursor.execute(
            """CREATE TABLE historic_pairing_list_new (
                   LIKE historic_pairing_list INCLUDING DEFAULTS
                                              INCLUDING IDENTITY
                                              INCLUDING CONSTRAINTS
                                              INCLUDING STORAGE
                                              INCLUDING COMMENTS
               )
               PARTITION BY RANGE (virt_imei_shard)
            """
        )
        _grant_perms_pairing_list(conn, part_name='historic_pairing_list_new')

        # Create child partitions
        create_imei_shard_partitions(conn, tbl_name='historic_pairing_list_new',
                                     num_physical_shards=num_physical_shards,
                                     perms_func=_grant_perms_pairing_list, fillfactor=80)

        # Insert data from original partition
        cursor.execute("""INSERT INTO historic_pairing_list_new
                               SELECT *
                                 FROM historic_pairing_list""")

        # Add in indexes to each partition
        idx_metadata = [IndexMetadatum(idx_cols=['imei_norm', 'imsi'],
                                       is_unique=True,
                                       partial_sql='WHERE end_date IS NULL')]
        add_indices(conn, tbl_name='historic_pairing_list_new', idx_metadata=idx_metadata)

        # Drop old view + table, rename tables, indexes and constraints
        cursor.execute('DROP VIEW pairing_list')
        cursor.execute('DROP TABLE historic_pairing_list CASCADE')
        rename_table_and_indices(conn, old_tbl_name='historic_pairing_list_new',
                                 new_tbl_name='historic_pairing_list', idx_metadata=idx_metadata)
        cursor.execute("""CREATE VIEW pairing_list AS
                               SELECT imei_norm, imsi, virt_imei_shard
                                 FROM historic_pairing_list
                                WHERE end_date IS NULL WITH CHECK OPTION""")
        cursor.execute("""GRANT SELECT ON pairing_list
                          TO dirbs_core_listgen, dirbs_core_report, dirbs_core_api, dirbs_core_import_pairing_list""")
Beispiel #9
0
def repartition_blacklist(conn, *, num_physical_shards):
    """Function to repartition the blacklist table."""
    with conn.cursor() as cursor, utils.db_role_setter(conn, role_name='dirbs_core_listgen'):
        # Create parent partition
        cursor.execute(
            """CREATE TABLE blacklist_new (
                   LIKE blacklist INCLUDING DEFAULTS
                                  INCLUDING IDENTITY
                                  INCLUDING CONSTRAINTS
                                  INCLUDING STORAGE
                                  INCLUDING COMMENTS
               )
               PARTITION BY RANGE (virt_imei_shard)
            """
        )

        # Create child partitions
        create_imei_shard_partitions(conn, tbl_name='blacklist_new', num_physical_shards=num_physical_shards,
                                     perms_func=_grant_perms_list, fillfactor=80)

        # Insert data from original partition
        cursor.execute("""INSERT INTO blacklist_new
                               SELECT *
                                 FROM blacklist""")

        # Add in indexes to each partition
        idx_metadata = [
            IndexMetadatum(idx_cols=cols, is_unique=is_uniq, partial_sql=partial)
            for cols, is_uniq, partial in [
                (['imei_norm'], True, 'WHERE end_run_id IS NULL'),
                (['end_run_id'], False, None),
                (['start_run_id'], False, None)
            ]
        ]
        add_indices(conn, tbl_name='blacklist_new', idx_metadata=idx_metadata)

        # Drop old table, rename tables, indexes and constraints
        cursor.execute('ALTER SEQUENCE blacklist_row_id_seq OWNED BY blacklist_new.row_id')
        cursor.execute('DROP TABLE blacklist CASCADE')
        rename_table_and_indices(conn, old_tbl_name='blacklist_new',
                                 new_tbl_name='blacklist', idx_metadata=idx_metadata)
Beispiel #10
0
def repartition_network_imeis(conn, *, num_physical_shards):
    """Function to repartition the network_imeis table."""
    with conn.cursor() as cursor, utils.db_role_setter(conn, role_name='dirbs_core_import_operator'):
        # Create parent partition
        cursor.execute(
            """CREATE TABLE network_imeis_new (
                   LIKE network_imeis INCLUDING DEFAULTS
                                      INCLUDING IDENTITY
                                      INCLUDING CONSTRAINTS
                                      INCLUDING STORAGE
                                      INCLUDING COMMENTS
               )
               PARTITION BY RANGE (virt_imei_shard)
            """
        )
        _grant_perms_network_imeis(conn, part_name='network_imeis_new')

        # Create child partitions
        create_imei_shard_partitions(conn, tbl_name='network_imeis_new', num_physical_shards=num_physical_shards,
                                     perms_func=_grant_perms_network_imeis, fillfactor=80)

        # Insert data from original partition
        cursor.execute("""INSERT INTO network_imeis_new
                               SELECT *
                                 FROM network_imeis""")

        # Add in indexes to each partition
        idx_metadata = [
            IndexMetadatum(idx_cols=cols, is_unique=is_uniq, partial_sql=partial)
            for cols, is_uniq, partial in [
                (['imei_norm'], True, None),
                (['first_seen'], False, None)
            ]
        ]
        add_indices(conn, tbl_name='network_imeis_new', idx_metadata=idx_metadata)

        # Drop old table, rename tables, indexes and constraints
        cursor.execute('DROP TABLE network_imeis CASCADE')
        rename_table_and_indices(conn, old_tbl_name='network_imeis_new',
                                 new_tbl_name='network_imeis', idx_metadata=idx_metadata)
Beispiel #11
0
def create_monthly_network_triplets_per_mno_partition(conn, *, operator_id, month, year, suffix='',
                                                      num_physical_shards=None, fillfactor=45):
    """Function to DRY out creation of a new month/year partition for monthly_network_triplets_per_mno."""
    if num_physical_shards is None:
        num_physical_shards = num_physical_imei_shards(conn)

    with conn.cursor() as cursor, utils.db_role_setter(conn, role_name='dirbs_core_import_operator'):
        parent_tbl_name = 'monthly_network_triplets_per_mno{0}'.format(suffix)
        op_part_name = monthly_network_triplets_per_mno_partition(operator_id=operator_id, month=None,
                                                                  year=None, suffix=suffix)
        assert len(op_part_name) < 64

        cursor.execute(
            sql.SQL(
                """CREATE TABLE IF NOT EXISTS {0} PARTITION OF {1}
                   FOR VALUES IN (%s) PARTITION BY RANGE (triplet_year, triplet_month)
                """
            ).format(sql.Identifier(op_part_name), sql.Identifier(parent_tbl_name)),
            [operator_id]
        )
        _grant_perms_monthly_network_triplets(conn, part_name=op_part_name)

        part_name = monthly_network_triplets_per_mno_partition(operator_id=operator_id,
                                                               month=month, year=year, suffix=suffix)
        assert len(part_name) < 64

        cursor.execute(
            sql.SQL(
                """CREATE TABLE {0} PARTITION OF {1}
                   FOR VALUES FROM %s TO %s PARTITION BY RANGE (virt_imei_shard)
                """
            ).format(sql.Identifier(part_name), sql.Identifier(op_part_name)),
            [(year, month), (year, month + 1)]
        )
        _grant_perms_monthly_network_triplets(conn, part_name=part_name)

        # Create child partitions
        create_imei_shard_partitions(conn, tbl_name=part_name, num_physical_shards=num_physical_shards,
                                     perms_func=_grant_perms_monthly_network_triplets, fillfactor=fillfactor)
Beispiel #12
0
    def _repartition_exceptions_lists(self, conn, *, num_physical_shards):
        """Repartition the exceptions lists to support msisdn."""
        with conn.cursor() as cursor, utils.db_role_setter(
                conn, role_name='dirbs_core_listgen'):
            cursor.execute("""CREATE TABLE exceptions_lists_new (
                       LIKE exceptions_lists INCLUDING DEFAULTS
                                             INCLUDING IDENTITY
                                             INCLUDING CONSTRAINTS
                                             INCLUDING STORAGE
                                             INCLUDING COMMENTS
                   )
                   PARTITION BY LIST (operator_id);

                   ALTER TABLE exceptions_lists_new ADD COLUMN msisdn TEXT NOT NULL;
                """)

            part_utils._grant_perms_list(
                conn,
                part_name='exceptions_lists_new')  # grant relevant permissions
            imei_shard_names = utils.child_table_names(
                conn, 'exceptions_lists')  # determine the child table names
            operators = [
                o.operator_id for o in utils.table_invariants_list(
                    conn, imei_shard_names, ['operator_id'])
            ]  # workout who the operators are

            # create child partitions for new list (operator at top level, then IMEI sharded)
            for op_id in operators:
                tbl_name = part_utils.per_mno_lists_partition(
                    operator_id=op_id, suffix='_new', list_type='exceptions')
                part_utils.create_per_mno_lists_partition(
                    conn,
                    parent_tbl_name='exceptions_lists_new',
                    tbl_name=tbl_name,
                    operator_id=op_id,
                    num_physical_shards=num_physical_shards)

            # insert data into the new parent partition
            cursor.execute("""INSERT INTO exceptions_lists_new
                                   SELECT e.row_id, e.operator_id, e.imei_norm, e.imsi, e.start_run_id, e.end_run_id,
                                          e.delta_reason, e.virt_imei_shard, p.msisdn
                                     FROM exceptions_lists e
                               INNER JOIN historic_pairing_list p ON e.imsi = p.imsi"""
                           )

            # add indexes in each partitions
            part_utils.add_indices(
                conn,
                tbl_name='exceptions_lists_new',
                idx_metadata=part_utils.exceptions_lists_indices())

            # drop old table, after assigning sequence to new table
            cursor.execute(
                'ALTER SEQUENCE exceptions_lists_row_id_seq OWNED BY exceptions_lists_new.row_id'
            )
            cursor.execute('DROP TABLE exceptions_lists CASCADE')

            # rename table, indexes and constraints
            part_utils.rename_table_and_indices(
                conn,
                old_tbl_name='exceptions_lists_new',
                new_tbl_name='exceptions_lists',
                idx_metadata=part_utils.exceptions_lists_indices())

            # recreating gen_exceptionlist function
            with utils.db_role_setter(conn, role_name='dirbs_core_power_user'):
                cursor.execute("""
                                DROP FUNCTION gen_exceptions_list(op_id TEXT, run_id BIGINT);

                                --
                                -- Recreate function to generate a full exceptions_list for a given
                                -- run_id and operator.
                                -- A value of -1 means get the latest list.
                                --
                                CREATE FUNCTION gen_exceptions_list(op_id TEXT, run_id BIGINT = -1)
                                    RETURNS TABLE (
                                        imei_norm       TEXT,
                                        virt_imei_shard SMALLINT,
                                        imsi            TEXT,
                                        msisdn          TEXT
                                    )
                                    LANGUAGE plpgsql STRICT STABLE PARALLEL SAFE
                                    AS $$
                                BEGIN
                                    --
                                    -- If we don't specify a run_id, just set to the maximum run_id which will always
                                    -- return all rows where end_run_id is NULL
                                    --
                                    IF run_id = -1 THEN
                                        run_id := max_bigint();
                                    END IF;

                                    RETURN QUERY SELECT el.imei_norm,
                                                        el.virt_imei_shard,
                                                        el.imsi,
                                                        el.msisdn
                                                   FROM exceptions_lists el
                                                  WHERE el.operator_id = op_id
                                                    AND el.delta_reason != 'removed'
                                                    AND run_id >= el.start_run_id
                                                    AND (run_id < el.end_run_id OR el.end_run_id IS NULL);
                                END
                                $$;

                                DROP FUNCTION gen_delta_exceptions_list(op_id TEXT, base_run_id BIGINT, run_id BIGINT);

                                --
                                -- Create function to generate a per-MNO delta exceptions list for a run_id, operator
                                -- id and optional base_run_id.
                                --
                                -- If not base_run_id is supplied, this function will use the maximum run_id found in
                                -- the DB that it less than than the supplied run_id
                                --
                                CREATE FUNCTION gen_delta_exceptions_list(op_id TEXT,
                                                                          base_run_id BIGINT,
                                                                          run_id BIGINT = -1)
                                    RETURNS TABLE (
                                        imei_norm       TEXT,
                                        imsi            TEXT,
                                        msisdn          TEXT,
                                        delta_reason    TEXT
                                    )
                                    LANGUAGE plpgsql STRICT STABLE PARALLEL SAFE
                                    AS $$
                                BEGIN
                                    --
                                    -- If we don't specify a run_id, just set to the maximum run_id
                                    --
                                    IF run_id = -1 THEN
                                        run_id := max_bigint();
                                    END IF;

                                    IF run_id < base_run_id THEN
                                      RAISE EXCEPTION 'Parameter base_run_id % greater than run_id %',
                                                      base_run_id, run_id;
                                    END IF;

                                    RETURN QUERY SELECT *
                                                   FROM (SELECT el.imei_norm,
                                                                el.imsi,
                                                                el.msisdn,
                                                                overall_delta_reason(el.delta_reason
                                                                        ORDER BY start_run_id DESC) AS delta_reason
                                                           FROM exceptions_lists el
                                                          WHERE operator_id = op_id
                                                            AND start_run_id > base_run_id
                                                            AND start_run_id <= run_id
                                                       GROUP BY el.imei_norm, el.imsi, el.msisdn) x
                                                  WHERE x.delta_reason IS NOT NULL;
                                END
                                $$;
                                """)  # noqa: Q440, Q441
Beispiel #13
0
    def upgrade(self, db_conn):  # noqa: C901
        """Overrides AbstractMigrator upgrade method."""
        logger = logging.getLogger('dirbs.db')
        with db_conn.cursor() as cursor:
            cursor.execute(
                """CREATE FUNCTION calc_virt_imei_shard(imei TEXT) RETURNS SMALLINT
                              LANGUAGE plpgsql IMMUTABLE PARALLEL SAFE
                              AS $$
                              BEGIN
                                  RETURN SUBSTRING(COALESCE(imei, ''), 13, 2)::SMALLINT;
                              EXCEPTION WHEN OTHERS THEN
                                  RETURN 0;
                              END;
                              $$""")

            # By default, create 4 shards
            num_initial_shards = 4

            logger.info('Re-partitioning classification_state table...')
            cursor.execute(
                'ALTER TABLE classification_state ADD COLUMN virt_imei_shard SMALLINT'
            )
            cursor.execute(
                'UPDATE classification_state SET virt_imei_shard = calc_virt_imei_shard(imei_norm)'
            )
            cursor.execute(
                'ALTER TABLE classification_state ALTER COLUMN virt_imei_shard SET NOT NULL'
            )
            part_utils.repartition_classification_state(
                db_conn, num_physical_shards=num_initial_shards)
            logger.info('Re-partitioned classification_state table')

            logger.info('Re-partitioning registration_list table...')
            cursor.execute(
                'ALTER TABLE historic_registration_list ADD COLUMN virt_imei_shard SMALLINT'
            )
            cursor.execute(
                'UPDATE historic_registration_list SET virt_imei_shard = calc_virt_imei_shard(imei_norm)'
            )
            cursor.execute(
                'ALTER TABLE historic_registration_list ALTER COLUMN virt_imei_shard SET NOT NULL'
            )
            self.partition_registration_list(
                db_conn, num_physical_shards=num_initial_shards)
            logger.info('Re-partitioned registration_list table')

            logger.info('Re-partitioning pairing_list table...')
            cursor.execute(
                'ALTER TABLE historic_pairing_list ADD COLUMN virt_imei_shard SMALLINT'
            )
            cursor.execute(
                'UPDATE historic_pairing_list SET virt_imei_shard = calc_virt_imei_shard(imei_norm)'
            )
            cursor.execute(
                'ALTER TABLE historic_pairing_list ALTER COLUMN virt_imei_shard SET NOT NULL'
            )
            part_utils.repartition_pairing_list(
                db_conn, num_physical_shards=num_initial_shards)
            logger.info('Re-partitioned pairing_list table')

            logger.info('Re-partitioning blacklist table...')
            cursor.execute(
                'ALTER TABLE blacklist ADD COLUMN virt_imei_shard SMALLINT')
            cursor.execute(
                'UPDATE blacklist SET virt_imei_shard = calc_virt_imei_shard(imei_norm)'
            )
            cursor.execute(
                'ALTER TABLE blacklist ALTER COLUMN virt_imei_shard SET NOT NULL'
            )
            part_utils.repartition_blacklist(
                db_conn, num_physical_shards=num_initial_shards)
            logger.info('Re-partitioned blacklist table')

            # Need to make sure owner of list tables is dirbs_core_listgen
            logger.info('Re-partitioning notifications_lists table...')
            # The original notifications_lists were not created with a single sequence for the IDs, so just do now
            with utils.db_role_setter(db_conn, role_name='dirbs_core_listgen'):
                cursor.execute(
                    """CREATE UNLOGGED TABLE notifications_lists_new (
                           row_id BIGSERIAL NOT NULL,
                           operator_id TEXT NOT NULL,
                           imei_norm TEXT NOT NULL,
                           imsi TEXT NOT NULL,
                           msisdn TEXT NOT NULL,
                           block_date DATE NOT NULL,
                           reasons TEXT[] NOT NULL,
                           amnesty_granted BOOLEAN DEFAULT FALSE NOT NULL,
                           start_run_id BIGINT NOT NULL,
                           end_run_id BIGINT,
                           delta_reason TEXT NOT NULL CHECK (delta_reason IN ('new', 'resolved', 'blacklisted',
                                                                              'no_longer_seen', 'changed')),
                           virt_imei_shard SMALLINT NOT NULL
                       ) PARTITION BY LIST (operator_id)
                    """)

            # Work out who the operators are
            partitions = utils.child_table_names(db_conn,
                                                 'notifications_lists')
            # Make sure that they are owned by dirbs_core_listgen (they can be owner by dirbs_core_power_user)
            # due to bad previous migration scripts
            with utils.db_role_setter(db_conn,
                                      role_name='dirbs_core_power_user'):
                for p in partitions:
                    cursor.execute(
                        sql.SQL('ALTER TABLE {0} OWNER TO dirbs_core_listgen').
                        format(sql.Identifier(p)))

            operators = [
                x.operator_id for x in utils.table_invariants_list(
                    db_conn, partitions, ['operator_id'])
            ]

            # Create operator child partitions
            for op_id in operators:
                tbl_name = part_utils.per_mno_lists_partition(
                    operator_id=op_id,
                    suffix='_new',
                    list_type='notifications')
                part_utils.create_per_mno_lists_partition(
                    db_conn,
                    operator_id=op_id,
                    parent_tbl_name='notifications_lists_new',
                    tbl_name=tbl_name,
                    num_physical_shards=1,
                    unlogged=True,
                    fillfactor=100)

            cursor.execute(
                """INSERT INTO notifications_lists_new(operator_id, imei_norm, imsi, msisdn, block_date,
                                                       reasons, start_run_id, end_run_id, delta_reason,
                                                       virt_imei_shard)
                        SELECT operator_id, imei_norm, imsi, msisdn, block_date,
                               reasons, start_run_id, end_run_id, delta_reason, calc_virt_imei_shard(imei_norm)
                          FROM notifications_lists
                """)
            # Drop old table, rename tables, indexes and constraints
            cursor.execute("""ALTER TABLE notifications_lists_new
                              RENAME CONSTRAINT notifications_lists_new_delta_reason_check
                              TO notifications_lists_delta_reason_check""")
            cursor.execute('DROP TABLE notifications_lists CASCADE')
            cursor.execute("""ALTER SEQUENCE notifications_lists_new_row_id_seq
                              RENAME TO notifications_lists_row_id_seq""")
            part_utils.rename_table_and_indices(
                db_conn,
                old_tbl_name='notifications_lists_new',
                new_tbl_name='notifications_lists')
            part_utils.repartition_notifications_lists(
                db_conn, num_physical_shards=num_initial_shards)
            logger.info('Re-partitioned notifications_lists table')

            logger.info('Re-partitioning exceptions_lists table...')
            # The original exceptions_lists were not created with a single sequence for the IDs, so just do now
            with utils.db_role_setter(db_conn, role_name='dirbs_core_listgen'):
                cursor.execute("""CREATE UNLOGGED TABLE exceptions_lists_new (
                           row_id BIGSERIAL NOT NULL,
                           operator_id TEXT NOT NULL,
                           imei_norm TEXT NOT NULL,
                           imsi TEXT NOT NULL,
                           start_run_id BIGINT NOT NULL,
                           end_run_id BIGINT,
                           delta_reason TEXT NOT NULL CHECK (delta_reason IN ('added', 'removed')),
                           virt_imei_shard SMALLINT NOT NULL
                       ) PARTITION BY LIST (operator_id)
                    """)
            # Work out who the operators are
            partitions = utils.child_table_names(db_conn, 'exceptions_lists')
            # Make sure that they are owned by dirbs_core_listgen (they can be owner by dirbs_core_power_user)
            # due to bad previous migration scripts
            with utils.db_role_setter(db_conn,
                                      role_name='dirbs_core_power_user'):
                for p in partitions:
                    cursor.execute(
                        sql.SQL('ALTER TABLE {0} OWNER TO dirbs_core_listgen').
                        format(sql.Identifier(p)))
            operators = [
                x.operator_id for x in utils.table_invariants_list(
                    db_conn, partitions, ['operator_id'])
            ]

            # Create operator child partitions
            for op_id in operators:
                tbl_name = part_utils.per_mno_lists_partition(
                    operator_id=op_id, suffix='_new', list_type='exceptions')
                part_utils.create_per_mno_lists_partition(
                    db_conn,
                    operator_id=op_id,
                    parent_tbl_name='exceptions_lists_new',
                    tbl_name=tbl_name,
                    num_physical_shards=1,
                    unlogged=True,
                    fillfactor=100)

            cursor.execute(
                """INSERT INTO exceptions_lists_new(operator_id, imei_norm, imsi, start_run_id,
                                                    end_run_id, delta_reason, virt_imei_shard)
                        SELECT operator_id, imei_norm, imsi, start_run_id, end_run_id, delta_reason,
                               calc_virt_imei_shard(imei_norm)
                          FROM exceptions_lists
                """)
            # Drop old table, rename tables, indexes and constraints
            cursor.execute("""ALTER TABLE exceptions_lists_new
                              RENAME CONSTRAINT exceptions_lists_new_delta_reason_check
                              TO exceptions_lists_delta_reason_check""")
            cursor.execute('DROP TABLE exceptions_lists CASCADE')
            cursor.execute(
                'ALTER SEQUENCE exceptions_lists_new_row_id_seq RENAME TO exceptions_lists_row_id_seq'
            )
            part_utils.rename_table_and_indices(
                db_conn,
                old_tbl_name='exceptions_lists_new',
                new_tbl_name='exceptions_lists')
            part_utils.repartition_exceptions_lists(
                db_conn, num_physical_shards=num_initial_shards)
            logger.info('Re-partitioned exceptions_lists table')

            logger.info('Re-partitioning seen_imeis (network_imeis) table')
            # First, just put everything in a temporary table so that we can call partutils
            with utils.db_role_setter(db_conn,
                                      role_name='dirbs_core_import_operator'):
                cursor.execute("""CREATE UNLOGGED TABLE network_imeis (
                           first_seen DATE NOT NULL,
                           last_seen DATE NOT NULL,
                           seen_rat_bitmask INTEGER,
                           imei_norm TEXT NOT NULL,
                           virt_imei_shard SMALLINT NOT NULL
                       )
                    """)
            #
            # We disable index scans here as doing a merge append with index scans is much slower and involves
            # a lot of seeks which kills performance on non-SSD drives. Better to use an append plan and sort
            # the results by imei_norm
            #
            cursor.execute('SET enable_indexscan = false')
            cursor.execute("""INSERT INTO network_imeis
                        SELECT MIN(first_seen),
                               MAX(last_seen),
                               bit_or(seen_rat_bitmask),
                               imei_norm,
                               calc_virt_imei_shard(imei_norm)
                          FROM seen_imeis
                      GROUP BY imei_norm
                """)
            cursor.execute('SET enable_indexscan = true')
            part_utils.repartition_network_imeis(
                db_conn, num_physical_shards=num_initial_shards)
            cursor.execute('DROP TABLE seen_imeis CASCADE')
            logger.info('Re-partitioned seen_imeis (network_imeis) table')

            # First, just put all country-level triplets in a temporary table so that we can call partition_utils
            with utils.db_role_setter(db_conn,
                                      role_name='dirbs_core_import_operator'):
                cursor.execute(
                    """CREATE UNLOGGED TABLE monthly_network_triplets_country (
                           triplet_year SMALLINT NOT NULL,
                           triplet_month SMALLINT NOT NULL,
                           first_seen DATE NOT NULL,
                           last_seen DATE NOT NULL,
                           date_bitmask INTEGER NOT NULL,
                           triplet_hash UUID NOT NULL,
                           imei_norm TEXT,
                           imsi TEXT,
                           msisdn TEXT,
                           virt_imei_shard SMALLINT NOT NULL,
                           CHECK (last_seen >= first_seen),
                           CHECK (EXTRACT(month FROM last_seen) = triplet_month AND
                                  EXTRACT(year FROM last_seen) = triplet_year),
                           CHECK (EXTRACT(month FROM first_seen) = triplet_month AND
                                  EXTRACT(year FROM first_seen) = triplet_year)
                       ) PARTITION BY RANGE (triplet_year, triplet_month)
                    """)

            # Work out what partitions to create and create them
            partitions = utils.child_table_names(db_conn, 'seen_triplets')
            # Make sure that they are owned by dirbs_core_import_operator (they can be owner by dirbs_core_power_user)
            # due to bad previous migration scripts
            with utils.db_role_setter(db_conn,
                                      role_name='dirbs_core_power_user'):
                for p in partitions:
                    cursor.execute(
                        sql.SQL(
                            'ALTER TABLE {0} OWNER TO dirbs_core_import_operator'
                        ).format(sql.Identifier(p)))

            year_month_tuples = {
                (x.triplet_year, x.triplet_month)
                for x in utils.table_invariants_list(
                    db_conn, partitions, ['triplet_year', 'triplet_month'])
            }
            for year, month in year_month_tuples:
                part_utils.create_monthly_network_triplets_country_partition(
                    db_conn, month=month, year=year, num_physical_shards=1)

            with utils.db_role_setter(db_conn,
                                      role_name='dirbs_core_import_operator'):
                cursor.execute(
                    """CREATE UNLOGGED TABLE monthly_network_triplets_per_mno (
                            LIKE monthly_network_triplets_country INCLUDING ALL,
                            operator_id TEXT NOT NULL
                       ) PARTITION BY LIST (operator_id)
                    """)

            # Work out what partitions to create and create them
            op_year_month_tuples = {
                (x.operator_id, x.triplet_year, x.triplet_month)
                for x in utils.table_invariants_list(
                    db_conn, partitions,
                    ['operator_id', 'triplet_year', 'triplet_month'])
            }
            # Create child partitions at per-MNO level
            for op, year, month in op_year_month_tuples:
                part_utils.create_monthly_network_triplets_per_mno_partition(
                    db_conn,
                    operator_id=op,
                    month=month,
                    year=year,
                    num_physical_shards=1)

            # Create temporary monthly_network_triplets_per_mno table
            for year, month in year_month_tuples:
                logger.info(
                    'Generating temporary monthly_network_triplets_per_mno entries for {0:02d}/{1:d}...'
                    .format(month, year))
                cursor.execute(
                    """INSERT INTO monthly_network_triplets_per_mno
                            SELECT %(year)s,
                                   %(month)s,
                                   first_seen,
                                   last_seen,
                                   date_bitmask,
                                   triplet_hash,
                                   imei_norm,
                                   imsi,
                                   msisdn,
                                   calc_virt_imei_shard(imei_norm),
                                   operator_id
                              FROM seen_triplets
                             WHERE triplet_year = %(year)s
                               AND triplet_month = %(month)s
                    """, {
                        'year': year,
                        'month': month
                    })
                logger.info(
                    'Generated temporary monthly_network_triplets_per_mno entries for {0:02d}/{1:d}'
                    .format(month, year))

            # Create temporary monthly_network_triplets_country table. We need to do this monthly as we need
            # to aggregate by triplets on a monthly basis
            #
            # We disable index scans here as doing a merge append with index scans is much slower and involves
            # a lot of seeks which kills performance on non-SSD drives. Better to use an append plan and sort
            # the results by imei_norm
            #
            cursor.execute('SET enable_indexscan = false')
            for year, month in year_month_tuples:
                logger.info(
                    'Generating temporary monthly_network_triplets_country entries for {0:02d}/{1:d}...'
                    .format(month, year))
                cursor.execute(
                    """INSERT INTO monthly_network_triplets_country
                            SELECT %(year)s,
                                   %(month)s,
                                   MIN(first_seen),
                                   MAX(last_seen),
                                   bit_or(date_bitmask),
                                   triplet_hash,
                                   FIRST(imei_norm),
                                   FIRST(imsi),
                                   FIRST(msisdn),
                                   calc_virt_imei_shard(FIRST(imei_norm))
                              FROM seen_triplets
                             WHERE triplet_year = %(year)s
                               AND triplet_month = %(month)s
                          GROUP BY triplet_hash
                    """, {
                        'year': year,
                        'month': month
                    })
                logger.info(
                    'Generated temporary monthly_network_triplets_country entries for {0:02d}/{1:d}'
                    .format(month, year))
            cursor.execute('SET enable_indexscan = true')

            logger.info(
                'Re-partitioning temporary monthly_network_triplets tables...')
            # Previously, the operator_data view was owned by dirbs_core_power_user but is now owned by the
            # dirbs_core_import_operator since it must be re-created
            with utils.db_role_setter(db_conn,
                                      role_name='dirbs_core_power_user'):
                cursor.execute(
                    'ALTER VIEW operator_data OWNER TO dirbs_core_import_operator'
                )
            part_utils.repartition_monthly_network_triplets(
                db_conn, num_physical_shards=num_initial_shards)
            cursor.execute('DROP TABLE seen_triplets CASCADE')
            logger.info(
                'Re-partitioned temporary monthly_network_triplets tables')

            # Replace list generation function to include virt_imei_shard
            cursor.execute("""
                DROP FUNCTION gen_blacklist(run_id BIGINT);
                DROP FUNCTION gen_notifications_list(op_id TEXT, run_id BIGINT);
                DROP FUNCTION gen_exceptions_list(op_id TEXT, run_id BIGINT);

                --
                -- Create function to generate a full blacklist for a given run_id. A value of -1 means get the latest
                -- list.
                --
                CREATE FUNCTION gen_blacklist(run_id BIGINT = -1)
                    RETURNS TABLE (
                        imei_norm       TEXT,
                        virt_imei_shard SMALLINT,
                        block_date      DATE,
                        reasons         TEXT[]
                    )
                    LANGUAGE plpgsql STRICT STABLE PARALLEL SAFE
                    AS $$
                DECLARE
                    query_run_id    BIGINT;
                BEGIN
                    --
                    -- If we don't specify a run_id, just set to the maximum run_id which will always return all rows
                    -- where end_run_id is NULL
                    --
                    IF run_id = -1 THEN
                        run_id := max_bigint();
                    END IF;

                    RETURN QUERY SELECT bl.imei_norm,
                                        bl.virt_imei_shard,
                                        bl.block_date,
                                        bl.reasons
                                   FROM blacklist bl
                                  WHERE bl.delta_reason != 'unblocked'
                                    AND run_id >= bl.start_run_id
                                    AND (run_id < bl.end_run_id OR bl.end_run_id IS NULL);
                END
                $$;

                --
                -- Create function to generate a full notifications_list for a given run_id and operator ID. A value
                -- of -1 means get the latest list.
                --
                CREATE FUNCTION gen_notifications_list(op_id TEXT, run_id BIGINT = -1)
                    RETURNS TABLE (
                        imei_norm       TEXT,
                        virt_imei_shard SMALLINT,
                        imsi            TEXT,
                        msisdn          TEXT,
                        block_date      DATE,
                        reasons         TEXT[],
                        amnesty_granted BOOLEAN
                    )
                    LANGUAGE plpgsql STRICT STABLE PARALLEL SAFE
                    AS $$
                BEGIN
                    --
                    -- If we don't specify a run_id, just set to the maximum run_id which will always return all rows
                    -- where end_run_id is NULL
                    --
                    IF run_id = -1 THEN
                        run_id := max_bigint();
                    END IF;

                    RETURN QUERY SELECT nl.imei_norm,
                                        nl.virt_imei_shard,
                                        nl.imsi,
                                        nl.msisdn,
                                        nl.block_date,
                                        nl.reasons,
                                        nl.amnesty_granted
                                   FROM notifications_lists nl
                                  WHERE nl.operator_id = op_id
                                    AND nl.delta_reason NOT IN ('resolved', 'blacklisted')
                                    AND run_id >= nl.start_run_id
                                    AND (run_id < nl.end_run_id OR nl.end_run_id IS NULL);
                END
                $$;

                --
                -- Create function to generate a full exceptions_list for a given run_id and operator ID. A value
                -- of -1 means get the latest list.
                --
                CREATE FUNCTION gen_exceptions_list(op_id TEXT, run_id BIGINT = -1)
                    RETURNS TABLE (
                        imei_norm       TEXT,
                        virt_imei_shard SMALLINT,
                        imsi            TEXT
                    )
                    LANGUAGE plpgsql STRICT STABLE PARALLEL SAFE
                    AS $$
                BEGIN
                    --
                    -- If we don't specify a run_id, just set to the maximum run_id which will always return all
                    -- rows where end_run_id is NULL
                    --
                    IF run_id = -1 THEN
                        run_id := max_bigint();
                    END IF;

                    RETURN QUERY SELECT el.imei_norm,
                                        el.virt_imei_shard,
                                        el.imsi
                                   FROM exceptions_lists el
                                  WHERE el.operator_id = op_id
                                    AND el.delta_reason != 'removed'
                                    AND run_id >= el.start_run_id
                                    AND (run_id < el.end_run_id OR el.end_run_id IS NULL);
                END
                $$;
            """)  # noqa: Q440, Q441

            # Update schema metadata table
            cursor.execute(
                """ALTER TABLE schema_metadata ADD COLUMN phys_shards SMALLINT NOT NULL
                              DEFAULT %s CHECK (phys_shards > 0 AND phys_shards <= 100)""",
                [num_initial_shards])
            cursor.execute(
                'ALTER TABLE schema_metadata ALTER COLUMN phys_shards DROP DEFAULT'
            )

            # Drop obsolete columns
            cursor.execute(
                'ALTER TABLE schema_metadata DROP COLUMN potential_whitespace_imsis_msisdns'
            )
            cursor.execute(
                'ALTER TABLE report_monthly_stats DROP COLUMN num_whitespace_imsi_records'
            )
            cursor.execute(
                'ALTER TABLE report_monthly_stats DROP COLUMN num_whitespace_msisdn_records'
            )
Beispiel #14
0
def lists(ctx, config, statsd, logger, run_id, conn, metadata_conn, command,
          metrics_root, metrics_run_root):
    """Prune obsolete lists data."""
    curr_date = ctx.obj['CURR_DATE']

    # store metadata
    metadata.add_optional_job_metadata(
        metadata_conn,
        command,
        run_id,
        retention_months=config.retention_config.months_retention)

    logger.info(
        'Pruning lists tables to remove any obsolete data with end_time outside the retention window..'
    )
    retention_months = config.retention_config.months_retention

    if curr_date is None:
        curr_date = datetime.date.today()

    first_month_to_drop = datetime.date(
        curr_date.year, curr_date.month,
        1) - relativedelta.relativedelta(months=retention_months)
    logger.info(
        'Lists data with end_time earlier than {0} will be pruned'.format(
            first_month_to_drop))

    with utils.db_role_setter(
            conn, role_name='dirbs_core_power_user'), conn.cursor() as cursor:
        logger.debug('Calculating original number of rows in lists tables...')
        row_count_sql = sql.SQL(
            """SELECT blacklist_row_count, noft_lists_row_count, excp_lists_row_count
                                     FROM (SELECT COUNT(*)
                                             FROM blacklist) AS blacklist_row_count,
                                          (SELECT COUNT(*)
                                             FROM notifications_lists) AS noft_lists_row_count,
                                          (SELECT COUNT(*)
                                             FROM exceptions_lists) AS excp_lists_row_count"""
        )
        cursor.execute(row_count_sql)
        rows_before = cursor.fetchone()
        blacklist_rows_before = int(
            rows_before.blacklist_row_count.strip('()'))
        notflist_rows_before = int(
            rows_before.noft_lists_row_count.strip('()'))
        excplist_rows_before = int(
            rows_before.excp_lists_row_count.strip('()'))
        rows_before = blacklist_rows_before + notflist_rows_before + excplist_rows_before
        logger.debug('Calculated original number of rows in lists tables...')
        statsd.gauge('{0}blacklist_rows_before'.format(metrics_run_root),
                     blacklist_rows_before)
        statsd.gauge(
            '{0}notifications_lists_rows_before'.format(metrics_run_root),
            notflist_rows_before)
        statsd.gauge(
            '{0}exceptions_lists_rows_before'.format(metrics_run_root),
            excplist_rows_before)
        metadata.add_optional_job_metadata(
            metadata_conn,
            command,
            run_id,
            blacklist_rows_before=blacklist_rows_before,
            notifications_lists_rows_before=notflist_rows_before,
            exceptions_lists_rows_before=excplist_rows_before)

        # Calculate number of rows in the lists table outside the retention window
        job_metadata_filter_sql = """SELECT run_id
                                       FROM job_metadata
                                      WHERE command = 'dirbs-listgen'
                                        AND end_time < '{0}'""".format(
            first_month_to_drop)

        cursor.execute(
            sql.SQL("""SELECT COUNT(*)
                                    FROM blacklist
                                   WHERE start_run_id IN ({0})""".format(
                job_metadata_filter_sql)))
        total_bl_rows_out_window_to_prune = cursor.fetchone()[0]
        logger.info(
            'Found {0:d} rows of blacklist table outside the retention window to prune'
            .format(total_bl_rows_out_window_to_prune))

        cursor.execute(
            sql.SQL("""SELECT COUNT(*)
                                    FROM notifications_lists
                                   WHERE start_run_id IN ({0})""".format(
                job_metadata_filter_sql)))
        total_nl_rows_out_window_to_prune = cursor.fetchone()[0]
        logger.info(
            'Found {0:d} rows of notifications lists table outside the retention window to prune'
            .format(total_nl_rows_out_window_to_prune))

        cursor.execute(
            sql.SQL("""SELECT COUNT(*)
                                    FROM exceptions_lists
                                   WHERE start_run_id IN ({0})""".format(
                job_metadata_filter_sql)))
        total_nl_rows_out_window_to_prune = cursor.fetchone()[0]
        logger.info(
            'Found {0:d} rows of exceptions lists table outside the retention window to prune'
            .format(total_nl_rows_out_window_to_prune))

        # We repartition the tables to re-create them, passing a condition sql
        logger.debug('Re-creating blacklist table...')
        num_phys_imei_shards = partition_utils.num_physical_imei_shards(conn)
        src_filter_sql = cursor.mogrify(
            """WHERE start_run_id NOT IN ({0})""".format(
                job_metadata_filter_sql))
        partition_utils.repartition_blacklist(
            conn,
            num_physical_shards=num_phys_imei_shards,
            src_filter_sql=str(src_filter_sql, encoding=conn.encoding))
        logger.debug('Re-created blacklist table')

        logger.debug('Re-creating notifications lists table...')
        partition_utils.repartition_notifications_lists(
            conn,
            num_physical_shards=num_phys_imei_shards,
            src_filter_sql=str(src_filter_sql, encoding=conn.encoding))
        logger.debug('Re-created notifications lists table')

        logger.debug('Re-creating exceptions lists table...')
        partition_utils.repartition_exceptions_lists(
            conn,
            num_physical_shards=num_phys_imei_shards,
            src_filter_sql=str(src_filter_sql, encoding=conn.encoding))
        logger.debug('Re-created exceptions lists table')

        logger.debug('Calculating new number of rows in lists tables...')
        cursor.execute(row_count_sql)
        rows_after = cursor.fetchone()
        blacklist_rows_after = int(rows_after.blacklist_row_count.strip('()'))
        notflist_rows_after = int(rows_after.noft_lists_row_count.strip('()'))
        excplist_rows_after = int(rows_after.excp_lists_row_count.strip('()'))
        rows_after = blacklist_rows_after + notflist_rows_after + excplist_rows_after
        logger.debug('Calculated new number of rows in lists tables')
        statsd.gauge('{0}blacklist_rows_after'.format(metrics_run_root),
                     blacklist_rows_after)
        statsd.gauge(
            '{0}notifications_lists_rows_after'.format(metrics_run_root),
            notflist_rows_after)
        statsd.gauge('{0}exceptions_lists_rows_after'.format(metrics_run_root),
                     excplist_rows_after)
        metadata.add_optional_job_metadata(
            metadata_conn,
            command,
            run_id,
            blacklist_rows_before=blacklist_rows_after,
            notifications_lists_rows_before=notflist_rows_after,
            exceptions_lists_rows_before=excplist_rows_after)
        logger.info('Pruned {0:d} rows from lists tables'.format(rows_after -
                                                                 rows_before))
Beispiel #15
0
def blacklist(ctx, config, statsd, logger, run_id, conn, metadata_conn,
              command, metrics_root, metrics_run_root, condition_name,
              prune_all):
    """Expire IMEIs outside the blacklist retention period from blacklist."""
    current_date = datetime.date.today()
    retention_days = config.retention_config.blacklist_retention

    if condition_name is None and prune_all is False:
        logger.info(
            'Error: one of the arguments "condition_name" or "--prune-all" is required'
        )
        metadata.add_optional_job_metadata(metadata_conn,
                                           command,
                                           run_id,
                                           curr_date=current_date.isoformat(),
                                           retention_days=retention_days,
                                           job_executed=False)
    elif condition_name is not None and prune_all is True:
        logger.info(
            'Error: only one of the arguments "condition_name" or "--prune-all" is required'
        )
        metadata.add_optional_job_metadata(metadata_conn,
                                           command,
                                           run_id,
                                           curr_date=current_date.isoformat(),
                                           retention_days=retention_days,
                                           job_executed=False)
    elif retention_days == 0:
        logger.info(
            'Blacklist will not be prune, as retention value is set to {0}'.
            format(retention_days))
        metadata.add_optional_job_metadata(metadata_conn,
                                           command,
                                           run_id,
                                           curr_date=current_date.isoformat(),
                                           retention_days=retention_days,
                                           job_executed=False)
    else:
        _warn_about_prune_all(prune_all, logger)
        logger.info(
            'Pruning blacklist to remove any data related to specified condition '
            'outside the retention window.')
        last_retention_date = datetime.date(
            current_date.year, current_date.month,
            current_date.day) - datetime.timedelta(retention_days)

        # store metadata
        logger.info(
            'Blacklist entries with start_date earlier than {0} will be pruned'
            .format(last_retention_date))
        metadata.add_optional_job_metadata(
            metadata_conn,
            command,
            run_id,
            curr_date=current_date.isoformat(),
            retention_days=retention_days,
            job_executed=True,
            last_retention_date=last_retention_date.isoformat())

        with utils.db_role_setter(
                conn,
                role_name='dirbs_core_power_user'), conn.cursor() as cursor:
            logger.debug(
                'Calculating original number of rows with block_date in classification_state table...'
            )

            cursor.execute("""SELECT COUNT(*)
                                FROM classification_state
                               WHERE block_date IS NOT NULL
                                 AND end_date IS NULL""")
            rows_before = cursor.fetchone()[0]

            logger.debug(
                'Calculated original number of rows (having block_date) in classification_state table'
            )
            statsd.gauge('{0}rows_before'.format(metrics_run_root),
                         rows_before)
            metadata.add_optional_job_metadata(metadata_conn,
                                               command,
                                               run_id,
                                               rows_before=rows_before)

            # if its a condition based pruning
            if not prune_all:
                cursor.execute(
                    sql.SQL("""SELECT COUNT(*)
                                            FROM classification_state
                                           WHERE start_date < %s
                                             AND cond_name = %s
                                             AND end_date IS NULL
                                             AND block_date IS NOT NULL"""),
                    [last_retention_date, condition_name[0].label])
                total_rows_to_prune = cursor.fetchone()[0]

                logger.info(
                    'Found {0:d} rows of classification_state table '
                    'with start_date for {1} dimension outside the blacklist '
                    'retention window.'.format(total_rows_to_prune,
                                               condition_name[0].label))

                if total_rows_to_prune > 0:
                    cursor.execute(
                        sql.SQL("""UPDATE classification_state
                                                 SET end_date = '{0}'
                                               WHERE start_date < '{1}'
                                                 AND cond_name = '{2}'
                                                 AND end_date IS NULL
                                                 AND block_date IS NOT NULL""".
                                format(current_date.isoformat(),
                                       last_retention_date,
                                       condition_name[0].label)))

                logger.info(
                    'Pruned {0:d} rows from blacklist for {1} dimension'.
                    format(total_rows_to_prune, condition_name[0].label))

            # prune without any condition
            else:
                cursor.execute(
                    sql.SQL("""SELECT COUNT(*)
                                            FROM classification_state
                                           WHERE start_date < %s
                                             AND end_date IS NULL
                                             AND block_date IS NOT NULL"""),
                    [last_retention_date])
                total_rows_to_prune = cursor.fetchone()[0]

                logger.info(
                    'Found {0:d} rows of classification_state table '
                    'with start_date outside the blacklist retention window.'.
                    format(total_rows_to_prune))

                if total_rows_to_prune > 0:
                    cursor.execute(
                        sql.SQL("""UPDATE classification_state
                                                 SET end_date = '{0}'
                                               WHERE start_date < '{1}'
                                                 AND end_date IS NULL
                                                 AND block_date IS NOT NULL""".
                                format(current_date.isoformat(),
                                       last_retention_date)))
                logger.info('Pruned {0:d} rows from blacklist'.format(
                    total_rows_to_prune))

            logger.debug(
                'Calculating remaining number of rows with block_date (end_date is null) '
                'in classification_state table...')
            cursor.execute("""SELECT COUNT(*)
                                FROM classification_state
                               WHERE block_date IS NOT NULL
                                 AND end_date IS NULL""")
            rows_after = cursor.fetchone()[0]

            logger.debug(
                'Calculated remaining number of rows (having block_date and end_date null) '
                'in classification_state table')
            statsd.gauge('{0}rows_after'.format(metrics_run_root), rows_after)
            metadata.add_optional_job_metadata(metadata_conn,
                                               command,
                                               run_id,
                                               rows_after=rows_after)
Beispiel #16
0
def classification_state(ctx, config, statsd, logger, run_id, conn,
                         metadata_conn, command, metrics_root,
                         metrics_run_root):
    """Prune obsolete classification_state data."""
    curr_date = ctx.obj['CURR_DATE']

    # Store metadata
    metadata.add_optional_job_metadata(
        metadata_conn,
        command,
        run_id,
        curr_date=curr_date.isoformat() if curr_date is not None else None,
        retention_months=config.retention_config.months_retention)

    logger.info(
        'Pruning classification_state table to remove any classification state data related to '
        'obsolete conditions and data with end_date outside the retention window..'
    )

    cond_config_list = [c.label for c in config.conditions]
    retention_months = config.retention_config.months_retention

    if curr_date is None:
        curr_date = datetime.date.today()

    first_month_to_drop = datetime.date(
        curr_date.year, curr_date.month,
        1) - relativedelta.relativedelta(months=retention_months)
    logger.info(
        'Classification state data with end_date earlier than {0} will be '
        'pruned'.format(first_month_to_drop))

    with utils.db_role_setter(
            conn, role_name='dirbs_core_power_user'), conn.cursor() as cursor:
        logger.debug(
            'Calculating original number of rows in classification_state table...'
        )
        cursor.execute('SELECT COUNT(*) FROM classification_state')
        rows_before = cursor.fetchone()[0]
        logger.debug(
            'Calculated original number of rows in classification_state table')
        statsd.gauge('{0}rows_before'.format(metrics_run_root), rows_before)
        metadata.add_optional_job_metadata(metadata_conn,
                                           command,
                                           run_id,
                                           rows_before=rows_before)

        # Calculate number of rows in the classification table outside retention window
        cursor.execute(
            sql.SQL("""SELECT COUNT(*)
                                    FROM classification_state
                                   WHERE end_date < %s """),
            [first_month_to_drop])
        total_rows_out_window_to_prune = cursor.fetchone()[0]
        logger.info(
            'Found {0:d} rows of classification_state table '
            'with end_date outside the retention window to prune.'.format(
                total_rows_out_window_to_prune))

        # Calculate number of rows in the classification with conditions no longer existing
        cursor.execute(
            sql.SQL("""SELECT COUNT(*)
                                    FROM classification_state
                                   WHERE NOT starts_with_prefix(cond_name, %s)"""
                    ), [cond_config_list])
        total_rows_no_cond_to_prune = cursor.fetchone()[0]
        logger.info(
            'Found {0:d} rows of classification_state table with conditions '
            'no longer existing to prune.'.format(total_rows_no_cond_to_prune))

        logger.debug('Re-creating classification_state table...')
        # Basically, we just re-partition the classification_state table to re-create it, passing a src_filter_sql
        # parameter
        num_phys_imei_shards = partition_utils.num_physical_imei_shards(conn)
        src_filter_sql = cursor.mogrify(
            """WHERE (end_date > %s
                                              OR end_date IS NULL)
                                             AND cond_name LIKE ANY(%s)""",
            [first_month_to_drop, cond_config_list])
        partition_utils.repartition_classification_state(
            conn,
            num_physical_shards=num_phys_imei_shards,
            src_filter_sql=str(src_filter_sql, encoding=conn.encoding))
        logger.debug('Re-created classification_state table')

        logger.debug(
            'Calculating new number of rows in classification_state table...')
        cursor.execute('SELECT COUNT(*) FROM classification_state')
        rows_after = cursor.fetchone()[0]
        logger.debug(
            'Calculated new number of rows in classification_state table')
        statsd.gauge('{0}rows_after'.format(metrics_run_root), rows_after)
        metadata.add_optional_job_metadata(metadata_conn,
                                           command,
                                           run_id,
                                           rows_after=rows_after)

        logger.info('Pruned {0:d} rows from classification_state table'.format(
            rows_after - rows_before))
Beispiel #17
0
    def upgrade(self, db_conn):
        """Overrides AbstractMigrator upgrade method."""
        logger = logging.getLogger('dirbs.db')
        with db_conn.cursor() as cursor:
            # Set search_path to include hll
            cursor.execute('SET search_path = core, hll;')
            cursor.execute("""
                DO $$
                DECLARE
                    database_name TEXT;
                BEGIN
                    SELECT current_database() INTO database_name;
                    -- Set the search path of this database to "core"
                    EXECUTE 'ALTER DATABASE ' || quote_ident(database_name) || ' SET search_path TO core, hll';
                END $$;""")

            logger.info('Creating daily_per_mno_hll_sketches table...')
            with utils.db_role_setter(db_conn,
                                      role_name='dirbs_core_import_operator'):
                cursor.execute("""CREATE TABLE daily_per_mno_hll_sketches (
                                      PRIMARY KEY (data_date, operator_id),
                                      data_date         DATE  NOT NULL,
                                      operator_id       TEXT  NOT NULL,
                                      creation_date     DATE  NOT NULL,
                                      triplet_hll       HLL   NOT NULL,
                                      imei_hll          HLL   NOT NULL,
                                      imsi_hll          HLL   NOT NULL,
                                      msisdn_hll        HLL   NOT NULL,
                                      imei_imsis_hll    HLL   NOT NULL,
                                      imei_msisdns_hll  HLL   NOT NULL,
                                      imsi_msisdns_hll  HLL   NOT NULL
                                  )
                                """)
                cursor.execute(
                    'GRANT SELECT ON daily_per_mno_hll_sketches TO dirbs_core_report'
                )
                logger.info('Created daily_per_mno_hll_sketches table')

                logger.info(
                    'Populating daily_per_mno_hll_sketches from seen_triplets...'
                )
                child_table_names_list = utils.child_table_names(
                    db_conn, 'seen_triplets')

                # Make sure that seen_triplets partitions are owned by dirbs_core_import_operator (they are supposed
                # to be). Previously migration scripts failed to set ownership correctly when tables were re-written
                # and they were incorrectly owned by dirbs_core_power_user.
                with utils.db_role_setter(db_conn,
                                          role_name='dirbs_core_power_user'):
                    for p in child_table_names_list:
                        cursor.execute(
                            sql.SQL(
                                'ALTER TABLE {0} OWNER TO dirbs_core_import_operator'
                            ).format(sql.Identifier(p)))

                for partition_name in child_table_names_list:
                    logger.info(
                        'Populating daily_per_mno_hll_sketches from partition {0}...'
                        .format(partition_name))
                    cursor.execute(
                        sql.SQL(
                            'SELECT triplet_year, triplet_month FROM {0} LIMIT 1'
                        ).format(sql.Identifier(partition_name)))
                    res = cursor.fetchone()
                    if res is None:
                        # Table is empty
                        continue

                    year = res.triplet_year
                    month = res.triplet_month
                    days_in_month = monthrange(year, month)[1]
                    triplet_sql_list = []
                    imei_sql_list = []
                    imsi_sql_list = []
                    msisdn_sql_list = []
                    imei_imsis_sql_list = []
                    imei_msisdns_sql_list = []
                    imsi_msisdns_sql_list = []
                    final_select_sql_list = []

                    hll_partition_name = 'hll_{0}'.format(partition_name)
                    cursor.execute(
                        sql.SQL(
                            """CREATE TABLE {0} (PRIMARY KEY (data_date, operator_id),
                                                      LIKE daily_per_mno_hll_sketches)
                                                      INHERITS (daily_per_mno_hll_sketches);
                                                      ALTER TABLE {0} OWNER TO dirbs_core_import_operator
                                           """).format(
                                sql.Identifier(hll_partition_name)))

                    aggregated_data_temp_table = 'temp_{0}'.format(
                        hll_partition_name)
                    base_query = sql.SQL(
                        """CREATE TEMP TABLE {aggregated_data_temp_table_id} AS
                                                SELECT {select_sql}
                                                  FROM {partition_tbl_id}""")

                    for day in range(1, days_in_month + 1):
                        day_literal = sql.Literal(day)
                        triplet_sql_list.append(
                            sql.SQL(
                                """hll_add_agg(hll_hash_text(triplet_hash::TEXT))
                                                           FILTER(WHERE (date_bitmask
                                                                         & (1 << ({day_literal} - 1))) <> 0
                                                              AND imei_norm IS NOT NULL
                                                              AND imsi IS NOT NULL
                                                              AND msisdn IS NOT NULL) AS triplet_day{day_literal}"""
                            ).format(day_literal=day_literal))

                        imei_sql_list.append(
                            sql.SQL("""hll_add_agg(hll_hash_text(imei_norm))
                                                            FILTER(WHERE (date_bitmask
                                                                          & (1 << ({day_literal} - 1))) <> 0
                                                                     AND imei_norm IS NOT NULL)
                                                            AS imei_day{day_literal}"""
                                    ).format(day_literal=day_literal))

                        imsi_sql_list.append(
                            sql.SQL("""hll_add_agg(hll_hash_text(imsi))
                                                        FILTER(WHERE (date_bitmask & (1 << ({day_literal} - 1))) <> 0
                                                                 AND imsi IS NOT NULL) AS imsi_day{day_literal}"""
                                    ).format(day_literal=day_literal))

                        msisdn_sql_list.append(
                            sql.SQL("""hll_add_agg(hll_hash_text(msisdn))
                                                              FILTER(WHERE (date_bitmask
                                                                            & (1 << ({day_literal} - 1))) <> 0
                                                                       AND msisdn IS NOT NULL)
                                                              AS msisdn_day{day_literal}"""
                                    ).format(day_literal=day_literal))

                        imei_imsis_sql_list.append(
                            sql.SQL(
                                """hll_add_agg(hll_hash_text(imei_norm||'$'||imsi))
                                                                  FILTER(WHERE (date_bitmask
                                                                                & (1 << ({day_literal} - 1))) <> 0
                                                                           AND imei_norm IS NOT NULL
                                                                           AND imsi IS NOT NULL)
                                                                  AS imei_imsis_day{day_literal}"""
                            ).format(day_literal=day_literal))

                        imei_msisdns_sql_list.append(
                            sql.SQL(
                                """hll_add_agg(hll_hash_text(imei_norm||'$'||msisdn))
                                                                    FILTER(WHERE (date_bitmask
                                                                                  & (1 << ({day_literal} - 1))) <> 0
                                                                             AND  imei_norm IS NOT NULL
                                                                             AND  msisdn IS NOT NULL
                                                                           ) AS imei_msisdns_day{day_literal}"""
                            ).format(day_literal=day_literal))

                        imsi_msisdns_sql_list.append(
                            sql.SQL(
                                """hll_add_agg(hll_hash_text(imsi||'$'||msisdn))
                                                                    FILTER(WHERE (date_bitmask
                                                                                  & (1 << ({day_literal} - 1))) <> 0
                                                                             AND  imsi IS NOT NULL
                                                                             AND  msisdn IS NOT NULL)
                                                                             AS imsi_msisdns_day{day_literal}"""
                            ).format(day_literal=day_literal))

                    for sql_list in [
                            triplet_sql_list, imei_sql_list, imsi_sql_list,
                            msisdn_sql_list, imei_imsis_sql_list,
                            imei_msisdns_sql_list, imsi_msisdns_sql_list
                    ]:
                        final_select_sql_list.extend(sql_list)

                    final_query = base_query \
                        .format(aggregated_data_temp_table_id=sql.Identifier(aggregated_data_temp_table),
                                select_sql=sql.SQL(', ').join(final_select_sql_list),
                                partition_tbl_id=sql.Identifier(partition_name))

                    cursor.execute(final_query)

                    for day in range(1, days_in_month + 1):
                        str_split = partition_name.split('_')
                        op = str_split[2]
                        job_start_time = datetime.now()
                        day_literal = sql.Literal(day)

                        cursor.execute(
                            sql.SQL(
                                """INSERT INTO {0} (data_date, operator_id, creation_date, triplet_hll,
                                                                   imei_hll, imsi_hll, msisdn_hll, imei_imsis_hll,
                                                                   imei_msisdns_hll, imsi_msisdns_hll)
                                                       SELECT make_date(%s, %s, {day_literal}) AS data_date,
                                                              %s AS operator_id, %s AS creation_date,
                                                              CASE
                                                                  WHEN triplet_day{day_literal} IS NULL
                                                                  THEN hll_empty()
                                                                  ELSE triplet_day{day_literal}
                                                              END AS triplet_hll,
                                                              CASE
                                                                  WHEN imei_day{day_literal} IS NULL THEN hll_empty()
                                                                  ELSE imei_day{day_literal}
                                                              END AS imei_hll,
                                                              CASE
                                                                  WHEN imsi_day{day_literal} IS NULL THEN hll_empty()
                                                                  ELSE imsi_day{day_literal}
                                                              END AS imsi_hll,
                                                              CASE
                                                                  WHEN msisdn_day{day_literal} IS NULL THEN hll_empty()
                                                                  ELSE msisdn_day{day_literal}
                                                              END AS msisdn_hll,
                                                              CASE
                                                                  WHEN imei_imsis_day{day_literal} IS NULL
                                                                  THEN hll_empty()
                                                                  ELSE imei_imsis_day{day_literal}
                                                              END AS imei_imsis_hll,
                                                              CASE
                                                                  WHEN imei_msisdns_day{day_literal} IS NULL
                                                                  THEN hll_empty()
                                                                  ELSE imei_msisdns_day{day_literal}
                                                              END AS imei_msisdns_hll,
                                                              CASE
                                                                  WHEN imsi_msisdns_day{day_literal} IS NULL
                                                                  THEN hll_empty()
                                                                  ELSE imsi_msisdns_day{day_literal}
                                                              END AS imsi_msisdns_hll

                                                         FROM {1}""").format(
                                    sql.Identifier(hll_partition_name),
                                    sql.Identifier(aggregated_data_temp_table),
                                    day_literal=day_literal),
                            [year, month, op, job_start_time])

                    logger.info(
                        'Populated daily_per_mno_hll_sketches from partition {0}'
                        .format(partition_name))

            logger.info(
                'Populated daily_per_mno_hll_sketches from seen_triplets')
Beispiel #18
0
def upgrade(ctx):
    """
    Upgrades the current DB schema to the version supported by this code using migration scripts.

    :param ctx: current cli context obj
    """
    logger = logging.getLogger('dirbs.db')
    config = common.ensure_config(ctx)
    db_config = config.db_config
    needs_analyze = False
    with utils.create_db_connection(db_config) as conn:
        logger.info('Querying DB schema version for DB %s on host %s',
                    db_config.database, db_config.host)
        with conn.cursor() as cur:
            try:
                version = utils.query_db_schema_version(conn)
            except ProgrammingError:
                logger.warn(
                    'Could not determine current schema version. Assuming no version'
                )
                version = None

            if version is None:
                logger.error(
                    'DB currently not installed or version number could not be determined. Can\'t upgrade'
                )
                sys.exit(1)

            if version < min_schema_version:
                logger.error(
                    'Current DB schema is older than DIRBS 4.0.0. Can\'t upgrade'
                )
                sys.exit(1)

            if version > code_db_schema_version:
                logger.error('DB schema newer than code. Can\'t upgrade')
                sys.exit(1)

            if version != code_db_schema_version:
                logger.info('Upgrading DB schema from version %d to %d',
                            version, code_db_schema_version)

                # If we're upgrading, make sure we schedule a full ANALYZE outside the transaction later
                needs_analyze = True

                # Set our role here so that new objects get created with dirbs_core_power_user as owner by default
                with utils.db_role_setter(conn,
                                          role_name='dirbs_core_power_user'):
                    for old_version in range(version, code_db_schema_version):
                        new_version = old_version + 1
                        # Check if there is a special migration class, otherwise use standard SQL file
                        try:
                            module_name = 'dirbs.schema_migrators.v{0}_upgrade'.format(
                                new_version)
                            module = importlib.import_module(module_name)
                            logger.info('Running Python migration script: %s',
                                        module_name)
                            migrator = module.migrator()
                            migrator.upgrade(conn)
                        except ImportError as ex:
                            script_name = 'sql/migration_scripts/v{0:d}_upgrade.sql'.format(
                                new_version)
                            logger.info('Running SQL migration script: %s',
                                        script_name)
                            sql = pkgutil.get_data('dirbs', script_name)
                            cur.execute(sql)

                        # We commit after every version upgrade
                        utils.set_db_schema_version(conn, new_version)
                        conn.commit()

                logger.info(
                    'Successfully updated schema - DB schema version is now %d',
                    code_db_schema_version)
                # Can't do anything until we know the schema is the right version
                _store_job_metadata(config, 'upgrade')
            else:
                logger.info('DB schema is already latest version')

            # Schedule a full ANALYZE at the end of an upgrade
            if needs_analyze:
                logger.info(
                    'Running ANALYZE of entire database after upgrade...')
                cur.execute('ANALYZE')
                logger.info(
                    'Finished running ANALYZE of entire database after upgrade'
                )
Beispiel #19
0
    def _repartition_pairing_list(self, conn, *, num_physical_shards):
        """Repartition pairing list to implement change in structure."""
        with conn.cursor() as cursor, utils.db_role_setter(
                conn, role_name='dirbs_core_power_user'):
            cursor.execute("""CREATE TABLE historic_pairing_list_new (
                       LIKE historic_pairing_list INCLUDING DEFAULTS
                                                  INCLUDING IDENTITY
                                                  INCLUDING CONSTRAINTS
                                                  INCLUDING STORAGE
                                                  INCLUDING COMMENTS
                   )
                   PARTITION BY RANGE (virt_imei_shard)
                """)
            cursor.execute(
                """ALTER TABLE historic_pairing_list_new ADD COLUMN msisdn TEXT NOT NULL"""
            )

            # grant permissions
            part_utils._grant_perms_pairing_list(
                conn, part_name='historic_pairing_list_new')

            # create child partitions
            part_utils.create_imei_shard_partitions(
                conn,
                tbl_name='historic_pairing_list_new',
                num_physical_shards=num_physical_shards,
                perms_func=part_utils._grant_perms_pairing_list,
                fillfactor=80)

            # copy data from original partition
            cursor.execute("""INSERT INTO historic_pairing_list_new
                                   SELECT p.imei_norm, p.imsi, p.start_date, p.end_date, p.virt_imei_shard, m.msisdn
                                     FROM historic_pairing_list p
                               INNER JOIN monthly_network_triplets_country m ON p.imsi = m.imsi"""
                           )

            # add indexes
            idx_metadata = [
                part_utils.IndexMetadatum(
                    idx_cols=['imei_norm', 'imsi', 'msisdn'],
                    is_unique=True,
                    partial_sql='WHERE end_date IS NULL')
            ]
            part_utils.add_indices(conn,
                                   tbl_name='historic_pairing_list_new',
                                   idx_metadata=idx_metadata)

            # drop old views, tables, indexes and constraints
            cursor.execute('DROP VIEW pairing_list')
            cursor.execute('DROP TABLE historic_pairing_list CASCADE')
            part_utils.rename_table_and_indices(
                conn,
                old_tbl_name='historic_pairing_list_new',
                new_tbl_name='historic_pairing_list',
                idx_metadata=idx_metadata)

            # create new view and grant permissions
            cursor.execute("""CREATE VIEW pairing_list AS
                                   SELECT imei_norm, imsi, msisdn, virt_imei_shard
                                     FROM historic_pairing_list
                                    WHERE end_date IS NULL WITH CHECK OPTION"""
                           )
            cursor.execute("""GRANT SELECT ON pairing_list
                              TO dirbs_core_listgen, dirbs_core_report, dirbs_core_api, dirbs_core_import_pairing_list
                           """)

            # drop and recreate staging data insert trigger
            cursor.execute("""
                            DROP FUNCTION pairing_list_staging_data_insert_trigger_fn() CASCADE;

                            CREATE FUNCTION pairing_list_staging_data_insert_trigger_fn() RETURNS trigger
                                LANGUAGE plpgsql
                                AS $$
                            BEGIN
                                -- Clean/normalize data before inserting
                                NEW.imei_norm = normalize_imei(NULLIF(TRIM(NEW.imei), ''));
                                NEW.imsi = NULLIF(TRIM(new.imsi), '');
                                NEW.msisdn = NULLIF(TRIM(new.msisdn), '');
                                RETURN NEW;
                            END
                            $$;
            """)
Beispiel #20
0
def repartition_monthly_network_triplets(conn, *, num_physical_shards):
    """Function to repartition the monthly_network_triplets_country and monthly_network_triplets_country tables."""
    with conn.cursor() as cursor, utils.db_role_setter(conn, role_name='dirbs_core_import_operator'):
        # Create parent partitions
        cursor.execute(
            """CREATE TABLE monthly_network_triplets_country_new (
                   LIKE monthly_network_triplets_country INCLUDING DEFAULTS
                                                         INCLUDING IDENTITY
                                                         INCLUDING CONSTRAINTS
                                                         INCLUDING STORAGE
                                                         INCLUDING COMMENTS
               ) PARTITION BY RANGE (triplet_year, triplet_month)
            """
        )
        _grant_perms_monthly_network_triplets(conn, part_name='monthly_network_triplets_country_new')

        cursor.execute(
            """CREATE TABLE monthly_network_triplets_per_mno_new (
                   LIKE monthly_network_triplets_per_mno INCLUDING DEFAULTS
                                                         INCLUDING IDENTITY
                                                         INCLUDING CONSTRAINTS
                                                         INCLUDING STORAGE
                                                         INCLUDING COMMENTS
               ) PARTITION BY LIST (operator_id)
            """
        )
        _grant_perms_monthly_network_triplets(conn, part_name='monthly_network_triplets_per_mno_new')

        # Work out what year-month tuples we have
        country_monthly_partitions = utils.child_table_names(conn, 'monthly_network_triplets_country')
        country_year_month_tuples = [(x.triplet_year, x.triplet_month)
                                     for x in utils.table_invariants_list(conn,
                                                                          country_monthly_partitions,
                                                                          ['triplet_year', 'triplet_month'])]

        operator_partitions = utils.child_table_names(conn, 'monthly_network_triplets_per_mno')
        operator_monthly_partitions = set()
        for op_partition in operator_partitions:
            operator_monthly_partitions.update(utils.child_table_names(conn, op_partition))
        mno_year_month_tuples = [(x.operator_id, x.triplet_year, x.triplet_month)
                                 for x in utils.table_invariants_list(conn,
                                                                      operator_monthly_partitions,
                                                                      ['operator_id',
                                                                       'triplet_year',
                                                                       'triplet_month'])]

        latest_year_month = None
        # Sort year month tuples and get the maximum year month combination.
        country_year_month_tuples = sorted(country_year_month_tuples, key=lambda x: (x[0], x[1]), reverse=True)
        if len(country_year_month_tuples) > 0:
            latest_year_month = country_year_month_tuples[0]

        # Create child partitions at country level
        for year, month in country_year_month_tuples:
            # Fillfactor is 45 for most recent month since it will likely still be updated. For older months we
            # pack tightly to ensure optimal usage of disk space and optimal scan performance
            latest_year, latest_month = latest_year_month
            fillfactor = 45 if year == latest_year and month == latest_month else 100
            create_monthly_network_triplets_country_partition(conn, month=month, year=year, suffix='_new',
                                                              num_physical_shards=num_physical_shards,
                                                              fillfactor=fillfactor)

        # Create child partitions at per-MNO level
        for op, year, month in mno_year_month_tuples:
            # Fillfactor is 45 for most recent month since it will likely still be updated. For older months we
            # pack tightly to ensure optimal usage of disk space and optimal scan performance
            latest_year, latest_month = latest_year_month
            fillfactor = 45 if year == latest_year and month == latest_month else 100
            create_monthly_network_triplets_per_mno_partition(conn, operator_id=op, month=month, year=year,
                                                              suffix='_new', num_physical_shards=num_physical_shards,
                                                              fillfactor=fillfactor)

        # Populate country-level table from old table
        cursor.execute("""INSERT INTO monthly_network_triplets_country_new
                               SELECT *
                                 FROM monthly_network_triplets_country""")

        # Populate per-MNO-level table from old table
        cursor.execute("""INSERT INTO monthly_network_triplets_per_mno_new
                               SELECT *
                                 FROM monthly_network_triplets_per_mno""")

        # Add in indexes
        add_indices(conn, tbl_name='monthly_network_triplets_country_new',
                    idx_metadata=monthly_network_triplets_country_indices())
        add_indices(conn, tbl_name='monthly_network_triplets_per_mno_new',
                    idx_metadata=monthly_network_triplets_per_mno_indices())

        # Drop old tables
        cursor.execute('DROP TABLE monthly_network_triplets_country CASCADE')
        cursor.execute('DROP TABLE monthly_network_triplets_per_mno CASCADE')

        # Renames tables
        rename_table_and_indices(conn,
                                 old_tbl_name='monthly_network_triplets_country_new',
                                 new_tbl_name='monthly_network_triplets_country',
                                 idx_metadata=monthly_network_triplets_country_indices())
        rename_table_and_indices(conn,
                                 old_tbl_name='monthly_network_triplets_per_mno_new',
                                 new_tbl_name='monthly_network_triplets_per_mno',
                                 idx_metadata=monthly_network_triplets_per_mno_indices())

        cursor.execute("""CREATE OR REPLACE VIEW operator_data AS
                          SELECT sq.connection_date,
                                 sq.imei_norm,
                                 sq.imsi,
                                 sq.msisdn,
                                 sq.operator_id
                            FROM (SELECT make_date(nt.triplet_year::integer,
                                         nt.triplet_month::integer,
                                         dom.dom) AS connection_date,
                                         nt.imei_norm,
                                         nt.imsi,
                                         nt.msisdn,
                                         nt.operator_id
                                    FROM generate_series(1, 31) dom(dom),
                                         monthly_network_triplets_per_mno nt
                                   WHERE (nt.date_bitmask & (1 << (dom.dom - 1))) <> 0) sq""")
        cursor.execute("""CREATE VIEW monthly_network_triplets_country_no_null_imeis AS
                          SELECT *
                            FROM monthly_network_triplets_country
                           WHERE imei_norm IS NOT NULL""")
        cursor.execute("""CREATE VIEW monthly_network_triplets_per_mno_no_null_imeis AS
                          SELECT *
                            FROM monthly_network_triplets_per_mno
                           WHERE imei_norm IS NOT NULL""")

        cursor.execute(sql.SQL('GRANT SELECT ON operator_data TO dirbs_core_base'))
        for role in ['dirbs_core_listgen', 'dirbs_core_classify', 'dirbs_core_report', 'dirbs_core_api']:
            cursor.execute(sql.SQL("""GRANT SELECT ON monthly_network_triplets_country_no_null_imeis
                                      TO {0}""").format(sql.Identifier(role)))
            cursor.execute(sql.SQL("""GRANT SELECT ON monthly_network_triplets_per_mno_no_null_imeis
                                      TO {0}""").format(sql.Identifier(role)))

        cursor.execute("""CREATE VIEW monthly_network_triplets_with_invalid_data_flags AS
                               SELECT nt.*,
                                      nt.imei_norm IS NULL AS is_null_imei,
                                      is_unclean_imei(nt.imei_norm) AS is_unclean_imei,
                                      nt.imsi IS NULL AS is_null_imsi,
                                      is_unclean_imsi(nt.imsi) AS is_unclean_imsi,
                                      nt.msisdn IS NULL AS is_null_msisdn
                                 FROM monthly_network_triplets_per_mno nt""")