    def _matching_imeis_sql(self,
                            conn,
                            app_config,
                            virt_imei_range_start,
                            virt_imei_range_end,
                            curr_date=None):
        """Overrides Dimension._matching_imeis_sql."""
        analysis_end_date = compute_analysis_end_date(conn, curr_date)
        analysis_start_date = analysis_end_date - datetime.timedelta(
            days=self._lookback_days)
        self._log_analysis_window(analysis_start_date, analysis_end_date)
        operators = app_config.region_config.operators
        mcc_mnc_pairs = [op.mcc_mnc_pairs for op in operators]
        flat_mcc_mnc_pairs = [val for sublist in mcc_mnc_pairs for val in sublist]
        # Build a de-duplicated list of MCC prefix patterns (eg '310%') for the IMSI prefix check below
        mcc_list_from_set = list({'{0}%'.format(x['mcc']) for x in flat_mcc_mnc_pairs})

        return sql.SQL("""SELECT imei_norm
                 FROM monthly_network_triplets_country
                WHERE imei_norm IS NOT NULL
                  AND last_seen >= {analysis_start_date}
                  AND first_seen < {analysis_end_date}
                  AND virt_imei_shard >= {virt_imei_range_start}
                  AND virt_imei_shard < {virt_imei_range_end}
                  AND NOT starts_with_prefix(imsi, {mcc_list})
            """).format(
            analysis_start_date=sql.Literal(analysis_start_date),
            analysis_end_date=sql.Literal(analysis_end_date),
            virt_imei_range_start=sql.Literal(virt_imei_range_start),
            virt_imei_range_end=sql.Literal(virt_imei_range_end),
            mcc_list=sql.Literal(mcc_list_from_set)).as_string(conn)
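The MCC prefix construction above is easiest to see with concrete data. Below is a minimal, self-contained sketch (the operators and MCC/MNC values are invented for illustration) of how per-operator MCC/MNC pairs are flattened and de-duplicated into the prefix patterns handed to starts_with_prefix:

# Hypothetical region config: each operator carries a list of MCC/MNC pairs
operators_mcc_mnc = [
    [{'mcc': '310', 'mnc': '150'}, {'mcc': '310', 'mnc': '170'}],  # operator_1
    [{'mcc': '311', 'mnc': '480'}],                                # operator_2
]
flat_pairs = [pair for op_pairs in operators_mcc_mnc for pair in op_pairs]
mcc_prefixes = list({'{0}%'.format(p['mcc']) for p in flat_pairs})
print(sorted(mcc_prefixes))  # ['310%', '311%'] -- duplicate MCCs collapse via the set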
Example No. 2
    def _matching_imeis_sql(self, conn, app_config, virt_imei_range_start, virt_imei_range_end, curr_date=None):
        """
        Overrides Dimension._matching_imeis_sql.

        :param conn: database connection
        :param app_config: dirbs config obj
        :param virt_imei_range_start: virtual imei shard range start
        :param virt_imei_range_end: virtual imei shard range end
        :param curr_date: user defined current date for analysis
        :return: SQL
        """
        analysis_end_date = compute_analysis_end_date(conn, curr_date)
        rbi_list = list(self.final_rbi_delays.keys())
        delay_list = [self.final_rbi_delays[rbi] for rbi in rbi_list]

        # HACK: This is used by _write_country_gsma_not_found_report in cli/report.py, which instantiates
        # a dimension without any knowledge of parallel queries. Therefore, we pass in 1 and 100
        # to cover the entire range of IMEIs and expect it to read from the network_imeis table.
        if virt_imei_range_start == 1 and virt_imei_range_end == 100:
            network_imeis_shard = 'network_imeis'
        else:
            network_imeis_shard = partition_utils.imei_shard_name(base_name='network_imeis',
                                                                  virt_imei_range_start=virt_imei_range_start,
                                                                  virt_imei_range_end=virt_imei_range_end)

        # The first CTE 'not_in_gsma' calculates the first date the IMEI was observed on the network for
        # all IMEIs that have a TAC that is not present in the GSMA database. The min_first_seen date
        # is computed using the minimum of first_seen date among all operators the IMEI was observed on.
        # The second CTE 'rbi_delays' is the list of RBIs and corresponding delays that were configured.
        # The results of the two CTEs are joined using RBI as the key.
        # Finally, those IMEIs whose min_first_seen + RBI delay is less than the analysis end date are
        # classified as gsma_not_found and the rest are excluded.
        # Note 1: The less than check implies that even after adding delay, these IMEIs would still not
        # have been allocated by the classification date and hence are not valid IMEIs.
        # Note 2: The delay is added on a per-IMEI basis rather than per-TAC due to potential for someone
        # squatting on an unallocated TAC in the past.
        return sql.SQL(
            """SELECT imei_norm
                 FROM (WITH not_in_gsma AS (SELECT imei_norm,
                                                   first_seen AS min_first_seen,
                                                   LEFT(imei_norm, 2) AS rbi
                                              FROM {network_imeis_shard}
                                             WHERE NOT EXISTS (SELECT 1
                                                                 FROM gsma_data
                                                                WHERE tac = LEFT(imei_norm, 8))),
                             rbi_delays AS (SELECT rbi,
                                                   delay
                                              FROM UNNEST({rbi_list}::TEXT[], {delay_list}::INT[]) AS tbl(rbi, delay))
                     SELECT imei_norm
                       FROM not_in_gsma
                  LEFT JOIN rbi_delays
                      USING (rbi)
                      WHERE min_first_seen + COALESCE(delay, 0) < {analysis_end_date}) invalid_imeis
            """).format(network_imeis_shard=sql.Identifier(network_imeis_shard),  # noqa: Q447, Q449
                        rbi_list=sql.Literal(rbi_list),
                        delay_list=sql.Literal(delay_list),
                        analysis_end_date=sql.Literal(analysis_end_date)).as_string(conn)
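The final WHERE clause's per-IMEI delay check can be mirrored in plain Python. This is only an illustrative sketch with invented dates and delays: an IMEI whose earliest sighting plus its RBI grace delay still falls before the analysis end date is treated as gsma_not_found, while one inside the grace window is excluded.

import datetime

analysis_end_date = datetime.date(2024, 6, 1)
rbi_delays = {'35': 30, '86': 10}  # hypothetical per-RBI delays, in days

candidates = [
    {'imei_norm': '35111111111111', 'min_first_seen': datetime.date(2024, 3, 1)},
    {'imei_norm': '86222222222222', 'min_first_seen': datetime.date(2024, 5, 28)},
]
for imei in candidates:
    delay = rbi_delays.get(imei['imei_norm'][:2], 0)  # mirrors COALESCE(delay, 0)
    is_gsma_not_found = imei['min_first_seen'] + datetime.timedelta(days=delay) < analysis_end_date
    print(imei['imei_norm'], is_gsma_not_found)
# 35111111111111 True  -- still not in GSMA after the 30-day grace period
# 86222222222222 False -- inside the grace window, so excluded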
Example No. 3
    def _calc_analysis_window(self, conn, curr_date=None):
        """Method used to calculate the analysis window (as a tuple) given a curr date."""
        analysis_end_date = compute_analysis_end_date(conn, curr_date)
        if self._period_months is not None:
            analysis_start_date = analysis_end_date - relativedelta.relativedelta(
                months=self._period_months)
        else:
            analysis_start_date = analysis_end_date - relativedelta.relativedelta(
                days=self._period_days)

        self._log_analysis_window(analysis_start_date, analysis_end_date)
        return analysis_start_date, analysis_end_date
Example No. 4
    def _calc_analysis_window(self, conn, curr_date=None):
        """
        Method used to calculate the analysis window (as a tuple) given a curr date.

        Arguments:
            conn: DIRBS Postgresql connection
            curr_date: current date of the analysis
        """
        analysis_end_date = compute_analysis_end_date(conn, curr_date)
        analysis_start_date = analysis_end_date - relativedelta.relativedelta(
            days=self._period)
        self._log_analysis_window(analysis_start_date, analysis_end_date)
        return analysis_start_date, analysis_end_date
Example No. 5
    def _calc_analysis_window(self, conn, curr_date=None):
        """
        Method used to calculate the analysis window (as a tuple) given a curr date.

        :param conn: database connection
        :param curr_date: user defined current date (default None)
        :return: dates range for analysis
        """
        analysis_end_date = compute_analysis_end_date(conn, curr_date)
        if self._period_months is not None:
            analysis_start_date = analysis_end_date - relativedelta.relativedelta(
                months=self._period_months)
        else:
            analysis_start_date = analysis_end_date - relativedelta.relativedelta(
                days=self._period_days)

        self._log_analysis_window(analysis_start_date, analysis_end_date)
        return analysis_start_date, analysis_end_date
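For reference, the two branches of the window calculation reduce to simple date arithmetic with relativedelta. A minimal sketch, assuming dateutil is installed and using example period values of 3 months and 90 days:

import datetime
from dateutil import relativedelta

analysis_end_date = datetime.date(2024, 6, 1)

# months-based window (self._period_months = 3)
start_from_months = analysis_end_date - relativedelta.relativedelta(months=3)
# days-based window (self._period_days = 90)
start_from_days = analysis_end_date - relativedelta.relativedelta(days=90)

print(start_from_months, start_from_days)  # 2024-03-01 2024-03-03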
Example No. 6
def write_transient_msisdns(logger: callable,
                            period: int,
                            report_dir: str,
                            conn: callable,
                            config: callable,
                            num_of_imeis: int,
                            current_date: str = None):
    """Helper method to write transient msisdns report.

    Arguments:
        logger: DIRBS logger object
        period: analysis period in days
        report_dir: output directory to write files into
        conn: DIRBS postgresql connection object
        config: DIRBS config object
        num_of_imeis: threshold number of IMEIs an MSISDN must be seen with to be flagged
        current_date: setting custom current date for analysis
    Returns:
        Report metadata
    """
    logger.info('Generating per-operator possible transient MSISDNs list...')
    with contextlib.ExitStack() as stack:
        # push files to the exit stack so that they will be closed properly at the end
        operator_ids = [o.id for o in config.region_config.operators]
        filename_op_map = {
            'transient_msisdns_{0}.csv'.format(o): o
            for o in operator_ids
        }
        opname_file_map = {
            o: stack.enter_context(
                open(os.path.join(report_dir, fn), 'w', encoding='utf-8'))
            for fn, o in filename_op_map.items()
        }

        # create a map from operator name to csv writer
        opname_csvwriter_map = {
            o: csv.writer(opname_file_map[o])
            for o in operator_ids
        }

        # write header to each file
        for _, writer in opname_csvwriter_map.items():
            writer.writerow(['msisdn'])

        # the operation begins here
        # compute time periods for analysis
        current_date = datetime.date.today() if current_date is None else current_date
        analysis_end_date = utils.compute_analysis_end_date(conn, current_date)
        analysis_start_date = analysis_end_date - relativedelta.relativedelta(
            days=period)
        logger.debug('Analysis start date: {0}, analysis end date: {1}'.format(
            analysis_start_date, analysis_end_date))
        with conn.cursor() as cursor:
            query_bit_counts_in_period = sql.SQL(
                """SELECT msisdn, imeis_count, operator_id
                                                      FROM (SELECT msisdn, operator_id, SUM(bit) AS imeis_count
                                                              FROM (SELECT msisdn, operator_id,
                                                                           get_bitmask_within_window(
                                                                                date_bitmask,
                                                                                first_seen,
                                                                                last_seen,
                                                                                {analysis_start_date},
                                                                                {analysis_start_dom},
                                                                                {analysis_end_date},
                                                                                {analysis_end_dom}
                                                                                ) AS date_bitmask
                                                                      FROM monthly_network_triplets_per_mno
                                                                     WHERE last_seen >= {analysis_start_date}
                                                                       AND first_seen < {analysis_end_date}
                                                                       AND is_valid_msisdn(msisdn)) mn
                                                        CROSS JOIN generate_series(0, 30) AS i
                                                        CROSS JOIN LATERAL get_bit(mn.date_bitmask::bit(31), i) AS bit
                                                          GROUP BY msisdn, operator_id) AS msisdns_to_imeis
                                                     WHERE imeis_count/{period} >= {num_of_imeis}"""
            ).format(analysis_start_date=sql.Literal(analysis_start_date),
                     analysis_start_dom=sql.Literal(analysis_start_date.day),
                     analysis_end_date=sql.Literal(analysis_end_date),
                     analysis_end_dom=sql.Literal(analysis_end_date.day),
                     period=sql.Literal(period),
                     num_of_imeis=sql.Literal(num_of_imeis))
            cursor.execute(query_bit_counts_in_period.as_string(conn))
            msisdn_to_imei_count_map = [{
                'msisdn': res.msisdn,
                'imei_count': res.imeis_count,
                'operator_id': res.operator_id
            } for res in cursor]

            possible_transients = []  # MSISDN entries flagged as possible transients by the tests below
            for val in msisdn_to_imei_count_map:
                imei_extraction_query = sql.SQL(
                    """SELECT DISTINCT imei_norm
                                                     FROM monthly_network_triplets_country_no_null_imeis
                                                    WHERE msisdn = {msisdn}
                                                      AND last_seen >= {analysis_start_date}
                                                      AND first_seen < {analysis_end_date}
                                                 ORDER BY imei_norm ASC"""
                ).format(msisdn=sql.Literal(val.get('msisdn')),
                         analysis_start_date=sql.Literal(analysis_start_date),
                         analysis_end_date=sql.Literal(analysis_end_date))
                cursor.execute(imei_extraction_query)
                imei_list = [
                    res.imei_norm for res in cursor
                    if res.imei_norm.isnumeric()
                ]
                tac_list = [int(imei[:8]) for imei in imei_list]
                imei_list = list(map(int, imei_list))

                analysis_tests = {
                    'identical_tac': False,
                    'consecutive_tac': False,
                    'arithmetic_tac': False,
                    'consecutive_imei': False,
                    'arithmetic_imei': False
                }

                logger.info('Performing TAC analysis on the data...')
                if len(set(tac_list)) == 1:
                    analysis_tests['identical_tac'] = True
                else:
                    if _have_consecutive_numbers(tac_list):
                        analysis_tests['consecutive_tac'] = True

                    if _is_arithmetic_series(tac_list):
                        analysis_tests['arithmetic_tac'] = True

                logger.info('Performing IMEI analysis on the data...')
                if _have_consecutive_numbers(imei_list):
                    analysis_tests['consecutive_imei'] = True

                if _is_arithmetic_series(imei_list):
                    analysis_tests['arithmetic_imei'] = True

                if any(analysis_tests.values()):
                    possible_transients.append(val)

            for item in possible_transients:
                opname_csvwriter_map[item.get('operator_id')].writerow(
                    [item.get('msisdn')])

        logger.info(
            'Per-MNO possible transient MSISDN lists generated successfully.')
    return _gen_metadata_for_reports(list(filename_op_map.keys()), report_dir)
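The per-MSISDN tests above call _have_consecutive_numbers and _is_arithmetic_series, whose bodies are not shown in this listing. The sketch below is an assumption about what such checks could look like, not the actual DIRBS implementation:

def _have_consecutive_numbers(values):
    """Assumed check: adjacent values in the sorted list differ by exactly 1."""
    ordered = sorted(values)
    return len(ordered) > 1 and all(b - a == 1 for a, b in zip(ordered, ordered[1:]))


def _is_arithmetic_series(values):
    """Assumed check: the sorted list has a constant, non-zero common difference."""
    ordered = sorted(values)
    if len(ordered) < 2:
        return False
    diff = ordered[1] - ordered[0]
    return diff != 0 and all(b - a == diff for a, b in zip(ordered, ordered[1:]))


# Example: TACs 35111111, 35111112, 35111113 are both consecutive and arithmetic,
# which would flag the owning MSISDN as a possible transient.
print(_have_consecutive_numbers([35111113, 35111111, 35111112]))  # True
print(_is_arithmetic_series([35111111, 35111113, 35111115]))      # True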