def _matching_imeis_sql(self, conn, app_config, virt_imei_range_start, virt_imei_range_end, curr_date=None):
    """Overrides Dimension._matching_imeis_sql."""
    analysis_end_date = compute_analysis_end_date(conn, curr_date)
    analysis_start_date = analysis_end_date - datetime.timedelta(days=self._lookback_days)
    self._log_analysis_window(analysis_start_date, analysis_end_date)
    # Collect the distinct MCC LIKE-prefixes ('<mcc>%') configured across all operators.
    # A set de-duplicates MCCs shared by multiple operators before we hand the
    # list to starts_with_prefix() in the query below.
    mcc_prefixes = set()
    for operator in app_config.region_config.operators:
        for pair in operator.mcc_mnc_pairs:
            mcc_prefixes.add('{0}%'.format(pair['mcc']))
    mcc_list_from_set = list(mcc_prefixes)
    # Match IMEIs active in the analysis window whose IMSI does not begin with
    # any of the configured (local) MCCs.
    return sql.SQL("""SELECT imei_norm
                        FROM monthly_network_triplets_country
                       WHERE imei_norm IS NOT NULL
                         AND last_seen >= {analysis_start_date}
                         AND first_seen < {analysis_end_date}
                         AND virt_imei_shard >= {virt_imei_range_start}
                         AND virt_imei_shard < {virt_imei_range_end}
                         AND NOT starts_with_prefix(imsi, {mcc_list})
                   """).format(analysis_start_date=sql.Literal(analysis_start_date),
                               analysis_end_date=sql.Literal(analysis_end_date),
                               virt_imei_range_start=sql.Literal(virt_imei_range_start),
                               virt_imei_range_end=sql.Literal(virt_imei_range_end),
                               mcc_list=sql.Literal(mcc_list_from_set)).as_string(conn)
def _matching_imeis_sql(self, conn, app_config, virt_imei_range_start, virt_imei_range_end, curr_date=None):
    """
    Overrides Dimension._matching_imeis_sql.

    :param conn: database connection
    :param app_config: dirbs config obj
    :param virt_imei_range_start: virtual imei shard range start
    :param virt_imei_range_end: virtual imei shard range end
    :param curr_date: user defined current date for analysis
    :return: SQL
    """
    analysis_end_date = compute_analysis_end_date(conn, curr_date)
    # Parallel lists of RBIs and their configured delays. Iterating the keys and
    # values of the same dict yields matching positions, so no per-key lookup is
    # needed (previously built with a redundant comprehension over .keys()).
    rbi_list = list(self.final_rbi_delays)
    delay_list = list(self.final_rbi_delays.values())
    # HACK: This is used by _write_country_gsma_not_found_report in cli/report.py which instantiates
    # a dimension without understanding about parallel queries. Therefore, we passed in 1 and 100
    # to cover the entire range of IMEIs and expect it to read from the network_imeis table.
    if virt_imei_range_start == 1 and virt_imei_range_end == 100:
        network_imeis_shard = 'network_imeis'
    else:
        network_imeis_shard = partition_utils.imei_shard_name(base_name='network_imeis',
                                                              virt_imei_range_start=virt_imei_range_start,
                                                              virt_imei_range_end=virt_imei_range_end)
    # The first CTE 'not_in_gsma' calculates the first date the IMEI was observed on the network for
    # all IMEIs that have a TAC that is not present in the GSMA database. The min_first_seen date
    # is computed using the minimum of first_seen date among all operators the IMEI was observed on.
    # The second CTE 'rbi_delays' is the list of RBIs and corresponding delays that were configured.
    # The results of the two CTEs are joined using RBI as the key.
    # Finally, those IMEIs whose min_seen_date + RBI delay is less than curr_date are classified
    # as gsma_not_found and rest are excluded.
    # Note 1: The less than check implies that even after adding delay, these IMEIs would still not
    # have been allocated by the classification date and hence are not valid IMEIs.
    # Note 2: The delay is added on a per-IMEI basis rather than per-TAC due to potential for someone
    # squatting on an unallocated TAC in the past.
    return sql.SQL(
        """SELECT imei_norm
             FROM (WITH not_in_gsma AS (SELECT imei_norm,
                                               first_seen AS min_first_seen,
                                               LEFT(imei_norm, 2) AS rbi
                                          FROM {network_imeis_shard}
                                         WHERE NOT EXISTS (SELECT 1
                                                             FROM gsma_data
                                                            WHERE tac = LEFT(imei_norm, 8))),
                        rbi_delays AS (SELECT rbi, delay
                                         FROM UNNEST({rbi_list}::TEXT[],
                                                     {delay_list}::INT[]) AS tbl(rbi, delay))
                   SELECT imei_norm
                     FROM not_in_gsma
                LEFT JOIN rbi_delays
                    USING (rbi)
                    WHERE min_first_seen + COALESCE(delay, 0) < {analysis_end_date}) invalid_imeis
        """).format(network_imeis_shard=sql.Identifier(network_imeis_shard),  # noqa: Q447, Q449
                    rbi_list=sql.Literal(rbi_list),
                    delay_list=sql.Literal(delay_list),
                    analysis_end_date=sql.Literal(analysis_end_date)).as_string(conn)
def _calc_analysis_window(self, conn, curr_date=None):
    """Method used to calculate the analysis window (as a tuple) given a curr date."""
    window_end = compute_analysis_end_date(conn, curr_date)
    # A month-based period takes precedence; otherwise fall back to days.
    if self._period_months is not None:
        lookback = relativedelta.relativedelta(months=self._period_months)
    else:
        lookback = relativedelta.relativedelta(days=self._period_days)
    window_start = window_end - lookback
    self._log_analysis_window(window_start, window_end)
    return window_start, window_end
def _calc_analysis_window(self, conn, curr_date=None):
    """
    Method used to calculate the analysis window (as a tuple) given a curr date.

    Arguments:
        conn: DIRBS Postgresql connection
        curr_date: current date of the analysis
    """
    window_end = compute_analysis_end_date(conn, curr_date)
    # Window spans self._period days back from the computed end date.
    window_start = window_end - relativedelta.relativedelta(days=self._period)
    self._log_analysis_window(window_start, window_end)
    return window_start, window_end
def _calc_analysis_window(self, conn, curr_date=None):
    """
    Method used to calculate the analysis window (as a tuple) given a curr date.

    :param conn: database connection
    :param curr_date: user defined current date (default None)
    :return: dates range for analysis
    """
    end_date = compute_analysis_end_date(conn, curr_date)
    # Prefer the month-based period when configured, else use the day-based one.
    delta = (relativedelta.relativedelta(months=self._period_months)
             if self._period_months is not None
             else relativedelta.relativedelta(days=self._period_days))
    start_date = end_date - delta
    self._log_analysis_window(start_date, end_date)
    return start_date, end_date
def write_transient_msisdns(logger: callable, period: int, report_dir: str, conn: callable, config: callable,
                            num_of_imeis: int, current_date: str = None):
    """Helper method to write transient msisdns report.

    Arguments:
        logger: DIRBS logger object
        period: analysis period in days
        report_dir: output directory to write files into
        conn: DIRBS postgresql connection object
        config: DIRBS config object
        num_of_imeis: Number of IMEIs to be seen with
        current_date: setting custom current date for analysis
    Returns:
        Report metadata
    """
    logger.info('Generating per-operator possible transient MSISDNs list...')
    with contextlib.ExitStack() as stack:
        # Push files to the exit stack so that they will be closed properly at the end.
        operator_ids = [o.id for o in config.region_config.operators]
        # One output CSV per operator: filename -> operator id.
        filename_op_map = {'transient_msisdns_{0}.csv'.format(o): o for o in operator_ids}
        opname_file_map = {o: stack.enter_context(open(os.path.join(report_dir, fn), 'w', encoding='utf-8'))
                           for fn, o in filename_op_map.items()}
        # Create a map from operator name to csv writer.
        opname_csvwriter_map = {o: csv.writer(opname_file_map[o]) for o in operator_ids}
        # Write header to each file.
        for _, writer in opname_csvwriter_map.items():
            writer.writerow(['msisdn'])
        # The operation begins here: compute time periods for analysis.
        current_date = datetime.date.today() if current_date is None else current_date
        analysis_end_date = utils.compute_analysis_end_date(conn, current_date)
        analysis_start_date = analysis_end_date - relativedelta.relativedelta(days=period)
        logger.debug('Analysis start date: {0}, analysis_end_date: {1}'.format(analysis_start_date,
                                                                               analysis_end_date))
        with conn.cursor() as cursor:
            # Count, per (msisdn, operator), how many distinct days within the window the
            # MSISDN was seen, by unpacking the monthly date_bitmask bit-by-bit (31 bits,
            # one per day of month) and summing. MSISDNs averaging >= num_of_imeis per
            # day over the period are candidates for further per-IMEI analysis.
            query_bit_counts_in_period = sql.SQL(
                """SELECT msisdn, imeis_count, operator_id
                     FROM (SELECT msisdn, operator_id, SUM(bit) AS imeis_count
                             FROM (SELECT msisdn, operator_id,
                                          get_bitmask_within_window(date_bitmask, first_seen, last_seen,
                                                                    {analysis_start_date}, {analysis_start_dom},
                                                                    {analysis_end_date},
                                                                    {analysis_end_dom}) AS date_bitmask
                                     FROM monthly_network_triplets_per_mno
                                    WHERE last_seen >= {analysis_start_date}
                                      AND first_seen < {analysis_end_date}
                                      AND is_valid_msisdn(msisdn)) mn
                            CROSS JOIN generate_series(0, 30) AS i
                            CROSS JOIN LATERAL get_bit(mn.date_bitmask::bit(31), i) AS bit
                            GROUP BY msisdn, operator_id) AS msisdns_to_imeis
                    WHERE imeis_count/{period} >= {num_of_imeis}"""
            ).format(analysis_start_date=sql.Literal(analysis_start_date),
                     analysis_start_dom=sql.Literal(analysis_start_date.day),
                     analysis_end_date=sql.Literal(analysis_end_date),
                     analysis_end_dom=sql.Literal(analysis_end_date.day),
                     period=sql.Literal(period),
                     num_of_imeis=sql.Literal(num_of_imeis))
            cursor.execute(query_bit_counts_in_period.as_string(conn))
            msisdn_to_imei_count_map = [{'msisdn': res.msisdn,
                                         'imei_count': res.imeis_count,
                                         'operator_id': res.operator_id} for res in cursor]
            # List of candidate rows identified as possible transients by the tests below.
            possible_transients = []
            for val in msisdn_to_imei_count_map:
                # Pull all distinct IMEIs this MSISDN was seen with during the window.
                imei_extraction_query = sql.SQL(
                    """SELECT DISTINCT imei_norm
                         FROM monthly_network_triplets_country_no_null_imeis
                        WHERE msisdn = {msisdn}
                          AND last_seen >= {analysis_start_date}
                          AND first_seen < {analysis_end_date}
                        ORDER BY imei_norm ASC"""
                ).format(msisdn=sql.Literal(val.get('msisdn')),
                         analysis_start_date=sql.Literal(analysis_start_date),
                         analysis_end_date=sql.Literal(analysis_end_date))
                cursor.execute(imei_extraction_query)
                # Non-numeric IMEIs are skipped since the tests below need integer values.
                imei_list = [res.imei_norm for res in cursor if res.imei_norm.isnumeric()]
                tac_list = [int(imei[:8]) for imei in imei_list]
                imei_list = list(map(int, imei_list))
                # Heuristic flags: any one being True marks the MSISDN a possible transient.
                analysis_tests = {'identical_tac': False,
                                  'consecutive_tac': False,
                                  'arithmetic_tac': False,
                                  'consecutive_imei': False,
                                  'arithmetic_imei': False}
                # NOTE(review): this info log fires once per candidate MSISDN, which may be
                # noisy for large candidate sets — consider debug level (behavior kept as-is).
                logger.info('Performing TAC analysis on the data...')
                if len(set(tac_list)) == 1:
                    analysis_tests['identical_tac'] = True
                else:
                    if _have_consecutive_numbers(tac_list):
                        analysis_tests['consecutive_tac'] = True
                    if _is_arithmetic_series(tac_list):
                        analysis_tests['arithmetic_tac'] = True
                logger.info('Performing IMEIs analysis on data...')
                if _have_consecutive_numbers(imei_list):
                    analysis_tests['consecutive_imei'] = True
                if _is_arithmetic_series(imei_list):
                    analysis_tests['arithmetic_imei'] = True
                if any(analysis_tests.values()):
                    possible_transients.append(val)
            # Emit each flagged MSISDN into its operator's CSV file.
            for item in possible_transients:
                opname_csvwriter_map[item.get('operator_id')].writerow([item.get('msisdn')])
    # Files are closed by the ExitStack above before metadata generation.
    logger.info('Per-MNO possible transient MSISDN lists generated successfully.')
    return _gen_metadata_for_reports(list(filename_op_map.keys()), report_dir)