def run(devices, filters, round_interactions, last_timestamp_filter=None, days_back=None):
    """Search S3 for the interactions of *devices* and return them.

    :param devices: device ids to search for
    :param filters: filter definitions forwarded to read_s3
    :param round_interactions: rounding configuration forwarded to read_s3
    :param last_timestamp_filter: skip files older than this timestamp
    :param days_back: number of days to look back; ``None`` means
        ``config.days_look_back()`` evaluated at call time (the original
        default was evaluated once at import time)
    :return: ``(filter_id_2_connections, unique_interactions)`` as produced
        by :func:`read_s3`
    """
    if days_back is None:
        days_back = config.days_look_back()
    print('Gonna search for', devices)
    # BUG FIX: the original ignored the days_back argument and always passed
    # config.days_look_back() to read_s3; the caller's value is now honored.
    r, i = read_s3(devices,
                   filters,
                   round_interactions,
                   last_timestamp_filter=last_timestamp_filter,
                   days_back=days_back)
    print('read from s3, gonna send interactions', r, i)
    return r, i
def get_paths_s3(device_id, days_back=None):
    """Return the S3 folder prefixes for *device_id* from today back *days_back* days.

    Prefixes have the shape ``YYYY/MM/DD/<device path>`` (zero-padded month
    and day), ordered from today backwards.

    :param device_id: device id
    :param days_back: days to go back; ``None`` means ``config.days_look_back()``
        evaluated at call time (the original default was evaluated once at
        import time, freezing any later config change)
    :return: list of path prefixes, newest first
    """
    if days_back is None:
        days_back = config.days_look_back()
    today = datetime.now()
    paths = []
    # range(days_back + 1): the window is inclusive of today AND the oldest day.
    for day_offset in range(days_back + 1):
        d = today - timedelta(days=day_offset)
        paths.append(str(d.year) + '/' + padding_zeroes(d.month, 2) + '/' +
                     padding_zeroes(d.day, 2) + '/' + build_path_device(device_id))
    return paths
def periodic_new_infected_interactions_check(debug=False):
    """Page through recently infected devices and enqueue them on SQS for re-analysis.

    Fetches devices infected since ``now - config.days_look_back()`` days from
    MySQL in batches of ``LIMIT_BATCH``, sends each batch to the patients SQS
    queue as a ``{'recurrent': {...}}`` message, and stamps the batch's devices
    with the current time via ``update_last_analysis_timestamp``.

    :param debug: when True, skip the actual SQS send, pass ``debug`` through
        to the mysql_handler calls, and stop after the first batch.
    """
    offset = 0
    continuation = True
    today = datetime.now()
    # Only devices infected within the look-back window are re-checked.
    # (timedelta's first positional argument is days.)
    condition_infection = today - timedelta(config.days_look_back())
    sqs_client = boto3.resource('sqs', region_name=config.get_aws_region())
    queue_infection = sqs_client.Queue(config.get_sqs_patients_url())
    while continuation:
        res = mysql_handler.get_infected(condition_infection,
                                         offset=offset,
                                         limit=LIMIT_BATCH,
                                         debug=debug)
        print('ANALYZING INFECTED LATER', res)
        if res is None or len(res) == 0:
            continuation = False
            break
        device_ids = []
        # Oldest last_analysis_timestamp in the batch: the downstream consumer
        # restarts analysis from this point so no interactions are missed.
        min_timestamp_analysis = today
        for entry in res:
            device_ids.append(str(entry['device_id']))
            min_timestamp_analysis = min(min_timestamp_analysis,
                                         entry['last_analysis_timestamp'])
        recurrent_infected = {
            'recurrent': {
                'device_ids': device_ids,
                'timestamp_min_unix': datetime.timestamp(min_timestamp_analysis)
            }
        }
        print('sending new message:', recurrent_infected)
        if not debug:
            queue_infection.send_message(
                MessageBody=json.dumps(recurrent_infected))
        print('updating current devices last timestamp')
        mysql_handler.update_last_analysis_timestamp(device_ids, today, debug=debug)
        # NOTE(review): offset pagination while mutating the just-fetched rows —
        # if get_infected filters or orders on last_analysis_timestamp, rows may
        # be skipped between pages; confirm against mysql_handler's query.
        offset += LIMIT_BATCH
        if debug:
            break
def read_s3(devices, filters, round_interactions=None, last_timestamp_filter=None, firstRun=True, days_back=None):
    """Query S3 interactions for *devices*, then run a second-pass query for
    the devices discovered in the first pass (iPhones) and merge the results.

    :param devices: device ids (ints) to search for
    :param filters: filter definitions; each must carry a ``'filter_id'`` key
    :param round_interactions: rounding configuration for aggregation
    :param last_timestamp_filter: skip S3 files older than this timestamp
    :param firstRun: unused; kept for backward compatibility with callers
    :param days_back: days to look back; ``None`` means ``config.days_look_back()``
        evaluated at call time (the original default was frozen at import time)
    :return: ``(filter_id_2_connections, unique_interactions)``
    """
    if days_back is None:
        days_back = config.days_look_back()
    filter_id_2_filter = {}
    for fil in filters:
        filter_id_2_filter[fil['filter_id']] = fil
    # First pass: interactions of the requested devices.
    filter_id_2_device_id_2_interactions, unique_interactions = select_s3_interactions(
        devices,
        filters,
        interactions_further=None,
        last_timestamp_filter=last_timestamp_filter,
        days_back=days_back)
    filter_id_2_connections = aggregate_result_select(
        filter_id_2_device_id_2_interactions, filter_id_2_filter,
        round_interactions)
    # Second pass ("for iphones"): query the devices discovered in pass one.
    filter_id_2_device_id_2_interactions_further, unique_interactions_further = select_s3_interactions(
        list(unique_interactions),
        filters,
        interactions_further=filter_id_2_connections,
        last_timestamp_filter=last_timestamp_filter,
        days_back=days_back)
    filter_id_2_connections_further = aggregate_result_select(
        filter_id_2_device_id_2_interactions_further, filter_id_2_filter,
        round_interactions)
    # Hoisted: O(1) membership tests instead of scanning the devices list
    # once per interaction.
    known_devices = set(devices)
    # Merge second-pass results, excluding the originally requested devices.
    for filter_id, interactions in filter_id_2_connections_further.items():
        for inter in interactions:
            if int(inter['id']) not in known_devices:
                if filter_id not in filter_id_2_connections:
                    filter_id_2_connections[filter_id] = []
                filter_id_2_connections[filter_id].append(inter)
    for uif in unique_interactions_further:
        if int(uif) not in known_devices:
            unique_interactions.append(uif)
    return filter_id_2_connections, unique_interactions
def select_s3_interactions(devices, filters, interactions_further=None, last_timestamp_filter=None, days_back=None):
    """Query every relevant S3 file for each filter and collect interactions.

    For each filter a query is built and executed against all candidate files
    with a bounded multiprocessing pool (at most ``config.get_number_processes()``
    tasks in flight).

    :param devices: device ids to search for
    :param filters: filter definitions; each must carry a ``'filter_id'`` key
    :param interactions_further: optional ``{filter_id: interactions}`` from a
        previous pass, folded into the per-filter query
    :param last_timestamp_filter: skip files older than this timestamp
    :param days_back: days to look back; ``None`` means ``config.days_look_back()``
        evaluated at call time (the original default was frozen at import time)
    :return: ``({filter_id: {interaction_id: [interactions]}}, unique_interactions)``
    """
    if days_back is None:
        days_back = config.days_look_back()
    filter_id2connections = {}
    all_interactions = set()
    if len(devices) > 0:
        file_names = list_files(devices,
                                last_timestamp_filter=last_timestamp_filter,
                                days_back=days_back)
        for fil in filters:
            interactions = None
            if interactions_further is not None and fil['filter_id'] in interactions_further:
                interactions = interactions_further[fil['filter_id']]
            query = build_query(devices, fil, interactions)
            print('Iterating for filter', fil)
            print(query)
            interaction_id2_interactions = {}
            print('gonna query {} files'.format(len(file_names)))
            counter = 0
            pool_size = config.get_number_processes()
            processes = []
            # Pool with at most pool_size concurrent worker processes.
            pool = multiprocessing.Pool(processes=pool_size)
            try:
                # Pre-populate up to pool_size tasks.
                for _ in range(min(pool_size, len(file_names))):
                    processes.append(
                        pool.apply_async(query_file, args=(file_names[counter], query,)))
                    counter += 1
                while True:
                    # Iterate backwards so pop(ip) doesn't shift pending indices.
                    for ip in range(len(processes) - 1, -1, -1):
                        process = processes[ip]
                        if process.ready():
                            # TODO: process.get() occasionally raises here
                            # (worker-side exception); consider wrapping.
                            interaction_id2_interactions_partial, unique_interactions = process.get()
                            all_interactions = all_interactions.union(unique_interactions)
                            for iid, its in interaction_id2_interactions_partial.items():
                                interaction_id2_interactions.setdefault(iid, []).extend(its)
                            processes.pop(ip)
                            if counter >= len(file_names):
                                # No more files to schedule for this slot.
                                continue
                            processes.append(
                                pool.apply_async(query_file, args=(file_names[counter], query,)))
                            counter += 1
                    if len(processes) == 0:
                        break
            finally:
                # FIX: the pool was previously leaked — one Pool per filter was
                # created and never closed/joined, accumulating worker processes.
                pool.close()
                pool.join()
            filter_id2connections[fil['filter_id']] = interaction_id2_interactions
    return filter_id2connections, list(all_interactions)
def list_files(devices, last_timestamp_filter=None, days_back=None):
    """Return the S3 object keys for *devices* going back *days_back* days.

    Lists every prefix produced by :func:`get_paths_s3`, following S3
    continuation tokens, and optionally drops files whose filename-encoded
    date precedes *last_timestamp_filter* (keys are expected to look like
    ``.../<date>_<suffix>``).

    :param devices: device ids to list files for
    :param last_timestamp_filter: skip files dated before this timestamp
    :param days_back: days to go back; ``None`` means ``config.days_look_back()``
        evaluated at call time (the original default was frozen at import time)
    :return: de-duplicated list of S3 keys
    """
    if days_back is None:
        days_back = config.days_look_back()
    client = boto3.client('s3')
    if last_timestamp_filter is not None:
        print('gonna filtering out files before', last_timestamp_filter)
    file_names = set()
    for device_id in devices:
        paths = get_paths_s3(device_id, days_back=days_back)
        for p in paths:
            continuation = None
            iterate = True
            while iterate:
                if continuation is not None:
                    response = client.list_objects_v2(
                        Bucket=BUCKET,
                        MaxKeys=1000,
                        Prefix=p,
                        ContinuationToken=continuation)
                else:
                    response = client.list_objects_v2(Bucket=BUCKET,
                                                      MaxKeys=1000,
                                                      Prefix=p)
                if response['ResponseMetadata']['HTTPStatusCode'] == 200:
                    # Keep paging while S3 reports the listing was truncated.
                    iterate = response['IsTruncated']
                    if 'Contents' in response:
                        for e in response['Contents']:
                            file_name_to_add = e['Key']
                            if last_timestamp_filter is not None:
                                if '/' in file_name_to_add and '_' in file_name_to_add:
                                    try:
                                        dir_tree = file_name_to_add.split('/')
                                        csv_file_name = dir_tree[len(dir_tree) - 1]
                                        # Filename prefix before '_' is parsed
                                        # as the file's date.
                                        date_file = parser.parse(
                                            csv_file_name.split('_')[0])
                                        if date_file < last_timestamp_filter:
                                            # File already analyzed — skip it.
                                            print('skipping file', file_name_to_add)
                                            continue
                                    except Exception:
                                        # FIX: was a bare `except:`, which also
                                        # swallowed KeyboardInterrupt/SystemExit.
                                        # Unparseable names fall through and are
                                        # kept, as before.
                                        print(
                                            'EXCEPTION on parsing datetime in filename '
                                            + file_name_to_add)
                            file_names.add(file_name_to_add)
                    if 'NextContinuationToken' in response:
                        print('CONTINATION TOKEN')
                        continuation = response['NextContinuationToken']
                else:
                    print("Error", response)
                    break
    return list(file_names)