def get_prefixes(**kwargs):
    prefix = kwargs['prefix']
    fname = generate_fname('prefix.dat', **kwargs)
    # List the prefixes under the configured bucket prefix and persist them
    # to a local file; only the file location travels through XCom.
    _, prefixes = get_object_list(bucket_name=FETCHES_BUCKET, prefix=prefix)
    with open(fname, 'w+') as f:
        f.write("\n".join(prefixes))
    kwargs['ti'].xcom_push(key='prefixes_location', value=fname)
def store_prefix_list(prefixes, **kwargs):
    base_dir = kwargs['base_dir']
    execution_date = kwargs['execution_date'].strftime('%Y-%m-%d')
    fname = generate_fname(suffix='prefix.dat',
                           base_dir=base_dir,
                           execution_date=execution_date)
    logging.info(f'Storing {len(prefixes)} prefixes in {fname}')
    with open(fname, 'w+') as f:
        f.write("\n".join(prefixes))
    return fname
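All of these tasks build their paths through a generate_fname helper whose body isn't shown in this section. For reference, a minimal sketch matching the keyword signature used here, assuming it simply groups files under base_dir by execution date, might look like:

import os


def generate_fname(suffix, base_dir, execution_date):
    # Hypothetical helper: the real implementation is not shown in this
    # section. Assumes outputs are grouped under base_dir by execution date.
    target_dir = os.path.join(base_dir, execution_date)
    os.makedirs(target_dir, exist_ok=True)
    return os.path.join(target_dir, suffix)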
def get_prefixes(**kwargs):
    prefix = kwargs['prefix']
    fname = generate_fname(
        suffix='prefix.dat',
        base_dir=kwargs['base_dir'],
        execution_date=kwargs['execution_date'].strftime('%Y-%m-%dT%H-%M'))
    _, prefixes = get_object_list(bucket_name=FETCHES_BUCKET, prefix=prefix)
    with open(fname, 'w+') as f:
        f.write("\n".join(prefixes))
    kwargs['ti'].xcom_push(key='prefixes_location', value=fname)
def generate_objects(**kwargs):
    fname = kwargs['ti'].xcom_pull(key='prefixes_location',
                                   task_ids='get_prefixes')
    output = generate_fname(
        suffix='objects.csv',
        base_dir=kwargs['base_dir'],
        execution_date=kwargs['execution_date'].strftime('%Y-%m-%dT%H-%M'))
    logging.info(f'The task will read from {fname} and write to: {output}')
    with open(fname, 'r') as f, open(output, 'w+') as out:
        for prefix in f:
            # Lines read from the file keep their trailing newline, so strip
            # it before using the prefix in the listing call.
            objects, _ = get_object_list(bucket_name=FETCHES_BUCKET,
                                         prefix=prefix.strip())
            for obj in objects:
                out.write(serialize_object(obj))
    kwargs['ti'].xcom_push(key='object_location', value=output)
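To see how the XCom handoff between the two tasks lines up, here is a sketch of how these callables might be wired into a DAG. The DAG id, schedule, and op_kwargs values are assumptions, not taken from the original; provide_context=True is the Airflow 1.x way to get ti and execution_date into kwargs (Airflow 2.x passes them automatically).

# Sketch of the wiring; DAG id, schedule, and op_kwargs are assumed values.
from datetime import datetime

from airflow import DAG
from airflow.operators.python_operator import PythonOperator

with DAG(dag_id='object_listing',
         start_date=datetime(2019, 1, 1),
         schedule_interval='@hourly') as dag:
    get_prefixes_task = PythonOperator(
        task_id='get_prefixes',  # must match the task_ids used in xcom_pull
        python_callable=get_prefixes,
        provide_context=True,
        op_kwargs={'prefix': 'fetches/', 'base_dir': '/tmp/data'})
    generate_objects_task = PythonOperator(
        task_id='generate_objects',
        python_callable=generate_objects,
        provide_context=True,
        op_kwargs={'base_dir': '/tmp/data'})
    get_prefixes_task >> generate_objects_task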
def new_prefix_list_needed(**kwargs):
    execution_date = kwargs['execution_date'].date()
    previous_run = kwargs['prev_execution_date'].date()
    logging.info(f'Start: {previous_run} End: {execution_date}')
    td = timedelta(days=1)
    if execution_date - previous_run >= td:
        logging.info(
            'The prefix generation task is too far back. Regenerating')
        return 'generate_prefix_list'
    location = generate_fname(
        suffix='prefix.dat',
        base_dir=kwargs['base_dir'],
        execution_date=execution_date.strftime('%Y-%m-%d'))
    if not os.path.isfile(location):
        logging.info(
            'Old prefix list does not exist for some reason. Regenerating')
        return 'generate_prefix_list'
    kwargs['ti'].xcom_push(key='prefixes_location', value=location)
    return 'dummy'
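Returning a task_id from the callable is the contract of a BranchPythonOperator: whichever id comes back is the branch that runs, and the other is skipped. A sketch of the wiring, with task ids matching the strings returned above (the DummyOperator for the 'dummy' branch and the op_kwargs values are assumptions):

from airflow.operators.dummy_operator import DummyOperator
from airflow.operators.python_operator import (BranchPythonOperator,
                                               PythonOperator)

check_prefix_list = BranchPythonOperator(
    task_id='new_prefix_list_needed',
    python_callable=new_prefix_list_needed,
    provide_context=True,
    op_kwargs={'base_dir': '/tmp/data'})
generate_prefix_list = PythonOperator(
    task_id='generate_prefix_list',  # matches one returned branch name
    python_callable=get_prefixes,
    provide_context=True,
    op_kwargs={'prefix': 'fetches/', 'base_dir': '/tmp/data'})
skip = DummyOperator(task_id='dummy')  # matches the other returned branch name
check_prefix_list >> [generate_prefix_list, skip]

Note that any task downstream of both branches needs a trigger_rule such as 'none_failed', otherwise the skipped branch cascades and the join task is skipped too.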
def generate_object_list_parallel(**kwargs):
    fname = kwargs['ti'].xcom_pull(key='prefixes_location',
                                   task_ids='generate_prefix_list')
    output = generate_fname(
        suffix='objects.csv',
        base_dir=kwargs['base_dir'],
        execution_date=kwargs['execution_date'].strftime('%Y-%m-%dT%H-%M'))
    logging.info('The task will read from %s and write to: %s', fname, output)
    with open(fname, 'r') as f:
        prefix_list = [line.strip() for line in f]
    if kwargs['filter_prefixes']:
        logging.info('Filtering prefixes')
        prefix_list = list(filter_prefixes(
            prefixes=prefix_list,
            start_date=kwargs['execution_date'] - timedelta(days=1),
            end_date=kwargs['execution_date']))
    logging.info('Number of prefixes to process: %d', len(prefix_list))
    with concurrent.futures.ThreadPoolExecutor(max_workers=None) as executor:
        # Map each future back to the prefix that produced it, so results
        # and failures can be attributed as they complete out of order.
        future_objects = {executor.submit(get_objects, prefix): prefix
                          for prefix in prefix_list}
        with open(output, 'w') as f:
            for future in concurrent.futures.as_completed(future_objects):
                prefix = future_objects[future]
                try:
                    data = future.result()
                except Exception as exc:
                    logging.error('%r generated an exception: %s', prefix, exc)
                else:
                    for o in data:
                        f.write(serialize_object(o))
    kwargs['ti'].xcom_push(key='object_location', value=output)
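The executor pattern above is independent of Airflow. Stripped to its essentials, with a stand-in fetch function in place of get_objects, it looks like this self-contained sketch:

import concurrent.futures
import logging


def fetch(prefix):
    # Stand-in for get_objects: pretend to list objects under a prefix.
    return [f'{prefix}/object-{i}' for i in range(3)]


def fetch_all(prefixes):
    results = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=None) as executor:
        # The dict maps each future back to its input, which is what lets
        # the error log name the prefix that failed.
        future_to_prefix = {executor.submit(fetch, p): p for p in prefixes}
        for future in concurrent.futures.as_completed(future_to_prefix):
            prefix = future_to_prefix[future]
            try:
                results.extend(future.result())
            except Exception as exc:
                logging.error('%r generated an exception: %s', prefix, exc)
    return results


print(len(fetch_all(['2019-01-01', '2019-01-02'])))

With max_workers=None the pool size falls back to an interpreter-version-dependent default derived from the CPU count, which is usually adequate for I/O-bound listing calls like these but is worth pinning explicitly in production.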