Code example #1
def get_prefixes(**kwargs):
    """Fetch the prefix list for the configured bucket and persist it.

    Writes the prefixes (one per line) to a file derived from the task
    context and publishes the file path to XCom under 'prefixes_location'.
    """
    target = generate_fname('prefix.dat', **kwargs)

    # Only the prefix list is needed here; the object list is discarded.
    _, prefixes = get_object_list(bucket_name=FETCHES_BUCKET,
                                  prefix=kwargs['prefix'])

    with open(target, 'w+') as out:
        out.write("\n".join(prefixes))

    kwargs['ti'].xcom_push(key='prefixes_location', value=target)
Code example #2
def store_prefix_list(prefixes, **kwargs):
    """Write the given prefixes to a per-execution-date file, one per line.

    Args:
        prefixes: iterable of prefix strings to persist.
        **kwargs: Airflow-style context; reads 'base_dir' and
            'execution_date' (a datetime-like object with strftime).

    Returns:
        The path of the file the prefixes were written to.
    """
    base_dir = kwargs['base_dir']
    execution_date = kwargs['execution_date'].strftime('%Y-%m-%d')

    fname = generate_fname(suffix='prefix.dat',
                           base_dir=base_dir, execution_date=execution_date)
    # Bug fix: the original passed positional args alongside an f-string,
    # which raises "not all arguments converted during string formatting"
    # when the record is emitted. Use lazy %-style logging args instead.
    logging.info('Storing %d prefixes in %s', len(prefixes), fname)
    with open(fname, 'w+') as f:
        f.write("\n".join(prefixes))

    return fname
Code example #3
def get_prefixes(**kwargs):
    """List the bucket prefixes, store them in a timestamped file, and
    publish the file location to XCom as 'prefixes_location'."""
    stamp = kwargs['execution_date'].strftime('%Y-%m-%dT%H-%M')
    out_path = generate_fname(
        suffix='prefix.dat',
        base_dir=kwargs['base_dir'],
        execution_date=stamp)

    # Only the prefix list is needed; the object list is discarded.
    _, prefixes = get_object_list(bucket_name=FETCHES_BUCKET,
                                  prefix=kwargs['prefix'])

    with open(out_path, 'w+') as handle:
        handle.write("\n".join(prefixes))

    kwargs['ti'].xcom_push(key='prefixes_location', value=out_path)
Code example #4
def generate_objects(**kwargs):
    """List objects for every stored prefix and serialize them to a file.

    Reads the prefix file whose path was pushed to XCom by 'get_prefixes',
    fetches the object list for each prefix, writes the serialized objects
    to a per-run output file, and pushes that file's path to XCom under
    'object_location'.
    """
    fname = kwargs['ti'].xcom_pull(key='prefixes_location',
                                   task_ids='get_prefixes')
    output = generate_fname(
        suffix='objects.csv',
        base_dir=kwargs['base_dir'],
        execution_date=kwargs['execution_date'].strftime('%Y-%m-%dT%H-%M'))

    logging.info(f'The task will read from {fname} and write to: {output}')

    with open(fname, 'r') as f:
        with open(output, 'w+') as out:
            for line in f:
                # Bug fix: iterating a file yields lines WITH the trailing
                # newline, so the prefix passed to get_object_list carried a
                # '\n'. Strip it, matching the parallel variant in this file.
                prefix = line.strip()
                if not prefix:
                    continue  # skip blank lines defensively
                objects, _ = get_object_list(bucket_name=FETCHES_BUCKET,
                                             prefix=prefix)
                for obj in objects:
                    out.write(serialize_object(obj))

    kwargs['ti'].xcom_push(key='object_location', value=output)
Code example #5
def new_prefix_list_needed(**kwargs):
    """Branch callable: decide whether the prefix list must be regenerated.

    Returns 'generate_prefix_list' when the previous run is a day or more
    old, or when the previously generated file is missing. Otherwise pushes
    the existing file's location to XCom ('prefixes_location') and returns
    'dummy'.
    """
    execution_date = kwargs['execution_date'].date()
    previous_run = kwargs['prev_execution_date'].date()
    logging.info(f'Start: {previous_run} End: {execution_date}')

    # A prefix list generated a full day (or more) ago is considered stale.
    stale = (execution_date - previous_run) >= timedelta(days=1)
    if stale:
        logging.info(
            'The prefix generation task is too far back. Regenerating')
        return 'generate_prefix_list'

    location = generate_fname(suffix='prefix.dat',
                              base_dir=kwargs['base_dir'],
                              execution_date=execution_date.strftime('%Y-%m-%d'))

    # Guard against a missing file from an earlier run.
    if not os.path.isfile(location):
        logging.info(
            'Old prefix list does not exist for some reason. Regenerating')
        return 'generate_prefix_list'

    kwargs['ti'].xcom_push(key='prefixes_location', value=location)
    return 'dummy'
Code example #6
def generate_object_list_parallel(**kwargs):
    """Build the object list file by querying every prefix concurrently.

    Reads the prefix file published by 'generate_prefix_list', optionally
    filters the prefixes to the final day of the run window, then fans the
    per-prefix object listing out over a thread pool and serializes all
    results into a single output file whose path is pushed to XCom under
    'object_location'.
    """
    fname = kwargs['ti'].xcom_pull(
        key='prefixes_location', task_ids='generate_prefix_list')
    output = generate_fname(suffix='objects.csv',
                            base_dir=kwargs['base_dir'],
                            execution_date=kwargs['execution_date'].strftime('%Y-%m-%dT%H-%M'))
    logging.info('The task will read from %s and write to: %s' %
                 (fname, output))

    # One prefix per line; strip newlines so lookups match what was stored.
    with open(fname, 'r') as f:
        prefix_list = list(map(lambda a: a.strip(), f.readlines()))

    if kwargs['filter_prefixes']:
        logging.info('Filtering prefixes')
        # Restrict to prefixes falling within the window
        # (execution_date - 1 day .. execution_date).
        prefix_list = list(filter_prefixes(prefixes=prefix_list,
                                           start_date=kwargs['execution_date'] -
                                           timedelta(days=1),
                                           end_date=kwargs['execution_date']))

    logging.info('Number of prefixes to process: %d ', len(prefix_list))

    # max_workers=None lets ThreadPoolExecutor choose its default pool size.
    with concurrent.futures.ThreadPoolExecutor(max_workers=None) as executor:
        # Map each future back to the prefix it was submitted for, so a
        # failure can be logged with the offending prefix.
        future_objects = {executor.submit(
            get_objects, prefix): prefix for prefix in prefix_list}
        with open(output, 'w') as f:
            for future in concurrent.futures.as_completed(future_objects):
                url = future_objects[future]
                try:
                    data = future.result()
                except Exception as exc:
                    # Best-effort: log the failed prefix and keep consuming
                    # the remaining futures rather than aborting the task.
                    logging.error('%r generated an exception: %s' % (url, exc))
                else:
                    for o in data:
                        f.write(serialize_object(o))

    kwargs['ti'].xcom_push(key='object_location', value=output)