Example #1: _do_map, a helper that chains a MapperPipeline with an optional finalize callback
def _do_map(input_reader, processor_func, finalize_func, params, _shards,
            _output_writer, _output_writer_kwargs, _job_name, _queue_name,
            *processor_args, **processor_kwargs):

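    # qualname(), unpacker and the *Pipeline classes come from the surrounding
    # mapreduce helper module. The map handler is the module-level `unpacker`,
    # which is handed the real processor func and its args/kwargs via params.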
    handler_spec = qualname(unpacker)
    handler_params = {
        "func": qualname(processor_func)
        if callable(processor_func) else processor_func,
        "args": processor_args,
        "kwargs": processor_kwargs
    }

    handler_params.update(params)

    pipelines = []
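    # First stage: a MapperPipeline that runs the handler over every record
    # produced by the input reader, optionally writing results out.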
    pipelines.append(
        MapperPipeline(_job_name,
                       handler_spec=handler_spec,
                       input_reader_spec=qualname(input_reader),
                       output_writer_spec=qualname(_output_writer)
                       if _output_writer else None,
                       params=handler_params,
                       shards=_shards))

    if finalize_func:
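        # Second stage: a one-shot callback that runs finalize_func once the
        # map stage has completed.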
        pipelines.append(
            CallbackPipeline(
                qualname(finalize_func) if callable(finalize_func) else
                finalize_func, *processor_args, **processor_kwargs))

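    # Chain the stages into a single DynamicPipeline and start it on the
    # requested task queue, falling back to 'default'.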
    new_pipeline = DynamicPipeline(pipelines)
    new_pipeline.start(queue_name=_queue_name or 'default')
    return new_pipeline
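For reference, a call to this helper might look like the sketch below. The DatastoreInputReader, the annotate() processor, the entity kind and the job and queue names are illustrative assumptions; only _do_map itself comes from the snippet above.

# Hypothetical usage sketch for _do_map (reader, processor and names are assumptions).
from mapreduce.input_readers import DatastoreInputReader

def annotate(entity, label):
    # Processor run once per mapped entity; `label` arrives via *processor_args.
    entity.label = label
    entity.put()

job = _do_map(
    DatastoreInputReader,                    # input_reader
    annotate,                                # processor_func (callable, so qualname() is taken)
    None,                                    # finalize_func: skip the callback stage
    {"entity_kind": "myapp.models.Thing"},   # extra params merged into handler_params
    4,                                       # _shards
    None,                                    # _output_writer
    None,                                    # _output_writer_kwargs
    "annotate-things",                       # _job_name
    "default",                               # _queue_name
    "imported",                              # forwarded to annotate() via *processor_args
)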
Example #2: run_transform, a BigQuery extract, map, and load round trip
def run_transform():
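    # Assumes `time`, MapperPipeline and the helpers used below (run_bigquery_job,
    # table_reference, wait_for_pipeline) plus GCS_BUCKET and OUTPUT_SCHEMA are
    # imported or defined elsewhere in this module.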
    JOB_ID_PREFIX = 'ch12_%d' % int(time.time())
    TMP_PATH = 'tmp/mapreduce/%s' % JOB_ID_PREFIX

    # Extract from BigQuery to GCS.
    run_bigquery_job(
        JOB_ID_PREFIX, 'extract', {
            'sourceTable': table_reference('add_zip_input'),
            'destinationUri': 'gs://%s/%s/input-*' % (GCS_BUCKET, TMP_PATH),
            'destinationFormat': 'NEWLINE_DELIMITED_JSON',
        })

    # Run the mapper job to annotate the records.
    mapper = MapperPipeline(
        'Add Zip',
        'add_zip.apply',
        'mapreduce.input_readers.FileInputReader',
        'mapreduce.output_writers._GoogleCloudStorageOutputWriter',
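        # FileInputReader takes Files API style '/gs/<bucket>/...' paths, while
        # the BigQuery extract and load jobs use 'gs://<bucket>/...' URIs.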
        params={
            'files': ['/gs/%s/%s/input-*' % (GCS_BUCKET, TMP_PATH)],
            'format': 'lines',
            'output_writer': {
                'bucket_name': GCS_BUCKET,
                'naming_format': TMP_PATH + '/output-$num',
            }
        })
    mapper.start()
    wait_for_pipeline(mapper.pipeline_id)

    # Load from GCS into BigQuery.
    run_bigquery_job(
        JOB_ID_PREFIX, 'load', {
            'destinationTable': table_reference('add_zip_output'),
            'sourceUris': ['gs://%s/%s/output-*' % (GCS_BUCKET, TMP_PATH)],
            'sourceFormat': 'NEWLINE_DELIMITED_JSON',
            'schema': OUTPUT_SCHEMA,
            'writeDisposition': 'WRITE_TRUNCATE',
        })
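run_bigquery_job, table_reference, GCS_BUCKET and OUTPUT_SCHEMA are helpers and constants defined elsewhere in the example. wait_for_pipeline is essentially a polling loop; a minimal sketch, assuming the App Engine pipeline library, could look like this (the import path and poll interval are assumptions):

# Minimal polling sketch, not the original helper.
import time

from pipeline import pipeline  # import path depends on how appengine-pipelines is vendored


def wait_for_pipeline(pipeline_id, poll_seconds=5):
    # Re-read the pipeline record until the whole pipeline has finalized.
    while True:
        stage = pipeline.Pipeline.from_id(pipeline_id)
        if stage is None:
            raise RuntimeError('No pipeline found for id %s' % pipeline_id)
        if stage.has_finalized:
            return stage
        time.sleep(poll_seconds)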