def _do_map(input_reader, processor_func, finalize_func, params,
            _shards, _output_writer, _output_writer_kwargs,
            _job_name, _queue_name,
            *processor_args, **processor_kwargs):
    # The unpacker helper (defined elsewhere in this module) resolves the
    # packed handler parameters and calls processor_func on each record.
    handler_spec = qualname(unpacker)
    handler_params = {
        "func": qualname(processor_func)
                if callable(processor_func) else processor_func,
        "args": processor_args,
        "kwargs": processor_kwargs,
    }
    handler_params.update(params)

    pipelines = []
    pipelines.append(
        MapperPipeline(
            _job_name,
            handler_spec=handler_spec,
            input_reader_spec=qualname(input_reader),
            output_writer_spec=(
                qualname(_output_writer) if _output_writer else None),
            params=handler_params,
            shards=_shards))
    # Optionally chain a finalize callback to run after the mapper stage.
    if finalize_func:
        pipelines.append(
            CallbackPipeline(
                qualname(finalize_func)
                if callable(finalize_func) else finalize_func,
                *processor_args, **processor_kwargs))

    new_pipeline = DynamicPipeline(pipelines)
    new_pipeline.start(queue_name=_queue_name or 'default')
    return new_pipeline
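# The handler_spec above points at an `unpacker` helper that is defined
# elsewhere in this module. A minimal sketch of what such an unpacker might
# look like, assuming appengine-mapreduce's context and util.for_name APIs;
# the parameter handling below is an assumption, not the module's actual
# implementation.

from mapreduce import context
from mapreduce.util import for_name


def unpacker(record):
    # Hypothetical sketch: read the packed handler parameters from the
    # running mapper's spec, resolve the processor function by its
    # qualified name, and invoke it on the current record.
    params = context.get().mapreduce_spec.mapper.params
    func = for_name(params['func'])
    return func(record, *params.get('args', ()),
                **params.get('kwargs', {}))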
import time


def run_transform():
    JOB_ID_PREFIX = 'ch12_%d' % int(time.time())
    TMP_PATH = 'tmp/mapreduce/%s' % JOB_ID_PREFIX
    # Extract from BigQuery to GCS.
    run_bigquery_job(
        JOB_ID_PREFIX, 'extract', {
            'sourceTable': table_reference('add_zip_input'),
            'destinationUri': 'gs://%s/%s/input-*' % (GCS_BUCKET, TMP_PATH),
            'destinationFormat': 'NEWLINE_DELIMITED_JSON',
        })
    # Run the mapper job to annotate the records.
    mapper = MapperPipeline(
        'Add Zip',
        'add_zip.apply',
        'mapreduce.input_readers.FileInputReader',
        'mapreduce.output_writers._GoogleCloudStorageOutputWriter',
        params={
            'files': ['/gs/%s/%s/input-*' % (GCS_BUCKET, TMP_PATH)],
            'format': 'lines',
            'output_writer': {
                'bucket_name': GCS_BUCKET,
                'naming_format': TMP_PATH + '/output-$num',
            }
        })
    mapper.start()
    wait_for_pipeline(mapper.pipeline_id)
    # Load from GCS into BigQuery.
    run_bigquery_job(
        JOB_ID_PREFIX, 'load', {
            'destinationTable': table_reference('add_zip_output'),
            'sourceUris': ['gs://%s/%s/output-*' % (GCS_BUCKET, TMP_PATH)],
            'sourceFormat': 'NEWLINE_DELIMITED_JSON',
            'schema': OUTPUT_SCHEMA,
            'writeDisposition': 'WRITE_TRUNCATE',
        })
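# run_transform relies on two helpers that are defined elsewhere:
# table_reference() and run_bigquery_job(). A minimal sketch against the
# BigQuery v2 REST API via google-api-python-client; the `bigquery` service
# object and the PROJECT_ID and DATASET_ID globals are assumptions, and the
# polling interval is illustrative.

def table_reference(table_id):
    # Hypothetical helper: point at a table in the working dataset.
    return {'projectId': PROJECT_ID,
            'datasetId': DATASET_ID,
            'tableId': table_id}


def run_bigquery_job(job_id_prefix, job_type, config):
    '''Hypothetical helper: insert a BigQuery job and block until DONE.'''
    job_id = '%s_%s' % (job_id_prefix, job_type)
    body = {
        'jobReference': {'projectId': PROJECT_ID, 'jobId': job_id},
        'configuration': {job_type: config},
    }
    bigquery.jobs().insert(projectId=PROJECT_ID, body=body).execute()
    while True:
        job = bigquery.jobs().get(
            projectId=PROJECT_ID, jobId=job_id).execute()
        if job['status']['state'] == 'DONE':
            break
        time.sleep(5)
    if 'errorResult' in job['status']:
        raise RuntimeError(job['status']['errorResult'])
    return job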
def wait_for_pipeline(pipeline_id):
    '''Wait for a MapReduce pipeline to complete.'''
    mapreduce_id = None
    while True:
        # Poll the pipeline every five seconds until it finalizes.
        time.sleep(5)
        pipeline = MapperPipeline.from_id(pipeline_id)
        if not mapreduce_id and pipeline.outputs.job_id.filled:
            mapreduce_id = pipeline.outputs.job_id.value
            # Publish a status link as soon as the MapReduce ID is known,
            # so progress can be followed before the job completes.
            with g_state_lock:
                g_state['mapper_link'] = (
                    '<a href="/mapreduce/detail?mapreduce_id=%s">%s</a>' % (
                        mapreduce_id, mapreduce_id))
        if pipeline.has_finalized:
            break
    if pipeline.outputs.result_status.value != 'success':
        raise RuntimeError('Mapper job failed, see status link.')
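# wait_for_pipeline publishes its status link through module-level shared
# state so a request handler can render it while the job runs. A minimal
# sketch of that state, assuming the transform runs on a background thread;
# the names match the globals used above, but the threading model is an
# assumption.

import threading

g_state_lock = threading.Lock()
g_state = {'mapper_link': None}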