def run(self, map_result_status, reduce_result_status, reduce_outputs):
  """Resolve the overall job status from the map and reduce stage statuses.

  Precedence: if either stage was aborted the job is aborted; otherwise if
  either stage failed the job failed; otherwise the job succeeded. The
  resolved status is written to this pipeline's `result_status` output slot.

  Args:
    map_result_status: result status of the map stage (a
      model.MapreduceState.RESULT_* constant).
    reduce_result_status: result status of the reduce stage.
    reduce_outputs: outputs of the reduce stage, returned only on success.

  Yields:
    pipeline_common.Return with `reduce_outputs` on success, or an empty
    list when the job was aborted or failed.
  """
  statuses = (map_result_status, reduce_result_status)
  # Aborted outranks failed, which outranks success.
  if model.MapreduceState.RESULT_ABORTED in statuses:
    result_status = model.MapreduceState.RESULT_ABORTED
  elif model.MapreduceState.RESULT_FAILED in statuses:
    result_status = model.MapreduceState.RESULT_FAILED
  else:
    result_status = model.MapreduceState.RESULT_SUCCESS

  self.fill(self.outputs.result_status, result_status)
  succeeded = result_status == model.MapreduceState.RESULT_SUCCESS
  # Suppress reducer outputs unless the whole job succeeded.
  yield pipeline_common.Return(reduce_outputs if succeeded else [])
def run(self, job_name, mapper_spec, reducer_spec, input_reader_spec,
        output_writer_spec=None, mapper_params=None, reducer_params=None,
        shards=None, combiner_spec=None):
  """Chain the map, shuffle and reduce stages of a MapReduce job.

  Args:
    job_name: human-readable name of this job.
    mapper_spec: fully qualified name of the map function.
    reducer_spec: fully qualified name of the reduce function.
    input_reader_spec: fully qualified name of the input reader class.
    output_writer_spec: fully qualified name of the output writer class.
    mapper_params: parameters passed to the map stage.
    reducer_params: parameters passed to the reduce stage.
    shards: number of shards for the map stage.
    combiner_spec: optional fully qualified name of a combiner function.

  Yields:
    The child pipelines, ending with a Return of the reducer stage.
  """
  mapper_stage = yield MapPipeline(job_name,
                                   mapper_spec,
                                   input_reader_spec,
                                   params=mapper_params,
                                   shards=shards)
  shuffle_stage = yield ShufflePipeline(job_name, mapper_stage)
  reduce_stage = yield ReducePipeline(job_name,
                                      reducer_spec,
                                      output_writer_spec,
                                      reducer_params,
                                      shuffle_stage,
                                      combiner_spec=combiner_spec)

  # Delete intermediate files only after the reducer has consumed them.
  with pipeline.After(reduce_stage):
    temp_files = yield pipeline_common.Extend(mapper_stage, shuffle_stage)
    yield CleanupPipeline(temp_files)

  yield pipeline_common.Return(reduce_stage)
def run(self, job_name, bucket_name, filenames):
  """Fan out one single-shard sort mapper per group of GCS input files.

  Each entry of `filenames` is sorted by its own MapperPipeline; once all
  sorters finish, their output files are collected, and the sorters' own
  temporary outputs are cleaned up after collection.

  Args:
    job_name: name of the overall shuffle job, used to label sort mappers.
    bucket_name: GCS bucket holding the input files; the "/<bucket>/"
      prefix is stripped from each filename before it is handed to the
      reader.
    filenames: list of per-shard filename groups to sort.

  Yields:
    The sort mapper pipelines, then a Return of the collected output files.
  """
  sort_mappers = []
  # enumerate() replaces the index-into-list loop: we need `i` only to
  # build a distinct job label per shard.
  for i, shard_filenames in enumerate(filenames):
    filenames_only = util.strip_prefix_from_items("/%s/" % bucket_name,
                                                  shard_filenames)
    sort_mapper = yield mapper_pipeline.MapperPipeline(
        "%s-shuffle-sort-%s" % (job_name, str(i)),
        __name__ + "._sort_records_map",
        __name__ + "._BatchGCSRecordsReader",
        None,
        {
            "input_reader": {
                "bucket_name": bucket_name,
                "objects": filenames_only,
            },
        },
        # Single shard so each group is sorted as one contiguous stream.
        shards=1)
    sort_mappers.append(sort_mapper)

  with pipeline.After(*sort_mappers):
    job_ids = yield pipeline_common.Append(*[mapper.job_id for
                                             mapper in sort_mappers])
    result = yield _CollectOutputFiles(job_ids)
    with pipeline.After(result):
      yield _CleanupOutputFiles(job_ids)
      yield pipeline_common.Return(result)
def run(self, job_name, filenames):
  """Shuffle input files: hash into buckets, sort each bucket, then merge.

  Args:
    job_name: name of the shuffle job, forwarded to each stage.
    filenames: input files to shuffle.

  Yields:
    The hash, sort and merge stages, a deferred cleanup of the
    intermediate files, and a Return of the merged files.
  """
  bucketed = yield _HashPipeline(job_name, filenames)
  ordered = yield _SortChunksPipeline(job_name, bucketed)
  merged = yield _MergePipeline(job_name, ordered)

  # Intermediate hash/sort files are only safe to delete once the merge
  # stage has finished reading them.
  with pipeline.After(merged):
    scratch = yield pipeline_common.Extend(bucketed, ordered)
    yield mapper_pipeline._CleanupPipeline(scratch)

  yield pipeline_common.Return(merged)
def run(self, job_name, filenames, shards=None):
  """Shuffle input files, preferring the native shuffler service.

  When the files-API shuffler service is available it does the whole
  shuffle; otherwise fall back to the hash/sort/merge pipeline chain and
  clean up the intermediate files once merging is done.

  Args:
    job_name: name of the shuffle job, forwarded to each stage.
    filenames: input files to shuffle.
    shards: optional shard count for the fallback hash stage.

  Yields:
    Either the shuffle-service pipeline, or the fallback stages ending
    with a Return of the merged files.
  """
  if files.shuffler.available():
    # NOTE(review): this branch yields no pipeline_common.Return —
    # presumably _ShuffleServicePipeline fills its own outputs; confirm.
    yield _ShuffleServicePipeline(job_name, filenames)
    return

  bucketed = yield _HashPipeline(job_name, filenames, shards=shards)
  ordered = yield _SortChunksPipeline(job_name, bucketed)
  scratch_stages = [bucketed, ordered]
  merged = yield _MergePipeline(job_name, ordered)

  # Delete hash/sort intermediates only after the merge has read them.
  with pipeline.After(merged):
    scratch = yield pipeline_common.Extend(*scratch_stages)
    yield mapper_pipeline._CleanupPipeline(scratch)

  yield pipeline_common.Return(merged)
def run(self, job_name, mapper_params, filenames, shards=None):
  """Shuffle GCS input files via hash, sort and merge stages.

  Args:
    job_name: name of the shuffle job, forwarded to each stage.
    mapper_params: mapper parameter dict; must contain "bucket_name",
      the GCS bucket the stages operate in.
    filenames: input files to shuffle.
    shards: optional shard count for the hash stage.

  Yields:
    The hash, sort and merge stages, a deferred GCS cleanup of the
    intermediate files, and a Return of the merged files.
  """
  bucket = mapper_params["bucket_name"]

  bucketed = yield _HashPipeline(job_name, bucket, filenames, shards=shards)
  ordered = yield _SortChunksPipeline(job_name, bucket, bucketed)
  scratch_stages = [bucketed, ordered]
  merged = yield _MergePipeline(job_name, bucket, ordered)

  # Intermediate objects may only be deleted once the merge has consumed
  # them, hence the After barrier.
  with pipeline.After(merged):
    scratch = yield pipeline_common.Extend(*scratch_stages)
    yield _GCSCleanupPipeline(scratch)

  yield pipeline_common.Return(merged)
def run(self, job_name, filenames):
  """Fan out one single-shard sort mapper per input file (files API).

  Each file in `filenames` is sorted by its own MapperPipeline; once all
  sorters finish, their output files are collected, and the sorters' own
  temporary outputs are cleaned up after collection.

  Args:
    job_name: name of the overall shuffle job, used to label sort mappers.
    filenames: list of files to sort, one sort mapper per file.

  Yields:
    The sort mapper pipelines, then a Return of the collected output files.
  """
  sort_mappers = []
  # enumerate() replaces the index-into-list loop: the index is needed
  # only to build a distinct job label per file.
  for i, filename in enumerate(filenames):
    sort_mapper = yield mapper_pipeline.MapperPipeline(
        "%s-shuffle-sort-%s" % (job_name, str(i)),
        __name__ + "._sort_records_map",
        __name__ + "._BatchRecordsReader",
        None,
        {
            "files": [filename],
            "processing_rate": 1000000,
        },
        # Single shard so each file is sorted as one contiguous stream.
        shards=1)
    sort_mappers.append(sort_mapper)

  with pipeline.After(*sort_mappers):
    job_ids = yield pipeline_common.Append(
        *[mapper.job_id for mapper in sort_mappers])
    result = yield _CollectOutputFiles(job_ids)
    with pipeline.After(result):
      yield _CleanupOutputFiles(job_ids)
      yield pipeline_common.Return(result)