def run(self, job_name, bucket_name, filenames):
    sort_mappers = []
    for i in range(len(filenames)):
        # Each element of filenames is a list of chunk files written by one
        # shard; strip the "/<bucket>/" prefix so the GCS input reader gets
        # bare object names.
        filenames_only = util.strip_prefix_from_items(
            "/%s/" % bucket_name, filenames[i])
        # One single-shard sort job per group of chunk files.
        sort_mapper = yield mapper_pipeline.MapperPipeline(
            "%s-shuffle-sort-%s" % (job_name, str(i)),
            __name__ + "._sort_records_map",
            __name__ + "._BatchGCSRecordsReader",
            None,
            {
                "input_reader": {
                    "bucket_name": bucket_name,
                    "objects": filenames_only,
                },
            },
            shards=1)
        sort_mappers.append(sort_mapper)
    # Once every sort job has finished, gather the output files they
    # produced, then clean up the intermediates and return the result.
    with pipeline.After(*sort_mappers):
        job_ids = yield pipeline_common.Append(
            *[mapper.job_id for mapper in sort_mappers])
        result = yield _CollectOutputFiles(job_ids)
        with pipeline.After(result):
            yield _CleanupOutputFiles(job_ids)
            yield pipeline_common.Return(result)
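# Illustrative sketch, not the library's actual "_sort_records_map": the
# handler named above conceptually receives one batch of serialized
# KeyValue records per call, orders the whole batch by key, and writes it
# back out as one sorted run. Modeled here with plain (key, value) tuples;
# the function name and record format are assumptions for illustration.
def sort_records_batch(records):
    """Return one batch of (key, value) records ordered by key."""
    return sorted(records, key=lambda kv: kv[0])

# Example: sort_records_batch([("b", 2), ("a", 1)]) == [("a", 1), ("b", 2)]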
def run(self, shards):
    result = []
    # Transpose: shards[s][i] is the file mapper shard s wrote for output
    # partition i, so group the i-th file of every shard together.
    shuffled_shards = [[] for _ in shards[0]]
    for shard in shards:
        for i, filename in enumerate(shard):
            shuffled_shards[i].append(filename)
    # Sort each partition's group of files; the trailing Append exposes
    # the list of sorted outputs as this pipeline's result.
    for filenames in shuffled_shards:
        sorted_files = yield shuffler.SortPipeline(filenames)
        result.append(sorted_files)
    yield pipeline_common.Append(*result)
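# Worked example of the regrouping above, with hypothetical filenames:
# each mapper shard writes one file per output partition, and the nested
# loop transposes that matrix so all of a partition's files land in the
# same group. Equivalent to list(map(list, zip(*shards))).
shards = [
    ["shard0-part0", "shard0-part1"],  # files from mapper shard 0
    ["shard1-part0", "shard1-part1"],  # files from mapper shard 1
]
shuffled = [[] for _ in shards[0]]
for shard in shards:
    for i, filename in enumerate(shard):
        shuffled[i].append(filename)
assert shuffled == [["shard0-part0", "shard1-part0"],
                    ["shard0-part1", "shard1-part1"]]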
def run(self, job_name, filenames):
    sort_mappers = []
    for i in range(len(filenames)):
        filename = filenames[i]
        # One single-shard sort job per input file, reading records in
        # batches via the Files API reader.
        sort_mapper = yield mapper_pipeline.MapperPipeline(
            "%s-shuffle-sort-%s" % (job_name, str(i)),
            __name__ + "._sort_records_map",
            __name__ + "._BatchRecordsReader",
            None,
            {
                "files": [filename],
                "processing_rate": 1000000,
            },
            shards=1)
        sort_mappers.append(sort_mapper)
    # After all sort jobs complete, collect their output files, clean up
    # the intermediates, and return the collected filenames.
    with pipeline.After(*sort_mappers):
        job_ids = yield pipeline_common.Append(
            *[mapper.job_id for mapper in sort_mappers])
        result = yield _CollectOutputFiles(job_ids)
        with pipeline.After(result):
            yield _CleanupOutputFiles(job_ids)
            yield pipeline_common.Return(result)
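# Minimal usage sketch, assuming this run() belongs to a pipeline class
# called here SortChunksPipeline (the class name and filenames below are
# assumptions for illustration). Like any appengine-pipeline stage, it is
# kicked off with start() and can then be tracked via its pipeline_id:
#
#   stage = SortChunksPipeline("my-job", ["/blobstore/chunk-0",
#                                         "/blobstore/chunk-1"])
#   stage.start()
#   root_id = stage.pipeline_id  # e.g. for the pipeline status UI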