def run(self, job_name, reducer_spec, output_writer_spec, params,
        bucket_name, filenames, combiner_spec=None, shards=None):
    """Launch the reduce phase as a MapperPipeline over GCS files.

    Builds the mapper parameters (input-reader config plus the optional
    combiner), defaults the shard count to the number of input files,
    and yields the reduce MapperPipeline driven by _ReducerReader.
    """
    # Inputs arrive as "/<bucket>/<object>" paths; the reader wants bare
    # object names relative to the bucket.
    object_names = util.strip_prefix_from_items(
        "/%s/" % bucket_name, filenames)
    mapper_params = dict(params or {})
    mapper_params["input_reader"] = {
        "bucket_name": bucket_name,
        "objects": object_names,
    }
    if combiner_spec:
        mapper_params["combiner_spec"] = combiner_spec
    # Default: one shard per input file.
    shard_count = len(filenames) if shards is None else shards
    yield mapper_pipeline.MapperPipeline(
        job_name + "-reduce",
        reducer_spec,
        __name__ + "._ReducerReader",
        output_writer_spec,
        mapper_params,
        shards=shard_count)
def run(self, job_name, reducer_spec, output_writer_spec, params,
        bucket_name, filenames, combiner_spec=None, shards=None):
    """Yield the reduce-stage MapperPipeline.

    Strips the bucket prefix from the input paths, merges the
    input-reader configuration (and the combiner spec, when given) into
    a copy of the caller-supplied params, and starts one reduce job.
    Shard count defaults to len(filenames).
    """
    prefix = "/%s/" % bucket_name
    stripped_names = util.strip_prefix_from_items(prefix, filenames)

    # Work on a copy so the caller's params dict is never mutated.
    reduce_params = dict(params) if params else {}
    reduce_params.update({
        "input_reader": {
            "bucket_name": bucket_name,
            "objects": stripped_names,
        },
    })
    if combiner_spec:
        reduce_params["combiner_spec"] = combiner_spec

    if shards is None:
        shards = len(filenames)

    yield mapper_pipeline.MapperPipeline(
        job_name + "-reduce",
        reducer_spec,
        __name__ + "._ReducerReader",
        output_writer_spec,
        reduce_params,
        shards=shards)
def run(self, job_name, bucket_name, filenames):
    """Sort each group of input files, then collect the sorted output.

    Args:
      job_name: base name used to label the per-group sort sub-jobs.
      bucket_name: GCS bucket containing the input files.
      filenames: sequence where filenames[i] is the list of
        "/<bucket>/<object>" paths sorted by sub-job i.

    Yields one single-shard sort MapperPipeline per group, then (after
    all sorts finish) gathers their output files, schedules cleanup of
    the intermediate job output, and returns the collected file lists.
    """
    sort_mappers = []
    # Idiomatic enumerate() instead of range(len(...)); %s already
    # stringifies the index, so the explicit str() was redundant.
    for index, group in enumerate(filenames):
        # The reader expects object names without the bucket prefix.
        object_names = util.strip_prefix_from_items(
            "/%s/" % bucket_name, group)
        sort_mapper = yield mapper_pipeline.MapperPipeline(
            "%s-shuffle-sort-%s" % (job_name, index),
            __name__ + "._sort_records_map",
            __name__ + "._BatchGCSRecordsReader",
            None,
            {
                "input_reader": {
                    "bucket_name": bucket_name,
                    "objects": object_names,
                },
            },
            # Single shard so each group is sorted as one unit.
            shards=1)
        sort_mappers.append(sort_mapper)
    with pipeline.After(*sort_mappers):
        job_ids = yield pipeline_common.Append(
            *[mapper.job_id for mapper in sort_mappers])
        result = yield _CollectOutputFiles(job_ids)
        # Clean up intermediate files only after their output is collected.
        with pipeline.After(result):
            yield _CleanupOutputFiles(job_ids)
        yield pipeline_common.Return(result)
def run(self, job_name, bucket_name, filenames, shards=None):
    """Hash-partition the input files with a single MapperPipeline.

    Feeds the (prefix-stripped) input objects through _hashing_map and
    writes partitioned output back to the same bucket via
    _HashingGCSOutputWriter. Shard count defaults to len(filenames).
    """
    # Drop the "/<bucket>/" prefix the upstream stage attached.
    object_names = util.strip_prefix_from_items(
        "/%s/" % bucket_name, filenames)
    shard_count = len(filenames) if shards is None else shards
    hash_params = {
        "input_reader": {
            "bucket_name": bucket_name,
            "objects": object_names,
        },
        "output_writer": {
            "bucket_name": bucket_name,
        },
    }
    yield mapper_pipeline.MapperPipeline(
        job_name + "-shuffle-hash",
        __name__ + "._hashing_map",
        input_readers.__name__ + "._GoogleCloudStorageRecordInputReader",
        output_writer_spec=__name__ + "._HashingGCSOutputWriter",
        params=hash_params,
        shards=shard_count)
def run(self, job_name, bucket_name, filenames, shards=None):
    """Start the shuffle's hashing stage.

    Reads GCS records from the given files, routes each record through
    _hashing_map, and lets _HashingGCSOutputWriter scatter the output
    into per-hash files in the same bucket. One shard per input file
    unless the caller overrides shards.
    """
    stripped = util.strip_prefix_from_items("/%s/" % bucket_name, filenames)
    if shards is None:
        shards = len(filenames)
    reader_conf = {"bucket_name": bucket_name, "objects": stripped}
    writer_conf = {"bucket_name": bucket_name}
    yield mapper_pipeline.MapperPipeline(
        job_name + "-shuffle-hash",
        __name__ + "._hashing_map",
        input_readers.__name__ + "._GoogleCloudStorageRecordInputReader",
        output_writer_spec=__name__ + "._HashingGCSOutputWriter",
        params={
            "input_reader": reader_conf,
            "output_writer": writer_conf,
        },
        shards=shards)