def run(self, job_name, mapper_spec, reducer_spec, input_reader_spec,
        output_writer_spec=None, mapper_params=None, reducer_params=None,
        shards=None):
  # Chain the three stages: map, shuffle, reduce.
  map_pipeline = yield MapPipeline(job_name,
                                   mapper_spec,
                                   input_reader_spec,
                                   params=mapper_params,
                                   shards=shards)
  shuffler_pipeline = yield ShufflePipeline(job_name, map_pipeline)
  reducer_pipeline = yield ReducePipeline(job_name,
                                          reducer_spec,
                                          output_writer_spec,
                                          reducer_params,
                                          shuffler_pipeline)
  # Delete intermediate map and shuffle files once the reducer has finished,
  # then return the reducer's output.
  with pipeline.After(reducer_pipeline):
    all_temp_files = yield pipeline_common.Extend(
        map_pipeline, shuffler_pipeline)
    yield mapper_pipeline._CleanupPipeline(all_temp_files)
  yield pipeline_common.Return(reducer_pipeline)
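# A minimal usage sketch for the MapreducePipeline.run() above, assuming the
# standard start()/pipeline_id API of the underlying pipeline library. The job
# name, handler paths, reader/writer specs, and parameters are illustrative
# placeholders, not values taken from this file.
def start_word_count_job(blob_key):
  """Kicks off a hypothetical word-count MapReduce over a zipped blob."""
  job = MapreducePipeline(
      "word_count",
      mapper_spec="main.word_count_map",
      reducer_spec="main.word_count_reduce",
      input_reader_spec="mapreduce.input_readers.BlobstoreZipInputReader",
      output_writer_spec="mapreduce.output_writers.BlobstoreOutputWriter",
      mapper_params={"blob_key": blob_key},
      reducer_params={"mime_type": "text/plain"},
      shards=16)
  job.start()
  return job.pipeline_id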
def run(self, job_name, filenames):
  hashed_files = yield _HashPipeline(job_name, filenames)
  sorted_files = yield _SortChunksPipeline(job_name, hashed_files)
  merged_files = yield _MergePipeline(job_name, sorted_files)

  with pipeline.After(merged_files):
    all_temp_files = yield pipeline_common.Extend(
        hashed_files, sorted_files)
    yield mapper_pipeline._CleanupPipeline(all_temp_files)

  yield pipeline_common.Return(merged_files)
def run(self, job_name, shuffler_params, filenames, shards=None):
  if files.shuffler.available():
    # Delegate the entire shuffle to the files shuffler service.
    yield _ShuffleServicePipeline(job_name, filenames)
  else:
    # Fall back to the hash/sort/merge implementation.
    hashed_files = yield _HashGSPipeline(job_name, filenames, shards=shards)
    sorted_files = yield _SortChunksPipeline(job_name, hashed_files)
    temp_files = [hashed_files, sorted_files]

    merged_files = yield _MergeGSPipeline(job_name, sorted_files)

    with pipeline.After(merged_files):
      all_temp_files = yield pipeline_common.Extend(*temp_files)
      yield mapper_pipeline._CleanupPipeline(all_temp_files)

    yield pipeline_common.Return(merged_files)
def run(self, job_name, filenames, shards=None):
  if files.shuffler.available():
    yield _ShuffleServicePipeline(job_name, filenames)
  else:
    hashed_files = yield _HashPipeline(job_name, filenames, shards=shards)
    sorted_files = yield _SortChunksPipeline(job_name, hashed_files)
    temp_files = [hashed_files, sorted_files]

    merged_files = yield _MergePipeline(job_name, sorted_files)

    with pipeline.After(merged_files):
      all_temp_files = yield pipeline_common.Extend(*temp_files)
      yield mapper_pipeline._CleanupPipeline(all_temp_files)

    yield pipeline_common.Return(merged_files)
def run(self, job_name, params, parser_params, shards=8):
  # Crawl stages: extract domains, fetch robots.txt, build fetch sets,
  # fetch pages, extract outlinks, then fetch linked content.
  extract_domain_files = yield _ExactDomainMapreducePipeline(
      job_name, params=params, shard_count=shards)
  robots_files = yield _RobotsFetchPipeline(
      job_name, extract_domain_files, shards)
  fetch_set_buffer_files = yield _FetchSetsBufferPipeline(
      job_name, robots_files)
  fetch_files = yield _FetchPagePipeline(
      job_name, fetch_set_buffer_files, shards)
  outlinks_files = yield _ExtractOutlinksPipeline(
      job_name, fetch_files, parser_params, shards)
  results_files = yield _FetchContentPipeline(
      job_name, outlinks_files, shards)

  temp_files = [extract_domain_files,
                robots_files,
                fetch_set_buffer_files,
                fetch_files]

  # Delete intermediate files once the final stage has produced its results.
  with pipeline.After(results_files):
    all_temp_files = yield pipeline_common.Extend(*temp_files)
    yield mapper_pipeline._CleanupPipeline(all_temp_files)
def run(self, job_name, filenames, shards=None, combine_spec=None):
  hashed_files = yield _HashPipeline(job_name, filenames, shards=shards)
  sorted_files = yield _SortChunksPipeline(job_name, hashed_files)
  temp_files = [hashed_files, sorted_files]

  if combine_spec:
    # Run the optional combiner over the sorted chunks before merging.
    sorted_files = yield _CombinePipeline(
        job_name, sorted_files, combine_spec)
    temp_files.append(sorted_files)

  merged_files = yield _MergePipeline(job_name, sorted_files)

  with pipeline.After(merged_files):
    all_temp_files = yield pipeline_common.Extend(*temp_files)
    yield mapper_pipeline._CleanupPipeline(all_temp_files)

  yield pipeline_common.Return(merged_files)
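# A minimal sketch of what a combine_spec target could look like, assuming the
# documented mapreduce combiner contract: the function receives a key, the
# current batch of values, and any previously combined values, and yields the
# combined results. The word-count-style summing here is illustrative only.
def sum_combiner(key, values, previously_combined_values):
  """Combines partial counts for a key into a single running total."""
  total = sum(int(value) for value in values)
  total += sum(int(value) for value in previously_combined_values)
  yield str(total)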
def testCleanup_ListOfLists(self):
  """Tests cleaning up a list of file lists."""
  # Prepare test data
  entity_count = 200

  for i in range(entity_count):
    TestEntity(data=str(i)).put()
    TestEntity(data=str(i)).put()

  # Run map
  p = mapper_pipeline.MapperPipeline(
      "test",
      handler_spec=__name__ + ".test_map",
      input_reader_spec=input_readers.__name__ + ".DatastoreInputReader",
      output_writer_spec=
          output_writers.__name__ + ".KeyValueBlobstoreOutputWriter",
      params={
          "input_reader": {
              "entity_kind": __name__ + ".TestEntity",
          },
      },
  )
  p.start()
  test_support.execute_until_empty(self.taskqueue)

  finished_map = mapper_pipeline.MapperPipeline.from_id(p.pipeline_id)

  # Can open files
  file_list = finished_map.outputs.default.value
  self.assertTrue(len(file_list) > 0)
  for name in file_list:
    files.open(name, "r").read(0)

  grouped_list = [file_list]

  # Cleanup
  cleanup = mapper_pipeline._CleanupPipeline(grouped_list)
  cleanup.start()
  test_support.execute_until_empty(self.taskqueue)

  # Cannot open files
  for name in file_list:
    self.assertRaises(files.Error, files.open, name, "r")
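# Hypothetical sketch of the test_map handler referenced via handler_spec in
# the test above; the real handler lives elsewhere in the test module. It
# assumes the KeyValue output writer consumes (key, value) pairs yielded by
# the mapper.
def test_map(entity):
  """Emits each TestEntity's data as a key with an empty value."""
  yield (entity.data, "")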