def run(self,
        job_name,
        reducer_spec,
        output_writer_spec,
        params,
        bucket_name,
        filenames,
        combiner_spec=None,
        shards=None):
    filenames_only = (
        util.strip_prefix_from_items("/%s/" % bucket_name, filenames))
    new_params = dict(params or {})
    new_params.update({
        "input_reader": {
            "bucket_name": bucket_name,
            "objects": filenames_only,
        }
    })
    if combiner_spec:
        new_params.update({
            "combiner_spec": combiner_spec,
        })
    # TODO(user): Test this
    if shards is None:
        shards = len(filenames)
    yield mapper_pipeline.MapperPipeline(
        job_name + "-reduce",
        reducer_spec,
        __name__ + "._ReducerReader",
        output_writer_spec,
        new_params,
        shards=shards)
def testShardRetry(self):
    entity_count = 200
    db.delete(TestOutputEntity.all())
    db.delete(RetryCount.all())

    for i in range(entity_count):
        TestEntity(data=str(i)).put()

    p = mapper_pipeline.MapperPipeline(
        "test",
        handler_spec=__name__ + ".test_shard_retry_map",
        input_reader_spec=input_readers.__name__ + ".DatastoreInputReader",
        params={
            "input_reader": {
                "entity_kind": __name__ + "." + TestEntity.__name__,
            },
        },
        shards=5)
    p.start()
    test_support.execute_until_empty(self.taskqueue)

    self.assertEquals(1, len(self.emails))
    self.assertTrue(self.emails[0][1].startswith("Pipeline successful:"))

    p = mapper_pipeline.MapperPipeline.from_id(p.pipeline_id)
    outputs = []
    for output in TestOutputEntity.all():
        outputs.append(int(output.data))
    outputs.sort()

    expected_outputs = [i for i in range(entity_count)]
    expected_outputs.sort()
    self.assertEquals(expected_outputs, outputs)
def testProcessEntities(self):
    """Test empty mapper over non-empty dataset."""
    for _ in range(100):
        TestEntity().put()

    p = mapper_pipeline.MapperPipeline(
        "empty_map",
        handler_spec=__name__ + ".test_empty_handler",
        input_reader_spec=input_readers.__name__ + ".DatastoreInputReader",
        params={
            "entity_kind": __name__ + ".TestEntity",
        },
    )
    p.start()
    test_support.execute_until_empty(self.taskqueue)

    self.assertEquals(1, len(self.emails))
    self.assertTrue(self.emails[0][1].startswith("Pipeline successful:"))

    p = mapper_pipeline.MapperPipeline.from_id(p.pipeline_id)
    self.assertTrue(p.outputs.job_id.value)

    counters = p.outputs.counters.value
    self.assertTrue(counters)
    self.assertTrue(context.COUNTER_MAPPER_WALLTIME_MS in counters)
    self.assertEquals(100, counters[context.COUNTER_MAPPER_CALLS])
def testShardRetryTooMany(self):
    entity_count = 200
    db.delete(TestOutputEntity.all())
    db.delete(RetryCount.all())

    for i in range(entity_count):
        TestEntity(data=str(i)).put()

    p = mapper_pipeline.MapperPipeline(
        "test",
        handler_spec=__name__ + ".test_shard_retry_too_many_map",
        input_reader_spec=input_readers.__name__ + ".DatastoreInputReader",
        params={
            "input_reader": {
                "entity_kind": __name__ + "." + TestEntity.__name__,
            },
        },
        shards=5)
    p.max_attempts = 1
    p.start()
    test_support.execute_until_empty(self.taskqueue)

    state = model.MapreduceState.all().get()
    self.assertEqual(model.MapreduceState.RESULT_FAILED, state.result_status)

    self.assertEquals(1, len(self.emails))
    self.assertTrue(self.emails[0][1].startswith("Pipeline aborted:"))
def run(self):
    yield mapper_pipeline.MapperPipeline(
        'clean_up_old_exports',
        pipelines.FullName(CleanUpOldExportsMap),
        'mapreduce.input_readers.DatastoreInputReader',
        params={'entity_kind': pipelines.FullName(ExportRatingsResult)},
        shards=pipelines.DEFAULT_SHARDS)
def testFailedMap(self):
    for i in range(1):
        TestEntity(data=str(i)).put()

    pipeline.pipeline._DEFAULT_MAX_ATTEMPTS = 1

    p = mapper_pipeline.MapperPipeline(
        "test",
        handler_spec=__name__ + ".test_fail_map",
        input_reader_spec=input_readers.__name__ + ".DatastoreInputReader",
        params={
            "input_reader": {
                "entity_kind": __name__ + "." + TestEntity.__name__,
            },
        },
        shards=5)
    p.start()
    test_support.execute_until_empty(self.taskqueue)

    p = mapper_pipeline.MapperPipeline.from_id(p.pipeline_id)
    self.assertTrue(p.was_aborted)

    self.assertTrue(p.outputs.job_id.filled)
    state = model.MapreduceState.get_by_job_id(p.outputs.job_id.value)
    self.assertEqual(model.MapreduceState.RESULT_FAILED, state.result_status)
    self.assertFalse(p.outputs.result_status.filled)
    self.assertFalse(p.outputs.default.filled)

    self.assertEquals(1, len(self.emails))
    self.assertTrue(self.emails[0][1].startswith("Pipeline aborted:"))
def run(self, fbl_json, filters):
    bucket_name = 'dancedeets-hrd.appspot.com'
    params = {
        'entity_kind': 'events.eventdata.DBEvent',
        'filters': filters,
        'handle_batch_size': 20,
        'output_writer': {
            'bucket_name': bucket_name,
            'content_type': 'text/plain',
        }
    }
    params.update(fbl_json)

    # This should use cache, so we can go faster
    find_events_needing_access_tokens = (
        yield mapper_pipeline.MapperPipeline(
            'Find valid events needing access_tokens',
            'events.find_access_tokens.map_events_needing_access_tokens',
            'mapreduce.input_readers.DatastoreInputReader',
            'mapreduce.output_writers.GoogleCloudStorageOutputWriter',
            params=params,
            shards=10,
        ))

    # This will be a single shard
    single_file_of_fb_events = yield CombinerPipeline(
        "Combine event lists into single file",
        bucket_name,
        find_events_needing_access_tokens)

    # This will use more shards
    yield PassFileToAccessTokenFinder(single_file_of_fb_events)
def run(self,
        job_name,
        reducer_spec,
        output_writer_spec,
        params,
        filenames,
        combiner_spec=None,
        shards=None):
    new_params = dict(params or {})
    new_params.update({"files": filenames})
    if combiner_spec:
        new_params.update({
            "combiner_spec": combiner_spec,
        })
    # TODO(user): Test this
    if shards is None:
        shards = len(filenames)
    yield mapper_pipeline.MapperPipeline(
        job_name + "-reduce",
        reducer_spec,
        __name__ + "._ReducerReader",
        output_writer_spec,
        new_params,
        shards=shards)
def run(self, job_name, reducer_spec, output_writer_spec, params, filenames):
    new_params = dict(params or {})
    new_params.update({"files": filenames})
    yield mapper_pipeline.MapperPipeline(
        job_name + "-reduce",
        reducer_spec,
        shuffler.__name__ + "._MergingReader",
        output_writer_spec,
        new_params)
def run(self, job_name, filenames): yield mapper_pipeline.MapperPipeline( job_name + "-shuffle-hash", __name__ + "._hashing_map", input_readers.__name__ + ".RecordsReader", output_writer_spec= __name__ + "._HashingBlobstoreOutputWriter", params={'files': filenames}, shards=len(filenames))
def run(self, job_name, filenames): yield mapper_pipeline.MapperPipeline( job_name + "-shuffle-merge", __name__ + "._merge_map", __name__ + "._MergingReader", output_writer_spec= output_writers.__name__ + ".BlobstoreRecordsOutputWriter", params={'files': filenames}, shards=len(filenames))
def run(self, namespace, job_name, sequence_num, cleanup_params, job_runner_args):
    self._started(namespace, job_name, sequence_num)
    yield mapper_pipeline.MapperPipeline(
        job_name=job_name,
        handler_spec=(
            'modules.analytics.filters.PreCleanMapReduceJobPipeline.map'),
        input_reader_spec='mapreduce.input_readers.DatastoreInputReader',
        params=cleanup_params)
    yield jobs.MapReduceJobRunner(**job_runner_args)
def run(self, name, entity_type, map_fn):
    yield mapper_pipeline.MapperPipeline(
        name,
        map_fn,
        'mapreduce.input_readers.DatastoreInputReader',
        params={
            'entity_kind': entity_type,
            'start_datetime': SerializeDatetime(datetime.now()),
        },
        shards=DEFAULT_SHARDS)
def CreateCleanupPipeline(model_class, start_datetime):
    return mapper_pipeline.MapperPipeline(
        'cleanup',
        FullName(CleanupMap),
        'mapreduce.input_readers.DatastoreInputReader',
        params={
            'entity_kind': FullName(model_class),
            'start_datetime': SerializeDatetime(start_datetime)
        },
        shards=DEFAULT_SHARDS)
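# Usage sketch (an assumption, not part of the original source): CreateCleanupPipeline
# only builds the MapperPipeline, so a caller would start it and keep the pipeline id
# for later status lookups. `MyModel` is a hypothetical datastore model class; the
# start_datetime argument is simply serialized into the mapper params as above.
import datetime

def StartCleanupForModel():
    cleanup = CreateCleanupPipeline(MyModel, datetime.datetime.now())
    cleanup.start()
    return cleanup.pipeline_id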
def run(self, blob_keys, blob_sizes, shards):
    yield mapper_pipeline.MapperPipeline(
        "import_data_mapper",
        "pipeline.insert_data",
        "mapreduce.input_readers.BlobstoreLineInputReader",
        params={
            "blob_keys": blob_keys,
            "blob_sizes": blob_sizes,
        },
        shards=shards)
def run(self, job_name, filenames): yield mapper_pipeline.MapperPipeline( job_name + "-shuffle-merge", __name__ + "._merge_map", __name__ + "._MergingReader", output_writer_spec= output_writers.__name__ + ".BlobstoreRecordsOutputWriter", params={ _MergingReader.FILES_PARAM: filenames, _MergingReader.MAX_VALUES_COUNT_PARAM: self._MAX_VALUES_COUNT, _MergingReader.MAX_VALUES_SIZE_PARAM: self._MAX_VALUES_SIZE, }, shards=len(filenames))
def run(self, filenames):
    mapper = yield mapper_pipeline.MapperPipeline(
        "sort",
        __name__ + "._sort_records",
        __name__ + "._BatchRecordsReader",
        None,
        {
            "files": filenames,
            "processing_rate": 1000000,
        },
        shards=1)
    # TODO(user): delete _OutputFile entities after collect
    with pipeline.After(mapper):
        yield _CollectOutputFiles(mapper.job_id)
def run(self, job_name, bucket_name, filenames):
    yield mapper_pipeline.MapperPipeline(
        job_name + "-shuffle-merge",
        __name__ + "._merge_map",
        __name__ + "._MergingReader",
        output_writer_spec=(
            output_writers.__name__ + "._GoogleCloudStorageRecordOutputWriter"),
        params={
            _MergingReader.FILES_PARAM: filenames,
            _MergingReader.MAX_VALUES_COUNT_PARAM: self._MAX_VALUES_COUNT,
            _MergingReader.MAX_VALUES_SIZE_PARAM: self._MAX_VALUES_SIZE,
            "output_writer": {
                "bucket_name": bucket_name,
            },
        },
        shards=len(filenames))
def DjangoModelMap(model,
                   mapper_func,
                   keys_only=False,
                   output_writer="mapreduce.output_writers.BlobstoreOutputWriter",
                   params=None):
    """A simple wrapper for running a mapper function over Django model instances.

    Args:
        model: A Django model class.
        mapper_func: A top-level function that takes a single model instance
            (or key, if keys_only is True) and yields zero or more two-tuples
            of strings.
        keys_only: Selects which input reader to use. If True, uses
            'mapreduce.input_readers.DatastoreKeyInputReader'; if False (the
            default), uses
            'djangoappengine.mapreduce.input_readers.DjangoModelInputReader'.
        output_writer: Optional output writer spec; defaults to
            'mapreduce.output_writers.BlobstoreOutputWriter'.
        params: An optional dictionary of values to pass to the mapper.
    """
    if keys_only:
        input_reader_spec = "mapreduce.input_readers.DatastoreKeyInputReader"
        mapper_params = {
            "entity_kind": model._meta.db_table,
            "mime_type": "text/plain"
        }
    else:
        input_reader_spec = (
            "djangoappengine.mapreduce.input_readers.DjangoModelInputReader")
        mapper_params = {
            "entity_kind": _convert_model_to_string(model),
            "mime_type": "text/plain"
        }

    if params:
        mapper_params.update(params)

    mapper_spec = _convert_func_to_string(mapper_func)

    return mapper_pipeline.MapperPipeline(
        "%s-%s-mapper" % (model._meta.object_name, mapper_spec),
        mapper_spec,
        input_reader_spec,
        output_writer_spec=output_writer,
        params=mapper_params)
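# Usage sketch (illustrative, not from the original source): how DjangoModelMap
# might be called for a hypothetical Django model `GuestbookEntry` with a
# hypothetical top-level mapper `count_entry` that yields (key, value) string
# two-tuples. Both names are assumptions made for this example.
def start_guestbook_mapper():
    p = DjangoModelMap(
        GuestbookEntry,
        count_entry,
        keys_only=False,
        params={"mime_type": "text/plain"})
    p.start()
    return p.pipeline_id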
def testCleanup_ListOfLists(self):
    """Tests cleaning up a list of file lists."""
    # Prepare test data
    entity_count = 200

    for i in range(entity_count):
        TestEntity(data=str(i)).put()
        TestEntity(data=str(i)).put()

    # Run map
    p = mapper_pipeline.MapperPipeline(
        "test",
        handler_spec=__name__ + ".test_map",
        input_reader_spec=input_readers.__name__ + ".DatastoreInputReader",
        output_writer_spec=(
            output_writers.__name__ + ".KeyValueBlobstoreOutputWriter"),
        params={
            "input_reader": {
                "entity_kind": __name__ + ".TestEntity",
            },
        },
    )
    p.start()
    test_support.execute_until_empty(self.taskqueue)

    finished_map = mapper_pipeline.MapperPipeline.from_id(p.pipeline_id)

    # Can open files
    file_list = finished_map.outputs.default.value
    self.assertTrue(len(file_list) > 0)
    for name in file_list:
        files.open(name, "r").read(0)

    grouped_list = [file_list]

    # Cleanup
    cleanup = mapper_pipeline._CleanupPipeline(grouped_list)
    cleanup.start()
    test_support.execute_until_empty(self.taskqueue)

    # Cannot open files
    for name in file_list:
        self.assertRaises(files.Error, files.open, name, "r")
def run(self, job_name, bucket_name, filenames):
    filenames_only = (
        util.strip_prefix_from_items("/%s/" % bucket_name, filenames))
    params = {
        "output_writer": {
            "bucket_name": bucket_name,
            "content_type": "text/plain",
        },
        "input_reader": {
            "bucket_name": bucket_name,
            "objects": filenames_only,
        }
    }
    yield mapper_pipeline.MapperPipeline(
        job_name + "-combine",
        'events.find_access_tokens.file_identity',
        'mapreduce.input_readers.GoogleCloudStorageInputReader',
        'mapreduce.output_writers.GoogleCloudStorageOutputWriter',
        params,
        shards=1)
def run(self, job_name, bucket_name, filenames, shards=None):
    filenames_only = (
        util.strip_prefix_from_items("/%s/" % bucket_name, filenames))
    if shards is None:
        shards = len(filenames)
    yield mapper_pipeline.MapperPipeline(
        job_name + "-shuffle-hash",
        __name__ + "._hashing_map",
        input_readers.__name__ + "._GoogleCloudStorageRecordInputReader",
        output_writer_spec=__name__ + "._HashingGCSOutputWriter",
        params={
            "input_reader": {
                "bucket_name": bucket_name,
                "objects": filenames_only,
            },
            "output_writer": {
                "bucket_name": bucket_name,
            },
        },
        shards=shards)
def run(self, job_name, filenames):
    sort_mappers = []
    for i in range(len(filenames)):
        filename = filenames[i]
        sort_mapper = yield mapper_pipeline.MapperPipeline(
            "%s-shuffle-sort-%s" % (job_name, str(i)),
            __name__ + "._sort_records_map",
            __name__ + "._BatchRecordsReader",
            None,
            {
                "files": [filename],
                "processing_rate": 1000000,
            },
            shards=1)
        sort_mappers.append(sort_mapper)
    with pipeline.After(*sort_mappers):
        job_ids = yield pipeline_common.Append(
            *[mapper.job_id for mapper in sort_mappers])
        result = yield _CollectOutputFiles(job_ids)
        with pipeline.After(result):
            yield _CleanupOutputFiles(job_ids)
        yield pipeline_common.Return(result)
def run(self,
        job_name,
        reducer_spec,
        output_writer_spec,
        params,
        filenames,
        combiner_spec=None):
    new_params = dict(params or {})
    new_params.update({
        "files": filenames
    })
    if combiner_spec:
        new_params.update({
            "combiner_spec": combiner_spec,
        })
    yield mapper_pipeline.MapperPipeline(
        job_name + "-reduce",
        reducer_spec,
        __name__ + "._ReducerReader",
        output_writer_spec,
        new_params)
def run(self, job_name, bucket_name, filenames):
    sort_mappers = []
    for i in range(len(filenames)):
        filenames_only = util.strip_prefix_from_items(
            "/%s/" % bucket_name, filenames[i])
        sort_mapper = yield mapper_pipeline.MapperPipeline(
            "%s-shuffle-sort-%s" % (job_name, str(i)),
            __name__ + "._sort_records_map",
            __name__ + "._BatchGCSRecordsReader",
            None,
            {
                "input_reader": {
                    "bucket_name": bucket_name,
                    "objects": filenames_only,
                },
            },
            shards=1)
        sort_mappers.append(sort_mapper)
    with pipeline.After(*sort_mappers):
        job_ids = yield pipeline_common.Append(
            *[mapper.job_id for mapper in sort_mappers])
        result = yield _CollectOutputFiles(job_ids)
        with pipeline.After(result):
            yield _CleanupOutputFiles(job_ids)
        yield pipeline_common.Return(result)
def testEmptyMapper(self):
    """Test empty mapper over empty dataset."""
    p = mapper_pipeline.MapperPipeline(
        "empty_map",
        handler_spec=__name__ + ".test_empty_handler",
        input_reader_spec=input_readers.__name__ + ".DatastoreInputReader",
        params={
            "input_reader": {
                "entity_kind": __name__ + ".TestEntity",
                # Test datetime can be json serialized.
                "filters": [("dt", "=", datetime.datetime(2000, 1, 1))],
            },
        },
    )
    p.start()
    test_support.execute_until_empty(self.taskqueue)

    self.assertEquals(1, len(self.emails))
    self.assertTrue(self.emails[0][1].startswith("Pipeline successful:"))

    p = mapper_pipeline.MapperPipeline.from_id(p.pipeline_id)

    # Verify outputs.
    # Counter output.
    counters = p.outputs.counters.value
    self.assertTrue(counters)
    self.assertTrue(context.COUNTER_MAPPER_WALLTIME_MS in counters)
    # Default output.
    self.assertEqual([], p.outputs.default.value)
    # Job id output.
    self.assertTrue(p.outputs.job_id.filled)
    state = model.MapreduceState.get_by_job_id(p.outputs.job_id.value)
    self.assertEqual(model.MapreduceState.RESULT_SUCCESS, state.result_status)
    # Result status output.
    self.assertEqual(model.MapreduceState.RESULT_SUCCESS,
                     p.outputs.result_status.value)