def test_split_input(self):
  SHARD_COUNT = 10
  BATCH_SIZE = 2
  mapper_spec = model.MapperSpec(
      "FooHandler",
      "mapreduce_utils.DatastoreQueryInputReader",
      {
          "input_reader": {
              "entity_kind": self.TEST_ENTITY_IMPORT_PATH,
              "batch_size": BATCH_SIZE,
          }
      },
      SHARD_COUNT)

  def num_expected():
    batch_size = min(len(self.dataSet), BATCH_SIZE)
    free_division = abs(len(self.dataSet) / batch_size)
    return min(free_division, SHARD_COUNT)

  ds_input_readers = DatastoreQueryInputReader.split_input(mapper_spec)
  self.assertEqual(3, num_expected())  # 1-3, 3-5, 5-None
  self.assertEqual(3, len(ds_input_readers))

  # batch_size = the bigger half of dataSet
  BATCH_SIZE = int(math.ceil(len(self.dataSet) / 2.0))
  mapper_spec = model.MapperSpec(
      "FooHandler",
      "mapreduce_utils.DatastoreQueryInputReader",
      {
          "input_reader": {
              "entity_kind": self.TEST_ENTITY_IMPORT_PATH,
              "batch_size": BATCH_SIZE,
          }
      },
      SHARD_COUNT)
  ds_input_readers = DatastoreQueryInputReader.split_input(mapper_spec)
  self.assertEqual(2, num_expected())  # 1-4, 4-None
  self.assertEqual(2, len(ds_input_readers))

  # batch_size > dataSet itself
  BATCH_SIZE = len(self.dataSet) * 2
  mapper_spec = model.MapperSpec(
      "FooHandler",
      "mapreduce_utils.DatastoreQueryInputReader",
      {
          "input_reader": {
              "entity_kind": self.TEST_ENTITY_IMPORT_PATH,
              "batch_size": BATCH_SIZE,
          }
      },
      SHARD_COUNT)
  ds_input_readers = DatastoreQueryInputReader.split_input(mapper_spec)
  self.assertEqual(1, num_expected())  # 1-None
  self.assertEqual(1, len(ds_input_readers))
def testToJson(self):
  mapper_spec = model.MapperSpec(self.TEST_HANDLER,
                                 self.TEST_READER,
                                 {"entity_kind": self.ENTITY_KIND},
                                 8)
  self.assertEquals(self.default_json, mapper_spec.to_json())

  mapper_spec = model.MapperSpec(self.TEST_HANDLER,
                                 self.TEST_READER,
                                 {"entity_kind": self.ENTITY_KIND},
                                 8,
                                 output_writer_spec=self.TEST_WRITER)
  d = dict(self.default_json)
  d["mapper_output_writer"] = self.TEST_WRITER
  self.assertEquals(d, mapper_spec.to_json())
def handle(self):
  """Handles start request."""
  # Mapper spec as form arguments.
  mapreduce_name = self._get_required_param("name")
  mapper_input_reader_spec = self._get_required_param("mapper_input_reader")
  mapper_handler_spec = self._get_required_param("mapper_handler")
  mapper_output_writer_spec = self.request.get("mapper_output_writer")
  mapper_params = self._get_params("mapper_params_validator",
                                   "mapper_params.")
  params = self._get_params("params_validator", "params.")

  # Set some mapper param defaults if not present.
  mapper_params["processing_rate"] = int(
      mapper_params.get("processing_rate") or
      model._DEFAULT_PROCESSING_RATE_PER_SEC)
  queue_name = mapper_params["queue_name"] = mapper_params.get(
      "queue_name", "default")

  # Validate the Mapper spec, handler, and input reader.
  mapper_spec = model.MapperSpec(
      mapper_handler_spec,
      mapper_input_reader_spec,
      mapper_params,
      int(mapper_params.get("shard_count", model._DEFAULT_SHARD_COUNT)),
      output_writer_spec=mapper_output_writer_spec)

  mapreduce_id = type(self)._start_map(
      mapreduce_name,
      mapper_spec,
      params,
      base_path=self.base_path(),
      queue_name=queue_name,
      _app=mapper_params.get("_app"))
  self.json_response["mapreduce_id"] = mapreduce_id
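# A sketch of the form-argument convention handle() above relies on, inferred
# from the _get_params prefixes rather than from any authoritative request
# format: POST fields prefixed with "mapper_params." land in mapper_params
# with the prefix stripped, and "params." fields land in params. The field
# values below are illustrative only.
#
#   name=word_count
#   mapper_handler=myapp.mappers.count_words
#   mapper_input_reader=mapreduce.input_readers.DatastoreInputReader
#   mapper_params.entity_kind=myapp.models.Document
#   mapper_params.shard_count=16
#   mapper_params.processing_rate=100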
def test_with_filter_factory(self):
  SHARD_COUNT = 10
  FF_PATH = ("test_mapreduce_utils.DatastoreQueryInputReaderTest."
             "simple_parametrized_filter_factory")
  params = {
      "input_reader": {
          "entity_kind": self.TEST_ENTITY_IMPORT_PATH,
          "filter_factory_spec": {
              "name": FF_PATH,
              "args": ["B"],
          }
      }
  }
  mapper_spec = model.MapperSpec(
      "FooHandler",
      "mapreduce_utils.DatastoreQueryInputReader",
      params,
      SHARD_COUNT)
  ds_input_readers = DatastoreQueryInputReader.split_input(mapper_spec)
  got = reduce(operator.add, (list(reader) for reader in ds_input_readers))
  self.assertEqual(2, len(got))
  data1, data2 = filter(lambda i: i['type'] == "B", self.dataSet)
  got.sort(key=lambda i: i.name)
  self.assertDictEqual(data1, db.to_dict(got.pop(0)))
  self.assertDictEqual(data2, db.to_dict(got.pop(0)))
def testGeneratorWithKeyRange(self):
  """Test DjangoModelInputReader as generator using KeyRanges."""
  expected_entities = []
  for i in range(0, 100):
    entity = TestModel(test_property=i)
    entity.save()
    expected_entities.append(entity)

  params = {
      "entity_kind": ENTITY_KIND,
  }
  mapper_spec = model.MapperSpec(
      "FooHandler",
      "djangoappengine.mapreduce.input_readers.DjangoModelInputReader",
      params, 1)
  input_ranges = DjangoModelInputReader.split_input(mapper_spec)

  entities = []
  for query_range in input_ranges:
    for entity in query_range:
      entities.append(entity)

  self.assertEquals(100, len(entities))
  self.assertEquals(expected_entities, entities)
def start_map(name,
              handler_spec,
              reader_spec,
              mapper_parameters,
              shard_count=_DEFAULT_SHARD_COUNT,
              output_writer_spec=None,
              mapreduce_parameters=None,
              base_path=base_handler._DEFAULT_BASE_PATH,
              queue_name="default",
              eta=None,
              countdown=None,
              hooks_class_name=None,
              _app=None,
              transactional=False):
  """Start a new, mapper-only mapreduce.

  Args:
    name: mapreduce name. Used only for display purposes.
    handler_spec: fully qualified name of mapper handler function/class to
      call.
    reader_spec: fully qualified name of mapper reader to use.
    mapper_parameters: dictionary of parameters to pass to mapper. These are
      mapper-specific and also used for reader initialization.
    shard_count: number of shards to create.
    output_writer_spec: fully qualified name of the output writer to use.
    mapreduce_parameters: dictionary of mapreduce parameters relevant to the
      whole job.
    base_path: base path of mapreduce library handler specified in app.yaml.
      "/mapreduce" by default.
    queue_name: executor queue name to be used for mapreduce tasks.
    eta: Absolute time when the MR should execute. May not be specified if
      'countdown' is also supplied. This may be timezone-aware or
      timezone-naive.
    countdown: Time in seconds into the future that this MR should execute.
      Defaults to zero.
    hooks_class_name: fully qualified name of a hooks.Hooks subclass.
    transactional: Specifies if job should be started as a part of an already
      opened transaction.

  Returns:
    mapreduce id as string.
  """
  if not shard_count:
    shard_count = _DEFAULT_SHARD_COUNT
  mapper_spec = model.MapperSpec(handler_spec,
                                 reader_spec,
                                 mapper_parameters,
                                 shard_count,
                                 output_writer_spec=output_writer_spec)

  return handlers.StartJobHandler._start_map(
      name,
      mapper_spec,
      mapreduce_parameters or {},
      base_path=base_path,
      queue_name=queue_name,
      eta=eta,
      countdown=countdown,
      hooks_class_name=hooks_class_name,
      _app=_app,
      transactional=transactional)
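# A minimal usage sketch for start_map above. The handler and reader specs
# must be fully qualified import paths; "myapp.mappers.process_entity" and
# "myapp.models.MyEntity" are hypothetical names, not part of the library.
def _example_start_map():
  return start_map(
      name="process-my-entities",
      handler_spec="myapp.mappers.process_entity",
      reader_spec="mapreduce.input_readers.DatastoreInputReader",
      mapper_parameters={"entity_kind": "myapp.models.MyEntity"},
      shard_count=8)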
def testValidate_NoEntityFails(self):
  """Test validate function raises exception with no entity parameter."""
  params = {}
  mapper_spec = model.MapperSpec(
      "FooHandler",
      "djangoappengine.mapreduce.input_readers.DjangoModelInputReader",
      params, 1)
  self.assertRaises(input_readers.BadReaderParamsError,
                    DjangoModelInputReader.validate,
                    mapper_spec)
def testValidate_Passes(self):
  """Test validate function accepts valid parameters."""
  params = {
      "entity_kind": ENTITY_KIND,
  }
  mapper_spec = model.MapperSpec(
      "FooHandler",
      "djangoappengine.mapreduce.input_readers.DjangoModelInputReader",
      params, 1)
  DjangoModelInputReader.validate(mapper_spec)
def create_mapper_spec(self,
                       output_writer_spec=BLOBSTORE_WRITER_NAME,
                       params=None):
  params = params or {}
  mapper_spec = model.MapperSpec(
      "FooHandler",
      "mapreduce.input_readers.DatastoreInputReader",
      params,
      10,
      output_writer_spec=output_writer_spec)
  return mapper_spec
def testGetTaskHeaders(self):
  mr_spec = model.MapreduceSpec(
      name="foo",
      mapreduce_id="foo_id",
      mapper_spec=model.MapperSpec("foo", "foo", {}, 8).to_json())
  task = taskqueue.Task(
      url="/relative_url",
      headers=util._get_task_headers(mr_spec.mapreduce_id))

  self.assertEqual("foo_id", task.headers[util._MR_ID_TASK_HEADER])
  self.assertEqual("v7.foo-module.foo.appspot.com", task.headers["Host"])
  self.assertEqual("v7.foo-module", task.target)
def testValidate_BadEntityKind(self):
  """Test validate function with bad entity kind."""
  params = {
      "entity_kind": "foo",
  }
  mapper_spec = model.MapperSpec(
      "FooHandler",
      "djangoappengine.mapreduce.input_readers.DjangoModelInputReader",
      params, 1)
  self.assertRaises(input_readers.BadReaderParamsError,
                    DjangoModelInputReader.validate,
                    mapper_spec)
def _get_mapper_spec(self):
  """Converts self to model.MapperSpec."""
  # pylint: disable=g-import-not-at-top
  from mapreduce import model

  return model.MapperSpec(
      handler_spec=util._obj_to_path(self.mapper),
      input_reader_spec=util._obj_to_path(self.input_reader_cls),
      params=self._get_mapper_params(),
      shard_count=self.shard_count,
      output_writer_spec=util._obj_to_path(self.output_writer_cls))
def testValidate_BadNamespace(self):
  """Test validate function with bad namespace."""
  params = {
      "entity_kind": ENTITY_KIND,
      "namespace": 'namespace',
  }
  mapper_spec = model.MapperSpec(
      "FooHandler",
      "djangoappengine.mapreduce.input_readers.DjangoModelInputReader",
      params, 1)
  self.assertRaises(input_readers.BadReaderParamsError,
                    DjangoModelInputReader.validate,
                    mapper_spec)
def create_mapper_spec(self, output_params=None):
  """Create a Mapper specification using the GoogleCloudStorageOutputWriter.

  The specification generated uses a dummy handler and input reader. The
  number of shards is 10 (some number greater than 1).

  Args:
    output_params: parameters for the output writer.

  Returns:
    a model.MapperSpec with default settings and specified output_params.
  """
  return model.MapperSpec(
      "DummyHandler",
      "DummyInputReader",
      {"output_writer": output_params or {}},
      self.NUM_SHARDS,
      output_writer_spec=self.WRITER_NAME)
def test_world(self):
  SHARD_COUNT = 10
  mapper_spec = model.MapperSpec(
      "FooHandler",
      "mapreduce_utils.DatastoreQueryInputReader",
      {
          "input_reader": {
              "entity_kind": self.TEST_ENTITY_IMPORT_PATH,
          }
      },
      SHARD_COUNT)
  ds_input_readers = DatastoreQueryInputReader.split_input(mapper_spec)
  got = reduce(operator.add, (list(reader) for reader in ds_input_readers))
  self.assertEqual(len(self.dataSet), len(got))
def start_map(name,
              handler_spec,
              reader_spec,
              reader_parameters,
              shard_count,
              mapreduce_parameters=None,
              base_path="/mapreduce",
              queue_name="default",
              eta=None,
              countdown=None,
              _app=None):
  """Start a new, mapper-only mapreduce.

  Args:
    name: mapreduce name. Used only for display purposes.
    handler_spec: fully qualified name of mapper handler function/class to
      call.
    reader_spec: fully qualified name of mapper reader to use.
    reader_parameters: dictionary of parameters to pass to reader. These are
      reader-specific.
    shard_count: number of shards to create.
    mapreduce_parameters: dictionary of mapreduce parameters relevant to the
      whole job.
    base_path: base path of mapreduce library handler specified in app.yaml.
      "/mapreduce" by default.
    queue_name: executor queue name to be used for mapreduce tasks.
    eta: Absolute time when the MR should execute. May not be specified if
      'countdown' is also supplied. This may be timezone-aware or
      timezone-naive.
    countdown: Time in seconds into the future that this MR should execute.
      Defaults to zero.

  Returns:
    mapreduce id as string.
  """
  mapper_spec = model.MapperSpec(handler_spec,
                                 reader_spec,
                                 reader_parameters,
                                 shard_count)

  # mapreduce_parameters defaults to None rather than a mutable {} so the
  # default dictionary is not shared across calls.
  return handlers.StartJobHandler._start_map(name,
                                             mapper_spec,
                                             mapreduce_parameters or {},
                                             base_path=base_path,
                                             queue_name=queue_name,
                                             eta=eta,
                                             countdown=countdown,
                                             _app=_app)
def test_with_query_filters(self):
  SHARD_COUNT = 10
  mapper_spec = model.MapperSpec(
      "FooHandler",
      "mapreduce_utils.DatastoreQueryInputReader",
      {
          "input_reader": {
              "entity_kind": self.TEST_ENTITY_IMPORT_PATH,
              "filters": [("type", "=", "C")],
          }
      },
      SHARD_COUNT)
  ds_input_readers = DatastoreQueryInputReader.split_input(mapper_spec)
  got = reduce(operator.add, (list(reader) for reader in ds_input_readers))
  self.assertEqual(3, len(got))
  data1, data2, data3 = filter(lambda i: i['type'] == "C", self.dataSet)
  got.sort(key=lambda i: i.name)
  self.assertDictEqual(data1, db.to_dict(got.pop(0)))
  self.assertDictEqual(data2, db.to_dict(got.pop(0)))
  self.assertDictEqual(data3, db.to_dict(got.pop(0)))
def testFindAllByMapreduceState(self):
  mr_state = model.MapreduceState.create_new("mapreduce-id")
  mr_state.mapreduce_spec = model.MapreduceSpec(
      "mapreduce",
      "mapreduce-id",
      model.MapperSpec("handler", "input-reader", {},
                       shard_count=304).to_json())
  mr_state.put()
  for i in range(304):
    model.ShardState.create_new("mapreduce-id", i).put()

  @db.transactional(xg=False)
  def non_xg_tx():
    # Open a single non-related entity group to ensure
    # find_all_by_mapreduce_state does not attempt to use outer transaction.
    mr_state2 = model.MapreduceState.create_new("unrelated-mapreduce-id")
    mr_state2.put()
    shard_states = model.ShardState.find_all_by_mapreduce_state(mr_state)
    for i, ss in enumerate(shard_states):
      self.assertEqual(i, ss.shard_number)

  non_xg_tx()
def start_map(name,
              handler_spec,
              reader_spec,
              mapper_parameters,
              shard_count=None,
              output_writer_spec=None,
              mapreduce_parameters=None,
              base_path=None,
              queue_name=None,
              eta=None,
              countdown=None,
              hooks_class_name=None,
              _app=None,
              in_xg_transaction=False):
  """Start a new, mapper-only mapreduce.

  Deprecated! Use map_job.start instead.

  If a value can be specified both from an explicit argument and from
  a dictionary, the value from the explicit argument wins.

  Args:
    name: mapreduce name. Used only for display purposes.
    handler_spec: fully qualified name of mapper handler function/class to
      call.
    reader_spec: fully qualified name of mapper reader to use.
    mapper_parameters: dictionary of parameters to pass to mapper. These are
      mapper-specific and also used for reader/writer initialization. Should
      have format {"input_reader": {}, "output_writer": {}}. The old
      deprecated style does not have sub-dictionaries.
    shard_count: number of shards to create.
    output_writer_spec: fully qualified name of the output writer to use.
    mapreduce_parameters: dictionary of mapreduce parameters relevant to the
      whole job.
    base_path: base path of mapreduce library handler specified in app.yaml.
      "/mapreduce" by default.
    queue_name: taskqueue queue name to be used for mapreduce tasks. See
      util.get_queue_name.
    eta: absolute time when the MR should execute. May not be specified if
      'countdown' is also supplied. This may be timezone-aware or
      timezone-naive.
    countdown: time in seconds into the future that this MR should execute.
      Defaults to zero.
    hooks_class_name: fully qualified name of a hooks.Hooks subclass.
    in_xg_transaction: controls what transaction scope to use to start this
      MR job. If True, there has to be an already opened cross-group
      transaction scope. MR will use one entity group from it. If False, MR
      will create an independent transaction to start the job regardless of
      any existing transaction scopes.

  Returns:
    mapreduce id as string.
  """
  if shard_count is None:
    shard_count = parameters.config.SHARD_COUNT
  if mapper_parameters:
    mapper_parameters = dict(mapper_parameters)

  # Make sure this old API fills all parameters with default values.
  mr_params = map_job.JobConfig._get_default_mr_params()
  if mapreduce_parameters:
    mr_params.update(mapreduce_parameters)

  # Override default values if the user specified them as arguments.
  if base_path:
    mr_params["base_path"] = base_path
  mr_params["queue_name"] = util.get_queue_name(queue_name)

  mapper_spec = model.MapperSpec(handler_spec,
                                 reader_spec,
                                 mapper_parameters,
                                 shard_count,
                                 output_writer_spec=output_writer_spec)

  if in_xg_transaction and not db.is_in_transaction():
    log.warning("Expects an opened xg transaction to start mapreduce "
                "when transactional is True.")

  return handlers.StartJobHandler._start_map(
      name,
      mapper_spec,
      mr_params,
      # TODO(user): Now that "queue_name" is part of mr_params,
      # remove all the other ways to get queue_name after one release.
      queue_name=mr_params["queue_name"],
      eta=eta,
      countdown=countdown,
      hooks_class_name=hooks_class_name,
      _app=_app,
      in_xg_transaction=in_xg_transaction)
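# A sketch of the nested mapper_parameters format the docstring above
# describes, with reader and writer params in their own sub-dictionaries.
# The handler path, entity kind, and bucket name are hypothetical.
def _example_start_map_nested_params():
  return start_map(
      name="export-my-entities",
      handler_spec="myapp.mappers.export_entity",
      reader_spec="mapreduce.input_readers.DatastoreInputReader",
      mapper_parameters={
          "input_reader": {"entity_kind": "myapp.models.MyEntity"},
          "output_writer": {"bucket_name": "my-bucket"},
      },
      output_writer_spec=(
          "mapreduce.output_writers.GoogleCloudStorageOutputWriter"))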
def start_map(name,
              handler_spec,
              reader_spec,
              mapper_parameters,
              shard_count=_DEFAULT_SHARD_COUNT,
              output_writer_spec=None,
              mapreduce_parameters=None,
              base_path=None,
              queue_name=None,
              eta=None,
              countdown=None,
              hooks_class_name=None,
              _app=None,
              transactional=False,
              transactional_parent=None):
  """Start a new, mapper-only mapreduce.

  Args:
    name: mapreduce name. Used only for display purposes.
    handler_spec: fully qualified name of mapper handler function/class to
      call.
    reader_spec: fully qualified name of mapper reader to use.
    mapper_parameters: dictionary of parameters to pass to mapper. These are
      mapper-specific and also used for reader initialization.
    shard_count: number of shards to create.
    output_writer_spec: fully qualified name of the output writer to use.
    mapreduce_parameters: dictionary of mapreduce parameters relevant to the
      whole job.
    base_path: base path of mapreduce library handler specified in app.yaml.
      "/mapreduce" by default.
    queue_name: executor queue name to be used for mapreduce tasks. If
      unspecified it will be the "default" queue or inherit the queue of
      the currently running request.
    eta: absolute time when the MR should execute. May not be specified if
      'countdown' is also supplied. This may be timezone-aware or
      timezone-naive.
    countdown: time in seconds into the future that this MR should execute.
      Defaults to zero.
    hooks_class_name: fully qualified name of a hooks.Hooks subclass.
    transactional: specifies if job should be started as a part of an already
      opened transaction.
    transactional_parent: specifies the entity which is already a part of
      the transaction. A child entity will be used to store the task payload
      if the mapreduce specification is too big.

  Returns:
    mapreduce id as string.
  """
  if not shard_count:
    shard_count = _DEFAULT_SHARD_COUNT
  if base_path is None:
    base_path = base_handler._DEFAULT_BASE_PATH
  if mapper_parameters:
    mapper_parameters = dict(mapper_parameters)
  if mapreduce_parameters:
    mapreduce_parameters = dict(mapreduce_parameters)

  mapper_spec = model.MapperSpec(handler_spec,
                                 reader_spec,
                                 mapper_parameters,
                                 shard_count,
                                 output_writer_spec=output_writer_spec)

  if transactional and not transactional_parent:
    # We should really fail here, but there might be some customers of this
    # code who wouldn't like that. This will cause problems only for huge
    # job definitions.
    logging.error(
        "transactional_parent should be specified for transactional starts. "
        "Your job will fail to start if the mapreduce specification is too "
        "big.")

  return handlers.StartJobHandler._start_map(
      name,
      mapper_spec,
      mapreduce_parameters or {},
      base_path=base_path,
      queue_name=queue_name,
      eta=eta,
      countdown=countdown,
      hooks_class_name=hooks_class_name,
      _app=_app,
      transactional=transactional,
      parent_entity=transactional_parent)
def start_map(name,
              handler_spec,
              reader_spec,
              mapper_parameters,
              shard_count=None,
              output_writer_spec=None,
              mapreduce_parameters=None,
              base_path=None,
              queue_name=None,
              eta=None,
              countdown=None,
              hooks_class_name=None,
              _app=None,
              in_xg_transaction=False):
  """Start a new, mapper-only mapreduce.

  Args:
    name: mapreduce name. Used only for display purposes.
    handler_spec: fully qualified name of mapper handler function/class to
      call.
    reader_spec: fully qualified name of mapper reader to use.
    mapper_parameters: dictionary of parameters to pass to mapper. These are
      mapper-specific and also used for reader initialization.
    shard_count: number of shards to create.
    output_writer_spec: fully qualified name of the output writer to use.
    mapreduce_parameters: dictionary of mapreduce parameters relevant to the
      whole job.
    base_path: base path of mapreduce library handler specified in app.yaml.
      "/mapreduce" by default.
    queue_name: taskqueue queue name to be used for mapreduce tasks. See
      util.get_queue_name.
    eta: absolute time when the MR should execute. May not be specified if
      'countdown' is also supplied. This may be timezone-aware or
      timezone-naive.
    countdown: time in seconds into the future that this MR should execute.
      Defaults to zero.
    hooks_class_name: fully qualified name of a hooks.Hooks subclass.
    in_xg_transaction: controls what transaction scope to use to start this
      MR job. If True, there has to be an already opened cross-group
      transaction scope. MR will use one entity group from it. If False, MR
      will create an independent transaction to start the job regardless of
      any existing transaction scopes.

  Returns:
    mapreduce id as string.
  """
  if shard_count is None:
    shard_count = parameters.config.SHARD_COUNT
  if base_path is None:
    base_path = parameters.config.BASE_PATH
  if mapper_parameters:
    mapper_parameters = dict(mapper_parameters)
  if mapreduce_parameters:
    mapreduce_parameters = dict(mapreduce_parameters)
    if "base_path" not in mapreduce_parameters:
      mapreduce_parameters["base_path"] = base_path
  else:
    mapreduce_parameters = {"base_path": base_path}

  mapper_spec = model.MapperSpec(handler_spec,
                                 reader_spec,
                                 mapper_parameters,
                                 shard_count,
                                 output_writer_spec=output_writer_spec)

  if in_xg_transaction and not db.is_in_transaction():
    logging.warning("Expects an opened xg transaction to start mapreduce "
                    "when transactional is True.")

  return handlers.StartJobHandler._start_map(
      name,
      mapper_spec,
      mapreduce_parameters,
      queue_name=util.get_queue_name(queue_name),
      eta=eta,
      countdown=countdown,
      hooks_class_name=hooks_class_name,
      _app=_app,
      in_xg_transaction=in_xg_transaction)