def create_mapreduce_state(self, params=None):
    """Build a MapreduceState for the fixed job id "mapreduce0".

    The state is only constructed here; this helper does not write it to
    the datastore.

    Args:
      params: forwarded as `params` to self.create_mapper_spec.

    Returns:
      The new model.MapreduceState with its mapreduce_spec attached.
    """
    mapper_json = self.create_mapper_spec(params=params).to_json()
    # The same literal serves as both the job name and the job id.
    spec = model.MapreduceSpec("mapreduce0", "mapreduce0", mapper_json)

    state = model.MapreduceState.create_new("mapreduce0")
    state.mapreduce_spec = spec
    return state
def start(job_config=None, in_xg_transaction=False):
    """Start a new map job.

    Args:
      job_config: an instance of map_job.MapJobConfig.
      in_xg_transaction: controls what transaction scope to use to start this
        MR job. If True, there has to be an already opened cross-group
        transaction scope. MR will use one entity group from it.
        If False, MR will create an independent transaction to start the job
        regardless of any existing transaction scopes.
    """
    mapper_spec = job_config._get_mapper_spec()

    # The input reader is always validated; the output writer is optional and
    # only validated when one is configured.
    job_config.input_reader_cls.validate(mapper_spec)
    writer_cls = job_config.output_writer_cls
    if writer_cls:
        writer_cls.validate(mapper_spec)

    # Assemble the mapreduce spec from the job config.
    mr_params = job_config._get_mr_params()
    mapreduce_spec = model.MapreduceSpec(
        job_config.job_name,
        job_config.job_id,
        mapper_spec.to_json(),
        mr_params,
        util._obj_to_path(job_config._hooks_cls))

    # Join the caller's transaction when asked to, otherwise open our own.
    propagation = db.MANDATORY if in_xg_transaction else db.INDEPENDENT

    @db.transactional(propagation=propagation)
    def _save_and_kickoff():
        # State write and kickoff task enqueue share one transaction.
        _create_and_save_state(job_config, mapreduce_spec)
        _add_kickoff_task(job_config, mapreduce_spec)

    _save_and_kickoff()
def testGetTaskHeaders(self):
    """Headers from util._get_task_headers set MR id, Host and task target."""
    mapper_json = model.MapperSpec("foo", "foo", {}, 8).to_json()
    mr_spec = model.MapreduceSpec(
        name="foo", mapreduce_id="foo_id", mapper_spec=mapper_json)

    headers = util._get_task_headers(mr_spec.mapreduce_id)
    task = taskqueue.Task(url="/relative_url", headers=headers)

    self.assertEqual("foo_id", task.headers[util._MR_ID_TASK_HEADER])
    self.assertEqual("v7.foo-module.foo.appspot.com", task.headers["Host"])
    self.assertEqual("v7.foo-module", task.target)
def _start_map(cls, name, mapper_spec, mapreduce_params,
               base_path="/mapreduce", queue_name="default", eta=None,
               countdown=None, _app=None):
    """Split the input, save a new MapreduceState and enqueue the kickoff task.

    Args:
      cls: receiver; not used in the body.
      name: job name stored in the MapreduceSpec.
      mapper_spec: mapper specification. Its shard_count is overwritten with
        the number of input readers produced by split_input.
      mapreduce_params: passed through to MapreduceSpec.
      base_path: URL prefix of the kickoff callback handler.
      queue_name: task queue the kickoff task is added to.
      eta: passed to taskqueue.Task.
      countdown: passed to taskqueue.Task.
      _app: when truthy, stored on the state as app_id.

    Returns:
      The id or name of the saved state's key.

    Raises:
      NoDataError: if split_input yields no input readers.
    """
    # Check that handler can be instantiated.
    mapper_spec.get_handler()

    reader_cls = mapper_spec.input_reader_class()
    readers = reader_cls.split_input(mapper_spec)
    if not readers:
        raise NoDataError("Found no mapper input readers to process.")
    # One shard per input reader.
    mapper_spec.shard_count = len(readers)

    state = model.MapreduceState.create_new()
    state.mapreduce_spec = model.MapreduceSpec(
        name,
        state.key().id_or_name(),
        mapper_spec.to_json(),
        mapreduce_params)
    state.active = True
    state.active_shards = mapper_spec.shard_count
    if _app:
        state.app_id = _app

    # TODO(user): Initialize UI fields correctly.
    # NOTE(review): "char_url" may be a typo for "chart_url" - confirm
    # against the MapreduceState model before changing it.
    state.char_url = ""
    state.sparkline_url = ""

    def schedule_mapreduce(state, mapper_input_readers, eta, countdown):
        # Runs inside the transaction started below: the state write and the
        # transactional task add happen together.
        state.save()
        serialized_readers = [
            reader.to_json_str() for reader in mapper_input_readers
        ]
        kickoff_url = base_path + "/kickoffjob_callback"
        payload = {
            "mapreduce_spec": state.mapreduce_spec.to_json_str(),
            "input_readers": simplejson.dumps(serialized_readers),
        }
        kickoff_task = taskqueue.Task(
            url=kickoff_url, params=payload, eta=eta, countdown=countdown)
        kickoff_task.add(queue_name, transactional=True)

    # Point of no return: We're actually going to run this job!
    db.run_in_transaction(schedule_mapreduce, state, readers, eta, countdown)

    return state.key().id_or_name()
def create_mapreduce_state(self, output_params=None):
    """Create and store a model.MapreduceState with a dummy MapreduceSpec.

    Args:
      output_params: parameters for the output writer, forwarded to
        self.create_mapper_spec.

    Returns:
      The model.MapreduceState (after put()) whose mapreduce_spec uses the
      dummy job name/id and the mapper spec built from output_params.
    """
    mapper_json = self.create_mapper_spec(
        output_params=output_params).to_json()
    spec = model.MapreduceSpec(
        "DummyMapReduceJobName", "DummyMapReduceJobId", mapper_json)

    state = model.MapreduceState.create_new("DummyMapReduceJobId")
    state.mapreduce_spec = spec
    state.put()
    return state
def submit(cls, job_config, in_xg_transaction=False):
    """Submit the job to run.

    Args:
      job_config: an instance of map_job.MapJobConfig.
      in_xg_transaction: controls what transaction scope to use to start this
        MR job. If True, there has to be an already opened cross-group
        transaction scope. MR will use one entity group from it.
        If False, MR will create an independent transaction to start the job
        regardless of any existing transaction scopes.

    Returns:
      a Job instance representing the submitted job.
    """
    cls.__validate_job_config(job_config)
    mapper_spec = job_config._get_mapper_spec()

    # Assemble the mapreduce spec from the job config.
    mr_params = job_config._get_mr_params()
    mapreduce_spec = model.MapreduceSpec(
        job_config.job_name,
        job_config.job_id,
        mapper_spec.to_json(),
        mr_params,
        util._obj_to_path(job_config._hooks_cls))

    # Join the caller's transaction when asked to, otherwise open our own.
    propagation = db.MANDATORY if in_xg_transaction else db.INDEPENDENT

    @db.transactional(propagation=propagation)
    def _txn():
        # Save the state and enqueue the kickoff task in one transaction.
        saved_state = cls.__create_and_save_state(job_config, mapreduce_spec)
        cls.__add_kickoff_task(job_config, mapreduce_spec)
        return saved_state

    return cls(_txn())
def testFindAllByMapreduceState(self):
    """find_all_by_mapreduce_state yields shards in order inside a non-XG txn."""
    shard_count = 304

    mr_state = model.MapreduceState.create_new("mapreduce-id")
    mapper_spec = model.MapperSpec(
        "handler", "input-reader", {}, shard_count=shard_count)
    mr_state.mapreduce_spec = model.MapreduceSpec(
        "mapreduce", "mapreduce-id", mapper_spec.to_json())
    mr_state.put()

    for shard_number in range(shard_count):
        model.ShardState.create_new("mapreduce-id", shard_number).put()

    @db.transactional(xg=False)
    def non_xg_tx():
        # Open a single non-related entity group to ensure
        # find_all_by_mapreduce_state does not attempt to use outer
        # transaction
        unrelated_state = model.MapreduceState.create_new(
            "unrelated-mapreduce-id")
        unrelated_state.put()

        found = model.ShardState.find_all_by_mapreduce_state(mr_state)
        for expected_number, shard_state in enumerate(found):
            self.assertEqual(expected_number, shard_state.shard_number)

    non_xg_tx()
def testToJson(self):
    """Test to_json method.

    Builds a MapreduceSpec from literal values and checks that to_json()
    returns them under the keys "name", "mapreduce_id", "mapper_spec",
    "params" and "hooks_class_name".
    """
    mapper_spec_dict = {
        "mapper_handler_spec": "TestHandler",
        "mapper_input_reader": "TestInputReader",
        "mapper_params": {"entity_kind": "bar"},
        "mapper_shard_count": 8,
    }
    # Dotted path to the TestHooks class defined in this module.
    hooks_class_name = __name__ + "." + TestHooks.__name__

    mapreduce_spec = model.MapreduceSpec(
        "my job",
        "mr0",
        mapper_spec_dict,
        {"extra": "value"},
        hooks_class_name)

    expected = {
        "name": "my job",
        "mapreduce_id": "mr0",
        "mapper_spec": mapper_spec_dict,
        "params": {"extra": "value"},
        "hooks_class_name": hooks_class_name,
    }
    # assertEqual instead of the deprecated assertEquals alias, which newer
    # Python versions no longer provide.
    self.assertEqual(expected, mapreduce_spec.to_json())
def _start_map(cls, name, mapper_spec, mapreduce_params, base_path=None,
               queue_name=None, eta=None, countdown=None,
               hooks_class_name=None, _app=None, transactional=False,
               parent_entity=None):
    """Validate a map job, optionally save its state, and enqueue kickoff.

    Args:
      cls: receiver; not used in the body.
      name: job name stored in the MapreduceSpec.
      mapper_spec: mapper specification; its input reader (and output writer,
        if any) validate it, and its handler is instantiated as a check.
      mapreduce_params: passed through to MapreduceSpec.
      base_path: URL prefix of the kickoff callback handler. None falls back
        to "/mapreduce".
      queue_name: queue for the kickoff task. When falsy, the
        HTTP_X_APPENGINE_QUEUENAME environment variable is used, then
        "default". Names starting with "_" are replaced by "default".
      eta: passed to the kickoff task.
      countdown: passed to the kickoff task.
      hooks_class_name: passed through to MapreduceSpec.
      _app: when truthy, sent as the "app" kickoff param and stored on the
        state as app_id.
      transactional: if True, run directly (no new transaction is opened
        here) and do not save a MapreduceState; if False, save the state and
        add the task inside db.run_in_transaction.
      parent_entity: passed as `parent` to the task add for transactional
        starts; must be falsy otherwise.

    Returns:
      The newly generated mapreduce id.

    Raises:
      Exception: if parent_entity is given for a non-transactional start.
    """
    if base_path is None:
        # Without this the URL concatenation below raises TypeError for the
        # declared default. NOTE(review): "/mapreduce" mirrors the default of
        # the other _start_map signatures - confirm it matches the deployed
        # handler prefix.
        base_path = "/mapreduce"

    # An environment variable that is set but empty must not leave us with an
    # empty queue name (indexing it used to raise IndexError).
    queue_name = (queue_name
                  or os.environ.get("HTTP_X_APPENGINE_QUEUENAME")
                  or "default")
    if queue_name.startswith("_"):
        # We are currently in some special queue. E.g. __cron.
        queue_name = "default"

    if not transactional and parent_entity:
        raise Exception("Parent shouldn't be specfied "
                        "for non-transactional starts.")

    # Check that reader can be instantiated and is configured correctly
    mapper_input_reader_class = mapper_spec.input_reader_class()
    mapper_input_reader_class.validate(mapper_spec)

    mapper_output_writer_class = mapper_spec.output_writer_class()
    if mapper_output_writer_class:
        mapper_output_writer_class.validate(mapper_spec)

    mapreduce_id = model.MapreduceState.new_mapreduce_id()
    mapreduce_spec = model.MapreduceSpec(
        name,
        mapreduce_id,
        mapper_spec.to_json(),
        mapreduce_params,
        hooks_class_name)

    # Check that handler can be instantiated. A context is installed for the
    # duration of the check and always cleared afterwards.
    ctx = context.Context(mapreduce_spec, None)
    context.Context._set(ctx)
    try:
        mapper_spec.get_handler()
    finally:
        context.Context._set(None)

    kickoff_params = {"mapreduce_spec": mapreduce_spec.to_json_str()}
    if _app:
        kickoff_params["app"] = _app
    kickoff_worker_task = util.HugeTask(
        url=base_path + "/kickoffjob_callback",
        params=kickoff_params,
        eta=eta,
        countdown=countdown)

    hooks = mapreduce_spec.get_hooks()
    config = util.create_datastore_write_config(mapreduce_spec)

    def start_mapreduce():
        parent = parent_entity
        if not transactional:
            # Save state in datastore so that UI can see it.
            # We can't save state in foreign transaction, but conventional UI
            # doesn't ask for transactional starts anyway.
            state = model.MapreduceState.create_new(
                mapreduce_spec.mapreduce_id)
            state.mapreduce_spec = mapreduce_spec
            state.active = True
            state.active_shards = mapper_spec.shard_count
            if _app:
                state.app_id = _app
            state.put(config=config)
            parent = state

        if hooks is not None:
            try:
                hooks.enqueue_kickoff_task(kickoff_worker_task, queue_name)
            except NotImplementedError:
                # Use the default task addition implementation.
                pass
            else:
                return
        kickoff_worker_task.add(queue_name, transactional=True, parent=parent)

    if transactional:
        start_mapreduce()
    else:
        db.run_in_transaction(start_mapreduce)

    return mapreduce_id
def _start_map(cls, name, mapper_spec, mapreduce_params,
               base_path="/mapreduce", queue_name="default", eta=None,
               countdown=None, hooks_class_name=None, _app=None,
               transactional=False):
    """Validate a map job, optionally save its state, and enqueue kickoff.

    Args:
      cls: receiver; not used in the body.
      name: job name stored in the MapreduceSpec.
      mapper_spec: mapper specification; its handler is instantiated and its
        input reader class validates it.
      mapreduce_params: passed through to MapreduceSpec.
      base_path: URL prefix of the kickoff callback handler.
      queue_name: queue the kickoff task is added to.
      eta: passed to taskqueue.Task.
      countdown: passed to taskqueue.Task.
      hooks_class_name: passed through to MapreduceSpec.
      _app: when truthy, sent as the "app" kickoff param and stored on the
        state as app_id.
      transactional: if True, run directly (no new transaction is opened
        here) and do not save a MapreduceState; if False, save the state and
        add the task inside db.run_in_transaction.

    Returns:
      The newly generated mapreduce id.
    """
    # Check that handler can be instantiated.
    mapper_spec.get_handler()

    # Check that reader can be instantiated and is configured correctly
    reader_cls = mapper_spec.input_reader_class()
    reader_cls.validate(mapper_spec)

    mapreduce_id = model.MapreduceState.new_mapreduce_id()
    mapreduce_spec = model.MapreduceSpec(
        name,
        mapreduce_id,
        mapper_spec.to_json(),
        mapreduce_params,
        hooks_class_name)

    kickoff_params = {"mapreduce_spec": mapreduce_spec.to_json_str()}
    if _app:
        kickoff_params["app"] = _app
    kickoff_task = taskqueue.Task(
        url=base_path + "/kickoffjob_callback",
        params=kickoff_params,
        eta=eta,
        countdown=countdown)

    hooks = mapreduce_spec.get_hooks()
    config = util.create_datastore_write_config(mapreduce_spec)

    def start_mapreduce():
        if not transactional:
            # Save state in datastore so that UI can see it.
            # We can't save state in foreign transaction, but conventional UI
            # doesn't ask for transactional starts anyway.
            state = model.MapreduceState.create_new(
                mapreduce_spec.mapreduce_id)
            state.mapreduce_spec = mapreduce_spec
            state.active = True
            state.active_shards = mapper_spec.shard_count
            if _app:
                state.app_id = _app
            state.put(config=config)

        if hooks is not None:
            try:
                hooks.enqueue_kickoff_task(kickoff_task, queue_name)
                # The hook took care of enqueueing; nothing more to do.
                return
            except NotImplementedError:
                # Use the default task addition implementation.
                pass
        kickoff_task.add(queue_name, transactional=True)

    if transactional:
        start_mapreduce()
    else:
        db.run_in_transaction(start_mapreduce)

    return mapreduce_id