def testCreateWritersWithRetries(self):
  mapreduce_state = self.create_mapreduce_state(
      output_params={self.WRITER_CLS.BUCKET_NAME_PARAM: "test"})
  shard_state = self.create_shard_state(0)
  ctx = context.Context(mapreduce_state.mapreduce_spec, shard_state)
  context.Context._set(ctx)

  # Create the writer for the 1st attempt.
  writer = self.WRITER_CLS.create(mapreduce_state.mapreduce_spec,
                                  shard_state.shard_number,
                                  shard_state.retries + 1)
  new_filename = writer._get_filename_for_test()
  writer.begin_slice(None)
  writer.write("initData")
  writer.end_slice(None)
  orig_json = writer.to_json()

  writer = self.WRITER_CLS.from_json(orig_json)
  writer.begin_slice(None)
  # We fail here, so this data should be discarded.
  writer.write("badData")

  # Recreate the same writer (simulates a slice retry).
  writer = self.WRITER_CLS.from_json(orig_json)
  writer.begin_slice(None)
  writer.write("goodData")
  writer.end_slice(None)
  writer = self._serialize_and_deserialize(writer)
  writer.finalize(ctx, shard_state)

  # Verify the badData is not in the final file.
  self.assertEqual("initDatagoodData",
                   cloudstorage.open(new_filename).read())
def testCreateWritersWithRetries(self):
  mapreduce_state = self.create_mapreduce_state(
      output_params={self.WRITER_CLS.BUCKET_NAME_PARAM: "test"})
  shard_state = self.create_shard_state(0)
  ctx = context.Context(mapreduce_state.mapreduce_spec, shard_state)
  context.Context._set(ctx)

  # Create the writer for the 1st attempt.
  writer = self.WRITER_CLS.create(mapreduce_state.mapreduce_spec,
                                  shard_state.shard_number,
                                  shard_state.retries + 1)
  filename = writer._filename
  writer.write("badData")

  # Test re-creating the writer for a retry.
  shard_state.reset_for_retry()
  writer = self.WRITER_CLS.create(mapreduce_state.mapreduce_spec,
                                  shard_state.shard_number,
                                  shard_state.retries + 1)
  new_filename = writer._filename
  good_data = "goodData"
  writer.write(good_data)
  writer.finalize(None, shard_state)

  # Verify the retry has a different filename.
  self.assertNotEqual(filename, new_filename)
  # Verify the badData is not in the final file.
  self.assertEqual(good_data, cloudstorage.open(new_filename).read())
def testWriterSerialization(self):
  mapreduce_state = self.create_mapreduce_state(
      output_params={self.WRITER_CLS.BUCKET_NAME_PARAM: "test"})
  shard_state = self.create_shard_state(0)
  ctx = context.Context(mapreduce_state.mapreduce_spec, shard_state)
  context.Context._set(ctx)
  writer = self.WRITER_CLS.create(mapreduce_state.mapreduce_spec,
                                  shard_state.shard_number, 0)
  # Data explicitly contains binary data.
  data = "\"fake\"\tdatathatishardtoencode"
  writer.write(data)

  # Serialize/deserialize writer after some data is written.
  writer = self.WRITER_CLS.from_json(writer.to_json())
  writer.write(data)

  # Serialize/deserialize writer after more data is written.
  writer = self.WRITER_CLS.from_json(writer.to_json())
  writer.finalize(None, shard_state)

  # Serialize/deserialize writer after finalization.
  writer = self.WRITER_CLS.from_json(writer.to_json())
  self.assertRaises(IOError, writer.write, data)

  filename = self.WRITER_CLS._get_filename(shard_state)
  self.assertNotEquals(None, filename)
  self.assertEqual(data + data, cloudstorage.open(filename).read())
def testSmoke(self):
  tmp_files = set()
  final_files = set()
  for shard_num in range(self.NUM_SHARDS):
    shard = self.create_shard_state(shard_num)
    writer = self.WRITER_CLS.create(self.mr_state.mapreduce_spec,
                                    shard.shard_number, 0)
    # Verify files are created under tmp dir.
    tmp_file = writer._streaming_buffer.name
    self.assertTrue(self.WRITER_CLS._MR_TMP in tmp_file)
    tmp_files.add(tmp_file)

    cxt = context.Context(self.mr_state.mapreduce_spec, shard)
    writer.finalize(cxt, shard)

    # Verify the integrity of writer state.
    self.assertEqual(
        writer._streaming_buffer.name,
        (shard.writer_state[self.WRITER_CLS._SEG_PREFIX] +
         str(shard.writer_state[self.WRITER_CLS._LAST_SEG_INDEX])))
    final_file = shard.writer_state["filename"]
    self.assertFalse(self.WRITER_CLS._MR_TMP in final_file)
    final_files.add(final_file)

  # Verify all filenames are different.
  self.assertEqual(self.NUM_SHARDS, len(tmp_files))
  self.assertEqual(self.NUM_SHARDS, len(final_files))
def setUp(self):
  """Sets up the test harness."""
  testutil.setup_for_testing()
  self.mapper = offline_jobs.CountSubscribers()
  self.callback = 'http://foo.callback-example.com/my-callback-url'
  self.topic = 'http://example.com/my-topic-url'
  self.token = 'token'
  self.secret = 'my secrat'
  # Do not make these raw strings on purpose, since they will get
  # passed through escaped in the mapreduce.yaml.
  self.topic_pattern = '^http://example\\.com/.*$'
  self.callback_pattern = (
      'http(?:s)?://(?:[^\\.]+\\.)*([^\\./]+\.[^\\./]+)(?:/.*)?')

  class FakeMapper(object):
    params = {
        'topic_pattern': self.topic_pattern,
        'callback_pattern': self.callback_pattern,
    }

  class FakeSpec(object):
    mapreduce_id = '1234'
    mapper = FakeMapper()

  self.context = context.Context(FakeSpec(), None)
  context.Context._set(self.context)
def testRecoverSomethingWrittenInFailedInstance(self):
  mr_spec = self.mr_state.mapreduce_spec
  shard_state = self.create_shard_state(0)
  ctx = context.Context(mr_spec, shard_state)
  context.Context._set(ctx)

  writer = self.WRITER_CLS.create(mr_spec, 0, 0)
  writer.write("123")
  writer = self.WRITER_CLS.from_json(writer.to_json())
  writer.write("4")

  new_writer = writer._recover(mr_spec, 0, 0)

  # Old instance is finalized and valid offset saved.
  old_stat = cloudstorage.stat(writer._streaming_buffer.name)
  self.assertEqual(
      len("123"),
      int(old_stat.metadata[self.WRITER_CLS._VALID_LENGTH]))
  # New instance is created with an incremented seg index.
  self.assertEqual(writer._seg_index + 1, new_writer._seg_index)

  # Verify filenames.
  self.assertTrue(
      writer._streaming_buffer.name.endswith(str(writer._seg_index)))
  self.assertTrue(
      new_writer._streaming_buffer.name.endswith(
          str(new_writer._seg_index)))
def testGetSetContext(self):
  """Test module's get_context and _set functions."""
  ctx = context.Context(None, None)

  self.assertFalse(context.get())
  context.Context._set(ctx)
  self.assertEquals(ctx, context.get())
  context.Context._set(None)
  self.assertEquals(None, context.get())
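# A minimal usage sketch, added for illustration (not part of the test
# suite): handler code typically reads the ambient context through
# context.get() rather than receiving it as a parameter. The mapper name
# below is hypothetical; context.get() and counters.increment() are the
# same calls exercised by the surrounding tests.
def example_map(entity):
  """Hypothetical mapper that uses the ambient context."""
  ctx = context.get()  # returns whatever context.Context._set() installed
  if ctx is not None:
    ctx.counters.increment("entities-seen", 1)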
def testRemoveGarbage(self):
  """Make sure abandoned files get removed."""
  writer_spec = {
      self.WRITER_CLS.BUCKET_NAME_PARAM: "unused",
      self.WRITER_CLS.TMP_BUCKET_NAME_PARAM: "test"
  }
  mapreduce_state = self.create_mapreduce_state(output_params=writer_spec)
  shard_state = self.create_shard_state(1)
  ctx = context.Context(mapreduce_state.mapreduce_spec, shard_state)
  context.Context._set(ctx)

  writer = self.WRITER_CLS.create(mapreduce_state.mapreduce_spec,
                                  shard_state.shard_number, 0)
  writer.begin_slice(None)

  # our shard
  our_file = "/test/gae_mr_tmp/DummyMapReduceJobId-tmp-1-very-random"
  f = cloudstorage.open(our_file, "w")
  f.write("foo?")
  f.close()

  # not our shard
  their_file = "/test/gae_mr_tmp/DummyMapReduceJobId-tmp-3-very-random"
  f = cloudstorage.open(their_file, "w")
  f.write("bar?")
  f.close()

  # unrelated file
  real_file = "/test/this_things_should_survive"
  f = cloudstorage.open(real_file, "w")
  f.write("yes, foobar!")
  f.close()

  # Make sure the bogus files still exist.
  names = [l.filename for l in cloudstorage.listbucket("/test")]
  self.assertTrue(our_file in names)
  self.assertTrue(their_file in names)
  self.assertTrue(real_file in names)

  # Slice end should clean up the garbage.
  writer = self._serialize_and_deserialize(writer)

  names = [l.filename for l in cloudstorage.listbucket("/test")]
  self.assertFalse(our_file in names)
  self.assertTrue(their_file in names)
  self.assertTrue(real_file in names)

  # Finalize shouldn't change anything. Re-list the bucket so the
  # assertions check fresh state rather than the list from above.
  writer.finalize(ctx, shard_state)
  names = [l.filename for l in cloudstorage.listbucket("/test")]
  self.assertFalse(our_file in names)
  self.assertTrue(their_file in names)
  self.assertTrue(real_file in names)
def testRemovingIgnoredNonExistent(self):
  writer_spec = {self.WRITER_CLS.BUCKET_NAME_PARAM: "test"}
  mapreduce_state = self.create_mapreduce_state(output_params=writer_spec)
  shard_state = self.create_shard_state(0)
  ctx = context.Context(mapreduce_state.mapreduce_spec, shard_state)
  context.Context._set(ctx)

  writer = self.WRITER_CLS.create(mapreduce_state.mapreduce_spec,
                                  shard_state.shard_number, 0)
  writer._remove_tmpfile(None, writer_spec)  # no exceptions
  writer._remove_tmpfile("/test/i_dont_exist", writer_spec)
def testRecoverNothingWrittenInFailedInstance(self):
  mr_spec = self.mr_state.mapreduce_spec
  shard_state = self.create_shard_state(0)
  ctx = context.Context(mr_spec, shard_state)
  context.Context._set(ctx)

  writer = self.WRITER_CLS.create(mr_spec, 0, 0)
  self.assertEqual(0, writer._seg_index)

  new_writer = writer._recover(mr_spec, 0, 0)

  # Old instance is not finalized.
  self.assertFalse(writer._streaming_buffer.closed)
  # seg index is not incremented.
  self.assertEqual(0, new_writer._seg_index)
def setupWriter(self):
  """Creates a Google Cloud Storage LevelDB record output writer."""
  self.mapreduce_state = self.create_mapreduce_state()
  self.shard_state = self.create_shard_state(0)
  self.writer = self.WRITER_CLS.create(self.mapreduce_state.mapreduce_spec,
                                       self.shard_state.shard_number,
                                       self.shard_state.retries + 1)
  self.ctx = context.Context(self.mapreduce_state.mapreduce_spec,
                             self.shard_state)
  context.Context._set(self.ctx)
def testMutationPoolSize(self):
  ctx = context.Context(None, None)
  self.assertEquals(context.MAX_ENTITY_COUNT,
                    ctx.mutation_pool.max_entity_count)
  self.assertEquals(context.MAX_POOL_SIZE,
                    ctx.mutation_pool.max_pool_size)

  ctx = context.Context(None, None, task_retry_count=0)
  self.assertEquals(context.MAX_ENTITY_COUNT,
                    ctx.mutation_pool.max_entity_count)
  self.assertEquals(context.MAX_POOL_SIZE,
                    ctx.mutation_pool.max_pool_size)

  ctx = context.Context(None, None, task_retry_count=1)
  self.assertEquals(context.MAX_ENTITY_COUNT / 2,
                    ctx.mutation_pool.max_entity_count)
  self.assertEquals(context.MAX_POOL_SIZE / 2,
                    ctx.mutation_pool.max_pool_size)

  ctx = context.Context(None, None, task_retry_count=4)
  self.assertEquals(context.MAX_ENTITY_COUNT / 16,
                    ctx.mutation_pool.max_entity_count)
  self.assertEquals(context.MAX_POOL_SIZE / 16,
                    ctx.mutation_pool.max_pool_size)
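# A hedged note on the assertions above: they imply pool limits halve with
# every task retry (retry 1 gives 1/2, retry 4 gives 1/16), consistent with
# limit / 2**task_retry_count. The helper below is a hypothetical restatement
# of that inferred rule, not code quoted from context.py.
def expected_pool_limit(base_limit, task_retry_count):
  """Hypothetical helper mirroring the halving the test asserts."""
  return base_limit / 2 ** task_retry_count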
def testFinalizeChecksForErrors(self):
  """Just make sure finalize is never called after processing data."""
  mapreduce_state = self.create_mapreduce_state(
      output_params={self.WRITER_CLS.BUCKET_NAME_PARAM: "test"})
  shard_state = self.create_shard_state(0)
  ctx = context.Context(mapreduce_state.mapreduce_spec, shard_state)
  context.Context._set(ctx)
  writer = self.WRITER_CLS.create(mapreduce_state.mapreduce_spec,
                                  shard_state.shard_number, 0)
  writer.begin_slice(None)
  writer.write("foobar")
  # We wrote something, so finalize must fail (sanity check).
  self.assertRaises(errors.FailJobError, writer.finalize, ctx, shard_state)
def testSerialization(self):
  mr_spec = self.mr_state.mapreduce_spec
  shard_state = self.create_shard_state(0)
  ctx = context.Context(mr_spec, shard_state)
  context.Context._set(ctx)

  writer = self.WRITER_CLS.create(mr_spec, 0, 0)
  writer._seg_index = 1
  writer.write("abcde")

  writer = self.WRITER_CLS.from_json_str(writer.to_json_str())
  # _seg_index doesn't change.
  self.assertEqual(1, writer._seg_index)
  # _seg_valid_length is updated to what was in the buffer.
  self.assertEqual(len("abcde"), writer._seg_valid_length)
def testTmpfileName(self):
  writer_spec = {self.WRITER_CLS.BUCKET_NAME_PARAM: "test"}
  mapreduce_state = self.create_mapreduce_state(output_params=writer_spec)
  shard_state = self.create_shard_state(19)
  ctx = context.Context(mapreduce_state.mapreduce_spec, shard_state)
  context.Context._set(ctx)

  writer = self.WRITER_CLS.create(mapreduce_state.mapreduce_spec,
                                  shard_state.shard_number, 0)
  writer.begin_slice(None)

  prefix = "/test/gae_mr_tmp/DummyMapReduceJobId-tmp-19-"
  tmpfile_name = writer.status.tmpfile.name
  self.assertTrue(tmpfile_name.startswith(prefix),
                  "Test file name is: %s" % tmpfile_name)
def testTmpDefaultsToMain(self):
  writer_spec = {
      self.WRITER_CLS.BUCKET_NAME_PARAM: "bucket",
      self.WRITER_CLS._ACCOUNT_ID_PARAM: "account"
  }
  mapreduce_state = self.create_mapreduce_state(output_params=writer_spec)
  shard_state = self.create_shard_state(1)
  ctx = context.Context(mapreduce_state.mapreduce_spec, shard_state)
  context.Context._set(ctx)
  writer = self.WRITER_CLS.create(mapreduce_state.mapreduce_spec,
                                  shard_state.shard_number, 0)

  self.assertEquals("bucket", writer._get_tmp_gcs_bucket(writer_spec))
  self.assertEquals("account", writer._get_tmp_account_id(writer_spec))
def testWriterCounters(self):
  mapreduce_state = self.create_mapreduce_state(
      output_params={self.WRITER_CLS.BUCKET_NAME_PARAM: "test"})
  shard_state = self.create_shard_state(0)
  writer = self.WRITER_CLS.create(mapreduce_state.mapreduce_spec,
                                  shard_state.shard_number, 0)
  ctx = context.Context(mapreduce_state.mapreduce_spec, shard_state)
  context.Context._set(ctx)

  # Write large amount of data to ensure measurable time passes during write.
  data = "d" * 1024 * 1024 * 10
  writer.write(data)
  self.assertEqual(len(data), shard_state.counters_map.get(
      output_writers.COUNTER_IO_WRITE_BYTES))
  self.assertTrue(shard_state.counters_map.get(
      output_writers.COUNTER_IO_WRITE_MSEC) > 0)
def testWriter(self):
  mapreduce_state = self.create_mapreduce_state(
      output_params={self.WRITER_CLS.BUCKET_NAME_PARAM: "test"})
  shard_state = self.create_shard_state(0)
  ctx = context.Context(mapreduce_state.mapreduce_spec, shard_state)
  context.Context._set(ctx)
  writer = self.WRITER_CLS.create(mapreduce_state.mapreduce_spec,
                                  shard_state.shard_number, 0)
  data = "fakedata"
  writer.write(data)
  writer.finalize(None, shard_state)
  filename = self.WRITER_CLS._get_filename(shard_state)
  self.assertNotEquals(None, filename)
  self.assertEqual(data, cloudstorage.open(filename).read())
def testCreateWriters(self):
  mapreduce_state = self.create_mapreduce_state(
      output_params={self.WRITER_CLS.BUCKET_NAME_PARAM: "test"})
  for shard_num in range(self.NUM_SHARDS):
    shard = self.create_shard_state(shard_num)
    writer = self.WRITER_CLS.create(mapreduce_state.mapreduce_spec,
                                    shard.shard_number, 0)
    cxt = context.Context(mapreduce_state.mapreduce_spec, shard)
    shard.result_status = model.ShardState.RESULT_SUCCESS
    writer.finalize(cxt, shard)
    shard.put()

  filenames = self.WRITER_CLS.get_filenames(mapreduce_state)
  # Verify we have the correct number of filenames.
  self.assertEqual(self.NUM_SHARDS, len(filenames))
  # Verify each has a unique filename.
  self.assertEqual(self.NUM_SHARDS, len(set(filenames)))
def testIncrement(self):
  """Test applying Increment operation."""
  m = mox.Mox()
  ctx = context.Context(None, None)
  ctx.counters = m.CreateMock(context.Counters)

  operation = op.counters.Increment("test", 12)

  # Record calls
  ctx.counters.increment("test", 12)

  m.ReplayAll()
  try:  # test, verify
    operation(ctx)
    m.VerifyAll()
  finally:
    m.UnsetStubs()
def testDelete(self):
  """Test applying Delete operation."""
  m = mox.Mox()
  ctx = context.Context(None, None)
  ctx.mutation_pool = m.CreateMock(context.MutationPool)

  entity = TestEntity()
  operation = op.db.Delete(entity)

  # Record calls
  ctx.mutation_pool.delete(entity)

  m.ReplayAll()
  try:  # test, verify
    operation(ctx)
    m.VerifyAll()
  finally:
    m.UnsetStubs()
def testArbitraryPool(self):
  """Test arbitrary pool registration."""
  m = mox.Mox()
  ctx = context.Context(None, None)
  self.assertFalse(ctx.get_pool("test"))
  pool = m.CreateMockAnything()
  ctx.register_pool("test", pool)
  self.assertEquals(pool, ctx.get_pool("test"))

  # Record calls
  pool.flush()

  m.ReplayAll()
  try:  # test, verify
    ctx.flush()
    m.VerifyAll()
  finally:
    m.UnsetStubs()
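# A minimal sketch of what testArbitraryPool implies a registered pool must
# provide: ctx.flush() calls flush() on each registered pool, so any object
# with a flush() method qualifies. The class below is hypothetical and only
# illustrates that contract; it is not part of the library.
class BufferingPool(object):
  """Hypothetical pool that accumulates items and drops them on flush()."""

  def __init__(self):
    self.items = []

  def append(self, item):
    self.items.append(item)

  def flush(self):
    # Invoked via context.Context.flush(), as the mocked pool.flush()
    # expectation above demonstrates.
    self.items = []

# Usage, mirroring the registration pattern exercised above:
#   ctx.register_pool("buffer", BufferingPool())
#   ctx.get_pool("buffer").append("record")
#   ctx.flush()  # flushes every registered pool, including this one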
def testWriterMetadata(self):
  test_acl = "test-acl"
  test_content_type = "test-mime"
  mapreduce_state = self.create_mapreduce_state(
      output_params={
          self.WRITER_CLS.BUCKET_NAME_PARAM: "test",
          self.WRITER_CLS.ACL_PARAM: test_acl,
          self.WRITER_CLS.CONTENT_TYPE_PARAM: test_content_type
      })
  shard_state = self.create_shard_state(0)
  ctx = context.Context(mapreduce_state.mapreduce_spec, shard_state)
  context.Context._set(ctx)

  writer = self.WRITER_CLS.create(mapreduce_state.mapreduce_spec,
                                  shard_state.shard_number, 0)
  writer = self.WRITER_CLS.from_json(writer.to_json())
  writer.finalize(ctx, shard_state)

  filename = self.WRITER_CLS._get_filename(shard_state)
  file_stat = cloudstorage.stat(filename)
  self.assertEqual(test_content_type, file_stat.content_type)
def setUp(self):
  """Sets up the test harness."""
  testutil.setup_for_testing()
  self.mapper = offline_jobs.SubscriptionReconfirmMapper()
  self.callback = 'http://example.com/my-callback-url'
  self.topic = 'http://example.com/my-topic-url'
  self.token = 'token'
  self.secret = 'my secrat'

  self.now = datetime.datetime.utcnow()
  self.threshold_seconds = 1000
  self.threshold_timestamp = (
      time.mktime(self.now.utctimetuple()) + self.threshold_seconds)
  self.getnow = lambda: self.now

  class FakeMapper(object):
    params = {'threshold_timestamp': str(self.threshold_timestamp)}

  class FakeSpec(object):
    mapreduce_id = '1234'
    mapper = FakeMapper()

  self.context = context.Context(FakeSpec(), None)
  context.Context._set(self.context)
def testTemporaryFilesGetCleanedUp(self):
  mapreduce_state = self.create_mapreduce_state(
      output_params={self.WRITER_CLS.BUCKET_NAME_PARAM: "test"})
  shard_state = self.create_shard_state(0)
  ctx = context.Context(mapreduce_state.mapreduce_spec, shard_state)
  context.Context._set(ctx)
  writer = self.WRITER_CLS.create(mapreduce_state.mapreduce_spec,
                                  shard_state.shard_number, 0)
  writer.begin_slice(None)
  writer.write("foo")
  writer = self.WRITER_CLS.from_json(writer.to_json())
  writer.write("bar")
  writer = self.WRITER_CLS.from_json(writer.to_json())
  writer.write("foo again")
  writer = self.WRITER_CLS.from_json(writer.to_json())
  writer.finalize(ctx, shard_state)

  names = [l.filename for l in cloudstorage.listbucket("/test")]
  self.assertEquals(
      ["/test/DummyMapReduceJobName/DummyMapReduceJobId/output-0"],
      names)
def post(self):
  """Handle post request."""
  spec = model.MapreduceSpec.from_json_str(
      self.request.get("mapreduce_spec"))
  self._start_time = self._time()
  shard_id = self.shard_id()

  # TODO(user): Make this prettier
  logging.debug("post: shard=%s slice=%s headers=%s",
                shard_id, self.slice_id(), self.request.META)

  shard_state, control = db.get([
      model.ShardState.get_key_by_shard_id(shard_id),
      model.MapreduceControl.get_key_by_job_id(spec.mapreduce_id),
  ])
  if not shard_state:
    # We're letting this task die. It's up to controller code to
    # reinitialize and restart the task.
    logging.error("State not found for shard ID %r; shutting down",
                  shard_id)
    return

  if control and control.command == model.MapreduceControl.ABORT:
    logging.info("Abort command received by shard %d of job '%s'",
                 shard_state.shard_number, shard_state.mapreduce_id)
    shard_state.active = False
    shard_state.result_status = model.ShardState.RESULT_ABORTED
    shard_state.save()
    model.MapreduceControl.abort(spec.mapreduce_id)
    return

  input_reader = self.input_reader(spec.mapper)

  if spec.mapper.params.get("enable_quota", True):
    quota_consumer = quota.QuotaConsumer(
        quota.QuotaManager(memcache.Client()),
        shard_id,
        _QUOTA_BATCH_SIZE)
  else:
    quota_consumer = None

  ctx = context.Context(spec, shard_state)
  context.Context._set(ctx)

  try:
    # Consume quota ahead of time, because we do not want to run a
    # datastore query if there's not enough quota for the shard.
    if not quota_consumer or quota_consumer.check():
      scan_aborted = False
      entity = None

      # We shouldn't fetch an entity from the reader if there's not enough
      # quota to process it. Perform all quota checks proactively.
      if not quota_consumer or quota_consumer.consume():
        for entity in input_reader:
          if isinstance(entity, db.Model):
            shard_state.last_work_item = repr(entity.key())
          else:
            shard_state.last_work_item = repr(entity)[:100]

          scan_aborted = not self.process_entity(entity, ctx)

          # Check if we've got enough quota for the next entity.
          if (quota_consumer and not scan_aborted and
              not quota_consumer.consume()):
            scan_aborted = True
          if scan_aborted:
            break
      else:
        scan_aborted = True

      if not scan_aborted:
        logging.info("Processing done for shard %d of job '%s'",
                     shard_state.shard_number, shard_state.mapreduce_id)
        # We consumed an extra quota item at the end of the for loop.
        # Just be nice here and give it back :)
        if quota_consumer:
          quota_consumer.put(1)
        shard_state.active = False
        shard_state.result_status = model.ShardState.RESULT_SUCCESS

    # TODO(user): Mike said we don't want this to happen in case of
    # exception while scanning. Figure out when it's appropriate to skip.
    ctx.flush()
  finally:
    context.Context._set(None)
    if quota_consumer:
      quota_consumer.dispose()

  # Rescheduling work should always be the last statement. It shouldn't
  # happen if there were any exceptions in code before it.
  if shard_state.active:
    self.reschedule(spec, input_reader)
def handle(self):
  """Handle request."""
  tstate = model.TransientShardState.from_request(self.request)
  spec = tstate.mapreduce_spec
  self._start_time = self._time()
  shard_id = tstate.shard_id

  shard_state, control = db.get([
      model.ShardState.get_key_by_shard_id(shard_id),
      model.MapreduceControl.get_key_by_job_id(spec.mapreduce_id),
  ])
  if not shard_state:
    # We're letting this task die. It's up to controller code to
    # reinitialize and restart the task.
    logging.error("State not found for shard ID %r; shutting down",
                  shard_id)
    return

  ctx = context.Context(spec, shard_state,
                        task_retry_count=self.task_retry_count())

  if control and control.command == model.MapreduceControl.ABORT:
    logging.info("Abort command received by shard %d of job '%s'",
                 shard_state.shard_number, shard_state.mapreduce_id)
    if tstate.output_writer:
      tstate.output_writer.finalize(ctx, shard_state.shard_number)
    shard_state.active = False
    shard_state.result_status = model.ShardState.RESULT_ABORTED
    shard_state.put(config=util.create_datastore_write_config(spec))
    model.MapreduceControl.abort(spec.mapreduce_id)
    return

  input_reader = tstate.input_reader

  if spec.mapper.params.get("enable_quota", True):
    quota_consumer = quota.QuotaConsumer(
        quota.QuotaManager(memcache.Client()),
        shard_id,
        _QUOTA_BATCH_SIZE)
  else:
    quota_consumer = None

  context.Context._set(ctx)
  try:
    # Consume quota ahead of time, because we do not want to run a
    # datastore query if there's not enough quota for the shard.
    if not quota_consumer or quota_consumer.check():
      scan_aborted = False
      entity = None

      # We shouldn't fetch an entity from the reader if there's not enough
      # quota to process it. Perform all quota checks proactively.
      if not quota_consumer or quota_consumer.consume():
        for entity in input_reader:
          if isinstance(entity, db.Model):
            shard_state.last_work_item = repr(entity.key())
          else:
            shard_state.last_work_item = repr(entity)[:100]

          scan_aborted = not self.process_data(
              entity, input_reader, ctx, tstate)

          # Check if we've got enough quota for the next entity.
          if (quota_consumer and not scan_aborted and
              not quota_consumer.consume()):
            scan_aborted = True
          if scan_aborted:
            break
      else:
        scan_aborted = True

      if not scan_aborted:
        logging.info("Processing done for shard %d of job '%s'",
                     shard_state.shard_number, shard_state.mapreduce_id)
        # We consumed an extra quota item at the end of the for loop.
        # Just be nice here and give it back :)
        if quota_consumer:
          quota_consumer.put(1)
        shard_state.active = False
        shard_state.result_status = model.ShardState.RESULT_SUCCESS

      operation.counters.Increment(
          "mapper-walltime-msec",
          int((time.time() - self._start_time) * 1000))(ctx)

    # TODO(user): Mike said we don't want this to happen in case of
    # exception while scanning. Figure out when it's appropriate to skip.
    ctx.flush()

    if not shard_state.active:
      # Shard is going to stop. Finalize output writer if any.
      if tstate.output_writer:
        tstate.output_writer.finalize(ctx, shard_state.shard_number)
    shard_state.put(config=util.create_datastore_write_config(spec))
  finally:
    context.Context._set(None)
    if quota_consumer:
      quota_consumer.dispose()

  # Rescheduling work should always be the last statement. It shouldn't
  # happen if there were any exceptions in code before it.
  if shard_state.active:
    self.reschedule(shard_state, tstate)
  gc.collect()
def _start_map(cls, name, mapper_spec, mapreduce_params,
               base_path=None, queue_name=None, eta=None, countdown=None,
               hooks_class_name=None, _app=None, transactional=False,
               parent_entity=None):
  queue_name = queue_name or os.environ.get("HTTP_X_APPENGINE_QUEUENAME",
                                            "default")
  if queue_name[0] == "_":
    # We are currently in some special queue. E.g. __cron.
    queue_name = "default"

  if not transactional and parent_entity:
    raise Exception("Parent shouldn't be specified "
                    "for non-transactional starts.")

  # Check that the reader can be instantiated and is configured correctly.
  mapper_input_reader_class = mapper_spec.input_reader_class()
  mapper_input_reader_class.validate(mapper_spec)

  mapper_output_writer_class = mapper_spec.output_writer_class()
  if mapper_output_writer_class:
    mapper_output_writer_class.validate(mapper_spec)

  mapreduce_id = model.MapreduceState.new_mapreduce_id()
  mapreduce_spec = model.MapreduceSpec(
      name,
      mapreduce_id,
      mapper_spec.to_json(),
      mapreduce_params,
      hooks_class_name)

  # Check that the handler can be instantiated.
  ctx = context.Context(mapreduce_spec, None)
  context.Context._set(ctx)
  try:
    mapper_spec.get_handler()
  finally:
    context.Context._set(None)

  kickoff_params = {"mapreduce_spec": mapreduce_spec.to_json_str()}
  if _app:
    kickoff_params["app"] = _app
  kickoff_worker_task = util.HugeTask(
      url=base_path + "/kickoffjob_callback",
      params=kickoff_params,
      eta=eta,
      countdown=countdown)

  hooks = mapreduce_spec.get_hooks()
  config = util.create_datastore_write_config(mapreduce_spec)

  def start_mapreduce():
    parent = parent_entity
    if not transactional:
      # Save state in datastore so that UI can see it.
      # We can't save state in a foreign transaction, but the conventional
      # UI doesn't ask for transactional starts anyway.
      state = model.MapreduceState.create_new(mapreduce_spec.mapreduce_id)
      state.mapreduce_spec = mapreduce_spec
      state.active = True
      state.active_shards = mapper_spec.shard_count
      if _app:
        state.app_id = _app
      state.put(config=config)
      parent = state

    if hooks is not None:
      try:
        hooks.enqueue_kickoff_task(kickoff_worker_task, queue_name)
      except NotImplementedError:
        # Use the default task addition implementation.
        pass
      else:
        return

    kickoff_worker_task.add(queue_name, transactional=True, parent=parent)

  if transactional:
    start_mapreduce()
  else:
    db.run_in_transaction(start_mapreduce)

  return mapreduce_id
def handle(self):
  """Handle request."""
  tstate = model.TransientShardState.from_request(self.request)
  spec = tstate.mapreduce_spec
  self._start_time = self._time()
  shard_id = tstate.shard_id

  shard_state, control = db.get([
      model.ShardState.get_key_by_shard_id(shard_id),
      model.MapreduceControl.get_key_by_job_id(spec.mapreduce_id),
  ])
  if not shard_state:
    # We're letting this task die. It's up to controller code to
    # reinitialize and restart the task.
    logging.error("State not found for shard ID %r; shutting down",
                  shard_id)
    return

  if not shard_state.active:
    logging.error(
        "Shard is not active. Looks like spurious task execution.")
    return

  ctx = context.Context(spec, shard_state,
                        task_retry_count=self.task_retry_count())

  if control and control.command == model.MapreduceControl.ABORT:
    logging.info("Abort command received by shard %d of job '%s'",
                 shard_state.shard_number, shard_state.mapreduce_id)
    # NOTE: When aborting, specifically do not finalize the output writer
    # because it might be in a bad state.
    shard_state.active = False
    shard_state.result_status = model.ShardState.RESULT_ABORTED
    shard_state.put(config=util.create_datastore_write_config(spec))
    model.MapreduceControl.abort(spec.mapreduce_id)
    return

  input_reader = tstate.input_reader

  if spec.mapper.params.get("enable_quota", True):
    quota_consumer = quota.QuotaConsumer(
        quota.QuotaManager(memcache.Client()),
        shard_id,
        _QUOTA_BATCH_SIZE)
  else:
    quota_consumer = None

  # Tell NDB to never cache anything in memcache or in-process. This ensures
  # that entities fetched from Datastore input_readers via NDB will not bloat
  # up the request memory size and Datastore Puts will avoid doing calls
  # to memcache. Without this you get soft memory limit exits, which hurts
  # overall throughput.
  if ndb is not None:
    ndb_ctx = ndb.get_context()
    ndb_ctx.set_cache_policy(lambda key: False)
    ndb_ctx.set_memcache_policy(lambda key: False)

  context.Context._set(ctx)
  try:
    # Consume quota ahead of time, because we do not want to run a
    # datastore query if there's not enough quota for the shard.
    if not quota_consumer or quota_consumer.check():
      scan_aborted = False
      entity = None

      try:
        # We shouldn't fetch an entity from the reader if there's not enough
        # quota to process it. Perform all quota checks proactively.
        if not quota_consumer or quota_consumer.consume():
          for entity in input_reader:
            if isinstance(entity, db.Model):
              shard_state.last_work_item = repr(entity.key())
            else:
              shard_state.last_work_item = repr(entity)[:100]

            scan_aborted = not self.process_data(
                entity, input_reader, ctx, tstate)

            # Check if we've got enough quota for the next entity.
            if (quota_consumer and not scan_aborted and
                not quota_consumer.consume()):
              scan_aborted = True
            if scan_aborted:
              break
        else:
          scan_aborted = True

        if not scan_aborted:
          logging.info("Processing done for shard %d of job '%s'",
                       shard_state.shard_number, shard_state.mapreduce_id)
          # We consumed an extra quota item at the end of the for loop.
          # Just be nice here and give it back :)
          if quota_consumer:
            quota_consumer.put(1)
          shard_state.active = False
          shard_state.result_status = model.ShardState.RESULT_SUCCESS

        operation.counters.Increment(
            context.COUNTER_MAPPER_WALLTIME_MS,
            int((time.time() - self._start_time) * 1000))(ctx)

        # TODO(user): Mike said we don't want this to happen in case of
        # exception while scanning. Figure out when it's appropriate to skip.
        ctx.flush()
      except errors.RetrySliceError, e:
        logging.error("Slice error: %s", e)
        retry_count = int(
            os.environ.get("HTTP_X_APPENGINE_TASKRETRYCOUNT") or 0)
        if retry_count <= _RETRY_SLICE_ERROR_MAX_RETRIES:
          raise
        logging.error("Too many retries: %d, failing the job", retry_count)
        scan_aborted = True
        shard_state.active = False
        shard_state.result_status = model.ShardState.RESULT_FAILED
      except errors.FailJobError, e:
        logging.error("Job failed: %s", e)
        scan_aborted = True
        shard_state.active = False
        shard_state.result_status = model.ShardState.RESULT_FAILED
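# Note on the retry flow above (a reading of the code, not documented
# behavior): while the task's retry count, taken from the
# HTTP_X_APPENGINE_TASKRETRYCOUNT environment variable, is still within
# _RETRY_SLICE_ERROR_MAX_RETRIES, the RetrySliceError is re-raised so the
# task queue re-executes the slice; once that budget is exhausted the shard
# is marked RESULT_FAILED instead, the same terminal state that
# FailJobError produces immediately.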
def handle(self):
  """Handle request."""
  tstate = model.TransientShardState.from_request(self.request)
  spec = tstate.mapreduce_spec
  self._start_time = self._time()
  shard_id = tstate.shard_id

  shard_state, control = db.get([
      model.ShardState.get_key_by_shard_id(shard_id),
      model.MapreduceControl.get_key_by_job_id(spec.mapreduce_id),
  ])
  if not shard_state:
    # We're letting this task die. It's up to controller code to
    # reinitialize and restart the task.
    logging.error("State not found for shard ID %r; shutting down",
                  shard_id)
    return

  if not shard_state.active:
    logging.error(
        "Shard is not active. Looks like spurious task execution.")
    return

  ctx = context.Context(spec, shard_state,
                        task_retry_count=self.task_retry_count())

  if control and control.command == model.MapreduceControl.ABORT:
    logging.info("Abort command received by shard %d of job '%s'",
                 shard_state.shard_number, shard_state.mapreduce_id)
    if tstate.output_writer:
      tstate.output_writer.finalize(ctx, shard_state.shard_number)
    # We received a command to abort. We don't care if we override
    # some data.
    shard_state.active = False
    shard_state.result_status = model.ShardState.RESULT_ABORTED
    shard_state.put(config=util.create_datastore_write_config(spec))
    model.MapreduceControl.abort(spec.mapreduce_id)
    return

  input_reader = tstate.input_reader

  if spec.mapper.params.get("enable_quota", True):
    quota_consumer = quota.QuotaConsumer(
        quota.QuotaManager(memcache.Client()),
        shard_id,
        _QUOTA_BATCH_SIZE)
  else:
    quota_consumer = None

  context.Context._set(ctx)
  try:
    # Consume quota ahead of time, because we do not want to run a
    # datastore query if there's not enough quota for the shard.
    if not quota_consumer or quota_consumer.check():
      scan_aborted = False
      entity = None

      # We shouldn't fetch an entity from the reader if there's not enough
      # quota to process it. Perform all quota checks proactively.
      if not quota_consumer or quota_consumer.consume():
        for entity in input_reader:
          if isinstance(entity, db.Model):
            shard_state.last_work_item = repr(entity.key())
          else:
            shard_state.last_work_item = repr(entity)[:100]

          scan_aborted = not self.process_data(
              entity, input_reader, ctx, tstate)

          # Check if we've got enough quota for the next entity.
          if (quota_consumer and not scan_aborted and
              not quota_consumer.consume()):
            scan_aborted = True
          if scan_aborted:
            break
      else:
        scan_aborted = True

      if not scan_aborted:
        logging.info("Processing done for shard %d of job '%s'",
                     shard_state.shard_number, shard_state.mapreduce_id)
        # We consumed an extra quota item at the end of the for loop.
        # Just be nice here and give it back :)
        if quota_consumer:
          quota_consumer.put(1)
        shard_state.active = False
        shard_state.result_status = model.ShardState.RESULT_SUCCESS

      operation.counters.Increment(
          context.COUNTER_MAPPER_WALLTIME_MS,
          int((time.time() - self._start_time) * 1000))(ctx)

    # TODO(user): Mike said we don't want this to happen in case of
    # exception while scanning. Figure out when it's appropriate to skip.
    ctx.flush()

    if not shard_state.active:
      # Shard is going to stop. Finalize output writer if any.
      if tstate.output_writer:
        tstate.output_writer.finalize(ctx, shard_state.shard_number)

    config = util.create_datastore_write_config(spec)
    # We don't want shard state to override active state, since that
    # may stall job execution (see issue 116). Do a transactional
    # verification for status.
    # TODO(user): this might still result in some data inconsistency
    # which can be avoided. It doesn't seem to be worth it now, because
    # various crashes might result in all sorts of data inconsistencies
    # anyway.
    @db.transactional(retries=5)
    def tx():
      fresh_shard_state = db.get(
          model.ShardState.get_key_by_shard_id(shard_id))
      if (not fresh_shard_state.active or
          "worker_active_state_collision" in _TEST_INJECTED_FAULTS):
        shard_state.active = False
        logging.error("Spurious task execution. Aborting the shard.")
        return
      fresh_shard_state.copy_from(shard_state)
      fresh_shard_state.put(config=config)
    tx()
  finally:
    context.Context._set(None)
    if quota_consumer:
      quota_consumer.dispose()

  # Rescheduling work should always be the last statement. It shouldn't
  # happen if there were any exceptions in code before it.
  if shard_state.active:
    self.reschedule(shard_state, tstate)
  gc.collect()