def refill_quotas(self, last_poll_time, processing_rate, active_shard_states):
  """Refill quotas for all active shards.

  Args:
    last_poll_time: Datetime with the last time the job state was updated.
    processing_rate: How many items to process per second overall.
    active_shard_states: All active shard states, list of ShardState.
  """
  if not active_shard_states:
    return
  quota_manager = quota.QuotaManager(memcache.Client())

  current_time = int(self._time())
  last_poll_time = time.mktime(last_poll_time.timetuple())
  total_quota_refill = processing_rate * max(0, current_time - last_poll_time)
  quota_refill = int(math.ceil(
      1.0 * total_quota_refill / len(active_shard_states)))

  if not quota_refill:
    return

  # TODO(user): use batch memcache API to refill quota in one API call.
  for shard_state in active_shard_states:
    quota_manager.put(shard_state.shard_id, quota_refill)
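# A minimal sketch of the refill arithmetic above, using hypothetical numbers
# (they are not taken from the library; they only illustrate the formula).
#
#   import math
#
#   processing_rate = 1000   # items/sec for the whole job (hypothetical)
#   elapsed_seconds = 2      # max(0, current_time - last_poll_time)
#   active_shards = 8
#
#   total_quota_refill = processing_rate * elapsed_seconds           # 2000
#   quota_refill = int(math.ceil(1.0 * total_quota_refill / active_shards))
#   assert quota_refill == 250  # each shard's memcache quota grows by 250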
def _schedule_shards(cls, spec, input_readers, output_writers, queue_name,
                     base_path):
  """Prepares shard states and schedules their execution.

  Args:
    spec: mapreduce specification as MapreduceSpec.
    input_readers: list of InputReaders describing shard splits.
    output_writers: list of OutputWriters, one per input reader.
    queue_name: The queue to run this job on.
    base_path: The base url path of mapreduce callbacks.
  """
  assert len(input_readers) == len(output_writers)

  # Note: it's safe to re-attempt this handler because:
  # - shard state has deterministic and unique key.
  # - _schedule_slice will fall back gracefully if a task already exists.
  shard_states = []
  for shard_number, input_reader in enumerate(input_readers):
    shard_state = model.ShardState.create_new(spec.mapreduce_id, shard_number)
    shard_state.shard_description = str(input_reader)
    shard_states.append(shard_state)

  # Retrieves already existing shards.
  existing_shard_states = db.get(shard.key() for shard in shard_states)
  existing_shard_keys = set(shard.key() for shard in existing_shard_states
                            if shard is not None)

  # Puts only non-existing shards.
  db.put((shard for shard in shard_states
          if shard.key() not in existing_shard_keys),
         config=util.create_datastore_write_config(spec))

  # Give each shard some quota to start with.
  processing_rate = int(spec.mapper.params.get(
      "processing_rate") or model._DEFAULT_PROCESSING_RATE_PER_SEC)
  quota_refill = processing_rate / len(shard_states)
  quota_manager = quota.QuotaManager(memcache.Client())
  for shard_state in shard_states:
    quota_manager.put(shard_state.shard_id, quota_refill)

  # Schedule shard tasks.
  for shard_number, (input_reader, output_writer) in enumerate(
      zip(input_readers, output_writers)):
    shard_id = model.ShardState.shard_id_from_number(
        spec.mapreduce_id, shard_number)
    MapperWorkerCallbackHandler._schedule_slice(
        shard_states[shard_number],
        model.TransientShardState(
            base_path, spec, shard_id, 0, input_reader,
            output_writer=output_writer),
        queue_name=queue_name)
def handle(self):
  """Handle request."""
  tstate = model.TransientShardState.from_request(self.request)
  spec = tstate.mapreduce_spec
  self._start_time = self._time()
  shard_id = tstate.shard_id

  shard_state, control = db.get([
      model.ShardState.get_key_by_shard_id(shard_id),
      model.MapreduceControl.get_key_by_job_id(spec.mapreduce_id),
  ])
  if not shard_state:
    # We're letting this task die. It's up to controller code to
    # reinitialize and restart the task.
    logging.error("State not found for shard ID %r; shutting down", shard_id)
    return

  if not shard_state.active:
    logging.error("Shard is not active. Looks like spurious task execution.")
    return

  ctx = context.Context(spec, shard_state,
                        task_retry_count=self.task_retry_count())

  if control and control.command == model.MapreduceControl.ABORT:
    logging.info("Abort command received by shard %d of job '%s'",
                 shard_state.shard_number, shard_state.mapreduce_id)
    # NOTE: When aborting, specifically do not finalize the output writer
    # because it might be in a bad state.
    shard_state.active = False
    shard_state.result_status = model.ShardState.RESULT_ABORTED
    shard_state.put(config=util.create_datastore_write_config(spec))
    model.MapreduceControl.abort(spec.mapreduce_id)
    return

  input_reader = tstate.input_reader

  if spec.mapper.params.get("enable_quota", True):
    quota_consumer = quota.QuotaConsumer(
        quota.QuotaManager(memcache.Client()),
        shard_id,
        _QUOTA_BATCH_SIZE)
  else:
    quota_consumer = None

  # Tell NDB to never cache anything in memcache or in-process. This ensures
  # that entities fetched from Datastore input_readers via NDB will not bloat
  # up the request memory size and Datastore Puts will avoid doing calls
  # to memcache. Without this you get soft memory limit exits, which hurts
  # overall throughput.
  if ndb is not None:
    ndb_ctx = ndb.get_context()
    ndb_ctx.set_cache_policy(lambda key: False)
    ndb_ctx.set_memcache_policy(lambda key: False)

  context.Context._set(ctx)
  try:
    # consume quota ahead, because we do not want to run a datastore
    # query if there's not enough quota for the shard.
    if not quota_consumer or quota_consumer.check():
      scan_aborted = False
      entity = None

      try:
        # We shouldn't fetch an entity from the reader if there's not enough
        # quota to process it. Perform all quota checks proactively.
        if not quota_consumer or quota_consumer.consume():
          for entity in input_reader:
            if isinstance(entity, db.Model):
              shard_state.last_work_item = repr(entity.key())
            else:
              shard_state.last_work_item = repr(entity)[:100]

            scan_aborted = not self.process_data(
                entity, input_reader, ctx, tstate)

            # Check if we've got enough quota for the next entity.
            if (quota_consumer and not scan_aborted and
                not quota_consumer.consume()):
              scan_aborted = True
            if scan_aborted:
              break
        else:
          scan_aborted = True

        if not scan_aborted:
          logging.info("Processing done for shard %d of job '%s'",
                       shard_state.shard_number, shard_state.mapreduce_id)
          # We consumed extra quota item at the end of for loop.
          # Just be nice here and give it back :)
          if quota_consumer:
            quota_consumer.put(1)
          shard_state.active = False
          shard_state.result_status = model.ShardState.RESULT_SUCCESS

        operation.counters.Increment(
            context.COUNTER_MAPPER_WALLTIME_MS,
            int((time.time() - self._start_time) * 1000))(ctx)

        # TODO(user): Mike said we don't want this to happen in case of
        # exception while scanning. Figure out when it's appropriate to skip.
        ctx.flush()
      except errors.RetrySliceError, e:
        logging.error("Slice error: %s", e)
        retry_count = int(
            os.environ.get("HTTP_X_APPENGINE_TASKRETRYCOUNT") or 0)
        if retry_count <= _RETRY_SLICE_ERROR_MAX_RETRIES:
          raise
        logging.error("Too many retries: %d, failing the job", retry_count)
        scan_aborted = True
        shard_state.active = False
        shard_state.result_status = model.ShardState.RESULT_FAILED
      except errors.FailJobError, e:
        logging.error("Job failed: %s", e)
        scan_aborted = True
        shard_state.active = False
        shard_state.result_status = model.ShardState.RESULT_FAILED
def handle(self):
  """Handle request."""
  tstate = model.TransientShardState.from_request(self.request)
  spec = tstate.mapreduce_spec
  self._start_time = self._time()
  shard_id = tstate.shard_id

  shard_state, control = db.get([
      model.ShardState.get_key_by_shard_id(shard_id),
      model.MapreduceControl.get_key_by_job_id(spec.mapreduce_id),
  ])
  if not shard_state:
    # We're letting this task die. It's up to controller code to
    # reinitialize and restart the task.
    logging.error("State not found for shard ID %r; shutting down", shard_id)
    return

  ctx = context.Context(spec, shard_state,
                        task_retry_count=self.task_retry_count())

  if control and control.command == model.MapreduceControl.ABORT:
    logging.info("Abort command received by shard %d of job '%s'",
                 shard_state.shard_number, shard_state.mapreduce_id)
    if tstate.output_writer:
      tstate.output_writer.finalize(ctx, shard_state.shard_number)
    shard_state.active = False
    shard_state.result_status = model.ShardState.RESULT_ABORTED
    shard_state.put(config=util.create_datastore_write_config(spec))
    model.MapreduceControl.abort(spec.mapreduce_id)
    return

  input_reader = tstate.input_reader

  if spec.mapper.params.get("enable_quota", True):
    quota_consumer = quota.QuotaConsumer(
        quota.QuotaManager(memcache.Client()),
        shard_id,
        _QUOTA_BATCH_SIZE)
  else:
    quota_consumer = None

  context.Context._set(ctx)
  try:
    # consume quota ahead, because we do not want to run a datastore
    # query if there's not enough quota for the shard.
    if not quota_consumer or quota_consumer.check():
      scan_aborted = False
      entity = None

      # We shouldn't fetch an entity from the reader if there's not enough
      # quota to process it. Perform all quota checks proactively.
      if not quota_consumer or quota_consumer.consume():
        for entity in input_reader:
          if isinstance(entity, db.Model):
            shard_state.last_work_item = repr(entity.key())
          else:
            shard_state.last_work_item = repr(entity)[:100]

          scan_aborted = not self.process_data(
              entity, input_reader, ctx, tstate)

          # Check if we've got enough quota for the next entity.
          if (quota_consumer and not scan_aborted and
              not quota_consumer.consume()):
            scan_aborted = True
          if scan_aborted:
            break
      else:
        scan_aborted = True

      if not scan_aborted:
        logging.info("Processing done for shard %d of job '%s'",
                     shard_state.shard_number, shard_state.mapreduce_id)
        # We consumed extra quota item at the end of for loop.
        # Just be nice here and give it back :)
        if quota_consumer:
          quota_consumer.put(1)
        shard_state.active = False
        shard_state.result_status = model.ShardState.RESULT_SUCCESS

      operation.counters.Increment(
          "mapper-walltime-msec",
          int((time.time() - self._start_time) * 1000))(ctx)

      # TODO(user): Mike said we don't want this to happen in case of
      # exception while scanning. Figure out when it's appropriate to skip.
      ctx.flush()

      if not shard_state.active:
        # shard is going to stop. Finalize output writer if any.
        if tstate.output_writer:
          tstate.output_writer.finalize(ctx, shard_state.shard_number)
      shard_state.put(config=util.create_datastore_write_config(spec))
  finally:
    context.Context._set(None)
    if quota_consumer:
      quota_consumer.dispose()

  # Rescheduling work should always be the last statement. It shouldn't happen
  # if there were any exceptions in code before it.
  if shard_state.active:
    self.reschedule(shard_state, tstate)
  gc.collect()
def setUp(self):
  QuotaTestCase.setUp(self)
  self.quota_manager = quota.QuotaManager(memcache.Client())
  self.consumer = quota.QuotaConsumer(self.quota_manager, "foo", 50)
def setUp(self):
  QuotaTestCase.setUp(self)
  self.quota_manager = quota.QuotaManager(memcache.Client())
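# A minimal usage sketch of the quota classes as they are exercised in this
# section. It assumes a memcache stub is active (as in the setUps above) and
# relies only on the calls visible here; the return-value semantics of
# check()/consume() are inferred from how the worker handler uses them.
#
#   manager = quota.QuotaManager(memcache.Client())
#   manager.put("shard-1", 100)                  # refill the shard's quota
#
#   consumer = quota.QuotaConsumer(manager, "shard-1", 50)  # batch size 50
#   if consumer.check():                         # enough quota to start?
#     while consumer.consume():                  # reserve quota for one item
#       pass                                     # ... process one entity ...
#     consumer.put(1)                            # return the extra unit taken
#                                                # by the final consume()
#   consumer.dispose()                           # release the consumer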
def handle(self):
  """Handle request."""
  tstate = model.TransientShardState.from_request(self.request)
  spec = tstate.mapreduce_spec
  self._start_time = self._time()
  shard_id = tstate.shard_id

  shard_state, control = db.get([
      model.ShardState.get_key_by_shard_id(shard_id),
      model.MapreduceControl.get_key_by_job_id(spec.mapreduce_id),
  ])
  if not shard_state:
    # We're letting this task die. It's up to controller code to
    # reinitialize and restart the task.
    logging.error("State not found for shard ID %r; shutting down", shard_id)
    return

  if not shard_state.active:
    logging.error("Shard is not active. Looks like spurious task execution.")
    return

  ctx = context.Context(spec, shard_state,
                        task_retry_count=self.task_retry_count())

  if control and control.command == model.MapreduceControl.ABORT:
    logging.info("Abort command received by shard %d of job '%s'",
                 shard_state.shard_number, shard_state.mapreduce_id)
    if tstate.output_writer:
      tstate.output_writer.finalize(ctx, shard_state.shard_number)
    # We received a command to abort. We don't care if we override
    # some data.
    shard_state.active = False
    shard_state.result_status = model.ShardState.RESULT_ABORTED
    shard_state.put(config=util.create_datastore_write_config(spec))
    model.MapreduceControl.abort(spec.mapreduce_id)
    return

  input_reader = tstate.input_reader

  if spec.mapper.params.get("enable_quota", True):
    quota_consumer = quota.QuotaConsumer(
        quota.QuotaManager(memcache.Client()),
        shard_id,
        _QUOTA_BATCH_SIZE)
  else:
    quota_consumer = None

  context.Context._set(ctx)
  try:
    # consume quota ahead, because we do not want to run a datastore
    # query if there's not enough quota for the shard.
    if not quota_consumer or quota_consumer.check():
      scan_aborted = False
      entity = None

      # We shouldn't fetch an entity from the reader if there's not enough
      # quota to process it. Perform all quota checks proactively.
      if not quota_consumer or quota_consumer.consume():
        for entity in input_reader:
          if isinstance(entity, db.Model):
            shard_state.last_work_item = repr(entity.key())
          else:
            shard_state.last_work_item = repr(entity)[:100]

          scan_aborted = not self.process_data(
              entity, input_reader, ctx, tstate)

          # Check if we've got enough quota for the next entity.
          if (quota_consumer and not scan_aborted and
              not quota_consumer.consume()):
            scan_aborted = True
          if scan_aborted:
            break
      else:
        scan_aborted = True

      if not scan_aborted:
        logging.info("Processing done for shard %d of job '%s'",
                     shard_state.shard_number, shard_state.mapreduce_id)
        # We consumed extra quota item at the end of for loop.
        # Just be nice here and give it back :)
        if quota_consumer:
          quota_consumer.put(1)
        shard_state.active = False
        shard_state.result_status = model.ShardState.RESULT_SUCCESS

      operation.counters.Increment(
          context.COUNTER_MAPPER_WALLTIME_MS,
          int((time.time() - self._start_time) * 1000))(ctx)

      # TODO(user): Mike said we don't want this to happen in case of
      # exception while scanning. Figure out when it's appropriate to skip.
      ctx.flush()

      if not shard_state.active:
        # shard is going to stop. Finalize output writer if any.
        if tstate.output_writer:
          tstate.output_writer.finalize(ctx, shard_state.shard_number)

      config = util.create_datastore_write_config(spec)
      # We don't want shard state to override active state, since that
      # may stall job execution (see issue 116). Do a transactional
      # verification for status.
      # TODO(user): this might still result in some data inconsistency
      # which can be avoided. It doesn't seem to be worth it now, because
      # various crashes might result in all sorts of data inconsistencies
      # anyway.
      @db.transactional(retries=5)
      def tx():
        fresh_shard_state = db.get(
            model.ShardState.get_key_by_shard_id(shard_id))
        if (not fresh_shard_state.active or
            "worker_active_state_collision" in _TEST_INJECTED_FAULTS):
          shard_state.active = False
          logging.error("Spurious task execution. Aborting the shard.")
          return
        fresh_shard_state.copy_from(shard_state)
        fresh_shard_state.put(config=config)
      tx()
  finally:
    context.Context._set(None)
    if quota_consumer:
      quota_consumer.dispose()

  # Rescheduling work should always be the last statement. It shouldn't happen
  # if there were any exceptions in code before it.
  if shard_state.active:
    self.reschedule(shard_state, tstate)
  gc.collect()
def post(self):
  """Handle post request."""
  spec = model.MapreduceSpec.from_json_str(
      self.request.get("mapreduce_spec"))
  self._start_time = self._time()
  shard_id = self.shard_id()

  # TODO(user): Make this prettier
  logging.debug("post: shard=%s slice=%s headers=%s",
                shard_id, self.slice_id(), self.request.META)

  shard_state, control = db.get([
      model.ShardState.get_key_by_shard_id(shard_id),
      model.MapreduceControl.get_key_by_job_id(spec.mapreduce_id),
  ])
  if not shard_state:
    # We're letting this task die. It's up to controller code to
    # reinitialize and restart the task.
    logging.error("State not found for shard ID %r; shutting down", shard_id)
    return

  if control and control.command == model.MapreduceControl.ABORT:
    logging.info("Abort command received by shard %d of job '%s'",
                 shard_state.shard_number, shard_state.mapreduce_id)
    shard_state.active = False
    shard_state.result_status = model.ShardState.RESULT_ABORTED
    shard_state.save()
    model.MapreduceControl.abort(spec.mapreduce_id)
    return

  input_reader = self.input_reader(spec.mapper)

  if spec.mapper.params.get("enable_quota", True):
    quota_consumer = quota.QuotaConsumer(
        quota.QuotaManager(cache.Client()),
        shard_id,
        _QUOTA_BATCH_SIZE)
  else:
    quota_consumer = None

  ctx = context.Context(spec, shard_state)
  context.Context._set(ctx)

  try:
    # consume quota ahead, because we do not want to run a datastore
    # query if there's not enough quota for the shard.
    if not quota_consumer or quota_consumer.check():
      scan_aborted = False
      entity = None

      # We shouldn't fetch an entity from the reader if there's not enough
      # quota to process it. Perform all quota checks proactively.
      if not quota_consumer or quota_consumer.consume():
        for entity in input_reader:
          if isinstance(entity, db.Model):
            shard_state.last_work_item = repr(entity.key())
          else:
            shard_state.last_work_item = repr(entity)[:100]

          scan_aborted = not self.process_entity(entity, ctx)

          # Check if we've got enough quota for the next entity.
          if (quota_consumer and not scan_aborted and
              not quota_consumer.consume()):
            scan_aborted = True
          if scan_aborted:
            break
      else:
        scan_aborted = True

      if not scan_aborted:
        logging.info("Processing done for shard %d of job '%s'",
                     shard_state.shard_number, shard_state.mapreduce_id)
        # We consumed extra quota item at the end of for loop.
        # Just be nice here and give it back :)
        if quota_consumer:
          quota_consumer.put(1)
        shard_state.active = False
        shard_state.result_status = model.ShardState.RESULT_SUCCESS

      # TODO(user): Mike said we don't want this to happen in case of
      # exception while scanning. Figure out when it's appropriate to skip.
      ctx.flush()
  finally:
    context.Context._set(None)
    if quota_consumer:
      quota_consumer.dispose()

  # Rescheduling work should always be the last statement. It shouldn't happen
  # if there were any exceptions in code before it.
  if shard_state.active:
    self.reschedule(spec, input_reader)