def _split_input_from_namespace(cls, app, namespace, entity_kind_name,
                                shard_count):
    """Return KeyRange objects. Helper for _split_input_from_params.

    Samples keys via the datastore's __scatter__ property and uses them as
    split points dividing the kind's keyspace into roughly equal shards.

    Args:
      app: app id as string.
      namespace: namespace to split, as string.
      entity_kind_name: entity kind to split, as string.
      shard_count: number of shards to produce, as int.

    Returns:
      A list of key_range.KeyRange objects (a single whole-namespace range
      when shard_count is 1 or no scatter sample is available).
    """
    raw_entity_kind = util.get_short_name(entity_kind_name)
    if shard_count == 1:
        # With one shard we don't need to calculate any splitpoints at all.
        return [key_range.KeyRange(namespace=namespace, _app=app)]

    # we use datastore.Query instead of ext.db.Query here, because we can't
    # erase ordering on db.Query once we set it.
    ds_query = datastore.Query(kind=raw_entity_kind,
                               namespace=namespace,
                               _app=app,
                               keys_only=True)
    ds_query.Order("__scatter__")
    # Oversample so the selected split points land closer to true quantiles.
    random_keys = ds_query.Get(shard_count * cls._OVERSAMPLING_FACTOR)

    if not random_keys:
        # This might mean that there are no entities with scatter property
        # or there are no entities at all.
        return [key_range.KeyRange(namespace=namespace, _app=app)]

    random_keys.sort()

    # pick shard_count - 1 points to generate shard_count splits
    split_points_count = shard_count - 1
    if len(random_keys) > split_points_count:
        # Downsample to evenly spaced *interior* quantiles.
        # BUG FIX: the previous formula
        #   random_keys[len(random_keys) * i / split_points_count]
        #   for i in range(split_points_count)
        # always selected index 0 (the smallest sampled key) as the first
        # split point, leaving the first shard nearly empty. Using
        # len * i / (split_points_count + 1) for i = 1..split_points_count
        # picks interior samples, matching the canonical scatter-splitting
        # behavior.
        random_keys = [
            random_keys[len(random_keys) * i // (split_points_count + 1)]
            for i in range(1, split_points_count + 1)
        ]

    key_ranges = []
    # Leading range: everything up to the first split point.
    key_ranges.append(
        key_range.KeyRange(key_start=None,
                           key_end=random_keys[0],
                           direction=key_range.KeyRange.ASC,
                           include_start=False,
                           include_end=False,
                           namespace=namespace))
    # Interior ranges between consecutive split points.
    for i in range(0, len(random_keys) - 1):
        key_ranges.append(
            key_range.KeyRange(key_start=random_keys[i],
                               key_end=random_keys[i + 1],
                               direction=key_range.KeyRange.ASC,
                               include_start=True,
                               include_end=False,
                               namespace=namespace))
    # Trailing range: everything from the last split point onward.
    key_ranges.append(
        key_range.KeyRange(key_start=random_keys[-1],
                           key_end=None,
                           direction=key_range.KeyRange.ASC,
                           include_start=True,
                           include_end=False,
                           namespace=namespace))
    return key_ranges
def testCursors(self):
    """Verifies iteration can be interrupted, serialized, and resumed.

    Drains the iterator ten entities at a time, round-tripping its state
    through to_json/from_json between chunks, and checks that all entities
    are eventually produced exactly once and in order.
    """
    query_spec = model.QuerySpec(TestModel, model_class_path=ENTITY_KIND)
    full_range = key_range.KeyRange(key_start=key(1),
                                    key_end=key(10000),
                                    direction="ASC")
    state = {
        'key_range': full_range.to_json(),
        'query_spec': query_spec.to_json(),
        'cursor': None,
    }

    collected = []
    interrupted = True
    while interrupted:
        iterator = DjangoModelIterator.from_json(state)
        interrupted = False
        fetched = 0
        for entity in iterator:
            fetched += 1
            collected.append(entity)
            if fetched == 10:
                # Simulate a slice boundary: stop mid-stream and persist.
                interrupted = True
                break
        if interrupted:
            state = iterator.to_json()

    self.assertEquals(100, len(collected))
    self.assertEquals(self.expected_entities, collected)
def _iter_ns_range(self):
    """Iterates over self._ns_range, delegating to self._iter_key_range().

    Walks namespaces one at a time: queries the namespace range for the
    next namespace, scans that namespace's whole keyspace via
    _iter_key_range, then narrows self._ns_range past it and repeats until
    the namespace range is exhausted.

    Yields:
      whatever self._iter_key_range yields for each key range.
    """
    while True:
        if self._current_key_range is None:
            # No namespace currently being scanned: fetch the next one.
            query = self._ns_range.make_datastore_query()
            namespace_result = query.Get(1)
            if not namespace_result:
                # No namespaces left in the range; we are done.
                break

            # An empty name() denotes the default namespace.
            namespace = namespace_result[0].name() or ""
            # A KeyRange with no start/end covers the entire namespace.
            self._current_key_range = key_range.KeyRange(
                namespace=namespace, _app=self._ns_range.app)

        # A deep copy is handed to _iter_key_range so that advancing
        # self._current_key_range below does not disturb the scan in
        # progress.
        for key, o in self._iter_key_range(
            copy.deepcopy(self._current_key_range)):
            # The caller must consume yielded values so advancing the
            # KeyRange before yielding is safe.
            self._current_key_range.advance(key)
            yield o

        if (self._ns_range.is_single_namespace or
            self._current_key_range.namespace == self._ns_range.namespace_end):
            # Either only one namespace was requested, or we just finished
            # the last namespace in the range.
            break

        # Narrow the namespace range past the namespace just completed and
        # loop around to pick up the next one.
        self._ns_range = self._ns_range.with_start_after(
            self._current_key_range.namespace)
        self._current_key_range = None
def __iter__(self):
    """Yield entities or keys in the range, consuming the range as it goes.

    Iterating moves self._key_range past each produced entry, so a
    serialized/resumed reader does not re-read consumed entries.

    Yields:
      next entry (a model instance, or a key when keys_only is set).
    """
    while True:
        query = self._key_range.make_ascending_query(
            util.for_name(self._entity_kind), self._keys_only)
        batch = query.fetch(limit=self.batch_size)
        if not batch:
            return

        for item in batch:
            # In keys-only mode the entries are keys already; otherwise
            # pull the key off the model instance.
            last_key = item.key() if hasattr(item, 'key') else item
            # Rebuild the range to start just past the consumed entry
            # (exclusive start) before handing the entry to the caller.
            self._key_range = key_range.KeyRange(
                last_key,
                self._key_range.key_end,
                self._key_range.direction,
                False,
                self._key_range.include_end)
            yield item
def _split_input_from_namespace(cls, app, namespace, entity_kind_name,
                                shard_count):
    """Split the hash-named keyspace of entity_kind_name into key ranges.

    Covers the keyspace with integer ids spanning 0 .. 16**40 - 1 (so that
    KeyRange.split_range can bisect it numerically), repeatedly halves the
    ranges, then rewrites every boundary key into the corresponding
    'hash_<40 hex digits>' key name.

    Returns:
      A list of key_range.KeyRange objects with hash-named boundary keys.
    """
    model_class = util.for_name(entity_kind_name)
    kind = model_class.kind()

    def to_hash_key(boundary_key, app_id):
        # Convert an integer-id boundary key into its hash-name equivalent.
        return db.Key.from_path(boundary_key.kind(),
                                'hash_%040x' % (boundary_key.id() or 0),
                                _app=app_id)

    # Seed range over the full integer keyspace of 40-hex-digit hashes.
    seed = key_range.KeyRange(
        db.Key.from_path(kind, 0),
        db.Key.from_path(kind, int('f' * 40, base=16)),
        None, True, True,
        namespace=namespace, _app=app)

    # Each pass bisects every range, producing 2**n ranges after n passes.
    ranges = [seed]
    half_splits = int(math.floor(math.log(shard_count, 2)))
    for _ in xrange(0, half_splits):
        halved = []
        for piece in ranges:
            halved.extend(piece.split_range(1))
        ranges = halved

    return [
        key_range.KeyRange(
            key_start=to_hash_key(piece.key_start, piece._app),
            key_end=to_hash_key(piece.key_end, piece._app),
            direction=piece.direction,
            include_start=piece.include_start,
            include_end=piece.include_end,
            namespace=piece.namespace,
            _app=piece._app)
        for piece in ranges
    ]
def testTwoShards(self):
    """Tests two shards: one for number prefixes, one for letter prefixes."""
    def subscription_key(name):
        # Builds a Subscription key in the test application.
        return db.Key.from_path('Subscription', name, _app=u'my-app-id')

    result = (
        offline_jobs.HashKeyDatastoreInputReader._split_input_from_namespace(
            self.app, self.namespace, self.entity_kind, 2))

    expected = [
        key_range.KeyRange(
            key_start=subscription_key(
                u'hash_0000000000000000000000000000000000000000'),
            key_end=subscription_key(
                u'hash_7fffffffffffffffffffffffffffffffffffffff'),
            direction='DESC',
            include_start=True,
            include_end=True,
            namespace='my-namespace',
            _app='my-app-id'),
        key_range.KeyRange(
            key_start=subscription_key(
                u'hash_7fffffffffffffffffffffffffffffffffffffff'),
            key_end=subscription_key(
                u'hash_ffffffffffffffffffffffffffffffffffffffff'),
            direction='ASC',
            include_start=False,
            include_end=True,
            namespace='my-namespace',
            _app='my-app-id'),
    ]
    self.assertEquals(expected, result)
def _split_input_from_params(cls, app, namespaces, entity_kind_name,
                             params, shard_count):
    """Split input, guaranteeing at least one reader is always produced.

    Delegates to the base class splitter and, when that yields nothing,
    falls back to a single reader covering each whole namespace.
    """
    readers = super(ConsistentKeyReader, cls)._split_input_from_params(
        app, namespaces, entity_kind_name, params, shard_count)
    if readers:
        return readers

    # We always produce at least one key range because:
    # a) there might be unapplied entities
    # b) it simplifies mapper code
    whole_namespace_ranges = []
    for ns in namespaces:
        whole_namespace_ranges.append(
            key_range.KeyRange(namespace=ns, _app=app))
    return [cls(entity_kind_name, whole_namespace_ranges)]
def testOneShard(self):
    """Tests just one shard."""
    def subscription_key(name):
        # Builds a Subscription key in the test application.
        return db.Key.from_path('Subscription', name, _app=u'my-app-id')

    result = (
        offline_jobs.HashKeyDatastoreInputReader._split_input_from_namespace(
            self.app, self.namespace, self.entity_kind, 1))

    expected = [
        key_range.KeyRange(
            key_start=subscription_key(
                u'hash_0000000000000000000000000000000000000000'),
            key_end=subscription_key(
                u'hash_ffffffffffffffffffffffffffffffffffffffff'),
            direction='ASC',
            include_start=True,
            include_end=True,
            namespace='my-namespace',
            _app='my-app-id')
    ]
    self.assertEquals(expected, result)
# NOTE(review): this chunk appears truncated -- the function computes
# first_entity_key/last_entity_key but has no visible return statement;
# the remainder of the body seems to have been lost. Documented as-is.
def _split_input_from_namespace(cls, app, namespace, entity_kind_name,
                                shard_count):
    """Return KeyRange objects. Helper for _split_input_from_params.

    Determines the keyspace bounds by querying for the first and last keys
    of the kind (by __key__ order), falling back to a guessed end key when
    the descending-key index is missing.
    """
    raw_entity_kind = util.get_short_name(entity_kind_name)
    if shard_count == 1:
        # With one shard we don't need to calculate any splitpoints at all.
        return [key_range.KeyRange(namespace=namespace, _app=app)]
    # we use datastore.Query instead of ext.db.Query here, because we can't
    # erase ordering on db.Query once we set it.
    ds_query = datastore.Query(kind=raw_entity_kind,
                               namespace=namespace,
                               _app=app,
                               keys_only=True)
    ds_query.Order("__key__")
    first_entity_key_list = ds_query.Get(1)
    if not first_entity_key_list:
        # Empty kind: nothing to shard.
        logging.warning("Could not retrieve an entity of type %s.",
                        raw_entity_kind)
        return []
    first_entity_key = first_entity_key_list[0]
    # Flip to descending key order to find the other end of the keyspace.
    ds_query.Order(("__key__", datastore.Query.DESCENDING))
    try:
        last_entity_key, = ds_query.Get(1)
    except db.NeedIndexError, e:
        # TODO(user): Show this error in the worker log, not the app logs.
        logging.warning(
            "Cannot create accurate approximation of keyspace, "
            "guessing instead. Please address this problem: %s", e)
        # TODO(user): Use a key-end hint from the user input parameters
        # in this case, in the event the user has a good way of figuring out
        # the range of the keyspace.
        last_entity_key = key_range.KeyRange.guess_end_key(
            raw_entity_kind, first_entity_key)
class DatastoreInputReader(InputReader):
    """Represents a range in query results.

    DatastoreInputReader yields model instances from the entities in a given
    key range. Iterating over DatastoreInputReader changes its range past
    consumed entries.

    The class shouldn't be instantiated directly. Use the split_input class
    method instead.
    """

    # Number of entities to fetch at once while doing scanning.
    _BATCH_SIZE = 50

    # Maximum number of shards we'll create.
    _MAX_SHARD_COUNT = 256

    # Mapreduce parameters.
    ENTITY_KIND_PARAM = "entity_kind"
    KEYS_ONLY_PARAM = "keys_only"
    BATCH_SIZE_PARAM = "batch_size"
    KEY_RANGE_PARAM = "key_range"

    # TODO(user): Add support for arbitrary queries. It's not possible to
    # support them without cursors since right now you can't even serialize
    # query definition.
    def __init__(self, entity_kind, key_ranges, batch_size=_BATCH_SIZE):
        """Create new DatastoreInputReader object.

        This is internal constructor. Use split_query instead.

        Args:
          entity_kind: entity kind as string.
          key_ranges: a sequence of key_range.KeyRange instances to process.
          batch_size: size of read batch as int.
        """
        self._entity_kind = entity_kind
        # Reverse the KeyRanges so they can be processed in order as a stack
        # of work items.
        self._key_ranges = list(reversed(key_ranges))
        self._batch_size = int(batch_size)

    def __iter__(self):
        """Create a generator for model instances for entities.

        Iterating through entities moves query range past the consumed
        entities.

        Yields:
          next model instance.
        """
        while True:
            if self._current_key_range is None:
                # All ranges have been popped; iteration is complete.
                break
            while True:
                query = self._current_key_range.make_ascending_query(
                    util.for_name(self._entity_kind))
                results = query.fetch(limit=self._batch_size)
                if not results:
                    # Current range exhausted; move to the next one.
                    self._advance_key_range()
                    break
                for model_instance in results:
                    key = model_instance.key()
                    # Advance past the entity before yielding so a
                    # serialized/resumed reader does not re-read it.
                    self._current_key_range.advance(key)
                    yield model_instance

    @property
    def _current_key_range(self):
        # The stack top (last element) is the range currently being scanned.
        if self._key_ranges:
            return self._key_ranges[-1]
        else:
            return None

    def _advance_key_range(self):
        # Pop the exhausted range; the next one (if any) becomes current.
        if self._key_ranges:
            self._key_ranges.pop()

    # TODO(user): use query splitting functionality when it becomes
    # available instead.
    @classmethod
    def _split_input_from_namespace(cls, app, namespace, entity_kind_name,
                                    shard_count):
        """Return KeyRange objects. Helper for _split_input_from_params."""
        raw_entity_kind = util.get_short_name(entity_kind_name)
        if shard_count == 1:
            # With one shard we don't need to calculate any splitpoints at
            # all.
            return [key_range.KeyRange(namespace=namespace, _app=app)]
        # we use datastore.Query instead of ext.db.Query here, because we
        # can't erase ordering on db.Query once we set it.
        ds_query = datastore.Query(kind=raw_entity_kind,
                                   namespace=namespace,
                                   _app=app,
                                   keys_only=True)
        ds_query.Order("__key__")
        first_entity_key_list = ds_query.Get(1)
        if not first_entity_key_list:
            logging.warning("Could not retrieve an entity of type %s.",
                            raw_entity_kind)
            return []
        first_entity_key = first_entity_key_list[0]
        # Flip to descending key order to locate the end of the keyspace.
        ds_query.Order(("__key__", datastore.Query.DESCENDING))
        try:
            last_entity_key, = ds_query.Get(1)
        except db.NeedIndexError, e:
            # TODO(user): Show this error in the worker log, not the app
            # logs.
            logging.warning(
                "Cannot create accurate approximation of keyspace, "
                "guessing instead. Please address this problem: %s", e)
            # TODO(user): Use a key-end hint from the user input parameters
            # in this case, in the event the user has a good way of figuring
            # out the range of the keyspace.
            last_entity_key = key_range.KeyRange.guess_end_key(
                raw_entity_kind, first_entity_key)
        full_keyrange = key_range.KeyRange(
            first_entity_key, last_entity_key, None, True, True,
            namespace=namespace, _app=app)
        key_ranges = [full_keyrange]
        # Repeated bisection: the final number of ranges is
        # 2**floor(log2(shard_count)), so the actual shard count may be
        # lower than requested.
        number_of_half_splits = int(math.floor(math.log(shard_count, 2)))
        for _ in range(0, number_of_half_splits):
            new_ranges = []
            for r in key_ranges:
                new_ranges += r.split_range(1)
            key_ranges = new_ranges
        return key_ranges
def testManyShards(self):
    """Tests having many shards with multiple levels of splits."""
    def subscription_key(name):
        # Builds a Subscription key in the test application.
        return db.Key.from_path('Subscription', name, _app=u'my-app-id')

    result = (
        offline_jobs.HashKeyDatastoreInputReader._split_input_from_namespace(
            self.app, self.namespace, self.entity_kind, 4))

    # (key_start name, key_end name, direction, include_start) per shard;
    # every expected range shares include_end=True and the same
    # namespace/app.
    shard_specs = [
        (u'hash_0000000000000000000000000000000000000000',
         u'hash_3fffffffffffffffffffffffffffffffffffffff',
         'DESC', True),
        (u'hash_3fffffffffffffffffffffffffffffffffffffff',
         u'hash_7fffffffffffffffffffffffffffffffffffffff',
         'ASC', False),
        (u'hash_7fffffffffffffffffffffffffffffffffffffff',
         u'hash_bfffffffffffffffffffffffffffffffffffffff',
         'DESC', False),
        (u'hash_bfffffffffffffffffffffffffffffffffffffff',
         u'hash_ffffffffffffffffffffffffffffffffffffffff',
         'ASC', False),
    ]
    expected = [
        key_range.KeyRange(
            key_start=subscription_key(start_name),
            key_end=subscription_key(end_name),
            direction=direction,
            include_start=include_start,
            include_end=True,
            namespace='my-namespace',
            _app='my-app-id')
        for start_name, end_name, direction, include_start in shard_specs
    ]
    self.assertEquals(expected, result)
class DatastoreInputReader(InputReader):
    """Represents a range in query results.

    DatastoreInputReader is a generator for either entities or keys in the
    key range, depending on the value of the keys_only parameter. Iterating
    over DatastoreInputReader changes its range past consumed entries.

    The class shouldn't be instantiated directly. Use split_input class
    method instead.
    """

    # Number of entries fetched per datastore query while scanning.
    _BATCH_SIZE = 50

    # Upper bound on the number of shards split_input will produce.
    _MAX_SHARD_COUNT = 256

    def __init__(self, entity_kind, key_range_param, batch_size, keys_only):
        """Create new DatastoreInputReader object.

        This is internal constructor. Use split_query instead.

        Args:
          entity_kind: entity kind as string.
          key_range_param: key range to process as key_range.KeyRange.
          batch_size: batch size of entity fetching.
          keys_only: if True, then send only keys to the mapper.
        """
        self._entity_kind = entity_kind
        self._key_range = key_range_param
        self.batch_size = batch_size
        self._keys_only = keys_only

    def __iter__(self):
        """Create a generator for entities or keys in the range.

        Iterating through entries moves query range past the consumed
        entries.

        Yields:
          next entry.
        """
        while True:
            entries_query = self._key_range.make_ascending_query(
                util.for_name(self._entity_kind), self._keys_only)
            entries_list = entries_query.fetch(limit=self.batch_size)
            if not entries_list:
                return
            for entry in entries_list:
                # In keys-only mode the entries are keys themselves;
                # otherwise pull the key off the model instance.
                if hasattr(entry, 'key'):
                    key = entry.key()
                else:
                    key = entry
                # Rebuild the range to start just past the consumed entry.
                # NOTE(review): the rebuilt KeyRange passes only positional
                # args and so drops any namespace/_app the original range
                # carried -- confirm this is intended.
                self._key_range = key_range.KeyRange(
                    key, self._key_range.key_end, self._key_range.direction,
                    False, self._key_range.include_end)
                yield entry

    @classmethod
    def split_input(cls, mapper_spec):
        """Splits query into shards without fetching query results.

        Tries as best as it can to split the whole query result set into
        equal shards. Due to difficulty of making the perfect split,
        resulting shards' sizes might differ significantly from each other.
        The actual number of shards might also be less then requested (even
        1), though it is never greater.

        Current implementation does key-lexicographic order splitting. It
        requires query not to specify any __key__-based ordering. If an
        index for query.order('-__key__') query is not present, an
        inaccurate guess at sharding will be made by splitting the full key
        range.

        Args:
          mapper_spec: MapperSpec with params containing 'entity_kind'.
            May also have 'batch_size' in the params to specify the number
            of entities to process in each batch.

        Returns:
          A list of DatastoreInputReader objects of length <=
          number_of_shards.

        Raises:
          BadReaderParamsError if required parameters are missing or
          invalid.
        """
        if mapper_spec.input_reader_class() != cls:
            raise BadReaderParamsError("Input reader class mismatch")
        params = mapper_spec.params
        if "entity_kind" not in params:
            raise BadReaderParamsError(
                "Missing mapper parameter 'entity_kind'")

        entity_kind_name = params["entity_kind"]
        entity_kind = util.for_name(entity_kind_name)
        shard_count = mapper_spec.shard_count
        batch_size = int(params.get("batch_size", cls._BATCH_SIZE))
        # NOTE(review): int() on the raw param -- if callers ever pass the
        # string "True"/"False" here this raises ValueError; confirm the
        # param is always absent, bool, or numeric.
        keys_only = int(params.get("keys_only", False))

        ds_query = entity_kind.all()._get_query()
        ds_query.Order("__key__")
        first_entity = ds_query.Get(1)
        if not first_entity:
            # Empty kind: nothing to shard.
            return []
        else:
            first_entity_key = first_entity[0].key()

        # Flip to descending key order to locate the end of the keyspace.
        ds_query.Order(("__key__", datastore.Query.DESCENDING))
        try:
            last_entity = ds_query.Get(1)
            last_entity_key = last_entity[0].key()
        except db.NeedIndexError, e:
            logging.warning(
                "Cannot create accurate approximation of keyspace, "
                "guessing instead. Please address this problem: %s", e)
            last_entity_key = key_range.KeyRange.guess_end_key(
                entity_kind.kind(), first_entity_key)

        full_keyrange = key_range.KeyRange(
            first_entity_key, last_entity_key, None, True, True)
        key_ranges = [full_keyrange]
        # Repeated bisection: the final number of ranges is
        # 2**floor(log2(shard_count)), so the actual shard count may be
        # lower than requested.
        number_of_half_splits = int(math.floor(math.log(shard_count, 2)))
        for _ in range(0, number_of_half_splits):
            new_ranges = []
            for r in key_ranges:
                new_ranges += r.split_range(1)
            key_ranges = new_ranges
        return [
            DatastoreInputReader(entity_kind_name, r, batch_size, keys_only)
            for r in key_ranges
        ]