def testKeyRangesFromList(self):
    """KeyRanges built by create_from_list match the source list."""
    source_ranges = [
        key_range.KeyRange(db.Key.from_path("TestEntity", entity_id))
        for entity_id in (1, 2, 3)
    ]
    # Hand the factory its own copy so the local list stays the reference.
    kranges = key_ranges.KeyRangesFactory.create_from_list(
        list(source_ranges))
    self._assertEqualsAndSerialize(source_ranges, kranges)
def testSplitNSByScatter_LotsOfData(self):
    """Split lots of data for each shard."""
    ns = "google"
    testutil._create_entities(
        range(100), {"80": 80, "50": 50, "30": 30, "10": 10}, ns=ns)

    def make_range(start_name, end_name):
        # A missing name means the range is open on that side; only ranges
        # with an explicit start include their start key.
        start = None
        if start_name is not None:
            start = testutil.key(start_name, namespace=ns)
        end = None
        if end_name is not None:
            end = testutil.key(end_name, namespace=ns)
        return key_range.KeyRange(key_start=start,
                                  key_end=end,
                                  direction="ASC",
                                  include_start=start_name is not None,
                                  include_end=False,
                                  namespace=ns,
                                  _app=self.appid)

    expected = [
        make_range(None, "30"),
        make_range("30", "80"),
        make_range("80", None),
    ]
    self._assertEquals_splitNSByScatter(3, expected, ns=ns)
def testSplitNSByScatter_NotEnoughData2(self):
    """Splits should not intersect, if there's not enough data for each."""
    testutil._create_entities(range(10), {"2": 2, "4": 4})

    def make_range(start_name, end_name):
        # A missing name means the range is open on that side; only ranges
        # with an explicit start include their start key.
        start = None if start_name is None else testutil.key(start_name)
        end = None if end_name is None else testutil.key(end_name)
        return key_range.KeyRange(key_start=start,
                                  key_end=end,
                                  direction="ASC",
                                  include_start=start_name is not None,
                                  include_end=False,
                                  namespace="",
                                  _app=self.appid)

    # Four shards are requested but only three ranges are expected; the
    # fourth slot is None.
    expected = [
        make_range(None, "2"),
        make_range("2", "4"),
        make_range("4", None),
        None,
    ]
    self._assertEquals_splitNSByScatter(4, expected)
def testKeyRangesFromNSRange(self):
    """create_from_ns_range gives one KeyRange per populated namespace."""
    namespaces = ["1", "3", "5"]
    self.create_entities_in_multiple_ns(namespaces)

    ns_range = namespace_range.NamespaceRange("0", "5", _app=self.app)
    kranges = key_ranges.KeyRangesFactory.create_from_ns_range(ns_range)

    expected = [
        key_range.KeyRange(namespace=ns, _app=self.app)
        for ns in ("1", "3", "5")
    ]
    self._assertEqualsAndSerialize(expected, kranges)
def _split_input_from_namespace(cls, app, namespace, entity_kind_name,
                                shard_count):
    """Return KeyRange objects. Helper for _split_input_from_params.

    Keys are sampled with a query ordered by ``__scatter__``, sorted, and
    thinned to at most ``shard_count - 1`` split points. The namespace is
    then cut into consecutive, non-overlapping ascending ranges at those
    points.

    Args:
      app: app id passed through to the datastore query and to every
        returned KeyRange.
      namespace: namespace to split.
      entity_kind_name: entity kind name; reduced with util.get_short_name
        before querying.
      shard_count: requested number of shards.

    Returns:
      A list of key_range.KeyRange objects. It holds a single range covering
      the whole namespace when shard_count is 1 or no keys were sampled;
      otherwise one more range than the number of split points used.
    """
    raw_entity_kind = util.get_short_name(entity_kind_name)

    if shard_count == 1:
        # With one shard there's no need to calculate any split points.
        return [key_range.KeyRange(namespace=namespace, _app=app)]

    ds_query = datastore.Query(kind=raw_entity_kind,
                               namespace=namespace,
                               _app=app,
                               keys_only=True)
    ds_query.Order("__scatter__")
    random_keys = ds_query.Get(shard_count * cls._OVERSAMPLING_FACTOR)

    if not random_keys:
        # Nothing was sampled, so there is nothing to split on.
        return [key_range.KeyRange(namespace=namespace, _app=app)]

    random_keys.sort()

    split_points_count = shard_count - 1
    if len(random_keys) > split_points_count:
        # Pick evenly spaced samples. Floor division keeps the index an
        # integer on Python 3 as well as Python 2.
        random_keys = [
            random_keys[len(random_keys) * i // split_points_count]
            for i in range(split_points_count)
        ]

    # Every range carries _app=app, like the single-range returns above.
    ranges = [
        key_range.KeyRange(key_start=None,
                           key_end=random_keys[0],
                           direction=key_range.KeyRange.ASC,
                           include_start=False,
                           include_end=False,
                           namespace=namespace,
                           _app=app)
    ]

    for i in range(0, len(random_keys) - 1):
        ranges.append(
            key_range.KeyRange(key_start=random_keys[i],
                               key_end=random_keys[i + 1],
                               direction=key_range.KeyRange.ASC,
                               include_start=True,
                               include_end=False,
                               namespace=namespace,
                               _app=app))

    ranges.append(
        key_range.KeyRange(key_start=random_keys[-1],
                           key_end=None,
                           direction=key_range.KeyRange.ASC,
                           include_start=True,
                           include_end=False,
                           namespace=namespace,
                           _app=app))

    return ranges
def _iter_ns_range(self):
    """Iterates over self._ns_range, delegating to self._iter_key_range().

    Yields:
      The second element of each (key, object) pair produced by
      self._iter_key_range(), one namespace at a time.
    """
    while True:
        if self._current_key_range is None:
            # No namespace is in progress: fetch the next one in the range.
            query = self._ns_range.make_datastore_query()
            namespace_result = query.Get(1)
            if not namespace_result:
                # The namespace query returned nothing; iteration is over.
                break

            # Fall back to the empty string when name() is falsy.
            namespace = namespace_result[0].name() or ""
            self._current_key_range = key_range.KeyRange(
                namespace=namespace, _app=self._ns_range.app)

        # _iter_key_range gets a deep copy, so advancing
        # self._current_key_range below does not touch the object it uses.
        for key, o in self._iter_key_range(
                copy.deepcopy(self._current_key_range)):
            # The stored range is advanced before the value is yielded.
            self._current_key_range.advance(key)
            yield o

        # Stop after a single-namespace range or once the namespace just
        # processed is the end of the range.
        if (self._ns_range.is_single_namespace or
                self._current_key_range.namespace ==
                self._ns_range.namespace_end):
            break
        # Narrow the namespace range to start after the finished namespace
        # and clear the key range so the next pass fetches a new namespace.
        self._ns_range = self._ns_range.with_start_after(
            self._current_key_range.namespace)
        self._current_key_range = None
def next(self):
    """Return a KeyRange covering the next namespace of the iteration.

    Returns:
      A key_range.KeyRange for the namespace just taken from self._iter,
      built with the app of the current namespace range.

    Raises:
      StopIteration: if the namespace range has already been exhausted
        (self._ns_range is None), or when self._iter itself is exhausted.
    """
    if self._ns_range is None:
        raise StopIteration()

    self._last_ns = self._iter.next()
    # Read the app before self._ns_range may be cleared below. Reading it
    # afterwards raised AttributeError on the final namespace.
    app = self._ns_range.app
    if self._last_ns == self._ns_range.namespace_end:
        # The final namespace was reached; later calls raise StopIteration.
        self._ns_range = None
    return key_range.KeyRange(namespace=self._last_ns, _app=app)
def __next__(self):
    """Return a KeyRange covering the next namespace of the iteration.

    Raises:
      StopIteration: if the namespace range has already been exhausted
        (self._ns_range is None), or when self._iter itself is exhausted.
    """
    if self._ns_range is None:
        raise StopIteration()

    namespace = next(self._iter)
    self._last_ns = namespace

    # Keep a handle on the range: the attribute may be cleared just below,
    # but its app is still needed for the returned KeyRange.
    active_range = self._ns_range
    if namespace == active_range.namespace_end:
        self._ns_range = None
    return key_range.KeyRange(namespace=namespace, _app=active_range.app)
def _create_iter(self, iter_cls, entity_kind):
    """Build a key-ranges iterator with one KeyRange per namespace.

    Args:
      iter_cls: iterator class handed to
        RangeIteratorFactory.create_key_ranges_iterator.
      entity_kind: entity kind path; used as the model class path and, via
        util.get_short_name, as the query's entity kind.

    Returns:
      The iterator created by RangeIteratorFactory.
    """
    per_namespace = []
    for ns in self.namespaces:
        per_namespace.append(key_range.KeyRange(namespace=ns))
    kranges = key_ranges.KeyRangesFactory.create_from_list(per_namespace)

    spec = model.QuerySpec(entity_kind=util.get_short_name(entity_kind),
                           batch_size=10,
                           filters=self.filters,
                           model_class_path=entity_kind)
    return db_iters.RangeIteratorFactory.create_key_ranges_iterator(
        kranges, spec, iter_cls)
def _split_input_from_params(cls, app, namespaces, entity_kind_name, params,
                             shard_count):
    """Split input via the parent class, with a one-reader fallback.

    When the parent implementation returns no readers, a single reader is
    created that covers each given namespace with an unbounded KeyRange.

    Returns:
      The parent's readers when there are any, otherwise a one-element list
      holding the fallback reader.
    """
    parent = super(ConsistentKeyReader, cls)
    readers = parent._split_input_from_params(app, namespaces,
                                              entity_kind_name, params,
                                              shard_count)
    if readers:
        return readers

    whole_namespaces = []
    for namespace in namespaces:
        whole_namespaces.append(
            key_range.KeyRange(namespace=namespace, _app=app))
    return [cls(entity_kind_name, whole_namespaces)]
def _create_iter(self, iter_cls, entity_kind):
    """Instantiate iter_cls over the key names "0" through "999".

    Both bounds are inclusive and live in self.namespace.

    Args:
      iter_cls: iterator class, called with (key range, query spec).
      entity_kind: entity kind path; used as the model class path and, via
        util.get_short_name, as the kind of the boundary keys and query.

    Returns:
      The iter_cls instance.
    """
    short_kind = util.get_short_name(entity_kind)
    ns = self.namespace

    lower_bound = db.Key.from_path(short_kind, "0", namespace=ns)
    upper_bound = db.Key.from_path(short_kind, "999", namespace=ns)
    krange = key_range.KeyRange(lower_bound,
                                upper_bound,
                                include_start=True,
                                include_end=True,
                                namespace=ns)

    spec = model.QuerySpec(entity_kind=short_kind,
                           batch_size=10,
                           filters=self.filters,
                           model_class_path=entity_kind)
    return iter_cls(krange, spec)
def _split_ns_by_scatter(cls, shard_count, namespace, raw_entity_kind, app):
    """Split a namespace by scatter index into key_range.KeyRange.

    TODO: Power this with key_range.KeyRange.compute_split_points.

    Args:
      shard_count: number of shards.
      namespace: namespace name to split. str.
      raw_entity_kind: low level datastore API entity kind.
      app: app id in str.

    Returns:
      A list of key_range.KeyRange objects. If there are not enough entities
      to split into the requested number of shards, the returned list will
      contain KeyRanges ordered lexicographically with any Nones appearing
      at the end.
    """
    if shard_count == 1:
        # With one shard there's no need to calculate any split points.
        return [key_range.KeyRange(namespace=namespace, _app=app)]

    scatter_query = datastore.Query(kind=raw_entity_kind,
                                    namespace=namespace,
                                    _app=app,
                                    keys_only=True)
    scatter_query.Order("__scatter__")
    oversampling_factor = 32
    sampled_keys = scatter_query.Get(shard_count * oversampling_factor)

    if not sampled_keys:
        # Nothing to split on: one range for the whole namespace, padded
        # with None up to shard_count entries.
        padding = [None] * (shard_count - 1)
        return [key_range.KeyRange(namespace=namespace, _app=app)] + padding

    sampled_keys.sort()
    if len(sampled_keys) >= shard_count:
        sampled_keys = cls._choose_split_points(sampled_keys, shard_count)

    def ascending_range(start, end, include_start):
        # All produced ranges are ascending and exclude their end key.
        return key_range.KeyRange(key_start=start,
                                  key_end=end,
                                  direction=key_range.KeyRange.ASC,
                                  include_start=include_start,
                                  include_end=False,
                                  namespace=namespace,
                                  _app=app)

    k_ranges = [ascending_range(None, sampled_keys[0], False)]
    for lower, upper in zip(sampled_keys, sampled_keys[1:]):
        k_ranges.append(ascending_range(lower, upper, True))
    k_ranges.append(ascending_range(sampled_keys[-1], None, True))

    shortfall = shard_count - len(k_ranges)
    if shortfall > 0:
        # Pad with None so the caller still gets shard_count entries.
        k_ranges.extend([None] * shortfall)
    return k_ranges
class DatastoreInputReader(InputReader):
    """Represents a range in query results.

    DatastoreInputReader yields model instances from the entities in a given
    key range. Iterating over DatastoreInputReader changes its range past
    consumed entries.

    The class shouldn't be instantiated directly. Use the split_input class
    method instead.
    """

    # Default number of entities fetched per datastore query.
    _BATCH_SIZE = 50

    _MAX_SHARD_COUNT = 256

    # Parameter-name constants.
    ENTITY_KIND_PARAM = "entity_kind"
    KEYS_ONLY_PARAM = "keys_only"
    BATCH_SIZE_PARAM = "batch_size"
    KEY_RANGE_PARAM = "key_range"

    def __init__(self, entity_kind, key_ranges, batch_size=_BATCH_SIZE):
        """Create new DatastoreInputReader object.

        This is an internal constructor. Use split_input instead.

        Args:
          entity_kind: entity kind as string.
          key_ranges: a sequence of key_range.KeyRange instances to process.
          batch_size: size of read batch as int.
        """
        self._entity_kind = entity_kind
        # Stored reversed so the range being processed is always the last
        # element and a finished range can be dropped with pop().
        self._key_ranges = list(reversed(key_ranges))
        self._batch_size = int(batch_size)

    def __iter__(self):
        """Create a generator for model instances for entities.

        Iterating through entities moves query range past the consumed
        entities.

        Yields:
          next model instance.
        """
        while self._current_key_range is not None:
            while True:
                query = self._current_key_range.make_ascending_query(
                    util.for_name(self._entity_kind))
                results = query.fetch(limit=self._batch_size)

                if not results:
                    # Current range is exhausted; move on to the next one.
                    self._advance_key_range()
                    break

                for model_instance in results:
                    key = model_instance.key()
                    # Advance the range before yielding the instance.
                    self._current_key_range.advance(key)
                    yield model_instance

    @property
    def _current_key_range(self):
        """The key range being processed, or None when none are left."""
        if self._key_ranges:
            return self._key_ranges[-1]
        else:
            return None

    def _advance_key_range(self):
        """Drop the current key range, if there is one."""
        if self._key_ranges:
            self._key_ranges.pop()

    @classmethod
    def _split_input_from_namespace(cls, app, namespace, entity_kind_name,
                                    shard_count):
        """Return KeyRange objects. Helper for _split_input_from_params.

        Finds the first and last keys of the kind in the namespace, builds
        one range spanning them (both ends inclusive), and halves every
        range floor(log2(shard_count)) times.

        Args:
          app: app id passed to the datastore query and the KeyRange.
          namespace: namespace to split.
          entity_kind_name: entity kind name; reduced with
            util.get_short_name before querying.
          shard_count: requested number of shards.

        Returns:
          A list of key_range.KeyRange objects; empty when no entity of the
          kind could be retrieved.
        """
        raw_entity_kind = util.get_short_name(entity_kind_name)

        ds_query = datastore.Query(kind=raw_entity_kind,
                                   namespace=namespace,
                                   _app=app,
                                   keys_only=True)
        ds_query.Order("__key__")
        first_entity_key_list = ds_query.Get(1)
        if not first_entity_key_list:
            logging.warning("Could not retrieve an entity of type %s.",
                            raw_entity_kind)
            return []
        first_entity_key = first_entity_key_list[0]

        ds_query.Order(("__key__", datastore.Query.DESCENDING))
        try:
            last_entity_key, = ds_query.Get(1)
        except db.NeedIndexError as e:
            # The descending query needs an index that is missing; fall
            # back to guessing the end key from the first key.
            logging.warning(
                "Cannot create accurate approximation of keyspace, "
                "guessing instead. Please address this problem: %s", e)
            last_entity_key = key_range.KeyRange.guess_end_key(
                raw_entity_kind, first_entity_key)

        full_keyrange = key_range.KeyRange(first_entity_key,
                                           last_entity_key,
                                           None,
                                           True,
                                           True,
                                           namespace=namespace,
                                           _app=app)
        key_ranges = [full_keyrange]

        # Each pass splits every range once more, so the number of ranges
        # grows by the factor split_range(1) produces per range.
        number_of_half_splits = int(math.floor(math.log(shard_count, 2)))
        for _ in range(0, number_of_half_splits):
            new_ranges = []
            for r in key_ranges:
                new_ranges += r.split_range(1)
            key_ranges = new_ranges
        return key_ranges
def testToKeyRangesByShard_UnevenNamespaces(self):
    """Namespaces with 1, 2 and 3 split keys are spread over 3 shards."""
    namespaces = [str(i) for i in range(3)]
    testutil._create_entities(range(10), {"5": 5}, namespaces[0])
    testutil._create_entities(range(10), {"5": 5, "6": 6}, namespaces[1])
    testutil._create_entities(range(10), {"5": 5, "6": 6, "7": 7},
                              namespaces[2])
    shards = 3

    def make_range(ns, start_name, end_name):
        # A missing name means the range is open on that side; only ranges
        # with an explicit start include their start key.
        start = None
        if start_name is not None:
            start = testutil.key(start_name, namespace=ns)
        end = None
        if end_name is not None:
            end = testutil.key(end_name, namespace=ns)
        return key_range.KeyRange(key_start=start,
                                  key_end=end,
                                  direction="ASC",
                                  include_start=start_name is not None,
                                  include_end=False,
                                  namespace=ns,
                                  _app=self.appid)

    expected = [
        # shard 1
        make_range("0", None, "5"),
        make_range("1", None, "5"),
        make_range("2", None, "6"),
        # shard 2
        make_range("0", "5", None),
        make_range("1", "5", "6"),
        make_range("2", "6", "7"),
        # shard 3
        make_range("1", "6", None),
        make_range("2", "7", None),
    ]

    query_spec = model.QuerySpec(entity_kind="TestEntity")
    kranges_by_shard = self.reader_cls._to_key_ranges_by_shard(
        self.appid, namespaces, shards, query_spec)
    self.assertEquals(shards, len(kranges_by_shard))

    # Compare as sorted flat lists, so the per-shard grouping is not checked.
    expected.sort()
    results = []
    for kranges in kranges_by_shard:
        results.extend(list(kranges))
    results.sort()
    self.assertEquals(expected, results)