def testGetData(self):
  data = [("a", "aa"), ("b", "bb")]
  records = RecordIORecords()
  records.insert(data[0])
  records.insert(data[1])
  records = RecordIORecords(records.get_data())
  self.assertEqual(list(records), data)
def testInsertSplitDataSmallToBig(self):
  records = RecordIORecords()
  self.insertABC(records)
  records.insert(("b", 0, 3, 3, "bb"))
  records.insert(("b", 1, 3, 3, "bb"))
  records.insert(("b", 2, 3, 3, "bb"))
  self.assertEqual(
      [("a", "aa"),
       ("b", 0, 3, 3, "bb"),
       ("b", 1, 3, 3, "bb"),
       ("b", 2, 3, 3, "bb"),
       ("c", "cc")],
      list(records))
def testSplit(self):
  records = RecordIORecords()
  self.insertABC(records)
  records.insert(("d", "dd"))
  records.insert(("e", "ee"))
  lo, hi, middle = records.split()
  lo = RecordIORecords(lo)
  hi = RecordIORecords(hi)
  self.assertEqual(middle, ("d", "dd"))
  self.assertEqual([("a", "aa"), ("b", "bb"), ("c", "cc")], list(lo))
  self.assertEqual([("d", "dd"), ("e", "ee")], list(hi))
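# testSplit above shows split() returning the lo half, the hi half, and the
# first entry of the hi half as the middle key. A count-based stand-in for
# illustration only (an assumption: the real implementation splits on
# serialized data size, not on entry count):
def split_by_count(entries):
  """Splits sorted entries into (lo_half, hi_half, first_entry_of_hi)."""
  mid = (len(entries) + 1) // 2
  return entries[:mid], entries[mid:], entries[mid]

lo, hi, middle = split_by_count(
    [("a", "aa"), ("b", "bb"), ("c", "cc"), ("d", "dd"), ("e", "ee")])
assert middle == ("d", "dd")
assert lo == [("a", "aa"), ("b", "bb"), ("c", "cc")]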
def testDelete(self):
  records = RecordIORecords()
  self.insertABC(records)
  self.assertTrue(records.insert(("b",)))
  records.insert(("b", "bb"))
  self.assertEqual([("a", "aa"), ("b", "bb"), ("c", "cc")], list(records))
  self.assertTrue(records.insert(("b",)))
  self.assertFalse(records.insert(("d",)))
  records = RecordIORecords(records.get_data())
  self.assertEqual([("a", "aa"), ("c", "cc")], list(records))
def testInsertNotDeleted(self):
  records = RecordIORecords()
  records.insert(("a", "aa"))
  records.insert(("b", 0, 3, 3, "bb"))
  records.insert(("b", 0, 2, 2, "bb"))
  other = RecordIORecords(records.get_data())
  self.assertEqual([("a", "aa"), ("b", 0, 2, 2, "bb")], list(other))
  self.assertEqual([("b", 1, 3, 3), ("b", 2, 3, 3)],
                   list(records.not_deleted()))
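# Large values are stored as split-entry tuples of the form
# (key, index, total, version, value_chunk); overwriting with a version that
# has fewer chunks leaves stale siblings behind, which not_deleted() reports.
# A minimal sketch of the chunking arithmetic, with a hypothetical
# MAX_ENTRY_SIZE (the real constant is much larger):
import math

MAX_ENTRY_SIZE = 4  # hypothetical chunk size

def split_value(key, value, version):
  """Yields (key, index, total, version, chunk) tuples for one large value."""
  total = int(math.ceil(1.0 * len(value) / MAX_ENTRY_SIZE))
  for i in xrange(total):
    yield (key, i, total, version,
           value[i * MAX_ENTRY_SIZE:(i + 1) * MAX_ENTRY_SIZE])

# Ten bytes split into chunks of four gives three chunks, indexed 0..2.
assert len(list(split_value("b", "b" * 10, 3))) == 3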
def zip_chunk_comperator(a, b):
  """Compares two zipped chunks.

  :param a: zipped chunk tuple (see class definition).
  :param b: zipped chunk tuple (see class definition).
  :return: -1, 0 or 1
  """
  a_lo, a_hi = a[:2]
  b_lo, b_hi = b[:2]
  if RecordIORecords.entry_comperator(a_hi, b_lo) == -1:
    return -1
  elif RecordIORecords.entry_comperator(a_lo, b_hi) == 1:
    return 1
  return 0
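# zip_chunk_comperator treats two chunks as equal whenever their key ranges
# overlap, which is what lets bisect_left find the chunk an entry falls into.
# A self-contained sketch of the same three-way interval comparison, using
# plain tuple ordering in place of RecordIORecords.entry_comperator (an
# assumption; the real comparator also normalizes split entries):
def interval_cmp(a, b):
  """Three-way compare of (lo, hi) chunk ranges; 0 means they overlap."""
  a_lo, a_hi = a[:2]
  b_lo, b_hi = b[:2]
  if a_hi < b_lo:
    return -1
  if a_lo > b_hi:
    return 1
  return 0

assert interval_cmp((("a",), ("c",)), (("b",), ("b",))) == 0   # overlap
assert interval_cmp((("a",), ("a",)), (("b",), ("d",))) == -1  # before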
def insert(self, entry):
  """Inserts an entry tuple into the RecordIORecords.

  :param entry: An entry tuple
  """
  pos = bisect_left(self.zipped_chunks_, (entry, entry),
                    comperator=self.zip_chunk_comperator)
  if ((pos < len(self.zipped_chunks_) and
       self.zip_chunk_comperator(self.zipped_chunks_[pos],
                                 (entry, entry)) == 0) or
      self.is_entry_deleted(entry)):
    RecordIORecords.insert(self, entry)
  else:
    self.zipped_chunks_.insert(
        pos,
        (entry[:-1], entry[:-1],
         zlib.compress(marshal.dumps([entry], MARSHAL_VERSION),
                       COMPRESSION_LEVEL_MIN)))
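# Each zipped chunk is a (lo_key, hi_key, payload) tuple whose payload is a
# zlib-compressed marshal dump of a list of entry tuples. A round-trip of
# that payload format, assuming MARSHAL_VERSION = 2 and
# COMPRESSION_LEVEL_MIN = 1 (both values are assumptions):
import marshal
import zlib

MARSHAL_VERSION = 2
COMPRESSION_LEVEL_MIN = 1

entry = ("b", "bb")
payload = zlib.compress(marshal.dumps([entry], MARSHAL_VERSION),
                        COMPRESSION_LEVEL_MIN)
assert marshal.loads(zlib.decompress(payload)) == [entry]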
def testInRange(self):
  self.assertTrue(RecordIORecords.in_range(("a",)))
  self.assertTrue(RecordIORecords.in_range(("a",), lo=("a",)))
  self.assertTrue(RecordIORecords.in_range(("a",), hi=("b",)))
  self.assertTrue(RecordIORecords.in_range(("b",), lo=("a",), hi=("c",)))
  self.assertTrue(RecordIORecords.in_range(("a",), lo=("a",), hi=("b",)))
  self.assertFalse(RecordIORecords.in_range(("a",), lo=("b",)))
  self.assertFalse(RecordIORecords.in_range(("b",), hi=("b",)))
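# The test above pins down the interval semantics: lo is inclusive, hi is
# exclusive, and None means unbounded. A minimal sketch of that half-open
# [lo, hi) check, using plain tuple comparison in place of entry_comperator
# (an assumption):
def in_range(entry, lo=None, hi=None):
  """True if entry falls into the half-open interval [lo, hi)."""
  if lo is not None and entry < lo:
    return False
  if hi is not None and entry >= hi:
    return False
  return True

assert in_range(("a",), lo=("a",), hi=("b",))  # lo is inclusive
assert not in_range(("b",), hi=("b",))         # hi is exclusive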
def testInsertGetAndRead(self):
  records = RecordIORecords()
  self.insertABC(records)
  self.assertEqual(len(records), 3)
  self.assertEqual([("a", "aa"), ("b", "bb"), ("c", "cc")], list(records))
  records.insert(("b", "new"))
  self.assertEqual(len(records), 3)
  self.assertEqual(records["b"], ("b", "new"))
  self.assertTrue("a" in records)
  self.assertFalse("z" in records)
  records.insert(("b", "bb"))
  self.assertEqual([("a", "aa"), ("b", "bb"), ("c", "cc")],
                   list(records.read()))
  self.assertEqual([("a", "aa"), ("b", "bb"), ("c", "cc")],
                   list(records.read(("",), ("d",))))
  self.assertEqual([("b", "bb")], list(records.read(("b",), ("c",))))
def read_entries_(self, start_key=None, end_key=None):
  """An internal helper function to read split entries.

  :param start_key: An entry tuple (no value needed)
  :param end_key: An entry tuple (no value needed). Exclusive.
  :return: Yields key, split_values
  """
  # TODO(andrin): fetch a couple of shards instead of just one, based on a
  #               method argument
  current_key = start_key
  if current_key is None:
    current_key = ("",)
  limit_shard_name = RecordIOShard.key_name(
      self.name, lo=start_key, hi=end_key).split(SPLIT_CHAR)
  while True:
    shard = RecordIOShard.get_shards_for_key_values(
        self.name, [current_key], keys_only=False).next()[0]
    self.db_search_and_get += 1
    if shard is None:
      raise RecordIOShardDoesNotExistError(self.name)
    hi = shard.lo_hi()[1]
    shard_name = shard.key().name().split(SPLIT_CHAR)
    if (shard_name[6:10] >= limit_shard_name[6:10] and
        (shard_name[2:5] < limit_shard_name[2:5] or
         limit_shard_name[2] == SPLIT_CHAR_AFTER)):
      # Read the whole shard.
      for entry in shard:
        yield entry
    else:
      # Read parts of the shard.
      for entry in shard.read(current_key, end_key):
        yield entry
    if hi is None:
      # Was the last shard.
      return
    current_key = hi
    if (end_key is not None and
        RecordIORecords.entry_comperator(current_key, end_key) >= 0):
      # Next shard is after end_key.
      return
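# read_entries_ walks the shard chain with a cursor: look up the shard that
# contains current_key, drain it, then resume from that shard's hi bound
# until hi is None. The same pattern over plain in-memory buckets, as a
# self-contained sketch (BUCKETS is hypothetical stand-in data):
BUCKETS = [(("",), ("c",), [("a", "aa"), ("b", "bb")]),
           (("c",), None, [("c", "cc"), ("d", "dd")])]

def read_all(start=("",)):
  current = start
  while True:
    lo, hi, items = next(b for b in BUCKETS
                         if b[0] <= current and
                         (b[1] is None or current < b[1]))
    for entry in items:
      if entry[:1] >= current:
        yield entry
    if hi is None:
      return  # was the last bucket
    current = hi  # resume from this bucket's upper bound

assert list(read_all()) == [("a", "aa"), ("b", "bb"),
                            ("c", "cc"), ("d", "dd")]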
class RecordIOShard(db.Model):
  """Holds the actual data of a RecordIO as sharded datastore entries."""
  # The data from a RecordIORecords or RecordIORecordsZipped.
  data = db.BlobProperty()
  # Determines if it's a RecordIORecords or RecordIORecordsZipped.
  compressed = db.BooleanProperty(default=True, indexed=False)
  # The first shard is the index shard. Used for getting a list of all
  # RecordIO names.
  index = db.BooleanProperty()

  @staticmethod
  def entry_key(key):
    """Returns a tuple of escaped strings representing a RecordIO entry tuple.

    :param key: An entry tuple (no value needed)
    :return: A tuple of escaped strings.
    """
    if len(key) >= 4:
      str_key = [INTEGER_FMT % i for i in key[1:4]]
      return (binascii.hexlify(key[0]),) + tuple(str_key)
    return (binascii.hexlify(key[0]),
            INTEGER_FMT_0, INTEGER_FMT_1, INTEGER_FMT_0)

  @staticmethod
  def key_name(name, lo=None, hi=None):
    """Returns the datastore key name for a shard.

    :param name: The name of the RecordIO
    :param lo: The lo entry tuple.
    :param hi: The hi entry tuple.
    :return: String
    """
    if lo is None:
      lo = ("", INTEGER_FMT_0, INTEGER_FMT_1, INTEGER_FMT_0)
    else:
      lo = RecordIOShard.entry_key(lo)
    if hi is None:
      hi = (SPLIT_CHAR_AFTER, INTEGER_FMT_0, INTEGER_FMT_1, INTEGER_FMT_0)
    else:
      hi = RecordIOShard.entry_key(hi)
    return SPLIT_CHAR.join((urllib.quote(name), "0") + hi + lo)

  @staticmethod
  def create(name, lo=None, hi=None, compressed=True):
    """Creates a new RecordIOShard object (in memory).

    :param name: The name of the RecordIO
    :param lo: The lo entry tuple.
    :param hi: The hi entry tuple.
    :param compressed: Whether this RecordIO's data is zipped or not.
    :return: RecordIOShard
    """
    shard = RecordIOShard(key_name=RecordIOShard.key_name(name, lo, hi))
    shard.compressed = compressed
    if lo is None:
      shard.index = True
    return shard

  @staticmethod
  def get_name(key_name):
    """Returns the name of the RecordIO.

    :param key_name: A datastore key name
    :return: String
    """
    return urllib.unquote(key_name.split(SPLIT_CHAR, 1)[0])

  def name(self):
    """Returns the name of the RecordIO this shard belongs to.

    :return: String
    """
    return self.get_name(self.key().name())

  def init(self):
    """Initializes internal values."""
    if not hasattr(self, "records_"):
      if self.compressed:
        self.records_ = RecordIORecordsZipped(self.data)
      else:
        self.records_ = RecordIORecords(self.data)
      self.loHi_ = RecordIOShard.lo_hi_from_key(self.key().name())

  def commit(self):
    """Writes the data to datastore."""
    self.init()
    self.data = self.records_.get_data(max_size=MAX_BLOB_SIZE)
    if len(self.data) >= MAX_BLOB_SIZE:
      raise RecordIOShardTooBigError()
    self.put()

  def not_deleted(self):
    """Entries that need to be deleted in another shard.

    :return: A list of keys.
    """
    return self.records_.not_deleted()

  def __len__(self):
    """The number of records in this shard. Expensive if compressed.

    :return: int
    """
    self.init()
    return len(self.records_)

  def __getitem__(self, key):
    """Returns the value of an item.

    :param key: An entry tuple (no value needed)
    :return: Object
    """
    return self.records_[key][-1]

  def __iter__(self):
    """Yields all entry tuples.

    :return: Entry tuples
    """
    self.init()
    for entry in self.records_:
      yield entry

  def __contains__(self, x):
    """Checks whether an entry tuple key is part of this RecordIOShard.

    :param x: An entry tuple (no value needed)
    :return: Boolean
    """
    try:
      self[x]
      return True
    except Exception:
      return False

  def insert(self, entry):
    """Inserts an entry tuple into the RecordIOShard.

    :param entry: An entry tuple
    """
    self.init()
    assert self.records_.in_range(entry, self.loHi_[0], self.loHi_[1])
    self.records_.insert(entry)

  def read(self, start_key, end_key):
    """Reads through the records from start_key to end_key (exclusive).

    :param start_key: An entry tuple (no value needed)
    :param end_key: An entry tuple (no value needed)
    :return: Yields all entry tuples within the range.
    """
    self.init()
    for entry in self.records_.read(start_key, end_key):
      yield entry

  @staticmethod
  def iterate_records_(records):
    """Iterates over all records.

    :param records: A generator
    :return: A generator
    """
    for x in records:
      yield x

  @staticmethod
  def get_all_query(name, keys_only):
    """Returns a datastore query that returns all shards of a RecordIO.

    :param name: Name of the RecordIO
    :param keys_only: If this should be a keys-only query
    :return: A datastore query.
    """
    key_before = db.Key.from_path(
        "RecordIOShard",
        urllib.quote(name) + SPLIT_CHAR + "0" + SPLIT_CHAR)
    key_after = db.Key.from_path(
        "RecordIOShard",
        urllib.quote(name) + SPLIT_CHAR + "0" + SPLIT_CHAR +
        SPLIT_CHAR_AFTER + SPLIT_CHAR_AFTER)
    return RecordIOShard.all(keys_only=keys_only).filter(
        "__key__ >=", key_before).filter("__key__ <", key_after)

  @staticmethod
  def get_shards_for_key_values(name, records, keys_only=True):
    """Given a list of entries, returns the shards they belong to.

    :param name: The name of the RecordIO
    :param records: A list of entry tuples.
    :param keys_only: If only the keys should be returned.
    :return: Yields (shard name or shard, list of entries) tuples.
    """
    gen = RecordIOShard.iterate_records_(records)
    entry = None
    while True:
      if entry is None:
        try:
          entry = gen.next()
        except StopIteration:
          return
      key_before_name = RecordIOShard.key_name(name, hi=entry)
      key_before_name = key_before_name.split(SPLIT_CHAR)
      key_before_name[6] = SPLIT_CHAR_AFTER
      key_before_name = SPLIT_CHAR.join(key_before_name)
      if entry[0] == "":
        key_before_name = (key_before_name.split(SPLIT_CHAR)[0] +
                           SPLIT_CHAR + "0" + SPLIT_CHAR)
      key_before = db.Key.from_path("RecordIOShard", key_before_name)
      shard_obj = RecordIOShard.get_all_query(
          name, keys_only=keys_only).filter("__key__ >", key_before).get()
      if shard_obj is None:
        yield None, [entry] + list(gen)
        return
      shard_key = None
      key_result = shard_obj
      if keys_only:
        shard_key = shard_obj.name()
        key_result = shard_key
      else:
        shard_key = shard_obj.key().name()
      lo, hi = RecordIOShard.lo_hi_from_key(shard_key)
      result = []
      try:
        while entry and not RecordIORecords.in_range(entry, lo, hi):
          result.append(entry)
          entry = gen.next()
      except StopIteration:
        entry = None
      if result:
        yield None, result
      result = []
      try:
        while entry and RecordIORecords.in_range(entry, lo, hi):
          result.append(entry)
          entry = gen.next()
      except StopIteration:
        entry = None
      if result:
        yield key_result, result

  def split(self):
    """Splits a RecordIOShard into two smaller shards.

    :return: lo_shard, hi_shard
    """
    self.init()
    name = self.name()
    original_lo, original_hi = self.lo_hi()
    lo_data, hi_data, middle = self.records_.split()
    middle_key = middle[0:4]
    if len(middle_key) == 2:
      middle_key = middle[0:1]
    lo_shard = RecordIOShard.create(name, original_lo, middle_key)
    lo_shard.data = lo_data
    lo_shard.compressed = self.compressed
    hi_shard = RecordIOShard.create(name, middle_key, original_hi)
    hi_shard.data = hi_data
    hi_shard.compressed = self.compressed
    return lo_shard, hi_shard

  @staticmethod
  def lo_hi_from_key(key_name):
    """Given a datastore key name, returns the lo, hi entry tuples.

    :param key_name: String
    :return: (lo, hi) entry tuples
    """
    lo = key_name.split(SPLIT_CHAR)[6:10]
    if lo[0]:
      lo = [binascii.unhexlify(lo[0])] + [int(x) for x in lo[1:]]
      lo = tuple(lo)
    else:
      lo = None
    hi = key_name.split(SPLIT_CHAR)[2:6]
    if hi[0] != SPLIT_CHAR_AFTER:
      hi = [binascii.unhexlify(hi[0])] + [int(x) for x in hi[1:]]
      hi = tuple(hi)
    else:
      hi = None
    return lo, hi

  def lo_hi(self):
    """Returns the lo, hi entry tuples of a shard.

    :return: (lo, hi) entry tuples
    """
    self.init()
    return self.loHi_
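# A shard's key name packs the quoted RecordIO name, a literal "0", the
# hex-escaped hi key, then the lo key, joined by SPLIT_CHAR, so datastore
# key order matches entry order. A stand-alone round-trip of that scheme
# with assumed constant values (SPLIT_CHAR, SPLIT_CHAR_AFTER and INTEGER_FMT
# are not defined in this code, so the values below are hypothetical):
import binascii
import urllib

SPLIT_CHAR = "!"        # assumed; must not survive urllib.quote()
SPLIT_CHAR_AFTER = '"'  # assumed; the character right after SPLIT_CHAR
INTEGER_FMT = "%010d"   # assumed zero-padded integer format
INTEGER_FMT_0 = INTEGER_FMT % 0
INTEGER_FMT_1 = INTEGER_FMT % 1

def key_name(name, lo, hi):
  """Builds a shard key name: quoted name, "0", escaped hi, escaped lo."""
  def escape(key):
    return (binascii.hexlify(key[0]), INTEGER_FMT_0, INTEGER_FMT_1,
            INTEGER_FMT_0)
  return SPLIT_CHAR.join((urllib.quote(name), "0") + escape(hi) + escape(lo))

def lo_from_key(key_name_str):
  parts = key_name_str.split(SPLIT_CHAR)[6:10]
  return (binascii.unhexlify(parts[0]),) + tuple(int(x) for x in parts[1:])

assert lo_from_key(key_name("my recordio", ("a",), ("b",)))[0] == "a"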
def testComperator(self):
  self.assertEqual(RecordIORecords.entry_comperator(("a",), ("b",)), -1)
  self.assertEqual(RecordIORecords.entry_comperator(("b",), ("b",)), 0)
  self.assertEqual(RecordIORecords.entry_comperator(("c",), ("b",)), 1)
  self.assertEqual(RecordIORecords.entry_comperator(("b", "bb"), ("b",)), 0)
  self.assertEqual(
      RecordIORecords.entry_comperator(("b", 0, 1, 1, "bb"), ("b",)), 0)
  self.assertEqual(
      RecordIORecords.entry_comperator(("b",), ("b", 0, 1, 1, "bb")), 0)
  self.assertEqual(
      RecordIORecords.entry_comperator(("b", 0, 1, 1, "bb"), ("b", "bb")), 0)
  self.assertEqual(
      RecordIORecords.entry_comperator(("b", "bb"), ("b", 0, 1, 1, "bb")), 0)
  self.assertEqual(
      RecordIORecords.entry_comperator(("b", 0, 1, 1, "bb"),
                                       ("b", 0, 1, 2, "bb")), 0)
  self.assertEqual(
      RecordIORecords.entry_comperator(("b", 1, 2, 1, "bb"),
                                       ("b", 1, 2, 2, "bb")), -1)
  self.assertEqual(
      RecordIORecords.entry_comperator(("b", 1, 2, 1, "bb"),
                                       ("b", 1, 2, 1, "bb")), 0)
  self.assertEqual(
      RecordIORecords.entry_comperator(("b", 1, 3, 1, "bb"),
                                       ("b", 1, 2, 1, "bb")), 1)
def testInsertSplitDataBigToSmall(self):
  records = RecordIORecords()
  records.insert(("a", "aa"))
  records.insert(("b", 0, 3, 3, "bb"))
  records.insert(("b", 1, 3, 3, "bb"))
  records.insert(("b", 2, 3, 3, "bb"))
  records.insert(("c", "cc"))
  records.insert(("b", "bb"))
  records = RecordIORecords(records.get_data())
  self.assertEqual([("a", "aa"), ("b", "bb"), ("c", "cc")], list(records))
class RecordIOWriter(object):
  """This class allows you to write data to a RecordIO."""

  def __init__(self, name):
    """Creates a RecordIOWriter.

    :param name: The name of the RecordIO. The urllib-quoted name must not
        be longer than 64 characters.
    """
    if len(urllib.quote(name)) > MAX_KEY_LENGTH:
      raise ValueError(
          "Max urllib.quote(name) length is %d: len('%s') is %d" %
          (MAX_KEY_LENGTH, name, len(urllib.quote(name))))
    self.name = name
    self.updates = RecordIORecords()
    self.pending_worker_tasks = []
    self.db_search = 0
    self.db_get = 0
    self.db_put = 0

  def create(self, compressed=True, pre_split=None):
    """Creates a RecordIO in datastore. If the RecordIO exists, nothing
    happens.

    :param compressed: Boolean, whether the data in the RecordIO should be
        compressed.
    :param pre_split: An optional list of keys that should be used to
        pre-split the internal data shards. This only makes sense if you are
        going to write a lot of data and you already know the key range of
        the data and roughly how many entries fit into one shard.
    :return: True, if the RecordIO didn't exist before.
    """
    self.db_search += 1
    if RecordIOShard.get_all_query(self.name, keys_only=True).get() is None:
      pre_split = sorted(pre_split) if pre_split else []
      self.db_put += 1
      split = [None] + [(x,) for x in pre_split] + [None]
      split = [(split[i], split[i + 1]) for i in xrange(len(split) - 1)]
      for lo, hi in split:
        index = None
        if lo is None:
          index = True
        RecordIOShard.get_or_insert(
            RecordIOShard.key_name(self.name, lo=lo, hi=hi),
            compressed=compressed, index=index)
      return True
    return False

  def delete(self):
    """Deletes a RecordIO.

    Modifying RecordIOs or applying queued writes may result in errors
    during deletions.
    """
    db.delete(RecordIOShard.get_all_query(self.name, keys_only=True))

  def insert(self, key, value):
    """Assigns a value to a given key. Overwrites existing values with the
    same key.

    :param key: Must be a string and must not be longer than 64 characters.
    :param value: Values can be of any type that is pickleable (anything you
        can put in memcache). Values can have arbitrary size (there is no
        size limit like normal datastore entries have).
    """
    if isinstance(key, unicode):
      try:
        key = str(key)
      except Exception:
        pass
    if not isinstance(key, str):
      raise ValueError("Key must be <type 'str'> got: %s" % type(key))
    typed_value = None
    if isinstance(value, str):
      typed_value = recordio_entry_types.STRING + value
    elif type(value) in MARSHALABLE_TYPES:
      try:
        typed_value = recordio_entry_types.MARSHAL + marshal.dumps(
            value, MARSHAL_VERSION)
      except Exception:
        pass
    if typed_value is None:
      typed_value = recordio_entry_types.CPICKLE + cPickle.dumps(value)
    if len(key) > MAX_KEY_LENGTH:
      raise ValueError("Max key length is %d: %d" %
                       (MAX_KEY_LENGTH, len(key)))
    if len(typed_value) > MAX_ENTRY_SIZE:
      entries = int(math.ceil(1.0 * len(typed_value) / MAX_ENTRY_SIZE))
      version = (hash(typed_value) + hash(str(time.time()))) % INTEGER_MAX
      for i in xrange(entries):
        self.insert_entry_(
            (key, i, entries, version,
             typed_value[i * MAX_ENTRY_SIZE:(i + 1) * MAX_ENTRY_SIZE]))
    else:
      self.insert_entry_((key, typed_value))

  def remove(self, key):
    """Removes a value from the RecordIO.

    :param key: A key of a previously inserted value. If this key does not
        exist, no exception is thrown.
    """
    self.updates.insert((key,))

  def commit_sync(self, retries=32, retry_timeout=1):
    """Applies all changes synchronously to the RecordIO.

    :param retries: How many times a commit_sync should be retried in case
        of datastore collisions.
    :param retry_timeout: The number of seconds to wait before the next
        retry.
    """
    if not len(self.updates):
      return
    for attempt in range(retries + 1):
      shard_does_not_exist = RecordIORecords()
      for shard_name, key_values in RecordIOShard.get_shards_for_key_values(
          self.name, self.updates):
        self.db_search += 1
        if shard_name is None and key_values:
          logging.debug(
              "RecordIO %s: No shard found for:\n%s -> %s" %
              (self.name,
               SPLIT_CHAR.join(RecordIOShard.entry_key(key_values[0])),
               key_values[0][:-1]))
          for entry in key_values:
            shard_does_not_exist.insert(entry)
        else:
          lo_just_split = None
          hi_just_split = None
          for key_values_chunk in get_chunks(key_values,
                                             MAX_WRITE_BATCH_SIZE):
            if lo_just_split and hi_just_split and key_values_chunk:
              if RecordIORecords.in_range(key_values_chunk[0],
                                          lo=lo_just_split[0],
                                          hi=lo_just_split[1]):
                shard_name = RecordIOShard.key_name(
                    self.name, lo=lo_just_split[0], hi=lo_just_split[1])
              elif RecordIORecords.in_range(key_values_chunk[0],
                                            lo=hi_just_split[0],
                                            hi=hi_just_split[1]):
                shard_name = RecordIOShard.key_name(
                    self.name, lo=hi_just_split[0], hi=hi_just_split[1])
            not_deleted = None
            try:
              not_deleted, lo_just_split, hi_just_split = (
                  self.commit_shard_(shard_name, key_values_chunk))
            except RecordIOShardDoesNotExistError:
              logging.debug("Shard does not exist:\n" + shard_name)
              lo_just_split = None
              hi_just_split = None
              for entry in key_values_chunk:
                shard_does_not_exist.insert(entry)
            if not_deleted:
              for to_delete_shard_name, to_delete_key_values in (
                  RecordIOShard.get_shards_for_key_values(
                      self.name, not_deleted)):
                self.db_search += 1
                try:
                  self.commit_shard_(to_delete_shard_name,
                                     to_delete_key_values)
                except RecordIOShardDoesNotExistError:
                  logging.debug("Shard does not exist:\n" +
                                to_delete_shard_name)
                  for entry in to_delete_key_values:
                    shard_does_not_exist.insert(entry)
      self.updates = shard_does_not_exist
      if len(self.updates):
        if attempt == retries:
          raise RecordIOWriterNotCompletedError(len(self.updates))
        else:
          logging.debug("Commit attempt %d failed" % attempt)
          time.sleep(retry_timeout)
      else:
        return

  def commit_async(self, write_every_n_seconds=300):
    """Applies the changes asynchronously to the RecordIO.

    Automatically batches other pending writes to the same RecordIO
    (cheaper and more efficient than synchronous commits).

    :param write_every_n_seconds: The changes are applied to the RecordIO
        after at most this many seconds.
    """
    seen = set()
    raise_exception = False
    try:
      for tag in self.commit_to_queue_():
        if tag in seen:
          continue
        seen.add(tag)
        self.pending_worker_tasks.append(
            self.create_task_(tag, write_every_n_seconds))
    except RecordIOWriterNotCompletedError:
      raise_exception = True
    failed_add = []
    while self.pending_worker_tasks:
      batch = self.pending_worker_tasks[:100]
      self.pending_worker_tasks = self.pending_worker_tasks[100:]
      try:
        taskqueue.Queue('recordio-writer').add(batch)
      except (taskqueue.DuplicateTaskNameError,
              taskqueue.TombstonedTaskError,
              taskqueue.TaskAlreadyExistsError):
        pass
      except ValueError:
        failed_add += batch
    self.pending_worker_tasks = failed_add
    if raise_exception or self.pending_worker_tasks:
      raise RecordIOWriterNotCompletedError(len(self.updates))

  def db_stats(self):
    """Returns some datastore access statistics.

    :return: Dict
    """
    return {
        "search": self.db_search,
        "get": self.db_get,
        "put": self.db_put,
    }

  def insert_entry_(self, entry):
    """Inserts an entry tuple into the internal queue.

    :param entry: An entry tuple.
    """
    self.updates.insert(entry)

  @staticmethod
  def create_task_(tag, write_every_n_seconds=300, in_past=False):
    """Creates the future taskqueue tasks to apply queued writes.

    :param tag: The shard to write.
    :param write_every_n_seconds: At what interval the shard should be
        updated.
    :param in_past: If the task should be scheduled in the past.
    :return: taskqueue.Task
    """
    now = int(time.time())
    schedule = now - (now % write_every_n_seconds)
    schedule += hash(tag) % write_every_n_seconds
    if schedule < now and not in_past:
      schedule += write_every_n_seconds
    if schedule > now and in_past:
      schedule -= write_every_n_seconds
    task_name = "%d_%d_%d" % (hash(tag[:len(tag) / 2]),
                              hash(tag[len(tag) / 2:]),
                              schedule)
    params = {"taskqueue": tag}
    return taskqueue.Task(
        name=task_name, url="/recordio/write", params=params,
        eta=datetime.datetime.fromtimestamp(schedule + MAX_CLOCK_SKEW))

  def commit_to_queue_(self):
    """Adds all pending changes to the task queues for async commits.

    :return: Yields all shard names that need to be updated.
    """
    pull = taskqueue.Queue('recordio-queue')
    rpcs = []
    key_values_not_added = RecordIORecords()
    for shard_name, key_values in RecordIOShard.get_shards_for_key_values(
        self.name, self.updates):
      self.db_search += 1
      if shard_name is None:
        for entry in key_values:
          key_values_not_added.insert(entry)
      else:
        for key_values_chunk in get_chunks(key_values,
                                           MAX_TASKQUEUE_BATCH_SIZE):
          payload = marshal.dumps(key_values_chunk, MARSHAL_VERSION)
          rpc = pull.add_async(taskqueue.Task(payload=payload,
                                              method='PULL',
                                              tag=shard_name))
          rpcs.append((rpc, key_values_chunk, shard_name))
    for rpc, key_values, shard_name in rpcs:
      try:
        rpc.get_result()
        yield shard_name
      except Exception:
        for entry in key_values:
          key_values_not_added.insert(entry)
    self.updates = key_values_not_added
    if len(self.updates):
      raise RecordIOWriterNotCompletedError(len(self.updates))

  @db.transactional(xg=True)
  def commit_shard_(self, shard_name, key_values):
    """Adds key values to a shard and splits the shard if necessary.

    :param shard_name: The key name of the RecordIOShard.
    :param key_values: A list of key values to be added.
    :return: A (not_deleted, lo_split, hi_split) tuple, where not_deleted
        lists keys that need to be deleted in other shards and the split
        values are the lo_hi bounds of a just-split shard, or None.
    """
    shard = RecordIOShard.get_by_key_name(shard_name)
    self.db_get += 1
    if shard is None:
      raise RecordIOShardDoesNotExistError(shard_name)
    for entry in key_values:
      shard.insert(entry)
    try:
      shard.commit()
      self.db_put += 1
      return (shard.not_deleted(), None, None)
    except (RecordIOShardTooBigError, RequestTooLargeError, ValueError,
            ArgumentError, BadRequestError):
      shard.delete()
      lo_shard, hi_shard = shard.split()
      lo_shard.commit()
      hi_shard.commit()
      self.db_put += 2
      logging.debug("Split\n%s\n%s\n%s" %
                    (shard.key().name(), lo_shard.key().name(),
                     hi_shard.key().name()))
      shard_name = hi_shard.key().name()
      return shard.not_deleted(), lo_shard.lo_hi(), hi_shard.lo_hi()
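# A minimal usage sketch of the writer API defined above. The import path
# is hypothetical; adjust it to wherever RecordIOWriter actually lives in
# your deployment, and run this inside an App Engine request handler:
from recordio.writer import RecordIOWriter  # hypothetical module path

writer = RecordIOWriter("logs")
writer.create(compressed=True)        # no-op if "logs" already exists
writer.insert("2014-01-01", {"visits": 42})
writer.remove("2013-12-31")
writer.commit_sync()                  # or writer.commit_async()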
def testInsertSplitDataBiggerToBigToSmall(self):
  records = RecordIORecords()
  records.insert(("a", "aa"))
  records.insert(("b", 0, 3, 3, "bb"))
  records.insert(("b", 1, 3, 3, "bb"))
  records.insert(("b", 2, 3, 3, "bb"))
  records.insert(("c", "cc"))
  records.insert(("b", 0, 2, 2, "bb"))
  records.insert(("b", 1, 2, 2, "bb"))
  records.insert(("b", "bb"))
  records = RecordIORecords(records.get_data())
  self.assertEqual([("a", "aa"), ("b", "bb"), ("c", "cc")], list(records))