def test_put_row_text(self):
    from hfetch import connectCassandra
    from hfetch import Hcache
    '''
    Simple test to store text and retrieve it

    Analyzes:
    - HCache
    - Put_row (write text)
    - Iteritems (read text)
    '''

    table = "bulk"
    self.session.execute("DROP TABLE IF EXISTS %s.%s;" % (self.keyspace, table))
    self.session.execute(
        "CREATE TABLE %s.%s(partid int PRIMARY KEY, data text);" % (self.keyspace, table))

    num_items = int(pow(10, 3))

    try:
        connectCassandra(self.contact_names, self.nodePort)
    except Exception:
        print 'can\'t connect, verify the contact points and port', self.contact_names, self.nodePort

    nblocks = 10

    t_f = pow(-2, 63)  # Token begin range
    t_t = pow(2, 63) - 1
    # Token blocks
    tkn_size = (t_t - t_f) / (num_items / nblocks)
    tokens = [(a, a + tkn_size) for a in xrange(t_f, t_t - tkn_size, tkn_size)]

    keys = ["partid"]
    values = ["data"]

    hcache_config = {'cache_size': '10', 'writer_buffer': 20}

    cache = Hcache(self.keyspace, table, "", tokens, keys, values, hcache_config)

    for i in xrange(0, num_items):
        cache.put_row([i], ['someRandomText'])

    # It doesn't make sense to count the read elements
    # because the data is still being written asynchronously.
    hiter = cache.iteritems(10)

    while True:
        try:
            data = hiter.get_next()
            self.assertEqual(len(data), len(keys) + len(values))
            self.assertEqual(data[1], 'someRandomText')
        except StopIteration:
            break
def _setup_hcache(self):
    key_names = [key["name"] for key in self._primary_keys]
    persistent_values = [{"name": col["name"]} for col in self._columns]

    if self._tokens is None:
        raise RuntimeError("Tokens for object {} are null".format(self._get_name()))

    self._hcache_params = (self._ksp, self._table, self.storage_id, self._tokens,
                           key_names, persistent_values,
                           {'cache_size': config.max_cache_size,
                            'writer_par': config.write_callbacks_number,
                            'writer_buffer': config.write_buffer_size,
                            'timestamped_writes': config.timestamped_writes})
    log.debug("HCACHE params %s", self._hcache_params)
    self._hcache = Hcache(*self._hcache_params)
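# --- Illustrative sketch (not part of the original source) -------------------
# Shows how the positional fields packed into _hcache_params above map onto the
# Hcache constructor; the call shape matches the tests in this file, but the
# keyspace, table and column names below are assumed placeholder values.
from hfetch import Hcache

example_params = ("my_ksp",               # keyspace
                  "my_table",             # table name
                  "",                     # storage_id or token predicate (tests pass "", a WHERE clause, or a uuid)
                  [],                     # token ranges, a list of (start, end) pairs; tests also pass []
                  ["partid"],             # key column names
                  [{"name": "data"}],     # value column descriptions
                  {"cache_size": 10,      # configuration dictionary
                   "writer_buffer": 20})
example_cache = Hcache(*example_params)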
def test_write_nulls_simple(self):
    from hfetch import connectCassandra
    from hfetch import Hcache
    '''
    Simple test to store data mixed with nulls

    Analyzes:
    - HCache
    - Put_row (write data mixed with nulls)
    '''

    table = "nulls"
    self.session.execute("DROP TABLE IF EXISTS %s.%s;" % (self.keyspace, table))
    self.session.execute(
        "CREATE TABLE %s.%s(partid int PRIMARY KEY, time float, data text);" % (self.keyspace, table))

    num_items = int(pow(10, 3))

    try:
        connectCassandra(self.contact_names, self.nodePort)
    except Exception:
        print 'can\'t connect, verify the contact points and port', self.contact_names, self.nodePort

    nblocks = 10

    t_f = pow(-2, 63)  # Token begin range
    t_t = pow(2, 63) - 1
    # Token blocks
    tkn_size = (t_t - t_f) / (num_items / nblocks)
    tokens = [(a, a + tkn_size) for a in xrange(t_f, t_t - tkn_size, tkn_size)]

    keys = ["partid"]
    values = ["time", "data"]

    hcache_config = {'cache_size': '10', 'writer_buffer': 20}

    cache = Hcache(self.keyspace, table, "", tokens, keys, values, hcache_config)

    for i in xrange(0, num_items):
        cache.put_row([i], [12, None])  # random.sample({i, None}, 1) + random.sample({'SomeRandomText', None}, 1)

    # Give the asynchronous writes time to finish before the test ends.
    time.sleep(10)
def make_persistent(self, name):
    if self._is_persistent:
        raise AlreadyPersistentError(
            "This StorageNumpy is already persistent [Before:{}.{}][After:{}]",
            self._ksp, self._table, name)
    self._is_persistent = True

    (self._ksp, self._table) = self._extract_ks_tab(name)
    if self._storage_id is None:
        self._storage_id = uuid.uuid3(
            uuid.NAMESPACE_DNS, self._ksp + '.' + self._table + '_numpies')

    self._build_args = self.args(self._storage_id, self._class_name, name)

    log.info("PERSISTING DATA INTO %s %s", self._ksp, self._table)

    query_keyspace = "CREATE KEYSPACE IF NOT EXISTS %s WITH replication = %s" % (
        self._ksp, config.replication)
    config.session.execute(query_keyspace)

    config.session.execute(
        'CREATE TABLE IF NOT EXISTS ' + self._ksp + '.' + self._table + '_numpies'
        '(storage_id uuid, '
        'cluster_id int, '
        'block_id int, '
        'payload blob, '
        'PRIMARY KEY((storage_id,cluster_id),block_id))')

    self._hcache_params = (self._ksp, self._table + '_numpies', self._storage_id,
                           [], ['storage_id', 'cluster_id', 'block_id'],
                           [{'name': "payload", 'type': 'numpy'}],
                           {'cache_size': config.max_cache_size,
                            'writer_par': config.write_callbacks_number,
                            'write_buffer': config.write_buffer_size})

    self._hcache = Hcache(*self._hcache_params)

    if len(self.shape) != 0:
        self._hcache.put_row([self._storage_id, -1, -1], [self])
    self._store_meta(self._build_args)
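# --- Illustrative usage sketch (not part of the original source) -------------
# Rough idea of how make_persistent above is driven; "my_ksp.my_array" is an
# assumed placeholder name, and StorageNumpy is assumed to be in scope as in the
# surrounding module (numpy arrays are wrapped into StorageNumpy elsewhere in
# this code, so the ndarray-taking constructor is inferred from that usage).
import numpy as np

sn = StorageNumpy(np.arange(8).reshape(2, 2, 2))   # assumed ndarray-wrapping constructor
sn.make_persistent("my_ksp.my_array")              # creates my_ksp.my_array_numpies and writes the payload row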
def test_small_brute(self):
    from hfetch import connectCassandra
    from hfetch import Hcache
    '''
    This test iterates over a small amount of data using an iterkeys and validates that
    no column name can be a key and value at the same time

    Analyzes:
    - HCache (enforce column can't be key and value at the same time)
    - Iterkeys
    '''

    table = "particle"
    nelems = 10001

    self.session.execute("DROP TABLE IF EXISTS %s.%s;" % (self.keyspace, table))
    self.session.execute(
        "CREATE TABLE IF NOT EXISTS %s.%s(partid int, time float, ciao text,"
        "x float, y float, z float, PRIMARY KEY(partid,time));" % (self.keyspace, table))

    for i in xrange(0, nelems):
        vals = ','.join(
            str(e) for e in [i, i / .1, i / .2, i / .3, i / .4, "'" + str(i * 60) + "'"])
        self.session.execute(
            "INSERT INTO %s.%s(partid , time , x, y , z,ciao ) VALUES (%s)" %
            (self.keyspace, table, vals))

    try:
        connectCassandra(self.contact_names, self.nodePort)
    except Exception:
        print 'can\'t connect, verify the contact points and port', self.contact_names, self.nodePort

    nblocks = 100

    t_f = pow(-2, 63)  # Token begin range
    t_t = pow(2, 63) - 1
    # Token blocks
    tkn_size = (t_t - t_f) / (nelems / nblocks)
    tokens = [(a, a + tkn_size) for a in xrange(t_f, t_t - tkn_size, tkn_size)]

    hcache_config = {'cache_size': '10', 'writer_buffer': 20}

    keys = ["partid", "time"]
    values = ["time", "x"]

    cache = None
    # This should fail since a key can not be a column name at the same time (key=time, column=time)
    try:
        cache = Hcache(self.keyspace, table,
                       "WHERE token(partid)>=? AND token(partid)<?;",
                       tokens, keys, values, hcache_config)
    except RuntimeError, e:
        self.assertTrue(True, e)
    else:
        self.fail("Hcache accepted a column name used as both key and value")
def load_array(storage_id, name):
    (ksp, table) = IStorage._extract_ks_tab(name)
    _hcache_params = (ksp, table + '_numpies', storage_id,
                      [], ['storage_id', 'cluster_id', 'block_id'],
                      [{'name': "payload", 'type': 'numpy'}],
                      {'cache_size': config.max_cache_size,
                       'writer_par': config.write_callbacks_number,
                       'write_buffer': config.write_buffer_size})
    _hcache = Hcache(*_hcache_params)

    result = _hcache.get_row([storage_id, -1, -1])
    if len(result) == 1:
        return result[0]
    else:
        raise KeyError
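# --- Illustrative usage sketch (not part of the original source) -------------
# How load_array is expected to be called, following the naming scheme used in
# StorageNumpy.make_persistent above (uuid3 over "<ksp>.<table>_numpies").
# The keyspace.table name "my_ksp.my_array" is an assumed placeholder.
import uuid

name = "my_ksp.my_array"
storage_id = uuid.uuid3(uuid.NAMESPACE_DNS, name + "_numpies")
numpy_payload = load_array(storage_id, name)   # raises KeyError if the row is missing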
def test_simpletest(self):
    from hfetch import connectCassandra
    from hfetch import Hcache
    '''
    Analyzes:
    - HCache
    - Get_row (repeated reads of the same key return the same result)
    '''

    table = 'particle'
    nelems = 500

    self.session.execute("DROP TABLE IF EXISTS %s.%s;" % (self.keyspace, table))
    self.session.execute(
        "CREATE TABLE IF NOT EXISTS %s.%s(partid int, time float, ciao text,"
        "x float, y float, z float, PRIMARY KEY(partid,time));" % (self.keyspace, table))

    for i in xrange(0, nelems):
        vals = ','.join(
            str(e) for e in [i, i / .1, i / .2, i / .3, i / .4, "'" + str(i * 60) + "'"])
        self.session.execute(
            "INSERT INTO %s.%s(partid , time , x, y , z,ciao ) VALUES (%s)" %
            (self.keyspace, table, vals))

    try:
        connectCassandra(self.contact_names, self.nodePort)
    except Exception:
        print 'can\'t connect, verify the contact points and port', self.contact_names, self.nodePort

    keys = ["partid", "time"]
    values = ["x", "y", "z"]
    token_ranges = []

    # Empty configuration parameter (the last dictionary) means to use the default config
    cache = Hcache(self.keyspace, table,
                   "WHERE token(partid)>=? AND token(partid)<?;",
                   token_ranges, keys, values, {})

    def get_data(cache, keys):
        data = None
        try:
            data = cache.get_row(keys)
            self.assertEqual(len(data), len(values))
        except KeyError:
            print 'not found'
        return data

    q1 = get_data(cache, [433, 4330])  # float(0.003)
    lost = get_data(cache, [133, 1330])
    lost = get_data(cache, [433, 4330])
    q2 = get_data(cache, [433, 4330])
    self.assertEqual(q1, q2)
def test_coherency(self):
    from hfetch import connectCassandra
    from hfetch import Hcache
    from hfetch import HWriter
    '''
    Analyzes:
    - HCache
    '''

    table = "particle"
    nparts = 10000  # Num particles in range

    self.session.execute("DROP TABLE IF EXISTS %s.%s;" % (self.keyspace, table))
    self.session.execute(
        "CREATE TABLE IF NOT EXISTS %s.%s(partid int, time float,"
        "x float, y float, z float, PRIMARY KEY(partid,time));" % (self.keyspace, table))

    try:
        connectCassandra(self.contact_names, self.nodePort)
    except Exception:
        print 'can\'t connect, verify the contact points and port', self.contact_names, self.nodePort

    tkns = []
    keys = ["partid", "time"]
    values = ["x", "y", "z"]

    cache = Hcache(self.keyspace, table,
                   "WHERE token(partid)>=? AND token(partid)<?;",
                   tkns, keys, values,
                   {'cache_size': '1', 'writer_buffer': 20})

    for i in xrange(0, nparts):
        cache.put_row([i, i / .1], [i / .2, i / .3, i / .4])

    for i in reversed(xrange(0, nparts)):  # xrange(nparts, -1, -1)
        try:
            cache.get_row([i, i / .1])
        except KeyError:
            str_k = str([i, i / .1])
            self.fail(str_k + " not found")
def test_iterators(self):
    from hfetch import connectCassandra
    from hfetch import Hcache
    '''
    This test iterates over some text and checks coherency between hcache and hiter

    Analyzes:
    - HCache
    - Get_row (read text)
    - Iteritems (read text)
    '''

    table = "words"
    num_keys = 20

    self.session.execute("DROP TABLE IF EXISTS %s.%s;" % (self.keyspace, table))
    self.session.execute(
        "CREATE TABLE %s.%s(position int PRIMARY KEY, wordinfo text);" % (self.keyspace, table))

    for i in xrange(0, num_keys):
        vals = ','.join(
            str(e) for e in [i, "'someRandomTextForTesting purposes - " + str(i * 60) + "'"])
        self.session.execute(
            "INSERT INTO %s.%s(position , wordinfo ) VALUES (%s)" % (self.keyspace, table, vals))

    try:
        connectCassandra(self.contact_names, self.nodePort)
    except Exception:
        print 'can\'t connect, verify the contact points and port', self.contact_names, self.nodePort

    tkns = [(pow(-2, 63) + 1, pow(2, 63) - 1)]
    keys = ["position"]
    values = ["wordinfo"]
    hcache_config = {'cache_size': 100, 'writer_buffer': 20}

    cache = Hcache(self.keyspace, table,
                   "WHERE token(position)>=? AND token(position)<?;",
                   tkns, keys, values, hcache_config)

    iter_config = {"prefetch_size": 100, "update_cache": "yes"}
    myIter = cache.iteritems(iter_config)

    data = []
    for i in xrange(0, 10):
        data.append(myIter.get_next())

    assert (len(data) > 0)
    first_data = data[0]
    assert (len(first_data) == 2)
    first_key = [first_data[0]]
    assert (type(first_key[0]) == int)

    somedata = cache.get_row(first_key)
    # self.assertEqual((first_key + somedata), first_data)
    assert ((first_key + somedata) == first_data)

    count = len(data)
    while True:
        try:
            i = myIter.get_next()
        except StopIteration:
            print 'End of data, items read: ', count
            break
        count = count + 1

    print 'data was: \n', data
def write_test(self):
    from hfetch import connectCassandra
    from hfetch import Hcache
    from hfetch import HWriter
    '''
    While the iterator retrieves the data from a table, the writer stores it into another table

    Analyzes:
    - HCache
    - HWriter
    - Iteritems (updating the cache)
    '''

    table = "particle"
    table_write = "particle_write"
    nparts = 6000  # Num particles in range

    self.session.execute("DROP TABLE IF EXISTS %s.%s;" % (self.keyspace, table))
    self.session.execute(
        "CREATE TABLE IF NOT EXISTS %s.%s(partid int, time float, ciao text,"
        "x float, y float, z float, PRIMARY KEY(partid,time));" % (self.keyspace, table))
    self.session.execute(
        "CREATE TABLE IF NOT EXISTS %s.%s(partid int, time float,"
        "x float, y float, z float, PRIMARY KEY(partid,time));" % (self.keyspace, table_write))

    for i in xrange(0, nparts):
        vals = ','.join(
            str(e) for e in [i, i / .1, i / .2, i / .3, i / .4, "'" + str(i * 60) + "'"])
        self.session.execute(
            "INSERT INTO %s.%s(partid , time , x, y , z,ciao ) VALUES (%s)" %
            (self.keyspace, table, vals))

    try:
        connectCassandra(self.contact_names, self.nodePort)
    except Exception:
        print 'can\'t connect, verify the contact points and port', self.contact_names, self.nodePort

    p = 1000  # Num partitions

    t_f = -7764607523034234880  # Token begin range
    # t_t = 5764607523034234880  # Token end range
    t_t = 7764607523034234880
    # Token blocks
    tkn_size = (t_t - t_f) / (nparts / p)
    tkns = [(a, a + tkn_size) for a in xrange(t_f, t_t - tkn_size, tkn_size)]

    keys = ["partid", "time"]
    values = ["x", "y", "z"]

    a = Hcache(self.keyspace, table,
               "WHERE token(partid)>=? AND token(partid)<?;",
               tkns, keys, values,
               {'cache_size': '100', 'writer_buffer': 20})

    writer = HWriter(self.keyspace, table_write, keys, values, {'writer_buffer': 20})

    def readAll(iter, wr):
        count = 1
        while True:
            try:
                i = iter.get_next()
            except StopIteration:
                print 'End of data, items read: ', count
                break
            wr.write(i[0:2], i[2:5])
            count += 1
            if count % 100000 == 0:
                print count
        print "iter has %d elements" % count

    start = time.time()
    readAll(a.iteritems({"prefetch_size": 100, "update_cache": "yes"}), writer)
    print "finished in %d seconds" % (time.time() - start)
        # (continues the except-branch of a connectCassandra attempt shown earlier)
        print e
        print 'can\'t connect, verify the contact points and port', self.contact_names, self.nodePort

    table = "arrays_numpies"

    self.session.execute("DROP TABLE if exists %s.%s;" % (self.keyspace, table))
    self.session.execute(
        "CREATE TABLE %s.%s(storage_id uuid, cluster_id int, block_id int, payload blob,"
        "PRIMARY KEY((storage_id,cluster_id),block_id));" % (self.keyspace, table))

    storage_id = uuid.uuid3(uuid.NAMESPACE_DNS, self.keyspace + '.' + table)
    time.sleep(5)

    a = Hcache(self.keyspace, table, storage_id,
               [], ['storage_id', 'cluster_id', 'block_id'],
               [{'name': "payload", 'type': 'numpy'}], {})

    # Prepare data
    bigarr = np.arange(pow(elem_dim, dims)).reshape(elem_dim, elem_dim, elem_dim)

    keys = [storage_id, -1, -1]
    values = [bigarr.astype('i')]

    # Insert
    a.put_row(keys, values)

    # Otherwise we ask for the row before it has been processed
    time.sleep(2)
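    # --- Illustrative read-back sketch (not part of the original source) -----
    # Continues the snippet above: after put_row and the sleep, the stored numpy
    # payload can be fetched back with get_row on the same keys (get_row returns
    # the list of value columns, as in load_array). The equality check is an
    # assumption about how a test would verify it, not code from the original.
    result = a.get_row(keys)
    retrieved = result[0]
    self.assertTrue(np.array_equal(bigarr.astype('i'), retrieved))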
class QbeastIterator(IStorage): """ Object used to access data from workers. """ args_names = ['primary_keys', 'columns', 'indexed_on', 'name', 'qbeast_meta', 'qbeast_random', 'storage_id', 'tokens', 'class_name', 'built_remotely'] _building_args = namedtuple('QbeastArgs', args_names) _prepared_store_meta = config.session.prepare('INSERT INTO hecuba.istorage' '(primary_keys, columns, indexed_on, name, qbeast_meta,' ' qbeast_random, storage_id, tokens, class_name)' 'VALUES (?,?,?,?,?,?,?,?,?)') _prepared_set_qbeast_meta = config.session.prepare('INSERT INTO hecuba.istorage (storage_id, qbeast_meta) ' 'VALUES (?,?)') @staticmethod def _store_meta(storage_args): log.debug("QbeastIterator: storing metas %s", '') try: config.session.execute(QbeastIterator._prepared_store_meta, [storage_args.primary_keys, storage_args.columns, storage_args.indexed_on, storage_args.name, storage_args.qbeast_meta, storage_args.qbeast_random, storage_args.storage_id, storage_args.tokens, storage_args.class_name]) except Exception as ex: log.error("Error creating the StorageDictIx metadata: %s %s", storage_args, ex) raise ex def __init__(self, primary_keys, columns, indexed_on, name, qbeast_meta=None, qbeast_random=None, storage_id=None, tokens=None, **kwargs): """ Creates a new block. Args: primary_keys (list(tuple)): a list of (key,type) primary keys (primary + clustering). columns (list(tuple)): a list of (key,type) columns indexed_on (list(str)): a list of the names of the indexed columns name (string): keyspace.table of the Cassandra collection qbeast_random (str): qbeast random string, when selecting in different nodes this must have the same value storage_id (uuid): the storage id identifier tokens (list): list of tokens """ super().__init__((), name=name, storage_id=storage_id, **kwargs) log.debug("CREATED QbeastIterator(%s,%s,%s,%s)", storage_id, tokens, ) self._qbeast_meta = qbeast_meta self._primary_keys = primary_keys self._columns = columns self._indexed_on = indexed_on if qbeast_random is None: self._qbeast_random = ''.join(random.choice(string.ascii_letters + string.digits) for _ in range(5)) else: self._qbeast_random = qbeast_random class_name = '%s.%s' % (self.__class__.__module__, self.__class__.__name__) self._primary_keys = [{"type": key[1], "name": key[0]} if isinstance(key, tuple) else key for key in self._primary_keys] self._columns = [{"type": col[1], "name": col[0]} if isinstance(col, tuple) else col for col in self._columns] key_names = [col["name"] for col in self._primary_keys] column_names = [col["name"] for col in self._columns] if len(key_names) > 1: self._key_builder = namedtuple('row', key_names) else: self._key_builder = None if len(column_names) > 1: self._column_builder = namedtuple('row', column_names) else: self._column_builder = None self._k_size = len(primary_keys) build_keys = [(key["name"], key["type"]) for key in self._primary_keys] build_columns = [(col["name"], col["type"]) for col in self._columns] self._build_args = self._building_args( build_keys, build_columns, self._indexed_on, self._ksp + "." + self._table, self._qbeast_meta, self._qbeast_random, self.storage_id, self._tokens, class_name, self._built_remotely) if name or storage_id: self.make_persistent(name) def make_persistent(self, name): # Update local QbeastIterator metadata super().make_persistent(name) self._build_args = self._build_args._replace(storage_id=self.storage_id, name=self._ksp + "." 
+ self._table, tokens=self._tokens) self._setup_hcache() QbeastIterator._store_meta(self._build_args) def _setup_hcache(self): key_names = [key["name"] for key in self._primary_keys] persistent_values = [{"name": col["name"]} for col in self._columns] if self._tokens is None: raise RuntimeError("Tokens for object {} are null".format(self._get_name())) self._hcache_params = (self._ksp, self._table, self.storage_id, self._tokens, key_names, persistent_values, {'cache_size': config.max_cache_size, 'writer_par': config.write_callbacks_number, 'writer_buffer': config.write_buffer_size, 'timestamped_writes': config.timestamped_writes}) log.debug("HCACHE params %s", self._hcache_params) self._hcache = Hcache(*self._hcache_params) def _set_qbeast_meta(self, qbeast_meta): self._qbeast_meta = qbeast_meta self._build_args = self._build_args._replace(qbeast_meta=qbeast_meta) config.session.execute(QbeastIterator._prepared_set_qbeast_meta, [self.storage_id, qbeast_meta]) def __len__(self): return len([row for row in self.__iter__()]) def __iter__(self): if hasattr(self, "_qbeast_meta") and self._qbeast_meta is not None: conditions = "" for index, (from_p, to_p) in enumerate(zip(self._qbeast_meta.from_point, self._qbeast_meta.to_point)): conditions += "{0} > {1} AND {0} < {2} AND ".format(self._indexed_on[index], from_p, to_p) conditions = conditions[:-5] + self._qbeast_meta.mem_filter conditions += " AND expr(%s_idx, 'precision=%s:%s') ALLOW FILTERING" \ % (self._table, self._qbeast_meta.precision, self._qbeast_random) hiter = self._hcache.iteritems({'custom_select': conditions, 'prefetch_size': config.prefetch_size}) else: hiter = self._hcache.iteritems(config.prefetch_size) return NamedItemsIterator(self._key_builder, self._column_builder, self._k_size, hiter, self)
class StorageDict(IStorage, dict): # """ # Object used to access data from workers. # """ args_names = [ "name", "primary_keys", "columns", "tokens", "storage_id", "indexed_on", "class_name", "built_remotely" ] args = namedtuple('StorageDictArgs', args_names) _prepared_store_meta = config.session.prepare( 'INSERT INTO hecuba.istorage' '(storage_id, class_name, name, tokens, ' 'primary_keys, columns, indexed_on)' 'VALUES (?,?,?,?,?,?,?)') @staticmethod def _store_meta(storage_args): """ Method to update the info about the StorageDict in the DB metadata table Args: storage_args: structure with all data needed to update the metadata """ log.debug("StorageDict: storing metas %s", storage_args) try: config.session.execute(StorageDict._prepared_store_meta, [ storage_args.storage_id, storage_args.class_name, storage_args.name, storage_args.tokens, storage_args.primary_keys, storage_args.columns, storage_args.indexed_on ]) except Exception as ex: log.error("Error creating the StorageDict metadata: %s %s", storage_args, ex) raise ex def __init__(self, name=None, primary_keys=None, columns=None, indexed_on=None, storage_id=None, **kwargs): """ Creates a new StorageDict. Args: name (string): the name of the collection/table (keyspace is optional) primary_keys (list(tuple)): a list of (key,type) primary keys (primary + clustering). columns (list(tuple)): a list of (key,type) columns tokens (list): list of tokens storage_id (string): the storage id identifier indexed_on (list): values that will be used as index kwargs: other parameters """ super().__init__((), name=name, storage_id=storage_id, **kwargs) log.debug("CREATE StorageDict(%s,%s)", primary_keys, columns) ''' yolandab kwargs of the init should contain metas: all the row in the istorage if exists after super().__init__ if kwargs is empty --> this is a new object generate build args parsing the _doc_ string or using the parameters we need to generate the column info of sets with the format to persist it (name--> _set_) if name or storage id --> call to store_metas else --> this is an already existing objects metas and tokens should form the attributes of self we need to convert the column info of sets to the format in memory ( _set_name --> name) TODO: implement a cleaner version of embedded sets ''' build_column = None build_keys = None if self.__doc__ is not None: self._persistent_props = self._parse_comments(self.__doc__) self._primary_keys = self._persistent_props['primary_keys'] self._columns = self._persistent_props['columns'] self._indexed_on = self._persistent_props.get( 'indexed_on', indexed_on) # Field '_istorage_metas' will be set if it exists in HECUBA.istorage initialized = (getattr(self, '_istorage_metas', None) is not None) if not initialized and self.__doc__ is None: #info is not in the doc string, should be passed in the parameters if primary_keys == None or columns == None: raise RuntimeError( "StorageDict: missed specification. Type of Primary Key or Column undefined" ) self._primary_keys = primary_keys self._columns = columns self._indexed_on = indexed_on if initialized: #object already in istorage # if (primary_keys is not None or columns is not None): # raise RuntimeError("StorageDict: Trying to define a new schema, but it is already persistent") # --> this check would be necessary if passing columns/key spec # as parameter was part of the user interface. As it is intended # just for internal use we skip this check. If the spec does not # match the actual schema access to the object will fail. 
if getattr(self, "_persistent_props", None) is not None: # __doc__ and disk: do they match? self._check_schema_and_raise("__init__") else: # _persistent_props == None (only in disk) # Parse _istorage_metas to fulfill the _primary_keys, _columns self._primary_keys = self._istorage_metas.primary_keys self._columns = self._istorage_metas.columns build_column = self._columns # Keep a copy from the disk to avoid recalculate it later build_keys = self._primary_keys # Keep a copy from the disk to avoid recalculate it later self._indexed_on = self._istorage_metas.indexed_on #we manipulate the info about sets retrieved from istorage # (_set_s1_0,int), (_set_s1_1,int) --> {name: s1, type: set , column:((s1_0, int), (s1_1, int))} has_embedded_set = False set_pks = [] normal_columns = [] for column_name, column_type in self._columns: if column_name.find("_set_") == 0: attr_name = column_name[ 5:] # Remove '_set_' The attribute name also contains the "column_name" needed later... set_pks.append((attr_name, column_type)) has_embedded_set = True else: normal_columns.append((column_name, column_type)) if has_embedded_set: # Embedded set has a different layout {name,type:set, columns:[(name,type),(name,type)]} column_name = attr_name.split( "_", 1 )[0] # Get the 1st name (attr_1, attr_2... -> attr or attr -> attr) self._columns = [{ "name": column_name, "type": "set", "columns": set_pks }] else: self._columns = [{ "type": col[1], "name": col[0] } for col in normal_columns] # COMMON CODE: new and instantiation # Special case:Do we have an embedded set? self._has_embedded_set = False if isinstance(self._columns[0], dict): if self._columns[0]['type'] == 'set': self._has_embedded_set = True self._primary_keys = [{ "type": key[1], "name": key[0] } if isinstance(key, tuple) else key for key in self._primary_keys] self._columns = [{ "type": col[1], "name": col[0] } if isinstance(col, tuple) else col for col in self._columns] # POST: _primary_keys and _columns are list of DICTS> [ {name:..., type:...}, {name:..., type:set, columns:[(name,type),...]},...] log.debug("CREATED StorageDict(%s,%s)", self._primary_keys, self._columns) key_names = [key["name"] for key in self._primary_keys] column_names = [col["name"] for col in self._columns] if len(key_names) > 1: self._key_builder = namedtuple('row', key_names) else: # 1 self._key_builder = None if self._has_embedded_set: set_names = [colname for (colname, dt) in self._get_set_types()] self._column_builder = namedtuple('row', set_names) elif len(column_names) > 1: self._column_builder = namedtuple('row', column_names) else: self._column_builder = None self._k_size = len(key_names) class_name = '%s.%s' % (self.__class__.__module__, self.__class__.__name__) if build_keys == None: build_keys = [(key["name"], key["type"]) for key in self._primary_keys] # Define 'build_column': it will contain the column info stored in istorage. 
For the sets we manipulate the parsed data if build_column == None: build_column = [] for col in self._columns: if col["type"] == "set": types = col["columns"] for t in types: build_column.append(("_set_" + t[0], t[1])) else: build_column.append((col["name"], col["type"])) self._build_args = self.args(self._get_name(), build_keys, build_column, self._tokens, self.storage_id, self._indexed_on, class_name, self._built_remotely) if name and storage_id and (storage_id != storage_id_from_name(name) ): # instantiating an splitted object self._persist_metadata() elif name or storage_id: # instantiating a persistent object if initialized: # already existint self._setup_hcache() else: # new object self._persist_metadata() @classmethod def _parse_comments(self, comments): parser = Parser("TypeSpec") return parser._parse_comments(comments) def __contains__(self, key): """ Method that checks if a given key exists in a StorageDict. Args: key: the position that we want to check if exists. Returns: boolean (true - exists, false - doesn't exist). """ if not self.storage_id: return dict.__contains__(self, key) else: try: # TODO we should save this value in a cache self._hcache.get_row(self._make_key(key)) return True except Exception as ex: log.warn("persistentDict.__contains__ ex %s", ex) return False def _create_tables(self): # Prepare data persistent_keys = [ (key["name"], "tuple<" + ",".join(key["columns"]) + ">") if key["type"] == "tuple" else (key["name"], key["type"]) for key in self._primary_keys ] + self._get_set_types() persistent_values = [] if not self._has_embedded_set: for col in self._columns: if col["type"] == "tuple": persistent_values.append({ "name": col["name"], "type": "tuple<" + ",".join(col["columns"]) + ">" }) elif col["type"] not in basic_types: persistent_values.append({ "name": col["name"], "type": "uuid" }) else: persistent_values.append({ "name": col["name"], "type": col["type"] }) key_names = [ col[0] if isinstance(col, tuple) else col["name"] for col in persistent_keys ] query_keyspace = "CREATE KEYSPACE IF NOT EXISTS %s WITH replication = %s" % ( self._ksp, config.replication) try: log.debug('MAKE PERSISTENCE: %s', query_keyspace) config.executelocked(query_keyspace) except Exception as ex: log.warn("Error creating the StorageDict keyspace %s, %s", (query_keyspace), ex) raise ex persistent_columns = [(col["name"], col["type"]) for col in persistent_values] query_table = "CREATE TABLE IF NOT EXISTS %s.%s (%s, PRIMARY KEY (%s));" \ % (self._ksp, self._table, ",".join("%s %s" % tup for tup in persistent_keys + persistent_columns), str.join(',', key_names)) try: log.debug('MAKE PERSISTENCE: %s', query_table) config.executelocked(query_table) except Exception as ex: log.warn("Error creating the StorageDict table: %s %s", query_table, ex) raise ex if hasattr(self, '_indexed_on') and self._indexed_on is not None: index_query = 'CREATE CUSTOM INDEX IF NOT EXISTS ' + self._table + '_idx ON ' index_query += self._ksp + '.' 
+ self._table + ' (' + str.join( ',', self._indexed_on) + ') ' index_query += "using 'es.bsc.qbeast.index.QbeastIndex';" try: config.executelocked(index_query) except Exception as ex: log.error("Error creating the Qbeast custom index: %s %s", index_query, ex) raise ex trigger_query = "CREATE TRIGGER IF NOT EXISTS %s%s_qtr ON %s.%s USING 'es.bsc.qbeast.index.QbeastTrigger';" % \ (self._ksp, self._table, self._ksp, self._table) try: config.executelocked(trigger_query) except Exception as ex: log.error("Error creating the Qbeast trigger: %s %s", trigger_query, ex) raise ex def _persist_data_from_memory(self): for k, v in super().items(): self[k] = v if config.max_cache_size != 0: #if C++ cache is enabled, clear Python memory, otherwise keep it super().clear() def sync(self): super().sync() self._hcache.flush() def _setup_hcache(self): key_names = [key["name"] for key in self._primary_keys] key_names = key_names + [name for name, dt in self._get_set_types()] persistent_values = [] if not self._has_embedded_set: persistent_values = [{ "name": col["name"] } for col in self._columns] if self._tokens is None: raise RuntimeError("Tokens for object {} are null".format( self._get_name())) self._hcache_params = (self._ksp, self._table, self.storage_id, self._tokens, key_names, persistent_values, { 'cache_size': config.max_cache_size, 'writer_par': config.write_callbacks_number, 'writer_buffer': config.write_buffer_size, 'timestamped_writes': config.timestamped_writes }) log.debug("HCACHE params %s", self._hcache_params) self._hcache = Hcache(*self._hcache_params) def _make_key(self, key): """ Method used to pass the key data to the StorageDict cache in a proper way. Args: key: the data that needs to get the correct format """ if isinstance(key, str) or not isinstance(key, Iterable): if len(self._primary_keys) == 1: return [key] else: raise Exception('missing a primary key') if isinstance(key, Iterable) and len(key) == len(self._primary_keys): return list(key) elif self._has_embedded_set and isinstance( key, Iterable) and len(key) == (len(self._primary_keys) + len(self._get_set_types())): return list(key) else: raise Exception('wrong primary key') @staticmethod def _make_value(value): """ Method used to pass the value data to the StorageDict cache in a proper way. Args: value: the data that needs to get the correct format """ if issubclass(value.__class__, IStorage): return [value.storage_id] elif isinstance(value, str) or not isinstance(value, Iterable) or isinstance( value, np.ndarray): return [value] elif isinstance(value, tuple): return [value] elif isinstance(value, Iterable): val = [] for v in value: if isinstance(v, IStorage): val.append(v.storage_id) else: val.append(v) return val else: return list(value) def _count_elements(self, query): try: result = config.session.execute(query) return result[0][0] except OperationTimedOut as ex: import warnings warnings.warn( "len() operation on {} from class {} failed by timeout." "Use len() on split() results if you must".format( self._get_name(), self.__class__.__name__)) raise ex except Exception as ir: log.error("Unable to execute %s", query) raise ir def __iter__(self): """ Method that overloads the python dict basic iteration, which returns an iterator over the dictionary keys. """ return self.keys() def _persist_metadata(self): """ Private Method to create tables, setup the cache and store the metadata of a StorageDict. Used for NEW storage dicts, that do no need to persist any data. 
""" if not self._built_remotely: self._create_tables() self._setup_hcache() StorageDict._store_meta(self._build_args) def _persist_data(self, name): """ Private Method to store a StorageDict into cassandra This will make it use a persistent DB as the main location of its data. Args: name: """ # Update local StorageDict metadata self._build_args = self._build_args._replace( storage_id=self.storage_id, name=self._ksp + "." + self._table, tokens=self._tokens) self._persist_metadata() self._persist_data_from_memory() def make_persistent(self, name): """ Method to transform a StorageDict into a persistent object. This will make it use a persistent DB as the main location of its data. Args: name: """ super().make_persistent(name) if getattr(self, "_istorage_metas", None) is not None: self._check_schema_and_raise("make_persistent") self._persist_data(name) def stop_persistent(self): """ Method to turn a StorageDict into non-persistent. """ super().stop_persistent() log.debug('STOP PERSISTENCE: %s', self._table) self._hcache = None self.storage_id = None def delete_persistent(self): """ Method to empty all data assigned to a StorageDict. """ self.sync() super().delete_persistent() log.debug('DELETE PERSISTENT: %s', self._table) query = "DROP TABLE %s.%s;" % (self._ksp, self._table) config.session.execute(query) query = "DELETE FROM hecuba.istorage where storage_id={}".format( self.storage_id) config.session.execute(query) self.storage_id = None def __delitem__(self, key): """ Method to delete a specific entry in the dict in the key position. Args: key: position of the entry that we want to delete """ if not self.storage_id: dict.__delitem__(self, key) elif self._has_embedded_set: self._hcache.delete_row(key) elif isinstance(key, Iterable) and not isinstance(key, str): self._hcache.delete_row(list(key)) else: self._hcache.delete_row([key]) def __create_embeddedset(self, key, val=None): if not isinstance(key, Iterable) or isinstance(key, str): return EmbeddedSet(self, [key], val) else: return EmbeddedSet(self, list(key), val) def _check_schema_and_raise(self, txt): """ Raises an exception if the schema stored in the database does not match with the description of the object in memory. This may happen if the user specifies an already used name for its data. PRE: self._istorage_metas contains a list of tuples (name, type) self._primary_keys contains a list of tuples (name, type) or list of dicts {'name':value, 'type':value} self._columns may contain: a list of tuples (name, type) or a list of dicts {'name':value, 'type':value} or a list of dicts with a set {'name':value, 'type':'set','columns':[(name1,type1),....]} """ # TODO: Change parser to have a consistent behaviour # try to send a useful message if it is a problem with a mismatched schema if getattr(self, "_istorage_metas", None) is None: self._istorage_metas = get_istorage_attrs(self.storage_id) if len(self._primary_keys) != len(self._istorage_metas.primary_keys): raise RuntimeError( "StorageDict: {}: key Metadata does not match specification. Trying {} but stored specification {}" .format(txt, self._primary_keys, self._istorage_metas.primary_keys)) pk = [{ "type": key[1], "name": key[0] } if isinstance(key, tuple) else key for key in self._primary_keys] for pos, key in enumerate(pk): if self._istorage_metas.primary_keys[pos][0] != key[ 'name'] or self._istorage_metas.primary_keys[pos][ 1] != key['type']: raise RuntimeError( "StorageDict: {}: key Metadata does not match specification. 
Trying {} but stored specification {}" .format(txt, self._primary_keys, self._istorage_metas.primary_keys)) columns = self._columns # Treat the embedded set case... if type(self._columns[0]) == dict: if self._columns[0]['type'] == 'set': columns = self._columns[0]['columns'] if len(columns) != len(self._istorage_metas.columns): raise RuntimeError( "StorageDict: {}: column Metadata does not match specification. Trying {} but stored specification {}" .format(txt, self._columns, self._istorage_metas.columns)) columns = [{ "type": col[1], "name": col[0] } if isinstance(col, tuple) else col for col in columns] for pos, val in enumerate(columns): #istorage_metas.columns[pos] -->[(_set_s1_0,int),(_set_s1_1,int)] mykey = self._istorage_metas.columns[pos][0] mytype = self._istorage_metas.columns[pos][1] if mykey.find("_set_") == 0: mykey = mykey[ 5:] # Skip the '_set_' '_set_s1_0' ==> 's1_0' TODO Change the set identification method if (mykey != val['name']) or (mytype != val['type']): raise RuntimeError( "StorageDict: {}: column Metadata does not match specification. Trying {} but stored specification {}" .format(txt, self._columns, self._istorage_metas.columns)) def __getitem__(self, key): """ If the object is persistent, each request goes to the hfetch. Args: key: the dictionary key Returns item: value found in position key """ log.debug('GET ITEM %s', key) if not self.storage_id: return dict.__getitem__(self, key) elif self._has_embedded_set: return self.__create_embeddedset(key=key) else: # Returns always a list with a single entry for the key if config.max_cache_size == 0: # if C++ cache is disabled, use Python memory try: result = dict.__getitem__(self, key) return result except: pass persistent_result = self._hcache.get_row(self._make_key(key)) log.debug("GET ITEM %s[%s]", persistent_result, persistent_result.__class__) # we need to transform UUIDs belonging to IStorage objects and rebuild them # TODO hcache should return objects of the class uuid, not str final_results = [] for index, col in enumerate(self._columns): col_type = col["type"] element = persistent_result[index] if col_type not in basic_types: # element is not a built-in type info = { "storage_id": element, "tokens": self._build_args.tokens, "class_name": col_type } element = build_remotely(info) final_results.append(element) if self._column_builder is not None: return self._column_builder(*final_results) else: return final_results[0] def __make_val_persistent(self, val, col=0): if isinstance(val, list): for index, element in enumerate(val): val[index] = self.__make_val_persistent(element, index) elif isinstance(val, IStorage) and not val._is_persistent: valstorage_id = uuid.uuid4() attribute = self._columns[col]["name"] name = self._ksp + "." 
+ ( "D" + str(valstorage_id).replace('-', '_') + self._table + attribute )[: 40] # 48 is the max length of table names, this may have collisions but this would only affect to object instantiation that are not really expected (user should give the name of the object instead of relying on the system to generate it) # new name as ksp.Dra_n_dom_table_attrname[:40] val.make_persistent(name) return val def __setitem__(self, key, val): """ Method to insert values in the StorageDict Args: key: the position of the value that we want to save val: the value that we want to save in that position """ if isinstance(val, list): vals_istorage = [] for element in val: if isinstance(element, np.ndarray) and not isinstance( element, StorageNumpy): val_istorage = StorageNumpy(element) else: val_istorage = element vals_istorage.append(val_istorage) val = vals_istorage elif isinstance(val, np.ndarray) and not isinstance(val, StorageNumpy): val = StorageNumpy(val) elif isinstance(val, set): val = self.__create_embeddedset(key=key, val=val) log.debug('SET ITEM %s->%s', key, val) if self.storage_id is None: dict.__setitem__(self, key, val) elif not isinstance(val, EmbeddedSet): # Not needed because it is made persistent and inserted to hcache when calling to self.__create_embeddedset val = self.__make_val_persistent(val) self._hcache.put_row(self._make_key(key), self._make_value(val)) if config.max_cache_size == 0: # If C++ cache is disabled, use python memory dict.__setitem__(self, key, val) def __len__(self): if not self.storage_id: return super().__len__() self.sync() if self._tokens[0][0] == _min_token and self._tokens[-1][ 1] == _max_token: query = f"SELECT COUNT(*) FROM {self._ksp}.{self._table}" return self._count_elements(query) else: keys = [] for pkey in self._primary_keys: template = "'{}'" if pkey["type"] == "text" else "{}" keys.append(template.format(pkey["name"])) all_keys = ",".join(keys) total = 0 for (token_start, token_end) in self._tokens: query = f"SELECT COUNT(*) FROM {self._ksp}.{self._table} " \ f"WHERE token({all_keys})>={token_start} AND token({all_keys})<{token_end}" total = total + self._count_elements(query) return total def __repr__(self): """ Overloads the method used by print to show a StorageDict Returns: The representation of the data stored in the StorageDict """ to_return = {} for item in self.items(): to_return[item[0]] = item[1] if len(to_return) == config.hecuba_print_limit: return str(to_return) if len(to_return) > 0: return str(to_return) return "" def update(self, other=None, **kwargs): """ Updates the current dict with a new dictionary or set of attr,value pairs (those must follow the current dict data model). Args: other: python dictionary or StorageDict. All key,val values in it will be inserted in the current dict. **kwargs: set of attr:val pairs, to be treated as key,val and inserted in the current dict. 
""" if other is not None: if isinstance(other, StorageDict): for k, v in other.items(): self[k] = v else: for k, v in other.items() if isinstance(other, Mapping) else other: self[k] = v for k, v in kwargs.items(): self[k] = v def keys(self): """ Obtains the iterator for the keys of the StorageDict Returns: if persistent: iterkeys(self): list of keys if not persistent: dict.keys(self) """ if self.storage_id: self.sync() ik = self._hcache.iterkeys(config.prefetch_size) iterator = NamedIterator(ik, self._key_builder, self) if self._has_embedded_set: iterator = iter(set(iterator)) return iterator else: return dict.keys(self) def items(self): """ Obtains the iterator for the key,val pairs of the StorageDict Returns: if persistent: NamedItemsIterator(self): list of key,val pairs if not persistent: dict.items(self) """ if self.storage_id: self.sync() ik = self._hcache.iteritems(config.prefetch_size) iterator = NamedItemsIterator(self._key_builder, self._column_builder, self._k_size, ik, self) if self._has_embedded_set: d = defaultdict(set) # iteritems has the set values in different rows, this puts all the set values in the same row if len(self._get_set_types()) == 1: for row in iterator: d[row.key].add(row.value[0]) else: for row in iterator: d[row.key].add(tuple(row.value)) iterator = d.items() return iterator else: return dict.items(self) def values(self): """ Obtains the iterator for the values of the StorageDict Returns: if persistent: NamedIterator(self): list of valuesStorageDict if not persistent: dict.values(self) """ if self.storage_id: self.sync() if self._has_embedded_set: items = self.items() return dict(items).values() else: ik = self._hcache.itervalues(config.prefetch_size) return NamedIterator(ik, self._column_builder, self) else: return dict.values(self) def get(self, key, default=None): try: value = self.__getitem__(key) except KeyError: value = default return value def _get_set_types(self): """ Returns a list of tuples (name,type) for the types of the set """ if self._has_embedded_set: set_types = [ col.get("columns", []) for col in self._columns if isinstance(col, dict) ] return sum(set_types, []) else: return []
class StorageDict(dict, IStorage): # """ # Object used to access data from workers. # """ args_names = [ "name", "primary_keys", "columns", "tokens", "storage_id", "indexed_on", "class_name" ] args = namedtuple('StorageDictArgs', args_names) _prepared_store_meta = config.session.prepare( 'INSERT INTO hecuba.istorage' '(storage_id, class_name, name, tokens, ' 'primary_keys, columns, indexed_on)' 'VALUES (?,?,?,?,?,?,?)') @staticmethod def build_remotely(result): """ Launches the StorageDict.__init__ from the api.getByID Args: result: a namedtuple with all the information needed to create again the StorageDict """ log.debug("Building Storage dict with %s", result) return StorageDict(result.name, result.primary_keys, result.columns, result.tokens, result.storage_id, result.indexed_on) @staticmethod def _store_meta(storage_args): """ Method to update the info about the StorageDict in the DB metadata table Args: storage_args: structure with all data needed to update the metadata """ log.debug("StorageDict: storing metas %s", storage_args) try: config.session.execute(StorageDict._prepared_store_meta, [ storage_args.storage_id, storage_args.class_name, storage_args.name, storage_args.tokens, storage_args.primary_keys, storage_args.columns, storage_args.indexed_on ]) except Exception as ex: log.error("Error creating the StorageDict metadata: %s %s", storage_args, ex) raise ex def __init__(self, name=None, primary_keys=None, columns=None, tokens=None, storage_id=None, indexed_args=None, **kwargs): """ Creates a new StorageDict. Args: name (string): the name of the collection/table (keyspace is optional) primary_keys (list(tuple)): a list of (key,type) primary keys (primary + clustering). columns (list(tuple)): a list of (key,type) columns tokens (list): list of tokens storage_id (string): the storage id identifier indexed_args (list): values that will be used as index kwargs: other parameters """ super(StorageDict, self).__init__(**kwargs) self._is_persistent = False log.debug("CREATED StorageDict(%s,%s,%s,%s,%s,%s)", primary_keys, columns, name, tokens, storage_id, kwargs) if tokens is None: log.info('using all tokens') tokens = map(lambda a: a.value, config.cluster.metadata.token_map.ring) self._tokens = IStorage._discrete_token_ranges(tokens) else: self._tokens = tokens self._storage_id = storage_id if self.__doc__ is not None: self._persistent_props = self._parse_comments(self.__doc__) self._primary_keys = self._persistent_props[ self.__class__.__name__]['primary_keys'] self._columns = self._persistent_props[ self.__class__.__name__]['columns'] try: self._indexed_args = self._persistent_props[ self.__class__.__name__]['indexed_values'] except KeyError: self._indexed_args = indexed_args else: self._primary_keys = primary_keys self._columns = columns self._indexed_args = indexed_args key_names = [pkname for (pkname, dt) in self._primary_keys] column_names = [colname for (colname, dt) in self._columns] self._item_builder = namedtuple('row', key_names + column_names) if len(key_names) > 1: self._key_builder = namedtuple('row', key_names) else: self._key_builder = None if len(column_names) > 1: self._column_builder = namedtuple('row', column_names) else: self._column_builder = None self._k_size = len(key_names) class_name = '%s.%s' % (self.__class__.__module__, self.__class__.__name__) self._build_args = self.args(name, self._primary_keys, self._columns, self._tokens, self._storage_id, self._indexed_args, class_name) if name is not None: self.make_persistent(name) else: self._is_persistent = False 
def __eq__(self, other): """ Method to compare a StorageDict with another one. Args: other: StorageDict to be compared with. Returns: boolean (true - equals, false - not equals). """ return self._storage_id == other._storage_id and self._tokens == other.token_ranges and \ self._table == other.table_name and self._ksp == other.keyspace _dict_case = re.compile( '.*@TypeSpec + *< *< *([\w:, ]+)+ *> *, *([\w+:., <>]+) *>') _tuple_case = re.compile('.*@TypeSpec +(\w+) +tuple+ *< *([\w, +]+) *>') _index_vars = re.compile('.*@Index_on *([A-z0-9, ]+)') _other_case = re.compile(' *(\w+) *< *([\w, +]+) *>') @classmethod def _parse_comments(self, comments): """ Parses de comments in a class file to save them in the class information Args: comments: the comment in the class file Returns: this: a structure with all the information of the comment """ this = {} for line in comments.split('\n'): m = StorageDict._dict_case.match(line) if m is not None: # Matching @TypeSpec of a dict dict_keys, dict_values = m.groups() primary_keys = [] for ind, key in enumerate(dict_keys.split(",")): key = key.replace(' ', '') match = IStorage._data_type.match(key) if match is not None: # an IStorage with a name name, value = match.groups() elif ':' in key: raise SyntaxError else: name = "key" + str(ind) value = key name = name.replace(' ', '') value = value.replace(' ', '') primary_keys.append( (name, StorageDict._conversions[value])) dict_values = dict_values.replace(' ', '') if dict_values.startswith('dict'): n = IStorage._sub_dict_case.match(dict_values[4:]) # Matching @TypeSpec of a sub dict dict_keys2, dict_values2 = n.groups() primary_keys2 = [] for ind, key in enumerate(dict_keys2.split(",")): try: name, value = IStorage._data_type.match( key).groups() except ValueError: if ':' in key: raise SyntaxError else: name = "key" + str(ind) value = key name = name.replace(' ', '') primary_keys2.append( (name, StorageDict._conversions[value])) columns2 = [] dict_values2 = dict_values2.replace(' ', '') if dict_values2.startswith('tuple'): dict_values2 = dict_values2[6:] for ind, val in enumerate(dict_values2.split(",")): try: name, value = IStorage._data_type.match( val).groups() except ValueError: if ':' in key: raise SyntaxError else: name = "val" + str(ind) value = val columns2.append( (name, StorageDict._conversions[value])) columns = { 'type': 'dict', 'primary_keys': primary_keys2, 'columns': columns2 } elif dict_values.startswith('tuple'): n = IStorage._sub_tuple_case.match(dict_values[5:]) tuple_values = list(n.groups())[0] columns = [] for ind, val in enumerate(tuple_values.split(",")): try: name, value = val.split(':') except ValueError: if ':' in key: raise SyntaxError else: name = "val" + str(ind) value = val name = name.replace(' ', '') columns.append((name, StorageDict._conversions[value])) else: columns = [] for ind, val in enumerate(dict_values.split(",")): match = IStorage._data_type.match(val) if match is not None: # an IStorage with a name name, value = match.groups() elif ':' in val: name, value = IStorage._so_data_type.match( val).groups() else: name = "val" + str(ind) value = val name = name.replace(' ', '') try: columns.append( (name, StorageDict._conversions[value])) except KeyError: columns.append((name, value)) name = str(self).replace('\'>', '').split('.')[-1] if self.__class__.__name__ in this: this[name].update({ 'type': 'dict', 'primary_keys': primary_keys, 'columns': columns }) else: this[name] = { 'type': 'dict', 'primary_keys': primary_keys, 'columns': columns } m = 
StorageDict._index_vars.match(line) if m is not None: name = str(self).replace('\'>', '').split('.')[-1] indexed_values = m.groups() indexed_values = indexed_values.replace(' ', '').split(',') if name in this: this[name].update({'indexed_values': indexed_values}) else: this[name] = {'indexed_values': indexed_values} return this def __contains__(self, key): """ Method that checks if a given key exists in a StorageDict. Args: key: the position that we want to check if exists. Returns: boolean (true - exists, false - doesn't exist). """ if not self._is_persistent: return dict.__contains__(self, key) else: try: # TODO we should save this value in a cache self._hcache.get_row(self._make_key(key)) return True except Exception as ex: log.warn("persistentDict.__contains__ ex %s", ex) raise ex def _make_key(self, key): """ Method used to pass the key data to the StorageDict cache in a proper way. Args: key: the data that needs to get the correct format """ if isinstance(key, str) or isinstance( key, unicode) or not isinstance(key, Iterable): if len(self._primary_keys) == 1: if isinstance(key, unicode): return [key.encode('ascii', 'ignore')] return [key] else: raise Exception('missing a primary key') if isinstance(key, Iterable) and len(key) == len(self._primary_keys): return list(key) else: raise Exception('wrong primary key') @staticmethod def _make_value(value): """ Method used to pass the value data to the StorageDict cache in a proper way. Args: value: the data that needs to get the correct format """ if issubclass(value.__class__, IStorage): return [uuid.UUID(value.getID())] elif isinstance(value, str) or not isinstance(value, Iterable) or isinstance( value, np.ndarray): return [value] elif isinstance(value, unicode): return [value.encode('ascii', 'ignore')] else: return list(value) def keys(self): """ This method return a list of all the keys of the StorageDict. Returns: list: a list of keys """ return [i for i in self.iterkeys()] def values(self): """ This method return a list of all the values of the StorageDict. Returns: list: a list of values """ return [i for i in self.itervalues()] def __iter__(self): """ Method that overloads the python dict basic iteration, which returns an iterator over the dictionary keys. """ return self.iterkeys() def make_persistent(self, name): """ Method to transform a StorageDict into a persistent object. This will make it use a persistent DB as the main location of its data. Args: name: """ if self._is_persistent: raise AlreadyPersistentError( "This StorageDict is already persistent [Before:{}.{}][After:{}]", self._ksp, self._table, name) self._is_persistent = True (self._ksp, self._table) = self._extract_ks_tab(name) if self._storage_id is None: self._storage_id = uuid.uuid3(uuid.NAMESPACE_DNS, self._ksp + '.' + self._table) self._build_args = self._build_args._replace( storage_id=self._storage_id, name=self._ksp + "." + self._table) self._store_meta(self._build_args) if config.id_create_schema == -1: query_keyspace = "CREATE KEYSPACE IF NOT EXISTS %s WITH replication = %s" % ( self._ksp, config.replication) try: log.debug('MAKE PERSISTENCE: %s', query_keyspace) config.session.execute(query_keyspace) except Exception as ex: log.warn("Error creating the StorageDict keyspace %s, %s", (query_keyspace), ex) raise ex for key, value in dict.iteritems(self): if issubclass(value.__class__, IStorage): # new name as ksp+table+obj_class_name val_name = self._ksp + '.' 
+ self._table + type( value).__name__.lower() value.make_persistent(val_name) columns = self._primary_keys + self._columns for ind, entry in enumerate(columns): n = StorageDict._other_case.match(entry[1]) if n is not None: iter_type, intra_type = n.groups() else: iter_type = entry[1] if iter_type not in IStorage._basic_types: columns[ind] = entry[0], 'uuid' pks = map(lambda a: a[0], self._primary_keys) query_table = "CREATE TABLE IF NOT EXISTS %s.%s (%s, PRIMARY KEY (%s));" \ % (self._ksp, self._table, ",".join("%s %s" % tup for tup in columns), str.join(',', pks)) try: log.debug('MAKE PERSISTENCE: %s', query_table) config.session.execute(query_table) except Exception as ex: log.warn("Error creating the StorageDict table: %s %s", query_table, ex) raise ex key_names = map(lambda a: a[0].encode('UTF8'), self._primary_keys) column_names = self._columns self._hcache_params = (self._ksp, self._table, self._storage_id, self._tokens, key_names, map(lambda x: { "name": x[0], "type": x[1] }, column_names), { 'cache_size': config.max_cache_size, 'writer_par': config.write_callbacks_number, 'write_buffer': config.write_buffer_size }) log.debug("HCACHE params %s", self._hcache_params) self._hcache = Hcache(*self._hcache_params) # Storing all in-memory values to cassandra for key, value in dict.iteritems(self): self._hcache.put_row(self._make_key(key), self._make_value(value)) if hasattr(self, '_indexed_args') and self._indexed_args is not None: index_query = 'CREATE CUSTOM INDEX IF NOT EXISTS ' + self._table + '_idx ON ' index_query += self._ksp + '.' + self._table + ' (' + str.join( ',', self._indexed_args) + ') ' index_query += "using 'es.bsc.qbeast.index.QbeastIndex';" try: config.session.execute(index_query) except Exception as ex: log.error("Error creating the Qbeast custom index: %s %s", index_query, ex) raise ex def stop_persistent(self): """ Method to turn a StorageDict into non-persistent. """ log.debug('STOP PERSISTENCE: %s', self._table) self._is_persistent = False self._hcache = None def delete_persistent(self): """ Method to empty all data assigned to a StorageDict. """ query = "TRUNCATE TABLE %s.%s;" % (self._ksp, self._table) log.debug('DELETE PERSISTENT: %s', query) config.session.execute(query) def _build_istorage_obj(self, obj_type, so_name, storage_id): cname, module = IStorage.process_path(obj_type) mod = __import__(module, globals(), locals(), [cname], 0) # new name as ksp+table+obj_class_name so = getattr(mod, cname)(name=so_name + cname.lower(), storage_id=storage_id) # sso._storage_id = storage_id return so def __delitem__(self, key): """ Method to delete a specific entry in the dict in the key position. Args: key: position of the entry that we want to delete """ if not self._is_persistent: dict.__delitem__(self, key) else: self._hcache.delete_row([key]) def __getitem__(self, key): """ If the object is persistent, each request goes to the hfetch. Args: key: the dictionary key Returns item: value found in position key """ log.debug('GET ITEM %s', key) if not self._is_persistent: to_return = dict.__getitem__(self, key) return to_return else: cres = self._hcache.get_row(self._make_key(key)) log.debug("GET ITEM %s[%s]", cres, cres.__class__) final_results = [] for index, (name, col_type) in enumerate(self._columns): if col_type not in IStorage._basic_types: table_name = self._ksp + '.' 
+ self._table element = (self._build_istorage_obj( col_type, table_name, uuid.UUID(cres[index]))) else: element = cres[index] final_results.append(element) cres = final_results if issubclass(cres.__class__, NoneType): return None elif self._column_builder is not None: if len(cres) > 0 and isinstance(cres[0], list): return [self._column_builder(*row) for row in cres] else: return self._column_builder(*cres) else: return cres[0] def __setitem__(self, key, val): """ Method to insert values in the StorageDict Args: key: the position of the value that we want to save val: the value that we want to save in that position """ if isinstance(val, np.ndarray): val = StorageNumpy(val) log.debug('SET ITEM %s->%s', key, val) if not config.hecuba_type_checking: if not self._is_persistent: dict.__setitem__(self, key, val) else: if isinstance(val, IStorage) and not val._is_persistent: attribute = val.__class__.__name__.lower() count = self._count_name_collision(attribute) # new name as ksp+table+obj_class_name val.make_persistent(self._ksp + '.' + self._table + "_" + attribute + "_" + str(count)) self._hcache.put_row(self._make_key(key), self._make_value(val)) else: if isinstance(val, Iterable) and not isinstance(val, str): col_types = map( lambda x: IStorage._conversions[x.__class__.__name__], val) spec_col_types = map(lambda x: x[1], self._columns) for idx, value in enumerate(spec_col_types): if value == 'double': spec_col_types[idx] = 'float' else: col_types = IStorage._conversions[val.__class__.__name__] spec_col_types = map(lambda x: x[1], self._columns)[0] if spec_col_types == 'double': spec_col_types = 'float' if isinstance(key, Iterable) and not isinstance(key, str): key_types = map( lambda x: IStorage._conversions[x.__class__.__name__], key) spec_key_types = map(lambda x: x[1], self._primary_keys) for idx, value in enumerate(spec_key_types): if value == 'double': spec_key_types[idx] = 'float' else: key_types = IStorage._conversions[key.__class__.__name__] spec_key_types = map(lambda x: x[1], self._primary_keys)[0] if spec_key_types == 'double': spec_key_types = 'float' if (col_types == spec_col_types): if (key_types == spec_key_types): if not self._is_persistent: dict.__setitem__(self, key, val) else: self._hcache.put_row(self._make_key(key), self._make_value(val)) else: raise KeyError else: raise ValueError def __repr__(self): """ Overloads the method used by print to show a StorageDict Returns: The representation of the data stored in the StorageDict """ to_return = {} for item in self.iteritems(): to_return[item[0]] = item[1] if len(to_return) == config.hecuba_print_limit: return str(to_return) if len(to_return) > 0: return str(to_return) return "" def update(self, other=None, **kwargs): """ Updates the current dict with a new dictionary or set of attr,value pairs (those must follow the current dict data model). Args: other: python dictionary or StorageDict. All key,val values in it will be inserted in the current dict. **kwargs: set of attr:val pairs, to be treated as key,val and inserted in the current dict. 
""" if other is not None: if isinstance(other, StorageDict): for k, v in other.iteritems(): self[k] = v else: for k, v in other.items() if isinstance(other, Mapping) else other: self[k] = v for k, v in kwargs.items(): self[k] = v def iterkeys(self): """ Obtains the iterator for the keys of the StorageDict Returns: if persistent: iterkeys(self): list of keys if not persistent: dict.iterkeys(self) """ if self._is_persistent: ik = self._hcache.iterkeys(config.prefetch_size) return NamedIterator(ik, self._key_builder, self) else: return dict.iterkeys(self) def iteritems(self): """ Obtains the iterator for the key,val pairs of the StorageDict Returns: if persistent: NamedItemsIterator(self): list of key,val pairs if not persistent: dict.iteritems(self) """ if self._is_persistent: ik = self._hcache.iteritems(config.prefetch_size) return NamedItemsIterator(self._key_builder, self._column_builder, self._k_size, ik, self) else: return dict.iteritems(self) def itervalues(self): """ Obtains the iterator for the values of the StorageDict Returns: if persistent: NamedIterator(self): list of valuesStorageDict if not persistent: dict.itervalues(self) """ if self._is_persistent: ik = self._hcache.itervalues(config.prefetch_size) return NamedIterator(ik, self._column_builder, self) else: return dict.itervalues(self) def keys(self): return [i for i in self.iterkeys()] def values(self): return [i for i in self.itervalues()] def items(self): return [i for i in self.iteritems()] def get(self, key, default): try: value = self.__getitem__(key) except KeyError: value = default return value
def test_get_row_key_error(self): from hfetch import connectCassandra from hfetch import Hcache '''''' ''' This test check the hcache sets a key error when the key we asked doesnt exist Analyzes: - Hcache - Get_row (returning KeyError) ''' '''''' table = 'particle' num_keys = 10001 self.session.execute("DROP TABLE IF EXISTS %s.%s;" % (self.keyspace, table)) self.session.execute( "CREATE TABLE IF NOT EXISTS %s.%s(partid int, time float, ciao text," "x float, y float, z float, PRIMARY KEY(partid,time));" % (self.keyspace, table)) for i in xrange(0, num_keys): vals = ','.join( str(e) for e in [i, i / .1, i / .2, i / .3, i / .4, "'" + str(i * 60) + "'"]) self.session.execute( "INSERT INTO %s.%s(partid , time , x, y , z,ciao ) VALUES (%s)" % (self.keyspace, table, vals)) token_ranges = [(8070430489100699999, 8070450532247928832)] non_existent_keys = 10 cache_size = num_keys + non_existent_keys try: connectCassandra(self.contact_names, self.nodePort) except Exception: print 'can\'t connect, verify the contact points and port', self.contact_names, self.nodePort keys = ["partid", "time"] values = ["ciao", "x", "y", "z"] cache = Hcache(self.keyspace, table, "", token_ranges, keys, values, {'cache_size': cache_size}) # Access the cache, which is empty and queries cassandra to retrieve the data t1 = time.time() error_counter = 0 for pk in xrange(0, num_keys + non_existent_keys): ck = pk * 10 try: result = cache.get_row([pk, ck]) self.assertEqual(len(result), len(values)) except KeyError as e: error_counter = error_counter + 1 print 'Retrieved {0} keys in {1} seconds. {2} keys weren\'t found, {3} keys weren\'t supposed to be found'.format( unicode(str(num_keys), 'utf-8'), unicode(str(time.time() - t1), 'utf-8'), unicode(str(error_counter), 'utf-8'), unicode(str(non_existent_keys), 'utf-8')) self.assertEqual(error_counter, non_existent_keys) # Access the cache, which has already all the data and will ask cassandra only if # the keys asked are not present t1 = time.time() error_counter = 0 for pk in xrange(0, num_keys + non_existent_keys): ck = pk * 10 try: result = cache.get_row([pk, ck]) self.assertEqual(len(result), len(values)) except KeyError as e: error_counter = error_counter + 1 print 'Retrieved {0} keys in {1} seconds. {2} keys weren\'t found, {3} keys weren\'t supposed to be found'.format( unicode(str(num_keys), 'utf-8'), unicode(str(time.time() - t1), 'utf-8'), unicode(str(error_counter), 'utf-8'), unicode(str(non_existent_keys), 'utf-8')) self.assertEqual(error_counter, non_existent_keys)
def uuid_test(self): from hfetch import connectCassandra from hfetch import Hcache import uuid '''''' ''' This test check the correct handling of UUIDs Analyzes: - Hcache - Put_row - Iteritems ''' '''''' table = "uuid" self.session.execute("DROP TABLE IF EXISTS %s.%s;" % (self.keyspace, table)) self.session.execute( "CREATE TABLE IF NOT EXISTS %s.%s(partid uuid, data int, PRIMARY KEY(partid));" % (self.keyspace, table)) nelem = 1000 nblocks = 10 t_f = pow(-2, 63) # Token begin range t_t = pow(2, 63) - 1 # Token blocks tkn_size = (t_t - t_f) / (nelem / nblocks) tokens = [(a, a + tkn_size) for a in xrange(t_f, t_t - tkn_size, tkn_size)] try: connectCassandra(self.contact_names, self.nodePort) except Exception: print 'can\'t connect, verify the contact points and port', self.contact_names, self.nodePort keys = ["partid"] values = ["data"] # CREATE TABLE test.bulk(partid int PRIMARY KEY, data text); cache = Hcache(self.keyspace, table, "WHERE token(partid)>=? AND token(partid)<?;", tokens, keys, values, { 'cache_size': '10', 'writer_buffer': 20 }) # Write data someid = None i = 0 while i < nelem: u = uuid.uuid4() # ('81da81e8-1914-11e7-908d-ecf4bb4c66c4') cache.put_row([u], [i]) if i == nelem / 2: someid = u i += 1 # by recreating the cache we wait until all the data is written cache = Hcache(self.keyspace, table, "WHERE token(partid)>=? AND token(partid)<?;", tokens, keys, values, { 'cache_size': '10', 'writer_buffer': 20 }) # Read data itera = cache.iteritems(10) found = False counter = 0 while True: try: L = uuid.UUID(itera.get_next()[0]) if L == someid: found = True except StopIteration: break counter = counter + 1 self.assertEqual(counter, nelem) self.assertTrue(found)
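# Hedged sketch (illustrative, not part of the tests above): the token set-up that the
# tests repeat inline splits the full Murmur3 token ring [-2**63, 2**63 - 1] into equally
# sized (begin, end) pairs, which are then passed to Hcache as its token ranges. The
# helper name "split_token_ring" is introduced only for this example; the tests compute
# the same list inline with n_ranges = nelem / nblocks.
def split_token_ring(n_ranges):
    t_f = pow(-2, 63)      # first token of the ring
    t_t = pow(2, 63) - 1   # last token of the ring
    tkn_size = (t_t - t_f) / n_ranges
    return [(a, a + tkn_size) for a in xrange(t_f, t_t - tkn_size, tkn_size)]

# e.g. split_token_ring(100) returns 100 contiguous (begin, end) pairs covering the ring,
# matching the "tokens" argument built inline in the tests above.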
class Hfetch_Tests(unittest.TestCase): keyspace = "hnumpy_test" contact_names = ['127.0.0.1'] nodePort = 9042 cluster = Cluster(contact_names, port=nodePort) session = cluster.connect() @classmethod def setUpClass(cls): cls.session.execute( "CREATE KEYSPACE IF NOT EXISTS %s WITH replication " "= {'class': 'SimpleStrategy', 'replication_factor': 1};" % cls.keyspace) cls.session.execute( "CREATE TYPE IF NOT EXISTS %s.numpy_meta(dims frozen<list<int>>,type int,type_size int);" % cls.keyspace) @classmethod def tearDownClass(cls): #self.session.execute("DROP KEYSPACE IF EXISTS %s;" % cls.keyspace) pass def test_simple_memory(self): from hfetch import connectCassandra from hfetch import Hcache import numpy as np '''''' ''' Analyzes: ''' '''''' dims = 2 elem_dim = 4096 try: connectCassandra(self.contact_names, self.nodePort) except RuntimeError, e: print e print 'can\'t connect, verify the contact points and port', self.contact_names, self.nodePort table = "arrays_numpies" self.session.execute("DROP TABLE if exists %s.%s;" % (self.keyspace, table)) self.session.execute( "CREATE TABLE %s.%s(storage_id uuid, cluster_id int, block_id int, payload blob,PRIMARY KEY((storage_id,cluster_id),block_id));" % (self.keyspace, table)) storage_id = uuid.uuid3(uuid.NAMESPACE_DNS, self.keyspace + '.' + table) time.sleep(5) a = Hcache(self.keyspace, table, storage_id, [], ['storage_id', 'cluster_id', 'block_id'], [{ 'name': "payload", 'type': 'numpy' }], {}) #prepare data bigarr = np.arange(pow(elem_dim, dims)).reshape(elem_dim, elem_dim) print 'To be written ' keys = [storage_id, -1, -1] values = [bigarr.astype('i')] print values #insert a.put_row(keys, values) #delete is a blocking op which waits the data to be flushed del a a = Hcache(self.keyspace, table, storage_id, [], ["storage_id", 'cluster_id', 'block_id'], [{ "name": "payload", "type": "numpy" }], {}) #retrieve result = a.get_row(keys) print 'Retrieved from cassandra' print result if np.array_equal(bigarr, result[0]): print 'Created and retrieved are equal' else: self.fail('Created and retrieved ndarrays differ') self.session.execute("DROP TABLE %s.%s;" % (self.keyspace, table))
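# Hedged sketch (illustrative, not part of the tests above): both the uuid test and the
# numpy test rely on the same flush pattern -- put_row writes asynchronously, so the
# Hcache is destroyed (or recreated) before reading back to make sure the data has
# reached Cassandra. The helper below only restates that pattern; the keyspace, table,
# keys and values arguments are whatever the caller's table defines.
def write_then_read(keyspace, table, keys, values, row_key, row_value):
    from hfetch import Hcache
    cache = Hcache(keyspace, table, "", [], keys, values,
                   {'cache_size': '10', 'writer_buffer': 20})
    cache.put_row(row_key, row_value)  # asynchronous write into the writer buffer
    del cache                          # blocking: waits until pending writes are flushed
    cache = Hcache(keyspace, table, "", [], keys, values, {})
    return cache.get_row(row_key)      # the freshly written row is now visible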
t_f = pow(-2, 63) # Token begin range t_t = pow(2, 63) - 1 # Token blocks tkn_size = (t_t - t_f) / (nparts / p) tkns = [(a, a + tkn_size) for a in xrange(t_f, t_t - tkn_size, tkn_size)] keys = ["partid", "time"] values = ["x"] hcache_config = {'cache_size': '100', 'writer_buffer': 20} token_query = "WHERE token(partid)>=? AND token(partid)<?;" cache = Hcache(self.keyspace, table, token_query, tkns, keys, values, hcache_config) hiter_config = {"prefetch_size": 100, "update_cache": "yes"} hiter = cache.iteritems(hiter_config) count = 0 start = time.time() while True: try: i = hiter.get_next() self.assertEqual(len(i), len(keys) + len(values)) except StopIteration: break count += 1
def test_get_row(self): from hfetch import connectCassandra from hfetch import Hcache '''''' ''' This test iterates over a set of particles, performing get_row operations Analyzes: - HCache (multiple reads of the same key) - Get_row ''' '''''' table = 'particle' num_keys = 10001 self.session.execute("DROP TABLE IF EXISTS %s.%s;" % (self.keyspace, table)) self.session.execute( "CREATE TABLE IF NOT EXISTS %s.%s(partid int, time float, ciao text," "x float, y float, z float, PRIMARY KEY(partid,time));" % (self.keyspace, table)) for i in xrange(0, num_keys): vals = ','.join( str(e) for e in [i, i / .1, i / .2, i / .3, i / .4, "'" + str(i * 60) + "'"]) self.session.execute( "INSERT INTO %s.%s(partid , time , x, y , z,ciao ) VALUES (%s)" % (self.keyspace, table, vals)) try: connectCassandra(self.contact_names, self.nodePort) except Exception: print 'can\'t connect, verify the contact points and port', self.contact_names, self.nodePort token_ranges = [] cache_size = 10001 keys = ["partid", "time"] values = ["ciao", "x", "y", "z"] cache_config = {'cache_size': cache_size} cache = Hcache(self.keyspace, table, "", token_ranges, keys, values, cache_config) # clustering key t1 = time.time() for pk in xrange(0, num_keys): ck = pk * 10 try: result = cache.get_row([pk, ck]) self.assertEqual(len(result), len(values)) except KeyError as e: print "Error when retrieving value from cache:", e, [pk, ck] print 'time - load C++ cache with cassandra data: ', time.time() - t1 t1 = time.time() for pk in xrange(0, num_keys): ck = pk * 10 try: result = cache.get_row([pk, ck]) self.assertEqual(len(result), len(values)) except KeyError as e: print "Error when retrieving value from cache:", e, [pk, ck] # print 'items in res: ',len(result) print 'time - read data from C++ cache: ', time.time() - t1 py_dict = {} cache = Hcache(self.keyspace, table, "", [(8070430489100699999, 8070450532247928832)], ["partid", "time"], ["ciao", "x", "y", "z"], {'cache_size': num_keys}) t1 = time.time() for pk in xrange(0, num_keys): ck = pk * 10 try: result = cache.get_row([pk, ck]) py_dict[(pk, ck)] = result self.assertEqual(len(result), len(values)) except KeyError as e: print "Error when retrieving value from cache:", e, [pk, ck] print 'time - load data into python dict: ', time.time() - t1 # print 'size ', len(py_dict) # print 'items in res: ',len(py_dict[1]) t1 = time.time() for pk in xrange(0, num_keys): ck = pk * 10 try: result = py_dict[(pk, ck)] self.assertEqual(len(result), len(values)) except KeyError as e: print "Error when retrieving value from cache:", e, [pk, ck] print 'time - read data from the python dict: ', time.time() - t1
def test_delete_row(self):
    from hfetch import connectCassandra
    from hfetch import Hcache
    ''''''
    '''
    This test deletes a row from the cache and checks that retrieving it
    afterwards raises a KeyError
    Analyzes:
    - HCache
    - Delete_row
    - Get_row (raising KeyError once the entry is gone)
    '''
    ''''''
    table = 'particle'
    num_keys = 100
    # num keys must be multiple of expected_errors
    expected_errors = 10

    self.session.execute("DROP TABLE IF EXISTS %s.%s;" % (self.keyspace, table))
    self.session.execute(
        "CREATE TABLE IF NOT EXISTS %s.%s(partid int, time float, ciao text,"
        "x float, y float, z float, PRIMARY KEY(partid,time));" % (self.keyspace, table))

    for i in xrange(0, num_keys):
        vals = ','.join(
            str(e) for e in
            [i, i / .1, i / .2, i / .3, i / .4, "'" + str(i * 60) + "'"])
        self.session.execute(
            "INSERT INTO %s.%s(partid , time , x, y , z,ciao ) VALUES (%s)"
            % (self.keyspace, table, vals))

    try:
        connectCassandra(self.contact_names, self.nodePort)
    except Exception:
        print 'can\'t connect, verify the contact points and port', self.contact_names, self.nodePort

    token_ranges = []
    cache_size = 1
    keys = ["partid", "time"]
    values = ["ciao", "x", "y", "z"]
    cache_config = {'cache_size': cache_size}
    cache = Hcache(self.keyspace, table, "", token_ranges, keys, values,
                   cache_config)

    pk = 0
    ck = pk * 10
    try:
        result = cache.get_row([pk, ck])
        self.assertEqual(len(result), len(values))
    except KeyError as e:
        self.fail("Error when retrieving value from cache: " + str(e) +
                  " -- " + str([pk, ck]))

    try:
        result = cache.delete_row([pk, ck])
    except KeyError as e:
        self.fail("Error when deleting entry from cache: " + str(e) +
                  " -- " + str([pk, ck]))

    try:
        result = cache.get_row([pk, ck])
        self.fail(
            "Error when retrieving value from cache, the entry shouldn't exist"
        )
    except KeyError as e:
        pass
class StorageNumpy(np.ndarray, IStorage): _storage_id = None _build_args = None _class_name = None _hcache_params = None _hcache = None _prepared_store_meta = config.session.prepare( 'INSERT INTO hecuba.istorage' '(storage_id, class_name, name)' 'VALUES (?,?,?)') args_names = ["storage_id", "class_name", "name"] args = namedtuple('StorageNumpyArgs', args_names) def __new__(cls, input_array=None, storage_id=None, name=None, **kwargs): if input_array is None and name is not None and storage_id is not None: input_array = cls.load_array(storage_id, name) obj = np.asarray(input_array).view(cls) obj._is_persistent = True elif name is None and storage_id is not None: raise RuntimeError("hnumpy received storage id but not a name") elif (input_array is not None and name is not None and storage_id is not None) \ or (storage_id is None and name is not None): obj = np.asarray(input_array).view(cls) obj.make_persistent(name) else: obj = np.asarray(input_array).view(cls) obj._is_persistent = False # Input array is an already formed ndarray instance # We first cast to be our class type # add the new attribute to the created instance obj._storage_id = storage_id # Finally, we must return the newly created object: obj._class_name = '%s.%s' % (cls.__module__, cls.__name__) return obj # used as copy constructor def __array_finalize__(self, obj): if obj is None: return self._storage_id = getattr(obj, '_storage_id', None) @staticmethod def build_remotely(new_args): """ Launches the StorageNumpy.__init__ from the uuid api.getByID Args: new_args: a list of all information needed to create again the StorageNumpy Returns: so: the created StorageNumpy """ log.debug("Building StorageNumpy object with %s", new_args) return StorageNumpy(new_args.storage_id) @staticmethod def _store_meta(storage_args): """ Saves the information of the object in the istorage table. Args:. storage_args (object): contains all data needed to restore the object from the workers """ log.debug("StorageObj: storing media %s", storage_args) try: config.session.execute(StorageNumpy._prepared_store_meta, [ storage_args.storage_id, storage_args.class_name, storage_args.name ]) except Exception as ex: log.warn("Error creating the StorageNumpy metadata with args: %s" % str(storage_args)) raise ex @staticmethod def load_array(storage_id, name): (ksp, table) = IStorage._extract_ks_tab(name) _hcache_params = (ksp, table + '_numpies', storage_id, [], ['storage_id', 'cluster_id', 'block_id'], [{ 'name': "payload", 'type': 'numpy' }], { 'cache_size': config.max_cache_size, 'writer_par': config.write_callbacks_number, 'write_buffer': config.write_buffer_size }) _hcache = Hcache(*_hcache_params) result = _hcache.get_row([storage_id, -1, -1]) if len(result) == 1: return result[0] else: raise KeyError def make_persistent(self, name): if self._is_persistent: raise AlreadyPersistentError( "This StorageNumpy is already persistent [Before:{}.{}][After:{}]", self._ksp, self._table, name) self._is_persistent = True (self._ksp, self._table) = self._extract_ks_tab(name) if self._storage_id is None: self._storage_id = uuid.uuid3( uuid.NAMESPACE_DNS, self._ksp + '.' + self._table + '_numpies') self._build_args = self.args(self._storage_id, self._class_name, name) log.info("PERSISTING DATA INTO %s %s", self._ksp, self._table) query_keyspace = "CREATE KEYSPACE IF NOT EXISTS %s WITH replication = %s" % ( self._ksp, config.replication) config.session.execute(query_keyspace) config.session.execute( 'CREATE TABLE IF NOT EXISTS ' + self._ksp + '.' 
+ self._table + '_numpies' '(storage_id uuid , ' 'cluster_id int, ' 'block_id int, ' 'payload blob, ' 'PRIMARY KEY((storage_id,cluster_id),block_id))') self._hcache_params = (self._ksp, self._table + '_numpies', self._storage_id, [], ['storage_id', 'cluster_id', 'block_id'], [{ 'name': "payload", 'type': 'numpy' }], { 'cache_size': config.max_cache_size, 'writer_par': config.write_callbacks_number, 'write_buffer': config.write_buffer_size }) self._hcache = Hcache(*self._hcache_params) if len(self.shape) != 0: self._hcache.put_row([self._storage_id, -1, -1], [self]) self._store_meta(self._build_args) def delete_persistent(self): """ Deletes the Cassandra table where the persistent StorageObj stores data """ self._is_persistent = False query = "DELETE FROM %s.%s WHERE storage_id = %s;" % ( self._ksp, self._table + '_numpies', self._storage_id) log.debug("DELETE PERSISTENT: %s", query) config.session.execute(query)
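# Hedged usage sketch (illustrative, not part of the class above): the intended round
# trip for StorageNumpy is to persist an ndarray under a name and later rebuild it from
# its storage_id and name, which goes through load_array(). The import path
# "hecuba.hnumpy", the table name "my_app.matrix" and the access to the internal
# _storage_id attribute are assumptions made only for this example; a configured Hecuba
# session (config.session) is required for it to run.
import numpy as np
from hecuba.hnumpy import StorageNumpy

data = np.arange(16).reshape(4, 4)
sn = StorageNumpy(data)                 # in-memory ndarray view, not persistent yet
sn.make_persistent('my_app.matrix')     # writes the blob into my_app.matrix_numpies
reloaded = StorageNumpy(name='my_app.matrix', storage_id=sn._storage_id)
assert np.array_equal(sn, reloaded)     # the reloaded array holds the same payload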