Example #1
    def test_put_row_text(self):
        from hfetch import connectCassandra
        from hfetch import Hcache
        '''
        Simple test to store text and retrieve it

        Analyzes:
        - HCache
        - Put_row (write text)
        - Iteritems (read text)
        '''

        table = "bulk"

        self.session.execute("DROP TABLE IF EXISTS %s.%s;" %
                             (self.keyspace, table))
        self.session.execute(
            "CREATE TABLE %s.%s(partid int PRIMARY KEY, data text);" %
            (self.keyspace, table))

        num_items = int(pow(10, 3))

        try:
            connectCassandra(self.contact_names, self.nodePort)
        except Exception:
            print 'can\'t connect, verify the contact points and port', self.contact_names, self.nodePort

        nblocks = 10
        t_f = pow(-2, 63)  # Token begin range
        t_t = pow(2, 63) - 1
        # Token blocks
        tkn_size = (t_t - t_f) / (num_items / nblocks)
        tokens = [(a, a + tkn_size)
                  for a in xrange(t_f, t_t - tkn_size, tkn_size)]

        keys = ["partid"]
        values = ["data"]

        hcache_config = {'cache_size': '10', 'writer_buffer': 20}

        cache = Hcache(self.keyspace, table, "", tokens, keys, values,
                       hcache_config)
        for i in xrange(0, num_items):
            cache.put_row([i], ['someRandomText'])

        # it doesn't make sense to count the read elements
        # because the data is still being written asynchronously
        hiter = cache.iteritems(10)
        while True:
            try:
                data = hiter.get_next()
                self.assertEqual(len(data), len(keys) + len(values))
                self.assertEqual(data[1], 'someRandomText')
            except StopIteration:
                break
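A note on the token arithmetic above: Murmur3 tokens span [-2**63, 2**63 - 1], and the tests cut that range into fixed-size blocks. A minimal, self-contained sketch of the same computation (written for Python 3, where range accepts arbitrary integers; the block count is an arbitrary demo value):

def make_token_ranges(n_blocks):
    t_f = -2 ** 63       # first Murmur3 token
    t_t = 2 ** 63 - 1    # last Murmur3 token
    tkn_size = (t_t - t_f) // n_blocks
    # same list comprehension as the tests above; note that, as in the
    # original code, the last block stops short of t_t
    return [(a, a + tkn_size) for a in range(t_f, t_t - tkn_size, tkn_size)]

ranges = make_token_ranges(10)
assert ranges[0][0] == -2 ** 63
assert all(b - a == ranges[0][1] - ranges[0][0] for a, b in ranges)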
Example #2
    def _setup_hcache(self):
        key_names = [key["name"] for key in self._primary_keys]
        persistent_values = [{"name": col["name"]} for col in self._columns]

        if self._tokens is None:
            raise RuntimeError("Tokens for object {} are null".format(self._get_name()))

        self._hcache_params = (self._ksp, self._table,
                               self.storage_id,
                               self._tokens, key_names, persistent_values,
                               {'cache_size': config.max_cache_size,
                                'writer_par': config.write_callbacks_number,
                                'writer_buffer': config.write_buffer_size,
                                'timestamped_writes': config.timestamped_writes})
        log.debug("HCACHE params %s", self._hcache_params)
        self._hcache = Hcache(*self._hcache_params)
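For reference, the positional layout that Hcache(*self._hcache_params) relies on, as inferred from the calls in these examples (not an authoritative signature; the literal values below are placeholders, and the config keys are just the ones seen in this listing):

from hfetch import Hcache

hcache_params = ("my_ksp",              # keyspace
                 "my_table",            # table
                 "",                    # storage id or token predicate slot
                 [],                    # token ranges ([] = default)
                 ["partid"],            # key column names
                 [{"name": "data"}],    # value column descriptors
                 {"cache_size": 100,    # configuration dictionary
                  "writer_buffer": 20})
cache = Hcache(*hcache_params)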
Example #3
    def test_write_nulls_simple(self):
        from hfetch import connectCassandra
        from hfetch import Hcache
        import time
        '''
        Simple test to store rows mixing data and nulls, and retrieve them

        Analyzes:
        - HCache
        - Put_row (write data mixed with nulls)
        '''

        table = "nulls"

        self.session.execute("DROP TABLE IF EXISTS %s.%s;" %
                             (self.keyspace, table))
        self.session.execute(
            "CREATE TABLE %s.%s(partid int PRIMARY KEY, time float, data text);"
            % (self.keyspace, table))

        num_items = int(pow(10, 3))

        try:
            connectCassandra(self.contact_names, self.nodePort)
        except Exception:
            print 'can\'t connect, verify the contact points and port', self.contact_names, self.nodePort

        nblocks = 10
        t_f = pow(-2, 63)  # Token begin range
        t_t = pow(2, 63) - 1
        # Token blocks
        tkn_size = (t_t - t_f) / (num_items / nblocks)
        tokens = [(a, a + tkn_size)
                  for a in xrange(t_f, t_t - tkn_size, tkn_size)]

        keys = ["partid"]
        values = ["time", "data"]

        hcache_config = {'cache_size': '10', 'writer_buffer': 20}

        cache = Hcache(self.keyspace, table, "", tokens, keys, values,
                       hcache_config)
        for i in xrange(0, num_items):
            cache.put_row([i], [12, None])
            # alternatively: random.sample({i, None}, 1) + random.sample({'SomeRandomText', None}, 1)
        time.sleep(10)
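A hypothetical follow-up read (a sketch, continuing the test body above; it assumes get_row returns only the value columns, as the other examples do, and that the float column comes back as 12.0):

        row = cache.get_row([0])
        self.assertEqual(row, [12.0, None])  # 'data' was stored as null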
Example #4
    def make_persistent(self, name):
        if self._is_persistent:
            raise AlreadyPersistentError(
                "This StorageNumpy is already persistent [Before:{}.{}][After:{}]".format(
                    self._ksp, self._table, name))
        self._is_persistent = True

        (self._ksp, self._table) = self._extract_ks_tab(name)
        if self._storage_id is None:
            self._storage_id = uuid.uuid3(
                uuid.NAMESPACE_DNS, self._ksp + '.' + self._table + '_numpies')
        self._build_args = self.args(self._storage_id, self._class_name, name)
        log.info("PERSISTING DATA INTO %s %s", self._ksp, self._table)

        query_keyspace = "CREATE KEYSPACE IF NOT EXISTS %s WITH replication = %s" % (
            self._ksp, config.replication)
        config.session.execute(query_keyspace)

        config.session.execute(
            'CREATE TABLE IF NOT EXISTS ' + self._ksp + '.' + self._table +
            '_numpies'
            '(storage_id uuid , '
            'cluster_id int, '
            'block_id int, '
            'payload blob, '
            'PRIMARY KEY((storage_id,cluster_id),block_id))')

        self._hcache_params = (self._ksp, self._table + '_numpies',
                               self._storage_id, [],
                               ['storage_id', 'cluster_id', 'block_id'],
                               [{'name': "payload", 'type': 'numpy'}],
                               {'cache_size': config.max_cache_size,
                                'writer_par': config.write_callbacks_number,
                                'write_buffer': config.write_buffer_size})

        self._hcache = Hcache(*self._hcache_params)
        if len(self.shape) != 0:
            self._hcache.put_row([self._storage_id, -1, -1], [self])
        self._store_meta(self._build_args)
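The uuid3 call above is what makes the storage id reproducible: the same keyspace and table always map to the same id, which is what lets load_array (Example #6) find the array again without extra metadata. A minimal demonstration:

import uuid

a = uuid.uuid3(uuid.NAMESPACE_DNS, "my_ksp.my_table_numpies")
b = uuid.uuid3(uuid.NAMESPACE_DNS, "my_ksp.my_table_numpies")
assert a == b  # uuid3 is a deterministic, name-based UUID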
Example #5
    def test_small_brute(self):
        from hfetch import connectCassandra
        from hfetch import Hcache
        '''
        This test iterates over a small amount of data using iterkeys and validates that
        no column name can be a key and a value at the same time

        Analyzes:
        - HCache (enforce a column can't be a key and a value at the same time)
        - Iterkeys
        '''

        table = "particle"
        nelems = 10001

        self.session.execute("DROP TABLE IF EXISTS %s.%s;" %
                             (self.keyspace, table))
        self.session.execute(
            "CREATE TABLE IF NOT EXISTS %s.%s(partid int, time float, ciao text,"
            "x float, y float, z float, PRIMARY KEY(partid,time));" %
            (self.keyspace, table))

        for i in xrange(0, nelems):
            vals = ','.join(
                str(e) for e in
                [i, i / .1, i / .2, i / .3, i / .4, "'" + str(i * 60) + "'"])
            self.session.execute(
                "INSERT INTO %s.%s(partid , time , x, y , z,ciao ) VALUES (%s)"
                % (self.keyspace, table, vals))

        try:
            connectCassandra(self.contact_names, self.nodePort)
        except Exception:
            print 'can\'t connect, verify the contact points and port', self.contact_names, self.nodePort

        nblocks = 100

        t_f = pow(-2, 63)  # Token begin range
        t_t = pow(2, 63) - 1
        # Token blocks
        tkn_size = (t_t - t_f) / (nelems / nblocks)
        tokens = [(a, a + tkn_size)
                  for a in xrange(t_f, t_t - tkn_size, tkn_size)]

        hcache_config = {'cache_size': '10', 'writer_buffer': 20}
        keys = ["partid", "time"]
        values = ["time", "x"]

        # this should fail since a column name can not be a key and a value at the same time (key=time, column=time)
        try:
            cache = Hcache(self.keyspace, table,
                           "WHERE token(partid)>=? AND token(partid)<?;",
                           tokens, keys, values, hcache_config)
            self.fail("expected RuntimeError: 'time' is used both as key and value")
        except RuntimeError:
            pass
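For contrast, a sketch of a non-overlapping key/value spec for the same table, which is expected to construct without raising (it continues the test body, so self.keyspace, tokens and hcache_config are the names defined above):

        keys = ["partid", "time"]
        values = ["x", "y", "z"]  # no column repeated as key and value
        cache = Hcache(self.keyspace, table,
                       "WHERE token(partid)>=? AND token(partid)<?;",
                       tokens, keys, values, hcache_config)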
Example #6
 def load_array(storage_id, name):
     (ksp, table) = IStorage._extract_ks_tab(name)
     _hcache_params = (ksp, table + '_numpies', storage_id, [],
                       ['storage_id', 'cluster_id', 'block_id'],
                       [{'name': "payload", 'type': 'numpy'}],
                       {'cache_size': config.max_cache_size,
                        'writer_par': config.write_callbacks_number,
                        'write_buffer': config.write_buffer_size})
     _hcache = Hcache(*_hcache_params)
     result = _hcache.get_row([storage_id, -1, -1])
     if len(result) == 1:
         return result[0]
     else:
         raise KeyError
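Hypothetical usage of load_array, assuming connectCassandra() has already been called and an array was persisted as in Example #4 (the keyspace and table names are placeholders):

import uuid

sid = uuid.uuid3(uuid.NAMESPACE_DNS, "my_ksp.my_table_numpies")
arr = load_array(sid, "my_ksp.my_table")  # returns the numpy payload or raises KeyError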
Example #7
    def test_simpletest(self):
        from hfetch import connectCassandra
        from hfetch import Hcache
        '''
        Analyzes:
        '''

        table = 'particle'
        nelems = 500

        self.session.execute("DROP TABLE IF EXISTS %s.%s;" %
                             (self.keyspace, table))
        self.session.execute(
            "CREATE TABLE IF NOT EXISTS %s.%s(partid int, time float, ciao text,"
            "x float, y float, z float, PRIMARY KEY(partid,time));" %
            (self.keyspace, table))

        for i in xrange(0, nelems):
            vals = ','.join(
                str(e) for e in
                [i, i / .1, i / .2, i / .3, i / .4, "'" + str(i * 60) + "'"])
            self.session.execute(
                "INSERT INTO %s.%s(partid , time , x, y , z,ciao ) VALUES (%s)"
                % (self.keyspace, table, vals))

        try:
            connectCassandra(self.contact_names, self.nodePort)
        except Exception:
            print 'can\'t connect, verify the contact points and port', self.contact_names, self.nodePort

        keys = ["partid", "time"]
        values = ["x", "y", "z"]
        token_ranges = []
        # an empty configuration parameter (the last dictionary) means use the default config
        cache = Hcache(self.keyspace, table,
                       "WHERE token(partid)>=? AND token(partid)<?;",
                       token_ranges, keys, values, {})

        def get_data(cache, keys):
            data = None
            try:
                data = cache.get_row(keys)
                self.assertEqual(len(data), len(values))
            except KeyError:
                print 'not found'
            return data

        q1 = get_data(cache, [433, 4330])  # float(0.003)
        lost = get_data(cache, [133, 1330])
        lost = get_data(cache, [433, 4330])
        q2 = get_data(cache, [433, 4330])
        self.assertEqual(q1, q2)
Example #8
    def test_coherency(self):
        from hfetch import connectCassandra
        from hfetch import Hcache
        from hfetch import HWriter
        '''
        Analyzes:
        - HCache
        '''

        table = "particle"
        nparts = 10000  # Num particles in range

        self.session.execute("DROP TABLE IF EXISTS %s.%s;" %
                             (self.keyspace, table))
        self.session.execute(
            "CREATE TABLE IF NOT EXISTS %s.%s(partid int, time float,"
            "x float, y float, z float, PRIMARY KEY(partid,time));" %
            (self.keyspace, table))

        try:
            connectCassandra(self.contact_names, self.nodePort)
        except Exception:
            print 'can\'t connect, verify the contact points and port', self.contact_names, self.nodePort

        tkns = []
        keys = ["partid", "time"]
        values = ["x", "y", "z"]
        cache = Hcache(self.keyspace, table,
                       "WHERE token(partid)>=? AND token(partid)<?;", tkns,
                       keys, values, {'cache_size': '1', 'writer_buffer': 20})
        for i in xrange(0, nparts):
            cache.put_row([i, i / .1], [i / .2, i / .3, i / .4])

        for i in reversed(xrange(0, nparts)):
            try:
                cache.get_row([i, i / .1])
            except KeyError:
                str_k = str([i, i / .1])
                self.fail(str_k + " not found")
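With 'cache_size': '1', nearly every get_row above misses the in-memory cache and goes to Cassandra, so the loop really checks that the buffered writes landed. A sketch of a more explicit single-row variant using the flush() call that the StorageDict examples below rely on (hedged: it assumes flush() forces pending buffered writes out):

        cache.put_row([0, 0.0], [1.0, 2.0, 3.0])
        cache.flush()  # push pending buffered writes to Cassandra
        row = cache.get_row([0, 0.0])  # should now be readable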
Example #9
    def test_iterators(self):
        from hfetch import connectCassandra
        from hfetch import Hcache
        '''
        This test iterates over some text and checks coherency between hcache and hiter

        Analyzes:
        - HCache
        - Get_row (read text)
        - Iteritems (read text)
        '''

        table = "words"
        num_keys = 20

        self.session.execute("DROP TABLE IF EXISTS %s.%s;" %
                             (self.keyspace, table))
        self.session.execute(
            "CREATE TABLE %s.%s(position int PRIMARY KEY, wordinfo text);" %
            (self.keyspace, table))

        for i in xrange(0, num_keys):
            vals = ','.join(
                str(e) for e in [
                    i, "'someRandomTextForTesting purposes - " + str(i * 60) +
                    "'"
                ])
            self.session.execute(
                "INSERT INTO %s.%s(position , wordinfo ) VALUES (%s)" %
                (self.keyspace, table, vals))

        try:
            connectCassandra(self.contact_names, self.nodePort)
        except Exception:
            print 'can\'t connect, verify the contact points and port', self.contact_names, self.nodePort

        tkns = [(pow(-2, 63) + 1, pow(2, 63) - 1)]
        keys = ["position"]
        values = ["wordinfo"]
        hcache_config = {'cache_size': 100, 'writer_buffer': 20}

        cache = Hcache(self.keyspace, table,
                       "WHERE token(position)>=? AND token(position)<?;", tkns,
                       keys, values, hcache_config)

        iter_config = {"prefetch_size": 100, "update_cache": "yes"}
        myIter = cache.iteritems(iter_config)

        data = []
        for i in xrange(0, 10):
            data.append(myIter.get_next())

        assert (len(data) > 0)
        first_data = data[0]

        assert (len(first_data) == 2)
        first_key = [first_data[0]]

        assert (type(first_key[0]) == int)
        somedata = cache.get_row(first_key)
        # self.assertEqual((first_key + somedata), first_data)
        assert ((first_key + somedata) == first_data)

        count = len(data)

        while True:
            try:
                i = myIter.get_next()
            except StopIteration:
                print 'End of data, items read: ', count
                break
            count = count + 1

        print 'data was: \n', data
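The hiter objects follow a get_next()/StopIteration protocol rather than Python's iterator protocol, so they can be wrapped in a generator. A small sketch, assuming the same cache object and API as above:

        def iterate(hiter):
            while True:
                try:
                    yield hiter.get_next()
                except StopIteration:
                    return

        for row in iterate(cache.iteritems({"prefetch_size": 100, "update_cache": "yes"})):
            print(row)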
Example #10
    def write_test(self):
        from hfetch import connectCassandra
        from hfetch import Hcache
        from hfetch import HWriter
        import time
        '''
        While the iterator retrieves the data from a table, the writer stores it into another table

        Analyzes:
        - HCache
        - HWriter
        - Iteritems (updating the cache)
        '''

        table = "particle"
        table_write = "particle_write"
        nparts = 6000  # Num particles in range

        self.session.execute("DROP TABLE IF EXISTS %s.%s;" %
                             (self.keyspace, table))
        self.session.execute(
            "CREATE TABLE IF NOT EXISTS %s.%s(partid int, time float, ciao text,"
            "x float, y float, z float, PRIMARY KEY(partid,time));" %
            (self.keyspace, table))

        self.session.execute(
            "CREATE TABLE IF NOT EXISTS %s.%s(partid int, time float,"
            "x float, y float, z float, PRIMARY KEY(partid,time));" %
            (self.keyspace, table_write))

        for i in xrange(0, nparts):
            vals = ','.join(
                str(e) for e in
                [i, i / .1, i / .2, i / .3, i / .4, "'" + str(i * 60) + "'"])
            self.session.execute(
                "INSERT INTO %s.%s(partid , time , x, y , z,ciao ) VALUES (%s)"
                % (self.keyspace, table, vals))

        try:
            connectCassandra(self.contact_names, self.nodePort)
        except Exception:
            print 'can\'t connect, verify the contact points and port', self.contact_names, self.nodePort

        p = 1000  # Num partitions

        t_f = -7764607523034234880  # Token begin range
        t_t = 7764607523034234880  # Token end range
        # Token blocks
        tkn_size = (t_t - t_f) / (nparts / p)
        tkns = [(a, a + tkn_size)
                for a in xrange(t_f, t_t - tkn_size, tkn_size)]
        keys = ["partid", "time"]
        values = ["x", "y", "z"]
        a = Hcache(self.keyspace, table,
                   "WHERE token(partid)>=? AND token(partid)<?;", tkns, keys,
                   values, {'cache_size': '100', 'writer_buffer': 20})

        writer = HWriter(self.keyspace, table_write, keys, values,
                         {'writer_buffer': 20})

        def readAll(iter, wr):
            count = 1
            while True:
                try:
                    i = iter.get_next()
                except StopIteration:
                    print 'End of data, items read: ', count
                    break
                wr.write(i[0:2], i[2:5])
                count += 1
                if count % 100000 == 0:
                    print count
            print "iter has %d elements" % count

        start = time.time()
        readAll(a.iteritems({
            "prefetch_size": 100,
            "update_cache": "yes"
        }), writer)
        print "finshed into %d" % (time.time() - start)
Example #11
        try:
            connectCassandra(self.contact_names, self.nodePort)
        except Exception as e:
            print e
            print 'can\'t connect, verify the contact points and port', self.contact_names, self.nodePort

        table = "arrays_numpies"

        self.session.execute("DROP TABLE if exists %s.%s;" %
                             (self.keyspace, table))
        self.session.execute(
            "CREATE TABLE %s.%s(storage_id uuid, cluster_id int, block_id int, payload blob,PRIMARY KEY((storage_id,cluster_id),block_id));"
            % (self.keyspace, table))
        storage_id = uuid.uuid3(uuid.NAMESPACE_DNS,
                                self.keyspace + '.' + table)
        time.sleep(5)
        a = Hcache(self.keyspace, table, storage_id, [],
                   ['storage_id', 'cluster_id', 'block_id'],
                   [{'name': "payload", 'type': 'numpy'}], {})

        # prepare data
        bigarr = np.arange(pow(elem_dim, dims)).reshape(elem_dim, elem_dim, elem_dim)

        keys = [storage_id, -1, -1]
        values = [bigarr.astype('i')]

        #insert
        a.put_row(keys, values)

        # otherwise we would ask for the row before it has been processed
        time.sleep(2)
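A hypothetical readback of the array written above (same keys; it assumes the numpy payload is returned as the single value column, as in load_array from Example #6):

        result = a.get_row(keys)
        np.testing.assert_array_equal(result[0], bigarr.astype('i'))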
Example #12
class QbeastIterator(IStorage):
    """
    Object used to access data from workers.
    """

    args_names = ['primary_keys', 'columns', 'indexed_on', 'name', 'qbeast_meta', 'qbeast_random',
                  'storage_id', 'tokens', 'class_name', 'built_remotely']
    _building_args = namedtuple('QbeastArgs', args_names)
    _prepared_store_meta = config.session.prepare('INSERT INTO hecuba.istorage'
                                                  '(primary_keys, columns, indexed_on, name, qbeast_meta,'
                                                  ' qbeast_random, storage_id, tokens, class_name)'
                                                  'VALUES (?,?,?,?,?,?,?,?,?)')
    _prepared_set_qbeast_meta = config.session.prepare('INSERT INTO hecuba.istorage (storage_id, qbeast_meta) '
                                                       'VALUES (?,?)')

    @staticmethod
    def _store_meta(storage_args):
        log.debug("QbeastIterator: storing metas %s", '')

        try:
            config.session.execute(QbeastIterator._prepared_store_meta,
                                   [storage_args.primary_keys,
                                    storage_args.columns,
                                    storage_args.indexed_on,
                                    storage_args.name,
                                    storage_args.qbeast_meta,
                                    storage_args.qbeast_random,
                                    storage_args.storage_id,
                                    storage_args.tokens,
                                    storage_args.class_name])
        except Exception as ex:
            log.error("Error creating the StorageDictIx metadata: %s %s", storage_args, ex)
            raise ex

    def __init__(self, primary_keys, columns, indexed_on, name, qbeast_meta=None, qbeast_random=None,
                 storage_id=None, tokens=None, **kwargs):
        """
        Creates a new block.
        Args:
            primary_keys (list(tuple)): a list of (key,type) primary keys (primary + clustering).
            columns (list(tuple)): a list of (key,type) columns
            indexed_on (list(str)): a list of the names of the indexed columns
            name (string): keyspace.table of the Cassandra collection
            qbeast_random (str): qbeast random string; selects issued from different nodes must share this value
            storage_id (uuid): the storage id identifier
            tokens (list): list of tokens
        """
        super().__init__((), name=name, storage_id=storage_id, **kwargs)

        log.debug("CREATED QbeastIterator(%s,%s,%s,%s)", storage_id, tokens, )

        self._qbeast_meta = qbeast_meta
        self._primary_keys = primary_keys
        self._columns = columns
        self._indexed_on = indexed_on

        if qbeast_random is None:
            self._qbeast_random = ''.join(random.choice(string.ascii_letters + string.digits) for _ in range(5))
        else:
            self._qbeast_random = qbeast_random

        class_name = '%s.%s' % (self.__class__.__module__, self.__class__.__name__)

        self._primary_keys = [{"type": key[1], "name": key[0]} if isinstance(key, tuple) else key
                              for key in self._primary_keys]
        self._columns = [{"type": col[1], "name": col[0]} if isinstance(col, tuple) else col
                         for col in self._columns]

        key_names = [col["name"] for col in self._primary_keys]
        column_names = [col["name"] for col in self._columns]
        if len(key_names) > 1:
            self._key_builder = namedtuple('row', key_names)
        else:
            self._key_builder = None
        if len(column_names) > 1:
            self._column_builder = namedtuple('row', column_names)
        else:
            self._column_builder = None

        self._k_size = len(primary_keys)

        build_keys = [(key["name"], key["type"]) for key in self._primary_keys]
        build_columns = [(col["name"], col["type"]) for col in self._columns]

        self._build_args = self._building_args(
            build_keys,
            build_columns,
            self._indexed_on,
            self._ksp + "." + self._table,
            self._qbeast_meta,
            self._qbeast_random,
            self.storage_id,
            self._tokens,
            class_name,
            self._built_remotely)

        if name or storage_id:
            self.make_persistent(name)

    def make_persistent(self, name):
        # Update local QbeastIterator metadata
        super().make_persistent(name)
        self._build_args = self._build_args._replace(storage_id=self.storage_id, name=self._ksp + "." + self._table,
                                                     tokens=self._tokens)

        self._setup_hcache()

        QbeastIterator._store_meta(self._build_args)

    def _setup_hcache(self):
        key_names = [key["name"] for key in self._primary_keys]
        persistent_values = [{"name": col["name"]} for col in self._columns]

        if self._tokens is None:
            raise RuntimeError("Tokens for object {} are null".format(self._get_name()))

        self._hcache_params = (self._ksp, self._table,
                               self.storage_id,
                               self._tokens, key_names, persistent_values,
                               {'cache_size': config.max_cache_size,
                                'writer_par': config.write_callbacks_number,
                                'writer_buffer': config.write_buffer_size,
                                'timestamped_writes': config.timestamped_writes})
        log.debug("HCACHE params %s", self._hcache_params)
        self._hcache = Hcache(*self._hcache_params)

    def _set_qbeast_meta(self, qbeast_meta):
        self._qbeast_meta = qbeast_meta
        self._build_args = self._build_args._replace(qbeast_meta=qbeast_meta)
        config.session.execute(QbeastIterator._prepared_set_qbeast_meta, [self.storage_id, qbeast_meta])

    def __len__(self):
        return len([row for row in self.__iter__()])

    def __iter__(self):
        if hasattr(self, "_qbeast_meta") and self._qbeast_meta is not None:
            conditions = ""
            for index, (from_p, to_p) in enumerate(zip(self._qbeast_meta.from_point, self._qbeast_meta.to_point)):
                conditions += "{0} > {1} AND {0} < {2} AND ".format(self._indexed_on[index], from_p, to_p)

            conditions = conditions[:-5] + self._qbeast_meta.mem_filter

            conditions += " AND expr(%s_idx, 'precision=%s:%s') ALLOW FILTERING" \
                          % (self._table, self._qbeast_meta.precision, self._qbeast_random)

            hiter = self._hcache.iteritems({'custom_select': conditions, 'prefetch_size': config.prefetch_size})
        else:
            hiter = self._hcache.iteritems(config.prefetch_size)

        return NamedItemsIterator(self._key_builder, self._column_builder, self._k_size, hiter, self)
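A worked sketch of the selection string built in __iter__ above, with hypothetical values (two indexed columns, empty mem_filter); it shows why the [:-5] slice is there:

indexed_on = ["x", "y"]
from_point, to_point = [0.0, 0.0], [1.0, 1.0]

conditions = ""
for index, (from_p, to_p) in enumerate(zip(from_point, to_point)):
    conditions += "{0} > {1} AND {0} < {2} AND ".format(indexed_on[index], from_p, to_p)

conditions = conditions[:-5]  # drop the trailing " AND " (5 characters)
print(conditions)  # x > 0.0 AND x < 1.0 AND y > 0.0 AND y < 1.0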
Example #13
class StorageDict(IStorage, dict):
    # """
    # Object used to access data from workers.
    # """

    args_names = [
        "name", "primary_keys", "columns", "tokens", "storage_id",
        "indexed_on", "class_name", "built_remotely"
    ]
    args = namedtuple('StorageDictArgs', args_names)
    _prepared_store_meta = config.session.prepare(
        'INSERT INTO hecuba.istorage'
        '(storage_id, class_name, name, tokens, '
        'primary_keys, columns, indexed_on)'
        'VALUES (?,?,?,?,?,?,?)')

    @staticmethod
    def _store_meta(storage_args):
        """
        Method to update the info about the StorageDict in the DB metadata table
        Args:
            storage_args: structure with all data needed to update the metadata
        """
        log.debug("StorageDict: storing metas %s", storage_args)

        try:
            config.session.execute(StorageDict._prepared_store_meta, [
                storage_args.storage_id, storage_args.class_name,
                storage_args.name, storage_args.tokens,
                storage_args.primary_keys, storage_args.columns,
                storage_args.indexed_on
            ])
        except Exception as ex:
            log.error("Error creating the StorageDict metadata: %s %s",
                      storage_args, ex)
            raise ex

    def __init__(self,
                 name=None,
                 primary_keys=None,
                 columns=None,
                 indexed_on=None,
                 storage_id=None,
                 **kwargs):
        """
        Creates a new StorageDict.

        Args:
            name (string): the name of the collection/table (keyspace is optional)
            primary_keys (list(tuple)): a list of (key,type) primary keys (primary + clustering).
            columns (list(tuple)): a list of (key,type) columns
            tokens (list): list of tokens
            storage_id (string): the storage id identifier
            indexed_on (list): values that will be used as index
            kwargs: other parameters
        """

        super().__init__((), name=name, storage_id=storage_id, **kwargs)
        log.debug("CREATE StorageDict(%s,%s)", primary_keys, columns)
        '''
        yolandab
        kwargs of the init should contain metas: the whole istorage row, if it exists,
        after super().__init__
                    if kwargs is empty --> this is a new object:
                        generate build args parsing the __doc__ string or using the parameters;
                        we need to generate the column info of sets with the format to persist it (name --> _set_name);
                        if name or storage_id --> call _store_meta
                    else --> this is an already existing object:
                        metas and tokens should form the attributes of self;
                        we need to convert the column info of sets to the in-memory format (_set_name --> name)
            TODO: implement a cleaner version of embedded sets
        '''
        build_column = None
        build_keys = None
        if self.__doc__ is not None:
            self._persistent_props = self._parse_comments(self.__doc__)
            self._primary_keys = self._persistent_props['primary_keys']
            self._columns = self._persistent_props['columns']
            self._indexed_on = self._persistent_props.get(
                'indexed_on', indexed_on)

        # Field '_istorage_metas' will be set if it exists in HECUBA.istorage
        initialized = (getattr(self, '_istorage_metas', None) is not None)
        if not initialized and self.__doc__ is None:
            # info is not in the doc string; it must be passed as parameters
            if primary_keys is None or columns is None:
                raise RuntimeError(
                    "StorageDict: missing specification. Type of primary key or column undefined"
                )
            self._primary_keys = primary_keys
            self._columns = columns
            self._indexed_on = indexed_on

        if initialized:  #object already in istorage

            # if (primary_keys is not None or columns is not None):
            #    raise RuntimeError("StorageDict: Trying to define a new schema, but it is already persistent")
            #    --> this check would be necessary if passing columns/key spec
            #    as parameter was part of the user interface. As it is intended
            #    just for internal use we skip this check. If the spec does not
            #    match the actual schema access to the object will fail.

            if getattr(self, "_persistent_props",
                       None) is not None:  # __doc__ and disk: do they match?
                self._check_schema_and_raise("__init__")

            else:  # _persistent_props == None (only in disk)
                # Parse _istorage_metas to fulfill the _primary_keys, _columns
                self._primary_keys = self._istorage_metas.primary_keys
                self._columns = self._istorage_metas.columns
                build_column = self._columns  # Keep a copy from the disk to avoid recalculate it later
                build_keys = self._primary_keys  # Keep a copy from the disk to avoid recalculate it later
                self._indexed_on = self._istorage_metas.indexed_on
                #we manipulate the info about sets retrieved from istorage
                # (_set_s1_0,int), (_set_s1_1,int) --> {name: s1, type: set , column:((s1_0, int), (s1_1, int))}
                has_embedded_set = False
                set_pks = []
                normal_columns = []
                for column_name, column_type in self._columns:
                    if column_name.find("_set_") == 0:
                        # Remove '_set_'; the attribute name also contains the "column_name" needed later
                        attr_name = column_name[5:]
                        set_pks.append((attr_name, column_type))
                        has_embedded_set = True
                    else:
                        normal_columns.append((column_name, column_type))
                if has_embedded_set:  # Embedded set has a different layout {name, type:set, columns:[(name,type),(name,type)]}
                    # Get the first name (attr_1, attr_2... -> attr, or attr -> attr)
                    column_name = attr_name.split("_", 1)[0]
                    self._columns = [{
                        "name": column_name,
                        "type": "set",
                        "columns": set_pks
                    }]
                else:
                    self._columns = [{
                        "type": col[1],
                        "name": col[0]
                    } for col in normal_columns]

        # COMMON CODE: new and instantiation
        # Special case:Do we have an embedded set?
        self._has_embedded_set = False
        if isinstance(self._columns[0], dict):
            if self._columns[0]['type'] == 'set':
                self._has_embedded_set = True

        self._primary_keys = [{
            "type": key[1],
            "name": key[0]
        } if isinstance(key, tuple) else key for key in self._primary_keys]
        self._columns = [{
            "type": col[1],
            "name": col[0]
        } if isinstance(col, tuple) else col for col in self._columns]
        # POST: _primary_keys and _columns are list of DICTS> [ {name:..., type:...}, {name:..., type:set, columns:[(name,type),...]},...]
        log.debug("CREATED StorageDict(%s,%s)", self._primary_keys,
                  self._columns)
        key_names = [key["name"] for key in self._primary_keys]
        column_names = [col["name"] for col in self._columns]

        if len(key_names) > 1:
            self._key_builder = namedtuple('row', key_names)
        else:  # 1
            self._key_builder = None

        if self._has_embedded_set:
            set_names = [colname for (colname, dt) in self._get_set_types()]
            self._column_builder = namedtuple('row', set_names)
        elif len(column_names) > 1:
            self._column_builder = namedtuple('row', column_names)
        else:
            self._column_builder = None

        self._k_size = len(key_names)

        class_name = '%s.%s' % (self.__class__.__module__,
                                self.__class__.__name__)

        if build_keys is None:
            build_keys = [(key["name"], key["type"])
                          for key in self._primary_keys]

        # Define 'build_column': it will contain the column info stored in istorage. For the sets we manipulate the parsed data
        if build_column is None:
            build_column = []
            for col in self._columns:
                if col["type"] == "set":
                    types = col["columns"]
                    for t in types:
                        build_column.append(("_set_" + t[0], t[1]))
                else:
                    build_column.append((col["name"], col["type"]))

        self._build_args = self.args(self._get_name(), build_keys,
                                     build_column, self._tokens,
                                     self.storage_id, self._indexed_on,
                                     class_name, self._built_remotely)

        if name and storage_id and (storage_id != storage_id_from_name(name)):
            # instantiating a split object
            self._persist_metadata()
        elif name or storage_id:  # instantiating a persistent object
            if initialized:  # already existing
                self._setup_hcache()
            else:  # new object
                self._persist_metadata()

    @classmethod
    def _parse_comments(cls, comments):
        parser = Parser("TypeSpec")
        return parser._parse_comments(comments)

    def __contains__(self, key):
        """
        Method that checks if a given key exists in a StorageDict.
        Args:
            key: the position that we want to check if exists.
        Returns:
            boolean (true - exists, false - doesn't exist).
        """
        if not self.storage_id:
            return dict.__contains__(self, key)
        else:
            try:
                # TODO we should save this value in a cache
                self._hcache.get_row(self._make_key(key))
                return True
            except Exception as ex:
                log.warn("persistentDict.__contains__ ex %s", ex)
                return False

    def _create_tables(self):
        # Prepare data
        persistent_keys = [
            (key["name"], "tuple<" + ",".join(key["columns"]) + ">")
            if key["type"] == "tuple" else (key["name"], key["type"])
            for key in self._primary_keys
        ] + self._get_set_types()
        persistent_values = []
        if not self._has_embedded_set:
            for col in self._columns:
                if col["type"] == "tuple":
                    persistent_values.append({
                        "name": col["name"],
                        "type": "tuple<" + ",".join(col["columns"]) + ">"
                    })
                elif col["type"] not in basic_types:
                    persistent_values.append({
                        "name": col["name"],
                        "type": "uuid"
                    })
                else:
                    persistent_values.append({
                        "name": col["name"],
                        "type": col["type"]
                    })

        key_names = [
            col[0] if isinstance(col, tuple) else col["name"]
            for col in persistent_keys
        ]

        query_keyspace = "CREATE KEYSPACE IF NOT EXISTS %s WITH replication = %s" % (
            self._ksp, config.replication)
        try:
            log.debug('MAKE PERSISTENCE: %s', query_keyspace)
            config.executelocked(query_keyspace)
        except Exception as ex:
            log.warn("Error creating the StorageDict keyspace %s, %s",
                     (query_keyspace), ex)
            raise ex

        persistent_columns = [(col["name"], col["type"])
                              for col in persistent_values]

        query_table = "CREATE TABLE IF NOT EXISTS %s.%s (%s, PRIMARY KEY (%s));" \
                      % (self._ksp,
                         self._table,
                         ",".join("%s %s" % tup for tup in persistent_keys + persistent_columns),
                         str.join(',', key_names))
        try:
            log.debug('MAKE PERSISTENCE: %s', query_table)
            config.executelocked(query_table)
        except Exception as ex:
            log.warn("Error creating the StorageDict table: %s %s",
                     query_table, ex)
            raise ex

        if hasattr(self, '_indexed_on') and self._indexed_on is not None:
            index_query = 'CREATE CUSTOM INDEX IF NOT EXISTS ' + self._table + '_idx ON '
            index_query += self._ksp + '.' + self._table + ' (' + str.join(
                ',', self._indexed_on) + ') '
            index_query += "using 'es.bsc.qbeast.index.QbeastIndex';"
            try:
                config.executelocked(index_query)
            except Exception as ex:
                log.error("Error creating the Qbeast custom index: %s %s",
                          index_query, ex)
                raise ex
            trigger_query = "CREATE TRIGGER IF NOT EXISTS %s%s_qtr ON %s.%s USING 'es.bsc.qbeast.index.QbeastTrigger';" % \
                            (self._ksp, self._table, self._ksp, self._table)
            try:
                config.executelocked(trigger_query)
            except Exception as ex:
                log.error("Error creating the Qbeast trigger: %s %s",
                          trigger_query, ex)
                raise ex

    def _persist_data_from_memory(self):
        for k, v in super().items():
            self[k] = v
        if config.max_cache_size != 0:  #if C++ cache is enabled, clear Python memory, otherwise keep it
            super().clear()

    def sync(self):
        super().sync()
        self._hcache.flush()

    def _setup_hcache(self):
        key_names = [key["name"] for key in self._primary_keys]
        key_names = key_names + [name for name, dt in self._get_set_types()]

        persistent_values = []
        if not self._has_embedded_set:
            persistent_values = [{"name": col["name"]} for col in self._columns]
        if self._tokens is None:
            raise RuntimeError("Tokens for object {} are null".format(
                self._get_name()))
        self._hcache_params = (self._ksp, self._table, self.storage_id,
                               self._tokens, key_names, persistent_values,
                               {'cache_size': config.max_cache_size,
                                'writer_par': config.write_callbacks_number,
                                'writer_buffer': config.write_buffer_size,
                                'timestamped_writes': config.timestamped_writes})
        log.debug("HCACHE params %s", self._hcache_params)
        self._hcache = Hcache(*self._hcache_params)

    def _make_key(self, key):
        """
        Method used to pass the key data to the StorageDict cache in a proper way.
        Args:
            key: the data that needs to get the correct format
        """
        if isinstance(key, str) or not isinstance(key, Iterable):
            if len(self._primary_keys) == 1:
                return [key]
            else:
                raise Exception('missing a primary key')

        if isinstance(key, Iterable) and len(key) == len(self._primary_keys):
            return list(key)
        elif self._has_embedded_set and isinstance(
                key, Iterable) and len(key) == (len(self._primary_keys) +
                                                len(self._get_set_types())):
            return list(key)
        else:
            raise Exception('wrong primary key')

    @staticmethod
    def _make_value(value):
        """
        Method used to pass the value data to the StorageDict cache in a proper way.
        Args:
            value: the data that needs to get the correct format
        """
        if issubclass(value.__class__, IStorage):
            return [value.storage_id]
        elif isinstance(value, str) or not isinstance(value, Iterable) \
                or isinstance(value, np.ndarray):
            return [value]
        elif isinstance(value, tuple):
            return [value]
        elif isinstance(value, Iterable):
            val = []
            for v in value:
                if isinstance(v, IStorage):
                    val.append(v.storage_id)
                else:
                    val.append(v)
            return val
        else:
            return list(value)

    def _count_elements(self, query):
        try:
            result = config.session.execute(query)
            return result[0][0]
        except OperationTimedOut as ex:
            import warnings
            warnings.warn(
                "len() operation on {} from class {} failed by timeout. "
                "Use len() on split() results if you must".format(
                    self._get_name(), self.__class__.__name__))
            raise ex
        except Exception as ir:
            log.error("Unable to execute %s", query)
            raise ir

    def __iter__(self):
        """
        Method that overloads the python dict basic iteration, which returns
        an iterator over the dictionary keys.
        """
        return self.keys()

    def _persist_metadata(self):
        """
        Private Method to create tables, setup the cache and store the metadata
        of a StorageDict.
        Used for NEW storage dicts that do not need to persist any data.
        """
        if not self._built_remotely:
            self._create_tables()
        self._setup_hcache()
        StorageDict._store_meta(self._build_args)

    def _persist_data(self, name):
        """
        Private Method to store a StorageDict into cassandra
        This will make it use a persistent DB as the main location
        of its data.
        Args:
            name: the name (keyspace.table) under which the data is stored
        """
        # Update local StorageDict metadata
        self._build_args = self._build_args._replace(
            storage_id=self.storage_id,
            name=self._ksp + "." + self._table,
            tokens=self._tokens)
        self._persist_metadata()
        self._persist_data_from_memory()

    def make_persistent(self, name):
        """
        Method to transform a StorageDict into a persistent object.
        This will make it use a persistent DB as the main location
        of its data.
        Args:
            name: the name (keyspace.table) under which the data is persisted
        """
        super().make_persistent(name)
        if getattr(self, "_istorage_metas", None) is not None:
            self._check_schema_and_raise("make_persistent")
        self._persist_data(name)

    def stop_persistent(self):
        """
        Method to turn a StorageDict into non-persistent.
        """
        super().stop_persistent()
        log.debug('STOP PERSISTENCE: %s', self._table)
        self._hcache = None
        self.storage_id = None

    def delete_persistent(self):
        """
        Method to empty all data assigned to a StorageDict.
        """
        self.sync()
        super().delete_persistent()
        log.debug('DELETE PERSISTENT: %s', self._table)
        query = "DROP TABLE %s.%s;" % (self._ksp, self._table)
        config.session.execute(query)

        query = "DELETE FROM hecuba.istorage where storage_id={}".format(
            self.storage_id)
        config.session.execute(query)
        self.storage_id = None

    def __delitem__(self, key):
        """
        Method to delete a specific entry in the dict in the key position.
        Args:
            key: position of the entry that we want to delete
        """
        if not self.storage_id:
            dict.__delitem__(self, key)
        elif self._has_embedded_set:
            self._hcache.delete_row(key)
        elif isinstance(key, Iterable) and not isinstance(key, str):
            self._hcache.delete_row(list(key))
        else:
            self._hcache.delete_row([key])

    def __create_embeddedset(self, key, val=None):
        if not isinstance(key, Iterable) or isinstance(key, str):
            return EmbeddedSet(self, [key], val)
        else:
            return EmbeddedSet(self, list(key), val)

    def _check_schema_and_raise(self, txt):
        """
        Raises an exception if the schema stored in the database does not match
        the description of the object in memory. This may happen if the
        user specifies an already used name for their data.
        PRE:
            self._istorage_metas contains a list of tuples (name, type)
            self._primary_keys contains a list of tuples (name, type) or list of dicts {'name':value, 'type':value}
            self._columns may contain:
                        a list of tuples (name, type) or
                        a list of dicts {'name':value, 'type':value}  or
                        a list of dicts with a set {'name':value, 'type':'set','columns':[(name1,type1),....]}
        """
        # TODO: Change parser to have a consistent behaviour
        # try to send a useful message if it is a problem with a mismatched schema
        if getattr(self, "_istorage_metas", None) is None:
            self._istorage_metas = get_istorage_attrs(self.storage_id)

        if len(self._primary_keys) != len(self._istorage_metas.primary_keys):
            raise RuntimeError(
                "StorageDict: {}: key Metadata does not match specification. Trying {} but stored specification {}"
                .format(txt, self._primary_keys,
                        self._istorage_metas.primary_keys))
        pk = [{
            "type": key[1],
            "name": key[0]
        } if isinstance(key, tuple) else key for key in self._primary_keys]

        for pos, key in enumerate(pk):
            if (self._istorage_metas.primary_keys[pos][0] != key['name'] or
                    self._istorage_metas.primary_keys[pos][1] != key['type']):
                raise RuntimeError(
                    "StorageDict: {}: key Metadata does not match specification. Trying {} but stored specification {}"
                    .format(txt, self._primary_keys,
                            self._istorage_metas.primary_keys))

        columns = self._columns
        # Treat the embedded set case...
        if type(self._columns[0]) == dict:
            if self._columns[0]['type'] == 'set':
                columns = self._columns[0]['columns']
        if len(columns) != len(self._istorage_metas.columns):
            raise RuntimeError(
                "StorageDict: {}: column Metadata does not match specification. Trying {} but stored specification {}"
                .format(txt, self._columns, self._istorage_metas.columns))
        columns = [{
            "type": col[1],
            "name": col[0]
        } if isinstance(col, tuple) else col for col in columns]
        for pos, val in enumerate(columns):
            #istorage_metas.columns[pos] -->[(_set_s1_0,int),(_set_s1_1,int)]
            mykey = self._istorage_metas.columns[pos][0]
            mytype = self._istorage_metas.columns[pos][1]

            if mykey.find("_set_") == 0:
                mykey = mykey[
                    5:]  # Skip the '_set_' '_set_s1_0' ==> 's1_0' TODO Change the set identification method
            if (mykey != val['name']) or (mytype != val['type']):
                raise RuntimeError(
                    "StorageDict: {}: column Metadata does not match specification. Trying {} but stored specification {}"
                    .format(txt, self._columns, self._istorage_metas.columns))

    def __getitem__(self, key):
        """
        If the object is persistent, each request goes to the hfetch.
        Args:
             key: the dictionary key
        Returns
             item: value found in position key
        """
        log.debug('GET ITEM %s', key)

        if not self.storage_id:
            return dict.__getitem__(self, key)
        elif self._has_embedded_set:
            return self.__create_embeddedset(key=key)
        else:
            # Returns always a list with a single entry for the key
            if config.max_cache_size == 0:  # if C++ cache is disabled, use Python memory
                try:
                    result = dict.__getitem__(self, key)
                    return result
                except:
                    pass

            persistent_result = self._hcache.get_row(self._make_key(key))

            log.debug("GET ITEM %s[%s]", persistent_result,
                      persistent_result.__class__)

            # we need to transform UUIDs belonging to IStorage objects and rebuild them
            # TODO hcache should return objects of the class uuid, not str
            final_results = []
            for index, col in enumerate(self._columns):
                col_type = col["type"]
                element = persistent_result[index]
                if col_type not in basic_types:
                    # element is not a built-in type
                    info = {
                        "storage_id": element,
                        "tokens": self._build_args.tokens,
                        "class_name": col_type
                    }
                    element = build_remotely(info)

                final_results.append(element)

            if self._column_builder is not None:
                return self._column_builder(*final_results)
            else:
                return final_results[0]

    def __make_val_persistent(self, val, col=0):
        if isinstance(val, list):
            for index, element in enumerate(val):
                val[index] = self.__make_val_persistent(element, index)
        elif isinstance(val, IStorage) and not val._is_persistent:
            valstorage_id = uuid.uuid4()
            attribute = self._columns[col]["name"]

            name = self._ksp + "." + (
                "D" + str(valstorage_id).replace('-', '_') + self._table +
                attribute
            )[:
              40]  # 48 is the max length of table names, this may have collisions but this would only affect to object instantiation that are not really expected (user should give the name of the object instead of relying on the system to generate it)
            # new name as ksp.Dra_n_dom_table_attrname[:40]
            val.make_persistent(name)
        return val

    def __setitem__(self, key, val):
        """
           Method to insert values in the StorageDict
           Args:
               key: the position of the value that we want to save
               val: the value that we want to save in that position
        """
        if isinstance(val, list):
            vals_istorage = []
            for element in val:
                if isinstance(element, np.ndarray) and not isinstance(
                        element, StorageNumpy):
                    val_istorage = StorageNumpy(element)
                else:
                    val_istorage = element
                vals_istorage.append(val_istorage)

            val = vals_istorage
        elif isinstance(val, np.ndarray) and not isinstance(val, StorageNumpy):
            val = StorageNumpy(val)
        elif isinstance(val, set):
            val = self.__create_embeddedset(key=key, val=val)

        log.debug('SET ITEM %s->%s', key, val)
        if self.storage_id is None:
            dict.__setitem__(self, key, val)
        elif not isinstance(val, EmbeddedSet):
            # Not needed because it is made persistent and inserted to hcache when calling to self.__create_embeddedset
            val = self.__make_val_persistent(val)
            self._hcache.put_row(self._make_key(key), self._make_value(val))

            if config.max_cache_size == 0:  # If C++ cache is disabled, use python memory
                dict.__setitem__(self, key, val)

    def __len__(self):
        if not self.storage_id:
            return super().__len__()

        self.sync()
        if (self._tokens[0][0] == _min_token
                and self._tokens[-1][1] == _max_token):
            query = f"SELECT COUNT(*) FROM {self._ksp}.{self._table}"
            return self._count_elements(query)

        else:
            keys = []
            for pkey in self._primary_keys:
                template = "'{}'" if pkey["type"] == "text" else "{}"
                keys.append(template.format(pkey["name"]))
            all_keys = ",".join(keys)

            total = 0
            for (token_start, token_end) in self._tokens:
                query = f"SELECT COUNT(*) FROM {self._ksp}.{self._table} " \
                    f"WHERE token({all_keys})>={token_start} AND token({all_keys})<{token_end}"

                total = total + self._count_elements(query)
            return total

    def __repr__(self):
        """
        Overloads the method used by print to show a StorageDict
        Returns: The representation of the data stored in the StorageDict

        """
        to_return = {}
        for item in self.items():
            to_return[item[0]] = item[1]
            if len(to_return) == config.hecuba_print_limit:
                return str(to_return)
        if len(to_return) > 0:
            return str(to_return)
        return ""

    def update(self, other=None, **kwargs):
        """
        Updates the current dict with a new dictionary or set of attr,value pairs
        (those must follow the current dict data model).
        Args:
            other: python dictionary or StorageDict. All key,val values in it will
            be inserted in the current dict.
            **kwargs: set of attr:val pairs, to be treated as key,val and inserted
            in the current dict.
        """
        if other is not None:
            if isinstance(other, StorageDict):
                for k, v in other.items():
                    self[k] = v
            else:
                for k, v in other.items() if isinstance(other,
                                                        Mapping) else other:
                    self[k] = v
        for k, v in kwargs.items():
            self[k] = v

    def keys(self):
        """
        Obtains the iterator for the keys of the StorageDict
        Returns:
            if persistent:
                iterkeys(self): list of keys
            if not persistent:
                dict.keys(self)
        """
        if self.storage_id:
            self.sync()
            ik = self._hcache.iterkeys(config.prefetch_size)
            iterator = NamedIterator(ik, self._key_builder, self)
            if self._has_embedded_set:
                iterator = iter(set(iterator))

            return iterator
        else:
            return dict.keys(self)

    def items(self):
        """
        Obtains the iterator for the key,val pairs of the StorageDict
        Returns:
            if persistent:
                NamedItemsIterator(self): list of key,val pairs
            if not persistent:
                dict.items(self)
        """
        if self.storage_id:
            self.sync()
            ik = self._hcache.iteritems(config.prefetch_size)
            iterator = NamedItemsIterator(self._key_builder,
                                          self._column_builder, self._k_size,
                                          ik, self)
            if self._has_embedded_set:
                d = defaultdict(set)
                # iteritems yields each set value in a separate row; regroup
                # all the values of a key into a single set
                if len(self._get_set_types()) == 1:
                    for row in iterator:
                        d[row.key].add(row.value[0])
                else:
                    for row in iterator:
                        d[row.key].add(tuple(row.value))

                iterator = d.items()

            return iterator
        else:
            return dict.items(self)

    def values(self):
        """
        Obtains the iterator for the values of the StorageDict
        Returns:
            if persistent:
                NamedIterator(self): list of values
            if not persistent:
                dict.values(self)
        """
        if self.storage_id:
            self.sync()
            if self._has_embedded_set:
                items = self.items()
                return dict(items).values()
            else:
                ik = self._hcache.itervalues(config.prefetch_size)
                return NamedIterator(ik, self._column_builder, self)
        else:
            return dict.values(self)

    def get(self, key, default=None):
        try:
            value = self.__getitem__(key)
        except KeyError:
            value = default
        return value

    def _get_set_types(self):
        """
        Returns a list of tuples (name,type) for the types of the set
        """
        if self._has_embedded_set:
            set_types = [
                col.get("columns", []) for col in self._columns
                if isinstance(col, dict)
            ]
            return sum(set_types, [])
        else:
            return []
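
# A minimal usage sketch of the StorageDict API above (illustrative: it assumes
# a Hecuba-style subclass declared through the @TypeSpec comment with one int
# key and one text column, plus a reachable Cassandra cluster; names are made up).
class WordsDict(StorageDict):
    '''
    @TypeSpec <<partid:int>, data:str>
    '''


d = WordsDict('my_app.words')      # persistent from creation
d[0] = 'someRandomText'            # __setitem__ -> hcache.put_row
print(len(d))                      # __len__ -> token-aware COUNT(*) queries
for k, v in d.items():             # items() -> hcache.iteritems + named iterator
    pass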
Example #14
0
class StorageDict(dict, IStorage):
    # """
    # Object used to access data from workers.
    # """

    args_names = [
        "name", "primary_keys", "columns", "tokens", "storage_id",
        "indexed_on", "class_name"
    ]
    args = namedtuple('StorageDictArgs', args_names)
    _prepared_store_meta = config.session.prepare(
        'INSERT INTO hecuba.istorage'
        '(storage_id, class_name, name, tokens, '
        'primary_keys, columns, indexed_on)'
        'VALUES (?,?,?,?,?,?,?)')

    @staticmethod
    def build_remotely(result):
        """
        Launches the StorageDict.__init__ from the api.getByID
        Args:
            result: a namedtuple with all  the information needed to create again the StorageDict
        """
        log.debug("Building Storage dict with %s", result)

        return StorageDict(result.name, result.primary_keys, result.columns,
                           result.tokens, result.storage_id, result.indexed_on)

    @staticmethod
    def _store_meta(storage_args):
        """
        Method to update the info about the StorageDict in the DB metadata table
        Args:
            storage_args: structure with all data needed to update the metadata
        """
        log.debug("StorageDict: storing metas %s", storage_args)

        try:
            config.session.execute(StorageDict._prepared_store_meta, [
                storage_args.storage_id, storage_args.class_name,
                storage_args.name, storage_args.tokens,
                storage_args.primary_keys, storage_args.columns,
                storage_args.indexed_on
            ])
        except Exception as ex:
            log.error("Error creating the StorageDict metadata: %s %s",
                      storage_args, ex)
            raise ex

    def __init__(self,
                 name=None,
                 primary_keys=None,
                 columns=None,
                 tokens=None,
                 storage_id=None,
                 indexed_args=None,
                 **kwargs):
        """
        Creates a new StorageDict.

        Args:
            name (string): the name of the collection/table (keyspace is optional)
            primary_keys (list(tuple)): a list of (key,type) primary keys (primary + clustering).
            columns (list(tuple)): a list of (key,type) columns
            tokens (list): list of tokens
            storage_id (string): the storage id identifier
            indexed_args (list): values that will be used as index
            kwargs: other parameters
        """

        super(StorageDict, self).__init__(**kwargs)
        self._is_persistent = False
        log.debug("CREATED StorageDict(%s,%s,%s,%s,%s,%s)", primary_keys,
                  columns, name, tokens, storage_id, kwargs)

        if tokens is None:
            log.info('using all tokens')
            tokens = map(lambda a: a.value,
                         config.cluster.metadata.token_map.ring)
            self._tokens = IStorage._discrete_token_ranges(tokens)
        else:
            self._tokens = tokens

        self._storage_id = storage_id

        if self.__doc__ is not None:
            self._persistent_props = self._parse_comments(self.__doc__)
            self._primary_keys = self._persistent_props[
                self.__class__.__name__]['primary_keys']
            self._columns = self._persistent_props[
                self.__class__.__name__]['columns']
            try:
                self._indexed_args = self._persistent_props[
                    self.__class__.__name__]['indexed_values']
            except KeyError:
                self._indexed_args = indexed_args
        else:
            self._primary_keys = primary_keys
            self._columns = columns
            self._indexed_args = indexed_args

        key_names = [pkname for (pkname, dt) in self._primary_keys]
        column_names = [colname for (colname, dt) in self._columns]
        self._item_builder = namedtuple('row', key_names + column_names)

        if len(key_names) > 1:
            self._key_builder = namedtuple('row', key_names)
        else:
            self._key_builder = None
        if len(column_names) > 1:
            self._column_builder = namedtuple('row', column_names)
        else:
            self._column_builder = None

        self._k_size = len(key_names)

        class_name = '%s.%s' % (self.__class__.__module__,
                                self.__class__.__name__)
        self._build_args = self.args(name, self._primary_keys, self._columns,
                                     self._tokens, self._storage_id,
                                     self._indexed_args, class_name)

        if name is not None:
            self.make_persistent(name)
        else:
            self._is_persistent = False

    def __eq__(self, other):
        """
        Method to compare a StorageDict with another one.
        Args:
            other: StorageDict to be compared with.
        Returns:
            boolean (true - equals, false - not equals).
        """
        return self._storage_id == other._storage_id and self._tokens == other._tokens and \
               self._table == other._table and self._ksp == other._ksp

    _dict_case = re.compile(
        '.*@TypeSpec + *< *< *([\w:, ]+)+ *> *, *([\w+:., <>]+) *>')
    _tuple_case = re.compile('.*@TypeSpec +(\w+) +tuple+ *< *([\w, +]+) *>')
    _index_vars = re.compile('.*@Index_on *([A-z0-9, ]+)')
    _other_case = re.compile(' *(\w+) *< *([\w, +]+) *>')

    @classmethod
    def _parse_comments(self, comments):
        """
            Parses the comments in a class file to save them in the class information
            Args:
                comments: the comment in the class file
            Returns:
                this: a structure with all the information of the comment
        """
        this = {}
        for line in comments.split('\n'):
            m = StorageDict._dict_case.match(line)
            if m is not None:
                # Matching @TypeSpec of a dict
                dict_keys, dict_values = m.groups()
                primary_keys = []
                for ind, key in enumerate(dict_keys.split(",")):
                    key = key.replace(' ', '')
                    match = IStorage._data_type.match(key)
                    if match is not None:
                        # an IStorage with a name
                        name, value = match.groups()
                    elif ':' in key:
                        raise SyntaxError
                    else:
                        name = "key" + str(ind)
                        value = key

                    name = name.replace(' ', '')
                    value = value.replace(' ', '')
                    primary_keys.append(
                        (name, StorageDict._conversions[value]))
                dict_values = dict_values.replace(' ', '')
                if dict_values.startswith('dict'):
                    n = IStorage._sub_dict_case.match(dict_values[4:])
                    # Matching @TypeSpec of a sub dict
                    dict_keys2, dict_values2 = n.groups()
                    primary_keys2 = []
                    for ind, key in enumerate(dict_keys2.split(",")):
                        try:
                            name, value = IStorage._data_type.match(
                                key).groups()
                        except ValueError:
                            if ':' in key:
                                raise SyntaxError
                            else:
                                name = "key" + str(ind)
                                value = key
                        name = name.replace(' ', '')
                        primary_keys2.append(
                            (name, StorageDict._conversions[value]))
                    columns2 = []
                    dict_values2 = dict_values2.replace(' ', '')
                    if dict_values2.startswith('tuple'):
                        dict_values2 = dict_values2[6:]
                    for ind, val in enumerate(dict_values2.split(",")):
                        try:
                            name, value = IStorage._data_type.match(
                                val).groups()
                        except ValueError:
                            if ':' in key:
                                raise SyntaxError
                            else:
                                name = "val" + str(ind)
                                value = val
                        columns2.append(
                            (name, StorageDict._conversions[value]))
                    columns = {
                        'type': 'dict',
                        'primary_keys': primary_keys2,
                        'columns': columns2
                    }
                elif dict_values.startswith('tuple'):
                    n = IStorage._sub_tuple_case.match(dict_values[5:])
                    tuple_values = list(n.groups())[0]
                    columns = []
                    for ind, val in enumerate(tuple_values.split(",")):
                        try:
                            name, value = val.split(':')
                        except ValueError:
                            if ':' in key:
                                raise SyntaxError
                            else:
                                name = "val" + str(ind)
                                value = val
                        name = name.replace(' ', '')
                        columns.append((name, StorageDict._conversions[value]))
                else:
                    columns = []
                    for ind, val in enumerate(dict_values.split(",")):
                        match = IStorage._data_type.match(val)
                        if match is not None:
                            # an IStorage with a name
                            name, value = match.groups()
                        elif ':' in val:
                            name, value = IStorage._so_data_type.match(
                                val).groups()
                        else:
                            name = "val" + str(ind)
                            value = val
                        name = name.replace(' ', '')
                        try:
                            columns.append(
                                (name, StorageDict._conversions[value]))
                        except KeyError:
                            columns.append((name, value))
                name = str(self).replace('\'>', '').split('.')[-1]
                if self.__class__.__name__ in this:
                    this[name].update({
                        'type': 'dict',
                        'primary_keys': primary_keys,
                        'columns': columns
                    })
                else:
                    this[name] = {
                        'type': 'dict',
                        'primary_keys': primary_keys,
                        'columns': columns
                    }
            m = StorageDict._index_vars.match(line)
            if m is not None:
                name = str(self).replace('\'>', '').split('.')[-1]
                indexed_values = m.groups()[0]
                indexed_values = indexed_values.replace(' ', '').split(',')
                if name in this:
                    this[name].update({'indexed_values': indexed_values})
                else:
                    this[name] = {'indexed_values': indexed_values}
        return this
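    # Illustrative input for _parse_comments (assumed from the regexes above,
    # not taken verbatim from the source): a subclass docstring such as
    #
    #     class Particles(StorageDict):
    #         '''
    #         @TypeSpec <<partid:int, time:float>, x:float, y:float, z:float>
    #         @Index_on x, y, z
    #         '''
    #
    # would produce an entry with primary_keys [('partid', 'int'),
    # ('time', 'float')], columns [('x', 'float'), ('y', 'float'),
    # ('z', 'float')] and indexed_values ['x', 'y', 'z'].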

    def __contains__(self, key):
        """
        Method that checks if a given key exists in a StorageDict.
        Args:
            key: the position that we want to check if exists.
        Returns:
            boolean (true - exists, false - doesn't exist).
        """
        if not self._is_persistent:
            return dict.__contains__(self, key)
        else:
            try:
                # TODO we should save this value in a cache
                self._hcache.get_row(self._make_key(key))
                return True
            except Exception as ex:
                log.warn("persistentDict.__contains__ ex %s", ex)
                raise ex

    def _make_key(self, key):
        """
        Method used to pass the key data to the StorageDict cache in a proper way.
        Args:
            key: the data that needs to get the correct format
        """
        if isinstance(key, str) or isinstance(
                key, unicode) or not isinstance(key, Iterable):
            if len(self._primary_keys) == 1:
                if isinstance(key, unicode):
                    return [key.encode('ascii', 'ignore')]
                return [key]
            else:
                raise Exception('missing a primary key')

        if isinstance(key, Iterable) and len(key) == len(self._primary_keys):
            return list(key)
        else:
            raise Exception('wrong primary key')

    @staticmethod
    def _make_value(value):
        """
        Method used to pass the value data to the StorageDict cache in a proper way.
        Args:
            value: the data that needs to get the correct format
        """
        if issubclass(value.__class__, IStorage):
            return [uuid.UUID(value.getID())]
        elif isinstance(value,
                        str) or not isinstance(value, Iterable) or isinstance(
                            value, np.ndarray):
            return [value]
        elif isinstance(value, unicode):
            return [value.encode('ascii', 'ignore')]
        else:
            return list(value)
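    # Illustrative behaviour of the two helpers above (assumed, for a dict
    # declared with primary keys partid:int and time:float):
    #   _make_key((42, 0.5))  -> [42, 0.5]
    #   _make_key(42)         -> raises Exception('missing a primary key')
    #   _make_value(u'abc')   -> ['abc']  (encoded to ascii)
    #   _make_value(so)       -> [uuid.UUID(so.getID())]  for an IStorage value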

    def keys(self):
        """
        This method return a list of all the keys of the StorageDict.
        Returns:
          list: a list of keys
        """
        return [i for i in self.iterkeys()]

    def values(self):
        """
        This method return a list of all the values of the StorageDict.
        Returns:
          list: a list of values
        """
        return [i for i in self.itervalues()]

    def __iter__(self):
        """
        Method that overloads the python dict basic iteration, which returns
        an iterator over the dictionary keys.
        """
        return self.iterkeys()

    def make_persistent(self, name):
        """
        Method to transform a StorageDict into a persistent object.
        This will make it use a persistent DB as the main location
        of its data.
        Args:
            name:
        """
        if self._is_persistent:
            raise AlreadyPersistentError(
                "This StorageDict is already persistent [Before:{}.{}][After:{}]",
                self._ksp, self._table, name)
        self._is_persistent = True
        (self._ksp, self._table) = self._extract_ks_tab(name)

        if self._storage_id is None:
            self._storage_id = uuid.uuid3(uuid.NAMESPACE_DNS,
                                          self._ksp + '.' + self._table)
        self._build_args = self._build_args._replace(
            storage_id=self._storage_id, name=self._ksp + "." + self._table)
        self._store_meta(self._build_args)
        if config.id_create_schema == -1:
            query_keyspace = "CREATE KEYSPACE IF NOT EXISTS %s WITH replication = %s" % (
                self._ksp, config.replication)
            try:
                log.debug('MAKE PERSISTENCE: %s', query_keyspace)
                config.session.execute(query_keyspace)
            except Exception as ex:
                log.warn("Error creating the StorageDict keyspace %s, %s",
                         (query_keyspace), ex)
                raise ex

        for key, value in dict.iteritems(self):
            if issubclass(value.__class__, IStorage):
                # new name as ksp+table+obj_class_name
                val_name = self._ksp + '.' + self._table + type(
                    value).__name__.lower()
                value.make_persistent(val_name)

        columns = self._primary_keys + self._columns
        for ind, entry in enumerate(columns):
            n = StorageDict._other_case.match(entry[1])
            if n is not None:
                iter_type, intra_type = n.groups()
            else:
                iter_type = entry[1]
            if iter_type not in IStorage._basic_types:
                columns[ind] = entry[0], 'uuid'

        pks = map(lambda a: a[0], self._primary_keys)
        query_table = "CREATE TABLE IF NOT EXISTS %s.%s (%s, PRIMARY KEY (%s));" \
                      % (self._ksp,
                         self._table,
                         ",".join("%s %s" % tup for tup in columns),
                         str.join(',', pks))
        try:
            log.debug('MAKE PERSISTENCE: %s', query_table)
            config.session.execute(query_table)
        except Exception as ex:
            log.warn("Error creating the StorageDict table: %s %s",
                     query_table, ex)
            raise ex
        key_names = map(lambda a: a[0].encode('UTF8'), self._primary_keys)
        column_names = self._columns

        self._hcache_params = (self._ksp, self._table, self._storage_id,
                               self._tokens, key_names,
                               map(lambda x: {
                                   "name": x[0],
                                   "type": x[1]
                               }, column_names), {
                                   'cache_size': config.max_cache_size,
                                   'writer_par': config.write_callbacks_number,
                                   'write_buffer': config.write_buffer_size
                               })
        log.debug("HCACHE params %s", self._hcache_params)
        self._hcache = Hcache(*self._hcache_params)
        # Storing all in-memory values to cassandra
        for key, value in dict.iteritems(self):
            self._hcache.put_row(self._make_key(key), self._make_value(value))
        if hasattr(self, '_indexed_args') and self._indexed_args is not None:
            index_query = 'CREATE CUSTOM INDEX IF NOT EXISTS ' + self._table + '_idx ON '
            index_query += self._ksp + '.' + self._table + ' (' + str.join(
                ',', self._indexed_args) + ') '
            index_query += "using 'es.bsc.qbeast.index.QbeastIndex';"
            try:
                config.session.execute(index_query)
            except Exception as ex:
                log.error("Error creating the Qbeast custom index: %s %s",
                          index_query, ex)
                raise ex
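    # For a dict declared as <<partid:int>, x:float>, make_persistent('app.t')
    # issues CQL along these lines (illustrative):
    #   CREATE KEYSPACE IF NOT EXISTS app WITH replication = {...}
    #   CREATE TABLE IF NOT EXISTS app.t (partid int,x float, PRIMARY KEY (partid));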

    def stop_persistent(self):
        """
        Method to turn a StorageDict into non-persistent.
        """
        log.debug('STOP PERSISTENCE: %s', self._table)
        self._is_persistent = False
        self._hcache = None

    def delete_persistent(self):
        """
        Method to empty all data assigned to a StorageDict.
        """
        query = "TRUNCATE TABLE %s.%s;" % (self._ksp, self._table)
        log.debug('DELETE PERSISTENT: %s', query)
        config.session.execute(query)

    def _build_istorage_obj(self, obj_type, so_name, storage_id):
        cname, module = IStorage.process_path(obj_type)
        mod = __import__(module, globals(), locals(), [cname], 0)
        # new name as ksp+table+obj_class_name
        so = getattr(mod, cname)(name=so_name + cname.lower(),
                                 storage_id=storage_id)
        # sso._storage_id = storage_id
        return so

    def __delitem__(self, key):
        """
        Method to delete a specific entry in the dict in the key position.
        Args:
            key: position of the entry that we want to delete
        """
        if not self._is_persistent:
            dict.__delitem__(self, key)
        else:
            self._hcache.delete_row([key])

    def __getitem__(self, key):
        """
        If the object is persistent, each request goes to the hfetch.
        Args:
             key: the dictionary key
        Returns
             item: value found in position key
        """
        log.debug('GET ITEM %s', key)

        if not self._is_persistent:
            to_return = dict.__getitem__(self, key)
            return to_return
        else:
            cres = self._hcache.get_row(self._make_key(key))
            log.debug("GET ITEM %s[%s]", cres, cres.__class__)

            if cres is None:
                return None

            final_results = []
            for index, (name, col_type) in enumerate(self._columns):
                if col_type not in IStorage._basic_types:
                    table_name = self._ksp + '.' + self._table
                    element = (self._build_istorage_obj(
                        col_type, table_name, uuid.UUID(cres[index])))
                else:
                    element = cres[index]
                final_results.append(element)

            cres = final_results
            if self._column_builder is not None:
                if len(cres) > 0 and isinstance(cres[0], list):
                    return [self._column_builder(*row) for row in cres]
                else:
                    return self._column_builder(*cres)
            else:
                return cres[0]

    def __setitem__(self, key, val):
        """
           Method to insert values in the StorageDict
           Args:
               key: the position of the value that we want to save
               val: the value that we want to save in that position
        """
        if isinstance(val, np.ndarray):
            val = StorageNumpy(val)
        log.debug('SET ITEM %s->%s', key, val)
        if not config.hecuba_type_checking:
            if not self._is_persistent:
                dict.__setitem__(self, key, val)
            else:
                if isinstance(val, IStorage) and not val._is_persistent:
                    attribute = val.__class__.__name__.lower()
                    count = self._count_name_collision(attribute)
                    # new name as ksp+table+obj_class_name
                    val.make_persistent(self._ksp + '.' + self._table + "_" +
                                        attribute + "_" + str(count))
                self._hcache.put_row(self._make_key(key),
                                     self._make_value(val))
        else:
            if isinstance(val, Iterable) and not isinstance(val, str):
                col_types = map(
                    lambda x: IStorage._conversions[x.__class__.__name__], val)
                spec_col_types = map(lambda x: x[1], self._columns)
                for idx, value in enumerate(spec_col_types):
                    if value == 'double':
                        spec_col_types[idx] = 'float'
            else:
                col_types = IStorage._conversions[val.__class__.__name__]
                spec_col_types = map(lambda x: x[1], self._columns)[0]
                if spec_col_types == 'double':
                    spec_col_types = 'float'
            if isinstance(key, Iterable) and not isinstance(key, str):
                key_types = map(
                    lambda x: IStorage._conversions[x.__class__.__name__], key)
                spec_key_types = map(lambda x: x[1], self._primary_keys)
                for idx, value in enumerate(spec_key_types):
                    if value == 'double':
                        spec_key_types[idx] = 'float'
            else:
                key_types = IStorage._conversions[key.__class__.__name__]
                spec_key_types = map(lambda x: x[1], self._primary_keys)[0]
                if spec_key_types == 'double':
                    spec_key_types = 'float'
            if (col_types == spec_col_types):
                if (key_types == spec_key_types):
                    if not self._is_persistent:
                        dict.__setitem__(self, key, val)
                    else:
                        self._hcache.put_row(self._make_key(key),
                                             self._make_value(val))
                else:
                    raise KeyError
            else:
                raise ValueError
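    # Illustrative outcomes of the type checking above (assumed, for a dict
    # declared as <<partid:int>, x:double>):
    #   d[1] = 0.5    passes   ('double' is checked as 'float')
    #   d[1] = 'a'    raises ValueError  (column type mismatch)
    #   d['a'] = 0.5  raises KeyError    (key type mismatch)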

    def __repr__(self):
        """
        Overloads the method used by print to show a StorageDict
        Returns: The representation of the data stored in the StorageDict

        """
        to_return = {}
        for item in self.iteritems():
            to_return[item[0]] = item[1]
            if len(to_return) == config.hecuba_print_limit:
                return str(to_return)
        if len(to_return) > 0:
            return str(to_return)
        return ""

    def update(self, other=None, **kwargs):
        """
        Updates the current dict with a new dictionary or set of attr,value pairs
        (those must follow the current dict data model).
        Args:
            other: python dictionary or StorageDict. All key,val values in it will
            be inserted in the current dict.
            **kwargs: set of attr:val pairs, to be treated as key,val and inserted
            in the current dict.
        """
        if other is not None:
            if isinstance(other, StorageDict):
                for k, v in other.iteritems():
                    self[k] = v
            else:
                for k, v in other.items() if isinstance(other,
                                                        Mapping) else other:
                    self[k] = v
        for k, v in kwargs.items():
            self[k] = v

    def iterkeys(self):
        """
        Obtains the iterator for the keys of the StorageDict
        Returns:
            if persistent:
                iterkeys(self): list of keys
            if not persistent:
                dict.iterkeys(self)
        """
        if self._is_persistent:
            ik = self._hcache.iterkeys(config.prefetch_size)
            return NamedIterator(ik, self._key_builder, self)
        else:
            return dict.iterkeys(self)

    def iteritems(self):
        """
        Obtains the iterator for the key,val pairs of the StorageDict
        Returns:
            if persistent:
                NamedItemsIterator(self): list of key,val pairs
            if not persistent:
                dict.iteritems(self)
        """
        if self._is_persistent:
            ik = self._hcache.iteritems(config.prefetch_size)
            return NamedItemsIterator(self._key_builder, self._column_builder,
                                      self._k_size, ik, self)
        else:
            return dict.iteritems(self)

    def itervalues(self):
        """
        Obtains the iterator for the values of the StorageDict
        Returns:
            if persistent:
                NamedIterator(self): list of valuesStorageDict
            if not persistent:
                dict.itervalues(self)
        """
        if self._is_persistent:
            ik = self._hcache.itervalues(config.prefetch_size)
            return NamedIterator(ik, self._column_builder, self)
        else:
            return dict.itervalues(self)

    def keys(self):
        return [i for i in self.iterkeys()]

    def values(self):
        return [i for i in self.itervalues()]

    def items(self):
        return [i for i in self.iteritems()]

    def get(self, key, default):
        try:
            value = self.__getitem__(key)
        except KeyError:
            value = default
        return value
Example #16
0
    def test_get_row_key_error(self):
        from hfetch import connectCassandra
        from hfetch import Hcache
        '''''' '''
        This test checks that the hcache raises a KeyError when the requested key doesn't exist
        Analyzes:
        - Hcache
        - Get_row (returning KeyError)
        ''' ''''''

        table = 'particle'
        num_keys = 10001

        self.session.execute("DROP TABLE IF EXISTS %s.%s;" %
                             (self.keyspace, table))
        self.session.execute(
            "CREATE TABLE IF NOT EXISTS %s.%s(partid int, time float, ciao text,"
            "x float, y float, z float, PRIMARY KEY(partid,time));" %
            (self.keyspace, table))

        for i in xrange(0, num_keys):
            vals = ','.join(
                str(e) for e in
                [i, i / .1, i / .2, i / .3, i / .4, "'" + str(i * 60) + "'"])
            self.session.execute(
                "INSERT INTO %s.%s(partid , time , x, y , z,ciao ) VALUES (%s)"
                % (self.keyspace, table, vals))

        token_ranges = [(8070430489100699999, 8070450532247928832)]

        non_existent_keys = 10

        cache_size = num_keys + non_existent_keys

        try:
            connectCassandra(self.contact_names, self.nodePort)
        except Exception:
            print 'can\'t connect, verify the contact points and port', self.contact_names, self.nodePort
        keys = ["partid", "time"]
        values = ["ciao", "x", "y", "z"]
        cache = Hcache(self.keyspace, table, "", token_ranges, keys, values,
                       {'cache_size': cache_size})

        # Access the cache, which is empty and queries cassandra to retrieve the data
        t1 = time.time()
        error_counter = 0
        for pk in xrange(0, num_keys + non_existent_keys):
            ck = pk * 10
            try:
                result = cache.get_row([pk, ck])
                self.assertEqual(len(result), len(values))
            except KeyError as e:
                error_counter = error_counter + 1

        print 'Retrieved {0} keys in {1} seconds. {2} keys weren\'t found, {3} keys weren\'t supposed to be found'.format(
            unicode(str(num_keys), 'utf-8'),
            unicode(str(time.time() - t1), 'utf-8'),
            unicode(str(error_counter), 'utf-8'),
            unicode(str(non_existent_keys), 'utf-8'))

        self.assertEqual(error_counter, non_existent_keys)

        # Access the cache, which has already all the data and will ask cassandra only if
        # the keys asked are not present
        t1 = time.time()
        error_counter = 0
        for pk in xrange(0, num_keys + non_existent_keys):
            ck = pk * 10
            try:
                result = cache.get_row([pk, ck])
                self.assertEqual(len(result), len(values))
            except KeyError as e:
                error_counter = error_counter + 1

        print 'Retrieved {0} keys in {1} seconds. {2} keys weren\'t found, {3} keys weren\'t supposed to be found'.format(
            unicode(str(num_keys), 'utf-8'),
            unicode(str(time.time() - t1), 'utf-8'),
            unicode(str(error_counter), 'utf-8'),
            unicode(str(non_existent_keys), 'utf-8'))

        self.assertEqual(error_counter, non_existent_keys)
Example #17
0
    def uuid_test(self):
        from hfetch import connectCassandra
        from hfetch import Hcache
        import uuid
        '''''' '''
        This test checks the correct handling of UUIDs
        
        Analyzes:
        - Hcache
        - Put_row
        - Iteritems
        ''' ''''''

        table = "uuid"

        self.session.execute("DROP TABLE IF EXISTS %s.%s;" %
                             (self.keyspace, table))
        self.session.execute(
            "CREATE TABLE IF NOT EXISTS %s.%s(partid uuid, data int, PRIMARY KEY(partid));"
            % (self.keyspace, table))

        nelem = 1000
        nblocks = 10

        t_f = pow(-2, 63)  # Token begin range
        t_t = pow(2, 63) - 1
        # Token blocks
        tkn_size = (t_t - t_f) / (nelem / nblocks)
        tokens = [(a, a + tkn_size)
                  for a in xrange(t_f, t_t - tkn_size, tkn_size)]

        try:
            connectCassandra(self.contact_names, self.nodePort)
        except Exception:
            print 'can\'t connect, verify the contact points and port', self.contact_names, self.nodePort

        keys = ["partid"]
        values = ["data"]

        # Hcache over the uuid table created above, with a token-range query
        cache = Hcache(self.keyspace, table,
                       "WHERE token(partid)>=? AND token(partid)<?;", tokens,
                       keys, values, {
                           'cache_size': '10',
                           'writer_buffer': 20
                       })

        # Write data
        someid = None
        i = 0
        while i < nelem:
            u = uuid.uuid4()  # ('81da81e8-1914-11e7-908d-ecf4bb4c66c4')
            cache.put_row([u], [i])
            if i == nelem / 2:
                someid = u
            i += 1

        # by recreating the cache we wait until all the data is written

        cache = Hcache(self.keyspace, table,
                       "WHERE token(partid)>=? AND token(partid)<?;", tokens,
                       keys, values, {
                           'cache_size': '10',
                           'writer_buffer': 20
                       })
        # Read data
        itera = cache.iteritems(10)
        found = False
        counter = 0
        while True:
            try:
                L = uuid.UUID(itera.get_next()[0])
                if L == someid:
                    found = True
            except StopIteration:
                break
            counter = counter + 1

        self.assertEqual(counter, nelem)
        self.assertTrue(found)
Example #18
0
class Hfetch_Tests(unittest.TestCase):
    keyspace = "hnumpy_test"
    contact_names = ['127.0.0.1']
    nodePort = 9042
    cluster = Cluster(contact_names, port=nodePort)
    session = cluster.connect()

    @classmethod
    def setUpClass(cls):
        cls.session.execute(
            "CREATE KEYSPACE IF NOT EXISTS %s WITH replication "
            "= {'class': 'SimpleStrategy', 'replication_factor': 1};" %
            cls.keyspace)
        cls.session.execute(
            "CREATE TYPE IF NOT EXISTS %s.numpy_meta(dims frozen<list<int>>,type int,type_size int);"
            % cls.keyspace)

    @classmethod
    def tearDownClass(cls):
        #self.session.execute("DROP KEYSPACE IF EXISTS %s;" % cls.keyspace)
        pass

    def test_simple_memory(self):
        from hfetch import connectCassandra
        from hfetch import Hcache
        import numpy as np
        '''''' '''
        Simple test to store a numpy ndarray and retrieve it

        Analyzes:
        - HCache
        - Put_row (write a numpy ndarray)
        - Get_row (read a numpy ndarray)
        ''' ''''''
        dims = 2
        elem_dim = 4096

        try:
            connectCassandra(self.contact_names, self.nodePort)
        except RuntimeError as e:
            print e
            print 'can\'t connect, verify the contact points and port', self.contact_names, self.nodePort

        table = "arrays_numpies"

        self.session.execute("DROP TABLE if exists %s.%s;" %
                             (self.keyspace, table))
        self.session.execute(
            "CREATE TABLE %s.%s(storage_id uuid, cluster_id int, block_id int, payload blob,PRIMARY KEY((storage_id,cluster_id),block_id));"
            % (self.keyspace, table))

        storage_id = uuid.uuid3(uuid.NAMESPACE_DNS,
                                self.keyspace + '.' + table)
        time.sleep(5)
        a = Hcache(self.keyspace, table, storage_id, [],
                   ['storage_id', 'cluster_id', 'block_id'], [{
                       'name': "payload",
                       'type': 'numpy'
                   }], {})

        #prepare data

        bigarr = np.arange(pow(elem_dim, dims)).reshape(elem_dim, elem_dim)

        print 'To be written '
        keys = [storage_id, -1, -1]
        values = [bigarr.astype('i')]
        print values
        #insert
        a.put_row(keys, values)

        #delete is a blocking op which waits the data to be flushed
        del a

        a = Hcache(self.keyspace, table, storage_id, [],
                   ["storage_id", 'cluster_id', 'block_id'], [{
                       "name": "payload",
                       "type": "numpy"
                   }], {})
        #retrieve
        result = a.get_row(keys)
        print 'Retrieved from cassandra'
        print result
        if np.array_equal(bigarr, result[0]):
            print 'Created and retrieved are equal'
        else:
            self.fail('Created and retrieved ndarrays differ')
        self.session.execute("DROP TABLE %s.%s;" % (self.keyspace, table))
Example #19
0
        t_f = pow(-2, 63)  # Token begin range
        t_t = pow(2, 63) - 1
        # Token blocks
        tkn_size = (t_t - t_f) / (nparts / p)
        tkns = [(a, a + tkn_size)
                for a in xrange(t_f, t_t - tkn_size, tkn_size)]

        keys = ["partid", "time"]
        values = ["x"]

        hcache_config = {'cache_size': '100', 'writer_buffer': 20}

        token_query = "WHERE token(partid)>=? AND token(partid)<?;"

        cache = Hcache(self.keyspace, table, token_query, tkns, keys, values,
                       hcache_config)

        hiter_config = {"prefetch_size": 100, "update_cache": "yes"}

        hiter = cache.iteritems(hiter_config)

        count = 0
        start = time.time()
        while True:
            try:
                i = hiter.get_next()
                self.assertEqual(len(i), len(keys) + len(values))
            except StopIteration:
                break
            count += 1
Example #20
0
    def test_get_row(self):
        from hfetch import connectCassandra
        from hfetch import Hcache
        '''''' '''
        This test iterates over a set of particles, performing get_row operations
        
        Analyzes:
        - HCache (multiple reads of the same key)
        - Get_row
        ''' ''''''

        table = 'particle'
        num_keys = 10001

        self.session.execute("DROP TABLE IF EXISTS %s.%s;" %
                             (self.keyspace, table))
        self.session.execute(
            "CREATE TABLE IF NOT EXISTS %s.%s(partid int, time float, ciao text,"
            "x float, y float, z float, PRIMARY KEY(partid,time));" %
            (self.keyspace, table))

        for i in xrange(0, num_keys):
            vals = ','.join(
                str(e) for e in
                [i, i / .1, i / .2, i / .3, i / .4, "'" + str(i * 60) + "'"])
            self.session.execute(
                "INSERT INTO %s.%s(partid , time , x, y , z,ciao ) VALUES (%s)"
                % (self.keyspace, table, vals))

        try:
            connectCassandra(self.contact_names, self.nodePort)
        except Exception:
            print 'can\'t connect, verify the contact points and port', self.contact_names, self.nodePort

        token_ranges = []

        cache_size = 10001

        keys = ["partid", "time"]
        values = ["ciao", "x", "y", "z"]

        cache_config = {'cache_size': cache_size}

        cache = Hcache(self.keyspace, table, "", token_ranges, keys, values,
                       cache_config)

        # clustering key
        t1 = time.time()
        for pk in xrange(0, num_keys):
            ck = pk * 10
            try:
                result = cache.get_row([pk, ck])
                self.assertEqual(len(result), len(values))
            except KeyError as e:
                print "Error when retrieving value from cache:", e, [pk, ck]

        print 'time - load C++ cache with cassandra data: ', time.time() - t1

        t1 = time.time()
        for pk in xrange(0, num_keys):
            ck = pk * 10
            try:
                result = cache.get_row([pk, ck])
                self.assertEqual(len(result), len(values))
            except KeyError as e:
                print "Error when retrieving value from cache:", e, [pk, ck]
        # print 'items in res: ',len(result)
        print 'time - read data from C++ cache: ', time.time() - t1

        py_dict = {}
        cache = Hcache(self.keyspace, table, "",
                       [(8070430489100699999, 8070450532247928832)],
                       ["partid", "time"], ["ciao", "x", "y", "z"],
                       {'cache_size': num_keys})

        t1 = time.time()
        for pk in xrange(0, num_keys):
            ck = pk * 10
            try:
                result = cache.get_row([pk, ck])
                py_dict[(pk, ck)] = result
                self.assertEqual(len(result), len(values))
            except KeyError as e:
                print "Error when retrieving value from cache:", e, [pk, ck]
        print 'time - load data into python dict: ', time.time() - t1
        # print 'size ', len(py_dict)
        # print 'items in res: ',len(py_dict[1])

        t1 = time.time()
        for pk in xrange(0, num_keys):
            ck = pk * 10
            try:
                result = py_dict[(pk, ck)]
                self.assertEqual(len(result), len(values))
            except KeyError as e:
                print "Error when retrieving value from cache:", e, [pk, ck]
        print 'time - read data from the python dict: ', time.time() - t1
Example #21
0
    def test_delete_row(self):
        from hfetch import connectCassandra
        from hfetch import Hcache
        '''''' '''
        This test deletes a row and checks that getting it afterwards raises KeyError

        Analyzes:
        - HCache
        - Delete_row
        - Get_row (raising KeyError after the delete)
        ''' ''''''

        table = 'particle'
        num_keys = 100  # num keys must be multiple of expected_errors
        expected_errors = 10

        self.session.execute("DROP TABLE IF EXISTS %s.%s;" %
                             (self.keyspace, table))
        self.session.execute(
            "CREATE TABLE IF NOT EXISTS %s.%s(partid int, time float, ciao text,"
            "x float, y float, z float, PRIMARY KEY(partid,time));" %
            (self.keyspace, table))

        for i in xrange(0, num_keys):
            vals = ','.join(
                str(e) for e in
                [i, i / .1, i / .2, i / .3, i / .4, "'" + str(i * 60) + "'"])
            self.session.execute(
                "INSERT INTO %s.%s(partid , time , x, y , z,ciao ) VALUES (%s)"
                % (self.keyspace, table, vals))

        try:
            connectCassandra(self.contact_names, self.nodePort)
        except Exception:
            print 'can\'t connect, verify the contact points and port', self.contact_names, self.nodePort

        token_ranges = []

        cache_size = 1

        keys = ["partid", "time"]
        values = ["ciao", "x", "y", "z"]

        cache_config = {'cache_size': cache_size}

        cache = Hcache(self.keyspace, table, "", token_ranges, keys, values,
                       cache_config)
        pk = 0
        ck = pk * 10

        try:
            result = cache.get_row([pk, ck])
            self.assertEqual(len(result), len(values))
        except KeyError as e:
            self.fail("Error when retrieving value from cache: " + str(e) +
                      " -- " + str([pk, ck]))

        try:
            result = cache.delete_row([pk, ck])
        except KeyError as e:
            self.fail("Error when deleteing entry from cache: " + str(e) +
                      " -- " + str([pk, ck]))

        try:
            result = cache.get_row([pk, ck])
            self.fail(
                "Error when retrieving value from cache, the entry shouldnt exist"
            )
        except KeyError as e:
            pass
Example #22
0
class StorageNumpy(np.ndarray, IStorage):
    _storage_id = None
    _build_args = None
    _class_name = None
    _hcache_params = None
    _hcache = None
    _prepared_store_meta = config.session.prepare(
        'INSERT INTO hecuba.istorage'
        '(storage_id, class_name, name)'
        'VALUES (?,?,?)')

    args_names = ["storage_id", "class_name", "name"]
    args = namedtuple('StorageNumpyArgs', args_names)

    def __new__(cls, input_array=None, storage_id=None, name=None, **kwargs):

        if input_array is None and name is not None and storage_id is not None:
            input_array = cls.load_array(storage_id, name)
            obj = np.asarray(input_array).view(cls)
            obj._is_persistent = True
        elif name is None and storage_id is not None:
            raise RuntimeError("hnumpy received storage id but not a name")
        elif (input_array is not None and name is not None and storage_id is not None) \
                or (storage_id is None and name is not None):
            obj = np.asarray(input_array).view(cls)
            obj.make_persistent(name)
        else:
            obj = np.asarray(input_array).view(cls)
            obj._is_persistent = False
        # Input array is an already formed ndarray instance
        # We first cast to be our class type
        # add the new attribute to the created instance
        obj._storage_id = storage_id
        # Finally, we must return the newly created object:
        obj._class_name = '%s.%s' % (cls.__module__, cls.__name__)
        return obj

    # used as copy constructor
    def __array_finalize__(self, obj):
        if obj is None:
            return
        self._storage_id = getattr(obj, '_storage_id', None)

    @staticmethod
    def build_remotely(new_args):
        """
            Launches the StorageNumpy.__init__ from the uuid api.getByID
            Args:
                new_args: a list of all information needed to create again the StorageNumpy
            Returns:
                so: the created StorageNumpy
        """
        log.debug("Building StorageNumpy object with %s", new_args)
        return StorageNumpy(new_args.storage_id)

    @staticmethod
    def _store_meta(storage_args):
        """
            Saves the information of the object in the istorage table.
            Args:.
                storage_args (object): contains all data needed to restore the object from the workers
        """
        log.debug("StorageObj: storing media %s", storage_args)
        try:
            config.session.execute(StorageNumpy._prepared_store_meta, [
                storage_args.storage_id, storage_args.class_name,
                storage_args.name
            ])
        except Exception as ex:
            log.warn("Error creating the StorageNumpy metadata with args: %s" %
                     str(storage_args))
            raise ex

    @staticmethod
    def load_array(storage_id, name):
        (ksp, table) = IStorage._extract_ks_tab(name)
        _hcache_params = (ksp, table + '_numpies', storage_id, [],
                          ['storage_id', 'cluster_id', 'block_id'], [{
                              'name':
                              "payload",
                              'type':
                              'numpy'
                          }], {
                              'cache_size': config.max_cache_size,
                              'writer_par': config.write_callbacks_number,
                              'write_buffer': config.write_buffer_size
                          })
        _hcache = Hcache(*_hcache_params)
        result = _hcache.get_row([storage_id, -1, -1])
        if len(result) == 1:
            return result[0]
        else:
            raise KeyError

    def make_persistent(self, name):
        if self._is_persistent:
            raise AlreadyPersistentError(
                "This StorageNumpy is already persistent [Before:{}.{}][After:{}]",
                self._ksp, self._table, name)
        self._is_persistent = True

        (self._ksp, self._table) = self._extract_ks_tab(name)
        if self._storage_id is None:
            self._storage_id = uuid.uuid3(
                uuid.NAMESPACE_DNS, self._ksp + '.' + self._table + '_numpies')
        self._build_args = self.args(self._storage_id, self._class_name, name)
        log.info("PERSISTING DATA INTO %s %s", self._ksp, self._table)

        query_keyspace = "CREATE KEYSPACE IF NOT EXISTS %s WITH replication = %s" % (
            self._ksp, config.replication)
        config.session.execute(query_keyspace)

        config.session.execute(
            'CREATE TABLE IF NOT EXISTS ' + self._ksp + '.' + self._table +
            '_numpies'
            '(storage_id uuid , '
            'cluster_id int, '
            'block_id int, '
            'payload blob, '
            'PRIMARY KEY((storage_id,cluster_id),block_id))')

        self._hcache_params = (self._ksp, self._table + '_numpies',
                               self._storage_id, [],
                               ['storage_id', 'cluster_id', 'block_id'], [{
                                   'name':
                                   "payload",
                                   'type':
                                   'numpy'
                               }], {
                                   'cache_size': config.max_cache_size,
                                   'writer_par': config.write_callbacks_number,
                                   'write_buffer': config.write_buffer_size
                               })

        self._hcache = Hcache(*self._hcache_params)
        if len(self.shape) != 0:
            self._hcache.put_row([self._storage_id, -1, -1], [self])
        self._store_meta(self._build_args)

    def delete_persistent(self):
        """
            Deletes the Cassandra table where the persistent StorageObj stores data
        """
        self._is_persistent = False

        query = "DELETE FROM %s.%s WHERE storage_id = %s;" % (
            self._ksp, self._table + '_numpies', self._storage_id)
        log.debug("DELETE PERSISTENT: %s", query)
        config.session.execute(query)
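
# A minimal usage sketch of the StorageNumpy lifecycle above (illustrative: it
# assumes a configured Hecuba session and a reachable Cassandra cluster).
import numpy as np
import uuid

arr = StorageNumpy(np.arange(16).reshape(4, 4), name='app.matrix')  # persisted on creation

# make_persistent derives a deterministic id from '<keyspace>.<table>_numpies',
# so the same array can be reloaded later through the load_array path:
sid = uuid.uuid3(uuid.NAMESPACE_DNS, 'app.matrix_numpies')
same = StorageNumpy(name='app.matrix', storage_id=sid)
assert np.array_equal(arr, same)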