Beispiel #1
0
    def test_rtrie_in_mdb(self):
        t = Trie()
        self.assertEqual(t.add('hello'), 1)
        self.assertEqual(t.add('hell'), 2)
        self.assertEqual(t.add('hello'), 1)
        self.assertEqual(t.add('hellothere'), 3)
        self.assertEqual(t.add('good'), 4)
        self.assertEqual(t.add('goodbye'), 5)
        self.assertEqual(t.add('hello'), 1)
        self.assertEqual(t.add('hellsink'), 6)

        nodes, kids, _ = t.serialize()
        nodeaddr, nodelen = nodes.buffer_info()
        kidaddr, kidlen = kids.buffer_info()
        try:
            env = mdb.Env('/tmp/test_rtrie',
                          flags=mdb.MDB_WRITEMAP | mdb.MDB_NOSYNC
                          | mdb.MDB_NOSUBDIR)
            txn = env.begin_txn()
            db = env.open_db(txn, name='_meta_', flags=mdb.MDB_CREATE)
            db.put_raw(txn, 'nodes', nodeaddr, nodelen)
            db.put_raw(txn, 'kids', kidaddr, kidlen)

            n, ns = db.get_raw(txn, 'nodes')
            k, ks = db.get_raw(txn, 'kids')
            txn.commit()
            env.close()

            env = mdb.Env('/tmp/test_rtrie',
                          flags=mdb.MDB_NOSYNC | mdb.MDB_NOSUBDIR)
            txn = env.begin_txn()
            db = env.open_db(txn, name='_meta_')

            n, ns = db.get_raw(txn, 'nodes')
            k, ks = db.get_raw(txn, 'kids')
            self.assertEqual(rtrie.vid_for_value(n, k, 'hello'), 1)
            self.assertEqual(rtrie.vid_for_value(n, k, 'hell'), 2)
            self.assertEqual(rtrie.vid_for_value(n, k, 'goodbye'), 5)
            self.assertEqual(rtrie.vid_for_value(n, k, 'hellsink'), 6)
            self.assertEqual(rtrie.vid_for_value(n, k, 'hellothere'), 3)
            self.assertEqual(rtrie.vid_for_value(n, k, 'good'), 4)
            self.assertIsNone(rtrie.vid_for_value(n, k, 'notthere'))

            txn.commit()
            env.close()
        finally:
            import os
            os.unlink('/tmp/test_rtrie')
            os.unlink('/tmp/test_rtrie-lock')
Beispiel #2
0
    def test_rtrie_in_mdb(self):
        t = Trie()
        self.assertEqual(t.add("hello"), 1)
        self.assertEqual(t.add("hell"), 2)
        self.assertEqual(t.add("hello"), 1)
        self.assertEqual(t.add("hellothere"), 3)
        self.assertEqual(t.add("good"), 4)
        self.assertEqual(t.add("goodbye"), 5)
        self.assertEqual(t.add("hello"), 1)
        self.assertEqual(t.add("hellsink"), 6)

        nodes, kids, _ = t.serialize()
        nodeaddr, nodelen = nodes.buffer_info()
        kidaddr, kidlen = kids.buffer_info()
        try:
            env = mdb.Env("/tmp/test_rtrie", flags=mdb.MDB_WRITEMAP | mdb.MDB_NOSYNC | mdb.MDB_NOSUBDIR)
            txn = env.begin_txn()
            db = env.open_db(txn, name="_meta_", flags=mdb.MDB_CREATE)
            db.put_raw(txn, "nodes", nodeaddr, nodelen)
            db.put_raw(txn, "kids", kidaddr, kidlen)

            n, ns = db.get_raw(txn, "nodes")
            k, ks = db.get_raw(txn, "kids")
            txn.commit()
            env.close()

            env = mdb.Env("/tmp/test_rtrie", flags=mdb.MDB_NOSYNC | mdb.MDB_NOSUBDIR)
            txn = env.begin_txn()
            db = env.open_db(txn, name="_meta_")

            n, ns = db.get_raw(txn, "nodes")
            k, ks = db.get_raw(txn, "kids")
            self.assertEqual(rtrie.vid_for_value(n, k, "hello"), 1)
            self.assertEqual(rtrie.vid_for_value(n, k, "hell"), 2)
            self.assertEqual(rtrie.vid_for_value(n, k, "goodbye"), 5)
            self.assertEqual(rtrie.vid_for_value(n, k, "hellsink"), 6)
            self.assertEqual(rtrie.vid_for_value(n, k, "hellothere"), 3)
            self.assertEqual(rtrie.vid_for_value(n, k, "good"), 4)
            self.assertIsNone(rtrie.vid_for_value(n, k, "notthere"))

            txn.commit()
            env.close()
        finally:
            import os

            os.unlink("/tmp/test_rtrie")
            os.unlink("/tmp/test_rtrie-lock")
Beispiel #3
0
    def test_stress_wtrie(self):
        ktrie = Trie()
        strie = Trie()
        etrie = Trie()

        keywords = {}
        search_terms = {}
        exchange_ids = {}

        with open(fixture) as f:
            for data in f:
                for word in data.split(' '):
                    vid = ktrie.add(word)
                    actual_vid = keywords.get(word)
                    if actual_vid is not None:
                        self.assertEqual(vid, actual_vid)
                    else:
                        keywords[word] = vid

                vid = strie.add(data)
                actual_vid = search_terms.get(data)
                if actual_vid is not None:
                    self.assertEqual(vid, actual_vid)
                else:
                    search_terms[data] = vid

        nodes, kids, nodelen = etrie.serialize()
        naddr, nlen = nodes.buffer_info()
        kaddr, klen = kids.buffer_info()
        #summarize(naddr, kaddr, nodelen)
        #print_it(naddr, kaddr)

        for dc, vid in exchange_ids.iteritems():
            rvid = etrie.add(dc)
            self.assertEqual(vid, rvid)

            print dc, vid
            value = value_for_vid(naddr, kaddr, vid)
            self.assertEqual(dc, value)
            if dc != value:
                print "      dc=%s adc=%s" % (dc, value)

            avid = vid_for_value(naddr, kaddr, dc)
            #print "vid=%s avid=%s" % (vid, avid)
            self.assertEqual(vid, avid)
Beispiel #4
0
    def test_stress_wtrie(self):
        ktrie = Trie()
        strie = Trie()
        etrie = Trie()

        keywords = {}
        search_terms = {}
        exchange_ids = {}

        with open(fixture) as f:
            for data in f:
                for word in data.split(" "):
                    vid = ktrie.add(word)
                    actual_vid = keywords.get(word)
                    if actual_vid is not None:
                        self.assertEqual(vid, actual_vid)
                    else:
                        keywords[word] = vid

                vid = strie.add(data)
                actual_vid = search_terms.get(data)
                if actual_vid is not None:
                    self.assertEqual(vid, actual_vid)
                else:
                    search_terms[data] = vid

        nodes, kids, nodelen = etrie.serialize()
        naddr, nlen = nodes.buffer_info()
        kaddr, klen = kids.buffer_info()
        # summarize(naddr, kaddr, nodelen)
        # print_it(naddr, kaddr)

        for dc, vid in exchange_ids.iteritems():
            rvid = etrie.add(dc)
            self.assertEqual(vid, rvid)

            print dc, vid
            value = value_for_vid(naddr, kaddr, vid)
            self.assertEqual(dc, value)
            if dc != value:
                print "      dc=%s adc=%s" % (dc, value)

            avid = vid_for_value(naddr, kaddr, dc)
            # print "vid=%s avid=%s" % (vid, avid)
            self.assertEqual(vid, avid)
Beispiel #5
0
    def test_rtrie_in_memory(self):

        s = unicode(u'séllsink').encode('utf-8')
        #print "HELLSINK: %s" % s

        t = Trie()
        self.assertEqual(t.add('hello'), 1)
        self.assertEqual(t.add('hell'), 2)
        self.assertEqual(t.add('hello'), 1)
        self.assertEqual(t.add('hellothere'), 3)
        self.assertEqual(t.add('good'), 4)
        self.assertEqual(t.add('goodbye'), 5)
        self.assertEqual(t.add('hello'), 1)
        self.assertEqual(t.add('hellsink'), 6)
        self.assertEqual(t.add(s), 7)
        t.print_it()

        nodes, kids, _ = t.serialize()
        nodeaddr, nodelen = nodes.buffer_info()
        kidaddr, kidlen = kids.buffer_info()
        print "LENS %s %s" % (nodelen, kidlen)

        for i in range(8):
            val = rtrie.value_for_vid(nodeaddr, kidaddr, i)
            print "Value", i, val

        self.assertEqual(rtrie.vid_for_value(nodeaddr, kidaddr, 'hello'), 1)
        self.assertEqual(rtrie.vid_for_value(nodeaddr, kidaddr, 'hell'), 2)
        self.assertEqual(rtrie.vid_for_value(nodeaddr, kidaddr, 'goodbye'), 5)
        self.assertEqual(rtrie.vid_for_value(nodeaddr, kidaddr, 'hellsink'), 6)
        self.assertEqual(rtrie.vid_for_value(nodeaddr, kidaddr, 'hellothere'),
                         3)
        self.assertEqual(rtrie.vid_for_value(nodeaddr, kidaddr, 'good'), 4)
        self.assertEqual(rtrie.vid_for_value(nodeaddr, kidaddr, s), 7)
        self.assertIsNone(rtrie.vid_for_value(nodeaddr, kidaddr, 'notthere'))
        self.assertIsNone(rtrie.vid_for_value(nodeaddr, kidaddr, 'h'))
        self.assertIsNone(rtrie.vid_for_value(nodeaddr, kidaddr, 'he'))
        self.assertIsNone(rtrie.vid_for_value(nodeaddr, kidaddr, 'hel'))
        self.assertIsNone(rtrie.vid_for_value(nodeaddr, kidaddr, 'hells'))
Beispiel #6
0
    def test_rtrie_in_memory(self):

        s = unicode(u"séllsink").encode("utf-8")
        # print "HELLSINK: %s" % s

        t = Trie()
        self.assertEqual(t.add("hello"), 1)
        self.assertEqual(t.add("hell"), 2)
        self.assertEqual(t.add("hello"), 1)
        self.assertEqual(t.add("hellothere"), 3)
        self.assertEqual(t.add("good"), 4)
        self.assertEqual(t.add("goodbye"), 5)
        self.assertEqual(t.add("hello"), 1)
        self.assertEqual(t.add("hellsink"), 6)
        self.assertEqual(t.add(s), 7)
        t.print_it()

        nodes, kids, _ = t.serialize()
        nodeaddr, nodelen = nodes.buffer_info()
        kidaddr, kidlen = kids.buffer_info()
        print "LENS %s %s" % (nodelen, kidlen)

        for i in range(8):
            val = rtrie.value_for_vid(nodeaddr, kidaddr, i)
            print "Value", i, val

        self.assertEqual(rtrie.vid_for_value(nodeaddr, kidaddr, "hello"), 1)
        self.assertEqual(rtrie.vid_for_value(nodeaddr, kidaddr, "hell"), 2)
        self.assertEqual(rtrie.vid_for_value(nodeaddr, kidaddr, "goodbye"), 5)
        self.assertEqual(rtrie.vid_for_value(nodeaddr, kidaddr, "hellsink"), 6)
        self.assertEqual(rtrie.vid_for_value(nodeaddr, kidaddr, "hellothere"), 3)
        self.assertEqual(rtrie.vid_for_value(nodeaddr, kidaddr, "good"), 4)
        self.assertEqual(rtrie.vid_for_value(nodeaddr, kidaddr, s), 7)
        self.assertIsNone(rtrie.vid_for_value(nodeaddr, kidaddr, "notthere"))
        self.assertIsNone(rtrie.vid_for_value(nodeaddr, kidaddr, "h"))
        self.assertIsNone(rtrie.vid_for_value(nodeaddr, kidaddr, "he"))
        self.assertIsNone(rtrie.vid_for_value(nodeaddr, kidaddr, "hel"))
        self.assertIsNone(rtrie.vid_for_value(nodeaddr, kidaddr, "hells"))
Beispiel #7
0
    class HustleOutputStream(object):
        def __init__(self, stream, url, params, **kwargs):
            import tempfile
            from wtrie import Trie

            self.result_table = result_table
            self.result_columns = result_table._field_names
            tmpdir = getattr(params, 'tmpdir', '/tmp')
            self.filename = tempfile.mktemp(prefix="hustle", dir=tmpdir)
            maxsize = getattr(params, 'maxsize', 100 * 1024 * 1024)
            self.env, self.txn, self.dbs, self.meta = \
                self.result_table._open(self.filename, maxsize, write=True,
                                        lru_size=10000)
            self.autoinc = 1
            self.url = url
            self.vid_trie = Trie()
            self.vid16_trie = Trie()

        def add(self, k, v):
            from hustle.core.marble import _insert_row
            data = dict(zip(self.result_columns, list(k) + list(v)))
            #print "BOZAK! adding %s %s %s" % (self.result_columns, k, v)
            updated_dbs = _insert_row(data, self.txn, self.dbs, self.autoinc,
                                      self.vid_trie, self.vid16_trie)
            if updated_dbs:
                self.dbs = updated_dbs
            self.autoinc += 1

        def close(self):
            import os
            import ujson
            from disco import util

            self.meta.put(self.txn, '_total_rows', str(self.autoinc))
            vid_nodes, vid_kids, _ = self.vid_trie.serialize()
            vid16_nodes, vid16_kids, _ = self.vid16_trie.serialize()
            vn_ptr, vn_len = vid_nodes.buffer_info()
            vk_ptr, vk_len = vid_kids.buffer_info()
            vn16_ptr, vn16_len = vid16_nodes.buffer_info()
            vk16_ptr, vk16_len = vid16_kids.buffer_info()
            self.meta.put_raw(self.txn, '_vid_nodes', vn_ptr, vn_len)
            self.meta.put_raw(self.txn, '_vid_kids', vk_ptr, vk_len)
            self.meta.put_raw(self.txn, '_vid16_nodes', vn16_ptr, vn16_len)
            self.meta.put_raw(self.txn, '_vid16_kids', vk16_ptr, vk16_len)
            self.meta.put(self.txn, 'name',
                          ujson.dumps(self.result_table._name))
            self.meta.put(self.txn, 'fields',
                          ujson.dumps(self.result_table._fields))
            for index, (subdb, subindexdb, bitmap_dict, column,
                        last) in self.dbs.iteritems():
                if subindexdb:
                    # process all values for this bitmap index
                    if column.index_indicator == 2:
                        bitmap_dict.evictAll()
                    else:
                        for val, bitmap in bitmap_dict.iteritems():
                            subindexdb.put(self.txn, val, bitmap.dumps())
                # insert a sentinel row to value table
                subdb.put(self.txn, self.autoinc + 1, last)
            self.txn.commit()

            try:
                self.env.copy(self.url)
                # print "Dumped result to %s" % self.url
            except Exception as e:
                msg = "Copy error: %s" % e
                print msg
                self.txn.abort()
                raise util.DataError(msg, "")
            finally:
                self.env.close()
                os.unlink(self.filename)
Beispiel #8
0
    class HustleOutputStream(object):
        def __init__(self, stream, url, params, **kwargs):
            import tempfile
            from wtrie import Trie

            self.result_table = result_table
            self.result_columns = result_table._field_names
            tmpdir = getattr(params, 'tmpdir', '/tmp')
            self.filename = tempfile.mktemp(prefix="hustle", dir=tmpdir)
            maxsize = getattr(params, 'maxsize', 100 * 1024 * 1024)
            self.env, self.txn, self.dbs, self.meta = \
                self.result_table._open(self.filename, maxsize, write=True,
                                        lru_size=10000)
            self.autoinc = 1
            self.url = url
            self.vid_trie = Trie()
            self.vid16_trie = Trie()

        def add(self, k, v):
            from hustle.core.marble import _insert_row
            data = dict(zip(self.result_columns, list(k) + list(v)))
            #print "BOZAK! adding %s %s %s" % (self.result_columns, k, v)
            updated_dbs = _insert_row(data, self.txn, self.dbs, self.autoinc,
                                      self.vid_trie, self.vid16_trie)
            if updated_dbs:
                self.dbs = updated_dbs
            self.autoinc += 1

        def close(self):
            import os
            import ujson

            self.meta.put(self.txn, '_total_rows', str(self.autoinc))
            vid_nodes, vid_kids, _ = self.vid_trie.serialize()
            vid16_nodes, vid16_kids, _ = self.vid16_trie.serialize()
            vn_ptr, vn_len = vid_nodes.buffer_info()
            vk_ptr, vk_len = vid_kids.buffer_info()
            vn16_ptr, vn16_len = vid16_nodes.buffer_info()
            vk16_ptr, vk16_len = vid16_kids.buffer_info()
            self.meta.put_raw(self.txn, '_vid_nodes', vn_ptr, vn_len)
            self.meta.put_raw(self.txn, '_vid_kids', vk_ptr, vk_len)
            self.meta.put_raw(self.txn, '_vid16_nodes', vn16_ptr, vn16_len)
            self.meta.put_raw(self.txn, '_vid16_kids', vk16_ptr, vk16_len)
            self.meta.put(self.txn, 'name', ujson.dumps(self.result_table._name))
            self.meta.put(self.txn, 'fields', ujson.dumps(self.result_table._fields))
            for index, (subdb, subindexdb, bitmap_dict, column, last) in self.dbs.iteritems():
                if subindexdb:
                    # process all values for this bitmap index
                    if column.index_indicator == 2:
                        bitmap_dict.evictAll()
                    else:
                        for val, bitmap in bitmap_dict.iteritems():
                            subindexdb.put(self.txn, val, bitmap.dumps())
                # insert a sentinel row to value table
                subdb.put(self.txn, self.autoinc + 1, last)
            self.txn.commit()

            try:
                self.env.copy(self.url)
                # print "Dumped result to %s" % self.url
            except Exception as e:
                print "Copy error: %s" % e
                self.txn.abort()
                raise e
            self.env.close()
            os.unlink(self.filename)
Beispiel #9
0
    def test_wtrie(self):
        t = Trie()
        self.assertEqual(t.add('hello'), 1)
        self.assertEqual(t.add('hell'), 2)
        self.assertEqual(t.add('hello'), 1)
        self.assertEqual(t.add('hellothere'), 3)
        self.assertEqual(t.add('good'), 4)
        self.assertEqual(t.add('goodbye'), 5)
        self.assertEqual(t.add('hello'), 1)
        self.assertEqual(t.add('hellsink'), 6)
        self.assertEqual(t.add(''), 0)

        # nodes = t.nodes
        # t.print_it()

        key, sz, pt = t.node_at_path()
        self.assertEqual(sz, 2)

        key, sz, pt = t.node_at_path(104)
        self.assertEqual(key, 'hell')
        self.assertEqual(pt, 0)
        self.assertEqual(sz, 2, 'actual %s' % sz)

        key2, sz, pt = t.node_at_path(104, 111)
        self.assertEqual(key2, 'o', 'actual %s' % key)
        self.assertEqual(pt, 2)
        self.assertEqual(sz, 1)

        key, sz, pt = t.node_at_path(104, 111, 116)
        self.assertEqual(key, 'there')
        self.assertEqual(pt, 1)
        self.assertEqual(sz, 0)

        n, k, _ = t.serialize()
        self.assertEqual(len(n), 7 * 4, "actual %d" % len(n))
        self.assertEqual(len(k), 100, "actual %d" % len(k))
        # print "sqork: %s" % t.kid_space

        print 'nodes', n
        print 'kids', k

        unpacked = struct.unpack_from("7I", n, 0)
        expected = (0x02000000, 0x01000010, 0x0200000b, 0x00000013, 0x01000004,
                    0x00000008, 0x00000016)
        self.assertEqual(unpacked, expected, 'actual %s' % str(unpacked))

        unpacked = struct.unpack_from("IH2I", k, 0)
        expected = (0, 0, 0x67000004, 0x68000002)
        self.assertEqual(unpacked, expected, unpacked)

        unpacked = struct.unpack_from("IH4cI", k, 16)
        expected = (0x0000, 0x0004, 'g', 'o', 'o', 'd', 0x62000005)
        self.assertEqual(unpacked, expected, 'actual %s' % str(unpacked))

        unpacked = struct.unpack_from("IH3c", k, 32)
        expected = (0x0004, 0x0003, 'b', 'y', 'e')
        self.assertEqual(unpacked, expected, 'actual %s' % str(unpacked))

        unpacked = struct.unpack_from("IH4c2I", k, 44)
        expected = (0x0000, 0x0004, 'h', 'e', 'l', 'l', 0x6f000001, 0x73000006)
        self.assertEqual(unpacked, expected, 'actual %s' % str(unpacked))

        unpacked = struct.unpack_from("IHcI", k, 64)
        expected = (0x0002, 1, 'o', 0x74000003)
        self.assertEqual(unpacked, expected, 'actual %s' % str(unpacked))

        unpacked = struct.unpack_from("IH5c", k, 76)
        expected = (0x0001, 0x0005, 't', 'h', 'e', 'r', 'e')
        self.assertEqual(unpacked, expected, 'actual %s' % str(unpacked))

        unpacked = struct.unpack_from("IH4c", k, 88)
        expected = (0x0002, 0x0004, 's', 'i', 'n', 'k')
        self.assertEqual(unpacked, expected, 'actual %s' % str(unpacked))
Beispiel #10
0
    def test_wtrie(self):
        t = Trie()
        self.assertEqual(t.add('hello'), 1)
        self.assertEqual(t.add('hell'), 2)
        self.assertEqual(t.add('hello'), 1)
        self.assertEqual(t.add('hellothere'), 3)
        self.assertEqual(t.add('good'), 4)
        self.assertEqual(t.add('goodbye'), 5)
        self.assertEqual(t.add('hello'), 1)
        self.assertEqual(t.add('hellsink'), 6)
        self.assertEqual(t.add(''), 0)

        # nodes = t.nodes
        # t.print_it()

        key, sz, pt = t.node_at_path()
        self.assertEqual(sz, 2)

        key, sz, pt = t.node_at_path(104)
        self.assertEqual(key, 'hell')
        self.assertEqual(pt, 0)
        self.assertEqual(sz, 2, 'actual %s' % sz)

        key2, sz, pt = t.node_at_path(104, 111)
        self.assertEqual(key2, 'o', 'actual %s' % key)
        self.assertEqual(pt, 2)
        self.assertEqual(sz, 1)

        key, sz, pt = t.node_at_path(104, 111, 116)
        self.assertEqual(key, 'there')
        self.assertEqual(pt, 1)
        self.assertEqual(sz, 0)

        n, k, _ = t.serialize()
        self.assertEqual(len(n), 7 * 4, "actual %d" % len(n))
        self.assertEqual(len(k), 100, "actual %d" % len(k))
        # print "sqork: %s" % t.kid_space

        print 'nodes', n
        print 'kids', k

        unpacked = struct.unpack_from("7I", n, 0)
        expected = (0x02000000, 0x01000010, 0x0200000b, 0x00000013, 0x01000004, 0x00000008, 0x00000016)
        self.assertEqual(unpacked, expected, 'actual %s' % str(unpacked))

        unpacked = struct.unpack_from("IH2I", k, 0)
        expected = (0, 0, 0x67000004, 0x68000002)
        self.assertEqual(unpacked, expected, unpacked)

        unpacked = struct.unpack_from("IH4cI", k, 16)
        expected = (0x0000, 0x0004, 'g', 'o', 'o', 'd', 0x62000005)
        self.assertEqual(unpacked, expected, 'actual %s' % str(unpacked))

        unpacked = struct.unpack_from("IH3c", k, 32)
        expected = (0x0004, 0x0003, 'b', 'y', 'e')
        self.assertEqual(unpacked, expected, 'actual %s' % str(unpacked))

        unpacked = struct.unpack_from("IH4c2I", k, 44)
        expected = (0x0000, 0x0004, 'h', 'e', 'l', 'l', 0x6f000001, 0x73000006)
        self.assertEqual(unpacked, expected, 'actual %s' % str(unpacked))

        unpacked = struct.unpack_from("IHcI", k, 64)
        expected = (0x0002, 1, 'o', 0x74000003)
        self.assertEqual(unpacked, expected, 'actual %s' % str(unpacked))

        unpacked = struct.unpack_from("IH5c", k, 76)
        expected = (0x0001, 0x0005, 't', 'h', 'e', 'r', 'e')
        self.assertEqual(unpacked, expected, 'actual %s' % str(unpacked))

        unpacked = struct.unpack_from("IH4c", k, 88)
        expected = (0x0002, 0x0004, 's', 'i', 'n', 'k')
        self.assertEqual(unpacked, expected, 'actual %s' % str(unpacked))