Example #1
        def __init__(self, stream, url, params, **kwargs):
            import tempfile
            from wtrie import Trie

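            # NOTE: result_table is not an argument of __init__; this snippet assumes it is
            # captured from the enclosing scope in which this output stream class is defined.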
            self.result_table = result_table
            self.result_columns = result_table._field_names
            tmpdir = getattr(params, 'tmpdir', '/tmp')
            self.filename = tempfile.mktemp(prefix="hustle", dir=tmpdir)
            maxsize = getattr(params, 'maxsize', 100 * 1024 * 1024)
            self.env, self.txn, self.dbs, self.meta = self.result_table._open(self.filename, maxsize, write=True, lru_size=10000)
            self.autoinc = 1
            self.url = url
            self.vid_trie = Trie()
            self.vid16_trie = Trie()
Example #2
    def test_rtrie_in_mdb(self):
        t = Trie()
        self.assertEqual(t.add("hello"), 1)
        self.assertEqual(t.add("hell"), 2)
        self.assertEqual(t.add("hello"), 1)
        self.assertEqual(t.add("hellothere"), 3)
        self.assertEqual(t.add("good"), 4)
        self.assertEqual(t.add("goodbye"), 5)
        self.assertEqual(t.add("hello"), 1)
        self.assertEqual(t.add("hellsink"), 6)

        nodes, kids, _ = t.serialize()
        nodeaddr, nodelen = nodes.buffer_info()
        kidaddr, kidlen = kids.buffer_info()
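        # Round trip through LMDB: store the raw node/kid buffers in a '_meta_' db,
        # reopen the environment, and resolve values to vids from the stored buffers.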
        try:
            env = mdb.Env("/tmp/test_rtrie", flags=mdb.MDB_WRITEMAP | mdb.MDB_NOSYNC | mdb.MDB_NOSUBDIR)
            txn = env.begin_txn()
            db = env.open_db(txn, name="_meta_", flags=mdb.MDB_CREATE)
            db.put_raw(txn, "nodes", nodeaddr, nodelen)
            db.put_raw(txn, "kids", kidaddr, kidlen)

            n, ns = db.get_raw(txn, "nodes")
            k, ks = db.get_raw(txn, "kids")
            txn.commit()
            env.close()

            env = mdb.Env("/tmp/test_rtrie", flags=mdb.MDB_NOSYNC | mdb.MDB_NOSUBDIR)
            txn = env.begin_txn()
            db = env.open_db(txn, name="_meta_")

            n, ns = db.get_raw(txn, "nodes")
            k, ks = db.get_raw(txn, "kids")
            self.assertEqual(rtrie.vid_for_value(n, k, "hello"), 1)
            self.assertEqual(rtrie.vid_for_value(n, k, "hell"), 2)
            self.assertEqual(rtrie.vid_for_value(n, k, "goodbye"), 5)
            self.assertEqual(rtrie.vid_for_value(n, k, "hellsink"), 6)
            self.assertEqual(rtrie.vid_for_value(n, k, "hellothere"), 3)
            self.assertEqual(rtrie.vid_for_value(n, k, "good"), 4)
            self.assertIsNone(rtrie.vid_for_value(n, k, "notthere"))

            txn.commit()
            env.close()
        finally:
            import os

            os.unlink("/tmp/test_rtrie")
            os.unlink("/tmp/test_rtrie-lock")
Example #3
    def test_stress_wtrie(self):
        ktrie = Trie()
        strie = Trie()
        etrie = Trie()

        keywords = {}
        search_terms = {}
        exchange_ids = {}

        with open(fixture) as f:
            for data in f:
                for word in data.split(' '):
                    vid = ktrie.add(word)
                    actual_vid = keywords.get(word)
                    if actual_vid is not None:
                        self.assertEqual(vid, actual_vid)
                    else:
                        keywords[word] = vid

                vid = strie.add(data)
                actual_vid = search_terms.get(data)
                if actual_vid is not None:
                    self.assertEqual(vid, actual_vid)
                else:
                    search_terms[data] = vid

        nodes, kids, nodelen = etrie.serialize()
        naddr, nlen = nodes.buffer_info()
        kaddr, klen = kids.buffer_info()
        #summarize(naddr, kaddr, nodelen)
        #print_it(naddr, kaddr)

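        # Note: `fixture` is assumed to be a module-level path to a text file of sample
        # data. exchange_ids (and etrie) stay empty in this snippet, so the loop below,
        # which checks that re-adding a value returns its original vid and that
        # vid/value lookups on the serialized arrays agree, does not execute here.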
        for dc, vid in exchange_ids.iteritems():
            rvid = etrie.add(dc)
            self.assertEqual(vid, rvid)

            print dc, vid
            value = value_for_vid(naddr, kaddr, vid)
            self.assertEqual(dc, value)
            if dc != value:
                print "      dc=%s adc=%s" % (dc, value)

            avid = vid_for_value(naddr, kaddr, dc)
            #print "vid=%s avid=%s" % (vid, avid)
            self.assertEqual(vid, avid)
Example #4
    class HustleOutputStream(object):
        def __init__(self, stream, url, params, **kwargs):
            import tempfile
            from wtrie import Trie

            self.result_table = result_table
            self.result_columns = result_table._field_names
            tmpdir = getattr(params, 'tmpdir', '/tmp')
            self.filename = tempfile.mktemp(prefix="hustle", dir=tmpdir)
            maxsize = getattr(params, 'maxsize', 100 * 1024 * 1024)
            self.env, self.txn, self.dbs, self.meta = \
                self.result_table._open(self.filename, maxsize, write=True,
                                        lru_size=10000)
            self.autoinc = 1
            self.url = url
            self.vid_trie = Trie()
            self.vid16_trie = Trie()

        def add(self, k, v):
            from hustle.core.marble import _insert_row
            data = dict(zip(self.result_columns, list(k) + list(v)))
            #print "BOZAK! adding %s %s %s" % (self.result_columns, k, v)
            updated_dbs = _insert_row(data, self.txn, self.dbs, self.autoinc,
                                      self.vid_trie, self.vid16_trie)
            if updated_dbs:
                self.dbs = updated_dbs
            self.autoinc += 1

        def close(self):
            import os
            import ujson
            from disco import util

            self.meta.put(self.txn, '_total_rows', str(self.autoinc))
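            # Serialize both tries and store their raw node/kid buffers under the
            # meta db keys below, alongside the table name and field definitions.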
            vid_nodes, vid_kids, _ = self.vid_trie.serialize()
            vid16_nodes, vid16_kids, _ = self.vid16_trie.serialize()
            vn_ptr, vn_len = vid_nodes.buffer_info()
            vk_ptr, vk_len = vid_kids.buffer_info()
            vn16_ptr, vn16_len = vid16_nodes.buffer_info()
            vk16_ptr, vk16_len = vid16_kids.buffer_info()
            self.meta.put_raw(self.txn, '_vid_nodes', vn_ptr, vn_len)
            self.meta.put_raw(self.txn, '_vid_kids', vk_ptr, vk_len)
            self.meta.put_raw(self.txn, '_vid16_nodes', vn16_ptr, vn16_len)
            self.meta.put_raw(self.txn, '_vid16_kids', vk16_ptr, vk16_len)
            self.meta.put(self.txn, 'name',
                          ujson.dumps(self.result_table._name))
            self.meta.put(self.txn, 'fields',
                          ujson.dumps(self.result_table._fields))
            for index, (subdb, subindexdb, bitmap_dict, column,
                        last) in self.dbs.iteritems():
                if subindexdb:
                    # process all values for this bitmap index
                    if column.index_indicator == 2:
                        bitmap_dict.evictAll()
                    else:
                        for val, bitmap in bitmap_dict.iteritems():
                            subindexdb.put(self.txn, val, bitmap.dumps())
                # insert a sentinel row to value table
                subdb.put(self.txn, self.autoinc + 1, last)
            self.txn.commit()

            try:
                self.env.copy(self.url)
                # print "Dumped result to %s" % self.url
            except Exception as e:
                msg = "Copy error: %s" % e
                print msg
                self.txn.abort()
                raise util.DataError(msg, "")
            finally:
                self.env.close()
                os.unlink(self.filename)
Example #5
    def test_rtrie_in_memory(self):

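        # Non-ASCII key: stored in the trie as its UTF-8 encoded byte string.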
        s = unicode(u"séllsink").encode("utf-8")
        # print "HELLSINK: %s" % s

        t = Trie()
        self.assertEqual(t.add("hello"), 1)
        self.assertEqual(t.add("hell"), 2)
        self.assertEqual(t.add("hello"), 1)
        self.assertEqual(t.add("hellothere"), 3)
        self.assertEqual(t.add("good"), 4)
        self.assertEqual(t.add("goodbye"), 5)
        self.assertEqual(t.add("hello"), 1)
        self.assertEqual(t.add("hellsink"), 6)
        self.assertEqual(t.add(s), 7)
        t.print_it()

        nodes, kids, _ = t.serialize()
        nodeaddr, nodelen = nodes.buffer_info()
        kidaddr, kidlen = kids.buffer_info()
        print "LENS %s %s" % (nodelen, kidlen)

        for i in range(8):
            val = rtrie.value_for_vid(nodeaddr, kidaddr, i)
            print "Value", i, val

        self.assertEqual(rtrie.vid_for_value(nodeaddr, kidaddr, "hello"), 1)
        self.assertEqual(rtrie.vid_for_value(nodeaddr, kidaddr, "hell"), 2)
        self.assertEqual(rtrie.vid_for_value(nodeaddr, kidaddr, "goodbye"), 5)
        self.assertEqual(rtrie.vid_for_value(nodeaddr, kidaddr, "hellsink"), 6)
        self.assertEqual(rtrie.vid_for_value(nodeaddr, kidaddr, "hellothere"), 3)
        self.assertEqual(rtrie.vid_for_value(nodeaddr, kidaddr, "good"), 4)
        self.assertEqual(rtrie.vid_for_value(nodeaddr, kidaddr, s), 7)
        self.assertIsNone(rtrie.vid_for_value(nodeaddr, kidaddr, "notthere"))
        self.assertIsNone(rtrie.vid_for_value(nodeaddr, kidaddr, "h"))
        self.assertIsNone(rtrie.vid_for_value(nodeaddr, kidaddr, "he"))
        self.assertIsNone(rtrie.vid_for_value(nodeaddr, kidaddr, "hel"))
        self.assertIsNone(rtrie.vid_for_value(nodeaddr, kidaddr, "hells"))
Example #6
    class HustleOutputStream(object):
        def __init__(self, stream, url, params, **kwargs):
            import tempfile
            from wtrie import Trie

            self.result_table = result_table
            self.result_columns = result_table._field_names
            tmpdir = getattr(params, 'tmpdir', '/tmp')
            self.filename = tempfile.mktemp(prefix="hustle", dir=tmpdir)
            maxsize = getattr(params, 'maxsize', 100 * 1024 * 1024)
            self.env, self.txn, self.dbs, self.meta = \
                self.result_table._open(self.filename, maxsize, write=True,
                                        lru_size=10000)
            self.autoinc = 1
            self.url = url
            self.vid_trie = Trie()
            self.vid16_trie = Trie()

        def add(self, k, v):
            from hustle.core.marble import _insert_row
            data = dict(zip(self.result_columns, list(k) + list(v)))
            #print "BOZAK! adding %s %s %s" % (self.result_columns, k, v)
            updated_dbs = _insert_row(data, self.txn, self.dbs, self.autoinc,
                                      self.vid_trie, self.vid16_trie)
            if updated_dbs:
                self.dbs = updated_dbs
            self.autoinc += 1

        def close(self):
            import os
            import ujson

            self.meta.put(self.txn, '_total_rows', str(self.autoinc))
            vid_nodes, vid_kids, _ = self.vid_trie.serialize()
            vid16_nodes, vid16_kids, _ = self.vid16_trie.serialize()
            vn_ptr, vn_len = vid_nodes.buffer_info()
            vk_ptr, vk_len = vid_kids.buffer_info()
            vn16_ptr, vn16_len = vid16_nodes.buffer_info()
            vk16_ptr, vk16_len = vid16_kids.buffer_info()
            self.meta.put_raw(self.txn, '_vid_nodes', vn_ptr, vn_len)
            self.meta.put_raw(self.txn, '_vid_kids', vk_ptr, vk_len)
            self.meta.put_raw(self.txn, '_vid16_nodes', vn16_ptr, vn16_len)
            self.meta.put_raw(self.txn, '_vid16_kids', vk16_ptr, vk16_len)
            self.meta.put(self.txn, 'name', ujson.dumps(self.result_table._name))
            self.meta.put(self.txn, 'fields', ujson.dumps(self.result_table._fields))
            for index, (subdb, subindexdb, bitmap_dict, column, last) in self.dbs.iteritems():
                if subindexdb:
                    # process all values for this bitmap index
                    if column.index_indicator == 2:
                        bitmap_dict.evictAll()
                    else:
                        for val, bitmap in bitmap_dict.iteritems():
                            subindexdb.put(self.txn, val, bitmap.dumps())
                # insert a sentinel row to value table
                subdb.put(self.txn, self.autoinc + 1, last)
            self.txn.commit()

            try:
                self.env.copy(self.url)
                # print "Dumped result to %s" % self.url
            except Exception as e:
                print "Copy error: %s" % e
                self.txn.abort()
                raise e
            self.env.close()
            os.unlink(self.filename)
Example #7
    def test_wtrie(self):
        t = Trie()
        self.assertEqual(t.add('hello'), 1)
        self.assertEqual(t.add('hell'), 2)
        self.assertEqual(t.add('hello'), 1)
        self.assertEqual(t.add('hellothere'), 3)
        self.assertEqual(t.add('good'), 4)
        self.assertEqual(t.add('goodbye'), 5)
        self.assertEqual(t.add('hello'), 1)
        self.assertEqual(t.add('hellsink'), 6)
        self.assertEqual(t.add(''), 0)

        # nodes = t.nodes
        # t.print_it()

        key, sz, pt = t.node_at_path()
        self.assertEqual(sz, 2)

        key, sz, pt = t.node_at_path(104)
        self.assertEqual(key, 'hell')
        self.assertEqual(pt, 0)
        self.assertEqual(sz, 2, 'actual %s' % sz)

        key2, sz, pt = t.node_at_path(104, 111)
        self.assertEqual(key2, 'o', 'actual %s' % key2)
        self.assertEqual(pt, 2)
        self.assertEqual(sz, 1)

        key, sz, pt = t.node_at_path(104, 111, 116)
        self.assertEqual(key, 'there')
        self.assertEqual(pt, 1)
        self.assertEqual(sz, 0)

        n, k, _ = t.serialize()
        self.assertEqual(len(n), 7 * 4, "actual %d" % len(n))
        self.assertEqual(len(k), 100, "actual %d" % len(k))
        # print "sqork: %s" % t.kid_space

        print 'nodes', n
        print 'kids', k

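        # The expected values below suggest the serialized layout: each 32-bit node word
        # packs (child_count << 24) | kid_record_offset (in 32-bit words), and each kid
        # record holds a 32-bit parent vid, a 16-bit key length, the key bytes, and one
        # 32-bit (first_char << 24) | child_vid entry per child.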
        unpacked = struct.unpack_from("7I", n, 0)
        expected = (0x02000000, 0x01000010, 0x0200000b, 0x00000013, 0x01000004,
                    0x00000008, 0x00000016)
        self.assertEqual(unpacked, expected, 'actual %s' % str(unpacked))

        unpacked = struct.unpack_from("IH2I", k, 0)
        expected = (0, 0, 0x67000004, 0x68000002)
        self.assertEqual(unpacked, expected, unpacked)

        unpacked = struct.unpack_from("IH4cI", k, 16)
        expected = (0x0000, 0x0004, 'g', 'o', 'o', 'd', 0x62000005)
        self.assertEqual(unpacked, expected, 'actual %s' % str(unpacked))

        unpacked = struct.unpack_from("IH3c", k, 32)
        expected = (0x0004, 0x0003, 'b', 'y', 'e')
        self.assertEqual(unpacked, expected, 'actual %s' % str(unpacked))

        unpacked = struct.unpack_from("IH4c2I", k, 44)
        expected = (0x0000, 0x0004, 'h', 'e', 'l', 'l', 0x6f000001, 0x73000006)
        self.assertEqual(unpacked, expected, 'actual %s' % str(unpacked))

        unpacked = struct.unpack_from("IHcI", k, 64)
        expected = (0x0002, 1, 'o', 0x74000003)
        self.assertEqual(unpacked, expected, 'actual %s' % str(unpacked))

        unpacked = struct.unpack_from("IH5c", k, 76)
        expected = (0x0001, 0x0005, 't', 'h', 'e', 'r', 'e')
        self.assertEqual(unpacked, expected, 'actual %s' % str(unpacked))

        unpacked = struct.unpack_from("IH4c", k, 88)
        expected = (0x0002, 0x0004, 's', 'i', 'n', 'k')
        self.assertEqual(unpacked, expected, 'actual %s' % str(unpacked))
Example #8
    def _insert(self, streams, preprocess=None, maxsize=1024 * 1024 * 1024,
                tmpdir='/tmp', decoder=None, lru_size=10000):
        """insert a file into the hustle table."""
        from wtrie import Trie

        if not decoder:
            decoder = json_decoder

        partitions = {}
        counters = {}
        autoincs = {}
        vid_tries = {}
        vid16_tries = {}
        page_size = 4096
        pdata = None

        try:
            for stream in streams:
                for line in stream:
                    # print "Line: %s" % line
                    try:
                        data = decoder(line)
                    except Exception as e:
                        print "Exception decoding record (skipping): %s %s" % (e, line)
                    else:
                        if preprocess:
                            preprocess(data)

                        newpdata = str(data.get(self._partition, ''))
                        if pdata != newpdata:
                            pdata = newpdata
                            if pdata in partitions:
                                bigfile, env, txn, dbs, meta, pmaxsize = partitions[pdata]
                            else:
                                bigfile = tempfile.mktemp(prefix="hustle", dir=tmpdir) + '.big'
                                env, txn, dbs, meta = self._open(bigfile, maxsize=maxsize, write=True, lru_size=lru_size)
                                page_size = env.stat()['ms_psize']
                                partitions[pdata] = bigfile, env, txn, dbs, meta, maxsize
                                counters[pdata] = 0
                                autoincs[pdata] = 1
                                vid_tries[pdata] = Trie()
                                vid16_tries[pdata] = Trie()
                                pmaxsize = maxsize

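                        # Commit periodically; if fewer than 75% of the map's pages are
                        # still free, grow the memory map by 50% and reopen the databases.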
                        if counters[pdata] >= COMMIT_THRESHOLD:
                            txn.commit()
                            total_pages = pmaxsize / page_size
                            last_page = env.info()['me_last_pgno']
                            pages_left = total_pages - last_page
                            highwatermark = int(0.75 * total_pages)
                            if pages_left < highwatermark:
                                pmaxsize = int(pmaxsize * 1.5)
                                try:
                                    print "======= attempting to resize mmap ======"
                                    env.set_mapsize(pmaxsize)
                                    env, txn, dbs, meta = self._open_dbs(env, write=True, lru_size=lru_size)
                                except Exception as e:
                                    import traceback
                                    print "Error resizing MDB: %s" % e
                                    print traceback.format_exc(15)
                                    return 0, None
                            else:
                                txn = env.begin_txn()
                            # TODO: a bit of a hack - need to reset txns and dbs for all of our indexes
                            #  (iff they are LRUDicts)
                            for index, (_, subindexdb, bitmap_dict, _) in dbs.iteritems():
                                if bitmap_dict is not _dummy and type(bitmap_dict) is not defaultdict:
                                    lru_evict = bitmap_dict._Evict
                                    lru_fetch = bitmap_dict._Fetch
                                    lru_evict.txn = lru_fetch.txn = txn
                                    lru_evict.db = lru_fetch.db = subindexdb
                            partitions[pdata] = bigfile, env, txn, dbs, meta, pmaxsize
                            counters[pdata] = 0

                        _insert_row(data, txn, dbs, autoincs[pdata], vid_tries[pdata], vid16_tries[pdata])
                        # vid_trie = vid_tries[pdata]
                        # vid16_trie = vid16_tries[pdata]
                        # row_id = autoincs[pdata]
                        # for subdb, subindexdb, bitmap_dict, column in dbs.itervalues():
                        #
                        #     val = column.converter(data.get(column.name, column.default_value) or column.default_value,
                        #                            vid_trie, vid16_trie)
                        #     subdb.put(txn, row_id, val)
                        #     bitmap_dict[val].set(row_id)
                        autoincs[pdata] += 1
                        counters[pdata] += 1

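            # Finalize each partition: store the trie buffers and table metadata, flush
            # the bitmap indexes, commit, then copy the LMDB file to its final name
            # (the temporary '.big' suffix is dropped).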
            files = {}
            total_records = 0
            for pdata, (bigfile, env, txn, dbs, meta, pmaxsize) in partitions.iteritems():
                try:
                    meta.put(txn, '_total_rows', str(autoincs[pdata]))
                    total_records += autoincs[pdata] - 1
                    vid_nodes, vid_kids, _ = vid_tries[pdata].serialize()
                    vid16_nodes, vid16_kids, _ = vid16_tries[pdata].serialize()
                    vn_ptr, vn_len = vid_nodes.buffer_info()
                    vk_ptr, vk_len = vid_kids.buffer_info()
                    vn16_ptr, vn16_len = vid16_nodes.buffer_info()
                    vk16_ptr, vk16_len = vid16_kids.buffer_info()
                    meta.put_raw(txn, '_vid_nodes', vn_ptr, vn_len)
                    meta.put_raw(txn, '_vid_kids', vk_ptr, vk_len)
                    meta.put_raw(txn, '_vid16_nodes', vn16_ptr, vn16_len)
                    meta.put_raw(txn, '_vid16_kids', vk16_ptr, vk16_len)
                    meta.put(txn, 'name', ujson.dumps(self._name))
                    meta.put(txn, 'fields', ujson.dumps(self._fields))
                    meta.put(txn, 'partition', ujson.dumps(self._partition))
                    for index, (subdb, subindexdb, bitmap_dict, column) in dbs.iteritems():
                        if subindexdb:
                            # process all values for this bitmap index
                            if column.index_indicator == 2:
                                bitmap_dict.evictAll()
                            else:
                                for val, bitmap in bitmap_dict.iteritems():
                                    subindexdb.put(txn, val, bitmap.dumps())

                    txn.commit()
                except Exception as e:
                    print "Error writing to MDB: %s" % e
                    txn.abort()
                    import traceback
                    trace = traceback.format_exc(15)
                    print trace
                    return 0, None
                else:
                    # close dbs
                    meta.close()
                    for index, (subdb, subindexdb, _, _) in dbs.iteritems():
                        subdb.close()
                        if subindexdb:
                            subindexdb.close()
                    try:
                        outfile = bigfile[:-4]  # drop the '.big'
                        env.copy(outfile)
                        files[pdata] = outfile
                    except Exception as e:
                        print "Copy error: %s" % e
                        raise e
                env.close()
            return total_records, files
        finally:
            for _, (bigfile, _, _, _, _, _) in partitions.iteritems():
                os.unlink(bigfile)
                os.unlink(bigfile + '-lock')