def __init__(self, stream, url, params, **kwargs):
    """Open a temporary LMDB-backed 'marble' file to accumulate result rows.

    NOTE(review): `result_table` is referenced below but is neither a
    parameter nor defined in this scope -- presumably it should be
    supplied by the caller (e.g. via `params` or `kwargs`); verify.
    """
    import tempfile
    from wtrie import Trie
    # Output schema is taken from the result table's declared fields.
    self.result_table = result_table
    self.result_columns = result_table._field_names
    # Working file lives under params.tmpdir (default /tmp).  mktemp only
    # chooses a name; the marble's _open() creates the actual file.
    tmpdir = getattr(params, 'tmpdir', '/tmp')
    self.filename = tempfile.mktemp(prefix="hustle", dir=tmpdir)
    # Initial LMDB map size; default 100MB.
    maxsize = getattr(params, 'maxsize', 100 * 1024 * 1024)
    self.env, self.txn, self.dbs, self.meta = self.result_table._open(
        self.filename, maxsize, write=True, lru_size=10000)
    # Row ids start at 1 and are incremented once per inserted row.
    self.autoinc = 1
    self.url = url
    # Tries map string values to integer vids (32- and 16-bit variants).
    self.vid_trie = Trie()
    self.vid16_trie = Trie()
def __init__(self, stream, url, params, **kwargs):
    """Prepare an LMDB-backed output marble for accumulating result rows."""
    import tempfile
    from wtrie import Trie

    self.result_table = result_table
    self.result_columns = result_table._field_names

    # pick a working file name under the configured temp directory
    work_dir = getattr(params, 'tmpdir', '/tmp')
    self.filename = tempfile.mktemp(prefix="hustle", dir=work_dir)

    # open the marble with the configured (or default 100MB) map size
    map_size = getattr(params, 'maxsize', 100 * 1024 * 1024)
    opened = self.result_table._open(
        self.filename, map_size, write=True, lru_size=10000)
    self.env, self.txn, self.dbs, self.meta = opened

    self.autoinc = 1
    self.url = url
    self.vid_trie = Trie()
    self.vid16_trie = Trie()
def test_rtrie_in_mdb(self):
    """Round-trip a serialized trie through an LMDB file and resolve vids."""
    t = Trie()
    insertions = [("hello", 1), ("hell", 2), ("hello", 1), ("hellothere", 3),
                  ("good", 4), ("goodbye", 5), ("hello", 1), ("hellsink", 6)]
    for word, vid in insertions:
        self.assertEqual(t.add(word), vid)

    nodes, kids, _ = t.serialize()
    nodeaddr, nodelen = nodes.buffer_info()
    kidaddr, kidlen = kids.buffer_info()
    try:
        # write the raw node/kid arrays into the _meta_ db
        env = mdb.Env("/tmp/test_rtrie",
                      flags=mdb.MDB_WRITEMAP | mdb.MDB_NOSYNC | mdb.MDB_NOSUBDIR)
        txn = env.begin_txn()
        db = env.open_db(txn, name="_meta_", flags=mdb.MDB_CREATE)
        db.put_raw(txn, "nodes", nodeaddr, nodelen)
        db.put_raw(txn, "kids", kidaddr, kidlen)
        n, ns = db.get_raw(txn, "nodes")
        k, ks = db.get_raw(txn, "kids")
        txn.commit()
        env.close()

        # reopen and resolve vids straight out of the stored buffers
        env = mdb.Env("/tmp/test_rtrie",
                      flags=mdb.MDB_NOSYNC | mdb.MDB_NOSUBDIR)
        txn = env.begin_txn()
        db = env.open_db(txn, name="_meta_")
        n, ns = db.get_raw(txn, "nodes")
        k, ks = db.get_raw(txn, "kids")
        lookups = [("hello", 1), ("hell", 2), ("goodbye", 5),
                   ("hellsink", 6), ("hellothere", 3), ("good", 4)]
        for word, vid in lookups:
            self.assertEqual(rtrie.vid_for_value(n, k, word), vid)
        self.assertIsNone(rtrie.vid_for_value(n, k, "notthere"))
        txn.commit()
        env.close()
    finally:
        import os
        os.unlink("/tmp/test_rtrie")
        os.unlink("/tmp/test_rtrie-lock")
def test_stress_wtrie(self):
    """Stress test: feed every word and line of the fixture file into
    tries and verify that re-adding a value always yields the same vid.

    NOTE(review): `exchange_ids` is never populated, so the final loop
    (and the serialized-etrie round trip it would check) is currently a
    no-op -- presumably the fixture parsing for exchange ids was removed
    or never finished; verify intent.
    """
    ktrie = Trie()
    strie = Trie()
    etrie = Trie()
    keywords = {}
    search_terms = {}
    exchange_ids = {}
    with open(fixture) as f:
        for data in f:
            # per-word vids must be stable across duplicate adds
            for word in data.split(' '):
                vid = ktrie.add(word)
                actual_vid = keywords.get(word)
                if actual_vid is not None:
                    self.assertEqual(vid, actual_vid)
                else:
                    keywords[word] = vid
            # whole-line vids likewise
            vid = strie.add(data)
            actual_vid = search_terms.get(data)
            if actual_vid is not None:
                self.assertEqual(vid, actual_vid)
            else:
                search_terms[data] = vid
    nodes, kids, nodelen = etrie.serialize()
    naddr, nlen = nodes.buffer_info()
    kaddr, klen = kids.buffer_info()
    #summarize(naddr, kaddr, nodelen)
    #print_it(naddr, kaddr)
    for dc, vid in exchange_ids.iteritems():
        rvid = etrie.add(dc)
        self.assertEqual(vid, rvid)
        print dc, vid
        # vid -> value and value -> vid must agree with what add() returned
        value = value_for_vid(naddr, kaddr, vid)
        self.assertEqual(dc, value)
        if dc != value:
            print " dc=%s adc=%s" % (dc, value)
        avid = vid_for_value(naddr, kaddr, dc)
        #print "vid=%s avid=%s" % (vid, avid)
        self.assertEqual(vid, avid)
def test_stress_wtrie(self): ktrie = Trie() strie = Trie() etrie = Trie() keywords = {} search_terms = {} exchange_ids = {} with open(fixture) as f: for data in f: for word in data.split(" "): vid = ktrie.add(word) actual_vid = keywords.get(word) if actual_vid is not None: self.assertEqual(vid, actual_vid) else: keywords[word] = vid vid = strie.add(data) actual_vid = search_terms.get(data) if actual_vid is not None: self.assertEqual(vid, actual_vid) else: search_terms[data] = vid nodes, kids, nodelen = etrie.serialize() naddr, nlen = nodes.buffer_info() kaddr, klen = kids.buffer_info() # summarize(naddr, kaddr, nodelen) # print_it(naddr, kaddr) for dc, vid in exchange_ids.iteritems(): rvid = etrie.add(dc) self.assertEqual(vid, rvid) print dc, vid value = value_for_vid(naddr, kaddr, vid) self.assertEqual(dc, value) if dc != value: print " dc=%s adc=%s" % (dc, value) avid = vid_for_value(naddr, kaddr, dc) # print "vid=%s avid=%s" % (vid, avid) self.assertEqual(vid, avid)
class HustleOutputStream(object):
    """Disco output stream that writes result rows into an LMDB 'marble'.

    Rows are accumulated in a temporary LMDB file; close() writes the
    table metadata, commits, and copies the finished file to `url`.
    """
    def __init__(self, stream, url, params, **kwargs):
        import tempfile
        from wtrie import Trie
        # NOTE(review): `result_table` is not defined in this scope --
        # presumably it should arrive via params/kwargs; verify.
        self.result_table = result_table
        self.result_columns = result_table._field_names
        tmpdir = getattr(params, 'tmpdir', '/tmp')
        self.filename = tempfile.mktemp(prefix="hustle", dir=tmpdir)
        # initial LMDB map size; default 100MB
        maxsize = getattr(params, 'maxsize', 100 * 1024 * 1024)
        self.env, self.txn, self.dbs, self.meta = \
            self.result_table._open(self.filename, maxsize, write=True,
                                    lru_size=10000)
        self.autoinc = 1          # next row id; rows start at 1
        self.url = url
        self.vid_trie = Trie()    # string value -> vid dictionary
        self.vid16_trie = Trie()  # 16-bit variant

    def add(self, k, v):
        """Insert one (key, value) row into the open transaction."""
        from hustle.core.marble import _insert_row
        # zip key and value parts against the declared column names
        data = dict(zip(self.result_columns, list(k) + list(v)))
        #print "BOZAK! adding %s %s %s" % (self.result_columns, k, v)
        # _insert_row may hand back replacement db handles; adopt them.
        updated_dbs = _insert_row(data, self.txn, self.dbs, self.autoinc,
                                  self.vid_trie, self.vid16_trie)
        if updated_dbs:
            self.dbs = updated_dbs
        self.autoinc += 1

    def close(self):
        """Write metadata, commit, and copy the marble file to self.url.

        Raises disco.util.DataError if the copy to the destination
        fails; the LMDB env is closed and the temp file removed either
        way (try/finally).
        """
        import os
        import ujson
        from disco import util
        # NOTE(review): autoinc is the *next* row id, i.e. rows + 1 --
        # readers of '_total_rows' appear to expect this convention.
        self.meta.put(self.txn, '_total_rows', str(self.autoinc))
        # persist the serialized vid tries as raw byte ranges in _meta_
        vid_nodes, vid_kids, _ = self.vid_trie.serialize()
        vid16_nodes, vid16_kids, _ = self.vid16_trie.serialize()
        vn_ptr, vn_len = vid_nodes.buffer_info()
        vk_ptr, vk_len = vid_kids.buffer_info()
        vn16_ptr, vn16_len = vid16_nodes.buffer_info()
        vk16_ptr, vk16_len = vid16_kids.buffer_info()
        self.meta.put_raw(self.txn, '_vid_nodes', vn_ptr, vn_len)
        self.meta.put_raw(self.txn, '_vid_kids', vk_ptr, vk_len)
        self.meta.put_raw(self.txn, '_vid16_nodes', vn16_ptr, vn16_len)
        self.meta.put_raw(self.txn, '_vid16_kids', vk16_ptr, vk16_len)
        self.meta.put(self.txn, 'name', ujson.dumps(self.result_table._name))
        self.meta.put(self.txn, 'fields',
                      ujson.dumps(self.result_table._fields))
        for index, (subdb, subindexdb, bitmap_dict, column, last) in \
                self.dbs.iteritems():
            if subindexdb:
                # process all values for this bitmap index
                if column.index_indicator == 2:
                    bitmap_dict.evictAll()
                else:
                    for val, bitmap in bitmap_dict.iteritems():
                        subindexdb.put(self.txn, val, bitmap.dumps())
            # insert a sentinel row to value table
            subdb.put(self.txn, self.autoinc + 1, last)
        self.txn.commit()
        try:
            self.env.copy(self.url)
            # print "Dumped result to %s" % self.url
        except Exception as e:
            msg = "Copy error: %s" % e
            print msg
            # NOTE(review): txn was committed above; aborting a committed
            # txn is mdb-binding specific -- confirm this is a no-op.
            self.txn.abort()
            raise util.DataError(msg, "")
        finally:
            self.env.close()
            os.unlink(self.filename)
def test_rtrie_in_memory(self): s = unicode(u"séllsink").encode("utf-8") # print "HELLSINK: %s" % s t = Trie() self.assertEqual(t.add("hello"), 1) self.assertEqual(t.add("hell"), 2) self.assertEqual(t.add("hello"), 1) self.assertEqual(t.add("hellothere"), 3) self.assertEqual(t.add("good"), 4) self.assertEqual(t.add("goodbye"), 5) self.assertEqual(t.add("hello"), 1) self.assertEqual(t.add("hellsink"), 6) self.assertEqual(t.add(s), 7) t.print_it() nodes, kids, _ = t.serialize() nodeaddr, nodelen = nodes.buffer_info() kidaddr, kidlen = kids.buffer_info() print "LENS %s %s" % (nodelen, kidlen) for i in range(8): val = rtrie.value_for_vid(nodeaddr, kidaddr, i) print "Value", i, val self.assertEqual(rtrie.vid_for_value(nodeaddr, kidaddr, "hello"), 1) self.assertEqual(rtrie.vid_for_value(nodeaddr, kidaddr, "hell"), 2) self.assertEqual(rtrie.vid_for_value(nodeaddr, kidaddr, "goodbye"), 5) self.assertEqual(rtrie.vid_for_value(nodeaddr, kidaddr, "hellsink"), 6) self.assertEqual(rtrie.vid_for_value(nodeaddr, kidaddr, "hellothere"), 3) self.assertEqual(rtrie.vid_for_value(nodeaddr, kidaddr, "good"), 4) self.assertEqual(rtrie.vid_for_value(nodeaddr, kidaddr, s), 7) self.assertIsNone(rtrie.vid_for_value(nodeaddr, kidaddr, "notthere")) self.assertIsNone(rtrie.vid_for_value(nodeaddr, kidaddr, "h")) self.assertIsNone(rtrie.vid_for_value(nodeaddr, kidaddr, "he")) self.assertIsNone(rtrie.vid_for_value(nodeaddr, kidaddr, "hel")) self.assertIsNone(rtrie.vid_for_value(nodeaddr, kidaddr, "hells"))
class HustleOutputStream(object): def __init__(self, stream, url, params, **kwargs): import tempfile from wtrie import Trie self.result_table = result_table self.result_columns = result_table._field_names tmpdir = getattr(params, 'tmpdir', '/tmp') self.filename = tempfile.mktemp(prefix="hustle", dir=tmpdir) maxsize = getattr(params, 'maxsize', 100 * 1024 * 1024) self.env, self.txn, self.dbs, self.meta = \ self.result_table._open(self.filename, maxsize, write=True, lru_size=10000) self.autoinc = 1 self.url = url self.vid_trie = Trie() self.vid16_trie = Trie() def add(self, k, v): from hustle.core.marble import _insert_row data = dict(zip(self.result_columns, list(k) + list(v))) #print "BOZAK! adding %s %s %s" % (self.result_columns, k, v) updated_dbs = _insert_row(data, self.txn, self.dbs, self.autoinc, self.vid_trie, self.vid16_trie) if updated_dbs: self.dbs = updated_dbs self.autoinc += 1 def close(self): import os import ujson self.meta.put(self.txn, '_total_rows', str(self.autoinc)) vid_nodes, vid_kids, _ = self.vid_trie.serialize() vid16_nodes, vid16_kids, _ = self.vid16_trie.serialize() vn_ptr, vn_len = vid_nodes.buffer_info() vk_ptr, vk_len = vid_kids.buffer_info() vn16_ptr, vn16_len = vid16_nodes.buffer_info() vk16_ptr, vk16_len = vid16_kids.buffer_info() self.meta.put_raw(self.txn, '_vid_nodes', vn_ptr, vn_len) self.meta.put_raw(self.txn, '_vid_kids', vk_ptr, vk_len) self.meta.put_raw(self.txn, '_vid16_nodes', vn16_ptr, vn16_len) self.meta.put_raw(self.txn, '_vid16_kids', vk16_ptr, vk16_len) self.meta.put(self.txn, 'name', ujson.dumps(self.result_table._name)) self.meta.put(self.txn, 'fields', ujson.dumps(self.result_table._fields)) for index, (subdb, subindexdb, bitmap_dict, column, last) in self.dbs.iteritems(): if subindexdb: # process all values for this bitmap index if column.index_indicator == 2: bitmap_dict.evictAll() else: for val, bitmap in bitmap_dict.iteritems(): subindexdb.put(self.txn, val, bitmap.dumps()) # insert a sentinel row to value table 
subdb.put(self.txn, self.autoinc + 1, last) self.txn.commit() try: self.env.copy(self.url) # print "Dumped result to %s" % self.url except Exception as e: print "Copy error: %s" % e self.txn.abort() raise e self.env.close() os.unlink(self.filename)
def test_wtrie(self):
    """Check vid assignment, node_at_path traversal, and the exact binary
    layout of the serialized node/kid arrays.

    The expected tuples below pin the on-disk format byte-for-byte, so
    any change to Trie.serialize() will fail here first.
    """
    t = Trie()
    # duplicate adds must return the previously assigned vid
    self.assertEqual(t.add('hello'), 1)
    self.assertEqual(t.add('hell'), 2)
    self.assertEqual(t.add('hello'), 1)
    self.assertEqual(t.add('hellothere'), 3)
    self.assertEqual(t.add('good'), 4)
    self.assertEqual(t.add('goodbye'), 5)
    self.assertEqual(t.add('hello'), 1)
    self.assertEqual(t.add('hellsink'), 6)
    # the empty string maps to vid 0 (the root)
    self.assertEqual(t.add(''), 0)
    # nodes = t.nodes
    # t.print_it()
    # node_at_path() with no args is the root: returns (key, size, ptr)
    key, sz, pt = t.node_at_path()
    self.assertEqual(sz, 2)
    # descend by character codes: 104='h', 111='o', 116='t'
    key, sz, pt = t.node_at_path(104)
    self.assertEqual(key, 'hell')
    self.assertEqual(pt, 0)
    self.assertEqual(sz, 2, 'actual %s' % sz)
    key2, sz, pt = t.node_at_path(104, 111)
    self.assertEqual(key2, 'o', 'actual %s' % key)
    self.assertEqual(pt, 2)
    self.assertEqual(sz, 1)
    key, sz, pt = t.node_at_path(104, 111, 116)
    self.assertEqual(key, 'there')
    self.assertEqual(pt, 1)
    self.assertEqual(sz, 0)
    n, k, _ = t.serialize()
    # 7 nodes of 4 bytes each; kid array is exactly 100 bytes
    self.assertEqual(len(n), 7 * 4, "actual %d" % len(n))
    self.assertEqual(len(k), 100, "actual %d" % len(k))
    # print "sqork: %s" % t.kid_space
    print 'nodes', n
    print 'kids', k
    # node array: seven packed 32-bit words
    unpacked = struct.unpack_from("7I", n, 0)
    expected = (0x02000000, 0x01000010, 0x0200000b, 0x00000013,
                0x01000004, 0x00000008, 0x00000016)
    self.assertEqual(unpacked, expected, 'actual %s' % str(unpacked))
    # kid records at fixed offsets: header (I, H) + key chars + child words
    unpacked = struct.unpack_from("IH2I", k, 0)
    expected = (0, 0, 0x67000004, 0x68000002)
    self.assertEqual(unpacked, expected, unpacked)
    unpacked = struct.unpack_from("IH4cI", k, 16)
    expected = (0x0000, 0x0004, 'g', 'o', 'o', 'd', 0x62000005)
    self.assertEqual(unpacked, expected, 'actual %s' % str(unpacked))
    unpacked = struct.unpack_from("IH3c", k, 32)
    expected = (0x0004, 0x0003, 'b', 'y', 'e')
    self.assertEqual(unpacked, expected, 'actual %s' % str(unpacked))
    unpacked = struct.unpack_from("IH4c2I", k, 44)
    expected = (0x0000, 0x0004, 'h', 'e', 'l', 'l', 0x6f000001, 0x73000006)
    self.assertEqual(unpacked, expected, 'actual %s' % str(unpacked))
    unpacked = struct.unpack_from("IHcI", k, 64)
    expected = (0x0002, 1, 'o', 0x74000003)
    self.assertEqual(unpacked, expected, 'actual %s' % str(unpacked))
    unpacked = struct.unpack_from("IH5c", k, 76)
    expected = (0x0001, 0x0005, 't', 'h', 'e', 'r', 'e')
    self.assertEqual(unpacked, expected, 'actual %s' % str(unpacked))
    unpacked = struct.unpack_from("IH4c", k, 88)
    expected = (0x0002, 0x0004, 's', 'i', 'n', 'k')
    self.assertEqual(unpacked, expected, 'actual %s' % str(unpacked))
def test_rtrie_in_memory(self): s = unicode(u'séllsink').encode('utf-8') #print "HELLSINK: %s" % s t = Trie() self.assertEqual(t.add('hello'), 1) self.assertEqual(t.add('hell'), 2) self.assertEqual(t.add('hello'), 1) self.assertEqual(t.add('hellothere'), 3) self.assertEqual(t.add('good'), 4) self.assertEqual(t.add('goodbye'), 5) self.assertEqual(t.add('hello'), 1) self.assertEqual(t.add('hellsink'), 6) self.assertEqual(t.add(s), 7) t.print_it() nodes, kids, _ = t.serialize() nodeaddr, nodelen = nodes.buffer_info() kidaddr, kidlen = kids.buffer_info() print "LENS %s %s" % (nodelen, kidlen) for i in range(8): val = rtrie.value_for_vid(nodeaddr, kidaddr, i) print "Value", i, val self.assertEqual(rtrie.vid_for_value(nodeaddr, kidaddr, 'hello'), 1) self.assertEqual(rtrie.vid_for_value(nodeaddr, kidaddr, 'hell'), 2) self.assertEqual(rtrie.vid_for_value(nodeaddr, kidaddr, 'goodbye'), 5) self.assertEqual(rtrie.vid_for_value(nodeaddr, kidaddr, 'hellsink'), 6) self.assertEqual(rtrie.vid_for_value(nodeaddr, kidaddr, 'hellothere'), 3) self.assertEqual(rtrie.vid_for_value(nodeaddr, kidaddr, 'good'), 4) self.assertEqual(rtrie.vid_for_value(nodeaddr, kidaddr, s), 7) self.assertIsNone(rtrie.vid_for_value(nodeaddr, kidaddr, 'notthere')) self.assertIsNone(rtrie.vid_for_value(nodeaddr, kidaddr, 'h')) self.assertIsNone(rtrie.vid_for_value(nodeaddr, kidaddr, 'he')) self.assertIsNone(rtrie.vid_for_value(nodeaddr, kidaddr, 'hel')) self.assertIsNone(rtrie.vid_for_value(nodeaddr, kidaddr, 'hells'))
def test_rtrie_in_mdb(self):
    """Store a serialized trie's raw node/kid arrays in an LMDB file,
    reopen it, and resolve vids from the mapped buffers."""
    t = Trie()
    # duplicate adds must return the previously assigned vid
    self.assertEqual(t.add('hello'), 1)
    self.assertEqual(t.add('hell'), 2)
    self.assertEqual(t.add('hello'), 1)
    self.assertEqual(t.add('hellothere'), 3)
    self.assertEqual(t.add('good'), 4)
    self.assertEqual(t.add('goodbye'), 5)
    self.assertEqual(t.add('hello'), 1)
    self.assertEqual(t.add('hellsink'), 6)
    nodes, kids, _ = t.serialize()
    nodeaddr, nodelen = nodes.buffer_info()
    kidaddr, kidlen = kids.buffer_info()
    try:
        # write pass: store the raw buffers under '_meta_'
        env = mdb.Env('/tmp/test_rtrie',
                      flags=mdb.MDB_WRITEMAP | mdb.MDB_NOSYNC | mdb.MDB_NOSUBDIR)
        txn = env.begin_txn()
        db = env.open_db(txn, name='_meta_', flags=mdb.MDB_CREATE)
        db.put_raw(txn, 'nodes', nodeaddr, nodelen)
        db.put_raw(txn, 'kids', kidaddr, kidlen)
        n, ns = db.get_raw(txn, 'nodes')
        k, ks = db.get_raw(txn, 'kids')
        txn.commit()
        env.close()
        # read pass: reopen without WRITEMAP and look values up again
        env = mdb.Env('/tmp/test_rtrie',
                      flags=mdb.MDB_NOSYNC | mdb.MDB_NOSUBDIR)
        txn = env.begin_txn()
        db = env.open_db(txn, name='_meta_')
        n, ns = db.get_raw(txn, 'nodes')
        k, ks = db.get_raw(txn, 'kids')
        self.assertEqual(rtrie.vid_for_value(n, k, 'hello'), 1)
        self.assertEqual(rtrie.vid_for_value(n, k, 'hell'), 2)
        self.assertEqual(rtrie.vid_for_value(n, k, 'goodbye'), 5)
        self.assertEqual(rtrie.vid_for_value(n, k, 'hellsink'), 6)
        self.assertEqual(rtrie.vid_for_value(n, k, 'hellothere'), 3)
        self.assertEqual(rtrie.vid_for_value(n, k, 'good'), 4)
        self.assertIsNone(rtrie.vid_for_value(n, k, 'notthere'))
        txn.commit()
        env.close()
    finally:
        # remove the LMDB data and lock files either way
        import os
        os.unlink('/tmp/test_rtrie')
        os.unlink('/tmp/test_rtrie-lock')
def _insert(self, streams, preprocess=None, maxsize=1024 * 1024 * 1024,
            tmpdir='/tmp', decoder=None, lru_size=10000):
    """insert a file into the hustle table.

    Decodes each line of each stream (default: json_decoder), groups rows
    by the table's partition column, and writes each partition into its
    own temporary LMDB '.big' file.  Commits every COMMIT_THRESHOLD rows
    and grows the LMDB map by 1.5x when it is close to full.

    Returns (total_records, files) where `files` maps partition value ->
    finished marble file path, or (0, None) on a write/resize error.
    Temporary '.big' files and their '-lock' files are always removed.
    """
    from wtrie import Trie
    if not decoder:
        decoder = json_decoder
    # per-partition state, all keyed by the partition value (pdata)
    partitions = {}     # pdata -> (bigfile, env, txn, dbs, meta, pmaxsize)
    counters = {}       # rows since last commit
    autoincs = {}       # next row id (rows inserted + 1)
    vid_tries = {}      # value -> vid dictionaries
    vid16_tries = {}
    page_size = 4096    # overwritten with the env's real page size below
    pdata = None
    try:
        for stream in streams:
            for line in stream:
                # print "Line: %s" % line
                try:
                    data = decoder(line)
                except Exception as e:
                    # best-effort ingest: log and skip undecodable records
                    print "Exception decoding record (skipping): %s %s" % (e, line)
                else:
                    if preprocess:
                        preprocess(data)
                    # route the row to its partition
                    newpdata = str(data.get(self._partition, ''))
                    if pdata != newpdata:
                        pdata = newpdata
                    if pdata in partitions:
                        bigfile, env, txn, dbs, meta, pmaxsize = partitions[pdata]
                    else:
                        # first row for this partition: open a fresh marble
                        bigfile = tempfile.mktemp(prefix="hustle", dir=tmpdir) + '.big'
                        env, txn, dbs, meta = self._open(bigfile, maxsize=maxsize,
                                                         write=True, lru_size=lru_size)
                        page_size = env.stat()['ms_psize']
                        partitions[pdata] = bigfile, env, txn, dbs, meta, maxsize
                        counters[pdata] = 0
                        autoincs[pdata] = 1
                        vid_tries[pdata] = Trie()
                        vid16_tries[pdata] = Trie()
                        pmaxsize = maxsize
                    if counters[pdata] >= COMMIT_THRESHOLD:
                        # periodic commit; then check remaining map headroom
                        txn.commit()
                        total_pages = pmaxsize / page_size
                        last_page = env.info()['me_last_pgno']
                        pages_left = total_pages - last_page
                        # grow when less than 75% of the map remains free
                        highwatermark = int(0.75 * total_pages)
                        if pages_left < highwatermark:
                            pmaxsize = int(pmaxsize * 1.5)
                            try:
                                print "======= attempting to resize mmap ======"
                                env.set_mapsize(pmaxsize)
                                # reopen dbs against the resized map
                                env, txn, dbs, meta = self._open_dbs(env, write=True,
                                                                     lru_size=lru_size)
                            except Exception as e:
                                import traceback
                                print "Error resizing MDB: %s" % e
                                print traceback.format_exc(15)
                                # unrecoverable; finally-block cleans up files
                                return 0, None
                        else:
                            txn = env.begin_txn()
                        #TODO: a bit a hack - need to reset txns and dbs for all of our indexes
                        # (iff they are LRUDicts)
                        for index, (_, subindexdb, bitmap_dict, _) in dbs.iteritems():
                            if bitmap_dict is not _dummy and type(bitmap_dict) is not defaultdict:
                                lru_evict = bitmap_dict._Evict
                                lru_fetch = bitmap_dict._Fetch
                                lru_evict.txn = lru_fetch.txn = txn
                                lru_evict.db = lru_fetch.db = subindexdb
                        partitions[pdata] = bigfile, env, txn, dbs, meta, pmaxsize
                        counters[pdata] = 0
                    _insert_row(data, txn, dbs, autoincs[pdata],
                                vid_tries[pdata], vid16_tries[pdata])
                    # vid_trie = vid_tries[pdata]
                    # vid16_trie = vid16_tries[pdata]
                    # row_id = autoincs[pdata]
                    # for subdb, subindexdb, bitmap_dict, column in dbs.itervalues():
                    #
                    #     val = column.converter(data.get(column.name, column.default_value) or column.default_value,
                    #                            vid_trie, vid16_trie)
                    #     subdb.put(txn, row_id, val)
                    #     bitmap_dict[val].set(row_id)
                    autoincs[pdata] += 1
                    counters[pdata] += 1
        # finalize every partition: write metadata, flush indexes, copy out
        files = {}
        total_records = 0
        for pdata, (bigfile, env, txn, dbs, meta, pmaxsize) in partitions.iteritems():
            try:
                # autoinc is rows + 1 (the '_total_rows' convention)
                meta.put(txn, '_total_rows', str(autoincs[pdata]))
                total_records += autoincs[pdata] - 1
                # persist the serialized vid tries as raw byte ranges
                vid_nodes, vid_kids, _ = vid_tries[pdata].serialize()
                vid16_nodes, vid16_kids, _ = vid16_tries[pdata].serialize()
                vn_ptr, vn_len = vid_nodes.buffer_info()
                vk_ptr, vk_len = vid_kids.buffer_info()
                vn16_ptr, vn16_len = vid16_nodes.buffer_info()
                vk16_ptr, vk16_len = vid16_kids.buffer_info()
                meta.put_raw(txn, '_vid_nodes', vn_ptr, vn_len)
                meta.put_raw(txn, '_vid_kids', vk_ptr, vk_len)
                meta.put_raw(txn, '_vid16_nodes', vn16_ptr, vn16_len)
                meta.put_raw(txn, '_vid16_kids', vk16_ptr, vk16_len)
                meta.put(txn, 'name', ujson.dumps(self._name))
                meta.put(txn, 'fields', ujson.dumps(self._fields))
                meta.put(txn, 'partition', ujson.dumps(self._partition))
                for index, (subdb, subindexdb, bitmap_dict, column) in dbs.iteritems():
                    if subindexdb:
                        # process all values for this bitmap index
                        if column.index_indicator == 2:
                            bitmap_dict.evictAll()
                        else:
                            for val, bitmap in bitmap_dict.iteritems():
                                subindexdb.put(txn, val, bitmap.dumps())
                txn.commit()
            except Exception as e:
                print "Error writing to MDB: %s" % e
                txn.abort()
                import traceback
                trace = traceback.format_exc(15)
                print trace
                return 0, None
            else:
                # close dbs
                meta.close()
                for index, (subdb, subindexdb, _, _) in dbs.iteritems():
                    subdb.close()
                    if subindexdb:
                        subindexdb.close()
                try:
                    outfile = bigfile[:-4]  # drop the '.big'
                    # compact-copy the finished marble to its final name
                    env.copy(outfile)
                    files[pdata] = outfile
                except Exception as e:
                    print "Copy error: %s" % e
                    raise e
                env.close()
        return total_records, files
    finally:
        # always remove the temporary '.big' files and their lock files
        for _, (bigfile, _, _, _, _, _) in partitions.iteritems():
            os.unlink(bigfile)
            os.unlink(bigfile + '-lock')