def reduce(iter, params):
    """Build a DiscoDB from the grouped input and dump it to a partition file.

    The partition is derived from the first key of the resulting DiscoDB via
    ``util.default_partition``; the database is written to
    ``filename(name, partition)``.

    Args:
        iter: key/value input, passed through ``kvgroup`` before indexing.
        params: mapping providing ``'partitions'`` and ``'name'``; it is also
            forwarded unchanged to ``util.default_partition``.

    Yields:
        A single ``(partition, None)`` pair, or nothing at all when the
        DiscoDB has no keys (in which case no file is written).
    """
    partitions = params['partitions']
    name = params['name']
    discodb = DiscoDB(kvgroup(iter))

    # The builtin iter() is shadowed by the ``iter`` parameter, so the key
    # iterator is obtained through __iter__() directly.
    key_iterator = discodb.keys().__iter__()
    try:
        # figure out what partition we are in
        key = next(key_iterator)
    except StopIteration:
        # no keys, nothing to write
        return

    partition = util.default_partition(key, partitions, params)
    # Close the output deterministically instead of relying on the garbage
    # collector to release the handle.
    with open(filename(name, partition), 'w') as handle:
        discodb.dump(handle)
    yield partition, None
class TestSerializationProtocol(unittest.TestCase):
    """Round-trip checks for DiscoDB serialization (dumps/loads, dump/load)."""

    # Number of keys requested from k_vs_iter for the fixture database.
    numkeys = 10000

    def setUp(self):
        """Build the DiscoDB fixture from ``k_vs_iter(self.numkeys)``."""
        self.discodb = DiscoDB(k_vs_iter(self.numkeys))

    def test_dumps_loads(self):
        """A buffer produced by dumps() survives a loads()/dumps() round trip."""
        dbuffer = self.discodb.dumps()
        self.assertEqual(dbuffer, DiscoDB.loads(dbuffer).dumps())

    def test_dump_load(self):
        """A DiscoDB dumped to a file and loaded back serializes identically."""
        from tempfile import NamedTemporaryFile

        # The comparison stays inside the ``with`` block so the temporary
        # file is still open (and present) while the loaded DiscoDB is used.
        with NamedTemporaryFile() as handle:
            self.discodb.dump(handle)
            handle.seek(0)
            discodb = DiscoDB.load(handle)
            self.assertEqual(discodb.dumps(), self.discodb.dumps())
def test_leak():
    """Exercise the DiscoDB API in an endless loop; never returns.

    Each pass builds a DiscoDB, queries it, serializes it to a buffer and to
    '/tmp/discodb', loads both back and compares every key with every value.
    Presumably meant to be run while watching the process for memory growth.
    """
    while True:
        d = DiscoDB(zip(letters, ['abc'] * 1000))
        t = len(d.query('a'))
        t = len(d['b'])
        t = 'd' in d
        t = d.dumps()
        t = DiscoDB.loads(t)
        # Close each handle explicitly: in an endless loop, handles left to
        # the garbage collector can accumulate on interpreters without
        # reference counting and distort what this loop is meant to observe.
        with open('/tmp/discodb', 'w') as out_handle:
            t = d.dump(out_handle)
        with open('/tmp/discodb') as in_handle:
            t = DiscoDB.load(in_handle)
        for k in d.keys():
            for v in d.values():
                t = k == v
def test_leak():
    """Run every DiscoDB operation repeatedly, forever; never returns.

    One iteration constructs a DiscoDB, performs lookups and a query, round
    trips it through dumps()/loads() and through the file "/tmp/discodb", then
    compares all keys against all values. Presumably intended for observing
    memory usage of the process from outside.
    """
    while True:
        d = DiscoDB(zip(letters, ["abc"] * 1000))
        t = len(d.query("a"))
        t = len(d["b"])
        t = "d" in d
        t = d.dumps()
        t = DiscoDB.loads(t)
        # Deterministically close the files; otherwise open handles may pile
        # up across iterations when the interpreter does not use reference
        # counting.
        with open("/tmp/discodb", "w") as out_handle:
            t = d.dump(out_handle)
        with open("/tmp/discodb") as in_handle:
            t = DiscoDB.load(in_handle)
        for k in d.keys():
            for v in d.values():
                t = k == v
def create_db(self, name, data):
    """Write ``data`` as a DiscoDB file and return the file's path.

    The file is created as ``<DATA_DB_PATH>/<name>.db``, where DATA_DB_PATH
    is read from the environment.

    Args:
        name: base name of the database file (".db" is appended).
        data: input handed to the ``DiscoDB`` constructor.

    Returns:
        The path of the written database file.

    Raises:
        KeyError: if the DATA_DB_PATH environment variable is not set.
    """
    db_path = os.path.join(os.environ["DATA_DB_PATH"], name + ".db")
    # Build the database before opening the file so a construction failure
    # does not leave an empty or truncated file behind.
    db = DiscoDB(data)
    # Close the handle as soon as the dump completes (or fails).
    with open(db_path, "w") as handle:
        db.dump(handle)
    return db_path
#!/usr/bin/python
"""Build a DiscoDB from tab-separated key/value lines.

Usage: <script> [INPUT|-] [OUTPUT]

INPUT defaults to standard input (also selected by '-'); OUTPUT defaults to
'out.discodb'.
"""
import sys

from discodb import DiscoDB


def read_data(instream):
    """Yield ``(key, value)`` pairs parsed from the lines of ``instream``.

    Trailing whitespace is stripped from each line before it is split on tabs.
    Lines that do not split into exactly two fields are skipped.
    """
    for line in instream:
        try:
            (key, value) = line.rstrip().split("\t")
        except ValueError:
            # Wrong number of tab-separated fields: ignore the line.
            continue
        # The yield sits outside the try block so that closing the generator
        # (GeneratorExit) or interrupting it is never swallowed.
        yield (key, value)


in_path = sys.argv[1] if (len(sys.argv) > 1 and sys.argv[1] != '-') else None
out_path = sys.argv[2] if len(sys.argv) > 2 else 'out.discodb'

if in_path is None:
    db = DiscoDB(read_data(sys.stdin))
else:
    # Close the input file once the database has been built from it.
    with open(in_path, 'r') as instream:
        db = DiscoDB(read_data(instream))

# open() replaces the Python-2-only file() builtin; the context manager
# closes the output even if the dump fails.
with open(out_path, 'w') as outstream:
    db.dump(outstream)
def create_db(self, name, data):
    """Dump ``data`` into a DiscoDB file and return where it was written.

    The target is ``<DATA_DB_PATH>/<name>.db``; DATA_DB_PATH comes from the
    process environment.

    Args:
        name: database file name without the '.db' suffix.
        data: input passed to the ``DiscoDB`` constructor.

    Returns:
        The path of the database file that was written.

    Raises:
        KeyError: if DATA_DB_PATH is missing from the environment.
    """
    db_path = os.path.join(os.environ['DATA_DB_PATH'], name + '.db')
    # Construct first: if this fails, no file is created or truncated.
    db = DiscoDB(data)
    # The context manager guarantees the handle is closed after the dump.
    with open(db_path, 'w') as handle:
        db.dump(handle)
    return db_path
# NOTE(review): this fragment starts inside an outer loop that is not visible
# here (g1, g_lens and lens_ttls are bound above). The nesting below is
# reconstructed from a collapsed line -- confirm against the original layout.
for g2 in tags_srt_sub:
    # Count the results of the conjunctive query "g1 & g2".
    x = db2.query(Q.parse(g1 + " & " + g2))
    g_lens.append(len(x))
lens_ttls.append(g_lens)
print(g1)
t2 = time.time()
# Recorded timings (original notes, kept verbatim):
# 500: 48 sec: 5.2k/sec
# 1k: 182: 5.5k/sec
# 2k: 722: 5.5k/sec

# ** writing/loading
db_path = '/home/johannes/Dropbox/gsss/thesis/anls/try1/add_data/db.disco'

# Write mode instead of append: appending a second dump to an existing file
# leaves the old database at the start of the file, which is what a later
# load would then read. The context manager also closes the handle if the
# dump raises.
with open(db_path, 'w') as fo:
    db.dump(fo)

with open(db_path, 'r') as fi:
    dbsx = DiscoDB.load(fi)

# ** multiprocessing theory
from multiprocessing import Process


def f(name):
    """Print a greeting for ``name``, then count 0..4 with one-second pauses."""
    print('hello', name)
    # NOTE(review): the counting loop is assumed to belong to f -- confirm.
    for i in range(5):
        print(i)
        time.sleep(1)