def migrate(fin, fout, avsc):
    """
    Load JSON data from file ``fin`` and write it to avro file ``fout``.

    :param fin: path of the input file; its content is parsed with json.load
        (presumably a dump of MongoDB documents -- confirm with the caller)
    :param fout: output file name handed to AvroStorage.file_write
    :param avsc: avro schema location; the 'avroio:' prefix is added
        when it is missing
    """
    auri = avsc if avsc.startswith('avroio:') else 'avroio:%s' % avsc
    astg = AvroStorage(auri)

    # read input data; the context manager closes the file handle even
    # if JSON parsing fails
    with open(fin) as istream:
        data = json.load(istream)

    # store data to Avro
    wmaid = astg.file_write(fout, data)
    print("Wrote %s, wmaid=%s" % (fout, wmaid))
class FileStorageTest(unittest.TestCase):
    "Unit tests for AvroStorage write/read and file_write/file_read calls"

    def setUp(self):
        "Create temp dir, a test document and a schema file generated from it"
        self.tdir = tempfile.mkdtemp()
        data = {
            "int": 1,
            "float": 1.2,
            "list": [1, 2, 3],
            "dict": {
                "dname": "foo",
                "dval": 1
            },
            "listdict": [{
                "lname": "foo"
            }],
            "str": "string"
        }
        # keep a (shallow) copy of the document without wmaid/stype keys
        self.bare_data = dict(data)
        data['wmaid'] = wmaHash(data)
        data['stype'] = 'avroio'
        self.data = data
        # the schema is generated from the full document (incl. wmaid/stype)
        schema = gen_schema(self.data)
        sname = os.path.join(self.tdir, 'schema.avsc')
        with open(sname, 'w') as ostream:
            ostream.write(json.dumps(schema))
        self.mgr = AvroStorage('avroio:%s' % sname)

    def tearDown(self):
        "Tear down content of temp dir"
        for fname in os.listdir(self.tdir):
            os.remove(os.path.join(self.tdir, fname))
        os.rmdir(self.tdir)

    def test_write(self):
        "Test write functionality"
        wmaids = self.mgr.write(self.data)
        self.assertEqual(len(wmaids), 1)
        data = self.mgr.read(wmaids[0])
        self.assertEqual(data[0], self.bare_data)

    def test_file_write(self):
        "Test file_write functionality"
        fname = os.path.join(self.tdir, 'file.avro')
        wmaids = self.mgr.file_write(fname, self.data)
        self.assertEqual(len(wmaids), 1)
        data = self.mgr.file_read(fname)
        self.assertEqual(data[0], self.data)

    def test_file_write_exception(self):
        "Test file_write functionality with exception"
        # we should not have access to /etc
        # NOTE(review): this assumption does not hold when tests run as root
        fname = '/etc/file.avro'
        # pass fname and data as two separate positional arguments; a single
        # tuple argument would call file_write with the wrong arity, so the
        # test could pass without ever touching the file system
        self.assertRaises(Exception, self.mgr.file_write, fname, self.data)
def migrate(muri, dbname, odir, mdir, avsc, thr, compress, chunk, close2midnight, dtype):
    """
    Write data from MongoDB (muri) to avro file(s) on local file system.

    Documents whose 'stype' equals the MongoDB storage type and whose 'dtype'
    equals ``dtype`` are read in slices of ``chunk`` documents and passed to
    AvroStorage.file_write. For the ids returned by that call the 'stype'
    attribute is updated in MongoDB to the avro storage type.

    :param muri: MongoDB URI passed to MongoStorage
    :param dbname: database name passed to MongoStorage
    :param odir, mdir, thr, compress, close2midnight: forwarded unchanged to
        file_name(), which provides the output file name for every slice
    :param avsc: avro schema location; the 'avroio:' prefix is added
        when it is missing
    :param chunk: number of documents read and written per iteration
    :param dtype: document type used in the MongoDB query
    """
    mstg = MongoStorage(muri, dbname)
    auri = avsc if avsc.startswith('avroio:') else 'avroio:%s' % avsc
    astg = AvroStorage(auri)

    # read data from MongoDB for given storage and document types
    query = {'stype': mstg.stype, 'dtype': dtype}
    # with no fields we'll get entire docs; iter() makes sure islice below
    # consumes the result instead of restarting from its first element
    mdocs = iter(mstg.find(query, None))

    # loop over provided docs and write them into avro file on local file system
    wmaids = []
    total = 0
    fsize = 0
    fname = file_name(odir, mdir, thr, compress, close2midnight)
    while True:
        data = list(itertools.islice(mdocs, chunk))
        if not data:
            break
        total += len(data)
        ids = astg.file_write(fname, data)
        if os.path.isfile(fname):
            fsize = os.path.getsize(fname)
        wmaids += ids
        if ids:
            # update status attributes of docs in MongoDB
            spec = {'$set': {'stype': astg.stype}}
            mstg.update(ids, spec)
        # memory report is best effort: any failure yields an empty string,
        # but KeyboardInterrupt/SystemExit are no longer swallowed
        rss = ''
        if PSUTIL:
            try:
                mem = psutil.Process(os.getpid()).memory_info()
                rss = 'RSS:%s' % size_format(mem.rss)
            except Exception:
                rss = ''
        print(tstamp('mongo2avro'), "%s docs %s %s (%s bytes) %s"
              % (len(ids), fname, size_format(fsize), fsize, rss))
        # ask for the file name again before the next slice
        fname = file_name(odir, mdir, thr, compress, close2midnight)
    print(tstamp('mongo2avro'), "wrote %s docs out of %s" % (len(wmaids), total))
def migrate(muri, odir, mdir, avsc, thr, compress, chunk, close2midnight):
    """
    Write data from MongoDB (muri) to avro file(s) on local file system.

    Documents whose 'stype' equals the MongoDB storage type are read in
    slices of ``chunk`` documents and passed to AvroStorage.file_write.
    For the ids returned by that call the 'stype' attribute is updated in
    MongoDB to the avro storage type.

    :param muri: MongoDB URI passed to MongoStorage
    :param odir, mdir, thr, compress, close2midnight: forwarded unchanged to
        file_name(), which provides the output file name for every slice
    :param avsc: avro schema location; the 'avroio:' prefix is added
        when it is missing
    :param chunk: number of documents read and written per iteration
    """
    mstg = MongoStorage(muri)
    auri = avsc if avsc.startswith('avroio:') else 'avroio:%s' % avsc
    astg = AvroStorage(auri)

    # read data from MongoDB, returned mdocs is generator type
    query = {'stype': mstg.stype}
    # with no fields we'll get entire docs; iter() makes sure islice below
    # consumes the result instead of restarting from its first element
    mdocs = iter(mstg.find(query, None))

    # loop over provided docs and write them into avro file on local file system
    wmaids = []
    total = 0
    fsize = 0
    fname = file_name(odir, mdir, thr, compress, close2midnight)
    while True:
        data = list(itertools.islice(mdocs, chunk))
        if not data:
            break
        total += len(data)
        ids = astg.file_write(fname, data)
        if os.path.isfile(fname):
            fsize = os.path.getsize(fname)
        wmaids += ids
        if ids:
            # update status attributes of docs in MongoDB
            spec = {'$set': {'stype': astg.stype}}
            mstg.update(ids, spec)
        # memory report is best effort: any failure yields an empty string,
        # but KeyboardInterrupt/SystemExit are no longer swallowed
        rss = ''
        if PSUTIL:
            try:
                mem = psutil.Process(os.getpid()).memory_info()
                rss = 'RSS:%s' % size_format(mem.rss)
            except Exception:
                rss = ''
        print(tstamp('mongo2avro'), "%s docs %s %s (%s bytes) %s"
              % (len(ids), fname, size_format(fsize), fsize, rss))
        # ask for the file name again before the next slice
        fname = file_name(odir, mdir, thr, compress, close2midnight)
    print(tstamp('mongo2avro'), "wrote %s docs out of %s" % (len(wmaids), total))
class FileStorageTest(unittest.TestCase):
    "Unit tests for AvroStorage write/read and file_write/file_read calls"

    def setUp(self):
        "Create temp dir, a test document and a schema file generated from it"
        self.tdir = tempfile.mkdtemp()
        data = {"int": 1, "float": 1.2, "list": [1, 2, 3],
                "dict": {"dname": "foo", "dval": 1},
                "listdict": [{"lname": "foo"}], "str": "string"}
        # keep a (shallow) copy of the document without wmaid/stype keys
        self.bare_data = dict(data)
        data['wmaid'] = wmaHash(data)
        data['stype'] = 'avroio'
        self.data = data
        # the schema is generated from the full document (incl. wmaid/stype)
        schema = gen_schema(self.data)
        sname = os.path.join(self.tdir, 'schema.avsc')
        with open(sname, 'w') as ostream:
            ostream.write(json.dumps(schema))
        self.mgr = AvroStorage('avroio:%s' % sname)

    def tearDown(self):
        "Tear down content of temp dir"
        for fname in os.listdir(self.tdir):
            os.remove(os.path.join(self.tdir, fname))
        os.rmdir(self.tdir)

    def test_write(self):
        "Test write functionality"
        wmaids = self.mgr.write(self.data)
        self.assertEqual(len(wmaids), 1)
        data = self.mgr.read(wmaids[0])
        self.assertEqual(data[0], self.bare_data)

    def test_file_write(self):
        "Test file_write functionality"
        fname = os.path.join(self.tdir, 'file.avro')
        wmaids = self.mgr.file_write(fname, self.data)
        self.assertEqual(len(wmaids), 1)
        data = self.mgr.file_read(fname)
        self.assertEqual(data[0], self.data)

    def test_file_write_exception(self):
        "Test file_write functionality with exception"
        # we should not have access to /etc
        # NOTE(review): this assumption does not hold when tests run as root
        fname = '/etc/file.avro'
        # pass fname and data as two separate positional arguments; a single
        # tuple argument would call file_write with the wrong arity, so the
        # test could pass without ever touching the file system
        self.assertRaises(Exception, self.mgr.file_write, fname, self.data)