def migrate(muri, dbname, huri):
    """Migrate data from MongoDB (muri, database dbname) to HDFS (huri).

    Documents whose 'stype' equals the MongoDB storage type are read,
    written to HDFS, read back and compared one by one with the originals.
    Only when every document matches are the MongoDB documents re-labelled
    with the HDFS storage type.

    The process exits with status 1 (via sys.exit) when the number of
    documents read back differs from the number written, or when any pair
    of documents differs.
    """
    mstg = MongoStorage(muri, dbname)
    hstg = HdfsStorage(huri)

    # read data from MongoDB
    mdocs = mstg.read({'stype': mstg.stype})

    # do nothing if no documents are found
    if not mdocs:
        return

    # remember ids now: the cross-check below strips 'wmaid' from the docs
    mids = [d['wmaid'] for d in mdocs]

    # store data to HDFS and read it back
    wmaid = hstg.write(mdocs)
    hdocs = list(hstg.read(wmaid))

    # zip() stops at the shorter input, so a count mismatch must be
    # detected explicitly or it would go unnoticed by the cross-check
    if len(hdocs) != len(mdocs):
        print("ERROR", "wrote %s docs to HDFS but read back %s"
              % (len(mdocs), len(hdocs)))
        sys.exit(1)

    # now we can compare MongoDB docs with HDFS docs, a la cross-check
    for mdoc, hdoc in zip(mdocs, hdocs):
        # drop WMArchive keys, they are not part of the payload
        for key in ('stype', 'wmaid'):
            mdoc.pop(key, None)
            hdoc.pop(key, None)
        if mdoc != hdoc:
            print("ERROR", mdoc, hdoc)
            sys.exit(1)

    # update status attributes of docs in MongoDB
    mstg.update(mids, {'$set': {'stype': hstg.stype}})
def migrate(muri, huri):
    """Migrate data from MongoDB (muri) to HDFS (huri).

    Documents whose 'stype' equals the MongoDB storage type are read,
    written to HDFS, read back and compared one by one with the originals.
    Only when every document matches are the MongoDB documents re-labelled
    with the HDFS storage type.

    The process exits with status 1 (via sys.exit) when the number of
    documents read back differs from the number written, or when any pair
    of documents differs.
    """
    mstg = MongoStorage(muri)
    hstg = HdfsStorage(huri)

    # read data from MongoDB
    mdocs = mstg.read({'stype': mstg.stype})

    # do nothing if no documents are found
    if not mdocs:
        return

    # remember ids now: the cross-check below strips 'wmaid' from the docs
    mids = [d['wmaid'] for d in mdocs]

    # store data to HDFS and read it back
    wmaid = hstg.write(mdocs)
    hdocs = list(hstg.read(wmaid))

    # zip() stops at the shorter input, so a count mismatch must be
    # detected explicitly or it would go unnoticed by the cross-check
    if len(hdocs) != len(mdocs):
        print("ERROR", "wrote %s docs to HDFS but read back %s"
              % (len(mdocs), len(hdocs)))
        sys.exit(1)

    # now we can compare MongoDB docs with HDFS docs, a la cross-check
    for mdoc, hdoc in zip(mdocs, hdocs):
        # drop WMArchive keys, they are not part of the payload
        for key in ('stype', 'wmaid'):
            mdoc.pop(key, None)
            hdoc.pop(key, None)
        if mdoc != hdoc:
            print("ERROR", mdoc, hdoc)
            sys.exit(1)

    # update status attributes of docs in MongoDB
    mstg.update(mids, {'$set': {'stype': hstg.stype}})
def cleanup(muri, tst, stype):
    """Cleanup data in MongoDB (muri) for given timestamp (tst).

    Removes the records whose 'stype' equals the given stype and whose
    'wmats' is less than dateformat(tst). The number of matching records
    and the time spent on counting and on removal are printed.
    """
    time0 = time.time()
    mstg = MongoStorage(muri)

    # select records of the given storage type whose time stamp is less
    # than the provided one
    query = {'stype': stype, 'wmats': {'$lt': dateformat(tst)}}

    rdocs = mstg.ndocs(query)
    print(tstamp('mongo2avro'),
          'found %s docs (in %s) to be removed' % (rdocs, elapsed_time(time0)))

    # restart the clock so that the removal is timed on its own
    time0 = time.time()
    mstg.remove(query)
    print(tstamp('mongo2avro'),
          'remove query %s in %s' % (query, elapsed_time(time0)))
def cleanup(muri, tst, stype, verbose):
    """Remove from MongoDB (muri) the records selected by stype and tst.

    A record is selected when its 'stype' equals stype and its 'wmats' is
    less than dateformat(tst). With a true verbose flag the removal spec,
    the response of the storage and the elapsed time are printed.
    """
    started = time.time()
    storage = MongoStorage(muri)

    # upper bound on the record time stamp
    older_than = {'$lt': dateformat(tst)}
    spec = {'stype': stype, 'wmats': older_than}

    if verbose:
        print("Clean-up records in MongoDB: %s" % muri)
        print("MongoDB cleanup spec:", spec)

    outcome = storage.remove(spec)

    if not verbose:
        return
    print("response: %s" % outcome)
    print("Elapsed time: %s" % elapsed_time(started))
def cleanup(muri, tst, stype, verbose):
    """Remove from MongoDB (muri) the records selected by stype and tst.

    A record is selected when its 'stype' equals stype and its 'wmats' is
    less than dateformat(tst). With a true verbose flag the removal spec,
    the response of the storage and the elapsed time are printed.
    """
    started = time.time()
    storage = MongoStorage(muri)

    # upper bound on the record time stamp
    older_than = {'$lt': dateformat(tst)}
    spec = {'stype': stype, 'wmats': older_than}

    if verbose:
        print("Clean-up records in MongoDB: %s" % muri)
        print("MongoDB cleanup spec:", spec)

    outcome = storage.remove(spec)

    if not verbose:
        return
    print("response: %s" % outcome)
    print("Elapsed time: %s" % elapsed_time(started))
def cleanup(muri, tst, stype):
    """Cleanup data in MongoDB (muri) for given timestamp (tst).

    Removes the records whose 'stype' equals the given stype and whose
    'wmats' is less than dateformat(tst). The number of matching records
    and the time spent on counting and on removal are printed.
    """
    time0 = time.time()
    mstg = MongoStorage(muri)

    # select records of the given storage type whose time stamp is less
    # than the provided one
    query = {'stype': stype, 'wmats': {'$lt': dateformat(tst)}}

    rdocs = mstg.ndocs(query)
    print(tstamp('mongo2avro'),
          'found %s docs (in %s) to be removed' % (rdocs, elapsed_time(time0)))

    # restart the clock so that the removal is timed on its own
    time0 = time.time()
    mstg.remove(query)
    print(tstamp('mongo2avro'),
          'remove query %s in %s' % (query, elapsed_time(time0)))
def migrate(muri):
    """Write one test document to MongoDB (muri) and print what is read back.

    The document goes to the 'test_fwjr' database. Documents returned by an
    empty query are printed one by one; the loop stops at the first
    document that carries an '_id' key.
    """
    storage = MongoStorage(muri, dbname='test_fwjr')
    storage.write([{"test": 1, 'wmaid': 1}])

    # read data back from MongoDB using an empty query
    for record in storage.read({}):
        print(record)
        if '_id' not in record:
            continue
        print("found _id in doc")
        print(record)
        break
def migrate(muri):
    """Write one test document to MongoDB (muri) and print what is read back.

    The document goes to the 'test_fwjr' database. Documents returned by an
    empty query are printed one by one; the loop stops at the first
    document that carries an '_id' key.
    """
    storage = MongoStorage(muri, dbname='test_fwjr')
    storage.write([{"test": 1, 'wmaid': 1}])

    # read data back from MongoDB using an empty query
    for record in storage.read({}):
        print(record)
        if '_id' not in record:
            continue
        print("found _id in doc")
        print(record)
        break
def migrate(muri, dbname, odir, mdir, avsc, thr, compress, chunk, close2midnight, dtype):
    """Write data from MongoDB (muri) to avro file(s) on local file system.

    Documents are selected by the MongoDB storage type and by the given
    document type, then consumed in batches of at most `chunk` documents.
    Each batch is written to an avro file, and the documents reported as
    written are re-labelled in MongoDB with the avro storage type. One
    progress line is printed per batch and a summary line at the end.

    Parameters:
        muri, dbname: passed to MongoStorage.
        odir, mdir, thr, compress, close2midnight: passed to file_name(),
            which yields the name of the output file.
        avsc: avro storage URI; the 'avroio:' prefix is added when missing.
        chunk: maximum number of documents per batch.
        dtype: value the 'dtype' attribute of selected documents must have.
    """
    mstg = MongoStorage(muri, dbname)
    auri = avsc if avsc.startswith('avroio:') else 'avroio:%s' % avsc
    astg = AvroStorage(auri)

    # read data from MongoDB for given storage and document types
    query = {'stype': mstg.stype, 'dtype': dtype}
    mdocs = mstg.find(query, None)  # with no fields we'll get entire docs

    # loop over provided docs and write them into avro file on local file system
    wmaids = []
    total = 0
    fsize = 0
    fname = file_name(odir, mdir, thr, compress, close2midnight)
    while True:
        data = list(itertools.islice(mdocs, chunk))
        if not data:
            break
        total += len(data)
        ids = astg.file_write(fname, data)
        if os.path.isfile(fname):
            fsize = os.path.getsize(fname)
        wmaids += ids

        if ids:
            # update status attributes of docs in MongoDB
            spec = {'$set': {'stype': astg.stype}}
            mstg.update(ids, spec)

        # memory reporting is best effort and must never stop the migration
        rss = ''
        try:
            if PSUTIL:
                # memory_info() provides rss; the deprecated
                # memory_info_ex() is not available in recent psutil
                mem = psutil.Process(os.getpid()).memory_info()
                rss = 'RSS:%s' % size_format(mem.rss)
        except Exception:
            rss = ''
        print(tstamp('mongo2avro'), "%s docs %s %s (%s bytes) %s"
              % (len(ids), fname, size_format(fsize), fsize, rss))

        # ask for the output file name again before the next batch
        fname = file_name(odir, mdir, thr, compress, close2midnight)
    print(tstamp('mongo2avro'), "wrote %s docs out of %s" % (len(wmaids), total))
def migrate(muri, odir, mdir, avsc, thr, compress, chunk, close2midnight):
    """Write data from MongoDB (muri) to avro file(s) on local file system.

    Documents are selected by the MongoDB storage type, then consumed in
    batches of at most `chunk` documents. Each batch is written to an avro
    file, and the documents reported as written are re-labelled in MongoDB
    with the avro storage type. One progress line is printed per batch and
    a summary line at the end.

    Parameters:
        muri: passed to MongoStorage.
        odir, mdir, thr, compress, close2midnight: passed to file_name(),
            which yields the name of the output file.
        avsc: avro storage URI; the 'avroio:' prefix is added when missing.
        chunk: maximum number of documents per batch.
    """
    mstg = MongoStorage(muri)
    auri = avsc if avsc.startswith('avroio:') else 'avroio:%s' % avsc
    astg = AvroStorage(auri)

    # read data from MongoDB, mdocs is consumed lazily batch by batch
    query = {'stype': mstg.stype}
    mdocs = mstg.find(query, None)  # with no fields we'll get entire docs

    # loop over provided docs and write them into avro file on local file system
    wmaids = []
    total = 0
    fsize = 0
    fname = file_name(odir, mdir, thr, compress, close2midnight)
    while True:
        data = list(itertools.islice(mdocs, chunk))
        if not data:
            break
        total += len(data)
        ids = astg.file_write(fname, data)
        if os.path.isfile(fname):
            fsize = os.path.getsize(fname)
        wmaids += ids

        if ids:
            # update status attributes of docs in MongoDB
            spec = {'$set': {'stype': astg.stype}}
            mstg.update(ids, spec)

        # memory reporting is best effort and must never stop the migration
        rss = ''
        try:
            if PSUTIL:
                # memory_info() provides rss; the deprecated
                # memory_info_ex() is not available in recent psutil
                mem = psutil.Process(os.getpid()).memory_info()
                rss = 'RSS:%s' % size_format(mem.rss)
        except Exception:
            rss = ''
        print(tstamp('mongo2avro'), "%s docs %s %s (%s bytes) %s"
              % (len(ids), fname, size_format(fsize), fsize, rss))

        # ask for the output file name again before the next batch
        fname = file_name(odir, mdir, thr, compress, close2midnight)
    print(tstamp('mongo2avro'), "wrote %s docs out of %s" % (len(wmaids), total))
def __init__(self, uri):
    """Pick the storage manager that matches the prefix of the STS uri.

    URIs starting with 'mongo', 'file' or 'avro' get the corresponding
    storage class. Any other URI falls back to FileStorage rooted at the
    WMA_STORAGE_ROOT environment variable (default '/tmp/wma_storage').
    """
    # checked in order, the first matching prefix wins
    backends = (
        ('mongo', MongoStorage),
        ('file', FileStorage),
        ('avro', AvroStorage),
    )
    for prefix, backend in backends:
        if uri.startswith(prefix):
            self.mgr = backend(uri)
            break
    else:
        fallback_root = os.getenv('WMA_STORAGE_ROOT', '/tmp/wma_storage')
        self.mgr = FileStorage(fallback_root)

    # the storage type is the one reported by the chosen manager
    self.stype = self.mgr.stype
class MongoStorageTest(unittest.TestCase):
    """Write/read round-trip test of MongoStorage.

    The MongoDB URI is taken from the WMA_MONGODB environment variable
    (default "mongodb://localhost:8230"). When the storage cannot be set
    up, the test is skipped.
    """

    def setUp(self):
        """Set up the storage manager and build the reference document."""
        uri = os.environ.get("WMA_MONGODB", "mongodb://localhost:8230")
        self.dbname = "test_fwjr"
        try:
            self.mgr = MongoStorage(uri, dbname=self.dbname)
            self.mgr.remove()
        except Exception:
            # no usable MongoDB: tests check self.mgr and skip themselves
            self.mgr = None
            print("WARNING: cannot connect to %s" % uri)
        data = {
            "int": 1,
            "float": 1.2,
            "list": [1, 2, 3],
            "dict": {"dname": "foo", "dval": 1},
            "listdict": [{"lname": "foo"}],
            "str": "string",
        }
        # copy taken before the WMArchive bookkeeping keys are added
        self.bare_data = dict(data)
        data["wmaid"] = wmaHash(data)
        data["stype"] = "mongodb"
        self.data = data

    def tearDown(self):
        """Remove the test documents and drop the test database."""
        if self.mgr is None:
            # setUp could not create the manager, nothing to clean up
            return
        self.mgr.remove()
        self.mgr.dropdb(self.dbname)

    def test_write(self):
        """Test write functionality."""
        if not self.mgr:
            self.skipTest("MongoDB is not available")
        wmaids = self.mgr.write(self.data)
        self.assertEqual(len(wmaids), 1)

        # the full record must match the document without bookkeeping keys
        data = self.mgr.read(wmaids[0])
        record = data[0]
        for key in ["wmaid", "stype"]:
            record.pop(key, None)
        self.assertEqual(record, self.bare_data)

        # read again, this time asking only for the "dict" field
        data = self.mgr.read(wmaids[0], ["dict"])
        self.assertEqual(1, len(data))
        self.assertEqual(data[0]["dict"], self.bare_data["dict"])
def setUp(self):
    """Set up the storage manager and build the reference document.

    The MongoDB URI is taken from the WMA_MONGODB environment variable
    (default "mongodb://localhost:8230"). When the storage cannot be set
    up, self.mgr is set to None and a warning is printed.
    """
    uri = os.environ.get("WMA_MONGODB", "mongodb://localhost:8230")
    self.dbname = "test_fwjr"
    try:
        self.mgr = MongoStorage(uri, dbname=self.dbname)
        self.mgr.remove()
    except Exception:
        # Exception rather than a bare except, so that KeyboardInterrupt
        # and SystemExit still propagate
        self.mgr = None
        print("WARNING: cannot connect to %s" % uri)
    data = {
        "int": 1,
        "float": 1.2,
        "list": [1, 2, 3],
        "dict": {"dname": "foo", "dval": 1},
        "listdict": [{"lname": "foo"}],
        "str": "string",
    }
    # copy taken before the WMArchive bookkeeping keys are added
    self.bare_data = dict(data)
    data["wmaid"] = wmaHash(data)
    data["stype"] = "mongodb"
    self.data = data