Exemple #1
0
def migrate(muri, dbname, huri):
    """Migrate data from MongoDB (muri) to HDFS (huri).

    Documents whose ``stype`` equals the MongoDB storage type are read,
    written to HDFS, read back and cross-checked against the originals.
    Only when the cross-check succeeds is the ``stype`` of the migrated
    documents switched to the HDFS storage type in MongoDB.

    :param muri: MongoDB uri, passed to ``MongoStorage``
    :param dbname: database name, passed to ``MongoStorage``
    :param huri: HDFS uri, passed to ``HdfsStorage``

    The process is terminated with ``sys.exit(1)`` if the documents read
    back from HDFS differ, in number or in content, from the MongoDB ones;
    no status update is made in MongoDB in that case.
    """
    mstg = MongoStorage(muri, dbname)
    hstg = HdfsStorage(huri)

    # read data from MongoDB
    query = {'stype': mstg.stype}
    mdocs = mstg.read(query)

    # do nothing if no documents are found
    if not mdocs:
        return

    # remember the ids now: the cross-check below strips 'wmaid' from the docs
    mids = [d['wmaid'] for d in mdocs]

    # store data to HDFS
    wmaid = hstg.write(mdocs)

    # read data back from HDFS; materialize it so its size can be verified
    hdocs = list(hstg.read(wmaid))

    # zip() stops at the shorter input, so a short read from HDFS would go
    # unnoticed by the pairwise comparison below; check the counts first
    if len(mdocs) != len(hdocs):
        print("ERROR", "document count mismatch: %s in MongoDB vs %s in HDFS"
              % (len(mdocs), len(hdocs)))
        sys.exit(1)

    # now we can compare MongoDB docs with HDFS docs, a la cross-check
    for mdoc, hdoc in zip(mdocs, hdocs):
        # drop WMArchive keys (note: this mutates the docs in place)
        for key in ('stype', 'wmaid'):
            mdoc.pop(key, None)
            hdoc.pop(key, None)
        if mdoc != hdoc:
            print("ERROR", mdoc, hdoc)
            sys.exit(1)

    # update status attributes of docs in MongoDB
    query = {'$set': {'stype': hstg.stype}}
    mstg.update(mids, query)
Exemple #2
0
def migrate(muri, huri):
    """Migrate data from MongoDB (muri) to HDFS (huri).

    Documents whose ``stype`` equals the MongoDB storage type are read,
    written to HDFS, read back and cross-checked against the originals.
    Only when the cross-check succeeds is the ``stype`` of the migrated
    documents switched to the HDFS storage type in MongoDB.

    :param muri: MongoDB uri, passed to ``MongoStorage``
    :param huri: HDFS uri, passed to ``HdfsStorage``

    The process is terminated with ``sys.exit(1)`` if the documents read
    back from HDFS differ, in number or in content, from the MongoDB ones;
    no status update is made in MongoDB in that case.
    """
    mstg = MongoStorage(muri)
    hstg = HdfsStorage(huri)

    # read data from MongoDB
    query = {'stype': mstg.stype}
    mdocs = mstg.read(query)

    # do nothing if no documents are found
    if not mdocs:
        return

    # remember the ids now: the cross-check below strips 'wmaid' from the docs
    mids = [d['wmaid'] for d in mdocs]

    # store data to HDFS
    wmaid = hstg.write(mdocs)

    # read data back from HDFS; materialize it so its size can be verified
    hdocs = list(hstg.read(wmaid))

    # zip() stops at the shorter input, so a short read from HDFS would go
    # unnoticed by the pairwise comparison below; check the counts first
    if len(mdocs) != len(hdocs):
        print("ERROR", "document count mismatch: %s in MongoDB vs %s in HDFS"
              % (len(mdocs), len(hdocs)))
        sys.exit(1)

    # now we can compare MongoDB docs with HDFS docs, a la cross-check
    for mdoc, hdoc in zip(mdocs, hdocs):
        # drop WMArchive keys (note: this mutates the docs in place)
        for key in ('stype', 'wmaid'):
            mdoc.pop(key, None)
            hdoc.pop(key, None)
        if mdoc != hdoc:
            print("ERROR", mdoc, hdoc)
            sys.exit(1)

    # update status attributes of docs in MongoDB
    query = {'$set': {'stype': hstg.stype}}
    mstg.update(mids, query)
Exemple #3
0
def cleanup(muri, tst, stype):
    """Cleanup data in MongoDB (muri) for given timestamp (tst).

    Counts and then removes the documents whose ``stype`` field equals
    ``stype`` and whose ``wmats`` field is less than ``dateformat(tst)``,
    printing the number of matching documents and the time each step took.

    :param muri: MongoDB uri, passed to ``MongoStorage``
    :param tst: timestamp, converted with ``dateformat`` for the query
    :param stype: storage type value the removed documents must carry
    """
    time0 = time.time()
    mstg = MongoStorage(muri)
    # remove records of the requested storage type (e.g. the one marking docs
    # already migrated elsewhere) whose time stamp is less than provided one
    query = {'stype': stype, 'wmats': {'$lt': dateformat(tst)}}
    rdocs = mstg.ndocs(query)
    print(tstamp('mongo2avro'),
          'found %s docs (in %s) to be removed' % (rdocs, elapsed_time(time0)))
    # restart the clock so the second message times the removal only
    time0 = time.time()
    mstg.remove(query)
    print(tstamp('mongo2avro'),
          'remove query %s in %s' % (query, elapsed_time(time0)))
Exemple #4
0
def cleanup(muri, tst, stype, verbose):
    """Cleanup data in MongoDB (muri) for given timestamp (tst).

    Removes the documents whose ``stype`` field equals ``stype`` and whose
    ``wmats`` field is less than ``dateformat(tst)``. When ``verbose`` is
    true the query, the storage response and the elapsed time are printed.
    """
    started = time.time()
    storage = MongoStorage(muri)
    # select records of the given storage type that are older than tst
    older_than = {'$lt': dateformat(tst)}
    spec = {'stype': stype, 'wmats': older_than}
    if verbose:
        print("Clean-up records in MongoDB: %s" % muri)
        print("MongoDB cleanup spec:", spec)
    outcome = storage.remove(spec)
    if not verbose:
        return
    print("response: %s" % outcome)
    print("Elapsed time: %s" % elapsed_time(started))
Exemple #5
0
def cleanup(muri, tst, stype, verbose):
    """Cleanup data in MongoDB (muri) for given timestamp (tst).

    Issues a single remove request for documents matching the given
    ``stype`` with a ``wmats`` value below ``dateformat(tst)``. Progress
    details are printed only when ``verbose`` is true.
    """
    t_start = time.time()
    mongo = MongoStorage(muri)
    # match on storage type and on time stamps earlier than the given one
    criteria = {'stype': stype, 'wmats': {'$lt': dateformat(tst)}}
    if verbose:
        print("Clean-up records in MongoDB: %s" % muri)
        print("MongoDB cleanup spec:", criteria)
    reply = mongo.remove(criteria)
    if not verbose:
        return
    print("response: %s" % reply)
    print("Elapsed time: %s" % elapsed_time(t_start))
Exemple #6
0
def cleanup(muri, tst, stype):
    """Cleanup data in MongoDB (muri) for given timestamp (tst).

    Counts and then removes the documents whose ``stype`` field equals
    ``stype`` and whose ``wmats`` field is less than ``dateformat(tst)``,
    printing the number of matching documents and the time each step took.

    :param muri: MongoDB uri, passed to ``MongoStorage``
    :param tst: timestamp, converted with ``dateformat`` for the query
    :param stype: storage type value the removed documents must carry
    """
    time0 = time.time()
    mstg = MongoStorage(muri)
    # remove records of the requested storage type (e.g. the one marking docs
    # already migrated elsewhere) whose time stamp is less than provided one
    query = {'stype': stype, 'wmats': {'$lt': dateformat(tst)}}
    rdocs = mstg.ndocs(query)
    print(tstamp('mongo2avro'),
          'found %s docs (in %s) to be removed' % (rdocs, elapsed_time(time0)))
    # restart the clock so the second message times the removal only
    time0 = time.time()
    mstg.remove(query)
    print(tstamp('mongo2avro'),
          'remove query %s in %s' % (query, elapsed_time(time0)))
Exemple #7
0
def migrate(muri):
    """Write and read data to MongoDB.

    Stores one small test document in the ``test_fwjr`` database, reads
    documents back with an empty query and prints them one by one, stopping
    after the first document that contains an ``_id`` key.
    """
    storage = MongoStorage(muri, dbname='test_fwjr')
    storage.write([{"test": 1, 'wmaid': 1}])

    # read data back from MongoDB using an empty query
    for record in storage.read({}):
        print(record)
        if '_id' not in record:
            continue
        print("found _id in doc")
        print(record)
        break
Exemple #8
0
def migrate(muri):
    """Write and read data to MongoDB.

    A single test document is written to the ``test_fwjr`` database; the
    documents returned for an empty query are then printed in turn until
    one carrying an ``_id`` key has been shown.
    """
    mongo = MongoStorage(muri, dbname='test_fwjr')
    sample = {"test": 1, 'wmaid': 1}
    mongo.write([sample])

    # fetch documents back from MongoDB with an empty query
    fetched = mongo.read({})
    for item in fetched:
        print(item)
        has_id = '_id' in item
        if has_id:
            print("found _id in doc")
            print(item)
            break
Exemple #9
0
def migrate(muri, dbname, odir, mdir, avsc, thr, compress, chunk,
            close2midnight, dtype):
    """Write data from MongoDB (muri) to avro file(s) on local file system.

    Documents matching the MongoDB storage type and the given ``dtype`` are
    consumed in slices of ``chunk`` documents. Each slice is written with
    ``AvroStorage.file_write`` to the file named by ``file_name(...)``, and
    the ``stype`` of the documents reported as written is switched to the
    avro storage type in MongoDB. A progress line is printed per slice and a
    summary line at the end.

    :param muri: MongoDB uri, passed to ``MongoStorage``
    :param dbname: database name, passed to ``MongoStorage``
    :param odir, mdir, thr, compress, close2midnight: forwarded unchanged to
        ``file_name`` to choose the output file for every slice
    :param avsc: avro schema location; prefixed with ``avroio:`` if needed
    :param chunk: number of documents taken from MongoDB per slice
    :param dtype: document type used in the MongoDB query
    """
    mstg = MongoStorage(muri, dbname)
    auri = avsc if avsc.startswith('avroio:') else 'avroio:%s' % avsc
    astg = AvroStorage(auri)

    # read data from MongoDB for given storage and document types
    query = {'stype': mstg.stype, 'dtype': dtype}
    mdocs = mstg.find(query, None)  # with no fields we'll get entire docs

    # loop over provided docs and write them into avro file on local file system
    wmaids = []
    total = 0
    fsize = 0
    fname = file_name(odir, mdir, thr, compress, close2midnight)
    while True:
        data = list(itertools.islice(mdocs, chunk))
        if not data:
            break
        total += len(data)
        ids = astg.file_write(fname, data)
        if os.path.isfile(fname):
            fsize = os.path.getsize(fname)
        wmaids += ids

        if ids:
            # update status attributes of docs in MongoDB
            spec = {'$set': {'stype': astg.stype}}
            mstg.update(ids, spec)

        # Memory reporting is best effort and must never stop the migration,
        # but KeyboardInterrupt/SystemExit must still get through, hence
        # Exception rather than a bare except.
        try:
            if PSUTIL:
                # memory_info() carries the same rss field as the deprecated
                # memory_info_ex()
                mem = psutil.Process(os.getpid()).memory_info()
                rss = 'RSS:%s' % size_format(mem.rss)
            else:
                rss = ''
        except Exception:
            rss = ''
        print(tstamp('mongo2avro'), "%s docs %s %s (%s bytes) %s"
              % (len(ids), fname, size_format(fsize), fsize, rss))
        # ask for the file name again before handling the next slice
        fname = file_name(odir, mdir, thr, compress, close2midnight)
    print(tstamp('mongo2avro'),
          "wrote %s docs out of %s" % (len(wmaids), total))
Exemple #10
0
def migrate(muri, odir, mdir, avsc, thr, compress, chunk, close2midnight):
    """Write data from MongoDB (muri) to avro file(s) on local file system.

    Documents matching the MongoDB storage type are consumed in slices of
    ``chunk`` documents. Each slice is written with
    ``AvroStorage.file_write`` to the file named by ``file_name(...)``, and
    the ``stype`` of the documents reported as written is switched to the
    avro storage type in MongoDB. A progress line is printed per slice and a
    summary line at the end.

    :param muri: MongoDB uri, passed to ``MongoStorage``
    :param odir, mdir, thr, compress, close2midnight: forwarded unchanged to
        ``file_name`` to choose the output file for every slice
    :param avsc: avro schema location; prefixed with ``avroio:`` if needed
    :param chunk: number of documents taken from MongoDB per slice
    """
    mstg = MongoStorage(muri)
    auri = avsc if avsc.startswith('avroio:') else 'avroio:%s' % avsc
    astg = AvroStorage(auri)

    # read data from MongoDB; mdocs is consumed lazily, slice by slice
    query = {'stype': mstg.stype}
    mdocs = mstg.find(query, None)  # with no fields we'll get entire docs

    # loop over provided docs and write them into avro file on local file system
    wmaids = []
    total = 0
    fsize = 0
    fname = file_name(odir, mdir, thr, compress, close2midnight)
    while True:
        data = list(itertools.islice(mdocs, chunk))
        if not data:
            break
        total += len(data)
        ids = astg.file_write(fname, data)
        if os.path.isfile(fname):
            fsize = os.path.getsize(fname)
        wmaids += ids

        if ids:
            # update status attributes of docs in MongoDB
            spec = {'$set': {'stype': astg.stype}}
            mstg.update(ids, spec)

        # Memory reporting is best effort and must never stop the migration,
        # but KeyboardInterrupt/SystemExit must still get through, hence
        # Exception rather than a bare except.
        try:
            if PSUTIL:
                # memory_info() carries the same rss field as the deprecated
                # memory_info_ex()
                mem = psutil.Process(os.getpid()).memory_info()
                rss = 'RSS:%s' % size_format(mem.rss)
            else:
                rss = ''
        except Exception:
            rss = ''
        print(tstamp('mongo2avro'), "%s docs %s %s (%s bytes) %s"
              % (len(ids), fname, size_format(fsize), fsize, rss))
        # ask for the file name again before handling the next slice
        fname = file_name(odir, mdir, thr, compress, close2midnight)
    print(tstamp('mongo2avro'),
          "wrote %s docs out of %s" % (len(wmaids), total))
Exemple #11
0
 def __init__(self, uri):
     """Create the storage manager selected by the prefix of the STS uri.

     Uris starting with ``mongo``, ``file`` or ``avro`` are handed to
     MongoStorage, FileStorage or AvroStorage respectively. Any other uri
     results in a FileStorage built from the ``WMA_STORAGE_ROOT``
     environment variable (default ``/tmp/wma_storage``).
     """
     if uri.startswith('mongo'):
         backend, location = MongoStorage, uri
     elif uri.startswith('file'):
         backend, location = FileStorage, uri
     elif uri.startswith('avro'):
         backend, location = AvroStorage, uri
     else:
         # unrecognised prefix: the given uri is ignored in favour of the
         # environment-configured location
         backend = FileStorage
         location = os.getenv('WMA_STORAGE_ROOT', '/tmp/wma_storage')
     self.mgr = backend(location)
     # expose the storage type reported by the chosen manager
     self.stype = self.mgr.stype
Exemple #12
0
class MongoStorageTest(unittest.TestCase):
    """Write/read round-trip test of MongoStorage.

    The MongoDB uri is taken from the ``WMA_MONGODB`` environment variable
    (default ``mongodb://localhost:8230``) and the ``test_fwjr`` database is
    used. When the storage cannot be set up, ``self.mgr`` is None and the
    test is skipped.
    """

    def setUp(self):
        "Connect to the test database and build the sample document"
        uri = os.environ.get("WMA_MONGODB", "mongodb://localhost:8230")
        self.dbname = "test_fwjr"
        try:
            self.mgr = MongoStorage(uri, dbname=self.dbname)
            self.mgr.remove()
        except Exception:
            # best effort: a missing MongoDB must not error out the fixture,
            # but KeyboardInterrupt/SystemExit should not be swallowed
            self.mgr = None
            print("WARNING: cannot connect to %s" % uri)
        data = {
            "int": 1,
            "float": 1.2,
            "list": [1, 2, 3],
            "dict": {"dname": "foo", "dval": 1},
            "listdict": [{"lname": "foo"}],
            "str": "string",
        }
        # keep a copy without the WMArchive keys for later comparison
        self.bare_data = dict(data)
        data["wmaid"] = wmaHash(data)
        data["stype"] = "mongodb"
        self.data = data

    def tearDown(self):
        "Remove the test documents and drop the test database"
        # setUp leaves mgr as None when the storage could not be created;
        # there is nothing to clean up in that case
        if self.mgr is None:
            return
        self.mgr.remove()
        self.mgr.dropdb(self.dbname)

    def test_write(self):
        "Test write functionality"
        if not self.mgr:
            # report the test as skipped instead of silently passing
            self.skipTest("MongoDB storage is not available")
        wmaids = self.mgr.write(self.data)
        self.assertEqual(len(wmaids), 1)
        data = self.mgr.read(wmaids[0])
        record = data[0]
        # drop WMArchive keys before comparing with the bare document
        for key in ["wmaid", "stype"]:
            record.pop(key, None)
        self.assertEqual(record, self.bare_data)
        # second read passes ["dict"] as the second argument to read()
        data = self.mgr.read(wmaids[0], ["dict"])
        self.assertEqual(1, len(data))
        self.assertEqual(data[0]["dict"], self.bare_data["dict"])
Exemple #13
0
 def setUp(self):
     """Connect to the test database and build the sample document.

     The MongoDB uri is taken from the ``WMA_MONGODB`` environment variable
     (default ``mongodb://localhost:8230``). If the storage cannot be set
     up, a warning is printed and ``self.mgr`` is set to None.
     """
     uri = os.environ.get("WMA_MONGODB", "mongodb://localhost:8230")
     self.dbname = "test_fwjr"
     try:
         self.mgr = MongoStorage(uri, dbname=self.dbname)
         self.mgr.remove()
     except Exception:
         # best effort: a missing MongoDB must not error out the fixture,
         # but KeyboardInterrupt/SystemExit should not be swallowed
         self.mgr = None
         print("WARNING: cannot connect to %s" % uri)
     data = {
         "int": 1,
         "float": 1.2,
         "list": [1, 2, 3],
         "dict": {"dname": "foo", "dval": 1},
         "listdict": [{"lname": "foo"}],
         "str": "string",
     }
     # keep a copy without the WMArchive keys for later comparison
     self.bare_data = dict(data)
     data["wmaid"] = wmaHash(data)
     data["stype"] = "mongodb"
     self.data = data