Example #1
def migrate(muri, dbname, huri):
    """Migrate data from MongoDB (muri) to HDFS (huri).

    Reads the MongoDB documents whose ``stype`` equals the MongoDB storage
    type, writes them to HDFS, reads them back for a cross-check and finally
    sets the ``stype`` of the migrated documents in MongoDB to the HDFS
    storage type. Returns None; nothing is written when no documents match.
    """
    mstg = MongoStorage(muri, dbname)
    hstg = HdfsStorage(huri)

    # read data from MongoDB
    query = {'stype': mstg.stype}
    mdocs = mstg.read(query)

    # do nothing if no documents are found
    if not len(mdocs):
        return

    # collect the ids now: the cross-check below strips 'wmaid' from the docs
    mids = [d['wmaid'] for d in mdocs]

    # store data to HDFS
    wmaid = hstg.write(mdocs)

    # read data from HDFS
    hdocs = hstg.read(wmaid)

    # now we can compare MongoDB docs with HDFS docs, a la cross-check
    for mdoc, hdoc in zip(mdocs, hdocs):
        # drop WMArchive keys before comparing the payloads
        for key in ('stype', 'wmaid'):
            mdoc.pop(key, None)
            hdoc.pop(key, None)
        if mdoc != hdoc:
            # NOTE(review): a mismatch is only reported; the status update
            # below still runs for every collected id.
            print("ERROR", mdoc, hdoc)

    # update status attributes of docs in MongoDB
    query = {'$set': {'stype': hstg.stype}}
    mstg.update(mids, query)
Example #2
 def __init__(self, uri):
     """ctor with STS uri.

     Picks the storage manager from the uri prefix ('mongo', 'file' or
     'avro') and records its storage type in ``self.stype``. Any other uri
     falls back to a FileStorage rooted at $WMA_STORAGE_ROOT (default
     /tmp/wma_storage).
     """
     if uri.startswith('mongo'):
         self.mgr = MongoStorage(uri)
     elif uri.startswith('file'):
         self.mgr = FileStorage(uri)
     elif uri.startswith('avro'):
         self.mgr = AvroStorage(uri)
     else:
         # without this branch the avro manager was overwritten right after
         # creation and unknown prefixes left self.mgr undefined
         self.mgr = FileStorage(os.getenv('WMA_STORAGE_ROOT', '/tmp/wma_storage'))
     self.stype = self.mgr.stype  # determine storage type
Example #3
def cleanup(muri, tst, stype, verbose):
    """Cleanup data in MongoDB (muri) for given timestamp (tst).

    Removes the records whose ``stype`` equals the given storage type and
    whose ``wmats`` is lower than ``dateformat(tst)``. When ``verbose`` is
    true the removal spec, the storage response and the elapsed time are
    printed.
    """
    started = time.time()
    storage = MongoStorage(muri)
    # select records of the requested storage type that are older than the
    # provided time stamp
    older_than = {'$lt': dateformat(tst)}
    spec = {'stype': stype, 'wmats': older_than}
    if verbose:
        print("Clean-up records in MongoDB: %s" % muri)
        print("MongoDB cleanup spec:", spec)
    outcome = storage.remove(spec)
    if not verbose:
        return
    print("response: %s" % outcome)
    print("Elapsed time: %s" % elapsed_time(started))
Example #4
def cleanup(muri, tst, stype):
    """Cleanup data in MongoDB (muri) for given timestamp (tst).

    Counts the records whose ``stype`` equals the given storage type and
    whose ``wmats`` is lower than ``dateformat(tst)``, reports the count,
    removes them and reports how long the removal took.
    """
    time0 = time.time()
    mstg = MongoStorage(muri)
    # select records of the given storage type whose time stamp is less than
    # the provided one
    query = {'stype': stype, 'wmats': {'$lt': dateformat(tst)}}
    rdocs = mstg.ndocs(query)
    # NOTE(review): the opening line of both report calls was missing from
    # this snippet; they are restored as plain print() calls - confirm whether
    # a prefix argument should precede the message.
    print('found %s docs (in %s) to be removed' % (rdocs, elapsed_time(time0)))
    # restart the clock so the second report times the removal only
    time0 = time.time()
    mstg.remove(query)
    print('remove query %s in %s' % (query, elapsed_time(time0)))
Example #5
def migrate(muri):
    """Read documents from MongoDB and report those carrying an ``_id`` key.

    The storage is opened on ``muri`` with dbname='test_fwjr'; one line is
    printed for every returned document that contains '_id'.
    """
    mstg = MongoStorage(muri, dbname='test_fwjr')

    # read data from MongoDB with an empty query
    # (presumably matches every document - confirm against MongoStorage.read)
    for mdoc in mstg.read({}):
        if '_id' in mdoc:
            print("found _id in doc")
Example #6
def migrate(muri, dbname, odir, mdir, avsc, thr, compress, chunk,
            close2midnight, dtype):
    """Write data from MongoDB (muri) to avro file(s) on local file system.

    Documents matching the MongoDB storage type and the given ``dtype`` are
    consumed in batches of at most ``chunk``. Each batch is written with
    ``AvroStorage.file_write`` and the ids it returns get their ``stype``
    switched to the avro storage type in MongoDB. ``odir``, ``mdir``,
    ``thr``, ``compress`` and ``close2midnight`` are only forwarded to
    ``file_name``, which supplies the output file name. ``avsc`` is turned
    into an 'avroio:' uri unless it already starts with that prefix.
    """
    mstg = MongoStorage(muri, dbname)
    auri = avsc if avsc.startswith('avroio:') else 'avroio:%s' % avsc
    astg = AvroStorage(auri)

    # read data from MongoDB for given storage and document types
    query = {'stype': mstg.stype, 'dtype': dtype}
    mdocs = mstg.find(query, None)  # with no fields we'll get entire docs

    # loop over provided docs and write them into avro file on local file system
    wmaids = []
    total = 0
    fsize = 0
    fname = file_name(odir, mdir, thr, compress, close2midnight)
    while True:
        data = list(itertools.islice(mdocs, chunk))
        total += len(data)
        if not data:
            # the document stream is exhausted
            break
        ids = astg.file_write(fname, data)
        if os.path.isfile(fname):
            fsize = os.path.getsize(fname)
        wmaids += ids

        if ids:
            # update status attributes of docs in MongoDB
            spec = {'$set': {'stype': astg.stype}}
            mstg.update(ids, spec)

        # memory usage is reported on a best-effort basis: any failure while
        # querying psutil just leaves the RSS field empty
        try:
            if PSUTIL:
                proc = psutil.Process(os.getpid())
                # memory_info() replaces the deprecated memory_info_ex()
                rss = 'RSS:%s' % size_format(proc.memory_info().rss)
            else:
                rss = ''
        except Exception:
            rss = ''
        print(tstamp('mongo2avro'), "%s docs %s %s (%s bytes) %s"
              % (len(ids), fname, size_format(fsize), fsize, rss))
        # ask for the file name again before the next batch
        fname = file_name(odir, mdir, thr, compress, close2midnight)
    print(tstamp('mongo2avro'),
          "wrote %s docs out of %s" % (len(wmaids), total))