예제 #1
0
def migrate(muri, dbname, huri):
    "Migrate data from MongoDB (muri) to HDFS (huri)"
    mstg = MongoStorage(muri, dbname)
    hstg = HdfsStorage(huri)

    # read data from MongoDB
    query = {'stype': mstg.stype}
    mdocs = mstg.read(query)
    mids = [d['wmaid'] for d in mdocs]

    # do nothing if no documents is found
    if not len(mdocs):
        return

    # store data to HDFS
    wmaid = hstg.write(mdocs)

    # read data from HDFS
    hdocs = hstg.read(wmaid)

    # now we can compare MongoDB docs with HDFS docs, a la cross-check
    for mdoc, hdoc in zip(mdocs, hdocs):
        # drop WMArchive keys
        for key in ['stype', 'wmaid']:
            if key in mdoc:
                del mdoc[key]
            if key in hdoc:
                del hdoc[key]
        if mdoc != hdoc:
            print("ERROR", mdoc, hdoc)
            sys.exit(1)

    # update status attributes of docs in MongoDB
    query = {'$set': {'stype': hstg.stype}}
    mstg.update(mids, query)
예제 #2
0
def migrate(muri, huri):
    "Migrate data from MongoDB (muri) to HDFS (huri)"
    mstg = MongoStorage(muri)
    hstg = HdfsStorage(huri)

    # read data from MongoDB
    query = {'stype': mstg.stype}
    mdocs = mstg.read(query)
    mids = [d['wmaid'] for d in mdocs]

    # do nothing if no documents is found
    if  not len(mdocs):
        return

    # store data to HDFS
    wmaid = hstg.write(mdocs)

    # read data from HDFS
    hdocs = hstg.read(wmaid)

    # now we can compare MongoDB docs with HDFS docs, a la cross-check
    for mdoc, hdoc in zip(mdocs, hdocs):
        # drop WMArchive keys
        for key in ['stype', 'wmaid']:
            if  key in mdoc:
                del mdoc[key]
            if  key in hdoc:
                del hdoc[key]
        if mdoc != hdoc:
            print("ERROR", mdoc, hdoc)
            sys.exit(1)

    # update status attributes of docs in MongoDB
    query = {'$set' : {'stype': hstg.stype}}
    mstg.update(mids, query)
예제 #3
0
def migrate(muri, dbname, odir, mdir, avsc, thr, compress, chunk,
            close2midnight, dtype):
    "Write data from MongoDB (muri) to avro file(s) on local file system"
    mstg = MongoStorage(muri, dbname)
    auri = avsc if avsc.startswith('avroio:') else 'avroio:%s' % avsc
    astg = AvroStorage(auri)

    # read data from MongoDB for given storage and document types
    query = {'stype': mstg.stype, 'dtype': dtype}
    mdocs = mstg.find(query, None)  # with no fields we'll get entire docs

    # loop over provided docs and write them into avro file on local file system
    wmaids = []
    total = 0
    fsize = 0
    fname = file_name(odir, mdir, thr, compress, close2midnight)
    while True:
        data = [r for r in itertools.islice(mdocs, chunk)]
        total += len(data)
        if not len(data):
            break
        ids = astg.file_write(fname, data)
        if os.path.isfile(fname):
            fsize = os.path.getsize(fname)
        wmaids += ids

        if ids:
            # update status attributes of docs in MongoDB
            spec = {'$set': {'stype': astg.stype}}
            mstg.update(ids, spec)

        try:
            if PSUTIL:
                pid = os.getpid()
                proc = psutil.Process(pid)
                mem = proc.memory_info_ex()
                rss = 'RSS:%s' % size_format(mem.rss)
            else:
                rss = ''
        except:
            rss = ''
        print(tstamp('mongo2avro'), "%s docs %s %s (%s bytes) %s" \
                % (len(ids), fname, size_format(fsize), fsize, rss))
        fname = file_name(odir, mdir, thr, compress, close2midnight)
    print(tstamp('mongo2avro'),
          "wrote %s docs out of %s" % (len(wmaids), total))
예제 #4
0
def migrate(muri, odir, mdir, avsc, thr, compress, chunk, close2midnight):
    "Write data from MongoDB (muri) to avro file(s) on local file system"
    mstg = MongoStorage(muri)
    auri = avsc if avsc.startswith('avroio:') else 'avroio:%s' % avsc
    astg = AvroStorage(auri)

    # read data from MongoDB, returned mdocs is generator type
    query = {'stype': mstg.stype}
    mdocs = mstg.find(query, None) # with no fields we'll get entire docs

    # loop over provided docs and write them into avro file on local file system
    wmaids = []
    total = 0
    fsize = 0
    fname = file_name(odir, mdir, thr, compress, close2midnight)
    while True:
        data = [r for r in itertools.islice(mdocs, chunk)]
        total += len(data)
        if  not len(data):
            break
        ids = astg.file_write(fname, data)
        if  os.path.isfile(fname):
            fsize = os.path.getsize(fname)
        wmaids += ids

        if  ids:
            # update status attributes of docs in MongoDB
            spec = {'$set' : {'stype': astg.stype}}
            mstg.update(ids, spec)

        try:
            if  PSUTIL:
                pid = os.getpid()
                proc = psutil.Process(pid)
                mem = proc.memory_info_ex()
                rss = 'RSS:%s' % size_format(mem.rss)
            else:
                rss = ''
        except:
            rss = ''
        print(tstamp('mongo2avro'), "%s docs %s %s (%s bytes) %s" \
                % (len(ids), fname, size_format(fsize), fsize, rss))
        fname = file_name(odir, mdir, thr, compress, close2midnight)
    print(tstamp('mongo2avro'), "wrote %s docs out of %s" % (len(wmaids), total))