コード例 #1
0
ファイル: json2avro.py プロジェクト: stiegerb/WMArchive
def migrate(fin, fout, avsc):
    """Convert the content of a JSON file into an Avro file.

    Args:
        fin: path of the input JSON file.
        fout: output file name handed over to ``AvroStorage.file_write``.
        avsc: Avro schema location; the ``avroio:`` prefix is added
            when it is not already present.
    """
    auri = avsc if avsc.startswith('avroio:') else 'avroio:%s' % avsc
    astg = AvroStorage(auri)

    # read data from the input JSON file; the context manager guarantees
    # the file handle is closed even if parsing fails
    with open(fin) as istream:
        data = json.load(istream)

    # store data to Avro
    wmaid = astg.file_write(fout, data)
    print("Wrote %s, wmaid=%s" % (fout, wmaid))
コード例 #2
0
 def __init__(self, uri):
     """Pick the storage backend that matches the scheme of ``uri``.

     URIs starting with ``mongo``, ``file`` or ``avro`` select the
     corresponding storage class; anything else falls back to a
     FileStorage rooted at ``$WMA_STORAGE_ROOT`` (``/tmp/wma_storage``
     when the variable is not set).
     """
     if uri.startswith('mongo'):
         backend, target = MongoStorage, uri
     elif uri.startswith('file'):
         backend, target = FileStorage, uri
     elif uri.startswith('avro'):
         backend, target = AvroStorage, uri
     else:
         # unknown scheme: use file based storage at the default location
         backend = FileStorage
         target = os.getenv('WMA_STORAGE_ROOT', '/tmp/wma_storage')
     self.mgr = backend(target)
     # storage type is dictated by the chosen manager
     self.stype = self.mgr.stype
コード例 #3
0
def _rss_report():
    "Return 'RSS:<size>' for the current process or '' when it is unavailable"
    if not PSUTIL:
        return ''
    try:
        # memory_info() supersedes the deprecated memory_info_ex() and
        # exposes the same rss field
        mem = psutil.Process(os.getpid()).memory_info()
        return 'RSS:%s' % size_format(mem.rss)
    except Exception:
        # memory reporting is best effort and must never abort the migration
        return ''


def migrate(muri, dbname, odir, mdir, avsc, thr, compress, chunk,
            close2midnight, dtype):
    """Write data from MongoDB (muri) to avro file(s) on local file system.

    Args:
        muri: MongoDB URI passed to ``MongoStorage``.
        dbname: database name passed to ``MongoStorage``.
        odir, mdir, thr, compress, close2midnight: forwarded unchanged to
            ``file_name`` which decides the output file for every chunk.
        avsc: Avro schema location; the ``avroio:`` prefix is added
            when it is not already present.
        chunk: maximum number of documents read and written per iteration.
        dtype: document type used to select documents in MongoDB.
    """
    mstg = MongoStorage(muri, dbname)
    auri = avsc if avsc.startswith('avroio:') else 'avroio:%s' % avsc
    astg = AvroStorage(auri)

    # read data from MongoDB for given storage and document types
    query = {'stype': mstg.stype, 'dtype': dtype}
    mdocs = mstg.find(query, None)  # with no fields we'll get entire docs

    # loop over provided docs and write them into avro file on local file system
    wmaids = []
    total = 0
    fsize = 0
    fname = file_name(odir, mdir, thr, compress, close2midnight)
    while True:
        data = list(itertools.islice(mdocs, chunk))
        if not data:
            break
        total += len(data)
        ids = astg.file_write(fname, data)
        if os.path.isfile(fname):
            fsize = os.path.getsize(fname)
        wmaids += ids

        if ids:
            # update status attributes of docs in MongoDB
            spec = {'$set': {'stype': astg.stype}}
            mstg.update(ids, spec)

        rss = _rss_report()
        print(tstamp('mongo2avro'), "%s docs %s %s (%s bytes) %s" \
                % (len(ids), fname, size_format(fsize), fsize, rss))
        # ask for the output file name again before the next chunk is written
        fname = file_name(odir, mdir, thr, compress, close2midnight)
    print(tstamp('mongo2avro'),
          "wrote %s docs out of %s" % (len(wmaids), total))
コード例 #4
0
 def setUp(self):
     """Prepare a scratch directory, a sample document and an AvroStorage.

     ``self.bare_data`` keeps a shallow copy of the sample document taken
     before the ``wmaid`` and ``stype`` keys are added; ``self.data`` is the
     document with both keys. The schema generated from ``self.data`` is
     written to ``schema.avsc`` inside the scratch directory and used to
     construct ``self.mgr``.
     """
     self.tdir = tempfile.mkdtemp()

     nested = {"dname": "foo", "dval": 1}
     record = {
         "int": 1,
         "float": 1.2,
         "list": [1, 2, 3],
         "dict": nested,
         "listdict": [{"lname": "foo"}],
         "str": "string",
     }
     # snapshot taken before the bookkeeping keys are attached
     self.bare_data = record.copy()
     record['wmaid'] = wmaHash(record)
     record['stype'] = 'avroio'
     self.data = record

     # dump the schema derived from the sample document next to the test files
     serialized = json.dumps(gen_schema(self.data))
     schema_path = os.path.join(self.tdir, 'schema.avsc')
     with open(schema_path, 'w') as schema_file:
         schema_file.write(serialized)

     self.mgr = AvroStorage('avroio:%s' % schema_path)