Esempio n. 1
0
def migrate(fin, fout, avsc):
    """Convert the JSON content of a file into an Avro file.

    Args:
        fin: path of the input file; its content is parsed with json.load.
        fout: output file name handed to AvroStorage.file_write.
        avsc: storage URI for AvroStorage; the 'avroio:' prefix is added
            when missing (presumably the path of an Avro schema file).
    """
    auri = avsc if avsc.startswith('avroio:') else 'avroio:%s' % avsc
    astg = AvroStorage(auri)

    # read input data; the context manager closes the file right away
    # instead of leaving the open handle to the garbage collector
    with open(fin) as istream:
        data = json.load(istream)

    # store data to Avro
    wmaid = astg.file_write(fout, data)
    print("Wrote %s, wmaid=%s" % (fout, wmaid))
Esempio n. 2
0
def migrate(fin, fout, avsc):
    """Load JSON data from `fin` and write it to the Avro file `fout`.

    Args:
        fin: name of the input file, parsed as JSON.
        fout: name of the output file passed to AvroStorage.file_write.
        avsc: AvroStorage URI, given with or without the 'avroio:' prefix
            (presumably it points to the Avro schema; confirm with
            AvroStorage).
    """
    auri = avsc if avsc.startswith('avroio:') else 'avroio:%s' % avsc
    astg = AvroStorage(auri)

    # parse the input file inside a `with` block so the file descriptor is
    # released even when json.load raises
    with open(fin) as source:
        data = json.load(source)

    # store data to Avro
    wmaid = astg.file_write(fout, data)
    print("Wrote %s, wmaid=%s" % (fout, wmaid))
Esempio n. 3
0
class FileStorageTest(unittest.TestCase):
    """Tests of AvroStorage using a schema generated in a temp directory."""

    def setUp(self):
        "Create temp dir, test document, its schema file and storage manager"
        self.tdir = tempfile.mkdtemp()
        data = {
            "int": 1,
            "float": 1.2,
            "list": [1, 2, 3],
            "dict": {
                "dname": "foo",
                "dval": 1
            },
            "listdict": [{
                "lname": "foo"
            }],
            "str": "string"
        }
        # shallow copy taken before the wmaid/stype keys are added
        self.bare_data = dict(data)
        data['wmaid'] = wmaHash(data)
        data['stype'] = 'avroio'
        self.data = data
        # the schema is generated from the full document and saved next to
        # the test files so that AvroStorage can be pointed at it
        schema = gen_schema(self.data)
        sname = os.path.join(self.tdir, 'schema.avsc')
        with open(sname, 'w') as ostream:
            ostream.write(json.dumps(schema))
        self.mgr = AvroStorage('avroio:%s' % sname)

    def tearDown(self):
        "Tear down content of temp dir"
        for fname in os.listdir(self.tdir):
            os.remove(os.path.join(self.tdir, fname))
        os.rmdir(self.tdir)

    def test_write(self):
        "Test write functionality"
        wmaids = self.mgr.write(self.data)
        self.assertEqual(len(wmaids), 1)
        data = self.mgr.read(wmaids[0])
        self.assertEqual(data[0], self.bare_data)

    def test_file_write(self):
        "Test file_write functionality"
        fname = os.path.join(self.tdir, 'file.avro')
        wmaids = self.mgr.file_write(fname, self.data)
        self.assertEqual(len(wmaids), 1)
        data = self.mgr.file_read(fname)
        self.assertEqual(data[0], self.data)

    def test_file_write_exception(self):
        "Test file_write functionality with exception"
        fname = '/etc/file.avro'  # we should not have access to /etc
        # Pass the arguments individually. Wrapping them into one tuple makes
        # assertRaises call file_write with a single argument, which raises
        # TypeError and lets the test pass without attempting the write.
        self.assertRaises(Exception, self.mgr.file_write, fname, self.data)
Esempio n. 4
0
def migrate(muri, dbname, odir, mdir, avsc, thr, compress, chunk,
            close2midnight, dtype):
    """Write data from MongoDB (muri) to avro file(s) on local file system.

    Documents matching the MongoDB storage type and the given dtype are read
    in batches of at most `chunk` records. Each batch is written with
    AvroStorage.file_write, the ids it returns are updated in MongoDB with
    the Avro storage type, and a progress line is printed.

    Args:
        muri: MongoDB URI passed to MongoStorage.
        dbname: database name passed to MongoStorage.
        odir, mdir, thr, compress, close2midnight: forwarded unchanged to
            file_name, which picks the output file for every batch.
        avsc: AvroStorage URI, with or without the 'avroio:' prefix.
        chunk: maximum number of documents per batch.
        dtype: document type used in the MongoDB query.
    """
    mstg = MongoStorage(muri, dbname)
    auri = avsc if avsc.startswith('avroio:') else 'avroio:%s' % avsc
    astg = AvroStorage(auri)

    # read data from MongoDB for given storage and document types
    query = {'stype': mstg.stype, 'dtype': dtype}
    mdocs = mstg.find(query, None)  # with no fields we'll get entire docs

    # loop over provided docs and write them into avro file on local file system
    wmaids = []
    total = 0
    fname = file_name(odir, mdir, thr, compress, close2midnight)
    while True:
        data = list(itertools.islice(mdocs, chunk))
        if not data:
            break
        total += len(data)
        ids = astg.file_write(fname, data)
        # size of the file just written; report 0 when the file is missing
        # rather than the stale size left over from an earlier batch
        fsize = os.path.getsize(fname) if os.path.isfile(fname) else 0
        wmaids += ids

        if ids:
            # update status attributes of docs in MongoDB
            spec = {'$set': {'stype': astg.stype}}
            mstg.update(ids, spec)

        # memory report is best effort: a failure here must not stop the
        # migration, but KeyboardInterrupt/SystemExit still propagate
        try:
            if PSUTIL:
                proc = psutil.Process(os.getpid())
                # memory_info() provides rss; memory_info_ex() is deprecated
                mem = proc.memory_info()
                rss = 'RSS:%s' % size_format(mem.rss)
            else:
                rss = ''
        except Exception:
            rss = ''
        print(tstamp('mongo2avro'), "%s docs %s %s (%s bytes) %s"
              % (len(ids), fname, size_format(fsize), fsize, rss))
        fname = file_name(odir, mdir, thr, compress, close2midnight)
    print(tstamp('mongo2avro'),
          "wrote %s docs out of %s" % (len(wmaids), total))
Esempio n. 5
0
def migrate(muri, odir, mdir, avsc, thr, compress, chunk, close2midnight):
    """Write data from MongoDB (muri) to avro file(s) on local file system.

    Documents carrying the MongoDB storage type are consumed from the
    iterable returned by MongoStorage.find in batches of at most `chunk`
    records. Every batch is written with AvroStorage.file_write, the ids it
    returns are updated in MongoDB with the Avro storage type, and a
    progress line is printed.

    Args:
        muri: MongoDB URI passed to MongoStorage.
        odir, mdir, thr, compress, close2midnight: forwarded unchanged to
            file_name, which picks the output file for every batch.
        avsc: AvroStorage URI, with or without the 'avroio:' prefix.
        chunk: maximum number of documents per batch.
    """
    mstg = MongoStorage(muri)
    auri = avsc if avsc.startswith('avroio:') else 'avroio:%s' % avsc
    astg = AvroStorage(auri)

    # read data from MongoDB, mdocs is consumed lazily chunk by chunk
    query = {'stype': mstg.stype}
    mdocs = mstg.find(query, None)  # with no fields we'll get entire docs

    # loop over provided docs and write them into avro file on local file system
    wmaids = []
    total = 0
    fname = file_name(odir, mdir, thr, compress, close2midnight)
    while True:
        data = list(itertools.islice(mdocs, chunk))
        if not data:
            break
        total += len(data)
        ids = astg.file_write(fname, data)
        # size of the file just written; fall back to 0 if it does not exist
        # so that the size of a previously written file is never reported
        fsize = os.path.getsize(fname) if os.path.isfile(fname) else 0
        wmaids += ids

        if ids:
            # update status attributes of docs in MongoDB
            spec = {'$set': {'stype': astg.stype}}
            mstg.update(ids, spec)

        # best-effort memory report: never let it abort the migration, yet
        # do not swallow KeyboardInterrupt/SystemExit as a bare except would
        try:
            if PSUTIL:
                proc = psutil.Process(os.getpid())
                # memory_info() provides rss; memory_info_ex() is deprecated
                mem = proc.memory_info()
                rss = 'RSS:%s' % size_format(mem.rss)
            else:
                rss = ''
        except Exception:
            rss = ''
        print(tstamp('mongo2avro'), "%s docs %s %s (%s bytes) %s"
              % (len(ids), fname, size_format(fsize), fsize, rss))
        fname = file_name(odir, mdir, thr, compress, close2midnight)
    print(tstamp('mongo2avro'), "wrote %s docs out of %s" % (len(wmaids), total))
Esempio n. 6
0
class FileStorageTest(unittest.TestCase):
    """Tests of AvroStorage write/read calls against a temp-dir schema."""

    def setUp(self):
        "Prepare temp dir, test document, schema file and storage manager"
        self.tdir = tempfile.mkdtemp()
        # indented with spaces only: mixing tabs and spaces within the same
        # body is rejected by Python 3 with TabError
        data = {"int": 1, "float": 1.2, "list": [1, 2, 3],
                "dict": {"dname": "foo", "dval": 1},
                "listdict": [{"lname": "foo"}], "str": "string"}
        # shallow copy made before the wmaid/stype keys are added
        self.bare_data = dict(data)
        data['wmaid'] = wmaHash(data)
        data['stype'] = 'avroio'
        self.data = data
        schema = gen_schema(self.data)
        sname = os.path.join(self.tdir, 'schema.avsc')
        with open(sname, 'w') as ostream:
            ostream.write(json.dumps(schema))
        self.mgr = AvroStorage('avroio:%s' % sname)

    def tearDown(self):
        "Tear down content of temp dir"
        for fname in os.listdir(self.tdir):
            os.remove(os.path.join(self.tdir, fname))
        os.rmdir(self.tdir)

    def test_write(self):
        "Test write functionality"
        wmaids = self.mgr.write(self.data)
        self.assertEqual(len(wmaids), 1)
        data = self.mgr.read(wmaids[0])
        self.assertEqual(data[0], self.bare_data)

    def test_file_write(self):
        "Test file_write functionality"
        fname = os.path.join(self.tdir, 'file.avro')
        wmaids = self.mgr.file_write(fname, self.data)
        self.assertEqual(len(wmaids), 1)
        data = self.mgr.file_read(fname)
        self.assertEqual(data[0], self.data)

    def test_file_write_exception(self):
        "Test file_write functionality with exception"
        fname = '/etc/file.avro'  # we should not have access to /etc
        # The callable's arguments are passed one by one. A single tuple
        # would reach file_write as one argument and raise TypeError, so the
        # assertion would pass without ever trying to write the file.
        self.assertRaises(Exception, self.mgr.file_write, fname, self.data)