Example #1
0
def test_twofile_multiblock(tmpdir):
    """Read two avro files both unsplit and split by ``blocksize``.

    Each file holds half of ``expected`` and is written with
    ``sync_interval=100``.  Reading with ``blocksize=None`` must give exactly
    two partitions; reading with ``blocksize=1000`` must give more than two.
    In both cases the computed records must equal ``expected``.
    """
    # Coerce the fixture to a plain string so os.path.join also works on
    # Python versions that do not accept path-like objects.
    tmpdir = str(tmpdir)
    fn1 = os.path.join(tmpdir, 'one.avro')
    fn2 = os.path.join(tmpdir, 'two.avro')
    with open(fn1, 'wb') as f:
        fastavro.writer(f, records=expected[:500], schema=schema,
                        sync_interval=100)
    with open(fn2, 'wb') as f:
        fastavro.writer(f, records=expected[500:], schema=schema,
                        sync_interval=100)
    b = db.read_avro(os.path.join(tmpdir, '*.avro'), blocksize=None)
    assert b.npartitions == 2
    assert b.compute() == expected

    b = db.read_avro(os.path.join(tmpdir, '*.avro'), blocksize=1000)
    assert b.npartitions > 2
    assert b.compute() == expected
Example #2
0
def test_onefile_oneblock(tmpdir):
    """Write ``expected`` to one avro file and read it as one partition.

    With ``blocksize=None`` the bag must have exactly one partition and its
    computed records must equal ``expected``.
    """
    # Coerce the fixture to a plain string so os.path.join also works on
    # Python versions that do not accept path-like objects.
    tmpdir = str(tmpdir)
    fn = os.path.join(tmpdir, 'one.avro')
    with open(fn, 'wb') as f:
        fastavro.writer(f, records=expected, schema=schema)
    b = db.read_avro(fn, blocksize=None)
    assert b.npartitions == 1
    assert b.compute() == expected
Example #3
0
def test_onefile_oneblock(tmpdir):
    """Check that a single avro file read with no blocksize is one partition.

    All of ``expected`` is written to ``one.avro``; reading it back with
    ``blocksize=None`` must yield one partition whose records equal
    ``expected``.
    """
    # os.path.join rejects non-string path objects on older interpreters, so
    # normalise the fixture to ``str`` up front.
    tmpdir = str(tmpdir)
    fn = os.path.join(tmpdir, 'one.avro')
    with open(fn, 'wb') as f:
        fastavro.writer(f, records=expected, schema=schema)
    b = db.read_avro(fn, blocksize=None)
    assert b.npartitions == 1
    assert b.compute() == expected
Example #4
0
def test_roundtrip(tmpdir, codec):
    """Write a bag to avro with ``codec`` and check it reads back unchanged.

    The test is skipped when ``codec`` is ``'snappy'`` and the ``snappy``
    module cannot be imported.
    """
    if codec == 'snappy':
        pytest.importorskip('snappy')
    # Coerce the fixture to a plain string so os.path.join also works on
    # Python versions that do not accept path-like objects.
    tmpdir = str(tmpdir)
    fn = os.path.join(tmpdir, 'out*.avro')
    b = db.from_sequence(expected, npartitions=3)
    b.to_avro(fn, schema=schema, codec=codec)
    b2 = db.read_avro(fn)
    assert b.compute() == b2.compute()
def test_roundtrip(tmpdir, codec):
    """Round-trip ``expected`` through avro files using the given ``codec``.

    A three-partition bag is written to ``out*.avro`` and read back; the
    computed records of both bags must match.  Skipped for ``'snappy'`` when
    the ``snappy`` module is not importable.
    """
    if codec == 'snappy':
        pytest.importorskip('snappy')
    # os.path.join rejects non-string path objects on older interpreters, so
    # normalise the fixture to ``str`` before building the glob pattern.
    tmpdir = str(tmpdir)
    fn = os.path.join(tmpdir, 'out*.avro')
    b = db.from_sequence(expected, npartitions=3)
    b.to_avro(fn, schema=schema, codec=codec)
    b2 = db.read_avro(fn)
    assert b.compute() == b2.compute()
Example #6
0
def test_roundtrip(tmpdir, codec):
    """Write a bag to avro with ``codec`` and verify it reads back equal.

    Skipped for the ``"snappy"`` codec when the ``snappy`` module is not
    importable.
    """
    folder = str(tmpdir)
    if codec == "snappy":
        # Skip instead of failing when the optional dependency is missing.
        pytest.importorskip("snappy")
    pattern = os.path.join(folder, "out*.avro")
    original = db.from_sequence(expected, npartitions=3)
    original.to_avro(pattern, schema=schema, codec=codec)
    restored = db.read_avro(pattern)
    assert original.compute() == restored.compute()
Example #7
0
    def to_dask_by_urlpath(self):
        """Create a lazy dask bag from the stored ``actual_urlpath`` option.

        Calls ``self._get_schema()`` first, then removes the
        ``'actual_urlpath'`` entry from ``self._storage_options`` and passes
        the remaining options on to ``dask.bag.read_avro``.

        NOTE(review): the key is removed from the instance's own options, so
        a second call raises ``KeyError`` -- confirm this is intended.
        """
        from dask.bag import read_avro

        self._get_schema()

        # pop() fetches the path and drops it from the stored options in one
        # step, so it is not forwarded as a storage option.
        target = self._storage_options.pop('actual_urlpath')
        return read_avro(target, storage_options=self._storage_options)
Example #8
0
def test_twofile_oneblock(tmpdir):
    """Read two avro files with no blocksize and expect two partitions.

    ``expected`` is split at index 500 across ``one.avro`` and ``two.avro``.
    Reading ``*.avro`` with ``blocksize=None`` must give exactly two
    partitions whose combined records equal ``expected``.
    """
    # Coerce the fixture to a plain string so os.path.join also works on
    # Python versions that do not accept path-like objects.
    tmpdir = str(tmpdir)
    fn1 = os.path.join(tmpdir, 'one.avro')
    fn2 = os.path.join(tmpdir, 'two.avro')
    with open(fn1, 'wb') as f:
        fastavro.writer(f, records=expected[:500], schema=schema)
    with open(fn2, 'wb') as f:
        fastavro.writer(f, records=expected[500:], schema=schema)
    b = db.read_avro(os.path.join(tmpdir, '*.avro'), blocksize=None)
    assert b.npartitions == 2
    assert b.compute() == expected
Example #9
0
def test_twofile_oneblock(tmpdir):
    """Read two avro files with no blocksize and expect two partitions.

    ``expected`` is split at index 500 across ``one.avro`` and ``two.avro``;
    the bag read from ``*.avro`` must reproduce ``expected``.
    """
    folder = str(tmpdir)
    # (file name, records) pairs, written in the same order as listed.
    halves = (
        ("one.avro", expected[:500]),
        ("two.avro", expected[500:]),
    )
    for filename, records in halves:
        with open(os.path.join(folder, filename), "wb") as handle:
            fastavro.writer(handle, records=records, schema=schema)
    bag = db.read_avro(os.path.join(folder, "*.avro"), blocksize=None)
    assert bag.npartitions == 2
    assert bag.compute() == expected
Example #10
0
def test_twofile_oneblock(tmpdir):
    """Check that two avro files read with ``blocksize=None`` give two partitions.

    The first 500 records of ``expected`` go to ``one.avro`` and the rest to
    ``two.avro``; the computed bag must equal ``expected``.
    """
    base_dir = str(tmpdir)
    first_path = os.path.join(base_dir, 'one.avro')
    second_path = os.path.join(base_dir, 'two.avro')
    for path, chunk in ((first_path, expected[:500]),
                        (second_path, expected[500:])):
        with open(path, 'wb') as out:
            fastavro.writer(out, records=chunk, schema=schema)
    pattern = os.path.join(base_dir, '*.avro')
    result = db.read_avro(pattern, blocksize=None)
    assert result.npartitions == 2
    assert result.compute() == expected
Example #11
0
    def _get_schema(self):
        """Load the bag on first use and return a ``base.Schema`` for it.

        When ``self._bag`` is ``None`` it is created with
        ``dask.bag.read_avro`` from ``self._urlpath``, ``self._bs`` and
        ``self._storage_options``.  ``self.npartitions`` is then updated from
        the bag, and the returned schema carries the same partition count with
        all other fields left empty.
        """
        if self._bag is None:
            from dask.bag import read_avro

            read_options = {
                'blocksize': self._bs,
                'storage_options': self._storage_options,
            }
            self._bag = read_avro(self._urlpath, **read_options)

        partition_count = self._bag.npartitions
        self.npartitions = partition_count

        return base.Schema(
            datashape=None,
            dtype=None,
            shape=None,
            npartitions=partition_count,
            extra_metadata={},
        )
Example #12
0
def test_roundtrip_simple(tmpdir):
    """Round-trip a small two-partition bag through avro files.

    Checks that ``to_avro(..., compute=False)`` returns ``Delayed`` objects,
    that an eager write returns two results, and that reading the files back
    reproduces the original records.
    """
    from dask.delayed import Delayed
    # Coerce the fixture to a plain string so os.path.join also works on
    # Python versions that do not accept path-like objects.
    tmpdir = str(tmpdir)
    fn = os.path.join(tmpdir, 'out*.avro')
    b = db.from_sequence([{'a': i} for i in [1, 2, 3, 4, 5]], npartitions=2)
    schema = {
        'name': 'Test',
        'type': 'record',
        'fields': [
            {'name': 'a', 'type': 'int'}, ]}
    out = b.to_avro(fn, schema, compute=False)
    assert isinstance(out[0], Delayed)
    out = b.to_avro(fn, schema)
    assert len(out) == 2
    b2 = db.read_avro(fn)
    assert b.compute() == b2.compute()
Example #13
0
def test_roundtrip_simple(tmpdir):
    """Round-trip a small two-partition bag through avro files.

    Verifies that a deferred write returns ``Delayed`` objects, that an eager
    write returns two results, and that the files read back to the same
    records.
    """
    from dask.delayed import Delayed

    pattern = os.path.join(str(tmpdir), "out*.avro")
    records = [{"a": value} for value in [1, 2, 3, 4, 5]]
    source = db.from_sequence(records, npartitions=2)
    int_field = {"name": "a", "type": "int"}
    record_schema = {"name": "Test", "type": "record", "fields": [int_field]}

    # Deferred write: nothing is computed, only Delayed objects come back.
    pending = source.to_avro(pattern, record_schema, compute=False)
    assert isinstance(pending[0], Delayed)

    # Eager write: one result per partition.
    written = source.to_avro(pattern, record_schema)
    assert len(written) == 2

    restored = db.read_avro(pattern)
    assert source.compute() == restored.compute()
Example #14
0
def test_roundtrip_simple(tmpdir):
    """Write a five-record bag to avro and confirm it reads back unchanged.

    Also checks that ``to_avro(..., compute=False)`` returns ``Delayed``
    objects and that an eager write of the two-partition bag returns two
    results.
    """
    from dask.delayed import Delayed
    # os.path.join rejects non-string path objects on older interpreters, so
    # normalise the fixture to ``str`` before building the glob pattern.
    tmpdir = str(tmpdir)
    fn = os.path.join(tmpdir, 'out*.avro')
    b = db.from_sequence([{'a': i} for i in [1, 2, 3, 4, 5]], npartitions=2)
    schema = {
        'name': 'Test',
        'type': 'record',
        'fields': [
            {
                'name': 'a',
                'type': 'int'
            },
        ]
    }
    out = b.to_avro(fn, schema, compute=False)
    assert isinstance(out[0], Delayed)
    out = b.to_avro(fn, schema)
    assert len(out) == 2
    b2 = db.read_avro(fn)
    assert b.compute() == b2.compute()