Example #1
def json_data():
    data = {
        'a.csv': [{
            'x': 1,
            'y': 2
        }, {
            'x': 3,
            'y': 4
        }],
        'b.csv': [{
            'x': 5,
            'y': 6
        }, {
            'x': 7,
            'y': 8
        }],
        'c.csv': [{
            'x': 9,
            'y': 10
        }, {
            'x': 11,
            'y': 12
        }]
    }

    text = dict(
        (fn, '\n'.join(map(json.dumps, dicts))) for fn, dicts in data.items())
    with filetexts(text) as filenames:
        descriptors = [
            JSON_Streaming(fn, schema='{x: int32, y: int32}')
            for fn in sorted(filenames)
        ]
        yield Stack(descriptors)
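All of these snippets lean on the filetexts helper from blaze's test utilities. A minimal sketch of what it presumably does, assuming it is a context manager that writes the given mapping out as real files, hands back the filenames, and deletes the files on exit (hypothetical stand-in, not the library's implementation):

from contextlib import contextmanager
import os

@contextmanager
def filetexts_sketch(d, open=open):
    # Hypothetical stand-in for the filetexts helper used throughout these
    # examples: materialise each mapping entry as a file, yield the filenames,
    # and clean up afterwards.
    try:
        for fn, text in d.items():
            with open(fn, 'wt') as f:
                f.write(text)
        yield list(d)
    finally:
        for fn in d:
            if os.path.exists(fn):
                os.remove(fn)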
Example #2
    def test_Concat(self):
        with filetexts(self.data) as filenames:
            descriptors = [CSV(fn, schema='2 * int32')
                            for fn in sorted(filenames)]
            dd = Concat(descriptors)

            self.assertEqual(str(dd.schema), '2 * int32')
            self.assertEqual(str(dd.dshape), 'var * 2 * int32')

            expected = ((1, 1), (2, 2), (3, 3), (4, 4), (5, 5), (6, 6), (7, 7))

            self.assertEqual(tuplify(tuple(dd)), expected)

            result = dd.as_dynd()
            expected2 = nd.array(expected, dtype='int32')
            self.assertEqual(nd.as_py(result),
                             nd.as_py(expected2))

            self.assertEqual(tuplify(tuple(dd)), expected)
            self.assertEqual(tuplify(tuple(dd)), expected)  # Not one use only

            chunks = list(dd.chunks())
            assert all(isinstance(chunk, nd.array) for chunk in chunks)

            self.assertEqual(tuple(dd[[0, 2], 0]), (1, 3))
            self.assertEqual(tuple(dd[2, [1, 0]]), (3, 3))

            assert isinstance(dd[:, 0], Iterator)
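For reference, a pure-Python reading of what the Concat assertions above check, using the per-file rows from the file_data fixture shown next (illustrative only; Concat itself wraps the descriptors lazily):

from itertools import chain

parts = [((1, 1), (2, 2)),          # a.csv
         ((3, 3), (4, 4), (5, 5)),  # b.csv
         ((6, 6), (7, 7))]          # c.csv
# Concat lays the rows of the three files end to end along the existing axis.
assert tuple(chain.from_iterable(parts)) == (
    (1, 1), (2, 2), (3, 3), (4, 4), (5, 5), (6, 6), (7, 7))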
Example #3
def file_data():
    data = {'a.csv': '1,1\n2,2', 'b.csv': '3,3\n4,4\n5,5', 'c.csv': '6,6\n7,7'}
    with filetexts(data) as filenames:
        descriptors = [
            CSV(fn, schema='{a: int32, b: int32}') for fn in sorted(filenames)
        ]
        yield Concat(descriptors)
Example #4
    def test_filesystem(self):
        with filetexts(data) as filenames:
            dd = Files(sorted(filenames), CSV, subdshape='var * 2 * int32')

            self.assertEqual(dd.filenames, ['a.csv', 'b.csv', 'c.csv'])
            self.assertEqual(str(dd.schema), '2 * int32')
            self.assertEqual(str(dd.dshape), 'var * 2 * int32')

            expected = [[1, 1], [2, 2], [3, 3], [4, 4], [5, 5], [6, 6], [7, 7]]

            self.assertEqual(dd.as_py(), expected)

            result = dd.as_dynd()
            expected2 = nd.array(expected, dtype='int32')
            self.assertEqual(nd.as_py(result),
                             nd.as_py(expected2))

            self.assertEqual(list(dd), expected)
            self.assertEqual(list(dd), expected)  # Not one use only

            chunks = list(dd.chunks(blen=3))
            expected = [nd.array([[1, 1], [2, 2], [3, 3]], dtype='int32'),
                        nd.array([[4, 4], [5, 5], [6, 6]], dtype='int32')]

            assert all(nd.as_py(a) == nd.as_py(b) for a, b in zip(chunks, expected))
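The chunks(blen=3) check can be read in plain Python as slicing the seven rows into blocks of three (values only; the real chunks method yields dynd arrays):

rows = [[1, 1], [2, 2], [3, 3], [4, 4], [5, 5], [6, 6], [7, 7]]
blocks = [rows[i:i + 3] for i in range(0, len(rows), 3)]
# The zip in the test only compares the first two full blocks.
assert blocks[:2] == [[[1, 1], [2, 2], [3, 3]], [[4, 4], [5, 5], [6, 6]]]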
Example #5
    def test_Stack(self):
        with filetexts(self.data) as filenames:
            descriptors = [CSV(fn, schema='2 * int32')
                            for fn in sorted(filenames)]
            dd = Stack(descriptors)
            self.assertEqual(dd.dshape, 3 * descriptors[0].dshape)

            expected = (((1, 1), (2, 2)),
                        ((3, 3), (4, 4)),
                        ((5, 5), (6, 6)))

            self.assertEqual(tuplify(tuple(dd.as_py())), expected)

            result = dd.as_dynd()
            expected2 = nd.array(expected, dtype='int32')
            self.assertEqual(nd.as_py(result),
                             nd.as_py(expected2))

            self.assertEqual(tuplify(tuple(dd)), expected)
            self.assertEqual(tuplify(tuple(dd)), expected)  # Not one use only

            chunks = dd.chunks()
            assert all(isinstance(chunk, nd.array) for chunk in chunks)

            self.assertEqual(tuple(dd[[0, 2], 0, 0]), (1, 5))
            self.assertEqual(tuplify(tuple(dd[0])), ((1, 1), (2, 2)))
            self.assertEqual(tuplify(tuple(dd[0, :, [1]])), ((1,), (2,)))
            self.assertEqual(tuplify(tuple(dd[0])), expected[0])

            assert isinstance(dd[:, 0], Iterator)
            assert isinstance(dd[:], Iterator)
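Stack, unlike Concat, adds a new leading dimension with one entry per descriptor. A plain-Python reading of the fancy-indexing assertion dd[[0, 2], 0, 0] == (1, 5) above (illustrative only):

files = (((1, 1), (2, 2)),
         ((3, 3), (4, 4)),
         ((5, 5), (6, 6)))
# Pick files 0 and 2, then row 0, then column 0 of each.
assert tuple(files[i][0][0] for i in (0, 2)) == (1, 5)
assert files[0] == ((1, 1), (2, 2))  # dd[0]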
Example #6
    def test_filesystem(self):
        prefix = 'test_filesystem'
        d = {prefix + 'a.csv': '1,1\n2,2',
             prefix + 'b.csv': '1,1\n2,2'}
        with filetexts(d) as filenames:
            dd = resource(prefix + '*.csv', schema='{x: int, y: int}')
            self.assertEqual(into(list, dd),
                             [(1, 1), (2, 2), (1, 1), (2, 2)])
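A stdlib-only sketch of what the globbed resource amounts to in this test: read every matching CSV in sorted order and concatenate the rows (assumes the two files created above; not blaze's actual code path):

import csv
import glob

rows = []
for fn in sorted(glob.glob('test_filesystem*.csv')):
    with open(fn, newline='') as f:
        rows.extend(tuple(map(int, row)) for row in csv.reader(f))
# rows == [(1, 1), (2, 2), (1, 1), (2, 2)]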
Example #7
def file_data():
    data = {'a.csv': '1,1\n2,2',
            'b.csv': '3,3\n4,4\n5,5',
            'c.csv': '6,6\n7,7'}
    with filetexts(data) as filenames:
        descriptors = [CSV(fn, schema='{a: int32, b: int32}')
                       for fn in sorted(filenames)]
        yield Concat(descriptors)
Example #8
    def test_filesystem(self):
        prefix = 'test_filesystem'
        d = {prefix + 'a.csv': '1,1\n2,2',
             prefix + 'b.csv': '1,1\n2,2'}
        with filetexts(d) as filenames:
            dd = resource(prefix + '*.csv', schema='2 * int')
            self.assertEqual(tuplify(tuple(dd)),
                             (((1, 1), (2, 2)), ((1, 1), (2, 2))))
Example #9
    def test_gzip_json_files(self):
        with filetexts(texts, open=gzip.open) as filenames:
            descriptors = [JSON(fn, dshape=schema, open=gzip.open)
                           for fn in sorted(filenames)]
            dd = Stack(descriptors)

            self.assertEqual(sorted(dd), sorted(tuples))

            self.assertEqual(dd.schema, dshape(schema))
Example #10
    def test_gzip_json_files(self):
        with filetexts(texts, open=gzip.open) as filenames:
            descriptors = [JSON(fn, dshape=schema, open=gzip.open)
                            for fn in sorted(filenames)]
            dd = Stack(descriptors)

            self.assertEqual(sorted(dd), sorted(tuples))

            self.assertEqual(dd.schema, dshape(schema))
Example #11
def test_into_directory_of_csv_files():
    files = {'accounts_1.csv': '1,Alice,100\n2,Bob,200\n3,Charlie,300',
             'accounts_2.csv': '4,Dan,400\n5,Edith,500'}
    with filetexts(files):
        assert into(list, 'accounts_*.csv') == [(1, 'Alice', 100),
                                                (2, 'Bob', 200),
                                                (3, 'Charlie', 300),
                                                (4, 'Dan', 400),
                                                (5, 'Edith', 500)]
Example #12
    def test_gzip_json_files(self):
        with filetexts(texts, open=gzip.open) as filenames:
            dd = Files(sorted(filenames),
                       JSON,
                       open=gzip.open,
                       subdshape=dshape)

            self.assertEqual(sorted(dd), sorted(data.values()))

            self.assertEqual(dd.dshape, Var() * dshape)
Example #13
    def test_gzip_json_files(self):
        with filetexts(texts, open=gzip.open) as filenames:
            dd = Files(sorted(filenames),
                       JSON,
                       open=gzip.open,
                       subdshape=dshape)

            self.assertEqual(sorted(dd), sorted(data.values()))

            self.assertEqual(dd.dshape, Var() * dshape)
Example #14
def json_data():
    data = {'a.csv': [{'x':  1, 'y':  2}, {'x':  3, 'y':  4}],
            'b.csv': [{'x':  5, 'y':  6}, {'x':  7, 'y':  8}],
            'c.csv': [{'x':  9, 'y': 10}, {'x': 11, 'y': 12}]}

    text = dict((fn, '\n'.join(map(json.dumps, dicts)))
                for fn, dicts in data.items())
    with filetexts(text) as filenames:
        descriptors = [JSON_Streaming(fn, schema='{x: int32, y: int32}')
                       for fn in sorted(filenames)]
        yield Stack(descriptors)
Example #15
    def test_Stack(self):
        with filetexts(self.text) as filenames:
            descriptors = [JSON_Streaming(fn, schema="{x: int32, y: int32}")
                           for fn in sorted(filenames)]
            dd = Stack(descriptors)

            expected = (((1, 2), (3, 4)), ((5, 6), (7, 8)), ((9, 10), (11, 12)))

            self.assertEqual(tuplify(dd.as_py()), expected)

            self.assertEqual(tuplify(dd.py[::2, 1, :]), ((3, 4), (11, 12)))
            self.assertEqual(tuplify(dd.py[::2, 1, "x"]), (3, 11))
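A plain-Python reading of the .py[...] indexing checked above, over the stacked JSON data (illustrative only):

stacked = (((1, 2), (3, 4)), ((5, 6), (7, 8)), ((9, 10), (11, 12)))
# dd.py[::2, 1, :]  -> second row of files 0 and 2
assert tuple(f[1] for f in stacked[::2]) == ((3, 4), (11, 12))
# dd.py[::2, 1, 'x'] -> the 'x' field (first column) of those rows
assert tuple(f[1][0] for f in stacked[::2]) == (3, 11)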
Example #16
def test_into_directory_of_csv_files():
    files = {
        'accounts_1.csv': '1,Alice,100\n2,Bob,200\n3,Charlie,300',
        'accounts_2.csv': '4,Dan,400\n5,Edith,500'
    }
    with filetexts(files):
        assert into(list, 'accounts_*.csv') == [(1, 'Alice', 100),
                                                (2, 'Bob', 200),
                                                (3, 'Charlie', 300),
                                                (4, 'Dan', 400),
                                                (5, 'Edith', 500)]
Example #17
def test_csv_join():
    d = {"a.csv": "a,b,c\n0,1,2\n3,4,5", "b.csv": "c,d,e\n2,3,4\n5,6,7"}

    with filetexts(d):
        resource_a = resource("a.csv")
        resource_b = resource("b.csv")
        a = symbol("a", discover(resource_a))
        b = symbol("b", discover(resource_b))
        tm.assert_frame_equal(
            odo(compute(join(a, b, "c"), {a: resource_a, b: resource_b}), pd.DataFrame),
            # windows needs explicit int64 construction b/c default is int32
            pd.DataFrame(np.array([[2, 0, 1, 3, 4], [5, 3, 4, 6, 7]], dtype="int64"), columns=list("cabde")),
        )
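For comparison, the same join written directly in pandas, reproducing the expected frame above (illustrative; not how blaze evaluates the expression internally):

import pandas as pd

df_a = pd.DataFrame([[0, 1, 2], [3, 4, 5]], columns=list('abc'))
df_b = pd.DataFrame([[2, 3, 4], [5, 6, 7]], columns=list('cde'))
joined = pd.merge(df_a, df_b, on='c')[list('cabde')]
expected = pd.DataFrame([[2, 0, 1, 3, 4], [5, 3, 4, 6, 7]], columns=list('cabde'))
assert (joined.values == expected.values).all()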
Example #18
def test_multiple_csv_files():
    d = {"mult1.csv": "name,val\nAlice,1\nBob,2", "mult2.csv": "name,val\nAlice,3\nCharlie,4"}

    data = [("Alice", 1), ("Bob", 2), ("Alice", 3), ("Charlie", 4)]
    with filetexts(d) as fns:
        r = resource("mult*.csv")
        s = symbol("s", discover(r))

        for e in [s, s.name, s.name.nunique(), s.name.count_values(), s.val.mean()]:
            a = compute(e, {s: r})
            b = compute(e, {s: data})
            if iscollection(e.dshape):
                a, b = into(set, a), into(set, b)
            assert a == b
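What the checked expressions boil down to when evaluated against the in-memory data list (a plain-Python sketch; in the test they are evaluated by compute):

from collections import Counter
from statistics import mean

data = [('Alice', 1), ('Bob', 2), ('Alice', 3), ('Charlie', 4)]
names = [name for name, _ in data]
vals = [val for _, val in data]

assert len(set(names)) == 3            # s.name.nunique()
assert Counter(names)['Alice'] == 2    # one entry of s.name.count_values()
assert mean(vals) == 2.5               # s.val.mean()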
Example #19
def test_concat():
    d = {"a.csv": "a,b\n1,2\n3,4", "b.csv": "a,b\n5,6\n7,8"}

    with filetexts(d):
        a_rsc = resource("a.csv")
        b_rsc = resource("b.csv")

        a = symbol("a", discover(a_rsc))
        b = symbol("b", discover(b_rsc))

        tm.assert_frame_equal(
            odo(compute(concat(a, b), {a: a_rsc, b: b_rsc}), pd.DataFrame),
            # windows needs explicit int64 construction b/c default is int32
            pd.DataFrame(np.arange(1, 9, dtype="int64").reshape(4, 2), columns=list("ab")),
        )
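And the concat case written directly in pandas (again illustrative only):

import pandas as pd

df_a = pd.DataFrame([[1, 2], [3, 4]], columns=list('ab'))
df_b = pd.DataFrame([[5, 6], [7, 8]], columns=list('ab'))
stacked = pd.concat([df_a, df_b], ignore_index=True)
assert stacked.values.tolist() == [[1, 2], [3, 4], [5, 6], [7, 8]]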
Example #20
def test_multiple_csv_files():
    d = {'mult1.csv': 'name,val\nAlice,1\nBob,2',
         'mult2.csv': 'name,val\nAlice,3\nCharlie,4'}

    data = [('Alice', 1), ('Bob', 2), ('Alice', 3), ('Charlie', 4)]
    with filetexts(d) as fns:
        r = resource('mult*.csv')
        s = symbol('s', discover(r))

        for e in [s, s.name, s.name.nunique(), s.name.count_values(),
                s.val.mean()]:
            a = compute(e, {s: r})
            b = compute(e, {s: data})
            if iscollection(e.dshape):
                a, b = into(set, a), into(set, b)
            assert a == b
Example #21
def test_multiple_csv_files():
    d = {'mult1.csv': 'name,val\nAlice,1\nBob,2',
         'mult2.csv': 'name,val\nAlice,3\nCharlie,4'}

    dta = [('Alice', 1), ('Bob', 2), ('Alice', 3), ('Charlie', 4)]
    with filetexts(d) as fns:
        r = data('mult*.csv')
        s = symbol('s', discover(r))

        for e in [s, s.name, s.name.nunique(), s.name.count_values(),
                s.val.mean()]:
            a = compute(e, {s: r})
            b = compute(e, {s: dta})
            if iscollection(e.dshape):
                a, b = into(set, a), into(set, b)
            assert a == b
Example #22
def test_csv_join():
    d = {'a.csv': 'a,b,c\n0,1,2\n3,4,5',
         'b.csv': 'c,d,e\n2,3,4\n5,6,7'}

    with filetexts(d):
        data_a = data('a.csv')
        data_b = data('b.csv')
        a = symbol('a', discover(data_a))
        b = symbol('b', discover(data_b))
        tm.assert_frame_equal(
            odo(
                compute(join(a, b, 'c'), {a: data_a, b: data_b}),
                pd.DataFrame,
            ),

            # windows needs explicit int64 construction b/c default is int32
            pd.DataFrame(np.array([[2, 0, 1, 3, 4],
                                   [5, 3, 4, 6, 7]], dtype='int64'),
                         columns=list('cabde'))
        )
Example #23
def test_concat():
    d = {'a.csv': 'a,b\n1,2\n3,4',
         'b.csv': 'a,b\n5,6\n7,8'}

    with filetexts(d):
        a_rsc = data('a.csv')
        b_rsc = data('b.csv')

        a = symbol('a', discover(a_rsc))
        b = symbol('b', discover(b_rsc))

        tm.assert_frame_equal(
            odo(
                compute(concat(a, b), {a: a_rsc, b: b_rsc}), pd.DataFrame,
            ),

            # windows needs explicit int64 construction b/c default is int32
            pd.DataFrame(np.arange(1, 9, dtype='int64').reshape(4, 2),
                         columns=list('ab')),
        )
Example #24
def test_csv_join():
    d = {'a.csv': 'a,b,c\n0,1,2\n3,4,5',
         'b.csv': 'c,d,e\n2,3,4\n5,6,7'}

    with filetexts(d):
        data_a = data('a.csv')
        data_b = data('b.csv')
        a = symbol('a', discover(data_a))
        b = symbol('b', discover(data_b))
        tm.assert_frame_equal(
            odo(
                compute(join(a, b, 'c'), {a: data_a, b: data_b}),
                pd.DataFrame,
            ),

            # windows needs explicit int64 construction b/c default is int32
            pd.DataFrame(np.array([[2, 0, 1, 3, 4],
                                   [5, 3, 4, 6, 7]], dtype='int64'),
                         columns=list('cabde'))
        )
Example #25
def test_concat():
    d = {'a.csv': 'a,b\n1,2\n3,4',
         'b.csv': 'a,b\n5,6\n7,8'}

    with filetexts(d):
        a_rsc = data('a.csv')
        b_rsc = data('b.csv')

        a = symbol('a', discover(a_rsc))
        b = symbol('b', discover(b_rsc))

        tm.assert_frame_equal(
            odo(
                compute(concat(a, b), {a: a_rsc, b: b_rsc}), pd.DataFrame,
            ),

            # windows needs explicit int64 construction b/c default is int32
            pd.DataFrame(np.arange(1, 9, dtype='int64').reshape(4, 2),
                         columns=list('ab')),
        )
Example #26
def test_resource_different_csv_schemas():
    files = {'foobar_a.csv': '1.0,1\n2.0,2',
             'foobar_b.csv': '3,3\n4,4'}
    with filetexts(files):
        r = resource('foobar_*.csv')
        assert r.data[0].schema == r.data[1].schema
Example #27
    def test_filesystem(self):
        prefix = 'test_filesystem'
        d = {prefix + 'a.csv': '1,1\n2,2', prefix + 'b.csv': '1,1\n2,2'}
        with filetexts(d) as filenames:
            dd = resource(prefix + '*.csv', schema='{x: int, y: int}')
            self.assertEqual(into(list, dd), [(1, 1), (2, 2), (1, 1), (2, 2)])
Example #28
def test_into_resource():
    files = {'accounts_1.csv': '1,Alice,100\n2,Bob,200'}
    with filetexts(files):
        assert into(list, 'accounts_1.csv') == [(1, 'Alice', 100),
                                                (2, 'Bob', 200)]
Example #29
def test_resource_csv():
    files = {'accounts_1.csv': '1,Alice,100\n2,Bob,200'}
    with filetexts(files):
        assert isinstance(resource('accounts_1.csv'), CSV)
Example #30
def test_drop_uri():
    from blaze.data.csv import drop
    with filetexts({'foo.csv': '1,1\n2,2'}):
        assert os.path.exists('foo.csv')
        drop('foo.csv')
        assert not os.path.exists('foo.csv')
Example #31
def test_drop_uri():
    from blaze.data.csv import drop
    with filetexts({'foo.csv': '1,1\n2,2'}):
        assert os.path.exists('foo.csv')
        drop('foo.csv')
        assert not os.path.exists('foo.csv')
Example #32
def test_resource_different_csv_schemas():
    files = {'foobar_a.csv': '1.0,1\n2.0,2', 'foobar_b.csv': '3,3\n4,4'}
    with filetexts(files):
        r = resource('foobar_*.csv')
        assert r.data[0].schema == r.data[1].schema
Example #33
    def test_filesystem(self):
        d = {'a.csv': '1,1\n2,2', 'b.csv': '1,1\n2,2'}
        with filetexts(d) as filenames:
            dd = resource('*.csv', schema='2 * int')
            assert isinstance(dd, Files)
Example #34
def test_into_resource():
    files = {'accounts_1.csv': '1,Alice,100\n2,Bob,200'}
    with filetexts(files):
        assert into(list, 'accounts_1.csv') == [(1, 'Alice', 100),
                                                (2, 'Bob', 200)]
Example #35
def stack_data():
    data = {'a.csv': '1,1\n2,2',
            'b.csv': '3,3\n4,4',
            'c.csv': '5,5\n6,6'}
    with filetexts(data) as filenames:
        yield filenames
Example #36
def stack_data():
    data = {'a.csv': '1,1\n2,2', 'b.csv': '3,3\n4,4', 'c.csv': '5,5\n6,6'}
    with filetexts(data) as filenames:
        yield filenames
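These generator-style helpers (stack_data, file_data, json_data) read like pytest fixtures: setup runs before the yield, teardown after. A hedged sketch of how such a fixture would typically be declared and consumed, assuming pytest and that filetexts yields the filenames as in the snippets above:

import pytest

from blaze.utils import filetexts  # import path assumed

@pytest.fixture
def stack_data():
    data = {'a.csv': '1,1\n2,2', 'b.csv': '3,3\n4,4', 'c.csv': '5,5\n6,6'}
    with filetexts(data) as filenames:  # files exist for the duration of the test
        yield filenames

def test_uses_stack_data(stack_data):  # hypothetical consumer test
    assert sorted(stack_data) == ['a.csv', 'b.csv', 'c.csv']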
Example #37
def test_resource_csv():
    files = {'accounts_1.csv': '1,Alice,100\n2,Bob,200'}
    with filetexts(files):
        assert isinstance(resource('accounts_1.csv'), CSV)