def json_data():
    data = {'a.csv': [{'x': 1, 'y': 2}, {'x': 3, 'y': 4}],
            'b.csv': [{'x': 5, 'y': 6}, {'x': 7, 'y': 8}],
            'c.csv': [{'x': 9, 'y': 10}, {'x': 11, 'y': 12}]}
    text = dict((fn, '\n'.join(map(json.dumps, dicts)))
                for fn, dicts in data.items())
    with filetexts(text) as filenames:
        descriptors = [JSON_Streaming(fn, schema='{x: int32, y: int32}')
                       for fn in sorted(filenames)]
        yield Stack(descriptors)

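# Note: `filetexts` is the test utility these snippets rely on. The sketch
# below is an assumption, not the canonical implementation: a context
# manager that writes a {filename: text} mapping to disk, yields the
# filenames, and deletes the files on exit; the `open=` keyword lets the
# gzip tests swap in gzip.open.
import os
from contextlib import contextmanager

@contextmanager
def filetexts(d, open=open):
    # Write each file, hand the filenames to the caller, then clean up.
    for fn, text in d.items():
        with open(fn, 'wt') as f:
            f.write(text)
    try:
        yield list(d)
    finally:
        for fn in d:
            if os.path.exists(fn):
                os.remove(fn)
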
def test_Concat(self):
    with filetexts(self.data) as filenames:
        descriptors = [CSV(fn, schema='2 * int32')
                       for fn in sorted(filenames)]
        dd = Concat(descriptors)

        self.assertEqual(str(dd.schema), '2 * int32')
        self.assertEqual(str(dd.dshape), 'var * 2 * int32')

        expected = ((1, 1), (2, 2), (3, 3), (4, 4), (5, 5), (6, 6), (7, 7))
        self.assertEqual(tuplify(tuple(dd)), expected)

        result = dd.as_dynd()
        expected2 = nd.array(expected, dtype='int32')
        self.assertEqual(nd.as_py(result), nd.as_py(expected2))

        self.assertEqual(tuplify(tuple(dd)), expected)
        self.assertEqual(tuplify(tuple(dd)), expected)  # not single-use: iteration is repeatable

        chunks = list(dd.chunks())
        assert all(isinstance(chunk, nd.array) for chunk in chunks)

        self.assertEqual(tuple(dd[[0, 2], 0]), (1, 3))
        self.assertEqual(tuple(dd[2, [1, 0]]), (3, 3))

        assert isinstance(dd[:, 0], Iterator)

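# `tuplify`, used throughout these tests, is assumed to recursively
# normalize nested lists/tuples into plain tuples so that deep equality
# comparisons are type-stable. A minimal sketch under that assumption:
def tuplify(x):
    # Leave scalars alone; convert any list/tuple (and its children) to tuples.
    if isinstance(x, (list, tuple)):
        return tuple(tuplify(item) for item in x)
    return x
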
def file_data():
    data = {'a.csv': '1,1\n2,2',
            'b.csv': '3,3\n4,4\n5,5',
            'c.csv': '6,6\n7,7'}
    with filetexts(data) as filenames:
        descriptors = [CSV(fn, schema='{a: int32, b: int32}')
                       for fn in sorted(filenames)]
        yield Concat(descriptors)

def test_filesystem(self):
    with filetexts(data) as filenames:
        dd = Files(sorted(filenames), CSV, subdshape='var * 2 * int32')

        self.assertEqual(dd.filenames, ['a.csv', 'b.csv', 'c.csv'])
        self.assertEqual(str(dd.schema), '2 * int32')
        self.assertEqual(str(dd.dshape), 'var * 2 * int32')

        expected = [[1, 1], [2, 2], [3, 3], [4, 4], [5, 5], [6, 6], [7, 7]]
        self.assertEqual(dd.as_py(), expected)

        result = dd.as_dynd()
        expected2 = nd.array(expected, dtype='int32')
        self.assertEqual(nd.as_py(result), nd.as_py(expected2))

        self.assertEqual(list(dd), expected)
        self.assertEqual(list(dd), expected)  # not single-use: iteration is repeatable

        chunks = list(dd.chunks(blen=3))
        expected = [nd.array([[1, 1], [2, 2], [3, 3]], dtype='int32'),
                    nd.array([[4, 4], [5, 5], [6, 6]], dtype='int32')]
        assert all(nd.as_py(a) == nd.as_py(b)
                   for a, b in zip(chunks, expected))

def test_Stack(self):
    with filetexts(self.data) as filenames:
        descriptors = [CSV(fn, schema='2 * int32')
                       for fn in sorted(filenames)]
        dd = Stack(descriptors)
        self.assertEqual(dd.dshape, 3 * descriptors[0].dshape)

        expected = (((1, 1), (2, 2)),
                    ((3, 3), (4, 4)),
                    ((5, 5), (6, 6)))
        self.assertEqual(tuplify(tuple(dd.as_py())), expected)

        result = dd.as_dynd()
        expected2 = nd.array(expected, dtype='int32')
        self.assertEqual(nd.as_py(result), nd.as_py(expected2))

        self.assertEqual(tuplify(tuple(dd)), expected)
        self.assertEqual(tuplify(tuple(dd)), expected)  # not single-use: iteration is repeatable

        chunks = dd.chunks()
        assert all(isinstance(chunk, nd.array) for chunk in chunks)

        self.assertEqual(tuple(dd[[0, 2], 0, 0]), (1, 5))
        self.assertEqual(tuplify(tuple(dd[0])), ((1, 1), (2, 2)))
        self.assertEqual(tuplify(tuple(dd[0, :, [1]])), ((1,), (2,)))
        self.assertEqual(tuplify(tuple(dd[0])), expected[0])

        assert isinstance(dd[:, 0], Iterator)
        assert isinstance(dd[:], Iterator)

def test_filesystem(self):
    prefix = 'test_filesystem'
    d = {prefix + 'a.csv': '1,1\n2,2',
         prefix + 'b.csv': '1,1\n2,2'}
    with filetexts(d) as filenames:
        dd = resource(prefix + '*.csv', schema='{x: int, y: int}')
        self.assertEqual(into(list, dd),
                         [(1, 1), (2, 2), (1, 1), (2, 2)])

def test_filesystem(self):
    prefix = 'test_filesystem'
    d = {prefix + 'a.csv': '1,1\n2,2',
         prefix + 'b.csv': '1,1\n2,2'}
    with filetexts(d) as filenames:
        dd = resource(prefix + '*.csv', schema='2 * int')
        self.assertEqual(tuplify(tuple(dd)),
                         (((1, 1), (2, 2)), ((1, 1), (2, 2))))

def test_gzip_json_files(self):
    with filetexts(texts, open=gzip.open) as filenames:
        descriptors = [JSON(fn, dshape=schema, open=gzip.open)
                       for fn in sorted(filenames)]
        dd = Stack(descriptors)

        self.assertEqual(sorted(dd), sorted(tuples))
        self.assertEqual(dd.schema, dshape(schema))

def test_into_directory_of_csv_files():
    files = {'accounts_1.csv': '1,Alice,100\n2,Bob,200\n3,Charlie,300',
             'accounts_2.csv': '4,Dan,400\n5,Edith,500'}
    with filetexts(files):
        assert into(list, 'accounts_*.csv') == [(1, 'Alice', 100),
                                                (2, 'Bob', 200),
                                                (3, 'Charlie', 300),
                                                (4, 'Dan', 400),
                                                (5, 'Edith', 500)]

def test_gzip_json_files(self):
    with filetexts(texts, open=gzip.open) as filenames:
        dd = Files(sorted(filenames), JSON, open=gzip.open, subdshape=dshape)

        self.assertEqual(sorted(dd), sorted(data.values()))
        self.assertEqual(dd.dshape, Var() * dshape)

def test_Stack(self):
    with filetexts(self.text) as filenames:
        descriptors = [JSON_Streaming(fn, schema="{x: int32, y: int32}")
                       for fn in sorted(filenames)]
        dd = Stack(descriptors)

        expected = (((1, 2), (3, 4)),
                    ((5, 6), (7, 8)),
                    ((9, 10), (11, 12)))
        self.assertEqual(tuplify(dd.as_py()), expected)

        self.assertEqual(tuplify(dd.py[::2, 1, :]), ((3, 4), (11, 12)))
        self.assertEqual(tuplify(dd.py[::2, 1, "x"]), (3, 11))

def test_csv_join(): d = {"a.csv": "a,b,c\n0,1,2\n3,4,5", "b.csv": "c,d,e\n2,3,4\n5,6,7"} with filetexts(d): resource_a = resource("a.csv") resource_b = resource("b.csv") a = symbol("a", discover(resource_a)) b = symbol("b", discover(resource_b)) tm.assert_frame_equal( odo(compute(join(a, b, "c"), {a: resource_a, b: resource_b}), pd.DataFrame), # windows needs explicit int64 construction b/c default is int32 pd.DataFrame(np.array([[2, 0, 1, 3, 4], [5, 3, 4, 6, 7]], dtype="int64"), columns=list("cabde")), )
def test_multiple_csv_files(): d = {"mult1.csv": "name,val\nAlice,1\nBob,2", "mult2.csv": "name,val\nAlice,3\nCharlie,4"} data = [("Alice", 1), ("Bob", 2), ("Alice", 3), ("Charlie", 4)] with filetexts(d) as fns: r = resource("mult*.csv") s = symbol("s", discover(r)) for e in [s, s.name, s.name.nunique(), s.name.count_values(), s.val.mean()]: a = compute(e, {s: r}) b = compute(e, {s: data}) if iscollection(e.dshape): a, b = into(set, a), into(set, b) assert a == b
def test_concat(): d = {"a.csv": "a,b\n1,2\n3,4", "b.csv": "a,b\n5,6\n7,8"} with filetexts(d): a_rsc = resource("a.csv") b_rsc = resource("b.csv") a = symbol("a", discover(a_rsc)) b = symbol("b", discover(b_rsc)) tm.assert_frame_equal( odo(compute(concat(a, b), {a: a_rsc, b: b_rsc}), pd.DataFrame), # windows needs explicit int64 construction b/c default is int32 pd.DataFrame(np.arange(1, 9, dtype="int64").reshape(4, 2), columns=list("ab")), )
def test_multiple_csv_files():
    d = {'mult1.csv': 'name,val\nAlice,1\nBob,2',
         'mult2.csv': 'name,val\nAlice,3\nCharlie,4'}
    dta = [('Alice', 1), ('Bob', 2), ('Alice', 3), ('Charlie', 4)]
    with filetexts(d) as fns:
        r = data('mult*.csv')
        s = symbol('s', discover(r))

        for e in [s, s.name, s.name.nunique(), s.name.count_values(),
                  s.val.mean()]:
            a = compute(e, {s: r})
            b = compute(e, {s: dta})
            if iscollection(e.dshape):
                a, b = into(set, a), into(set, b)
            assert a == b

def test_csv_join():
    d = {'a.csv': 'a,b,c\n0,1,2\n3,4,5',
         'b.csv': 'c,d,e\n2,3,4\n5,6,7'}

    with filetexts(d):
        data_a = data('a.csv')
        data_b = data('b.csv')
        a = symbol('a', discover(data_a))
        b = symbol('b', discover(data_b))
        tm.assert_frame_equal(
            odo(compute(join(a, b, 'c'), {a: data_a, b: data_b}),
                pd.DataFrame),
            # windows needs explicit int64 construction b/c default is int32
            pd.DataFrame(np.array([[2, 0, 1, 3, 4],
                                   [5, 3, 4, 6, 7]], dtype='int64'),
                         columns=list('cabde')),
        )

def test_concat():
    d = {'a.csv': 'a,b\n1,2\n3,4',
         'b.csv': 'a,b\n5,6\n7,8'}

    with filetexts(d):
        a_rsc = data('a.csv')
        b_rsc = data('b.csv')

        a = symbol('a', discover(a_rsc))
        b = symbol('b', discover(b_rsc))

        tm.assert_frame_equal(
            odo(compute(concat(a, b), {a: a_rsc, b: b_rsc}), pd.DataFrame),
            # windows needs explicit int64 construction b/c default is int32
            pd.DataFrame(np.arange(1, 9, dtype='int64').reshape(4, 2),
                         columns=list('ab')),
        )

def test_resource_different_csv_schemas():
    files = {'foobar_a.csv': '1.0,1\n2.0,2',
             'foobar_b.csv': '3,3\n4,4'}
    with filetexts(files):
        r = resource('foobar_*.csv')
        assert r.data[0].schema == r.data[1].schema

def test_into_resource():
    files = {'accounts_1.csv': '1,Alice,100\n2,Bob,200'}
    with filetexts(files):
        assert into(list, 'accounts_1.csv') == [(1, 'Alice', 100),
                                                (2, 'Bob', 200)]

def test_resource_csv():
    files = {'accounts_1.csv': '1,Alice,100\n2,Bob,200'}
    with filetexts(files):
        assert isinstance(resource('accounts_1.csv'), CSV)

def test_drop_uri():
    from blaze.data.csv import drop
    with filetexts({'foo.csv': '1,1\n2,2'}):
        assert os.path.exists('foo.csv')
        drop('foo.csv')
        assert not os.path.exists('foo.csv')

def test_filesystem(self):
    d = {'a.csv': '1,1\n2,2',
         'b.csv': '1,1\n2,2'}
    with filetexts(d) as filenames:
        dd = resource('*.csv', schema='2 * int')
        assert isinstance(dd, Files)

def stack_data():
    data = {'a.csv': '1,1\n2,2',
            'b.csv': '3,3\n4,4',
            'c.csv': '5,5\n6,6'}
    with filetexts(data) as filenames:
        yield filenames
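
# Hypothetical consumer of the generator fixture above, assuming it is
# registered with pytest; the fixture registration and test below are
# illustrative sketches, not from the source:
import os
import pytest

stack_files = pytest.fixture(name='stack_files')(stack_data)

def test_stack_files_exist(stack_files):
    # While the fixture is live, the temporary CSV files exist on disk.
    assert all(os.path.exists(fn) for fn in sorted(stack_files))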