def test_temp_ssh_files():
    """Copy a local CSV into ``Temp(SSH(CSV))`` and compare the two.

    The copy must discover to the same datashape as the local file and be
    an instance of ``_Temp``.
    """
    contents = 'name,balance\nAlice,100\nBob,200'
    with filetext(contents, extension='csv') as path:
        local_csv = CSV(path)
        remote_csv = into(Temp(SSH(CSV)), local_csv, hostname='localhost')

        assert discover(local_csv) == discover(remote_csv)
        assert isinstance(remote_csv, _Temp)
def test_copy_remote_csv():
    """Copy a local CSV to ``ssh://localhost:foo.csv`` and back again.

    Checks the type and datashape of the remote copy, then verifies that
    the rows survive the round trip into ``target``.
    """
    contents = 'name,balance\nAlice,100\nBob,200'
    with tmpfile('csv') as target, filetext(contents, extension='csv') as fn:
        source = resource(fn)
        remote = into('ssh://localhost:foo.csv', source)

        assert isinstance(remote, SSH(CSV))
        assert discover(remote) == discover(source)

        # Round trip: pull the remote copy back into a local target file.
        returned = into(target, remote)
        assert into(list, source) == into(list, returned)
def test_slicing_with_lists():
    """Compare list-based (fancy) indexing on the blocked array with numpy.

    Each expression is computed against both the blocked array and the
    plain numpy array; the results must be equal.  The full slice
    ``sx[:, :]`` is additionally expected to leave the ``dask`` attribute
    unchanged.
    """
    nx = np.arange(20).reshape((4, 5))
    dx = convert(Array, nx, blockshape=(2, 2))
    sx = symbol('x', discover(dx))

    def matches_numpy(expr):
        # Materialise the blocked result before comparing with numpy's.
        return eq(np.array(compute(expr, dx)), compute(expr, nx))

    assert matches_numpy(sx[[2, 0, 3]])
    assert matches_numpy(sx[::2, [2, 0, 3]])
    assert matches_numpy(sx[1, [2, 0, 3]])
    assert matches_numpy(sx[[2, 0, 3], -2])

    full_slice = sx[:, :]
    assert compute(full_slice, dx).dask == dx.dask

    assert matches_numpy(sx[0])
    assert matches_numpy(sx[0, [3, 1, 4]])
def test_pandas_discover_on_gzipped_files():
    """Discover the datashape of a gzip-written ``.csv.gz`` file with a header."""
    expected = datashape.dshape('var * {name: string, when: datetime}')
    contents = 'name,when\nAlice,2014-01-01\nBob,2014-02-02'
    with filetext(contents, open=gzip.open, extension='.csv.gz') as path:
        gzipped = CSV(path, has_header=True)
        assert discover(gzipped) == expected
def test_pandas_loads_in_datetimes_naively():
    """Check datetime handling both in discovery and in DataFrame conversion.

    The discovered datashape is expected to use optional ``?string`` and
    ``?datetime`` fields, and the converted DataFrame's ``when`` column
    must have dtype ``M8[ns]``.
    """
    expected = datashape.dshape('var * {name: ?string, when: ?datetime}')
    with filetext('name,when\nAlice,2014-01-01\nBob,2014-02-02') as path:
        source = CSV(path, has_header=True)
        assert discover(source) == expected

        frame = convert(pd.DataFrame, source)
        assert frame.dtypes['when'] == 'M8[ns]'
def test_pandas_loads_in_datetimes_naively():
    """Check datetime handling both in discovery and in DataFrame conversion.

    The discovered datashape is expected to use non-optional ``string``
    and ``datetime`` fields, and the converted DataFrame's ``when`` column
    must have dtype ``M8[ns]``.
    """
    expected = datashape.dshape('var * {name: string, when: datetime}')
    with filetext('name,when\nAlice,2014-01-01\nBob,2014-02-02') as path:
        source = CSV(path, has_header=True)
        assert discover(source) == expected

        frame = convert(pd.DataFrame, source)
        assert frame.dtypes['when'] == 'M8[ns]'
def test_slicing_on_boundary_lines():
    """Index row 0 with a repeated, unordered column list on a 10x10 array.

    The array is split into 3x3 blocks; the computed result must equal
    numpy's answer for the same index.
    """
    dense = np.arange(100).reshape((10, 10))
    blocked = convert(Array, dense, blockshape=(3, 3))
    sym = symbol('x', discover(blocked))

    computed = compute(sym[0, [1, 3, 9, 3]], blocked)
    assert eq(computed, dense[0, [1, 3, 9, 3]])
def test_multiple_object_ids():
    """Records carrying an extra ``ObjectId`` field under ``other``.

    Both the discovered datashape and the converted rows are expected to
    contain only the ``x`` and ``y`` fields.
    """
    records = [
        dict(x=1, y=2, other=ObjectId('1' * 24)),
        dict(x=3, y=4, other=ObjectId('2' * 24)),
    ]
    with coll(records) as collection:
        assert discover(collection) == dshape('2 * {x: int64, y: int64}')
        assert convert(list, collection) == [(1, 2), (3, 4)]
def test_slicing_with_newaxis():
    """Index with ``None`` (newaxis) and check shape, block layout and values.

    Two expressions are tested: a single inserted axis, and several
    inserted axes mixed with a list index.
    """
    dense = np.arange(20).reshape((4, 5))
    blocked = convert(Array, dense, blockshape=(2, 2))
    sym = symbol('x', discover(blocked))

    # One new axis between the two existing ones.
    single = sym[:, None, :]
    out = compute(single, blocked)
    assert out.shape == (4, 1, 5)
    assert out.blockdims == ((2, 2), (1, ), (2, 2, 1))
    assert eq(np.array(out), compute(single, dense))

    # Several new axes combined with a list index on the first real axis.
    mixed = sym[None, [2, 1, 3], None, None, :, None]
    out = compute(mixed, blocked)
    assert out.shape == (1, 3, 1, 1, 5, 1)
    assert out.blockdims == ((1, ), (3, ), (1, ), (1, ), (2, 2, 1), (1, ))
    assert eq(np.array(out), compute(mixed, dense))
def test_discover_csv_yields_string_on_totally_empty_columns():
    """A column with no values at all (``b``) is expected to discover as ``string``."""
    with filetext('a,b,c\n1,,3\n4,,6\n7,,9') as path:
        source = CSV(path, has_header=True)
        wanted = dshape('var * {a: int64, b: string, c: int64}')
        assert discover(source) == wanted
def test_discover_csv_without_columns():
    """For a header-less CSV, the value ``100`` must not appear in the datashape text."""
    with filetext('Alice,100\nBob,200', extension='csv') as path:
        shape_text = str(discover(CSV(path)))
        assert '100' not in shape_text
def eq(a, b):
    """Compare ``a`` and ``b`` with ``==`` after computing any ``Array`` operand.

    If the comparison yields a numpy array, it is reduced with ``.all()``;
    otherwise the raw comparison result is returned.
    """
    lhs = a.compute() if isinstance(a, Array) else a
    rhs = b.compute() if isinstance(b, Array) else b
    outcome = lhs == rhs
    return outcome.all() if isinstance(outcome, np.ndarray) else outcome


# Shared fixtures: for each numpy array (n*), a blocked copy (d*) and a
# symbol (s*) whose datashape is discovered from the blocked copy.

# 20 x 30 matrix in 4 x 5 blocks.
nx = np.arange(600).reshape((20, 30))
dx = convert(Array, nx, blockshape=(4, 5))
sx = symbol('x', discover(dx))

# 30 x 20 matrix in 5 x 4 blocks.
ny = np.arange(600).reshape((30, 20))
dy = convert(Array, ny, blockshape=(5, 4))
sy = symbol('y', discover(dy))

# Length-20 vector in blocks of 4.
na = np.arange(20)
da = convert(Array, na, blockshape=(4, ))
sa = symbol('a', discover(da))

# 30 x 1 column in 5 x 1 blocks.
nb = np.arange(30).reshape((30, 1))
db = convert(Array, nb, blockshape=(5, 1))
sb = symbol('b', discover(db))

# Symbol -> data namespaces for each backend.
dask_ns = {
    sx: dx,
    sy: dy,
    sa: da,
    sb: db,
}
numpy_ns = {
    sx: nx,
    sy: ny,
    sa: na,
    sb: nb,
}
def test_discover_csv_with_spaces_in_header():
    """Header cells with leading spaces must yield the names ``name`` and ``val``."""
    contents = ' name, val\nAlice,100\nBob,200'
    with filetext(contents, extension='csv') as path:
        source = CSV(path, has_header=True)
        field_names = discover(source).measure.names
        assert field_names == ['name', 'val']
def test_discover_csv_yields_string_on_totally_empty_columns():
    """A column with no values at all (``b``) is expected to discover as ``?string``."""
    with filetext('a,b,c\n1,,3\n4,,6\n7,,9') as path:
        source = CSV(path, has_header=True)
        wanted = dshape('var * {a: int64, b: ?string, c: int64}')
        assert discover(source) == wanted
def discover_dask_array(a, **kwargs):
    """Build a ``DataShape`` from ``a.shape`` plus the measure of one block.

    The measure is taken from the block at index 0 along every axis,
    obtained via ``a._get_block``.  ``kwargs`` are accepted but not used.
    """
    origin = [0] * a.ndim
    first_block = a._get_block(*origin)
    measure = discover(first_block).measure
    dims_and_measure = a.shape + (measure,)
    return DataShape(*dims_and_measure)
def test_discover():
    """Discovering ``bank`` directly and via ``coll(bank)`` must give the same result."""
    with coll(bank) as collection:
        from_data = discover(bank)
        from_collection = discover(collection)
        assert from_data == from_collection
def discover_dask_array(a, **kwargs):
    """Build a ``DataShape`` from ``a.shape`` plus the measure of one block.

    The measure is taken from the block at index 0 along every axis,
    obtained via ``a._get_block``.  ``kwargs`` are accepted but not used.
    """
    origin = [0] * a.ndim
    first_block = a._get_block(*origin)
    measure = discover(first_block).measure
    dims_and_measure = a.shape + (measure,)
    return DataShape(*dims_and_measure)
def test_discover():
    """The same file opened as ``CSV`` and as ``SSH(CSV)`` must discover identically."""
    with filetext('name,balance\nAlice,100\nBob,200') as path:
        plain = CSV(path)
        over_ssh = SSH(CSV)(path, hostname='localhost')
        assert discover(plain) == discover(over_ssh)
def test_csv_missing_values():
    """A ``name`` column containing ``NA`` must discover as ``Option(string)``."""
    with filetext('name,val\nAlice,100\nNA,200', extension='csv') as path:
        fields = discover(CSV(path)).measure.dict
        assert fields['name'] == Option(string)
def test_discover_from_resource():
    """A file opened via an ``ssh://localhost:`` URI must discover like the local ``CSV``."""
    contents = 'name,balance\nAlice,100\nBob,200'
    with filetext(contents, extension='csv') as path:
        plain = CSV(path)
        uri = 'ssh://localhost:' + path
        via_uri = resource(uri)
        assert discover(plain) == discover(via_uri)