Exemple #1
0
def test_sql_to_csv(sql, csv, tmpdir):
    """Load ``csv`` into the SQL fixture, export it to a temp CSV, and compare.

    The exported file must convert to ``data`` (defined outside this
    function) and report the same column names as the SQL table.
    """
    table, engine = sql
    loaded = odo(csv, table, bind=engine)
    with tmpfile('.csv', dir=tmpdir) as out_fn:
        exported = odo(loaded, out_fn, bind=engine)
        rows = odo(exported, list)
        assert rows == data
        exported_names = discover(exported).measure.names
        loaded_names = discover(loaded).measure.names
        assert exported_names == loaded_names
Exemple #2
0
def test_temp_ssh_files():
    """Copy a local CSV into a ``Temp(SSH(CSV))`` on localhost and compare.

    The copy must discover to the same datashape as the local file and
    must be an instance of ``_Temp``.
    """
    with filetext('name,balance\nAlice,100\nBob,200', extension='csv') as fn:
        local = CSV(fn)
        remote = into(Temp(SSH(CSV)), local, hostname='localhost')

        local_shape = discover(local)
        remote_shape = discover(remote)
        assert local_shape == remote_shape

        assert isinstance(remote, _Temp)
Exemple #3
0
	def correct_commodities():
		"""Apply name corrections to each ``*_all.csv`` under ``agmarknet/by_commodity``.

		For every entry below ``data_dir/agmarknet/by_commodity`` the matching
		CSV files are opened with an explicit datashape (9- or 10-column
		layout) or, for any other column count, with the discovered one.  The
		``grade`` (10-column files only), ``commodity``, ``state`` and
		``market`` columns are stripped of surrounding whitespace, and
		``commodity`` / ``state`` values are replaced through
		``commodity_corrections`` / ``state_corrections`` when a replacement
		exists.

		Unlike the previous version this does not change the process working
		directory: files are located through full paths instead of
		``os.chdir``.  Entries that are not directories simply match no CSV
		files and are skipped.

		Returns:
			None.
		"""
		schema_9 = "var * {date: datetime, state: ?string, market: ?string, commodity: ?string, variety: ?string, arrival: ?string, min: ?string, max: ?string, modal: ?string}"
		schema_10 = "var * {date: datetime, state: ?string, market: ?string, commodity: ?string, variety: ?string, arrival: ?string, grade: ?string, min: ?string, max: ?string, modal: ?string}"

		src_dir = path.abspath(path.join(data_dir, 'agmarknet/by_commodity'))
		for folder_path in glob.glob(path.join(src_dir, '*')):
			for csv_path in glob.glob(path.join(folder_path, '*_all.csv')):
				csvr = odo.resource(csv_path)  # Have to use resource to discover URIs
				discovered = odo.discover(csvr)
				num_col = len(discovered[1].types)
				if num_col == 9:
					ds = bz.dshape(schema_9)
				elif num_col == 10:
					ds = bz.dshape(schema_10)
				else:
					# Unknown layout: fall back to whatever was discovered.
					ds = discovered

				d = bz.Data(csv_path, dshape=ds)
				if num_col == 10:
					d = bz.transform(d, grade=d.grade.map(lambda x: x.strip(), 'string'))

				d = bz.transform(d, commodity=d.commodity.map(lambda x: x.strip(), 'string'))
				# assumes the correction tables are dict-like (support .get) -- TODO confirm
				d = bz.transform(d, commodity=d.commodity.map(lambda x: commodity_corrections.get(x, x), 'string'))
				d = bz.transform(d, state=d.state.map(lambda x: x.strip(), 'string'))
				d = bz.transform(d, state=d.state.map(lambda x: state_corrections.get(x, x), 'string'))
				d = bz.transform(d, market=d.market.map(lambda x: x.strip(), 'string'))
				# NOTE(review): the corrected expression ``d`` is never written
				# back or returned, here or in the original -- confirm whether
				# a final export step is missing.
Exemple #4
0
def test_temp_ssh_files():
    """Check that pushing a CSV to ``Temp(SSH(CSV))`` keeps its datashape.

    Also verifies that the object returned by ``into`` is a ``_Temp``.
    """
    csv_text = 'name,balance\nAlice,100\nBob,200'
    with filetext(csv_text, extension='csv') as fn:
        source = CSV(fn)
        over_ssh = into(Temp(SSH(CSV)), source, hostname='localhost')
        assert discover(source) == discover(over_ssh)

        assert isinstance(over_ssh, _Temp)
Exemple #5
0
def test_sql_to_csv(sql, csv, tmpdir):
    """Round-trip ``csv`` through the SQL fixture and back out to a CSV file.

    Asserts that the written file converts to ``data`` (defined outside
    this function) and that its column names match the SQL table's.
    """
    sql_table, bind = sql
    populated = odo(csv, sql_table, bind=bind)
    with tmpfile('.csv', dir=tmpdir) as fn:
        written = odo(populated, fn, bind=bind)
        assert odo(written, list) == data

        names_in_csv = discover(written).measure.names
        names_in_sql = discover(populated).measure.names
        assert names_in_csv == names_in_sql
Exemple #6
0
def test_s3_to_ssh():
    """Copy the S3 tips CSV into a ``Temp(SSH(CSV))`` and compare with the source.

    Skipped when ``boto`` cannot be imported.  Both the rows and the
    discovered datashape of the copy must match the S3 resource.
    """
    pytest.importorskip('boto')

    source_uri = 's3://nyqpug/tips.csv'
    with tmpfile('.csv') as fn:
        target = Temp(SSH(CSV))(fn, hostname='localhost')
        result = into(target, source_uri)

        copied_rows = into(list, result)
        assert copied_rows == into(list, source_uri)
        assert discover(result) == discover(resource(source_uri))
Exemple #7
0
def test_sql_to_csv(sql, csv):
    """Load ``csv`` into ``sql``, export to a temp CSV, and check the rows.

    The rows must equal ``data`` (defined outside this function); the
    column names of the exported file are expected to differ from the
    SQL table's.
    """
    table = odo(csv, sql)
    with tmpfile('.csv') as fn:
        exported = odo(table, fn)
        assert odo(exported, list) == data

        # explicitly test that we do NOT preserve the header here
        exported_names = discover(exported).measure.names
        table_names = discover(table).measure.names
        assert exported_names != table_names
Exemple #8
0
def test_append_other():
    """Store ``df`` (converted to an ndarray) in an HDFStore and compare shapes.

    ``df`` is defined outside this function.  The parent of the stored
    object is closed whether or not the assertion passes.
    """
    with tmpfile('.hdf5') as fn:
        array = into(np.ndarray, df)
        uri = 'hdfstore://' + fn + '::/data'
        stored = into(uri, array)
        try:
            assert discover(stored) == discover(df)
        finally:
            stored.parent.close()
Exemple #9
0
def test_into_resource():
    """Write ``df`` to an ``hdfstore://`` URI and read it back.

    ``df`` is defined outside this function.  Checks both the discovered
    datashape and the DataFrame round trip; the parent of the stored
    object is always closed.
    """
    with tmpfile('.hdf5') as fn:
        uri = 'hdfstore://' + fn + '::/x'
        stored = into(uri, df)
        try:
            assert discover(stored) == discover(df)
            round_tripped = into(pd.DataFrame, stored)
            assert eq(round_tripped, df)
        finally:
            stored.parent.close()
def test_append_other():
    """Put an ndarray built from ``df`` into an HDFStore node and compare shapes.

    ``df`` comes from outside this function.  Cleanup closes the node's
    parent in a ``finally`` clause.
    """
    with tmpfile('.hdf5') as fn:
        values = into(np.ndarray, df)
        target = 'hdfstore://' + fn + '::/data'
        node = into(target, values)
        try:
            node_shape = discover(node)
            assert node_shape == discover(df)
        finally:
            node.parent.close()
def test_into_resource():
    """Send ``df`` to an ``hdfstore://`` resource and verify it survives.

    ``df`` comes from outside this function.  The stored node must
    discover like ``df`` and convert back to an equal DataFrame.
    """
    with tmpfile('.hdf5') as fn:
        target = 'hdfstore://' + fn + '::/x'
        node = into(target, df)
        try:
            assert discover(node) == discover(df)
            assert eq(into(pd.DataFrame, node), df)
        finally:
            # release the store backing the node
            node.parent.close()
Exemple #12
0
def test_s3_to_ssh():
    """Transfer ``s3://nyqpug/tips.csv`` to a temp SSH CSV on localhost.

    Requires ``boto`` (skipped otherwise).  The transferred data and its
    datashape are compared against the S3 source.
    """
    pytest.importorskip('boto')

    s3_uri = 's3://nyqpug/tips.csv'
    with tmpfile('.csv') as fn:
        remote_csv = Temp(SSH(CSV))(fn, hostname='localhost')
        result = into(remote_csv, s3_uri)
        assert into(list, result) == into(list, s3_uri)

        expected_shape = discover(resource(s3_uri))
        assert discover(result) == expected_shape
Exemple #13
0
def test_sql_to_csv(sql, csv):
    """Export a SQL table populated from ``csv`` back to a CSV file.

    Row contents must equal ``data`` (defined outside this function),
    while the column names are asserted to differ from the SQL table's.
    """
    populated = odo(csv, sql)
    with tmpfile('.csv') as fn:
        written = odo(populated, fn)
        rows = odo(written, list)
        assert rows == data

        # explicitly test that we do NOT preserve the header here
        assert discover(written).measure.names != discover(populated).measure.names
Exemple #14
0
def test_jsonlines_to_s3():
    """Write ``js`` as JSON lines to a temp file, upload it to S3, compare shapes.

    ``js`` and ``s3_bucket`` are defined outside this function.  Each row
    of ``js`` is serialized with ``pd.io.json.dumps`` on its own line; the
    uploaded result must discover to the same datashape as ``js``.
    """
    with tmpfile('.json') as fn:
        with open(fn, mode='w') as f:
            for row in js:
                f.write(pd.io.json.dumps(row))
                # Text mode already translates '\n' to the platform line
                # ending; writing os.linesep would give '\r\r\n' on Windows.
                f.write('\n')
        with s3_bucket('.json') as b:
            result = into(b, resource(fn))
            assert discover(result) == discover(js)
Exemple #15
0
def test_jsonlines_to_s3():
    """Dump ``js`` to a line-delimited JSON file and push it into an S3 bucket.

    ``js`` and ``s3_bucket`` come from outside this function.  The object
    returned by ``into`` must have the same discovered datashape as ``js``.
    """
    with tmpfile('.json') as fn:
        with open(fn, mode='w') as f:
            for row in js:
                # A plain '\n' is used instead of os.linesep: the file is
                # opened in text mode, which performs the platform newline
                # translation itself (os.linesep would be doubled on Windows).
                f.write(pd.io.json.dumps(row) + '\n')
        with s3_bucket('.json') as b:
            result = into(b, resource(fn))
            assert discover(result) == discover(js)
Exemple #16
0
def discover_bson(b, n=10, **kwargs):
    """Discover a datashape for ``b`` from a sample of its leading records.

    Records are read through ``bson_lines(b.path)`` and ``take(n, ...)``.
    When fewer than ``n`` records come back, the shape discovered from the
    sample is returned as is; otherwise the leading dimension is replaced
    with ``var``.

    ``**kwargs`` is accepted but not used in this body.
    """
    with bson_lines(b.path) as lines:
        sample = list(take(n, lines))

    sample_shape = discover(sample)
    if len(sample) < n:
        # presumably the whole input was consumed, so the fixed length holds
        return sample_shape
    return var * sample_shape.subshape[0]
Exemple #17
0
def test_ssh_csv_to_s3_csv():
    """Move a ``Temp(SSH(CSV))`` into an S3 bucket and compare datashapes.

    Skipped when ``odo.backends.tests.test_aws`` cannot be imported.
    """
    # for some reason this can only be run in the same file as other ssh tests
    # and must be a Temp(SSH(CSV)) otherwise tests above this one fail
    aws_tests = pytest.importorskip('odo.backends.tests.test_aws')
    s3_bucket = aws_tests.s3_bucket

    with filetext('name,balance\nAlice,100\nBob,200', extension='csv') as fn:
        local = CSV(fn)
        remote = into(Temp(SSH(CSV)), local, hostname='localhost')
        with s3_bucket('.csv') as b:
            uploaded = into(b, remote)
            assert discover(uploaded) == discover(resource(b))
Exemple #18
0
def test_float_dtype(sql_with_floats):
    """Check the float columns discover as ``float64`` / ``?float64``.

    The expectation is verified both on the table itself and on the entry
    for the table inside the datashape discovered from the bind.
    """
    table, engine = sql_with_floats
    expected = dshape("var * {a: float64, b: ?float64}")

    table_shape = discover(table)
    assert_dshape_equal(table_shape, expected)

    # Also check that reflection from the database returns expected dshape.
    reflected = discover(engine).subshape[table.name]
    assert_dshape_equal(reflected, expected)
Exemple #19
0
def test_ssh_csv_to_s3_csv():
    """Upload a temporary SSH-hosted CSV to S3 and check its datashape.

    The ``s3_bucket`` helper is taken from ``odo.backends.tests.test_aws``;
    the test is skipped if that module is unavailable.
    """
    # for some reason this can only be run in the same file as other ssh tests
    # and must be a Temp(SSH(CSV)) otherwise tests above this one fail
    s3_bucket = pytest.importorskip('odo.backends.tests.test_aws').s3_bucket

    csv_text = 'name,balance\nAlice,100\nBob,200'
    with filetext(csv_text, extension='csv') as fn:
        over_ssh = into(Temp(SSH(CSV)), CSV(fn), hostname='localhost')
        with s3_bucket('.csv') as b:
            result = into(b, over_ssh)
            bucket_shape = discover(resource(b))
            assert discover(result) == bucket_shape
def test_discover():
    """Discover the datashape of a dask DataFrame with a categorical column.

    Builds a 15-row frame (categorical ``x``, int64 ``y``, float ``z``),
    splits it into two partitions, and checks the discovered shape of the
    whole frame and of the ``x`` column.
    """
    letters = list("a" * 5 + "b" * 5 + "c" * 5)
    ints = np.arange(15, dtype=np.int64)
    floats = list(map(float, range(15)))
    df = pd.DataFrame({"x": letters, "y": ints, "z": floats}, columns=["x", "y", "z"])
    df.x = df.x.astype("category")
    ddf = dd.from_pandas(df, npartitions=2)

    frame_shape = var * Record([("x", Categorical(["a", "b", "c"])), ("y", int64), ("z", float64)])
    assert_dshape_equal(discover(ddf), frame_shape)

    column_shape = var * Categorical(["a", "b", "c"])
    assert_dshape_equal(discover(ddf.x), column_shape)
Exemple #21
0
def test_join_type_promotion(sqla, sqlb):
    """Inner-join the two SQL fixtures on ``B`` and check the resulting rows.

    The rows fetched from the computed query are compared as a set of
    tuples against the two expected rows.
    """
    t = symbol(sqla.name, discover(sqla))
    s = symbol(sqlb.name, discover(sqlb))
    expr = join(t, s, 'B', how='inner')

    query = compute(expr, {t: sqla, s: sqlb})
    rows = query.execute().fetchall()
    result = {tuple(row) for row in rows}

    expected = {(1, 'a', 'a'), (1, None, 'a')}
    assert result == expected
Exemple #22
0
def test_discover():
    """Check ``discover`` on a two-partition dask DataFrame and one of its columns.

    Column ``x`` is cast to a pandas category before the frame is handed
    to dask; the expected shapes use ``Categorical(['a', 'b', 'c'])``.
    """
    columns = {
        'x': list('a'*5 + 'b'*5 + 'c'*5),
        'y': range(15),
        'z': list(map(float, range(15))),
    }
    df = pd.DataFrame(columns, columns=['x', 'y', 'z'])
    df.x = df.x.astype('category')
    ddf = dd.from_pandas(df, npartitions=2)

    record = Record([('x', Categorical(['a', 'b', 'c'])),
                     ('y', int64),
                     ('z', float64)])
    assert_dshape_equal(discover(ddf), var * record)
    assert_dshape_equal(discover(ddf.x), var * Categorical(['a', 'b', 'c']))
def test_concat(sql_two_tables):
    """Concatenate two SQL tables and compare with the expected frame.

    The tables are filled with 0..4 and 5..9 in column ``a``; the sorted
    concatenation, computed as a DataFrame, must equal 0..9.
    """
    t_table, u_table = sql_two_tables
    first_half = pd.DataFrame(np.arange(5), columns=['a'])
    second_half = pd.DataFrame(np.arange(5, 10), columns=['a'])
    odo(first_half, t_table)
    odo(second_half, u_table)

    t = symbol('t', discover(first_half))
    u = symbol('u', discover(second_half))
    expr = concat(t, u).sort('a')
    result = compute(expr, {t: t_table, u: u_table}, return_type=pd.DataFrame)
    expected = pd.DataFrame(np.arange(10), columns=['a'])
    tm.assert_frame_equal(result, expected)
def test_concat(sql_two_tables):
    """Concatenate two SQL tables via ``concat`` and convert the result with ``odo``.

    Each table receives five consecutive integers in column ``a``; the
    sorted concatenation must equal a single frame holding 0..9.
    """
    t_table, u_table = sql_two_tables
    low = pd.DataFrame(np.arange(5), columns=["a"])
    high = pd.DataFrame(np.arange(5, 10), columns=["a"])
    odo(low, t_table)
    odo(high, u_table)

    t = symbol("t", discover(low))
    u = symbol("u", discover(high))
    computed = compute(concat(t, u).sort("a"), {t: t_table, u: u_table})
    result = odo(computed, pd.DataFrame)
    expected = pd.DataFrame(np.arange(10), columns=["a"])
    tm.assert_frame_equal(result, expected)
def test_concat(sql_two_tables):
    """Verify that concatenating two SQL-backed symbols yields all rows.

    Frames with values 0..4 and 5..9 are loaded into the two tables, and
    the sorted concatenation is requested directly as a DataFrame.
    """
    t_table, u_table = sql_two_tables
    t_frame = pd.DataFrame(np.arange(5), columns=['a'])
    u_frame = pd.DataFrame(np.arange(5, 10), columns=['a'])
    for frame, table in ((t_frame, t_table), (u_frame, u_table)):
        odo(frame, table)

    t = symbol('t', discover(t_frame))
    u = symbol('u', discover(u_frame))
    scope = {t: t_table, u: u_table}
    tm.assert_frame_equal(
        compute(concat(t, u).sort('a'), scope, return_type=pd.DataFrame),
        pd.DataFrame(np.arange(10), columns=['a']),
    )
Exemple #26
0
def test_discover_nested():
    """Discover an HDFStore that holds frames at nested keys.

    ``df`` (defined outside this function) is written under three keys;
    the store must discover like the equivalent nested dict of frames.
    The store is closed in a ``finally`` clause.
    """
    with tmpfile('hdf5') as fn:
        for key in ('/a/b/data', '/a/b/data2', '/a/data'):
            df.to_hdf(fn, key)

        store = pd.HDFStore(fn)
        nested = {'a': {'b': {'data': df, 'data2': df}, 'data': df}}

        try:
            assert discover(store) == discover(nested)
        finally:
            store.close()
Exemple #27
0
def test_url_to_hdfs():
    """Copy the CSV at ``iris_url`` into HDFS and compare with a local copy.

    ``iris_url`` is defined outside this function.  The reference is a
    ``Temp(CSV)`` converted from the URL resource; the HDFS copy must
    discover to the same datashape.
    """
    from .test_hdfs import tmpfile_hdfs, hdfs, HDFS

    with tmpfile_hdfs() as target:

        # build temp csv for assertion check
        reference = convert(Temp(CSV), resource(iris_url))

        # test against url
        on_hdfs = HDFS(CSV)(target, hdfs=hdfs)
        odo(iris_url, on_hdfs)

        hdfs_shape = discover(on_hdfs)
        assert hdfs_shape == discover(reference)
def test_discover():
    """Discover shapes for a dask DataFrame and for its categorical column.

    The frame has 15 rows: ``x`` is a category over ``a``/``b``/``c``,
    ``y`` is int64 and ``z`` holds floats.  It is split into two
    partitions before discovery.
    """
    x_values = list('a' * 5 + 'b' * 5 + 'c' * 5)
    y_values = np.arange(15, dtype=np.int64)
    z_values = list(map(float, range(15)))
    df = pd.DataFrame({'x': x_values, 'y': y_values, 'z': z_values},
                      columns=['x', 'y', 'z'])
    df.x = df.x.astype('category')
    ddf = dd.from_pandas(df, npartitions=2)

    fields = [('x', Categorical(['a', 'b', 'c'])), ('y', int64), ('z', float64)]
    assert_dshape_equal(discover(ddf), var * Record(fields))
    assert_dshape_equal(discover(ddf.x), var * Categorical(['a', 'b', 'c']))
Exemple #29
0
def test_copy_remote_csv():
    """Copy a CSV to an ``ssh://localhost`` URI and back again.

    The remote object must be an ``SSH(CSV)`` with the same discovered
    datashape as the source, and the copy pulled back to ``target`` must
    contain the same rows.
    """
    csv_text = 'name,balance\nAlice,100\nBob,200'
    with tmpfile('csv') as target, filetext(csv_text, extension='csv') as fn:
        local = resource(fn)

        uri = 'ssh://localhost:%s.csv' % target
        remote = into(uri, local)

        assert isinstance(remote, SSH(CSV))
        assert discover(remote) == discover(local)

        # Round trip
        pulled_back = into(target, remote)
        assert into(list, local) == into(list, pulled_back)
Exemple #30
0
def test_copy_remote_csv():
    """Push a local CSV over SSH to localhost, then fetch it back.

    Checks the type and datashape of the remote copy and that the rows
    survive the round trip into ``target``.
    """
    with tmpfile('csv') as target:
        with filetext('name,balance\nAlice,100\nBob,200',
                      extension='csv') as fn:
            original = resource(fn)

            remote_uri = 'ssh://localhost:%s.csv' % target
            over_ssh = into(remote_uri, original)

            assert isinstance(over_ssh, SSH(CSV))
            remote_shape = discover(over_ssh)
            assert remote_shape == discover(original)

            # Round trip
            returned = into(target, over_ssh)
            original_rows = into(list, original)
            assert original_rows == into(list, returned)
def test_shift_arithmetic(sql, n):
    """Compare ``B - B.shift(n)`` computed on SQL with the pandas result.

    The SQL result is converted to a Series with ``odo`` before comparing.
    """
    t = symbol('t', discover(sql))
    difference = t.B - t.B.shift(n)
    result = odo(compute(difference, sql), pd.Series)

    frame = odo(sql, pd.DataFrame)
    expected = frame.B - frame.B.shift(n)
    tm.assert_series_equal(result, expected)
def test_sample(big_sql):
    """Sample half of ``big_sql`` by count and by fraction.

    The count-based sample must have ``nrows // 2`` rows, and the
    fraction-based sample (``frac=0.5``) must have the same length.
    """
    nn = symbol('nn', discover(big_sql))
    nrows = odo(compute(nn.nrows, big_sql), int)

    by_count = compute(nn.sample(n=nrows // 2), big_sql, return_type=pd.DataFrame)
    assert len(by_count) == nrows // 2

    by_fraction = compute(nn.sample(frac=0.5), big_sql, return_type=pd.DataFrame)
    assert len(by_count) == len(by_fraction)
Exemple #33
0
def test_isin_selectable(sql):
    """Compute ``B.isin({1, 3})`` against a ``select`` wrapping the table."""
    s = symbol('s', discover(sql))

    # wrap the resource in a select
    columns = sql._resources()[sql].columns
    selectable = sa.select(columns)

    result = compute(s.B.isin({1, 3}), selectable, return_type=list)
    assert result == [(True, ), (False, )]
Exemple #34
0
def test_shift_on_column(n, column, sql):
    """Shift one column by ``n`` on SQL and compare with pandas ``shift``.

    The underlying table is taken from ``sql.data``.
    """
    table = sql.data
    t = symbol('t', discover(table))
    shifted = t[column].shift(n)
    result = compute(shifted, table, return_type=pd.Series)

    frame = odo(table, pd.DataFrame)
    expected = frame[column].shift(n)
    tm.assert_series_equal(result, expected)
Exemple #35
0
def test_relabel_columns_over_selection(big_sql):
    """Relabel ``B`` to ``b`` on the rows where ``B == 2`` and check the frame."""
    t = symbol('t', discover(big_sql))
    selection = t[t['B'] == 2]
    relabeled = selection.relabel(B=u'b')

    result = compute(relabeled, big_sql, return_type=pd.DataFrame)
    expected = pd.DataFrame([['a', 2]], columns=[u'A', u'b'])
    tm.assert_frame_equal(result, expected)
Exemple #36
0
def test_slicing_with_lists():
    """Index a symbol with lists and compare dask results with numpy results.

    Every index expression is computed against both the dask array and
    the numpy array it was built from.  The full slice ``[:, :]`` is
    instead checked to return an object whose ``dask`` attribute equals
    the input's.
    """
    nx = np.arange(20).reshape((4, 5))
    dx = from_array(nx, (2, 2))
    sx = symbol('x', discover(dx))

    def check(index):
        # same expression evaluated on both backends
        expr = sx[index]
        assert eq(np.array(compute(expr, dx)), compute(expr, nx))

    for index in ([2, 0, 3],
                  np.s_[::2, [2, 0, 3]],
                  np.s_[1, [2, 0, 3]],
                  np.s_[[2, 0, 3], -2]):
        check(index)

    full = sx[:, :]
    assert compute(full, dx).dask == dx.dask

    for index in (0, np.s_[0, [3, 1, 4]]):
        check(index)
Exemple #37
0
def test_append_convert(empty_bank, raw_bank):
    """Append ``raw_bank`` into ``empty_bank`` and read the rows back.

    The discovered field names must be ``name`` and ``amount``; after the
    append, the stored rows must equal those fields plucked from
    ``raw_bank``.
    """
    ds = discover(raw_bank)
    field_names = ds.measure.names
    assert set(field_names) == {'name', 'amount'}

    append(empty_bank, raw_bank, dshape=ds)
    stored = odo(empty_bank, list, dshape=ds)
    expected = list(pluck(field_names, raw_bank))
    assert stored == expected
Exemple #38
0
def test_discover_with_dotted_names():
    """Discover a CSV whose header names contain dots.

    The names ``a.b`` and ``c.d`` must be kept as they are, with both
    columns discovered as int64.
    """
    with tmpfile(".csv") as fn:
        with open(fn, "w") as handle:
            handle.write("a.b,c.d\n1,2\n3,4")
        found = discover(resource(fn))

    expected = datashape.dshape('var * {"a.b": int64, "c.d": int64}')
    assert found == expected
    assert found.measure.names == [u"a.b", u"c.d"]
Exemple #39
0
def test_slicing_with_lists():
    """Check list-based indexing of a symbol on dask against numpy.

    Each index is applied to the symbol once and the expression is
    computed on both the dask array and its numpy source.  ``[:, :]`` is
    the exception: its result's ``dask`` attribute is compared with the
    input's.
    """
    nx = np.arange(20).reshape((4, 5))
    dx = from_array(nx, (2, 2))
    sx = symbol('x', discover(dx))

    def matches_numpy(expr):
        return eq(np.array(compute(expr, dx)), compute(expr, nx))

    before_full_slice = (
        [2, 0, 3],
        (slice(None, None, 2), [2, 0, 3]),
        (1, [2, 0, 3]),
        ([2, 0, 3], -2),
    )
    for index in before_full_slice:
        assert matches_numpy(sx[index])

    identity = sx[:, :]
    assert compute(identity, dx).dask == dx.dask

    after_full_slice = (0, (0, [3, 1, 4]))
    for index in after_full_slice:
        assert matches_numpy(sx[index])
Exemple #40
0
def test_complex_into(complex_csv, complex_sql):
    """Load ``complex_csv`` into the SQL fixture and compare the rows.

    The table's own discovered datashape is passed as ``dshape`` for the
    load; rows are compared with ``assert_allclose``.
    """
    table, engine = complex_sql
    # data from: http://dummydata.me/generate
    into(table, complex_csv, dshape=discover(table), bind=engine)

    from_sql = into(list, table, bind=engine)
    from_csv = into(list, complex_csv)
    assert_allclose(from_sql, from_csv)
Exemple #41
0
def test_multiple_object_ids():
    """Discover and convert a collection whose documents carry an extra ObjectId.

    The ``other`` field (an ``ObjectId``) is expected to be absent from
    both the discovered datashape and the converted rows.
    """
    first = {'x': 1, 'y': 2, 'other': ObjectId('1' * 24)}
    second = {'x': 3, 'y': 4, 'other': ObjectId('2' * 24)}
    with coll([first, second]) as c:
        expected_shape = dshape('2 * {x: int64, y: int64}')
        assert discover(c) == expected_shape

        assert convert(list, c) == [(1, 2), (3, 4)]
def test_dist(nyc):
    """Compute a distance column on filtered ``nyc`` rows and check its max.

    The maximum computed by the backend (converted to ``float`` with
    ``odo``) must equal the maximum of the full column pulled back as a
    pandas Series.
    """
    def distance(lat1, lon1, lat2, lon2, R=3959):
        # http://andrew.hedges.name/experiments/haversine/
        delta_lon = radians(lon2 - lon1)
        delta_lat = radians(lat2 - lat1)
        a = (sin(delta_lat / 2.0) ** 2 +
             cos(lat1) * cos(lat2) * sin(delta_lon / 2.0) ** 2)
        return R * 2 * atan2(sqrt(a), sqrt(1 - a))

    t = symbol('t', discover(nyc))

    conditions = [
        t.pickup_latitude >= 40.477399,
        t.pickup_latitude <= 40.917577,
        t.dropoff_latitude >= 40.477399,
        t.dropoff_latitude <= 40.917577,
        t.pickup_longitude >= -74.259090,
        t.pickup_longitude <= -73.700272,
        t.dropoff_longitude >= -74.259090,
        t.dropoff_longitude <= -73.700272,
        t.passenger_count < 6,
    ]
    # fold left-to-right so the predicate nests like a chained ``&``
    mask = conditions[0]
    for condition in conditions[1:]:
        mask = mask & condition
    filtered = t[mask]

    dist = distance(filtered.pickup_latitude, filtered.pickup_longitude,
                    filtered.dropoff_latitude, filtered.dropoff_longitude)
    transformed = transform(filtered, dist=dist)

    backend_max = odo(compute(transformed.dist.max(), nyc), float)
    pandas_max = odo(compute(transformed.dist, nyc), pd.Series).max().item()
    assert backend_max == pandas_max
def test_coerce_bool_and_sum(sql):
    """Sum ``B > 1.0`` coerced to int32 on SQL and compare with pandas."""
    t = symbol(sql.name, discover(sql))
    count_expr = (t.B > 1.0).coerce(to='int32').sum()
    result = compute(count_expr, sql).scalar()

    column = odo(compute(t.B, sql), pd.Series)
    expected = column.gt(1).sum()
    assert result == expected
Exemple #44
0
def test_discover_with_dotted_names():
    """Check that dotted CSV header names are preserved by ``discover``.

    A two-column file with headers ``a.b`` and ``c.d`` is written and
    discovered; both the full datashape and the field names are checked.
    """
    with tmpfile('.csv') as fn:
        with open(fn, 'w') as out:
            out.write('a.b,c.d\n1,2\n3,4')
        discovered = discover(resource(fn))

    assert discovered == datashape.dshape('var * {"a.b": int64, "c.d": int64}')
    names = discovered.measure.names
    assert names == [u'a.b', u'c.d']
Exemple #45
0
def test_s3_jsonlines_discover():
    """Discover ``s3://nyqpug/tips.json`` and check field names and types.

    Field names are compared in sorted order, and the types are looked up
    in that same order.
    """
    json_dshape = discover(resource('s3://nyqpug/tips.json'))
    measure = json_dshape.measure

    names = [str(name) for name in sorted(measure.names)]
    expected_names = ['day', 'sex', 'size', 'smoker', 'time', 'tip',
                      'total_bill']
    assert names == expected_names

    types = [json_dshape.measure[name] for name in names]
    expected_types = [string, string, int64, string, string, float64, float64]
    assert types == expected_types
def test_sample(big_sql):
    """Check that ``sample(n=...)`` and ``sample(frac=0.5)`` agree on size.

    The row count is computed first; half of it is requested by count,
    and the result length is compared with a 50% fractional sample.
    """
    nn = symbol('nn', discover(big_sql))
    nrows = odo(compute(nn.nrows, big_sql), int)
    half = nrows // 2

    result = compute(nn.sample(n=half), big_sql, return_type=pd.DataFrame)
    assert len(result) == half

    result2 = compute(nn.sample(frac=0.5), big_sql, return_type=pd.DataFrame)
    assert len(result) == len(result2)
Exemple #47
0
def test_header_mix_str_digits():
    """Discover a CSV whose header mixes a text name with an all-digit name.

    The file is also converted to a DataFrame (the result is not inspected)
    before the discovered measure is compared with the expected one.
    """
    expected = datashape.dshape('''var * {"On- or Off- Budget": ?string,
                                    "1990": ?string}''')
    contents = 'On- or Off- Budget,1990\nOn Budget,-628\nOff budget,"5,962"\n'
    with filetext(contents) as fn:
        csv = CSV(fn, has_header=True)
        # conversion result is unused; the call itself is kept
        frame = convert(pd.DataFrame, csv)
        assert discover(csv).measure == expected.measure
def test_isin_selectable(sql):
    """Evaluate ``isin`` on column ``B`` against a SQLAlchemy ``select``.

    The select is built from the columns of the resource behind ``sql``.
    """
    s = symbol('s', discover(sql))
    membership = s.B.isin({1, 3})

    # wrap the resource in a select
    wrapped = sa.select(sql._resources()[sql].columns)

    rows = compute(membership, wrapped, return_type=list)
    assert rows == [(True,), (False,)]
def test_dist(nyc):
    """Check that the backend maximum of a derived distance column matches pandas.

    Rows of ``nyc`` are restricted to a latitude/longitude box with fewer
    than 6 passengers, a ``dist`` column is added, and its maximum
    (requested as ``float``) is compared with the maximum of the column
    requested as a pandas Series.
    """
    def distance(lat1, lon1, lat2, lon2, R=3959):
        # http://andrew.hedges.name/experiments/haversine/
        d_lon = radians(lon2 - lon1)
        d_lat = radians(lat2 - lat1)
        a = sin(d_lat / 2.0) ** 2 + cos(lat1) * cos(lat2) * sin(d_lon / 2.0) ** 2
        return R * 2 * atan2(sqrt(a), sqrt(1 - a))

    t = symbol('t', discover(nyc))

    in_bounds = (
        (t.pickup_latitude >= 40.477399) &
        (t.pickup_latitude <= 40.917577) &
        (t.dropoff_latitude >= 40.477399) &
        (t.dropoff_latitude <= 40.917577) &
        (t.pickup_longitude >= -74.259090) &
        (t.pickup_longitude <= -73.700272) &
        (t.dropoff_longitude >= -74.259090) &
        (t.dropoff_longitude <= -73.700272) &
        (t.passenger_count < 6)
    )
    filtered = t[in_bounds]

    transformed = transform(
        filtered,
        dist=distance(filtered.pickup_latitude, filtered.pickup_longitude,
                      filtered.dropoff_latitude, filtered.dropoff_longitude),
    )

    backend_max = compute(transformed.dist.max(), nyc, return_type=float)
    series_max = compute(transformed.dist, nyc, return_type=pd.Series).max()
    assert backend_max == series_max
def test_shift_arithmetic(sql, n):
    """Check ``B - B.shift(n)`` on SQL against the same arithmetic in pandas.

    The backend result is requested directly as a pandas Series.
    """
    t = symbol('t', discover(sql))
    result = compute(t.B - t.B.shift(n), sql, return_type=pd.Series)

    frame = odo(sql, pd.DataFrame)
    column = frame.B
    expected = column - frame.B.shift(n)
    tm.assert_series_equal(result, expected)
def test_shift_on_column(n, column, sql):
    """Check that shifting ``column`` by ``n`` on SQL matches pandas.

    ``sql.data`` supplies the table used for both sides of the comparison.
    """
    backend = sql.data
    t = symbol('t', discover(backend))

    result = compute(t[column].shift(n), backend, return_type=pd.Series)
    expected = odo(backend, pd.DataFrame)[column].shift(n)
    tm.assert_series_equal(result, expected)
Exemple #52
0
def test_s3_to_sqlite():
    """Load ``tips_uri`` into a SQLite table and compare rows with the source.

    ``tips_uri`` is defined outside this function.  The datashape passed
    to ``into`` is the one discovered from the ``tips_uri`` resource.
    """
    with tmpfile('.db') as fn:
        table_uri = 'sqlite:///%s::tips' % fn
        source_shape = discover(resource(tips_uri))
        tb = into(table_uri, tips_uri, dshape=source_shape)

        stored_rows = into(list, tb)
        assert stored_rows == into(list, tips_uri)
Exemple #53
0
def test_pandas_loads_in_datetimes_naively():
    """Check datetime handling when a CSV with a date column goes to pandas.

    The CSV must discover as ``{name: ?string, when: ?datetime}``, and the
    converted DataFrame's ``when`` column must have dtype ``M8[ns]``.
    """
    contents = 'name,when\nAlice,2014-01-01\nBob,2014-02-02'
    with filetext(contents) as fn:
        csv = CSV(fn, has_header=True)
        expected = datashape.dshape('var * {name: ?string, when: ?datetime}')
        assert discover(csv) == expected

        frame = convert(pd.DataFrame, csv)
        assert frame.dtypes['when'] == 'M8[ns]'
Exemple #54
0
def test_join(db, ctx):
    """Join ``db.t`` with ``db.s`` on ``ctx`` and compare with an in-memory run.

    The reference result is computed from ``df`` and ``cities_df`` (both
    defined outside this function).  The backend result must be a
    ``SparkDataFrame`` or ``SchemaRDD``, hold the same rows, and discover
    to the expression's datashape.
    """
    expr = join(db.t, db.s)
    result = compute(expr, ctx)
    reference_scope = {db: {'t': df, 's': cities_df}}
    expected = compute(expr, reference_scope)

    assert isinstance(result, (SparkDataFrame, SchemaRDD))
    assert into(set, result) == into(set, expected)
    assert discover(result) == expr.dshape