Example #1
0
def test_temp_ssh_files():
    """Copy a local CSV into a ``Temp(SSH(CSV))`` target and compare datashapes."""
    with filetext('name,balance\nAlice,100\nBob,200', extension='csv') as path:
        local = CSV(path)
        remote = into(Temp(SSH(CSV)), local, hostname='localhost')

        local_shape = discover(local)
        remote_shape = discover(remote)
        assert local_shape == remote_shape

        # The result must be flagged as a temporary resource.
        assert isinstance(remote, _Temp)
Example #2
0
def test_pandas_read_supports_read_csv_kwargs():
    """Pass ``usecols`` through ``csv_to_dataframe`` and expect only ``name`` back."""
    with filetext("Alice,1\nBob,2") as path:
        schema = datashape.dshape("var * {name: string, amount: int}")
        frame = csv_to_dataframe(CSV(path), dshape=schema, usecols=["name"])
        assert isinstance(frame, pd.DataFrame)
        rows = convert(list, frame)
        assert rows == [("Alice",), ("Bob",)]
Example #3
0
def test_header_mix_str_digits():
    """Discover a CSV whose header mixes a textual name with an all-digit name."""
    ds = datashape.dshape('''var * {"On- or Off- Budget": ?string,
                                    "1990": ?string}''')
    with filetext('On- or Off- Budget,1990\nOn Budget,-628\nOff budget,"5,962"\n') as path:
        source = CSV(path, has_header=True)
        # The converted frame itself is not inspected; the call only has to
        # complete before discovery is checked.
        convert(pd.DataFrame, source)
        discovered = discover(source)
        assert discovered.measure == ds.measure
Example #4
0
def test_more_unicode_column_names():
    """A UTF-8 encoded non-ASCII header name ends up as a decoded column label."""
    raw = b'foo\xc4\x87,a\n1,2\n3,4'
    with filetext(raw, extension='csv', mode='wb') as path:
        source = CSV(path, has_header=True)
        actual = into(pd.DataFrame, source)
    column_names = [b'foo\xc4\x87'.decode('utf8'), u'a']
    wanted = pd.DataFrame([(1, 2), (3, 4)], columns=column_names)
    tm.assert_frame_equal(actual, wanted)
Example #5
0
def test_csv_with_header():
    """Load a CSV (``has_header=True``) into a SQLite table and read it back."""
    with tmpfile('db') as db_path:
        with filetext('a,b\n1,2\n3,4', extension='csv') as csv_path:
            target_uri = 'sqlite:///%s::mytable' % db_path
            table = into(target_uri, csv_path, has_header=True)
            assert discover(table) == dshape('var * {a: int64, b: int64}')
            assert into(set, table) == {(1, 2), (3, 4)}
Example #6
0
def test_pandas_read_supports_read_csv_kwargs():
    """Extra keyword arguments (here ``usecols``) are accepted by ``csv_to_dataframe``."""
    with filetext('Alice,1\nBob,2') as filename:
        shape = datashape.dshape('var * {name: string, amount: int}')
        source = CSV(filename)
        result = csv_to_dataframe(source, dshape=shape, usecols=['name'])
        assert isinstance(result, pd.DataFrame)
        as_rows = convert(list, result)
        assert as_rows == [('Alice',), ('Bob',)]
Example #7
0
def test_pandas_read():
    """Read a two-row CSV into a DataFrame using an explicit dshape."""
    with filetext('Alice,1\nBob,2') as path:
        schema = datashape.dshape('var * {name: string, amount: int}')
        frame = csv_to_dataframe(CSV(path), dshape=schema)
        assert isinstance(frame, pd.DataFrame)
        assert convert(list, frame) == [('Alice', 1), ('Bob', 2)]
        # Column labels are expected to come from the dshape field names.
        column_names = list(frame.columns)
        assert column_names == ['name', 'amount']
Example #8
0
def test_pandas_read_supports_gzip():
    """Read a gzip-written ``.csv.gz`` file into a DataFrame."""
    gz_options = dict(open=gzip.open, mode="wt", extension=".csv.gz")
    with filetext("Alice,1\nBob,2", **gz_options) as path:
        schema = datashape.dshape("var * {name: string, amount: int}")
        frame = csv_to_dataframe(CSV(path), dshape=schema)
        assert isinstance(frame, pd.DataFrame)
        rows = convert(list, frame)
        assert rows == [("Alice", 1), ("Bob", 2)]
        assert list(frame.columns) == ["name", "amount"]
Example #9
0
def test_pandas_read_supports_missing_integers():
    """An optional int32 column with a missing value is expected as dtype ``f4``."""
    with filetext("Alice,1\nBob,") as path:
        schema = datashape.dshape("var * {name: string, val: ?int32}")
        frame = csv_to_dataframe(CSV(path), dshape=schema)
        assert isinstance(frame, pd.DataFrame)
        assert list(frame.columns) == ["name", "val"]
        val_dtype = frame.dtypes["val"]
        assert val_dtype == "f4"
Example #10
0
def test_pandas_read():
    """``csv_to_dataframe`` yields a DataFrame whose rows and columns follow the dshape."""
    with filetext("Alice,1\nBob,2") as filename:
        shape = datashape.dshape("var * {name: string, amount: int}")
        source = CSV(filename)
        result = csv_to_dataframe(source, dshape=shape)
        assert isinstance(result, pd.DataFrame)
        as_rows = convert(list, result)
        assert as_rows == [("Alice", 1), ("Bob", 2)]
        labels = list(result.columns)
        assert labels == ["name", "amount"]
Example #11
0
def test_header_disagrees_with_dshape():
    """Column labels follow the file header unless an explicit dshape is given."""
    override = datashape.dshape("var * {name: string, bal: int64}")
    with filetext("name,val\nAlice,100\nBob,200", extension="csv") as path:
        source = CSV(path, header=True)
        assert convert(list, source) == [("Alice", 100), ("Bob", 200)]

        # Without a dshape the labels are expected to match the file's header.
        from_header = convert(pd.DataFrame, source)
        assert list(from_header.columns) == ["name", "val"]

        # With a dshape the labels are expected to match its field names.
        from_dshape = convert(pd.DataFrame, source, dshape=override)
        assert list(from_dshape.columns) == ["name", "bal"]
Example #12
0
def test_pandas_read_supports_datetimes():
    """A ``date`` field in the dshape is expected to load as ``M8[ns]``."""
    with filetext('Alice,2014-01-02\nBob,2014-01-03') as path:
        schema = datashape.dshape('var * {name: string, when: date}')
        frame = csv_to_dataframe(CSV(path), dshape=schema)
        assert isinstance(frame, pd.DataFrame)
        assert list(frame.columns) == ['name', 'when']
        when_dtype = frame.dtypes['when']
        assert when_dtype == 'M8[ns]'
Example #13
0
def test_csv_to_bcolz():
    """Send a three-row CSV into a ``bcolz`` target path and check the length."""
    text = ('name,runway,takeoff,datetime_nearest_close\n'
            'S28,28,TRUE,A\n'
            'S16,16,TRUE,Q\n'
            'L14,14,FALSE,I')
    with filetext(text, extension='csv') as csv_path:
        with tmpfile('bcolz') as bcolz_path:
            loaded = into(bcolz_path, csv_path)
            assert len(loaded) == 3
Example #14
0
def test_pandas_read_supports_missing_integers():
    """Missing values in a ``?int32`` column: the test expects dtype ``f4``."""
    with filetext('Alice,1\nBob,') as filename:
        shape = datashape.dshape('var * {name: string, val: ?int32}')
        source = CSV(filename)
        result = csv_to_dataframe(source, dshape=shape)
        assert isinstance(result, pd.DataFrame)
        labels = list(result.columns)
        assert labels == ['name', 'val']
        assert result.dtypes['val'] == 'f4'
def test_copy_local_files_to_hdfs():
    """Copy a local CSV into an ``HDFS(CSV)`` target and compare datashapes."""
    with tmpfile_hdfs() as remote_path:
        with filetext('name,amount\nAlice,100\nBob,200') as local_path:
            local = CSV(local_path)
            remote = HDFS(CSV)(remote_path, hdfs=hdfs)
            # blocksize=10: 10 bytes per message
            into(remote, local, blocksize=10)

            remote_shape = discover(remote)
            local_shape = discover(local)
            assert remote_shape == local_shape
Example #16
0
def test_header_disagrees_with_dshape():
    """Header names win by default; an explicit dshape overrides the labels."""
    explicit = datashape.dshape('var * {name: string, bal: int64}')
    with filetext('name,val\nAlice,100\nBob,200', extension='csv') as filename:
        source = CSV(filename, header=True)
        rows = convert(list, source)
        assert rows == [('Alice', 100), ('Bob', 200)]

        default_labels = list(convert(pd.DataFrame, source).columns)
        assert default_labels == ['name', 'val']

        overridden_labels = list(
            convert(pd.DataFrame, source, dshape=explicit).columns)
        assert overridden_labels == ['name', 'bal']
Example #17
0
def test_pandas_read_supports_datetimes():
    """Load a CSV with a ``date`` column and expect dtype ``M8[ns]`` for it."""
    with filetext("Alice,2014-01-02\nBob,2014-01-03") as filename:
        shape = datashape.dshape("var * {name: string, when: date}")
        source = CSV(filename)
        result = csv_to_dataframe(source, dshape=shape)
        assert isinstance(result, pd.DataFrame)
        labels = list(result.columns)
        assert labels == ["name", "when"]
        assert result.dtypes["when"] == "M8[ns]"
Example #18
0
def test_pandas_loads_in_datetimes_naively():
    """Discovery reports ``?datetime`` and the converted frame uses ``M8[ns]``."""
    with filetext('name,when\nAlice,2014-01-01\nBob,2014-02-02') as path:
        source = CSV(path, has_header=True)
        expected_shape = datashape.dshape(
            'var * {name: ?string, when: ?datetime}')
        assert discover(source) == expected_shape

        frame = convert(pd.DataFrame, source)
        when_dtype = frame.dtypes['when']
        assert when_dtype == 'M8[ns]'
Example #19
0
def test_pandas_read_supports_gzip():
    """``csv_to_dataframe`` handles a file written through ``gzip.open``."""
    with filetext('Alice,1\nBob,2', open=gzip.open, mode='wt',
                  extension='.csv.gz') as gz_path:
        shape = datashape.dshape('var * {name: string, amount: int}')
        source = CSV(gz_path)
        result = csv_to_dataframe(source, dshape=shape)
        assert isinstance(result, pd.DataFrame)
        assert convert(list, result) == [('Alice', 1), ('Bob', 2)]
        labels = list(result.columns)
        assert labels == ['name', 'amount']
Example #20
0
def test_into_double_string(f):
    """Send a CSV given by filename into a second CSV file on disk.

    ``f`` is supplied from outside this function (presumably by a test
    parametrization -- confirm at the definition site). It is applied to
    the target path before that path is handed to ``odo``.
    """
    with filetext('alice,1\nbob,2', extension='.csv') as source:
        assert odo(source, list) == [('alice', 1), ('bob', 2)]

        with tmpfile('.csv') as target:
            csv = odo(source, f(target))
            assert isinstance(csv, CSV)
            # Plain 'r' instead of 'rU': the 'U' flag is deprecated in
            # Python 3 and rejected by open() from 3.11 on. The handle is
            # named ``fh`` so it no longer shadows the ``f`` parameter.
            with open(target, 'r') as fh:
                assert 'alice' in fh.read()
Example #21
0
def test_ssh_csv_to_s3_csv():
    """Move a temporary SSH-hosted CSV into an S3 bucket and compare datashapes."""
    # for some reason this can only be run in the same file as other ssh tests
    # and must be a Temp(SSH(CSV)) otherwise tests above this one fail
    aws_tests = pytest.importorskip('odo.backends.tests.test_aws')
    s3_bucket = aws_tests.s3_bucket

    with filetext('name,balance\nAlice,100\nBob,200', extension='csv') as path:
        local = CSV(path)
        remote = into(Temp(SSH(CSV)), local, hostname='localhost')
        with s3_bucket('.csv') as bucket_uri:
            uploaded = into(bucket_uri, remote)
            uploaded_shape = discover(uploaded)
            assert uploaded_shape == discover(resource(bucket_uri))
def test_hive_creation_from_local_file():
    """Load a local CSV into a Hive table twice; the second load doubles the rows."""
    with filetext(accounts_1_csv, extension='csv') as csv_path:
        with hive_table(host) as table_uri:
            first = into(table_uri, csv_path, **auth)
            assert isinstance(first, sa.Table)
            assert into(set, first) == into(set, csv_path)

            # Loading the same file again is expected to append, not replace.
            second = into(table_uri, csv_path, **auth)
            assert isinstance(second, sa.Table)
            table_rows = len(into(list, second))
            assert table_rows == 2 * len(into(list, csv_path))
Example #23
0
def test_sqlite_to_csv(sep, header):
    """Write a SQLite table to CSV with the given delimiter/header and read it back."""
    with tmpfile("db") as db_path:
        with filetext("a,b\n1,2\n3,4", extension="csv") as csv_path:
            table = odo(csv_path, "sqlite:///%s::mytable" % db_path)

        with tmpfile(".csv") as out_path:
            odo(table, out_path, header=header, delimiter=sep)
            with open(out_path, "rt") as handle:
                lines = handle.readlines()
            # ``header`` is used as a slice start to skip the header line.
            data_lines = lines[header:]
            expected = [tuple(int(cell) for cell in line.split(sep)) for line in data_lines]
            actual = odo(out_path, list, delimiter=sep, has_header=header, dshape=discover(table))
            assert actual == expected
Example #24
0
def test_different_encoding_to_csv():
    """Writing a SQL table to CSV with ``encoding='latin1'`` must raise ValueError."""
    with tmpfile('db') as db_path:
        with filetext('a,b\n1,2\n3,4', extension='csv') as csv_path:
            table_uri = 'sqlite:///%s::mytable' % db_path
            table = odo(csv_path, table_uri, encoding='latin1')
            with tmpfile('.csv') as out_path:
                with pytest.raises(ValueError):
                    odo(table, out_path, encoding='latin1')
Example #25
0
def test_send_parameterized_query_to_csv():
    """Send a filtered select to CSV; the file must hold the same rows as the query."""
    with tmpfile('db') as db_path:
        with filetext('a,b\n1,2\n3,4', extension='csv') as csv_path:
            table_uri = 'sqlite:///%s::mytable' % db_path
            table = odo(csv_path, table_uri)
        with tmpfile('.csv') as out_path:
            query = table.select(table.c.a == 1)
            written = odo(query, out_path)
            from_query = sorted(odo(query, list))
            from_file = sorted(odo(written, list))
            assert from_query == from_file
def accounts_ssh():
    """Yield a glob URI and per-file URIs for three CSVs uploaded over SFTP.

    A uniquely named remote directory is created, the three account CSV
    texts are uploaded into it, and a ``(glob_uri, [file_uri, ...])`` tuple
    is yielded. The uploaded files and the directory are removed when the
    generator is resumed or closed.
    """
    remote_dir = str(uuid.uuid1())
    client = sftp(**auth)
    client.mkdir(remote_dir)

    remote_paths = []
    texts = (accounts_1_csv, accounts_2_csv, accounts_3_csv)
    for number, text in enumerate(texts, 1):
        remote_path = remote_dir + '/accounts.%d.csv' % number
        with filetext(text) as local_path:
            client.put(local_path, remote_path)
        remote_paths.append(remote_path)

    uris = ['ssh://ubuntu@%s:%s' % (host, path) for path in remote_paths]
    glob_uri = 'ssh://ubuntu@%s:%s/*.csv' % (host, remote_dir)

    try:
        yield glob_uri, uris
    finally:
        # Clean up the remote files first, then the now-empty directory.
        for path in remote_paths:
            client.remove(path)
        client.rmdir(remote_dir)
Example #27
0
def test_convert_through_temporary_local_storage():
    """Round-trip a CSV and a DataFrame through temporary SSH CSV/JSONLines targets."""
    with filetext('name,quantity\nAlice,100\nBob,200', extension='csv') as path:
        local = CSV(path)
        frame = into(pd.DataFrame, local)
        remote_csv = into(Temp(SSH(CSV)), local, hostname='localhost')

        assert into(list, local) == into(list, remote_csv)

        remote_from_frame = into(Temp(SSH(CSV)), frame, hostname='localhost')
        assert into(list, remote_from_frame) == into(list, frame)

        remote_json = into(Temp(SSH(JSONLines)), frame, hostname='localhost')
        elementwise = into(np.ndarray, remote_json) == into(np.ndarray, frame)
        assert elementwise.all()
def test_dialect_of():
    """Check ``dialect_of`` for a local CSV and for HDFS directories of CSVs."""
    with filetext(accounts_1_csv) as path:
        dialect = dialect_of(CSV(path))
        assert dialect['delimiter'] == ','
        assert dialect['has_header'] is True

    with accounts_data() as (directory, (a, b, c)):
        csv_directory = HDFS(Directory(CSV))
        location = dict(hdfs=directory.hdfs)

        # No explicit ``has_header``: the test expects True to be reported.
        inferred = csv_directory(directory.path, **location)
        assert dialect_of(inferred)['has_header'] is True

        # An explicit ``has_header=False`` must be reported back unchanged.
        explicit = csv_directory(directory.path, has_header=False, **location)
        assert dialect_of(explicit)['has_header'] is False
Example #29
0
def test_copy_remote_csv():
    """Copy a local CSV to an ``ssh://`` URI and bring it back again."""
    with tmpfile('csv') as target:
        with filetext('name,balance\nAlice,100\nBob,200',
                      extension='csv') as path:
            local = resource(path)

            remote_uri = 'ssh://localhost:%s.csv' % target
            remote = into(remote_uri, local)

            assert isinstance(remote, SSH(CSV))
            local_shape = discover(local)
            assert discover(remote) == local_shape

            # Round trip
            returned = into(target, remote)
            assert into(list, local) == into(list, returned)
Example #30
0
def test_drop():
    """``drop`` removes an SSH-hosted CSV; a second ``drop`` must not fail."""
    with filetext('name,balance\nAlice,100\nBob,200', extension='csv') as path:
        with tmpfile('csv') as target:
            remote = SSH(CSV)(target, hostname='localhost')

            assert not os.path.exists(target)

            # Upload the local file so there is something to drop.
            client = sftp(**remote.auth)
            client.put(path, target)

            assert os.path.exists(target)

            # Dropping twice: the second call runs after the file is gone.
            for _ in range(2):
                drop(remote)

            assert not os.path.exists(target)
Example #31
0
def test_sqlite_to_csv(sep, header):
    """Dump a SQLite table to CSV and check that reading it back gives the same rows."""
    with tmpfile('db') as db_path:
        with filetext('a,b\n1,2\n3,4', extension='csv') as csv_path:
            table = odo(csv_path, 'sqlite:///%s::mytable' % db_path)

        with tmpfile('.csv') as out_path:
            odo(table, out_path, header=header, delimiter=sep)
            with open(out_path, 'rt') as handle:
                lines = handle.readlines()
            # ``header`` doubles as the slice start that skips the header line.
            expected = []
            for line in lines[header:]:
                expected.append(tuple(int(cell) for cell in line.split(sep)))
            read_back = odo(out_path, list, delimiter=sep, has_header=header,
                            dshape=discover(table))
            assert read_back == expected
Example #32
0
def test_unicode_column_names():
    """A UTF-8 encoded header with a non-ASCII name becomes a decoded column label."""
    raw = b'f\xc3\xbc,a\n1,2\n3,4'
    with filetext(raw, extension='csv', mode='wb') as path:
        source = CSV(path, has_header=True)
        actual = into(pd.DataFrame, source)
    labels = [b'f\xc3\xbc'.decode('utf8'), u'a']
    wanted = pd.DataFrame([(1, 2), (3, 4)], columns=labels)
    tm.assert_frame_equal(actual, wanted)
Example #33
0
def test_csv_missing_values():
    """A literal ``NA`` in the ``name`` column is expected to make it ``Option(string)``."""
    with filetext('name,val\nAlice,100\nNA,200', extension='csv') as path:
        discovered = discover(CSV(path))
        name_type = discovered.measure.dict['name']
        assert name_type == Option(string)
Example #34
0
def test_csv_separator_header():
    """Read a pipe-delimited CSV with a header row into a list of tuples."""
    with filetext('a|b|c\n1|2|3\n4|5|6', extension='csv') as path:
        source = CSV(path, delimiter='|', has_header=True)
        rows = convert(list, source)
        assert rows == [(1, 2, 3), (4, 5, 6)]
Example #35
0
def test_into_string_on_right():
    """``into`` accepts a bare filename string as its source argument."""
    with filetext('alice,1\nbob,2', extension='.csv') as csv_path:
        loaded = into([], csv_path)
        assert loaded == [('alice', 1), ('bob', 2)]
Example #36
0
def test_into_string_on_right(f):
    """``odo`` loads rows from ``f(source)``, where ``source`` is a CSV filename."""
    with filetext('alice,1\nbob,2', extension='.csv') as source:
        wrapped = f(source)
        loaded = odo(wrapped, [])
        assert loaded == [('alice', 1), ('bob', 2)]
Example #37
0
def test_unicode_column_names():
    """Convert a CSV with a non-ASCII header to a DataFrame; it only has to run."""
    with filetext('foo\xc4\x87,a\n1,2\n3,4', extension='csv') as path:
        source = CSV(path, has_header=True)
        # No assertion on the result: the conversion just must not raise.
        into(pd.DataFrame, source)
Example #38
0
def test_pandas_read_supports_whitespace_strings():
    """A whitespace-only third column is expected to be discovered as ``'': ?string``."""
    with filetext('a,b, \n1,2, \n2,3, \n', extension='csv') as path:
        discovered = discover(CSV(path))
        expected = datashape.dshape("var * {a: int64, b: int64, '': ?string}")
        assert discovered == expected
Example #39
0
def test_header_argument_set_with_or_without_header():
    """The same rows are expected whether or not the file carries a header line."""
    rows = [('Alice', 100), ('Bob', 200)]

    with filetext('name,val\nAlice,100\nBob,200', extension='csv') as with_header:
        assert into(list, with_header) == rows

    with filetext('Alice,100\nBob,200', extension='csv') as without_header:
        assert into(list, without_header) == rows
Example #40
0
def test_discover():
    """Discovery of a file gives the same result locally and through ``SSH(CSV)``."""
    with filetext('name,balance\nAlice,100\nBob,200') as path:
        local_shape = discover(CSV(path))
        over_ssh = SSH(CSV)(path, hostname='localhost')

        assert local_shape == discover(over_ssh)
Example #41
0
def test_discover_csv_yields_string_on_totally_empty_columns():
    """A column with no values at all is expected to be discovered as ``?string``."""
    wanted = dshape('var * {a: int64, b: ?string, c: int64}')
    with filetext('a,b,c\n1,,3\n4,,6\n7,,9') as path:
        discovered = discover(CSV(path, has_header=True))
        assert discovered == wanted
Example #42
0
def test_discover_csv_without_columns():
    """First-row values must not show up in the discovered datashape."""
    with filetext('Alice,100\nBob,200', extension='csv') as path:
        shape_text = str(discover(CSV(path)))
        assert '100' not in shape_text
Example #43
0
def test_discover_csv_files_without_header():
    """With ``has_header=False`` both rows are data and none becomes column labels."""
    with filetext('Alice,2014-01-01\nBob,2014-02-02') as path:
        source = CSV(path, has_header=False)
        frame = convert(pd.DataFrame, source)
        assert len(frame) == 2
        labels = list(frame.columns)
        assert 'Alice' not in labels
Example #44
0
def test_csv_into_list():
    """Load a CSV filename straight into a list of row tuples."""
    with filetext('name,val\nAlice,100\nBob,200', extension='csv') as path:
        rows = into(list, path)
        assert rows == [('Alice', 100), ('Bob', 200)]
Example #45
0
def test_pandas_discover_on_gzipped_files():
    """Discovery works on a CSV that was written through ``gzip.open``."""
    text = 'name,when\nAlice,2014-01-01\nBob,2014-02-02'
    with filetext(text, open=gzip.open, mode='wt',
                  extension='.csv.gz') as gz_path:
        source = CSV(gz_path, has_header=True)
        wanted = datashape.dshape('var * {name: ?string, when: ?datetime}')
        assert discover(source) == wanted
Example #46
0
def test_convert():
    """Converting a ``TextFile`` to a list yields its lines, newlines kept."""
    with filetext('Hello\nWorld') as path:
        lines = convert(list, TextFile(path))
        assert lines == ['Hello\n', 'World']
Example #47
0
def test_drop():
    """``drop`` on a ``TextFile`` deletes the file from disk."""
    with filetext('hello\nworld') as path:
        text_file = TextFile(path)
        exists_before = os.path.exists(path)
        assert exists_before
        drop(text_file)
        exists_after = os.path.exists(path)
        assert not exists_after
Example #48
0
def test_infer_header():
    """``infer_header`` reports True for a headed file and False for a headless one."""
    with filetext('name,val\nAlice,100\nNA,200', extension='csv') as headed:
        detected = infer_header(CSV(headed).path, 100)
        # Equality (not identity) comparison is kept as in the original test.
        assert detected == True
    with filetext('Alice,100\nNA,200', extension='csv') as headless:
        detected = infer_header(CSV(headless).path, 100)
        assert detected == False
Example #49
0
def test_raise_errors_quickly_on_into_chunks_dataframe():
    """Building chunked DataFrames from a CSV with a non-integer ``val`` must raise."""
    with filetext('name,val\nAlice,100\nBob,foo', extension='csv') as path:
        schema = datashape.dshape('var * {name: string, val: int}')
        source = CSV(path, header=True)

        def build_chunks():
            return CSV_to_chunks_of_dataframes(source, dshape=schema)

        assert raises(Exception, build_chunks)
Example #50
0
def test_csv_infer_header():
    """Load a CSV into SQLite without stating ``has_header``; names come from row one."""
    with tmpfile('db') as db_path:
        with filetext('a,b\n1,2\n3,4', extension='csv') as csv_path:
            table_uri = 'sqlite:///%s::mytable' % db_path
            table = odo(csv_path, table_uri)
            assert discover(table) == dshape('var * {a: int64, b: int64}')
            assert odo(table, set) == {(1, 2), (3, 4)}
Example #51
0
def test_unused_datetime_columns():
    """Select only ``val`` via ``usecols`` although the dshape also has a datetime field."""
    schema = datashape.dshape('var * {val: string, when: datetime}')
    with filetext("val,when\na,2000-01-01\nb,2000-02-02") as path:
        source = CSV(path, has_header=True)
        selected = csv_to_dataframe(source, usecols=['val'],
                                    squeeze=True, dshape=schema)
        assert convert(list, selected) == ['a', 'b']
Example #52
0
def test_convert_local_file_to_temp_ssh_file():
    """``convert`` to ``Temp(SSH(CSV))`` keeps the rows of the local CSV."""
    with filetext('name,balance\nAlice,100\nBob,200', extension='csv') as path:
        local = CSV(path)
        remote = convert(Temp(SSH(CSV)), local, hostname='localhost')

        local_rows = into(list, local)
        remote_rows = into(list, remote)
        assert local_rows == remote_rows
Example #53
0
def test_empty_dataframe():
    """A CSV holding only a header line still converts to a DataFrame."""
    with filetext('name,val', extension='csv') as path:
        source = CSV(path, has_header=True)
        frame = convert(pd.DataFrame, source)
        assert isinstance(frame, pd.DataFrame)
Example #54
0
def test_discover_csv_with_spaces_in_header():
    """Leading spaces in header cells are expected to be stripped from field names."""
    with filetext(' name,  val\nAlice,100\nBob,200', extension='csv') as path:
        source = CSV(path, has_header=True)
        field_names = discover(source).measure.names
        assert field_names == ['name', 'val']
Example #55
0
def test_discover_from_resource():
    """An ``ssh://`` resource URI discovers the same datashape as the local CSV."""
    with filetext('name,balance\nAlice,100\nBob,200', extension='csv') as path:
        local_shape = discover(CSV(path))
        over_ssh = resource('ssh://localhost:' + path)

        assert local_shape == discover(over_ssh)