def test_temp_ssh_files():
    with filetext('name,balance\nAlice,100\nBob,200', extension='csv') as fn:
        csv = CSV(fn)
        scsv = into(Temp(SSH(CSV)), csv, hostname='localhost')
        assert discover(csv) == discover(scsv)
        assert isinstance(scsv, _Temp)

def test_header_mix_str_digits():
    ds = datashape.dshape('''var * {"On- or Off- Budget": ?string, "1990": ?string}''')
    with filetext('On- or Off- Budget,1990\nOn Budget,-628\nOff budget,"5,962"\n') as fn:
        csv = CSV(fn, has_header=True)
        df = convert(pd.DataFrame, csv)
        assert discover(csv).measure == ds.measure


def test_more_unicode_column_names():
    with filetext(b'foo\xc4\x87,a\n1,2\n3,4', extension='csv', mode='wb') as fn:
        df = into(pd.DataFrame, CSV(fn, has_header=True))

    expected = pd.DataFrame([(1, 2), (3, 4)],
                            columns=[b'foo\xc4\x87'.decode('utf8'), u'a'])
    tm.assert_frame_equal(df, expected)


def test_csv_with_header():
    with tmpfile('db') as dbfilename:
        with filetext('a,b\n1,2\n3,4', extension='csv') as csvfilename:
            t = into('sqlite:///%s::mytable' % dbfilename, csvfilename,
                     has_header=True)
            assert discover(t) == dshape('var * {a: int64, b: int64}')
            assert into(set, t) == set([(1, 2), (3, 4)])


def test_pandas_read_supports_read_csv_kwargs():
    with filetext('Alice,1\nBob,2') as fn:
        ds = datashape.dshape('var * {name: string, amount: int}')
        csv = CSV(fn)
        df = csv_to_dataframe(csv, dshape=ds, usecols=['name'])
        assert isinstance(df, pd.DataFrame)
        assert convert(list, df) == [('Alice',), ('Bob',)]


def test_pandas_read():
    with filetext('Alice,1\nBob,2') as fn:
        ds = datashape.dshape('var * {name: string, amount: int}')
        csv = CSV(fn)
        df = csv_to_dataframe(csv, dshape=ds)
        assert isinstance(df, pd.DataFrame)
        assert convert(list, df) == [('Alice', 1), ('Bob', 2)]
        assert list(df.columns) == ['name', 'amount']

def test_pandas_read_supports_datetimes():
    with filetext('Alice,2014-01-02\nBob,2014-01-03') as fn:
        ds = datashape.dshape('var * {name: string, when: date}')
        csv = CSV(fn)
        df = csv_to_dataframe(csv, dshape=ds)
        assert isinstance(df, pd.DataFrame)
        assert list(df.columns) == ['name', 'when']
        assert df.dtypes['when'] == 'M8[ns]'


def test_csv_to_bcolz():
    with filetext('name,runway,takeoff,datetime_nearest_close\n'
                  'S28,28,TRUE,A\n'
                  'S16,16,TRUE,Q\n'
                  'L14,14,FALSE,I', extension='csv') as src:
        with tmpfile('bcolz') as tgt:
            bc = into(tgt, src)
            assert len(bc) == 3


def test_pandas_read_supports_missing_integers():
    with filetext('Alice,1\nBob,') as fn:
        ds = datashape.dshape('var * {name: string, val: ?int32}')
        csv = CSV(fn)
        df = csv_to_dataframe(csv, dshape=ds)
        assert isinstance(df, pd.DataFrame)
        assert list(df.columns) == ['name', 'val']
        assert df.dtypes['val'] == 'f4'


def test_copy_local_files_to_hdfs():
    with tmpfile_hdfs() as target:
        with filetext('name,amount\nAlice,100\nBob,200') as source:
            csv = CSV(source)
            scsv = HDFS(CSV)(target, hdfs=hdfs)
            into(scsv, csv, blocksize=10)  # 10 bytes per message
            assert discover(scsv) == discover(csv)


def test_header_disagrees_with_dshape():
    ds = datashape.dshape('var * {name: string, bal: int64}')
    with filetext('name,val\nAlice,100\nBob,200', extension='csv') as fn:
        csv = CSV(fn, header=True)
        assert convert(list, csv) == [('Alice', 100), ('Bob', 200)]
        assert list(convert(pd.DataFrame, csv).columns) == ['name', 'val']
        assert list(convert(pd.DataFrame, csv, dshape=ds).columns) == ['name', 'bal']

def test_pandas_loads_in_datetimes_naively():
    with filetext('name,when\nAlice,2014-01-01\nBob,2014-02-02') as fn:
        csv = CSV(fn, has_header=True)
        ds = datashape.dshape('var * {name: ?string, when: ?datetime}')
        assert discover(csv) == ds

        df = convert(pd.DataFrame, csv)
        assert df.dtypes['when'] == 'M8[ns]'


def test_pandas_read_supports_gzip():
    with filetext('Alice,1\nBob,2', open=gzip.open, mode='wt',
                  extension='.csv.gz') as fn:
        ds = datashape.dshape('var * {name: string, amount: int}')
        csv = CSV(fn)
        df = csv_to_dataframe(csv, dshape=ds)
        assert isinstance(df, pd.DataFrame)
        assert convert(list, df) == [('Alice', 1), ('Bob', 2)]
        assert list(df.columns) == ['name', 'amount']

def test_into_double_string(f):
    with filetext('alice,1\nbob,2', extension='.csv') as source:
        assert odo(source, list) == [('alice', 1), ('bob', 2)]

        with tmpfile('.csv') as target:
            csv = odo(source, f(target))
            assert isinstance(csv, CSV)
            # use a distinct name so the file handle does not shadow the
            # `f` parameter
            with open(target, 'rU') as fp:
                assert 'alice' in fp.read()

def test_ssh_csv_to_s3_csv():
    # for some reason this can only be run in the same file as other ssh tests
    # and must be a Temp(SSH(CSV)) otherwise tests above this one fail
    s3_bucket = pytest.importorskip('odo.backends.tests.test_aws').s3_bucket

    with filetext('name,balance\nAlice,100\nBob,200', extension='csv') as fn:
        remote = into(Temp(SSH(CSV)), CSV(fn), hostname='localhost')
        with s3_bucket('.csv') as b:
            result = into(b, remote)
            assert discover(result) == discover(resource(b))


def test_hive_creation_from_local_file():
    with filetext(accounts_1_csv, extension='csv') as fn:
        with hive_table(host) as uri:
            t = into(uri, fn, **auth)
            assert isinstance(t, sa.Table)
            assert into(set, t) == into(set, fn)

            t2 = into(uri, fn, **auth)
            assert isinstance(t2, sa.Table)
            assert len(into(list, t2)) == 2 * len(into(list, fn))

def test_different_encoding_to_csv():
    with tmpfile('db') as dbfilename:
        with filetext('a,b\n1,2\n3,4', extension='csv') as csvfilename:
            t = odo(
                csvfilename,
                'sqlite:///%s::mytable' % dbfilename,
                encoding='latin1'
            )
            with tmpfile('.csv') as fn:
                with pytest.raises(ValueError):
                    odo(t, fn, encoding='latin1')


def test_send_parameterized_query_to_csv():
    with tmpfile('db') as dbfilename:
        with filetext('a,b\n1,2\n3,4', extension='csv') as csvfilename:
            t = odo(
                csvfilename,
                'sqlite:///%s::mytable' % dbfilename,
            )
            with tmpfile('.csv') as fn:
                q = t.select(t.c.a == 1)
                r = odo(q, fn)
                assert sorted(odo(q, list)) == sorted(odo(r, list))


def accounts_ssh():
    """ Three csv files on the remote host in a directory """
    dirname = str(uuid.uuid1())
    conn = sftp(**auth)
    conn.mkdir(dirname)
    with filetext(accounts_1_csv) as fn:
        conn.put(fn, dirname + '/accounts.1.csv')
    with filetext(accounts_2_csv) as fn:
        conn.put(fn, dirname + '/accounts.2.csv')
    with filetext(accounts_3_csv) as fn:
        conn.put(fn, dirname + '/accounts.3.csv')

    filenames = [dirname + '/accounts.%d.csv' % i for i in [1, 2, 3]]
    uris = ['ssh://ubuntu@%s:%s' % (host, fn) for fn in filenames]

    try:
        yield 'ssh://ubuntu@%s:%s/*.csv' % (host, dirname), uris
    finally:
        for fn in filenames:
            conn.remove(fn)
        conn.rmdir(dirname)


def test_convert_through_temporary_local_storage():
    with filetext('name,quantity\nAlice,100\nBob,200', extension='csv') as fn:
        csv = CSV(fn)
        df = into(pd.DataFrame, csv)
        scsv = into(Temp(SSH(CSV)), csv, hostname='localhost')
        assert into(list, csv) == into(list, scsv)

        scsv2 = into(Temp(SSH(CSV)), df, hostname='localhost')
        assert into(list, scsv2) == into(list, df)

        sjson = into(Temp(SSH(JSONLines)), df, hostname='localhost')
        assert (into(np.ndarray, sjson) == into(np.ndarray, df)).all()


def test_dialect_of():
    with filetext(accounts_1_csv) as fn:
        d = dialect_of(CSV(fn))
        assert d['delimiter'] == ','
        assert d['has_header'] is True

    with accounts_data() as (directory, (a, b, c)):
        directory2 = HDFS(Directory(CSV))(directory.path, hdfs=directory.hdfs)
        d = dialect_of(directory2)
        assert d['has_header'] is True

        directory2 = HDFS(Directory(CSV))(directory.path, hdfs=directory.hdfs,
                                           has_header=False)
        d = dialect_of(directory2)
        assert d['has_header'] is False


def test_copy_remote_csv():
    with tmpfile('csv') as target:
        with filetext('name,balance\nAlice,100\nBob,200',
                      extension='csv') as fn:
            csv = resource(fn)
            uri = 'ssh://localhost:%s.csv' % target
            scsv = into(uri, csv)
            assert isinstance(scsv, SSH(CSV))
            assert discover(scsv) == discover(csv)

            # Round trip
            csv2 = into(target, scsv)
            assert into(list, csv) == into(list, csv2)


def test_drop():
    with filetext('name,balance\nAlice,100\nBob,200', extension='csv') as fn:
        with tmpfile('csv') as target:
            scsv = SSH(CSV)(target, hostname='localhost')

            assert not os.path.exists(target)

            conn = sftp(**scsv.auth)
            conn.put(fn, target)

            assert os.path.exists(target)

            drop(scsv)
            drop(scsv)

            assert not os.path.exists(target)


def test_sqlite_to_csv(sep, header):
    with tmpfile('db') as dbfilename:
        with filetext('a,b\n1,2\n3,4', extension='csv') as csvfilename:
            t = odo(csvfilename, 'sqlite:///%s::mytable' % dbfilename)

            with tmpfile('.csv') as fn:
                odo(t, fn, header=header, delimiter=sep)
                with open(fn, 'rt') as f:
                    lines = f.readlines()
                expected = [
                    tuple(map(int, row))
                    for row in map(lambda x: x.split(sep), lines[header:])
                ]
                assert odo(fn, list, delimiter=sep, has_header=header,
                           dshape=discover(t)) == expected

def test_unicode_column_names():
    with filetext(b'f\xc3\xbc,a\n1,2\n3,4', extension='csv', mode='wb') as fn:
        df = into(pd.DataFrame, CSV(fn, has_header=True))

    expected = pd.DataFrame([(1, 2), (3, 4)],
                            columns=[b'f\xc3\xbc'.decode('utf8'), u'a'])
    tm.assert_frame_equal(df, expected)


def test_csv_missing_values():
    with filetext('name,val\nAlice,100\nNA,200', extension='csv') as fn:
        csv = CSV(fn)
        assert discover(csv).measure.dict['name'] == Option(string)


def test_csv_separator_header():
    with filetext('a|b|c\n1|2|3\n4|5|6', extension='csv') as fn:
        csv = CSV(fn, delimiter='|', has_header=True)
        assert convert(list, csv) == [(1, 2, 3), (4, 5, 6)]


def test_into_string_on_right():
    with filetext('alice,1\nbob,2', extension='.csv') as source:
        assert into([], source) == [('alice', 1), ('bob', 2)]


def test_into_string_on_right(f):
    with filetext('alice,1\nbob,2', extension='.csv') as source:
        assert odo(f(source), []) == [('alice', 1), ('bob', 2)]

def test_unicode_column_names():
    with filetext('foo\xc4\x87,a\n1,2\n3,4', extension='csv') as fn:
        csv = CSV(fn, has_header=True)
        df = into(pd.DataFrame, csv)

def test_pandas_read_supports_whitespace_strings():
    with filetext('a,b, \n1,2, \n2,3, \n', extension='csv') as fn:
        csv = CSV(fn)
        ds = discover(csv)
        assert ds == datashape.dshape("var * {a: int64, b: int64, '': ?string}")


def test_header_argument_set_with_or_without_header():
    with filetext('name,val\nAlice,100\nBob,200', extension='csv') as fn:
        assert into(list, fn) == [('Alice', 100), ('Bob', 200)]

    with filetext('Alice,100\nBob,200', extension='csv') as fn:
        assert into(list, fn) == [('Alice', 100), ('Bob', 200)]


def test_discover():
    with filetext('name,balance\nAlice,100\nBob,200') as fn:
        local = CSV(fn)
        remote = SSH(CSV)(fn, hostname='localhost')

        assert discover(local) == discover(remote)


def test_discover_csv_yields_string_on_totally_empty_columns():
    expected = dshape('var * {a: int64, b: ?string, c: int64}')
    with filetext('a,b,c\n1,,3\n4,,6\n7,,9') as fn:
        csv = CSV(fn, has_header=True)
        assert discover(csv) == expected


def test_discover_csv_without_columns():
    with filetext('Alice,100\nBob,200', extension='csv') as fn:
        csv = CSV(fn)
        ds = discover(csv)
        assert '100' not in str(ds)


def test_discover_csv_files_without_header():
    with filetext('Alice,2014-01-01\nBob,2014-02-02') as fn:
        csv = CSV(fn, has_header=False)
        df = convert(pd.DataFrame, csv)
        assert len(df) == 2
        assert 'Alice' not in list(df.columns)


def test_csv_into_list():
    with filetext('name,val\nAlice,100\nBob,200', extension='csv') as fn:
        L = into(list, fn)
        assert L == [('Alice', 100), ('Bob', 200)]


def test_pandas_discover_on_gzipped_files():
    with filetext('name,when\nAlice,2014-01-01\nBob,2014-02-02',
                  open=gzip.open, mode='wt', extension='.csv.gz') as fn:
        csv = CSV(fn, has_header=True)
        ds = datashape.dshape('var * {name: ?string, when: ?datetime}')
        assert discover(csv) == ds


def test_convert():
    with filetext('Hello\nWorld') as fn:
        assert convert(list, TextFile(fn)) == ['Hello\n', 'World']


def test_drop():
    with filetext('hello\nworld') as fn:
        t = TextFile(fn)
        assert os.path.exists(fn)
        drop(t)
        assert not os.path.exists(fn)

def test_infer_header():
    with filetext('name,val\nAlice,100\nNA,200', extension='csv') as fn:
        assert infer_header(CSV(fn).path, 100) == True
    with filetext('Alice,100\nNA,200', extension='csv') as fn:
        assert infer_header(CSV(fn).path, 100) == False


def test_raise_errors_quickly_on_into_chunks_dataframe():
    with filetext('name,val\nAlice,100\nBob,foo', extension='csv') as fn:
        ds = datashape.dshape('var * {name: string, val: int}')
        csv = CSV(fn, header=True)
        assert raises(Exception,
                      lambda: CSV_to_chunks_of_dataframes(csv, dshape=ds))


def test_csv_infer_header():
    with tmpfile('db') as dbfilename:
        with filetext('a,b\n1,2\n3,4', extension='csv') as csvfilename:
            t = odo(csvfilename, 'sqlite:///%s::mytable' % dbfilename)
            assert discover(t) == dshape('var * {a: int64, b: int64}')
            assert odo(t, set) == set([(1, 2), (3, 4)])


def test_unused_datetime_columns():
    ds = datashape.dshape('var * {val: string, when: datetime}')
    with filetext("val,when\na,2000-01-01\nb,2000-02-02") as fn:
        csv = CSV(fn, has_header=True)
        assert convert(list, csv_to_dataframe(csv, usecols=['val'],
                                              squeeze=True,
                                              dshape=ds)) == ['a', 'b']


def test_convert_local_file_to_temp_ssh_file():
    with filetext('name,balance\nAlice,100\nBob,200', extension='csv') as fn:
        csv = CSV(fn)
        scsv = convert(Temp(SSH(CSV)), csv, hostname='localhost')

        assert into(list, csv) == into(list, scsv)


def test_empty_dataframe():
    with filetext('name,val', extension='csv') as fn:
        csv = CSV(fn, has_header=True)
        df = convert(pd.DataFrame, csv)
        assert isinstance(df, pd.DataFrame)


def test_discover_csv_with_spaces_in_header():
    with filetext(' name, val\nAlice,100\nBob,200', extension='csv') as fn:
        ds = discover(CSV(fn, has_header=True))
        assert ds.measure.names == ['name', 'val']


def test_discover_from_resource():
    with filetext('name,balance\nAlice,100\nBob,200', extension='csv') as fn:
        local = CSV(fn)
        remote = resource('ssh://localhost:' + fn)

        assert discover(local) == discover(remote)