def test_csv_with_trailing_commas():
    """Headers ending in a trailing comma discover a sensible last column."""
    # Trailing space after the last comma: the final column name is ''.
    with tmpfile('.csv') as path:
        with open(path, 'wt') as handle:
            # note the trailing space in the header
            handle.write('a,b,c, \n1, 2, 3, ')
        parsed = CSV(path)
        assert expr_repr(data(path))
        assert discover(parsed).measure.names == ['a', 'b', 'c', '']
    # NO trailing space in the header: pandas labels it 'Unnamed: 3'.
    with tmpfile('.csv') as path:
        with open(path, 'wt') as handle:
            handle.write('a,b,c,\n1, 2, 3, ')
        parsed = CSV(path)
        assert expr_repr(data(path))
        assert discover(parsed).measure.names == ['a', 'b', 'c', 'Unnamed: 3']
def test_csv():
    """Keyword arguments given to CSV() land in its dialect mapping."""
    with tmpfile('.csv') as path:
        source = CSV(path,
                     dshape='var * {name: string, amount: int}',
                     delimiter=',')
        assert source.dialect['delimiter'] == ','
def test_header_mix_str_digits():
    """Discovery keeps header labels that mix words and bare digits.

    Column names such as "1990" must survive both discovery and
    DataFrame conversion.
    """
    ds = datashape.dshape(
        '''var * {"On- or Off- Budget": ?string, "1990": ?string}''')
    with filetext('On- or Off- Budget,1990\n'
                  'On Budget,-628\n'
                  'Off budget,"5,962"\n') as fn:
        csv = CSV(fn, has_header=True)
        # The conversion must succeed with these header names; the frame
        # itself is not inspected, so don't bind it to an unused local.
        convert(pd.DataFrame, csv)
        assert discover(csv).measure == ds.measure
def test_pandas_read_supports_read_csv_kwargs():
    """Extra keyword arguments (e.g. usecols) pass through to pandas."""
    with filetext('Alice,1\nBob,2') as fn:
        schema = datashape.dshape('var * {name: string, amount: int}')
        frame = csv_to_dataframe(CSV(fn), dshape=schema, usecols=['name'])
        assert isinstance(frame, pd.DataFrame)
        assert convert(list, frame) == [('Alice',), ('Bob',)]
def test_temp_ssh_files():
    """A local CSV round-trips into a temporary CSV on a remote host."""
    with filetext('name,balance\nAlice,100\nBob,200', extension='csv') as fn:
        local = CSV(fn)
        remote = into(Temp(SSH(CSV)), local, hostname='localhost')
        assert discover(local) == discover(remote)
        assert isinstance(remote, _Temp)
def test_more_unicode_column_names():
    """Non-ASCII bytes in the header decode into unicode column labels."""
    with filetext(b'foo\xc4\x87,a\n1,2\n3,4', extension='csv',
                  mode='wb') as fn:
        result = into(pd.DataFrame, CSV(fn, has_header=True))
        expected = pd.DataFrame(
            [(1, 2), (3, 4)],
            columns=[b'foo\xc4\x87'.decode('utf8'), u'a'])
        tm.assert_frame_equal(result, expected)
def test_pandas_loads_in_datetimes_naively():
    """Datetime columns discover as ?datetime and load as numpy M8[ns]."""
    with filetext('name,when\nAlice,2014-01-01\nBob,2014-02-02') as fn:
        source = CSV(fn, has_header=True)
        expected = datashape.dshape('var * {name: ?string, when: ?datetime}')
        assert discover(source) == expected
        frame = convert(pd.DataFrame, source)
        assert frame.dtypes['when'] == 'M8[ns]'
def test_pandas_discover_on_gzipped_files():
    """Discovery sees through gzip compression on .csv.gz files."""
    with filetext('name,when\nAlice,2014-01-01\nBob,2014-02-02',
                  open=gzip.open, mode='wt', extension='.csv.gz') as fn:
        source = CSV(fn, has_header=True)
        expected = datashape.dshape('var * {name: ?string, when: ?datetime}')
        assert discover(source) == expected
def test_unused_datetime_columns():
    """usecols can drop a datetime column without parsing it.

    Selecting only `val` with squeeze=True collapses the result to a
    single sequence of strings.
    """
    ds = datashape.dshape('var * {val: string, when: datetime}')
    with filetext("val,when\na,2000-01-01\nb,2000-02-02") as fn:
        csv = CSV(fn, has_header=True)
        # Use the lowercase helper name used by every other test in this
        # file (csv_to_dataframe), not the inconsistent csv_to_DataFrame.
        frame = csv_to_dataframe(csv, usecols=['val'], squeeze=True,
                                 dshape=ds)
        assert convert(list, frame) == ['a', 'b']
def test_pandas_read():
    """Basic CSV -> DataFrame conversion honours the declared dshape."""
    with filetext('Alice,1\nBob,2') as fn:
        schema = datashape.dshape('var * {name: string, amount: int}')
        frame = csv_to_dataframe(CSV(fn), dshape=schema)
        assert isinstance(frame, pd.DataFrame)
        assert convert(list, frame) == [('Alice', 1), ('Bob', 2)]
        assert list(frame.columns) == ['name', 'amount']
def test_pandas_read_supports_datetimes():
    """A `date` field in the dshape yields a datetime64[ns] column."""
    with filetext('Alice,2014-01-02\nBob,2014-01-03') as fn:
        schema = datashape.dshape('var * {name: string, when: date}')
        frame = csv_to_dataframe(CSV(fn), dshape=schema)
        assert isinstance(frame, pd.DataFrame)
        assert list(frame.columns) == ['name', 'when']
        assert frame.dtypes['when'] == 'M8[ns]'
def test_pandas_read_supports_missing_integers():
    """An optional int32 column containing missing data loads as float32."""
    with filetext('Alice,1\nBob,') as fn:
        schema = datashape.dshape('var * {name: string, val: ?int32}')
        frame = csv_to_dataframe(CSV(fn), dshape=schema)
        assert isinstance(frame, pd.DataFrame)
        assert list(frame.columns) == ['name', 'val']
        # NaN cannot live in an integer column, so pandas promotes the
        # optional int32 to float32 ('f4').
        assert frame.dtypes['val'] == 'f4'
def test_pandas_writes_header_by_default():
    """Appending to a fresh CSV writes the column header."""
    with tmpfile('.csv') as fn:
        ds = datashape.dshape('var * {name: string, amount: int}')
        rows = [('Alice', 1), ('Bob', 2)]
        target = CSV(fn)
        append(target, rows, dshape=ds)
        with open(fn) as handle:
            assert 'name' in handle.read()
def test_header_disagrees_with_dshape():
    """An explicit dshape overrides the column names read from the header."""
    ds = datashape.dshape('var * {name: string, bal: int64}')
    with filetext('name,val\nAlice,100\nBob,200', extension='csv') as fn:
        # Use `has_header`, the keyword every other test in this file
        # uses; a bare `header=True` would fall through into the CSV
        # dialect kwargs instead of flagging the header row.
        csv = CSV(fn, has_header=True)
        assert convert(list, csv) == [('Alice', 100), ('Bob', 200)]
        # Without a dshape the header names win ...
        assert list(convert(pd.DataFrame, csv).columns) == ['name', 'val']
        # ... with a dshape, its field names win.
        assert list(convert(pd.DataFrame, csv, dshape=ds).columns) == [
            'name', 'bal']
def test_table_resource():
    """data(filename) resolves to a CSV-backed interactive expression."""
    with tmpfile('csv') as filename:
        ds = dshape('var * {a: int, b: int}')
        backing = CSV(filename)
        append(backing, [[1, 2], [10, 20]], dshape=ds)
        t = data(filename)
        assert isinstance(t.data, CSV)
        assert into(list, compute(t)) == into(list, backing)
def test_pandas_read_supports_gzip():
    """csv_to_dataframe reads gzip-compressed files transparently."""
    with filetext('Alice,1\nBob,2', open=gzip.open, mode='wt',
                  extension='.csv.gz') as fn:
        schema = datashape.dshape('var * {name: string, amount: int}')
        frame = csv_to_dataframe(CSV(fn), dshape=schema)
        assert isinstance(frame, pd.DataFrame)
        assert convert(list, frame) == [('Alice', 1), ('Bob', 2)]
        assert list(frame.columns) == ['name', 'amount']
def test_ssh_csv_to_s3_csv():
    """Move a CSV from a remote SSH host into an S3 bucket."""
    # For some reason this can only be run in the same file as other ssh
    # tests and must be a Temp(SSH(CSV)), otherwise tests above this one
    # fail.
    s3_bucket = pytest.importorskip('odo.backends.tests.test_aws').s3_bucket
    with filetext('name,balance\nAlice,100\nBob,200', extension='csv') as fn:
        remote = into(Temp(SSH(CSV)), CSV(fn), hostname='localhost')
        with s3_bucket('.csv') as bucket:
            moved = into(bucket, remote)
            assert discover(moved) == discover(resource(bucket))
def dcsv():
    """Return a CSV over the bundled dummydata.csv with an explicit dshape."""
    path = os.path.join(os.path.dirname(__file__), 'dummydata.csv')
    # Whitespace inside the dshape literal is insignificant to the parser.
    schema = """var * {
        Name: string,
        RegistrationDate: date,
        ZipCode: int64,
        Consts: float64
    }"""
    return CSV(path, dshape=schema)
def test_header_with_quotes():
    """Quoted latin-1 headers in the bundled encoding.csv discover cleanly."""
    path = os.path.join(os.path.dirname(__file__), 'encoding.csv')
    source = CSV(path, encoding='latin1')
    # Whitespace inside the dshape literal is insignificant to the parser.
    expected = dshape("""var * {
        D_PROC: ?string,
        NUM_SEQ: int64,
        COD_TIP_RELAC: ?float64,
        COMPL: ?string,
        COD_ASSUNTO: int64
    }""")
    assert discover(source) == expected
def test_pandas_write_gzip():
    """Appending through a .csv.gz path writes gzip-compressed text.

    Verifies both the header and a data row are present in the
    decompressed output.
    """
    with tmpfile('.csv.gz') as fn:
        ds = datashape.dshape('var * {name: string, amount: int}')
        rows = [('Alice', 1), ('Bob', 2)]
        csv = CSV(fn, has_header=True)
        append(csv, rows, dshape=ds)
        # Open in text mode ('rt'): the original bare gzip.open(fn)
        # defaults to 'rb', so `'name' in s` mixed str and bytes and
        # raised TypeError on Python 3.  The context manager also
        # guarantees the handle is closed even if an assertion fails.
        with gzip.open(fn, mode='rt') as f:
            s = f.read()
        assert 'name' in s
        assert 'Alice,1' in s
def test_string_n_convert(string_dshape):
    """Parametrized string dshapes convert to the expected DataFrame."""
    lines = ['2015-03-13,FOO THE BAR', '2014-01-29,BAZ THE QUUX']
    ds = 'var * {k: date, n: %s}' % string_dshape
    with tmpfile('.csv') as fn:
        with open(fn, 'w') as handle:
            handle.write('\n'.join(lines))
        result = odo(CSV(fn, has_header=False), pd.DataFrame, dshape=ds)
        assert list(result.columns) == list('kn')
        expected = pd.DataFrame([tuple(line.split(',')) for line in lines],
                                columns=list('kn'))
        expected['k'] = pd.to_datetime(expected.k)
        tm.assert_frame_equal(result, expected)
def test_convert_through_temporary_local_storage():
    """Objects route through temporary local files on their way over SSH."""
    with filetext('name,quantity\nAlice,100\nBob,200', extension='csv') as fn:
        csv = CSV(fn)
        df = into(pd.DataFrame, csv)
        # local CSV -> remote CSV
        remote_csv = into(Temp(SSH(CSV)), csv, hostname='localhost')
        assert into(list, csv) == into(list, remote_csv)
        # DataFrame -> remote CSV
        remote_csv_from_df = into(Temp(SSH(CSV)), df, hostname='localhost')
        assert into(list, remote_csv_from_df) == into(list, df)
        # DataFrame -> remote JSON lines
        remote_json = into(Temp(SSH(JSONLines)), df, hostname='localhost')
        assert (into(np.ndarray, remote_json) == into(np.ndarray, df)).all()
def test_csv_append():
    """append() writes rows that read back identically via an Iterator."""
    with tmpfile('.csv') as fn:
        target = CSV(fn, has_header=False)
        rows = [('Alice', 100), ('Bob', 200)]
        append(target, rows)
        assert list(convert(Iterator, target)) == rows
        with open(fn) as handle:
            text = handle.read()
        assert 'Alice' in text
        assert '100' in text
def test_pandas_write():
    """The header is written once and not duplicated by later appends."""
    with tmpfile('.csv') as fn:
        ds = datashape.dshape('var * {name: string, amount: int}')
        rows = [('Alice', 1), ('Bob', 2)]
        target = CSV(fn, has_header=True)
        append(target, rows, dshape=ds)
        with open(fn) as handle:
            assert 'name' in handle.read()
        # Doesn't write header twice
        append(target, rows, dshape=ds)
        with open(fn) as handle:
            assert handle.read().count('name') == 1
def test_pandas_read_supports_whitespace_strings():
    """A whitespace-only trailing column discovers as an optional string."""
    with filetext('a,b, \n1,2, \n2,3, \n', extension='csv') as fn:
        discovered = discover(CSV(fn))
        expected = datashape.dshape(
            "var * {a: int64, b: int64, '': ?string}")
        assert discovered == expected
def test_multibyte_encoding_dialect(multibyte_csv):
    """Dialect sniffing copes with a window that may split multibyte text."""
    source = CSV(multibyte_csv, encoding='utf8', sniff_nbytes=10)
    assert source.dialect['delimiter'] == ','
def test_multibyte_encoding_header(multibyte_csv):
    """With too small a sniff window the header flag stays undecided."""
    source = CSV(multibyte_csv, encoding='utf8', sniff_nbytes=3)
    # not enough data to infer header
    assert source.has_header is None
def test_encoding_is_none():
    """encoding=None falls back to the 'utf-8' default."""
    with tmpfile('.csv') as fn:
        with open(fn, 'w') as handle:
            handle.write('a,1\nb,2\nc,3'.encode('utf-8').decode('utf-8'))
        assert CSV(fn, encoding=None).encoding == 'utf-8'
def test_has_header_on_tsv():
    """Header sniffing also works on tab-delimited content."""
    with tmpfile('.csv') as fn:
        with open(fn, 'wb') as handle:
            handle.write(b'a\tb\n1\t2\n3\t4')
        assert CSV(fn).has_header
def test_csv_supports_sep():
    """The pandas-style `sep` keyword maps onto dialect['delimiter']."""
    source = CSV('foo.csv', sep=';')
    assert source.dialect['delimiter'] == ';'