def test_csv_reader_buffer_strings():
    """Read a string column from in-memory text and byte buffers.

    The same CSV payload is handed to ``read_csv_strings`` first wrapped
    in a ``StringIO`` and then in a ``BytesIO``. In both cases the result
    must hold two columns: the text column as ``nvstrings`` and the
    integer column as a ``cudf.Series``.
    """
    names = ['text', 'int']
    dtypes = ['str', 'int']
    # The first line is the header; it is skipped below via skiprows=1.
    lines = [','.join(names), 'a,0', 'b,0', 'c,0', 'd,0']
    buffer = '\n'.join(lines)
    expected_text = ['a', 'b', 'c', 'd']

    def check_columns(columns):
        # Shared checks for both buffer flavours.
        assert (len(columns) == 2)
        assert (type(columns[0]) == nvstrings.nvstrings)
        assert (type(columns[1]) == cudf.Series)
        for row, value in enumerate(expected_text):
            assert (columns[0].sublist([row]).to_host()[0] == value)

    text_source = StringIO(buffer)
    cols_str = read_csv_strings(
        text_source, names=names, dtype=dtypes, skiprows=1
    )
    check_columns(cols_str)

    byte_source = BytesIO(buffer.encode())
    cols_bytes = read_csv_strings(
        byte_source, names=names, dtype=dtypes, skiprows=1
    )
    check_columns(cols_bytes)
def test_csv_reader_gzip_compression_strings(tmpdir):
    """Read a string column from a gzip-compressed CSV file.

    A plain CSV file is written under ``tmpdir``, copied into a ``.gz``
    sibling, and the compressed file is then read with
    ``compression='gzip'``.
    """
    out_dir = tmpdir.mkdir("gdf_csv")
    fname = out_dir.join("tmp_csvreader_file15.csv")
    fnamez = out_dir.join("tmp_csvreader_file15.csv.gz")

    names = ['text', 'int']
    dtypes = ['str', 'int']
    # The first line is the header; it is skipped below via skiprows=1.
    lines = [','.join(names), 'a,0', 'b,0', 'c,0', 'd,0']
    payload = '\n'.join(lines)

    with open(str(fname), 'w') as plain_out:
        plain_out.write(payload)

    # Compress the file that was just written into its .gz counterpart.
    with open(str(fname), 'rb') as raw_in:
        with gzip.open(str(fnamez), 'wb') as gz_out:
            shutil.copyfileobj(raw_in, gz_out)

    cols = read_csv_strings(
        str(fnamez),
        names=names,
        dtype=dtypes,
        skiprows=1,
        decimal='.',
        thousands="'",
        compression='gzip',
    )

    assert (len(cols) == 2)
    assert (type(cols[0]) == nvstrings.nvstrings)
    assert (type(cols[1]) == cudf.Series)
    for row, value in enumerate(['a', 'b', 'c', 'd']):
        assert (cols[0].sublist([row]).to_host()[0] == value)
def test_csv_reader_pd_consistent_quotes(quoting):
    """Compare quoted-field parsing against pandas for one quoting mode.

    ``quoting`` is forwarded unchanged to both ``read_csv_strings`` and
    ``pandas.read_csv``. It is presumably supplied by a pytest
    parametrization or fixture defined outside this block -- confirm.
    """
    names = ['text']
    dtypes = ['str']
    # Covers a plain quoted value, doubled quotes inside a field, and a
    # newline embedded inside a quoted field.
    lines = ['"a"', '"b ""c"" d"', '"f!\n."']
    buffer = '\n'.join(lines)

    cu_cols = read_csv_strings(
        StringIO(buffer), names=names, dtype=dtypes, quoting=quoting
    )
    pd_df = pd.read_csv(StringIO(buffer), names=names, quoting=quoting)

    # Convert every host-side element with str() before comparing.
    host_values = cu_cols[0].to_host()
    col = list(map(str, host_values))
    np.testing.assert_array_equal(pd_df['text'], col)
def test_csv_reader_strings(tmpdir):
    """Read a string column and an integer column from a CSV file.

    The file is written with a trailing newline after the last row and
    read back through ``read_csv_strings``.
    """
    fname = tmpdir.mkdir("gdf_csv").join("tmp_csvreader_file7.csv")

    names = ['text', 'int']
    dtypes = ['str', 'int']
    # The first line is the header; it is skipped below via skiprows=1.
    lines = [','.join(names), 'a,0', 'b,0', 'c,0', 'd,0']
    payload = '\n'.join(lines) + '\n'

    with open(str(fname), 'w') as csv_file:
        csv_file.write(payload)

    cols = read_csv_strings(
        str(fname), names=names, dtype=dtypes, skiprows=1
    )

    assert (len(cols) == 2)
    assert (type(cols[0]) == nvstrings.nvstrings)
    assert (type(cols[1]) == cudf.Series)
    for row, value in enumerate(['a', 'b', 'c', 'd']):
        assert (cols[0].sublist([row]).to_host()[0] == value)
def test_csv_quotednumbers(tmpdir):
    """Check numeric fields parse the same whether or not they are quoted.

    The file mixes quoted and unquoted integers and decimals. It is read
    once with ``read_csv`` (columns looked up by name) and once with
    ``read_csv_strings`` (columns looked up by position); both results
    are compared against the same reference values.
    """
    fname = tmpdir.mkdir("gdf_csv").join("tmp_csvreader_file12.csv")

    names = ['integer', 'decimal']
    dtypes = ['int32', 'float32']
    # The first line is the header; it is skipped below via skiprows=1.
    lines = [
        ','.join(names),
        '1,"3.14"',
        '"2","300"',
        '"3",10101.0101',
        '4,"6.28318"',
    ]
    payload = '\n'.join(lines)

    with open(str(fname), 'w') as csv_file:
        csv_file.write(payload)

    integer_ref = [1, 2, 3, 4]
    decimal_ref = [3.14, 300, 10101.0101, 6.28318]
    # Reference values in the same order as ``names``.
    references = [integer_ref, decimal_ref]

    cols1 = read_csv(str(fname), names=names, dtype=dtypes, skiprows=1)
    cols2 = read_csv_strings(
        str(fname), names=names, dtype=dtypes, skiprows=1
    )

    assert (len(cols2) == 2)
    for column_name, reference in zip(names, references):
        np.testing.assert_allclose(reference, cols1[column_name])
    for position, reference in enumerate(references):
        np.testing.assert_allclose(reference, cols2[position])
def test_csv_reader_strings_quotechars(tmpdir):
    """Read quoted string fields containing delimiters, quotes, newlines.

    The text column includes a field with an embedded comma and newline,
    a field with doubled quote characters, an unquoted field, and a field
    with several embedded commas. Each must come back with the
    surrounding quotes removed and doubled quotes collapsed.
    """
    fname = tmpdir.mkdir("gdf_csv").join("tmp_csvreader_file8.csv")

    names = ['text', 'int']
    dtypes = ['str', 'int']
    # The first line is the header; it is skipped below via skiprows=1.
    lines = [
        ','.join(names),
        '"a,\n",0',
        '"b ""c"" d",0',
        'e,0',
        '"f,,!.,",0',
    ]
    payload = '\n'.join(lines)

    with open(str(fname), 'w') as csv_file:
        csv_file.write(payload)

    # NOTE(review): quoting=1 presumably matches csv.QUOTE_ALL -- confirm.
    cols = read_csv_strings(
        str(fname),
        names=names,
        dtype=dtypes,
        skiprows=1,
        quotechar='\"',
        quoting=1,
    )

    assert (len(cols) == 2)
    assert (type(cols[0]) == nvstrings.nvstrings)
    assert (type(cols[1]) == cudf.Series)

    expected_text = ['a,\n', 'b "c" d', 'e', 'f,,!.,']
    for row, value in enumerate(expected_text):
        assert (cols[0].sublist([row]).to_host()[0] == value)