Example #1
0
def test_csv_reader_buffer_strings():
    """Check read_csv_strings on in-memory text and byte buffers.

    The same four-row CSV (plus a header line that is skipped via
    ``skiprows=1``) is fed once as a ``StringIO`` and once as a ``BytesIO``.
    For each source the test asserts that two columns come back, that the
    first is an ``nvstrings.nvstrings`` and the second a ``cudf.Series``,
    and that the text column holds 'a', 'b', 'c', 'd' in order.
    """
    header = ['text', 'int']
    column_types = ['str', 'int']
    expected_text = ['a', 'b', 'c', 'd']

    rows = [','.join(header)] + [letter + ',0' for letter in expected_text]
    csv_text = '\n'.join(rows)

    # Text buffer first, then the encoded byte buffer; both must parse alike.
    for source in (StringIO(csv_text), BytesIO(csv_text.encode())):
        columns = read_csv_strings(source,
                                   names=header,
                                   dtype=column_types,
                                   skiprows=1)
        assert len(columns) == 2
        assert type(columns[0]) == nvstrings.nvstrings
        assert type(columns[1]) == cudf.Series

        text_column = columns[0]
        for position, letter in enumerate(expected_text):
            # Pull a single element to the host and compare it.
            assert text_column.sublist([position]).to_host()[0] == letter
Example #2
0
def test_csv_reader_gzip_compression_strings(tmpdir):
    """Check read_csv_strings on a gzip-compressed CSV file.

    A plain CSV is written under ``tmpdir``, copied into a ``.gz`` file, and
    the compressed file is read with ``compression='gzip'``. The test asserts
    that two columns come back (``nvstrings.nvstrings`` and ``cudf.Series``)
    and that the text column holds 'a', 'b', 'c', 'd' in order.

    Parameters
    ----------
    tmpdir
        Temporary directory object providing ``mkdir``/``join``
        (presumably the pytest ``tmpdir`` fixture).
    """
    workdir = tmpdir.mkdir("gdf_csv")
    plain_path = str(workdir.join("tmp_csvreader_file15.csv"))
    gzip_path = str(workdir.join("tmp_csvreader_file15.csv.gz"))

    header = ['text', 'int']
    column_types = ['str', 'int']
    expected_text = ['a', 'b', 'c', 'd']
    rows = [','.join(header)] + [letter + ',0' for letter in expected_text]

    # Write the uncompressed file, then stream it into the gzip archive.
    with open(plain_path, 'w') as plain_out:
        plain_out.write('\n'.join(rows))

    with open(plain_path, 'rb') as raw_in, \
            gzip.open(gzip_path, 'wb') as packed_out:
        shutil.copyfileobj(raw_in, packed_out)

    columns = read_csv_strings(gzip_path,
                               names=header,
                               dtype=column_types,
                               skiprows=1,
                               decimal='.',
                               thousands="'",
                               compression='gzip')

    assert len(columns) == 2
    assert type(columns[0]) == nvstrings.nvstrings
    assert type(columns[1]) == cudf.Series

    text_column = columns[0]
    for position, letter in enumerate(expected_text):
        assert text_column.sublist([position]).to_host()[0] == letter
Example #3
0
def test_csv_reader_pd_consistent_quotes(quoting):
    """Compare read_csv_strings with pandas for quoted string fields.

    The same buffer, containing quoted values with doubled quotes and an
    embedded newline, is parsed by ``read_csv_strings`` and by
    ``pd.read_csv`` using the same ``quoting`` value; the resulting text
    columns must be equal.

    Parameters
    ----------
    quoting
        Quoting mode forwarded unchanged to both readers (presumably
        supplied by a pytest parametrization that is not visible here).
    """
    header = ['text']
    csv_text = '\n'.join(['"a"', '"b ""c"" d"', '"f!\n."'])

    gpu_columns = read_csv_strings(StringIO(csv_text),
                                   names=header,
                                   dtype=['str'],
                                   quoting=quoting)
    pandas_frame = pd.read_csv(StringIO(csv_text),
                               names=header,
                               quoting=quoting)

    # Bring the device strings to the host and normalise each to str.
    host_values = list(map(str, gpu_columns[0].to_host()))
    np.testing.assert_array_equal(pandas_frame['text'], host_values)
Example #4
0
def test_csv_reader_strings(tmpdir):
    """Check read_csv_strings on a CSV file with a string and an int column.

    A four-row CSV with a header line and a trailing newline is written
    under ``tmpdir`` and read back with ``skiprows=1``. The test asserts
    that two columns come back (``nvstrings.nvstrings`` and ``cudf.Series``)
    and that the text column holds 'a', 'b', 'c', 'd' in order.

    Parameters
    ----------
    tmpdir
        Temporary directory object providing ``mkdir``/``join``
        (presumably the pytest ``tmpdir`` fixture).
    """
    csv_path = str(tmpdir.mkdir("gdf_csv").join("tmp_csvreader_file7.csv"))

    header = ['text', 'int']
    column_types = ['str', 'int']
    expected_text = ['a', 'b', 'c', 'd']
    rows = [','.join(header)] + [letter + ',0' for letter in expected_text]

    with open(csv_path, 'w') as out:
        # Unlike a bare join, the file ends with a newline.
        out.write('\n'.join(rows) + '\n')

    columns = read_csv_strings(csv_path,
                               names=header,
                               dtype=column_types,
                               skiprows=1)

    assert len(columns) == 2
    assert type(columns[0]) == nvstrings.nvstrings
    assert type(columns[1]) == cudf.Series

    text_column = columns[0]
    for position, letter in enumerate(expected_text):
        assert text_column.sublist([position]).to_host()[0] == letter
Example #5
0
def test_csv_quotednumbers(tmpdir):
    """Check that numeric fields wrapped in double quotes still parse.

    The file mixes quoted and unquoted integers and decimals. It is read
    with both ``read_csv`` and ``read_csv_strings`` using ``int32`` and
    ``float32`` dtypes, and each resulting column is compared against
    reference values with ``np.testing.assert_allclose``.

    Parameters
    ----------
    tmpdir
        Temporary directory object providing ``mkdir``/``join``
        (presumably the pytest ``tmpdir`` fixture).
    """
    csv_path = str(tmpdir.mkdir("gdf_csv").join("tmp_csvreader_file12.csv"))

    header = ['integer', 'decimal']
    column_types = ['int32', 'float32']
    rows = [
        ','.join(header),
        '1,"3.14"',
        '"2","300"',
        '"3",10101.0101',
        '4,"6.28318"',
    ]

    with open(csv_path, 'w') as out:
        out.write('\n'.join(rows))

    expected_integers = [1, 2, 3, 4]
    expected_decimals = [3.14, 300, 10101.0101, 6.28318]

    reader_kwargs = dict(names=header, dtype=column_types, skiprows=1)
    frame = read_csv(csv_path, **reader_kwargs)
    columns = read_csv_strings(csv_path, **reader_kwargs)

    assert len(columns) == 2
    # read_csv result is accessed by column name, read_csv_strings by position.
    np.testing.assert_allclose(expected_integers, frame['integer'])
    np.testing.assert_allclose(expected_decimals, frame['decimal'])
    np.testing.assert_allclose(expected_integers, columns[0])
    np.testing.assert_allclose(expected_decimals, columns[1])
Example #6
0
def test_csv_reader_strings_quotechars(tmpdir):
    """Check read_csv_strings on quoted fields containing special characters.

    The text column includes quoted values with embedded commas, an embedded
    newline and doubled quote characters, plus one unquoted value. The file
    is read with ``quotechar='"'`` and ``quoting=1``; the test asserts the
    column types and that each text value comes back with the surrounding
    quotes removed and doubled quotes collapsed.

    Parameters
    ----------
    tmpdir
        Temporary directory object providing ``mkdir``/``join``
        (presumably the pytest ``tmpdir`` fixture).
    """
    csv_path = str(tmpdir.mkdir("gdf_csv").join("tmp_csvreader_file8.csv"))

    header = ['text', 'int']
    column_types = ['str', 'int']
    rows = [
        ','.join(header),
        '"a,\n",0',
        '"b ""c"" d",0',
        'e,0',
        '"f,,!.,",0',
    ]
    # Parsed form of the text field on each data row, in file order.
    expected_text = ['a,\n', 'b "c" d', 'e', 'f,,!.,']

    with open(csv_path, 'w') as out:
        out.write('\n'.join(rows))

    columns = read_csv_strings(csv_path,
                               names=header,
                               dtype=column_types,
                               skiprows=1,
                               quotechar='\"',
                               quoting=1)

    assert len(columns) == 2
    assert type(columns[0]) == nvstrings.nvstrings
    assert type(columns[1]) == cudf.Series

    text_column = columns[0]
    for position, value in enumerate(expected_text):
        assert text_column.sublist([position]).to_host()[0] == value