Exemple #1
0
def test_json_lines_byte_range(json_input):
    """Check which JSON Lines rows are parsed for several ``byte_range`` values.

    Every case is a ``(byte_range, expected_shape)`` pair. The input is
    deep-copied before each read and the cases run in the listed order.
    """
    cases = [
        # first row plus half of the second row:
        # should parse the first two rows
        ((0, 15), (2, 3)),
        # half of the second row plus half of the third row:
        # should parse only the third row
        ((15, 10), (1, 3)),
        # half of the second row plus the entire third row:
        # should parse only the third row
        ((15, 0), (1, 3)),
        # half of the second row until past the end of the file:
        # should parse only the third row
        ((10, 50), (1, 3)),
    ]

    for byte_range, expected_shape in cases:
        df = cudf.read_json(
            copy.deepcopy(json_input), lines=True, byte_range=byte_range
        )
        assert df.shape == expected_shape
Exemple #2
0
def test_json_reader(json_files):
    """Compare ``cudf.read_json`` with ``pandas.read_json`` for one fixture.

    ``json_files`` unpacks to ``(path_df, path_series, orient, compression)``.
    The DataFrame file is always compared; the Series file is compared only
    for the orients listed further down.
    """
    path_df, path_series, orient, compression = json_files
    expect_df = pd.read_json(path_df, orient=orient, compression=compression)
    got_df = cudf.read_json(path_df, orient=orient, compression=compression)
    # Empty results get a fresh default index before being compared.
    if len(expect_df) == 0:
        expect_df = expect_df.reset_index(drop=True)
        expect_df.columns = expect_df.columns.astype("object")
    if len(got_df) == 0:
        got_df = got_df.reset_index(drop=True)

    assert_eq(expect_df, got_df, check_categorical=False)

    # Only these orients are allowed for Series, but isn't enforced by Pandas
    if orient in ("split", "records", "index"):
        expect_series = pd.read_json(
            path_series, orient=orient, compression=compression, typ="series"
        )
        got_series = cudf.read_json(
            path_series, orient=orient, compression=compression, typ="series"
        )
        if len(expect_series) == 0:
            expect_series = expect_series.reset_index(drop=True)
        # Decide on the series' own length; the DataFrame's length says
        # nothing about whether the series read from path_series is empty.
        if len(got_series) == 0:
            got_series = got_series.reset_index(drop=True)

        assert_eq(expect_series, got_series)
Exemple #3
0
def test_json_bool_values():
    """JSON Lines booleans give the same dtypes and values as pandas."""
    buffer = "[true,1]\n[false,false]\n[true,true]"
    cu_df = cudf.read_json(buffer, lines=True)
    pd_df = pd.read_json(buffer, lines=True)

    # types should be ['bool', 'int64']
    np.testing.assert_array_equal(pd_df.dtypes, cu_df.dtypes)
    # column 0 holds booleans; in column 1 the boolean values should be
    # converted to 0/1
    for position in (0, 1):
        np.testing.assert_array_equal(
            pd_df[position], cu_df[str(position)].to_array()
        )

    # explicitly requested dtypes must match the pandas dtypes as well
    typed_cu_df = cudf.read_json(buffer, lines=True, dtype=["bool", "long"])
    np.testing.assert_array_equal(pd_df.dtypes, typed_cu_df.dtypes)
Exemple #4
0
def test_json_bool_values():
    """Booleans read from JSON Lines match pandas in dtype and value."""
    buffer = '[true,1]\n[false,false]\n[true,true]'
    cu_df = cudf.read_json(buffer, lines=True)
    pd_df = pd.read_json(buffer, lines=True)

    # types should be ['bool', 'int64']
    np.testing.assert_array_equal(pd_df.dtypes, cu_df.dtypes)
    # first column: plain booleans
    # second column: boolean values should be converted to 0/1
    for pd_name, cu_name in ((0, '0'), (1, '1')):
        np.testing.assert_array_equal(pd_df[pd_name], cu_df[cu_name])

    # same dtype check when the dtypes are requested explicitly
    typed_cu_df = cudf.read_json(buffer, lines=True, dtype=['bool', 'long'])
    np.testing.assert_array_equal(pd_df.dtypes, typed_cu_df.dtypes)
Exemple #5
0
    def fetch_data(self):
        """
        Read the configured input with cudf and return the result.

        ``input_format`` (compared case-insensitively) and ``input_path`` are
        taken from ``self.config``. For csv, parquet and json the remaining
        config entries (everything except ``type``, ``input_format`` and
        ``input_path``) are forwarded to the reader as keyword arguments;
        orc is read with ``engine="cudf"`` only.

        Sets ``self.has_data`` to ``False`` after a successful read.

        :raises KeyError: if ``type``, ``input_format`` or ``input_path`` is
            missing from ``self.config``
        :raises NotImplementedError: if ``input_format`` is not one of
            csv, parquet, orc or json
        """
        fmt = self.config["input_format"].lower()
        path = self.config["input_path"]

        # Work on a copy so the entries removed below stay in self.config.
        reader_kwargs = self.config.copy()
        for consumed_key in ("type", "input_format", "input_path"):
            del reader_kwargs[consumed_key]

        if fmt == "csv":
            result = cudf.read_csv(path, **reader_kwargs)
        elif fmt == "parquet":
            result = cudf.read_parquet(path, **reader_kwargs)
        elif fmt == "orc":
            result = cudf.read_orc(path, engine="cudf")
        elif fmt == "json":
            result = cudf.read_json(path, **reader_kwargs)
        else:
            raise NotImplementedError(
                "%s is not a supported input_format" % (fmt)
            )

        self.has_data = False
        return result
Exemple #6
0
def test_read_json(tmpdir, pdf, hdfs, test_url):
    """Upload a JSON Lines file to HDFS and compare cudf and pandas reads.

    ``test_url`` selects whether the HDFS path includes ``host:port``.
    """
    local_path = tmpdir.mkdir("json").join("test_json_reader.json")

    # Write to the local file system first.
    # Columns are sorted by name because pandas sorts by column name while
    # reading json.
    pdf.sort_index(axis=1).to_json(local_path, orient="records", lines=True)

    # Load the local file into an in-memory buffer and upload it to hdfs.
    with open(local_path, mode="rb") as local_file:
        payload = BytesIO(local_file.read())
    hdfs.upload(basedir + "/test_json_reader.json", payload)

    hd_fpath = (
        f"hdfs://{host}:{port}{basedir}/test_json_reader.json"
        if test_url
        else f"hdfs://{basedir}/test_json_reader.json"
    )

    got = cudf.read_json(hd_fpath, engine="cudf", orient="records", lines=True)

    # Read the same file back from hdfs with pandas.
    with hdfs.open(basedir + "/test_json_reader.json") as remote_file:
        expect = pd.read_json(remote_file, lines=True)

    assert_eq(expect, got)
Exemple #7
0
def test_json_read_directory(tmpdir, json_input, engine):
    """Read a directory holding three copies of the same JSON Lines file."""
    pdf = pd.read_json(json_input, lines=True)

    # Write the same frame three times under different names.
    for file_name in (
        "MultiInputs1.json",
        "MultiInputs2.json",
        "MultiInputs3.json",
    ):
        pdf.to_json(
            tmpdir.join(file_name),
            compression="infer",
            lines=True,
            orient="records",
        )

    cu_df = cudf.read_json(tmpdir, engine=engine, lines=True)
    pd_df = pd.concat([pdf] * 3)

    assert all(cu_df.dtypes == ["int64"] * 3)
    for cu_name, pd_name in zip(cu_df.columns, pd_df.columns):
        assert str(cu_name) == str(pd_name)
        np.testing.assert_array_equal(
            pd_df[pd_name], cu_df[cu_name].to_array()
        )
Exemple #8
0
def json_writer_test_params(pdf, compression, dtype):
    """Write ``pdf`` as JSON Lines with pandas and cudf, then compare re-reads.

    The cudf output is read back with ``cudf.read_json`` and the pandas
    output with ``pandas.read_json``; both reads use ``dtype``.
    """
    write_options = {
        "lines": True,
        "orient": "records",
        "compression": compression,
    }
    gdf = cudf.from_pandas(pdf)

    pdf_buffer = pdf.to_json(**write_options)
    gdf_buffer = gdf.to_json(**write_options)

    # TODO: Uncomment once this is fixed:
    # https://github.com/rapidsai/cudf/issues/6429
    # compare_content(pdf_buffer, gdf_buffer)

    actual = cudf.read_json(
        io.StringIO(gdf_buffer),
        engine="cudf",
        lines=True,
        orient="records",
        dtype=dtype,
    )
    expected = pd.read_json(
        io.StringIO(pdf_buffer), lines=True, orient="records", dtype=dtype
    )

    # cudf reads column names as strings only, unlike pandas, so the pandas
    # column labels are cast to str before comparing.
    expected.columns = expected.columns.astype("str")
    assert_eq(actual, expected)
Exemple #9
0
def test_json_bad_protocol_string():
    """A JSON Lines string whose value looks like a URL is read as data."""
    raw_json = '{"field": "s3://path"}'

    got = cudf.read_json(raw_json, lines=True)
    expect = pd.DataFrame([{"field": "s3://path"}])

    assert_eq(expect, got)
Exemple #10
0
def json_reader_test_params(json_buffer, dtype):
    """Read ``json_buffer`` with pandas and cudf using ``dtype`` and compare."""
    expected = pd.read_json(
        json_buffer, dtype=dtype, orient="records", lines=True
    )
    # Cast pandas column labels to str before comparing with the cudf result.
    expected.columns = expected.columns.astype("str")

    actual = cudf.read_json(
        json_buffer, dtype=dtype, engine="cudf", lines=True
    )

    assert_eq(actual, expected)
Exemple #11
0
def test_json_lines_basic(json_input, engine):
    """Basic JSON Lines read: dtypes, column names and values match pandas."""
    cu_df = cudf.read_json(json_input, engine=engine, lines=True)
    pd_df = pd.read_json(json_input, lines=True)

    assert all(cu_df.dtypes == ["int64"] * 3)
    for cu_name, pd_name in zip(cu_df.columns, pd_df.columns):
        # Column labels are compared by their string form.
        assert str(cu_name) == str(pd_name)
        np.testing.assert_array_equal(
            pd_df[pd_name], cu_df[cu_name].to_array()
        )
Exemple #12
0
def test_json_null_literal(buffer):
    """Check inferred dtypes and values for input with null/empty fields."""
    df = cudf.read_json(buffer, lines=True)

    # first column contains a null field, type should be set to float
    # second column contains only empty fields, type should be set to int8
    expected_dtypes = ["float64", "int8"]
    np.testing.assert_array_equal(df.dtypes, expected_dtypes)

    first_column = df["0"]
    np.testing.assert_array_equal(first_column, [None, 1.0])
    second_column = df["1"]
    np.testing.assert_array_equal(second_column, [None, None])
Exemple #13
0
def json_reader_test(json_buffer):
    """Parse the JSON Lines text with pandas and cudf and compare results."""
    expected = pd.read_json(
        io.StringIO(json_buffer), orient="records", lines=True
    )
    # cudf reads column names as strings only, unlike pandas, so the pandas
    # column labels are cast to str before comparing.
    expected.columns = expected.columns.astype("str")

    actual = cudf.read_json(
        io.StringIO(json_buffer), engine="cudf", lines=True
    )

    assert_eq(actual, expected)
Exemple #14
0
def test_json_null_literal(buffer):
    """Check inferred dtypes and null-filled values for null/empty fields."""
    df = cudf.read_json(buffer, lines=True)

    # first column contains a null field, type should be set to float
    # second column contains only empty fields, type should be set to int8
    np.testing.assert_array_equal(df.dtypes, ["float64", "int8"])

    # Nulls are replaced by NaN / 0 when converting to numpy.
    first_values = df["0"].to_numpy(na_value=np.nan)
    np.testing.assert_array_equal(first_values, [1.0, np.nan])

    second_values = df["1"].to_numpy(na_value=0)
    np.testing.assert_array_equal(second_values, [0, 0])
Exemple #15
0
def json_writer_test(gdf):
    """Write ``gdf`` as JSON Lines via cudf and pandas; compare both ways.

    The raw outputs are compared first, then each output is read back with
    its own library and the resulting frames are compared.
    """
    write_options = {"lines": True, "orient": "records"}
    pdf = gdf.to_pandas()

    pdf_buffer = pdf.to_json(**write_options)
    gdf_buffer = gdf.to_json(**write_options)
    compare_content(pdf_buffer, gdf_buffer)

    actual = cudf.read_json(gdf_buffer, **write_options)
    expected = pd.read_json(pdf_buffer, **write_options)
    assert_eq(actual, expected)
Exemple #16
0
def test_json_lines_compression(tmpdir, ext, out_comp, in_comp):
    """Write compressed JSON Lines with pandas and read it back with cudf."""
    out_path = tmpdir.mkdir("gdf_json").join("tmp_json_compression" + ext)

    expected = make_numeric_dataframe(20, np.int32)
    expected.to_json(
        out_path, compression=out_comp, lines=True, orient="records"
    )

    actual = cudf.read_json(
        str(out_path),
        compression=in_comp,
        lines=True,
        dtype=["int32", "int32"],
    )
    assert_eq(expected, actual)
Exemple #17
0
def test_write_data_json(tmpdir, expected_df):
    """Write ``expected_df`` as JSON with FileSystemWriter and read it back."""
    out_path = str(tmpdir.mkdir("tmp_test_fs_writer").join("person.json"))

    writer = FileSystemWriter(
        dict(
            type="fs",
            output_path=out_path,
            output_format="json",
            orient="records",
        )
    )
    writer.write_data(expected_df)

    round_tripped = cudf.read_json(out_path, orient="records")
    assert round_tripped.equals(expected_df)
Exemple #18
0
def test_json_lines_compression(tmpdir, ext, out_comp, in_comp):
    """Write compressed JSON Lines with pandas and read it back with cudf.

    ``ext`` is appended to the output file name, ``out_comp`` is the
    compression used for writing and ``in_comp`` the one passed to
    ``cudf.read_json``.
    """
    fname = tmpdir.mkdir("gdf_json").join('tmp_json_compression' + ext)

    nrows = 20
    pd_df = make_numeric_dataframe(nrows, np.int32)
    pd_df.to_json(fname, compression=out_comp, lines=True, orient='records')

    cu_df = cudf.read_json(str(fname),
                           compression=in_comp,
                           lines=True,
                           dtype=['int', 'int'])

    # Use the public pandas.testing module: pandas.util.testing is deprecated
    # and no longer available in recent pandas releases.
    pd.testing.assert_frame_equal(pd_df, cu_df.to_pandas())
Exemple #19
0
def test_json_engine_selection():
    """Check engine selection in ``cudf.read_json`` via column-name types."""
    json_text = "[1, 2, 3]"

    # should use the cudf engine;
    # column names are strings when parsing with cudf
    df = cudf.read_json(json_text, lines=True)
    assert all(isinstance(name, str) for name in df.columns)

    # should use the pandas engine;
    # column names are ints when parsing with pandas
    df = cudf.read_json(json_text, lines=False)
    assert all(isinstance(name, int) for name in df.columns)

    # should use the pandas engine (requested explicitly);
    # column names are ints when parsing with pandas
    df = cudf.read_json(json_text, lines=True, engine="pandas")
    assert all(isinstance(name, int) for name in df.columns)

    # should raise an exception
    with pytest.raises(ValueError):
        cudf.read_json(json_text, lines=False, engine="cudf")
Exemple #20
0
def test_json_lines_multiple(tmpdir, json_input, engine):
    """Read a list of two identical JSON Lines files in a single call."""
    pdf = pd.read_json(json_input, lines=True)

    # Write the same frame to both files.
    paths = [
        tmpdir.join("MultiInputs1.json"),
        tmpdir.join("MultiInputs2.json"),
    ]
    for path in paths:
        pdf.to_json(path, compression="infer", lines=True, orient="records")

    cu_df = cudf.read_json(paths, engine=engine, lines=True)
    pd_df = pd.concat([pdf, pdf])

    assert all(cu_df.dtypes == ["int64"] * 3)
    for cu_name, pd_name in zip(cu_df.columns, pd_df.columns):
        assert str(cu_name) == str(pd_name)
        np.testing.assert_array_equal(
            pd_df[pd_name], cu_df[cu_name].to_numpy()
        )
Exemple #21
0
def json_writer_test(pdf):
    """Write ``pdf`` as JSON Lines with pandas and cudf, then compare re-reads."""
    write_options = {"lines": True, "orient": "records"}
    gdf = cudf.from_pandas(pdf)

    pdf_buffer = pdf.to_json(**write_options)
    gdf_buffer = gdf.to_json(**write_options)

    # TODO: Uncomment once this is fixed:
    # https://github.com/rapidsai/cudf/issues/6429
    # compare_content(pdf_buffer, gdf_buffer)

    actual = cudf.read_json(
        gdf_buffer, engine="cudf", lines=True, orient="records"
    )
    expected = pd.read_json(pdf_buffer, **write_options)
    # Cast pandas column labels to str before comparing with the cudf result.
    expected.columns = expected.columns.astype("str")
    assert_eq(actual, expected)
Exemple #22
0
def test_read_json():
    """Read JSON Lines from an ``s3://`` URL with cudf and compare to pandas."""
    bucket = "json"
    key = "test_json_reader.json"
    buffer = (b'{"amount": 100, "name": "Alice"}\n'
              b'{"amount": 200, "name": "Bob"}\n'
              b'{"amount": 300, "name": "Charlie"}\n'
              b'{"amount": 400, "name": "Dennis"}\n')

    # The read happens while the s3 context holding the file is active.
    with s3_context(bucket, {key: buffer}):
        got = cudf.read_json(
            f"s3://{bucket}/{key}",
            engine="cudf",
            orient="records",
            lines=True,
        )

    expect = pd.read_json(buffer, lines=True)
    assert_eq(expect, got)
Exemple #23
0
def test_read_json(s3_base, s3so):
    """Read JSON Lines from ``s3://`` using ``storage_options``; compare to pandas."""
    bucket = "json"
    key = "test_json_reader.json"
    buffer = (b'{"amount": 100, "name": "Alice"}\n'
              b'{"amount": 200, "name": "Bob"}\n'
              b'{"amount": 300, "name": "Charlie"}\n'
              b'{"amount": 400, "name": "Dennis"}\n')
    s3_url = f"s3://{bucket}/{key}"

    # The read happens while the s3 context holding the file is active.
    with s3_context(s3_base=s3_base, bucket=bucket, files={key: buffer}):
        got = cudf.read_json(
            s3_url,
            engine="cudf",
            orient="records",
            lines=True,
            storage_options=s3so,
        )

    expect = pd.read_json(buffer, lines=True)
    assert_eq(expect, got)
Exemple #24
0
def test_json_corner_case_with_escape_and_double_quote_char_with_strings():
    """Escaped quotes, backslashes and control characters are decoded.

    Reads four JSON Lines records from an in-memory buffer and compares
    every cell with the expected Python string.
    """
    str_buffer = StringIO("""{"a":"ab\\"cd","b":"a\\tb\\t","c":"aeiou"}
           {"a":"\\\\\\b","b":"\\\\","c":"try"}
           {"a":"\\r\\\\","b":"\\\\\\"","c":"json"}
           {"a":"\'","b":"\\t","c":"cudf"}""")

    parsed = cudf.read_json(
        str_buffer, compression="infer", lines=True, orient="records"
    )

    expected = {
        "a": ['ab"cd', "\\\b", "\r\\", "'"],
        "b": ["a\tb\t", "\\", '\\"', "\t"],
        "c": ["aeiou", "try", "json", "cudf"],
    }

    # Compare cell by cell, one column at a time.
    row_count = parsed.shape[0]
    for col_name in parsed._data:
        for row in range(row_count):
            assert expected[col_name][row] == parsed[col_name][row]
Exemple #25
0
def test_json_corner_case_with_escape_and_double_quote_char_with_pandas(
    tmpdir,
):
    """Strings with escapes written by pandas are read the same by cudf.

    The frame is written with ``pandas.DataFrame.to_json`` and then read
    back with both ``cudf.read_json`` and ``pandas.read_json``.
    """
    fname = tmpdir.mkdir("gdf_json").join("tmp_json_escape_double_quote")
    json_options = {
        "compression": "infer",
        "lines": True,
        "orient": "records",
    }

    source_pdf = pd.DataFrame(
        {
            "a": ['ab"cd', "\\\b", "\r\\", "'"],
            "b": ["a\tb\t", "\\", '\\"', "\t"],
            "c": ["aeiou", "try", "json", "cudf"],
        }
    )
    source_pdf.to_json(fname, **json_options)

    got = cudf.read_json(fname, **json_options)
    expect_pdf = pd.read_json(fname, **json_options)

    assert_eq(cudf.DataFrame(expect_pdf), got)
Exemple #26
0
def test_json_lines_dtypes(json_input, dtype):
    """Reading with an explicit ``dtype`` yields the expected column dtypes."""
    expected_dtypes = ['float32', 'int32', 'int16']
    result = cudf.read_json(json_input, lines=True, dtype=dtype)
    assert all(result.dtypes == expected_dtypes)
Exemple #27
0
def test_json_lines_dtypes(json_input, dtype):
    """The ``dtype`` argument determines the dtypes of the parsed columns."""
    expected_dtypes = ["float32", "int32", "int16"]
    parsed = cudf.read_json(json_input, lines=True, dtype=dtype)
    assert all(parsed.dtypes == expected_dtypes)
Exemple #28
0
    'task.label_column': None,
    'task.feature_names': None
}
# NOTE(review): presumably this fills ``input_variables`` in place with the
# values supplied to the task — confirm against get_input_variables.
get_input_variables(input_variables)

# Pick the input dataframe id; when both ids are set, the training id wins
# because it is assigned last.
dataframe_id = None
if input_variables['task.dataframe_id'] is not None:
    dataframe_id = input_variables['task.dataframe_id']
if input_variables['task.dataframe_id_train'] is not None:
    dataframe_id = input_variables['task.dataframe_id_train']
print("dataframe id (in): ", dataframe_id)

dataframe_json = get_and_decompress_json_dataframe(dataframe_id)

# Parse the 'split'-oriented JSON with cudf when RAPIDS is enabled,
# otherwise with pandas.
if NVIDIA_RAPIDS_ENABLED:
    dataframe = cudf.read_json(dataframe_json, orient='split')
else:
    dataframe = pd.read_json(dataframe_json, orient='split')

# The data counts as labeled when a non-empty label column is available:
# LABEL_COLUMN from ``variables`` is tried first, then the task input
# variable 'task.label_column'.
is_labeled_data = False
LABEL_COLUMN = variables.get("LABEL_COLUMN")
if is_not_none_not_empty(LABEL_COLUMN):
    is_labeled_data = True
else:
    LABEL_COLUMN = input_variables['task.label_column']
    if is_not_none_not_empty(LABEL_COLUMN):
        is_labeled_data = True

# The algorithm description is required and is decoded from its JSON text.
algorithm_json = input_variables['task.algorithm_json']
assert algorithm_json is not None
algorithm = json.loads(algorithm_json)
Exemple #29
0
def json_reader_test(json_buffer):
    """Read ``json_buffer`` with pandas and cudf defaults and compare."""
    expected = pd.read_json(json_buffer)
    actual = cudf.read_json(json_buffer)

    assert_eq(actual, expected)