def test_json_lines_byte_range(json_input):
    """Check the shape ``cudf.read_json`` returns for several byte ranges."""
    # Each entry pairs the ``byte_range`` argument with the expected shape.
    cases = [
        # first row and half of the second row
        # -> should parse the first two rows
        ((0, 15), (2, 3)),
        # half of the second row and half of the third row
        # -> should parse only the third row
        ((15, 10), (1, 3)),
        # half of the second row and the entire third row
        # -> should parse only the third row
        ((15, 0), (1, 3)),
        # half of the second row till past the end of the file
        # -> should parse only the third row
        ((10, 50), (1, 3)),
    ]
    for byte_range, expected_shape in cases:
        # The reader is always handed a fresh deep copy of the input.
        source = copy.deepcopy(json_input)
        df = cudf.read_json(source, lines=True, byte_range=byte_range)
        assert df.shape == expected_shape
def test_json_reader(json_files):
    """Compare ``cudf.read_json`` against ``pd.read_json`` for frames and series.

    ``json_files`` is unpacked as
    ``(path_df, path_series, orient, compression)``.
    """
    path_df, path_series, orient, compression = json_files

    expect_df = pd.read_json(path_df, orient=orient, compression=compression)
    got_df = cudf.read_json(path_df, orient=orient, compression=compression)
    # Normalise empty results (index, and column dtype on the pandas side)
    # before comparing.
    if len(expect_df) == 0:
        expect_df = expect_df.reset_index(drop=True)
        expect_df.columns = expect_df.columns.astype("object")
    if len(got_df) == 0:
        got_df = got_df.reset_index(drop=True)

    assert_eq(expect_df, got_df, check_categorical=False)

    # Only these orients are allowed for Series, but isn't enforced by Pandas
    if orient in ("split", "records", "index"):
        expect_series = pd.read_json(
            path_series, orient=orient, compression=compression, typ="series"
        )
        got_series = cudf.read_json(
            path_series, orient=orient, compression=compression, typ="series"
        )
        if len(expect_series) == 0:
            expect_series = expect_series.reset_index(drop=True)
        # The emptiness check must look at the series being reset; the
        # previous code inspected ``got_df`` here by mistake.
        if len(got_series) == 0:
            got_series = got_series.reset_index(drop=True)

        assert_eq(expect_series, got_series)
def test_json_bool_values():
    """Check that cudf and pandas agree on dtypes and values for boolean JSON input."""
    buffer = "[true,1]\n[false,false]\n[true,true]"

    cu_df = cudf.read_json(buffer, lines=True)
    pd_df = pd.read_json(buffer, lines=True)

    # types should be ['bool', 'int64']
    np.testing.assert_array_equal(pd_df.dtypes, cu_df.dtypes)
    # ``to_numpy()`` replaces the deprecated ``to_array()`` for the explicit
    # device-to-host copy.
    np.testing.assert_array_equal(pd_df[0], cu_df["0"].to_numpy())
    # boolean values should be converted to 0/1
    np.testing.assert_array_equal(pd_df[1], cu_df["1"].to_numpy())

    # Explicitly requested dtypes must give the same result as inference.
    cu_df = cudf.read_json(buffer, lines=True, dtype=["bool", "long"])
    np.testing.assert_array_equal(pd_df.dtypes, cu_df.dtypes)
def test_json_bool_values():
    """Compare cudf and pandas dtypes and values for boolean JSON-lines input."""
    buffer = '[true,1]\n[false,false]\n[true,true]'

    inferred = cudf.read_json(buffer, lines=True)
    reference = pd.read_json(buffer, lines=True)

    # types should be ['bool', 'int64']
    np.testing.assert_array_equal(reference.dtypes, inferred.dtypes)
    first_expected, first_actual = reference[0], inferred['0']
    np.testing.assert_array_equal(first_expected, first_actual)
    # boolean values should be converted to 0/1
    second_expected, second_actual = reference[1], inferred['1']
    np.testing.assert_array_equal(second_expected, second_actual)

    # Same comparison with the dtypes passed explicitly.
    typed = cudf.read_json(buffer, lines=True, dtype=['bool', 'long'])
    np.testing.assert_array_equal(reference.dtypes, typed.dtypes)
def fetch_data(self):
    """
    Fetch data using cudf based on provided config object.

    ``self.config["input_format"]`` (matched case-insensitively) selects the
    reader and ``self.config["input_path"]`` is the location passed to it.
    All other config entries except ``"type"`` are forwarded to the reader
    as keyword arguments. The ORC reader is the exception: it is always
    called with ``engine="cudf"`` only.

    Returns:
        The object returned by the selected ``cudf.read_*`` function.

    Raises:
        KeyError: if ``input_format`` or ``input_path`` is missing.
        NotImplementedError: if ``input_format`` is not csv, parquet, orc
            or json.
    """
    input_format = self.config["input_format"].lower()
    filepath = self.config["input_path"]

    # Work on a copy so the stored config is left untouched, then strip the
    # keys that must not reach the cudf reader. "type" is never read by this
    # method, so a config without it is tolerated instead of raising KeyError.
    kwargs = self.config.copy()
    for routing_key in ("type", "input_format", "input_path"):
        kwargs.pop(routing_key, None)

    if input_format == "csv":
        df = cudf.read_csv(filepath, **kwargs)
    elif input_format == "parquet":
        df = cudf.read_parquet(filepath, **kwargs)
    elif input_format == "orc":
        # NOTE(review): extra config options are not forwarded for ORC;
        # kept as in the original - confirm whether that is intentional.
        df = cudf.read_orc(filepath, engine="cudf")
    elif input_format == "json":
        df = cudf.read_json(filepath, **kwargs)
    else:
        raise NotImplementedError(
            f"{input_format} is not a supported input_format"
        )

    # Only reached after a successful read.
    self.has_data = False
    return df
def test_read_json(tmpdir, pdf, hdfs, test_url):
    """Round-trip a JSON-lines file through HDFS and compare cudf with pandas."""
    remote_name = basedir + "/test_json_reader.json"
    local_file = tmpdir.mkdir("json").join("test_json_reader.json")

    # Write to local file system
    # Sorting by col_name now as pandas sorts by col name while reading json
    sorted_pdf = pdf.sort_index(axis=1)
    sorted_pdf.to_json(local_file, orient="records", lines=True)

    # Read from local file system as buffer
    with open(local_file, mode="rb") as handle:
        buffer = BytesIO(handle.read())

    # Write to hdfs
    hdfs.upload(remote_name, buffer)

    # The URL either names host and port explicitly or relies on defaults.
    hd_fpath = (
        f"hdfs://{host}:{port}{basedir}/test_json_reader.json"
        if test_url
        else f"hdfs://{basedir}/test_json_reader.json"
    )

    got = cudf.read_json(hd_fpath, engine="cudf", orient="records", lines=True)

    # Read pandas from byte buffer
    with hdfs.open(remote_name) as remote:
        expect = pd.read_json(remote, lines=True)

    assert_eq(expect, got)
def test_json_read_directory(tmpdir, json_input, engine):
    """Read a directory holding three copies of the same JSON-lines file."""
    pdf = pd.read_json(json_input, lines=True)

    # Write the same frame three times; the directory read should yield the
    # concatenation of all three files.
    file_names = ("MultiInputs1.json", "MultiInputs2.json", "MultiInputs3.json")
    for file_name in file_names:
        pdf.to_json(
            tmpdir.join(file_name),
            compression="infer",
            lines=True,
            orient="records",
        )

    cu_df = cudf.read_json(tmpdir, engine=engine, lines=True)
    pd_df = pd.concat([pdf] * len(file_names))

    assert all(cu_df.dtypes == ["int64", "int64", "int64"])
    for cu_col, pd_col in zip(cu_df.columns, pd_df.columns):
        assert str(cu_col) == str(pd_col)
        # ``to_numpy()`` replaces the deprecated ``to_array()``.
        np.testing.assert_array_equal(pd_df[pd_col], cu_df[cu_col].to_numpy())
def json_writer_test_params(pdf, compression, dtype):
    """Write ``pdf`` as JSON lines with pandas and cudf, read both back and compare."""
    write_options = {
        "lines": True,
        "orient": "records",
        "compression": compression,
    }
    gdf = cudf.from_pandas(pdf)
    pdf_buffer = pdf.to_json(**write_options)
    gdf_buffer = gdf.to_json(**write_options)

    # TODO: Uncomment once this is fixed:
    # https://github.com/rapidsai/cudf/issues/6429
    # compare_content(pdf_buffer, gdf_buffer)

    cudf_source = io.StringIO(gdf_buffer)
    actual = cudf.read_json(
        cudf_source,
        engine="cudf",
        lines=True,
        orient="records",
        dtype=dtype,
    )
    pandas_source = io.StringIO(pdf_buffer)
    expected = pd.read_json(
        pandas_source, lines=True, orient="records", dtype=dtype
    )

    # Difference in behaviour with pandas
    # cudf reads column as strings only.
    expected.columns = expected.columns.astype("str")
    assert_eq(actual, expected)
def test_json_bad_protocol_string():
    """Check that a JSON string holding a URL-like value is read as data."""
    test_string = '{"field": "s3://path"}'
    record = {"field": "s3://path"}

    expect = pd.DataFrame([record])
    got = cudf.read_json(test_string, lines=True)

    assert_eq(expect, got)
def json_reader_test_params(json_buffer, dtype):
    """Read ``json_buffer`` with pandas and cudf using ``dtype`` and compare."""
    expected = pd.read_json(
        json_buffer, dtype=dtype, orient="records", lines=True
    )
    # Column labels are cast to str on the pandas side before comparing.
    expected.columns = expected.columns.astype("str")

    actual = cudf.read_json(
        json_buffer, dtype=dtype, engine="cudf", lines=True
    )
    assert_eq(actual, expected)
def test_json_lines_basic(json_input, engine):
    """Read JSON-lines input with cudf and compare it column by column with pandas."""
    cu_df = cudf.read_json(json_input, engine=engine, lines=True)
    pd_df = pd.read_json(json_input, lines=True)

    assert all(cu_df.dtypes == ["int64", "int64", "int64"])
    for cu_col, pd_col in zip(cu_df.columns, pd_df.columns):
        # Labels are compared as strings because the two readers may use
        # different label types.
        assert str(cu_col) == str(pd_col)
        # ``to_numpy()`` replaces the deprecated ``to_array()``.
        np.testing.assert_array_equal(pd_df[pd_col], cu_df[cu_col].to_numpy())
def test_json_null_literal(buffer):
    """Check inferred dtypes and values for columns containing null fields."""
    df = cudf.read_json(buffer, lines=True)

    # first column contains a null field, type should be set to float
    # second column contains only empty fields, type should be set to int8
    expected_dtypes = ["float64", "int8"]
    np.testing.assert_array_equal(df.dtypes, expected_dtypes)

    first_column = df["0"]
    np.testing.assert_array_equal(first_column, [None, 1.0])
    second_column = df["1"]
    np.testing.assert_array_equal(second_column, [None, None])
def json_reader_test(json_buffer):
    """Parse ``json_buffer`` as JSON lines with pandas and cudf and compare."""
    pandas_source = io.StringIO(json_buffer)
    expected = pd.read_json(pandas_source, orient="records", lines=True)
    # Difference in behaviour with pandas
    # cudf reads column as strings only.
    expected.columns = expected.columns.astype("str")

    cudf_source = io.StringIO(json_buffer)
    actual = cudf.read_json(cudf_source, engine="cudf", lines=True)

    assert_eq(actual, expected)
def test_json_null_literal(buffer):
    """Check inferred dtypes and null-filled values for columns with null fields."""
    df = cudf.read_json(buffer, lines=True)

    # first column contains a null field, type should be set to float
    # second column contains only empty fields, type should be set to int8
    expected_dtypes = ["float64", "int8"]
    np.testing.assert_array_equal(df.dtypes, expected_dtypes)

    # Nulls are replaced by an explicit fill value when copying to host.
    first_values = df["0"].to_numpy(na_value=np.nan)
    np.testing.assert_array_equal(first_values, [1.0, np.nan])

    second_values = df["1"].to_numpy(na_value=0)
    np.testing.assert_array_equal(second_values, [0, 0])
def json_writer_test(gdf):
    """Write ``gdf`` as JSON lines via pandas and cudf; compare text and re-read frames."""
    options = {"lines": True, "orient": "records"}

    pdf = gdf.to_pandas()
    pdf_buffer = pdf.to_json(**options)
    gdf_buffer = gdf.to_json(**options)
    # The serialized text itself must match ...
    compare_content(pdf_buffer, gdf_buffer)

    # ... and so must the frames obtained by reading each buffer back.
    actual = cudf.read_json(gdf_buffer, **options)
    expected = pd.read_json(pdf_buffer, **options)
    assert_eq(actual, expected)
def test_json_lines_compression(tmpdir, ext, out_comp, in_comp):
    """Write a compressed JSON-lines file with pandas and read it back with cudf."""
    nrows = 20
    target = tmpdir.mkdir("gdf_json").join("tmp_json_compression" + ext)

    expected = make_numeric_dataframe(nrows, np.int32)
    expected.to_json(
        target, compression=out_comp, lines=True, orient="records"
    )

    read_options = {
        "compression": in_comp,
        "lines": True,
        "dtype": ["int32", "int32"],
    }
    actual = cudf.read_json(str(target), **read_options)

    assert_eq(expected, actual)
def test_write_data_json(tmpdir, expected_df):
    """Write ``expected_df`` with ``FileSystemWriter`` and read it back with cudf."""
    output_dir = tmpdir.mkdir("tmp_test_fs_writer")
    fname = str(output_dir.join("person.json"))

    config = {
        "type": "fs",
        "output_path": fname,
        "output_format": "json",
        "orient": "records",
    }
    FileSystemWriter(config).write_data(expected_df)

    result_df = cudf.read_json(fname, orient="records")
    assert result_df.equals(expected_df)
def test_json_lines_compression(tmpdir, ext, out_comp, in_comp):
    """Write a compressed JSON-lines file with pandas and read it back with cudf."""
    fname = tmpdir.mkdir("gdf_json").join("tmp_json_compression" + ext)

    nrows = 20
    pd_df = make_numeric_dataframe(nrows, np.int32)
    pd_df.to_json(fname, compression=out_comp, lines=True, orient="records")

    # Request the same width the frame was created with; the bare "int"
    # alias does not have the same width in every library.
    cu_df = cudf.read_json(
        str(fname), compression=in_comp, lines=True, dtype=["int32", "int32"]
    )

    # ``pd.testing`` is the public home of the assertion helpers;
    # ``pd.util.testing`` is deprecated.
    pd.testing.assert_frame_equal(pd_df, cu_df.to_pandas())
def test_json_engine_selection():
    """Check which engine ``cudf.read_json`` uses by inspecting column label types."""
    json_text = "[1, 2, 3]"

    # should use the cudf engine
    df = cudf.read_json(json_text, lines=True)
    # column names are strings when parsing with cudf
    assert all(isinstance(name, str) for name in df.columns)

    # should use the pandas engine
    df = cudf.read_json(json_text, lines=False)
    # column names are ints when parsing with pandas
    assert all(isinstance(name, int) for name in df.columns)

    # should use the pandas engine
    df = cudf.read_json(json_text, lines=True, engine="pandas")
    # column names are ints when parsing with pandas
    assert all(isinstance(name, int) for name in df.columns)

    # should raise an exception
    with pytest.raises(ValueError):
        cudf.read_json(json_text, lines=False, engine="cudf")
def test_json_lines_multiple(tmpdir, json_input, engine):
    """Read a list of two JSON-lines files with cudf and compare with pandas."""
    pdf = pd.read_json(json_input, lines=True)

    # Write the same frame to two separate files.
    paths = [
        tmpdir.join("MultiInputs1.json"),
        tmpdir.join("MultiInputs2.json"),
    ]
    for path in paths:
        pdf.to_json(path, compression="infer", lines=True, orient="records")

    cu_df = cudf.read_json(paths, engine=engine, lines=True)
    pd_df = pd.concat([pdf, pdf])

    assert all(cu_df.dtypes == ["int64", "int64", "int64"])
    for cu_col, pd_col in zip(cu_df.columns, pd_df.columns):
        assert str(cu_col) == str(pd_col)
        host_values = cu_df[cu_col].to_numpy()
        np.testing.assert_array_equal(pd_df[pd_col], host_values)
def json_writer_test(pdf):
    """Write ``pdf`` as JSON lines via pandas and cudf, read both back and compare."""
    options = {"lines": True, "orient": "records"}

    gdf = cudf.from_pandas(pdf)
    pdf_buffer = pdf.to_json(**options)
    gdf_buffer = gdf.to_json(**options)

    # TODO: Uncomment once this is fixed:
    # https://github.com/rapidsai/cudf/issues/6429
    # compare_content(pdf_buffer, gdf_buffer)

    actual = cudf.read_json(gdf_buffer, engine="cudf", **options)
    expected = pd.read_json(pdf_buffer, **options)
    # Column labels are cast to str on the pandas side before comparing.
    expected.columns = expected.columns.astype("str")

    assert_eq(actual, expected)
def test_read_json():
    """Read a JSON-lines object from the S3 test context and compare with pandas."""
    bname = "json"
    fname = "test_json_reader.json"
    records = [
        b'{"amount": 100, "name": "Alice"}',
        b'{"amount": 200, "name": "Bob"}',
        b'{"amount": 300, "name": "Charlie"}',
        b'{"amount": 400, "name": "Dennis"}',
    ]
    # One record per line, with a trailing newline after the last record.
    buffer = b"\n".join(records) + b"\n"

    with s3_context(bname, {fname: buffer}):
        s3_url = f"s3://{bname}/{fname}"
        got = cudf.read_json(
            s3_url,
            engine="cudf",
            orient="records",
            lines=True,
        )

    expect = pd.read_json(buffer, lines=True)
    assert_eq(expect, got)
def test_read_json(s3_base, s3so):
    """Read a JSON-lines object from the S3 test context using ``storage_options``."""
    bname, fname = "json", "test_json_reader.json"
    buffer = (
        b'{"amount": 100, "name": "Alice"}\n'
        b'{"amount": 200, "name": "Bob"}\n'
        b'{"amount": 300, "name": "Charlie"}\n'
        b'{"amount": 400, "name": "Dennis"}\n'
    )
    bucket_files = {fname: buffer}
    read_options = {
        "engine": "cudf",
        "orient": "records",
        "lines": True,
        "storage_options": s3so,
    }

    with s3_context(s3_base=s3_base, bucket=bname, files=bucket_files):
        got = cudf.read_json(f"s3://{bname}/{fname}", **read_options)

    expect = pd.read_json(buffer, lines=True)
    assert_eq(expect, got)
def test_json_corner_case_with_escape_and_double_quote_char_with_strings():
    """Check that escaped characters and quotes in JSON strings are decoded."""
    # One JSON record per line.
    str_buffer = StringIO(
        """{"a":"ab\\"cd","b":"a\\tb\\t","c":"aeiou"}
           {"a":"\\\\\\b","b":"\\\\","c":"try"}
           {"a":"\\r\\\\","b":"\\\\\\"","c":"json"}
           {"a":"\'","b":"\\t","c":"cudf"}"""
    )

    df = cudf.read_json(
        str_buffer, compression="infer", lines=True, orient="records"
    )

    expected = {
        "a": ['ab"cd', "\\\b", "\r\\", "'"],
        "b": ["a\tb\t", "\\", '\\"', "\t"],
        "c": ["aeiou", "try", "json", "cudf"],
    }

    num_rows = df.shape[0]
    for col_name in df._data:
        wanted = expected[col_name]
        column = df[col_name]
        # Compare element by element, driven by the number of rows read.
        for row in range(num_rows):
            assert wanted[row] == column[row]
def test_json_corner_case_with_escape_and_double_quote_char_with_pandas(
    tmpdir,
):
    """Write escaped and quoted strings with pandas; compare cudf and pandas reads."""
    fname = tmpdir.mkdir("gdf_json").join("tmp_json_escape_double_quote")
    io_options = {"compression": "infer", "lines": True, "orient": "records"}

    source_pdf = pd.DataFrame(
        {
            "a": ['ab"cd', "\\\b", "\r\\", "'"],
            "b": ["a\tb\t", "\\", '\\"', "\t"],
            "c": ["aeiou", "try", "json", "cudf"],
        }
    )
    source_pdf.to_json(fname, **io_options)

    # Both libraries read the same file back.
    df = cudf.read_json(fname, **io_options)
    pdf = pd.read_json(fname, **io_options)

    assert_eq(cudf.DataFrame(pdf), df)
def test_json_lines_dtypes(json_input, dtype):
    """Check that the requested ``dtype`` yields float32/int32/int16 columns."""
    expected_dtypes = ['float32', 'int32', 'int16']
    df = cudf.read_json(json_input, lines=True, dtype=dtype)
    matches = df.dtypes == expected_dtypes
    assert all(matches)
def test_json_lines_dtypes(json_input, dtype):
    """Read JSON lines with an explicit ``dtype`` and verify the column dtypes."""
    frame = cudf.read_json(json_input, lines=True, dtype=dtype)
    wanted = ["float32", "int32", "int16"]
    assert all(frame.dtypes == wanted)
'task.label_column': None, 'task.feature_names': None } get_input_variables(input_variables) dataframe_id = None if input_variables['task.dataframe_id'] is not None: dataframe_id = input_variables['task.dataframe_id'] if input_variables['task.dataframe_id_train'] is not None: dataframe_id = input_variables['task.dataframe_id_train'] print("dataframe id (in): ", dataframe_id) dataframe_json = get_and_decompress_json_dataframe(dataframe_id) if NVIDIA_RAPIDS_ENABLED: dataframe = cudf.read_json(dataframe_json, orient='split') else: dataframe = pd.read_json(dataframe_json, orient='split') is_labeled_data = False LABEL_COLUMN = variables.get("LABEL_COLUMN") if is_not_none_not_empty(LABEL_COLUMN): is_labeled_data = True else: LABEL_COLUMN = input_variables['task.label_column'] if is_not_none_not_empty(LABEL_COLUMN): is_labeled_data = True algorithm_json = input_variables['task.algorithm_json'] assert algorithm_json is not None algorithm = json.loads(algorithm_json)
def json_reader_test(json_buffer):
    """Read ``json_buffer`` with default options in pandas and cudf and compare."""
    expected = pd.read_json(json_buffer)
    actual = cudf.read_json(json_buffer)
    assert_eq(actual, expected)