def test_read_json_basic(orient):
    """Round-trip a small frame through JSON (non-lines mode) and check that
    the dask_cudf reader agrees with the pandas reader for this orient."""
    frame = pd.DataFrame({'x': ['a', 'b', 'c', 'd'], 'y': [1, 2, 3, 4]})
    with tmpfile('json') as path:
        frame.to_json(path, orient=orient, lines=False)
        got_gpu = dask_cudf.read_json(path, orient=orient, lines=False)
        got_cpu = pd.read_json(path, orient=orient, lines=False)
        dd.assert_eq(got_gpu, got_cpu)
def test_read_json_lines(lines):
    """With orient='records', toggling the ``lines`` flag must give the same
    result from dask_cudf.read_json as from pandas.read_json."""
    frame = pd.DataFrame({'x': ['a', 'b', 'c', 'd'], 'y': [1, 2, 3, 4]})
    with tmpfile('json') as path:
        frame.to_json(path, orient='records', lines=lines)
        got_gpu = dask_cudf.read_json(path, orient='records', lines=lines)
        got_cpu = pd.read_json(path, orient='records', lines=lines)
        dd.assert_eq(got_gpu, got_cpu)
def test_read_json_lines(lines):
    """Compare dask_cudf.read_json against pandas.read_json for
    orient='records' with the given ``lines`` setting."""
    # NOTE(review): this redefines test_read_json_lines from earlier in the
    # file — confirm which copy is intended to survive.
    source = pd.DataFrame({"x": ["a", "b", "c", "d"], "y": [1, 2, 3, 4]})
    with tmpfile("json") as fname:
        source.to_json(fname, orient="records", lines=lines)
        via_cudf = dask_cudf.read_json(fname, orient="records", lines=lines)
        via_pandas = pd.read_json(fname, orient="records", lines=lines)
        dd.assert_eq(via_cudf, via_pandas)
def test_read_json_basic(orient):
    """Write a small frame as JSON with ``orient`` and verify that the
    dask_cudf and pandas readers produce equal results."""
    # NOTE(review): this redefines test_read_json_basic from earlier in the
    # file — confirm which copy is intended to survive.
    source = pd.DataFrame({"x": ["a", "b", "c", "d"], "y": [1, 2, 3, 4]})
    with tmpfile("json") as fname:
        source.to_json(fname, orient=orient, lines=False)
        via_cudf = dask_cudf.read_json(fname, orient=orient, lines=False)
        via_pandas = pd.read_json(fname, orient=orient, lines=False)
        dd.assert_eq(via_cudf, via_pandas)
def fetch_data(self):
    """
    Fetch data using dask based on provided config object
    """
    fmt = self.config["input_format"].lower()
    path = self.config["input_path"]

    # Forward every remaining config key to the reader; the bookkeeping keys
    # are removed in this order so a missing key raises the same KeyError
    # the original code would have raised.
    kwargs = self.config.copy()
    for consumed in ("type", "input_format", "input_path"):
        del kwargs[consumed]

    if fmt == "csv":
        df = dask_cudf.read_csv(path, **kwargs)
    elif fmt == "parquet":
        df = dask_cudf.read_parquet(path, **kwargs)
    elif fmt == "orc":
        # ORC path ignores extra kwargs and always pins the cudf engine.
        df = dask_cudf.read_orc(path, engine="cudf")
    elif fmt == "json":
        df = dask_cudf.read_json(path, **kwargs)
    else:
        raise NotImplementedError("%s is not a supported input_format" % (fmt))

    # NOTE(review): flag is cleared even after a successful read — confirm
    # this is the intended meaning of has_data.
    self.has_data = False
    return df
def test_read_json(tmp_path):
    """A dask timeseries frame written with to_json must read back equal
    through dask_cudf.read_json on the same glob pattern."""
    expected = dask.datasets.timeseries(
        dtypes={"x": int, "y": int}, freq="120s"
    ).reset_index(drop=True)
    expected.to_json(tmp_path / "data-*.json")
    loaded = dask_cudf.read_json(tmp_path / "data-*.json")
    dd.assert_eq(expected, loaded)
def test_read_json(tmp_path):
    """Round-trip a dask timeseries frame through JSON and read it back via
    dask_cudf using three addressing styles: a glob path, a file:// URL, and
    an explicit list of file paths."""
    # NOTE(review): this redefines an earlier test_read_json — confirm which
    # copy is intended to survive.
    expected = dask.datasets.timeseries(
        dtypes={"x": int, "y": int}, freq="120s"
    ).reset_index(drop=True)
    expected.to_json(tmp_path / "data-*.json")

    # Plain glob path.
    from_glob = dask_cudf.read_json(tmp_path / "data-*.json")
    dd.assert_eq(expected, from_glob)

    # file:// URL form of the same glob.
    stmp_path = str(tmp_path / "data-*.json")
    from_url = dask_cudf.read_json(f"file://{stmp_path}")
    dd.assert_eq(expected, from_url)

    # Explicit, sorted list of the individual part files.
    part_files = [
        os.path.join(tmp_path, fname) for fname in sorted(os.listdir(tmp_path))
    ]
    from_list = dask_cudf.read_json(part_files)
    dd.assert_eq(expected, from_list)