Beispiel #1
0
def test_read_json_basic(orient):
    """Check dask_cudf.read_json against pandas for non-line-delimited JSON.

    A small frame is written with ``DataFrame.to_json`` using the given
    ``orient`` and read back by both libraries with identical options; the
    two results must compare equal under ``dd.assert_eq``.
    """
    # One option set shared by the writer and both readers
    json_opts = dict(orient=orient, lines=False)
    source = pd.DataFrame({'x': ['a', 'b', 'c', 'd'], 'y': [1, 2, 3, 4]})
    with tmpfile('json') as path:
        source.to_json(path, **json_opts)
        got = dask_cudf.read_json(path, **json_opts)
        expected = pd.read_json(path, **json_opts)
        dd.assert_eq(got, expected)
Beispiel #2
0
def test_read_json_lines(lines):
    """Check dask_cudf.read_json against pandas for records-oriented JSON.

    The ``lines`` argument is passed unchanged to the writer and to both
    readers; the dask_cudf result must equal the pandas result.
    """
    io_kwargs = {'orient': 'records', 'lines': lines}
    frame = pd.DataFrame({'x': ['a', 'b', 'c', 'd'], 'y': [1, 2, 3, 4]})
    with tmpfile('json') as json_path:
        frame.to_json(json_path, **io_kwargs)
        result = dask_cudf.read_json(json_path, **io_kwargs)
        reference = pd.read_json(json_path, **io_kwargs)
        dd.assert_eq(result, reference)
Beispiel #3
0
def test_read_json_lines(lines):
    """Compare dask_cudf and pandas when reading ``orient="records"`` JSON.

    The file is written by pandas with the given ``lines`` setting and then
    read back by both libraries using that same setting.
    """
    columns = {"x": ["a", "b", "c", "d"], "y": [1, 2, 3, 4]}
    written = pd.DataFrame(columns)
    with tmpfile("json") as target:
        written.to_json(target, orient="records", lines=lines)
        from_dask_cudf = dask_cudf.read_json(
            target, orient="records", lines=lines
        )
        from_pandas = pd.read_json(target, orient="records", lines=lines)
        dd.assert_eq(from_dask_cudf, from_pandas)
Beispiel #4
0
def test_read_json_basic(orient):
    """Compare dask_cudf and pandas readers on a whole-document JSON file.

    ``orient`` selects the JSON layout used for writing and for both reads;
    ``lines`` is always ``False`` here.
    """
    columns = {"x": ["a", "b", "c", "d"], "y": [1, 2, 3, 4]}
    written = pd.DataFrame(columns)
    with tmpfile("json") as target:
        written.to_json(target, orient=orient, lines=False)
        from_dask_cudf = dask_cudf.read_json(target, orient=orient, lines=False)
        from_pandas = pd.read_json(target, orient=orient, lines=False)
        dd.assert_eq(from_dask_cudf, from_pandas)
Beispiel #5
0
    def fetch_data(self):
        """
        Fetch data using dask based on provided config object

        The reader is selected by ``self.config["input_format"]``
        (case-insensitive): ``csv``, ``parquet``, ``orc`` or ``json``. Every
        config entry other than ``type``, ``input_format`` and ``input_path``
        is forwarded as a keyword argument to the csv, parquet and json
        readers; the orc reader is called with ``engine="cudf"`` only.

        :return: the result of the selected ``dask_cudf.read_*`` call
        :raises KeyError: if ``type``, ``input_format`` or ``input_path`` is
            missing from the config
        :raises NotImplementedError: if ``input_format`` is not one of the
            four supported values
        """
        settings = self.config
        fmt = settings["input_format"].lower()
        source = settings["input_path"]

        # Work on a copy so the stored config keeps its selection keys.
        reader_options = settings.copy()
        for reserved in ("type", "input_format", "input_path"):
            del reader_options[reserved]

        if fmt not in ("csv", "parquet", "orc", "json"):
            raise NotImplementedError("%s is not a supported input_format" % (fmt))

        if fmt == "csv":
            frame = dask_cudf.read_csv(source, **reader_options)
        elif fmt == "parquet":
            frame = dask_cudf.read_parquet(source, **reader_options)
        elif fmt == "orc":
            # The remaining config options are not passed to the orc reader.
            frame = dask_cudf.read_orc(source, engine="cudf")
        else:
            # Only "json" is left after the membership check above.
            frame = dask_cudf.read_json(source, **reader_options)

        # NOTE(review): presumably tells callers nothing is left to fetch
        # after this single read — confirm against the enclosing class.
        self.has_data = False
        return frame
Beispiel #6
0
def test_read_json(tmp_path):
    """Write a dask timeseries dataset to JSON and read it with dask_cudf.

    The dataset (integer columns ``x`` and ``y``, index dropped) is written
    to ``data-*.json`` under ``tmp_path``; reading the same glob through
    ``dask_cudf.read_json`` must give a frame equal to the one written.
    """
    pattern = tmp_path / "data-*.json"
    column_types = {"x": int, "y": int}

    expected = dask.datasets.timeseries(dtypes=column_types, freq="120s")
    expected = expected.reset_index(drop=True)
    expected.to_json(pattern)

    loaded = dask_cudf.read_json(tmp_path / "data-*.json")
    dd.assert_eq(expected, loaded)
Beispiel #7
0
def test_read_json(tmp_path):
    """Read the same JSON dataset with dask_cudf via three path forms.

    A dask timeseries dataset (integer columns ``x`` and ``y``, index
    dropped) is written to ``data-*.json`` under ``tmp_path``. It is then
    read back through a glob path object, a ``file://`` URL string and an
    explicit sorted list of file paths; each result must equal the frame
    that was written.
    """
    column_types = {"x": int, "y": int}
    expected = dask.datasets.timeseries(dtypes=column_types, freq="120s")
    expected = expected.reset_index(drop=True)
    expected.to_json(tmp_path / "data-*.json")

    # glob given as a path object
    from_glob = dask_cudf.read_json(tmp_path / "data-*.json")
    dd.assert_eq(expected, from_glob)

    # file path test
    glob_text = str(tmp_path / "data-*.json")
    from_url = dask_cudf.read_json("file://" + glob_text)
    dd.assert_eq(expected, from_url)

    # file list test
    written_names = sorted(os.listdir(tmp_path))
    explicit_paths = [os.path.join(tmp_path, name) for name in written_names]
    from_list = dask_cudf.read_json(explicit_paths)
    dd.assert_eq(expected, from_list)