Example no. 1
0
    def fetch_data(self):
        """
        Fetch data using dask based on provided config object.

        Reads ``input_format`` and ``input_path`` from ``self.config`` and
        forwards every other config entry as keyword arguments to the
        matching ``dask_cudf`` reader.

        Returns
        -------
        dask_cudf.DataFrame
            The loaded dataframe. Also resets ``self.has_data`` to False.

        Raises
        ------
        NotImplementedError
            If ``input_format`` is not one of csv/parquet/orc/json.
        """
        df = None
        input_format = self.config["input_format"].lower()
        filepath = self.config["input_path"]
        # Strip the config keys consumed here; pop with a default so a
        # missing key does not raise KeyError (del would).
        kwargs = self.config.copy()
        for consumed in ("type", "input_format", "input_path"):
            kwargs.pop(consumed, None)

        if input_format == "csv":
            df = dask_cudf.read_csv(filepath, **kwargs)
        elif input_format == "parquet":
            df = dask_cudf.read_parquet(filepath, **kwargs)
        elif input_format == "orc":
            # NOTE: extra kwargs are intentionally not forwarded for ORC;
            # only the cudf engine is selected (matches original behavior).
            df = dask_cudf.read_orc(filepath, engine="cudf")
        elif input_format == "json":
            df = dask_cudf.read_json(filepath, **kwargs)
        else:
            raise NotImplementedError("%s is not a supported input_format" % (input_format))

        self.has_data = False
        return df
Example no. 2
0
def test_to_orc(tmpdir, dtypes, compression, compute):
    """Round-trip a random dataframe through ORC with cudf and dask_cudf."""
    # Matching cudf / dask_cudf frames over the same data.
    gdf = cudf.datasets.randomdata(nrows=10, dtypes=dtypes, seed=1)
    gdf = gdf.set_index("index").sort_index()
    ddf = dask_cudf.from_cudf(gdf, npartitions=3)

    # Single-file write via cudf (index preserved by resetting to a column).
    fname = tmpdir.join("test.orc")
    gdf.reset_index().to_orc(fname, compression=compression)

    # Multi-file write via dask_cudf (index preserved by write_index=True).
    result = ddf.to_orc(
        str(tmpdir),
        write_index=True,
        compression=compression,
        compute=compute,
    )
    if not compute:
        result.compute()

    # Read both back.
    gdf_read = cudf.read_orc(fname).set_index("index")
    part_files = glob.glob(str(tmpdir) + "/part.*.orc")
    ddf_read = dask_cudf.read_orc(part_files).set_index("index")

    # The dask_cudf round-trip must match both cudf frames.
    dd.assert_eq(gdf, ddf_read)
    dd.assert_eq(gdf_read, ddf_read)
Example no. 3
0
def test_read_orc_first_file_empty(tmpdir):
    """Reading a multi-file ORC dataset whose first file is empty.

    Regression test, see: https://github.com/rapidsai/cudf/issues/8011
    """
    path = str(tmpdir)
    os.makedirs(path, exist_ok=True)
    expected = cudf.DataFrame({"id": [1, 2], "float": [1.0, 2.0]})

    # Three files: zero rows, first row, second row.
    pieces = (slice(0, 0), slice(0, 1), slice(1, None))
    for idx, piece in enumerate(pieces):
        expected.iloc[piece].to_orc(os.path.join(path, "data.%d" % idx))

    # Read everything back with dask_cudf and compare against the source.
    got = dask_cudf.read_orc(os.path.join(path, "*"))
    dd.assert_eq(expected, got, check_index=False)
Example no. 4
0
    def fetch_data(self):
        """
        Fetch data using dask based on provided config object.

        Dispatches on ``input_format`` (parquet/orc), defaulting to the CSV
        reader for any other value; remaining config entries are forwarded
        as reader keyword arguments.

        Returns
        -------
        dask_cudf.DataFrame
            The loaded dataframe. Also resets ``self.has_data`` to False.
        """
        df = None
        input_format = self.config["input_format"].lower()
        # BUG FIX: do not lower-case the path itself — file systems can be
        # case-sensitive, so lowering corrupts the user-supplied location.
        filepath = self.config["input_path"]
        # Strip the config keys consumed here; pop with a default so a
        # missing key does not raise KeyError (del would).
        kwargs = self.config.copy()
        for consumed in ("type", "input_format", "input_path"):
            kwargs.pop(consumed, None)

        if input_format == "parquet":
            df = dask_cudf.read_parquet(filepath, **kwargs)
        elif input_format == "orc":
            # NOTE: extra kwargs are intentionally not forwarded for ORC;
            # only the cudf engine is selected (matches original behavior).
            df = dask_cudf.read_orc(filepath, engine="cudf")
        else:
            df = dask_cudf.read_csv(filepath, **kwargs)

        self.has_data = False
        return df
Example no. 5
0
def test_read_orc_cols(engine, columns):
    """cudf and dask_cudf agree when selecting a subset of ORC columns."""
    expect = cudf.read_orc(sample_orc, engine=engine, columns=columns)
    got = dask_cudf.read_orc(sample_orc, engine=engine, columns=columns)

    dd.assert_eq(expect, got, check_index=False)
Example no. 6
0
def test_filepath_read_orc_defaults():
    """A ``file://`` URI is handled identically by cudf and dask_cudf."""
    uri = "file://%s" % sample_orc
    expect = cudf.read_orc(uri)
    got = dask_cudf.read_orc(uri)
    dd.assert_eq(expect, got, check_index=False)
Example no. 7
0
def test_read_orc_defaults():
    """Default-argument ORC reads match between cudf and dask_cudf."""
    expect = cudf.read_orc(sample_orc)
    got = dask_cudf.read_orc(sample_orc)
    # Smoke-check that a partition materializes through to pandas.
    got.head().to_pandas()
    dd.assert_eq(expect, got, check_index=False)
Example no. 8
0
def test_filelist_read_orc_defaults():
    """dask_cudf accepts a list of paths where cudf takes a single path."""
    paths = [sample_orc]
    expect = cudf.read_orc(paths[0])
    got = dask_cudf.read_orc(paths)
    dd.assert_eq(expect, got, check_index=False)
def gpu_load_performance_data(performance_path, **kwargs):
    """Load the performance dataset from ORC into a dask_cudf DataFrame.

    Any extra keyword arguments are forwarded to ``dask_cudf.read_orc``.
    """
    return dask_cudf.read_orc(performance_path, **kwargs)
Example no. 10
0
def test_read_orc_filtered(tmpdir, engine, predicate, expected_len):
    """Stripe filters reduce the read frame to the expected row count."""
    got = dask_cudf.read_orc(sample_orc, engine=engine, filters=predicate)
    dd.assert_eq(len(got), expected_len)