def fetch_data(self):
    """Fetch data using dask based on the provided config object.

    Reads ``input_path`` in the format named by ``input_format``
    (csv, parquet, orc, or json); any remaining config keys are
    forwarded to the dask_cudf reader as keyword arguments.
    """
    fmt = self.config["input_format"].lower()
    path = self.config["input_path"]
    # Strip the config bookkeeping keys; the rest are reader kwargs.
    reader_kwargs = self.config.copy()
    for consumed in ("type", "input_format", "input_path"):
        del reader_kwargs[consumed]
    if fmt == "csv":
        frame = dask_cudf.read_csv(path, **reader_kwargs)
    elif fmt == "parquet":
        frame = dask_cudf.read_parquet(path, **reader_kwargs)
    elif fmt == "orc":
        # ORC path pins engine="cudf" and does not forward extra kwargs.
        frame = dask_cudf.read_orc(path, engine="cudf")
    elif fmt == "json":
        frame = dask_cudf.read_json(path, **reader_kwargs)
    else:
        raise NotImplementedError("%s is not a supported input_format" % (fmt))
    self.has_data = False
    return frame
def test_to_orc(tmpdir, dtypes, compression, compute):
    """Round-trip a dataframe through ORC via both cudf and dask_cudf."""
    # Build matching cudf and dask_cudf frames from the same random data.
    gdf = cudf.datasets.randomdata(nrows=10, dtypes=dtypes, seed=1)
    gdf = gdf.set_index("index").sort_index()
    dgdf = dask_cudf.from_cudf(gdf, npartitions=3)

    # cudf writes one file; preserve the index by turning it into a column.
    single_path = tmpdir.join("test.orc")
    gdf.reset_index().to_orc(single_path, compression=compression)

    # dask_cudf writes one file per partition; write_index keeps the index.
    result = dgdf.to_orc(
        str(tmpdir), write_index=True, compression=compression, compute=compute
    )
    if not compute:
        result.compute()

    # Read both outputs back and compare all three frames.
    gdf_read = cudf.read_orc(single_path).set_index("index")
    part_files = glob.glob(str(tmpdir) + "/part.*.orc")
    dgdf_read = dask_cudf.read_orc(part_files).set_index("index")
    dd.assert_eq(gdf, dgdf_read)
    dd.assert_eq(gdf_read, dgdf_read)
def test_read_orc_first_file_empty(tmpdir):
    """Reading a multi-file ORC dataset whose first file is empty.

    Regression test for https://github.com/rapidsai/cudf/issues/8011.
    """
    path = str(tmpdir)
    os.makedirs(path, exist_ok=True)
    expected = cudf.DataFrame({"id": [1, 2], "float": [1.0, 2.0]})
    # Split into three files: empty slice, first row, second row.
    pieces = (expected.iloc[:0], expected.iloc[:1], expected.iloc[1:])
    for idx, piece in enumerate(pieces):
        piece.to_orc(os.path.join(path, "data.%d" % idx))
    # Read the whole directory back with dask_cudf and compare.
    got = dask_cudf.read_orc(os.path.join(path, "*"))
    dd.assert_eq(expected, got, check_index=False)
def fetch_data(self):
    """Fetch data using dask based on the provided config object.

    Returns a dask_cudf DataFrame read from ``input_path`` in the format
    named by ``input_format`` (parquet, orc, or csv as the fallback);
    leftover config keys are forwarded to the reader as keyword arguments.
    """
    df = None
    input_format = self.config["input_format"].lower()
    # BUG FIX: only the format selector should be lower-cased. Lower-casing
    # the path itself breaks reads on case-sensitive filesystems and object
    # stores (e.g. "/Data/File.orc" != "/data/file.orc").
    filepath = self.config["input_path"]
    # Strip the config bookkeeping keys; the rest are reader kwargs.
    kwargs = self.config.copy()
    del kwargs["type"]
    del kwargs["input_format"]
    del kwargs["input_path"]
    if "parquet" == input_format:
        df = dask_cudf.read_parquet(filepath, **kwargs)
    elif "orc" == input_format:
        # NOTE(review): extra kwargs are intentionally not forwarded here,
        # matching the original behavior — confirm whether they should be.
        df = dask_cudf.read_orc(filepath, engine="cudf")
    else:
        # Any other format falls back to CSV (original behavior preserved).
        df = dask_cudf.read_csv(filepath, **kwargs)
    # NOTE(review): setting has_data to False right after a successful read
    # looks inverted — confirm against the caller's use of this flag.
    self.has_data = False
    return df
def test_read_orc_cols(engine, columns):
    """Column selection must agree between cudf and dask_cudf readers."""
    expect = cudf.read_orc(sample_orc, engine=engine, columns=columns)
    got = dask_cudf.read_orc(sample_orc, engine=engine, columns=columns)
    dd.assert_eq(expect, got, check_index=False)
def test_filepath_read_orc_defaults():
    """A file:// URL should read identically via cudf and dask_cudf."""
    url = "file://%s" % sample_orc
    expect = cudf.read_orc(url)
    got = dask_cudf.read_orc(url)
    dd.assert_eq(expect, got, check_index=False)
def test_read_orc_defaults():
    """Default-argument reads must agree between cudf and dask_cudf."""
    expect = cudf.read_orc(sample_orc)
    got = dask_cudf.read_orc(sample_orc)
    # Evaluate a small piece of the lazy frame before the full comparison.
    got.head().to_pandas()
    dd.assert_eq(expect, got, check_index=False)
def test_filelist_read_orc_defaults():
    """A one-element path list should read the same as the bare path."""
    paths = [sample_orc]
    expect = cudf.read_orc(paths[0])
    got = dask_cudf.read_orc(paths)
    dd.assert_eq(expect, got, check_index=False)
def gpu_load_performance_data(performance_path, **kwargs):
    """Read the performance ORC dataset into a dask_cudf DataFrame.

    Extra keyword arguments are passed through to ``dask_cudf.read_orc``.
    """
    return dask_cudf.read_orc(performance_path, **kwargs)
def test_read_orc_filtered(tmpdir, engine, predicate, expected_len):
    """Row filters passed to read_orc should yield the expected row count."""
    filtered = dask_cudf.read_orc(sample_orc, engine=engine, filters=predicate)
    dd.assert_eq(len(filtered), expected_len)