def test_orc_reader_multi_file_single_stripe(datadir):
    """Expect a ValueError when two files are read with ``stripes=[0]``.

    The same Snappy test file is supplied twice as the input list.
    """
    snappy_file = datadir / "TestOrcFile.testSnappy.orc"
    sources = [snappy_file, snappy_file]

    # The read below must be rejected.
    with pytest.raises(ValueError):
        cudf.read_orc(sources, engine="cudf", stripes=[0])
def test_skip_rows_for_nested_types(columns):
    """Expect a RuntimeError when ``skiprows`` is used on ``list_struct_buff``.

    The error message has to match the "not supported by nested column" text.
    """
    expected_message = "skip_rows is not supported by nested column"
    read_options = dict(columns=columns, use_index=True, skiprows=5)

    with pytest.raises(RuntimeError, match=expected_message):
        cudf.read_orc(list_struct_buff, **read_options)
def read_orc(path, **kwargs):
    """
    Create a Dask DataFrame backed by one or more ORC files.

    One task is added to the graph per file. The keyword arguments are
    forwarded unchanged to ``cudf.read_orc``; see that function for
    additional details.

    Examples
    --------
    >>> import dask_cudf
    >>> df = dask_cudf.read_orc("/path/to/*.orc")  # doctest: +SKIP

    See Also
    --------
    cudf.read_orc
    """
    name = "read-orc-" + tokenize(path, **kwargs)

    if "://" not in str(path):
        # Plain path: expand it as a glob pattern and read by file name.
        filenames = sorted(glob(str(path)))
        meta = cudf.read_orc(filenames[0], **kwargs)
        dsk = {}
        for part, fn in enumerate(filenames):
            dsk[(name, part)] = (apply, cudf.read_orc, [fn], kwargs)
    else:
        # URL-style path: go through ``open_files`` and the ``_read_orc``
        # helper instead of passing names straight to cudf.
        files = open_files(path)

        # An `OpenFile` should be used in a Context
        with files[0] as first_file:
            meta = cudf.read_orc(first_file, **kwargs)

        dsk = {}
        for part, open_file in enumerate(files):
            dsk[(name, part)] = (apply, _read_orc, [open_file], kwargs)

    # Divisions are unknown, so every boundary is None.
    divisions = [None] * (len(dsk) + 1)
    return dd.core.new_dd_object(dsk, name, meta, divisions)
def test_orc_read_stripes(datadir, engine):
    """Compare stripe-selective reads against a read of the whole file.

    Covers reading one stripe at a time, all stripes at once, and two
    partial selections.
    """
    orc_path = datadir / "TestOrcFile.testDate1900.orc"
    try:
        full_frame = cudf.read_orc(orc_path, engine=engine)
    except pa.ArrowIOError as err:
        pytest.skip(".orc file is not found: %s" % err)

    _, stripe_count, _ = cudf.io.read_orc_metadata(orc_path)

    # Read stripes one at a time
    pieces = []
    for index in range(stripe_count):
        pieces.append(cudf.read_orc(orc_path, engine=engine, stripes=[index]))
    stitched = cudf.concat(pieces).reset_index(drop=True)
    assert_eq(full_frame, stitched, check_categorical=False)

    # Read stripes all at once
    everything = cudf.read_orc(
        orc_path, engine=engine, stripes=range(stripe_count)
    )
    assert_eq(full_frame, everything, check_categorical=False)

    # Read only some stripes
    first_two = cudf.read_orc(orc_path, engine=engine, stripes=[0, 1])
    assert_eq(first_two, full_frame.head(25000))

    first_and_last = cudf.read_orc(
        orc_path, engine=engine, stripes=[0, stripe_count - 1]
    )
    expected_ends = cudf.concat(
        [full_frame.head(15000), full_frame.tail(10000)], ignore_index=True
    )
    assert_eq(first_and_last, expected_ends)
def test_empty_dataframe():
    """Round-trip an empty DataFrame through an in-memory ORC buffer."""
    orc_buffer = BytesIO()
    empty_frame = cudf.DataFrame()
    empty_frame.to_orc(orc_buffer)

    # Raise error if column name is mentioned, but it doesn't exist.
    with pytest.raises(RuntimeError):
        cudf.read_orc(orc_buffer, columns=["a"])

    from_cudf = cudf.read_orc(orc_buffer)
    from_pandas = pd.read_orc(orc_buffer)

    assert_eq(empty_frame, from_cudf)
    assert_eq(from_pandas, from_cudf)
def test_orc_read_stripe(datadir, engine):
    """Read each stripe via ``stripe=i`` and compare with the whole file."""
    orc_path = datadir / "TestOrcFile.testDate1900.orc"
    try:
        full_frame = cudf.read_orc(orc_path, engine=engine)
    except pa.ArrowIOError as err:
        pytest.skip(".orc file is not found: %s" % err)

    _, stripe_count, _ = cudf.io.read_orc_metadata(orc_path)

    # One read per stripe, concatenated back in stripe order.
    pieces = []
    for index in range(stripe_count):
        pieces.append(cudf.read_orc(orc_path, engine=engine, stripe=index))
    stitched = cudf.concat(pieces).reset_index(drop=True)

    assert_eq(full_frame, stitched, check_categorical=False)
def test_orc_timestamp_read(datadir):
    """pandas and cuDF must agree on ``TestOrcFile.timestamp.issue.orc``."""
    orc_path = datadir / "TestOrcFile.timestamp.issue.orc"

    expected = pd.read_orc(orc_path)
    actual = cudf.read_orc(orc_path)

    assert_eq(expected, actual)
def test_pyspark_struct(datadir):
    """PyArrow and cuDF must agree on ``TestOrcFile.testPySparkStruct.orc``."""
    orc_path = datadir / "TestOrcFile.testPySparkStruct.orc"

    arrow_table = pa.orc.ORCFile(orc_path).read()
    expected = arrow_table.to_pandas()
    actual = cudf.read_orc(orc_path)

    assert_eq(expected, actual)
def test_orc_read_rows(datadir, skip_rows, num_rows):
    """Check that ``skip_rows`` / ``num_rows`` select the expected row window.

    The cuDF result is compared numerically against the matching slice of
    the full table read with PyArrow.

    The test is skipped only when PyArrow reports an ``ArrowIOError`` while
    opening the file; any other failure propagates instead of being printed
    and followed by a ``NameError`` on the unset ``orcfile``.
    """
    path = datadir / 'TestOrcFile.decimal.orc'
    try:
        orcfile = pa.orc.ORCFile(path)
    except pa.ArrowIOError:
        pytest.skip('.orc file is not found')

    pdf = orcfile.read().to_pandas()
    gdf = cudf.read_orc(
        path, engine='cudf', skip_rows=skip_rows, num_rows=num_rows
    ).to_pandas()

    # Convert the decimal dtype from PyArrow to float64 for comparison to cuDF
    # This is because cuDF returns as float64 as it lacks an equivalent dtype
    pdf = pdf.apply(pd.to_numeric)

    # Slice rows out of the whole dataframe for comparison as PyArrow doesn't
    # have an API to read a subsection of rows from the file
    pdf = pdf[skip_rows:skip_rows + num_rows]

    np.testing.assert_allclose(pdf, gdf)
def read_orc(path, **kwargs):
    """
    Create a Dask DataFrame backed by one or more ORC files.

    ``path`` is expanded as a glob pattern and one ``cudf.read_orc`` task
    is added to the graph per matched file. The keyword arguments are
    forwarded unchanged; see ``cudf.read_orc`` for additional details.

    Examples
    --------
    >>> import dask_cudf
    >>> df = dask_cudf.read_orc("/path/to/*.orc")  # doctest: +SKIP

    See Also
    --------
    cudf.read_orc
    """
    matched = glob(str(path))
    matched.sort()

    name = "read-orc-" + tokenize(path, **kwargs)

    # The first file is read eagerly to serve as the collection's metadata.
    meta = cudf.read_orc(matched[0], **kwargs)

    graph = {}
    for part, fn in enumerate(matched):
        graph[(name, part)] = (apply, cudf.read_orc, [fn], kwargs)

    # Divisions are unknown, so every boundary is None.
    divisions = [None] * (len(matched) + 1)
    return dd.core.new_dd_object(graph, name, meta, divisions)
def load_data(self, filename='dataset.orc', col_labels=None, y_label='ArrDelayBinary'):
    """
    Load the dataset in ``filename`` and time how long ingestion takes.

    The reader is chosen by substring checks on two instance attributes:
    ``self.compute_type`` ('CPU' -> pandas / pyarrow, 'GPU' -> cudf) and
    ``self.data_type`` ('ORC' or 'CSV'). If both substrings are present,
    'CPU' and 'ORC' take precedence.

    Parameters
    ----------
    filename : str
        File to read; it is used as given.
    col_labels : list, optional
        Passed as ``names`` to the CSV readers; not used for ORC.
    y_label : str
        Returned unchanged.

    Returns
    -------
    tuple
        ``(dataset, col_labels, y_label, ingestion_timer.duration)``.

    Raises
    ------
    ValueError
        If ``self.compute_type`` names neither 'CPU' nor 'GPU', or
        ``self.data_type`` names neither 'ORC' nor 'CSV'. Without this check
        ``dataset`` was never assigned and the method failed later with an
        ``UnboundLocalError``.
    """
    target_filename = filename
    self.log_to_file(f'\n> loading dataset from {target_filename}...\n')

    # Validate before starting the timer so a bad configuration fails fast
    # with a descriptive error.
    use_cpu = 'CPU' in self.compute_type
    if not use_cpu and 'GPU' not in self.compute_type:
        raise ValueError(
            f'unsupported compute_type {self.compute_type!r}: '
            "expected it to contain 'CPU' or 'GPU'"
        )
    use_orc = 'ORC' in self.data_type
    if not use_orc and 'CSV' not in self.data_type:
        raise ValueError(
            f'unsupported data_type {self.data_type!r}: '
            "expected it to contain 'ORC' or 'CSV'"
        )

    with PerfTimer() as ingestion_timer:
        if use_cpu:
            if use_orc:
                with open(target_filename, mode='rb') as file:
                    dataset = pyarrow_orc.ORCFile(file).read().to_pandas()
            else:
                dataset = pd.read_csv(target_filename, names=col_labels)
        elif use_orc:
            dataset = cudf.read_orc(target_filename)
        else:
            dataset = cudf.read_csv(target_filename, names=col_labels)

    self.log_to_file(f'ingestion completed in {ingestion_timer.duration}')
    self.log_to_file(
        f'dataset descriptors: {dataset.shape}\n {dataset.dtypes}\n {dataset.columns}\n'
    )
    return dataset, col_labels, y_label, ingestion_timer.duration
def orc_reader_test(input_tuple, skiprows, columns, num_rows, use_index):
    """Read an ORC buffer with cuDF and compare it with the expected frame.

    ``input_tuple`` is unpacked into a reference DataFrame and the ORC
    bytes. The reference frame is trimmed with the same row and column
    selection that is passed to ``cudf.read_orc``.
    """
    # TODO: Remove skiprows=0 after
    # following issue is fixed:
    # https://github.com/rapidsai/cudf/issues/6563
    skiprows = 0

    reference, orc_bytes = input_tuple

    expected_pdf = reference.iloc[skiprows:]
    if num_rows is not None:
        expected_pdf = expected_pdf.head(num_rows)

    # Re-number the rows whenever a row selection was requested.
    if not (skiprows is None and num_rows is None):
        expected_pdf.reset_index(drop=True, inplace=True)

    if columns is not None:
        expected_pdf = expected_pdf[columns]

    if use_index is False:
        expected_pdf.reset_index(drop=True, inplace=True)

    read_options = dict(
        columns=columns,
        skiprows=skiprows,
        num_rows=num_rows,
        use_index=use_index,
    )
    gdf = cudf.read_orc(io.BytesIO(orc_bytes), **read_options)

    compare_dataframe(expected_pdf, gdf)
def test_no_row_group_index_orc_read(datadir, fname):
    """The cuDF read, converted to Arrow, must equal the PyArrow read."""
    orc_path = datadir / fname

    arrow_table = pa.orc.ORCFile(orc_path).read()
    cudf_frame = cudf.read_orc(orc_path)

    assert arrow_table.equals(cudf_frame.to_arrow())
def test_orc_reader_boolean_type(datadir, orc_file):
    """pandas and cuDF (converted to pandas) must agree on ``orc_file``."""
    orc_path = datadir / orc_file

    expected = pd.read_orc(orc_path)
    actual = cudf.read_orc(orc_path).to_pandas()

    assert_eq(expected, actual)
def fetch_data(self):
    """
    Read the configured input file with cudf and return the result.

    ``self.config`` supplies ``input_format`` (matched case-insensitively)
    and ``input_path``. Every remaining entry except ``type`` is forwarded
    as keyword arguments to the csv, parquet and json readers.

    Raises ``NotImplementedError`` for any other format. On success
    ``self.has_data`` is set to ``False`` before returning.
    """
    config = self.config
    input_format = config["input_format"].lower()
    filepath = config["input_path"]

    # Work on a copy so the stored config keeps its bookkeeping keys.
    reader_options = config.copy()
    for bookkeeping_key in ("type", "input_format", "input_path"):
        del reader_options[bookkeeping_key]

    if input_format == "orc":
        # The ORC read does not receive the remaining config entries.
        frame = cudf.read_orc(filepath, engine="cudf")
    elif input_format == "csv":
        frame = cudf.read_csv(filepath, **reader_options)
    elif input_format == "parquet":
        frame = cudf.read_parquet(filepath, **reader_options)
    elif input_format == "json":
        frame = cudf.read_json(filepath, **reader_options)
    else:
        raise NotImplementedError("%s is not a supported input_format" % (input_format))

    self.has_data = False
    return frame
def test_to_orc(tmpdir, dtypes, compression, compute):
    """Write ORC with cudf and dask_cudf, read both back, and compare them."""
    # Create cudf and dask_cudf dataframes
    source = cudf.datasets.randomdata(nrows=10, dtypes=dtypes, seed=1)
    source = source.set_index("index").sort_index()
    dask_source = dask_cudf.from_cudf(source, npartitions=3)

    # Write cudf dataframe as single file
    # (preserve index by setting to column)
    single_file = tmpdir.join("test.orc")
    source.reset_index().to_orc(single_file, compression=compression)

    # Write dask_cudf dataframe as multiple files
    # (preserve index by `write_index=True`)
    write_result = dask_source.to_orc(
        str(tmpdir),
        write_index=True,
        compression=compression,
        compute=compute,
    )
    if not compute:
        write_result.compute()

    # Read back cudf dataframe
    single_read = cudf.read_orc(single_file).set_index("index")

    # Read back dask_cudf dataframe
    part_files = glob.glob(str(tmpdir) + "/part.*.orc")
    dask_read = dask_cudf.read_orc(part_files).set_index("index")

    # Make sure the dask_cudf dataframe matches
    # the cudf dataframes (source and single_read)
    dd.assert_eq(source, dask_read)
    dd.assert_eq(single_read, dask_read)
def _read_orc_stripe(fs, path, stripe, columns, kwargs=None):
    """Pull out specific columns from specific stripe

    Parameters
    ----------
    fs
        Object whose ``open(path, "rb")`` returns a context-managed file.
    path
        Location of the ORC file, passed to ``fs.open``.
    stripe
        Stripe selector; passed to ``cudf.read_orc`` as ``stripes=[stripe]``.
    columns
        Column selection forwarded to ``cudf.read_orc``.
    kwargs : dict, optional
        Extra keyword arguments for ``cudf.read_orc``. Defaults to none.

    Returns
    -------
    The object returned by ``cudf.read_orc`` for that stripe.
    """
    # A ``{}`` default would be one dict shared by every call; use a None
    # sentinel and build a fresh dict instead.
    if kwargs is None:
        kwargs = {}
    with fs.open(path, "rb") as f:
        df_stripe = cudf.read_orc(f, stripes=[stripe], columns=columns, **kwargs)
    return df_stripe
def test_orc_reader_filepath_or_buffer(path_or_buf, src):
    """Read four columns from ``path_or_buf(src)`` and compare with PyArrow.

    The PyArrow reference is always read from ``path_or_buf("filepath")``.
    """
    selected = ["int1", "long1", "float1", "double1"]

    reference_file = pa.orc.ORCFile(path_or_buf("filepath"))
    expected = reference_file.read(columns=selected).to_pandas()

    actual = cudf.read_orc(path_or_buf(src), columns=selected)

    assert_eq(expected, actual)
def test_orc_reader_decimal_as_int(datadir):
    """Check the first ``_col0`` value read with ``decimals_as_float=False``."""
    orc_path = datadir / "TestOrcFile.decimal.orc"

    frame = cudf.read_orc(
        orc_path,
        engine="cudf",
        decimals_as_float=False,
        force_decimal_scale=2,
    )
    as_pandas = frame.to_pandas()

    first_value = as_pandas["_col0"][0]
    assert first_value == -100050  # -1000.5
def test_orc_read_filtered(datadir, engine, predicate, expected_len):
    """A read with ``filters=predicate`` must return ``expected_len`` rows."""
    orc_path = datadir / "TestOrcFile.testStripeLevelStats.orc"
    try:
        filtered = cudf.read_orc(orc_path, engine=engine, filters=predicate)
    except pa.ArrowIOError as err:
        pytest.skip(".orc file is not found: %s" % err)

    # Assert # of rows after filtering
    row_count = len(filtered)
    assert row_count == expected_len
def test_writer_lists_structs(list_struct_buff):
    """Re-write ``list_struct_buff`` with cuDF and read it back with PyArrow.

    The PyArrow table must equal the cuDF frame converted to Arrow.
    """
    source_frame = cudf.read_orc(list_struct_buff)

    rewritten = BytesIO()
    source_frame.to_orc(rewritten)

    arrow_table = pyarrow.orc.ORCFile(rewritten).read()
    expected_table = source_frame.to_arrow()

    assert arrow_table.equals(expected_table)
def test_orc_reader_decimal_type(datadir, orc_file):
    """pandas and cuDF must agree on ``orc_file`` with ``col8`` as strings."""
    orc_path = datadir / orc_file

    expected = pd.read_orc(orc_path)
    actual = cudf.read_orc(orc_path).to_pandas()

    # Converting to strings since pandas keeps it in decimal
    for frame in (expected, actual):
        frame["col8"] = frame["col8"].astype("str")

    assert_eq(expected, actual)
def test_orc_writer_rle_stream_size(datadir, tmpdir):
    """Re-encode an ORC file with cuDF and read the copy back with PyArrow."""
    source_path = datadir / "TestOrcFile.int16.rle.size.orc"
    copy_path = tmpdir.join("int16_map.orc")

    frame = cudf.read_orc(source_path)
    frame.to_orc(copy_path)

    # Segfaults when RLE stream sizes don't account for varint length
    arrow_copy = pa.orc.ORCFile(copy_path).read()

    assert_eq(frame.to_pandas(), arrow_copy)
def test_orc_reader_gmt_timestamps(datadir):
    """PyArrow and cuDF must agree on ``TestOrcFile.gmt.orc``.

    Skipped when PyArrow raises ``ArrowIOError`` while opening the file.
    """
    orc_path = datadir / "TestOrcFile.gmt.orc"
    try:
        reference_file = pa.orc.ORCFile(orc_path)
    except pa.ArrowIOError as err:
        pytest.skip(".orc file is not found: %s" % err)

    expected = reference_file.read().to_pandas()
    actual = cudf.read_orc(orc_path, engine="cudf").to_pandas()

    assert_eq(expected, actual)
def test_orc_decimal_precision_fail(datadir):
    """Reading the precision-19 file fails unless only ``int`` is selected.

    Skipped when PyArrow raises ``ArrowIOError`` while opening the file.
    """
    orc_path = datadir / "TestOrcFile.int_decimal.precision_19.orc"
    try:
        reference_file = pa.orc.ORCFile(orc_path)
    except pa.ArrowIOError as err:
        pytest.skip(".orc file is not found: %s" % err)

    # Max precision supported is 18 (Decimal64Dtype limit)
    # and the data has the precision 19. This test should be removed
    # once Decimal128Dtype is introduced.
    with pytest.raises(RuntimeError):
        cudf.read_orc(orc_path)

    # Shouldn't cause failure if decimal column is not chosen to be read.
    int_only = ["int"]
    expected = reference_file.read(columns=int_only).to_pandas()
    actual = cudf.read_orc(orc_path, columns=int_only)

    assert_eq(expected, actual)
def test_orc_reader_strings(datadir):
    """PyArrow and cuDF must agree on the ``string1`` column.

    Skipped when PyArrow raises ``ArrowIOError`` while opening the file.
    """
    orc_path = datadir / "TestOrcFile.testStringAndBinaryStatistics.orc"
    try:
        reference_file = pa.orc.ORCFile(orc_path)
    except pa.ArrowIOError as err:
        pytest.skip(".orc file is not found: %s" % err)

    wanted = ["string1"]
    expected = reference_file.read(columns=wanted)
    actual = cudf.read_orc(orc_path, engine="cudf", columns=wanted)

    assert_eq(expected, actual, check_categorical=False)
def test_int_overflow(tmpdir):
    """Round-trip a mostly-null int32 column holding one large value."""
    orc_path = tmpdir.join("gdf_overflow.orc")

    # The number of rows and the large element trigger delta encoding
    row_count = 513
    last_row = row_count - 1

    frame = cudf.DataFrame({"a": [None] * row_count}, dtype="int32")
    frame["a"][0] = 1024 ** 3
    frame["a"][last_row] = 1

    frame.to_orc(orc_path)
    read_back = cudf.read_orc(orc_path)

    assert_eq(read_back, frame)
def test_orc_reader_uncompressed_block(datadir):
    """PyArrow and cuDF must agree on ``uncompressed_snappy.orc``.

    Skipped when PyArrow raises ``ArrowIOError`` while opening the file.
    """
    orc_path = datadir / "uncompressed_snappy.orc"
    try:
        reference_file = pa.orc.ORCFile(orc_path)
    except pa.ArrowIOError as err:
        pytest.skip(".orc file is not found: %s" % err)

    expected = reference_file.read().to_pandas()
    actual = cudf.read_orc(orc_path, engine="cudf")

    assert_eq(expected, actual, check_categorical=False)
def test_orc_reader_datetimestamp(datadir, inputfile, use_index):
    """PyArrow and cuDF must agree on ``inputfile`` for this ``use_index``.

    Skipped when PyArrow raises ``ArrowIOError`` while opening the file.
    """
    orc_path = datadir / inputfile
    try:
        reference_file = pa.orc.ORCFile(orc_path)
    except pa.ArrowIOError as err:
        pytest.skip(".orc file is not found: %s" % err)

    arrow_table = reference_file.read()
    expected = arrow_table.to_pandas(date_as_object=False)
    actual = cudf.read_orc(orc_path, engine="cudf", use_index=use_index)

    assert_eq(expected, actual, check_categorical=False)
def test_orc_writer_sliced(tmpdir):
    """Write rows 1-2 of a string frame to ORC and read them back."""
    orc_path = tmpdir.join("cudf.orc")

    host_frame = pd.DataFrame()
    host_frame["String"] = np.array(["Alpha", "Beta", "Gamma", "Delta"])
    device_frame = cudf.from_pandas(host_frame)

    middle_rows = device_frame.iloc[1:3]
    middle_rows.to_orc(orc_path)

    read_back = cudf.read_orc(orc_path)
    expected = middle_rows.reset_index(drop=True)
    assert_eq(read_back, expected)