Ejemplo n.º 1
0
def test_orc_reader_multi_file_single_stripe(datadir):
    """Expect ``ValueError`` when two sources are read with ``stripes=[0]``."""
    snappy_file = datadir / "TestOrcFile.testSnappy.orc"
    sources = [snappy_file, snappy_file]

    # The same file is passed twice together with a flat ``stripes`` list;
    # the reader is expected to reject this combination.
    with pytest.raises(ValueError):
        cudf.read_orc(sources, engine="cudf", stripes=[0])
Ejemplo n.º 2
0
def test_skip_rows_for_nested_types(columns):
    """Expect ``RuntimeError`` when ``skiprows`` is used on the nested buffer.

    ``list_struct_buff`` is a name defined outside this function.
    """
    expected_message = "skip_rows is not supported by nested column"

    with pytest.raises(RuntimeError, match=expected_message):
        cudf.read_orc(
            list_struct_buff,
            columns=columns,
            use_index=True,
            skiprows=5,
        )
Ejemplo n.º 3
0
def read_orc(path, **kwargs):
    """Build a Dask DataFrame whose partitions are ORC files.

    Every matched file becomes one task that calls ``cudf.read_orc`` (or the
    ``_read_orc`` helper for paths containing ``"://"``). The first file is
    read eagerly to obtain the ``meta`` object handed to Dask.
    See ``cudf.read_orc`` for the accepted keyword arguments.

    Examples
    --------
    >>> import dask_cudf
    >>> df = dask_cudf.read_orc("/path/to/*.orc")  # doctest: +SKIP

    See Also
    --------
    cudf.read_orc
    """

    name = "read-orc-" + tokenize(path, **kwargs)

    if "://" in str(path):
        # Paths with a protocol separator are opened through ``open_files``.
        sources = open_files(path)

        # An `OpenFile` should be used in a Context
        with sources[0] as first_handle:
            meta = cudf.read_orc(first_handle, **kwargs)

        reader = _read_orc
    else:
        # Plain paths are expanded as glob patterns, in sorted order.
        sources = sorted(glob(str(path)))
        meta = cudf.read_orc(sources[0], **kwargs)
        reader = cudf.read_orc

    # One task per source; the key index is the partition number.
    graph = {
        (name, part): (apply, reader, [source], kwargs)
        for part, source in enumerate(sources)
    }

    # Divisions are unknown, hence ``None`` for each boundary.
    divisions = [None] * (len(graph) + 1)
    return dd.core.new_dd_object(graph, name, meta, divisions)
Ejemplo n.º 4
0
def test_orc_read_stripes(datadir, engine):
    """Compare stripe-wise reads of an ORC file against a full read.

    The test is skipped when the initial read raises ``pa.ArrowIOError``.
    """
    path = datadir / "TestOrcFile.testDate1900.orc"
    try:
        full_df = cudf.read_orc(path, engine=engine)
    except pa.ArrowIOError as err:
        pytest.skip(".orc file is not found: %s" % err)

    _, stripe_count, _ = cudf.io.read_orc_metadata(path)

    # One read per stripe, stitched back together in stripe order.
    per_stripe = []
    for stripe_idx in range(stripe_count):
        per_stripe.append(
            cudf.read_orc(path, engine=engine, stripes=[stripe_idx])
        )
    stitched = cudf.concat(per_stripe).reset_index(drop=True)
    assert_eq(full_df, stitched, check_categorical=False)

    # Every stripe requested in a single call.
    all_at_once = cudf.read_orc(
        path, engine=engine, stripes=range(stripe_count)
    )
    assert_eq(full_df, all_at_once, check_categorical=False)

    # A subset of stripes: the first two ...
    first_two = cudf.read_orc(path, engine=engine, stripes=[0, 1])
    assert_eq(first_two, full_df.head(25000))

    # ... and then the first together with the last.
    first_and_last = cudf.read_orc(
        path, engine=engine, stripes=[0, stripe_count - 1]
    )
    expected = cudf.concat(
        [full_df.head(15000), full_df.tail(10000)], ignore_index=True
    )
    assert_eq(first_and_last, expected)
Ejemplo n.º 5
0
def test_empty_dataframe():
    """Round-trip an empty cuDF DataFrame through an in-memory ORC buffer."""
    orc_bytes = BytesIO()
    empty_gdf = cudf.DataFrame()
    empty_gdf.to_orc(orc_bytes)

    # Asking for a column that was never written must fail.
    with pytest.raises(RuntimeError):
        cudf.read_orc(orc_bytes, columns=["a"])

    roundtrip_gdf = cudf.read_orc(orc_bytes)
    pandas_view = pd.read_orc(orc_bytes)

    # The cuDF result must match both the original frame and pandas' reading.
    assert_eq(empty_gdf, roundtrip_gdf)
    assert_eq(pandas_view, roundtrip_gdf)
Ejemplo n.º 6
0
def test_orc_read_stripe(datadir, engine):
    """Read an ORC file one stripe at a time and compare with a full read.

    The test is skipped when the initial read raises ``pa.ArrowIOError``.
    """
    path = datadir / "TestOrcFile.testDate1900.orc"
    try:
        full_df = cudf.read_orc(path, engine=engine)
    except pa.ArrowIOError as err:
        pytest.skip(".orc file is not found: %s" % err)

    _, stripe_count, _ = cudf.io.read_orc_metadata(path)

    # Collect each stripe separately, in stripe order.
    pieces = []
    for stripe_idx in range(stripe_count):
        pieces.append(cudf.read_orc(path, engine=engine, stripe=stripe_idx))

    stitched = cudf.concat(pieces).reset_index(drop=True)

    assert_eq(full_df, stitched, check_categorical=False)
Ejemplo n.º 7
0
def test_orc_timestamp_read(datadir):
    """The timestamp sample file must read identically via pandas and cuDF."""
    orc_path = datadir / "TestOrcFile.timestamp.issue.orc"

    # Arguments are evaluated left to right: pandas first, then cuDF.
    assert_eq(pd.read_orc(orc_path), cudf.read_orc(orc_path))
Ejemplo n.º 8
0
def test_pyspark_struct(datadir):
    """The PySpark struct sample must read identically via PyArrow and cuDF."""
    orc_path = datadir / "TestOrcFile.testPySparkStruct.orc"

    # Reference result: PyArrow table converted to pandas.
    arrow_table = pa.orc.ORCFile(orc_path).read()
    expected = arrow_table.to_pandas()

    actual = cudf.read_orc(orc_path)

    assert_eq(expected, actual)
Ejemplo n.º 9
0
def test_orc_read_rows(datadir, skip_rows, num_rows):
    """Check that a cuDF row-range read matches the same slice from PyArrow.

    The whole file is read with PyArrow and sliced to
    ``[skip_rows, skip_rows + num_rows)``; cuDF reads the same range directly
    through its ``skip_rows`` / ``num_rows`` arguments. The test is skipped
    when opening the file raises an exception whose class is named
    ``ArrowIOError``; any other exception is propagated.
    """
    path = datadir / 'TestOrcFile.decimal.orc'
    try:
        orcfile = pa.orc.ORCFile(path)
    except Exception as excpr:
        # The exception is matched by class name, as in the original code.
        if type(excpr).__name__ == 'ArrowIOError':
            pytest.skip('.orc file is not found')
        # Anything else is a genuine failure. Previously its name was only
        # printed and execution fell through to a confusing error on the
        # unbound ``orcfile``; re-raise so the real cause is reported.
        raise

    pdf = orcfile.read().to_pandas()
    gdf = cudf.read_orc(path,
                        engine='cudf',
                        skip_rows=skip_rows,
                        num_rows=num_rows).to_pandas()

    # Convert the decimal dtype from PyArrow to float64 for comparison to cuDF
    # This is because cuDF returns as float64 as it lacks an equivalent dtype
    pdf = pdf.apply(pd.to_numeric)

    # Slice rows out of the whole dataframe for comparison as PyArrow doesn't
    # have an API to read a subsection of rows from the file
    pdf = pdf[skip_rows:skip_rows + num_rows]

    np.testing.assert_allclose(pdf, gdf)
Ejemplo n.º 10
0
def read_orc(path, **kwargs):
    """Build a Dask DataFrame whose partitions are ORC files.

    ``path`` is expanded as a glob pattern; each matching file becomes one
    task that calls ``cudf.read_orc`` with ``kwargs``. The first file (in
    sorted order) is read eagerly to obtain the ``meta`` object.
    See ``cudf.read_orc`` for the accepted keyword arguments.

    Examples
    --------
    >>> import dask_cudf
    >>> df = dask_cudf.read_orc("/path/to/*.orc")  # doctest: +SKIP

    See Also
    --------
    cudf.read_orc
    """

    matched = sorted(glob(str(path)))
    name = "read-orc-" + tokenize(path, **kwargs)

    meta = cudf.read_orc(matched[0], **kwargs)

    # One task per file; the key index is the partition number.
    graph = {}
    for part, orc_file in enumerate(matched):
        graph[(name, part)] = (apply, cudf.read_orc, [orc_file], kwargs)

    # Divisions are unknown, hence ``None`` for each boundary.
    divisions = [None] * (len(matched) + 1)

    return dd.core.new_dd_object(graph, name, meta, divisions)
Ejemplo n.º 11
0
    def load_data(self,
                  filename='dataset.orc',
                  col_labels=None,
                  y_label='ArrDelayBinary'):
        """Load the dataset at ``filename`` and time the ingestion.

        The reader is chosen from ``self.compute_type`` (contains ``'CPU'`` or
        ``'GPU'``) and ``self.data_type`` (contains ``'ORC'`` or ``'CSV'``).
        ``col_labels`` is forwarded as ``names=`` to the CSV readers only.

        Returns
        -------
        tuple
            ``(dataset, col_labels, y_label, ingestion_timer.duration)``;
            ``col_labels`` and ``y_label`` are passed through unchanged.

        Raises
        ------
        ValueError
            If no reader matches the configured compute/data type. Previously
            this case failed with ``UnboundLocalError`` while logging.
        """
        target_filename = filename
        self.log_to_file(f'\n> loading dataset from {target_filename}...\n')

        # Sentinel so an unmatched configuration can be detected below.
        dataset = None

        with PerfTimer() as ingestion_timer:
            if 'CPU' in self.compute_type:
                if 'ORC' in self.data_type:
                    with open(target_filename, mode='rb') as file:
                        dataset = pyarrow_orc.ORCFile(file).read().to_pandas()
                elif 'CSV' in self.data_type:
                    dataset = pd.read_csv(target_filename, names=col_labels)
            elif 'GPU' in self.compute_type:
                if 'ORC' in self.data_type:
                    dataset = cudf.read_orc(target_filename)
                elif 'CSV' in self.data_type:
                    dataset = cudf.read_csv(target_filename, names=col_labels)

        if dataset is None:
            raise ValueError(
                'no loader available for '
                f'compute_type={self.compute_type!r} and '
                f'data_type={self.data_type!r}'
            )

        self.log_to_file(f'ingestion completed in {ingestion_timer.duration}')
        self.log_to_file(
            f'dataset descriptors: {dataset.shape}\n {dataset.dtypes}\n {dataset.columns}\n'
        )
        return dataset, col_labels, y_label, ingestion_timer.duration
Ejemplo n.º 12
0
def orc_reader_test(input_tuple, skiprows, columns, num_rows, use_index):
    """Read an ORC buffer with cuDF and compare it to the expected frame.

    ``input_tuple`` is unpacked as ``(pandas frame, ORC bytes)``. The expected
    frame is derived from the pandas frame by applying the same row/column
    selection that is passed to ``cudf.read_orc``.
    """
    # TODO: Remove skiprows=0 after
    # following issue is fixed:
    # https://github.com/rapidsai/cudf/issues/6563
    skiprows = 0

    source_pdf, orc_bytes = input_tuple

    # Build the expectation step by step, mirroring the reader arguments.
    expected = source_pdf.iloc[skiprows:]
    if num_rows is not None:
        expected = expected.head(num_rows)
    if not (skiprows is None and num_rows is None):
        expected.reset_index(drop=True, inplace=True)
    if columns is not None:
        expected = expected[columns]
    if use_index is False:
        expected.reset_index(drop=True, inplace=True)

    actual = cudf.read_orc(
        io.BytesIO(orc_bytes),
        columns=columns,
        skiprows=skiprows,
        num_rows=num_rows,
        use_index=use_index,
    )
    compare_dataframe(expected, actual)
Ejemplo n.º 13
0
def test_no_row_group_index_orc_read(datadir, fname):
    """The cuDF read of ``fname`` must equal the PyArrow table for that file."""
    orc_path = datadir / fname

    arrow_table = pa.orc.ORCFile(orc_path).read()
    cudf_as_arrow = cudf.read_orc(orc_path).to_arrow()

    assert arrow_table.equals(cudf_as_arrow)
Ejemplo n.º 14
0
def test_orc_reader_boolean_type(datadir, orc_file):
    """``orc_file`` must read identically via pandas and cuDF (as pandas)."""
    orc_path = datadir / orc_file

    expected = pd.read_orc(orc_path)

    # Convert the cuDF result to pandas before comparing.
    gdf = cudf.read_orc(orc_path)
    actual = gdf.to_pandas()

    assert_eq(expected, actual)
Ejemplo n.º 15
0
    def fetch_data(self):
        """
        Read the configured input with cudf and return the resulting frame.

        ``self.config`` must contain ``type``, ``input_format`` and
        ``input_path``; the remaining entries are forwarded as keyword
        arguments to the csv, parquet and json readers (the orc reader is
        called with ``engine="cudf"`` only). ``self.has_data`` is set to
        ``False`` after a successful read.

        Raises ``NotImplementedError`` for any other ``input_format``.
        """
        fmt = self.config["input_format"].lower()
        source = self.config["input_path"]

        # Work on a copy so the stored config keeps its bookkeeping keys.
        reader_options = self.config.copy()
        for bookkeeping_key in ("type", "input_format", "input_path"):
            del reader_options[bookkeeping_key]

        if fmt == "csv":
            frame = cudf.read_csv(source, **reader_options)
        elif fmt == "parquet":
            frame = cudf.read_parquet(source, **reader_options)
        elif fmt == "orc":
            frame = cudf.read_orc(source, engine="cudf")
        elif fmt == "json":
            frame = cudf.read_json(source, **reader_options)
        else:
            raise NotImplementedError("%s is not a supported input_format" %
                                      (fmt))

        self.has_data = False
        return frame
Ejemplo n.º 16
0
def test_to_orc(tmpdir, dtypes, compression, compute):
    """Write ORC from cudf and dask_cudf, read both back, and compare them."""

    # Source data: a cudf frame indexed by "index" and its 3-partition
    # dask_cudf counterpart.
    gdf = cudf.datasets.randomdata(nrows=10, dtypes=dtypes, seed=1)
    gdf = gdf.set_index("index").sort_index()
    dask_gdf = dask_cudf.from_cudf(gdf, npartitions=3)

    # cudf writes a single file; the index is kept by turning it into a
    # regular column first.
    single_file = tmpdir.join("test.orc")
    gdf.reset_index().to_orc(single_file, compression=compression)

    # dask_cudf writes one file per partition; the index is kept through
    # ``write_index=True``.
    write_result = dask_gdf.to_orc(
        str(tmpdir),
        write_index=True,
        compression=compression,
        compute=compute,
    )

    # With ``compute=False`` the write still has to be triggered.
    if not compute:
        write_result.compute()

    # Read the single cudf file back.
    gdf_roundtrip = cudf.read_orc(single_file).set_index("index")

    # Read the partition files back through dask_cudf.
    part_files = glob.glob(str(tmpdir) + "/part.*.orc")
    dask_roundtrip = dask_cudf.read_orc(part_files).set_index("index")

    # The dask_cudf result must match both the original frame and the
    # cudf round-trip.
    dd.assert_eq(gdf, dask_roundtrip)
    dd.assert_eq(gdf_roundtrip, dask_roundtrip)
Ejemplo n.º 17
0
Archivo: orc.py Proyecto: mnicely/cudf
def _read_orc_stripe(fs, path, stripe, columns, kwargs=None):
    """Pull out specific columns from specific stripe

    Parameters
    ----------
    fs
        Object providing ``open(path, "rb")`` usable as a context manager.
    path
        Location of the ORC file, passed to ``fs.open``.
    stripe
        The single stripe to read; forwarded as ``stripes=[stripe]``.
    columns
        Column selection forwarded to ``cudf.read_orc``.
    kwargs : dict, optional
        Extra keyword arguments for ``cudf.read_orc``. ``None`` (the default)
        means no extra arguments.

    Returns
    -------
    The object returned by ``cudf.read_orc`` for that stripe.
    """
    # ``None`` replaces the former mutable ``{}`` default, which would be
    # shared between calls.
    extra_options = {} if kwargs is None else kwargs

    with fs.open(path, "rb") as f:
        df_stripe = cudf.read_orc(f,
                                  stripes=[stripe],
                                  columns=columns,
                                  **extra_options)
    return df_stripe
Ejemplo n.º 18
0
def test_orc_reader_filepath_or_buffer(path_or_buf, src):
    """cuDF must read the selected columns the same way for every ``src``."""
    selected = ["int1", "long1", "float1", "double1"]

    # Reference result: PyArrow reading the "filepath" form of the source.
    reference_file = pa.orc.ORCFile(path_or_buf("filepath"))
    expected = reference_file.read(columns=selected).to_pandas()

    actual = cudf.read_orc(path_or_buf(src), columns=selected)

    assert_eq(expected, actual)
Ejemplo n.º 19
0
def test_orc_reader_decimal_as_int(datadir):
    """Read decimals as scaled integers and check the first value."""
    orc_path = datadir / "TestOrcFile.decimal.orc"

    gdf = cudf.read_orc(
        orc_path,
        engine="cudf",
        decimals_as_float=False,
        force_decimal_scale=2,
    )
    as_pandas = gdf.to_pandas()

    first_value = as_pandas["_col0"][0]
    assert first_value == -100050  # -1000.5
Ejemplo n.º 20
0
def test_orc_read_filtered(datadir, engine, predicate, expected_len):
    """A read with ``filters=predicate`` must yield ``expected_len`` rows.

    The test is skipped when the read raises ``pa.ArrowIOError``.
    """
    orc_path = datadir / "TestOrcFile.testStripeLevelStats.orc"
    try:
        filtered = cudf.read_orc(orc_path, engine=engine, filters=predicate)
    except pa.ArrowIOError as err:
        pytest.skip(".orc file is not found: %s" % err)

    # Only the row count after filtering is checked.
    row_count = len(filtered)
    assert row_count == expected_len
Ejemplo n.º 21
0
def test_writer_lists_structs(list_struct_buff):
    """Re-write the buffer with cuDF and check PyArrow reads an equal table."""
    source_gdf = cudf.read_orc(list_struct_buff)

    rewritten = BytesIO()
    source_gdf.to_orc(rewritten)

    # PyArrow's reading of the rewritten bytes is compared with the Arrow
    # form of the frame that was written.
    arrow_table = pyarrow.orc.ORCFile(rewritten).read()
    expected_table = source_gdf.to_arrow()

    assert arrow_table.equals(expected_table)
Ejemplo n.º 22
0
def test_orc_reader_decimal_type(datadir, orc_file):
    """``orc_file`` must read identically via pandas and cuDF.

    Column ``col8`` is compared as strings on both sides.
    """
    orc_path = datadir / orc_file

    expected = pd.read_orc(orc_path)
    actual = cudf.read_orc(orc_path).to_pandas()

    # Converting to strings since pandas keeps it in decimal
    expected["col8"] = expected["col8"].astype("str")
    actual["col8"] = actual["col8"].astype("str")

    assert_eq(expected, actual)
Ejemplo n.º 23
0
def test_orc_writer_rle_stream_size(datadir, tmpdir):
    """Re-encode a sample file with cuDF and read the result with PyArrow."""
    source_path = datadir / "TestOrcFile.int16.rle.size.orc"
    target_path = tmpdir.join("int16_map.orc")

    gdf = cudf.read_orc(source_path)
    gdf.to_orc(target_path)

    # Segfaults when RLE stream sizes don't account for varint length
    arrow_table = pa.orc.ORCFile(target_path).read()

    assert_eq(gdf.to_pandas(), arrow_table)
Ejemplo n.º 24
0
def test_orc_reader_gmt_timestamps(datadir):
    """The GMT sample file must read identically via PyArrow and cuDF.

    The test is skipped when opening the file raises ``pa.ArrowIOError``.
    """
    orc_path = datadir / "TestOrcFile.gmt.orc"
    try:
        reference_file = pa.orc.ORCFile(orc_path)
    except pa.ArrowIOError as err:
        pytest.skip(".orc file is not found: %s" % err)

    expected = reference_file.read().to_pandas()

    # The cuDF result is converted to pandas before the comparison.
    actual = cudf.read_orc(orc_path, engine="cudf").to_pandas()

    assert_eq(expected, actual)
Ejemplo n.º 25
0
def test_orc_decimal_precision_fail(datadir):
    """A full read must raise ``RuntimeError``; reading only ``int`` works.

    The test is skipped when opening the file raises ``pa.ArrowIOError``.
    """
    orc_path = datadir / "TestOrcFile.int_decimal.precision_19.orc"

    try:
        reference_file = pa.orc.ORCFile(orc_path)
    except pa.ArrowIOError as err:
        pytest.skip(".orc file is not found: %s" % err)

    # Max precision supported is 18 (Decimal64Dtype limit)
    # and the data has the precision 19. This test should be removed
    # once Decimal128Dtype is introduced.
    with pytest.raises(RuntimeError):
        cudf.read_orc(orc_path)

    # Shouldn't cause failure if decimal column is not chosen to be read.
    int_only = ["int"]
    expected = reference_file.read(columns=int_only).to_pandas()
    actual = cudf.read_orc(orc_path, columns=["int"])

    assert_eq(expected, actual)
Ejemplo n.º 26
0
def test_orc_reader_strings(datadir):
    """Column ``string1`` must read identically via PyArrow and cuDF.

    The test is skipped when opening the file raises ``pa.ArrowIOError``.
    """
    orc_path = datadir / "TestOrcFile.testStringAndBinaryStatistics.orc"
    try:
        reference_file = pa.orc.ORCFile(orc_path)
    except pa.ArrowIOError as err:
        pytest.skip(".orc file is not found: %s" % err)

    # The PyArrow result is compared as read, without ``to_pandas()``.
    expected = reference_file.read(columns=["string1"])
    actual = cudf.read_orc(orc_path, engine="cudf", columns=["string1"])

    assert_eq(expected, actual, check_categorical=False)
Ejemplo n.º 27
0
def test_int_overflow(tmpdir):
    """Round-trip a mostly-null int32 column with one large and one small value."""
    orc_path = tmpdir.join("gdf_overflow.orc")

    # The number of rows and the large element trigger delta encoding
    row_count = 513
    large_value = 1024 * 1024 * 1024

    gdf = cudf.DataFrame({"a": [None] * row_count}, dtype="int32")
    gdf["a"][0] = large_value
    gdf["a"][row_count - 1] = 1
    gdf.to_orc(orc_path)

    roundtrip = cudf.read_orc(orc_path)
    assert_eq(roundtrip, gdf)
Ejemplo n.º 28
0
def test_orc_reader_uncompressed_block(datadir):
    """``uncompressed_snappy.orc`` must read identically via PyArrow and cuDF.

    The test is skipped when opening the file raises ``pa.ArrowIOError``.
    """
    orc_path = datadir / "uncompressed_snappy.orc"
    try:
        reference_file = pa.orc.ORCFile(orc_path)
    except pa.ArrowIOError as err:
        pytest.skip(".orc file is not found: %s" % err)

    arrow_table = reference_file.read()
    expected = arrow_table.to_pandas()

    actual = cudf.read_orc(orc_path, engine="cudf")

    assert_eq(expected, actual, check_categorical=False)
Ejemplo n.º 29
0
def test_orc_reader_datetimestamp(datadir, inputfile, use_index):
    """``inputfile`` must read identically via PyArrow and cuDF.

    The test is skipped when opening the file raises ``pa.ArrowIOError``.
    """
    orc_path = datadir / inputfile
    try:
        reference_file = pa.orc.ORCFile(orc_path)
    except pa.ArrowIOError as err:
        pytest.skip(".orc file is not found: %s" % err)

    # ``date_as_object=False`` is passed to the pandas conversion.
    arrow_table = reference_file.read()
    expected = arrow_table.to_pandas(date_as_object=False)

    actual = cudf.read_orc(orc_path, engine="cudf", use_index=use_index)

    assert_eq(expected, actual, check_categorical=False)
Ejemplo n.º 30
0
def test_orc_writer_sliced(tmpdir):
    """Write a row slice of a string frame and read it back unchanged."""
    orc_path = tmpdir.join("cudf.orc")

    words = np.array(["Alpha", "Beta", "Gamma", "Delta"])
    pandas_frame = pd.DataFrame()
    pandas_frame["String"] = words
    gdf = cudf.from_pandas(pandas_frame)

    # Keep only the two middle rows.
    middle_rows = gdf.iloc[1:3]
    middle_rows.to_orc(orc_path)

    roundtrip = cudf.read_orc(orc_path)
    # The comparison uses the slice with its index reset.
    assert_eq(roundtrip, middle_rows.reset_index(drop=True))