Ejemplo n.º 1
0
def parquet_writer_test_rowgroup_index_compression(pdf, compression,
                                                   row_group_size):
    """Cross-check the pandas and cudf Parquet writers with writer options.

    ``pdf`` is written once through pandas and once through cudf (after
    ``cudf.from_pandas``), both with the same ``compression`` and
    ``row_group_size``. Each file is then read back by both libraries and
    the results are handed to ``compare_dataframe``.

    Parameters
    ----------
    pdf
        Frame to round-trip; it must be accepted by ``cudf.from_pandas``
        and provide ``to_parquet``.
    compression
        Forwarded unchanged to both ``to_parquet`` calls.
    row_group_size
        Forwarded unchanged to both ``to_parquet`` calls.
    """
    # Function-scope imports so the block does not rely on file-level ones.
    import os
    import tempfile

    gdf = cudf.from_pandas(pdf)

    # The files used to be written to fixed names in the current working
    # directory and were never removed, so runs leaked files and concurrent
    # runs could overwrite each other's data. A private scratch directory
    # avoids both and is deleted when the block exits.
    with tempfile.TemporaryDirectory() as scratch_dir:
        pd_file_name = os.path.join(scratch_dir, "cpu_pdf.parquet")
        gd_file_name = os.path.join(scratch_dir, "gpu_pdf.parquet")

        pdf.to_parquet(
            pd_file_name,
            compression=compression,
            row_group_size=row_group_size,
        )
        gdf.to_parquet(
            gd_file_name,
            compression=compression,
            row_group_size=row_group_size,
        )

        # File written by cudf, read by cudf vs. file written by pandas,
        # read by pandas.
        actual = cudf.read_parquet(gd_file_name)
        expected = pd.read_parquet(pd_file_name)
        compare_dataframe(actual, expected)

        # Swap the readers: cudf reads the pandas file, pandas reads the
        # cudf file. This comparison is run with nullable=False.
        actual = cudf.read_parquet(pd_file_name)
        expected = pd.read_parquet(gd_file_name)
        compare_dataframe(actual, expected, nullable=False)
Ejemplo n.º 2
0
def orc_reader_test(input_tuple, skiprows, columns, num_rows, use_index):
    """Compare ``cudf.read_orc`` output with an expectation built in pandas.

    Parameters
    ----------
    input_tuple
        Pair ``(pdf, file_buffer)``: the reference frame and the ORC bytes
        that are wrapped in ``io.BytesIO`` for the reader.
    skiprows
        Currently ignored; it is overridden with ``0`` (see the TODO below).
    columns
        Column selection passed to the reader. The expected frame is only
        subset when this is not ``None`` and not empty.
    num_rows
        When not ``None``, the expected frame is cut to its first
        ``num_rows`` rows; also passed to the reader.
    use_index
        Passed to the reader; when it is ``False`` the expected frame's
        index is reset.
    """
    # TODO: Remove skiprows=0 after
    # following issue is fixed:
    # https://github.com/rapidsai/cudf/issues/6563
    skiprows = 0

    pdf, file_buffer = input_tuple

    expected_pdf = pdf.iloc[skiprows:]
    if num_rows is not None:
        expected_pdf = expected_pdf.head(num_rows)
    if skiprows is not None or num_rows is not None:
        expected_pdf = expected_pdf.reset_index(drop=True)
    if columns is not None and len(columns) > 0:
        # ORC reader picks columns if only
        # there are any elements in `columns`; an empty selection must
        # therefore leave the expected frame untouched instead of
        # reducing it to zero columns.
        expected_pdf = expected_pdf[columns]
    if use_index is False:
        expected_pdf = expected_pdf.reset_index(drop=True)

    gdf = cudf.read_orc(
        io.BytesIO(file_buffer),
        columns=columns,
        skiprows=skiprows,
        num_rows=num_rows,
        use_index=use_index,
    )
    compare_dataframe(expected_pdf, gdf)
Ejemplo n.º 3
0
def avro_reader_test(input_tuple, columns, skiprows, num_rows):
    """Compare ``cudf.read_avro`` output with an expectation built in pandas.

    ``input_tuple`` unpacks to the reference frame and the buffer handed to
    the reader. The expected frame is the reference frame sliced from
    ``skiprows``, cut to ``num_rows`` rows when that is given, and
    re-indexed whenever either of the two options is not ``None``.
    """
    reference_pdf, avro_buffer = input_tuple

    expected_pdf = reference_pdf[skiprows:]
    if num_rows is not None:
        expected_pdf = expected_pdf.head(num_rows)
    if not (skiprows is None and num_rows is None):
        expected_pdf = expected_pdf.reset_index(drop=True)

    # NOTE(review): `columns` is forwarded to the reader but is not applied
    # to the expected frame here -- confirm this is intended.
    reader_options = {
        "columns": columns,
        "skiprows": skiprows,
        "num_rows": num_rows,
    }
    gdf = cudf.read_avro(avro_buffer, **reader_options)
    compare_dataframe(expected_pdf, gdf)
Ejemplo n.º 4
0
def parquet_reader_columns(parquet_buffer, columns, use_pandas_metadata):
    """Read ``parquet_buffer`` with pandas and cudf and compare the frames.

    Both readers receive the same ``columns`` and ``use_pandas_metadata``
    arguments; the cudf result is compared against the pandas result.
    """
    reader_options = {
        "columns": columns,
        "use_pandas_metadata": use_pandas_metadata,
    }

    # pandas is read first, then cudf, from the same buffer.
    expected = pd.read_parquet(parquet_buffer, **reader_options)
    actual = cudf.read_parquet(parquet_buffer, **reader_options)

    compare_dataframe(actual, expected)
Ejemplo n.º 5
0
def orc_writer_test(pdf, compression, enable_statistics):
    """Write ``pdf`` to ORC with cudf, read it back, and compare.

    The frame is converted with ``cudf.from_pandas``, written to an
    in-memory buffer with the given ``compression`` and
    ``enable_statistics`` options, and read back with ``cudf.read_orc``.
    The result is compared against the original ``pdf``.
    """
    orc_buffer = io.BytesIO()

    writer_options = {
        "compression": compression,
        "enable_statistics": enable_statistics,
    }
    cudf.from_pandas(pdf).to_orc(orc_buffer, **writer_options)

    # Rewind so the reader starts at the beginning of what was written.
    orc_buffer.seek(0)

    round_tripped = cudf.read_orc(orc_buffer)
    compare_dataframe(pdf, round_tripped)
Ejemplo n.º 6
0
def orc_reader_stripes_test(input_tuple, columns, stripes):
    """Compare stripe-selective ``cudf.read_orc`` with ``orc_to_pandas``.

    Only the second element of ``input_tuple`` (the ORC bytes) is used.
    Both sides read from their own ``io.BytesIO`` wrapper around those
    bytes and receive the same ``stripes`` argument.
    """
    orc_bytes = input_tuple[1]

    expected_pdf = orc_to_pandas(
        file_io_obj=io.BytesIO(orc_bytes),
        stripes=stripes,
    )

    # ORC reader picks columns if only
    # there are any elements in `columns`
    has_selection = columns is not None and len(columns) > 0
    if has_selection:
        expected_pdf = expected_pdf[columns]

    gdf = cudf.read_orc(
        io.BytesIO(orc_bytes),
        columns=columns,
        stripes=stripes,
    )

    compare_dataframe(expected_pdf, gdf)
Ejemplo n.º 7
0
def parquet_writer_test(pdf):
    """Cross-check the pandas and cudf Parquet writers on ``pdf``.

    ``pdf`` is written once through pandas and once through cudf (after
    ``cudf.from_pandas``) using default writer options. Each file is then
    read back by both libraries and the results are handed to
    ``compare_dataframe``.

    Parameters
    ----------
    pdf
        Frame to round-trip; it must be accepted by ``cudf.from_pandas``
        and provide ``to_parquet``.
    """
    # Function-scope imports so the block does not rely on file-level ones.
    import os
    import tempfile

    gdf = cudf.from_pandas(pdf)

    # The files used to be written to fixed names in the current working
    # directory and were never removed, so runs leaked files and concurrent
    # runs could overwrite each other's data. A private scratch directory
    # avoids both and is deleted when the block exits.
    with tempfile.TemporaryDirectory() as scratch_dir:
        pd_file_name = os.path.join(scratch_dir, "cpu_pdf.parquet")
        gd_file_name = os.path.join(scratch_dir, "gpu_pdf.parquet")

        pdf.to_parquet(pd_file_name)
        gdf.to_parquet(gd_file_name)

        # File written by cudf, read by cudf vs. file written by pandas,
        # read by pandas.
        actual = cudf.read_parquet(gd_file_name)
        expected = pd.read_parquet(pd_file_name)
        compare_dataframe(actual, expected)

        # Swap the readers: cudf reads the pandas file, pandas reads the
        # cudf file.
        actual = cudf.read_parquet(pd_file_name)
        expected = pd.read_parquet(gd_file_name)
        compare_dataframe(actual, expected)
Ejemplo n.º 8
0
def parquet_reader_columns(parquet_buffer, columns, use_pandas_metadata,
                           skiprows, num_rows):
    """Compare cudf's row-limited Parquet read with a pandas expectation.

    pandas reads ``parquet_buffer`` with ``columns`` and
    ``use_pandas_metadata``; the result is then sliced from ``skiprows``
    and, when ``num_rows`` is not ``None``, cut to its first ``num_rows``
    rows. cudf reads the same buffer with ``skiprows`` and ``num_rows``
    passed to the reader itself.
    """
    shared_options = {
        "columns": columns,
        "use_pandas_metadata": use_pandas_metadata,
    }

    expected = pd.read_parquet(parquet_buffer, **shared_options)
    expected = expected.iloc[skiprows:]
    if num_rows is not None:
        expected = expected.head(num_rows)

    actual = cudf.read_parquet(
        parquet_buffer,
        skiprows=skiprows,
        num_rows=num_rows,
        **shared_options,
    )

    compare_dataframe(actual, expected)
Ejemplo n.º 9
0
def orc_reader_test(input_tuple, columns, skiprows, num_rows, use_index):
    """Compare ``cudf.read_orc`` output with an expectation built in pandas.

    ``input_tuple`` unpacks to the reference frame and the ORC bytes. The
    expected frame is the reference frame sliced from ``skiprows``, cut to
    ``num_rows`` rows when given, re-indexed when either option is set,
    subset to ``columns`` when that selection is non-empty, and re-indexed
    again when ``use_index`` is ``False``.
    """
    reference_pdf, orc_bytes = input_tuple

    expected_pdf = reference_pdf.iloc[skiprows:]
    if num_rows is not None:
        expected_pdf = expected_pdf.head(num_rows)
    if not (skiprows is None and num_rows is None):
        expected_pdf = expected_pdf.reset_index(drop=True)

    # ORC reader picks columns if only
    # there are any elements in `columns`
    has_selection = columns is not None and len(columns) > 0
    if has_selection:
        expected_pdf = expected_pdf[columns]

    if use_index is False:
        expected_pdf = expected_pdf.reset_index(drop=True)

    reader_options = {
        "columns": columns,
        "skiprows": skiprows,
        "num_rows": num_rows,
        "use_index": use_index,
    }
    gdf = cudf.read_orc(io.BytesIO(orc_bytes), **reader_options)

    compare_dataframe(expected_pdf, gdf)
Ejemplo n.º 10
0
def parquet_reader_test(parquet_buffer):
    """Read ``parquet_buffer`` with pandas and cudf and compare the frames.

    Both readers are called with default options; the cudf result is
    compared against the pandas result.
    """
    # pandas is read first, then cudf, from the same buffer.
    expected = pd.read_parquet(parquet_buffer)
    actual = cudf.read_parquet(parquet_buffer)

    compare_dataframe(actual, expected)