Example #1
def test_dataframe_apply_rows(dtype, has_nulls):
    count = 1000
    # Two random input columns with the requested dtype and null pattern
    gdf_series_a = gen_rand_series(dtype, count, has_nulls=has_nulls)
    gdf_series_b = gen_rand_series(dtype, count, has_nulls=has_nulls)
    # Expected output: element-wise product, with input nulls propagated
    gdf_series_expected = gdf_series_a * gdf_series_b

    df_expected = cudf.DataFrame({
        "a": gdf_series_a,
        "b": gdf_series_b,
        "out": gdf_series_expected
    })

    df_original = cudf.DataFrame({"a": gdf_series_a, "b": gdf_series_b})

    # Run the row kernel over columns "a" and "b", writing into a new
    # "out" column of the same dtype
    df_actual = df_original.apply_rows(_kernel_multiply, ["a", "b"],
                                       {"out": dtype}, {})

    assert_eq(df_expected, df_actual)
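The _kernel_multiply row kernel is defined elsewhere in the test module and not shown here. A minimal sketch that follows the apply_rows row-kernel convention (the body is an assumption, not the original helper):

def _kernel_multiply(a, b, out):
    # apply_rows hands the selected input columns and the requested output
    # column to the kernel as array views; write one output value per row.
    for i, (x, y) in enumerate(zip(a, b)):
        out[i] = x * y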
Example #2
def test_operator_func_between_series(dtype, func, has_nulls, fill_value):
    count = 1000
    # Random cuDF inputs plus matching pandas copies for comparison
    gdf_series_a = utils.gen_rand_series(dtype,
                                         count,
                                         has_nulls=has_nulls,
                                         stride=10000)
    gdf_series_b = utils.gen_rand_series(dtype,
                                         count,
                                         has_nulls=has_nulls,
                                         stride=100)
    pdf_series_a = gdf_series_a.to_pandas()
    pdf_series_b = gdf_series_b.to_pandas()

    # Call the same binary-operator method (e.g. "add") on both libraries
    # and check that cuDF matches pandas
    gdf_result = getattr(gdf_series_a, func)(gdf_series_b,
                                             fill_value=fill_value)
    pdf_result = getattr(pdf_series_a, func)(pdf_series_b,
                                             fill_value=fill_value)

    utils.assert_eq(pdf_result, gdf_result)
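The fill_value argument substitutes for nulls that appear on only one side before the operation is applied, matching the pandas semantics the test compares against. A quick standalone illustration (not part of the original suite):

import pandas as pd

a = pd.Series([1.0, None, 3.0])
b = pd.Series([10.0, 20.0, None])
# One-sided nulls are replaced by fill_value before adding; rows that are
# null on both sides stay null.
print(a.add(b, fill_value=0))  # 11.0, 20.0, 3.0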
Example #3
def test_operator_func_series_and_scalar_logical(dtype, func, has_nulls,
                                                 scalar, fill_value):
    gdf_series = utils.gen_rand_series(dtype,
                                       1000,
                                       has_nulls=has_nulls,
                                       stride=10000)
    pdf_series = gdf_series.to_pandas()

    gdf_series_result = getattr(gdf_series, func)(scalar,
                                                  fill_value=fill_value)
    pdf_series_result = getattr(pdf_series, func)(scalar,
                                                  fill_value=fill_value)

    utils.assert_eq(pdf_series_result, gdf_series_result)
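Here getattr(gdf_series, func)(scalar) is simply the method spelling of the logical comparison operators (eq, ne, lt, and so on). A small standalone example, using a numeric dtype for clarity:

import cudf

s = cudf.Series([1.0, None, 3.0])
# s.eq(3) is the method form of s == 3; fill_value replaces nulls before
# the comparison, so the middle row compares 0 == 3 instead of staying null.
print(s.eq(3, fill_value=0).to_pandas())  # False, False, True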
Example #4
def test_dataframe_apply_rows(dtype, has_nulls, pessimistic):
    count = 1000
    gdf_series_a = gen_rand_series(dtype, count, has_nulls=has_nulls)
    gdf_series_b = gen_rand_series(dtype, count, has_nulls=has_nulls)
    gdf_series_c = gen_rand_series(dtype, count, has_nulls=has_nulls)

    if pessimistic:
        # pessimistically combine the null masks
        gdf_series_expected = gdf_series_a * gdf_series_b
    else:
        # optimistically ignore the null masks
        a = cudf.Series(column.build_column(gdf_series_a.data, dtype))
        b = cudf.Series(column.build_column(gdf_series_b.data, dtype))
        gdf_series_expected = a * b

    df_expected = cudf.DataFrame(
        {
            "a": gdf_series_a,
            "b": gdf_series_b,
            "c": gdf_series_c,
            "out": gdf_series_expected,
        }
    )

    df_original = cudf.DataFrame(
        {"a": gdf_series_a, "b": gdf_series_b, "c": gdf_series_c}
    )

    df_actual = df_original.apply_rows(
        _kernel_multiply,
        ["a", "b"],
        {"out": dtype},
        {},
        pessimistic_nulls=pessimistic,
    )

    assert_eq(df_expected, df_actual)
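pessimistic_nulls controls how input null masks reach the output: with True (the default), a row is null in the result whenever it is null in any input column; with False, the kernel runs over the raw values stored under the null bits. A hypothetical standalone sketch of the contrast (the data and names are illustrative, not from the test):

import numpy as np
import cudf

df = cudf.DataFrame({"a": [1.0, None, 3.0], "b": [2.0, 2.0, None]})

def mul(a, b, out):
    for i, (x, y) in enumerate(zip(a, b)):
        out[i] = x * y

pess = df.apply_rows(mul, ["a", "b"], {"out": np.float64}, {},
                     pessimistic_nulls=True)
opt = df.apply_rows(mul, ["a", "b"], {"out": np.float64}, {},
                    pessimistic_nulls=False)
# pess["out"] is null wherever "a" or "b" is null; opt["out"] multiplies
# whatever values happen to sit under the null bits.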
Example #5
def test_orc_write_statistics(tmpdir, datadir, nrows):
    supported_stat_types = supported_numpy_dtypes + ["str"]
    # Can't write random bool columns until issue #6763 is fixed
    if nrows == 6000000:
        supported_stat_types.remove("bool")

    # Make a dataframe
    gdf = cudf.DataFrame({
        "col_" + str(dtype): gen_rand_series(dtype, nrows, has_nulls=True)
        for dtype in supported_stat_types
    })
    fname = tmpdir.join("gdf.orc")

    # Write said dataframe to ORC with cuDF
    gdf.to_orc(fname.strpath)

    # Read back written ORC's statistics
    orc_file = pa.orc.ORCFile(fname)
    (
        file_stats,
        stripes_stats,
    ) = cudf.io.orc.read_orc_statistics(fname)

    # check file stats
    for col in gdf:
        if "minimum" in file_stats[col]:
            stats_min = file_stats[col]["minimum"]
            actual_min = gdf[col].min()
            assert normalized_equals(actual_min, stats_min)
        if "maximum" in file_stats[col]:
            stats_max = file_stats[col]["maximum"]
            actual_max = gdf[col].max()
            assert normalized_equals(actual_max, stats_max)

    # compare stripe statistics with actual min/max
    for stripe_idx in range(0, orc_file.nstripes):
        stripe = orc_file.read_stripe(stripe_idx)
        # pandas is unable to handle min/max of string col with nulls
        stripe_df = cudf.DataFrame(stripe.to_pandas())
        for col in stripe_df:
            if "minimum" in stripes_stats[stripe_idx][col]:
                actual_min = stripe_df[col].min()
                stats_min = stripes_stats[stripe_idx][col]["minimum"]
                assert normalized_equals(actual_min, stats_min)

            if "maximum" in stripes_stats[stripe_idx][col]:
                actual_max = stripe_df[col].max()
                stats_max = stripes_stats[stripe_idx][col]["maximum"]
                assert normalized_equals(actual_max, stats_max)
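normalized_equals is another helper from the test module that is not shown here. A plausible sketch of the idea, normalizing both sides to comparable values before checking equality (an assumption, not the original implementation):

import numpy as np

def normalized_equals(value1, value2):
    # Treat NaN == NaN as equal and compare floats with a tolerance;
    # everything else falls back to plain equality.
    if isinstance(value1, float) and isinstance(value2, float):
        return bool(np.isclose(value1, value2, equal_nan=True))
    return value1 == value2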
Example #6
def test_orc_bool_encode_fail():
    np.random.seed(0)
    buffer = BytesIO()

    # Generate a boolean column longer than a single stripe
    fail_df = cudf.DataFrame({"col": gen_rand_series("bool", 600000)})
    # Invalidate the first row in the second stripe to break encoding
    fail_df["col"][500000] = None

    # Should throw instead of generating a file that is incompatible
    # with other readers (see issue #6763)
    with pytest.raises(RuntimeError):
        fail_df.to_orc(buffer)

    # Generate a boolean column that fits into a single stripe
    okay_df = cudf.DataFrame({"col": gen_rand_series("bool", 500000)})
    okay_df["col"][500000 - 1] = None
    # Invalid row is in the last row group of the stripe;
    # encoding is assumed to be correct
    okay_df.to_orc(buffer)

    # Also validate data
    pdf = pa.orc.ORCFile(buffer).read().to_pandas()
    assert_eq(okay_df, pdf)
Example #7
def test_operator_func_series_and_scalar(dtype, func, has_nulls, fill_value,
                                         use_cudf_scalar):
    count = 1000
    scalar = 59
    gdf_series = utils.gen_rand_series(dtype,
                                       count,
                                       has_nulls=has_nulls,
                                       stride=10000)
    pdf_series = gdf_series.to_pandas()

    gdf_series_result = getattr(gdf_series, func)(
        cudf.Scalar(scalar) if use_cudf_scalar else scalar,
        fill_value=fill_value,
    )
    pdf_series_result = getattr(pdf_series, func)(scalar,
                                                  fill_value=fill_value)

    utils.assert_eq(pdf_series_result, gdf_series_result)
Example #8
def test_orc_write_bool_statistics(tmpdir, datadir, nrows):
    # Make a dataframe
    gdf = cudf.DataFrame({"col_bool": gen_rand_series("bool", nrows)})
    fname = tmpdir.join("gdf.orc")

    # Write said dataframe to ORC with cuDF
    gdf.to_orc(fname.strpath)

    # Read back written ORC's statistics
    orc_file = pa.orc.ORCFile(fname)
    (
        file_stats,
        stripes_stats,
    ) = cudf.io.orc.read_orc_statistics(fname)

    # check file stats
    col = "col_bool"
    if "true_count" in file_stats[col]:
        stats_true_count = file_stats[col]["true_count"]
        actual_true_count = gdf[col].sum()
        assert normalized_equals(actual_true_count, stats_true_count)

    if "number_of_values" in file_stats[col]:
        stats_valid_count = file_stats[col]["number_of_values"]
        actual_valid_count = gdf[col].valid_count
        assert normalized_equals(actual_valid_count, stats_valid_count)

    # compare stripe statistics with actual min/max
    for stripe_idx in range(0, orc_file.nstripes):
        stripe = orc_file.read_stripe(stripe_idx)
        # pandas is unable to handle min/max of string col with nulls
        stripe_df = cudf.DataFrame(stripe.to_pandas())

        if "true_count" in stripes_stats[stripe_idx][col]:
            actual_true_count = stripe_df[col].sum()
            stats_true_count = stripes_stats[stripe_idx][col]["true_count"]
            assert normalized_equals(actual_true_count, stats_true_count)

        if "number_of_values" in stripes_stats[stripe_idx][col]:
            actual_valid_count = stripe_df[col].valid_count
            stats_valid_count = stripes_stats[stripe_idx][col][
                "number_of_values"]
            assert normalized_equals(actual_valid_count, stats_valid_count)
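Series.valid_count counts the non-null rows, which is what ORC reports as number_of_values, while sum() on a boolean column corresponds to true_count. A quick standalone check, assuming a cuDF version where valid_count is available (it is what the test above relies on):

import cudf

s = cudf.Series([True, None, False, True])
print(len(s), s.valid_count, s.sum())  # 4 rows, 3 valid, 2 True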