Beispiel #1
0
def test_read_orc(s3_base, s3so, datadir, use_python_file_object, columns):
    """Read an ORC file through an ``s3://`` URL and compare with pyarrow.

    The bytes of the local sample file are handed to ``s3_context`` as the
    contents of bucket ``orc``.  ``cudf.read_orc`` then reads the object by
    URL, and the result is compared with pyarrow's reading of the same local
    file, restricted to ``columns`` when a non-empty selection is given.
    """
    bucket = "orc"
    object_name = "test_orc_reader.orc"
    source_file = str(datadir / "orc" / "TestOrcFile.testSnappy.orc")

    # Reference result comes straight from the local file via pyarrow.
    expect = pa.orc.ORCFile(source_file).read().to_pandas()

    with open(source_file, "rb") as src:
        payload = src.read()

    with s3_context(s3_base=s3_base, bucket=bucket,
                    files={object_name: payload}):
        got = cudf.read_orc(
            f"s3://{bucket}/{object_name}",
            columns=columns,
            storage_options=s3so,
            use_python_file_object=use_python_file_object,
        )

    # A falsy ``columns`` (None or empty) means "all columns".
    if columns:
        expect = expect[columns]
    assert_eq(expect, got)
Beispiel #2
0
def test_chunked_orc_writer_lists():
    """Write the same list-column table twice through ``ORCWriter``.

    The buffer is read back with pyarrow and must equal the input frame
    concatenated with itself (index reset).
    """
    num_rows = 12345
    string_lists = [[str(i), str(2 * i)] for i in range(num_rows)]
    decimal_lists = [[dec(i / 2)] * 5 for i in range(num_rows)]
    pdf_in = pd.DataFrame({"ls": string_lists, "ld": decimal_lists})

    gdf = cudf.from_pandas(pdf_in)
    expect = pd.concat([pdf_in] * 2).reset_index(drop=True)

    sink = BytesIO()
    writer = ORCWriter(sink)
    # Two chunks of identical content.
    for _ in range(2):
        writer.write_table(gdf)
    writer.close()

    got = pa.orc.ORCFile(sink).read().to_pandas()
    assert_eq(expect, got)
Beispiel #3
0
def test_read_csv_byte_range(s3_base, s3so, pdf, bytes_per_thread,
                             use_python_file_object):
    """Read a byte range of a CSV object addressed by an ``s3://`` URL.

    ``pdf`` is serialised with ``to_csv(index=False)`` and read back with
    ``byte_range=(74, 73)``; the result must equal the last two rows of
    ``pdf`` with a fresh index.
    """
    bucket = "csv"
    object_name = "test_csv_reader_byte_range.csv"
    column_names = ["Integer", "Float", "Integer2", "String", "Boolean"]

    # Write to buffer
    csv_text = pdf.to_csv(index=False)

    # Use fsspec file object
    with s3_context(s3_base=s3_base, bucket=bucket,
                    files={object_name: csv_text}):
        got = cudf.read_csv(
            f"s3://{bucket}/{object_name}",
            storage_options=s3so,
            byte_range=(74, 73),
            bytes_per_thread=bytes_per_thread,
            header=None,
            names=column_names,
            use_python_file_object=use_python_file_object,
        )

    expect = pdf.iloc[-2:].reset_index(drop=True)
    assert_eq(expect, got)
Beispiel #4
0
def test_series_nlargest(data, n):
    """Indirectly tests Series.sort_values().

    Compares ``nlargest`` between the GPU and pandas series with the default
    ``keep`` and with ``keep="last"``, then checks that both raise the same
    error for an invalid ``keep`` value.
    """
    sr = Series(data)
    psr = pd.Series(data)

    for extra in ({}, {"keep": "last"}):
        assert_eq(sr.nlargest(n, **extra), psr.nlargest(n, **extra))

    # Each side gets its own kwargs dict, as in a hand-written call.
    assert_exceptions_equal(
        lfunc=psr.nlargest,
        rfunc=sr.nlargest,
        lfunc_args_and_kwargs=([], dict(n=3, keep="what")),
        rfunc_args_and_kwargs=([], dict(n=3, keep="what")),
        expected_error_message='keep must be either "first", "last"',
    )
Beispiel #5
0
def test_hdf_reader(hdf_files, columns):
    """Compare ``cudf.read_hdf`` with ``pd.read_hdf`` for frames and series.

    ``hdf_files`` unpacks to the DataFrame file, a mapping of per-column
    series files, the HDF format name and the row count.  Two combinations
    are skipped: column selection with the ``fixed`` format, and a zero-row
    table with the ``table`` format.
    """
    hdf_df_file, hdf_series, hdf_format, nrows = hdf_files

    selecting_columns = columns is not None
    if hdf_format == "fixed" and selecting_columns:
        pytest.skip("Can't use columns with format 'fixed'")
    if hdf_format == "table" and nrows == 0:
        pytest.skip("Can't read 0 row table with format 'table'")

    expect_df = pd.read_hdf(hdf_df_file, columns=columns)
    got_df = cudf.read_hdf(hdf_df_file, columns=columns)
    frame_checks = dict(check_categorical=False, check_index_type=False)
    assert_eq(expect_df, got_df, **frame_checks)

    for column in hdf_series.keys():
        expect_series = pd.read_hdf(hdf_series[column])
        got_series = cudf.read_hdf(hdf_series[column])
        assert_eq(expect_series, got_series, check_index_type=False)
Beispiel #6
0
def test_replace_inplace(pframe, replace_args):
    """Call ``replace(**replace_args)`` on matching cudf and pandas frames.

    The frame pair and a pair of copies taken beforehand are compared both
    before and after the ``replace`` calls.
    """
    gpu_frame = cudf.from_pandas(pframe)
    pandas_frame = pframe.copy()

    gpu_copy = gpu_frame.copy()
    cpu_copy = pandas_frame.copy()

    def check_pairs():
        # Frames must agree with each other, and so must the copies.
        assert_eq(gpu_frame, pandas_frame)
        assert_eq(gpu_copy, cpu_copy)

    check_pairs()
    gpu_frame.replace(**replace_args)
    pandas_frame.replace(**replace_args)
    check_pairs()
Beispiel #7
0
def test_series_update(data, other):
    """Compare ``Series.update`` between cudf and pandas.

    Also checks that the array obtained from ``gs._column`` before the
    update equals the one obtained after it.
    """
    gs = data.copy(deep=True)

    # Non-cudf ``other`` values are passed unchanged to both libraries.
    g_other, p_other = other, other
    if isinstance(other, cudf.Series):
        g_other = other.copy(deep=True)
        p_other = g_other.to_pandas()

    ps = gs.to_pandas()

    column_before = gs._column
    gs.update(g_other)
    column_after = gs._column
    assert_eq(column_before.to_array(), column_after.to_array())

    ps.update(p_other)
    assert_eq(gs, ps)
Beispiel #8
0
def test_empty():
    """Build a quadtree from empty x/y columns and check the empty result."""
    # empty should not throw
    _, quadtree = cuspatial.quadtree_on_points(
        cudf.Series([]),  # x
        cudf.Series([]),  # y
        *bbox_1,  # bbox
        1,  # scale
        1,  # max_depth
        1,  # min_size
    )

    # Expected columns, in order, with the dtype of each empty column.
    column_dtypes = {
        "key": np.uint32,
        "level": np.uint8,
        "is_quad": np.bool_,
        "length": np.uint32,
        "offset": np.uint32,
    }
    expected = cudf.DataFrame({
        name: cudf.Series([], dtype=dtype)
        for name, dtype in column_dtypes.items()
    })
    assert_eq(quadtree, expected)
Beispiel #9
0
def test_rollling_series_numba_udf_basic(data, index, center):
    """Compare rolling ``apply`` of a UDF between pandas and cudf series.

    Every window size from 1 to ``len(data)`` is combined with every
    ``min_periods`` from 1 to that window size.  Missing results are filled
    with -1 before comparing, and dtypes are not checked.
    """
    psr = cudf.utils.utils._create_pandas_series(data=data, index=index)
    gsr = cudf.from_pandas(psr)

    # NOTE(review): the test name suggests cudf compiles this UDF with numba,
    # so the body is kept as a plain loop - confirm before restructuring it.
    def some_func(A):
        b = 0
        for a in A:
            b = max(b, math.sqrt(a))
        return b

    window_settings = [
        (window_size, min_periods)
        for window_size in range(1, len(data) + 1)
        for min_periods in range(1, window_size + 1)
    ]
    for window_size, min_periods in window_settings:
        expect = psr.rolling(window_size, min_periods, center)
        expect = expect.apply(some_func).fillna(-1)
        got = gsr.rolling(window_size, min_periods, center)
        got = got.apply(some_func).fillna(-1)
        assert_eq(expect, got, check_dtype=False)
Beispiel #10
0
def test_series_set_equal_length_object_by_mask(replace_data):
    """Assign ``replace_data`` through boolean masks in pandas and cudf.

    First with an all-True mask, then with the mask ``series > 1``.  After
    each assignment both series are cast to float and compared.
    """
    psr = pd.Series([1, 2, 3, 4, 5], dtype="Int64")
    gsr = cudf.from_pandas(psr)

    # Objects exposing ``to_pandas`` are converted for the pandas side.
    convertible = hasattr(replace_data, "to_pandas")

    # Lengths match in trivial case
    pd_bool_col = pd.Series([True] * len(psr), dtype="boolean")
    gd_bool_col = cudf.from_pandas(pd_bool_col)
    if convertible:
        psr[pd_bool_col] = replace_data.to_pandas(nullable=True)
    else:
        psr[pd_bool_col] = replace_data
    gsr[gd_bool_col] = replace_data

    assert_eq(psr.astype("float"), gsr.astype("float"))

    # Test partial masking
    if convertible:
        pd_replacement = replace_data.to_pandas()
    else:
        pd_replacement = replace_data
    psr[psr > 1] = pd_replacement
    gsr[gsr > 1] = replace_data

    assert_eq(psr.astype("float"), gsr.astype("float"))
Beispiel #11
0
def test_concat_join_no_overlapping_columns(pdf1, pdf2, ignore_index, sort,
                                            join, axis):
    """Compare ``concat`` of two frames between pandas and the GPU library.

    Both calls receive the same ``sort``, ``join``, ``ignore_index`` and
    ``axis`` keyword arguments.
    """
    gdf1 = gd.from_pandas(pdf1)
    gdf2 = gd.from_pandas(pdf2)

    concat_kwargs = {
        "sort": sort,
        "join": join,
        "ignore_index": ignore_index,
        "axis": axis,
    }
    expected = pd.concat([pdf1, pdf2], **concat_kwargs)
    actual = gd.concat([gdf1, gdf2], **concat_kwargs)
    assert_eq(expected, actual)
Beispiel #12
0
def test_rolling_dataframe_numba_udf_basic(data, center):
    """Compare rolling ``apply`` of a UDF between pandas and cudf frames.

    Every window size from 1 to ``len(data)`` is combined with every
    ``min_periods`` from 1 to that window size.  Missing results are filled
    with -1 before comparing, and dtypes are not checked.
    """
    pdf = pd.DataFrame(data)
    gdf = cudf.from_pandas(pdf)

    # NOTE(review): the test name suggests cudf compiles this UDF with numba,
    # so the body is kept as a plain loop - confirm before restructuring it.
    def some_func(A):
        b = 0
        for a in A:
            b = b + a**2
        return b / len(A)

    window_settings = [
        (window_size, min_periods)
        for window_size in range(1, len(data) + 1)
        for min_periods in range(1, window_size + 1)
    ]
    for window_size, min_periods in window_settings:
        expect = pdf.rolling(window_size, min_periods, center)
        expect = expect.apply(some_func).fillna(-1)
        got = gdf.rolling(window_size, min_periods, center)
        got = got.apply(some_func).fillna(-1)
        assert_eq(expect, got, check_dtype=False)
Beispiel #13
0
def test_read_avro(datadir, hdfs, test_url):
    """Upload a local Avro file via ``hdfs`` and read it with cudf.

    With ``test_url`` truthy the URL includes the module-level ``host`` and
    ``port``; otherwise it contains only ``basedir``.  The expected frame is
    built from the local file's records, with each column cast to the dtype
    cudf produced.
    """
    fname = datadir / "avro" / "example.avro"

    # Read from local file system as buffer
    with open(fname, mode="rb") as local_file:
        payload = BytesIO(local_file.read())
    # Write to hdfs
    hdfs.upload(basedir + "/file.avro", payload)

    hd_fpath = (f"hdfs://{host}:{port}{basedir}/file.avro"
                if test_url else f"hdfs://{basedir}/file.avro")
    got = cudf.read_avro(hd_fpath)

    with open(fname, mode="rb") as local_file:
        expect = pd.DataFrame.from_records(fa.reader(local_file))

    # Align the reference dtypes with what cudf returned.
    for col in expect.columns:
        expect[col] = expect[col].astype(got[col].dtype)
    assert_eq(expect, got)
Beispiel #14
0
def test_series_round(arr, decimals):
    """Compare ``Series.round`` between cudf and pandas.

    Runs once on ``arr`` as given, then on a float64 copy in which half of
    the positions, chosen at random, have been set to NaN.
    """

    def check_round(values):
        pser = pd.Series(values)
        ser = cudf.Series(values)
        result = ser.round(decimals)
        expected = pser.round(decimals)
        assert_eq(result, expected)

    check_round(arr)

    # with nulls, maintaining existing null mask
    arr = arr.astype("float64")  # for pandas nulls
    null_positions = np.random.choice(arr.shape[0],
                                      arr.shape[0] // 2,
                                      replace=False)
    arr.ravel()[null_positions] = np.nan

    check_round(arr)
def test_mixed_lines():
    """Build a GeoSeries from line buffers that include one multi-line.

    The buffers hold 24 coordinates, offsets at multiples of six and
    ``mlines`` of ``[1, 3]``.  The expected series is a LineString, a
    MultiLineString of two LineStrings, and a final LineString.
    """
    raw_buffers = {
        "lines_xy": range(24),
        "lines_offsets": np.array(range(5)) * 6,
        "mlines": [1, 3],
    }
    buffers = GeoArrowBuffers(raw_buffers)

    assert_eq(cudf.Series(range(24)), buffers.lines.xy)
    assert len(buffers.lines) == 3

    expected = gpGeoSeries([
        LineString([[0, 1], [2, 3], [4, 5]]),
        MultiLineString([
            LineString([[6, 7], [8, 9], [10, 11]]),
            LineString([[12, 13], [14, 15], [16, 17]]),
        ]),
        LineString([[18, 19], [20, 21], [22, 23]]),
    ])
    column = GeoColumn(buffers)
    assert_eq(GeoSeries(column), expected)
Beispiel #16
0
def test_concat_decimal_dataframe(ltype, rtype):
    """Concatenate two frames whose ``val`` columns are cast to given dtypes.

    The left ``val`` column is cast to ``ltype`` and the right one to
    ``rtype``.  The GPU concat result is compared with the pandas concat of
    the frames converted via ``to_pandas``.
    """

    def make_frame(values, dtype):
        # ``id`` holds three random integers in [0, 10).
        frame = gd.DataFrame({
            "id": np.random.randint(0, 10, 3),
            "val": values,
        })
        frame["val"] = frame["val"].astype(dtype)
        return frame

    gdf1 = make_frame(["22.3", "59.5", "81.1"], ltype)
    gdf2 = make_frame(["2.35", "5.59", "8.14"], rtype)

    pdf1 = gdf1.to_pandas()
    pdf2 = gdf2.to_pandas()

    expected = pd.concat([pdf1, pdf2])
    got = gd.concat([gdf1, gdf2])

    assert_eq(expected, got)
Beispiel #17
0
def test_dataframe_replace(df, to_replace, value):
    """Compare ``DataFrame.replace`` between pandas and cudf.

    ``to_replace`` and ``value`` go to pandas unchanged; for cudf, any of
    them that is a ``pd.Series`` is first converted with
    ``cudf.from_pandas``.
    """

    def for_gpu(obj):
        return cudf.from_pandas(obj) if isinstance(obj, pd.Series) else obj

    gdf = df
    pdf = gdf.to_pandas()

    gd_value = for_gpu(value)
    gd_to_replace = for_gpu(to_replace)

    expected = pdf.replace(to_replace=to_replace, value=value)
    actual = gdf.replace(to_replace=gd_to_replace, value=gd_value)

    assert_eq(expected, actual)
Beispiel #18
0
def test_categorical_reorder_categories(pd_str_cat, from_ordered, to_ordered,
                                        inplace):
    """Compare ``cat.reorder_categories`` between pandas and cudf.

    Both series start from a copy of ``pd_str_cat`` with its ordered flag
    set to ``from_ordered``.  Categories are reordered to ``c, b, a``.  When
    a call returns ``None`` the original series is compared instead.
    """
    pd_sr = pd.Series(pd_str_cat.copy().set_ordered(from_ordered))
    cd_sr = cudf.Series(pd_str_cat.copy().set_ordered(from_ordered))

    assert_eq(pd_sr, cd_sr)
    assert str(pd_sr) == str(cd_sr)

    kwargs = {"ordered": to_ordered, "inplace": inplace}

    pd_sr_1 = pd_sr.cat.reorder_categories(list("cba"), **kwargs)
    cd_sr_1 = cd_sr.cat.reorder_categories(list("cba"), **kwargs)
    # A ``None`` return means there is no new object to compare.
    if pd_sr_1 is None:
        pd_sr_1 = pd_sr
    if cd_sr_1 is None:
        cd_sr_1 = cd_sr

    assert_eq(pd_sr_1, cd_sr_1)
    assert str(cd_sr_1) == str(pd_sr_1)
Beispiel #19
0
def test_timedelta_series_ops_with_cudf_scalars(data, cpu_scalar, dtype, op):
    """Compare series/scalar binary ops between pandas and cudf.

    The operation named by ``op`` is applied in both operand orders
    (series on the left, then scalar on the left).  The pandas side uses
    ``cpu_scalar`` and the cudf side uses ``cudf.Scalar(cpu_scalar)``.

    Raises
    ------
    ValueError
        If ``op`` is not one of ``add``, ``sub``, ``truediv``, ``floordiv``
        or ``mod``.  Previously such a value fell through both if/elif
        chains and surfaced as an ``UnboundLocalError``.
    """
    import operator

    binary_ops = {
        "add": operator.add,
        "sub": operator.sub,
        "truediv": operator.truediv,
        "floordiv": operator.floordiv,
        "mod": operator.mod,
    }
    try:
        apply_op = binary_ops[op]
    except KeyError:
        raise ValueError(f"Unsupported op: {op!r}") from None

    gpu_scalar = cudf.Scalar(cpu_scalar)

    gsr = cudf.Series(data=data, dtype=dtype)
    psr = gsr.to_pandas()

    # Series on the left-hand side.
    expected = apply_op(psr, cpu_scalar)
    actual = apply_op(gsr, gpu_scalar)
    assert_eq(expected, actual)

    # Scalar on the left-hand side (reflected operation).
    expected = apply_op(cpu_scalar, psr)
    actual = apply_op(gpu_scalar, gsr)
    assert_eq(expected, actual)
Beispiel #20
0
def test_timedelta_index_properties(data, dtype, name):
    """Compare timedelta index properties between cudf and pandas.

    Checks ``days``, ``seconds``, ``microseconds`` and ``nanoseconds``, then
    ``components``.  cudf results that contain nulls are cast to float
    before being compared.
    """
    gdi = cudf.Index(data, dtype=dtype, name=name)
    pdi = gdi.to_pandas()

    def local_assert(expected, actual):
        if actual._values.null_count:
            actual = actual.astype("float64")
        assert_eq(expected, actual)

    for field in ("days", "seconds", "microseconds", "nanoseconds"):
        expected_field = getattr(pdi, field)
        actual_field = getattr(gdi, field)
        local_assert(expected_field, actual_field)

    expected_components = pdi.components
    actual_components = gdi.components

    if actual_components.isnull().any().any():
        assert_eq(expected_components, actual_components.astype("float"))
    else:
        # Index type is only checked when the result is non-empty.
        has_rows = not actual_components.empty
        assert_eq(
            expected_components,
            actual_components,
            check_index_type=has_rows,
        )
def test_can_detect_dtype_from_avro_type_nested(avro_type, expected_dtype,
                                                namespace, nullable):
    """Check the column produced for a three-level nested Avro record.

    The schema is ``root.prop1 -> child.prop2 -> leaf.prop3``, where
    ``prop3`` has ``avro_type`` (wrapped as ``["null", avro_type]`` when
    ``nullable``).  The expected frame has a single column of
    ``expected_dtype`` named ``child.leaf.prop3``, with ``namespace``
    prefixed to ``child`` and ``leaf`` when it is not ``None``.
    """
    if nullable:
        avro_type = ["null", avro_type]

    def single_field_record(record_name, field_name, field_type):
        return {
            "name": record_name,
            "type": "record",
            "fields": [{
                "name": field_name,
                "type": field_type
            }],
        }

    schema_leaf = single_field_record("leaf", "prop3", avro_type)
    schema_child = single_field_record("child", "prop2", schema_leaf)
    schema_root = {
        "name": "root",
        "type": "record",
        "namespace": namespace,
        "fields": [{
            "name": "prop1",
            "type": schema_child
        }],
    }

    actual = cudf_from_avro_util(schema_root, [])

    ns = "" if namespace is None else namespace + "."
    col_name = f"{ns}child.{ns}leaf.prop3"

    expected = cudf.DataFrame(
        {col_name: cudf.Series(None, None, expected_dtype)})

    assert_eq(expected, actual)
Beispiel #22
0
def test_series_drop_edge_inputs():
    """Compare ``Series.drop`` edge cases between cudf and pandas.

    Covers ``columns=`` given as a list and as a string, then three calls
    for which both libraries are expected to raise a matching error.
    """
    gs = cudf.Series([42], name="a")
    ps = gs.to_pandas()

    assert_eq(ps.drop(columns=["b"]), gs.drop(columns=["b"]))

    assert_eq(ps.drop(columns="b"), gs.drop(columns="b"))

    # (positional args, keyword args, expected error message)
    failing_calls = [
        (["a"], {"columns": "a", "axis": 1}, "Cannot specify both"),
        ([], {}, "Need to specify at least one"),
        (["b"], {"axis": 1}, "No axis named 1"),
    ]
    for args, kwargs, message in failing_calls:
        # Fresh containers per side so neither call shares objects.
        assert_exceptions_equal(
            lfunc=ps.drop,
            rfunc=gs.drop,
            lfunc_args_and_kwargs=(list(args), dict(kwargs)),
            rfunc_args_and_kwargs=(list(args), dict(kwargs)),
            expected_error_message=message,
        )
Beispiel #23
0
def test_class_triple_six_splits():
    """Evaluate a three-group cubic spline on a finer grid.

    Each group has the six knots ``t = 0..5``.  The spline is evaluated at
    11 evenly spaced points per group, and every second evaluation point
    must reproduce the original ``x`` values.
    """
    t = cudf.Series([0, 1, 2, 3, 4, 5] * 3).astype("float32")
    x = cudf.Series([3, 2, 3, 4, 3, 1] * 3).astype("float32")
    prefixes = cudf.Series([0, 6, 12, 18]).astype("int32")
    g = cuspatial.interpolate.CubicSpline(t, x, prefixes=prefixes)

    group_ids = np.array([np.repeat(group, 12) for group in range(3)])
    groups = cudf.Series(np.ravel(group_ids))

    split_t = cudf.Series(
        np.ravel([np.linspace(0, 5, 11)] * 3),
        dtype="float32",
    )

    # Every second position within each block of 11 evaluation points.
    split_t_ind = [
        position
        for block_start in (0, 11, 22)
        for position in range(block_start, block_start + 11, 2)
    ]

    interpolated = g(split_t, groups=groups)
    assert_eq(interpolated[split_t_ind].reset_index(drop=True), x)
Beispiel #24
0
def compare_dataframe(left, right, nullable=True):
    """Compare two frames with ``assert_eq`` and return its result.

    When ``nullable`` is truthy, any argument that is a ``cudf.DataFrame``
    is first converted with ``to_pandas(nullable=True)``.  The index type is
    not checked when both frames have an empty index.
    """
    if nullable:
        if isinstance(left, cudf.DataFrame):
            left = left.to_pandas(nullable=True)
        if isinstance(right, cudf.DataFrame):
            right = right.to_pandas(nullable=True)

    both_empty = len(left.index) == 0 and len(right.index) == 0
    return assert_eq(left, right, check_index_type=not both_empty)
Beispiel #25
0
def check_serialization(df):
    """Run pickling checks on ``df``, slices of it and a sorted copy.

    When pickle protocol 5 is available, ``df`` is also round-tripped with
    out-of-band buffers and compared with the original.
    """
    # basic
    assert_frame_picklable(df)
    # sliced
    for window in (slice(None, -1), slice(1, None), slice(2, -2)):
        assert_frame_picklable(df[window])
    # sorted
    sortvaldf = df.sort_values("vals")
    assert isinstance(sortvaldf.index, GenericIndex)
    assert_frame_picklable(sortvaldf)
    # out-of-band
    if pickle.HIGHEST_PROTOCOL < 5:
        return
    buffers = []
    serialbytes = pickle.dumps(df,
                               protocol=5,
                               buffer_callback=buffers.append)
    assert all(isinstance(b, pickle.PickleBuffer) for b in buffers)
    loaded = pickle.loads(serialbytes, buffers=buffers)
    assert_eq(loaded, df)
Beispiel #26
0
def test_df_stack(nulls, num_cols, num_rows, dtype):
    """Compare ``DataFrame.stack`` between cudf and pandas.

    Builds ``num_cols`` columns of ``num_rows`` random integers in [0, 26)
    cast to ``dtype``.  With ``nulls == "some"``, half of the rows of each
    column, chosen at random, are set to NaN.  That case is skipped for any
    dtype other than ``float32``/``float64``.
    """
    if dtype not in ["float32", "float64"] and nulls in ["some"]:
        # The reason is passed positionally: the ``msg=`` keyword was
        # deprecated in pytest 7 in favour of ``reason=``.
        pytest.skip("nulls not supported in dtype: " + dtype)

    pdf = pd.DataFrame()
    for i in range(num_cols):
        colname = str(i)
        data = np.random.randint(0, 26, num_rows).astype(dtype)
        if nulls == "some":
            idx = np.random.choice(
                num_rows, size=int(num_rows / 2), replace=False
            )
            data[idx] = np.nan
        pdf[colname] = data

    gdf = cudf.from_pandas(pdf)

    got = gdf.stack()
    expect = pdf.stack()

    assert_eq(expect, got)
Beispiel #27
0
def test_json_corner_case_with_escape_and_double_quote_char_with_pandas(
    tmpdir, ):
    """Round-trip strings with quotes, backslashes and control characters.

    The frame is written by pandas as JSON lines and read back by both cudf
    and pandas; the two readings must agree.
    """
    fname = tmpdir.mkdir("gdf_json").join("tmp_json_escape_double_quote")

    source = pd.DataFrame({
        "a": ['ab"cd', "\\\b", "\r\\", "'"],
        "b": ["a\tb\t", "\\", '\\"', "\t"],
        "c": ["aeiou", "try", "json", "cudf"],
    })
    json_options = {
        "compression": "infer",
        "lines": True,
        "orient": "records",
    }
    source.to_json(fname, **json_options)

    df = cudf.read_json(fname, **json_options)
    pdf = pd.read_json(fname, **json_options)

    assert_eq(cudf.DataFrame(pdf), df)
Beispiel #28
0
def test_storage_options(tmpdir, pdf, hdfs):
    """Read a CSV from HDFS with host and port given as storage options.

    ``pdf`` is written locally, uploaded via ``hdfs`` and read by cudf from
    an ``hdfs://`` URL that contains only ``basedir``.  The reference is
    pandas reading the same remote file through ``hdfs.open``.
    """
    remote_path = basedir + "/file.csv"
    fname = tmpdir.mkdir("csv").join("file.csv")

    # Write to local file system
    pdf.to_csv(fname)
    # Read from local file system as buffer
    with open(fname, mode="rb") as local_file:
        payload = BytesIO(local_file.read())
    # Write to hdfs
    hdfs.upload(remote_path, payload)

    got = cudf.read_csv(
        f"hdfs://{basedir}/file.csv",
        storage_options={"host": host, "port": port},
    )

    # Read pandas from byte buffer
    with hdfs.open(basedir + "/file.csv") as remote_file:
        expect = pd.read_csv(remote_file)

    assert_eq(expect, got)
Beispiel #29
0
async def test_ping_pong_cudf(g):
    """Send a cudf object across a comm pair and compare it on arrival.

    ``g`` builds the object from the ``cudf`` module.  The object is wrapped
    with ``to_serialize``, written on one endpoint and read on the other.
    Both endpoints are closed in a ``finally`` block, so a failed read or
    assertion does not leave them open.
    """
    # if this test appears after cupy an import error arises
    # *** ImportError: /usr/lib/x86_64-linux-gnu/libstdc++.so.6: version `CXXABI_1.3.11'
    # not found (required by python3.7/site-packages/pyarrow/../../../libarrow.so.12)
    cudf = pytest.importorskip("cudf")
    from cudf.testing._utils import assert_eq

    cudf_obj = g(cudf)

    com, serv_com = await get_comm_pair()
    try:
        msg = {"op": "ping", "data": to_serialize(cudf_obj)}

        await com.write(msg)
        result = await serv_com.read()

        cudf_obj_2 = result.pop("data")
        assert result["op"] == "ping"
        assert_eq(cudf_obj, cudf_obj_2)
    finally:
        # Release both endpoints even when the exchange or a check failed.
        await com.close()
        await serv_com.close()
Beispiel #30
0
def test_cut_series(x, bins, right, include_lowest, ordered, precision):
    """Compare ``cut`` with ``pd.cut`` using identical keyword arguments."""
    cut_kwargs = {
        "x": x,
        "bins": bins,
        "right": right,
        "precision": precision,
        "include_lowest": include_lowest,
        "ordered": ordered,
    }

    pcat = pd.cut(**cut_kwargs)
    gcat = cut(**cut_kwargs)

    assert_eq(pcat, gcat)