Code example #1
File: test_csv.py Project: zhuohuwu0603/cudf
def test_csv_reader_hexadecimals(pdf_dtype, gdf_dtype):
    lines = ["0x0", "-0x1000", "0xfedcba", "0xABCDEF", "0xaBcDeF", "9512c20b"]
    values = [int(hex_int, 16) for hex_int in lines]

    buffer = "\n".join(lines)

    if gdf_dtype is not None:
        # require explicit `hex` dtype to parse hexadecimals
        pdf = pd.DataFrame(data=values, dtype=pdf_dtype, columns=["hex_int"])
        gdf = read_csv(StringIO(buffer), dtype=[gdf_dtype], names=["hex_int"])
        np.testing.assert_array_equal(pdf["hex_int"],
                                      gdf["hex_int"].to_array())
    else:
        # otherwise, dtype inference returns as object (string)
        pdf = pd.read_csv(StringIO(buffer), names=["hex_int"])
        gdf = read_csv(StringIO(buffer), names=["hex_int"])
        assert_eq(pdf, gdf)
Code example #2
File: test_csv.py Project: zhuohuwu0603/cudf
def test_read_csv_names_header_combination():
    pdf = pd.DataFrame({
        "firstname": ["Emma", "Ava", "Sophia"],
        "lastname": ["Olivia", "Isabella", "Charlotte"],
        "gender": ["F", "F", "F"],
    })
    buffer = pdf.to_csv(header=True, index=False)
    names = pdf.columns

    gdf = read_csv(StringIO(buffer), names=names, header=0)
    assert_eq(pdf, gdf)

    gdf = read_csv(StringIO(buffer), header=0)
    assert_eq(pdf, gdf)

    gdf = read_csv(StringIO(buffer))
    assert_eq(pdf, gdf)
Code example #3
def test_csv_reader_column_names(names):
    buffer = '0,1,2\n3,4,5\n6,7,8'

    df = read_csv(StringIO(buffer), names=names)
    if names is None:
        assert (list(df) == ['0', '1', '2'])
    else:
        assert (list(df) == list(names))
Code example #4
File: test_csv.py Project: zhuohuwu0603/cudf
def test_csv_reader_column_names(names):
    buffer = "0,1,2\n3,4,5\n6,7,8"

    df = read_csv(StringIO(buffer), names=names)
    if names is None:
        assert list(df) == ["0", "1", "2"]
    else:
        assert list(df) == list(names)
Code example #5
File: test_pagerank.py Project: rlratzel/rapids-test
def read_csv_file(csv_file):
    """
    Read csv_file and return a cuDF DataFrame
    """
    return cudf.read_csv(csv_file,
                         delimiter=' ',
                         dtype=['int32', 'int32', 'float32'],
                         header=None)
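
A hypothetical call for the helper above, assuming a space-delimited src/dst/weight edge list such as the ../datasets/karate.csv file referenced in other examples on this page:

import cudf

# hypothetical path; any three-column, space-delimited edge list works here
gdf = read_csv_file("../datasets/karate.csv")
print(gdf.dtypes)  # int32, int32, float32, as requested via the dtype list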
Code example #6
def df(request):
    engine = request.getfixturevalue("engine")
    paths = request.getfixturevalue("paths")
    if engine == "parquet":
        df1 = cudf.read_parquet(paths[0])[mycols_pq]
        df2 = cudf.read_parquet(paths[1])[mycols_pq]
    elif engine == "csv-no-header":
        df1 = cudf.read_csv(paths[0], header=None, names=allcols_csv)[mycols_csv]
        df2 = cudf.read_csv(paths[1], header=None, names=allcols_csv)[mycols_csv]
    elif engine == "csv":
        df1 = cudf.read_csv(paths[0], header=0)[mycols_csv]
        df2 = cudf.read_csv(paths[1], header=0)[mycols_csv]
    else:
        raise ValueError("unknown engine:" + engine)
    gdf = cudf.concat([df1, df2], axis=0)
    gdf["id"] = gdf["id"].astype("int64")
    return gdf
Code example #7
def test_csv_reader_byte_range_type_corner_case(tmpdir):
    fname = tmpdir.mkdir("gdf_csv").join("tmp_csvreader_file17.csv")

    cudf.datasets.timeseries(
        start="2000-01-01",
        end="2000-01-02",
        dtypes={
            "name": str,
            "id": int,
            "x": float,
            "y": float
        },
    ).to_csv(fname, chunksize=100000)

    byte_range = (2_147_483_648, 0)
    with pytest.raises(RuntimeError, match="Offset is past end of file"):
        cudf.read_csv(fname, byte_range=byte_range, header=None)
Code example #8
def test_csv_reader_oversized_byte_range():
    buffer = "a,b,c,d,e\n4,5,6,7,8"

    cu_df = read_csv(StringIO(buffer), byte_range=(0, 1024))
    pd_df = pd.read_csv(StringIO(buffer))

    assert all(pd_df.columns == cu_df.columns)
    assert pd_df.shape == cu_df.shape
Code example #9
File: test_csv.py Project: zhuohuwu0603/cudf
def test_csv_empty_file(tmpdir, contents):
    fname = tmpdir.mkdir("gdf_csv").join("test_csv_empty_file.csv")
    with open(fname, "w") as f:
        f.write(contents)

    col_names = ["col1", "col2", "col3", "col4"]
    in_dtypes = ["int", "str", "float", "short"]
    out_dtypes = ["int32", "object", "float32", "int16"]

    # Empty dataframe if no columns names specified or inferred
    df = read_csv(str(fname))
    assert len(df.columns) == 0

    # No row dataframe if columns names are specified or inferred
    df = read_csv(str(fname), dtype=in_dtypes, names=col_names)
    assert all(df.columns == col_names)
    assert list(df.dtypes) == out_dtypes
Code example #10
def test_dask_katz_centrality(client_connection):
    gc.collect()

    input_data_path = r"../datasets/karate.csv"
    chunksize = dcg.get_chunksize(input_data_path)

    ddf = dask_cudf.read_csv(
        input_data_path,
        chunksize=chunksize,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    df = cudf.read_csv(
        input_data_path,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    g = cugraph.DiGraph()
    g.from_cudf_edgelist(df, "src", "dst")

    dg = cugraph.DiGraph()
    dg.from_dask_cudf_edgelist(ddf, "src", "dst")

    largest_out_degree = g.degrees().nlargest(n=1, columns="out_degree")
    largest_out_degree = largest_out_degree["out_degree"].iloc[0]
    katz_alpha = 1 / (largest_out_degree + 1)

    mg_res = dcg.katz_centrality(dg, alpha=katz_alpha, tol=1e-6)
    mg_res = mg_res.compute()

    import networkx as nx
    from cugraph.tests import utils
    NM = utils.read_csv_for_nx(input_data_path)
    Gnx = nx.from_pandas_edgelist(NM,
                                  create_using=nx.DiGraph(),
                                  source="0",
                                  target="1")
    nk = nx.katz_centrality(Gnx, alpha=katz_alpha)
    import pandas as pd
    pdf = pd.DataFrame(nk.items(), columns=['vertex', 'katz_centrality'])
    exp_res = cudf.DataFrame(pdf)
    err = 0
    tol = 1.0e-05

    compare_res = exp_res.merge(mg_res,
                                on="vertex",
                                suffixes=["_local", "_dask"])

    for i in range(len(compare_res)):
        diff = abs(compare_res["katz_centrality_local"].iloc[i] -
                   compare_res["katz_centrality_dask"].iloc[i])
        if diff > tol * 1.1:
            err = err + 1
    assert err == 0
Code example #11
def test_csv_reader_NaN_values():

    names = dtypes = ["float32"]
    empty_cells = '\n""\n  \n "" \n'
    default_na_cells = (
        "#N/A\n#N/A N/A\n#NA\n-1.#IND\n"
        "-1.#QNAN\n-NaN\n-nan\n1.#IND\n"
        "1.#QNAN\nN/A\nNA\nNULL\n"
        "NaN\nn/a\nnan\nnull\n"
    )
    custom_na_cells = "NV_NAN\nNotANumber\n"
    all_cells = empty_cells + default_na_cells + custom_na_cells
    custom_na_values = ["NV_NAN", "NotANumber"]

    # test default NA values. empty cells should also yield NaNs
    all_nan = read_csv(
        StringIO(default_na_cells + empty_cells), names=names, dtype=dtypes
    )
    assert all(np.isnan(all_nan.to_pandas()["float32"]))

    # custom NA values
    all_nan = read_csv(
        StringIO(all_cells),
        names=names,
        dtype=dtypes,
        na_values=custom_na_values,
    )
    assert all(np.isnan(all_nan.to_pandas()["float32"]))

    # data type detection should evaluate the column to int8 (all nulls)
    df_int8 = read_csv(
        StringIO(default_na_cells + custom_na_cells),
        header=None,
        na_values=custom_na_values,
    )
    assert df_int8.dtypes[0] == "int8"
    assert all(df_int8["0"][idx] is None for idx in range(len(df_int8["0"])))

    # data type detection should evaluate the column to object;
    # for data type detection, cells need to be completely empty,
    # but some cells in empty_cells contain blank characters and quotes
    df_obj = read_csv(
        StringIO(all_cells), header=None, na_values=custom_na_values
    )
    assert df_obj.dtypes[0] == np.dtype("object")
Code example #12
def test_csv_reader_mixed_data_delimiter_sep(tmpdir, pandas_arg, cudf_arg):

    fname = tmpdir.mkdir("gdf_csv").join("tmp_csvreader_file3.csv")

    df = make_numpy_mixed_dataframe()
    df.to_csv(fname, sep="|", index=False, header=False)

    gdf1 = read_csv(
        str(fname),
        # Category is not yet supported from libcudf
        # names=["1", "2", "3", "4", "5", "6", "7"],
        # dtype=[
        #    "int64", "date", "float64", "int64", "category", "str", "bool"
        # ],
        names=["1", "2", "3", "4", "5", "6"],
        dtype=["int64", "date", "float64", "uint64", "str", "bool"],
        dayfirst=True,
        **cudf_arg,
    )
    gdf2 = read_csv(
        str(fname),
        # Category is not yet supported from libcudf
        # names=["1", "2", "3", "4", "5", "6", "7"],
        # dtype=[
        #    "int64", "date", "float64", "int64", "category", "str", "bool"
        # ],
        names=["1", "2", "3", "4", "5", "6"],
        dtype=["int64", "date", "float64", "uint64", "str", "bool"],
        dayfirst=True,
        **pandas_arg,
    )

    pdf = pd.read_csv(
        fname,
        # Category is not yet supported from libcudf
        # names=["1", "2", "3", "4", "5", "6", "7"],
        names=["1", "2", "3", "4", "5", "6"],
        parse_dates=[1],
        dayfirst=True,
        **pandas_arg,
    )

    assert len(gdf1.columns) == len(pdf.columns)
    assert len(gdf2.columns) == len(pdf.columns)
    assert_eq(gdf1, gdf2)
Code example #13
def test_csv_blank_first_row(line_terminator):

    lines = ["colA,colB", "", "1, 1.1", "2, 2.2"]
    buffer = line_terminator.join(lines)

    cu_df = read_csv(StringIO(buffer))

    assert cu_df.shape == (2, 2)
    assert all(cu_df.columns == ["colA", "colB"])
Code example #14
def test_csv_reader_unnamed_cols():
    # first and last columns are unnamed
    buffer = ",1,2,3,\n4,5,6,7,8"

    cu_df = read_csv(StringIO(buffer))
    pd_df = pd.read_csv(StringIO(buffer))

    assert all(pd_df.columns == cu_df.columns)
    assert pd_df.shape == cu_df.shape
Code example #15
def test_csv_reader_category_hash():

    lines = ["HBM0676", "KRC0842", "ILM1441", "EJV0094", "ILM1441"]
    buffer = "\n".join(lines)

    df = read_csv(StringIO(buffer), names=["user"], dtype=["category"])

    hash_ref = [2022314536, -189888986, 1512937027, 397836265, 1512937027]
    assert list(df["user"]) == hash_ref
Code example #16
def test_dask_bfs(dask_client):
    gc.collect()

    input_data_path = (RAPIDS_DATASET_ROOT_DIR_PATH /
                       "netscience.csv").as_posix()

    print(f"dataset={input_data_path}")
    chunksize = dcg.get_chunksize(input_data_path)

    ddf = dask_cudf.read_csv(
        input_data_path,
        chunksize=chunksize,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    def modify_dataset(df):
        temp_df = cudf.DataFrame()
        temp_df['src'] = df['src'] + 1000
        temp_df['dst'] = df['dst'] + 1000
        temp_df['value'] = df['value']
        return cudf.concat([df, temp_df])

    meta = ddf._meta
    ddf = ddf.map_partitions(modify_dataset, meta=meta)

    df = cudf.read_csv(
        input_data_path,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    df = modify_dataset(df)

    g = cugraph.DiGraph()
    g.from_cudf_edgelist(df, "src", "dst")

    dg = cugraph.DiGraph()
    dg.from_dask_cudf_edgelist(ddf, "src", "dst")

    expected_dist = cugraph.bfs(g, [0, 1000])
    result_dist = dcg.bfs(dg, [0, 1000])
    result_dist = result_dist.compute()

    compare_dist = expected_dist.merge(result_dist,
                                       on="vertex",
                                       suffixes=["_local", "_dask"])

    err = 0

    for i in range(len(compare_dist)):
        if (compare_dist["distance_local"].iloc[i] !=
                compare_dist["distance_dask"].iloc[i]):
            err = err + 1
    assert err == 0
Code example #17
def test_csv_reader_byte_range_strings(segment_bytes):
    names = ["strings"]
    buffer = "\n".join('"' + str(x) + '"' for x in range(1, 100))
    file_size = len(buffer)

    ref_df = read_csv(StringIO(buffer), names=names).to_pandas()

    dfs = []
    for segment in range((file_size + segment_bytes - 1) // segment_bytes):
        dfs.append(
            read_csv(
                StringIO(buffer),
                names=names,
                byte_range=(segment * segment_bytes, segment_bytes),
            ))
    df = cudf.concat(dfs).to_pandas()

    assert list(df["strings"]) == list(ref_df["strings"])
Code example #18
def test_csv_reader_category_hash():

    lines = ['HBM0676', 'KRC0842', 'ILM1441', 'EJV0094', 'ILM1441']
    buffer = '\n'.join(lines)

    df = read_csv(StringIO(buffer), names=['user'], dtype=['category'])

    hash_ref = [2022314536, -189888986, 1512937027, 397836265, 1512937027]
    assert(list(df['user']) == hash_ref)
Code example #19
    def load(self, index=None):
        data = cudf.read_csv(self.data_path)
        if self.index_col:
            data = data.set_index(self.index_col).sort_index()

        if index is not None:
            data = data.loc[index]
        self.data = data
        return
Code example #20
def parse_log_file(filepath):
    """Parse Zeek log file and return cuDF dataframe. Uses header comments to get column names/types and configure parser.
    """

    header_gdf = cudf.read_csv(filepath, names=["line"], nrows=8)
    lines = header_gdf["line"].str.split_record()
    column_names = lines[6][1:len(lines[6])].to_host()
    column_types = lines[7][1:len(lines[7])].to_host()
    column_dtypes = list(map(lambda x: type_dict.get(x, "str"), column_types))
    log_gdf = cudf.read_csv(
        filepath,
        delimiter="\t",
        dtype=column_dtypes,
        names=column_names,
        skiprows=8,
        skipfooter=1,
    )
    return log_gdf
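
The function above relies on a type_dict that maps Zeek field types to cuDF dtypes but is not shown in the snippet; a minimal sketch of such a mapping (an assumption, not taken from the original project) could look like this:

# assumed Zeek-type -> cuDF-dtype mapping; unknown types fall back to "str"
# via type_dict.get(x, "str") in the function above
type_dict = {
    "time": "float64",
    "interval": "float64",
    "count": "int64",
    "int": "int64",
    "double": "float64",
    "bool": "bool",
    "port": "int64",
    "addr": "str",
    "string": "str",
}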
Code example #21
def test_csv_blank_first_row(line_terminator):

    lines = ['colA,colB', '', '1, 1.1', '2, 2.2']
    buffer = line_terminator.join(lines)

    cu_df = read_csv(StringIO(buffer))

    assert (cu_df.shape == (2, 2))
    assert (all(cu_df.columns == ['colA', 'colB']))
Code example #22
def test_csv_writer_buffer(tmpdir):

    gdf = cudf.DataFrame({"a": [1, 2, 3], "b": ["xxx", "yyyy", "zzzzz"]})

    buffer = BytesIO()
    gdf.to_csv(buffer, index=False)

    result = cudf.read_csv(buffer)
    assert_eq(result, gdf)
Code example #23
    def read_csv(self, files, **kwargs):
        if type(files) == str:
            files = [files]
        if "dtype" in kwargs:
            kwargs["dtype"] = OrderedDict([
                (col, ("str" if dtype == "category" else dtype))
                for (col, dtype) in kwargs["dtype"].items()
            ])
        return self.concat([cudf.read_csv(f, **kwargs) for f in files])
Code example #24
File: test_s3.py Project: mnicely/cudf
def test_read_csv(pdf):
    # Write to buffer
    fname = "test_csv_reader.csv"
    bname = "csv"
    buffer = pdf.to_csv(index=False)
    with s3_context(bname, {fname: buffer}):
        got = cudf.read_csv("s3://{}/{}".format(bname, fname))

    assert_eq(pdf, got)
Code example #25
File: utils.py Project: cjber/ahah
def clean_dentists(england: Path, scotland: Path,
                   postcodes: cudf.DataFrame) -> cudf.DataFrame:
    logger.info("Cleaning dentists...")

    edent = (cudf.read_csv(
        england, usecols=[0, 9], header=None).rename(columns={
            "0": "dentist",
            "9": "postcode"
        }).pipe(fix_postcodes).set_index("postcode").join(postcodes).pipe(
            find_partial_pc, postcodes))

    sdent = (cudf.read_csv(scotland).rename(columns={
        "DentalPracticeCode": "dentist",
        "Postcode": "postcode",
    })[["dentist", "postcode"]].astype(str).pipe(fix_postcodes).set_index(
        "postcode").join(postcodes).pipe(find_partial_pc, postcodes))

    return edent.append(sdent).reset_index()
Code example #26
def gpu_load_performance_csv(performance_path, **kwargs):
    """ Loads performance data

    Returns
    -------
    GPU DataFrame
    """
    
    cols = [
        "loan_id", "monthly_reporting_period", "servicer", "interest_rate", "current_actual_upb",
        "loan_age", "remaining_months_to_legal_maturity", "adj_remaining_months_to_maturity",
        "maturity_date", "msa", "current_loan_delinquency_status", "mod_flag", "zero_balance_code",
        "zero_balance_effective_date", "last_paid_installment_date", "foreclosed_after",
        "disposition_date", "foreclosure_costs", "prop_preservation_and_repair_costs",
        "asset_recovery_costs", "misc_holding_expenses", "holding_taxes", "net_sale_proceeds",
        "credit_enhancement_proceeds", "repurchase_make_whole_proceeds", "other_foreclosure_proceeds",
        "non_interest_bearing_upb", "principal_forgiveness_upb", "repurchase_make_whole_proceeds_flag",
        "foreclosure_principal_write_off_amount", "servicing_activity_indicator"
    ]
    
    dtypes = OrderedDict([
        ("loan_id", "int64"),
        ("monthly_reporting_period", "date"),
        ("servicer", "category"),
        ("interest_rate", "float64"),
        ("current_actual_upb", "float64"),
        ("loan_age", "float64"),
        ("remaining_months_to_legal_maturity", "float64"),
        ("adj_remaining_months_to_maturity", "float64"),
        ("maturity_date", "date"),
        ("msa", "float64"),
        ("current_loan_delinquency_status", "int32"),
        ("mod_flag", "category"),
        ("zero_balance_code", "category"),
        ("zero_balance_effective_date", "date"),
        ("last_paid_installment_date", "date"),
        ("foreclosed_after", "date"),
        ("disposition_date", "date"),
        ("foreclosure_costs", "float64"),
        ("prop_preservation_and_repair_costs", "float64"),
        ("asset_recovery_costs", "float64"),
        ("misc_holding_expenses", "float64"),
        ("holding_taxes", "float64"),
        ("net_sale_proceeds", "float64"),
        ("credit_enhancement_proceeds", "float64"),
        ("repurchase_make_whole_proceeds", "float64"),
        ("other_foreclosure_proceeds", "float64"),
        ("non_interest_bearing_upb", "float64"),
        ("principal_forgiveness_upb", "float64"),
        ("repurchase_make_whole_proceeds_flag", "category"),
        ("foreclosure_principal_write_off_amount", "float64"),
        ("servicing_activity_indicator", "category")
    ])

    print(performance_path)
    
    return cudf.read_csv(performance_path, names=cols, delimiter='|', dtype=list(dtypes.values()), skiprows=1)
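
The snippet above assumes import cudf and from collections import OrderedDict at module level (neither import is shown). A hypothetical call:

# hypothetical file name; any pipe-delimited loan-performance file with these 31 columns works
perf_gdf = gpu_load_performance_csv("Performance_2000Q1.txt")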
Code example #27
def read(path, media_type, index_col = None, header = 'infer', sep = ','):

    # TO REDO
    if media_type == 'csv':
        df = cuDFManager.read_csv(path, index_col = index_col, header = header, sep = sep)
    else:
        raise NotImplementedError('cuDFManager.read_' + media_type)

    return df
Code example #28
File: fuzz_test_csv.py Project: wenxiang-Li/cudf
def csv_writer_test(pdf):
    gdf = cudf.from_pandas(pdf)

    pd_buffer = pdf.to_csv()
    gd_buffer = gdf.to_csv()

    compare_content(pd_buffer, gd_buffer)
    actual = cudf.read_csv(StringIO(gd_buffer))
    expected = pd.read_csv(StringIO(pd_buffer))
    assert_eq(actual, expected)
Code example #29
def load_data(config: DictConfig) -> Tuple[cudf.DataFrame, cudf.DataFrame]:
    loading = cudf.read_csv(f"{config.store.workdir}/input/loading.csv")
    fnc = cudf.read_csv(f"{config.store.workdir}/input/fnc.csv")
    train_df = cudf.read_csv(f"{config.store.workdir}/input/train_scores.csv")
    submission = cudf.read_csv(
        f"{config.store.workdir}/input/sample_submission.csv")
    train_df = train_df.merge(loading, on="Id", how="left")
    train_df = train_df.merge(fnc, on="Id", how="left")
    test_df = submission["Id"].str.split("_")[0].unique().astype(int)
    test_df = cudf.DataFrame({"Id": test_df})
    test_df = test_df.merge(loading, on="Id", how="left")
    test_df = test_df.merge(fnc, on="Id", how="left")
    # mean shift
    train_df["IC_20"] += 0.0022449734660541093
    # Scaling
    train_df[fnc.columns[1:]] /= 500
    test_df[fnc.columns[1:]] /= 500

    return train_df, test_df
Code example #30
def _read_csv(fn, dtypes=None, **kwargs):
    try:
        cdf = cudf.read_csv(fn, **kwargs)
    except GDFError:
        # end of file check https://github.com/rapidsai/dask-cudf/issues/103
        # this should be removed when CUDF has better dtype/parse_date support
        dtypes = dict(zip(kwargs["names"], dtypes))
        df = dd.core.make_meta(dtypes)
        cdf = cudf.from_pandas(df)
    return cdf