def test_csv_reader_hexadecimals(pdf_dtype, gdf_dtype):
    lines = ["0x0", "-0x1000", "0xfedcba", "0xABCDEF", "0xaBcDeF", "9512c20b"]
    values = [int(hex_int, 16) for hex_int in lines]
    buffer = "\n".join(lines)

    if gdf_dtype is not None:
        # require explicit `hex` dtype to parse hexadecimals
        pdf = pd.DataFrame(data=values, dtype=pdf_dtype, columns=["hex_int"])
        gdf = read_csv(StringIO(buffer), dtype=[gdf_dtype], names=["hex_int"])
        np.testing.assert_array_equal(
            pdf["hex_int"], gdf["hex_int"].to_array()
        )
    else:
        # otherwise, dtype inference returns the column as object (string)
        pdf = pd.read_csv(StringIO(buffer), names=["hex_int"])
        gdf = read_csv(StringIO(buffer), names=["hex_int"])
        assert_eq(pdf, gdf)
def test_read_csv_names_header_combination():
    pdf = pd.DataFrame(
        {
            "firstname": ["Emma", "Ava", "Sophia"],
            "lastname": ["Olivia", "Isabella", "Charlotte"],
            "gender": ["F", "F", "F"],
        }
    )
    buffer = pdf.to_csv(header=True, index=False)
    names = pdf.columns

    gdf = read_csv(StringIO(buffer), names=names, header=0)
    assert_eq(pdf, gdf)

    gdf = read_csv(StringIO(buffer), header=0)
    assert_eq(pdf, gdf)

    gdf = read_csv(StringIO(buffer))
    assert_eq(pdf, gdf)
def test_csv_reader_column_names(names): buffer = "0,1,2\n3,4,5\n6,7,8" df = read_csv(StringIO(buffer), names=names) if names is None: assert list(df) == ["0", "1", "2"] else: assert list(df) == list(names)
def read_csv_file(csv_file):
    """Read csv_file and return a cuDF DataFrame."""
    return cudf.read_csv(
        csv_file,
        delimiter=" ",
        dtype=["int32", "int32", "float32"],
        header=None,
    )
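# A minimal usage sketch for read_csv_file above; "edges.csv" is a
# hypothetical space-delimited, headerless file with three columns
# (e.g. lines like "0 1 0.5") matching the dtypes the wrapper fixes.
edges = read_csv_file("edges.csv")
print(edges.dtypes)  # expected: int32, int32, float32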
def df(request):
    engine = request.getfixturevalue("engine")
    paths = request.getfixturevalue("paths")

    if engine == "parquet":
        df1 = cudf.read_parquet(paths[0])[mycols_pq]
        df2 = cudf.read_parquet(paths[1])[mycols_pq]
    elif engine == "csv-no-header":
        df1 = cudf.read_csv(paths[0], header=None, names=allcols_csv)[mycols_csv]
        df2 = cudf.read_csv(paths[1], header=None, names=allcols_csv)[mycols_csv]
    elif engine == "csv":
        df1 = cudf.read_csv(paths[0], header=0)[mycols_csv]
        df2 = cudf.read_csv(paths[1], header=0)[mycols_csv]
    else:
        raise ValueError("unknown engine: " + engine)

    gdf = cudf.concat([df1, df2], axis=0)
    gdf["id"] = gdf["id"].astype("int64")
    return gdf
def test_csv_reader_byte_range_type_corner_case(tmpdir):
    fname = tmpdir.mkdir("gdf_csv").join("tmp_csvreader_file17.csv")

    cudf.datasets.timeseries(
        start="2000-01-01",
        end="2000-01-02",
        dtypes={"name": str, "id": int, "x": float, "y": float},
    ).to_csv(fname, chunksize=100000)

    byte_range = (2_147_483_648, 0)
    with pytest.raises(RuntimeError, match="Offset is past end of file"):
        cudf.read_csv(fname, byte_range=byte_range, header=None)
def test_csv_reader_oversized_byte_range():
    buffer = "a,b,c,d,e\n4,5,6,7,8"

    cu_df = read_csv(StringIO(buffer), byte_range=(0, 1024))
    pd_df = pd.read_csv(StringIO(buffer))

    assert all(pd_df.columns == cu_df.columns)
    assert pd_df.shape == cu_df.shape
def test_csv_empty_file(tmpdir, contents):
    fname = tmpdir.mkdir("gdf_csv").join("test_csv_empty_file.csv")
    with open(fname, "w") as f:
        f.write(contents)

    col_names = ["col1", "col2", "col3", "col4"]
    in_dtypes = ["int", "str", "float", "short"]
    out_dtypes = ["int32", "object", "float32", "int16"]

    # empty dataframe if no column names are specified or inferred
    df = read_csv(str(fname))
    assert len(df.columns) == 0

    # no-row dataframe if column names are specified or inferred
    df = read_csv(str(fname), dtype=in_dtypes, names=col_names)
    assert all(df.columns == col_names)
    assert list(df.dtypes) == out_dtypes
def test_dask_katz_centrality(client_connection):
    gc.collect()

    input_data_path = r"../datasets/karate.csv"
    chunksize = dcg.get_chunksize(input_data_path)

    ddf = dask_cudf.read_csv(
        input_data_path,
        chunksize=chunksize,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )
    df = cudf.read_csv(
        input_data_path,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    g = cugraph.DiGraph()
    g.from_cudf_edgelist(df, "src", "dst")

    dg = cugraph.DiGraph()
    dg.from_dask_cudf_edgelist(ddf, "src", "dst")

    largest_out_degree = g.degrees().nlargest(n=1, columns="out_degree")
    largest_out_degree = largest_out_degree["out_degree"].iloc[0]
    katz_alpha = 1 / (largest_out_degree + 1)

    mg_res = dcg.katz_centrality(dg, alpha=katz_alpha, tol=1e-6)
    mg_res = mg_res.compute()

    import networkx as nx
    import pandas as pd
    from cugraph.tests import utils

    NM = utils.read_csv_for_nx(input_data_path)
    Gnx = nx.from_pandas_edgelist(
        NM, create_using=nx.DiGraph(), source="0", target="1"
    )
    nk = nx.katz_centrality(Gnx, alpha=katz_alpha)
    pdf = pd.DataFrame(nk.items(), columns=["vertex", "katz_centrality"])
    exp_res = cudf.DataFrame(pdf)

    err = 0
    tol = 1.0e-05
    compare_res = exp_res.merge(
        mg_res, on="vertex", suffixes=["_local", "_dask"]
    )
    for i in range(len(compare_res)):
        diff = abs(
            compare_res["katz_centrality_local"].iloc[i]
            - compare_res["katz_centrality_dask"].iloc[i]
        )
        if diff > tol * 1.1:
            err = err + 1
    assert err == 0
def test_csv_reader_NaN_values():
    names = dtypes = ["float32"]
    empty_cells = '\n""\n \n "" \n'
    default_na_cells = (
        "#N/A\n#N/A N/A\n#NA\n-1.#IND\n"
        "-1.#QNAN\n-NaN\n-nan\n1.#IND\n"
        "1.#QNAN\nN/A\nNA\nNULL\n"
        "NaN\nn/a\nnan\nnull\n"
    )
    custom_na_cells = "NV_NAN\nNotANumber\n"
    all_cells = empty_cells + default_na_cells + custom_na_cells
    custom_na_values = ["NV_NAN", "NotANumber"]

    # test default NA values; empty cells should also yield NaNs
    all_nan = read_csv(
        StringIO(default_na_cells + empty_cells), names=names, dtype=dtypes
    )
    assert all(np.isnan(all_nan.to_pandas()["float32"]))

    # custom NA values
    all_nan = read_csv(
        StringIO(all_cells),
        names=names,
        dtype=dtypes,
        na_values=custom_na_values,
    )
    assert all(np.isnan(all_nan.to_pandas()["float32"]))

    # data type detection should evaluate the column to int8 (all nulls)
    df_int8 = read_csv(
        StringIO(default_na_cells + custom_na_cells),
        header=None,
        na_values=custom_na_values,
    )
    assert df_int8.dtypes[0] == "int8"
    assert all(df_int8["0"][idx] is None for idx in range(len(df_int8["0"])))

    # data type detection should evaluate the column to object;
    # for data type detection, cells need to be completely empty,
    # but some cells in empty_cells contain blank characters and quotes
    df_obj = read_csv(
        StringIO(all_cells), header=None, na_values=custom_na_values
    )
    assert df_obj.dtypes[0] == np.dtype("object")
def test_csv_reader_mixed_data_delimiter_sep(tmpdir, pandas_arg, cudf_arg):
    fname = tmpdir.mkdir("gdf_csv").join("tmp_csvreader_file3.csv")

    df = make_numpy_mixed_dataframe()
    df.to_csv(fname, sep="|", index=False, header=False)

    # Category is not yet supported from libcudf; with category support the
    # calls below would instead use:
    #   names=["1", "2", "3", "4", "5", "6", "7"]
    #   dtype=["int64", "date", "float64", "int64", "category", "str", "bool"]
    gdf1 = read_csv(
        str(fname),
        names=["1", "2", "3", "4", "5", "6"],
        dtype=["int64", "date", "float64", "uint64", "str", "bool"],
        dayfirst=True,
        **cudf_arg,
    )
    gdf2 = read_csv(
        str(fname),
        names=["1", "2", "3", "4", "5", "6"],
        dtype=["int64", "date", "float64", "uint64", "str", "bool"],
        dayfirst=True,
        **pandas_arg,
    )
    pdf = pd.read_csv(
        fname,
        names=["1", "2", "3", "4", "5", "6"],
        parse_dates=[1],
        dayfirst=True,
        **pandas_arg,
    )

    assert len(gdf1.columns) == len(pdf.columns)
    assert len(gdf2.columns) == len(pdf.columns)
    assert_eq(gdf1, gdf2)
def test_csv_blank_first_row(line_terminator): lines = ["colA,colB", "", "1, 1.1", "2, 2.2"] buffer = line_terminator.join(lines) cu_df = read_csv(StringIO(buffer)) assert cu_df.shape == (2, 2) assert all(cu_df.columns == ["colA", "colB"])
def test_csv_reader_unnamed_cols():
    # first and last columns are unnamed
    buffer = ",1,2,3,\n4,5,6,7,8"

    cu_df = read_csv(StringIO(buffer))
    pd_df = pd.read_csv(StringIO(buffer))

    assert all(pd_df.columns == cu_df.columns)
    assert pd_df.shape == cu_df.shape
def test_csv_reader_category_hash(): lines = ["HBM0676", "KRC0842", "ILM1441", "EJV0094", "ILM1441"] buffer = "\n".join(lines) df = read_csv(StringIO(buffer), names=["user"], dtype=["category"]) hash_ref = [2022314536, -189888986, 1512937027, 397836265, 1512937027] assert list(df["user"]) == hash_ref
def test_dask_bfs(dask_client):
    gc.collect()

    input_data_path = (RAPIDS_DATASET_ROOT_DIR_PATH / "netscience.csv").as_posix()
    print(f"dataset={input_data_path}")

    chunksize = dcg.get_chunksize(input_data_path)
    ddf = dask_cudf.read_csv(
        input_data_path,
        chunksize=chunksize,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    def modify_dataset(df):
        # duplicate the edge list with all vertex ids shifted by 1000
        temp_df = cudf.DataFrame()
        temp_df["src"] = df["src"] + 1000
        temp_df["dst"] = df["dst"] + 1000
        temp_df["value"] = df["value"]
        return cudf.concat([df, temp_df])

    meta = ddf._meta
    ddf = ddf.map_partitions(modify_dataset, meta=meta)

    df = cudf.read_csv(
        input_data_path,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )
    df = modify_dataset(df)

    g = cugraph.DiGraph()
    g.from_cudf_edgelist(df, "src", "dst")

    dg = cugraph.DiGraph()
    dg.from_dask_cudf_edgelist(ddf, "src", "dst")

    expected_dist = cugraph.bfs(g, [0, 1000])
    result_dist = dcg.bfs(dg, [0, 1000])
    result_dist = result_dist.compute()

    compare_dist = expected_dist.merge(
        result_dist, on="vertex", suffixes=["_local", "_dask"]
    )

    err = 0
    for i in range(len(compare_dist)):
        if (
            compare_dist["distance_local"].iloc[i]
            != compare_dist["distance_dask"].iloc[i]
        ):
            err = err + 1
    assert err == 0
def test_csv_reader_byte_range_strings(segment_bytes):
    names = ["strings"]
    buffer = "\n".join('"' + str(x) + '"' for x in range(1, 100))
    file_size = len(buffer)

    ref_df = read_csv(StringIO(buffer), names=names).to_pandas()

    dfs = []
    for segment in range((file_size + segment_bytes - 1) // segment_bytes):
        dfs.append(
            read_csv(
                StringIO(buffer),
                names=names,
                byte_range=(segment * segment_bytes, segment_bytes),
            )
        )
    df = cudf.concat(dfs).to_pandas()

    assert list(df["strings"]) == list(ref_df["strings"])
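# Hedged sketch of the byte_range semantics exercised above: the tuple is
# (offset_in_bytes, size_in_bytes), a size of 0 reads to the end of the
# source, and only records that begin inside the range are parsed (even if
# they end past it), so adjacent ranges cover the file exactly once.
# `sample` below is illustrative data, not taken from the test suite.
sample = "10,1.5\n20,2.5\n30,3.5\n"
tail = cudf.read_csv(
    StringIO(sample),
    header=None,
    names=["key", "val"],
    byte_range=(len(sample) // 2, 0),  # from mid-file to the end
)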
def load(self, index=None):
    data = cudf.read_csv(self.data_path)
    if self.index_col:
        data = data.set_index(self.index_col).sort_index()
    if index is not None:
        data = data.loc[index]
    self.data = data
def parse_log_file(filepath):
    """Parse Zeek log file and return cuDF dataframe.

    Uses header comments to get column names/types and configure parser.
    """
    # the first 8 lines are "#"-prefixed header comments; lines 7 and 8
    # hold the field names and field types, respectively
    header_gdf = cudf.read_csv(filepath, names=["line"], nrows=8)
    lines = header_gdf["line"].str.split_record()
    column_names = lines[6][1:len(lines[6])].to_host()
    column_types = lines[7][1:len(lines[7])].to_host()
    column_dtypes = list(map(lambda x: type_dict.get(x, "str"), column_types))

    log_gdf = cudf.read_csv(
        filepath,
        delimiter="\t",
        dtype=column_dtypes,
        names=column_names,
        skiprows=8,
        skipfooter=1,
    )
    return log_gdf
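# For orientation, a sketch of the Zeek TSV header that parse_log_file
# depends on: eight "#"-prefixed lines, with the field names on line 7
# (index 6) and their types on line 8 (index 7). The field list varies by
# log type; this conn.log-style excerpt is illustrative only.
#
#   #separator \x09
#   #set_separator  ,
#   #empty_field    (empty)
#   #unset_field    -
#   #path   conn
#   #open   2019-01-01-00-00-00
#   #fields ts  uid  id.orig_h  id.orig_p  ...
#   #types  time  string  addr  port  ...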
def test_csv_writer_buffer(tmpdir):
    gdf = cudf.DataFrame({"a": [1, 2, 3], "b": ["xxx", "yyyy", "zzzzz"]})

    buffer = BytesIO()
    gdf.to_csv(buffer, index=False)

    result = cudf.read_csv(buffer)
    assert_eq(result, gdf)
def read_csv(self, files, **kwargs):
    if isinstance(files, str):
        files = [files]
    if "dtype" in kwargs:
        # cudf cannot parse directly into categoricals; read those as str
        kwargs["dtype"] = OrderedDict(
            (col, "str" if dtype == "category" else dtype)
            for col, dtype in kwargs["dtype"].items()
        )
    return self.concat([cudf.read_csv(f, **kwargs) for f in files])
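# Hedged usage sketch for the read_csv wrapper above, assuming `ds` is an
# instance of the owning class (which must provide concat()); the file
# names and columns are made up for illustration.
ds.read_csv(
    ["part0.csv", "part1.csv"],
    names=["id", "label"],
    dtype=OrderedDict([("id", "int64"), ("label", "category")]),
)
# "category" is downgraded to "str" per column before the dtype mapping is
# handed to cudf.read_csv, then the per-file frames are concatenated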
def test_read_csv(pdf):
    # write to buffer
    fname = "test_csv_reader.csv"
    bname = "csv"
    buffer = pdf.to_csv(index=False)

    with s3_context(bname, {fname: buffer}):
        got = cudf.read_csv("s3://{}/{}".format(bname, fname))

    assert_eq(pdf, got)
def clean_dentists(
    england: Path, scotland: Path, postcodes: cudf.DataFrame
) -> cudf.DataFrame:
    logger.info("Cleaning dentists...")
    edent = (
        cudf.read_csv(england, usecols=[0, 9], header=None)
        .rename(columns={"0": "dentist", "9": "postcode"})
        .pipe(fix_postcodes)
        .set_index("postcode")
        .join(postcodes)
        .pipe(find_partial_pc, postcodes)
    )
    sdent = (
        cudf.read_csv(scotland)
        .rename(columns={"DentalPracticeCode": "dentist", "Postcode": "postcode"})
        [["dentist", "postcode"]]
        .astype(str)
        .pipe(fix_postcodes)
        .set_index("postcode")
        .join(postcodes)
        .pipe(find_partial_pc, postcodes)
    )
    return edent.append(sdent).reset_index()
def gpu_load_performance_csv(performance_path, **kwargs):
    """Loads performance data

    Returns
    -------
    GPU DataFrame
    """
    cols = [
        "loan_id", "monthly_reporting_period", "servicer", "interest_rate",
        "current_actual_upb", "loan_age", "remaining_months_to_legal_maturity",
        "adj_remaining_months_to_maturity", "maturity_date", "msa",
        "current_loan_delinquency_status", "mod_flag", "zero_balance_code",
        "zero_balance_effective_date", "last_paid_installment_date",
        "foreclosed_after", "disposition_date", "foreclosure_costs",
        "prop_preservation_and_repair_costs", "asset_recovery_costs",
        "misc_holding_expenses", "holding_taxes", "net_sale_proceeds",
        "credit_enhancement_proceeds", "repurchase_make_whole_proceeds",
        "other_foreclosure_proceeds", "non_interest_bearing_upb",
        "principal_forgiveness_upb", "repurchase_make_whole_proceeds_flag",
        "foreclosure_principal_write_off_amount", "servicing_activity_indicator",
    ]

    dtypes = OrderedDict([
        ("loan_id", "int64"),
        ("monthly_reporting_period", "date"),
        ("servicer", "category"),
        ("interest_rate", "float64"),
        ("current_actual_upb", "float64"),
        ("loan_age", "float64"),
        ("remaining_months_to_legal_maturity", "float64"),
        ("adj_remaining_months_to_maturity", "float64"),
        ("maturity_date", "date"),
        ("msa", "float64"),
        ("current_loan_delinquency_status", "int32"),
        ("mod_flag", "category"),
        ("zero_balance_code", "category"),
        ("zero_balance_effective_date", "date"),
        ("last_paid_installment_date", "date"),
        ("foreclosed_after", "date"),
        ("disposition_date", "date"),
        ("foreclosure_costs", "float64"),
        ("prop_preservation_and_repair_costs", "float64"),
        ("asset_recovery_costs", "float64"),
        ("misc_holding_expenses", "float64"),
        ("holding_taxes", "float64"),
        ("net_sale_proceeds", "float64"),
        ("credit_enhancement_proceeds", "float64"),
        ("repurchase_make_whole_proceeds", "float64"),
        ("other_foreclosure_proceeds", "float64"),
        ("non_interest_bearing_upb", "float64"),
        ("principal_forgiveness_upb", "float64"),
        ("repurchase_make_whole_proceeds_flag", "category"),
        ("foreclosure_principal_write_off_amount", "float64"),
        ("servicing_activity_indicator", "category"),
    ])

    print(performance_path)

    return cudf.read_csv(
        performance_path,
        names=cols,
        delimiter="|",
        dtype=list(dtypes.values()),
        skiprows=1,
    )
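# Hypothetical call: load one quarter of the mortgage performance data.
# The path below is an assumption for illustration, not part of the loader.
perf_gdf = gpu_load_performance_csv("perf/Performance_2000Q1.txt")
print(len(perf_gdf), perf_gdf.dtypes["loan_id"])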
def read(path, media_type, index_col=None, header="infer", sep=","):
    # TO REDO
    if media_type == "csv":
        df = cuDFManager.read_csv(path, index_col=index_col, header=header, sep=sep)
    else:
        raise NotImplementedError("cuDFManager.read_" + media_type)
    return df
def csv_writer_test(pdf):
    gdf = cudf.from_pandas(pdf)

    pd_buffer = pdf.to_csv()
    gd_buffer = gdf.to_csv()

    compare_content(pd_buffer, gd_buffer)

    actual = cudf.read_csv(StringIO(gd_buffer))
    expected = pd.read_csv(StringIO(pd_buffer))
    assert_eq(actual, expected)
def load_data(config: DictConfig) -> Tuple[cudf.DataFrame, cudf.DataFrame]:
    loading = cudf.read_csv(f"{config.store.workdir}/input/loading.csv")
    fnc = cudf.read_csv(f"{config.store.workdir}/input/fnc.csv")
    train_df = cudf.read_csv(f"{config.store.workdir}/input/train_scores.csv")
    submission = cudf.read_csv(
        f"{config.store.workdir}/input/sample_submission.csv"
    )

    train_df = train_df.merge(loading, on="Id", how="left")
    train_df = train_df.merge(fnc, on="Id", how="left")

    test_df = submission["Id"].str.split("_")[0].unique().astype(int)
    test_df = cudf.DataFrame({"Id": test_df})
    test_df = test_df.merge(loading, on="Id", how="left")
    test_df = test_df.merge(fnc, on="Id", how="left")

    # mean shift
    train_df["IC_20"] += 0.0022449734660541093

    # scaling
    train_df[fnc.columns[1:]] /= 500
    test_df[fnc.columns[1:]] /= 500

    return train_df, test_df
def _read_csv(fn, dtypes=None, **kwargs):
    try:
        cdf = cudf.read_csv(fn, **kwargs)
    except GDFError:
        # end-of-file check: https://github.com/rapidsai/dask-cudf/issues/103
        # this should be removed when cuDF has better dtype/parse_date support
        dtypes = dict(zip(kwargs["names"], dtypes))
        df = dd.core.make_meta(dtypes)
        cdf = cudf.from_pandas(df)
    return cdf
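# Hedged sketch of how _read_csv might be wired into dask.delayed, with the
# dtypes kwarg feeding the empty-frame fallback on GDFError; the paths and
# column names are assumptions for illustration.
from dask import delayed

parts = [
    delayed(_read_csv)(
        fn,
        dtypes=["int32", "float64"],
        names=["key", "val"],
        header=None,
    )
    for fn in ["part0.csv", "part1.csv"]
]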