def test_dask_classifier(self, model, local_cuda_cluster: LocalCUDACluster) -> None:
    import dask_cudf

    with Client(local_cuda_cluster) as client:
        X_, y_, w_ = generate_array(with_weights=True)
        y_ = (y_ * 10).astype(np.int32)
        X = dask_cudf.from_dask_dataframe(dd.from_dask_array(X_))
        y = dask_cudf.from_dask_dataframe(dd.from_dask_array(y_))
        w = dask_cudf.from_dask_dataframe(dd.from_dask_array(w_))
        run_dask_classifier(X, y, w, model, client)
def test_categorical(local_cuda_cluster: LocalCUDACluster) -> None:
    with Client(local_cuda_cluster) as client:
        import dask_cudf

        X, y = make_categorical(client, 10000, 30, 13)
        X = dask_cudf.from_dask_dataframe(X)

        X_onehot, _ = make_categorical(client, 10000, 30, 13, True)
        X_onehot = dask_cudf.from_dask_dataframe(X_onehot)
        run_categorical(client, "gpu_hist", X, X_onehot, y)
def test_categorical(local_cuda_cluster: LocalCUDACluster) -> None:
    with Client(local_cuda_cluster) as client:
        import dask_cudf

        rounds = 10
        X, y = make_categorical(client, 10000, 30, 13)
        X = dask_cudf.from_dask_dataframe(X)

        X_onehot, _ = make_categorical(client, 10000, 30, 13, True)
        X_onehot = dask_cudf.from_dask_dataframe(X_onehot)

        parameters = {"tree_method": "gpu_hist"}

        m = dxgb.DaskDMatrix(client, X_onehot, y, enable_categorical=True)
        by_etl_results = dxgb.train(
            client,
            parameters,
            m,
            num_boost_round=rounds,
            evals=[(m, "Train")],
        )["history"]

        m = dxgb.DaskDMatrix(client, X, y, enable_categorical=True)
        output = dxgb.train(
            client,
            parameters,
            m,
            num_boost_round=rounds,
            evals=[(m, "Train")],
        )
        by_builtin_results = output["history"]

        np.testing.assert_allclose(
            np.array(by_etl_results["Train"]["rmse"]),
            np.array(by_builtin_results["Train"]["rmse"]),
            rtol=1e-3,
        )
        assert tm.non_increasing(by_builtin_results["Train"]["rmse"])

        model = output["booster"]
        with tempfile.TemporaryDirectory() as tempdir:
            path = os.path.join(tempdir, "model.json")
            model.save_model(path)
            with open(path, "r") as fd:
                categorical = json.load(fd)

            categories_sizes = np.array(
                categorical["learner"]["gradient_booster"]["model"]["trees"][-1][
                    "categories_sizes"
                ]
            )
            assert categories_sizes.shape[0] != 0
            np.testing.assert_allclose(categories_sizes, 1)
def main(client):
    import dask_cudf

    product_reviews_df = read_tables()
    product_reviews_df = product_reviews_df[
        product_reviews_df.pr_item_sk == q27_pr_item_sk
    ]

    sentences = product_reviews_df.map_partitions(
        create_sentences_from_reviews,
        review_column="pr_review_content",
        end_of_line_char=EOL_CHAR,
    )
    # need the global position in the sentence tokenized df
    sentences["x"] = 1
    sentences["sentence_tokenized_global_pos"] = sentences.x.cumsum()
    del sentences["x"]
    sentences = sentences.persist()
    wait(sentences)

    # Do the NER
    sentences = sentences.to_dask_dataframe()
    ner_parsed = sentences.map_partitions(ner_parser, "sentence")
    ner_parsed = dask_cudf.from_dask_dataframe(ner_parsed)
    ner_parsed = ner_parsed.persist()
    wait(ner_parsed)

    ner_parsed = ner_parsed[ner_parsed.company_name_list != ""]

    # separate NER results into one row per found company
    repeated_names = ner_parsed.map_partitions(
        create_words_from_sentences,
        sentence_column="company_name_list",
        global_position_column="sentence_tokenized_global_pos",
        delimiter="é",
    )

    # recombine
    recombined = repeated_names.merge(
        ner_parsed,
        how="left",
        left_on="sentence_idx_global_pos",
        right_on="sentence_tokenized_global_pos",
    )
    recombined["pr_item_sk"] = q27_pr_item_sk
    recombined = recombined[
        ["review_idx_global_pos", "pr_item_sk", "word", "sentence"]
    ]
    recombined = recombined.persist()
    wait(recombined)

    recombined = recombined.sort_values(
        ["review_idx_global_pos", "pr_item_sk", "word", "sentence"]
    ).persist()
    recombined.columns = ["review_sk", "item_sk", "company_name", "review_sentence"]
    recombined = recombined.persist()
    wait(recombined)

    return recombined
def test_roundtrip_from_dask_cudf(tmpdir, write_meta):
    tmpdir = str(tmpdir)
    gddf = dask_cudf.from_dask_dataframe(ddf)
    gddf.to_parquet(tmpdir, write_metadata_file=write_meta)

    gddf2 = dask_cudf.read_parquet(tmpdir)
    dd.assert_eq(gddf, gddf2, check_divisions=write_meta)
def dask_gpu_parquet_ingest(self, target_files, columns=None):
    if self.rapids_version < 15:
        # rapids 0.14 has a known issue with read_parquet
        # https://github.com/rapidsai/cudf/issues/5579
        return dask_cudf.from_dask_dataframe(
            self.dask_cpu_parquet_ingest(target_files, columns=columns))
    else:
        return dask_cudf.read_parquet(target_files, columns=columns)
def _move_ddf(self, destination):
    """Move the collection between cpu and gpu memory."""
    _ddf = self._ddf
    if (
        self.moved_collection
        and isinstance(_ddf.dask, HighLevelGraph)
        and hasattr(_ddf.dask, "key_dependencies")
    ):
        # If our collection has already been moved, and if the
        # underlying graph is a `HighLevelGraph`, we can just
        # drop the last "from_pandas-..." layer if the current
        # destination is "cpu", or we can drop the last
        # "to_pandas-..." layer if the destination is "gpu".
        search_name = "from_pandas-" if destination == "cpu" else "to_pandas-"
        pandas_conversion_layer = None
        pandas_conversion_dep = None
        for k, v in _ddf.dask.dependents.items():
            if k.startswith(search_name) and v == set():
                pandas_conversion_layer = k
                break
        if pandas_conversion_layer:
            deps = [d for d in _ddf.dask.dependencies[pandas_conversion_layer]]
            if len(deps) == 1:
                pandas_conversion_dep = deps[0]
        if pandas_conversion_layer and pandas_conversion_dep:
            # We have met the criteria to remove the last "from/to_pandas-" layer
            new_layers = {
                k: v
                for k, v in _ddf.dask.layers.items()
                if k != pandas_conversion_layer
            }
            new_deps = {
                k: v
                for k, v in _ddf.dask.dependencies.items()
                if k != pandas_conversion_layer
            }
            hlg = HighLevelGraph(
                layers=new_layers,
                dependencies=new_deps,
                key_dependencies=_ddf.dask.key_dependencies,
            )
            _meta = (
                _ddf._meta.to_pandas()
                if destination == "cpu"
                else cudf.from_pandas(_ddf._meta)
            )
            return new_dd_object(hlg, pandas_conversion_dep, _meta, _ddf.divisions)

    if destination == "cpu":
        # Just extend the existing graph to move the collection to cpu
        return _ddf.to_dask_dataframe()
    elif destination == "gpu":
        # Just extend the existing graph to move the collection to gpu
        return dask_cudf.from_dask_dataframe(_ddf)
    else:
        raise ValueError(f"destination {destination} not recognized.")
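# Hedged sketch (not part of the original source): the cpu/gpu round trip that
# `_move_ddf` above optimizes, written with the public dask / dask_cudf API.
# The toy frame and partition count are illustrative assumptions.
import dask.dataframe as dd
import dask_cudf
import pandas as pd

pdf = pd.DataFrame({"a": [1, 2, 3, 4], "b": [0.1, 0.2, 0.3, 0.4]})
ddf = dd.from_pandas(pdf, npartitions=2)

# cpu -> gpu appends a conversion layer to the task graph ...
gddf = dask_cudf.from_dask_dataframe(ddf)
# ... and gpu -> cpu appends another conversion layer on top of it.
ddf_again = gddf.to_dask_dataframe()

# `_move_ddf` avoids stacking both conversions by instead dropping the
# trailing "from_pandas-*" / "to_pandas-*" layer from the HighLevelGraph
# when the collection was already moved once.
print(ddf_again.compute())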
def test_roundtrip_from_dask_cudf(tmpdir):
    tmpdir = str(tmpdir)
    gddf = dask_cudf.from_dask_dataframe(ddf)
    gddf.to_parquet(tmpdir)

    # NOTE: Need `.compute()` to resolve correct index
    # name after `from_dask_dataframe`
    gddf2 = dask_cudf.read_parquet(tmpdir)
    assert_eq(gddf.compute(), gddf2)
def test_from_dask_dataframe():
    np.random.seed(0)
    df = pd.DataFrame(
        {"x": np.random.randint(0, 5, size=20), "y": np.random.normal(size=20)}
    )
    ddf = dd.from_pandas(df, npartitions=2)
    dgdf = dgd.from_dask_dataframe(ddf)
    got = dgdf.compute().to_pandas()
    expect = df

    np.testing.assert_array_equal(got.index.values, expect.index.values)
    np.testing.assert_array_equal(got.x.values, expect.x.values)
    np.testing.assert_array_equal(got.y.values, expect.y.values)
def using_quantile_device_dmatrix(client: Client, X, y):
    '''`DaskDeviceQuantileDMatrix` is a data type specialized for the `gpu_hist`
    tree method that reduces memory overhead.  When training on a GPU pipeline,
    it's preferred over `DaskDMatrix`.

    .. versionadded:: 1.2.0

    '''
    # Input must be on GPU for `DaskDeviceQuantileDMatrix`.
    X = dask_cudf.from_dask_dataframe(dd.from_dask_array(X))
    y = dask_cudf.from_dask_dataframe(dd.from_dask_array(y))

    # `DaskDeviceQuantileDMatrix` is used instead of `DaskDMatrix`; be careful
    # that it cannot be used for anything other than training.
    dtrain = dxgb.DaskDeviceQuantileDMatrix(client, X, y)
    output = xgb.dask.train(
        client,
        {'verbosity': 2, 'tree_method': 'gpu_hist'},
        dtrain,
        num_boost_round=4,
    )

    prediction = xgb.dask.predict(client, output, X)
    return prediction
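# Hedged usage sketch for `using_quantile_device_dmatrix` above.  The cluster
# setup and the random array shapes are assumptions for illustration only.
from dask import array as da
from dask.distributed import Client
from dask_cuda import LocalCUDACluster

if __name__ == "__main__":
    with LocalCUDACluster(n_workers=1) as cluster:
        with Client(cluster) as client:
            # Toy data as dask arrays; the function converts them to
            # dask_cudf collections before building the quantile DMatrix.
            X = da.random.random(size=(50_000, 20), chunks=(10_000, 20))
            y = da.random.random(size=(50_000,), chunks=(10_000,))
            print(using_quantile_device_dmatrix(client, X, y).compute())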
def test_set_index(nelem):
    np.random.seed(0)
    # Use unique index range as the sort may not be stable-ordering
    x = np.arange(nelem)
    np.random.shuffle(x)
    df = pd.DataFrame({'x': x, 'y': np.random.randint(0, nelem, size=nelem)})
    ddf = dd.from_pandas(df, npartitions=2)
    dgdf = dgd.from_dask_dataframe(ddf)
    expect = ddf.set_index('x').compute()
    got = dgdf.set_index('x').compute().to_pandas()

    np.testing.assert_array_equal(got.index.values, expect.index.values)
    np.testing.assert_array_equal(got.y.values, expect.y.values)
    assert got.columns == expect.columns
def to_dc(
    self,
    input_item: Any,
    table_name: str,
    format: str = None,
    gpu: bool = False,
    **kwargs,
):
    if gpu:  # pragma: no cover
        try:
            import dask_cudf
        except ImportError:
            raise ModuleNotFoundError(
                "Setting `gpu=True` for table creation requires dask_cudf"
            )

        if not isinstance(input_item, dask_cudf.DataFrame):
            input_item = dask_cudf.from_dask_dataframe(input_item, **kwargs)
    return input_item
def test_groupby_categorical_key():
    # See https://github.com/rapidsai/cudf/issues/4608
    df = dask.datasets.timeseries()
    gddf = dask_cudf.from_dask_dataframe(df)
    gddf["name"] = gddf["name"].astype("category")
    ddf = gddf.to_dask_dataframe()
    got = (
        gddf.groupby("name")
        .agg({"x": ["mean", "max"], "y": ["mean", "count"]})
        .compute()
    )
    expect = (
        ddf.groupby("name")
        .agg({"x": ["mean", "max"], "y": ["mean", "count"]})
        .compute()
    )
    dd.assert_eq(expect, got)
def process(self, inputs):
    """
    Generate fake data for classification.

    Arguments
    -------
    inputs: list
        empty list

    Returns
    -------
    cudf.DataFrame
    """
    output = {}

    def get_cudf(offset=None):
        conf = copy.copy(self.conf)
        if 'n_parts' in conf:
            del conf['n_parts']
        x, y = cuml.datasets.make_classification(**conf)
        df = cudf.DataFrame(
            {'x' + str(i): x[:, i] for i in range(x.shape[1])})
        df['y'] = y
        if offset is not None:
            df.index += offset
        return df

    if self.outport_connected(CUDF_PORT_NAME):
        df = get_cudf()
        output.update({CUDF_PORT_NAME: df})

    if self.outport_connected(DASK_CUDF_PORT_NAME):

        def mapfun(x):
            return x.get()

        x, y = cuml.dask.datasets.classification.make_classification(
            **self.conf)
        ddf = x.map_blocks(mapfun, dtype=x.dtype).to_dask_dataframe()
        out = dask_cudf.from_dask_dataframe(ddf)
        out.columns = ['x' + str(i) for i in range(x.shape[1])]
        out['y'] = y.astype('int64')
        output.update({DASK_CUDF_PORT_NAME: out})

    return output
def test_take(nelem, nparts):
    np.random.seed(0)
    # Use unique index range as the sort may not be stable-ordering
    x = np.random.randint(0, nelem, size=nelem)
    y = np.random.random(nelem)
    selected = np.random.randint(0, nelem - 1, size=nelem // 2)
    df = pd.DataFrame({'x': x, 'y': y})

    ddf = dd.from_pandas(df, npartitions=nparts)
    dgdf = dgd.from_dask_dataframe(ddf)
    out = dgdf.take(gd.Series(selected), npartitions=5)
    got = out.compute().to_pandas()

    expect = df.take(selected)
    assert 1 < out.npartitions <= 5
    np.testing.assert_array_equal(got.index, np.arange(len(got)))
    np.testing.assert_array_equal(got.x, expect.x)
    np.testing.assert_array_equal(got.y, expect.y)
def process(self, inputs):
    input_df = inputs[self.INPUT_PORT_NAME]
    bst_model = inputs[self.INPUT_PORT_MODEL_NAME]
    input_meta = self.get_input_meta()
    required_cols = input_meta[self.INPUT_PORT_MODEL_NAME]['train']
    required_cols = list(required_cols.keys())
    # required_cols.sort()
    predict_col = self.conf.get('prediction', 'predict')
    pred_contribs: bool = self.conf.get('pred_contribs', False)
    if isinstance(input_df, dask_cudf.DataFrame):
        # get the client
        client = dask.distributed.client.default_client()
        dtrain = xgb.dask.DaskDMatrix(client, input_df[required_cols])
        prediction = xgb.dask.predict(client,
                                      bst_model,
                                      dtrain,
                                      pred_contribs=pred_contribs)
        pred_df = dask_cudf.from_dask_dataframe(
            prediction.to_dask_dataframe())
        pred_df.index = input_df.index
        if not pred_contribs:
            input_df[predict_col] = pred_df
        else:
            input_df = pred_df
    else:
        infer_dmatrix = xgb.DMatrix(input_df[required_cols])
        if not pred_contribs:
            prediction = cudf.Series(bst_model.predict(infer_dmatrix),
                                     nan_as_null=False,
                                     index=input_df.index)
            input_df[predict_col] = prediction
        else:
            prediction = cudf.DataFrame(
                bst_model.predict(infer_dmatrix,
                                  pred_contribs=pred_contribs),
                index=input_df.index)
            input_df = prediction
    return {self.OUTPUT_PORT_NAME: input_df}
def gpu_training_df(c):
    if dask_cudf:
        df = timeseries(freq="1d").reset_index(drop=True)
        df = dask_cudf.from_dask_dataframe(df)
        c.create_table("timeseries", input_table=df)
    return None
        df = sp.GeoDataFrame(*args, **kwargs)
    else:
        df = pd.DataFrame(*args, **kwargs)
    return dd.from_pandas(df, npartitions=2)


try:
    import cudf
    import cupy
    import dask_cudf

    if test_gpu is False:
        # GPU testing disabled even though cudf/cupy are available
        raise ImportError

    ddfs = [_ddf, dask_cudf.from_dask_dataframe(_ddf)]

    def dask_cudf_DataFrame(*args, **kwargs):
        assert not kwargs.pop("geo", False)
        cdf = cudf.DataFrame.from_pandas(
            pd.DataFrame(*args, **kwargs), nan_as_null=False
        )
        return dask_cudf.from_cudf(cdf, npartitions=2)

    DataFrames = [dask_DataFrame, dask_cudf_DataFrame]
except ImportError:
    cudf = cupy = dask_cudf = None
    ddfs = [_ddf]
    DataFrames = [dask_DataFrame]
    dask_cudf_DataFrame = None
def main(data_dir, client, bc, config):
    benchmark(read_tables, data_dir, bc, dask_profile=config["dask_profile"])

    import dask_cudf

    query = f"""
        SELECT pr_review_sk, pr_item_sk, pr_review_content
        FROM product_reviews
        WHERE pr_item_sk = {q27_pr_item_sk}
    """
    product_reviews_df = bc.sql(query)

    sentences = product_reviews_df.map_partitions(
        create_sentences_from_reviews,
        review_column="pr_review_content",
        end_of_line_char=EOL_CHAR,
    )
    # need the global position in the sentence tokenized df
    sentences["x"] = 1
    sentences["sentence_tokenized_global_pos"] = sentences.x.cumsum()
    del sentences["x"]
    del product_reviews_df

    # Do the NER
    sentences = sentences.to_dask_dataframe()
    ner_parsed = sentences.map_partitions(ner_parser, "sentence")
    ner_parsed = dask_cudf.from_dask_dataframe(ner_parsed)
    ner_parsed = ner_parsed.persist()
    wait(ner_parsed)

    ner_parsed = ner_parsed[ner_parsed.company_name_list != ""]

    # separate NER results into one row per found company
    repeated_names = ner_parsed.map_partitions(
        create_words_from_sentences,
        sentence_column="company_name_list",
        global_position_column="sentence_tokenized_global_pos",
        delimiter="é",
    )
    del sentences

    # recombine
    repeated_names = repeated_names.persist()
    wait(repeated_names)
    bc.create_table('repeated_names', repeated_names)

    ner_parsed = ner_parsed.persist()
    wait(ner_parsed)
    bc.create_table('ner_parsed', ner_parsed)

    query = f"""
        SELECT review_idx_global_pos as review_sk,
            CAST({q27_pr_item_sk} AS BIGINT) as item_sk,
            word as company_name,
            sentence as review_sentence
        FROM repeated_names LEFT JOIN ner_parsed
        ON sentence_idx_global_pos = sentence_tokenized_global_pos
        ORDER BY review_idx_global_pos, item_sk, word, sentence
    """
    recombined = bc.sql(query)

    bc.drop_table("repeated_names")
    bc.drop_table("ner_parsed")
    del ner_parsed
    del repeated_names

    return recombined
def __init__(
    self,
    path_or_source,
    engine=None,
    part_size=None,
    part_mem_fraction=None,
    storage_options=None,
    dtypes=None,
    **kwargs,
):
    self.dtypes = dtypes
    if isinstance(
        path_or_source, (dask.dataframe.DataFrame, cudf.DataFrame, pd.DataFrame)
    ):
        # User is passing in a <dask.dataframe|cudf|pd>.DataFrame
        # Use DataFrameDatasetEngine
        if isinstance(path_or_source, cudf.DataFrame):
            path_or_source = dask_cudf.from_cudf(path_or_source, npartitions=1)
        elif isinstance(path_or_source, pd.DataFrame):
            path_or_source = dask_cudf.from_cudf(
                cudf.from_pandas(path_or_source), npartitions=1
            )
        elif not isinstance(path_or_source, dask_cudf.DataFrame):
            path_or_source = dask_cudf.from_dask_dataframe(path_or_source)
        if part_size:
            warnings.warn("part_size is ignored for DataFrame input.")
        if part_mem_fraction:
            warnings.warn("part_mem_fraction is ignored for DataFrame input.")
        self.engine = DataFrameDatasetEngine(path_or_source)
    else:
        if part_size:
            # If a specific partition size is given, use it directly
            part_size = parse_bytes(part_size)
        else:
            # If a fractional partition size is given, calculate part_size
            part_mem_fraction = part_mem_fraction or 0.125
            assert part_mem_fraction > 0.0 and part_mem_fraction < 1.0
            if part_mem_fraction > 0.25:
                warnings.warn(
                    "Using very large partition sizes for Dask. "
                    "Memory-related errors are likely."
                )
            part_size = int(device_mem_size(kind="total") * part_mem_fraction)

        # Engine-agnostic path handling
        paths = path_or_source
        if hasattr(paths, "name"):
            paths = stringify_path(paths)
        if isinstance(paths, str):
            paths = [paths]

        storage_options = storage_options or {}
        # If engine is not provided, try to infer from end of paths[0]
        if engine is None:
            engine = paths[0].split(".")[-1]
        if isinstance(engine, str):
            if engine == "parquet":
                self.engine = ParquetDatasetEngine(
                    paths, part_size, storage_options=storage_options, **kwargs
                )
            elif engine == "csv":
                self.engine = CSVDatasetEngine(
                    paths, part_size, storage_options=storage_options, **kwargs
                )
            else:
                raise ValueError("Only parquet and csv supported (for now).")
        else:
            self.engine = engine(paths, part_size, storage_options=storage_options)
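# Hedged illustration of the partition-size rule used above (the numbers are
# assumed, not taken from the original): with a 32 GiB device and the default
# part_mem_fraction of 0.125, each partition targets 4 GiB.
total_device_memory = 32 * 1024**3  # assumed result of device_mem_size(kind="total")
part_mem_fraction = 0.125           # default used in __init__ above
part_size = int(total_device_memory * part_mem_fraction)
assert part_size == 4 * 1024**3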
def process(self, inputs):
    """
    The process does the following things:
        1. split the data into training and testing based on the provided
           conf['train_date']. If it is not provided, all the data is
           treated as training data.
        2. train an XGBoost model based on the training data
        3. make predictions for all the data points, including training and
           testing.
        4. from the predicted returns, compute the trading signals that can
           be used in the backtesting.

    Arguments
    -------
    inputs: list
        list of input dataframes.

    Returns
    -------
    dataframe
    """
    dxgb_params = {
        'max_depth': 8,
        'max_leaves': 2**8,
        'tree_method': 'gpu_hist',
        'objective': 'reg:squarederror',
        'grow_policy': 'lossguide',
    }
    # num_of_rounds = 100
    if 'xgboost_parameters' in self.conf:
        dxgb_params.update(self.conf['xgboost_parameters'])
    input_df = inputs[self.INPUT_PORT_NAME]
    model_df = input_df
    train_cols = set(model_df.columns) - set(self.conf['no_feature'])
    train_cols = list(train_cols - set([self.conf['target']]))

    if isinstance(input_df, dask_cudf.DataFrame):
        # get the client
        client = dask.distributed.client.default_client()
        if 'train_date' in self.conf:
            train_date = datetime.datetime.strptime(  # noqa: F841
                self.conf['train_date'], '%Y-%m-%d')
            model_df = model_df[model_df.datetime < train_date]
        train = model_df[train_cols]
        target = model_df[self.conf['target']]
        dmatrix = xgb.dask.DaskDMatrix(client, train, label=target)
        bst = xgb.dask.train(client, dxgb_params, dmatrix,
                             num_boost_round=self.conf["num_of_rounds"])
        dtrain = xgb.dask.DaskDMatrix(client, input_df[train_cols])
        prediction = xgb.dask.predict(client, bst, dtrain)
        pred_df = dask_cudf.from_dask_dataframe(
            prediction.to_dask_dataframe())
        pred_df.index = input_df.index
        input_df['signal'] = pred_df
    elif isinstance(input_df, cudf.DataFrame):
        if 'train_date' in self.conf:
            train_date = datetime.datetime.strptime(  # noqa: F841
                self.conf['train_date'], '%Y-%m-%d')
            model_df = model_df.query('datetime<@train_date')
        train = model_df[train_cols]
        target = model_df[self.conf['target']]
        dmatrix = xgb.DMatrix(train, label=target)
        bst = xgb.train(dxgb_params, dmatrix,
                        num_boost_round=self.conf["num_of_rounds"])
        infer_dmatrix = xgb.DMatrix(input_df[train_cols])
        prediction = cudf.Series(bst.predict(infer_dmatrix),
                                 nan_as_null=False,
                                 index=input_df.index).astype('float64')
        input_df['signal'] = prediction

    input_df['tmp'] = (input_df['asset'] -
                       input_df['asset'].shift(1)).fillna(1)
    input_df['tmp'] = (input_df['tmp'] != 0).astype('int32')
    tmp = input_df['tmp']
    input_df['tmp'] = tmp.where(tmp != 1, None)
    input_df = input_df.dropna(subset=['tmp'])
    input_df = input_df.drop('tmp', axis=1)

    # convert the signal to trading action
    # 1 is buy and -1 is sell
    # The model predicts tomorrow's return (shift -1), so we shift the
    # trading action by 1 so that it acts on the second day.
    input_df['signal'] = ((input_df['signal'] >= 0).astype('float') * 2
                          - 1).shift(1)

    # remove the bad datapoints
    input_df = input_df.dropna()
    remaining = list(self.conf['no_feature']) + ['signal']
    return {self.OUTPUT_PORT_NAME: input_df[remaining]}
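# Hedged example of the `conf` dict this node reads. The field names are taken
# from the code above; the concrete values and column names are assumptions
# for illustration only.
conf = {
    "train_date": "2019-01-01",           # rows before this date are used for training
    "target": "returns",                  # hypothetical name of the target column
    "no_feature": ["asset", "datetime"],  # columns excluded from the feature set
    "num_of_rounds": 100,
    "xgboost_parameters": {"max_depth": 6},  # merged into dxgb_params
}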
def __init__(
    self,
    path_or_source,
    engine=None,
    part_size=None,
    part_mem_fraction=None,
    storage_options=None,
    dtypes=None,
    client=None,
    cpu=None,
    base_dataset=None,
    **kwargs,
):
    self.dtypes = dtypes
    self.client = client

    # Check if we are keeping data in cpu memory
    self.cpu = cpu or False

    # Keep track of base dataset (optional)
    self.base_dataset = base_dataset or self

    # For now, let's warn the user that "cpu mode" is experimental
    if self.cpu:
        warnings.warn(
            "Initializing an NVTabular Dataset in CPU mode. "
            "This is an experimental feature with extremely limited support!"
        )

    if isinstance(
        path_or_source, (dask.dataframe.DataFrame, cudf.DataFrame, pd.DataFrame)
    ):
        # User is passing in a <dask.dataframe|cudf|pd>.DataFrame
        # Use DataFrameDatasetEngine
        moved_collection = (
            False  # Whether a pd-backed collection was moved to cudf (or vice versa)
        )
        if self.cpu:
            if isinstance(path_or_source, pd.DataFrame):
                # Convert pandas DataFrame to pandas-backed dask.dataframe.DataFrame
                path_or_source = dask.dataframe.from_pandas(path_or_source, npartitions=1)
            elif isinstance(path_or_source, cudf.DataFrame):
                # Convert cudf DataFrame to pandas-backed dask.dataframe.DataFrame
                path_or_source = dask.dataframe.from_pandas(
                    path_or_source.to_pandas(), npartitions=1
                )
            elif isinstance(path_or_source, dask_cudf.DataFrame):
                # Convert dask_cudf DataFrame to pandas-backed dask.dataframe.DataFrame
                path_or_source = path_or_source.to_dask_dataframe()
                moved_collection = True
        else:
            if isinstance(path_or_source, cudf.DataFrame):
                # Convert cudf DataFrame to dask_cudf.DataFrame
                path_or_source = dask_cudf.from_cudf(path_or_source, npartitions=1)
            elif isinstance(path_or_source, pd.DataFrame):
                # Convert pandas DataFrame to dask_cudf.DataFrame
                path_or_source = dask_cudf.from_cudf(
                    cudf.from_pandas(path_or_source), npartitions=1
                )
            elif not isinstance(path_or_source, dask_cudf.DataFrame):
                # Convert dask.dataframe.DataFrame to dask_cudf.DataFrame
                path_or_source = dask_cudf.from_dask_dataframe(path_or_source)
                moved_collection = True
        if part_size:
            warnings.warn("part_size is ignored for DataFrame input.")
        if part_mem_fraction:
            warnings.warn("part_mem_fraction is ignored for DataFrame input.")
        self.engine = DataFrameDatasetEngine(
            path_or_source, cpu=self.cpu, moved_collection=moved_collection
        )
    else:
        if part_size:
            # If a specific partition size is given, use it directly
            part_size = parse_bytes(part_size)
        else:
            # If a fractional partition size is given, calculate part_size
            part_mem_fraction = part_mem_fraction or 0.125
            assert 0.0 < part_mem_fraction < 1.0
            if part_mem_fraction > 0.25:
                warnings.warn(
                    "Using very large partition sizes for Dask. "
                    "Memory-related errors are likely."
                )
            part_size = int(device_mem_size(kind="total") * part_mem_fraction)

        # Engine-agnostic path handling
        paths = path_or_source
        if hasattr(paths, "name"):
            paths = stringify_path(paths)
        if isinstance(paths, str):
            paths = [paths]
        paths = sorted(paths, key=natural_sort_key)

        storage_options = storage_options or {}
        # If engine is not provided, try to infer from end of paths[0]
        if engine is None:
            engine = paths[0].split(".")[-1]
        if isinstance(engine, str):
            if engine == "parquet":
                self.engine = ParquetDatasetEngine(
                    paths, part_size, storage_options=storage_options, cpu=self.cpu, **kwargs
                )
            elif engine == "csv":
                self.engine = CSVDatasetEngine(
                    paths, part_size, storage_options=storage_options, cpu=self.cpu, **kwargs
                )
            elif engine == "avro":
                try:
                    from .avro import AvroDatasetEngine
                except ImportError as e:
                    raise RuntimeError(
                        "Failed to import AvroDatasetEngine. Make sure uavro is installed."
                    ) from e

                self.engine = AvroDatasetEngine(
                    paths, part_size, storage_options=storage_options, cpu=self.cpu, **kwargs
                )
            else:
                raise ValueError("Only parquet, csv, and avro supported (for now).")
        else:
            self.engine = engine(
                paths, part_size, cpu=self.cpu, storage_options=storage_options
            )
def _(embedding, n_pca, self):
    embedding = dask_cudf.from_dask_dataframe(embedding)
    return _gpu_cluster_wrapper(embedding, n_pca, self)