def _create_model(self, model_func,
                  client,
                  workers,
                  n_estimators,
                  base_seed,
                  ignore_empty_partitions,
                  **kwargs):

    self.client = get_client(client)
    if workers is None:
        # Default to all workers
        workers = self.client.scheduler_info()['workers'].keys()
    self.workers = workers
    self._set_internal_model(None)
    self.active_workers = list()
    self.ignore_empty_partitions = ignore_empty_partitions
    self.n_estimators = n_estimators

    self.n_estimators_per_worker = \
        self._estimators_per_worker(n_estimators)
    if base_seed is None:
        base_seed = 0
    seeds = [base_seed]
    for i in range(1, len(self.n_estimators_per_worker)):
        sd = self.n_estimators_per_worker[i - 1] + seeds[i - 1]
        seeds.append(sd)

    self.rfs = {
        worker: self.client.submit(
            model_func,
            n_estimators=self.n_estimators_per_worker[n],
            random_state=seeds[n],
            **kwargs,
            pure=False,
            workers=[worker],
        )
        for n, worker in enumerate(self.workers)
    }

    wait_and_raise_from_futures(list(self.rfs.values()))
def fit(self, X):
    """
    Fit a multi-node multi-GPU KMeans model

    Parameters
    ----------
    X : Dask cuDF DataFrame or CuPy backed Dask Array
        Training data to cluster.
    """

    data = DistributedDataHandler.create(X, client=self.client)
    self.datatype = data.datatype

    comms = Comms(comms_p2p=False)
    comms.init(workers=data.workers)

    kmeans_fit = [self.client.submit(KMeans._func_fit,
                                     comms.sessionId,
                                     wf[1],
                                     self.datatype,
                                     **self.kwargs,
                                     workers=[wf[0]],
                                     pure=False)
                  for idx, wf in enumerate(data.worker_to_parts.items())]

    wait_and_raise_from_futures(kmeans_fit)

    comms.destroy()

    self._set_internal_model(kmeans_fit[0])

    return self
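# Illustrative usage sketch (not part of the library source): calling the
# multi-node multi-GPU KMeans fit above from client code. Assumes a dask_cuda
# LocalCUDACluster is available; shapes, chunking and n_clusters are made up.
import cupy as cp
import dask.array as da
from dask.distributed import Client
from dask_cuda import LocalCUDACluster
from cuml.dask.cluster import KMeans

if __name__ == "__main__":
    client = Client(LocalCUDACluster())

    # CuPy-backed Dask array, one chunk per partition handled by a worker
    X = (da.random.random((10000, 20), chunks=(2500, 20))
         .astype("float32").map_blocks(cp.asarray))

    model = KMeans(client=client, n_clusters=8)
    model.fit(X)
    labels = model.predict(X)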
def _check_internal_model(model):
    """
    Performs a brief validation that a model meets the requirements to be
    set as an `internal_model`

    Parameters
    ----------
    model : distributed.client.Future[cuml.Base], cuml.Base, or None

    Returns
    -------
    model : distributed.client.Future[cuml.Base], cuml.Base, or None
    """
    if isinstance(model, Iterable):
        # If model is iterable, just grab the first
        model = first(model)

    if isinstance(model, Future):
        if model.type is None:
            wait_and_raise_from_futures([model])

        if not issubclass(model.type, Base):
            raise ValueError("Dask Future expected to contain cuml.Base "
                             "but found %s instead." % model.type)

    elif model is not None and not isinstance(model, Base):
        raise ValueError("Expected model of type cuml.Base but found %s "
                         "instead." % type(model))
    return model
def _create_model(self, model_func,
                  client,
                  workers,
                  n_estimators,
                  base_seed,
                  **kwargs):

    self.client = get_client(client)
    self.workers = self.client.scheduler_info()['workers'].keys()

    self.local_model = None

    self.n_estimators_per_worker = \
        self._estimators_per_worker(n_estimators)
    if base_seed is None:
        base_seed = 0
    seeds = [base_seed]
    for i in range(1, len(self.n_estimators_per_worker)):
        sd = self.n_estimators_per_worker[i - 1] + seeds[i - 1]
        seeds.append(sd)

    self.rfs = {
        worker: self.client.submit(
            model_func,
            n_estimators=self.n_estimators_per_worker[n],
            seed=seeds[n],
            **kwargs,
            pure=False,
            workers=[worker],
        )
        for n, worker in enumerate(self.workers)
    }

    wait_and_raise_from_futures(list(self.rfs.values()))
def _fit(self, model, dataset, convert_dtype, broadcast_data):
    data = DistributedDataHandler.create(dataset, client=self.client)
    self.active_workers = data.workers
    self.datatype = data.datatype
    if self.datatype == 'cudf':
        has_float64 = (dataset[0].dtypes == np.float64).any()
    else:
        has_float64 = (dataset[0].dtype == np.float64)
    if has_float64:
        raise TypeError("To use Dask RF, data should have dtype float32.")

    labels = self.client.persist(dataset[1])
    if self.datatype == 'cudf':
        self.num_classes = len(labels.unique())
    else:
        self.num_classes = \
            len(dask.array.unique(labels).compute())

    combined_data = list(map(lambda x: x[1], data.gpu_futures)) \
        if broadcast_data else None

    futures = list()
    for idx, (worker, worker_data) in \
            enumerate(data.worker_to_parts.items()):
        futures.append(
            self.client.submit(
                _func_fit,
                model[worker],
                combined_data if broadcast_data else worker_data,
                convert_dtype,
                workers=[worker],
                pure=False)
        )

    self.n_active_estimators_per_worker = []
    for worker in data.worker_to_parts.keys():
        n = self.workers.index(worker)
        n_est = self.n_estimators_per_worker[n]
        self.n_active_estimators_per_worker.append(n_est)

    if len(self.workers) > len(self.active_workers):
        if self.ignore_empty_partitions:
            current_estimators = self.n_estimators / \
                len(self.workers) * \
                len(self.active_workers)
            warn_text = (
                f"Data was not split among all workers; "
                f"using only {len(self.active_workers)} workers to fit. "
                f"This will only train {current_estimators}"
                f" estimators instead of the requested "
                f"{self.n_estimators}."
            )
            warnings.warn(warn_text)
        else:
            raise ValueError("Data was not split among all workers. "
                             "Re-run the code or "
                             "use ignore_empty_partitions=True"
                             " while creating model")
    wait_and_raise_from_futures(futures)
    return self
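# Illustrative usage sketch (not part of the library source): the float32
# requirement and ignore_empty_partitions behaviour enforced in _fit above, as
# seen from the public cuml.dask.ensemble.RandomForestClassifier API. Assumes
# a dask_cuda cluster; shapes, chunking and hyperparameters are made up.
import cupy as cp
import dask.array as da
from dask.distributed import Client
from dask_cuda import LocalCUDACluster
from cuml.dask.ensemble import RandomForestClassifier

if __name__ == "__main__":
    client = Client(LocalCUDACluster())

    # Dask RF requires float32 features; float64 triggers the TypeError above
    X = (da.random.random((8000, 16), chunks=(2000, 16))
         .astype("float32").map_blocks(cp.asarray))
    y = (da.random.randint(0, 2, size=(8000,), chunks=(2000,))
         .astype("int32").map_blocks(cp.asarray))

    # ignore_empty_partitions=True turns the "data was not split among all
    # workers" error into a warning and trains proportionally fewer trees
    model = RandomForestClassifier(n_estimators=100,
                                   ignore_empty_partitions=True,
                                   client=client)
    model.fit(X, y)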
def _set_params(self, **params):
    model_params = list()
    for idx, worker in enumerate(self.workers):
        model_params.append(
            self.client.submit(_func_set_params,
                               self.rfs[worker],
                               **params,
                               workers=[worker])
        )
    wait_and_raise_from_futures(model_params)
    return self
def _query_models(self, n_neighbors,
                  comms, nn_models,
                  index_handler, query_handler):

    worker_info = comms.worker_info(comms.worker_addresses)

    """
    Build inputs and outputs
    """
    index_handler.calculate_parts_to_sizes(comms=comms)
    query_handler.calculate_parts_to_sizes(comms=comms)

    idx_parts_to_ranks, _ = parts_to_ranks(self.client,
                                           worker_info,
                                           index_handler.gpu_futures)

    query_parts_to_ranks, _ = parts_to_ranks(self.client,
                                             worker_info,
                                             query_handler.gpu_futures)

    """
    Invoke kneighbors on Dask workers to perform distributed query
    """
    key = uuid1()
    nn_fit = dict([(worker_info[worker]["rank"], self.client.submit(
        NearestNeighbors._func_kneighbors,
        nn_models[worker],
        index_handler.worker_to_parts[worker] if
        worker in index_handler.workers else [],
        index_handler.total_rows,
        self.n_cols,
        idx_parts_to_ranks,
        query_handler.worker_to_parts[worker] if
        worker in query_handler.workers else [],
        query_handler.total_rows,
        query_parts_to_ranks,
        worker_info[worker]["rank"],
        n_neighbors,
        key="%s-%s" % (key, idx),
        workers=[worker]))
        for idx, worker in enumerate(comms.worker_addresses)])

    wait_and_raise_from_futures(list(nn_fit.values()))

    """
    Gather resulting partitions and return dask_cudfs
    """
    out_d_futures = flatten_grouped_results(self.client,
                                            query_parts_to_ranks,
                                            nn_fit,
                                            getter_func=_func_get_d)

    out_i_futures = flatten_grouped_results(self.client,
                                            query_parts_to_ranks,
                                            nn_fit,
                                            getter_func=_func_get_i)

    return nn_fit, out_d_futures, out_i_futures
def fit(self, X, y, classes=None):
    """
    Fit distributed Naive Bayes classifier model

    Parameters
    ----------
    X : dask.Array with blocks containing dense or sparse cupy arrays
    y : dask.Array with blocks containing cupy.ndarray
    classes : array-like containing unique class labels

    Returns
    -------
    cuml.dask.naive_bayes.MultinomialNB current model instance
    """

    # Only Dask.Array supported for now
    if not isinstance(X, dask.array.core.Array):
        raise ValueError("Only dask.Array is supported for X")

    if not isinstance(y, dask.array.core.Array):
        raise ValueError("Only dask.Array is supported for y")

    if len(X.chunks[1]) != 1:
        raise ValueError("X must be chunked by row only. "
                         "Multi-dimensional chunking is not supported")

    futures = DistributedDataHandler.create([X, y], self.client)

    classes = self._unique(y.map_blocks(
        MultinomialNB._unique).compute()) \
        if classes is None else classes

    models = [
        self.client.submit(
            self._fit,
            part,
            classes,
            self.kwargs,
            pure=False
        )
        for w, part in futures.gpu_futures
    ]

    models = reduce(
        models,
        self._merge_counts_to_model,
        client=self.client
    )

    models = self.client.submit(self._update_log_probs, models, pure=False)

    wait_and_raise_from_futures([models])

    self._set_internal_model(models)

    return self
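# Illustrative usage sketch (not part of the library source): fitting the
# distributed MultinomialNB above on a CuPy-backed Dask array of term counts.
# Assumes a dask_cuda cluster; the synthetic count matrix is made up.
import cupy as cp
import dask.array as da
from dask.distributed import Client
from dask_cuda import LocalCUDACluster
from cuml.dask.naive_bayes import MultinomialNB

if __name__ == "__main__":
    client = Client(LocalCUDACluster())

    # Row-wise chunking only, as enforced by the checks in fit above
    counts = cp.random.randint(0, 5, size=(1000, 50)).astype(cp.float32)
    labels = cp.random.randint(0, 3, size=(1000,)).astype(cp.int32)
    X = da.from_array(counts, chunks=(250, 50), asarray=False)
    y = da.from_array(labels, chunks=(250,), asarray=False)

    model = MultinomialNB(client=client)
    model.fit(X, y)
    preds = model.predict(X)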
def _print_detailed(self):
    """
    Print the summary of the forest used to train and test the model.
    """
    futures = list()
    for n, w in enumerate(self.workers):
        futures.append(
            self.client.submit(
                _print_detailed_func,
                self.rfs[w],
                workers=[w],
            )
        )
    wait_and_raise_from_futures(futures)
    return self
def fit(self, X):
    """
    Fit distributed TFIDF Transformer

    Parameters
    ----------
    X : dask.Array with blocks containing dense or sparse cupy arrays

    Returns
    -------
    cuml.dask.feature_extraction.text.TfidfTransformer instance
    """

    # Only Dask.Array supported for now
    if not isinstance(X, dask.array.core.Array):
        raise ValueError("Only dask.Array is supported for X")

    if len(X.chunks[1]) != 1:
        raise ValueError(
            "X must be chunked by row only. "
            "Multi-dimensional chunking is not supported"
        )

    # We don't do anything if we don't need idf
    if not self.internal_model.use_idf:
        return self

    futures = DistributedDataHandler.create(X, self.client)

    models = [
        self.client.submit(
            self._set_doc_stats, part, self.kwargs, pure=False
        )
        for w, part in futures.gpu_futures
    ]

    models = reduce(models, self._merge_stats_to_model, client=self.client)

    wait_and_raise_from_futures([models])

    models = self.client.submit(self._set_idf_diag, models, pure=False)

    wait_and_raise_from_futures([models])

    self._set_internal_model(models)

    return self
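# Illustrative usage sketch (not part of the library source): fitting the
# distributed TfidfTransformer above on a CuPy-backed Dask array of term
# counts and transforming it. Assumes a dask_cuda cluster; the toy count
# matrix is made up.
import cupy as cp
import dask.array as da
from dask.distributed import Client
from dask_cuda import LocalCUDACluster
from cuml.dask.feature_extraction.text import TfidfTransformer

if __name__ == "__main__":
    client = Client(LocalCUDACluster())

    counts = cp.random.randint(0, 4, size=(2000, 100)).astype(cp.float32)
    X = da.from_array(counts, chunks=(500, 100), asarray=False)

    tfidf = TfidfTransformer(client=client)
    tfidf.fit(X)
    X_tfidf = tfidf.transform(X)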
def _fit(self, model, dataset, convert_dtype):
    data = DistributedDataHandler.create(dataset, client=self.client)
    self.datatype = data.datatype

    futures = list()
    for idx, (worker, worker_data) in \
            enumerate(data.worker_to_parts.items()):
        futures.append(
            self.client.submit(_func_fit,
                               model[worker],
                               worker_data,
                               convert_dtype,
                               workers=[worker],
                               pure=False)
        )

    wait_and_raise_from_futures(futures)
    return self
def _fit(self, model_func, data):
    n_cols = data[0].shape[1]

    data = DistributedDataHandler.create(data=data, client=self.client)
    self.datatype = data.datatype

    comms = Comms(comms_p2p=False)
    comms.init(workers=data.workers)

    data.calculate_parts_to_sizes(comms)
    self.ranks = data.ranks

    worker_info = comms.worker_info(comms.worker_addresses)
    parts_to_sizes, _ = parts_to_ranks(self.client,
                                       worker_info,
                                       data.gpu_futures)

    lin_models = dict([(worker_info[worker_data[0]]["rank"],
                        self.client.submit(
                            model_func,
                            comms.sessionId,
                            self.datatype,
                            **self.kwargs,
                            pure=False,
                            workers=[worker_data[0]]))
                       for worker, worker_data in enumerate(
                           data.worker_to_parts.items())])

    lin_fit = dict([(worker_data[0], self.client.submit(
        _func_fit,
        lin_models[worker_info[worker_data[0]]["rank"]],
        worker_data[1],
        data.total_rows,
        n_cols,
        parts_to_sizes,
        worker_info[worker_data[0]]["rank"],
        pure=False,
        workers=[worker_data[0]]))
        for worker, worker_data in enumerate(
            data.worker_to_parts.items())])

    wait_and_raise_from_futures(list(lin_fit.values()))

    comms.destroy()

    return lin_models
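# Illustrative usage sketch (not part of the library source): the shared _fit
# path above as it might be exercised through a distributed linear model such
# as cuml.dask.linear_model.LinearRegression. Assumes a dask_cuda cluster;
# shapes and chunking are made up.
import cupy as cp
import dask.array as da
from dask.distributed import Client
from dask_cuda import LocalCUDACluster
from cuml.dask.linear_model import LinearRegression

if __name__ == "__main__":
    client = Client(LocalCUDACluster())

    X = (da.random.random((10000, 10), chunks=(2500, 10))
         .astype("float32").map_blocks(cp.asarray))
    y = (da.random.random((10000,), chunks=(2500,))
         .astype("float32").map_blocks(cp.asarray))

    lr = LinearRegression(client=client)
    lr.fit(X, y)
    preds = lr.predict(X)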
def fit(self, X, sample_weight=None):
    """
    Fit a multi-node multi-GPU KMeans model

    Parameters
    ----------
    X : Dask cuDF DataFrame or CuPy backed Dask Array
        Training data to cluster.

    sample_weight : Dask cuDF DataFrame or CuPy backed Dask Array shape = (n_samples,), default=None # noqa
        The weights for each observation in X. If None, all observations
        are assigned equal weight.
        Acceptable formats: cuDF DataFrame, NumPy ndarray, Numba device
        ndarray, cuda array interface compliant array like CuPy
    """
    sample_weight = self._check_normalize_sample_weight(sample_weight)

    inputs = X if sample_weight is None else (X, sample_weight)

    data = DistributedDataHandler.create(inputs, client=self.client)
    self.datatype = data.datatype

    # This needs to happen on the scheduler
    comms = Comms(comms_p2p=False, client=self.client)
    comms.init(workers=data.workers)

    kmeans_fit = [
        self.client.submit(KMeans._func_fit,
                           comms.sessionId,
                           wf[1],
                           self.datatype,
                           data.multiple,
                           **self.kwargs,
                           workers=[wf[0]],
                           pure=False)
        for idx, wf in enumerate(data.worker_to_parts.items())
    ]

    wait_and_raise_from_futures(kmeans_fit)

    comms.destroy()

    self._set_internal_model(kmeans_fit[0])

    return self
def fit(self, X, out_dtype="int32"):
    """
    Fit a multi-node multi-GPU DBSCAN model

    Parameters
    ----------
    X : array-like (device or host)
        Dense matrix containing floats or doubles.
        Acceptable formats: CUDA array interface compliant objects like
        CuPy, cuDF DataFrame/Series, NumPy ndarray and Pandas
        DataFrame/Series.
    out_dtype : dtype
        Determines the precision of the output labels array.
        default: "int32". Valid values are {"int32", np.int32,
        "int64", np.int64}.
    """
    if out_dtype not in ["int32", np.int32, "int64", np.int64]:
        raise ValueError("Invalid value for out_dtype. "
                         "Valid values are {'int32', 'int64', "
                         "np.int32, np.int64}")

    data = self.client.scatter(X, broadcast=True)

    comms = Comms(comms_p2p=True)
    comms.init()

    dbscan_fit = [
        self.client.submit(DBSCAN._func_fit(out_dtype),
                           comms.sessionId,
                           data,
                           self.verbose,
                           **self.kwargs,
                           workers=[worker],
                           pure=False)
        for worker in comms.worker_addresses
    ]

    wait_and_raise_from_futures(dbscan_fit)

    comms.destroy()

    self._set_internal_model(dbscan_fit[0])

    return self
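# Illustrative usage sketch (not part of the library source): fitting the
# multi-node multi-GPU DBSCAN above. Note that fit broadcasts X to every
# worker, so X is a single (non-Dask) array. Assumes a dask_cuda cluster;
# data and parameters are made up.
import cupy as cp
from dask.distributed import Client
from dask_cuda import LocalCUDACluster
from cuml.dask.cluster import DBSCAN

if __name__ == "__main__":
    client = Client(LocalCUDACluster())

    # Local CuPy array; fit() scatters it with broadcast=True
    X = cp.random.random((5000, 3)).astype(cp.float32)

    model = DBSCAN(client=client, eps=0.5, min_samples=5)
    model.fit(X, out_dtype="int64")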
def predict_proba(self, X, convert_dtype=True):
    """
    Provide probability estimates for the query data.
    The process is done in a multi-node multi-GPU fashion.

    Parameters
    ----------
    X : array-like (device or host) shape = (n_samples, n_features)
        Query data.
        Acceptable formats: dask cuDF, dask CuPy/NumPy/Numba Array

    convert_dtype : bool, optional (default = True)
        When set to True, the method will automatically
        convert the data to the right formats.

    Returns
    -------
    probabilities : Dask futures or Dask CuPy Arrays
    """
    query_handler = \
        DistributedDataHandler.create(data=X, client=self.client)
    self.datatype = query_handler.datatype

    comms = KNeighborsClassifier._build_comms(self.data_handler,
                                              query_handler,
                                              self.streams_per_handle)

    worker_info = comms.worker_info(comms.worker_addresses)

    """
    Build inputs and outputs
    """
    self.data_handler.calculate_parts_to_sizes(comms=comms)
    query_handler.calculate_parts_to_sizes(comms=comms)

    data_parts_to_ranks, data_nrows = \
        parts_to_ranks(self.client,
                       worker_info,
                       self.data_handler.gpu_futures)

    query_parts_to_ranks, query_nrows = \
        parts_to_ranks(self.client,
                       worker_info,
                       query_handler.gpu_futures)

    """
    Each Dask worker creates a single model
    """
    key = uuid1()
    models = dict([(worker, self.client.submit(
        self._func_create_model,
        comms.sessionId,
        **self.kwargs,
        workers=[worker],
        key="%s-%s" % (key, idx)))
        for idx, worker in enumerate(comms.worker_addresses)])

    """
    Invoke knn_classify on Dask workers to perform distributed query
    """
    key = uuid1()
    knn_prob_res = dict([(worker_info[worker]["rank"], self.client.submit(
        self._func_predict,
        models[worker],
        self.data_handler.worker_to_parts[worker] if
        worker in self.data_handler.workers else [],
        data_parts_to_ranks,
        data_nrows,
        query_handler.worker_to_parts[worker] if
        worker in query_handler.workers else [],
        query_parts_to_ranks,
        query_nrows,
        self.uniq_labels,
        self.n_unique,
        X.shape[1],
        worker_info[worker]["rank"],
        convert_dtype,
        True,
        key="%s-%s" % (key, idx),
        workers=[worker]))
        for idx, worker in enumerate(comms.worker_addresses)])

    wait_and_raise_from_futures(list(knn_prob_res.values()))

    n_outputs = len(self.n_unique)

    """
    Gather resulting partitions and return result
    """
    outputs = []
    for o in range(n_outputs):
        futures = flatten_grouped_results(self.client,
                                          query_parts_to_ranks,
                                          knn_prob_res,
                                          getter_func=_custom_getter(o))
        outputs.append(to_output(futures, self.datatype))

    comms.destroy()
    return tuple(outputs)
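# Illustrative usage sketch (not part of the library source): distributed
# KNeighborsClassifier fit followed by the predict_proba path above. Assumes
# a dask_cuda cluster; shapes, chunking and n_neighbors are made up.
import cupy as cp
import dask.array as da
from dask.distributed import Client
from dask_cuda import LocalCUDACluster
from cuml.dask.neighbors import KNeighborsClassifier

if __name__ == "__main__":
    client = Client(LocalCUDACluster())

    X = (da.random.random((4000, 8), chunks=(1000, 8))
         .astype("float32").map_blocks(cp.asarray))
    y = (da.random.randint(0, 3, size=(4000,), chunks=(1000,))
         .astype("int32").map_blocks(cp.asarray))

    knn = KNeighborsClassifier(client=client, n_neighbors=5)
    knn.fit(X, y)

    # One CuPy-backed Dask array of class probabilities per output column
    probas = knn.predict_proba(X)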
def predict(self, X, convert_dtype=True):
    """
    Predict outputs for a query from previously stored index
    and outputs.
    The process is done in a multi-node multi-GPU fashion.

    Parameters
    ----------
    X : array-like (device or host) shape = (n_samples, n_features)
        Query data.
        Acceptable formats: dask cuDF, dask CuPy/NumPy/Numba Array

    convert_dtype : bool, optional (default = True)
        When set to True, the predict method will automatically
        convert the data to the right formats.

    Returns
    -------
    predictions : Dask futures or Dask CuPy Arrays
    """
    query_handler = \
        DistributedDataHandler.create(data=X, client=self.client)
    self.datatype = query_handler.datatype

    comms = KNeighborsRegressor._build_comms(self.data_handler,
                                             query_handler,
                                             self.streams_per_handle)

    worker_info = comms.worker_info(comms.worker_addresses)

    """
    Build inputs and outputs
    """
    self.data_handler.calculate_parts_to_sizes(comms=comms)
    query_handler.calculate_parts_to_sizes(comms=comms)

    data_parts_to_ranks, data_nrows = \
        parts_to_ranks(self.client,
                       worker_info,
                       self.data_handler.gpu_futures)

    query_parts_to_ranks, query_nrows = \
        parts_to_ranks(self.client,
                       worker_info,
                       query_handler.gpu_futures)

    """
    Each Dask worker creates a single model
    """
    key = uuid1()
    models = dict([(worker, self.client.submit(
        self._func_create_model,
        comms.sessionId,
        **self.kwargs,
        workers=[worker],
        key="%s-%s" % (key, idx)))
        for idx, worker in enumerate(comms.worker_addresses)])

    """
    Invoke knn regression on Dask workers to perform distributed query
    """
    key = uuid1()
    knn_reg_res = dict([(worker_info[worker]["rank"], self.client.submit(
        self._func_predict,
        models[worker],
        self.data_handler.worker_to_parts[worker] if
        worker in self.data_handler.workers else [],
        data_parts_to_ranks,
        data_nrows,
        query_handler.worker_to_parts[worker] if
        worker in query_handler.workers else [],
        query_parts_to_ranks,
        query_nrows,
        X.shape[1],
        self.n_outputs,
        worker_info[worker]["rank"],
        convert_dtype,
        key="%s-%s" % (key, idx),
        workers=[worker]))
        for idx, worker in enumerate(comms.worker_addresses)])

    wait_and_raise_from_futures(list(knn_reg_res.values()))

    """
    Gather resulting partitions and return result
    """
    out_futures = flatten_grouped_results(self.client,
                                          query_parts_to_ranks,
                                          knn_reg_res)

    comms.destroy()
    return to_output(out_futures, self.datatype).squeeze()
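# Illustrative usage sketch (not part of the library source): distributed
# KNeighborsRegressor fit followed by the predict path above. Assumes a
# dask_cuda cluster; shapes, chunking and n_neighbors are made up.
import cupy as cp
import dask.array as da
from dask.distributed import Client
from dask_cuda import LocalCUDACluster
from cuml.dask.neighbors import KNeighborsRegressor

if __name__ == "__main__":
    client = Client(LocalCUDACluster())

    X = (da.random.random((4000, 8), chunks=(1000, 8))
         .astype("float32").map_blocks(cp.asarray))
    y = (da.random.random((4000,), chunks=(1000,))
         .astype("float32").map_blocks(cp.asarray))

    knr = KNeighborsRegressor(client=client, n_neighbors=5)
    knr.fit(X, y)

    # A CuPy-backed Dask array of predictions, squeezed to 1-D for one output
    preds = knr.predict(X)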