def _query_models(self, n_neighbors,
                  comms, nn_models,
                  index_futures, query_futures):

    worker_info = comms.worker_info(comms.worker_addresses)

    index_worker_to_parts = workers_to_parts(index_futures)
    query_worker_to_parts = workers_to_parts(query_futures)

    """
    Build inputs and outputs
    """
    idx_parts_to_ranks, idx_M = parts_to_ranks(self.client,
                                               worker_info,
                                               index_futures)

    query_parts_to_ranks, query_M = parts_to_ranks(self.client,
                                                   worker_info,
                                                   query_futures)

    """
    Invoke kneighbors on Dask workers to perform distributed query
    """
    key = uuid1()
    nn_fit = dict([(worker_info[worker]["r"], self.client.submit(
        NearestNeighbors._func_kneighbors,
        nn_models[worker],
        index_worker_to_parts[worker]
        if worker in index_worker_to_parts else [],
        idx_M,
        self.n_cols,
        idx_parts_to_ranks,
        query_worker_to_parts[worker]
        if worker in query_worker_to_parts else [],
        query_M,
        query_parts_to_ranks,
        worker_info[worker]["r"],
        n_neighbors,
        key="%s-%s" % (key, idx),
        workers=[worker]))
        for idx, worker in enumerate(comms.worker_addresses)])

    wait(list(nn_fit.values()))
    raise_exception_from_futures(list(nn_fit.values()))

    """
    Gather resulting partitions and return dask_cudfs
    """
    out_d_futures = flatten_grouped_results(self.client,
                                            query_parts_to_ranks,
                                            nn_fit,
                                            getter_func=_func_get_d)

    out_i_futures = flatten_grouped_results(self.client,
                                            query_parts_to_ranks,
                                            nn_fit,
                                            getter_func=_func_get_i)

    return nn_fit, out_d_futures, out_i_futures
def _fit(self, model_func, data):

    n_cols = data[0].shape[1]

    data = DistributedDataHandler.create(data=data, client=self.client)
    self.datatype = data.datatype

    comms = Comms(comms_p2p=False)
    comms.init(workers=data.workers)

    data.calculate_parts_to_sizes(comms)
    self.ranks = data.ranks

    worker_info = comms.worker_info(comms.worker_addresses)
    parts_to_sizes, _ = parts_to_ranks(self.client,
                                       worker_info,
                                       data.gpu_futures)

    lin_models = dict([(data.worker_info[worker_data[0]]["rank"],
                        self.client.submit(
                            model_func,
                            comms.sessionId,
                            self.datatype,
                            **self.kwargs,
                            pure=False,
                            workers=[worker_data[0]]))
                       for worker, worker_data in enumerate(
                           data.worker_to_parts.items())])

    lin_fit = dict([(worker_data[0], self.client.submit(
        _func_fit,
        lin_models[data.worker_info[worker_data[0]]["rank"]],
        worker_data[1],
        data.total_rows,
        n_cols,
        parts_to_sizes,
        data.worker_info[worker_data[0]]["rank"],
        pure=False,
        workers=[worker_data[0]]))
        for worker, worker_data in enumerate(
            data.worker_to_parts.items())])

    wait_and_raise_from_futures(list(lin_fit.values()))

    comms.destroy()

    return lin_models
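
# Hedged usage sketch (not part of the original source): the _fit helper
# above is driven by the public cuml.dask linear-model estimators, e.g.
# LinearRegression. The example below shows how that public API might be
# exercised on a dask_cuda cluster; the helper name and the data values are
# illustrative only, while the imported modules and methods are real APIs.
def _example_linear_fit_usage_sketch():
    import cudf
    import dask_cudf
    from dask.distributed import Client
    from dask_cuda import LocalCUDACluster
    from cuml.dask.linear_model import LinearRegression

    cluster = LocalCUDACluster()
    client = Client(cluster)

    # Two-column design matrix and targets, split into two partitions so
    # more than one worker can hold data.
    X = dask_cudf.from_cudf(
        cudf.DataFrame({"a": [1.0, 2.0, 3.0, 4.0],
                        "b": [2.0, 4.0, 6.0, 8.0]}),
        npartitions=2)
    y = dask_cudf.from_cudf(cudf.Series([3.0, 6.0, 9.0, 12.0]),
                            npartitions=2)

    lr = LinearRegression(client=client)
    lr.fit(X, y)                      # dispatches one fit task per worker
    return lr.predict(X).compute()    # gather the distributed predictions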
def predict_proba(self, X, convert_dtype=True):
    """
    Predict class probabilities for a query from the previously stored
    index and labels.
    The process is done in a multi-node multi-GPU fashion.

    Parameters
    ----------
    X : array-like (device or host) shape = (n_samples, n_features)
        Query data.
        Acceptable formats: dask cuDF, dask CuPy/NumPy/Numba Array

    convert_dtype : bool, optional (default = True)
        When set to True, the method will automatically convert the data
        to the right formats.

    Returns
    -------
    probabilities : Dask futures or Dask CuPy Arrays
    """
    query_handler = \
        DistributedDataHandler.create(data=X, client=self.client)
    self.datatype = query_handler.datatype

    comms = KNeighborsClassifier._build_comms(self.data_handler,
                                              query_handler,
                                              self.streams_per_handle)

    worker_info = comms.worker_info(comms.worker_addresses)

    """
    Build inputs and outputs
    """
    self.data_handler.calculate_parts_to_sizes(comms=comms)
    query_handler.calculate_parts_to_sizes(comms=comms)

    data_parts_to_ranks, data_nrows = \
        parts_to_ranks(self.client,
                       worker_info,
                       self.data_handler.gpu_futures)

    query_parts_to_ranks, query_nrows = \
        parts_to_ranks(self.client,
                       worker_info,
                       query_handler.gpu_futures)

    """
    Each Dask worker creates a single model
    """
    key = uuid1()
    models = dict([(worker, self.client.submit(
        self._func_create_model,
        comms.sessionId,
        **self.kwargs,
        workers=[worker],
        key="%s-%s" % (key, idx)))
        for idx, worker in enumerate(comms.worker_addresses)])

    """
    Invoke knn_classify on Dask workers to perform distributed query
    """
    key = uuid1()
    knn_prob_res = dict([(worker_info[worker]["rank"], self.client.submit(
        self._func_predict,
        models[worker],
        self.data_handler.worker_to_parts[worker]
        if worker in self.data_handler.workers else [],
        data_parts_to_ranks,
        data_nrows,
        query_handler.worker_to_parts[worker]
        if worker in query_handler.workers else [],
        query_parts_to_ranks,
        query_nrows,
        self.uniq_labels,
        self.n_unique,
        X.shape[1],
        worker_info[worker]["rank"],
        convert_dtype,
        True,
        key="%s-%s" % (key, idx),
        workers=[worker]))
        for idx, worker in enumerate(comms.worker_addresses)])

    wait_and_raise_from_futures(list(knn_prob_res.values()))

    n_outputs = len(self.n_unique)

    """
    Gather resulting partitions and return result
    """
    outputs = []
    for o in range(n_outputs):
        futures = flatten_grouped_results(self.client,
                                          query_parts_to_ranks,
                                          knn_prob_res,
                                          getter_func=_custom_getter(o))
        outputs.append(to_output(futures, self.datatype))

    comms.destroy()
    return tuple(outputs)
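
# Hedged usage sketch (not part of the original source): how the public
# cuml.dask KNeighborsClassifier API that drives predict_proba above might
# be called on a dask_cuda cluster. The helper name and the data values are
# illustrative; the imported modules and methods are real APIs.
def _example_predict_proba_usage_sketch():
    import cudf
    import dask_cudf
    from dask.distributed import Client
    from dask_cuda import LocalCUDACluster
    from cuml.dask.neighbors import KNeighborsClassifier

    cluster = LocalCUDACluster()
    client = Client(cluster)

    X = dask_cudf.from_cudf(
        cudf.DataFrame({"f0": [0.0, 0.1, 5.0, 5.1],
                        "f1": [0.0, 0.2, 5.0, 5.2]}),
        npartitions=2)
    y = dask_cudf.from_cudf(cudf.Series([0, 0, 1, 1]), npartitions=2)

    knn = KNeighborsClassifier(n_neighbors=2, client=client)
    knn.fit(X, y)

    # predict_proba fans the query out to every worker holding index data
    # and returns one gathered probability collection per output column.
    probas = knn.predict_proba(X)
    return [p.compute() for p in probas]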
def _query_models(self, n_neighbors,
                  comms, nn_models,
                  index_handler, query_handler):

    worker_info = comms.worker_info(comms.worker_addresses)

    """
    Build inputs and outputs
    """
    index_handler.calculate_parts_to_sizes(comms=comms)
    query_handler.calculate_parts_to_sizes(comms=comms)

    idx_parts_to_ranks, _ = parts_to_ranks(self.client,
                                           worker_info,
                                           index_handler.gpu_futures)

    query_parts_to_ranks, _ = parts_to_ranks(self.client,
                                             worker_info,
                                             query_handler.gpu_futures)

    """
    Invoke kneighbors on Dask workers to perform distributed query
    """
    key = uuid1()
    nn_fit = dict([(worker_info[worker]["rank"], self.client.submit(
        NearestNeighbors._func_kneighbors,
        nn_models[worker],
        index_handler.worker_to_parts[worker]
        if worker in index_handler.workers else [],
        idx_parts_to_ranks,
        index_handler.total_rows,
        query_handler.worker_to_parts[worker]
        if worker in query_handler.workers else [],
        query_parts_to_ranks,
        query_handler.total_rows,
        self.n_cols,
        worker_info[worker]["rank"],
        n_neighbors,
        False,
        key="%s-%s" % (key, idx),
        workers=[worker]))
        for idx, worker in enumerate(comms.worker_addresses)])

    wait_and_raise_from_futures(list(nn_fit.values()))

    def _custom_getter(o):
        def func_get(f, idx):
            return f[o][idx]
        return func_get

    """
    Gather resulting partitions and return dask_cudfs
    """
    out_d_futures = flatten_grouped_results(self.client,
                                            query_parts_to_ranks,
                                            nn_fit,
                                            getter_func=_custom_getter(0))

    out_i_futures = flatten_grouped_results(self.client,
                                            query_parts_to_ranks,
                                            nn_fit,
                                            getter_func=_custom_getter(1))

    return nn_fit, out_d_futures, out_i_futures
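
# Hedged usage sketch (not part of the original source): the _query_models
# helper above backs the public kneighbors() call of
# cuml.dask.neighbors.NearestNeighbors. A minimal end-to-end example,
# assuming a dask_cuda cluster; the helper name and data values are
# illustrative, while the imported modules and methods are real APIs.
def _example_kneighbors_usage_sketch():
    import cudf
    import dask_cudf
    from dask.distributed import Client
    from dask_cuda import LocalCUDACluster
    from cuml.dask.neighbors import NearestNeighbors

    cluster = LocalCUDACluster()
    client = Client(cluster)

    X = dask_cudf.from_cudf(
        cudf.DataFrame({"f0": [0.0, 1.0, 2.0, 3.0],
                        "f1": [0.0, 1.0, 2.0, 3.0]}),
        npartitions=2)

    nn = NearestNeighbors(n_neighbors=2, client=client)
    nn.fit(X)

    # Distances and indices come back partitioned the same way as the query.
    dists, idxs = nn.kneighbors(X)
    return dists.compute(), idxs.compute()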
def _fit(self, X, _transform=False):
    """
    Fit the model with X.

    Parameters
    ----------
    X : dask cuDF input
    """
    n_cols = X.shape[1]

    data = DistributedDataHandler.create(data=X, client=self.client)
    self.datatype = data.datatype

    if "svd_solver" in self.kwargs \
            and self.kwargs["svd_solver"] == "tsqr":
        comms = CommsContext(comms_p2p=True)
    else:
        comms = CommsContext(comms_p2p=False)

    comms.init(workers=data.workers)

    data.calculate_parts_to_sizes(comms)

    worker_info = comms.worker_info(comms.worker_addresses)
    parts_to_sizes, _ = parts_to_ranks(self.client,
                                       worker_info,
                                       data.gpu_futures)

    total_rows = data.total_rows

    models = dict([(data.worker_info[wf[0]]["rank"],
                    self.client.submit(
                        self._create_model,
                        comms.sessionId,
                        self._model_func,
                        self.datatype,
                        **self.kwargs,
                        pure=False,
                        workers=[wf[0]]))
                   for idx, wf in enumerate(
                       data.worker_to_parts.items())])

    pca_fit = dict([(wf[0], self.client.submit(
        DecompositionSyncFitMixin._func_fit,
        models[data.worker_info[wf[0]]["rank"]],
        wf[1],
        total_rows,
        n_cols,
        parts_to_sizes,
        data.worker_info[wf[0]]["rank"],
        _transform,
        pure=False,
        workers=[wf[0]]))
        for idx, wf in enumerate(
            data.worker_to_parts.items())])

    wait(list(pca_fit.values()))
    raise_exception_from_futures(list(pca_fit.values()))

    comms.destroy()

    self._set_internal_model(list(models.values())[0])

    if _transform:
        out_futures = flatten_grouped_results(self.client,
                                              data.gpu_futures,
                                              pca_fit)
        return to_output(out_futures, self.datatype)

    return self
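
# Hedged usage sketch (not part of the original source): the _fit above is
# shared by the cuml.dask decomposition estimators; with _transform=True it
# also gathers the transformed partitions, which is what fit_transform uses.
# A minimal PCA example, assuming a dask_cuda cluster; the helper name and
# data values are illustrative, while the imported modules and methods are
# real APIs.
def _example_pca_fit_usage_sketch():
    import cudf
    import dask_cudf
    from dask.distributed import Client
    from dask_cuda import LocalCUDACluster
    from cuml.dask.decomposition import PCA

    cluster = LocalCUDACluster()
    client = Client(cluster)

    X = dask_cudf.from_cudf(
        cudf.DataFrame({"f0": [1.0, 2.0, 3.0, 4.0],
                        "f1": [2.0, 3.0, 4.0, 5.0],
                        "f2": [5.0, 4.0, 3.0, 2.0]}),
        npartitions=2)

    pca = PCA(n_components=2, client=client)
    X_reduced = pca.fit_transform(X)   # fit, then gather transformed parts
    return X_reduced.compute()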
def predict(self, X, convert_dtype=True):
    """
    Predict outputs for a query from previously stored index
    and outputs.
    The process is done in a multi-node multi-GPU fashion.

    Parameters
    ----------
    X : array-like (device or host) shape = (n_samples, n_features)
        Query data.
        Acceptable formats: dask cuDF, dask CuPy/NumPy/Numba Array

    convert_dtype : bool, optional (default = True)
        When set to True, the predict method will automatically
        convert the data to the right formats.

    Returns
    -------
    predictions : Dask futures or Dask CuPy Arrays
    """
    query_handler = \
        DistributedDataHandler.create(data=X, client=self.client)
    self.datatype = query_handler.datatype

    comms = KNeighborsRegressor._build_comms(self.data_handler,
                                             query_handler,
                                             self.streams_per_handle)

    worker_info = comms.worker_info(comms.worker_addresses)

    """
    Build inputs and outputs
    """
    self.data_handler.calculate_parts_to_sizes(comms=comms)
    query_handler.calculate_parts_to_sizes(comms=comms)

    data_parts_to_ranks, data_nrows = \
        parts_to_ranks(self.client,
                       worker_info,
                       self.data_handler.gpu_futures)

    query_parts_to_ranks, query_nrows = \
        parts_to_ranks(self.client,
                       worker_info,
                       query_handler.gpu_futures)

    """
    Each Dask worker creates a single model
    """
    key = uuid1()
    models = dict([(worker, self.client.submit(
        self._func_create_model,
        comms.sessionId,
        **self.kwargs,
        workers=[worker],
        key="%s-%s" % (key, idx)))
        for idx, worker in enumerate(comms.worker_addresses)])

    """
    Invoke knn_regress on Dask workers to perform distributed query
    """
    key = uuid1()
    knn_reg_res = dict([(worker_info[worker]["rank"], self.client.submit(
        self._func_predict,
        models[worker],
        self.data_handler.worker_to_parts[worker]
        if worker in self.data_handler.workers else [],
        data_parts_to_ranks,
        data_nrows,
        query_handler.worker_to_parts[worker]
        if worker in query_handler.workers else [],
        query_parts_to_ranks,
        query_nrows,
        X.shape[1],
        self.n_outputs,
        worker_info[worker]["rank"],
        convert_dtype,
        key="%s-%s" % (key, idx),
        workers=[worker]))
        for idx, worker in enumerate(comms.worker_addresses)])

    wait_and_raise_from_futures(list(knn_reg_res.values()))

    """
    Gather resulting partitions and return result
    """
    out_futures = flatten_grouped_results(self.client,
                                          query_parts_to_ranks,
                                          knn_reg_res)

    comms.destroy()

    return to_output(out_futures, self.datatype).squeeze()
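
# Hedged usage sketch (not part of the original source): how the public
# cuml.dask KNeighborsRegressor API that drives predict above might be
# exercised on a dask_cuda cluster, here with CuPy-backed dask arrays. The
# helper name and data values are illustrative; the imported modules and
# methods are real APIs.
def _example_knn_regressor_usage_sketch():
    import cupy as cp
    import dask.array as da
    from dask.distributed import Client
    from dask_cuda import LocalCUDACluster
    from cuml.dask.neighbors import KNeighborsRegressor

    cluster = LocalCUDACluster()
    client = Client(cluster)

    # CuPy-backed dask arrays, chunked so partitions land on the workers.
    X = da.from_array(cp.array([[0.0, 0.0], [1.0, 1.0],
                                [2.0, 2.0], [3.0, 3.0]]), chunks=(2, 2))
    y = da.from_array(cp.array([0.0, 1.0, 2.0, 3.0]), chunks=(2,))

    knr = KNeighborsRegressor(n_neighbors=2, client=client)
    knr.fit(X, y)

    # predict() averages the stored outputs of each query row's neighbors
    # and returns the gathered, squeezed result.
    return knr.predict(X).compute()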