Example #1
    def _create_model(self, model_func, client, workers, n_estimators,
                      base_seed, ignore_empty_partitions, **kwargs):

        self.client = get_client(client)
        if workers is None:
            # Default to all workers
            workers = list(self.client.scheduler_info()['workers'].keys())
        self.workers = workers
        self._set_internal_model(None)
        self.active_workers = list()
        self.ignore_empty_partitions = ignore_empty_partitions
        self.n_estimators = n_estimators

        self.n_estimators_per_worker = \
            self._estimators_per_worker(n_estimators)
        if base_seed is None:
            base_seed = 0
        seeds = [base_seed]
        for i in range(1, len(self.n_estimators_per_worker)):
            sd = self.n_estimators_per_worker[i - 1] + seeds[i - 1]
            seeds.append(sd)

        self.rfs = {
            worker: self.client.submit(
                model_func,
                n_estimators=self.n_estimators_per_worker[n],
                random_state=seeds[n],
                **kwargs,
                pure=False,
                workers=[worker],
            )
            for n, worker in enumerate(self.workers)
        }

        wait_and_raise_from_futures(list(self.rfs.values()))
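All of these examples funnel their per-worker futures through wait_and_raise_from_futures. As a rough mental model, the helper blocks until every future settles and then re-raises any worker-side exception on the client. A minimal sketch of that behaviour (illustrative only; the real cuml.dask.common utility may differ in details):

    from dask.distributed import wait

    def wait_and_raise_from_futures(futures):
        # Block until all futures have settled, then surface the first
        # worker-side error locally instead of leaving it on the cluster.
        wait(futures)
        for fut in futures:
            if fut.status == "error":
                raise fut.exception()
        return futures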
Example #2
    def fit(self, X):
        """
        Fit a multi-node multi-GPU KMeans model

        Parameters
        ----------
        X : Dask cuDF DataFrame or CuPy backed Dask Array
            Training data to cluster.

        """

        data = DistributedDataHandler.create(X, client=self.client)
        self.datatype = data.datatype

        comms = Comms(comms_p2p=False)
        comms.init(workers=data.workers)

        kmeans_fit = [self.client.submit(KMeans._func_fit,
                                         comms.sessionId,
                                         wf[1],
                                         self.datatype,
                                         **self.kwargs,
                                         workers=[wf[0]],
                                         pure=False)
                      for idx, wf in enumerate(data.worker_to_parts.items())]

        wait_and_raise_from_futures(kmeans_fit)

        comms.destroy()

        self._set_internal_model(kmeans_fit[0])

        return self
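A minimal usage sketch for the distributed KMeans fit above. The cluster setup and the toy data are illustrative assumptions, not part of the example:

    from dask.distributed import Client
    from dask_cuda import LocalCUDACluster
    import cudf
    import dask_cudf
    from cuml.dask.cluster import KMeans

    # One Dask worker per visible GPU
    cluster = LocalCUDACluster()
    client = Client(cluster)

    df = cudf.DataFrame({"x": [1.0, 1.5, 8.0, 8.5], "y": [1.0, 1.2, 8.1, 8.3]})
    ddf = dask_cudf.from_cudf(df, npartitions=2)

    km = KMeans(n_clusters=2)
    km.fit(ddf)            # submits KMeans._func_fit to each worker holding data
    labels = km.predict(ddf)

The later sketches assume this same client and cluster are already in place.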
Example #3
    def _check_internal_model(model):
        """
        Performs a brief validation that a model meets the requirements
        to be set as an `internal_model`

        Parameters
        ----------

        model : distributed.client.Future[cuml.Base], cuml.Base, or None

        Returns
        -------

        model : distributed.client.Future[cuml.Base], cuml.Base, or None

        """
        if isinstance(model, Iterable):
            # If model is iterable, just grab the first
            model = first(model)

        if isinstance(model, Future):
            if model.type is None:
                wait_and_raise_from_futures([model])

            if not issubclass(model.type, Base):
                raise ValueError("Dask Future expected to contain cuml.Base "
                                 "but found %s instead." % model.type)

        elif model is not None and not isinstance(model, Base):
            raise ValueError("Expected model of type cuml.Base but found %s "
                             "instead." % type(model))
        return model
Example #4
    def _create_model(self, model_func, client, workers, n_estimators,
                      base_seed, **kwargs):

        self.client = get_client(client)
        self.workers = self.client.scheduler_info()['workers'].keys()
        self.local_model = None

        self.n_estimators_per_worker = \
            self._estimators_per_worker(n_estimators)
        if base_seed is None:
            base_seed = 0
        seeds = [base_seed]
        for i in range(1, len(self.n_estimators_per_worker)):
            sd = self.n_estimators_per_worker[i - 1] + seeds[i - 1]
            seeds.append(sd)

        self.rfs = {
            worker: self.client.submit(
                model_func,
                n_estimators=self.n_estimators_per_worker[n],
                seed=seeds[n],
                **kwargs,
                pure=False,
                workers=[worker],
            )
            for n, worker in enumerate(self.workers)
        }

        wait_and_raise_from_futures(list(self.rfs.values()))
Example #5
    def _fit(self, model, dataset, convert_dtype, broadcast_data):
        data = DistributedDataHandler.create(dataset, client=self.client)
        self.active_workers = data.workers
        self.datatype = data.datatype
        if self.datatype == 'cudf':
            has_float64 = (dataset[0].dtypes == np.float64).any()
        else:
            has_float64 = (dataset[0].dtype == np.float64)
        if has_float64:
            raise TypeError("To use Dask RF data should have dtype float32.")

        labels = self.client.persist(dataset[1])
        if self.datatype == 'cudf':
            self.num_classes = len(labels.unique())
        else:
            self.num_classes = \
                len(dask.array.unique(labels).compute())

        combined_data = list(map(lambda x: x[1], data.gpu_futures)) \
            if broadcast_data else None

        futures = list()
        for idx, (worker, worker_data) in \
                enumerate(data.worker_to_parts.items()):
            futures.append(
                self.client.submit(
                    _func_fit,
                    model[worker],
                    combined_data if broadcast_data else worker_data,
                    convert_dtype,
                    workers=[worker],
                    pure=False)
            )

        self.n_active_estimators_per_worker = []
        for worker in data.worker_to_parts.keys():
            n = self.workers.index(worker)
            n_est = self.n_estimators_per_worker[n]
            self.n_active_estimators_per_worker.append(n_est)

        if len(self.workers) > len(self.active_workers):
            if self.ignore_empty_partitions:
                current_estimators = self.n_estimators / \
                                     len(self.workers) * \
                                     len(self.active_workers)
                warn_text = (
                    f"Data was not split among all workers; "
                    f"using only {len(self.active_workers)} workers to fit. "
                    f"This will only train {current_estimators}"
                    f" estimators instead of the requested "
                    f"{self.n_estimators}."
                )
                warnings.warn(warn_text)
            else:
                raise ValueError("Data was not split among all workers. "
                                 "Re-run the code or "
                                 "use ignore_empty_partitions=True"
                                 " while creating model")
        wait_and_raise_from_futures(futures)
        return self
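A minimal usage sketch for the distributed random forest fit above, assuming the dask-cuda client from the KMeans sketch; the toy data is illustrative. Note the float32 requirement enforced at the top of _fit:

    import numpy as np
    import cudf
    import dask_cudf
    from cuml.dask.ensemble import RandomForestClassifier

    X = cudf.DataFrame({"a": np.arange(8, dtype=np.float32),
                        "b": np.arange(8, dtype=np.float32)})
    y = cudf.Series(np.array([0, 1, 0, 1, 0, 1, 0, 1], dtype=np.int32))
    X_d = dask_cudf.from_cudf(X, npartitions=2)
    y_d = dask_cudf.from_cudf(y, npartitions=2)

    rf = RandomForestClassifier(n_estimators=10, ignore_empty_partitions=True)
    rf.fit(X_d, y_d)       # float64 features would raise the TypeError above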
Example #6
    def _set_params(self, **params):
        model_params = list()
        for idx, worker in enumerate(self.workers):
            model_params.append(
                self.client.submit(_func_set_params,
                                   self.rfs[worker],
                                   **params,
                                   workers=[worker]))
        wait_and_raise_from_futures(model_params)
        return self
Example #7
    def _query_models(self, n_neighbors, comms, nn_models, index_handler,
                      query_handler):

        worker_info = comms.worker_info(comms.worker_addresses)
        """
        Build inputs and outputs
        """
        index_handler.calculate_parts_to_sizes(comms=comms)
        query_handler.calculate_parts_to_sizes(comms=comms)

        idx_parts_to_ranks, _ = parts_to_ranks(self.client, worker_info,
                                               index_handler.gpu_futures)

        query_parts_to_ranks, _ = parts_to_ranks(self.client, worker_info,
                                                 query_handler.gpu_futures)
        """
        Invoke kneighbors on Dask workers to perform distributed query
        """
        key = uuid1()
        nn_fit = dict([
            (worker_info[worker]["rank"],
             self.client.submit(NearestNeighbors._func_kneighbors,
                                nn_models[worker],
                                index_handler.worker_to_parts[worker]
                                if worker in index_handler.workers else [],
                                index_handler.total_rows,
                                self.n_cols,
                                idx_parts_to_ranks,
                                query_handler.worker_to_parts[worker]
                                if worker in query_handler.workers else [],
                                query_handler.total_rows,
                                query_parts_to_ranks,
                                worker_info[worker]["rank"],
                                n_neighbors,
                                key="%s-%s" % (key, idx),
                                workers=[worker]))
            for idx, worker in enumerate(comms.worker_addresses)
        ])

        wait_and_raise_from_futures(list(nn_fit.values()))
        """
        Gather resulting partitions and return dask_cudfs
        """
        out_d_futures = flatten_grouped_results(self.client,
                                                query_parts_to_ranks,
                                                nn_fit,
                                                getter_func=_func_get_d)

        out_i_futures = flatten_grouped_results(self.client,
                                                query_parts_to_ranks,
                                                nn_fit,
                                                getter_func=_func_get_i)

        return nn_fit, out_d_futures, out_i_futures
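A minimal usage sketch that reaches _query_models above through the public kneighbors call (client and data are illustrative assumptions):

    import cupy as cp
    import dask.array as da
    from cuml.dask.neighbors import NearestNeighbors

    X = da.from_array(cp.random.random((1000, 16)).astype(cp.float32),
                      chunks=(250, 16))

    nn = NearestNeighbors(n_neighbors=5)
    nn.fit(X)
    distances, indices = nn.kneighbors(X)   # one row of neighbors per query row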
Example #8
    def fit(self, X, y, classes=None):
        """
        Fit distributed Naive Bayes classifier model

        Parameters
        ----------

        X : dask.Array with blocks containing dense or sparse cupy arrays
        y : dask.Array with blocks containing cupy.ndarray
        classes : array-like containing unique class labels

        Returns
        -------

        cuml.dask.naive_bayes.MultinomialNB current model instance
        """

        # Only Dask.Array supported for now
        if not isinstance(X, dask.array.core.Array):
            raise ValueError("Only dask.Array is supported for X")

        if not isinstance(y, dask.array.core.Array):
            raise ValueError("Only dask.Array is supported for y")

        if len(X.chunks[1]) != 1:
            raise ValueError("X must be chunked by row only. "
                             "Multi-dimensional chunking is not supported")

        futures = DistributedDataHandler.create([X, y], self.client)

        classes = self._unique(y.map_blocks(
            MultinomialNB._unique).compute()) \
            if classes is None else classes

        models = [
            self.client.submit(self._fit,
                               part,
                               classes,
                               self.kwargs,
                               pure=False) for w, part in futures.gpu_futures
        ]

        models = reduce(models,
                        self._merge_counts_to_model,
                        client=self.client)

        models = self.client.submit(self._update_log_probs, models, pure=False)

        wait_and_raise_from_futures([models])

        self._set_internal_model(models)

        return self
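A minimal usage sketch for the distributed MultinomialNB fit above. X must be a dask.Array chunked by row only, as the validation at the top enforces (client and data are illustrative assumptions):

    import cupy as cp
    import dask.array as da
    from cuml.dask.naive_bayes import MultinomialNB

    X = da.from_array(cp.random.randint(0, 5, (1000, 20)).astype(cp.float32),
                      chunks=(250, 20))        # row-wise chunks only
    y = da.from_array(cp.random.randint(0, 3, 1000, dtype=cp.int32), chunks=250)

    nb = MultinomialNB()
    nb.fit(X, y)
    preds = nb.predict(X)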
Example #9
    def _print_detailed(self):
        """
        Print the summary of the forest used to train and test the model.
        """
        futures = list()
        for n, w in enumerate(self.workers):
            futures.append(
                self.client.submit(
                    _print_detailed_func,
                    self.rfs[w],
                    workers=[w],
                ))

        wait_and_raise_from_futures(futures)
        return self
Example #10
    def fit(self, X):

        """
        Fit distributed TFIDF Transformer

        Parameters
        ----------

        X : dask.Array with blocks containing dense or sparse cupy arrays

        Returns
        -------

        cuml.dask.feature_extraction.text.TfidfTransformer instance
        """

        # Only Dask.Array supported for now
        if not isinstance(X, dask.array.core.Array):
            raise ValueError("Only dask.Array is supported for X")

        if len(X.chunks[1]) != 1:
            raise ValueError(
                "X must be chunked by row only. "
                "Multi-dimensional chunking is not supported"
            )

        # We don't do anything if we don't need idf
        if not self.internal_model.use_idf:
            return self

        futures = DistributedDataHandler.create(X, self.client)

        models = [
            self.client.submit(
                self._set_doc_stats, part, self.kwargs, pure=False
            )
            for w, part in futures.gpu_futures
        ]

        models = reduce(models, self._merge_stats_to_model, client=self.client)

        wait_and_raise_from_futures([models])

        models = self.client.submit(self._set_idf_diag, models, pure=False)

        wait_and_raise_from_futures([models])

        self._set_internal_model(models)

        return self
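A minimal usage sketch for the distributed TfidfTransformer above; with the default use_idf=True, fit computes the document frequencies instead of taking the early return (client and data are illustrative assumptions):

    import cupy as cp
    import dask.array as da
    from cuml.dask.feature_extraction.text import TfidfTransformer

    counts = da.from_array(cp.random.randint(0, 5, (1000, 50)).astype(cp.float32),
                           chunks=(250, 50))

    tfidf = TfidfTransformer()
    tfidf.fit(counts)
    weighted = tfidf.transform(counts)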
Example #11
    def _fit(self, model, dataset, convert_dtype):
        data = DistributedDataHandler.create(dataset, client=self.client)
        self.datatype = data.datatype
        futures = list()
        for idx, (worker, worker_data) in \
                enumerate(data.worker_to_parts.items()):
            futures.append(
                self.client.submit(_func_fit,
                                   model[worker],
                                   worker_data,
                                   convert_dtype,
                                   workers=[worker],
                                   pure=False))
        wait_and_raise_from_futures(futures)
        return self
Example #12
    def _fit(self, model_func, data):

        n_cols = data[0].shape[1]

        data = DistributedDataHandler.create(data=data, client=self.client)
        self.datatype = data.datatype

        comms = Comms(comms_p2p=False)
        comms.init(workers=data.workers)

        data.calculate_parts_to_sizes(comms)
        self.ranks = data.ranks

        worker_info = comms.worker_info(comms.worker_addresses)
        parts_to_sizes, _ = parts_to_ranks(self.client,
                                           worker_info,
                                           data.gpu_futures)

        lin_models = dict([
            (data.worker_info[worker_data[0]]["rank"],
             self.client.submit(model_func,
                                comms.sessionId,
                                self.datatype,
                                **self.kwargs,
                                pure=False,
                                workers=[worker_data[0]]))
            for worker, worker_data in
            enumerate(data.worker_to_parts.items())
        ])

        lin_fit = dict([
            (worker_data[0],
             self.client.submit(_func_fit,
                                lin_models[data.worker_info[worker_data[0]]["rank"]],
                                worker_data[1],
                                data.total_rows,
                                n_cols,
                                parts_to_sizes,
                                data.worker_info[worker_data[0]]["rank"],
                                pure=False,
                                workers=[worker_data[0]]))
            for worker, worker_data in
            enumerate(data.worker_to_parts.items())
        ])

        wait_and_raise_from_futures(list(lin_fit.values()))

        comms.destroy()
        return lin_models
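The _fit above is the shared driver for the distributed linear models. A minimal sketch of reaching it through one concrete estimator, assuming cuml.dask.linear_model.LinearRegression and the client from the earlier sketches (the data is illustrative):

    import cupy as cp
    import dask.array as da
    from cuml.dask.linear_model import LinearRegression

    X = da.from_array(cp.random.random((1000, 8)).astype(cp.float32), chunks=(250, 8))
    y = da.from_array(cp.random.random(1000).astype(cp.float32), chunks=250)

    lr = LinearRegression()
    lr.fit(X, y)           # one model_func future per worker, then _func_fit per rank
    preds = lr.predict(X)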
Example #13
    def fit(self, X, sample_weight=None):
        """
        Fit a multi-node multi-GPU KMeans model

        Parameters
        ----------
        X : Dask cuDF DataFrame or CuPy backed Dask Array
            Training data to cluster.

        sample_weight : Dask cuDF DataFrame or CuPy backed Dask Array
                        shape = (n_samples,), default=None # noqa
            The weights for each observation in X. If None, all observations
            are assigned equal weight.
            Acceptable formats: cuDF DataFrame, NumPy ndarray, Numba device
            ndarray, cuda array interface compliant array like CuPy
        """

        sample_weight = self._check_normalize_sample_weight(sample_weight)

        inputs = X if sample_weight is None else (X, sample_weight)

        data = DistributedDataHandler.create(inputs, client=self.client)
        self.datatype = data.datatype

        # This needs to happen on the scheduler
        comms = Comms(comms_p2p=False, client=self.client)
        comms.init(workers=data.workers)

        kmeans_fit = [
            self.client.submit(KMeans._func_fit,
                               comms.sessionId,
                               wf[1],
                               self.datatype,
                               data.multiple,
                               **self.kwargs,
                               workers=[wf[0]],
                               pure=False)
            for idx, wf in enumerate(data.worker_to_parts.items())
        ]

        wait_and_raise_from_futures(kmeans_fit)

        comms.destroy()

        self._set_internal_model(kmeans_fit[0])

        return self
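Compared to Example #2, this variant also accepts per-sample weights. A brief sketch of the CuPy-backed Dask Array path described in the docstring (client, data, and weights are illustrative assumptions):

    import cupy as cp
    import dask.array as da
    from cuml.dask.cluster import KMeans

    X = da.from_array(cp.random.random((1000, 2)).astype(cp.float32), chunks=(250, 2))
    w = da.from_array(cp.random.random(1000).astype(cp.float32), chunks=250)

    km = KMeans(n_clusters=3)
    km.fit(X, sample_weight=w)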
Example #14
    def fit(self, X, out_dtype="int32"):
        """
        Fit a multi-node multi-GPU DBSCAN model

        Parameters
        ----------
        X : array-like (device or host)
            Dense matrix containing floats or doubles.
            Acceptable formats: CUDA array interface compliant objects like
            CuPy, cuDF DataFrame/Series, NumPy ndarray and Pandas
            DataFrame/Series.
        out_dtype : dtype, default="int32"
            Determines the precision of the output labels array.
            Valid values are {"int32", np.int32, "int64", np.int64}.
        """
        if out_dtype not in ["int32", np.int32, "int64", np.int64]:
            raise ValueError("Invalid value for out_dtype. "
                             "Valid values are {'int32', 'int64', "
                             "np.int32, np.int64}")

        data = self.client.scatter(X, broadcast=True)

        comms = Comms(comms_p2p=True)
        comms.init()

        dbscan_fit = [
            self.client.submit(DBSCAN._func_fit(out_dtype),
                               comms.sessionId,
                               data,
                               self.verbose,
                               **self.kwargs,
                               workers=[worker],
                               pure=False) for worker in comms.worker_addresses
        ]

        wait_and_raise_from_futures(dbscan_fit)

        comms.destroy()

        self._set_internal_model(dbscan_fit[0])

        return self
Example #15
    def predict_proba(self, X, convert_dtype=True):
        """
        Predict class probabilities for a query from the previously
        stored index and labels.

        Parameters
        ----------
        X : array-like (device or host) shape = (n_samples, n_features)
            Query data.
            Acceptable formats: dask cuDF, dask CuPy/NumPy/Numba Array

        convert_dtype : bool, optional (default = True)
            When set to True, the predict method will automatically
            convert the data to the right formats.

        Returns
        -------
        probabilities : Dask futures or Dask CuPy Arrays
        """
        query_handler = \
            DistributedDataHandler.create(data=X,
                                          client=self.client)
        self.datatype = query_handler.datatype

        comms = KNeighborsClassifier._build_comms(self.data_handler,
                                                  query_handler,
                                                  self.streams_per_handle)

        worker_info = comms.worker_info(comms.worker_addresses)
        """
        Build inputs and outputs
        """
        self.data_handler.calculate_parts_to_sizes(comms=comms)
        query_handler.calculate_parts_to_sizes(comms=comms)

        data_parts_to_ranks, data_nrows = \
            parts_to_ranks(self.client,
                           worker_info,
                           self.data_handler.gpu_futures)

        query_parts_to_ranks, query_nrows = \
            parts_to_ranks(self.client,
                           worker_info,
                           query_handler.gpu_futures)
        """
        Each Dask worker creates a single model
        """
        key = uuid1()
        models = dict([(worker,
                        self.client.submit(self._func_create_model,
                                           comms.sessionId,
                                           **self.kwargs,
                                           workers=[worker],
                                           key="%s-%s" % (key, idx)))
                       for idx, worker in enumerate(comms.worker_addresses)])
        """
        Invoke knn_classify on Dask workers to perform distributed query
        """
        key = uuid1()
        knn_prob_res = dict([
            (worker_info[worker]["rank"],
             self.client.submit(self._func_predict,
                                models[worker],
                                self.data_handler.worker_to_parts[worker]
                                if worker in self.data_handler.workers else [],
                                data_parts_to_ranks,
                                data_nrows,
                                query_handler.worker_to_parts[worker]
                                if worker in query_handler.workers else [],
                                query_parts_to_ranks,
                                query_nrows,
                                self.uniq_labels,
                                self.n_unique,
                                X.shape[1],
                                worker_info[worker]["rank"],
                                convert_dtype,
                                True,
                                key="%s-%s" % (key, idx),
                                workers=[worker]))
            for idx, worker in enumerate(comms.worker_addresses)
        ])

        wait_and_raise_from_futures(list(knn_prob_res.values()))

        n_outputs = len(self.n_unique)
        """
        Gather resulting partitions and return result
        """
        outputs = []
        for o in range(n_outputs):
            futures = flatten_grouped_results(self.client,
                                              query_parts_to_ranks,
                                              knn_prob_res,
                                              getter_func=_custom_getter(o))
            outputs.append(to_output(futures, self.datatype))

        comms.destroy()

        return tuple(outputs)
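A minimal usage sketch for the distributed predict_proba above (client and data are illustrative assumptions); the result is a tuple with one probability array per output column:

    import cupy as cp
    import dask.array as da
    from cuml.dask.neighbors import KNeighborsClassifier

    X = da.from_array(cp.random.random((1000, 16)).astype(cp.float32), chunks=(250, 16))
    y = da.from_array(cp.random.randint(0, 3, 1000, dtype=cp.int32), chunks=250)

    knn = KNeighborsClassifier(n_neighbors=5)
    knn.fit(X, y)
    probas = knn.predict_proba(X)   # tuple, one entry per output (see n_outputs above)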
Example #16
    def predict(self, X, convert_dtype=True):
        """
        Predict outputs for a query from previously stored index
        and outputs.
        The process is done in a multi-node multi-GPU fashion.

        Parameters
        ----------
        X : array-like (device or host) shape = (n_samples, n_features)
            Query data.
            Acceptable formats: dask cuDF, dask CuPy/NumPy/Numba Array

        convert_dtype : bool, optional (default = True)
            When set to True, the predict method will automatically
            convert the data to the right formats.

        Returns
        -------
        predictions : Dask futures or Dask CuPy Arrays
        """
        query_handler = \
            DistributedDataHandler.create(data=X,
                                          client=self.client)
        self.datatype = query_handler.datatype

        comms = KNeighborsRegressor._build_comms(self.data_handler,
                                                 query_handler,
                                                 self.streams_per_handle)

        worker_info = comms.worker_info(comms.worker_addresses)
        """
        Build inputs and outputs
        """
        self.data_handler.calculate_parts_to_sizes(comms=comms)
        query_handler.calculate_parts_to_sizes(comms=comms)

        data_parts_to_ranks, data_nrows = \
            parts_to_ranks(self.client,
                           worker_info,
                           self.data_handler.gpu_futures)

        query_parts_to_ranks, query_nrows = \
            parts_to_ranks(self.client,
                           worker_info,
                           query_handler.gpu_futures)
        """
        Each Dask worker creates a single model
        """
        key = uuid1()
        models = dict([(worker,
                        self.client.submit(self._func_create_model,
                                           comms.sessionId,
                                           **self.kwargs,
                                           workers=[worker],
                                           key="%s-%s" % (key, idx)))
                       for idx, worker in enumerate(comms.worker_addresses)])
        """
        Invoke knn regression on Dask workers to perform distributed query
        """
        key = uuid1()
        knn_reg_res = dict([
            (worker_info[worker]["rank"],
             self.client.submit(self._func_predict,
                                models[worker],
                                self.data_handler.worker_to_parts[worker]
                                if worker in self.data_handler.workers else [],
                                data_parts_to_ranks,
                                data_nrows,
                                query_handler.worker_to_parts[worker]
                                if worker in query_handler.workers else [],
                                query_parts_to_ranks,
                                query_nrows,
                                X.shape[1],
                                self.n_outputs,
                                worker_info[worker]["rank"],
                                convert_dtype,
                                key="%s-%s" % (key, idx),
                                workers=[worker]))
            for idx, worker in enumerate(comms.worker_addresses)
        ])

        wait_and_raise_from_futures(list(knn_reg_res.values()))
        """
        Gather resulting partitions and return result
        """
        out_futures = flatten_grouped_results(self.client,
                                              query_parts_to_ranks,
                                              knn_reg_res)

        comms.destroy()

        return to_output(out_futures, self.datatype).squeeze()
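And the regression counterpart, a minimal sketch for the distributed predict above (client and data are illustrative assumptions):

    import cupy as cp
    import dask.array as da
    from cuml.dask.neighbors import KNeighborsRegressor

    X = da.from_array(cp.random.random((1000, 16)).astype(cp.float32), chunks=(250, 16))
    y = da.from_array(cp.random.random(1000).astype(cp.float32), chunks=250)

    knr = KNeighborsRegressor(n_neighbors=5)
    knr.fit(X, y)
    preds = knr.predict(X)    # squeezed output, per the final return above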