Ejemplo n.º 1
0
    def predict_proba(self, X, convert_dtype=True):
        """
        Predict class probabilities for a query from the previously
        stored index and labels.
        The process is done in a multi-node multi-GPU fashion.

        Parameters
        ----------
        X : array-like (device or host) shape = (n_samples, n_features)
            Query data.
            Acceptable formats: dask cuDF, dask CuPy/NumPy/Numba Array

        convert_dtype : bool, optional (default = True)
            When set to True, the predict method will automatically
            convert the data to the right formats.

        Returns
        -------
        probabilities : tuple of Dask futures or Dask CuPy Arrays
            One entry per output column (one per element of
            ``self.n_unique``).
        """
        query_handler = \
            DistributedDataHandler.create(data=X,
                                          client=self.client)
        self.datatype = query_handler.datatype

        # Build a communicator clique spanning index and query workers.
        comms = KNeighborsClassifier._build_comms(self.data_handler,
                                                  query_handler,
                                                  self.streams_per_handle)

        worker_info = comms.worker_info(comms.worker_addresses)

        # Build inputs and outputs: map every partition to the rank of the
        # worker that holds it, for both the index and the query data.
        self.data_handler.calculate_parts_to_sizes(comms=comms)
        query_handler.calculate_parts_to_sizes(comms=comms)

        data_parts_to_ranks, data_nrows = \
            parts_to_ranks(self.client,
                           worker_info,
                           self.data_handler.gpu_futures)

        query_parts_to_ranks, query_nrows = \
            parts_to_ranks(self.client,
                           worker_info,
                           query_handler.gpu_futures)

        # Each Dask worker creates a single model.
        key = uuid1()
        models = {
            worker: self.client.submit(self._func_create_model,
                                       comms.sessionId,
                                       **self.kwargs,
                                       workers=[worker],
                                       key="%s-%s" % (key, idx))
            for idx, worker in enumerate(comms.worker_addresses)
        }

        # Invoke the distributed probability query on every worker. Workers
        # holding no index (or query) partitions get empty part lists so
        # each rank still participates in the collective operation.
        key = uuid1()
        knn_prob_res = {
            worker_info[worker]["rank"]:
                self.client.submit(self._func_predict,
                                   models[worker],
                                   self.data_handler.worker_to_parts[worker]
                                   if worker in self.data_handler.workers
                                   else [],
                                   data_parts_to_ranks,
                                   data_nrows,
                                   query_handler.worker_to_parts[worker]
                                   if worker in query_handler.workers
                                   else [],
                                   query_parts_to_ranks,
                                   query_nrows,
                                   self.uniq_labels,
                                   self.n_unique,
                                   X.shape[1],
                                   worker_info[worker]["rank"],
                                   convert_dtype,
                                   True,
                                   key="%s-%s" % (key, idx),
                                   workers=[worker])
            for idx, worker in enumerate(comms.worker_addresses)
        }

        wait_and_raise_from_futures(list(knn_prob_res.values()))

        n_outputs = len(self.n_unique)

        # Gather resulting partitions per output column and return result.
        outputs = []
        for o in range(n_outputs):
            futures = flatten_grouped_results(self.client,
                                              query_parts_to_ranks,
                                              knn_prob_res,
                                              getter_func=_custom_getter(o))
            outputs.append(to_output(futures, self.datatype))

        comms.destroy()

        return tuple(outputs)
Ejemplo n.º 2
0
    def kneighbors(self,
                   X=None,
                   n_neighbors=None,
                   return_distance=True,
                   _return_futures=False):
        """
        Query the distributed nearest neighbors index

        Parameters
        ----------
        X : dask_cudf.Dataframe
            Vectors to query. If not provided, neighbors of each indexed point
            are returned.
        n_neighbors : int
            Number of neighbors to query for each row in X. If not provided,
            the n_neighbors on the model are used.
        return_distance : boolean (default=True)
            If false, only indices are returned
        _return_futures : boolean (default=False)
            Internal flag: when True, return the raw worker futures
            (including the fitted-model futures) instead of Dask outputs.

        Returns
        -------
        ret : tuple (dask_cudf.DataFrame, dask_cudf.DataFrame)
            First dask-cuDF DataFrame contains distances, second contains the
            indices.
        """
        n_neighbors = self.get_neighbors(n_neighbors)

        # Self-query: reuse the fitted index data when no X is given.
        query_handler = self.X_handler if X is None else \
            DistributedDataHandler.create(data=X, client=self.client)

        if query_handler is None:
            raise ValueError("Model needs to be trained using fit() "
                             "before calling kneighbors()")

        # Create communicator clique
        comms = NearestNeighbors._build_comms(self.X_handler, query_handler,
                                              self.streams_per_handle)

        # Initialize models on workers
        nn_models = self._create_models(comms)

        # Perform model query
        nn_fit, out_d_futures, out_i_futures = \
            self._query_models(n_neighbors, comms, nn_models,
                               self.X_handler, query_handler)

        comms.destroy()

        # BUGFIX: the original `ret = nn_fit, out_i_futures if ... else ...`
        # bound the conditional to the second tuple element only, so the
        # return_distance=True futures path returned a nested 2-tuple.
        # Explicit branches make the four result shapes unambiguous.
        if _return_futures:
            if return_distance:
                ret = (nn_fit, out_d_futures, out_i_futures)
            else:
                ret = (nn_fit, out_i_futures)
        else:
            if return_distance:
                ret = (to_output(out_d_futures, self.datatype),
                       to_output(out_i_futures, self.datatype))
            else:
                ret = to_output(out_i_futures, self.datatype)

        return ret
Ejemplo n.º 3
0
    def predict(self, X, convert_dtype=True):
        """
        Predict outputs for a query from previously stored index
        and outputs.
        The process is done in a multi-node multi-GPU fashion.

        Parameters
        ----------
        X : array-like (device or host) shape = (n_samples, n_features)
            Query data.
            Acceptable formats: dask cuDF, dask CuPy/NumPy/Numba Array

        convert_dtype : bool, optional (default = True)
            When set to True, the predict method will automatically
            convert the data to the right formats.

        Returns
        -------
        predictions : Dask futures or Dask CuPy Arrays
        """
        query_handler = \
            DistributedDataHandler.create(data=X,
                                          client=self.client)
        self.datatype = query_handler.datatype

        # Build a communicator clique spanning index and query workers.
        comms = KNeighborsRegressor._build_comms(self.data_handler,
                                                 query_handler,
                                                 self.streams_per_handle)

        worker_info = comms.worker_info(comms.worker_addresses)

        # Build inputs and outputs: map every partition to the rank of the
        # worker that holds it, for both the index and the query data.
        self.data_handler.calculate_parts_to_sizes(comms=comms)
        query_handler.calculate_parts_to_sizes(comms=comms)

        data_parts_to_ranks, data_nrows = \
            parts_to_ranks(self.client,
                           worker_info,
                           self.data_handler.gpu_futures)

        query_parts_to_ranks, query_nrows = \
            parts_to_ranks(self.client,
                           worker_info,
                           query_handler.gpu_futures)

        # Each Dask worker creates a single model.
        key = uuid1()
        models = {
            worker: self.client.submit(self._func_create_model,
                                       comms.sessionId,
                                       **self.kwargs,
                                       workers=[worker],
                                       key="%s-%s" % (key, idx))
            for idx, worker in enumerate(comms.worker_addresses)
        }

        # Invoke the distributed regression query on every worker (the
        # original comment said "knn_classify"; this is the regressor).
        # Workers holding no index (or query) partitions get empty part
        # lists so each rank still participates in the collective call.
        key = uuid1()
        knn_reg_res = {
            worker_info[worker]["rank"]:
                self.client.submit(self._func_predict,
                                   models[worker],
                                   self.data_handler.worker_to_parts[worker]
                                   if worker in self.data_handler.workers
                                   else [],
                                   data_parts_to_ranks,
                                   data_nrows,
                                   query_handler.worker_to_parts[worker]
                                   if worker in query_handler.workers
                                   else [],
                                   query_parts_to_ranks,
                                   query_nrows,
                                   X.shape[1],
                                   self.n_outputs,
                                   worker_info[worker]["rank"],
                                   convert_dtype,
                                   key="%s-%s" % (key, idx),
                                   workers=[worker])
            for idx, worker in enumerate(comms.worker_addresses)
        }

        wait_and_raise_from_futures(list(knn_reg_res.values()))

        # Gather resulting partitions and return result.
        out_futures = flatten_grouped_results(self.client,
                                              query_parts_to_ranks,
                                              knn_reg_res)

        comms.destroy()

        # squeeze() drops the trailing singleton axis for single-output
        # regression.
        return to_output(out_futures, self.datatype).squeeze()
Ejemplo n.º 4
0
Archivo: base.py Proyecto: isVoid/cuml
    def _fit(self, X, _transform=False):
        """
        Fit the model with X.

        Parameters
        ----------
        X : dask cuDF input
            Distributed training data.
        _transform : bool, optional (default = False)
            When True, also transform X on the workers and return the
            transformed output instead of ``self``.

        Returns
        -------
        self, or the transformed output when ``_transform`` is True.
        """

        n_cols = X.shape[1]

        data = DistributedDataHandler.create(data=X, client=self.client)
        self.datatype = data.datatype

        # The "tsqr" solver needs point-to-point communication between
        # workers; the other solvers only need collectives.
        if "svd_solver" in self.kwargs \
                and self.kwargs["svd_solver"] == "tsqr":
            comms = CommsContext(comms_p2p=True)
        else:
            comms = CommsContext(comms_p2p=False)

        comms.init(workers=data.workers)

        data.calculate_parts_to_sizes(comms)

        worker_info = comms.worker_info(comms.worker_addresses)
        parts_to_sizes, _ = parts_to_ranks(self.client, worker_info,
                                           data.gpu_futures)

        total_rows = data.total_rows

        # One model per worker, keyed by the worker's rank.
        models = {
            data.worker_info[worker]["rank"]:
                self.client.submit(self._create_model,
                                   comms.sessionId,
                                   self._model_func,
                                   self.datatype,
                                   **self.kwargs,
                                   pure=False,
                                   workers=[worker])
            for worker in data.worker_to_parts
        }

        # Run the synchronized fit on every worker with its local parts.
        pca_fit = {
            worker:
                self.client.submit(DecompositionSyncFitMixin._func_fit,
                                   models[data.worker_info[worker]["rank"]],
                                   parts,
                                   total_rows,
                                   n_cols,
                                   parts_to_sizes,
                                   data.worker_info[worker]["rank"],
                                   _transform,
                                   pure=False,
                                   workers=[worker])
            for worker, parts in data.worker_to_parts.items()
        }

        wait(list(pca_fit.values()))
        raise_exception_from_futures(list(pca_fit.values()))

        comms.destroy()

        # All workers converge to the same model; keep one as the local copy.
        self._set_internal_model(list(models.values())[0])

        if _transform:
            out_futures = flatten_grouped_results(self.client,
                                                  data.gpu_futures, pca_fit)
            return to_output(out_futures, self.datatype)

        return self
Ejemplo n.º 5
0
    def _fit(self, X, _transform=False):
        """
        Fit the model with X.

        Parameters
        ----------
        X : dask cuDF input
            Distributed training data.
        _transform : bool, optional (default = False)
            When True, also transform X on the workers and return the
            transformed output instead of ``self``.

        Returns
        -------
        self, or the transformed output when ``_transform`` is True.
        """

        n_cols = X.shape[1]

        data = DistributedDataHandler.create(data=X, client=self.client)
        self.datatype = data.datatype

        comms = CommsContext(comms_p2p=False)
        comms.init(workers=data.workers)

        data.calculate_parts_to_sizes(comms)

        total_rows = data.total_rows

        # One model per worker, keyed by the worker's rank.
        models = {
            data.worker_info[worker]["rank"]:
                self.client.submit(self._create_model,
                                   comms.sessionId,
                                   self._model_func,
                                   self.datatype,
                                   **self.kwargs,
                                   pure=False,
                                   workers=[worker])
            for worker in data.worker_to_parts
        }

        # Run the synchronized fit on every worker with its local parts.
        pca_fit = {
            worker:
                self.client.submit(
                    DecompositionSyncFitMixin._func_fit,
                    models[data.worker_info[worker]["rank"]],
                    parts,
                    total_rows,
                    n_cols,
                    data.parts_to_sizes[data.worker_info[worker]["rank"]],
                    data.worker_info[worker]["rank"],
                    _transform,
                    pure=False,
                    workers=[worker])
            for worker, parts in data.worker_to_parts.items()
        }

        wait(list(pca_fit.values()))
        raise_exception_from_futures(list(pca_fit.values()))

        comms.destroy()

        # All workers converge to the same model; materialize one locally
        # and expose the fitted attributes on this estimator.
        self.local_model = list(models.values())[0].result()

        self.components_ = self.local_model.components_
        self.explained_variance_ = self.local_model.explained_variance_
        self.explained_variance_ratio_ = \
            self.local_model.explained_variance_ratio_
        self.singular_values_ = self.local_model.singular_values_

        if _transform:
            out_futures = flatten_grouped_results(self.client,
                                                  data.gpu_futures, pca_fit)
            return to_output(out_futures, self.datatype)

        # The original block had an unreachable duplicate `return self`
        # after this line; it has been removed.
        return self