Example 1
    def _query_models(self, n_neighbors,
                      comms, nn_models,
                      index_futures, query_futures):

        worker_info = comms.worker_info(comms.worker_addresses)

        index_worker_to_parts = workers_to_parts(index_futures)
        query_worker_to_parts = workers_to_parts(query_futures)

        """
        Build inputs and outputs
        """
        idx_parts_to_ranks, idx_M = parts_to_ranks(self.client,
                                                   worker_info,
                                                   index_futures)

        query_parts_to_ranks, query_M = parts_to_ranks(self.client,
                                                       worker_info,
                                                       query_futures)

        """
        Invoke kneighbors on Dask workers to perform distributed query
        """

        key = uuid1()
        nn_fit = dict([(worker_info[worker]["r"], self.client.submit(
                        NearestNeighbors._func_kneighbors,
                        nn_models[worker],
                        index_worker_to_parts[worker] if
                        worker in index_worker_to_parts else [],
                        idx_M,
                        self.n_cols,
                        idx_parts_to_ranks,
                        query_worker_to_parts[worker] if
                        worker in query_worker_to_parts else [],
                        query_M,
                        query_parts_to_ranks,
                        worker_info[worker]["r"],
                        n_neighbors,
                        key="%s-%s" % (key, idx),
                        workers=[worker]))
                       for idx, worker in enumerate(comms.worker_addresses)])

        wait(list(nn_fit.values()))
        raise_exception_from_futures(list(nn_fit.values()))

        """
        Gather resulting partitions and return dask_cudfs
        """
        out_d_futures = flatten_grouped_results(self.client,
                                                query_parts_to_ranks,
                                                nn_fit,
                                                getter_func=_func_get_d)

        out_i_futures = flatten_grouped_results(self.client,
                                                query_parts_to_ranks,
                                                nn_fit,
                                                getter_func=_func_get_i)

        return nn_fit, out_d_futures, out_i_futures
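The pattern above recurs throughout this section: one future per worker, keyed by the worker's rank, pinned to that worker with workers=[worker], and given an explicit key derived from uuid1() so repeated calls do not collide. A minimal self-contained sketch of the same pattern, with a hypothetical task function standing in for NearestNeighbors._func_kneighbors:

from uuid import uuid1
from dask.distributed import Client, wait

def task(parts, rank):
    # Stand-in for the real worker-side function.
    return rank, len(parts)

client = Client()  # in practice, connect to an existing scheduler

key = uuid1()
futures = {
    rank: client.submit(task, [], rank,
                        key="%s-%s" % (key, rank),
                        workers=[worker])
    for rank, worker in enumerate(client.scheduler_info()["workers"])
}
wait(list(futures.values()))
print(client.gather(list(futures.values())))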
Example 2
File: base.py Project: h2oai/cuml
    def _fit(self, model_func, data):

        n_cols = data[0].shape[1]

        data = DistributedDataHandler.create(data=data, client=self.client)
        self.datatype = data.datatype

        comms = Comms(comms_p2p=False)
        comms.init(workers=data.workers)

        data.calculate_parts_to_sizes(comms)
        self.ranks = data.ranks

        worker_info = comms.worker_info(comms.worker_addresses)
        parts_to_sizes, _ = parts_to_ranks(self.client,
                                           worker_info,
                                           data.gpu_futures)

        lin_models = dict([(data.worker_info[worker_data[0]]["rank"],
                            self.client.submit(
            model_func,
            comms.sessionId,
            self.datatype,
            **self.kwargs,
            pure=False,
            workers=[worker_data[0]]))

            for worker_data in data.worker_to_parts.items()])

        lin_fit = dict([(worker_data[0], self.client.submit(
            _func_fit,
            lin_models[data.worker_info[worker_data[0]]["rank"]],
            worker_data[1],
            data.total_rows,
            n_cols,
            parts_to_sizes,
            data.worker_info[worker_data[0]]["rank"],
            pure=False,
            workers=[worker_data[0]]))

            for worker_data in data.worker_to_parts.items()])

        wait_and_raise_from_futures(list(lin_fit.values()))

        comms.destroy()
        return lin_models
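A hypothetical end-to-end use of an estimator built on this _fit, for example cuml.dask.linear_model.LinearRegression. This is a sketch under the assumption that a dask-CUDA cluster, cudf, and dask_cudf are available:

from dask.distributed import Client
from dask_cuda import LocalCUDACluster
import cudf
import dask_cudf
from cuml.dask.linear_model import LinearRegression

cluster = LocalCUDACluster()
client = Client(cluster)

df = cudf.DataFrame({"x": [1.0, 2.0, 3.0, 4.0],
                     "y": [2.0, 4.0, 6.0, 8.0]})
X = dask_cudf.from_cudf(df[["x"]], npartitions=2)
y = dask_cudf.from_cudf(df["y"], npartitions=2)

lr = LinearRegression(client=client)
lr.fit(X, y)  # drives the per-worker submit pattern shown above
print(lr.predict(X).compute())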
Example 3
    def predict_proba(self, X, convert_dtype=True):
        """
        Predict class probabilities for a query from the previously
        stored index and labels.

        Parameters
        ----------
        X : array-like (device or host) shape = (n_samples, n_features)
            Query data.
            Acceptable formats: dask cuDF, dask CuPy/NumPy/Numba Array

        convert_dtype : bool, optional (default = True)
            When set to True, the method will automatically
            convert the inputs to the correct format.

        Returns
        -------
        probabilities : Dask futures or Dask CuPy Arrays
        """
        query_handler = \
            DistributedDataHandler.create(data=X,
                                          client=self.client)
        self.datatype = query_handler.datatype

        comms = KNeighborsClassifier._build_comms(self.data_handler,
                                                  query_handler,
                                                  self.streams_per_handle)

        worker_info = comms.worker_info(comms.worker_addresses)
        """
        Build inputs and outputs
        """
        self.data_handler.calculate_parts_to_sizes(comms=comms)
        query_handler.calculate_parts_to_sizes(comms=comms)

        data_parts_to_ranks, data_nrows = \
            parts_to_ranks(self.client,
                           worker_info,
                           self.data_handler.gpu_futures)

        query_parts_to_ranks, query_nrows = \
            parts_to_ranks(self.client,
                           worker_info,
                           query_handler.gpu_futures)
        """
        Each Dask worker creates a single model
        """
        key = uuid1()
        models = dict([(worker,
                        self.client.submit(self._func_create_model,
                                           comms.sessionId,
                                           **self.kwargs,
                                           workers=[worker],
                                           key="%s-%s" % (key, idx)))
                       for idx, worker in enumerate(comms.worker_addresses)])
        """
        Invoke knn_classify on Dask workers to perform distributed query
        """
        key = uuid1()
        knn_prob_res = dict([
            (worker_info[worker]["rank"],
             self.client.submit(self._func_predict,
                                models[worker],
                                self.data_handler.worker_to_parts[worker]
                                if worker in self.data_handler.workers else [],
                                data_parts_to_ranks,
                                data_nrows,
                                query_handler.worker_to_parts[worker]
                                if worker in query_handler.workers else [],
                                query_parts_to_ranks,
                                query_nrows,
                                self.uniq_labels,
                                self.n_unique,
                                X.shape[1],
                                worker_info[worker]["rank"],
                                convert_dtype,
                                True,
                                key="%s-%s" % (key, idx),
                                workers=[worker]))
            for idx, worker in enumerate(comms.worker_addresses)
        ])

        wait_and_raise_from_futures(list(knn_prob_res.values()))

        n_outputs = len(self.n_unique)
        """
        Gather resulting partitions and return result
        """
        outputs = []
        for o in range(n_outputs):
            futures = flatten_grouped_results(self.client,
                                              query_parts_to_ranks,
                                              knn_prob_res,
                                              getter_func=_custom_getter(o))
            outputs.append(to_output(futures, self.datatype))

        comms.destroy()

        return tuple(outputs)
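A hedged usage sketch, assuming this method belongs to cuml.dask.neighbors.KNeighborsClassifier and that a dask-CUDA cluster with CuPy-backed Dask arrays is available. Note that, as the gather loop above shows, predict_proba returns a tuple with one entry per output column:

from dask.distributed import Client
from dask_cuda import LocalCUDACluster
import cupy as cp
import dask.array as da
from cuml.dask.neighbors import KNeighborsClassifier

client = Client(LocalCUDACluster())
X = da.from_array(cp.random.random((100, 4)), chunks=(50, 4))
y = da.from_array(cp.random.randint(0, 2, size=100), chunks=(50,))

knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X, y)
proba, = knn.predict_proba(X)  # one Dask array per output column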
Example 4
    def _query_models(self, n_neighbors, comms, nn_models, index_handler,
                      query_handler):

        worker_info = comms.worker_info(comms.worker_addresses)
        """
        Build inputs and outputs
        """
        index_handler.calculate_parts_to_sizes(comms=comms)
        query_handler.calculate_parts_to_sizes(comms=comms)

        idx_parts_to_ranks, _ = parts_to_ranks(self.client, worker_info,
                                               index_handler.gpu_futures)

        query_parts_to_ranks, _ = parts_to_ranks(self.client, worker_info,
                                                 query_handler.gpu_futures)
        """
        Invoke kneighbors on Dask workers to perform distributed query
        """
        key = uuid1()
        nn_fit = dict([
            (worker_info[worker]["rank"],
             self.client.submit(NearestNeighbors._func_kneighbors,
                                nn_models[worker],
                                index_handler.worker_to_parts[worker]
                                if worker in index_handler.workers else [],
                                idx_parts_to_ranks,
                                index_handler.total_rows,
                                query_handler.worker_to_parts[worker]
                                if worker in query_handler.workers else [],
                                query_parts_to_ranks,
                                query_handler.total_rows,
                                self.n_cols,
                                worker_info[worker]["rank"],
                                n_neighbors,
                                False,
                                key="%s-%s" % (key, idx),
                                workers=[worker]))
            for idx, worker in enumerate(comms.worker_addresses)
        ])

        wait_and_raise_from_futures(list(nn_fit.values()))

        def _custom_getter(o):
            def func_get(f, idx):
                return f[o][idx]

            return func_get

        """
        Gather resulting partitions and return dask_cudfs
        """
        out_d_futures = flatten_grouped_results(self.client,
                                                query_parts_to_ranks,
                                                nn_fit,
                                                getter_func=_custom_getter(0))

        out_i_futures = flatten_grouped_results(self.client,
                                                query_parts_to_ranks,
                                                nn_fit,
                                                getter_func=_custom_getter(1))

        return nn_fit, out_d_futures, out_i_futures
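The _custom_getter closure defined inside _query_models simply fixes the output index o: each worker's future resolves to a tuple of outputs (distances at index 0, indices at index 1), and the returned func_get picks partition idx of output o. A pure-Python illustration with placeholder data:

def _custom_getter(o):
    def func_get(f, idx):
        return f[o][idx]
    return func_get

# A worker result shaped like (distance partitions, index partitions):
result = (["d0", "d1"], ["i0", "i1"])
assert _custom_getter(0)(result, 1) == "d1"  # distances, partition 1
assert _custom_getter(1)(result, 0) == "i0"  # indices, partition 0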
Example 5
File: base.py Project: isVoid/cuml
    def _fit(self, X, _transform=False):
        """
        Fit the model with X.

        Parameters
        ----------
        X : dask cuDF input

        """

        n_cols = X.shape[1]

        data = DistributedDataHandler.create(data=X, client=self.client)
        self.datatype = data.datatype

        if "svd_solver" in self.kwargs \
                and self.kwargs["svd_solver"] == "tsqr":
            comms = CommsContext(comms_p2p=True)
        else:
            comms = CommsContext(comms_p2p=False)

        comms.init(workers=data.workers)

        data.calculate_parts_to_sizes(comms)

        worker_info = comms.worker_info(comms.worker_addresses)
        parts_to_sizes, _ = parts_to_ranks(self.client, worker_info,
                                           data.gpu_futures)

        total_rows = data.total_rows

        models = dict([(data.worker_info[wf[0]]["rank"],
                        self.client.submit(self._create_model,
                                           comms.sessionId,
                                           self._model_func,
                                           self.datatype,
                                           **self.kwargs,
                                           pure=False,
                                           workers=[wf[0]]))
                       for wf in data.worker_to_parts.items()])

        pca_fit = dict([
            (wf[0],
             self.client.submit(DecompositionSyncFitMixin._func_fit,
                                models[data.worker_info[wf[0]]["rank"]],
                                wf[1],
                                total_rows,
                                n_cols,
                                parts_to_sizes,
                                data.worker_info[wf[0]]["rank"],
                                _transform,
                                pure=False,
                                workers=[wf[0]]))
            for wf in data.worker_to_parts.items()
        ])

        wait(list(pca_fit.values()))
        raise_exception_from_futures(list(pca_fit.values()))

        comms.destroy()

        self._set_internal_model(list(models.values())[0])

        if _transform:
            out_futures = flatten_grouped_results(self.client,
                                                  data.gpu_futures, pca_fit)
            return to_output(out_futures, self.datatype)

        return self
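A hypothetical use of an estimator built on this mixin, for example cuml.dask.decomposition.PCA; per the branch at the top of _fit, an svd_solver of "tsqr" would select the peer-to-peer comms path. A sketch, assuming X_ddf is a dask cuDF DataFrame on a running dask-CUDA cluster:

from cuml.dask.decomposition import PCA

pca = PCA(n_components=2)
pca.fit(X_ddf)                   # the plain _fit(X) path above: returns self
X_t = pca.fit_transform(X_ddf)   # the _transform=True path: returns output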
Example 6
    def predict(self, X, convert_dtype=True):
        """
        Predict outputs for a query from the previously stored index
        and outputs.
        The process is done in a multi-node, multi-GPU fashion.

        Parameters
        ----------
        X : array-like (device or host) shape = (n_samples, n_features)
            Query data.
            Acceptable formats: dask cuDF, dask CuPy/NumPy/Numba Array

        convert_dtype : bool, optional (default = True)
            When set to True, the predict method will automatically
            convert the inputs to the correct format.

        Returns
        -------
        predictions : Dask futures or Dask CuPy Arrays
        """
        query_handler = \
            DistributedDataHandler.create(data=X,
                                          client=self.client)
        self.datatype = query_handler.datatype

        comms = KNeighborsRegressor._build_comms(self.data_handler,
                                                 query_handler,
                                                 self.streams_per_handle)

        worker_info = comms.worker_info(comms.worker_addresses)
        """
        Build inputs and outputs
        """
        self.data_handler.calculate_parts_to_sizes(comms=comms)
        query_handler.calculate_parts_to_sizes(comms=comms)

        data_parts_to_ranks, data_nrows = \
            parts_to_ranks(self.client,
                           worker_info,
                           self.data_handler.gpu_futures)

        query_parts_to_ranks, query_nrows = \
            parts_to_ranks(self.client,
                           worker_info,
                           query_handler.gpu_futures)
        """
        Each Dask worker creates a single model
        """
        key = uuid1()
        models = dict([(worker,
                        self.client.submit(self._func_create_model,
                                           comms.sessionId,
                                           **self.kwargs,
                                           workers=[worker],
                                           key="%s-%s" % (key, idx)))
                       for idx, worker in enumerate(comms.worker_addresses)])
        """
        Invoke distributed kNN regression on Dask workers to perform the query
        """
        key = uuid1()
        knn_reg_res = dict([
            (worker_info[worker]["rank"],
             self.client.submit(self._func_predict,
                                models[worker],
                                self.data_handler.worker_to_parts[worker]
                                if worker in self.data_handler.workers else [],
                                data_parts_to_ranks,
                                data_nrows,
                                query_handler.worker_to_parts[worker]
                                if worker in query_handler.workers else [],
                                query_parts_to_ranks,
                                query_nrows,
                                X.shape[1],
                                self.n_outputs,
                                worker_info[worker]["rank"],
                                convert_dtype,
                                key="%s-%s" % (key, idx),
                                workers=[worker]))
            for idx, worker in enumerate(comms.worker_addresses)
        ])

        wait_and_raise_from_futures(list(knn_reg_res.values()))
        """
        Gather resulting partitions and return result
        """
        out_futures = flatten_grouped_results(self.client,
                                              query_parts_to_ranks,
                                              knn_reg_res)

        comms.destroy()

        return to_output(out_futures, self.datatype).squeeze()
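To close, a hedged usage sketch assuming this method belongs to cuml.dask.neighbors.KNeighborsRegressor and that a dask-CUDA cluster is running; the trailing .squeeze() above drops the extra dimension for single-output regression:

from dask.distributed import Client
from dask_cuda import LocalCUDACluster
import cupy as cp
import dask.array as da
from cuml.dask.neighbors import KNeighborsRegressor

client = Client(LocalCUDACluster())
X = da.from_array(cp.random.random((100, 4)), chunks=(50, 4))
y = da.from_array(cp.random.random(100), chunks=(50,))

knr = KNeighborsRegressor(n_neighbors=3)
knr.fit(X, y)
preds = knr.predict(X)  # 1-D Dask CuPy array after the squeeze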