Ejemplo n.º 1
0
    def _build_comms(index_handler, query_handler, streams_per_handle):
        """
        Create and initialize a point-to-point communicator that spans the
        workers of both handlers.

        Parameters
        ----------
        index_handler : object exposing an iterable ``workers`` attribute
            Handler for the index partitions.
        query_handler : object exposing an iterable ``workers`` attribute
            Handler for the query partitions.
        streams_per_handle : forwarded unchanged to ``Comms``

        Returns
        -------
        Comms
            The communicator after ``init`` has been called on it. This
            function does not destroy it.
        """
        # The clique must cover every worker that hosts either an index or
        # a query partition, hence the union of the two worker collections.
        clique = set(index_handler.workers).union(query_handler.workers)

        communicator = Comms(comms_p2p=True,
                             streams_per_handle=streams_per_handle)
        communicator.init(workers=clique)
        return communicator
Ejemplo n.º 2
0
    def fit(self, X):
        """
        Fit a multi-node multi-GPU KMeans model

        Parameters
        ----------
        X : Dask cuDF DataFrame or CuPy backed Dask Array
            Training data to cluster.

        Returns
        -------
        self
            This estimator, with its internal model set from the first
            submitted fit task.
        """

        data = DistributedDataHandler.create(X, client=self.client)
        self.datatype = data.datatype

        comms = Comms(comms_p2p=False)
        comms.init(workers=data.workers)

        try:
            # One fit task per worker, pinned to the worker that holds the
            # corresponding partitions.
            kmeans_fit = [self.client.submit(KMeans._func_fit,
                                             comms.sessionId,
                                             parts,
                                             self.datatype,
                                             **self.kwargs,
                                             workers=[worker],
                                             pure=False)
                          for worker, parts in data.worker_to_parts.items()]

            wait_and_raise_from_futures(kmeans_fit)
        finally:
            # Tear the communicator down even when submitting or waiting
            # raised, so a failed fit does not leave it initialized.
            comms.destroy()

        self._set_internal_model(kmeans_fit[0])

        return self
Ejemplo n.º 3
0
Archivo: base.py Proyecto: h2oai/cuml
    def _fit(self, model_func, data):
        """
        Create one model per worker and fit them on the distributed data.

        Parameters
        ----------
        model_func : callable
            Submitted once per worker with the comms session id, the
            detected datatype and ``self.kwargs``; its result is passed
            as the first argument to ``_func_fit``.
        data : indexable collection of Dask inputs
            ``data[0].shape[1]`` is read as the number of columns before
            the collection is handed to ``DistributedDataHandler.create``.

        Returns
        -------
        dict
            Maps each worker's rank to the future holding that worker's
            model.
        """

        n_cols = data[0].shape[1]

        data = DistributedDataHandler.create(data=data, client=self.client)
        self.datatype = data.datatype

        comms = Comms(comms_p2p=False)
        comms.init(workers=data.workers)

        try:
            data.calculate_parts_to_sizes(comms)
            self.ranks = data.ranks

            worker_info = comms.worker_info(comms.worker_addresses)
            parts_to_sizes, _ = parts_to_ranks(self.client,
                                               worker_info,
                                               data.gpu_futures)

            # First pass: create a model on every worker, keyed by rank.
            lin_models = {}
            for worker in data.worker_to_parts:
                rank = data.worker_info[worker]["rank"]
                lin_models[rank] = self.client.submit(
                    model_func,
                    comms.sessionId,
                    self.datatype,
                    **self.kwargs,
                    pure=False,
                    workers=[worker])

            # Second pass: fit each model on the partitions held by its
            # own worker, keyed by worker address.
            lin_fit = {}
            for worker, parts in data.worker_to_parts.items():
                rank = data.worker_info[worker]["rank"]
                lin_fit[worker] = self.client.submit(
                    _func_fit,
                    lin_models[rank],
                    parts,
                    data.total_rows,
                    n_cols,
                    parts_to_sizes,
                    rank,
                    pure=False,
                    workers=[worker])

            wait_and_raise_from_futures(list(lin_fit.values()))
        finally:
            # Tear the communicator down even when a step above raised,
            # so a failed fit does not leave it initialized.
            comms.destroy()

        return lin_models
Ejemplo n.º 4
0
    def fit(self, X, sample_weight=None):
        """
        Fit a multi-node multi-GPU KMeans model

        Parameters
        ----------
        X : Dask cuDF DataFrame or CuPy backed Dask Array
            Training data to cluster.

        sample_weight : Dask cuDF DataFrame or CuPy backed Dask Array
                        shape = (n_samples,), default=None # noqa
            The weights for each observation in X. If None, all observations
            are assigned equal weight.
            Acceptable formats: cuDF DataFrame, NumPy ndarray, Numba device
            ndarray, cuda array interface compliant array like CuPy

        Returns
        -------
        self
            This estimator, with its internal model set from the first
            submitted fit task.
        """

        sample_weight = self._check_normalize_sample_weight(sample_weight)

        # Only hand the weights to the data handler when they were given.
        inputs = X if sample_weight is None else (X, sample_weight)

        data = DistributedDataHandler.create(inputs, client=self.client)
        self.datatype = data.datatype

        # This needs to happen on the scheduler
        comms = Comms(comms_p2p=False, client=self.client)
        comms.init(workers=data.workers)

        try:
            # One fit task per worker, pinned to the worker that holds the
            # corresponding partitions.
            kmeans_fit = [
                self.client.submit(KMeans._func_fit,
                                   comms.sessionId,
                                   parts,
                                   self.datatype,
                                   data.multiple,
                                   **self.kwargs,
                                   workers=[worker],
                                   pure=False)
                for worker, parts in data.worker_to_parts.items()
            ]

            wait_and_raise_from_futures(kmeans_fit)
        finally:
            # Tear the communicator down even when submitting or waiting
            # raised, so a failed fit does not leave it initialized.
            comms.destroy()

        self._set_internal_model(kmeans_fit[0])

        return self
Ejemplo n.º 5
0
    def fit(self, X, out_dtype="int32"):
        """
        Fit a multi-node multi-GPU DBSCAN model

        Parameters
        ----------
        X : array-like (device or host)
            Dense matrix containing floats or doubles.
            Acceptable formats: CUDA array interface compliant objects like
            CuPy, cuDF DataFrame/Series, NumPy ndarray and Pandas
            DataFrame/Series.
        out_dtype: dtype Determines the precision of the output labels array.
            default: "int32". Valid values are { "int32", np.int32,
            "int64", np.int64}.

        Returns
        -------
        self
            This estimator, with its internal model set from the first
            submitted fit task.

        Raises
        ------
        ValueError
            If ``out_dtype`` is not one of the accepted values.
        """
        if out_dtype not in ["int32", np.int32, "int64", np.int64]:
            raise ValueError("Invalid value for out_dtype. "
                             "Valid values are {'int32', 'int64', "
                             "np.int32, np.int64}")

        # Every worker receives the full input rather than a partition.
        data = self.client.scatter(X, broadcast=True)

        comms = Comms(comms_p2p=True)
        comms.init()

        try:
            # DBSCAN._func_fit(out_dtype) is called here, on the client;
            # its return value is the callable submitted to each worker.
            dbscan_fit = [
                self.client.submit(DBSCAN._func_fit(out_dtype),
                                   comms.sessionId,
                                   data,
                                   self.verbose,
                                   **self.kwargs,
                                   workers=[worker],
                                   pure=False)
                for worker in comms.worker_addresses
            ]

            wait_and_raise_from_futures(dbscan_fit)
        finally:
            # Tear the communicator down even when submitting or waiting
            # raised, so a failed fit does not leave it initialized.
            comms.destroy()

        self._set_internal_model(dbscan_fit[0])

        return self
Ejemplo n.º 6
0
    def _fit(self, X, _transform=False):
        """
        Fit the model with X.

        Parameters
        ----------
        X : dask cuDF input
            ``X.shape[1]`` is read as the number of columns.
        _transform : bool, default=False
            Forwarded to each worker's fit task. When true, the per-worker
            fit results are gathered and returned instead of ``self``.

        Returns
        -------
        self, or the value of ``to_output(...)`` when ``_transform`` is
        true.
        """

        n_cols = X.shape[1]

        data = DistributedDataHandler.create(data=X, client=self.client)
        self.datatype = data.datatype

        # Only the "tsqr" solver asks for point-to-point communication.
        use_p2p = ("svd_solver" in self.kwargs
                   and self.kwargs["svd_solver"] == "tsqr")
        comms = Comms(comms_p2p=use_p2p)

        comms.init(workers=data.workers)

        try:
            data.calculate_parts_to_sizes(comms)

            worker_info = comms.worker_info(comms.worker_addresses)
            parts_to_sizes, _ = parts_to_ranks(self.client, worker_info,
                                               data.gpu_futures)

            total_rows = data.total_rows

            # First pass: create a model on every worker, keyed by rank.
            models = {}
            for worker in data.worker_to_parts:
                rank = data.worker_info[worker]["rank"]
                models[rank] = self.client.submit(self._create_model,
                                                  comms.sessionId,
                                                  self._model_func,
                                                  self.datatype,
                                                  **self.kwargs,
                                                  pure=False,
                                                  workers=[worker])

            # Second pass: fit each model on the partitions held by its
            # own worker, keyed by worker address.
            pca_fit = {}
            for worker, parts in data.worker_to_parts.items():
                rank = data.worker_info[worker]["rank"]
                pca_fit[worker] = self.client.submit(
                    DecompositionSyncFitMixin._func_fit,
                    models[rank],
                    parts,
                    total_rows,
                    n_cols,
                    parts_to_sizes,
                    rank,
                    _transform,
                    pure=False,
                    workers=[worker])

            fit_futures = list(pca_fit.values())
            wait(fit_futures)
            raise_exception_from_futures(fit_futures)
        finally:
            # Tear the communicator down even when a step above raised,
            # so a failed fit does not leave it initialized.
            comms.destroy()

        self._set_internal_model(list(models.values())[0])

        if _transform:
            out_futures = flatten_grouped_results(self.client,
                                                  data.gpu_futures, pca_fit)
            return to_output(out_futures, self.datatype)

        return self