def _predict_using_cpu(self, X, convert_dtype):
        """
        Predict values for X by averaging the outputs of every worker's forest.

        X is scattered through ``self.client`` and one
        ``RandomForestRegressor._predict_cpu`` task is pinned to each worker
        listed in ``self.workers``. The per-worker outputs are then averaged
        row by row on the client.

        Parameters
        ----------
        X : input rows; ``len(X)`` is used as the number of rows
        convert_dtype : forwarded unchanged to ``_predict_cpu``

        Returns
        -------
        list holding one averaged prediction per row of X
        """
        worker_list = self.workers

        scattered_input = self.client.scatter(X)

        tasks = []
        for worker in worker_list:
            task = self.client.submit(
                RandomForestRegressor._predict_cpu,
                self.rfs[worker],
                scattered_input,
                convert_dtype,
                workers=[worker],
            )
            tasks.append(task)

        wait(tasks)
        raise_exception_from_futures(tasks)

        per_worker = [task.result() for task in tasks]

        averaged = []
        for row in range(len(X)):
            # Accumulate in worker order, starting from a float zero.
            total = 0.0
            for worker_output in per_worker:
                total = total + worker_output[row]
            averaged.append(total / len(per_worker))

        return averaged
Exemple #2
0
    def _query_models(self, n_neighbors,
                      comms, nn_models,
                      index_futures, query_futures):
        """
        Submit one kneighbors task per comms worker and collect the outputs.

        Parameters
        ----------
        n_neighbors : forwarded to ``NearestNeighbors._func_kneighbors``
        comms : comms context providing ``worker_addresses`` and
            ``worker_info``
        nn_models : mapping from worker address to that worker's model
        index_futures : futures of the index partitions
        query_futures : futures of the query partitions

        Returns
        -------
        tuple ``(nn_fit, out_d_futures, out_i_futures)`` where ``nn_fit`` maps
        each worker's ``"r"`` entry to its task future and the other two are
        the results of ``flatten_grouped_results`` with ``_func_get_d`` and
        ``_func_get_i`` respectively.
        """
        worker_info = comms.worker_info(comms.worker_addresses)

        index_worker_to_parts = workers_to_parts(index_futures)
        query_worker_to_parts = workers_to_parts(query_futures)

        # Build inputs and outputs
        idx_parts_to_ranks, idx_M = parts_to_ranks(
            self.client, worker_info, index_futures)
        query_parts_to_ranks, query_M = parts_to_ranks(
            self.client, worker_info, query_futures)

        # Invoke kneighbors on Dask workers to perform distributed query
        key = uuid1()
        nn_fit = {}
        for idx, worker in enumerate(comms.worker_addresses):
            rank = worker_info[worker]["r"]

            # Workers holding no index/query partitions get an empty list.
            if worker in index_worker_to_parts:
                local_index_parts = index_worker_to_parts[worker]
            else:
                local_index_parts = []

            if worker in query_worker_to_parts:
                local_query_parts = query_worker_to_parts[worker]
            else:
                local_query_parts = []

            nn_fit[rank] = self.client.submit(
                NearestNeighbors._func_kneighbors,
                nn_models[worker],
                local_index_parts,
                idx_M,
                self.n_cols,
                idx_parts_to_ranks,
                local_query_parts,
                query_M,
                query_parts_to_ranks,
                rank,
                n_neighbors,
                key="%s-%s" % (key, idx),
                workers=[worker])

        submitted = list(nn_fit.values())
        wait(submitted)
        raise_exception_from_futures(list(nn_fit.values()))

        # Gather resulting partitions and return dask_cudfs
        out_d_futures = flatten_grouped_results(
            self.client, query_parts_to_ranks, nn_fit,
            getter_func=_func_get_d)

        out_i_futures = flatten_grouped_results(
            self.client, query_parts_to_ranks, nn_fit,
            getter_func=_func_get_i)

        return nn_fit, out_d_futures, out_i_futures
Exemple #3
0
    def predict(self, X):
        """
        Compute distributed predictions for X and return them as y_pred.

        Parameters
        ----------
        X : dask cuDF dataframe (n_rows, n_features)

        Returns
        -------
        y : dask cuDF (n_rows, 1)
        """
        gpu_futures = self.client.sync(extract_ddf_partitions, X)

        # Group the partitions by the worker that holds them.
        worker_to_parts = OrderedDict()
        for worker, part in gpu_futures:
            worker_to_parts.setdefault(worker, []).append(part)

        # One (rank, size) pair per partition; every size is fetched
        # synchronously before the next size task is submitted.
        size_key = uuid1()
        rank_sizes = []
        for idx, wf in enumerate(gpu_futures):
            rank = self.rnks[wf[0]]
            size = self.client.submit(
                Ridge._func_get_size,
                wf[1],
                workers=[wf[0]],
                key="%s-%s" % (size_key, idx)).result()
            rank_sizes.append((rank, size))

        n_cols = X.shape[1]
        n_rows = reduce(lambda total, size: total + size,
                        [pair[1] for pair in rank_sizes])

        # One prediction task per fitted model, keyed by the worker's rank.
        pred_key = uuid1()
        linear_pred = {}
        for idx, wf in enumerate(self.linear_models):
            worker = wf[0]
            rank = self.rnks[worker]
            linear_pred[rank] = self.client.submit(
                Ridge._func_predict,
                wf[1],
                worker_to_parts[worker],
                n_rows, n_cols,
                rank_sizes,
                rank,
                key="%s-%s" % (pred_key, idx),
                workers=[worker])

        wait(list(linear_pred.values()))
        raise_exception_from_futures(list(linear_pred.values()))

        # Pull each partition's output out of its rank's result, using a
        # running per-rank position.
        out_futures = []
        next_index = {}
        for rank, _size in rank_sizes:
            position = next_index.get(rank, 0)
            out_futures.append(self.client.submit(
                Ridge._func_get_idx, linear_pred[rank], position))
            next_index[rank] = position + 1

        return to_dask_cudf(out_futures)
Exemple #4
0
    def _fit_with_colocality(self, X, y):
        """
        Fit one model per worker on co-located partitions of X and y.

        Sets ``self.rnks``, ``self.linear_models``, ``self.local_model``,
        ``self.coef_`` and ``self.intercept_``.

        Parameters
        ----------
        X : distributed input; ``X.shape[1]`` is used as the column count
        y : distributed labels, passed with X to
            ``extract_colocated_ddf_partitions``
        """
        input_futures = self.client.sync(extract_colocated_ddf_partitions,
                                         X, y, self.client)
        workers = list(input_futures.keys())

        comms = CommsContext(comms_p2p=False)
        comms.init(workers=workers)

        # Everything between init and destroy runs under try/finally so the
        # comms session is torn down even when a size query or fit task fails.
        try:
            worker_info = comms.worker_info(comms.worker_addresses)

            n_cols = X.shape[1]
            n_rows = 0

            self.rnks = dict()
            partsToSizes = dict()
            key = uuid1()
            for w, futures in input_futures.items():
                rank = worker_info[w]["r"]
                self.rnks[w] = rank
                # The rank is part of the task key: idx restarts at 0 for
                # every worker, so "<key>-<idx>" alone would repeat across
                # workers and Dask identifies tasks by key.
                parts = [self.client.submit(
                    Ridge._func_get_size_cl,
                    future,
                    workers=[w],
                    key="%s-%s-%s" % (key, rank, idx)).result()
                    for idx, future in enumerate(futures)]

                partsToSizes[rank] = parts
                for p in parts:
                    n_rows = n_rows + p

            key = uuid1()
            self.linear_models = [(w, self.client.submit(
                Ridge._func_create_model,
                comms.sessionId,
                **self.kwargs,
                workers=[w],
                key="%s-%s" % (key, idx)))
                for idx, w in enumerate(workers)]

            key = uuid1()
            linear_fit = {
                worker_info[wf[0]]["r"]: self.client.submit(
                    Ridge._func_fit_colocated,
                    wf[1],
                    input_futures[wf[0]],
                    n_rows, n_cols,
                    partsToSizes,
                    worker_info[wf[0]]["r"],
                    key="%s-%s" % (key, idx),
                    workers=[wf[0]])
                for idx, wf in enumerate(self.linear_models)
            }

            wait(list(linear_fit.values()))
            raise_exception_from_futures(list(linear_fit.values()))
        finally:
            comms.destroy()

        # The first worker's model supplies the fitted attributes.
        self.local_model = self.linear_models[0][1].result()
        self.coef_ = self.local_model.coef_
        self.intercept_ = self.local_model.intercept_
Exemple #5
0
def test_dask_exceptions(cluster):
    """Check that a failed future is re-raised as RuntimeError."""
    dask_client = Client(cluster)
    try:
        failing = dask_client.submit(_raise_exception)
        wait(failing)

        with pytest.raises(RuntimeError):
            raise_exception_from_futures([failing])
    finally:
        # Always release the client, whether or not the assertion held.
        dask_client.close()
Exemple #6
0
    def _inverse_transform(self, X):
        """
        Run the inverse transform of every fitted model on the partitions of X.

        Parameters
        ----------
        X : distributed input; ``X.shape[1]`` is used as the column count

        Returns
        -------
        The result of ``to_dask_cudf`` over one output future per input
        partition.
        """
        gpu_futures = self.client.sync(extract_ddf_partitions, X)

        # Group the partitions by the worker that holds them.
        worker_to_parts = OrderedDict()
        for worker, part in gpu_futures:
            worker_to_parts.setdefault(worker, []).append(part)

        # One (rank, size) pair per partition; each size is fetched
        # synchronously before the next size task is submitted.
        size_key = uuid1()
        rank_sizes = []
        for idx, wf in enumerate(gpu_futures):
            rank = self.rnks[wf[0]]
            size = self.client.submit(TruncatedSVD._func_get_size,
                                      wf[1],
                                      workers=[wf[0]],
                                      key="%s-%s" % (size_key, idx)).result()
            rank_sizes.append((rank, size))

        N = X.shape[1]
        M = reduce(lambda total, size: total + size,
                   [pair[1] for pair in rank_sizes])

        # One inverse-transform task per fitted model, keyed by rank.
        task_key = uuid1()
        tsvd_inverse_transform = {}
        for idx, wf in enumerate(self.tsvd_models):
            worker = wf[0]
            rank = self.rnks[worker]
            tsvd_inverse_transform[rank] = self.client.submit(
                TruncatedSVD._func_inverse_transform,
                wf[1],
                worker_to_parts[worker],
                M,
                N,
                rank_sizes,
                rank,
                key="%s-%s" % (task_key, idx),
                workers=[worker])

        wait(list(tsvd_inverse_transform.values()))
        raise_exception_from_futures(list(tsvd_inverse_transform.values()))

        # Pull each partition's output out of its rank's result, using a
        # running per-rank position.
        out_futures = []
        next_index = {}
        for rank, _size in rank_sizes:
            position = next_index.get(rank, 0)
            out_futures.append(
                self.client.submit(TruncatedSVD._func_get_idx,
                                   tsvd_inverse_transform[rank],
                                   position))
            next_index[rank] = position + 1

        return to_dask_cudf(out_futures)
Exemple #7
0
    def predict(self, X):
        """
        Predict the regressor outputs for X by averaging over all workers.

        Parameters
        ----------
        X : Dense matrix (floats or doubles) of shape (n_samples, n_features).
            Must be a NumPy array.

        Returns
        ----------
        y : list
            One averaged prediction per row of X.

        Raises
        ------
        ValueError
            If X is not a ``numpy.ndarray``.
        """
        client = default_client()
        worker_list = self.workers

        if not isinstance(X, np.ndarray):
            raise ValueError("Predict inputs must be numpy arrays")

        scattered_input = client.scatter(X)

        tasks = []
        for worker in worker_list:
            # A fresh random number is passed with every submission.
            # NOTE(review): presumably this keeps Dask from treating repeated
            # calls as the same pure task -- confirm.
            task = client.submit(
                RandomForestRegressor._predict,
                self.rfs[worker],
                scattered_input,
                random.random(),
                workers=[worker],
            )
            tasks.append(task)

        wait(tasks)
        raise_exception_from_futures(tasks)

        per_worker = [task.result() for task in tasks]

        averaged = []
        for row in range(len(X)):
            # Accumulate in worker order, starting from a float zero.
            total = 0.0
            for worker_output in per_worker:
                total = total + worker_output[row]
            averaged.append(total / len(per_worker))

        return averaged
Exemple #8
0
    def print_summary(self):
        """
        Run ``RandomForestClassifier._print_summary`` on every worker's forest.

        Returns
        -------
        self
        """
        worker_list = self.workers

        tasks = []
        for worker in worker_list:
            task = self.client.submit(
                RandomForestClassifier._print_summary,
                self.rfs[worker],
                workers=[worker],
            )
            tasks.append(task)

        wait(tasks)
        raise_exception_from_futures(tasks)
        return self
Exemple #9
0
    def _fit(self, model_func, data, **kwargs):
        """
        Create one model per worker and fit them over a shared comms session.

        Parameters
        ----------
        model_func : callable submitted to each worker to build its model
        data : sequence of distributed inputs handed to
            ``DistributedDataHandler.create``; ``data[0].shape[1]`` is used as
            the column count
        **kwargs : accepted but not read here; the models are built from
            ``self.kwargs``.  NOTE(review): confirm this is intentional.

        Returns
        -------
        dict mapping each worker's rank to the future of its model
        """
        n_cols = data[0].shape[1]

        data = DistributedDataHandler.create(data=data, client=self.client)
        self.datatype = data.datatype

        comms = CommsContext(comms_p2p=False, verbose=self.verbose)
        comms.init(workers=data.workers)

        # Everything between init and destroy runs under try/finally so the
        # comms session is torn down even when a task fails.
        try:
            data.calculate_parts_to_sizes(comms)
            self.ranks = data.ranks

            lin_models = {
                data.worker_info[worker]["rank"]: self.client.submit(
                    model_func,
                    comms.sessionId,
                    self.datatype,
                    **self.kwargs,
                    pure=False,
                    workers=[worker])
                for worker in data.worker_to_parts
            }

            lin_fit = {
                worker: self.client.submit(
                    _func_fit,
                    lin_models[data.worker_info[worker]["rank"]],
                    parts,
                    data.total_rows,
                    n_cols,
                    data.parts_to_sizes[data.worker_info[worker]["rank"]],
                    data.worker_info[worker]["rank"],
                    pure=False,
                    workers=[worker])
                for worker, parts in data.worker_to_parts.items()
            }

            wait(list(lin_fit.values()))
            raise_exception_from_futures(list(lin_fit.values()))
        finally:
            comms.destroy()

        return lin_models
Exemple #10
0
    def _fit(self, X, _transform=False):
        """
        Fit the model with X.

        Sets ``self.datatype``, ``self.local_model`` and the fitted attributes
        ``components_``, ``explained_variance_``,
        ``explained_variance_ratio_`` and ``singular_values_``.

        Parameters
        ----------
        X : dask cuDF input
        _transform : bool, default False
            Forwarded to the per-worker fit. When True, the per-worker results
            are flattened and returned instead of ``self``.

        Returns
        -------
        ``self``, or the result of ``to_output`` when ``_transform`` is True
        """

        n_cols = X.shape[1]

        data = DistributedDataHandler.create(data=X, client=self.client)
        self.datatype = data.datatype

        comms = CommsContext(comms_p2p=False)
        comms.init(workers=data.workers)

        # Everything between init and destroy runs under try/finally so the
        # comms session is torn down even when a task fails.
        try:
            data.calculate_parts_to_sizes(comms)

            total_rows = data.total_rows

            models = {
                data.worker_info[worker]["rank"]: self.client.submit(
                    self._create_model,
                    comms.sessionId,
                    self._model_func,
                    self.datatype,
                    **self.kwargs,
                    pure=False,
                    workers=[worker])
                for worker in data.worker_to_parts
            }

            pca_fit = {
                worker: self.client.submit(
                    DecompositionSyncFitMixin._func_fit,
                    models[data.worker_info[worker]["rank"]],
                    parts,
                    total_rows,
                    n_cols,
                    data.parts_to_sizes[data.worker_info[worker]["rank"]],
                    data.worker_info[worker]["rank"],
                    _transform,
                    pure=False,
                    workers=[worker])
                for worker, parts in data.worker_to_parts.items()
            }

            wait(list(pca_fit.values()))
            raise_exception_from_futures(list(pca_fit.values()))
        finally:
            comms.destroy()

        # The first model in the mapping supplies the fitted attributes.
        self.local_model = list(models.values())[0].result()

        self.components_ = self.local_model.components_
        self.explained_variance_ = self.local_model.explained_variance_
        self.explained_variance_ratio_ = \
            self.local_model.explained_variance_ratio_
        self.singular_values_ = self.local_model.singular_values_

        if _transform:
            out_futures = flatten_grouped_results(self.client,
                                                  data.gpu_futures, pca_fit)
            return to_output(out_futures, self.datatype)

        return self
Exemple #11
0
    def fit(self, X, _transform=False):
        """
        Fit the model with X.

        Sets ``self.rnks``, ``self.tsvd_models``, ``self.local_model`` and the
        fitted attributes ``components_``, ``explained_variance_``,
        ``explained_variance_ratio_`` and ``singular_values_``.

        Parameters
        ----------
        X : dask cuDF input
        _transform : bool, default False
            Forwarded to the per-worker fit. When True, one output future per
            input partition is collected and returned as a dask cuDF.

        Returns
        -------
        ``self``, or the result of ``to_dask_cudf`` when ``_transform`` is
        True
        """
        gpu_futures = self.client.sync(extract_ddf_partitions, X)

        # Assign ranks in order of first appearance and group the partitions
        # by the worker that holds them.
        self.rnks = dict()
        worker_to_parts = OrderedDict()
        for w, p in gpu_futures:
            if w not in self.rnks:
                self.rnks[w] = len(self.rnks)
            worker_to_parts.setdefault(w, []).append(p)

        # One entry per partition, so a worker may appear more than once.
        workers = [wf[0] for wf in gpu_futures]

        comms = CommsContext(comms_p2p=False)
        comms.init(workers=workers)

        # Everything between init and destroy runs under try/finally so the
        # comms session is torn down even when a task fails.
        try:
            worker_info = comms.worker_info(comms.worker_addresses)

            # (rank, size) for every partition; each size is fetched
            # synchronously before the next size task is submitted.
            key = uuid1()
            partsToRanks = [
                (worker_info[wf[0]]["r"],
                 self.client.submit(TruncatedSVD._func_get_size,
                                    wf[1],
                                    workers=[wf[0]],
                                    key="%s-%s" % (key, idx)).result())
                for idx, wf in enumerate(gpu_futures)]

            N = X.shape[1]
            M = reduce(lambda a, b: a + b, map(lambda x: x[1], partsToRanks))

            key = uuid1()
            self.tsvd_models = [
                (wf[0],
                 self.client.submit(TruncatedSVD._func_create_model,
                                    comms.sessionId,
                                    wf[1],
                                    **self.kwargs,
                                    workers=[wf[0]],
                                    key="%s-%s" % (key, idx)))
                for idx, wf in enumerate(worker_to_parts.items())
            ]

            key = uuid1()
            tsvd_fit = {
                worker_info[wf[0]]["r"]: self.client.submit(
                    TruncatedSVD._func_fit,
                    wf[1],
                    M,
                    N,
                    partsToRanks,
                    worker_info[wf[0]]["r"],
                    _transform,
                    key="%s-%s" % (key, idx),
                    workers=[wf[0]])
                for idx, wf in enumerate(self.tsvd_models)
            }

            wait(list(tsvd_fit.values()))
            raise_exception_from_futures(list(tsvd_fit.values()))
        finally:
            comms.destroy()

        self.local_model = self.client.submit(TruncatedSVD._func_get_first,
                                              self.tsvd_models[0][1]).result()

        self.components_ = self.local_model.components_
        self.explained_variance_ = self.local_model.explained_variance_
        self.explained_variance_ratio_ = \
            self.local_model.explained_variance_ratio_
        self.singular_values_ = self.local_model.singular_values_

        if _transform:
            # Pull each partition's output out of its rank's result, using a
            # running per-rank position.
            out_futures = []
            completed_part_map = {}
            for rank, size in partsToRanks:
                position = completed_part_map.get(rank, 0)
                out_futures.append(
                    self.client.submit(TruncatedSVD._func_get_idx,
                                       tsvd_fit[rank],
                                       position))
                completed_part_map[rank] = position + 1

            return to_dask_cudf(out_futures)

        # Previously fell through and returned None; the sibling fit methods
        # return the estimator.
        return self
Exemple #12
0
    def _fit(self, X, _transform=False):
        """
        Fit the model with X.

        Parameters
        ----------
        X : dask cuDF input
        _transform : bool, default False
            Forwarded to the per-worker fit. When True, the per-worker results
            are flattened and returned instead of ``self``.

        Returns
        -------
        ``self``, or the result of ``to_output`` when ``_transform`` is True
        """

        n_cols = X.shape[1]

        data = DistributedDataHandler.create(data=X, client=self.client)
        self.datatype = data.datatype

        # Point-to-point comms are requested only for the "tsqr" solver.
        use_p2p = "svd_solver" in self.kwargs \
            and self.kwargs["svd_solver"] == "tsqr"
        if use_p2p:
            comms = CommsContext(comms_p2p=True)
        else:
            comms = CommsContext(comms_p2p=False)

        comms.init(workers=data.workers)

        # Everything between init and destroy runs under try/finally so the
        # comms session is torn down even when a task fails.
        try:
            data.calculate_parts_to_sizes(comms)

            worker_info = comms.worker_info(comms.worker_addresses)
            parts_to_sizes, _ = parts_to_ranks(self.client, worker_info,
                                               data.gpu_futures)

            total_rows = data.total_rows

            models = {
                data.worker_info[worker]["rank"]: self.client.submit(
                    self._create_model,
                    comms.sessionId,
                    self._model_func,
                    self.datatype,
                    **self.kwargs,
                    pure=False,
                    workers=[worker])
                for worker in data.worker_to_parts
            }

            pca_fit = {
                worker: self.client.submit(
                    DecompositionSyncFitMixin._func_fit,
                    models[data.worker_info[worker]["rank"]],
                    parts,
                    total_rows,
                    n_cols,
                    parts_to_sizes,
                    data.worker_info[worker]["rank"],
                    _transform,
                    pure=False,
                    workers=[worker])
                for worker, parts in data.worker_to_parts.items()
            }

            wait(list(pca_fit.values()))
            raise_exception_from_futures(list(pca_fit.values()))
        finally:
            comms.destroy()

        # The first model future in the mapping becomes the internal model.
        self._set_internal_model(list(models.values())[0])

        if _transform:
            out_futures = flatten_grouped_results(self.client,
                                                  data.gpu_futures, pca_fit)
            return to_output(out_futures, self.datatype)

        return self
Exemple #13
0
def test_dask_exceptions(client):
    """Check that a failed future is re-raised as RuntimeError."""
    failing = client.submit(_raise_exception)
    wait(failing)

    with pytest.raises(RuntimeError):
        raise_exception_from_futures([failing])
    def _predict_using_cpu(self, X, convert_dtype=True):
        """
        Predict the labels for X by majority vote over every worker's trees.

        Parameters
        ----------
        X : Dask cuDF dataframe  or CuPy backed Dask Array (n_rows, n_features)
            Distributed dense matrix (floats or doubles) of shape
            (n_samples, n_features).
        convert_dtype : bool, optional (default = True)
            When set to True, the predict method will, when necessary, convert
            the input to the data type which was used to train the model. This
            will increase memory used for the method.

        Returns
        ----------
        y : list
            One label per row of X. A row that receives no votes gets -1.
        """
        client = default_client()
        worker_list = self.workers

        scattered_input = client.scatter(X)
        tasks = []
        for worker in worker_list:
            # A fresh random number is passed with every submission.
            # NOTE(review): presumably this keeps Dask from treating repeated
            # calls as the same pure task -- confirm.
            task = client.submit(
                RandomForestClassifier._predict_cpu,
                self.rfs[worker],
                scattered_input,
                convert_dtype,
                random.random(),
                workers=[worker],
            )
            tasks.append(task)

        wait(tasks)
        raise_exception_from_futures(tasks)

        per_worker = [task.result() for task in tasks]
        # Read position inside each worker's flat output; it advances by that
        # worker's estimator count after every row.
        offsets = [0] * len(per_worker)

        labels = []
        for _row in range(len(X)):
            votes = {}
            best_class = -1
            best_count = 0

            for d, worker_output in enumerate(per_worker):
                n_trees = self.n_estimators_per_worker[d]
                start = offsets[d]
                for j in range(n_trees):
                    cls = worker_output[start + j]
                    count = votes.get(cls, 0) + 1
                    votes[cls] = count

                    # Strict comparison: on a tie the class that reached the
                    # count first stays the winner.
                    if count > best_count:
                        best_count = count
                        best_class = cls

                offsets[d] = start + n_trees

            labels.append(best_class)
        return labels
    def fit(self, X, y, convert_dtype=False):
        """
        Fit the input data with a Random Forest classifier

        IMPORTANT: X is expected to be partitioned with at least one partition
        on each Dask worker being used by the forest (self.workers).

        If a worker has multiple data partitions, they will be concatenated
        before fitting, which will lead to additional memory usage. To minimize
        memory consumption, ensure that each worker has exactly one partition.

        When persisting data, you can use
        cuml.dask.common.utils.persist_across_workers to simplify this::

            X_dask_cudf = dask_cudf.from_cudf(X_cudf, npartitions=n_workers)
            y_dask_cudf = dask_cudf.from_cudf(y_cudf, npartitions=n_workers)
            X_dask_cudf, y_dask_cudf = persist_across_workers(dask_client,
                                                              [X_dask_cudf,
                                                               y_dask_cudf])

        (this is equivalent to calling `persist` with the data and workers)::

            X_dask_cudf, y_dask_cudf = dask_client.persist(
                [X_dask_cudf, y_dask_cudf],
                workers={X_dask_cudf: workers, y_dask_cudf: workers})

        Parameters
        ----------
        X : dask_cudf.Dataframe
            Dense matrix (floats or doubles) of shape (n_samples, n_features).
            Features of training examples.
        y : dask_cudf.Dataframe
            Dense  matrix (floats or doubles) of shape (n_samples, 1)
            Labels of training examples.
            **y must be partitioned the same way as X**
        convert_dtype : bool, optional (default = False)
            When set to True, the fit method will, when necessary, convert
            y to be the same data type as X if they differ. This
            will increase memory used for the method.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If the workers holding X or y differ from ``self.workers``.
        """

        client = default_client()

        self.num_classes = len(y.unique())
        X_futures = workers_to_parts(client.sync(extract_ddf_partitions, X))
        y_futures = workers_to_parts(client.sync(extract_ddf_partitions, y))

        X_partition_workers = list(X_futures.keys())
        y_partition_workers = list(y_futures.keys())

        x_matches = set(X_partition_workers) == set(self.workers)
        if not x_matches or \
           set(y_partition_workers) != set(self.workers):
            raise ValueError("""
              X is not partitioned on the same workers expected by RF\n
              X workers: %s\n
              y workers: %s\n
              RF workers: %s
            """ % (str(X_partition_workers),
                   str(y_partition_workers),
                   str(self.workers)))

        tasks = []
        for worker, x_parts in X_futures.items():
            # A fresh random number is passed with every submission.
            # NOTE(review): presumably this keeps Dask from treating repeated
            # calls as the same pure task -- confirm.
            task = client.submit(
                RandomForestClassifier._fit,
                self.rfs[worker],
                x_parts,
                y_futures[worker],
                convert_dtype,
                random.random(),
                workers=[worker],
            )
            tasks.append(task)

        wait(tasks)
        raise_exception_from_futures(tasks)
        return self
    def __init__(
        self,
        n_estimators=10,
        max_depth=-1,
        max_features="auto",
        n_bins=8,
        split_algo=1,
        split_criterion=0,
        min_rows_per_node=2,
        bootstrap=True,
        bootstrap_features=False,
        type_model="classifier",
        verbose=False,
        rows_sample=1.0,
        max_leaves=-1,
        n_streams=4,
        quantile_per_tree=False,
        dtype=None,
        criterion=None,
        min_samples_leaf=None,
        min_weight_fraction_leaf=None,
        max_leaf_nodes=None,
        min_impurity_decrease=None,
        min_impurity_split=None,
        oob_score=None,
        n_jobs=None,
        random_state=None,
        warm_start=None,
        class_weight=None,
        workers=None,
        client=None
    ):
        """
        Split ``n_estimators`` across the Dask workers and build one forest
        on each of them.

        Parameters
        ----------
        n_estimators : int
            Total number of trees; must be at least the number of workers.
        max_depth, n_streams, max_features, n_bins, split_algo,
        split_criterion, min_rows_per_node, bootstrap, bootstrap_features,
        type_model, verbose, rows_sample, max_leaves, quantile_per_tree, dtype
            Forwarded unchanged to ``RandomForestClassifier._func_build_rf``
            on every worker.
        criterion, min_samples_leaf, min_weight_fraction_leaf,
        max_leaf_nodes, min_impurity_decrease, min_impurity_split, oob_score,
        n_jobs, random_state, warm_start, class_weight
            Scikit-learn parameters that are not supported; each must be left
            as None.
        workers : collection of worker addresses, optional
            Defaults to every worker reported by ``client.has_what()``.
        client : Dask client, optional
            Defaults to ``default_client()``.

        Raises
        ------
        TypeError
            If any unsupported Scikit-learn parameter is not None.
        ValueError
            If ``n_estimators`` is lower than the number of workers.
        """

        unsupported_sklearn_params = {
            "criterion": criterion,
            "min_samples_leaf": min_samples_leaf,
            "min_weight_fraction_leaf": min_weight_fraction_leaf,
            "max_leaf_nodes": max_leaf_nodes,
            "min_impurity_decrease": min_impurity_decrease,
            "min_impurity_split": min_impurity_split,
            "oob_score": oob_score,
            "n_jobs": n_jobs,
            "random_state": random_state,
            "warm_start": warm_start,
            "class_weight": class_weight,
        }

        for key, vals in unsupported_sklearn_params.items():
            if vals is not None:
                # Build one message string; passing the pieces as separate
                # arguments made the exception text render as a tuple.
                raise TypeError(
                    "The Scikit-learn variable %s is not supported in cuML,"
                    " please read the cuML documentation for"
                    " more information" % key
                )

        self.n_estimators = n_estimators
        self.num_classes = 2

        self.client = default_client() if client is None else client
        if workers is None:
            workers = self.client.has_what().keys()  # Default to all workers
        self.workers = workers

        n_workers = len(workers)
        if n_estimators < n_workers:
            raise ValueError(
                "n_estimators cannot be lower than number of dask workers."
            )

        # Every worker gets the same base share; the first `remaining_est`
        # workers take one extra tree so the counts add up to n_estimators.
        n_est_per_worker, remaining_est = divmod(n_estimators, n_workers)
        self.n_estimators_per_worker = [
            n_est_per_worker + 1 if i < remaining_est else n_est_per_worker
            for i in range(n_workers)
        ]

        # Each worker's seed is the number of trees assigned to the workers
        # before it.
        seeds = [0]
        for n_trees in self.n_estimators_per_worker[:-1]:
            seeds.append(seeds[-1] + n_trees)

        key = str(uuid1())
        self.rfs = {
            worker: self.client.submit(
                RandomForestClassifier._func_build_rf,
                self.n_estimators_per_worker[n],
                max_depth,
                n_streams,
                max_features,
                n_bins,
                split_algo,
                split_criterion,
                min_rows_per_node,
                bootstrap,
                bootstrap_features,
                type_model,
                verbose,
                rows_sample,
                max_leaves,
                quantile_per_tree,
                seeds[n],
                dtype,
                key="%s-%s" % (key, n),
                workers=[worker],
            )
            for n, worker in enumerate(workers)
        }

        rfs_wait = list(self.rfs.values())

        wait(rfs_wait)
        raise_exception_from_futures(rfs_wait)
Exemple #17
0
    def predict(self, X):
        """
        Predict the labels for X by majority vote over every worker's trees.

        Parameters
        ----------
        X : np.array
            Dense matrix (floats or doubles) of shape (n_samples, n_features).
            Features of examples to predict.

        Returns
        ----------
        y : list
            One label per row of X. A row that receives no votes gets -1.
        """
        client = default_client()
        worker_list = self.workers

        scattered_input = client.scatter(X)
        tasks = []
        for worker in worker_list:
            # A fresh random number is passed with every submission.
            # NOTE(review): presumably this keeps Dask from treating repeated
            # calls as the same pure task -- confirm.
            task = client.submit(
                RandomForestClassifier._predict,
                self.rfs[worker],
                scattered_input,
                random.random(),
                workers=[worker],
            )
            tasks.append(task)

        wait(tasks)
        raise_exception_from_futures(tasks)

        per_worker = [task.result() for task in tasks]
        # Read position inside each worker's flat output; it advances by that
        # worker's estimator count after every row.
        offsets = [0] * len(per_worker)

        labels = []
        for _row in range(len(X)):
            votes = {}
            best_class = -1
            best_count = 0

            for d, worker_output in enumerate(per_worker):
                n_trees = self.n_estimators_per_worker[d]
                start = offsets[d]
                for j in range(n_trees):
                    cls = worker_output[start + j]
                    count = votes.get(cls, 0) + 1
                    votes[cls] = count

                    # Strict comparison: on a tie the class that reached the
                    # count first stays the winner.
                    if count > best_count:
                        best_count = count
                        best_class = cls

                offsets[d] = start + n_trees

            labels.append(best_class)
        return labels
Exemple #18
0
    def fit(self, X, y):
        """
        Fit the input data with a Random Forest regression model

        IMPORTANT: both X and y must have partitions on exactly the set of
        Dask workers used by the forest (self.workers); otherwise a
        ValueError is raised.

        When persisting data, you can use
        cuml.dask.common.utils.persist_across_workers to simplify this::

            X_dask_cudf = dask_cudf.from_cudf(X_cudf, npartitions=n_workers)
            y_dask_cudf = dask_cudf.from_cudf(y_cudf, npartitions=n_workers)
            X_dask_cudf, y_dask_cudf = persist_across_workers(dask_client,
                                                              [X_dask_cudf,
                                                               y_dask_cudf])

        (this is described as equivalent to calling `persist` on the Dask
        client with the data and the target workers).

        Parameters
        ----------
        X : dask_cudf.Dataframe
            Dense matrix (floats or doubles) of shape (n_samples, n_features).
            Features of training examples.

        y : dask_cudf.Dataframe
            Dense matrix (floats or doubles) of shape (n_samples, 1)
            Labels of training examples.
            y must be partitioned the same way as X

        Returns
        -------
        self
            The estimator itself, once every per-worker fit has finished.

        Raises
        ------
        ValueError
            If the workers holding X or y differ from self.workers.
        """
        client = default_client()

        X_by_worker = workers_to_parts(client.sync(extract_ddf_partitions, X))
        y_by_worker = workers_to_parts(client.sync(extract_ddf_partitions, y))

        X_partition_workers = [worker for worker, _ in X_by_worker.items()]
        y_partition_workers = [worker for worker, _ in y_by_worker.items()]

        # Both inputs have to live on exactly the workers owning a forest.
        expected_workers = set(self.workers)
        placement_ok = (set(X_partition_workers) == expected_workers
                        and set(y_partition_workers) == expected_workers)
        if not placement_ok:
            raise ValueError("""
              X is not partitioned on the same workers expected by RF\n
              X workers: %s\n
              y workers: %s\n
              RF workers: %s
            """ % (str(X_partition_workers), str(y_partition_workers),
                   str(self.workers)))

        # One fit task per worker, pinned to that worker and fed the X and y
        # parts that already reside there.
        fit_futures = [
            client.submit(
                RandomForestRegressor._fit,
                self.rfs[worker],
                X_parts,
                y_by_worker[worker],
                random.random(),
                workers=[worker],
            )
            for worker, X_parts in X_by_worker.items()
        ]

        wait(fit_futures)
        raise_exception_from_futures(fit_futures)

        return self
Exemple #19
0
    def _fit(self, X, y):
        """
        Fit the model on the distributed partitions of X and y.

        The partitions of X and y are grouped per worker, a CommsContext is
        initialised over the workers holding X, one model is created on each
        of those workers via Ridge._func_create_model, and Ridge._func_fit is
        run on every worker. The communicator is always destroyed before
        returning, including when one of the steps raises.

        Parameters
        ----------
        X : distributed collection accepted by extract_ddf_partitions
            Training features; X.shape[1] is used as the number of columns.
        y : distributed collection accepted by extract_ddf_partitions
            Training targets; must have partitions on the same workers as X.

        Raises
        ------
        ValueError
            If the set of workers holding X differs from the one holding y.

        Notes
        -----
        On success this sets self.rnks, self.linear_models, self.local_model,
        self.coef_ and self.intercept_.
        """
        X_futures = self.client.sync(extract_ddf_partitions, X)
        y_futures = self.client.sync(extract_ddf_partitions, y)

        X_partition_workers = [w for w, _ in X_futures]
        y_partition_workers = [w for w, _ in y_futures]

        if set(X_partition_workers) != set(y_partition_workers):
            raise ValueError(
                "X and y are not partitioned on the same workers, "
                "as expected by the model")

        # Group the X parts per worker (first-seen order) and number the
        # workers in that same order.
        self.rnks = dict()
        worker_to_parts = OrderedDict()
        for w, p in X_futures:
            worker_to_parts.setdefault(w, []).append(p)
            if w not in self.rnks:
                self.rnks[w] = len(self.rnks)

        worker_to_parts_y = OrderedDict()
        for w, p in y_futures:
            worker_to_parts_y.setdefault(w, []).append(p)

        # One entry per X partition, so a worker holding several partitions
        # appears several times (same list the original passed to init).
        workers = list(X_partition_workers)

        comms = CommsContext(comms_p2p=False)
        comms.init(workers=workers)

        try:
            worker_info = comms.worker_info(comms.worker_addresses)

            # (worker_info "r" entry, partition size) for every X partition.
            # NOTE: .result() blocks inside the comprehension, so the size
            # tasks are resolved one after another.
            key = uuid1()
            partsToSizes = [(worker_info[w]["r"], self.client.submit(
                Ridge._func_get_size,
                part,
                workers=[w],
                key="%s-%s" % (key, idx)).result())
                for idx, (w, part) in enumerate(X_futures)]

            n_cols = X.shape[1]
            n_rows = sum(size for _, size in partsToSizes)

            # One model future per distinct worker.
            key = uuid1()
            self.linear_models = [(w, self.client.submit(
                Ridge._func_create_model,
                comms.sessionId,
                **self.kwargs,
                workers=[w],
                key="%s-%s" % (key, idx)))
                for idx, w in enumerate(worker_to_parts)]

            key = uuid1()
            linear_fit = {worker_info[w]["r"]: self.client.submit(
                Ridge._func_fit,
                model,
                worker_to_parts[w],
                worker_to_parts_y[w],
                n_rows, n_cols,
                partsToSizes,
                worker_info[w]["r"],
                key="%s-%s" % (key, idx),
                workers=[w])
                for idx, (w, model) in enumerate(self.linear_models)}

            fit_futures = list(linear_fit.values())
            wait(fit_futures)
            raise_exception_from_futures(fit_futures)
        finally:
            # Release the communicator even when a step above failed.
            comms.destroy()

        # The first worker's model is pulled back and exposed locally.
        self.local_model = self.linear_models[0][1].result()
        self.coef_ = self.local_model.coef_
        self.intercept_ = self.local_model.intercept_