Example #1
    def kneighbors(self, X=None, n_neighbors=None, return_distance=True,
                   _return_futures=False):
        """
        Query the distributed nearest neighbors index
        :param X : dask_cudf.Dataframe Vectors to query. If not
                   provided, neighbors of each indexed point are returned.
        :param n_neighbors : Number of neighbors to query for each row in
                             X. If not provided, the n_neighbors on the
                             model are used.
        :param return_distance : If false, only indices are returned
        :return : dask_cudf.DataFrame containing distances
        :return : dask_cudf.DataFrame containing indices
        """
        n_neighbors = self.get_neighbors(n_neighbors)

        if self.X is None:
            raise ValueError("Model needs to be trained using fit() "
                             "before calling kneighbors()")

        query_futures = self.X if X is None else \
            self.client.sync(extract_ddf_partitions, X)

        """
        Create communicator clique
        """
        comms = NearestNeighbors._build_comms(self.X, query_futures,
                                              self.streams_per_handle,
                                              self.verbose)

        """
        Initialize models on workers
        """
        nn_models = self._create_models(comms)

        """
        Perform model query
        """
        nn_fit, out_d_futures, out_i_futures = \
            self._query_models(n_neighbors, comms, nn_models,
                               self.X, query_futures)

        comms.destroy()

        if _return_futures:
            ret = (nn_fit, out_i_futures) if not return_distance else \
                (nn_fit, out_d_futures, out_i_futures)
        else:
            ret = to_dask_cudf(out_i_futures) \
                if not return_distance else (to_dask_cudf(out_d_futures),
                                             to_dask_cudf(out_i_futures))

        return ret
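
For context, a minimal usage sketch of the call path into this method. It assumes the distributed NearestNeighbors estimator this method belongs to (cuml.dask.neighbors.NearestNeighbors of the same era, including its client keyword), a single-node dask_cuda cluster, and illustrative data; treat it as a sketch, not the canonical API.

from dask_cuda import LocalCUDACluster
from dask.distributed import Client
import cudf
import dask_cudf
from cuml.dask.neighbors import NearestNeighbors

cluster = LocalCUDACluster()
client = Client(cluster)

df = cudf.DataFrame({"a": [0.0, 1.0, 2.0, 3.0],
                     "b": [0.0, 1.0, 2.0, 3.0]})
ddf = dask_cudf.from_cudf(df, npartitions=2)

nn = NearestNeighbors(n_neighbors=2, client=client)
nn.fit(ddf)

# return_distance=True (the default) yields a pair of dask_cudf DataFrames.
distances, indices = nn.kneighbors(ddf)
print(indices.compute())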
Example #2
    def _parallel_func(self, X, func):
        """
        Internal function that predicts the labels using a distributed
        KMeans model.

        Parameters
        ----------
        X : dask_cudf.Dataframe
            Dataframe to predict

        Returns
        -------
        result: dask_cudf.Dataframe
            Dataframe containing label predictions
        """

        key = uuid1()
        gpu_futures = self.client.sync(extract_ddf_partitions, X)
        worker_to_parts = workers_to_parts(gpu_futures)

        kmeans_predict = [
            self.client.submit(func,
                               self.local_model,
                               wf[1],
                               workers=[wf[0]],
                               key="%s-%s" % (key, idx))
            for idx, wf in enumerate(worker_to_parts.items())
        ]
        wait(kmeans_predict)
        self.raise_exception_from_futures(kmeans_predict)

        return to_dask_cudf(kmeans_predict)
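
The workers_to_parts helper groups the flat (worker, future) pairs returned by extract_ddf_partitions by worker, which the later examples do with explicit loops. A minimal sketch of the grouping it is assumed to perform (the real helper lives in cuml.dask.common; this standalone version is illustrative):

from collections import OrderedDict

def workers_to_parts_sketch(gpu_futures):
    # Group (worker_address, partition_future) pairs by worker,
    # preserving first-seen worker order and partition order.
    worker_to_parts = OrderedDict()
    for worker, part in gpu_futures:
        worker_to_parts.setdefault(worker, []).append(part)
    return worker_to_parts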
Example #3
    def predict(self, X):
        """
        Make predictions for X and returns a y_pred.

        Parameters
        ----------
        X : dask cuDF dataframe (n_rows, n_features)

        Returns
        -------
        y : dask cuDF (n_rows, 1)
        """
        gpu_futures = self.client.sync(extract_ddf_partitions, X)

        worker_to_parts = OrderedDict()
        for w, p in gpu_futures:
            worker_to_parts.setdefault(w, []).append(p)

        key = uuid1()
        # Note: .result() blocks, so partition sizes are gathered serially.
        partsToSizes = [(self.rnks[wf[0]],
                         self.client.submit(Ridge._func_get_size,
                                            wf[1],
                                            workers=[wf[0]],
                                            key="%s-%s" % (key, idx)).result())
                        for idx, wf in enumerate(gpu_futures)]

        n_cols = X.shape[1]
        n_rows = sum(size for _, size in partsToSizes)

        key = uuid1()
        linear_pred = {
            self.rnks[wf[0]]:
                self.client.submit(Ridge._func_predict,
                                   wf[1],
                                   worker_to_parts[wf[0]],
                                   n_rows, n_cols,
                                   partsToSizes,
                                   self.rnks[wf[0]],
                                   key="%s-%s" % (key, idx),
                                   workers=[wf[0]])
            for idx, wf in enumerate(self.linear_models)}

        wait(list(linear_pred.values()))
        raise_exception_from_futures(list(linear_pred.values()))

        out_futures = []
        completed_part_map = {}
        for rank, size in partsToSizes:
            if rank not in completed_part_map:
                completed_part_map[rank] = 0

            f = linear_pred[rank]
            out_futures.append(self.client.submit(
                Ridge._func_get_idx, f, completed_part_map[rank]))

            completed_part_map[rank] += 1

        return to_dask_cudf(out_futures)
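
The final loop rebuilds the original partition order: partsToSizes lists (rank, size) pairs in input order, and completed_part_map tracks how many partitions of each rank have already been emitted, so _func_get_idx can pull the i-th local result from that rank's future. A plain-Python sketch of the same bookkeeping (names are illustrative, not part of the module):

def reorder_sketch(parts_to_sizes, per_rank_results):
    # parts_to_sizes: [(rank, size), ...] in original partition order.
    # per_rank_results: rank -> list of per-partition result frames.
    out = []
    emitted = {}
    for rank, _size in parts_to_sizes:
        i = emitted.get(rank, 0)
        out.append(per_rank_results[rank][i])  # i-th partition on this rank
        emitted[rank] = i + 1
    return out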
Example #4
    def _inverse_transform(self, X):
        """
        Transform X back to its original space using the distributed
        TruncatedSVD models.
        """
        gpu_futures = self.client.sync(extract_ddf_partitions, X)

        worker_to_parts = OrderedDict()
        for w, p in gpu_futures:
            worker_to_parts.setdefault(w, []).append(p)

        key = uuid1()
        partsToRanks = [(self.rnks[wf[0]],
                         self.client.submit(TruncatedSVD._func_get_size,
                                            wf[1],
                                            workers=[wf[0]],
                                            key="%s-%s" % (key, idx)).result())
                        for idx, wf in enumerate(gpu_futures)]

        N = X.shape[1]
        M = sum(size for _, size in partsToRanks)

        key = uuid1()
        tsvd_inverse_transform = {
            self.rnks[wf[0]]:
                self.client.submit(TruncatedSVD._func_inverse_transform,
                                   wf[1],
                                   worker_to_parts[wf[0]],
                                   M,
                                   N,
                                   partsToRanks,
                                   self.rnks[wf[0]],
                                   key="%s-%s" % (key, idx),
                                   workers=[wf[0]])
            for idx, wf in enumerate(self.tsvd_models)}

        wait(list(tsvd_inverse_transform.values()))
        raise_exception_from_futures(list(tsvd_inverse_transform.values()))

        out_futures = []
        completed_part_map = {}
        for rank, size in partsToRanks:
            if rank not in completed_part_map:
                completed_part_map[rank] = 0

            f = tsvd_inverse_transform[rank]
            out_futures.append(
                self.client.submit(TruncatedSVD._func_get_idx, f,
                                   completed_part_map[rank]))

            completed_part_map[rank] += 1

        return to_dask_cudf(out_futures)
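
For reference, a round-trip usage sketch that reaches this method. It assumes the distributed TruncatedSVD wrapper this method belongs to (cuml.dask.decomposition.TruncatedSVD of the same era, with public fit_transform/inverse_transform entry points and a client keyword) and a dask_cuda cluster; the data is illustrative.

from dask_cuda import LocalCUDACluster
from dask.distributed import Client
import cudf
import dask_cudf
from cuml.dask.decomposition import TruncatedSVD

cluster = LocalCUDACluster()
client = Client(cluster)

df = cudf.DataFrame({"a": [float(i) for i in range(16)],
                     "b": [float(i * 2) for i in range(16)],
                     "c": [float(i % 4) for i in range(16)],
                     "d": [1.0] * 16})
ddf = dask_cudf.from_cudf(df, npartitions=2)

tsvd = TruncatedSVD(n_components=2, client=client)
reduced = tsvd.fit_transform(ddf)
restored = tsvd.inverse_transform(reduced)  # dispatches to _inverse_transform
print(restored.compute().head())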
Example #5
def post_etl_processing(client, train_data, test_data):
    import time

    import cudf
    from dask.distributed import wait

    from cuml.dask.naive_bayes import MultinomialNB as DistMNB
    from cuml.dask.common import to_dask_cudf
    from cuml.dask.common.input_utils import DistributedDataHandler

    # Feature engineering
    X_train = build_features(train_data)
    X_test = build_features(test_data)

    y_train = build_labels(train_data)
    y_test = build_labels(test_data)

    # Perform ML
    model = DistMNB(client=client, alpha=0.001)
    model.fit(X_train, y_train)

    ### this regression seems to be coming from here
    test_pred_st = time.time()
    y_hat = model.predict(X_test).persist()

    # Compute distributed performance metrics
    acc = accuracy_score(client, y_test, y_hat)

    print("Accuracy: " + str(acc))
    prec = precision_score(client, y_test, y_hat, average="macro")

    print("Precision: " + str(prec))
    cmat = confusion_matrix(client, y_test, y_hat)

    print("Confusion Matrix: " + str(cmat))
    metric_et = time.time()

    # Place results back in original Dataframe

    ddh = DistributedDataHandler.create(y_hat)
    test_preds = to_dask_cudf(
        [client.submit(cudf.Series, part) for w, part in ddh.gpu_futures])

    test_preds = test_preds.map_partitions(categoricalize)

    test_data["prediction"] = test_preds

    final_data = test_data[["pr_review_sk", "pr_review_rating",
                            "prediction"]].persist()

    final_data = final_data.sort_values("pr_review_sk").reset_index(drop=True)
    wait(final_data)
    return final_data, acc, prec, cmat
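
The repackaging step above (DistributedDataHandler -> cudf.Series -> to_dask_cudf) is the generic pattern for turning raw per-worker prediction futures back into a dask_cudf collection. Pulled out as a standalone helper, it would look roughly like this; the helper name is illustrative and not part of the original module (build_features, build_labels, categoricalize, and the distributed metric functions are assumed to be defined elsewhere in the benchmark code):

import cudf
from cuml.dask.common import to_dask_cudf
from cuml.dask.common.input_utils import DistributedDataHandler

def predictions_to_series(client, y_hat):
    # Rewrap each per-worker prediction partition as a cudf.Series so it
    # can later be attached to a dask_cudf DataFrame as a column.
    ddh = DistributedDataHandler.create(y_hat)
    futures = [client.submit(cudf.Series, part)
               for _worker, part in ddh.gpu_futures]
    return to_dask_cudf(futures)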
Example #6
    def parallel_func(self, X, func):
        """
        Predicts the labels using a distributed KMeans model
        :param X: dask_cudf.Dataframe to predict
        :return: A dask_cudf.Dataframe containing label predictions
        """
        gpu_futures = self.client.sync(extract_ddf_partitions, X)
        kmeans_predict = [
            self.client.submit(func,
                               self.local_model,
                               f,
                               random.random(),  # salt to keep otherwise-identical task keys distinct
                               workers=[w]) for w, f in gpu_futures.items()
        ]

        return to_dask_cudf(kmeans_predict)
Example #7
    def parallel_func(self, X, func):
        """
        Predicts the labels using a distributed KMeans model
        :param X: dask_cudf.Dataframe to predict
        :return: A dask_cudf.Dataframe containing label predictions
        """

        key = uuid1()
        gpu_futures = self.client.sync(extract_ddf_partitions, X)
        kmeans_predict = [
            self.client.submit(func,
                               self.local_model,
                               wf[1],
                               workers=[wf[0]],
                               key="%s-%s" % (key, idx))
            for idx, wf in enumerate(gpu_futures.items())
        ]
        wait(kmeans_predict)
        self.raise_exception_from_futures(kmeans_predict)

        return to_dask_cudf(kmeans_predict)
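
The key="%s-%s" % (key, idx) argument matters because Dask identifies tasks by key: submitting with a key that already exists reuses the existing task instead of running a new one. Prefixing a fresh uuid1() per call and suffixing the partition index guarantees distinct keys (the random.random() argument in the previous version served the same purpose less directly). A minimal CPU-only demonstration of the pattern:

from uuid import uuid1
from dask.distributed import Client

client = Client(processes=False)  # in-process scheduler; no GPUs required

def double(x):
    return x * 2

key = uuid1()
futures = [client.submit(double, v, key="%s-%s" % (key, idx))
           for idx, v in enumerate([10, 20, 30])]
print(client.gather(futures))  # [20, 40, 60], one distinct task per index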
Example #8
    def fit(self, X, _transform=False):
        """
        Fit the model with X.

        Parameters
        ----------
        X : dask cuDF input

        """
        gpu_futures = self.client.sync(extract_ddf_partitions, X)

        self.rnks = {}
        rnk_counter = 0
        worker_to_parts = OrderedDict()
        for w, p in gpu_futures:
            if w not in self.rnks:
                self.rnks[w] = rnk_counter
                rnk_counter += 1
            worker_to_parts.setdefault(w, []).append(p)

        workers = [w for w, _ in gpu_futures]

        comms = CommsContext(comms_p2p=False)
        comms.init(workers=workers)

        worker_info = comms.worker_info(comms.worker_addresses)

        key = uuid1()
        partsToRanks = [(worker_info[wf[0]]["r"],
                         self.client.submit(TruncatedSVD._func_get_size,
                                            wf[1],
                                            workers=[wf[0]],
                                            key="%s-%s" % (key, idx)).result())
                        for idx, wf in enumerate(gpu_futures)]

        N = X.shape[1]
        M = sum(size for _, size in partsToRanks)

        key = uuid1()
        self.tsvd_models = [
            (wf[0],
             self.client.submit(TruncatedSVD._func_create_model,
                                comms.sessionId,
                                wf[1],
                                **self.kwargs,
                                workers=[wf[0]],
                                key="%s-%s" % (key, idx)))
            for idx, wf in enumerate(worker_to_parts.items())
        ]

        key = uuid1()
        tsvd_fit = {
            worker_info[wf[0]]["r"]:
                self.client.submit(TruncatedSVD._func_fit,
                                   wf[1],
                                   M,
                                   N,
                                   partsToRanks,
                                   worker_info[wf[0]]["r"],
                                   _transform,
                                   key="%s-%s" % (key, idx),
                                   workers=[wf[0]])
            for idx, wf in enumerate(self.tsvd_models)}

        wait(list(tsvd_fit.values()))
        raise_exception_from_futures(list(tsvd_fit.values()))

        comms.destroy()

        self.local_model = self.client.submit(TruncatedSVD._func_get_first,
                                              self.tsvd_models[0][1]).result()

        self.components_ = self.local_model.components_
        self.explained_variance_ = self.local_model.explained_variance_
        self.explained_variance_ratio_ = \
            self.local_model.explained_variance_ratio_
        self.singular_values_ = self.local_model.singular_values_

        if _transform:
            out_futures = []
            completed_part_map = {}
            for rank, size in partsToRanks:
                if rank not in completed_part_map:
                    completed_part_map[rank] = 0

                f = tsvd_fit[rank]
                out_futures.append(
                    self.client.submit(TruncatedSVD._func_get_idx, f,
                                       completed_part_map[rank]))

                completed_part_map[rank] += 1

            return to_dask_cudf(out_futures)

        return self
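
One thing worth noting in this method is the comms lifecycle: a collective-only CommsContext is built over exactly the workers that hold data, per-worker ranks are read from worker_info, the fit tasks run, and the clique is destroyed. A sketch of that pattern with the teardown made exception-safe (names as above; the try/finally is an addition, not in the original):

def with_comms(workers, run_tasks):
    # Build a collective-only (no point-to-point) communicator clique
    # over the workers that hold data partitions.
    comms = CommsContext(comms_p2p=False)
    comms.init(workers=workers)
    try:
        worker_info = comms.worker_info(comms.worker_addresses)
        return run_tasks(comms.sessionId, worker_info)
    finally:
        comms.destroy()  # tear the clique down even if a task fails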