Beispiel #1
0
    def _fit_with_colocality(self, X, y):
        input_futures = self.client.sync(extract_colocated_ddf_partitions,
                                         X, y, self.client)
        workers = list(input_futures.keys())

        comms = CommsContext(comms_p2p=False)
        comms.init(workers=workers)

        worker_info = comms.worker_info(comms.worker_addresses)

        n_cols = X.shape[1]
        n_rows = 0

        self.rnks = dict()
        partsToSizes = dict()
        key = uuid1()
        for w, futures in input_futures.items():
            self.rnks[w] = worker_info[w]["r"]
            parts = [(self.client.submit(
                Ridge._func_get_size_cl,
                future,
                workers=[w],
                key="%s-%s" % (key, idx)).result())
                for idx, future in enumerate(futures)]

            partsToSizes[worker_info[w]["r"]] = parts
            for p in parts:
                n_rows = n_rows + p

        key = uuid1()
        self.linear_models = [(w, self.client.submit(
            Ridge._func_create_model,
            comms.sessionId,
            **self.kwargs,
            workers=[w],
            key="%s-%s" % (key, idx)))
            for idx, w in enumerate(workers)]

        key = uuid1()
        linear_fit = dict([(worker_info[wf[0]]["r"], self.client.submit(
            Ridge._func_fit_colocated,
            wf[1],
            input_futures[wf[0]],
            n_rows, n_cols,
            partsToSizes,
            worker_info[wf[0]]["r"],
            key="%s-%s" % (key, idx),
            workers=[wf[0]]))
            for idx, wf in enumerate(self.linear_models)])

        wait(list(linear_fit.values()))
        raise_exception_from_futures(list(linear_fit.values()))

        comms.destroy()

        self.local_model = self.linear_models[0][1].result()
        self.coef_ = self.local_model.coef_
        self.intercept_ = self.local_model.intercept_
Beispiel #2
0
    def fit(self, X, _transform=False):
        """
        Fit the model with X.

        Parameters
        ----------
        X : dask cuDF input

        """
        gpu_futures = self.client.sync(extract_ddf_partitions, X)

        self.rnks = dict()
        rnk_counter = 0
        worker_to_parts = OrderedDict()
        for w, p in gpu_futures:
            if w not in worker_to_parts:
                worker_to_parts[w] = []
            if w not in self.rnks.keys():
                self.rnks[w] = rnk_counter
                rnk_counter = rnk_counter + 1
            worker_to_parts[w].append(p)

        workers = list(map(lambda x: x[0], gpu_futures))

        comms = CommsContext(comms_p2p=False)
        comms.init(workers=workers)

        worker_info = comms.worker_info(comms.worker_addresses)

        key = uuid1()
        partsToRanks = [(worker_info[wf[0]]["r"],
                         self.client.submit(TruncatedSVD._func_get_size,
                                            wf[1],
                                            workers=[wf[0]],
                                            key="%s-%s" % (key, idx)).result())
                        for idx, wf in enumerate(gpu_futures)]

        N = X.shape[1]
        M = reduce(lambda a, b: a + b, map(lambda x: x[1], partsToRanks))

        key = uuid1()
        self.tsvd_models = [
            (wf[0],
             self.client.submit(TruncatedSVD._func_create_model,
                                comms.sessionId,
                                wf[1],
                                **self.kwargs,
                                workers=[wf[0]],
                                key="%s-%s" % (key, idx)))
            for idx, wf in enumerate(worker_to_parts.items())
        ]

        key = uuid1()
        tsvd_fit = dict([(worker_info[wf[0]]["r"],
                          self.client.submit(TruncatedSVD._func_fit,
                                             wf[1],
                                             M,
                                             N,
                                             partsToRanks,
                                             worker_info[wf[0]]["r"],
                                             _transform,
                                             key="%s-%s" % (key, idx),
                                             workers=[wf[0]]))
                         for idx, wf in enumerate(self.tsvd_models)])

        wait(list(tsvd_fit.values()))
        raise_exception_from_futures(list(tsvd_fit.values()))

        comms.destroy()

        self.local_model = self.client.submit(TruncatedSVD._func_get_first,
                                              self.tsvd_models[0][1]).result()

        self.components_ = self.local_model.components_
        self.explained_variance_ = self.local_model.explained_variance_
        self.explained_variance_ratio_ = \
            self.local_model.explained_variance_ratio_
        self.singular_values_ = self.local_model.singular_values_

        out_futures = []
        if _transform:
            completed_part_map = {}
            for rank, size in partsToRanks:
                if rank not in completed_part_map:
                    completed_part_map[rank] = 0

                f = tsvd_fit[rank]
                out_futures.append(
                    self.client.submit(TruncatedSVD._func_get_idx, f,
                                       completed_part_map[rank]))

                completed_part_map[rank] += 1

            return to_dask_cudf(out_futures)
Beispiel #3
0
    def _fit(self, X, _transform=False):
        """
        Fit the model with X.

        Parameters
        ----------
        X : dask cuDF input

        """

        n_cols = X.shape[1]

        data = DistributedDataHandler.create(data=X, client=self.client)
        self.datatype = data.datatype

        if "svd_solver" in self.kwargs \
                and self.kwargs["svd_solver"] == "tsqr":
            comms = CommsContext(comms_p2p=True)
        else:
            comms = CommsContext(comms_p2p=False)

        comms.init(workers=data.workers)

        data.calculate_parts_to_sizes(comms)

        worker_info = comms.worker_info(comms.worker_addresses)
        parts_to_sizes, _ = parts_to_ranks(self.client, worker_info,
                                           data.gpu_futures)

        total_rows = data.total_rows

        models = dict([(data.worker_info[wf[0]]["rank"],
                        self.client.submit(self._create_model,
                                           comms.sessionId,
                                           self._model_func,
                                           self.datatype,
                                           **self.kwargs,
                                           pure=False,
                                           workers=[wf[0]]))
                       for idx, wf in enumerate(data.worker_to_parts.items())])

        pca_fit = dict([
            (wf[0],
             self.client.submit(DecompositionSyncFitMixin._func_fit,
                                models[data.worker_info[wf[0]]["rank"]],
                                wf[1],
                                total_rows,
                                n_cols,
                                parts_to_sizes,
                                data.worker_info[wf[0]]["rank"],
                                _transform,
                                pure=False,
                                workers=[wf[0]]))
            for idx, wf in enumerate(data.worker_to_parts.items())
        ])

        wait(list(pca_fit.values()))
        raise_exception_from_futures(list(pca_fit.values()))

        comms.destroy()

        self._set_internal_model(list(models.values())[0])

        if _transform:
            out_futures = flatten_grouped_results(self.client,
                                                  data.gpu_futures, pca_fit)
            return to_output(out_futures, self.datatype)

        return self
Beispiel #4
0
    def _fit(self, X, y):
        X_futures = self.client.sync(extract_ddf_partitions, X)
        y_futures = self.client.sync(extract_ddf_partitions, y)

        X_partition_workers = [w for w, xc in X_futures]
        y_partition_workers = [w for w, xc in y_futures]

        if set(X_partition_workers) != set(y_partition_workers):
            raise ValueError("""
              X  and y are not partitioned on the same workers expected \n_cols
              Linear Regression""")

        self.rnks = dict()
        rnk_counter = 0
        worker_to_parts = OrderedDict()
        for w, p in X_futures:
            if w not in worker_to_parts:
                worker_to_parts[w] = []
            if w not in self.rnks.keys():
                self.rnks[w] = rnk_counter
                rnk_counter = rnk_counter + 1
            worker_to_parts[w].append(p)

        worker_to_parts_y = OrderedDict()
        for w, p in y_futures:
            if w not in worker_to_parts_y:
                worker_to_parts_y[w] = []
            worker_to_parts_y[w].append(p)

        workers = list(map(lambda x: x[0], X_futures))

        comms = CommsContext(comms_p2p=False)
        comms.init(workers=workers)

        worker_info = comms.worker_info(comms.worker_addresses)

        key = uuid1()
        partsToSizes = [(worker_info[wf[0]]["r"], self.client.submit(
            Ridge._func_get_size,
            wf[1],
            workers=[wf[0]],
            key="%s-%s" % (key, idx)).result())
            for idx, wf in enumerate(X_futures)]

        n_cols = X.shape[1]
        n_rows = reduce(lambda a, b: a+b, map(lambda x: x[1], partsToSizes))

        key = uuid1()
        self.linear_models = [(wf[0], self.client.submit(
            Ridge._func_create_model,
            comms.sessionId,
            **self.kwargs,
            workers=[wf[0]],
            key="%s-%s" % (key, idx)))
            for idx, wf in enumerate(worker_to_parts.items())]

        key = uuid1()
        linear_fit = dict([(worker_info[wf[0]]["r"], self.client.submit(
            Ridge._func_fit,
            wf[1],
            worker_to_parts[wf[0]],
            worker_to_parts_y[wf[0]],
            n_rows, n_cols,
            partsToSizes,
            worker_info[wf[0]]["r"],
            key="%s-%s" % (key, idx),
            workers=[wf[0]]))
            for idx, wf in enumerate(self.linear_models)])

        wait(list(linear_fit.values()))
        raise_exception_from_futures(list(linear_fit.values()))

        comms.destroy()

        self.local_model = self.linear_models[0][1].result()
        self.coef_ = self.local_model.coef_
        self.intercept_ = self.local_model.intercept_