def _predict_using_cpu(self, X, convert_dtype):
    workers = self.workers

    X_Scattered = self.client.scatter(X)
    futures = list()
    for n, w in enumerate(workers):
        futures.append(
            self.client.submit(
                RandomForestRegressor._predict_cpu,
                self.rfs[w],
                X_Scattered,
                convert_dtype,
                workers=[w],
            ))

    wait(futures)
    raise_exception_from_futures(futures)

    indexes = list()
    rslts = list()
    for d in range(len(futures)):
        rslts.append(futures[d].result())
        indexes.append(0)

    pred = list()
    for i in range(len(X)):
        pred_per_worker = 0.0
        for d in range(len(rslts)):
            pred_per_worker = pred_per_worker + rslts[d][i]
        pred.append(pred_per_worker / len(rslts))

    return pred
def _query_models(self, n_neighbors, comms, nn_models,
                  index_futures, query_futures):

    worker_info = comms.worker_info(comms.worker_addresses)

    index_worker_to_parts = workers_to_parts(index_futures)
    query_worker_to_parts = workers_to_parts(query_futures)

    """
    Build inputs and outputs
    """
    idx_parts_to_ranks, idx_M = parts_to_ranks(self.client,
                                               worker_info,
                                               index_futures)

    query_parts_to_ranks, query_M = parts_to_ranks(self.client,
                                                   worker_info,
                                                   query_futures)

    """
    Invoke kneighbors on Dask workers to perform distributed query
    """
    key = uuid1()
    nn_fit = dict([(worker_info[worker]["r"], self.client.submit(
                    NearestNeighbors._func_kneighbors,
                    nn_models[worker],
                    index_worker_to_parts[worker]
                    if worker in index_worker_to_parts else [],
                    idx_M,
                    self.n_cols,
                    idx_parts_to_ranks,
                    query_worker_to_parts[worker]
                    if worker in query_worker_to_parts else [],
                    query_M,
                    query_parts_to_ranks,
                    worker_info[worker]["r"],
                    n_neighbors,
                    key="%s-%s" % (key, idx),
                    workers=[worker]))
                   for idx, worker in enumerate(comms.worker_addresses)])

    wait(list(nn_fit.values()))
    raise_exception_from_futures(list(nn_fit.values()))

    """
    Gather resulting partitions and return dask_cudfs
    """
    out_d_futures = flatten_grouped_results(self.client,
                                            query_parts_to_ranks,
                                            nn_fit,
                                            getter_func=_func_get_d)

    out_i_futures = flatten_grouped_results(self.client,
                                            query_parts_to_ranks,
                                            nn_fit,
                                            getter_func=_func_get_i)

    return nn_fit, out_d_futures, out_i_futures
def predict(self, X):
    """
    Make predictions for X and return y_pred.

    Parameters
    ----------
    X : dask cuDF dataframe (n_rows, n_features)

    Returns
    -------
    y : dask cuDF (n_rows, 1)
    """
    gpu_futures = self.client.sync(extract_ddf_partitions, X)

    worker_to_parts = OrderedDict()
    for w, p in gpu_futures:
        if w not in worker_to_parts:
            worker_to_parts[w] = []
        worker_to_parts[w].append(p)

    key = uuid1()
    partsToSizes = [(self.rnks[wf[0]], self.client.submit(
        Ridge._func_get_size,
        wf[1],
        workers=[wf[0]],
        key="%s-%s" % (key, idx)).result())
        for idx, wf in enumerate(gpu_futures)]

    n_cols = X.shape[1]
    n_rows = reduce(lambda a, b: a + b, map(lambda x: x[1], partsToSizes))

    key = uuid1()
    linear_pred = dict([(self.rnks[wf[0]], self.client.submit(
        Ridge._func_predict,
        wf[1],
        worker_to_parts[wf[0]],
        n_rows, n_cols,
        partsToSizes,
        self.rnks[wf[0]],
        key="%s-%s" % (key, idx),
        workers=[wf[0]]))
        for idx, wf in enumerate(self.linear_models)])

    wait(list(linear_pred.values()))
    raise_exception_from_futures(list(linear_pred.values()))

    out_futures = []
    completed_part_map = {}
    for rank, size in partsToSizes:
        if rank not in completed_part_map:
            completed_part_map[rank] = 0

        f = linear_pred[rank]
        out_futures.append(self.client.submit(
            Ridge._func_get_idx, f, completed_part_map[rank]))

        completed_part_map[rank] += 1

    return to_dask_cudf(out_futures)
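# A tiny worked example of the partsToSizes / completed_part_map pattern used
# in predict() above (the same pattern appears as partsToRanks in the
# TruncatedSVD code below). partsToSizes records (rank, n_rows) in the
# original partition order; the loop then asks each rank's result for its
# next local partition, so output partitions come back in input order. The
# ranks and sizes here are made up purely for illustration.
partsToSizes = [(0, 100), (1, 80), (0, 50)]

completed_part_map = {}
ordered_requests = []
for rank, size in partsToSizes:
    if rank not in completed_part_map:
        completed_part_map[rank] = 0
    # (rank, local partition index) requested from that rank's result future
    ordered_requests.append((rank, completed_part_map[rank]))
    completed_part_map[rank] += 1

print(ordered_requests)   # [(0, 0), (1, 0), (0, 1)]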
def _fit_with_colocality(self, X, y):
    input_futures = self.client.sync(extract_colocated_ddf_partitions,
                                     X, y, self.client)
    workers = list(input_futures.keys())

    comms = CommsContext(comms_p2p=False)
    comms.init(workers=workers)

    worker_info = comms.worker_info(comms.worker_addresses)

    n_cols = X.shape[1]
    n_rows = 0

    self.rnks = dict()
    partsToSizes = dict()
    key = uuid1()
    for w, futures in input_futures.items():
        self.rnks[w] = worker_info[w]["r"]
        parts = [(self.client.submit(
            Ridge._func_get_size_cl,
            future,
            workers=[w],
            key="%s-%s" % (key, idx)).result())
            for idx, future in enumerate(futures)]

        partsToSizes[worker_info[w]["r"]] = parts
        for p in parts:
            n_rows = n_rows + p

    key = uuid1()
    self.linear_models = [(w, self.client.submit(
        Ridge._func_create_model,
        comms.sessionId,
        **self.kwargs,
        workers=[w],
        key="%s-%s" % (key, idx)))
        for idx, w in enumerate(workers)]

    key = uuid1()
    linear_fit = dict([(worker_info[wf[0]]["r"], self.client.submit(
        Ridge._func_fit_colocated,
        wf[1],
        input_futures[wf[0]],
        n_rows, n_cols,
        partsToSizes,
        worker_info[wf[0]]["r"],
        key="%s-%s" % (key, idx),
        workers=[wf[0]]))
        for idx, wf in enumerate(self.linear_models)])

    wait(list(linear_fit.values()))
    raise_exception_from_futures(list(linear_fit.values()))

    comms.destroy()

    self.local_model = self.linear_models[0][1].result()
    self.coef_ = self.local_model.coef_
    self.intercept_ = self.local_model.intercept_
def test_dask_exceptions(cluster):
    c = Client(cluster)
    try:
        fut = c.submit(_raise_exception)
        wait(fut)

        with pytest.raises(RuntimeError):
            raise_exception_from_futures([fut])
    finally:
        c.close()
def _inverse_transform(self, X):
    gpu_futures = self.client.sync(extract_ddf_partitions, X)

    worker_to_parts = OrderedDict()
    for w, p in gpu_futures:
        if w not in worker_to_parts:
            worker_to_parts[w] = []
        worker_to_parts[w].append(p)

    key = uuid1()
    partsToRanks = [(self.rnks[wf[0]],
                     self.client.submit(TruncatedSVD._func_get_size,
                                        wf[1],
                                        workers=[wf[0]],
                                        key="%s-%s" % (key, idx)).result())
                    for idx, wf in enumerate(gpu_futures)]

    N = X.shape[1]
    M = reduce(lambda a, b: a + b, map(lambda x: x[1], partsToRanks))

    key = uuid1()
    tsvd_inverse_transform = dict([
        (self.rnks[wf[0]],
         self.client.submit(TruncatedSVD._func_inverse_transform,
                            wf[1],
                            worker_to_parts[wf[0]],
                            M, N,
                            partsToRanks,
                            self.rnks[wf[0]],
                            key="%s-%s" % (key, idx),
                            workers=[wf[0]]))
        for idx, wf in enumerate(self.tsvd_models)
    ])

    wait(list(tsvd_inverse_transform.values()))
    raise_exception_from_futures(list(tsvd_inverse_transform.values()))

    out_futures = []
    completed_part_map = {}
    for rank, size in partsToRanks:
        if rank not in completed_part_map:
            completed_part_map[rank] = 0

        f = tsvd_inverse_transform[rank]
        out_futures.append(
            self.client.submit(TruncatedSVD._func_get_idx, f,
                               completed_part_map[rank]))

        completed_part_map[rank] += 1

    return to_dask_cudf(out_futures)
def predict(self, X):
    """
    Predicts the regressor outputs for X.

    Parameters
    ----------
    X : Dense matrix (floats or doubles) of shape (n_samples, n_features).

    Returns
    ----------
    y : NumPy
        Dense vector (float) of shape (n_samples, 1)
    """
    c = default_client()
    workers = self.workers

    if not isinstance(X, np.ndarray):
        raise ValueError("Predict inputs must be numpy arrays")

    X_Scattered = c.scatter(X)
    futures = list()
    for n, w in enumerate(workers):
        futures.append(
            c.submit(
                RandomForestRegressor._predict,
                self.rfs[w],
                X_Scattered,
                random.random(),
                workers=[w],
            ))

    wait(futures)
    raise_exception_from_futures(futures)

    indexes = list()
    rslts = list()
    for d in range(len(futures)):
        rslts.append(futures[d].result())
        indexes.append(0)

    pred = list()
    for i in range(len(X)):
        pred_per_worker = 0.0
        for d in range(len(rslts)):
            pred_per_worker = pred_per_worker + rslts[d][i]
        pred.append(pred_per_worker / len(rslts))

    return pred
def print_summary(self):
    """
    Print the summary of the forest used to train and test the model.
    """
    futures = list()
    workers = self.workers

    for n, w in enumerate(workers):
        futures.append(
            self.client.submit(
                RandomForestClassifier._print_summary,
                self.rfs[w],
                workers=[w],
            ))

    wait(futures)
    raise_exception_from_futures(futures)

    return self
def _fit(self, model_func, data, **kwargs):

    n_cols = data[0].shape[1]

    data = DistributedDataHandler.create(data=data, client=self.client)
    self.datatype = data.datatype

    comms = CommsContext(comms_p2p=False, verbose=self.verbose)
    comms.init(workers=data.workers)

    data.calculate_parts_to_sizes(comms)
    self.ranks = data.ranks

    lin_models = dict([
        (data.worker_info[wf[0]]["rank"],
         self.client.submit(model_func,
                            comms.sessionId,
                            self.datatype,
                            **self.kwargs,
                            pure=False,
                            workers=[wf[0]]))
        for idx, wf in enumerate(data.worker_to_parts.items())
    ])

    lin_fit = dict([
        (wf[0], self.client.submit(
            _func_fit,
            lin_models[data.worker_info[wf[0]]["rank"]],
            wf[1],
            data.total_rows,
            n_cols,
            data.parts_to_sizes[data.worker_info[wf[0]]["rank"]],
            data.worker_info[wf[0]]["rank"],
            pure=False,
            workers=[wf[0]]))
        for idx, wf in enumerate(data.worker_to_parts.items())
    ])

    wait(list(lin_fit.values()))
    raise_exception_from_futures(list(lin_fit.values()))

    comms.destroy()

    return lin_models
def _fit(self, X, _transform=False):
    """
    Fit the model with X.

    Parameters
    ----------
    X : dask cuDF input
    """
    n_cols = X.shape[1]

    data = DistributedDataHandler.create(data=X, client=self.client)
    self.datatype = data.datatype

    comms = CommsContext(comms_p2p=False)
    comms.init(workers=data.workers)

    data.calculate_parts_to_sizes(comms)

    total_rows = data.total_rows

    models = dict([(data.worker_info[wf[0]]["rank"],
                    self.client.submit(self._create_model,
                                       comms.sessionId,
                                       self._model_func,
                                       self.datatype,
                                       **self.kwargs,
                                       pure=False,
                                       workers=[wf[0]]))
                   for idx, wf in enumerate(data.worker_to_parts.items())])

    pca_fit = dict([
        (wf[0], self.client.submit(
            DecompositionSyncFitMixin._func_fit,
            models[data.worker_info[wf[0]]["rank"]],
            wf[1],
            total_rows,
            n_cols,
            data.parts_to_sizes[data.worker_info[wf[0]]["rank"]],
            data.worker_info[wf[0]]["rank"],
            _transform,
            pure=False,
            workers=[wf[0]]))
        for idx, wf in enumerate(data.worker_to_parts.items())
    ])

    wait(list(pca_fit.values()))
    raise_exception_from_futures(list(pca_fit.values()))

    comms.destroy()

    self.local_model = list(models.values())[0].result()

    self.components_ = self.local_model.components_
    self.explained_variance_ = self.local_model.explained_variance_
    self.explained_variance_ratio_ = \
        self.local_model.explained_variance_ratio_
    self.singular_values_ = self.local_model.singular_values_

    if _transform:
        out_futures = flatten_grouped_results(self.client,
                                              data.gpu_futures,
                                              pca_fit)
        return to_output(out_futures, self.datatype)

    return self
def fit(self, X, _transform=False):
    """
    Fit the model with X.

    Parameters
    ----------
    X : dask cuDF input
    """
    gpu_futures = self.client.sync(extract_ddf_partitions, X)

    self.rnks = dict()
    rnk_counter = 0
    worker_to_parts = OrderedDict()
    for w, p in gpu_futures:
        if w not in worker_to_parts:
            worker_to_parts[w] = []
        if w not in self.rnks.keys():
            self.rnks[w] = rnk_counter
            rnk_counter = rnk_counter + 1
        worker_to_parts[w].append(p)

    workers = list(map(lambda x: x[0], gpu_futures))

    comms = CommsContext(comms_p2p=False)
    comms.init(workers=workers)

    worker_info = comms.worker_info(comms.worker_addresses)

    key = uuid1()
    partsToRanks = [(worker_info[wf[0]]["r"],
                     self.client.submit(TruncatedSVD._func_get_size,
                                        wf[1],
                                        workers=[wf[0]],
                                        key="%s-%s" % (key, idx)).result())
                    for idx, wf in enumerate(gpu_futures)]

    N = X.shape[1]
    M = reduce(lambda a, b: a + b, map(lambda x: x[1], partsToRanks))

    key = uuid1()
    self.tsvd_models = [
        (wf[0], self.client.submit(TruncatedSVD._func_create_model,
                                   comms.sessionId,
                                   wf[1],
                                   **self.kwargs,
                                   workers=[wf[0]],
                                   key="%s-%s" % (key, idx)))
        for idx, wf in enumerate(worker_to_parts.items())
    ]

    key = uuid1()
    tsvd_fit = dict([(worker_info[wf[0]]["r"],
                      self.client.submit(TruncatedSVD._func_fit,
                                         wf[1],
                                         M, N,
                                         partsToRanks,
                                         worker_info[wf[0]]["r"],
                                         _transform,
                                         key="%s-%s" % (key, idx),
                                         workers=[wf[0]]))
                     for idx, wf in enumerate(self.tsvd_models)])

    wait(list(tsvd_fit.values()))
    raise_exception_from_futures(list(tsvd_fit.values()))

    comms.destroy()

    self.local_model = self.client.submit(TruncatedSVD._func_get_first,
                                          self.tsvd_models[0][1]).result()

    self.components_ = self.local_model.components_
    self.explained_variance_ = self.local_model.explained_variance_
    self.explained_variance_ratio_ = \
        self.local_model.explained_variance_ratio_
    self.singular_values_ = self.local_model.singular_values_

    out_futures = []
    if _transform:
        completed_part_map = {}
        for rank, size in partsToRanks:
            if rank not in completed_part_map:
                completed_part_map[rank] = 0

            f = tsvd_fit[rank]
            out_futures.append(
                self.client.submit(TruncatedSVD._func_get_idx, f,
                                   completed_part_map[rank]))

            completed_part_map[rank] += 1

        return to_dask_cudf(out_futures)
def _fit(self, X, _transform=False):
    """
    Fit the model with X.

    Parameters
    ----------
    X : dask cuDF input
    """
    n_cols = X.shape[1]

    data = DistributedDataHandler.create(data=X, client=self.client)
    self.datatype = data.datatype

    if "svd_solver" in self.kwargs \
            and self.kwargs["svd_solver"] == "tsqr":
        comms = CommsContext(comms_p2p=True)
    else:
        comms = CommsContext(comms_p2p=False)

    comms.init(workers=data.workers)

    data.calculate_parts_to_sizes(comms)

    worker_info = comms.worker_info(comms.worker_addresses)
    parts_to_sizes, _ = parts_to_ranks(self.client,
                                       worker_info,
                                       data.gpu_futures)

    total_rows = data.total_rows

    models = dict([(data.worker_info[wf[0]]["rank"],
                    self.client.submit(self._create_model,
                                       comms.sessionId,
                                       self._model_func,
                                       self.datatype,
                                       **self.kwargs,
                                       pure=False,
                                       workers=[wf[0]]))
                   for idx, wf in enumerate(data.worker_to_parts.items())])

    pca_fit = dict([
        (wf[0], self.client.submit(
            DecompositionSyncFitMixin._func_fit,
            models[data.worker_info[wf[0]]["rank"]],
            wf[1],
            total_rows,
            n_cols,
            parts_to_sizes,
            data.worker_info[wf[0]]["rank"],
            _transform,
            pure=False,
            workers=[wf[0]]))
        for idx, wf in enumerate(data.worker_to_parts.items())
    ])

    wait(list(pca_fit.values()))
    raise_exception_from_futures(list(pca_fit.values()))

    comms.destroy()

    self._set_internal_model(list(models.values())[0])

    if _transform:
        out_futures = flatten_grouped_results(self.client,
                                              data.gpu_futures,
                                              pca_fit)
        return to_output(out_futures, self.datatype)

    return self
def test_dask_exceptions(client):
    fut = client.submit(_raise_exception)
    wait(fut)

    with pytest.raises(RuntimeError):
        raise_exception_from_futures([fut])
def _predict_using_cpu(self, X, convert_dtype=True):
    """
    Predicts the labels for X.

    Parameters
    ----------
    X : Dask cuDF dataframe or CuPy backed Dask Array (n_rows, n_features)
        Distributed dense matrix (floats or doubles) of shape
        (n_samples, n_features).
    convert_dtype : bool, optional (default = True)
        When set to True, the predict method will, when necessary, convert
        the input to the data type which was used to train the model. This
        will increase memory used for the method.

    Returns
    ----------
    y : Dask cuDF dataframe or CuPy backed Dask Array (n_rows, 1)
    """
    c = default_client()
    workers = self.workers

    X_Scattered = c.scatter(X)

    futures = list()
    for n, w in enumerate(workers):
        futures.append(
            c.submit(
                RandomForestClassifier._predict_cpu,
                self.rfs[w],
                X_Scattered,
                convert_dtype,
                random.random(),
                workers=[w],
            )
        )

    wait(futures)
    raise_exception_from_futures(futures)

    indexes = list()
    rslts = list()
    for d in range(len(futures)):
        rslts.append(futures[d].result())
        indexes.append(0)

    pred = list()

    for i in range(len(X)):
        classes = dict()
        max_class = -1
        max_val = 0

        for d in range(len(rslts)):
            for j in range(self.n_estimators_per_worker[d]):
                sub_ind = indexes[d] + j
                cls = rslts[d][sub_ind]
                if cls not in classes.keys():
                    classes[cls] = 1
                else:
                    classes[cls] = classes[cls] + 1

                if classes[cls] > max_val:
                    max_val = classes[cls]
                    max_class = cls

            indexes[d] = indexes[d] + self.n_estimators_per_worker[d]

        pred.append(max_class)

    return pred
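# A tiny worked example of the vote layout consumed by the aggregation loop
# above. Each worker returns a flat list of per-tree labels grouped per
# sample: for worker d with k = n_estimators_per_worker[d] trees, entries
# [i*k, (i+1)*k) are that worker's votes for sample i. The votes below are
# made up purely for illustration; ties are broken differently than in the
# method above.
rslts = [
    [0, 1, 1, 0, 0, 0],   # worker 0: 3 trees, votes for 2 samples
    [1, 1, 0, 1],         # worker 1: 2 trees, votes for 2 samples
]
n_estimators_per_worker = [3, 2]

indexes = [0] * len(rslts)
pred = []
for i in range(2):                           # 2 samples
    votes = {}
    for d, k in enumerate(n_estimators_per_worker):
        for j in range(k):
            cls = rslts[d][indexes[d] + j]
            votes[cls] = votes.get(cls, 0) + 1
        indexes[d] += k
    pred.append(max(votes, key=votes.get))

# Sample 0 collects {0: 1, 1: 4}, sample 1 collects {0: 4, 1: 1}.
print(pred)   # [1, 0]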
def fit(self, X, y, convert_dtype=False):
    """
    Fit the input data with a Random Forest classifier

    IMPORTANT: X is expected to be partitioned with at least one partition
    on each Dask worker being used by the forest (self.workers).

    If a worker has multiple data partitions, they will be concatenated
    before fitting, which will lead to additional memory usage. To minimize
    memory consumption, ensure that each worker has exactly one partition.

    When persisting data, you can use
    cuml.dask.common.utils.persist_across_workers to simplify this::

        X_dask_cudf = dask_cudf.from_cudf(X_cudf, npartitions=n_workers)
        y_dask_cudf = dask_cudf.from_cudf(y_cudf, npartitions=n_workers)
        X_dask_cudf, y_dask_cudf = persist_across_workers(dask_client,
                                                          [X_dask_cudf,
                                                           y_dask_cudf])

    (this is equivalent to calling `persist` with the data and workers)::

        X_dask_cudf, y_dask_cudf = dask_client.persist([X_dask_cudf,
                                                        y_dask_cudf],
                                                       workers={
                                                       X_dask_cudf: workers,
                                                       y_dask_cudf: workers
                                                       })

    Parameters
    ----------
    X : dask_cudf.Dataframe
        Dense matrix (floats or doubles) of shape (n_samples, n_features).
        Features of training examples.
    y : dask_cudf.Dataframe
        Dense matrix (floats or doubles) of shape (n_samples, 1)
        Labels of training examples.
        **y must be partitioned the same way as X**
    convert_dtype : bool, optional (default = False)
        When set to True, the fit method will, when necessary, convert
        y to be the same data type as X if they differ. This will increase
        memory used for the method.
    """
    c = default_client()

    self.num_classes = len(y.unique())
    X_futures = workers_to_parts(c.sync(extract_ddf_partitions, X))
    y_futures = workers_to_parts(c.sync(extract_ddf_partitions, y))

    X_partition_workers = [w for w, xc in X_futures.items()]
    y_partition_workers = [w for w, xc in y_futures.items()]

    if set(X_partition_workers) != set(self.workers) or \
       set(y_partition_workers) != set(self.workers):
        raise ValueError("""
          X is not partitioned on the same workers expected by RF\n
          X workers: %s\n
          y workers: %s\n
          RF workers: %s
        """ % (str(X_partition_workers),
               str(y_partition_workers),
               str(self.workers)))

    futures = list()
    for w, xc in X_futures.items():
        futures.append(
            c.submit(
                RandomForestClassifier._fit,
                self.rfs[w],
                xc,
                y_futures[w],
                convert_dtype,
                random.random(),
                workers=[w],
            )
        )

    wait(futures)
    raise_exception_from_futures(futures)

    return self
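# A minimal end-to-end usage sketch for the distributed classifier fit()
# above, following the persist_across_workers recipe from its docstring.
# Illustrative only: it assumes dask_cuda, dask_cudf, cudf and cuml are
# installed and at least one GPU worker is available; the data is synthetic.
import numpy as np
import cudf
import dask_cudf
from dask.distributed import Client
from dask_cuda import LocalCUDACluster
from cuml.dask.common.utils import persist_across_workers
from cuml.dask.ensemble import RandomForestClassifier

if __name__ == "__main__":
    cluster = LocalCUDACluster()
    client = Client(cluster)
    n_workers = len(client.scheduler_info()["workers"])

    X_np = np.random.rand(1000, 8).astype(np.float32)
    y_np = (X_np[:, 0] > 0.5).astype(np.int32)
    X_cudf = cudf.DataFrame({str(i): X_np[:, i]
                             for i in range(X_np.shape[1])})
    y_cudf = cudf.Series(y_np)

    # One partition per worker, pinned in place so the worker check in fit()
    # passes and no partitions need to be concatenated.
    X_dask = dask_cudf.from_cudf(X_cudf, npartitions=n_workers)
    y_dask = dask_cudf.from_cudf(y_cudf, npartitions=n_workers)
    X_dask, y_dask = persist_across_workers(client, [X_dask, y_dask])

    clf = RandomForestClassifier(n_estimators=16)
    clf.fit(X_dask, y_dask)
    preds = clf.predict(X_np)  # the scatter-based predict() expects NumPy

    client.close()
    cluster.close()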
def __init__(
    self,
    n_estimators=10,
    max_depth=-1,
    max_features="auto",
    n_bins=8,
    split_algo=1,
    split_criterion=0,
    min_rows_per_node=2,
    bootstrap=True,
    bootstrap_features=False,
    type_model="classifier",
    verbose=False,
    rows_sample=1.0,
    max_leaves=-1,
    n_streams=4,
    quantile_per_tree=False,
    dtype=None,
    criterion=None,
    min_samples_leaf=None,
    min_weight_fraction_leaf=None,
    max_leaf_nodes=None,
    min_impurity_decrease=None,
    min_impurity_split=None,
    oob_score=None,
    n_jobs=None,
    random_state=None,
    warm_start=None,
    class_weight=None,
    workers=None,
    client=None
):

    unsupported_sklearn_params = {
        "criterion": criterion,
        "min_samples_leaf": min_samples_leaf,
        "min_weight_fraction_leaf": min_weight_fraction_leaf,
        "max_leaf_nodes": max_leaf_nodes,
        "min_impurity_decrease": min_impurity_decrease,
        "min_impurity_split": min_impurity_split,
        "oob_score": oob_score,
        "n_jobs": n_jobs,
        "random_state": random_state,
        "warm_start": warm_start,
        "class_weight": class_weight,
    }

    for key, vals in unsupported_sklearn_params.items():
        if vals is not None:
            raise TypeError(
                "The Scikit-learn variable", key,
                " is not supported in cuML,"
                " please read the cuML documentation for"
                " more information",
            )

    self.n_estimators = n_estimators
    self.n_estimators_per_worker = list()
    self.num_classes = 2

    self.client = default_client() if client is None else client
    if workers is None:
        workers = self.client.has_what().keys()  # Default to all workers
    self.workers = workers

    n_workers = len(workers)
    if n_estimators < n_workers:
        raise ValueError(
            "n_estimators cannot be lower than number of dask workers."
        )

    n_est_per_worker = math.floor(n_estimators / n_workers)
    for i in range(n_workers):
        self.n_estimators_per_worker.append(n_est_per_worker)

    remaining_est = n_estimators - (n_est_per_worker * n_workers)
    for i in range(remaining_est):
        self.n_estimators_per_worker[i] = (
            self.n_estimators_per_worker[i] + 1
        )

    seeds = list()
    seeds.append(0)
    for i in range(1, len(self.n_estimators_per_worker)):
        sd = self.n_estimators_per_worker[i - 1] + seeds[i - 1]
        seeds.append(sd)

    key = str(uuid1())
    self.rfs = {
        worker: self.client.submit(
            RandomForestClassifier._func_build_rf,
            self.n_estimators_per_worker[n],
            max_depth,
            n_streams,
            max_features,
            n_bins,
            split_algo,
            split_criterion,
            min_rows_per_node,
            bootstrap,
            bootstrap_features,
            type_model,
            verbose,
            rows_sample,
            max_leaves,
            quantile_per_tree,
            seeds[n],
            dtype,
            key="%s-%s" % (key, n),
            workers=[worker],
        )
        for n, worker in enumerate(workers)
    }

    rfs_wait = list()
    for r in self.rfs.values():
        rfs_wait.append(r)

    wait(rfs_wait)
    raise_exception_from_futures(rfs_wait)
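# A minimal standalone sketch of the per-worker split and seed-offset logic
# used in __init__ above. The helper name `split_estimators` is hypothetical
# and only for illustration.
import math


def split_estimators(n_estimators, n_workers):
    # Give each worker floor(n/w) trees, then hand the remainder out one tree
    # at a time so per-worker counts differ by at most one.
    per_worker = [math.floor(n_estimators / n_workers)] * n_workers
    for i in range(n_estimators - sum(per_worker)):
        per_worker[i] += 1

    # Seed offsets are cumulative tree counts, so each worker starts its
    # trees from a distinct seed.
    seeds = [0]
    for i in range(1, n_workers):
        seeds.append(seeds[i - 1] + per_worker[i - 1])

    return per_worker, seeds


print(split_estimators(10, 3))   # ([4, 3, 3], [0, 4, 7])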
def predict(self, X):
    """
    Predicts the labels for X.

    Parameters
    ----------
    X : np.array
        Dense matrix (floats or doubles) of shape (n_samples, n_features).
        Features of examples to predict.

    Returns
    ----------
    y : np.array
        Dense vector (int) of shape (n_samples, 1)
    """
    c = default_client()
    workers = self.workers

    X_Scattered = c.scatter(X)

    futures = list()
    for n, w in enumerate(workers):
        futures.append(
            c.submit(
                RandomForestClassifier._predict,
                self.rfs[w],
                X_Scattered,
                random.random(),
                workers=[w],
            ))

    wait(futures)
    raise_exception_from_futures(futures)

    indexes = list()
    rslts = list()
    for d in range(len(futures)):
        rslts.append(futures[d].result())
        indexes.append(0)

    pred = list()

    for i in range(len(X)):
        classes = dict()
        max_class = -1
        max_val = 0

        for d in range(len(rslts)):
            for j in range(self.n_estimators_per_worker[d]):
                sub_ind = indexes[d] + j
                cls = rslts[d][sub_ind]
                if cls not in classes.keys():
                    classes[cls] = 1
                else:
                    classes[cls] = classes[cls] + 1

                if classes[cls] > max_val:
                    max_val = classes[cls]
                    max_class = cls

            indexes[d] = indexes[d] + self.n_estimators_per_worker[d]

        pred.append(max_class)

    return pred
def fit(self, X, y):
    """
    Fit the input data with a Random Forest regression model

    IMPORTANT: X is expected to be partitioned with at least one partition
    on each Dask worker being used by the forest (self.workers).

    When persisting data, you can use
    cuml.dask.common.utils.persist_across_workers to simplify this::

        X_dask_cudf = dask_cudf.from_cudf(X_cudf, npartitions=n_workers)
        y_dask_cudf = dask_cudf.from_cudf(y_cudf, npartitions=n_workers)
        X_dask_cudf, y_dask_cudf = persist_across_workers(dask_client,
                                                          [X_dask_cudf,
                                                           y_dask_cudf])

    (this is equivalent to calling `persist` with the data and workers)::

        X_dask_cudf, y_dask_cudf = dask_client.persist([X_dask_cudf,
                                                        y_dask_cudf],
                                                       workers={
                                                       X_dask_cudf: workers,
                                                       y_dask_cudf: workers
                                                       })

    Parameters
    ----------
    X : dask_cudf.Dataframe
        Dense matrix (floats or doubles) of shape (n_samples, n_features).
        Features of training examples.
    y : dask_cudf.Dataframe
        Dense matrix (floats or doubles) of shape (n_samples, 1)
        Labels of training examples.
        y must be partitioned the same way as X
    """
    c = default_client()

    X_futures = workers_to_parts(c.sync(extract_ddf_partitions, X))
    y_futures = workers_to_parts(c.sync(extract_ddf_partitions, y))

    X_partition_workers = [w for w, xc in X_futures.items()]
    y_partition_workers = [w for w, xc in y_futures.items()]

    if set(X_partition_workers) != set(self.workers) or \
       set(y_partition_workers) != set(self.workers):
        raise ValueError("""
          X is not partitioned on the same workers expected by RF\n
          X workers: %s\n
          y workers: %s\n
          RF workers: %s
        """ % (str(X_partition_workers),
               str(y_partition_workers),
               str(self.workers)))

    futures = list()
    for w, xc in X_futures.items():
        futures.append(
            c.submit(
                RandomForestRegressor._fit,
                self.rfs[w],
                xc,
                y_futures[w],
                random.random(),
                workers=[w],
            ))

    wait(futures)
    raise_exception_from_futures(futures)

    return self
def _fit(self, X, y):
    X_futures = self.client.sync(extract_ddf_partitions, X)
    y_futures = self.client.sync(extract_ddf_partitions, y)

    X_partition_workers = [w for w, xc in X_futures]
    y_partition_workers = [w for w, xc in y_futures]

    if set(X_partition_workers) != set(y_partition_workers):
        raise ValueError("""
          X and y are not partitioned on the same workers expected
          by Linear Regression""")

    self.rnks = dict()
    rnk_counter = 0
    worker_to_parts = OrderedDict()
    for w, p in X_futures:
        if w not in worker_to_parts:
            worker_to_parts[w] = []
        if w not in self.rnks.keys():
            self.rnks[w] = rnk_counter
            rnk_counter = rnk_counter + 1
        worker_to_parts[w].append(p)

    worker_to_parts_y = OrderedDict()
    for w, p in y_futures:
        if w not in worker_to_parts_y:
            worker_to_parts_y[w] = []
        worker_to_parts_y[w].append(p)

    workers = list(map(lambda x: x[0], X_futures))

    comms = CommsContext(comms_p2p=False)
    comms.init(workers=workers)

    worker_info = comms.worker_info(comms.worker_addresses)

    key = uuid1()
    partsToSizes = [(worker_info[wf[0]]["r"], self.client.submit(
        Ridge._func_get_size,
        wf[1],
        workers=[wf[0]],
        key="%s-%s" % (key, idx)).result())
        for idx, wf in enumerate(X_futures)]

    n_cols = X.shape[1]
    n_rows = reduce(lambda a, b: a + b, map(lambda x: x[1], partsToSizes))

    key = uuid1()
    self.linear_models = [(wf[0], self.client.submit(
        Ridge._func_create_model,
        comms.sessionId,
        **self.kwargs,
        workers=[wf[0]],
        key="%s-%s" % (key, idx)))
        for idx, wf in enumerate(worker_to_parts.items())]

    key = uuid1()
    linear_fit = dict([(worker_info[wf[0]]["r"], self.client.submit(
        Ridge._func_fit,
        wf[1],
        worker_to_parts[wf[0]],
        worker_to_parts_y[wf[0]],
        n_rows, n_cols,
        partsToSizes,
        worker_info[wf[0]]["r"],
        key="%s-%s" % (key, idx),
        workers=[wf[0]]))
        for idx, wf in enumerate(self.linear_models)])

    wait(list(linear_fit.values()))
    raise_exception_from_futures(list(linear_fit.values()))

    comms.destroy()

    self.local_model = self.linear_models[0][1].result()
    self.coef_ = self.local_model.coef_
    self.intercept_ = self.local_model.intercept_