def test_reduce_futures(n_parts, client):
    def s(x):
        return x

    a = [client.submit(s, i) for i in range(n_parts)]
    b = reduce(a, sum)
    c = client.compute(b, sync=True)

    # Testing this gets the correct result for now.
    assert sum(range(n_parts)) == c
def fit(self, X, y, classes=None):
    """
    Fit distributed Naive Bayes classifier model

    Parameters
    ----------
    X : dask.Array with blocks containing dense or sparse cupy arrays
    y : dask.Array with blocks containing cupy.ndarray
    classes : array-like containing unique class labels

    Returns
    -------
    cuml.dask.naive_bayes.MultinomialNB current model instance
    """

    # Only Dask.Array supported for now
    if not isinstance(X, dask.array.core.Array):
        raise ValueError("Only dask.Array is supported for X")

    if not isinstance(y, dask.array.core.Array):
        raise ValueError("Only dask.Array is supported for y")

    if len(X.chunks[1]) != 1:
        raise ValueError("X must be chunked by row only. "
                         "Multi-dimensional chunking is not supported")

    futures = DistributedDataHandler.create([X, y], self.client)

    classes = self._unique(y.map_blocks(
        MultinomialNB._unique).compute()) \
        if classes is None else classes

    models = [
        self.client.submit(self._fit, part, classes, self.kwargs, pure=False)
        for w, part in futures.gpu_futures
    ]

    models = reduce(models,
                    self._merge_counts_to_model,
                    client=self.client)

    models = self.client.submit(self._update_log_probs, models, pure=False)

    wait_and_raise_from_futures([models])

    self._set_internal_model(models)

    return self
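# --- Hypothetical usage sketch (not part of the class above) ---
# A minimal example of calling the distributed fit() shown above. It assumes
# a dask-CUDA cluster is available and that MultinomialNB accepts a ``client``
# keyword; the random dense count matrix and all variable names here are
# illustrative only, not taken from the source.
import cupy as cp
import dask.array as da
from dask.distributed import Client
from dask_cuda import LocalCUDACluster
from cuml.dask.naive_bayes import MultinomialNB

if __name__ == "__main__":
    cluster = LocalCUDACluster()
    client = Client(cluster)

    # 1000 documents x 20 term counts, chunked by row only; 2 classes.
    X = da.from_array(
        cp.random.randint(0, 5, (1000, 20)).astype(cp.float32),
        chunks=(250, 20),
    )
    y = da.from_array(
        cp.random.randint(0, 2, 1000).astype(cp.int32),
        chunks=250,
    )

    model = MultinomialNB(client=client)
    model.fit(X, y)
    preds = model.predict(X)

    client.close()
    cluster.close()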
def fit(self, X):
    """
    Fit distributed TFIDF Transformer

    Parameters
    ----------
    X : dask.Array with blocks containing dense or sparse cupy arrays

    Returns
    -------
    cuml.dask.feature_extraction.text.TfidfTransformer instance
    """

    # Only Dask.Array supported for now
    if not isinstance(X, dask.array.core.Array):
        raise ValueError("Only dask.Array is supported for X")

    if len(X.chunks[1]) != 1:
        raise ValueError(
            "X must be chunked by row only. "
            "Multi-dimensional chunking is not supported"
        )

    # We don't do anything if we don't need idf
    if not self.internal_model.use_idf:
        return self

    futures = DistributedDataHandler.create(X, self.client)

    models = [
        self.client.submit(
            self._set_doc_stats, part, self.kwargs, pure=False
        )
        for w, part in futures.gpu_futures
    ]

    models = reduce(models, self._merge_stats_to_model, client=self.client)

    wait_and_raise_from_futures([models])

    models = self.client.submit(self._set_idf_diag, models, pure=False)

    wait_and_raise_from_futures([models])

    self._set_internal_model(models)

    return self
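# --- Hypothetical usage sketch (not part of the class above) ---
# A minimal example of calling the distributed TfidfTransformer.fit() shown
# above. It assumes a dask-CUDA cluster and that the estimator accepts a
# ``client`` keyword; the dense random count matrix stands in for output of a
# term-count vectorizer and is purely illustrative.
import cupy as cp
import dask.array as da
from dask.distributed import Client
from dask_cuda import LocalCUDACluster
from cuml.dask.feature_extraction.text import TfidfTransformer

if __name__ == "__main__":
    cluster = LocalCUDACluster()
    client = Client(cluster)

    # 1000 documents x 50 vocabulary terms, chunked by row only.
    counts = da.from_array(
        cp.random.randint(0, 3, (1000, 50)).astype(cp.float32),
        chunks=(250, 50),
    )

    tfidf = TfidfTransformer(client=client)
    tfidf.fit(counts)                       # learns document frequencies / idf
    tfidf_matrix = tfidf.transform(counts)  # scales counts by the learned idf

    client.close()
    cluster.close()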
def test_reduce_futures(n_parts, cluster):
    def s(x):
        return x

    client = Client(cluster)

    try:
        a = [client.submit(s, i) for i in range(n_parts)]
        b = reduce(a, sum)
        c = client.compute(b, sync=True)

        # Testing this gets the correct result for now.
        assert sum(range(n_parts)) == c
    finally:
        client.close()