Code example #1
 def predict(self, X):
     client = default_client()
     class_probs = predict(client, self._Booster, X)
     if class_probs.ndim > 1:
         cidx = da.argmax(class_probs, axis=1)
     else:
         cidx = (class_probs > 0).astype(np.int64)
     return cidx
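The XGBoost-style dask `predict` above either takes the argmax across per-class probabilities or thresholds a 1-D margin at zero. Below is a minimal, self-contained sketch of that selection logic on plain dask arrays; the sample values are made up for illustration and no booster is involved.

import dask.array as da
import numpy as np

# 2-D output: one probability per class, pick the most likely class per row.
probs_multi = da.from_array(np.array([[0.1, 0.7, 0.2],
                                      [0.5, 0.3, 0.2]]))
labels_multi = da.argmax(probs_multi, axis=1)            # -> [1, 0]

# 1-D output: a raw binary margin, thresholded at zero.
margin_binary = da.from_array(np.array([-0.3, 1.2]))
labels_binary = (margin_binary > 0).astype(np.int64)     # -> [0, 1]

print(labels_multi.compute(), labels_binary.compute())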
Code example #2
 def predict_proba(self, data, ntree_limit=None):
     client = default_client()
     if ntree_limit is not None:
         raise NotImplementedError(
             "'ntree_limit' is not currently supported."
         )
     class_probs = predict(client, self._Booster, data)
     return class_probs
Code example #3
File: core.py Project: lgh0504/dask-lightgbm
 def predict(self, X, client=None, **kwargs):
     if client is None:
         client = default_client()
     return predict(client,
                    self.to_local(),
                    X,
                    dtype=self.classes_.dtype,
                    **kwargs)
Code example #4
    def close(self, running=True):
        from dask.distributed import default_client

        try:
            client = default_client()
            client.close()
        except ValueError:
            pass
Code example #5
File: dask.py Project: urihoenig/mlrun-1
 def client(self):
     from dask.distributed import Client, default_client
     try:
         return default_client()
     except ValueError:
         if self._cluster:
             return Client(self._cluster)
         return Client()
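The property above reuses an already-registered scheduler and only creates a new `Client` when none exists. A standalone sketch of the same fallback pattern, assuming only `dask.distributed` is installed (the `cluster` argument and the function name are hypothetical):

from dask.distributed import Client, default_client

def get_or_create_client(cluster=None):
    try:
        # default_client() raises ValueError when no client has been started yet.
        return default_client()
    except ValueError:
        if cluster is not None:
            return Client(cluster)
        return Client()  # spins up a local cluster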
Code example #6
File: louvain.py Project: h2oai/cugraph
def louvain(input_graph, max_iter=100, resolution=1.0, load_balance=True):
    """
    Compute the modularity optimizing partition of the input graph using the
    Louvain method on multiple GPUs

    Examples
    --------
    >>> import cugraph.dask as dcg
    >>> Comms.initialize()
    >>> chunksize = dcg.get_chunksize(input_data_path)
    >>> ddf = dask_cudf.read_csv('datasets/karate.csv', chunksize=chunksize,
    ...                          delimiter=' ',
    ...                          names=['src', 'dst', 'value'],
    ...                          dtype=['int32', 'int32', 'float32'])
    >>> dg = cugraph.Graph()
    >>> dg.from_dask_cudf_edgelist(ddf, source='src', destination='dst',
    ...                            edge_attr='value')
    >>> parts, modularity_score = dcg.louvain(dg)
    """
    # FIXME: finish docstring: describe parameters, etc.

    # FIXME: import here to prevent circular import: cugraph->louvain
    # wrapper->cugraph/structure->cugraph/dask->dask/louvain->cugraph/structure
    # from cugraph.structure.graph import Graph

    # FIXME: dask methods to populate graphs from edgelists are only present on
    # DiGraph classes. Disable the Graph check for now and assume inputs are
    # symmetric DiGraphs.
    # if type(graph) is not Graph:
    #     raise Exception("input graph must be undirected")

    client = default_client()

    if (input_graph.local_data is not None
            and input_graph.local_data['by'] == 'src'):
        data = input_graph.local_data['data']
    else:
        data = get_local_data(input_graph, by='src', load_balance=load_balance)

    result = dict([(data.worker_info[wf[0]]["rank"],
                    client.submit(call_louvain,
                                  Comms.get_session_id(),
                                  wf[1],
                                  data.local_data,
                                  max_iter,
                                  resolution,
                                  workers=[wf[0]]))
                   for idx, wf in enumerate(data.worker_to_parts.items())])
    wait(result)

    (parts, modularity_score) = result[0].result()

    if input_graph.renumbered:
        # MG renumbering is lazy, but it's safe to assume it's been called at
        # this point if renumbered=True
        parts = input_graph.unrenumber(parts, "vertex")

    return parts, modularity_score
Code example #7
    def predict_model_on_cpu(self, X, convert_dtype=True):
        """
        Predicts the labels for X.

        Parameters
        ----------
        X : Dask cuDF dataframe or CuPy backed Dask Array (n_rows, n_features)
            Distributed dense matrix (floats or doubles) of shape
            (n_samples, n_features).
        convert_dtype : bool, optional (default = True)
            When set to True, the predict method will, when necessary, convert
            the input to the data type which was used to train the model. This
            will increase memory used for the method.
        Returns
        ----------
        y : Dask cuDF dataframe or CuPy backed Dask Array (n_rows, 1)
        """
        c = default_client()
        workers = self.workers

        X_Scattered = c.scatter(X)
        futures = list()
        for n, w in enumerate(workers):
            futures.append(
                c.submit(
                    RandomForestClassifier._predict_model_on_cpu,
                    self.rfs[w],
                    X_Scattered,
                    convert_dtype,
                    workers=[w],
                ))

        rslts = self.client.gather(futures, errors="raise")
        indexes = np.zeros(len(futures), dtype=np.int32)
        pred = list()

        for i in range(len(X)):
            classes = dict()
            max_class = -1
            max_val = 0

            for d in range(len(rslts)):
                for j in range(self.n_estimators_per_worker[d]):
                    sub_ind = indexes[d] + j
                    cls = rslts[d][sub_ind]
                    if cls not in classes.keys():
                        classes[cls] = 1
                    else:
                        classes[cls] = classes[cls] + 1

                    if classes[cls] > max_val:
                        max_val = classes[cls]
                        max_class = cls

                indexes[d] = indexes[d] + self.n_estimators_per_worker[d]

            pred.append(max_class)
        return pred
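The nested loops above amount to a per-row majority vote over the labels returned by every tree on every worker. A tiny sketch of that vote for a single row, using `collections.Counter` on made-up labels:

from collections import Counter

# Labels predicted for one sample by all trees across all workers.
votes = [0, 1, 1, 1, 0]
majority_label = Counter(votes).most_common(1)[0][0]   # -> 1
print(majority_label)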
Code example #8
File: dask.py Project: marcelonyc/mlrun_old
 def close(self):
     from dask.distributed import Client, default_client, as_completed
     try:
         client = default_client()
         client.close()
     except ValueError:
         pass
     if self._cluster:
         self._cluster.close()
Code example #9
File: core.py Project: zhouyonglong/dask-lightgbm
 def predict_proba(self, X, client=None, **kwargs):
     if client is None:
         client = default_client()
     return predict(client,
                    self.to_local(),
                    X,
                    proba=True,
                    dtype=self.classes_[0].dtype,
                    **kwargs)
Code example #10
def test_parquet_concat_within_workers(client_connection):
    if not os.path.exists("test_files_parquet"):
        print("Generate data... ")
        os.mkdir("test_files_parquet")
    for x in range(10):
        if not os.path.exists("test_files_parquet/df" + str(x)):
            df = utils.random_edgelist(e=100,
                                       ef=16,
                                       dtypes={
                                           "src": np.int32,
                                           "dst": np.int32
                                       },
                                       seed=x)
            df.to_parquet("test_files_parquet/df" + str(x), index=False)

    n_gpu = get_n_workers()

    print("Read_parquet... ")
    t1 = time.time()
    ddf = dask_cudf.read_parquet("test_files_parquet/*",
                                 dtype=["int32", "int32"])
    ddf = ddf.persist()
    futures_of(ddf)
    wait(ddf)
    t1 = time.time() - t1
    print("*** Read Time: ", t1, "s")
    print(ddf)

    assert ddf.npartitions > n_gpu

    print("Drop_duplicates... ")
    t2 = time.time()
    ddf.drop_duplicates(inplace=True)
    ddf = ddf.persist()
    futures_of(ddf)
    wait(ddf)
    t2 = time.time() - t2
    print("*** Drop duplicate time: ", t2, "s")
    assert t2 < t1

    print("Repartition... ")
    t3 = time.time()
    # Note that ideally we would use:
    # ddf = ddf.repartition(npartitions=n_gpu)
    # However, this is slower than reading and requires more memory.
    # Using a custom concat instead.
    client = default_client()
    ddf = concat_within_workers(client, ddf)
    ddf = ddf.persist()
    futures_of(ddf)
    wait(ddf)
    t3 = time.time() - t3
    print("*** repartition Time: ", t3, "s")
    print(ddf)

    assert t3 < t1
Code example #11
File: ridge.py Project: tusharkalecam/cuml
 def __init__(self, client=None, **kwargs):
     """
     Initializes the ridge regression class.
     """
     self.client = default_client() if client is None else client
     self.kwargs = kwargs
     self.coef_ = None
     self.intercept_ = None
     self._model_fit = False
     self._consec_call = 0
Code example #12
File: _incremental.py Project: EpyDoc/dask-ml
    def fit(self, X, y, **fit_params):
        """Find the best parameters for a particular model.

        Parameters
        ----------
        X, y : array-like
        **fit_params
            Additional partial fit keyword arguments for the estimator.
        """
        return default_client().sync(self._fit, X, y, **fit_params)
Code example #13
File: part_utils.py Project: goncaloperes/cugraph
def persist_distributed_data(dask_df, client):
    client = default_client() if client is None else client
    worker_addresses = Comms.get_workers()
    _keys = dask_df.__dask_keys__()
    worker_dict = {}
    for i, key in enumerate(_keys):
        worker_dict[str(key)] = tuple([worker_addresses[i]])
    persisted = client.persist(dask_df, workers=worker_dict)
    parts = futures_of(persisted)
    return parts
Code example #14
def setup_dask(cls):
    try:
        from dask.distributed import default_client
        client = default_client()
    except (ImportError, ValueError):
        client = _startup_dask(2.0)
    print('Dask Client:', client)

    if cls is not None:
        setattr(cls, 'dask_client_', client)
Code example #15
File: dask_arr_utils.py Project: teju85/cuml
def to_dask_cudf(dask_arr, client=None):
    client = default_client() if client is None else client

    elms = [_to_cudf(dp) for dp in dask_arr.to_delayed().flatten()]
    dfs = client.compute(elms)

    meta = client.submit(_get_meta, dfs[0])
    meta_local = meta.result()

    return dd.from_delayed(dfs, meta=meta_local)
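A hedged usage sketch for the helper above. It assumes a CUDA-capable environment with cudf/dask_cudf installed, a running Dask client, and the `_to_cudf`/`_get_meta` helpers from the same module; the input array is synthetic.

import dask.array as da

darr = da.random.random((1000, 4), chunks=(250, 4))
ddf = to_dask_cudf(darr)    # dask DataFrame backed by cudf partitions
print(ddf.head())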
Code example #16
    def fit(self, ddf):
        """
        Fits a single-node, multi-GPU kNN model using the single-process,
        multi-GPU technique.
        :param ddf: distributed dataframe holding the training data
        :return:
        """

        client = default_client()

        # Keep the futures around so the GPU memory doesn't get
        # deallocated on the workers.
        gpu_futures, cols = client.sync(self._get_mg_info, ddf)
        self.gpu_futures = gpu_futures

        host_dict = self._build_host_dict(gpu_futures, client).items()
        if len(host_dict) > 1:
            raise Exception("Dask cluster appears to span hosts. Current "
                            "multi-GPU version is limited to single host")

        # Choose a random worker on each unique host to run dask-cuml's
        # kNN.fit() function on all the cuDFs living on that host.
        self.master_host = [(host, random.sample(ports, 1)[0])
                            for host, ports in host_dict][0]

        host, port = self.master_host

        gpu_futures_for_host = list(
            filter(lambda d: d[0][0] == host, gpu_futures))
        exec_node = (host, port)

        # build ipc handles
        gpu_data_excl_worker = list(
            filter(lambda d: d[0] != exec_node, gpu_futures_for_host))
        gpu_data_incl_worker = list(
            filter(lambda d: d[0] == exec_node, gpu_futures_for_host))

        ipc_handles = [
            client.submit(get_ipc_handle, future, workers=[worker])
            for worker, future in gpu_data_excl_worker
        ]

        raw_arrays = [future for worker, future in gpu_data_incl_worker]

        f = (exec_node,
             client.submit(_fit_on_worker, (ipc_handles, raw_arrays), {
                 "D": cols,
                 "should_downcast": self.should_downcast
             },
                           workers=[exec_node]))

        wait(f)

        # The model on each unique host is held for futures queries
        self.model = f
Code example #17
 def __init__(self,
              n_clusters=8,
              max_iter=300,
              tol=1e-4,
              verbose=0,
              random_state=1,
              precompute_distances='auto',
              init='scalable-k-means++',
              n_init=1,
              algorithm='auto',
              client=None):
     """
     Constructor for distributed KMeans model
     handle : cuml.Handle
         If it is None, a new one is created just for this class.
     n_clusters : int (default = 8)
         The number of centroids or clusters you want.
     max_iter : int (default = 300)
         The more iterations of EM, the more accurate, but slower.
     tol : float (default = 1e-4)
         Stopping criterion when centroid means do not change much.
     verbose : boolean (default = 0)
         If True, prints diagnostic information.
     random_state : int (default = 1)
         If you want results to be the same when you restart Python,
         select a state.
     precompute_distances : boolean (default = 'auto')
         Not supported yet.
     init : {'scalable-kmeans++', 'k-means||' , 'random' or an ndarray}
            (default = 'scalable-k-means++')
         'scalable-k-means++' or 'k-means||': Uses fast and stable scalable
         kmeans++ initialization.
         'random': Choose 'n_cluster' observations (rows) at random
         from data for the initial centroids. If an ndarray is passed,
         it should be of shape (n_clusters, n_features) and gives the
         initial centers.
     n_init : int (default = 1)
         Number of times initialization is run. More is slower,
         but can be better.
     algorithm : "auto"
         Currently uses full EM, but will support others later.
     n_gpu : int (default = 1)
         Number of GPUs to use. Currently uses single GPU, but will support
         multiple GPUs later.
     """
     self.client = default_client() if client is None else client
     self.max_iter = max_iter
     self.tol = tol
     self.random_state = random_state
     self.precompute_distances = precompute_distances
     self.n_init = n_init
     self.algorithm = algorithm
     self.n_clusters = n_clusters
     self.init = init
     self.verbose = verbose
Code example #18
def to_dask_cudf(futures):
    """
    Convert a list of futures containing cudf Dataframes into a Dask.Dataframe
    :param futures: list[cudf.Dataframe] list of futures containing dataframes
    :return: dask.Dataframe a dask.Dataframe
    """
    c = default_client()
    # Convert a list of futures containing dfs back into a dask_cudf
    dfs = [d for d in futures if d.type != type(None)]  # NOQA
    meta = c.submit(get_meta, dfs[0]).result()
    return dd.from_delayed(dfs, meta=meta)
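A small usage sketch for the helper above, assuming a GPU environment with cudf installed, a running client, and the `get_meta` helper from the same module; the scattered frames are made up for illustration.

import cudf
from dask.distributed import default_client

client = default_client()
futures = [client.scatter(cudf.DataFrame({"a": [i, i + 1]}))
           for i in range(3)]
ddf = to_dask_cudf(futures)
print(ddf.compute())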
Code example #19
File: dask.py Project: inteplus/mtstreamz
 def update(self, batch, who=None, metadata=None):
     try:
         client = default_client()
         result = [
             client.submit(self.func, x, *self.args, **self.kwargs)
             for x in batch
         ]
     except Exception as e:
         logger.exception(e)
         raise
     else:
         return self._emit(result, metadata=metadata)
Code example #20
def to_dask_cudf(futures, client=None):
    """
    Convert a list of futures containing cudf Dataframes into a Dask.Dataframe
    :param futures: list[cudf.Dataframe] list of futures containing dataframes
    :param client: dask.distributed.Client Optional client to use
    :return: dask.Dataframe a dask.Dataframe
    """
    c = default_client() if client is None else client
    # Convert a list of futures containing dfs back into a dask_cudf
    dfs = [d for d in futures if d.type != type(None)]  # NOQA
    meta = c.submit(get_meta, dfs[0]).result()
    return dd.from_delayed(dfs, meta=meta)
Code example #21
File: dask.py Project: xiaoahang/LightGBM
    def _fit(self, model_factory, X, y=None, sample_weight=None, client=None, **kwargs):
        """Docstring is inherited from the LGBMModel."""
        if client is None:
            client = default_client()

        params = self.get_params(True)
        model = _train(client, X, y, params, model_factory, sample_weight, **kwargs)

        self.set_params(**model.get_params())
        self._copy_extra_params(model, self)

        return self
Code example #22
File: randomforestregressor.py Project: rnyak/cuml
    def predict(self, X):
        """
        Predicts the regressor outputs for X.

        Parameters
        ----------
        X : Dense matrix (floats or doubles) of shape (n_samples, n_features).

        Returns
        ----------
        y: NumPy
           Dense vector (float) of shape (n_samples, 1)

        """
        c = default_client()
        workers = self.workers

        if not isinstance(X, np.ndarray):
            raise ValueError("Predict inputs must be numpy arrays")

        X_Scattered = c.scatter(X)

        futures = list()
        for n, w in enumerate(workers):
            futures.append(
                c.submit(
                    RandomForestRegressor._predict,
                    self.rfs[w],
                    X_Scattered,
                    random.random(),
                    workers=[w],
                ))

        wait(futures)
        raise_exception_from_futures(futures)

        indexes = list()
        rslts = list()
        for d in range(len(futures)):
            rslts.append(futures[d].result())
            indexes.append(0)

        pred = list()

        for i in range(len(X)):
            pred_per_worker = 0.0
            for d in range(len(rslts)):
                pred_per_worker = pred_per_worker + rslts[d][i]

            pred.append(pred_per_worker / len(rslts))

        return pred
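The final loop above averages the per-row predictions returned by each worker. A minimal sketch of that reduction on made-up numbers:

rslts = [[1.0, 2.0],          # predictions from worker 0
         [3.0, 4.0]]          # predictions from worker 1
n_rows = 2
pred = [sum(r[i] for r in rslts) / len(rslts) for i in range(n_rows)]
print(pred)    # -> [2.0, 3.0]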
Code example #23
    def fit(self, X, y=None, sample_weight=None, client=None, **kwargs):
        if client is None:
            client = default_client()

        model_factory = lightgbm.LGBMRegressor
        params = self.get_params(True)
        model = train(client, X, y, params, model_factory, sample_weight,
                      **kwargs)

        self.set_params(**model.get_params())
        self._copy_extra_params(model, self)

        return self
Code example #24
    def fit(self, X, y=None, **fit_params):
        """Find the best parameters for a particular model.

        Parameters
        ----------
        X, y : array-like
        **fit_params
            Additional partial fit keyword arguments for the estimator.
        """
        client = default_client()
        if not client.asynchronous:
            return client.sync(self._fit, X, y, **fit_params)
        return self._fit(X, y, **fit_params)
Code example #25
File: tsvd.py Project: tusharkalecam/cuml
    def __init__(self, client=None, **kwargs):
        """
        Constructor for distributed TruncatedSVD model
        """
        self.client = default_client() if client is None else client
        self.kwargs = kwargs

        # define attributes to make sure they
        # are available even on untrained object
        self.local_model = None
        self.components_ = None
        self.explained_variance_ = None
        self.explained_variance_ratio_ = None
        self.singular_values_ = None
Code example #26
    def __init__(self, client=None, streams_per_handle=0, verbose=False,
                 **kwargs):

        raise NotImplementedError("Multi-GPU KNN is not available in RAPIDS "
                                  "0.11, it will be enabled in the next "
                                  "release. Legacy version is available in "
                                  "0.10.")
        self.client = default_client() if client is None else client
        self.model_args = kwargs
        self.X = None
        self.Y = None
        self.n_cols = 0
        self.streams_per_handle = streams_per_handle
        self.verbose = verbose
Code example #27
File: part_utils.py Project: goncaloperes/cugraph
def load_balance_func(ddf_, by, client=None):
    # Load balances the sorted dask_cudf DataFrame.
    # Input is a dask_cudf dataframe ddf_ which is sorted by
    # the column name passed as the 'by' argument.

    client = default_client() if client is None else client

    parts = persist_distributed_data(ddf_, client)
    wait(parts)

    who_has = client.who_has(parts)
    key_to_part = [(str(part.key), part) for part in parts]
    gpu_futures = [(first(who_has[key]), part.key[1], part)
                   for key, part in key_to_part]
    worker_to_data = create_dict(gpu_futures)

    # Calculate cumulative sum in each dataframe partition
    cumsum_parts = [
        client.submit(get_cumsum, wf[1][0][0], by, workers=[wf[0]]).result()
        for idx, wf in enumerate(worker_to_data.items())
    ]

    num_rows = []
    for cumsum in cumsum_parts:
        num_rows.append(cumsum.iloc[-1])

    # Calculate current partition divisions
    divisions = [sum(num_rows[0:x:1]) for x in range(0, len(num_rows) + 1)]
    divisions[-1] = divisions[-1] - 1
    divisions = tuple(divisions)

    # Set global index from 0 to len(dask_cudf_dataframe) so that global
    # indexing of divisions can be used for repartitioning.
    futures = [
        client.submit(set_global_index,
                      wf[1][0][0],
                      divisions[wf[1][0][1]],
                      workers=[wf[0]])
        for idx, wf in enumerate(worker_to_data.items())
    ]
    wait(futures)

    ddf = dask_cudf.from_delayed(futures)
    ddf.divisions = divisions

    # Repartition the data
    ddf = repartition(ddf, cumsum_parts)

    return ddf
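The divisions tuple above is derived from per-partition cumulative row counts. A tiny worked sketch with made-up partition sizes:

num_rows = [4, 3, 5]                       # rows per partition
divisions = [sum(num_rows[:x]) for x in range(len(num_rows) + 1)]
divisions[-1] -= 1                         # last division is an inclusive index
print(tuple(divisions))                    # -> (0, 4, 7, 11)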
Code example #28
File: input_utils.py Project: daxiongshu/cuml
def _to_dask_cudf(futures, client=None):
    """
    Convert a list of futures containing cudf Dataframes into a Dask.Dataframe
    :param futures: list[cudf.Dataframe] list of futures containing dataframes
    :param client: dask.distributed.Client Optional client to use
    :return: dask.Dataframe a dask.Dataframe
    """
    c = default_client() if client is None else client
    # Convert a list of futures containing dfs back into a dask_cudf
    dfs = [d for d in futures if d.type != type(None)]  # NOQA
    if logger.should_log_for(logger.level_debug):
        logger.debug("to_dask_cudf dfs=%s" % str(dfs))
    meta_future = c.submit(_get_meta, dfs[0], pure=False)
    meta = meta_future.result()
    return dd.from_delayed(dfs, meta=meta)
Code example #29
File: kmeans.py Project: tusharkalecam/cuml
    def __init__(self, client=None, **kwargs):
        """
        Constructor for distributed KMeans model

        Parameters
        ----------
        handle : cuml.Handle
            If it is None, a new one is created just for this class.
        n_clusters : int (default = 8)
            The number of centroids or clusters you want.
        max_iter : int (default = 300)
            The more iterations of EM, the more accurate, but slower.
        tol : float (default = 1e-4)
            Stopping criterion when centroid means do not change much.
        verbose : boolean (default = 0)
            If True, prints diagnostic information.
        random_state : int (default = 1)
            If you want results to be the same when you restart Python,
            select a state.
        init : {'scalable-kmeans++', 'k-means||' , 'random' or an ndarray}
               (default = 'scalable-k-means++')
            'scalable-k-means++' or 'k-means||': Uses fast and stable scalable
            kmeans++ initialization.
            'random': Choose 'n_cluster' observations (rows) at random
            from data for the initial centroids. If an ndarray is passed,
            it should be of shape (n_clusters, n_features) and gives the
            initial centers.
        oversampling_factor : int (default = 2) The amount of points to sample
            in scalable k-means++ initialization for potential centroids.
            Increasing this value can lead to better initial centroids at the
            cost of memory. The total number of centroids sampled in scalable
            k-means++ is oversampling_factor * n_clusters * 8.
        max_samples_per_batch : int (default = 32768) The number of data
            samples to use for batches of the pairwise distance computation.
            This computation is done throughout both fit predict. The default
            should suit most cases. The total number of elements in the
            batched pairwise distance computation is max_samples_per_batch
            * n_clusters. It might become necessary to lower this number when
            n_clusters becomes prohibitively large.

        Attributes
        ----------
        cluster_centers_ : array
            The coordinates of the final clusters. This represents the "mean"
            of each data cluster.
        """
        self.client = default_client() if client is None else client
        self.kwargs = kwargs
Code example #30
File: core.py Project: zhouyonglong/dask-lightgbm
    def fit(self, X, y=None, sample_weight=None, client=None, **kwargs):
        if client is None:
            client = default_client()
        model_factory = lightgbm.LGBMRegressor
        params = self.get_params(True)

        model = train(client, X, y, params, model_factory, sample_weight,
                      **kwargs)
        self.set_params(**model.get_params())
        self._Booster = model._Booster
        self._n_features = model._n_features
        self._evals_result = model._evals_result
        self._best_iteration = model._best_iteration
        self._best_score = model._best_score

        return self