Example #1
def create_cuml_distributed(X_train, y_train):
    start_time = datetime.now()
    print('init dask cluster')

    cluster = LocalCUDACluster(threads_per_worker=1)
    client = Client(cluster)
    workers = client.has_what().keys()

    n_workers = len(workers)
    X_train_cudf = cudf.DataFrame.from_pandas(pd.DataFrame(X_train))
    y_train_cudf = cudf.Series(y_train)

    X_train_dask = dask_cudf.from_cudf(X_train_cudf, npartitions=n_workers)
    y_train_dask = dask_cudf.from_cudf(y_train_cudf, npartitions=n_workers)

    X_train_ddask, y_train_ddask = dask_utils.persist_across_workers(
        client, [X_train_dask, y_train_dask], workers=workers)
    print('cuml distributed initialized', datetime.now() - start_time)
    model = distributed_cuml_Rf(n_estimators=500, n_streams=64)
    # Fit on the distributed, persisted data rather than the host arrays.
    model.fit(X_train_ddask, y_train_ddask)

    wait(model.rfs)
    print('cuml distributed finished', datetime.now() - start_time)
    client.close()
    cluster.close()
    return model
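A minimal driver sketch for the helper above. The NumPy inputs are illustrative assumptions; note that the function spins up its own LocalCUDACluster, fits, and tears the cluster down again, so only the fitted model object comes back.

import numpy as np

# Hypothetical training data; shapes and values are illustrative only.
X_train = np.random.rand(10000, 20).astype(np.float32)
y_train = np.random.rand(10000).astype(np.float32)

# The helper handles cluster setup/teardown and data distribution itself.
model = create_cuml_distributed(X_train, y_train)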
Example #2
def test_rf_regression_dask_fil(partitions_per_worker, cluster):

    # Use CUDA_VISIBLE_DEVICES to control the number of workers
    c = Client(cluster)

    try:

        X, y = make_regression(n_samples=10000,
                               n_features=20,
                               n_informative=10,
                               random_state=123)

        X = X.astype(np.float32)
        y = y.astype(np.float32)

        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=1000)

        cu_rf_params = {
            'n_estimators': 50,
            'max_depth': 16,
            'n_bins': 16,
        }

        workers = c.has_what().keys()
        n_partitions = partitions_per_worker * len(workers)

        X_cudf = cudf.DataFrame.from_pandas(pd.DataFrame(X_train))
        X_train_df = \
            dask_cudf.from_cudf(X_cudf, npartitions=n_partitions)

        y_cudf = np.array(pd.DataFrame(y_train).values)
        y_cudf = y_cudf[:, 0]
        y_cudf = cudf.Series(y_cudf)
        y_train_df = \
            dask_cudf.from_cudf(y_cudf, npartitions=n_partitions)
        X_cudf_test = cudf.DataFrame.from_pandas(pd.DataFrame(X_test))
        X_test_df = \
            dask_cudf.from_cudf(X_cudf_test, npartitions=n_partitions)

        X_train_df, y_train_df = dask_utils.persist_across_workers(
            c, [X_train_df, y_train_df], workers=workers)

        cu_rf_mg = cuRFR_mg(**cu_rf_params)
        cu_rf_mg.fit(X_train_df, y_train_df)

        cu_rf_mg_predict = cu_rf_mg.predict(X_test_df).compute()
        cu_rf_mg_predict = cp.asnumpy(cp.array(cu_rf_mg_predict))

        # r2_score expects (y_true, y_pred) in that order.
        acc_score = r2_score(y_test, cu_rf_mg_predict)

        assert acc_score >= 0.67

    finally:
        c.close()
Example #3
def test_end_to_end(nrows, ncols, nclusters, n_parts, delayed_predict,
                    cluster):

    client = None

    try:

        client = Client(cluster)
        from cuml.dask.cluster import KMeans as cumlKMeans

        from cuml.dask.datasets import make_blobs

        X_cudf, y = make_blobs(nrows,
                               ncols,
                               nclusters,
                               n_parts,
                               cluster_std=0.01,
                               verbose=False,
                               random_state=10)

        wait(X_cudf)

        cumlModel = cumlKMeans(verbose=0,
                               init="k-means||",
                               n_clusters=nclusters,
                               random_state=10)

        cumlModel.fit(X_cudf)
        cumlLabels = cumlModel.predict(X_cudf, delayed_predict)

        n_workers = len(list(client.has_what().keys()))

        # Verifying we are grouping partitions. This should be changed soon.
        if n_parts is not None and n_parts < n_workers:
            assert cumlLabels.npartitions == n_parts
        else:
            assert cumlLabels.npartitions == n_workers

        cumlPred = cp.array(cumlLabels.compute())

        assert cumlPred.shape[0] == nrows
        assert np.max(cumlPred) == nclusters - 1
        assert np.min(cumlPred) == 0

        labels = np.squeeze(y.compute().to_pandas().values)

        # cumlPred.get() returns a NumPy array, so squeeze it with NumPy too.
        score = adjusted_rand_score(labels, np.squeeze(cumlPred.get()))

        print(str(score))

        assert 1.0 == score

    finally:
        if client is not None:
            client.close()
Example #4
def test_end_to_end(nrows, ncols, nclusters, n_parts, cluster):

    client = Client(cluster)

    try:
        from cuml.dask.cluster import KMeans as cumlKMeans

        from cuml.dask.datasets import make_blobs

        X_cudf, y = make_blobs(nrows,
                               ncols,
                               nclusters,
                               n_parts,
                               cluster_std=0.01,
                               verbose=True,
                               random_state=10)

        wait(X_cudf)

        cumlModel = cumlKMeans(verbose=1,
                               init="k-means||",
                               n_clusters=nclusters,
                               random_state=10)

        cumlModel.fit(X_cudf)
        cumlLabels = cumlModel.predict(X_cudf)
        n_workers = len(list(client.has_what().keys()))

        # Verifying we are grouping partitions. This should be changed soon.
        if n_parts is not None and n_parts < n_workers:
            assert cumlLabels.npartitions == n_parts
        else:
            assert cumlLabels.npartitions == n_workers

        from sklearn.metrics import adjusted_rand_score

        cumlPred = cumlLabels.compute().to_pandas().values

        assert cumlPred.shape[0] == nrows
        assert np.max(cumlPred) == nclusters - 1
        assert np.min(cumlPred) == 0

        labels = y.compute().to_pandas().values

        score = adjusted_rand_score(labels.reshape(labels.shape[0]), cumlPred)

        assert 1.0 == score

    finally:
        client.close()
Example #5
def test_rf_regression(n_workers, partitions_per_worker):
    if dask_cuda.utils.get_n_gpus() < n_workers:
        pytest.skip("too few GPUs")

    cluster = LocalCUDACluster(threads_per_worker=1, n_workers=n_workers)
    c = Client(cluster)

    X, y = make_regression(n_samples=40000,
                           n_features=20,
                           n_informative=10,
                           random_state=123)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1000)

    cu_rf_params = {
        'n_estimators': 25,
        'max_depth': 13,
    }

    workers = c.has_what().keys()
    n_partitions = partitions_per_worker * len(workers)

    X_cudf = cudf.DataFrame.from_pandas(pd.DataFrame(X_train))
    X_train_df = \
        dask_cudf.from_cudf(X_cudf, npartitions=n_partitions)

    y_cudf = np.array(pd.DataFrame(y_train).values)
    y_cudf = y_cudf[:, 0]
    y_cudf = cudf.Series(y_cudf)
    y_train_df = \
        dask_cudf.from_cudf(y_cudf, npartitions=n_partitions)

    X_train_df, y_train_df = dask_utils.persist_across_workers(
        c, [X_train_df, y_train_df], workers=workers)

    cu_rf_mg = cuRFR_mg(**cu_rf_params)
    cu_rf_mg.fit(X_train_df, y_train_df)
    cu_rf_mg_predict = cu_rf_mg.predict(X_test)

    # r2_score expects (y_true, y_pred) in that order.
    acc_score = r2_score(y_test, cu_rf_mg_predict)

    print(str(acc_score))

    assert acc_score >= 0.70

    c.close()
    cluster.close()
Example #6
def test_ols(cluster):

    client = Client(cluster)

    try:

        import dask_cudf

        import cudf
        import numpy as np

        from cuml.dask.linear_model import LinearRegression as cumlOLS_dask

        nrows = 2**8
        ncols = 399

        X, y = load_data(nrows, ncols)

        X_cudf = cudf.DataFrame.from_pandas(X)
        # pandas removed DataFrame.as_matrix(); to_numpy() is the current API.
        y_cudf = np.array(y.to_numpy())
        y_cudf = y_cudf[:, 0]
        y_cudf = cudf.Series(y_cudf)

        workers = client.has_what().keys()

        X_df = dask_cudf.from_cudf(X_cudf, npartitions=len(workers)).persist()
        y_df = dask_cudf.from_cudf(y_cudf, npartitions=len(workers)).persist()

        lr = cumlOLS_dask()

        lr.fit(X_df, y_df)

        ret = lr.predict(X_df)

        # cudf deprecated Series.to_array(); to_numpy() is the replacement.
        error_cuml = mean_squared_error(y, ret.compute().to_numpy())

        assert(error_cuml < 1e-6)

    finally:
        client.close()
        cluster.close()
Example #7
class RapidsCloudML(object):
    def __init__(
        self,
        cloud_type="Azure",
        model_type="RandomForest",
        data_type="Parquet",
        compute_type="single-GPU",
        verbose_estimator=False,
        CSP_paths=default_azureml_paths,
    ):

        self.CSP_paths = CSP_paths
        self.cloud_type = cloud_type
        self.model_type = model_type
        self.data_type = data_type
        self.compute_type = compute_type
        self.verbose_estimator = verbose_estimator
        self.log_to_file(
            f"\n> RapidsCloudML\n\tCompute, Data, Model, Cloud types: "
            f"{self.compute_type}, {self.data_type}, "
            f"{self.model_type}, {self.cloud_type}"
        )

        # Setting up client for multi-GPU option
        if "multi" in self.compute_type:
            self.log_to_file("\n\tMulti-GPU selected")
            # This will use all GPUs on the local host by default
            cluster = LocalCUDACluster(threads_per_worker=1)
            self.client = Client(cluster)

            # Query the client for all connected workers
            self.workers = self.client.has_what().keys()
            self.n_workers = len(self.workers)
            self.log_to_file(f"\n\tClient information {self.client}")

    def load_hyperparams(self, model_name="XGBoost"):
        """
        Selects model parameters based on the model chosen for execution.
        Checks for a config file at self.CSP_paths['hyperparams'] containing
        the parameters for the experiment; if none is present, it returns the
        default parameters.

        Parameters
        ----------
        model_name : string
                     Selects which model to set the parameters for. Takes either 'XGBoost' or 'RandomForest'.

        Returns
        ----------
        model_params : dict
                       Loaded model parameters (dict)
        """

        self.log_to_file("\n> Loading Hyperparameters")

        # Default parameters of the models
        if self.model_type == "XGBoost":
            # https://xgboost.readthedocs.io/en/latest/parameter.html
            model_params = {
                "max_depth": 6,
                "num_boost_round": 100,
                "learning_rate": 0.3,
                "gamma": 0.0,
                "lambda": 1.0,
                "alpha": 0.0,
                "objective": "binary:logistic",
                "random_state": 0,
            }

        elif self.model_type == "RandomForest":
            # https://docs.rapids.ai/api/cuml/stable/  -> cuml.ensemble.RandomForestClassifier
            model_params = {
                "n_estimators": 10,
                "max_depth": 10,
                "n_bins": 16,
                "max_features": 1.0,
                "seed": 0,
            }

        hyperparameters = {}
        try:
            with open(self.CSP_paths["hyperparams"], "r") as file_handle:
                hyperparameters = json.load(file_handle)
                for key, value in hyperparameters.items():
                    model_params[key] = value
                pprint.pprint(model_params)
                return model_params

        except Exception as error:
            self.log_to_file(str(error))
            # No config file found: fall back to the defaults, as the
            # docstring promises, instead of returning None.
            return model_params

    def load_data(self,
                  filename="dataset.orc",
                  col_labels=None,
                  y_label="ArrDelayBinary"):
        """
        Loads the dataset from the given filename, keeping only the columns of
        interest. Also generates y_label from the 'ArrDelay' column to turn
        this into a binary classification problem.

        Parameters
        ----------
        filename : string
                   the path of the dataset to be loaded

        col_labels : list of strings
                     The input columns that we are interested in. None selects all the columns

        y_label : string
                  The target column for the prediction task.

        Returns
        ----------
        dataset : dataframe (Pandas, cudf or dask-cudf)
                  Ingested dataset in the format of a dataframe

        col_labels : list of strings
                     The input columns selected

        y_label : string
                  The generated y_label name for binary classification

        duration : float
                   The time it took to execute the function
        """
        target_filename = filename
        self.log_to_file(f"\n> Loading dataset from {target_filename}")

        with PerfTimer() as ingestion_timer:
            if "CPU" in self.compute_type:
                # CPU Reading options
                self.log_to_file(f"\n\tCPU read")

                if self.data_type == "ORC":
                    with open(target_filename, mode="rb") as file:
                        dataset = pyarrow_orc.ORCFile(file).read().to_pandas()
                elif self.data_type == "CSV":
                    dataset = pd.read_csv(target_filename, names=col_labels)

                elif self.data_type == "Parquet":

                    if "single" in self.compute_type:
                        dataset = pd.read_parquet(target_filename)

                    elif "multi" in self.compute_type:
                        self.log_to_file(f"\n\tReading using dask dataframe")
                        dataset = dask.dataframe.read_parquet(target_filename,
                                                              columns=columns)

            elif "GPU" in self.compute_type:
                # GPU Reading Option

                self.log_to_file(f"\n\tGPU read")
                if self.data_type == "ORC":
                    dataset = cudf.read_orc(target_filename)

                elif self.data_type == "CSV":
                    dataset = cudf.read_csv(target_filename, names=col_labels)

                elif self.data_type == "Parquet":

                    if "single" in self.compute_type:
                        dataset = cudf.read_parquet(target_filename)

                    elif "multi" in self.compute_type:
                        self.log_to_file(f"\n\tReading using dask_cudf")
                        dataset = dask_cudf.read_parquet(target_filename,
                                                         columns=col_labels)

        # cast all columns to float32
        for col in dataset.columns:
            dataset[col] = dataset[col].astype(
                np.float32)  # needed for random forest

        # Adding y_label column if it is not present
        if y_label not in dataset.columns:
            dataset[y_label] = 1.0 * (dataset["ArrDelay"] > 10)

        dataset[y_label] = dataset[y_label].astype(
            np.int32)  # Needed for cuml RF

        dataset = dataset.fillna(
            0.0)  # Filling the null values. Needed for dask-cudf

        self.log_to_file(
            f"\n\tIngestion completed in {ingestion_timer.duration}")
        self.log_to_file(
            f"\n\tDataset descriptors: {dataset.shape}\n\t{dataset.dtypes}")
        return dataset, col_labels, y_label, ingestion_timer.duration

    def split_data(self,
                   dataset,
                   y_label,
                   train_size=0.8,
                   random_state=0,
                   shuffle=True):
        """
        Splits the data into train and test sets, using the appropriate library
        for each compute mode.
        CPU compute - uses sklearn; we manually filter the y_label column in the split call.
        GPU compute - single GPU uses cuml and multi-GPU uses dask; both split y_label internally.

        Parameters
        ----------
        dataset : dataframe
                  The dataframe on which we wish to perform the split
        y_label : string
                  The name of the column (not the series itself)
        train_size : float
                     The size for the split. Takes values between 0 to 1.
        random_state : int
                       Useful for running reproducible splits.
        shuffle : binary
                  Specifies if the data must be shuffled before splitting.

        Returns
        ----------
        X_train : dataframe
                  The data to be used for training. Has same type as input dataset.
        X_test : dataframe
                  The data to be used for testing. Has same type as input dataset.
        y_train : dataframe
                  The label to be used for training. Has same type as input dataset.
        y_test : dataframe
                  The label to be used for testing. Has same type as input dataset.
        duration : float
                   The time it took to perform the split
        """
        self.log_to_file("\n> Splitting train and test data")
        start_time = time.perf_counter()

        with PerfTimer() as split_timer:
            if "CPU" in self.compute_type:
                X_train, X_test, y_train, y_test = sklearn_train_test_split(
                    dataset.loc[:, dataset.columns != y_label],
                    dataset[y_label],
                    train_size=train_size,
                    shuffle=shuffle,
                    random_state=random_state,
                )

            elif "GPU" in self.compute_type:
                if "single" in self.compute_type:
                    X_train, X_test, y_train, y_test = cuml_train_test_split(
                        X=dataset,
                        y=y_label,
                        train_size=train_size,
                        shuffle=shuffle,
                        random_state=random_state,
                    )
                elif "multi" in self.compute_type:
                    X_train, X_test, y_train, y_test = dask_train_test_split(
                        dataset,
                        y_label,
                        train_size=train_size,
                        shuffle=False,  # shuffle not available for dask_cudf yet
                        random_state=random_state,
                    )

        self.log_to_file(
            f"\n\tX_train shape and type: {X_train.shape} {type(X_train)}")
        self.log_to_file(f"\n\tSplit completed in {split_timer.duration}")
        return X_train, X_test, y_train, y_test, split_timer.duration

    def train_model(self, X_train, y_train, model_params):
        """
        Trains a model with the model_params specified by calling fit_xgboost or
        fit_random_forest depending on the model_type.

        Parameters
        ----------
        X_train : dataframe
                  The data for training
        y_train : dataframe
                  The label to be used for training.
        model_params : dict
                       The model params to use for this training
        Returns
        ----------
        trained_model : The object of the trained model either of XGBoost or RandomForest

        training_time : float
                        The time it took to train the model
        """
        self.log_to_file(
            f"\n> Training {self.model_type} estimator w/ hyper-params")
        training_time = 0
        trained_model = None  # guards the return below if training raises

        try:
            if self.model_type == "XGBoost":
                trained_model, training_time = self.fit_xgboost(
                    X_train, y_train, model_params)
            elif self.model_type == "RandomForest":
                trained_model, training_time = self.fit_random_forest(
                    X_train, y_train, model_params)
        except Exception as error:
            self.log_to_file("\n\n!error during model training: " + str(error))
        self.log_to_file(f"\n\tFinished training in {training_time:.4f} s")
        return trained_model, training_time

    def fit_xgboost(self, X_train, y_train, model_params):
        """
        Trains a XGBoost model on X_train and y_train with model_params

        Parameters and Objects returned are same as trained_model
        """
        if "GPU" in self.compute_type:
            model_params.update({"tree_method": "gpu_hist"})
        else:
            model_params.update({"tree_method": "hist"})

        with PerfTimer() as train_timer:
            if "single" in self.compute_type:
                train_DMatrix = xgboost.DMatrix(data=X_train, label=y_train)
                trained_model = xgboost.train(
                    dtrain=train_DMatrix,
                    params=model_params,
                    num_boost_round=model_params["num_boost_round"],
                )
            elif "multi" in self.compute_type:
                self.log_to_file("\n\tTraining multi-GPU XGBoost")
                train_DMatrix = xgboost.dask.DaskDMatrix(self.client,
                                                         data=X_train,
                                                         label=y_train)
                trained_model = xgboost.dask.train(
                    self.client,
                    dtrain=train_DMatrix,
                    params=model_params,
                    num_boost_round=model_params["num_boost_round"],
                )
        return trained_model, train_timer.duration

    def fit_random_forest(self, X_train, y_train, model_params):
        """
        Trains a RandomForest model on X_train and y_train with model_params.
        Depending on compute_type, estimators from appropriate packages are used.
        CPU - sklearn
        Single-GPU - cuml
        multi_gpu - cuml.dask

        Parameters and Objects returned are same as trained_model
        """
        if "CPU" in self.compute_type:
            rf_model = sklearn.ensemble.RandomForestClassifier(
                n_estimators=model_params["n_estimators"],
                max_depth=model_params["max_depth"],
                max_features=model_params["max_features"],
                n_jobs=int(self.n_workers),
                verbose=self.verbose_estimator,
            )
        elif "GPU" in self.compute_type:
            if "single" in self.compute_type:
                rf_model = cuml.ensemble.RandomForestClassifier(
                    n_estimators=model_params["n_estimators"],
                    max_depth=model_params["max_depth"],
                    n_bins=model_params["n_bins"],
                    max_features=model_params["max_features"],
                    verbose=self.verbose_estimator,
                )
            elif "multi" in self.compute_type:
                self.log_to_file("\n\tFitting multi-GPU daskRF")
                X_train, y_train = dask_utils.persist_across_workers(
                    self.client,
                    [X_train.fillna(0.0),
                     y_train.fillna(0.0)],
                    workers=self.workers,
                )
                rf_model = cuml.dask.ensemble.RandomForestClassifier(
                    n_estimators=model_params["n_estimators"],
                    max_depth=model_params["max_depth"],
                    n_bins=model_params["n_bins"],
                    max_features=model_params["max_features"],
                    verbose=self.verbose_estimator,
                )
        trained_model = None  # guards the return below if fit raises
        with PerfTimer() as train_timer:
            try:
                trained_model = rf_model.fit(X_train, y_train)
            except Exception as error:
                self.log_to_file("\n\n! Error during fit " + str(error))
        return trained_model, train_timer.duration

    def evaluate_test_perf(self, trained_model, X_test, y_test, threshold=0.5):
        """
        Evaluates the model performance on the inference set. For XGBoost we
        build a DMatrix and then evaluate the model. For Random Forest, the
        single-GPU case can simply call the .score function, while the
        multi-GPU case predicts with the model first and then computes the
        accuracy score.

        Parameters
        ----------
        trained_model : The object of the trained model either of XGBoost or RandomForest
        X_test : dataframe
                  The data for testing
        y_test : dataframe
                  The label to be used for testing.
        Returns
        ----------
        test_accuracy : float
                        The accuracy achieved on test set
        duration : float
                   The time it took to evaluate the model
        """
        self.log_to_file(f"\n> Inferencing on test set")
        test_accuracy = None
        with PerfTimer() as inference_timer:
            try:
                if self.model_type == "XGBoost":
                    if "multi" in self.compute_type:
                        test_DMatrix = xgboost.dask.DaskDMatrix(self.client,
                                                                data=X_test,
                                                                label=y_test)
                        xgb_pred = xgboost.dask.predict(
                            self.client, trained_model,
                            test_DMatrix).compute()
                        xgb_pred = (xgb_pred > threshold) * 1.0
                        test_accuracy = accuracy_score(y_test.compute(),
                                                       xgb_pred)
                    elif "single" in self.compute_type:
                        test_DMatrix = xgboost.DMatrix(data=X_test,
                                                       label=y_test)
                        xgb_pred = trained_model.predict(test_DMatrix)
                        xgb_pred = (xgb_pred > threshold) * 1.0
                        test_accuracy = accuracy_score(y_test, xgb_pred)

                elif self.model_type == "RandomForest":
                    if "multi" in self.compute_type:
                        cuml_pred = trained_model.predict(X_test).compute()
                        self.log_to_file("\n\tPrediction complete")
                        test_accuracy = accuracy_score(y_test.compute(),
                                                       cuml_pred,
                                                       convert_dtype=True)
                    elif "single" in self.compute_type:
                        test_accuracy = trained_model.score(
                            X_test, y_test.astype("int32"))

            except Exception as error:
                self.log_to_file("\n\n!error during inference: " + str(error))

        self.log_to_file(
            f"\n\tFinished inference in {inference_timer.duration:.4f} s")
        self.log_to_file(f"\n\tTest-accuracy: {test_accuracy}")
        return test_accuracy, inference_timer.duration

    def set_up_logging(self):
        """
        Function to set up logging for the object.
        """
        logging_path = self.CSP_paths["output"] + "/log.txt"
        logging.basicConfig(filename=logging_path, level=logging.INFO)

    def log_to_file(self, text):
        """
        Logs the text that comes in as input.
        """
        logging.info(text)
        print(text)
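The class above depends on a PerfTimer context manager that is not shown. A minimal sketch consistent with how it is used here (with PerfTimer() as t: ... followed by t.duration) might be:

import time

class PerfTimer:
    """Assumed sketch of the timing context manager used above."""

    def __enter__(self):
        self.start = time.perf_counter()
        return self

    def __exit__(self, exc_type, exc_value, exc_tb):
        # Elapsed wall-clock seconds, read afterwards as `timer.duration`.
        self.duration = time.perf_counter() - self.start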
Example #8
# Imports and a running Dask client are assumed by this snippet; a minimal
# setup might be:
import pandas as pd
import dask.dataframe as dd
import xgboost as xgb
from sklearn import metrics
from dask.distributed import Client, LocalCluster

client = Client(LocalCluster())

d_train = pd.read_csv("https://raw.githubusercontent.com/szilard/benchm-ml--data/master/int_enc/train-1m-intenc.csv")
d_test = pd.read_csv("https://raw.githubusercontent.com/szilard/benchm-ml--data/master/int_enc/test-1m-intenc.csv")

dx_train = dd.from_pandas(d_train, npartitions=16)
dx_test = dd.from_pandas(d_test, npartitions=1)

X_train = dx_train.iloc[:, :-1].to_dask_array(lengths=True)
y_train = dx_train.iloc[:,-1:].to_dask_array(lengths=True)
X_test = dx_test.iloc[:, :-1].to_dask_array(lengths=True)
y_test = dx_test.iloc[:,-1:].to_dask_array(lengths=True)

# persist() returns new collections; rebind the names so the persisted
# versions are actually used.
X_train = X_train.persist()
y_train = y_train.persist()

client.has_what()


dxgb_train = xgb.dask.DaskDMatrix(client, X_train, y_train)
dxgb_test = xgb.dask.DaskDMatrix(client, X_test)


param = {'objective':'binary:logistic', 'tree_method':'hist', 'max_depth':10, 'eta':0.1}             
%time md = xgb.dask.train(client, param, dxgb_train, num_boost_round = 100)


y_pred = xgb.dask.predict(client, md, dxgb_test)
y_pred_loc = y_pred.compute()
y_test_loc = y_test.compute()
print(metrics.roc_auc_score(y_test_loc, y_pred_loc))
Example #9
    def run(self, client: DaskClient):
        """
        Run the algorithm.

        Parameters
        ----------
        client : DaskClient
            A client to Dask.

        Notes
        -----
        The Redis client ``rj`` (a rejson.Client) is created internally via
        ``self.redis_client()``. This function runs the adaptive algorithm.
        Because it's asynchronous, this function should return if
        ``"reset" in rj.keys() and rj.jsonget("reset")``.

        """
        rj = self.redis_client()

        answers: List = []
        logger.info(f"Staring {self.ident}")

        def submit(fn: str, *args, allow_other_workers=True, **kwargs):
            if "workers" in kwargs:
                kwargs.update({"allow_other_workers": allow_other_workers})
            return client.submit(
                getattr(type(self), fn),
                *args,
                **kwargs,
            )

        update = False
        queries = np.array([])
        scores = np.array([])
        n_model_updates = 0
        rj.jsonset(f"alg-perf-{self.ident}", root, [])
        save_deadline = 0.0  # right away
        data: List[Dict[str, Any]] = []

        error_raised: List[int] = []
        for k in itertools.count():
            try:
                loop_start = time()
                datum = {"iteration": k, "ident": self.ident, "time": time()}

                answers = self.get_answers(rj, clear=True)
                datum["num_answers"] = len(answers)
                self_future = client.scatter(self)

                _start = time()
                if len(queries) and len(scores):
                    queries_f = client.scatter(queries)
                    scores_f = client.scatter(scores)
                else:
                    queries_f = scores_f = []
                if update:
                    datum["cleared_queries"] = True
                    __start = time()
                    self.clear_queries(rj)
                    datum["time_clearing"] = time() - __start
                else:
                    datum["cleared_queries"] = False
                done = distributed.Event(name="pa_finished")
                done.clear()

                workers = list(client.has_what())
                random.shuffle(workers)
                f_post = submit(
                    "post_queries",
                    self_future,
                    queries_f,
                    scores_f,
                    done=done,
                    workers=workers[0],
                )
                f_model = submit(
                    "process_answers",
                    self_future,
                    answers,
                    workers=workers[1],
                )

                f_search = submit(
                    "get_queries",
                    self_future,
                    stop=done,
                    workers=workers[2],
                )

                time_model = 0.0
                time_post = 0.0
                time_search = 0.0

                def _model_done(_):
                    nonlocal time_model
                    nonlocal done
                    done.set()
                    time_model += time() - _start

                def _post_done(_):
                    nonlocal time_post
                    time_post += time() - _start

                def _search_done(_):
                    nonlocal time_search
                    time_search += time() - _start

                f_model.add_done_callback(_model_done)
                f_post.add_done_callback(_post_done)
                f_search.add_done_callback(_search_done)

                # Future.result raises errors automatically
                posted = f_post.result()
                new_self, update = f_model.result()
                queries, scores, search_meta = f_search.result()

                _datum_update = {
                    "n_queries_posted": posted,
                    "n_queries_scored": len(queries),
                    "n_queries_in_db": rj.zcard(f"alg-{self.ident}-queries"),
                    "model_updated": update,
                    "n_model_updates": n_model_updates,
                    "time_posting_queries": time_post,
                    "time_model_update": time_model,
                    "time_search": time_search,
                    "time": time(),
                    **search_meta,
                }
                datum.update(_datum_update)
                if update:
                    _s = time()
                    self.__dict__.update(new_self.__dict__)
                    datum["time_update"] = time() - _s
                    n_model_updates += 1

                if time() > save_deadline + 1e-3:
                    save_deadline = time() + 60
                    _s = time()
                    self.save()
                    datum["time_save"] = time() - _s
                datum["time_loop"] = time() - loop_start

                data.append(datum)
                logger.info(datum)
                posting_deadline = data[0]["time"] + 2 * 60
                if time() >= posting_deadline or k == 10 or k == 20:
                    flush_logger(logger)
                    keys = data[-1].keys()
                    to_post = {}
                    for _k in keys:
                        vals = [d.get(_k, None) for d in data]
                        vals = [v for v in vals if v]
                        if not len(vals):
                            continue
                        if isinstance(vals[0], (int, np.integer)):
                            Type = int
                        elif isinstance(vals[0], (float, np.floating)):
                            Type = float
                        else:
                            continue
                        _update = {
                            f"{_k}_median": np.median(vals),
                            f"{_k}_mean": np.mean(vals),
                            f"{_k}_min": np.min(vals),
                            f"{_k}_max": np.max(vals),
                        }
                        if _k == "time":
                            _update = {"time": _update["time_median"]}
                        to_post.update(
                            {_k: Type(v)
                             for _k, v in _update.items()})

                    try:
                        rj.jsonarrappend(f"alg-perf-{self.ident}", root,
                                         to_post)
                    except ResponseError as e:
                        if ("could not perform this operation on a key that doesn't exist"
                                in str(e)):
                            # I think this happens when the frontend deletes
                            # the database when /reset is triggered
                            pass
                        else:
                            raise e

                    data = []

                if "reset" in rj.keys() and rj.jsonget("reset", root):
                    logger.warning(f"Resetting {self.ident}")
                    self.reset(client, rj, futures=[f_model, f_post, f_search])
                    break

            except Exception as e:
                logger.exception(e)
                flush_logger(logger)
                error_raised.append(k)

                __n = 5
                if np.diff(error_raised[-__n:]).tolist() == [1] * (__n - 1):
                    logger.exception(e)
                    flush_logger(logger)
                    raise e
        return True
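A hedged sketch of how the alg-perf records posted above might be read back for monitoring. The key prefix and aggregation field names come from the loop above; the host settings and the "RandomSampling" ident are illustrative assumptions.

from rejson import Client, Path

rj = Client(host="localhost", port=6379, decode_responses=True)
root = Path.rootPath()

# Each element is one aggregated batch of loop statistics.
records = rj.jsonget("alg-perf-RandomSampling", root)  # ident is hypothetical
for r in records:
    print(r.get("time_model_update_median"), r.get("time_search_median"))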
Example #10
    X_dask, y_dask = \
      dask_utils.persist_across_workers(c, [X_dask, y_dask], workers=workers)
    
    return X_dask, y_dask


if __name__ == "__main__":
    ## Use dask to set up a cluster

    # This will use all GPUs on the local host by default;
    # a local_directory can be set to use on-node disk for caching.
    cluster = LocalCUDACluster(threads_per_worker=1)
    c = Client(cluster)

    # Query the client for all connected workers
    workers = c.has_what().keys()
    n_workers = len(workers)
    n_streams = 8 # Performance optimization

    ## setting parameters

    # Data parameters
    train_size = 100000
    test_size = 1000
    n_samples = train_size + test_size
    n_features = 20

    # Random Forest building parameters
    max_depth = 12
    n_bins = 16
    n_trees = 1000
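The example above is truncated after the parameter setup. A hedged continuation, following the usual RAPIDS multi-GPU random-forest workflow; the imports, the classifier, and make_classification are assumptions consistent with the variables already defined:

import cudf
import dask_cudf
import pandas as pd
from cuml.dask.common import utils as dask_utils
from cuml.dask.ensemble import RandomForestClassifier as cumlDaskRF
from sklearn.datasets import make_classification

# Generate a toy classification problem sized by the parameters above.
X, y = make_classification(n_samples=n_samples, n_features=n_features,
                           random_state=0)
X, y = X.astype('float32'), y.astype('int32')
X_train, y_train = X[:train_size], y[:train_size]

# Move the training data to GPU and partition it across the workers.
X_cudf = cudf.DataFrame.from_pandas(pd.DataFrame(X_train))
y_cudf = cudf.Series(y_train)
X_dask = dask_cudf.from_cudf(X_cudf, npartitions=n_workers)
y_dask = dask_cudf.from_cudf(y_cudf, npartitions=n_workers)
X_dask, y_dask = dask_utils.persist_across_workers(
    c, [X_dask, y_dask], workers=workers)

# Fit a distributed random forest with the parameters set above.
cuml_model = cumlDaskRF(max_depth=max_depth, n_bins=n_bins,
                        n_estimators=n_trees, n_streams=n_streams)
cuml_model.fit(X_dask, y_dask)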
Example #11
class RapidsCloudML(object):
    def __init__(self,
                 model_type='RandomForest',
                 compute_type='multi-GPU',
                 CSP_paths=default_sagemaker_paths):

        self.CSP_paths = CSP_paths
        self.model_type = model_type
        self.compute_type = compute_type

        # CPU or GPU cluster
        if 'multi-GPU' in self.compute_type:
            self.n_workers = cupy.cuda.runtime.getDeviceCount()
            self.cluster = LocalCUDACluster(n_workers=self.n_workers)
            self.client = Client(self.cluster)
            print(f'dask multi-GPU cluster with {self.n_workers} workers ')

        elif 'multi-CPU' in self.compute_type:
            self.n_workers = os.cpu_count()
            self.cluster = LocalCluster(n_workers=self.n_workers,
                                        threads_per_worker=1)
            self.client = Client(self.cluster)
            print(f'dask multi-CPU cluster with {self.n_workers} workers')
        else:
            self.cluster = None
            self.client = None

    def load_data(self, filename='*.parquet', columns=None):

        target_filename = self.CSP_paths['train_data'] + '/' + filename
        self.log(f'\n> loading dataset from {target_filename}...\n')

        with PerfTimer(self, 'ingestion_timer'):
            if 'multi-CPU' in self.compute_type:
                dataset = dask.dataframe.read_parquet(target_filename,
                                                      columns=columns)

            elif 'multi-GPU' in self.compute_type:
                dataset = dask_cudf.read_parquet(target_filename,
                                                 columns=columns)

            dataset = dataset.dropna()
            dataset = dataset.repartition(npartitions=self.n_workers * 4)

        print(f'dataset len : {len(dataset)}')
        return dataset

    def split_data(self,
                   dataset,
                   y_label,
                   train_size=.8,
                   random_state=0,
                   shuffle=True):

        with PerfTimer(self, 'split_timer'):
            # Forward train_size; shuffle cannot be honored yet --
            # no dask_cudf sampler implemented.
            train, test = train_test_split(
                dataset, train_size=train_size, random_state=random_state
            )

            X_train, y_train = train.drop(
                y_label,
                axis=1).astype('float32'), train[y_label].astype('int32')
            X_test, y_test = test.drop(
                y_label,
                axis=1).astype('float32'), test[y_label].astype('int32')

        if 'multi-GPU' in self.compute_type:
            with PerfTimer(self, 'persist_timer'):
                workers = self.client.has_what().keys()
                X_train, X_test, y_train, y_test = persist_across_workers(
                    self.client, [X_train, X_test, y_train, y_test],
                    workers=workers)
                wait([X_train, X_test, y_train, y_test])

        return X_train, X_test, y_train, y_test

    def train_model(self, X_train, y_train, model_params):

        with PerfTimer(self, 'train_timer'):

            if 'XGBoost' in self.model_type:
                dtrain = xgboost.dask.DaskDMatrix(self.client, X_train,
                                                  y_train)

                # avoids warning messages
                boosting_rounds = model_params.pop('num_boost_round')

                trained_model = xgboost.dask.train(
                    self.client,
                    model_params,
                    dtrain,
                    num_boost_round=boosting_rounds)
                return trained_model['booster']

            elif 'RandomForest' in self.model_type:
                if 'GPU' in self.compute_type:
                    from cuml.dask.ensemble import RandomForestClassifier
                    rf_model = RandomForestClassifier(
                        n_estimators=model_params['n_estimators'],
                        max_depth=model_params['max_depth'],
                        max_features=model_params['max_features'],
                        n_bins=32)
                else:
                    from sklearn.ensemble import RandomForestClassifier
                    rf_model = RandomForestClassifier(
                        n_estimators=model_params['n_estimators'],
                        max_depth=model_params['max_depth'],
                        max_features=model_params['max_features'],
                        n_jobs=-1)

                trained_model = rf_model.fit(X_train, y_train)
                return trained_model
        return None

    def evaluate_test_perf(self, trained_model, X_test, y_test, threshold=0.5):
        with PerfTimer(self, 'score_timer'):

            if 'XGBoost' in self.model_type:
                dtest = xgboost.dask.DaskDMatrix(self.client, X_test, y_test)
                predictions = xgboost.dask.predict(self.client, trained_model,
                                                   dtest).compute()
                predictions = np.where(
                    predictions >= threshold, 1,
                    0)  # threshold returned probabilities into 0/1 labels

            elif 'RandomForest' in self.model_type:
                predictions = trained_model.predict(X_test)
                if 'multi-CPU' not in self.compute_type:
                    predictions = predictions.compute()

            if 'multi' in self.compute_type:
                y_test = y_test.compute()

            if 'GPU' in self.compute_type:
                test_accuracy = cuml_accuracy_score(y_test, predictions)
            elif 'CPU' in self.compute_type:
                test_accuracy = sklearn_accuracy_score(y_test, predictions)

        # accumulate internal list
        return test_accuracy

    # emit score so sagemaker can parse it (using string REGEX)
    def emit_score(self, test_accuracy):
        self.log(f'\n\t test-accuracy: {test_accuracy}; \n')

    def save_best_model(self, global_best_model=None):
        pass

    def set_up_logging(self):
        logging_path = self.CSP_paths['output'] + '/log.txt'
        logging.basicConfig(filename=logging_path, level=logging.INFO)

    def log(self, text):
        logging.info(text)
        print(text)
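A hedged end-to-end driver for the class above; the y_label column name and the hyper-parameter values are illustrative assumptions, not taken from the original.

# Hypothetical driver for RapidsCloudML; values below are illustrative.
rcml = RapidsCloudML(model_type='RandomForest', compute_type='multi-GPU')
dataset = rcml.load_data(filename='*.parquet')
X_train, X_test, y_train, y_test = rcml.split_data(dataset,
                                                   y_label='ArrDelayBinary')
model_params = {'n_estimators': 100, 'max_depth': 10, 'max_features': 1.0}
trained_model = rcml.train_model(X_train, y_train, model_params)
test_accuracy = rcml.evaluate_test_perf(trained_model, X_test, y_test)
rcml.emit_score(test_accuracy)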
Example #12
def test_end_to_end():

    cluster = LocalCUDACluster(threads_per_worker=1)
    client = Client(cluster)

    # NOTE: The LocalCUDACluster needs to be started before any imports that
    # could potentially create a CUDA context.

    import dask_cudf

    import cudf
    import numpy as np

    from dask_cuml.neighbors import NearestNeighbors as cumlKNN

    def create_df(f, m, n):
        X = np.random.rand(m, n)
        ret = cudf.DataFrame(
            [(i, X[:, i].astype(np.float32)) for i in range(n)],
            index=cudf.dataframe.RangeIndex(f * m, f * m + m, 1))
        return ret

    def get_meta(df):
        ret = df.iloc[:0]
        return ret

    # Per gpu/worker
    train_m = 500
    train_n = 25

    search_m = 10
    search_k = 15

    workers = client.has_what().keys()

    # Create dfs on each worker (gpu)
    dfs = [
        client.submit(create_df, n, train_m, train_n, workers=[worker])
        for worker, n in list(zip(workers, list(range(len(workers)))))
    ]

    # Wait for completion
    wait(dfs)

    meta = client.submit(get_meta, dfs[0]).result()

    X_df = dask_cudf.from_delayed(dfs, meta=meta)
    X_pd = X_df.compute().to_pandas()

    cumlNN = cumlKNN()
    cumlNN.fit(X_df)

    sklNN = NearestNeighbors(metric="sqeuclidean")
    sklNN.fit(X_pd)

    cuml_D, cuml_I = cumlNN.kneighbors(X_df[0:search_m - 1], search_k)
    sk_D, sk_I = sklNN.kneighbors(X_pd[0:search_m], search_k)

    cuml_I_nd = np.array(cuml_I.compute().as_gpu_matrix(), dtype=sk_I.dtype)
    cuml_D_nd = np.array(cuml_D.compute().as_gpu_matrix(), dtype=sk_D.dtype)

    print(str(cuml_D_nd.dtype))
    print(str(sk_D.dtype))

    assert np.array_equal(cuml_I_nd, sk_I)
    assert np.allclose(cuml_D_nd, sk_D, atol=1e-5)

    client.close()
    cluster.close()
Example #13
def test_end_to_end(nrows, ncols, nclusters, n_parts,
                    delayed_predict, input_type, cluster):

    client = None

    try:

        client = Client(cluster)
        from cuml.dask.cluster import KMeans as cumlKMeans

        from cuml.dask.datasets import make_blobs

        X, y = make_blobs(n_samples=int(nrows),
                          n_features=ncols,
                          centers=nclusters,
                          n_parts=n_parts,
                          cluster_std=0.01,
                          random_state=10)

        wait(X)
        if input_type == "dataframe":
            X_train = to_dask_cudf(X)
            y_train = to_dask_cudf(y)
        elif input_type == "array":
            X_train, y_train = X, y

        cumlModel = cumlKMeans(init="k-means||",
                               n_clusters=nclusters,
                               random_state=10)

        cumlModel.fit(X_train)
        cumlLabels = cumlModel.predict(X_train, delayed_predict)

        n_workers = len(list(client.has_what().keys()))

        # Verifying we are grouping partitions. This should be changed soon.
        if n_parts is not None and n_parts < n_workers:
            parts_len = n_parts
        else:
            parts_len = n_workers

        if input_type == "dataframe":
            assert cumlLabels.npartitions == parts_len
            cumlPred = cp.array(cumlLabels.compute().to_pandas().values)
            labels = cp.squeeze(y_train.compute().to_pandas().values)
        elif input_type == "array":
            assert len(cumlLabels.chunks[0]) == parts_len
            cumlPred = cp.array(cumlLabels.compute())
            labels = cp.squeeze(y_train.compute())

        assert cumlPred.shape[0] == nrows
        assert cp.max(cumlPred) == nclusters - 1
        assert cp.min(cumlPred) == 0

        score = adjusted_rand_score(labels, cumlPred)

        print(str(score))

        assert 1.0 == score

    finally:
        if client is not None:
            client.close()
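Example #13 relies on a to_dask_cudf helper from the cuML test utilities that is not shown. A minimal sketch of such a conversion, assuming a NumPy-backed Dask array (the real utility also handles CuPy-backed device chunks):

import cudf
import dask.dataframe as dd

def to_dask_cudf(dask_array):
    # Hedged sketch: route through a Dask DataFrame, then convert each
    # pandas partition to cudf on the workers.
    ddf = dd.from_dask_array(dask_array)
    return ddf.map_partitions(cudf.from_pandas)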