Example #1
0
def train(fpath, max_depth, max_features, n_estimators):
    """
    Fit a random forest on the data behind ``fpath`` and record the run in MLflow.

    :param fpath: Path or URL for the training data used with the model.
    :param max_depth: RF max_depth parameter
    :param max_features: RF max_features parameter
    :param n_estimators: RF n_estimators parameter
    :return: trained model
    """
    hyperparams = {
        "max_depth": max_depth,
        "max_features": max_features,
        "n_estimators": n_estimators,
    }

    X_train, X_test, y_train, y_test = load_data(fpath)

    forest = RandomForestClassifier(**hyperparams)
    forest.fit(X_train, y_train)

    # Accuracy is measured on the held-out split returned by load_data.
    test_accuracy = accuracy_score(y_test, forest.predict(X_test))

    # MLflow gets the stringified hyperparameters, then the metric, then the
    # fitted model stored under the "saved_models" artifact path.
    mlflow.log_params({name: str(value) for name, value in hyperparams.items()})
    mlflow.log_metric("accuracy", test_accuracy)
    mlflow.sklearn.log_model(forest, "saved_models")

    return forest
def GridSearch_random_forest(X_train, y_train):
    """
    Grid-search ``n_estimators`` x ``max_features`` with 5-fold cross-validation.

    For every hyperparameter combination the classifier is refit on each
    training fold and its predictions on the five hold-out folds are pooled;
    accuracy, F1 and ROC-AUC are then computed once over the pooled
    predictions.

    :param X_train: feature table exposing ``to_numpy()`` (e.g. a DataFrame)
    :param y_train: label vector exposing ``to_numpy()`` (e.g. a Series)
    :return: DataFrame with one row per combination and the columns
        ``n_estimators`` (int), ``max_features`` (str), ``mean_accuracy``,
        ``mean_f1`` and ``mean_auc`` (floats)
    """
    # Encode as float32
    X_train = X_train.to_numpy().astype('float32')
    y_train = y_train.to_numpy().astype('float32')

    # Init Kfolds
    folds = KFold(n_splits=5)

    # Init hyperparam vals
    n_estimators_lst = [128, 256, 512, 1024]
    max_features_lst = ['sqrt', 'log2']

    columns = [
        'n_estimators', 'max_features', 'mean_accuracy', 'mean_f1', 'mean_auc'
    ]
    rows = []

    # Run GridSearch for all hyperparam combos
    for n_estimators in n_estimators_lst:
        for max_features in max_features_lst:
            clf = RandomForestClassifier(n_estimators=n_estimators,
                                         max_features=max_features)

            predicted_y = []
            true_y = []
            # Refit on every training fold and collect the hold-out predictions
            for train, holdout in folds.split(X_train):
                clf.fit(X_train[train], y_train[train])
                predicted_y.append(clf.predict(X_train[holdout]))
                true_y.append(y_train[holdout])

            predicted_y = np.concatenate(predicted_y)
            true_y = np.concatenate(true_y)

            rows.append([
                n_estimators,
                max_features,
                accuracy_score(true_y, predicted_y),
                f1_score(true_y, predicted_y),
                roc_auc_score(true_y, predicted_y),
            ])

    # Build the frame straight from the row list: routing the mixed
    # int/str/float rows through np.array() would coerce every cell to a
    # string, leaving the metric columns non-numeric (and any later sort on
    # them lexicographic).
    results = pd.DataFrame(data=rows, columns=columns)
    results['n_estimators'] = results['n_estimators'].astype(int)

    return results
Example #3
0
def fast_rf_classifier(
    X,
    y,
    *,
    num_classes=2,
    split_algo=1,
    split_criterion=0,
    min_rows_per_node=2,
    min_impurity_decrease=0.0,
    bootstrap_features=False,
    rows_sample=1.0,
    max_leaves=-1,
    n_estimators=100,
    max_depth=16,
    max_features='auto',
    bootstrap=True,
    n_bins=8,
    n_cols=None,
    dtype=None,
    accuracy_metric=None,
    quantile_per_tree=False,
    n_streams=8,
    random_state: int = 1,
    n_jobs: Optional[int] = None,
    framework: Literal['auto', 'cuml', 'sklearn'] = 'auto',
    **kwargs,
):
    """Fit and return a random forest classifier, using cuML when available.

    With ``framework='sklearn'`` the ``RandomForestClassifier`` already in
    module scope is used. Otherwise cuML's implementation is tried first and
    the module-scope class is the fallback when cuML cannot be imported.

    All keyword arguments (plus any extras in ``**kwargs``) are forwarded to
    the cuML estimator, with ``random_state`` renamed to ``seed`` and
    ``output_type='numpy'`` added. On the non-cuML path only the keywords
    listed in the body as shared are forwarded, and ``max_features='auto'``
    is rewritten to ``'sqrt'``.
    NOTE(review): the forwarded cuML keyword set matches this signature, not
    a verified cuML release - confirm against the installed version.

    :param X: training features
    :param y: training labels
    :return: the fitted estimator
    """
    # Must stay the first statement: it snapshots exactly the call arguments.
    kw = dict(locals())
    kwargs = kw.pop('kwargs')
    X = kw.pop('X')
    y = kw.pop('y')
    kw.update(kwargs)
    framework = kw.pop('framework')
    ### import
    is_cuml = False
    if framework == 'sklearn':
        RFC = RandomForestClassifier
    else:
        try:
            from cuml.ensemble import RandomForestClassifier as RFC
            is_cuml = True
        except ImportError:
            RFC = RandomForestClassifier
    ### fine-tune keywords
    if is_cuml:
        kw['output_type'] = 'numpy'
        kw['seed'] = kw.pop('random_state')
    else:
        # Keep only the hyperparameters both libraries name identically; the
        # rest of the signature is cuML-specific and would be rejected.
        shared = (
            'n_estimators',
            'max_depth',
            'max_features',
            'bootstrap',
            'min_impurity_decrease',
            'random_state',
            'n_jobs',
        )
        kw = {key: kw[key] for key in shared if key in kw}
        # 'auto' meant sqrt(n_features) for sklearn classifiers and is no
        # longer accepted by recent releases; 'sqrt' is valid everywhere.
        if isinstance(kw.get('max_features'), str) and kw['max_features'] == 'auto':
            kw['max_features'] = 'sqrt'
    ### training
    # Previously the estimator was built with no arguments and a leftover
    # debug dump followed by exit() terminated the process before fitting.
    tree = RFC(**kw)
    tree.fit(X, y)
    return tree
Example #4
0
def train_and_eval(X_param, y_param, max_depth=16, n_estimators=100):
    """Train a random forest on a train split and return its validation accuracy.

    The data is divided with ``train_test_split`` using a fixed
    ``random_state`` of 77, so the split itself is reproducible.

    :param X_param: feature data
    :param y_param: labels
    :param max_depth: max_depth passed to the classifier
    :param n_estimators: n_estimators passed to the classifier
    :return: accuracy score on the validation split
    """
    split = train_test_split(X_param, y_param, random_state=77)
    X_train, X_valid, y_train, y_valid = split

    forest = RandomForestClassifier(n_estimators=n_estimators,
                                    max_depth=max_depth)
    forest.fit(X_train, y_train)

    return accuracy_score(y_valid, forest.predict(X_valid))
Example #5
0
def fit(X, y):
    """Build the module-level ``clf`` from the global ``params`` and fit it.

    The classifier is stored in the global ``clf`` so code outside this
    function can reach it afterwards.

    :param X: training features
    :param y: training labels
    :return: whatever ``clf.fit(X, y)`` returns
    """
    global clf

    # Translate the settings held on ``params`` to the constructor's keywords.
    forest_options = dict(
        split_criterion=params.criterion,
        split_algo=params.split_algorithm,
        n_estimators=params.num_trees,
        max_depth=params.max_depth,
        max_features=params.max_features,
        min_samples_split=params.min_samples_split,
        max_leaves=params.max_leaf_nodes,
        min_impurity_decrease=params.min_impurity_decrease,
        bootstrap=params.bootstrap,
    )
    clf = RandomForestClassifier(**forest_options)
    return clf.fit(X, y)
Example #6
0
def _train(params, fpath, hyperopt=False):
    """
    Train a random forest, log the run to MLflow and report the result.

    :param params: hyperparameters as a ``(max_depth, max_features, n_estimators)`` sequence.
                   Its structure is consistent with how search space is defined.
    :param fpath: Path or URL for the training data used with the model.
    :param hyperopt: Use hyperopt for hyperparameter search during training.
    :return: the trained model when ``hyperopt`` is false; otherwise a dict with fields
             'loss' (negated test accuracy) and 'status' (success/failure status of run)
    """
    max_depth, max_features, n_estimators = params
    max_depth, max_features, n_estimators = (int(max_depth),
                                             float(max_features),
                                             int(n_estimators))

    # Log all of our training parameters for this run.
    pyver = sys.version_info
    mlparams = {
        'cudf_version': str(cudf.__version__),
        'cuml_version': str(cuml.__version__),
        'max_depth': str(max_depth),
        'max_features': str(max_features),
        'n_estimators': str(n_estimators),
        'python_version': f"{pyver[0]}.{pyver[1]}.{pyver[2]}.{pyver[3]}",
    }
    mlflow.log_params(mlparams)

    X_train, X_test, y_train, y_test = load_data(fpath)
    mod = RandomForestClassifier(max_depth=max_depth,
                                 max_features=max_features,
                                 n_estimators=n_estimators)

    mod.fit(X_train, y_train)
    preds = mod.predict(X_test)
    acc = accuracy_score(y_test, preds)

    mlflow.log_metric("accuracy", acc)
    mlflow.sklearn.log_model(mod, "saved_models")

    if not hyperopt:
        return mod

    # hyperopt minimises 'loss', so the accuracy has to be negated; returning
    # it unchanged would steer the search towards the least accurate model.
    return {"loss": -acc, "status": STATUS_OK}
Example #7
0
def _train(params, fpath, hyperopt=False):
    """
    Train a random forest, log the run to MLflow and report the result.

    :param params: hyperparameters as a ``(max_depth, max_features, n_estimators)`` sequence.
                   Its structure is consistent with how search space is defined.
    :param fpath: Path or URL for the training data used with the model.
    :param hyperopt: Use hyperopt for hyperparameter search during training.
    :return: the trained model when ``hyperopt`` is false; otherwise a dict with fields
             'loss' (negated test accuracy) and 'status' (success/failure status of run)
    """
    max_depth, max_features, n_estimators = params
    max_depth, max_features, n_estimators = (int(max_depth),
                                             float(max_features),
                                             int(n_estimators))

    X_train, X_test, y_train, y_test = load_data(fpath)

    mod = RandomForestClassifier(max_depth=max_depth,
                                 max_features=max_features,
                                 n_estimators=n_estimators)

    mod.fit(X_train, y_train)
    preds = mod.predict(X_test)
    acc = accuracy_score(y_test, preds)

    mlparams = {
        "max_depth": str(max_depth),
        "max_features": str(max_features),
        "n_estimators": str(n_estimators)
    }
    mlflow.log_params(mlparams)

    mlflow.log_metric("accuracy", acc)

    mlflow.sklearn.log_model(mod, "saved_models")

    if not hyperopt:
        return mod

    # hyperopt minimises 'loss', so the accuracy has to be negated; returning
    # it unchanged would steer the search towards the least accurate model.
    return {'loss': -acc, 'status': STATUS_OK}
Example #8
0
# Report the time elapsed since t0, which is set before this excerpt begins.
print('Copying data to GPU done in {:.2f} seconds'.format(time() - t0))

# ### Learning
#
# Random forest classifiers are quick to train, quite robust to
# hyperparameter values, and often work relatively well.

print()
print('Learning begins')
# Restart the timer so the next report covers only the fit below.
t0 = time()

n_estimators = 100
max_depth = 16
clf_rf = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth)
print(clf_rf)
clf_rf.fit(cu_X_train, cu_y_train)

print('Learning done in {:.2f} seconds'.format(time() - t0))

# ### Inference
#
# We will use GPU-based inference to predict the classes for the test
# data.

print()
print('Inference begins')
# Restart the timer for the inference phase.
t0 = time()

# NOTE(review): training used cu_X_train, but prediction is given X_test
# rather than a cu_-prefixed counterpart - confirm this is intended.
pred_rf = clf_rf.predict(X_test, predict_model='GPU')
# Shift each predicted value by ord('A') and convert it to a character;
# presumably the labels are 0-based letter indices - TODO confirm.
pred_rf = [chr(x) for x in pred_rf + ord('A')]
pred_rf = np.array(pred_rf)
def run_random_forest(scaled_df, n_trials=5, train_size=5000):
    """
    Repeatedly grid-search, refit and score random forests on random splits.

    Each trial draws a fresh train/test split, runs
    ``GridSearch_random_forest`` on the training part, and for each of the
    three metrics (accuracy, F1, ROC-AUC) refits the configuration that
    ranked best on that metric. Every refit model is scored with its own
    metric on both the training and the test part.

    :param scaled_df: DataFrame whose label column is reachable as
        ``scaled_df.y``; all columns except the last are used as features
    :param n_trials: number of independent trials (default 5)
    :param train_size: ``train_size`` passed to ``train_test_split``
        (default 5000)
    :return: ``(raw_train_df, raw_test_df)``, each with one row per trial and
        the columns ``accuracy``, ``f1`` and ``auc``
    """
    score_columns = ['accuracy', 'f1', 'auc']

    # One entry per metric: the grid-search column used to pick the best
    # configuration, and how a fitted classifier is scored on (X, y).
    selectors = [
        ('mean_accuracy', lambda clf, X, y: clf.score(X, y)),
        ('mean_f1', lambda clf, X, y: f1_score(y, clf.predict(X))),
        ('mean_auc', lambda clf, X, y: roc_auc_score(y, clf.predict(X))),
    ]

    raw_train_arr = []
    raw_test_arr = []
    for _ in range(n_trials):

        # Split data into train and test
        X_train, X_test, y_train, y_test = train_test_split(
            scaled_df.iloc[:, :-1], scaled_df.y, train_size=train_size)

        # Run GridSearch
        results = GridSearch_random_forest(X_train, y_train).copy()

        # The grid search may hand the metric columns back as strings; make
        # them numeric so the ranking below is by value, not lexicographic.
        for rank_column, _ in selectors:
            results[rank_column] = results[rank_column].astype(float)

        # Encode as float32 for cuML
        X_train_np = X_train.to_numpy().astype('float32')
        y_train_np = y_train.to_numpy().astype('float32')

        X_test_np = X_test.to_numpy().astype('float32')
        y_test_np = y_test.to_numpy().astype('float32')

        # Refit the best configuration for each metric, in metric order
        fitted = []
        for rank_column, scorer in selectors:
            best = results.sort_values(by=rank_column,
                                       ascending=False).iloc[0]
            clf = RandomForestClassifier(
                n_estimators=int(best.n_estimators),
                max_features=best.max_features)
            clf.fit(X_train_np, y_train_np)
            fitted.append((clf, scorer))

        # Get train and test metrics
        raw_train_arr.append(
            [scorer(clf, X_train_np, y_train_np) for clf, scorer in fitted])
        raw_test_arr.append(
            [scorer(clf, X_test_np, y_test_np) for clf, scorer in fitted])

    shape = (n_trials, len(score_columns))
    raw_train_df = pd.DataFrame(data=np.array(raw_train_arr).reshape(shape),
                                columns=score_columns)
    raw_test_df = pd.DataFrame(data=np.array(raw_test_arr).reshape(shape),
                               columns=score_columns)

    return raw_train_df, raw_test_df
class CUMLTrainable(tune.Trainable):
    """Tune trainable that fits a cuML random forest and reports accuracy.

    The training and test data are read from the module-level ``X_train``,
    ``X_test``, ``y_train`` and ``y_test``; the trial ``config`` supplies the
    estimator's keyword arguments, each cast to ``int``.
    """

    def _setup(self, config):
        """Copy the data to the GPU and build the model for this trial."""
        # [X_train, X_test, y_train, y_test] = get_pinned_object(data_id)
        visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES",
                                         0)  #ray.get_gpu_ids()[0]
        # The environment value is a string (possibly a comma-separated
        # list); keep an integer so it can index GPUtil's device list.
        self._gpu_id = self._parse_gpu_id(visible_devices)
        print("Starting new trainable on {}.".format(visible_devices))
        # self._wait_for_gpus()

        # The file lock serialises the host-to-GPU copies across trainables.
        with FileLock(os.path.expanduser("~/.tune.gpulock")):
            X_cudf_train = cudf.DataFrame.from_pandas(X_train)
            self.train_mat = X_cudf_train.as_gpu_matrix(order="F")
            del X_cudf_train
            self.X_cudf_test = cudf.DataFrame.from_pandas(X_test)
            self.y_cudf_train = cudf.Series(y_train.values)
            self.y_test = y_test
            self.cuml_model = self._build_model(config)

    @staticmethod
    def _parse_gpu_id(visible_devices):
        """Return the first device id as an int, or 0 if it is not numeric."""
        first = str(visible_devices).split(",")[0]
        try:
            return int(first)
        except ValueError:
            # Empty or UUID-style entries cannot be used as a list index.
            return 0

    @staticmethod
    def _build_model(config):
        """Create the classifier from ``config`` with every value cast to int."""
        return GPURandomForestClassifier(
            **{k: int(v) for k, v in config.items()})

    def _gpu_memory_used(self):
        """Query GPUtil afresh and return ``memoryUsed`` for this trial's GPU."""
        import GPUtil
        return GPUtil.getGPUs()[self._gpu_id].memoryUsed

    def _train(self):
        """Fit the model and return its accuracy on the test set."""
        self.cuml_model.fit(self.train_mat, self.y_cudf_train)
        fil_preds_orig = self.cuml_model.predict(self.X_cudf_test)
        accuracy = accuracy_score(self.y_test, fil_preds_orig)
        return {"mean_accuracy": accuracy}

    def _stop(self):
        """Drop the model and data, printing GPU memory before each deletion."""
        doomed = (
            ("model", "cuml_model"),
            ("test set", "X_cudf_test"),
            ("test labels", "y_test"),
            ("training labels", "y_cudf_train"),
            ("training matrix", "train_mat"),
        )
        for label, attribute in doomed:
            # Re-query each time: a single GPUtil snapshot would print the
            # same figure for every step.
            print("Deleting the {}. Mem: {:0.3f}".format(
                label, self._gpu_memory_used()))
            delattr(self, attribute)

        # self._wait_for_gpus(retry=1)

    def _wait_for_gpus(self, retry=10):
        """Poll up to ``retry`` times, 5 s apart, until GPU memory is freed."""
        import time
        for _ in range(int(retry)):
            memory_used = self._gpu_memory_used()
            if memory_used <= 0.1:
                break
            print("Waiting for GPU memory to free. Mem: {:0.3f}".format(
                memory_used))
            time.sleep(5)
        # Final pause, kept from the original implementation.
        time.sleep(5)

    def reset_config(self, config):
        """Replace the model with one built from the new ``config``."""
        del self.cuml_model
        self.cuml_model = self._build_model(config)
        return True
Example #11
0
import pickle
from datasets import prepare_dataset

from cuml.ensemble import RandomForestClassifier as GPURandomForestClassifier

# Load the "airline" dataset via prepare_dataset and unpack its four splits.
data = prepare_dataset("/data", "airline", None)
X_train, X_test, y_train, y_test = data.X_train, data.X_test, data.y_train, data.y_test

# Labels are cast to 32-bit integers before being handed to cuML.
y_train = y_train.astype(np.int32)
y_test = y_test.astype(np.int32)

# NOTE(review): despite its name, QUARTER is half the row count (// 2); the
# slices below keep the second half of the training data.
QUARTER = len(X_train) // 2
X_train = X_train[QUARTER:]
y_train = y_train[QUARTER:]

# Move the features to the GPU; the training frame is converted to a
# column-major ("F") GPU matrix and the intermediate cuDF frame is released.
X_cudf_train = cudf.DataFrame.from_pandas(X_train)
X_cudf_test = cudf.DataFrame.from_pandas(X_test)
train_mat = X_cudf_train.as_gpu_matrix(order="F")
del X_cudf_train

y_cudf_train = cudf.Series(y_train.values)

# Fixed hyperparameters for this run.
cuml_model = GPURandomForestClassifier(n_estimators=467,
                                       max_depth=19,
                                       max_features=1.0)

cuml_model.fit(train_mat, y_cudf_train)

# Predict on the test frame and score against the host-side labels.
fil_preds_orig = cuml_model.predict(X_cudf_test)
fil_acc_orig = accuracy_score(y_test, fil_preds_orig)