def train(fpath, max_depth, max_features, n_estimators):
    """
    Fit a random forest on the data at ``fpath`` and record the run in MLflow.

    The three hyperparameters are logged as strings, the accuracy on the
    test split is logged as the ``accuracy`` metric, and the fitted model
    is stored under the ``saved_models`` artifact path.

    :param fpath: Path or URL for the training data used with the model.
    :param max_depth: RF max_depth parameter
    :param max_features: RF max_features parameter
    :param n_estimators: RF n_estimators parameter
    :return: trained model
    """
    X_train, X_test, y_train, y_test = load_data(fpath)

    # Keep the hyperparameters together so the same mapping drives both the
    # estimator construction and the MLflow parameter log.
    hyperparams = dict(
        max_depth=max_depth,
        max_features=max_features,
        n_estimators=n_estimators,
    )

    model = RandomForestClassifier(**hyperparams)
    model.fit(X_train, y_train)
    accuracy = accuracy_score(y_test, model.predict(X_test))

    mlflow.log_params({name: str(value) for name, value in hyperparams.items()})
    mlflow.log_metric("accuracy", accuracy)
    mlflow.sklearn.log_model(model, "saved_models")

    return model
def GridSearch_random_forest(X_train, y_train):
    """
    Grid-search random forest hyperparameters with 5-fold cross-validation.

    Every combination of ``n_estimators`` in (128, 256, 512, 1024) and
    ``max_features`` in ('sqrt', 'log2') is evaluated. For each combination
    the out-of-fold predictions of all five folds are concatenated and the
    metrics are computed once on that pooled set (so the ``mean_*`` column
    names denote pooled scores, not an average of per-fold scores).
    ``f1_score`` and ``roc_auc_score`` are called with their default
    arguments on hard class predictions.

    :param X_train: feature table exposing ``to_numpy()``
    :param y_train: label column exposing ``to_numpy()``
    :return: DataFrame with one row per combination and the columns
        ``n_estimators`` (int), ``max_features`` (str), ``mean_accuracy``,
        ``mean_f1`` and ``mean_auc`` (numeric)
    """
    # Encode as float32
    X_train = X_train.to_numpy().astype('float32')
    y_train = y_train.to_numpy().astype('float32')

    # Init Kfolds
    folds = KFold(n_splits=5)

    # Init hyperparam vals
    n_estimators_lst = [128, 256, 512, 1024]
    max_features_lst = ['sqrt', 'log2']

    rows = []

    # Run GridSearch for all hyperparam combos
    for n_estimators in n_estimators_lst:
        for max_features in max_features_lst:
            clf = RandomForestClassifier(n_estimators=n_estimators,
                                         max_features=max_features)
            predicted_y = []
            true_y = []

            # Run CV and collect the out-of-fold predictions
            for train, holdout in folds.split(X_train):
                clf.fit(X_train[train], y_train[train])
                predicted_y.append(clf.predict(X_train[holdout]))
                true_y.append(y_train[holdout])

            predicted_y = np.concatenate(predicted_y)
            true_y = np.concatenate(true_y)

            rows.append([
                n_estimators,
                max_features,
                accuracy_score(true_y, predicted_y),
                f1_score(true_y, predicted_y),
                roc_auc_score(true_y, predicted_y),
            ])

    columns = [
        'n_estimators', 'max_features', 'mean_accuracy', 'mean_f1', 'mean_auc'
    ]
    # Build the frame straight from the row list: routing the mixed
    # int/str/float rows through np.array() would coerce every cell to a
    # string and leave the metric columns non-numeric.
    results = pd.DataFrame(data=rows, columns=columns)
    results.n_estimators = results.n_estimators.astype(int)
    return results
def fast_rf_classifier(
    X,
    y,
    *,
    num_classes=2,
    split_algo=1,
    split_criterion=0,
    min_rows_per_node=2,
    min_impurity_decrease=0.0,
    bootstrap_features=False,
    rows_sample=1.0,
    max_leaves=-1,
    n_estimators=100,
    max_depth=16,
    max_features='auto',
    bootstrap=True,
    n_bins=8,
    n_cols=None,
    dtype=None,
    accuracy_metric=None,
    quantile_per_tree=False,
    n_streams=8,
    random_state: int = 1,
    n_jobs: Optional[int] = None,
    framework: Literal['auto', 'cuml', 'sklearn'] = 'auto',
    **kwargs,
):
    """
    Fit a random forest classifier, using cuML when it can be imported.

    With ``framework='sklearn'`` the module-level ``RandomForestClassifier``
    is used. For any other value cuML's ``RandomForestClassifier`` is tried
    first, falling back to the module-level class if cuML cannot be
    imported.

    On the cuML path every keyword-only hyperparameter (plus ``**kwargs``)
    is forwarded to the constructor, with ``random_state`` renamed to
    ``seed`` and ``output_type`` forced to ``'numpy'``. On the fallback
    path the estimator is built with its own defaults.

    :param X: training features
    :param y: training labels
    :param framework: ``'auto'``, ``'cuml'`` or ``'sklearn'``
    :param kwargs: extra constructor keywords; they override the named
        hyperparameters on the cuML path
    :return: the fitted estimator
    """
    # Must stay the first statement: it snapshots exactly the parameters.
    kw = dict(locals())
    kwargs = kw.pop('kwargs')
    X = kw.pop('X')
    y = kw.pop('y')
    kw.update(kwargs)
    framework = kw.pop('framework')

    ### import
    is_cuml = False
    if framework == 'sklearn':
        RFC = RandomForestClassifier
    else:
        try:
            from cuml.ensemble import RandomForestClassifier as RFC
        except ImportError:
            # cuML is unavailable: fall back to the module-level classifier.
            RFC = RandomForestClassifier
        else:
            is_cuml = True

    ### fine-tune keywords
    if is_cuml:
        kw['output_type'] = 'numpy'
        kw['seed'] = kw.pop('random_state')
    else:
        # The hyperparameter names above follow cuML's API, so none of them
        # are forwarded to the fallback estimator.
        kw = {}

    ### training
    # NOTE(review): the set of keywords cuML accepts varies between
    # releases - confirm against the installed version.
    tree = RFC(**kw)
    tree.fit(X, y)
    return tree
def train_and_eval(X_param, y_param, max_depth=16, n_estimators=100):
    """
    Train a random forest on a fixed split and return its validation accuracy.

    The data is divided with ``train_test_split`` using ``random_state=77``,
    so repeated calls evaluate on the same split.

    :param X_param: feature data
    :param y_param: labels
    :param max_depth: forest ``max_depth`` hyperparameter
    :param n_estimators: forest ``n_estimators`` hyperparameter
    :return: accuracy on the validation part of the split
    """
    X_fit, X_holdout, y_fit, y_holdout = train_test_split(
        X_param, y_param, random_state=77)

    forest = RandomForestClassifier(max_depth=max_depth,
                                    n_estimators=n_estimators)
    forest.fit(X_fit, y_fit)

    return accuracy_score(y_holdout, forest.predict(X_holdout))
def fit(X, y):
    """
    Build the module-level classifier ``clf`` from ``params`` and fit it.

    The estimator is configured entirely from attributes of the
    module-level ``params`` object and is stored in the global ``clf``.

    :param X: training features
    :param y: training labels
    :return: whatever ``clf.fit(X, y)`` returns
    """
    global clf

    # Map the benchmark's parameter names onto the estimator's keywords.
    settings = {
        "split_criterion": params.criterion,
        "split_algo": params.split_algorithm,
        "n_estimators": params.num_trees,
        "max_depth": params.max_depth,
        "max_features": params.max_features,
        "min_samples_split": params.min_samples_split,
        "max_leaves": params.max_leaf_nodes,
        "min_impurity_decrease": params.min_impurity_decrease,
        "bootstrap": params.bootstrap,
    }
    clf = RandomForestClassifier(**settings)
    return clf.fit(X, y)
def _train(params, fpath, hyperopt=False):
    """
    Train a random forest, log the run to MLflow and optionally report a
    hyperopt trial result.

    :param params: hyperparameters. Its structure is consistent with how search space is defined:
        a 3-item sequence ``(max_depth, max_features, n_estimators)``.
    :param fpath: Path or URL for the training data used with the model.
    :param hyperopt: Use hyperopt for hyperparameter search during training.
    :return: the trained model when ``hyperopt`` is false; otherwise a dict with fields
        'loss' (negated accuracy) and 'status' (success/failure status of run)
    """
    max_depth, max_features, n_estimators = params
    max_depth = int(max_depth)
    max_features = float(max_features)
    n_estimators = int(n_estimators)

    # Log all of our training parameters for this run.
    pyver = sys.version_info
    mlparams = {
        'cudf_version': str(cudf.__version__),
        'cuml_version': str(cuml.__version__),
        'max_depth': str(max_depth),
        'max_features': str(max_features),
        'n_estimators': str(n_estimators),
        'python_version': f"{pyver[0]}.{pyver[1]}.{pyver[2]}.{pyver[3]}",
    }
    mlflow.log_params(mlparams)

    X_train, X_test, y_train, y_test = load_data(fpath)

    mod = RandomForestClassifier(max_depth=max_depth,
                                 max_features=max_features,
                                 n_estimators=n_estimators)
    mod.fit(X_train, y_train)

    preds = mod.predict(X_test)
    acc = accuracy_score(y_test, preds)

    mlflow.log_metric("accuracy", acc)
    mlflow.sklearn.log_model(mod, "saved_models")

    if not hyperopt:
        return mod

    # hyperopt minimises 'loss', so report the negated accuracy; returning
    # the raw accuracy would steer the search towards the worst model.
    return {"loss": -acc, "status": STATUS_OK}
def _train(params, fpath, hyperopt=False):
    """
    Train a random forest, log the run to MLflow and optionally report a
    hyperopt trial result.

    :param params: hyperparameters. Its structure is consistent with how search space is defined:
        a 3-item sequence ``(max_depth, max_features, n_estimators)``.
    :param fpath: Path or URL for the training data used with the model.
    :param hyperopt: Use hyperopt for hyperparameter search during training.
    :return: the trained model when ``hyperopt`` is false; otherwise a dict with fields
        'loss' (negated accuracy) and 'status' (success/failure status of run)
    """
    max_depth, max_features, n_estimators = params
    max_depth = int(max_depth)
    max_features = float(max_features)
    n_estimators = int(n_estimators)

    X_train, X_test, y_train, y_test = load_data(fpath)

    mod = RandomForestClassifier(max_depth=max_depth,
                                 max_features=max_features,
                                 n_estimators=n_estimators)
    mod.fit(X_train, y_train)

    preds = mod.predict(X_test)
    acc = accuracy_score(y_test, preds)

    mlparams = {
        "max_depth": str(max_depth),
        "max_features": str(max_features),
        "n_estimators": str(n_estimators),
    }
    mlflow.log_params(mlparams)
    mlflow.log_metric("accuracy", acc)
    mlflow.sklearn.log_model(mod, "saved_models")

    if not hyperopt:
        return mod

    # hyperopt minimises 'loss', so report the negated accuracy; returning
    # the raw accuracy would steer the search towards the worst model.
    return {'loss': -acc, 'status': STATUS_OK}
copy_elapsed = time() - t0
print('Copying data to GPU done in {:.2f} seconds'.format(copy_elapsed))

# --- Learning ---------------------------------------------------------
# A random forest is used here because it trains quickly, tolerates a
# wide range of hyperparameter values and usually gives decent results.

print()
print('Learning begins')
t0 = time()

n_estimators = 100
max_depth = 16
clf_rf = RandomForestClassifier(n_estimators=n_estimators,
                                max_depth=max_depth)

print(clf_rf)
clf_rf.fit(cu_X_train, cu_y_train)

fit_elapsed = time() - t0
print('Learning done in {:.2f} seconds'.format(fit_elapsed))

# --- Inference --------------------------------------------------------
# Class predictions for the test data are requested from the GPU
# prediction path.

print()
print('Inference begins')
t0 = time()

pred_rf = clf_rf.predict(X_test, predict_model='GPU')
# Shift the predicted values by ord('A') and convert each to its character.
pred_rf = np.array([chr(code) for code in pred_rf + ord('A')])
def _best_forest(results, metric):
    """Return an unfitted forest configured from the row of ``results`` that ranks highest on ``metric``."""
    best = results.sort_values(by=metric, ascending=False).iloc[0]
    return RandomForestClassifier(n_estimators=best.n_estimators,
                                  max_features=best.max_features)


def run_random_forest(scaled_df, n_trials=5):
    """
    Repeat grid search + refit over several random splits and collect scores.

    Each trial draws a fresh split with 5000 training rows (features are
    all columns but the last, labels are ``scaled_df.y``), runs
    ``GridSearch_random_forest`` on the training part, and refits one
    forest per selection metric (accuracy, F1, AUC) using the best
    hyperparameters for that metric. Each forest is then scored with its
    own metric on both the training and the test part.

    :param scaled_df: DataFrame whose last column is the label column ``y``
    :param n_trials: number of independent splits to evaluate (default 5)
    :return: ``(raw_train_df, raw_test_df)``, each with one row per trial
        and the columns ``accuracy``, ``f1`` and ``auc``
    """
    metric_cols = ['mean_accuracy', 'mean_f1', 'mean_auc']
    raw_train_arr = []
    raw_test_arr = []

    for _ in range(n_trials):
        # Split data into train and test
        X_train, X_test, y_train, y_test = train_test_split(
            scaled_df.iloc[:, :-1], scaled_df.y, train_size=5000)

        # Run GridSearch
        results = GridSearch_random_forest(X_train, y_train)
        # The grid-search table may carry its metrics as text; rank on real
        # numbers so the ordering is numeric rather than lexicographic.
        results[metric_cols] = results[metric_cols].apply(pd.to_numeric)

        # Init optimal clfs, one per selection metric
        opt_acc_clf = _best_forest(results, 'mean_accuracy')
        opt_f1_clf = _best_forest(results, 'mean_f1')
        opt_auc_clf = _best_forest(results, 'mean_auc')

        # Encode as float32 for cuML
        X_train_np = X_train.to_numpy().astype('float32')
        y_train_np = y_train.to_numpy().astype('float32')
        X_test_np = X_test.to_numpy().astype('float32')
        y_test_np = y_test.to_numpy().astype('float32')

        # Fit clfs
        opt_acc_clf.fit(X_train_np, y_train_np)
        opt_f1_clf.fit(X_train_np, y_train_np)
        opt_auc_clf.fit(X_train_np, y_train_np)

        # Get train and test metrics
        raw_train_arr.append([
            opt_acc_clf.score(X_train_np, y_train_np),
            f1_score(y_train_np, opt_f1_clf.predict(X_train_np)),
            roc_auc_score(y_train_np, opt_auc_clf.predict(X_train_np)),
        ])
        raw_test_arr.append([
            opt_acc_clf.score(X_test_np, y_test_np),
            f1_score(y_test_np, opt_f1_clf.predict(X_test_np)),
            roc_auc_score(y_test_np, opt_auc_clf.predict(X_test_np)),
        ])

    score_cols = ['accuracy', 'f1', 'auc']
    raw_train_arr = np.array(raw_train_arr).reshape(n_trials, 3)
    raw_test_arr = np.array(raw_test_arr).reshape(n_trials, 3)

    raw_train_df = pd.DataFrame(data=raw_train_arr, columns=score_cols)
    raw_test_df = pd.DataFrame(data=raw_test_arr, columns=score_cols)

    return raw_train_df, raw_test_df
class CUMLTrainable(tune.Trainable):
    """
    Tune trainable that fits a cuML random forest on data held on the GPU.

    The datasets are read from the module-level names ``X_train``,
    ``X_test``, ``y_train`` and ``y_test``. Every value in the trial config
    is cast to ``int`` and passed to ``GPURandomForestClassifier``.
    """

    def _setup(self, config):
        """Copy the datasets to the GPU and build the model for ``config``."""
        visible = os.environ.get("CUDA_VISIBLE_DEVICES", 0)
        # The environment value is a string (possibly a comma-separated
        # list); keep an integer index so it can address GPUtil's GPU list.
        self._gpu_id = self._parse_gpu_index(visible)
        print("Starting new trainable on {}.".format(visible))

        # Serialise GPU set-up across trainables through a file lock.
        with FileLock(os.path.expanduser("~/.tune.gpulock")):
            X_cudf_train = cudf.DataFrame.from_pandas(X_train)
            self.train_mat = X_cudf_train.as_gpu_matrix(order="F")
            del X_cudf_train
            self.X_cudf_test = cudf.DataFrame.from_pandas(X_test)
            self.y_cudf_train = cudf.Series(y_train.values)
            self.y_test = y_test
            self.cuml_model = self._make_model(config)

    def _train(self):
        """Fit the model and report its accuracy on the test set."""
        self.cuml_model.fit(self.train_mat, self.y_cudf_train)
        fil_preds_orig = self.cuml_model.predict(self.X_cudf_test)
        accuracy = accuracy_score(self.y_test, fil_preds_orig)
        return {"mean_accuracy": accuracy}

    def _stop(self):
        """Drop the model and datasets, printing GPU memory use before each deletion."""
        print("Deleting the model. Mem: {:0.3f}".format(
            self._gpu_memory_used()))
        del self.cuml_model
        print("Deleting the test set. Mem: {:0.3f}".format(
            self._gpu_memory_used()))
        del self.X_cudf_test
        print("Deleting the test labels. Mem: {:0.3f}".format(
            self._gpu_memory_used()))
        del self.y_test
        print("Deleting the training labels. Mem: {:0.3f}".format(
            self._gpu_memory_used()))
        del self.y_cudf_train
        print("Deleting the training matrix. Mem: {:0.3f}".format(
            self._gpu_memory_used()))
        del self.train_mat

    def _wait_for_gpus(self, retry=10):
        """
        Poll until the GPU reports at most 0.1 memory used (units as given
        by GPUtil), checking up to ``retry`` times five seconds apart.
        """
        import time
        for _ in range(int(retry)):
            used = self._gpu_memory_used()
            if used <= 0.1:
                break
            print("Waiting for GPU memory to free. Mem: {:0.3f}".format(used))
            time.sleep(5)
        # Final settle delay, kept from the original implementation.
        time.sleep(5)

    def reset_config(self, config):
        """Replace the current model with a fresh one built from ``config``."""
        del self.cuml_model
        self.cuml_model = self._make_model(config)
        return True

    @staticmethod
    def _parse_gpu_index(visible_devices):
        """Return the integer index of the first device named in ``visible_devices``."""
        first = str(visible_devices).split(",")[0].strip()
        try:
            return int(first)
        except ValueError:
            # Empty value or a non-numeric identifier: fall back to
            # device 0, which is only used for memory reporting.
            return 0

    @staticmethod
    def _make_model(config):
        """Build the classifier with every config value cast to ``int``."""
        return GPURandomForestClassifier(
            **{k: int(v) for k, v in config.items()})

    def _gpu_memory_used(self):
        """Query GPUtil afresh and return the current ``memoryUsed`` of this trainable's GPU."""
        import GPUtil
        # GPUtil hands back a snapshot, so it has to be re-queried on every
        # call to observe memory actually being released.
        return GPUtil.getGPUs()[self._gpu_id].memoryUsed
import pickle
from datasets import prepare_dataset
from cuml.ensemble import RandomForestClassifier as GPURandomForestClassifier

# Script: fit a cuML random forest on part of the "airline" dataset and
# measure its accuracy on the test split.
# NOTE(review): `np`, `cudf` and `accuracy_score` are used below but not
# imported in this excerpt - presumably imported elsewhere in the file.

# Load the dataset and pull out its train/test parts.
data = prepare_dataset("/data", "airline", None)
X_train, X_test, y_train, y_test = data.X_train, data.X_test, data.y_train, data.y_test

# Cast the labels to 32-bit integers.
y_train = y_train.astype(np.int32)
y_test = y_test.astype(np.int32)

# NOTE(review): despite its name, QUARTER is half the training-set length;
# the slices below keep the second half of the training rows.
QUARTER = len(X_train) // 2
X_train = X_train[QUARTER:]
y_train = y_train[QUARTER:]

# Convert the pandas data to cuDF; the training features are additionally
# turned into a GPU matrix in "F" order.
X_cudf_train = cudf.DataFrame.from_pandas(X_train)
X_cudf_test = cudf.DataFrame.from_pandas(X_test)
train_mat = X_cudf_train.as_gpu_matrix(order="F")
# The intermediate cuDF frame is no longer needed once the matrix exists.
del X_cudf_train
y_cudf_train = cudf.Series(y_train.values)

# Fit with fixed hyperparameters and score the predictions on the test set.
cuml_model = GPURandomForestClassifier(n_estimators=467, max_depth=19, max_features=1.0)
cuml_model.fit(train_mat, y_cudf_train)
fil_preds_orig = cuml_model.predict(X_cudf_test)
fil_acc_orig = accuracy_score(y_test, fil_preds_orig)