def fit_forest(X_train, y_train, n_estimators=10, dirichlet=0.5, step=1.0):
    """Fit a single-threaded WildWood forest on the given training data.

    Parameters
    ----------
    X_train, y_train : array-like
        Training inputs and labels, forwarded to ``ForestClassifier.fit``.
    n_estimators : int, default=10
        Number of trees in the forest.
    dirichlet : float, default=0.5
        Dirichlet prior parameter of the forest.
    step : float, default=1.0
        Step parameter of the forest.

    Returns
    -------
    ForestClassifier
        The fitted forest.
    """
    # NOTE(review): `random_state` is read from module scope — confirm it is
    # defined before this function is first called.
    forest = ForestClassifier(
        n_estimators=n_estimators,
        min_samples_split=2,
        random_state=random_state,
        n_jobs=1,
        step=step,
        dirichlet=dirichlet,
    )
    forest.fit(X_train, y_train)
    return forest
def set_classifier(clf_name, fit_seed, n_jobs=-1):
    """Return a freshly-constructed, unfitted classifier selected by name.

    Parameters
    ----------
    clf_name : str
        One of "RandomForestClassifier", "HistGradientBoostingClassifier",
        "XGBClassifier", "LGBMClassifier", "CatBoostClassifier", "WildWood".
    fit_seed : int
        Seed forwarded to the classifier's ``random_state`` (``thread_count``
        seed for CatBoost is handled by its own ``random_state``).
    n_jobs : int, default=-1
        Parallelism setting for classifiers that support it.

    Returns
    -------
    object
        An unfitted classifier instance configured with ``fit_seed``/``n_jobs``.

    Raises
    ------
    KeyError
        If ``clf_name`` is not a supported name (same behavior as before).
    """
    # Map names to zero-argument factories so that only the requested
    # classifier is instantiated. The original code eagerly built all six
    # classifiers (including heavyweight CatBoost/XGBoost objects) on every
    # call just to return one of them.
    classifier_setting = {
        "RandomForestClassifier": lambda: RandomForestClassifier(
            n_estimators=100, n_jobs=n_jobs, random_state=fit_seed
        ),
        "HistGradientBoostingClassifier": lambda: HistGradientBoostingClassifier(
            random_state=fit_seed
        ),
        "XGBClassifier": lambda: xgb.XGBClassifier(
            use_label_encoder=False,
            n_jobs=n_jobs,
            tree_method="hist",
            random_state=fit_seed,
        ),
        "LGBMClassifier": lambda: lgb.LGBMClassifier(
            n_jobs=n_jobs, random_state=fit_seed
        ),
        "CatBoostClassifier": lambda: CatBoostClassifier(
            thread_count=n_jobs,
            random_state=fit_seed,
            logging_level="Silent",
            allow_writing_files=False,
        ),
        "WildWood": lambda: ForestClassifier(
            n_estimators=10, n_jobs=n_jobs, random_state=fit_seed
        ),
    }
    return classifier_setting[clf_name]()
def fit(
    self,
    params,
    X_train,
    y_train,
    Xy_val,
    sample_weight,
    n_estimators=None,
    seed=None,
):
    """Fit a WildWood ``ForestClassifier`` with the given hyper-parameters.

    Parameters
    ----------
    params : dict
        Keyword arguments for ``ForestClassifier``. Left unmodified: the
        ``seed``/``n_estimators`` overrides are applied to a copy.
    X_train, y_train : array-like
        Training inputs and labels forwarded to ``fit``.
    Xy_val : object
        Unused here; presumably kept for signature compatibility with
        sibling fitters — TODO confirm against callers.
    sample_weight : array-like or None
        Per-sample weights forwarded to ``fit``.
    n_estimators : int or None, default=None
        If given, overrides ``params["n_estimators"]``.
    seed : int or None, default=None
        If given, overrides ``params["random_state"]``.

    Returns
    -------
    (ForestClassifier, None)
        The fitted classifier and a placeholder second element.
    """
    # Work on a copy: the original code called params.update(...) and
    # silently mutated the caller's dict, leaking the overrides.
    params = dict(params)
    if seed is not None:
        params["random_state"] = seed
    if n_estimators is not None:
        params["n_estimators"] = n_estimators
    clf = ForestClassifier(**params, n_jobs=-1)
    clf.fit(
        X_train,
        y_train,
        sample_weight=sample_weight,
        categorical_features=self.categorical_features,
    )
    return clf, None
def fit_forest(
    X_train,
    y_train,
    aggregation=True,
    n_estimators=10,
    dirichlet=0.5,
    step=1.0,
    min_samples_split=2,
    n_jobs=1,
):
    """Fit a WildWood forest (with ``max_features=2``) on the training data.

    Parameters
    ----------
    X_train, y_train : array-like
        Training inputs and labels, forwarded to ``ForestClassifier.fit``.
    aggregation : bool, default=True
        Whether aggregation is enabled in the forest.
    n_estimators : int, default=10
        Number of trees.
    dirichlet : float, default=0.5
        Dirichlet prior parameter.
    step : float, default=1.0
        Step parameter.
    min_samples_split : int, default=2
        Minimum number of samples required to split a node.
    n_jobs : int, default=1
        Number of parallel jobs.

    Returns
    -------
    ForestClassifier
        The fitted forest.
    """
    # NOTE(review): `random_state` is read from module scope — confirm it is
    # defined before this function is first called.
    forest = ForestClassifier(
        n_estimators=n_estimators,
        aggregation=aggregation,
        min_samples_split=min_samples_split,
        random_state=random_state,
        n_jobs=n_jobs,
        step=step,
        dirichlet=dirichlet,
        max_features=2,
    )
    forest.fit(X_train, y_train)
    return forest
def launch_run(*, run_config, experiment_id):
    """Run one WildWood experiment and log parameters/metrics to mlflow.

    Parameters
    ----------
    run_config : dict
        The configuration of the run. Keys prefixed with ``"wildwood_"`` are
        stripped of the prefix and forwarded to ``ForestClassifier``; also
        reads ``"dataset"`` and ``"dataset_random_state"``.

    experiment_id : str
        Id of the experiment that groups runs in mlflow

    Returns
    -------
    output : dict
        Metrics computed during this run
    """
    # Extract the ForestClassifier kwargs from the run configuration by
    # stripping the "wildwood_" prefix.
    wildwood_kwargs = {
        key.replace("wildwood_", ""): val
        for key, val in run_config.items()
        if key.startswith("wildwood")
    }
    dataset_name = run_config["dataset"]
    dataset_random_state = run_config["dataset_random_state"]
    # NOTE(review): loader_from_name is presumably a module-level registry
    # mapping dataset names to loader callables — confirm.
    loader = loader_from_name[dataset_name]
    # Just get the task from the dataset
    dataset = loader()
    learning_task = dataset.task
    # But we use the raw data in wildwood
    X, y = loader(raw=True)
    # Stratified shuffle-split so that class proportions are preserved.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=dataset_random_state, shuffle=True, stratify=y
    )
    # Encode labels as integers; the test labels use the mapping learned on
    # the training labels.
    label_encoder = LabelEncoder()
    y_train = label_encoder.fit_transform(y_train)
    y_test = label_encoder.transform(y_test)
    kwargs_one_tree = wildwood_kwargs.copy()
    kwargs_one_tree["n_estimators"] = 1
    # Fit a single tree on the full dataset to force pre-compilation (doing
    # so on a subset often fails), so that the timed fit below does not pay
    # the compilation cost.
    # TODO: debug such cases
    clf = ForestClassifier(**kwargs_one_tree)
    clf.fit(X_train, y_train)
    # Instantiate again just to be sure
    clf = ForestClassifier(**wildwood_kwargs)
    with mlflow.start_run(experiment_id=experiment_id):
        # Fit and timing
        tic = time()
        # clf.fit(X_train, y_train, **fit_kwargs_generator(clf_name, dataset_name))
        # TODO: include computations with and without categorical features?
        clf.fit(X_train, y_train)
        toc = time()
        fit_time = toc - tic
        logging.info(f"Fitted for experiment (unknown) in {fit_time}s")
        # Predict and timing: probabilities on both train and test sets,
        # each timed separately.
        tic = time()
        y_scores_train = clf.predict_proba(X_train)
        toc = time()
        predict_train_time = toc - tic
        tic = time()
        y_scores_test = clf.predict_proba(X_test)
        toc = time()
        predict_test_time = toc - tic
        # col_predict_time.append(predict_time)
        logging.info(
            f"Predicted for experiment (unknown) on train in {predict_train_time}s and test in {predict_test_time}s"
        )
        # Hard predictions for metrics that need class labels.
        y_pred_train = clf.predict(X_train)
        y_pred_test = clf.predict(X_test)
        metrics = compute_metrics(
            learning_task=learning_task,
            y_train=y_train,
            y_test=y_test,
            y_scores_train=y_scores_train,
            y_scores_test=y_scores_test,
            y_pred_train=y_pred_train,
            y_pred_test=y_pred_test,
        )
        # Log computed metrics plus the timings to mlflow.
        mlflow_metrics = dict(
            **metrics,
            fit_time=fit_time,
            predict_train_time=predict_train_time,
            predict_test_time=predict_test_time,
        )
        # Log the WildWood kwargs and the dataset configuration as params.
        mlflow_params = dict(
            **wildwood_kwargs,
            dataset=dataset_name,
            dataset_random_state=dataset_random_state,
        )
        mlflow.log_params(mlflow_params)
        mlflow.log_metrics(mlflow_metrics)
# "max_bins": 8308, "n_jobs": -1, "dirichlet": 1e-8, "step": 1.0, "aggregation": False, "verbose": True } # classifiers = [ # ("tree", DecisionTreeClassifier), # ("sk_tree", SkDecisionTreeClassifier) # ] classifiers = [ # ("forest", ForestBinaryClassifier(n_estimators=1, **clf_kwargs)), ("forest", ForestClassifier(**clf_kwargs)), # ("sk_forest", RandomForestClassifier(**clf_kwargs)) # ("tree", DecisionTreeClassifier(**clf_kwargs)), # ("sk_tree", SkDecisionTreeClassifier(**clf_kwargs)), ] n_classifiers = len(classifiers) n_datasets = len(datasets) h = 0.2 i = 1 # iterate over datasets # for ds_cnt, ds in enumerate(datasets): # # preprocess datasets, split into training and test part # ds_name, (X, y) = ds
# X_train = rcv1_train.data # y_train = rcv1_train.target # X_test = rcv1_test.data # y_test = rcv1_test.target clf_kwargs = { "n_estimators": 5, "min_samples_split": 2, "random_state": random_state, "n_jobs": -1, "dirichlet": 1e-5, "step": 2.0, "aggregation": True } clf = ForestClassifier(**clf_kwargs) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42) tic = time() clf.fit(X_train, y_train) toc = time() print("time to fit: ", toc - tic) tic = time() y_scores = clf.predict_proba(X_test) toc = time() print("time to predict_proba: ", toc - tic)
noise=0.2, random_state=data_random_state) clf_kwargs = { "n_estimators": 1, "max_features": 2, "min_samples_split": 2, "random_state": random_state, "n_jobs": -1, "dirichlet": 1e-8, "step": 1.0, "aggregation": False, "verbose": True, } clf = ForestClassifier(**clf_kwargs) h = 0.2 i = 1 h = 0.2 fig = plt.figure(figsize=(4, 2)) i = 1 # iterate over datasets # preprocess datasets, split into training and test part X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)
data_random_state = 42 dataset = load_bank() dataset.one_hot_encode = False dataset.standardize = False X_train, X_test, y_train, y_test = dataset.extract( random_state=data_random_state) n_estimators = 100 clf = ForestClassifier( n_estimators=n_estimators, random_state=42, aggregation=False, max_features=None, categorical_features=dataset.categorical_features_, n_jobs=1, class_weight="balanced", criterion="entropy", ) clf.fit(X_train, y_train) y_scores_train = clf.predict_proba(X_train) y_scores_test = clf.predict_proba(X_test) avg_prec_train = average_precision_score(y_train, y_scores_train[:, 1]) avg_prec_test = average_precision_score(y_test, y_scores_test[:, 1]) print("Categorical") print("AP(train):", avg_prec_train, "AP(test):", avg_prec_test) clf = ForestClassifier( n_estimators=n_estimators, random_state=42,
random_state = 42 classifiers = [ lambda n: ( "RFW", RandomForestClassifier( n_estimators=n, n_jobs=-1, random_state=random_state, ), ), lambda n: ( "WildWood", ForestClassifier( n_estimators=n, multiclass="ovr", n_jobs=-1, random_state=random_state, ), ), lambda n: ( "ET", ExtraTreesClassifier( n_estimators=n, n_jobs=-1, random_state=random_state, ), ), ] logging.basicConfig(level=logging.INFO, format="%(asctime)s %(message)s",