def fit_forest(X_train, y_train, n_estimators=10, dirichlet=0.5, step=1.0):
    clf_kwargs = {
        "n_estimators": n_estimators,
        "min_samples_split": 2,
        "random_state": random_state,
        "n_jobs": 1,
        "step": step,
        "dirichlet": dirichlet,
    }

    clf = ForestClassifier(**clf_kwargs)
    clf.fit(X_train, y_train)
    return clf
Exemple #2
0
 def fit(
     self,
     params,
     X_train,
     y_train,
     Xy_val,
     sample_weight,
     n_estimators=None,
     seed=None,
 ):
     if seed is not None:
         params.update({"random_state": seed})
     if n_estimators is not None:
         params.update({"n_estimators": n_estimators})
     clf = ForestClassifier(**params, n_jobs=-1)
     clf.fit(
         X_train,
         y_train,
         sample_weight=sample_weight,
         categorical_features=self.categorical_features,
     )
     return clf, None
def fit_forest(
    X_train,
    y_train,
    aggregation=True,
    n_estimators=10,
    dirichlet=0.5,
    step=1.0,
    min_samples_split=2,
    n_jobs=1,
):

    clf_kwargs = {
        "n_estimators": n_estimators,
        "aggregation": aggregation,
        "min_samples_split": min_samples_split,
        "random_state": random_state,
        "n_jobs": n_jobs,
        "step": step,
        "dirichlet": dirichlet,
        "max_features": 2,
    }
    clf = ForestClassifier(**clf_kwargs)
    clf.fit(X_train, y_train)
    return clf
Exemple #4
0
def launch_run(*, run_config, experiment_id):
    """

    Parameters
    ----------
    run_config : dict
        The configuration of the run

    experiment_id : str
        Id of the experiment that groups runs in mlflow

    Returns
    -------
    output : dict
        Metrics computed during this run
    """

    wildwood_kwargs = {
        key.replace("wildwood_", ""): val
        for key, val in run_config.items() if key.startswith("wildwood")
    }

    dataset_name = run_config["dataset"]
    dataset_random_state = run_config["dataset_random_state"]
    loader = loader_from_name[dataset_name]

    # Just get the task from the dataset
    dataset = loader()
    learning_task = dataset.task

    # But we use the raw data in wildwood
    X, y = loader(raw=True)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=dataset_random_state, shuffle=True, stratify=y)

    label_encoder = LabelEncoder()
    y_train = label_encoder.fit_transform(y_train)
    y_test = label_encoder.transform(y_test)

    kwargs_one_tree = wildwood_kwargs.copy()
    kwargs_one_tree["n_estimators"] = 1

    # Fit a single tree on the full dataset to force pre-compilation (doing so on a
    # subset often fails).
    # TODO: debug such cases
    clf = ForestClassifier(**kwargs_one_tree)
    clf.fit(X_train, y_train)

    # Instantiate again just to be sure
    clf = ForestClassifier(**wildwood_kwargs)

    with mlflow.start_run(experiment_id=experiment_id):
        # Fit and timing
        tic = time()
        # clf.fit(X_train, y_train, **fit_kwargs_generator(clf_name, dataset_name))
        # TODO: include computations with an without categorical features ?
        clf.fit(X_train, y_train)

        toc = time()
        fit_time = toc - tic
        logging.info(f"Fitted for experiment {filename} in {fit_time}s")

        # Predict and timing
        tic = time()
        y_scores_train = clf.predict_proba(X_train)
        toc = time()
        predict_train_time = toc - tic

        tic = time()
        y_scores_test = clf.predict_proba(X_test)
        toc = time()
        predict_test_time = toc - tic

        # col_predict_time.append(predict_time)
        logging.info(
            f"Predicted for experiment {filename} on train in {predict_train_time}s and test in {predict_test_time}s"
        )

        y_pred_train = clf.predict(X_train)
        y_pred_test = clf.predict(X_test)

        metrics = compute_metrics(
            learning_task=learning_task,
            y_train=y_train,
            y_test=y_test,
            y_scores_train=y_scores_train,
            y_scores_test=y_scores_test,
            y_pred_train=y_pred_train,
            y_pred_test=y_pred_test,
        )

        mlflow_metrics = dict(
            **metrics,
            fit_time=fit_time,
            predict_train_time=predict_train_time,
            predict_test_time=predict_test_time,
        )

        mlflow_params = dict(
            **wildwood_kwargs,
            dataset=dataset_name,
            dataset_random_state=dataset_random_state,
        )

        mlflow.log_params(mlflow_params)
        mlflow.log_metrics(mlflow_metrics)
Exemple #5
0
    "random_state": random_state,
    "n_jobs": -1,
    "dirichlet": 1e-5,
    "step": 2.0,
    "aggregation": True
}

clf = ForestClassifier(**clf_kwargs)

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=42)

tic = time()
clf.fit(X_train, y_train)
toc = time()
print("time to fit: ", toc - tic)

tic = time()
y_scores = clf.predict_proba(X_test)
toc = time()
print("time to predict_proba: ", toc - tic)

tic = time()
y_pred = clf.predict(X_test)
toc = time()
print("time to predict: ", toc - tic)

cm = confusion_matrix(y_test, y_pred)
acc = accuracy_score(y_test, y_pred)