Ejemplo n.º 1
0
def launch_run(*, run_config, experiment_id):
    """Fit a WildWood forest for one configuration and log the run to mlflow.

    The dataset named in ``run_config`` is loaded, split into train and test
    parts, and a ``ForestClassifier`` is fitted on the train part. Fit and
    prediction timings, together with the metrics returned by
    ``compute_metrics``, are logged to mlflow under ``experiment_id``.

    Parameters
    ----------
    run_config : dict
        The configuration of the run. It must contain the keys ``"dataset"``
        and ``"dataset_random_state"``. Every entry whose key starts with
        ``"wildwood_"`` is passed to ``ForestClassifier`` with that prefix
        removed.

    experiment_id : str
        Id of the experiment that groups runs in mlflow

    Returns
    -------
    output : dict
        Metrics computed during this run (the same dict that is logged to
        mlflow, timings included)
    """
    # Use one single prefix both to select and to strip the keys, so that a key
    # can never be selected without being renamed, and only the leading
    # occurrence of the prefix is removed.
    prefix = "wildwood_"
    wildwood_kwargs = {
        key[len(prefix):]: val
        for key, val in run_config.items()
        if key.startswith(prefix)
    }

    dataset_name = run_config["dataset"]
    dataset_random_state = run_config["dataset_random_state"]
    loader = loader_from_name[dataset_name]

    # Just get the task from the dataset
    dataset = loader()
    learning_task = dataset.task

    # But we use the raw data in wildwood
    X, y = loader(raw=True)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=dataset_random_state, shuffle=True, stratify=y
    )

    # The encoder is fitted on the train labels only and reused on the test
    # labels.
    label_encoder = LabelEncoder()
    y_train = label_encoder.fit_transform(y_train)
    y_test = label_encoder.transform(y_test)

    kwargs_one_tree = dict(wildwood_kwargs)
    kwargs_one_tree["n_estimators"] = 1

    # Fit a single tree on the full dataset to force pre-compilation (doing so on a
    # subset often fails).
    # TODO: debug such cases
    clf = ForestClassifier(**kwargs_one_tree)
    clf.fit(X_train, y_train)

    # Instantiate again just to be sure
    clf = ForestClassifier(**wildwood_kwargs)

    # NOTE(review): `filename` (used in the log messages below) is not defined
    # in this function; presumably it is a module-level global -- confirm.
    with mlflow.start_run(experiment_id=experiment_id):
        # Fit and timing
        tic = time()
        # TODO: include computations with and without categorical features ?
        clf.fit(X_train, y_train)
        toc = time()
        fit_time = toc - tic
        logging.info(f"Fitted for experiment {filename} in {fit_time}s")

        # Predict and timing
        tic = time()
        y_scores_train = clf.predict_proba(X_train)
        toc = time()
        predict_train_time = toc - tic

        tic = time()
        y_scores_test = clf.predict_proba(X_test)
        toc = time()
        predict_test_time = toc - tic

        logging.info(
            f"Predicted for experiment {filename} on train in {predict_train_time}s and test in {predict_test_time}s"
        )

        # The label predictions are computed outside of the timed sections.
        y_pred_train = clf.predict(X_train)
        y_pred_test = clf.predict(X_test)

        metrics = compute_metrics(
            learning_task=learning_task,
            y_train=y_train,
            y_test=y_test,
            y_scores_train=y_scores_train,
            y_scores_test=y_scores_test,
            y_pred_train=y_pred_train,
            y_pred_test=y_pred_test,
        )

        mlflow_metrics = dict(
            **metrics,
            fit_time=fit_time,
            predict_train_time=predict_train_time,
            predict_test_time=predict_test_time,
        )

        mlflow_params = dict(
            **wildwood_kwargs,
            dataset=dataset_name,
            dataset_random_state=dataset_random_state,
        )

        mlflow.log_params(mlflow_params)
        mlflow.log_metrics(mlflow_metrics)

    # The docstring promises the metrics of the run: hand them back to the
    # caller instead of implicitly returning None.
    return mlflow_metrics
Ejemplo n.º 2
0
# Clamp the current axes to the extent of the (xx, yy) grid and remove the
# tick marks on both axes.
ax.set_xlim(xx.min(), xx.max())
ax.set_ylim(yy.min(), yy.max())
ax.set_xticks(())
ax.set_yticks(())

# Second panel of the 1x2 subplot layout: fit the classifier and draw its
# predictions over the grid.
ax = plt.subplot(1, 2, 2)
clf.fit(X_train, y_train)

# clf.apply(X_train)
# logging.info("%s had %d nodes" % (name, clf.tree_.node_count))
# Flatten the grid into one row per grid point: column 0 holds the xx values
# and column 1 holds the yy values.
truc = np.empty((xx.ravel().shape[0], 2))
truc[:, 0] = xx.ravel()
truc[:, 1] = yy.ravel()

# Keep column 1 of the predicted probabilities for every grid point
# (presumably the probability of the positive class -- confirm).
Z = clf.predict_proba(truc)[:, 1]
# Z = clf.predict_proba_trees(truc)[0][:, 1]

# score = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
# Put the result into a color plot: the scores are reshaped back to the shape
# of the grid and drawn as filled contours.
Z = Z.reshape(xx.shape)
ax.contourf(xx, yy, Z, cmap=cm, alpha=0.8)
# Same axis limits and hidden ticks as on the previous axes.
ax.set_xlim(xx.min(), xx.max())
ax.set_ylim(yy.min(), yy.max())
ax.set_xticks(())
ax.set_yticks(())

plt.tight_layout()

# print("time: ", toc - tic)
Ejemplo n.º 3
0
    "aggregation": True
}

# Build the forest from the keyword arguments collected in `clf_kwargs`.
clf = ForestClassifier(**clf_kwargs)

# Hold out 30% of the samples for testing; the fixed random_state keeps the
# split identical from one execution to the next.
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=42)

# Each step below is timed by calling time() right before and right after it,
# and the elapsed time is printed.
tic = time()
clf.fit(X_train, y_train)
toc = time()
print("time to fit: ", toc - tic)

tic = time()
y_scores = clf.predict_proba(X_test)
toc = time()
print("time to predict_proba: ", toc - tic)

tic = time()
y_pred = clf.predict(X_test)
toc = time()
print("time to predict: ", toc - tic)

# Compare the predicted test labels with the true ones. Note that `cm` is the
# confusion matrix here.
cm = confusion_matrix(y_test, y_pred)
acc = accuracy_score(y_test, y_pred)

print(cm)
print(acc)
Ejemplo n.º 4
0
    random_state=data_random_state)

# Number of trees, stored in a variable so that it can be passed to every
# forest built in this script.
n_estimators = 100

# Forest that receives the categorical features of the dataset through
# `dataset.categorical_features_`.
clf = ForestClassifier(
    n_estimators=n_estimators,
    random_state=42,
    aggregation=False,
    max_features=None,
    categorical_features=dataset.categorical_features_,
    n_jobs=1,
    class_weight="balanced",
    criterion="entropy",
)
clf.fit(X_train, y_train)
y_scores_train = clf.predict_proba(X_train)
y_scores_test = clf.predict_proba(X_test)
# Average precision is computed from column 1 of the predicted probabilities
# (presumably the positive class of a binary problem -- confirm).
avg_prec_train = average_precision_score(y_train, y_scores_train[:, 1])
avg_prec_test = average_precision_score(y_test, y_scores_test[:, 1])
print("Categorical")
print("AP(train):", avg_prec_train, "AP(test):", avg_prec_test)

clf = ForestClassifier(
    n_estimators=n_estimators,
    random_state=42,
    aggregation=False,
    max_features=None,
    # categorical_features=dataset.categorical_features_,
    criterion="entropy",
    n_jobs=1,
    class_weight="balanced",