コード例 #1
0
def launch_run(*, run_config, experiment_id):
    """Fit a forest classifier on one dataset and log the run to mlflow.

    A single-tree forest is fitted first as a warm-up, then a fresh
    classifier is fitted and evaluated inside an mlflow run; the parameters
    and the resulting metrics (including timings) are logged to that run.

    Parameters
    ----------
    run_config : dict
        The configuration of the run. It must contain the keys ``"dataset"``
        and ``"dataset_random_state"``. Every key starting with
        ``"wildwood_"`` is forwarded, with that prefix stripped, as a keyword
        argument to ``ForestClassifier``.

    experiment_id : str
        Id of the experiment that groups runs in mlflow

    Returns
    -------
    output : dict
        Metrics computed during this run, together with the ``fit_time``,
        ``predict_train_time`` and ``predict_test_time`` timings (the same
        dict that is logged to mlflow).
    """

    # Strip only the leading prefix: ``str.replace`` would also remove any
    # later occurrence of "wildwood_" inside the parameter name.
    prefix = "wildwood_"
    wildwood_kwargs = {
        key[len(prefix):]: val
        for key, val in run_config.items()
        if key.startswith(prefix)
    }

    dataset_name = run_config["dataset"]
    dataset_random_state = run_config["dataset_random_state"]
    loader = loader_from_name[dataset_name]

    # Just get the task from the dataset
    dataset = loader()
    learning_task = dataset.task

    # But we use the raw data in wildwood
    X, y = loader(raw=True)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=dataset_random_state, shuffle=True, stratify=y
    )

    # Fit the encoder on the training labels only, then reuse it on the test
    # labels so both splits share the same integer encoding.
    label_encoder = LabelEncoder()
    y_train = label_encoder.fit_transform(y_train)
    y_test = label_encoder.transform(y_test)

    kwargs_one_tree = wildwood_kwargs.copy()
    kwargs_one_tree["n_estimators"] = 1

    # Fit a single tree on the full dataset to force pre-compilation (doing so on a
    # subset often fails), so that compilation time is kept out of the timings below.
    # TODO: debug such cases
    clf = ForestClassifier(**kwargs_one_tree)
    clf.fit(X_train, y_train)

    # Instantiate again so the timed fit starts from an unfitted estimator
    clf = ForestClassifier(**wildwood_kwargs)

    with mlflow.start_run(experiment_id=experiment_id):
        # Fit and timing
        # TODO: include computations with an without categorical features ?
        tic = time()
        clf.fit(X_train, y_train)
        toc = time()
        fit_time = toc - tic
        # NOTE(review): `filename` is not defined inside this function; it is
        # presumably a module-level name -- confirm, otherwise this raises
        # NameError.
        logging.info(f"Fitted for experiment {filename} in {fit_time}s")

        # Predict and timing
        tic = time()
        y_scores_train = clf.predict_proba(X_train)
        toc = time()
        predict_train_time = toc - tic

        tic = time()
        y_scores_test = clf.predict_proba(X_test)
        toc = time()
        predict_test_time = toc - tic

        logging.info(
            f"Predicted for experiment {filename} on train in {predict_train_time}s and test in {predict_test_time}s"
        )

        y_pred_train = clf.predict(X_train)
        y_pred_test = clf.predict(X_test)

        metrics = compute_metrics(
            learning_task=learning_task,
            y_train=y_train,
            y_test=y_test,
            y_scores_train=y_scores_train,
            y_scores_test=y_scores_test,
            y_pred_train=y_pred_train,
            y_pred_test=y_pred_test,
        )

        mlflow_metrics = dict(
            **metrics,
            fit_time=fit_time,
            predict_train_time=predict_train_time,
            predict_test_time=predict_test_time,
        )

        mlflow_params = dict(
            **wildwood_kwargs,
            dataset=dataset_name,
            dataset_random_state=dataset_random_state,
        )

        mlflow.log_params(mlflow_params)
        mlflow.log_metrics(mlflow_metrics)

    # The docstring promises the metrics of the run; hand them back to the caller.
    return mlflow_metrics
コード例 #2
0
ファイル: check_iris.py プロジェクト: yiyang-yu/wildwood
    "aggregation": True
}

# Build the classifier from the `clf_kwargs` settings.
clf = ForestClassifier(**clf_kwargs)

# Hold out 30% of the samples for evaluation, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Time the training step.
fit_start = time()
clf.fit(X_train, y_train)
fit_elapsed = time() - fit_start
print("time to fit: ", fit_elapsed)

# Time the class-probability predictions on the held-out samples.
proba_start = time()
y_scores = clf.predict_proba(X_test)
proba_elapsed = time() - proba_start
print("time to predict_proba: ", proba_elapsed)

# Time the hard label predictions on the held-out samples.
predict_start = time()
y_pred = clf.predict(X_test)
predict_elapsed = time() - predict_start
print("time to predict: ", predict_elapsed)

# Report the confusion matrix and the accuracy on the held-out samples.
cm = confusion_matrix(y_test, y_pred)
acc = accuracy_score(y_test, y_pred)

print(cm)
print(acc)