예제 #1
0
def random_search_and_save(fit_dictionary: typing.Dict[str, typing.Any],
                           backend: Backend, num_models: int) -> None:
    """
    Fit randomly configured pipelines and save them through the backend.

    The train (and, if present, test) tensors are read from the datamanager
    loaded via ``backend``. The targets at ``fit_dictionary['val_indices']``
    are saved once with ``backend.save_targets_ensemble``; then, for each of
    the ``num_models`` iterations, a ``TabularClassificationPipeline`` with a
    randomly sampled configuration is fitted on ``fit_dictionary`` and saved
    together with its predictions via ``backend.save_numrun_to_dir``.

    Args:
        fit_dictionary: Dictionary handed to ``pipeline.fit``. This function
            itself reads the keys ``'val_indices'``, ``'dataset_properties'``,
            ``'seed'`` and ``'epochs'`` from it.
        backend: Backend used to load the datamanager and to store targets,
            models and predictions.
        num_models: Number of random pipelines to fit and save.

    Note:
        When the datamanager has no test tensors, no test predictions are
        produced (``test_predictions=None`` is saved) and no score is printed.
    """

    datamanager = backend.load_datamanager()
    X_train, y_train = datamanager.train_tensors
    X_test, y_test = (None, None)
    if datamanager.test_tensors is not None:
        X_test, y_test = datamanager.test_tensors

    # Ensemble selection will evaluate performance on the OOF predictions.
    # Store the OOF ground truth once, up front.
    val_indices = fit_dictionary['val_indices']
    targets = np.take(y_train, val_indices, axis=0)
    backend.save_targets_ensemble(targets)

    # The validation features are the same for every model, so slice them once.
    X_val = np.take(X_train, val_indices, axis=0)

    for idx in range(num_models):
        pipeline = TabularClassificationPipeline(
            dataset_properties=fit_dictionary['dataset_properties'])

        # Sample a random configuration
        pipeline_cs = pipeline.get_hyperparameter_search_space()
        config = pipeline_cs.sample_configuration()
        pipeline.set_hyperparameters(config)

        # Fit the sample configuration
        pipeline.fit(fit_dictionary)

        # Predict using the fit model
        ensemble_predictions = pipeline.predict(X_val)
        # Test data is optional: only predict on it when it actually exists.
        test_predictions = pipeline.predict(X_test) if X_test is not None else None

        backend.save_numrun_to_dir(
            seed=fit_dictionary['seed'],
            idx=idx,
            budget=fit_dictionary['epochs'],
            model=pipeline,
            cv_model=None,
            ensemble_predictions=ensemble_predictions,
            valid_predictions=None,
            test_predictions=test_predictions,
        )

        if test_predictions is not None and y_test is not None:
            score = accuracy_score(y_test, np.argmax(test_predictions, axis=1))
            print(f"Fitted a pipeline {idx} with score = {score}")
        else:
            # Without test predictions or test labels there is nothing to score.
            print(f"Fitted a pipeline {idx} (no test data available, score not computed)")
    def test_pipeline_predict(self, fit_dictionary_tabular):
        """Check that a randomly configured pipeline can be fitted and predict.

        The trainer's ``train_epoch`` is replaced by a mock returning
        ``(1, {})`` while the pipeline is fitted. The prediction on a copy of
        ``X_train`` must be a numpy array with one row per input sample and
        ``dataset_properties['output_shape']`` columns.
        """
        features = fit_dictionary_tabular['X_train'].copy()
        pipeline = TabularClassificationPipeline(
            dataset_properties=fit_dictionary_tabular['dataset_properties'],
        )

        # Draw a random configuration from the pipeline's search space.
        search_space = pipeline.get_hyperparameter_search_space()
        sampled_config = search_space.sample_configuration()
        pipeline.set_hyperparameters(sampled_config)

        trainer = pipeline.named_steps['trainer'].choice
        with unittest.mock.patch.object(trainer, 'train_epoch') as patch_train:
            patch_train.return_value = 1, {}
            pipeline.fit(fit_dictionary_tabular)

        # The output must have the same batch size as the input, and as many
        # outputs per sample as the dataset properties declare.
        n_samples = features.shape[0]
        n_outputs = fit_dictionary_tabular["dataset_properties"]["output_shape"]
        expected_output_shape = (n_samples, n_outputs)

        prediction = pipeline.predict(features)
        assert isinstance(prediction, np.ndarray)
        assert prediction.shape == expected_output_shape