Example #1
def test_failure_trial_to_json():
    hp = HyperparameterSamples({'a': 2})
    repo = InMemoryHyperparamsRepository()
    trial = Trial(save_trial_function=repo.save_trial,
                  hyperparams=hp,
                  main_metric_name='mse')

    with trial:
        trial_split = given_failed_trial_split(trial)

    trial_json = trial.to_json()

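    # A failed trial's JSON should carry the FAILED status, the error message and
    # traceback, the main metric name, and the failed validation split.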
    assert trial_json['status'] == TRIAL_STATUS.FAILED.value
    assert trial_json['error'] == str(trial_split.error)
    assert trial_json['error_traceback'] == EXPECTED_ERROR_TRACEBACK
    assert trial_json['main_metric_name'] == trial.main_metric_name
    assert then_failed_validation_split_json_is_valid(
        trial_json['validation_splits'][0], trial_split=trial_split)

    start_time = datetime.datetime.strptime(trial_json['start_time'],
                                            TRIAL_DATETIME_STR_FORMAT)
    end_time = datetime.datetime.strptime(
        trial_json['end_time'],
        TRIAL_DATETIME_STR_FORMAT) + datetime.timedelta(hours=1)

    assert start_time < end_time
Example #2
def _test_trial_scores(
        expected_output_mult, pipeline,
        hyperparams_optimizer: BaseHyperparameterSelectionStrategy,
        tmpdir: str):
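    # Given: an AutoML loop that runs 20 trials of 1 epoch each, scoring every
    # trial with mean squared error on a 50/50 validation split.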
    hp_repository: InMemoryHyperparamsRepository = InMemoryHyperparamsRepository(
        cache_folder=str(tmpdir))
    n_epochs = 1
    n_trials = 20
    auto_ml: AutoML = AutoML(
        pipeline=pipeline,
        hyperparams_optimizer=hyperparams_optimizer,
        validation_splitter=ValidationSplitter(0.5),
        scoring_callback=ScoringCallback(mean_squared_error,
                                         higher_score_is_better=False),
        callbacks=[
            MetricCallback('mse',
                           metric_function=mean_squared_error,
                           higher_score_is_better=False),
        ],
        n_trials=n_trials,
        refit_trial=True,
        epochs=n_epochs,
        hyperparams_repository=hp_repository)

    # When
    data_inputs = np.array([0, 0])
    expected_outputs = expected_output_mult * np.ones_like(data_inputs)
    auto_ml.fit(data_inputs=data_inputs, expected_outputs=expected_outputs)

    # Then
    trials: Trials = hp_repository.load_all_trials(status=TRIAL_STATUS.SUCCESS)
    validation_scores = [t.get_validation_score() for t in trials]
    return validation_scores
Example #3
    def setup(self):
        self.hp = HyperparameterSamples({'a': 2})
        self.repo = InMemoryHyperparamsRepository()
        self.trial = Trial(trial_number=0,
                           save_trial_function=self.repo.save_trial,
                           hyperparams=self.hp,
                           main_metric_name=MAIN_METRIC_NAME)
Example #4
def test_automl_should_shallow_copy_data_before_each_epoch():
    # see issue #332 https://github.com/Neuraxio/Neuraxle/issues/332
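    # AutoML is expected to shallow-copy the data before each epoch so that the
    # scaling step does not leak mutated inputs into later epochs (see the linked issue).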
    data_inputs = np.random.randint(0, 100, (100, 3))
    expected_outputs = np.random.randint(0, 3, 100)

    from sklearn.preprocessing import StandardScaler
    p = Pipeline([
        SKLearnWrapper(StandardScaler()),
        SKLearnWrapper(LinearSVC(),
                       HyperparameterSpace({'C': RandInt(0, 10000)})),
    ])

    auto_ml = AutoML(p,
                     validation_splitter=ValidationSplitter(0.20),
                     refit_trial=True,
                     n_trials=10,
                     epochs=10,
                     cache_folder_when_no_handle='cache',
                     scoring_callback=ScoringCallback(
                         mean_squared_error, higher_score_is_better=False),
                     callbacks=[
                         MetricCallback('mse',
                                        metric_function=mean_squared_error,
                                        higher_score_is_better=False)
                     ],
                     hyperparams_repository=InMemoryHyperparamsRepository(
                         cache_folder='cache'),
                     continue_loop_on_error=False)

    random_search = auto_ml.fit(data_inputs, expected_outputs)

    best_model = random_search.get_best_model()

    assert isinstance(best_model, Pipeline)
Example #5
def test_success_trial_to_json():
    hp = HyperparameterSamples({'a': 2})
    repo = InMemoryHyperparamsRepository()
    trial = Trial(save_trial_function=repo.save_trial,
                  hyperparams=hp,
                  main_metric_name='mse')

    with trial:
        given_success_trial_validation_split(trial)

    trial_json = trial.to_json()

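    # A successful trial serializes with the SUCCESS status and no error information.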
    assert trial_json['status'] == TRIAL_STATUS.SUCCESS.value
    assert trial_json['error'] is None
    assert trial_json['error_traceback'] is None
    assert trial_json['main_metric_name'] == trial.main_metric_name
    assert then_success_trial_split_json_is_valid(
        trial_json['validation_splits'][0])

    start_time = datetime.datetime.strptime(trial_json['start_time'],
                                            TRIAL_DATETIME_STR_FORMAT)
    end_time = datetime.datetime.strptime(
        trial_json['end_time'],
        TRIAL_DATETIME_STR_FORMAT) + datetime.timedelta(hours=1)

    assert start_time < end_time
Example #6
def test_automl_early_stopping_callback(tmpdir):
    # TODO: fix this unit test
    # Given
    hp_repository = InMemoryHyperparamsRepository(cache_folder=str(tmpdir))
    n_epochs = 60
    auto_ml = AutoML(
        pipeline=Pipeline([
            FitTransformCallbackStep().set_name('callback'),
            MultiplyByN(2).set_hyperparams_space(
                HyperparameterSpace({'multiply_by': FixedHyperparameter(2)})),
            NumpyReshape(new_shape=(-1, 1)),
            linear_model.LinearRegression()
        ]),
        hyperparams_optimizer=RandomSearchHyperparameterSelectionStrategy(),
        validation_splitter=ValidationSplitter(0.20),
        scoring_callback=ScoringCallback(mean_squared_error,
                                         higher_score_is_better=False),
        callbacks=[
            MetricCallback('mse',
                           metric_function=mean_squared_error,
                           higher_score_is_better=False),
        ],
        n_trials=1,
        refit_trial=True,
        epochs=n_epochs,
        hyperparams_repository=hp_repository)

    # When
    data_inputs = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
    expected_outputs = data_inputs * 2
    auto_ml = auto_ml.fit(data_inputs=data_inputs,
                          expected_outputs=expected_outputs)

    # Then
    p = auto_ml.get_best_model()
Example #7
def test_trial_split_is_new_best_score_should_return_true_with_a_new_best_score_after_multiple_scores(
):
    hp = HyperparameterSamples({'a': 2})
    repo = InMemoryHyperparamsRepository()
    trial = Trial(save_trial_function=repo.save_trial,
                  hyperparams=hp,
                  main_metric_name=MAIN_METRIC_NAME)

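    # Lower scores are better here: after results of 0.5, 0.7, and 0.4, the last
    # score (0.4) should register as a new best score.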
    with trial.new_validation_split(Identity()) as trial_split:
        trial_split.add_metric_results_train(name=MAIN_METRIC_NAME,
                                             score=0.5,
                                             higher_score_is_better=False)
        trial_split.add_metric_results_validation(name=MAIN_METRIC_NAME,
                                                  score=0.5,
                                                  higher_score_is_better=False)

        trial_split.add_metric_results_train(name=MAIN_METRIC_NAME,
                                             score=0.7,
                                             higher_score_is_better=False)
        trial_split.add_metric_results_validation(name=MAIN_METRIC_NAME,
                                                  score=0.7,
                                                  higher_score_is_better=False)

        trial_split.add_metric_results_train(name=MAIN_METRIC_NAME,
                                             score=0.4,
                                             higher_score_is_better=False)
        trial_split.add_metric_results_validation(name=MAIN_METRIC_NAME,
                                                  score=0.4,
                                                  higher_score_is_better=False)

    assert trial_split.is_new_best_score()
Example #8
def test_failure_trial_split_to_json():
    hp = HyperparameterSamples({'a': 2})
    repo = InMemoryHyperparamsRepository()
    trial = Trial(save_trial_function=repo.save_trial,
                  hyperparams=hp,
                  main_metric_name='mse')
    with trial:
        trial_split = given_failed_trial_split(trial)

    trial_json = trial_split.to_json()

    then_failed_validation_split_json_is_valid(trial_json, trial_split)
Example #9
def test_success_trial_split_to_json():
    hp = HyperparameterSamples({'a': 2})
    repo = InMemoryHyperparamsRepository()
    trial = Trial(save_trial_function=repo.save_trial,
                  hyperparams=hp,
                  main_metric_name=MAIN_METRIC_NAME)

    with trial:
        trial_split = given_success_trial_validation_split(trial)
        trial_json = trial_split.to_json()

    then_success_trial_split_json_is_valid(trial_json)
Example #10
def test_success_trial_get_validation_score():
    hp = HyperparameterSamples({'a': 2})
    repo = InMemoryHyperparamsRepository()
    trial = Trial(save_trial_function=repo.save_trial,
                  hyperparams=hp,
                  main_metric_name='mse')

    with trial:
        given_success_trial_validation_split(trial, best_score=0.3)

    validation_score = trial.get_validation_score()

    assert validation_score == 0.3
Example #11
def test_trial_should_create_new_split():
    hp = HyperparameterSamples({'a': 2})
    repo = InMemoryHyperparamsRepository()
    trial = Trial(save_trial_function=repo.save_trial,
                  hyperparams=hp,
                  main_metric_name=MAIN_METRIC_NAME)

    with trial.new_validation_split(Identity()) as trial_split:
        trial_split.set_success()

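    # The context manager should stamp start and end times on the split and
    # register it on the parent trial.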
    assert isinstance(trial_split.start_time, datetime.datetime)
    assert isinstance(trial_split.end_time, datetime.datetime)
    assert trial_split.start_time < trial_split.end_time
    assert trial.validation_splits[0] == trial_split
Example #12
def test_trial_with_failed_split_should_only_average_successful_splits():
    hp = HyperparameterSamples({'a': 2})
    repo = InMemoryHyperparamsRepository()
    trial = Trial(save_trial_function=repo.save_trial,
                  hyperparams=hp,
                  main_metric_name='mse')

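    # Only the two successful splits (0.3 and 0.1) should be averaged; the failed
    # split is ignored, giving (0.3 + 0.1) / 2 == 0.2.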
    with trial:
        given_success_trial_validation_split(trial, best_score=0.3)
        given_success_trial_validation_split(trial, best_score=0.1)
        given_failed_trial_split(trial)

    validation_score = trial.get_validation_score()

    assert validation_score == 0.2
Example #13
def _make_autoML_loop(tmpdir, p: Pipeline):
    hp_repository = InMemoryHyperparamsRepository(cache_folder=str(tmpdir) + "_hp")
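    # A separate "_hp" folder keeps the hyperparams repository apart from the
    # pipeline cache folder passed as cache_folder_when_no_handle below.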
    n_epochs = 1
    return AutoML(
        pipeline=p,
        hyperparams_optimizer=RandomSearchHyperparameterSelectionStrategy(),
        validation_splitter=ValidationSplitter(0.20),
        scoring_callback=ScoringCallback(mean_squared_error, higher_score_is_better=False),
        n_trials=1,
        refit_trial=True,
        epochs=n_epochs,
        hyperparams_repository=hp_repository,
        cache_folder_when_no_handle=str(tmpdir),
        continue_loop_on_error=False
    )
Example #14
    def test_logger_automl_multiprocessing(self, tmpdir):
        # Given
        context = ExecutionContext()
        self.tmpdir = str(tmpdir)
        hp_repository = InMemoryHyperparamsRepository(cache_folder=self.tmpdir)
        n_epochs = 2
        n_trials = 4
        auto_ml = AutoML(
            pipeline=Pipeline([
                MultiplyByN(2).set_hyperparams_space(
                    HyperparameterSpace(
                        {'multiply_by': FixedHyperparameter(2)})),
                NumpyReshape(new_shape=(-1, 1)),
                LoggingStep()
            ]),
            hyperparams_optimizer=RandomSearchHyperparameterSelectionStrategy(
            ),
            validation_splitter=ValidationSplitter(0.20),
            scoring_callback=ScoringCallback(mean_squared_error,
                                             higher_score_is_better=False),
            n_trials=n_trials,
            refit_trial=True,
            epochs=n_epochs,
            hyperparams_repository=hp_repository,
            continue_loop_on_error=False)

        # When
        data_container = DataContainer(
            data_inputs=np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]),
            expected_outputs=np.array([10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]))
        auto_ml.handle_fit(data_container, context)

        # Then
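        # Each trial should have written its own trial_<i>.log file into the
        # repository's cache folder.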
        file_paths = [
            os.path.join(hp_repository.cache_folder, f"trial_{i}.log")
            for i in range(n_trials)
        ]

        for f in file_paths:
            assert os.path.exists(f)

        # That's not a great way of testing, but at least it raises a flag when
        # something changes in the logging process.
        with open(file_paths[0], 'r') as f:
            log_lines = f.readlines()
            assert len(log_lines) == 36
Example #15
def test_automl_early_stopping_callback(tmpdir):
    # Given
    hp_repository = InMemoryHyperparamsRepository(cache_folder=str(tmpdir))
    n_epochs = 10
    max_epochs_without_improvement = 3
    auto_ml = AutoML(
        pipeline=Pipeline([
            MultiplyByN(2).set_hyperparams_space(
                HyperparameterSpace({'multiply_by': FixedHyperparameter(2)})),
            NumpyReshape(new_shape=(-1, 1)),
        ]),
        hyperparams_optimizer=RandomSearchHyperparameterSelectionStrategy(),
        validation_splitter=ValidationSplitter(0.20),
        scoring_callback=ScoringCallback(mean_squared_error,
                                         higher_score_is_better=False),
        callbacks=[
            MetricCallback('mse',
                           metric_function=mean_squared_error,
                           higher_score_is_better=False),
            EarlyStoppingCallback(max_epochs_without_improvement)
        ],
        n_trials=1,
        refit_trial=True,
        epochs=n_epochs,
        hyperparams_repository=hp_repository,
        continue_loop_on_error=False)

    # When
    data_inputs = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
    expected_outputs = data_inputs * 2
    auto_ml.fit(data_inputs=data_inputs, expected_outputs=expected_outputs)

    # Then
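    # The pipeline's hyperparameter is fixed, so the score never improves and early
    # stopping should halt after max_epochs_without_improvement + 1 executed epochs.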
    trial = hp_repository.trials[0]
    assert len(trial.validation_splits) == 1
    validation_scores = trial.validation_splits[0].get_validation_scores()
    nepochs_executed = len(validation_scores)
    assert nepochs_executed == max_epochs_without_improvement + 1
Example #16
def test_trials_get_best_hyperparams_should_return_hyperparams_of_best_trial():
    # Given
    repo = InMemoryHyperparamsRepository()
    hp_trial_1 = HyperparameterSamples({'a': 2})
    trial_1 = Trial(save_trial_function=repo.save_trial,
                    hyperparams=hp_trial_1,
                    main_metric_name=MAIN_METRIC_NAME)
    with trial_1:
        given_success_trial_validation_split(trial_1, best_score=0.2)

    hp_trial_2 = HyperparameterSamples({'b': 3})
    trial_2 = Trial(save_trial_function=repo.save_trial,
                    hyperparams=hp_trial_2,
                    main_metric_name=MAIN_METRIC_NAME)
    with trial_2:
        given_success_trial_validation_split(trial_2, best_score=0.1)

    trials = Trials(trials=[trial_1, trial_2])

    # When
    best_hyperparams = trials.get_best_hyperparams()

    # Then
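    # trial_2 has the best score (0.1), so its hyperparameters should be returned.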
    assert best_hyperparams == hp_trial_2
Example #17
def test_in_memory_hyperparams_repository_should_be_observable():
    repo: InMemoryHyperparamsRepository = InMemoryHyperparamsRepository()
    # todo: make a test that asserts that an observer can receive updates from the InMemoryHyperparamsRepository
    # todo: given trial, a repo, and an observer
    pass
Example #18
def main(tmpdir):
    boston = load_boston()
    X, y = shuffle(boston.data, boston.target, random_state=13)
    X = X.astype(np.float32)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, shuffle=False)

    # Note that the hyperparameter spaces are defined here during the pipeline definition, but they could already be
    # set within the classes at their definition when using custom classes, or they could be defined after declaring
    # the pipeline using a flat dict or a nested dict.
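    # As a rough illustration (the flattened step names below are hypothetical and
    # depend on how the steps are actually named in the pipeline), the flat-dict
    # form would look something like:
    #
    #   p.set_hyperparams_space(HyperparameterSpace({
    #       'AddFeatures__SKLearnWrapper_PCA__n_components': RandInt(1, 3),
    #       'ModelStacking__SKLearnWrapper_Ridge__alpha': LogUniform(0.7, 1.4),
    #   }))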

    p = Pipeline([
        AddFeatures([
            SKLearnWrapper(
                PCA(n_components=2),
                HyperparameterSpace({"n_components": RandInt(1, 3)})
            ),
            SKLearnWrapper(
                FastICA(n_components=2),
                HyperparameterSpace({"n_components": RandInt(1, 3)})
            ),
        ]),
        ModelStacking([
            SKLearnWrapper(
                GradientBoostingRegressor(),
                HyperparameterSpace({
                    "n_estimators": RandInt(50, 300), "max_depth": RandInt(1, 4),
                    "learning_rate": LogUniform(0.07, 0.7)
                })
            ),
            SKLearnWrapper(
                KMeans(),
                HyperparameterSpace({"n_clusters": RandInt(5, 10)})
            ),
        ],
            joiner=NumpyTranspose(),
            judge=SKLearnWrapper(
                Ridge(),
                HyperparameterSpace({"alpha": LogUniform(0.7, 1.4), "fit_intercept": Boolean()})
            ),
        )
    ])

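    # The AutoML loop samples 10 hyperparameter sets, scores each on a 20% validation
    # split with MSE, and refits the best pipeline when done.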
    print("Meta-fitting on train:")
    auto_ml = AutoML(
        p,
        validation_splitter=ValidationSplitter(0.20),
        refit_trial=True,
        n_trials=10,
        epochs=1,  # 1 epoch here since the sklearn models fit in a single pass.
        cache_folder_when_no_handle=str(tmpdir),
        scoring_callback=ScoringCallback(mean_squared_error, higher_score_is_better=False),
        callbacks=[MetricCallback('mse', metric_function=mean_squared_error, higher_score_is_better=False)],
        hyperparams_repository=InMemoryHyperparamsRepository(cache_folder=str(tmpdir))
    )

    random_search = auto_ml.fit(X_train, y_train)
    p = random_search.get_best_model()
    print("")

    print("Transforming train and test:")
    y_train_predicted = p.predict(X_train)
    y_test_predicted = p.predict(X_test)

    print("")

    print("Evaluating transformed train:")
    score_transform = r2_score(y_train, y_train_predicted)
    print('R2 regression score:', score_transform)

    print("")

    print("Evaluating transformed test:")
    score_test = r2_score(y_test, y_test_predicted)
    print('R2 regression score:', score_test)
Example #19
def main():
    def accuracy(data_inputs, expected_outputs):
        return np.mean(
            np.argmax(np.array(data_inputs), axis=1) == np.argmax(
                np.array(expected_outputs), axis=1))
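    # accuracy() above compares argmax indices: the model outputs class probabilities
    # and the expected outputs get one-hot encoded further down the pipeline.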

    # load the dataset
    df = read_csv('data/winequality-white.csv', sep=';')
    data_inputs = df.values
    data_inputs[:, -1] = data_inputs[:, -1] - 1
    n_features = data_inputs.shape[1] - 1
    n_classes = 10

    p = Pipeline([
        TrainOnlyWrapper(DataShuffler()),
        ColumnTransformerInputOutput(
            input_columns=[(
                [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], ToNumpy(np.float32)
            )],
            output_columns=[(11, Identity())]
        ),
        OutputTransformerWrapper(PlotDistribution(column=-1)),
        MiniBatchSequentialPipeline([
            Tensorflow2ModelStep(
                create_model=create_model,
                create_loss=create_loss,
                create_optimizer=create_optimizer
            ) \
                .set_hyperparams(HyperparameterSamples({
                'n_dense_layers': 2,
                'input_dim': n_features,
                'optimizer': 'adam',
                'activation': 'relu',
                'kernel_initializer': 'he_uniform',
                'learning_rate': 0.01,
                'hidden_dim': 20,
                'n_classes': 3
            })).set_hyperparams_space(HyperparameterSpace({
                'n_dense_layers': RandInt(2, 4),
                'hidden_dim_layer_multiplier': Uniform(0.30, 1),
                'input_dim': FixedHyperparameter(n_features),
                'optimizer': Choice([
                    OPTIMIZERS.ADAM.value,
                    OPTIMIZERS.SGD.value,
                    OPTIMIZERS.ADAGRAD.value
                ]),
                'activation': Choice([
                    ACTIVATIONS.RELU.value,
                    ACTIVATIONS.TANH.value,
                    ACTIVATIONS.SIGMOID.value,
                    ACTIVATIONS.ELU.value,
                ]),
                'kernel_initializer': Choice([
                    KERNEL_INITIALIZERS.GLOROT_NORMAL.value,
                    KERNEL_INITIALIZERS.GLOROT_UNIFORM.value,
                    KERNEL_INITIALIZERS.HE_UNIFORM.value
                ]),
                'learning_rate': LogUniform(0.005, 0.01),
                'hidden_dim': RandInt(3, 80),
                'n_classes': FixedHyperparameter(n_classes)
            }))
        ], batch_size=33),
        OutputTransformerWrapper(Pipeline([
            ExpandDim(),
            OneHotEncoder(nb_columns=n_classes, name='classes')
        ]))
    ])

    auto_ml = AutoML(
        pipeline=p,
        hyperparams_repository=InMemoryHyperparamsRepository(
            cache_folder='trials'),
        hyperparams_optimizer=RandomSearchHyperparameterSelectionStrategy(),
        validation_splitter=ValidationSplitter(test_size=0.30),
        scoring_callback=ScoringCallback(accuracy,
                                         higher_score_is_better=True),
        callbacks=[
            MetricCallback(
                name='classification_report_imbalanced_metric',
                metric_function=classificaiton_report_imbalanced_metric,
                higher_score_is_better=True),
            MetricCallback(name='f1',
                           metric_function=f1_score_weighted,
                           higher_score_is_better=True),
            MetricCallback(name='recall',
                           metric_function=recall_score_weighted,
                           higher_score_is_better=True),
            MetricCallback(name='precision',
                           metric_function=precision_score_weighted,
                           higher_score_is_better=True),
            EarlyStoppingCallback(max_epochs_without_improvement=3)
        ],
        n_trials=200,
        refit_trial=True,
        epochs=75)

    auto_ml = auto_ml.fit(data_inputs=data_inputs)