Example 1
def testRead(ensemble_backend):

    ensbuilder = EnsembleBuilder(
        backend=ensemble_backend,
        dataset_name="TEST",
        output_type=BINARY,
        task_type=TABULAR_CLASSIFICATION,
        metrics=[accuracy],
        opt_metric='accuracy',
        seed=0,  # important to find the test files
    )

    success = ensbuilder.score_ensemble_preds()
    assert success, str(ensbuilder.read_preds)
    assert len(ensbuilder.read_preds) == 3, ensbuilder.read_preds.keys()
    assert len(ensbuilder.read_scores) == 3, ensbuilder.read_scores.keys()

    filename = os.path.join(
        ensemble_backend.temporary_directory,
        ".autoPyTorch/runs/0_1_0.0/predictions_ensemble_0_1_0.0.npy")
    np.testing.assert_almost_equal(
        ensbuilder.read_scores[filename]["ens_score"], np.array(0.8))

    filename = os.path.join(
        ensemble_backend.temporary_directory,
        ".autoPyTorch/runs/0_2_0.0/predictions_ensemble_0_2_0.0.npy")
    np.testing.assert_almost_equal(
        ensbuilder.read_scores[filename]["ens_score"], np.array(1.0))
Example 2
def testPerformanceRangeThresholdMaxBest(ensemble_backend,
                                         performance_range_threshold,
                                         ensemble_nbest, exp):
    ensbuilder = EnsembleBuilder(
        backend=ensemble_backend,
        dataset_name="TEST",
        output_type=BINARY,
        task_type=TABULAR_CLASSIFICATION,
        metrics=[accuracy],
        opt_metric='accuracy',
        seed=0,  # important to find the test files
        ensemble_nbest=ensemble_nbest,
        performance_range_threshold=performance_range_threshold,
        max_models_on_disc=None,
    )
    ensbuilder.read_scores = {
        'A': {
            'ens_score': 1,
            'num_run': 1,
            'loaded': -1,
            "seed": 1
        },
        'B': {
            'ens_score': 2,
            'num_run': 2,
            'loaded': -1,
            "seed": 1
        },
        'C': {
            'ens_score': 3,
            'num_run': 3,
            'loaded': -1,
            "seed": 1
        },
        'D': {
            'ens_score': 4,
            'num_run': 4,
            'loaded': -1,
            "seed": 1
        },
        'E': {
            'ens_score': 5,
            'num_run': 5,
            'loaded': -1,
            "seed": 1
        },
    }
    ensbuilder.read_preds = {
        key: {key_2: True
              for key_2 in (Y_ENSEMBLE, Y_TEST)}
        for key in ensbuilder.read_scores
    }
    sel_keys = ensbuilder.get_n_best_preds()

    assert len(sel_keys) == exp
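
The parametrized values for performance_range_threshold, ensemble_nbest, and exp come from a decorator that is not shown above. As a rough illustration only, here is a minimal sketch of how the two knobs could combine, under the assumption that the builder first caps the candidates at the n best and then drops models whose score falls below a cutoff interpolated between the worst and best observed scores; the real selection logic lives in get_n_best_preds:

def select_candidates(scores, n_best, range_threshold):
    """Hypothetical helper, not EnsembleBuilder's actual code."""
    ranked = sorted(scores, key=scores.get, reverse=True)[:n_best]
    lo, hi = min(scores.values()), max(scores.values())
    cutoff = lo + (hi - lo) * range_threshold
    kept = [key for key in ranked if scores[key] > cutoff]
    return kept or ranked[:1]  # always keep at least one model

scores = {'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5}  # as in the test above
print(select_candidates(scores, n_best=4, range_threshold=0.5))  # ['E', 'D']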
Example 3
def test_main(ensemble_backend):

    ensbuilder = EnsembleBuilder(
        backend=ensemble_backend,
        dataset_name="TEST",
        output_type=MULTICLASS,
        task_type=TABULAR_CLASSIFICATION,
        metrics=[accuracy],
        opt_metric='accuracy',
        seed=0,  # important to find the test files
        ensemble_nbest=2,
        max_models_on_disc=None,
    )
    ensbuilder.SAVE2DISC = False

    run_history, ensemble_nbest, _, _ = ensbuilder.main(
        time_left=np.inf,
        iteration=1,
        return_predictions=False,
    )

    assert len(ensbuilder.read_preds) == 3
    assert ensbuilder.last_hash is not None
    assert ensbuilder.y_true_ensemble is not None

    # Make sure the run history is ok

    # We expect at least 1 element to be in the ensemble
    assert len(run_history) > 0

    # Since the data loader serves the same data for train/val/test,
    # we expect a score of 1.0 and all keys to be available
    expected_performance = {
        'train_accuracy': 1.0,
        'test_accuracy': 1.0,
    }

    # Make sure that expected performance is a subset of the run history
    assert all(item in run_history[0].items()
               for item in expected_performance.items())
    assert 'Timestamp' in run_history[0]
    assert isinstance(run_history[0]['Timestamp'], pd.Timestamp)

    assert os.path.exists(
        os.path.join(ensemble_backend.internals_directory,
                     'ensemble_read_preds.pkl')), os.listdir(
                         ensemble_backend.internals_directory)
    assert os.path.exists(
        os.path.join(ensemble_backend.internals_directory,
                     'ensemble_read_scores.pkl')), os.listdir(
                         ensemble_backend.internals_directory)
Example 4
def testMaxModelsOnDisc2(ensemble_backend):
    # Test extreme scenarios and
    # make sure that the best predictions are kept
    ensbuilder = EnsembleBuilder(
        backend=ensemble_backend,
        dataset_name="TEST",
        output_type=BINARY,
        task_type=TABULAR_CLASSIFICATION,
        metrics=[accuracy],
        opt_metric='accuracy',
        seed=0,  # important to find the test files
        ensemble_nbest=50,
        max_models_on_disc=10000.0,
    )
    ensbuilder.read_preds = {}
    for i in range(50):
        ensbuilder.read_scores['pred' + str(i)] = {
            'ens_score': i * 10,
            'num_run': i,
            'loaded': 1,
            "seed": 1,
            "disc_space_cost_mb": 50 * i,
        }
        ensbuilder.read_preds['pred' + str(i)] = {Y_ENSEMBLE: True}
    sel_keys = ensbuilder.get_n_best_preds()
    assert ['pred49', 'pred48', 'pred47'] == sel_keys

    # Make sure at least one model is kept alive
    ensbuilder.max_models_on_disc = 0.0
    sel_keys = ensbuilder.get_n_best_preds()
    assert ['pred49'] == sel_keys
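
The expected keys can be reproduced with back-of-the-envelope arithmetic. What follows is only one plausible reading consistent with the assertion above (the real rule lives inside get_n_best_preds): assume models are kept best-first while the accumulated size, plus headroom for the largest model on disk, stays within the float budget interpreted as MB:

budget = 10000.0                                    # max_models_on_disc
scores = {'pred%d' % i: i * 10 for i in range(50)}
costs = {'pred%d' % i: 50 * i for i in range(50)}   # disc_space_cost_mb
largest = max(costs.values())                       # 2450 MB of headroom
kept, used = [], 0.0
for key in sorted(scores, key=scores.get, reverse=True):
    if used + costs[key] + largest > budget:
        break
    kept.append(key)
    used += costs[key]
print(kept)  # ['pred49', 'pred48', 'pred47']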
Example 5
def testMaxModelsOnDisc(ensemble_backend, test_case, exp):
    ensemble_nbest = 4
    ensbuilder = EnsembleBuilder(
        backend=ensemble_backend,
        dataset_name="TEST",
        output_type=BINARY,
        task_type=TABULAR_CLASSIFICATION,
        metrics=[accuracy],
        opt_metric='accuracy',
        seed=0,  # important to find the test files
        ensemble_nbest=ensemble_nbest,
        max_models_on_disc=test_case,
    )

    with unittest.mock.patch('os.path.getsize') as mock:
        mock.return_value = 100 * 1024 * 1024
        ensbuilder.score_ensemble_preds()
        sel_keys = ensbuilder.get_n_best_preds()
        assert len(sel_keys) == exp, test_case
Example 6
def test_run_end_at(ensemble_backend):
    with unittest.mock.patch('pynisher.enforce_limits') as pynisher_mock:
        ensbuilder = EnsembleBuilder(
            backend=ensemble_backend,
            dataset_name="TEST",
            output_type=MULTICLASS,
            task_type=TABULAR_CLASSIFICATION,
            metrics=[accuracy],
            opt_metric='accuracy',
            seed=0,  # important to find the test files
            ensemble_nbest=2,
            max_models_on_disc=None,
        )
        ensbuilder.SAVE2DISC = False

        current_time = time.time()

        ensbuilder.run(end_at=current_time + 10, iteration=1)
        # 4 seconds left: 10 seconds minus the 5-second overhead and a tiny
        # amount of elapsed time, truncated to an integer
        assert pynisher_mock.call_args_list[0][1]["wall_time_in_s"] == 4
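
The budget arithmetic the comment describes can be sketched directly. This is a hypothetical illustration, assuming run() subtracts a fixed safety overhead (taken to be 5 seconds here) before handing the remaining budget to pynisher as an integer:

import time

def wall_time_budget(end_at, overhead=5.0):
    """Hypothetical sketch: seconds left until end_at, minus an assumed overhead."""
    return int(end_at - time.time() - overhead)

print(wall_time_budget(time.time() + 10))  # ~4: the leftover fraction is truncated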
Example 7
def testNBest(ensemble_backend, ensemble_nbest, max_models_on_disc, exp):
    ensbuilder = EnsembleBuilder(
        backend=ensemble_backend,
        dataset_name="TEST",
        output_type=BINARY,
        task_type=TABULAR_CLASSIFICATION,
        metrics=[accuracy],
        opt_metric='accuracy',
        seed=0,  # important to find the test files
        ensemble_nbest=ensemble_nbest,
        max_models_on_disc=max_models_on_disc,
    )

    ensbuilder.score_ensemble_preds()
    sel_keys = ensbuilder.get_n_best_preds()

    assert len(sel_keys) == exp

    fixture = os.path.join(
        ensemble_backend.temporary_directory,
        ".autoPyTorch/runs/0_2_0.0/predictions_ensemble_0_2_0.0.npy")
    assert sel_keys[0] == fixture
Example 8
def test_read_pickle_read_preds(ensemble_backend):
    """
    Test that the read predictions are saved before the ensemble
    builder is destroyed, and that they can be read back safely
    afterwards.
    """
    ensbuilder = EnsembleBuilder(
        backend=ensemble_backend,
        dataset_name="TEST",
        output_type=MULTICLASS,
        task_type=TABULAR_CLASSIFICATION,
        metrics=[accuracy],
        opt_metric='accuracy',
        seed=0,  # important to find the test files
        ensemble_nbest=2,
        max_models_on_disc=None,
    )
    ensbuilder.SAVE2DISC = False

    ensbuilder.main(time_left=np.inf, iteration=1, return_predictions=False)

    # Check that the memory was created
    ensemble_memory_file = os.path.join(ensemble_backend.internals_directory,
                                        'ensemble_read_preds.pkl')
    assert os.path.exists(ensemble_memory_file)

    # Make sure we pickle the correct read preds and hash
    with open(ensemble_memory_file, "rb") as memory:
        read_preds, last_hash = pickle.load(memory)

    compare_read_preds(read_preds, ensbuilder.read_preds)
    assert last_hash == ensbuilder.last_hash

    ensemble_memory_file = os.path.join(ensemble_backend.internals_directory,
                                        'ensemble_read_scores.pkl')
    assert os.path.exists(ensemble_memory_file)

    # Make sure we pickle the correct read scores
    with open(ensemble_memory_file, "rb") as memory:
        read_scores = pickle.load(memory)

    compare_read_preds(read_scores, ensbuilder.read_scores)

    # Then create a new instance, which should automatically read this file
    ensbuilder2 = EnsembleBuilder(
        backend=ensemble_backend,
        dataset_name="TEST",
        output_type=MULTICLASS,
        task_type=TABULAR_CLASSIFICATION,
        metrics=[accuracy],
        opt_metric='accuracy',
        seed=0,  # important to find the test files
        ensemble_nbest=2,
        max_models_on_disc=None,
    )
    compare_read_preds(ensbuilder2.read_preds, ensbuilder.read_preds)
    compare_read_preds(ensbuilder2.read_scores, ensbuilder.read_scores)
    assert ensbuilder2.last_hash == ensbuilder.last_hash
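
compare_read_preds is a helper defined in the test module and not shown above. A minimal sketch of what such a comparison could look like, assuming nested dicts whose leaves are numpy arrays or plain scalars:

import numpy as np

def compare_read_preds(first, second):
    """Hypothetical stand-in for the test module's helper."""
    assert first.keys() == second.keys()
    for key, entry in first.items():
        for subkey, value in entry.items():
            other = second[key][subkey]
            if isinstance(value, np.ndarray):
                np.testing.assert_array_almost_equal(value, other)
            else:
                assert value == other, (key, subkey)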
Example 9
def testEntireEnsembleBuilder(ensemble_backend):

    ensbuilder = EnsembleBuilder(
        backend=ensemble_backend,
        dataset_name="TEST",
        output_type=BINARY,
        task_type=TABULAR_CLASSIFICATION,
        metrics=[accuracy],
        opt_metric='accuracy',
        seed=0,  # important to find the test files
        ensemble_nbest=2,
    )
    ensbuilder.SAVE2DISC = False

    ensbuilder.score_ensemble_preds()

    d2 = os.path.join(
        ensemble_backend.temporary_directory,
        ".autoPyTorch/runs/0_2_0.0/predictions_ensemble_0_2_0.0.npy")

    sel_keys = ensbuilder.get_n_best_preds()
    assert len(sel_keys) > 0

    ensemble = ensbuilder.fit_ensemble(selected_keys=sel_keys)

    n_sel_test = ensbuilder.get_test_preds(selected_keys=sel_keys)

    # test prediction files are available for the selected keys
    assert len(n_sel_test) > 0

    y_test = ensbuilder.predict(
        set_="test",
        ensemble=ensemble,
        selected_keys=n_sel_test,
        n_preds=len(sel_keys),
        index_run=1,
    )

    # since d2 provides perfect predictions
    # it should get a higher weight
    # so that y_test should be exactly y_test_d2
    y_test_d2 = ensbuilder.read_preds[d2][Y_TEST][:, 1]
    np.testing.assert_array_almost_equal(y_test, y_test_d2)
Example 10
def testGetTestPreds(ensemble_backend):

    ensbuilder = EnsembleBuilder(
        backend=ensemble_backend,
        dataset_name="TEST",
        output_type=BINARY,
        task_type=TABULAR_CLASSIFICATION,
        metrics=[accuracy],
        opt_metric='accuracy',
        seed=0,  # important to find the test files
        ensemble_nbest=1)

    ensbuilder.score_ensemble_preds()

    d1 = os.path.join(
        ensemble_backend.temporary_directory,
        ".autoPyTorch/runs/0_1_0.0/predictions_ensemble_0_1_0.0.npy")
    d2 = os.path.join(
        ensemble_backend.temporary_directory,
        ".autoPyTorch/runs/0_2_0.0/predictions_ensemble_0_2_0.0.npy")
    d3 = os.path.join(
        ensemble_backend.temporary_directory,
        ".autoPyTorch/runs/0_3_100.0/predictions_ensemble_0_3_100.0.npy")

    sel_keys = ensbuilder.get_n_best_preds()
    assert len(sel_keys) == 1
    ensbuilder.get_test_preds(selected_keys=sel_keys)

    # Three prediction files should have been read, and
    # predictions_ensemble_0_4_0.0.npy must not be among them
    assert len(ensbuilder.read_preds) == 3
    assert os.path.join(
        ensemble_backend.temporary_directory,
        ".autoPyTorch/runs/0_4_0.0/predictions_ensemble_0_4_0.0.npy"
    ) not in ensbuilder.read_preds

    # not selected --> should still be None
    assert ensbuilder.read_preds[d1][Y_TEST] is None
    assert ensbuilder.read_preds[d3][Y_TEST] is None

    # selected --> test predictions were read
    assert ensbuilder.read_preds[d2][Y_TEST] is not None
Example 11
def testFallBackNBest(ensemble_backend):

    ensbuilder = EnsembleBuilder(
        backend=ensemble_backend,
        dataset_name="TEST",
        output_type=BINARY,
        task_type=TABULAR_CLASSIFICATION,
        metrics=[accuracy],
        opt_metric='accuracy',
        seed=0,  # important to find the test files
        ensemble_nbest=1)

    ensbuilder.score_ensemble_preds()

    filename = os.path.join(
        ensemble_backend.temporary_directory,
        ".autoPyTorch/runs/0_2_0.0/predictions_ensemble_0_2_0.0.npy")
    ensbuilder.read_scores[filename]["ens_score"] = -1

    filename = os.path.join(
        ensemble_backend.temporary_directory,
        ".autoPyTorch/runs/0_3_100.0/predictions_ensemble_0_3_100.0.npy")
    ensbuilder.read_scores[filename]["ens_score"] = -1

    filename = os.path.join(
        ensemble_backend.temporary_directory,
        ".autoPyTorch/runs/0_1_0.0/predictions_ensemble_0_1_0.0.npy")
    ensbuilder.read_scores[filename]["ens_score"] = -1

    sel_keys = ensbuilder.get_n_best_preds()

    fixture = os.path.join(
        ensemble_backend.temporary_directory,
        ".autoPyTorch/runs/0_1_0.0/predictions_ensemble_0_1_0.0.npy")
    assert len(sel_keys) == 1
    assert sel_keys[0] == fixture