コード例 #1
0
ファイル: test_tabular.py プロジェクト: taesup-aws/autogluon
def test_advanced_functionality():
    """Exercise advanced ``TabularPredictor`` features end to end.

    Loads the AdultIncomeBinaryClassification dataset, keeps the first 100
    rows of the train and test splits, fits a predictor into a freshly
    cleared output directory, and then checks in order:

    * plain and extended leaderboards agree on models and columns,
    * feature importance covers every non-label input column,
    * ``transform_features`` and ``info`` run without raising,
    * persisting / unpersisting models round-trips cleanly,
    * saving and re-loading the predictor yields no persisted models,
    * ``refit_full`` doubles the model count once and is a no-op afterwards,
    * ``delete_models`` only deletes when ``dry_run=False``,
    * ``predict`` raises once every model has been deleted.

    Raises:
        AssertionError: if any of the checks above fails.
    """
    fast_benchmark = True
    dataset = {
        'url': 'https://autogluon.s3.amazonaws.com/datasets/AdultIncomeBinaryClassification.zip',
        'name': 'AdultIncomeBinaryClassification',
        'problem_type': BINARY,
    }
    label = 'class'
    directory_prefix = './datasets/'
    train_file = 'train_data.csv'
    test_file = 'test_data.csv'
    train_data, test_data = load_data(
        directory_prefix=directory_prefix,
        train_file=train_file,
        test_file=test_file,
        name=dataset['name'],
        url=dataset['url'],
    )
    if fast_benchmark:  # subsample for fast_benchmark
        subsample_size = 100
        train_data = train_data.head(subsample_size)
        test_data = test_data.head(subsample_size)
    print(f"Evaluating Advanced Functionality on Benchmark Dataset {dataset['name']}")
    directory = directory_prefix + 'advanced/' + dataset['name'] + "/"
    savedir = directory + 'AutogluonOutput/'
    # Delete AutoGluon output directory to ensure previous runs' information has been removed.
    shutil.rmtree(savedir, ignore_errors=True)
    predictor = TabularPredictor(label=label, path=savedir).fit(train_data)

    # --- Leaderboards -------------------------------------------------------
    leaderboard = predictor.leaderboard(data=test_data)
    extra_metrics = ['accuracy', 'roc_auc', 'log_loss']
    leaderboard_extra = predictor.leaderboard(data=test_data, extra_info=True, extra_metrics=extra_metrics)
    assert set(predictor.get_model_names()) == set(leaderboard['model'])
    assert set(predictor.get_model_names()) == set(leaderboard_extra['model'])
    assert set(leaderboard_extra.columns).issuperset(set(leaderboard.columns))
    assert len(leaderboard) == len(leaderboard_extra)
    assert set(leaderboard_extra.columns).issuperset(set(extra_metrics))  # Assert that extra_metrics are present in output
    num_models = len(predictor.get_model_names())

    # --- Feature importance / feature transformation ------------------------
    feature_importances = predictor.feature_importance(data=test_data)
    original_features = set(train_data.columns)
    original_features.remove(label)
    assert set(feature_importances.index) == original_features
    assert set(feature_importances.columns) == {'importance', 'stddev', 'p_value', 'n', 'p99_high', 'p99_low'}
    predictor.transform_features()
    predictor.transform_features(data=test_data)
    predictor.info()

    # --- Persisting models --------------------------------------------------
    assert predictor.get_model_names_persisted() == []  # Assert that no models were persisted during training
    assert predictor.unpersist_models() == []  # Assert that no models were unpersisted

    persisted_models = predictor.persist_models(models='all', max_memory=None)
    assert set(predictor.get_model_names_persisted()) == set(persisted_models)  # Ensure all models are persisted
    assert predictor.persist_models(models='all', max_memory=None) == []  # Ensure that no additional models are persisted on repeated calls
    unpersisted_models = predictor.unpersist_models()
    assert set(unpersisted_models) == set(persisted_models)
    assert predictor.get_model_names_persisted() == []  # Assert that all models were unpersisted

    # Persisting unknown model names is expected to raise.
    with pytest.raises(NetworkXError):
        predictor.persist_models(models=['UNKNOWN_MODEL_1', 'UNKNOWN_MODEL_2'])

    # The failed call above must not have persisted anything.
    assert predictor.get_model_names_persisted() == []

    # Unpersisting unknown model names is expected to return an empty list.
    assert predictor.unpersist_models(models=['UNKNOWN_MODEL_1', 'UNKNOWN_MODEL_2']) == []

    # --- Save / load --------------------------------------------------------
    predictor.persist_models(models='all', max_memory=None)
    predictor.save()  # Save predictor while models are persisted: Intended functionality is that they won't be persisted when loaded.
    predictor_loaded = TabularPredictor.load(predictor.path)  # Assert that predictor loading works
    leaderboard_loaded = predictor_loaded.leaderboard(data=test_data)
    assert len(leaderboard) == len(leaderboard_loaded)
    assert predictor_loaded.get_model_names_persisted() == []  # Assert that models were not still persisted after loading predictor

    # --- refit_full ---------------------------------------------------------
    assert predictor.get_model_full_dict() == {}
    predictor.refit_full()
    assert len(predictor.get_model_full_dict()) == num_models
    assert len(predictor.get_model_names()) == num_models * 2
    for model in predictor.get_model_names():
        predictor.predict(data=test_data, model=model)
    predictor.refit_full()  # Confirm that refit_models aren't further refit.
    assert len(predictor.get_model_full_dict()) == num_models
    assert len(predictor.get_model_names()) == num_models * 2

    # --- delete_models ------------------------------------------------------
    predictor.delete_models(models_to_keep=[])  # Test that the default (dry-run) call doesn't delete models
    assert len(predictor.get_model_names()) == num_models * 2
    predictor.predict(data=test_data)
    predictor.delete_models(models_to_keep=[], dry_run=False)  # Test that dry_run=False actually deletes models
    assert len(predictor.get_model_names()) == 0
    assert len(predictor.leaderboard()) == 0
    assert len(predictor.leaderboard(extra_info=True)) == 0
    try:
        predictor.predict(data=test_data)
    except Exception:
        # Expected: with no models left, prediction must fail. Catch only
        # Exception so KeyboardInterrupt / SystemExit still propagate instead
        # of being mistaken for the expected failure.
        pass
    else:
        raise AssertionError('predictor.predict should raise exception after all models are deleted')
    print('Tabular Advanced Functionality Test Succeeded.')
##################################
# Fitting with the new Predictor #
##################################

# NOTE(review): `label`, `eval_metric`, `hyperparameters`, `train_data` and
# `test_data` are not defined in this part of the file; they must already be
# bound at module level before this section runs.
# Fit a predictor with the caller-supplied metric and hyperparameters, using
# num_bag_folds=2, then show its leaderboard on the test data.
predictor2 = TabularPredictor(label, eval_metric=eval_metric).fit(train_data, hyperparameters=hyperparameters, num_bag_folds=2)
predictor2.leaderboard(test_data)

####################################
# Advanced fit_extra functionality #
####################################

# The fit_extra calls below build on each other, so their order matters:
# each later call looks up base models produced by the earlier ones.

# Fit extra models at level 0, with 30 second time limit
hyperparameters_extra1 = {'GBM': {}, 'NN': {}}
predictor2.fit_extra(hyperparameters_extra1, time_limit=30)

# Fit new level 1 stacker models that use the level 0 models from the original fit and the previous fit_extra call as base models
hyperparameters_extra2 = {'CAT': {}, 'NN': {}}
# Collect the names of the level-0 models in the 'core' stack to use as base models.
base_model_names = predictor2.get_model_names(stack_name='core', level=0)
predictor2.fit_extra(hyperparameters_extra2, base_model_names=base_model_names)

# Fit a new 3-layer stack ensemble on top of level 1 stacker models
# Integer keys presumably select the layer within this new stack (0 = first
# new layer) — TODO confirm against the fit_extra documentation.
hyperparameters_extra3 = {
    0: {'XT': {}},
    1: {'NN': {}, 'RF': {}},
    2: {'XGB': {}, 'custom': ['GBM']}
}
# Re-query base models, this time the level-1 models of the 'core' stack.
base_model_names = predictor2.get_model_names(stack_name='core', level=1)
predictor2.fit_extra(hyperparameters_extra3, base_model_names=base_model_names)

# Show the final leaderboard, now including every model added via fit_extra.
predictor2.leaderboard(test_data)