Example No. 1
import time
import pickle

import pandas as pd
import xgboost

from dabl.search import GridSuccessiveHalving


def get_xgbooster(X, Y, name, params, calculate):
    """Fit (or reload) a successive-halving search over an XGBoost model.

    When `calculate` is True the search is run and pickled to
    `name + '.pickle'`; otherwise the pickled search is loaded and returned.
    """
    if calculate:
        if len(Y.unique()) == 2:
            xgb = xgboost.XGBClassifier(learning_rate=0.02,
                                        max_depth=5,
                                        n_estimators=400,
                                        objective='binary:logistic',
                                        silent=True,
                                        nthread=1)
            t0 = time.time()
            sh = GridSuccessiveHalving(xgb,
                                       params,
                                       cv=5,
                                       ratio=2,
                                       force_exhaust_budget=True,
                                       scoring='roc_auc',
                                       verbose=3,
                                       n_jobs=6).fit(X, Y)
            t1 = time.time()
        else:
            xgb = xgboost.XGBRegressor(learning_rate=0.02,
                                       max_depth=5,
                                       n_estimators=400,
                                       objective='reg:linear',
                                       silent=True,
                                       nthread=1)
            t0 = time.time()
            sh = GridSuccessiveHalving(xgb,
                                       params,
                                       cv=5,
                                       ratio=2,
                                       force_exhaust_budget=True,
                                       scoring='neg_mean_squared_error',
                                       verbose=3,
                                       n_jobs=6).fit(X, Y)
            t1 = time.time()

        # Report the elapsed time of the halving search
        print(t1 - t0)

        results = pd.DataFrame.from_dict(sh.cv_results_)
        print(results.groupby('iter').r_i.unique())  # resources per iteration
        results[['r_i'] + ['param_' + k for k in params.keys()] +
                ['mean_test_score']].to_csv(name + '.csv')

        with open(name + '.pickle', 'wb') as f:
            pickle.dump(sh, f, protocol=pickle.HIGHEST_PROTOCOL)
    else:
        with open(name + '.pickle', 'rb') as f:
            sh = pickle.load(f)
    return sh
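
A minimal usage sketch for the helper above; the synthetic data, the file name 'xgb_search', and the small grid are hypothetical stand-ins for the real inputs:

import pandas as pd
from sklearn.datasets import make_classification

# Hypothetical binary-target data standing in for the real X and Y.
data, target = make_classification(n_samples=500, random_state=0)
X, Y = pd.DataFrame(data), pd.Series(target)

grid = {'max_depth': [4, 5, 6]}

# First call runs the halving search and caches it to 'xgb_search.pickle';
# a later call with calculate=False reloads the cached search object.
sh = get_xgbooster(X, Y, 'xgb_search', grid, calculate=True)
print(sh.best_params_)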
Example No. 2
def test_n_iterations(max_budget, n_iterations, n_possible_iterations):
    # Test the number of actual iterations that were run depending on
    # max_budget.

    n_samples = 1024
    X, y = make_classification(n_samples=n_samples, random_state=1)
    parameters = {'a': [1, 2], 'b': list(range(10))}
    base_estimator = FastClassifier()
    ratio = 2

    sh = GridSuccessiveHalving(base_estimator, parameters, cv=2, ratio=ratio,
                               max_budget=max_budget, r_min=4)
    sh.fit(X, y)
    assert sh.n_required_iterations_ == 5
    assert sh.n_iterations_ == n_iterations
    assert sh.n_possible_iterations_ == n_possible_iterations
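
The expected counts follow directly from the halving arithmetic; a quick back-of-envelope check (my own illustration, not library code):

import math

n_candidates = 2 * 10  # size of the grid {'a': [1, 2], 'b': range(10)}
ratio = 2

# Rounds needed to whittle 20 candidates down to 1, dividing the field
# by `ratio` after each round: ceil(log2(20)) == 5, which is what
# n_required_iterations_ asserts above.
print(math.ceil(math.log(n_candidates, ratio)))  # 5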
Example No. 3
def test_force_exhaust_budget_true(max_budget, r_i_list):
    # Test the force_exhaust_budget parameter when it's True.
    # In this case r_min is raised so that the last iteration uses as much
    # budget as possible.

    n_samples = 1000
    X, y = make_classification(n_samples=n_samples, random_state=0)
    parameters = {'a': [1, 2], 'b': [1, 2, 3]}
    base_estimator = FastClassifier()
    ratio = 3
    sh = GridSuccessiveHalving(base_estimator, parameters, cv=5,
                               force_exhaust_budget=True, ratio=ratio,
                               max_budget=max_budget)
    sh.fit(X, y)

    assert sh.n_possible_iterations_ == sh.n_iterations_ == len(sh._r_i_list)
    assert sh._r_i_list == r_i_list

    # Test same for randomized search
    sh = RandomSuccessiveHalving(base_estimator, parameters, n_candidates=6,
                                 cv=5, force_exhaust_budget=True,
                                 ratio=ratio, max_budget=max_budget)
    sh.fit(X, y)

    assert sh.n_possible_iterations_ == sh.n_iterations_ == len(sh._r_i_list)
    assert sh._r_i_list == r_i_list
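
As the comment above says, force_exhaust_budget=True raises r_min so that the last round consumes as much of max_budget as possible. A rough sketch of that adjustment, assuming resources grow by a factor of ratio per round:

def exhausting_r_min(max_budget, ratio, n_iterations):
    # Pick r_min so that r_min * ratio**(n_iterations - 1) lands as close
    # to max_budget as possible without exceeding it.
    return max_budget // ratio ** (n_iterations - 1)

# E.g. with max_budget=1000, ratio=3 and 3 rounds, r_min becomes 111 and
# the schedule [111, 333, 999] nearly exhausts the budget.
r_min = exhausting_r_min(1000, 3, 3)
print([r_min * 3 ** i for i in range(3)])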
Example No. 4

import time

import pandas as pd
import xgboost as xgb

from dabl.search import GridSuccessiveHalving

params = {
    'max_depth': [5, 6],
    'n_estimators': range(400, 700, 50)
}


clf = xgb.XGBClassifier(learning_rate=0.02, n_estimators=600,
                        objective='binary:logistic', silent=True, nthread=1)

# Halving

t0 = time.time()
sh = GridSuccessiveHalving(clf, params, cv=5,
                           ratio=2, force_exhaust_budget=True,
                           scoring='roc_auc', verbose=3, n_jobs=6
                           ).fit(data_X, data_Y)
t1 = time.time()
print(t1 - t0)
results = pd.DataFrame.from_dict(sh.cv_results_)
print(results.groupby('iter').r_i.unique())  # resources per iteration
print(results[['r_i', 'param_max_depth', 'param_n_estimators',
               'mean_test_score']])

# random_search = GridSearchCV(xgb, param_grid=params,  scoring='roc_auc', n_jobs=6, cv=5, verbose=3)
# random_search.fit(data_X, data_Y)

# # Correls
# data_f = data_2.select_dtypes(include=['float64'])
#
# plt.figure(figsize=(10, 10))
# plt.matshow(data_f.corr(), fignum=1)
Example No. 5
def test_aggressive_elimination():
    # Test the aggressive_elimination parameter.

    n_samples = 1000
    X, y = make_classification(n_samples=n_samples, random_state=0)
    parameters = {'a': ('l1', 'l2'), 'b': list(range(30))}
    base_estimator = FastClassifier()
    ratio = 3

    # aggressive_elimination is only really relevant when there is not enough
    # budget.
    max_budget = 180

    # aggressive_elimination=True
    # In this case, the first iterations only use r_min_ resources
    sh = GridSuccessiveHalving(base_estimator, parameters, cv=5,
                               aggressive_elimination=True,
                               max_budget=max_budget, ratio=ratio)
    sh.fit(X, y)
    assert sh.n_iterations_ == 4
    assert sh.n_required_iterations_ == 4
    assert sh.n_possible_iterations_ == 3
    assert sh._r_i_list == [20, 20, 60, 180]  # see how it loops at the start
    assert sh.n_remaining_candidates_ == 1

    # Make sure we get the same results with randomized search
    sh = RandomSuccessiveHalving(base_estimator, parameters,
                                 n_candidates=60, cv=5,
                                 aggressive_elimination=True,
                                 max_budget=max_budget, ratio=ratio)
    sh.fit(X, y)
    assert sh.n_iterations_ == 4
    assert sh.n_required_iterations_ == 4
    assert sh.n_possible_iterations_ == 3
    assert sh._r_i_list == [20, 20, 60, 180]  # see how it loops at the start
    assert sh.n_remaining_candidates_ == 1

    # aggressive_elimination=False
    # In this case we don't loop at the start, and might end up with a lot of
    # candidates at the last iteration
    sh = GridSuccessiveHalving(base_estimator, parameters, cv=5,
                               aggressive_elimination=False,
                               max_budget=max_budget, ratio=ratio)
    sh.fit(X, y)

    assert sh.n_iterations_ == 3
    assert sh.n_required_iterations_ == 4
    assert sh.n_possible_iterations_ == 3
    assert sh._r_i_list == [20, 60, 180]
    assert sh.n_remaining_candidates_ == 3

    max_budget = n_samples
    # with enough budget, aggressive_elimination has no effect since it is not
    # needed

    # aggressive_elimination=True
    sh = GridSuccessiveHalving(base_estimator, parameters, cv=5,
                               aggressive_elimination=True,
                               max_budget=max_budget, ratio=ratio)
    sh.fit(X, y)

    assert sh.n_iterations_ == 4
    assert sh.n_required_iterations_ == 4
    assert sh.n_possible_iterations_ == 4
    assert sh._r_i_list == [20, 60, 180, 540]
    assert sh.n_remaining_candidates_ == 1

    # aggressive_elimination=False
    sh = GridSuccessiveHalving(base_estimator, parameters, cv=5,
                               aggressive_elimination=False,
                               max_budget=max_budget, ratio=ratio)
    sh.fit(X, y)

    assert sh.n_iterations_ == 4
    assert sh.n_required_iterations_ == 4
    assert sh.n_possible_iterations_ == 4
    assert sh._r_i_list == [20, 60, 180, 540]
    assert sh.n_remaining_candidates_ == 1
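
The asserted numbers can be reproduced by hand: 2 x 30 = 60 candidates with ratio=3 require ceil(log3(60)) = 4 rounds, but 20 * 3**i <= 180 allows only 3 distinct resource levels, so aggressive elimination replays r_min_ for the extra round. A sketch of that bookkeeping (my own illustration, not library code):

import math

n_candidates, ratio, r_min, max_budget = 60, 3, 20, 180

n_required = math.ceil(math.log(n_candidates, ratio))             # 4
n_possible = math.floor(math.log(max_budget / r_min, ratio)) + 1  # 3

# The first (n_required - n_possible) rounds re-use r_min, then the
# schedule grows geometrically as usual.
schedule = ([r_min] * (n_required - n_possible)
            + [r_min * ratio ** i for i in range(n_possible)])
print(schedule)  # [20, 20, 60, 180]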
Example No. 6
def test_budget_on():
    # Test the budget_on parameter

    n_samples = 1000
    X, y = make_classification(n_samples=n_samples, random_state=0)
    parameters = {'a': [1, 2], 'b': list(range(10))}
    base_estimator = FastClassifier()
    sh = GridSuccessiveHalving(base_estimator, parameters, cv=2,
                               budget_on='c', max_budget=10, ratio=3)
    sh.fit(X, y)
    assert set(sh._r_i_list) == {1, 3, 9}
    for r_i, params, param_c in zip(sh.cv_results_['r_i'],
                                    sh.cv_results_['params'],
                                    sh.cv_results_['param_c']):
        assert r_i == params['c'] == param_c

    with pytest.raises(
            ValueError,
            match='Cannot budget on parameter 1234 which is not supported '):
        sh = GridSuccessiveHalving(base_estimator, parameters, cv=2,
                                   budget_on='1234', max_budget=10)
        sh.fit(X, y)

    with pytest.raises(
            ValueError,
            match='Cannot budget on parameter c since it is part of the '
                  'searched parameters.'):
        parameters = {'a': [1, 2], 'b': [1, 2], 'c': [1, 3]}
        sh = GridSuccessiveHalving(base_estimator, parameters, cv=2,
                                   budget_on='c', max_budget=10)
        sh.fit(X, y)
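
With budget_on='c' the resource is the parameter c itself rather than n_samples, so the schedule starts at 1 and triples until it would pass max_budget=10; hence the asserted {1, 3, 9}. A quick illustration:

ratio, r, max_budget = 3, 1, 10

schedule = []
while r <= max_budget:
    schedule.append(r)
    r *= ratio
print(schedule)  # [1, 3, 9]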
Example No. 7
def test_force_exhaust_budget_false():
    # Test the force_exhaust_budget parameter when it's False or ignored.
    # This is the default case: the schedule starts at r_min_ no matter what,
    # since r_min_ is not overwritten.

    n_samples = 1000
    X, y = make_classification(n_samples=n_samples, random_state=0)
    parameters = {'a': [1, 2], 'b': [1, 2, 3]}
    base_estimator = FastClassifier()
    ratio = 3

    # with enough budget
    sh = GridSuccessiveHalving(base_estimator, parameters, cv=5,
                               force_exhaust_budget=False, ratio=ratio)
    sh.fit(X, y)
    assert sh.n_iterations_ == 2
    assert sh.n_required_iterations_ == 2
    assert sh.n_possible_iterations_ == 4
    assert sh._r_i_list == [20, 60]

    # with enough budget but r_min!='auto': ignored
    sh = GridSuccessiveHalving(base_estimator, parameters, cv=5,
                               force_exhaust_budget=False, ratio=ratio,
                               r_min=50)
    sh.fit(X, y)
    assert sh.n_iterations_ == 2
    assert sh.n_required_iterations_ == 2
    assert sh.n_possible_iterations_ == 3
    assert sh._r_i_list == [50, 150]

    # without enough budget (budget is exhausted anyway)
    sh = GridSuccessiveHalving(base_estimator, parameters, cv=5,
                               force_exhaust_budget=False, ratio=ratio,
                               max_budget=30)
    sh.fit(X, y)
    assert sh.n_iterations_ == 1
    assert sh.n_required_iterations_ == 2
    assert sh.n_possible_iterations_ == 1
    assert sh._r_i_list == [20]
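
These assertions also fall out of the arithmetic: 2 x 3 = 6 candidates with ratio=3 need ceil(log3(6)) = 2 rounds, and without force_exhaust_budget the schedule simply starts at r_min_ and stops early when max_budget runs out. For instance:

import math

n_candidates, ratio = 6, 3
n_required = math.ceil(math.log(n_candidates, ratio))  # 2

# Default r_min_ of 20 gives [20, 60]; r_min=50 gives [50, 150];
# with max_budget=30 only the first round (r_i=20) fits.
for r_min, max_budget in [(20, 1000), (50, 1000), (20, 30)]:
    print([r_min * ratio ** i for i in range(n_required)
           if r_min * ratio ** i <= max_budget])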
Example No. 8
import numpy as np

from time import time
from sklearn.datasets import load_digits
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.svm import SVC

from dabl.search import GridSuccessiveHalving

digits = load_digits()
scale = 16

X_train, X_test, y_train, y_test = train_test_split(digits.data / scale,
                                                    digits.target,
                                                    stratify=digits.target,
                                                    random_state=42)

param_grid = {'C': np.logspace(-3, 2, 6), 'gamma': np.logspace(-3, 2, 6)}

print("Parameter grid:")
print(param_grid)

sh = GridSuccessiveHalving(SVC(), param_grid, cv=5)
print("Start successive halving")
tick = time()
sh.fit(X_train, y_train)
print("Training Time Successive Halving", time() - tick)
print("Test Score Successive Halving: ", sh.score(X_test, y_test))
print("Parameters Successive Halving: ", sh.best_params_)

gs = GridSearchCV(SVC(), param_grid, cv=5)
print("Start Grid Search")
tick = time()
gs.fit(X_train, y_train)
print("Training Time Grid Search: ", time() - tick)
print("Test Score Grid Search: ", gs.score(X_test, y_test))
print("Parameters Grid Search: ", gs.best_params_)
Example No. 9
import numpy as np

from time import time
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

from dabl.search import GridSuccessiveHalving

data_train = fetch_20newsgroups(subset="train")
data_test = fetch_20newsgroups(subset="test")

pipe = Pipeline([('vect', CountVectorizer()), ('clf', LogisticRegression())])
param_grid = {
    'vect': [TfidfVectorizer(), CountVectorizer()],
    'clf__C': np.logspace(-3, 3, 7),
    'vect__ngram_range': [(1, 1), (1, 2), (1, 3)]}
print("Parameter grid:")
print(param_grid)

sh = GridSuccessiveHalving(pipe, param_grid, cv=5)
print("Start successive halving")
tick = time()
sh.fit(data_train.data, data_train.target)
print("Training Time Successive Halving", time() - tick)
print("Test Score Successive Halving: ",
      sh.score(data_test.data, data_test.target))
print("Parameters Successive Halving: ", sh.best_params_)

gs = GridSearchCV(pipe, param_grid, cv=5)
print("Start Grid Search")
tick = time()
gs.fit(data_train.data, data_train.target)
print("Training Time Grid Search: ", time() - tick)
print("Test Score Grid Search: ", gs.score(data_test.data, data_test.target))
print("Parameters Grid Search: ", gs.best_params_)