Example #1
def blending(data, folds):
    names = [
        "SVM", "LogisticRegression", "RandomForest", "XGBoost",
        "CatBoost", "LightGBM"
    ]
    classifiers = [
        svm.SVC(probability=True),
        linear_model.LogisticRegression(max_iter=10000),
        ensemble.RandomForestClassifier(),
        XGBClassifier(),
        CatBoostClassifier(verbose=0),
        LGBMClassifier(n_estimators=400, verbose=-1)  # 'silent' was removed in LightGBM 4.0
    ]
    parameters = [
        # SVM (RBF kernel)
        {
            'C': loguniform(1e0, 1e3),
            'gamma': loguniform(1e-4, 1e-3),
            'kernel': ['rbf']
        },
        # LogisticRegression
        {},
        # RandomForest
        {
            'bootstrap': [True, False],
            'max_depth': list(range(10, 50)),
            'max_features': ['sqrt', 'log2'],  # 'auto' was removed in scikit-learn 1.3
            'min_samples_leaf': list(range(1, 5)),
            'min_samples_split': list(range(2, 10)),
            'n_estimators': list(range(100, 500, 50))
        },
        # XGBoost
        {
            'min_child_weight': [1, 5, 10],
            'gamma': loguniform(1e-4, 1e-3),
            'subsample': list(np.linspace(0.5, 1, 100)),
            'colsample_bytree': list(np.linspace(0.6, 1, 10)),
            'max_depth': list(range(3, 11)),
            'n_estimators': list(range(100, 500, 50))
        },
        # CatBoost
        {
            'max_depth': list(range(4, 11)),
            'iterations': list(range(10, 100))
        },
        # LightGBM
        {}
    ]
    for name, classifier, param_dist in zip(names, classifiers, parameters):
        print(name)
        print("-" * 50)
        train_data = data.copy()
        n_iter_search = 30
        rs = RandomizedSearchCV(classifier,
                                param_distributions=param_dist,
                                n_iter=n_iter_search,
                                n_jobs=-1)
        best_clf = run_training(train_data, rs, name, folds)
        print("\n")
        joblib.dump(best_clf, f"./model/{name}.bin", compress=5)
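
The `run_training` helper used above is not part of the example. A minimal sketch of what it might look like, assuming the upstream code follows the common pattern of a pandas DataFrame carrying `kfold` and `target` columns (both names are assumptions, not from the source):

# Hypothetical sketch of the undefined run_training helper: fit the search
# object per fold and return the last best estimator found.
from sklearn import metrics

def run_training(train_data, search, name, folds):
    best_clf = None
    for fold in range(folds):
        train = train_data[train_data.kfold != fold]
        valid = train_data[train_data.kfold == fold]
        x_train = train.drop(columns=["target", "kfold"])
        x_valid = valid.drop(columns=["target", "kfold"])
        search.fit(x_train, train.target.values)
        preds = search.predict(x_valid)
        acc = metrics.accuracy_score(valid.target.values, preds)
        print(f"{name} fold={fold} accuracy={acc:.4f}")
        best_clf = search.best_estimator_
    return best_clf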
Example #2
def forest():
    vect_and_clf = Pipeline([('vect', TfidfVectorizer(min_df=5)),
                             ('clf', RandomForestClassifier(random_state=0))])

    param_dist = {
        'clf__n_estimators':
        np.array(np.power(10, np.arange(1, 3, step=0.1)), dtype=int),
        'clf__criterion': ['gini', 'entropy'],
        'clf__max_depth':
        np.arange(2, 50, dtype=int).tolist() + [None],
        'clf__min_samples_split':
        loguniform(1e-4, 1e-1),
        'clf__min_samples_leaf':
        stats.randint(1, 200),
        'clf__max_features':
        np.linspace(0.01, 1, num=10, dtype=float).tolist() +
        ['sqrt', 'log2', None],  # 'auto' was removed in scikit-learn 1.3
        'clf__max_leaf_nodes': [None] +
        np.array(np.power(10, np.arange(1, 4, step=0.5)), dtype=int).tolist(),
        'clf__min_impurity_decrease': [0.0] + np.array(
            np.power(10, np.arange(-10, -4, step=0.5)), dtype=float).tolist(),
        'clf__bootstrap': [True, False],
        # oob_score=True requires bootstrap=True; such draws fail and are scored
        # with RandomizedSearchCV's error_score (NaN by default)
        'clf__oob_score': [True, False],
        'clf__warm_start': [True, False],
        'clf__max_samples':
        stats.uniform(0, 1),
        'clf__class_weight': [None, 'balanced', 'balanced_subsample'],
        'clf__verbose': [True, False],
        'clf__min_weight_fraction_leaf':
        loguniform(1e-4, 1e-1)
    }

    return vect_and_clf, param_dist
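
The function above only builds the pipeline and its search space. A minimal usage sketch, where `docs` and `labels` stand in for the caller's text corpus and targets (they are not part of the example):

# Usage sketch for the factory above; docs/labels are assumed placeholders.
from sklearn.model_selection import RandomizedSearchCV

pipeline, param_dist = forest()
search = RandomizedSearchCV(pipeline,
                            param_distributions=param_dist,
                            n_iter=50,
                            cv=5,
                            n_jobs=-1,
                            random_state=0)
search.fit(docs, labels)
print(search.best_params_, search.best_score_)

The same pattern applies to the other factory-style examples below that return a (pipeline, param_dist) pair.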
Example #3
def LR(dataset):

    vect_and_clf = Pipeline([('vect', TfidfVectorizer()),
                             ('clf', LogisticRegression(random_state=0))])
    param_dist = {
        # incompatible penalty/solver/dual combinations fail and score NaN
        'clf__penalty': ['l1', 'l2', 'elasticnet', None],  # 'none' (string) was removed in scikit-learn 1.4
        'clf__dual': [True, False],
        'clf__C': loguniform(1e-3, 1e3),
        'clf__tol': loguniform(1e-11, 1e-4),
        'clf__fit_intercept': [True, False],
        'clf__solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
        'clf__max_iter': stats.randint(50, 200),
        'clf__warm_start': [True, False],
        'clf__multi_class': ['auto', 'ovr', 'multinomial'],
        'clf__l1_ratio': stats.uniform(0, 1),
        'vect__min_df': loguniform(1e-4, 1e-2),
        'vect__max_df': np.linspace(0.5, 0.9, num=10, dtype=float),
        'vect__stop_words': [None, 'english'],
        'vect__token_pattern': [r'\w{2,}', r'\w{1,}'],
        'vect__ngram_range': [(1, 2), (1, 1)]
    }

    n_iter_search = 200
    random_search = RandomizedSearchCV(vect_and_clf,
                                       param_distributions=param_dist,
                                       n_iter=n_iter_search,
                                       cv=5,
                                       n_jobs=-1)

    train_data, test_data, train_label, test_label = get_dataset()
    start = time()
    random_search.fit(train_data, train_label)
    print("RandomizedSearchCV took %.2f seconds" % ((time() - start)))
    results = random_search.cv_results_
    candidates = np.flatnonzero(results['rank_test_score'] == 1)
    with open('/Users/tianchima/Desktop/Trial1/LR.txt', 'w') as f:
        for candidate in candidates:
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                results['mean_test_score'][candidate],
                results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            f.write('Mean validation score: ' +
                    str(results['mean_test_score'][candidate]) + '\n')
            f.write('std: ' + str(results['std_test_score'][candidate]) + '\n')
            f.write('Parameters: ' + str(results['params'][candidate]) + '\n')
        test_label_predict = random_search.best_estimator_.predict(test_data)
        accuracy = accuracy_score(test_label, test_label_predict)
        print(results)
        print('accuracy = ', accuracy)

        f.write('accuracy = ' + str(accuracy) + '\n' + '\n')
        f.write(str(random_search.best_params_) + '\n' + '\n')
        f.write(str(results))

    return random_search.best_estimator_
Example #4
def RandomF(dataset):
    vect_and_clf = Pipeline([('vect', TfidfVectorizer()), ('clf', RandomForestClassifier(random_state=0))])
    param_dist = {'clf__n_estimators': np.array(np.power(10, np.arange(1, 3, step=0.1)), dtype=int),
                  'clf__criterion': ['gini', 'entropy'],
                  'clf__max_depth': np.arange(2, 50, dtype=int).tolist() + [None],
                  'clf__min_samples_split': loguniform(1e-4, 1e-1),
                  'clf__min_samples_leaf': stats.randint(1, 200),
                  'clf__max_features': np.linspace(0.01, 1, num=10, dtype=float).tolist() +
                                       ['sqrt', 'log2', None],  # 'auto' was removed in scikit-learn 1.3
                  'clf__max_leaf_nodes': [None] + np.power(10, np.arange(1, 4, step=0.5)).astype(int).tolist(),  # np.int was removed in NumPy 1.24
                  'clf__min_impurity_decrease': [0.0] + np.array(np.power(10, np.arange(-10, -4, step=0.5)),
                                                                 dtype=float).tolist(),
                  'clf__bootstrap': [True, False],
                  'clf__oob_score': [True, False],
                  'clf__warm_start': [True, False],
                  'clf__max_samples': stats.uniform(0, 1),
                  'clf__class_weight': [None, 'balanced', 'balanced_subsample'],
                  'clf__verbose': [True, False],
                  'clf__min_weight_fraction_leaf': loguniform(1e-4, 1e-1),
                  'vect__min_df': loguniform(1e-4, 1e-2),
                  'vect__max_df': np.linspace(0.5, 0.9, num=10, dtype=float),
                  'vect__stop_words': [None, 'english'],
                  'vect__token_pattern': [r'\w{2,}', r'\w{1,}'],
                  'vect__ngram_range': [(1, 2), (1, 1)]}

    n_iter_search = 200
    random_search = RandomizedSearchCV(vect_and_clf, param_distributions=param_dist, n_iter=n_iter_search, cv=5,
                                       n_jobs=-1)

    train_data, test_data, train_label, test_label = get_dataset()
    start = time()
    random_search.fit(train_data, train_label)
    print("RandomizedSearchCV took %.2f seconds" % ((time() - start)))
    results = random_search.cv_results_
    candidates = np.flatnonzero(results['rank_test_score'] == 1)
    with open('/Users/tianchima/Desktop/Trial2/RandomF.txt', 'w') as f:
        for candidate in candidates:
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(results['mean_test_score'][candidate],
                                                                         results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            f.write('Mean validation score: ' + str(results['mean_test_score'][candidate]) + '\n')
            f.write('std: ' + str(results['std_test_score'][candidate]) + '\n')
            f.write('Parameters: ' + str(results['params'][candidate]) + '\n')
        test_label_predict = random_search.best_estimator_.predict(test_data)
        accuracy = accuracy_score(test_label, test_label_predict)
        print(results)
        print('accuracy = ', accuracy)

        f.write('accuracy = ' + str(accuracy) + '\n' + '\n')
        f.write(str(random_search.best_params_) + '\n' + '\n')
        f.write(str(results))

    return random_search.best_estimator_
Example #5
def svm():
    vect_and_clf = Pipeline([('vect', TfidfVectorizer(min_df=5)),
                             ('clf', LinearSVC(random_state=0, verbose=10))])

    param_dist = {
        'clf__dual': [True, False],
        'clf__loss': ['hinge', 'squared_hinge'],  # loss='hinge' with dual=False is unsupported and scores NaN
        'clf__C': loguniform(1e-3, 1e3),
        'clf__tol': loguniform(1e-11, 1e-4),
        'clf__fit_intercept': [True, False]
    }

    return vect_and_clf, param_dist
Example #6
def est_ET():
    hp = [{
        'n_estimators': (1, 100),
        'min_weight_fraction_leaf': (0.0, 0.25, 0.5),
        # 1.0 replaces 'auto' (all features), which was removed in scikit-learn 1.3
        'max_features': ('sqrt', 'log2', 1.0, None),
        # as a float, max_samples must lie in (0, 1] and only applies when bootstrap=True
        'max_samples': loguniform(0.1, 1.0),
        'bootstrap': (True, False),
        'oob_score': (True, False),
        'warm_start': (True, False),
        # 'mse'/'mae' were renamed in scikit-learn 1.0 and removed in 1.2
        'criterion': ('squared_error', 'absolute_error'),
        'max_depth': (1, 10, 100, None),
        'max_leaf_nodes': (2, 100),
        'min_samples_split': (10,),
        # loguniform draws floats, which are invalid here; use integer values
        'min_samples_leaf': (1, 10, 100),
    }]
    est = ensemble.ExtraTreesRegressor()
    #regr = MultiOutputRegressor(estimator=est)
    return est, hp
Example #7
def SVM(x_train, y_train, x_test, y_test):
    params = {
        'C': loguniform(1e0, 1e3),
        'gamma': loguniform(1e-4, 1e-3),
        'kernel': ['rbf', 'linear'],
        'class_weight': ['balanced', None]
    }
    svm = SVC()
    # GridSearchCV cannot sample continuous distributions such as loguniform;
    # a randomized search is needed here
    clf = RandomizedSearchCV(svm, param_distributions=params)
    clf.fit(x_train, y_train)
    svm_predictions = clf.predict(x_test)
    accuracy = accuracy_score(y_test, svm_predictions)
    print("Accuracy: ", accuracy)
    print(clf.best_params_)  # get_params without parentheses only printed a bound method
    return accuracy
Example #8
def Bag(dataset, estimator1, estimator2, estimator3, estimator4, estimator5):
    vect_and_clf = Pipeline([('vect', TfidfVectorizer()),
                             ('clf', BaggingClassifier(random_state=0))])
    param_dist = {
        'clf__base_estimator': [  # renamed to 'estimator' in scikit-learn 1.2
            None, estimator1, estimator2, estimator3, estimator4, estimator5,
            MultinomialNB()
        ],
        'clf__n_estimators':
        stats.randint(10, 400),
        'clf__max_features':
        stats.uniform(0, 1),
        'clf__max_samples':
        stats.uniform(0, 1),
        'clf__bootstrap': [True, False],
        'clf__bootstrap_features': [True, False],
        'clf__oob_score': [True, False],
        'clf__warm_start': [True, False],
        'vect__min_df':
        loguniform(1e-4, 1e-2),
        'vect__max_df':
        np.linspace(0.5, 0.9, num=10, dtype=float),
        'vect__stop_words': [None, 'english'],
        'vect__token_pattern': [r'\w{2,}', r'\w{1,}'],
        'vect__ngram_range': [(1, 2), (1, 1)]
    }

    n_iter_search = 200
    random_search = RandomizedSearchCV(vect_and_clf,
                                       param_distributions=param_dist,
                                       n_iter=n_iter_search,
                                       cv=5,
                                       n_jobs=-1)

    train_data, test_data, train_label, test_label = get_dataset()
    start = time()
    random_search.fit(train_data, train_label)
    print("RandomizedSearchCV took %.2f seconds" % ((time() - start)))
    results = random_search.cv_results_
    candidates = np.flatnonzero(results['rank_test_score'] == 1)
    with open('/Users/tianchima/Desktop/Trial2/Bag.txt', 'w') as f:
        for candidate in candidates:
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                results['mean_test_score'][candidate],
                results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            f.write('Mean validation score: ' +
                    str(results['mean_test_score'][candidate]) + '\n')
            f.write('std: ' + str(results['std_test_score'][candidate]) + '\n')
            f.write('Parameters: ' + str(results['params'][candidate]) + '\n')
        test_label_predict = random_search.best_estimator_.predict(test_data)
        accuracy = accuracy_score(test_label, test_label_predict)
        print(results)
        print('accuracy = ', accuracy)

        f.write('accuracy = ' + str(accuracy) + '\n' + '\n')
        f.write(str(random_search.best_params_) + '\n' + '\n')
        f.write(str(results))

    return random_search.best_estimator_
Example #9
def get_ensemble_model(params):
    """output a nonlinear XGBoost regressor with randomised parameter search over nested 5-fold CV
    params: dict, containing details on PCA if required

    returns:
    model: sklearn estimator
    """
    ss = StandardScaler()
    xgb_reg = xgb.XGBRegressor(objective="reg:squarederror", n_jobs=1, base_score=12,
                               learning_rate=0.05, random_state=42)

    if params['pca']:
        pca = PCA(n_components=params['pca_comps'], whiten=True)
        xgb_model = Pipeline(steps=[('scale', ss), ('pca', pca), ('model', xgb_reg)])  # pipeline
    else:
        xgb_model = Pipeline(steps=[('scale', ss), ('model', xgb_reg)])

    xgb_model_params = {
        "model__n_estimators": [100,250,500],
        "model__colsample_bytree": uniform(0.5, 0.5), # default 1
        "model__min_child_weight": randint(1,6),        #deafult 1
        "model__max_depth": randint(2, 5),            # default 3, 3-10 -
        "model__subsample": uniform(0.5, 0.5),        # default 1
        "model__reg_lambda": loguniform(1e1,1e2)       # l2 reg, default 1
    }

    # model: regressor with randomised parameter search over nested 5-fold CV (more iters to account for large space)
    ensemble_model = RandomizedSearchCV(xgb_model, xgb_model_params, n_iter=500, cv=5, verbose=1, n_jobs=5)

    return clone(ensemble_model)
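
The docstring speaks of nested CV, but only the inner randomised search is returned. The outer loop would typically wrap the cloned search object in `cross_val_score`, roughly:

# Sketch of the outer loop of nested CV; X and y are assumed feature/target arrays.
from sklearn.model_selection import cross_val_score

model = get_ensemble_model({'pca': False})
outer_scores = cross_val_score(model, X, y, cv=5)  # each outer fold refits the inner search
print(outer_scores.mean(), outer_scores.std())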
Example #10
def get_linear_model(params):
    """output a sparse linear regressor with randomised parameter search over nested 5-fold CV
    params: dict, containing details on PCA if required
    returns:
    model: sklearn estimator
    """

    ss = StandardScaler()
    lr = ElasticNet(selection='random', random_state=42)  # EN

    if params['pca']:
        pca = PCA(n_components=params['pca_comps'], whiten=True)
        lr_model = Pipeline(steps=[('scale', ss), ('pca', pca), ('model', lr)])  # pipeline
    else:
        lr_model = Pipeline(steps=[('scale', ss), ('model', lr)])  # pipeline

    lr_model_params = {
            'model__alpha': loguniform(1e-1, 1e3),
            'model__l1_ratio': uniform(0.1, .9)
    }

    # model: regressor with randomised parameter search over nested 5-fold CV
    linear_model = RandomizedSearchCV(lr_model, lr_model_params, n_iter=500, cv=5)

    return clone(linear_model)
Example #11
def get_ensemble_model():
    """output a nonlinear XGBoost classifier with randomised parameter search over nested 3-fold CV

    returns:
    model: sklearn estimator
    """
    ss = StandardScaler()
    xgb_clf = xgb.XGBClassifier(objective="binary:logistic", random_state=42)

    xgb_model = Pipeline(steps=[('scale', ss), ('clf', xgb_clf)])

    xgb_model_params = {
        "clf__colsample_bytree": uniform(0.5, 0.5),  # default 1
        "clf__gamma": loguniform(1e-1, 1e3),  # default 0
        "clf__learning_rate": uniform(0.03, 0.57),  # default 0.3
        "clf__max_depth": randint(2, 5),  # default 3
        "clf__n_estimators": randint(10, 50),  # default 100
        "clf__subsample": uniform(0.5, 0.25),  # default 1
        "clf__min_child_weight": randint(1, 8)  # default 1
    }

    # model: classifier with randomised parameter search over nested 3-fold CV (more iters to account for large space)
    ensemble_model = RandomizedSearchCV(xgb_model,
                                        xgb_model_params,
                                        n_iter=250,
                                        cv=3)

    return clone(ensemble_model)
Example #12
def lr():
    vect_and_clf = Pipeline([('vect', TfidfVectorizer(min_df=5)),
                             ('clf', LogisticRegression(random_state=0))])

    param_dist = {
        'clf__penalty': ['l1', 'l2', 'elasticnet', None],  # 'none' (string) was removed in scikit-learn 1.4
        'clf__dual': [True, False],
        'clf__C': loguniform(1e-3, 1e3),
        'clf__tol': loguniform(1e-11, 1e-4),
        'clf__fit_intercept': [True, False],
        'clf__solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
        'clf__max_iter': stats.randint(50, 200),
        'clf__warm_start': [True, False],
        'clf__multi_class': ['auto', 'ovr', 'multinomial'],
        'clf__l1_ratio': stats.uniform(0, 1)
    }

    return vect_and_clf, param_dist
Example #13
def test_loguniform(low, high, base):
    rv = loguniform(base**low, base**high)
    assert isinstance(rv, scipy.stats._distn_infrastructure.rv_frozen)
    rvs = rv.rvs(size=2000, random_state=0)

    # Test the basics; right bounds, right size
    assert (base**low <= rvs).all() and (rvs <= base**high).all()
    assert len(rvs) == 2000

    # Test that it's actually (fairly) uniform
    log_rvs = np.array([math.log(x, base) for x in rvs])
    counts, _ = np.histogram(log_rvs)
    assert counts.mean() == 200
    assert np.abs(counts - counts.mean()).max() <= 40

    # Test that random_state works
    assert loguniform(base**low, base**high).rvs(random_state=0) == loguniform(
        base**low, base**high).rvs(random_state=0)
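
What the test checks can be seen directly: `loguniform` samples uniformly over orders of magnitude rather than over the linear range, e.g.:

# Quick demonstration that loguniform is uniform in log space.
import numpy as np
from scipy.stats import loguniform  # sklearn.utils.fixes.loguniform was the pre-SciPy-1.4 backport

rvs = loguniform(1e-4, 1e0).rvs(size=10000, random_state=0)
counts, _ = np.histogram(np.log10(rvs), bins=4)  # one bin per decade
print(counts)  # roughly 2500 samples fall in each decade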
Example #14
def test_sparse_methods():
    c1 = [0.1, 0.2]
    c2 = [0.1, 0.2]
    param_grid = {"c": [c1, c2]}
    pmd_cv = GridSearchCV(PMD(random_state=rng),
                          param_grid=param_grid).fit([X, Y])
    cv_plot(pmd_cv.cv_results_)
    c1 = [5e-1]
    c2 = [1e-1]
    param_grid = {"c": [c1, c2]}
    scca_cv = GridSearchCV(SCCA(random_state=rng),
                           param_grid=param_grid).fit([X, Y])
    c1 = [1e-1]
    c2 = [1e-1]
    param_grid = {"c": [c1, c2]}
    parkhomenko_cv = GridSearchCV(ParkhomenkoCCA(random_state=rng),
                                  param_grid=param_grid).fit([X, Y])
    c1 = [2e-2]
    c2 = [1e-2]
    param_grid = {"c": [c1, c2]}
    admm_cv = GridSearchCV(SCCA_ADMM(random_state=rng),
                           param_grid=param_grid).fit([X, Y])
    c1 = loguniform(1e-1, 2e-1)
    c2 = loguniform(1e-1, 2e-1)
    param_grid = {"c": [c1, c2], "l1_ratio": [[0.9], [0.9]]}
    elastic_cv = RandomizedSearchCV(ElasticCCA(random_state=rng),
                                    param_distributions=param_grid,
                                    n_iter=4).fit([X, Y])
    assert (pmd_cv.best_estimator_.weights[0] == 0).sum() > 0
    assert (pmd_cv.best_estimator_.weights[1] == 0).sum() > 0
    assert (scca_cv.best_estimator_.weights[0] == 0).sum() > 0
    assert (scca_cv.best_estimator_.weights[1] == 0).sum() > 0
    assert (admm_cv.best_estimator_.weights[0] == 0).sum() > 0
    assert (admm_cv.best_estimator_.weights[1] == 0).sum() > 0
    assert (parkhomenko_cv.best_estimator_.weights[0] == 0).sum() > 0
    assert (parkhomenko_cv.best_estimator_.weights[1] == 0).sum() > 0
    assert (elastic_cv.best_estimator_.weights[0] == 0).sum() > 0
    assert (elastic_cv.best_estimator_.weights[1] == 0).sum() > 0
Example #15
def get_nonlinear_model():
    """output a nonlinear SVM classifier with randomised parameter search over nested 3-fold CV

    returns:
    model: sklearn estimator
    """
    ss = StandardScaler()
    svm = SVC(kernel='rbf', probability=True, random_state=42)  # kernel SVM

    svm_model = Pipeline(steps=[('scale', ss), ('clf', svm)])

    svm_model_params = {
        'clf__C': loguniform(1e-3, 1e3),
        'clf__gamma': loguniform(1e-4, 1e1)
    }

    # model: classifier with randomised parameter search over nested 3-fold CV
    nonlinear_model = RandomizedSearchCV(svm_model,
                                         svm_model_params,
                                         n_iter=100,
                                         cv=3)

    return clone(nonlinear_model)
Example #16
def get_linear_model():
    """output a linear classifier with randomised parameter search over nested 3-fold CV

    returns:
    model: sklearn estimator
    """

    ss = StandardScaler()
    lr = LogisticRegression(penalty='l2', max_iter=1000,
                            class_weight=None)  # ridge

    lr_model = Pipeline(steps=[('scale', ss), ('clf', lr)])  # pipeline

    lr_model_params = {'clf__C': loguniform(1e-3, 1e3)}

    # model: classifier with randomised parameter search over nested 3-fold CV
    linear_model = RandomizedSearchCV(lr_model,
                                      lr_model_params,
                                      n_iter=100,
                                      cv=3)

    return clone(linear_model)
Example #17
    def perform_hyperparameter_tuning(self,
                                      X,
                                      y,
                                      model_name='ridge',
                                      n_values=100):
        if model_name == 'ridge':
            #             model = Ridge()
            #             reg_pipeline = Pipeline([('scaler', MinMaxScaler()),
            #                             ('Ridge', Ridge())])

            #             param_grid = [{'alpha': np.logspace(-5,5,100)}]
            param_dist = {'alpha': loguniform(1e-5, 1e0)}
            # Ridge's `normalize` kwarg was removed in scikit-learn 1.2;
            # scale the features beforehand instead (see the sketch below)
            clf = RandomizedSearchCV(estimator=Ridge(),
                                     param_distributions=param_dist,
                                     n_iter=50,
                                     n_jobs=10,
                                     random_state=self.random_state)
            clf.fit(X, y)
            return clf.best_params_

        else:
            print("Only supporting Ridge for now")
Example #18
def dt():
    vect_and_clf = Pipeline([('vect', TfidfVectorizer(min_df=5)),
                             ('clf', DecisionTreeClassifier(random_state=0))])

    param_dist = {
        'clf__criterion': ['gini', 'entropy'],
        'clf__splitter': ['best', 'random'],
        'clf__max_depth':
        np.arange(2, 50, dtype=int).tolist() + [None],
        'clf__min_samples_split':
        loguniform(1e-4, 1e-1),
        'clf__min_samples_leaf':
        stats.randint(1, 200),
        'clf__max_features':
        np.linspace(0.01, 1, num=10, dtype=float).tolist() +
        ['sqrt', 'log2', None],  # 'auto' was removed in scikit-learn 1.3
        'clf__max_leaf_nodes': [None] +
        np.array(np.power(10, np.arange(1, 4, step=0.5)), dtype=int).tolist(),
        'clf__min_impurity_decrease': [0.0] + np.array(
            np.power(10, np.arange(-10, -4, step=0.5)), dtype=float).tolist()
    }

    return vect_and_clf, param_dist
# fuzzy options to test:
fuzzy_options = ["normal", "fuzzy_dist", "fuzzy_err"]

# Features:
features = ["N2-N4", "N3-N4", "N2-N3", "Y-N4", "Z-N4", "G-I", "G-R", "I-N4"]
fuzzy_dist_column = ["fuzzy_dist"]
fuzzy_err_column = ["fuzzy_err"]
output_path = "./results"


#------------------------------------ TRAINING: --------------------------------------

# scale features of the data:
train_X, general_X = training_utils.scale_X_of_the_data(training_data[features], general_data[features])

params = {'C': loguniform(1e0, 1e3),
          'gamma': loguniform(1e-4, 1e-2)}

for fuzzy_option in fuzzy_options:
    
    print(fuzzy_option)
    
    clf = svm.SVC(gamma='scale',
                  kernel='rbf',
                  probability=True,
                  class_weight='balanced',
                  cache_size=5000,
                  random_state=476)
    
    clf_for_eval = svm.SVC(gamma='scale',
                  kernel='rbf',
# Our kernel has two parameters: the length-scale and the periodicity. For our
# dataset, we use `sin` as the generative process, implying a
# :math:`2 \pi`-periodicity for the signal. Since the parameter defaults to
# :math:`1`, this explains the high frequency observed in the predictions of
# our model.
# Similar conclusions can be drawn for the length-scale parameter. This tells
# us that the kernel parameters need to be tuned. We will use a randomized
# search to tune the different parameters of the kernel ridge model: the
# `alpha` parameter and the kernel parameters.

# %%
from sklearn.model_selection import RandomizedSearchCV
from sklearn.utils.fixes import loguniform

param_distributions = {
    "alpha": loguniform(1e0, 1e3),
    "kernel__length_scale": loguniform(1e-2, 1e2),
    "kernel__periodicity": loguniform(1e0, 1e1),
}
kernel_ridge_tuned = RandomizedSearchCV(
    kernel_ridge,
    param_distributions=param_distributions,
    n_iter=500,
    random_state=0,
)
start_time = time.time()
kernel_ridge_tuned.fit(training_data, training_noisy_target)
print(f"Time for KernelRidge fitting: {time.time() - start_time:.3f} seconds")

# %%
# Fitting the model is now more computationally expensive since we have to try
# many hyperparameter combinations.
    'input_scaling': 0.4,
    'bias_scaling': 0.0,
    'spectral_radius': 0.0,
    'reservoir_activation': 'tanh',
    'leakage': 1.0,
    'bidirectional': False,
    'k_rec': 10,
    'alpha': 1e-3,
    'random_state': 42
}

step1_esn_params = {
    'input_scaling': uniform(loc=1e-2, scale=1),
    'spectral_radius': uniform(loc=0, scale=2)
}
step2_esn_params = {'leakage': loguniform(1e-5, 1e0)}
step3_esn_params = {'bias_scaling': np.linspace(0.0, 1.0, 11)}
step4_esn_params = {'alpha': loguniform(1e-5, 1e1)}

kwargs_step1 = {
    'n_iter': 200, 'random_state': 42, 'verbose': 1, 'n_jobs': -1,
    'scoring': make_scorer(mean_squared_error, greater_is_better=False,
                           needs_proba=True)
}
kwargs_step2 = {
    'n_iter': 50, 'random_state': 42, 'verbose': 1, 'n_jobs': -1,
    'scoring': make_scorer(mean_squared_error, greater_is_better=False,
                           needs_proba=True)
}
kwargs_step3 = {
    'verbose': 1, 'n_jobs': -1,
            gs.best_estimator_, "%s_models/%s_%s_regressor_best_estimator.pk" %
            (method, method, data_type))

    return (gs)


# +
if classification_task:
    model = svm.SVC(max_iter=10000)
else:
    model = svm.SVR(max_iter=10000)

# Grid parameters
param_svm = [
    {
        'C': loguniform(1e-1, 1e4),
        'kernel': ['poly', 'rbf'],
        'gamma': loguniform(1e-4, 1e1)
    },
]

n_iter = 200
scaler = preprocessing.MinMaxScaler()
X_train_copy = scaler.fit_transform(X_train)

if classification_task:
    svm_gs = supervised_learning_steps("svm", "roc_auc", data_type,
                                       classification_task, model, param_svm,
                                       X_train_copy, y_train, n_iter)
else:
    svm_gs = supervised_learning_steps("svm", "r2", data_type,
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                results['mean_test_score'][candidate],
                results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")


# specify parameters and distributions to sample from
param_dist = {
    'average': [True, False],
    'l1_ratio': stats.uniform(0, 1),
    'alpha': loguniform(1e-4, 1e0)
}

# run randomized search
n_iter_search = 20
random_search = RandomizedSearchCV(clf,
                                   param_distributions=param_dist,
                                   n_iter=n_iter_search)

start = time()
random_search.fit(X, y)
print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), n_iter_search))
report(random_search.cv_results_)

# use a full grid over all parameters
Example #24
    'leakage': 1.0,
    'k_rec': 10,
    'reservoir_activation': 'tanh',
    'bidirectional': False,
    'alpha': 1e-3,
    'random_state': 42
}

step1_esn_params = {
    'input_scaling': uniform(loc=1e-2, scale=1),
    'spectral_radius': uniform(loc=0, scale=2)
}

step2_esn_params = {'leakage': uniform(1e-5, 1e0)}
step3_esn_params = {'bias_scaling': uniform(loc=0, scale=3)}
step4_esn_params = {'alpha': loguniform(1e-5, 1e1)}

kwargs_step1 = {
    'n_iter': 200,
    'random_state': 42,
    'verbose': 1,
    'n_jobs': -1,
    'scoring': gpe_scorer
}
kwargs_step2 = {
    'n_iter': 50,
    'random_state': 42,
    'verbose': 1,
    'n_jobs': -1,
    'scoring': gpe_scorer
}
Example #25

X=crime_filtered.loc[:, 'population':'PolicBudgPerPop']
imputer = KNNImputer(n_neighbors=10,weights='distance')
X = imputer.fit_transform(X)

Y = crime_filtered.loc[:, 'larcPerPop']

train_x, test_x, train_y, test_y = train_test_split(X,Y,test_size=.3,random_state=42)



bds = [{'name': 'alpha', 'type': 'continuous', 'domain': (1e-10, 10)},
        {'name': 'l1_ratio', 'type': 'continuous', 'domain': (0, 1)}]

param_dist = {"alpha": loguniform(1e-10, 10e0),
              "l1_ratio": uniform(0, 1)}


# ElasticNet's `normalize` kwarg was removed in scikit-learn 1.2; scale features beforehand instead
enet = ElasticNet(max_iter=10000)



baseline = cross_val_score(enet, train_x, train_y, scoring='r2', cv=10).mean()


rs = RandomizedSearchCV(enet,
                        param_distributions=param_dist,
                        scoring='r2',
                        n_jobs=-1,
                        verbose=2,
Example #26
def getHyperParamBTC(typeofrun=1):
    if typeofrun == 1:
        # BitcoinTransformer
        param_dist = {
            'classify__max_depth': [80, 90, 100, 110],
            'classify__max_features': [2, 3],
            'classify__min_samples_leaf': [3, 4, 5],
            'classify__min_samples_split': [8, 10, 12],
            'classify__n_estimators': [100, 200, 300, 1000],
            'BitcoinTransformer__adosc_fastperiod': loguniform(2, 100),
            'BitcoinTransformer__adosc_slowperiod': loguniform(2, 100),
            'BitcoinTransformer__adx_period': loguniform(2, 100),
            'BitcoinTransformer__adxr_period': loguniform(2, 100),
            'BitcoinTransformer__apo_fastperiod': loguniform(2, 100),
            'BitcoinTransformer__apo_slowperiod': loguniform(2, 100),
            'BitcoinTransformer__aroon_period': loguniform(2, 100),
            'BitcoinTransformer__aroonosc_period': loguniform(2, 100),
            'BitcoinTransformer__bb_periods': loguniform(2, 100),
            'BitcoinTransformer__cci_periods': loguniform(2, 100),
            'BitcoinTransformer__cmo_period': loguniform(2, 100),
            'BitcoinTransformer__dema_period': loguniform(2, 100),
            'BitcoinTransformer__dx_period': loguniform(2, 100),
            'BitcoinTransformer__ema_period': loguniform(2, 100),
            'BitcoinTransformer__kama_period': loguniform(2, 100),
            'BitcoinTransformer__ma_period': loguniform(2, 100),
            'BitcoinTransformer__macd_period_longterm': loguniform(2, 100),
            'BitcoinTransformer__macd_period_shortterm': loguniform(2, 100),
            'BitcoinTransformer__macd_period_to_signal': loguniform(2, 100),
            'BitcoinTransformer__mean_o_c_period': loguniform(2, 100),
            'BitcoinTransformer__mfi_period': loguniform(2, 100),
            'BitcoinTransformer__midpoint_period': loguniform(2, 100),
            'BitcoinTransformer__midprice_period': loguniform(2, 100),
            'BitcoinTransformer__minus_di_period': loguniform(2, 100),
            'BitcoinTransformer__minus_dm_period': loguniform(2, 100),
            'BitcoinTransformer__momentum_period': loguniform(2, 100),
            'BitcoinTransformer__plus_di_period': loguniform(2, 100),
            'BitcoinTransformer__plus_dm_period': loguniform(2, 100),
            'BitcoinTransformer__ppo_fastperiod': loguniform(2, 100),
            'BitcoinTransformer__ppo_slowperiod': loguniform(2, 100),
            'BitcoinTransformer__roc_period': loguniform(2, 100),
            'BitcoinTransformer__rocp_period': loguniform(2, 100),
            'BitcoinTransformer__rocr100_period': loguniform(2, 100),
            'BitcoinTransformer__rocr_period': loguniform(2, 100),
            'BitcoinTransformer__rsi_period': loguniform(2, 100),
            'BitcoinTransformer__sar_acceleration': loguniform(2, 100),
            'BitcoinTransformer__sar_maximum': loguniform(2, 100),
            'BitcoinTransformer__sarext_accelerationinitlong':
            loguniform(2, 100),
            'BitcoinTransformer__sarext_accelerationinitshort':
            loguniform(2, 100),
            'BitcoinTransformer__sarext_accelerationlong': loguniform(2, 100),
            'BitcoinTransformer__sarext_accelerationmaxlong':
            loguniform(2, 100),
            'BitcoinTransformer__sarext_accelerationmaxshort':
            loguniform(2, 100),
            'BitcoinTransformer__sarext_accelerationshort': loguniform(2, 100),
            'BitcoinTransformer__sarext_offsetonreverse': loguniform(2, 100),
            'BitcoinTransformer__sarext_startvalue': loguniform(2, 100),
            'BitcoinTransformer__sma_close_timeperiod': loguniform(2, 100),
            'BitcoinTransformer__sma_h_l_c_o_period': loguniform(2, 100),
            'BitcoinTransformer__sma_handl_period': loguniform(2, 100),
            'BitcoinTransformer__sma_high_period': loguniform(2, 100),
            'BitcoinTransformer__sma_low_period': loguniform(2, 100),
            'BitcoinTransformer__so_d_n': loguniform(2, 100),
            'BitcoinTransformer__so_n': loguniform(2, 100),
            'BitcoinTransformer__t3_period': loguniform(2, 100),
            'BitcoinTransformer__tema_period': loguniform(2, 100),
            'BitcoinTransformer__trima_period': loguniform(2, 100),
            'BitcoinTransformer__trix_period': loguniform(2, 100),
            'BitcoinTransformer__ultosc_period1': loguniform(2, 100),
            'BitcoinTransformer__ultosc_period2': loguniform(2, 100),
            'BitcoinTransformer__ultosc_period3': loguniform(2, 100),
            'BitcoinTransformer__var_close_period': loguniform(2, 100),
            'BitcoinTransformer__var_open_period': loguniform(2, 100),
            'BitcoinTransformer__wma_period': loguniform(2, 100),
            'BitcoinTransformer__wr_lookback_period': loguniform(2, 100)
        }
    elif typeofrun == 2:
        # ParameterRelationsBTCTrans
        param_dist = {
            'classify__max_depth': [80, 90, 100, 110],
            'classify__max_features': [2, 3],
            'classify__min_samples_leaf': [3, 4, 5],
            'classify__min_samples_split': [8, 10, 12],
            'classify__n_estimators': [100, 200, 300, 1000],
            'BitcoinTransformer__fastperiod': loguniform(2, 100),
            'BitcoinTransformer__longterm': loguniform(2, 100),
            'BitcoinTransformer__midterm': loguniform(2, 100),
            'BitcoinTransformer__shortterm': loguniform(2, 100),
            'BitcoinTransformer__bb_cci': loguniform(2, 100),
            'BitcoinTransformer__var_t3': loguniform(2, 100),
            'BitcoinTransformer__dema_trema': loguniform(2, 100),
            'BitcoinTransformer__zero': loguniform(2, 100),
            'BitcoinTransformer__rocperiod': loguniform(2, 100)
        }
    elif typeofrun == 3:
        # Only Random Forest hyperparameters
        param_dist = {
            'classify__max_depth': [80, 90, 100, 110],
            'classify__max_features': [2, 3],
            'classify__min_samples_leaf': [3, 4, 5],
            'classify__min_samples_split': [8, 10, 12],
            'classify__n_estimators': [100, 200, 300, 1000],
        }
    return param_dist
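
Note that `loguniform(2, 100)` draws floats, while the transformer's period parameters presumably expect integers. `RandomizedSearchCV` accepts any object exposing an `rvs` method as a parameter distribution, so a small integer-valued wrapper is enough, sketched here:

# Sketch of an integer-valued log-uniform distribution.
from scipy.stats import loguniform

class loguniform_int:
    def __init__(self, a, b):
        self._dist = loguniform(a, b)

    def rvs(self, *args, **kwargs):
        # round the float samples down to integers
        return self._dist.rvs(*args, **kwargs).astype(int)

# e.g. 'BitcoinTransformer__rsi_period': loguniform_int(2, 100)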
Example #27
# Hyperparameter ranges / distributions that should be considered during the random search
PARAM_SEARCH = {
    "kernel": ["RFB", "Matern12", "Matern32", "Matern52", "RQ"],
    "n_neighbors": np.arange(5, 50, 5),
    "n_inducing_points": np.arange(10, 100, 10),
    "coeff": np.linspace(0.5, 4, 10),
    "n_components": range(2, 20),
    "hidden_sizes": [
        [hidden_size] * num_layers
        for hidden_size in [25, 30, 50, 75, 100]
        for num_layers in range(1, 4)
    ],
    "latent_dim": [5, 10, 15, 20],
    "batch_size": [64, 128, 256],
    "lr": loguniform(1e-4, 0.1),
    "C": [10 ** i for i in range(0, 5)],
    #  Regularization for logistic regression baseline
    "n_mix_components": range(1, 11),
    # Intervals become [loc, loc + scale] for uniform
    "dropout_rate": uniform(loc=0, scale=0.5),  # [0, 0.5]
    "posterior_rho_init": uniform(loc=-8, scale=6),  # [-8, -2]
    "posterior_mu_init": uniform(loc=-0.6, scale=1.2),  # [-0.6, 0.6]
    "prior_pi": uniform(loc=0.1, scale=0.8),  # [0.1, 0.9]
    "prior_sigma_1": [np.exp(d) for d in np.arange(-0.8, 0, 0.1)],
    "prior_sigma_2": [np.exp(d) for d in np.arange(-0.8, 0, 0.1)],
    "reconstr_error_weight": loguniform(0.01, 0.9),
    "anneal": [True, False],
    "beta": uniform(loc=0.1, scale=2.4),  # [0.1, 2.5]
}
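
A dictionary like this, mixing lists, ranges, and frozen distributions, can also be sampled outside of a full search via `ParameterSampler`:

# Sketch: drawing a few candidate configurations from the dict above.
from sklearn.model_selection import ParameterSampler

for config in ParameterSampler(PARAM_SEARCH, n_iter=3, random_state=0):
    print(config["lr"], config["kernel"], config["batch_size"])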
Example #28
import numpy as np
from vectorizer import load_vectors
from scipy.stats import uniform
from sklearn.utils.fixes import loguniform
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV

X_test_vect, X_train_vect, test, train = load_vectors()

lr_model = LogisticRegression(n_jobs=-1, solver='sag', C=1.55, max_iter=500)
lr_params = dict(tol=loguniform(1e-7, 5e-4))
lr_grid = RandomizedSearchCV(lr_model,
                             lr_params,
                             verbose=5,
                             n_iter=12,
                             n_jobs=-1)
lr_search = lr_grid.fit(X_train_vect, train.label)
print(lr_search.best_params_)
print(lr_search.best_score_)
Example #29
        #     TransformedTargetRegressor(regressor=SVR(), transformer=StandardScaler()),
        # ),
        # (
        #     "kernel_ridge",
        #     TransformedTargetRegressor(
        #         regressor=KernelRidge(), transformer=StandardScaler()
        #     ),
        # ),
        # ("knn", KNeighborsRegressor()),
        # ("xgb", XGBRegressor(objective="reg:squarederror")),
    ],
    memory="cache",
)

param_distributions = {
    "svr__C": loguniform(50, 200),
    "svr__epsilon": loguniform(1e-4, 1),
    # "knn__n_neighbors": stats.randint(low=2, high=50),
    # "xgb__n_estimators": stats.randint(low=50, high=300),
    # "xgb__max_depth": stats.randint(low=2, high=10),
    # "target_svr__regressor__C": stats.expon(scale=100),
    # "target_svr__regressor__epsilon": stats.expon(),
    # "kernel_ridge__regressor__alpha": loguniform(1, 1e4),
    # "kernel_ridge__regressor__gamma": [0.1],
}
search = RandomizedSearchCV(
    fitting,
    param_distributions,
    n_iter=50,
    n_jobs=2,
).fit(X, y)
Example #30
        candidates = np.flatnonzero(results["rank_test_score"] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                results["mean_test_score"][candidate],
                results["std_test_score"][candidate],
            ))
            print("Parameters: {0}".format(results["params"][candidate]))
            print("")


# specify parameters and distributions to sample from
param_dist = {
    "average": [True, False],
    "l1_ratio": stats.uniform(0, 1),
    "alpha": loguniform(1e-2, 1e0),
}

# run randomized search
n_iter_search = 15
random_search = RandomizedSearchCV(clf,
                                   param_distributions=param_dist,
                                   n_iter=n_iter_search)

start = time()
random_search.fit(X, y)
print(
    "RandomizedSearchCV took %.2f seconds for %d candidate parameter settings."
    % ((time() - start), n_iter_search))
report(random_search.cv_results_)