Ejemplo n.º 1
0
def train_MLP(min_size=5, max_size=100):
    """Build an MLP classifier and a randomized-search parameter grid.

    Parameters
    ----------
    min_size, max_size : int
        Inclusive lower / exclusive upper bound on the number of neurons
        drawn for each hidden layer.

    Returns
    -------
    tuple (model, param_grid)
        The unfitted ``MLPClassifier`` and a parameter-distribution dict
        suitable for ``RandomizedSearchCV``.
    """
    model = MLPClassifier(max_iter=100)
    # Candidate architectures: one randomly sized network for each depth of
    # 4, 3, 2 and 1 hidden layers.  Sizes are drawn once, at grid-build time
    # (the original code did the same, just with the draws written out
    # long-hand — and also built a first param_grid that was dead code,
    # immediately overwritten; that grid has been removed).
    layer_choices = [
        tuple(sp_randint.rvs(min_size, max_size) for _ in range(n_layers))
        for n_layers in (4, 3, 2, 1)
    ]
    param_grid = {
        'hidden_layer_sizes': layer_choices,
        'activation': ['tanh', 'relu'],
        'solver': ['sgd', 'adam', 'lbfgs'],
        'alpha': sp_expon(scale=.01),
        'learning_rate': ['constant', 'adaptive'],
        'learning_rate_init': sp_expon(scale=.001),
    }
    return model, param_grid
def get_random_grid_CV_params():
    """Define the Random Grid Search parameter distributions for each model.

    Returns
    -------
    dict
        Maps a model label ('Logistic', 'RandomForest', 'AdaBoost_DT',
        'GBC', 'SVC') to a ``RandomizedSearchCV``-ready dict of
        parameter name -> list of choices or scipy frozen distribution.
    """
    logit_params = {"C": sp_expon(loc=0.001, scale=1),
                    "fit_intercept": [True, False],
                    "intercept_scaling": sp_randint(1, 5),
                    "warm_start": [False, True]
                    }
    # min_samples_split must be >= 2 for sklearn tree-based models; the
    # previous sp_randint(1, 50) could draw the invalid value 1.
    rf_params = {"min_samples_split": sp_randint(2, 50),
                 "min_samples_leaf": sp_randint(1, 50),
                 "criterion": ["gini", "entropy"],
                 "class_weight": ['balanced', 'balanced_subsample']
                 }
    ada_dt_params = {"learning_rate": sp_expon(loc=0.001, scale=1.5),
                     "algorithm": ['SAMME.R', 'SAMME']
                     }
    # NOTE(review): max_features='auto' was removed in scikit-learn 1.3 —
    # confirm the installed version still accepts it before searching.
    gbc_params = {"learning_rate": sp_expon(loc=0.001, scale=0.5),
                  "subsample": sp_uniform(loc=0.2, scale=0.8),
                  "max_features": [None, 'auto'],
                  "max_depth": sp_randint(2, 6),
                  }
    svc_params = {"C": sp_expon(loc=0.001, scale=2),
                  "kernel": ['rbf', 'poly'],
                  "degree": sp_randint(2, 10),
                  "coef0": [0, 1, 2],
                  "shrinking": [True, False]
                  }
    rnd_CV_param_distributions = {'Logistic': logit_params,
                                  'RandomForest': rf_params,
                                  'AdaBoost_DT': ada_dt_params,
                                  'GBC': gbc_params,
                                  'SVC': svc_params
                                  }
    return rnd_CV_param_distributions
Ejemplo n.º 3
0
def generate_random_numbers_tuple():
    """Endlessly yield tuples of random length filled with random floats.

    Each yielded tuple has between 1 and 9 elements (length drawn from
    randint(1, 10)); every element is a draw from expon(scale=50).
    """
    while True:
        length = sp_randint(1, 10).rvs()
        yield tuple(sp_expon(scale=50).rvs() for _ in range(length))
Ejemplo n.º 4
0
        self.low = low
        self.high = high
        self.shape = sp_randint(1, 3).rvs()

    def rvs(self, random_state=None):
        """Draw a 1-D integer sample of length ``self.shape``.

        Each entry is uniform over [self.low, self.high).  ``random_state``
        is now forwarded to scipy so draws can be made reproducible — the
        previous implementation accepted the argument but silently ignored
        it, making seeded searches non-deterministic.
        """
        return sp_randint(self.low, self.high).rvs(size=self.shape,
                                                   random_state=random_state)

# generate_random_numbers_tuple()

# Per-estimator hyperparameter search spaces, keyed by estimator class.
_TREE_SPACE = {
    'criterion': ['gini', 'entropy'],
    'max_depth': sp_randint(3, 8),
}
_SVC_SPACE = {
    'C': sp_expon(scale=100),
    'gamma': sp_expon(scale=.1),
    'max_iter': [300],
    'kernel': ['rbf', 'linear', 'sigmoid'],
}
_MLP_SPACE = {
    'hidden_layer_sizes': RandIntMatrix(12, 128),
    'max_iter': [500],
    'activation': ['relu', 'tanh', 'logistic'],
}
_FOREST_SPACE = {
    'n_estimators': sp_randint(10, 25),
    'criterion': ['gini', 'entropy'],
    'max_depth': [3, 5, 7, None],
}

PARAM_DISTS = {
    type(DecisionTreeClassifier()): _TREE_SPACE,
    type(SVC()): _SVC_SPACE,
    type(MLPClassifier()): _MLP_SPACE,
    type(RandomForestClassifier()): _FOREST_SPACE,
}
Ejemplo n.º 5
0
    # if we don't want to balance
    #X_train_b, y_train_b = X_train, y_train

    print('Training set size AFTER sampling: ', len(X_train_b))
    if presentation:
        print("--------------------------------------------------")

    # PARAMETER ESTIMATION
    # --------------------
    if verbose: print('parameter estimation')

    # train and test the svm classifier to get a mean accuracy score
    svm_clf = svm.LinearSVC(dual=False, penalty='l2', class_weight='balanced')

    # specify possible hyperparameters
    param_dist = {"C": sp_expon(scale=1)}

    scoring = {
        'acc': 'accuracy',
        'prec': 'precision',
        'rec': 'recall',
        'f1': 'f1'
    }
    # run randomized search
    svm_random_search = RandomizedSearchCV(svm_clf,
                                           param_distributions=param_dist,
                                           n_iter=n_iter_search,
                                           cv=nb_folds,
                                           scoring=scoring,
                                           refit='acc')
    # scoring=['accuracy', 'precision', 'recall'],
def get_parameter_distribution(model_name, random_state=None, n_iter=10):
    """Return the hyperparameter search distribution for one classifier.

    Parameters
    ----------
    model_name : str
        Key identifying the classifier, e.g. 'Random Forest Classifier'.
    random_state : int or None
        Seed passed to numpy's *global* RNG (affects later draws from the
        returned scipy distributions).
    n_iter : int
        Search budget; only used here to size the list of MLP layer
        architectures (one architecture per ~3 search iterations).

    Returns
    -------
    dict
        Parameter name -> list of choices or scipy frozen distribution,
        suitable for ``RandomizedSearchCV``.

    Raises
    ------
    KeyError
        If ``model_name`` is not one of the known classifier labels.
    """
    # The unused `from sklearn.gaussian_process.kernels import RBF` that
    # used to live here has been removed.
    from scipy.stats import randint as sp_randint
    from scipy.stats import expon as sp_expon

    # NOTE: seeding the global numpy RNG is a side effect visible to callers.
    np.random.seed(random_state)

    parameter_distributions = {
        'SGD Classifier (log)': {
            'alpha': sp_expon(scale=0.1),
        },
        'KNN Classifier': {
            'n_neighbors': [3, 4, 5]
        },
        'SGD Classifier (linear SVM)': {
            'base_estimator__alpha': sp_expon(scale=3e-5),
        },
        'Polynomial SVM Classifier': {
            'poly_svc__C': [1, 3, 5, 7, 9],
        },
        'RBF SVM Classifier': {
            'rbf_svc__C': [1, 10, 100, 1000],
            'rbf_svc__gamma': sp_expon(scale=0.01)
        },
        'Decision Tree Classifier': {
            "max_depth": sp_randint(3, 15),
            "max_features": sp_randint(3, 15),
            "min_samples_split": sp_randint(2, 10),
            "min_samples_leaf": sp_randint(1, 10),
        },
        'Random Forest Classifier': {
            "n_estimators": sp_randint(2, 150),
            "max_depth": sp_randint(3, 15),
            "max_features": sp_randint(3, 15),
            "min_samples_split": sp_randint(2, 10),
            "min_samples_leaf": sp_randint(1, 10),
        },
        'Extra Trees Classifier': {
            "n_estimators": sp_randint(2, 150),
            "max_depth": sp_randint(3, 15),
            "max_features": sp_randint(3, 15),
            "min_samples_split": sp_randint(2, 10),
            "min_samples_leaf": sp_randint(1, 10),
        },
        'AdaBoosting Trees Classifier': {
            "n_estimators": sp_randint(2, 150),
            "base_estimator__max_depth": sp_randint(3, 15),
            "base_estimator__max_features": sp_randint(3, 15),
            "base_estimator__min_samples_split": sp_randint(2, 10),
            "base_estimator__min_samples_leaf": sp_randint(1, 10),
        },
        'MLP Classifier': {
            'hidden_layer_sizes':
            get_dist_mlp_layers(int(n_iter // 3),
                                min_num_layers=4,
                                max_num_layers=8,
                                min_neurons_per_layer=50,
                                max_neurons_per_layer=200)
        },
    }

    return parameter_distributions[model_name]
Ejemplo n.º 7
0
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2

from scipy.stats import uniform as sp_uniform
from scipy.stats import expon as sp_expon

# For NB: Accuracy score on train: 0.7435417446649195
# Accuracy score on test: 0.5305676855895196

n_iter = 10

# Randomized-search spaces for the two Naive Bayes variants.  The 'clf'
# entry is a one-element tuple so the search treats the estimator itself
# as a (single-choice) searchable pipeline parameter.
dict_clf = {
    'BernouilliNB': {
        'clf': (naive_bayes.BernoulliNB(alpha=0.5), ),
        # expon(loc=0, scale=5): continuous alternative to a fixed alpha grid
        # such as [0.0001 * (i + 1) for i in range(100)]
        'clf__alpha': sp_expon(0, 5),
        'clf__fit_prior': (True, False),
    },
    'MultinomialNB': {
        'clf': (naive_bayes.MultinomialNB(alpha=0.2), ),
        'clf__alpha': sp_expon(0, 5),
        'clf__fit_prior': (True, False),
    },
}

dict_vect = {
    'union__text_transform__vect': (TfidfVectorizer(ngram_range=(1, 4),
                                                    strip_accents='unicode',
                                                    analyzer='word'), ),
    'union__text_transform__vect__ngram_range':
Ejemplo n.º 8
0
     'predictor': make_sklearn_pipeline(KNeighborsClassifier()),
     'parameters': {
         'clf__n_neighbors': sp_randint(2, 20),
         'clf__weights': ['uniform', 'distance'],
     },
     'n_iter': 1000,
     'fit_params': None,
 },
 {
     'name':
     "Linear SVM",
     'predictor':
     make_sklearn_pipeline(
         SVC(kernel="linear", class_weight='balanced', random_state=1)),
     'parameters': {
         'clf__C': sp_expon(),
     },
     'n_iter':
     1000,
     'fit_params':
     None,
 },
 {
     'name':
     "RBF SVM",
     'predictor':
     make_sklearn_pipeline(SVC(class_weight='balanced', random_state=1)),
     'parameters': {
         'clf__C': sp_expon(),
     },
     'n_iter':