def train_MLP(min_size=5, max_size=100):
    """Build an MLPClassifier plus a hyperparameter grid for randomized search.

    Parameters
    ----------
    min_size, max_size : int
        Inclusive lower / exclusive upper bound for the number of neurons
        sampled for each hidden layer.

    Returns
    -------
    (MLPClassifier, dict)
        The unfitted model and a parameter-distribution dict suitable for
        RandomizedSearchCV.

    FIX: the original assigned a first, smaller ``param_grid`` dict that was
    immediately overwritten by the second assignment — dead code, removed.
    """
    model = MLPClassifier(max_iter=100)
    # NOTE: each hidden_layer_sizes tuple is sampled ONCE here via .rvs(),
    # so the search chooses among four fixed architectures (4, 3, 2 and 1
    # hidden layers) rather than re-sampling a new architecture per draw.
    param_grid = {
        'hidden_layer_sizes': [
            (sp_randint.rvs(min_size, max_size),
             sp_randint.rvs(min_size, max_size),
             sp_randint.rvs(min_size, max_size),
             sp_randint.rvs(min_size, max_size)),
            (sp_randint.rvs(min_size, max_size),
             sp_randint.rvs(min_size, max_size),
             sp_randint.rvs(min_size, max_size)),
            (sp_randint.rvs(min_size, max_size),
             sp_randint.rvs(min_size, max_size)),
            (sp_randint.rvs(min_size, max_size), ),
        ],
        'activation': ['tanh', 'relu'],
        'solver': ['sgd', 'adam', 'lbfgs'],
        'alpha': sp_expon(scale=.01),
        'learning_rate': ['constant', 'adaptive'],
        'learning_rate_init': sp_expon(scale=.001),
    }
    return model, param_grid
def get_random_grid_CV_params():
    """Define the Random Grid Search parameters for each model.

    Returns
    -------
    dict
        Maps model name ('Logistic', 'RandomForest', 'AdaBoost_DT', 'GBC',
        'SVC') to a parameter-distribution dict for RandomizedSearchCV.
    """
    logit_params = {"C": sp_expon(loc=0.001, scale=1),
                    "fit_intercept": [True, False],
                    "intercept_scaling": sp_randint(1, 5),
                    "warm_start": [False, True]
                    }
    # FIX: min_samples_split must be an int >= 2 — sklearn raises ValueError
    # for 1 — so the sampled range now starts at 2 instead of 1.
    rf_params = {"min_samples_split": sp_randint(2, 50),
                 "min_samples_leaf": sp_randint(1, 50),
                 "criterion": ["gini", "entropy"],
                 "class_weight": ['balanced', 'balanced_subsample']
                 }
    ada_dt_params = {"learning_rate": sp_expon(loc=0.001, scale=1.5),
                     "algorithm": ['SAMME.R', 'SAMME']
                     }
    # NOTE(review): 'auto' for max_features was deprecated/removed in recent
    # sklearn releases — confirm the target sklearn version accepts it.
    gbc_params = {"learning_rate": sp_expon(loc=0.001, scale=0.5),
                  "subsample": sp_uniform(loc=0.2, scale=0.8),
                  "max_features": [None, 'auto'],
                  "max_depth": sp_randint(2, 6),
                  }
    svc_params = {"C": sp_expon(loc=0.001, scale=2),
                  "kernel": ['rbf', 'poly'],
                  "degree": sp_randint(2, 10),
                  "coef0": [0, 1, 2],
                  "shrinking": [True, False]
                  }
    rnd_CV_param_distributions = {'Logistic': logit_params,
                                  'RandomForest': rf_params,
                                  'AdaBoost_DT': ada_dt_params,
                                  'GBC': gbc_params,
                                  'SVC': svc_params
                                  }
    return rnd_CV_param_distributions
def generate_random_numbers_tuple():
    """Yield an endless stream of tuples of exponentially-distributed floats.

    Each yielded tuple has a random length drawn from randint(1, 10)
    (i.e. 1 through 9) and elements drawn independently from expon(scale=50).
    """
    while True:
        length = sp_randint(1, 10).rvs()
        yield tuple(sp_expon(scale=50).rvs() for _ in range(length))
# NOTE(review): this fragment is the tail of a class (presumably
# ``RandIntMatrix`` — its header and the ``__init__(self, low, high)``
# signature are outside this view) followed by a module-level PARAM_DISTS
# table keyed by classifier *type*. Left byte-identical because the class
# definition is incomplete here.
#
# - ``self.shape = sp_randint(1, 3).rvs()`` samples the shape ONCE at
#   construction time (a value of 1 or 2), so every ``rvs()`` call returns
#   an array of that same fixed length.
# - ``rvs`` accepts ``random_state`` but ignores it — presumably intended
#   to be forwarded to the inner ``.rvs()`` call; TODO confirm with caller.
# - PARAM_DISTS uses ``type(Classifier())`` instances as keys, i.e. lookup
#   is by estimator class.
self.low = low self.high = high self.shape = sp_randint(1, 3).rvs() def rvs(self, random_state=None): return sp_randint(self.low, self.high).rvs(size=self.shape) # generate_random_numbers_tuple() PARAM_DISTS = { type(DecisionTreeClassifier()): { 'criterion': ['gini', 'entropy'], 'max_depth': sp_randint(3, 8) }, type(SVC()): { 'C': sp_expon(scale=100), 'gamma': sp_expon(scale=.1), 'max_iter': [300], 'kernel': ['rbf', 'linear', 'sigmoid'], }, type(MLPClassifier()): { 'hidden_layer_sizes': RandIntMatrix(12, 128), 'max_iter': [500], 'activation': ['relu', 'tanh', 'logistic'] }, type(RandomForestClassifier()): { 'n_estimators': sp_randint(10, 25), 'criterion': ['gini', 'entropy'], 'max_depth': [3, 5, 7, None] } }
# To skip balancing entirely, use the raw training split instead:
#   X_train_b, y_train_b = X_train, y_train
print('Training set size AFTER sampling: ', len(X_train_b))
if presentation:
    print("--------------------------------------------------")

# --------------------
# PARAMETER ESTIMATION
# --------------------
if verbose:
    print('parameter estimation')

# Linear SVM (primal, L2-penalized, class-balanced) to be tuned for a
# mean accuracy score via randomized search.
svm_clf = svm.LinearSVC(dual=False, penalty='l2', class_weight='balanced')

# Search only over the regularization strength C.
param_dist = {"C": sp_expon(scale=1)}

scoring = {
    'acc': 'accuracy',
    'prec': 'precision',
    'rec': 'recall',
    'f1': 'f1',
}

# Randomized search refit on accuracy ('acc').
svm_random_search = RandomizedSearchCV(
    svm_clf,
    param_distributions=param_dist,
    n_iter=n_iter_search,
    cv=nb_folds,
    scoring=scoring,
    refit='acc',
)
def get_parameter_distribution(model_name, random_state=None, n_iter=10):
    """
    returns a dictionary containing
    key=classifier_name:value=parameter_distribution_dictionary

    Parameters
    ----------
    model_name : str
        One of the supported classifier names (e.g. 'KNN Classifier').
    random_state : int or None
        Seed forwarded to np.random.seed for reproducible sampling.
    n_iter : int
        Search budget; only used to size the MLP layer distribution.

    Raises
    ------
    KeyError
        If ``model_name`` is not a supported name.
    """
    from scipy.stats import randint as sp_randint
    from scipy.stats import expon as sp_expon
    # FIX: removed the unused ``RBF`` import from
    # sklearn.gaussian_process.kernels — it was never referenced and added
    # a hard sklearn dependency for nothing.

    np.random.seed(random_state)

    # FIX: build the MLP layer distribution lazily, only when it is actually
    # requested — previously get_dist_mlp_layers() ran on EVERY call (and
    # consumed RNG state) even when a different model's grid was asked for.
    if model_name == 'MLP Classifier':
        return {
            'hidden_layer_sizes': get_dist_mlp_layers(
                int(n_iter // 3),
                min_num_layers=4,
                max_num_layers=8,
                min_neurons_per_layer=50,
                max_neurons_per_layer=200)
        }

    parameter_distributions = {
        'SGD Classifier (log)': {
            'alpha': sp_expon(scale=0.1),
        },
        'KNN Classifier': {
            'n_neighbors': [3, 4, 5]
        },
        'SGD Classifier (linear SVM)': {
            'base_estimator__alpha': sp_expon(scale=3e-5),
        },
        'Polynomial SVM Classifier': {
            'poly_svc__C': [1, 3, 5, 7, 9],
        },
        'RBF SVM Classifier': {
            'rbf_svc__C': [1, 10, 100, 1000],
            'rbf_svc__gamma': sp_expon(scale=0.01)
        },
        'Decision Tree Classifier': {
            "max_depth": sp_randint(3, 15),
            "max_features": sp_randint(3, 15),
            "min_samples_split": sp_randint(2, 10),
            "min_samples_leaf": sp_randint(1, 10),
        },
        'Random Forest Classifier': {
            "n_estimators": sp_randint(2, 150),
            "max_depth": sp_randint(3, 15),
            "max_features": sp_randint(3, 15),
            "min_samples_split": sp_randint(2, 10),
            "min_samples_leaf": sp_randint(1, 10),
        },
        'Extra Trees Classifier': {
            "n_estimators": sp_randint(2, 150),
            "max_depth": sp_randint(3, 15),
            "max_features": sp_randint(3, 15),
            "min_samples_split": sp_randint(2, 10),
            "min_samples_leaf": sp_randint(1, 10),
        },
        'AdaBoosting Trees Classifier': {
            "n_estimators": sp_randint(2, 150),
            "base_estimator__max_depth": sp_randint(3, 15),
            "base_estimator__max_features": sp_randint(3, 15),
            "base_estimator__min_samples_split": sp_randint(2, 10),
            "base_estimator__min_samples_leaf": sp_randint(1, 10),
        },
    }
    return parameter_distributions[model_name]
# NOTE(review): fragment = imports + a naive-Bayes pipeline grid (dict_clf)
# + the *start* of a vectorizer grid (dict_vect) that is cut off mid-dict,
# so the code is left byte-identical.
#
# - ``sp_expon(0, 5)`` passes loc=0, scale=5 positionally, i.e. an
#   exponential with mean 5 for the smoothing parameter ``clf__alpha``.
# - The dict key 'BernouilliNB' is misspelled (standard name is
#   "BernoulliNB") — it is runtime data, so any rename must be coordinated
#   with whatever code looks the key up.
# - Single-element tuples like ``(naive_bayes.BernoulliNB(alpha=0.5), )``
#   give the search exactly one candidate estimator for the 'clf' step.
from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.feature_selection import SelectKBest, chi2 from scipy.stats import uniform as sp_uniform from scipy.stats import expon as sp_expon # For NB: Accuracy score on train: 0.7435417446649195 # Accuracy score on test: 0.5305676855895196 n_iter = 10 dict_clf = { 'BernouilliNB': { 'clf': (naive_bayes.BernoulliNB(alpha=0.5), ), 'clf__alpha': sp_expon(0, 5), # [0.0001 * (i + 1) for i in range(100)] #random 'clf__fit_prior': (True, False) }, 'MultinomialNB': { 'clf': (naive_bayes.MultinomialNB(alpha=0.2), ), 'clf__alpha': sp_expon(0, 5), # [0.0001 * (i + 1) for i in range(100)] #random 'clf__fit_prior': (True, False) } } dict_vect = { 'union__text_transform__vect': (TfidfVectorizer(ngram_range=(1, 4), strip_accents='unicode', analyzer='word'), ), 'union__text_transform__vect__ngram_range':
# NOTE(review): interior of a larger list/dict of search configurations —
# the enclosing structure opens before this view and the final 'n_iter'
# value is cut off, so the code is left byte-identical.
#
# Each entry pairs a pipeline ('predictor', built by the project helper
# make_sklearn_pipeline, whose steps presumably include one named 'clf' —
# verify against its definition) with a RandomizedSearchCV parameter dict:
# - KNN: n_neighbors ~ randint(2, 20), uniform vs. distance weighting.
# - Linear SVM / RBF SVM: C ~ expon() (default loc=0, scale=1).
'predictor': make_sklearn_pipeline(KNeighborsClassifier()), 'parameters': { 'clf__n_neighbors': sp_randint(2, 20), 'clf__weights': ['uniform', 'distance'], }, 'n_iter': 1000, 'fit_params': None, }, { 'name': "Linear SVM", 'predictor': make_sklearn_pipeline( SVC(kernel="linear", class_weight='balanced', random_state=1)), 'parameters': { 'clf__C': sp_expon(), }, 'n_iter': 1000, 'fit_params': None, }, { 'name': "RBF SVM", 'predictor': make_sklearn_pipeline(SVC(class_weight='balanced', random_state=1)), 'parameters': { 'clf__C': sp_expon(), }, 'n_iter':