def search_grid(self, X, y, param_grid, verbose):
    if '__algorithm' in param_grid.keys():
        algorithm = param_grid['__algorithm']
    else:
        algorithm = self.best_algorithm
    if '__best_parameter' in param_grid.keys() and param_grid['__best_parameter']:
        self.param_base = self.best_param.copy()
    param_grid = ParameterGrid({
        p[0]: p[1]
        for p in param_grid.items() if not p[0].startswith('__')
    })
    for param in param_grid:
        trainer = crf.Trainer(verbose=verbose)
        param_train = self.param_base.copy()
        param_train.update(param)
        trainer.select(algorithm, self.graphical_model)
        trainer.set_params(param_train)
        if isinstance(self.cv, int):
            cv = KFold(n=len(X), n_folds=self.cv, shuffle=True,
                       random_state=None)
        print('Parameter: (%s) %s' % (algorithm, param_train))
        cv_score = []
        for j, indices in enumerate(cv):
            X_train, y_train = X[indices[0]], y[indices[0]]
            X_test, y_test = X[indices[1]], y[indices[1]]
            for xseq, yseq in zip(X_train, y_train):
                trainer.append(xseq, yseq)
            start = time.time()
            trainer.train('model')
            fit_elapsed_in_sec = time.time() - start
            trainer.clear()
            tagger = crf.Tagger()
            tagger.open('model')
            start = time.time()
            y_pred = [tagger.tag(xseq) for xseq in X_test]
            predict_elapsed_in_sec = time.time() - start
            tagger.close()
            score = self.scorer(y_pred, y_test)
            print(' cv(%i): score %.4f, train size %i, test size %i, '
                  'train elapsed %.4f sec, test elapsed %.4f sec' %
                  (j, score, X_train.shape[0], X_test.shape[0],
                   fit_elapsed_in_sec, predict_elapsed_in_sec))
            cv_score.append(score)
        score = np.mean(cv_score)
        if self.best_score < score:
            self.best_score = score
            self.best_param = param_train
            self.best_algorithm = algorithm
        del cv_score[:]

def test_parameter_grid():
    # Test basic properties of ParameterGrid.
    params1 = {"foo": [1, 2, 3]}
    grid1 = ParameterGrid(params1)
    assert_true(isinstance(grid1, Iterable))
    assert_true(isinstance(grid1, Sized))
    assert_equal(len(grid1), 3)
    assert_grid_iter_equals_getitem(grid1)

    params2 = {"foo": [4, 2], "bar": ["ham", "spam", "eggs"]}
    grid2 = ParameterGrid(params2)
    assert_equal(len(grid2), 6)

    # loop to assert we can iterate over the grid multiple times
    for i in range(2):
        # tuple + chain transforms {"a": 1, "b": 2} to ("a", 1, "b", 2)
        points = set(tuple(chain(*(sorted(p.items())))) for p in grid2)
        assert_equal(points,
                     set(("bar", x, "foo", y)
                         for x, y in product(params2["bar"], params2["foo"])))
    assert_grid_iter_equals_getitem(grid2)

    # Special case: empty grid (useful to get default estimator settings)
    empty = ParameterGrid({})
    assert_equal(len(empty), 1)
    assert_equal(list(empty), [{}])
    assert_grid_iter_equals_getitem(empty)
    assert_raises(IndexError, lambda: empty[1])

    has_empty = ParameterGrid([{'C': [1, 10]}, {}, {'C': [.5]}])
    assert_equal(len(has_empty), 4)
    assert_equal(list(has_empty), [{'C': 1}, {'C': 10}, {}, {'C': .5}])
    assert_grid_iter_equals_getitem(has_empty)

def fit(self, X, y):
    self.folds = _generate_fold_indices(y, self.test_size, self.seed,
                                        self.n_folds)
    assert len(self.folds) == self.n_folds

    # adaptive: shrink the grid to a neighborhood around the previous best
    if self.best_params is not None and self.adaptive:
        param_grid = {}
        for key, best_param in self.best_params.items():
            i = self.param_grid[key].index(best_param)
            if i != 0 and i != len(self.param_grid[key]) - 1:
                param_grid[key] = [self.param_grid[key][j]
                                   for j in [i - 1, i, i + 1]]
            elif i == 0:
                param_grid[key] = [self.param_grid[key][j]
                                   for j in [i, i + 1, i + 2]]
            elif i == len(self.param_grid[key]) - 1:
                param_grid[key] = [self.param_grid[key][j]
                                   for j in [i - 2, i - 1, i]]
        self.param_list = list(ParameterGrid(param_grid))
    else:
        self.param_list = list(ParameterGrid(self.param_grid))

    self.results = [0 for _ in range(len(self.param_list))]
    for i, params in enumerate(self.param_list):
        scores = []
        for train_id, test_id in self.folds:
            model = self.base_model_cls(**params)
            model.fit(X[train_id], y[train_id])
            pred = model.predict(X[test_id])
            scores.append(self.score(y[test_id], pred))
        self.results[i] = np.mean(scores)

    self.best_params = self.param_list[np.argmax(self.results)]
    if self.refit:
        self.best_model = self.base_model_cls(**self.best_params)
        self.best_model.fit(X, y)

    assert self.refit
    return self.best_model

def create_model_instances(config):
    print('Creating model instances...')
    models = []
    model_classes = {
        'ridge': RidgeClassifier,
        'perceptron': Perceptron,
        'passive_aggressive': PassiveAggressiveClassifier,
        'sgd': SGDClassifier,
        'nearest_centroid': NearestCentroid,
        'multinomial_nb': MultinomialNB,
        'linear_svc': LinearSVC,
        'svc': SVC,
        'dtree': DecisionTreeClassifier,
        'forest': RandomForestClassifier,
        'gbc': GradientBoostingClassifier,
        'extra': ExtraTreesClassifier
    }
    for model_name, param_grid in config.items():
        model_class = model_classes[model_name]
        print(' Building %s models' % str(model_class).split('.')[-1][:-2])
        models.extend([model_class(**p) for p in ParameterGrid(param_grid)])
    print('Created ' + str(len(models)) + ' model instances.')
    return models

def trainIngredient(model, grid, train, cv, refit=True, n_jobs=5):
    from joblib import Parallel, delayed
    from sklearn.grid_search import ParameterGrid
    from numpy import zeros
    from sklearn.metrics import accuracy_score
    pred = zeros((train.shape[0], train.cuisine.unique().shape[0]))
    best_score = 0
    for g in ParameterGrid(grid):
        model.set_params(**g)
        results = Parallel(n_jobs=n_jobs)(
            delayed(fitIngredients)(train, list(cv), i, model)
            for i in range(cv.n_folds))
        for i in results:
            pred[i['index'], :] = i['pred']
        score = accuracy_score(train.cuisine, pred.argmax(1))
        if score > best_score:
            best_score = score
            best_pred = pred.copy()
            best_grid = g
    print("Best Score: %0.5f" % best_score)
    print("Best Grid", best_grid)
    if refit:
        X2 = splitIngredients(train)
        model.set_params(**best_grid)
        model.fit(X2.ingredient, X2.cuisine)
    return best_pred, IngredientModel(model)

def test_iforest_sparse():
    """Check IForest for various parameter settings on sparse input."""
    rng = check_random_state(0)
    X_train, X_test, y_train, y_test = train_test_split(boston.data[:50],
                                                        boston.target[:50],
                                                        random_state=rng)
    grid = ParameterGrid({"max_samples": [0.5, 1.0],
                          "bootstrap": [True, False]})

    for sparse_format in [csc_matrix, csr_matrix]:
        X_train_sparse = sparse_format(X_train)
        X_test_sparse = sparse_format(X_test)

        for params in grid:
            # Trained on sparse format
            sparse_classifier = IsolationForest(
                n_estimators=10, random_state=1, **params).fit(X_train_sparse)
            sparse_results = sparse_classifier.predict(X_test_sparse)

            # Trained on dense format
            dense_classifier = IsolationForest(
                n_estimators=10, random_state=1, **params).fit(X_train)
            dense_results = dense_classifier.predict(X_test)

            assert_array_equal(sparse_results, dense_results)

def test_parameters_sampler_replacement():
    # raise error if n_iter too large
    params = {'first': [0, 1], 'second': ['a', 'b', 'c']}
    sampler = ParameterSampler(params, n_iter=7)
    assert_raises(ValueError, list, sampler)

    # degenerates to GridSearchCV if n_iter the same as grid_size
    sampler = ParameterSampler(params, n_iter=6)
    samples = list(sampler)
    assert_equal(len(samples), 6)
    for values in ParameterGrid(params):
        assert_true(values in samples)

    # test sampling without replacement in a large grid
    params = {'a': range(10), 'b': range(10), 'c': range(10)}
    sampler = ParameterSampler(params, n_iter=99, random_state=42)
    samples = list(sampler)
    assert_equal(len(samples), 99)
    hashable_samples = ["a%db%dc%d" % (p['a'], p['b'], p['c'])
                        for p in samples]
    assert_equal(len(set(hashable_samples)), 99)

    # doesn't go into infinite loops
    params_distribution = {'first': bernoulli(.5), 'second': ['a', 'b', 'c']}
    sampler = ParameterSampler(params_distribution, n_iter=7)
    samples = list(sampler)
    assert_equal(len(samples), 7)

def test_spectral_biclustering():
    """Test Kluger methods on a checkerboard dataset."""
    param_grid = {'method': ['scale', 'bistochastic', 'log'],
                  'svd_method': ['randomized', 'arpack'],
                  'n_svd_vecs': [None, 20],
                  'mini_batch': [False, True],
                  'init': ['k-means++'],
                  'n_init': [3],
                  'n_jobs': [1]}
    random_state = 0
    S, rows, cols = make_checkerboard((30, 30), 3, noise=0.5,
                                      random_state=random_state)

    for mat in (S, csr_matrix(S)):
        for kwargs in ParameterGrid(param_grid):
            model = SpectralBiclustering(n_clusters=3,
                                         random_state=random_state,
                                         **kwargs)

            if issparse(mat) and kwargs['method'] == 'log':
                # cannot take log of sparse matrix
                assert_raises(ValueError, model.fit, mat)
                continue
            else:
                model.fit(mat)
                assert_equal(model.rows_.shape, (9, 30))
                assert_equal(model.columns_.shape, (9, 30))
                assert_array_equal(model.rows_.sum(axis=0),
                                   np.repeat(3, 30))
                assert_array_equal(model.columns_.sum(axis=0),
                                   np.repeat(3, 30))
                assert_equal(consensus_score(model.biclusters_,
                                             (rows, cols)), 1)

def tune_parameters(estimator, name, param_grid, X, y, cv):
    logging.info('Tuning parameters for %s model' % name)
    grid_iterable = ParameterGrid(param_grid)
    logging.info('Fitting {0} folds for each of {1} candidates, totalling '
                 '{2} fits'.format(len(cv), len(grid_iterable),
                                   len(cv) * len(grid_iterable)))
    best_score, best_params = None, None
    for grid in grid_iterable:
        estimator.set_params(**grid)
        logging.info('Params: %s' % grid)
        mean_score, opt_n_estimators = cross_validation(estimator, X, y, cv,
                                                        use_watch_list=True)
        if isinstance(estimator, xgb.XGBRegressor):
            grid['n_estimators'] = opt_n_estimators
        if (best_score is None) or (best_score > mean_score):
            best_score, best_params = mean_score, grid
    logging.info('Best parameters: %s, best score: %.5f' %
                 (best_params, best_score))
    logging.info('Parameters are tuned for %s model' % name)
    return best_params, best_score

def clf_loop(models_to_run, clfs, grid, X, y, y_lab):
    results_df = pd.DataFrame(columns=('model_type', 'clf', 'parameters',
                                       'weighted_roc_score'))
    for n in range(1, 2):
        # create training and validation sets
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.25, random_state=0)
        for index, clf in enumerate([clfs[x] for x in models_to_run]):
            print(models_to_run[index])
            parameter_values = grid[models_to_run[index]]
            for p in ParameterGrid(parameter_values):
                try:
                    clf.set_params(**p)
                    model = clf.fit(X_train, y_train)
                    # predict_proba(X_test)[:,1]
                    y_pred_probs = model.predict_proba(X_test)
                    y_pred_probs = pd.DataFrame(
                        [row[:, 1] for row in y_pred_probs]).transpose()
                    # y_pred_probs = y_pred_probs.set_index(testing.vessel_number)
                    y_pred_probs.columns = y_lab
                    # you can also store the model, feature importances, and prediction scores
                    # we're only storing the metrics for now
                    y_pred_probs_sorted, y_test_sorted = zip(
                        *sorted(zip(y_pred_probs, y_test), reverse=True))
                    results_df.loc[len(results_df)] = [
                        models_to_run[index], clf, p,
                        roc_auc_scorer(y_test, y_pred_probs)
                    ]
                except IndexError:
                    print('IndexError:')
                    continue
    return results_df

def initialize_regressors(sgd_grid):
    l = SGDRegressor()
    regressions_to_fit = []
    for params in ParameterGrid(sgd_grid):
        l.set_params(**params)
        regressions_to_fit.append(clone(l))
    return regressions_to_fit

def develop():
    param_grid = [
        {
            "n_estimators": [200],
            "criterion": ["gini"],
            "max_features": ["auto", "sqrt", "log2"],
            "max_depth": [None],
            "min_samples_split": [5, 10, 15, 20],  # [25, 50],
            "min_samples_leaf": [5, 10, 15, 20],  # [25, 30, 35, 40, 45, 50, 55],
            "min_weight_fraction_leaf": [0.0],
            "max_leaf_nodes": [None],
            "bootstrap": [True],
            "oob_score": [False],
            "n_jobs": [-1],
            "random_state": [None],
            "verbose": [0],
            "warm_start": [False],
            "class_weight": [None],
        }
    ]
    param_list = list(ParameterGrid(param_grid))

    process_list = []
    for param_dict in param_list:
        process = Process(target=train_validate_test, args=(param_dict, ))
        process_list.append(process)

    for process in process_list:
        process.start()

    for process in process_list:
        process.join()

def requires(self):
    # get models to use
    models_to_run = [x.strip() for x in self.models_used.split('-')[1:]]
    # Construct union of all codes for computing its complement for
    # 'other' in RunML
    clfs, grid = run_reg_models.define_clfs_params()
    final_weight = float(self.final_weight)
    overunder = [float(i) for i in self.overunder.split()]
    station_names = []
    for item in self.stations.split():
        station_names.append('STA' + "%02d" % int(item))
    # Wrapper loop for running all models
    runs = []
    counter = 0
    for index, clf in enumerate([clfs[x] for x in models_to_run]):
        parameter_values = grid[models_to_run[index]]
        for p in ParameterGrid(parameter_values):
            for station in station_names:
                runs.append(
                    RunMLReg(p, clf,
                             features=self.features,
                             station=station,
                             final_weight=final_weight,
                             schedule=self.schedule,
                             overunder=overunder,
                             model_counter=counter,
                             label=self.label))
                counter += 1
    return runs

def grid_search_cv(datafile, model, grid_params, chunk_size):
    print("Starting batch with chunk size of %d lines" % chunk_size)
    kfold_offsets = load_or_compute_kfold(datafile, CV_N_FOLDS, chunk_size,
                                          KFOLD_OFFSETS_CACHE)
    best_mcc = None
    best_params = None
    for param in list(ParameterGrid(grid_params)):
        model_obj = model()
        model_obj.set_params(**param)
        if best_params is None:
            best_params = model_obj.get_params()
        print("Evaluating model %s" % model_obj.get_params())
        scores = cross_validate(datafile, model_obj, kfold_offsets, chunk_size)
        avg_score = reduce(lambda x, y: x + y, scores) / len(scores)
        print("Average MCC %0.6f" % avg_score)
        if best_mcc is None:
            best_mcc = avg_score
        else:
            if avg_score > best_mcc:
                best_mcc = avg_score
                best_params = model_obj.get_params()
    return best_mcc, best_params

def __init__(self, conf, process_num, stage):
    self.conf = conf
    self.input_layer_dimension = 1024
    self.label_names = conf['label_names']
    self.EF_ratio_list = conf['enrichment_factor']['ratio_list']
    self.process_num = process_num
    self.stage = stage
    if self.stage == 0:
        self.label_names = [self.label_names[0]]

    cnt = 0
    for param in ParameterGrid(conf['params']):
        if cnt != self.process_num:
            cnt += 1
            continue
        self.param = param
        self.n_estimators = param['n_estimators']
        self.max_features = param['max_features']
        self.min_samples_leaf = param['min_samples_leaf']
        self.class_weight = param['class_weight']
        print('Testing set:', param)
        break

    if self.max_features == "None":
        self.max_features = None
    if self.class_weight == "None":
        self.class_weight = None

    self.model_dict = {}
    return

def generateParams():
    params = {'max_features': 'sqrt', 'n_estimators': 1000,
              'learning_rate': 0.01}
    #params = {'kernel' : 'linear' }

    # Set the parameters by cross-validation
    paramaters_grid = {
        'max_depth': [3, 4, 5, 6, 7, 8],
        'min_samples_split': [2, 3, 4, 5, 6, 7],
        'min_samples_leaf': [3, 2, 4, 5, 6, 7],
        'n_estimators': [50, 75, 100, 150, 200, 300, 250],
        'learning_rate': [0.005, 0.01, 0.02, 0.03, 0.04, 0.05]
    }

    # Set the parameters by cross-validation
    #paramaters_grid = {'C': [0.0000001, 0.001, 0.005, 0.008, 0.01, 0.02, 0.05, 0.07, 0.09, 0.1, 0.2, 0.3, 0.4, 0.5, 1, 10, 100, 0.004]};

    paramaters_search = list(ParameterGrid(paramaters_grid))

    parameters_to_try = []
    for ps in paramaters_search:
        params = {'max_features': 'sqrt'}
        for param in ps.keys():
            params[str(param)] = ps[param]
        parameters_to_try.append(copy.copy(params))

    return parameters_to_try

def trainFeatureModel(train, target, model, grid, cv, n_jobs=-1):
    from sklearn.grid_search import ParameterGrid
    from sklearn.metrics import accuracy_score
    from joblib import Parallel, delayed
    from numpy import zeros
    pred = zeros((train.shape[0], target.unique().shape[0]))
    best_score = 0
    best_grid = {}
    for g in ParameterGrid(grid):
        model.set_params(**g)
        if len([True for x in list(g.keys())
                if x.find('nthread') != -1 or x.find('n_jobs') != -1]) > 0:
            results = [fitSklearn(train, target, list(cv), i, model, True)
                       for i in range(cv.n_folds)]
        else:
            results = Parallel(n_jobs=n_jobs)(
                delayed(fitSklearn)(train, target, list(cv), i, model, True)
                for i in range(cv.n_folds))
        for i in results:
            pred[i['index'], :] = i['pred']
        score = accuracy_score(target, pred.argmax(1))
        if score > best_score:
            best_score = score
            best_pred = pred.copy()
            best_grid = g
    print("Best Score: %0.5f" % best_score)
    print("Best Grid:", best_grid)
    model.set_params(**best_grid)
    model.fit(train, target)
    return best_pred, model

def clf_loop(models_to_run, clfs, grid, X, y):
    results_df = pd.DataFrame(columns=('model_type', 'clf', 'parameters',
                                       'auc-roc', 'p_at_5', 'p_at_10',
                                       'p_at_20'))
    for n in range(1, 2):
        # create training and validation sets
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.25, random_state=0)
        for index, clf in enumerate([clfs[x] for x in models_to_run]):
            print(models_to_run[index])
            parameter_values = grid[models_to_run[index]]
            for p in ParameterGrid(parameter_values):
                try:
                    start_time = time.time()
                    clf.set_params(**p)
                    y_pred_probs = clf.fit(
                        X_train, y_train).predict_proba(X_test)[:, 1]
                    # you can also store the model, feature importances, and prediction scores
                    # we're only storing the metrics for now
                    y_pred_probs_sorted, y_test_sorted = zip(
                        *sorted(zip(y_pred_probs, y_test), reverse=True))
                    run_time = time.time() - start_time
                    results_df.loc[len(results_df)] = [
                        models_to_run[index], clf, run_time,
                        roc_auc_score(y_test, y_pred_probs),
                        precision_at_k(y_test_sorted, y_pred_probs_sorted, 5.0),
                        precision_at_k(y_test_sorted, y_pred_probs_sorted, 10.0),
                        precision_at_k(y_test_sorted, y_pred_probs_sorted, 20.0)
                    ]
                    if NOTEBOOK == 1:
                        plot_precision_recall_n(y_test, y_pred_probs, clf)
                except IndexError:
                    print('Error:')
                    continue
    return results_df

def param_search(estimator, param_dict, n_iter=None, seed=None):
    """
    Build cloned copies of `estimator`, each set with parameters as specified
    by `param_dict`. `param_dict` can contain either lists of parameter values
    (grid search) or a scipy distribution function to be sampled from. If
    distributions are given, you must specify `n_iter`.

    Parameters
    ----------
    estimator : sklearn-like estimator
    param_dict : dict of parameter name: values, where values can be an
        iterable or a distribution function
    n_iter : number of draws to take from parameter distributions
    """
    if n_iter is None:
        param_iter = ParameterGrid(param_dict)
    else:
        param_iter = ParameterSampler(param_dict, n_iter, random_state=seed)

    estimators = []
    for params in param_iter:
        new_estimator = sklearn.clone(estimator)
        new_estimator.set_params(**params)
        estimators.append(new_estimator)
    return estimators

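# Usage sketch (not part of the original source): a minimal illustration of
# param_search above, assuming scikit-learn's SVC and a plain list-valued
# grid, so ParameterGrid is used and no n_iter is needed. Two values of C
# times two kernels yields four cloned, configured estimators.
from sklearn.svm import SVC

candidate_models = param_search(SVC(), {"C": [0.1, 1.0],
                                        "kernel": ["linear", "rbf"]})
assert len(candidate_models) == 4
for est in candidate_models:
    print(est.get_params()["C"], est.get_params()["kernel"])
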
def fit(self, X, y):
    assert (self.X_tune is not None)
    if self.verbose:
        print('grid searching...')

    #ret = list((self._evaluateParameters(X, y, params) for params in ParameterGrid(self.searchParameters)))
    ret = self.parallel(
        joblib.delayed(_evaluateParameters)(X, y, self.X_tune, self.y_tune,
                                            self.classifierType,
                                            self.parameters, params,
                                            self.scoring)
        for params in ParameterGrid(self.searchParameters))

    if len(ret):
        bestClassifier, bestScore = max(ret, key=lambda x: x[1])
        if self.verbose:
            for c, s in ret:
                print({i: c.get_params()[i]
                       for i in set(c.get_params()) & set(self.searchParameters)},
                      s)
            print("Best: {} Score: {}".format(
                {i: bestClassifier.get_params()[i]
                 for i in set(bestClassifier.get_params()) &
                 set(self.searchParameters)},
                bestScore))
        self.classifier = bestClassifier
        return bestClassifier
    return None

def test_spectral_coclustering():
    """Test Dhillon's Spectral CoClustering on a simple problem."""
    param_grid = {'svd_method': ['randomized', 'arpack'],
                  'n_svd_vecs': [None, 20],
                  'mini_batch': [False, True],
                  'init': ['k-means++'],
                  'n_init': [10],
                  'n_jobs': [1]}
    random_state = 0
    S, rows, cols = make_biclusters((30, 30), 3, noise=0.5,
                                    random_state=random_state)
    S -= S.min()  # needs to be nonnegative before making it sparse
    S = np.where(S < 1, 0, S)  # threshold some values

    for mat in (S, csr_matrix(S)):
        for kwargs in ParameterGrid(param_grid):
            model = SpectralCoclustering(n_clusters=3,
                                         random_state=random_state,
                                         **kwargs)
            model.fit(mat)

            assert_equal(model.rows_.shape, (3, 30))
            assert_array_equal(model.rows_.sum(axis=0), np.ones(30))
            assert_array_equal(model.columns_.sum(axis=0), np.ones(30))
            assert_equal(consensus_score(model.biclusters_, (rows, cols)), 1)

def test_classification():
    """Check classification for various parameter settings."""
    rng = check_random_state(0)
    X_train, X_test, y_train, y_test = train_test_split(iris.data,
                                                        iris.target,
                                                        random_state=rng)
    grid = ParameterGrid({"max_samples": [0.5, 1.0],
                          "max_features": [1, 2, 4],
                          "bootstrap": [True, False],
                          "bootstrap_features": [True, False]})

    for base_estimator in [None,
                           DummyClassifier(),
                           Perceptron(),
                           DecisionTreeClassifier(),
                           KNeighborsClassifier(),
                           SVC()]:
        for params in grid:
            BaggingClassifier(base_estimator=base_estimator,
                              random_state=rng,
                              **params).fit(X_train, y_train).predict(X_test)

def grid_search(lb_view, model, cv_split_filenames, param_grid):
    """
    Parameters:
        lb_view = A load-balanced IPython.parallel client.
        model = tuple containing in the [0] index the alias name for the
            classifier and in the [1] index the instantiation of the
            classifier itself.
        cv_split_filenames = list of cross-val dataset filenames.
        param_grid = dictionary of all the hyper-parameters for the pipeline
            objects to be trained.

    Output:
        List of parameters and list of asynchronous client task handles.
    """
    all_tasks = []
    all_parameters = list(ParameterGrid(param_grid))

    for i, params in enumerate(all_parameters):
        task_for_params = []
        for j, cv_split_filename in enumerate(cv_split_filenames):
            t = lb_view.apply(compute_evaluation, cv_split_filename, model,
                              params)
            task_for_params.append(t)
        all_tasks.append(task_for_params)

    return all_parameters, all_tasks

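# Companion sketch (not part of the original source): one way the task
# handles returned by grid_search could be collected afterwards. It assumes
# each handle is an IPython.parallel/ipyparallel AsyncResult whose blocking
# .get() returns the scalar validation score produced by compute_evaluation.
import numpy as np

def collect_grid_scores(all_parameters, all_tasks):
    # mean score per parameter combination, blocking on every async task
    mean_scores = [np.mean([task.get() for task in tasks])
                   for tasks in all_tasks]
    best = int(np.argmax(mean_scores))
    return all_parameters[best], mean_scores[best]
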
def get_parameters(self):
    '''
    Returns a list of all possible combinations of parameters.

    :return: list with a dictionary of parameter_name: value
    '''
    specific_params = list(ParameterGrid(self.parameters))
    return specific_params

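# Illustrative sketch (not part of the original source): what the list built
# by get_parameters() looks like for a toy parameter dict. ParameterGrid
# expands the Cartesian product of the value lists; the import path shown is
# the modern sklearn.model_selection one (some snippets above use the older
# sklearn.grid_search module).
from sklearn.model_selection import ParameterGrid

toy_params = {'C': [0.1, 1.0], 'kernel': ['linear']}
print(list(ParameterGrid(toy_params)))
# [{'C': 0.1, 'kernel': 'linear'}, {'C': 1.0, 'kernel': 'linear'}]
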
def clf_loop(models_to_run, clfs, grid, X_train, X_test, y_train, y_test):
    columns = ['classifier', 'parameters', 'pat5']
    temp_results_df = pd.DataFrame(data=np.zeros((0, len(columns))),
                                   columns=columns)
    for index, clf in enumerate([clfs[x] for x in models_to_run]):
        parameter_values = grid[models_to_run[index]]
        for p in ParameterGrid(parameter_values):
            try:
                clf.set_params(**p)
                y_pred_probs = clf.fit(X_train,
                                       y_train).predict_proba(X_test)[:, 1]
                # need to change this to store model and also get feature importances
                # threshold = np.sort(y_pred_probs)[::-1][int(.05*len(y_pred_probs))]
                # print threshold
                # print precision_at_k(y_test,y_pred_probs,.05)
                # plot_precision_recall_n(y_test,y_pred_probs,clf)
                temp_results_df.loc[len(temp_results_df)] = [
                    models_to_run[index], clf,
                    precision_at_k(y_test, y_pred_probs, .05)
                ]
            except IndexError as e:
                print('Error:', e)
                continue
    return temp_results_df

def tune_parameters(estimator, name, param_grid, X, y, cv):
    info('Tuning parameters for %s model' % name)
    grid_iterable = ParameterGrid(param_grid)
    info('Fitting {0} folds for each of {1} candidates, totalling {2} '
         'fits'.format(len(cv), len(grid_iterable),
                       len(cv) * len(grid_iterable)))
    best_score, best_params = None, None
    for i, grid in enumerate(grid_iterable):
        estimator.set_params(**grid)
        info('Params: %s' % grid)
        mean_score, opt_n_estimators = cross_validation(estimator, X, y, cv,
                                                        True)
        if isinstance(estimator, xgb.XGBClassifier):
            grid['n_estimators'] = opt_n_estimators
        if (best_score is None) or (best_score > mean_score):
            best_score, best_params = mean_score, grid
    info('Best parameters: %s, best score: %.5f' % (best_params, best_score))
    info('Parameters are tuned for %s model' % name)
    return best_params, best_score

def magic_loop(models_to_run, clfs, grid, X, y):
    '''
    Takes a list of models to use, two dictionaries of classifiers and
    parameters, and an array X.
    Finds the ten models with the best precision at 5 percent recall.
    '''
    table = {}
    top = []
    for i in range(10):
        top.append((0, " "))
    heapq.heapify(top)
    k = 0.05
    for n in range(1, 2):
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=0)
        for index, clf in enumerate([clfs[x] for x in models_to_run]):
            for p in ParameterGrid(grid[models_to_run[index]]):
                try:
                    clf.set_params(**p)
                    print(clf)
                    y_pred_probs = clf.fit(
                        X_train, y_train).predict_proba(X_test)[:, 1]
                    plot_precision_recall_n(y_test, y_pred_probs, clf)
                    l = scoring(k, y_test, y_pred_probs)
                    m, s = top[0]
                    prec = l['precision']
                    if prec > m:
                        heapq.heapreplace(top, (prec, clf))
                    table[str(clf)] = l
                except:
                    print('Error:')
                    continue
    print(top)
    return top, table

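# Companion sketch (not part of the original source): the top-10 bookkeeping
# in magic_loop is a fixed-size min-heap, where heapreplace evicts the current
# minimum whenever a better (score, model) pair appears. The helper below
# isolates that pattern with hypothetical (score, label) pairs.
import heapq

def keep_top_k(scored_items, k=10):
    top = [(0, " ") for _ in range(k)]  # same sentinel entries as magic_loop
    heapq.heapify(top)
    for score, label in scored_items:
        if score > top[0][0]:  # top[0] always holds the smallest kept score
            heapq.heapreplace(top, (score, label))
    return sorted(top, reverse=True)

# keep_top_k([(0.3, 'a'), (0.9, 'b'), (0.5, 'c')], k=2) -> [(0.9, 'b'), (0.5, 'c')]
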
def __init__(self, estimator, param_grid, scoring=None, cv=4,
             refit=True, verbose=False, population_size=50,
             mutation_prob=0.10, tournament_size=3,
             generations_number=10, n_jobs=1, iid=True,
             pre_dispatch='2*n_jobs', error_score='raise',
             fit_params=None):
    super(EvolutionaryAlgorithmSearchCV, self).__init__(
        estimator, scoring, fit_params, n_jobs, iid, refit, cv,
        pre_dispatch, error_score)
    _check_param_grid(param_grid)
    self.param_grid = param_grid
    self.possible_params = list(ParameterGrid(self.param_grid))
    self.individual_size = int(ceil(log(len(self.possible_params), 2)))
    self.population_size = population_size
    self.generations_number = generations_number
    self.best_estimator_ = None
    self.best_score_ = None
    self.best_params_ = None
    self._individual_evals = {}
    self.mutation_prob = mutation_prob
    self.tournament_size = tournament_size

def __init__(self, params, log_files_path, output_dim=1,
             output_activation=None, batch_size=16, epoch_count=1,
             score_func='r2_score', patience=150, class_mode='categorical',
             dropout=0.5, loss='categorical_crossentropy',
             default_activation='relu', default_optimizer=Adadelta()):
    self.epoch_count = epoch_count
    self.batch_size = batch_size
    self.log_files_path = log_files_path
    self.createDirectoryIfNotExist(self.log_files_path)
    self.param_grid = ParameterGrid(params)
    self.patience = patience
    self.dropout = dropout
    self.score_func = score_func
    self.output_dim = output_dim
    self.output_activation = output_activation
    self.default_activation = default_activation
    self.default_optimizer = default_optimizer
    self.loss = loss
    self.class_mode = class_mode

def grid_search_param_model(model_name, grid_search_param_dict, X, y,
                            data_process_param_dict):
    param_grid = list(ParameterGrid(grid_search_param_dict))
    if model_name == "tree":
        for param in param_grid:
            tree_clf = DecisionTreeClassifier(
                max_depth=param.get("max_depth", None))
            param.update({'model': model_name})
            print(param)
            param.update(data_process_param_dict)
            cross_validate_model_param_v2(tree_clf, param, dataset=X, label=y)
    elif model_name == "adaboosting":
        for param in param_grid:
            tree_clf = DecisionTreeClassifier(
                max_depth=param.get("max_depth", None))
            ada_boost_clf = AdaBoostClassifier(
                base_estimator=tree_clf,
                n_estimators=param.get("n_estimators", None),
                learning_rate=param.get("learning_rate", None))
            param.update({'model': model_name})
            print(param)
            param.update(data_process_param_dict)
            cross_validate_model_param_v2(ada_boost_clf, param, dataset=X,
                                          label=y)
    elif model_name == 'linearsvc':
        for param in param_grid:
            print("%s" % param)
            param.update({'model': model_name})
            linsvc_clf = LinearSVC(C=param.get("C", None))
            if "class_weight" in param.keys():
                class_weight = param['class_weight']
                class_weight_key = class_weight.keys()
                class_weight_key = [str(key) for key in class_weight_key]
                class_weight_value = class_weight.values()
                class_weight = dict(zip(class_weight_key, class_weight_value))
                param.update({'class_weight': class_weight})
            param.update(data_process_param_dict)
            cross_validate_model_param_v2(linsvc_clf, param, dataset=X,
                                          label=y)
    elif model_name == "random_forest":
        for param in param_grid:
            print("%s" % param)
            param.update({'model': model_name})
            rf_clf = RandomForestClassifier(
                n_estimators=param.get("n_estimators", None),
                max_depth=param.get("max_depth"))
            param.update(data_process_param_dict)
            cross_validate_model_param_v2(rf_clf, param, dataset=X, label=y)
    elif model_name == "svm":
        for param in param_grid:
            print("%s" % param)
            param.update({'model': model_name})
            svm_clf = SVC(C=param.get("C"), gamma=param.get("gamma"))
            param.update(data_process_param_dict)
            cross_validate_model_param_v2(svm_clf, param, dataset=X, label=y)