def create_project(params_e_pre, params_e_post, params_g_pre, params_g_post,
                   baseline_period_start_date, baseline_period_end_date,
                   reporting_period_start_date, reporting_period_end_date,
                   has_electricity, has_gas, weather_source, zipcode):
    """Assemble a Project holding generated consumption records.

    Billing datetimes are generated for two spans: baseline start up to the
    reporting start, and then from the last generated baseline datetime up to
    the reporting end. Consumption records are generated for electricity
    and/or natural gas depending on ``has_electricity`` / ``has_gas``.

    Returns:
        Project built from the location, the list of generated consumption
        records, and the baseline/reporting periods given by the four dates.
    """
    electricity_model = AverageDailyTemperatureSensitivityModel(heating=True,
                                                                cooling=True)
    gas_model = AverageDailyTemperatureSensitivityModel(heating=True,
                                                        cooling=False)

    # Billing boundaries used for generation: the "pre" span runs up to the
    # reporting start, and the "post" span begins at the last "pre" datetime.
    pre_span = Period(baseline_period_start_date, reporting_period_start_date)
    datetimes_pre = generate_monthly_billing_datetimes(
        pre_span, dist=randint(29, 31))
    post_span = Period(datetimes_pre[-1], reporting_period_end_date)
    datetimes_post = generate_monthly_billing_datetimes(
        post_span, dist=randint(29, 31))

    location = Location(zipcode=zipcode)

    # The periods stored on the project use the dates exactly as supplied.
    baseline_period = Period(baseline_period_start_date,
                             baseline_period_end_date)
    reporting_period = Period(reporting_period_start_date,
                              reporting_period_end_date)

    fuel_specs = (
        (has_electricity, electricity_model, params_e_pre, params_e_post,
         "electricity", "kWh"),
        (has_gas, gas_model, params_g_pre, params_g_post,
         "natural_gas", "therm"),
    )
    consumptions = [
        generate_consumption_records(model, pre, post, datetimes_pre,
                                     datetimes_post, fuel, unit,
                                     weather_source)
        for enabled, model, pre, post, fuel, unit in fuel_specs
        if enabled
    ]

    return Project(location, consumptions, baseline_period, reporting_period)
def Gradient(self):
    """Tune a GradientBoostingRegressor by randomized search.

    Reads ``self.X_train``, ``self.y_train``, ``self.loss``,
    ``self.max_depth_max``, ``self.n_estimators_max`` and ``self.n_iter``.
    The best estimator found by the search is stored in ``self.boost_reg``.
    """
    X_train, y_train = self.X_train, self.y_train
    parameters_boost = {
        'max_depth': randint(3, self.max_depth_max + 1),
        'n_estimators': randint(80, 100 + self.n_estimators_max),
    }
    # NOTE(review): the source had an empty argument slot (",,") after
    # param_distributions and the fit call was cut off. An argument (possibly
    # cv=...) may have been lost here -- confirm against project history.
    boost_reg = RandomizedSearchCV(
        GradientBoostingRegressor(loss=self.loss),
        param_distributions=parameters_boost,
        n_iter=self.n_iter,
        n_jobs=-1,
    )
    boost_reg.fit(X_train, y_train)
    self.boost_reg = boost_reg.best_estimator_
def get_param_grid(cur_model, points, rand):
    """Return the hyper-parameter search space for one model.

    Args:
        cur_model: key naming the model; one of ``'SVC()'``,
            ``'KNeighborsClassifier()'`` or ``'LogisticRegression()'``.
        points: unused; kept so existing callers keep working.
        rand: when falsy, return a list of explicit grids; when truthy,
            return a dict whose values are lists or scipy distributions.

    Returns:
        The grid (list of dicts) or distribution dict for ``cur_model``, or
        ``None`` (after printing a message) when it cannot be produced, e.g.
        for an unknown ``cur_model``.
    """
    print('\nRetrieving parameter grid...')
    try:
        c_range = 10.0 ** np.arange(-2, 3)
        gamma_range = [0, .01, .1, .3]
        neighbor_range = np.arange(2, 17, step=5)
        leaf_range = np.arange(10, 60, step=5)

        # NOTE(review): 'auto' for class_weight was renamed 'balanced' in
        # newer scikit-learn releases; kept unchanged here -- confirm the
        # scikit-learn version this project targets.
        if not rand:
            grid_params = {
                'SVC()': [
                    {'C': c_range, 'kernel': ['poly'],
                     'degree': [3, 5, 8], 'gamma': gamma_range,
                     'probability': [True],
                     'class_weight': ['auto', None]},
                    {'C': c_range, 'kernel': ['rbf', 'sigmoid'],
                     'gamma': gamma_range, 'probability': [True],
                     'class_weight': ['auto', None]},
                    {'C': c_range, 'kernel': ['linear'],
                     'random_state': [10], 'probability': [True],
                     'class_weight': ['auto', None]},
                ],
                'KNeighborsClassifier()': [
                    {'n_neighbors': neighbor_range, 'weights': ['uniform'],
                     'algorithm': ['brute'],
                     'metric': ['euclidean', 'manhattan']},
                    {'n_neighbors': neighbor_range, 'weights': ['uniform'],
                     'algorithm': ['ball_tree', 'kd_tree'],
                     'metric': ['euclidean', 'manhattan'],
                     'leaf_size': leaf_range},
                ],
                'LogisticRegression()': [
                    {'penalty': ['l1', 'l2'], 'C': c_range,
                     'class_weight': [None, 'auto']},
                ],
            }
            return grid_params[cur_model]

        rand_params = {
            'SVC()': {
                'C': stats.expon(scale=300),
                'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
                'degree': [3, 4, 5, 6, 7, 8],
                # 1.0 / 3 rather than 1/3: integer division under Python 2
                # would give scale=0 and a degenerate distribution.
                'gamma': stats.expon(scale=1.0 / 3),
                'random_state': [10],
                'probability': [True],
                'class_weight': ['auto', None],
            },
            'KNeighborsClassifier()': {
                'n_neighbors': stats.randint(low=2, high=20),
                'weights': ['uniform', 'distance'],
                'algorithm': ['ball_tree', 'kd_tree', 'brute'],
                'metric': ['euclidean', 'manhattan'],
                'leaf_size': stats.randint(low=10, high=60),
            },
            'LogisticRegression()': {
                'penalty': ['l1', 'l2'],
                'C': stats.expon(scale=300),
                'class_weight': [None, 'auto'],
            },
        }
        return rand_params[cur_model]
    except Exception:
        # Best-effort lookup: report and return None, but no longer swallow
        # KeyboardInterrupt / SystemExit as the bare ``except:`` did.
        print('could not get parameter grid')
        return None
def train(array, embedDim, interval):
    """Tune one GBRT per output column, then forecast iteratively.

    A randomized search is run once per column of the training targets
    built with step 1. The tuned estimators are then refitted for steps
    1..11, and each step's predictions are appended to ``array`` so that the
    next step trains on them.

    Returns:
        ``array[0, -65:-5]`` taken from the extended array.
    """
    XTrain, yTrain = pp.makeTrainset(array, embedDim, interval, 1)
    kfold = cross_validation.KFold(len(XTrain), n_folds=5, shuffle=False)
    params = {'n_estimators': randint(20, 200),
              'loss': ['ls', 'lad', 'huber'],
              'learning_rate': uniform(0.01, 0.19),
              'subsample': uniform(0.5, 0.5),
              'max_depth': randint(1, 5),
              'min_samples_split': randint(1, 3),
              'min_samples_leaf': randint(1, 3),
              'max_features': randint(1, len(XTrain[0]))}
    bestModels = []
    for i in range(len(yTrain[0])):
        gbrt = GradientBoostingRegressor()
        clf = grid_search.RandomizedSearchCV(
            gbrt, param_distributions=params, n_iter=20,
            scoring='mean_squared_error', cv=kfold, n_jobs=-1)
        # The fit call was truncated in the source (", yTrain[:, i])" was all
        # that remained); restored as the fit of column i.
        clf.fit(XTrain, yTrain[:, i])
        bestModels.append(clf.best_estimator_)

    for i in range(1, 12):
        # The number of days predicted by the model grows with i.
        XTrain, yTrain = pp.makeTrainset(array, embedDim, interval, i)
        # The input to predict from grows with i as well.
        XPredict = pp.makeXPredict(array, embedDim, interval, i)
        subyPredict = []
        for j in range(len(yTrain[0])):
            bestModels[j].fit(XTrain, yTrain[:, j])
            subyPredict.append(bestModels[j].predict(XPredict))
        # Treat this model's predictions as known data when training the next.
        array = np.hstack((array, np.array(copy(subyPredict))))

    # 66 days can be predicted in total; keep the matching slice.
    yPredict = array[0, -65:-5]
    return yPredict
def train(array, embedDim, interval):
    """Tune one ExtraTreesRegressor per output column, then forecast.

    A randomized search is run once per column of the training targets. The
    tuned estimators are then refitted 60 times; after every round the
    predictions are appended to ``array`` and ``embedDim`` is incremented.

    Returns:
        ``array[0, -60:]`` taken from the extended array.
    """
    XTrain, yTrain = pp.makeTrainset(array, embedDim, interval, 1)
    kfold = cross_validation.KFold(len(XTrain), n_folds=4, shuffle=False)
    params = {"n_estimators": randint(5, 100),
              "max_depth": [1, 2, 3, 5, 8, 10, None],
              "max_features": randint(1, len(XTrain[0])),
              "min_samples_split": randint(1, 3),
              "min_samples_leaf": randint(1, 3)}
    bestModels = []
    for i in range(len(yTrain[0])):
        erf = ExtraTreesRegressor()
        clf = grid_search.RandomizedSearchCV(
            erf, param_distributions=params, n_iter=10,
            scoring='mean_squared_error', cv=kfold, n_jobs=-1)
        # The fit call was truncated in the source (", yTrain[:, i])" was all
        # that remained); restored as the fit of column i.
        clf.fit(XTrain, yTrain[:, i])
        bestModels.append(clf.best_estimator_)

    for i in range(60):
        # The embedding dimension of the model grows every round.
        XTrain, yTrain = pp.makeTrainset(array, embedDim, interval, 1)
        # The embedding dimension of the input to predict grows as well.
        XPredict = pp.makeXPredict(array, embedDim, interval, 1)
        subyPredict = []
        for j in range(len(yTrain[0])):
            bestModels[j].fit(XTrain, yTrain[:, j])
            subyPredict.append(bestModels[j].predict(XPredict))
        # Treat this model's predictions as known data when training the next.
        array = np.hstack((array, np.array(copy(subyPredict))))
        embedDim += 1

    # 60 days can be predicted in total; keep the matching slice.
    yPredict = array[0, -60:]
    return yPredict
def decisiontree_param(self, method='grid'):
    """Return the search space for the PCA + DecisionTreeClassifier pipeline.

    With ``method='random'`` three integer distributions (leaf size, leaf
    count and depth of the tree) are added to the fixed candidate lists;
    any other value returns the candidate lists only.
    """
    search_space = dict([
        ('selector__pca__svd_solver', ['full', 'arpack', 'randomized']),
        ('selector__pca__whiten', [True, False]),
        ('DecisionTreeClassifier__criterion', ['gini', 'entropy']),
        ('DecisionTreeClassifier__splitter', ['best', 'random']),
        ('DecisionTreeClassifier__max_features', ['sqrt', 'log2', None]),
    ])
    if method != 'random':
        return search_space

    # Randomized search additionally samples the tree-size controls.
    search_space.update({
        'DecisionTreeClassifier__min_samples_leaf': randint(1, 20),
        'DecisionTreeClassifier__max_leaf_nodes': randint(2, 20),
        'DecisionTreeClassifier__max_depth': randint(1, 20),
    })
    return search_space
def test_large_grid():
    """Exercise a randomized search over a scaler/PCA/RandomForest pipeline.

    Checks that scoring fails before fitting, then fits the search on the
    module-level ``X_train`` / ``y_train``, scores and predicts with it, and
    runs ``report_grid_score_detail`` with both invalid and valid arguments.

    NOTE(review): the original docstring said the test asserts that test
    error far exceeds train error, but no such assertion exists in the body;
    the accuracy scores are computed and discarded.
    """
    if not SK18:
        custom_cv = KFold(n=y_train.shape[0], n_folds=3, shuffle=True,
                          random_state=42)
    else:
        custom_cv = KFold(n_splits=3, shuffle=True, random_state=42)

    # define the pipe
    pipe = Pipeline([
        ('scaler', SelectiveScaler()),
        ('pca', SelectivePCA(weight=True)),
        ('rf', RandomForestClassifier(random_state=42))
    ])

    # define hyper parameters
    hp = {
        'scaler__scaler': [StandardScaler(), RobustScaler(), MinMaxScaler()],
        'pca__whiten': [True, False],
        'pca__weight': [True, False],
        'pca__n_components': uniform(0.75, 0.15),
        'rf__n_estimators': randint(5, 10),
        'rf__max_depth': randint(5, 15)
    }

    # define the grid
    grid = RandomizedSearchCV(pipe, hp, n_iter=2, scoring='accuracy',
                              n_jobs=1, cv=custom_cv, random_state=42)

    # this will fail because we haven't fit yet
    assert_fails(grid.score, (ValueError, AttributeError), X_train, y_train)

    # fit the grid (the call was truncated in the source; only ", y_train)"
    # remained after the comment)
    grid.fit(X_train, y_train)

    # score for coverage -- this might warn...
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        grid.score(X_train, y_train)

    # coverage:
    assert grid._estimator_type == 'classifier'

    # get predictions
    tr_pred, te_pred = grid.predict(X_train), grid.predict(X_test)

    # evaluate score (SHOULD be better than random...)
    accuracy_score(y_train, tr_pred), accuracy_score(y_test, te_pred)

    # grid score reports:
    # assert fails for bad percentile
    assert_fails(report_grid_score_detail, ValueError,
                 **{'random_search': grid, 'percentile': 0.0})
    assert_fails(report_grid_score_detail, ValueError,
                 **{'random_search': grid, 'percentile': 1.0})

    # assert fails for bad y_axis
    assert_fails(report_grid_score_detail, ValueError,
                 **{'random_search': grid, 'y_axis': 'bad_axis'})

    # assert passes otherwise; just ensure percentile works
    report_grid_score_detail(grid, charts=True, percentile=0.95)
def get_random_forest():
    """Return a one-step random-forest pipeline and its search space.

    Returns:
        A ``(pipeline, meta_dict)`` tuple: the pipeline has a single step
        named ``'RF'``, and ``meta_dict`` maps ``RF__``-prefixed parameter
        names to candidate lists or integer distributions.
    """
    forest = sklearn.ensemble.RandomForestClassifier(
        max_features=None,
        oob_score=False,
        n_jobs=1,
    )
    steps = [('RF', forest)]
    pipeline = sklearn.pipeline.Pipeline(steps)

    meta_dict = {}
    meta_dict['RF__n_estimators'] = stats.randint(5, 100)
    meta_dict['RF__max_features'] = ['sqrt', 'log2', 'auto', None]
    meta_dict['RF__max_depth'] = stats.randint(2, 10)
    return pipeline, meta_dict
def Extra(self):
    """Tune an ExtraTreesRegressor by randomized search.

    Reads ``self.X_train``, ``self.y_train``, ``self.n_iter`` and the
    ``*_max`` bounds used below. The best estimator found by the search is
    stored in ``self.extra_reg``.
    """
    parameters_extra = {
        'max_depth': randint(1, self.max_depth_max + 1),
        "bootstrap": [True, False],
        'min_samples_split': randint(1, self.min_samples_split_max + 1),
        "min_samples_leaf": randint(1, self.min_samples_leaf_max + 1),
        'n_estimators': randint(20, 20 + self.n_estimators_max),
    }
    X_train, y_train = self.X_train, self.y_train
    # NOTE(review): the source had an empty argument slot (",,") after
    # param_distributions and the fit call was cut off. An argument (possibly
    # cv=...) may have been lost here -- confirm against project history.
    extra_reg = RandomizedSearchCV(
        ExtraTreesRegressor(),
        param_distributions=parameters_extra,
        n_iter=self.n_iter,
        n_jobs=-1,
    )
    extra_reg.fit(X_train, y_train)
    self.extra_reg = extra_reg.best_estimator_
def do_train_rand(X, y, params=None, n_iter=50):
    """Run a randomized hyper-parameter search for a GradientBoostingClassifier.

    Args:
        X: training features passed to ``fit``.
        y: training targets passed to ``fit``.
        params: search space; when falsy a built-in default is used.
        n_iter: number of parameter settings to sample.

    Returns:
        The fitted ``RandomizedSearchCV`` object.
    """
    if not params:
        params = {'n_estimators': stats.randint(40, 90),
                  'max_depth': stats.randint(20, 40),
                  'min_samples_leaf': stats.randint(80, 110),
                  'max_features': ['auto', 'sqrt']}
    clf = RandomizedSearchCV(GradientBoostingClassifier(), params,
                             n_iter=n_iter, scoring=do_test, n_jobs=15,
                             verbose=1, cv=4)
    # The fit call was truncated in the source (only ", y)" remained).
    clf.fit(X, y)
    return clf
def RandomFo(self):
    """Tune a RandomForestRegressor by randomized search.

    Reads ``self.X_train``, ``self.y_train``, ``self.n_iter`` and the
    ``*_max`` bounds used below. The best estimator found by the search is
    stored in ``self.forest_reg``.
    """
    parameters_forest = {
        'max_depth': randint(1, self.max_depth_max + 1),
        "bootstrap": [True, False],
        'min_samples_split': randint(1, self.min_samples_split_max + 1),
        "min_samples_leaf": randint(1, self.min_samples_leaf_max + 1),
        "max_features": randint(1, self.max_features_max),
        'n_estimators': randint(15, self.n_estimators_max),
    }
    X_train, y_train = self.X_train, self.y_train
    # NOTE(review): the source had an empty argument slot (",,") after
    # param_distributions and the fit call was cut off. An argument (possibly
    # cv=...) may have been lost here -- confirm against project history.
    forest_reg = RandomizedSearchCV(
        RandomForestRegressor(),
        param_distributions=parameters_forest,
        n_iter=self.n_iter,
        n_jobs=-1,
    )
    forest_reg.fit(X_train, y_train)
    self.forest_reg = forest_reg.best_estimator_
def simple_tree(self):
    """Tune a single DecisionTreeRegressor by randomized search.

    ``self.n_iter`` is the number of iterations of the randomized
    cross-validated search. The best estimator found is stored in
    ``self.tree_reg``.
    """
    parameters_tree = {
        'max_depth': randint(1, self.max_depth_max + 1),
        'min_samples_split': randint(1, self.min_samples_split_max + 1),
        'min_samples_leaf': randint(1, self.min_samples_leaf_max + 1),
        'max_leaf_nodes': randint(2, self.max_leaf_nodes_max),
        "max_features": randint(1, self.max_features_max),
    }
    # NOTE(review): the source had an empty argument slot (",,") after
    # param_distributions and the fit call was cut off. The original comment
    # described a "cv" setting (number of cross-validation steps), so a
    # cv=... argument was presumably lost here -- confirm.
    tree_reg = RandomizedSearchCV(
        DecisionTreeRegressor(),
        param_distributions=parameters_tree,
        n_iter=self.n_iter,
        n_jobs=-1,
    )
    X_train, y_train = self.X_train, self.y_train
    tree_reg.fit(X_train, y_train)
    self.tree_reg = tree_reg.best_estimator_
def train(XTrain, yTrain, XPredict):
    """Tune a RandomForestRegressor by randomized search and predict.

    Returns:
        A ``(yPredict, best_params)`` tuple: the predictions of the fitted
        search for ``XPredict`` and the best parameter setting it found.
    """
    params = {"n_estimators": randint(5, 100),
              "max_depth": [1, 2, 3, 5, 10, None],
              "max_features": randint(1, len(XTrain[0])),
              "min_samples_split": randint(1, 3),
              "min_samples_leaf": randint(1, 3)}
    rf = RandomForestRegressor()
    kfold = cross_validation.KFold(len(XTrain), n_folds=3, shuffle=False)
    clf = grid_search.RandomizedSearchCV(
        rf, param_distributions=params, n_iter=30,
        scoring='mean_squared_error', cv=kfold, n_jobs=-1)
    # The fit call was truncated in the source (only ", yTrain)" remained).
    clf.fit(XTrain, yTrain)
    yPredict = clf.predict(XPredict)
    return yPredict, clf.best_params_
def _get_random_params(model_name):
    """Return some random model parameters to search over.

    Args:
    ----
        model_name: str
            One of 'logit', 'random_forest', 'extra_trees', 'gboosting'
            or 'xgboost'.

    Return:
    ------
        param_dct: dct
            Maps parameter names to candidate lists or scipy distributions.

    Raises:
    ------
        ValueError: if `model_name` is not one of the names above. (The
            previous code fell through to the return and failed with an
            UnboundLocalError instead.)
    """
    if model_name == 'logit':
        param_dct = {'penalty': ['l1', 'l2'],
                     'C': scs.uniform(0.00001, 0.0099)}
    elif model_name in ('random_forest', 'extra_trees'):
        # Both tree ensembles share the same search space.
        param_dct = {'n_estimators': scs.randint(400, 1200),
                     'max_depth': scs.randint(2, 32)}
    elif model_name == 'gboosting':
        param_dct = {'n_estimators': scs.randint(400, 1200),
                     'learning_rate': scs.uniform(0.001, 0.099),
                     'max_depth': scs.randint(1, 8),
                     'max_features': scs.uniform(0.5, 0.5),
                     'subsample': scs.uniform(0.5, 0.5)}
    elif model_name == 'xgboost':
        param_dct = {'learning_rate': scs.uniform(0.001, 0.099),
                     'n_estimators': scs.randint(400, 1200),
                     'max_depth': scs.randint(1, 8),
                     'subsample': scs.uniform(0.5, 0.5),
                     'colsample_bytree': scs.uniform(0.5, 0.5)}
    else:
        raise ValueError(
            'Unknown model_name: {!r}'.format(model_name))

    return param_dct
def test_generate_monthly_billing_datetimes():
    """Check generated billing datetimes at selected indices.

    Two distributions are used over the same one-year period:
    ``randint(30, 31)`` and ``randint(1, 2)``.
    """
    period = Period(datetime(2012, 1, 1), datetime(2013, 1, 1))

    datetimes_30d = generate_monthly_billing_datetimes(period,
                                                       randint(30, 31))
    expected_30d = (
        (0, datetime(2012, 1, 1)),
        (1, datetime(2012, 1, 31)),
        (11, datetime(2012, 11, 26)),
        (12, datetime(2012, 12, 26)),
    )
    for index, expected in expected_30d:
        assert datetimes_30d[index] == expected

    datetimes_1d = generate_monthly_billing_datetimes(period, randint(1, 2))
    expected_1d = (
        (0, datetime(2012, 1, 1)),
        (1, datetime(2012, 1, 2)),
        (330, datetime(2012, 11, 26)),
        (331, datetime(2012, 11, 27)),
    )
    for index, expected in expected_1d:
        assert datetimes_1d[index] == expected
def test_random_grid():
    """Fit a randomized search over a long preprocessing pipeline.

    Splits the data, builds an eleven-step pipeline ending in a
    RandomForestClassifier, runs a two-iteration randomized search over it
    and produces the grid-score report.
    """
    # get our train/test
    # NOTE(review): the second argument of train_test_split was missing in
    # the source ("train_test_split(X,, train_size=...)"). ``y`` below is a
    # placeholder for the target array -- replace it with the project's real
    # target name before merging.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.75, random_state=42)

    # default CV does not shuffle, so we define our own
    custom_cv = KFold(n=y_train.shape[0], n_folds=5, shuffle=True,
                      random_state=42)

    # build a pipeline
    pipe = Pipeline([
        ('retainer', FeatureRetainer()),  # will retain all
        ('dropper', FeatureDropper()),  # won't drop any
        ('mapper', FunctionMapper()),  # pass through
        # no object dtypes, so will pass through
        ('encoder', OneHotCategoricalEncoder()),
        ('collinearity', MulticollinearityFilterer(threshold=0.85)),
        ('imputer', SelectiveImputer()),  # pass through
        ('scaler', SelectiveScaler()),
        ('boxcox', BoxCoxTransformer()),
        ('nzv', NearZeroVarianceFilterer(threshold=1e-4)),
        ('pca', SelectivePCA(n_components=0.9)),
        ('model', RandomForestClassifier(n_jobs=1))
    ])

    # let's define a set of hyper-parameters over which to search
    hp = {
        'collinearity__threshold': uniform(loc=.8, scale=.15),
        'collinearity__method': ['pearson', 'kendall', 'spearman'],
        'scaler__scaler': [StandardScaler(), RobustScaler()],
        'pca__n_components': uniform(loc=.75, scale=.2),
        'pca__whiten': [True, False],
        'model__n_estimators': randint(5, 100),
        'model__max_depth': randint(2, 25),
        'model__min_samples_leaf': randint(1, 15),
        'model__max_features': uniform(loc=.5, scale=.5),
        'model__max_leaf_nodes': randint(10, 75)
    }

    # define the gridsearch
    search = RandomizedSearchCV(pipe, hp,
                                n_iter=2,  # just to test it even works
                                scoring='accuracy',
                                cv=custom_cv,
                                random_state=42)

    # fit the search (the call was truncated in the source; only
    # ", y_train)" remained after the comment)
    search.fit(X_train, y_train)

    # test the report
    the_report = report_grid_score_detail(search, charts=False)
def RandomFo(self):
    """Tune a RandomForestRegressor over tree count and bootstrap only.

    Reads ``self.X_train``, ``self.y_train``, ``self.n_estimators_max`` and
    ``self.n_iter``. The best estimator found by the search is stored in
    ``self.forest_reg``.
    """
    parameters_forest = {
        'n_estimators': randint(10, self.n_estimators_max),
        "bootstrap": [True, False],
    }
    X_train, y_train = self.X_train, self.y_train
    # NOTE(review): the source had an empty argument slot (",,") after
    # param_distributions and the fit call was cut off. An argument (possibly
    # cv=...) may have been lost here -- confirm against project history.
    forest_reg = RandomizedSearchCV(
        RandomForestRegressor(),
        param_distributions=parameters_forest,
        n_iter=self.n_iter,
        n_jobs=-1,
    )
    forest_reg.fit(X_train, y_train)
    self.forest_reg = forest_reg.best_estimator_
def do_train_rand(train, valid, params=None, max_models=32):
    """Do randomized hyper-parameter search

    Args:
        train (SFrame): training set
        valid (SFrame): validation set
        params (dict): parameters for random search
        max_models (int): maximum number of models to run
    Returns:
        res (SFrame): table of choices of parameters sorted by valid RMSE,
            or None if the search failed
    """
    if not params:
        params = {'user_id': ['username'],
                  'item_id': ['course_id'],
                  'target': ['label'],
                  'binary_target': [True],
                  'num_factors': stats.randint(4, 128),
                  'regularization': stats.expon(scale=1e-4),
                  'linear_regularization': stats.expon(scale=1e-7)}

    # Bound before the try so the error handler can tell whether the job was
    # ever created; previously a failure in create() made the handler itself
    # fail with a NameError.
    job = None
    try:
        job = gl.toolkits.model_parameter_search.random_search.create(
            (train, valid),
            gl.recommender.factorization_recommender.create,
            params,
            max_models=max_models)
        res = job.get_results()
        res = res.sort('validation_rmse')
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print('Best params for random search are: {}'.format(res[0]))
        # NOTE(review): the call name was truncated in the source (only
        # "'rand_search.csv', format='csv')" remained); restored as
        # res.save(...) -- confirm.
        res.save('rand_search.csv', format='csv')
    except Exception:
        if job is not None:
            print(job.get_metrics())
        res = None
    return res
def get_available_resource(self, random = False):
    """Find the index of a resource that has an available station.

    *Arguments*
        ``random`` (Boolean)
            When True, pick one of the available indices at random;
            otherwise return the lowest available index.

    *Returns:*
        The index of the chosen resource, or ``None`` when no resource
        reports an available station.
    """
    # A resource counts as available when its get_available_station()
    # returns anything other than None.
    candidates = [
        position
        for position in range(self.num_resources)
        if self[position].get_available_station() is not None
    ]

    if not candidates:
        return None
    if random:
        last = len(candidates) - 1
        return candidates[randint(0, last)]
    return candidates[0]
def make_data(self):
    """
    Create a generator for building the data.

    For each of ``self.niter`` iterations one parameter set is sampled from
    ``self.param_distributions`` and its values are yielded one at a time,
    in the order of ``self.keys``.
    """

    # Build a logging format suitable for sorting
    log_dir = self.param_distributions['log_dir']
    log_fmt = '{{0:0{0}d}}-{{1:0{1}d}}'.format(len(str(self.niter)),
        len(str(self.nsplits)))

    # Create the parameters for each instance
    for i in xrange(1, self.niter + 1):
        # Make the params
        ncolumns = self.param_distributions['ncolumns'].rvs()
        param = {'ncolumns': ncolumns}

        ####
        # Ensure that parameters make sense
        ####

        # Compute nactive as a function of ncolumns
        # Ensure that nactive is bounded by (0, ncolumns)
        nactive = int(self.param_distributions['nactive'].rvs() * ncolumns)
        while ((nactive == 0) or (nactive == ncolumns)):
            nactive = int(self.param_distributions['nactive'].rvs() *
                ncolumns)
        param['nactive'] = nactive

        # Ensure that each input is seen at least once: resample nsynapses
        # until the integer part of p * ninputs is no longer positive.
        nsynapses = self.param_distributions['nsynapses'].rvs()
        p = prob.p_c(ncolumns, nsynapses, self.ninputs)
        e = int(p * self.ninputs)
        while e > 0:
            nsynapses = self.param_distributions['nsynapses'].rvs()
            p = prob.p_c(ncolumns, nsynapses, self.ninputs)
            e = int(p * self.ninputs)
        param['nsynapses'] = nsynapses

        # Compute seg_th as a function of nsynapses
        seg_th = int(self.param_distributions['seg_th'].rvs() * nsynapses)
        param['seg_th'] = seg_th

        # Make a useful log directory
        param['log_dir'] = os.path.join(log_dir, log_fmt.format(i, 1))

        ####
        # Add all other parameters
        ####

        added = set(param.keys())
        missing = [key for key in self.keys if key not in added]
        for key in missing:
            choices = self.param_distributions[key]
            if hasattr(choices, 'rvs'):
                param[key] = choices.rvs()
            else:
                # Not a distribution: treat it as a sequence and pick one
                # element by a random index. The previous code referenced
                # the undefined names ``value`` and ``v`` here.
                param[key] = choices[randint(0, len(choices)).rvs()]

        # Yield each item
        for key in self.keys:
            yield param[key]
def generate_altitudes_for_traj(trajectories, distr_file = None, distr_type = "flat", min_FL = 240., max_FL = 350., save_file = None, starting_date = [2010, 6, 5, 10, 0, 0]):
    """
    Assign one altitude per trajectory, drawn from a distribution.

    @trajectories: a list of trajectories, each a list of
        (lat, lon, alt, time) points.
    @distr_file: optional file whose first whitespace-separated column on
        each line is used to build the distribution via getDistribution.
    @distr_type: "flat" draws integers between min_FL and max_FL; any other
        value raises unless distr_file is given.
    @min_FL, @max_FL: bounds of the flat distribution.
    @save_file: if given, the result is also written with
        write_trajectories_for_tact.
    @starting_date: forwarded to write_trajectories_for_tact.

    Returns a new list of trajectories (points converted to lists) in which
    every point of a trajectory carries the same altitude, rounded down to a
    multiple of 10. The input is not modified.

    TODO: do a distribution for entry and for exit?
    """
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print("Generating altitudes from distribution...")
    trajectories = [[list(p) for p in traj] for traj in trajectories]

    if distr_file is not None:
        print("Getting distribution of altitudes from file %s" % distr_file)
        distr_type = "data"
        data = []
        # Open the distribution file itself; the previous code opened
        # ``distr_type``, i.e. a file literally named "data".
        with open(distr_file, 'r') as f:
            for columns in (raw.strip().split() for raw in f):
                data.append(columns[0])
        # NOTE(review): values are kept as the strings read from the file,
        # as before; confirm getDistribution expects that.
        min_FL, max_FL = min(data), max(data)
        distr = getDistribution(data)
    else:
        if distr_type == 'flat':
            distr = stats.randint(low = min_FL, high = max_FL).rvs
        else:
            print("You asked for a distribution of type %s" % distr_type)
            raise Exception("This type of distribution is not implemented.")

    for traj in trajectories:
        alt = distr()
        for p in traj:  # same altitude for the whole trajectory
            # To have trajectories separated by 10 FL.
            p[2] = 10*int(alt/10.)

    if save_file is not None:
        write_trajectories_for_tact(trajectories, fil = save_file, starting_date = starting_date)

    return trajectories
def extratrees_param(self, method='grid'):
    """Return the search space for the PCA + ExtraTreesClassifier pipeline.

    With ``method='random'`` an integer distribution for the minimum leaf
    size is added to the fixed candidate lists; any other value returns the
    candidate lists only.
    """
    search_space = dict([
        ('selector__pca__svd_solver', ['full', 'arpack', 'randomized']),
        ('selector__pca__whiten', [True, False]),
        ('ExtraTreesClassifier__n_estimators', [10, 15, 20]),
        ('ExtraTreesClassifier__criterion', ['gini', 'entropy']),
    ])
    if method != 'random':
        return search_space

    # Randomized search additionally samples the minimum leaf size.
    leaf_key = 'ExtraTreesClassifier__min_samples_leaf'
    search_space[leaf_key] = randint(200, 1001)
    return search_space
def test_sklearn_cv():
    """Check that a LightFM model works inside RandomizedSearchCV.

    Uses seeded distributions, a precision-at-k scorer and a custom CV
    splitter whose train and test indices are identical, then checks the
    best ``no_components`` found.
    """
    model = LightFM(loss='warp', random_state=42)

    # Set distributions for hyperparameters. Named *_dist so the local does
    # not shadow any imported ``randint``.
    components_dist = stats.randint(low=1, high=65)
    components_dist.random_state = 42
    learning_rate_dist = stats.gamma(a=1.2, loc=0, scale=0.13)
    learning_rate_dist.random_state = 42
    distr = {'no_components': components_dist,
             'learning_rate': learning_rate_dist}

    # Custom score function
    def scorer(est, x, y=None):
        return precision_at_k(est, x).mean()

    # Custom CV which sets train_index = test_index
    class CV(KFold):
        def __iter__(self):
            ind = np.arange(self.n)
            for test_index in self._iter_test_masks():
                train_index = np.logical_not(test_index)
                train_index = ind[train_index]
                yield train_index, train_index

    cv = CV(n=train.shape[0], random_state=42)
    search = RandomizedSearchCV(estimator=model, param_distributions=distr,
                                n_iter=10, scoring=scorer, random_state=42,
                                cv=cv)
    # best_params_ only exists after fitting; the fit call was missing in
    # the source. NOTE(review): fitted on the module-level ``train`` that
    # already sizes the CV above -- confirm.
    search.fit(train)
    assert search.best_params_['no_components'] == 52
def fit_estimator(estimator, positive_data_matrix=None, negative_data_matrix=None, target=None, cv=10, n_jobs=-1, n_iter_search=40, random_state=1):
    """Tune ``estimator`` by randomized search and return the best one.

    The data matrix and labels are built with ``make_data_matrix`` from the
    positive/negative matrices and ``target``. After the search, several
    cross-validated scores of the best estimator are logged at debug level.

    Args:
        estimator: the estimator to tune.
        positive_data_matrix, negative_data_matrix, target: forwarded to
            ``make_data_matrix``.
        cv: cross-validation setting used for the search and the report.
        n_jobs: parallelism for the search and the report.
        n_iter_search: number of parameter settings sampled.
        random_state: seed passed to the search.

    Returns:
        The best estimator found by the search (refitted, ``refit=True``).
    """
    # hyperparameter optimization
    param_dist = {"n_iter": randint(5, 100),
                  "power_t": uniform(0.1),
                  "alpha": uniform(1e-08, 1e-03),
                  "eta0": uniform(1e-03, 1),
                  "penalty": ["l1", "l2", "elasticnet"],
                  "learning_rate": ["invscaling", "constant", "optimal"]}
    scoring = 'roc_auc'
    random_search = RandomizedSearchCV(estimator,
                                       param_distributions=param_dist,
                                       n_iter=n_iter_search,
                                       cv=cv,
                                       scoring=scoring,
                                       n_jobs=n_jobs,
                                       random_state=random_state,
                                       refit=True)
    X, y = make_data_matrix(positive_data_matrix=positive_data_matrix,
                            negative_data_matrix=negative_data_matrix,
                            target=target)
    # The fit call was truncated in the source (only ", y)" remained).
    random_search.fit(X, y)

    logger.debug('\nClassifier:')
    logger.debug('%s' % random_search.best_estimator_)
    logger.debug('\nPredictive performance:')
    # assess the generalization capacity of the model via cross validation
    for metric in ['accuracy', 'precision', 'recall', 'f1',
                   'average_precision', 'roc_auc']:
        scores = cross_validation.cross_val_score(
            random_search.best_estimator_, X, y, cv=cv, scoring=metric,
            n_jobs=n_jobs)
        logger.debug('%20s: %.3f +- %.3f' %
                     (metric, np.mean(scores), np.std(scores)))

    return random_search.best_estimator_
def test_sklearn_():
    '''
    Smoke-test SuccessiveHalving with a wrapped RandomForestClassifier.

    Random training and validation data are generated, the search is
    applied to them and its result is printed.
    :return:
    '''
    # Draw order (train X, train y, validation X, validation y) is kept.
    Xtrain = np.random.randn(100, 10)
    ytrain = np.random.randint(0, 2, 100)
    Xval = np.random.randn(20, 10)
    yval = np.random.randint(0, 2, 20)

    forest = RandomForestClassifier(n_estimators=4)
    classifier = SHSklearnEstimator(model=forest,
                                    ressource_name='n_estimators')

    param_grid = {
        'max_depth': randint(1, 10),
        'min_impurity_decrease': lognorm(0.1),
    }
    scoring = make_scorer(accuracy_score)

    halving_kwargs = dict(
        estimator=classifier,
        n=10,
        r=100,
        param_grid=param_grid,
        ressource_name='n_estimators',
        scoring=scoring,
        n_jobs=1,
        cv=None,
        seed=0,
    )
    successiveHalving = SuccessiveHalving(**halving_kwargs)

    T = successiveHalving.apply(Xtrain, ytrain, Xval, yval)
    print(T)
    assert(True)
def train(XTrain, yTrain, XPredict):
    """Fit one tuned ExtraTreesRegressor per target column and predict.

    The same randomized search object is refitted for every column of
    ``yTrain``; the predictions for ``XPredict`` are concatenated in column
    order.

    Returns:
        numpy array holding the concatenated predictions.
    """
    XTrain = np.array(XTrain, dtype=float)
    yTrain = np.array(yTrain, dtype=float)
    params = {"n_estimators": randint(50, 150),
              "max_depth": [1, 3, 5, None],
              "max_features": randint(1, len(XTrain[0])),
              "min_samples_split": randint(1, 4),
              "min_samples_leaf": randint(1, 4)}
    erf = ExtraTreesRegressor()
    kfold = cross_validation.KFold(len(XTrain), n_folds=4, shuffle=False)
    clf = grid_search.RandomizedSearchCV(
        erf, param_distributions=params, n_iter=5,
        scoring='mean_squared_error', cv=kfold, n_jobs=-1)
    yPredict = []
    for i in range(yTrain.shape[1]):
        # Train one model per target column. The fit call was truncated in
        # the source (only ", yTrain[:, i])" remained).
        clf.fit(XTrain, yTrain[:, i])
        yPredict.extend(clf.predict(XPredict))
    return np.array(yPredict)
def train(XTrain, yTrain, XPredict):
    """Tune a GradientBoostingRegressor by randomized search and predict.

    Returns:
        The predictions of the fitted search for ``XPredict``.
    """
    params = {'n_estimators': randint(20, 200),
              'loss': ['ls', 'lad', 'huber'],
              'learning_rate': uniform(0.01, 0.19),
              'subsample': uniform(0.5, 0.5),
              'max_depth': randint(1, 5),
              'min_samples_split': randint(1, 3),
              'min_samples_leaf': randint(1, 3),
              'max_features': randint(1, len(XTrain[0]))}
    gbrt = GradientBoostingRegressor()
    kfold = cross_validation.KFold(len(XTrain), n_folds=5, shuffle=False)
    clf = grid_search.RandomizedSearchCV(
        gbrt, param_distributions=params, n_iter=50,
        scoring='mean_absolute_error', cv=kfold, n_jobs=-1)
    # The fit call was truncated in the source (only ", yTrain)" remained).
    clf.fit(XTrain, yTrain)
    yPredict = clf.predict(XPredict)
    return yPredict
def Extra(self):
    """Tune an ExtraTreesRegressor over tree count and bootstrap only.

    Reads ``self.X_train``, ``self.y_train``, ``self.n_estimators_max`` and
    ``self.n_iter``. The best estimator found by the search is stored in
    ``self.extra_reg``.
    """
    parameters_extra = {
        "bootstrap": [True, False],
        'n_estimators': randint(20, self.n_estimators_max),
    }
    X_train, y_train = self.X_train, self.y_train
    # NOTE(review): the source had an empty argument slot (",,") after
    # param_distributions and the fit call was cut off. An argument (possibly
    # cv=...) may have been lost here -- confirm against project history.
    extra_reg = RandomizedSearchCV(
        ExtraTreesRegressor(),
        param_distributions=parameters_extra,
        n_iter=self.n_iter,
        n_jobs=-1,
    )
    extra_reg.fit(X_train, y_train)
    self.extra_reg = extra_reg.best_estimator_
def time_span_1(request, consumption_generator_1,
                gsod_722880_2012_2014_weather_source):
    """Generate consumption data for the period given in ``request.param``.

    ``request.param`` is unpacked as ``(period, n_days)`` and the first item
    of ``consumption_generator_1`` is used as the generator.

    Returns:
        ``(consumption_data, n_days)``.
    """
    period, n_days = request.param
    generator, _unused = consumption_generator_1
    billing_datetimes = generate_monthly_billing_datetimes(
        period, dist=randint(30, 31))
    return (
        generator.generate(gsod_722880_2012_2014_weather_source,
                           billing_datetimes),
        n_days,
    )
def __init__(self, extent:tuple, layer_range:tuple, fault_range:tuple, verbose:bool=False):
    """Set up the random generator for Noddy histories.

    Args:
        extent: (x,X,y,Y,z,Z)
        layer_range: (low, high)
        fault_range: (low, high)
        verbose: True / False
    """
    self.extent = extent
    x_min, x_max, y_min, y_max, z_min, z_max = (extent[k] for k in range(6))
    self.x = abs(x_max - x_min)
    self.y = abs(y_max - y_min)
    self.z = abs(z_max - z_min)

    self.layer_low, self.layer_high = layer_range
    self.faults_low, self.faults_high = fault_range
    self.verbose = verbose

    # defaults: the upper layer bound and the lower fault bound
    self.n_layers = self.layer_high
    self.n_faults = self.faults_low

    # Fault distributions. The position is drawn first and the dip
    # direction second, matching the original evaluation order.
    fault_pos = self._random_pos()
    fault_dip_dir = np.random.choice([stats.uniform(60, 120),
                                      stats.uniform(240, 300)])
    self.dist_faults = {
        "pos": fault_pos,
        "dip_dir": fault_dip_dir,
        "dip": stats.norm(45, 5),
        "slip": stats.uniform(0, self.z / 4)
    }

    # One independent thickness distribution per layer.
    layer_thickness = []
    for _layer in range(self.n_layers):
        layer_thickness.append(
            stats.randint(self.z / self.n_layers,
                          self.z / self.n_layers + self.z / 8 * self.n_faults))
    self.dist_strat = {"layer_thickness": layer_thickness}

    self.dist_tilt = {
        "pos": self._random_pos(),
        "rotation": stats.norm(0, 10),
        "plunge_direction": stats.uniform(0, 360),
        "plunge": stats.norm(0, 10)
    }

    self.dist_fold = {
        "pos": self._random_pos(),
        "wavelength": stats.uniform(self.x * 0.1, self.x * 2),
        "amplitude": stats.uniform(self.z * 0.05, self.z * 0.15)
    }

    self.dist_unconf = {
        "pos": self._random_pos(z_offset=self.z / 2),
        "dip_direction": stats.uniform(0, 360),
        "dip": stats.norm(0, 5),
    }
@author: T510 """ from sklearn.metrics import confusion_matrix from sklearn.metrics import mean_squared_error from sklearn.ensemble import RandomForestClassifier from sklearn import linear_model from sklearn.svm import SVC from sklearn.model_selection import RandomizedSearchCV from scipy.stats import randint #-------------- finding best parameters and initialising instance models with good parameters # random forest param_distribs = { 'n_estimators': randint(low=1, high=200), 'max_features': randint(low=1, high=8), 'max_depth': randint(low=1, high=20), 'min_samples_leaf': randint(low=1, high=4), } my_model = RandomForestClassifier() rnd_search = RandomizedSearchCV(my_model, param_distributions=param_distribs, n_iter=5, cv=3, scoring='balanced_accuracy', random_state=42), y_train) print(rnd_search.best_params_) rnd_forest = RandomForestClassifier(max_depth=11,
def rand_search(self):
    '''Run a randomized search for the ExtraTreesClassifier parameter
    combination that gives the best accuracy score.

    Progress and results are appended to
    ``extreme_randomforest_randomsearch.txt`` in ``self.output_dir``.
    The best parameters and the sorted feature importances are stored on
    ``self.best_params`` and ``self.feature_importances_fitted_ls``, and a
    bar plot of the importances is saved to ``self.output_dir``.
    '''
    print('*' * 80)
    print(
        '* Running RandomizedSearch for best parameter combination for ExtremeRandomForest'
    )
    print('*' * 80)

    log_path = os.path.join(self.output_dir,
                            'extreme_randomforest_randomsearch.txt')

    def write_log(*messages):
        # Append the messages to the search log, one write per message.
        with open(log_path, 'a') as text_file:
            for message in messages:
                text_file.write(message)

    #create the decision forest
    extra_clf_rand = ExtraTreesClassifier(random_state=100,
                                          max_depth=1,
                                          n_jobs=-1)
    write_log('Created extreme random forest: extra_clf_rand \n')

    #set up randomized search
    param_rand = {
        "criterion": ["gini", "entropy"],  #metric to judge reduction of impurity
        'class_weight': ['balanced', None],
        'n_estimators': randint(100, 10000),  #number of trees in forest
        'max_features': randint(2, 31),  #max number of features when splitting
        "min_samples_split": randint(2, 20),  #min samples per node to induce split
        #"max_depth": randint(1, 10),#max number of splits to do
        "min_samples_leaf": randint(1, 20),  #min number of samples in a leaf
        "max_leaf_nodes": randint(10, 20)  #max number of leaves
    }
    write_log(
        'Running randomized search for the following parameters: %s \n' %
        param_rand,
        'use cv=3, scoring=accuracy \n')

    #building and running the randomized search
    search = RandomizedSearchCV(extra_clf_rand,
                                param_rand,
                                random_state=5,
                                cv=3,
                                n_iter=500,
                                scoring='accuracy',
                                n_jobs=-1)
    # NOTE(review): the first argument of this fit call was missing from the
    # file; self.X_metrix_train is assumed because it is the feature
    # container zipped with the importances below - confirm.
    rand_search_fitted = search.fit(self.X_metrix_train, self.y_train)
    write_log(
        'Best parameters: ' + str(rand_search_fitted.best_params_) + '\n',
        'Best score: ' + str(rand_search_fitted.best_score_) + '\n')

    feature_importances_fitted = rand_search_fitted.best_estimator_.feature_importances_
    feature_importances_fitted_ls = sorted(zip(feature_importances_fitted,
                                               self.X_metrix_train),
                                           reverse=True)
    write_log('Feature importances: %s \n' % feature_importances_fitted_ls)

    self.best_params = rand_search_fitted.best_params_
    self.feature_importances_fitted_ls = feature_importances_fitted_ls

    def feature_importances_best_estimator(feature_list, directory):
        # Save a bar plot of (score, feature) pairs into `directory`.
        datestring = datetime.now().strftime('%Y%m%d_%H%M')
        # Sorts the caller's list in place, by feature name, descending.
        feature_list.sort(key=lambda x: x[1], reverse=True)
        columns = list(zip(*feature_list))
        feature = columns[1]
        score = columns[0]
        x_pos = np.arange(len(feature))
        plt.bar(x_pos, score, align='center')
        plt.xticks(x_pos, feature, rotation=90)
        plt.title(
            'Histogram of Feature Importances for best RandomForest using features '
        )
        plt.xlabel('Features')
        plt.tight_layout()
        plt.savefig(
            os.path.join(
                directory,
                'feature_importances_best_bar_plot_rand_bag_' + datestring +
                '.png'))
        plt.close()

    feature_importances_best_estimator(self.feature_importances_fitted_ls,
                                       self.output_dir)
x_train, y_train, x_test = feature_engineering_titanic.read_titanic() x_train = x_train.as_matrix() y_train = y_train.as_matrix() x_test = x_test.as_matrix() # split train validate # x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.3, random_state=0) # get best model one_to_left = st.beta(10, 1) from_zero_positive = st.expon(0, 50) params = { "n_estimators": st.randint(3, 40), "max_depth": st.randint(3, 10), "learning_rate": st.uniform(0.05, 0.4), "colsample_bytree": one_to_left, "subsample": one_to_left, "gamma": st.uniform(0, 10), 'reg_alpha': from_zero_positive, "min_child_weight": from_zero_positive, } xgb_clf = XGBClassifier(nthreads=-1) best_xgb_model = best_model.get_best_model(x_train, y_train, model=xgb_clf, params=params, n_iter=500,
"loss": ["hinge", "squared_hinge"], "tol": [0.00001, 0.0001, 0.001] # should probably include this in grid search, as # dual=False is preferred when samples>features. However: # Unsupported set of arguments: The combination of # penalty='l2' and loss='hinge' are not supported when # dual=False # "dual": [True, False] }, 'svm': { "C": [1, 0.5, 0.1, 0.9, 0.8], "tol": [0.00001, 0.0001, 0.001, 0.01], "shrinking": [True, False] }, 'kneighbors': { "n_neighbors": randint(2, 15), "weights": ['uniform', 'distance'], "leaf_size": randint(15, 100) }, 'linear': { "alpha": [0.0001, 0.01, 1.0, 10.0, 1000.0], "tol": [0.00001, 0.0001, 0.001, 0.01] } } TEMPLATES = pkg_resources.resource_filename('q2_sample_classifier', 'assets') def _load_data(feature_data, targets_metadata): '''Load data and generate training and test sets.
'c__colsample_bylevel': [1, 0.8, 0.6], # Subsample ratio of columns for each level. Subsampling occurs once for every new depth level reached in a tree. Columns are subsampled from the set of columns chosen for the current tree. 'c__colsample_bynode': [1, 0.8, 0.6],# Subsample ratio of columns for each node (split). Subsampling occurs once every time a new split is evaluated. Columns are subsampled from the set of columns chosen for the current level. 'c__num_parallel_tree': [1], # Number of parallel trees constructed during each iteration. This option is used to support boosted random forest. 'c__max_depth': [2, 3, 6], # Maximum depth of a tree. Increasing this value will make the model more complex and more likely to overfit. # 'c__reg_alpha': [0], # L1 regularization term on weights. Increasing this value will make model more conservative. # 'c__reg_lambda': [1], # L2 regularization term on weights. Increasing this value will make model more conservative. 'c__learning_rate': [0.3], # 0.3 Step size shrinkage used in update to prevents overfitting. Shrinks the feature weights to make the boosting process more conservative. # 'c__scale_pos_weight': [1] # should be negative_samples_count / positive_samples_count #'c__objective': ['multi:softmax'], #XGBoost will adjust this between binary:logistic and multi:softmax based on # of classes 'c__eval_metric': ['mlogloss'], # logloss heavily penalizes false-positives (better precision) 'c__tree_method': ['hist'] } PARAMETER_DISTRIBUTION = { 'i__strategy': ['mean'], # 'median', 'most_frequent', 'constant' 'c__n_estimators': randint(100, 1000), 'c__subsample': [1], # Subsample ratio of the training instances. Setting it to 0.5 means that XGBoost would randomly sample half of the training data prior to growing trees. 'c__colsample_bytree': uniform(0.4, 0.6), # Subsample ratio of columns when constructing each tree. Subsampling occurs once for every tree constructed. 
'c__colsample_bylevel': uniform(0.4, 0.6), # Subsample ratio of columns for each level. Subsampling occurs once for every new depth level reached in a tree. Columns are subsampled from the set of columns chosen for the current tree. 'c__colsample_bynode': uniform(0.4, 0.6),# Subsample ratio of columns for each node (split). Subsampling occurs once every time a new split is evaluated. Columns are subsampled from the set of columns chosen for the current level. 'c__num_parallel_tree': [1], # Number of parallel trees constructed during each iteration. This option is used to support boosted random forest. 'c__max_depth': randint(2, 12), # Maximum depth of a tree. Increasing this value will make the model more complex and more likely to overfit. # 'c__reg_alpha': [0], # L1 regularization term on weights. Increasing this value will make model more conservative. # 'c__reg_lambda': [1], # L2 regularization term on weights. Increasing this value will make model more conservative. 'c__learning_rate': [0.3], # 0.3 Step size shrinkage used in update to prevents overfitting. Shrinks the feature weights to make the boosting process more conservative. # 'c__scale_pos_weight': [1] # should be negative_samples_count / positive_samples_count # 'c__objective': ['multi:softmax'], #XGBoost will adjust this between binary:logistic and multi:softmax based on # of classes 'c__eval_metric': ['mlogloss'], # logloss heavily penalizes false-positives (better precision) 'c__tree_method': ['hist'] }
def randomised_search(self): print_to_consol('Running randomized search to find best classifier') #create the decision forest clf1 = DecisionTreeClassifier(random_state=20, class_weight='balanced', max_features=self.numf) ada = AdaBoostClassifier(base_estimator=clf1, algorithm="SAMME.R", random_state=55)'Initialised classifier using balanced class weights \n') #set up randomized search param_dict = { 'base_estimator__criterion': ['gini', 'entropy'], 'n_estimators': randint(100, 10000), #number of base estimators to use 'learning_rate': uniform(0.0001, 1.0), 'base_estimator__min_samples_split': randint(2, 20), 'base_estimator__max_depth': randint(1, 10), 'base_estimator__min_samples_leaf': randint(1, 20), 'base_estimator__max_leaf_nodes': randint(10, 20) } f'Following parameters will be explored in randomized search \n' f'{param_dict} \n') #building and running the randomized search rand_search = RandomizedSearchCV(ada, param_dict, random_state=5,, n_iter=self.numc, scoring='accuracy', n_jobs=-1) rand_search_fitted =, self.y_train) best_parameters = rand_search_fitted.best_params_ best_scores = rand_search_fitted.best_score_ f'Running randomised search for best patameters of classifier \n' f'Best parameters found: {best_parameters} \n' f'Best accuracy scores found: {best_scores} \n') self.model = rand_search_fitted.best_estimator_ datestring = datetime.strftime(, '%Y%m%d_%H%M') joblib.dump( self.model, os.path.join(, 'best_predictor_' + datestring + '.pkl'))'Writing best classifier to disk in {} \n') print_to_consol( 'Getting 95% confidence interval for uncalibrated classifier') alpha, upper, lower = get_confidence_interval( self.X_train_scaled, self.y_train, self.X_test_scaled, self.y_test, self.model,, self.bootiter, 'uncalibrated')'{alpha}% confidence interval {upper}% and {lower}% \n' f'for uncalibrated classifier. 
\n') print_to_consol('Getting feature importances for best classifier') best_clf_feat_import = self.model.feature_importances_ best_clf_feat_import_sorted = sorted(zip(best_clf_feat_import, self.X_train_scaled.columns), reverse=True) f'Feature importances for best classifier {best_clf_feat_import_sorted} \n' ) all_clf_feat_import_mean = np.mean( [tree.feature_importances_ for tree in self.model.estimators_], axis=0) all_clf_feat_import_mean_sorted = sorted(zip( all_clf_feat_import_mean, self.X_train_scaled.columns), reverse=True) print_to_consol('Plotting feature importances for best classifier') feature_importances_best_estimator(best_clf_feat_import_sorted, f'Plotting feature importances for best classifier in decreasing order \n' ) feature_importances_error_bars(self.model, self.X_train_scaled.columns, f'Plotting feature importances for best classifier with errorbars \n' )
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from sklearn.datasets import fetch_california_housing
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from scipy.stats import randint

X, y = fetch_california_housing(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# define the parameter space that will be searched over
param_distributions = {
    'n_estimators': randint(1, 5),
    'max_depth': randint(5, 10),
}

# now create a searchCV object and fit it to the data
search = RandomizedSearchCV(estimator=RandomForestRegressor(random_state=0),
                            n_iter=5,
                            param_distributions=param_distributions,
                            random_state=0)
# The start of this fit call was missing; X_train is the matching split
# produced by train_test_split above.
search.fit(X_train, y_train)
print(search.best_params_)

# the search object now acts like a normal random forest estimator
# with max_depth=9 and n_estimators=4
print(search.score(X_test, y_test))
r2 = gm_cv.score(X_test, y_test)  # Accuracy test (R^2)
mse = mean_squared_error(y_test, y_pred)  # Accuracy test
print("Tuned ElasticNet l1 ratio: {}".format(gm_cv.best_params_))
print("Tuned ElasticNet R squared: {}".format(r2))
print("Tuned ElasticNet MSE: {}".format(mse))

# RandomizedSearchCV (w/ DecisionTreeClassifier)
# problem: large hyperparam spaces, many hyperparams -> GridSearchCV comp. exp.
# solution: fixed number of hyperparam values is sampled
from scipy.stats import randint
from sklearn.model_selection import RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier

# Setup the grid and hyperparam spaces to sample from: param_dist
param_dist = {
    "max_depth": [3, None],
    "max_features": randint(1, 9),
    "min_samples_leaf": randint(1, 9),
    "criterion": ["gini", "entropy"],
}
tree = DecisionTreeClassifier()  # Model

# Inst. RandomizedSearchCV. n_iter: the number of parameter settings tried
tree_cv = RandomizedSearchCV(tree, param_dist, cv=5, n_iter=10)

# Fitting on training data.
# NOTE(review): the head of this call was missing from the file; X_train is
# assumed as the counterpart of y_train - confirm it is defined earlier.
tree_cv.fit(X_train, y_train)
print("Tuned Decision Tree Parameters: {}".format(tree_cv.best_params_))
print("Best score is {}".format(tree_cv.best_score_))
#endregion (HYPERPARAMETER TUNING)

#region PREPROCESSING
# featureGain = {} # featureScore = {} # lastScore = 0 # count = 0 xx_score = [] cv_pred = [] splitsNum = 5 randomSeed = 42 numBoostRound = 200 # 50 earlyStoppingRounds = 60 #===================================================================================== tunedParameters = { "learning_rate": expon(scale=0.2),#np.exp(range(-3,0)), #"lambda_l2": 0.25, "max_depth": randint(low=5,high=15), "num_leaves": [64,128,256,512,1024], "bagging_fraction":uniform(), "feature_fraction":uniform(), #"min_data_in_leaf": } model = lgb.LGBMClassifier(objective="multiclass",num_class= 11,seed=42) searchResult = RandomizedSearchCV(model,tunedParameters,scoring ="f1_macro",n_iter=10,n_jobs=1,cv=3),trainLabel) print(rscv.cv_results_) print(rscv.best_params_) exit() #===================================================================================== skf = StratifiedKFold(n_splits=splitsNum, random_state=randomSeed, shuffle=True) for index, (train_index, test_index) in enumerate(skf.split(trainData, trainLabel)): print("[+] "+str(index)+" iteration")
from sklearn.linear_model import LogisticRegression
from scipy.stats import randint, uniform

seed = 0
model = LogisticRegression()

# Search space for the logistic-regression hyper-parameters.
# Earlier alternatives that are currently disabled:
#   "penalty": ['l1', 'l2']
#   "C": [0.1, 0.5, 1.0, 2, 10]
param_dist = dict(
    penalty=['l2'],
    # scipy's uniform(loc, scale): C is drawn from [0.001, 0.011]
    C=uniform(0.001, 0.01),
    random_state=[seed],
    max_iter=randint(500, 1000),
)
def msm_distance_measure_getter(X):
    """
    generate the msm distance measure
    :param X: dataset to derive parameter ranges from (not read by the
        current implementation)
    :return: distance measure and parameter range dictionary
    """
    n_dimensions = 1  # todo use other dimensions

    # Candidate values for the "c" parameter, grouped by order of magnitude.
    c_candidates = [
        # 0.01 .. 0.09625
        0.01, 0.01375, 0.0175, 0.02125, 0.025, 0.02875, 0.0325, 0.03625,
        0.04, 0.04375, 0.0475, 0.05125, 0.055, 0.05875, 0.0625, 0.06625,
        0.07, 0.07375, 0.0775, 0.08125, 0.085, 0.08875, 0.0925, 0.09625,
        # 0.1 .. 0.964
        0.1, 0.136, 0.172, 0.208, 0.244, 0.28, 0.316, 0.352,
        0.388, 0.424, 0.46, 0.496, 0.532, 0.568, 0.604, 0.64,
        0.676, 0.712, 0.748, 0.784, 0.82, 0.856, 0.892, 0.928, 0.964,
        # 1 .. 9.64
        1, 1.36, 1.72, 2.08, 2.44, 2.8, 3.16, 3.52,
        3.88, 4.24, 4.6, 4.96, 5.32, 5.68, 6.04, 6.4,
        6.76, 7.12, 7.48, 7.84, 8.2, 8.56, 8.92, 9.28, 9.64,
        # 10 .. 100
        10, 13.6, 17.2, 20.8, 24.4, 28, 31.6, 35.2,
        38.8, 42.4, 46, 49.6, 53.2, 56.8, 60.4, 64,
        67.6, 71.2, 74.8, 78.4, 82, 85.6, 89.2, 92.8, 96.4, 100,
    ]

    param_ranges = {}
    param_ranges["distance_measure"] = [cython_wrapper(msm_distance)]
    param_ranges["dim_to_use"] = stats.randint(low=0, high=n_dimensions)
    param_ranges["c"] = c_candidates
    return param_ranges
scoring='roc_auc', fit_params=None, cv=None, verbose=2).fit(X_train, y_train) # In[97]: gridSearchAda.best_params_, gridSearchAda.best_score_ # #### GradientBoosting # In[98]: gbHyperParams = { 'loss': ['deviance', 'exponential'], 'n_estimators': randint(10, 500), 'max_depth': randint(1, 10) } # In[99]: gridSearchGB = RandomizedSearchCV(estimator=gbMod, param_distributions=gbHyperParams, n_iter=10, scoring='roc_auc', fit_params=None, cv=None, verbose=2).fit(X_train, y_train) # In[100]:
ests = { 'case-1': [ #('lin', LinearRegression()), ('bay', BayesianRidge(tol=1e-5)), #('hub', HuberRegressor(max_iter=5000, tol=1e-5)), ('ard', ARDRegression(tol=1e-5)), ('par', PassiveAggressiveRegressor(max_iter=5000, tol=1e-5)), ('rdg', Ridge(max_iter=5000, random_state=seed)), ('las', Lasso(max_iter=5000, random_state=seed)), # ('eln', ElasticNet(max_iter=5000, tol=1e-5, random_state=seed)), ('svr', SVR(kernel='linear')), ('mlp', MLPRegressor()) ] } r = uniform(0, 30) d = randint(2, 10) f = randint(1, 100) e = uniform(0, 3) ee = uniform(0, 1) pars_1 = { 'case-1.mlp': { 'alpha': ee, 'beta_1': e, 'beta_2': e, 'epsilon': ee }, 'case-1.eln': { 'alpha': e, 'l1_ratio': e },
class FixedLengthTupleDistribution:
    """
    Tuples where each element stems from a specified distribution

    Note: this is not a normalized distribution
    """

    def __init__(self, distributions):
        # Sequence of objects exposing rvs(random_state=...).
        self.distributions = distributions

    def rvs(self, random_state=None):
        """Draw one value per distribution and return them as a tuple."""
        draws = []
        for component in self.distributions:
            draws.append(component.rvs(random_state=random_state))
        return tuple(draws)


# Search space for the 'mlp' step parameters.
param_grid = {
    'mlp__hidden_layer_sizes': randint(2, 150),
    'mlp__activation': ['logistic', 'tanh', 'relu'],
    'mlp__solver': ['lbfgs'],
    'mlp__alpha': ExpDistribution(uniform(-6, 5)),
    'mlp__learning_rate': ['constant'],
    'mlp__max_iter': [200],
    'mlp__learning_rate_init': ExpDistribution(uniform(-6, 5)),
    # no2 results analysis shows good range between 0.1 and 0.001
    # 'filter__alpha': ExpDistribution(uniform(-3, 2)),
    'mlp__early_stopping': [True]
}

# CO_best = {'mlp__activation': 'relu', 'mlp__alpha': 1.899210794532138e-06,
#            'mlp__hidden_layer_sizes': 112,
#            'mlp__learning_rate_init': 0.07908998568339845,
#            'mlp__solver': 'lbfgs', 'mlp__learning_rate': 'constant',
from scipy.stats import randint

params = {
    'min_impurity_decrease': uniform(0.0001, 0.001),
    'max_depth': randint(20, 50),
def test_smoke_hyperband(min_iter):
    """Smoke-test HyperbandSearchCV against a hand-replayed schedule.

    The test fits the search on random data, then re-draws the values of
    ``a`` with the same seed, replays the successive-halving rounds, and
    checks that the search reports the same best index, score and
    parameters.
    """
    seed = 10
    n_splits = 4
    eta = 3
    max_iter = 27
    params = dict(a=randint(low=1, high=100))
    cmax_param = {'b': max_iter}
    cmin_param = {'b': min_iter} if min_iter is not None else None

    hyperband_search = HyperbandSearchCV(
        DummyCVedEstimator(),
        cost_parameter_max=cmax_param,
        cost_parameter_min=cmin_param,
        cv=n_splits,
        iid=False,
        return_train_score=False,
        eta=eta,
        param_distributions=params,
        random_state=seed)
    # NOTE(review): the head of this fit call was missing from the file.
    # A (100, 3) feature matrix is implied by the surviving text; drawing
    # it with np.random.normal is an assumption - confirm.
    hyperband_search.fit(np.random.normal(size=(100, 3)),
                         np.random.choice(2, size=100, replace=True))

    rng = np.random.RandomState(seed=seed)
    ri = randint(low=1, high=100)

    # b_vals is the geometric sequence of values of b hyperband will test
    if min_iter is None or min_iter == 1:
        b_vals = np.array(
            [1] * 27 + [3] * 9 + [9] * 3 + [27] +
            [3] * 12 + [9] * 4 + [27] +
            [9] * 6 + [27] * 2 +
            [27] * 4)
        a_itrs = [[27, 9, 3, 1], [12, 4, 1], [6, 2], [4]]
    else:
        b_vals = np.array(
            [3] * 9 + [9] * 3 + [27] +
            [9] * 5 + [27] +
            [27] * 3)
        a_itrs = [[9, 3, 1], [5, 1], [3]]

    # now draw the a_vals in the proper order
    a_vals = []
    for bstart, nks in enumerate(a_itrs):
        a_vals_orig = ri.rvs(random_state=rng, size=nks[0]).tolist()
        for i, nk in enumerate(nks):
            scores = np.array(
                [np.random.RandomState(seed=s).uniform()
                 for s in a_vals_orig]) + np.power(3, i + bstart)
            sinds = np.argsort(scores)[::-1]  # bigger is better
            # Keep only the nk best-scoring candidates for this round.
            msk = np.zeros_like(a_vals_orig)
            msk[sinds[0:nk]] = 1
            msk = msk.astype(bool)
            a_vals_orig = [a for idx, a in enumerate(a_vals_orig) if msk[idx]]
            a_vals += a_vals_orig
    a_vals = np.array(a_vals)

    mn_scores = np.array(
        [np.random.RandomState(seed=a).uniform() for a in a_vals]) + b_vals
    best_index = np.argmax(mn_scores)

    # now make sure it got the right values
    assert hyperband_search.best_index_ == best_index, "Best index is wrong!"
    assert hyperband_search.best_score_ == mn_scores[best_index], (
        "Best score is wrong!")
    assert (hyperband_search.best_params_ ==
            {'a': a_vals[best_index], 'b': b_vals[best_index]}), (
        "Best parameters are wrong!")
param_grid=params_grid, cv=cv, scoring='neg_mean_squared_error'), y) bst_grid.grid_scores_ print("Best accuracy obtained: {0}".format(bst_grid.best_score_)) print("Parameters:") for key, value in bst_grid.best_params_.items(): print("\t{}: {}".format(key, value)) params_dist_grid = { 'max_depth': [1, 2, 3, 4], 'gamma': [0, 0.5, 1], 'n_estimators': randint(1, 1001), # uniform discrete random distribution 'learning_rate': uniform(), # gaussian distribution 'subsample': uniform(), # gaussian distribution 'colsample_bytree': uniform() # gaussian distribution } rs_grid = RandomizedSearchCV(estimator=XGBRegressor(**params_fixed, seed=seed), param_distributions=params_dist_grid, n_iter=10, cv=cv, scoring='neg_mean_squared_error', random_state=seed), y) rs_grid.grid_scores_ rs_grid.best_estimator_
('rfr', RandomForestClassifier(n_jobs=1)), ('mlp', MLPClassifier(tol=1e-4)), ('svc', SVC(tol=1e-4, degree=9)), ('rdc', RidgeClassifierCV()), ('gbc', GradientBoostingClassifier()), ('ada', AdaBoostClassifier()), ('svc', SVC(tol=1e-4, degree=7, kernel='linear')), ('bag', BaggingClassifier(n_jobs=1))] ests_1 = { 'case-1': est_l1, # 'case-2': est_l1, # 'case-3': est_l1, # 'case-4': est_l1 } r = uniform(0, 30) d = randint(2, 10) f = randint(100, 200) e = uniform(0, 3) ee = uniform(0, 1) pars_1 = {} sc = StandardScaler() pca = PCA() fa = FactorAnalysis() nmf = NMF() pre_cases = { 'case-1': [sc], # 'case-2': [sc], # 'case-3': [pca],
device=('cuda' if USE_CUDA else 'cpu'), max_epochs=5, lr=0.01, optimizer=torch.optim.RMSprop, ) pipe = Pipeline(steps + [('net', net)]) #, y) pipe.set_params(net__verbose=0, net__train_split=None) params = { 'to_idx__stop_words': ['english', None], 'to_idx__lowercase': [False, True], 'to_idx__ngram_range': [(1, 1), (2, 2)], 'net__module__embedding_dim': stats.randint(32, 256 + 1), 'net__module__rec_layer_type': ['gru', 'lstm'], 'net__module__num_units': stats.randint(32, 256 + 1), 'net__module__num_layers': [1, 2, 3], 'net__module__dropout': stats.uniform(0, 0.9), 'net__module__bidirectional': [True, False], 'net__lr': [10**(-stats.uniform(1, 5).rvs()) for _ in range(NUM_CV_STEPS)], 'net__max_epochs': [5, 10], } search = RandomizedSearchCV(pipe, params, n_iter=NUM_CV_STEPS, verbose=2, refit=False, scoring='accuracy',
"""
An example training a RandomForestClassifier, performing
randomized search using TuneSearchCV.
"""

from tune_sklearn import TuneSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn import datasets
from sklearn.model_selection import train_test_split
from scipy.stats import randint
import numpy as np

digits = datasets.load_digits()
# The right-hand sides of these two assignments were missing; `data` and
# `target` are the feature matrix and labels of the loaded dataset.
x = digits.data
y = digits.target
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.2)

clf = RandomForestClassifier()
param_distributions = {
    "n_estimators": randint(20, 80),
    "max_depth": randint(2, 10),
}

tune_search = TuneSearchCV(clf, param_distributions, n_iter=3)
# The head of this fit call was missing; x_train is the matching split.
tune_search.fit(x_train, y_train)

pred = tune_search.predict(x_test)
# Fraction of test samples whose prediction equals the label.
accuracy = np.count_nonzero(np.array(pred) == np.array(y_test)) / len(pred)
print(accuracy)
def sample_preprocess():
    """Sample one preprocessing configuration.

    Builds the distribution dictionary (filter type and window length) and
    returns whatever ``sample_scikit`` produces for it.
    """
    return sample_scikit(
        dict(
            filter_type=["none", "ma_smoothing"],
            win_len=randint(2, 20),
        )
    )
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

# Sampling distributions for the randomized search; scipy's randint
# excludes `high`.
param_disribs = {
    'n_estimators': randint(low=1, high=200),
    'max_features': randint(low=1, high=8),
}

forest_reg = RandomForestRegressor(random_state=42)
rnd_search = RandomizedSearchCV(forest_reg, param_distributions=param_disribs,
                                n_iter=10, cv=5, scoring='neg_mean_squared_error',
                                random_state=42)
# NOTE(review): the head of this fit call was missing from the file; the
# feature-matrix name `housing_prepared` is an assumption - confirm.
rnd_search.fit(housing_prepared, housing_labels)

cvres = rnd_search.cv_results_
# Scores are negated MSE values, so flip the sign before taking the root.
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)
class Training:
    """Leave-one-set-out hyper-parameter search for a RandomForestClassifier.

    ``trainR`` uses RandomizedSearchCV and ``trainB`` uses BayesSearchCV.
    Both hold out each entry of the input lists in turn, tune on the
    remaining entries and score the tuned search on the held-out entry.
    """

    def __init__(self, n_calls):
        # Number of parameter settings each search evaluates (n_iter).
        self.n_calls = n_calls

    # Kept as a plain function: it is passed to make_scorer while the class
    # body is still being executed.
    def afl_loss(y_true, y_pred):
        return -np.sum(1 + np.log2(y_true * y_pred + (1 - y_true) * (1 - y_pred)))

    scorer = make_scorer(afl_loss, greater_is_better=False, needs_proba=True)

    # Sampling distributions for RandomizedSearchCV.
    spaceR = {
        'n_estimators': randint(low=350, high=700),
        'max_depth': randint(low=3, high=20),
        'min_samples_split': uniform(0.01, 0.99),
        'min_samples_leaf': randint(low=1, high=10),
        'min_weight_fraction_leaf': uniform(0, 0.5),
        'max_features': randint(low=1, high=18),
        'max_leaf_nodes': randint(low=2, high=1000),
        'min_impurity_decrease': uniform(0, 2)
    }

    # Search dimensions for BayesSearchCV.
    spaceB = {
        'n_estimators': Integer(200, 1000),
        'max_depth': Integer(3, 20),
        'min_samples_split': Real(0.01, .99, "uniform"),
        'min_samples_leaf': Integer(1, 10),
        'min_weight_fraction_leaf': Real(0, 0.5, "uniform"),
        'max_features': Integer(1, 17),
        'max_leaf_nodes': Integer(2, 1000),
        'min_impurity_decrease': Real(0, 2)
    }

    def _leave_one_out_search(self, X_list, y_list, build_search, label):
        """Run one search per held-out entry and collect the results.

        :param X_list: List of training sets
        :param y_list: List of targets
        :param build_search: callable taking a classifier and returning an
            unfitted search object
        :param label: search name used in the timing message
        :return: test scores, validation scores and best models
        """
        n_calls = self.n_calls
        scores = []
        val_scores = []
        best_models = []
        for j in range(len(X_list)):
            classifier = RandomForestClassifier(n_jobs=-1)
            # Work on copies so the caller's lists keep all their entries.
            y = y_list.copy()
            X = X_list.copy()
            y_test = y.pop(j)
            X_test = X.pop(j)
            y_train = np.concatenate(y, axis=0)
            X_train = np.concatenate(X, axis=0)
            X_train = Features().div_cols(X_train).values
            X_test = Features().div_cols(X_test).values

            start = time()
            opt = build_search(classifier)
            # The head of this fit call was missing from the file; X_train
            # is the training matrix prepared just above.
            opt.fit(X_train, y_train)
            model = opt.best_estimator_
            print('Season', 2019 - j)
            print("%s CV search took %.2f seconds for %d candidates"
                  " parameter settings." % (label, (time() - start), n_calls))
            # Score the held-out entry once and reuse the value.
            test_score = opt.score(X_test, y_test)
            print("val. score:", opt.best_score_)
            print("test score:", test_score)
            # print(model)
            print("")
            best_models.append(model)
            val_scores.append(opt.best_score_)
            scores.append(test_score)
        return scores, val_scores, best_models

    def trainR(self, X_list, y_list, space=spaceR, cv=5):
        """
        RandomSearchCV method
        :param X_list: List of training sets
        :param y_list: List of targets
        :param space: parameter space
        :param cv: cross-validation setting passed to the search
        :return: models and metrics
        """
        def build_search(classifier):
            return RandomizedSearchCV(classifier,
                                      param_distributions=space,
                                      n_iter=self.n_calls,
                                      scoring=self.scorer,
                                      cv=cv,
                                      n_jobs=-1,
                                      iid=False)

        return self._leave_one_out_search(X_list, y_list, build_search,
                                          "Random")

    def trainB(self, X_list, y_list, n_points=1, space=spaceB, cv=5):
        """
        BayesianSearchCV method
        :param X_list: List of training sets
        :param y_list: List of targets
        :param n_points: passed through to BayesSearchCV
        :param space: parameter space
        :param cv: cross-validation setting passed to the search
        :return: models and metrics
        """
        def build_search(classifier):
            return BayesSearchCV(classifier,
                                 search_spaces=space,
                                 scoring=self.scorer,
                                 cv=cv,
                                 n_points=n_points,
                                 n_iter=self.n_calls,
                                 n_jobs=-1)

        return self._leave_one_out_search(X_list, y_list, build_search,
                                          "Bayes")
from sklearn.preprocessing import StandardScaler from sklearn.model_selection import RandomizedSearchCV from sklearn.model_selection import StratifiedKFold from sklearn.neighbors import KNeighborsClassifier from sklearn.ensemble import ExtraTreesClassifier from sklearn.linear_model import SGDClassifier if __name__ == '__main__': x, y, num, cat = data_prep() classifiers = [ ('KNN', KNeighborsClassifier(), { 'KNN__algorithm': ('auto', 'ball_tree', 'kd_tree', 'brute'), 'KNN__n_neighbors': stats.randint(4, 40), 'KNN__p': stats.randint(1, 5), 'KNN__weights': ('uniform', 'distance') }), ( 'ETclf', ExtraTreesClassifier(), { 'ETclf__n_estimators': stats.randint(10, 500), 'ETclf__criterion': ('gini', 'entropy'), 'ETclf__max_depth': stats.randint(10, 50), 'ETclf__max_features': ('sqrt', 'log2', 'auto'), # 'ETclf__max_leaf_nodes' :(None , 2 , 4 , 5 , 6 , 7 , 9 , 10) , # 'ETclf__min_samples_leaf':[2, 4 , 6 , 8 , 10], # 'ETclf__min_samples_split':[2, 4 , 6 , 8 , 10] }),, y) assert sh.n_candidates_[0] == expected_n_candidates if n_candidates == 'exhaust': # Make sure 'exhaust' makes the last iteration use as much resources as # we can assert sh.n_resources_[-1] == max_resources @pytest.mark.parametrize( 'param_distributions, expected_n_candidates', [ ({ 'a': [1, 2] }, 2), # all lists, sample less than n_candidates ({ 'a': randint(1, 3) }, 10), # not all list, respect n_candidates ]) def test_random_search_discrete_distributions(param_distributions, expected_n_candidates): # Make sure random search samples the appropriate number of candidates when # we ask for more than what's possible. How many parameters are sampled # depends whether the distributions are 'all lists' or not (see # ParameterSampler for details). 
This is somewhat redundant with the checks # in ParameterSampler but interaction bugs were discovered during # developement of SH n_samples = 1024 X, y = make_classification(n_samples=n_samples, random_state=0) base_estimator = FastClassifier() sh = HalvingRandomSearchCV(base_estimator,
# Scaling and splitting data (scaling is currently disabled):
#X_data = preprocessing.scale(X_data)
#Y_data = preprocessing.scale(Y_data)
print("done")
X_train, X_test, Y_train, Y_test = train_test_split(X_data, Y_data)

# Exhaustive grid search parameters
param_grid = {
    'C': [1, 10, 100, 1000],
    'kernel': ['rbf', 'poly', 'sigmoid', 'linear'],
    'degree': [0, 1, 2, 3, 4, 5],
    'gamma': [1e-3, 1e-4, 1e-5, 1e-6],
    'coef0': [0, 1, 2, 3, 4, 5],
}
# Disabled alternative with one grid per kernel:
#param_grid = {'C': [1,10,100,1000], 'kernel': ['sigmoid'], 'gamma': [1e-3, 1e-4, 1e-5, 1e-6], }, {'C': [1,10,100,1000], 'kernel': ['poly'], 'gamma': [1e-3, 1e-4, 1e-5, 1e-6], 'degree': [1,2,3,4,5,6,7]}, {'C': [1,10,100,1000], 'kernel': ['linear']}

# Randomized search parameters
param_dist = {
    'C': expon(scale=100),
    'gamma': expon(scale=.1),
    'kernel': ['rbf', 'poly', 'sigmoid', 'linear'],
    'degree': randint(1, 100),
    'coef0': randint(0, 100),
}
n_iter_search = 20

# Creating and training model
# Regular model (disabled):
#model = SVC(C= 10, gamma= .0001, kernel= 'rbf', verbose = True)
# NOTE(review): this rebinds the name SVC from the class to an instance, so
# the class is no longer reachable under that name afterwards.
SVC = SVC(verbose=False)

# Regular grid search (disabled):
#model = GridSearchCV(SVC, param_grid=param_grid)
#searchtype = GridSearch

# Randomized search
model = RandomizedSearchCV(SVC,
                           param_distributions=param_dist,
                           n_iter=n_iter_search)
searchtype = f"Randomized, n_iter_search = {n_iter_search}"
#Training
from scipy.stats import uniform, randint

# Frozen discrete-uniform generator; scipy's randint excludes `high`,
# so draws fall in 0..8.
rgen = randint(low=0, high=9)

# Draw 1000 samples and count how often each value occurs
# (the result is not stored).
np.unique(rgen.rvs(1000), return_counts=True)

# Frozen continuous-uniform generator with loc=0 and scale=1.
ugen = uniform(0, 1)
ugen.rvs(10)

# Sampling distributions handed to the randomized search further down.
params = {
    # scipy's uniform(loc, scale) covers [loc, loc + scale]
    'min_impurity_decrease': uniform(0.0001, 0.001),
    'max_depth': randint(20, 50),
    'min_samples_split': randint(2, 25),
    'min_samples_leaf': randint(1, 25),
}

from sklearn.model_selection import RandomizedSearchCV

# Randomized search over `params` for a decision tree, 100 sampled settings.
gs = RandomizedSearchCV(DecisionTreeClassifier(random_state=42), params,
                        n_iter=100, n_jobs=-1, random_state=42)
# The head of this fit call was missing; train_input is the feature set
# paired with train_target in the cross_validate call below.
gs.fit(train_input, train_target)

dt = gs.best_estimator_

print(gs.best_params_)
print(gs.cv_results_['mean_test_score'])

# Cross-validate the best tree found by the search.
scores = cross_validate(dt, train_input, train_target)
print(np.mean(scores['test_score']))

from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_jobs=-1, random_state=42)
# The head of this fit call was missing; train_input is the feature set
# used with train_target elsewhere in this script.
rf.fit(train_input, train_target)

print(rf.feature_importances_)

rf = RandomForestClassifier(oob_score=True, n_jobs=-1, random_state=42)
# NOTE(review): the head of this fit call was also missing; X_train is
# assumed as the counterpart of y_train - confirm it is defined.
rf.fit(X_train, y_train)

# Base estimator and search space passed to hypertuning_rscv below.
est = RandomForestClassifier(n_jobs=-1)
# Lists are sampled uniformly; scipy's randint excludes `high`, so
# randint(1,3) yields 1 or 2 and randint(1,4) yields 1, 2 or 3.
rf_p_dist={'max_depth':[3,5,10,None],
          'n_estimators':[10,100,200,300,400,500],
          'max_features':randint(1,3),
          'criterion':['gini','entropy'],
          'bootstrap':[True,False],
          'min_samples_leaf':randint(1,4),
          }

def hypertuning_rscv(est, p_distr, nbr_iter,X,y):
    """Tune ``est`` with a 9-fold randomized search.

    Args:
        est: estimator to tune.
        p_distr: parameter distributions to sample from.
        nbr_iter: number of parameter settings to sample.
        X: training features passed to ``fit``.
        y: training targets passed to ``fit``.

    Returns:
        Tuple of (best parameters, best cross-validated score).
    """
    search_options = {
        "param_distributions": p_distr,
        "n_jobs": -1,
        "n_iter": nbr_iter,
        "cv": 9,
    }
    searcher = RandomizedSearchCV(est, **search_options)
    searcher.fit(X, y)
    return searcher.best_params_, searcher.best_score_

# Sample 40 parameter settings from rf_p_dist and keep the best
# parameters and score.
# NOTE(review): X and y are not defined in the visible part of this file -
# confirm they exist before this call.
rf_parameters, rf_ht_score = hypertuning_rscv(est, rf_p_dist, 40, X, y)
# 'model__C': stats.loguniform(1, 2), # 'gamma': 'auto', # 'model__degree': stats.randint(1, 3), "preprocessor__scaler": [StandardScaler(), RobustScaler(), MinMaxScaler()] } # params_lr = {"random_grid_search": grid_lr, "model": LinearRegression()} ###################################################### ###################################################### # RandomForestRegressor model ###################################################### grid_rfr = { 'model__n_estimators': stats.randint(1, 300), 'model__max_depth': stats.randint(1, 300), 'model__max_samples': stats.randint(1, 300), "preprocessor__scaler": [StandardScaler(), RobustScaler(), MinMaxScaler()] } # params_rfr = {"random_grid_search": grid_rfr, "model": RandomForestRegressor()} ###################################################### ###################################################### # GradientBoostingRegressor model ###################################################### grid_gbr = { #'model__loss': ["ls", "lad", "huber", "quantile"], 'model__learning_rate': stats.loguniform(0.001, 10),