def create_project(params_e_pre, params_e_post, params_g_pre, params_g_post,
                   baseline_period_start_date, baseline_period_end_date,
                   reporting_period_start_date, reporting_period_end_date,
                   has_electricity, has_gas, weather_source, zipcode):
    model_e = AverageDailyTemperatureSensitivityModel(heating=True, cooling=True)
    model_g = AverageDailyTemperatureSensitivityModel(heating=True, cooling=False)

    # generate consumption
    baseline_period = Period(baseline_period_start_date, reporting_period_start_date)
    datetimes_pre = generate_monthly_billing_datetimes(baseline_period, dist=randint(29, 31))
    reporting_period = Period(datetimes_pre[-1], reporting_period_end_date)
    datetimes_post = generate_monthly_billing_datetimes(reporting_period, dist=randint(29, 31))

    location = Location(zipcode=zipcode)
    baseline_period = Period(baseline_period_start_date, baseline_period_end_date)
    reporting_period = Period(reporting_period_start_date, reporting_period_end_date)

    cds = []
    if has_electricity:
        cd_e = generate_consumption_records(model_e, params_e_pre, params_e_post,
                                            datetimes_pre, datetimes_post,
                                            "electricity", "kWh", weather_source)
        cds.append(cd_e)
    if has_gas:
        cd_g = generate_consumption_records(model_g, params_g_pre, params_g_post,
                                            datetimes_pre, datetimes_post,
                                            "natural_gas", "therm", weather_source)
        cds.append(cd_g)
    return Project(location, cds, baseline_period, reporting_period)
def Gradient(self):
    X_train, y_train = self.X_train, self.y_train
    parameters_boost = {'max_depth': randint(3, self.max_depth_max + 1),
                        'n_estimators': randint(80, 100 + self.n_estimators_max)}
    boost_reg = RandomizedSearchCV(GradientBoostingRegressor(loss=self.loss),
                                   param_distributions=parameters_boost,
                                   cv=self.cv, n_iter=self.n_iter, n_jobs=-1)
    boost_reg.fit(X_train, y_train)
    self.boost_reg = boost_reg.best_estimator_
def get_param_grid(cur_model, points, rand):
    print('\nRetrieving parameter grid...')
    try:
        c_range = 10.0 ** np.arange(-2, 3)
        # print 'Getting Parameter grid...'
        # out_txt.write('Getting Parameter grid...')
        gamma_range = [0, .01, .1, .3]
        # neighbor_range = np.arange(2, points, step=5)
        # leaf_range = np.arange(10, points, step=5)
        neighbor_range = np.arange(2, 17, step=5)
        leaf_range = np.arange(10, 60, step=5)
        if not rand:
            grid_params = {
                'SVC()': [{'C': c_range, 'kernel': ['poly'], 'degree': [3, 5, 8],
                           'gamma': gamma_range, 'probability': [True],
                           'class_weight': ['auto', None]},
                          {'C': c_range, 'kernel': ['rbf', 'sigmoid'],
                           'gamma': gamma_range, 'probability': [True],
                           'class_weight': ['auto', None]},
                          {'C': c_range, 'kernel': ['linear'], 'random_state': [10],
                           'probability': [True], 'class_weight': ['auto', None]}],
                'KNeighborsClassifier()': [{'n_neighbors': neighbor_range,
                                            'weights': ['uniform'],
                                            'algorithm': ['brute'],
                                            'metric': ['euclidean', 'manhattan']},
                                           {'n_neighbors': neighbor_range,
                                            'weights': ['uniform'],
                                            'algorithm': ['ball_tree', 'kd_tree'],
                                            'metric': ['euclidean', 'manhattan'],
                                            'leaf_size': leaf_range}],
                'LogisticRegression()': [{'penalty': ['l1', 'l2'], 'C': c_range,
                                          'class_weight': [None, 'auto']}]}
            return grid_params[cur_model]
        else:
            rand_params = {
                'SVC()': {'C': stats.expon(scale=300),
                          'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
                          'degree': [3, 4, 5, 6, 7, 8],
                          'gamma': stats.expon(scale=1 / 3),
                          'random_state': [10], 'probability': [True],
                          'class_weight': ['auto', None]},
                'KNeighborsClassifier()': {'n_neighbors': stats.randint(low=2, high=20),
                                           'weights': ['uniform', 'distance'],
                                           'algorithm': ['ball_tree', 'kd_tree', 'brute'],
                                           'metric': ['euclidean', 'manhattan'],
                                           'leaf_size': stats.randint(low=10, high=60)},
                'LogisticRegression()': {'penalty': ['l1', 'l2'],
                                         'C': stats.expon(scale=300),
                                         'class_weight': [None, 'auto']}}
            return rand_params[cur_model]
    except KeyError:
        print('could not get parameter grid')
def train(array, embedDim, interval):
    XTrain, yTrain = pp.makeTrainset(array, embedDim, interval, 1)
    kfold = cross_validation.KFold(len(XTrain), n_folds=5, shuffle=False)
    params = {'n_estimators': randint(20, 200),
              'loss': ['ls', 'lad', 'huber'],
              'learning_rate': uniform(0.01, 0.19),
              'subsample': uniform(0.5, 0.5),
              'max_depth': randint(1, 5),
              'min_samples_split': randint(1, 3),
              'min_samples_leaf': randint(1, 3),
              'max_features': randint(1, len(XTrain[0]))}
    bestModels = []
    for i in range(len(yTrain[0])):
        gbrt = GradientBoostingRegressor()
        clf = grid_search.RandomizedSearchCV(gbrt, param_distributions=params, n_iter=20,
                                             scoring='mean_squared_error', cv=kfold, n_jobs=-1)
        clf.fit(XTrain, yTrain[:, i])
        bestModels.append(clf.best_estimator_)
    for i in range(1, 12):
        XTrain, yTrain = pp.makeTrainset(array, embedDim, interval, i)  # the model's forecast horizon grows each iteration
        XPredict = pp.makeXPredict(array, embedDim, interval, i)  # the prediction input grows accordingly
        subyPredict = []
        for j in range(len(yTrain[0])):
            bestModels[j].fit(XTrain, yTrain[:, j])
            subyPredict.append(bestModels[j].predict(XPredict))
        array = np.hstack((array, np.array(copy(subyPredict))))  # feed one model's predictions back in as known data for the next model
    yPredict = array[0, -65:-5]  # 66 days can be predicted in total; take the relevant slice
    return yPredict
def train(array, embedDim, interval):
    XTrain, yTrain = pp.makeTrainset(array, embedDim, interval, 1)
    kfold = cross_validation.KFold(len(XTrain), n_folds=4, shuffle=False)
    params = {"n_estimators": randint(5, 100),
              "max_depth": [1, 2, 3, 5, 8, 10, None],
              "max_features": randint(1, len(XTrain[0])),
              "min_samples_split": randint(1, 3),
              "min_samples_leaf": randint(1, 3)}
    bestModels = []
    for i in range(len(yTrain[0])):
        erf = ExtraTreesRegressor()
        clf = grid_search.RandomizedSearchCV(erf, param_distributions=params, n_iter=10,
                                             scoring='mean_squared_error', cv=kfold, n_jobs=-1)
        clf.fit(XTrain, yTrain[:, i])
        bestModels.append(clf.best_estimator_)
    for i in range(60):
        XTrain, yTrain = pp.makeTrainset(array, embedDim, interval, 1)  # the model's embedding dimension grows each iteration
        XPredict = pp.makeXPredict(array, embedDim, interval, 1)  # the prediction input uses the grown embedding dimension
        subyPredict = []
        for j in range(len(yTrain[0])):
            bestModels[j].fit(XTrain, yTrain[:, j])
            subyPredict.append(bestModels[j].predict(XPredict))
        array = np.hstack((array, np.array(copy(subyPredict))))  # feed one model's predictions back in as known data for the next model
        embedDim += 1
    yPredict = array[0, -60:]  # 60 days can be predicted in total; take the relevant slice
    return yPredict
def decisiontree_param(self, method='grid'):
    parameters = {
        # 'selector__extraTC__n_estimators': [10],
        # 'selector__extraTC__n_estimators': [10, 15],
        # 'selector__extraTC__criterion': ['entropy'],
        # 'selector__extraTC__criterion': ['gini', 'entropy'],
        # 'selector__extraTC__n_jobs': [-1],
        # 'selector__pca__svd_solver': ['randomized'],
        'selector__pca__svd_solver': ['full', 'arpack', 'randomized'],
        # 'selector__pca__whiten': [True],
        'selector__pca__whiten': [True, False],
        'DecisionTreeClassifier__criterion': ['gini', 'entropy'],
        'DecisionTreeClassifier__splitter': ['best', 'random'],
        'DecisionTreeClassifier__max_features': ['sqrt', 'log2', None]
        # 'DecisionTreeClassifier__max_leaf_nodes': [2, 3, None],
        # 'DecisionTreeClassifier__max_depth': [2, 3, None],
        # 'DecisionTreeClassifier__min_samples_leaf': [1, 3, 5, None]
    }
    if method == 'random':
        parameters['DecisionTreeClassifier__min_samples_leaf'] = randint(1, 20)
        parameters['DecisionTreeClassifier__max_leaf_nodes'] = randint(2, 20)
        parameters['DecisionTreeClassifier__max_depth'] = randint(1, 20)
    return parameters
def test_large_grid():
    """In this test, we purposely overfit a RandomForest to completely random
    data in order to assert that the test error will far exceed the train error.
    """
    if not SK18:
        custom_cv = KFold(n=y_train.shape[0], n_folds=3, shuffle=True, random_state=42)
    else:
        custom_cv = KFold(n_splits=3, shuffle=True, random_state=42)

    # define the pipe
    pipe = Pipeline([
        ('scaler', SelectiveScaler()),
        ('pca', SelectivePCA(weight=True)),
        ('rf', RandomForestClassifier(random_state=42))
    ])

    # define hyper parameters
    hp = {
        'scaler__scaler': [StandardScaler(), RobustScaler(), MinMaxScaler()],
        'pca__whiten': [True, False],
        'pca__weight': [True, False],
        'pca__n_components': uniform(0.75, 0.15),
        'rf__n_estimators': randint(5, 10),
        'rf__max_depth': randint(5, 15)
    }

    # define the grid
    grid = RandomizedSearchCV(pipe, hp, n_iter=2, scoring='accuracy', n_jobs=1,
                              cv=custom_cv, random_state=42)

    # this will fail because we haven't fit yet
    assert_fails(grid.score, (ValueError, AttributeError), X_train, y_train)

    # fit the grid
    grid.fit(X_train, y_train)

    # score for coverage -- this might warn...
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        grid.score(X_train, y_train)

    # coverage:
    assert grid._estimator_type == 'classifier'

    # get predictions
    tr_pred, te_pred = grid.predict(X_train), grid.predict(X_test)

    # evaluate score (SHOULD be better than random...)
    accuracy_score(y_train, tr_pred), accuracy_score(y_test, te_pred)

    # grid score reports:
    # assert fails for bad percentile
    assert_fails(report_grid_score_detail, ValueError,
                 **{'random_search': grid, 'percentile': 0.0})
    assert_fails(report_grid_score_detail, ValueError,
                 **{'random_search': grid, 'percentile': 1.0})

    # assert fails for bad y_axis
    assert_fails(report_grid_score_detail, ValueError,
                 **{'random_search': grid, 'y_axis': 'bad_axis'})

    # assert passes otherwise
    report_grid_score_detail(grid, charts=True, percentile=0.95)  # just ensure percentile works
def get_random_forest():
    classifier = sklearn.ensemble.RandomForestClassifier(max_features=None,
                                                         oob_score=False,
                                                         n_jobs=1)
    pipeline = sklearn.pipeline.Pipeline([('RF', classifier)])
    meta_dict = {'RF__n_estimators': stats.randint(5, 100),
                 'RF__max_features': ['sqrt', 'log2', 'auto', None],
                 'RF__max_depth': stats.randint(2, 10)}
    return pipeline, meta_dict
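# A minimal usage sketch for the pipeline/distribution pair returned above
# (not from the original source; the toy data and the n_iter/cv values are
# assumptions, and the 'auto' max_features option assumes an older sklearn):
import scipy.stats as stats
import sklearn.ensemble
import sklearn.pipeline
import sklearn.model_selection
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=200, n_features=10, random_state=0)
pipeline, meta_dict = get_random_forest()
search = sklearn.model_selection.RandomizedSearchCV(
    pipeline, param_distributions=meta_dict, n_iter=10, cv=3, random_state=0)
search.fit(X, y)
print(search.best_params_)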
def Extra(self):
    parameters_extra = {'max_depth': randint(1, self.max_depth_max + 1),
                        "bootstrap": [True, False],
                        'min_samples_split': randint(1, self.min_samples_split_max + 1),
                        "min_samples_leaf": randint(1, self.min_samples_leaf_max + 1),
                        'n_estimators': randint(20, 20 + self.n_estimators_max)}
    X_train, y_train = self.X_train, self.y_train
    extra_reg = RandomizedSearchCV(ExtraTreesRegressor(), param_distributions=parameters_extra,
                                   cv=self.cv, n_iter=self.n_iter, n_jobs=-1)
    extra_reg.fit(X_train, y_train)
    self.extra_reg = extra_reg.best_estimator_
def do_train_rand(X, y, params=None, n_iter=50):
    if not params:
        params = {'n_estimators': stats.randint(40, 90),
                  'max_depth': stats.randint(20, 40),
                  'min_samples_leaf': stats.randint(80, 110),
                  'max_features': ['auto', 'sqrt']}
    clf = RandomizedSearchCV(GradientBoostingClassifier(), params, n_iter=n_iter,
                             scoring=do_test, n_jobs=15, verbose=1, cv=4)
    clf.fit(X, y)
    return clf
def RandomFo(self):
    parameters_forest = {'max_depth': randint(1, self.max_depth_max + 1),
                         "bootstrap": [True, False],
                         'min_samples_split': randint(1, self.min_samples_split_max + 1),
                         "min_samples_leaf": randint(1, self.min_samples_leaf_max + 1),
                         "max_features": randint(1, self.max_features_max),
                         'n_estimators': randint(15, self.n_estimators_max)}
    # randomized search
    X_train, y_train = self.X_train, self.y_train
    forest_reg = RandomizedSearchCV(RandomForestRegressor(), param_distributions=parameters_forest,
                                    cv=self.cv, n_iter=self.n_iter, n_jobs=-1)
    forest_reg.fit(X_train, y_train)
    self.forest_reg = forest_reg.best_estimator_
def simple_tree(self):
    # training method using a single decision tree
    # cv: number of folds in the cross-validation
    # n_iter: number of iterations for the randomized search
    parameters_tree = {'max_depth': randint(1, self.max_depth_max + 1),
                       'min_samples_split': randint(1, self.min_samples_split_max + 1),
                       'min_samples_leaf': randint(1, self.min_samples_leaf_max + 1),
                       'max_leaf_nodes': randint(2, self.max_leaf_nodes_max),
                       "max_features": randint(1, self.max_features_max)}
    tree_reg = RandomizedSearchCV(DecisionTreeRegressor(), param_distributions=parameters_tree,
                                  cv=self.cv, n_iter=self.n_iter, n_jobs=-1)
    X_train, y_train = self.X_train, self.y_train
    tree_reg.fit(X_train, y_train)
    self.tree_reg = tree_reg.best_estimator_
def train(XTrain, yTrain, XPredict):
    params = {"n_estimators": randint(5, 100),
              "max_depth": [1, 2, 3, 5, 10, None],
              "max_features": randint(1, len(XTrain[0])),
              "min_samples_split": randint(1, 3),
              "min_samples_leaf": randint(1, 3)}
    rf = RandomForestRegressor()
    kfold = cross_validation.KFold(len(XTrain), n_folds=3, shuffle=False)
    clf = grid_search.RandomizedSearchCV(rf, param_distributions=params, n_iter=30,
                                         scoring='mean_squared_error', cv=kfold, n_jobs=-1)
    clf.fit(XTrain, yTrain)
    # print(clf.best_score_, clf.best_estimator_)
    yPredict = clf.predict(XPredict)
    return yPredict, clf.best_params_
def _get_random_params(model_name):
    """Return some random model parameters to search over.

    Args:
    ----
        model_name: str

    Return:
    ------
        param_dct: dct
    """
    if model_name == 'logit':
        param_dct = {'penalty': ['l1', 'l2'],
                     'C': scs.uniform(0.00001, 0.0099)}
    elif model_name == 'random_forest':
        param_dct = {'n_estimators': scs.randint(400, 1200),
                     'max_depth': scs.randint(2, 32)}
    elif model_name == 'extra_trees':
        param_dct = {'n_estimators': scs.randint(400, 1200),
                     'max_depth': scs.randint(2, 32)}
    elif model_name == 'gboosting':
        param_dct = {'n_estimators': scs.randint(400, 1200),
                     'learning_rate': scs.uniform(0.001, 0.099),
                     'max_depth': scs.randint(1, 8),
                     'max_features': scs.uniform(0.5, 0.5),
                     'subsample': scs.uniform(0.5, 0.5)}
    elif model_name == 'xgboost':
        param_dct = {'learning_rate': scs.uniform(0.001, 0.099),
                     'n_estimators': scs.randint(400, 1200),
                     'max_depth': scs.randint(1, 8),
                     'subsample': scs.uniform(0.5, 0.5),
                     'colsample_bytree': scs.uniform(0.5, 0.5)}
    return param_dct
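# A minimal sketch of how these distribution dicts might be consumed
# (not from the original source; the estimator, the toy data, and the
# `scs` alias for scipy.stats are assumptions):
import scipy.stats as scs
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

X, y = make_classification(n_samples=300, random_state=0)  # placeholder data
param_dct = _get_random_params('random_forest')
search = RandomizedSearchCV(RandomForestClassifier(), param_dct,
                            n_iter=5, cv=3, random_state=0)
search.fit(X, y)
print(search.best_params_)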
def test_generate_monthly_billing_datetimes():
    period = Period(datetime(2012, 1, 1), datetime(2013, 1, 1))
    datetimes_30d = generate_monthly_billing_datetimes(period, randint(30, 31))
    assert datetimes_30d[0] == datetime(2012, 1, 1)
    assert datetimes_30d[1] == datetime(2012, 1, 31)
    assert datetimes_30d[11] == datetime(2012, 11, 26)
    assert datetimes_30d[12] == datetime(2012, 12, 26)
    datetimes_1d = generate_monthly_billing_datetimes(period, randint(1, 2))
    assert datetimes_1d[0] == datetime(2012, 1, 1)
    assert datetimes_1d[1] == datetime(2012, 1, 2)
    assert datetimes_1d[330] == datetime(2012, 11, 26)
    assert datetimes_1d[331] == datetime(2012, 11, 27)
def test_random_grid():
    # get our train/test
    X_train, X_test, y_train, y_test = train_test_split(X, iris.target,
                                                        train_size=0.75,
                                                        random_state=42)

    # default CV does not shuffle, so we define our own
    custom_cv = KFold(n=y_train.shape[0], n_folds=5, shuffle=True, random_state=42)

    # build a pipeline
    pipe = Pipeline([
        ('retainer',     FeatureRetainer()),           # will retain all
        ('dropper',      FeatureDropper()),            # won't drop any
        ('mapper',       FunctionMapper()),            # pass through
        ('encoder',      OneHotCategoricalEncoder()),  # no object dtypes, so will pass through
        ('collinearity', MulticollinearityFilterer(threshold=0.85)),
        ('imputer',      SelectiveImputer()),          # pass through
        ('scaler',       SelectiveScaler()),
        ('boxcox',       BoxCoxTransformer()),
        ('nzv',          NearZeroVarianceFilterer(threshold=1e-4)),
        ('pca',          SelectivePCA(n_components=0.9)),
        ('model',        RandomForestClassifier(n_jobs=1))
    ])

    # let's define a set of hyper-parameters over which to search
    hp = {
        'collinearity__threshold': uniform(loc=.8, scale=.15),
        'collinearity__method':    ['pearson', 'kendall', 'spearman'],
        'scaler__scaler':          [StandardScaler(), RobustScaler()],
        'pca__n_components':       uniform(loc=.75, scale=.2),
        'pca__whiten':             [True, False],
        'model__n_estimators':     randint(5, 100),
        'model__max_depth':        randint(2, 25),
        'model__min_samples_leaf': randint(1, 15),
        'model__max_features':     uniform(loc=.5, scale=.5),
        'model__max_leaf_nodes':   randint(10, 75)
    }

    # define the gridsearch
    search = RandomizedSearchCV(pipe, hp, n_iter=2,  # just to test it even works
                                scoring='accuracy', cv=custom_cv, random_state=42)

    # fit the search
    search.fit(X_train, y_train)

    # test the report
    the_report = report_grid_score_detail(search, charts=False)
def RandomFo(self):
    parameters_forest = {'n_estimators': randint(10, self.n_estimators_max),
                         "bootstrap": [True, False]}
    X_train, y_train = self.X_train, self.y_train
    forest_reg = RandomizedSearchCV(RandomForestRegressor(), param_distributions=parameters_forest,
                                    cv=self.cv, n_iter=self.n_iter, n_jobs=-1)
    forest_reg.fit(X_train, y_train)
    self.forest_reg = forest_reg.best_estimator_
def do_train_rand(train, valid, params=None, max_models=32):
    """Do randomized hyper-parameter search

    Args:
        train (SFrame): training set
        valid (SFrame): validation set
        params (dict): parameters for random search
        max_models (int): maximum number of models to run
    Returns:
        res (SFrame): table of choices of parameters sorted by valid RMSE
    """
    if not params:
        params = {'user_id': ['username'],
                  'item_id': ['course_id'],
                  'target': ['label'],
                  'binary_target': [True],
                  'num_factors': stats.randint(4, 128),
                  'regularization': stats.expon(scale=1e-4),
                  'linear_regularization': stats.expon(scale=1e-7)}
    try:
        job = gl.toolkits.model_parameter_search \
            .random_search.create((train, valid),
                                  gl.recommender.factorization_recommender.create,
                                  params, max_models=max_models)
        res = job.get_results()
        res = res.sort('validation_rmse')
        print('Best params for random search are: {}'.format(res[0]))
        res.save('rand_search.csv', format='csv')
    except Exception:
        print(job.get_metrics())
        res = None
    return res
def get_available_resource(self, random=False):
    """Gets the index number of an available resource.

    *Arguments*
        ``random`` (Boolean)
            If set to True, randomly chooses the index of an available
            resource. Otherwise, returns the index of the available
            resource with the lowest index value.

    *Returns:*
        A non-negative integer representing the index number of the
        resource. ``None`` if all resources are busy.
    """
    empty_resources = []
    for index in range(self.num_resources):
        if self[index].get_available_station() is not None:
            empty_resources.append(index)
    if len(empty_resources) == 0:
        return None
    elif not random:
        return empty_resources[0]
    else:
        return empty_resources[randint(0, len(empty_resources) - 1)]
def make_data(self):
    """
    Create a generator for building the data.
    """
    # Build a logging format suitable for sorting
    log_dir = self.param_distributions['log_dir']
    log_fmt = '{{0:0{0}d}}-{{1:0{1}d}}'.format(len(str(self.niter)),
                                               len(str(self.nsplits)))

    # Create the parameters for each instance
    for i in xrange(1, self.niter + 1):
        # Make the params
        ncolumns = self.param_distributions['ncolumns'].rvs()
        param = {'ncolumns': ncolumns}

        ####
        # Ensure that parameters make sense
        ####

        # Compute nactive as a function of ncolumns
        # Ensure that nactive is bounded by (0, ncolumns)
        nactive = int(self.param_distributions['nactive'].rvs() * ncolumns)
        while (nactive == 0) or (nactive == ncolumns):
            nactive = int(self.param_distributions['nactive'].rvs() * ncolumns)
        param['nactive'] = nactive

        # Ensure that each input is seen at least once
        nsynapses = self.param_distributions['nsynapses'].rvs()
        p = prob.p_c(ncolumns, nsynapses, self.ninputs)
        e = int(p * self.ninputs)
        while e > 0:
            nsynapses = self.param_distributions['nsynapses'].rvs()
            p = prob.p_c(ncolumns, nsynapses, self.ninputs)
            e = int(p * self.ninputs)
        param['nsynapses'] = nsynapses

        # Compute seg_th as a function of nsynapses
        seg_th = int(self.param_distributions['seg_th'].rvs() * nsynapses)
        param['seg_th'] = seg_th

        # Make a useful log directory
        param['log_dir'] = os.path.join(log_dir, log_fmt.format(i, 1))

        ####
        # Add all other parameters
        ####
        added = set(param.keys())
        missing = [key for key in self.keys if key not in added]
        for key in missing:
            if hasattr(self.param_distributions[key], 'rvs'):
                param[key] = self.param_distributions[key].rvs()
            else:
                # choose uniformly from a plain sequence of values
                v = self.param_distributions[key]
                param[key] = v[randint(0, len(v)).rvs()]

        # Yield each item
        for key in self.keys:
            yield param[key]
def generate_altitudes_for_traj(trajectories, distr_file=None, distr_type="flat",
                                min_FL=240., max_FL=350., save_file=None,
                                starting_date=[2010, 6, 5, 10, 0, 0]):
    """
    @trajectories: a list of tuples (lat, lon, alt, time).
    TODO: do a distribution for entry and for exit?
    """
    print("Generating altitudes from distribution...")
    trajectories = [[list(p) for p in traj] for traj in trajectories]
    if distr_file is not None:
        print("Getting distribution of altitudes from file", distr_file)
        distr_type = "data"
        data = []
        with open(distr_file, 'r') as f:
            for columns in (raw.strip().split() for raw in f):
                data.append(columns[0])
        min_FL, max_FL = min(data), max(data)
        distr = getDistribution(data)
    else:
        if distr_type == 'flat':
            distr = stats.randint(low=min_FL, high=max_FL).rvs
        else:
            print("You asked for a distribution of type", distr_type)
            raise Exception("This type of distribution is not implemented.")

    for traj in trajectories:
        alt = distr()
        for p in traj:  # same altitude for the whole trajectory
            p[2] = 10 * int(alt / 10.)  # to have trajectories separated by 10 FL

    if save_file is not None:
        write_trajectories_for_tact(trajectories, fil=save_file,
                                    starting_date=starting_date)

    return trajectories
def extratrees_param(self, method='grid'):
    parameters = {
        # 'selector__extraTC__n_estimators': [10],
        # 'selector__extraTC__n_estimators': [10, 15],
        # 'selector__extraTC__criterion': ['gini', 'entropy'],
        # 'selector__extraTC__criterion': ['entropy'],
        # 'selector__extraTC__n_jobs': [-1],
        # 'selector__pca__svd_solver': ['randomized'],
        'selector__pca__svd_solver': ['full', 'arpack', 'randomized'],
        # 'selector__pca__whiten': [True],
        'selector__pca__whiten': [True, False],
        'ExtraTreesClassifier__n_estimators': [10, 15, 20],
        'ExtraTreesClassifier__criterion': ['gini', 'entropy']
        # 'ExtraTreesClassifier__min_samples_leaf': [1, 2, 3, 4, 5],
        # 'ExtraTreesClassifier__min_samples_leaf': range(200, 1001, 200),
        # 'ExtraTreesClassifier__max_leaf_nodes': [2, 3, 4, 5],
        # 'ExtraTreesClassifier__max_depth': [2, 3, 4, 5]
    }
    if method == 'random':
        parameters['ExtraTreesClassifier__min_samples_leaf'] = randint(200, 1001)
        # parameters['ExtraTreesClassifier__max_leaf_nodes'] = randint(2, 20)
        # parameters['ExtraTreesClassifier__max_depth'] = randint(1, 20)
    return parameters
def test_sklearn_cv():
    model = LightFM(loss='warp', random_state=42)

    # Set distributions for hyperparameters
    randint = stats.randint(low=1, high=65)
    randint.random_state = 42
    gamma = stats.gamma(a=1.2, loc=0, scale=0.13)
    gamma.random_state = 42
    distr = {'no_components': randint, 'learning_rate': gamma}

    # Custom score function
    def scorer(est, x, y=None):
        return precision_at_k(est, x).mean()

    # Custom CV which sets train_index = test_index
    class CV(KFold):
        def __iter__(self):
            ind = np.arange(self.n)
            for test_index in self._iter_test_masks():
                train_index = np.logical_not(test_index)
                train_index = ind[train_index]
                yield train_index, train_index

    cv = CV(n=train.shape[0], random_state=42)
    search = RandomizedSearchCV(estimator=model, param_distributions=distr,
                                n_iter=10, scoring=scorer, random_state=42, cv=cv)
    search.fit(train)
    assert search.best_params_['no_components'] == 52
def fit_estimator(estimator,
                  positive_data_matrix=None,
                  negative_data_matrix=None,
                  target=None,
                  cv=10,
                  n_jobs=-1,
                  n_iter_search=40,
                  random_state=1):
    # hyperparameter optimization
    param_dist = {"n_iter": randint(5, 100),
                  "power_t": uniform(0.1),
                  "alpha": uniform(1e-08, 1e-03),
                  "eta0": uniform(1e-03, 1),
                  "penalty": ["l1", "l2", "elasticnet"],
                  "learning_rate": ["invscaling", "constant", "optimal"]}
    scoring = 'roc_auc'
    random_search = RandomizedSearchCV(estimator,
                                       param_distributions=param_dist,
                                       n_iter=n_iter_search,
                                       cv=cv,
                                       scoring=scoring,
                                       n_jobs=n_jobs,
                                       random_state=random_state,
                                       refit=True)
    X, y = make_data_matrix(positive_data_matrix=positive_data_matrix,
                            negative_data_matrix=negative_data_matrix,
                            target=target)
    random_search.fit(X, y)

    logger.debug('\nClassifier:')
    logger.debug('%s' % random_search.best_estimator_)
    logger.debug('\nPredictive performance:')
    # assess the generalization capacity of the model via a 10-fold cross validation
    for scoring in ['accuracy', 'precision', 'recall', 'f1',
                    'average_precision', 'roc_auc']:
        scores = cross_validation.cross_val_score(random_search.best_estimator_, X, y,
                                                  cv=cv, scoring=scoring, n_jobs=n_jobs)
        logger.debug('%20s: %.3f +- %.3f' % (scoring, np.mean(scores), np.std(scores)))

    return random_search.best_estimator_
def test_sklearn_():
    '''
    Test whether the booster indeed gets updated
    :return:
    '''
    Xtrain = np.random.randn(100, 10)
    ytrain = np.random.randint(0, 2, 100)
    Xval = np.random.randn(20, 10)
    yval = np.random.randint(0, 2, 20)

    classifier = SHSklearnEstimator(model=RandomForestClassifier(n_estimators=4),
                                    ressource_name='n_estimators')
    param_grid = {'max_depth': randint(1, 10),
                  'min_impurity_decrease': lognorm(0.1)}
    scoring = make_scorer(accuracy_score)
    successiveHalving = SuccessiveHalving(
        estimator=classifier,
        n=10,
        r=100,
        param_grid=param_grid,
        ressource_name='n_estimators',
        scoring=scoring,
        n_jobs=1,
        cv=None,
        seed=0
    )
    T = successiveHalving.apply(Xtrain, ytrain, Xval, yval)
    print(T)
    assert True
def train(XTrain, yTrain, XPredict):
    XTrain = np.array(XTrain, dtype=float)
    yTrain = np.array(yTrain, dtype=float)
    params = {"n_estimators": randint(50, 150),
              "max_depth": [1, 3, 5, None],
              "max_features": randint(1, len(XTrain[0])),
              "min_samples_split": randint(1, 4),
              "min_samples_leaf": randint(1, 4)}
    erf = ExtraTreesRegressor()
    kfold = cross_validation.KFold(len(XTrain), n_folds=4, shuffle=False)
    clf = grid_search.RandomizedSearchCV(erf, param_distributions=params, n_iter=5,
                                         scoring='mean_squared_error', cv=kfold, n_jobs=-1)
    yPredict = []
    for i in range(yTrain.shape[1]):
        clf.fit(XTrain, yTrain[:, i])  # train one model per distance
        yPredict.extend(clf.predict(XPredict))
    return np.array(yPredict)
def train(XTrain, yTrain, XPredict):
    params = {'n_estimators': randint(20, 200),
              'loss': ['ls', 'lad', 'huber'],
              'learning_rate': uniform(0.01, 0.19),
              'subsample': uniform(0.5, 0.5),
              'max_depth': randint(1, 5),
              'min_samples_split': randint(1, 3),
              'min_samples_leaf': randint(1, 3),
              'max_features': randint(1, len(XTrain[0]))}
    gbrt = GradientBoostingRegressor()
    kfold = cross_validation.KFold(len(XTrain), n_folds=5, shuffle=False)
    clf = grid_search.RandomizedSearchCV(gbrt, param_distributions=params, n_iter=50,
                                         scoring='mean_absolute_error', cv=kfold, n_jobs=-1)
    clf.fit(XTrain, yTrain)
    # print(clf.best_score_, clf.best_estimator_)
    yPredict = clf.predict(XPredict)
    return yPredict
def Extra(self):
    parameters_extra = {"bootstrap": [True, False],
                        'n_estimators': randint(20, self.n_estimators_max)}
    X_train, y_train = self.X_train, self.y_train
    extra_reg = RandomizedSearchCV(ExtraTreesRegressor(), param_distributions=parameters_extra,
                                   cv=self.cv, n_iter=self.n_iter, n_jobs=-1)
    extra_reg.fit(X_train, y_train)
    self.extra_reg = extra_reg.best_estimator_
def time_span_1(request, consumption_generator_1, gsod_722880_2012_2014_weather_source):
    period, n_days = request.param
    generator, _ = consumption_generator_1
    datetimes = generate_monthly_billing_datetimes(period, dist=randint(30, 31))
    consumption_data = generator.generate(gsod_722880_2012_2014_weather_source,
                                          datetimes)
    return consumption_data, n_days
def __init__(self, extent: tuple, layer_range: tuple, fault_range: tuple,
             verbose: bool = False):
    """A Noddy history random generator.

    Args:
        extent: (x, X, y, Y, z, Z)
        layer_range: (low, high)
        fault_range: (low, high)
        verbose: True / False
    """
    self.extent = extent
    self.x = abs(extent[1] - extent[0])
    self.y = abs(extent[3] - extent[2])
    self.z = abs(extent[5] - extent[4])
    self.layer_low, self.layer_high = layer_range
    self.faults_low, self.faults_high = fault_range
    self.verbose = verbose

    # defaults
    self.n_layers = self.layer_high
    self.n_faults = self.faults_low

    self.dist_faults = {
        "pos": self._random_pos(),
        "dip_dir": np.random.choice([stats.uniform(60, 120), stats.uniform(240, 300)]),
        "dip": stats.norm(45, 5),
        "slip": stats.uniform(0, self.z / 4)  # np.random.uniform(0, self.z / 4)
    }

    self.dist_strat = {
        "layer_thickness": [stats.randint(self.z / self.n_layers,
                                          self.z / self.n_layers + self.z / 8 * self.n_faults)
                            for l in range(self.n_layers)]
    }

    self.dist_tilt = {
        "pos": self._random_pos(),
        "rotation": stats.norm(0, 10),
        "plunge_direction": stats.uniform(0, 360),
        "plunge": stats.norm(0, 10)
    }

    self.dist_fold = {
        "pos": self._random_pos(),
        "wavelength": stats.uniform(self.x * 0.1, self.x * 2),
        "amplitude": stats.uniform(self.z * 0.05, self.z * 0.15)
    }

    self.dist_unconf = {
        "pos": self._random_pos(z_offset=self.z / 2),
        "dip_direction": stats.uniform(0, 360),
        "dip": stats.norm(0, 5),
    }
@author: T510
"""
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestClassifier
from sklearn import linear_model
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

# -------------- finding best parameters and initialising instance models with good parameters

# random forest
param_distribs = {
    'n_estimators': randint(low=1, high=200),
    'max_features': randint(low=1, high=8),
    'max_depth': randint(low=1, high=20),
    'min_samples_leaf': randint(low=1, high=4),
}

my_model = RandomForestClassifier()
rnd_search = RandomizedSearchCV(my_model, param_distributions=param_distribs,
                                n_iter=5, cv=3, scoring='balanced_accuracy',
                                random_state=42)
rnd_search.fit(X_train, y_train)

print(rnd_search.best_params_)

rnd_forest = RandomForestClassifier(max_depth=11,
def rand_search(self):
    '''running a randomized search to find the parameter combination
    for a random forest which gives the best accuracy score'''
    print('*' * 80)
    print('* Running RandomizedSearch for best parameter combination for ExtremeRandomForest')
    print('*' * 80)

    # create the decision forest
    extra_clf_rand = ExtraTreesClassifier(random_state=100, max_depth=1, n_jobs=-1)

    with open(os.path.join(self.output_dir,
                           'extreme_randomforest_randomsearch.txt'), 'a') as text_file:
        text_file.write('Created extreme random forest: extra_clf_rand \n')

    # set up randomized search
    param_rand = {
        "criterion": ["gini", "entropy"],     # metric to judge reduction of impurity
        'class_weight': ['balanced', None],
        'n_estimators': randint(100, 10000),  # number of trees in forest
        'max_features': randint(2, 31),       # max number of features when splitting
        "min_samples_split": randint(2, 20),  # min samples per node to induce split
        # "max_depth": randint(1, 10),        # max number of splits to do
        "min_samples_leaf": randint(1, 20),   # min number of samples in a leaf
        "max_leaf_nodes": randint(10, 20)     # max number of leaves
    }

    with open(os.path.join(self.output_dir,
                           'extreme_randomforest_randomsearch.txt'), 'a') as text_file:
        text_file.write('Running randomized search for the following parameters: %s \n'
                        % param_rand)
        text_file.write('use cv=3, scoring=accuracy \n')

    # building and running the randomized search
    rand_search = RandomizedSearchCV(extra_clf_rand,
                                     param_rand,
                                     random_state=5,
                                     cv=3,
                                     n_iter=500,
                                     scoring='accuracy',
                                     n_jobs=-1)

    rand_search_fitted = rand_search.fit(self.X_metrix_train, self.y_train)

    with open(os.path.join(self.output_dir,
                           'extreme_randomforest_randomsearch.txt'), 'a') as text_file:
        text_file.write('Best parameters: ' + str(rand_search_fitted.best_params_) + '\n')
        text_file.write('Best score: ' + str(rand_search_fitted.best_score_) + '\n')

    feature_importances_fitted = rand_search_fitted.best_estimator_.feature_importances_
    feature_importances_fitted_ls = sorted(zip(feature_importances_fitted,
                                               self.X_metrix_train),
                                           reverse=True)

    with open(os.path.join(self.output_dir,
                           'extreme_randomforest_randomsearch.txt'), 'a') as text_file:
        text_file.write('Feature importances: %s \n' % feature_importances_fitted_ls)

    self.best_params = rand_search_fitted.best_params_
    self.feature_importances_fitted_ls = feature_importances_fitted_ls

    def feature_importances_best_estimator(feature_list, directory):
        datestring = datetime.strftime(datetime.now(), '%Y%m%d_%H%M')
        feature_list.sort(key=lambda x: x[1], reverse=True)
        feature = list(zip(*feature_list))[1]
        score = list(zip(*feature_list))[0]
        x_pos = np.arange(len(feature))
        plt.bar(x_pos, score, align='center')
        # plt.figure(figsize=(20, 10))
        plt.xticks(x_pos, feature, rotation=90)
        plt.title('Histogram of Feature Importances for best RandomForest using features')
        plt.xlabel('Features')
        plt.tight_layout()
        plt.savefig(os.path.join(directory,
                                 'feature_importances_best_bar_plot_rand_bag_'
                                 + datestring + '.png'))
        plt.close()

    feature_importances_best_estimator(self.feature_importances_fitted_ls,
                                       self.output_dir)
x_train, y_train, x_test = feature_engineering_titanic.read_titanic()
x_train = x_train.as_matrix()
y_train = y_train.as_matrix()
x_test = x_test.as_matrix()

# split train validate
# x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.3, random_state=0)

# get best model
one_to_left = st.beta(10, 1)
from_zero_positive = st.expon(0, 50)

params = {
    "n_estimators": st.randint(3, 40),
    "max_depth": st.randint(3, 10),
    "learning_rate": st.uniform(0.05, 0.4),
    "colsample_bytree": one_to_left,
    "subsample": one_to_left,
    "gamma": st.uniform(0, 10),
    'reg_alpha': from_zero_positive,
    "min_child_weight": from_zero_positive,
}

xgb_clf = XGBClassifier(nthreads=-1)
best_xgb_model = best_model.get_best_model(x_train, y_train, model=xgb_clf,
                                           params=params, n_iter=500,
"loss": ["hinge", "squared_hinge"], "tol": [0.00001, 0.0001, 0.001] # should probably include this in grid search, as # dual=False is preferred when samples>features. However: # Unsupported set of arguments: The combination of # penalty='l2' and loss='hinge' are not supported when # dual=False # "dual": [True, False] }, 'svm': { "C": [1, 0.5, 0.1, 0.9, 0.8], "tol": [0.00001, 0.0001, 0.001, 0.01], "shrinking": [True, False] }, 'kneighbors': { "n_neighbors": randint(2, 15), "weights": ['uniform', 'distance'], "leaf_size": randint(15, 100) }, 'linear': { "alpha": [0.0001, 0.01, 1.0, 10.0, 1000.0], "tol": [0.00001, 0.0001, 0.001, 0.01] } } TEMPLATES = pkg_resources.resource_filename('q2_sample_classifier', 'assets') def _load_data(feature_data, targets_metadata): '''Load data and generate training and test sets.
    'c__colsample_bylevel': [1, 0.8, 0.6],  # Subsample ratio of columns for each depth level; subsampled from the columns chosen for the current tree.
    'c__colsample_bynode': [1, 0.8, 0.6],   # Subsample ratio of columns for each node (split); subsampled from the columns chosen for the current level.
    'c__num_parallel_tree': [1],            # Number of parallel trees constructed per iteration; used to support boosted random forests.
    'c__max_depth': [2, 3, 6],              # Maximum depth of a tree; larger values make the model more complex and more likely to overfit.
    # 'c__reg_alpha': [0],                  # L1 regularization term on weights; larger values make the model more conservative.
    # 'c__reg_lambda': [1],                 # L2 regularization term on weights; larger values make the model more conservative.
    'c__learning_rate': [0.3],              # Step size shrinkage used in updates to prevent overfitting (default 0.3).
    # 'c__scale_pos_weight': [1],           # should be negative_samples_count / positive_samples_count
    # 'c__objective': ['multi:softmax'],    # XGBoost adjusts this between binary:logistic and multi:softmax based on the number of classes.
    'c__eval_metric': ['mlogloss'],         # logloss heavily penalizes false positives (better precision)
    'c__tree_method': ['hist']
}

PARAMETER_DISTRIBUTION = {
    'i__strategy': ['mean'],                    # 'median', 'most_frequent', 'constant'
    'c__n_estimators': randint(100, 1000),
    'c__subsample': [1],                        # Subsample ratio of training instances; 0.5 means half the training data is sampled before growing trees.
    'c__colsample_bytree': uniform(0.4, 0.6),   # Subsample ratio of columns when constructing each tree; subsampled once per tree.
    'c__colsample_bylevel': uniform(0.4, 0.6),  # Subsample ratio of columns for each depth level.
    'c__colsample_bynode': uniform(0.4, 0.6),   # Subsample ratio of columns for each node (split).
    'c__num_parallel_tree': [1],                # Number of parallel trees constructed per iteration; used to support boosted random forests.
    'c__max_depth': randint(2, 12),             # Maximum depth of a tree; larger values make the model more complex and more likely to overfit.
    # 'c__reg_alpha': [0],                      # L1 regularization term on weights; larger values make the model more conservative.
    # 'c__reg_lambda': [1],                     # L2 regularization term on weights; larger values make the model more conservative.
    'c__learning_rate': [0.3],                  # Step size shrinkage used in updates to prevent overfitting (default 0.3).
    # 'c__scale_pos_weight': [1],               # should be negative_samples_count / positive_samples_count
    # 'c__objective': ['multi:softmax'],        # XGBoost adjusts this between binary:logistic and multi:softmax based on the number of classes.
    'c__eval_metric': ['mlogloss'],             # logloss heavily penalizes false positives (better precision)
    'c__tree_method': ['hist']
}
def randomised_search(self):
    print_to_consol('Running randomized search to find best classifier')

    # create the decision forest
    clf1 = DecisionTreeClassifier(random_state=20, class_weight='balanced',
                                  max_features=self.numf)
    ada = AdaBoostClassifier(base_estimator=clf1, algorithm="SAMME.R",
                             random_state=55)

    logging.info('Initialised classifier using balanced class weights \n')

    # set up randomized search
    param_dict = {
        'base_estimator__criterion': ['gini', 'entropy'],
        'n_estimators': randint(100, 10000),  # number of base estimators to use
        'learning_rate': uniform(0.0001, 1.0),
        'base_estimator__min_samples_split': randint(2, 20),
        'base_estimator__max_depth': randint(1, 10),
        'base_estimator__min_samples_leaf': randint(1, 20),
        'base_estimator__max_leaf_nodes': randint(10, 20)
    }

    logging.info(f'Following parameters will be explored in randomized search \n'
                 f'{param_dict} \n')

    # building and running the randomized search
    rand_search = RandomizedSearchCV(ada, param_dict, random_state=5, cv=self.cv,
                                     n_iter=self.numc, scoring='accuracy', n_jobs=-1)

    rand_search_fitted = rand_search.fit(self.X_train_scaled, self.y_train)

    best_parameters = rand_search_fitted.best_params_
    best_scores = rand_search_fitted.best_score_

    logging.info(f'Running randomised search for best parameters of classifier \n'
                 f'Best parameters found: {best_parameters} \n'
                 f'Best accuracy scores found: {best_scores} \n')

    self.model = rand_search_fitted.best_estimator_

    datestring = datetime.strftime(datetime.now(), '%Y%m%d_%H%M')
    joblib.dump(self.model,
                os.path.join(self.directory, 'best_predictor_' + datestring + '.pkl'))

    logging.info(f'Writing best classifier to disk in {self.directory} \n')

    print_to_consol('Getting 95% confidence interval for uncalibrated classifier')

    alpha, upper, lower = get_confidence_interval(self.X_train_scaled, self.y_train,
                                                  self.X_test_scaled, self.y_test,
                                                  self.model, self.directory,
                                                  self.bootiter, 'uncalibrated')

    logging.info(f'{alpha}% confidence interval {upper}% and {lower}% \n'
                 f'for uncalibrated classifier. \n')

    print_to_consol('Getting feature importances for best classifier')

    best_clf_feat_import = self.model.feature_importances_
    best_clf_feat_import_sorted = sorted(zip(best_clf_feat_import,
                                             self.X_train_scaled.columns),
                                         reverse=True)

    logging.info(f'Feature importances for best classifier '
                 f'{best_clf_feat_import_sorted} \n')

    all_clf_feat_import_mean = np.mean(
        [tree.feature_importances_ for tree in self.model.estimators_], axis=0)
    all_clf_feat_import_mean_sorted = sorted(zip(all_clf_feat_import_mean,
                                                 self.X_train_scaled.columns),
                                             reverse=True)

    print_to_consol('Plotting feature importances for best classifier')

    feature_importances_best_estimator(best_clf_feat_import_sorted, self.directory)

    logging.info('Plotting feature importances for best classifier in decreasing order \n')

    feature_importances_error_bars(self.model, self.X_train_scaled.columns,
                                   self.directory)

    logging.info('Plotting feature importances for best classifier with errorbars \n')
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from sklearn.datasets import fetch_california_housing
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from scipy.stats import randint

X, y = fetch_california_housing(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# define the parameter space that will be searched over
param_distributions = {'n_estimators': randint(1, 5),
                       'max_depth': randint(5, 10)}

# now create a searchCV object and fit it to the data
search = RandomizedSearchCV(estimator=RandomForestRegressor(random_state=0),
                            n_iter=5,
                            param_distributions=param_distributions,
                            random_state=0)
search.fit(X_train, y_train)
print(search.best_params_)

# the search object now acts like a normal random forest estimator
# with max_depth=9 and n_estimators=4
print(search.score(X_test, y_test))
r2 = gm_cv.score(X_test, y_test)  # accuracy test (R^2)
mse = mean_squared_error(y_test, y_pred)  # accuracy test
print("Tuned ElasticNet l1 ratio: {}".format(gm_cv.best_params_))
print("Tuned ElasticNet R squared: {}".format(r2))
print("Tuned ElasticNet MSE: {}".format(mse))

# RandomizedSearchCV (w/ DecisionTreeClassifier)
# problem: large hyperparam spaces, many hyperparams -> GridSearchCV comp. exp.
# solution: a fixed number of hyperparam values is sampled
from scipy.stats import randint
from sklearn.model_selection import RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier

# Setup the grid and hyperparam spaces to sample from: param_dist
param_dist = {
    "max_depth": [3, None],
    "max_features": randint(1, 9),
    "min_samples_leaf": randint(1, 9),
    "criterion": ["gini", "entropy"]
}

tree = DecisionTreeClassifier()  # model
# Inst. RandomizedSearchCV. n_iter: the number of parameter settings tried
tree_cv = RandomizedSearchCV(tree, param_dist, cv=5, n_iter=10)
tree_cv.fit(X_train, y_train)  # fitting on training data

print("Tuned Decision Tree Parameters: {}".format(tree_cv.best_params_))
print("Best score is {}".format(tree_cv.best_score_))

#endregion (HYPERPARAMETER TUNING)

#region PREPROCESSING
# featureGain = {}
# featureScore = {}
# lastScore = 0
# count = 0
xx_score = []
cv_pred = []
splitsNum = 5
randomSeed = 42
numBoostRound = 200  # 50
earlyStoppingRounds = 60

# =====================================================================================
tunedParameters = {
    "learning_rate": expon(scale=0.2),  # np.exp(range(-3, 0)),
    # "lambda_l2": 0.25,
    "max_depth": randint(low=5, high=15),
    "num_leaves": [64, 128, 256, 512, 1024],
    "bagging_fraction": uniform(),
    "feature_fraction": uniform(),
    # "min_data_in_leaf":
}
model = lgb.LGBMClassifier(objective="multiclass", num_class=11, seed=42)
searchResult = RandomizedSearchCV(model, tunedParameters, scoring="f1_macro",
                                  n_iter=10, n_jobs=1, cv=3)
searchResult.fit(trainData, trainLabel)
print(searchResult.cv_results_)
print(searchResult.best_params_)
exit()
# =====================================================================================

skf = StratifiedKFold(n_splits=splitsNum, random_state=randomSeed, shuffle=True)
for index, (train_index, test_index) in enumerate(skf.split(trainData, trainLabel)):
    print("[+] " + str(index) + " iteration")
from sklearn.linear_model import LogisticRegression
from scipy.stats import randint, uniform

seed = 0
model = LogisticRegression()
param_dist = {
    # "penalty": ['l1', 'l2'],
    "penalty": ['l2'],
    # "C": [0.1, 0.5, 1.0, 2, 10],
    "C": uniform(0.001, 0.01),
    "random_state": [seed],
    "max_iter": randint(500, 1000),
}
def msm_distance_measure_getter(X):
    """
    generate the msm distance measure
    :param X: dataset to derive parameter ranges from
    :return: distance measure and parameter range dictionary
    """
    n_dimensions = 1  # todo use other dimensions
    return {
        "distance_measure": [cython_wrapper(msm_distance)],
        "dim_to_use": stats.randint(low=0, high=n_dimensions),
        "c": [
            0.01, 0.01375, 0.0175, 0.02125, 0.025, 0.02875, 0.0325, 0.03625,
            0.04, 0.04375, 0.0475, 0.05125, 0.055, 0.05875, 0.0625, 0.06625,
            0.07, 0.07375, 0.0775, 0.08125, 0.085, 0.08875, 0.0925, 0.09625,
            0.1, 0.136, 0.172, 0.208, 0.244, 0.28, 0.316, 0.352, 0.388, 0.424,
            0.46, 0.496, 0.532, 0.568, 0.604, 0.64, 0.676, 0.712, 0.748, 0.784,
            0.82, 0.856, 0.892, 0.928, 0.964, 1, 1.36, 1.72, 2.08, 2.44, 2.8,
            3.16, 3.52, 3.88, 4.24, 4.6, 4.96, 5.32, 5.68, 6.04, 6.4, 6.76,
            7.12, 7.48, 7.84, 8.2, 8.56, 8.92, 9.28, 9.64, 10, 13.6, 17.2,
            20.8, 24.4, 28, 31.6, 35.2, 38.8, 42.4, 46, 49.6, 53.2, 56.8, 60.4,
            64, 67.6, 71.2, 74.8, 78.4, 82, 85.6, 89.2, 92.8, 96.4, 100,
        ],
    }
                                  scoring='roc_auc',
                                  fit_params=None,
                                  cv=None,
                                  verbose=2).fit(X_train, y_train)

# In[97]:

gridSearchAda.best_params_, gridSearchAda.best_score_

# #### GradientBoosting

# In[98]:

gbHyperParams = {'loss': ['deviance', 'exponential'],
                 'n_estimators': randint(10, 500),
                 'max_depth': randint(1, 10)}

# In[99]:

gridSearchGB = RandomizedSearchCV(estimator=gbMod,
                                  param_distributions=gbHyperParams,
                                  n_iter=10,
                                  scoring='roc_auc',
                                  fit_params=None,
                                  cv=None,
                                  verbose=2).fit(X_train, y_train)

# In[100]:
ests = {
    'case-1': [
        # ('lin', LinearRegression()),
        ('bay', BayesianRidge(tol=1e-5)),
        # ('hub', HuberRegressor(max_iter=5000, tol=1e-5)),
        ('ard', ARDRegression(tol=1e-5)),
        ('par', PassiveAggressiveRegressor(max_iter=5000, tol=1e-5)),
        ('rdg', Ridge(max_iter=5000, random_state=seed)),
        ('las', Lasso(max_iter=5000, random_state=seed)),
        # ('eln', ElasticNet(max_iter=5000, tol=1e-5, random_state=seed)),
        ('svr', SVR(kernel='linear')),
        ('mlp', MLPRegressor())
    ]
}

r = uniform(0, 30)
d = randint(2, 10)
f = randint(1, 100)
e = uniform(0, 3)
ee = uniform(0, 1)

pars_1 = {
    'case-1.mlp': {
        'alpha': ee,
        'beta_1': e,
        'beta_2': e,
        'epsilon': ee
    },
    'case-1.eln': {
        'alpha': e,
        'l1_ratio': e
    },
class FixedLengthTupleDistribution:
    """
    Tuples where each element stems from a specified distribution

    Note: this is not a normalized distribution
    """

    def __init__(self, distributions):
        self.distributions = distributions

    def rvs(self, random_state=None):
        return tuple([dist.rvs(random_state=random_state)
                      for dist in self.distributions])


param_grid = {
    'mlp__hidden_layer_sizes': randint(2, 150),
    'mlp__activation': ['logistic', 'tanh', 'relu'],
    'mlp__solver': ['lbfgs'],
    'mlp__alpha': ExpDistribution(uniform(-6, 5)),
    'mlp__learning_rate': ['constant'],
    'mlp__max_iter': [200],
    'mlp__learning_rate_init': ExpDistribution(uniform(-6, 5)),
    # no2 results analysis shows good range between 0.1 and 0.001
    # 'filter__alpha': ExpDistribution(uniform(-3, 2)),
    'mlp__early_stopping': [True]
}

# CO_best = {'mlp__activation': 'relu', 'mlp__alpha': 1.899210794532138e-06,
#            'mlp__hidden_layer_sizes': 112,
#            'mlp__learning_rate_init': 0.07908998568339845,
#            'mlp__solver': 'lbfgs', 'mlp__learning_rate': 'constant',
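# FixedLengthTupleDistribution is defined above but never exercised here;
# a brief sketch of how it could be used, e.g. to sample two-layer
# hidden_layer_sizes tuples for the MLP (the layer-size ranges below are
# assumptions, not from the original source):
from scipy.stats import randint

two_layer_sizes = FixedLengthTupleDistribution(
    [randint(16, 128), randint(8, 64)])
print(two_layer_sizes.rvs(random_state=0))  # prints a length-2 tuple of ints

# Because it exposes rvs(), it can slot straight into a param_distributions
# dict, e.g. {'mlp__hidden_layer_sizes': two_layer_sizes, ...}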
For random search, instead of passing lists of parameter values, you pass
probability-distribution objects the parameters can be sampled from.
"Probability distribution" may sound difficult, but it is simple and easy.
First, let's import two probability-distribution classes from SciPy.

What is SciPy? SciPy is one of Python's core scientific libraries: a library
dedicated to numerical computation, including integration, interpolation,
linear algebra, and probability. scikit-learn makes heavy use of NumPy and
SciPy. It comes preinstalled on Colab.
'''
'''
The uniform and randint classes in SciPy's stats subpackage both draw values
evenly from a given range; this is called sampling from a uniform distribution.
randint draws integers and uniform draws floats, but they are used the same
way. Let's create a randint object over the range 0 to 10 and sample 10 numbers.
'''
rgen = randint(0, 10)
nansu = rgen.rvs(10)
print(nansu)
'''
Good. You can think of it much like a random-number generator. We hand randint
and uniform objects to the random search and tell it how many times to sample
in total while looking for the best parameters. The more samples the system
resources allow, the better.

Now let's build the dictionary of parameters to explore. Here we add the
min_samples_leaf parameter to the search: the minimum number of samples a
leaf node must have. A node is not split if the split would produce a child
node with fewer samples than this value.
'''
params = {
    'min_impurity_decrease': uniform(0.0001, 0.001),
    'max_depth': randint(20, 50),
def test_smoke_hyperband(min_iter):
    seed = 10
    n_splits = 4
    eta = 3
    max_iter = 27
    params = dict(a=randint(low=1, high=100))
    cmax_param = {'b': max_iter}
    if min_iter is not None:
        cmin_param = {'b': min_iter}
    else:
        cmin_param = None

    hyperband_search = HyperbandSearchCV(
        DummyCVedEstimator(),
        cost_parameter_max=cmax_param,
        cost_parameter_min=cmin_param,
        cv=n_splits,
        iid=False,
        return_train_score=False,
        eta=eta,
        param_distributions=params,
        random_state=seed)
    hyperband_search.fit(np.random.normal(size=(100, 3)),
                         np.random.choice(2, size=100, replace=True))

    rng = np.random.RandomState(seed=seed)
    ri = randint(low=1, high=100)

    # b_vals is the geometric sequence of values of b hyperband will test
    if min_iter is None or min_iter == 1:
        b_vals = np.array(
            [1] * 27 + [3] * 9 + [9] * 3 + [27] +
            [3] * 12 + [9] * 4 + [27] +
            [9] * 6 + [27] * 2 +
            [27] * 4)
        a_itrs = [[27, 9, 3, 1], [12, 4, 1], [6, 2], [4]]
    else:
        b_vals = np.array(
            [3] * 9 + [9] * 3 + [27] +
            [9] * 5 + [27] +
            [27] * 3)
        a_itrs = [[9, 3, 1], [5, 1], [3]]

    # now draw the a_vals in the proper order
    a_vals = []
    for bstart, nks in enumerate(a_itrs):
        a_vals_orig = ri.rvs(random_state=rng, size=nks[0]).tolist()
        for i, nk in enumerate(nks):
            scores = np.array(
                [np.random.RandomState(seed=s).uniform()
                 for s in a_vals_orig]) + np.power(3, i + bstart)
            sinds = np.argsort(scores)[::-1]  # bigger is better
            msk = np.zeros_like(a_vals_orig)
            msk[sinds[0:nk]] = 1
            msk = msk.astype(bool)
            a_vals_orig = [a for i, a in enumerate(a_vals_orig) if msk[i]]
        a_vals += a_vals_orig
    a_vals = np.array(a_vals)

    mn_scores = np.array(
        [np.random.RandomState(seed=a).uniform() for a in a_vals]) + b_vals
    best_index = np.argmax(mn_scores)

    # now make sure it got the right values
    assert hyperband_search.best_index_ == best_index, "Best index is wrong!"
    assert hyperband_search.best_score_ == mn_scores[best_index], (
        "Best score is wrong!")
    assert (hyperband_search.best_params_ ==
            {'a': a_vals[best_index], 'b': b_vals[best_index]}), (
        "Best parameters are wrong!")
                        param_grid=params_grid,
                        cv=cv,
                        scoring='neg_mean_squared_error')
bst_grid.fit(X, y)
bst_grid.grid_scores_

print("Best accuracy obtained: {0}".format(bst_grid.best_score_))
print("Parameters:")
for key, value in bst_grid.best_params_.items():
    print("\t{}: {}".format(key, value))

params_dist_grid = {
    'max_depth': [1, 2, 3, 4],
    'gamma': [0, 0.5, 1],
    'n_estimators': randint(1, 1001),  # uniform discrete distribution
    'learning_rate': uniform(),        # uniform continuous distribution on [0, 1)
    'subsample': uniform(),            # uniform continuous distribution on [0, 1)
    'colsample_bytree': uniform()      # uniform continuous distribution on [0, 1)
}

rs_grid = RandomizedSearchCV(estimator=XGBRegressor(**params_fixed, seed=seed),
                             param_distributions=params_dist_grid,
                             n_iter=10,
                             cv=cv,
                             scoring='neg_mean_squared_error',
                             random_state=seed)
rs_grid.fit(X, y)
rs_grid.grid_scores_
rs_grid.best_estimator_
          ('rfr', RandomForestClassifier(n_jobs=1)),
          ('mlp', MLPClassifier(tol=1e-4)),
          ('svc', SVC(tol=1e-4, degree=9)),
          ('rdc', RidgeClassifierCV()),
          ('gbc', GradientBoostingClassifier()),
          ('ada', AdaBoostClassifier()),
          ('svc', SVC(tol=1e-4, degree=7, kernel='linear')),
          ('bag', BaggingClassifier(n_jobs=1))]

ests_1 = {
    'case-1': est_l1,
    # 'case-2': est_l1,
    # 'case-3': est_l1,
    # 'case-4': est_l1
}

r = uniform(0, 30)
d = randint(2, 10)
f = randint(100, 200)
e = uniform(0, 3)
ee = uniform(0, 1)

pars_1 = {}

sc = StandardScaler()
pca = PCA()
fa = FactorAnalysis()
nmf = NMF()

pre_cases = {
    'case-1': [sc],
    # 'case-2': [sc],
    # 'case-3': [pca],
    device=('cuda' if USE_CUDA else 'cpu'),
    max_epochs=5,
    lr=0.01,
    optimizer=torch.optim.RMSprop,
)

pipe = Pipeline(steps + [('net', net)])
# pipe.fit(X, y)

pipe.set_params(net__verbose=0, net__train_split=None)
params = {
    'to_idx__stop_words': ['english', None],
    'to_idx__lowercase': [False, True],
    'to_idx__ngram_range': [(1, 1), (2, 2)],
    'net__module__embedding_dim': stats.randint(32, 256 + 1),
    'net__module__rec_layer_type': ['gru', 'lstm'],
    'net__module__num_units': stats.randint(32, 256 + 1),
    'net__module__num_layers': [1, 2, 3],
    'net__module__dropout': stats.uniform(0, 0.9),
    'net__module__bidirectional': [True, False],
    'net__lr': [10 ** (-stats.uniform(1, 5).rvs()) for _ in range(NUM_CV_STEPS)],
    'net__max_epochs': [5, 10],
}
search = RandomizedSearchCV(pipe, params, n_iter=NUM_CV_STEPS, verbose=2,
                            refit=False, scoring='accuracy',
""" An example training a RandomForestClassifier, performing randomized search using TuneSearchCV. """ from tune_sklearn import TuneSearchCV from sklearn.ensemble import RandomForestClassifier from sklearn import datasets from sklearn.model_selection import train_test_split from scipy.stats import randint import numpy as np digits = datasets.load_digits() x = digits.data y = digits.target x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.2) clf = RandomForestClassifier() param_distributions = { "n_estimators": randint(20, 80), "max_depth": randint(2, 10) } tune_search = TuneSearchCV(clf, param_distributions, n_iter=3) tune_search.fit(x_train, y_train) pred = tune_search.predict(x_test) accuracy = np.count_nonzero(np.array(pred) == np.array(y_test)) / len(pred) print(accuracy)
def sample_preprocess():
    preprocess_distr = {
        "filter_type": ["none", "ma_smoothing"],
        "win_len": randint(2, 20)
    }
    return sample_scikit(preprocess_distr)
pd.DataFrame(grid_search.cv_results_)

# Randomized search
# n_iter: how many parameter settings are sampled before stopping
# there is a backstory to random_state=42... it's probably in the book
# RandomForest surfaces individual features well (it makes it easy to pick
# out the keys that actually matter most)

# In[64]:

from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

param_distribs = {
    'n_estimators': randint(low=1, high=200),
    'max_features': randint(low=1, high=8),
}

forest_reg = RandomForestRegressor(random_state=42)
rnd_search = RandomizedSearchCV(forest_reg, param_distributions=param_distribs,
                                n_iter=10, cv=5, scoring='neg_mean_squared_error',
                                random_state=42)
rnd_search.fit(housing_prepared, housing_labels)

# In[65]:

cvres = rnd_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)
Use a scipy statistical distribution to answer the questions below:

from scipy import stats
from scipy.stats import norm, binom

die_distribution = stats.randint(1, 7)

# What is the probability of rolling a 1?
die_distribution.pmf(1)

# There's a 1 in 2 chance that I'll roll higher than what number?
die_distribution.isf(.5)

# What is the probability of rolling less than or equal to 2?
die_distribution.cdf(2)

# There's a 5 in 6 chance that my roll will be less than or equal to what number?
die_distribution.ppf(5/6)

# There's a 1 in 2 chance that my roll will be less than or equal to what number?
die_distribution.ppf(.5)

# What is the probability of rolling less than or equal to 6?
die_distribution.cdf(6)

# There's a 1 in 3 chance that I'll roll higher than what number?
die_distribution.isf(1/3)

# What is the probability of rolling higher than a 1?
die_distribution.sf(1)
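# The calls above have closed-form answers for a fair die (uniform over the
# integers 1..6), so a quick numeric sanity check (not from the original source):
from scipy import stats

die = stats.randint(1, 7)

print(die.pmf(1))    # 0.1666... -> P(roll == 1) = 1/6
print(die.cdf(2))    # 0.3333... -> P(roll <= 2) = 2/6
print(die.sf(1))     # 0.8333... -> P(roll > 1) = 5/6
print(die.ppf(5/6))  # 5.0       -> a 5-in-6 chance the roll is <= 5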
class Training:

    def __init__(self, n_calls):
        self.n_calls = n_calls

    def afl_loss(y_true, y_pred):
        return -np.sum(1 + np.log2(y_true * y_pred + (1 - y_true) * (1 - y_pred)))

    scorer = make_scorer(afl_loss, greater_is_better=False, needs_proba=True)

    spaceR = {
        'n_estimators': randint(low=350, high=700),
        'max_depth': randint(low=3, high=20),
        'min_samples_split': uniform(0.01, 0.99),
        'min_samples_leaf': randint(low=1, high=10),
        'min_weight_fraction_leaf': uniform(0, 0.5),
        'max_features': randint(low=1, high=18),
        'max_leaf_nodes': randint(low=2, high=1000),
        'min_impurity_decrease': uniform(0, 2)
    }

    spaceB = {
        'n_estimators': Integer(200, 1000),
        'max_depth': Integer(3, 20),
        'min_samples_split': Real(0.01, .99, "uniform"),
        'min_samples_leaf': Integer(1, 10),
        'min_weight_fraction_leaf': Real(0, 0.5, "uniform"),
        'max_features': Integer(1, 17),
        'max_leaf_nodes': Integer(2, 1000),
        'min_impurity_decrease': Real(0, 2)
    }

    def trainR(self, X_list, y_list, space=spaceR, cv=5):
        """
        RandomSearchCV method
        :param X_list: List of training sets
        :param y_list: List of targets
        :param space: parameter space
        :return: models and metrics
        """
        n_calls = self.n_calls
        scores = []
        val_scores = []
        best_models = []
        for j in range(len(X_list)):
            classifier = RandomForestClassifier(n_jobs=-1)
            y = y_list.copy()
            X = X_list.copy()
            y_test = y.pop(j)
            X_test = X.pop(j)
            y_train = np.concatenate(y, axis=0)
            X_train = np.concatenate(X, axis=0)
            X_train = Features().div_cols(X_train).values
            X_test = Features().div_cols(X_test).values
            start = time()
            opt = RandomizedSearchCV(classifier, param_distributions=space,
                                     n_iter=n_calls, scoring=self.scorer,
                                     cv=cv, n_jobs=-1, iid=False)
            opt.fit(X_train, y_train)
            model = opt.best_estimator_
            print('Season', 2019 - j)
            print("Random CV search took %.2f seconds for %d candidates"
                  " parameter settings." % ((time() - start), n_calls))
            print("val. score:", opt.best_score_)
            print("test score:", opt.score(X_test, y_test))
            # print(model)
            print("")
            best_models.append(model)
            val_scores.append(opt.best_score_)
            scores.append(opt.score(X_test, y_test))
        return scores, val_scores, best_models

    def trainB(self, X_list, y_list, n_points=1, space=spaceB, cv=5):
        """
        BayesianSearchCV method
        :param X_list: List of training sets
        :param y_list: List of targets
        :param space: parameter space
        :return: models and metrics
        """
        n_calls = self.n_calls
        scores = []
        val_scores = []
        best_models = []
        for j in range(len(X_list)):
            classifier = RandomForestClassifier(n_jobs=-1)
            y = y_list.copy()
            X = X_list.copy()
            y_test = y.pop(j)
            X_test = X.pop(j)
            y_train = np.concatenate(y, axis=0)
            X_train = np.concatenate(X, axis=0)
            X_train = Features().div_cols(X_train).values
            X_test = Features().div_cols(X_test).values
            start = time()
            opt = BayesSearchCV(classifier, search_spaces=space,
                                scoring=self.scorer, cv=cv,
                                n_points=n_points, n_iter=n_calls, n_jobs=-1)
            opt.fit(X_train, y_train)
            model = opt.best_estimator_
            print('Season', 2019 - j)
            print("Bayes CV search took %.2f seconds for %d candidates"
                  " parameter settings." % ((time() - start), n_calls))
            print("val. score:", opt.best_score_)
            print("test score:", opt.score(X_test, y_test))
            # print(model)
            print("")
            best_models.append(model)
            val_scores.append(opt.best_score_)
            scores.append(opt.score(X_test, y_test))
        return scores, val_scores, best_models
from scipy import stats
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.linear_model import SGDClassifier

if __name__ == '__main__':
    x, y, num, cat = data_prep()

    # (step name, estimator, parameter distributions) triples; the keys are
    # prefixed with the step name for use inside a Pipeline.
    classifiers = [
        ('KNN', KNeighborsClassifier(), {
            'KNN__algorithm': ('auto', 'ball_tree', 'kd_tree', 'brute'),
            'KNN__n_neighbors': stats.randint(4, 40),
            'KNN__p': stats.randint(1, 5),
            'KNN__weights': ('uniform', 'distance')
        }),
        ('ETclf', ExtraTreesClassifier(), {
            'ETclf__n_estimators': stats.randint(10, 500),
            'ETclf__criterion': ('gini', 'entropy'),
            'ETclf__max_depth': stats.randint(10, 50),
            'ETclf__max_features': ('sqrt', 'log2', 'auto'),
            # 'ETclf__max_leaf_nodes': (None, 2, 4, 5, 6, 7, 9, 10),
            # 'ETclf__min_samples_leaf': [2, 4, 6, 8, 10],
            # 'ETclf__min_samples_split': [2, 4, 6, 8, 10]
        }),
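The classifiers list is cut off above and the search loop itself is not shown; a hypothetical continuation under that assumption, wrapping each estimator in a scaling Pipeline and tuning it with RandomizedSearchCV:

from sklearn.pipeline import Pipeline

# Hypothetical driver loop: one randomized search per (name, clf, params) triple.
for name, clf, params in classifiers:
    pipe = Pipeline([('scaler', StandardScaler()), (name, clf)])
    search = RandomizedSearchCV(pipe, param_distributions=params, n_iter=30,
                                cv=StratifiedKFold(n_splits=5), n_jobs=-1)
    search.fit(x, y)
    print(name, search.best_score_, search.best_params_)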
    sh.fit(X, y)
    assert sh.n_candidates_[0] == expected_n_candidates
    if n_candidates == 'exhaust':
        # Make sure 'exhaust' makes the last iteration use as much
        # resources as we can
        assert sh.n_resources_[-1] == max_resources


@pytest.mark.parametrize(
    'param_distributions, expected_n_candidates',
    [({'a': [1, 2]}, 2),          # all lists, sample less than n_candidates
     ({'a': randint(1, 3)}, 10)]  # not all lists, respect n_candidates
)
def test_random_search_discrete_distributions(param_distributions,
                                              expected_n_candidates):
    # Make sure random search samples the appropriate number of candidates
    # when we ask for more than what's possible. How many parameters are
    # sampled depends on whether the distributions are 'all lists' or not
    # (see ParameterSampler for details). This is somewhat redundant with
    # the checks in ParameterSampler but interaction bugs were discovered
    # during development of SH.
    n_samples = 1024
    X, y = make_classification(n_samples=n_samples, random_state=0)
    base_estimator = FastClassifier()
    sh = HalvingRandomSearchCV(base_estimator,
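The "all lists" rule the test exercises can be demonstrated directly with ParameterSampler (a standalone illustration, not part of the test file):

from scipy.stats import randint
from sklearn.model_selection import ParameterSampler

# All parameters are lists: sampling is without replacement, so only the
# 2 distinct grid points come back even though n_iter=10 (with a warning).
print(len(list(ParameterSampler({'a': [1, 2]}, n_iter=10))))         # 2

# At least one scipy distribution: sampling is with replacement, so the
# full 10 requested candidates are drawn (duplicates allowed).
print(len(list(ParameterSampler({'a': randint(1, 3)}, n_iter=10))))  # 10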
from scipy.stats import expon, randint
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.svm import SVC

#scaling and splitting data
#X_data = preprocessing.scale(X_data)
#Y_data = preprocessing.scale(Y_data)
print("done")

X_train, X_test, Y_train, Y_test = train_test_split(X_data, Y_data)

#Exhaustive grid search parameters
param_grid = {'C': [1, 10, 100, 1000],
              'kernel': ['rbf', 'poly', 'sigmoid', 'linear'],
              'degree': [0, 1, 2, 3, 4, 5],
              'gamma': [1e-3, 1e-4, 1e-5, 1e-6],
              'coef0': [0, 1, 2, 3, 4, 5]}
#param_grid = [{'C': [1, 10, 100, 1000], 'kernel': ['sigmoid'], 'gamma': [1e-3, 1e-4, 1e-5, 1e-6]},
#              {'C': [1, 10, 100, 1000], 'kernel': ['poly'], 'gamma': [1e-3, 1e-4, 1e-5, 1e-6], 'degree': [1, 2, 3, 4, 5, 6, 7]},
#              {'C': [1, 10, 100, 1000], 'kernel': ['linear']}]

#Randomized grid search parameters
param_dist = {'C': expon(scale=100),
              'gamma': expon(scale=.1),
              'kernel': ['rbf', 'poly', 'sigmoid', 'linear'],
              'degree': randint(1, 100),
              'coef0': randint(0, 100)}
n_iter_search = 20

#Creating and training model
#Regular model
#model = SVC(C=10, gamma=.0001, kernel='rbf', verbose=True)
svc = SVC(verbose=False)  # lowercase name so the SVC class is not shadowed

#Regular grid search
#model = GridSearchCV(svc, param_grid=param_grid)
#searchtype = "GridSearch"

#Randomized grid search
model = RandomizedSearchCV(svc, param_distributions=param_dist, n_iter=n_iter_search)
searchtype = "Randomized, n_iter_search = {}".format(n_iter_search)

#Training
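The snippet stops at the training step; a hypothetical completion that fits the variables defined above:

#Fit the randomized search and report the outcome (assumed continuation).
model.fit(X_train, Y_train)
print("Search type:", searchtype)
print("Best parameters:", model.best_params_)
print("Test accuracy:", model.score(X_test, Y_test))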
# + [markdown]
# ### Random search
# #### Passing probability distribution objects as parameters

# +
# When the parameter space is too large (the search takes too long), sample
# candidates randomly instead of searching the full grid -> the optimum is
# said to land roughly in that neighborhood.
# If the cost is very high, a hybrid approach works: sample randomly over the
# whole space first, and once a rough minimum is found, lay a grid around it
# and repeat.

# +
import numpy as np
from scipy.stats import uniform, randint

# +
# randint: a uniform discrete random variable
rgen = randint(low=0, high=9)
# rgen.rvs(size=2)

# +
# Find the unique elements of an array (and how often each value was drawn).
np.unique(rgen.rvs(1000), return_counts=True)
# np.unique(rgen.rvs(1000))

# +
# uniform: a uniform continuous random variable
ugen = uniform(0, 1)
ugen.rvs(10)

# +
params = {
    'min_impurity_decrease': uniform(0.0001, 0.001),
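# +
# The params dict above is truncated. For context, frozen distributions like
# these are passed straight to RandomizedSearchCV, which calls .rvs() on them
# once per sampled candidate. A minimal sketch with a stand-in model and
# synthetic data (both my assumptions, not from the original notebook):
from sklearn.datasets import make_classification
from sklearn.model_selection import RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier

X, y = make_classification(n_samples=500, random_state=42)
demo_params = {
    'min_impurity_decrease': uniform(0.0001, 0.001),
    'max_depth': randint(2, 25),
}
rs = RandomizedSearchCV(DecisionTreeClassifier(random_state=42), demo_params,
                        n_iter=50, n_jobs=-1, random_state=42)
rs.fit(X, y)
print(rs.best_params_)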
classifier.fit(X_train, y_train)

# A random forest is an ensemble of decision trees: n_estimators specifies
# how many trees we use, and criterion='entropy' (gini is also an option)
# sets the split-quality measure. We picked these values arbitrarily; to find
# good settings for n_estimators, criterion, and the other hyperparameters,
# we use RandomizedSearchCV.

from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

est = RandomForestClassifier(n_jobs=-1)
rf_p_dist = {
    'max_depth': [3, 5, 10, None],
    'n_estimators': [10, 100, 200, 300, 400, 500],
    'max_features': randint(1, 3),
    'criterion': ['gini', 'entropy'],
    'bootstrap': [True, False],
    'min_samples_leaf': randint(1, 4),
}

def hypertuning_rscv(est, p_distr, nbr_iter, X, y):
    # cv=9 runs 9-fold cross-validation (stratified k-fold for classifiers)
    rdmsearch = RandomizedSearchCV(est, param_distributions=p_distr,
                                   n_jobs=-1, n_iter=nbr_iter, cv=9)
    rdmsearch.fit(X, y)
    ht_params = rdmsearch.best_params_
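The helper is cut off right after best_params_; a hedged guess at the completion and a call site (the return values and the n_iter value are assumptions):

def hypertuning_rscv(est, p_distr, nbr_iter, X, y):
    rdmsearch = RandomizedSearchCV(est, param_distributions=p_distr,
                                   n_jobs=-1, n_iter=nbr_iter, cv=9)
    rdmsearch.fit(X, y)
    # Assumed completion: return the best parameters and their CV score.
    return rdmsearch.best_params_, rdmsearch.best_score_

best_params, best_score = hypertuning_rscv(est, rf_p_dist, 40, X_train, y_train)
print(best_params, best_score)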
    # 'model__C': stats.loguniform(1, 2),
    # 'gamma': 'auto',
    # 'model__degree': stats.randint(1, 3),
    "preprocessor__scaler": [StandardScaler(), RobustScaler(), MinMaxScaler()]
}
# params_lr = {"random_grid_search": grid_lr, "model": LinearRegression()}

######################################################
######################################################
# RandomForestRegressor model
######################################################
grid_rfr = {
    'model__n_estimators': stats.randint(1, 300),
    'model__max_depth': stats.randint(1, 300),
    'model__max_samples': stats.randint(1, 300),
    "preprocessor__scaler": [StandardScaler(), RobustScaler(), MinMaxScaler()]
}
# params_rfr = {"random_grid_search": grid_rfr, "model": RandomForestRegressor()}

######################################################
######################################################
# GradientBoostingRegressor model
######################################################
grid_gbr = {
    # 'model__loss': ["ls", "lad", "huber", "quantile"],
    'model__learning_rate': stats.loguniform(0.001, 10),
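The model__ and preprocessor__scaler prefixes imply a two-step Pipeline whose 'preprocessor' step itself exposes a 'scaler' sub-step; a hypothetical sketch of how grid_rfr would be consumed under that assumption:

from scipy import stats
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler

# Hypothetical pipeline whose step names match the parameter-key prefixes:
# 'preprocessor__scaler' swaps out the inner scaler estimator itself.
pipe = Pipeline([
    ("preprocessor", Pipeline([("scaler", StandardScaler())])),
    ("model", RandomForestRegressor()),
])
search = RandomizedSearchCV(pipe, param_distributions=grid_rfr,
                            n_iter=20, n_jobs=-1)
# search.fit(X, y)  # X, y assumed to be defined elsewhere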