def get_pipe(self):
    """Build the RBF-kernel SVR pipeline.

    The features are standardized and then passed to an ``SVR`` whose ``C``
    and ``gamma`` are chosen by ``GridSearchCV``. When ``self.bestT`` is
    truthy a ``columnBestTransformer`` step is placed first; when
    ``self.do_prep`` is truthy the result is wrapped in an outer pipeline
    behind a ``missingValHandler`` step.

    Returns:
        Pipeline: the assembled, unfitted pipeline.
    """
    if self.inner_cv is None:
        # default inner cross-validation iterator
        inner_cv = RepeatedKFold(n_splits=10, n_repeats=1, random_state=0)
    else:
        inner_cv = self.inner_cv

    # parameter grid for SVR using two parameters, C and gamma
    param_grid = {
        'C': np.logspace(-2, 2, self.gridpoints * 2),
        'gamma': np.logspace(-2, 0.5, self.gridpoints),
    }

    steps = [
        ('scaler', StandardScaler()),
        # TEST removal of cache_size, tol and max_iter; probably don't need these
        ('reg', GridSearchCV(
            SVR(kernel='rbf', cache_size=10000, tol=1e-4, max_iter=5000),
            param_grid=param_grid,
            # Previously omitted, which left the resolved inner_cv unused
            # and made the grid search ignore self.inner_cv.
            cv=inner_cv)),
    ]
    if self.bestT:
        steps.insert(
            0,
            ('xtransform',
             columnBestTransformer(float_k=len(self.float_idx))))

    outerpipe = Pipeline(steps=steps)
    if self.do_prep:
        # wrap the modelling pipeline behind the missing-value handler
        outerpipe = Pipeline(steps=[
            ('prep', missingValHandler(prep_dict=self.prep_dict)),
            ('post', outerpipe),
        ])
    return outerpipe
def get_pipe(self):
    """Build the gradient-boosting regression (GBR) pipeline.

    This code follows a template shared by other pipeline classes.
    ``self.est_kwargs`` holds the non-hard-coded keyword arguments for the
    estimator at the end of the pipeline; when it is ``None`` it is set to a
    default grid here. ``self.extractParams`` splits it into tunable
    hyperparameters (selected by grid search) and static parameters (passed
    directly to ``GradientBoostingRegressor``).

    Returns:
        Pipeline: the assembled, unfitted pipeline. A
        ``columnBestTransformer`` step is prepended when ``self.bestT`` is
        truthy, and the whole pipeline is wrapped behind a
        ``missingValHandler`` step when ``self.do_prep`` is truthy.
    """
    if self.inner_cv is None:
        # default inner_cv iterator
        inner_cv = RepeatedKFold(n_splits=10, n_repeats=1, random_state=0)
    else:
        inner_cv = self.inner_cv

    if self.est_kwargs is None:
        self.est_kwargs = {'max_depth': [3, 4], 'n_estimators': [64, 128]}

    # tunable hyperparameters go to the grid search; static parameters go
    # straight to the estimator
    hyper_param_dict, gbr_params = self.extractParams(self.est_kwargs)
    if 'random_state' not in gbr_params:
        gbr_params['random_state'] = 0

    steps = [
        ('reg', GridSearchCV(
            GradientBoostingRegressor(**gbr_params),
            param_grid=hyper_param_dict,
            cv=inner_cv)),
    ]
    if self.bestT:
        # user asking for best feature transforms using the GUI.
        # list.insert takes (index, item): the step must be one
        # (name, transformer) tuple, not two separate arguments.
        # GET BACK TO columnBestTransformer: in Doug's experience it
        # overbuilds the pipeline and doesn't generalize well.
        steps.insert(
            0,
            ('xtransform',
             columnBestTransformer(float_k=len(self.float_idx))))

    outerpipe = Pipeline(steps=steps)
    if self.do_prep:
        # wrapping outerpipe inside another outerpipe pipeline
        outerpipe = Pipeline(steps=[
            ('prep', missingValHandler(prep_dict=self.prep_dict)),
            ('post', outerpipe),
        ])
    return outerpipe
def get_pipe(self):
    """Build the grid-searched ``FlexibleEstimator`` pipeline.

    The inner pipeline standardizes the features, selects a subset with
    ``shrinkBigKTransformer`` and fits ``FlexibleEstimator`` built from
    ``self.flex_kwargs``. ``GridSearchCV`` tunes ``select__k_share`` and,
    when ``self.functional_form_search`` is truthy, ``reg__form``.

    Returns:
        GridSearchCV or Pipeline: the unfitted grid search, wrapped in a
        ``Pipeline`` behind a ``missingValHandler`` step when
        ``self.do_prep`` is truthy.
    """
    if self.inner_cv is None:
        inner_cv = RepeatedKFold(n_splits=10, n_repeats=1, random_state=0)
    else:
        inner_cv = self.inner_cv

    steps = [
        ('scaler', StandardScaler()),
        ('select', shrinkBigKTransformer(max_k=8)),
        ('reg', FlexibleEstimator(**self.flex_kwargs)),
    ]
    if self.bestT:
        # list.insert takes (index, item): pass the step as a single
        # (name, transformer) tuple.
        steps.insert(
            0,
            ('xtransform',
             columnBestTransformer(float_k=len(self.float_idx))))
    pipe = Pipeline(steps=steps)

    # selecting features. k_share allows you to choose the fraction of
    # features that are retained using the LARS algorithm
    param_grid = {
        'select__k_share': np.linspace(0.2, 1, self.gridpoints * 2),
    }
    if self.functional_form_search:
        param_grid['reg__form'] = ['powXB', 'expXB']  # ,'linear']

    # cv was previously omitted, leaving the resolved inner_cv unused.
    outerpipe = GridSearchCV(pipe, param_grid=param_grid, cv=inner_cv)
    if self.do_prep:
        outerpipe = Pipeline(steps=[
            ('prep', missingValHandler(prep_dict=self.prep_dict)),
            ('post', outerpipe),
        ])
    return outerpipe
def get_pipe(self):
    """Build the LassoLars / polynomial-feature pipeline with a y-transform.

    The X pipeline shrinks the original variables, creates degree-2
    polynomial features, drops constant columns, shrinks again and fits a
    ``LinearRegression``. It is wrapped in ``TransformedTargetRegressor`` so
    that ``GridSearchCV`` can also choose a transformation of y.

    Returns:
        GridSearchCV or Pipeline: the unfitted grid search, wrapped in a
        ``Pipeline`` behind a ``missingValHandler`` step when
        ``self.do_prep`` is truthy.
    """
    if self.inner_cv is None:
        inner_cv = RepeatedKFold(n_splits=10, n_repeats=1, random_state=0)
    else:
        inner_cv = self.inner_cv

    # Using 3 of many options here: none_T, logp1_T(), log_T()
    transformer_list = [none_T(), log_T(), logp1_T()]

    steps = [
        # retain a subset of the best original variables
        ('shrink_k1', shrinkBigKTransformer(
            selector=LassoLarsCV(cv=inner_cv, max_iter=32))),
        # create interactions among them; interaction_only is given as a
        # real bool (it was the int 0), which is what scikit-learn expects
        ('polyfeat', PolynomialFeatures(interaction_only=False, degree=2)),
        ('drop_constant', dropConst()),
        # pick from all of those options
        ('shrink_k2', shrinkBigKTransformer(
            selector=LassoLarsCV(cv=inner_cv, max_iter=64))),
        ('reg', LinearRegression()),
    ]
    if self.bestT:
        steps.insert(
            0,
            ('xtransform',
             columnBestTransformer(float_k=len(self.float_idx))))
    X_T_pipe = Pipeline(steps=steps)

    # develop a new pipeline that allows transformation of y in addition to
    # X, which other scikit learn transformers don't
    Y_T_X_T_pipe = Pipeline(
        steps=[('ttr', TransformedTargetRegressor(regressor=X_T_pipe))])
    Y_T__param_grid = {
        'ttr__transformer': transformer_list,
        # could use other degrees here if desired
        'ttr__regressor__polyfeat__degree': [2],
    }

    outerpipe = GridSearchCV(
        Y_T_X_T_pipe, param_grid=Y_T__param_grid, cv=inner_cv)
    if self.do_prep:
        outerpipe = Pipeline(steps=[
            ('prep', missingValHandler(prep_dict=self.prep_dict)),
            ('post', outerpipe),
        ])
    return outerpipe
def get_pipe(self):
    """Build the polynomial-feature ``LinearSVR`` pipeline.

    The pipeline creates degree-2 polynomial features, drops constant
    columns, shrinks the feature set with a ``LassoLarsCV`` selector,
    standardizes, and fits a ``LinearSVR`` whose ``C`` is chosen by
    ``GridSearchCV``.

    Returns:
        Pipeline: the assembled, unfitted pipeline, wrapped behind a
        ``missingValHandler`` step when ``self.do_prep`` is truthy.
    """
    if self.inner_cv is None:
        inner_cv = RepeatedKFold(n_splits=10, n_repeats=1, random_state=0)
    else:
        inner_cv = self.inner_cv

    gridpoints = self.gridpoints
    param_grid = {'C': np.logspace(-2, 4, gridpoints * 4)}

    steps = [
        # ('shrink_k1', shrinkBigKTransformer(
        #     selector=LassoLarsCV(cv=inner_cv, max_iter=32))),
        # ^ would retain only a subset of the original variables for
        #   continued use

        # create every 2nd-order interaction among the features,
        # including squared terms
        ('polyfeat', PolynomialFeatures(interaction_only=False, degree=2)),
        # drops features without variance, including created interactions
        ('drop_constant', dropConst()),
        ('shrink_k2', shrinkBigKTransformer(
            selector=LassoLarsCV(cv=inner_cv, max_iter=64))),
        ('scaler', StandardScaler()),
        # TEST removal of cache_size, tol and max_iter; probably don't need these
        ('reg', GridSearchCV(
            LinearSVR(random_state=0, tol=1e-4, max_iter=1000),
            param_grid=param_grid,
            # Use the same CV iterator as the selector step; it was
            # previously omitted here.
            cv=inner_cv)),
    ]
    if self.bestT:
        # Placed right after the first step, as before.
        # NOTE(review): with 'shrink_k1' commented out the first step is
        # 'polyfeat', so the transformer now runs after feature expansion
        # while float_k is still len(self.float_idx) -- confirm this
        # ordering is intended.
        steps.insert(
            1,
            ('xtransform',
             columnBestTransformer(float_k=len(self.float_idx))))

    outerpipe = Pipeline(steps=steps)
    if self.do_prep:
        outerpipe = Pipeline(steps=[
            ('prep', missingValHandler(prep_dict=self.prep_dict)),
            ('post', outerpipe),
        ])
    return outerpipe
def get_pipe(self):
    """Build the ``LassoLarsCV`` pipeline.

    The features are standardized and passed to ``LassoLarsCV``, which is
    given the inner cross-validation iterator and ``self.max_n_alphas``.

    Returns:
        Pipeline: the assembled, unfitted pipeline. A
        ``columnBestTransformer`` step is prepended when ``self.bestT`` is
        truthy, and the whole pipeline is wrapped behind a
        ``missingValHandler`` step when ``self.do_prep`` is truthy.
    """
    if self.inner_cv is None:
        inner_cv = RepeatedKFold(n_splits=10, n_repeats=1, random_state=0)
    else:
        inner_cv = self.inner_cv

    steps = [
        ('scaler', StandardScaler()),  # standardizes the X values
        ('reg', LassoLarsCV(cv=inner_cv,
                            max_n_alphas=self.max_n_alphas,
                            normalize=False)),
    ]
    if self.bestT:
        # list.insert takes (index, item): pass the step as a single
        # (name, transformer) tuple; three arguments raise TypeError.
        steps.insert(
            0,
            ('xtransform',
             columnBestTransformer(float_k=len(self.float_idx))))

    outerpipe = Pipeline(steps=steps)
    if self.do_prep:
        outerpipe = Pipeline(steps=[
            ('prep', missingValHandler(prep_dict=self.prep_dict)),
            ('post', outerpipe),
        ])
    return outerpipe
def get_pipe(self):
    """Build the grid-searched ``TweedieRegressor`` (GLM) pipeline.

    The inner pipeline standardizes the features, selects a subset with
    ``shrinkBigKTransformer`` and fits a ``TweedieRegressor``.
    ``self.est_kwargs`` is used as the ``GridSearchCV`` parameter grid; when
    it is ``None`` it is set to a default grid here.

    Returns:
        GridSearchCV, Pipeline or None: the unfitted grid search, wrapped in
        a ``Pipeline`` behind a ``missingValHandler`` step when
        ``self.do_prep`` is truthy. If building fails, the exception is
        logged through ``self.logger`` and ``None`` is returned.
    """
    try:
        if self.inner_cv is None:
            inner_cv = RepeatedKFold(
                n_splits=10, n_repeats=1, random_state=0)
        else:
            inner_cv = self.inner_cv

        if self.est_kwargs is None:
            self.est_kwargs = {
                # investigate the ideal range for alpha
                'reg__alpha': np.logspace(-5, 10, self.gridpoints).tolist(),
                # investigate power values between 2 and 3
                # NOTE(review): logspace(1, 3, ...) spans 10..1000, not
                # 2..3 -- confirm the intended grid.
                'reg__power': [
                    0,
                    *np.logspace(1, 3, self.gridpoints - 1).tolist(),
                ],
                # maybe look to tweak using k_share
                'select__max_k': [4, 8, 32],
            }

        steps = [
            ('scaler', StandardScaler()),
            ('select', shrinkBigKTransformer(max_k=8)),
            ('reg', TweedieRegressor()),
        ]
        if self.bestT:
            # list.insert takes (index, item): pass the step as a single
            # (name, transformer) tuple; three arguments raise TypeError.
            steps.insert(
                0,
                ('xtransform',
                 columnBestTransformer(float_k=len(self.float_idx))))

        outerpipe = GridSearchCV(Pipeline(steps=steps),
                                 param_grid=self.est_kwargs,
                                 cv=inner_cv)
        if self.do_prep:
            outerpipe = Pipeline(steps=[
                ('prep', missingValHandler(prep_dict=self.prep_dict)),
                ('post', outerpipe),
            ])
        return outerpipe
    except Exception:
        # Narrowed from a bare ``except`` so KeyboardInterrupt/SystemExit
        # propagate; ordinary errors are still logged and yield None.
        self.logger.exception('get_pipe error for flexibleGLM')
        return None
def get_pipe(self):
    """Assemble the ``ElasticNetCV`` pipeline.

    The features are standardized and then fit with ``ElasticNetCV``, which
    receives the list of ``l1_ratio`` candidates and the number of alphas
    directly instead of being wrapped in an outer ``GridSearchCV``.

    Returns:
        Pipeline: the assembled, unfitted pipeline. A
        ``columnBestTransformer`` step comes first when ``self.bestT`` is
        truthy, and the whole pipeline sits behind a ``missingValHandler``
        step when ``self.do_prep`` is truthy.
    """
    if self.inner_cv is not None:
        cv_iter = self.inner_cv
    else:
        cv_iter = RepeatedKFold(n_splits=10, n_repeats=1, random_state=0)

    n_grid = self.gridpoints

    # Regularization hyperparameter for ENet: a manually created list of
    # gridpoints for l1_ratio. The multiplication by 2 is somewhat
    # arbitrary; Sci-kit learn documentation?
    l1_ratio_grid = 1 - np.logspace(-2, -.03, n_grid * 2)

    # Another regularization hyperparameter for ENet; the multiplication by
    # 5 was chosen somewhat arbitrarily.
    alpha_count = n_grid * 5

    # StandardScaler chosen as perceived "best" option for scaling data
    # (from Doug's experience).
    scaler_step = ('scaler', StandardScaler())
    # ENet chooses among the l1_ratio values with its own internal search,
    # so the list is passed straight to l1_ratio rather than via GridSearchCV.
    reg_step = ('reg', ElasticNetCV(cv=cv_iter,
                                    normalize=False,
                                    l1_ratio=l1_ratio_grid,
                                    n_alphas=alpha_count))

    pipe_steps = [scaler_step, reg_step]
    if self.bestT:
        best_t_step = (
            'xtransform',
            columnBestTransformer(float_k=len(self.float_idx)))
        pipe_steps.insert(0, best_t_step)

    model_pipe = Pipeline(steps=pipe_steps)
    if not self.do_prep:
        return model_pipe

    # Put the missing-value handler in front of the modelling pipeline.
    return Pipeline(steps=[
        ('prep', missingValHandler(prep_dict=self.prep_dict)),
        ('post', model_pipe),
    ])