Example #1
0
 def get_pipe(self):
     """Build the gradient-boosting regression (GBR) pipeline.

     This follows the template shared by the other pipeline classes:
     resolve the inner CV iterator, build the estimator step, optionally
     prepend a feature-transform step, and optionally wrap everything
     behind a missing-value preparation step.

     Side effect: when ``self.est_kwargs`` is None it is replaced on the
     instance by a default grid.

     Returns:
         A ``Pipeline`` whose final step is a ``GridSearchCV`` over a
         ``GradientBoostingRegressor``; when ``self.do_prep`` is truthy the
         pipeline is nested as the ``'post'`` step behind a ``'prep'`` step.
     """
     if self.inner_cv is None:
         # default inner_cv iterator
         inner_cv = RepeatedKFold(n_splits=10, n_repeats=1, random_state=0)
     else:
         inner_cv = self.inner_cv

     if self.est_kwargs is None:
         # non-hard-coded keyword arguments for the estimator at the end of the pipeline
         self.est_kwargs = {'max_depth': [3, 4], 'n_estimators': [64, 128]}

     # Split the kwargs into hyperparameters tuned by grid search and static
     # parameters passed directly to the estimator.
     hyper_param_dict, gbr_params = self.extractParams(self.est_kwargs)
     if 'random_state' not in gbr_params:
         gbr_params['random_state'] = 0

     steps = [('reg',
               GridSearchCV(GradientBoostingRegressor(**gbr_params),
                            param_grid=hyper_param_dict,
                            cv=inner_cv))]

     if self.bestT:  # user asking for best feature transforms using the GUI
         # list.insert takes exactly (index, item), so the (name, transformer)
         # pair must be passed as a single tuple.
         # In Doug's experience, columnBestTransformer overbuilds the pipeline
         # and doesn't generalize well.
         steps.insert(
             0,
             ('xtransform',
              columnBestTransformer(float_k=len(self.float_idx))))

     outerpipe = Pipeline(steps=steps)
     if self.do_prep:
         # wrap the estimator pipeline inside an outer pipeline that handles
         # missing values first
         outerpipe = Pipeline(
             steps=[('prep', missingValHandler(prep_dict=self.prep_dict)),
                    ('post', outerpipe)])
     return outerpipe
Example #2
0
    def saveFullFloatXy(self):
        """Export prepared X, y and the missing-value mask of X as JSON.

        The output is meant for initial data visualization only; per the
        original author's note it is not used for eventual pipeline training.

        Side effects:
            * stores the exported dictionary on ``self.summary_data``
            * writes ``summaryXy.json`` in the current working directory
            * prints a confirmation message
        """
        # Cleaning object for the covariate data ('pass-through' was noted by
        # the original author as the alternative strategy).
        cleaner = missingValHandler({'impute_strategy': 'impute_knn5'})
        # Data prep: per the original notes this covers imputation and
        # binarizing categorical variables.
        cleaner = cleaner.fit(self.X_df)
        X_float = cleaner.transform(self.X_df)

        # The prepared data may have more columns than X_df when categorical
        # variables were expanded, so column names come from the handler.
        column_names = cleaner.get_feature_names(
            input_features=self.X_df.columns.to_list())
        X_float_df = pd.DataFrame(data=X_float, columns=column_names)

        payload = {
            'full_float_X': X_float_df.to_json(),  # JSON string
            'full_y': self.y_df.to_json(),
            # locations of missing values in X so they can be plotted
            'X_nan_bool': self.X_df.isnull().to_json(),
        }
        self.summary_data = payload
        with open('summaryXy.json', 'w') as f:
            json.dump(payload, f)  # saving a json of the summary data
        print('summary data saved to summaryXy.json')
Example #3
0
 def get_pipe(self):
     """Build the RBF-kernel support-vector regression pipeline.

     The pipeline standardizes X and then grid-searches ``C`` and ``gamma``
     for an ``SVR``. The resolved inner CV iterator (``self.inner_cv`` or a
     10-fold ``RepeatedKFold`` default) is handed to the grid search so a
     caller-supplied iterator is honoured rather than ignored.

     Returns:
         A ``Pipeline``; when ``self.do_prep`` is truthy it is nested as the
         ``'post'`` step behind a ``'prep'`` missing-value step.
     """
     if self.inner_cv is None:
         inner_cv = RepeatedKFold(n_splits=10, n_repeats=1, random_state=0)
     else:
         inner_cv = self.inner_cv

     # parameter grid for SVR using two parameters, C and gamma
     param_grid = {
         'C': np.logspace(-2, 2, self.gridpoints * 2),
         'gamma': np.logspace(-2, 0.5, self.gridpoints),
     }
     steps = [
         ('scaler', StandardScaler()),
         # TEST removal of cache_size, tol and max_iter; probably don't need these
         ('reg',
          GridSearchCV(SVR(kernel='rbf',
                           cache_size=10000,
                           tol=1e-4,
                           max_iter=5000),
                       param_grid=param_grid,
                       cv=inner_cv)),
     ]
     if self.bestT:
         steps.insert(0,
                      ('xtransform',
                       columnBestTransformer(float_k=len(self.float_idx))))

     outerpipe = Pipeline(steps=steps)
     if self.do_prep:
         outerpipe = Pipeline(
             steps=[('prep', missingValHandler(prep_dict=self.prep_dict)),
                    ('post', outerpipe)])
     return outerpipe
Example #4
0
    def get_pipe(self):
        """Build the grid-searched ``FlexibleEstimator`` pipeline.

        Inner pipeline: standardize X, select features with
        ``shrinkBigKTransformer``, then fit ``FlexibleEstimator`` built from
        ``self.flex_kwargs``. A ``GridSearchCV`` over the whole inner pipeline
        tunes ``select__k_share`` and, when ``self.functional_form_search`` is
        truthy, ``reg__form``. The resolved inner CV iterator
        (``self.inner_cv`` or a 10-fold ``RepeatedKFold`` default) drives the
        grid search.

        Returns:
            The ``GridSearchCV`` object, or, when ``self.do_prep`` is truthy, a
            ``Pipeline`` with a ``'prep'`` missing-value step followed by the
            grid search as ``'post'``.
        """
        if self.inner_cv is None:
            inner_cv = RepeatedKFold(n_splits=10, n_repeats=1, random_state=0)
        else:
            inner_cv = self.inner_cv

        steps = [('scaler', StandardScaler()),
                 ('select', shrinkBigKTransformer(max_k=8)),
                 ('reg', FlexibleEstimator(**self.flex_kwargs))]
        if self.bestT:
            # list.insert takes exactly (index, item): pass the
            # (name, transformer) pair as one tuple.
            steps.insert(
                0,
                ('xtransform',
                 columnBestTransformer(float_k=len(self.float_idx))))

        pipe = Pipeline(steps=steps)
        # selecting features. k_share allows you to choose the fraction of
        # features that are retained using the LARS algorithm
        param_grid = {
            'select__k_share': np.linspace(0.2, 1, self.gridpoints * 2)
        }
        if self.functional_form_search:
            param_grid['reg__form'] = ['powXB', 'expXB']  # 'linear' disabled

        outerpipe = GridSearchCV(pipe, param_grid=param_grid, cv=inner_cv)
        if self.do_prep:
            outerpipe = Pipeline(
                steps=[('prep', missingValHandler(prep_dict=self.prep_dict)),
                       ('post', outerpipe)])

        return outerpipe
Example #5
0
 def get_pipe(self):
     """Build the histogram gradient-boosting regression pipeline.

     Returns:
         A single-step ``Pipeline`` around ``HistGradientBoostingRegressor``.
         When ``self.do_prep`` is truthy it is nested as the ``'post'`` step
         behind a ``'prep'`` step whose ``missingValHandler`` uses the
         ``'pass-through'`` impute strategy and the ``cat_idx`` entry taken
         from ``self.prep_dict``.
     """
     regressor_pipe = Pipeline(steps=[('reg', HistGradientBoostingRegressor())])
     if not self.do_prep:
         return regressor_pipe

     # Only cat_idx is forwarded from self.prep_dict; the impute strategy is
     # fixed to 'pass-through' here.
     prep_settings = dict(impute_strategy='pass-through',
                          cat_idx=self.prep_dict['cat_idx'])
     return Pipeline(steps=[
         ('prep', missingValHandler(prep_dict=prep_settings)),
         ('post', regressor_pipe),
     ])
Example #6
0
    def get_pipe(self):
        """Build the shrink / polynomial / shrink linear-regression pipeline.

        X side: select a subset of the original variables with
        ``shrinkBigKTransformer``, expand them with degree-2
        ``PolynomialFeatures``, drop constant columns, select again, and fit
        ``LinearRegression``. The X pipeline is wrapped in a
        ``TransformedTargetRegressor`` so y can be transformed as well, and a
        ``GridSearchCV`` chooses among the candidate y transformers.

        Returns:
            The ``GridSearchCV`` object, or, when ``self.do_prep`` is truthy, a
            ``Pipeline`` with a ``'prep'`` missing-value step followed by the
            grid search as ``'post'``.
        """
        if self.inner_cv is None:
            inner_cv = RepeatedKFold(n_splits=10, n_repeats=1, random_state=0)
        else:
            inner_cv = self.inner_cv

        # Using 3 of many options here: none_T, logp1_T(), log_T()
        transformer_list = [none_T(), log_T(), logp1_T()]
        steps = [
            # retain a subset of the best original variables
            ('shrink_k1',
             shrinkBigKTransformer(
                 selector=LassoLarsCV(cv=inner_cv, max_iter=32))),
            # create interactions among them; interaction_only is a boolean
            # flag, so pass False rather than the integer 0
            ('polyfeat',
             PolynomialFeatures(interaction_only=False, degree=2)),
            ('drop_constant', dropConst()),
            # pick from all of those options
            ('shrink_k2',
             shrinkBigKTransformer(
                 selector=LassoLarsCV(cv=inner_cv, max_iter=64))),
            ('reg', LinearRegression()),
        ]
        if self.bestT:
            steps.insert(0,
                         ('xtransform',
                          columnBestTransformer(float_k=len(self.float_idx))))

        X_T_pipe = Pipeline(steps=steps)
        # develop a new pipeline that allows transformation of y in addition
        # to X, which other scikit-learn transformers don't
        Y_T_X_T_pipe = Pipeline(
            steps=[('ttr', TransformedTargetRegressor(regressor=X_T_pipe))])
        Y_T__param_grid = {
            'ttr__transformer': transformer_list,
            # could use other degrees here if desired
            'ttr__regressor__polyfeat__degree': [2],
        }
        outerpipe = GridSearchCV(Y_T_X_T_pipe,
                                 param_grid=Y_T__param_grid,
                                 cv=inner_cv)
        if self.do_prep:
            outerpipe = Pipeline(
                steps=[('prep', missingValHandler(prep_dict=self.prep_dict)),
                       ('post', outerpipe)])

        return outerpipe
Example #7
0
    def get_pipe(self):
        """Build the stacking-regressor pipeline from ``self.pipelist``.

        Each entry of ``self.pipelist`` is a ``(name, spec)`` pair where
        ``spec['pipe']`` is called with ``**spec['pipe_kwargs']`` to create a
        component pipeline. The components feed a ``StackingRegressor`` whose
        final estimator is ``self.stacker_estimator``.

        Returns:
            A ``Pipeline`` with a ``'prep'`` missing-value step followed by a
            ``'post'`` step holding the stacking regressor.

        Raises:
            AssertionError: with message ``'halt'`` if construction fails for
                any reason; the failure is logged first and the original
                exception is chained as ``__cause__``.
        """
        try:
            # calling the pipelines in self.pipelist with their keyword arguments
            est_pipes = [(p[0], p[1]['pipe'](**p[1]['pipe_kwargs']))
                         for p in self.pipelist]
            final_e = self.stacker_estimator
            steps = [
                ('prep', missingValHandler(prep_dict=self.prep_dict)),
                # passthrough=True would add the original covariates to the
                # final stacked regressor model in addition to the y-hats of
                # the component pipelines
                ('post',
                 make_pipeline(
                     StackingRegressor(est_pipes,
                                       passthrough=False,
                                       final_estimator=final_e,
                                       n_jobs=1)))
            ]
            return Pipeline(steps=steps)
        except Exception as err:
            self.logger.exception('error')
            # Raise explicitly instead of `assert False`: assert statements are
            # stripped under `python -O`, which would make this method return
            # None silently. The exception type callers see is unchanged.
            raise AssertionError('halt') from err
Example #8
0
    def get_pipe(self):
        """Build the polynomial-feature linear SVR pipeline.

        Steps: degree-2 ``PolynomialFeatures`` (squared terms included), drop
        constant columns, feature selection with ``shrinkBigKTransformer``,
        standardization, then a ``GridSearchCV`` over ``C`` for ``LinearSVR``.

        Returns:
            A ``Pipeline``; when ``self.do_prep`` is truthy it is nested as the
            ``'post'`` step behind a ``'prep'`` missing-value step.
        """
        cv_iter = self.inner_cv
        if cv_iter is None:
            cv_iter = RepeatedKFold(n_splits=10, n_repeats=1, random_state=0)

        c_grid = {'C': np.logspace(-2, 4, self.gridpoints * 4)}

        # A leading 'shrink_k1' selection step (LassoLarsCV, max_iter=32) that
        # retained a subset of the original variables is currently disabled.
        poly_step = (
            'polyfeat',
            # every 2nd-order interaction among the features, squares included
            PolynomialFeatures(interaction_only=False, degree=2),
        )
        # drops features without variance, including created interactions
        const_step = ('drop_constant', dropConst())
        select_step = (
            'shrink_k2',
            shrinkBigKTransformer(
                selector=LassoLarsCV(cv=cv_iter, max_iter=64)),
        )
        scale_step = ('scaler', StandardScaler())
        # TEST removal of tol and max_iter; probably don't need these
        # NOTE(review): this grid search is not given cv_iter and so uses
        # GridSearchCV's default CV — confirm that is intended.
        reg_step = (
            'reg',
            GridSearchCV(LinearSVR(random_state=0, tol=1e-4, max_iter=1000),
                         param_grid=c_grid),
        )
        steps = [poly_step, const_step, select_step, scale_step, reg_step]

        if self.bestT:
            # NOTE(review): the transform goes in at position 1, i.e. after
            # 'polyfeat' now that 'shrink_k1' is disabled — confirm placement.
            steps.insert(
                1,
                ('xtransform',
                 columnBestTransformer(float_k=len(self.float_idx))))

        outerpipe = Pipeline(steps=steps)
        if self.do_prep:
            outerpipe = Pipeline(steps=[
                ('prep', missingValHandler(prep_dict=self.prep_dict)),
                ('post', outerpipe),
            ])
        return outerpipe
Example #9
0
    def get_pipe(self):
        """Build the cross-validated Lasso-LARS regression pipeline.

        The pipeline standardizes X and fits ``LassoLarsCV`` using the resolved
        inner CV iterator (``self.inner_cv`` or a 10-fold ``RepeatedKFold``
        default) and ``self.max_n_alphas``.

        Returns:
            A ``Pipeline``; when ``self.do_prep`` is truthy it is nested as the
            ``'post'`` step behind a ``'prep'`` missing-value step.
        """
        if self.inner_cv is None:
            inner_cv = RepeatedKFold(n_splits=10, n_repeats=1, random_state=0)
        else:
            inner_cv = self.inner_cv

        steps = [
            ('scaler', StandardScaler()),  # standardizes the X values
            ('reg',
             LassoLarsCV(cv=inner_cv,
                         max_n_alphas=self.max_n_alphas,
                         normalize=False)),
        ]
        if self.bestT:
            # list.insert takes exactly (index, item): pass the
            # (name, transformer) pair as one tuple.
            steps.insert(
                0,
                ('xtransform',
                 columnBestTransformer(float_k=len(self.float_idx))))
        outerpipe = Pipeline(steps=steps)

        if self.do_prep:
            outerpipe = Pipeline(
                steps=[('prep', missingValHandler(prep_dict=self.prep_dict)),
                       ('post', outerpipe)])
        return outerpipe
Example #10
0
    def get_pipe(self):
        """Build the grid-searched Tweedie GLM pipeline.

        Inner pipeline: standardize X, select features with
        ``shrinkBigKTransformer``, then fit ``TweedieRegressor``. A
        ``GridSearchCV`` over the whole inner pipeline uses
        ``self.est_kwargs`` as its parameter grid and the resolved inner CV
        iterator (``self.inner_cv`` or a 10-fold ``RepeatedKFold`` default).

        Side effect: when ``self.est_kwargs`` is None it is replaced on the
        instance by a default grid.

        Returns:
            The ``GridSearchCV`` object, or, when ``self.do_prep`` is truthy, a
            ``Pipeline`` with a ``'prep'`` missing-value step followed by the
            grid search as ``'post'``. If construction fails, the error is
            logged and None is returned.
        """
        try:
            if self.inner_cv is None:
                inner_cv = RepeatedKFold(n_splits=10,
                                         n_repeats=1,
                                         random_state=0)
            else:
                inner_cv = self.inner_cv

            if self.est_kwargs is None:
                self.est_kwargs = {
                    # investigate the ideal range for alpha
                    'reg__alpha':
                    np.logspace(-5, 10, self.gridpoints).tolist(),
                    # investigate power values between 2 and 3
                    # NOTE(review): np.logspace(1, 3, ...) spans 10 to 1000,
                    # not 1 to 3 — confirm the intended power range.
                    'reg__power':
                    [0, *np.logspace(1, 3, self.gridpoints - 1).tolist()],
                    # maybe look to tweak using k_share
                    'select__max_k': [4, 8, 32],
                }

            steps = [('scaler', StandardScaler()),
                     ('select', shrinkBigKTransformer(max_k=8)),
                     ('reg', TweedieRegressor())]
            if self.bestT:
                # list.insert takes exactly (index, item): pass the
                # (name, transformer) pair as one tuple.
                steps.insert(
                    0,
                    ('xtransform',
                     columnBestTransformer(float_k=len(self.float_idx))))

            outerpipe = GridSearchCV(Pipeline(steps=steps),
                                     param_grid=self.est_kwargs,
                                     cv=inner_cv)
            if self.do_prep:
                outerpipe = Pipeline(
                    steps=[('prep',
                            missingValHandler(prep_dict=self.prep_dict)),
                           ('post', outerpipe)])
            return outerpipe
        except Exception:
            # Catch Exception rather than everything so KeyboardInterrupt and
            # SystemExit still propagate; the log-and-return-None contract is
            # kept for existing callers.
            self.logger.exception('get_pipe error for flexibleGLM')
            return None
Example #11
0
    def get_pipe(self):
        """Build the cross-validated elastic-net regression pipeline.

        The pipeline standardizes X and fits ``ElasticNetCV``, which receives a
        list of candidate ``l1_ratio`` values and an ``n_alphas`` count instead
        of being wrapped in an external ``GridSearchCV``.

        Returns:
            A ``Pipeline``; when ``self.do_prep`` is truthy it is nested as the
            ``'post'`` step behind a ``'prep'`` missing-value step.
        """
        cv_iter = self.inner_cv
        if cv_iter is None:
            cv_iter = RepeatedKFold(n_splits=10, n_repeats=1, random_state=0)

        n_grid = self.gridpoints
        # Regularization hyperparameter for ENet: candidate l1_ratio values.
        # The factor of 2 was chosen somewhat arbitrarily by the original author.
        l1_candidates = 1 - np.logspace(-2, -.03, n_grid * 2)
        # Another regularization hyperparameter for ENet; the factor of 5 was
        # also chosen somewhat arbitrarily.
        alpha_count = n_grid * 5

        # StandardScaler chosen as the perceived "best" option for scaling data
        # (from Doug's experience).
        scaler_step = ('scaler', StandardScaler())
        # The l1_ratio list is passed straight to ElasticNetCV rather than
        # searched by a wrapping GridSearchCV.
        enet_step = ('reg',
                     ElasticNetCV(cv=cv_iter,
                                  normalize=False,
                                  l1_ratio=l1_candidates,
                                  n_alphas=alpha_count))
        steps = [scaler_step, enet_step]

        if self.bestT:
            steps.insert(
                0,
                ('xtransform',
                 columnBestTransformer(float_k=len(self.float_idx))))

        outerpipe = Pipeline(steps=steps)
        if self.do_prep:
            outerpipe = Pipeline(steps=[
                ('prep', missingValHandler(prep_dict=self.prep_dict)),
                ('post', outerpipe),
            ])
        return outerpipe