Example #1
    def get_LinearRegressor(degrees: Union[int, None] = None) -> ModelInfo:
        model_wrapper = LinearRegressor()
        description = type(model_wrapper).__name__
        transformations = [
            ImputationTransformer(),
            CenterScaleTransformer(),
            RemoveNZVTransformer(),
            RemoveCorrelationsTransformer()
        ]

        if degrees is not None:
        # assumes centering/scaling of the data should be done before the polynomial transformation
            description = '{0}_{1}_{2}'.format(description, 'polynomial',
                                               str(degrees))
            transformations.append(
                PolynomialFeaturesTransformer(degrees=degrees))

        # PolynomialFeaturesTransformer, if used, must be performed before adding dummy variables
        transformations.append(
            DummyEncodeTransformer(CategoricalEncoding.DUMMY))

        return ModelInfo(description=description,
                         model=model_wrapper,
                         transformations=transformations,
                         hyper_params=None,
                         hyper_params_grid=None)
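A minimal usage sketch of the factory above (assuming it is in scope and that ModelInfo exposes its constructor arguments as attributes, neither of which the snippet confirms): passing degrees=2 suffixes the description and appends the polynomial step ahead of dummy encoding.

    info = get_LinearRegressor(degrees=2)
    # 'description' as a readable attribute is an assumption about ModelInfo
    assert info.description == 'LinearRegressor_polynomial_2'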
Example #2
    def get_SoftmaxLogisticClassifier(
            degrees: Union[int, None] = None) -> ModelInfo:

        model_wrapper = SoftmaxLogisticClassifier()
        description = type(model_wrapper).__name__
        # TODO: fill out rest of recommended transformations, verify order
        transformations = [
            ImputationTransformer(),
            CenterScaleTransformer(),
            RemoveNZVTransformer(),
            RemoveCorrelationsTransformer()
        ]

        if degrees is not None:
            # assumes centering/scaling of the data should be done before the polynomial transformation
            description = '{0}_{1}_{2}'.format(description, 'polynomial',
                                               str(degrees))
            transformations.append(
                PolynomialFeaturesTransformer(degrees=degrees))

        # PolynomialFeaturesTransformer, if used, must be performed before adding dummy variables
        transformations.append(
            DummyEncodeTransformer(CategoricalEncoding.DUMMY))

        return ModelInfo(description=description,
                         model=model_wrapper,
                         transformations=transformations,
                         hyper_params=SoftmaxLogisticHP(),
                         hyper_params_grid=HyperParamsGrid(params_dict=dict(
                             C=[0.001, 0.01, 0.1, 1, 100, 1000])))
Example #3
    def get_GradientBoostingClassifier(number_of_features) -> ModelInfo:
        model_wrapper = GradientBoostingClassifier()
        return ModelInfo(
            description=type(model_wrapper).__name__,
            model=model_wrapper,
            # TODO: fill out rest of recommended transformations, verify order
            transformations=None,
            hyper_params=GradientBoostingClassifierHP(),
            # https://machinelearningmastery.com/configure-gradient-boosting-algorithm/
            # https://www.analyticsvidhya.com/blog/2016/02/complete-guide-parameter-tuning-gradient-boosting-gbm-python/

            # Consider fixing the learning rate at the default of 0.1 and
            #   searching for the optimal number of trees at that rate, e.g. a
            #   grid of values from 20 to 80 in steps of 10.
            # Decide the order of the tuning variables carefully: tune the
            #   variables with the greatest impact on the outcome first. For
            #   instance, max_depth and min_samples_split have a significant
            #   impact, so we tune those first.
            hyper_params_grid=HyperParamsGrid(params_dict=dict(
                n_estimators=[50, 200, 350],
                max_depth=[2, 5, 8],
                min_samples_leaf=[2, 10, 50],
                max_features=[
                    # other candidates worth considering:
                    #   int(round(number_of_features ** (1 / 2.0))),
                    #   int(round(number_of_features / 2)),
                    #   number_of_features - 1,
                    int(round(number_of_features / 3 * 2))  # 2/3 of the features
                ],
                subsample=[0.3, 0.5, 0.8])))
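The tuning notes above can be made concrete. Below is a minimal sketch of that staged approach using scikit-learn's own GradientBoostingClassifier and GridSearchCV rather than the wrapper classes in these snippets (an assumption about tooling, not the snippet's API): fix the learning rate at 0.1, search the number of trees, then tune the high-impact tree parameters.

    from sklearn.datasets import make_classification
    from sklearn.ensemble import GradientBoostingClassifier
    from sklearn.model_selection import GridSearchCV

    X, y = make_classification(n_samples=500, n_features=10, random_state=0)

    # stage 1: learning_rate fixed at 0.1; search tree counts 20..80 in steps of 10
    stage_one = GridSearchCV(
        GradientBoostingClassifier(learning_rate=0.1, random_state=0),
        param_grid={'n_estimators': list(range(20, 90, 10))},
        cv=3).fit(X, y)

    # stage 2: keep the best tree count; tune the higher-impact parameters next
    stage_two = GridSearchCV(
        GradientBoostingClassifier(learning_rate=0.1, random_state=0,
                                   **stage_one.best_params_),
        param_grid={'max_depth': [2, 5, 8], 'min_samples_split': [2, 10, 50]},
        cv=3).fit(X, y)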
Example #4
    def _ridge_lasso_elastic_helper(model_wrapper, hyper_params, degrees,
                                    params_dict):
        description = type(model_wrapper).__name__
        transformations = [
            ImputationTransformer(),
            CenterScaleTransformer(),
            RemoveCorrelationsTransformer(),
            RemoveNZVTransformer()
        ]

        if degrees is not None:
            # assumes centering/scaling of the data should be done before the polynomial transformation
            description = '{0}_{1}_{2}'.format(description, 'polynomial',
                                               str(degrees))
            transformations.append(
                PolynomialFeaturesTransformer(degrees=degrees))

        # PolynomialFeaturesTransformer, if used, must be performed before adding dummy variables
        transformations.append(
            DummyEncodeTransformer(CategoricalEncoding.DUMMY))

        return ModelInfo(
            description=description,
            model=model_wrapper,
            transformations=transformations,
            hyper_params=hyper_params,
            hyper_params_grid=HyperParamsGrid(params_dict=params_dict))
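A hypothetical caller of the helper above, to show the intended shape; RidgeRegressor, RidgeRegressorHP, and the alpha grid are illustrative assumptions, not part of the snippet.

    def get_RidgeRegressor(degrees: Union[int, None] = None) -> ModelInfo:
        # hypothetical wrapper/HP names; only the helper's signature is known
        return _ridge_lasso_elastic_helper(
            model_wrapper=RidgeRegressor(),
            hyper_params=RidgeRegressorHP(),
            degrees=degrees,
            params_dict=dict(alpha=[0.001, 0.01, 0.1, 1]))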
Example #5
    def get_DummyClassifier(strategy: DummyClassifierStrategy) -> ModelInfo:
        model_wrapper = DummyClassifier(strategy=strategy)
        description = '{0}_{1}'.format(type(model_wrapper).__name__,
                                       strategy.value)
        return ModelInfo(description=description,
                         model=model_wrapper,
                         transformations=None,
                         hyper_params=None,
                         hyper_params_grid=None)
Example #6
    def get_AdaBoostClassifier() -> ModelInfo:
        model_wrapper = AdaBoostClassifier()
        return ModelInfo(
            description=type(model_wrapper).__name__,
            model=model_wrapper,
            # TODO: fill out rest of recommended transformations, verify order
            transformations=None,
            hyper_params=AdaBoostClassifierHP(),
            # max_depth presumably applies to the underlying base decision
            # tree, not to the AdaBoost ensemble itself
            hyper_params_grid=HyperParamsGrid(
                params_dict=dict(max_depth=[3, 10, 30],
                                 n_estimators=[100, 500, 1000],
                                 learning_rate=[0.1, 0.5, 1])))
Example #7
    def get_SvmLinearRegressor() -> ModelInfo:
        model_wrapper = SvmLinearRegressor()
        return ModelInfo(
            description=type(model_wrapper).__name__,
            model=model_wrapper,
            # TODO: fill out rest of recommended transformations, verify order
            transformations=[
                ImputationTransformer(),
                CenterScaleTransformer(),
                DummyEncodeTransformer(CategoricalEncoding.ONE_HOT)
            ],
            hyper_params=SvmLinearRegressorHP(),
            hyper_params_grid=HyperParamsGrid(params_dict=dict(
                epsilon=[0, 0.1, 1, 3], penalty_c=[0.001, 0.01, 0.1, 1000])))
Example #8
    def get_XGBoostRegressor_linear() -> ModelInfo:
        model_wrapper = XGBoostRegressor()
        return ModelInfo(
            description=type(model_wrapper).__name__ + '_linear',
            model=model_wrapper,
            transformations=[
                ImputationTransformer(),
                DummyEncodeTransformer(CategoricalEncoding.ONE_HOT)
            ],
            hyper_params=XGBoostLinearHP(objective=XGBObjective.REG_LINEAR),
            # https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/
            hyper_params_grid=HyperParamsGrid(
                params_dict=dict(n_estimators=[50, 200, 350],
                                 reg_alpha=[0, 0.001, 0.01, 0.05],
                                 reg_lambda=[0.1, 0.5, 1, 2])))
Example #9
0
    def get_XGBoostClassifier_tree(objective: XGBObjective) -> ModelInfo:
        model_wrapper = XGBoostClassifier()
        return ModelInfo(
            description=type(model_wrapper).__name__,
            model=model_wrapper,
            # TODO: fill out rest of recommended transformations, verify order
            transformations=[
                ImputationTransformer(),
                DummyEncodeTransformer(CategoricalEncoding.ONE_HOT)
            ],
            hyper_params=XGBoostTreeHP(objective=objective),
            # https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/
            # https://machinelearningmastery.com/configure-gradient-boosting-algorithm/
            hyper_params_grid=HyperParamsGrid(
                params_dict=dict(colsample_bytree=[0.4, 0.7, 1.0],
                                 subsample=[0.5, 0.75, 1.0],
                                 max_depth=[3, 6, 9])))
Example #10
    def get_SvmPolynomialClassifier() -> ModelInfo:
        model_wrapper = SvmPolynomialClassifier()
        return ModelInfo(
            description=type(model_wrapper).__name__,
            model=model_wrapper,
            # TODO: fill out rest of recommended transformations, verify order
            transformations=[
                ImputationTransformer(),
                CenterScaleTransformer(),
                DummyEncodeTransformer(CategoricalEncoding.ONE_HOT)
            ],
            hyper_params=SvmPolynomialClassifierHP(),
            hyper_params_grid=HyperParamsGrid(params_dict=dict(
                degree=[2, 3],
                coef0=[0, 1, 10],
                # a smaller C value leads to a wider street but more margin
                # violations (HOML pg 148)
                penalty_c=[0.001, 0.1, 100, 1000])))
Example #11
    def get_RandomForestRegressor(number_of_features: int) -> ModelInfo:
        model_wrapper = RandomForestRegressor()
        return ModelInfo(
            description=type(model_wrapper).__name__,
            model=model_wrapper,
            # TODO: fill out rest of recommended transformations, verify order
            #  https://stackoverflow.com/questions/24715230/can-sklearn-random-forest-directly-handle-categorical-features?utm_medium=organic&utm_source=google_rich_qa&utm_campaign=google_rich_qa
            transformations=[
                DummyEncodeTransformer(CategoricalEncoding.ONE_HOT)
            ],
            # NOTE: 'gini' is a classification criterion; a regression
            # criterion (e.g. MSE) is presumably intended for this regressor
            hyper_params=RandomForestHP(criterion='gini'),
            hyper_params_grid=HyperParamsGrid(
                params_dict=dict(max_features=[
                    int(round(number_of_features ** (1 / 2.0))),
                    int(round(number_of_features / 2)),
                    number_of_features
                ],
                                 n_estimators=[100, 500, 1000],
                                 min_samples_leaf=[1, 50, 100])))
Example #12
    def get_SvmLinearClassifier() -> ModelInfo:
        model_wrapper = SvmLinearClassifier()
        return ModelInfo(
            description=type(model_wrapper).__name__,
            model=model_wrapper,
            # TODO: fill out rest of recommended transformations, verify order
            transformations=[
                ImputationTransformer(),
                CenterScaleTransformer(),
                DummyEncodeTransformer(CategoricalEncoding.ONE_HOT)
            ],
            hyper_params=SvmLinearClassifierHP(),
            hyper_params_grid=HyperParamsGrid(params_dict=dict(
                # the 'l2' penalty is the standard used in SVC; 'l1' leads
                # to coef_ vectors that are sparse
                penalty=['l2'],
                # a smaller C value leads to a wider street but more margin
                # violations (HOML pg 148)
                penalty_c=[0.001, 0.01, 0.1, 1, 100, 1000])))
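A small sketch of the C trade-off noted in the comments, using scikit-learn's LinearSVC directly (an assumption; SvmLinearClassifier may wrap something else): a smaller C strengthens regularization, shrinking the coefficients and widening the margin at the cost of more violations.

    import numpy as np
    from sklearn.datasets import make_classification
    from sklearn.svm import LinearSVC

    X, y = make_classification(n_samples=200, random_state=0)
    wide = LinearSVC(C=0.001, dual=False).fit(X, y)     # wide street, more violations
    narrow = LinearSVC(C=1000.0, dual=False).fit(X, y)  # narrow street, fewer violations
    assert np.linalg.norm(wide.coef_) < np.linalg.norm(narrow.coef_)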
Example #13
    def get_CartDecisionTreeClassifier(number_of_features) -> ModelInfo:
        model_wrapper = CartDecisionTreeClassifier()
        return ModelInfo(
            description=type(model_wrapper).__name__,
            model=model_wrapper,
            # TODO: fill out rest of recommended transformations, verify order
            transformations=None,
            hyper_params=CartDecisionTreeHP(criterion='gini'),
            # max_depth: the maximum number of levels in the tree
            # min_samples_leaf: the minimum number of samples allowed in a leaf
            # min_samples_split: the minimum number of samples required to split an internal node
            # max_features: the number of features to consider when looking for the best split
            hyper_params_grid=HyperParamsGrid(
                params_dict=dict(max_depth=[3, 10, 30],
                                 min_samples_leaf=[1, 50, 100],
                                 max_features=[
                                     int(round(number_of_features ** (1 / 2.0))),
                                     int(round(number_of_features / 2)),
                                     number_of_features
                                 ])))
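A worked example of the max_features heuristics shared by the last two grids: for a dataset with 16 features, the candidates are the square root, half, and all of the features.

    number_of_features = 16
    candidates = [int(round(number_of_features ** (1 / 2.0))),  # sqrt -> 4
                  int(round(number_of_features / 2)),           # half -> 8
                  number_of_features]                           # all  -> 16
    assert candidates == [4, 8, 16]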