def get_LinearRegressor(degrees: Union[int, None] = None) -> ModelInfo:
    model_wrapper = LinearRegressor()
    description = type(model_wrapper).__name__
    transformations = [ImputationTransformer(),
                       CenterScaleTransformer(),
                       RemoveNZVTransformer(),
                       RemoveCorrelationsTransformer()]
    if degrees is not None:
        # assumes centering/scaling should happen before the polynomial transformation
        description = '{0}_{1}_{2}'.format(description, 'polynomial', str(degrees))
        transformations.append(PolynomialFeaturesTransformer(degrees=degrees))
    # PolynomialFeaturesTransformer, if used, must come before adding dummy variables
    transformations.append(DummyEncodeTransformer(CategoricalEncoding.DUMMY))

    return ModelInfo(description=description,
                     model=model_wrapper,
                     transformations=transformations,
                     hyper_params=None,
                     hyper_params_grid=None)
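# Hedged usage sketch (not part of this module's API): how a ModelInfo from one of
# these factories might be assembled into a runnable pipeline. `train_x`/`train_y`
# are hypothetical placeholders, and the `fit_transform`/`train` calls assume the
# transformer/model-wrapper interfaces used elsewhere in this codebase.
#
#   model_info = get_LinearRegressor(degrees=2)
#   assert model_info.description == 'LinearRegressor_polynomial_2'
#   for transformation in model_info.transformations:
#       train_x = transformation.fit_transform(train_x)
#   model_info.model.train(data_x=train_x, data_y=train_y)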
def get_SoftmaxLogisticClassifier(degrees: Union[int, None] = None) -> ModelInfo:
    model_wrapper = SoftmaxLogisticClassifier()
    description = type(model_wrapper).__name__
    # TODO: fill out rest of recommended transformations, verify order
    transformations = [ImputationTransformer(),
                       CenterScaleTransformer(),
                       RemoveNZVTransformer(),
                       RemoveCorrelationsTransformer()]
    if degrees is not None:
        # assumes centering/scaling should happen before the polynomial transformation
        description = '{0}_{1}_{2}'.format(description, 'polynomial', str(degrees))
        transformations.append(PolynomialFeaturesTransformer(degrees=degrees))
    # PolynomialFeaturesTransformer, if used, must come before adding dummy variables
    transformations.append(DummyEncodeTransformer(CategoricalEncoding.DUMMY))

    return ModelInfo(description=description,
                     model=model_wrapper,
                     transformations=transformations,
                     hyper_params=SoftmaxLogisticHP(),
                     hyper_params_grid=HyperParamsGrid(
                         params_dict=dict(C=[0.001, 0.01, 0.1, 1, 100, 1000])))
def get_GradientBoostingClassifier(number_of_features) -> ModelInfo:
    model_wrapper = GradientBoostingClassifier()
    return ModelInfo(
        description=type(model_wrapper).__name__,
        model=model_wrapper,
        # TODO: fill out rest of recommended transformations, verify order
        transformations=None,
        hyper_params=GradientBoostingClassifierHP(),
        # https://machinelearningmastery.com/configure-gradient-boosting-algorithm/
        # https://www.analyticsvidhya.com/blog/2016/02/complete-guide-parameter-tuning-gradient-boosting-gbm-python/
        # Consider keeping the default learning rate of 0.1 and finding the optimal number
        # of trees for it; for that purpose, grid-search values from 20 to 80 in steps of 10.
        # The order in which variables are tuned should be decided carefully: tune the
        # variables with the highest impact on the outcome first. For instance, max_depth
        # and min_samples_split have a significant impact, so they are tuned first.
        hyper_params_grid=HyperParamsGrid(
            params_dict=dict(n_estimators=[50, 200, 350],
                             max_depth=[2, 5, 8],
                             min_samples_leaf=[2, 10, 50],
                             max_features=[
                                 # int(round(number_of_features**(1 / 2.0))),
                                 # int(round(number_of_features / 2)),
                                 # number_of_features - 1,
                                 int(round(number_of_features / 3 * 2))],  # 2/3
                             subsample=[0.3, 0.5, 0.8])))
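# A sketch of the staged tuning described in the comments above (hypothetical grids,
# not used by this module): keep learning_rate at its default of 0.1 and first search
# only the tree count (20 to 80 in steps of 10), then tune the higher-impact
# structural parameters.
#
#   stage_one_grid = HyperParamsGrid(params_dict=dict(n_estimators=list(range(20, 90, 10))))
#   stage_two_grid = HyperParamsGrid(params_dict=dict(max_depth=[2, 5, 8],
#                                                     min_samples_leaf=[2, 10, 50]))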
def _ridge_lasso_elastic_helper(model_wrapper, hyper_params, degrees, params_dict):
    description = type(model_wrapper).__name__
    transformations = [ImputationTransformer(),
                       CenterScaleTransformer(),
                       RemoveCorrelationsTransformer(),
                       RemoveNZVTransformer()]
    if degrees is not None:
        # assumes centering/scaling should happen before the polynomial transformation
        description = '{0}_{1}_{2}'.format(description, 'polynomial', str(degrees))
        transformations.append(PolynomialFeaturesTransformer(degrees=degrees))
    # PolynomialFeaturesTransformer, if used, must come before adding dummy variables
    transformations.append(DummyEncodeTransformer(CategoricalEncoding.DUMMY))

    return ModelInfo(description=description,
                     model=model_wrapper,
                     transformations=transformations,
                     hyper_params=hyper_params,
                     hyper_params_grid=HyperParamsGrid(params_dict=params_dict))
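# Illustrative call of the helper above; `RidgeRegressor`, `RidgeRegressorHP`, and the
# alpha grid are hypothetical stand-ins for whichever ridge/lasso/elastic-net wrappers
# this module wires up elsewhere.
#
#   def get_RidgeRegressor(degrees: Union[int, None] = None) -> ModelInfo:
#       return _ridge_lasso_elastic_helper(model_wrapper=RidgeRegressor(),
#                                          hyper_params=RidgeRegressorHP(),
#                                          degrees=degrees,
#                                          params_dict=dict(alpha=[0.001, 0.01, 0.1, 1]))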
def get_DummyClassifier(strategy: DummyClassifierStrategy) -> ModelInfo:
    model_wrapper = DummyClassifier(strategy=strategy)
    return ModelInfo(description='{0}_{1}'.format(type(model_wrapper).__name__,
                                                  strategy.value),
                     model=model_wrapper,
                     transformations=None,
                     hyper_params=None,
                     hyper_params_grid=None)
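# For example, a strategy whose `.value` is 'most_frequent' (assuming the enum mirrors
# scikit-learn's strategy names) yields the description 'DummyClassifier_most_frequent'.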
def get_AdaBoostClassifier() -> ModelInfo:
    model_wrapper = AdaBoostClassifier()
    return ModelInfo(description=type(model_wrapper).__name__,
                     model=model_wrapper,
                     # TODO: fill out rest of recommended transformations, verify order
                     transformations=None,
                     hyper_params=AdaBoostClassifierHP(),
                     hyper_params_grid=HyperParamsGrid(
                         params_dict=dict(max_depth=[3, 10, 30],
                                          n_estimators=[100, 500, 1000],
                                          learning_rate=[0.1, 0.5, 1])))
def get_SvmLinearRegressor() -> ModelInfo:
    model_wrapper = SvmLinearRegressor()
    return ModelInfo(description=type(model_wrapper).__name__,
                     model=model_wrapper,
                     # TODO: fill out rest of recommended transformations, verify order
                     transformations=[ImputationTransformer(),
                                      CenterScaleTransformer(),
                                      DummyEncodeTransformer(CategoricalEncoding.ONE_HOT)],
                     hyper_params=SvmLinearRegressorHP(),
                     hyper_params_grid=HyperParamsGrid(
                         params_dict=dict(epsilon=[0, 0.1, 1, 3],
                                          penalty_c=[0.001, 0.01, 0.1, 1000])))
def get_XGBoostRegressor_linear() -> ModelInfo:
    model_wrapper = XGBoostRegressor()
    return ModelInfo(description=type(model_wrapper).__name__ + '_linear',
                     model=model_wrapper,
                     transformations=[ImputationTransformer(),
                                      DummyEncodeTransformer(CategoricalEncoding.ONE_HOT)],
                     hyper_params=XGBoostLinearHP(objective=XGBObjective.REG_LINEAR),
                     # https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/
                     hyper_params_grid=HyperParamsGrid(
                         params_dict=dict(n_estimators=[50, 200, 350],
                                          reg_alpha=[0, 0.001, 0.01, 0.05],
                                          reg_lambda=[0.1, 0.5, 1, 2])))
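# Note: for XGBoost's linear booster, reg_alpha is the L1 regularization term and
# reg_lambda the L2 regularization term on the weights, so this grid trades sparsity
# against shrinkage.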
def get_XGBoostClassifier_tree(objective: XGBObjective) -> ModelInfo:
    model_wrapper = XGBoostClassifier()
    return ModelInfo(description=type(model_wrapper).__name__,
                     model=model_wrapper,
                     # TODO: fill out rest of recommended transformations, verify order
                     transformations=[ImputationTransformer(),
                                      DummyEncodeTransformer(CategoricalEncoding.ONE_HOT)],
                     hyper_params=XGBoostTreeHP(objective=objective),
                     # https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/
                     # https://machinelearningmastery.com/configure-gradient-boosting-algorithm/
                     hyper_params_grid=HyperParamsGrid(
                         params_dict=dict(colsample_bytree=[0.4, 0.7, 1.0],
                                          subsample=[0.5, 0.75, 1.0],
                                          max_depth=[3, 6, 9])))
def get_SvmPolynomialClassifier() -> ModelInfo:
    model_wrapper = SvmPolynomialClassifier()
    return ModelInfo(description=type(model_wrapper).__name__,
                     model=model_wrapper,
                     # TODO: fill out rest of recommended transformations, verify order
                     transformations=[ImputationTransformer(),
                                      CenterScaleTransformer(),
                                      DummyEncodeTransformer(CategoricalEncoding.ONE_HOT)],
                     hyper_params=SvmPolynomialClassifierHP(),
                     hyper_params_grid=HyperParamsGrid(
                         params_dict=dict(degree=[2, 3],
                                          coef0=[0, 1, 10],
                                          # a smaller C value leads to a wider street but
                                          # more margin violations (HOML pg 148)
                                          penalty_c=[0.001, 0.1, 100, 1000])))
def get_RandomForestRegressor(number_of_features: int) -> ModelInfo:
    model_wrapper = RandomForestRegressor()
    return ModelInfo(description=type(model_wrapper).__name__,
                     model=model_wrapper,
                     # TODO: fill out rest of recommended transformations, verify order
                     # https://stackoverflow.com/questions/24715230/can-sklearn-random-forest-directly-handle-categorical-features
                     transformations=[DummyEncodeTransformer(CategoricalEncoding.ONE_HOT)],
                     # 'mse' rather than 'gini': gini is a classification criterion and
                     # does not apply to regression
                     hyper_params=RandomForestHP(criterion='mse'),
                     hyper_params_grid=HyperParamsGrid(
                         params_dict=dict(max_features=[int(round(number_of_features**(1 / 2.0))),
                                                        int(round(number_of_features / 2)),
                                                        number_of_features],
                                          n_estimators=[100, 500, 1000],
                                          min_samples_leaf=[1, 50, 100])))
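# Worked example of the max_features candidates above: with number_of_features == 16,
# the grid evaluates to sqrt -> 4, half -> 8, and all -> 16 features per split.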
def get_SvmLinearClassifier() -> ModelInfo:
    model_wrapper = SvmLinearClassifier()
    return ModelInfo(description=type(model_wrapper).__name__,
                     model=model_wrapper,
                     # TODO: fill out rest of recommended transformations, verify order
                     transformations=[ImputationTransformer(),
                                      CenterScaleTransformer(),
                                      DummyEncodeTransformer(CategoricalEncoding.ONE_HOT)],
                     hyper_params=SvmLinearClassifierHP(),
                     hyper_params_grid=HyperParamsGrid(
                         params_dict=dict(
                             # the 'l2' penalty is the standard used in SVC; 'l1' leads
                             # to coef_ vectors that are sparse
                             penalty=['l2'],
                             # a smaller C value leads to a wider street but more margin
                             # violations (HOML pg 148)
                             penalty_c=[0.001, 0.01, 0.1, 1, 100, 1000])))
def get_CartDecisionTreeClassifier(number_of_features) -> ModelInfo:
    model_wrapper = CartDecisionTreeClassifier()
    return ModelInfo(description=type(model_wrapper).__name__,
                     model=model_wrapper,
                     # TODO: fill out rest of recommended transformations, verify order
                     transformations=None,
                     hyper_params=CartDecisionTreeHP(criterion='gini'),
                     # max_depth: the maximum number of levels in the tree
                     # min_samples_leaf: the minimum number of samples allowed in a leaf
                     # min_samples_split: the minimum number of samples required to split an internal node
                     # max_features: the number of features to consider when looking for the best split
                     hyper_params_grid=HyperParamsGrid(
                         params_dict=dict(max_depth=[3, 10, 30],
                                          min_samples_leaf=[1, 50, 100],
                                          max_features=[int(round(number_of_features**(1 / 2.0))),
                                                        int(round(number_of_features / 2)),
                                                        number_of_features])))
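# Note: the grid above enumerates 3 * 3 * 3 = 27 hyper-parameter combinations, each of
# which will be fit and evaluated during tuning.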