def numeric_pipeline_complex(impute_strategy=None, seq_no=0):
    """Build the searchable preprocessing pipeline for numeric columns.

    The pipeline is an imputer followed by an optional scaler, where the
    scaler is a choice between standard, min-max, max-abs and robust scaling.

    Args:
        impute_strategy: Imputation strategy handed to ``SimpleImputer``.
            ``None`` selects a ``Choice`` over ``'mean'``, ``'median'``,
            ``'constant'`` and ``'most_frequent'``; a list is wrapped in a
            ``Choice``; any other value is passed through unchanged.
        seq_no: Sequence number appended to every module name so several
            instances of this pipeline can live in the same space.

    Returns:
        The ``Pipeline`` module, restricted to the columns selected by
        ``column_number_exclude_timedelta``.
    """
    # Fall back to the full candidate list, then let the list branch wrap it.
    if impute_strategy is None:
        impute_strategy = ['mean', 'median', 'constant', 'most_frequent']
    if isinstance(impute_strategy, list):
        impute_strategy = Choice(impute_strategy)

    # NOTE: an optional skewness/kurtosis reduction step is currently disabled:
    # reduce_skewness_kurtosis = SkewnessKurtosisTransformer(transform_fn=Choice([np.log, np.log10, np.log1p]))
    # reduce_skewness_kurtosis_optional = Optional(reduce_skewness_kurtosis, keep_link=True,
    #                                              name=f'numeric_reduce_skewness_kurtosis_optional_{seq_no}')

    numeric_imputer = SimpleImputer(
        missing_values=np.nan,
        strategy=impute_strategy,
        name=f'numeric_imputer_{seq_no}',
    )

    # Candidate scalers, created in the same order as they are offered.
    standard = StandardScaler(name=f'numeric_standard_scaler_{seq_no}')
    minmax = MinMaxScaler(name=f'numeric_minmax_scaler_{seq_no}')
    maxabs = MaxAbsScaler(name=f'numeric_maxabs_scaler_{seq_no}')
    robust = RobustScaler(name=f'numeric_robust_scaler_{seq_no}')
    any_scaler = ModuleChoice(
        [standard, minmax, maxabs, robust],
        name=f'numeric_or_scaler_{seq_no}',
    )

    # Scaling as a whole is optional.
    maybe_scaler = Optional(
        any_scaler,
        keep_link=True,
        name=f'numeric_scaler_optional_{seq_no}',
    )

    return Pipeline(
        [numeric_imputer, maybe_scaler],
        name=f'numeric_pipeline_complex_{seq_no}',
        columns=column_number_exclude_timedelta,
    )
def categorical_pipeline_complex(impute_strategy=None, svd_components=3, seq_no=0):
    """Build the searchable preprocessing pipeline for categorical columns.

    The pipeline is an imputer followed by a choice between a multi-label
    encoder and a one-hot encoder that feeds an optional ``TruncatedSVD``.

    Args:
        impute_strategy: Imputation strategy handed to ``SimpleImputer``.
            ``None`` selects a ``Choice`` over ``'constant'`` and
            ``'most_frequent'``; a list is wrapped in a ``Choice``; any other
            value is passed through unchanged.
        svd_components: ``n_components`` for ``TruncatedSVD``. A list is
            wrapped in a ``Choice``.
        seq_no: Sequence number appended to every module name.

    Returns:
        The ``Pipeline`` module, restricted to the columns selected by
        ``column_object_category_bool``.
    """
    # Fall back to the default candidates, then let the list branch wrap them.
    if impute_strategy is None:
        impute_strategy = ['constant', 'most_frequent']
    if isinstance(impute_strategy, list):
        impute_strategy = Choice(impute_strategy)

    if isinstance(svd_components, list):
        svd_components = Choice(svd_components)

    cat_imputer = SimpleImputer(
        missing_values=np.nan,
        strategy=impute_strategy,
        name=f'categorical_imputer_{seq_no}',
    )
    label_branch = MultiLabelEncoder(name=f'categorical_label_encoder_{seq_no}')

    # One-hot branch: encoder first, then an optional SVD wired onto it.
    onehot_encoder = OneHotEncoder(name=f'categorical_onehot_{seq_no}')
    svd = TruncatedSVD(n_components=svd_components, name=f'categorical_svd_{seq_no}')
    maybe_svd = Optional(
        svd,
        name=f'categorical_optional_svd_{seq_no}',
        keep_link=True,
    )
    onehot_branch = maybe_svd(onehot_encoder)

    encoder_choice = ModuleChoice(
        [label_branch, onehot_branch],
        name=f'categorical_le_or_onehot_pca_{seq_no}',
    )

    return Pipeline(
        [cat_imputer, encoder_choice],
        name=f'categorical_pipeline_complex_{seq_no}',
        columns=column_object_category_bool,
    )
def conv_cell(hp_dict, type, cell_no, node_no, left_or_right, inputs, filters, is_reduction=False, data_format=None):
    """Build one branch of a cell node: pick an input, then pick an operation.

    Hyperparameters are cached in ``hp_dict``. The cache keys are built from
    ``type[2:]``, ``node_no`` and ``left_or_right`` but not from ``cell_no``,
    so calls that share ``hp_dict`` and those three values reuse the same
    input-choice and op-choice hyperparameters.

    Args:
        hp_dict: Mutable mapping used as the hyperparameter cache. Missing
            entries are created and stored by this function.
        type: Cell type tag. The full value is used in module name prefixes;
            the value with its first two characters dropped is used in the
            cache keys.
        cell_no: Cell index, used only in module name prefixes.
        node_no: Node index inside the cell.
        left_or_right: Tag identifying which branch of the node this is.
        inputs: List of ``ModuleSpace`` candidates to choose the input from.
        filters: Filter count forwarded to the candidate operations.
        is_reduction: Currently unused; strides are fixed to ``(1, 1)``
            (the stride logic that read it is disabled).
        data_format: Forwarded to the candidate operations.

    Returns:
        The ``ModuleChoice`` over the candidate operations, connected to the
        selected input.

    Raises:
        AssertionError: If ``inputs`` is not a list of ``ModuleSpace``.
    """
    assert isinstance(inputs, list)
    assert all(isinstance(m, ModuleSpace) for m in inputs)

    name_prefix = f'{type}_C{cell_no}_N{node_no}_{left_or_right}_'
    input_choice_key = f'{type[2:]}_N{node_no}_{left_or_right}_input_choice'
    op_choice_key = f'{type[2:]}_N{node_no}_{left_or_right}_op_choice'

    # Reuse the cached input choice or create and cache a new one. After this
    # block hp_choice is always bound, so no second None check is needed once
    # the InputChoice has been built.
    hp_choice = hp_dict.get(input_choice_key)
    if hp_choice is None:
        hp_choice = MultipleChoice(list(range(len(inputs))), 1, name=input_choice_key)
        hp_dict[input_choice_key] = hp_choice
    ic1 = InputChoice(inputs, 1, hp_choice=hp_choice)(inputs)

    # Disabled: stride 2 for reduction cells on the first two inputs.
    # hp_strides = Dynamic(lambda_fn=lambda choice: (2, 2) if is_reduction and choice[0] <= 1 else (1, 1),
    #                      choice=ic1.hp_choice)
    hp_strides = (1, 1)

    hp_op_choice = hp_dict.get(op_choice_key)
    module_candidates = [
        sepconv5x5(name_prefix, filters, strides=hp_strides, data_format=data_format),
        sepconv3x3(name_prefix, filters, strides=hp_strides, data_format=data_format),
        avgpooling3x3(name_prefix, filters, strides=hp_strides, data_format=data_format),
        maxpooling3x3(name_prefix, filters, strides=hp_strides, data_format=data_format),
        identity(name_prefix),
    ]
    if hp_op_choice is None:
        hp_op_choice = Choice(list(range(len(module_candidates))), name=op_choice_key)
        hp_dict[op_choice_key] = hp_op_choice

    op_choice = ModuleChoice(module_candidates, hp_or=hp_op_choice)(ic1)
    return op_choice
def get_space_num_cat_pipeline_complex(dataframe_mapper_default=False,
                                       lightgbm_fit_kwargs=None,
                                       xgb_fit_kwargs=None,
                                       catboost_fit_kwargs=None):
    """Build a search space: numeric + categorical pipelines feeding Dask estimators.

    The input goes through ``numeric_pipeline_complex`` and
    ``categorical_pipeline_complex``; their outputs are merged by a
    ``DataFrameMapper`` and passed to a choice between a LightGBM and an
    XGBoost Dask estimator, both configured with ``task='binary'``.

    Args:
        dataframe_mapper_default: Forwarded as ``default`` to ``DataFrameMapper``.
        lightgbm_fit_kwargs: ``fit_kwargs`` for the LightGBM estimator.
            ``None`` (the default) means a fresh empty dict per call.
        xgb_fit_kwargs: ``fit_kwargs`` for the XGBoost estimator.
            ``None`` (the default) means a fresh empty dict per call.
        catboost_fit_kwargs: Accepted for interface compatibility; currently
            unused because the CatBoost estimator is disabled.

    Returns:
        The populated ``HyperSpace``.
    """
    # Fresh dicts per call: a `{}` default would be shared by every call and
    # any mutation by an estimator would leak into later search spaces.
    if lightgbm_fit_kwargs is None:
        lightgbm_fit_kwargs = {}
    if xgb_fit_kwargs is None:
        xgb_fit_kwargs = {}

    space = HyperSpace()
    with space.as_default():
        hyper_input = HyperInput(name='input1')
        p1 = numeric_pipeline_complex()(hyper_input)
        p2 = categorical_pipeline_complex()(hyper_input)
        # p2 = categorical_pipeline_simple()(hyper_input)
        p3 = DataFrameMapper(default=dataframe_mapper_default, input_df=True, df_out=True,
                             df_out_dtype_transforms=[(column_object, 'int')])([p1, p2])

        lightgbm_init_kwargs = {
            'boosting_type': Choice(['gbdt', 'dart', 'goss']),
            'num_leaves': Choice([11, 31, 101, 301, 501]),
            'learning_rate': Real(0.001, 0.1, step=0.005),
            'n_estimators': 100,
            'max_depth': -1,
            'tree_learner': 'data',  # add for dask
            # subsample_for_bin = 200000, objective = None, class_weight = None,
            # min_split_gain = 0., min_child_weight = 1e-3, min_child_samples = 20,
        }
        lightgbm_est = LightGBMDaskEstimator(task='binary', fit_kwargs=lightgbm_fit_kwargs,
                                             **lightgbm_init_kwargs)

        xgb_init_kwargs = {
            'tree_method': 'approx',  # add for dask
        }
        xgb_est = XGBoostDaskEstimator(task='binary', fit_kwargs=xgb_fit_kwargs, **xgb_init_kwargs)

        # CatBoost is disabled for now:
        # catboost_init_kwargs = {
        #     'silent': True
        # }
        # catboost_est = CatBoostEstimator(task='binary', fit_kwargs=catboost_fit_kwargs, **catboost_init_kwargs)
        # ModuleChoice([lightgbm_est, xgb_est, catboost_est], name='estimator_options')(p3)

        # Called for its wiring effect only; the returned node is not needed here.
        ModuleChoice([lightgbm_est, xgb_est], name='estimator_options')(p3)
        space.set_inputs(hyper_input)
    return space
def get_space_num_cat_pipeline_multi_complex(dataframe_mapper_default=False,
                                             lightgbm_fit_kwargs=None,
                                             xgb_fit_kwargs=None):
    """Build a search space with two stacked numeric/categorical stages.

    Stage one applies ``numeric_pipeline_complex`` and
    ``categorical_pipeline_complex`` to the input and merges them with a
    ``DataFrameMapper``. Stage two applies the same two pipelines (with
    ``seq_no=1``) to that merged output and merges again. The result feeds a
    choice between a LightGBM and an XGBoost estimator, both configured with
    ``task='binary'``.

    Args:
        dataframe_mapper_default: Forwarded as ``default`` to both
            ``DataFrameMapper`` modules.
        lightgbm_fit_kwargs: ``fit_kwargs`` for the LightGBM estimator.
            ``None`` (the default) means a fresh empty dict per call.
        xgb_fit_kwargs: ``fit_kwargs`` for the XGBoost estimator.
            ``None`` (the default) means a fresh empty dict per call.

    Returns:
        The populated ``HyperSpace``.
    """
    # Fresh dicts per call: a `{}` default would be shared by every call and
    # any mutation by an estimator would leak into later search spaces.
    if lightgbm_fit_kwargs is None:
        lightgbm_fit_kwargs = {}
    if xgb_fit_kwargs is None:
        xgb_fit_kwargs = {}

    space = HyperSpace()
    with space.as_default():
        hyper_input = HyperInput(name='input1')

        # Stage one: preprocess the raw input.
        p1 = numeric_pipeline_complex()(hyper_input)
        p2 = categorical_pipeline_complex()(hyper_input)
        p3 = DataFrameMapper(default=dataframe_mapper_default, input_df=True, df_out=True,
                             df_out_dtype_transforms=[(column_object, 'category')])([p1, p2])

        # Stage two: same pipelines again on the merged output; seq_no=1
        # gives the modules names distinct from stage one.
        p4 = numeric_pipeline_complex(seq_no=1)(p3)
        p5 = categorical_pipeline_complex(seq_no=1)(p3)
        p6 = DataFrameMapper(default=dataframe_mapper_default, input_df=True, df_out=True,
                             df_out_dtype_transforms=[(column_object, 'category')])([p4, p5])

        lightgbm_init_kwargs = {
            'boosting_type': Choice(['gbdt', 'dart', 'goss']),
            'num_leaves': Choice([11, 31, 101, 301, 501]),
            'learning_rate': Real(0.001, 0.1, step=0.005),
            'n_estimators': 100,
            'max_depth': -1,
            # subsample_for_bin = 200000, objective = None, class_weight = None,
            # min_split_gain = 0., min_child_weight = 1e-3, min_child_samples = 20,
        }
        lightgbm_est = LightGBMEstimator(task='binary', fit_kwargs=lightgbm_fit_kwargs,
                                         **lightgbm_init_kwargs)

        xgb_init_kwargs = {}
        xgb_est = XGBoostEstimator(task='binary', fit_kwargs=xgb_fit_kwargs, **xgb_init_kwargs)

        # Called for its wiring effect only; the returned node is not needed here.
        ModuleChoice([lightgbm_est, xgb_est])(p6)
        space.set_inputs(hyper_input)
    return space
def __call__(self, *args, **kwargs):
    """Build a ``HyperSpace`` offering a choice among the enabled estimators.

    Each of the ``dt``, ``dtr``, ``lr`` and ``nn`` estimator configs is
    included only when its ``enable_*`` flag is truthy. Every included config
    is a mapping with a ``"cls"`` entry; it is expanded into a ``ModuleSpace``
    named after that class. Positional and keyword arguments are ignored.

    Returns:
        The populated ``HyperSpace``.
    """
    # (enable-flag attribute, estimator-config attribute), in candidate order.
    # Attribute names are looked up lazily so a disabled estimator's config is
    # never read.
    toggles = (
        ('enable_dt', 'dt'),
        ('enable_dtr', 'dtr'),
        ('enable_lr', 'lr'),
        ('enable_nn', 'nn'),
    )

    space = HyperSpace()
    with space.as_default():
        hyper_input = HyperInput(name='input1')
        enabled = [getattr(self, cfg_attr)
                   for flag_attr, cfg_attr in toggles
                   if getattr(self, flag_attr)]
        candidates = [ModuleSpace(name=f'{cfg["cls"].__name__}', **cfg) for cfg in enabled]
        ModuleChoice(candidates)(hyper_input)
        space.set_inputs(hyper_input)
    return space
def conv_block(block_no, hp_pooling, hp_filters, hp_kernel_size, hp_bn_act, hp_use_bn, hp_activation, strides=(1, 1)):
    """Build one convolution block: repeated conv units followed by pooling.

    Args:
        block_no: Block index. Blocks with ``block_no < 2`` use a filter
            multiplier of 1 and the repeat candidates ``[2]``; later blocks
            use a multiplier of ``2 ** (block_no - 1)`` and the repeat
            candidates ``[3, 4, 5]``.
        hp_pooling: Hyperparameter selecting max or average pooling.
        hp_filters: Base filter-count hyperparameter, scaled by the multiplier.
        hp_kernel_size: Kernel size for every ``Conv2D``.
        hp_bn_act: Hyperparameter controlling the ordering of the optional
            batch normalization and the activation.
        hp_use_bn: Hyperparameter controlling whether batch normalization
            is used.
        hp_activation: Activation passed to ``Activation``.
        strides: Strides for every ``Conv2D``.

    Returns:
        A ``Sequential`` of the repeated conv units and the pooling choice.
    """
    is_early_block = block_no < 2
    repeat_num_choices = [2] if is_early_block else [3, 4, 5]
    multiplier = 1 if is_early_block else 2 ** (block_no - 1)

    # Filter count is derived from hp_filters once it has a value.
    conv_filters = Dynamic(lambda filters: filters * multiplier, filters=hp_filters)

    def conv_bn(step):
        # One unit: convolution, then activation and optional batch norm in
        # an order decided by hp_bn_act. `step` is accepted but not used.
        conv_layer = Conv2D(filters=conv_filters, kernel_size=hp_kernel_size, strides=strides, padding='same')
        activation = Activation(activation=hp_activation)
        maybe_bn = Optional(BatchNormalization(), keep_link=True, hp_opt=hp_use_bn)
        # `Permutation` tries different arrangements of maybe_bn and activation;
        # maybe_bn is skipped when hp_use_bn is False.
        bn_act = Permutation([maybe_bn, activation], hp_seq=hp_bn_act)
        return Sequential([conv_layer, bn_act])

    repeated_conv = Repeat(conv_bn, repeat_times=repeat_num_choices)
    pooling_choice = ModuleChoice(
        [MaxPooling2D(padding='same'), AveragePooling2D(padding='same')],
        hp_or=hp_pooling,
    )
    return Sequential([repeated_conv, pooling_choice])