def __call__(self, config, **kwargs):
    """Evaluate one configuration and return the negated validation score.

    ``config`` is interpreted according to ``self.name``: an ``'hpo'``
    evaluator treats it as the hyper-parameter configuration and reads the
    feature-engineering configuration from ``kwargs['ano_config']``
    (default ``self.fe_config``); an ``'fe'`` evaluator does the opposite.

    ``self.resampling_strategy`` is checked with ``in`` against
    ``'holdout'``, ``'cv'`` and ``'partial'`` (in that order). The
    feature-engineering operations are recorded on the training part via
    ``parse_config`` and replayed on the validation part via
    ``construct_node``; the estimator returned by ``get_estimator`` is
    scored with ``validation``.

    Finite scores are registered with ``self.topk_model_saver`` while
    holding the read/write lock. For ``'holdout'`` and for ``'partial'``
    with ``resource_ratio == 1`` the ``[op_list, clf]`` pair is pickled
    when the saver requests it.

    Args:
        config: Configuration to evaluate; when ``None`` the stored
            ``self.hpo_config`` / ``self.fe_config`` is used.
        **kwargs: Optional settings read here:
            ``ano_config`` -- the complementary configuration;
            ``resource_ratio`` -- only used by ``'partial'``: passed as
            ``test_size`` of a second stratified split whose test part
            becomes the actual training subset (default ``1.0``);
            ``rw_lock`` -- object with ``acquire()``/``release()``
            guarding the top-k bookkeeping. A fresh ``Lock()`` is used
            when it is missing or ``None``.

    Returns:
        ``-score`` (minimisation objective). Any exception raised inside a
        strategy branch is logged and converted to a score of ``-np.inf``,
        i.e. ``np.inf`` is returned.

    Raises:
        ValueError: if ``self.name`` is ``None`` or no resampling strategy
            matches.
        AssertionError: if ``self.name`` is neither ``'hpo'`` nor ``'fe'``.
    """
    start_time = time.time()

    if self.name is None:
        raise ValueError('This evaluator has no name/type!')
    assert self.name in ['hpo', 'fe']

    if self.name == 'hpo':
        hpo_config = config if config is not None else self.hpo_config
        fe_config = kwargs.get('ano_config', self.fe_config)
    else:
        fe_config = config if config is not None else self.fe_config
        hpo_config = kwargs.get('ano_config', self.hpo_config)

    # Prepare configuration.
    self.seed = 1
    downsample_ratio = kwargs.get('resource_ratio', 1.0)

    if 'holdout' in self.resampling_strategy:
        try:
            with warnings.catch_warnings():
                warnings.filterwarnings("ignore")
                if self.resampling_params is None or 'test_size' not in self.resampling_params:
                    test_size = 0.33
                else:
                    test_size = self.resampling_params['test_size']

                from sklearn.model_selection import StratifiedShuffleSplit
                ss = StratifiedShuffleSplit(n_splits=1, test_size=test_size,
                                            random_state=self.seed)
                # n_splits=1: the generator yields exactly one (train, test) pair.
                train_index, test_index = next(
                    ss.split(self.data_node.data[0], self.data_node.data[1]))
                _X_train, _X_val = (self.data_node.data[0][train_index],
                                    self.data_node.data[0][test_index])
                _y_train, _y_val = (self.data_node.data[1][train_index],
                                    self.data_node.data[1][test_index])
                self.train_node.data = [_X_train, _y_train]
                self.val_node.data = [_X_val, _y_val]

                data_node, op_list = parse_config(self.train_node, fe_config, record=True,
                                                  if_imbal=self.if_imbal)
                _val_node = self.val_node.copy_()
                _val_node = construct_node(_val_node, op_list)

            _X_train, _y_train = data_node.data
            _X_val, _y_val = _val_node.data

            config_dict = hpo_config.get_dictionary().copy()
            # Prepare training and initial params for classifier.
            init_params, fit_params = {}, {}
            if data_node.enable_balance == 1:
                init_params, fit_params = self.get_fit_params(_y_train, self.estimator_id)
                config_dict.update(init_params)
            if data_node.data_balance == 1:
                fit_params['data_balance'] = True

            classifier_id, clf = get_estimator(config_dict, self.estimator_id)

            # The encoder is fitted once and then reused by later calls.
            if self.onehot_encoder is None:
                self.onehot_encoder = OneHotEncoder(categories='auto')
                y = np.reshape(_y_train, (len(_y_train), 1))
                self.onehot_encoder.fit(y)

            score = validation(
                clf, self.scorer, _X_train, _y_train, _X_val, _y_val,
                random_state=self.seed,
                onehot=self.onehot_encoder if isinstance(self.scorer, _ThresholdScorer) else None,
                fit_params=fit_params)

            # `kwargs.get('rw_lock', Lock())` would hand back None when the key
            # is present but None, so resolve the fallback explicitly.
            lock = kwargs.get('rw_lock')
            if lock is None:
                self.logger.info('rw_lock not defined! Possible read-write conflicts may happen!')
                lock = Lock()
            lock.acquire()
            try:
                if np.isfinite(score):
                    save_flag, model_path, delete_flag, model_path_deleted = \
                        self.topk_model_saver.add(hpo_config, fe_config, score, classifier_id)
                    if save_flag is True:
                        with open(model_path, 'wb') as f:
                            pkl.dump([op_list, clf], f)
                        self.logger.info("Model saved to %s" % model_path)
                    self.topk_model_saver.save_topk_config()

                    # Removing the evicted model is best-effort: report, never fail.
                    try:
                        if delete_flag and os.path.exists(model_path_deleted):
                            os.remove(model_path_deleted)
                            self.logger.info("Model deleted from %s" % model_path_deleted)
                    except Exception as err:
                        self.logger.info('Failed to delete model %s: %s'
                                         % (model_path_deleted, str(err)))
            finally:
                # Always release, otherwise a failed save blocks every other worker.
                lock.release()
        except Exception as e:
            import traceback
            self.logger.info('%s-evaluator: %s' % (self.name, str(e)))
            score = -np.inf
            traceback.print_exc(file=sys.stdout)

    elif 'cv' in self.resampling_strategy:
        # Prepare data node.
        try:
            with warnings.catch_warnings():
                warnings.filterwarnings("ignore")
                if self.resampling_params is None or 'folds' not in self.resampling_params:
                    folds = 5
                else:
                    folds = self.resampling_params['folds']

                from sklearn.model_selection import StratifiedKFold
                # Without shuffling the folds are deterministic, and recent
                # scikit-learn releases raise ValueError when random_state is
                # combined with shuffle=False, so it is not passed.
                skfold = StratifiedKFold(n_splits=folds, shuffle=False)
                scores = list()

                for train_index, test_index in skfold.split(self.data_node.data[0],
                                                            self.data_node.data[1]):
                    _X_train, _X_val = (self.data_node.data[0][train_index],
                                        self.data_node.data[0][test_index])
                    _y_train, _y_val = (self.data_node.data[1][train_index],
                                        self.data_node.data[1][test_index])
                    self.train_node.data = [_X_train, _y_train]
                    self.val_node.data = [_X_val, _y_val]

                    data_node, op_list = parse_config(self.train_node, fe_config, record=True,
                                                      if_imbal=self.if_imbal)
                    _val_node = self.val_node.copy_()
                    _val_node = construct_node(_val_node, op_list)

                    _X_train, _y_train = data_node.data
                    _X_val, _y_val = _val_node.data

                    config_dict = hpo_config.get_dictionary().copy()
                    # Prepare training and initial params for classifier.
                    init_params, fit_params = {}, {}
                    if data_node.enable_balance == 1:
                        init_params, fit_params = self.get_fit_params(_y_train,
                                                                      self.estimator_id)
                        config_dict.update(init_params)
                    if data_node.data_balance == 1:
                        fit_params['data_balance'] = True

                    classifier_id, clf = get_estimator(config_dict, self.estimator_id)

                    if self.onehot_encoder is None:
                        self.onehot_encoder = OneHotEncoder(categories='auto')
                        y = np.reshape(_y_train, (len(_y_train), 1))
                        self.onehot_encoder.fit(y)

                    _score = validation(
                        clf, self.scorer, _X_train, _y_train, _X_val, _y_val,
                        random_state=self.seed,
                        onehot=(self.onehot_encoder
                                if isinstance(self.scorer, _ThresholdScorer) else None),
                        fit_params=fit_params)
                    scores.append(_score)
                score = np.mean(scores)

            # TODO: Don't save models for cv
            lock = kwargs.get('rw_lock')
            if lock is None:
                self.logger.info('rw_lock not defined! Possible read-write conflicts may happen!')
                lock = Lock()
            lock.acquire()
            try:
                if np.isfinite(score):
                    # Only the configuration is recorded; no model file is written.
                    _ = self.topk_model_saver.add(hpo_config, fe_config, score, classifier_id)
                    self.topk_model_saver.save_topk_config()
            finally:
                lock.release()
        except Exception as e:
            import traceback
            traceback.print_exc()
            self.logger.info('Evaluator: %s' % (str(e)))
            score = -np.inf

    elif 'partial' in self.resampling_strategy:
        try:
            # Prepare data node.
            with warnings.catch_warnings():
                warnings.filterwarnings("ignore")
                if self.resampling_params is None or 'test_size' not in self.resampling_params:
                    test_size = 0.33
                else:
                    test_size = self.resampling_params['test_size']

                from sklearn.model_selection import StratifiedShuffleSplit
                ss = StratifiedShuffleSplit(n_splits=1, test_size=test_size,
                                            random_state=self.seed)
                train_index, test_index = next(
                    ss.split(self.data_node.data[0], self.data_node.data[1]))
                _X_train, _X_val = (self.data_node.data[0][train_index],
                                    self.data_node.data[0][test_index])
                _y_train, _y_val = (self.data_node.data[1][train_index],
                                    self.data_node.data[1][test_index])
                self.train_node.data = [_X_train, _y_train]
                self.val_node.data = [_X_val, _y_val]

                data_node, op_list = parse_config(self.train_node, fe_config, record=True,
                                                  if_imbal=self.if_imbal)
                _val_node = self.val_node.copy_()
                _val_node = construct_node(_val_node, op_list)

            _X_train, _y_train = data_node.data
            if downsample_ratio != 1:
                # The "test" part of this second split is the subset trained on.
                down_ss = StratifiedShuffleSplit(n_splits=1, test_size=downsample_ratio,
                                                 random_state=self.seed)
                _, _val_index = next(down_ss.split(_X_train, _y_train))
                _act_x_train, _act_y_train = _X_train[_val_index], _y_train[_val_index]
            else:
                _act_x_train, _act_y_train = _X_train, _y_train
                _val_index = list(range(len(_X_train)))

            _X_val, _y_val = _val_node.data

            config_dict = hpo_config.get_dictionary().copy()
            # Prepare training and initial params for classifier.
            init_params, fit_params = {}, {}
            if data_node.enable_balance == 1:
                init_params, fit_params = self.get_fit_params(_y_train, self.estimator_id)
                config_dict.update(init_params)
            if 'sample_weight' in fit_params:
                # Keep the weights aligned with the down-sampled training rows.
                fit_params['sample_weight'] = fit_params['sample_weight'][_val_index]
            if data_node.data_balance == 1:
                fit_params['data_balance'] = True

            classifier_id, clf = get_estimator(config_dict, self.estimator_id)

            if self.onehot_encoder is None:
                self.onehot_encoder = OneHotEncoder(categories='auto')
                y = np.reshape(_y_train, (len(_y_train), 1))
                self.onehot_encoder.fit(y)

            score = validation(
                clf, self.scorer, _act_x_train, _act_y_train, _X_val, _y_val,
                random_state=self.seed,
                onehot=self.onehot_encoder if isinstance(self.scorer, _ThresholdScorer) else None,
                fit_params=fit_params)

            # TODO: Only save models with maximum resources
            lock = kwargs.get('rw_lock')
            if lock is None:
                self.logger.info('rw_lock not defined! Possible read-write conflicts may happen!')
                lock = Lock()
            lock.acquire()
            try:
                if np.isfinite(score) and downsample_ratio == 1:
                    save_flag, model_path, delete_flag, model_path_deleted = \
                        self.topk_model_saver.add(hpo_config, fe_config, score, classifier_id)
                    if save_flag is True:
                        with open(model_path, 'wb') as f:
                            pkl.dump([op_list, clf], f)
                        self.logger.info("Model saved to %s" % model_path)
                    self.topk_model_saver.save_topk_config()

                    try:
                        if delete_flag and os.path.exists(model_path_deleted):
                            os.remove(model_path_deleted)
                            self.logger.info("Model deleted from %s" % model_path_deleted)
                    except Exception as err:
                        self.logger.info('Failed to delete model %s: %s'
                                         % (model_path_deleted, str(err)))
            finally:
                lock.release()
        except Exception as e:
            import traceback
            traceback.print_exc()
            self.logger.info('Evaluator: %s' % (str(e)))
            score = -np.inf

    else:
        raise ValueError('Invalid resampling strategy: %s!' % self.resampling_strategy)

    try:
        self.logger.info('Evaluation<%s> | Score: %.4f | Time cost: %.2f seconds | Shape: %s' %
                         (classifier_id, self.scorer._sign * score,
                          time.time() - start_time, _X_train.shape))
    except Exception:
        # Best-effort summary: classifier_id / _X_train are unbound when the
        # evaluation failed before they were assigned.
        pass

    # Turn it into a minimization problem.
    return -score
def __call__(self, config, **kwargs):
    """Evaluate a regression configuration and return the negated score.

    ``self.resampling_strategy`` is checked with ``in`` against
    ``'holdout'``, ``'cv'`` and ``'partial'`` (in that order). ``config`` is
    used both for ``parse_config`` (operations recorded on the training
    part and replayed on the validation part with ``construct_node``) and,
    through ``config.get_dictionary()``, for ``get_estimator``. The
    estimator is scored with ``validation``.

    Finite scores are registered with ``self.topk_model_saver`` while
    holding the read/write lock. For ``'holdout'`` and for ``'partial'``
    with ``resource_ratio == 1`` the ``[op_list, clf]`` pair is pickled
    when the saver requests it.

    Args:
        config: Configuration object exposing ``get_dictionary()``.
        **kwargs: Optional settings read here:
            ``resource_ratio`` -- only used by ``'partial'``: passed as
            ``test_size`` of a second shuffle split whose test part becomes
            the actual training subset (default ``1.0``);
            ``rw_lock`` -- object with ``acquire()``/``release()``
            guarding the top-k bookkeeping. A fresh ``Lock()`` is used
            when it is missing or ``None``.

    Returns:
        ``-score`` (minimisation objective). Any exception raised inside a
        strategy branch is logged and converted to a score of ``-np.inf``,
        i.e. ``np.inf`` is returned.

    Raises:
        ValueError: if no resampling strategy matches.
    """
    start_time = time.time()

    self.seed = 1
    downsample_ratio = kwargs.get('resource_ratio', 1.0)

    if 'holdout' in self.resampling_strategy:
        try:
            # Prepare data node.
            with warnings.catch_warnings():
                warnings.filterwarnings("ignore")
                if self.resampling_params is None or 'test_size' not in self.resampling_params:
                    test_size = 0.33
                else:
                    test_size = self.resampling_params['test_size']

                from sklearn.model_selection import ShuffleSplit
                ss = ShuffleSplit(n_splits=1, test_size=test_size, random_state=self.seed)
                # n_splits=1: the generator yields exactly one (train, test) pair.
                train_index, test_index = next(
                    ss.split(self.data_node.data[0], self.data_node.data[1]))
                _x_train, _x_val = (self.data_node.data[0][train_index],
                                    self.data_node.data[0][test_index])
                _y_train, _y_val = (self.data_node.data[1][train_index],
                                    self.data_node.data[1][test_index])
                self.train_node.data = [_x_train, _y_train]
                self.val_node.data = [_x_val, _y_val]

                data_node, op_list = parse_config(self.train_node, config, record=True)
                _val_node = self.val_node.copy_()
                _val_node = construct_node(_val_node, op_list)

            _x_train, _y_train = data_node.data
            _x_val, _y_val = _val_node.data

            config_dict = config.get_dictionary().copy()
            # regression gadgets
            regressor_id, clf = get_estimator(config_dict, self.estimator_id)

            score = validation(clf, self.scorer, _x_train, _y_train, _x_val, _y_val,
                               random_state=self.seed)

            # `kwargs.get('rw_lock', Lock())` would hand back None when the key
            # is present but None, so resolve the fallback explicitly.
            lock = kwargs.get('rw_lock')
            if lock is None:
                self.logger.info(
                    'rw_lock not defined! Possible read-write conflicts may happen!')
                lock = Lock()
            lock.acquire()
            try:
                if np.isfinite(score):
                    save_flag, model_path, delete_flag, model_path_deleted = \
                        self.topk_model_saver.add(config, score, regressor_id)
                    if save_flag is True:
                        with open(model_path, 'wb') as f:
                            pkl.dump([op_list, clf], f)
                        self.logger.info("Model saved to %s" % model_path)
                    self.topk_model_saver.save_topk_config()

                    # Removing the evicted model is best-effort: report, never fail.
                    try:
                        if delete_flag and os.path.exists(model_path_deleted):
                            os.remove(model_path_deleted)
                            self.logger.info("Model deleted from %s" % model_path_deleted)
                    except Exception as err:
                        self.logger.info('Failed to delete model %s: %s'
                                         % (model_path_deleted, str(err)))
            finally:
                # Always release, otherwise a failed save blocks every other worker.
                lock.release()
        except Exception as e:
            import traceback
            traceback.print_exc()
            self.logger.info('Evaluator: %s' % (str(e)))
            score = -np.inf

    elif 'cv' in self.resampling_strategy:
        # Prepare data node.
        try:
            with warnings.catch_warnings():
                warnings.filterwarnings("ignore")
                if self.resampling_params is None or 'folds' not in self.resampling_params:
                    folds = 5
                else:
                    folds = self.resampling_params['folds']

                from sklearn.model_selection import KFold
                # Without shuffling the folds are deterministic, and recent
                # scikit-learn releases raise ValueError when random_state is
                # combined with shuffle=False, so it is not passed.
                kfold = KFold(n_splits=folds, shuffle=False)
                scores = list()

                for train_index, test_index in kfold.split(self.data_node.data[0],
                                                           self.data_node.data[1]):
                    _x_train, _x_val = (self.data_node.data[0][train_index],
                                        self.data_node.data[0][test_index])
                    _y_train, _y_val = (self.data_node.data[1][train_index],
                                        self.data_node.data[1][test_index])
                    self.train_node.data = [_x_train, _y_train]
                    self.val_node.data = [_x_val, _y_val]

                    data_node, op_list = parse_config(self.train_node, config, record=True)
                    _val_node = self.val_node.copy_()
                    _val_node = construct_node(_val_node, op_list)

                    _x_train, _y_train = data_node.data
                    _x_val, _y_val = _val_node.data

                    config_dict = config.get_dictionary().copy()
                    # regressor gadgets
                    regressor_id, clf = get_estimator(config_dict, self.estimator_id)

                    _score = validation(clf, self.scorer, _x_train, _y_train, _x_val, _y_val,
                                        random_state=self.seed)
                    scores.append(_score)
                score = np.mean(scores)

            # TODO: Don't save models for cv
            lock = kwargs.get('rw_lock')
            if lock is None:
                self.logger.info(
                    'rw_lock not defined! Possible read-write conflicts may happen!')
                lock = Lock()
            lock.acquire()
            try:
                if np.isfinite(score):
                    # Only the configuration is recorded; no model file is written.
                    _ = self.topk_model_saver.add(config, score, regressor_id)
                    self.topk_model_saver.save_topk_config()
            finally:
                lock.release()
        except Exception as e:
            import traceback
            traceback.print_exc()
            self.logger.info('Evaluator: %s' % (str(e)))
            score = -np.inf

    elif 'partial' in self.resampling_strategy:
        try:
            # Prepare data node.
            with warnings.catch_warnings():
                warnings.filterwarnings("ignore")
                if self.resampling_params is None or 'test_size' not in self.resampling_params:
                    test_size = 0.33
                else:
                    test_size = self.resampling_params['test_size']

                from sklearn.model_selection import ShuffleSplit
                ss = ShuffleSplit(n_splits=1, test_size=test_size, random_state=self.seed)
                train_index, test_index = next(
                    ss.split(self.data_node.data[0], self.data_node.data[1]))
                _x_train, _x_val = (self.data_node.data[0][train_index],
                                    self.data_node.data[0][test_index])
                _y_train, _y_val = (self.data_node.data[1][train_index],
                                    self.data_node.data[1][test_index])
                self.train_node.data = [_x_train, _y_train]
                self.val_node.data = [_x_val, _y_val]

                data_node, op_list = parse_config(self.train_node, config, record=True)
                _val_node = self.val_node.copy_()
                _val_node = construct_node(_val_node, op_list)

            _x_train, _y_train = data_node.data
            if downsample_ratio != 1:
                # The "test" part of this second split is the subset trained on.
                down_ss = ShuffleSplit(n_splits=1, test_size=downsample_ratio,
                                       random_state=self.seed)
                _, _val_index = next(down_ss.split(_x_train, _y_train))
                _act_x_train, _act_y_train = _x_train[_val_index], _y_train[_val_index]
            else:
                _act_x_train, _act_y_train = _x_train, _y_train

            _x_val, _y_val = _val_node.data

            config_dict = config.get_dictionary().copy()
            # Regressor gadgets
            regressor_id, clf = get_estimator(config_dict, self.estimator_id)

            score = validation(clf, self.scorer, _act_x_train, _act_y_train, _x_val, _y_val,
                               random_state=self.seed)

            # TODO: Only save models with maximum resources
            lock = kwargs.get('rw_lock')
            if lock is None:
                self.logger.info(
                    'rw_lock not defined! Possible read-write conflicts may happen!')
                lock = Lock()
            lock.acquire()
            try:
                if np.isfinite(score) and downsample_ratio == 1:
                    save_flag, model_path, delete_flag, model_path_deleted = \
                        self.topk_model_saver.add(config, score, regressor_id)
                    if save_flag is True:
                        with open(model_path, 'wb') as f:
                            pkl.dump([op_list, clf], f)
                        self.logger.info("Model saved to %s" % model_path)
                    self.topk_model_saver.save_topk_config()

                    try:
                        if delete_flag and os.path.exists(model_path_deleted):
                            os.remove(model_path_deleted)
                            self.logger.info("Model deleted from %s" % model_path_deleted)
                    except Exception as err:
                        self.logger.info('Failed to delete model %s: %s'
                                         % (model_path_deleted, str(err)))
            finally:
                lock.release()
        except Exception as e:
            import traceback
            traceback.print_exc()
            self.logger.info('Evaluator: %s' % (str(e)))
            score = -np.inf

    else:
        raise ValueError('Invalid resampling strategy: %s!' % self.resampling_strategy)

    try:
        self.logger.info(
            'Evaluation<%s> | Score: %.4f | Time cost: %.2f seconds | Shape: %s' %
            (regressor_id, self.scorer._sign * score,
             time.time() - start_time, _x_train.shape))
    except Exception:
        # Best-effort summary: regressor_id / _x_train are unbound when the
        # evaluation failed before they were assigned.
        pass

    # Turn it into a minimization problem.
    return -score
def __call__(self, config, **kwargs):
    """Evaluate a combined configuration and return the negated score.

    ``config`` is copied into a plain dict (via ``get_dictionary()`` when it
    is not already a dict), overlaid with ``self.fixed_config`` when that is
    set, and ``config['algorithm']`` is stored as ``self.estimator_id``.
    The same dict is passed to ``parse_config`` and ``get_estimator``.

    ``self.resampling_strategy`` is checked with ``in`` against
    ``'holdout'``, ``'cv'`` and ``'partial'`` (in that order). For
    ``'holdout'`` and for ``'partial'`` with ``resource_ratio == 1`` a
    finite score causes ``[op_list, clf, score]`` to be pickled to the path
    given by ``CombinedTopKModelSaver.get_path_by_config``; an existing file
    is only overwritten when the new score is strictly higher than the one
    stored in it. ``'cv'`` never writes a model.

    Unlike a guarded evaluation, nothing here catches errors raised while
    splitting, fitting or scoring: they propagate to the caller.

    Args:
        config: A dict, or an object exposing ``get_dictionary()``; must
            contain the key ``'algorithm'`` after ``self.fixed_config`` has
            been applied.
        **kwargs: ``resource_ratio`` -- only used by ``'partial'``: passed
            as ``test_size`` of a second stratified split whose test part
            becomes the actual training subset (default ``1.0``).

    Returns:
        ``-score`` (minimisation objective).

    Raises:
        ValueError: if no resampling strategy matches.
    """
    start_time = time.time()

    self.seed = 1
    downsample_ratio = kwargs.get('resource_ratio', 1.0)

    # Convert Configuration into dictionary
    if not isinstance(config, dict):
        config = config.get_dictionary().copy()
    else:
        config = config.copy()
    if self.fixed_config is not None:
        config.update(self.fixed_config)
    self.estimator_id = config['algorithm']

    if 'holdout' in self.resampling_strategy:
        # Prepare data node.
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore")
            if self.resampling_params is None or 'test_size' not in self.resampling_params:
                test_size = 0.33
            else:
                test_size = self.resampling_params['test_size']

            from sklearn.model_selection import StratifiedShuffleSplit
            ss = StratifiedShuffleSplit(n_splits=1, test_size=test_size,
                                        random_state=self.seed)
            # n_splits=1: the generator yields exactly one (train, test) pair.
            train_index, test_index = next(
                ss.split(self.data_node.data[0], self.data_node.data[1]))
            _x_train, _x_val = (self.data_node.data[0][train_index],
                                self.data_node.data[0][test_index])
            _y_train, _y_val = (self.data_node.data[1][train_index],
                                self.data_node.data[1][test_index])
            self.train_node.data = [_x_train, _y_train]
            self.val_node.data = [_x_val, _y_val]

            data_node, op_list = parse_config(self.train_node, config, record=True,
                                              if_imbal=self.if_imbal)
            _val_node = self.val_node.copy_()
            _val_node = construct_node(_val_node, op_list)

        _x_train, _y_train = data_node.data
        _x_val, _y_val = _val_node.data

        config_dict = config.copy()
        # Prepare training and initial params for classifier.
        init_params, fit_params = {}, {}
        if data_node.enable_balance == 1:
            init_params, fit_params = self.get_fit_params(_y_train, self.estimator_id)
            config_dict.update(init_params)
        if data_node.data_balance == 1:
            fit_params['data_balance'] = True

        classifier_id, clf = get_estimator(config_dict, self.estimator_id)

        # The encoder is fitted once and then reused by later calls.
        if self.onehot_encoder is None:
            self.onehot_encoder = OneHotEncoder(categories='auto')
            y = np.reshape(_y_train, (len(_y_train), 1))
            self.onehot_encoder.fit(y)

        score = validation(
            clf, self.scorer, _x_train, _y_train, _x_val, _y_val,
            random_state=self.seed,
            onehot=self.onehot_encoder if isinstance(self.scorer, _ThresholdScorer) else None,
            fit_params=fit_params)

        if np.isfinite(score):
            model_path = CombinedTopKModelSaver.get_path_by_config(
                self.output_dir, config, self.timestamp)

            should_write = True
            if os.path.exists(model_path):
                # NOTE: unpickles a file from self.output_dir; only safe as
                # long as that directory holds files written by this code.
                with open(model_path, 'rb') as f:
                    _, _, perf = pkl.load(f)
                should_write = score > perf

            if should_write:
                with open(model_path, 'wb') as f:
                    pkl.dump([op_list, clf, score], f)
                # Logged only when a file was actually (re)written.
                self.logger.info("Model saved to %s" % model_path)

    elif 'cv' in self.resampling_strategy:
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore")
            if self.resampling_params is None or 'folds' not in self.resampling_params:
                folds = 5
            else:
                folds = self.resampling_params['folds']

            from sklearn.model_selection import StratifiedKFold
            # Without shuffling the folds are deterministic, and recent
            # scikit-learn releases raise ValueError when random_state is
            # combined with shuffle=False, so it is not passed.
            skfold = StratifiedKFold(n_splits=folds, shuffle=False)
            scores = list()

            for train_index, test_index in skfold.split(self.data_node.data[0],
                                                        self.data_node.data[1]):
                _x_train, _x_val = (self.data_node.data[0][train_index],
                                    self.data_node.data[0][test_index])
                _y_train, _y_val = (self.data_node.data[1][train_index],
                                    self.data_node.data[1][test_index])
                self.train_node.data = [_x_train, _y_train]
                self.val_node.data = [_x_val, _y_val]

                data_node, op_list = parse_config(self.train_node, config, record=True,
                                                  if_imbal=self.if_imbal)
                _val_node = self.val_node.copy_()
                _val_node = construct_node(_val_node, op_list)

                _x_train, _y_train = data_node.data
                _x_val, _y_val = _val_node.data

                config_dict = config.copy()
                # Prepare training and initial params for classifier.
                init_params, fit_params = {}, {}
                if data_node.enable_balance == 1:
                    init_params, fit_params = self.get_fit_params(_y_train, self.estimator_id)
                    config_dict.update(init_params)
                if data_node.data_balance == 1:
                    fit_params['data_balance'] = True

                classifier_id, clf = get_estimator(config_dict, self.estimator_id)

                if self.onehot_encoder is None:
                    self.onehot_encoder = OneHotEncoder(categories='auto')
                    y = np.reshape(_y_train, (len(_y_train), 1))
                    self.onehot_encoder.fit(y)

                _score = validation(
                    clf, self.scorer, _x_train, _y_train, _x_val, _y_val,
                    random_state=self.seed,
                    onehot=(self.onehot_encoder
                            if isinstance(self.scorer, _ThresholdScorer) else None),
                    fit_params=fit_params)
                scores.append(_score)
            score = np.mean(scores)

    elif 'partial' in self.resampling_strategy:
        # Prepare data node.
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore")
            if self.resampling_params is None or 'test_size' not in self.resampling_params:
                test_size = 0.33
            else:
                test_size = self.resampling_params['test_size']

            from sklearn.model_selection import StratifiedShuffleSplit
            ss = StratifiedShuffleSplit(n_splits=1, test_size=test_size,
                                        random_state=self.seed)
            train_index, test_index = next(
                ss.split(self.data_node.data[0], self.data_node.data[1]))
            _x_train, _x_val = (self.data_node.data[0][train_index],
                                self.data_node.data[0][test_index])
            _y_train, _y_val = (self.data_node.data[1][train_index],
                                self.data_node.data[1][test_index])
            self.train_node.data = [_x_train, _y_train]
            self.val_node.data = [_x_val, _y_val]

            data_node, op_list = parse_config(self.train_node, config, record=True,
                                              if_imbal=self.if_imbal)
            _val_node = self.val_node.copy_()
            _val_node = construct_node(_val_node, op_list)

        _x_train, _y_train = data_node.data
        if downsample_ratio != 1:
            # The "test" part of this second split is the subset trained on.
            down_ss = StratifiedShuffleSplit(n_splits=1, test_size=downsample_ratio,
                                             random_state=self.seed)
            _, _val_index = next(down_ss.split(_x_train, _y_train))
            _act_x_train, _act_y_train = _x_train[_val_index], _y_train[_val_index]
        else:
            _act_x_train, _act_y_train = _x_train, _y_train
            _val_index = list(range(len(_x_train)))

        _x_val, _y_val = _val_node.data

        config_dict = config.copy()
        # Prepare training and initial params for classifier.
        init_params, fit_params = {}, {}
        if data_node.enable_balance == 1:
            init_params, fit_params = self.get_fit_params(_y_train, self.estimator_id)
            config_dict.update(init_params)
        if 'sample_weight' in fit_params:
            # Keep the weights aligned with the down-sampled training rows.
            fit_params['sample_weight'] = fit_params['sample_weight'][_val_index]
        if data_node.data_balance == 1:
            fit_params['data_balance'] = True

        classifier_id, clf = get_estimator(config_dict, self.estimator_id)

        if self.onehot_encoder is None:
            self.onehot_encoder = OneHotEncoder(categories='auto')
            y = np.reshape(_y_train, (len(_y_train), 1))
            self.onehot_encoder.fit(y)

        score = validation(
            clf, self.scorer, _act_x_train, _act_y_train, _x_val, _y_val,
            random_state=self.seed,
            onehot=self.onehot_encoder if isinstance(self.scorer, _ThresholdScorer) else None,
            fit_params=fit_params)

        # Models are only persisted for full-resource evaluations.
        if np.isfinite(score) and downsample_ratio == 1:
            model_path = CombinedTopKModelSaver.get_path_by_config(
                self.output_dir, config, self.timestamp)

            should_write = True
            if os.path.exists(model_path):
                # NOTE: unpickles a file from self.output_dir; only safe as
                # long as that directory holds files written by this code.
                with open(model_path, 'rb') as f:
                    _, _, perf = pkl.load(f)
                should_write = score > perf

            if should_write:
                with open(model_path, 'wb') as f:
                    pkl.dump([op_list, clf, score], f)
                # Logged only when a file was actually (re)written.
                self.logger.info("Model saved to %s" % model_path)

    else:
        raise ValueError('Invalid resampling strategy: %s!' % self.resampling_strategy)

    try:
        self.logger.info(
            'Evaluation<%s> | Score: %.4f | Time cost: %.2f seconds | Shape: %s' %
            (classifier_id, self.scorer._sign * score,
             time.time() - start_time, _x_train.shape))
    except Exception:
        # The summary line is best-effort; a formatting or logging problem
        # must not discard an evaluation that already succeeded.
        pass

    # Turn it into a minimization problem.
    return -score