def get_wrapped_sklearn_model(model_cls_str):
    """Build a ``WrappedModel`` subclass around the sklearn estimator named
    by *model_cls_str*.

    The argument is assumed to match ``'^Sklearn.+$'``; the ``Sklearn``
    prefix is stripped to obtain the bare estimator class name, which is
    then resolved through exactly one of the two module registries
    (non-regressor or regressor).
    """
    model_cls_str = re.sub('Sklearn', '', model_cls_str)
    notreg_mod = _valid_not_regressor_models.get(model_cls_str)
    reg_mod = _valid_regressor_models.get(model_cls_str)
    # The name must be registered, and in exactly one of the two tables.
    if notreg_mod is None and reg_mod is None:
        raise Exception('{} is invalid.'.format(model_cls_str))
    if notreg_mod is not None and reg_mod is not None:
        raise Exception('unexpected state.')
    model_cls_mod = notreg_mod or reg_mod
    model_cls = load_object_by_str('{}@{}'.format(model_cls_str, model_cls_mod))

    # Assemble the attributes of the generated wrapper class.
    cls_attrs = {
        'fit': _fit,
        'reset': _generate_reset_func(model_cls),
        'dump': dump_sklearn_model,
        'load': load_sklearn_model,
    }
    # Regressors get a dedicated base-init hook.
    cls_attrs['base_init_finished'] = (
        _base_init_finished if notreg_mod else _base_init_finished_reg)
    # Forward predict/predict_proba only when the estimator supports them.
    for attr_name, wrapped_impl in (('predict', _predict),
                                    ('predict_proba', _predict_proba)):
        if hasattr(model_cls, attr_name):
            cls_attrs[attr_name] = wrapped_impl

    logger.debug('model_cls_str: {} .. created.'.format(model_cls_str))
    return type('WrappedSklearn{}'.format(model_cls_str), (WrappedModel, ),
                cls_attrs)
def _get_preprocessor_strict(config):
    """Instantiate the preprocessor described by *config*.

    *config* is a dict with a mandatory ``'name'`` key and an optional
    ``'kwargs'`` dict passed to the preprocessor constructor. Raises when
    ``'name'`` is missing.
    """
    name = config.get('name')
    if name is None:
        raise Exception('preprocessor_config.name must be set.')
    # The alias table is keyed by CamelCase names; fall back to the raw
    # name when no alias is registered.
    cameledname = snake2camel(name)
    target = _preprocessor_name_alias.get(cameledname, name)
    ppcls = load_object_by_str(target)
    kwargs = config.get('kwargs', {})
    return ppcls(**kwargs)
def test_model_regressor(pd_assert_equal):
    """Integration test for the regressor path: a wrapped
    ``SklearnLinearRegression`` must match a directly-constructed sklearn
    ``LinearRegression`` on predictions, and every metric reported by
    ``evaluate()`` must agree with sklearn's own metric functions.
    """
    # Grids of constructor / fit / evaluate options swept by the test.
    init_kwargs_settings = [
        {},
        {
            'normalize': True
        },
    ]
    fit_kwargs_settings = [{}]
    evaluate_kwargs_settings = [{
        'train_test_split_func': '[email protected]_selection',
        'train_test_split_func_kwargs': {
            'random_state': 0
        },
        'metrics': 'all',
    }]
    dataset_config = {
        'loader_config': {
            'name': '*****@*****.**',
            'kwargs': {
                'random_state': 0,
            },
        },
        'target_column': 'target',
    }
    # 'identify' is the no-op preprocessor.
    preprocessor = get_preprocessor({
        'name': 'identify',
        'kwargs': {},
    })
    # The same (identity) formatter is used for both predictor and target.
    ffunc_for_predictor = ffunc_for_target = load_object_by_str(
        '*****@*****.**')
    ds1 = get_dataset(dataset_config)
    X1, y1 = ds1.get_predictor_target()
    for init_kwargs in init_kwargs_settings:
        for fit_kwargs in fit_kwargs_settings:
            for evaluate_kwargs in evaluate_kwargs_settings:
                mconfig = {
                    'name': 'SklearnLinearRegression',
                    'init_kwargs': init_kwargs,
                    'fit_kwargs': fit_kwargs,
                    'evaluate_kwargs': evaluate_kwargs,
                    'is_rebuild': False,
                }
                m = get_model(mconfig)
                # Reference model built directly from sklearn.
                morigin = LinearRegression(**init_kwargs)
                m.fit(X1, y1)
                morigin.fit(X1, y1, **fit_kwargs)
                # assertion: wrapped and reference models predict identically
                pd_assert_equal(
                    pd.Series(m.predict(X1)).astype('float64'),
                    pd.Series(morigin.predict(X1)).astype('float64'))
                # Reproduce the split evaluate() is expected to make
                # internally (same random_state).
                X_train, X_test, y_train, y_test = skl_train_test_split(
                    X1, y1,
                    **evaluate_kwargs['train_test_split_func_kwargs'])
                rev1 = m.evaluate(X1, y1, preprocessor, ffunc_for_predictor,
                                  ffunc_for_target)
                if not rev1['cv']:
                    # Hold-out mode: a single list of metric dicts
                    # ({'name': ..., 'value': ...}) is reported.
                    met = rev1['metrics'][0]
                    mean_absolute_error = [
                        o['value'] for o in met
                        if o['name'] == 'mean_absolute_error'
                    ][0]
                    mean_squared_error = [
                        o['value'] for o in met
                        if o['name'] == 'mean_squared_error'
                    ][0]
                    median_absolute_error = [
                        o['value'] for o in met
                        if o['name'] == 'median_absolute_error'
                    ][0]
                    r2_score = [
                        o['value'] for o in met if o['name'] == 'r2_score'
                    ][0]
                    explained_variance = [
                        o['value'] for o in met
                        if o['name'] == 'explained_variance'
                    ][0]
                    # Refit the reference model on the reproduced train
                    # split before comparing metric values.
                    morigin.fit(ffunc_for_predictor(X_train),
                                ffunc_for_target(y_train))
                    # Each reported metric must agree with sklearn's own
                    # computation within a small absolute tolerance.
                    assert math.fabs(
                        mean_absolute_error - skl_metrics.mean_absolute_error(
                            y_test,
                            morigin.predict(X_test))) < 0.00001
                    assert math.fabs(
                        mean_squared_error - skl_metrics.mean_squared_error(
                            y_test, morigin.predict(X_test))) < 0.00001
                    assert math.fabs(
                        median_absolute_error -
                        skl_metrics.median_absolute_error(
                            y_test, morigin.predict(X_test))) < 0.00001
                    assert math.fabs(r2_score - skl_metrics.r2_score(
                        y_test, morigin.predict(X_test))) < 0.00001
                    assert math.fabs(
                        explained_variance -
                        skl_metrics.explained_variance_score(
                            y_test, morigin.predict(X_test))) < 0.00001
def train(train_id,
          scenario_tag,
          dataset_config=None,
          model_config=None,
          preprocessor_config=None,
          formatter_config_for_predictor=None,
          formatter_config_for_target=None,
          evaluate_enabled=False,
          fit_model_enabled=False,
          dump_result_enabled=False):
    """
    Execute a model training run.

    :param train_id: Identifier of this training run within the scenario
    :type train_id: str
    :param scenario_tag: Tag attached to the scenario
    :type scenario_tag: str
    :param dataset_config: Dataset settings; argument for :class:`akebono.dataset.get_dataset`.
    :type dataset_config: dict
    :param model_config: Model settings; argument for :class:`akebono.model.get_model`.
    :type model_config: dict
    :param preprocessor_config: Preprocessor settings; argument for :class:`akebono.preprocessor.get_preprocessor`.
    :type preprocessor_config: dict
    :param formatter_config_for_predictor: Formatter settings for the predictor (features).
    :type formatter_config_for_predictor: dict
    :param formatter_config_for_target: Formatter settings for the target.
    :type formatter_config_for_target: dict
    :param evaluate_enabled: Whether to run model evaluation
    :type evaluate_enabled: bool
    :param fit_model_enabled: Whether to fit the model
    :type fit_model_enabled: bool
    :param dump_result_enabled: Whether to persist the model and evaluation results
    :type dump_result_enabled: bool
    """
    if model_config is None:
        raise ValueError('model_config must be set.')
    if dataset_config is None:
        raise ValueError('dataset_config must be set.')
    # Fall back to the no-op 'identify' preprocessor and identity
    # formatters when none are configured.
    if preprocessor_config is None:
        preprocessor_config = {
            'name': 'identify',
            'kwargs': {},
        }
    if formatter_config_for_predictor is None:
        formatter_config_for_predictor = {
            'name': '*****@*****.**',
        }
    if formatter_config_for_target is None:
        formatter_config_for_target = {
            'name': '*****@*****.**',
        }
    # Result manifest: records the effective configuration of this run and
    # is extended below with evaluation/model/preprocessor artifacts.
    ret = {
        'type': 'train',
        'id': train_id,
        'dataset_config': dataset_config,
        'model_config': model_config,
        'preprocessor_config': preprocessor_config,
        'formatter_config_for_predictor': formatter_config_for_predictor,
        'formatter_config_for_target': formatter_config_for_target,
        'evaluate_enabled': evaluate_enabled,
        'fit_model_enabled': fit_model_enabled,
        'dump_result_enabled': dump_result_enabled
    }
    dataset = get_dataset(dataset_config)
    preprocessor = get_preprocessor(preprocessor_config)
    preprocessor.set_operation_mode('train')
    logger.debug('load dataset start.')
    X, y = dataset.get_predictor_target()
    if X.index.size == 0:
        raise EmptyDatasetError('empty record')
    logger.debug('load dataset done.')
    # NOTE(review): mutates the caller-supplied model_config dict in place.
    model_config['is_rebuild'] = False
    model = get_model(model_config)
    format_func_for_predictor = load_object_by_str(
        formatter_config_for_predictor['name'])
    format_func_for_target = load_object_by_str(
        formatter_config_for_target['name'])
    if evaluate_enabled:
        logger.debug('evaluate start.')
        rep = model.evaluate(X, y, preprocessor, format_func_for_predictor,
                             format_func_for_target)
        gc.collect()
        logger.debug('evaluate done.')
        ret['evaluate'] = rep
    if fit_model_enabled:
        logger.debug('fit start.')
        # Preprocess and format the predictors before fitting.
        fX_p, _ = preprocessor.process(X, None)
        fX = format_func_for_predictor(fX_p)
        model.fit(fX, format_func_for_target(y))
        gc.collect()
        logger.debug('fit done.')
        ret['model'] = model
    if dump_result_enabled:
        logger.debug('dump_train_result start.')
        ret['preprocessor'] = preprocessor
        dump_train_result(train_id, scenario_tag, ret)
        logger.debug('dump_train_result done.')
    return ret
def evaluate(model,
             X,
             y,
             preprocessor,
             format_func_for_predictor,
             format_func_for_target,
             train_test_split_func='[email protected]_selection',
             train_test_split_func_kwargs=None,
             cross_val_iterator=None,
             cross_val_iterator_kwargs=None,
             metrics='all'):
    """Evaluate *model* on ``(X, y)`` and return a metrics report.

    Runs either a single hold-out split (``train_test_split_func``) or a
    cross-validation loop (``cross_val_iterator``), computing the metric set
    registered for the model's type on each train/test pair.

    :param model: model object exposing ``model_type`` / ``set_model_type``
    :param X: predictor data
    :param y: target data
    :param preprocessor: preprocessor; reset and re-fit per split
    :param format_func_for_predictor: formatter applied to predictors
    :param format_func_for_target: formatter applied to targets
    :param train_test_split_func: split function or dotted-path string
    :param train_test_split_func_kwargs: kwargs for the split function
    :param cross_val_iterator: CV iterator (or dotted-path string); when set,
        takes precedence over ``train_test_split_func``
    :param cross_val_iterator_kwargs: kwargs for the CV iterator
    :param metrics: only ``'all'`` is supported
    :return: dict with ``'cv'`` (bool) and ``'metrics'`` (one list per split)
    """
    # Fix: the original signature used mutable default arguments ({}),
    # which are shared across calls; use None sentinels instead.
    if train_test_split_func_kwargs is None:
        train_test_split_func_kwargs = {}
    if cross_val_iterator_kwargs is None:
        cross_val_iterator_kwargs = {}
    if model.model_type is None:
        model.set_model_type(y=y)
    model_type = model.model_type
    # cross_val_iterator is None or iterable (a string is resolved lazily)
    if cross_val_iterator is not None:
        if isinstance(cross_val_iterator, str):
            cross_val_iterator = load_object_by_str(cross_val_iterator)
    else:
        # train_test_split_func is None or string or function
        if isinstance(train_test_split_func, str):
            train_test_split_func = load_object_by_str(train_test_split_func)
    result = {'metrics': []}
    train_test_iterator = None
    # not CV mode: a single hold-out split
    if cross_val_iterator is None:
        train_test_iterator = [
            train_test_split_func(X, y, **train_test_split_func_kwargs)
        ]
        result['cv'] = False
    # CV mode: one train/test pair per fold
    else:
        train_test_iterator = _sklearn_cross_val_iter2train_test_iter(
            X, y, cross_val_iterator, cross_val_iterator_kwargs)
        result['cv'] = True
    for X_train_raw, X_test_raw, y_train_raw, y_test_raw in train_test_iterator:
        y_train, y_test = format_func_for_target(
            y_train_raw), format_func_for_target(y_test_raw)
        # Reset per split so fitted preprocessor state does not leak
        # between folds.
        preprocessor.reset()
        X_train_p, X_test_p = preprocessor.process(X_train_raw, X_test_raw)
        X_train, X_test = format_func_for_predictor(
            X_train_p), format_func_for_predictor(X_test_p)
        one_result = []
        model, y_pred, y_pred_proba = _fit_and_predict(X_train, X_test,
                                                       y_train, model)
        if metrics == 'all':
            # Pick the metric set registered for this model type.
            _preload_metrics = None
            if model_type == 'binary_classifier':
                _preload_metrics = _binary_classifier_metrics
            elif model_type == 'regressor':
                _preload_metrics = _regressor_metrics
            elif model_type == 'multiple_classifier':
                _preload_metrics = _multiple_classifier_metrics
            else:
                raise Exception('not supported.')
            for m in _preload_metrics:
                one_result.append(
                    _get_evaluated_result(y_pred, y_pred_proba, y_test, m))
        else:
            raise Exception('not supported.')
        result['metrics'].append(one_result)
    return result
def get_dataset(dataset_config):
    """
    Create a Dataset.

    :param dataset_config: Dataset settings
    :type dataset_config: dict
    :return: :class:`Dataset` object

    Usage:
        >>> from akebono.dataset import get_dataset
        >>> dataset_config = {
            'loader_config': {
                'name': '*****@*****.**',
                'kwargs': {
                    'n_features': 1,
                    'noise': 30.0,
                    'random_state': 0,
                },
            },
            'target_column': 'target',
            'cache_enabled': False,
        }
        >>> ds = get_dataset(dataset_config)
        >>> ds
        <akebono.dataset.model.Dataset object at 0x11291acc0>
    """
    dataset_name = dataset_config.get('name')
    target_column = dataset_config.get('target_column', 'target')
    cache_enabled = dataset_config.get('cache_enabled', False)
    evacuated_columns = dataset_config.get('evacuated_columns', [])
    if not isinstance(evacuated_columns, list):
        raise TypeError('evacuated_columns must be list.')
    loader_config = dataset_config.get('loader_config')
    if not isinstance(loader_config, dict):
        raise Exception(
            'loader_config must be specified and this type is dict.')
    load_func = loader_config.get('name')
    load_func = _loader_name_alias.get(load_func, load_func)  # use the alias if one exists
    if load_func is None:
        raise Exception('loader_config.name must be specified.')
    load_func = load_object_by_str(load_func)
    load_func_kwargs = Param(loader_config.get('kwargs', {}))
    loader_param = loader_config.get('param', {})
    # These keys are injected below, so the caller must not supply them.
    _reserved_params = (
        'dataset_name',
        'target_column',
    )
    for rp in _reserved_params:
        if rp in loader_param:
            raise KeyError('{} is reserved param.'.format(rp))
    loader_param['dataset_name'] = dataset_name
    loader_param['target_column'] = target_column
    preprocess_func_str = dataset_config.get(
        'preprocess_func', '*****@*****.**')
    preprocess_func_hash = get_hash(preprocess_func_str)
    preprocess_func = load_object_by_str(preprocess_func_str)
    preprocess_func_kwargs = Param(
        dataset_config.get('preprocess_func_kwargs', {}))

    def _core_func():
        # Load then preprocess; kwargs are copied so the closure cannot
        # mutate the captured Param contents across invocations.
        return preprocess_func(
            load_func(copy.deepcopy(load_func_kwargs.value), loader_param),
            **copy.copy(preprocess_func_kwargs.value))

    # Cache key: dataset name + hashes of loader kwargs, preprocess func
    # and preprocess kwargs, so any config change yields a new entry.
    fname = '{}_{}_{}_{}'.format(
        dataset_name, load_func_kwargs.get_hashed_id(length=24),
        preprocess_func_hash[:24],
        preprocess_func_kwargs.get_hashed_id(length=24))
    dataset_loading_cache_enabled = dataset_config.get(
        'dataset_loading_cache_enabled', True)
    # In-process cache lookup (datasetholder) before any loading work.
    if dataset_loading_cache_enabled:
        ds = datasetholder.get(fname)
        if ds is not None:
            logger.debug(
                'dataset_loading_cache enabled .. {} get done.'.format(
                    ds.name))
            return ds
    pkl_fname = fname + '.pkl'
    if cache_enabled:
        if dataset_name is not None:
            logger.info('dataset cache enabled')
            # Wrap the loader so its result is persisted to / read from a
            # pickle file under the cache directory.
            _core_func = cache_located_at(
                pathjoin(settings.cache_dir, pkl_fname))(_core_func)
        else:
            raise Exception(
                'dataset_config.cache_enabled is True, but dataset_config.name is None'
            )
    ds = Dataset(fname, _core_func(), target_column, evacuated_columns)
    if dataset_loading_cache_enabled:
        datasetholder.set(ds)
        logger.debug('dataset_loading_cache enabled .. {} set done.'.format(
            ds.name))
    return ds