예제 #1
0
def get_wrapped_sklearn_model(model_cls_str):
    """Build a ``WrappedModel`` subclass around a scikit-learn estimator class.

    The ``Sklearn`` prefix is stripped from ``model_cls_str`` and the
    remainder is looked up in ``_valid_not_regressor_models`` and
    ``_valid_regressor_models``.  Exactly one of the two tables must know the
    name; the value found there is used as the module part of the
    ``'<name>@<module>'`` reference handed to ``load_object_by_str``.

    :param model_cls_str: model name, expected to match ``^Sklearn.+$``
    :type model_cls_str: str
    :return: a new class named ``WrappedSklearn<name>`` deriving from
        ``WrappedModel``
    :raises Exception: if the name is registered in neither table, or in both
    """
    # Strip only the leading prefix; an unanchored pattern would also remove
    # any later occurrence of 'Sklearn' inside the name.
    model_cls_str = re.sub(r'^Sklearn', '', model_cls_str)
    model_cls_mod_notreg = _valid_not_regressor_models.get(model_cls_str)
    model_cls_mod_reg = _valid_regressor_models.get(model_cls_str)
    if model_cls_mod_notreg is None and model_cls_mod_reg is None:
        raise Exception('{} is invalid.'.format(model_cls_str))
    if model_cls_mod_notreg is not None and model_cls_mod_reg is not None:
        raise Exception('unexpected state.')

    # From here on exactly one lookup succeeded.  Decide with ``is None`` so
    # the choice agrees with the validation above even for falsy entries.
    is_regressor = model_cls_mod_notreg is None
    model_cls_mod = model_cls_mod_reg if is_regressor else model_cls_mod_notreg
    model_cls = load_object_by_str('{}@{}'.format(model_cls_str,
                                                  model_cls_mod))

    cls_attrs = {
        'fit': _fit,
        'reset': _generate_reset_func(model_cls),
        'dump': dump_sklearn_model,
        'load': load_sklearn_model,
        'base_init_finished':
        _base_init_finished_reg if is_regressor else _base_init_finished,
    }

    # Only expose the prediction methods the underlying estimator provides.
    if hasattr(model_cls, 'predict'):
        cls_attrs['predict'] = _predict
    if hasattr(model_cls, 'predict_proba'):
        cls_attrs['predict_proba'] = _predict_proba
    logger.debug('model_cls_str: {} .. created.'.format(model_cls_str))

    return type('WrappedSklearn{}'.format(model_cls_str), (WrappedModel, ),
                cls_attrs)
예제 #2
0
def _get_preprocessor_strict(config):
    name = config.get('name')
    if name is None:
        raise Exception('preprocessor_config.name must be set.')
    cameledname = snake2camel(name)
    ppcls = load_object_by_str(_preprocessor_name_alias.get(cameledname, name))
    return ppcls(**config.get('kwargs', {}))
예제 #3
0
def test_model_regressor(pd_assert_equal):
    """Compare the wrapped ``SklearnLinearRegression`` with ``LinearRegression``.

    For every combination of init / fit / evaluate settings the test checks
    that

    * predictions of the wrapped model equal those of a plain
      ``LinearRegression`` fitted on the same data, and
    * in non-CV mode, each regression metric reported by ``evaluate`` is
      within ``0.00001`` of the value computed directly with
      ``skl_metrics`` on the same train/test split.

    :param pd_assert_equal: callable comparing two pandas objects (supplied
        by the test harness)
    """
    init_kwargs_settings = [{}, {'normalize': True}]
    fit_kwargs_settings = [{}]
    evaluate_kwargs_settings = [{
        'train_test_split_func': '[email protected]_selection',
        'train_test_split_func_kwargs': {'random_state': 0},
        'metrics': 'all',
    }]
    dataset_config = {
        'loader_config': {
            'name': '*****@*****.**',
            'kwargs': {'random_state': 0},
        },
        'target_column': 'target',
    }
    preprocessor = get_preprocessor({'name': 'identify', 'kwargs': {}})
    ffunc_for_predictor = ffunc_for_target = load_object_by_str(
        '*****@*****.**')
    ds1 = get_dataset(dataset_config)
    X1, y1 = ds1.get_predictor_target()

    # Same iteration order as three nested loops (init -> fit -> evaluate).
    combos = [(ik, fk, ek)
              for ik in init_kwargs_settings
              for fk in fit_kwargs_settings
              for ek in evaluate_kwargs_settings]

    for init_kwargs, fit_kwargs, evaluate_kwargs in combos:
        wrapped = get_model({
            'name': 'SklearnLinearRegression',
            'init_kwargs': init_kwargs,
            'fit_kwargs': fit_kwargs,
            'evaluate_kwargs': evaluate_kwargs,
            'is_rebuild': False,
        })
        reference = LinearRegression(**init_kwargs)
        wrapped.fit(X1, y1)
        reference.fit(X1, y1, **fit_kwargs)

        # Predictions on the full data set must match.
        pd_assert_equal(
            pd.Series(wrapped.predict(X1)).astype('float64'),
            pd.Series(reference.predict(X1)).astype('float64'))

        split_kwargs = evaluate_kwargs['train_test_split_func_kwargs']
        X_train, X_test, y_train, y_test = skl_train_test_split(
            X1, y1, **split_kwargs)
        report = wrapped.evaluate(X1, y1, preprocessor, ffunc_for_predictor,
                                  ffunc_for_target)
        if report['cv']:
            # Metric comparison below only applies to the single-split mode.
            continue

        first_split = report['metrics'][0]

        def reported(metric_name):
            # First entry with this name; IndexError if it is missing.
            return [
                o['value'] for o in first_split if o['name'] == metric_name
            ][0]

        # (reported value, name of the reference function in skl_metrics)
        checks = [
            (reported('mean_absolute_error'), 'mean_absolute_error'),
            (reported('mean_squared_error'), 'mean_squared_error'),
            (reported('median_absolute_error'), 'median_absolute_error'),
            (reported('r2_score'), 'r2_score'),
            (reported('explained_variance'), 'explained_variance_score'),
        ]

        # Refit the reference model on the training part of the same split.
        reference.fit(ffunc_for_predictor(X_train),
                      ffunc_for_target(y_train))
        for observed, skl_name in checks:
            expected = getattr(skl_metrics, skl_name)(
                y_test, reference.predict(X_test))
            assert math.fabs(observed - expected) < 0.00001
예제 #4
0
def train(train_id,
          scenario_tag,
          dataset_config=None,
          model_config=None,
          preprocessor_config=None,
          formatter_config_for_predictor=None,
          formatter_config_for_target=None,
          evaluate_enabled=False,
          fit_model_enabled=False,
          dump_result_enabled=False):
    """
        Run training of a model.

        :param train_id: identifier of this training run within the scenario
        :type train_id: str
        :param scenario_tag: tag attached to the scenario
        :type scenario_tag: str
        :param dataset_config: Dataset settings; argument of :class:`akebono.dataset.get_dataset`.
        :type dataset_config: dict
        :param model_config: Model settings; argument of :class:`akebono.model.get_model`.
            The dict passed in is not modified; a copy with ``is_rebuild=False`` is used.
        :type model_config: dict
        :param preprocessor_config: Preprocessor settings; argument of :class:`akebono.preprocessor.get_preprocessor`.
            Defaults to the ``identify`` preprocessor with no kwargs.
        :type preprocessor_config: dict
        :param formatter_config_for_predictor: settings of the formatter applied to the features
        :type formatter_config_for_predictor: dict
        :param formatter_config_for_target: settings of the formatter applied to the target
        :type formatter_config_for_target: dict
        :param evaluate_enabled: whether to evaluate the model
        :type evaluate_enabled: bool
        :param fit_model_enabled: whether to fit the model
        :type fit_model_enabled: bool
        :param dump_result_enabled: whether to persist the model and the evaluation result
        :type dump_result_enabled: bool
        :return: dict holding the effective configuration plus, depending on
            the flags, the keys ``evaluate``, ``model`` and ``preprocessor``
        :rtype: dict
        :raises ValueError: if ``model_config`` or ``dataset_config`` is None
        :raises EmptyDatasetError: if the loaded predictor frame has no rows
        """
    if model_config is None:
        raise ValueError('model_config must be set.')
    if dataset_config is None:
        raise ValueError('dataset_config must be set.')

    # Work on a shallow copy: the original code wrote 'is_rebuild' straight
    # into the caller's dict, leaking state into configs that are reused.
    model_config = dict(model_config, is_rebuild=False)

    if preprocessor_config is None:
        preprocessor_config = {
            'name': 'identify',
            'kwargs': {},
        }
    # NOTE(review): the two default formatter names below look like masked
    # '<object>@<module>' references; confirm the intended values.
    if formatter_config_for_predictor is None:
        formatter_config_for_predictor = {
            'name': '*****@*****.**',
        }
    if formatter_config_for_target is None:
        formatter_config_for_target = {
            'name': '*****@*****.**',
        }

    ret = {
        'type': 'train',
        'id': train_id,
        'dataset_config': dataset_config,
        'model_config': model_config,
        'preprocessor_config': preprocessor_config,
        'formatter_config_for_predictor': formatter_config_for_predictor,
        'formatter_config_for_target': formatter_config_for_target,
        'evaluate_enabled': evaluate_enabled,
        'fit_model_enabled': fit_model_enabled,
        'dump_result_enabled': dump_result_enabled
    }

    dataset = get_dataset(dataset_config)

    preprocessor = get_preprocessor(preprocessor_config)
    preprocessor.set_operation_mode('train')
    logger.debug('load dataset start.')
    X, y = dataset.get_predictor_target()

    if X.index.size == 0:
        raise EmptyDatasetError('empty record')

    logger.debug('load dataset done.')

    model = get_model(model_config)

    format_func_for_predictor = load_object_by_str(
        formatter_config_for_predictor['name'])
    format_func_for_target = load_object_by_str(
        formatter_config_for_target['name'])

    if evaluate_enabled:
        logger.debug('evaluate start.')
        rep = model.evaluate(X, y, preprocessor, format_func_for_predictor,
                             format_func_for_target)
        gc.collect()
        logger.debug('evaluate done.')
        ret['evaluate'] = rep
    if fit_model_enabled:
        logger.debug('fit start.')
        # Only the first output of process() is used when fitting on all of X.
        fX_p, _ = preprocessor.process(X, None)
        fX = format_func_for_predictor(fX_p)
        model.fit(fX, format_func_for_target(y))
        gc.collect()
        logger.debug('fit done.')
        ret['model'] = model
    if dump_result_enabled:
        logger.debug('dump_train_result start.')
        ret['preprocessor'] = preprocessor
        dump_train_result(train_id, scenario_tag, ret)
        logger.debug('dump_train_result done.')

    return ret
예제 #5
0
def evaluate(model,
             X,
             y,
             preprocessor,
             format_func_for_predictor,
             format_func_for_target,
             train_test_split_func='[email protected]_selection',
             train_test_split_func_kwargs=None,
             cross_val_iterator=None,
             cross_val_iterator_kwargs=None,
             metrics='all'):
    """Evaluate ``model`` on one train/test split or on cross-validation folds.

    For every split the preprocessor is reset and applied, the formatter
    functions are applied, the model is fitted and predicted through
    ``_fit_and_predict`` and every metric registered for the model type is
    computed with ``_get_evaluated_result``.

    NOTE(review): the default string of ``train_test_split_func`` looks
    mangled; presumably it was ``'train_test_split@sklearn.model_selection'``
    -- confirm before relying on the default.

    :param model: model object exposing ``model_type`` and ``set_model_type``
    :param X: predictor data
    :param y: target data
    :param preprocessor: object exposing ``reset()`` and ``process(train, test)``
    :param format_func_for_predictor: callable applied to the processed predictors
    :param format_func_for_target: callable applied to the raw targets
    :param train_test_split_func: callable, or a string resolved with
        ``load_object_by_str``; used only when ``cross_val_iterator`` is None
    :param train_test_split_func_kwargs: keyword arguments for the split
        function (``None`` means no extra arguments)
    :type train_test_split_func_kwargs: dict or None
    :param cross_val_iterator: ``None`` for a single split, otherwise an
        object (or a string resolved with ``load_object_by_str``) passed to
        ``_sklearn_cross_val_iter2train_test_iter``
    :param cross_val_iterator_kwargs: arguments forwarded with the
        cross-validation iterator (``None`` means an empty dict)
    :type cross_val_iterator_kwargs: dict or None
    :param metrics: only ``'all'`` is supported
    :return: ``{'metrics': [<list of metric results per split>], 'cv': bool}``
    :rtype: dict
    :raises Exception: if ``metrics`` is not ``'all'`` or the model type is
        not one of ``binary_classifier``, ``regressor``,
        ``multiple_classifier``.  The check runs before any split or fit.
    """
    # Fresh dicts per call instead of shared mutable defaults.
    if train_test_split_func_kwargs is None:
        train_test_split_func_kwargs = {}
    if cross_val_iterator_kwargs is None:
        cross_val_iterator_kwargs = {}

    if model.model_type is None:
        model.set_model_type(y=y)
    model_type = model.model_type

    # Fail fast: previously an unsupported setting was only detected inside
    # the loop, after the preprocessor reset and a complete fit/predict.
    if metrics != 'all':
        raise Exception('not supported.')
    if model_type == 'binary_classifier':
        preload_metrics = _binary_classifier_metrics
    elif model_type == 'regressor':
        preload_metrics = _regressor_metrics
    elif model_type == 'multiple_classifier':
        preload_metrics = _multiple_classifier_metrics
    else:
        raise Exception('not supported.')

    result = {'metrics': []}

    if cross_val_iterator is None:
        # Not CV mode: train_test_split_func is a string or a callable.
        if isinstance(train_test_split_func, str):
            train_test_split_func = load_object_by_str(train_test_split_func)
        train_test_iterator = [
            train_test_split_func(X, y, **train_test_split_func_kwargs)
        ]
        result['cv'] = False
    else:
        # CV mode: cross_val_iterator is a string or an iterator object.
        if isinstance(cross_val_iterator, str):
            cross_val_iterator = load_object_by_str(cross_val_iterator)
        train_test_iterator = _sklearn_cross_val_iter2train_test_iter(
            X, y, cross_val_iterator, cross_val_iterator_kwargs)
        result['cv'] = True

    for X_train_raw, X_test_raw, y_train_raw, y_test_raw in train_test_iterator:
        y_train = format_func_for_target(y_train_raw)
        y_test = format_func_for_target(y_test_raw)

        # Reset so that state from a previous split is not carried over.
        preprocessor.reset()
        X_train_p, X_test_p = preprocessor.process(X_train_raw, X_test_raw)
        X_train = format_func_for_predictor(X_train_p)
        X_test = format_func_for_predictor(X_test_p)

        model, y_pred, y_pred_proba = _fit_and_predict(X_train, X_test,
                                                       y_train, model)
        result['metrics'].append([
            _get_evaluated_result(y_pred, y_pred_proba, y_test, m)
            for m in preload_metrics
        ])

    return result
예제 #6
0
def get_dataset(dataset_config):
    """
    Create a Dataset.

    Keys read from ``dataset_config``:

    * ``name`` -- dataset name (required when ``cache_enabled`` is True)
    * ``target_column`` -- defaults to ``'target'``
    * ``cache_enabled`` -- defaults to ``False``; wraps loading with
      ``cache_located_at`` using a ``.pkl`` file under ``settings.cache_dir``
    * ``evacuated_columns`` -- list, defaults to ``[]``
    * ``loader_config`` -- dict with ``name`` (alias or reference resolved
      with ``load_object_by_str``), optional ``kwargs`` and optional ``param``
    * ``preprocess_func`` / ``preprocess_func_kwargs`` -- function reference
      and keyword arguments applied to the loaded data
    * ``dataset_loading_cache_enabled`` -- defaults to ``True``; reuses a
      Dataset already stored in ``datasetholder`` under the same name

    ``dataset_config`` and the dicts nested in it are not modified.

    :param dataset_config: settings of the Dataset
    :type dataset_config: dict
    :return: :class:`Dataset` object
    :raises TypeError: if ``evacuated_columns`` is not a list
    :raises KeyError: if ``loader_config.param`` contains a reserved key
        (``dataset_name`` or ``target_column``)
    :raises Exception: if ``loader_config`` is not a dict, if
        ``loader_config.name`` is missing, or if ``cache_enabled`` is True
        while ``name`` is None

    Usage::

        from akebono.dataset import get_dataset
        dataset_config = {
            'loader_config': {
                'name': '<loader_function>@<module>',
                'kwargs': {
                    'n_features': 1,
                    'noise': 30.0,
                    'random_state': 0,
                },
            },
            'target_column': 'target',
            'cache_enabled': False,
        }
        ds = get_dataset(dataset_config)
        ds  # <akebono.dataset.model.Dataset object at 0x11291acc0>
    """

    dataset_name = dataset_config.get('name')
    target_column = dataset_config.get('target_column', 'target')
    cache_enabled = dataset_config.get('cache_enabled', False)
    evacuated_columns = dataset_config.get('evacuated_columns', [])
    if not isinstance(evacuated_columns, list):
        raise TypeError('evacuated_columns must be list.')
    loader_config = dataset_config.get('loader_config')
    if not isinstance(loader_config, dict):
        raise Exception(
            'loader_config must be specified and this type is dict.')
    load_func = loader_config.get('name')
    # Use the alias target when one is registered for this name.
    load_func = _loader_name_alias.get(load_func, load_func)
    if load_func is None:
        raise Exception('loader_config.name must be specified.')
    load_func = load_object_by_str(load_func)
    load_func_kwargs = Param(loader_config.get('kwargs', {}))
    loader_param = loader_config.get('param', {})
    _reserved_params = (
        'dataset_name',
        'target_column',
    )
    for rp in _reserved_params:
        if rp in loader_param:
            raise KeyError('{} is reserved param.'.format(rp))
    # Inject the reserved keys into a copy.  Writing them into the caller's
    # dict made a second call with the same config fail the check above.
    loader_param = dict(loader_param,
                        dataset_name=dataset_name,
                        target_column=target_column)

    # NOTE(review): the default below looks like a masked '<object>@<module>'
    # reference; confirm the intended preprocess function.
    preprocess_func_str = dataset_config.get(
        'preprocess_func', '*****@*****.**')
    preprocess_func_hash = get_hash(preprocess_func_str)
    preprocess_func = load_object_by_str(preprocess_func_str)
    preprocess_func_kwargs = Param(
        dataset_config.get('preprocess_func_kwargs', {}))

    def _core_func():
        # Load the raw data, then apply the dataset-level preprocess function.
        return preprocess_func(
            load_func(copy.deepcopy(load_func_kwargs.value), loader_param),
            **copy.copy(preprocess_func_kwargs.value))

    # The name encodes everything that influences the loaded content.
    fname = '{}_{}_{}_{}'.format(
        dataset_name, load_func_kwargs.get_hashed_id(length=24),
        preprocess_func_hash[:24],
        preprocess_func_kwargs.get_hashed_id(length=24))
    dataset_loading_cache_enabled = dataset_config.get(
        'dataset_loading_cache_enabled', True)
    if dataset_loading_cache_enabled:
        ds = datasetholder.get(fname)
        if ds is not None:
            logger.debug(
                'dataset_loading_cache enabled .. {} get done.'.format(
                    ds.name))
            return ds

    pkl_fname = fname + '.pkl'
    if cache_enabled:
        if dataset_name is not None:
            logger.info('dataset cache enabled')
            _core_func = cache_located_at(
                pathjoin(settings.cache_dir, pkl_fname))(_core_func)
        else:
            raise Exception(
                'dataset_config.cache_enabled is True, but dataset_config.name is None'
            )

    ds = Dataset(fname, _core_func(), target_column, evacuated_columns)

    if dataset_loading_cache_enabled:
        datasetholder.set(ds)
        logger.debug('dataset_loading_cache enabled .. {} set done.'.format(
            ds.name))
    return ds