def dump_train_result(train_id, scenario_tag, result):
    model = result.pop('model', None)
    preprocessor = result.pop('preprocessor', None)
    model_name = 'train_result_model_{}'.format(train_id)
    result_name = 'train_result_meta_{}'.format(train_id)
    dirpath = pathjoin(settings.operation_results_dir, scenario_tag)
    to_pickle(pathjoin(dirpath, '{}.pkl'.format(result_name)), result)  # dump result
    if model is not None:
        model.dump(dirpath, model_name)
    if preprocessor is not None:
        preprocessor.dump_with_operation_rule(dirpath, train_id)
def load(load_func_kwargs, param):
    logger.debug('csv loader invoked.')
    if param['dataset_name'] is None:
        raise ValueError('dataset_name must be set for csv loader')
    fname = param['dataset_name'] + '.csv'
    df = pd_read_csv(pathjoin(settings.datasource_dir, fname), **load_func_kwargs)
    return df
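# A minimal usage sketch for the csv loader above, showing the two arguments it receives from
# get_dataset (defined further below): kwargs forwarded to pandas.read_csv, and the reserved
# loader params. The dataset name 'mydata' and the column names are hypothetical.
def _example_csv_load():
    load_func_kwargs = {'usecols': ['x1', 'x2', 'target']}          # forwarded to pandas.read_csv
    param = {'dataset_name': 'mydata', 'target_column': 'target'}   # reads <datasource_dir>/mydata.csv
    return load(load_func_kwargs, param)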
def get_preprocessor_for_prediction(scenario_tag, train_id, train_result=None, dirpath=None):
    if train_result is None:
        logger.debug('train_result is None .. load from scenario_tag: {}, train_id: {}'.format(scenario_tag, train_id))
        train_result = get_train_result(scenario_tag=scenario_tag, train_id=train_id)
    preprocessor = get_preprocessor(train_result['preprocessor_config'])
    preprocessor.set_operation_mode('predict')
    if dirpath is None:
        dirpath = pathjoin(settings.operation_results_dir, scenario_tag)
    preprocessor.load_with_operation_rule(dirpath, train_id)
    return preprocessor
def dump_predicted_result(predict_id, scenario_tag, dumper_config, df, meta):
    dirpath = pathjoin(settings.operation_results_dir, scenario_tag)
    fname_meta = 'predict_result_meta_{}'.format(predict_id)
    fname = 'predict_result_{}'.format(predict_id)
    dump_result_format = dumper_config['name']
    if dump_result_format == 'csv':
        pd_to_csv(df, pathjoin(dirpath, fname + '.csv'), index=False)
    elif dump_result_format == 'pickle':
        to_pickle(pathjoin(dirpath, fname + '.pkl'), df)
    elif dump_result_format == 'bigquery':
        import pandas_gbq
        destination_table = dumper_config.get('destination_table')
        if destination_table is None:
            raise ValueError('destination_table must be set for bigquery dumper.')
        if dumper_config.get('add_predict_id_enabled', True):
            destination_table += ('_' + predict_id)
        pandas_gbq.to_gbq(df, destination_table, **dumper_config.get('kwargs', {}))
    else:
        raise Exception('invalid dumper format: {}'.format(dump_result_format))
    to_pickle(pathjoin(dirpath, fname_meta + '.pkl'), meta)
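# The dumper_config shapes accepted above, inferred from the branches of dump_predicted_result;
# a minimal sketch, not an exhaustive list of options. The table and project names are hypothetical.
_example_dumper_configs = {
    'csv': {'name': 'csv'},        # written to <dirpath>/predict_result_<predict_id>.csv
    'pickle': {'name': 'pickle'},  # written to <dirpath>/predict_result_<predict_id>.pkl
    'bigquery': {
        'name': 'bigquery',
        'destination_table': 'mydataset.predictions',   # hypothetical table name
        'add_predict_id_enabled': True,                  # appends '_<predict_id>' to the table name
        'kwargs': {'project_id': 'my-project'},          # passed through to pandas_gbq.to_gbq
    },
}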
def get_scenario_ids(scenario_tag):
    dirpath = pathjoin(settings.operation_results_dir, scenario_tag)
    filenames = list_directory(dirpath, mode='filename')
    train_regexps = [re.search(_train_result_meta_pattern, fn) for fn in filenames]
    train_ids = [tr.group(1) for tr in train_regexps if tr is not None]
    predict_regexps = [re.search(_predict_result_meta_pattern, fn) for fn in filenames]
    predict_ids = [pr.group(1) for pr in predict_regexps if pr is not None]
    return {
        'train': train_ids,
        'predict': predict_ids,
    }
def load_train_results(scenario_tag='default', train_ids='all'):
    dirpath = pathjoin(settings.operation_results_dir, scenario_tag)
    logger.debug('dirpath: {}'.format(dirpath))
    file_paths = list_directory(dirpath)
    result_paths = [
        fp for fp in file_paths
        if re.search(r'.+\{}train_result_meta_[0-9]+\.pkl$'.format(os.sep), fp) is not None
    ]
    results = [from_pickle(rp) for rp in result_paths]
    if not (isinstance(train_ids, list) or train_ids == 'all'):
        raise ValueError('train_ids must be list type or str "all"')
    if train_ids != 'all':
        results = [r for r in results if r['id'] in train_ids]
    # convert evaluate result to pandas.DataFrame
    for idx, r in enumerate(results):
        if 'evaluate' in r:
            results[idx]['evaluate']['metrics'] = pd.DataFrame(
                [{o['name']: o['value'] for o in met} for met in r['evaluate']['metrics']])
    return results
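# A minimal usage sketch for load_train_results above; the scenario tag is hypothetical.
# When a result has an 'evaluate' entry, its 'metrics' value has already been converted
# to a pandas.DataFrame with one column per metric name.
def _example_load_train_results():
    results = load_train_results(scenario_tag='default', train_ids='all')
    for r in results:
        if 'evaluate' in r:
            print(r['id'], r['evaluate']['metrics'])   # metrics as a DataFrame
    return results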
def load_sklearn_model(model, dirpath, model_name):
    model._value = from_pickle(pathjoin(dirpath, '{}.pkl'.format(model_name)))
    if model._value is None:
        raise Exception('load {} failed.'.format(model_name))
    return model
def dump_sklearn_model(obj, dirpath, model_name):
    return to_pickle(pathjoin(dirpath, '{}.pkl'.format(model_name)), obj.value)
def predict(predict_id, scenario_tag,
            method_type='predict', dataset_config=None, train_id='0',
            dump_result_enabled=False, append_evacuated_columns_enabled=False,
            dumper_config={}, result_target_columns='all',
            result_predict_column='predicted'):
    """
    Run a prediction.

    :param predict_id: Identifier of this prediction run within the scenario.
    :type predict_id: str
    :param scenario_tag: Tag attached to the scenario.
    :type scenario_tag: str
    :param method_type: Prediction method type. Allowed values are `predict` or `predict_proba`.
    :type method_type: str
    :param dataset_config: Dataset configuration, passed as the argument to :class:`akebono.dataset.get_dataset`.
    :type dataset_config: dict
    :param train_id: train_id of the model used for prediction.
    :type train_id: str
    :param dump_result_enabled: Flag that controls whether the prediction result is persisted.
    :type dump_result_enabled: bool
    :param append_evacuated_columns_enabled: Flag that controls whether columns evacuated in the Dataset are appended to the prediction result.
    :type append_evacuated_columns_enabled: bool
    :param dumper_config: Configuration for dumping the prediction result.
    :type dumper_config: dict
    :param result_target_columns: List of predictor column names to include in the prediction result; use 'all' to include every column.
    :type result_target_columns: str or list(str)
    :param result_predict_column: Name of the column in which the prediction is stored.
    :type result_predict_column: str
    """
    if dataset_config is None:
        raise ValueError('dataset_config must be set.')
    if dump_result_enabled and 'name' not in dumper_config:
        raise ValueError('`name` key must be contained in dumper_config.')
    train_id = str(train_id)
    tr = get_train_result(scenario_tag=scenario_tag, train_id=train_id)
    model, model_config = get_trained_model(scenario_tag, train_id, train_result=tr)
    ret = {
        'type': 'predict',
        'method_type': method_type,
        'dataset_config': dataset_config,
        'train_id': train_id,
        'dump_result_enabled': dump_result_enabled,
        'dumper_config': dumper_config,
        'result_target_columns': result_target_columns,
        'result_predict_column': result_predict_column,
        'train_result': tr,
        'model_config': model_config,
    }
    dataset_config['target_column'] = None  # when target_column is None, a Dataset for prediction is returned
    dataset = get_dataset(dataset_config)
    dirpath = pathjoin(settings.operation_results_dir, scenario_tag)
    preprocessor = get_preprocessor_for_prediction(scenario_tag, train_id, train_result=tr, dirpath=dirpath)
    X = dataset.get_predictor()
    if X.index.size == 0:
        raise EmptyDatasetError('empty record')
    fX, _ = preprocessor.process(X, None)
    gc.collect()
    predict_func = getattr(model, method_type, None)
    if predict_func is None:
        raise Exception('{} is not defined.'.format(method_type))
    rawresult = predict_func(fX)
    gc.collect()
    predict_result = fX.copy()
    if not result_target_columns == 'all':
        if not isinstance(result_target_columns, list):
            raise TypeError('result_target_columns must be list.')
        predict_result = predict_result[result_target_columns]
    # Wrapped in a list so the values can be stored in the predict_result DataFrame even when len(rawresult.shape) > 1.
    # Whether predict_result should really be a single DataFrame is worth reconsidering.
    predict_result.loc[:, result_predict_column] = list(rawresult)
    if append_evacuated_columns_enabled:
        predict_result = pd.concat([dataset.get_evacuated(), predict_result], axis=1)
    if dump_result_enabled:
        logger.debug('dump_predicted_result start.')
        dump_predicted_result(predict_id, scenario_tag, dumper_config, predict_result, ret)
        logger.debug('dump_predicted_result done.')
    ret['predict_result'] = predict_result
    return ret
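# A minimal usage sketch for predict above. The dataset name, the loader alias 'csv', and the
# dumper settings are hypothetical; dataset_config follows the keys read by get_dataset, and
# dumper_config follows the formats handled by dump_predicted_result.
def _example_predict():
    dataset_config = {
        'name': 'mydata',
        'loader_config': {'name': 'csv'},   # 'csv' assumed to be a registered loader alias
    }
    result = predict(
        'p0',                 # predict_id
        'default',            # scenario_tag
        dataset_config=dataset_config,
        train_id='0',
        dump_result_enabled=True,
        dumper_config={'name': 'csv'},
    )
    return result['predict_result']   # pandas.DataFrame with the 'predicted' column appended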
def load(self, dirpath, name):
    self._value.load_model(pathjoin(dirpath, name + '.bin'))
    return self
def dump(self, dirpath, name):
    self.value.save_model(pathjoin(dirpath, name + '.bin'))
    return self
def get_dataset(dataset_config):
    """
    Function that builds a Dataset.

    :param dataset_config: Configuration of the Dataset.
    :type dataset_config: dict
    :return: :class:`Dataset` object

    Usage:
        >>> from akebono.dataset import get_dataset
        >>> dataset_config = {
                'loader_config': {
                    'name': '*****@*****.**',
                    'kwargs': {
                        'n_features': 1,
                        'noise': 30.0,
                        'random_state': 0,
                    },
                },
                'target_column': 'target',
                'cache_enabled': False,
            }
        >>> ds = get_dataset(dataset_config)
        >>> ds
        <akebono.dataset.model.Dataset object at 0x11291acc0>
    """
    dataset_name = dataset_config.get('name')
    target_column = dataset_config.get('target_column', 'target')
    cache_enabled = dataset_config.get('cache_enabled', False)
    evacuated_columns = dataset_config.get('evacuated_columns', [])
    if not isinstance(evacuated_columns, list):
        raise TypeError('evacuated_columns must be list.')
    loader_config = dataset_config.get('loader_config')
    if not isinstance(loader_config, dict):
        raise Exception('loader_config must be specified and its type must be dict.')
    load_func = loader_config.get('name')
    load_func = _loader_name_alias.get(load_func, load_func)  # use the alias if one is registered
    if load_func is None:
        raise Exception('loader_config.name must be specified.')
    load_func = load_object_by_str(load_func)
    load_func_kwargs = Param(loader_config.get('kwargs', {}))
    loader_param = loader_config.get('param', {})
    _reserved_params = (
        'dataset_name',
        'target_column',
    )
    for rp in _reserved_params:
        if rp in loader_param:
            raise KeyError('{} is reserved param.'.format(rp))
    loader_param['dataset_name'] = dataset_name
    loader_param['target_column'] = target_column

    preprocess_func_str = dataset_config.get('preprocess_func', '*****@*****.**')
    preprocess_func_hash = get_hash(preprocess_func_str)
    preprocess_func = load_object_by_str(preprocess_func_str)
    preprocess_func_kwargs = Param(dataset_config.get('preprocess_func_kwargs', {}))

    def _core_func():
        return preprocess_func(
            load_func(copy.deepcopy(load_func_kwargs.value), loader_param),
            **copy.copy(preprocess_func_kwargs.value))

    fname = '{}_{}_{}_{}'.format(
        dataset_name,
        load_func_kwargs.get_hashed_id(length=24),
        preprocess_func_hash[:24],
        preprocess_func_kwargs.get_hashed_id(length=24))

    dataset_loading_cache_enabled = dataset_config.get('dataset_loading_cache_enabled', True)
    if dataset_loading_cache_enabled:
        ds = datasetholder.get(fname)
        if ds is not None:
            logger.debug('dataset_loading_cache enabled .. {} get done.'.format(ds.name))
            return ds

    pkl_fname = fname + '.pkl'
    if cache_enabled:
        if dataset_name is not None:
            logger.info('dataset cache enabled')
            _core_func = cache_located_at(pathjoin(settings.cache_dir, pkl_fname))(_core_func)
        else:
            raise Exception('dataset_config.cache_enabled is True, but dataset_config.name is None')

    ds = Dataset(fname, _core_func(), target_column, evacuated_columns)
    if dataset_loading_cache_enabled:
        datasetholder.set(ds)
        logger.debug('dataset_loading_cache enabled .. {} set done.'.format(ds.name))
    return ds
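# A fuller dataset_config sketch than the docstring's Usage example, covering the optional keys
# read by get_dataset above. The loader alias 'csv', the preprocess function path, and the column
# names are hypothetical; the 'module@attr'-style path only mirrors the '@' form of the names
# shown elsewhere in the config.
def _example_full_dataset_config():
    return {
        'name': 'mydata',
        'target_column': 'target',
        'evacuated_columns': ['user_id'],       # kept aside by the Dataset, not fed to the model
        'cache_enabled': True,                  # pickle the built dataset under settings.cache_dir
        'dataset_loading_cache_enabled': True,  # reuse an in-process Dataset from datasetholder
        'loader_config': {
            'name': 'csv',                      # resolved through _loader_name_alias
            'kwargs': {'usecols': ['user_id', 'x1', 'target']},
            'param': {},                        # 'dataset_name' and 'target_column' are reserved keys
        },
        'preprocess_func': 'mypackage.preprocess@clean',    # hypothetical
        'preprocess_func_kwargs': {'dropna_enabled': True},  # hypothetical
    }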