def predict_oof(self, dataset: tk.data.Dataset, folds: tk.validation.FoldsType) -> np.ndarray: """out-of-foldなpredict結果を返す。 Args: dataset: 入力データ folds: CVのindex Returns: 推論結果 """ pred_list = [ self.predict(dataset.slice(val_indices), fold) for fold, (_, val_indices) in enumerate(folds) ] assert len(pred_list) == len(folds) if isinstance(pred_list[0], list): # multiple output oofp = [ self._get_oofp(dataset, folds, [p[i] for p in pred_list]) for i in range(len(pred_list[0])) ] else: oofp = self._get_oofp(dataset, folds, pred_list) return oofp
def _serial_cv(self, dataset: tk.data.Dataset, folds: tk.validation.FoldsType):
    """Run cross-validation one fold at a time.

    Trains on each fold via ``self.train`` and returns, per score key, the
    average over folds weighted by validation-set size.
    """
    self.models = []
    fold_scores = []
    weights = []
    for fold, (train_indices, val_indices) in enumerate(folds):
        tk.log.get(__name__).info(
            f"Fold {fold + 1}/{len(folds)}: train={len(train_indices)} val={len(val_indices)}"
        )
        scores = self.train(
            dataset.slice(train_indices), dataset.slice(val_indices), _fold=fold
        )
        fold_scores.append(scores)
        weights.append(len(val_indices))
    # Weight each fold's metrics by how many validation samples produced them.
    return {
        key: np.average([s[key] for s in fold_scores], weights=weights)
        for key in fold_scores[0]
    }
def _cv(self, dataset: tk.data.Dataset, folds: tk.validation.FoldsType) -> dict:
    """Fit one cloned estimator per fold and return the weighted mean score.

    Fitted estimators are kept on ``self.estimators_``; the returned score is
    averaged over folds, weighted by validation-set size.
    """
    fold_scores = []
    fold_sizes = []
    self.estimators_ = []

    def _weight_kwargs(subset):
        # Forward sample weights only when the subset actually carries them.
        if subset.weights is None:
            return {}
        return {self.weights_arg_name: subset.weights}

    for train_indices, val_indices in tk.utils.tqdm(folds, desc="cv"):
        train_set = dataset.slice(train_indices)
        val_set = dataset.slice(val_indices)
        estimator = sklearn.base.clone(self.estimator)
        estimator.fit(train_set.data, train_set.labels, **_weight_kwargs(train_set))
        self.estimators_.append(estimator)
        fold_scores.append(
            estimator.score(val_set.data, val_set.labels, **_weight_kwargs(val_set))
        )
        fold_sizes.append(len(val_set))

    return {"score": np.average(fold_scores, weights=fold_sizes)}
def _parallel_cv(self, dataset: tk.data.Dataset, folds: tk.validation.FoldsType):
    """Cross-validate by training all folds simultaneously in one Keras graph.

    Builds one sub-model per fold, feeds the targets in as extra Input layers
    so per-fold losses/metrics can be computed inside the graph, merges
    everything into a single wrapper model, trains it with a combined
    generator, and returns the wrapper model's evaluation scores.

    Args:
        dataset: input data
        folds: CV indices

    Returns:
        dict of metric name -> averaged value from evaluate_generator.
    """
    self.models = [self.create_model_fn() for _ in folds]
    inputs = []
    targets = []
    outputs = []
    losses = []
    # One list of per-fold metric tensors per metric name; "loss" is handled separately.
    metrics: dict = {n: [] for n in self.models[0].metrics_names if n != "loss"}
    for i, model in enumerate(self.models):
        input_shape = model.input_shape
        output_shape = model.output_shape
        # Normalize single-input/-output shapes (plain tuples) to lists for uniform handling.
        if isinstance(input_shape, tuple):
            input_shape = [input_shape]
        if isinstance(output_shape, tuple):
            output_shape = [output_shape]
        # Input names must match the batch-dict keys produced by generator() below.
        model_inputs = [
            keras.layers.Input(s[1:], name=f"model{i}_input{j}")
            for j, s in enumerate(input_shape)
        ]
        # Targets become graph inputs so losses/metrics are computed in-graph.
        model_targets = [
            keras.layers.Input(s[1:], name=f"model{i}_target{j}")
            for j, s in enumerate(output_shape)
        ]
        inputs.extend(model_inputs)
        targets.extend(model_targets)
        if len(model_targets) == 1:
            model_targets = model_targets[0]
        x = model(model_inputs)
        outputs.append(x)
        # NOTE(review): relies on the (internal) model.loss_functions /
        # model.metrics attributes pairing up with the model's outputs — confirm
        # against the Keras version in use.
        losses.extend([loss(model_targets, x) for loss in model.loss_functions])
        assert len(metrics) == len(model.metrics)
        for k, m in zip(metrics, model.metrics):
            metrics[k].append(m(model_targets, x))

    def loss(y_true, y_pred):
        # Dummy loss for compile(): the real per-fold losses were built above,
        # so y_true/y_pred are ignored and the fold losses are averaged.
        del y_true, y_pred
        return tf.reduce_mean(losses, axis=0)

    for k, v in metrics.items():
        # v=v binds the current list (avoids the late-binding closure pitfall).
        def metric_func(y_true, y_pred, v=v):
            del y_true, y_pred
            return tf.reduce_mean(v, axis=0)

        metric_func.__name__ = k
        metrics[k] = metric_func

    model = keras.models.Model(inputs=inputs + targets, outputs=outputs)
    model.compile(self.models[0].optimizer, loss, list(metrics.values()))
    tk.models.summary(model)

    def generator(datasets, data_loader):
        """Yield one merged batch dict feeding every fold's inputs and targets."""
        iterators = [
            data_loader.iter(dataset, shuffle=True, use_horovod=True).run()
            for dataset in datasets
        ]
        while True:
            X_batch = {}
            for i, it in enumerate(iterators):
                Xt, yt = next(it, (None, None))
                # Iterators are expected to be endless; exhaustion indicates a bug.
                assert Xt is not None
                assert yt is not None
                if isinstance(Xt, np.ndarray):
                    Xt = [Xt]
                elif isinstance(Xt, dict):
                    Xt = Xt.values()  # TODO: ordering
                for j, Xtj in enumerate(Xt):
                    X_batch[f"model{i}_input{j}"] = Xtj
                if isinstance(yt, np.ndarray):
                    yt = [yt]
                elif isinstance(yt, dict):
                    yt = yt.values()  # TODO: ordering
                for j, ytj in enumerate(yt):
                    X_batch[f"model{i}_target{j}"] = ytj
            # Targets are already inside X_batch, so the Keras y is None.
            yield X_batch, None

    train_sets = []
    val_sets = []
    for train_indices, val_indices in folds:
        train_sets.append(dataset.slice(train_indices))
        val_sets.append(dataset.slice(val_indices))

    # -(-a // b) is ceil division; step counts are sized from fold 0
    # (presumably all folds are near-equal in size — confirm).
    model.fit_generator(
        generator(train_sets, self.train_data_loader),
        steps_per_epoch=-(-len(train_sets[0]) // self.train_data_loader.batch_size),
        validation_data=generator(val_sets, self.val_data_loader),
        validation_steps=-(-len(val_sets[0]) // self.val_data_loader.batch_size),
        **(self.fit_params or {}),
    )

    evals = model.evaluate_generator(
        generator(val_sets, self.val_data_loader),
        # NOTE(review): 3x passes over the validation data — presumably to
        # average out shuffling noise; confirm intent.
        -(-len(val_sets[0]) // self.val_data_loader.batch_size) * 3,
    )
    scores = dict(zip(model.metrics_names, evals))
    for k, v in scores.items():
        tk.log.get(__name__).info(f"CV: val_{k}={v:,.3f}")
    return scores