def predict(self, dataset: tk.data.Dataset) -> typing.List[np.ndarray]:
    """Return the predictions as a list.

    Args:
        dataset (tk.data.Dataset): input data

    Returns:
        len(self.folds) predictions (one per fold)

    """
    dataset = dataset.copy()
    if self.preprocessors is not None:
        dataset.data = self.preprocessors.transform(dataset.data)

    pred_list = self._predict(dataset)

    if self.postprocessors is not None:
        for i in range(len(pred_list)):
            if pred_list[i].ndim <= 1:
                # 1-D predictions: expand to 2-D for inverse_transform, then squeeze back.
                pred_list[i] = np.squeeze(
                    self.postprocessors.inverse_transform(
                        np.expand_dims(pred_list[i], axis=-1)
                    ),
                    axis=-1,
                )
            else:
                pred_list[i] = self.postprocessors.inverse_transform(pred_list[i])

    return pred_list
def predict(self, dataset: tk.data.Dataset, fold: int) -> np.ndarray:
    """Return the inference result.

    Args:
        dataset: input data
        fold: index of the model (fold) to use

    Returns:
        the inference result

    """
    dataset = dataset.copy()
    if self.preprocessors is not None:
        dataset.data = self.preprocessors.transform(dataset.data)

    pred = self._predict(dataset, fold)

    if self.postprocessors is not None:
        if isinstance(pred, np.ndarray) and pred.ndim <= 1:
            pred = np.squeeze(
                self.postprocessors.inverse_transform(np.expand_dims(pred, axis=-1)),
                axis=-1,
            )
        else:
            pred = self.postprocessors.inverse_transform(pred)

    return pred
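# A minimal usage sketch for the two predict variants above (the instances
# `model_a` and `model_b` and the dataset `test_set` are hypothetical):
import numpy as np

pred_list = model_a.predict(test_set)  # list variant: one prediction per fold
pred_mean = np.mean(pred_list, axis=0)  # simple fold averaging
pred_fold0 = model_b.predict(test_set, fold=0)  # single-fold variant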
def cv(self, dataset: tk.data.Dataset, folds: tk.validation.FoldsType) -> Model:
    """Run cross-validation and save the models.

    Args:
        dataset: input data
        folds: CV indices

    Returns:
        self

    """
    dataset = dataset.copy()
    if self.preprocessors is not None:
        dataset.data = self.preprocessors.fit_transform(dataset.data, dataset.labels)
    if self.postprocessors is not None:
        dataset.labels = np.squeeze(
            self.postprocessors.fit_transform(np.expand_dims(dataset.labels, axis=-1)),
            axis=-1,
        )

    self._cv(dataset, folds)

    if self.save_on_cv:
        self.save()
    return self
def _cv(self, dataset: tk.data.Dataset, folds: tk.validation.FoldsType) -> None:
    evals_list = []
    score_weights = []
    self.estimators_ = []
    for fold, (train_set, val_set) in tk.utils.tqdm(
        enumerate(dataset.iter(folds)), total=len(folds), desc="cv"
    ):
        kwargs = {}
        if train_set.weights is not None:
            kwargs[self.weights_arg_name] = train_set.weights
        estimator = sklearn.base.clone(self.estimator)
        estimator.fit(train_set.data, train_set.labels, **kwargs)
        self.estimators_.append(estimator)

        kwargs = {}
        if val_set.weights is not None:
            kwargs[self.weights_arg_name] = val_set.weights
        if self.score_fn is None:
            evals = {"score": estimator.score(val_set.data, val_set.labels, **kwargs)}
        else:
            pred_val = self._predict(val_set, fold)
            evals = self.score_fn(val_set.labels, pred_val)
        evals_list.append(evals)
        score_weights.append(len(val_set))

    evals = tk.evaluations.mean(evals_list, weights=score_weights)
    logger.info(f"cv: {tk.evaluations.to_str(evals)}")
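# For reference, the aggregation performed by tk.evaluations.mean above
# amounts to a weighted mean of each metric over folds. A sketch of the
# assumed behavior (not the library's actual implementation):
import numpy as np

def weighted_mean_evals(evals_list, weights):
    """Weighted mean of a list of {metric: value} dicts."""
    return {
        k: np.average([e[k] for e in evals_list], weights=weights)
        for k in evals_list[0]
    }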
def predict_oof(
    self, dataset: tk.data.Dataset, folds: tk.validation.FoldsType
) -> np.ndarray:
    """Return out-of-fold predictions.

    Args:
        dataset: input data
        folds: CV indices

    Returns:
        inference results

    """
    pred_list = [
        self.predict(dataset.slice(val_indices), fold)
        for fold, (_, val_indices) in enumerate(folds)
    ]
    assert len(pred_list) == len(folds)

    if isinstance(pred_list[0], list):  # multiple outputs
        oofp = [
            self._get_oofp(dataset, folds, [p[i] for p in pred_list])
            for i in range(len(pred_list[0]))
        ]
    else:
        oofp = self._get_oofp(dataset, folds, pred_list)
    return oofp
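# `_get_oofp` is not shown in this snippet. A plausible sketch of what it must
# do, under the assumption that it scatters each fold's validation predictions
# back to the original row order (name and signature hypothetical):
import numpy as np

def get_oofp_sketch(dataset, folds, pred_list):
    oofp = np.zeros((len(dataset),) + pred_list[0].shape[1:], dtype=pred_list[0].dtype)
    for (_, val_indices), pred in zip(folds, pred_list):
        oofp[val_indices] = pred
    return oofp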
def get_data(self, dataset: tk.data.Dataset, index: int):
    X, y = dataset.get_data(index)
    X = tk.ndimage.load(X)
    X = self.aug1(image=X)["image"]
    y = tf.keras.utils.to_categorical(y, num_classes) if y is not None else None
    return X, y
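# For example, with num_classes == 3 (num_classes is assumed to be defined at
# module scope in the original), to_categorical one-hot encodes a class index:
#   tf.keras.utils.to_categorical(2, 3)  # -> array([0., 0., 1.], dtype=float32)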
def _serial_cv(self, dataset: tk.data.Dataset, folds: tk.validation.FoldsType):
    self.models = []
    score_list = []
    score_weights = []
    for fold, (train_indices, val_indices) in enumerate(folds):
        tk.log.get(__name__).info(
            f"Fold {fold + 1}/{len(folds)}: train={len(train_indices)} val={len(val_indices)}"
        )
        train_set = dataset.slice(train_indices)
        val_set = dataset.slice(val_indices)
        scores = self.train(train_set, val_set, _fold=fold)
        score_list.append(scores)
        score_weights.append(len(val_indices))
    return {
        k: np.average([s[k] for s in score_list], weights=score_weights)
        for k in score_list[0]
    }
def cv(
    self,
    dataset: tk.data.Dataset,
    folds: tk.validation.FoldsType,
    models_dir: pathlib.Path,
) -> dict:
    """Run cross-validation and save the models.

    Args:
        dataset: input data
        folds: CV indices
        models_dir: output directory (None to skip saving)

    Returns:
        metric names and values

    """
    if models_dir is not None:
        models_dir = pathlib.Path(models_dir)
        models_dir.mkdir(parents=True, exist_ok=True)

    dataset = dataset.copy()
    if self.preprocessors is not None:
        dataset.data = self.preprocessors.fit_transform(dataset.data, dataset.labels)
    if self.postprocessors is not None:
        dataset.labels = np.squeeze(
            self.postprocessors.fit_transform(np.expand_dims(dataset.labels, axis=-1)),
            axis=-1,
        )

    scores = self._cv(dataset, folds)

    if models_dir is not None:
        if self.preprocessors is not None:
            tk.utils.dump(self.preprocessors, models_dir / "preprocessors.pkl")
        if self.postprocessors is not None:
            tk.utils.dump(self.postprocessors, models_dir / "postprocessors.pkl")
        self._save(models_dir)

    return scores
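# A minimal usage sketch, assuming a pipeline instance `pipeline` and a
# training dataset `train_set` (both hypothetical); folds built here with
# scikit-learn match the (train_indices, val_indices) pairs expected above:
import pathlib

import sklearn.model_selection

kf = sklearn.model_selection.KFold(n_splits=5, shuffle=True, random_state=0)
folds = list(kf.split(train_set.data))
scores = pipeline.cv(train_set, folds, models_dir=pathlib.Path("models"))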
def _serial_cv(self, dataset: tk.data.Dataset, folds: tk.validation.FoldsType):
    evals_list = []
    evals_weights = []
    for fold, (train_set, val_set) in enumerate(dataset.iter(folds)):
        logger.info(f"fold{fold}: train={len(train_set)} val={len(val_set)}")
        evals = self.train(train_set, val_set, fold=fold)
        evals_list.append(evals)
        evals_weights.append(len(val_set))
    evals = tk.evaluations.mean(evals_list, weights=evals_weights)
    logger.info(f"cv: {tk.evaluations.to_str(evals)}")
def get_data(self, dataset: tk.data.Dataset, index: int):
    X, y = dataset.get_data(index)
    X = tk.ndimage.load(X)
    if self.mask:
        y = tk.ndimage.load(y)
        a = self.aug(image=X, mask=y)
        X = a["image"]
        y = a["mask"]
    else:
        a = self.aug(image=X)
        X = a["image"]
    return X, y
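# `self.aug` is an albumentations-style transform: called with `image=` (and
# optionally `mask=`) keyword arguments, it returns a dict of transformed
# arrays. A minimal sketch of how such a transform might be built (an
# assumption, not the original configuration):
import albumentations as A

aug = A.Compose([A.HorizontalFlip(p=0.5), A.RandomBrightnessContrast(p=0.2)])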
def _cv(self, dataset: tk.data.Dataset, folds: tk.validation.FoldsType) -> dict:
    scores = []
    score_weights = []
    self.estimators_ = []
    for train_indices, val_indices in tk.utils.tqdm(folds, desc="cv"):
        train_set = dataset.slice(train_indices)
        val_set = dataset.slice(val_indices)

        kwargs = {}
        if train_set.weights is not None:
            kwargs[self.weights_arg_name] = train_set.weights
        estimator = sklearn.base.clone(self.estimator)
        estimator.fit(train_set.data, train_set.labels, **kwargs)
        self.estimators_.append(estimator)

        kwargs = {}
        if val_set.weights is not None:
            kwargs[self.weights_arg_name] = val_set.weights
        scores.append(estimator.score(val_set.data, val_set.labels, **kwargs))
        score_weights.append(len(val_set))

    return {"score": np.average(scores, weights=score_weights)}
def _parallel_cv(self, dataset: tk.data.Dataset, folds: tk.validation.FoldsType):
    self.models = [self.create_model_fn() for _ in folds]

    inputs = []
    targets = []
    outputs = []
    losses = []
    metrics: dict = {n: [] for n in self.models[0].metrics_names if n != "loss"}
    for i, model in enumerate(self.models):
        input_shape = model.input_shape
        output_shape = model.output_shape
        if isinstance(input_shape, tuple):
            input_shape = [input_shape]
        if isinstance(output_shape, tuple):
            output_shape = [output_shape]

        model_inputs = [
            keras.layers.Input(s[1:], name=f"model{i}_input{j}")
            for j, s in enumerate(input_shape)
        ]
        model_targets = [
            keras.layers.Input(s[1:], name=f"model{i}_target{j}")
            for j, s in enumerate(output_shape)
        ]
        inputs.extend(model_inputs)
        targets.extend(model_targets)
        if len(model_targets) == 1:
            model_targets = model_targets[0]

        x = model(model_inputs)
        outputs.append(x)
        losses.extend([loss(model_targets, x) for loss in model.loss_functions])
        assert len(metrics) == len(model.metrics)
        for k, m in zip(metrics, model.metrics):
            metrics[k].append(m(model_targets, x))

    def loss(y_true, y_pred):
        del y_true, y_pred
        return tf.reduce_mean(losses, axis=0)

    for k, v in metrics.items():

        def metric_func(y_true, y_pred, v=v):
            del y_true, y_pred
            return tf.reduce_mean(v, axis=0)

        metric_func.__name__ = k
        metrics[k] = metric_func

    model = keras.models.Model(inputs=inputs + targets, outputs=outputs)
    model.compile(self.models[0].optimizer, loss, list(metrics.values()))
    tk.models.summary(model)

    def generator(datasets, data_loader):
        iterators = [
            data_loader.iter(dataset, shuffle=True, use_horovod=True).run()
            for dataset in datasets
        ]
        while True:
            X_batch = {}
            for i, it in enumerate(iterators):
                Xt, yt = next(it, (None, None))
                assert Xt is not None
                assert yt is not None
                if isinstance(Xt, np.ndarray):
                    Xt = [Xt]
                elif isinstance(Xt, dict):
                    Xt = Xt.values()  # TODO: ordering of dict values
                for j, Xtj in enumerate(Xt):
                    X_batch[f"model{i}_input{j}"] = Xtj
                if isinstance(yt, np.ndarray):
                    yt = [yt]
                elif isinstance(yt, dict):
                    yt = yt.values()  # TODO: ordering of dict values
                for j, ytj in enumerate(yt):
                    X_batch[f"model{i}_target{j}"] = ytj
            yield X_batch, None

    train_sets = []
    val_sets = []
    for train_indices, val_indices in folds:
        train_sets.append(dataset.slice(train_indices))
        val_sets.append(dataset.slice(val_indices))

    model.fit_generator(
        generator(train_sets, self.train_data_loader),
        steps_per_epoch=-(-len(train_sets[0]) // self.train_data_loader.batch_size),
        validation_data=generator(val_sets, self.val_data_loader),
        validation_steps=-(-len(val_sets[0]) // self.val_data_loader.batch_size),
        **(self.fit_params or {}),
    )

    evals = model.evaluate_generator(
        generator(val_sets, self.val_data_loader),
        -(-len(val_sets[0]) // self.val_data_loader.batch_size) * 3,
    )
    scores = dict(zip(model.metrics_names, evals))
    for k, v in scores.items():
        tk.log.get(__name__).info(f"CV: val_{k}={v:,.3f}")
    return scores
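# Note: the step counts above use the negation trick for integer ceiling
# division:
assert -(-7 // 2) == 4  # equivalent to math.ceil(7 / 2) without floats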