Code example #1
File: core.py  Project: ak110/pytoolkit
    def predict_oof(self, dataset: tk.data.Dataset,
                    folds: tk.validation.FoldsType) -> np.ndarray:
        """out-of-foldなpredict結果を返す。

        Args:
            dataset: 入力データ
            folds: CVのindex

        Returns:
            推論結果

        """
        # Predict each fold's validation slice with the model trained on that fold.
        pred_list = [
            self.predict(dataset.slice(val_indices), fold)
            for fold, (_, val_indices) in enumerate(folds)
        ]
        assert len(pred_list) == len(folds)

        if isinstance(pred_list[0], list):  # multiple model outputs
            oofp = [
                self._get_oofp(dataset, folds, [p[i] for p in pred_list])
                for i in range(len(pred_list[0]))
            ]
        else:
            oofp = self._get_oofp(dataset, folds, pred_list)
        return oofp
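
The helper _get_oofp is not shown in this listing. As a rough sketch of what such a step typically does (the reassembly strategy is an assumption; only pred_list and folds come from the snippet above), it scatters each fold's validation predictions back to their original row positions:

    import numpy as np

    def get_oofp_sketch(n_rows, folds, pred_list):
        """Hypothetical: scatter per-fold validation predictions back
        into one array in the original dataset order."""
        oofp = np.empty((n_rows,) + pred_list[0].shape[1:], dtype=pred_list[0].dtype)
        for (_, val_indices), pred in zip(folds, pred_list):
            oofp[val_indices] = pred  # each row is predicted by exactly one fold
        return oofp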
Code example #2
File: keras.py  Project: sean512/pytoolkit
    def _serial_cv(self, dataset: tk.data.Dataset, folds: tk.validation.FoldsType):
        self.models = []
        score_list = []
        score_weights = []
        for fold, (train_indices, val_indices) in enumerate(folds):
            tk.log.get(__name__).info(
                f"Fold {fold + 1}/{len(folds)}: train={len(train_indices)} val={len(val_indices)}"
            )
            train_set = dataset.slice(train_indices)
            val_set = dataset.slice(val_indices)
            scores = self.train(train_set, val_set, _fold=fold)
            score_list.append(scores)
            score_weights.append(len(val_indices))

        # Weighted average of each metric across folds (weights = validation-set sizes).
        return {
            k: np.average([s[k] for s in score_list], weights=score_weights)
            for k in score_list[0]
        }
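
The return expression averages each metric across folds, weighted by validation-set size, so larger folds count proportionally more. A minimal self-contained illustration of the same np.average pattern (the fold scores below are made-up numbers):

    import numpy as np

    score_list = [{"acc": 0.80, "loss": 0.50}, {"acc": 0.90, "loss": 0.40}]
    score_weights = [100, 300]  # validation-set sizes

    averaged = {
        k: np.average([s[k] for s in score_list], weights=score_weights)
        for k in score_list[0]
    }
    print(averaged)  # {'acc': 0.875, 'loss': 0.425}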
Code example #3
File: sklearn.py  Project: sean512/pytoolkit
    def _cv(self, dataset: tk.data.Dataset, folds: tk.validation.FoldsType) -> dict:
        scores = []
        score_weights = []
        self.estimators_ = []
        for train_indices, val_indices in tk.utils.tqdm(folds, desc="cv"):
            train_set = dataset.slice(train_indices)
            val_set = dataset.slice(val_indices)

            kwargs = {}
            if train_set.weights is not None:
                # Pass per-sample weights to fit() under the configured keyword name.
                kwargs[self.weights_arg_name] = train_set.weights

            estimator = sklearn.base.clone(self.estimator)
            estimator.fit(train_set.data, train_set.labels, **kwargs)
            self.estimators_.append(estimator)

            kwargs = {}
            if val_set.weights is not None:
                kwargs[self.weights_arg_name] = val_set.weights

            scores.append(estimator.score(val_set.data, val_set.labels, **kwargs))
            score_weights.append(len(val_set))

        return {"score": np.average(scores, weights=score_weights)}
Code example #4
File: keras.py  Project: sean512/pytoolkit
    def _parallel_cv(self, dataset: tk.data.Dataset, folds: tk.validation.FoldsType):
        self.models = [self.create_model_fn() for _ in folds]

        inputs = []
        targets = []
        outputs = []
        losses = []
        metrics: dict = {n: [] for n in self.models[0].metrics_names if n != "loss"}
        for i, model in enumerate(self.models):
            input_shape = model.input_shape
            output_shape = model.output_shape
            if isinstance(input_shape, tuple):
                input_shape = [input_shape]
            if isinstance(output_shape, tuple):
                output_shape = [output_shape]

            model_inputs = [
                keras.layers.Input(s[1:], name=f"model{i}_input{j}")
                for j, s in enumerate(input_shape)
            ]
            model_targets = [
                keras.layers.Input(s[1:], name=f"model{i}_target{j}")
                for j, s in enumerate(output_shape)
            ]
            inputs.extend(model_inputs)
            targets.extend(model_targets)
            if len(model_targets) == 1:
                model_targets = model_targets[0]

            x = model(model_inputs)
            outputs.append(x)
            losses.extend([loss(model_targets, x) for loss in model.loss_functions])
            assert len(metrics) == len(model.metrics)
            for k, m in zip(metrics, model.metrics):
                metrics[k].append(m(model_targets, x))

        def loss(y_true, y_pred):
            # The real per-model losses were computed above; average them.
            del y_true, y_pred
            return tf.reduce_mean(losses, axis=0)

        for k, v in metrics.items():

            # v=v binds the current list at definition time (avoids the
            # late-binding-closure pitfall; see the note after this example).
            def metric_func(y_true, y_pred, v=v):
                del y_true, y_pred
                return tf.reduce_mean(v, axis=0)

            metric_func.__name__ = k
            metrics[k] = metric_func

        model = keras.models.Model(inputs=inputs + targets, outputs=outputs)
        model.compile(self.models[0].optimizer, loss, list(metrics.values()))
        tk.models.summary(model)

        def generator(datasets, data_loader):
            # Merge batches from every fold's dataset into one dict keyed by input name.
            iterators = [
                data_loader.iter(dataset, shuffle=True, use_horovod=True).run()
                for dataset in datasets
            ]
            while True:
                X_batch = {}
                for i, it in enumerate(iterators):
                    Xt, yt = next(it, (None, None))
                    assert Xt is not None
                    assert yt is not None

                    if isinstance(Xt, np.ndarray):
                        Xt = [Xt]
                    elif isinstance(Xt, dict):
                        Xt = Xt.values()  # TODO: ordering
                    for j, Xtj in enumerate(Xt):
                        X_batch[f"model{i}_input{j}"] = Xtj

                    if isinstance(yt, np.ndarray):
                        yt = [yt]
                    elif isinstance(yt, dict):
                        yt = yt.values()  # TODO: ordering
                    for j, ytj in enumerate(yt):
                        X_batch[f"model{i}_target{j}"] = ytj
                yield X_batch, None

        train_sets = []
        val_sets = []
        for train_indices, val_indices in folds:
            train_sets.append(dataset.slice(train_indices))
            val_sets.append(dataset.slice(val_indices))

        model.fit_generator(
            generator(train_sets, self.train_data_loader),
            # -(-a // b) is ceiling division: ceil(a / b)
            steps_per_epoch=-(-len(train_sets[0]) // self.train_data_loader.batch_size),
            validation_data=generator(val_sets, self.val_data_loader),
            validation_steps=-(-len(val_sets[0]) // self.val_data_loader.batch_size),
            **(self.fit_params or {}),
        )

        evals = model.evaluate_generator(
            generator(val_sets, self.val_data_loader),
            -(-len(val_sets[0]) // self.val_data_loader.batch_size) * 3,
        )
        scores = dict(zip(model.metrics_names, evals))
        for k, v in scores.items():
            tk.log.get(__name__).info(f"CV: val_{k}={v:,.3f}")

        return scores
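
One detail worth noting in _parallel_cv is the v=v default argument in metric_func: without it, every closure created in the loop would see the final value of the loop variable. A standalone illustration of the pitfall and the fix (unrelated to pytoolkit itself):

    # Late binding: all three closures share the loop variable i.
    funcs_wrong = [lambda: i for i in range(3)]
    print([f() for f in funcs_wrong])  # [2, 2, 2]

    # Fix: bind the current value as a default argument, as metric_func does with v=v.
    funcs_right = [lambda i=i: i for i in range(3)]
    print([f() for f in funcs_right])  # [0, 1, 2]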