Example #1
def from_checkpoint(params: PredictorParams, checkpoint: str, auto_update_checkpoints=True):
    ckpt = SavedCalamariModel(checkpoint, auto_update=False)
    trainer_params = Scenario.trainer_params_from_dict(ckpt.dict)
    trainer_params.scenario_params.data_params.pre_processors_.run_parallel = False
    trainer_params.scenario_params.data_params.post_processors_.run_parallel = False
    scenario = Scenario(trainer_params.scenario_params)
    predictor = Predictor(params, scenario.create_data())
    ckpt = SavedCalamariModel(checkpoint, auto_update=auto_update_checkpoints)  # Device params must be specified first
    predictor.set_model(keras.models.load_model(ckpt.ckpt_path + '.h5', custom_objects=Scenario.model_cls().get_all_custom_objects()))
    return predictor
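A minimal usage sketch for this constructor; the checkpoint path and the raw line images are illustrative assumptions, and predict_raw with sample.outputs.sentence follows the Calamari 2 prediction API:

    predictor = Predictor.from_checkpoint(PredictorParams(silent=True), checkpoint="models/best.ckpt")
    for sample in predictor.predict_raw(raw_images):  # raw_images: assumed list of grayscale line images
        print(sample.outputs.sentence)  # best transcription per line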
Example #2
    def __init__(self, params: TrainerParams, scenario, restore=False):
        """Train a DNN using given preprocessing, weights, and data

        The purpose of the Trainer is to handle a default training mechanism.
        As required input it expects a `scenario` and hyperparameters (`params`).

        The steps are
            1. Loading and preprocessing of the dataset
            2. Computation of the codec
            3. Construction of the DNN in the desired Deep Learning Framework
            4. Launch of the training

        During the training the Trainer will perform validation checks if a `validation_dataset` is given
        to determine the best model.
        Furthermore, the current status is printed and checkpoints are written.
        """
        super(Trainer, self).__init__(params, scenario, restore)
        self._params: TrainerParams = params
        if not isinstance(self._params.checkpoint_save_freq, str) and self._params.checkpoint_save_freq < 0:
            self._params.checkpoint_save_freq = self._params.early_stopping_params.frequency
        self._params.warmstart.model = (
            checkpoint_path(self._params.warmstart.model) if self._params.warmstart.model else None
        )
        self.checkpoint = None
        if self._params.warmstart.model:
            # Manually handle loading
            self.checkpoint = SavedCalamariModel(
                self._params.warmstart.model,
                auto_update=self._params.auto_upgrade_checkpoints,
            )
            self._params.warmstart.model = self.checkpoint.ckpt_path + ".h5"
            self._params.warmstart.trim_graph_name = False

        self._codec_changes = None
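A hedged construction sketch for this Trainer: `trainer_params` and `scenario` are assumed to be restored from a checkpoint as in Example #1, and train() is assumed to be the training entry point inherited from the tfaip base trainer:

    trainer = Trainer(trainer_params, scenario)
    trainer.train()  # assumed: entry point of the tfaip base trainer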
Example #3
    @classmethod
    def from_paths(
        cls,
        checkpoints: List[str],
        auto_update_checkpoints=True,
        predictor_params: PredictorParams = None,
        voter_params: VoterParams = None,
        **kwargs,
    ) -> "tfaip_cls.MultiModelPredictor":
        if not checkpoints:
            raise Exception("No checkpoints provided.")

        if predictor_params is None:
            predictor_params = PredictorParams(silent=True, progress_bar=True)

        DeviceConfig(predictor_params.device)
        checkpoints = [
            SavedCalamariModel(ckpt, auto_update=auto_update_checkpoints)
            for ckpt in checkpoints
        ]
        multi_predictor = super(MultiPredictor, cls).from_paths(
            [ckpt.json_path for ckpt in checkpoints],
            predictor_params,
            CalamariScenario,
            model_paths=[ckpt.ckpt_path + ".h5" for ckpt in checkpoints],
            predictor_args={"voter_params": voter_params},
        )

        return multi_predictor
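A hedged voting sketch built on this classmethod; the fold checkpoint paths and the raw-image input are illustrative:

    voter = MultiPredictor.from_paths(checkpoints=["fold_0.ckpt", "fold_1.ckpt", "fold_2.ckpt"])
    for sample in voter.predict_raw(raw_images):  # raw_images: assumed list of line images
        print(sample.outputs)  # voted prediction; the exact structure depends on the voter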
Example #4
    def test_upgrade_2_to_4(self):
        with tempfile.TemporaryDirectory() as d:
            for filename in {'0.ckpt.h5', '0.ckpt.json'}:
                shutil.copyfile(os.path.join(models_dir, 'version2', filename), os.path.join(d, filename))

            ckpt = SavedCalamariModel(os.path.join(d, '0.ckpt.json'))
            self.predict(ckpt.ckpt_path)
Example #5
    def test_upgrade_from_2(self):
        with tempfile.TemporaryDirectory() as d:
            for filename in {"0.ckpt.h5", "0.ckpt.json"}:
                shutil.copyfile(
                    os.path.join(models_dir, "version2", filename),
                    os.path.join(d, filename),
                )

            ckpt = SavedCalamariModel(os.path.join(d, "0.ckpt.json"))
            self.predict_and_eval(ckpt.ckpt_path)
Example #6
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--checkpoints', nargs='+', type=str, required=True)
    parser.add_argument('--dry_run', action='store_true')

    args = parser.parse_args()

    for ckpt in tqdm(glob_all(args.checkpoints)):
        ckpt = os.path.splitext(ckpt)[0]
        SavedCalamariModel(ckpt, dry_run=args.dry_run)
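The upgrade happens as a side effect of constructing SavedCalamariModel, so no further call is needed; a single checkpoint can be upgraded in place the same way (the path is illustrative):

    SavedCalamariModel("models/old_model.ckpt", dry_run=False)  # converts the files on disk if they are outdated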
Example #7
def split(args):
    ckpt = SavedCalamariModel(args.model)
    keras_model = keras.models.load_model(
        ckpt.ckpt_path + ".h5",
        custom_objects={
            "Graph": Graph,
            "EnsembleGraph": EnsembleGraph,
            "VoterGraph": EnsembleGraph,
        },
    )

    def extract_keras_model(i):
        inputs = keras_model.input
        outputs = keras_model.output
        assert isinstance(outputs, dict)
        assert isinstance(inputs, dict)
        names_to_extract = [
            "blank_last_logits",
            "blank_last_softmax",
            "softmax",
            "decoded",
            "out_len",
        ]
        split_outputs = {}
        for name in names_to_extract:
            src_name = f"{name}_{i}"
            if src_name not in outputs:
                return None
            split_outputs[name] = outputs[src_name]

        return keras.Model(inputs=inputs, outputs=split_outputs)

    split_models: List[keras.Model] = []
    print("Starting to split models")
    while True:
        model = extract_keras_model(len(split_models))
        if model is None:
            break
        split_models.append(model)

    print(f"Split model into {len(split_models)}.")
    print(f"Saving models to {ckpt.dirname}/{ckpt.basename}_split_(i).ckpt")

    with open(ckpt.json_path) as f:
        ckpt_dict = json.load(f)
        ckpt_dict["scenario_params"]["model_params"]["ensemble"] = -1
        ckpt_dict["scenario_params"]["data_params"]["ensemble_"] = -1

    for i, split_model in enumerate(split_models):
        path = os.path.join(ckpt.dirname, f"{ckpt.basename}_split_{i}.ckpt")
        with open(path + ".json", "w") as f:
            json.dump(ckpt_dict, f, indent=2)
        split_model.save(path + ".h5")
        print(f"Saved {i + 1}/{len(split_models)}")
Example #8
def from_checkpoint(params: PredictorParams,
                    checkpoint: str,
                    auto_update_checkpoints=True):
    DeviceConfig(params.device)  # Device must be specified first
    ckpt = SavedCalamariModel(checkpoint,
                              auto_update=auto_update_checkpoints)
    scenario_params = CalamariScenario.params_from_dict(ckpt.dict)
    scenario = CalamariScenario(scenario_params)
    predictor = Predictor(params, scenario.create_data())
    predictor.set_model(
        keras.models.load_model(
            ckpt.ckpt_path + '.h5',
            custom_objects=CalamariScenario.model_cls().all_custom_objects(),
        )
    )
    return predictor
Example #9
def main(args: EvalArgs):
    # Local imports (imports that require tensorflow)
    from calamari_ocr.ocr.scenario import CalamariScenario
    from calamari_ocr.ocr.dataset.data import Data
    from calamari_ocr.ocr.evaluator import Evaluator

    if args.checkpoint:
        saved_model = SavedCalamariModel(args.checkpoint, auto_update=True)
        trainer_params = CalamariScenario.trainer_cls().params_cls().from_dict(saved_model.dict)
        data_params = trainer_params.scenario.data
    else:
        data_params = Data.default_params()

    data = Data(data_params)

    pred_data = args.pred if args.pred is not None else args.gt.to_prediction()
    evaluator = Evaluator(args.evaluator, data=data)
    evaluator.preload_gt(gt_dataset=args.gt)
    r = evaluator.run(gt_dataset=args.gt, pred_dataset=pred_data)

    # TODO: More output
    print("Evaluation result")
    print("=================")
    print("")
    print(
        "Got mean normalized label error rate of {:.2%} ({} errs, {} total chars, {} sync errs)".format(
            r["avg_ler"], r["total_char_errs"], r["total_chars"], r["total_sync_errs"]
        )
    )

    # sort descending
    print_confusions(r, args.n_confusions)

    samples = data.create_pipeline(evaluator.params.setup, args.gt).reader().samples()
    print_worst_lines(r, samples, args.n_worst_lines)

    if args.xlsx_output:
        write_xlsx(
            args.xlsx_output,
            [
                {
                    "prefix": "evaluation",
                    "results": r,
                    "gt_files": [s["id"] for s in samples],
                }
            ],
        )

    return r
Example #10
    def run(
        self,
        single_fold=None,
        seed=-1,
        weights=None,
        max_parallel_models=-1,
        temporary_dir=None,
        keep_temporary_files=False,
    ):
        # Default params
        single_fold = single_fold if single_fold else []
        weights = weights if weights else []
        if max_parallel_models <= 0:
            max_parallel_models = self.n_folds

        # argument checks
        if len(weights) > 1 and len(weights) != self.n_folds:
            raise Exception(
                "Either no, one, or n_folds (={}) models are required for pretraining but got {}.".format(
                    self.n_folds, len(weights)))

        if len(single_fold) > 0:
            if len(set(single_fold)) != len(single_fold):
                raise Exception("Repeated fold ids found.")
            for fold_id in single_fold:
                if fold_id < 0 or fold_id >= self.n_folds:
                    raise Exception(
                        "Invalid fold id found: 0 <= id < {}, but id == {}".format(
                            self.n_folds, fold_id))

        # create temporary dir
        # by default, the temporary files will be deleted after a successful training
        # if you specify a temporary dir, you can easily resume to train if an error occurred
        if keep_temporary_files and not temporary_dir:
            raise Exception(
                "If you want to keep the temporary model files you have to specify a temporary dir"
            )

        # temporary dir
        if temporary_dir is None:
            temporary_dir = tempfile.mkdtemp(prefix="calamari")
        else:
            temporary_dir = os.path.abspath(temporary_dir)

        if not os.path.exists(temporary_dir):
            os.makedirs(temporary_dir)

        # Compute the files in the cross fold (create a CrossFold)
        fold_file = os.path.join(temporary_dir, "folds.json")
        cross_fold = CrossFold(n_folds=self.n_folds,
                               data_reader=self.data_reader,
                               output_dir=temporary_dir,
                               progress_bar=self.progress_bars)
        cross_fold.write_folds_to_json(fold_file)

        # Create the json argument file for each individual training
        run_args = []
        folds_to_run = single_fold if len(single_fold) > 0 else range(len(cross_fold.folds))
        for fold in folds_to_run:
            train_files = cross_fold.train_files(fold)
            test_files = cross_fold.test_files(fold)
            path = os.path.join(temporary_dir, "fold_{}.json".format(fold))
            with open(path, 'w') as f:
                fold_args = self.train_args.copy()
                fold_args["dataset"] = cross_fold.dataset_type.name
                fold_args["validation_dataset"] = cross_fold.dataset_type.name
                fold_args["validation_extension"] = self.train_args[
                    'gt_extension']
                fold_args["id"] = fold
                fold_args["files"] = train_files
                fold_args["validation"] = test_files
                fold_args["train_script"] = self.train_script_path
                fold_args["verbose"] = True
                fold_args["output_dir"] = os.path.join(temporary_dir,
                                                       "fold_{}".format(fold))
                fold_args[
                    "early_stopping_best_model_output_dir"] = self.best_models_dir
                fold_args[
                    "early_stopping_best_model_prefix"] = self.best_model_label.format(
                        id=fold)
                fold_args['train_verbose'] = 2

                if seed >= 0:
                    fold_args["seed"] = seed + fold

                if len(weights) == 1:
                    fold_args["weights"] = weights[0]
                elif len(weights) > 1:
                    fold_args["weights"] = weights[fold]
                else:
                    fold_args["weights"] = None

                # start from scratch via None
                if fold_args["weights"]:
                    if len(fold_args["weights"].strip()) == 0 or fold_args["weights"].upper() == "NONE":
                        fold_args["weights"] = None
                    else:
                        # access the model once to upgrade it if necessary (cannot be performed in parallel)
                        SavedCalamariModel(fold_args["weights"])

                # HDF5 dataset is already preloaded and does not require an extension anymore
                if cross_fold.dataset_type == DataSetType.HDF5:
                    del fold_args["validation_extension"]
                    del fold_args["gt_extension"]

                json.dump(
                    fold_args,
                    f,
                    indent=4,
                )

            run_args.append({"json": path, "args": fold_args})

        # Launch the individual processes for each training
        with multiprocessing.pool.ThreadPool(processes=max_parallel_models) as pool:
            # workaround to forward keyboard interrupt
            pool.map_async(train_individual_model, run_args).get()

        if not keep_temporary_files:
            import shutil
            shutil.rmtree(temporary_dir)
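A hedged call sketch, assuming `trainer` is an instance of the cross-fold trainer class that defines run(); passing temporary_dir allows resuming after an error, as the comments above note:

    trainer.run(seed=42, max_parallel_models=2, temporary_dir="cf_tmp", keep_temporary_files=True)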
Example #11
    def run(self):
        # temporary dir
        temporary_dir = self.params.temporary_dir
        if temporary_dir is None:
            temporary_dir = tempfile.mkdtemp(prefix="calamari")
        else:
            temporary_dir = os.path.abspath(temporary_dir)

        if not os.path.exists(temporary_dir):
            os.makedirs(temporary_dir)

        # Compute the files in the cross fold (create a CrossFold)
        fold_file = os.path.join(temporary_dir, "folds.json")
        cross_fold = CrossFold(
            n_folds=self.params.n_folds,
            data_generator_params=self.params.trainer.gen.train,
            output_dir=temporary_dir,
            progress_bar=self.params.trainer.progress_bar)
        cross_fold.write_folds_to_json(fold_file)

        # Create the json argument file for each individual training
        run_args = []
        seed = self.params.trainer.random_seed or -1
        folds_to_run = self.params.single_fold if len(self.params.single_fold) > 0 else range(len(cross_fold.folds))
        for fold in folds_to_run:
            train_files = cross_fold.train_files(fold)
            test_files = cross_fold.test_files(fold)
            path = os.path.join(temporary_dir, "fold_{}.json".format(fold))
            with open(path, 'w') as f:
                trainer_params = deepcopy(self.params.trainer)
                trainer_params.gen = CalamariDefaultTrainerPipelineParams(
                    train=trainer_params.gen.train,
                    val=deepcopy(trainer_params.gen.train),
                    setup=trainer_params.gen.setup,
                )
                if cross_fold.is_h5_dataset:
                    tp = trainer_params.gen.train.to_dict()
                    del tp['__cls__']
                    tp["files"] = train_files
                    trainer_params.gen.train = Hdf5.from_dict(tp)
                    vp = trainer_params.gen.val.to_dict()
                    del vp['__cls__']
                    vp['files'] = test_files
                    trainer_params.gen.val = Hdf5.from_dict(vp)
                else:
                    trainer_params.gen.train.images = train_files
                    trainer_params.gen.val.images = test_files
                    trainer_params.gen.val.gt_extension = trainer_params.gen.train.gt_extension

                trainer_params.scenario.id = fold
                trainer_params.progress_bar_mode = 2
                trainer_params.output_dir = os.path.join(temporary_dir, "fold_{}".format(fold))
                trainer_params.early_stopping.best_model_output_dir = self.params.best_models_dir
                trainer_params.early_stopping.best_model_name = ''
                best_model_prefix = self.params.best_model_label.format(id=fold)
                trainer_params.best_model_prefix = best_model_prefix

                if self.params.visible_gpus:
                    assert trainer_params.device.gpus is None, "Using visible_gpus with device.gpus is not supported"
                    trainer_params.device.gpus = [self.params.visible_gpus[fold % len(self.params.visible_gpus)]]

                if seed >= 0:
                    trainer_params.random_seed = seed + fold

                if len(self.params.weights) == 1:
                    trainer_params.warmstart.model = self.params.weights[0]
                elif len(self.params.weights) > 1:
                    trainer_params.warmstart.model = self.params.weights[fold]

                # start from scratch via None
                if trainer_params.warmstart.model:
                    if len(trainer_params.warmstart.model.strip()) == 0 or trainer_params.warmstart.model.upper() == "NONE":
                        trainer_params.warmstart.model = None
                    else:
                        # access the model once to upgrade it if necessary
                        # (cannot be performed in parallel if multiple folds use the same model)
                        SavedCalamariModel(trainer_params.warmstart.model)

                post_init(trainer_params)

                json.dump(
                    trainer_params.to_dict(),
                    f,
                    indent=4,
                )

            run_args.append({
                "json": path,
                "args": trainer_params,
                "id": fold,
                'train_script': self.train_script_path,
                'run': self.params.run,
                'verbose': True
            })

        # Launch the individual processes for each training
        with multiprocessing.pool.ThreadPool(processes=self.params.max_parallel_models) as pool:
            # workaround to forward keyboard interrupt
            pool.map_async(train_individual_model, run_args).get()

        if not self.params.keep_temporary_files:
            import shutil
            shutil.rmtree(temporary_dir)
Example #12
def main():
    # Local imports (imports that require tensorflow)
    from calamari_ocr.ocr.scenario import Scenario
    from calamari_ocr.ocr.dataset.data import Data
    from calamari_ocr.ocr.evaluator import Evaluator

    parser = ArgumentParser()
    parser.add_argument("--dataset", type=DataSetType.from_string, choices=list(DataSetType), default=DataSetType.FILE)
    parser.add_argument("--gt", nargs="+", required=True,
                        help="Ground truth files (.gt.txt extension). "
                             "Optionally, you can pass a single json file defining all parameters.")
    parser.add_argument("--pred", nargs="+", default=None,
                        help="Prediction files if provided. Else files with .pred.txt are expected at the same "
                             "location as the gt.")
    parser.add_argument("--pred_dataset", type=DataSetType.from_string, choices=list(DataSetType), default=DataSetType.FILE)
    parser.add_argument("--pred_ext", type=str, default=".pred.txt",
                        help="Extension of the predicted text files")
    parser.add_argument("--n_confusions", type=int, default=10,
                        help="Only print n most common confusions. Defaults to 10, use -1 for all.")
    parser.add_argument("--n_worst_lines", type=int, default=0,
                        help="Print the n worst recognized text lines with its error")
    parser.add_argument("--xlsx_output", type=str,
                        help="Optionally write a xlsx file with the evaluation results")
    parser.add_argument("--num_threads", type=int, default=1,
                        help="Number of threads to use for evaluation")
    parser.add_argument("--non_existing_file_handling_mode", type=str, default="error",
                        help="How to handle non existing .pred.txt files. Possible modes: skip, empty, error. "
                             "'Skip' will simply skip the evaluation of that file (not counting it to errors). "
                             "'Empty' will handle this file as would it be empty (fully checking for errors)."
                             "'Error' will throw an exception if a file is not existing. This is the default behaviour.")
    parser.add_argument("--skip_empty_gt", action="store_true", default=False,
                        help="Ignore lines of the gt that are empty.")
    parser.add_argument("--no_progress_bars", action="store_true",
                        help="Do not show any progress bars")
    parser.add_argument("--checkpoint", type=str, default=None,
                        help="Specify an optional checkpoint to parse the text preprocessor (for the gt txt files)")

    # page xml specific args
    parser.add_argument("--pagexml_gt_text_index", default=0)
    parser.add_argument("--pagexml_pred_text_index", default=1)

    args = parser.parse_args()

    # check if loading a json file
    if len(args.gt) == 1 and args.gt[0].endswith("json"):
        with open(args.gt[0], 'r') as f:
            json_args = json.load(f)
            for key, value in json_args.items():
                setattr(args, key, value)

    logger.info("Resolving files")
    gt_files = sorted(glob_all(args.gt))

    if args.pred:
        pred_files = sorted(glob_all(args.pred))
    else:
        pred_files = [split_all_ext(gt)[0] + args.pred_ext for gt in gt_files]
        args.pred_dataset = args.dataset

    if args.non_existing_file_handling_mode.lower() == "skip":
        non_existing_pred = [p for p in pred_files if not os.path.exists(p)]
        for f in non_existing_pred:
            idx = pred_files.index(f)
            del pred_files[idx]
            del gt_files[idx]

    data_params = Data.get_default_params()
    if args.checkpoint:
        saved_model = SavedCalamariModel(args.checkpoint, auto_update=True)
        trainer_params = Scenario.trainer_params_from_dict(saved_model.dict)
        data_params = trainer_params.scenario_params.data_params

    data = Data(data_params)

    gt_reader_args = FileDataReaderArgs(
        text_index=args.pagexml_gt_text_index
    )
    pred_reader_args = FileDataReaderArgs(
        text_index=args.pagexml_pred_text_index
    )
    gt_data_set = PipelineParams(
        type=args.dataset,
        text_files=gt_files,
        data_reader_args=gt_reader_args,
        skip_invalid=args.skip_empty_gt,
    )
    pred_data_set = PipelineParams(
        type=args.pred_dataset,
        text_files=pred_files,
        data_reader_args=pred_reader_args,
    )

    evaluator = Evaluator(data=data)
    evaluator.preload_gt(gt_dataset=gt_data_set)
    r = evaluator.run(gt_dataset=gt_data_set, pred_dataset=pred_data_set, processes=args.num_threads,
                      progress_bar=not args.no_progress_bars)

    # TODO: More output
    print("Evaluation result")
    print("=================")
    print("")
    print("Got mean normalized label error rate of {:.2%} ({} errs, {} total chars, {} sync errs)".format(
        r["avg_ler"], r["total_char_errs"], r["total_chars"], r["total_sync_errs"]))

    # sort descending
    print_confusions(r, args.n_confusions)

    print_worst_lines(r, data.create_pipeline(PipelineMode.Targets, gt_data_set).reader().samples(), args.n_worst_lines)

    if args.xlsx_output:
        write_xlsx(args.xlsx_output,
                   [{
                       "prefix": "evaluation",
                       "results": r,
                       "gt_files": gt_files,
                   }])
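This main() is presumably the entry point behind the calamari-eval console script; a hedged programmatic invocation (the file pattern is illustrative):

    import sys
    sys.argv = ["calamari-eval", "--gt", "data/*.gt.txt", "--n_confusions", "10"]
    main()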