Example #1
import argparse
import os
import shutil

from tqdm import tqdm

from calamari_ocr.ocr import CrossFold
from calamari_ocr.utils import split_all_ext


def main():
    parser = argparse.ArgumentParser(
        description="Write split of folds to separate directories")
    parser.add_argument(
        "--files",
        nargs="+",
        help="List all image files that shall be processed. Ground truth files "
        "with the same base name but with '.gt.txt' as extension are required "
        "at the same location"
    )
    parser.add_argument(
        "--n_folds",
        type=int,
        required=True,
        help="The number of fold, that is the number of models to train")
    parser.add_argument("--output_dir",
                        type=str,
                        required=True,
                        help="Where to write the folds")

    args = parser.parse_args()

    print("Creating folds")
    cross_fold = CrossFold(n_folds=args.n_folds,
                           source_files=args.files,
                           output_dir=args.output_dir)

    print("Copying files")
    for fold_id, fold_files in enumerate(cross_fold.folds):
        fold_out_dir = os.path.join(args.output_dir, str(fold_id))
        if not os.path.exists(fold_out_dir):
            os.makedirs(fold_out_dir)

        for file_id, file in tqdm(enumerate(fold_files),
                                  total=len(fold_files),
                                  desc="Fold {}".format(fold_id)):
            img_file = file
            base, ext = split_all_ext(file)
            txt_file = base + ".gt.txt"

            if os.path.exists(img_file) and os.path.exists(txt_file):
                output_file = os.path.join(fold_out_dir,
                                           "{:08d}{}".format(file_id, ext))
                shutil.copyfile(img_file, output_file)

                output_file = os.path.join(
                    fold_out_dir, "{:08d}{}".format(file_id, ".gt.txt"))
                shutil.copyfile(txt_file, output_file)
            else:
                print("Waring: Does not exist {} or {}".format(
                    img_file, txt_file))
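
Example #1 (and Example #2 below) pairs each image with its ground truth file via split_all_ext, which strips all extensions at once so that e.g. "line01.nrm.png" maps to the base "line01" and hence to "line01.gt.txt". A minimal sketch of how such a helper could behave; this is an illustrative re-implementation, not the calamari_ocr source:

import os


def split_all_ext(path):
    # Illustrative sketch: split a path into (base, all_extensions), e.g.
    # "data/line01.nrm.png" -> ("data/line01", ".nrm.png") on POSIX.
    # The real calamari_ocr.utils.split_all_ext may differ in detail.
    dirname, filename = os.path.split(path)
    base, _, ext = filename.partition(".")
    ext = "." + ext if ext else ""
    return os.path.join(dirname, base), ext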
Example #2
import argparse
import logging
import os
import shutil

from tqdm import tqdm

from calamari_ocr.ocr import CrossFold
from calamari_ocr.utils import glob_all, split_all_ext
# FileDataReader and PipelineMode are also required; their exact import
# paths depend on the installed calamari_ocr/tfaip version.

logger = logging.getLogger(__name__)


def main():
    parser = argparse.ArgumentParser(
        description="Write split of folds to separate directories"
    )
    parser.add_argument("--files", nargs="+",
                        help="List all image files that shall be processed. Ground truth fils with the same "
                             "base name but with '.gt.txt' as extension are required at the same location")
    parser.add_argument("--n_folds", type=int, required=True,
                        help="The number of fold, that is the number of models to train")
    parser.add_argument("--output_dir", type=str, required=True,
                        help="Where to write the folds")
    parser.add_argument("--keep_original_filename", action="store_true",
                        help="By default the copied new files get a new 8 digit name. Use this flag to keep the "
                             "original name but be aware, that this might override lines with the same name")

    args = parser.parse_args()

    logger.info("Creating folds")
    images = glob_all(args.files)
    texts = [split_all_ext(p)[0] + '.gt.txt' for p in images]
    data_reader = FileDataReader(PipelineMode.Training, images=images, texts=texts, skip_invalid=True)
    cross_fold = CrossFold(n_folds=args.n_folds, data_reader=data_reader, output_dir=args.output_dir)

    logger.info("Copying files")
    for fold_id, fold_files in enumerate(cross_fold.folds):
        fold_out_dir = os.path.join(args.output_dir, str(fold_id))
        if not os.path.exists(fold_out_dir):
            os.makedirs(fold_out_dir)

        for file_id, file in tqdm(enumerate(fold_files), total=len(fold_files), desc="Fold {}".format(fold_id)):
            img_file = file
            base, ext = split_all_ext(file)
            txt_file = base + ".gt.txt"
            output_basename = os.path.basename(base) if args.keep_original_filename else "{:08d}".format(file_id)

            if os.path.exists(img_file) and os.path.exists(txt_file):
                output_file = os.path.join(fold_out_dir, "{}{}".format(output_basename, ext))
                shutil.copyfile(img_file, output_file)

                output_file = os.path.join(fold_out_dir, "{}{}".format(output_basename, ".gt.txt"))
                shutil.copyfile(txt_file, output_file)
            else:
                logger.info("Warning: Does not exist {} or {}".format(img_file, txt_file))
Example #3
import argparse
import json

from calamari_ocr.ocr import CrossFold

if __name__ == "__main__":
    # Standalone script to run the cross fold splitting in a separate thread;
    # this script is called from cross_fold_trainer.py
    parser = argparse.ArgumentParser()
    parser.add_argument("config")
    parser.add_argument("--dir", required=True)
    parser.add_argument("--progress_bar", action="store_true")

    args = parser.parse_args()

    with open(args.config) as f:
        cfg = json.load(f)

    cross_fold = CrossFold.from_dict(cfg)
    cross_fold.create_folds(progress_bar=args.progress_bar)
    cross_fold.write_folds_to_json(args.dir)
    with open(args.config, "w") as f:
        json.dump(cross_fold.to_dict(), f)
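
The comment above states that this standalone script is called from cross_fold_trainer.py. A hedged sketch of what the calling side could look like; the script name cross_fold_split.py and the helper run_split_script are assumptions for illustration:

import subprocess
import sys


def run_split_script(config_path, output_dir, progress_bar=True):
    # Hypothetical caller: launch the splitting script as a child process
    # so the fold computation runs isolated from the training processes.
    cmd = [sys.executable, "cross_fold_split.py", config_path, "--dir", output_dir]
    if progress_bar:
        cmd.append("--progress_bar")
    subprocess.check_call(cmd)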
Example #4
    def run(
        self,
        single_fold=None,
        seed=-1,
        weights=None,
        max_parallel_models=-1,
        temporary_dir=None,
        keep_temporary_files=False,
    ):
        # Default params
        single_fold = single_fold if single_fold else []
        weights = weights if weights else []
        if max_parallel_models <= 0:
            max_parallel_models = self.n_folds

        # argument checks
        if len(weights) > 1 and len(weights) != self.n_folds:
            raise Exception(
                "Either none, one, or n_folds (={}) models are required for "
                "pretraining, but got {}.".format(self.n_folds, len(weights)))

        if len(single_fold) > 0:
            if len(set(single_fold)) != len(single_fold):
                raise Exception("Repeated fold ids found.")
            for fold_id in single_fold:
                if fold_id < 0 or fold_id >= self.n_folds:
                    raise Exception(
                        "Invalid fold id found: 0 <= id < {}, but id == {}".format(
                            self.n_folds, fold_id))

        # create temporary dir
        # by default, the temporary files will be deleted after a successful training;
        # if you specify a temporary dir, you can easily resume training if an error occurs
        if keep_temporary_files and not temporary_dir:
            raise Exception(
                "If you want to keep the temporary model files, you have to specify a temporary dir"
            )

        # temporary dir
        if temporary_dir is None:
            temporary_dir = tempfile.mkdtemp(prefix="calamari")
        else:
            temporary_dir = os.path.abspath(temporary_dir)

        if not os.path.exists(temporary_dir):
            os.makedirs(temporary_dir)

        # Compute the files in the cross fold (create a CrossFold)
        fold_file = os.path.join(temporary_dir, "folds.json")
        cross_fold = CrossFold(n_folds=self.n_folds,
                               data_reader=self.data_reader,
                               output_dir=temporary_dir,
                               progress_bar=self.progress_bars)
        cross_fold.write_folds_to_json(fold_file)

        # Create the json argument file for each individual training
        run_args = []
        folds_to_run = single_fold if len(single_fold) > 0 else range(len(cross_fold.folds))
        for fold in folds_to_run:
            train_files = cross_fold.train_files(fold)
            test_files = cross_fold.test_files(fold)
            path = os.path.join(temporary_dir, "fold_{}.json".format(fold))
            with open(path, 'w') as f:
                fold_args = self.train_args.copy()
                fold_args["dataset"] = cross_fold.dataset_type.name
                fold_args["validation_dataset"] = cross_fold.dataset_type.name
                fold_args["validation_extension"] = self.train_args[
                    'gt_extension']
                fold_args["id"] = fold
                fold_args["files"] = train_files
                fold_args["validation"] = test_files
                fold_args["train_script"] = self.train_script_path
                fold_args["verbose"] = True
                fold_args["output_dir"] = os.path.join(temporary_dir,
                                                       "fold_{}".format(fold))
                fold_args["early_stopping_best_model_output_dir"] = self.best_models_dir
                fold_args["early_stopping_best_model_prefix"] = \
                    self.best_model_label.format(id=fold)
                fold_args['train_verbose'] = 2

                if seed >= 0:
                    fold_args["seed"] = seed + fold

                if len(weights) == 1:
                    fold_args["weights"] = weights[0]
                elif len(weights) > 1:
                    fold_args["weights"] = weights[fold]
                else:
                    fold_args["weights"] = None

                # an empty string or the literal "NONE" also means: start from scratch
                if fold_args["weights"]:
                    if (len(fold_args["weights"].strip()) == 0
                            or fold_args["weights"].upper() == "NONE"):
                        fold_args["weights"] = None
                    else:
                        # access the model once to upgrade it if necessary
                        # (cannot be performed in parallel)
                        SavedCalamariModel(fold_args["weights"])

                # an HDF5 dataset is already preloaded and does not require an extension anymore
                if cross_fold.dataset_type == DataSetType.HDF5:
                    del fold_args["validation_extension"]
                    del fold_args["gt_extension"]

                json.dump(
                    fold_args,
                    f,
                    indent=4,
                )

            run_args.append({"json": path, "args": fold_args})

        # Launch the individual processes for each training
        with multiprocessing.pool.ThreadPool(processes=max_parallel_models) as pool:
            # map_async().get() instead of map() so that a KeyboardInterrupt
            # reaches the main thread
            pool.map_async(train_individual_model, run_args).get()

        if not keep_temporary_files:
            import shutil
            shutil.rmtree(temporary_dir)
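
A hedged usage sketch for this run() method; the trainer object is assumed to be an instance of the class defining run() above (its construction is elided because the constructor is not part of this excerpt), and every argument value is purely illustrative:

def run_cross_fold(trainer):
    # Hypothetical driver around the run() method shown above.
    trainer.run(
        single_fold=[0, 2],            # only train folds 0 and 2 (requires n_folds > 2)
        seed=42,                       # fold i trains with seed 42 + i
        weights=["pretrained.ckpt"],   # a single model warm-starts every fold
        max_parallel_models=2,         # at most two trainings run concurrently
        temporary_dir="tmp_folds",     # fold files are kept here ...
        keep_temporary_files=True,     # ... and not deleted afterwards
    )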
Example #5
    def run(self):
        # temporary dir
        temporary_dir = self.params.temporary_dir
        if temporary_dir is None:
            temporary_dir = tempfile.mkdtemp(prefix="calamari")
        else:
            temporary_dir = os.path.abspath(temporary_dir)

        if not os.path.exists(temporary_dir):
            os.makedirs(temporary_dir)

        # Compute the files in the cross fold (create a CrossFold)
        fold_file = os.path.join(temporary_dir, "folds.json")
        cross_fold = CrossFold(
            n_folds=self.params.n_folds,
            data_generator_params=self.params.trainer.gen.train,
            output_dir=temporary_dir,
            progress_bar=self.params.trainer.progress_bar)
        cross_fold.write_folds_to_json(fold_file)

        # Create the json argument file for each individual training
        run_args = []
        seed = self.params.trainer.random_seed or -1
        folds_to_run = (self.params.single_fold if len(self.params.single_fold) > 0
                        else range(len(cross_fold.folds)))
        for fold in folds_to_run:
            train_files = cross_fold.train_files(fold)
            test_files = cross_fold.test_files(fold)
            path = os.path.join(temporary_dir, "fold_{}.json".format(fold))
            with open(path, 'w') as f:
                trainer_params = deepcopy(self.params.trainer)
                trainer_params.gen = CalamariDefaultTrainerPipelineParams(
                    train=trainer_params.gen.train,
                    val=deepcopy(trainer_params.gen.train),
                    setup=trainer_params.gen.setup,
                )
                if cross_fold.is_h5_dataset:
                    # rebuild the generator params via dicts so that the train and
                    # val generators point to the fold's own HDF5 file lists
                    tp = trainer_params.gen.train.to_dict()
                    del tp['__cls__']
                    tp["files"] = train_files
                    trainer_params.gen.train = Hdf5.from_dict(tp)
                    vp = trainer_params.gen.val.to_dict()
                    del vp['__cls__']
                    vp['files'] = test_files
                    trainer_params.gen.val = Hdf5.from_dict(vp)
                else:
                    trainer_params.gen.train.images = train_files
                    trainer_params.gen.val.images = test_files
                    trainer_params.gen.val.gt_extension = trainer_params.gen.train.gt_extension

                trainer_params.scenario.id = fold
                trainer_params.progress_bar_mode = 2
                trainer_params.output_dir = os.path.join(temporary_dir, "fold_{}".format(fold))
                trainer_params.early_stopping.best_model_output_dir = self.params.best_models_dir
                trainer_params.early_stopping.best_model_name = ''
                best_model_prefix = self.params.best_model_label.format(id=fold)
                trainer_params.best_model_prefix = best_model_prefix

                if self.params.visible_gpus:
                    assert trainer_params.device.gpus is None, "Using visible_gpus with device.gpus is not supported"
                    # assign the visible GPUs round-robin over the folds
                    trainer_params.device.gpus = [
                        self.params.visible_gpus[fold % len(self.params.visible_gpus)]
                    ]

                if seed >= 0:
                    trainer_params.random_seed = seed + fold

                if len(self.params.weights) == 1:
                    trainer_params.warmstart.model = self.params.weights[0]
                elif len(self.params.weights) > 1:
                    trainer_params.warmstart.model = self.params.weights[fold]

                # an empty string or the literal "NONE" also means: start from scratch
                if trainer_params.warmstart.model:
                    if (len(trainer_params.warmstart.model.strip()) == 0
                            or trainer_params.warmstart.model.upper() == "NONE"):
                        trainer_params.warmstart.model = None
                    else:
                        # access the model once to upgrade it if necessary
                        # (cannot be performed in parallel if multiple folds use the same model)
                        SavedCalamariModel(trainer_params.warmstart.model)

                post_init(trainer_params)

                json.dump(
                    trainer_params.to_dict(),
                    f,
                    indent=4,
                )

            run_args.append({
                "json": path,
                "args": trainer_params,
                "id": fold,
                'train_script': self.train_script_path,
                'run': self.params.run,
                'verbose': True
            })

        # Launch the individual processes for each training
        with multiprocessing.pool.ThreadPool(processes=self.params.max_parallel_models) as pool:
            # map_async().get() instead of map() so that a KeyboardInterrupt
            # reaches the main thread
            pool.map_async(train_individual_model, run_args).get()

        if not self.params.keep_temporary_files:
            import shutil
            shutil.rmtree(temporary_dir)
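
Examples #4 and #5 both hand their run_args dicts to a train_individual_model worker through a ThreadPool. A hedged sketch of what such a worker could look like, assuming it shells out to the training script with the dumped JSON file; the dict keys follow the run_args built in Example #5, and the call layout is an assumption, not the real implementation:

import subprocess
import sys


def train_individual_model(run_args):
    # Hypothetical worker: each pool thread launches one training subprocess,
    # so `max_parallel_models` threads yield that many concurrent trainings
    # while the ThreadPool itself stays lightweight.
    cmd = [sys.executable, run_args["train_script"], run_args["json"]]
    if run_args.get("verbose"):
        print("Starting fold {} via: {}".format(run_args.get("id"), " ".join(cmd)))
    subprocess.check_call(cmd)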