Example no. 1
 def test_eval_files_with_different_sources(self):
     run_predict(
         predict_args(data=FileDataParams(
             pred_extension=".ext-pred.txt",
             images=sorted(
                 glob_all([
                     os.path.join(this_dir, "data", "uw3_50lines", "test",
                                  "*.png")
                 ])),
         )))
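     # run_predict writes a ".ext-pred.txt" file next to each input image;
     # run_eval below compares those predictions against the ".gt.txt" ground truth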
     r = run_eval(
         eval_args(
             gt_data=FileDataParams(texts=sorted(
                 glob_all([
                     os.path.join(this_dir, "data", "uw3_50lines", "test",
                                  "*.gt.txt")
                 ]))),
             pred_data=FileDataParams(texts=sorted(
                 glob_all([
                     os.path.join(
                         this_dir,
                         "data",
                         "uw3_50lines",
                         "test",
                         "*.ext-pred.txt",
                     )
                 ]))),
         ))
     self.assertLess(r["avg_ler"],
                     0.0009,
                     msg="Current best model yields about 0.09% CER")
Example no. 2
 def test_eval_list_files(self):
     run_predict(
         predict_args(data=FileDataParams(images=sorted(
             glob_all([
                 os.path.join(this_dir, "data", "uw3_50lines", "test.files")
             ])))))
     r = run_eval(
         eval_args(gt_data=FileDataParams(texts=sorted(
             glob_all([
                 os.path.join(this_dir, "data", "uw3_50lines",
                              "test.gt.files")
             ])))))
     self.assertLess(r["avg_ler"],
                     0.0009,
                     msg="Current best model yields about 0.09% CER")
Example no. 3
def uw3_trainer_params(with_validation=False,
                       with_split=False,
                       preload=True,
                       debug=False):
    p = CalamariTestScenario.default_trainer_params()
    p.scenario.debug_graph_construction = debug
    p.force_eager = debug

    train = FileDataParams(
        images=glob_all(
            [os.path.join(this_dir, "data", "uw3_50lines", "train", "*.png")]),
        preload=preload,
    )
    if with_split:
        p.gen = CalamariSplitTrainerPipelineParams(validation_split_ratio=0.2,
                                                   train=train)
    elif with_validation:
        p.gen.val.images = glob_all(
            [os.path.join(this_dir, "data", "uw3_50lines", "test", "*.png")])
        p.gen.val.preload = preload
        p.gen.train = train
        p.gen.__post_init__()
    else:
        p.gen = CalamariTrainOnlyPipelineParams(train=train)

    p.gen.setup.val.batch_size = 1
    p.gen.setup.val.num_processes = 1
    p.gen.setup.train.batch_size = 1
    p.gen.setup.train.num_processes = 1
    post_init(p)
    return p
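A minimal usage sketch for these trainer params; the commented-out import and the `epochs` override are assumptions, not taken from the examples above:

# a sketch, assuming a programmatic trainer entry point; the import is hypothetical:
# from calamari_ocr.scripts.train import main as train_main
params = uw3_trainer_params(with_validation=True, preload=False)
params.epochs = 1  # assumed field on the trainer params, set low for a quick smoke run
# train_main(params)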
Example no. 4
 def test_prediction_files_with_different_extension(self):
     run_predict(
         predict_args(data=FileDataParams(
             pred_extension='.ext-pred.txt',
             images=sorted(
                 glob_all([
                     os.path.join(this_dir, "data", "uw3_50lines", "test",
                                  "*.png")
                 ])))))
     run_eval(
         eval_args(gt_data=FileDataParams(
             pred_extension='.ext-pred.txt',
             texts=sorted(
                 glob_all([
                     os.path.join(this_dir, "data", "uw3_50lines", "test",
                                  "*.gt.txt")
                 ])))))
Example no. 5
def setup_trainer_params(preload=True, debug=False):
    p = CalamariTestEnsembleScenario.default_trainer_params()
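    # force eager execution when debugging (easier to step through than a compiled graph)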
    p.force_eager = debug

    p.gen.train = FileDataParams(
        images=glob_all([os.path.join(this_dir, "data", "uw3_50lines", "train", "*.png")]),
        preload=preload,
    )

    post_init(p)
    return p
Example no. 6
 def test_prediction_files(self):
     run_predict(
         predict_args(data=FileDataParams(images=sorted(
             glob_all([
                 os.path.join(this_dir, "data", "uw3_50lines", "test",
                              "*.png")
             ])))))
     run_eval(
         eval_args(gt_data=FileDataParams(texts=sorted(
             glob_all([
                 os.path.join(this_dir, "data", "uw3_50lines", "test",
                              "*.gt.txt")
             ])))))
     args = eval_args(gt_data=FileDataParams(texts=sorted(
         glob_all([
             os.path.join(this_dir, "data", "uw3_50lines", "test",
                          "*.gt.txt")
         ]))))
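     # additionally write the evaluation report to an .xlsx file in a temporary directory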
     with tempfile.TemporaryDirectory() as d:
         args.xlsx_output = os.path.join(d, 'output.xlsx')
         run_eval(args)
Example no. 7
 def test_eval_files(self):
     run_predict(
         predict_args(data=FileDataParams(images=sorted(
             glob_all([
                 os.path.join(this_dir, "data", "uw3_50lines", "test",
                              "*.png")
             ])))))
     r = run_eval(
         eval_args(gt_data=FileDataParams(texts=sorted(
             glob_all([
                 os.path.join(this_dir, "data", "uw3_50lines", "test",
                              "*.gt.txt")
             ])))))
     self.assertLess(r["avg_ler"],
                     0.0009,
                     msg="Current best model yields about 0.09% CER")
     args = eval_args(gt_data=FileDataParams(texts=sorted(
         glob_all([
             os.path.join(this_dir, "data", "uw3_50lines", "test",
                          "*.gt.txt")
         ]))))
     with tempfile.TemporaryDirectory() as d:
         args.xlsx_output = os.path.join(d, "output.xlsx")
         run_eval(args)
Example no. 8
        if data.dtype != np.uint8:
            raise TypeError("Data for HDF5 must have dtype np.uint8")

        self.data.append(data)
        self.text.append(text)

        if len(self.data) >= self.n_max:
            self.finish_chunck()


if __name__ == "__main__":
    from calamari_ocr.ocr.dataset.datareader.file import FileDataParams

    dg = FileDataParams(
        images="calamari_ocr/test/data/uw3_50lines/train/*.png").create(
            PipelineMode.TRAINING)
    with Hdf5DatasetWriter("calamari_ocr/test/data/uw3_50lines/uw3-50lines.h5",
                           n_max=1000) as writer:
        for sample in dg.generate():
            writer.write(sample.inputs, sample.targets)

    from contextlib import ExitStack

    with Hdf5DatasetWriter("test", n_max=5) as writer:
        writer.write(np.zeros((10, 10), dtype=np.uint8), "test")
        writer.write(np.zeros((10, 15), dtype=np.uint8), "asdfasd")
        writer.write(np.zeros((1, 10), dtype=np.uint8), "te345")

    l = [
        Hdf5DatasetWriter("test1", n_max=5),
Example no. 9
        return self.data_pipeline_cls()(
            pipeline_params,
            self,
            generator_params=params,
        )


if __name__ == "__main__":
    from calamari_ocr.ocr import Codec

    this_dir = os.path.dirname(os.path.realpath(__file__))
    base_path = os.path.abspath(os.path.join(this_dir, "..", "..", "test", "data", "uw3_50lines", "train"))

    fdr = FileDataParams(
        num_processes=8,
        images=[os.path.join(base_path, "*.png")],
        limit=1000,
    )

    params = DataParams(
        codec=Codec("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 .,:;-?+=_()*{}[]`@#$%^&'\""),
        downscale_factor=4,
        line_height=48,
        pre_proc=SequentialProcessorPipelineParams(
            run_parallel=True,
            processors=default_image_processors()
            + default_text_pre_processors()
            + [
                AugmentationProcessorParams(
                    modes={PipelineMode.TRAINING},
                    data_aug_params=DataAugmentationAmount(amount=2),
Example no. 10
def main():
    parser = argparse.ArgumentParser(
        description="Write split of folds to separate directories")
    parser.add_argument(
        "--files",
        nargs="+",
        help=
        "List all image files that shall be processed. Ground truth files with the same "
        "base name but with '.gt.txt' as extension are required at the same location",
    )
    parser.add_argument(
        "--n_folds",
        type=int,
        required=True,
        help="The number of fold, that is the number of models to train",
    )
    parser.add_argument("--output_dir",
                        type=str,
                        required=True,
                        help="Where to write the folds")
    parser.add_argument(
        "--keep_original_filename",
        action="store_true",
        help=
        "By default the copied files get a new 8-digit name. Use this flag to keep the "
        "original name, but be aware that this might overwrite lines with the same name",
    )

    args = parser.parse_args()

    logger.info("Creating folds")
    images = glob_all(args.files)
    texts = [split_all_ext(p)[0] + ".gt.txt" for p in images]
    data_reader = FileDataParams(images=images, texts=texts, skip_invalid=True)
    data_reader.prepare_for_mode(PipelineMode.TRAINING)
    cross_fold = CrossFold(
        n_folds=args.n_folds,
        data_generator_params=data_reader,
        output_dir=args.output_dir,
    )

    logger.info("Copying files")
    for fold_id, fold_files in enumerate(cross_fold.folds):
        fold_out_dir = os.path.join(args.output_dir, str(fold_id))
        if not os.path.exists(fold_out_dir):
            os.makedirs(fold_out_dir)

        for file_id, file in tqdm(enumerate(fold_files),
                                  total=len(fold_files),
                                  desc=f"Fold {fold_id}"):
            img_file = file
            base, ext = split_all_ext(file)
            txt_file = base + ".gt.txt"
            output_basename = os.path.basename(
                base) if args.keep_original_filename else f"{file_id:08d}"

            if os.path.exists(img_file) and os.path.exists(txt_file):
                output_file = os.path.join(fold_out_dir,
                                           f"{output_basename}{ext}")
                shutil.copyfile(img_file, output_file)

                output_file = os.path.join(fold_out_dir,
                                           f"{output_basename}.gt.txt")
                shutil.copyfile(txt_file, output_file)
            else:
                logger.warning(f"{img_file} or {txt_file} does not exist")
Example no. 11
def file_dataset():
    return FileDataParams(images=sorted(
        glob_all(
            [os.path.join(this_dir, "data", "uw3_50lines", "test", "*.png")])))
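A short usage sketch, mirroring the `.create(...)` / `.generate()` pattern shown in Example no. 8; assumes `PipelineMode` is imported as it is there:

dg = file_dataset().create(PipelineMode.TRAINING)
for sample in dg.generate():
    print(sample.inputs.shape, sample.targets)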