Ejemplo n.º 1
0
    def test_train_val_error(self):
        """Compare each final training metric with its ``val_`` counterpart.

        Trains for one epoch on six samples with a learning rate of 0 and the
        last model layer removed, captures the logs handed to
        ``on_train_end`` and asserts that every entry without a ``val_``
        prefix matches the ``val_``-prefixed entry after ``round_sig``.
        """
        trainer_params = uw3_trainer_params()
        trainer_params.force_eager = True
        trainer_params.epochs = 1
        trainer_params.random_seed = 9412
        trainer_params.samples_per_epoch = 6
        trainer_params.gen.setup.train.batch_size = trainer_params.gen.setup.val.batch_size = 6
        trainer_params.learning_rate.lr = 0
        # Keep only as many training images as a single epoch consumes.
        trainer_params.gen.train.images = trainer_params.gen.train.images[:trainer_params.samples_per_epoch]
        del trainer_params.scenario.model.layers[-1]  # no dropout
        trainer = CalamariScenario.create_trainer(trainer_params)

        class FinalLogsCallback(keras.callbacks.Callback):
            """Remember the logs dict handed to ``on_train_end``."""

            def __init__(self):
                super().__init__()
                self.train_logs = {}

            def on_train_end(self, logs=None):
                # ``logs`` defaults to None; normalise so the checks below
                # always operate on a dict.
                self.train_logs = logs if logs is not None else {}

        cb = FinalLogsCallback()
        trainer.train(callbacks=[cb])

        # unittest assertion instead of a bare ``assert``: it survives
        # ``python -O`` and reports a readable failure message.
        self.assertTrue(cb.train_logs, "training finished without reporting any logs")

        for k, v in cb.train_logs.items():
            if k.startswith("val_"):
                continue
            val_key = "val_" + k
            # Fail with a clear assertion rather than a KeyError when the
            # validation counterpart is missing.
            self.assertIn(val_key, cb.train_logs)
            self.assertAlmostEqual(round_sig(v), round_sig(cb.train_logs[val_key]))
Ejemplo n.º 2
0
 def test_simple_train(self):
     """Run ``train.main`` into a temp dir, then ``resume_training.main`` on checkpoint_0001."""
     params = uw3_trainer_params()
     with tempfile.TemporaryDirectory() as out_dir:
         params.output_dir = out_dir
         train.main(params)
         # Clear the Keras backend session between the two runs.
         keras.backend.clear_session()
         first_checkpoint = os.path.join(out_dir, "checkpoint", "checkpoint_0001")
         resume_training.main([first_checkpoint])
Ejemplo n.º 3
0
 def test_model_zoo(self):
     """Evaluate the ``antiqua_modern`` model-zoo checkpoints and bound their error.

     Downloads and unpacks the ``calamari_models`` release archive into a
     local cache directory unless it is already there, runs
     ``predict_and_eval_main`` with every ``*.ckpt.json`` found and asserts
     that the voted ``avg_ler`` is below 0.001.
     """
     version = '1.0'
     url = f"https://github.com/Calamari-OCR/calamari_models/archive/{version}.tar.gz"
     # Remember where we started: the test changes the working directory and
     # must not leak that change into other tests.
     previous_cwd = os.getcwd()
     with tempfile.TemporaryDirectory() as d:
         d = 'model_archive_permanent'  # for debugging
         os.makedirs(d, exist_ok=True)
         os.chdir(d)
         try:
             if not os.path.exists('calamari_models'):
                 # Fixed URL/version only, so building a shell string is safe
                 # here. Note: the pipeline's status is that of ``tar``.
                 check_call([
                     'sh', '-c', ' '.join([
                         'wget', '-q', '-O', '-', url, '|', 'tar', 'xz', '&&',
                         'mv', f'calamari_models-{version}', 'calamari_models'
                     ])
                 ])
             trainer_params = uw3_trainer_params(with_validation=True)
             # The glob is relative, so evaluation has to run while the
             # working directory is still the archive directory.
             args = PredictAndEvalArgs(
                 checkpoint=glob(
                     os.path.join('calamari_models', 'antiqua_modern',
                                  '*.ckpt.json')),
                 predictor=PredictorParams(pipeline=DataPipelineParams(
                     batch_size=5)),
                 data=trainer_params.gen.val_gen(),
             )
             full_evaluation = predict_and_eval_main(args)
         finally:
             # Restore the working directory before the temporary directory
             # is cleaned up and before any other test runs.
             os.chdir(previous_cwd)
     self.assertLess(
         full_evaluation['voted']['eval']['avg_ler'], 0.001,
         "The error rate (avg_ler) on the test data must be below 0.1%")
Ejemplo n.º 4
0
def default_uw3_trainer_params(*, preload=True):
    """Return UW3 trainer params (validation enabled) warm-started from ``0.ckpt.json``.

    The learning rate is set to 0, the model layers are replaced by
    ``default_layers()`` and the warm-start model points into the
    ``models/version<SavedCalamariModel.VERSION>`` directory below ``this_dir``.

    :param preload: forwarded to ``uw3_trainer_params``.
    """
    params = uw3_trainer_params(with_validation=True, preload=preload)
    params.learning_rate.lr = 0
    params.scenario.model.layers = default_layers()  # need for correct loading
    version_dir = f"version{SavedCalamariModel.VERSION}"
    params.warmstart.model = os.path.join(this_dir, "models", version_dir, "0.ckpt.json")
    return params
Ejemplo n.º 5
0
 def test_pure_lstm_architecture(self):
     """Run ``main`` with a model consisting of two BiLSTM layers (10 and 20 nodes)."""
     params = uw3_trainer_params()
     hidden_sizes = (10, 20)
     params.scenario.model.layers = [
         BiLSTMLayerParams(hidden_nodes=size) for size in hidden_sizes
     ]
     with tempfile.TemporaryDirectory() as out_dir:
         params.output_dir = out_dir
         main(params)
Ejemplo n.º 6
0
 def test_predict_and_eval_uw3_with_voting(self):
     """Run ``main`` with the same ``best.ckpt`` checkpoint listed three times."""
     from calamari_ocr.test.test_train_file import uw3_trainer_params
     model_path = os.path.join(this_dir, "models", "best.ckpt")
     params = uw3_trainer_params(with_validation=True)
     predictor = PredictorParams(pipeline=DataPipelineParams(batch_size=5))
     args = PredictAndEvalArgs(
         # Three identical entries, so there is something to vote over.
         checkpoint=[model_path] * 3,
         predictor=predictor,
         data=params.gen.val_gen(),
     )
     main(args)
Ejemplo n.º 7
0
 def test_dilated_block_architecture(self):
     """Run ``main`` with a conv / pool / two dilated blocks / conv layer stack."""
     params = uw3_trainer_params()
     layers = [Conv2DLayerParams(filters=10), MaxPool2DLayerParams()]
     layers.extend(DilatedBlockLayerParams(filters=10) for _ in range(2))
     layers.append(Conv2DLayerParams(filters=10))
     params.scenario.model.layers = layers
     with tempfile.TemporaryDirectory() as out_dir:
         params.output_dir = out_dir
         main(params)
Ejemplo n.º 8
0
def default_trainer_params(*,
                           with_validation=False,
                           with_split=False,
                           preload=True):
    """Return UW3 trainer params with the preload flag applied and one augmentation.

    :param with_validation: forwarded to ``uw3_trainer_params``.
    :param with_split: forwarded to ``uw3_trainer_params``.
    :param preload: written to ``gen.train.preload`` and, when the generator
        has a ``val`` attribute, to ``gen.val.preload``.
    """
    params = uw3_trainer_params(with_validation=with_validation,
                                with_split=with_split)
    generator = params.gen
    # Only some generator configurations expose a separate ``val`` entry.
    if hasattr(generator, 'val'):
        generator.val.preload = preload
    generator.train.preload = preload
    augmenters = params.scenario.data.pre_proc.processors_of_type(
        AugmentationProcessorParams)
    for augmenter in augmenters:
        augmenter.n_augmentations = 1
    return params
Ejemplo n.º 9
0
 def test_pure_cnn_architecture(self):
     """Run ``main`` with a convolution/pooling-only layer stack."""
     params = uw3_trainer_params()
     strided_conv = Conv2DLayerParams(
         filters=20,
         strides=IntVec2D(2, 2),
         kernel_size=IntVec2D(4, 4),
     )
     params.scenario.model.layers = [
         Conv2DLayerParams(filters=10),
         MaxPool2DLayerParams(),
         strided_conv,
         Conv2DLayerParams(filters=30),
     ]
     with tempfile.TemporaryDirectory() as out_dir:
         params.output_dir = out_dir
         main(params)
Ejemplo n.º 10
0
 def test_concat_cnn_architecture(self):
     """Check both CLI spellings of the concat network against the params, then run ``main``.

     The layer list built in code must serialise (``to_dict``) to the same
     model as ``--network ...`` and as the explicit ``--model.layers ...``
     flags.
     """
     params = uw3_trainer_params()
     params.scenario.model.layers = [
         Conv2DLayerParams(filters=10),
         MaxPool2DLayerParams(),
         DilatedBlockLayerParams(filters=10),
         TransposedConv2DLayerParams(filters=10),
         # corresponds to output of first and fourth layer
         ConcatLayerParams(concat_indices=[1, 4]),
         Conv2DLayerParams(filters=10),
         BiLSTMLayerParams(hidden_nodes=10),
     ]
     post_init(params)

     network_shorthand = [
         "--network",
         "conv=10,pool=2x2,db=10:2,tconv=10,concat=1:4,conv=10,lstm=10",
     ]
     per_layer_flags = [
         "--model.layers",
         "Conv", "Pool", "DilatedBlock", "TConv", "Concat", "Conv", "BiLSTM",
         "--model.layers.0.filters", "10",
         "--model.layers.2.filters", "10",
         "--model.layers.3.filters", "10",
         "--model.layers.4.concat_indices", "1", "4",
         "--model.layers.5.filters", "10",
         "--model.layers.6.hidden_nodes", "10",
     ]
     for cli_args in (network_shorthand, per_layer_flags):
         parsed = parse_args(cli_args)
         self.assertDictEqual(params.scenario.model.to_dict(),
                              parsed.scenario.model.to_dict())

     with tempfile.TemporaryDirectory() as out_dir:
         params.output_dir = out_dir
         main(params)
Ejemplo n.º 11
0
 def test_concat_cnn_architecture(self):
     """Run ``main`` with a layer stack that contains a concat layer."""
     params = uw3_trainer_params()
     # corresponds to output of first and fourth layer
     concat = ConcatLayerParams(concat_indices=[1, 4])
     params.scenario.model.layers = [
         Conv2DLayerParams(filters=10),
         MaxPool2DLayerParams(),
         DilatedBlockLayerParams(filters=10),
         TransposedConv2DLayerParams(filters=10),
         concat,
         Conv2DLayerParams(filters=10),
         BiLSTMLayerParams(hidden_nodes=10),
     ]
     with tempfile.TemporaryDirectory() as out_dir:
         params.output_dir = out_dir
         main(params)
Ejemplo n.º 12
0
 def test_model_zoo(self):
     """Evaluate the ``antiqua_modern`` model-zoo checkpoints and bound their error.

     Downloads and unpacks the ``calamari_models`` release archive into a
     local cache directory unless it is already there, runs
     ``predict_and_eval_main`` with every ``*.ckpt.json`` found and asserts
     that the voted ``avg_ler`` is below 0.001.
     """
     version = "1.0"
     url = f"https://github.com/Calamari-OCR/calamari_models/archive/{version}.tar.gz"
     # Remember where we started: the test changes the working directory and
     # must not leak that change into other tests.
     previous_cwd = os.getcwd()
     with tempfile.TemporaryDirectory() as d:
         d = "model_archive_permanent"  # for debugging
         os.makedirs(d, exist_ok=True)
         os.chdir(d)
         try:
             if not os.path.exists("calamari_models"):
                 # Fixed URL/version only, so building a shell string is safe
                 # here. Note: the pipeline's status is that of ``tar``.
                 check_call(
                     [
                         "sh",
                         "-c",
                         " ".join(
                             [
                                 "wget",
                                 "-q",
                                 "-O",
                                 "-",
                                 url,
                                 "|",
                                 "tar",
                                 "xz",
                                 "&&",
                                 "mv",
                                 f"calamari_models-{version}",
                                 "calamari_models",
                             ]
                         ),
                     ]
                 )
             trainer_params = uw3_trainer_params(with_validation=True)
             # The glob is relative, so evaluation has to run while the
             # working directory is still the archive directory.
             args = PredictAndEvalArgs(
                 checkpoint=glob(os.path.join("calamari_models", "antiqua_modern", "*.ckpt.json")),
                 predictor=PredictorParams(pipeline=DataPipelineParams(batch_size=5)),
                 data=trainer_params.gen.val_gen(),
             )
             full_evaluation = predict_and_eval_main(args)
         finally:
             # Restore the working directory before the temporary directory
             # is cleaned up and before any other test runs.
             os.chdir(previous_cwd)
     self.assertLess(
         full_evaluation["voted"]["eval"]["avg_ler"],
         0.001,
         "The error rate (avg_ler) on the test data must be below 0.1%",
     )
Ejemplo n.º 13
0
 def test_pure_cnn_architecture(self):
     """Check both CLI spellings of the pure-CNN network against the params, then run ``main``.

     The layer list built in code must serialise (``to_dict``) to the same
     model as ``--network ...`` and as the explicit ``--model.layers ...``
     flags.
     """
     params = uw3_trainer_params()
     strided_conv = Conv2DLayerParams(
         filters=20,
         strides=IntVec2D(2, 2),
         kernel_size=IntVec2D(4, 4),
     )
     params.scenario.model.layers = [
         Conv2DLayerParams(filters=10),
         MaxPool2DLayerParams(),
         strided_conv,
         Conv2DLayerParams(filters=30),
     ]
     post_init(params)

     network_shorthand = ["--network", "conv=10,pool=2x2,conv=20:4x4:2x2,conv=30"]
     per_layer_flags = [
         "--model.layers",
         "Conv", "Pool", "Conv", "Conv",
         "--model.layers.0.filters", "10",
         "--model.layers.2.filters", "20",
         "--model.layers.2.kernel_size.x", "4",
         "--model.layers.2.kernel_size.y", "4",
         "--model.layers.2.strides.x", "2",
         "--model.layers.2.strides.y", "2",
         "--model.layers.3.filters", "30",
     ]
     for cli_args in (network_shorthand, per_layer_flags):
         parsed = parse_args(cli_args)
         self.assertDictEqual(params.scenario.model.to_dict(),
                              parsed.scenario.model.to_dict())

     with tempfile.TemporaryDirectory() as out_dir:
         params.output_dir = out_dir
         main(params)
Ejemplo n.º 14
0
 def test_dilated_block_architecture(self):
     """Check both CLI spellings of the dilated-block network against the params, then run ``main``.

     The layer list built in code must serialise (``to_dict``) to the same
     model as ``--network ...`` and as the explicit ``--model.layers ...``
     flags.
     """
     params = uw3_trainer_params()
     layers = [
         Conv2DLayerParams(filters=10),
         MaxPool2DLayerParams(strides=IntVec2D(2, 2)),
     ]
     layers.extend(DilatedBlockLayerParams(filters=10) for _ in range(2))
     layers.append(Conv2DLayerParams(filters=10))
     params.scenario.model.layers = layers
     post_init(params)

     network_shorthand = ["--network", "conv=10,pool=2x2:2x2,db=10:2,db=10:2,conv=10"]
     per_layer_flags = [
         "--model.layers",
         "Conv", "Pool", "DilatedBlock", "DilatedBlock", "Conv",
         "--model.layers.0.filters", "10",
         "--model.layers.1.strides", "2", "2",
         "--model.layers.2.filters", "10",
         "--model.layers.3.filters", "10",
         "--model.layers.4.filters", "10",
     ]
     for cli_args in (network_shorthand, per_layer_flags):
         parsed = parse_args(cli_args)
         self.assertDictEqual(params.scenario.model.to_dict(),
                              parsed.scenario.model.to_dict())

     with tempfile.TemporaryDirectory() as out_dir:
         params.output_dir = out_dir
         main(params)
Ejemplo n.º 15
0
 def test_pure_lstm_architecture(self):
     """Check both CLI spellings of the two-BiLSTM network against the params, then run ``main``.

     The layer list built in code must serialise (``to_dict``) to the same
     model as ``--network ...`` and as the explicit ``--model.layers ...``
     flags.
     """
     params = uw3_trainer_params()
     params.scenario.model.layers = [
         BiLSTMLayerParams(hidden_nodes=size) for size in (10, 20)
     ]
     post_init(params)

     network_shorthand = ["--network", "lstm=10,lstm=20"]
     per_layer_flags = [
         "--model.layers",
         "BiLSTM", "BiLSTM",
         "--model.layers.0.hidden_nodes", "10",
         "--model.layers.1.hidden_nodes", "20",
     ]
     for cli_args in (network_shorthand, per_layer_flags):
         parsed = parse_args(cli_args)
         self.assertDictEqual(params.scenario.model.to_dict(),
                              parsed.scenario.model.to_dict())

     with tempfile.TemporaryDirectory() as out_dir:
         params.output_dir = out_dir
         main(params)
Ejemplo n.º 16
0
 def test_default_architecture(self):
     """Run ``main`` with the layers returned by ``default_layers()``."""
     params = uw3_trainer_params()
     layers = default_layers()
     params.scenario.model.layers = layers
     with tempfile.TemporaryDirectory() as out_dir:
         params.output_dir = out_dir
         main(params)
Ejemplo n.º 17
0
def default_trainer_params(*, preload=True):
    """Return UW3 trainer params (validation enabled) warm-started from ``best.ckpt.json``.

    :param preload: forwarded to ``uw3_trainer_params``.
    """
    params = uw3_trainer_params(with_validation=True, preload=preload)
    warmstart_checkpoint = os.path.join(this_dir, 'models', 'best.ckpt.json')
    params.warmstart.model = warmstart_checkpoint
    return params