def test_data_generator(self):
    """Pipeline samples must support early termination and full consumption.

    The RepeatSampleProcessor (f=100, add_per_step=7) expands every generated
    number ``i`` into the 100 samples ``i, i + 7, ..., i + 99 * 7`` (same
    semantics asserted by ``test_data_preload``).

    The original test computed both outputs but asserted nothing and discarded
    the first result; assertions are added for both runs.
    """
    numbers = list(range(10))
    data_params = DataParams(
        pre_proc=ComposedProcessorPipelineParams(
            pipelines=[
                SequentialProcessorPipelineParams(
                    max_tasks_per_process=10000,
                    run_parallel=True,
                    num_threads=1,  # a single worker keeps the sample order deterministic
                    processors=[
                        RepeatSampleProcessorParams(f=100, add_per_step=7),
                    ],
                )
            ]
        )
    )
    data = data_params.create()

    # Early termination: consume only the first 10 of the 1000 available samples.
    with data.create_pipeline(
        DataPipelineParams(mode=PipelineMode.TRAINING),
        SimpleDataGeneratorParams(numbers_to_generate=numbers),
    ).generate_input_samples(auto_repeat=False) as samples:
        gen = zip(range(10), samples)
        out = [s.inputs for _, s in gen]
    # The first generated number (0) is repeated 100 times adding 7 per step,
    # so the first 10 samples are 0, 7, ..., 63.
    self.assertListEqual([x * 7 for x in range(10)], out)

    # Full consumption: every one of the 10 numbers yields 100 samples.
    with data.create_pipeline(
        DataPipelineParams(mode=PipelineMode.TRAINING),
        SimpleDataGeneratorParams(numbers_to_generate=numbers),
    ).generate_input_samples(auto_repeat=False) as samples:
        out = [s.inputs for s in samples]
    self.assertListEqual([i + x * 7 for i in numbers for x in range(100)], out)
def default_trainer_params(cls):
    """Trainer configuration for a minimal single-sample smoke run."""
    params = super().default_trainer_params()
    # One sample per batch; validation restricted to a single sample.
    params.gen.setup.train = DataPipelineParams(batch_size=1)
    params.gen.setup.val = DataPipelineParams(limit=1, batch_size=1)
    # Re-run the post-init hook so the freshly assigned pipelines are set up
    # consistently (e.g. their modes).
    params.gen.__post_init__()
    params.gen.train_val.dataset = "fashion_mnist"
    params.skip_model_load_test = True
    params.random_seed = 1337
    params.force_eager = False
    params.epochs = 1
    params.samples_per_epoch = 1
    return params
def default_trainer_params(cls):
    """Trainer configuration for a short, deterministic multi-epoch run."""
    params = super().default_trainer_params()
    params.gen.setup.train = DataPipelineParams(limit=10, batch_size=1)
    params.gen.setup.val = DataPipelineParams(limit=10, batch_size=1)
    # Re-run the post-init hook so the freshly assigned pipelines are set up.
    params.gen.__post_init__()
    params.gen.train_val.dataset = "fashion_mnist"
    params.gen.train_val.force_train = True  # Use always training data ...
    params.gen.train_val.shuffle = False  # ... and dont shuffle the training data
    params.skip_model_load_test = True
    params.random_seed = 1337
    params.force_eager = False
    params.epochs = 5
    params.samples_per_epoch = 10
    return params
class TrainerPipelines:
    """Pair of data pipeline configurations used by the trainer.

    ``train`` always runs in TRAINING mode and ``val`` in EVALUATION mode.
    ``__post_init__`` re-asserts both modes so the invariant holds even when a
    caller replaces a field with a pipeline configured with a different mode.
    """

    # Pipeline feeding the training loop (mode is forced to TRAINING).
    train: DataPipelineParams = field(
        default_factory=lambda: DataPipelineParams(mode=PipelineMode.TRAINING),
        metadata=pai_meta(fix_dc=True, mode="flat"),
    )
    # Pipeline feeding validation (mode is forced to EVALUATION).
    val: DataPipelineParams = field(
        default_factory=lambda: DataPipelineParams(mode=PipelineMode.EVALUATION),
        metadata=pai_meta(fix_dc=True, mode="flat"),
    )

    def __post_init__(self):
        # Enforce the mode invariant regardless of what was assigned.
        self.train.mode = PipelineMode.TRAINING
        self.val.mode = PipelineMode.EVALUATION
def test_model_zoo(self):
    """Download the released calamari models and verify near-perfect accuracy.

    Fixes: the original wrapped the body in ``tempfile.TemporaryDirectory()``
    but immediately discarded it in favor of a fixed cache directory, and it
    never restored the working directory after ``os.chdir``, leaking the cwd
    change into subsequent tests.
    """
    version = '1.0'
    url = f"https://github.com/Calamari-OCR/calamari_models/archive/{version}.tar.gz"
    # A fixed cache directory is used (instead of a temp dir) so the large
    # model download survives between test runs.
    d = 'model_archive_permanent'
    os.makedirs(d, exist_ok=True)
    cwd = os.getcwd()
    os.chdir(d)
    try:
        if not os.path.exists('calamari_models'):
            # Stream the tarball through tar; the shell is needed for the pipe.
            check_call([
                'sh', '-c',
                ' '.join([
                    'wget', '-q', '-O', '-', url, '|',
                    'tar', 'xz', '&&',
                    'mv', f'calamari_models-{version}', 'calamari_models',
                ]),
            ])
        trainer_params = uw3_trainer_params(with_validation=True)
        args = PredictAndEvalArgs(
            checkpoint=glob(os.path.join('calamari_models', 'antiqua_modern', '*.ckpt.json')),
            predictor=PredictorParams(pipeline=DataPipelineParams(batch_size=5)),
            data=trainer_params.gen.val_gen(),
        )
        full_evaluation = predict_and_eval_main(args)
        self.assertLess(
            full_evaluation['voted']['eval']['avg_ler'],
            0.001,
            "The accuracy on the test data must be below 0.1%",
        )
    finally:
        # Restore the working directory so later tests are unaffected.
        os.chdir(cwd)
def test_data_sequential_pipeline(self):
    """A single sequential pipeline applies its processors in declaration order."""
    numbers = list(range(10))

    # Mirror the processor chain by hand: +1, +3, repeat each sample twice
    # (adding 7 to the second copy), *3, +1, then drop every even result.
    expected = []
    for n in numbers:
        expected.extend((n + 1 + 3, n + 1 + 3 + 7))
    expected = [3 * n + 1 for n in expected]
    expected = [n for n in expected if n % 2 == 1]

    data_params = DataParams(
        pre_proc=SequentialProcessorPipelineParams(
            run_parallel=False,
            num_threads=3,
            processors=[
                AddProcessorParams(v=1),
                AddProcessorParams(v=3),
                RepeatSampleProcessorParams(f=2, add_per_step=7),
                MultiplyProcessorParams(f=3),
                AddProcessorParams(v=1),
                DropIfEvenProcessorParams(),
            ],
        )
    )
    data = data_params.create()
    pipeline = data.create_pipeline(
        DataPipelineParams(mode=PipelineMode.TRAINING),
        SimpleDataGeneratorParams(numbers_to_generate=numbers),
    )
    with pipeline.generate_input_samples(auto_repeat=False) as samples:
        produced = [s.inputs for s in samples]
        self.assertListEqual(expected, produced)
def __init__(self, settings: AlgorithmPredictorSettings):
    """Set up the calamari multi-predictor with a confidence CTC voter.

    NOTE(review): an earlier revision normalized a CTC-decoder dictionary
    (hyphenation resource) before prediction; that path was disabled and has
    been removed here. Dictionary-based correction now only happens through
    ``self.dict_corrector`` below.
    """
    super().__init__(settings)

    # Vote across all loaded checkpoints using the default confidence CTC voter.
    voter_params = VoterParams()
    voter_params.type = VoterParams.type.ConfidenceVoterDefaultCTC

    self.predictor = MultiPredictor.from_paths(
        checkpoints=glob_all([settings.model.local_file('text.ckpt.json')]),
        voter_params=voter_params,
        predictor_params=PredictorParams(
            silent=True,
            progress_bar=True,
            pipeline=DataPipelineParams(batch_size=1, mode=PipelineMode("prediction")),
        ),
    )
    self.voter = voter_from_params(voter_params)

    # Optional dictionary-based post-correction of the recognized text.
    self.dict_corrector = None
    if settings.params.useDictionaryCorrection:
        self.dict_corrector = DictionaryCorrector()
def test_predict_and_eval_uw3_with_voting(self):
    """Run predict+eval with three copies of one checkpoint to exercise voting."""
    from calamari_ocr.test.test_train_file import uw3_trainer_params

    ckpt = os.path.join(this_dir, "models", "best.ckpt")
    trainer_params = uw3_trainer_params(with_validation=True)
    main(
        PredictAndEvalArgs(
            checkpoint=[ckpt] * 3,  # identical checkpoints are enough for voting
            predictor=PredictorParams(pipeline=DataPipelineParams(batch_size=5)),
            data=trainer_params.gen.val_gen(),
        )
    )
def test_predict_and_eval_hdf5(self):
    """Run predict+eval on the hdf5 validation data with a single checkpoint."""
    from calamari_ocr.test.test_train_hdf5 import default_trainer_params

    ckpt = os.path.join(this_dir, "models", "best.ckpt")
    trainer_params = default_trainer_params(with_validation=True)
    main(
        PredictAndEvalArgs(
            checkpoint=[ckpt],
            predictor=PredictorParams(pipeline=DataPipelineParams(num_processes=1)),
            data=trainer_params.gen.val_gen(),
        )
    )
def test_standalone_pipeline(self):
    """A pre-proc pipeline can be created and applied without a full scenario.

    ``cls`` raises on purpose: only the pipeline machinery is exercised, the
    data class itself must never be instantiated here.
    """
    from tfaip.imports import DataBaseParams

    class TestDataParams(DataBaseParams):
        @staticmethod
        def cls():
            raise NotImplementedError

    data_params = TestDataParams()
    samples = [Sample()] * 100
    pipeline = data_params.pre_proc.create(DataPipelineParams(num_processes=8), data_params)
    # Drain the pipeline; printing is enough, there is nothing to assert on
    # empty samples.
    for idx, sample in enumerate(pipeline.apply(samples)):
        print(idx, sample)
def test_data_composed_pipeline(self):
    """A composed pipeline chains several sequential pipelines in order."""
    numbers = list(range(10))

    # Mirror the overall chain: +1, then +3, repeat twice (+7 on the copy),
    # *2, then +1, finally drop every even result.
    expected = []
    for n in numbers:
        expected.extend((n + 1 + 3, n + 1 + 3 + 7))
    expected = [2 * n + 1 for n in expected]
    expected = [n for n in expected if n % 2 == 1]

    data_params = DataParams(
        pre_proc=ComposedProcessorPipelineParams(
            pipelines=[
                SequentialProcessorPipelineParams(
                    run_parallel=False,
                    num_threads=3,
                    processors=[
                        AddProcessorParams(v=1),
                    ],
                ),
                SequentialProcessorPipelineParams(
                    num_threads=1,  # Deterministic
                    processors=[
                        # Test that generator with pre- and post-processor works
                        AddProcessorParams(v=3),
                        RepeatSampleProcessorParams(f=2, add_per_step=7),
                        MultiplyProcessorParams(f=2),
                    ],
                ),
                SequentialProcessorPipelineParams(
                    run_parallel=True,
                    processors=[
                        AddProcessorParams(v=1),
                    ],
                ),
                SequentialProcessorPipelineParams(
                    num_threads=1,  # Deterministic
                    processors=[
                        DropIfEvenProcessorParams(),
                    ],
                ),
            ]
        )
    )
    data = data_params.create()
    with data.create_pipeline(
        DataPipelineParams(mode=PipelineMode.TRAINING, use_shared_memory=True),
        SimpleDataGeneratorParams(numbers_to_generate=numbers),
    ).generate_input_samples(auto_repeat=False) as samples:
        produced = [s.inputs for s in samples]
        self.assertListEqual(expected, produced)
def run_test(test, parallel, n_numbers=100):
    """Check that the sample generator and the batched dataset agree.

    Runs the same batched pipeline twice — once via ``generate_input_samples``
    and once via ``generate_input_batches`` — and asserts both produce the
    expected numbers and identical inputs/targets/meta per batch.
    """
    numbers = list(range(n_numbers))

    # Mirror the processor chain: +1, +3, *3, +1, then group into batches.
    expected = [3 * (n + 1 + 3) + 1 for n in numbers]
    expected = groups_into_samples(expected)

    data_params = DataParams(
        pre_proc=SequentialProcessorPipelineParams(
            run_parallel=parallel,
            num_threads=3,
            processors=[
                AddProcessorParams(v=1),
                AddProcessorParams(v=3),
                MultiplyProcessorParams(f=3),
                AddProcessorParams(v=1),
                PrepareParams(),
            ],
        )
    )
    data = data_params.create()
    pipeline = data.create_pipeline(
        DataPipelineParams(mode=PipelineMode.TRAINING),
        BatchedDataGeneratorParams(numbers_to_generate=numbers),
    )

    # Test generate input samples
    with pipeline.generate_input_samples(auto_repeat=False) as batched_samples:
        batched_samples = list(batched_samples)
        out = [list(np.squeeze(x.inputs["n"], axis=-1)) for x in batched_samples]
        test.assertListEqual(expected, out)

    # Test dataset
    with pipeline.generate_input_batches(auto_repeat=False) as batched_samples_ds:
        out = [list(np.squeeze(i["n"], axis=-1)) for i, t, m in batched_samples_ds]
        test.assertListEqual(expected, out)

        # Decode the raw meta bytes so they compare equal to the samples' meta.
        # NOTE(review): this relies on batched_samples_ds being re-iterable and
        # on the mutation being visible to the zip below — confirm with the
        # generate_input_batches contract.
        for s in batched_samples_ds:
            s[2]["meta"] = np.array([[d[0].decode("utf-8")] for d in s[2]["meta"]])

        def check_equal(s1: dict, s2: dict):
            # Same keys, and array-equal values for every key.
            test.assertListEqual(list(s1.keys()), list(s2.keys()))
            for k in s1.keys():
                numpy.testing.assert_array_equal(s1[k], s2[k])

        for s1, s2 in zip(batched_samples, batched_samples_ds):
            check_equal(s1.inputs, s2[0])
            check_equal(s1.targets, s2[1])
            check_equal(s1.meta, s2[2])
def test_data_preload(self):
    """preload_input_samples materializes all fully pre-processed samples."""
    numbers = list(range(10))
    data_params = DataParams(
        pre_proc=SequentialProcessorPipelineParams(
            max_tasks_per_process=10000,
            run_parallel=True,
            num_threads=1,  # deterministic
            processors=[
                RepeatSampleProcessorParams(f=10, add_per_step=7),
            ],
        )
    )
    data = data_params.create()
    preloaded = data.create_pipeline(
        DataPipelineParams(mode=PipelineMode.TRAINING),
        SimpleDataGeneratorParams(numbers_to_generate=numbers),
    ).preload_input_samples()
    actual = [int(s.inputs) for s in preloaded]
    # Every number i expands into the 10 samples i, i + 7, ..., i + 9 * 7.
    expected = [i + x * 7 for i in numbers for x in range(10)]
    self.assertListEqual(actual, expected)
def run_pad_test(test, n_numbers=1000):
    """Padded batches from the sample generator and the dataset must agree."""
    numbers = list(range(n_numbers))
    data = DataPadParams().create()

    def as_tuple(sample):
        # Normalize a Sample to the (inputs, targets, meta) shape of the dataset.
        return sample.inputs, sample.targets, sample.meta

    pipeline = data.create_pipeline(
        DataPipelineParams(mode=PipelineMode.TRAINING),
        BatchedPadDataGeneratorParams(numbers_to_generate=numbers),
    )

    # Test generate input samples, and dataset
    with pipeline.generate_input_samples(auto_repeat=False) as batched_samples, pipeline.generate_input_batches(
        auto_repeat=False
    ) as batched_from_ds:
        batched_from_ds = list(batched_from_ds)
        for (i1, t1, _m1), (i2, t2, _m2) in zip(map(as_tuple, batched_samples), batched_from_ds):
            np.testing.assert_array_equal(i1["n"], i2["n"])
            np.testing.assert_array_equal(t1["n"], t2["n"])
def test_model_zoo(self):
    """Download the released calamari models and verify near-perfect accuracy.

    Fixes: the original wrapped the body in ``tempfile.TemporaryDirectory()``
    but immediately discarded it in favor of a fixed cache directory, and it
    never restored the working directory after ``os.chdir``, leaking the cwd
    change into subsequent tests.
    """
    version = "1.0"
    url = f"https://github.com/Calamari-OCR/calamari_models/archive/{version}.tar.gz"
    # A fixed cache directory is used (instead of a temp dir) so the large
    # model download survives between test runs.
    d = "model_archive_permanent"
    os.makedirs(d, exist_ok=True)
    cwd = os.getcwd()
    os.chdir(d)
    try:
        if not os.path.exists("calamari_models"):
            # Stream the tarball through tar; the shell is needed for the pipe.
            check_call(
                [
                    "sh",
                    "-c",
                    " ".join(
                        [
                            "wget",
                            "-q",
                            "-O",
                            "-",
                            url,
                            "|",
                            "tar",
                            "xz",
                            "&&",
                            "mv",
                            f"calamari_models-{version}",
                            "calamari_models",
                        ]
                    ),
                ]
            )
        trainer_params = uw3_trainer_params(with_validation=True)
        args = PredictAndEvalArgs(
            checkpoint=glob(os.path.join("calamari_models", "antiqua_modern", "*.ckpt.json")),
            predictor=PredictorParams(pipeline=DataPipelineParams(batch_size=5)),
            data=trainer_params.gen.val_gen(),
        )
        full_evaluation = predict_and_eval_main(args)
        self.assertLess(
            full_evaluation["voted"]["eval"]["avg_ler"],
            0.001,
            "The accuracy on the test data must be below 0.1%",
        )
    finally:
        # Restore the working directory so later tests are unaffected.
        os.chdir(cwd)
def default_trainer_params(cls):
    """Trainer configuration with parallel pre-processing disabled."""
    params = super().default_trainer_params()
    params.gen.setup.train = DataPipelineParams(batch_size=1)
    params.gen.setup.val = DataPipelineParams(limit=5, batch_size=1)
    # Run the pre-processing pipeline in-process (no worker parallelism).
    params.scenario.data.pre_proc.run_parallel = False
    return params