def test_eval_files_with_different_sources(self):
    run_predict(
        predict_args(
            data=FileDataParams(
                pred_extension=".ext-pred.txt",
                images=sorted(glob_all([os.path.join(this_dir, "data", "uw3_50lines", "test", "*.png")])),
            )
        )
    )
    r = run_eval(
        eval_args(
            gt_data=FileDataParams(
                texts=sorted(glob_all([os.path.join(this_dir, "data", "uw3_50lines", "test", "*.gt.txt")])),
            ),
            pred_data=FileDataParams(
                texts=sorted(glob_all([os.path.join(this_dir, "data", "uw3_50lines", "test", "*.ext-pred.txt")])),
            ),
        )
    )
    self.assertLess(r["avg_ler"], 0.0009, msg="Current best model yields about 0.09% CER")

def test_eval_list_files(self):
    run_predict(
        predict_args(
            data=FileDataParams(
                images=sorted(glob_all([os.path.join(this_dir, "data", "uw3_50lines", "test.files")])),
            )
        )
    )
    r = run_eval(
        eval_args(
            gt_data=FileDataParams(
                texts=sorted(glob_all([os.path.join(this_dir, "data", "uw3_50lines", "test.gt.files")])),
            )
        )
    )
    self.assertLess(r["avg_ler"], 0.0009, msg="Current best model yields about 0.09% CER")

def uw3_trainer_params(with_validation=False, with_split=False, preload=True, debug=False):
    p = CalamariTestScenario.default_trainer_params()
    p.scenario.debug_graph_construction = debug
    p.force_eager = debug

    train = FileDataParams(
        images=glob_all([os.path.join(this_dir, "data", "uw3_50lines", "train", "*.png")]),
        preload=preload,
    )
    if with_split:
        p.gen = CalamariSplitTrainerPipelineParams(validation_split_ratio=0.2, train=train)
    elif with_validation:
        p.gen.val.images = glob_all([os.path.join(this_dir, "data", "uw3_50lines", "test", "*.png")])
        p.gen.val.preload = preload
        p.gen.train = train
        p.gen.__post_init__()
    else:
        p.gen = CalamariTrainOnlyPipelineParams(train=train)

    p.gen.setup.val.batch_size = 1
    p.gen.setup.val.num_processes = 1
    p.gen.setup.train.batch_size = 1
    p.gen.setup.train.num_processes = 1
    post_init(p)
    return p

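# A minimal sketch of how these trainer params are typically consumed. The
# `create_trainer` classmethod is assumed from tfaip's scenario API; the repo
# may instead wrap training in its own script entry point.
#
#     params = uw3_trainer_params(with_split=True, preload=False)
#     params.epochs = 1  # keep the run cheap for a smoke test
#     trainer = CalamariTestScenario.create_trainer(params)
#     trainer.train()
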
def test_prediction_files_with_different_extension(self):
    run_predict(
        predict_args(
            data=FileDataParams(
                pred_extension=".ext-pred.txt",
                images=sorted(glob_all([os.path.join(this_dir, "data", "uw3_50lines", "test", "*.png")])),
            )
        )
    )
    run_eval(
        eval_args(
            gt_data=FileDataParams(
                pred_extension=".ext-pred.txt",
                texts=sorted(glob_all([os.path.join(this_dir, "data", "uw3_50lines", "test", "*.gt.txt")])),
            )
        )
    )

def setup_trainer_params(preload=True, debug=False):
    p = CalamariTestEnsembleScenario.default_trainer_params()
    p.force_eager = debug

    p.gen.train = FileDataParams(
        images=glob_all([os.path.join(this_dir, "data", "uw3_50lines", "train", "*.png")]),
        preload=preload,
    )

    post_init(p)
    return p

def test_prediction_files(self):
    run_predict(
        predict_args(
            data=FileDataParams(
                images=sorted(glob_all([os.path.join(this_dir, "data", "uw3_50lines", "test", "*.png")])),
            )
        )
    )
    run_eval(
        eval_args(
            gt_data=FileDataParams(
                texts=sorted(glob_all([os.path.join(this_dir, "data", "uw3_50lines", "test", "*.gt.txt")])),
            )
        )
    )
    args = eval_args(
        gt_data=FileDataParams(
            texts=sorted(glob_all([os.path.join(this_dir, "data", "uw3_50lines", "test", "*.gt.txt")])),
        )
    )
    with tempfile.TemporaryDirectory() as d:
        args.xlsx_output = os.path.join(d, "output.xlsx")
        run_eval(args)

def test_eval_files(self):
    run_predict(
        predict_args(
            data=FileDataParams(
                images=sorted(glob_all([os.path.join(this_dir, "data", "uw3_50lines", "test", "*.png")])),
            )
        )
    )
    r = run_eval(
        eval_args(
            gt_data=FileDataParams(
                texts=sorted(glob_all([os.path.join(this_dir, "data", "uw3_50lines", "test", "*.gt.txt")])),
            )
        )
    )
    self.assertLess(r["avg_ler"], 0.0009, msg="Current best model yields about 0.09% CER")

    args = eval_args(
        gt_data=FileDataParams(
            texts=sorted(glob_all([os.path.join(this_dir, "data", "uw3_50lines", "test", "*.gt.txt")])),
        )
    )
    with tempfile.TemporaryDirectory() as d:
        args.xlsx_output = os.path.join(d, "output.xlsx")
        run_eval(args)

def write(self, data, text):
    # Hdf5DatasetWriter.write: buffer one (image, transcription) sample;
    # a full chunk is flushed to disk once n_max samples have accumulated.
    if data.dtype != np.uint8:
        raise TypeError("Data for hdf5 must have type np.uint8")

    self.data.append(data)
    self.text.append(text)

    if len(self.data) >= self.n_max:
        self.finish_chunck()


if __name__ == "__main__":
    from calamari_ocr.ocr.dataset.datareader.file import FileDataParams

    # Convert the UW3 50-lines training set into an HDF5 dataset.
    dg = FileDataParams(images="calamari_ocr/test/data/uw3_50lines/train/*.png").create(PipelineMode.TRAINING)
    with Hdf5DatasetWriter("calamari_ocr/test/data/uw3_50lines/uw3-50lines.h5", n_max=1000) as writer:
        for sample in dg.generate():
            writer.write(sample.inputs, sample.targets)

    from contextlib import ExitStack

    # Smoke test: write a few dummy lines of varying shape.
    with Hdf5DatasetWriter("test", n_max=5) as writer:
        writer.write(np.zeros((10, 10), dtype=np.uint8), "test")
        writer.write(np.zeros((10, 15), dtype=np.uint8), "asdfasd")
        writer.write(np.zeros((1, 10), dtype=np.uint8), "te345")

    writers = [
        Hdf5DatasetWriter("test1", n_max=5),
    ]  # the source is truncated here; the list is closed with its one surviving entry

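# To sanity-check what the writer produced, a plain h5py sketch works; the
# file path is taken from the block above, but the internal dataset layout
# below is an assumption, not Calamari's documented format.
#
#     import h5py
#
#     with h5py.File("calamari_ocr/test/data/uw3_50lines/uw3-50lines.h5", "r") as f:
#         f.visit(print)  # print every group/dataset name to discover the layout
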
    return self.data_pipeline_cls()(
        pipeline_params,
        self,
        generator_params=params,
    )


if __name__ == "__main__":
    from calamari_ocr.ocr import Codec

    this_dir = os.path.dirname(os.path.realpath(__file__))
    base_path = os.path.abspath(os.path.join(this_dir, "..", "..", "test", "data", "uw3_50lines", "train"))

    fdr = FileDataParams(
        num_processes=8,
        images=[os.path.join(base_path, "*.png")],
        limit=1000,
    )
    params = DataParams(
        codec=Codec("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 .,:;-?+=_()*{}[]`@#$%^&'\""),
        downscale_factor=4,
        line_height=48,
        pre_proc=SequentialProcessorPipelineParams(
            run_parallel=True,
            processors=default_image_processors()
            + default_text_pre_processors()
            + [
                AugmentationProcessorParams(
                    modes={PipelineMode.TRAINING},
                    data_aug_params=DataAugmentationAmount(amount=2),
                ),
            ],  # the source is truncated here; the processor list is closed minimally
        ),
    )

def main():
    parser = argparse.ArgumentParser(description="Write split of folds to separate directories")
    parser.add_argument(
        "--files",
        nargs="+",
        help="List all image files that shall be processed. Ground truth files with the same "
        "base name but with '.gt.txt' as extension are required at the same location",
    )
    parser.add_argument(
        "--n_folds",
        type=int,
        required=True,
        help="The number of folds, i.e. the number of models to train",
    )
    parser.add_argument("--output_dir", type=str, required=True, help="Where to write the folds")
    parser.add_argument(
        "--keep_original_filename",
        action="store_true",
        help="By default the copied files get a new 8-digit name. Use this flag to keep the "
        "original name, but be aware that this might overwrite lines with the same name",
    )
    args = parser.parse_args()

    logger.info("Creating folds")
    images = glob_all(args.files)
    texts = [split_all_ext(p)[0] + ".gt.txt" for p in images]
    data_reader = FileDataParams(images=images, texts=texts, skip_invalid=True)
    data_reader.prepare_for_mode(PipelineMode.TRAINING)
    cross_fold = CrossFold(
        n_folds=args.n_folds,
        data_generator_params=data_reader,
        output_dir=args.output_dir,
    )

    logger.info("Copying files")
    for fold_id, fold_files in enumerate(cross_fold.folds):
        fold_out_dir = os.path.join(args.output_dir, str(fold_id))
        if not os.path.exists(fold_out_dir):
            os.makedirs(fold_out_dir)

        for file_id, file in tqdm(enumerate(fold_files), total=len(fold_files), desc=f"Fold {fold_id}"):
            img_file = file
            base, ext = split_all_ext(file)
            txt_file = base + ".gt.txt"

            # Number copies per file unless the original name is kept.
            output_basename = os.path.basename(base) if args.keep_original_filename else f"{file_id:08d}"
            if os.path.exists(img_file) and os.path.exists(txt_file):
                output_file = os.path.join(fold_out_dir, f"{output_basename}{ext}")
                shutil.copyfile(img_file, output_file)

                output_file = os.path.join(fold_out_dir, f"{output_basename}.gt.txt")
                shutil.copyfile(txt_file, output_file)
            else:
                logger.warning(f"Does not exist: {img_file} or {txt_file}")

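# Driven from Python, the entry point above can be exercised by patching
# sys.argv before calling main(); the program name, paths, and fold count
# below are illustrative.
#
#     import sys
#
#     sys.argv = [
#         "cross_fold_split",  # program name, ignored by argparse
#         "--files", "data/uw3_50lines/train/*.png",
#         "--n_folds", "5",
#         "--output_dir", "folds",
#     ]
#     main()
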
def file_dataset():
    return FileDataParams(
        images=sorted(glob_all([os.path.join(this_dir, "data", "uw3_50lines", "test", "*.png")])),
    )
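
# The fixture slots straight into the predict helper used by the tests above,
# e.g.:
#
#     run_predict(predict_args(data=file_dataset()))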