def from_checkpoint(params: PredictorParams, checkpoint: str, auto_update_checkpoints=True):
    """Create a Predictor from a stored Calamari checkpoint.

    Args:
        params: Predictor (device/runtime) parameters.
        checkpoint: Path to the checkpoint (json/h5 pair).
        auto_update_checkpoints: If True, upgrade the checkpoint to the
            current version on load.

    Returns:
        A ready-to-use Predictor with the model weights loaded.
    """
    # Upgrade (if requested) ONCE, up front, so that the scenario params are
    # parsed from the same (up-to-date) dict that the weights file belongs to.
    # Previously the checkpoint was loaded twice: the params were read from the
    # stale, non-upgraded dict while the weights came from the upgraded file.
    ckpt = SavedCalamariModel(checkpoint, auto_update=auto_update_checkpoints)
    trainer_params = Scenario.trainer_params_from_dict(ckpt.dict)
    # Prediction runs the data processors sequentially within the pipeline.
    trainer_params.scenario_params.data_params.pre_processors_.run_parallel = False
    trainer_params.scenario_params.data_params.post_processors_.run_parallel = False
    scenario = Scenario(trainer_params.scenario_params)
    predictor = Predictor(params, scenario.create_data())
    predictor.set_model(
        keras.models.load_model(
            ckpt.ckpt_path + '.h5',
            custom_objects=Scenario.model_cls().get_all_custom_objects(),
        )
    )
    return predictor
def __init__(self, params: TrainerParams, scenario, restore=False):
    """Train a DNN using given preprocessing, weights, and data.

    The Trainer implements the default training mechanism. It expects a
    dataset and hyperparameters as input. The overall steps are:

    1. Loading and preprocessing of the dataset
    2. Computation of the codec
    3. Construction of the DNN in the desired Deep Learning Framework
    4. Launch of the training

    If a validation dataset is provided, the Trainer runs validation checks
    during training to track the best model; progress is printed and
    checkpoints are written along the way.
    """
    super(Trainer, self).__init__(params, scenario, restore)
    self._params: TrainerParams = params

    # A negative save frequency means "save as often as early stopping evaluates".
    save_freq = self._params.checkpoint_save_freq
    if not isinstance(save_freq, str) and save_freq < 0:
        self._params.checkpoint_save_freq = self._params.early_stopping_params.frequency

    warmstart_model = self._params.warmstart.model
    self._params.warmstart.model = checkpoint_path(warmstart_model) if warmstart_model else None

    self.checkpoint = None
    if self._params.warmstart.model:
        # Loading is handled manually here: resolve (and possibly upgrade)
        # the warmstart checkpoint before the framework touches it.
        self.checkpoint = SavedCalamariModel(
            self._params.warmstart.model,
            auto_update=self._params.auto_upgrade_checkpoints,
        )
        self._params.warmstart.model = self.checkpoint.ckpt_path + ".h5"
        self._params.warmstart.trim_graph_name = False

    self._codec_changes = None
def from_paths(
    cls,
    checkpoints: List[str],
    auto_update_checkpoints=True,
    predictor_params: PredictorParams = None,
    voter_params: VoterParams = None,
    **kwargs,
) -> "tfaip_cls.MultiModelPredictor":
    """Build a multi-model (voting) predictor from several checkpoint paths.

    Args:
        checkpoints: Paths to the individual checkpoints.
        auto_update_checkpoints: Upgrade each checkpoint on load if True.
        predictor_params: Optional predictor params; a silent default with a
            progress bar is created when omitted.
        voter_params: Parameters forwarded to the voter.

    Returns:
        The constructed multi-model predictor.
    """
    if not checkpoints:
        raise Exception("No checkpoints provided.")

    if predictor_params is None:
        predictor_params = PredictorParams(silent=True, progress_bar=True)

    # Configure the device before any model is loaded.
    DeviceConfig(predictor_params.device)

    saved_models = [SavedCalamariModel(c, auto_update=auto_update_checkpoints) for c in checkpoints]
    json_paths = [m.json_path for m in saved_models]
    h5_paths = [m.ckpt_path + ".h5" for m in saved_models]
    return super(MultiPredictor, cls).from_paths(
        json_paths,
        predictor_params,
        CalamariScenario,
        model_paths=h5_paths,
        predictor_args={"voter_params": voter_params},
    )
def test_upgrade_2_to_4(self):
    """Copy a version-2 model into a scratch dir, upgrade it, and run prediction."""
    with tempfile.TemporaryDirectory() as tmp_dir:
        for name in ('0.ckpt.h5', '0.ckpt.json'):
            src = os.path.join(models_dir, 'version2', name)
            shutil.copyfile(src, os.path.join(tmp_dir, name))
        # Instantiating SavedCalamariModel triggers the upgrade as a side effect.
        upgraded = SavedCalamariModel(os.path.join(tmp_dir, '0.ckpt.json'))
        self.predict(upgraded.ckpt_path)
def test_upgrade_from_2(self):
    """Upgrade a version-2 checkpoint in a scratch directory, then predict and evaluate."""
    with tempfile.TemporaryDirectory() as tmp_dir:
        for name in ("0.ckpt.h5", "0.ckpt.json"):
            shutil.copyfile(
                os.path.join(models_dir, "version2", name),
                os.path.join(tmp_dir, name),
            )
        # Instantiating SavedCalamariModel triggers the upgrade as a side effect.
        upgraded = SavedCalamariModel(os.path.join(tmp_dir, "0.ckpt.json"))
        self.predict_and_eval(upgraded.ckpt_path)
def main():
    """Upgrade all checkpoints matched by the given glob patterns.

    With --dry_run the upgrade is only simulated (flag forwarded to
    SavedCalamariModel).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--checkpoints', nargs='+', type=str, required=True)
    parser.add_argument('--dry_run', action='store_true')
    args = parser.parse_args()

    for checkpoint in tqdm(glob_all(args.checkpoints)):
        base, _ = os.path.splitext(checkpoint)
        # Instantiating SavedCalamariModel performs the upgrade as a side effect.
        SavedCalamariModel(base, dry_run=args.dry_run)
def split(args):
    """Split an ensemble ("voter") checkpoint into its individual fold models.

    Loads the ensemble keras model, extracts one sub-model per fold (fold i is
    detected by the presence of outputs suffixed ``_i``), and writes each
    sub-model as a standalone ``*_split_{i}.ckpt`` json/h5 pair next to the
    original checkpoint.

    Args:
        args: Parsed CLI arguments; ``args.model`` is the ensemble checkpoint path.
    """
    ckpt = SavedCalamariModel(args.model)
    keras_model = keras.models.load_model(
        ckpt.ckpt_path + ".h5",
        custom_objects={
            "Graph": Graph,
            "EnsembleGraph": EnsembleGraph,
            "VoterGraph": EnsembleGraph,
        },
    )

    def extract_keras_model(i):
        # Build a sub-model that exposes only fold i's outputs;
        # returns None when fold i does not exist (no more folds).
        inputs = keras_model.input
        outputs = keras_model.output
        assert isinstance(outputs, dict)
        assert isinstance(inputs, dict)
        names_to_extract = [
            "blank_last_logits",
            "blank_last_softmax",
            "softmax",
            "decoded",
            "out_len",
        ]
        split_outputs = {}
        for name in names_to_extract:
            src_name = f"{name}_{i}"
            if src_name not in outputs:
                return None
            split_outputs[name] = outputs[src_name]
        return keras.Model(inputs=inputs, outputs=split_outputs)

    split_models: List[keras.Model] = []
    print("Starting to split models")
    # Extract folds 0, 1, 2, ... until no further fold outputs are found.
    while True:
        model = extract_keras_model(len(split_models))
        if model is None:
            break
        split_models.append(model)

    print(f"Split model into {len(split_models)}.")
    print(f"Saving models to {ckpt.dirname}/{ckpt.basename}_split_(i).ckpt")
    with open(ckpt.json_path) as f:
        ckpt_dict = json.load(f)

    # The extracted models are plain (non-ensemble) models; -1 appears to mark
    # "no ensemble" in the params dict — TODO confirm against the params schema.
    ckpt_dict["scenario_params"]["model_params"]["ensemble"] = -1
    ckpt_dict["scenario_params"]["data_params"]["ensemble_"] = -1
    for i, split_model in enumerate(split_models):
        path = os.path.join(ckpt.dirname, f"{ckpt.basename}_split_{i}.ckpt")
        # Each split model reuses the (patched) original params json.
        with open(path + ".json", "w") as f:
            json.dump(ckpt_dict, f, indent=2)
        split_model.save(path + ".h5")
        print(f"Saved {i + 1}/{len(split_models)}")
def from_checkpoint(params: PredictorParams, checkpoint: str, auto_update_checkpoints=True):
    """Create a Predictor for a single stored checkpoint.

    Args:
        params: Predictor (device/runtime) parameters.
        checkpoint: Path to the checkpoint (json/h5 pair).
        auto_update_checkpoints: Upgrade the checkpoint on load if True.

    Returns:
        A Predictor with the model weights loaded.
    """
    # Device must be specified first
    DeviceConfig(params.device)
    saved_model = SavedCalamariModel(checkpoint, auto_update=auto_update_checkpoints)
    scenario = CalamariScenario(CalamariScenario.params_from_dict(saved_model.dict))
    predictor = Predictor(params, scenario.create_data())
    keras_model = keras.models.load_model(
        saved_model.ckpt_path + '.h5',
        custom_objects=CalamariScenario.model_cls().all_custom_objects(),
    )
    predictor.set_model(keras_model)
    return predictor
def main(args: EvalArgs):
    """Evaluate predictions against ground truth and report error statistics.

    Computes the normalized label error rate, prints the most common
    confusions and worst lines, and optionally exports the results to xlsx.

    Args:
        args: Evaluation arguments (gt/pred datasets, optional checkpoint,
            output options).

    Returns:
        The evaluation result dict produced by ``Evaluator.run`` (contains at
        least ``avg_ler``, ``total_char_errs``, ``total_chars``,
        ``total_sync_errs``).
    """
    # Local imports (imports that require tensorflow)
    from calamari_ocr.ocr.scenario import CalamariScenario
    from calamari_ocr.ocr.dataset.data import Data
    from calamari_ocr.ocr.evaluator import Evaluator

    if args.checkpoint:
        # Reuse the data params (e.g. text processing) stored in the checkpoint
        # so the gt text is processed the same way as during training.
        saved_model = SavedCalamariModel(args.checkpoint, auto_update=True)
        trainer_params = CalamariScenario.trainer_cls().params_cls().from_dict(saved_model.dict)
        data_params = trainer_params.scenario.data
    else:
        data_params = Data.default_params()

    data = Data(data_params)
    # Fall back to deriving the prediction dataset from the gt when none given.
    pred_data = args.pred if args.pred is not None else args.gt.to_prediction()
    evaluator = Evaluator(args.evaluator, data=data)
    evaluator.preload_gt(gt_dataset=args.gt)
    r = evaluator.run(gt_dataset=args.gt, pred_dataset=pred_data)

    # TODO: More output
    print("Evaluation result")
    print("=================")
    print("")
    print(
        "Got mean normalized label error rate of {:.2%} ({} errs, {} total chars, {} sync errs)".format(
            r["avg_ler"], r["total_char_errs"], r["total_chars"], r["total_sync_errs"]
        )
    )

    # sort descending
    print_confusions(r, args.n_confusions)

    samples = data.create_pipeline(evaluator.params.setup, args.gt).reader().samples()
    print_worst_lines(r, samples, args.n_worst_lines)

    if args.xlsx_output:
        write_xlsx(
            args.xlsx_output,
            [
                {
                    "prefix": "evaluation",
                    "results": r,
                    "gt_files": [s["id"] for s in samples],
                }
            ],
        )

    return r
def split(args):
    """Split an ensemble ("voter") checkpoint into its individual fold models.

    Loads the ensemble keras model, extracts one sub-model per fold (fold i is
    detected by the presence of outputs suffixed ``_i``), and writes each
    sub-model as a standalone ``*_split_{i}.ckpt`` json/h5 pair next to the
    original checkpoint.

    Args:
        args: Parsed CLI arguments; ``args.model`` is the ensemble checkpoint path.
    """
    ckpt = SavedCalamariModel(args.model)
    keras_model = keras.models.load_model(ckpt.ckpt_path + '.h5', custom_objects={
        'Graph': Graph,
        'EnsembleGraph': EnsembleGraph,
        'VoterGraph': EnsembleGraph})

    def extract_keras_model(i):
        # Build a sub-model exposing only fold i's outputs;
        # returns None when fold i does not exist (no more folds).
        inputs = keras_model.input
        outputs = keras_model.output
        assert(isinstance(outputs, dict))
        assert(isinstance(inputs, dict))
        names_to_extract = ['blank_last_logits', 'blank_last_softmax', 'softmax', 'decoded', 'out_len']
        split_outputs = {}
        for name in names_to_extract:
            src_name = f"{name}_{i}"
            if src_name not in outputs:
                return None
            split_outputs[name] = outputs[src_name]
        return keras.Model(inputs=inputs, outputs=split_outputs)

    split_models: List[keras.Model] = []
    print("Starting to split models")
    # Extract folds 0, 1, 2, ... until no further fold outputs are found.
    while True:
        model = extract_keras_model(len(split_models))
        if model is None:
            break
        split_models.append(model)

    print(f"Split model into {len(split_models)}.")
    print(f"Saving models to {ckpt.dirname}/{ckpt.basename}_split_(i).ckpt")
    with open(ckpt.json_path) as f:
        ckpt_dict = json.load(f)

    # The extracted models are plain (non-ensemble) models; -1 appears to mark
    # "no ensemble" in the params dict — TODO confirm against the params schema.
    ckpt_dict['scenario_params']['model_params']['ensemble'] = -1
    ckpt_dict['scenario_params']['data_params']['ensemble_'] = -1
    for i, split_model in enumerate(split_models):
        path = os.path.join(ckpt.dirname, f"{ckpt.basename}_split_{i}.ckpt")
        # Each split model reuses the (patched) original params json.
        with open(path + '.json', 'w') as f:
            json.dump(ckpt_dict, f, indent=2)
        split_model.save(path + '.h5')
        print(f"Saved {i + 1}/{len(split_models)}")
def run(
        self,
        single_fold=None,
        seed=-1,
        weights=None,
        max_parallel_models=-1,
        temporary_dir=None,
        keep_temporary_files=False,
):
    """Run cross-fold training: write per-fold argument files and launch trainings.

    Splits the data into ``self.n_folds`` folds, writes one json argument file
    per fold into a temporary directory, and trains the folds in parallel
    (up to ``max_parallel_models`` at a time) via ``train_individual_model``.

    Args:
        single_fold: Optional list of fold ids to train; empty/None trains all.
        seed: Base random seed; fold i uses ``seed + i`` when seed >= 0.
        weights: None, one, or n_folds pretrained model paths for warmstart.
        max_parallel_models: Max concurrent trainings; <= 0 means n_folds.
        temporary_dir: Working directory; a fresh tmp dir is created if None.
        keep_temporary_files: Keep the working directory after training
            (requires an explicit temporary_dir).

    Raises:
        Exception: On invalid weights count, repeated/out-of-range fold ids,
            or keep_temporary_files without a temporary_dir.
    """
    # Default params
    single_fold = single_fold if single_fold else []
    weights = weights if weights else []
    if max_parallel_models <= 0:
        max_parallel_models = self.n_folds

    # argument checks
    if len(weights) > 1 and len(weights) != self.n_folds:
        raise Exception(
            "Either no, one or n_folds (={}) models are required for pretraining but got {}."
            .format(self.n_folds, len(weights)))

    if len(single_fold) > 0:
        if len(set(single_fold)) != len(single_fold):
            raise Exception("Repeated fold id's found.")
        for fold_id in single_fold:
            if fold_id < 0 or fold_id >= self.n_folds:
                raise Exception(
                    "Invalid fold id found: 0 <= id <= {}, but id == {}".format(self.n_folds, fold_id))

    # create temporary dir
    # by default, the temporary files will be deleted after a successful training
    # if you specify a temporary dir, you can easily resume to train if an error occurred
    if keep_temporary_files and not temporary_dir:
        raise Exception(
            "If you want to keep the temporary model files you have to specify a temporary dir")

    # temporary dir
    if temporary_dir is None:
        temporary_dir = tempfile.mkdtemp(prefix="calamari")
    else:
        temporary_dir = os.path.abspath(temporary_dir)

    if not os.path.exists(temporary_dir):
        os.makedirs(temporary_dir)

    # Compute the files in the cross fold (create a CrossFold)
    fold_file = os.path.join(temporary_dir, "folds.json")
    cross_fold = CrossFold(n_folds=self.n_folds, data_reader=self.data_reader,
                           output_dir=temporary_dir, progress_bar=self.progress_bars)
    cross_fold.write_folds_to_json(fold_file)

    # Create the json argument file for each individual training
    run_args = []
    folds_to_run = single_fold if len(single_fold) > 0 else range(len(cross_fold.folds))
    for fold in folds_to_run:
        train_files = cross_fold.train_files(fold)
        test_files = cross_fold.test_files(fold)
        path = os.path.join(temporary_dir, "fold_{}.json".format(fold))
        with open(path, 'w') as f:
            # Per-fold training arguments derived from the shared train_args.
            fold_args = self.train_args.copy()
            fold_args["dataset"] = cross_fold.dataset_type.name
            fold_args["validation_dataset"] = cross_fold.dataset_type.name
            fold_args["validation_extension"] = self.train_args['gt_extension']
            fold_args["id"] = fold
            fold_args["files"] = train_files
            fold_args["validation"] = test_files
            fold_args["train_script"] = self.train_script_path
            fold_args["verbose"] = True
            fold_args["output_dir"] = os.path.join(temporary_dir, "fold_{}".format(fold))
            fold_args["early_stopping_best_model_output_dir"] = self.best_models_dir
            fold_args["early_stopping_best_model_prefix"] = self.best_model_label.format(id=fold)
            fold_args['train_verbose'] = 2

            # Deterministic but distinct seed per fold.
            if seed >= 0:
                fold_args["seed"] = seed + fold

            # One shared pretrained model, one per fold, or none at all.
            if len(weights) == 1:
                fold_args["weights"] = weights[0]
            elif len(weights) > 1:
                fold_args["weights"] = weights[fold]
            else:
                fold_args["weights"] = None

            # start from scratch via None
            if fold_args["weights"]:
                if len(fold_args["weights"].strip()) == 0 or fold_args["weights"].upper() == "NONE":
                    fold_args["weights"] = None
                else:
                    # access model once to upgrade the model if necessary (can not be performed in parallel)
                    SavedCalamariModel(fold_args["weights"])

            # HDF5 dataset is already preloaded and does not require a extension anymore
            if cross_fold.dataset_type == DataSetType.HDF5:
                del fold_args["validation_extension"]
                del fold_args["gt_extension"]

            json.dump(
                fold_args,
                f,
                indent=4,
            )

        run_args.append({"json": path, "args": fold_args})

    # Launch the individual processes for each training
    with multiprocessing.pool.ThreadPool(processes=max_parallel_models) as pool:
        # workaround to forward keyboard interrupt
        pool.map_async(train_individual_model, run_args).get()

    if not keep_temporary_files:
        import shutil
        shutil.rmtree(temporary_dir)
def run(self):
    """Run cross-fold training: write per-fold trainer params and launch trainings.

    Splits the training data into ``self.params.n_folds`` folds, serializes a
    complete ``trainer_params`` json per fold into a temporary directory, and
    trains the folds in parallel via ``train_individual_model`` using a thread
    pool of size ``self.params.max_parallel_models``.
    """
    # temporary dir
    temporary_dir = self.params.temporary_dir
    if temporary_dir is None:
        temporary_dir = tempfile.mkdtemp(prefix="calamari")
    else:
        temporary_dir = os.path.abspath(temporary_dir)

    if not os.path.exists(temporary_dir):
        os.makedirs(temporary_dir)

    # Compute the files in the cross fold (create a CrossFold)
    fold_file = os.path.join(temporary_dir, "folds.json")
    cross_fold = CrossFold(
        n_folds=self.params.n_folds,
        data_generator_params=self.params.trainer.gen.train,
        output_dir=temporary_dir,
        progress_bar=self.params.trainer.progress_bar)
    cross_fold.write_folds_to_json(fold_file)

    # Create the json argument file for each individual training
    run_args = []
    seed = self.params.trainer.random_seed or -1
    folds_to_run = self.params.single_fold if len(self.params.single_fold) > 0 else range(len(cross_fold.folds))
    for fold in folds_to_run:
        train_files = cross_fold.train_files(fold)
        test_files = cross_fold.test_files(fold)
        path = os.path.join(temporary_dir, "fold_{}.json".format(fold))
        with open(path, 'w') as f:
            # Deep-copy the shared trainer params so per-fold edits don't leak.
            trainer_params = deepcopy(self.params.trainer)
            trainer_params.gen = CalamariDefaultTrainerPipelineParams(
                train=trainer_params.gen.train,
                val=deepcopy(trainer_params.gen.train),
                setup=trainer_params.gen.setup,
            )
            if cross_fold.is_h5_dataset:
                # HDF5 generators are rebuilt from dicts with the fold's files;
                # '__cls__' must be stripped before re-instantiating via from_dict.
                tp = trainer_params.gen.train.to_dict()
                del tp['__cls__']
                tp["files"] = train_files
                trainer_params.gen.train = Hdf5.from_dict(tp)
                vp = trainer_params.gen.val.to_dict()
                del vp['__cls__']
                vp['files'] = test_files
                trainer_params.gen.val = Hdf5.from_dict(vp)
            else:
                trainer_params.gen.train.images = train_files
                trainer_params.gen.val.images = test_files
                trainer_params.gen.val.gt_extension = trainer_params.gen.train.gt_extension

            trainer_params.scenario.id = fold
            trainer_params.progress_bar_mode = 2
            trainer_params.output_dir = os.path.join(temporary_dir, "fold_{}".format(fold))
            trainer_params.early_stopping.best_model_output_dir = self.params.best_models_dir
            trainer_params.early_stopping.best_model_name = ''
            best_model_prefix = self.params.best_model_label.format(id=fold)
            trainer_params.best_model_prefix = best_model_prefix

            if self.params.visible_gpus:
                # Round-robin the visible GPUs over the folds.
                assert trainer_params.device.gpus is None, "Using visible_gpus with device.gpus is not supported"
                trainer_params.device.gpus = [
                    self.params.visible_gpus[fold % len(self.params.visible_gpus)]
                ]

            # Deterministic but distinct seed per fold.
            if seed >= 0:
                trainer_params.random_seed = seed + fold

            # One shared pretrained model or one per fold.
            if len(self.params.weights) == 1:
                trainer_params.warmstart.model = self.params.weights[0]
            elif len(self.params.weights) > 1:
                trainer_params.warmstart.model = self.params.weights[fold]

            # start from scratch via None
            if trainer_params.warmstart.model:
                if len(trainer_params.warmstart.model.strip()) == 0 or trainer_params.warmstart.model.upper() == "NONE":
                    trainer_params.warmstart.model = None
                else:
                    # access model once to upgrade the model if necessary
                    # (can not be performed in parallel if multiple folds use the same model)
                    SavedCalamariModel(trainer_params.warmstart.model)

            post_init(trainer_params)
            json.dump(
                trainer_params.to_dict(),
                f,
                indent=4,
            )

        run_args.append({
            "json": path,
            "args": trainer_params,
            "id": fold,
            'train_script': self.train_script_path,
            'run': self.params.run,
            'verbose': True
        })

    # Launch the individual processes for each training
    with multiprocessing.pool.ThreadPool(processes=self.params.max_parallel_models) as pool:
        # workaround to forward keyboard interrupt
        pool.map_async(train_individual_model, run_args).get()

    if not self.params.keep_temporary_files:
        import shutil
        shutil.rmtree(temporary_dir)
def main():
    """CLI entry point: evaluate prediction files against ground-truth files.

    Resolves gt/pred file pairs, runs the Evaluator, prints the normalized
    label error rate plus the most common confusions and worst lines, and
    optionally writes an xlsx report.
    """
    # Local imports (imports that require tensorflow)
    from calamari_ocr.ocr.scenario import Scenario
    from calamari_ocr.ocr.dataset.data import Data
    from calamari_ocr.ocr.evaluator import Evaluator

    parser = ArgumentParser()
    parser.add_argument("--dataset", type=DataSetType.from_string, choices=list(DataSetType), default=DataSetType.FILE)
    parser.add_argument("--gt", nargs="+", required=True,
                        help="Ground truth files (.gt.txt extension). "
                             "Optionally, you can pass a single json file defining all parameters.")
    parser.add_argument("--pred", nargs="+", default=None,
                        help="Prediction files if provided. Else files with .pred.txt are expected at the same "
                             "location as the gt.")
    parser.add_argument("--pred_dataset", type=DataSetType.from_string, choices=list(DataSetType), default=DataSetType.FILE)
    parser.add_argument("--pred_ext", type=str, default=".pred.txt",
                        help="Extension of the predicted text files")
    parser.add_argument("--n_confusions", type=int, default=10,
                        help="Only print n most common confusions. Defaults to 10, use -1 for all.")
    parser.add_argument("--n_worst_lines", type=int, default=0,
                        help="Print the n worst recognized text lines with its error")
    parser.add_argument("--xlsx_output", type=str,
                        help="Optionally write a xlsx file with the evaluation results")
    parser.add_argument("--num_threads", type=int, default=1,
                        help="Number of threads to use for evaluation")
    parser.add_argument("--non_existing_file_handling_mode", type=str, default="error",
                        help="How to handle non existing .pred.txt files. Possible modes: skip, empty, error. "
                             "'Skip' will simply skip the evaluation of that file (not counting it to errors). "
                             "'Empty' will handle this file as would it be empty (fully checking for errors)."
                             "'Error' will throw an exception if a file is not existing. This is the default behaviour.")
    parser.add_argument("--skip_empty_gt", action="store_true", default=False,
                        help="Ignore lines of the gt that are empty.")
    parser.add_argument("--no_progress_bars", action="store_true",
                        help="Do not show any progress bars")
    parser.add_argument("--checkpoint", type=str, default=None,
                        help="Specify an optional checkpoint to parse the text preprocessor (for the gt txt files)")

    # page xml specific args
    parser.add_argument("--pagexml_gt_text_index", default=0)
    parser.add_argument("--pagexml_pred_text_index", default=1)

    args = parser.parse_args()

    # check if loading a json file
    if len(args.gt) == 1 and args.gt[0].endswith("json"):
        # A single json file overrides/defines the CLI arguments.
        with open(args.gt[0], 'r') as f:
            json_args = json.load(f)
            for key, value in json_args.items():
                setattr(args, key, value)

    logger.info("Resolving files")
    gt_files = sorted(glob_all(args.gt))

    if args.pred:
        pred_files = sorted(glob_all(args.pred))
    else:
        # Derive prediction file names from the gt file names.
        pred_files = [split_all_ext(gt)[0] + args.pred_ext for gt in gt_files]
        args.pred_dataset = args.dataset

    if args.non_existing_file_handling_mode.lower() == "skip":
        # Drop gt/pred pairs whose prediction file is missing.
        non_existing_pred = [p for p in pred_files if not os.path.exists(p)]
        for f in non_existing_pred:
            idx = pred_files.index(f)
            del pred_files[idx]
            del gt_files[idx]

    data_params = Data.get_default_params()
    if args.checkpoint:
        # Reuse the data params stored in the checkpoint so the gt text is
        # processed the same way as during training.
        saved_model = SavedCalamariModel(args.checkpoint, auto_update=True)
        trainer_params = Scenario.trainer_params_from_dict(saved_model.dict)
        data_params = trainer_params.scenario_params.data_params

    data = Data(data_params)
    gt_reader_args = FileDataReaderArgs(
        text_index=args.pagexml_gt_text_index
    )
    pred_reader_args = FileDataReaderArgs(
        text_index=args.pagexml_pred_text_index
    )
    gt_data_set = PipelineParams(
        type=args.dataset,
        text_files=gt_files,
        data_reader_args=gt_reader_args,
        skip_invalid=args.skip_empty_gt,
    )
    pred_data_set = PipelineParams(
        type=args.pred_dataset,
        text_files=pred_files,
        data_reader_args=pred_reader_args,
    )

    evaluator = Evaluator(data=data)
    evaluator.preload_gt(gt_dataset=gt_data_set)
    r = evaluator.run(gt_dataset=gt_data_set, pred_dataset=pred_data_set, processes=args.num_threads,
                      progress_bar=not args.no_progress_bars)

    # TODO: More output
    print("Evaluation result")
    print("=================")
    print("")
    print("Got mean normalized label error rate of {:.2%} ({} errs, {} total chars, {} sync errs)".format(
        r["avg_ler"], r["total_char_errs"], r["total_chars"], r["total_sync_errs"]))

    # sort descending
    print_confusions(r, args.n_confusions)
    print_worst_lines(r, data.create_pipeline(PipelineMode.Targets, gt_data_set).reader().samples(), args.n_worst_lines)

    if args.xlsx_output:
        write_xlsx(args.xlsx_output, [{
            "prefix": "evaluation",
            "results": r,
            "gt_files": gt_files,
        }])