import argparse
import os
import shutil

from tqdm import tqdm

from calamari_ocr.ocr import CrossFold
from calamari_ocr.utils import split_all_ext


def main():
    parser = argparse.ArgumentParser(
        description="Write split of folds to separate directories")
    parser.add_argument(
        "--files", nargs="+",
        help="List all image files that shall be processed. Ground truth files with the same "
             "base name but with '.gt.txt' as extension are required at the same location")
    parser.add_argument(
        "--n_folds", type=int, required=True,
        help="The number of folds, i.e. the number of models to train")
    parser.add_argument("--output_dir", type=str, required=True,
                        help="Where to write the folds")
    args = parser.parse_args()

    print("Creating folds")
    cross_fold = CrossFold(n_folds=args.n_folds, source_files=args.files,
                           output_dir=args.output_dir)

    print("Copying files")
    for fold_id, fold_files in enumerate(cross_fold.folds):
        fold_out_dir = os.path.join(args.output_dir, str(fold_id))
        if not os.path.exists(fold_out_dir):
            os.makedirs(fold_out_dir)

        for file_id, file in tqdm(enumerate(fold_files), total=len(fold_files),
                                  desc="Fold {}".format(fold_id)):
            img_file = file
            base, ext = split_all_ext(file)
            txt_file = base + ".gt.txt"
            if os.path.exists(img_file) and os.path.exists(txt_file):
                output_file = os.path.join(fold_out_dir, "{:08d}{}".format(file_id, ext))
                shutil.copyfile(img_file, output_file)

                output_file = os.path.join(fold_out_dir, "{:08d}{}".format(file_id, ".gt.txt"))
                shutil.copyfile(txt_file, output_file)
            else:
                print("Warning: {} or {} does not exist".format(img_file, txt_file))
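# For reference, a minimal sketch of how a file list can be partitioned into
# n folds (an illustration of the idea only, not CrossFold's actual
# implementation, which may shuffle or balance the input differently):
def split_into_folds(files, n_folds):
    # round-robin assignment: file i goes to fold i % n_folds, so the folds
    # differ in size by at most one element
    folds = [[] for _ in range(n_folds)]
    for i, f in enumerate(files):
        folds[i % n_folds].append(f)
    return folds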
def main():
    parser = argparse.ArgumentParser(
        description="Write split of folds to separate directories")
    parser.add_argument(
        "--files", nargs="+",
        help="List all image files that shall be processed. Ground truth files with the same "
             "base name but with '.gt.txt' as extension are required at the same location")
    parser.add_argument(
        "--n_folds", type=int, required=True,
        help="The number of folds, i.e. the number of models to train")
    parser.add_argument("--output_dir", type=str, required=True,
                        help="Where to write the folds")
    parser.add_argument(
        "--keep_original_filename", action="store_true",
        help="By default the copied files get a new 8-digit name. Use this flag to keep the "
             "original name, but be aware that files with the same name will overwrite each other")
    args = parser.parse_args()

    logger.info("Creating folds")
    images = glob_all(args.files)
    texts = [split_all_ext(p)[0] + ".gt.txt" for p in images]
    data_reader = FileDataReader(PipelineMode.Training, images=images, texts=texts,
                                 skip_invalid=True)
    cross_fold = CrossFold(n_folds=args.n_folds, data_reader=data_reader,
                           output_dir=args.output_dir)

    logger.info("Copying files")
    for fold_id, fold_files in enumerate(cross_fold.folds):
        fold_out_dir = os.path.join(args.output_dir, str(fold_id))
        if not os.path.exists(fold_out_dir):
            os.makedirs(fold_out_dir)

        for file_id, file in tqdm(enumerate(fold_files), total=len(fold_files),
                                  desc="Fold {}".format(fold_id)):
            img_file = file
            base, ext = split_all_ext(file)
            txt_file = base + ".gt.txt"
            output_basename = os.path.basename(base) if args.keep_original_filename \
                else "{:08d}".format(file_id)
            if os.path.exists(img_file) and os.path.exists(txt_file):
                output_file = os.path.join(fold_out_dir, "{}{}".format(output_basename, ext))
                shutil.copyfile(img_file, output_file)

                output_file = os.path.join(fold_out_dir, "{}{}".format(output_basename, ".gt.txt"))
                shutil.copyfile(txt_file, output_file)
            else:
                logger.warning("{} or {} does not exist".format(img_file, txt_file))
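# Example invocation of the script above (hypothetical script name and paths):
#
#   python cross_fold_split.py --files lines/*.png --n_folds 5 --output_dir folds/
#
# This produces folds/0 ... folds/4, each holding image/ground-truth pairs
# renamed to an 8-digit counter (00000000.png, 00000000.gt.txt, ...) unless
# --keep_original_filename is given.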
import argparse
import json

from calamari_ocr.ocr import CrossFold

if __name__ == "__main__":
    # Standalone script to run the cross fold splitting in a separate process.
    # This script is called from cross_fold_trainer.py
    parser = argparse.ArgumentParser()
    parser.add_argument("config")
    parser.add_argument("--dir", required=True)
    parser.add_argument("--progress_bar", action="store_true")
    args = parser.parse_args()

    with open(args.config) as f:
        cfg = json.load(f)

    cross_fold = CrossFold.from_dict(cfg)
    cross_fold.create_folds(progress_bar=args.progress_bar)
    cross_fold.write_folds_to_json(args.dir)

    # persist the updated fold assignment back into the config file so the
    # calling process can read it
    with open(args.config, "w") as f:
        json.dump(cross_fold.to_dict(), f)
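# A sketch of how the trainer side could launch the standalone script above in
# a separate process (hypothetical file names and helper; the actual call
# lives in cross_fold_trainer.py):
import json
import subprocess
import sys

def run_split_in_subprocess(cross_fold_dict, config_path, fold_dir):
    with open(config_path, "w") as f:
        json.dump(cross_fold_dict, f)
    # run the splitting in its own process; the script writes the updated
    # CrossFold state back into the config file
    subprocess.check_call([sys.executable, "cross_fold.py", config_path,
                           "--dir", fold_dir, "--progress_bar"])
    with open(config_path) as f:
        return json.load(f)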
def run(self, single_fold=None, seed=-1, weights=None, max_parallel_models=-1,
        temporary_dir=None, keep_temporary_files=False):
    # Default params
    single_fold = single_fold if single_fold else []
    weights = weights if weights else []
    if max_parallel_models <= 0:
        max_parallel_models = self.n_folds

    # Argument checks
    if len(weights) > 1 and len(weights) != self.n_folds:
        raise Exception("Either no, one or n_folds (={}) models are required for pretraining "
                        "but got {}.".format(self.n_folds, len(weights)))

    if len(single_fold) > 0:
        if len(set(single_fold)) != len(single_fold):
            raise Exception("Repeated fold ids found.")
        for fold_id in single_fold:
            if fold_id < 0 or fold_id >= self.n_folds:
                raise Exception("Invalid fold id found: 0 <= id < {}, but id == {}".format(
                    self.n_folds, fold_id))

    # Create the temporary dir.
    # By default, the temporary files will be deleted after a successful training;
    # if you specify a temporary dir, you can easily resume training if an error occurred.
    if keep_temporary_files and not temporary_dir:
        raise Exception("If you want to keep the temporary model files you have to specify a temporary dir")

    if temporary_dir is None:
        temporary_dir = tempfile.mkdtemp(prefix="calamari")
    else:
        temporary_dir = os.path.abspath(temporary_dir)

    if not os.path.exists(temporary_dir):
        os.makedirs(temporary_dir)

    # Compute the files in the cross fold (create a CrossFold)
    fold_file = os.path.join(temporary_dir, "folds.json")
    cross_fold = CrossFold(n_folds=self.n_folds, data_reader=self.data_reader,
                           output_dir=temporary_dir, progress_bar=self.progress_bars)
    cross_fold.write_folds_to_json(fold_file)

    # Create the json argument file for each individual training
    run_args = []
    folds_to_run = single_fold if len(single_fold) > 0 else range(len(cross_fold.folds))

    for fold in folds_to_run:
        train_files = cross_fold.train_files(fold)
        test_files = cross_fold.test_files(fold)
        path = os.path.join(temporary_dir, "fold_{}.json".format(fold))

        with open(path, 'w') as f:
            fold_args = self.train_args.copy()
            fold_args["dataset"] = cross_fold.dataset_type.name
            fold_args["validation_dataset"] = cross_fold.dataset_type.name
            fold_args["validation_extension"] = self.train_args['gt_extension']
            fold_args["id"] = fold
            fold_args["files"] = train_files
            fold_args["validation"] = test_files
            fold_args["train_script"] = self.train_script_path
            fold_args["verbose"] = True
            fold_args["output_dir"] = os.path.join(temporary_dir, "fold_{}".format(fold))
            fold_args["early_stopping_best_model_output_dir"] = self.best_models_dir
            fold_args["early_stopping_best_model_prefix"] = self.best_model_label.format(id=fold)
            fold_args['train_verbose'] = 2

            if seed >= 0:
                fold_args["seed"] = seed + fold

            if len(weights) == 1:
                fold_args["weights"] = weights[0]
            elif len(weights) > 1:
                fold_args["weights"] = weights[fold]
            else:
                fold_args["weights"] = None  # start from scratch via None

            if fold_args["weights"]:
                if len(fold_args["weights"].strip()) == 0 or fold_args["weights"].upper() == "NONE":
                    fold_args["weights"] = None
                else:
                    # access the model once to upgrade it if necessary (cannot be performed in parallel)
                    SavedCalamariModel(fold_args["weights"])

            # a HDF5 dataset is already preloaded and does not require an extension anymore
            if cross_fold.dataset_type == DataSetType.HDF5:
                del fold_args["validation_extension"]
                del fold_args["gt_extension"]

            json.dump(fold_args, f, indent=4)

        run_args.append({"json": path, "args": fold_args})

    # Launch the individual processes for each training
    with multiprocessing.pool.ThreadPool(processes=max_parallel_models) as pool:
        # workaround to forward keyboard interrupt
        pool.map_async(train_individual_model, run_args).get()

    if not keep_temporary_files:
        import shutil
        shutil.rmtree(temporary_dir)
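# train_individual_model is referenced above but defined elsewhere. A minimal
# sketch of what such a worker might look like (hypothetical, not the actual
# implementation): it blocks on one training subprocess per fold, so the
# ThreadPool merely supervises external processes.
import subprocess
import sys

def train_individual_model_sketch(run_args):
    # run_args is one of the dicts assembled above: the fold's parameters are
    # serialized in run_args["json"], the training entry point in
    # run_args["args"]["train_script"]
    cmd = [sys.executable, run_args["args"]["train_script"], run_args["json"]]
    return subprocess.run(cmd, check=True).returncode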
def run(self):
    # temporary dir
    temporary_dir = self.params.temporary_dir
    if temporary_dir is None:
        temporary_dir = tempfile.mkdtemp(prefix="calamari")
    else:
        temporary_dir = os.path.abspath(temporary_dir)

    if not os.path.exists(temporary_dir):
        os.makedirs(temporary_dir)

    # Compute the files in the cross fold (create a CrossFold)
    fold_file = os.path.join(temporary_dir, "folds.json")
    cross_fold = CrossFold(n_folds=self.params.n_folds,
                           data_generator_params=self.params.trainer.gen.train,
                           output_dir=temporary_dir,
                           progress_bar=self.params.trainer.progress_bar)
    cross_fold.write_folds_to_json(fold_file)

    # Create the json argument file for each individual training
    run_args = []
    seed = self.params.trainer.random_seed or -1
    folds_to_run = self.params.single_fold if len(self.params.single_fold) > 0 \
        else range(len(cross_fold.folds))

    for fold in folds_to_run:
        train_files = cross_fold.train_files(fold)
        test_files = cross_fold.test_files(fold)
        path = os.path.join(temporary_dir, "fold_{}.json".format(fold))

        with open(path, 'w') as f:
            trainer_params = deepcopy(self.params.trainer)
            trainer_params.gen = CalamariDefaultTrainerPipelineParams(
                train=trainer_params.gen.train,
                val=deepcopy(trainer_params.gen.train),
                setup=trainer_params.gen.setup,
            )
            if cross_fold.is_h5_dataset:
                tp = trainer_params.gen.train.to_dict()
                del tp['__cls__']
                tp['files'] = train_files
                trainer_params.gen.train = Hdf5.from_dict(tp)

                vp = trainer_params.gen.val.to_dict()
                del vp['__cls__']
                vp['files'] = test_files
                trainer_params.gen.val = Hdf5.from_dict(vp)
            else:
                trainer_params.gen.train.images = train_files
                trainer_params.gen.val.images = test_files
                trainer_params.gen.val.gt_extension = trainer_params.gen.train.gt_extension

            trainer_params.scenario.id = fold
            trainer_params.progress_bar_mode = 2
            trainer_params.output_dir = os.path.join(temporary_dir, "fold_{}".format(fold))
            trainer_params.early_stopping.best_model_output_dir = self.params.best_models_dir
            trainer_params.early_stopping.best_model_name = ''
            best_model_prefix = self.params.best_model_label.format(id=fold)
            trainer_params.best_model_prefix = best_model_prefix

            if self.params.visible_gpus:
                assert trainer_params.device.gpus is None, \
                    "Using visible_gpus with device.gpus is not supported"
                trainer_params.device.gpus = [
                    self.params.visible_gpus[fold % len(self.params.visible_gpus)]]

            if seed >= 0:
                trainer_params.random_seed = seed + fold

            if len(self.params.weights) == 1:
                trainer_params.warmstart.model = self.params.weights[0]
            elif len(self.params.weights) > 1:
                trainer_params.warmstart.model = self.params.weights[fold]

            # start from scratch via None
            if trainer_params.warmstart.model:
                if len(trainer_params.warmstart.model.strip()) == 0 \
                        or trainer_params.warmstart.model.upper() == "NONE":
                    trainer_params.warmstart.model = None
                else:
                    # access the model once to upgrade it if necessary
                    # (cannot be performed in parallel if multiple folds use the same model)
                    SavedCalamariModel(trainer_params.warmstart.model)

            post_init(trainer_params)
            json.dump(trainer_params.to_dict(), f, indent=4)

        run_args.append({"json": path, "args": trainer_params, "id": fold,
                         'train_script': self.train_script_path,
                         'run': self.params.run, 'verbose': True})

    # Launch the individual processes for each training
    with multiprocessing.pool.ThreadPool(processes=self.params.max_parallel_models) as pool:
        # workaround to forward keyboard interrupt
        pool.map_async(train_individual_model, run_args).get()

    if not self.params.keep_temporary_files:
        import shutil
        shutil.rmtree(temporary_dir)
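# The ThreadPool pattern used by both run() variants, shown in isolation:
# threads suffice because each worker only blocks on an external training
# subprocess, and map_async(...).get() (rather than plain map) is the
# workaround that lets a KeyboardInterrupt reach the main thread so Ctrl+C
# cancels all folds.
import multiprocessing.pool

def _demo_worker(i):
    return i * i  # stand-in for train_individual_model

if __name__ == "__main__":
    with multiprocessing.pool.ThreadPool(processes=4) as pool:
        results = pool.map_async(_demo_worker, range(8)).get()
    print(results)  # [0, 1, 4, 9, 16, 25, 36, 49]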