def train_books(self, books, output_model_prefix, weights=None, train_to_val=1, max_iters=100000, display=500, checkpoint_frequency=-1, preload=False):
    """Train a model on one or more books stored in this object's HDF5 cache.

    Args:
        books: a single book name or a list of book names (keys in the cache file).
        output_model_prefix: prefix for checkpoint files; the early-stopping best
            model is written with a "best_" prefix in front of this.
        weights: optional model weights to initialize training from.
        train_to_val: fraction of samples kept for training; if strictly between
            0 and 1, the remaining (1 - train_to_val) fraction is split off as a
            validation set, otherwise no validation set is used.
        max_iters: maximum number of training iterations.
        display: display/logging interval forwarded to the trainer params.
        checkpoint_frequency: checkpoint interval forwarded to the trainer params.
        preload: whether to preload the training dataset.
    """
    # Accept a single book name as convenience for [book].
    if isinstance(books, str):
        books = [books]

    dset = Nash5DataSet(DataSetMode.TRAIN, self.cachefile, books)

    if 0 < train_to_val < 1:
        # Randomly split off the validation portion and remove those samples
        # from the training set.
        # NOTE(review): this reaches into the private `_samples` attribute of
        # Nash5DataSet — assumes it is a plain list of sample records; verify
        # against the Nash5DataSet implementation.
        valsamples = random.sample(dset._samples, int((1-train_to_val)*len(dset)))
        for s in valsamples:
            dset._samples.remove(s)
        # Build an empty validation dataset and inject the withheld samples.
        vdset = Nash5DataSet(DataSetMode.TRAIN, self.cachefile, [])
        vdset._samples = valsamples
    else:
        vdset = None

    # Reuse the standard training argument definitions to obtain defaults;
    # parse_known_args ignores any unrelated command-line arguments.
    parser = argparse.ArgumentParser()
    setup_train_args(parser, omit=["files", "validation"])
    args = parser.parse_known_args()[0]

    # If every requested book is marked right-to-left in the cache metadata,
    # train with right-to-left text direction.
    with h5py.File(self.cachefile, 'r', libver='latest', swmr=True) as cache:
        if all(cache[b].attrs.get("dir") == "rtl" for b in books):
            args.bidi_dir = "rtl"

    params = params_from_args(args)
    params.output_model_prefix = output_model_prefix
    params.early_stopping_best_model_prefix = "best_" + output_model_prefix
    params.max_iters = max_iters
    params.display = display
    params.checkpoint_frequency = checkpoint_frequency

    # Noop pre-processors: samples in the cache are expected to be already
    # preprocessed, so neither text nor data is transformed again.
    trainer = Trainer(params, dset,
                      txt_preproc=NoopTextProcessor(),
                      data_preproc=NoopDataPreprocessor(),
                      validation_dataset=vdset,
                      weights=weights,
                      preload_training=preload,
                      preload_validation=True)
    trainer.train(progress_bar=True)
def main(args=None):
    """Run a cross-fold training: split the input files into n folds and train
    one model per fold in separate processes.

    Args:
        args: pre-parsed argument namespace; if None, arguments are parsed
            from the command line.

    Raises:
        Exception: on inconsistent --weights / --single_fold arguments, on
            --keep_temporary_files without --temporary_dir, or if the
            train.py script cannot be located next to this file.
    """
    if args is None:
        # parse args from command line
        parser = argparse.ArgumentParser()
        # fold parameters
        parser.add_argument("--files", nargs="+",
                            help="List all image files that shall be processed. Ground truth fils with the same "
                                 "base name but with '.gt.txt' as extension are required at the same location")
        parser.add_argument("--n_folds", type=int, default=5,
                            help="The number of fold, that is the number of models to train")
        parser.add_argument("--keep_temporary_files", action="store_true",
                            help="By default all temporary files (e.g. intermediate checkpoints) will be erased. Set this "
                                 "flag if you want to keep those files.")
        parser.add_argument("--best_models_dir", type=str, required=True,
                            help="path where to store the best models of each fold")
        parser.add_argument("--best_model_label", type=str, default="{id}",
                            help="The label of the best model in best model dirs. This will be string formatted. "
                                 "The default '{id}' will label the models 0, 1, 2, 3, ...")
        parser.add_argument("--temporary_dir", type=str, default=None,
                            help="A path to a temporary dir, where the intermediate model training data will be stored"
                                 "for each fold. Use --keep_temporary_files flag to keep the files. By default a system"
                                 "temporary dir will be used")
        parser.add_argument("--run", type=str, default=None,
                            help="An optional command that will receive the train calls. Useful e.g. when using a resource "
                                 "manager such as slurm.")
        parser.add_argument("--max_parallel_models", type=int, default=-1,
                            help="Number of models to train in parallel. Defaults to all.")
        parser.add_argument("--weights", type=str, nargs="+", default=[],
                            help="Load network weights from the given file. If more than one file is provided the number "
                                 "models must match the number of folds. Each fold is then initialized with the weights "
                                 "of each model, respectively. If a model path is set to 'None', this model will start "
                                 "from scratch")
        parser.add_argument("--single_fold", type=int, nargs="+", default=[],
                            help="Only train a single (list of single) specific fold(s).")

        # add the training args (omit those params, that are set by the cross fold training)
        setup_train_args(parser, omit=["files", "validation", "weights",
                                       "early_stopping_best_model_output_dir",
                                       "early_stopping_best_model_prefix"])

        args = parser.parse_args()

    # argument checks
    # Pretraining weights: either none, one shared model, or one model per fold.
    if len(args.weights) > 1 and len(args.weights) != args.n_folds:
        raise Exception("Either no, one or n_folds (={}) models are required for pretraining but got {}.".format(
            args.n_folds, len(args.weights)
        ))

    if len(args.single_fold) > 0:
        if len(set(args.single_fold)) != len(args.single_fold):
            raise Exception("Repeated fold id's found.")
        for fold_id in args.single_fold:
            if fold_id < 0 or fold_id >= args.n_folds:
                raise Exception("Invalid fold id found: 0 <= id <= {}, but id == {}".format(args.n_folds, fold_id))

    # automatically set the number of models that shall be run in parallel
    if args.max_parallel_models <= 0:
        args.max_parallel_models = args.n_folds

    # by default, the temporary files will be deleted after a successful training
    # if you specify a temporary dir, you can easily resume to train if an error occurred
    if args.keep_temporary_files and not args.temporary_dir:
        raise Exception("If you want to keep the temporary model files you have to specify a temporary dir")

    if not args.temporary_dir:
        args.temporary_dir = tempfile.mkdtemp(prefix="calamari")
    else:
        args.temporary_dir = os.path.abspath(args.temporary_dir)
        if not os.path.exists(args.temporary_dir):
            os.makedirs(args.temporary_dir)

    # location of best models output
    if not os.path.exists(args.best_models_dir):
        os.makedirs(args.best_models_dir)

    # locate the training script (must be in the same dir as "this")
    train_script_path = os.path.join(this_absdir, "train.py")
    if not os.path.exists(train_script_path):
        raise Exception("Missing train script path. Expected 'train.py' at {}".format(this_absdir))

    # Compute the files in the cross fold (create a CrossFold)
    fold_file = os.path.join(args.temporary_dir, "folds.json")
    cross_fold = CrossFold(n_folds=args.n_folds, source_files=args.files, output_dir=args.best_models_dir)
    cross_fold.write_folds_to_json(fold_file)

    # Create the json argument file for each individual training
    run_args = []
    folds_to_run = args.single_fold if len(args.single_fold) > 0 else range(len(cross_fold.folds))
    for fold in folds_to_run:
        train_files = cross_fold.train_files(fold)
        test_files = cross_fold.test_files(fold)
        path = os.path.join(args.temporary_dir, "fold_{}.json".format(fold))
        with open(path, 'w') as f:
            # Start from the shared arguments and override the per-fold values.
            fold_args = vars(args).copy()
            fold_args["id"] = fold
            fold_args["files"] = train_files
            fold_args["validation"] = test_files
            fold_args["train_script"] = train_script_path
            fold_args["verbose"] = True
            fold_args["output_dir"] = os.path.join(args.temporary_dir, "fold_{}".format(fold))
            fold_args["early_stopping_best_model_output_dir"] = args.best_models_dir
            fold_args["early_stopping_best_model_prefix"] = args.best_model_label.format(id=fold)

            # Derive a distinct but deterministic seed for each fold.
            if args.seed >= 0:
                fold_args["seed"] = args.seed + fold

            # One weight file -> shared by all folds; several -> one per fold.
            if len(args.weights) == 1:
                fold_args["weights"] = args.weights[0]
            elif len(args.weights) > 1:
                fold_args["weights"] = args.weights[fold]
            else:
                fold_args["weights"] = None

            # start from scratch via None: treat blank or "NONE" entries as no weights
            if fold_args["weights"]:
                if len(fold_args["weights"].strip()) == 0 or fold_args["weights"].upper() == "NONE":
                    fold_args["weights"] = None

            json.dump(
                fold_args,
                f,
                indent=4,
            )

        run_args.append({"json": path, "args": fold_args})

    # Launch the individual processes for each training
    with multiprocessing.Pool(processes=args.max_parallel_models) as pool:
        # workaround to forward keyboard interrupt
        pool.map_async(train_individual_model, run_args).get(999999999)
def main(args=None):
    """Run a cross-fold training via the CrossFoldTrainer API.

    Builds a data reader from the command-line (or JSON-file) arguments and
    delegates the actual per-fold training to CrossFoldTrainer.run.

    Args:
        args: pre-parsed argument namespace; if None, arguments are parsed
            from the command line. If --files names a single *.json file,
            that file's contents override the parsed arguments.
    """
    if args is None:
        # parse args from command line
        parser = argparse.ArgumentParser()
        # fold parameters
        parser.add_argument("--files", nargs="+",
                            help="List all image files that shall be processed. Ground truth fils with the same "
                                 "base name but with '.gt.txt' as extension are required at the same location. "
                                 "Optionally you can pass a single json file defining all arguments")
        parser.add_argument("--dataset", type=DataSetType.from_string, choices=list(DataSetType), default=DataSetType.FILE)
        parser.add_argument("--text_files", nargs="+", default=None,
                            help="Optional list of GT files if they are in other directory")
        parser.add_argument("--gt_extension", default=None,
                            help="Default extension of the gt files (expected to exist in same dir)")
        parser.add_argument("--n_folds", type=int, default=5,
                            help="The number of fold, that is the number of models to train")
        parser.add_argument("--keep_temporary_files", action="store_true",
                            help="By default all temporary files (e.g. intermediate checkpoints) will be erased. Set this "
                                 "flag if you want to keep those files.")
        parser.add_argument("--best_models_dir", type=str, required=True,
                            help="path where to store the best models of each fold")
        parser.add_argument("--best_model_label", type=str, default="{id}",
                            help="The label of the best model in best model dirs. This will be string formatted. "
                                 "The default '{id}' will label the models 0, 1, 2, 3, ...")
        parser.add_argument("--temporary_dir", type=str, default=None,
                            help="A path to a temporary dir, where the intermediate model training data will be stored"
                                 "for each fold. Use --keep_temporary_files flag to keep the files. By default a system"
                                 "temporary dir will be used")
        parser.add_argument("--run", type=str, default=None,
                            help="An optional command that will receive the train calls. Useful e.g. when using a resource "
                                 "manager such as slurm.")
        parser.add_argument("--max_parallel_models", type=int, default=-1,
                            help="Number of models to train in parallel. Defaults to all.")
        parser.add_argument("--weights", type=str, nargs="+", default=[],
                            help="Load network weights from the given file. If more than one file is provided the number "
                                 "models must match the number of folds. Each fold is then initialized with the weights "
                                 "of each model, respectively. If a model path is set to 'None', this model will start "
                                 "from scratch")
        parser.add_argument("--single_fold", type=int, nargs="+", default=[],
                            help="Only train a single (list of single) specific fold(s).")

        # add the training args (omit those params, that are set by the cross fold training)
        setup_train_args(parser, omit=["files", "validation", "weights",
                                       "early_stopping_best_model_output_dir",
                                       "early_stopping_best_model_prefix", "output_dir"])

        args = parser.parse_args()

    # check if loading a json file
    if len(args.files) == 1 and args.files[0].endswith("json"):
        with open(args.files[0], 'r') as f:
            json_args = json.load(f)
            for key, value in json_args.items():
                # Dataset types are serialized as strings and must be converted back.
                if key == 'dataset' or key == 'validation_dataset':
                    setattr(args, key, DataSetType.from_string(value))
                else:
                    setattr(args, key, value)

    dataset_args = FileDataReaderArgs(
        line_generator_params=args.line_generator_params,
        text_generator_params=args.text_generator_params,
        pad=args.dataset_pad,
        text_index=args.pagexml_text_index,
    )
    train_params = PipelineParams(
        type=args.dataset,
        skip_invalid=not args.no_skip_invalid_gt,
        remove_invalid=True,
        files=args.files,
        text_files=args.text_files,
        # Fall back to the dataset type's default ground-truth extension.
        gt_extension=args.gt_extension if args.gt_extension else DataSetType.gt_extension(args.dataset),
        data_reader_args=dataset_args,
        batch_size=args.batch_size,
        num_processes=args.num_threads,
    )
    reader = data_reader_from_params(PipelineMode.Training, train_params)
    trainer = CrossFoldTrainer(
        n_folds=args.n_folds,
        data_reader=reader,
        best_models_dir=args.best_models_dir,
        best_model_label=args.best_model_label,
        train_args=vars(args),
        progress_bars=not args.no_progress_bars,
    )
    trainer.run(
        args.single_fold,
        seed=args.seed,
        weights=args.weights,
        max_parallel_models=args.max_parallel_models,
        temporary_dir=args.temporary_dir,
        keep_temporary_files=args.keep_temporary_files,
    )
def main():
    """Run the cross-fold experiment for several training-set sizes and report
    the per-fold and voted label error rates as CSV (plus optional confusion
    tables and an xlsx report).

    Raises:
        Exception: on inconsistent --weights / --single_fold arguments.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--base_dir", type=str, required=True,
                        help="The base directory where to store all working files")
    parser.add_argument("--eval_files", type=str, nargs="+", required=True,
                        help="All files that shall be used for evaluation")
    parser.add_argument("--train_files", type=str, nargs="+", required=True,
                        help="All files that shall be used for (cross-fold) training")
    parser.add_argument("--n_lines", type=int, default=[-1], nargs="+",
                        help="Optional argument to specify the number of lines (images) used for training. "
                             "On default, all available lines will be used.")
    parser.add_argument("--run", type=str, default=None,
                        help="An optional command that will receive the train calls. Useful e.g. when using a resource "
                             "manager such as slurm.")
    parser.add_argument("--n_folds", type=int, default=5,
                        help="The number of fold, that is the number of models to train")
    parser.add_argument("--max_parallel_models", type=int, default=-1,
                        help="Number of models to train in parallel per fold. Defaults to all.")
    parser.add_argument("--weights", type=str, nargs="+", default=[],
                        help="Load network weights from the given file. If more than one file is provided the number "
                             "models must match the number of folds. Each fold is then initialized with the weights "
                             "of each model, respectively")
    parser.add_argument("--single_fold", type=int, nargs="+", default=[],
                        help="Only train a single (list of single) specific fold(s).")
    parser.add_argument("--skip_train", action="store_true",
                        help="Skip the cross fold training")
    parser.add_argument("--skip_eval", action="store_true",
                        help="Skip the cross fold evaluation")
    parser.add_argument("--verbose", action="store_true",
                        help="Verbose output")
    parser.add_argument("--n_confusions", type=int, default=0,
                        help="Only print n most common confusions. Defaults to 0, use -1 for all.")
    parser.add_argument("--xlsx_output", type=str,
                        help="Optionally write a xlsx file with the evaluation results")
    setup_train_args(parser, omit=["files", "validation", "weights",
                                   "early_stopping_best_model_output_dir",
                                   "early_stopping_best_model_prefix", "output_dir"])
    args = parser.parse_args()
    args.base_dir = os.path.abspath(os.path.expanduser(args.base_dir))

    # Seed both RNGs for reproducible sub-sampling / training order.
    np.random.seed(args.seed)
    random.seed(args.seed)

    # argument checks
    args.weights = glob_all(args.weights)
    # Pretraining weights: either none, one shared model, or one per fold.
    if len(args.weights) > 1 and len(args.weights) != args.n_folds:
        raise Exception("Either no, one or n_folds (={}) models are required for pretraining but got {}.".format(
            args.n_folds, len(args.weights)
        ))

    if len(args.single_fold) > 0:
        if len(set(args.single_fold)) != len(args.single_fold):
            raise Exception("Repeated fold id's found.")
        for fold_id in args.single_fold:
            if fold_id < 0 or fold_id >= args.n_folds:
                raise Exception("Invalid fold id found: 0 <= id <= {}, but id == {}".format(args.n_folds, fold_id))

        actual_folds = args.single_fold
    else:
        actual_folds = list(range(args.n_folds))

    # run for all lines: one argument copy per requested training-set size
    single_args = [copy.copy(args) for _ in args.n_lines]
    for s_args, n_lines in zip(single_args, args.n_lines):
        s_args.n_lines = n_lines

    predictions = parallel_map(run_for_single_line, single_args, progress_bar=False,
                               processes=len(single_args), use_thread_pool=True)

    # output predictions as csv:
    header = "lines," + ",".join([str(fold) for fold in range(args.n_folds)])\
        + ",avg,std,seq. vot., def. conf. vot., fuz. conf. vot."

    print(header)

    for prediction_map, n_lines in zip(predictions, args.n_lines):
        prediction = prediction_map["full"]
        data = "{}".format(n_lines)
        folds_lers = []
        for fold in range(len(actual_folds)):
            eval = prediction[str(fold)]["eval"]
            data += ",{}".format(eval['avg_ler'])
            folds_lers.append(eval['avg_ler'])

        data += ",{},{}".format(np.mean(folds_lers), np.std(folds_lers))
        # BUGFIX: the CSV header declares three voter columns, but only two
        # voters were emitted here, producing a malformed CSV. Emit the fuzzy
        # CTC voter as well, matching the header (and the sibling script that
        # already iterates all three voters).
        # NOTE(review): assumes run_for_single_line provides the
        # 'confidence_voter_fuzzy_ctc' entry — confirm against its output.
        for voter in ['sequence_voter', 'confidence_voter_default_ctc', 'confidence_voter_fuzzy_ctc']:
            eval = prediction[voter]["eval"]
            data += ",{}".format(eval['avg_ler'])

        print(data)

    if args.n_confusions != 0:
        for prediction_map, n_lines in zip(predictions, args.n_lines):
            prediction = prediction_map["full"]
            print("")
            print("CONFUSIONS (lines = {})".format(n_lines))
            print("==========")
            print()
            for fold in range(len(actual_folds)):
                print("FOLD {}".format(fold))
                print_confusions(prediction[str(fold)]['eval'], args.n_confusions)

            # Include all three voters here too (see BUGFIX above).
            for voter in ['sequence_voter', 'confidence_voter_default_ctc', 'confidence_voter_fuzzy_ctc']:
                print("VOTER {}".format(voter))
                print_confusions(prediction[voter]['eval'], args.n_confusions)

    if args.xlsx_output:
        data_list = []
        for prediction_map, n_lines in zip(predictions, args.n_lines):
            prediction = prediction_map["full"]
            for fold in actual_folds:
                pred = prediction[str(fold)]
                data_list.append({
                    "prefix": "L{} - Fold{}".format(n_lines, fold),
                    "results": pred['eval'],
                    "gt_files": prediction_map['gt_txts'],
                    "gts": prediction_map['gt'],
                    "preds": pred['data']
                })

            for voter in ['sequence_voter', 'confidence_voter_default_ctc']:
                pred = prediction[voter]
                data_list.append({
                    "prefix": "L{} - {}".format(n_lines, voter[:3]),
                    "results": pred['eval'],
                    "gt_files": prediction_map['gt_txts'],
                    "gts": prediction_map['gt'],
                    "preds": pred['data']
                })

        write_xlsx(args.xlsx_output, data_list)
def main():
    """Run the cross-fold experiment for several training-set sizes and report
    the per-fold and voted label error rates as CSV (plus optional confusion
    tables and an xlsx report). This variant evaluates all three voters,
    including the fuzzy CTC confidence voter.

    Raises:
        Exception: on inconsistent --weights / --single_fold arguments.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--base_dir", type=str, required=True,
                        help="The base directory where to store all working files")
    parser.add_argument("--eval_files", type=str, nargs="+", required=True,
                        help="All files that shall be used for evaluation")
    parser.add_argument("--train_files", type=str, nargs="+", required=True,
                        help="All files that shall be used for (cross-fold) training")
    parser.add_argument("--n_lines", type=int, default=[-1], nargs="+",
                        help="Optional argument to specify the number of lines (images) used for training. "
                             "On default, all available lines will be used.")
    parser.add_argument("--run", type=str, default=None,
                        help="An optional command that will receive the train calls. Useful e.g. when using a resource "
                             "manager such as slurm.")
    parser.add_argument("--n_folds", type=int, default=5,
                        help="The number of fold, that is the number of models to train")
    parser.add_argument("--max_parallel_models", type=int, default=-1,
                        help="Number of models to train in parallel per fold. Defaults to all.")
    parser.add_argument("--weights", type=str, nargs="+", default=[],
                        help="Load network weights from the given file. If more than one file is provided the number "
                             "models must match the number of folds. Each fold is then initialized with the weights "
                             "of each model, respectively")
    parser.add_argument("--single_fold", type=int, nargs="+", default=[],
                        help="Only train a single (list of single) specific fold(s).")
    parser.add_argument("--skip_train", action="store_true",
                        help="Skip the cross fold training")
    parser.add_argument("--skip_eval", action="store_true",
                        help="Skip the cross fold evaluation")
    parser.add_argument("--verbose", action="store_true",
                        help="Verbose output")
    parser.add_argument("--n_confusions", type=int, default=0,
                        help="Only print n most common confusions. Defaults to 0, use -1 for all.")
    parser.add_argument("--xlsx_output", type=str,
                        help="Optionally write a xlsx file with the evaluation results")
    setup_train_args(parser, omit=["files", "validation", "weights",
                                   "early_stopping_best_model_output_dir",
                                   "early_stopping_best_model_prefix", "output_dir"])
    args = parser.parse_args()
    args.base_dir = os.path.abspath(os.path.expanduser(args.base_dir))

    # Seed both RNGs for reproducible sub-sampling / training order.
    np.random.seed(args.seed)
    random.seed(args.seed)

    # argument checks
    args.weights = glob_all(args.weights)
    # Pretraining weights: either none, one shared model, or one per fold.
    if len(args.weights) > 1 and len(args.weights) != args.n_folds:
        raise Exception("Either no, one or n_folds (={}) models are required for pretraining but got {}.".format(
            args.n_folds, len(args.weights)
        ))

    if len(args.single_fold) > 0:
        if len(set(args.single_fold)) != len(args.single_fold):
            raise Exception("Repeated fold id's found.")
        for fold_id in args.single_fold:
            if fold_id < 0 or fold_id >= args.n_folds:
                raise Exception("Invalid fold id found: 0 <= id <= {}, but id == {}".format(args.n_folds, fold_id))

        actual_folds = args.single_fold
    else:
        actual_folds = list(range(args.n_folds))

    # run for all lines: one argument copy per requested training-set size
    single_args = [copy.copy(args) for _ in args.n_lines]
    for s_args, n_lines in zip(single_args, args.n_lines):
        s_args.n_lines = n_lines

    predictions = parallel_map(run_for_single_line, single_args, progress_bar=False,
                               processes=len(single_args), use_thread_pool=True)

    # output predictions as csv:
    header = "lines," + ",".join([str(fold) for fold in range(args.n_folds)])\
        + ",avg,std,seq. vot., def. conf. vot., fuz. conf. vot."

    print(header)

    for prediction_map, n_lines in zip(predictions, args.n_lines):
        prediction = prediction_map["full"]
        data = "{}".format(n_lines)
        folds_lers = []
        # One column per trained fold ...
        for fold in range(len(actual_folds)):
            eval = prediction[str(fold)]["eval"]
            data += ",{}".format(eval['avg_ler'])
            folds_lers.append(eval['avg_ler'])

        # ... then mean/std over folds, then one column per voter.
        data += ",{},{}".format(np.mean(folds_lers), np.std(folds_lers))
        for voter in ['sequence_voter', 'confidence_voter_default_ctc', 'confidence_voter_fuzzy_ctc']:
            eval = prediction[voter]["eval"]
            data += ",{}".format(eval['avg_ler'])

        print(data)

    if args.n_confusions != 0:
        for prediction_map, n_lines in zip(predictions, args.n_lines):
            prediction = prediction_map["full"]
            print("")
            print("CONFUSIONS (lines = {})".format(n_lines))
            print("==========")
            print()
            for fold in range(len(actual_folds)):
                print("FOLD {}".format(fold))
                print_confusions(prediction[str(fold)]['eval'], args.n_confusions)

            for voter in ['sequence_voter', 'confidence_voter_default_ctc', 'confidence_voter_fuzzy_ctc']:
                print("VOTER {}".format(voter))
                print_confusions(prediction[voter]['eval'], args.n_confusions)

    if args.xlsx_output:
        data_list = []
        for prediction_map, n_lines in zip(predictions, args.n_lines):
            prediction = prediction_map["full"]
            for fold in actual_folds:
                pred = prediction[str(fold)]
                data_list.append({
                    "prefix": "L{} - Fold{}".format(n_lines, fold),
                    "results": pred['eval'],
                    "gt_files": prediction_map['gt_txts'],
                    "gts": prediction_map['gt'],
                    "preds": pred['data']
                })

            # NOTE(review): unlike the CSV and confusion sections above, this
            # list omits 'confidence_voter_fuzzy_ctc' — possibly intentional,
            # possibly an oversight; confirm whether the xlsx should include it.
            for voter in ['sequence_voter', 'confidence_voter_default_ctc']:
                pred = prediction[voter]
                data_list.append({
                    "prefix": "L{} - {}".format(n_lines, voter[:3]),
                    "results": pred['eval'],
                    "gt_files": prediction_map['gt_txts'],
                    "gts": prediction_map['gt'],
                    "preds": pred['data']
                })

        write_xlsx(args.xlsx_output, data_list)
def main(args=None):
    """Run a cross-fold training: split the input files into n folds and train
    one model per fold in separate processes. This variant additionally omits
    "output_dir" from the shared training arguments, since it is set per fold.

    Args:
        args: pre-parsed argument namespace; if None, arguments are parsed
            from the command line.

    Raises:
        Exception: on inconsistent --weights / --single_fold arguments, on
            --keep_temporary_files without --temporary_dir, or if the
            train.py script cannot be located next to this file.
    """
    if args is None:
        # parse args from command line
        parser = argparse.ArgumentParser()
        # fold parameters
        parser.add_argument("--files", nargs="+",
                            help="List all image files that shall be processed. Ground truth fils with the same "
                                 "base name but with '.gt.txt' as extension are required at the same location")
        parser.add_argument("--n_folds", type=int, default=5,
                            help="The number of fold, that is the number of models to train")
        parser.add_argument("--keep_temporary_files", action="store_true",
                            help="By default all temporary files (e.g. intermediate checkpoints) will be erased. Set this "
                                 "flag if you want to keep those files.")
        parser.add_argument("--best_models_dir", type=str, required=True,
                            help="path where to store the best models of each fold")
        parser.add_argument("--best_model_label", type=str, default="{id}",
                            help="The label of the best model in best model dirs. This will be string formatted. "
                                 "The default '{id}' will label the models 0, 1, 2, 3, ...")
        parser.add_argument("--temporary_dir", type=str, default=None,
                            help="A path to a temporary dir, where the intermediate model training data will be stored"
                                 "for each fold. Use --keep_temporary_files flag to keep the files. By default a system"
                                 "temporary dir will be used")
        parser.add_argument("--run", type=str, default=None,
                            help="An optional command that will receive the train calls. Useful e.g. when using a resource "
                                 "manager such as slurm.")
        parser.add_argument("--max_parallel_models", type=int, default=-1,
                            help="Number of models to train in parallel. Defaults to all.")
        parser.add_argument("--weights", type=str, nargs="+", default=[],
                            help="Load network weights from the given file. If more than one file is provided the number "
                                 "models must match the number of folds. Each fold is then initialized with the weights "
                                 "of each model, respectively. If a model path is set to 'None', this model will start "
                                 "from scratch")
        parser.add_argument("--single_fold", type=int, nargs="+", default=[],
                            help="Only train a single (list of single) specific fold(s).")

        # add the training args (omit those params, that are set by the cross fold training)
        setup_train_args(parser, omit=["files", "validation", "weights",
                                       "early_stopping_best_model_output_dir",
                                       "early_stopping_best_model_prefix", "output_dir"])

        args = parser.parse_args()

    # argument checks
    # Pretraining weights: either none, one shared model, or one model per fold.
    if len(args.weights) > 1 and len(args.weights) != args.n_folds:
        raise Exception("Either no, one or n_folds (={}) models are required for pretraining but got {}.".format(
            args.n_folds, len(args.weights)
        ))

    if len(args.single_fold) > 0:
        if len(set(args.single_fold)) != len(args.single_fold):
            raise Exception("Repeated fold id's found.")
        for fold_id in args.single_fold:
            if fold_id < 0 or fold_id >= args.n_folds:
                raise Exception("Invalid fold id found: 0 <= id <= {}, but id == {}".format(args.n_folds, fold_id))

    # automatically set the number of models that shall be run in parallel
    if args.max_parallel_models <= 0:
        args.max_parallel_models = args.n_folds

    # by default, the temporary files will be deleted after a successful training
    # if you specify a temporary dir, you can easily resume to train if an error occurred
    if args.keep_temporary_files and not args.temporary_dir:
        raise Exception("If you want to keep the temporary model files you have to specify a temporary dir")

    if not args.temporary_dir:
        args.temporary_dir = tempfile.mkdtemp(prefix="calamari")
    else:
        args.temporary_dir = os.path.abspath(args.temporary_dir)
        if not os.path.exists(args.temporary_dir):
            os.makedirs(args.temporary_dir)

    # location of best models output
    if not os.path.exists(args.best_models_dir):
        os.makedirs(args.best_models_dir)

    # locate the training script (must be in the same dir as "this")
    train_script_path = os.path.join(this_absdir, "train.py")
    if not os.path.exists(train_script_path):
        raise Exception("Missing train script path. Expected 'train.py' at {}".format(this_absdir))

    # Compute the files in the cross fold (create a CrossFold)
    fold_file = os.path.join(args.temporary_dir, "folds.json")
    cross_fold = CrossFold(n_folds=args.n_folds, source_files=args.files, output_dir=args.best_models_dir)
    cross_fold.write_folds_to_json(fold_file)

    # Create the json argument file for each individual training
    run_args = []
    folds_to_run = args.single_fold if len(args.single_fold) > 0 else range(len(cross_fold.folds))
    for fold in folds_to_run:
        train_files = cross_fold.train_files(fold)
        test_files = cross_fold.test_files(fold)
        path = os.path.join(args.temporary_dir, "fold_{}.json".format(fold))
        with open(path, 'w') as f:
            # Start from the shared arguments and override the per-fold values.
            fold_args = vars(args).copy()
            fold_args["id"] = fold
            fold_args["files"] = train_files
            fold_args["validation"] = test_files
            fold_args["train_script"] = train_script_path
            fold_args["verbose"] = True
            fold_args["output_dir"] = os.path.join(args.temporary_dir, "fold_{}".format(fold))
            fold_args["early_stopping_best_model_output_dir"] = args.best_models_dir
            fold_args["early_stopping_best_model_prefix"] = args.best_model_label.format(id=fold)

            # Derive a distinct but deterministic seed for each fold.
            if args.seed >= 0:
                fold_args["seed"] = args.seed + fold

            # One weight file -> shared by all folds; several -> one per fold.
            if len(args.weights) == 1:
                fold_args["weights"] = args.weights[0]
            elif len(args.weights) > 1:
                fold_args["weights"] = args.weights[fold]
            else:
                fold_args["weights"] = None

            # start from scratch via None: treat blank or "NONE" entries as no weights
            if fold_args["weights"]:
                if len(fold_args["weights"].strip()) == 0 or fold_args["weights"].upper() == "NONE":
                    fold_args["weights"] = None

            json.dump(
                fold_args,
                f,
                indent=4,
            )

        run_args.append({"json": path, "args": fold_args})

    # Launch the individual processes for each training
    with multiprocessing.Pool(processes=args.max_parallel_models) as pool:
        # workaround to forward keyboard interrupt
        pool.map_async(train_individual_model, run_args).get(999999999)
def main():
    """Run the experiment for several training-set sizes and report per-fold
    plus voted label error rates as CSV (optionally confusion tables and an
    xlsx report). In this variant, the per-size prediction dict maps fold ids
    plus the key 'voted' to result entries.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--base_dir", type=str, required=True,
                        help="The base directory where to store all working files")
    parser.add_argument("--eval_files", type=str, nargs="+", required=True,
                        help="All files that shall be used for evaluation")
    parser.add_argument("--n_lines", type=int, default=[-1], nargs="+",
                        help="Optional argument to specify the number of lines (images) used for training. "
                             "On default, all available lines will be used.")
    parser.add_argument("--run", type=str, default=None,
                        help="An optional command that will receive the train calls. Useful e.g. when using a resource "
                             "manager such as slurm.")
    parser.add_argument("--skip_train", action="store_true",
                        help="Skip the cross fold training")
    parser.add_argument("--skip_eval", action="store_true",
                        help="Skip the cross fold evaluation")
    parser.add_argument("--verbose", action="store_true",
                        help="Verbose output")
    parser.add_argument("--n_confusions", type=int, default=0,
                        help="Only print n most common confusions. Defaults to 0, use -1 for all.")
    parser.add_argument("--xlsx_output", type=str,
                        help="Optionally write a xlsx file with the evaluation results")
    setup_train_args(parser, omit=["early_stopping_best_model_output_dir", "output_dir"])
    args = parser.parse_args()
    args.base_dir = os.path.abspath(os.path.expanduser(args.base_dir))

    # Seed both RNGs for reproducible sub-sampling / training order.
    np.random.seed(args.seed)
    random.seed(args.seed)

    # run for all lines: one argument copy per requested training-set size
    single_args = [copy.copy(args) for _ in args.n_lines]
    for s_args, n_lines in zip(single_args, args.n_lines):
        s_args.n_lines = n_lines

    predictions = parallel_map(run_for_single_line, single_args, progress_bar=False,
                               processes=len(single_args), use_thread_pool=True)
    # Materialize, since the result is indexed below (predictions[0]).
    predictions = list(predictions)

    # output predictions as csv:
    # Number of fold columns = entries in the first result minus the 'voted' entry.
    header = "lines," + ",".join([str(fold) for fold in range(len(predictions[0]["full"]) - 1)])\
        + ",avg,std,voted"

    print(header)

    for prediction_map, n_lines in zip(predictions, args.n_lines):
        prediction = prediction_map["full"]
        data = "{}".format(n_lines)
        folds_lers = []
        # One column per fold; the 'voted' entry is appended after mean/std.
        for fold, pred in prediction.items():
            if fold == 'voted':
                continue
            eval = pred["eval"]
            data += ",{}".format(eval['avg_ler'])
            folds_lers.append(eval['avg_ler'])

        data += ",{},{}".format(np.mean(folds_lers), np.std(folds_lers))
        eval = prediction['voted']["eval"]
        data += ",{}".format(eval['avg_ler'])
        print(data)

    if args.n_confusions != 0:
        for prediction_map, n_lines in zip(predictions, args.n_lines):
            prediction = prediction_map["full"]
            print("")
            print("CONFUSIONS (lines = {})".format(n_lines))
            print("==========")
            print()
            # Prints the 'voted' entry as a pseudo-fold as well.
            for fold, pred in prediction.items():
                print("FOLD {}".format(fold))
                print_confusions(pred['eval'], args.n_confusions)

    if args.xlsx_output:
        data_list = []
        for prediction_map, n_lines in zip(predictions, args.n_lines):
            prediction = prediction_map["full"]
            for fold, pred in prediction.items():
                data_list.append({
                    "prefix": "L{} - Fold{}".format(n_lines, fold),
                    "results": pred['eval'],
                    "gt_files": prediction_map['gt_txts'],
                    "gts": prediction_map['gt'],
                    "preds": pred['data']
                })

            # NOTE(review): the CSV section above treats the keys of this dict
            # as fold ids plus 'voted'; indexing it with voter names here looks
            # like a leftover from the voter-based variant and would raise
            # KeyError if those keys are absent — confirm the actual schema of
            # run_for_single_line's result before relying on this branch.
            for voter in ['sequence_voter', 'confidence_voter_default_ctc']:
                pred = prediction[voter]
                data_list.append({
                    "prefix": "L{} - {}".format(n_lines, voter[:3]),
                    "results": pred['eval'],
                    "gt_files": prediction_map['gt_txts'],
                    "gts": prediction_map['gt'],
                    "preds": pred['data']
                })

        write_xlsx(args.xlsx_output, data_list)