def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--files", type=str, nargs="+", required=True,
                        help="Protobuf files to convert")
    parser.add_argument("--logits", action="store_true",
                        help="Also write the logits")

    args = parser.parse_args()

    files = glob_all(args.files)

    for file in tqdm(files, desc="Converting"):
        predictions = Predictions()
        with open(file, 'rb') as f:
            predictions.ParseFromString(f.read())

        # drop the logits unless they were explicitly requested
        if not args.logits:
            for prediction in predictions.predictions:
                prediction.logits.rows = 0
                prediction.logits.cols = 0
                prediction.logits.data[:] = []

        out_json_path = split_all_ext(file)[0] + ".json"
        with open(out_json_path, 'w') as f:
            f.write(MessageToJson(predictions, including_default_value_fields=True))
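
# Example invocation (a sketch: the script name "pred_to_json.py" is an assumption,
# the flags are the ones defined in main() above):
#
#   python pred_to_json.py --files "out/*.pred"           # strip logits from the JSON
#   python pred_to_json.py --files "out/*.pred" --logits  # keep logits
#
# Each input protobuf file is converted to a .json file written next to it.
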
def _load_sample(self, sample, text_only):
    gt_txt_path = sample['pred_path']
    if gt_txt_path is None:
        return None, None

    if gt_txt_path.endswith('.json'):
        with codecs.open(gt_txt_path, 'r', 'utf-8') as f:
            p = Parse(str(f.read()), Predictions())
            if len(p.predictions) == 0:
                return None, None

            # prefer the prediction with id 'voted'; fall back to the first one
            voted_p = p.predictions[0]
            for vp in p.predictions:
                if vp.id == 'voted':
                    voted_p = vp

            sample['best_prediction'] = voted_p
            sample['predictions'] = p

            return None, voted_p.sentence
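
# Shape of the .json file _load_sample() reads (a sketch; only the fields that are
# actually accessed above are shown, everything else in the Predictions message is omitted):
#
#   {
#     "predictions": [
#       {"id": "fold_0", "sentence": "text of the first model"},
#       {"id": "voted",  "sentence": "text after voting"}
#     ]
#   }
#
# An entry with id == "voted" wins; otherwise the first entry is used.
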
def run(args):
    # check if loading a json file
    if len(args.files) == 1 and args.files[0].endswith("json"):
        import json
        with open(args.files[0], 'r') as f:
            json_args = json.load(f)
            for key, value in json_args.items():
                setattr(args, key, value)

    # checks
    if args.extended_prediction_data_format not in ["pred", "json"]:
        raise Exception("Only 'pred' and 'json' are allowed extended prediction data formats")

    # add json as extension, resolve wildcard, expand user, ... and remove .json again
    args.checkpoint = [(cp if cp.endswith(".json") else cp + ".json") for cp in args.checkpoint]
    args.checkpoint = glob_all(args.checkpoint)
    args.checkpoint = [cp[:-5] for cp in args.checkpoint]

    # create voter
    voter_params = VoterParams()
    voter_params.type = VoterParams.Type.Value(args.voter.upper())
    voter = voter_from_proto(voter_params)

    # load files
    input_image_files = glob_all(args.files)
    if args.text_files:
        args.text_files = glob_all(args.text_files)

    # skip invalid files and remove them; there won't be predictions for invalid files
    dataset = create_dataset(
        args.dataset,
        DataSetMode.PREDICT,
        input_image_files,
        args.text_files,
        skip_invalid=True,
        remove_invalid=True,
        args={'text_index': args.pagexml_text_index},
    )

    print("Found {} files in the dataset".format(len(dataset)))
    if len(dataset) == 0:
        raise Exception("Empty dataset provided. Check your files argument (got {})!".format(args.files))

    # predict for all models
    predictor = MultiPredictor(checkpoints=args.checkpoint,
                               batch_size=args.batch_size,
                               processes=args.processes)
    do_prediction = predictor.predict_dataset(dataset, progress_bar=not args.no_progress_bars)

    avg_sentence_confidence = 0
    n_predictions = 0

    # output the voted results to the appropriate files
    for result, sample in do_prediction:
        n_predictions += 1
        for i, p in enumerate(result):
            p.prediction.id = "fold_{}".format(i)

        # vote the results (if only one model is given, this will just return the sentences)
        prediction = voter.vote_prediction_result(result)
        prediction.id = "voted"
        sentence = prediction.sentence
        avg_sentence_confidence += prediction.avg_char_probability
        if args.verbose:
            # wrap the output in LRE/RLE ... PDF marks so that bidi text renders correctly
            lr = "\u202A\u202B"
            print("{}: '{}{}{}'".format(sample['id'], lr[get_base_level(sentence)], sentence, "\u202C"))

        output_dir = args.output_dir

        dataset.store_text(sentence, sample, output_dir=output_dir, extension=".pred.txt")

        if args.extended_prediction_data:
            ps = Predictions()
            ps.line_path = sample['image_path'] if 'image_path' in sample else sample['id']
            ps.predictions.extend([prediction] + [r.prediction for r in result])
            output_dir = output_dir if output_dir else os.path.dirname(ps.line_path)
            if not os.path.exists(output_dir):
                os.mkdir(output_dir)

            if args.extended_prediction_data_format == "pred":
                with open(os.path.join(output_dir, sample['id'] + ".pred"), 'wb') as f:
                    f.write(ps.SerializeToString())
            elif args.extended_prediction_data_format == "json":
                with open(os.path.join(output_dir, sample['id'] + ".json"), 'w') as f:
                    # remove logits to keep the json small
                    for prediction in ps.predictions:
                        prediction.logits.rows = 0
                        prediction.logits.cols = 0
                        prediction.logits.data[:] = []

                    f.write(MessageToJson(ps, including_default_value_fields=True))
            else:
                raise Exception("Unknown prediction format.")

    print("Average sentence confidence: {:.2%}".format(avg_sentence_confidence / n_predictions))

    dataset.store()
    print("All files written")
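
# A minimal sketch of driving run() programmatically (hedged: every attribute below is
# read somewhere in run() above, but the concrete values and DataSetType.FILE are assumptions):
#
#   from argparse import Namespace
#   run(Namespace(
#       files=["lines/*.png"], checkpoint=["model_00"],
#       voter="confidence_voter_default_ctc",
#       dataset=DataSetType.FILE, text_files=None, pagexml_text_index=0,
#       batch_size=1, processes=1, no_progress_bars=False, verbose=False,
#       output_dir=None, extended_prediction_data=False,
#       extended_prediction_data_format="json",
#   ))
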
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--files", nargs="+", required=True,
                        help="List all image files that shall be processed")
    parser.add_argument("--checkpoint", type=str, nargs="+", default=[],
                        help="Path to the checkpoint without file extension")
    parser.add_argument("-j", "--processes", type=int, default=1,
                        help="Number of processes to use")
    parser.add_argument("--batch_size", type=int, default=1,
                        help="The batch size during the prediction (number of lines to process in parallel)")
    parser.add_argument("--verbose", action="store_true",
                        help="Print additional information")
    parser.add_argument("--voter", type=str, default="confidence_voter_default_ctc",
                        help="The voting algorithm to use. Possible values: confidence_voter_default_ctc (default), "
                             "confidence_voter_fuzzy_ctc, sequence_voter")
    parser.add_argument("--output_dir", type=str,
                        help="By default the prediction files will be written to the same directory as the given files. "
                             "You can use this argument to specify a specific output dir for the prediction files.")
    parser.add_argument("--extended_prediction_data", action="store_true",
                        help="Write the predicted string, labels, positions, probabilities and character alternatives "
                             "to a .pred (protobuf) file")
    parser.add_argument("--extended_prediction_data_format", type=str, default="json",
                        help="Format of the extended prediction data: either pred or json. "
                             "Note that json will not contain the logits.")
    parser.add_argument("--no_progress_bars", action="store_true",
                        help="Do not show any progress bars")

    args = parser.parse_args()

    # checks
    if args.extended_prediction_data_format not in ["pred", "json"]:
        raise Exception("Only 'pred' and 'json' are allowed extended prediction data formats")

    # add json as extension, resolve wildcard, expand user, ... and remove .json again
    args.checkpoint = [(cp if cp.endswith(".json") else cp + ".json") for cp in args.checkpoint]
    args.checkpoint = glob_all(args.checkpoint)
    args.checkpoint = [cp[:-5] for cp in args.checkpoint]

    # create voter
    voter_params = VoterParams()
    voter_params.type = VoterParams.Type.Value(args.voter.upper())
    voter = voter_from_proto(voter_params)

    # load files
    input_image_files = sorted(glob_all(args.files))

    # skip invalid files, but keep them so that empty predictions are created
    dataset = FileDataSet(input_image_files, skip_invalid=True, remove_invalid=False)

    print("Found {} files in the dataset".format(len(dataset)))
    if len(dataset) == 0:
        raise Exception("Empty dataset provided. Check your files argument (got {})!".format(args.files))

    # predict for all models
    predictor = MultiPredictor(checkpoints=args.checkpoint)
    do_prediction = predictor.predict_dataset(dataset,
                                              batch_size=args.batch_size,
                                              processes=args.processes,
                                              progress_bar=not args.no_progress_bars)

    # output the voted results to the appropriate files
    for (result, sample), filepath in zip(do_prediction, input_image_files):
        for i, p in enumerate(result):
            p.prediction.id = "fold_{}".format(i)

        # vote the results (if only one model is given, this will just return the sentences)
        prediction = voter.vote_prediction_result(result)
        prediction.id = "voted"
        sentence = prediction.sentence
        if args.verbose:
            print("{}: '{}'".format(sample['id'], sentence))

        output_dir = args.output_dir if args.output_dir else os.path.dirname(filepath)

        with codecs.open(os.path.join(output_dir, sample['id'] + ".pred.txt"), 'w', 'utf-8') as f:
            f.write(sentence)

        if args.extended_prediction_data:
            ps = Predictions()
            ps.line_path = filepath
            ps.predictions.extend([prediction] + [r.prediction for r in result])

            if args.extended_prediction_data_format == "pred":
                with open(os.path.join(output_dir, sample['id'] + ".pred"), 'wb') as f:
                    f.write(ps.SerializeToString())
            elif args.extended_prediction_data_format == "json":
                with open(os.path.join(output_dir, sample['id'] + ".json"), 'w') as f:
                    # remove logits to keep the json small
                    for prediction in ps.predictions:
                        prediction.logits.rows = 0
                        prediction.logits.cols = 0
                        prediction.logits.data[:] = []

                    f.write(MessageToJson(ps, including_default_value_fields=True))
            else:
                raise Exception("Unknown prediction format.")

    print("All files written")
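
# Example invocation (a sketch: the script name "predict.py" is an assumption,
# the flags are exactly the ones defined by the parser in main() above):
#
#   python predict.py --files "lines/*.png" \
#       --checkpoint model_00 model_01 model_02 \
#       --voter confidence_voter_default_ctc \
#       --extended_prediction_data --extended_prediction_data_format json
#
# This writes one <id>.pred.txt per line image and, because of the last two flags,
# one <id>.json with the voted and per-fold predictions (logits stripped).
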
def run(args):
    # checks
    if args.extended_prediction_data_format not in ["pred", "json"]:
        raise Exception("Only 'pred' and 'json' are allowed extended prediction data formats")

    # add json as extension, resolve wildcard, expand user, ... and remove .json again
    args.checkpoint = [(cp if cp.endswith(".json") else cp + ".json") for cp in args.checkpoint]
    args.checkpoint = glob_all(args.checkpoint)
    args.checkpoint = [cp[:-5] for cp in args.checkpoint]

    # create voter
    voter_params = VoterParams()
    voter_params.type = VoterParams.Type.Value(args.voter.upper())
    voter = voter_from_proto(voter_params)

    # load files (args.files is a list of patterns, so use glob_all;
    # glob.glob would require a single string)
    files = glob_all(args.files)
    dataset = AbbyyDataSet(files,
                           skip_invalid=True,
                           remove_invalid=False,
                           binary=args.binary)
    dataset.load_samples(processes=args.processes, progress_bar=not args.no_progress_bars)

    print("Found {} files in the dataset".format(len(dataset)))
    if len(dataset) == 0:
        raise Exception("Empty dataset provided. Check your files argument (got {})!".format(args.files))

    # predict for all models
    predictor = MultiPredictor(checkpoints=args.checkpoint,
                               batch_size=args.batch_size,
                               processes=args.processes)
    do_prediction = predictor.predict_dataset(dataset, progress_bar=not args.no_progress_bars)

    # create the input_image_files list for the next loop:
    # one entry per format element of every page
    input_image_files = []
    for page in dataset.book.pages:
        for fo in page.getFormats():
            input_image_files.append(page.imgFile)

    # output the voted results to the appropriate files
    for (result, sample), filepath in zip(do_prediction, input_image_files):
        for i, p in enumerate(result):
            p.prediction.id = "fold_{}".format(i)

        # vote the results (if only one model is given, this will just return the sentences)
        prediction = voter.vote_prediction_result(result)
        prediction.id = "voted"
        sentence = prediction.sentence
        if args.verbose:
            # wrap the output in LRE/RLE ... PDF marks so that bidi text renders correctly
            lr = "\u202A\u202B"
            print("{}: '{}{}{}'".format(sample['id'], lr[get_base_level(sentence)], sentence, "\u202C"))

        output_dir = args.output_dir if args.output_dir else os.path.dirname(filepath)

        sample["format"].text = sentence

        if args.extended_prediction_data:
            ps = Predictions()
            ps.line_path = filepath
            ps.predictions.extend([prediction] + [r.prediction for r in result])

            if args.extended_prediction_data_format == "pred":
                with open(os.path.join(output_dir, sample['id'] + ".pred"), 'wb') as f:
                    f.write(ps.SerializeToString())
            elif args.extended_prediction_data_format == "json":
                with open(os.path.join(output_dir, sample['id'] + ".json"), 'w') as f:
                    # remove logits to keep the json small
                    for prediction in ps.predictions:
                        prediction.logits.rows = 0
                        prediction.logits.cols = 0
                        prediction.logits.data[:] = []

                    f.write(MessageToJson(ps, including_default_value_fields=True))
            else:
                raise Exception("Unknown prediction format.")

    w = XMLWriter(output_dir, os.path.dirname(filepath), dataset.book)
    w.write()

    print("All files written")
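
# A minimal sketch of the args this run() expects (hedged: every attribute below is
# read in run() above, but the concrete values are made up for illustration):
#
#   from argparse import Namespace
#   run(Namespace(
#       files=["book/*.xml"], checkpoint=["model_00"],
#       voter="confidence_voter_default_ctc", binary=False,
#       batch_size=1, processes=1, no_progress_bars=False, verbose=False,
#       output_dir=None, extended_prediction_data=False,
#       extended_prediction_data_format="json",
#   ))
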