Example #1
    def on_batch_end(self, batch, logs):
        dt = time.time() - self.iter_start_time
        self.iter_start_time = time.time()
        self.dt_stats.push(dt)
        self.loss_stats.push(logs['loss'])
        self.checkpoint_params.iter += 1

        if self.display > 0 and self.checkpoint_params.iter % self.display == 0:
            # apply postprocessing to display the true output
            cer, target, decoded = self._generate(1)
            self.ler_stats.push(cer)
            pred_sentence = self.text_post_proc.apply("".join(
                self.codec.decode(decoded[0])))
            gt_sentence = self.text_post_proc.apply("".join(
                self.codec.decode(target[0])))

            if self.display_epochs:
                print("#{:08f}: loss={:.8f} ler={:.8f} dt={:.8f}s".format(
                    self.checkpoint_params.iter / self.steps_per_epoch,
                    self.loss_stats.mean(), self.ler_stats.mean(),
                    self.dt_stats.mean()))
            else:
                print("#{:08d}: loss={:.8f} ler={:.8f} dt={:.8f}s".format(
                    self.checkpoint_params.iter, self.loss_stats.mean(),
                    self.ler_stats.mean(), self.dt_stats.mean()))

            # Insert Unicode LTR/RTL embedding marks for bidi display support
            lr = "\u202A\u202B"
            print("  PRED: '{}{}{}'".format(
                lr[bidi.get_base_level(pred_sentence)], pred_sentence,
                "\u202C"))
            print("  TRUE: '{}{}{}'".format(
                lr[bidi.get_base_level(gt_sentence)], gt_sentence, "\u202C"))
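
A note on the direction-mark trick used above: "\u202A" is LEFT-TO-RIGHT EMBEDDING (LRE) and "\u202B" is RIGHT-TO-LEFT EMBEDDING (RLE); bidi.get_base_level returns 0 for a left-to-right base direction and 1 for right-to-left, so indexing the two-character string lr picks the matching mark, and "\u202C" (POP DIRECTIONAL FORMATTING) closes the embedding. A minimal self-contained sketch of the same idea, assuming only the python-bidi package:

from bidi.algorithm import get_base_level

def wrap_bidi(text):
    # lr[0] = LRE (U+202A) for base level 0, lr[1] = RLE (U+202B) for base level 1;
    # PDF (U+202C) pops the embedding again at the end.
    lr = "\u202A\u202B"
    return "{}{}{}".format(lr[get_base_level(text)], text, "\u202C")

print(wrap_bidi("hello"))                     # LTR: wrapped in LRE ... PDF
print(wrap_bidi("\u05e9\u05dc\u05d5\u05dd"))  # Hebrew (RTL): wrapped in RLE ... PDF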
Example #2
    def print_evaluate(self, sample: Sample, data, print_fn):
        targets, outputs = sample.targets, sample.outputs
        pred_sentence = outputs.sentence
        gt_sentence = targets['sentence']
        lr = "\u202A\u202B"
        cer = Levenshtein.distance(pred_sentence, gt_sentence) / len(gt_sentence)
        print_fn("\n  CER:  {}".format(cer) +
                 "\n  PRED: '{}{}{}'".format(lr[bidi.get_base_level(pred_sentence)], pred_sentence, "\u202C") +
                 "\n  TRUE: '{}{}{}'".format(lr[bidi.get_base_level(gt_sentence)], gt_sentence, "\u202C"))
Example #3
    def display(self, train_cer, train_loss, train_dt, iter, steps_per_epoch, display_epochs,
                example_pred, example_gt):
        if display_epochs:
            print("#{:08f}: loss={:.8f} ler={:.8f} dt={:.8f}s".format(
                iter / steps_per_epoch, train_loss, train_cer,
                train_dt))
        else:
            print("#{:08d}: loss={:.8f} ler={:.8f} dt={:.8f}s".format(
                iter, train_loss, train_cer, train_dt))

        lr = "\u202A\u202B"
        print("  PRED: '{}{}{}'".format(lr[bidi.get_base_level(example_pred)], example_pred, "\u202C"))
        print("  TRUE: '{}{}{}'".format(lr[bidi.get_base_level(example_gt)], example_gt, "\u202C"))
Example #4
    def print_evaluate(self, sample: Sample, data, print_fn=print):
        targets, outputs = sample.targets, sample.outputs
        gt_sentence = targets['sentence']
        lr = "\u202A\u202B"
        s = ""

        pred_sentence = outputs.sentence
        cer = Levenshtein.distance(pred_sentence, gt_sentence) / len(gt_sentence)
        s += (
                "\n  PRED (CER={:.2f}): '{}{}{}'".format(cer, lr[bidi.get_base_level(pred_sentence)], pred_sentence,
                                                         "\u202C") +
                "\n  TRUE:            '{}{}{}'".format(lr[bidi.get_base_level(gt_sentence)], gt_sentence, "\u202C"))

        print_fn(s)
Example #5
    def print_evaluate(self, inputs: Dict[str, AnyNumpy], outputs: Prediction,
                       targets: Dict[str, AnyNumpy], data: 'CalamariData',
                       print_fn):
        pred_sentence = outputs.sentence
        gt_sentence = targets['sentence']
        lr = "\u202A\u202B"
        cer = Levenshtein.distance(pred_sentence,
                                   gt_sentence) / len(gt_sentence)
        print_fn(
            "\n  CER:  {}".format(cer) +
            "\n  PRED: '{}{}{}'".format(lr[bidi.get_base_level(pred_sentence)],
                                        pred_sentence, "\u202C") +
            "\n  TRUE: '{}{}{}'".format(lr[bidi.get_base_level(gt_sentence)],
                                        gt_sentence, "\u202C"))
Example #6
def bidi_record(record):
    """
    Reorders a record using the Unicode BiDi algorithm. 
    
    Models trained for RTL or mixed scripts still emit classes in LTR order
    requiring reordering for proper display.

    Args:
        record (kraken.rpred.ocr_record)

    Returns:
        kraken.rpred.ocr_record 
    """
    storage = bd.get_empty_storage()
    base_level = bd.get_base_level(record.prediction)
    storage['base_level'] = base_level
    storage['base_dir'] = ('L', 'R')[base_level]

    bd.get_embedding_levels(record.prediction, storage)
    bd.explicit_embed_and_overrides(storage)
    bd.resolve_weak_types(storage)
    bd.resolve_neutral_types(storage, False)
    bd.resolve_implicit_levels(storage, False)
    for i, j in enumerate(record):
        storage['chars'][i]['record'] = j
    bd.reorder_resolved_levels(storage, False)
    bd.apply_mirroring(storage, False)
    prediction = u''
    cuts = []
    confidences = []
    for ch in storage['chars']:
        prediction = prediction + ch['record'][0]
        cuts.append(ch['record'][1])
        confidences.append(ch['record'][2])
    return ocr_record(prediction, cuts, confidences)
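
The reordering that bidi_record applies to a record is the same Unicode BiDi resolution that python-bidi's high-level get_display performs on a plain string; a minimal sketch of the visual effect, independent of kraken's ocr_record:

from bidi.algorithm import get_display

logical = "abc \u05d0\u05d1\u05d2"  # logical order: 'abc ' followed by Hebrew alef, bet, gimel
print(get_display(logical))         # display order: 'abc \u05d2\u05d1\u05d0' (Hebrew run reversed)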
Example #7
def getBiDiInfo(text, *, upper_is_rtl=False, base_dir=None, debug=False):
    """
    Set `upper_is_rtl` to True to treat upper case chars as strong 'R'
    for debugging (default: False).

    Set `base_dir` to 'L' or 'R' to override the calculated base_level.

    Set `debug` to True to display (using sys.stderr) the steps taken with the
    algorithm.

    Returns the info dict object (the resolved BiDi storage).
    """
    storage = get_empty_storage()

    if base_dir is None:
        base_level = get_base_level(text, upper_is_rtl)
    else:
        base_level = PARAGRAPH_LEVELS[base_dir]

    storage['base_level'] = base_level
    storage['base_dir'] = ('L', 'R')[base_level]

    get_embedding_levels(text, storage, upper_is_rtl, debug)
    assert len(text) == len(storage["chars"])
    for index, (ch, chInfo) in enumerate(zip(text, storage["chars"])):
        assert ch == chInfo["ch"]
        chInfo["index"] = index

    explicit_embed_and_overrides(storage, debug)
    resolve_weak_types(storage, debug)
    resolve_neutral_types(storage, debug)
    resolve_implicit_levels(storage, debug)
    reorder_resolved_levels(storage, debug)

    return storage
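
A short usage sketch for the function above: after reorder_resolved_levels, storage['chars'] holds the characters in display order, and the index field added in the loop preserves each character's logical position:

info = getBiDiInfo("abc \u05d0\u05d1\u05d2")        # 'abc ' plus a Hebrew run
print(info['base_dir'])                             # 'L' (first strong character is LTR)
print(''.join(ch['ch'] for ch in info['chars']))    # characters in display order
print([ch['index'] for ch in info['chars']])        # logical positions, e.g. [0, 1, 2, 3, 6, 5, 4]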
Example #8
    def print_evaluate(self,
                       inputs: Dict[str, AnyNumpy],
                       outputs: Prediction,
                       targets: Dict[str, AnyNumpy],
                       data,
                       print_fn=print):
        gt_sentence = targets['sentence']
        lr = "\u202A\u202B"
        s = ""

        pred_sentence = outputs.sentence
        cer = Levenshtein.distance(pred_sentence,
                                   gt_sentence) / len(gt_sentence)
        s += ("\n  PRED (CER={:.2f}): '{}{}{}'".format(
            cer, lr[bidi.get_base_level(pred_sentence)], pred_sentence,
            "\u202C") + "\n  TRUE:            '{}{}{}'".format(
                lr[bidi.get_base_level(gt_sentence)], gt_sentence, "\u202C"))

        print_fn(s)
Example #9
def get_display_mod(unicode_or_str,
                    encoding='utf-8',
                    upper_is_rtl=False,
                    base_dir=None,
                    debug=False):
    """Accepts unicode or string. In case it's a string, `encoding`
    is needed as it works on unicode ones (default:"utf-8").
    Set `upper_is_rtl` to True to treat upper case chars as strong 'R'
    for debugging (default: False).
    Set `base_dir` to 'L' or 'R' to override the calculated base_level.
    Set `debug` to True to display (using sys.stderr) the steps taken with the
    algorithm.
    Returns the display layout, either as unicode or `encoding` encoded
    string.
    """
    storage = bidi.get_empty_storage()

    # utf-8 ? we need unicode
    if isinstance(unicode_or_str, six.text_type):
        text = unicode_or_str
        decoded = False
    else:
        text = unicode_or_str.decode(encoding)
        decoded = True

    if base_dir is None:
        base_level = bidi.get_base_level(text, upper_is_rtl)
    else:
        base_level = bidi.PARAGRAPH_LEVELS[base_dir]

    storage['base_level'] = base_level
    storage['base_dir'] = ('L', 'R')[base_level]

    bidi.get_embedding_levels(text, storage, upper_is_rtl, debug)
    bidi.explicit_embed_and_overrides(storage, debug)
    bidi.resolve_weak_types(storage, debug)
    bidi.resolve_neutral_types(storage, debug)
    bidi.resolve_implicit_levels(storage, debug)
    bidi.reorder_resolved_levels(storage, debug)
    # Commented out from the original code:
    # bidi.apply_mirroring(storage, debug)
    # print_storage_chars(storage)
    # chars = storage['chars']
    # display = u''.join([_ch['ch'] for _ch in chars])
    display = print_storage_chars(storage)

    if decoded:
        return display.encode(encoding)
    else:
        return display
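
Compared with python-bidi's stock get_display, this variant skips apply_mirroring (paired characters such as brackets keep their logical code points) and delegates the final string assembly to print_storage_chars, a helper not shown here. Judging from the commented-out original lines, that assembly presumably reduces to joining the reordered characters:

def assemble_display(storage):
    # Equivalent of the commented-out original:
    #     display = u''.join([_ch['ch'] for _ch in chars])
    return u''.join(ch['ch'] for ch in storage['chars'])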
Example #10
def bidi_record(record: ocr_record, base_dir=None) -> ocr_record:
    """
    Reorders a record using the Unicode BiDi algorithm.

    Models trained for RTL or mixed scripts still emit classes in LTR order,
    requiring reordering for proper display.

    Args:
        record (kraken.rpred.ocr_record)

    Returns:
        kraken.rpred.ocr_record
    """
    storage = bd.get_empty_storage()

    if base_dir not in ('L', 'R'):
        base_level = bd.get_base_level(record.prediction)
    else:
        base_level = {'L': 0, 'R': 1}[base_dir]

    storage['base_level'] = base_level
    storage['base_dir'] = ('L', 'R')[base_level]

    bd.get_embedding_levels(record.prediction, storage)
    bd.explicit_embed_and_overrides(storage)
    bd.resolve_weak_types(storage)
    bd.resolve_neutral_types(storage, False)
    bd.resolve_implicit_levels(storage, False)
    for i, j in enumerate(record):
        storage['chars'][i]['record'] = j
    bd.reorder_resolved_levels(storage, False)
    bd.apply_mirroring(storage, False)
    prediction = ''
    cuts = []
    confidences = []
    for ch in storage['chars']:
        # code point may have been mirrored
        prediction = prediction + ch['ch']
        cuts.append(ch['record'][1])
        confidences.append(ch['record'][2])
    # carry over whole line information
    if record.type == 'baselines':
        line = {'boundary': record.line, 'baseline': record.baseline}
    else:
        line = record.line
    rec = ocr_record(prediction, cuts, confidences, line)
    rec.tags = record.tags
    rec.base_dir = base_dir
    return rec
Example #11
    def train(self, progress_bar=False):
        """ Launch the training

        Parameters
        ----------
        progress_bar : bool
            Show or hide any progress bar

        """
        checkpoint_params = self.checkpoint_params

        # subtract the previously accumulated time so that the elapsed-time
        # bookkeeping (time.time() - train_start_time) stays cumulative on resume
        train_start_time = time.time() - self.checkpoint_params.total_time

        self.dataset.load_samples(processes=1, progress_bar=progress_bar)
        datas, txts = self.dataset.train_samples(
            skip_empty=checkpoint_params.skip_invalid_gt)
        if len(datas) == 0:
            raise Exception(
                "Empty dataset is not allowed. Check if the data is at the correct location"
            )

        if self.validation_dataset:
            self.validation_dataset.load_samples(processes=1,
                                                 progress_bar=progress_bar)
            validation_datas, validation_txts = self.validation_dataset.train_samples(
                skip_empty=checkpoint_params.skip_invalid_gt)
            if len(validation_datas) == 0:
                raise Exception(
                    "Validation dataset is empty. Provide valid validation data for early stopping."
                )
        else:
            validation_datas, validation_txts = [], []

        # preprocessing steps
        texts = self.txt_preproc.apply(txts,
                                       processes=checkpoint_params.processes,
                                       progress_bar=progress_bar)
        datas = self.data_preproc.apply(datas,
                                        processes=checkpoint_params.processes,
                                        progress_bar=progress_bar)
        validation_txts = self.txt_preproc.apply(
            validation_txts,
            processes=checkpoint_params.processes,
            progress_bar=progress_bar)
        validation_datas = self.data_preproc.apply(
            validation_datas,
            processes=checkpoint_params.processes,
            progress_bar=progress_bar)

        # compute the codec
        codec = self.codec if self.codec else Codec.from_texts(
            texts, whitelist=self.codec_whitelist)

        # data augmentation on preprocessed data
        if self.data_augmenter:
            datas, texts = self.data_augmenter.augment_datas(
                datas,
                texts,
                n_augmentations=self.n_augmentations,
                processes=checkpoint_params.processes,
                progress_bar=progress_bar)

            # TODO: validation data augmentation
            # validation_datas, validation_txts = self.data_augmenter.augment_datas(validation_datas, validation_txts, n_augmentations=0,
            #                                                  processes=checkpoint_params.processes, progress_bar=progress_bar)

        # create backend
        network_params = checkpoint_params.model.network
        network_params.features = checkpoint_params.model.line_height
        network_params.classes = len(codec)
        if self.weights:
            # if we load the weights, take care of codec changes as well
            with open(self.weights + '.json', 'r') as f:
                restore_checkpoint_params = json_format.Parse(
                    f.read(), CheckpointParams())
                restore_model_params = restore_checkpoint_params.model

            # checks
            if checkpoint_params.model.line_height != restore_model_params.line_height:
                raise Exception(
                    "The model to restore has a line height of {} but a line height of {} is requested"
                    .format(restore_model_params.line_height,
                            checkpoint_params.model.line_height))

            # create codec of the same type
            restore_codec = codec.__class__(restore_model_params.codec.charset)
            # the codec changes as a tuple (deletions, insertions); the new codec is the modified old one
            codec_changes = restore_codec.align(codec)
            codec = restore_codec
            print("Codec changes: {} deletions, {} appends".format(
                len(codec_changes[0]), len(codec_changes[1])))
            # The actual weight/bias matrix will be changed after loading the old weights
        else:
            codec_changes = None

        # store the new codec
        checkpoint_params.model.codec.charset[:] = codec.charset
        print("CODEC: {}".format(codec.charset))

        # compute the labels with (new/current) codec
        labels = [codec.encode(txt) for txt in texts]

        backend = create_backend_from_proto(
            network_params,
            weights=self.weights,
        )
        train_net = backend.create_net(restore=None,
                                       weights=self.weights,
                                       graph_type="train",
                                       batch_size=checkpoint_params.batch_size)
        test_net = backend.create_net(restore=None,
                                      weights=self.weights,
                                      graph_type="test",
                                      batch_size=checkpoint_params.batch_size)
        train_net.set_data(datas, labels)
        test_net.set_data(validation_datas, validation_txts)
        if codec_changes:
            # only required on one net, since the other shares the same variables
            train_net.realign_model_labels(*codec_changes)

        train_net.prepare()
        test_net.prepare()

        loss_stats = RunningStatistics(checkpoint_params.stats_size,
                                       checkpoint_params.loss_stats)
        ler_stats = RunningStatistics(checkpoint_params.stats_size,
                                      checkpoint_params.ler_stats)
        dt_stats = RunningStatistics(checkpoint_params.stats_size,
                                     checkpoint_params.dt_stats)

        early_stopping_enabled = self.validation_dataset is not None \
                                 and checkpoint_params.early_stopping_frequency > 0 \
                                 and checkpoint_params.early_stopping_nbest > 1
        early_stopping_best_accuracy = checkpoint_params.early_stopping_best_accuracy
        early_stopping_best_cur_nbest = checkpoint_params.early_stopping_best_cur_nbest
        early_stopping_best_at_iter = checkpoint_params.early_stopping_best_at_iter

        early_stopping_predictor = Predictor(codec=codec,
                                             text_postproc=self.txt_postproc,
                                             network=test_net)

        # Start the actual training
        # ====================================================================================

        iter = checkpoint_params.iter

        # helper function to write a checkpoint
        def make_checkpoint(base_dir, prefix, version=None):
            if version:
                checkpoint_path = os.path.abspath(
                    os.path.join(base_dir, "{}{}.ckpt".format(prefix,
                                                              version)))
            else:
                checkpoint_path = os.path.abspath(
                    os.path.join(base_dir,
                                 "{}{:08d}.ckpt".format(prefix, iter + 1)))
            print("Storing checkpoint to '{}'".format(checkpoint_path))
            train_net.save_checkpoint(checkpoint_path)
            checkpoint_params.iter = iter
            checkpoint_params.loss_stats[:] = loss_stats.values
            checkpoint_params.ler_stats[:] = ler_stats.values
            checkpoint_params.dt_stats[:] = dt_stats.values
            checkpoint_params.total_time = time.time() - train_start_time
            checkpoint_params.early_stopping_best_accuracy = early_stopping_best_accuracy
            checkpoint_params.early_stopping_best_cur_nbest = early_stopping_best_cur_nbest
            checkpoint_params.early_stopping_best_at_iter = early_stopping_best_at_iter

            with open(checkpoint_path + ".json", 'w') as f:
                f.write(json_format.MessageToJson(checkpoint_params))

            return checkpoint_path

        try:
            last_checkpoint = None
            n_infinite_losses = 0
            n_max_infinite_losses = 5

            # Training loop, can be interrupted by early stopping
            for iter in range(iter, checkpoint_params.max_iters):
                checkpoint_params.iter = iter

                iter_start_time = time.time()
                result = train_net.train_step()

                if not np.isfinite(result['loss']):
                    n_infinite_losses += 1

                    if n_max_infinite_losses == n_infinite_losses:
                        print(
                            "Error: Loss is not finite! Trying to restart from last checkpoint."
                        )
                        if not last_checkpoint:
                            raise Exception(
                                "No checkpoint written yet. Training must be stopped."
                            )
                        else:
                            # also reload non-trainable weights, such as solver-specific variables
                            train_net.load_weights(
                                last_checkpoint, restore_only_trainable=False)
                            continue
                    else:
                        continue

                n_infinite_losses = 0

                loss_stats.push(result['loss'])
                ler_stats.push(result['ler'])

                dt_stats.push(time.time() - iter_start_time)

                if iter % checkpoint_params.display == 0:
                    # apply postprocessing to display the true output
                    pred_sentence = self.txt_postproc.apply("".join(
                        codec.decode(result["decoded"][0])))
                    gt_sentence = self.txt_postproc.apply("".join(
                        codec.decode(result["gt"][0])))

                    print("#{:08d}: loss={:.8f} ler={:.8f} dt={:.8f}s".format(
                        iter, loss_stats.mean(), ler_stats.mean(),
                        dt_stats.mean()))
                    # Insert Unicode LTR/RTL embedding marks for bidi display support
                    lr = "\u202A\u202B"
                    print(" PRED: '{}{}{}'".format(
                        lr[bidi.get_base_level(pred_sentence)], pred_sentence,
                        "\u202C"))
                    print(" TRUE: '{}{}{}'".format(
                        lr[bidi.get_base_level(gt_sentence)], gt_sentence,
                        "\u202C"))

                if (iter + 1) % checkpoint_params.checkpoint_frequency == 0:
                    last_checkpoint = make_checkpoint(
                        checkpoint_params.output_dir,
                        checkpoint_params.output_model_prefix)

                if early_stopping_enabled and (
                        iter +
                        1) % checkpoint_params.early_stopping_frequency == 0:
                    print("Checking early stopping model")

                    out = early_stopping_predictor.predict_raw(
                        validation_datas,
                        progress_bar=progress_bar,
                        apply_preproc=False)
                    pred_texts = [d.sentence for d in out]
                    pred_texts = self.txt_preproc.apply(
                        pred_texts,
                        processes=checkpoint_params.processes,
                        progress_bar=progress_bar)
                    result = Evaluator.evaluate(gt_data=validation_txts,
                                                pred_data=pred_texts,
                                                progress_bar=progress_bar)
                    accuracy = 1 - result["avg_ler"]

                    if accuracy > early_stopping_best_accuracy:
                        early_stopping_best_accuracy = accuracy
                        early_stopping_best_cur_nbest = 1
                        early_stopping_best_at_iter = iter + 1
                        # overwrite as best model
                        last_checkpoint = make_checkpoint(
                            checkpoint_params.
                            early_stopping_best_model_output_dir,
                            prefix="",
                            version=checkpoint_params.
                            early_stopping_best_model_prefix,
                        )
                        print(
                            "Found better model with accuracy of {:%}".format(
                                early_stopping_best_accuracy))
                    else:
                        early_stopping_best_cur_nbest += 1
                        print(
                            "No better model found. Currently accuracy of {:%} at iter {} (remaining nbest = {})"
                            .format(
                                early_stopping_best_accuracy,
                                early_stopping_best_at_iter,
                                checkpoint_params.early_stopping_nbest -
                                early_stopping_best_cur_nbest))

                    if accuracy > 0 and early_stopping_best_cur_nbest >= checkpoint_params.early_stopping_nbest:
                        print("Early stopping now.")
                        break

        except KeyboardInterrupt as e:
            print("Storing interrupted checkpoint")
            make_checkpoint(checkpoint_params.output_dir,
                            checkpoint_params.output_model_prefix,
                            "interrupted")
            raise e

        print("Total time {}s for {} iterations.".format(
            time.time() - train_start_time, iter))
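
RunningStatistics is not defined in these examples; judging from its use here (a window size plus an initial value list, push(), mean(), and a values property for checkpointing), a hypothetical minimal stand-in could look like this:

from collections import deque

class RunningStatistics:
    # Hypothetical sketch: a fixed-size window over the most recent values.
    def __init__(self, stats_size, values=None):
        self.stats = deque(values or [], maxlen=stats_size)

    def push(self, value):
        self.stats.append(value)

    def mean(self):
        return sum(self.stats) / len(self.stats) if self.stats else 0.0

    @property
    def values(self):
        return list(self.stats)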
Example #12
def run(args):
    # check if loading a json file
    if len(args.files) == 1 and args.files[0].endswith("json"):
        import json
        with open(args.files[0], 'r') as f:
            json_args = json.load(f)
            for key, value in json_args.items():
                setattr(args, key, value)

    # checks
    if args.extended_prediction_data_format not in ["pred", "json"]:
        raise Exception("Only 'pred' and 'json' are allowed extended prediction data formats")

    # add json as extension, resolve wildcard, expand user, ... and remove .json again
    args.checkpoint = [(cp if cp.endswith(".json") else cp + ".json") for cp in args.checkpoint]
    args.checkpoint = glob_all(args.checkpoint)
    args.checkpoint = [cp[:-5] for cp in args.checkpoint]

    # create voter
    voter_params = VoterParams()
    voter_params.type = VoterParams.Type.Value(args.voter.upper())
    voter = voter_from_proto(voter_params)

    # load files
    input_image_files = glob_all(args.files)
    if args.text_files:
        args.text_files = glob_all(args.text_files)

    # skip invalid files and remove them; there won't be predictions for invalid files
    dataset = create_dataset(
        args.dataset,
        DataSetMode.PREDICT,
        input_image_files,
        args.text_files,
        skip_invalid=True,
        remove_invalid=True,
        args={'text_index': args.pagexml_text_index},
    )

    print("Found {} files in the dataset".format(len(dataset)))
    if len(dataset) == 0:
        raise Exception("Empty dataset provided. Check your files argument (got {})!".format(args.files))

    # predict for all models
    predictor = MultiPredictor(checkpoints=args.checkpoint, batch_size=args.batch_size, processes=args.processes)
    do_prediction = predictor.predict_dataset(dataset, progress_bar=not args.no_progress_bars)

    avg_sentence_confidence = 0
    n_predictions = 0

    # output the voted results to the appropriate files
    for result, sample in do_prediction:
        n_predictions += 1
        for i, p in enumerate(result):
            p.prediction.id = "fold_{}".format(i)

        # vote the results (if only one model is given, this will just return the sentences)
        prediction = voter.vote_prediction_result(result)
        prediction.id = "voted"
        sentence = prediction.sentence
        avg_sentence_confidence += prediction.avg_char_probability
        if args.verbose:
            lr = "\u202A\u202B"
            print("{}: '{}{}{}'".format(sample['id'], lr[get_base_level(sentence)], sentence, "\u202C" ))

        output_dir = args.output_dir

        dataset.store_text(sentence, sample, output_dir=output_dir, extension=".pred.txt")

        if args.extended_prediction_data:
            ps = Predictions()
            ps.line_path = sample['image_path'] if 'image_path' in sample else sample['id']
            ps.predictions.extend([prediction] + [r.prediction for r in result])
            output_dir = output_dir if output_dir else os.path.dirname(ps.line_path)
            if not os.path.exists(output_dir):
                os.mkdir(output_dir)

            if args.extended_prediction_data_format == "pred":
                with open(os.path.join(output_dir, sample['id'] + ".pred"), 'wb') as f:
                    f.write(ps.SerializeToString())
            elif args.extended_prediction_data_format == "json":
                with open(os.path.join(output_dir, sample['id'] + ".json"), 'w') as f:
                    # remove logits
                    for prediction in ps.predictions:
                        prediction.logits.rows = 0
                        prediction.logits.cols = 0
                        prediction.logits.data[:] = []

                    f.write(MessageToJson(ps, including_default_value_fields=True))
            else:
                raise Exception("Unknown prediction format.")

    print("Average sentence confidence: {:.2%}".format(avg_sentence_confidence / n_predictions))

    dataset.store()
    print("All files written")
Example #13
    def predict_books(
        self,
        books,
        checkpoint,
        cachefile=None,
        pageupload=True,
        text_index=1,
        pred_all=False,
    ):
        keras.backend.clear_session()
        if type(books) == str:
            books = [books]
        if type(checkpoint) == str:
            checkpoint = [checkpoint]
        checkpoint = [
            (cp if cp.endswith(".json") else cp + ".json") for cp in checkpoint
        ]
        checkpoint = glob_all(checkpoint)
        checkpoint = [cp[:-5] for cp in checkpoint]
        if cachefile is None:
            cachefile = self.cachefile
        verbose = False
        lids = list(
            lids_from_books(
                books,
                cachefile,
                complete_only=False,
                skip_commented=False,
                new_only=not pred_all,
            )
        )
        data = Nsh5(cachefile=cachefile, lines=lids)

        predparams = PredictorParams()
        predparams.device.gpus = [n for n, _ in enumerate(list_physical_devices("GPU"))]

        predictor = MultiPredictor.from_paths(
            checkpoints=checkpoint,
            voter_params=VoterParams(),
            predictor_params=predparams,
        )

        newprcs = []
        for prc in predictor.data.params.pre_proc.processors:
            prc = deepcopy(prc)
            if isinstance(prc, FinalPreparationProcessorParams):
                prc.normalize, prc.invert, prc.transpose = False, False, True
                newprcs.append(prc)
            elif isinstance(prc, PrepareSampleProcessorParams):
                newprcs.append(prc)
        predictor.data.params.pre_proc.processors = newprcs

        do_prediction = predictor.predict(data)
        pipeline = predictor.data.get_or_create_pipeline(
            predictor.params.pipeline, data
        )
        reader = pipeline.reader()
        if len(reader) == 0:
            raise Exception(
                "Empty dataset provided. Check your lines (got {})!".format(lids)
            )

        avg_sentence_confidence = 0
        n_predictions = 0

        reader.prepare_store()

        samples = []
        sentences = []
        # output the voted results to the appropriate files
        for s in do_prediction:
            _, (_, prediction), meta = s.inputs, s.outputs, s.meta
            sample = reader.sample_by_id(meta["id"])
            n_predictions += 1
            sentence = prediction.sentence

            avg_sentence_confidence += prediction.avg_char_probability
            if verbose:
                lr = "\u202A\u202B"
                logger.info(
                    "{}: '{}{}{}'".format(
                        meta["id"], lr[get_base_level(sentence)], sentence, "\u202C"
                    )
                )

            samples.append(sample)
            sentences.append(sentence)
            reader.store_text(sentence, sample, output_dir=None, extension=None)

        logger.info(
            "Average sentence confidence: {:.2%}".format(
                avg_sentence_confidence / n_predictions
            )
        )

        if pageupload:
            ocrdata = {}
            for lname, text in reader.predictions.items():
                _, b, p, ln = lname.split("/")
                if b not in ocrdata:
                    ocrdata[b] = {}
                if p not in ocrdata[b]:
                    ocrdata[b][p] = {}
                ocrdata[b][p][ln] = text

            data = {"ocrdata": ocrdata, "index": text_index}
            self.get_session().post(
                self.baseurl + "/_ocrdata",
                data=gzip.compress(json.dumps(data).encode("utf-8")),
                headers={
                    "Content-Type": "application/json;charset=UTF-8",
                    "Content-Encoding": "gzip",
                },
            )
            logger.info("Results uploaded")
        else:
            reader.store()
            logger.info("All prediction files written")
Example #14
def run(args):

    # check if loading a json file
    if len(args.files) == 1 and args.files[0].endswith("json"):
        import json
        with open(args.files[0], 'r') as f:
            json_args = json.load(f)
            for key, value in json_args.items():
                setattr(args, key, value)

    # checks
    if args.extended_prediction_data_format not in ["pred", "json"]:
        raise Exception(
            "Only 'pred' and 'json' are allowed extended prediction data formats"
        )

    # add json as extension, resolve wildcard, expand user, ... and remove .json again
    args.checkpoint = [(cp if cp.endswith(".json") else cp + ".json")
                       for cp in args.checkpoint]
    args.checkpoint = glob_all(args.checkpoint)
    args.checkpoint = [cp[:-5] for cp in args.checkpoint]

    # create voter
    voter_params = VoterParams()
    voter_params.type = VoterParams.Type.Value(args.voter.upper())
    voter = voter_from_proto(voter_params)

    # load files
    input_image_files = glob_all(args.files)
    if args.text_files:
        args.text_files = glob_all(args.text_files)

    # skip invalid files and remove them; there won't be predictions for invalid files
    dataset = create_dataset(
        args.dataset,
        DataSetMode.PREDICT,
        input_image_files,
        args.text_files,
        skip_invalid=True,
        remove_invalid=True,
        args={'text_index': args.pagexml_text_index},
    )

    print("Found {} files in the dataset".format(len(dataset)))
    if len(dataset) == 0:
        raise Exception(
            "Empty dataset provided. Check your files argument (got {})!".
            format(args.files))

    # predict for all models
    predictor = MultiPredictor(checkpoints=args.checkpoint,
                               batch_size=args.batch_size,
                               processes=args.processes)
    do_prediction = predictor.predict_dataset(
        dataset, progress_bar=not args.no_progress_bars)

    avg_sentence_confidence = 0
    n_predictions = 0

    # output the voted results to the appropriate files
    for result, sample in do_prediction:
        n_predictions += 1
        for i, p in enumerate(result):
            p.prediction.id = "fold_{}".format(i)

        # vote the results (if only one model is given, this will just return the sentences)
        prediction = voter.vote_prediction_result(result)
        prediction.id = "voted"
        sentence = prediction.sentence
        avg_sentence_confidence += prediction.avg_char_probability
        if args.verbose:
            lr = "\u202A\u202B"
            print("{}: '{}{}{}'".format(sample['id'],
                                        lr[get_base_level(sentence)], sentence,
                                        "\u202C"))

        output_dir = args.output_dir

        dataset.store_text(sentence,
                           sample,
                           output_dir=output_dir,
                           extension=".pred.txt")

        if args.extended_prediction_data:
            ps = Predictions()
            ps.line_path = sample[
                'image_path'] if 'image_path' in sample else sample['id']
            ps.predictions.extend([prediction] +
                                  [r.prediction for r in result])
            output_dir = output_dir if output_dir else os.path.dirname(
                ps.line_path)
            if not os.path.exists(output_dir):
                os.mkdir(output_dir)

            if args.extended_prediction_data_format == "pred":
                with open(os.path.join(output_dir, sample['id'] + ".pred"),
                          'wb') as f:
                    f.write(ps.SerializeToString())
            elif args.extended_prediction_data_format == "json":
                with open(os.path.join(output_dir, sample['id'] + ".json"),
                          'w') as f:
                    # remove logits
                    for prediction in ps.predictions:
                        prediction.logits.rows = 0
                        prediction.logits.cols = 0
                        prediction.logits.data[:] = []

                    f.write(
                        MessageToJson(ps, including_default_value_fields=True))
            else:
                raise Exception("Unknown prediction format.")

    print("Average sentence confidence: {:.2%}".format(
        avg_sentence_confidence / n_predictions))

    dataset.store()
    print("All files written")
Example #15
    def _run_train(self, train_net, test_net, codec, train_start_time,
                   progress_bar):
        checkpoint_params = self.checkpoint_params
        validation_dataset = test_net.input_dataset
        iters_per_epoch = max(
            1,
            int(train_net.input_dataset.epoch_size() /
                checkpoint_params.batch_size))

        loss_stats = RunningStatistics(checkpoint_params.stats_size,
                                       checkpoint_params.loss_stats)
        ler_stats = RunningStatistics(checkpoint_params.stats_size,
                                      checkpoint_params.ler_stats)
        dt_stats = RunningStatistics(checkpoint_params.stats_size,
                                     checkpoint_params.dt_stats)

        display = checkpoint_params.display
        display_epochs = display <= 1
        if display <= 0:
            display = 0  # to not display anything
        elif display_epochs:
            display = max(1,
                          int(display * iters_per_epoch))  # relative to epochs
        else:
            display = max(1, int(display))  # iterations

        checkpoint_frequency = checkpoint_params.checkpoint_frequency
        early_stopping_frequency = checkpoint_params.early_stopping_frequency
        if early_stopping_frequency < 0:
            # set early stopping frequency to half epoch
            early_stopping_frequency = int(0.5 * iters_per_epoch)
        elif 0 < early_stopping_frequency <= 1:
            early_stopping_frequency = int(
                early_stopping_frequency *
                iters_per_epoch)  # relative to epochs
        else:
            early_stopping_frequency = int(early_stopping_frequency)
        early_stopping_frequency = max(1, early_stopping_frequency)

        if checkpoint_frequency < 0:
            checkpoint_frequency = early_stopping_frequency
        elif 0 < checkpoint_frequency <= 1:
            checkpoint_frequency = int(checkpoint_frequency *
                                       iters_per_epoch)  # relative to epochs
        else:
            checkpoint_frequency = int(checkpoint_frequency)

        early_stopping_enabled = self.validation_dataset is not None \
                                 and checkpoint_params.early_stopping_frequency > 0 \
                                 and checkpoint_params.early_stopping_nbest > 1
        early_stopping_best_accuracy = checkpoint_params.early_stopping_best_accuracy
        early_stopping_best_cur_nbest = checkpoint_params.early_stopping_best_cur_nbest
        early_stopping_best_at_iter = checkpoint_params.early_stopping_best_at_iter

        early_stopping_predictor = Predictor(codec=codec,
                                             text_postproc=self.txt_postproc,
                                             network=test_net)

        # Start the actual training
        # ====================================================================================

        iter = checkpoint_params.iter

        # helper function to write a checkpoint
        def make_checkpoint(base_dir, prefix, version=None):
            if version:
                checkpoint_path = os.path.abspath(
                    os.path.join(base_dir, "{}{}.ckpt".format(prefix,
                                                              version)))
            else:
                checkpoint_path = os.path.abspath(
                    os.path.join(base_dir,
                                 "{}{:08d}.ckpt".format(prefix, iter + 1)))
            print("Storing checkpoint to '{}'".format(checkpoint_path))
            train_net.save_checkpoint(checkpoint_path)
            checkpoint_params.version = Checkpoint.VERSION
            checkpoint_params.iter = iter
            checkpoint_params.loss_stats[:] = loss_stats.values
            checkpoint_params.ler_stats[:] = ler_stats.values
            checkpoint_params.dt_stats[:] = dt_stats.values
            checkpoint_params.total_time = time.time() - train_start_time
            checkpoint_params.early_stopping_best_accuracy = early_stopping_best_accuracy
            checkpoint_params.early_stopping_best_cur_nbest = early_stopping_best_cur_nbest
            checkpoint_params.early_stopping_best_at_iter = early_stopping_best_at_iter

            with open(checkpoint_path + ".json", 'w') as f:
                f.write(json_format.MessageToJson(checkpoint_params))

            return checkpoint_path

        try:
            last_checkpoint = None
            n_infinite_losses = 0
            n_max_infinite_losses = 5

            # Training loop, can be interrupted by early stopping
            for iter in range(iter, checkpoint_params.max_iters):
                checkpoint_params.iter = iter

                iter_start_time = time.time()
                result = train_net.train_step()

                if not np.isfinite(result['loss']):
                    n_infinite_losses += 1

                    if n_max_infinite_losses == n_infinite_losses:
                        print(
                            "Error: Loss is not finite! Trying to restart from last checkpoint."
                        )
                        if not last_checkpoint:
                            raise Exception(
                                "No checkpoint written yet. Training must be stopped."
                            )
                        else:
                            # also reload non-trainable weights, such as solver-specific variables
                            train_net.load_weights(
                                last_checkpoint, restore_only_trainable=False)
                            continue
                    else:
                        continue

                n_infinite_losses = 0

                loss_stats.push(result['loss'])
                ler_stats.push(result['ler'])

                dt_stats.push(time.time() - iter_start_time)

                if display > 0 and iter % display == 0:
                    # apply postprocessing to display the true output
                    pred_sentence = self.txt_postproc.apply("".join(
                        codec.decode(result["decoded"][0])))
                    gt_sentence = self.txt_postproc.apply("".join(
                        codec.decode(result["gt"][0])))

                    if display_epochs:
                        print("#{:08f}: loss={:.8f} ler={:.8f} dt={:.8f}s".
                              format(iter / iters_per_epoch, loss_stats.mean(),
                                     ler_stats.mean(), dt_stats.mean()))
                    else:
                        print("#{:08d}: loss={:.8f} ler={:.8f} dt={:.8f}s".
                              format(iter, loss_stats.mean(), ler_stats.mean(),
                                     dt_stats.mean()))

                    # Insert Unicode LTR/RTL embedding marks for bidi display support
                    lr = "\u202A\u202B"
                    print(" PRED: '{}{}{}'".format(
                        lr[bidi.get_base_level(pred_sentence)], pred_sentence,
                        "\u202C"))
                    print(" TRUE: '{}{}{}'".format(
                        lr[bidi.get_base_level(gt_sentence)], gt_sentence,
                        "\u202C"))

                if checkpoint_frequency > 0 and (
                        iter + 1) % checkpoint_frequency == 0:
                    last_checkpoint = make_checkpoint(
                        checkpoint_params.output_dir,
                        checkpoint_params.output_model_prefix)

                if early_stopping_enabled and (
                        iter + 1) % early_stopping_frequency == 0:
                    print("Checking early stopping model")

                    out_gen = early_stopping_predictor.predict_input_dataset(
                        validation_dataset, progress_bar=progress_bar)
                    result = Evaluator.evaluate_single_list(
                        map(
                            Evaluator.evaluate_single_args,
                            map(
                                lambda d: tuple(
                                    self.txt_preproc.apply([
                                        ''.join(d.ground_truth), d.sentence
                                    ])), out_gen)))
                    accuracy = 1 - result["avg_ler"]

                    if accuracy > early_stopping_best_accuracy:
                        early_stopping_best_accuracy = accuracy
                        early_stopping_best_cur_nbest = 1
                        early_stopping_best_at_iter = iter + 1
                        # overwrite as best model
                        last_checkpoint = make_checkpoint(
                            checkpoint_params.
                            early_stopping_best_model_output_dir,
                            prefix="",
                            version=checkpoint_params.
                            early_stopping_best_model_prefix,
                        )
                        print(
                            "Found better model with accuracy of {:%}".format(
                                early_stopping_best_accuracy))
                    else:
                        early_stopping_best_cur_nbest += 1
                        print(
                            "No better model found. Currently accuracy of {:%} at iter {} (remaining nbest = {})"
                            .format(
                                early_stopping_best_accuracy,
                                early_stopping_best_at_iter,
                                checkpoint_params.early_stopping_nbest -
                                early_stopping_best_cur_nbest))

                    if accuracy > 0 and early_stopping_best_cur_nbest >= checkpoint_params.early_stopping_nbest:
                        print("Early stopping now.")
                        break

                    if accuracy >= 1:
                        print(
                            "Reached perfect score on validation set. Early stopping now."
                        )
                        break

        except KeyboardInterrupt as e:
            print("Storing interrupted checkpoint")
            make_checkpoint(checkpoint_params.output_dir,
                            checkpoint_params.output_model_prefix,
                            "interrupted")
            raise e

        print("Total time {}s for {} iterations.".format(
            time.time() - train_start_time, iter))
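
The frequency handling at the top of this example follows one convention for display, checkpoint_frequency, and early_stopping_frequency: a value in (0, 1] is a fraction of an epoch, a value above 1 is an absolute iteration count, and a negative value falls back to a caller-chosen default (half an epoch for early stopping, the early-stopping frequency for checkpoints). A standalone sketch of that rule:

def resolve_frequency(value, iters_per_epoch, default_iters):
    # Negative: fall back to the caller's default.
    if value < 0:
        return max(1, default_iters)
    # In (0, 1]: interpret the value as a fraction of an epoch.
    if 0 < value <= 1:
        return max(1, int(value * iters_per_epoch))
    # Otherwise: an absolute number of iterations.
    return max(1, int(value))

print(resolve_frequency(-1, 1000, 500))   # 500 (the default, e.g. half an epoch)
print(resolve_frequency(0.5, 1000, 500))  # 500 (half an epoch)
print(resolve_frequency(200, 1000, 500))  # 200 (absolute iteration count)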
Example #16
    def _run_train(self, train_net, test_net, codec, train_start_time, progress_bar):
        checkpoint_params = self.checkpoint_params
        validation_dataset = test_net.input_dataset
        iters_per_epoch = max(1, int(len(train_net.input_dataset) / checkpoint_params.batch_size))

        loss_stats = RunningStatistics(checkpoint_params.stats_size, checkpoint_params.loss_stats)
        ler_stats = RunningStatistics(checkpoint_params.stats_size, checkpoint_params.ler_stats)
        dt_stats = RunningStatistics(checkpoint_params.stats_size, checkpoint_params.dt_stats)

        display = checkpoint_params.display
        display_epochs = display <= 1
        if display <= 0:
            display = 0                                       # to not display anything
        elif display_epochs:
            display = max(1, int(display * iters_per_epoch))  # relative to epochs
        else:
            display = max(1, int(display))                    # iterations

        checkpoint_frequency = checkpoint_params.checkpoint_frequency
        early_stopping_frequency = checkpoint_params.early_stopping_frequency
        if early_stopping_frequency < 0:
            # set early stopping frequency to half epoch
            early_stopping_frequency = int(0.5 * iters_per_epoch)
        elif 0 < early_stopping_frequency <= 1:
            early_stopping_frequency = int(early_stopping_frequency * iters_per_epoch)  # relative to epochs
        else:
            early_stopping_frequency = int(early_stopping_frequency)

        if checkpoint_frequency < 0:
            checkpoint_frequency = early_stopping_frequency
        elif 0 < checkpoint_frequency <= 1:
            checkpoint_frequency = int(checkpoint_frequency * iters_per_epoch)  # relative to epochs
        else:
            checkpoint_frequency = int(checkpoint_frequency)

        early_stopping_enabled = self.validation_dataset is not None \
                                 and checkpoint_params.early_stopping_frequency > 0 \
                                 and checkpoint_params.early_stopping_nbest > 1
        early_stopping_best_accuracy = checkpoint_params.early_stopping_best_accuracy
        early_stopping_best_cur_nbest = checkpoint_params.early_stopping_best_cur_nbest
        early_stopping_best_at_iter = checkpoint_params.early_stopping_best_at_iter

        early_stopping_predictor = Predictor(codec=codec, text_postproc=self.txt_postproc,
                                             network=test_net)

        # Start the actual training
        # ====================================================================================

        iter = checkpoint_params.iter

        # helper function to write a checkpoint
        def make_checkpoint(base_dir, prefix, version=None):
            if version:
                checkpoint_path = os.path.abspath(os.path.join(base_dir, "{}{}.ckpt".format(prefix, version)))
            else:
                checkpoint_path = os.path.abspath(os.path.join(base_dir, "{}{:08d}.ckpt".format(prefix, iter + 1)))
            print("Storing checkpoint to '{}'".format(checkpoint_path))
            train_net.save_checkpoint(checkpoint_path)
            checkpoint_params.version = Checkpoint.VERSION
            checkpoint_params.iter = iter
            checkpoint_params.loss_stats[:] = loss_stats.values
            checkpoint_params.ler_stats[:] = ler_stats.values
            checkpoint_params.dt_stats[:] = dt_stats.values
            checkpoint_params.total_time = time.time() - train_start_time
            checkpoint_params.early_stopping_best_accuracy = early_stopping_best_accuracy
            checkpoint_params.early_stopping_best_cur_nbest = early_stopping_best_cur_nbest
            checkpoint_params.early_stopping_best_at_iter = early_stopping_best_at_iter

            with open(checkpoint_path + ".json", 'w') as f:
                f.write(json_format.MessageToJson(checkpoint_params))

            return checkpoint_path

        try:
            last_checkpoint = None
            n_infinite_losses = 0
            n_max_infinite_losses = 5

            # Training loop, can be interrupted by early stopping
            for iter in range(iter, checkpoint_params.max_iters):
                checkpoint_params.iter = iter

                iter_start_time = time.time()
                result = train_net.train_step()

                if not np.isfinite(result['loss']):
                    n_infinite_losses += 1

                    if n_max_infinite_losses == n_infinite_losses:
                        print("Error: Loss is not finite! Trying to restart from last checkpoint.")
                        if not last_checkpoint:
                            raise Exception("No checkpoint written yet. Training must be stopped.")
                        else:
                            # also reload non-trainable weights, such as solver-specific variables
                            train_net.load_weights(last_checkpoint, restore_only_trainable=False)
                            continue
                    else:
                        continue

                n_infinite_losses = 0

                loss_stats.push(result['loss'])
                ler_stats.push(result['ler'])

                dt_stats.push(time.time() - iter_start_time)

                if display > 0 and iter % display == 0:
                    # apply postprocessing to display the true output
                    pred_sentence = self.txt_postproc.apply("".join(codec.decode(result["decoded"][0])))
                    gt_sentence = self.txt_postproc.apply("".join(codec.decode(result["gt"][0])))

                    if display_epochs:
                        print("#{:08f}: loss={:.8f} ler={:.8f} dt={:.8f}s".format(
                            iter / iters_per_epoch, loss_stats.mean(), ler_stats.mean(), dt_stats.mean()))
                    else:
                        print("#{:08d}: loss={:.8f} ler={:.8f} dt={:.8f}s".format(
                            iter, loss_stats.mean(), ler_stats.mean(), dt_stats.mean()))

                    # Insert Unicode LTR/RTL embedding marks for bidi display support
                    lr = "\u202A\u202B"
                    print(" PRED: '{}{}{}'".format(lr[bidi.get_base_level(pred_sentence)], pred_sentence, "\u202C"))
                    print(" TRUE: '{}{}{}'".format(lr[bidi.get_base_level(gt_sentence)], gt_sentence, "\u202C"))

                if checkpoint_frequency > 0 and (iter + 1) % checkpoint_frequency == 0:
                    last_checkpoint = make_checkpoint(checkpoint_params.output_dir, checkpoint_params.output_model_prefix)

                if early_stopping_enabled and (iter + 1) % early_stopping_frequency == 0:
                    print("Checking early stopping model")

                    out_gen = early_stopping_predictor.predict_input_dataset(validation_dataset,
                                                                             progress_bar=progress_bar)
                    result = Evaluator.evaluate_single_list(map(
                        Evaluator.evaluate_single_args,
                        map(lambda d: tuple(self.txt_preproc.apply([''.join(d.ground_truth), d.sentence])), out_gen)))
                    accuracy = 1 - result["avg_ler"]

                    if accuracy > early_stopping_best_accuracy:
                        early_stopping_best_accuracy = accuracy
                        early_stopping_best_cur_nbest = 1
                        early_stopping_best_at_iter = iter + 1
                        # overwrite as best model
                        last_checkpoint = make_checkpoint(
                            checkpoint_params.early_stopping_best_model_output_dir,
                            prefix="",
                            version=checkpoint_params.early_stopping_best_model_prefix,
                        )
                        print("Found better model with accuracy of {:%}".format(early_stopping_best_accuracy))
                    else:
                        early_stopping_best_cur_nbest += 1
                        print("No better model found. Currently accuracy of {:%} at iter {} (remaining nbest = {})".
                              format(early_stopping_best_accuracy, early_stopping_best_at_iter,
                                     checkpoint_params.early_stopping_nbest - early_stopping_best_cur_nbest))

                    if accuracy > 0 and early_stopping_best_cur_nbest >= checkpoint_params.early_stopping_nbest:
                        print("Early stopping now.")
                        break

                    if accuracy >= 1:
                        print("Reached perfect score on validation set. Early stopping now.")
                        break

        except KeyboardInterrupt as e:
            print("Storing interrupted checkpoint")
            make_checkpoint(checkpoint_params.output_dir,
                            checkpoint_params.output_model_prefix,
                            "interrupted")
            raise e

        print("Total time {}s for {} iterations.".format(time.time() - train_start_time, iter))
Example #18
def run(args: PredictArgs):
    # check if loading a json file
    # TODO: support running from JSON
    # if len(args.files) == 1 and args.files[0].endswith("json"):
    #     import json
    #     with open(args.files[0], 'r') as f:
    #        json_args = json.load(f)
    #        for key, value in json_args.items():
    #            setattr(args, key, value)

    # checks
    if args.extended_prediction_data_format not in ["pred", "json"]:
        raise Exception(
            "Only 'pred' and 'json' are allowed extended prediction data formats"
        )

    # add json as extension, resolve wildcard, expand user, ... and remove .json again
    args.checkpoint = [(cp if cp.endswith(".json") else cp + ".json")
                       for cp in args.checkpoint]
    args.checkpoint = glob_all(args.checkpoint)
    args.checkpoint = [cp[:-5] for cp in args.checkpoint]

    # create ctc decoder
    prepare_ctc_decoder_params(args.ctc_decoder)

    # predict for all models
    from calamari_ocr.ocr.predict.predictor import MultiPredictor

    predictor = MultiPredictor.from_paths(
        checkpoints=args.checkpoint,
        voter_params=args.voter,
        predictor_params=args.predictor,
    )
    do_prediction = predictor.predict(args.data)
    pipeline: CalamariPipeline = predictor.data.get_or_create_pipeline(
        predictor.params.pipeline, args.data)
    reader = pipeline.reader()
    if len(reader) == 0:
        raise Exception(
            "Empty dataset provided. Check your command line arguments or if the provided files are empty."
        )

    avg_sentence_confidence = 0
    n_predictions = 0

    reader.prepare_store()

    # output the voted results to the appropriate files
    for s in do_prediction:
        _, (result, prediction), meta = s.inputs, s.outputs, s.meta
        sample = reader.sample_by_id(meta["id"])
        n_predictions += 1
        sentence = prediction.sentence

        avg_sentence_confidence += prediction.avg_char_probability
        if args.verbose:
            lr = "\u202A\u202B"
            logger.info("{}: '{}{}{}'".format(meta["id"],
                                              lr[get_base_level(sentence)],
                                              sentence, "\u202C"))

        output_dir = args.output_dir if args.output_dir else os.path.dirname(
            prediction.line_path)

        reader.store_text_prediction(prediction,
                                     meta["id"],
                                     output_dir=output_dir)

        if args.extended_prediction_data:
            ps = Predictions()
            ps.line_path = sample[
                "image_path"] if "image_path" in sample else sample["id"]
            ps.predictions.extend([prediction] +
                                  [r.prediction for r in result])
            output_dir = output_dir if output_dir else os.path.dirname(
                ps.line_path)
            if not os.path.exists(output_dir):
                os.mkdir(output_dir)

            if args.extended_prediction_data_format == "pred":
                data = zlib.compress(
                    ps.to_json(indent=2, ensure_ascii=False).encode("utf-8"))
            elif args.extended_prediction_data_format == "json":
                # remove logits
                for p in ps.predictions:
                    p.logits = None

                data = ps.to_json(indent=2)
            else:
                raise Exception("Unknown prediction format.")

            reader.store_extended_prediction(
                data,
                sample,
                output_dir=output_dir,
                extension=args.extended_prediction_data_format,
            )

    logger.info("Average sentence confidence: {:.2%}".format(
        avg_sentence_confidence / n_predictions))

    reader.store()
    logger.info("All prediction files written")
Example #19
def run(args):
    # checks
    if args.extended_prediction_data_format not in ["pred", "json"]:
        raise Exception(
            "Only 'pred' and 'json' are allowed extended prediction data formats"
        )

    # add json as extension, resolve wildcard, expand user, ... and remove .json again
    args.checkpoint = [(cp if cp.endswith(".json") else cp + ".json")
                       for cp in args.checkpoint]
    args.checkpoint = glob_all(args.checkpoint)
    args.checkpoint = [cp[:-5] for cp in args.checkpoint]

    # create voter
    voter_params = VoterParams()
    voter_params.type = VoterParams.Type.Value(args.voter.upper())
    voter = voter_from_proto(voter_params)

    # load files
    files = glob.glob(args.files)
    dataset = AbbyyDataSet(files,
                           skip_invalid=True,
                           remove_invalid=False,
                           binary=args.binary)

    dataset.load_samples(processes=args.processes,
                         progress_bar=not args.no_progress_bars)

    print("Found {} files in the dataset".format(len(dataset)))
    if len(dataset) == 0:
        raise Exception(
            "Empty dataset provided. Check your files argument (got {})!".
            format(args.files))

    # predict for all models
    predictor = MultiPredictor(checkpoints=args.checkpoint,
                               batch_size=args.batch_size,
                               processes=args.processes)
    do_prediction = predictor.predict_dataset(
        dataset, progress_bar=not args.no_progress_bars)

    # output the voted results to the appropriate files
    input_image_files = []

    # build the input_image_files list for the next loop:
    # one image path entry per format (text line) so it aligns with the per-line predictions
    for page in dataset.book.pages:
        for _ in page.getFormats():
            input_image_files.append(page.imgFile)

    for (result, sample), filepath in zip(do_prediction, input_image_files):
        for i, p in enumerate(result):
            p.prediction.id = "fold_{}".format(i)

        # vote the results (if only one model is given, this will just return the sentences)
        prediction = voter.vote_prediction_result(result)
        prediction.id = "voted"
        sentence = prediction.sentence
        if args.verbose:
            lr = "\u202A\u202B"
            print("{}: '{}{}{}'".format(sample['id'],
                                        lr[get_base_level(sentence)], sentence,
                                        "\u202C"))

        output_dir = args.output_dir if args.output_dir else os.path.dirname(
            filepath)

        sample["format"].text = sentence

        if args.extended_prediction_data:
            ps = Predictions()
            ps.line_path = filepath
            ps.predictions.extend([prediction] +
                                  [r.prediction for r in result])
            if args.extended_prediction_data_format == "pred":
                with open(os.path.join(output_dir, sample['id'] + ".pred"),
                          'wb') as f:
                    f.write(ps.SerializeToString())
            elif args.extended_prediction_data_format == "json":
                with open(os.path.join(output_dir, sample['id'] + ".json"),
                          'w') as f:
                    # remove logits
                    for prediction in ps.predictions:
                        prediction.logits.rows = 0
                        prediction.logits.cols = 0
                        prediction.logits.data[:] = []

                    f.write(
                        MessageToJson(ps, including_default_value_fields=True))
            else:
                raise Exception("Unknown prediction format.")

    w = XMLWriter(output_dir, os.path.dirname(filepath), dataset.book)
    w.write()

    print("All files written")
Example #20
def run(args):
    # check if loading a json file
    if len(args.files) == 1 and args.files[0].endswith("json"):
        import json
        with open(args.files[0], 'r') as f:
            json_args = json.load(f)
            for key, value in json_args.items():
                setattr(args, key, value)

    # checks
    if args.extended_prediction_data_format not in ["pred", "json"]:
        raise Exception(
            "Only 'pred' and 'json' are allowed extended prediction data formats"
        )

    # add json as extension, resolve wildcard, expand user, ... and remove .json again
    args.checkpoint = [(cp if cp.endswith(".json") else cp + ".json")
                       for cp in args.checkpoint]
    args.checkpoint = glob_all(args.checkpoint)
    args.checkpoint = [cp[:-5] for cp in args.checkpoint]
    args.extension = args.extension if args.extension else DataSetType.pred_extension(
        args.dataset)

    # create ctc decoder
    ctc_decoder_params = create_ctc_decoder_params(args)

    # create voter
    voter_params = VoterParams()
    voter_params.type = VoterType(args.voter)

    # load files
    input_image_files = glob_all(args.files)
    if args.text_files:
        args.text_files = glob_all(args.text_files)

    # skip invalid files and remove them; there won't be predictions for invalid files
    predict_params = PipelineParams(
        type=args.dataset,
        skip_invalid=True,
        remove_invalid=True,
        files=input_image_files,
        text_files=args.text_files,
        data_reader_args=FileDataReaderArgs(
            pad=args.dataset_pad,
            text_index=args.pagexml_text_index,
        ),
        batch_size=args.batch_size,
        num_processes=args.processes,
    )

    # predict for all models
    # TODO: Use CTC Decoder params
    from calamari_ocr.ocr.predict.predictor import MultiPredictor
    predictor = MultiPredictor.from_paths(
        checkpoints=args.checkpoint,
        voter_params=voter_params,
        predictor_params=PredictorParams(
            silent=True, progress_bar=not args.no_progress_bars))
    do_prediction = predictor.predict(predict_params)
    pipeline: CalamariPipeline = predictor.data.get_predict_data(
        predict_params)
    reader = pipeline.reader()
    if len(reader) == 0:
        raise Exception(
            "Empty dataset provided. Check your files argument (got {})!".
            format(args.files))

    avg_sentence_confidence = 0
    n_predictions = 0

    reader.prepare_store()

    # output the voted results to the appropriate files
    for s in do_prediction:
        inputs, (result, prediction), meta = s.inputs, s.outputs, s.meta
        sample = reader.sample_by_id(meta['id'])
        n_predictions += 1
        sentence = prediction.sentence

        avg_sentence_confidence += prediction.avg_char_probability
        if args.verbose:
            lr = "\u202A\u202B"
            logger.info("{}: '{}{}{}'".format(meta['id'],
                                              lr[get_base_level(sentence)],
                                              sentence, "\u202C"))

        output_dir = args.output_dir

        reader.store_text(sentence,
                          sample,
                          output_dir=output_dir,
                          extension=args.extension)

        if args.extended_prediction_data:
            ps = Predictions()
            ps.line_path = sample[
                'image_path'] if 'image_path' in sample else sample['id']
            ps.predictions.extend([prediction] +
                                  [r.prediction for r in result])
            output_dir = output_dir if output_dir else os.path.dirname(
                ps.line_path)
            if not os.path.exists(output_dir):
                os.mkdir(output_dir)

            if args.extended_prediction_data_format == "pred":
                data = zlib.compress(
                    ps.to_json(indent=2, ensure_ascii=False).encode('utf-8'))
            elif args.extended_prediction_data_format == "json":
                # remove logits
                for p in ps.predictions:
                    p.logits = None

                data = ps.to_json(indent=2)
            else:
                raise Exception("Unknown prediction format.")

            reader.store_extended_prediction(
                data,
                sample,
                output_dir=output_dir,
                extension=args.extended_prediction_data_format)

    logger.info("Average sentence confidence: {:.2%}".format(
        avg_sentence_confidence / n_predictions))

    reader.store(args.extension)
    logger.info("All prediction files written")