# Example 1
    def _recognize_baseline_line(self, line):
        """Recognize the text content of a single baseline-style line.

        Args:
            line: Segmentation dict; ``line['lines'][0]`` holds the line
                descriptor (tags, baseline, boundary).

        Returns:
            An ``ocr_record`` — empty on extraction failure, ignored tags,
            degenerate crops, or blank images — BiDi-reordered when
            ``self.bidi_reordering`` is set.
        """
        if self.tags_ignore is not None:
            # Skip recognition entirely if the segment carries any tag on
            # the ignore list (first match in tag-dict order is reported).
            tag = next((t for t in line['lines'][0]['tags'].values()
                        if t in self.tags_ignore), None)
            if tag is not None:
                logger.info(
                    f'Ignoring line segment with tags {line["lines"][0]["tags"]} based on {tag}.'
                )
                return ocr_record('', [], [], line['lines'][0])

        try:
            box, coords = next(extract_polygons(self.im, line))
        except KrakenInputException as e:
            logger.warning(f'Extracting line failed: {e}')
            return ocr_record('', [], [], line['lines'][0])

        # Retained as an attribute: consumers (e.g. forced alignment) read
        # predictor.box after each line is processed.
        self.box = box

        tag, net = self._resolve_tags_to_model(coords['tags'], self.nets)
        # A crop with a zero-length dimension cannot be recognized.
        if 0 in box.size:
            logger.warning(
                f'bbox {coords} with zero dimension. Emitting empty record.')
            return ocr_record('', [], [], coords)
        try:
            # Convert the cropped image into a network input tensor.
            tensor = self.ts[tag](box)
        except Exception:
            return ocr_record('', [], [], coords)
        if tensor.max() == tensor.min():
            # Uniform (blank) image — nothing to recognize.
            return ocr_record('', [], [], coords)

        preds = net.predict(tensor.unsqueeze(0))[0]
        # Scale factors mapping network-output positions back to the source
        # image: output -> network input, then network input -> line crop.
        self.net_scale = tensor.shape[2] / net.outputs.shape[2]
        self.in_scale = box.size[0] / (tensor.shape[2] - 2 * self.pad)

        # XXX: fix bounding box calculation ocr_record for multi-codepoint labels.
        pred = ''.join(p[0] for p in preds)
        pos = []
        conf = []
        for _, start, end, c in preds:
            section = compute_polygon_section(
                coords['baseline'], coords['boundary'],
                self._scale_val(start, 0, self.box.size[0]),
                self._scale_val(end, 0, self.box.size[0]))
            pos.append(section)
            conf.append(c)
        if not self.bidi_reordering:
            logger.debug('Emitting raw record')
            return ocr_record(pred, pos, conf, coords)
        logger.debug('BiDi reordering record.')
        base_dir = (self.bidi_reordering
                    if self.bidi_reordering in ('L', 'R') else None)
        return bidi_record(ocr_record(pred, pos, conf, coords),
                           base_dir=base_dir)
# Example 2
    def _recognize_baseline_line(self, line):
        """Recognize a single baseline line with the script-specific model.

        Args:
            line: Segmentation dict; ``line['lines'][0]`` holds the line
                descriptor.

        Returns:
            An ``ocr_record``, empty when extraction or tensor conversion
            fails, BiDi-reordered when ``self.bidi_reordering`` is truthy.
        """
        try:
            box, coords = next(extract_polygons(self.im, line))
        except KrakenInputException as e:
            logger.warning(f'Extracting line failed: {e}')
            return ocr_record('', [], [], line['lines'][0])

        script = coords['script']
        # A crop with a zero-length dimension cannot be recognized.
        if 0 in box.size:
            logger.warning(
                'bbox {} with zero dimension. Emitting empty record.'.format(coords))
            return ocr_record('', [], [], coords)
        try:
            # Convert the cropped image into a network input tensor.
            net_input = self.ts[script](box)
        except Exception:
            return ocr_record('', [], [], coords)
        if net_input.max() == net_input.min():
            # Uniform (blank) image — nothing to recognize.
            return ocr_record('', [], [], coords)

        net = self.nets[script]
        preds = net.predict(net_input.unsqueeze(0))[0]
        # Scale factors mapping network-output positions back to the source
        # image: output -> network input, then network input -> line crop.
        net_scale = net_input.shape[2] / net.outputs.shape[2]
        in_scale = box.size[0] / (net_input.shape[2] - 2 * self.pad)

        def _scale_val(val, min_val, max_val):
            # Map a network-output coordinate to source pixels, clamped to
            # the half-open range [min_val, max_val - 1].
            scaled = ((val * net_scale) - self.pad) * in_scale
            return int(round(min(max(scaled, min_val), max_val - 1)))

        # XXX: fix bounding box calculation ocr_record for multi-codepoint labels.
        pred = ''.join(x[0] for x in preds)
        pos = []
        conf = []
        for _, start, end, c in preds:
            pos.append(compute_polygon_section(coords['baseline'],
                                               coords['boundary'],
                                               _scale_val(start, 0, box.size[0]),
                                               _scale_val(end, 0, box.size[0])))
            conf.append(c)
        record = ocr_record(pred, pos, conf, coords)
        if self.bidi_reordering:
            logger.debug('BiDi reordering record.')
            return bidi_record(record)
        logger.debug('Emitting raw record')
        return record
# Example 3
def forced_align(doc, model):
    """Force-align ground-truth text with recognition model activations.

    Argument:
        doc (dict): Parsed document.
        model (kraken.lib.model.TorchSeqRecognizer): Recognition model to
            use for alignment.

    Returns:
        A list of kraken.rpred.ocr_record.
    """
    im = Image.open(doc['image'])
    predictor = rpred.rpred(model, im, doc)
    records = []
    for line in doc['lines']:
        # Build an FST from the ground truth (in display order) and compose
        # it with the lattice derived from the network activations.
        gt_fst = fst_from_text(get_display(line['text']), model.codec)
        # Advance the predictor so model.outputs / predictor.box refer to
        # the current line.
        next(predictor)
        lat_fst = fst_from_lattice(model.outputs)
        short_graph = _shortest_path(_compose(lat_fst, gt_fst))
        pred = []
        pos = []
        conf = []
        last_frame = model.outputs.shape[2] - 1
        for act, label in _generate_line_record(short_graph, last_frame):
            start = predictor._scale_val(act[0], 0, predictor.box.size[0])
            end = predictor._scale_val(act[1], 0, predictor.box.size[0])
            pos.append(compute_polygon_section(line['baseline'],
                                               line['boundary'], start, end))
            # Forced alignment yields no per-character confidence; use 1.0.
            conf.append(1.0)
            pred.append(model.codec.decode([(label, 0, 0, 0)])[0][0])
        records.append(
            rpred.bidi_record(rpred.ocr_record(pred, pos, conf, line)))
    return records