Exemple #1
0
    def add(self, image, split=lambda x: os.path.splitext(x)[0],
            suffix='.gt.txt', normalization=None, reorder=True, pad=16):
        """
        Add one line image together with its transcription to the training set.

        The transcription is read as UTF-8 from ``split(image) + suffix``. If
        ``normalization`` is truthy it is passed to ``unicodedata.normalize``
        as the normalization form; if ``reorder`` is truthy the text is run
        through ``bd.get_display``. The image is dewarped with ``self.lnorm``,
        converted to an array, prepared with ``pad`` and finally appended to
        ``self.training_set`` as an ``(image, text)`` tuple.
        """
        transcription_path = split(image) + suffix
        with click.open_file(transcription_path, 'r', encoding='utf-8') as gt_file:
            text = gt_file.read()
            if normalization:
                text = unicodedata.normalize(normalization, text)
            if reorder:
                text = bd.get_display(text)

            # image pipeline: open -> dewarp -> array -> padded network input
            dewarped = rpred.dewarp(self.lnorm, Image.open(image))
            prepared = lstm.prepare_line(pil2array(dewarped), pad)
            self.training_set.append((prepared, text))
Exemple #2
0
    def add(self,
            image,
            split=lambda x: os.path.splitext(x)[0],
            suffix='.gt.txt',
            normalization=None,
            reorder=True,
            pad=16):
        """
        Register a single image/ground-truth pair in the training set.

        The ground truth is loaded (UTF-8) from the path obtained by applying
        ``split`` to ``image`` and appending ``suffix``. It is optionally
        Unicode-normalized (``normalization`` names the form) and optionally
        reordered for display via ``bd.get_display`` (``reorder``). The image
        itself is dewarped using ``self.lnorm``, turned into an array and
        prepared with ``pad`` before the pair is stored in
        ``self.training_set``.
        """
        gt_path = split(image) + suffix
        with click.open_file(gt_path, 'r', encoding='utf-8') as handle:
            truth = handle.read()
            if normalization:
                truth = unicodedata.normalize(normalization, truth)
            if reorder:
                truth = bd.get_display(truth)

            picture = Image.open(image)
            picture = rpred.dewarp(self.lnorm, picture)
            sample = lstm.prepare_line(pil2array(picture), pad)
            self.training_set.append((sample, truth))
Exemple #3
0
def rpred(network,
          im,
          bounds,
          pad=16,
          line_normalization=True,
          bidi_reordering=True):
    """
    Uses a RNN to recognize text

    Args:
        network (kraken.lib.lstm.SegRecognizer): A SegRecognizer object
        im (PIL.Image): Image to extract text from
        bounds (dict): A dictionary containing a 'boxes' entry with a list of
                       coordinates (x0, y0, x1, y1) of a text line in the image
                       and an entry 'text_direction' containing
                       'horizontal-lr/rl/vertical-lr/rl'.
        pad (int): Extra blank padding to the left and right of text line
        line_normalization (bool): Dewarp line using the line estimator
                                   contained in the network. If no normalizer
                                   is available one using the default
                                   parameters is created. Be aware that you may
                                   have to scale lines manually to the target
                                   line height if disabled.
        bidi_reordering (bool): Reorder classes in the ocr_record according to
                                the Unicode bidirectional algorithm for correct
                                display.
    Yields:
        An ocr_record containing the recognized text, absolute character
        positions, and confidence values for each character. An empty record
        is yielded for lines that have a zero dimension, are uniformly
        colored, or can not be dewarped.
    """

    lnorm = getattr(network, 'lnorm', CenterNormalizer())

    for box, coords in extract_boxes(im, bounds):
        # a box with zero width or zero height contains no pixels to recognize
        if coords[2] - coords[0] == 0 or coords[3] - coords[1] == 0:
            yield ocr_record('', [], [])
            continue
        raw_line = pil2array(box)
        # a line whose pixels all share one value carries no text
        if np.amax(raw_line) == np.amin(raw_line):
            yield ocr_record('', [], [])
            continue
        if line_normalization:
            # fail gracefully and return no recognition result in case the
            # input line can not be normalized. Only ordinary errors are
            # swallowed so KeyboardInterrupt/SystemExit still propagate.
            try:
                box = dewarp(lnorm, box)
            except Exception:
                yield ocr_record('', [], [])
                continue
        line = pil2array(box)
        line = lstm.prepare_line(line, pad)
        pred = network.predictString(line)

        # calculate recognized LSTM locations of characters; scale maps
        # network output steps (minus padding) back to raw line pixels
        scale = len(raw_line.T) / (len(network.outputs) - 2 * pad)
        result = lstm.translate_back_locations(network.outputs)
        pos = []
        conf = []

        for _, start, end, c in result:
            if bounds['text_direction'].startswith('horizontal'):
                # characters advance along the x axis of the line box
                pos.append((coords[0] + int(
                    (start - pad) * scale), coords[1], coords[0] + int(
                        (end - pad / 2) * scale), coords[3]))
            else:
                # characters advance along the y axis of the line box
                pos.append((coords[0], coords[1] + int(
                    (start - pad) * scale), coords[2], coords[1] + int(
                        (end - pad / 2) * scale)))
            conf.append(c)
        if bidi_reordering:
            yield bidi_record(ocr_record(pred, pos, conf))
        else:
            yield ocr_record(pred, pos, conf)
Exemple #4
0
def mm_rpred(nets,
             im,
             bounds,
             pad=16,
             line_normalization=True,
             bidi_reordering=True):
    """
    Multi-model version of kraken.rpred.rpred.

    Takes a dictionary of ISO15924 script identifiers->models and a
    script-annotated segmentation to dynamically select appropriate models for
    these lines.

    Args:
        nets (dict): A dict mapping ISO15924 identifiers to SegRecognizer
                     objects. Recommended to be a defaultdict.
        im (PIL.Image): Image to extract text from
        bounds (dict): A dictionary containing a 'boxes' entry with a list of
                       lists of coordinates (script, (x0, y0, x1, y1)) of a
                       text line in the image and an entry 'text_direction'
                       containing 'horizontal-lr/rl/vertical-lr/rl'.
        pad (int): Extra blank padding to the left and right of text line
        line_normalization (bool): Dewarp line using the line estimator
                                   contained in the network. If no normalizer
                                   is available one using the default
                                   parameters is created. Be aware that you may
                                   have to scale lines manually to the target
                                   line height if disabled.
        bidi_reordering (bool): Reorder classes in the ocr_record according to
                                the Unicode bidirectional algorithm for correct
                                display.
    Yields:
        An ocr_record containing the recognized text, absolute character
        positions, and confidence values for each character. One record is
        yielded per entry of bounds['boxes']; script runs that have a zero
        dimension, are uniformly colored, or can not be dewarped contribute
        nothing to their record.
    """
    for line in bounds['boxes']:
        rec = ocr_record('', [], [])
        # split the annotated line into parallel lists of scripts and boxes
        scripts = [run[0] for run in line]
        run_bounds = {
            'text_direction': bounds['text_direction'],
            'boxes': [run[1] for run in line]
        }
        for script, (box, coords) in zip(scripts,
                                         extract_boxes(im, run_bounds)):
            # a box with zero width or zero height contains no pixels
            if coords[2] - coords[0] == 0 or coords[3] - coords[1] == 0:
                continue
            raw_line = pil2array(box)
            # a run whose pixels all share one value carries no text
            if np.amax(raw_line) == np.amin(raw_line):
                continue
            if line_normalization:
                # fail gracefully and return no recognition result in case the
                # input line can not be normalized. The model lookup stays
                # inside the try block so a failing lookup is skipped too.
                try:
                    lnorm = getattr(nets[script], 'lnorm', CenterNormalizer())
                    box = dewarp(lnorm, box)
                except Exception:
                    continue
            net = nets[script]
            # named line_arr so the outer loop variable `line` is not shadowed
            line_arr = pil2array(box)
            line_arr = lstm.prepare_line(line_arr, pad)
            pred = net.predictString(line_arr)
            # calculate recognized LSTM locations of characters; scale maps
            # network output steps (minus padding) back to raw line pixels
            scale = len(raw_line.T) / (len(net.outputs) - 2 * pad)
            result = lstm.translate_back_locations(net.outputs)
            pos = []
            conf = []

            for _, start, end, c in result:
                if bounds['text_direction'].startswith('horizontal'):
                    # characters advance along the x axis of the run box
                    pos.append((coords[0] + int(
                        (start - pad) * scale), coords[1], coords[0] + int(
                            (end - pad / 2) * scale), coords[3]))
                else:
                    # characters advance along the y axis of the run box
                    pos.append((coords[0], coords[1] + int(
                        (start - pad) * scale), coords[2], coords[1] + int(
                            (end - pad / 2) * scale)))
                conf.append(c)
            # accumulate this script run into the record of the whole line
            rec.prediction += pred
            rec.cuts.extend(pos)
            rec.confidences.extend(conf)
        if bidi_reordering:
            yield bidi_record(rec)
        else:
            yield rec
Exemple #5
0
def rpred(network, im, bounds, pad=16, line_normalization=True):
    """
    Uses a RNN to recognize text

    Args:
        network (kraken.lib.lstm.SegRecognizer): A SegRecognizer object
        im (PIL.Image): Image to extract text from
        bounds (iterable): An iterable returning a tuple defining the absolute
                           coordinates (x0, y0, x1, y1) of a text line in the
                           Image.
        pad (int): Extra blank padding to the left and right of text line
        line_normalization (bool): Dewarp line using the line estimator
                                   contained in the network. If no normalizer
                                   is available one using the default
                                   parameters is created. Be aware that you may
                                   have to scale lines manually to the target
                                   line height if disabled.
    Yields:
        An ocr_record built from the recognized text, absolute character
        positions in the image, and confidence values for each character. An
        empty record is yielded for lines that have a zero dimension, are
        uniformly colored, or can not be dewarped.
    """

    lnorm = getattr(network, "lnorm", CenterNormalizer())

    for box, coords in extract_boxes(im, bounds):
        # a box with zero width or zero height contains no pixels to recognize
        if coords[2] - coords[0] == 0 or coords[3] - coords[1] == 0:
            yield ocr_record("", [], [])
            continue
        raw_line = pil2array(box)
        # a line whose pixels all share one value carries no text
        if np.amax(raw_line) == np.amin(raw_line):
            yield ocr_record("", [], [])
            continue
        if line_normalization:
            # fail gracefully and return no recognition result in case the
            # input line can not be normalized. Only ordinary errors are
            # swallowed so KeyboardInterrupt/SystemExit still propagate.
            try:
                box = dewarp(lnorm, box)
            except Exception:
                yield ocr_record("", [], [])
                continue
        line = pil2array(box)
        line = lstm.prepare_line(line, pad)
        pred = network.predictString(line)

        # calculate recognized LSTM locations of characters
        scale = len(raw_line.T) / (len(network.outputs) - 2 * pad)
        result = lstm.translate_back(network.outputs, pos=1)
        # entries with class 0 are excluded from the confidences and only
        # move the left edge in the loop below
        conf = [network.outputs[r, c] for r, c in result if c != 0]
        cuts = [(int((r - pad) * scale), c) for (r, c) in result]
        # append last offset to end of line
        cuts.append((coords[2] - coords[0], 0))
        pos = []
        lx = 0
        for offset, label in cuts:
            if label == 0:
                # class 0 only advances the left edge of the next character
                lx = offset
                continue
            # best effort: stop collecting positions if one can not be built
            try:
                pos.append((coords[0] + lx, coords[1], coords[0] + offset, coords[3]))
            except Exception:
                break
            lx = offset
        yield ocr_record(pred, pos, conf)
Exemple #6
0
def rpred(network,
          im,
          bounds,
          pad=16,
          line_normalization=True,
          bidi_reordering=True):
    """
    Uses a RNN to recognize text

    Args:
        network (kraken.lib.lstm.SegRecognizer): A SegRecognizer object
        im (PIL.Image): Image to extract text from
        bounds (dict): A dictionary containing a 'boxes' entry with a list of
                       coordinates (x0, y0, x1, y1) of a text line in the image
                       and an entry 'text_direction' containing
                       'horizontal-lr/rl/vertical-lr/rl'.
        pad (int): Extra blank padding to the left and right of text line
        line_normalization (bool): Dewarp line using the line estimator
                                   contained in the network. If no normalizer
                                   is available one using the default
                                   parameters is created. Be aware that you may
                                   have to scale lines manually to the target
                                   line height if disabled.
        bidi_reordering (bool): Reorder classes in the ocr_record according to
                                the Unicode bidirectional algorithm for correct
                                display.
    Yields:
        An ocr_record containing the recognized text, absolute character
        positions, and confidence values for each character. An empty record
        is yielded (and a warning logged) for lines that have a zero
        dimension, are uniformly colored, or can not be dewarped.
    """
    im_str = get_im_str(im)
    logger.info(u'Running recognizer on {} with {} lines'.format(
        im_str, len(bounds['boxes'])))
    logger.debug(u'Loading line normalizer')
    lnorm = getattr(network, 'lnorm', CenterNormalizer())
    if not is_bitonal(im):
        logger.info(u'Image is grayscale. Adjusting normalizer parameters')
        # NOTE: this sets the attribute on the network's own normalizer
        # object when the network provides one
        lnorm.range = 2

    for box, coords in extract_boxes(im, bounds):
        # a box with zero width or zero height contains no pixels to recognize
        if coords[2] - coords[0] == 0 or coords[3] - coords[1] == 0:
            logger.warning(
                u'bbox {} with zero dimension. Emitting empty record.'.format(
                    coords))
            yield ocr_record('', [], [])
            continue
        raw_line = pil2array(box)
        # a line whose pixels all share one value carries no text
        if np.amax(raw_line) == np.amin(raw_line):
            logger.warning(
                u'Empty line {}. Emitting empty record.'.format(coords))
            yield ocr_record('', [], [])
            continue
        if line_normalization:
            # fail gracefully and return no recognition result in case the
            # input line can not be normalized. Only ordinary errors are
            # swallowed so KeyboardInterrupt/SystemExit still propagate.
            try:
                box = dewarp(lnorm, box)
            except Exception:
                logger.warning(
                    u'Dewarping for bbox {} failed. Emitting empty record.'.
                    format(coords))
                yield ocr_record('', [], [])
                continue
        line = pil2array(box)
        logger.debug(u'Preparing line.')
        line = lstm.prepare_line(line, pad)
        logger.debug(u'Performing forward pass.')
        pred = network.predictString(line)
        logger.info(u'Prediction: {}'.format(pred))

        # calculate recognized LSTM locations of characters; scale maps
        # network output steps (minus padding) back to raw line pixels
        scale = len(raw_line.T) / (len(network.outputs) - 2 * pad)
        logger.debug(u'Extracting labels.')
        result = lstm.translate_back_locations(network.outputs)
        pos = []
        conf = []

        for _, start, end, c in result:
            if bounds['text_direction'].startswith('horizontal'):
                # clamp the cut to the box: start offset >= 0, end offset at
                # most the box width and at least 1
                xmin = coords[0] + int(max((start - pad) * scale, 0))
                xmax = coords[0] + max(
                    int(min((end - pad) * scale, coords[2] - coords[0])), 1)
                pos.append((xmin, coords[1], xmax, coords[3]))
            else:
                # same clamping along the y axis for vertical text
                ymin = coords[1] + int(max((start - pad) * scale, 0))
                ymax = coords[1] + max(
                    int(min((end - pad) * scale, coords[3] - coords[1])), 1)
                pos.append((coords[0], ymin, coords[2], ymax))
            conf.append(c)
        if bidi_reordering:
            logger.debug(u'BiDi reordering record.')
            yield bidi_record(ocr_record(pred, pos, conf))
        else:
            yield ocr_record(pred, pos, conf)
Exemple #7
0
def rpred(network, im, bounds, pad=16, line_normalization=True, bidi_reordering=True):
    """
    Uses a RNN to recognize text

    Args:
        network (kraken.lib.lstm.SegRecognizer): A SegRecognizer object
        im (PIL.Image): Image to extract text from
        bounds (iterable): An iterable returning a tuple defining the absolute
                           coordinates (x0, y0, x1, y1) of a text line in the
                           Image.
        pad (int): Extra blank padding to the left and right of text line
        line_normalization (bool): Dewarp line using the line estimator
                                   contained in the network. If no normalizer
                                   is available one using the default
                                   parameters is created. Be aware that you may
                                   have to scale lines manually to the target
                                   line height if disabled.
        bidi_reordering (bool): Reorder classes in the ocr_record according to
                                the Unicode bidirectional algorithm for correct
                                display.
    Yields:
        An ocr_record containing the recognized text, absolute character
        positions, and confidence values for each character. An empty record
        is yielded for lines that have a zero dimension, are uniformly
        colored, or can not be dewarped.
    """

    lnorm = getattr(network, 'lnorm', CenterNormalizer())

    for box, coords in extract_boxes(im, bounds):
        # a box with zero width or zero height contains no pixels to recognize
        if coords[2] - coords[0] == 0 or coords[3] - coords[1] == 0:
            yield ocr_record('', [], [])
            continue
        raw_line = pil2array(box)
        # a line whose pixels all share one value carries no text
        if np.amax(raw_line) == np.amin(raw_line):
            yield ocr_record('', [], [])
            continue
        if line_normalization:
            # fail gracefully and return no recognition result in case the
            # input line can not be normalized. Only ordinary errors are
            # swallowed so KeyboardInterrupt/SystemExit still propagate.
            try:
                box = dewarp(lnorm, box)
            except Exception:
                yield ocr_record('', [], [])
                continue
        line = pil2array(box)
        line = lstm.prepare_line(line, pad)
        pred = network.predictString(line)

        # calculate recognized LSTM locations of characters; scale maps
        # network output steps (minus padding) back to raw line pixels
        scale = len(raw_line.T) / (len(network.outputs) - 2 * pad)
        result = lstm.translate_back_locations(network.outputs)
        pos = []
        conf = []

        for _, start, end, c in result:
            x_start = coords[0] + int((start - pad) * scale)
            x_end = coords[0] + int((end - pad / 2) * scale)
            pos.append((x_start, coords[1], x_end, coords[3]))
            conf.append(c)
        if bidi_reordering:
            yield bidi_record(ocr_record(pred, pos, conf))
        else:
            yield ocr_record(pred, pos, conf)