Beispiel #1
0
    def add_page(self, im, segmentation=None, records=None):
        """
        Adds an image to the transcription interface, optionally filling in
        information from a list of ocr_record objects.

        When ``records`` is non-empty it takes precedence and ``segmentation``
        is ignored: every record becomes one line whose text is split on
        whitespace into individually indexed segments. Otherwise one line
        without recognition results is created per entry of ``segmentation``.

        Args:
            im (PIL.Image): Input image. Only ``im.save`` and ``im.size`` are
                            used.
            segmentation (list): A list of line bounding boxes, each indexable
                                 as (x0, y0, x1, y1).
            records (list): A list of ocr_record objects. Each one has to
                            provide ``prediction`` and ``cuts``.

        Raises:
            KrakenInputException if neither records nor a segmentation are
            given (or both are empty).
        """
        # Reject unusable input before touching any state, so a failed call
        # neither consumes a page index nor pays for encoding the image.
        if not records and not segmentation:
            raise KrakenInputException('Neither segmentations nor records given')

        def _placement(bbox):
            # Geometry shared by both kinds of line entries: the position and
            # extent as percentages of the image size plus the box itself as a
            # comma separated string.
            return {'left': 100*int(bbox[0]) / im.size[0],
                    'top': 100*int(bbox[1]) / im.size[1],
                    'width': 100*(bbox[2] - bbox[0])/im.size[0],
                    'height': 100*(int(bbox[3]) - int(bbox[1]))/im.size[1],
                    'bbox': '{}, {}, {}, {}'.format(int(bbox[0]),
                                                    int(bbox[1]),
                                                    int(bbox[2]),
                                                    int(bbox[3]))}

        page = {}
        fd = BytesIO()
        im.save(fd, format='png', optimize=True)
        page['index'] = self.page_idx
        self.page_idx += 1
        # the page image is embedded as a data URI
        page['img'] = 'data:image/png;base64,' + base64.b64encode(fd.getvalue()).decode('ascii')
        page['lines'] = []
        if records:
            for record in records:
                # The capture group keeps the whitespace runs in the result,
                # so even indices are text and odd indices are the separators
                # following them.
                splits = regex.split(r'(\s+)', record.prediction)
                # NOTE(review): max_bbox presumably returns the box enclosing
                # all given cuts - confirm against its definition.
                bbox = max_bbox(record.cuts)
                # offset into record.cuts of the segment currently processed
                line_offset = 0
                segments = []
                for segment, whitespace in zip_longest(splits[0::2], splits[1::2]):
                    if segment:
                        seg_bbox = max_bbox(record.cuts[line_offset:line_offset + len(segment)])
                        segments.append({'bbox': '{}, {}, {}, {}'.format(*seg_bbox),
                                         'text': segment,
                                         'index': self.seg_idx})
                        self.seg_idx += 1
                        line_offset += len(segment)
                    if whitespace:
                        # whitespace gets no segment but still advances the
                        # offset into the cuts
                        line_offset += len(whitespace)
                line = {'index': self.line_idx, 'recognition': segments}
                line.update(_placement(bbox))
                page['lines'].append(line)
                self.line_idx += 1
        else:
            # the guard above ensures that segmentation is non-empty here
            for bbox in segmentation:
                line = {'index': self.line_idx}
                line.update(_placement(bbox))
                page['lines'].append(line)
                self.line_idx += 1
        self.pages.append(page)
Beispiel #2
0
def detect_scripts(im, bounds, model=None):
    """
    Detects scripts in a segmented page.

    Classifies lines returned by the page segmenter into runs of scripts/writing systems.

    The classifier labels every position of a line with a code point
    0xF0000 + <ISO15924 numerical identifier>. Placeholder labels (inherited
    and common) are replaced by the label of a neighboring run before
    adjacent identical labels are merged into a single bounding box.

    Args:
        im (PIL.Image): A bi-level page of mode '1' or 'L'
        bounds (dict): A dictionary containing a 'boxes' entry with a list of
                       coordinates (x0, y0, x1, y1) of a text line in the image
                       and an entry 'text_direction' containing
                       'horizontal-tb/vertical-lr/rl'.
        model (str): Location of the script classification model or None for default.

    Returns:
        {'text_direction': '$dir', 'boxes': [[(script, (x1, y1, x2, y2)),...]]}: A
        dictionary containing the text direction and a list of lists of reading
        order sorted bounding boxes under the key 'boxes' with each list
        containing the script segmentation of a single line. Script is a
        ISO15924 4 character identifier.

    Raises:
        KrakenInputException if the input image is not binarized or the text
        direction is invalid.
        KrakenInvalidModelException if no clstm module is available.
        NOTE(review): neither is raised directly in this function; they
        presumably originate in rpred and models.load_clstm - confirm.
    """
    if not model:
        model = pkg_resources.resource_filename(__name__, 'script.clstm')
    rnn = models.load_clstm(model)
    # load numerical to 4 char identifier map
    with pkg_resources.resource_stream(__name__, 'iso15924.json') as fp:
        n2s = json.load(fp)

    # Labels that do not name a script of their own: ISO15924 994 (0x3e2,
    # Zinh/inherited) and 998 (0x3e6, Zyyy/common). The list used to contain
    # the common label twice, so inherited runs were never substituted.
    placeholders = (u'\U000f03e2', u'\U000f03e6')

    def subs(m, s):
        # Replace every character of s contained in m with the character
        # emitted just before it. A leading placeholder has no predecessor and
        # is kept as is.
        out = []
        for c in s:
            if c in m and out:
                out.append(out[-1])
            else:
                out.append(c)
        return u''.join(out)

    it = rpred(rnn, im, bounds)
    preds = []
    for pred in it:
        # substitute inherited scripts with neighboring runs
        p = subs(placeholders, pred.prediction)
        # do a reverse run to fix leading inherited scripts
        pred.prediction = ''.join(reversed(subs(placeholders, reversed(p))))
        # group by grapheme
        # NOTE(review): iterating pred is assumed to yield tuples of
        # (label, cut, ...) that reflect the prediction assigned above - confirm.
        t = []
        for k, g in groupby(pred, key=lambda x: x[0]):
            # convert to ISO15924 numerical identifier
            k = ord(k) - 0xF0000
            b = max_bbox(x[1] for x in g)
            t.append((n2s[str(k)], b))
        preds.append(t)

    return {'boxes': preds, 'text_direction': bounds['text_direction']}
Beispiel #3
0
def detect_scripts(im,
                   bounds,
                   model=pkg_resources.resource_filename(
                       __name__, 'script.mlmodel'),
                   valid_scripts=None):
    """
    Detects scripts in a segmented page.

    Classifies lines returned by the page segmenter into runs of scripts/writing systems.

    NOTE: script detection is currently disabled. The function raises
    NotImplementedError before doing any work; the implementation following
    the raise is unreachable and only kept so it can be re-enabled.

    Args:
        im (PIL.Image): A bi-level page of mode '1' or 'L'
        bounds (dict): A dictionary containing a 'boxes' entry with a list of
                       coordinates (x0, y0, x1, y1) of a text line in the image
                       and an entry 'text_direction' containing
                       'horizontal-lr/rl/vertical-lr/rl'.
        model (str): Location of the script classification model or None for default.
        valid_scripts (list): List of valid scripts given as ISO15924 4
                              character identifiers. Labels outside of this
                              list are merged into the preceding run.

    Returns:
        {'script_detection': True, 'text_direction': '$dir', 'boxes':
        [[(script, (x1, y1, x2, y2)),...]]}: A dictionary containing the text
        direction and a list of lists of reading order sorted bounding boxes
        under the key 'boxes' with each list containing the script segmentation
        of a single line. Script is a ISO15924 4 character identifier.

    Raises:
        NotImplementedError: always, as long as script detection is disabled.
        NOTE(review): once re-enabled, models.load_any presumably raises
        KrakenInvalidModelException for unloadable models - confirm.
    """
    raise NotImplementedError(
        'Temporarily unavailable. Please open a github ticket if you want this fixed sooner.'
    )
    # ---- everything below is unreachable until the raise above is removed ----
    if not model:
        # honor the documented "None for default"
        model = pkg_resources.resource_filename(__name__, 'script.mlmodel')
    im_str = get_im_str(im)
    logger.info(u'Detecting scripts with {} in {} lines on {}'.format(
        model, len(bounds['boxes']), im_str))
    logger.debug(u'Loading detection model {}'.format(model))
    rnn = models.load_any(model)
    # load numerical to 4 char identifier map
    logger.debug(u'Loading label to identifier map')
    with pkg_resources.resource_stream(__name__, 'iso15924.json') as fp:
        n2s = json.load(fp)
    # convert allowed scripts to labels (code point 0xF0000 + numerical
    # identifier)
    val_scripts = []
    if valid_scripts:
        logger.debug(
            u'Converting allowed scripts list {}'.format(valid_scripts))
        for k, v in n2s.items():
            if v in valid_scripts:
                val_scripts.append(chr(int(k) + 0xF0000))
    else:
        valid_scripts = []

    # labels without a script of their own: 0x3e2 = 994 (Zinh, inherited) and
    # 0x3e6 = 998 (Zyyy, common)
    placeholders = [u'\U000f03e2', u'\U000f03e6']

    def _subs(m, s, r=False):
        # Replace characters of s with the character emitted just before
        # them. With r=False the characters contained in m are replaced, with
        # r=True the characters NOT contained in m. The first character never
        # has a predecessor and is always kept.
        out = []
        for c in s:
            replace = (c not in m) if r else (c in m)
            if replace and out:
                out.append(out[-1])
            else:
                out.append(c)
        return u''.join(out)

    it = rpred(rnn, im, bounds, bidi_reordering=False)
    preds = []
    logger.debug(u'Running detection')
    for pred, bbox in zip(it, bounds['boxes']):
        # substitute inherited scripts with neighboring runs
        logger.debug(u'Substituting scripts')
        p = _subs(placeholders, pred.prediction)
        # do a reverse run to fix leading inherited scripts
        pred.prediction = ''.join(
            reversed(_subs(placeholders, reversed(p))))
        # group by valid scripts. two steps: 1. substitute common confusions
        # (Syriac->Arabic and Fraktur->Latin) if only the target script is
        # given in the script list.
        if 'Arab' in valid_scripts and 'Syrc' not in valid_scripts:
            # 0x87 = 135 (Syrc) -> 0xa0 = 160 (Arab)
            pred.prediction = pred.prediction.replace(u'\U000f0087',
                                                      u'\U000f00a0')
        if 'Latn' in valid_scripts and 'Latf' not in valid_scripts:
            # 0xd9 = 217 (Latf) -> 0xd7 = 215 (Latn)
            pred.prediction = pred.prediction.replace(u'\U000f00d9',
                                                      u'\U000f00d7')
        # next merge adjacent scripts
        if val_scripts:
            pred.prediction = _subs(val_scripts, pred.prediction, r=True)

        # group by grapheme
        t = []
        logger.debug(u'Merging detections')
        # if line contains only a single script return whole line bounding box
        if len(set(pred.prediction)) == 1:
            logger.debug('Only one script on line. Emitting whole line bbox')
            k = ord(pred.prediction[0]) - 0xF0000
            t.append((n2s[str(k)], bbox))
        else:
            # NOTE(review): iterating pred is assumed to yield tuples of
            # (label, cut, ...) reflecting the rewritten prediction - confirm.
            for k, g in groupby(pred, key=lambda x: x[0]):
                # convert to ISO15924 numerical identifier
                k = ord(k) - 0xF0000
                b = max_bbox(x[1] for x in g)
                t.append((n2s[str(k)], b))
        preds.append(t)
    return {
        'boxes': preds,
        'text_direction': bounds['text_direction'],
        'script_detection': True
    }
Beispiel #4
0
def detect_scripts(im, bounds, model=pkg_resources.resource_filename(__name__, 'script.mlmodel'), valid_scripts=None):
    """
    Detects scripts in a segmented page.

    Classifies lines returned by the page segmenter into runs of scripts/writing systems.

    NOTE: script detection is currently disabled. The function raises
    NotImplementedError before doing any work; the implementation following
    the raise is unreachable and only kept so it can be re-enabled.

    Args:
        im (PIL.Image): A bi-level page of mode '1' or 'L'
        bounds (dict): A dictionary containing a 'boxes' entry with a list of
                       coordinates (x0, y0, x1, y1) of a text line in the image
                       and an entry 'text_direction' containing
                       'horizontal-lr/rl/vertical-lr/rl'.
        model (str): Location of the script classification model or None for default.
        valid_scripts (list): List of valid scripts given as ISO15924 4
                              character identifiers. Labels outside of this
                              list are merged into the preceding run.

    Returns:
        {'script_detection': True, 'text_direction': '$dir', 'boxes':
        [[(script, (x1, y1, x2, y2)),...]]}: A dictionary containing the text
        direction and a list of lists of reading order sorted bounding boxes
        under the key 'boxes' with each list containing the script segmentation
        of a single line. Script is a ISO15924 4 character identifier.

    Raises:
        NotImplementedError: always, as long as script detection is disabled.
        NOTE(review): once re-enabled, models.load_any presumably raises
        KrakenInvalidModelException for unloadable models - confirm.
    """
    raise NotImplementedError('Temporarily unavailable. Please open a github ticket if you want this fixed sooner.')
    # ---- everything below is unreachable until the raise above is removed ----
    if not model:
        # honor the documented "None for default"
        model = pkg_resources.resource_filename(__name__, 'script.mlmodel')
    im_str = get_im_str(im)
    logger.info(u'Detecting scripts with {} in {} lines on {}'.format(model, len(bounds['boxes']), im_str))
    logger.debug(u'Loading detection model {}'.format(model))
    rnn = models.load_any(model)
    # load numerical to 4 char identifier map
    logger.debug(u'Loading label to identifier map')
    with pkg_resources.resource_stream(__name__, 'iso15924.json') as fp:
        n2s = json.load(fp)
    # convert allowed scripts to labels (code point 0xF0000 + numerical identifier)
    val_scripts = []
    if valid_scripts:
        logger.debug(u'Converting allowed scripts list {}'.format(valid_scripts))
        for k, v in n2s.items():
            if v in valid_scripts:
                val_scripts.append(chr(int(k) + 0xF0000))
    else:
        valid_scripts = []

    # labels without a script of their own: 0x3e2 = 994 (Zinh, inherited) and 0x3e6 = 998 (Zyyy, common)
    placeholders = [u'\U000f03e2', u'\U000f03e6']

    def _subs(m, s, r=False):
        # Replace characters of s with the character emitted just before
        # them. With r=False the characters contained in m are replaced, with
        # r=True the characters NOT contained in m. The first character never
        # has a predecessor and is always kept.
        out = []
        for c in s:
            replace = (c not in m) if r else (c in m)
            if replace and out:
                out.append(out[-1])
            else:
                out.append(c)
        return u''.join(out)

    it = rpred(rnn, im, bounds, bidi_reordering=False)
    preds = []
    logger.debug(u'Running detection')
    for pred, bbox in zip(it, bounds['boxes']):
        # substitute inherited scripts with neighboring runs
        logger.debug(u'Substituting scripts')
        p = _subs(placeholders, pred.prediction)
        # do a reverse run to fix leading inherited scripts
        pred.prediction = ''.join(reversed(_subs(placeholders, reversed(p))))
        # group by valid scripts. two steps: 1. substitute common confusions
        # (Syriac->Arabic and Fraktur->Latin) if only the target script is
        # given in the script list.
        if 'Arab' in valid_scripts and 'Syrc' not in valid_scripts:
            # 0x87 = 135 (Syrc) -> 0xa0 = 160 (Arab)
            pred.prediction = pred.prediction.replace(u'\U000f0087', u'\U000f00a0')
        if 'Latn' in valid_scripts and 'Latf' not in valid_scripts:
            # 0xd9 = 217 (Latf) -> 0xd7 = 215 (Latn)
            pred.prediction = pred.prediction.replace(u'\U000f00d9', u'\U000f00d7')
        # next merge adjacent scripts
        if val_scripts:
            pred.prediction = _subs(val_scripts, pred.prediction, r=True)

        # group by grapheme
        t = []
        logger.debug(u'Merging detections')
        # if line contains only a single script return whole line bounding box
        if len(set(pred.prediction)) == 1:
            logger.debug('Only one script on line. Emitting whole line bbox')
            k = ord(pred.prediction[0]) - 0xF0000
            t.append((n2s[str(k)], bbox))
        else:
            # NOTE(review): iterating pred is assumed to yield tuples of
            # (label, cut, ...) reflecting the rewritten prediction - confirm.
            for k, g in groupby(pred, key=lambda x: x[0]):
                # convert to ISO15924 numerical identifier
                k = ord(k) - 0xF0000
                b = max_bbox(x[1] for x in g)
                t.append((n2s[str(k)], b))
        preds.append(t)
    return {'boxes': preds, 'text_direction': bounds['text_direction'], 'script_detection': True}