Example #1
def compute_segmentation_map(im,
                             mask: Optional[np.ndarray] = None,
                             model=None,
                             device: str = 'cpu'):
    """

    """
    im_str = get_im_str(im)
    logger.info(f'Segmenting {im_str}')

    if (model.input[1] == 1 and model.one_channel_mode == '1'
            and not is_bitonal(im)):
        logger.warning('Running binary model on non-binary input image '
                       '(mode {}). This will result in severely degraded '
                       'performance'.format(im.mode))

    model.eval()
    model.to(device)

    if mask:
        if mask.mode != '1' and not is_bitonal(mask):
            logger.error('Mask is not bitonal')
            raise KrakenInputException('Mask is not bitonal')
        mask = mask.convert('1')
        if mask.size != im.size:
            logger.error(
                f'Mask size {mask.size} doesn\'t match image size {im.size}')
            raise KrakenInputException(
                f'Mask size {mask.size} doesn\'t match image size {im.size}')
        logger.info('Masking enabled in segmenter.')
        mask = pil2array(mask)

    batch, channels, height, width = model.input
    transforms = dataset.generate_input_transforms(batch,
                                                   height,
                                                   width,
                                                   channels,
                                                   0,
                                                   valid_norm=False)
    res_tf = tf.Compose(transforms.transforms[:3])
    scal_im = res_tf(im).convert('L')

    with torch.no_grad():
        logger.debug('Running network forward pass')
        o, _ = model.nn(transforms(im).unsqueeze(0).to(device))
    logger.debug('Upsampling network output')
    o = F.interpolate(o, size=scal_im.size[::-1])
    o = o.squeeze().cpu().numpy()
    scale = np.divide(im.size, o.shape[:0:-1])
    bounding_regions = model.user_metadata.get('bounding_regions')
    return {
        'heatmap': o,
        'cls_map': model.user_metadata['class_mapping'],
        'bounding_regions': bounding_regions,
        'scale': scale,
        'scal_im': scal_im
    }
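
A minimal usage sketch for compute_segmentation_map. The image and model paths are placeholders, and the kraken.lib.vgsl import is an assumption based on the model type used above:

from PIL import Image
from kraken.lib import vgsl

im = Image.open('page.png')                            # placeholder path
net = vgsl.TorchVGSLModel.load_model('blla.mlmodel')   # placeholder model
res = compute_segmentation_map(im, mask=None, model=net, device='cpu')
print(res['cls_map'])        # class mapping from the model's user_metadata
print(res['heatmap'].shape)  # per-class probability map
print(res['scale'])          # factor mapping heatmap coordinates back to im
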
Example #2
    def add_page(self, im, segmentation=None, records=None):
        """
        Adds an image to the transcription interface, optionally filling in
        information from a list of ocr_record objects.

        Args:
            im (PIL.Image): Input image
            segmentation (dict): Output of the segment method.
            records (list): A list of ocr_record objects.
        """
        im_str = get_im_str(im)
        logger.info(u'Adding page {} with {} lines'.format(im_str, len(segmentation['boxes']) if segmentation else len(records)))
        page = {}
        fd = BytesIO()
        im.save(fd, format='png', optimize=True)
        page['index'] = self.page_idx
        self.page_idx += 1
        logger.debug(u'Base64 encoding image')
        page['img'] = 'data:image/png;base64,' + base64.b64encode(fd.getvalue()).decode('ascii')
        page['lines'] = []
        if records:
            logger.debug(u'Adding records.')
            self.text_direction = segmentation['text_direction']
            for record, bbox in zip(records, segmentation['boxes']):
                page['lines'].append({'index': self.line_idx, 'text': record.prediction,
                                      'left': 100*int(bbox[0]) / im.size[0],
                                      'top': 100*int(bbox[1]) / im.size[1],
                                      'width': 100*(bbox[2] - bbox[0])/im.size[0],
                                      'height': 100*(int(bbox[3]) - int(bbox[1]))/im.size[1],
                                      'bbox': '{}, {}, {}, {}'.format(int(bbox[0]),
                                                                      int(bbox[1]),
                                                                      int(bbox[2]),
                                                                      int(bbox[3]))})

                self.line_idx += 1
        elif segmentation:
            logger.debug(u'Adding segmentations.')
            self.text_direction = segmentation['text_direction']
            for bbox in segmentation['boxes']:
                page['lines'].append({'index': self.line_idx,
                                      'left': 100*int(bbox[0]) / im.size[0],
                                      'top': 100*int(bbox[1]) / im.size[1],
                                      'width': 100*(bbox[2] - bbox[0])/im.size[0],
                                      'height': 100*(int(bbox[3]) - int(bbox[1]))/im.size[1],
                                      'bbox': '{}, {}, {}, {}'.format(int(bbox[0]),
                                                                      int(bbox[1]),
                                                                      int(bbox[2]),
                                                                      int(bbox[3]))})
                self.line_idx += 1
        else:
            raise KrakenInputException('Neither segmentations nor records given')
        self.pages.append(page)
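
A hedged sketch of driving add_page, assuming the method belongs to kraken's TranscriptionInterface in kraken.transcribe and that pageseg.segment supplies the segmentation dict:

from PIL import Image
from kraken import pageseg, transcribe

im = Image.open('page.png').convert('L')  # placeholder page image
seg = pageseg.segment(im)                 # {'text_direction': ..., 'boxes': [...]}
ti = transcribe.TranscriptionInterface()
ti.add_page(im, segmentation=seg)         # segmentation only, no OCR records
with open('transcription.html', 'wb') as fp:
    ti.write(fp)                          # serialize the filled-in template
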
Example #3
def nlbin(im: Image.Image,
          threshold: float = 0.5,
          zoom: float = 0.5,
          escale: float = 1.0,
          border: float = 0.1,
          perc: int = 80,
          range: int = 20,
          low: int = 5,
          high: int = 90) -> Image.Image:
    """
    Performs binarization using non-linear processing.

    Args:
        im (PIL.Image.Image): Image to binarize.
        threshold (float): Threshold for the final binarization of the
                           normalized image.
        zoom (float): Zoom for background page estimation
        escale (float): Scale for estimating a mask over the text region
        border (float): Ignore this much of the border
        perc (int): Percentage for filters
        range (int): Range for filters
        low (int): Percentile for black estimation
        high (int): Percentile for white estimation

    Returns:
        PIL.Image containing the binarized image

    Raises:
        KrakenInputException when trying to binarize an empty image.
    """
    im_str = get_im_str(im)
    logger.info(f'Binarizing {im_str}')
    if is_bitonal(im):
        logger.info(f'Skipping binarization because {im_str} is bitonal.')
        return im
    # convert to grayscale first
    logger.debug(f'Converting {im_str} to grayscale')
    im = im.convert('L')
    raw = pil2array(im)
    logger.debug('Scaling and normalizing')
    # rescale image to [0, 1] (or [-1, 1] for signed dtypes)
    raw = raw / float(np.iinfo(raw.dtype).max)
    # perform image normalization
    if np.amax(raw) == np.amin(raw):
        logger.warning(f'Trying to binarize empty image {im_str}')
        raise KrakenInputException('Image is empty')
    image = raw - np.amin(raw)
    image /= np.amax(image)

    logger.debug('Interpolation and percentile filtering')
    with warnings.catch_warnings():
        warnings.simplefilter('ignore', UserWarning)
        m = interpolation.zoom(image, zoom)
        m = filters.percentile_filter(m, perc, size=(range, 2))
        m = filters.percentile_filter(m, perc, size=(2, range))
        mh, mw = m.shape
        oh, ow = image.shape
        scale = np.diag([mh * 1.0 / oh, mw * 1.0 / ow])
        m = affine_transform(m, scale, output_shape=image.shape)
    w, h = np.minimum(np.array(image.shape), np.array(m.shape))
    flat = np.clip(image[:w, :h] - m[:w, :h] + 1, 0, 1)

    # estimate low and high thresholds
    d0, d1 = flat.shape
    o0, o1 = int(border * d0), int(border * d1)
    est = flat[o0:d0 - o0, o1:d1 - o1]
    logger.debug('Threshold estimates {}'.format(est))
    # by default, we use only regions that contain
    # significant variance; this makes the percentile
    # based low and high estimates more reliable
    logger.debug('Refine estimates')
    v = est - filters.gaussian_filter(est, escale * 20.0)
    v = filters.gaussian_filter(v**2, escale * 20.0)**0.5
    v = (v > 0.3 * np.amax(v))
    v = morphology.binary_dilation(v, structure=np.ones((int(escale * 50), 1)))
    v = morphology.binary_dilation(v, structure=np.ones((1, int(escale * 50))))
    est = est[v]
    lo = np.percentile(est.ravel(), low)
    hi = np.percentile(est.ravel(), high)
    flat -= lo
    flat /= (hi - lo)
    flat = np.clip(flat, 0, 1)
    logger.debug(f'Thresholding at {threshold}')
    bin = np.array(255 * (flat > threshold), 'B')
    return array2pil(bin)
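
A minimal sketch of binarizing a scan with nlbin and the defaults documented above; the paths are placeholders:

from PIL import Image

im = Image.open('scan.png')    # color or grayscale input; converted to 'L' internally
bw = nlbin(im, threshold=0.5)  # already-bitonal inputs are returned unchanged
bw.save('scan.bin.png')
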
Example #4
    def __init__(self,
                 nets: Dict[str, TorchSeqRecognizer],
                 im: Image.Image,
                 bounds: dict,
                 pad: int = 16,
                 bidi_reordering: bool = True,
                 script_ignore: Optional[List[str]] = None) -> Generator[ocr_record, None, None]:
        """
        Multi-model version of kraken.rpred.rpred.

        Takes a dictionary of ISO15924 script identifiers->models and a
        script-annotated segmentation to dynamically select appropriate models for
        these lines.

        Args:
            nets (dict): A dict mapping ISO15924 identifiers to TorchSeqRecognizer
                         objects. Recommended to be a defaultdict.
            im (PIL.Image.Image): Image to extract text from
            bounds (dict): A dictionary containing a 'boxes' entry
                            with a list of lists of coordinates (script, (x0, y0,
                            x1, y1)) of a text line in the image and an entry
                            'text_direction' containing
                            'horizontal-lr/rl/vertical-lr/rl'.
            pad (int): Extra blank padding to the left and right of the text line
            bidi_reordering (bool): Reorder classes in the ocr_record according to
                                    the Unicode bidirectional algorithm for correct
                                    display.
            script_ignore (list): List of scripts to ignore during recognition
        Yields:
            An ocr_record containing the recognized text, absolute character
            positions, and confidence values for each character.

        Raises:
            KrakenInputException if the mapping between segmentation scripts and
            networks is incomplete.
        """
        seg_types = set(recognizer.seg_type for recognizer in nets.values())
        if ('type' in bounds and bounds['type'] not in seg_types) or len(seg_types) > 1:
            logger.warning('Recognizers with segmentation types {} will be '
                           'applied to segmentation of type {}. This will likely result '
                           'in severely degraded performance'.format(seg_types,
                            bounds['type'] if 'type' in bounds else None))
        one_channel_modes = set(recognizer.nn.one_channel_mode for recognizer in nets.values())
        if '1' in one_channel_modes and len(one_channel_modes) > 1:
            raise KrakenInputException('Mixing binary and non-binary recognition models is not supported.')
        elif '1' in one_channel_modes and not is_bitonal(im):
            logger.warning('Running binary models on non-binary input image '
                           '(mode {}). This will result in severely degraded '
                           'performance'.format(im.mode))
        if 'type' in bounds and bounds['type'] == 'baselines':
            valid_norm = False
            self.len = len(bounds['lines'])
            self.seg_key = 'lines'
            self.next_iter = self._recognize_baseline_line
            self.line_iter = iter(bounds['lines'])
            scripts = [x['script'] for x in bounds['lines']]
        else:
            valid_norm = True
            self.len = len(bounds['boxes'])
            self.seg_key = 'boxes'
            self.next_iter = self._recognize_box_line
            self.line_iter = iter(bounds['boxes'])
            scripts = [x[0] for line in bounds['boxes'] for x in line]

        im_str = get_im_str(im)
        logger.info('Running {} multi-script recognizers on {} with {} lines'.format(len(nets), im_str, self.len))

        miss = [script for script in scripts if not nets.get(script)]
        if miss and not isinstance(nets, defaultdict):
            raise KrakenInputException('Missing models for scripts {}'.format(set(miss)))

        # build dictionary for line preprocessing
        self.ts = {}
        for script in scripts:
            logger.debug('Loading line transforms for {}'.format(script))
            network = nets[script]
            batch, channels, height, width = network.nn.input
            self.ts[script] = generate_input_transforms(batch, height, width, channels, pad, valid_norm)

        self.im = im
        self.nets = nets
        self.bidi_reordering = bidi_reordering
        self.pad = pad
        self.bounds = bounds
        self.script_ignore = script_ignore
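
A sketch of constructing the multi-model predictor above and iterating over it (its next_iter attribute suggests the enclosing class is iterable). The defaultdict fallback follows the docstring's recommendation; model paths, script tags, and coordinates are illustrative assumptions:

from collections import defaultdict
from PIL import Image
from kraken.lib import models

latin = models.load_any('latin.mlmodel')    # placeholder model files
arabic = models.load_any('arabic.mlmodel')
# unknown scripts fall back to the Latin model instead of raising
nets = defaultdict(lambda: latin, {'Latn': latin, 'Arab': arabic})

im = Image.open('page.png')
bounds = {'text_direction': 'horizontal-lr',
          'boxes': [[('Latn', (10, 10, 400, 60)),
                     ('Arab', (10, 70, 400, 120))]]}
for rec in mm_rpred(nets, im, bounds, pad=16):
    print(rec.prediction)
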
Example #5
def detect_scripts(im,
                   bounds,
                   model=pkg_resources.resource_filename(
                       __name__, 'script.mlmodel'),
                   valid_scripts=None):
    """
    Detects scripts in a segmented page.

    Classifies lines returned by the page segmenter into runs of scripts/writing systems.

    Args:
        im (PIL.Image): A bi-level page of mode '1' or 'L'
        bounds (dict): A dictionary containing a 'boxes' entry with a list of
                       coordinates (x0, y0, x1, y1) of a text line in the image
                       and an entry 'text_direction' containing
                       'horizontal-lr/rl/vertical-lr/rl'.
        model (str): Location of the script classification model or None for default.
        valid_scripts (list): List of valid scripts.

    Returns:
        {'script_detection': True, 'text_direction': '$dir', 'boxes':
        [[(script, (x1, y1, x2, y2)),...]]}: A dictionary containing the text
        direction and a list of lists of reading order sorted bounding boxes
        under the key 'boxes' with each list containing the script segmentation
        of a single line. Script is an ISO15924 4 character identifier.

    Raises:
        KrakenInvalidModelException if no clstm module is available.
    """
    raise NotImplementedError(
        'Temporarily unavailable. Please open a github ticket if you want this fixed sooner.'
    )
    im_str = get_im_str(im)
    logger.info(u'Detecting scripts with {} in {} lines on {}'.format(
        model, len(bounds['boxes']), im_str))
    logger.debug(u'Loading detection model {}'.format(model))
    rnn = models.load_any(model)
    # load numerical to 4 char identifier map
    logger.debug(u'Loading label to identifier map')
    with pkg_resources.resource_stream(__name__, 'iso15924.json') as fp:
        n2s = json.load(fp)
    # convert allowed scripts to labels
    val_scripts = []
    if valid_scripts:
        logger.debug(
            u'Converting allowed scripts list {}'.format(valid_scripts))
        for k, v in n2s.items():
            if v in valid_scripts:
                val_scripts.append(chr(int(k) + 0xF0000))
    else:
        valid_scripts = []
    it = rpred(rnn, im, bounds, bidi_reordering=False)
    preds = []
    logger.debug(u'Running detection')
    for pred, bbox in zip(it, bounds['boxes']):
        # substitute inherited scripts with neighboring runs
        def _subs(m, s, r=False):
            p = u''
            for c in s:
                if c in m and p and not r:
                    p += p[-1]
                elif c not in m and p and r:
                    p += p[-1]
                else:
                    p += c
            return p

        logger.debug(u'Substituting scripts')
        p = _subs([u'\U000f03e2', u'\U000f03e6'], pred.prediction)
        # do a reverse run to fix leading inherited scripts
        pred.prediction = ''.join(
            reversed(_subs([u'\U000f03e2', u'\U000f03e6'], reversed(p))))
        # group by valid scripts. two steps: 1. substitute common confusions
        # (Latin->Fraktur and Syriac->Arabic) if given in script list.
        if 'Arab' in valid_scripts and 'Syrc' not in valid_scripts:
            pred.prediction = pred.prediction.replace(u'\U000f0087',
                                                      u'\U000f00a0')
        if 'Latn' in valid_scripts and 'Latf' not in valid_scripts:
            pred.prediction = pred.prediction.replace(u'\U000f00d9',
                                                      u'\U000f00d7')
        # next merge adjacent scripts
        if val_scripts:
            pred.prediction = _subs(val_scripts, pred.prediction, r=True)

        # group by grapheme
        t = []
        logger.debug(u'Merging detections')
        # if line contains only a single script return whole line bounding box
        if len(set(pred.prediction)) == 1:
            logger.debug('Only one script on line. Emitting whole line bbox')
            k = ord(pred.prediction[0]) - 0xF0000
            t.append((n2s[str(k)], bbox))
        else:
            for k, g in groupby(pred, key=lambda x: x[0]):
                # convert to ISO15924 numerical identifier
                k = ord(k) - 0xF0000
                b = max_bbox(x[1] for x in g)
                t.append((n2s[str(k)], b))
        preds.append(t)
    return {
        'boxes': preds,
        'text_direction': bounds['text_direction'],
        'script_detection': True
    }
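
The function above is currently disabled, but its _subs helper carries the core idea: code points of 'inherited' scripts adopt the script of the preceding run, and with r=True everything outside an allow-list does. A standalone sketch of that logic with toy alphabets:

# Standalone rewrite of the _subs helper above; 'i' stands in for an
# inherited script code point and the outputs are traced by hand.
def _subs(m, s, r=False):
    p = ''
    for c in s:
        if (c in m and p and not r) or (c not in m and p and r):
            p += p[-1]       # inherit the script of the previous character
        else:
            p += c
    return p

print(_subs(['i'], 'aibic'))               # 'aabbc'
print(_subs(['a', 'b'], 'axbyc', r=True))  # 'aabbb'
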
Example #6
def segment(im,
            text_direction: str = 'horizontal-lr',
            mask: Optional[np.ndarray] = None,
            reading_order_fn: Callable = polygonal_reading_order,
            model: Union[List[vgsl.TorchVGSLModel],
                         vgsl.TorchVGSLModel] = None,
            device: str = 'cpu'):
    """
    Segments a page into text lines using the baseline segmenter.

    Segments a page into text lines and returns the polyline formed by each
    baseline and their estimated environment.

    Args:
        im (PIL.Image): An RGB image.
        text_direction (str): Ignored by the segmenter but kept for
                              serialization.
        mask (PIL.Image): A bi-level mask image of the same size as `im` where
                          0-valued regions are ignored for segmentation
                          purposes. Disables column detection.
        reading_order_fn (function): Function to determine the reading order.
                                     Has to accept a list of tuples (baselines,
                                     polygon) and a text direction (`lr` or
                                     `rl`).
        model (vgsl.TorchVGSLModel or list): One or more TorchVGSLModel
                                             containing a segmentation model.
                                             If none is given a default model
                                             will be loaded.
        device (str or torch.Device): The target device to run the neural
                                      network on.

    Returns:
        {'text_direction': '$dir',
         'type': 'baseline',
         'lines': [
            {'baseline': [[x0, y0], [x1, y1], ..., [x_n, y_n]], 'boundary': [[x0, y0, x1, y1], ... [x_m, y_m]]},
            {'baseline': [[x0, ...]], 'boundary': [[x0, ...]]}
          ]
          'regions': [
            {'region': [[x0, y0], [x1, y1], ..., [x_n, y_n]], 'type': 'image'},
            {'region': [[x0, ...]], 'type': 'text'}
          ]
        }: A dictionary containing the text direction and under the key 'lines'
        a list of reading order sorted baselines (polylines) and their
        respective polygonal boundaries. The last and first point of each
        boundary polygon is connected.

    Raises:
        KrakenInputException if the input image is not binarized or the text
        direction is invalid.
    """
    if model is None:
        logger.info('No segmentation model given. Loading default model.')
        model = vgsl.TorchVGSLModel.load_model(
            pkg_resources.resource_filename(__name__, 'blla.mlmodel'))

    if isinstance(model, vgsl.TorchVGSLModel):
        model = [model]

    im_str = get_im_str(im)
    logger.info(f'Segmenting {im_str}')

    for net in model:
        rets = compute_segmentation_map(im, mask, net, device)
        regions = vec_regions(**rets)
        # flatten regions for line ordering/fetch bounding regions
        line_regs = []
        suppl_obj = []
        for cls, regs in regions.items():
            line_regs.extend(regs)
            if (rets['bounding_regions'] is not None
                    and cls in rets['bounding_regions']):
                suppl_obj.extend(regs)
        lines = vec_lines(**rets,
                          regions=line_regs,
                          reading_order_fn=reading_order_fn,
                          text_direction=text_direction,
                          suppl_obj=suppl_obj)

    if len(rets['cls_map']['baselines']) > 1:
        script_detection = True
    else:
        script_detection = False

    return {
        'text_direction': text_direction,
        'type': 'baselines',
        'lines': lines,
        'regions': regions,
        'script_detection': script_detection
    }
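
A minimal sketch of calling the baseline segmenter above with its bundled default model; the image path is a placeholder:

from PIL import Image

im = Image.open('page.png').convert('RGB')
seg = segment(im, text_direction='horizontal-lr', device='cpu')
print(seg['type'])   # 'baselines'
for line in seg['lines']:
    # each entry carries a polyline baseline and a closed boundary polygon
    print(line['baseline'][0], '->', line['baseline'][-1])
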
Example #7
def mm_rpred(nets: Dict[str, TorchSeqRecognizer],
             im: Image.Image,
             bounds: dict,
             pad: int = 16,
             bidi_reordering: bool = True,
             script_ignore: Optional[List[str]] = None) -> Generator[ocr_record, None, None]:
    """
    Multi-model version of kraken.rpred.rpred.

    Takes a dictionary of ISO15924 script identifiers->models and a
    script-annotated segmentation to dynamically select appropriate models for
    these lines.

    Args:
        nets (dict): A dict mapping ISO15924 identifiers to TorchSeqRecognizer
                     objects. Recommended to be a defaultdict.
        im (PIL.Image.Image): Image to extract text from
        bounds (dict): A dictionary containing a 'boxes' entry
                        with a list of lists of coordinates (script, (x0, y0,
                        x1, y1)) of a text line in the image and an entry
                        'text_direction' containing
                        'horizontal-lr/rl/vertical-lr/rl'.
        pad (int): Extra blank padding to the left and right of the text line
        bidi_reordering (bool): Reorder classes in the ocr_record according to
                                the Unicode bidirectional algorithm for correct
                                display.
        script_ignore (list): List of scripts to ignore during recognition
    Yields:
        An ocr_record containing the recognized text, absolute character
        positions, and confidence values for each character.

    Raises:
        KrakenInputException if the mapping between segmentation scripts and
        networks is incomplete.
    """
    im_str = get_im_str(im)
    logger.info('Running {} multi-script recognizers on {} with {} lines'.format(len(nets), im_str, len(bounds['boxes'])))

    miss = [x[0] for x in bounds['boxes'] if not nets.get(x[0])]
    if miss:
        raise KrakenInputException('Missing models for scripts {}'.format(miss))

    # build dictionary for line preprocessing
    ts = {}
    for script, network in nets.items():
        logger.debug('Loading line transforms for {}'.format(script))
        batch, channels, height, width = network.nn.input
        ts[script] = generate_input_transforms(batch, height, width, channels, pad)

    for line in bounds['boxes']:
        rec = ocr_record('', [], [])
        for script, (box, coords) in zip(map(lambda x: x[0], line),
                                         extract_boxes(im, {'text_direction': bounds['text_direction'],
                                                            'boxes': map(lambda x: x[1], line)})):
            # skip if script is set to ignore
            if script_ignore is not None and script in script_ignore:
                logger.info('Ignoring {} line segment.'.format(script))
                continue
            # check if boxes are non-zero in any dimension
            if sum(coords[::2]) == 0 or coords[3] - coords[1] == 0:
                logger.warning('Run with zero dimension. Skipping.')
                continue
            # try conversion into tensor
            try:
                logger.debug('Preparing run.')
                line = ts[script](box)
            except Exception:
                logger.warning('Conversion of line {} failed. Skipping.'.format(coords))
                yield ocr_record('', [], [])
                continue

            # check if line is non-zero
            if line.max() == line.min():
                logger.warning('Empty run. Skipping.')
                yield ocr_record('', [], [])
                continue

            logger.debug('Forward pass with model {}'.format(script))
            preds = nets[script].predict(line)

            # calculate recognized LSTM locations of characters
            logger.debug('Convert to absolute coordinates')
            scale = box.size[0] / (len(nets[script].outputs) - 2 * pad)
            pred = ''.join(x[0] for x in preds)
            pos = []
            conf = []

            for _, start, end, c in preds:
                if bounds['text_direction'].startswith('horizontal'):
                    xmin = coords[0] + int(max((start-pad)*scale, 0))
                    xmax = coords[0] + max(int(min((end-pad)*scale, coords[2]-coords[0])), 1)
                    pos.append((xmin, coords[1], xmax, coords[3]))
                else:
                    ymin = coords[1] + int(max((start-pad)*scale, 0))
                    ymax = coords[1] + max(int(min((end-pad)*scale, coords[3]-coords[1])), 1)
                    pos.append((coords[0], ymin, coords[2], ymax))
                conf.append(c)
            rec.prediction += pred
            rec.cuts.extend(pos)
            rec.confidences.extend(conf)
        if bidi_reordering:
            logger.debug('BiDi reordering record.')
            yield bidi_record(rec)
        else:
            logger.debug('Emitting raw record')
            yield rec
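
A sketch of the generator version above using script_ignore to skip one script entirely; model paths and box coordinates are illustrative:

from PIL import Image
from kraken.lib import models

nets = {'Latn': models.load_any('latin.mlmodel'),   # placeholder paths
        'Grek': models.load_any('greek.mlmodel')}
im = Image.open('page.png')
bounds = {'text_direction': 'horizontal-lr',
          'boxes': [[('Latn', (0, 0, 200, 40)),
                     ('Grek', (200, 0, 400, 40))]]}
# Greek runs are logged and skipped; only the Latin model is executed
for rec in mm_rpred(nets, im, bounds, script_ignore=['Grek']):
    print(rec.prediction)
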
Example #8
def segment(im,
            text_direction='horizontal-lr',
            scale=None,
            maxcolseps=2,
            black_colseps=False,
            no_hlines=True,
            pad=0):
    """
    Segments a page into text lines.

    Segments a page into text lines and returns the absolute coordinates of
    each line in reading order.

    Args:
        im (PIL.Image): A bi-level page of mode '1' or 'L'
        text_direction (str): Principal direction of the text
                              (horizontal-lr/rl/vertical-lr/rl)
        scale (float): Scale of the image
        maxcolseps (int): Maximum number of whitespace column separators
        black_colseps (bool): Whether column separators are assumed to be
                              vertical black lines or not
        no_hlines (bool): Switch for horizontal line removal
        pad (int or tuple): Padding to add to line bounding boxes. If int the
                            same padding is used both left and right. If a
                            2-tuple, uses (padding_left, padding_right).

    Returns:
        {'text_direction': '$dir', 'boxes': [(x1, y1, x2, y2),...]}: A
        dictionary containing the text direction and a list of reading order
        sorted bounding boxes under the key 'boxes'.

    Raises:
        KrakenInputException if the input image is not binarized or the text
        direction is invalid.
    """
    im_str = get_im_str(im)
    logger.info(u'Segmenting {}'.format(im_str))

    if im.mode != '1' and not is_bitonal(im):
        logger.error(u'Image {} is not bi-level'.format(im_str))
        raise KrakenInputException('Image {} is not bi-level'.format(im_str))

    # rotate input image for vertical lines
    if text_direction.startswith('horizontal'):
        angle = 0
        offset = (0, 0)
    elif text_direction == 'vertical-lr':
        angle = 270
        offset = (0, im.size[1])
    elif text_direction == 'vertical-rl':
        angle = 90
        offset = (im.size[0], 0)
    else:
        logger.error(u'Invalid text direction \'{}\''.format(text_direction))
        raise KrakenInputException(
            'Invalid text direction {}'.format(text_direction))

    logger.debug(u'Rotating input image by {} degrees'.format(angle))
    im = im.rotate(angle, expand=True)

    # honestly I've got no idea what's going on here. In theory a simple
    # np.array(im, 'i') should suffice here but for some reason the
    # tostring/fromstring magic in pil2array alters the array in a way that is
    # needed for the algorithm to work correctly.
    a = pil2array(im)
    binary = np.array(a > 0.5 * (np.amin(a) + np.amax(a)), 'i')
    binary = 1 - binary

    if not scale:
        scale = estimate_scale(binary)

    if no_hlines:
        binary = remove_hlines(binary, scale)
    # emptyish images will cause exceptions here.

    try:
        if black_colseps:
            colseps, binary = compute_black_colseps(binary, scale, maxcolseps)
        else:
            colseps = compute_white_colseps(binary, scale, maxcolseps)
    except ValueError:
        logger.warning(
            u'Exception in column finder (probably empty image) for {}.'.
            format(im_str))
        return {'text_direction': text_direction, 'boxes': []}

    bottom, top, boxmap = compute_gradmaps(binary, scale)
    seeds = compute_line_seeds(binary, bottom, top, colseps, scale)
    llabels = morph.propagate_labels(boxmap, seeds, conflict=0)
    spread = morph.spread_labels(seeds, maxdist=scale)
    llabels = np.where(llabels > 0, llabels, spread * binary)
    segmentation = llabels * binary

    lines = compute_lines(segmentation, scale)
    order = reading_order([l.bounds for l in lines], text_direction[-2:])
    lsort = topsort(order)
    lines = [lines[i].bounds for i in lsort]
    lines = [(s2.start, s1.start, s2.stop, s1.stop) for s1, s2 in lines]

    if isinstance(pad, int):
        pad = (pad, pad)
    lines = [(max(x[0] - pad[0], 0), x[1], min(x[2] + pad[1],
                                               im.size[0]), x[3])
             for x in lines]

    return {
        'text_direction': text_direction,
        'boxes': rotate_lines(lines, 360 - angle, offset).tolist(),
        'script_detection': False
    }
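
A sketch of the legacy box segmenter above on a bitonal page, using the 2-tuple form of pad for asymmetric padding; the path is a placeholder:

from PIL import Image

im = Image.open('bw.png').convert('1')  # must be bi-level, see the check above
seg = segment(im, text_direction='horizontal-lr', pad=(10, 20))
for x0, y0, x1, y1 in seg['boxes']:
    print(f'line bbox: ({x0}, {y0})-({x1}, {y1})')
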
Example #9
def segment(im: PIL.Image.Image,
            text_direction: str = 'horizontal-lr',
            mask: Optional[np.ndarray] = None,
            reading_order_fn: Callable = polygonal_reading_order,
            model: Union[List[vgsl.TorchVGSLModel],
                         vgsl.TorchVGSLModel] = None,
            device: str = 'cpu') -> Dict[str, Any]:
    r"""
    Segments a page into text lines using the baseline segmenter.

    Segments a page into text lines and returns the polyline formed by each
    baseline and their estimated environment.

    Args:
        im: Input image. The mode can generally be anything but it is possible
            to supply a binarized-input-only model which requires accordingly
            treated images.
        text_direction: Passed-through value for serialization.serialize.
        mask: A bi-level mask image of the same size as `im` where 0-valued
              regions are ignored for segmentation purposes. Disables column
              detection.
        reading_order_fn: Function to determine the reading order.  Has to
                          accept a list of tuples (baselines, polygon) and a
                          text direction (`lr` or `rl`).
        model: One or more TorchVGSLModel containing a segmentation model. If
               none is given a default model will be loaded.
        device: The target device to run the neural network on.

    Returns:
        A dictionary containing the text direction and under the key 'lines' a
        list of reading order sorted baselines (polylines) and their respective
        polygonal boundaries. The last and first point of each boundary polygon
        are connected.

        .. code-block::
           :force:

            {'text_direction': '$dir',
             'type': 'baseline',
             'lines': [
                {'baseline': [[x0, y0], [x1, y1], ..., [x_n, y_n]], 'boundary': [[x0, y0, x1, y1], ... [x_m, y_m]]},
                {'baseline': [[x0, ...]], 'boundary': [[x0, ...]]}
              ]
              'regions': [
                {'region': [[x0, y0], [x1, y1], ..., [x_n, y_n]], 'type': 'image'},
                {'region': [[x0, ...]], 'type': 'text'}
              ]
            }

    Raises:
        KrakenInvalidModelException: if the given model is not a valid
                                     segmentation model.
        KrakenInputException: if the mask is not bitonal or does not match the
                              image size.
    """
    if model is None:
        logger.info('No segmentation model given. Loading default model.')
        model = vgsl.TorchVGSLModel.load_model(
            pkg_resources.resource_filename(__name__, 'blla.mlmodel'))

    if isinstance(model, vgsl.TorchVGSLModel):
        model = [model]

    for nn in model:
        if nn.model_type != 'segmentation':
            raise KrakenInvalidModelException(
                f'Invalid model type {nn.model_type} for {nn}')
        if 'class_mapping' not in nn.user_metadata:
            raise KrakenInvalidModelException(
                f'Segmentation model {nn} does not contain valid class mapping'
            )

    im_str = get_im_str(im)
    logger.info(f'Segmenting {im_str}')

    for net in model:
        if 'topline' in net.user_metadata:
            loc = {
                None: 'center',
                True: 'top',
                False: 'bottom'
            }[net.user_metadata['topline']]
            logger.debug(f'Baseline location: {loc}')
        rets = compute_segmentation_map(im, mask, net, device)
        regions = vec_regions(**rets)
        # flatten regions for line ordering/fetch bounding regions
        line_regs = []
        suppl_obj = []
        for cls, regs in regions.items():
            line_regs.extend(regs)
            if (rets['bounding_regions'] is not None
                    and cls in rets['bounding_regions']):
                suppl_obj.extend(regs)
        # convert back to net scale
        suppl_obj = scale_regions(suppl_obj, 1 / rets['scale'])
        line_regs = scale_regions(line_regs, 1 / rets['scale'])
        lines = vec_lines(**rets,
                          regions=line_regs,
                          reading_order_fn=reading_order_fn,
                          text_direction=text_direction,
                          suppl_obj=suppl_obj,
                          topline=net.user_metadata.get('topline', False))

    if len(rets['cls_map']['baselines']) > 1:
        script_detection = True
    else:
        script_detection = False

    return {
        'text_direction': text_direction,
        'type': 'baselines',
        'lines': lines,
        'regions': regions,
        'script_detection': script_detection
    }
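
This variant validates its models before inference; a sketch of what that buys the caller, assuming kraken's exceptions live in kraken.lib.exceptions and 'blla.mlmodel' stands in for any model file:

from PIL import Image
from kraken.lib import vgsl
from kraken.lib.exceptions import KrakenInvalidModelException

net = vgsl.TorchVGSLModel.load_model('blla.mlmodel')  # placeholder path
try:
    seg = segment(Image.open('page.png'), model=[net])
except KrakenInvalidModelException as e:
    # raised up front if model_type or class_mapping is unsuitable
    print('not a usable segmentation model:', e)
else:
    print(len(seg['lines']), 'lines,', len(seg['regions']), 'region classes')
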
Example #10
def mm_rpred(nets,
             im,
             bounds,
             pad=16,
             line_normalization=True,
             bidi_reordering=True,
             script_ignore=None):
    """
    Multi-model version of kraken.rpred.rpred.

    Takes a dictionary of ISO15924 script identifiers->models and a
    script-annotated segmentation to dynamically select appropriate models for
    these lines.

    Args:
        nets (dict): A dict mapping ISO15924 identifiers to SegRecognizer
                     objects. Recommended to be a defaultdict.
        im (PIL.Image): Image to extract text from
        bounds (dict): A dictionary containing a 'boxes' entry
                       with a list of lists of coordinates (script, (x0, y0,
                       x1, y1)) of a text line in the image and an entry
                       'text_direction' containing
                       'horizontal-lr/rl/vertical-lr/rl'.
        pad (int): Extra blank padding to the left and right of the text line
        line_normalization (bool): Dewarp line using the line estimator
                                   contained in the network. If no normalizer
                                   is available one using the default
                                   parameters is created. Be aware that you may
                                   have to scale lines manually to the target
                                   line height if disabled.
        bidi_reordering (bool): Reorder classes in the ocr_record according to
                                the Unicode bidirectional algorithm for correct
                                display.
        script_ignore (list): List of scripts to ignore during recognition
    Yields:
        An ocr_record containing the recognized text, absolute character
        positions, and confidence values for each character.
    """
    im_str = get_im_str(im)
    logger.info(
        u'Running {} multi-script recognizers on {} with {} lines'.format(
            len(nets), im_str, len(bounds['boxes'])))
    for line in bounds['boxes']:
        rec = ocr_record('', [], [])
        for script, (box, coords) in zip(
                map(lambda x: x[0], line),
                extract_boxes(
                    im, {
                        'text_direction': bounds['text_direction'],
                        'boxes': map(lambda x: x[1], line)
                    })):
            # skip if script is set to ignore
            if script_ignore is not None and script in script_ignore:
                logger.info(u'Ignoring {} line segment.'.format(script))
                continue
            # check if boxes are non-zero in any dimension
            if sum(coords[::2]) == 0 or coords[3] - coords[1] == 0:
                logger.warning(u'Run with zero dimension. Skipping.')
                continue
            raw_line = pil2array(box)
            # check if line is non-zero
            if np.amax(raw_line) == np.amin(raw_line):
                logger.warning(u'Empty run. Skipping.')
                continue
            if line_normalization:
                # fail gracefully and return no recognition result in case the
                # input line can not be normalized.
                try:
                    lnorm = getattr(nets[script], 'lnorm', CenterNormalizer())
                    if not is_bitonal(im):
                        logger.info(
                            u'Image is grayscale. Adjusting normalizer parameters'
                        )
                        lnorm.range = 2
                    box = dewarp(lnorm, box)
                except Exception:
                    logger.warning(
                        u'Dewarping for bbox {} failed. Skipping.'.format(
                            coords))
                    continue
            line = pil2array(box)
            logger.debug(u'Preparing run.')
            line = lstm.prepare_line(line, pad)
            logger.debug(u'Forward pass with model {}'.format(script))
            pred = nets[script].predictString(line)
            logger.info(u'Prediction: {}'.format(pred))
            # calculate recognized LSTM locations of characters
            scale = len(raw_line.T) / (len(nets[script].outputs) - 2 * pad)
            logger.debug(u'Extracting labels.')
            result = lstm.translate_back_locations(nets[script].outputs)
            pos = []
            conf = []

            for _, start, end, c in result:
                if bounds['text_direction'].startswith('horizontal'):
                    pos.append(
                        (coords[0] + int(max(start - pad, 0) * scale),
                         coords[1],
                         coords[0] + int(min(end - pad, coords[2]) * scale),
                         coords[3]))
                else:
                    pos.append(
                        (coords[0],
                         coords[1] + int(max(start - pad, 0) * scale),
                         coords[2],
                         coords[1] + int(min(end - pad, coords[3]) * scale)))
                conf.append(c)
            rec.prediction += pred
            rec.cuts.extend(pos)
            rec.confidences.extend(conf)
        if bidi_reordering:
            logger.debug(u'BiDi reordering record.')
            yield bidi_record(rec)
        else:
            yield rec
Example #11
def rpred(network,
          im,
          bounds,
          pad=16,
          line_normalization=True,
          bidi_reordering=True):
    """
    Uses a RNN to recognize text

    Args:
        network (kraken.lib.lstm.SegRecognizer): A SegRecognizer object
        im (PIL.Image): Image to extract text from
        bounds (dict): A dictionary containing a 'boxes' entry with a list of
                       coordinates (x0, y0, x1, y1) of a text line in the image
                       and an entry 'text_direction' containing
                       'horizontal-lr/rl/vertical-lr/rl'.
        pad (int): Extra blank padding to the left and right of the text line
        line_normalization (bool): Dewarp line using the line estimator
                                   contained in the network. If no normalizer
                                   is available one using the default
                                   parameters is created. Be aware that you may
                                   have to scale lines manually to the target
                                   line height if disabled.
        bidi_reordering (bool): Reorder classes in the ocr_record according to
                                the Unicode bidirectional algorithm for correct
                                display.
    Yields:
        An ocr_record containing the recognized text, absolute character
        positions, and confidence values for each character.
    """
    im_str = get_im_str(im)
    logger.info(u'Running recognizer on {} with {} lines'.format(
        im_str, len(bounds['boxes'])))
    logger.debug(u'Loading line normalizer')
    lnorm = getattr(network, 'lnorm', CenterNormalizer())
    if not is_bitonal(im):
        logger.info(u'Image is grayscale. Adjusting normalizer parameters')
        lnorm.range = 2

    for box, coords in extract_boxes(im, bounds):
        # check if boxes are non-zero in any dimension
        if sum(coords[::2]) == 0 or coords[3] - coords[1] == 0:
            logger.warning(
                u'bbox {} with zero dimension. Emitting empty record.'.format(
                    coords))
            yield ocr_record('', [], [])
            continue
        raw_line = pil2array(box)
        # check if line is non-zero
        if np.amax(raw_line) == np.amin(raw_line):
            logger.warning(
                u'Empty line {}. Emitting empty record.'.format(coords))
            yield ocr_record('', [], [])
            continue
        if line_normalization:
            # fail gracefully and return no recognition result in case the
            # input line can not be normalized.
            try:
                box = dewarp(lnorm, box)
            except Exception:
                logger.warning(
                    u'Dewarping for bbox {} failed. Emitting empty record.'.
                    format(coords))
                yield ocr_record('', [], [])
                continue
        line = pil2array(box)
        logger.debug(u'Preparing line.')
        line = lstm.prepare_line(line, pad)
        logger.debug(u'Performing forward pass.')
        pred = network.predictString(line)
        logger.info(u'Prediction: {}'.format(pred))

        # calculate recognized LSTM locations of characters
        scale = len(raw_line.T) / (len(network.outputs) - 2 * pad)
        logger.debug(u'Extracting labels.')
        result = lstm.translate_back_locations(network.outputs)
        pos = []
        conf = []

        for _, start, end, c in result:
            if bounds['text_direction'].startswith('horizontal'):
                xmin = coords[0] + int(max((start - pad) * scale, 0))
                xmax = coords[0] + max(
                    int(min((end - pad) * scale, coords[2] - coords[0])), 1)
                pos.append((xmin, coords[1], xmax, coords[3]))
            else:
                ymin = coords[1] + int(max((start - pad) * scale, 0))
                ymax = coords[1] + max(
                    int(min((end - pad) * scale, coords[3] - coords[1])), 1)
                pos.append((coords[0], ymin, coords[2], ymax))
            conf.append(c)
        if bidi_reordering:
            logger.debug(u'BiDi reordering record.')
            yield bidi_record(ocr_record(pred, pos, conf))
        else:
            yield ocr_record(pred, pos, conf)
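
A sketch of running the legacy recognizer above over a box segmentation; the model path is a placeholder and models.load_any is assumed to return a compatible recognizer:

from PIL import Image
from kraken.lib import models

rnn = models.load_any('en-default.pyrnn.gz')  # placeholder legacy model
im = Image.open('bw.png')
bounds = {'text_direction': 'horizontal-lr',
          'boxes': [(10, 10, 500, 50),
                    (10, 60, 500, 100)]}
for rec in rpred(rnn, im, bounds, pad=16):
    print(rec.prediction or '<empty record>')
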
Example #12
def detect_scripts(im, bounds, model=pkg_resources.resource_filename(__name__, 'script.mlmodel'), valid_scripts=None):
    """
    Detects scripts in a segmented page.

    Classifies lines returned by the page segmenter into runs of scripts/writing systems.

    Args:
        im (PIL.Image): A bi-level page of mode '1' or 'L'
        bounds (dict): A dictionary containing a 'boxes' entry with a list of
                       coordinates (x0, y0, x1, y1) of a text line in the image
                       and an entry 'text_direction' containing
                       'horizontal-lr/rl/vertical-lr/rl'.
        model (str): Location of the script classification model or None for default.
        valid_scripts (list): List of valid scripts.

    Returns:
        {'script_detection': True, 'text_direction': '$dir', 'boxes':
        [[(script, (x1, y1, x2, y2)),...]]}: A dictionary containing the text
        direction and a list of lists of reading order sorted bounding boxes
        under the key 'boxes' with each list containing the script segmentation
        of a single line. Script is an ISO15924 4 character identifier.

    Raises:
        KrakenInvalidModelException if no clstm module is available.
    """
    raise NotImplementedError('Temporarily unavailable. Please open a github ticket if you want this fixed sooner.')
    im_str = get_im_str(im)
    logger.info(u'Detecting scripts with {} in {} lines on {}'.format(model, len(bounds['boxes']), im_str))
    logger.debug(u'Loading detection model {}'.format(model))
    rnn = models.load_any(model)
    # load numerical to 4 char identifier map
    logger.debug(u'Loading label to identifier map')
    with pkg_resources.resource_stream(__name__, 'iso15924.json') as fp:
        n2s = json.load(fp)
    # convert allowed scripts to labels
    val_scripts = []
    if valid_scripts:
        logger.debug(u'Converting allowed scripts list {}'.format(valid_scripts))
        for k, v in n2s.items():
            if v in valid_scripts:
                val_scripts.append(chr(int(k) + 0xF0000))
    else:
        valid_scripts = []
    it = rpred(rnn, im, bounds, bidi_reordering=False)
    preds = []
    logger.debug(u'Running detection')
    for pred, bbox in zip(it, bounds['boxes']):
        # substitute inherited scripts with neighboring runs
        def _subs(m, s, r=False):
            p = u''
            for c in s:
                if c in m and p and not r:
                    p += p[-1]
                elif c not in m and p and r:
                    p += p[-1]
                else:
                    p += c
            return p

        logger.debug(u'Substituting scripts')
        p = _subs([u'\U000f03e2', u'\U000f03e6'], pred.prediction)
        # do a reverse run to fix leading inherited scripts
        pred.prediction = ''.join(reversed(_subs([u'\U000f03e2', u'\U000f03e6'], reversed(p))))
        # group by valid scripts. two steps: 1. substitute common confusions
        # (Latin->Fraktur and Syriac->Arabic) if given in script list.
        if 'Arab' in valid_scripts and 'Syrc' not in valid_scripts:
            pred.prediction = pred.prediction.replace(u'\U000f0087', u'\U000f00a0')
        if 'Latn' in valid_scripts and 'Latf' not in valid_scripts:
            pred.prediction = pred.prediction.replace(u'\U000f00d9', u'\U000f00d7')
        # next merge adjacent scripts
        if val_scripts:
            pred.prediction = _subs(val_scripts, pred.prediction, r=True)

        # group by grapheme
        t = []
        logger.debug(u'Merging detections')
        # if line contains only a single script return whole line bounding box
        if len(set(pred.prediction)) == 1:
            logger.debug('Only one script on line. Emitting whole line bbox')
            k = ord(pred.prediction[0]) - 0xF0000
            t.append((n2s[str(k)], bbox))
        else:
            for k, g in groupby(pred, key=lambda x: x[0]):
                # convert to ISO15924 numerical identifier
                k = ord(k) - 0xF0000
                b = max_bbox(x[1] for x in g)
                t.append((n2s[str(k)], b))
        preds.append(t)
    return {'boxes': preds, 'text_direction': bounds['text_direction'], 'script_detection': True}
Example #13
def segment(im, text_direction='horizontal-lr', scale=None, maxcolseps=2,
            black_colseps=False, no_hlines=True, pad=0, mask=None):
    """
    Segments a page into text lines.

    Segments a page into text lines and returns the absolute coordinates of
    each line in reading order.

    Args:
        im (PIL.Image): A bi-level page of mode '1' or 'L'
        text_direction (str): Principal direction of the text
                              (horizontal-lr/rl/vertical-lr/rl)
        scale (float): Scale of the image
        maxcolseps (int): Maximum number of whitespace column separators
        black_colseps (bool): Whether column separators are assumed to be
                              vertical black lines or not
        no_hlines (bool): Switch for horizontal line removal
        pad (int or tuple): Padding to add to line bounding boxes. If int the
                            same padding is used both left and right. If a
                            2-tuple, uses (padding_left, padding_right).
        mask (PIL.Image): A bi-level mask image of the same size as `im` where
                          0-valued regions are ignored for segmentation
                          purposes. Disables column detection.

    Returns:
        {'text_direction': '$dir', 'boxes': [(x1, y1, x2, y2),...]}: A
        dictionary containing the text direction and a list of reading order
        sorted bounding boxes under the key 'boxes'.

    Raises:
        KrakenInputException if the input image is not binarized or the text
        direction is invalid.
    """
    im_str = get_im_str(im)
    logger.info('Segmenting {}'.format(im_str))

    if im.mode != '1' and not is_bitonal(im):
        logger.error('Image {} is not bi-level'.format(im_str))
        raise KrakenInputException('Image {} is not bi-level'.format(im_str))

    # rotate input image for vertical lines
    if text_direction.startswith('horizontal'):
        angle = 0
        offset = (0, 0)
    elif text_direction == 'vertical-lr':
        angle = 270
        offset = (0, im.size[1])
    elif text_direction == 'vertical-rl':
        angle = 90
        offset = (im.size[0], 0)
    else:
        logger.error('Invalid text direction \'{}\''.format(text_direction))
        raise KrakenInputException('Invalid text direction {}'.format(text_direction))

    logger.debug('Rotating input image by {} degrees'.format(angle))
    im = im.rotate(angle, expand=True)

    # honestly I've got no idea what's going on here. In theory a simple
    # np.array(im, 'i') should suffice here but for some reason the
    # tostring/fromstring magic in pil2array alters the array in a way that is
    # needed for the algorithm to work correctly.
    a = pil2array(im)
    binary = np.array(a > 0.5*(np.amin(a) + np.amax(a)), 'i')
    binary = 1 - binary

    if not scale:
        scale = estimate_scale(binary)

    if no_hlines:
        binary = remove_hlines(binary, scale)
    # emptyish images will cause exceptions here.

    try:
        if mask:
            if mask.mode != '1' and not is_bitonal(mask):
                logger.error('Mask is not bitonal')
                raise KrakenInputException('Mask is not bitonal')
            mask = mask.convert('1')
            if mask.size != im.size:
                logger.error('Mask size {} doesn\'t match image size {}'.format(mask.size, im.size))
                raise KrakenInputException('Mask size {} doesn\'t match image size {}'.format(mask.size, im.size))
            logger.info('Masking enabled in segmenter. Disabling column detection.')
            mask = mask.rotate(angle, expand=True)
            colseps = pil2array(mask)
        elif black_colseps:
            colseps, binary = compute_black_colseps(binary, scale, maxcolseps)
        else:
            colseps = compute_white_colseps(binary, scale, maxcolseps)
    except ValueError:
        logger.warning('Exception in column finder (probably empty image) for {}.'.format(im_str))
        return {'text_direction': text_direction, 'boxes':  []}

    bottom, top, boxmap = compute_gradmaps(binary, scale)
    seeds = compute_line_seeds(binary, bottom, top, colseps, scale)
    llabels = morph.propagate_labels(boxmap, seeds, conflict=0)
    spread = morph.spread_labels(seeds, maxdist=scale)
    llabels = np.where(llabels > 0, llabels, spread*binary)
    segmentation = llabels*binary

    lines = compute_lines(segmentation, scale)
    order = reading_order([l.bounds for l in lines], text_direction[-2:])
    lsort = topsort(order)
    lines = [lines[i].bounds for i in lsort]
    lines = [(s2.start, s1.start, s2.stop, s1.stop) for s1, s2 in lines]

    if isinstance(pad, int):
        pad = (pad, pad)
    lines = [(max(x[0]-pad[0], 0), x[1], min(x[2]+pad[1], im.size[0]), x[3]) for x in lines]

    return {'text_direction': text_direction, 'boxes':  rotate_lines(lines, 360-angle, offset).tolist(), 'script_detection': False}
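
A sketch of the mask parameter added in this variant: blank out a strip of the page so the segmenter ignores it. The rectangle is illustrative; any bitonal image of the same size works:

from PIL import Image, ImageDraw

im = Image.open('bw.png').convert('1')  # placeholder bitonal page
mask = Image.new('1', im.size, 1)       # 1 = segment, 0 = ignore
ImageDraw.Draw(mask).rectangle((0, 0, 200, im.size[1] - 1), fill=0)
seg = segment(im, mask=mask)            # masking also disables column detection
print(len(seg['boxes']), 'lines found outside the masked strip')
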
Example #14
def rpred(network: TorchSeqRecognizer,
          im: Image.Image,
          bounds: dict,
          pad: int = 16,
          bidi_reordering: bool = True) -> Generator[ocr_record, None, None]:
    """
    Uses a RNN to recognize text

    Args:
        network (kraken.lib.models.TorchSeqRecognizer): A TorchSeqRecognizer
                                                        object
        im (PIL.Image.Image): Image to extract text from
        bounds (dict): A dictionary containing a 'boxes' entry with a list of
                       coordinates (x0, y0, x1, y1) of a text line in the image
                       and an entry 'text_direction' containing
                       'horizontal-lr/rl/vertical-lr/rl'.
        pad (int): Extra blank padding to the left and right of the text line.
                   Auto-disabled when expected network inputs are incompatible
                   with padding.
        bidi_reordering (bool): Reorder classes in the ocr_record according to
                                the Unicode bidirectional algorithm for correct
                                display.
    Yields:
        An ocr_record containing the recognized text, absolute character
        positions, and confidence values for each character.
    """
    im_str = get_im_str(im)
    logger.info('Running recognizer on {} with {} lines'.format(im_str, len(bounds['boxes'])))
    logger.debug('Loading line transform')
    batch, channels, height, width = network.nn.input
    ts = generate_input_transforms(batch, height, width, channels, pad)

    for box, coords in extract_boxes(im, bounds):
        # check if boxes are non-zero in any dimension
        if sum(coords[::2]) == 0 or coords[3] - coords[1] == 0:
            logger.warning('bbox {} with zero dimension. Emitting empty record.'.format(coords))
            yield ocr_record('', [], [])
            continue
        # try conversion into tensor
        try:
            line = ts(box)
        except Exception:
            yield ocr_record('', [], [])
            continue
        # check if line is non-zero
        if line.max() == line.min():
            yield ocr_record('', [], [])
            continue

        preds = network.predict(line)
        # calculate recognized LSTM locations of characters
        # scale between network output and network input
        net_scale = line.shape[2]/network.outputs.shape[1]
        # scale between network input and original line
        in_scale = box.size[0]/(line.shape[2]-2*pad)

        def _scale_val(val, min_val, max_val):
            return int(round(min(max(((val*net_scale)-pad)*in_scale, min_val), max_val)))

        # XXX: fix bounding box calculation ocr_record for multi-codepoint labels.
        pred = ''.join(x[0] for x in preds)
        pos = []
        conf = []
        for _, start, end, c in preds:
            if bounds['text_direction'].startswith('horizontal'):
                xmin = coords[0] + _scale_val(start, 0, box.size[0])
                xmax = coords[0] + _scale_val(end, 0, box.size[0])
                pos.append((xmin, coords[1], xmax, coords[3]))
            else:
                ymin = coords[1] + _scale_val(start, 0, box.size[1])
                ymax = coords[1] + _scale_val(end, 0, box.size[1])
                pos.append((coords[0], ymin, coords[2], ymax))
            conf.append(c)
        if bidi_reordering:
            logger.debug('BiDi reordering record.')
            yield bidi_record(ocr_record(pred, pos, conf))
        else:
            logger.debug('Emitting raw record')
            yield ocr_record(pred, pos, conf)
Example #15
0
def mm_rpred(
    nets: Dict[str, TorchSeqRecognizer],
    im: Image.Image,
    bounds: dict,
    pad: int = 16,
    bidi_reordering: bool = True,
    script_ignore: Optional[List[str]] = None
) -> Generator[ocr_record, None, None]:
    """
    Multi-model version of kraken.rpred.rpred.

    Takes a dictionary of ISO15924 script identifiers->models and a
    script-annotated segmentation to dynamically select appropriate models for
    these lines.

    Args:
        nets (dict): A dict mapping ISO15924 identifiers to TorchSeqRecognizer
                     objects. Recommended to be a defaultdict.
        im (PIL.Image.Image): Image to extract text from
        bounds (dict): A dictionary containing a 'boxes' entry with a list of
                       lists of tuples (script, (x0, y0, x1, y1)) describing
                       the text lines in the image and an entry
                       'text_direction' containing
                       'horizontal-lr/rl/vertical-lr/rl'.
        pad (int): Extra blank padding to the left and right of text line
        bidi_reordering (bool): Reorder classes in the ocr_record according to
                                the Unicode bidirectional algorithm for correct
                                display.
        script_ignore (list): List of scripts to ignore during recognition
    Yields:
        An ocr_record containing the recognized text, absolute character
        positions, and confidence values for each character.

    Raises:
        KrakenInputException if the mapping between segmentation scripts and
        networks is incomplete.
    """
    im_str = get_im_str(im)
    logger.info(
        'Running {} multi-script recognizers on {} with {} lines'.format(
            len(nets), im_str, len(bounds['boxes'])))

    miss = [x[0] for x in bounds['boxes'] if not nets.get(x[0])]
    if miss:
        raise KrakenInputException(
            'Missing models for scripts {}'.format(miss))

    # build dictionary for line preprocessing
    ts = {}
    for script, network in nets.items():
        logger.debug('Loading line transforms for {}'.format(script))
        batch, channels, height, width = network.nn.input
        ts[script] = generate_input_transforms(batch, height, width, channels,
                                               pad)

    for line in bounds['boxes']:
        rec = ocr_record('', [], [])
        for script, (box, coords) in zip(
                map(lambda x: x[0], line),
                extract_boxes(
                    im, {
                        'text_direction': bounds['text_direction'],
                        'boxes': map(lambda x: x[1], line)
                    })):
            # skip if script is set to ignore
            if script_ignore is not None and script in script_ignore:
                logger.info('Ignoring {} line segment.'.format(script))
                continue
            # check if boxes are non-zero in any dimension
            if sum(coords[::2]) == 0 or coords[3] - coords[1] == 0:
                logger.warning('Run with zero dimension. Skipping.')
                continue
            # try conversion into tensor
            try:
                logger.debug('Preparing run.')
                line = ts[script](box)
            except Exception:
                logger.warning(
                    'Conversion of line {} failed. Skipping.'.format(coords))
                yield ocr_record('', [], [])
                continue

            # check if line is non-zero
            if line.max() == line.min():
                logger.warning('Empty run. Skipping.')
                yield ocr_record('', [], [])
                continue

            logger.debug('Forward pass with model {}'.format(script))
            preds = nets[script].predict(line)

            # calculate recognized LSTM locations of characters
            logger.debug('Convert to absolute coordinates')
            scale = box.size[0] / (len(nets[script].outputs) - 2 * pad)
            pred = ''.join(x[0] for x in preds)
            pos = []
            conf = []

            for _, start, end, c in preds:
                if bounds['text_direction'].startswith('horizontal'):
                    xmin = coords[0] + int(max((start - pad) * scale, 0))
                    xmax = coords[0] + max(
                        int(min(
                            (end - pad) * scale, coords[2] - coords[0])), 1)
                    pos.append((xmin, coords[1], xmax, coords[3]))
                else:
                    ymin = coords[1] + int(max((start - pad) * scale, 0))
                    ymax = coords[1] + max(
                        int(min(
                            (end - pad) * scale, coords[3] - coords[1])), 1)
                    pos.append((coords[0], ymin, coords[2], ymax))
                conf.append(c)
            rec.prediction += pred
            rec.cuts.extend(pos)
            rec.confidences.extend(conf)
        if bidi_reordering:
            logger.debug('BiDi reordering record.')
            yield bidi_record(rec)
        else:
            logger.debug('Emitting raw record')
            yield rec
Example #16
0
def compute_segmentation_map(im: PIL.Image.Image,
                             mask: Optional[PIL.Image.Image] = None,
                             model: Optional[vgsl.TorchVGSLModel] = None,
                             device: str = 'cpu') -> Dict[str, Any]:
    """
    Args:
        im: Input image
        mask: A bi-level mask image of the same size as `im` where 0-valued
              regions are ignored for segmentation purposes. Disables column
              detection.
        model: A TorchVGSLModel containing a segmentation model.
        device: The target device to run the neural network on.

    Returns:
        A dictionary containing the heatmap ('heatmap', np.ndarray), class
        map ('cls_map', Dict[str, Dict[str, int]]), the bounding regions for
        polygonization purposes ('bounding_regions', List[str]), the scale
        between the input image and the network output ('scale', float), and
        the scaled input image to the network ('scal_im', PIL.Image.Image).

    Raises:
        KrakenInputException: When given an invalid mask.
    """
    im_str = get_im_str(im)
    logger.info(f'Segmenting {im_str}')

    if (model.input[1] == 1 and model.one_channel_mode == '1'
            and not is_bitonal(im)):
        logger.warning('Running binary model on non-binary input image '
                       '(mode {}). This will result in severely degraded '
                       'performance'.format(im.mode))

    model.eval()
    model.to(device)

    batch, channels, height, width = model.input
    transforms = dataset.ImageInputTransforms(batch,
                                              height,
                                              width,
                                              channels,
                                              0,
                                              valid_norm=False)
    tf_idx, _ = next(
        filter(lambda x: isinstance(x[1], tf.ToTensor),
               enumerate(transforms.transforms)))
    res_tf = tf.Compose(transforms.transforms[:tf_idx])
    scal_im = np.array(res_tf(im).convert('L'))

    tensor_im = transforms(im)
    if mask is not None:
        if mask.mode != '1' and not is_bitonal(mask):
            logger.error('Mask is not bitonal')
            raise KrakenInputException('Mask is not bitonal')
        mask = mask.convert('1')
        if mask.size != im.size:
            logger.error(
                f'Mask size {mask.size} doesn\'t match image size {im.size}')
            raise KrakenInputException(
                f'Mask size {mask.size} doesn\'t match image size {im.size}')
        logger.info('Masking enabled in segmenter.')
        tensor_im[~transforms(mask).bool()] = 0

    with torch.no_grad():
        logger.debug('Running network forward pass')
        o, _ = model.nn(tensor_im.unsqueeze(0).to(device))
    logger.debug('Upsampling network output')
    o = F.interpolate(o, size=scal_im.shape)
    o = o.squeeze().cpu().numpy()
    scale = np.divide(im.size, o.shape[:0:-1])
    bounding_regions = model.user_metadata.get('bounding_regions')
    return {
        'heatmap': o,
        'cls_map': model.user_metadata['class_mapping'],
        'bounding_regions': bounding_regions,
        'scale': scale,
        'scal_im': scal_im
    }
Example #18
0
def segment(im,
            text_direction: str = 'horizontal-lr',
            mask: Optional[Image.Image] = None,
            reading_order_fn: Callable = polygonal_reading_order,
            model: Optional[vgsl.TorchVGSLModel] = None,
            device: str = 'cpu'):
    """
    Segments a page into text lines using the baseline segmenter.

    Segments a page into text lines and returns the polyline formed by each
    baseline and their estimated environment.

    Args:
        im (PIL.Image): An RGB image.
        text_direction (str): Ignored by the segmenter but kept for
                              serialization.
        mask (PIL.Image): A bi-level mask image of the same size as `im` where
                          0-valued regions are ignored for segmentation
                          purposes. Disables column detection.
        reading_order_fn (function): Function to determine the reading order.
                                     Has to accept a list of tuples (baselines,
                                     polygon) and a text direction (`lr` or
                                     `rl`).
        model (vgsl.TorchVGSLModel): A TorchVGSLModel containing a segmentation
                                     model. If none is given a default model
                                     will be loaded.
        device (str or torch.Device): The target device to run the neural
                                      network on.

    Returns:
        {'text_direction': '$dir',
         'type': 'baselines',
         'lines': [
            {'script': 'default', 'baseline': [[x0, y0], [x1, y1], ..., [x_n, y_n]], 'boundary': [[x0, y0], [x1, y1], ..., [x_m, y_m]]},
            {'script': 'default', 'baseline': [[x0, ...]], 'boundary': [[x0, ...]]}
          ],
          'regions': {'$region_type': [[[x0, y0], [x1, y1], ..., [x_n, y_n]], ...], ...},
          'script_detection': True|False
        }: A dictionary containing the text direction and under the key 'lines'
        a list of reading order sorted baselines (polylines) and their
        respective polygonal boundaries. The first and last point of each
        boundary polygon are connected.

    Raises:
        KrakenInputException if the input image is not binarized or the text
        direction is invalid.
    """
    im_str = get_im_str(im)
    logger.info(f'Segmenting {im_str}')

    if model is None:
        logger.info('No segmentation model given. Loading default model.')
        model = vgsl.TorchVGSLModel.load_model(pkg_resources.resource_filename(__name__, 'blla.mlmodel'))

    if model.one_channel_mode == '1' and not is_bitonal(im):
        logger.warning('Running binary model on non-binary input image '
                       '(mode {}). This will result in severely degraded '
                       'performance'.format(im.mode))

    model.eval()
    model.to(device)

    if mask is not None:
        if mask.mode != '1' and not is_bitonal(mask):
            logger.error('Mask is not bitonal')
            raise KrakenInputException('Mask is not bitonal')
        mask = mask.convert('1')
        if mask.size != im.size:
            logger.error(f'Mask size {mask.size} doesn\'t match image size {im.size}')
            raise KrakenInputException(f'Mask size {mask.size} doesn\'t match image size {im.size}')
        logger.info('Masking enabled in segmenter.')
        mask = pil2array(mask)

    batch, channels, height, width = model.input
    transforms = dataset.generate_input_transforms(batch, height, width, channels, 0, valid_norm=False)
    res_tf = tf.Compose(transforms.transforms[:3])
    scal_im = res_tf(im).convert('L')

    with torch.no_grad():
        logger.debug('Running network forward pass')
        o = model.nn(transforms(im).unsqueeze(0).to(device))
    logger.debug('Upsampling network output')
    o = F.interpolate(o, size=scal_im.size[::-1])
    o = o.squeeze().cpu().numpy()
    scale = np.divide(im.size, o.shape[:0:-1])
    # postprocessing
    cls_map = model.user_metadata['class_mapping']
    st_sep = cls_map['aux']['_start_separator']
    end_sep = cls_map['aux']['_end_separator']

    logger.info('Vectorizing baselines')
    baselines = []
    regions = {}
    for bl_type, idx in cls_map['baselines'].items():
        logger.debug(f'Vectorizing lines of type {bl_type}')
        baselines.extend([(bl_type, x) for x in vectorize_lines(o[(st_sep, end_sep, idx), :, :])])
    logger.info('Vectorizing regions')
    for region_type, idx in cls_map['regions'].items():
        logger.debug(f'Vectorizing regions of type {region_type}')
        regions[region_type] = vectorize_regions(o[idx])
    logger.debug('Polygonizing lines')
    lines = list(filter(lambda x: x[2] is not None, zip([x[0] for x in baselines],
                                                        [x[1] for x in baselines],
                                                        calculate_polygonal_environment(scal_im, [x[1] for x in baselines]))))
    logger.debug('Scaling vectorized lines')
    sc = scale_polygonal_lines([x[1:] for x in lines], scale)
    lines = list(zip([x[0] for x in lines], [x[0] for x in sc], [x[1] for x in sc]))
    logger.debug('Scaling vectorized regions')
    for reg_id, regs in regions.items():
        regions[reg_id] = scale_regions(regs, scale)
    logger.debug('Reordering baselines')
    order_regs = []
    for regs in regions.values():
        order_regs.extend(regs)
    lines = reading_order_fn(lines=lines, regions=order_regs, text_direction=text_direction[-2:])

    if 'class_mapping' in model.user_metadata and len(model.user_metadata['class_mapping']['baselines']) > 1:
        script_detection = True
    else:
        script_detection = False

    return {'text_direction': text_direction,
            'type': 'baselines',
            'lines': [{'script': bl_type, 'baseline': bl, 'boundary': pl} for bl_type, bl, pl in lines],
            'regions': regions,
            'script_detection': script_detection}
Example #19
0
def segment(im,
            text_direction: str = 'horizontal-lr',
            scale: Optional[float] = None,
            maxcolseps: int = 2,
            black_colseps: bool = False,
            no_hlines: bool = True,
            pad: int = 0,
            mask: Optional[Image.Image] = None,
            reading_order_fn: Callable = reading_order) -> Dict[str, Any]:
    """
    Segments a page into text lines.

    Segments a page into text lines and returns the absolute coordinates of
    each line in reading order.

    Args:
        im (PIL.Image): A bi-level page of mode '1' or 'L'
        text_direction (str): Principal direction of the text
                              (horizontal-lr/rl/vertical-lr/rl)
        scale (float): Scale of the image
        maxcolseps (int): Maximum number of whitespace column separators
        black_colseps (bool): Whether column separators are assumed to be
                              vertical black lines or not
        no_hlines (bool): Switch for horizontal line removal
        pad (int or tuple): Padding to add to line bounding boxes. If int the
                            same padding is used both left and right. If a
                            2-tuple, uses (padding_left, padding_right).
        mask (PIL.Image): A bi-level mask image of the same size as `im` where
                          0-valued regions are ignored for segmentation
                          purposes. Disables column detection.
        reading_order_fn (Callable): Function to call to order line output.
                                     Callable accepting a list of slices (y, x)
                                     and a text direction in (`rl`, `lr`).

    Returns:
        {'text_direction': '$dir', 'boxes': [(x1, y1, x2, y2),...]}: A
        dictionary containing the text direction and a list of reading order
        sorted bounding boxes under the key 'boxes'.

    Raises:
        KrakenInputException if the input image is not binarized or the text
        direction is invalid.
    """
    im_str = get_im_str(im)
    logger.info(f'Segmenting {im_str}')

    if im.mode != '1' and not is_bitonal(im):
        logger.error(f'Image {im_str} is not bi-level')
        raise KrakenInputException(f'Image {im_str} is not bi-level')

    # rotate input image for vertical lines
    if text_direction.startswith('horizontal'):
        angle = 0
        offset = (0, 0)
    elif text_direction == 'vertical-lr':
        angle = 270
        offset = (0, im.size[1])
    elif text_direction == 'vertical-rl':
        angle = 90
        offset = (im.size[0], 0)
    else:
        logger.error(f'Invalid text direction \'{text_direction}\'')
        raise KrakenInputException(f'Invalid text direction {text_direction}')

    logger.debug(f'Rotating input image by {angle} degrees')
    im = im.rotate(angle, expand=True)

    a = pil2array(im)
    binary = np.array(a > 0.5 * (np.amin(a) + np.amax(a)), 'i')
    binary = 1 - binary

    if not scale:
        scale = estimate_scale(binary)

    if no_hlines:
        binary = remove_hlines(binary, scale)
    # emptyish images will cause exceptions here.

    try:
        if mask is not None:
            if mask.mode != '1' and not is_bitonal(mask):
                logger.error('Mask is not bitonal')
                raise KrakenInputException('Mask is not bitonal')
            mask = mask.convert('1')
            if mask.size != im.size:
                logger.error(
                    f'Mask size {mask.size} doesn\'t match image size {im.size}'
                )
                raise KrakenInputException(
                    f'Mask size {mask.size} doesn\'t match image size {im.size}'
                )
            logger.info(
                'Masking enabled in segmenter. Disabling column detection.')
            mask = mask.rotate(angle, expand=True)
            colseps = pil2array(mask)
        elif black_colseps:
            colseps, binary = compute_black_colseps(binary, scale, maxcolseps)
        else:
            colseps = compute_white_colseps(binary, scale, maxcolseps)
    except ValueError:
        logger.warning(
            f'Exception in column finder (probably empty image) for {im_str}')
        return {'text_direction': text_direction, 'boxes': []}

    bottom, top, boxmap = compute_gradmaps(binary, scale)
    seeds = compute_line_seeds(binary, bottom, top, colseps, scale)
    llabels = morph.propagate_labels(boxmap, seeds, conflict=0)
    spread = morph.spread_labels(seeds, maxdist=scale)
    llabels = np.where(llabels > 0, llabels, spread * binary)
    segmentation = llabels * binary

    lines = compute_lines(segmentation, scale)
    order = reading_order_fn([l.bounds for l in lines], text_direction[-2:])
    lsort = topsort(order)
    lines = [lines[i].bounds for i in lsort]
    lines = [(s2.start, s1.start, s2.stop, s1.stop) for s1, s2 in lines]

    if isinstance(pad, int):
        pad = (pad, pad)
    lines = [(max(x[0] - pad[0], 0), x[1], min(x[2] + pad[1],
                                               im.size[0]), x[3])
             for x in lines]

    return {
        'text_direction': text_direction,
        'boxes': rotate_lines(lines, 360 - angle, offset).tolist(),
        'script_detection': False
    }
Example #20
0
def nlbin(im: Image.Image,
          threshold: float = 0.5,
          zoom: float = 0.5,
          escale: float = 1.0,
          border: float = 0.1,
          perc: int = 80,
          range: int = 20,
          low: int = 5,
          high: int = 90) -> Image.Image:
    """
    Performs binarization using non-linear processing.

    Args:
        im (PIL.Image.Image): Image to binarize
        threshold (float): Threshold applied to the flattened image for the
                           final binarization
        zoom (float): Zoom for background page estimation
        escale (float): Scale for estimating a mask over the text region
        border (float): Ignore this much of the border
        perc (int): Percentage for filters
        range (int): Range for filters
        low (int): Percentile for black estimation
        high (int): Percentile for white estimation

    Returns:
        PIL.Image containing the binarized image

    Raises:
        KrakenInputException when trying to binarize an empty image.
    """
    im_str = get_im_str(im)
    logger.info('Binarizing {}'.format(im_str))
    if is_bitonal(im):
        logger.info('Skipping binarization because {} is bitonal.'.format(im_str))
        return im
    # convert to grayscale first
    logger.debug('Converting {} to grayscale'.format(im_str))
    im = im.convert('L')
    raw = pil2array(im)
    logger.debug('Scaling and normalizing')
    # rescale image to between -1 or 0 and 1
    raw = raw/float(np.iinfo(raw.dtype).max)
    # perform image normalization
    if np.amax(raw) == np.amin(raw):
        logger.warning('Trying to binarize empty image {}'.format(im_str))
        raise KrakenInputException('Image is empty')
    image = raw-np.amin(raw)
    image /= np.amax(image)

    logger.debug('Interpolation and percentile filtering')
    with warnings.catch_warnings():
        warnings.simplefilter('ignore', UserWarning)
        m = interpolation.zoom(image, zoom)
        m = filters.percentile_filter(m, perc, size=(range, 2))
        m = filters.percentile_filter(m, perc, size=(2, range))
        mh, mw = m.shape
        oh, ow = image.shape
        scale = np.diag([mh * 1.0/oh, mw * 1.0/ow])
        m = affine_transform(m, scale, output_shape=image.shape)
    w, h = np.minimum(np.array(image.shape), np.array(m.shape))
    flat = np.clip(image[:w, :h]-m[:w, :h]+1, 0, 1)

    # estimate low and high thresholds
    d0, d1 = flat.shape
    o0, o1 = int(border*d0), int(border*d1)
    est = flat[o0:d0-o0, o1:d1-o1]
    logger.debug('Threshold estimates {}'.format(est))
    # by default, we use only regions that contain
    # significant variance; this makes the percentile
    # based low and high estimates more reliable
    logger.debug('Refine estimates')
    v = est-filters.gaussian_filter(est, escale*20.0)
    v = filters.gaussian_filter(v**2, escale*20.0)**0.5
    v = (v > 0.3*np.amax(v))
    v = morphology.binary_dilation(v, structure=np.ones((int(escale * 50), 1)))
    v = morphology.binary_dilation(v, structure=np.ones((1, int(escale * 50))))
    est = est[v]
    lo = np.percentile(est.ravel(), low)
    hi = np.percentile(est.ravel(), high)
    flat -= lo
    flat /= (hi-lo)
    flat = np.clip(flat, 0, 1)
    logger.debug('Thresholding at {}'.format(threshold))
    bin_im = np.array(255*(flat > threshold), 'B')
    return array2pil(bin_im)