Example #1
def nlbin(im, threshold=0.5, zoom=0.5, escale=1.0, border=0.1, perc=80,
          range=20, low=5, high=90):
    """
    Performs binarization using non-linear processing.

    Args:
        im (PIL.Image): Input image
        threshold (float): Threshold for binarization
        zoom (float): Zoom for background page estimation
        escale (float): Scale for estimating a mask over the text region
        border (float): Ignore this much of the border
        perc (int): Percentage for filters
        range (int): Range for filters
        low (int): Percentile for black estimation
        high (int): Percentile for white estimation

    Returns:
        PIL.Image containing the binarized image
    """
    if im.mode == '1':
        return im
    raw = pil2array(im)
    # rescale image to between -1 or 0 and 1
    raw = raw/float(np.iinfo(raw.dtype).max)
    if raw.ndim == 3:
        raw = np.mean(raw, 2)
    # perform image normalization
    if np.amax(raw) == np.amin(raw):
        raise KrakenInputException('Image is empty')
    image = raw-np.amin(raw)
    image /= np.amax(image)

    m = interpolation.zoom(image, zoom)
    m = filters.percentile_filter(m, perc, size=(range, 2))
    m = filters.percentile_filter(m, perc, size=(2, range))
    m = interpolation.zoom(m, 1.0/zoom)
    w, h = np.minimum(np.array(image.shape), np.array(m.shape))
    flat = np.clip(image[:w, :h]-m[:w, :h]+1, 0, 1)

    # estimate low and high thresholds
    d0, d1 = flat.shape
    o0, o1 = int(border*d0), int(border*d1)
    est = flat[o0:d0-o0, o1:d1-o1]
    # by default, we use only regions that contain
    # significant variance; this makes the percentile
    # based low and high estimates more reliable
    v = est-filters.gaussian_filter(est, escale*20.0)
    v = filters.gaussian_filter(v**2, escale*20.0)**0.5
    v = (v > 0.3*np.amax(v))
    v = morphology.binary_dilation(v, structure=np.ones((int(escale*50), 1)))
    v = morphology.binary_dilation(v, structure=np.ones((1, int(escale*50))))
    est = est[v]
    lo = np.percentile(est.ravel(), low)
    hi = np.percentile(est.ravel(), high)

    flat -= lo
    flat /= (hi-lo)
    flat = np.clip(flat, 0, 1)
    bin = np.array(255*(flat > threshold), 'B')
    return array2pil(bin)
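
A minimal usage sketch for the function above (assuming Pillow is installed; page.png is a hypothetical input file):

from PIL import Image

# Binarize a grayscale or RGB scan with the default parameters.
im = Image.open('page.png')
bw = nlbin(im, threshold=0.5)
bw.save('page.bin.png')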
Example #2
def degrade_line(im, eta=0, alpha=1.7, beta=1.7, alpha_0=1, beta_0=1):
    """
    Degrades a line image by adding noise

    Args:
        im (PIL.Image): Input image

    Returns:
        PIL.Image in mode 'L'
    """
    im = pil2array(im)
    im = np.amax(im) - im
    im = im * 1.0 / np.amax(im)

    # foreground distance transform and flipping to white probability
    fg_dist = distance_transform_cdt(im, metric='taxicab')
    fg_prob = alpha_0 * np.exp(-alpha * (fg_dist**2)) + eta
    fg_prob[im == 0] = 0
    fg_flip = np.random.binomial(1, fg_prob)

    # background distance transform and flipping to black probability
    bg_dist = distance_transform_cdt(1 - im, metric='taxicab')
    bg_prob = beta_0 * np.exp(-beta * (bg_dist**2)) + eta
    bg_prob[im == 1] = 0
    bg_flip = np.random.binomial(1, bg_prob)

    # flip
    im -= fg_flip
    im += bg_flip
    # close with a 2x2 structuring element
    sel = np.array([[1, 1], [1, 1]])
    im = binary_closing(im, sel)
    return array2pil(255 - im.astype('B') * 255)
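
The flip probabilities above decay with the squared taxicab distance from the fore-/background boundary. A quick standalone illustration of that decay with the defaults (plain NumPy; printed values are rounded):

import numpy as np

alpha, alpha_0, eta = 1.7, 1, 0
for d in range(4):
    # probability that a pixel at taxicab distance d from the boundary flips
    print(d, alpha_0 * np.exp(-alpha * d**2) + eta)
# prints roughly: 0 1.0, 1 0.18, 2 0.0011, 3 2.3e-07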
Example #3
def compute_segmentation_map(im,
                             mask: Optional[np.array] = None,
                             model=None,
                             device: str = 'cpu'):
    """

    """
    im_str = get_im_str(im)
    logger.info(f'Segmenting {im_str}')

    if model.input[1] == 1 and model.one_channel_mode == '1' and not is_bitonal(im):
        logger.warning('Running binary model on non-binary input image '
                       '(mode {}). This will result in severely degraded '
                       'performance'.format(im.mode))

    model.eval()
    model.to(device)

    if mask:
        if mask.mode != '1' and not is_bitonal(mask):
            logger.error('Mask is not bitonal')
            raise KrakenInputException('Mask is not bitonal')
        mask = mask.convert('1')
        if mask.size != im.size:
            logger.error(f'Mask size {mask.size} doesn\'t match image size {im.size}')
            raise KrakenInputException(f'Mask size {mask.size} doesn\'t match image size {im.size}')
        logger.info('Masking enabled in segmenter.')
        mask = pil2array(mask)

    batch, channels, height, width = model.input
    transforms = dataset.generate_input_transforms(batch,
                                                   height,
                                                   width,
                                                   channels,
                                                   0,
                                                   valid_norm=False)
    res_tf = tf.Compose(transforms.transforms[:3])
    scal_im = res_tf(im).convert('L')

    with torch.no_grad():
        logger.debug('Running network forward pass')
        o, _ = model.nn(transforms(im).unsqueeze(0).to(device))
    logger.debug('Upsampling network output')
    o = F.interpolate(o, size=scal_im.size[::-1])
    o = o.squeeze().cpu().numpy()
    scale = np.divide(im.size, o.shape[:0:-1])
    bounding_regions = model.user_metadata[
        'bounding_regions'] if 'bounding_regions' in model.user_metadata else None
    return {
        'heatmap': o,
        'cls_map': model.user_metadata['class_mapping'],
        'bounding_regions': bounding_regions,
        'scale': scale,
        'scal_im': scal_im
    }
Example #4
def distort_line(im, distort=3.0, sigma=10, eps=0.03, delta=0.3):
    """
    Distorts a line image.

    Run BEFORE degrade_line as a white border of 5 pixels will be added.

    Args:
        im (PIL.Image): Input image
        distort (float): Maximum displacement of the distortion fields
        sigma (float): Smoothing (Gaussian sigma) of the displacement fields
        eps (float): Scale of the random affine shear/scaling
        delta (float): Scale of the random affine translation

    Returns:
        PIL.Image in mode 'L'
    """
    w, h = im.size
    # XXX: determine correct output shape from transformation matrices instead
    # of guesstimating.
    logger.debug('Pasting source image into canvas')
    image = Image.new('L', (int(1.5 * w), 4 * h), 255)
    image.paste(im, (int((image.size[0] - w) / 2), int(
        (image.size[1] - h) / 2)))
    line = pil2array(image.convert('L'))

    # shear in y direction with factor eps * randn(), scaling with 1 + eps *
    # randn() in x/y axis (all offset at d)
    logger.debug('Performing affine transformation')
    m = np.array([[1 + eps * np.random.randn(), 0.0],
                  [eps * np.random.randn(), 1.0 + eps * np.random.randn()]])
    c = np.array([w / 2.0, h / 2])
    d = c - np.dot(m, c) + np.array(
        [np.random.randn() * delta,
         np.random.randn() * delta])
    line = affine_transform(line,
                            m,
                            offset=d,
                            order=1,
                            mode='constant',
                            cval=255)

    hs = gaussian_filter(np.random.randn(4 * h, int(1.5 * w)), sigma)
    ws = gaussian_filter(np.random.randn(4 * h, int(1.5 * w)), sigma)
    hs *= distort / np.amax(hs)
    ws *= distort / np.amax(ws)

    def _f(p):
        return (p[0] + hs[p[0], p[1]], p[1] + ws[p[0], p[1]])

    logger.debug('Performing geometric transformation')
    im = array2pil(geometric_transform(line, _f, order=1, mode='nearest'))
    logger.debug('Cropping canvas to content box')
    im = im.crop(ImageOps.invert(im).getbbox())
    return im
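
As the docstring notes, distortion must run before degradation. A sketch of that ordering (assuming the degrade_line variant from Example #2 is in scope; line.png is a hypothetical input):

from PIL import Image

line = Image.open('line.png').convert('L')
line = distort_line(line, distort=3.0, sigma=10)  # geometric jitter first
line = degrade_line(line)                         # then stochastic pixel flips
line.save('line.degraded.png')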
Example #5
def ocropy_degrade(im, distort=1.0, dsigma=20.0, eps=0.03, delta=0.3, degradations=[(0.5, 0.0, 0.5, 0.0)]):
    """
    Degrades and distorts a line using the same noise model used by ocropus.

    Args:
        im (PIL.Image): Input image
        distort (float): Maximum displacement of the distortion fields
        dsigma (float): Smoothing (Gaussian sigma) of the displacement fields
        eps (float): Scale of the random affine shear/scaling
        delta (float): Scale of the random affine translation
        degradations (list): list of 4-tuples corresponding to
                             the degradations argument of ocropus-linegen.

    Returns:
        PIL.Image in mode 'L'
    """
    w, h = im.size
    # XXX: determine correct output shape from transformation matrices instead
    # of guesstimating.
    image = Image.new('L', (int(1.5*w), 4*h), 255)
    image.paste(im, (int((image.size[0] - w) / 2), int((image.size[1] - h) / 2)))
    a = pil2array(image.convert('L'))
    (sigma, ssigma, threshold, sthreshold) = degradations[np.random.choice(len(degradations))]
    sigma += (2 * np.random.rand() - 1) * ssigma
    threshold += (2 * np.random.rand() - 1) * sthreshold
    a = a * 1.0 / np.amax(a)
    if sigma > 0.0:
        a = gaussian_filter(a, sigma)
    a += np.clip(np.random.randn(*a.shape) * 0.2, -0.25, 0.25)
    m = np.array([[1 + eps * np.random.randn(), 0.0],
                  [eps * np.random.randn(), 1.0 + eps * np.random.randn()]])
    w, h = a.shape
    c = np.array([w / 2.0, h / 2])
    d = c - np.dot(m, c) + np.array([np.random.randn() * delta,
                                     np.random.randn() * delta])
    a = affine_transform(a, m, offset=d, order=1, mode='constant', cval=a[0, 0])
    a = np.array(a > threshold, 'f')
    [[r, c]] = find_objects(np.array(a == 0, 'i'))
    r0 = r.start
    r1 = r.stop
    c0 = c.start
    c1 = c.stop
    a = a[r0-5:r1+5, c0-5:c1+5]
    if distort > 0:
        h, w = a.shape
        hs = np.random.randn(h, w)
        ws = np.random.randn(h, w)
        hs = gaussian_filter(hs, dsigma)
        ws = gaussian_filter(ws, dsigma)
        hs *= distort / np.amax(hs)
        ws *= distort / np.amax(ws)

        def f(p):
            return (p[0] + hs[p[0], p[1]], p[1] + ws[p[0], p[1]])

        a = geometric_transform(a, f, output_shape=(h, w), order=1,
                                mode='constant', cval=np.amax(a))
    im = array2pil(a).convert('L')
    return im
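
A sketch passing an explicit degradation range (each 4-tuple is (sigma, sigma spread, threshold, threshold spread), mirroring ocropus-linegen; the values here are purely illustrative):

from PIL import Image

line = Image.open('line.png').convert('L')
noisy = ocropy_degrade(line,
                       degradations=[(0.5, 0.1, 0.5, 0.05),
                                     (1.0, 0.3, 0.4, 0.05)])
noisy.save('line.noisy.png')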
Example #6
def segment(im, scale=None, black_colseps=False):
    """
    Segments a page into text lines.

    Segments a page into text lines and returns the absolute coordinates of
    each line in reading order.

    Args:
        im (PIL.Image): A bi-level page of mode '1' or 'L'
        scale (float): Scale of the image
        black_colseps (bool): Whether column separators are assumed to be
                              vertical black lines or not

    Returns:
        [(x1, y1, x2, y2),...]: A list of tuples containing the bounding boxes
                                of the segmented lines in reading order.

    Raises:
        KrakenInputException if the input image is not binarized
    """

    if im.mode != '1' and not is_bitonal(im):
        raise KrakenInputException('Image is not bi-level')
    # honestly I've got no idea what's going on here. In theory a simple
    # np.array(im, 'i') should suffice here but for some reason the
    # tostring/fromstring magic in pil2array alters the array in a way that is
    # needed for the algorithm to work correctly.
    a = pil2array(im)
    binary = np.array(a > 0.5 * (np.amin(a) + np.amax(a)), 'i')
    binary = 1 - binary

    if not scale:
        scale = estimate_scale(binary)

    binary = remove_hlines(binary, scale)
    if black_colseps:
        colseps, binary = compute_black_colseps(binary, scale)
    else:
        colseps = compute_white_colseps(binary, scale)
    bottom, top, boxmap = compute_gradmaps(binary, scale)
    seeds = compute_line_seeds(binary, bottom, top, colseps, scale)
    llabels = morph.propagate_labels(boxmap, seeds, conflict=0)
    spread = morph.spread_labels(seeds, maxdist=scale)
    llabels = np.where(llabels > 0, llabels, spread * binary)
    segmentation = llabels * binary

    lines = compute_lines(segmentation, scale)
    order = reading_order([l.bounds for l in lines])
    lsort = topsort(order)
    lines = [lines[i].bounds for i in lsort]
    return [(s2.start, s1.start, s2.stop, s1.stop) for s1, s2 in lines]
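
A usage sketch chaining binarization and segmentation, then cropping out each detected line (assuming the nlbin from Example #1; page.png is hypothetical):

from PIL import Image

im = nlbin(Image.open('page.png'))
for i, box in enumerate(segment(im)):
    # box is an (x1, y1, x2, y2) tuple in reading order
    im.crop(box).save(f'line_{i:04d}.png')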
Example #7
def segment(im, scale=None, black_colseps=False):
    """
    Segments a page into text lines.

    Segments a page into text lines and returns the absolute coordinates of
    each line in reading order.

    Args:
        im (PIL.Image): A bi-level page of mode '1' or 'L'
        scale (float): Scale of the image
        black_colseps (bool): Whether column separators are assumed to be
                              vertical black lines or not

    Returns:
        [(x1, y1, x2, y2),...]: A list of tuples containing the bounding boxes
                                of the segmented lines in reading order.

    Raises:
        KrakenInputException if the input image is not binarized
    """

    if im.mode != '1' and im.histogram().count(0) != 254:
        raise KrakenInputException('Image is not bi-level')
    # honestly I've got no idea what's going on here. In theory a simple
    # np.array(im, 'i') should suffice here but for some reason the
    # tostring/fromstring magic in pil2array alters the array in a way that is
    # needed for the algorithm to work correctly.
    a = pil2array(im)
    binary = np.array(a > 0.5*(np.amin(a) + np.amax(a)), 'i')
    binary = 1 - binary

    if not scale:
        scale = estimate_scale(binary)

    binary = remove_hlines(binary, scale)
    if black_colseps:
        colseps, binary = compute_black_colseps(binary, scale)
    else:
        colseps = compute_white_colseps(binary, scale)
    bottom, top, boxmap = compute_gradmaps(binary, scale)
    seeds = compute_line_seeds(binary, bottom, top, colseps, scale)
    llabels = morph.propagate_labels(boxmap, seeds, conflict=0)
    spread = morph.spread_labels(seeds, maxdist=scale)
    llabels = np.where(llabels > 0, llabels, spread*binary)
    segmentation = llabels*binary

    lines = compute_lines(segmentation, scale)
    order = reading_order([l.bounds for l in lines])
    lsort = topsort(order)
    lines = [lines[i].bounds for i in lsort]
    return [(s2.start, s1.start, s2.stop, s1.stop) for s1, s2 in lines]
Example #8
def degrade_line(im, eta=0.0, alpha=1.5, beta=1.5, alpha_0=1.0, beta_0=1.0):
    """
    Degrades a line image by adding noise.

    For parameter meanings consult [1].

    Args:
        im (PIL.Image): Input image
        eta (float):
        alpha (float):
        beta (float):
        alpha_0 (float):
        beta_0 (float):

    Returns:
        PIL.Image in mode '1'
    """
    logger.debug('Inverting and normalizing input image')
    im = pil2array(im)
    im = np.amax(im) - im
    im = im * 1.0 / np.amax(im)

    logger.debug('Calculating foreground distance transform')
    fg_dist = distance_transform_cdt(1 - im, metric='taxicab')
    logger.debug('Calculating flip to white probability')
    fg_prob = alpha_0 * np.exp(-alpha * (fg_dist**2)) + eta
    fg_prob[im == 1] = 0
    fg_flip = np.random.binomial(1, fg_prob)

    logger.debug('Calculating background distance transform')
    bg_dist = distance_transform_cdt(im, metric='taxicab')
    logger.debug('Calculating flip to black probability')
    bg_prob = beta_0 * np.exp(-beta * (bg_dist**2)) + eta
    bg_prob[im == 0] = 0
    bg_flip = np.random.binomial(1, bg_prob)

    # flip
    logger.debug('Flipping')
    im -= bg_flip
    im += fg_flip

    logger.debug('Binary closing')
    sel = np.array([[1, 1], [1, 1]])
    im = binary_closing(im, sel)
    logger.debug('Converting to image')
    return array2pil(255 - im.astype('B') * 255)
Example #9
def degrade_line(im, eta=0.0, alpha=1.5, beta=1.5, alpha_0=1.0, beta_0=1.0):
    """
    Degrades a line image by adding noise.

    For parameter meanings consult [1].

    Args:
        im (PIL.Image): Input image
        eta (float):
        alpha (float):
        beta (float):
        alpha_0 (float):
        beta_0 (float):

    Returns:
        PIL.Image in mode '1'
    """
    logger.debug(u'Inverting and normalizing input image')
    im = pil2array(im)
    im = np.amax(im)-im
    im = im*1.0/np.amax(im)

    logger.debug(u'Calculating foreground distance transform')
    fg_dist = distance_transform_cdt(1-im, metric='taxicab')
    logger.debug(u'Calculating flip to white probability')
    fg_prob = alpha_0 * np.exp(-alpha * (fg_dist**2)) + eta
    fg_prob[im == 1] = 0
    fg_flip = np.random.binomial(1, fg_prob)

    logger.debug(u'Calculating background distance transform')
    bg_dist = distance_transform_cdt(im, metric='taxicab')
    logger.debug(u'Calculating flip to black probability')
    bg_prob = beta_0 * np.exp(-beta * (bg_dist**2)) + eta
    bg_prob[im == 0] = 0
    bg_flip = np.random.binomial(1, bg_prob)

    # flip
    logger.debug(u'Flipping')
    im -= bg_flip
    im += fg_flip

    logger.debug(u'Binary closing')
    sel = np.array([[1, 1], [1, 1]])
    im = binary_closing(im, sel)
    logger.debug(u'Converting to image')
    return array2pil(255-im.astype('B')*255)
Example #10
def distort_line(im, distort=3.0, sigma=10, eps=0.03, delta=0.3):
    """
    Distorts a line image.

    Run BEFORE degrade_line as a white border of 5 pixels will be added.

    Args:
        im (PIL.Image): Input image
        distort (float): Maximum displacement of the distortion fields
        sigma (float): Smoothing (Gaussian sigma) of the displacement fields
        eps (float): Scale of the random affine shear/scaling
        delta (float): Scale of the random affine translation

    Returns:
        PIL.Image in mode 'L'
    """
    w, h = im.size
    # XXX: determine correct output shape from transformation matrices instead
    # of guesstimating.
    logger.debug(u'Pasting source image into canvas')
    image = Image.new('L', (int(1.5*w), 4*h), 255)
    image.paste(im, (int((image.size[0] - w) / 2), int((image.size[1] - h) / 2)))
    line = pil2array(image.convert('L'))

    # shear in y direction with factor eps * randn(), scaling with 1 + eps *
    # randn() in x/y axis (all offset at d)
    logger.debug(u'Performing affine transformation')
    m = np.array([[1 + eps * np.random.randn(), 0.0], [eps * np.random.randn(), 1.0 + eps * np.random.randn()]])
    c = np.array([w/2.0, h/2])
    d = c - np.dot(m, c) + np.array([np.random.randn() * delta, np.random.randn() * delta])
    line = affine_transform(line, m, offset=d, order=1, mode='constant', cval=255)

    hs = gaussian_filter(np.random.randn(4*h, int(1.5*w)), sigma)
    ws = gaussian_filter(np.random.randn(4*h, int(1.5*w)), sigma)
    hs *= distort/np.amax(hs)
    ws *= distort/np.amax(ws)

    def _f(p):
        return (p[0] + hs[p[0], p[1]], p[1] + ws[p[0], p[1]])

    logger.debug(u'Performing geometric transformation')
    im = array2pil(geometric_transform(line, _f, order=1, mode='nearest'))
    logger.debug(u'Cropping canvas to content box')
    im = im.crop(ImageOps.invert(im).getbbox())
    return im
Example #11
    def add(self, image, split=lambda x: os.path.splitext(x)[0],
                 suffix='.gt.txt', normalization=None, reorder=True,
                 pad=16):
        """
        Adds a single image to the training set.
        """
        with click.open_file(split(image) + suffix, 'r', encoding='utf-8') as fp:
            gt = fp.read()
            if normalization:
                gt = unicodedata.normalize(normalization, gt)
            if reorder:
                gt = bd.get_display(gt)

            im = Image.open(image)
            im = rpred.dewarp(self.lnorm, im)
            im = pil2array(im)
            im = lstm.prepare_line(im, pad)
            self.training_set.append((im, gt))
Example #12
def dewarp(normalizer, im):
    """
    Dewarps an image of a line using a kraken.lib.lineest.CenterNormalizer
    instance.

    Args:
        normalizer (kraken.lib.lineest.CenterNormalizer): A line normalizer
                                                          instance
        im (PIL.Image): Image to dewarp

    Returns:
        PIL.Image containing the dewarped image.
    """
    line = pil2array(im)
    temp = np.amax(line) - line
    temp = temp * 1.0 / np.amax(temp)
    normalizer.measure(temp)
    line = normalizer.normalize(line, cval=np.amax(line))
    return array2pil(line)
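
A sketch dewarping a single line image (the callers above construct CenterNormalizer() without arguments, so the same is assumed here; line.png is hypothetical):

from PIL import Image
from kraken.lib.lineest import CenterNormalizer

lnorm = CenterNormalizer()
line = Image.open('line.png').convert('L')
dewarped = dewarp(lnorm, line)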
Example #13
def dewarp(normalizer: CenterNormalizer, im: Image.Image) -> Image.Image:
    """
    Dewarps an image of a line using a kraken.lib.lineest.CenterNormalizer
    instance.

    Args:
        normalizer (kraken.lib.lineest.CenterNormalizer): A line normalizer
                                                          instance
        im (PIL.Image.Image): Image to dewarp

    Returns:
        PIL.Image containing the dewarped image.
    """
    line = pil2array(im)
    temp = np.amax(line)-line
    temp = temp*1.0/np.amax(temp)
    normalizer.measure(temp)
    line = normalizer.normalize(line, cval=np.amax(line))
    return array2pil(line)
Example #14
def degrade_line(im, mean=0.0, sigma=0.001, density=0.002):
    """
    Degrades a line image by adding several kinds of noise.

    Args:
        im (PIL.Image): Input image
        mean (float): Mean of distribution for Gaussian noise
        sigma (float): Standard deviation for Gaussian noise
        density (float): Noise density for Salt and Pepper noise

    Returns:
        PIL.Image in mode 'L'
    """
    im = pil2array(im)
    m = np.amax(im)
    im = gaussian_filter(im.astype('f') / m, 0.5)
    im += np.random.normal(mean, sigma, im.shape)
    flipped = np.ceil(density / 2 * im.size)
    coords = [np.random.randint(0, i - 1, int(flipped)) for i in im.shape]
    im[tuple(coords)] = 255
    coords = [np.random.randint(0, i - 1, int(flipped)) for i in im.shape]
    im[tuple(coords)] = 0
    return array2pil(np.clip(im * m, 0, 255).astype('uint8'))
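
For a sense of scale: a 48 x 300 pixel line image has 14,400 pixels, so the default density=0.002 sets roughly ceil(0.001 * 14400) = 15 pixels to salt (255) and another 15 to pepper (0).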
Example #15
def degrade_line(im, mean=0.0, sigma=0.001, density=0.002):
    """
    Degrades a line image by adding several kinds of noise.

    Args:
        im (PIL.Image): Input image
        mean (float): Mean of distribution for Gaussian noise
        sigma (float): Standard deviation for Gaussian noise
        density (float): Noise density for Salt and Pepper noise

    Returns:
        PIL.Image in mode 'L'
    """
    im = pil2array(im)
    m = np.amax(im)
    im = gaussian_filter(im.astype('f')/m, 0.5)
    im += np.random.normal(mean, sigma, im.shape)
    flipped = np.ceil(density/2 * im.size)
    coords = [np.random.randint(0, i - 1, int(flipped)) for i in im.shape]
    im[tuple(coords)] = 255
    coords = [np.random.randint(0, i - 1, int(flipped)) for i in im.shape]
    im[tuple(coords)] = 0
    return array2pil(np.clip(im * m, 0, 255).astype('uint8'))
Example #16
    def add(self,
            image,
            split=lambda x: os.path.splitext(x)[0],
            suffix='.gt.txt',
            normalization=None,
            reorder=True,
            pad=16):
        """
        Adds a single image to the training set.
        """
        with click.open_file(split(image) + suffix, 'r',
                             encoding='utf-8') as fp:
            gt = fp.read()
            if normalization:
                gt = unicodedata.normalize(normalization, gt)
            if reorder:
                gt = bd.get_display(gt)

            im = Image.open(image)
            im = rpred.dewarp(self.lnorm, im)
            im = pil2array(im)
            im = lstm.prepare_line(im, pad)
            self.training_set.append((im, gt))
Example #17
def rpred(network,
          im,
          bounds,
          pad=16,
          line_normalization=True,
          bidi_reordering=True):
    """
    Uses a RNN to recognize text

    Args:
        network (kraken.lib.lstm.SegRecognizer): A SegRecognizer object
        im (PIL.Image): Image to extract text from
        bounds (dict): A dictionary containing a 'boxes' entry with a list of
                       coordinates (x0, y0, x1, y1) of a text line in the image
                       and an entry 'text_direction' containing
                       'horizontal-lr/rl/vertical-lr/rl'.
        pad (int): Extra blank padding to the left and right of text line
        line_normalization (bool): Dewarp line using the line estimator
                                   contained in the network. If no normalizer
                                   is available one using the default
                                   parameters is created. Be aware that you may
                                   have to scale lines manually to the target
                                   line height if disabled.
        bidi_reordering (bool): Reorder classes in the ocr_record according to
                                the Unicode bidirectional algorithm for correct
                                display.
    Yields:
        An ocr_record containing the recognized text, absolute character
        positions, and confidence values for each character.
    """

    lnorm = getattr(network, 'lnorm', CenterNormalizer())

    for box, coords in extract_boxes(im, bounds):
        # check if boxes are non-zero in any dimension
        if sum(coords[::2]) == 0 or coords[3] - coords[1] == 0:
            yield ocr_record('', [], [])
            continue
        raw_line = pil2array(box)
        # check if line is non-zero
        if np.amax(raw_line) == np.amin(raw_line):
            yield ocr_record('', [], [])
            continue
        if line_normalization:
            # fail gracefully and return no recognition result in case the
            # input line can not be normalized.
            try:
                box = dewarp(lnorm, box)
            except Exception:
                yield ocr_record('', [], [])
                continue
        line = pil2array(box)
        line = lstm.prepare_line(line, pad)
        pred = network.predictString(line)

        # calculate recognized LSTM locations of characters
        scale = len(raw_line.T) / (len(network.outputs) - 2 * pad)
        result = lstm.translate_back_locations(network.outputs)
        pos = []
        conf = []

        for _, start, end, c in result:
            if bounds['text_direction'].startswith('horizontal'):
                pos.append((coords[0] + int(
                    (start - pad) * scale), coords[1], coords[0] + int(
                        (end - pad / 2) * scale), coords[3]))
            else:
                pos.append((coords[0], coords[1] + int(
                    (start - pad) * scale), coords[2], coords[1] + int(
                        (end - pad / 2) * scale)))
            conf.append(c)
        if bidi_reordering:
            yield bidi_record(ocr_record(pred, pos, conf))
        else:
            yield ocr_record(pred, pos, conf)
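
An end-to-end sketch tying the pieces together (assuming the nlbin from Example #1, the dict-returning segment variant from Example #19, and an already loaded kraken.lib.lstm.SegRecognizer as network; page.png is hypothetical):

from PIL import Image

im = nlbin(Image.open('page.png'))
bounds = segment(im, text_direction='horizontal-lr')
for record in rpred(network, im, bounds, pad=16):
    # each ocr_record carries .prediction, .cuts and .confidences
    print(record.prediction)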
Example #18
def mm_rpred(nets,
             im,
             bounds,
             pad=16,
             line_normalization=True,
             bidi_reordering=True):
    """
    Multi-model version of kraken.rpred.rpred.

    Takes a dictionary of ISO15924 script identifiers->models and a
    script-annotated segmentation to dynamically select appropriate models for
    these lines.

    Args:
        nets (dict): A dict mapping ISO15924 identifiers to SegRecognizer
                     objects. Recommended to be a defaultdict.
        im (PIL.Image): Image to extract text from
        bounds (dict): A dictionary containing a 'boxes' entry with a list of
                       lists of coordinates (script, (x0, y0, x1, y1)) of a
                       text line in the image and an entry 'text_direction'
                       containing 'horizontal-lr/rl/vertical-lr/rl'.
        pad (int): Extra blank padding to the left and right of text line
        line_normalization (bool): Dewarp line using the line estimator
                                   contained in the network. If no normalizer
                                   is available one using the default
                                   parameters is created. Be aware that you may
                                   have to scale lines manually to the target
                                   line height if disabled.
        bidi_reordering (bool): Reorder classes in the ocr_record according to
                                the Unicode bidirectional algorithm for correct
                                display.
    Yields:
        An ocr_record containing the recognized text, absolute character
        positions, and confidence values for each character.
    """
    for line in bounds['boxes']:
        rec = ocr_record('', [], [])
        for script, (box, coords) in zip(
                map(lambda x: x[0], line),
                extract_boxes(
                    im, {
                        'text_direction': bounds['text_direction'],
                        'boxes': map(lambda x: x[1], line)
                    })):
            # check if boxes are non-zero in any dimension
            if sum(coords[::2]) == 0 or coords[3] - coords[1] == 0:
                continue
            raw_line = pil2array(box)
            # check if line is non-zero
            if np.amax(raw_line) == np.amin(raw_line):
                continue
            if line_normalization:
                # fail gracefully and return no recognition result in case the
                # input line can not be normalized.
                try:
                    lnorm = getattr(nets[script], 'lnorm', CenterNormalizer())
                    box = dewarp(lnorm, box)
                except Exception as e:
                    continue
            line = pil2array(box)
            line = lstm.prepare_line(line, pad)
            pred = nets[script].predictString(line)
            # calculate recognized LSTM locations of characters
            scale = len(raw_line.T) / (len(nets[script].outputs) - 2 * pad)
            result = lstm.translate_back_locations(nets[script].outputs)
            pos = []
            conf = []

            for _, start, end, c in result:
                if bounds['text_direction'].startswith('horizontal'):
                    pos.append((coords[0] + int(
                        (start - pad) * scale), coords[1], coords[0] + int(
                            (end - pad / 2) * scale), coords[3]))
                else:
                    pos.append((coords[0], coords[1] + int(
                        (start - pad) * scale), coords[2], coords[1] + int(
                            (end - pad / 2) * scale)))
                conf.append(c)
            rec.prediction += pred
            rec.cuts.extend(pos)
            rec.confidences.extend(conf)
        if bidi_reordering:
            yield bidi_record(rec)
        else:
            yield rec
Example #19
def segment(im, text_direction='horizontal-lr', scale=None, maxcolseps=2,
            black_colseps=False, no_hlines=True, pad=0, mask=None):
    """
    Segments a page into text lines.

    Segments a page into text lines and returns the absolute coordinates of
    each line in reading order.

    Args:
        im (PIL.Image): A bi-level page of mode '1' or 'L'
        text_direction (str): Principal direction of the text
                              (horizontal-lr/rl/vertical-lr/rl)
        scale (float): Scale of the image
        maxcolseps (int): Maximum number of whitespace column separators
        black_colseps (bool): Whether column separators are assumed to be
                              vertical black lines or not
        no_hlines (bool): Switch for horizontal line removal
        pad (int or tuple): Padding to add to line bounding boxes. If int the
                            same padding is used both left and right. If a
                            2-tuple, uses (padding_left, padding_right).
        mask (PIL.Image): A bi-level mask image of the same size as `im` where
                          0-valued regions are ignored for segmentation
                          purposes. Disables column detection.

    Returns:
        {'text_direction': '$dir', 'boxes': [(x1, y1, x2, y2),...]}: A
        dictionary containing the text direction and a list of reading order
        sorted bounding boxes under the key 'boxes'.

    Raises:
        KrakenInputException if the input image is not binarized or the text
        direction is invalid.
    """
    im_str = get_im_str(im)
    logger.info('Segmenting {}'.format(im_str))

    if im.mode != '1' and not is_bitonal(im):
        logger.error('Image {} is not bi-level'.format(im_str))
        raise KrakenInputException('Image {} is not bi-level'.format(im_str))

    # rotate input image for vertical lines
    if text_direction.startswith('horizontal'):
        angle = 0
        offset = (0, 0)
    elif text_direction == 'vertical-lr':
        angle = 270
        offset = (0, im.size[1])
    elif text_direction == 'vertical-rl':
        angle = 90
        offset = (im.size[0], 0)
    else:
        logger.error('Invalid text direction \'{}\''.format(text_direction))
        raise KrakenInputException('Invalid text direction {}'.format(text_direction))

    logger.debug('Rotating input image by {} degrees'.format(angle))
    im = im.rotate(angle, expand=True)

    # honestly I've got no idea what's going on here. In theory a simple
    # np.array(im, 'i') should suffice here but for some reason the
    # tostring/fromstring magic in pil2array alters the array in a way that is
    # needed for the algorithm to work correctly.
    a = pil2array(im)
    binary = np.array(a > 0.5*(np.amin(a) + np.amax(a)), 'i')
    binary = 1 - binary

    if not scale:
        scale = estimate_scale(binary)

    if no_hlines:
        binary = remove_hlines(binary, scale)
    # Empty-ish images will cause exceptions here.
    try:
        if mask:
            if mask.mode != '1' and not is_bitonal(mask):
                logger.error('Mask is not bitonal')
                raise KrakenInputException('Mask is not bitonal')
            mask = mask.convert('1')
            if mask.size != im.size:
                logger.error('Mask size {} doesn\'t match image size {}'.format(mask.size, im.size))
                raise KrakenInputException('Mask size {} doesn\'t match image size {}'.format(mask.size, im.size))
            logger.info('Masking enabled in segmenter. Disabling column detection.')
            mask = mask.rotate(angle, expand=True)
            colseps = pil2array(mask)
        elif black_colseps:
            colseps, binary = compute_black_colseps(binary, scale, maxcolseps)
        else:
            colseps = compute_white_colseps(binary, scale, maxcolseps)
    except ValueError:
        logger.warning('Exception in column finder (probably empty image) for {}.'.format(im_str))
        return {'text_direction': text_direction, 'boxes':  []}

    bottom, top, boxmap = compute_gradmaps(binary, scale)
    seeds = compute_line_seeds(binary, bottom, top, colseps, scale)
    llabels = morph.propagate_labels(boxmap, seeds, conflict=0)
    spread = morph.spread_labels(seeds, maxdist=scale)
    llabels = np.where(llabels > 0, llabels, spread*binary)
    segmentation = llabels*binary

    lines = compute_lines(segmentation, scale)
    order = reading_order([l.bounds for l in lines], text_direction[-2:])
    lsort = topsort(order)
    lines = [lines[i].bounds for i in lsort]
    lines = [(s2.start, s1.start, s2.stop, s1.stop) for s1, s2 in lines]

    if isinstance(pad, int):
        pad = (pad, pad)
    lines = [(max(x[0]-pad[0], 0), x[1], min(x[2]+pad[1], im.size[0]), x[3]) for x in lines]

    return {'text_direction': text_direction, 'boxes':  rotate_lines(lines, 360-angle, offset).tolist(), 'script_detection': False}
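
A sketch of the mask and pad options (continuing from an already loaded bi-level image im; the rectangular mask drawn here is purely illustrative):

from PIL import Image, ImageDraw

# Restrict segmentation to a region of interest; per the docstring, masking
# disables column detection, and pad=(10, 10) widens every line box by 10px.
mask = Image.new('1', im.size, 0)
ImageDraw.Draw(mask).rectangle((100, 100, im.size[0] - 100, im.size[1] - 100), fill=1)
res = segment(im, pad=(10, 10), mask=mask)
print(len(res['boxes']), 'lines found')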
Example #20
def rpred(network, im, bounds, pad=16, line_normalization=True, bidi_reordering=True):
    """
    Uses a RNN to recognize text

    Args:
        network (kraken.lib.lstm.SegRecognizer): A SegRecognizer object
        im (PIL.Image): Image to extract text from
        bounds (iterable): An iterable returning a tuple defining the absolute
                           coordinates (x0, y0, x1, y1) of a text line in the
                           Image.
        pad (int): Extra blank padding to the left and right of text line
        line_normalization (bool): Dewarp line using the line estimator
                                   contained in the network. If no normalizer
                                   is available one using the default
                                   parameters is created. Be aware that you may
                                   have to scale lines manually to the target
                                   line height if disabled.
        bidi_reordering (bool): Reorder classes in the ocr_record according to
                                the Unicode bidirectional algorithm for correct
                                display.
    Yields:
        An ocr_record containing the recognized text, absolute character
        positions, and confidence values for each character. 
    """

    lnorm = getattr(network, 'lnorm', CenterNormalizer())

    for box, coords in extract_boxes(im, bounds):
        # check if boxes are non-zero in any dimension
        if sum(coords[::2]) == 0 or coords[3] - coords[1] == 0:
            yield ocr_record('', [], [])
            continue
        raw_line = pil2array(box)
        # check if line is non-zero
        if np.amax(raw_line) == np.amin(raw_line):
            yield ocr_record('', [], [])
            continue
        if line_normalization:
            # fail gracefully and return no recognition result in case the
            # input line can not be normalized.
            try:
                box = dewarp(lnorm, box)
            except Exception:
                yield ocr_record('', [], [])
                continue
        line = pil2array(box)
        line = lstm.prepare_line(line, pad)
        pred = network.predictString(line)

        # calculate recognized LSTM locations of characters
        scale = len(raw_line.T)/(len(network.outputs)-2 * pad)
        result = lstm.translate_back_locations(network.outputs)
        pos = []
        conf = []

        for _, start, end, c in result:
            pos.append((coords[0] + int((start-pad)*scale), coords[1], coords[0] + int((end-pad/2)*scale), coords[3]))
            conf.append(c)
        if bidi_reordering:
            yield bidi_record(ocr_record(pred, pos, conf))
        else:
            yield ocr_record(pred, pos, conf)
Example #21
def rpred(network,
          im,
          bounds,
          pad=16,
          line_normalization=True,
          bidi_reordering=True):
    """
    Uses a RNN to recognize text

    Args:
        network (kraken.lib.lstm.SegRecognizer): A SegRecognizer object
        im (PIL.Image): Image to extract text from
        bounds (dict): A dictionary containing a 'boxes' entry with a list of
                       coordinates (x0, y0, x1, y1) of a text line in the image
                       and an entry 'text_direction' containing
                       'horizontal-lr/rl/vertical-lr/rl'.
        pad (int): Extra blank padding to the left and right of text line
        line_normalization (bool): Dewarp line using the line estimator
                                   contained in the network. If no normalizer
                                   is available one using the default
                                   parameters is created. Be aware that you may
                                   have to scale lines manually to the target
                                   line height if disabled.
        bidi_reordering (bool): Reorder classes in the ocr_record according to
                                the Unicode bidirectional algorithm for correct
                                display.
    Yields:
        An ocr_record containing the recognized text, absolute character
        positions, and confidence values for each character.
    """
    im_str = get_im_str(im)
    logger.info(u'Running recognizer on {} with {} lines'.format(
        im_str, len(bounds['boxes'])))
    logger.debug(u'Loading line normalizer')
    lnorm = getattr(network, 'lnorm', CenterNormalizer())
    if not is_bitonal(im):
        logger.info(u'Image is grayscale. Adjusting normalizer parameters')
        lnorm.range = 2

    for box, coords in extract_boxes(im, bounds):
        # check if boxes are non-zero in any dimension
        if sum(coords[::2]) == 0 or coords[3] - coords[1] == 0:
            logger.warning(
                u'bbox {} with zero dimension. Emitting empty record.'.format(
                    coords))
            yield ocr_record('', [], [])
            continue
        raw_line = pil2array(box)
        # check if line is non-zero
        if np.amax(raw_line) == np.amin(raw_line):
            logger.warning(
                u'Empty line {}. Emitting empty record.'.format(coords))
            yield ocr_record('', [], [])
            continue
        if line_normalization:
            # fail gracefully and return no recognition result in case the
            # input line can not be normalized.
            try:
                box = dewarp(lnorm, box)
            except Exception:
                logger.warning(
                    u'Dewarping for bbox {} failed. Emitting empty record.'.
                    format(coords))
                yield ocr_record('', [], [])
                continue
        line = pil2array(box)
        logger.debug(u'Preparing line.')
        line = lstm.prepare_line(line, pad)
        logger.debug(u'Performing forward pass.')
        pred = network.predictString(line)
        logger.info(u'Prediction: {}'.format(pred))

        # calculate recognized LSTM locations of characters
        scale = len(raw_line.T) / (len(network.outputs) - 2 * pad)
        logger.debug(u'Extracting labels.')
        result = lstm.translate_back_locations(network.outputs)
        pos = []
        conf = []

        for _, start, end, c in result:
            if bounds['text_direction'].startswith('horizontal'):
                xmin = coords[0] + int(max((start - pad) * scale, 0))
                xmax = coords[0] + max(
                    int(min((end - pad) * scale, coords[2] - coords[0])), 1)
                pos.append((xmin, coords[1], xmax, coords[3]))
            else:
                ymin = coords[1] + int(max((start - pad) * scale, 0))
                ymax = coords[1] + max(
                    int(min((end - pad) * scale, coords[3] - coords[1])), 1)
                pos.append((coords[0], ymin, coords[2], ymax))
            conf.append(c)
        if bidi_reordering:
            logger.debug(u'BiDi reordering record.')
            yield bidi_record(ocr_record(pred, pos, conf))
        else:
            yield ocr_record(pred, pos, conf)
Example #22
def segment(im,
            text_direction: str = 'horizontal-lr',
            mask: Optional[np.array] = None,
            reading_order_fn: Callable = polygonal_reading_order,
            model=None,
            device: str = 'cpu'):
    """
    Segments a page into text lines using the baseline segmenter.

    Segments a page into text lines and returns the polyline formed by each
    baseline and their estimated environment.

    Args:
        im (PIL.Image): An RGB image.
        text_direction (str): Ignored by the segmenter but kept for
                              serialization.
        mask (PIL.Image): A bi-level mask image of the same size as `im` where
                          0-valued regions are ignored for segmentation
                          purposes. Disables column detection.
        reading_order_fn (function): Function to determine the reading order.
                                     Has to accept a list of tuples (baselines,
                                     polygon) and a text direction (`lr` or
                                     `rl`).
        model (vgsl.TorchVGSLModel): A TorchVGSLModel containing a segmentation
                                     model. If none is given a default model
                                     will be loaded.
        device (str or torch.Device): The target device to run the neural
                                      network on.

    Returns:
        {'text_direction': '$dir',
         'type': 'baseline',
         'lines': [
            {'baseline': [[x0, y0], [x1, y1], ..., [x_n, y_n]], 'boundary': [[x0, y0, x1, y1], ... [x_m, y_m]]},
            {'baseline': [[x0, ...]], 'boundary': [[x0, ...]]}
          ]
          'regions': [
            {'region': [[x0, y0], [x1, y1], ..., [x_n, y_n]], 'type': 'image'},
            {'region': [[x0, ...]], 'type': 'text'}
          ]
        }: A dictionary containing the text direction and under the key 'lines'
        a list of reading order sorted baselines (polylines) and their
        respective polygonal boundaries. The last and first point of each
        boundary polygon is connected.

    Raises:
        KrakenInputException if the input image is not binarized or the text
        direction is invalid.
    """
    im_str = get_im_str(im)
    logger.info(f'Segmenting {im_str}')

    if model is None:
        logger.info('No segmentation model given. Loading default model.')
        model = vgsl.TorchVGSLModel.load_model(pkg_resources.resource_filename(__name__, 'blla.mlmodel'))

    if model.one_channel_mode == '1' and not is_bitonal(im):
        logger.warning('Running binary model on non-binary input image '
                       '(mode {}). This will result in severely degraded '
                       'performance'.format(im.mode))

    model.eval()
    model.to(device)

    if mask:
        if mask.mode != '1' and not is_bitonal(mask):
            logger.error('Mask is not bitonal')
            raise KrakenInputException('Mask is not bitonal')
        mask = mask.convert('1')
        if mask.size != im.size:
            logger.error(f'Mask size {mask.size} doesn\'t match image size {im.size}')
            raise KrakenInputException(f'Mask size {mask.size} doesn\'t match image size {im.size}')
        logger.info('Masking enabled in segmenter.')
        mask = pil2array(mask)

    batch, channels, height, width = model.input
    transforms = dataset.generate_input_transforms(batch, height, width, channels, 0, valid_norm=False)
    res_tf = tf.Compose(transforms.transforms[:3])
    scal_im = res_tf(im).convert('L')

    with torch.no_grad():
        logger.debug('Running network forward pass')
        o = model.nn(transforms(im).unsqueeze(0).to(device))
    logger.debug('Upsampling network output')
    o = F.interpolate(o, size=scal_im.size[::-1])
    o = o.squeeze().cpu().numpy()
    scale = np.divide(im.size, o.shape[:0:-1])
    # postprocessing
    cls_map = model.user_metadata['class_mapping']
    st_sep = cls_map['aux']['_start_separator']
    end_sep = cls_map['aux']['_end_separator']

    logger.info('Vectorizing baselines')
    baselines = []
    regions = {}
    for bl_type, idx in cls_map['baselines'].items():
        logger.debug(f'Vectorizing lines of type {bl_type}')
        baselines.extend([(bl_type,x) for x in vectorize_lines(o[(st_sep, end_sep, idx), :, :])])
    logger.info('Vectorizing regions')
    for region_type, idx in cls_map['regions'].items():
        logger.debug(f'Vectorizing regions of type {region_type}')
        regions[region_type] = vectorize_regions(o[idx])
    logger.debug('Polygonizing lines')
    lines = list(filter(lambda x: x[2] is not None, zip([x[0] for x in baselines],
                                                        [x[1] for x in baselines],
                                                        calculate_polygonal_environment(scal_im, [x[1] for x in baselines]))))
    logger.debug('Scaling vectorized lines')
    sc = scale_polygonal_lines([x[1:] for x in lines], scale)
    lines = list(zip([x[0] for x in lines], [x[0] for x in sc], [x[1] for x in sc]))
    logger.debug('Scaling vectorized regions')
    for reg_id, regs in regions.items():
        regions[reg_id] = scale_regions(regs, scale)
    logger.debug('Reordering baselines')
    order_regs = []
    for regs in regions.values():
        order_regs.extend(regs)
    lines = reading_order_fn(lines=lines, regions=order_regs, text_direction=text_direction[-2:])

    if 'class_mapping' in model.user_metadata and len(model.user_metadata['class_mapping']['baselines']) > 1:
        script_detection = True
    else:
        script_detection = False

    return {'text_direction': text_direction,
            'type': 'baselines',
            'lines': [{'script': bl_type, 'baseline': bl, 'boundary': pl} for bl_type, bl, pl in lines],
            'regions': regions,
            'script_detection': script_detection}
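
A sketch consuming the baseline segmenter's return value (assuming an RGB page image im; with model=None the default model is loaded):

res = segment(im)
for line in res['lines']:
    baseline = line['baseline']  # polyline [[x0, y0], ..., [x_n, y_n]]
    boundary = line['boundary']  # polygon enclosing the line
    print(line['script'], baseline[0], '->', baseline[-1])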
Example #23
def nlbin(im: Image.Image,
          threshold: float = 0.5,
          zoom: float = 0.5,
          escale: float = 1.0,
          border: float = 0.1,
          perc: int = 80,
          range: int = 20,
          low: int = 5,
          high: int = 90) -> Image.Image:
    """
    Performs binarization using non-linear processing.

    Args:
        im (PIL.Image.Image): Input image
        threshold (float): Threshold for binarization
        zoom (float): Zoom for background page estimation
        escale (float): Scale for estimating a mask over the text region
        border (float): Ignore this much of the border
        perc (int): Percentage for filters
        range (int): Range for filters
        low (int): Percentile for black estimation
        high (int): Percentile for white estimation

    Returns:
        PIL.Image containing the binarized image

    Raises:
        KrakenInputException when trying to binarize an empty image.
    """
    im_str = get_im_str(im)
    logger.info(f'Binarizing {im_str}')
    if is_bitonal(im):
        logger.info(f'Skipping binarization because {im_str} is bitonal.')
        return im
    # convert to grayscale first
    logger.debug(f'Converting {im_str} to grayscale')
    im = im.convert('L')
    raw = pil2array(im)
    logger.debug('Scaling and normalizing')
    # rescale image to between -1 or 0 and 1
    raw = raw / float(np.iinfo(raw.dtype).max)
    # perform image normalization
    if np.amax(raw) == np.amin(raw):
        logger.warning(f'Trying to binarize empty image {im_str}')
        raise KrakenInputException('Image is empty')
    image = raw - np.amin(raw)
    image /= np.amax(image)

    logger.debug('Interpolation and percentile filtering')
    with warnings.catch_warnings():
        warnings.simplefilter('ignore', UserWarning)
        m = interpolation.zoom(image, zoom)
        m = filters.percentile_filter(m, perc, size=(range, 2))
        m = filters.percentile_filter(m, perc, size=(2, range))
        mh, mw = m.shape
        oh, ow = image.shape
        scale = np.diag([mh * 1.0 / oh, mw * 1.0 / ow])
        m = affine_transform(m, scale, output_shape=image.shape)
    w, h = np.minimum(np.array(image.shape), np.array(m.shape))
    flat = np.clip(image[:w, :h] - m[:w, :h] + 1, 0, 1)

    # estimate low and high thresholds
    d0, d1 = flat.shape
    o0, o1 = int(border * d0), int(border * d1)
    est = flat[o0:d0 - o0, o1:d1 - o1]
    logger.debug('Threshold estimates {}'.format(est))
    # by default, we use only regions that contain
    # significant variance; this makes the percentile
    # based low and high estimates more reliable
    logger.debug('Refine estimates')
    v = est - filters.gaussian_filter(est, escale * 20.0)
    v = filters.gaussian_filter(v**2, escale * 20.0)**0.5
    v = (v > 0.3 * np.amax(v))
    v = morphology.binary_dilation(v, structure=np.ones((int(escale * 50), 1)))
    v = morphology.binary_dilation(v, structure=np.ones((1, int(escale * 50))))
    est = est[v]
    lo = np.percentile(est.ravel(), low)
    hi = np.percentile(est.ravel(), high)
    flat -= lo
    flat /= (hi - lo)
    flat = np.clip(flat, 0, 1)
    logger.debug(f'Thresholding at {threshold}')
    bin = np.array(255 * (flat > threshold), 'B')
    return array2pil(bin)
Example #24
def nlbin(im,
          threshold=0.5,
          zoom=0.5,
          escale=1.0,
          border=0.1,
          perc=80,
          range=20,
          low=5,
          high=90):
    """
    Performs binarization using non-linear processing.

    Args:
        im (PIL.Image): Input image
        threshold (float): Threshold for binarization
        zoom (float): Zoom for background page estimation
        escale (float): Scale for estimating a mask over the text region
        border (float): Ignore this much of the border
        perc (int): Percentage for filters
        range (int): Range for filters
        low (int): Percentile for black estimation
        high (int): Percentile for white estimation

    Returns:
        PIL.Image containing the binarized image
    """
    if im.mode == '1':
        return im
    raw = pil2array(im)
    # rescale image to between -1 or 0 and 1
    raw = raw / float(np.iinfo(raw.dtype).max)
    if raw.ndim == 3:
        raw = np.mean(raw, 2)
    # perform image normalization
    if np.amax(raw) == np.amin(raw):
        raise KrakenInputException('Image is empty')
    image = raw - np.amin(raw)
    image /= np.amax(image)

    with warnings.catch_warnings():
        warnings.simplefilter('ignore', UserWarning)
        m = interpolation.zoom(image, zoom)
        m = filters.percentile_filter(m, perc, size=(range, 2))
        m = filters.percentile_filter(m, perc, size=(2, range))
        m = interpolation.zoom(m, 1.0 / zoom)
    w, h = np.minimum(np.array(image.shape), np.array(m.shape))
    flat = np.clip(image[:w, :h] - m[:w, :h] + 1, 0, 1)

    # estimate low and high thresholds
    d0, d1 = flat.shape
    o0, o1 = int(border * d0), int(border * d1)
    est = flat[o0:d0 - o0, o1:d1 - o1]
    # by default, we use only regions that contain
    # significant variance; this makes the percentile
    # based low and high estimates more reliable
    v = est - filters.gaussian_filter(est, escale * 20.0)
    v = filters.gaussian_filter(v**2, escale * 20.0)**0.5
    v = (v > 0.3 * np.amax(v))
    v = morphology.binary_dilation(v, structure=np.ones((int(escale * 50), 1)))
    v = morphology.binary_dilation(v, structure=np.ones((1, int(escale * 50))))
    est = est[v]
    lo = np.percentile(est.ravel(), low)
    hi = np.percentile(est.ravel(), high)

    flat -= lo
    flat /= (hi - lo)
    flat = np.clip(flat, 0, 1)
    bin = np.array(255 * (flat > threshold), 'B')
    return array2pil(bin)
Example #25
def rpred(network, im, bounds, pad=16, line_normalization=True):
    """
    Uses a RNN to recognize text

    Args:
        network (kraken.lib.lstm.SegRecognizer): A SegRecognizer object
        im (PIL.Image): Image to extract text from
        bounds (iterable): An iterable returning a tuple defining the absolute
                           coordinates (x0, y0, x1, y1) of a text line in the
                           Image.
        pad (int): Extra blank padding to the left and right of text line
        line_normalization (bool): Dewarp line using the line estimator
                                   contained in the network. If no normalizer
                                   is available one using the default
                                   parameters is created. Be aware that you may
                                   have to scale lines manually to the target
                                   line height if disabled.
    Yields:
        A tuple containing the recognized text (0), absolute character
        positions in the image (1), and confidence values for each
        character (2).
    """

    lnorm = getattr(network, "lnorm", CenterNormalizer())

    for box, coords in extract_boxes(im, bounds):
        # check if boxes are non-zero in any dimension
        if sum(coords[::2]) == 0 or coords[3] - coords[1] == 0:
            yield ocr_record("", [], [])
            continue
        raw_line = pil2array(box)
        # check if line is non-zero
        if np.amax(raw_line) == np.amin(raw_line):
            yield ocr_record("", [], [])
            continue
        if line_normalization:
            # fail gracefully and return no recognition result in case the
            # input line can not be normalized.
            try:
                box = dewarp(lnorm, box)
            except Exception:
                yield ocr_record("", [], [])
                continue
        line = pil2array(box)
        line = lstm.prepare_line(line, pad)
        pred = network.predictString(line)

        # calculate recognized LSTM locations of characters
        scale = len(raw_line.T) / (len(network.outputs) - 2 * pad)
        result = lstm.translate_back(network.outputs, pos=1)
        conf = [network.outputs[r, c] for r, c in result if c != 0]
        cuts = [(int((r - pad) * scale), c) for (r, c) in result]
        # append last offset to end of line
        cuts.append((coords[2] - coords[0], 0))
        pos = []
        lx = 0
        for i, d in enumerate(cuts):
            if d[1] == 0:
                lx = d[0]
                continue
            try:
                pos.append((coords[0] + lx, coords[1], coords[0] + d[0], coords[3]))
            except Exception:
                break
            lx = d[0]
        yield ocr_record(pred, pos, conf)
Example #26
    def read(self, page):
        """Perfoms OCR with Kraken."""

        stages = page.stages
        scan = stages.get("clean", None)
        if scan is None:
            return None

        nonLetter = self.nonLetter

        model = self.ensureLoaded()

        blocks = page.blocks
        ocrChars = []
        ocrWords = []
        ocrLines = []
        stages["char"] = ocrChars
        stages["word"] = ocrWords
        stages["line"] = ocrLines
        binary = pil2array(nlbin(array2pil(scan)))

        for ((stripe, block), data) in blocks.items():
            (left, top, right, bottom) = data["inner"]
            thisBinary = binary[top:bottom, left:right]
            lines = data["bands"]["main"]["lines"]
            for (ln, (up, lo)) in enumerate(lines):
                lln = ln + 1
                roi = thisBinary[up : lo + 1]
                (b, e, roi) = removeMargins(roi, keep=16)
                ocrLines.append((stripe, block, lln, left + b, top + up, left + e, top + lo))
                (roiH, roiW) = roi.shape[0:2]
                roi = array2pil(roi)
                bounds = dict(boxes=([0, 0, roiW, roiH],), text_direction=RL)

                # adapt the boxes, because they correspond to peaks of
                # recognition, not to character extents
                #
                # See https://github.com/mittagessen/kraken/issues/184

                adaptedPreds = []
                for (c, (le, to, ri, bo), conf) in chain.from_iterable(
                    rpred(model, roi, bounds, pad=0, bidi_reordering=True)
                ):
                    if adaptedPreds:
                        prevPred = adaptedPreds[-1]
                        prevEdge = prevPred[1][0]
                    else:
                        prevEdge = roiW
                    correction = int(round((prevEdge - ri) / 2))
                    thisRi = ri + correction
                    if adaptedPreds:
                        adaptedPreds[-1][1][0] -= correction
                    adaptedPreds.append([c, [le, to, thisRi, bo], conf])
                if adaptedPreds:
                    adaptedPreds[-1][1][0] = 0

                # divide into words, not only on spaces, but also on punctuation

                curWord = [[], []]
                inWord = True

                for (c, (le, to, ri, bo), conf) in adaptedPreds:
                    offsetW = left + b
                    offsetH = top + up
                    pos = (le + offsetW, to + offsetH, ri + offsetW, bo + offsetH)
                    conf = int(round(conf * 100))
                    ocrChars.append((stripe, block, lln, *pos, conf, c))

                    spaceSeen = c == " "
                    changeWord = not inWord and c not in nonLetter
                    element = (c, pos, conf)

                    if spaceSeen:
                        curWord[1].append(element)
                    if spaceSeen or changeWord:
                        if curWord[0] or curWord[1]:
                            ocrWords.append((stripe, block, lln, *addWord(curWord)))
                            curWord = [[], []]
                            inWord = True
                            continue

                    if inWord:
                        if c in nonLetter:
                            inWord = False
                    dest = 0 if inWord else 1
                    curWord[dest].append(element)
                if curWord[0] or curWord[1]:
                    ocrWords.append((stripe, block, lln, *addWord(curWord)))

        page.write(stage="line,word,char")
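
The midpoint correction above is easiest to see with toy numbers. A self-contained sketch (hypothetical values; right-to-left order as in the method, so the first peak is the rightmost):

# two RTL character peaks with right edges at x=80 and x=50 in a 100px line
roiW = 100
preds = [['a', [78, 0, 80, 10], 0.9], ['b', [48, 0, 50, 10], 0.9]]

adapted = []
for c, (le, to, ri, bo), conf in preds:
    prevEdge = adapted[-1][1][0] if adapted else roiW
    correction = int(round((prevEdge - ri) / 2))
    if adapted:
        adapted[-1][1][0] -= correction       # pull previous left edge to the midpoint
    adapted.append([c, [le, to, ri + correction, bo], conf])
if adapted:
    adapted[-1][1][0] = 0                     # last character extends to the margin
print(adapted)  # boxes now abut at the midpoints between adjacent peaks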
Example #27
def ocropy_degrade(im,
                   distort=1.0,
                   dsigma=20.0,
                   eps=0.03,
                   delta=0.3,
                   degradations=((0.5, 0.0, 0.5, 0.0), )):
    """
    Degrades and distorts a line using the same noise model used by ocropus.

    Args:
        im (PIL.Image): Input image
        distort (float):
        dsigma (float):
        eps (float):
        delta (float):
        degradations (list): list returning 4-tuples corresponding to
                             the degradations argument of ocropus-linegen.

    Returns:
        PIL.Image in mode 'L'
    """
    w, h = im.size
    # XXX: determine correct output shape from transformation matrices instead
    # of guesstimating.
    logger.debug('Pasting source image into canvas')
    image = Image.new('L', (int(1.5 * w), 4 * h), 255)
    image.paste(im, (int((image.size[0] - w) / 2), int(
        (image.size[1] - h) / 2)))
    a = pil2array(image.convert('L'))
    logger.debug('Selecting degradations')
    (sigma, ssigma, threshold,
     sthreshold) = degradations[np.random.choice(len(degradations))]
    sigma += (2 * np.random.rand() - 1) * ssigma
    threshold += (2 * np.random.rand() - 1) * sthreshold
    a = a * 1.0 / np.amax(a)
    if sigma > 0.0:
        logger.debug('Apply Gaussian filter')
        a = gaussian_filter(a, sigma)
    logger.debug('Adding noise')
    a += np.clip(np.random.randn(*a.shape) * 0.2, -0.25, 0.25)
    logger.debug('Perform affine transformation and resize')
    m = np.array([[1 + eps * np.random.randn(), 0.0],
                  [eps * np.random.randn(), 1.0 + eps * np.random.randn()]])
    w, h = a.shape
    c = np.array([w / 2.0, h / 2])
    d = c - np.dot(m, c) + np.array(
        [np.random.randn() * delta,
         np.random.randn() * delta])
    a = affine_transform(a,
                         m,
                         offset=d,
                         order=1,
                         mode='constant',
                         cval=a[0, 0])
    a = np.array(a > threshold, 'f')
    [[r, c]] = find_objects(np.array(a == 0, 'i'))
    r0 = r.start
    r1 = r.stop
    c0 = c.start
    c1 = c.stop
    a = a[r0 - 5:r1 + 5, c0 - 5:c1 + 5]
    if distort > 0:
        logger.debug('Perform geometric transformation')
        h, w = a.shape
        hs = np.random.randn(h, w)
        ws = np.random.randn(h, w)
        hs = gaussian_filter(hs, dsigma)
        ws = gaussian_filter(ws, dsigma)
        hs *= distort / np.amax(hs)
        ws *= distort / np.amax(ws)

        def _f(p):
            return (p[0] + hs[p[0], p[1]], p[1] + ws[p[0], p[1]])

        a = geometric_transform(a,
                                _f,
                                output_shape=(h, w),
                                order=1,
                                mode='constant',
                                cval=np.amax(a))
    im = array2pil(a).convert('L')
    return im
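
A hedged usage sketch, assuming the pil2array/array2pil helpers from this module are importable and rendering a hypothetical test line with Pillow's built-in default font:

from PIL import Image, ImageDraw

im = Image.new('L', (400, 48), 255)                        # blank white line canvas
ImageDraw.Draw(im).text((10, 10), 'example line', fill=0)  # black test text
noisy = ocropy_degrade(im, distort=1.0, dsigma=20.0,
                       degradations=((0.5, 0.0, 0.5, 0.0),))
noisy.save('degraded.png')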
Example #28
def segment(im,
            text_direction='horizontal-lr',
            scale=None,
            maxcolseps=2,
            black_colseps=False):
    """
    Segments a page into text lines.

    Segments a page into text lines and returns the absolute coordinates of
    each line in reading order.

    Args:
        im (PIL.Image): A bi-level page of mode '1' or 'L'
        text_direction (str): Principal direction of the text
                              (horizontal-lr/rl/vertical-lr/rl)
        scale (float): Scale of the image
        maxcolseps (int): Maximum number of whitespace column separators
        black_colseps (bool): Whether column separators are assumed to be
                              vertical black lines or not

    Returns:
        {'text_direction': '$dir', 'boxes': [(x1, y1, x2, y2),...]}: A
        dictionary containing the text direction and a list of reading order
        sorted bounding boxes under the key 'boxes'.

    Raises:
        KrakenInputException if the input image is not binarized or the text
        direction is invalid.
    """

    if im.mode != '1' and not is_bitonal(im):
        raise KrakenInputException('Image is not bi-level')

    # rotate input image for vertical lines
    if text_direction.startswith('horizontal'):
        angle = 0
        offset = (0, 0)
    elif text_direction == 'vertical-lr':
        angle = 270
        offset = (0, im.size[1])
    elif text_direction == 'vertical-rl':
        angle = 90
        offset = (im.size[0], 0)
    else:
        raise KrakenInputException('Invalid text direction')

    im = im.rotate(angle, expand=True)

    # In theory a simple np.array(im, 'i') should suffice here, but the
    # tostring/fromstring round-trip in pil2array alters the array in a way
    # that the algorithm depends on.
    a = pil2array(im)
    binary = np.array(a > 0.5 * (np.amin(a) + np.amax(a)), 'i')
    binary = 1 - binary

    if not scale:
        scale = estimate_scale(binary)

    binary = remove_hlines(binary, scale)
    # empty-ish images will cause exceptions here.
    try:
        if black_colseps:
            colseps, binary = compute_black_colseps(binary, scale, maxcolseps)
        else:
            colseps = compute_white_colseps(binary, scale, maxcolseps)
    except ValueError:
        return {'text_direction': text_direction, 'boxes': []}

    bottom, top, boxmap = compute_gradmaps(binary, scale)
    seeds = compute_line_seeds(binary, bottom, top, colseps, scale)
    llabels = morph.propagate_labels(boxmap, seeds, conflict=0)
    spread = morph.spread_labels(seeds, maxdist=scale)
    llabels = np.where(llabels > 0, llabels, spread * binary)
    segmentation = llabels * binary

    lines = compute_lines(segmentation, scale)
    order = reading_order([l.bounds for l in lines], text_direction[-2:])
    lsort = topsort(order)
    lines = [lines[i].bounds for i in lsort]
    lines = [(s2.start, s1.start, s2.stop, s1.stop) for s1, s2 in lines]
    return {
        'text_direction': text_direction,
        'boxes': rotate_lines(lines, 360 - angle, offset).tolist(),
        'script_detection': False
    }
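
Minimal usage sketch, assuming a bi-level page image on disk (hypothetical file name):

from PIL import Image

page = Image.open('page.png').convert('1')   # bi-level mode '1' passes the check
seg = segment(page, text_direction='horizontal-lr')
for x0, y0, x1, y1 in seg['boxes']:
    print(f'line: ({x0}, {y0}) - ({x1}, {y1})')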
Example #29
def segment(im,
            text_direction: str = 'horizontal-lr',
            scale: Optional[float] = None,
            maxcolseps: int = 2,
            black_colseps: bool = False,
            no_hlines: bool = True,
            pad: int = 0,
            mask: Optional[np.array] = None,
            reading_order_fn: Callable = reading_order) -> Dict[str, Any]:
    """
    Segments a page into text lines.

    Segments a page into text lines and returns the absolute coordinates of
    each line in reading order.

    Args:
        im (PIL.Image): A bi-level page of mode '1' or 'L'
        text_direction (str): Principal direction of the text
                              (horizontal-lr/rl/vertical-lr/rl)
        scale (float): Scale of the image
        maxcolseps (int): Maximum number of whitespace column separators
        black_colseps (bool): Whether column separators are assumed to be
                              vertical black lines or not
        no_hlines (bool): Switch for horizontal line removal
        pad (int or tuple): Padding to add to line bounding boxes. If int the
                            same padding is used both left and right. If a
                            2-tuple, uses (padding_left, padding_right).
        mask (PIL.Image): A bi-level mask image of the same size as `im` where
                          0-valued regions are ignored for segmentation
                          purposes. Disables column detection.
        reading_order_fn (Callable): Function to call to order line output.
                                     Callable accepting a list of slices (y, x)
                                     and a text direction in (`rl`, `lr`).

    Returns:
        {'text_direction': '$dir', 'boxes': [(x1, y1, x2, y2),...]}: A
        dictionary containing the text direction and a list of reading order
        sorted bounding boxes under the key 'boxes'.

    Raises:
        KrakenInputException if the input image is not binarized or the text
        direction is invalid.
    """
    im_str = get_im_str(im)
    logger.info(f'Segmenting {im_str}')

    if im.mode != '1' and not is_bitonal(im):
        logger.error(f'Image {im_str} is not bi-level')
        raise KrakenInputException(f'Image {im_str} is not bi-level')

    # rotate input image for vertical lines
    if text_direction.startswith('horizontal'):
        angle = 0
        offset = (0, 0)
    elif text_direction == 'vertical-lr':
        angle = 270
        offset = (0, im.size[1])
    elif text_direction == 'vertical-rl':
        angle = 90
        offset = (im.size[0], 0)
    else:
        logger.error(f'Invalid text direction \'{text_direction}\'')
        raise KrakenInputException(f'Invalid text direction {text_direction}')

    logger.debug(f'Rotating input image by {angle} degrees')
    im = im.rotate(angle, expand=True)

    a = pil2array(im)
    binary = np.array(a > 0.5 * (np.amin(a) + np.amax(a)), 'i')
    binary = 1 - binary

    if not scale:
        scale = estimate_scale(binary)

    if no_hlines:
        binary = remove_hlines(binary, scale)
    # empty-ish images will cause exceptions here.
    try:
        if mask:
            if mask.mode != '1' and not is_bitonal(mask):
                logger.error('Mask is not bitonal')
                raise KrakenInputException('Mask is not bitonal')
            mask = mask.convert('1')
            if mask.size != im.size:
                logger.error(
                    f'Mask size {mask.size} doesn\'t match image size {im.size}'
                )
                raise KrakenInputException(
                    f'Mask size {mask.size} doesn\'t match image size {im.size}'
                )
            logger.info(
                'Masking enabled in segmenter. Disabling column detection.')
            mask = mask.rotate(angle, expand=True)
            colseps = pil2array(mask)
        elif black_colseps:
            colseps, binary = compute_black_colseps(binary, scale, maxcolseps)
        else:
            colseps = compute_white_colseps(binary, scale, maxcolseps)
    except ValueError:
        logger.warning(
            f'Exception in column finder (probably empty image) for {im_str}')
        return {'text_direction': text_direction, 'boxes': []}

    bottom, top, boxmap = compute_gradmaps(binary, scale)
    seeds = compute_line_seeds(binary, bottom, top, colseps, scale)
    llabels = morph.propagate_labels(boxmap, seeds, conflict=0)
    spread = morph.spread_labels(seeds, maxdist=scale)
    llabels = np.where(llabels > 0, llabels, spread * binary)
    segmentation = llabels * binary

    lines = compute_lines(segmentation, scale)
    order = reading_order_fn([l.bounds for l in lines], text_direction[-2:])
    lsort = topsort(order)
    lines = [lines[i].bounds for i in lsort]
    lines = [(s2.start, s1.start, s2.stop, s1.stop) for s1, s2 in lines]

    if isinstance(pad, int):
        pad = (pad, pad)
    lines = [(max(x[0] - pad[0], 0), x[1], min(x[2] + pad[1],
                                               im.size[0]), x[3])
             for x in lines]

    return {
        'text_direction': text_direction,
        'boxes': rotate_lines(lines, 360 - angle, offset).tolist(),
        'script_detection': False
    }
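
A sketch of the mask and padding options added in this version, with hypothetical inputs; the mask keeps only the top half of the page and, per the docstring, disables column detection:

from PIL import Image

page = Image.open('page.png').convert('1')              # hypothetical scan
mask = Image.new('1', page.size, 0)                     # 0 = ignore region
mask.paste(1, (0, 0, page.size[0], page.size[1] // 2))  # keep top half only
seg = segment(page, pad=(5, 10), mask=mask)             # asymmetric line padding
print(len(seg['boxes']), 'lines in masked region')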
Example #30
def nlbin(im: Image.Image,
          threshold: float = 0.5,
          zoom: float = 0.5,
          escale: float = 1.0,
          border: float = 0.1,
          perc: int = 80,
          range: int = 20,
          low: int = 5,
          high: int = 90) -> Image.Image:
    """
    Performs binarization using non-linear processing.

    Args:
        im (PIL.Image.Image): Input image
        threshold (float): Binarization threshold applied to the flattened image
        zoom (float): Zoom for background page estimation
        escale (float): Scale for estimating a mask over the text region
        border (float): Ignore this much of the border
        perc (int): Percentage for filters
        range (int): Range for filters
        low (int): Percentile for black estimation
        high (int): Percentile for white estimation

    Returns:
        PIL.Image containing the binarized image

    Raises:
        KrakenInputException when trying to binarize an empty image.
    """
    im_str = get_im_str(im)
    logger.info('Binarizing {}'.format(im_str))
    if is_bitonal(im):
        logger.info('Skipping binarization because {} is bitonal.'.format(im_str))
        return im
    # convert to grayscale first
    logger.debug('Converting {} to grayscale'.format(im_str))
    im = im.convert('L')
    raw = pil2array(im)
    logger.debug('Scaling and normalizing')
    # rescale image to between -1 or 0 and 1
    raw = raw / float(np.iinfo(raw.dtype).max)
    # perform image normalization
    if np.amax(raw) == np.amin(raw):
        logger.warning('Trying to binarize empty image {}'.format(im_str))
        raise KrakenInputException('Image is empty')
    image = raw-np.amin(raw)
    image /= np.amax(image)

    logger.debug('Interpolation and percentile filtering')
    with warnings.catch_warnings():
        warnings.simplefilter('ignore', UserWarning)
        m = interpolation.zoom(image, zoom)
        m = filters.percentile_filter(m, perc, size=(range, 2))
        m = filters.percentile_filter(m, perc, size=(2, range))
        mh, mw = m.shape
        oh, ow = image.shape
        scale = np.diag([mh * 1.0/oh, mw * 1.0/ow])
        m = affine_transform(m, scale, output_shape=image.shape)
    w, h = np.minimum(np.array(image.shape), np.array(m.shape))
    flat = np.clip(image[:w, :h]-m[:w, :h]+1, 0, 1)

    # estimate low and high thresholds
    d0, d1 = flat.shape
    o0, o1 = int(border*d0), int(border*d1)
    est = flat[o0:d0-o0, o1:d1-o1]
    logger.debug('Threshold estimates {}'.format(est))
    # by default, we use only regions that contain
    # significant variance; this makes the percentile
    # based low and high estimates more reliable
    logger.debug('Refine estimates')
    v = est-filters.gaussian_filter(est, escale*20.0)
    v = filters.gaussian_filter(v**2, escale*20.0)**0.5
    v = (v > 0.3*np.amax(v))
    v = morphology.binary_dilation(v, structure=np.ones((int(escale * 50), 1)))
    v = morphology.binary_dilation(v, structure=np.ones((1, int(escale * 50))))
    est = est[v]
    lo = np.percentile(est.ravel(), low)
    hi = np.percentile(est.ravel(), high)
    flat -= lo
    flat /= (hi-lo)
    flat = np.clip(flat, 0, 1)
    logger.debug('Thresholding at {}'.format(threshold))
    bin = np.array(255*(flat > threshold), 'B')
    return array2pil(bin)
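
Usage sketch with the defaults above, on a hypothetical grayscale scan:

from PIL import Image

im = Image.open('scan.jpg')    # grayscale or color input; bitonal is passed through
bw = nlbin(im)                 # raises KrakenInputException on empty images
bw.save('scan_bw.png')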