Beispiel #1
0
def pagexmllineseg(xmlfile, text_direction='horizontal-lr', outputfile=None):
    """
    Segment the text regions of a PAGE XML file into lines and write the result.

    The file is parsed, ``Coords`` elements that still use ``Point`` children
    are converted to the ``points`` attribute notation, every ``TextRegion``
    polygon is cut out of the page image named by ``Page/@imageFilename`` and
    passed to ``segment``. For each returned line a ``TextLine`` element with
    a ``Coords`` child is appended to the region. In the serialized output the
    2010-03-19 PAGE namespace URI is replaced by the 2017-07-15 one.

    Args:
        xmlfile: Path of the PAGE XML file to read.
        text_direction: Forwarded unchanged to ``segment``.
        outputfile: Path the XML is written to. If falsy, ``xmlfile`` is
                    overwritten.

    Regions with fewer than three polygon points, and regions for which
    ``cutout`` returns ``None``, are left without lines.
    """
    # Local import: only the writer at the end of this function needs it.
    import io

    if not outputfile:
        outputfile = xmlfile

    root = etree.parse(xmlfile).getroot()
    ns = {"ns": root.nsmap[None]}

    # convert point notation from older pagexml versions
    for c in root.xpath("//ns:Coords[not(@points)]", namespaces=ns):
        cc = []
        for point in c.xpath("./ns:Point", namespaces=ns):
            cc.append(point.attrib["x"] + "," + point.attrib["y"])
            c.remove(point)
        c.attrib["points"] = " ".join(cc)

    # Collect, per region id, the element itself and its polygon points so
    # the element does not have to be looked up again by id later on.
    regions = {}
    for r in root.xpath('//ns:TextRegion', namespaces=ns):
        points = []
        # Coords children are accepted both with and without the namespace.
        for c in r.xpath("./ns:Coords", namespaces=ns) + r.xpath("./Coords"):
            for pair in c.attrib["points"].split():
                xy = pair.split(",")
                points.append([int(xy[0]), int(xy[1])])
        regions[r.attrib["id"]] = {"element": r, "coords": points}

    filename = root.xpath('//ns:Page', namespaces=ns)[0].attrib["imageFilename"]
    im = Image.open(filename)

    for rid in sorted(regions):
        textregion = regions[rid]["element"]
        coords = regions[rid]["coords"]
        # A polygon needs at least three points; the guard also keeps the
        # min() calls below from failing on an empty sequence.
        if len(coords) < 3:
            continue
        cropped = cutout(im, coords)
        if cropped is None:
            continue
        # Smallest x and y of the region polygon; line coordinates returned
        # for the cutout are shifted by this amount.
        offset = (min(x[0] for x in coords), min(x[1] for x in coords))
        if not binarization.is_bitonal(cropped):
            cropped = binarization.nlbin(cropped)
        lines = segment(cropped, text_direction=text_direction,
                        maxcolseps=0)['lines']

        for n, l in enumerate(lines, start=1):
            # The two components of every polygon point are swapped
            # (presumably (y, x) -> (x, y); confirm against ``segment``)
            # before the offset is added.
            coordstrg = " ".join(
                str(p[1] + offset[0]) + "," + str(p[0] + offset[1])
                for p in l.polygon)
            linexml = etree.SubElement(
                textregion, "TextLine",
                attrib={"id": rid + "_l{:03d}".format(n)})
            etree.SubElement(linexml, "Coords", attrib={"points": coordstrg})

    xmlstring = etree.tounicode(root.getroottree()).replace(
        "http://schema.primaresearch.org/PAGE/gts/pagecontent/2010-03-19",
        "http://schema.primaresearch.org/PAGE/gts/pagecontent/2017-07-15"
    )
    # The serialized string carries no XML declaration, so it has to be
    # written as UTF-8 regardless of the platform's default encoding.
    with io.open(outputfile, "w", encoding="utf-8") as f:
        f.write(xmlstring)
Beispiel #2
0
def transcription(ctx, text_direction, scale, maxcolseps, black_colseps, font,
                  font_style, prefill, output, images):
    """
    Build a transcription interface from page images and write it out.

    Each image is binarized when it is not bitonal, segmented with
    ``pageseg.segment`` and added to a ``TranscriptionInterface``. If
    ``prefill`` is truthy it is loaded through ``models.load_any`` and the
    recognition records are added together with the segmentation.

    Progress is printed as time-stamped lines when ``ctx.meta['verbose']`` is
    greater than zero; otherwise the spinner is used.

    Args:
        ctx: Object whose ``meta['verbose']`` selects the reporting mode.
        text_direction, scale, maxcolseps, black_colseps: Forwarded to
            ``pageseg.segment``.
        font, font_style: Forwarded to ``TranscriptionInterface``.
        prefill: Model identifier (UTF-8 encoded before loading) or a falsy
                 value to skip recognition.
        output: Target handed to ``ti.write``; its ``name`` is reported.
        images: Iterable of open image files; each one is closed after use.
    """
    st_time = time.time()
    ti = transcribe.TranscriptionInterface(font, font_style)

    def chatty():
        # Time-stamped line output is active.
        return ctx.meta['verbose'] > 0

    def elapsed():
        return time.time() - st_time

    def checkmark():
        # Replace the spinner glyph by a green tick and show the cursor again.
        click.secho(u'\b\u2713', fg='green', nl=False)
        click.echo('\033[?25h\n', nl=False)

    if prefill:
        if chatty():
            click.echo(u'[{:2.4f}] Loading model {}'.format(elapsed(), prefill))
        else:
            spin('Loading RNN')
        prefill = models.load_any(prefill.encode('utf-8'))
        if not ctx.meta['verbose']:
            checkmark()

    for fp in images:
        if chatty():
            click.echo(u'[{:2.4f}] Reading {}'.format(elapsed(), fp.name))
        else:
            spin('Reading images')

        im = Image.open(fp)
        if not binarization.is_bitonal(im):
            if chatty():
                click.echo(u'[{:2.4f}] Binarizing page'.format(elapsed()))
            im = binarization.nlbin(im)

        if chatty():
            click.echo(u'[{:2.4f}] Segmenting page'.format(elapsed()))
        res = pageseg.segment(im, text_direction, scale, maxcolseps, black_colseps)

        if not prefill:
            ti.add_page(im, res)
        else:
            preds = []
            for pred in rpred.rpred(prefill, im, res):
                if chatty():
                    click.echo(u'[{:2.4f}] {}'.format(elapsed(), pred.prediction))
                else:
                    spin('Recognizing')
                preds.append(pred)
            if chatty():
                click.echo(u'Execution time: {}s'.format(elapsed()))
            else:
                checkmark()
            ti.add_page(im, res, records=preds)
        fp.close()

    if not ctx.meta['verbose']:
        checkmark()
    if chatty():
        click.echo(u'[{:2.4f}] Writing transcription to {}'.format(elapsed(), output.name))
    else:
        spin('Writing output')
    ti.write(output)
    if not ctx.meta['verbose']:
        checkmark()
Beispiel #3
0
def transcription(ctx, font, font_style, prefill, output, images):
    """
    Build a transcription interface from page images and write it out.

    Each image is binarized when it is not bitonal and segmented with
    ``pageseg.segment``. Without ``prefill`` the page is added together with
    its segmentation; with ``prefill`` the model is loaded through
    ``models.load_any`` and the page is added with the recognition records
    only.

    Progress is printed as time-stamped lines when ``ctx.meta['verbose']`` is
    greater than zero; otherwise the spinner is used.

    Args:
        ctx: Object whose ``meta['verbose']`` selects the reporting mode.
        font, font_style: Forwarded to ``TranscriptionInterface``.
        prefill: Model identifier or a falsy value to skip recognition.
        output: Target handed to ``ti.write``; its ``name`` is reported.
        images: Iterable of image files; the ``name`` of each is reported.
    """
    st_time = time.time()
    ti = transcrib.TranscriptionInterface(font, font_style)

    def report(render):
        # ``render`` is only called in verbose mode, so its arguments are
        # never evaluated when the spinner is in use.
        click.echo(render(time.time() - st_time))

    def finish_step():
        # Replace the spinner glyph by a green tick and show the cursor again.
        click.secho(u'\b\u2713', fg='green', nl=False)
        click.echo('\033[?25h\n', nl=False)

    if prefill:
        if ctx.meta['verbose'] > 0:
            report(lambda t: u'[{:2.4f}] Loading model {}'.format(t, prefill))
        else:
            spin('Loading RNN')
        prefill = models.load_any(prefill)
        if not ctx.meta['verbose']:
            finish_step()

    for fp in images:
        if ctx.meta['verbose'] > 0:
            report(lambda t: u'[{:2.4f}] Reading {}'.format(t, fp.name))
        else:
            spin('Reading images')

        im = Image.open(fp)
        if not binarization.is_bitonal(im):
            if ctx.meta['verbose'] > 0:
                report(lambda t: u'[{:2.4f}] Binarizing page'.format(t))
            im = binarization.nlbin(im)

        if ctx.meta['verbose'] > 0:
            report(lambda t: u'[{:2.4f}] Segmenting page'.format(t))
        res = pageseg.segment(im)

        if not prefill:
            ti.add_page(im, res)
            continue

        preds = []
        for pred in rpred.rpred(prefill, im, res):
            if ctx.meta['verbose'] > 0:
                report(lambda t: u'[{:2.4f}] {}'.format(t, pred.prediction))
            else:
                spin('Recognizing')
            preds.append(pred)
        if ctx.meta['verbose'] > 0:
            report(lambda t: u'Execution time: {}s'.format(t))
        else:
            finish_step()
        ti.add_page(im, records=preds)

    if not ctx.meta['verbose']:
        finish_step()
    if ctx.meta['verbose'] > 0:
        report(lambda t: u'[{:2.4f}] Writing transcription to {}'.format(t, output.name))
    else:
        spin('Writing output')
    ti.write(output)
    if not ctx.meta['verbose']:
        finish_step()
Beispiel #4
0
def segment(im, scale=None, black_colseps=False):
    """
    Split a binarized page image into text lines.

    The lines are located on the page and returned as absolute bounding
    boxes, ordered according to the computed reading order.

    Args:
        im (PIL.Image): A bi-level page of mode '1' or 'L'
        scale (float): Scale of the image; estimated from the image when
                       falsy
        black_colseps (bool): Whether column separators are assumed to be
                              vertical black lines or not

    Returns:
        [(x1, y1, x2, y2),...]: A list of tuples containing the bounding boxes
                                of the segmented lines in reading order.

    Raises:
        KrakenInputException if the input image is not binarized
    """
    if not (im.mode == '1' or is_bitonal(im)):
        raise KrakenInputException('Image is not bi-level')

    # The conversion deliberately goes through pil2array: according to the
    # original author a plain np.array(im, 'i') should be enough in theory,
    # but the tostring/fromstring round trip in pil2array changes the array
    # in a way the algorithm needs in order to work correctly.
    pixels = pil2array(im)
    threshold = 0.5 * (np.amin(pixels) + np.amax(pixels))
    # Pixels above the mid-point of the value range become 0, the rest 1.
    binary = 1 - np.array(pixels > threshold, 'i')

    scale = scale or estimate_scale(binary)

    binary = remove_hlines(binary, scale)
    if black_colseps:
        colseps, binary = compute_black_colseps(binary, scale)
    else:
        colseps = compute_white_colseps(binary, scale)

    bottom, top, boxmap = compute_gradmaps(binary, scale)
    seeds = compute_line_seeds(binary, bottom, top, colseps, scale)
    propagated = morph.propagate_labels(boxmap, seeds, conflict=0)
    spread = morph.spread_labels(seeds, maxdist=scale)
    # Where propagation left no label, fall back to the spread labels
    # restricted to the binary mask.
    merged = np.where(propagated > 0, propagated, spread * binary)
    segmentation = merged * binary

    found = compute_lines(segmentation, scale)
    order = reading_order([entry.bounds for entry in found])
    ordered_bounds = [found[idx].bounds for idx in topsort(order)]

    boxes = []
    for first, second in ordered_bounds:
        # Each bounds pair is emitted with its second slice first.
        boxes.append((second.start, first.start, second.stop, first.stop))
    return boxes
Beispiel #5
0
def segment(im, scale=None, black_colseps=False):
    """
    Find the text lines of a bi-level page image.

    Returns the absolute coordinates of every detected line, sorted by the
    computed reading order.

    Args:
        im (PIL.Image): A bi-level page of mode '1' or 'L'
        scale (float): Scale of the image; estimated from the image when
                       falsy
        black_colseps (bool): Whether column separators are assumed to be
                              vertical black lines or not

    Returns:
        [(x1, y1, x2, y2),...]: A list of tuples containing the bounding boxes
                                of the segmented lines in reading order.

    Raises:
        KrakenInputException if the input image is not binarized
    """
    if im.mode != '1':
        if not is_bitonal(im):
            raise KrakenInputException('Image is not bi-level')

    # Keep pil2array here. The original author notes that np.array(im, 'i')
    # ought to be sufficient, yet the tostring/fromstring handling inside
    # pil2array alters the array in a way the algorithm depends on.
    raw = pil2array(im)
    lowest, highest = np.amin(raw), np.amax(raw)
    above_mid = np.array(raw > 0.5*(lowest + highest), 'i')
    binary = 1 - above_mid

    if not scale:
        scale = estimate_scale(binary)

    binary = remove_hlines(binary, scale)
    if not black_colseps:
        colseps = compute_white_colseps(binary, scale)
    else:
        colseps, binary = compute_black_colseps(binary, scale)

    bottom, top, boxmap = compute_gradmaps(binary, scale)
    seeds = compute_line_seeds(binary, bottom, top, colseps, scale)
    llabels = morph.propagate_labels(boxmap, seeds, conflict=0)
    spread = morph.spread_labels(seeds, maxdist=scale)
    # Unlabelled positions take the spread label, masked by the binary image;
    # the result is masked once more to obtain the final segmentation.
    segmentation = np.where(llabels > 0, llabels, spread*binary)*binary

    detected = compute_lines(segmentation, scale)
    sequence = topsort(reading_order([d.bounds for d in detected]))
    return [
        (s2.start, s1.start, s2.stop, s1.stop)
        for s1, s2 in [detected[i].bounds for i in sequence]
    ]
Beispiel #6
0
def transcription(ctx, text_direction, scale, maxcolseps, black_colseps, font,
                  font_style, prefill, output, images, segment_page):
    """
    Build a transcription interface from page images and write it out.

    Each image is binarized when it is not bitonal. With ``segment_page`` set
    the page is segmented by ``pageseg.segment``; otherwise a single box
    covering the whole image with direction 'horizontal-tb' is used. If
    ``prefill`` is truthy the model is loaded via ``models.load_any`` and its
    recognition records are added together with the segmentation.

    Args:
        ctx: Not used by this function body.
        text_direction, scale, maxcolseps, black_colseps: Forwarded to
            ``pageseg.segment``.
        font, font_style: Forwarded to ``TranscriptionInterface``.
        prefill: Model identifier (UTF-8 encoded before loading) or a falsy
                 value to skip recognition.
        output: Target handed to ``ti.write``; its ``name`` is logged.
        images: Iterable of open image files; each one is closed after use.
        segment_page: Whether to run the page segmenter.
    """
    ti = transcribe.TranscriptionInterface(font, font_style)

    def step_done():
        # Replace the spinner glyph by a green tick and show the cursor again.
        message(u'\b\u2713', fg='green', nl=False)
        message('\033[?25h\n', nl=False)

    if prefill:
        logger.info('Loading model {}'.format(prefill))
        spin('Loading RNN')
        prefill = models.load_any(prefill.encode('utf-8'))
        step_done()

    for fp in images:
        logger.info('Reading {}'.format(fp.name))
        spin('Reading images')
        im = Image.open(fp)
        if not binarization.is_bitonal(im):
            logger.info(u'Binarizing page')
            im = binarization.nlbin(im)

        if not segment_page:
            # No segmentation requested: treat the whole image as one box.
            res = {
                'text_direction': 'horizontal-tb',
                'boxes': [(0, 0) + im.size]
            }
        else:
            logger.info(u'Segmenting page')
            res = pageseg.segment(im, text_direction, scale, maxcolseps,
                                  black_colseps)

        if not prefill:
            ti.add_page(im, res)
        else:
            preds = []
            for pred in rpred.rpred(prefill, im, res):
                logger.info('{}'.format(pred.prediction))
                spin('Recognizing')
                preds.append(pred)
            step_done()
            ti.add_page(im, res, records=preds)
        fp.close()

    step_done()
    logger.info(u'Writing transcription to {}'.format(output.name))
    spin('Writing output')
    ti.write(output)
    step_done()
Beispiel #7
0
def segment(im,
            text_direction='horizontal-lr',
            scale=None,
            maxcolseps=2,
            black_colseps=False):
    """
    Segments a page into text lines.

    Segments a page into text lines and returns the absolute coordinates of
    each line in reading order.

    Args:
        im (PIL.Image): A bi-level page of mode '1' or 'L'
        text_direction (str): Principal direction of the text
                              (horizontal-lr/rl/vertical-lr/rl)
        scale (float): Scale of the image; estimated from the image when
                       falsy
        maxcolseps (int): Maximum number of whitespace column separators
        black_colseps (bool): Whether column separators are assumed to be
                              vertical black lines or not

    Returns:
        {'text_direction': '$dir', 'boxes': [(x1, y1, x2, y2),...],
        'script_detection': False}: A dictionary containing the text
        direction, a list of reading order sorted bounding boxes under the
        key 'boxes' and the constant flag 'script_detection'. The same keys
        are present when no lines could be found ('boxes' is then empty).

    Raises:
        KrakenInputException if the input image is not binarized or the text
        direction is invalid.
    """

    if im.mode != '1' and not is_bitonal(im):
        raise KrakenInputException('Image is not bi-level')

    # rotate input image for vertical lines
    if text_direction.startswith('horizontal'):
        angle = 0
        offset = (0, 0)
    elif text_direction == 'vertical-lr':
        angle = 270
        offset = (0, im.size[1])
    elif text_direction == 'vertical-rl':
        angle = 90
        offset = (im.size[0], 0)
    else:
        raise KrakenInputException('Invalid text direction')

    im = im.rotate(angle, expand=True)

    # honestly I've got no idea what's going on here. In theory a simple
    # np.array(im, 'i') should suffice here but for some reason the
    # tostring/fromstring magic in pil2array alters the array in a way that is
    # needed for the algorithm to work correctly.
    a = pil2array(im)
    binary = np.array(a > 0.5 * (np.amin(a) + np.amax(a)), 'i')
    binary = 1 - binary

    if not scale:
        scale = estimate_scale(binary)

    binary = remove_hlines(binary, scale)
    # emptyish images will cause exceptions here.
    try:
        if black_colseps:
            colseps, binary = compute_black_colseps(binary, scale, maxcolseps)
        else:
            colseps = compute_white_colseps(binary, scale, maxcolseps)
    except ValueError:
        # Return the same keys as the regular path so callers can rely on
        # 'script_detection' being present for empty pages as well.
        return {
            'text_direction': text_direction,
            'boxes': [],
            'script_detection': False
        }

    bottom, top, boxmap = compute_gradmaps(binary, scale)
    seeds = compute_line_seeds(binary, bottom, top, colseps, scale)
    llabels = morph.propagate_labels(boxmap, seeds, conflict=0)
    spread = morph.spread_labels(seeds, maxdist=scale)
    llabels = np.where(llabels > 0, llabels, spread * binary)
    segmentation = llabels * binary

    lines = compute_lines(segmentation, scale)
    # Only the last two characters of the direction ('lr'/'rl') are passed on
    # to the reading order computation.
    order = reading_order([l.bounds for l in lines], text_direction[-2:])
    lsort = topsort(order)
    lines = [lines[i].bounds for i in lsort]
    lines = [(s2.start, s1.start, s2.stop, s1.stop) for s1, s2 in lines]
    # The boxes were computed on the rotated image; rotate them back.
    return {
        'text_direction': text_direction,
        'boxes': rotate_lines(lines, 360 - angle, offset).tolist(),
        'script_detection': False
    }
Beispiel #8
0
def pagexmllineseg(xmlfile,
                   imgpath,
                   text_direction='horizontal-lr',
                   scale=None):
    """
    Segment the text regions of a PAGE XML file into lines.

    The file is parsed, ``Coords`` elements that still use ``Point`` children
    are converted to the ``points`` attribute notation, every ``TextRegion``
    polygon is cut out of the page image and passed to ``segment``. For each
    returned line a ``TextLine`` element with a ``Coords`` child is appended
    to the region. Regions of type "drop-capital" are not segmented; they
    receive exactly one line that reuses the region's own ``points`` string.
    In the serialized output the 2010-03-19 PAGE namespace URI is replaced by
    the 2017-07-15 one.

    Args:
        xmlfile: PAGE XML input handed to ``etree.parse``.
        imgpath: Directory prefix; the image is opened from
                 ``imgpath + "/" + Page/@imageFilename``.
        text_direction: Forwarded unchanged to ``segment``.
        scale: Either a single value forwarded to ``segment`` for every
               region, or a dict mapping region types to values. In the dict
               form the key "other" is the fallback and a region whose type
               has no entry and no fallback gets ``None``.

    Returns:
        Tuple ``(xmlstring, no_lines_segm)``: the serialized document and the
        result of ``count(//TextLine)`` on the in-memory tree.

    Regions with fewer than three polygon points, regions for which
    ``cutout`` returns ``None`` and regions whose binarization raises
    ``SystemError`` are left without lines.
    """
    root = etree.parse(xmlfile).getroot()
    ns = {"ns": root.nsmap[None]}

    # convert point notation from older pagexml versions
    for c in root.xpath("//ns:Coords[not(@points)]", namespaces=ns):
        cc = []
        for point in c.xpath("./ns:Point", namespaces=ns):
            cc.append(point.attrib["x"] + "," + point.attrib["y"])
            c.remove(point)
        c.attrib["points"] = " ".join(cc)

    # Collect, per region id, the element, its type and its polygon so the
    # element does not have to be looked up again by id later on.
    coordmap = {}
    for r in root.xpath('//ns:TextRegion', namespaces=ns):
        rid = r.attrib["id"]
        # .get(): a region without a type attribute is treated as untyped
        # instead of aborting the whole page with a KeyError.
        coordmap[rid] = {"type": r.attrib.get("type"),
                         "element": r,
                         "coords": []}
        # Coords children are accepted both with and without the namespace.
        for c in r.xpath("./ns:Coords", namespaces=ns) + r.xpath("./Coords"):
            coordmap[rid]["coordstring"] = c.attrib["points"]
            coordstrings = [x.split(",") for x in c.attrib["points"].split()]
            coordmap[rid]["coords"] += [[int(x[0]), int(x[1])]
                                        for x in coordstrings]

    filename = root.xpath('//ns:Page', namespaces=ns)[0]\
        .attrib["imageFilename"]
    filename = imgpath + "/" + filename

    im = Image.open(filename)

    for rid in sorted(coordmap):
        region = coordmap[rid]
        rtype = region["type"]

        if isinstance(scale, dict):
            # Type-specific value, else the "other" fallback, else None.
            rscale = scale.get(rtype, scale.get("other"))
        else:
            rscale = scale

        coords = region["coords"]
        if len(coords) < 3:
            continue
        cropped = cutout(im, coords)
        if cropped is None:
            continue
        # Smallest x and y of the region polygon; line coordinates returned
        # for the cutout are shifted by this amount.
        offset = (min(x[0] for x in coords), min(x[1] for x in coords))

        if not binarization.is_bitonal(cropped):
            try:
                cropped = binarization.nlbin(cropped)
            except SystemError:
                continue

        if rtype == "drop-capital":
            # One line that reuses the outline of the region itself.
            line_points = [region["coordstring"]]
        else:
            result = segment(cropped,
                             text_direction=text_direction,
                             scale=rscale,
                             maxcolseps=-1)
            lines = result["lines"] if "lines" in result else []
            # The two components of every polygon point are swapped
            # (presumably (y, x) -> (x, y); confirm against ``segment``)
            # before the offset is added.
            line_points = [
                " ".join(str(p[1] + offset[0]) + "," + str(p[0] + offset[1])
                         for p in l.polygon)
                for l in lines
            ]

        textregion = region["element"]
        for n, coordstrg in enumerate(line_points, start=1):
            linexml = etree.SubElement(
                textregion,
                "TextLine",
                attrib={"id": "{}_l{:03d}".format(rid, n)})
            etree.SubElement(linexml,
                             "Coords",
                             attrib={"points": coordstrg})

    xmlstring = etree.tounicode(root.getroottree()).replace(
        "http://schema.primaresearch.org/PAGE/gts/pagecontent/2010-03-19",
        "http://schema.primaresearch.org/PAGE/gts/pagecontent/2017-07-15")
    # NOTE(review): the un-prefixed name only matches TextLine elements that
    # are in no namespace, which is how the elements above are created;
    # namespaced TextLine elements already present in the input are not
    # included in this count.
    no_lines_segm = int(root.xpath("count(//TextLine)"))
    return xmlstring, no_lines_segm