Python OCRRecord.img Examples

Programming Language: Python

Namespace/Package Name: nidaba.tei

Class/Type: OCRRecord

Method/Function: img

Examples at hotexamples.com: 4

Python OCRRecord.img - 4 examples found. These are the top rated real world Python examples of nidaba.tei.OCRRecord.img extracted from open source projects. You can rate examples to help us improve the quality of examples.

Frequently Used Methods

Show Hide

OCRRecord(12)

load_tei(10)

add_respstmt(4)

add_graphemes(3)

add_line(2)

add_segment(2)

clear_graphemes(2)

clear_segments(2)

dimensions(2)

img(2)

scope_line(2)

title(2)

load_hocr(1)

Example #1

Show file

File: kraken.py Project: brunsgaard/nidaba

def segmentation_kraken(doc, method=u'segment_kraken', black_colseps=False):
    """ 
    Performs page segmentation using kraken's built-in algorithm and writes a
    skeleton TEI file.

    Args:
        doc (unicode, unicode): The input document tuple
        method (unicode): The suffix string append to all output files
        black_colseps (bool): Assume black column separator instead of white
        ones.

    Returns:
        Two storage tuples with the first one containing the segmentation and
        the second one being the file the segmentation was calculated upon.
    """

    input_path = storage.get_abs_path(*doc)
    output_path, ext = os.path.splitext(
        storage.insert_suffix(input_path, method))
    logger.debug('Reading image using PIL')
    img = Image.open(input_path)
    with open(output_path + '.xml', 'w') as fp:
        logger.debug('Initializing TEI with {} ({} {})'.format(
            doc[1], *img.size))
        tei = OCRRecord()
        tei.img = storage.get_url(*doc)
        tei.dimensions = img.size
        tei.title = os.path.basename(doc[1])
        tei.add_respstmt('kraken', 'page segmentation')
        for seg in pageseg.segment(img, black_colseps=black_colseps)['boxes']:
            logger.debug('Found line at {} {} {} {}'.format(*seg))
            tei.add_line(seg)
        logger.debug('Write segmentation to {}'.format(fp.name))
        tei.write_tei(fp)
    return storage.get_storage_path(output_path + '.xml')

Example #2

Show file

File: kraken.py Project: OpenPhilology/nidaba

def segmentation_kraken(doc, method=u'segment_kraken', black_colseps=False):
    """ 
    Performs page segmentation using kraken's built-in algorithm and writes a
    skeleton TEI file.

    Args:
        doc (unicode, unicode): The input document tuple
        method (unicode): The suffix string append to all output files
        black_colseps (bool): Assume black column separator instead of white
        ones.

    Returns:
        Two storage tuples with the first one containing the segmentation and
        the second one being the file the segmentation was calculated upon.
    """

    input_path = storage.get_abs_path(*doc)
    output_path, ext = os.path.splitext(storage.insert_suffix(input_path,
                                        method))
    logger.debug('Reading image using PIL')
    img = Image.open(input_path)
    with open(output_path + '.xml', 'w') as fp:
        logger.debug('Initializing TEI with {} ({} {})'.format(doc[1], *img.size))
        tei = OCRRecord()
        tei.img = storage.get_url(*doc)
        tei.dimensions = img.size
        tei.title = os.path.basename(doc[1])
        tei.add_respstmt('kraken', 'page segmentation')
        for seg in pageseg.segment(img, black_colseps=black_colseps)['boxes']:
            logger.debug('Found line at {} {} {} {}'.format(*seg))
            tei.add_line(seg)
        logger.debug('Write segmentation to {}'.format(fp.name))
        tei.write_tei(fp)
    return storage.get_storage_path(output_path + '.xml')

Example #3

Show file

File: tesseract.py Project: kamilc/nidaba

def segmentation_tesseract(doc, method=u'segment_tesseract'):
    """
    Performs page segmentation using tesseract's built-in algorithm and writes
    a TEI XML segmentation file.

    Args:
        doc (unicode, unicode): The input document tuple
        method (unicode): The suffix string appended to all output files.

    Returns:
        Two storage tuples with the first one containing the segmentation and
        the second one being the file the segmentation was calculated upon.
    """
    input_path = storage.get_abs_path(*doc)
    output_path = splitext(storage.insert_suffix(input_path,
                                                 method))[0] + '.xml'

    ver = tesseract.TessVersion()
    if int(ver.split('.')[0]) < 3 or int(ver.split('.')[1]) < 2:
        raise NidabaTesseractException('libtesseract version is too old. Set '
                                       'implementation to direct.')

    # tesseract has a tendency to crash arbitrarily on some inputs
    # necessitating execution in a separate process to ensure the worker
    # doesn't just die. We use fork as the multiprocessing module thinks
    # programmers are too stupid to reap their children.
    logger.info('Forking before entering unstable ctypes code')
    pid = os.fork()
    if pid != 0:
        try:
            logger.info('Waiting for child to complete')
            _, status = os.waitpid(pid, 0)
        except OSError as e:
            if e.errno not in (errno.EINTR, errno.ECHILD):
                raise
            return storage.get_storage_path(output_path)
        if os.WIFSIGNALED(status):
            raise NidabaTesseractException(
                'Tesseract killed by signal: {0}'.format(os.WTERMSIG(status)))
        return storage.get_storage_path(output_path)

    api = tesseract.TessBaseAPICreate()
    rc = tesseract.TessBaseAPIInit3(api, tessdata.encode('utf-8'), None)
    if (rc):
        tesseract.TessBaseAPIDelete(api)
        raise NidabaTesseractException('Tesseract initialization failed.')

    # only do segmentation and script detection
    logger.debug('Setting page set mode to 2')
    tesseract.TessBaseAPISetPageSegMode(api, 2)

    logger.debug('Reading {} using leptonica'.format(input_path))
    pix = leptonica.pixRead(input_path.encode('utf-8'))
    logger.debug('Setting PIX as input image')
    tesseract.TessBaseAPISetImage2(api, pix)
    logger.debug('Analyzing page layout')
    it = tesseract.TessBaseAPIAnalyseLayout(api)
    logger.debug('Destroying PIX')
    leptonica.pixDestroy(ctypes.byref(pix))
    x0, y0, x1, y1 = (ctypes.c_int(), ctypes.c_int(), ctypes.c_int(),
                      ctypes.c_int())

    w, h = Image.open(input_path).size
    logger.info('Initializing TEI XML file with {}x{} {}/{}'.format(
        w, h, *doc))
    tei = OCRRecord()
    tei.dimensions = (w, h)
    tei.img = storage.get_url(*doc)
    tei.title = os.path.basename(doc[1])
    tei.add_respstmt('tesseract', 'page segmentation')

    while True:
        if tesseract.TessPageIteratorIsAtBeginningOf(it, RIL_TEXTLINE):
            tesseract.TessPageIteratorBoundingBox(it, RIL_TEXTLINE,
                                                  ctypes.byref(x0),
                                                  ctypes.byref(y0),
                                                  ctypes.byref(x1),
                                                  ctypes.byref(y1))
            tei.add_line((x0.value, y0.value, x1.value, y1.value))
            logger.debug('Segmenter found new line at {} {} {} {}'.format(
                x0.value, y0.value, x1.value, y1.value))
        if tesseract.TessPageIteratorIsAtBeginningOf(it, RIL_WORD):
            tesseract.TessPageIteratorBoundingBox(it, RIL_WORD,
                                                  ctypes.byref(x0),
                                                  ctypes.byref(y0),
                                                  ctypes.byref(x1),
                                                  ctypes.byref(y1))
            tei.add_segment((x0.value, y0.value, x1.value, y1.value))
            logger.debug('Segmenter found new word at {} {} {} {}'.format(
                x0.value, y0.value, x1.value, y1.value))

        tesseract.TessPageIteratorBoundingBox(it, RIL_SYMBOL, ctypes.byref(x0),
                                              ctypes.byref(y0),
                                              ctypes.byref(x1),
                                              ctypes.byref(y1))
        tei.add_graphemes([{
            'grapheme': '',
            'bbox': (x0.value, y0.value, x1.value, y1.value)
        }])
        logger.debug('Segmenter found new symbol at {} {} {} {}'.format(
            x0.value, y0.value, x1.value, y1.value))
        if not tesseract.TessPageIteratorNext(it, RIL_SYMBOL):
            logger.debug('No more elements on page')
            break
    logger.debug('Deleting page iterator and base API')
    tesseract.TessPageIteratorDelete(it)
    tesseract.TessBaseAPIEnd(api)
    tesseract.TessBaseAPIDelete(api)
    logger.info('Writing segmentation to {}'.format(output_path))
    with open(output_path, 'w') as fp:
        tei.write_tei(fp)
    logger.info('Quitting child process')
    os._exit(os.EX_OK)
    return storage.get_storage_path(output_path)

Example #4

Show file

File: tesseract.py Project: OpenPhilology/nidaba

def segmentation_tesseract(doc, method=u'segment_tesseract'):
    """
    Performs page segmentation using tesseract's built-in algorithm and writes
    a TEI XML segmentation file.

    Args:
        doc (unicode, unicode): The input document tuple
        method (unicode): The suffix string appended to all output files.

    Returns:
        Two storage tuples with the first one containing the segmentation and
        the second one being the file the segmentation was calculated upon.
    """
    input_path = storage.get_abs_path(*doc)
    output_path = splitext(storage.insert_suffix(input_path, method))[0] + '.xml'

    ver = tesseract.TessVersion()
    if int(ver.split('.')[0]) < 3 or int(ver.split('.')[1]) < 2:
        raise NidabaTesseractException('libtesseract version is too old. Set '
                                       'implementation to direct.')

    # tesseract has a tendency to crash arbitrarily on some inputs
    # necessitating execution in a separate process to ensure the worker
    # doesn't just die. We use fork as the multiprocessing module thinks
    # programmers are too stupid to reap their children.
    logger.info('Forking before entering unstable ctypes code')
    pid = os.fork()
    if pid != 0:
        try:
            logger.info('Waiting for child to complete')
            _, status = os.waitpid(pid, 0)
        except OSError as e:
            if e.errno not in (errno.EINTR, errno.ECHILD):
                raise
            return storage.get_storage_path(output_path)
        if os.WIFSIGNALED(status):
            raise NidabaTesseractException('Tesseract killed by signal: {0}'.format(os.WTERMSIG(status)))
        return storage.get_storage_path(output_path)

    api = tesseract.TessBaseAPICreate()
    rc = tesseract.TessBaseAPIInit3(api, tessdata.encode('utf-8'), None)
    if (rc):
        tesseract.TessBaseAPIDelete(api)
        raise NidabaTesseractException('Tesseract initialization failed.')

    # only do segmentation and script detection
    logger.debug('Setting page set mode to 2')
    tesseract.TessBaseAPISetPageSegMode(api, 2)

    logger.debug('Reading {} using leptonica'.format(input_path))
    pix = leptonica.pixRead(input_path.encode('utf-8'))
    logger.debug('Setting PIX as input image')
    tesseract.TessBaseAPISetImage2(api, pix)
    logger.debug('Analyzing page layout')
    it = tesseract.TessBaseAPIAnalyseLayout(api)
    logger.debug('Destroying PIX')
    leptonica.pixDestroy(ctypes.byref(pix))
    x0, y0, x1, y1 = (ctypes.c_int(), ctypes.c_int(), ctypes.c_int(),
                      ctypes.c_int())

    w, h = Image.open(input_path).size
    logger.info('Initializing TEI XML file with {}x{} {}/{}'.format(w, h, *doc))
    tei = OCRRecord()
    tei.dimensions = (w, h)
    tei.img = storage.get_url(*doc)
    tei.title = os.path.basename(doc[1])
    tei.add_respstmt('tesseract', 'page segmentation')

    while True:
        if tesseract.TessPageIteratorIsAtBeginningOf(it, RIL_TEXTLINE):
            tesseract.TessPageIteratorBoundingBox(it,
                                                  RIL_TEXTLINE,
                                                  ctypes.byref(x0),
                                                  ctypes.byref(y0),
                                                  ctypes.byref(x1),
                                                  ctypes.byref(y1))
            tei.add_line((x0.value, y0.value, x1.value, y1.value))
            logger.debug('Segmenter found new line at {} {} {} {}'.format(x0.value,
                                                                          y0.value,
                                                                          x1.value,
                                                                          y1.value))
        if tesseract.TessPageIteratorIsAtBeginningOf(it, RIL_WORD):
            tesseract.TessPageIteratorBoundingBox(it,
                                                  RIL_WORD,
                                                  ctypes.byref(x0),
                                                  ctypes.byref(y0),
                                                  ctypes.byref(x1),
                                                  ctypes.byref(y1))
            tei.add_segment((x0.value, y0.value, x1.value, y1.value))
            logger.debug('Segmenter found new word at {} {} {} {}'.format(x0.value,
                                                                          y0.value,
                                                                          x1.value,
                                                                          y1.value))

        tesseract.TessPageIteratorBoundingBox(it,
                                              RIL_SYMBOL,
                                              ctypes.byref(x0),
                                              ctypes.byref(y0),
                                              ctypes.byref(x1),
                                              ctypes.byref(y1))
        tei.add_graphemes([{'grapheme': '', 'bbox': (x0.value, y0.value, x1.value, y1.value)}])
        logger.debug('Segmenter found new symbol at {} {} {} {}'.format(x0.value,
                                                                        y0.value,
                                                                        x1.value,
                                                                        y1.value))
        if not tesseract.TessPageIteratorNext(it, RIL_SYMBOL):
            logger.debug('No more elements on page')
            break
    logger.debug('Deleting page iterator and base API')
    tesseract.TessPageIteratorDelete(it)
    tesseract.TessBaseAPIEnd(api)
    tesseract.TessBaseAPIDelete(api)
    logger.info('Writing segmentation to {}'.format(output_path))
    with open(output_path, 'w') as fp:
        tei.write_tei(fp)
    logger.info('Quitting child process')
    os._exit(os.EX_OK)
    return storage.get_storage_path(output_path)