Exemple #1
0
def ocr(filename, out_basename, lang, config = ''):
    if tesseract_data_prefix:
        os.environ['TESSDATA_PREFIX'] = tesseract_data_prefix

    ls = subprocess.Popen([ tesseract_path, filename, out_basename, "-l", lang, config], stdout=subprocess.PIPE, preexec_fn=setrlimits, close_fds = True)
    text = utils.safe_read(ls.stdout)
    if text:
        print text,
    ls.wait()

    if config == '':
        out_filename = out_basename + ".txt"
    else:
        out_filename = out_basename + ".hocr"

    if not os.path.exists(out_filename) or ls.returncode:
        # in case returncode == 0
        print >> sys.stderr, "ocr.ocr() fail to exec tesseract:", ls.returncode, filename

        fd = open(out_filename, 'w')
        fd.write('An error occurred during ocr processing: '  + filename)
        fd.close()

    fd = open(out_filename)
    txt = fd.read()
    fd.close()

    if tesseract_data_prefix:
        del os.environ['TESSDATA_PREFIX']

    if ls.returncode != 0:
        print >> sys.stderr, "ocr.ocr() fail to exec tesseract:", ls.returncode, filename
        return None

    return txt
Exemple #2
0
def extract_image(opt, page_nr, filename):
    try:
        width, height = image_size(page_nr, filename)

        subsample = 1
        while (width * height) / subsample > (1 << 20) * 50:
            subsample += 1

        subsample = min(subsample, 12)
    except Exception:
        utils.print_traceback("Unable to get image size, subsample=1",
                              filename)
        subsample = 1

    if subsample != 1:
        print "subsample", subsample

    tiff_name = opt.temp_tiff_dir + '/page_%04d.tif' % page_nr
    ddjvu = djvulibre_path + 'ddjvu'
    ls = subprocess.Popen([
        ddjvu, "-format=tiff",
        "-page=%d" % page_nr,
        "-subsample=%d" % subsample, filename, tiff_name
    ],
                          stdout=subprocess.PIPE,
                          preexec_fn=setrlimits,
                          close_fds=True)
    text = utils.safe_read(ls.stdout)
    if text:
        print text
    ls.wait()
    if ls.returncode != 0:
        print >> sys.stderr, "extract_image fail: ", ls.returncode, filename, page_nr
        return None
    return tiff_name
Exemple #3
0
def extract_image(opt, page_nr, filename):
    try:
        width, height = image_size(page_nr, filename)

        subsample = 1
        while (width*height) / subsample > (1 << 20) * 50:
            subsample += 1

        subsample = min(subsample, 12)
    except Exception:
        utils.print_traceback("Unable to get image size, subsample=1", filename)
        subsample = 1

    if subsample != 1:
        print "subsample", subsample

    tiff_name = opt.out_dir + 'page_%04d.tif' % page_nr
    ddjvu = djvulibre_path + 'ddjvu'
    ls = subprocess.Popen([ ddjvu, "-format=tiff", "-page=%d" % page_nr, "-subsample=%d" % subsample, filename, tiff_name], stdout=subprocess.PIPE, preexec_fn=setrlimits, close_fds = True)
    text = utils.safe_read(ls.stdout)
    if text:
        print text
    ls.wait()
    if ls.returncode != 0:
        print >> sys.stderr, "extract_image fail: ", ls.returncode, filename, page_nr
        return None
    return tiff_name
Exemple #4
0
def get_nr_pages_djvu(filename):
    """Return the number of pages of a DjVu file, as reported by djvused.

    Runs ``djvused -e n <filename>`` and converts its standard output to an
    int. Returns None (after logging to stderr) when djvused exits with a
    non-zero status.
    """
    command = [djvulibre_path + 'djvused', "-e", "n", filename]
    proc = subprocess.Popen(command, stdout=subprocess.PIPE,
                            preexec_fn=setrlimits, close_fds=True)
    output = utils.safe_read(proc.stdout)
    proc.wait()

    if proc.returncode == 0:
        return int(output)

    print >> sys.stderr, "Error: djvused fail to exec", proc.returncode
    return None
Exemple #5
0
def image_size(page_nr, filename):
    """Return the (width, height) of one page of a DjVu file.

    Runs ``djvused -e "select <page_nr>; size" <filename>`` and parses the
    ``width=<w> height=<h>`` pair from its standard output.

    Returns a tuple of two ints, or None (after logging to stderr) when
    djvused exits with a non-zero status or its output does not contain
    the expected width/height text.
    """
    djvused = djvulibre_path + 'djvused'
    ls = subprocess.Popen([ djvused, "-e", "select %d; size" % page_nr, filename], stdout=subprocess.PIPE, preexec_fn=setrlimits, close_fds = True)
    text = utils.safe_read(ls.stdout)
    ls.wait()
    if ls.returncode != 0:
        print >> sys.stderr, "Error: djvused fail to exec", ls.returncode
        return None

    # Raw string so that \d is a regex escape, not a string escape.
    # "text or ''" guards against an empty/None read result.
    match = re.search(r'width=(\d+) height=(\d+)', text or '')
    if match is None:
        # Fail the same way as the non-zero exit path instead of raising an
        # opaque AttributeError on match.group().
        print >> sys.stderr, "Error: unable to parse djvused size output", filename, page_nr
        return None
    return int(match.group(1)), int(match.group(2))