Beispiel #1
0
def extract_image(opt, page_nr, filename):
    try:
        width, height = image_size(page_nr, filename)

        subsample = 1
        while (width*height) / subsample > (1 << 20) * 50:
            subsample += 1

        subsample = min(subsample, 12)
    except Exception:
        utils.print_traceback("Unable to get image size, subsample=1", filename)
        subsample = 1

    if subsample != 1:
        print "subsample", subsample

    tiff_name = opt.out_dir + 'page_%04d.tif' % page_nr
    ddjvu = djvulibre_path + 'ddjvu'
    ls = subprocess.Popen([ ddjvu, "-format=tiff", "-page=%d" % page_nr, "-subsample=%d" % subsample, filename, tiff_name], stdout=subprocess.PIPE, preexec_fn=setrlimits, close_fds = True)
    text = utils.safe_read(ls.stdout)
    if text:
        print text
    ls.wait()
    if ls.returncode != 0:
        print >> sys.stderr, "extract_image fail: ", ls.returncode, filename, page_nr
        return None
    return tiff_name
Beispiel #2
0
def ocr(filename, out_basename, lang, config = ''):
    if tesseract_data_prefix:
        os.environ['TESSDATA_PREFIX'] = tesseract_data_prefix

    ls = subprocess.Popen([ tesseract_path, filename, out_basename, "-l", lang, config], stdout=subprocess.PIPE, preexec_fn=setrlimits, close_fds = True)
    text = utils.safe_read(ls.stdout)
    if text:
        print text,
    ls.wait()

    if config == '':
        out_filename = out_basename + ".txt"
    else:
        out_filename = out_basename + ".hocr"

    if not os.path.exists(out_filename) or ls.returncode:
        # in case returncode == 0
        print >> sys.stderr, "ocr.ocr() fail to exec tesseract:", ls.returncode, filename

        fd = open(out_filename, 'w')
        fd.write('An error occurred during ocr processing: '  + filename)
        fd.close()

    fd = open(out_filename)
    txt = fd.read()
    fd.close()

    if tesseract_data_prefix:
        del os.environ['TESSDATA_PREFIX']

    if ls.returncode != 0:
        print >> sys.stderr, "ocr.ocr() fail to exec tesseract:", ls.returncode, filename
        return None

    return txt
Beispiel #3
0
def ocr(filename, out_basename, lang, config = ''):
    if tesseract_data_prefix:
        os.environ['TESSDATA_PREFIX'] = tesseract_data_prefix

    ls = subprocess.Popen([ tesseract_path, filename, out_basename, "-l", lang, config], stdout=subprocess.PIPE, preexec_fn=setrlimits, close_fds = True)
    text = utils.safe_read(ls.stdout)
    if text:
        print text,
    ls.wait()

    if config == '':
        out_filename = out_basename + ".txt"
    else:
        out_filename = out_basename + ".html"

    if not os.path.exists(out_filename) and not ls.returncode:
        # in case returncode == 0
        print >> sys.stderr, "ocr.ocr() fail to exec tesseract:", ls.returncode, filename

        fd = open(out_filename, 'w')
        fd.write('An error occurred during ocr processing: '  + filename)
        fd.close()

    fd = open(out_filename)
    txt = fd.read()
    fd.close()

    if tesseract_data_prefix:
        del os.environ['TESSDATA_PREFIX']

    if ls.returncode != 0:
        print >> sys.stderr, "ocr.ocr() fail to exec tesseract:", ls.returncode, filename
        return None

    return txt
Beispiel #4
0
def extract_image(opt, page_nr, filename):
    try:
        width, height = image_size(page_nr, filename)

        subsample = 1
        while (width*height) / subsample > (1 << 20) * 50:
            subsample += 1

        subsample = min(subsample, 12)
    except Exception:
        utils.print_traceback("Unable to get image size, subsample=1", filename)
        subsample = 1

    if subsample != 1:
        print "subsample", subsample

    tiff_name = opt.out_dir + 'page_%04d.tif' % page_nr
    ddjvu = djvulibre_path + 'ddjvu'
    ls = subprocess.Popen([ ddjvu, "-format=tiff", "-page=%d" % page_nr, "-subsample=%d" % subsample, filename, tiff_name], stdout=subprocess.PIPE, preexec_fn=setrlimits, close_fds = True)
    text = utils.safe_read(ls.stdout)
    if text:
        print text
    ls.wait()
    if ls.returncode != 0:
        print >> sys.stderr, "extract_image fail: ", ls.returncode, filename, page_nr
        return None
    return tiff_name
Beispiel #5
0
def get_nr_pages_djvu(filename):
    """Return the page count of a DjVu file as reported by djvused.

    Runs ``djvused -e n <filename>`` and converts its standard output to an
    int.  Returns None, after writing a message to stderr, when djvused
    exits with a non-zero status.
    """
    command = [djvulibre_path + 'djvused', "-e", "n", filename]
    proc = subprocess.Popen(command, stdout=subprocess.PIPE,
                            preexec_fn=setrlimits, close_fds=True)

    # Drain stdout before waiting on the child.
    output = utils.safe_read(proc.stdout)
    proc.wait()

    if proc.returncode == 0:
        return int(output)

    print >> sys.stderr, "Error: djvused fail to exec", proc.returncode
    return None
Beispiel #6
0
def get_nr_pages_djvu(filename):
    """Return the page count of a DjVu file as reported by djvused.

    Runs ``djvused -e n <filename>`` and converts its standard output to an
    int.  Returns None, after writing a message to stderr, when djvused
    exits with a non-zero status.
    """
    command = [djvulibre_path + 'djvused', "-e", "n", filename]
    proc = subprocess.Popen(command, stdout=subprocess.PIPE,
                            preexec_fn=setrlimits, close_fds=True)

    # Drain stdout before waiting on the child.
    output = utils.safe_read(proc.stdout)
    proc.wait()

    if proc.returncode == 0:
        return int(output)

    print >> sys.stderr, "Error: djvused fail to exec", proc.returncode
    return None
Beispiel #7
0
def image_size(page_nr, filename):
    """Return the (width, height) of one page of a DjVu file.

    Runs ``djvused -e "select <page_nr>; size" <filename>`` and parses the
    ``width=<digits> height=<digits>`` pair from its standard output.

    Returns a tuple of two ints, or None (after writing a message to
    stderr) when djvused exits with a non-zero status or its output does
    not contain the expected size information.
    """
    djvused = djvulibre_path + 'djvused'
    ls = subprocess.Popen([ djvused, "-e", "select %d; size" % page_nr, filename], stdout=subprocess.PIPE, preexec_fn=setrlimits, close_fds = True)
    text = utils.safe_read(ls.stdout)
    ls.wait()
    if ls.returncode != 0:
        print >> sys.stderr, "Error: djvused fail to exec", ls.returncode
        return None

    match = re.search(r'width=(\d+) height=(\d+)', text or '')
    if match is None:
        # Report unparsable output explicitly; previously this surfaced as
        # an AttributeError raised by calling .group() on None.
        print >> sys.stderr, "Error: djvused returned no page size", filename, page_nr
        return None

    return int(match.group(1)), int(match.group(2))
Beispiel #8
0
def image_size(page_nr, filename):
    """Return the (width, height) of one page of a DjVu file.

    Runs ``djvused -e "select <page_nr>; size" <filename>`` and parses the
    ``width=<digits> height=<digits>`` pair from its standard output.

    Returns a tuple of two ints, or None (after writing a message to
    stderr) when djvused exits with a non-zero status or its output does
    not contain the expected size information.
    """
    djvused = djvulibre_path + 'djvused'
    ls = subprocess.Popen([ djvused, "-e", "select %d; size" % page_nr, filename], stdout=subprocess.PIPE, preexec_fn=setrlimits, close_fds = True)
    text = utils.safe_read(ls.stdout)
    ls.wait()
    if ls.returncode != 0:
        print >> sys.stderr, "Error: djvused fail to exec", ls.returncode
        return None

    match = re.search(r'width=(\d+) height=(\d+)', text or '')
    if match is None:
        # Report unparsable output explicitly; previously this surfaced as
        # an AttributeError raised by calling .group() on None.
        print >> sys.stderr, "Error: djvused returned no page size", filename, page_nr
        return None

    return int(match.group(1)), int(match.group(2))
Beispiel #9
0
'''
Extract the speech from every transcript in the dataset and save each result
in the transcripts/ folder, under the same file name as the source transcript.
'''
from utils import get_all_transcript_paths, safe_read
from extract_speech import extract_speech_string
from tqdm import tqdm

if __name__ == '__main__':
    import os

    # open() does not create missing directories, so make sure the output
    # folder exists before the first transcript is written.
    os.makedirs('transcripts', exist_ok=True)

    # The paths are materialised into a list before being handed to tqdm
    # (presumably so the progress bar can show a total).
    for path in tqdm(list(get_all_transcript_paths())):
        transcript = extract_speech_string(safe_read(path))

        # Each output file is named after the last component of its source
        # path; sources sharing that name overwrite one another.
        with open(f'transcripts/{path.parts[-1]}', 'w') as f:
            f.write(transcript)