def hocr2pdf(input_file, output_file=None, working_dir=None, font="Courier", author=None, keywords=None, subject=None, title=None, draft=False, pdfopt=True, **dummy):
    """
    Build a PDF from the hOCR file C{input_file} by handing the result of
    C{extract_hocr} to C{create_pdf}.

    @param input_file the path of the hOCR file to read.
    @param output_file the desired path of the resulting PDF; it is resolved
        through C{prepare_io} (when pdfopt is false) or handed to
        C{pdf2pdfopt} (when pdfopt is true).
    @param working_dir the directory containing images to build the PDF.
        Defaults to the directory of C{input_file}.
    @param font the default font (e.g. Courier, Times-Roman).
    @param author the author name.
    @param keywords the keywords of the document (forwarded to C{create_pdf}).
    @param subject the subject of the document.
    @param title the title of the document.
    @param draft whether to enable debug information in the output.
    @param pdfopt when true, the PDF is first written to a temporary file
        which is then passed through C{pdf2pdfopt} and removed afterwards.
    @return the path of the produced PDF: the value returned by
        C{pdf2pdfopt} when pdfopt is true, otherwise the output path
        returned by C{prepare_io}.
    """
    if working_dir:
        working_dir = os.path.abspath(working_dir)
    else:
        working_dir = os.path.abspath(os.path.dirname(input_file))

    if pdfopt:
        # The caller's output_file is kept untouched here: it is only used
        # as the destination of pdf2pdfopt() further down.
        input_file, tmp_output_file, _unused = prepare_io(input_file, output_ext='.pdf', need_working_dir=False)
    else:
        input_file, output_file, _unused = prepare_io(input_file, output_file=output_file, need_working_dir=False)
        tmp_output_file = output_file

    try:
        # Read through a context manager so the handle is closed even if
        # extract_hocr()/create_pdf() fail.
        with open(input_file) as hocr_file:
            hocr_content = hocr_file.read()
        create_pdf(extract_hocr(hocr_content), tmp_output_file, font=font, author=author, keywords=keywords, subject=subject, title=title, image_path=working_dir, draft=draft)
    except:
        # Deliberately catches everything: the error is only registered and
        # then re-raised unchanged, so nothing is swallowed.
        register_exception()
        raise

    if not pdfopt:
        return tmp_output_file

    try:
        return pdf2pdfopt(tmp_output_file, output_file)
    finally:
        # Remove the intermediate PDF whether or not pdf2pdfopt() succeeded,
        # so a failure there does not leave a temporary file behind.
        if os.path.exists(tmp_output_file):
            os.remove(tmp_output_file)
def hocr2pdf(input_file, output_file=None, working_dir=None, font="Courier", author=None, keywords=None, subject=None, title=None, draft=False, pdfopt=True, **dummy):
    """
    Build a PDF from the hOCR file C{input_file} by handing the result of
    C{extract_hocr} to C{create_pdf}.

    @param input_file the path of the hOCR file to read.
    @param output_file the desired path of the resulting PDF; it is resolved
        through C{prepare_io} (when pdfopt is false) or handed to
        C{pdf2pdfopt} (when pdfopt is true).
    @param working_dir the directory containing images to build the PDF.
        Defaults to the directory of C{input_file}.
    @param font the default font (e.g. Courier, Times-Roman).
    @param author the author name.
    @param keywords the keywords of the document (forwarded to C{create_pdf}).
    @param subject the subject of the document.
    @param title the title of the document.
    @param draft whether to enable debug information in the output.
    @param pdfopt when true, the PDF is first written to a temporary file
        which is then passed through C{pdf2pdfopt} and removed afterwards.
    @return the path of the produced PDF: the value returned by
        C{pdf2pdfopt} when pdfopt is true, otherwise the output path
        returned by C{prepare_io}.
    """
    if working_dir:
        working_dir = os.path.abspath(working_dir)
    else:
        working_dir = os.path.abspath(os.path.dirname(input_file))

    if pdfopt:
        # The caller's output_file is kept untouched here: it is only used
        # as the destination of pdf2pdfopt() further down.
        input_file, tmp_output_file, _unused = prepare_io(input_file, output_ext='.pdf', need_working_dir=False)
    else:
        input_file, output_file, _unused = prepare_io(input_file, output_file=output_file, need_working_dir=False)
        tmp_output_file = output_file

    try:
        # Read through a context manager so the handle is closed even if
        # extract_hocr()/create_pdf() fail.
        with open(input_file) as hocr_file:
            hocr_content = hocr_file.read()
        create_pdf(extract_hocr(hocr_content), tmp_output_file, font=font, author=author, keywords=keywords, subject=subject, title=title, image_path=working_dir, draft=draft)
    except:
        # Deliberately catches everything: the error is only registered and
        # then re-raised unchanged, so nothing is swallowed.
        register_exception()
        raise

    if not pdfopt:
        return tmp_output_file

    try:
        return pdf2pdfopt(tmp_output_file, output_file)
    finally:
        # Remove the intermediate PDF whether or not pdf2pdfopt() succeeded,
        # so a failure there does not leave a temporary file behind.
        if os.path.exists(tmp_output_file):
            os.remove(tmp_output_file)
 def _get_words_from_hocr():
     """
     Return the flat list of whitespace-separated words found in the hOCR
     file named by C{input_file}.

     C{input_file} is not defined in this block; it is taken from the
     surrounding scope.  The value returned by C{extract_hocr} is iterated
     as 3-tuples whose last item is an iterable of 2-tuples; the second
     item of each of those is the text of a line.

     @return a list of words, in the order they are encountered.
     """
     # Close the file as soon as its content has been read.
     with open(input_file) as hocr_file:
         hocr = extract_hocr(hocr_file.read())
     words = []
     for dummy, dummy, lines in hocr:
         for dummy, line in lines:
             # split() without arguments already yields tokens free of
             # surrounding whitespace, so no further strip() is needed.
             words.extend(line.split())
     return words
 def _get_words_from_hocr():
     """
     Return the flat list of whitespace-separated words found in the hOCR
     file named by C{input_file}.

     C{input_file} is not defined in this block; it is taken from the
     surrounding scope.  The value returned by C{extract_hocr} is iterated
     as 3-tuples whose last item is an iterable of 2-tuples; the second
     item of each of those is the text of a line.

     @return a list of words, in the order they are encountered.
     """
     # Close the file as soon as its content has been read.
     with open(input_file) as hocr_file:
         hocr = extract_hocr(hocr_file.read())
     words = []
     for dummy, dummy, lines in hocr:
         for dummy, line in lines:
             # split() without arguments already yields tokens free of
             # surrounding whitespace, so no further strip() is needed.
             words.extend(line.split())
     return words