def hocr2pdf(input_file, output_file=None, working_dir=None, font="Courier",
             author=None, keywords=None, subject=None, title=None,
             draft=False, pdfopt=True, **dummy):
    """
    Build a PDF out of an hOCR file.

    @param input_file the path of the hOCR file to read.
    @param output_file the requested path of the resulting PDF. It is handed
        to prepare_io when pdfopt is False, or to pdf2pdfopt when pdfopt is
        True.
    @param working_dir the directory containing images to build the PDF.
        When not set, the directory containing input_file is used.
    @param font the default font (e.g. Courier, Times-Roman).
    @param author the author name.
    @param keywords the keywords of the document (forwarded to create_pdf).
    @param subject the subject of the document.
    @param title the title of the document.
    @param draft whether to enable debug information in the output.
    @param pdfopt whether to pass the generated PDF through pdf2pdfopt; the
        intermediate PDF is then removed.
    @param dummy extra keyword arguments are accepted and ignored.
    @return the value returned by pdf2pdfopt when pdfopt is True, otherwise
        the output path computed by prepare_io.
    @raise Exception any error raised while reading the hOCR file or while
        creating the PDF is registered via register_exception and re-raised.
    """
    # The image directory must be resolved from the caller-supplied
    # input_file, i.e. before prepare_io possibly rewrites that path.
    if working_dir:
        working_dir = os.path.abspath(working_dir)
    else:
        working_dir = os.path.abspath(os.path.dirname(input_file))

    if pdfopt:
        # Render into an intermediate '.pdf' file; output_file is only
        # consumed later by pdf2pdfopt.
        input_file, tmp_output_file, dummy = prepare_io(input_file, output_ext='.pdf', need_working_dir=False)
    else:
        input_file, output_file, dummy = prepare_io(input_file, output_file=output_file, need_working_dir=False)
        tmp_output_file = output_file

    try:
        # Read through a context manager so the file handle is closed
        # deterministically, even when reading or PDF creation fails.
        with open(input_file) as hocr_file:
            hocr_content = hocr_file.read()
        create_pdf(extract_hocr(hocr_content), tmp_output_file, font=font, author=author, keywords=keywords, subject=subject, title=title, image_path=working_dir, draft=draft)
    except Exception:
        # Record the failure, then let the caller deal with it.
        # KeyboardInterrupt / SystemExit are deliberately not intercepted.
        register_exception()
        raise

    if not pdfopt:
        return tmp_output_file

    output_file = pdf2pdfopt(tmp_output_file, output_file)
    os.remove(tmp_output_file)
    return output_file
def _get_words_from_hocr():
    """
    Return the flat list of words found in the hOCR file C{input_file}.

    C{input_file} is not a parameter: it is read as a free variable and must
    therefore be defined in an enclosing (or module-level) scope.
    NOTE(review): that scope is not visible from here -- confirm.

    The result of extract_hocr is iterated as 3-item entries whose last item
    is an iterable of 2-item entries; the second item of each of those is the
    text of a line, which is split on whitespace.

    @return the list of all the words, in reading order.
    """
    # Use a context manager so the file handle is closed as soon as the
    # content has been read, instead of relying on garbage collection.
    with open(input_file) as hocr_file:
        hocr = extract_hocr(hocr_file.read())

    ret = []
    for dummy, dummy, lines in hocr:
        for dummy, line in lines:
            # split() without arguments already discards leading, trailing
            # and repeated whitespace, so the tokens need no further strip().
            ret.extend(line.split())
    return ret