def scan(dir):
    """
    Main routine. Scan for downloaded PDF files (search reports) and run the
    processing routines on each of them.

    "dir" (with a trailing "/" appended) is first passed to
    _move_to_data_folder. Afterwards every directory matching
    config.data_dir + '*' whose name does not start with "_" is visited.
    For each "*.pdf" in such a folder the text lines are read from the
    ".rawtext" file of the same base name if it can be opened; otherwise
    they are obtained by OCR (ocr.pdf2jpg followed by ocr.tesseract).
    The lines are scanned for patent numbers and references, and the result
    is handed to _write_output. Finally _tiff_to_jpg is called on the folder.

    The parameter name "dir" shadows the builtin; it is kept so that callers
    passing it by keyword continue to work.
    """
    _move_to_data_folder(dir + "/")

    for folder in glob.glob(config.data_dir + '*'):
        # Only real directories are processed; names starting with '_' are
        # skipped.
        if not os.path.isdir(folder) or \
                os.path.basename(folder).startswith('_'):
            continue

        for pdf in glob.glob(folder + "/*.pdf"):
            rawtext_path = folder + '/' + \
                os.path.basename(pdf).replace('.pdf', '.rawtext')
            try:
                # Close the handle deterministically instead of leaking it.
                with open(rawtext_path) as rawtext_file:
                    lines = rawtext_file.readlines()
            except (IOError, OSError):
                # The rawtext file is missing or unreadable: fall back to OCR.
                lines = ocr.tesseract(ocr.pdf2jpg(pdf))

            # Single-argument parenthesised form prints identically under
            # Python 2 and is also valid Python 3.
            print('Processing ' + pdf)

            found, indices = _scan_lines_for_patnums(lines)
            found = _scan_lines_for_references(indices, found, lines)
            _write_output(indices, found, pdf)

        # NOTE(review): assumed to run once per folder, after all of its
        # PDFs have been processed -- confirm against the original layout.
        _tiff_to_jpg(folder)
def prepare_pdf_pages(patnum, doc):
    """
    Transform every pdf of the given patent number / document into a jpg
    and every jpg into an html file (via tesseract).

    Files that already exist are not created again: a pdf is skipped when
    its ".jpg" counterpart exists, a jpg is skipped when its ".html"
    counterpart exists. Afterwards every ".html" / ".hocr" file in the
    directory is renamed to carry the ".html" extension and passed to
    layoutdetection.write_layout_files.
    """
    doc_dir = config.data_dir + patnum + "/" + doc

    # Stage 1: pdf -> jpg
    for pdf_path in glob.glob(doc_dir + "/*.pdf"):
        if os.path.exists(pdf_path.replace(".pdf", ".jpg")):
            continue
        ocr.pdf2jpg(pdf_path)

    # Stage 2: jpg -> html
    for jpg_path in glob.glob(doc_dir + "/*.jpg"):
        if os.path.exists(jpg_path.replace(".jpg", ".html")):
            continue
        # Exclude the front page ('EP123456-.jpg')
        if jpg_path.endswith('-.jpg'):
            continue
        ocr.tesseract(jpg_path, clean_image=False, hough=False,
                      use_config=False, html=True)

    # Stage 3: normalise the extension to ".html" and write layout files
    ocr_outputs = glob.glob(doc_dir + "/*.html")
    ocr_outputs = ocr_outputs + glob.glob(doc_dir + "/*.hocr")
    for ocr_output in ocr_outputs:
        html_path = ocr_output.replace('.hocr', '.html')
        os.rename(ocr_output, html_path)
        layoutdetection.write_layout_files(html_path)