def scan(dir):
    """Main routine: process the downloaded search-report PDF files.

    ``dir`` (with a trailing slash appended) is first passed to
    ``_move_to_data_folder``. Then every directory matching
    ``config.data_dir + '*'`` whose name does not start with an underscore
    is searched for ``*.pdf`` files. For each PDF the text lines are read
    from a ``.rawtext`` file of the same base name in that folder; if that
    file cannot be read, the lines are obtained by running the PDF through
    ``ocr.pdf2jpg`` and ``ocr.tesseract`` instead. The lines are then
    scanned for patent numbers and references and the result is written
    out via ``_write_output``. Finally ``_tiff_to_jpg`` is called on the
    folder.

    :param dir: directory containing the downloaded PDF files
                (given without a trailing slash).
    """
    _move_to_data_folder(dir + "/")

    for folder in glob.glob(config.data_dir + '*'):
        # Only real directories are processed; names with a leading
        # underscore are skipped.
        if not os.path.isdir(folder):
            continue
        if os.path.basename(folder).startswith('_'):
            continue

        for pdf in glob.glob(folder + "/*.pdf"):
            # Swap only the extension; str.replace would also rewrite a
            # '.pdf' occurring in the middle of the file name.
            stem = os.path.splitext(os.path.basename(pdf))[0]
            rawtext_path = folder + '/' + stem + '.rawtext'
            try:
                with open(rawtext_path) as rawtext:
                    lines = rawtext.readlines()
            except (IOError, OSError):
                # No readable cached text: fall back to OCR of the PDF.
                lines = ocr.tesseract(ocr.pdf2jpg(pdf))

            print('Processing ' + pdf)
            found, indices = _scan_lines_for_patnums(lines)
            found = _scan_lines_for_references(indices, found, lines)
            _write_output(indices, found, pdf)

        _tiff_to_jpg(folder)
# Example #2
0
def prepare_pdf_pages(patnum, doc):
    """
    Transform every pdf for the given patent number and document
    to jpgs and the jpgs to html files (via tesseract).
    (If the jpg file already exists, it is not created.)
    (If the html file already exists, it is not created.)

    Afterwards every ``.hocr`` file in the folder is renamed to ``.html``
    and ``layoutdetection.write_layout_files`` is called for each html
    file found.

    :param patnum: name of the patent folder below ``config.data_dir``.
    :param doc: name of the document sub-folder inside the patent folder.
    """
    doc_dir = config.data_dir + patnum + "/" + doc

    for pdf in glob.glob(doc_dir + "/*.pdf"):
        # Swap only the extension: str.replace on the whole path would also
        # rewrite a '.pdf' that appears in a directory name.
        if not os.path.exists(os.path.splitext(pdf)[0] + ".jpg"):
            ocr.pdf2jpg(pdf)

    for img in glob.glob(doc_dir + "/*.jpg"):
        # Exclude the front page ('EP123456-.jpg')
        if img.endswith('-.jpg'):
            continue
        if not os.path.exists(os.path.splitext(img)[0] + ".html"):
            ocr.tesseract(img, clean_image=False, hough=False,
                          use_config=False, html=True)

    htmls = glob.glob(doc_dir + "/*.html") + glob.glob(doc_dir + "/*.hocr")

    for html in htmls:
        # Normalise '.hocr' output to '.html'; files that already carry the
        # target name need no rename.
        fn = os.path.splitext(html)[0] + '.html'
        if fn != html:
            os.rename(html, fn)
        layoutdetection.write_layout_files(fn)