Esempio n. 1
0
def process_document(state, processing_date, fp, out_file):
    """Extract the text of a PDF and hand it to the first matching parser.

    The extracted text is cached in ``out_file`` as UTF-8. If that file
    already exists, its contents are used and ``fp`` is not read at all.

    Args:
        state: Passed through to each parser's ``can_parse``.
        processing_date: Passed through to each parser's ``can_parse``.
        fp: Source handed to ``PDFParser``; only used when ``out_file``
            does not exist yet.
        out_file: Path of the text cache to read from or create.

    Returns:
        Whatever ``parse_document`` of the first parser in ``ALL_PARSERS``
        whose ``can_parse(state, processing_date)`` is truthy returns, or
        ``None`` when no parser matches.

    Raises:
        PDFTextExtractionNotAllowed: If the document reports that it is
            not extractable.
    """
    if path.exists(out_file):
        # Binary read + explicit decode keeps the bytes exactly as written
        # (no newline translation, no locale-dependent encoding).
        with open(out_file, "rb") as cached:
            contents = cached.read().decode("utf-8")
    else:
        pdf_parser = PDFParser(fp)
        doc = PDFDocument(pdf_parser)
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed
        mgr = PDFResourceManager()
        device = PDFPageAggregator(mgr)
        interpreter = PDFPageInterpreter(mgr, device)

        # Collect fragments and join once instead of repeated ``+=``.
        fragments = []
        for page in PDFPage.create_pages(doc):
            interpreter.process_page(page)
            layout = device.get_result()
            # Only LTChar items directly in the page layout are kept;
            # anything nested inside other layout containers is skipped.
            fragments.extend(
                element.get_text()
                for element in layout
                if isinstance(element, LTChar)
            )
        contents = "".join(fragments)

        # Written only after extraction succeeded for every page.
        with open(out_file, "wb") as cache:
            cache.write(contents.encode("utf-8"))

    # First parser that accepts this state/date wins; later ones are not
    # consulted.
    doc_parser = next(
        (p for p in ALL_PARSERS if p.can_parse(state, processing_date)),
        None,
    )
    if doc_parser is None:
        return None

    return doc_parser.parse_document(contents)