Esempio n. 1
0
def parse(session: Session, docs_path: str, pdf_path: str) -> List[Document]:
    """Run the HTML docs through a ``Parser`` and return the parsed documents.

    Args:
        session: Session handed to the ``Parser`` as its first argument.
        docs_path: Location passed to ``HTMLDocPreprocessor``.
        pdf_path: Location passed to ``PdfVisualParser``.

    Returns:
        Whatever ``Parser.get_documents()`` yields once the preprocessor
        has been applied.
    """
    # Source of the documents to parse.
    html_docs = HTMLDocPreprocessor(docs_path)

    # Visual information is supplied by a PDF-backed visual parser.
    pdf_visuals = PdfVisualParser(pdf_path)

    # Single-worker parser with structural and lingual parsing switched on.
    parser = Parser(
        session,
        parallelism=1,
        structural=True,
        lingual=True,
        visual_parser=pdf_visuals,
    )

    # Clear the parser first, then parse the documents.
    parser.clear()
    parser.apply(html_docs)

    return parser.get_documents()
Esempio n. 2
0
    def parse(docs_path, pdf_path):
        """Parse the HTML docs and hand back the ``Parser`` that processed them.

        ``session`` is not a parameter; it is resolved from the surrounding
        scope.

        Args:
            docs_path: Location passed to ``HTMLDocPreprocessor``.
            pdf_path: Forwarded to ``Parser`` as its ``pdf_path`` keyword.

        Returns:
            The ``Parser`` instance itself (not the parsed documents), after
            ``clear()`` and ``apply()`` have been called on it.
        """
        # Source of the documents to parse.
        html_docs = HTMLDocPreprocessor(docs_path)

        # Single-worker parser with structural, lingual and visual parsing
        # switched on.
        parser_options = {
            "parallelism": 1,
            "structural": True,
            "lingual": True,
            "visual": True,
            "pdf_path": pdf_path,
        }
        parser = Parser(session, **parser_options)

        # Clear the parser first, then parse the documents.
        parser.clear()
        parser.apply(html_docs)

        return parser