def parse(session: Session, docs_path: str, pdf_path: str) -> List[Document]:
    """Run a single-process ``Parser`` over the docs and return its documents.

    Args:
        session: Session object handed to the ``Parser`` constructor.
        docs_path: Path given to ``HTMLDocPreprocessor``.
        pdf_path: Path given to ``PdfVisualParser``.

    Returns:
        The value of ``Parser.get_documents()`` once the preprocessor has
        been applied.
    """
    # Source of the documents that will be fed to the parser.
    html_preprocessor = HTMLDocPreprocessor(docs_path)

    # Parser configuration: one worker, structural + lingual parsing, and a
    # PDF-backed visual parser.
    parser_options = {
        "parallelism": 1,
        "structural": True,
        "lingual": True,
        "visual_parser": PdfVisualParser(pdf_path),
    }
    parser = Parser(session, **parser_options)

    # clear() is invoked before apply() so the run starts from the parser's
    # cleared state.
    parser.clear()
    parser.apply(html_preprocessor)

    parsed_documents = parser.get_documents()
    return parsed_documents
def parse(docs_path, pdf_path):
    """Run a single-process ``Parser`` over the docs and return the parser.

    Args:
        docs_path: Path given to ``HTMLDocPreprocessor``.
        pdf_path: Path forwarded to ``Parser`` as its ``pdf_path`` keyword.

    Returns:
        The ``Parser`` instance itself (not its documents), after ``clear()``
        and ``apply()`` have been called on it.
    """
    # Source of the documents that will be fed to the parser.
    html_preprocessor = HTMLDocPreprocessor(docs_path)

    # Parser configuration: one worker, structural + lingual + visual parsing.
    parser_options = {
        "parallelism": 1,
        "structural": True,
        "lingual": True,
        "visual": True,
        "pdf_path": pdf_path,
    }
    # NOTE(review): `session` is not a parameter of this function; it is
    # resolved from an enclosing/module scope not visible here — confirm it
    # is defined wherever this function is used.
    parser = Parser(session, **parser_options)

    # clear() is invoked before apply() so the run starts from the parser's
    # cleared state.
    parser.clear()
    parser.apply(html_preprocessor)
    return parser