def extract_meta(doc: Document,
                 pattern: str,
                 page: Optional[int] = None,
                 ign_case: bool = False) -> List[dict]:
    """Collect the meta of every match of `pattern` in a pdf document

    Arguments
      doc: document from pymupdf
      pattern: a regular expression pattern
      page: page number (1-based index); when None, every page of the
            document is searched, which is highly discouraged
      ign_case: compile the pattern with re.IGNORECASE?
    Returns
      the results of `search_in_page` for each selected page, joined into
      one list; an empty list when `page` is out of range
    """
    if page is None:
        targets = doc.pages()
    elif not (1 <= page <= doc.pageCount):
        # page out of range: nothing to search (the pattern is not even
        # compiled in this case)
        return []
    else:
        targets = [doc[page - 1]]

    flags = re.IGNORECASE if ign_case else 0
    matcher = re.compile(pattern, flags)

    # pages are searched one after another; this could be parallelized, but
    # callers are expected to specify a page number anyway
    return [hit for pg in targets for hit in search_in_page(matcher, pg)]
def extract_toc(doc: Document, recipe: Recipe) -> List[ToCEntry]:
    """Build the list of toc entries found in a document

    Arguments
      doc: a pdf document
      recipe: recipe from user
    Returns
      everything `recipe.extract_block` yields for each text block of each
      page, in page order then block order
    """
    entries = []
    for pg in doc.pages():
        page_dict = pg.getTextPage().extractDICT()
        # a page dict without a 'blocks' key contributes nothing
        for block in page_dict.get('blocks', []):
            # the recipe receives `pg.number + 1` as the page number
            # (presumably pymupdf's 0-based index shifted to 1-based)
            entries += recipe.extract_block(block, pg.number + 1)
    return entries