def extract_texkeys_from_pdf(pdf_file):
    """
    Extract the texkeys from the given PDF file

    This is done by looking up the named destinations in the PDF, keeping
    those whose name matches ``re_reference_in_dest`` and sorting them by
    their position in the document.

    Errors raised while parsing the PDF or while determining the layout are
    reported on stderr and yield an empty list. Errors raised by ``open``
    itself (e.g. a missing file) are not caught and propagate to the caller.

    @param pdf_file: path to a PDF

    @return: list of all texkeys found in the PDF
    """
    with open(pdf_file, 'rb') as pdf_stream:
        try:
            pdf = PdfFileReader(pdf_stream, strict=False)
            destinations = pdf.getNamedDestinations()
        except Exception as exc:
            print("* PDF: Internal PyPDF2 error, no TeXkeys returned.", exc, file=sys.stderr)
            return []

        # not all named destinations point to references
        # NOTE: ``items()`` is used instead of ``iteritems()`` because the
        # latter does not exist on Python 3 dictionaries.
        refs = [
            dest for dest in destinations.items()
            if re_reference_in_dest.match(dest[0])
        ]

        try:
            if _destinations_in_two_columns(pdf, refs):
                print("* PDF: Using two-column layout")

                def sortfunc(dest_couple):
                    # sort on the full position as returned by the helper
                    return _destination_position(pdf, dest_couple[1])

            else:
                print("* PDF: Using single-column layout")

                def sortfunc(dest_couple):
                    # drop the second component of the position so that the
                    # order is decided by page, then ypos, then xpos
                    (page, _, ypos, xpos) = _destination_position(
                        pdf, dest_couple[1])
                    return (page, ypos, xpos)

            refs.sort(key=sortfunc)

            # extract the TeXkey from the named destination name
            return [
                re_reference_in_dest.match(destname).group(1)
                for (destname, _) in refs
            ]
        except Exception as exc:
            print("* PDF: Impossible to determine layout, no TeXkeys returned", exc, file=sys.stderr)
            return []
def get_links_and_destinations(f):
    """
    Gather the link URIs and the named destinations of a PDF

    Every page is scanned for annotations; an annotation contributes a link
    when its ``/A`` entry carries a ``/URI`` value.

    @param f: PDF source handed unchanged to ``PdfFileReader``

    @return: tuple ``(links, dests)`` where ``links`` is the set of URIs found
        and ``dests`` is the result of ``getNamedDestinations()``
    """
    # Based on <https://stackoverflow.com/a/5978161/393146>
    reader = PdfFileReader(f)
    found_uris = set()
    for page_index in range(reader.getNumPages()):
        page_dict = reader.getPage(page_index).getObject()
        for annot_ref in page_dict.get('/Annots', []):
            action = annot_ref.getObject().get('/A', {})
            target = action.get('/URI', None)
            if target is None:
                # annotation without a URI action: nothing to record
                continue
            # a set ignores duplicates, so no membership test is needed
            found_uris.add(target)
    return (found_uris, reader.getNamedDestinations())
def extract_texkeys_and_urls_from_pdf(pdf_file):
    """
    Extract the texkeys and corresponding urls from the given PDF file

    The named destinations whose name matches ``re_reference_in_dest`` are
    sorted by position, as are the urls returned by ``extract_urls``. Each
    reference is then handed to ``_match_urls_with_reference`` together with
    the reference that follows it (when there is one) to pick its urls.

    Any exception raised while parsing the PDF or while building the result
    is logged at debug level and yields an empty list.

    @param pdf_file: path to a PDF

    @return: list of dictionaries, one per texkey, each with a ``"texkey"``
        entry and, only when urls were matched, a ``"urls"`` entry
    """
    with open(pdf_file, "rb") as pdf_stream:
        try:
            pdf = PdfFileReader(pdf_stream, strict=False)
            destinations = pdf.getNamedDestinations()
            urls = extract_urls(pdf)
        except Exception:
            LOGGER.debug(u"PDF: Internal PyPDF2 error, no TeXkeys returned.")
            return []

        # not all named destinations point to references
        refs = []
        for name, target in destinations.items():
            if re_reference_in_dest.match(name):
                refs.append((name, target))

        try:
            two_columns = _destinations_in_two_columns(pdf, refs)
            if two_columns:
                LOGGER.debug(u"PDF: Using two-column layout")

                def position_key(entry):
                    # order by the complete position tuple
                    return entry[1]

            else:
                LOGGER.debug(u"PDF: Using single-column layout")

                def position_key(entry):
                    # ignore the second position component when ordering
                    page, _, ypos, xpos = entry[1]
                    return (page, ypos, xpos)

            # replace each destination / uri object by its position, then sort
            refs = sorted(
                ((name, _destination_position(pdf, target))
                 for name, target in refs),
                key=position_key
            )
            urls = sorted(
                ((uri["/A"]["/URI"], _uri_position(pdf, uri)) for uri in urls),
                key=position_key
            )

            results = []
            last_index = len(refs) - 1
            for index, ref in enumerate(refs):
                entry = {
                    "texkey": re_reference_in_dest.match(ref[0]).group(1),
                }
                if index == last_index:
                    # the final reference has no successor to pass along
                    matched, urls = _match_urls_with_reference(urls, ref)
                else:
                    matched, urls = _match_urls_with_reference(
                        urls, ref, refs[index + 1])
                if matched:
                    entry["urls"] = matched
                results.append(entry)
            return results
        except Exception:
            LOGGER.debug(
                u"PDF: Impossible to determine layout, no TeXkeys returned")
            return []
def pdfmerge(basePath, outputPath, pdfRanges=None, rotate=0, progress=None, transform="base"):
    """
    Merge the pages of an annotation PDF with the pages of a base PDF

    The annotation PDF is read from ``outputPath`` and the merged document is
    written back to that same path.

    @param basePath: either a ``PdfFileReader`` instance, used as is, or a
        source handed to ``PdfFileReader(..., strict=False)``
    @param outputPath: location of the annotation PDF; overwritten with the
        merged result
    @param pdfRanges: ``None`` to pair the first ``min(base pages, annotation
        pages)`` pages one to one, otherwise an iterable of sized sequences
        of base page indices; the n-th index overall is paired with the n-th
        annotation page
    @param rotate: when truthy, passed to ``rotateCounterClockwise`` on every
        output page
    @param progress: forwarded to ``_progress`` together with the current and
        total step counts
    @param transform: ``"base"`` to place the transformed base page on a blank
        page of the annotation page's size and merge the annotation on top;
        ``"annot"`` to merge the transformed annotation page onto the base
        page object itself

    @raise ValueError: if ``transform`` is neither ``"base"`` nor ``"annot"``
        and there is at least one page to process
    """
    if isinstance(basePath, PdfFileReader):
        baseReader = basePath
    else:
        baseReader = PdfFileReader(basePath, strict=False)
    annotReader = PdfFileReader(outputPath, strict=False)

    if pdfRanges is None:
        pageNum = min(baseReader.getNumPages(), annotReader.getNumPages())
        pdfRanges = range(pageNum)
    else:
        pageNum = sum(len(r) for r in pdfRanges)
        pdfRanges = chain(*pdfRanges)

    writer = TolerantPdfWriter()
    # writer.cloneReaderDocumentRoot(baseReader)
    # PDF-Annotations and metadata can rely on OCG therefore we add them to the base
    # annot = baseReader.getNamedDestinations()  # TODO need to be transformed
    writer.addMetadata(baseReader.getDocumentInfo())

    _progress(progress, 0, pageNum + 1)
    for apage, page in enumerate(pdfRanges):
        bp = baseReader.getPage(page)
        ap = annotReader.getPage(apage)

        # width / height of the annotation page and of the base page
        s = ap.cropBox or ap.artBox
        aw, ah = s.upperRight[0] - s.upperLeft[0], s.upperLeft[1] - s.lowerLeft[1]
        s = bp.cropBox or bp.artBox
        bw, bh = s.upperRight[0] - s.upperLeft[0], s.upperLeft[1] - s.lowerLeft[1]

        if transform == "base":
            outPage = PageObject.createBlankPage(writer, aw, ah)
            args = _trafo(bw, bh, aw, ah)
            outPage.mergeRotatedScaledTranslatedPage(bp, *args)
            transformAnnot(outPage, *args)
            outPage.mergePage(ap)
        elif transform == "annot":
            # the base page object is modified in place
            outPage = bp
            args = _trafo(aw, ah, bw, bh)
            outPage.mergeRotatedScaledTranslatedPage(ap, *args)
        else:
            message = "Sorry, I can only transform 'base' (pdf) or 'annot'ations."
            log.error(message)
            # Without this raise, execution would continue and fail on an
            # unbound output page with an unrelated UnboundLocalError.
            raise ValueError(message)

        if rotate:
            outPage.rotateCounterClockwise(rotate)
        writer.addPage(outPage)
        # NOTE(review): this reports the base page index, not the number of
        # pages processed so far; the two differ when pdfRanges is given.
        # Confirm against _progress before changing.
        _progress(progress, page, pageNum + 1)

    for nd in baseReader.getNamedDestinations():
        # FIXME it would be nicer to use and transform the dest if applicable
        # writer.addNamedDestinationObject(baseReader.namedDestinations[nd])
        # For now the destinations are only printed, not copied to the output.
        print(nd)
        print(baseReader.namedDestinations[nd].page)
        # writer.addNamedDestination(nd, )

    with open(outputPath, 'wb') as out:
        writer.write(out)
    _progress(progress, pageNum + 1, pageNum + 1)
def pdf_list_anchors(fh, ofh):
    """
    Write the name of every named destination of a PDF, one per line

    @param fh: PDF source handed unchanged to ``PdfFileReader``
    @param ofh: output object whose ``write`` method is called once per name,
        with a trailing newline appended
    """
    for anchor in PdfFileReader(fh).getNamedDestinations():
        ofh.write(anchor + "\n")