コード例 #1
0
ファイル: pdf.py プロジェクト: vbalbp/refextract
def extract_texkeys_from_pdf(pdf_file):
    """
    Extract the texkeys from the given PDF file

    This is done by looking up the named destinations in the PDF and
    keeping those whose name matches ``re_reference_in_dest``. The kept
    destinations are sorted by the position reported by
    ``_destination_position`` before the texkey is taken from each name.

    Errors raised while parsing the PDF or while sorting the destinations
    are reported on stderr and yield an empty list. Errors raised by
    ``open`` itself are not caught here.

    @param pdf_file: path to a PDF

    @return: list of all texkeys found in the PDF
    """
    with open(pdf_file, 'rb') as pdf_stream:
        try:
            pdf = PdfFileReader(pdf_stream, strict=False)
            destinations = pdf.getNamedDestinations()
        except Exception as exc:
            print("* PDF: Internal PyPDF2 error, no TeXkeys returned.",
                  exc,
                  file=sys.stderr)
            return []
        # not all named destinations point to references
        # NOTE: ``items()`` instead of ``iteritems()`` -- the latter does not
        # exist on Python 3 dicts, ``items()`` works on both major versions.
        refs = [
            dest for dest in destinations.items()
            if re_reference_in_dest.match(dest[0])
        ]
        try:
            if _destinations_in_two_columns(pdf, refs):
                print("* PDF: Using two-column layout")

                def sortfunc(dest_couple):
                    # use the position tuple exactly as the helper returns it
                    return _destination_position(pdf, dest_couple[1])

            else:
                print("* PDF: Using single-column layout")

                def sortfunc(dest_couple):
                    # drop the second component of the position tuple and
                    # order by page, then y, then x
                    (page, _, ypos,
                     xpos) = _destination_position(pdf, dest_couple[1])
                    return (page, ypos, xpos)

            refs.sort(key=sortfunc)
            # extract the TeXkey from the named destination name
            return [
                re_reference_in_dest.match(destname).group(1)
                for (destname, _) in refs
            ]
        except Exception as exc:
            print("* PDF: Impossible to determine layout, no TeXkeys returned",
                  exc,
                  file=sys.stderr)
            return []
コード例 #2
0
def get_links_and_destinations(f):
    """Collect the link URIs and the named destinations of a PDF.

    Based on <https://stackoverflow.com/a/5978161/393146>

    ``f`` is handed to ``PdfFileReader``. Every page's ``/Annots`` entries
    are inspected and the ``/URI`` of each annotation's ``/A`` entry, when
    present, is gathered into a set.

    Returns a ``(links, dests)`` tuple: the set of URIs and the value of
    ``getNamedDestinations()``.
    """
    reader = PdfFileReader(f)

    uris = set()
    for page_index in range(reader.getNumPages()):
        page = reader.getPage(page_index).getObject()

        for annot in page.get('/Annots', []):
            action = annot.getObject().get('/A', {})
            target = action.get('/URI', None)
            # a set ignores duplicates, so no membership test is needed
            if target is not None:
                uris.add(target)

    return (uris, reader.getNamedDestinations())
コード例 #3
0
ファイル: pdf.py プロジェクト: inspirehep/refextract
def extract_texkeys_and_urls_from_pdf(pdf_file):
    """
    Extract the texkeys and corresponding urls from the given PDF file

    The named destinations of the PDF whose name matches
    ``re_reference_in_dest`` are treated as references. References and the
    URLs returned by ``extract_urls`` are both sorted by position, then
    ``_match_urls_with_reference`` assigns URLs to each reference in turn.

    If parsing the PDF or the layout/matching step raises, a debug message
    is logged and an empty list is returned.

    @param pdf_file: path to a PDF

    @return: list of dictionaries, each with a "texkey" entry and, when
        URLs were matched for it, a "urls" entry
    """
    with open(pdf_file, "rb") as pdf_stream:
        try:
            pdf = PdfFileReader(pdf_stream, strict=False)
            destinations = pdf.getNamedDestinations()
            urls = extract_urls(pdf)
        except Exception:
            LOGGER.debug(u"PDF: Internal PyPDF2 error, no TeXkeys returned.")
            return []

        # keep only the named destinations that look like references
        refs = [
            (name, target) for name, target in destinations.items()
            if re_reference_in_dest.match(name)
        ]
        try:
            if _destinations_in_two_columns(pdf, refs):
                LOGGER.debug(u"PDF: Using two-column layout")

                def position_key(entry):
                    # sort on the full position tuple
                    return entry[1]

            else:
                LOGGER.debug(u"PDF: Using single-column layout")

                def position_key(entry):
                    # ignore the second component of the position tuple
                    page, _, ypos, xpos = entry[1]
                    return (page, ypos, xpos)

            # replace each target by its position, then order by position
            refs = sorted(
                ((name, _destination_position(pdf, target))
                 for name, target in refs),
                key=position_key)
            urls = sorted(
                ((uri["/A"]["/URI"], _uri_position(pdf, uri))
                 for uri in urls),
                key=position_key)

            texkey_url_list = []
            for index, ref in enumerate(refs):
                entry = {
                    "texkey": re_reference_in_dest.match(ref[0]).group(1)
                }
                # one-element slice holding the next reference, or empty for
                # the last one, so the helper is then called without it
                successor = refs[index + 1:index + 2]
                matched, urls = _match_urls_with_reference(
                    urls, ref, *successor)
                if matched:
                    entry["urls"] = matched
                texkey_url_list.append(entry)
            return texkey_url_list
        except Exception:
            LOGGER.debug(
                u"PDF: Impossible to determine layout, no TeXkeys returned")
            return []
コード例 #4
0
def pdfmerge(basePath,
             outputPath,
             pdfRanges=None,
             rotate=0,
             progress=None,
             transform="base"):
    """Merge the pages of the PDF at ``outputPath`` with pages of a base PDF.

    The document at ``outputPath`` is read as the annotation layer and the
    merged result is written back to the same path.

    :param basePath: a ``PdfFileReader`` instance, or a value that is passed
        to ``PdfFileReader`` to open the base document.
    :param outputPath: passed to ``PdfFileReader`` to read the annotation
        pages, then opened with ``open(..., 'wb')`` for the result.
    :param pdfRanges: optional iterable of sized iterables of base page
        indices; they are chained in order and paired with annotation pages
        0, 1, 2, ... When ``None``, pages ``0 .. min(page counts) - 1`` of
        both documents are paired.
    :param rotate: when truthy, passed to ``rotateCounterClockwise`` on every
        merged page.
    :param progress: forwarded unchanged to ``_progress``.
    :param transform: ``"base"`` places the transformed base page on a blank
        page sized like the annotation page and merges the annotation page on
        top; ``"annot"`` merges the transformed annotation page onto the base
        page.
    :raises ValueError: if ``transform`` is neither ``"base"`` nor
        ``"annot"`` (raised when the first page is processed).
    """
    if isinstance(basePath, PdfFileReader):
        baseReader = basePath
    else:
        baseReader = PdfFileReader(basePath, strict=False)
    annotReader = PdfFileReader(outputPath, strict=False)
    if pdfRanges is None:
        pageNum = min(baseReader.getNumPages(), annotReader.getNumPages())
        pdfRanges = range(pageNum)
    else:
        pageNum = sum(len(r) for r in pdfRanges)
        pdfRanges = chain(*pdfRanges)
    writer = TolerantPdfWriter()
    # writer.cloneReaderDocumentRoot(baseReader)
    # PDF-Annotations and metadata can rely on OCG therefore we add them to the base
    # annot = baseReader.getNamedDestinations()
    # TODO need to be transformed
    writer.addMetadata(baseReader.getDocumentInfo())
    _progress(progress, 0, pageNum + 1)
    for apage, page in enumerate(pdfRanges):
        bp = baseReader.getPage(page)
        ap = annotReader.getPage(apage)

        # width/height of each page, preferring the crop box over the art box
        s = ap.cropBox or ap.artBox
        aw, ah = s.upperRight[0] - s.upperLeft[0], s.upperLeft[
            1] - s.lowerLeft[1]
        s = bp.cropBox or bp.artBox
        bw, bh = s.upperRight[0] - s.upperLeft[0], s.upperLeft[
            1] - s.lowerLeft[1]

        if transform == "base":
            merged = PageObject.createBlankPage(writer, aw, ah)
            args = _trafo(bw, bh, aw, ah)
            merged.mergeRotatedScaledTranslatedPage(bp, *args)
            transformAnnot(merged, *args)
            merged.mergePage(ap)
        elif transform == "annot":
            merged = bp
            args = _trafo(aw, ah, bw, bh)
            merged.mergeRotatedScaledTranslatedPage(ap, *args)
        else:
            log.error(
                "Sorry, I can only transform 'base' (pdf) or 'annot'ations.")
            # Previously execution fell through with no page to add, which
            # either failed with an unbound-variable error or silently
            # re-added the page from the previous iteration.
            raise ValueError(
                "unsupported transform %r; expected 'base' or 'annot'" %
                (transform, ))

        if rotate:
            merged.rotateCounterClockwise(rotate)

        writer.addPage(merged)
        # Report the running index, not the base page index: with explicit
        # pdfRanges the base index is unrelated to the pageNum + 1 total.
        # NOTE(review): assumes _progress expects a step counter -- confirm.
        _progress(progress, apage, pageNum + 1)
    for nd in baseReader.getNamedDestinations():
        # FIXME it would be nicer to use and transform the dest if applicable
        # writer.addNamedDestinationObject(baseReader.namedDestinations[nd])
        print(nd)
        print(baseReader.namedDestinations[nd].page)
        # writer.addNamedDestination(nd, )

    with open(outputPath, 'wb') as out:
        writer.write(out)

    _progress(progress, pageNum + 1, pageNum + 1)
コード例 #5
0
def pdf_list_anchors(fh, ofh):
    """Write the name of every named destination in a PDF, one per line.

    ``fh`` is handed to ``PdfFileReader``; each destination name followed by
    a newline is written to ``ofh`` with ``ofh.write``.
    """
    anchors = PdfFileReader(fh).getNamedDestinations()
    for anchor in anchors:
        ofh.write(anchor + "\n")