Ejemplo n.º 1
0
def convert_pdf_to_txt(path_to_file):
    """Extract the plain text and the outline titles of a PDF file.

    Args:
        path_to_file: Path of the PDF to read.

    Returns:
        A ``(text, ToC_list)`` tuple. ``text`` is the text collected by the
        ``TextConverter`` and ``ToC_list`` holds the title (second field) of
        every outline entry. Any exception is logged and whatever was
        collected up to that point is returned, ``("", [])`` at worst.
    """
    # Defaults keep the final ``return`` valid when extraction fails early;
    # without them the error path ended in an UnboundLocalError.
    text = ""
    ToC_list = []
    try:
        rsrcmgr = PDFResourceManager()
        retstr = io.StringIO()
        device = TextConverter(rsrcmgr,
                               retstr,
                               codec='utf-8',
                               laparams=LAParams())
        try:
            # ``with`` closes the file even when parsing raises.
            with open(path_to_file, 'rb') as fp:
                doc = PDFDocument(PDFParser(fp))
                for entry in doc.get_outlines():
                    ToC_list.append(entry[1])
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                for page in PDFPage.get_pages(fp,
                                              pagenos=set(),
                                              maxpages=0,
                                              password="",
                                              caching=True,
                                              check_extractable=True):
                    interpreter.process_page(page)
            text = retstr.getvalue()
        finally:
            device.close()
            retstr.close()
        print(text)
    except Exception:
        logging.error("Exception occurred", exc_info=True)
    return text, ToC_list
Ejemplo n.º 2
0
def get_outphs(path):
    """Return the outline (table of contents) of a PDF with page numbers.

    Args:
        path: Path of the PDF file.

    Returns:
        A list of dicts with the keys ``"level"``, ``"raw_title"`` and
        ``"pageno"``. ``"pageno"`` is 1-based, or ``None`` when the entry
        has neither a destination nor a GoTo action with a destination.

    Raises:
        Whatever ``document.get_outlines()`` raises for a document without
        an outline; that error is not handled here.
    """
    from pdfminer.pdfparser import PDFParser
    from pdfminer.pdfdocument import PDFDocument
    from pdfminer.pdfpage import PDFPage
    from pdfminer.psparser import PSLiteral
    from pdfminer.pdftypes import resolve1

    with open(path, "rb") as fp:
        document = PDFDocument(PDFParser(fp))
        # Numbered from 1 so the value can be reported as-is; the previous
        # ``pageno + 1`` crashed with TypeError when no page was found.
        pages = {
            page.pageid: pageno
            for (pageno, page) in enumerate(PDFPage.create_pages(document), 1)
        }

        def resolve_dest(dest):
            # Named destinations are looked up in the document first.
            if isinstance(dest, (str, bytes)):
                dest = resolve1(document.get_dest(dest))
            elif isinstance(dest, PSLiteral):
                dest = resolve1(document.get_dest(dest.name))
            if isinstance(dest, dict):
                dest = dest['D']
            # Dereference an indirect object so both callers can index it.
            return resolve1(dest)

        toc = []
        for (level, title, dest, a, structelem) in document.get_outlines():
            pageno = None
            if dest:
                dest = resolve_dest(dest)
                pageno = pages[dest[0].objid]
            elif isinstance(a, dict):
                subtype = a.get('S')
                # The literal's repr is compared in both spellings that
                # appear in pdfminer-based code: /GoTo and /'GoTo'.
                is_goto = repr(subtype) in ('/GoTo', "/'GoTo'")
                if subtype and is_goto and a.get('D'):
                    dest = resolve_dest(a['D'])
                    pageno = pages[dest[0].objid]
            toc.append({"level": level, "raw_title": title, "pageno": pageno})
    return toc
Ejemplo n.º 3
0
def table_of_contents_example():
    """Print the level and title of every outline entry of a sample PDF.

    Nothing is raised to the caller: a missing file, a document without an
    outline, a document that forbids text extraction and any other error
    are all reported on standard output instead.
    """
    pdf_filepath = '/path/to/sample.pdf'

    try:
        # The context manager closes the file on every exit path.
        with open(pdf_filepath, 'rb') as pdf_file:
            # The parser reads from the file; the document object stores
            # the document structure (opened with an empty password).
            document = PDFDocument(PDFParser(pdf_file), password=b'')

            # Abort when the document does not allow text extraction.
            if not document.is_extractable:
                raise PDFTextExtractionNotAllowed

            try:
                for entry in document.get_outlines():
                    level, title, dest, action, se = entry
                    print(level, title)
            except PDFNoOutlines as ex:
                print('No outline in {}: {}.'.format(pdf_filepath, ex))
    except FileNotFoundError as ex:
        print('File not found, {}: {}.'.format(pdf_filepath, ex))
    except Exception as ex:
        print('Unknown exception raised in {}: {}.'.format(pdf_filepath, ex))
Ejemplo n.º 4
0
def purge_index(data, file):
    """Remove every outline title of a PDF from a piece of text.

    Args:
        data: Text to clean. Its newlines are replaced by single spaces
            before the titles are removed.
        file: Path of the PDF whose outline titles are removed.

    Returns:
        ``data`` joined into one line, with every occurrence of each
        stripped outline title deleted (case-insensitive, literal match).
    """
    with open(file, 'rb') as fp:
        document = PDFDocument(PDFParser(fp))
        # Materialise the titles while the file is still open.
        titles = [
            title.strip()
            for (level, title, dest, a, se) in document.get_outlines()
        ]

    bc_text = ' '.join(data.split('\n'))

    for title in titles:
        if not title:
            # An empty pattern would match everywhere and remove nothing.
            continue
        # Titles are plain text, not patterns: escape them so characters
        # such as "(", "+" or "." are matched literally instead of raising
        # re.error or matching unrelated text.
        bc_text = re.sub(re.escape(title), '', bc_text, flags=re.IGNORECASE)

    return bc_text
Ejemplo n.º 5
0
def convert_pdf_to_txt(path_to_file):
    """Return the text of a PDF and its outline titles without leading digits.

    The text comes from ``parser.fromfile`` (``parser`` is an object from
    the enclosing module; presumably Tika's parser module -- confirm). The
    outline titles are read with pdfminer.

    Args:
        path_to_file: Path of the PDF to read.

    Returns:
        A ``(text, Filtered_ToC_list)`` tuple. ``text`` is the ``"content"``
        entry of the parse result and ``Filtered_ToC_list`` holds each
        outline title with a leading run of digits removed and surrounding
        whitespace stripped. Any exception is logged and the values
        collected so far are returned, ``("", [])`` at worst.
    """
    # Defaults keep the final ``return`` valid on the error path.
    text = ""
    Filtered_ToC_list = []
    try:
        parsed_txt = parser.fromfile(path_to_file)
        text = parsed_txt["content"]
        with open(path_to_file, 'rb') as fp:
            # Deliberately not named ``parser``: binding that name locally
            # made the ``parser.fromfile`` call above fail with
            # UnboundLocalError.
            pdf_parser = PDFParser(fp)
            doc = PDFDocument(pdf_parser)
            ToC_list = [entry[1] for entry in doc.get_outlines()]
        Filtered_ToC_list = [
            re.sub(r"^[0-9]+", "", title).strip() for title in ToC_list
        ]
    except Exception:
        logging.error("Exception occurred", exc_info=True)
    return text, Filtered_ToC_list
Ejemplo n.º 6
0
    def parse_paragraphs(self, text):
        """Collect the headlines of ``text`` and pass them to ``count_paragraphs``.

        For a PDF (``self.is_pdf`` is true) the headlines are the titles of
        the level-1 outline entries of ``self.paper_filename``. Otherwise
        they are the lines of ``text`` that start with ``'## '``.
        ``self.count_paragraphs(text, lines, headlines)`` is called only
        when at least one headline was found.

        Args:
            text: Document text; it is split on ``'\\n'`` into lines.
        """
        lines = text.split('\n')

        if not self.is_pdf:
            # Markdown input: second-level headings mark the paragraphs.
            headlines = [row for row in lines if row.startswith('## ')]
        else:
            headlines = []
            with open(self.paper_filename, 'rb') as pdf:
                document = PDFDocument(PDFParser(pdf))
                try:
                    for depth, heading, _dest, _action, _se in \
                            document.get_outlines():
                        if depth == 1:
                            headlines.append(heading)
                except PDFNoOutlines:
                    logging.info(
                        "No outline found -> skipping paragraph search..."
                    )

        if headlines:
            self.count_paragraphs(text, lines, headlines)
Ejemplo n.º 7
0
def main():
    """Process every page of a hard-coded PDF, then print its outline.

    Prints the resource manager, the return value of ``process_page`` for
    each page, and one ``(level, title)`` tuple per outline entry.

    Returns:
        ``0`` when the run completes.

    Raises:
        PDFTextExtractionNotAllowed: If the document is not extractable.
    """
    pdf_path = '/home/chris/Documents/Literature/DFT Primer.pdf'
    with open(pdf_path, 'rb') as fp:
        # The parser reads from the file; the document stores the structure.
        # No password is passed to the document here.
        document = PDFDocument(PDFParser(fp))
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed

        # The resource manager stores shared resources for the interpreter.
        rsrcmgr = PDFResourceManager()
        print(rsrcmgr)
        interpreter = PDFPageInterpreter(rsrcmgr, PDFDevice(rsrcmgr))

        for page in PDFPage.create_pages(document):
            print(interpreter.process_page(page))

        for entry in document.get_outlines():
            level, title, dest, a, se = entry
            # A single tuple argument keeps the "(level, title)" output of
            # the original print statement.
            print((level, title))
    return 0
Ejemplo n.º 8
0
def dumpoutline(
    outfp,
    fname,
    objids,
    pagenos,
    password="",
    dumpall=False,
    codec=None,
    extractdir=None,
):
    """Write the outline of a PDF to ``outfp`` as XML.

    For each outline entry an ``<outline>`` element with its level and
    escaped title is written, followed by ``<dest>`` (when a destination is
    known) and ``<pageno>`` (when the destination maps to a page, counted
    from 1). A document without an outline produces no output.

    Args:
        outfp: Writable text stream that receives the XML.
        fname: Path of the PDF file.
        objids, pagenos, dumpall, codec, extractdir: Accepted for signature
            compatibility; not used by this function.
        password: Password passed to ``PDFDocument``.
    """
    # ``with`` closes the file even when parsing or writing raises.
    with open(fname, "rb") as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser, password)
        pages = {
            page.pageid: pageno
            for (pageno, page) in enumerate(PDFPage.create_pages(doc), 1)
        }

        def resolve_dest(dest):
            # Named destinations are looked up in the document first.
            if isinstance(dest, str):
                dest = resolve1(doc.get_dest(dest))
            elif isinstance(dest, PSLiteral):
                dest = resolve1(doc.get_dest(dest.name))
            if isinstance(dest, dict):
                dest = dest["D"]
            if isinstance(dest, PDFObjRef):
                dest = dest.resolve()
            return dest

        try:
            outlines = doc.get_outlines()
            outfp.write("<outlines>\n")
            for (level, title, dest, a, se) in outlines:
                pageno = None
                if dest:
                    dest = resolve_dest(dest)
                    pageno = pages[dest[0].objid]
                elif a:
                    action = a
                    if isinstance(action, dict):
                        subtype = action.get("S")
                        if subtype and repr(subtype) == "/'GoTo'" and action.get("D"):
                            dest = resolve_dest(action["D"])
                            pageno = pages[dest[0].objid]
                # ``e`` is the escaping helper defined elsewhere in the file.
                s = e(title)
                if not isinstance(s, str):
                    # Only a non-native (Python 2 unicode) string is encoded.
                    # Encoding a Python 3 str produced bytes, which were then
                    # written as the literal text "b'...'".
                    s = s.encode("utf-8", "xmlcharrefreplace")
                outfp.write('<outline level="{!r}" title="{}">\n'.format(level, s))
                if dest is not None:
                    outfp.write("<dest>")
                    dumpxml(outfp, dest)
                    outfp.write("</dest>\n")
                if pageno is not None:
                    outfp.write("<pageno>%r</pageno>\n" % pageno)
                outfp.write("</outline>\n")
            outfp.write("</outlines>\n")
        except PDFNoOutlines:
            pass
        parser.close()
    return
Ejemplo n.º 9
0
def dumpoutline(outfp: TextIO,
                fname: str,
                objids: Any,
                pagenos: Container[int],
                password: str = '',
                dumpall: bool = False,
                codec: Optional[str] = None,
                extractdir: Optional[str] = None) -> None:
    """Write the outline of a PDF to ``outfp`` as XML.

    For each outline entry an ``<outline>`` element with its level and
    escaped title is written, followed by ``<dest>`` (when a destination is
    known) and ``<pageno>`` (when the destination maps to a page, counted
    from 1). A document without an outline produces no output.

    Args:
        outfp: Writable text stream that receives the XML.
        fname: Path of the PDF file.
        objids, pagenos, dumpall, codec, extractdir: Accepted for signature
            compatibility; not used by this function.
        password: Password passed to ``PDFDocument``.
    """
    # ``with`` closes the file even when parsing or writing raises; the
    # previous explicit close at the end was skipped on any error.
    with open(fname, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser, password)
        pages = {
            page.pageid: pageno
            for (pageno, page) in enumerate(PDFPage.create_pages(doc), 1)
        }

        def resolve_dest(dest: object) -> Any:
            # Named destinations are looked up in the document first.
            if isinstance(dest, (str, bytes)):
                dest = resolve1(doc.get_dest(dest))
            elif isinstance(dest, PSLiteral):
                dest = resolve1(doc.get_dest(dest.name))
            if isinstance(dest, dict):
                dest = dest['D']
            if isinstance(dest, PDFObjRef):
                dest = dest.resolve()
            return dest

        try:
            outlines = doc.get_outlines()
            outfp.write('<outlines>\n')
            for (level, title, dest, a, se) in outlines:
                pageno = None
                if dest:
                    dest = resolve_dest(dest)
                    pageno = pages[dest[0].objid]
                elif a:
                    action = a
                    if isinstance(action, dict):
                        subtype = action.get('S')
                        if subtype and repr(subtype) == '/\'GoTo\'' and action.get(
                                'D'):
                            dest = resolve_dest(action['D'])
                            pageno = pages[dest[0].objid]
                s = escape(title)
                outfp.write('<outline level="{!r}" title="{}">\n'.format(level, s))
                if dest is not None:
                    outfp.write('<dest>')
                    dumpxml(outfp, dest)
                    outfp.write('</dest>\n')
                if pageno is not None:
                    outfp.write('<pageno>%r</pageno>\n' % pageno)
                outfp.write('</outline>\n')
            outfp.write('</outlines>\n')
        except PDFNoOutlines:
            pass
        parser.close()
    return
def extract_outline(pdf_path):
    """Return the outline of a PDF as a list of ``(level, title)`` tuples.

    Args:
        pdf_path: Path of the PDF file.

    Returns:
        One ``(level, title)`` tuple per outline entry, in document order.
        Errors raised by ``get_outlines()`` are not handled here.
    """
    # The list is built inside the ``with`` block so the outline is fully
    # read before the file is closed (it used to be left open).
    with open(pdf_path, 'rb') as fp:
        document = PDFDocument(PDFParser(fp))
        return [
            (level, title)
            for (level, title, dest, a, se) in document.get_outlines()
        ]
Ejemplo n.º 11
0
def dumpoutline(outfp,
                fname,
                objids,
                pagenos,
                password='',
                dumpall=False,
                codec=None,
                extractdir=None):
    """Write the outline of a PDF to ``outfp`` as XML.

    For each outline entry an ``<outline>`` element with its level and
    escaped title is written, followed by ``<dest>`` (when a destination is
    known) and ``<pageno>`` (when the destination maps to a page, counted
    from 1). A document without an outline produces no output.

    Args:
        outfp: Writable text stream that receives the XML.
        fname: Path of the PDF file.
        objids, pagenos, dumpall, codec, extractdir: Accepted for signature
            compatibility; not used by this function.
        password: Password passed to ``PDFDocument``.
    """
    # ``with`` closes the file even when parsing or writing raises.
    with open(fname, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser, password)
        pages = dict(
            (page.pageid, pageno)
            for (pageno, page) in enumerate(PDFPage.create_pages(doc), 1))

        def resolve_dest(dest):
            # Named destinations are looked up in the document first.
            if isinstance(dest, str):
                dest = resolve1(doc.get_dest(dest))
            elif isinstance(dest, PSLiteral):
                dest = resolve1(doc.get_dest(dest.name))
            if isinstance(dest, dict):
                dest = dest['D']
            if isinstance(dest, PDFObjRef):
                dest = dest.resolve()
            return dest

        try:
            outlines = doc.get_outlines()
            outfp.write('<outlines>\n')
            for (level, title, dest, a, se) in outlines:
                pageno = None
                if dest:
                    dest = resolve_dest(dest)
                    pageno = pages[dest[0].objid]
                elif a:
                    action = a
                    if isinstance(action, dict):
                        subtype = action.get('S')
                        if subtype and repr(subtype) == '/\'GoTo\'' and action.get(
                                'D'):
                            dest = resolve_dest(action['D'])
                            pageno = pages[dest[0].objid]
                # ``e`` is the escaping helper defined elsewhere in the file.
                s = e(title)
                if not isinstance(s, str):
                    # Only a non-native (Python 2 unicode) string is encoded.
                    # Encoding a Python 3 str produced bytes, which were then
                    # written as the literal text "b'...'".
                    s = s.encode('utf-8', 'xmlcharrefreplace')
                outfp.write('<outline level="%r" title="%s">\n' % (level, s))
                if dest is not None:
                    outfp.write('<dest>')
                    dumpxml(outfp, dest)
                    outfp.write('</dest>\n')
                if pageno is not None:
                    outfp.write('<pageno>%r</pageno>\n' % pageno)
                outfp.write('</outline>\n')
            outfp.write('</outlines>\n')
        except PDFNoOutlines:
            pass
        parser.close()
    return
Ejemplo n.º 12
0
def read_outline(f):
    """Print the level and title of every outline entry of an open PDF.

    Args:
        f: File object of the PDF, handed to ``PDFParser``.

    When the document catalog has no ``'Outlines'`` entry, a message saying
    so is printed instead.
    """
    document = PDFDocument(PDFParser(f), None)

    # Check whether the document has an outline before asking for it.
    if document.catalog.get('Outlines') is None:
        print(f'PDF文書にアウトラインはありません')
        return

    for entry in document.get_outlines():
        level, title, dest, a, se = entry
        print(f'階層: {level}, タイトル: {title}')
Ejemplo n.º 13
0
def get_toc(pdf_path):
    """Return the table of contents of a PDF.

    Args:
        pdf_path: Path of the PDF file.

    Returns:
        A list with one ``(level, title)`` tuple per outline entry. Errors
        raised by ``get_outlines()`` are not handled here.
    """
    # Built inside the ``with`` block so the outline is read completely
    # before the file is closed (it used to be left open).
    with open(pdf_path, "rb") as infile:
        document = PDFDocument(PDFParser(infile))
        return [
            (level, title)
            for (level, title, dest, a, structelem) in document.get_outlines()
        ]
Ejemplo n.º 14
0
def get_toc(pdf_path):
    """Return the table of contents of a PDF as one flat list.

    Args:
        pdf_path: Path of the PDF file.

    Returns:
        A flat list ``[level, title, level, title, ...]`` with two items per
        outline entry. Errors raised by ``get_outlines()`` are not handled
        here.
    """
    toc = []
    # ``with`` closes the file, which used to be left open.
    with open(pdf_path, 'rb') as infile:
        document = PDFDocument(PDFParser(infile))
        for (level, title, dest, a, structelem) in document.get_outlines():
            # Level and title are separate consecutive items, not a pair.
            toc.extend((level, title))
    return toc
Ejemplo n.º 15
0
def parse(filename, maxlevel):
    """Print the outline of a PDF as HTML heading tags.

    Every outline entry whose level is at most ``maxlevel`` is printed as
    ``<hN>title</hN>``. Newlines are removed from the title and runs of
    whitespace are collapsed to single spaces. The title is not
    HTML-escaped.

    Args:
        filename: Path of the PDF file.
        maxlevel: Deepest outline level to print.
    """
    # ``with`` closes the file, which used to be left open.
    with open(filename, "rb") as fp:
        doc = PDFDocument(PDFParser(fp))
        for (level, title, dest, a, se) in doc.get_outlines():
            if level <= maxlevel:
                # Work on the text itself: encoding to bytes first made
                # ``replace("\n", "")`` raise TypeError on Python 3.
                title = " ".join(title.replace("\n", "").split())
                print("<h{level}>{title}</h{level}>".format(level=level, title=title))
Ejemplo n.º 16
0
def read_outlines(fp):
    """Print the level and title of every outline entry of an open PDF.

    Args:
        fp: File object of the PDF, handed to ``PDFParser``.

    Any exception raised while reading the outline is reported with the
    fixed "no outline" message. The message is printed for every error,
    not only for a missing outline.
    """
    parser = PDFParser(fp)
    document = PDFDocument(parser)
    # Get the document's table of contents (outline).
    try:
        for (level, title, dest, a, se) in document.get_outlines():
            print(level, title)
    except Exception:
        # Narrower than a bare ``except:`` so KeyboardInterrupt and
        # SystemExit are no longer swallowed.
        print("文档不存在大纲!")
Ejemplo n.º 17
0
 def get_toc(self, pdf_path):
     """Return the table of contents of a PDF, or ``False`` on failure.

     Args:
         pdf_path: Path of the PDF file.

     Returns:
         A list of ``(level, title)`` tuples, or ``False`` when reading the
         outline raises any exception. Errors from opening or parsing the
         file itself still propagate.
     """
     # ``with`` closes the file on both return paths; it used to be left open.
     with open(pdf_path, 'rb') as infile:
         document = PDFDocument(PDFParser(infile))
         try:
             return [
                 (level, title)
                 for (level, title, dest, a, structelem)
                 in document.get_outlines()
             ]
         except Exception:
             return False
Ejemplo n.º 18
0
def get_pdf_file_structure(path_to_pdf):
    """Return the titles of all outline entries of a PDF.

    Args:
        path_to_pdf: Path of the PDF file.

    Returns:
        A list of outline titles in document order. Errors raised by
        ``get_outlines()`` are not handled here.
    """
    # ``with`` closes the file even when reading the outline raises; the
    # explicit close at the end used to be skipped in that case.
    with open(path_to_pdf, 'rb') as fp:
        document = PDFDocument(PDFParser(fp), password="")
        return [
            title for (level, title, dest, a, se) in document.get_outlines()
        ]
Ejemplo n.º 19
0
def get_toc(pdf_path):
    """Print the outline titles of a PDF and report whether any exist.

    Each title is stripped, passed through ``remove_non_ascii`` (a helper
    defined elsewhere) and printed.

    Args:
        pdf_path: Path of the PDF file.

    Returns:
        ``'-'`` when at least one outline entry was printed, otherwise an
        empty list (also when the document has no outline).
    """
    toc = []
    # ``with`` closes the file, which used to be left open.
    with open(pdf_path, 'rb') as infile:
        document = PDFDocument(PDFParser(infile))
        try:
            for (level, title, dest, a, structelem) in document.get_outlines():
                # Single-argument call form: same output as the original
                # print statement, and also valid on Python 3.
                print(remove_non_ascii(title.strip()))
                toc = '-'
        except PDFNoOutlines:
            pass
    return toc
def extract_outlines(path):
    """Print the level and title of every outline entry of a PDF.

    Args:
        path: Path of the PDF file.

    Errors raised by ``get_outlines()`` are not handled here.
    """
    from pdfminer.pdfparser import PDFParser
    from pdfminer.pdfdocument import PDFDocument

    # ``with`` closes the document file, which used to be left open.
    with open(path, 'rb') as fp:
        document = PDFDocument(PDFParser(fp))

        # Walk the outline while the file is still open.
        for (level, title, dest, a, se) in document.get_outlines():
            print(level, title)
Ejemplo n.º 21
0
def parse(path):
    """Print the level and JSON-encoded title of every outline entry.

    Titles are encoded with ``json.dumps(..., ensure_ascii=False)``, so
    non-ASCII characters are printed as-is inside the quotes.

    Args:
        path: Path of the PDF file.

    Errors raised by ``get_outlines()`` are not handled here.
    """
    # Open in binary read mode; ``with`` closes the file, which used to be
    # left open.
    with open(path, 'rb') as fp:
        # Create a PDF parser from the file object.
        parser = PDFParser(fp)
        # Create the PDF document.
        doc = PDFDocument(parser)
        # Connect the parser with the document object.
        parser.set_document(doc)

        for (level, title, dest, a, se) in doc.get_outlines():
            print(level, json.dumps(title, ensure_ascii=False))
def via_toc(path):
    """Build a minimal metadata dict from the first outline title of a PDF.

    Args:
        path: Path of the PDF file.

    Returns:
        ``{"/Title": <title of the first outline entry>, "/Author": ""}``,
        or ``None`` when anything goes wrong (unreadable file, no outline,
        empty outline, ...).
    """
    try:
        # ``with`` closes the file, which used to be left open.
        with open(path, 'rb') as infile:
            document = PDFDocument(PDFParser(infile))
            entries = [
                (level, title)
                for (level, title, dest, a, structelem)
                in document.get_outlines()
            ]
        return {"/Title": entries[0][1], "/Author": ""}
    except Exception:
        # Spelled without ``, e`` so it parses on Python 2 and Python 3.
        return None
Ejemplo n.º 23
0
def extract_from_awspdf(url):
    """Return the level-2 outline titles found under the "Actions" section.

    The PDF is obtained through ``get_or_dl_pdf(url)`` (defined elsewhere).
    The outline is walked in order: each level-1 title becomes the current
    section, and every level-2 title seen while the current section is
    exactly ``"Actions"`` is collected.

    Args:
        url: Location handed to ``get_or_dl_pdf``.

    Returns:
        The collected titles, in outline order.
    """
    pdf_file = get_or_dl_pdf(url)
    print("Analyzing pdf... ", end='', flush=True)
    document = PDFDocument(PDFParser(pdf_file))

    actions = []
    section = ""
    for level, title, *_ in document.get_outlines():
        if level == 1:
            section = title
        elif level == 2 and section == "Actions":
            actions.append(title)
    return actions
Ejemplo n.º 24
0
def extract_TOC(pdf_path):
    """Return the outline of a PDF as an XML string.

    For each outline entry an ``<outline>`` element with its level and
    escaped title is emitted, followed by ``<dest>`` (when a destination is
    known) and ``<pageno>`` (when the destination maps to a page, counted
    from 1).

    Args:
        pdf_path: Path of the PDF file.

    Returns:
        The XML text, or ``""`` when the document has no outline.
    """
    toc = ""

    # ``with`` closes the file even when parsing raises.
    with open(pdf_path, 'rb') as fp:
        parser = PDFParser(fp)
        document = PDFDocument(parser, b"")
        pages = {page.pageid: pageno for (pageno, page)
                 in enumerate(PDFPage.create_pages(document), 1)}

        def resolve_dest(dest):
            # Named destinations are looked up in the document first.
            if isinstance(dest, str):
                dest = resolve1(document.get_dest(dest))
            elif isinstance(dest, PSLiteral):
                dest = resolve1(document.get_dest(dest.name))
            if isinstance(dest, dict):
                dest = dest['D']
            if isinstance(dest, PDFObjRef):
                dest = dest.resolve()
            return dest

        try:
            outlines = document.get_outlines()
            toc += '<outlines>\n'
            for (level, title, dest, a, se) in tqdm(outlines, leave=False):
                pageno = None
                if dest:
                    # NOTE: ``dest`` is deliberately overwritten with its
                    # resolved form; the code below relies on that.
                    dest = resolve_dest(dest)
                    pageno = pages[dest[0].objid]
                elif a:
                    action = a
                    if isinstance(action, dict):
                        subtype = action.get("S")
                        if subtype and repr(subtype) == "/'GoTo'" and action.get("D"):
                            dest = resolve_dest(action.get("D"))
                            pageno = pages[dest[0].objid]
                # ``escape_str`` is a helper defined elsewhere in the file.
                string = escape_str(title)
                if not isinstance(string, str):
                    # Only a non-native (Python 2 unicode) string is encoded.
                    # Encoding a Python 3 str produced bytes, which ended up
                    # in the XML as the literal text "b'...'".
                    string = string.encode("utf-8", "xmlcharrefreplace")
                toc += '<outline level="{!r}" title="{}">\n'.format(level, string)
                if dest is not None:
                    toc += "<dest>"
                    toc = dumpxml(toc, dest)
                    toc += "</dest>\n"
                if pageno is not None:
                    toc += "<pageno>{}</pageno>\n".format(pageno)
                toc += "</outline>\n"
            toc += "</outlines>\n"
        except PDFNoOutlines:
            pass

        parser.close()
    return toc
Ejemplo n.º 25
0
def parse(filename, maxlevel):
    """Print the outline of a PDF as HTML heading tags.

    Every outline entry whose level is at most ``maxlevel`` is printed as
    ``<hN>title</hN>``. Newlines are removed from the title and runs of
    whitespace are collapsed to single spaces. The title is not
    HTML-escaped.

    Args:
        filename: Path of the PDF file.
        maxlevel: Deepest outline level to print.
    """
    # ``with`` closes the file, which used to be left open.
    with open(filename, 'rb') as fp:
        doc = PDFDocument(PDFParser(fp))
        for (level, title, dest, a, se) in doc.get_outlines():
            if level > maxlevel:
                continue
            # Work on the text itself: encoding to bytes first made
            # ``replace('\n', '')`` raise TypeError on Python 3.
            title_words = title.replace('\n', '').split()
            title = ' '.join(title_words)
            print('<h{level}>{title}</h{level}>'
                  .format(level=level, title=title))
Ejemplo n.º 26
0
def process_text():
    """Fill the module-level index maps for the book.

    Reads the outline titles of ``Algorithms.pdf`` and the contents of
    ``Algorithms.txt``, then records:

    * ``ch_map``: offset in ``book`` of each collected level-1 title,
    * ``sec_map``: offset in ``book`` of each other collected title,
    * ``para_map``: paragraph start offsets, numbered from 1,
    * ``count``: number of entries in each of the three maps,
    * ``book_trie``: the result of ``make_trie()``.

    ``book.index`` raises ``ValueError`` when a title does not occur in the
    text; that is not handled for chapters and sections.
    """
    global count,ch_map,sec_map,para_map,book_trie,book

    fp = open('Algorithms.pdf', 'rb')
    parser = PDFParser(fp)
    document = PDFDocument(parser, "secret")
    outlines = document.get_outlines()

    #get index contexts
    # ``chapters`` and ``sections`` are not in the ``global`` statement; they
    # are only mutated by item assignment, so they must already exist at
    # module level. At most 9 level-1 titles and 44 other titles are kept,
    # keyed 1..n, each NFKD-normalised and reduced to ASCII bytes.
    i=j=0
    for (level,title,dest,a,se) in outlines:
        if level==1:
            if i<9:
                i+=1
                chapters[i]=unicodedata.normalize('NFKD', title).encode('ascii','ignore')
        else:
            if j< 44:
                j+=1
                sections[j]=unicodedata.normalize('NFKD', title).encode('ascii','ignore')
    

    # The book text is read in binary mode.
    with open('Algorithms.txt','rb') as f:
        book=f.read()       

    # Offset of the first occurrence of each chapter title in the book.
    # (The value stored in ``a`` here is not read afterwards.)
    a=0
    for j in chapters:
        ch_map[j]=book.index(chapters[j])
        a=ch_map[j]

    # Same for the section titles.
    a=0
    for j in sections:
        sec_map[j]=book.index(sections[j])
        a=sec_map[j]

    # Paragraph 1 starts at offset 0; every following paragraph starts two
    # characters after a blank line ('\n\n').
    # NOTE(review): ``book`` is read as bytes but searched with a str
    # pattern; on Python 3 that raises TypeError, which the bare ``except``
    # below turns into an immediate ``break`` -- confirm the target version.
    i=1
    a=0
    para_map[i]=a
    i+=1
    while(a<len(book)):
        try:
            a=book.index('\n\n',a+1)
        except:
            # No further blank line found: stop scanning.
            break
        para_map[i]=a+2
        i+=1

    count[1]=len(ch_map)
    count[2]=len(sec_map)
    count[3]=len(para_map)
    book_trie=make_trie()
Ejemplo n.º 27
0
def get_pageno(pdf_file):
    """Return the page index of the outline entry whose title starts with "III".

    Args:
        pdf_file: Path of the PDF file.

    Returns:
        The 0-based page index of the first outline entry whose title
        starts with ``III`` or ``Ⅲ`` and that has a destination, or
        ``None`` when there is no such entry.
    """
    logging.debug('get_pageno in...' + pdf_file)
    with open(pdf_file, 'rb') as fp:
        doc = PDFDocument(PDFParser(fp))
        pages = {
            page.pageid: pageno
            for (pageno, page) in enumerate(PDFPage.create_pages(doc))
        }

        # Look for the page of section "III. Financial matters".
        for (level, title, dest, a, se) in doc.get_outlines():
            if not title.startswith((u'III', u'Ⅲ')):
                continue
            if not dest:
                # Entries can carry an action instead of a destination.
                # Indexing ``dest`` for every entry used to crash before
                # the title was even checked.
                continue
            return pages[dest[0].objid]
    return None
Ejemplo n.º 28
0
def succ_test():
    """Print the permission flags and the outline object of a fixed PDF.

    Changes the working directory to a hard-coded folder, opens
    ``Making Games.pdf`` there and prints whether the document is
    extractable, modifiable and printable, followed by the value returned
    by ``get_outlines()``. Exceptions are printed as a traceback instead of
    being raised.
    """
    # Bound up front so the cleanup below is safe even when ``os.chdir`` or
    # ``open`` fails; the ``finally`` block used to raise NameError then.
    fp = None
    parser = None
    try:
        os.chdir(r'F:\allitebooks\making-games')
        fp = open('Making Games.pdf', 'rb')
        parser = PDFParser(fp)
        document = PDFDocument(parser)
        # One pre-formatted string gives the same line as the original
        # comma-separated print statement and parses on Python 2 and 3.
        print("extractable: %s ,modifiable: %s , printable: %s" % (
            document.is_extractable,
            document.is_modifiable,
            document.is_printable,
        ))
        outlines = document.get_outlines()
        print(outlines)
    except Exception:
        traceback.print_exc()
    finally:
        if parser is not None:
            parser.close()
        if fp is not None:
            fp.close()
Ejemplo n.º 29
0
def getDocTitle(filename):
    """Return the outline of a PDF as a list of ``[level, title]`` pairs.

    Args:
        filename: Path of the PDF file.

    Returns:
        One two-item list ``[level, title]`` per outline entry, in document
        order. Errors raised by ``get_outlines()`` are not handled here.
    """
    from pdfminer.pdfparser import PDFParser
    from pdfminer.pdfdocument import PDFDocument

    # ``with`` closes the document file, which used to be left open.
    with open(filename, 'rb') as fp:
        document = PDFDocument(PDFParser(fp), '')
        # Collect the result while the file is still open.
        return [
            [level, title]
            for (level, title, dest, a, se) in document.get_outlines()
        ]
Ejemplo n.º 30
0
def parse(filename, maxlevel):
    """Print the outline of a PDF as HTML heading tags.

    Every outline entry whose level is at most ``maxlevel`` is printed as
    ``<hN>title</hN>``. Newlines are removed from the title and runs of
    whitespace are collapsed to single spaces. The title is not
    HTML-escaped.

    Args:
        filename: Path of the PDF file.
        maxlevel: Deepest outline level to print.
    """
    # ``with`` closes the file, which used to be left open.
    with open(filename, 'rb') as fp:
        doc = PDFDocument(PDFParser(fp))
        for (level, title, dest, a, se) in doc.get_outlines():
            if level <= maxlevel:
                # Work on the text itself: encoding to bytes first made
                # ``replace('\n', '')`` raise TypeError on Python 3.
                title = ' '.join(title.replace('\n', '').split())
                print('<h{level}>{title}</h{level}>'.format(level=level,
                                                            title=title))
Ejemplo n.º 31
0
    def convertPDFFilter(self, path):
        """Collect the outline titles and the per-page text of a PDF.

        Outline entries are appended to ``self.titles`` as
        ``"<level> <title>"``; when the document has no outline,
        ``self.titles`` is reset to an empty list. Every processed page
        appends the concatenated text of its ``LTTextBoxHorizontal``
        elements to ``self.pages``. Pages with a 0-based index strictly
        between 20 and 40 are skipped. The page index is printed for every
        page, skipped or not.

        Args:
            path: Path of the PDF file.

        Returns:
            ``False`` when ``path`` does not exist, otherwise ``True``.
        """
        if not os.path.exists(path):
            return False

        # ``with`` and ``try/finally`` release the file, the device and the
        # buffer even when parsing raises; they used to leak on any error.
        with open(path, 'rb') as fp:
            ri = self.reinit()
            retstr = ri['retstr']
            device = ri['device']
            interpreter = ri['interpreter']
            try:
                document = PDFDocument(PDFParser(fp), self.password)
                try:
                    for (level, title, dest, a, se) in document.get_outlines():
                        self.titles.append(str(level) + ' ' + title)
                except PDFNoOutlines:
                    self.titles = []

                for i, page in enumerate(PDFPage.get_pages(fp)):
                    print(i)
                    if 20 < i < 40:
                        continue
                    interpreter.process_page(page)
                    layout = device.get_result()
                    ptxt = ''.join(
                        element.get_text()
                        for element in layout
                        if isinstance(element, LTTextBoxHorizontal)
                    )
                    self.pages.append(ptxt)
            finally:
                device.close()
                retstr.close()
        return True
Ejemplo n.º 32
0
def dumpoutline(outfp, fname, objids, pagenos, password='',
                dumpall=False, codec=None, extractdir=None):
    """Write the outline of a PDF to ``outfp`` as XML.

    For each outline entry an ``<outline>`` element with its level and
    escaped title is written, followed by ``<dest>`` (when a destination is
    known) and ``<pageno>`` (when the destination maps to a page; page
    numbers count from 0 here). A document without an outline produces no
    output.

    ``objids``, ``pagenos``, ``dumpall``, ``codec`` and ``extractdir`` are
    accepted but not used by this function.

    NOTE(review): this variant uses the Python 2 builtin ``file`` and
    ``doc.initialize``, so it presumably targets Python 2 and an older
    pdfminer release -- confirm before reusing.
    """
    fp = file(fname, 'rb')
    parser = PDFParser(fp)
    doc = PDFDocument(parser)
    doc.initialize(password)
    # Map each page's object id to its 0-based position in the document.
    pages = dict( (page.pageid, pageno) for (pageno,page)
                  in enumerate(PDFPage.create_pages(doc)) )
    def resolve_dest(dest):
        # Named destinations (string or literal) are looked up in the
        # document; a dictionary destination keeps only its 'D' entry.
        if isinstance(dest, str):
            dest = resolve1(doc.get_dest(dest))
        elif isinstance(dest, PSLiteral):
            dest = resolve1(doc.get_dest(dest.name))
        if isinstance(dest, dict):
            dest = dest['D']
        return dest
    try:
        outlines = doc.get_outlines()
        outfp.write('<outlines>\n')
        for (level,title,dest,a,se) in outlines:
            pageno = None
            if dest:
                dest = resolve_dest(dest)
                pageno = pages[dest[0].objid]
            elif a:
                # Without a direct destination, fall back to the entry's
                # action and use it only when it is a GoTo with a 'D' entry.
                action = a.resolve()
                if isinstance(action, dict):
                    subtype = action.get('S')
                    if subtype and repr(subtype) == '/GoTo' and action.get('D'):
                        dest = resolve_dest(action['D'])
                        pageno = pages[dest[0].objid]
            # ``e`` is the escaping helper defined elsewhere in the file.
            s = e(title).encode('utf-8', 'xmlcharrefreplace')
            outfp.write('<outline level="%r" title="%s">\n' % (level, s))
            if dest is not None:
                outfp.write('<dest>')
                dumpxml(outfp, dest)
                outfp.write('</dest>\n')
            if pageno is not None:
                outfp.write('<pageno>%r</pageno>\n' % pageno)
            outfp.write('</outline>\n')
        outfp.write('</outlines>\n')
    except PDFNoOutlines:
        # No outline in the document: nothing is written.
        pass
    # Not reached when an exception other than PDFNoOutlines escapes above.
    parser.close()
    fp.close()
    return
Ejemplo n.º 33
0
def display_pageno(pdffile):
    """Return the outline of a PDF with the page each entry points to.

    Args:
        pdffile: Path of the PDF file.

    Returns:
        A list of dicts with the keys ``"level"``, ``"title"``,
        ``"pageno"`` (1-based) and ``"pageid"`` (object id of the page).
        ``"pageno"`` and ``"pageid"`` are ``None`` when the entry has
        neither a destination nor a GoTo action with a destination.
        Errors raised by ``get_outlines()`` are not handled here.
    """
    result = []
    # ``with`` closes the file, which used to be left open.
    with open(pdffile, 'rb') as fp:
        doc = PDFDocument(PDFParser(fp))

        pages = {
            page.pageid: pageno
            for (pageno, page) in enumerate(PDFPage.create_pages(doc), 1)
        }

        def resolve_dest(dest):
            # Named destinations are looked up in the document first.
            if isinstance(dest, (str, bytes)):
                dest = resolve1(doc.get_dest(dest))
            elif isinstance(dest, PSLiteral):
                dest = resolve1(doc.get_dest(dest.name))
            if isinstance(dest, dict):
                dest = dest['D']
            if isinstance(dest, PDFObjRef):
                dest = dest.resolve()
            return dest

        for (level, title, dest, a, se) in doc.get_outlines():
            pageno = None
            pageid = None
            if dest:
                dest = resolve_dest(dest)
                pageid = dest[0].objid
                pageno = pages[pageid]
            elif a:
                action = a
                if isinstance(action, dict):
                    subtype = action.get('S')
                    if subtype and repr(subtype) == '/\'GoTo\'' and action.get(
                            'D'):
                        dest = resolve_dest(action['D'])
                        pageid = dest[0].objid
                        pageno = pages[pageid]
            result.append({
                "level": level,
                "title": title,
                "pageno": pageno,
                "pageid": pageid
            })
    return result
Ejemplo n.º 34
0
    def __get_outlines_pdf(self, book_name):
        """Collect outline titles and the object ids of the pages they link to.

        For each outline entry the object id of the destination's first
        element is appended to ``self.destination`` and the title to
        ``self.titles``. Collection stops at the first entry that has no
        destination, because such a title cannot be tied to a page. A
        document without an outline, or a destination that cannot be
        indexed, leaves the lists as collected so far.

        Args:
            book_name: Path of the PDF file.
        """
        # ``with`` closes the file on every exit path; it used to stay open
        # when an exception other than the two handled below occurred.
        with open(book_name, 'rb') as fp:
            document = PDFDocument(PDFParser(fp))
            try:
                for (level, title, dest, a, se) in document.get_outlines():
                    if not dest:
                        break
                    self.destination.append(dest[0].objid)
                    self.titles.append(title)
            except (PDFNoOutlines, TypeError):
                pass
Ejemplo n.º 35
0
    def get_toc(pdf_path):
        """Return the outline of a PDF together with the document object.

        Args:
            pdf_path: Path of the PDF file.

        Returns:
            A ``(toc, document)`` tuple. ``toc`` is a list of dicts with the
            keys ``'level'``, ``'title'`` and ``'ref'``; ``'ref'`` maps each
            key of the entry's action to the ``str()`` of its value and is
            empty when the entry has no action.
        """
        # The file is intentionally not closed here: ``document`` is handed
        # back to the caller and presumably still reads from it -- confirm.
        infile = open(pdf_path, 'rb')
        document = PDFDocument(PDFParser(infile))

        toc = []
        for (level, title, dest, ref, structelem) in document.get_outlines():
            # The action may be missing or already a plain mapping; calling
            # ``.resolve()`` unconditionally raised AttributeError then.
            resolved_ref = ref.resolve() if hasattr(ref, 'resolve') else ref
            stringified_ref = {
                key: str(value)
                for key, value in (resolved_ref or {}).items()
            }
            toc.append({
                'level': level,
                'title': title,
                'ref': stringified_ref
            })

        return toc, document
Ejemplo n.º 36
0
    def extract_contents(self):
        """Fill ``self.outlines`` with ``(title, page number)`` pairs.

        Sets ``self.total_pages`` and ``self.pages`` (a list of
        ``(page, page number)`` pairs numbered from 1), then appends one
        ``(title, page number)`` tuple to ``self.outlines`` for every
        outline entry whose destination points at a known page.

        Returns:
            ``None`` in every case; a document without a built-in outline
            leaves ``self.outlines`` untouched.
        """
        parser = PDFParser(self.fd)
        doc = PDFDocument(parser)
        self.total_pages = self.get_pages_total()
        # Materialised as a list: on Python 3 ``zip`` is a single-pass
        # iterator, so the first page search consumed it and later lookups
        # found nothing.
        # NOTE(review): ``range(1, total_pages)`` yields total_pages - 1
        # numbers, so the last page gets no number if ``get_pages_total()``
        # returns the real page count -- confirm.
        self.pages = list(
            zip(PDFPage.get_pages(self.fd), range(1, self.total_pages)))

        try:
            outlines = doc.get_outlines()
        except PDFNoOutlines:
            # No built-in outlines
            return None

        # Built-in outlines exist: index the pages once by object id
        # (first match wins, as in a linear search).
        page_numbers = {}
        for page, pagenum in self.pages:
            page_numbers.setdefault(page.pageid, pagenum)

        for (level, title, dest, a, se) in outlines:
            if dest is not None:
                pn = page_numbers.get(dest[0].objid, 0)
                if pn > 0:
                    self.outlines.append((title, pn))
Ejemplo n.º 37
0
def get_headings(filename):
    """Return the outline headings and page count of the PDF matching *filename*.

    Changes to the parent directory, calls ``rd.open_location("/PDF", True)``
    and scans the current working directory for the first entry whose name
    without its last 4 characters equals *filename* without its last 14
    characters (presumably a ``.pdf`` extension and a fixed-length suffix --
    verify against the caller).

    Args:
        filename: Name from which the PDF's base name is derived.

    Returns:
        ``(headings, pages)`` for the first matching file, where ``headings``
        is a list of ``(level, title)`` tuples, or ``None`` if reading the
        outline failed, and ``pages`` is the number of pages. Returns ``None``
        implicitly when no file matches.
    """
    os.chdir('..')
    rd.open_location("/PDF", True)
    filename_ = filename[:-14]

    for compare_filename in os.listdir(os.getcwd()):
        if filename_ != compare_filename[:-4]:
            continue

        # The context manager guarantees the PDF is closed on every path.
        with open(compare_filename, 'rb') as in_file:
            document = PDFDocument(PDFParser(in_file))
            pages = sum(1 for _ in PDFPage.get_pages(in_file))
            try:
                # Consume the outline while the file is still open.
                headings_list = [
                    (level, title)
                    for (level, title, dest, a, structelem)
                    in document.get_outlines()
                ]
            except Exception:
                # Best effort: a missing or unreadable outline yields None,
                # without swallowing KeyboardInterrupt/SystemExit.
                headings_list = None

        rd.open_location("/program", True)
        return headings_list, pages
Ejemplo n.º 38
0
 def valid_toc(self, toc):
     """Check that the outline stored in the PDF matches *toc*.

     Each expected entry is indexed as ``(level, page, title)``. It matches
     an outline entry when ``level + 1`` equals the outline level, the first
     element of the outline destination refers to page ``page - 1``
     (according to ``self._is_reference_to_ith_page``) and the titles are
     equal. A PDF without outlines is valid only for an empty *toc*.

     Returns:
         True if every entry matches, False otherwise.
     """
     with open(str(self._doc), "rb") as handle:
         pdf_document = PDFDocument(PDFParser(handle))
         try:
             actual_entries = list(pdf_document.get_outlines())
         except PDFNoOutlines:
             return len(toc) == 0

         print("TOC from PDF file:", actual_entries)
         if len(actual_entries) != len(toc):
             print("Incorrect TOC length")
             return False

         for expected, actual in zip(toc, actual_entries):
             print("Checking", expected)
             # Short-circuit order matters: level, then destination, then
             # title, so later fields are only touched when earlier ones match.
             entry_matches = (
                 expected[0] + 1 == actual[0]
                 and self._is_reference_to_ith_page(actual[2][0],
                                                    expected[1] - 1)
                 and expected[2] == actual[1]
             )
             if not entry_matches:
                 return False
     return True
Ejemplo n.º 39
0
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument

# Open a PDF document; the context manager closes the file once the
# outline has been printed.
with open('mypdf.pdf', 'rb') as fp:
    parser = PDFParser(fp)
    document = PDFDocument(parser)

    # Get the outlines of the document. Each entry is a 5-tuple of
    # (level, title, destination, action, structure element).
    outlines = document.get_outlines()
    for (level, title, dest, a, se) in outlines:
        print(level, title)
Ejemplo n.º 40
0
    def createFromPdfminer(filename):
        """Build a ``PDFInfos`` object from the PDF at *filename* using pdfminer.

        Fills in ``_metaInfo``, ``_pageCount``, ``_outline`` and
        ``_pageInfos`` on the result and, when the catalog lookup for
        ``Names``/``Dests`` succeeds, ``_names``.

        NOTE(review): this reads as Python 2 code (``basestring``,
        ``str.decode``, comparing ``dict.keys()`` with a list) -- confirm the
        target interpreter before porting.
        NOTE(review): ``fp`` is opened below and never closed in this function.
        """
        from pdfminer.pdfparser import PDFParser
        from pdfminer.pdfdocument import PDFDocument, PDFNoOutlines
        from pdfminer.pdfpage import PDFPage
        from pdfminer.pdftypes import PDFObjRef

        fp = open(filename, 'rb')
        parser = PDFParser(fp)
        doc = PDFDocument(parser)
        doc.initialize()
        # Refuse documents flagged as not extractable (an assert, so this
        # check disappears when Python runs with -O).
        assert doc.is_extractable

        result = PDFInfos()
        # Keep only the string-valued entries of the first info dictionary;
        # values starting with the bytes '\xfe\xff' are decoded as UTF-16.
        result._metaInfo = dict((key, str.decode(value, 'utf-16') if value.startswith('\xfe\xff') else value)
                                for key, value in doc.info[0].items()
                                if isinstance(value, basestring))

        # Page object ids in iteration order; a page's position in this list
        # is used as its page index throughout the rest of the function.
        pageids = [page.pageid for page in PDFPage.create_pages(doc)]
        result._pageCount = len(pageids)

        def get(obj, attr = None):
            """Resolve PDFObjRefs, otherwise a no-op. May also perform
            dict lookup, i.e. get(obj, 'A') is roughly the same as
            get(obj)['A']."""
            while isinstance(obj, PDFObjRef):
                obj = obj.resolve()
            if attr is not None:
                # the looked-up value is resolved as well
                return get(obj[attr])
            return obj

        def actionToPageIndex(action):
            """Return the page index targeted by a 'GoTo' action."""
            assert get(action, 'S').name == 'GoTo'
            name = get(action, 'D')
            # resolve "named destination":
            dest = get(doc.get_dest(name))
            return destToPageIndex(dest)

        def destToPageIndex(dest):
            """Return the index in ``pageids`` of the page *dest* points to.

            Raises ValueError (from ``list.index``) when the page object id
            is not in ``pageids``.
            """
            dest = get(dest)
            if isinstance(dest, dict):
                # dictionary form: the actual destination is under 'D'
                # NOTE(review): keys() == ['D'] relies on keys() returning a
                # list (Python 2 behaviour).
                assert dest.keys() == ['D'], repr(dest)
                dest = get(dest, 'D')
            # destinations contain the page as first element,
            # the rest concerns the ROI / zoom state (various modes there):
            return pageids.index(dest[0].objid)

        try:
            # One (level, title, page index) tuple per outline entry; the
            # action is used when the entry has one, the destination otherwise.
            result._outline = [(level, title, actionToPageIndex(a) if a else destToPageIndex(dest))
                               for level, title, dest, a, se in doc.get_outlines()]
        except PDFNoOutlines:
            result._outline = None

        result._pageInfos = []

        # get annotations (links):
        for page in PDFPage.create_pages(doc):
            # (rect, target) pairs; target is a page index for internal
            # links and a URI string for 'URI' actions
            pageLinks = []

            for anno in get(page.annots) or []:
                anno = get(anno)
                # 'Rect' values reshaped into a 2x2 float array
                rect = numpy.array(get(anno, 'Rect'), float).reshape((2, 2))
                if 'Dest' in anno:
                    # 'Dest' is the older (more compatible) way to
                    # specify links
                    dest = get(anno, 'Dest')
                    pageLinks.append((rect, destToPageIndex(dest)))
                elif 'A' in anno:
                    # actions are much more general and include 'GoTo'
                    # (with viewport spec.) with variants for remote
                    # and embedded documents
                    action = get(anno, 'A')
                    subType = get(action, 'S').name
                    if subType == 'GoTo':
                        pageLinks.append((rect, actionToPageIndex(action)))
                    elif subType == 'URI':
                        #assert sorted(action.keys()) == ['S', 'Type', 'URI']
                        link = get(action, 'URI')
                        if link.startswith('file:'):
                            # resolve relative pathname w.r.t. PDF filename:
                            link = 'file:' + os.path.join(os.path.dirname(filename),
                                                          link[5:])
                        pageLinks.append((rect, link))
                    # other action subtypes are ignored

            # media box values reshaped into a 2x2 float array
            pageBox = numpy.array([page.mediabox], float).reshape((2, 2))

            result._pageInfos.append(PDFPageInfos(links = pageLinks, pageBox = pageBox))

        # extract all named destinations:
        def extract_names(dests, result = None):
            """Recursively collect ``name -> page index`` from a name tree node.

            Note that the ``result`` parameter shadows the enclosing
            function's ``result`` inside this helper; here it is the dict
            being filled (created on the outermost call).
            """
            if result is None:
                result = {}
            if 'Names' in dests:
                # 'Names' is consumed pairwise: zip(it, it) takes two
                # consecutive items from the same iterator as (name, ref)
                it = iter(get(dests, 'Names'))
                for name, ref in zip(it, it):
                    result[name] = destToPageIndex(ref)
            if 'Kids' in dests:
                for kid in get(dests, 'Kids'):
                    extract_names(get(kid), result)
            return result

        try:
            result._names = extract_names(get(doc.catalog['Names'], 'Dests'))
        except KeyError:
            # A KeyError raised while evaluating the line above leaves
            # ``_names`` unassigned by this function.
            pass

        return result
Ejemplo n.º 41
0
    def pdf_cover(self, pdf, images):
        '''Use the outline embedded in the PDF to pick the image that should
        serve as the cover or primary image for the volume.

        Only the first 15 outline entries are inspected. Entries titled
        "cover" or "title page" (case-insensitive) whose target page has a
        non-blank image are candidates, and the lowest candidate page number
        is returned.

        :param pdf: path to the pdf file for this volume
        :param images: list of image file paths for this volume
        :returns: zero-based page number of the chosen image, or None when
            the PDF has no outline or no candidate was found
        '''
        with open(pdf, 'rb') as pdf_file:
            document = PDFDocument(PDFParser(pdf_file))
            try:
                outlines = document.get_outlines()
            except PDFNoOutlines:
                logger.debug('PDF %s does not include outline information', pdf)
                return None
            else:
                logger.debug('PDF %s includes outline information, using for cover identification',
                             pdf)

            # page object id -> zero-based page number
            pages = {
                page.pageid: pageno
                for pageno, page in enumerate(PDFPage.create_pages(document))
            }

            possible_coverpages = []
            for entry_count, (level, title, dest, a, se) in enumerate(outlines, 1):
                # NOTE: some LSDI PDFs trigger a maximum recursion error in
                # pdfminer, so stop after a fixed number of outline items.
                # Caveat: outline entries are not necessarily in page order.
                if entry_count > 15:
                    break

                # dest is the target page object; it can apparently be None
                # in some cases, and such entries are skipped
                if dest is None:
                    continue

                # Either Cover or Title Page will do; there may be several
                # Covers (e.g. a back cover). title is the outline label.
                if title.lower() not in ('cover', 'title page'):
                    continue

                # page number the outline entry refers to
                page_num = pages[dest[0].objid]

                try:
                    candidate_image = images[page_num]
                except IndexError:
                    logger.error('Not enough images for requested page number %s',
                                 page_num)
                    continue

                # what is labeled as the cover sometimes turns out to be
                # blank; such pages are NOT accepted as a possible cover
                if not self.is_blank_page(candidate_image):
                    logger.debug('PDF outline places %s at page %s', title, page_num)
                    possible_coverpages.append(page_num)
                else:
                    logger.debug('PDF outline places %s at page %s but it is blank', title, page_num)

            if possible_coverpages:
                # for now, the lowest page number wins; that should be the
                # first cover, or the title page if the cover is blank
                return min(possible_coverpages)