Beispiel #1
0
def dumpoutline(outfp,
                fname,
                objids,
                pagenos,
                password='',
                dumpall=False,
                codec=None,
                extractdir=None):
    """Write the outline entries of the PDF ``fname`` to ``outfp`` as XML.

    For every outline entry an ``<outline>`` element is written with its
    level and title, followed by a ``<dest>`` element (when a destination
    could be determined) and a ``<pageno>`` element (1-based, looked up via
    the page's object id).  If the document has no outlines nothing is
    written at all.

    Parameters:
        outfp:    writable object; only its ``write`` method is used.
        fname:    path of the PDF file to read.
        password: password handed to ``PDFDocument``.
        objids, pagenos, dumpall, codec, extractdir: not used by this
            function; they are accepted so that the signature matches the
            sibling ``dumppdf`` / ``extractembedded`` functions.

    Returns:
        None.
    """
    # The context manager guarantees the file is closed even when parsing or
    # writing fails half-way through.
    with open(fname, 'rb') as fp:
        parser = PDFParser(fp)
        try:
            doc = PDFDocument(parser, password)
            # Map each page's object id to its 1-based page number.
            pages = {page.pageid: pageno
                     for (pageno, page)
                     in enumerate(PDFPage.create_pages(doc), 1)}

            def resolve_dest(dest):
                # Named destinations (string or literal) are looked up in the
                # document first; dict and reference wrappers are then peeled
                # off one after the other.
                if isinstance(dest, str):
                    dest = resolve1(doc.get_dest(dest))
                elif isinstance(dest, PSLiteral):
                    dest = resolve1(doc.get_dest(dest.name))
                if isinstance(dest, dict):
                    dest = dest['D']
                if isinstance(dest, PDFObjRef):
                    dest = dest.resolve()
                return dest

            try:
                outlines = doc.get_outlines()
                outfp.write('<outlines>\n')
                for (level, title, dest, a, se) in outlines:
                    pageno = None
                    if dest:
                        dest = resolve_dest(dest)
                        pageno = pages[dest[0].objid]
                    elif a:
                        action = a
                        if isinstance(action, dict):
                            subtype = action.get('S')
                            # Only "GoTo" actions that carry a destination
                            # are followed.
                            if subtype and repr(subtype) == '/\'GoTo\'' and action.get(
                                    'D'):
                                dest = resolve_dest(action['D'])
                                pageno = pages[dest[0].objid]
                    # `e` is defined outside this block (presumably an
                    # XML-escaping helper -- confirm).
                    # NOTE(review): on Python 3 ``.encode`` yields bytes, so
                    # the ``%s`` below would render as ``b'...'`` -- confirm
                    # the intended interpreter before changing this.
                    s = e(title).encode('utf-8', 'xmlcharrefreplace')
                    outfp.write('<outline level="%r" title="%s">\n' % (level, s))
                    if dest is not None:
                        outfp.write('<dest>')
                        dumpxml(outfp, dest)
                        outfp.write('</dest>\n')
                    if pageno is not None:
                        outfp.write('<pageno>%r</pageno>\n' % pageno)
                    outfp.write('</outline>\n')
                outfp.write('</outlines>\n')
            except PDFNoOutlines:
                # A document without outlines is not an error.
                pass
        finally:
            parser.close()
    return
def createPDFDoc(fpath):
    """Open the PDF at ``fpath`` and return it as a ``PDFDocument``.

    The document is opened with an empty password.

    Parameters:
        fpath: path of the PDF file.

    Returns:
        The ``PDFDocument`` built from the file.  The underlying file object
        is left open on success (presumably the document keeps reading from
        it -- confirm before closing it here).

    Raises:
        AssertionError: if the document does not allow text extraction.
    """
    fp = open(fpath, 'rb')
    try:
        document = PDFDocument(PDFParser(fp), password='')
        # Explicit raise instead of ``assert`` so the check is not stripped
        # when Python runs with -O; the exception type is unchanged.
        if not document.is_extractable:
            raise AssertionError(
                'PDF does not allow text extraction: %r' % (fpath,))
    except Exception:
        # Nobody will use the handle after a failure, so do not leak it.
        fp.close()
        raise
    return document
Beispiel #3
0
def dumppdf(outfp, fname, objids, pagenos, password='',
            dumpall=False, codec=None, extractdir=None):
    """Dump selected parts of the PDF ``fname`` to ``outfp`` via ``dumpxml``.

    What is dumped depends on the arguments:

    * every object whose id is listed in ``objids``;
    * every page whose 0-based index is in ``pagenos`` -- the page's content
      streams when ``codec`` is set, otherwise the page attributes;
    * all objects when ``dumpall`` is true;
    * the trailers when none of the three selectors above is given.

    Parameters:
        outfp:      writable object receiving the output.
        fname:      path of the PDF file to read.
        objids:     iterable of object ids to dump (may be empty/None).
        pagenos:    container of 0-based page indices (may be empty/None).
        password:   password handed to ``PDFDocument``.
        dumpall:    dump every object of the document.
        codec:      forwarded to the dump helpers; a trailing newline is
                    written unless it is ``'raw'`` or ``'binary'``.
        extractdir: unused here; kept for signature compatibility.

    Returns:
        None.
    """
    # ``with`` closes the file even if one of the dump helpers raises.
    with open(fname, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser, password)
        if objids:
            for objid in objids:
                obj = doc.getobj(objid)
                dumpxml(outfp, obj, codec=codec)
        if pagenos:
            for (pageno, page) in enumerate(PDFPage.create_pages(doc)):
                if pageno not in pagenos:
                    continue
                if codec:
                    for obj in page.contents:
                        obj = stream_value(obj)
                        dumpxml(outfp, obj, codec=codec)
                else:
                    dumpxml(outfp, page.attrs)
        if dumpall:
            dumpallobjs(outfp, doc, codec=codec)
        if not (objids or pagenos or dumpall):
            dumptrailers(outfp, doc)
    if codec not in ('raw', 'binary'):
        outfp.write('\n')
    return
Beispiel #4
0
def extractembedded(outfp, fname, objids, pagenos, password='',
                    dumpall=False, codec=None, extractdir=None):
    """Extract every embedded file of the PDF ``fname`` into ``extractdir``.

    All objects listed in the document's xref tables are inspected; each
    dictionary whose ``Type`` is ``LITERAL_FILESPEC`` is written to a file
    named after the base name of its ``UF`` (or, failing that, ``F``) entry.

    Parameters:
        fname:      path of the PDF file to read.
        password:   password handed to ``PDFDocument``.
        extractdir: directory the embedded files are written to.
        outfp, objids, pagenos, dumpall, codec: not used by this function;
            accepted for signature compatibility with the sibling dump
            functions.

    Returns:
        None.

    Raises:
        PDFValueError: if a file specification does not reference an
            embedded-file stream.
        IOError: if the target file already exists (nothing is overwritten).
    """
    def extract1(obj):
        # ``UF`` may be missing from the dictionary; use ``.get`` so that a
        # file specification carrying only ``F`` does not raise KeyError.
        filename = os.path.basename(obj.get('UF') or obj['F'])
        fileref = obj['EF']['F']
        fileobj = doc.getobj(fileref.objid)
        if not isinstance(fileobj, PDFStream):
            raise PDFValueError(
                'unable to process PDF: reference for %r is not a PDFStream' %
                (filename))
        if fileobj.get('Type') is not LITERAL_EMBEDDEDFILE:
            raise PDFValueError(
                'unable to process PDF: reference for %r is not an EmbeddedFile' %
                (filename))
        path = os.path.join(extractdir, filename)
        if os.path.exists(path):
            raise IOError('file exists: %r' % path)
        print('extracting: %r' % path, file=sys.stderr)
        # ``open`` instead of the Python-2-only ``file`` builtin; the context
        # manager closes the output even if writing fails.
        with open(path, 'wb') as out:
            out.write(fileobj.get_data())
        return

    with open(fname, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser, password)
        for xref in doc.xrefs:
            for objid in xref.get_objids():
                obj = doc.getobj(objid)
                if isinstance(obj, dict) and obj.get('Type') is LITERAL_FILESPEC:
                    extract1(obj)
    return
    def __init__(self, pdf, codec='utf-8'):
        """
        Read a PDF, collecting its text lines and handing its images on.

        Parameters:
        --------------
        pdf:        the PDF source; it is passed to PDFParser unchanged
                    (NOTE(review): earlier documentation called this a path,
                    but no file is opened here -- confirm what callers pass)
        codec:      codec given to the text converter, default utf-8

        Attributes:
        ---------------
        records:        list of text lines extracted from the pdf
        text:           string, initialised to "" (not filled in here)
        didascalies:    list, initialised empty (not filled in here)
        nimages:        int, initialised to 0
        images:         list, initialised empty

        Raises PDFTextExtractionNotAllowed when the document is not
        extractable.
        """
        self.pdf = pdf
        self.text = ""
        self.records = []
        self.didascalies = []
        self.nimages = 0
        self.images = []

        document = PDFDocument(PDFParser(pdf))
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed

        # Shared resource manager, output buffer and layout parameters for
        # the text pass.
        resource_manager = PDFResourceManager()
        text_buffer = StringIO()
        layout_params = LAParams()
        self.codec = codec
        text_device = TextConverter(resource_manager,
                                    text_buffer,
                                    codec=codec,
                                    laparams=layout_params)
        text_interpreter = PDFPageInterpreter(resource_manager, text_device)
        # First pass: render every page into the text buffer.
        for page in PDFPage.create_pages(document):
            text_interpreter.process_page(page)

        # Second pass: aggregate the layout of every page and pass images
        # and figures to the dedicated methods of this class.
        image_device = PDFPageAggregator(resource_manager,
                                         laparams=layout_params)
        image_interpreter = PDFPageInterpreter(resource_manager, image_device)
        for page in PDFPage.create_pages(document):
            image_interpreter.process_page(page)
            page_layout = image_device.get_result()
            if page_layout is None:
                continue
            for item in page_layout:
                if isinstance(item, LTImage):
                    self.save_image(item)
                if isinstance(item, LTFigure):
                    self.find_images_in_thing(item)

        self.records.extend(text_buffer.getvalue().splitlines())
Beispiel #6
0
Datei: pdf.py Projekt: rrbn/tiltr
def _extract_pdf_scores(stream):
    """Read the result table on the first page of a PDF into a dict.

    The text box containing the table header word ("Reihenfolge") is located
    on the first page; its text from that word onwards is split into
    whitespace-separated columns.  Columns are consumed six at a time and
    the third column of each group is mapped to the fifth one (question
    title -> score, judging by the comments kept below).

    :param stream: binary file-like object handed to ``PDFParser``.
    :return: dict mapping column 3 to column 5 of every six-column group.
    """
    # these laparams seem to work ok with the ILIAS default PDF
    # formatting as well as with UR custom styling.

    # see pdf/tests/default_style.pdf and pdf/tests.ur_style.pdf
    layout_params = LAParams(line_overlap=0,
                             char_margin=20,
                             word_margin=0.1,
                             boxes_flow=0,
                             detect_vertical=False)

    resource_manager = PDFResourceManager()
    aggregator = PDFPageAggregator(resource_manager, laparams=layout_params)
    interpreter = PDFPageInterpreter(resource_manager, aggregator)

    document = PDFDocument(PDFParser(stream))

    # Only the first page is looked at.
    first_page = next(PDFPage.create_pages(document))
    interpreter.process_page(first_page)

    order_name = "Reihenfolge"  # FIXME localize

    text_boxes = [element for element in aggregator.get_result()
                  if isinstance(element, LTTextBoxHorizontal)]

    # y position of result table header; the last box mentioning the header
    # word wins.
    table_head_y = None
    for box in text_boxes:
        if order_name in box.get_text().strip():
            table_head_y = box.y0

    header_row_boxes = [box for box in text_boxes if box.y0 == table_head_y]

    # if LAParams is set correctly, head should extract the whole
    # results table's text now.
    table = header_row_boxes[0].get_text().replace('\t', '')
    table = table[table.find(order_name):]

    # note: question titles might lack spaces; this is no problem
    # since we compare question names and scores only through
    # Result.normalize_question_title() later.

    scores = {}
    pending = []
    # The first line is the header itself and is skipped.
    for line in table.split("\n")[1:]:
        pending.extend(re.split(r'\s+', line))
        if len(pending) >= 6:
            scores[pending[2]] = pending[4]
            pending = pending[6:]

    return scores
Beispiel #7
0
def convert_pdf_to_txt(path):
    """Return the text of all text boxes and text lines of the PDF at ``path``.

    Every page is laid out with default ``LAParams``; the text of each
    top-level ``LTTextBox`` / ``LTTextLine`` layout object is concatenated
    in page order.

    :param path: path of the PDF file.
    :return: the extracted text as one string ('' if nothing was found).
    """
    pieces = []
    # ``with`` makes sure the file is closed once extraction is finished (or
    # has failed); previously the handle was never closed.
    with open(path, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        # Parser and document have to be linked to each other before the
        # document is initialised with the (empty) password.
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize('')
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Process each page contained in the document.
        for page in doc.get_pages():
            interpreter.process_page(page)
            for lt_obj in device.get_result():
                if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                    pieces.append(lt_obj.get_text())
    # Joining once avoids repeated string concatenation.
    return ''.join(pieces)
 def _createPDFDoc(self, fpath, password):
     """Open ``fpath`` and build a ``PDFDocument`` using ``password``.

     Returns:
         A ``(fp, document)`` tuple; ``fp`` is the open binary file object
         and is left open on success, so the caller is the one to close it.

     Raises:
         AssertionError: if the password is rejected
             (``PDFPasswordIncorrect``) or ``PDFDocument`` raises
             ``TypeError``.
     """
     fp = open(fpath, 'rb')
     try:
         parser = PDFParser(fp)
         try:
             document = PDFDocument(parser, password)
         except PDFPasswordIncorrect:
             raise AssertionError(
                 "Password '{}' is incorrect.".format(password))
         except TypeError:
             raise AssertionError(
                 "Unable to extract the pdf. Please check the password.")
     except Exception:
         # The caller never receives ``fp`` on failure, so close it here
         # instead of leaking the handle.
         fp.close()
         raise
     return fp, document
def convert(pdffile):
    """Extract the text of ``pdffile`` and write it, UTF-8 encoded, to ``log_file``.

    The text of every top-level ``LTTextBox`` / ``LTTextLine`` layout object
    on every page is concatenated in page order.  ``"Done !!"`` is printed
    once the output has been written.

    NOTE(review): ``password`` and ``log_file`` are not defined in this
    function; they have to exist as module-level names -- confirm.

    :param pdffile: path of the PDF file to read.
    :return: None.
    :raises PDFTextExtractionNotAllowed: if the document is not extractable.
    """
    text_parts = []

    # Open the pdf file in binary mode; ``with`` closes it even when the
    # document turns out not to be extractable or parsing fails.
    with open(pdffile, "rb") as fp:
        # Create parser object to parse the pdf content
        parser = PDFParser(fp)

        # Store the parsed content in PDFDocument object
        document = PDFDocument(parser, password)

        # Check if document is extractable, if not abort
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed

        # Resource manager that stores shared resources such as fonts or
        # images
        rsrcmgr = PDFResourceManager()

        # Parameters for the layout analysis
        laparams = LAParams()

        # The page aggregator is the device that collects the LT layout
        # objects of a page
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)

        # The interpreter processes page content and feeds the device
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # Process the document page by page
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            # The device holds the layout of the page just processed
            layout = device.get_result()
            # Of the many LT objects in the layout only text boxes and text
            # lines are of interest
            for lt_obj in layout:
                if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                    text_parts.append(lt_obj.get_text())

    extracted_text = "".join(text_parts)

    with open(log_file, "wb") as my_log:
        my_log.write(extracted_text.encode("utf-8"))
    print("Done !!")
def parse(path):
    """Return the text of the PDF at ``path`` as a list with one string per page.

    For every page the text of all top-level ``LTTextBoxHorizontal`` layout
    objects is concatenated; a page without such boxes contributes ``""``.

    :param path: path of the PDF file.
    :return: list of page texts, in page order.
    :raises PDFTextExtractionNotAllowed: if the document is not extractable.
    """
    # Open in binary read mode; ``with`` closes the file when done (it was
    # never closed before).
    with open(path, 'rb') as fp:
        parser = PDFParser(fp)
        # Create an empty PDF document
        doc = PDFDocument()
        # Connect the parser and the document object to each other
        parser.set_document(doc)
        doc.set_parser(parser)

        # Initialise the document; no password argument is passed
        doc.initialize()

        # Refuse documents that do not allow text extraction
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        # Resource manager for shared resources
        rsrcmgr = PDFResourceManager()
        # Device that aggregates the layout of a page
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        # Interpreter that feeds pages to the device
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        fulltext = []
        # Handle one page per iteration
        for page in doc.get_pages():
            interpreter.process_page(page)
            # Layout object of the page that was just processed
            layout = device.get_result()
            fulltext.append("".join(
                x.get_text() for x in layout
                if isinstance(x, LTTextBoxHorizontal)))
    return fulltext
Beispiel #11
0
def main(args):
    """Parse one PDF file and report some pdfminer-specific statistics.

    Reports, through ``msg``, the number of pages and how often each layout
    object type occurs among the (nested) layout objects of all pages.

    :param args: command-line arguments; exactly one PDF file name is
        expected.
    :return: 1 after printing the usage text when the argument count is
        wrong, otherwise None.
    :raises PDFTextExtractionNotAllowed: if the document is not extractable.
    """
    msg(SCRIPT, args)

    if len(args) != 1:
        msg('Parse a PDF file and print some pdfminer-specific stats')
        msg('Usage:', SCRIPT, '<PDF-filename>')
        return 1

    infilename, = args

    lt_types = collections.Counter()
    # Pre-set so the final report also works for a document without pages
    # (the loop variable would otherwise never be bound).
    page_count = 0

    with open(infilename, 'rb') as pdf_file:

        # Create a PDF parser object associated with the file object.
        parser = PDFParser(pdf_file)

        # Create a PDF document object that stores the document structure.
        # Supply the password for initialization.
        password = ''
        document = PDFDocument(parser, password)
        # Check if the document allows text extraction.
        if not document.is_extractable:
            # Was ``filename``, a name that does not exist in this function.
            raise PDFTextExtractionNotAllowed(infilename)

        # Make a page iterator
        pages = PDFPage.create_pages(document)

        # Set up for some analysis
        rsrcmgr = PDFResourceManager()
        laparams = LAParams(
            detect_vertical=True,
            all_texts=True,
            )
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # Look at all (nested) objects on each page
        for page_count, page in enumerate(pages, 1):
            # The device only holds the layout of the page processed last,
            # so it has to be fetched right after processing.
            interpreter.process_page(page)
            layout = device.get_result()

            lt_types.update(type(item).__name__ for item in flat_iter(layout))

    msg('page_count', page_count)
    msg('lt_types:', ' '.join('{}:{}'.format(*tc) for tc in lt_types.items()))
Beispiel #12
0
def get_ToC(file):
    """Print the level and title of every outline entry of a PDF.

    Nothing is returned; the entries are only printed, one per line.

    Args:
        file: path of the PDF file to read.
    """
    # ``with`` closes the document's file once the outlines have been
    # printed; it was left open before.
    with open(file, 'rb') as fp:
        parser = PDFParser(fp)
        document = PDFDocument(parser)

        # Walk the outlines of the document while the file is still open.
        for (level, title, dest, a, se) in document.get_outlines():
            print(level, title)
Beispiel #13
0
def extractembedded(fname, password='', extractdir=None, emailsDir=None):
    """Extract the embedded files of the PDF ``emailsDir/fname`` into ``extractdir``.

    Each embedded file is written as ``"<fname without extension> <embedded
    name>"``.  When that name is already taken, a random number between 1
    and 100 is inserted until an unused name is found; existing files are
    never overwritten.

    Parameters:
        fname:      file name of the PDF, relative to ``emailsDir``.
        password:   password handed to ``PDFDocument``.
        extractdir: directory the embedded files are written to.
        emailsDir:  directory containing ``fname``.

    Returns:
        None.

    Raises:
        PDFValueError: if a file specification does not reference an
            embedded-file stream.
    """
    def extract1(obj):
        filename = os.path.basename(obj['F'])
        fileref = obj['EF']['F']
        fileobj = doc.getobj(fileref.objid)
        if not isinstance(fileobj, PDFStream):
            raise PDFValueError(
                'unable to process PDF: reference for %r is not a PDFStream' %
                (filename))
        if fileobj.get('Type') is not LITERAL_EMBEDDEDFILE:
            raise PDFValueError(
                'unable to process PDF: reference for %r is not an EmbeddedFile'
                % (filename))
        file_name = os.path.splitext(fname)[0]
        path = os.path.join(extractdir, file_name + " " + filename)
        while os.path.exists(path):
            path = os.path.join(
                extractdir,
                file_name + " " + str(randint(1, 100)) + " " + filename)
            # Written through ``sys.stderr.write`` so the message is emitted
            # the same way on Python 2 and Python 3.
            sys.stderr.write("file exists, create random name %s\n" % path)
        # ``open`` replaces the Python-2-only ``file`` builtin; the context
        # manager closes the output even if writing fails.
        with open(path, 'wb') as out:
            out.write(fileobj.get_data())
        return

    # The input file used to stay open forever; ``with`` closes it now.
    with open(os.path.join(emailsDir, fname), 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser, password)
        for xref in doc.xrefs:
            # Exact type comparison on purpose: the original author wanted
            # to ignore the fallback xref class.  ``isinstance`` would also
            # accept subclasses of PDFXRef, so do not switch to it without
            # checking the class hierarchy.
            if type(xref) == PDFXRef:
                for objid in xref.get_objids():
                    obj = doc.getobj(objid)
                    if isinstance(obj,
                                  dict) and obj.get('Type') is LITERAL_FILESPEC:
                        extract1(obj)
    return
Beispiel #14
0
def other_causes(_row_array, filename, delimiter, num_causes):
    """Build a per-state "causes of death" table from ``<filename>.pdf``.

    The PDF is converted to text (also saved through ``writeblock`` with a
    ``.txt`` suffix).  For every row of ``_row_array`` except those equal to
    the first (header) row, the state name is searched in that text and the
    following lines are read as alternating "cause name" / "cause value"
    lines.  The collected lines are handed to ``writeblock`` as
    ``us_cause_of_death_2015`` with a ``.csv`` suffix.

    Parameters:
        _row_array: sequence of row strings; the first one is a header.
                    Field 1 (comma separated) is read as the state name and
                    field 8 (``delimiter`` separated) as the population
                    (field numbering as understood by ``parseop`` --
                    confirm).
        filename:   path of the source PDF without the ``.pdf`` suffix.
        delimiter:  column separator used for the output lines.
        num_causes: number of causes per state; clamped to 1..5.

    Returns:
        None.
    """
    if num_causes < 1:
        num_causes = 1
    elif num_causes > 5:
        num_causes = 5

    # Header line: the first cause is fixed, further causes are appended.
    header = "State, Cause 1, Cause 1 Value, Cause 1 Value (Per Capita)"
    for number in range(2, num_causes + 1):
        header = header + delimiter + "Cause " + str(number) \
                        + delimiter + "Cause Value " + str(number) \
                        + delimiter + "Cause Value " + str(number) + " (Per Capita)"
    causes_table = [header + '\r']

    output_string = StringIO()
    print("Processing other causes of death (2015)...")
    with open(filename + '.pdf', 'rb') as in_file:
        parser = PDFParser(in_file)
        doc = PDFDocument(parser)
        rsrcmgr = PDFResourceManager()
        device = TextConverter(rsrcmgr,
                               output_string,
                               laparams=LAParams(char_margin=20))
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.create_pages(doc):
            interpreter.process_page(page)

    text = str(output_string.getvalue())
    writeblock(filename, text, '', '.txt')

    def skip_line(chunk):
        # Everything after the first newline, or -1 if there is no newline.
        newline = chunk.find('\n')
        return -1 if newline == -1 else chunk[newline + 1:]

    def isolate_line(chunk):
        # Everything before the first newline, or -1 if there is no newline.
        newline = chunk.find('\n')
        return -1 if newline == -1 else chunk[:newline]

    # Run of non-digit characters that is followed by a space.  Raw string:
    # ``\D`` in a normal string literal is an invalid escape sequence.
    name_pattern = re.compile(r"\D+(?= )")

    for row in _row_array:
        # Skip the header row (and any row identical to it).
        if row == _row_array[0]:
            continue

        state = parseop(row, ',', 1, 0, parse.RETRIEVE)
        pop = float(parseop(row, delimiter, 8, 0, parse.RETRIEVE))
        search = re.compile(state).search(text)
        if search is None:
            continue
        placeholder_string = text[search.end():]

        # The first cause starts a fixed number of lines after the state
        # name; Maryland needs 8 lines skipped instead of 4.
        lines = 8 if state == "Maryland" else 4

        for _ in range(lines):
            return_value = skip_line(placeholder_string)
            if return_value == -1:
                exit(-1)
            placeholder_string = return_value

        causes_line = state

        for _ in range(num_causes):
            # Cause label: the current line, reduced by the name pattern
            # when it matches (otherwise the whole line is kept).
            entry_name = isolate_line(placeholder_string)
            search = name_pattern.search(entry_name)
            if search is None:
                print("Name not found..")
            else:
                entry_name = search.group()[1:]  # remove prefix space

            # Cause value: the text of the next line up to the first space,
            # with all thousands separators removed (``replace`` already
            # removes every comma).
            placeholder_string = skip_line(placeholder_string)
            entry_value = placeholder_string[:placeholder_string.find(' ')]
            entry_value = entry_value.replace(',', '')

            # Value relative to the population, times 100; -1 when the
            # population is not positive.
            if pop > 0:
                entry_value_per_capita = float(entry_value) / pop * 100
            else:
                entry_value_per_capita = -1

            placeholder_string = skip_line(placeholder_string)

            causes_line = causes_line + delimiter + entry_name + delimiter + entry_value \
                                  + delimiter + str(entry_value_per_capita)

        causes_table.append(causes_line + '\r')

    writeblock("us_cause_of_death_2015", causes_table, '', '.csv')
Beispiel #15
0
if __name__ == '__main__':
    # Proof of concept: open the PDF named on the command line, (re)create
    # the directory "<file_name>.pdfminer_out" and decode every stream
    # object of the document.
    #
    # Documents: https://buildmedia.readthedocs.org/media/pdf/pdfminer-docs/latest/pdfminer-docs.pdf
    #
    tic = time()
    sys.path.append('../')
    from utils import virtual_environment
    parser = argparse.ArgumentParser(prog='pdfminer.poc')
    parser.add_argument('file_name', type=str)
    cmd_args = virtual_environment(parser)
    # The pdfminer3 imports come after virtual_environment() on purpose;
    # keep this order (presumably that call makes the package importable --
    # confirm).
    from pdfminer3.pdfdocument import PDFDocument
    from pdfminer3.pdftypes import PDFObjectNotFound
    from pdfminer3.pdfparser import PDFParser, PDFStream
    print(cmd_args.file_name)
    input_file = open(cmd_args.file_name, "rb")
    parsed = PDFDocument(PDFParser(input_file))

    # Start from an empty output directory.
    out_dir = '%s.pdfminer_out' % cmd_args.file_name
    try:
        shutil.rmtree(out_dir)
    except FileNotFoundError:
        pass
    os.mkdir(out_dir)

    # Collect the object ids of all xref tables, without duplicates.
    all_obj_ids = {obj_id
                   for xref in parsed.xrefs
                   for obj_id in xref.get_objids()}
    for obj_id in all_obj_ids:
        try:
            obj = parsed.getobj(obj_id)
        except PDFObjectNotFound:
            continue
        # Only stream objects are printed and decoded.
        if isinstance(obj, PDFStream):
            print('%s' % obj)
            obj.decode()
Beispiel #16
0
    def extract_pdf(self):
        """Fill ``self.properties`` with metadata and a text preview of the PDF.

        Reads ``self.file``, takes author, creator, producer and the
        creation/modification dates from the first info dictionary of the
        document, and adds the page count and the first 300 characters of
        the converted text.  A metadata field that is missing or cannot be
        converted is left as ``None``.

        Returns:
            ``self.properties``.

        Raises:
            AssertionError: if ``self.extension`` is not ``'pdf'``.
        """
        # Explicit raise instead of ``assert`` so the check is not stripped
        # when Python runs with -O; the exception type is unchanged.
        if self.extension not in ['pdf']:
            raise AssertionError(
                'extract_pdf called for extension %r' % (self.extension,))

        self.content = self.file.read()
        parser = PDFParser(self.file)
        doc = PDFDocument(parser)

        info = doc.info[0]
        for prop_key in ('auteur', 'creation_date', 'modification_date',
                         'creator', 'producer'):
            self.properties[prop_key] = None

        def as_date(raw):
            return str(self.convertPdfDatetime(raw))

        # (key in the info dictionary, key in self.properties, converter)
        fields = (
            ('CreationDate', 'creation_date', as_date),
            ('ModDate', 'modification_date', as_date),
            ('Author', 'auteur', lambda raw: raw.decode("utf-8")),
            ('Creator', 'creator', lambda raw: raw.decode("utf-16")),
            ('Producer', 'producer', lambda raw: raw.decode("utf-16")),
        )
        for pdf_key, prop_key, convert in fields:
            if pdf_key not in info:
                continue
            # Indirect references are resolved and stored back in place.
            if isinstance(info[pdf_key], PDFObjRef):
                info[pdf_key] = resolve1(info[pdf_key])
            try:
                self.properties[prop_key] = convert(info[pdf_key])
            except Exception:
                # Best effort: a value that cannot be converted leaves the
                # property at None.  ``Exception`` (instead of a bare
                # except) lets KeyboardInterrupt/SystemExit through.
                pass

        parser.set_document(doc)
        pages = resolve1(doc.catalog['Pages'])
        pages_count = pages.get('Count', 0)

        # Only the first 300 characters for clarity
        self.content = self.convert_pdf_to_txt()
        self.properties['content'] = self.content[:300] + '(...)'
        self.properties['page_count'] = pages_count

        return self.properties