コード例 #1
0
ファイル: pdftables.py プロジェクト: metador/tables_from_pdf
def get_tables(fh):
    """
    Build one Table object for every page of the PDF that contains tables.

    :param fh: file handle of the PDF, handed to initialize_pdf_miner.
    :return: list of Table objects in page order; pages for which
        page_contains_tables is false contribute nothing.
    """
    doc, interpreter, device = initialize_pdf_miner(fh)
    # Every Table receives the total page count, so the pages are walked
    # once up front purely to count them.
    total_pages = len(list(PDFPage.create_pages(doc)))

    tables = []
    for index, pdf_page in enumerate(PDFPage.create_pages(doc)):
        if page_contains_tables(pdf_page, interpreter, device):
            # Run the interpreter so the device holds this page's layout.
            interpreter.process_page(pdf_page)
            layout = device.get_result()

            table, _ = page_to_tables(
                layout, extend_y=False, hints=[], atomise=True)
            crop_table(table)
            # index is 0-based; index + 1 is what gets stored on the Table.
            tables.append(Table(table, index + 1, total_pages, 1, 1))
    return tables
コード例 #2
0
ファイル: ScriptParse.py プロジェクト: mikev37/DeepSpheres
def GetScript(filename):
    global scriptName
    ResetGlobals()
    scriptName = filename
    password = ""
    # Open a PDF file.
    fp = open(filename, 'rb')
    # Create a PDF parser object associated with the file object.
    parser = PDFParser(fp)
    # Create a PDF document object that stores the document structure.
    # Supply the password for initialization.
    document = PDFDocument(parser, password)
    # Check if the document allows text extraction. If not, abort.
    if not document.is_extractable:
        print "---Not translatable---"
        return
        #raise PDFTextExtractionNotAllowed
    # Create a PDF resource manager object that stores shared resources.
    rsrcmgr = PDFResourceManager()
    # Create a PDF device object.
    device = PDFDevice(rsrcmgr)
    # Create a PDF interpreter object.
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    # Process each page contained in the document.
    for page in PDFPage.create_pages(document):
        interpreter.process_page(page)
    
    # Set parameters for analysis.
    laparams = LAParams()
    laparams.boxes_flow = 2
    # Create a PDF page aggregator object.
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    for pgnum,page in enumerate(PDFPage.create_pages(document)):
        if pgnum == 0:
            continue
        interpreter.process_page(page)
        # receive the LTPage object for the page.
        layout = device.get_result()
        text = []
        for page in layout:
            try:
                if page.get_text().strip():
                    text.append(TextBlock(page.x0,page.y1,page.get_text().strip()))
            except:
                temp=5  
            print ".",
        text.sort(key = lambda row:(-row.y))
        # Parse all of the "line" objects in each page
        for line in text:
            ParseLine(line.text, line.x)
コード例 #3
0
def calculate_locations(filename,keywords):
    locations = []
    fp = open(filename, 'rb')
    parser = PDFParser(fp)
    # Create a PDF document object that stores the document structure.
    # Supply the password for initialization.
    document = PDFDocument(parser)
    # Check if the document allows text extraction. If not, abort.
    if not document.is_extractable:
        raise PDFTextExtractionNotAllowed
    # Create a PDF resource manager object that stores shared resources.
    rsrcmgr = PDFResourceManager()
    # Create a PDF device object.
    device = PDFDevice(rsrcmgr)
    # Create a PDF interpreter object.
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    # Process each page contained in the document.
    for page in PDFPage.create_pages(document):
        interpreter.process_page(page)    
    #Set parameters for analysis.
    laparams = LAParams()
    # Create a PDF page aggregator object.
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    pages = PDFPage.create_pages(document)
    pagenum = 0
    reader = PdfFileReader(file(filename,"rb"))
    for page in pages:
        interpreter.process_page(page)
        # receive the LTPage object for the page.
        layout = device.get_result()    
        page = reader.getPage(pagenum)
        x = page.trimBox[0].as_numeric()
        y = page.trimBox[1].as_numeric()
        #Handling special case
        if  (x > 0 and y < 0):
                x = 0
#         print "At page = %s  X  = %s , y = %s"%(pagenum,x,y)
        for keyword in keywords:    
            print '********************************'
            co_ordinates = get_location(keyword,layout,x,y)
            print'Keyword %s , location %s'%(keyword,co_ordinates)
            print '********************************'
            if co_ordinates != None :
                for location in co_ordinates:
                    print "PageNum-->%s"%pagenum
                    l = LocationKeeper(keyword,location,pagenum)
                    locations.append(l)
        pagenum+=1
    return locations
コード例 #4
0
def dwn_pdf_txt(url):
	""" Download the PDF at ``url`` and return its text.

	The response body is parsed from memory and every page is rendered
	through a TextConverter writing utf-8 into a StringIO buffer.

	Returns the accumulated buffer contents; for a document without pages
	that is the empty string. The text is returned as produced and still
	needs cleaning up by the caller.
	"""

	r = requests.get(url)

	memory_file = StringIO(r.content)

	# Create a PDF parser object associated with the StringIO object
	parser = PDFParser(memory_file)

	# Create a PDF document object that stores the document structure
	document = PDFDocument(parser)

	# Define parameters to the PDF device object
	rsrcmgr = PDFResourceManager()
	retstr = StringIO()
	laparams = LAParams()
	codec = 'utf-8'

	# Create a PDF device object
	device = TextConverter(rsrcmgr, retstr, codec = codec, laparams = laparams)

	# Create a PDF interpreter object
	interpreter = PDFPageInterpreter(rsrcmgr, device)

	# Process each page contained in the document
	for page in PDFPage.create_pages(document):
		interpreter.process_page(page)

	# Read the buffer once, after the loop: the result is then defined even
	# when the document has no pages, and the buffer is not copied per page.
	return retstr.getvalue()
コード例 #5
0
def setup(path):
	"""Print the dialogue text found in the PDF at ``path``.

	Every page except the first is laid out, and each LTTextBox / LTTextLine
	whose left edge (bbox[0]) lies strictly between DIALOGUE_BBOX_MIN and
	DIALOGUE_BBOX_MAX is NFKD-normalised, reduced to ASCII and printed.

	Raises PDFTextExtractionNotAllowed if the document forbids extraction.
	"""
	# The context manager closes the file on every exit path.
	with open(path, 'rb') as fp:
		# Create a PDF parser object associated with the file object.
		parser = PDFParser(fp)
		# Create a PDF document object that stores the document structure.
		document = PDFDocument(parser)
		# Check if the document allows text extraction. If not, abort.
		if not document.is_extractable:
			raise PDFTextExtractionNotAllowed
		# NOTE(review): rsrcmgr and laparams are not defined in this
		# function; presumably module-level objects - confirm.
		device = PDFPageAggregator(rsrcmgr, laparams=laparams)
		# Create a PDF interpreter object.
		interpreter = PDFPageInterpreter(rsrcmgr, device)

		for i, page in enumerate(PDFPage.create_pages(document)):
			# skip the title page
			if i == 0:
				continue
			interpreter.process_page(page)
			layout = device.get_result()
			for obj in layout:
				# we only want to bother with LTTextBox and LTTextLine
				if not isinstance(obj, (LTTextBox, LTTextLine)):
					continue
				# only extract text segments within a certain margin range
				if DIALOGUE_BBOX_MIN < obj.bbox[0] < DIALOGUE_BBOX_MAX:
					# need to convert unicode characters
					converted = unicodedata.normalize('NFKD', obj.get_text()).encode('ascii', 'ignore')
					print(converted)
コード例 #6
0
ファイル: pdf_miner.py プロジェクト: dcclogin/TextGenerator
def pdf_to_txt(in_file):
	""" Append the text of a PDF file to a sibling TXT file (roughly processed).

	The output path is ``in_file`` with its last three characters replaced
	by 'txt'. The utf-8 text of every LTTextBoxHorizontal is appended to it,
	each followed by a newline. The output is opened in append mode, so
	existing content is kept.

	Raises PDFTextExtractionNotAllowed if the document forbids extraction.
	Always returns None.
	"""
	# Loop-invariant, so computed once.
	out_file = in_file[:-3] + 'txt'
	# The context manager closes the PDF on every exit path.
	with open(in_file, 'rb') as fp:
		# Create a PDF parser object associated with the file object.
		parser = PDFParser(fp)
		# Create a PDF document object that stores the document structure.
		document = PDFDocument(parser)
		# Check if the document allows text extraction. If not, abort.
		if not document.is_extractable:
			raise PDFTextExtractionNotAllowed
		# Set parameters for analysis.
		laparams = LAParams()
		# Create a PDF resource manager object that stores shared resources.
		rsrcmgr = PDFResourceManager()
		# Create a PDF page aggregator object.
		device = PDFPageAggregator(rsrcmgr, laparams=laparams)
		# Create a PDF interpreter object.
		interpreter = PDFPageInterpreter(rsrcmgr, device)
		for page in PDFPage.create_pages(document):
			interpreter.process_page(page)
			# Receive the LTPage object for the page.
			layout = device.get_result()
			chunks = [
				klass.get_text().encode('utf-8') + '\n'
				for klass in layout
				if isinstance(klass, LTTextBoxHorizontal)
			]
			# Open the output once per page instead of once per text box,
			# and only when there is something to write, so no empty file
			# is created.
			if chunks:
				with open(out_file, 'a') as dst_file:
					dst_file.writelines(chunks)
	return None
コード例 #7
0
def readPdf(file):
    """
    Return the layout objects of at most the first two pages of a PDF.

    :param file: path of the PDF file.
    :return: list of the objects returned by the aggregator's get_result(),
        one per processed page.
    :raises PDFTextExtractionNotAllowed: if the document forbids extraction.
    """
    # The context manager closes the file on every exit path; the layouts
    # are fully built before the block ends.
    with open(file, 'rb') as fp:
        # Create a PDF parser object associated with the file object.
        parser = PDFParser(fp)

        # Create a PDF document object that stores the document structure.
        document = PDFDocument(parser)

        # Check if the document allows text extraction. If not, abort.
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed

        # Create a PDF resource manager object that stores shared resources.
        rsrcmgr = PDFResourceManager()

        # Set parameters for analysis.
        laparams = LAParams(line_margin=0.1)

        # Create a PDF page aggregator object.
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        pages = []
        # islice stops after two pages without materialising the rest.
        for page in islice(PDFPage.create_pages(document), 2):
            interpreter.process_page(page)
            # receive the LTPage object for the page.
            pages.append(device.get_result())

    return pages
コード例 #8
0
ファイル: pdftab.py プロジェクト: ren-hoek/Pdf-Extract
def convert_pdf_table(pdf_file):
    """
    Convert the tables of a PDF into a flat list of row dictionaries.

    Each page is laid out and passed to tabulate_page. The first row of the
    result is used as the header; every following row becomes a dict mapping
    the lower-cased header cell to the row's cell, with empty-string cells
    left out.

    :param pdf_file: path of the PDF file.
    :return: list of dicts, one per non-header row across all pages.
    :raises PDFTextExtractionNotAllowed: if the document forbids extraction.
    """
    table = []
    # A separate name for the handle keeps the path parameter intact, and
    # the context manager closes the file on every exit path.
    with open(pdf_file, 'rb') as pdf_handle:
        parser = PDFParser(pdf_handle)
        document = PDFDocument(parser)

        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()

        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)

        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            layout = device.get_result()
            page_table = tabulate_page(layout)
            # A page that yields no rows has no header to index; skip it
            # instead of failing on page_table[0].
            if len(page_table) == 0:
                continue
            header = page_table[0]
            for row in page_table[1:]:
                table.append({
                    header[item].lower(): detail
                    for item, detail in enumerate(row)
                    if detail != ''
                })

    return table
コード例 #9
0
ファイル: pdfparser1.py プロジェクト: jijoy/pdfparser
def parsepdf(filename):
    fp = open(filename, 'rb')
    parser = PDFParser(fp)
    # Create a PDF document object that stores the document structure.
    # Supply the password for initialization.
    document = PDFDocument(parser)
    # Check if the document allows text extraction. If not, abort.
    if not document.is_extractable:
        raise PDFTextExtractionNotAllowed
    # Create a PDF resource manager object that stores shared resources.
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    # Create a PDF device object.
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    # Create a PDF interpreter object.
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    # Process each page contained in the document.
    found_randers = False
    found_aarhus = False
    _randers = []
    headings = [u'Ledige lejligheder\n',u'afd. adresse\n',u'rum m2\n',u'leje \n',
                u'a\xb4c varme a\xb4c vand\n',u'indskud\n',u'ledig pr.\n',u'bem\xe6rkning\n'
                ]
    location_map = OrderedDict()
    header_ycord = []
    for page in PDFPage.create_pages(document):
        interpreter.process_page(page)
        layout = device.get_result()

        for obj in layout._objs:
            # print obj
            if isinstance(obj,LTTextBoxHorizontal):
                for o in obj._objs:
                    y0 = o.y0
                    # print o
                    if isinstance(o,LTTextLineHorizontal) and obj.get_text() not in headings:

                        if y0 not in header_ycord:
                            if y0 in location_map :
                                objs = location_map.get(y0)
                            else:
                                objs = []
                            string_val = o.get_text().encode('ascii', 'ignore')
                            string_val = string_val.replace('\n','')
                            objs.append(string_val)
                            location_map.__setitem__(y0,objs)
                    else :
                        if y0 not in header_ycord:
                            header_ycord.append(y0)





    for key in location_map:
        print '**************************'
    #     # print key
        print location_map.get(key)
        print '**************************'
    print 'Total Rowss = %s'%len(location_map)
コード例 #10
0
ファイル: dumppdf.py プロジェクト: coolioxlr/ziply
def dumppdf(outfp, fname, objids, pagenos, password='',
            dumpall=False, codec=None, extractdir=None):
    """
    Dump parts of the PDF ``fname`` as XML to ``outfp``.

    :param outfp: writable file-like object receiving the output.
    :param fname: path of the PDF file.
    :param objids: object ids to dump via dumpxml (may be empty).
    :param pagenos: 0-based page indices to dump (may be empty). With a
        codec the page content streams are dumped, otherwise page.attrs.
    :param password: password passed to doc.initialize.
    :param dumpall: dump every object via dumpallobjs.
    :param codec: codec forwarded to the dump helpers; a final newline is
        written unless it is 'raw' or 'binary'.
    :param extractdir: accepted but not used by this function.

    When none of objids, pagenos and dumpall is set, the trailers are dumped.
    """
    # The context manager closes the PDF even if a dump helper raises.
    with open(fname, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser)
        doc.initialize(password)
        if objids:
            for objid in objids:
                obj = doc.getobj(objid)
                dumpxml(outfp, obj, codec=codec)
        if pagenos:
            for (pageno,page) in enumerate(PDFPage.create_pages(doc)):
                if pageno not in pagenos:
                    continue
                if codec:
                    for obj in page.contents:
                        obj = stream_value(obj)
                        dumpxml(outfp, obj, codec=codec)
                else:
                    dumpxml(outfp, page.attrs)
        if dumpall:
            dumpallobjs(outfp, doc, codec=codec)
        if (not objids) and (not pagenos) and (not dumpall):
            dumptrailers(outfp, doc)
    if codec not in ('raw','binary'):
        outfp.write('\n')
    return
コード例 #11
0
ファイル: pdftk.py プロジェクト: alentrekin/docassemble
def read_fields(pdffile):
    """
    Read the AcroForm fields of a PDF.

    :param pdffile: path of the PDF file.
    :return: list of (name, default, pageno, rect, field_type) tuples, one
        per entry of the form's 'Fields' array. ``pageno`` is the 1-based
        page number, or 1 when the field has no 'P' entry. ``default`` is
        "Yes"/"No" for '/Btn' fields, '${ user.signature }' for '/Sig'
        fields, and otherwise the field's value or word("something") when
        it has none.
    :raises KeyError: if the document catalog has no 'AcroForm' entry.
    """
    outfields = []
    # Field objects are resolved inside the block, before the context
    # manager closes the file.
    with open(pdffile, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser)
        # Map each page's object id to its 1-based page number.
        id_to_page = {
            page.pageid: number
            for number, page in enumerate(PDFPage.create_pages(doc), 1)
        }
        fields = resolve1(doc.catalog['AcroForm'])['Fields']
        for i in fields:
            field = resolve1(i)
            name = field.get('T')
            value = field.get('V')
            rect = field.get('Rect')
            page = field.get('P')
            field_type = field.get('FT')
            logmessage("name is " + str(name) + " and FT is |" + str(field_type) + "|")
            pageno = id_to_page[page.objid] if page is not None else 1
            type_name = str(field_type)
            if type_name == '/Btn':
                default = "Yes" if value == '/Yes' else "No"
            elif type_name == '/Sig':
                default = '${ user.signature }'
            elif value is not None:
                default = value
            else:
                default = word("something")
            outfields.append((name, default, pageno, rect, field_type))
    return outfields
コード例 #12
0
def pdf_to_text(page_object):
    """
    Collect the text of every text box and text line in a PDF.

    :param page_object: file-like object handed to PDFParser.
    :return: list with one get_text() string per LTTextBox / LTTextLine,
        in page and layout order.
    """
    parser = PDFParser(page_object)
    # Create the document object and wire it to the parser.
    doc = PDFDocument(parser)
    parser.set_document(doc)
    doc.initialize('')
    # Shared resources, page aggregator and interpreter.
    rsrcmgr = PDFResourceManager()
    device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    text_content = []
    for page in PDFPage.create_pages(doc):
        interpreter.process_page(page)
        # receive the LTPage object for the page.
        layout = device.get_result()
        text_content.extend(
            element.get_text()
            for element in layout
            if isinstance(element, (LTTextBox, LTTextLine))
        )
    return text_content
コード例 #13
0
def dumppdf(fname, objids, pagenos, password='',
            dumpall=False, codec=None, extractdir=None):
    """
    Dump parts of the PDF ``fname`` as XML and return the result as a string.

    :param fname: path of the PDF file.
    :param objids: object ids to dump via dumpxml (may be empty).
    :param pagenos: 0-based page indices to dump (may be empty). With a
        codec the page content streams are dumped, otherwise page.attrs.
    :param password: password passed to PDFDocument.
    :param dumpall: dump every object via dumpallobjs.
    :param codec: codec forwarded to the dump helpers; a final newline is
        appended unless it is 'raw' or 'binary'.
    :param extractdir: accepted but not used by this function.
    :return: concatenation of everything the dump helpers returned.

    When none of objids, pagenos and dumpall is set, the trailers are dumped.
    """
    # Pieces are collected and joined once instead of growing a string.
    parts = []
    # The context manager closes the PDF even if a dump helper raises.
    with open(fname, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser, password)
        if objids:
            for objid in objids:
                parts.append(dumpxml(doc.getobj(objid), codec=codec))
        if pagenos:
            for (pageno,page) in enumerate(PDFPage.create_pages(doc)):
                if pageno not in pagenos:
                    continue
                if codec:
                    for obj in page.contents:
                        parts.append(dumpxml(stream_value(obj), codec=codec))
                else:
                    parts.append(dumpxml(page.attrs))
        if dumpall:
            parts.append(dumpallobjs(doc, codec=codec))
        if (not objids) and (not pagenos) and (not dumpall):
            parts.append(dumptrailers(doc))
    if codec not in ('raw','binary'):
        parts.append('\n')
    return "".join(parts)
コード例 #14
0
ファイル: classes.py プロジェクト: StumpyFrostreaver/slate
    def __init__(self, file, password='', just_text=1, check_extractable=True, char_margin=1.0, line_margin=0.1, word_margin=0.1):
        """
        Parse ``file`` and append the result of processing each page to self.

        :param file: file object handed to PDFParser.
        :param password: password used to initialise the document.
        :param just_text: when truthy, self._cleanup() is called at the end.
        :param check_extractable: when truthy, pages are processed only if
            the document reports is_extractable.
        :param char_margin: forwarded to LAParams.
        :param line_margin: forwarded to LAParams.
        :param word_margin: forwarded to LAParams.
        """
        self.parser = PDFParser(file)
        self.laparams = LAParams(
            char_margin=char_margin,
            line_margin=line_margin,
            word_margin=word_margin)

        # The two branches use different document set-up calls depending on
        # the PYTHON_3 flag.
        if not PYTHON_3:
            self.doc = PDFDocument(self.parser, password)
        else:
            self.doc = PDFDocument()
            self.parser.set_document(self.doc)
            self.doc.set_parser(self.parser)
            self.doc.initialize(password)

        # is_extractable is consulted only when the check was requested.
        extraction_blocked = check_extractable and not self.doc.is_extractable
        if not extraction_blocked:
            self.resmgr = PDFResourceManager()
            self.device = TextConverter(self.resmgr, outfp=StringIO(), laparams=self.laparams)
            self.interpreter = PDFPageInterpreter(self.resmgr, self.device)

            pages = self.doc.get_pages() if PYTHON_3 else PDFPage.create_pages(self.doc)
            for page in pages:
                self.append(self.interpreter.process_page(page))
            self.metadata = self.doc.info
        if just_text:
            self._cleanup()
コード例 #15
0
ファイル: drag.py プロジェクト: Gallaecio/hunspell-gl
    def generateFileContent(self):
        """
        Download the abbreviations PDF and build the dictionary text from it.

        The PDF is fetched into a temporary file and each page's layout
        objects (LTRect and LTCurve excluded) are grouped into lines. For
        every line containing a colon, the stripped text before the first
        colon, with ".." replaced by ".", becomes an entry. The return value
        is a three-line header followed by whatever
        formatEntriesForDictionary yields for the collected entries.
        """

        import tempfile
        import urllib

        abbreviationsPdfUrl = u"http://www.realacademiagalega.org/c/document_library/get_file?uuid=f29e6ce1-9ac5-42e3-8c15-73c4b9b5f48b&groupId=10157"

        entries = set()
        # Layout parameters do not change between pages, so build them once.
        params = LAParams()
        # Both context managers release their file on every exit path.
        with tempfile.NamedTemporaryFile() as temporaryFile:
            urllib.urlretrieve(abbreviationsPdfUrl, temporaryFile.name)
            with open(temporaryFile.name, "rb") as fileObject:
                parser = PDFParser(fileObject)
                document = PDFDocument(parser)
                resourceManager = PDFResourceManager()
                device = PDFPageAggregator(resourceManager)
                interpreter = PDFPageInterpreter(resourceManager, device)
                for page in PDFPage.create_pages(document):
                    interpreter.process_page(page)
                    layout = device.get_result()
                    objects = [item for item in layout if not isinstance(item, (LTRect, LTCurve))]
                    for line in layout.group_objects(params, objects):
                        text = line.get_text()
                        if u":" in text:
                            entry = text.split(u":")[0]
                            entry = entry.strip()
                            entry = entry.replace(u"..", ".")
                            entries.add(entry)

        dictionary  = u"# Abreviaturas empregadas no Dicionario da Real Academia Galega\n"
        dictionary += u"# http://www.realacademiagalega.org/abreviaturas\n"
        dictionary += u"\n"
        dictionary += u"".join(formatEntriesForDictionary(entries, u"abreviatura"))
        return dictionary
コード例 #16
0
ファイル: pdf2epub.py プロジェクト: seignovert/pdf2epub
def main(argv):
    """
    Convert the PDF named on the command line to 'test.xhtml'.

    Every page is rendered through an XHTMLConverter writing utf-8 to the
    output file.

    :param argv: accepted for the entry-point signature.
    """

    # NOTE(review): the input path is read from sys.argv, not from the argv
    # parameter - confirm against the caller before changing.
    infile  = sys.argv[1]
    outfile = 'test.xhtml'

    password    = ''
    codec       = 'utf-8'
    caching     = True

    # The context manager closes both files even if conversion fails.
    with open(infile, 'rb') as fp, open(outfile, 'w') as outfp:
        parser   = PDFParser(fp)
        document = PDFDocument(parser, password=password, caching=caching)
        rsrcmgr  = PDFResourceManager(caching=caching)
        device   = XHTMLConverter(rsrcmgr, outfp, codec=codec, laparams=LAArticle(), document=document)

        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)

        # Close the device while outfp is still open, as the original
        # ordering did.
        device.close()
    return
コード例 #17
0
ファイル: parsepdf.py プロジェクト: kug3lblitz/Sir-Worm
def getPDFText(path):

	'''
	Takes in PDF files and converts to Python human-readable Python data.

	Input:
	path -> full path to PDF file

	Output:
	String representation of parsed data
	'''
	
	retstr = StringIO()
	parser = PDFParser(open(path, 'r'))

	try:
		document = PDFDocument(parser)

	except Exception:
		print path + 'is not a readable pdf'
		return ''

	if document.is_extractable:
		rsrcmgr = PDFResourceManager()
		device = TextConverter(rsrcmgr, retstr, codec = 'ascii', laparams = LAParams())
		interpreter = PDFPageInterpreter(rsrcmgr, device)
		for page in PDFPage.create_pages(document):
			interpreter.process_page(page)
		return retstr.getvalue()        

	else:
		print path, "Warning: could not extract text from PDF file."
		return ''
コード例 #18
0
def analyze_pages(file_name):
    '''
    Input: the file path to the PDF file
    Output: yields the layout object for each page in the PDF
    '''
    # Open a PDF file.
    with open(os.path.realpath(file_name), 'rb') as fp:
        # Create a PDF parser object associated with the file object.
        parser = PDFParser(fp)
        # Create a PDF document object that stores the document structure.
        # Supply the password for initialization.
        document = PDFDocument(parser, password = '')
        # Create a PDF resource manager object that stores shared resources.
        rsrcmgr = PDFResourceManager()
        # Set parameters for analysis.
        laparams = LAParams(char_margin = 2.0, word_margin = 0.1, detect_vertical = True)
        # Create a PDF page aggregator object.
        device = CustomPDFPageAggregator(rsrcmgr, laparams = laparams)
        # Create a PDF interpreter object.
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Process each page contained in the document.
        for page_num, page in enumerate(PDFPage.create_pages(document)):
            try:
                interpreter.process_page(page)
            except OverflowError as oe:
                print oe, ', skipping page', page_num, 'of', file_name
                traceback.print_exc()
                continue
            layout = device.get_result()
            yield layout
コード例 #19
0
ファイル: Layout.py プロジェクト: cmthompson/weiss
def main():
    # Open a PDF file.
    with open('/home/chris/Documents/Literature/DFT Primer.pdf', 'rb') as fp:
        # Create a PDF parser object associated with the file object.
        parser = PDFParser(fp)
        # Create a PDF document object that stores the document structure.
        # Supply the password for initialization.
        document = PDFDocument(parser)
        # Check if the document allows text extraction. If not, abort.
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        # Create a PDF resource manager object that stores shared resources.
        rsrcmgr = PDFResourceManager()
        print rsrcmgr
        # Create a PDF device object.
        device = PDFDevice(rsrcmgr)
        # Create a PDF interpreter object.
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Process each page contained in the document.
        for page in PDFPage.create_pages(document):
            print interpreter.process_page(page)
        outlines = document.get_outlines()
        for (level,title,dest,a,se) in outlines:
            print (level, title)
    return 0
コード例 #20
0
ファイル: utils.py プロジェクト: 10clouds/edx-platform
def parse_pages(pdf_buffer, password):
    """
    Parse every page of a PDF buffer and return the per-page results.

    :param pdf_buffer: file-like object holding the PDF.
    :param password: password used to initialise the document.
    :return: list with one parse_lt_objects result per page, in page order.
    """
    document = PDFDocument(PDFParser(pdf_buffer), password)

    resource_manager = PDFResourceManager()
    device = PDFPageAggregator(resource_manager, laparams=LAParams())
    interpreter = PDFPageInterpreter(resource_manager, device)

    def _page_text(page):
        # Lay out the page, then parse the children of the resulting LTPage
        # (LTTextBox, LTFigure, LTImage, etc.).
        interpreter.process_page(page)
        layout = device.get_result()
        return parse_lt_objects(layout._objs)  # pylint: disable=protected-access

    return [_page_text(page) for page in PDFPage.create_pages(document)]
コード例 #21
0
ファイル: pdf.py プロジェクト: 01-/extractors
def extract_pdf(path, languages=None):
    """ Extract metadata and per-page text from a PDF file.

    The last entry of the document info is copied into the result, with keys
    lower-cased and stripped, values passed through safe_text, and the key
    'pages' skipped. Each page layout is then converted with
    _convert_page(layout, languages) and appended to result['pages'].

    NOTE(review): the previous docstring described a PyPDF2 pass with an OCR
    fallback; this function itself only uses the pdfminer-style objects
    below, so any OCR presumably lives in _convert_page - confirm.

    :param path: path of the PDF file.
    :param languages: forwarded unchanged to _convert_page.
    :return: dict with a 'pages' list plus the metadata keys. When the
        document is not extractable a warning is logged and 'pages' stays
        empty.
    """
    with open(path, 'rb') as fh:
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        # try/finally closes the device on the early return and on errors
        # too, not only on the success path.
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            parser = PDFParser(fh)
            doc = PDFDocument(parser, '')
            result = {'pages': []}
            if doc.info:
                for k, v in doc.info[-1].items():
                    k = k.lower().strip()
                    if k != 'pages':
                        result[k] = safe_text(v)

            if not doc.is_extractable:
                log.warning("PDF not extractable: %s", path)
                return result

            for page in PDFPage.create_pages(doc):
                interpreter.process_page(page)
                layout = device.get_result()
                text = _convert_page(layout, languages)
                result['pages'].append(text)
            return result
        finally:
            device.close()
コード例 #22
0
ファイル: extractpdf.py プロジェクト: flatnine/stock
def parsePDF(url):

    # Open the url provided as an argument to the function and read the content
    open = urllib2.urlopen(Request(url)).read()

    # Cast to StringIO object
    from StringIO import StringIO
    memory_file = StringIO(open)

    # Create a PDF parser object associated with the StringIO object
    parser = PDFParser(memory_file)

    # Create a PDF document object that stores the document structure
    document = PDFDocument(parser)

    # Define parameters to the PDF device objet 
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    laparams = LAParams()
    codec = 'utf-8'

    # Create a PDF device object
    device = TextConverter(rsrcmgr, retstr, codec = codec, laparams = laparams)

    # Create a PDF interpreter object
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # Process each page contained in the document
    for page in PDFPage.create_pages(document):
        interpreter.process_page(page)
        data =  retstr.getvalue()
        print type(data)
        sys.exit()
コード例 #23
0
ファイル: ENR_4_1_parser.py プロジェクト: jianhe25/ParsePDF
def parsePDF(pdf_file):

    pdf_file = open(pdf_file, "r").read()

    # Cast to StringIO object
    from StringIO import StringIO

    memory_file = StringIO(pdf_file)

    # Create a PDF parser object associated with the StringIO object
    parser = PDFParser(memory_file)

    # Create a PDF document object that stores the document structure
    document = PDFDocument(parser)

    # Define parameters to the PDF device objet
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    laparams = LAParams()
    codec = "utf-8"

    # Create a PDF device object
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)

    # Create a PDF interpreter object
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    # Process each page contained in the document
    for page in PDFPage.create_pages(document):
        interpreter.process_page(page)
        data = retstr.getvalue()
        print data
        break
コード例 #24
0
ファイル: Layout.py プロジェクト: cmthompson/weiss
def Layout():
    # Set parameters for analysis.
    with open('/home/chris/Documents/Literature/Donghun_ACSNano_2014', 'rb') as fp:
        # Create a PDF parser object associated with the file object.
        parser = PDFParser(fp)
        # Create a PDF document object that stores the document structure.
        # Supply the password for initialization.
        document = PDFDocument(parser)
        # Check if the document allows text extraction. If not, abort.
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        # Create a PDF resource manager object that stores shared resources.
        rsrcmgr = PDFResourceManager()
        print rsrcmgr
           
        laparams = LAParams()
        # Create a PDF page aggregator object.
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            # receive the LTPage object for the page.
            layout = device.get_result()
            
        return layout
コード例 #25
0
	def parse_page(self):
		"""Lay out the pages of self.document and parse their objects.

		For each page, the children of its layout are passed to
		self.parse_layout_objects together with the 1-based page number.
		When self.pages is truthy, only that many pages are parsed; otherwise
		every page is.
		"""
		# Create a PDF resource manager object that stores shared resources
		rsrcmgr = PDFResourceManager()

		# Set parameters for analysis.
		laparams = LAParams()

		# Create a PDF page aggregator object.
		device = PDFPageAggregator(rsrcmgr, laparams=laparams)

		# Create a PDF interpreter object.
		interpreter = PDFPageInterpreter(rsrcmgr, device)

		for i, page in enumerate(PDFPage.create_pages(self.document)):
			# Check the limit before doing any work, so the page just past
			# the limit is not laid out only to be thrown away.
			if self.pages and i == self.pages:
				break

			# read the page into a layout object
			interpreter.process_page(page)
			layout = device.get_result()
			self.parse_layout_objects(layout._objs, (i+1))
コード例 #26
0
ファイル: pdftables.py プロジェクト: ziweizhou/pdftables
def iter_tables(fh, x_comb=None, y_comb=None, hints=None):
    """
    Iterate over the tables in a document.  See get_tables for the non-iter version.

    Pages for which ``page_contains_tables`` is false are skipped.

    :param fh: file handle handed to ``initialize_pdf_miner``.
    :param x_comb: Specify x_comb and y_comb to override the automatic comb creation.
    :param y_comb: see ``x_comb``.
    :param hints: strings to search for to determine the y limits of the page;
        forwarded to ``page_to_tables``.  Defaults to an empty list.
    :return: generator yielding ``(Table, diag)`` pairs, one per page with tables.
    """
    # A fresh list per call: a shared ``[]`` default would leak state between
    # calls if anything downstream mutated it.
    if hints is None:
        hints = []

    doc, interpreter, device = initialize_pdf_miner(fh)

    for i, pdf_page in enumerate(PDFPage.create_pages(doc)):
        interpreter.process_page(pdf_page)
        # receive the LTPage object for the page.
        processed_page = device.get_result()
        if not page_contains_tables(processed_page, device):
            continue

        (table, diag) = page_to_tables(
            processed_page,
            extend_y=True,
            hints=hints,
            atomise=True,
            x_comb=x_comb,
            y_comb=y_comb)
        crop_table(table)
        # The total page count is not known while streaming, hence -1.
        yield Table(table, i + 1, -1, 1, 1), diag
コード例 #27
0
ファイル: lang_pdf.py プロジェクト: UAM-MOOCAndroid/sniffer
 def fix_text(self, filename):
     """Extract the text of the PDF at ``filename`` and return it.

     The document is opened with ``self.password`` when one is set, and the
     text is converted using ``self.codec``.

     :param filename: path of the PDF file to read.
     :return: the text written by the converter for all pages.
     :raises PDFTextExtractionNotAllowed: if the document forbids extraction.
     """
     pdfText = StringIO()
     # The context manager closes the file even when parsing raises.
     with open(filename, 'rb') as fp:
         parser = PDFParser(fp)
         # Supply the password for initialization only when one is configured.
         if not self.password:
             document = PDFDocument(parser)
         else:
             document = PDFDocument(parser, self.password)
         # Check if the document allows text extraction. If not, abort.
         if not document.is_extractable:
             raise PDFTextExtractionNotAllowed
         rsrcmgr = PDFResourceManager()
         device = TextConverter(rsrcmgr, pdfText, codec=self.codec,
                                laparams=LAParams(), imagewriter=None)
         interpreter = PDFPageInterpreter(rsrcmgr, device)
         # Process each page contained in the document.
         for page in PDFPage.create_pages(document):
             interpreter.process_page(page)
     return pdfText.getvalue()
コード例 #28
0
ファイル: parsepdf.py プロジェクト: cbbing/fund_spider
def parse_pdf(fname):
    """Return the text of every non-empty horizontal text box in a PDF.

    :param fname: path of the PDF file to read.
    :return: list of stripped text-box strings, in the order they are found.
    :raises PDFTextExtractionNotAllowed: if the document forbids extraction.
    """
    # The context manager closes the file even when parsing raises.
    with open(fname, 'rb') as fp:
        # Create a PDF parser for the file.
        parser = PDFParser(fp)
        # Create a PDF document object that stores the document structure.
        document = PDFDocument(parser)
        # Check whether the file allows text extraction.
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed

        # Resource manager for shared resources, plus layout analysis setup.
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        contents = []
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            # Receive the LTPage object for this page.
            layout = device.get_result()
            for x in layout:
                if isinstance(x, LTTextBoxHorizontal):
                    content = x.get_text().strip()
                    if content:
                        contents.append(content)
        return contents
コード例 #29
0
ファイル: pdf.py プロジェクト: CodeForAfrica/aleph
def extract_pdf(path, languages=None):
    """
    Extract content from a PDF file.

    Document metadata is copied into the result under lower-cased keys and
    every page is converted through ``_convert_page``. If the parser hits an
    unexpected end of file, whatever was collected so far is returned.

    :param path: path of the PDF file.
    :param languages: forwarded unchanged to ``_convert_page``.
    :return: dict with a ``"pages"`` list plus any metadata entries.
    """
    with open(path, "rb") as fh:
        result = {"pages": []}
        try:
            resource_manager = PDFResourceManager()
            device = PDFPageAggregator(resource_manager, laparams=LAParams())
            interpreter = PDFPageInterpreter(resource_manager, device)
            doc = PDFDocument(PDFParser(fh), "")

            if len(doc.info) > 0:
                for raw_key, raw_value in doc.info[-1].items():
                    key = raw_key.lower().strip()
                    value = string_value(raw_value)
                    # Skip the page-count entry, empty values and unresolved
                    # object references.
                    if key == "pages" or value is None or "<PDFObjRef:" in value:
                        continue
                    # NOTE(review): the value is converted a second time here,
                    # as in the original; this looks redundant - confirm.
                    result[key] = string_value(value)

            for page_no, page in enumerate(PDFPage.create_pages(doc), start=1):
                converted = _convert_page(
                    interpreter, page, device, page_no, path, languages)
                result["pages"].append(converted)
            device.close()
        except PSEOF as eof:
            log.info("Unexpected EOF: %r", eof)
        return result
コード例 #30
0
def parsePDF(filename):
    """Parse every page of a PDF with ``parsePage`` and collect the results.

    :param filename: path of the PDF file to read.
    :return: list with one ``parsePage`` result per page, in page order.
    :raises PDFTextExtractionNotAllowed: if the document forbids extraction.
    """
    # The context manager closes the file even when parsing raises.
    with open(filename, 'rb') as fp:
        # Create a PDF parser for the file.
        parser = PDFParser(fp)
        # Create a PDF document object that stores the document structure.
        document = PDFDocument(parser)
        # Check whether the file allows text extraction.
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed

        # Resource manager for shared resources, plus layout analysis setup.
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # Process each page.
        return [parsePage(page, interpreter, device, filename)
                for page in PDFPage.create_pages(document)]
コード例 #31
0
ファイル: pdfjinja.py プロジェクト: anphase/pdfjinja
    def parse_pdf(self, fp):
        """Interpret every page of the PDF in ``fp`` and parse its annotations.

        For each page that has annotations, ``self.parse_annotations`` is
        called with the 0-based page number and the page.

        :param fp: binary file object positioned at the start of a PDF.
        """
        parser = PDFParser(fp)
        doc = PDFDocument(parser)
        rsrcmgr = PDFResourceManager()
        # Only the plain device is used; the original also built a layout
        # aggregator and a first interpreter that were immediately replaced.
        device = PDFDevice(rsrcmgr)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for pgnum, page in enumerate(PDFPage.create_pages(doc)):
            interpreter.process_page(page)
            if page.annots:
                self.parse_annotations(pgnum, page)
コード例 #32
0
def dumpoutline(outfp, fname, objids, pagenos, password='',
                dumpall=False, codec=None, extractdir=None):
    """Write the outline entries of a PDF to ``outfp`` as XML.

    Each outline entry becomes an ``<outline>`` element carrying its level
    and title and, when they can be determined, a ``<dest>`` dump and the
    1-based ``<pageno>``. A document without outlines produces no output.

    :param outfp: writable text stream receiving the XML.
    :param fname: path of the PDF file.
    :param password: password handed to ``PDFDocument``.
    :param objids, pagenos, dumpall, codec, extractdir: not used by this
        function; kept so the signature stays compatible with its callers.
    """
    # The parser and the file are released even when a lookup below raises.
    with open(fname, 'rb') as fp:
        parser = PDFParser(fp)
        try:
            doc = PDFDocument(parser, password)
            # Map page object ids to 1-based page numbers.
            pages = {page.pageid: pageno for (pageno, page)
                     in enumerate(PDFPage.create_pages(doc), 1)}

            def resolve_dest(dest):
                # Follow named destinations, dictionaries and object
                # references until the destination value itself is reached.
                if isinstance(dest, str):
                    dest = resolve1(doc.get_dest(dest))
                elif isinstance(dest, PSLiteral):
                    dest = resolve1(doc.get_dest(dest.name))
                if isinstance(dest, dict):
                    dest = dest['D']
                if isinstance(dest, PDFObjRef):
                    dest = dest.resolve()
                return dest

            try:
                outlines = doc.get_outlines()
                outfp.write('<outlines>\n')
                for (level, title, dest, a, se) in outlines:
                    pageno = None
                    if dest:
                        dest = resolve_dest(dest)
                        pageno = pages[dest[0].objid]
                    elif a:
                        action = a
                        if isinstance(action, dict):
                            subtype = action.get('S')
                            if subtype and repr(subtype) == '/\'GoTo\'' and action.get(
                                    'D'):
                                dest = resolve_dest(action['D'])
                                pageno = pages[dest[0].objid]
                    # NOTE(review): on Python 3 this yields bytes, which the
                    # format call below renders as b'...' - confirm the
                    # intended interpreter version before changing it.
                    s = e(title).encode('utf-8', 'xmlcharrefreplace')
                    outfp.write('<outline level="{!r}" title="{}">\n'.format(level, s))
                    if dest is not None:
                        outfp.write('<dest>')
                        dumpxml(outfp, dest)
                        outfp.write('</dest>\n')
                    if pageno is not None:
                        outfp.write('<pageno>%r</pageno>\n' % pageno)
                    outfp.write('</outline>\n')
                outfp.write('</outlines>\n')
            except PDFNoOutlines:
                pass
        finally:
            parser.close()
    return
コード例 #33
0
def extract_text_from_pdf(fobj):
    """Return the text of all pages of the PDF read from ``fobj``.

    :param fobj: binary file object containing the PDF.
    :return: the text the converter produced for every page, concatenated
        in page order.
    """
    parser = PDFParser(fobj)
    doc = PDFDocument(parser)
    # One resource manager, buffer and converter serve the whole document,
    # so nothing is rebuilt per page and the text accumulates in one buffer
    # instead of repeated string concatenation.
    rsrcmgr = PDFResourceManager()
    result = io.StringIO()
    device = TextConverter(rsrcmgr, result, laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    for page in PDFPage.create_pages(doc):
        interpreter.process_page(page)

    return result.getvalue()
コード例 #34
0
def parse(DataIO, save_path, paperID):
    '''
    Parse a PDF and append the text of its horizontal text boxes to a file.

    :param DataIO: binary stream containing the PDF.
    :param save_path: path of the text file that receives the output (opened
        in append mode).
    :param paperID: identifier written to the failure logs under ``data/``.
    :raises PDFTextExtractionNotAllowed: if the document forbids extraction
        (the id is logged to ``data/parseFailed.txt`` first).

    Returns ``None`` without raising when the stream is not a parsable PDF;
    in that case the id is logged to ``data/nonpdfdoc.txt``.
    '''
    # create pdf parser
    parser = PDFParser(DataIO)
    # create pdf document
    try:
        doc = PDFDocument(parser)
    except PDFSyntaxError:
        print("can't parse this file!")
        with open('data/nonpdfdoc.txt', 'a') as f:
            f.write(paperID)
            f.write('\n')
        return
    # link document and parser
    parser.set_document(doc)
    # check if the document can be converted to text
    if not doc.is_extractable:
        print("Can't Parse this File! Ignore it and keep parsing")
        with open('data/parseFailed.txt', 'a') as f:
            f.write(paperID)
            f.write('\n')
        raise PDFTextExtractionNotAllowed

    # create pdf resource manager and the layout-analysis device
    rsrcmagr = PDFResourceManager()
    laparams = LAParams()
    device = PDFPageAggregator(rsrcmagr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmagr, device)

    for page in PDFPage.create_pages(doc):
        interpreter.process_page(page)
        # get the LTPage obj, including LTTextBox, LTFigure,
        # LTImage, LTTextBoxHorizontal
        layout = device.get_result()
        for x in layout:
            try:
                if isinstance(x, LTTextBoxHorizontal):
                    with open('%s' % save_path, 'a') as f:
                        result = x.get_text()
                        f.write(result + '\n')
            # Best effort per text box: log the failure and carry on, but do
            # not swallow KeyboardInterrupt/SystemExit as a bare except did.
            except Exception:
                with open('data/parseFailed.txt', 'a') as f:
                    f.write(paperID)
                    f.write('\n')
                print("Failed")
コード例 #35
0
ファイル: pdf_utils.py プロジェクト: MTleen/202005-tf_idf
def read_pdf(pdf_path):
    """Read the horizontal text boxes of a PDF and count its pages.

    Each text box is printed as it is read.

    :param pdf_path: path of the PDF file.
    :return: ``(page_count, content)`` on success; ``None`` when the document
        does not allow extraction or a ``PdfReadError`` is raised (an error
        dialog is shown in both cases).
    """
    try:
        # The context manager closes the file even when parsing raises.
        with open(pdf_path, 'rb') as fp:
            # Create a PDF parser from the file object.
            parser = PDFParser(fp)
            # Create the PDF document.
            doc = PDFDocument(parser=parser)
            # Connect the parser and the document object.
            parser.set_document(doc)
            # Documents that do not allow text conversion are rejected;
            # OCR would be an alternative for such files.
            if not doc.is_extractable:
                messagebox.showerror(
                    message='无法解析 PDF 文件 {},请重新选择。'.format(pdf_path))
                return
            # Resource manager for shared resources and the layout device.
            rsrcmgr = PDFResourceManager()
            laparams = LAParams()
            device = PDFPageAggregator(rsrcmgr, laparams=laparams)
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            # Each layout is an LTPage holding the objects parsed from one
            # page; only the horizontal text boxes contribute text here.
            page_count = 0
            pieces = []
            for page_count, page in enumerate(PDFPage.create_pages(doc), start=1):
                interpreter.process_page(page)
                layout = device.get_result()
                for x in layout:
                    if isinstance(x, LTTextBoxHorizontal):
                        result = x.get_text()
                        pieces.append(result)
                        print(result)
            return page_count, ''.join(pieces)
    # NOTE(review): PdfReadError is not raised by any call visible in this
    # function; it looks like a leftover from an earlier implementation.
    except PdfReadError:
        messagebox.showerror(message='{}文件已加密或损坏,请重新选择。'.format(pdf_path))
        traceback.print_exc()
コード例 #36
0
    def parsepdf(self):
        """Parse every page of ``self.filename`` and return ``self.word_array``.

        Each page is interpreted with layout analysis and the objects of its
        layout are passed to ``self.parse_page`` together with the 1-based
        page number.

        :raises PDFTextExtractionNotAllowed: if the document forbids extraction.
        """
        # The context manager closes the file even when parsing raises.
        with open(self.filename, 'rb') as fp:
            # Create a PDF parser object associated with the file object.
            parser = PDFParser(fp)

            # Create a PDF document object that stores the document structure.
            # A password could be supplied as the 2nd parameter.
            document = PDFDocument(parser)
            # Check if the document allows text extraction. If not, abort.
            if not document.is_extractable:
                print('extraction not allowed')
                raise PDFTextExtractionNotAllowed

            # Resource manager for shared resources and the layout device.
            rsrcmgr = PDFResourceManager()
            laparams = LAParams()
            device = PDFPageAggregator(rsrcmgr, laparams=laparams)
            interpreter = PDFPageInterpreter(rsrcmgr, device)

            # Page numbers passed on are 1-based.
            for i, page in enumerate(PDFPage.create_pages(document), start=1):
                # read the page into a layout object
                interpreter.process_page(page)
                layout = device.get_result()

                # extract text from this object
                self.parse_page(layout._objs, i)
        return self.word_array
    

#test = PDFpos("FinancialAccounting1.pdf")
#test.parsepdf()
コード例 #37
0
ファイル: pdfHandle.py プロジェクト: abcxs/ctpn_pytorch
def get_page_layout(
    filename,
    char_margin=1.0,
    line_margin=0.5,
    word_margin=0.1,
    detect_vertical=True,
    all_texts=True,
):
    """Return the PDFMiner layout and the page dimension of a single-page PDF.

    The keyword arguments are passed straight to ``LAParams``; see
    https://euske.github.io/pdfminer/ for their definitions. If the file has
    several pages, the values of the last page are returned.

    Parameters
    ----------
    filename : string
        Path to pdf file.
    char_margin : float
    line_margin : float
    word_margin : float
    detect_vertical : bool
    all_texts : bool

    Returns
    -------
    layout : object
        PDFMiner LTPage object.
    dim : tuple
        Dimension of pdf page in the form (width, height).
    """
    with open(filename, "rb") as pdf_file:
        document = PDFDocument(PDFParser(pdf_file))
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        analysis_params = LAParams(
            char_margin=char_margin,
            line_margin=line_margin,
            word_margin=word_margin,
            detect_vertical=detect_vertical,
            all_texts=all_texts,
        )
        resource_manager = PDFResourceManager()
        aggregator = PDFPageAggregator(resource_manager, laparams=analysis_params)
        interpreter = PDFPageInterpreter(resource_manager, aggregator)
        for pdf_page in PDFPage.create_pages(document):
            interpreter.process_page(pdf_page)
            layout = aggregator.get_result()
        # The upper-right corner of the bounding box gives width and height.
        dim = (layout.bbox[2], layout.bbox[3])
        return layout, dim
コード例 #38
0
ファイル: model.py プロジェクト: vestigegroup/rpaframework
    def convert(self, source_path: str = None) -> None:
        """Parse source PDF into entities which can be
        used for text searches, for example.

        This is also used inside other PDF keywords.

        **Examples**

        **Robot Framework**

        .. code-block:: robotframework

            ***Settings***
            Library    RPA.PDF

            ***Tasks***
            Example Keyword
                Convert    /tmp/sample.pdf

        **Python**

        .. code-block:: python

            from RPA.PDF import PDF

            pdf = PDF()

            def example_keyword():
                pdf.convert("/tmp/sample.pdf")

        :param source_path: source PDF filepath.
        """
        self.ctx.switch_to_pdf(source_path)

        # Build the pdfminer pipeline on top of the now-active document.
        document = PDFDocument(PDFParser(self.ctx.active_pdf_document.fileobject))
        resource_manager = PDFResourceManager()
        layout_params = pdfminer.layout.LAParams(detect_vertical=True, all_texts=True)
        device = Converter(
            self.ctx.active_pdf_document, resource_manager, laparams=layout_params
        )
        interpreter = pdfminer.pdfinterp.PDFPageInterpreter(resource_manager, device)

        # Look at all (nested) objects on each page
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)

        # Closing the converter hands back the document, which becomes the
        # active one and is flagged as converted.
        self.ctx.active_pdf_document = device.close()
        self.ctx.active_pdf_document.is_converted = True
コード例 #39
0
def parse(in_path, out_path, start_num, end_num):
    """Append the horizontal text boxes of selected PDF pages to a text file.

    Page selection (page numbers are 1-based):

    * ``start_num == 0`` and ``end_num == 0``: every page;
    * ``start_num != 0`` and ``end_num == 0``: only page ``start_num``;
    * otherwise: pages ``start_num`` through ``end_num`` inclusive. This
      includes ``start_num == 0`` with a non-zero ``end_num``, which used to
      fail with an unbound ``mode`` variable and now means "up to end_num".

    :param in_path: path of the PDF file to read.
    :param out_path: path of the UTF-8 text file, opened in append mode.
    :raises PDFTextExtractionNotAllowed: if the document forbids extraction.
    """
    # Turn the two arguments into an inclusive page range; ``None`` as the
    # upper bound means "no limit".
    if end_num == 0:
        if start_num == 0:
            first_page, last_page = 1, None
        else:
            first_page = last_page = start_num
    else:
        first_page, last_page = start_num, end_num

    # The context manager closes the input file even when parsing raises.
    with open(in_path, 'rb') as fp:
        parser = PDFParser(fp)  # parser bound to the file object
        doc = PDFDocument(parser)  # the PDF document
        parser.set_document(doc)  # connect parser and document

        if not doc.is_extractable:  # extraction not permitted
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()  # manages shared resources
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for page_no, page in enumerate(PDFPage.create_pages(doc), start=1):
            if page_no < first_page:
                continue
            if last_page is not None and page_no > last_page:
                break

            interpreter.process_page(page)
            # The layout is an LTPage holding the objects parsed from the
            # page; only the horizontal text boxes are written out.
            layout = device.get_result()
            texts = [x.get_text() for x in layout
                     if isinstance(x, LTTextBoxHorizontal)]
            if texts:
                # An explicit encoding avoids garbled non-ASCII output.
                with open(out_path, 'a', encoding='utf-8') as out_txt:
                    out_txt.writelines(text + '\n' for text in texts)
    return
コード例 #40
0
ファイル: ontologies.py プロジェクト: 413J4NDR0/project24
def build_frequency_matrix(my_pdf, arr_stemmed_ontology, stemmer):
    """Count ontology-term occurrences on every page of a PDF.

    :param my_pdf: path of the PDF file.
    :param arr_stemmed_ontology: mapping from a word to the list of its
        stemmed synonyms.
    :param stemmer: passed to ``process_pdf`` together with each page's text.
    :return: ``(frequency_matrix, doc_text)`` where ``frequency_matrix`` has
        one row per page and one column per ontology key (in the mapping's
        iteration order), and ``doc_text`` is the processed text of all pages
        concatenated.
    :raises PDFTextExtractionNotAllowed: if the document forbids extraction.
    """
    frequency_matrix = []
    page_texts = []

    # The context manager closes the file even when parsing raises.
    with open(my_pdf, "rb") as fp:
        # set up document for PDFMiner
        parser = PDFParser(fp)
        document = PDFDocument(parser)
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            layout = device.get_result()
            page_text = "".join(
                lt_obj.get_text() for lt_obj in layout
                if isinstance(lt_obj, (LTTextBox, LTTextLine)))

            # process the text: stem all words
            page_text = process_pdf(page_text, stemmer)
            page_texts.append(page_text)

            # One count per ontology key: the summed matches of all of its
            # synonyms on this page.
            # NOTE(review): each synonym is used as a regular expression
            # as-is; confirm they never contain regex metacharacters.
            frequency_matrix.append([
                sum(len(re.findall(syn, page_text)) for syn in synonym_list)
                for synonym_list in arr_stemmed_ontology.values()
            ])

    return frequency_matrix, "".join(page_texts)
コード例 #41
0
def mine_pdf(fp):
    """Extract the text of a PDF as one lower-cased string.

    The converted text is split into lines and re-joined: an empty line
    becomes a single space, a line ending in ``-`` loses the hyphen and is
    joined directly to the next line, and any other line is followed by a
    space.

    :param fp: path of the PDF file.
    :return: the joined, lower-cased text.
    :raises PDFTextExtractionNotAllowed: if the document forbids extraction.
    """
    print('mining pdf')
    with open(Path(fp), 'rb') as file:
        document = PDFDocument(PDFParser(file))
        # Abort when the document does not allow text extraction.
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed

        resource_manager = PDFResourceManager()
        buffer = StringIO()
        # Spacing parameters for parsing, see
        # https://github.com/obeattie/pdfminer/wiki/pdfminer.layout
        spacing = LAParams(char_margin=4.0, word_margin=0)
        converter = TextConverter(
            resource_manager, buffer, codec='utf-8', laparams=spacing)
        interpreter = PDFPageInterpreter(resource_manager, converter)
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)

        raw_lines = buffer.getvalue().splitlines()

        print('iterate over lines')
        pieces = []
        for raw in raw_lines:
            lowered = raw.lower()
            if not lowered:
                pieces.append(' ')
            elif lowered.endswith('-'):
                # Drop the hyphen so the word continues on the next line.
                pieces.append(lowered[:-1])
            else:
                pieces.append(lowered + ' ')
        text = ''.join(pieces)
    return text
コード例 #42
0
ファイル: tlsh_pdf.py プロジェクト: ekmixon/ThreatExchange
 def hash_from_file(cls, file: pathlib.Path) -> str:
     """Return the TLSH hash of the text extracted from a PDF file.

     A path whose string form does not end in ``.pdf`` is not opened: a
     ``UserWarning`` is issued and an empty string is returned.

     :param file: path of the PDF file.
     :return: the hash as a string, or ``""`` for non-``.pdf`` paths.
     """
     looks_like_pdf = str(file).endswith(".pdf")
     if looks_like_pdf:
         extracted = StringIO()
         with open(file, "rb") as pdf_handle:
             document = PDFDocument(PDFParser(pdf_handle))
             resource_manager = PDFResourceManager()
             converter = TextConverter(
                 resource_manager, extracted, laparams=LAParams())
             interpreter = PDFPageInterpreter(resource_manager, converter)
             for pdf_page in PDFPage.create_pages(document):
                 interpreter.process_page(pdf_page)
             return str(tlsh.hash(extracted.getvalue().encode()))

     warnings.warn("File does not appear to be a pdf. ",
                   category=UserWarning)
     return ""
コード例 #43
0
    def __init__(self, document, laparams=None):
        """Set up page iteration and layout analysis for ``document``.

        The first page is fetched immediately, so a document without pages
        makes ``next`` raise ``StopIteration`` here.

        :param document: PDF document whose pages are iterated.
        :param laparams: layout parameters; a default ``LAParams()`` is used
            when omitted.
        """
        params = LAParams() if laparams is None else laparams

        self._page_iterator = iter(PDFPage.create_pages(document))
        self._page = next(self._page_iterator)

        resource_manager = PDFResourceManager()
        self.device = PDFPageAggregator(resource_manager, laparams=params)
        self.interpreter = PDFPageInterpreter(resource_manager, self.device)

        # Reading state: no lines loaded yet, positioned on the first page.
        self._lines = None
        self.i_page = 1
        self.is_at_end = False
コード例 #44
0
def pdf2text(path):
    """Return the text content of the PDF stored at ``path``.

    Follows the composable-API example from
    https://pdfminersix.readthedocs.io/en/latest/tutorial/composable.html
    """
    text_buffer = StringIO()
    with open(path, "rb") as pdf_file:
        document = PDFDocument(PDFParser(pdf_file))
        resource_manager = PDFResourceManager()
        converter = TextConverter(
            resource_manager, text_buffer, laparams=LAParams())
        interpreter = PDFPageInterpreter(resource_manager, converter)
        # The converter writes each processed page into the buffer.
        for pdf_page in PDFPage.create_pages(document):
            interpreter.process_page(pdf_page)
    return text_buffer.getvalue()
コード例 #45
0
def read_fields(pdffile):
    """Collect the AcroForm fields of a PDF, sorted with ``fieldsorter``.

    :param pdffile: path of the PDF file.
    :return: sorted list of the fields gathered by
        ``recursively_add_fields``; an empty list when the document catalog
        has no ``AcroForm`` entry.
    """
    outfields = []
    # The context manager closes the file on every exit path, including the
    # early return below.
    with open(pdffile, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser)
        # Map page object ids to 1-based page numbers.
        id_to_page = {
            page.pageid: pageno
            for pageno, page in enumerate(PDFPage.create_pages(doc), start=1)
        }
        if 'AcroForm' not in doc.catalog:
            return []
        fields = resolve1(doc.catalog['AcroForm'])['Fields']
        recursively_add_fields(fields, id_to_page, outfields)
    return sorted(outfields, key=fieldsorter)
コード例 #46
0
def iterate_pages(
        pdf_fn: str,
        use_advanced_detection: bool = False) -> Generator[LTPage, None, None]:
    """Yield the analysed layout of every page of a PDF file.

    The file stays open while the generator is being consumed.

    :param pdf_fn: path of the PDF file.
    :param use_advanced_detection: when false, ``boxes_flow=None`` is
        additionally passed to ``LAParams``.
    """
    with open(pdf_fn, 'rb') as pdf_f:
        document = PDFDocument(PDFParser(pdf_f))
        resource_manager = PDFResourceManager()
        if use_advanced_detection:
            layout_params = LAParams(all_texts=True, grid_size=0)
        else:
            layout_params = LAParams(all_texts=True, boxes_flow=None, grid_size=0)
        aggregator = PDFPageAggregator(resource_manager, laparams=layout_params)
        interpreter = PDFPageInterpreter(resource_manager, aggregator)
        for pdf_page in PDFPage.create_pages(document):
            interpreter.process_page(pdf_page)
            yield aggregator.get_result()
コード例 #47
0
def _parse_pages(doc, images_folder):
    """Parse every page of ``doc`` and return one result per page.

    Each page layout's objects are passed to ``parse_lt_objs`` together with
    the 1-based page number and ``images_folder``.

    :return: list of the ``parse_lt_objs`` results, in page order.
    """
    resource_manager = PDFResourceManager()
    aggregator = PDFPageAggregator(resource_manager, laparams=LAParams())
    interpreter = PDFPageInterpreter(resource_manager, aggregator)

    per_page_text = []
    for page_no, pdf_page in enumerate(PDFPage.create_pages(doc), start=1):
        interpreter.process_page(pdf_page)
        # The aggregator now holds the LTPage for this page; its child
        # objects may be LTTextBox, LTFigure, LTImage, etc.
        page_layout = aggregator.get_result()
        parsed = parse_lt_objs(page_layout._objs, page_no, images_folder)
        per_page_text.append(parsed)
    return per_page_text
コード例 #48
0
def le_pdf(filename_or_fobj):
    """Return the text of a PDF with surrounding whitespace stripped.

    :param filename_or_fobj: value resolved by ``get_filename_and_fobj``
        into a file name and a file object; only the file object is used.
    :return: the text the converter produced for all pages, stripped.
    """
    _, fobj = get_filename_and_fobj(filename_or_fobj)
    parser = PDFParser(fobj)
    doc = PDFDocument(parser)
    # One resource manager, buffer and converter serve the whole document,
    # so nothing is rebuilt per page and the text accumulates in one buffer.
    rsrcmgr = PDFResourceManager()
    resultado = io.StringIO()
    conversor = TextConverter(rsrcmgr, resultado, laparams=LAParams())
    interpretador = PDFPageInterpreter(rsrcmgr, conversor)
    for pagina in PDFPage.create_pages(doc):
        interpretador.process_page(pagina)

    return resultado.getvalue().strip()
コード例 #49
0
ファイル: TASK3.py プロジェクト: amiteshag320/AI-CHAMP-CV-
def extract_text_from_pdf(pdf_path):
    """Return the text of a PDF with every newline replaced by a space.

    :param pdf_path: path of the PDF file.
    """
    text_buffer = StringIO()
    with open(pdf_path, 'rb') as pdf_file:
        document = PDFDocument(PDFParser(pdf_file))
        resource_manager = PDFResourceManager()
        converter = TextConverter(
            resource_manager, text_buffer, laparams=LAParams())
        interpreter = PDFPageInterpreter(resource_manager, converter)
        for pdf_page in PDFPage.create_pages(document):
            interpreter.process_page(pdf_page)

    # Flatten the extracted text onto a single line.
    return text_buffer.getvalue().replace("\n", " ")
コード例 #50
0
ファイル: capston.py プロジェクト: tjtmddnjswkd/capstone
def read_pdf_PDFMINER(pdf_file_path):
    """
    Read all of the text inside the PDF at ``pdf_file_path``
    (a path such as 'dir/aaa.pdf') and return it as a string.
    """
    text_buffer = StringIO()
    with open(pdf_file_path, 'rb') as pdf_file:
        document = PDFDocument(PDFParser(pdf_file))
        resource_manager = PDFResourceManager()
        converter = TextConverter(
            resource_manager, text_buffer, laparams=LAParams())
        interpreter = PDFPageInterpreter(resource_manager, converter)
        for pdf_page in PDFPage.create_pages(document):
            interpreter.process_page(pdf_page)
    extracted = text_buffer.getvalue()
    return str(extracted)
コード例 #51
0
def convert_pdf_to_string(file_path):
    """Return the lower-cased alphabetic tokens of a PDF's text.

    The extracted text is tokenised with ``nltk.word_tokenize``; tokens that
    are not purely alphabetic are dropped.

    :param file_path: path of the PDF file.
    :return: list of lower-cased tokens.
    """
    text_buffer = StringIO()
    with open(file_path, 'rb') as pdf_file:
        document = PDFDocument(PDFParser(pdf_file))
        resource_manager = PDFResourceManager()
        converter = TextConverter(
            resource_manager, text_buffer, laparams=LAParams())
        interpreter = PDFPageInterpreter(resource_manager, converter)
        for pdf_page in PDFPage.create_pages(document):
            interpreter.process_page(pdf_page)

    tokens = nltk.word_tokenize(text_buffer.getvalue())
    return [token.lower() for token in tokens if token.isalpha()]
コード例 #52
0
def PdfToString(path_to_file):
    """Return the text of a PDF, or the string ``'error'`` on any failure.

    :param path_to_file: path of the PDF file.
    :return: the extracted text; ``'error'`` if the file cannot be opened or
        parsed.
    """
    try:
        txt = StringIO()
        # The context manager closes the file even when parsing raises; the
        # original only closed it on the success path.
        with open(path_to_file, 'rb') as file:  # read binary file
            parse = PDFParser(file)
            document = PDFDocument(parse)
            manage = PDFResourceManager()
            convert = TextConverter(manage, txt, laparams=LAParams())
            interpret = PDFPageInterpreter(manage, convert)
            for page in PDFPage.create_pages(document):
                interpret.process_page(page)
        return txt.getvalue()
    # Keep the best-effort contract, but let KeyboardInterrupt/SystemExit
    # propagate instead of being reported as 'error'.
    except Exception:
        return 'error'
コード例 #53
0
def get_layout_elements(content):
    '''Return the text of the layout elements on the first page of a PDF.

    ``content`` holds the bytes of the PDF. Only children of the page layout
    that offer a ``get_text`` method contribute, in layout iteration order.
    '''
    resource_manager = PDFResourceManager()
    # Page aggregator performing the layout analysis.
    aggregator = PDFPageAggregator(resource_manager, laparams=LAParams())
    interpreter = PDFPageInterpreter(resource_manager, aggregator)

    # Parse the in-memory bytes and take the first page only.
    document = PDFDocument(PDFParser(io.BytesIO(content)))
    first_page = next(PDFPage.create_pages(document))
    interpreter.process_page(first_page)

    # The aggregator now holds the LTPage object for that page.
    texts = []
    for element in aggregator.get_result():
        if hasattr(element, 'get_text'):
            texts.append(element.get_text())
    return texts
コード例 #54
0
    def pages(self):
        """Return the list of ``Page`` objects, building it on first use.

        Pages are numbered from 1. When ``self.pages_to_parse`` is not
        ``None``, only page numbers contained in it are kept. Each page is
        given the accumulated height of the kept pages before it as its
        ``initial_doctop``. The list is cached in ``self._pages``.
        """
        if not hasattr(self, "_pages"):
            running_top = 0
            wanted = self.pages_to_parse
            self._pages = []
            for number, raw_page in enumerate(
                    PDFPage.create_pages(self.doc), start=1):
                if wanted is None or number in wanted:
                    built = Page(self, raw_page, page_number=number,
                                 initial_doctop=running_top)
                    self._pages.append(built)
                    running_top += built.height
        return self._pages
コード例 #55
0
def pdf2text_all(stream):
    """Yield the text of every text box and text line found in a PDF.

    :param stream: binary stream containing the PDF.
    :raises PDFTextExtractionNotAllowed: if the document forbids extraction
        (raised when the generator is first advanced).
    """
    document = PDFDocument(PDFParser(stream))
    if not document.is_extractable:
        raise PDFTextExtractionNotAllowed

    resource_manager = PDFResourceManager()
    aggregator = PDFPageAggregator(resource_manager, laparams=LAParams())
    interpreter = PDFPageInterpreter(resource_manager, aggregator)
    for pdf_page in PDFPage.create_pages(document):
        interpreter.process_page(pdf_page)
        # Only top-level text containers of the page layout are reported.
        yield from (
            element.get_text()
            for element in aggregator.get_result()
            if isinstance(element, (LTTextBox, LTTextLine))
        )
コード例 #56
0
ファイル: files.py プロジェクト: NickDale/simple_pdf_editor
    def convert_pdf_to_string(self, txt_edit, path):
        """Replace the contents of ``txt_edit`` with the text of a PDF.

        :param txt_edit: text widget; its content is cleared first and the
            extracted text is then inserted at the end.
        :param path: path of the PDF file.
        """
        text_buffer = StringIO()
        txt_edit.delete(1.0, tk.END)
        with open(path, 'rb') as pdf_file:
            document = PDFDocument(PDFParser(pdf_file))
            resource_manager = PDFResourceManager()
            converter = TextConverter(
                resource_manager, text_buffer, laparams=LAParams())
            interpreter = PDFPageInterpreter(resource_manager, converter)
            for pdf_page in PDFPage.create_pages(document):
                interpreter.process_page(pdf_page)

            extracted = text_buffer.getvalue()
            txt_edit.insert(tk.END, extracted)
コード例 #57
0
def convert_pdf_to_txt(content):
    """Extract text from a PDF given as raw bytes or as a response-like object.

    Args:
        content: Either a bytes-like object holding the PDF data, or an
            object whose ``.content`` attribute holds it.

    Returns:
        The concatenation, as ``str``, of ``retstr.getvalue()`` read after
        each processed page.

    Note:
        ``interpreter`` and ``retstr`` are not defined in this function; they
        must already exist in the enclosing (module) scope when it is called.
    """
    # Prefer ``content.content``; fall back to ``content`` itself only when
    # that attribute is missing or unusable. A bare ``except`` here would also
    # swallow KeyboardInterrupt / SystemExit.
    try:
        pdf = io.BytesIO(content.content)
    except (AttributeError, TypeError):
        pdf = io.BytesIO(content)
    parser = PDFParser(pdf)
    # NOTE(review): the original author flagged this call as failing
    # ("this fails"); the cause is not visible from this function.
    document = PDFDocument(parser, password=None)
    page_texts = []
    for page in PDFPage.create_pages(document):
        interpreter.process_page(page)
        # NOTE(review): if ``retstr`` is a cumulative buffer, each read
        # includes the text of all earlier pages as well -- confirm whether
        # that duplication is intended.
        page_texts.append(retstr.getvalue())
    # Join once instead of growing a string with ``+=`` on every page.
    return str(''.join(page_texts))
コード例 #58
0
def PDF_to_TXT_regex2(title):
	"""Print the title, then the text extracted from the PDF file at ``title``.

	Every page of the file is run through a ``TextConverter`` that writes
	into ``output_string``. That name is not defined in this function and
	must exist in the enclosing scope. After the file is closed, the
	buffer's full value is printed.
	"""
	print("Title: {}".format(title))

	with open(title, 'rb') as pdf_file:
		document = PDFDocument(PDFParser(pdf_file))
		resources = PDFResourceManager()
		converter = TextConverter(resources, output_string, laparams=LAParams())
		page_interpreter = PDFPageInterpreter(resources, converter)
		for pdf_page in PDFPage.create_pages(document):
			page_interpreter.process_page(pdf_page)

	print(output_string.getvalue())
コード例 #59
0
ファイル: demo.py プロジェクト: a10423006/PDF-crawler
 def _parse_pages(doc, images_folder):
     """Parse every page of an open PDFDocument and collect the results.

     This is a higher-order function meant to be passed to with_pdf().

     Args:
         doc: The open PDFDocument whose pages are iterated.
         images_folder: Passed through unchanged to parse_lt_objs().

     Returns:
         A list with one entry per page, in page order: the value returned
         by parse_lt_objs() for that page's layout. Empty when the document
         has no pages.
     """
     rsrcmgr = PDFResourceManager()
     laparams = LAParams()
     device = PDFPageAggregator(rsrcmgr, laparams=laparams)
     interpreter = PDFPageInterpreter(rsrcmgr, device)
     text_content = []
     for i, page in enumerate(PDFPage.create_pages(doc)):
         interpreter.process_page(page)
         # receive the LTPage object for this page
         layout = device.get_result()
         # layout is an LTPage object which may contain child objects like LTTextBox, LTFigure, LTImage, etc.
         # parse_lt_objs() is given a 1-based page number.
         text_content.append(parse_lt_objs(
             layout, (i + 1), images_folder))
     # Return only after ALL pages are processed; returning inside the loop
     # stopped after the first page and gave None for an empty document.
     return text_content
コード例 #60
0
def convert_pdf_to_string(file_path):
    """Return the text extracted from the PDF file at ``file_path``.

    Args:
        file_path: Path of the PDF file; it is opened in binary mode.

    Returns:
        The full contents of the in-memory buffer that the ``TextConverter``
        wrote to while all pages were processed.
    """
    output_string = StringIO()
    laparams = LAParams()
    # Per the pdfminer documentation, all_texts enables layout analysis on
    # text inside figures as well.
    laparams.all_texts = True
    with open(file_path, 'rb') as in_file:
        parser = PDFParser(in_file)
        doc = PDFDocument(parser)
        rsrcmgr = PDFResourceManager()
        # Pass the configured params. A fresh LAParams() here would silently
        # discard the all_texts setting made above.
        device = TextConverter(rsrcmgr, output_string, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.create_pages(doc):
            interpreter.process_page(page)

    return output_string.getvalue()