def getPDFMetadata(path):
    """Return the document-information metadata of the PDF at ``path``.

    The first entry of ``doc.info`` is copied into a new dict. When the
    document catalog contains a ``Metadata`` stream, that stream is parsed
    with ``xmp_to_dict`` and merged on top of the copy.

    Returns an empty dict when ``doc.info`` has no entries.
    """
    with open(path, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize()

        # doc.info is indexed like a sequence, so work on a copy of its
        # first entry instead of calling dict methods on the sequence.
        # (assumes each entry is a mapping -- TODO confirm)
        result = dict(doc.info[0]) if doc.info else {}

        if 'Metadata' in doc.catalog:
            metadata = resolve1(doc.catalog['Metadata']).get_data()
            try:
                result.update(xmp_to_dict(metadata))
            except Exception:
                # XMP is optional extra data: keep the info entry when the
                # stream cannot be parsed (deliberate best effort).
                pass

    return result
Example #2
0
def pdf_isvalid(filelike):
    ''' returns True if valid pdf, else False
    @param filelike: filelike object, seekable

    The stream must start with PDF_MAGIC, must be accepted by the pdfminer
    parser/document pair and must report itself as extractable. A
    PDFException raised while loading is logged and yields False.
    The stream is rewound to offset 0 before returning or raising.
    '''
    logger = logging.getLogger()
    filelike.seek(0)
    try:
        if filelike.read(len(PDF_MAGIC)) != PDF_MAGIC:
            return False
        filelike.seek(0)
        parser = PDFParser(filelike)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize('')
        return bool(doc.is_extractable)
    except PDFException as excobj:
        logger.warning("pdf has valid header but, still not valid pdf, exception was %r", excobj)
        return False
    finally:
        # Leave the stream positioned at the start on every exit path.
        filelike.seek(0)
Example #3
0
 def WithPdf(self, pdfdoc, password, fn, *args):
     """Open the pdf document, and apply the function, returning the results

     A truthy ``password`` is stored on ``self.password`` before the
     document is initialized with ``self.password``. ``fn(doc, *args)`` is
     only called when the document is extractable.

     Returns the value of ``fn``, or None when the document is not
     extractable or an IOError occurred.
     """
     result = None
     try:
         # The context manager closes the file even when parsing or fn raises.
         with open(pdfdoc, 'rb') as fp:
             # create a parser object associated with the file object
             parser = PDFParser(fp)
             # create a PDFDocument object that stores the document structure
             doc = PDFDocument()
             # connect the parser and document objects
             parser.set_document(doc)
             doc.set_parser(parser)
             # supply the password for initialization
             if password:
                 self.password = password
             doc.initialize(self.password)

             if doc.is_extractable:
                 # apply the function and return the result
                 result = fn(doc, *args)
     except IOError:
         # the file doesn't exist or similar problem
         pass
     return result
Example #4
0
def get_pdf_metadata(fileOrUrl, textmode=False, prefix='', basicauth=None):
    """Print the ``info`` metadata of a PDF given by path or http(s) URL.

    A URL is downloaded fully into memory first; ``basicauth`` is appended
    to a ``Basic`` Authorization header when given. With ``textmode`` each
    metadata entry is printed as ``<fileOrUrl>:<name>=<value>``; otherwise
    the info value is printed after ``prefix`` (unwrapped first when it is
    a one-element list).
    """
    # NOTE(review): `args` is not defined in this function -- presumably a
    # module-level list of command-line arguments; confirm before relying on it.
    if len(args) > 1:
        prefix = fileOrUrl + ':'
    if fileOrUrl.startswith(('http://', 'https://')):
        request = urllib2.Request(fileOrUrl)
        if basicauth:
            request.add_header('Authorization', 'Basic ' + basicauth)
        fobj = urllib2.urlopen(request)
        try:
            pdfdata = fobj.read()
        finally:
            fobj.close()
        fp = StringIO.StringIO(pdfdata)
    else:
        fp = open(fileOrUrl, 'rb')
    try:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize()
    finally:
        # Close the source even when parsing fails.
        fp.close()
    if textmode:
        for obj in doc.info:
            for (name, val) in obj.items():
                print('{0}:{1}={2}'.format(fileOrUrl, name, val))
    else:
        val = doc.info
        if isinstance(val, list) and len(val) == 1:
            val = val[0]
        print(prefix + str(val))
Example #5
0
	def getData(self):
		"""Load the trailer ``Info`` dictionary of ``self.fname``.

		The last ``Info`` entry found across ``doc.xrefs`` is stored on
		both ``self.metadata`` and ``self.raw``.

		Returns "error" when the document cannot be set up, "Empty metadata"
		when no ``Info`` entry was found, "ok" on success, or the exception
		instance raised while reading the trailers.
		"""
		doc = PDFDocument()
		fp = open(self.fname, 'rb')
		try:
			parser = PDFParser(fp)
			try:
				parser.set_document(doc)
				doc.set_parser(parser)
				doc.initialize(self.password)
			except Exception:
				return "error"
			parser.close()
		finally:
			# Runs on the "error" return as well, so the handle never leaks.
			fp.close()
		try:
			# Start from None so a document without any Info entry reports
			# "Empty metadata" instead of failing on an unbound name.
			info = None
			for xref in doc.xrefs:
				info_ref = xref.trailer.get('Info')
				if info_ref:
					info = resolve1(info_ref)
			self.metadata = info
			self.raw = info
			if self.raw is None:
				return "Empty metadata"
			return "ok"
		except Exception as e:
			print("\t [x] Error in PDF extractor, Trailer Info")
			return e
Example #6
0
    def get_metadata(self):
        """Returns metadata from both the info field (older PDFs) and XMP
        (newer PDFs).

        Every entry of ``doc.info`` is added to a new ``Metadata`` object.
        When the catalog has a ``Metadata`` stream, its "xap", "pdf" and
        "dc" sections (as produced by ``xmp_to_dict``) are added too.

        The result is stored on ``self.metadata`` and returned.
        """
        # The context manager closes the file even if parsing raises.
        with open(self.path, 'rb') as file_pointer:
            parser = PDFParser(file_pointer)
            doc = PDFDocument()
            parser.set_document(doc)
            doc.set_parser(parser)
            doc.initialize()
            metadata = Metadata()
            for i in doc.info:
                metadata.add(i)
            if 'Metadata' in doc.catalog:
                xmp_metadata = resolve1(doc.catalog['Metadata']).get_data()
                xmp_dict = xmp_to_dict(xmp_metadata)
                # Let's add only the most useful ones
                for section in ("xap", "pdf"):
                    if section in xmp_dict:
                        metadata.add(xmp_dict[section])
                if "dc" in xmp_dict:
                    metadata.add(xmp_dict["dc"], metadataType="dc")

        self.metadata = metadata
        return metadata
Example #7
0
    def __init__(self, file, password='', just_text=1, check_extractable=True, char_margin=1.0, line_margin=0.1, word_margin=0.1):
        """Parse ``file`` and append the result of processing each page to self.

        ``password`` is handed to the document. Pages are processed when
        ``check_extractable`` is false or the document reports itself as
        extractable; ``self.metadata`` is only set in that case. A truthy
        ``just_text`` triggers ``self._cleanup()`` at the end.
        """
        margins = dict(char_margin=char_margin, line_margin=line_margin, word_margin=word_margin)
        self.parser = PDFParser(file)
        self.laparams = LAParams(**margins)

        # The two supported pdfminer flavours wire parser and document differently.
        if not PYTHON_3:
            self.doc = PDFDocument(self.parser, password)
        else:
            self.doc = PDFDocument()
            self.parser.set_document(self.doc)
            self.doc.set_parser(self.parser)
            self.doc.initialize(password)

        should_extract = (not check_extractable) or self.doc.is_extractable
        if should_extract:
            self.resmgr = PDFResourceManager()
            self.device = TextConverter(self.resmgr, outfp=StringIO(), laparams=self.laparams)
            self.interpreter = PDFPageInterpreter(self.resmgr, self.device)

            pages = self.doc.get_pages() if PYTHON_3 else PDFPage.create_pages(self.doc)
            for page in pages:
                self.append(self.interpreter.process_page(page))
            self.metadata = self.doc.info

        if just_text:
            self._cleanup()
Example #8
0
def dumppdf(outfp, fname, objids, pagenos, password='',
            dumpall=False, codec=None):
  """Dump selected parts of the PDF ``fname`` to ``outfp``.

  objids  -- object ids to dump; stream objects are written raw or decoded
             when ``codec`` is 'raw' or 'binary', everything else as XML.
  pagenos -- zero-based page indexes whose attributes are dumped as XML.
  dumpall -- dump all objects.
  When none of the three is requested, the trailers are dumped instead.
  A trailing newline is written unless ``codec`` is 'raw' or 'binary'.
  """
  doc = PDFDocument()
  fp = open(fname, 'rb')
  try:
    # Kept bound to a local so the parser stays alive while the file is read.
    parser = PDFParser(doc, fp)
    doc.initialize(password)
    if objids:
      for objid in objids:
        obj = doc.getobj(objid)
        if isinstance(obj, PDFStream) and codec == 'raw':
          outfp.write(obj.get_rawdata())
        elif isinstance(obj, PDFStream) and codec == 'binary':
          outfp.write(obj.get_data())
        else:
          dumpxml(outfp, obj, codec=codec)
    if pagenos:
      for (pageno, page) in enumerate(doc.get_pages()):
        if pageno in pagenos:
          dumpxml(outfp, page.attrs)
    if dumpall:
      dumpallobjs(outfp, doc, codec=codec)
    if (not objids) and (not pagenos) and (not dumpall):
      dumptrailers(outfp, doc)
  finally:
    # Close the input even when dumping raises.
    fp.close()
  if codec not in ('raw', 'binary'):
    outfp.write('\n')
  return
    def create_pages(self):
        """Parse ``self.pdf_file`` and save one ``Page`` per parsed page.

        Pages are numbered from 1. Nothing is saved when the document does
        not allow text extraction.
        """

        from public_project.models import Page
        # create a parser object associated with the file object
        parser = PDFParser(self.pdf_file)
        # create a PDFDocument object that stores the document structure
        doc = PDFDocument()
        # connect the parser and document objects
        parser.set_document(doc)
        doc.set_parser(parser)
        # supply the password for initialization
        pdf_pwd = ''
        doc.initialize(pdf_pwd)

        # Default to no pages so a non-extractable document does not fail
        # on an unbound name below.
        doc_pages = []
        if doc.is_extractable:
            doc_pages = self._parse_pages(doc)

        for number, doc_page in enumerate(doc_pages, 1):
            page = Page(
                document=self.document,
                number=number,
                content=smart_unicode(doc_page, encoding='utf-8', strings_only=False, errors='strict'),
            )
            page.save()
def getData(fileName):
 """Print selected fields of the trailer ``Info`` dictionary of a PDF.

 For every xref of the document, the Author, Company, Producer and
 Creator fields are printed when present.

 Returns "error" when the parser and document cannot be connected,
 "Empty metadata" when no ``Info`` entry has been found, the exception
 instance when reading the trailers fails, and None otherwise.
 """
 doc = PDFDocument()
 fp = open(fileName, 'rb')
 try:
  parser = PDFParser(fp)
  try:
   parser.set_document(doc)
   doc.set_parser(parser)
  except Exception:
   return "error"
  parser.close()
 finally:
  # Runs on the "error" return as well, so the handle never leaks.
  fp.close()
 try:
  # Start from None so a trailer without Info reports "Empty metadata"
  # instead of failing on an unbound name.
  info = None
  for xref in doc.xrefs:
   info_ref = xref.trailer.get('Info')
   if info_ref:
    info = resolve1(info_ref)
   metadata = info
   if metadata is None:
    return "Empty metadata"
   for field in ('Author', 'Company', 'Producer', 'Creator'):
    if field in metadata:
     print(field + " " + metadata[field])
 except Exception as e:
  print("\t [x] Error in PDF extractor")
  return e
Example #11
0
class Pdf(object):
    """Thin wrapper around a pdfminer document built from an open file."""

    def __init__(self, pdf_file):
        """Connect a parser to ``pdf_file`` and initialize the document."""
        parser = PDFParser(pdf_file)
        self._doc = PDFDocument()
        parser.set_document(self._doc)
        self._doc.set_parser(parser)
        # initialize() must actually be called (a bare attribute reference
        # does nothing), and only once the parser is attached.
        self._doc.initialize()

    @property
    def pages(self):
        """Number of pages in the document."""
        return sum(1 for _ in self._doc.get_pages())

    def to_text(self):
        """Return the text of all pages, decoded as UTF-8 (errors ignored)."""
        rsrcmgr = PDFResourceManager()
        output = StringIO()
        laparams = LAParams()
        laparams.detect_vertical = True
        laparams.all_texts = True
        laparams.word_margin = 0.4
        device = TextConverter(rsrcmgr, output, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in self._doc.get_pages():
            interpreter.process_page(page)
        return output.getvalue().decode('utf-8', 'ignore')
Example #12
0
 def harvest_file(self, path):
     """Return the title and text content of the PDF at ``path``.

     The title comes from the ``Title`` entry of the first info
     dictionary; it is '' when that entry (or the whole dictionary) is
     missing. Title and content are decoded as UTF-8, falling back to
     ignoring undecodable bytes after logging a warning.

     Returns a dict with the keys 'title', 'content' and 'kind'.
     """
     with open(path, 'rb') as fp:
         # FIXME: how do we know which encoding to use? Should we
         # use 'chardet' to detect it?
         encoding = 'utf-8'
         parser = PDFParser(fp)
         if HAS_PDFMINER_3K:
             doc = PDFDocument()
             parser.set_document(doc)
             doc.set_parser(parser)
         else:
             doc = PDFDocument(parser)
         # A PDF without an info dictionary simply has no title.
         info = doc.info[0] if doc.info else {}
         title = info.get('Title', '')
         if isinstance(title, PDFObjRef):
             title = title.resolve()
         if isinstance(title, bytes):
             # This may not be necessary with pdfminer3k.
             try:
                 title = title.decode(encoding)
             except UnicodeDecodeError:
                 logger.warning('Could not correctly decode title of "%s".', path)
                 title = title.decode(encoding, 'ignore')
         fp.seek(0)
         content = extract_content(fp, encoding).strip()
         try:
             content = content.decode(encoding)
         except UnicodeDecodeError:
             logger.warning('Could not correctly decode content of "%s".', path)
             content = content.decode(encoding, 'ignore')
     return {
         'title': title,
         'content': content,
         'kind': 'PDF',
     }
class PdfSerializer(object):
    """Extract the text of a PDF file and optionally write it to a .txt file.

    The input file stays open for the lifetime of the object because the
    document is read in ``getString``; call ``close()`` when done.
    """

    def __init__(self, filename):
        """Open ``filename`` and initialize the document with an empty password."""
        self.__filename = filename

        self.__fp = open(self.__filename, 'rb')
        try:
            parser = PDFParser(self.__fp)
            self.__doc = PDFDocument()
            parser.set_document(self.__doc)
            self.__doc.set_parser(parser)
            self.__doc.initialize('')
        except Exception:
            # Do not leak the handle when the document cannot be set up.
            self.__fp.close()
            raise

    def close(self):
        """Close the underlying PDF file."""
        self.__fp.close()

    def writeToTxt(self):
        """Write the extracted text next to the PDF, with a .txt extension."""
        import os

        text = self.getString()
        # Swap only the extension: a plain str.replace(".pdf", ".txt") leaves
        # names without a lowercase ".pdf" unchanged, which would make the
        # output path equal to the input PDF and overwrite it.
        txt_path = os.path.splitext(self.__filename)[0] + ".txt"
        with open(txt_path, "w") as txtFile:
            txtFile.write(text.encode('ascii','replace').decode("utf-8"))

    def getString(self):
        """Return the text of all pages as produced by ``TextConverter``."""
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        string = StringIO()
        device = TextConverter(rsrcmgr, string, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in self.__doc.get_pages():
            interpreter.process_page(page)
        return string.getvalue()
def extractContent(file):
    """Return the text of the PDF at path ``file``, encoded as UTF-8.

    Progress is printed to stdout: a start marker, one "page=<index>" line
    per page and a final "EOF".
    """
    print("extractContent")

    outfp = StringIO.StringIO()
    # The context manager closes the PDF even when a page fails to process.
    with open(file, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)

        rsrcmgr = PDFResourceManager()
        codec = 'UTF-8'
        laparams = LAParams()

        device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams)
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)

            # NOTE: doc.is_extractable is deliberately not checked here.
            for i, page in enumerate(doc.get_pages()):
                print("page=" + str(i))
                if page is not None:
                    interpreter.process_page(page)
            print("EOF")
        finally:
            device.close()

    return outfp.getvalue()
Example #15
0
    def pdf_function(pdf_doc, password='', *args, **kwargs):
        """Open ``pdf_doc`` and apply ``function`` to the parsed document.

        ``function(doc, *args, **kwargs)`` is only called when the document
        is extractable.

        Returns the value of ``function``, or None when the document is not
        extractable or an IOError occurred.
        """
        result = None
        try:
            # The context manager closes the file even when parsing or the
            # wrapped function raises.
            with open(pdf_doc, 'rb') as fp:
                # create a parser object associated with the file object
                parser = PDFParser(fp)
                # create a PDFDocument object that stores the document structure
                doc = PDFDocument()
                # connect the parser and document objects
                parser.set_document(doc)
                doc.set_parser(parser)
                # supply the password for initialization
                doc.initialize(password)

                if doc.is_extractable:
                    # apply the function and return the result
                    result = function(doc, *args, **kwargs)
        except IOError:
            # the file doesn't exist or similar problem
            pass
        return result
Example #16
0
    def parse_pdf_pdfminer(self, f, fpath):
        """Extract the text of each page of ``f`` and feed it to ``parse_page``.

        ``self.handler.print_header`` / ``print_footer`` bracket the run.
        Each page's text is passed to ``self.parse_page`` as UTF-8 bytes
        with its 1-based page number. Any exception other than
        KeyboardInterrupt/SystemExit is reported through
        ``self.handler.print_error``.
        """
        try:
            laparams = LAParams()
            laparams.all_texts = True
            rsrcmgr = PDFResourceManager()

            if self.dedup:
                self.dedup_store = set()

            self.handler.print_header(fpath)
            parser = PDFParser(f)
            doc = PDFDocument(caching=True)

            parser.set_document(doc)
            doc.set_parser(parser)
            for page_num, page in enumerate(doc.get_pages(), 1):
                # A fresh buffer per page keeps each page's text separate.
                retstr = StringIO()
                try:
                    device = TextConverter(rsrcmgr, retstr, laparams=laparams)
                    interpreter = PDFPageInterpreter(rsrcmgr, device)
                    interpreter.process_page(page)
                    data = retstr.getvalue()
                    self.parse_page(fpath, bytes(data, 'UTF-8'), page_num)
                finally:
                    retstr.close()
            self.handler.print_footer(fpath)
        except (KeyboardInterrupt, SystemExit):
            raise
        except Exception as e:
            self.handler.print_error(fpath, e)
def initialize_pdf_miner(fh):
    """Set up pdfminer for the open file ``fh``.

    Returns a ``(doc, interpreter, device)`` tuple where ``device`` is a
    ``PDFPageAggregator`` (word_margin 0.0) and ``interpreter`` renders
    pages into it.

    Raises ValueError when the document does not allow text extraction.
    """
    # Create a PDF parser object associated with the file object.
    parser = PDFParser(fh)
    # Create a PDF document object that stores the document structure.
    doc = PDFDocument()
    # Connect the parser and document objects.
    parser.set_document(doc)
    doc.set_parser(parser)
    # Supply the password for initialization.
    # (If no password is set, give an empty string.)
    doc.initialize("")
    # Check if the document allows text extraction. If not, abort.
    if not doc.is_extractable:
        raise ValueError("PDFDocument is_extractable was False.")
    # Create a PDF resource manager object that stores shared resources.
    rsrcmgr = PDFResourceManager()

    # Set parameters for analysis.
    laparams = LAParams()
    laparams.word_margin = 0.0
    # Create a PDF page aggregator object and the interpreter feeding it.
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    return doc, interpreter, device
Example #18
0
class PDFController(object):
    """Hold a PDF file, parse it with pdfminer and search its layout."""

    def __init__(self, fd=None, password=''):
        """Create the pdfminer objects; open ``fd`` right away when given."""
        self.fd = fd
        self.password = password
        self.parsed = False
        self.document = PDFDocument()
        self.laparams = LAParams()
        self.rsrcmgr = PDFResourceManager()
        self.device = PDFPageAggregator(self.rsrcmgr, laparams=self.laparams)
        self.layout = []

        if fd:
            self.open(fd, password)

    def open(self, fd, password=''):
        """Use ``fd`` as the source: a readable object, or a path to open."""
        self.password = password
        # PDFs are binary data, so a path must be opened in binary mode.
        self.fd = fd if hasattr(fd, 'read') else open(fd, 'rb')

    def close(self):
        """Close the source (if any) and mark the controller as not parsed."""
        if self.fd:
            self.fd.close()
            self.fd = None
        self.parsed = False

    def parse(self):
        """Initialize the document and compute the layout once.

        Raises PDFTextExtractionNotAllowed (after closing the source) when
        the document is not extractable.
        """
        parser = PDFParser(self.fd)
        parser.set_document(self.document)
        self.document.set_parser(parser)
        self.document.initialize(self.password)
        if not self.document.is_extractable:
            # Go through close() so self.fd does not keep a closed handle.
            self.close()
            raise PDFTextExtractionNotAllowed

        if not self.layout:
            self.layout = self._get_layout()

        self.parsed = True

    def _get_layout(self):
        """Process every page and return the device result."""
        layout = []
        interpreter = PDFPageInterpreter(self.rsrcmgr, self.device)
        for page in self.document.get_pages():
            interpreter.process_page(page)
            # NOTE(review): each page replaces the previous result, so only
            # the last page's layout is returned -- confirm this is intended.
            layout = self.device.get_result()
        return layout

    def lookup_term(self, term, ignore_case=True):
        """Return the indexes of layout items whose text contains ``term``.

        Items without a ``get_text`` method are skipped.
        """
        def normalise(text):
            return text.lower() if ignore_case else text

        return [i for i, v in enumerate(self.layout)
                if hasattr(v, 'get_text')
                and normalise(term) in normalise(v.get_text())]

    def __del__(self):
        # fd may be None (never opened / already closed) or missing entirely
        # when __init__ failed early.
        fd = getattr(self, 'fd', None)
        if fd is not None:
            fd.close()

    def __repr__(self):
        if self.fd:
            # Not every file-like object has a name (e.g. in-memory streams).
            source = 'Open file "%s"' % getattr(self.fd, 'name', '<stream>')
        else:
            source = 'No file opened'
        return '<PDFController> %s, %s' % (source,
                                             'not parsed' if not self.parsed else 'parsed')
def open_pdf(filename, password=''):
  """Open ``filename`` and return an initialized PDFDocument (caching on).

  The file is left open on success because the returned document is
  connected to a parser reading from it.
  """
  fp = open(filename, 'rb')
  try:
    parser = PDFParser(fp)
    doc = PDFDocument(caching=True)
    parser.set_document(doc)
    doc.set_parser(parser)
    doc.initialize(password)
  except Exception:
    # No document is handed out on failure, so release the file here.
    fp.close()
    raise
  return doc
Example #20
0
def open_pdf(filepath):
	""" Read in a PDF file, create a PDFMiner document object and return it.

	The file is left open on success because the returned document is
	connected to a parser reading from it.
	"""
	fp = open(filepath, 'rb') # Open the file
	try:
		parser = PDFParser(fp) # Create the parser
		doc = PDFDocument() # Create the document object
		parser.set_document(doc)
		doc.set_parser(parser)
		doc.initialize('')
	except Exception:
		# No document is handed out on failure, so release the file here.
		fp.close()
		raise

	return doc
Example #21
0
class PDFScraper(object):
    """Run a PDF through ``converterClass`` and return the text it produced.

    ``isLineStart``, ``cleanTerm`` and ``preProcessLine`` are passed to the
    converter as callbacks and can be overridden, as can ``postProcess``,
    which produces the value returned by ``run``.
    """

    converterClass = TabbedConverter

    def __init__(self, filename, skipStartsWith=None, skipIn=None):
        """Build the converter and interpreter; the PDF is opened in ``prepare``."""
        self.filename = filename
        rsrc = PDFResourceManager()
        self.outfp = StringIO()
        self.converter = self.converterClass(
            rsrc,
            self.outfp,
            codec="utf-8",
            laparams=LAParams(),
            skip_startswith=skipStartsWith or [],
            skip_in=skipIn or [],
            isLineStart=self.isLineStart,
            cleanTerm=self.cleanTerm,
            preProcessLine=self.preProcessLine,
        )
        self.interpreter = PDFPageInterpreter(rsrc, self.converter)

    def isLineStart(self, line):
        """Converter callback; this default always returns False."""
        return False

    def cleanTerm(self, line):
        """Converter callback; this default returns ``line`` unchanged."""
        return line

    def preProcessLine(self, line):
        """Converter callback; this default returns ``line`` unchanged."""
        return line

    def prepare(self):
        """Open the PDF and initialize the document with an empty password."""
        self.doc = PDFDocument()
        self.source = open(self.filename, "rb")
        try:
            parser = PDFParser(self.source)
            parser.set_document(self.doc)
            self.doc.set_parser(parser)
            self.doc.initialize("")
        except Exception:
            # run() only calls finish() once prepare() succeeded.
            self.source.close()
            raise

    def finish(self):
        """Close the converter and the PDF file."""
        self.converter.close()
        self.source.close()

    def postProcess(self):
        """Return everything the converter wrote."""
        return self.outfp.getvalue()

    def run(self):
        """Process every page and return the post-processed output."""
        self.prepare()
        try:
            for page in self.doc.get_pages():
                if page is not None:
                    self.interpreter.process_page(page)
        finally:
            # Release the converter and the file even when a page fails.
            self.finish()
        return self.postProcess()
def getTableOfContents(path, pageNum):
    """Return ``getParsedPage(doc, pageNum)`` for the PDF at ``path``.

    Pages are walked from index 0 until ``pageNum`` is reached; None is
    returned implicitly when the document has no page with that index.
    """
    pdf_handle = open(path, 'rb')
    pdf_parser = PDFParser(pdf_handle)
    document = PDFDocument()
    pdf_parser.set_document(document)
    document.set_parser(pdf_parser)
    # The document is not initialized with a password here.

    for index, _page in enumerate(document.get_pages()):
        if index != pageNum:
            continue
        return getParsedPage(document, pageNum)
Example #23
0
class PDF(list):
    """List of per-page results obtained by running a PDF through pdfminer."""

    def __init__(self, file, password='', just_text=1, check_extractable=True, char_margin=1.0, line_margin=0.1, word_margin=0.1):
        """Parse ``file`` and append the result of processing each page.

        Pages are processed when ``check_extractable`` is false or the
        document reports itself as extractable; ``self.metadata`` is only
        set in that case. A truthy ``just_text`` triggers ``_cleanup()``.
        """
        margins = dict(char_margin=char_margin, line_margin=line_margin, word_margin=word_margin)
        self.parser = PDFParser(file)
        self.laparams = LAParams(**margins)

        # The two supported pdfminer flavours wire parser and document differently.
        if not PYTHON_3:
            self.doc = PDFDocument(self.parser, password)
        else:
            self.doc = PDFDocument()
            self.parser.set_document(self.doc)
            self.doc.set_parser(self.parser)
            self.doc.initialize(password)

        should_extract = (not check_extractable) or self.doc.is_extractable
        if should_extract:
            self.resmgr = PDFResourceManager()
            self.device = TextConverter(self.resmgr, outfp=StringIO(), laparams=self.laparams)
            self.interpreter = PDFPageInterpreter(self.resmgr, self.device)

            pages = self.doc.get_pages() if PYTHON_3 else PDFPage.create_pages(self.doc)
            for page in pages:
                self.append(self.interpreter.process_page(page))
            self.metadata = self.doc.info

        if just_text:
            self._cleanup()

    def _cleanup(self):
        """
        Drop the references to the parsing machinery (device, document,
        parser, resource manager, interpreter) so only the page results
        remain.
        """
        for attribute in ('device', 'doc', 'parser', 'resmgr', 'interpreter'):
            setattr(self, attribute, None)

    def text(self, clean=True):
        """
        Returns the text of the PDF as a single string.
        Options:

          :clean:
            Replaces newlines with spaces and normalises whitespace.
        """
        joined = ''.join(self)
        if not clean:
            return joined
        return utils.normalise_whitespace(joined.replace('\n', ' '))
Example #24
0
def pdfextract(pdfbin):
    """Load ``pdfbin`` into an in-memory stream and attach a PDF parser to it.

    When ``doc.set_parser`` raises ``PDFSyntaxError``, a one-element list
    holding a placeholder record (with the error text in ``"val"``) is
    returned. Nothing is returned explicitly otherwise.
    """
    # Copy the data into a seekable in-memory buffer for the parser
    # (presumably raw PDF bytes -- confirm against callers).
    cin = StringIO.StringIO()
    cin.write(pdfbin)
    cin.seek(0)
    parser = PDFParser(cin)
    doc = PDFDocument()
    parser.set_document(doc)
    try:
        doc.set_parser(parser)
    except PDFSyntaxError, e:
        # NOTE(review): `npage` is not assigned anywhere in this function, so
        # this line fails with NameError unless a module-level `npage`
        # exists -- confirm the intended page number.
        return [{"npage":npage, "x0":0, "y0":0, "x1":0.099, "y1":0.099, "val":"SyntaxError "+str(e)}]
Example #25
0
def pdfMetadata(pathToPDF):
    '''
        From: http://stackoverflow.com/questions/14209214/reading-the-pdf-properties-metadata-in-python

        Return the ``info`` attribute of the document at ``pathToPDF``;
        the file is closed before returning.
    '''
    pdf_doc = PDFDocument()
    with open(pathToPDF, "rb") as handle:
        pdf_parser = PDFParser(handle)
        pdf_parser.set_document(pdf_doc)
        pdf_doc.set_parser(pdf_parser)
        pdf_doc.initialize()
    return pdf_doc.info
Example #26
0
 def get_pdf_num_page(self, pdf):
     """
         Get count page

         Returns the number of pages of the PDF at path ``pdf``, or None
         when the path does not exist.
     """
     if not os.path.exists(pdf):
         return None
     # The context manager closes the file once the pages are counted.
     with open(pdf, 'rb') as fp:
         parser = PDFParser(fp)
         doc = PDFDocument()
         parser.set_document(doc)
         doc.set_parser(parser)
         # Count lazily instead of materialising every page object.
         return sum(1 for _ in doc.get_pages())
Example #27
0
File: pwat.py Project: utero/P-WAT
def readpdf():
    """Print the catalog and info of the PDF named by the global ``pdf``.

    The colour markers ``blue``, ``green`` and ``END`` are module-level
    names as well.
    """
    # The context manager closes the file even when parsing fails.
    with open(pdf, "rb") as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize()
        print(blue + "Info Extraida del PDF:" + "\n" + END)
        print(green)
        print(str(doc.catalog) + "\n")
        print(str(doc.info))
        print(END)
Example #28
0
class PDFText:
	"""Wrapper around a pdfminer document loaded from a file path."""

	def __init__(self, filepath):
		"""Open ``filepath`` and initialize ``self.doc`` from it."""
		# the underlying pdf document
		document = PDFDocument()
		self.doc = document
		handle = open(filepath, 'rb')
		pdf_parser = PDFParser(handle)
		pdf_parser.set_document(document)
		document.set_parser(pdf_parser)
		document.initialize()

	def words(self):
		"""Return an empty list."""
		return list()
Example #29
0
def open_pdf_file(pdf_file, password=''):
    """Return a PDFDocument for the open file ``pdf_file``.

    The document is connected to a parser reading from ``pdf_file`` and
    initialized with ``password`` (an empty string when there is none).
    """
    document = PDFDocument()
    parser = PDFParser(pdf_file)
    # Parser and document must know about each other before initialization.
    parser.set_document(document)
    document.set_parser(parser)
    document.initialize(password)
    return document
Example #30
0
 def __enter__(self):
     """Open ``self.pdf_doc`` and return an initialized PDFDocument.

     The open file is kept on ``self.fp``; the document is initialized
     with ``self.pdf_pwd``.
     """
     # open the pdf file
     self.fp = open(self.pdf_doc, "rb")
     try:
         # create the parser object
         parser = PDFParser(self.fp)
         # create the pdf document object
         doc = PDFDocument()
         # connect the parser to the document object
         parser.set_document(doc)
         doc.set_parser(parser)
         # initialize with the password
         doc.initialize(self.pdf_pwd)
     except Exception:
         # __exit__ is not called when __enter__ raises, so the file must
         # be released here.
         self.fp.close()
         raise
     return doc
Example #31
0
File: cs.py Project: liman21/xinwen
 def changePdfToText(self, filePath):
   """Extract the text of the PDF at ``filePath`` into a sibling .txt file.

   Every layout element that has a ``get_text`` method is printed and
   collected; the collected texts are then written, one per line, to
   ``<filePath without extension>.txt`` (UTF-8). No file is written when
   the PDF yields no text.

   Raises PDFTextExtractionNotAllowed when the document forbids extraction.
   """
   texts = []
   # open in binary read mode; closed automatically afterwards
   with open(filePath, 'rb') as pdf_file:
     # create a PDF parser for the file object
     parser = PDFParser(pdf_file)
     # create the PDF document
     doc = PDFDocument()
     # connect the parser and the document object
     parser.set_document(doc)
     doc.set_parser(parser)
     # initialize without a password
     doc.initialize()
     # refuse documents that do not allow text extraction
     if not doc.is_extractable:
       raise PDFTextExtractionNotAllowed
     # resource manager for shared resources
     rsrcmgr = PDFResourceManager()
     # layout-aggregating device
     laparams = LAParams()
     device = PDFPageAggregator(rsrcmgr, laparams=laparams)
     # page interpreter
     interpreter = PDFPageInterpreter(rsrcmgr, device)
     # process one page at a time
     for page in doc.get_pages():
       interpreter.process_page(page)
       # layout object for the page just processed
       layout = device.get_result()
       for x in layout:
         if hasattr(x, "get_text"):
           results = x.get_text()
           print(results)
           texts.append(results)
   if texts:
     # Write once, after parsing: reopening the output in write mode for
     # every text box would keep only the last one.
     fileNames = os.path.splitext(filePath)
     with open(fileNames[0] + '.txt', 'w', encoding='utf-8') as f:
       f.writelines(text + '\n' for text in texts)
Example #32
0
def parse():
    """Append the text of the PDF named by the global ``path`` to ./2.txt.

    The text of every ``LTTextBoxHorizontal`` element is printed and
    appended (UTF-8), followed by a newline. The output file is opened
    once, so it is created even when the PDF yields no text.

    Raises PDFTextExtractionNotAllowed when the document forbids extraction.
    """
    # open in binary read mode; closed automatically afterwards
    with open(path, 'rb') as fp:
        # create the PDF parser
        parser = PDFParser(fp)
        # create the PDF document
        doc = PDFDocument()

        # connect the parser and the document object
        parser.set_document(doc)
        doc.set_parser(parser)

        # initialize without a password
        doc.initialize()

        # refuse documents that do not allow text extraction
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        # resource manager for shared resources
        rsrcmgr = PDFResourceManager()
        # layout-aggregating device
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        # page interpreter
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # Open the output once instead of once per layout element.
        with open(r'./2.txt', 'a', encoding="UTF-8") as f:
            # process one page at a time
            for page in doc.get_pages():
                interpreter.process_page(page)
                # layout object holding the elements found on this page;
                # only horizontal text boxes are written out
                layout = device.get_result()
                for x in layout:
                    if isinstance(x, LTTextBoxHorizontal):
                        results = x.get_text()
                        print(results)
                        f.write(results + '\n')
Example #33
0
 def noimgpdf_change_word(self, _path):
     """
     Extract the text of an image-free PDF.

     :param _path: path of the PDF file; a value containing 'http://www'
                   is fetched over HTTP with a random User-Agent taken
                   from ``self.user_agent``
     :return: the concatenated text of all horizontal text boxes, or None
              when anything goes wrong (including a document that does
              not allow text extraction)
     """
     fp = None
     try:
         if 'http://www' in _path:
             request = Request(
                 url=_path,
                 headers={'User-Agent': random.choice(self.user_agent)})
             fp = urlopen(request)  # open the online PDF document
         else:
             fp = open(_path, 'rb')  # open the local PDF document
         parser = PDFParser(fp)
         doc = PDFDocument()
         parser.set_document(doc)
         doc.set_parser(parser)
         doc.initialize()
         if not doc.is_extractable:
             raise PDFTextExtractionNotAllowed
         rsrcmgr = PDFResourceManager()
         laparams = LAParams()
         device = PDFPageAggregator(rsrcmgr, laparams=laparams)
         interpreter = PDFPageInterpreter(rsrcmgr, device)
         chunks = []
         for page in doc.get_pages():
             interpreter.process_page(page)
             layout = device.get_result()
             for out in layout:
                 if isinstance(out, LTTextBoxHorizontal):
                     chunks.append(out.get_text())
         return ''.join(chunks)
     except Exception:
         # Deliberate best effort: any failure is reported as None.
         # (KeyboardInterrupt/SystemExit are no longer swallowed.)
         return None
     finally:
         if fp is not None:
             fp.close()
Example #34
0
def readPDF(path, toPath):
    """Append the text of the PDF at ``path`` to the file ``toPath``.

    The text of every ``LTTextBoxHorizontal`` element is printed and
    appended, followed by a newline. The output file is opened once, so it
    is created even when the PDF yields no text.

    Raises PDFTextExtractionNotAllowed when the document forbids extraction.
    """
    # open the PDF in binary mode; closed automatically afterwards
    with open(path, "rb") as pdf_handle:
        # create the PDF parser
        parser = PDFParser(pdf_handle)
        # create the PDF document
        pdfFile = PDFDocument()
        # connect the parser and the document object
        parser.set_document(pdfFile)
        pdfFile.set_parser(parser)
        # initialize without a password
        pdfFile.initialize()

        # refuse documents that do not allow text extraction
        if not pdfFile.is_extractable:
            raise PDFTextExtractionNotAllowed

        # resource manager
        manager = PDFResourceManager()
        # layout-aggregating device
        laparams = LAParams()
        device = PDFPageAggregator(manager, laparams=laparams)
        # page interpreter
        interpreter = PDFPageInterpreter(manager, device)
        # Open the output once instead of once per text box.
        with open(toPath, "a") as out:
            # process one page at a time
            for page in pdfFile.get_pages():
                interpreter.process_page(page)
                # layout of the page just processed
                layout = device.get_result()
                for x in layout:
                    if isinstance(x, LTTextBoxHorizontal):
                        text = x.get_text()
                        print(text)
                        out.write(text + "\n")
Example #35
0
def parse(path):
    """Return the first matching text box of the PDF at ``path``.

    A horizontal text box matches when its text is longer than 6
    characters and contains '表' at an index greater than 0. The match is
    returned with newlines removed; "" is returned when nothing matches.

    Raises PDFTextExtractionNotAllowed when the document forbids extraction.
    """
    # open in binary read mode; closed on every exit path
    with open(path, 'rb') as fp:
        # create a PDF parser for the file object
        parser = PDFParser(fp)
        # create the PDF document
        doc = PDFDocument()
        # connect the parser and the document object
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize()

        # refuse documents that do not allow text extraction
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        # resource manager for shared resources
        rsrcmgr = PDFResourceManager()
        # layout-aggregating device
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        # page interpreter
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # process one page at a time
        for page in doc.get_pages():
            interpreter.process_page(page)
            # layout object holding the elements found on this page
            layout = device.get_result()
            for x in layout:
                if not isinstance(x, LTTextBoxHorizontal):
                    continue
                results = x.get_text()
                # NOTE(review): find(...) > 0 skips a match at index 0 --
                # confirm whether >= 0 was intended.
                if results is not None and len(results) > 6 and results.find('表') > 0:
                    return results.replace('\n', '')
    return ""
Example #36
0
 def pdfparse(url, name):
     """Download the PDF at ``url`` and return its text.

     The response body is saved in the current working directory as
     ``<name up to its first dot>.pdf`` and then parsed. The text of every
     layout element that has a ``get_text`` method is concatenated, with
     leading and trailing newlines stripped from each piece.

     Raises PDFTextExtractionNotAllowed when the document forbids extraction.
     """
     res = s.get(url, headers={"user-agent": generate_user_agent()})
     # os.path.join instead of a hard-coded backslash keeps the path valid
     # on every platform.
     path1 = os.path.join(os.getcwd(), "%s.pdf" % name.split(".")[0])
     with open(path1, 'wb') as f:
         f.write(res.content)
     # Keep the file open until all pages are processed: closing it before
     # the document is initialized and read breaks the parser.
     with open(path1, 'rb') as f:
         parser = PDFParser(f)
         doc = PDFDocument()
         parser.set_document(doc)
         doc.set_parser(parser)
         doc.initialize()
         if not doc.is_extractable:
             raise PDFTextExtractionNotAllowed
         # resource manager for shared resources
         rsrcmgr = PDFResourceManager()
         # layout-aggregating device
         laparams = LAParams()
         device = PDFPageAggregator(rsrcmgr, laparams=laparams)
         # page interpreter
         interpreter = PDFPageInterpreter(rsrcmgr, device)
         pieces = []
         # process one page at a time
         for page in doc.get_pages():
             interpreter.process_page(page)
             # layout object holding the elements found on this page
             layout = device.get_result()
             for x in layout:
                 # Skip elements that do not provide get_text at all.
                 if not hasattr(x, "get_text"):
                     continue
                 results = x.get_text()
                 if results:
                     pieces.append(results.strip('\n'))
     return ''.join(pieces)
Example #37
0
def parse(pdf_path, toPath):
    """Append the text of every horizontal text box of a PDF to a text file.

    Args:
        pdf_path: Path of the PDF to read.
        toPath: Path of the UTF-8 text file that is appended to; each text
            box is followed by a newline.  The file is only touched when a
            page actually contains a text box.

    Raises:
        PDFTextExtractionNotAllowed: If the document is not extractable.
    """
    # The context manager closes the PDF even when extraction fails.
    with open(pdf_path, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        # Parser and document reference each other.
        parser.set_document(doc)
        doc.set_parser(parser)
        # No password supplied.
        doc.initialize()
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for page in doc.get_pages():
            interpreter.process_page(page)
            boxes = [x for x in device.get_result()
                     if isinstance(x, LTTextBoxHorizontal)]
            if not boxes:
                continue
            # One open per page instead of one per text box.
            with open(toPath, 'a', encoding="utf-8") as out:
                for box in boxes:
                    out.write(box.get_text() + '\n')
Example #38
0
def PDFreader(pdfPATH, TXTname=""):
    """Extract the text of a PDF and hand it to ``writeFile``.

    Args:
        pdfPATH: Path of the PDF to read.
        TXTname: Destination name passed to ``writeFile``.  When empty, the
            PDF path with its extension replaced by ``.txt`` is used.

    The text of every layout object that exposes ``get_text`` is collected
    and joined with newlines before being written.
    """
    import os

    # The context manager closes the PDF even when extraction fails.
    with open(pdfPATH, "rb") as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        # Initialise with an empty password.
        doc.initialize("")

        resource = PDFResourceManager()
        device = PDFPageAggregator(resource, laparams=LAParams())
        interpreter = PDFPageInterpreter(resource, device)

        texts = []
        for page in doc.get_pages():
            interpreter.process_page(page)
            texts.extend(item.get_text() for item in device.get_result()
                         if hasattr(item, "get_text"))

    TXTstr = "\n".join(texts)
    if TXTname == "":
        # splitext only swaps the real extension; str.replace(".pdf", ...)
        # left names such as "x.PDF" unchanged, which made the output path
        # equal to the input PDF.
        TXTname = os.path.splitext(pdfPATH)[0] + ".txt"
    writeFile(TXTname, TXTstr)
Example #39
0
    def readPDF(self, path, callback=None, toPath=''):
        """Read a PDF and deliver the text of each horizontal text box.

        Args:
            path: Path of the PDF to read.
            callback: Optional callable invoked with each text box's text.
                When omitted, the text is printed instead.
            toPath: When non-empty, text is neither printed nor passed to
                ``callback``; only a status message is printed per text box.

        Raises:
            PDFTextExtractionNotAllowed: If the document is not extractable.
        """
        # The context manager closes the PDF even when extraction fails.
        with open(path, 'rb') as f:
            parser = PDFParser(f)
            pdfFile = PDFDocument()
            # Parser and document reference each other.
            parser.set_document(pdfFile)
            pdfFile.set_parser(parser)
            # Initialise with an empty password.
            pdfFile.initialize('')
            if not pdfFile.is_extractable:
                raise PDFTextExtractionNotAllowed

            manager = PDFResourceManager()
            device = PDFPageAggregator(manager, laparams=LAParams())
            interpreter = PDFPageInterpreter(manager, device)

            # One page at a time; only text is handled, images are ignored.
            for page in pdfFile.get_pages():
                interpreter.process_page(page)
                for x in device.get_result():
                    if not isinstance(x, LTTextBoxHorizontal):
                        continue
                    if toPath != '':
                        # NOTE(review): writing to toPath is not implemented;
                        # this branch only reports that it would write.
                        print('将PDF数据写入文件')
                        continue
                    text = x.get_text()
                    if callback is not None:
                        callback(text)
                    else:
                        print(text)
Example #40
0
def parse():
    """Write the text of every horizontal text box of a PDF to a text file.

    Reads the module-level ``pdf_path`` and writes to the module-level
    ``txt_path``, replacing any previous content of that file.

    Raises:
        PDFTextExtractionNotAllowed: If the document is not extractable.
    """
    # The context manager closes the PDF even when extraction fails.
    with open(pdf_path, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        # Parser and document reference each other.
        parser.set_document(doc)
        doc.set_parser(parser)
        # No password supplied.
        doc.initialize()
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # Open the output once, before the page loop.  The original reopened
        # it in 'w' mode for every page, so each page truncated the text of
        # the previous one and only the last page survived.
        with open(txt_path, 'w') as out:
            for page in doc.get_pages():
                interpreter.process_page(page)
                for x in device.get_result():
                    if isinstance(x, LTTextBoxHorizontal):
                        out.write(x.get_text())
Example #41
0
def with_pdf(pdf_doc, pdf_pwd, fn, *args):
    """Open a PDF, apply ``fn`` to the parsed document and return its result.

    Args:
        pdf_doc: Path of the PDF file.
        pdf_pwd: Password passed to ``doc.initialize``.
        fn: Callable invoked as ``fn(doc, *args)``.
        *args: Extra positional arguments forwarded to ``fn``.

    Returns:
        Whatever ``fn`` returns, or ``None`` when the document is not
        extractable (``fn`` is not called in that case).
    """
    # The context manager closes the file even when initialize() or fn()
    # raises; the original leaked the handle on any exception.
    with open(pdf_doc, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        # Parser and document reference each other.
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize(pdf_pwd)
        if doc.is_extractable:
            return fn(doc, *args)
    return None
 def getData(self):
     """Load the Info dictionary of ``self.fname`` into the instance.

     The document is opened with ``self.password``.  For every xref whose
     trailer has an ``Info`` entry, the resolved dictionary is stored in
     both ``self.metadata`` and ``self.raw`` (a later xref overrides an
     earlier one).

     Returns:
         ``"ok"`` when an Info dictionary was stored, ``"error"`` when none
         was found or anything failed while reading the document.
     """
     try:
         # open() instead of the Python-2-only file() builtin; the context
         # manager closes the handle on every path.
         with open(self.fname, 'rb') as fp:
             doc = PDFDocument()
             parser = PDFParser(fp)
             parser.set_document(doc)
             doc.set_parser(parser)
             doc.initialize(self.password)
             # Result is unused; kept because a document without a
             # 'Metadata' catalog entry was reported as "error" before.
             resolve1(doc.catalog['Metadata'])

             # Resolve Info while the file is still open: the original
             # closed parser and file first, then resolved the reference.
             found = False
             for xref in doc.xrefs:
                 info_ref = xref.trailer.get('Info')
                 if not info_ref:
                     # The original hit an unbound ``info`` here.
                     continue
                 info = resolve1(info_ref)
                 self.metadata = info
                 self.raw = info
                 found = True
             parser.close()
         return "ok" if found else "error"
     except Exception:
         return "error"
Example #43
0
    def _make_pages(self, fp):
        """Build the pdfminer objects needed to process the PDF in ``fp``.

        The document is initialised with an empty password and its pages are
        materialised into a list.

        Returns:
            The tuple ``(device, interpreter, pages, rsrcmgr)``.
        """
        # Parser and document are wired to each other before initialisation.
        pdf_parser = PDFParser(fp)
        pdf_doc = PDFDocument()
        pdf_parser.set_document(pdf_doc)
        pdf_doc.set_parser(pdf_parser)
        pdf_doc.initialize("")

        # Shared resources, then the aggregating device (default layout
        # parameters) and the interpreter that feeds it.
        resources = PDFResourceManager()
        aggregator = PDFPageAggregator(resources, laparams=LAParams())
        page_interpreter = PDFPageInterpreter(resources, aggregator)

        return (
            aggregator,
            page_interpreter,
            list(pdf_doc.get_pages()),
            resources,
        )
Example #44
0
    def extract_text_from_pdf(self):
        """Return the text of pages ``page_beg``..``page_end`` of the PDF.

        The file read is ``self.filepath_in + '/' + self.nom_fichier``.  When
        ``self.page_end`` is 0 it is set to ``self.page_beg`` (single page).
        Page numbers are 1-based.

        Returns:
            A string in which each page starts with an
            ``"EXTRACTION DE LA PAGE <n>"`` header followed by the text of
            its text boxes/lines, each followed by a newline.
        """
        if self.page_end == 0:
            self.page_end = self.page_beg

        # The context manager closes the PDF even when extraction fails.
        with open(self.filepath_in + '/' + self.nom_fichier, 'rb') as fp:
            parser = PDFParser(fp)
            doc = PDFDocument()
            parser.set_document(doc)
            doc.set_parser(parser)
            doc.initialize('')

            rsrcmgr = PDFResourceManager()
            laparams = LAParams()
            # 2.0 by default: two chars closer than this are considered
            # contiguous and get grouped into one.
            laparams.char_margin = 4.0
            # 0.1 by default: a space is inserted when two words are farther
            # apart than this.
            laparams.word_margin = 0.3
            # 0.5 by default: lines closer than this are grouped.
            laparams.line_margin = 0.5

            device = PDFPageAggregator(rsrcmgr, laparams=laparams)
            interpreter = PDFPageInterpreter(rsrcmgr, device)

            pages = list(doc.get_pages())
            # Collect pieces and join once instead of repeated "+=".
            parts = []
            for i in range(self.page_beg - 1, self.page_end):
                page = pages[i]
                parts.append("EXTRACTION DE LA PAGE " + str(i + 1) + "\n\n")
                interpreter.process_page(page)
                for lt_obj in device.get_result():
                    if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                        parts.append(lt_obj.get_text())
                        parts.append("\n")

        return ''.join(parts)
Example #45
0
def parsePDF(pathPDF, pathText, fname):
    """Convert ``pathPDF/fname`` to a UTF-8 text file under ``pathText``.

    The output name is ``fname`` with its last four characters replaced by
    ``.txt``.  The text of every text box/line is written page by page, and
    each page is followed by a line of 100 ``=`` characters.

    Args:
        pathPDF: Directory containing the PDF.
        pathText: Directory receiving the text file.
        fname: File name of the PDF.
    """
    out_path = str(os.path.join(pathText, fname))[0:-4] + '.txt'
    # Both files are closed (and the output flushed) by the context
    # managers; the original never closed either of them.
    with open(out_path, 'w+', encoding='utf-8') as outfile:
        with open(str(os.path.join(pathPDF, fname)), 'rb') as fp:
            parser = PDFParser(fp)
            doc = PDFDocument()
            parser.set_document(doc)
            doc.set_parser(parser)
            doc.initialize('')
            rsrcmgr = PDFResourceManager()
            device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            # Process each page contained in the document.
            for page in doc.get_pages():
                interpreter.process_page(page)
                for lt_obj in device.get_result():
                    if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                        outfile.write(lt_obj.get_text())
                # Page separator.
                outfile.write('=' * 100 + '\n')
Example #46
0
def pdf_to_string(pdf_file):
    """Return the text of every text box/line of a PDF as one string.

    Args:
        pdf_file: Path of the PDF to read.

    Layout analysis uses ``line_margin`` and ``word_margin`` of 0.3.
    """
    # The context manager closes the PDF even when extraction fails.
    with open(pdf_file, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize('')

        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        laparams.line_margin = 0.3
        laparams.word_margin = 0.3
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # Collect pieces and join once instead of repeated "+=".
        parts = []
        for page in doc.get_pages():
            interpreter.process_page(page)
            parts.extend(
                lt_obj.get_text() for lt_obj in device.get_result()
                if isinstance(lt_obj, (LTTextBox, LTTextLine)))

    return ''.join(parts)
Example #47
0
def parsePDFByURLandTokenize_PDFMiner(url):
    """Download a PDF, extract its text and return it tokenized.

    Args:
        url: Address of the PDF, fetched with ``urllib.request.urlopen``.

    Returns:
        The result of ``word_tokenize`` applied to the concatenated text of
        every text box/line in the document.
    """
    # Close the HTTP response as soon as the body has been read.
    with urllib.request.urlopen(url) as response:
        data = response.read()

    # read() always returns bytes, so the original "is not None" guard (and
    # its "return None" fallback) could never trigger and is dropped.
    parser = PDFParser(io.BytesIO(data))
    doc = PDFDocument()
    parser.set_document(doc)
    doc.set_parser(parser)
    doc.initialize('')
    rsrcmgr = PDFResourceManager()
    # Default layout parameters.  The original looped over parameter names
    # and looked them up in locals(); no such locals existed, so that loop
    # never changed anything and is omitted.
    laparams = pdfminer.layout.LAParams()

    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    parts = []
    for page in doc.get_pages():
        interpreter.process_page(page)
        parts.extend(
            lt_obj.get_text() for lt_obj in device.get_result()
            if isinstance(lt_obj, (LTTextBox, LTTextLine)))

    return word_tokenize(''.join(parts))
Example #48
0
def readPDF(path, topath):
    """Append the text of every horizontal text box of a PDF to a text file.

    Args:
        path: Path of the PDF to read.
        topath: Path of the text file that is appended to; each text box is
            followed by a newline.

    Raises:
        PDFTextExtractionNotAllowed: If the document is not extractable.
    """
    # The context manager closes the PDF even when extraction fails.
    with open(path, 'rb') as pdf_fp:
        parser = PDFParser(pdf_fp)
        pdfFile = PDFDocument()
        # Parser and document reference each other.
        parser.set_document(pdfFile)
        pdfFile.set_parser(parser)
        # No password supplied.
        pdfFile.initialize()
        if not pdfFile.is_extractable:
            raise PDFTextExtractionNotAllowed

        manager = PDFResourceManager()
        device = PDFPageAggregator(manager, laparams=LAParams())
        interpreter = PDFPageInterpreter(manager, device)

        # One page at a time.
        for page in pdfFile.get_pages():
            interpreter.process_page(page)
            boxes = [x for x in device.get_result()
                     if isinstance(x, LTTextBoxHorizontal)]
            if not boxes:
                continue
            # The original referenced an undefined name ``toPath`` here
            # instead of the ``topath`` parameter, and rebound the PDF
            # handle ``f`` to the output file.
            with open(topath, 'a') as out:
                for box in boxes:
                    out.write(box.get_text() + "\n")
Example #49
0
def parse_pdf(path):
    """Return the value following the ``准考证号:`` label in a PDF.

    Args:
        path: Path of the PDF to read.

    Returns:
        The text after the label in the first horizontal text box that
        starts with it, with newlines removed, or ``None`` when no text box
        starts with the label.

    Raises:
        PDFTextExtractionNotAllowed: If the document is not extractable.
    """
    prefix = "准考证号:"
    # The context manager closes the PDF on every path, including the early
    # return below.
    with open(path, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize()
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in doc.get_pages():
            interpreter.process_page(page)
            for x in device.get_result():
                if not isinstance(x, LTTextBoxHorizontal):
                    continue
                results = x.get_text()
                if results.startswith(prefix):
                    return results[len(prefix):].replace("\n", "")
    return None
Example #50
0
def parse(path):
    """Print and save the text of every horizontal text box of a PDF.

    The output goes to ``./res/<stem>.txt`` (UTF-8, appended), where
    ``<stem>`` is the last ``/``-separated component of ``path`` without its
    extension.  The ``res`` directory must exist before calling.

    Args:
        path: Path of the PDF to read.

    Raises:
        PDFTextExtractionNotAllowed: If the document is not extractable.
    """
    # Derive the output name from the argument.  The original read it from
    # the globals ``file[i]``, which only matched ``path`` while the caller's
    # loop state happened to line up.
    res_name = './res/' + os.path.splitext(path.split('/')[-1])[0] + '.txt'

    # The context manager closes the PDF even when extraction fails.
    with open(path, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize()
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for page in doc.get_pages():
            interpreter.process_page(page)
            boxes = [x for x in device.get_result()
                     if isinstance(x, LTTextBoxHorizontal)]
            if not boxes:
                continue
            # One open per page instead of one per text box.
            with open(res_name, 'a', encoding='utf-8') as out:
                for box in boxes:
                    results = box.get_text()
                    print(results)
                    out.write(results + '\n')
Example #51
0
def parse(file_name, target_name):
    """Translate the text boxes of a PDF and append them to a text file.

    Args:
        file_name: Path of the PDF to read.
        target_name: Path of the text file that is appended to; each
            translated text box is followed by a newline.

    Each horizontal text box is passed through ``translate`` (defined
    elsewhere) before being written.  Progress is printed as
    ``page: <n>`` with 1-based page numbers.

    Raises:
        PDFTextExtractionNotAllowed: If the document is not extractable.
    """
    # The context manager closes the PDF even when extraction fails.
    with open(file_name, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize()
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for page_number, page in enumerate(doc.get_pages(), 1):
            print('page: ' + str(page_number))
            interpreter.process_page(page)
            # The layout holds the objects parsed from this page; only
            # horizontal text boxes are translated.
            boxes = [x for x in device.get_result()
                     if isinstance(x, LTTextBoxHorizontal)]
            if not boxes:
                continue
            # One open per page instead of one per text box.
            with open(target_name, 'a') as out:
                for box in boxes:
                    out.write(translate(box.get_text()) + '\n')
Example #52
0
def parse(inpath, outpath):
    """Parse a PDF page by page and pass each layout to ``parse_section``.

    Before parsing, ``TMPDIR`` is removed and recreated and ``outpath`` is
    removed (via the ``remove`` helper defined elsewhere).

    Args:
        inpath: Path of the PDF to read.
        outpath: Output path forwarded to ``parse_section`` for every page.

    Raises:
        PDFTextExtractionNotAllowed: If the document is not extractable.
    """
    remove(TMPDIR)  # clear the temporary directory
    os.mkdir(TMPDIR)
    remove(outpath)  # clear the previous output

    # The context manager closes the PDF even when parsing fails.
    with open(inpath, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        # Parser and document reference each other.
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize()
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for idx, page in enumerate(doc.get_pages()):
            interpreter.process_page(page)
            layout = device.get_result()
            print("parse", idx)
            parse_section(layout, outpath)
Example #53
0
def parse(out_path='/Users/liamtheron/Desktop/Deloiite/test.txt'):
    """Append the text of every horizontal text box of a PDF to a text file.

    The PDF is read from the module-level ``path``.

    Args:
        out_path: Text file that is appended to; each text box is followed
            by a newline.  Defaults to the location that was previously
            hard-coded, so existing ``parse()`` calls behave as before.

    Raises:
        PDFTextExtractionNotAllowed: If the document is not extractable.
    """
    # The context manager closes the PDF even when extraction fails.
    with open(path, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize()
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in doc.get_pages():
            interpreter.process_page(page)
            boxes = [x for x in device.get_result()
                     if isinstance(x, LTTextBoxHorizontal)]
            if not boxes:
                continue
            # One open per page instead of one per text box.
            with open(out_path, 'a') as out:
                for box in boxes:
                    out.write(box.get_text())
                    out.write('\n')
Example #54
0
def pdf2txt(path):
    """Return the text of every text box/line of a PDF as one string.

    Args:
        path: Path of the PDF to read.

    Default layout parameters are used.
    """
    # The context manager closes the PDF even when extraction fails; the
    # original only closed it on the success path.
    with open(path, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize('')
        rsrcmgr = PDFResourceManager()
        # The original looped over layout-parameter names and looked them up
        # in locals(); no such locals existed, so the defaults were always
        # kept.  That no-op loop is omitted.
        laparams = LAParams()

        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # Collect pieces and join once instead of repeated "+=".
        parts = []
        for page in doc.get_pages():
            interpreter.process_page(page)
            parts.extend(
                lt_obj.get_text() for lt_obj in device.get_result()
                if isinstance(lt_obj, (LTTextBox, LTTextLine)))

    return ''.join(parts)
Example #55
0
def convert_pdf_2_text(path, name):
    """Print and save the text of every horizontal text box of a PDF.

    Args:
        path: Prefix concatenated directly with ``name`` (no separator is
            inserted) to form the PDF path.
        name: File name of the PDF.  The output file is ``path`` plus
            ``name`` with its last four characters replaced by ``.txt``.

    Text is appended to the output file, each text box followed by a
    newline.

    Raises:
        PDFTextExtractionNotAllowed: If the document is not extractable.
    """
    txt_path = path + name[:-4] + ".txt"
    # The original passed an anonymous open() to the parser and never closed
    # it; the context manager closes it on every path.
    with open(path + name, "rb") as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize()
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in doc.get_pages():
            interpreter.process_page(page)
            boxes = [x for x in device.get_result()
                     if isinstance(x, LTTextBoxHorizontal)]
            if not boxes:
                continue
            # One open per page instead of one per text box.
            with open(txt_path, 'a') as out:
                for box in boxes:
                    results = box.get_text()
                    print(results)
                    out.write(results + "\n")
def process(path):
    """Count category matches over all words of a PDF.

    Every whitespace-separated, lower-cased word of every horizontal text
    box is passed to ``count_word`` together with each of the module-level
    collections ``auditor``, ``currency``, ``datesand``, ``generic``,
    ``genericlong``, ``geographic`` and ``names``; the results are summed
    per collection.

    Args:
        path: Path of the PDF to read.

    Returns:
        ``[aud, cur, dat, gen, genlong, geo, nam]`` -- the seven totals in
        that order.

    Raises:
        PDFTextExtractionNotAllowed: If the document is not extractable.
    """
    aud = cur = dat = gen = genlong = geo = nam = 0

    # The original closed the file right after initialize(), before any page
    # was read; keep it open until all pages have been processed.
    with open(path, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize()
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for page in doc.get_pages():
            interpreter.process_page(page)
            for x in device.get_result():
                if not isinstance(x, LTTextBoxHorizontal):
                    continue
                # ``words`` instead of shadowing the builtin ``list``.
                words = x.get_text().lower().split()
                for part in words:
                    aud += count_word(part, auditor)
                    cur += count_word(part, currency)
                    dat += count_word(part, datesand)
                    gen += count_word(part, generic)
                    genlong += count_word(part, genericlong)
                    geo += count_word(part, geographic)
                    nam += count_word(part, names)
    return [aud, cur, dat, gen, genlong, geo, nam]
def Pdf_generation_TF(f, qaStatus=False):
    """Extract words (and sentence fragments) from a PDF file object.

    Args:
        f: Open file object handed to ``PDFParser``.
        qaStatus: When true, print ``'from PDF '`` and return the list of
            sentence fragments instead of the term-frequency result.

    Returns:
        ``allSentances`` (list of strings split on ``'.'``) when
        ``qaStatus`` is true; otherwise ``calc_TF(words, word_count)`` where
        ``words`` are the whitespace-separated words of every text box/line.
    """
    allSentances = []
    NativeallWords = []

    parser = PDFParser(f)
    doc = PDFDocument()
    parser.set_document(doc)
    doc.set_parser(parser)
    doc.initialize('')
    rsrcmgr = PDFResourceManager()
    device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # Process each page contained in the document.
    for page in doc.get_pages():
        interpreter.process_page(page)
        for lt_obj in device.get_result():
            if not isinstance(lt_obj, (LTTextBox, LTTextLine)):
                continue
            # get_text() is evaluated once per object; the original called
            # it three times and discarded an .encode("utf8") result.
            NativeallWords.extend(lt_obj.get_text().split())

            # NOTE(review): sentences are cut out of str(lt_obj), not out of
            # get_text().  The fixed offset 55 and the literal backslash-n
            # search presumably rely on the exact repr format of the layout
            # object -- confirm before changing.
            rawSentance = str(lt_obj)
            endPoint = rawSentance.rfind("\\n") - 1
            rawSentance = (rawSentance[55:endPoint]
                           .replace('\\n', '')
                           .replace('\\s', ''))
            allSentances.extend(rawSentance.split('.'))

    if qaStatus:
        print('from PDF ')
        return allSentances
    # The word count equals the number of collected words.
    return calc_TF(NativeallWords, len(NativeallWords))
Example #58
0
def dumpoutline(outfp,
                fname,
                objids,
                pagenos,
                password='',
                dumpall=False,
                codec=None):
    """Write the outline (table of contents) of a PDF to ``outfp``.

    One line is written per outline entry: the ``repr`` of
    ``(level, title, dest, pageno)``, where ``pageno`` is the 0-based index
    of the destination page or ``None`` when the entry has no destination.

    Args:
        outfp: Writable text stream receiving the output.
        fname: Path of the PDF to read.
        objids, pagenos, dumpall, codec: Accepted but not used by this
            function.
        password: Password passed to ``doc.initialize``.
    """
    doc = PDFDocument()
    # open() instead of the Python-2-only file() builtin; the context
    # manager and the finally clause release file and parser even when
    # reading the outline raises.
    with open(fname, 'rb') as fp:
        parser = PDFParser(doc, fp)
        try:
            doc.initialize(password)
            # Map each page's object id to its position in the document.
            pages = {
                page.pageid: pageno
                for (pageno, page) in enumerate(doc.get_pages())
            }
            for (level, title, dest, a, se) in doc.get_outlines():
                pageno = None
                if dest:
                    dest = resolve1(doc.lookup_name('Dests', dest))
                    if isinstance(dest, dict):
                        dest = dest['D']
                    pageno = pages[dest[0].objid]
                outfp.write(repr((level, title, dest, pageno)) + '\n')
        finally:
            parser.close()
    return
Example #59
0
def getTextFromFirstPage(filename):
    """Return the text of the first page of a PDF.

    Args:
        filename: Path of the PDF to read.

    Returns:
        The concatenated text of every text box/line on the first page, or
        an empty string when the document has no pages.  Layout analysis
        uses ``char_margin`` and ``word_margin`` of 1.0.
    """
    # The context manager closes the PDF even when extraction fails; the
    # original only closed it on the success path.
    with open(filename, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize('')

        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        laparams.char_margin = 1.0
        laparams.word_margin = 1.0
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        parts = []
        for page in doc.get_pages():
            interpreter.process_page(page)
            parts.extend(
                lt_obj.get_text() for lt_obj in device.get_result()
                if isinstance(lt_obj, (LTTextBox, LTTextLine)))
            # Only the first page is wanted.
            break

    return ''.join(parts)
Example #60
0
def parse(pdf_path, txt_path):
    """Print and save the text of every horizontal text box of a PDF.

    Args:
        pdf_path: Path of the PDF to read.
        txt_path: Path of the text file that is appended to; each text box
            is followed by a newline.

    Raises:
        PDFTextExtractionNotAllowed: If the document is not extractable (a
            short message is printed first).
    """
    # The context manager closes the PDF even when extraction fails.
    with open(pdf_path, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize()

        if not doc.is_extractable:
            print('走了')
            raise PDFTextExtractionNotAllowed

        mgr = PDFResourceManager()
        device = PDFPageAggregator(mgr, laparams=LAParams())
        interpreter = PDFPageInterpreter(mgr, device)

        for page in doc.get_pages():
            interpreter.process_page(page)
            boxes = [x for x in device.get_result()
                     if isinstance(x, LTTextBoxHorizontal)]
            if not boxes:
                continue
            # One open per page instead of one per text box.
            with open(txt_path, 'a') as out:
                for box in boxes:
                    results = box.get_text()
                    print(results)
                    out.write(results + "\n")