Beispiel #1
0
    def convert(self, data):
        """Extract the text content of a PDF supplied as raw data.

        Args:
            data: The PDF file contents; wrapped in a ``StringIO`` so the
                parser can read it like a file.

        Returns:
            The result of ``converter.get_text()`` once every page has been
            processed, or ``''`` when the document rejects the empty
            password or is not marked as extractable.
        """
        # convert binary pdf data into a file like structure
        pdfdata = StringIO(data)
        try:
            # The original author noted that this call is required without
            # knowing why; it is kept unchanged.
            CMapDB.initialize('CMap', 'CDBCMap')

            # create the converter and resource manager
            rsrc = PDFResourceManager()
            converter = TextConverter(rsrc)
            try:
                # setup the parser
                doc = PDFDocument()
                # NOTE(review): the name is never read again; presumably
                # constructing the parser attaches it to ``doc`` -- confirm
                # against the pdfminer version in use before removing it.
                parser = PDFParser(doc, pdfdata)

                # initialize the pdf with an empty password
                try:
                    doc.initialize('')
                except PDFPasswordIncorrect:
                    return ''

                # check if we can extract the contents of this file
                if not doc.is_extractable:
                    return ''

                # do the conversion
                interpreter = PDFPageInterpreter(rsrc, converter)
                for page in doc.get_pages():
                    interpreter.process_page(page)
            finally:
                # Runs on the early returns and on errors too, so the
                # converter is never left open.
                converter.close()
        finally:
            pdfdata.close()

        # Same order as before on success: both resources are closed first,
        # then the collected text is read back.
        return converter.get_text()
Beispiel #2
0
    def convert(self, data):
        """Extract the text content of a PDF supplied as raw data.

        Args:
            data: The PDF file contents; wrapped in a ``StringIO`` so the
                parser can read it like a file.

        Returns:
            The result of ``converter.get_text()`` once every page has been
            processed, or ``''`` when the document rejects the empty
            password or is not marked as extractable.
        """
        # convert binary pdf data into a file like structure
        pdfdata = StringIO(data)
        try:
            # The original author noted that this call is required without
            # knowing why; it is kept unchanged.
            CMapDB.initialize('CMap', 'CDBCMap')

            # create the converter and resource manager
            rsrc = PDFResourceManager()
            converter = TextConverter(rsrc)
            try:
                # setup the parser
                doc = PDFDocument()
                # NOTE(review): the name is never read again; presumably
                # constructing the parser attaches it to ``doc`` -- confirm
                # against the pdfminer version in use before removing it.
                parser = PDFParser(doc, pdfdata)

                # initialize the pdf with an empty password
                try:
                    doc.initialize('')
                except PDFPasswordIncorrect:
                    return ''

                # check if we can extract the contents of this file
                if not doc.is_extractable:
                    return ''

                # do the conversion
                interpreter = PDFPageInterpreter(rsrc, converter)
                for page in doc.get_pages():
                    interpreter.process_page(page)
            finally:
                # Runs on the early returns and on errors too, so the
                # converter is never left open.
                converter.close()
        finally:
            pdfdata.close()

        # Same order as before on success: both resources are closed first,
        # then the collected text is read back.
        return converter.get_text()
Beispiel #3
0
def convert(rsrc, device, fname, pagenos=None, maxpages=0, password=''):
  """Feed the pages of the PDF file `fname` through `device`.

  Args:
    rsrc: Resource manager handed to ``PDFPageInterpreter``.
    device: Output device handed to ``PDFPageInterpreter``; it is closed
      once all selected pages have been processed.
    fname: Path of the PDF file to read.
    pagenos: Optional container of zero-based page indexes to process.
      When empty or None, every page is processed.
    maxpages: When non-zero, stop after processing a page whose one-based
      position is at least this value. Pages skipped because of `pagenos`
      do not trigger the stop.
    password: Password passed to ``doc.initialize``.

  Raises:
    TextExtractionNotAllowed: If the password is rejected or the document
      is not marked as extractable.
  """
  doc = PDFDocument()
  # open() instead of the Python 2-only file() builtin.
  fp = open(fname, 'rb')
  try:
    # NOTE(review): the name is never read again; presumably constructing
    # the parser attaches it to ``doc`` -- confirm before removing it.
    parser = PDFParser(doc, fp)
    try:
      doc.initialize(password)
    except PDFPasswordIncorrect:
      raise TextExtractionNotAllowed('Incorrect password')
    if not doc.is_extractable:
      raise TextExtractionNotAllowed('Text extraction is not allowed: %r' % fname)
    interpreter = PDFPageInterpreter(rsrc, device)
    for (pageno, page) in enumerate(doc.get_pages()):
      if pagenos and (pageno not in pagenos):
        continue
      interpreter.process_page(page)
      if maxpages and maxpages <= pageno + 1:
        break
    # As before, the device is closed only after a successful run; on
    # failure it is left for the caller to deal with.
    device.close()
  finally:
    # Always release the file handle, including on the error paths above.
    fp.close()
Beispiel #4
0
def convert(rsrc, device, fname, pagenos=None, maxpages=0, password=''):
    """Feed the pages of the PDF file `fname` through `device`.

    Args:
        rsrc: Resource manager handed to ``PDFPageInterpreter``.
        device: Output device handed to ``PDFPageInterpreter``; it is
            closed once all selected pages have been processed.
        fname: Path of the PDF file to read.
        pagenos: Optional container of zero-based page indexes to process.
            When empty or None, every page is processed.
        maxpages: When non-zero, stop after processing a page whose
            one-based position is at least this value. Pages skipped
            because of `pagenos` do not trigger the stop.
        password: Password passed to ``doc.initialize``.

    Raises:
        TextExtractionNotAllowed: If the password is rejected or the
            document is not marked as extractable.
    """
    doc = PDFDocument()
    # open() instead of the Python 2-only file() builtin.
    fp = open(fname, 'rb')
    try:
        # NOTE(review): the name is never read again; presumably
        # constructing the parser attaches it to ``doc`` -- confirm before
        # removing it.
        parser = PDFParser(doc, fp)
        try:
            doc.initialize(password)
        except PDFPasswordIncorrect:
            raise TextExtractionNotAllowed('Incorrect password')
        if not doc.is_extractable:
            raise TextExtractionNotAllowed('Text extraction is not allowed: %r' %
                                           fname)
        interpreter = PDFPageInterpreter(rsrc, device)
        for (pageno, page) in enumerate(doc.get_pages()):
            if pagenos and (pageno not in pagenos):
                continue
            interpreter.process_page(page)
            if maxpages and maxpages <= pageno + 1:
                break
        # As before, the device is closed only after a successful run; on
        # failure it is left for the caller to deal with.
        device.close()
    finally:
        # Always release the file handle, including on the error paths
        # above.
        fp.close()