Beispiel #1
0
    def convert(self, data):
        """Extract the text content of a PDF supplied as an in-memory string.

        The document is opened with an empty password.

        :param data: raw PDF data; it is wrapped in a ``StringIO`` buffer.
        :returns: whatever ``converter.get_text()`` yields after all pages
            were processed, or ``''`` when the empty password is rejected
            (``PDFPasswordIncorrect``) or ``doc.is_extractable`` is false.
        """
        # Wrap the raw PDF data in a file-like object for the parser.
        pdfdata = StringIO(data)
        try:
            # NOTE(review): registers the 'CMap' and 'CDBCMap' locations with
            # CMapDB before any page is interpreted -- presumably the
            # character-map lookup directories; confirm against CMapDB.
            CMapDB.initialize('CMap', 'CDBCMap')

            # Create the resource manager and the converter that collects
            # the text.
            rsrc = PDFResourceManager()
            converter = TextConverter(rsrc)

            # Set up the document and its parser.  The parser object is not
            # used again below; presumably its constructor attaches it to
            # ``doc`` (TODO confirm), so the call must stay.
            doc = PDFDocument()
            parser = PDFParser(doc, pdfdata)

            # Initialize the document using an empty password.
            try:
                doc.initialize('')
            except PDFPasswordIncorrect:
                return ''

            # Give up if the document does not allow content extraction.
            if not doc.is_extractable:
                return ''

            # Feed every page through the interpreter into the converter.
            interpreter = PDFPageInterpreter(rsrc, converter)
            for page in doc.get_pages():
                interpreter.process_page(page)

            converter.close()
        finally:
            # Release the buffer on every exit path, including the early
            # returns above and any parsing error.
            pdfdata.close()

        return converter.get_text()
Beispiel #2
0
    def convert(self, data):
        """Extract the text content of a PDF supplied as an in-memory string.

        The document is opened with an empty password.

        :param data: raw PDF data; it is wrapped in a ``StringIO`` buffer.
        :returns: whatever ``converter.get_text()`` yields after all pages
            were processed, or ``''`` when the empty password is rejected
            (``PDFPasswordIncorrect``) or ``doc.is_extractable`` is false.
        """
        # Wrap the raw PDF data in a file-like object for the parser.
        pdfdata = StringIO(data)
        try:
            # NOTE(review): registers the 'CMap' and 'CDBCMap' locations with
            # CMapDB before any page is interpreted -- presumably the
            # character-map lookup directories; confirm against CMapDB.
            CMapDB.initialize('CMap', 'CDBCMap')

            # Create the resource manager and the converter that collects
            # the text.
            rsrc = PDFResourceManager()
            converter = TextConverter(rsrc)

            # Set up the document and its parser.  The parser object is not
            # used again below; presumably its constructor attaches it to
            # ``doc`` (TODO confirm), so the call must stay.
            doc = PDFDocument()
            parser = PDFParser(doc, pdfdata)

            # Initialize the document using an empty password.
            try:
                doc.initialize('')
            except PDFPasswordIncorrect:
                return ''

            # Give up if the document does not allow content extraction.
            if not doc.is_extractable:
                return ''

            # Feed every page through the interpreter into the converter.
            interpreter = PDFPageInterpreter(rsrc, converter)
            for page in doc.get_pages():
                interpreter.process_page(page)

            converter.close()
        finally:
            # Release the buffer on every exit path, including the early
            # returns above and any parsing error.
            pdfdata.close()

        return converter.get_text()
Beispiel #3
0
def main(argv):
    """Command-line entry point: convert the PDF files named in *argv*.

    Recognized options (parsed with ``getopt``):

    * ``-d``           increase the debug level by one (repeatable)
    * ``-p pagenos``   comma-separated page numbers; each is stored minus one
    * ``-P password``  password handed to ``convert``
    * ``-c codec``     output codec (default ``'ascii'``)
    * ``-m maxpages``  integer handed to ``convert`` as ``maxpages``
    * ``-C dir``       first argument of ``CMapDB.initialize`` (default ``'CMap'``)
    * ``-D dir``       second argument of ``CMapDB.initialize`` (default ``'CDBCMap'``)
    * ``-t type``      ``html`` (default), ``sgml`` or ``tag``
    * ``-o output``    write to this file instead of ``stdout``

    ``stdout``, ``CMapDB``, the PDF classes, the converter classes and
    ``convert`` are expected to be available at module level.

    :param argv: full argument vector; ``argv[0]`` is the program name.
    :returns: ``100`` after printing the usage line when the arguments are
        invalid, ``None`` otherwise.
    """
    import getopt

    def usage():
        # Print the usage line and hand back the error status.
        print('usage: %s [-d] [-p pagenos] [-P password] [-c codec] [-t html|sgml|tag] [-o output] file ...' % argv[0])
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], 'dp:P:c:t:o:C:D:m:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()

    debug = 0
    cmapdir = 'CMap'
    cdbcmapdir = 'CDBCMap'
    codec = 'ascii'
    pagenos = set()
    maxpages = 0
    outtype = 'html'
    password = ''
    # Only remember the output path here; the file is opened once all
    # options have been validated so a bad invocation never truncates it.
    outpath = None
    try:
        for (k, v) in opts:
            if k == '-d':
                debug += 1
            elif k == '-p':
                pagenos.update(int(x) - 1 for x in v.split(','))
            elif k == '-P':
                password = v
            elif k == '-c':
                codec = v
            elif k == '-m':
                maxpages = int(v)
            elif k == '-C':
                cmapdir = v
            elif k == '-D':
                cdbcmapdir = v
            elif k == '-t':
                outtype = v
            elif k == '-o':
                outpath = v
    except ValueError:
        # Non-numeric value given to -p or -m.
        return usage()
    if outtype not in ('sgml', 'html', 'tag'):
        return usage()

    if outpath is None:
        outfp = stdout
    else:
        outfp = open(outpath, 'wb')
    try:
        # Propagate the debug level to every component.
        CMapDB.debug = debug
        PDFResourceManager.debug = debug
        PDFDocument.debug = debug
        PDFParser.debug = debug
        PDFPageInterpreter.debug = debug
        #
        CMapDB.initialize(cmapdir, cdbcmapdir)
        rsrc = PDFResourceManager()
        if outtype == 'sgml':
            device = SGMLConverter(rsrc, outfp, codec)
        elif outtype == 'html':
            device = HTMLConverter(rsrc, outfp, codec)
        else:
            # outtype was validated above, so this is 'tag'.
            device = TagExtractor(rsrc, outfp, codec)
        for fname in args:
            convert(rsrc,
                    device,
                    fname,
                    pagenos,
                    maxpages=maxpages,
                    password=password)
    finally:
        # Close only a file opened here; never close the shared stdout.
        if outpath is not None:
            outfp.close()
    return
Beispiel #4
0
def main(argv):
  """Command-line entry point: convert the PDF files named in *argv*.

  Recognized options (parsed with ``getopt``):

  * ``-d``           increase the debug level by one (repeatable)
  * ``-p pagenos``   comma-separated page numbers; each is stored minus one
  * ``-P password``  password handed to ``convert``
  * ``-c codec``     output codec (default ``'ascii'``)
  * ``-m maxpages``  integer handed to ``convert`` as ``maxpages``
  * ``-C dir``       first argument of ``CMapDB.initialize`` (default ``'CMap'``)
  * ``-D dir``       second argument of ``CMapDB.initialize`` (default ``'CDBCMap'``)
  * ``-t type``      ``html`` (default), ``sgml`` or ``tag``
  * ``-o output``    write to this file instead of ``stdout``

  ``stdout``, ``CMapDB``, the PDF classes, the converter classes and
  ``convert`` are expected to be available at module level.

  :param argv: full argument vector; ``argv[0]`` is the program name.
  :returns: ``100`` after printing the usage line when the arguments are
      invalid, ``None`` otherwise.
  """
  import getopt
  def usage():
    # Print the usage line and hand back the error status.
    print('usage: %s [-d] [-p pagenos] [-P password] [-c codec] [-t html|sgml|tag] [-o output] file ...' % argv[0])
    return 100
  try:
    (opts, args) = getopt.getopt(argv[1:], 'dp:P:c:t:o:C:D:m:')
  except getopt.GetoptError:
    return usage()
  if not args: return usage()
  debug = 0
  cmapdir = 'CMap'
  cdbcmapdir = 'CDBCMap'
  codec = 'ascii'
  pagenos = set()
  maxpages = 0
  outtype = 'html'
  password = ''
  # Only remember the output path here; the file is opened once all
  # options have been validated so a bad invocation never truncates it.
  outpath = None
  try:
    for (k, v) in opts:
      if k == '-d': debug += 1
      elif k == '-p': pagenos.update( int(x)-1 for x in v.split(',') )
      elif k == '-P': password = v
      elif k == '-c': codec = v
      elif k == '-m': maxpages = int(v)
      elif k == '-C': cmapdir = v
      elif k == '-D': cdbcmapdir = v
      elif k == '-t': outtype = v
      elif k == '-o': outpath = v
  except ValueError:
    # Non-numeric value given to -p or -m.
    return usage()
  if outtype not in ('sgml', 'html', 'tag'):
    return usage()
  if outpath is None:
    outfp = stdout
  else:
    outfp = open(outpath, 'wb')
  try:
    # Propagate the debug level to every component.
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFDocument.debug = debug
    PDFParser.debug = debug
    PDFPageInterpreter.debug = debug
    #
    CMapDB.initialize(cmapdir, cdbcmapdir)
    rsrc = PDFResourceManager()
    if outtype == 'sgml':
      device = SGMLConverter(rsrc, outfp, codec)
    elif outtype == 'html':
      device = HTMLConverter(rsrc, outfp, codec)
    else:
      # outtype was validated above, so this is 'tag'.
      device = TagExtractor(rsrc, outfp, codec)
    for fname in args:
      convert(rsrc, device, fname, pagenos,
              maxpages=maxpages, password=password)
  finally:
    # Close only a file opened here; never close the shared stdout.
    if outpath is not None:
      outfp.close()
  return