def main(argv):
    """Command-line entry point: convert each PDF named in *argv* to text.

    Recognized options (parsed with getopt):
      -d           increase the debug level by one (may be repeated)
      -p pages     page number(s) to process; either a single integer or a
                   comma-separated list, and the option may be repeated
      -P password  password handed to pdf2txt
      -c codec     output codec name handed to pdf2txt (default 'ascii')
      -o output    write to this file (opened in 'wb' mode) instead of stdout

    argv -- full argument vector; argv[0] is used as the program name in
            the usage message.

    Returns 100 after printing the usage message (unknown option, no input
    files, or a non-integer page number); returns None otherwise.
    """
    import getopt

    def usage():
        print('usage: %s [-d] [-p pages] [-P password] [-c codec] [-o output] file ...' % argv[0])
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], 'dp:P:c:o:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()

    debug = 0
    cmapdir = 'CMap'
    cdbcmapdir = 'CDBCMap'
    codec = 'ascii'
    pages = set()
    password = ''
    outpath = None
    for (k, v) in opts:
        if k == '-d':
            debug += 1
        elif k == '-p':
            # Accept "3" as well as "1,2,3"; a malformed number is a usage
            # error rather than an uncaught ValueError traceback.
            try:
                pages.update(int(x) for x in v.split(','))
            except ValueError:
                return usage()
        elif k == '-P':
            password = v
        elif k == '-c':
            codec = v
        elif k == '-o':
            # Only remember the path here: the file is opened once all
            # options have been validated, so a bad command line never
            # truncates the target and a repeated -o never leaks a handle.
            outpath = v

    if outpath is None:
        outfp = stdout
    else:
        outfp = open(outpath, 'wb')
    try:
        # NOTE(review): the original separator comment directly precedes this
        # call; it is treated as live code because cmapdir/cdbcmapdir are
        # otherwise unused -- confirm against the upstream file.
        CMapDB.initialize(cmapdir, cdbcmapdir, debug=debug)
        rsrc = PDFResourceManager(debug=debug)
        for fname in args:
            pdf2txt(outfp, rsrc, fname, pages, codec, password=password, debug=debug)
    finally:
        # Close only what this function opened; stdout belongs to the caller.
        if outpath is not None:
            outfp.close()
    return
def convert_cmap(files, cmapdir, cdbcmapdir, force=False):
    """Dump each named CMap into a '<name>.cmap.cdb' file under *cdbcmapdir*.

    files      -- iterable of CMap file paths; entries ending in '.upr' are
                  ignored, and the basename of each remaining path is used
                  as the CMap name.
    cmapdir    -- directory passed to CMapDB.initialize().
    cdbcmapdir -- directory in which the output files are placed.
    force      -- when false, an output file that already exists is left
                  alone and a 'Skipping' message is printed to stderr.

    Always returns None.
    """
    from cmap import CMapDB
    CMapDB.initialize(cmapdir)
    for path in files:
        if not path.endswith('.upr'):
            name = os.path.basename(path)
            target = os.path.join(cdbcmapdir, name+'.cmap.cdb')
            # Regenerate when forced, or when no output exists yet.
            if force or not os.path.exists(target):
                print >>stderr, 'Reading: %r...' % path
                loaded = CMapDB.get_cmap(name)
                dumpcdb(loaded, target)
            else:
                print >>stderr, 'Skipping: %r' % target
    return