def bulkProcess(process, query, download=False, overwrite=True, abortOnError=False):
    """Run a process on a set of files in a query.

    Executes ``query`` on the cursor returned by ``connect()``, wraps each
    fetched row in a ``Polfile`` and calls
    ``process.method(polfile, conn, cur)`` on it.

    Args:
        process: Object exposing ``method(polfile, conn, cur)``.
        query: Sequence unpacked into ``cur.execute(*query)``.
        download: When true, fetch ``polfile.url`` into ``polfile.pdfpath``
            before processing if nothing exists at that path yet. A failed
            download is printed to stderr and does not stop the run.
        overwrite: When false, rows whose ``polfile.tifpath`` already exists
            on disk are skipped.
        abortOnError: When true, an exception raised by ``process.method``
            propagates to the caller; otherwise it is printed to stderr and
            the loop continues with the next row.
    """
    (conn, cur) = connect()
    cur.execute(*query)

    for row in cur.fetchall():
        polfile = Polfile(row)

        # The existence check must look at the same path the download writes
        # to; the bare name `pdfpath` was never defined in this function.
        if download and not os.path.exists(polfile.pdfpath):
            try:
                util.downloadBinary(polfile.url, polfile.pdfpath)
            except Exception as e:
                osutil.print_stderr(e)

        # NOTE(review): the original tested an undefined bare `tifpath`;
        # this assumes Polfile exposes `tifpath` the same way it exposes
        # `pdfpath` -- confirm against the Polfile class.
        if not overwrite and os.path.exists(polfile.tifpath):
            continue

        try:
            process.method(polfile, conn, cur)
        except Exception as e:
            if abortOnError:
                # Re-raise untouched so the caller sees the original error.
                raise
            osutil.print_stderr(e)
url = link.get('href') fullURL = st.baseurl + url filename = st.basepath + 'html/' + getFilenameFromUrl(fullURL) if filename[-3:].lower() != 'pdf': continue outputFile = filename[:-3] + 'html' if not os.path.exists(outputFile): print outputFile try: if not os.path.exists(os.path.dirname(filename)): os.makedirs(os.path.dirname(filename)) util.downloadBinary(fullURL, filename) util.pdfToText(filename) rec = processRec(url) rec['id'] = entry.id.get_text().replace( ':', '_').replace(')', '') rec['station'] = rec['id'].split('_')[0] rec['updated'] = entry.updated.get_text() #rec['outfile'] = outputFile db.write(json.dumps(rec) + '\n') dbInsert(rec, 'polfile') pdfpath = '%spdfs/%s.pdf' % (st.basepath, rec['id']) shutil.move(filename, pdfpath)
url = link.get('href') fullURL = st.baseurl + url filename = st.basepath + 'html/' + getFilenameFromUrl(fullURL) if filename[-3:].lower() != 'pdf': continue outputFile = filename[:-3] + 'html' if not os.path.exists(outputFile): print outputFile try: if not os.path.exists(os.path.dirname(filename)): os.makedirs(os.path.dirname(filename)) util.downloadBinary(fullURL, filename) util.pdfToText(filename) rec = processRec(url) rec['id'] = entry.id.get_text().replace(':', '_').replace( ')', '') rec['station'] = rec['id'].split('_')[0] rec['updated'] = entry.updated.get_text() #rec['outfile'] = outputFile db.write(json.dumps(rec) + '\n') dbInsert(rec, 'polfile') pdfpath = '%spdfs/%s.pdf' % (st.basepath, rec['id'])