def bulkProcess(process, query, download=False, overwrite=True, abortOnError=False):
    """Run a process on every file returned by a query.

    Args:
        process: object whose ``method`` attribute is invoked as
            ``process.method(polfile, conn, cur)`` once per selected row.
        query: sequence that is unpacked into ``cur.execute(*query)``,
            e.g. ``(sql,)`` or ``(sql, params)``.
        download: when true, fetch ``polfile.url`` into ``polfile.pdfpath``
            before processing if that path does not exist yet.  A failed
            download is reported on stderr and does not stop the run.
        overwrite: when false, a file is processed only if its
            ``polfile.tifpath`` does not exist yet.
        abortOnError: when true, an exception raised by ``process.method``
            propagates to the caller; otherwise it is reported on stderr
            and the loop moves on to the next row.
    """
    (conn, cur) = connect()
    cur.execute(*query)

    # Materialise the result set up front: the same cursor is handed to
    # process.method below, which presumably runs its own statements on it.
    rows = cur.fetchall()

    for row in rows:
        polfile = Polfile(row)

        # The path lives on the Polfile (see the download call below); the
        # bare name ``pdfpath`` used previously was undefined here.
        if download and not os.path.exists(polfile.pdfpath):
            try:
                util.downloadBinary(polfile.url, polfile.pdfpath)
            except Exception as e:
                osutil.print_stderr(e)

        # NOTE(review): assumes Polfile exposes ``tifpath`` alongside
        # ``pdfpath`` -- the bare name ``tifpath`` was undefined. Confirm
        # against the Polfile class.
        if overwrite or not os.path.exists(polfile.tifpath):

            if abortOnError:
                process.method(polfile, conn, cur)
            else:
                try:
                    process.method(polfile, conn, cur)
                except Exception as e:
                    osutil.print_stderr(e)
                url = link.get('href')
                fullURL = st.baseurl + url
                filename = st.basepath + 'html/' + getFilenameFromUrl(fullURL)

                if filename[-3:].lower() != 'pdf':
                    continue

                outputFile = filename[:-3] + 'html'

                if not os.path.exists(outputFile):
                    print outputFile
                    try:
                        if not os.path.exists(os.path.dirname(filename)):
                            os.makedirs(os.path.dirname(filename))

                        util.downloadBinary(fullURL, filename)

                        util.pdfToText(filename)

                        rec = processRec(url)
                        rec['id'] = entry.id.get_text().replace(
                            ':', '_').replace(')', '')
                        rec['station'] = rec['id'].split('_')[0]
                        rec['updated'] = entry.updated.get_text()
                        #rec['outfile'] = outputFile

                        db.write(json.dumps(rec) + '\n')

                        dbInsert(rec, 'polfile')
                        pdfpath = '%spdfs/%s.pdf' % (st.basepath, rec['id'])
                        shutil.move(filename, pdfpath)
# Example no. 3
# 0
                url = link.get('href')
                fullURL = st.baseurl + url
                filename = st.basepath + 'html/' + getFilenameFromUrl(fullURL)

                if filename[-3:].lower() != 'pdf':
                    continue

                outputFile = filename[:-3] + 'html'

                if not os.path.exists(outputFile):
                    print outputFile
                    try:
                        if not os.path.exists(os.path.dirname(filename)):
                            os.makedirs(os.path.dirname(filename))

                        util.downloadBinary(fullURL, filename)

                        util.pdfToText(filename)

                        rec = processRec(url)
                        rec['id'] = entry.id.get_text().replace(':',
                                                                '_').replace(
                                                                    ')', '')
                        rec['station'] = rec['id'].split('_')[0]
                        rec['updated'] = entry.updated.get_text()
                        #rec['outfile'] = outputFile

                        db.write(json.dumps(rec) + '\n')

                        dbInsert(rec, 'polfile')
                        pdfpath = '%spdfs/%s.pdf' % (st.basepath, rec['id'])