def method(self, polfile, conn, cur):
        """Extract and normalise the common invoice fields of one document.

        The values returned by ``util.dataAtHocrBboxes`` for the field specs
        in ``fs.common_invoice`` are paired with the spec's ``'field'`` names,
        and the combined fields (invoice period, flight dates, page counter)
        are split into separate entries.

        ``conn`` and ``cur`` are accepted but not used in the code visible
        here.

        Returns None in every case.  When munging fails, the exception and
        the partially processed dict are written to stderr.

        NOTE(review): as visible here the cleaned ``data`` dict is neither
        returned nor stored -- confirm whether the method is meant to
        continue past the munging step.
        """
        html_path = st.basepath + 'html/' + polfile.docid + '.html'
        values = util.dataAtHocrBboxes(fs.common_invoice, html_path)
        columns_raw = [spec['field'] for spec in fs.common_invoice]
        data = dict(zip(columns_raw, values))
        data['id'] = polfile.docid

        # The 'header' field is dropped from the record.
        del data['header']

        # munging
        # Each combined field is deleted only after it was split
        # successfully, so a failure leaves it in the dict dumped to stderr.
        try:
            data['invoice_from'], data['invoice_to'] = util.checkDates(
                util.fuzzySplit(data['invoice_period'], '-'))
            del data['invoice_period']

            data['flight_from'], data['flight_to'] = util.checkDates(
                util.fuzzySplit(data['flight_dates'], '-'))
            del data['flight_dates']

            page_counter = data['page_of'].replace('Page', '')
            data['page'], data['total_pages'] = util.checkInts(
                util.fuzzySplit(page_counter, 'of'))
            del data['page_of']

            data['invoice_date'] = util.checkDates([data['invoice_date']])[0]

            # An empty estimate number is stored as the project's NULL marker.
            if data['estimate_no'] == '':
                data['estimate_no'] = st.NULL

        except Exception as e:
            osutil.print_stderr(e)
            osutil.print_stderr(data)
            return
def bulkProcess(process, query, download=False, overwrite=True, abortOnError=False):
    """Run a process on a set of files in a query.

    Args:
        process: object whose ``method(polfile, conn, cur)`` is called once
            per selected row.
        query: sequence unpacked into ``cur.execute(*query)``.
        download: when true, fetch the PDF of every row whose
            ``polfile.pdfpath`` does not exist yet.
        overwrite: when true, process every row; when false, only rows
            whose tif path does not exist yet.
        abortOnError: when true, exceptions raised by ``process.method``
            propagate to the caller; otherwise they are printed to stderr
            and the loop continues with the next row.

    NOTE(review): the connection and cursor obtained from ``connect()`` are
    not closed here.
    """
    (conn, cur) = connect()
    cur.execute(*query)

    # All rows are fetched before the loop starts; the same cursor is handed
    # to process.method below.
    for row in cur.fetchall():
        polfile = Polfile(row)

        # The original tested a bare, undefined name ``pdfpath``; the path
        # that is downloaded to is ``polfile.pdfpath``.
        if download and not os.path.exists(polfile.pdfpath):
            try:
                util.downloadBinary(polfile.url, polfile.pdfpath)
            except Exception as e:
                # Best effort: a failed download is reported and the row is
                # still offered to the process.
                osutil.print_stderr(e)

        # NOTE(review): the original tested a bare, undefined name
        # ``tifpath``; this assumes Polfile exposes ``tifpath`` the same way
        # it exposes ``pdfpath`` -- confirm against the Polfile class.
        if not overwrite and os.path.exists(polfile.tifpath):
            continue

        try:
            process.method(polfile, conn, cur)
        except Exception as e:
            if abortOnError:
                raise
            osutil.print_stderr(e)
    def method(self, polfile, conn, cur, abortOnError=True):
        """Extract and normalise the common contract fields of one document.

        The values returned by ``util.dataAtHocrBboxes`` for the field specs
        in ``fs.common_contract`` are paired with the spec's ``'field'``
        names, and the combined fields (contract dates, original/revision
        date, page counter) are split into separate entries.

        ``conn``, ``cur`` and ``abortOnError`` are accepted but not used in
        the code visible here.

        Returns None in every case.  When munging fails, the exception and
        the partially processed dict are written to stderr.

        NOTE(review): as visible here the cleaned ``data`` dict is neither
        returned nor stored -- confirm whether the method is meant to
        continue past the munging step.
        """
        html_path = st.basepath + 'html/' + polfile.docid + '.html'
        values = util.dataAtHocrBboxes(fs.common_contract, html_path)

        columns_raw = [spec['field'] for spec in fs.common_contract]
        data = dict(zip(columns_raw, values))
        data['id'] = polfile.docid

        # The 'header' field is dropped from the record.
        del data['header']

        # munging
        # Each combined field is deleted only after it was split
        # successfully, so a failure leaves it in the dict dumped to stderr.
        try:
            data['contract_from'], data['contract_to'] = util.checkDates(
                util.fuzzySplit(data['contract_dates'], ' - '))
            del data['contract_dates']

            data['original_date'], data['revision_date'] = util.checkDates(
                util.fuzzySplit(data['original_date_revision'], ' / '))
            del data['original_date_revision']

            data['print_date'] = util.checkDates([data['print_date']])[0]

            page_counter = data['page_from_to'].replace('Page', '')
            data['page'], data['total_pages'] = util.checkInts(
                util.fuzzySplit(page_counter, 'of'))
            del data['page_from_to']

            # An empty estimate number is stored as the project's NULL marker.
            if data['estimate_no'] == '':
                data['estimate_no'] = st.NULL
        except Exception as e:
            osutil.print_stderr(e)
            osutil.print_stderr(data)
            return
    def method(self, polfile, conn, cur):
        """Extract and normalise the common order fields of one document.

        The values returned by ``util.dataAtHocrBboxes`` for the field specs
        in ``fs.common_order`` are paired with the spec's ``'field'`` names,
        and the combined fields (original/revision date, flight dates, page
        counter) are split into separate entries.  The record id is the
        document id with every ``'_'`` replaced by ``':'``.

        ``conn`` and ``cur`` are accepted but not used in the code visible
        here.

        Returns None in every case.  When munging fails, the exception and
        the partially processed dict are written to stderr.

        NOTE(review): as visible here the cleaned ``data`` dict is neither
        returned nor stored -- confirm whether the method is meant to
        continue past the munging step.
        """
        html_path = st.basepath + 'html/' + polfile.docid + '.html'
        values = util.dataAtHocrBboxes(fs.common_order, html_path)
        columns_raw = [spec['field'] for spec in fs.common_order]
        data = dict(zip(columns_raw, values))

        data['id'] = polfile.docid.replace('_', ':')

        # The 'header' field is dropped from the record.
        del data['header']

        # munging
        # Each combined field is deleted only after it was split
        # successfully, so a failure leaves it in the dict dumped to stderr.
        try:
            data['original_date'], data['revision_date'] = util.checkDates(
                util.fuzzySplit(data['original_date_revision'], ' / '))
            del data['original_date_revision']

            data['flight_from'], data['flight_to'] = util.checkDates(
                util.fuzzySplit(data['flight_dates'], '-'))
            del data['flight_dates']

            page_counter = data['page_of'].replace('Page', '')
            data['page'], data['total_pages'] = util.checkInts(
                util.fuzzySplit(page_counter, 'of'))
            del data['page_of']

        except Exception as e:
            osutil.print_stderr(e)
            osutil.print_stderr(data)
            return
class parseTextContracts(Process):
    """Process to extract data from a text based contract using Poppler."""

    def method(self, polfile, conn, cur, abortOnError=True):
        """Print one tab-separated line of cleaned contract fields.

        The values returned by ``util.dataAtHocrBboxes`` for the field specs
        in ``fs.common_contract`` are paired with the spec's ``'field'``
        names, the combined fields are split into separate entries, and the
        resulting values are written to stdout joined by tabs.

        ``conn``, ``cur`` and ``abortOnError`` are accepted but not used by
        this method.

        Returns None.  When munging fails, the exception and the partially
        processed dict are written to stderr and nothing is printed.
        """
        html_path = st.basepath + 'html/' + polfile.docid + '.html'
        values = util.dataAtHocrBboxes(fs.common_contract, html_path)

        columns_raw = [spec['field'] for spec in fs.common_contract]
        data = dict(zip(columns_raw, values))
        data['id'] = polfile.docid

        # The 'header' field is dropped before output.
        del data['header']

        # munging
        # Each combined field is deleted only after it was split
        # successfully, so a failure leaves it in the dict dumped to stderr.
        try:
            data['contract_from'], data['contract_to'] = util.checkDates(
                util.fuzzySplit(data['contract_dates'], ' - '))
            del data['contract_dates']

            data['original_date'], data['revision_date'] = util.checkDates(
                util.fuzzySplit(data['original_date_revision'], ' / '))
            del data['original_date_revision']

            data['print_date'] = util.checkDates([data['print_date']])[0]

            page_counter = data['page_from_to'].replace('Page', '')
            data['page'], data['total_pages'] = util.checkInts(
                util.fuzzySplit(page_counter, 'of'))
            del data['page_from_to']

            # An empty estimate number is stored as the project's NULL marker.
            if data['estimate_no'] == '':
                data['estimate_no'] = st.NULL
        except Exception as e:
            osutil.print_stderr(e)
            osutil.print_stderr(data)
            return

        # Tabs and newlines inside a value would break the tab-separated
        # line, so they become spaces.  replace('  ', ' ') is a single pass:
        # runs of three or more spaces are shortened, not fully collapsed.
        # NOTE(review): the column order is the dict's iteration order, which
        # on Python 2 is not tied to the field order in fs.common_contract --
        # confirm that consumers do not rely on a fixed column order.
        # NOTE(review): assumes every value is a string at this point,
        # including the results of checkDates/checkInts and st.NULL -- confirm.
        datavalues = [
            value.replace('\t', ' ').replace('\n', ' ').replace('  ', ' ').strip()
            for value in data.values()]

        datastr = '\t'.join(datavalues)

        # A failure while writing the line is reported with the document id;
        # presumably this guards against encoding errors on stdout.
        try:
            print(datastr)
        except Exception as e:
            osutil.print_stderr(polfile.docid)
            osutil.print_stderr(e)
class parseTextOrders(Process):
    """Process to extract data from a text based order using Poppler."""

    def method(self, polfile, conn, cur):
        """Print one tab-separated line of cleaned order fields.

        The values returned by ``util.dataAtHocrBboxes`` for the field specs
        in ``fs.common_order`` are paired with the spec's ``'field'`` names,
        the combined fields are split into separate entries, and the
        resulting values are written to stdout joined by tabs.  The record id
        is the document id with every ``'_'`` replaced by ``':'``.

        ``conn`` and ``cur`` are accepted but not used by this method.

        Returns None.  When munging fails, the exception and the partially
        processed dict are written to stderr and nothing is printed.
        """
        html_path = st.basepath + 'html/' + polfile.docid + '.html'
        values = util.dataAtHocrBboxes(fs.common_order, html_path)
        columns_raw = [spec['field'] for spec in fs.common_order]
        data = dict(zip(columns_raw, values))

        data['id'] = polfile.docid.replace('_', ':')

        # The 'header' field is dropped before output.
        del data['header']

        # munging
        # Each combined field is deleted only after it was split
        # successfully, so a failure leaves it in the dict dumped to stderr.
        try:
            data['original_date'], data['revision_date'] = util.checkDates(
                util.fuzzySplit(data['original_date_revision'], ' / '))
            del data['original_date_revision']

            data['flight_from'], data['flight_to'] = util.checkDates(
                util.fuzzySplit(data['flight_dates'], '-'))
            del data['flight_dates']

            page_counter = data['page_of'].replace('Page', '')
            data['page'], data['total_pages'] = util.checkInts(
                util.fuzzySplit(page_counter, 'of'))
            del data['page_of']

        except Exception as e:
            osutil.print_stderr(e)
            osutil.print_stderr(data)
            return

        # Tabs and newlines inside a value would break the tab-separated
        # line, so they become spaces.  replace('  ', ' ') is a single pass:
        # runs of three or more spaces are shortened, not fully collapsed.
        # NOTE(review): the column order is the dict's iteration order, which
        # on Python 2 is not tied to the field order in fs.common_order --
        # confirm that consumers do not rely on a fixed column order.
        # NOTE(review): assumes every value is a string at this point,
        # including the results of checkDates/checkInts -- confirm.
        datavalues = [
            value.replace('\t', ' ').replace('\n', ' ').replace('  ', ' ').strip()
            for value in data.values()]
        datastr = '\t'.join(datavalues)

        # A failure while writing the line is reported with the document id;
        # presumably this guards against encoding errors on stdout.
        try:
            print(datastr)
        except Exception as e:
            osutil.print_stderr(polfile.docid)
            osutil.print_stderr(e)