def method(self, polfile, conn, cur):
    """Read the common invoice fields for one file and split combined ones.

    Calls ``util.dataAtHocrBboxes`` with ``fs.common_invoice`` and the path
    ``st.basepath + 'html/' + polfile.docid + '.html'``, keys the returned
    values by each field's ``'field'`` name, then replaces the combined
    ``invoice_period``, ``flight_dates`` and ``page_of`` entries with their
    split components.

    Args:
        polfile: object exposing ``docid``; selects the HTML file to read.
        conn: not read by this method.
        cur: not read by this method.

    Returns:
        None on every path. If a munging step raises, the exception and the
        partially munged ``data`` dict are passed to ``osutil.print_stderr``.
    """
    values = util.dataAtHocrBboxes(
        fs.common_invoice,
        st.basepath + 'html/' + polfile.docid + '.html')
    columns_raw = [f['field'] for f in fs.common_invoice]
    data = dict(zip(columns_raw, values))
    data['id'] = polfile.docid
    del data['header']

    # munging
    try:
        (data['invoice_from'], data['invoice_to']) = util.checkDates(
            util.fuzzySplit(data['invoice_period'], '-'))
        del data['invoice_period']

        (data['flight_from'], data['flight_to']) = util.checkDates(
            util.fuzzySplit(data['flight_dates'], '-'))
        del data['flight_dates']

        (data['page'], data['total_pages']) = util.checkInts(
            util.fuzzySplit(data['page_of'].replace('Page', ''), 'of'))
        del data['page_of']

        data['invoice_date'] = util.checkDates([data['invoice_date']])[0]

        # An empty estimate number is stored as the project's NULL marker.
        if data['estimate_no'] == '':
            data['estimate_no'] = st.NULL
    except Exception as e:
        # "as" form parses on Python 2.6+ and Python 3; the comma form is
        # a syntax error on Python 3.
        osutil.print_stderr(e)
        osutil.print_stderr(data)
        return
def bulkProcess(process, query, download=False, overwrite=True,
                abortOnError=False):
    """Run a process on a set of files in a query.

    Args:
        process: object whose ``method(polfile, conn, cur)`` is called for
            each selected row.
        query: sequence unpacked into ``cur.execute(*query)``.
        download: when true, fetch ``polfile.url`` into ``polfile.pdfpath``
            if nothing exists at that path yet. A failed download is
            reported via ``osutil.print_stderr`` and processing continues.
        overwrite: when false, a row is processed only if its TIFF path
            does not exist yet.
        abortOnError: when true, an exception raised by ``process.method``
            propagates to the caller; otherwise it is passed to
            ``osutil.print_stderr`` and the loop moves on to the next row.
    """
    (conn, cur) = connect()
    cur.execute(*query)
    rows = cur.fetchall()
    for row in rows:
        polfile = Polfile(row)

        # The existence check uses the same path the download writes to;
        # the bare name "pdfpath" was never bound in this function.
        if download and not os.path.exists(polfile.pdfpath):
            try:
                util.downloadBinary(polfile.url, polfile.pdfpath)
            except Exception as e:
                osutil.print_stderr(e)

        # NOTE(review): the bare name "tifpath" was likewise unbound here;
        # this assumes Polfile exposes ``tifpath`` alongside ``pdfpath`` -
        # confirm against the Polfile class.
        if overwrite or not os.path.exists(polfile.tifpath):
            if abortOnError:
                process.method(polfile, conn, cur)
            else:
                try:
                    process.method(polfile, conn, cur)
                except Exception as e:
                    osutil.print_stderr(e)
def method(self, polfile, conn, cur, abortOnError=True):
    """Read the common contract fields for one file and split combined ones.

    Calls ``util.dataAtHocrBboxes`` with ``fs.common_contract`` and the path
    ``st.basepath + 'html/' + polfile.docid + '.html'``, keys the returned
    values by each field's ``'field'`` name, then replaces the combined
    ``contract_dates``, ``original_date_revision`` and ``page_from_to``
    entries with their split components.

    Args:
        polfile: object exposing ``docid``; selects the HTML file to read.
        conn: not read by this method.
        cur: not read by this method.
        abortOnError: accepted but not read by this method; munging errors
            are always reported and swallowed.

    Returns:
        None on every path. If a munging step raises, the exception and the
        partially munged ``data`` dict are passed to ``osutil.print_stderr``.
    """
    values = util.dataAtHocrBboxes(
        fs.common_contract,
        st.basepath + 'html/' + polfile.docid + '.html')
    columns_raw = [f['field'] for f in fs.common_contract]
    data = dict(zip(columns_raw, values))
    data['id'] = polfile.docid
    del data['header']

    # munging
    try:
        (data['contract_from'], data['contract_to']) = util.checkDates(
            util.fuzzySplit(data['contract_dates'], ' - '))
        del data['contract_dates']

        (data['original_date'], data['revision_date']) = util.checkDates(
            util.fuzzySplit(data['original_date_revision'], ' / '))
        del data['original_date_revision']

        data['print_date'] = util.checkDates([data['print_date']])[0]

        (data['page'], data['total_pages']) = util.checkInts(
            util.fuzzySplit(data['page_from_to'].replace('Page', ''), 'of'))
        del data['page_from_to']

        # An empty estimate number is stored as the project's NULL marker.
        if data['estimate_no'] == '':
            data['estimate_no'] = st.NULL
    except Exception as e:
        # "as" form parses on Python 2.6+ and Python 3; the comma form is
        # a syntax error on Python 3.
        osutil.print_stderr(e)
        osutil.print_stderr(data)
        return
def method(self, polfile, conn, cur):
    """Read the common order fields for one file and split combined ones.

    Calls ``util.dataAtHocrBboxes`` with ``fs.common_order`` and the path
    ``st.basepath + 'html/' + polfile.docid + '.html'``, keys the returned
    values by each field's ``'field'`` name, then replaces the combined
    ``original_date_revision``, ``flight_dates`` and ``page_of`` entries
    with their split components. The stored ``id`` is the docid with every
    ``'_'`` replaced by ``':'``.

    Args:
        polfile: object exposing ``docid``; selects the HTML file to read.
        conn: not read by this method.
        cur: not read by this method.

    Returns:
        None on every path. If a munging step raises, the exception and the
        partially munged ``data`` dict are passed to ``osutil.print_stderr``.
    """
    values = util.dataAtHocrBboxes(
        fs.common_order,
        st.basepath + 'html/' + polfile.docid + '.html')
    columns_raw = [f['field'] for f in fs.common_order]
    data = dict(zip(columns_raw, values))
    data['id'] = polfile.docid.replace('_', ':')
    del data['header']

    # munging
    try:
        (data['original_date'], data['revision_date']) = util.checkDates(
            util.fuzzySplit(data['original_date_revision'], ' / '))
        del data['original_date_revision']

        (data['flight_from'], data['flight_to']) = util.checkDates(
            util.fuzzySplit(data['flight_dates'], '-'))
        del data['flight_dates']

        (data['page'], data['total_pages']) = util.checkInts(
            util.fuzzySplit(data['page_of'].replace('Page', ''), 'of'))
        del data['page_of']
    except Exception as e:
        # "as" form parses on Python 2.6+ and Python 3; the comma form is
        # a syntax error on Python 3.
        osutil.print_stderr(e)
        osutil.print_stderr(data)
        return
class parseTextContracts(Process):
    """Process to extract data from a text based contract using Poppler."""

    def method(self, polfile, conn, cur, abortOnError=True):
        """Print one tab-separated row of contract fields for ``polfile``.

        Calls ``util.dataAtHocrBboxes`` with ``fs.common_contract`` and the
        path ``st.basepath + 'html/' + polfile.docid + '.html'``, keys the
        returned values by each field's ``'field'`` name, splits the
        combined date/page fields, and prints the cleaned values joined by
        tabs.

        Args:
            polfile: object exposing ``docid``; selects the HTML file.
            conn: not read by this method.
            cur: not read by this method.
            abortOnError: accepted but not read by this method; munging
                and printing errors are always reported and swallowed.

        Returns:
            None. If a munging step raises, the exception and the partially
            munged ``data`` dict are passed to ``osutil.print_stderr`` and
            nothing is printed to stdout.
        """
        values = util.dataAtHocrBboxes(
            fs.common_contract,
            st.basepath + 'html/' + polfile.docid + '.html')
        columns_raw = [f['field'] for f in fs.common_contract]
        data = dict(zip(columns_raw, values))
        data['id'] = polfile.docid
        del data['header']

        # munging
        try:
            (data['contract_from'], data['contract_to']) = util.checkDates(
                util.fuzzySplit(data['contract_dates'], ' - '))
            del data['contract_dates']

            (data['original_date'], data['revision_date']) = util.checkDates(
                util.fuzzySplit(data['original_date_revision'], ' / '))
            del data['original_date_revision']

            data['print_date'] = util.checkDates([data['print_date']])[0]

            (data['page'], data['total_pages']) = util.checkInts(
                util.fuzzySplit(
                    data['page_from_to'].replace('Page', ''), 'of'))
            del data['page_from_to']

            # An empty estimate number is stored as the project's NULL
            # marker.
            if data['estimate_no'] == '':
                data['estimate_no'] = st.NULL
        except Exception as e:
            # "as" form parses on Python 2.6+ and Python 3; the comma form
            # is a syntax error on Python 3.
            osutil.print_stderr(e)
            osutil.print_stderr(data)
            return

        # Tabs and newlines inside a value would break the row layout, so
        # they become spaces; the double spaces that substitution leaves
        # behind are then collapsed (single pass) before trimming.
        # Column order is whatever order the dict yields its values in.
        datavalues = [
            d.replace('\t', ' ').replace('\n', ' ').replace('  ', ' ').strip()
            for d in data.values()]
        datastr = '\t'.join(datavalues)
        try:
            # Parenthesised single argument prints identically under the
            # Python 2 print statement and the Python 3 print function.
            print(datastr)
        except Exception as e:
            osutil.print_stderr(polfile.docid)
            osutil.print_stderr(e)
class parseTextOrders(Process):
    """Process to extract data from a text based order using Poppler."""

    def method(self, polfile, conn, cur):
        """Print one tab-separated row of order fields for ``polfile``.

        Calls ``util.dataAtHocrBboxes`` with ``fs.common_order`` and the
        path ``st.basepath + 'html/' + polfile.docid + '.html'``, keys the
        returned values by each field's ``'field'`` name, splits the
        combined date/page fields, and prints the cleaned values joined by
        tabs. The stored ``id`` is the docid with every ``'_'`` replaced by
        ``':'``.

        Args:
            polfile: object exposing ``docid``; selects the HTML file.
            conn: not read by this method.
            cur: not read by this method.

        Returns:
            None. If a munging step raises, the exception and the partially
            munged ``data`` dict are passed to ``osutil.print_stderr`` and
            nothing is printed to stdout.
        """
        values = util.dataAtHocrBboxes(
            fs.common_order,
            st.basepath + 'html/' + polfile.docid + '.html')
        columns_raw = [f['field'] for f in fs.common_order]
        data = dict(zip(columns_raw, values))
        data['id'] = polfile.docid.replace('_', ':')
        del data['header']

        # munging
        try:
            (data['original_date'], data['revision_date']) = util.checkDates(
                util.fuzzySplit(data['original_date_revision'], ' / '))
            del data['original_date_revision']

            (data['flight_from'], data['flight_to']) = util.checkDates(
                util.fuzzySplit(data['flight_dates'], '-'))
            del data['flight_dates']

            (data['page'], data['total_pages']) = util.checkInts(
                util.fuzzySplit(data['page_of'].replace('Page', ''), 'of'))
            del data['page_of']
        except Exception as e:
            # "as" form parses on Python 2.6+ and Python 3; the comma form
            # is a syntax error on Python 3.
            osutil.print_stderr(e)
            osutil.print_stderr(data)
            return

        # Tabs and newlines inside a value would break the row layout, so
        # they become spaces; the double spaces that substitution leaves
        # behind are then collapsed (single pass) before trimming.
        # Column order is whatever order the dict yields its values in.
        datavalues = [
            d.replace('\t', ' ').replace('\n', ' ').replace('  ', ' ').strip()
            for d in data.values()]
        datastr = '\t'.join(datavalues)
        try:
            # Parenthesised single argument prints identically under the
            # Python 2 print statement and the Python 3 print function.
            print(datastr)
        except Exception as e:
            osutil.print_stderr(polfile.docid)
            osutil.print_stderr(e)