def condense(engine, resource_id, table_id, force):
    """Copy the mapped columns of one raw table into its spending table.

    Nothing is done when ``raw_<resource_id>_table<table_id>`` does not
    exist, when the table already has an entry in ``condensed`` (unless
    ``force`` is true), or when no valid row in ``column_sets`` matches the
    table's normalised headers.

    Otherwise the ``spending_...`` table is dropped and refilled from the
    raw rows, and the elapsed time is upserted into ``condensed``; all of
    that happens in one transaction.

    :param engine: database engine.
    :param resource_id: resource identifier used in the table names.
    :param table_id: index of the table within the resource.
    :param force: when true, condense even if already recorded as done.
    """
    suffix = '%s_table%s' % (resource_id, table_id)
    raw_name = 'raw_%s' % suffix
    if not engine.has_table(raw_name):
        return

    condensed_table = sl.get_table(engine, 'condensed')

    # Skip over tables we have already condensed, unless forced.
    if not force:
        previous = sl.find_one(engine, condensed_table,
                               resource_id=resource_id, table_id=table_id)
        if previous is not None:
            return

    connection = engine.connect()
    trans = connection.begin()
    start = time.time()
    try:
        raw_table = sl.get_table(connection, raw_name)

        spending_name = 'spending_%s' % suffix
        sl.drop_table(connection, spending_name)
        spending_table = sl.get_table(connection, spending_name)

        columns_table = sl.get_table(connection, 'column_sets')
        normalise_map = normalised_columns_map(raw_table)
        normalised_headers = ','.join(sorted(normalise_map.values()))
        mapping_row = sl.find_one(connection, columns_table,
                                  normalised=normalised_headers)
        if mapping_row is None or not mapping_row.get('valid'):
            # This table is unmapped, cannot be condensed
            return
        column_mapping = json.loads(mapping_row['column_map'])

        # Build the final mapping from input column to output column,
        # leaving out normalised names with no (or an empty) target.
        mapping = {}
        for raw_column, normalised in normalise_map.iteritems():
            if normalised not in column_mapping:
                continue
            target = column_mapping[normalised]
            if target is None or len(target) == 0:
                continue
            mapping[raw_column] = target

        for row in sl.all(connection, raw_table):
            spending_row = {}
            for key, value in row.items():
                if key not in mapping or not value:
                    continue
                cleaned = value.strip()
                if len(cleaned) == 0:
                    continue
                # The first non-blank value wins when several input
                # columns map onto the same output column.
                spending_row.setdefault(mapping[key], cleaned)
            sl.add_row(connection, spending_table, spending_row)

        record = {
            'resource_id': resource_id,
            'table_id': table_id,
            'condense_time': time.time() - start,
        }
        sl.upsert(connection, condensed_table, record,
                  ['resource_id', 'table_id'])
        trans.commit()
    finally:
        connection.close()
def extract_resource_core(engine, row, stats):
    """Load every usable sheet of a retrieved source file into raw tables.

    The file at ``source_path(row)`` is sniffed first: empty files, HTML
    pages and PDFs are reported through ``issue`` and counted in ``stats``
    without opening a database connection.  Anything else is parsed as XLS
    (compound-document signature), XLSX (``PK`` signature) or CSV.  Each
    sheet with more than one header is written to a freshly dropped and
    recreated ``raw_<resource_id>_sheet<n>`` table, all inside a single
    transaction.

    :param engine: database engine, used for the extraction connection and
        passed to ``issue`` for problem reporting.
    :param row: record describing the retrieved resource; ``resource_id``
        and ``retrieve_hash`` are read from it.
    :param stats: collector whose ``add_source(label, row)`` is called with
        the outcome.
    :returns: ``(success, sheets)`` where ``sheets`` is the number of
        sheets stored and ``success`` is true when that number is positive.
        Rejected files and failures inside the extraction transaction yield
        ``(False, 0)``.
    """
    connection = None
    fh = open(source_path(row), 'rb')
    try:
        source_data = fh.read()

        # Reject files that cannot be a transaction report before any
        # database resources are acquired.
        rejection = None
        if not source_data:
            rejection = ('Empty file', "Empty file")
        elif html_re.search(source_data[0:1024]) is not None:
            rejection = ('HTML file',
                         "HTML file detected, not a transaction report")
        elif source_data.startswith('%PDF'):
            rejection = ('PDF file',
                         "PDF file detected, not a transaction report")
        if rejection is not None:
            label, message = rejection
            issue(engine, row['resource_id'], row['retrieve_hash'], STAGE,
                  message)
            stats.add_source(label, row)
            return False, 0

        connection = engine.connect()
        trans = connection.begin()
        try:
            if source_data.startswith(COMPDOC_SIGNATURE):
                fh.seek(0)
                table_set = XLSTableSet(fh)
            elif source_data.startswith('PK'):
                table_set = XLSXTableSet(filename=source_path(row))
            else:
                from StringIO import StringIO
                sio = StringIO(source_data)
                encoding = None
                detected = chardet.detect(source_data[:200])
                log.debug('Encoding detected as: %s', detected.get('encoding'))
                if detected.get('encoding') == 'ISO-8859-2' and '\xa3' in source_data:
                    # Detected as Latin2 but probably isn't - that is for
                    # Eastern European languages. Probably the presence of
                    # a GBP pound sign has foxed chardet. It is pretty
                    # certain that it is a single-byte ASCII-variant, and
                    # my money is on Windows-1252.
                    encoding = 'windows-1252'
                    log.debug('Probably not ISO-8859-2 because it has GBP symbol, so assuming it is Windows-1252')
                table_set = CSVTableSet(sio, encoding=encoding)

            sheets = 0
            for sheet_id, row_set in enumerate(table_set.tables):
                offset, headers = headers_guess(row_set.sample)
                headers = [convert_(header) for header in headers]
                log.debug("Headers: %r", headers)
                # Sheets with at most one header are skipped and not
                # counted.
                if len(headers) <= 1:
                    continue
                sheets += 1

                row_set.register_processor(headers_processor(headers))
                row_set.register_processor(offset_processor(offset + 1))

                raw_table_name = 'raw_%s_sheet%s' % (row['resource_id'], sheet_id)
                sl.drop_table(connection, raw_table_name)
                raw_table = sl.get_table(connection, raw_table_name)

                # with one header row, offset=0 and we want row_number=1 so
                # that the first data row is row_number=2, matching the row
                # number as seen in Excel
                row_number = offset + 1
                for row_ in row_set:
                    cells = dict((keyify(c.column), convert_(c.value))
                                 for c in row_ if len(c.column.strip()))
                    # Blank rows still advance the counter so stored
                    # numbers keep matching the source file.
                    row_number += 1
                    if is_row_blank(cells):
                        continue
                    cells['row_number'] = row_number
                    sl.add_row(connection, raw_table, cells)

            trans.commit()
            log.debug(stats.add_source('Extracted ok', row))
            return sheets > 0, sheets
        except Exception as ex:
            log.exception(ex)
            issue(engine, row['resource_id'], row['retrieve_hash'], STAGE,
                  unicode(ex))
            stats.add_source('Exception: %s' % ex.__class__.__name__, row)
            return False, 0
    finally:
        # Always release the file and the connection.
        # NOTE(review): closing the connection is relied on to discard a
        # transaction that never reached commit -- confirm for the driver.
        fh.close()
        if connection is not None:
            connection.close()
def extract_table(engine, table, row, resource_id, force):
    """Extract every table of a source file into ``raw_*`` database tables.

    The function returns silently when the source file is missing, or when
    the resource already has an entry in ``extracted`` and ``force`` is
    false.  Otherwise the file is parsed as XLS, XLSX or CSV and each table
    with more than one header is written to a freshly dropped and recreated
    ``raw_<resource_id>_table<n>`` table.  The index of the last table
    enumerated and the elapsed time are then upserted into ``extracted``.
    All writes share one transaction.

    :param engine: database engine.
    :param table: not used by this function.
    :param row: record passed to ``source_path`` to locate the file.
    :param resource_id: identifier used in table names and bookkeeping.
    :param force: when true, re-extract even if already recorded.
    :raises AssertionError: if the file is empty, looks like HTML or PDF,
        contains no tables, or its only table has a single column.
    """
    # For now, interpret lack of data as not-failure at this stage, on
    # the basis that it was already reported as failure at the
    # retrieve stage and will just clutter up this list.
    if not os.path.exists(source_path(row)):
        return

    connection = engine.connect()
    fh = None
    try:
        extracted_table = sl.get_table(connection, "extracted")

        # Skip over tables we have already extracted
        if not force and sl.find_one(engine, extracted_table,
                                     resource_id=resource_id) is not None:
            return

        fh = open(source_path(row), "rb")
        source_data = fh.read()

        # Raised explicitly (rather than via ``assert``) so the checks are
        # not stripped when Python runs with -O.
        if not len(source_data) > 0:
            raise AssertionError("Empty file")
        if html_re.search(source_data[0:1024]) is not None:
            raise AssertionError("Looks like HTML")
        if source_data.startswith("%PDF"):
            raise AssertionError("Looks like PDF")

        trans = connection.begin()
        start = time.time()

        if source_data.startswith(COMPDOC_SIGNATURE):
            fh.seek(0)
            table_set = XLSTableSet.from_fileobj(fh)
        elif source_data.startswith("PK"):
            table_set = XLSXTableSet(source_path(row))
        else:
            # Re-open the file as text using the detected encoding.
            cd = chardet.detect(source_data)
            fh.close()
            fh = codecs.open(source_path(row), "r", cd["encoding"])
            table_set = CSVTableSet.from_fileobj(fh)

        tables = table_set.tables
        # Without any table the bookkeeping below has no table index to
        # record; fail with a clear message instead of an unbound name.
        if len(tables) == 0:
            raise AssertionError("No tables found in source file")

        for table_id, row_set in enumerate(tables):
            offset, headers = headers_guess(row_set.sample)
            headers = [convert_(header) for header in headers]
            if not (len(headers) > 1 or len(tables) > 1):
                raise AssertionError(
                    "Only one column was detected; assuming this is not valid data.")

            # We might have multiple table sets where one is blank or ranty
            # text or something. Skip those.
            if len(headers) <= 1:
                continue

            row_set.register_processor(headers_processor(headers))
            row_set.register_processor(offset_processor(offset + 1))

            raw_table_name = "raw_%s_table%s" % (resource_id, table_id)
            sl.drop_table(connection, raw_table_name)
            raw_table = sl.get_table(connection, raw_table_name)

            for row_ in row_set:
                cells = dict((keyify(c.column), convert_(c.value))
                             for c in row_ if len(c.column.strip()))
                sl.add_row(connection, raw_table, cells)

        # ``table_id`` is the index of the last table enumerated, whether
        # or not that table was skipped above.
        sl.upsert(
            connection,
            extracted_table,
            {"resource_id": resource_id,
             "max_table_id": table_id,
             "extraction_time": time.time() - start},
            ["resource_id"],
        )
        trans.commit()
    finally:
        # Release the file and the connection on every exit path.
        if fh is not None:
            fh.close()
        connection.close()
def extract_resource_core(engine, row, stats):
    """Load every usable sheet of a retrieved source file into raw tables.

    The file at ``source_path(row)`` is sniffed first: empty files, HTML
    pages and PDFs are reported through ``issue`` and counted in ``stats``
    without opening a database connection.  Anything else is parsed as XLS
    (compound-document signature), XLSX (``PK`` signature) or CSV.  Each
    sheet with more than one header is written to a freshly dropped and
    recreated ``raw_<resource_id>_sheet<n>`` table, all inside a single
    transaction.

    :param engine: database engine, used for the extraction connection and
        passed to ``issue`` for problem reporting.
    :param row: record describing the retrieved resource; ``resource_id``
        and ``retrieve_hash`` are read from it.
    :param stats: collector whose ``add_source(label, row)`` is called with
        the outcome.
    :returns: ``(success, sheets)`` where ``sheets`` is the number of
        sheets stored and ``success`` is true when that number is positive.
        Rejected files and failures inside the extraction transaction yield
        ``(False, 0)``.
    """
    connection = None
    fh = open(source_path(row), 'rb')
    try:
        source_data = fh.read()

        # Reject files that cannot be a transaction report before any
        # database resources are acquired.
        rejection = None
        if not source_data:
            rejection = ('Empty file', "Empty file")
        elif html_re.search(source_data[0:1024]) is not None:
            rejection = ('HTML file',
                         "HTML file detected, not a transaction report")
        elif source_data.startswith('%PDF'):
            rejection = ('PDF file',
                         "PDF file detected, not a transaction report")
        if rejection is not None:
            label, message = rejection
            issue(engine, row['resource_id'], row['retrieve_hash'], STAGE,
                  message)
            stats.add_source(label, row)
            return False, 0

        connection = engine.connect()
        trans = connection.begin()
        try:
            if source_data.startswith(COMPDOC_SIGNATURE):
                fh.seek(0)
                table_set = XLSTableSet(fh)
            elif source_data.startswith('PK'):
                table_set = XLSXTableSet(filename=source_path(row))
            else:
                from StringIO import StringIO
                sio = StringIO(source_data)
                encoding = None
                detected = chardet.detect(source_data[:200])
                log.debug('Encoding detected as: %s', detected.get('encoding'))
                if detected.get('encoding') == 'ISO-8859-2' and '\xa3' in source_data:
                    # Detected as Latin2 but probably isn't - that is for
                    # Eastern European languages. Probably the presence of
                    # a GBP pound sign has foxed chardet. It is pretty
                    # certain that it is a single-byte ASCII-variant, and
                    # my money is on Windows-1252.
                    encoding = 'windows-1252'
                    log.debug('Probably not ISO-8859-2 because it has GBP symbol, so assuming it is Windows-1252')
                table_set = CSVTableSet(sio, encoding=encoding)

            sheets = 0
            for sheet_id, row_set in enumerate(table_set.tables):
                offset, headers = headers_guess(row_set.sample)
                headers = [convert_(header) for header in headers]
                log.debug("Headers: %r", headers)
                # Sheets with at most one header are skipped and not
                # counted.
                if len(headers) <= 1:
                    continue
                sheets += 1

                row_set.register_processor(headers_processor(headers))
                row_set.register_processor(offset_processor(offset + 1))

                raw_table_name = 'raw_%s_sheet%s' % (row['resource_id'], sheet_id)
                sl.drop_table(connection, raw_table_name)
                raw_table = sl.get_table(connection, raw_table_name)

                # with one header row, offset=0 and we want row_number=1 so
                # that the first data row is row_number=2, matching the row
                # number as seen in Excel
                row_number = offset + 1
                for row_ in row_set:
                    cells = dict((keyify(c.column), convert_(c.value))
                                 for c in row_ if len(c.column.strip()))
                    # Blank rows still advance the counter so stored
                    # numbers keep matching the source file.
                    row_number += 1
                    if is_row_blank(cells):
                        continue
                    cells['row_number'] = row_number
                    sl.add_row(connection, raw_table, cells)

            trans.commit()
            log.debug(stats.add_source('Extracted ok', row))
            return sheets > 0, sheets
        except Exception as ex:
            log.exception(ex)
            issue(engine, row['resource_id'], row['retrieve_hash'], STAGE,
                  unicode(ex))
            stats.add_source('Exception: %s' % ex.__class__.__name__, row)
            return False, 0
    finally:
        # Always release the file and the connection.
        # NOTE(review): closing the connection is relied on to discard a
        # transaction that never reached commit -- confirm for the driver.
        fh.close()
        if connection is not None:
            connection.close()
def extract_resource_core(engine, row):
    """Load every usable sheet of a retrieved source file into raw tables.

    The file at ``source_path(row)`` is sniffed first: empty files, HTML
    pages and PDFs are reported through ``issue`` without opening a
    database connection.  Anything else is parsed as XLS
    (compound-document signature), XLSX (``PK`` signature) or CSV.  Each
    sheet with more than one header is written to a freshly dropped and
    recreated ``raw_<resource_id>_sheet<n>`` table, all inside a single
    transaction.

    :param engine: database engine, used for the extraction connection and
        passed to ``issue`` for problem reporting.
    :param row: record describing the retrieved resource; ``resource_id``
        and ``retrieve_hash`` are read from it.
    :returns: ``(success, sheets)`` where ``sheets`` is the number of
        sheets stored and ``success`` is true when that number is positive.
        Rejected files and failures inside the extraction transaction yield
        ``(False, 0)``.
    """
    connection = None
    fh = open(source_path(row), 'rb')
    try:
        source_data = fh.read()

        # Reject files that cannot be a transaction report before any
        # database resources are acquired.
        message = None
        if not source_data:
            message = "Empty file"
        elif html_re.search(source_data[0:1024]) is not None:
            message = "HTML file detected, not a transaction report"
        elif source_data.startswith('%PDF'):
            message = "PDF file detected, not a transaction report"
        if message is not None:
            issue(engine, row['resource_id'], row['retrieve_hash'], message)
            return False, 0

        connection = engine.connect()
        trans = connection.begin()
        try:
            if source_data.startswith(COMPDOC_SIGNATURE):
                fh.seek(0)
                table_set = XLSTableSet(fh)
            elif source_data.startswith('PK'):
                table_set = XLSXTableSet(source_path(row))
            else:
                # CSV is parsed from the bytes already read; no explicit
                # encoding is passed to the parser.
                from StringIO import StringIO
                sio = StringIO(source_data)
                table_set = CSVTableSet(sio)

            sheets = 0
            for sheet_id, row_set in enumerate(table_set.tables):
                offset, headers = headers_guess(row_set.sample)
                headers = [convert_(header) for header in headers]
                log.debug("Headers: %r", headers)
                # Sheets with at most one header are skipped and not
                # counted.
                if len(headers) <= 1:
                    continue
                sheets += 1

                row_set.register_processor(headers_processor(headers))
                row_set.register_processor(offset_processor(offset + 1))

                raw_table_name = 'raw_%s_sheet%s' % (row['resource_id'], sheet_id)
                sl.drop_table(connection, raw_table_name)
                raw_table = sl.get_table(connection, raw_table_name)

                for row_ in row_set:
                    cells = dict((keyify(c.column), convert_(c.value))
                                 for c in row_ if len(c.column.strip()))
                    sl.add_row(connection, raw_table, cells)

            trans.commit()
            return sheets > 0, sheets
        except Exception as ex:
            log.exception(ex)
            issue(engine, row['resource_id'], row['retrieve_hash'],
                  unicode(ex))
            return False, 0
    finally:
        # Always release the file and the connection.
        # NOTE(review): closing the connection is relied on to discard a
        # transaction that never reached commit -- confirm for the driver.
        fh.close()
        if connection is not None:
            connection.close()