def _message(self):
    """Build the failure message from the template and the sniffer state."""
    return self.template.format(
        field=self.sniffer.field['name'],
        guesses=format_to_json(self.sniffer.format_guesses),
        failed_rows=format_to_json(self.sniffer.failures[:10]),
        nb_failures=self.sniffer.nb_failures,
        max_nb_failures=self.sniffer.max_nb_failures,
        sample_size=self.sniffer.sample_size,
    )
def _log_parameters(self):
    """Record ingestion parameters to the log stream."""
    fields_as_json = format_to_json(sorted(self._fields))
    headers_as_json = format_to_json(sorted(self._headers))
    options_as_json = format_to_json(self._body_options)
    nb_empty_headers = len(self._fields) - len(self._headers)

    logging.info('Ignoring %s empty header fields', nb_empty_headers)
    logging.info('%s sourced fields = %s', len(self._fields), fields_as_json)
    logging.info('%s data fields = %s', len(self._headers), headers_as_json)
    logging.info('Ingestor options = %s', options_as_json)
def _drop_bad_rows(rows):
    """Drop rows when they don't match headers (post-processor)."""
    for index, headers, row in rows:
        # Trim trailing blank cells so padded rows can still match the headers.
        while len(row) > len(headers) and len(row[-1].strip()) == 0:
            row = row[:-1]
        if len(row) == len(headers):
            yield index, headers, row
        else:
            message = 'Bad row {}:\nheaders={}\nrow={}'.format(
                index, format_to_json(headers), format_to_json(row))
            assert False, message
def _log_parameters(self):
    """Record ingestion parameters to the log stream."""
    fields_as_json = format_to_json(sorted(self._fields))
    headers_as_json = format_to_json(sorted(self._headers))
    options_as_json = format_to_json(self._body_options)
    nb_empty_headers = len(self._fields) - len(self._headers)

    info('Ignoring %s empty header fields', nb_empty_headers)
    info('%s sourced fields = %s', len(self._fields), fields_as_json)
    info('%s data fields = %s', len(self._headers), headers_as_json)
    info('Ingestor options = %s', options_as_json)
def check_fields_match(resource, stream):
    """Check if the datapackage and the data have the same set of fields."""
    data_fields = [str(field) for field in stream.headers if field]
    sourced_fields = [field['name'] for field in resource['schema']['fields']]
    nb_untitled_fields = len(stream.headers) - len(data_fields)
    fields_as_json = format_to_json(sorted(sourced_fields))
    data_fields_as_json = format_to_json(sorted(data_fields))

    info('%s fields sourced = %s', len(sourced_fields), fields_as_json)
    info('%s untitled fields in the data', nb_untitled_fields)
    info('%s fields in the data = %s', len(data_fields), data_fields_as_json)

    message = 'Data and source fields do not match'
    assert set(data_fields) == set(sourced_fields), message
def skip_rows(rows):
    """Skip the configured row numbers (post-parse processor)."""
    # `row_to_skip` is a free variable, presumably captured from an enclosing
    # `get_skip_rows` factory (see `stream_local_file` below).
    for index, headers, row in rows:
        if index not in row_to_skip:
            yield (index, headers, row)
        else:
            row_as_json = format_to_json(dict(zip(headers, row)))
            warning('Skipping row %s = %s', index, row_as_json)
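# Hedged sketch, not the repository's code: since `skip_rows` reads the free
# variable `row_to_skip` and `stream_local_file` calls
# `get_skip_rows(row_numbers)`, the factory presumably looks something like
# the following, closing over the configured row numbers.
def get_skip_rows(row_to_skip):
    def skip_rows(rows):
        for index, headers, row in rows:
            if index not in row_to_skip:
                yield (index, headers, row)
            else:
                row_as_json = format_to_json(dict(zip(headers, row)))
                warning('Skipping row %s = %s', index, row_as_json)
    return skip_rows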
def fill_missing_fields():
    """Pre-fill incomplete JSON rows (to avoid fields mixing up)."""
    # `self` is a free variable here, presumably captured from an enclosing
    # method scope (compare the module-level variant below).
    with open(self.resource['path']) as stream:
        rows = json.loads(stream.read())
    for row in rows:
        for header in self._raw_headers:
            if header not in row:
                row[header] = None
    with open(self.resource['path'], 'w+') as stream:
        stream.write(format_to_json(rows))
def fill_missing_fields(path):
    """Pre-fill incomplete JSON rows (to avoid fields mixing up)."""
    headers = get_json_headers(path)
    with open(path) as stream:
        rows = json.loads(stream.read())
    for row in rows:
        for header in headers:
            if header not in row:
                row[header] = None
    with open(path, 'w+') as stream:
        stream.write(format_to_json(rows))
def assemble_fiscal_datapackage():
    """Assemble the fiscal datapackage for the concatenated dataset."""
    with open(FISCAL_METADATA_FILE) as stream:
        fdp = yaml.load(stream.read())
    with open(FISCAL_MODEL_FILE) as stream:
        fdp['model'] = yaml.load(stream.read())
    with open(FISCAL_SCHEMA_FILE) as stream:
        fdp['resources'][0]['schema'] = yaml.load(stream.read())

    message = 'Fiscal datapackage: \n%s'
    info(message, format_to_json(fdp))
    return fdp
def stream_local_file(datapackage, **parameters):
    """Read local files and return row iterators."""
    if not parameters.get('sample_size'):
        parameters.update(sample_size=LOG_SAMPLE_SIZE)

    for resource in datapackage['resources']:
        path = resource['path']
        _, extension = os.path.splitext(path)

        parameters.update(headers=1)
        parameters['post_parse'] = []

        # Resource-specific parser options, including rows to skip.
        if 'parser_options' in resource:
            if resource['parser_options'].get('skip_rows'):
                row_numbers = resource['parser_options'].pop('skip_rows') or []
                if row_numbers:
                    parameters['post_parse'] = [get_skip_rows(row_numbers)]
            parameters.update(**resource.get('parser_options'))

        # Extension-specific encoding and post-parse processors.
        if extension == '.csv':
            parameters['post_parse'].append(drop_bad_rows)
            parameters.update(encoding=get_encoding(parameters, resource))
        if extension in ('.xls', '.xlsx'):
            parameters['post_parse'].append(force_strings)
        if extension == '.json':
            fill_missing_fields(path)
            parameters['post_parse'].append(force_strings)

        info('Ingesting file = %s', path)
        info('Ingestion parameters = %s', format_to_json(parameters))
        parameters.update(headers=get_headers(parameters, path))

        with Stream(path, **parameters) as stream:
            check_fields_match(resource, stream)
            log_sample_table(stream)
            yield stream.iter(keyed=True)
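# Hedged usage sketch: `stream_local_file` yields one keyed-row iterator per
# resource. The descriptor below is a minimal assumption (a single CSV
# resource with a one-field schema); real descriptors carry full metadata.
if __name__ == '__main__':
    descriptor = {'resources': [{'path': 'data/budget.csv',
                                 'schema': {'fields': [{'name': 'amount'}]}}]}
    for rows in stream_local_file(descriptor):
        for row in rows:
            print(row)  # each row is a dict keyed by header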
def parse_currencies(row, fields=None, characters=None):
    """Clean up and convert currency fields to floats."""
    assert fields, 'Missing `fields` parameter'
    assert characters, 'Missing `characters` parameter'

    for key in fields:
        if row[key] is not None:
            row[key] = str(row[key])
            if not row[key].strip():
                row[key] = None
            else:
                try:
                    row[key] = float(row[key]
                                     .replace(characters['currency'], '')
                                     .replace(characters['grouping'], '')
                                     .replace(characters['decimal'], '.')
                                     .strip())
                except ValueError as error:
                    warning('%s in row\n%s', error, format_to_json(row))
    return row
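# Hedged usage sketch: the `characters` mapping keys mirror the lookups in
# `parse_currencies`; the values below assume a European-style source
# ('€' symbol, '.' grouping, ',' decimal) and are illustrative only.
if __name__ == '__main__':
    characters = {'currency': '€', 'grouping': '.', 'decimal': ','}
    row = parse_currencies({'amount': '€ 1.234,56', 'year': '2016'},
                           fields=['amount'], characters=characters)
    assert row == {'amount': 1234.56, 'year': '2016'}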