def _message(self):
    return self.template.format(
        field=self.sniffer.field['name'],
        guesses=format_to_json(self.sniffer.format_guesses),
        failed_rows=format_to_json(self.sniffer.failures[:10]),
        nb_failures=self.sniffer.nb_failures,
        max_nb_failures=self.sniffer.max_nb_failures,
        sample_size=self.sniffer.sample_size,
    )
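`format_to_json` is used by every snippet on this page but is not shown; a minimal sketch, assuming it is just a pretty-printing wrapper around `json.dumps`:

import json

def format_to_json(data):
    """Pretty-print data for log messages (hypothetical sketch of the helper)."""
    return json.dumps(data, indent=4, ensure_ascii=False, default=str)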
    def _log_parameters(self):
        """Record ingestion parameters to the log stream."""

        fields_as_json = format_to_json(sorted(self._fields))
        headers_as_json = format_to_json(sorted(self._headers))
        options_as_json = format_to_json(self._body_options)
        nb_empty_headers = len(self._fields) - len(self._headers)

        logging.info('Ignoring %s empty header fields', nb_empty_headers)
        logging.info('%s sourced fields = %s', len(self._fields), fields_as_json)
        logging.info('%s data fields = %s', len(self._headers), headers_as_json)
        logging.info('Ingestor options = %s', options_as_json)
    @staticmethod
    def _drop_bad_rows(rows):
        """Trim trailing empty cells and abort on mismatched rows (post-processor)."""

        for index, headers, row in rows:
            # Strip trailing empty cells before comparing lengths.
            while len(row) > len(headers) and len(row[-1].strip()) == 0:
                row = row[:-1]
            if len(row) == len(headers):
                yield index, headers, row
            else:
                message = 'Bad row {}:\nheaders={}\nrow={}'\
                    .format(index, format_to_json(headers), format_to_json(row))
                assert False, message
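`_drop_bad_rows` follows the same `(index, headers, row)` generator convention as the other post-parse processors on this page. `force_strings`, which `stream_local_file` registers further down but which is not shown here, presumably follows it too; a hedged sketch:

def force_strings(rows):
    """Cast every non-empty cell to a string (hypothetical sketch)."""
    for index, headers, row in rows:
        yield index, headers, [str(cell) if cell is not None else None
                               for cell in row]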
def check_fields_match(resource, stream):
    """Check if the datapackage and the data have the same set of fields."""

    data_fields = [str(field) for field in stream.headers if field]
    sourced_fields = [field['name'] for field in resource['schema']['fields']]
    nb_untitled_fields = len(stream.headers) - len(data_fields)

    fields_as_json = format_to_json(sorted(sourced_fields))
    data_fields_as_json = format_to_json(sorted(data_fields))

    info('%s fields sourced = %s', len(sourced_fields), fields_as_json)
    info('%s untitled fields in the data', nb_untitled_fields)
    info('%s fields in the data = %s', len(data_fields), data_fields_as_json)

    message = 'Data and source fields do not match'
    assert set(data_fields) == set(sourced_fields), message
def skip_rows(rows):
    # `row_to_skip` comes from an enclosing scope
    # (see the `get_skip_rows` factory sketched below).
    for index, headers, row in rows:
        if index not in row_to_skip:
            yield (index, headers, row)
        else:
            row_as_json = format_to_json(dict(zip(headers, row)))
            warning('Skipping row %s = %s', index, row_as_json)
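`row_to_skip` is not defined in the snippet above; it presumably comes from the enclosing `get_skip_rows` factory that `stream_local_file` calls further down. A minimal sketch of that factory, reusing the module's `warning` and `format_to_json` helpers:

def get_skip_rows(row_to_skip):
    """Return a post-parse processor bound to the given row numbers (sketch)."""

    def skip_rows(rows):
        for index, headers, row in rows:
            if index not in row_to_skip:
                yield (index, headers, row)
            else:
                row_as_json = format_to_json(dict(zip(headers, row)))
                warning('Skipping row %s = %s', index, row_as_json)

    return skip_rows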
        def fill_missing_fields():
            """Pre-fill missing keys with None in every JSON row (nested helper)."""
            with open(self.resource['path']) as stream:
                rows = json.loads(stream.read())

            for row in rows:
                for header in self._raw_headers:
                    if header not in row:
                        row[header] = None

            with open(self.resource['path'], 'w+') as stream:
                stream.write(format_to_json(rows))
def fill_missing_fields(path):
    """Pre-fill incomplete JSON rows (to avoid fields mixing up)."""

    headers = get_json_headers(path)

    with open(path) as stream:
        rows = json.loads(stream.read())

    for row in rows:
        for header in headers:
            if header not in row:
                row[header] = None

    with open(path, 'w+') as stream:
        stream.write(format_to_json(rows))
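`get_json_headers` is not shown on this page; a plausible sketch, assuming it simply collects the union of keys across all rows of the JSON file:

import json

def get_json_headers(path):
    """Return every key that appears in any row of a JSON array file (sketch)."""
    with open(path) as stream:
        rows = json.loads(stream.read())

    headers = set()
    for row in rows:
        headers.update(row.keys())
    return sorted(headers)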
def assemble_fiscal_datapackage():
    """Assemble the fiscal datapackage for the concatenated dataset."""

    with open(FISCAL_METADATA_FILE) as stream:
        fdp = yaml.safe_load(stream)

    with open(FISCAL_MODEL_FILE) as stream:
        fdp['model'] = yaml.safe_load(stream)

    with open(FISCAL_SCHEMA_FILE) as stream:
        fdp['resources'][0]['schema'] = yaml.safe_load(stream)

    message = 'Fiscal datapackage: \n%s'
    info(message, format_to_json(fdp))

    return fdp
def stream_local_file(datapackage, **parameters):
    """Read local files and return row iterators."""

    if not parameters.get('sample_size'):
        parameters.update(sample_size=LOG_SAMPLE_SIZE)

    for resource in datapackage['resources']:
        path = resource['path']
        _, extension = os.path.splitext(path)

        parameters.update(headers=1)
        parameters['post_parse'] = []

        if 'parser_options' in resource:
            if resource['parser_options'].get('skip_rows'):
                row_numbers = resource['parser_options'].pop('skip_rows') or []
                if row_numbers:
                    parameters['post_parse'] = [get_skip_rows(row_numbers)]
            parameters.update(**resource.get('parser_options'))

        if extension == '.csv':
            parameters['post_parse'].append(drop_bad_rows)
            parameters.update(encoding=get_encoding(parameters, resource))

        if extension in ('.xls', '.xlsx'):
            parameters['post_parse'].append(force_strings)

        if extension == '.json':
            fill_missing_fields(path)
            parameters['post_parse'].append(force_strings)

        info('Ingesting file = %s', path)
        info('Ingestion parameters = %s', format_to_json(parameters))

        parameters.update(headers=get_headers(parameters, path))

        with Stream(path, **parameters) as stream:
            check_fields_match(resource, stream)
            log_sample_table(stream)
            yield stream.iter(keyed=True)
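For illustration, a hedged usage sketch driving `stream_local_file` with a minimal datapackage dict; the file name and field names are invented, and the module's own helpers (`get_headers`, `get_encoding`, `LOG_SAMPLE_SIZE`, ...) are assumed to be importable alongside it:

datapackage = {
    'resources': [{
        'path': 'example.csv',  # hypothetical local file
        'schema': {'fields': [{'name': 'year'}, {'name': 'amount'}]},
    }],
}

for keyed_rows in stream_local_file(datapackage, sample_size=100):
    for row in keyed_rows:
        print(row)  # e.g. {'year': '2016', 'amount': '1000'}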
def parse_currencies(row, fields=None, characters=None):
    """Clean up and convert currency fields to floats."""

    assert fields, 'Missing `fields` parameter'
    assert characters, 'Missing `characters` parameter'

    for key in fields:
        if row[key] is not None:
            row[key] = str(row[key])

            if not row[key].strip():
                row[key] = None
            else:
                try:
                    row[key] = float(
                        row[key]
                        .replace(characters['currency'], '')
                        .replace(characters['grouping'], '')
                        .replace(characters['decimal'], '.')
                        .strip()
                    )
                except ValueError as error:
                    warning('%s in row\n%s', error, format_to_json(row))
    return row
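A hedged example of calling `parse_currencies` on a single keyed row; the field names and character map below are invented for illustration:

row = {'year': '2016', 'amount': '1 234,56 €'}
characters = {'currency': '€', 'grouping': ' ', 'decimal': ','}

row = parse_currencies(row, fields=['amount'], characters=characters)
# row['amount'] is now the float 1234.56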