Example no. 1
    def table(self):
        """Build and cache a table from query results"""
        if self._table is None:
            schema_obj = Schema(self._schema)

            table = []
            if 'results' in self.raw_data:
                field_names = [field.name for field in schema_obj.fields]
                result_vars = self.raw_data['head']['vars']

                for binding in self.raw_data['results']['bindings']:
                    rdf_terms = table_schema.order_terms_in_binding(
                        result_vars, binding)

                    values = []
                    for rdf_term in rdf_terms:
                        if rdf_term is not None:
                            values.append(rdf_term['value'])
                        else:
                            values.append(None)

                    table_row = schema_obj.cast_row(values)
                    table.append(OrderedDict(zip(field_names, table_row)))
            elif 'boolean' in self.raw_data:
                # Results of an ASK query
                table = [{'boolean': self.raw_data['boolean']}]

            self._table = table

        return self._table
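A minimal sketch of the SPARQL JSON results payload this method consumes (the 'head'/'vars', 'results'/'bindings', and per-term 'value' keys are the ones looked up above; the concrete bindings are illustrative only):

raw_data = {
    'head': {'vars': ['name', 'age']},
    'results': {'bindings': [
        # one binding per result row; a variable may be left unbound
        {'name': {'type': 'literal', 'value': 'Alice'},
         'age': {'type': 'literal', 'value': '42'}},
        {'name': {'type': 'literal', 'value': 'Bob'}},
    ]},
}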
Example no. 2
    def iter(self, bucket):

        # Get response
        descriptor = self.describe(bucket)
        schema = Schema(descriptor)
        tablename = mappers.bucket_to_tablename(self.__prefix, bucket)
        response = self.__service.tabledata().list(
            projectId=self.__project,
            datasetId=self.__dataset,
            tableId=tablename).execute()

        # Yield rows
        for fields in response['rows']:
            row = []
            values = [field['v'] for field in fields['f']]
            for index, field in enumerate(schema.fields):
                value = values[index]
                # Fix BigQuery date/datetime cells returned as epoch
                # seconds like "1.234234E9"
                if field.type == 'date':
                    value = datetime.datetime.utcfromtimestamp(
                        int(float(value)))
                    fmt = '%Y-%m-%d'
                    if field.format.startswith('fmt:'):
                        fmt = field.format.replace('fmt:', '')
                    value = value.strftime(fmt)
                elif field.type == 'datetime':
                    value = datetime.datetime.utcfromtimestamp(
                        int(float(value)))
                    value = '%sZ' % value.isoformat()
                row.append(value)
            yield schema.cast_row(row)
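A standalone sketch of the timestamp fix above: BigQuery returns DATE and DATETIME cells as epoch seconds in scientific notation (e.g. "1.234234E9"), so the value is parsed as a float, truncated to an int and rendered via utcfromtimestamp:

import datetime

raw = '1.234234E9'
value = datetime.datetime.utcfromtimestamp(int(float(raw)))
print(value.strftime('%Y-%m-%d'))  # date fields: '2009-02-10'
print('%sZ' % value.isoformat())   # datetime fields: '2009-02-10T02:46:40Z'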
def test_init():
    # Valid
    assert Schema(DESCRIPTOR_MIN)
    assert Schema(DESCRIPTOR_MAX)
    assert Schema('data/schema_valid_full.json')
    assert Schema('data/schema_valid_simple.json')
    # Invalid
    with pytest.raises(exceptions.SchemaValidationError) as exception:
        Schema('data/schema_invalid_multiple_errors.json')
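The DESCRIPTOR_MIN and DESCRIPTOR_MAX fixtures themselves are not shown in these examples. A hypothetical pair consistent with the assertions made by the tests on this page (the real fixtures may differ in detail):

DESCRIPTOR_MIN = {
    'fields': [
        {'name': 'id', 'type': 'string'},
        {'name': 'height', 'type': 'integer'},
    ],
}

DESCRIPTOR_MAX = {
    'fields': [
        {'name': 'id', 'type': 'string'},
        {'name': 'height', 'type': 'number'},
        {'name': 'age', 'type': 'integer'},
        {'name': 'name', 'type': 'string'},
        {'name': 'occupation', 'type': 'string'},
    ],
    'primaryKey': ['id'],
    'foreignKeys': [
        {'fields': 'name',
         'reference': {'resource': 'people', 'fields': 'firstname'}},
    ],
}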
Example no. 5
def descriptor_and_rows_to_dataframe(descriptor, rows):

    # Prepare
    primary_key = None
    schema = Schema(descriptor)
    if len(schema.primary_key) == 1:
        primary_key = schema.primary_key[0]
    elif len(schema.primary_key) > 1:
        raise RuntimeError('Multi-column primary keys are not supported')

    # Get data/index
    data_rows = []
    index_rows = []
    jtstypes_map = {}
    for row in rows:
        values = []
        index = None
        for field, value in zip(schema.fields, row):
            try:
                value = field.cast_value(value)
            except InvalidObjectType:
                value = json.loads(value)
            if value is None and field.type in ('number', 'integer'):
                jtstypes_map[field.name] = 'number'
                value = np.NaN
            if field.name == primary_key:
                index = value
            else:
                values.append(value)
        data_rows.append(tuple(values))
        index_rows.append(index)

    # Get dtypes
    dtypes = []
    for field in schema.fields:
        if field.name != primary_key:
            field_name = field.name
            if six.PY2:
                field_name = field.name.encode('utf-8')
            dtype = jtstype_to_dtype(jtstypes_map.get(field.name, field.type))
            dtypes.append((field_name, dtype))

    # Create dataframe
    index = None
    columns = schema.headers
    array = np.array(data_rows, dtype=dtypes)
    if primary_key:
        index_field = schema.get_field(primary_key)
        index_dtype = jtstype_to_dtype(index_field.type)
        index_class = pd.Index
        if index_field.type in ['datetime', 'date']:
            index_class = pd.DatetimeIndex
        index = index_class(index_rows, name=primary_key, dtype=index_dtype)
        columns = filter(lambda column: column != primary_key, schema.headers)
    dataframe = pd.DataFrame(array, index=index, columns=columns)

    return dataframe
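A hypothetical call to the helper above (the descriptor and rows are made up; the helper's own dependencies such as numpy, pandas and jtstype_to_dtype are assumed to be in scope as in the surrounding module):

descriptor = {
    'fields': [
        {'name': 'id', 'type': 'integer'},
        {'name': 'city', 'type': 'string'},
    ],
    'primaryKey': ['id'],
}
rows = [[1, 'london'], [2, 'paris']]
dataframe = descriptor_and_rows_to_dataframe(descriptor, rows)
# dataframe is indexed by 'id' and carries a single 'city' column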
def sunc_rows(descriptor, rows):
    result = []
    schema = Schema(descriptor)
    for row in rows:
        row = schema.cast_row(row)
        values = []
        for index, field in enumerate(descriptor['fields']):
            value = row[index]
            if field['type'] == 'date':
                value = datetime.datetime.fromordinal(value.toordinal())
            values.append(value)
        result.append(values)
    return result
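The date handling above round-trips a date through its ordinal, which yields a naive datetime at midnight of the same calendar day:

import datetime

d = datetime.date(2017, 5, 1)
print(datetime.datetime.fromordinal(d.toordinal()))  # 2017-05-01 00:00:00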
def test_descriptor():
    # Dict
    assert Schema(DESCRIPTOR_MIN).descriptor == DESCRIPTOR_MIN
    assert Schema(DESCRIPTOR_MAX).descriptor == DESCRIPTOR_MAX
    # Path
    path = 'data/schema_valid_simple.json'
    expect = Schema(path).descriptor
    actual = json.load(io.open(path, encoding='utf-8'))
    assert expect == actual
    # Url
    url = BASE_URL % 'data/schema_valid_simple.json'
    expect = Schema(url).descriptor
    actual = requests.get(url).json()
    assert expect == actual
Example no. 9
def datapackage(source, **options):
    errors = []
    tables = []

    # Prepare datapackage
    datapackage = DataPackage(source, **options)
    for exception in datapackage.iter_errors():
        # The error message should contain the datapackage source (often its path)
        message = spec['errors']['datapackage-error']['message']
        message = message.format(
            error_message='{problem} [{source}]'.format(
                problem=str(exception).splitlines()[0],
                source=str(source)))
        errors.append({
            'code': 'datapackage-error',
            'message': message,
            'row-number': None,
            'column-number': None,
        })

    # Add tables
    if not errors:
        for resource in datapackage.resources:
            path = resource.remote_data_path or resource.local_data_path
            tables.append({
                'source': path,
                'stream': Stream(path, headers=1),
                'schema': Schema(resource.descriptor['schema']),
                'extra': {
                    'datapackage': str(source),
                },
            })

    return errors, tables
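A hypothetical call to the helper above; the descriptor path is illustrative:

errors, tables = datapackage('datapackage.json')
if not errors:
    for table in tables:
        print(table['source'], table['schema'].headers)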
    def __write_rows_buffer(self, bucket, rows_buffer):

        # Process data to byte stream csv
        descriptor = self.describe(bucket)
        schema = Schema(descriptor)
        bytes = io.BufferedRandom(io.BytesIO())
        writer = unicodecsv.writer(bytes, encoding='utf-8')
        for values in rows_buffer:
            row = []
            values = schema.cast_row(values)
            for index, field in enumerate(schema.fields):
                value = values[index]
                # Here we convert date to datetime
                if field.type == 'date':
                    value = datetime.datetime.fromordinal(value.toordinal())
                    value = '%sZ' % value.isoformat()
                row.append(value)
            writer.writerow(row)
        bytes.seek(0)

        # Prepare job body
        tablename = mappers.bucket_to_tablename(self.__prefix, bucket)
        body = {
            'configuration': {
                'load': {
                    'destinationTable': {
                        'projectId': self.__project,
                        'datasetId': self.__dataset,
                        'tableId': tablename
                    },
                    'sourceFormat': 'CSV',
                }
            }
        }

        # Prepare job media body
        mimetype = 'application/octet-stream'
        media_body = MediaIoBaseUpload(bytes, mimetype=mimetype)

        # Make request to Big Query
        response = self.__service.jobs().insert(
            projectId=self.__project,
            body=body,
            media_body=media_body).execute()
        self.__wait_response(response)
Example no. 12
    def _iter_rows(self):
        if self._schema is not None:  # Not empty results
            schema_obj = Schema(self._schema)
            if 'results' in self.raw_data:
                field_names = [field.name for field in schema_obj.fields]
                result_vars = self.raw_data['head']['vars']

                for binding in self.raw_data['results']['bindings']:
                    rdf_terms = table_schema.order_terms_in_binding(
                        result_vars, binding)

                    values = []
                    for rdf_term in rdf_terms:
                        if rdf_term is not None:
                            values.append(rdf_term['value'])
                        else:
                            values.append(None)

                    table_row = schema_obj.cast_row(values)

                    # When a column is typed as a string, the jsontableschema
                    # library incorrectly maps several literal string values
                    # ('null', 'none', '-', etc.) to the Python `None` value.
                    # A deeper fix would be to reconsider using that library,
                    # or to fix the issue upstream (rendering a number of
                    # strings un-representable is probably not a good idea),
                    # but this works around the problem for our result sets:
                    # zip over each result row and, wherever a non-null value
                    # in a string field was mapped to `None`, restore the
                    # original value before continuing.
                    table_row = map(
                        lambda field, original, mapped: original
                        if (not mapped) and original and field.type == 'string'
                        else mapped, schema_obj.fields, values, table_row)

                    yield OrderedDict(zip(field_names, table_row))
            elif 'boolean' in self.raw_data:
                # Results of an ASK query
                yield {'boolean': self.raw_data['boolean']}
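A standalone sketch of the restore step described in the comment above, with a stubbed Field type purely for illustration: wherever cast_row collapsed a non-empty string such as 'null' or '-' to None in a string-typed column, the original value is put back:

from collections import namedtuple

Field = namedtuple('Field', 'name type')
fields = [Field('label', 'string'), Field('count', 'integer')]
values = ['null', '42']    # raw values taken from the bindings
table_row = [None, 42]     # what cast_row produced
restored = [
    original if (not mapped) and original and field.type == 'string' else mapped
    for field, original, mapped in zip(fields, values, table_row)]
print(restored)            # ['null', 42]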
Example no. 13
    def _iter_rows(self):
        if self._schema is not None:  # Not empty results
            schema_obj = Schema(self._schema)
            if 'results' in self.raw_data:
                field_names = [field.name for field in schema_obj.fields]
                result_vars = self.raw_data['head']['vars']

                for binding in self.raw_data['results']['bindings']:
                    rdf_terms = table_schema.order_terms_in_binding(
                        result_vars, binding)

                    values = []
                    for rdf_term in rdf_terms:
                        if rdf_term is not None:
                            values.append(rdf_term['value'])
                        else:
                            values.append(None)

                    table_row = schema_obj.cast_row(values)
                    yield OrderedDict(zip(field_names, table_row))
            elif 'boolean' in self.raw_data:
                # Results of an ASK query
                yield {'boolean': self.raw_data['boolean']}
Example no. 14
def validate_data(datapackage):
    # Start timer
    start = datetime.datetime.now()

    tables = []
    for resource in datapackage.resources:
        is_tabular = (
            resource.descriptor.get('format', None) == 'csv'
            or resource.descriptor.get('mediatype', None) == 'text/csv'
            or resource.local_data_path.endswith('csv'))

        if is_tabular:
            path = resource.remote_data_path or resource.local_data_path
            tables.append({
                'source': path,
                'stream': Stream(path, headers=1),
                'schema': Schema(resource.descriptor['schema']),
                'extra': {}
            })
    inspector = Inspector()

    reports = []
    errors = []
    for table in tables:
        report = inspector._Inspector__inspect_table(table)
        errors.extend(report['errors'])
        reports.append(report)

    # Stop timer
    stop = datetime.datetime.now()
    errors = errors[:1000]
    report = {
        'time': round((stop - start).total_seconds(), 3),
        'valid': True if len(reports) == 0 else all(report['valid']
                                                    for report in reports),
        'table-count': len(tables),
        'error-count': sum(len(report['errors']) for report in reports),
        'errors': errors,
        'tables': reports,
    }
    return report
Example no. 15
    def iter(self, bucket):

        # Check existence
        if bucket not in self.buckets:
            raise RuntimeError('Bucket "%s" doesn\'t exist.' % bucket)

        # Prepare
        descriptor = self.describe(bucket)
        schema = Schema(descriptor)

        # Yield rows
        for pk, row in self.__dataframes[bucket].iterrows():
            rdata = []
            for field in schema.fields:
                if schema.primary_key and schema.primary_key[0] == field.name:
                    rdata.append(field.cast_value(pk))
                else:
                    value = row[field.name]
                    rdata.append(field.cast_value(value))
            yield rdata
Example no. 16
def assert_conforms_to_schema(schema, doc):
    assert isinstance(doc, dict), "invalid doc: {}".format(doc)
    row = [doc[field["name"]] for field in schema["fields"]]
    try:
        Schema(schema).cast_row(row)
    except Exception as e:
        logging.exception(e)
        raise Exception(
            "row does not conform to schema\nrow='{}'\nschema='{}'".format(
                json.dumps(row), json.dumps(schema)))
    schema_model = SchemaModel(schema)
    res = {}
    for k, v in doc.items():
        try:
            res[k] = schema_model.cast(k, v)
        except Exception as e:
            logging.exception(e)
            raise Exception("doc attribute '{}' with value '{}' "
                            "does not conform to schema '{}'".format(
                                *map(json.dumps, [k, v, schema])))
    return res
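A hypothetical call to the helper above; the schema and doc are made up for illustration:

schema = {'fields': [{'name': 'id', 'type': 'integer'},
                     {'name': 'name', 'type': 'string'}]}
doc = {'id': 1, 'name': 'Alice'}
cast_doc = assert_conforms_to_schema(schema, doc)  # returns the cast document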
Example no. 17
    def opener():
        _params = dict(headers=1)
        _params.update(
            dict(x for x in __resource.items()
                 if x[0] not in {'path', 'name', 'schema',
                                 'mediatype', 'skip_rows'}))
        skip_rows = __resource.get('skip_rows', 0)
        _stream = tabulator.Stream(__url, **_params,
                                   post_parse=[row_skipper(skip_rows)])
        try:
            _stream.open()
            _headers = dedupe(_stream.headers)
            _schema = __resource.get('schema')
            if _schema is not None:
                _schema = Schema(_schema)
            return _schema, _headers, _stream, _stream.close
        except tabulator.exceptions.TabulatorException as e:
            logging.warning("Error while opening resource from url %s: %r",
                            _url, e)
            if not _ignore_missing:
                raise
            return {}, [], [], lambda: None
Example no. 18
def table(source, schema=None, **options):
    errors = []
    tables = []

    # Prepare schema
    if schema is not None:
        descriptor = schema
        try:
            # https://github.com/frictionlessdata/jsontableschema-py/issues/113
            from jsontableschema.helpers import load_json_source
            loaded_descriptor = load_json_source(schema)
            validate(loaded_descriptor, no_fail_fast=True)
            schema = Schema(loaded_descriptor)
        except jsontableschema.exceptions.MultipleInvalid as exception:
            for error in exception.errors:
                # The error message should contain the schema source (often its path)
                message = spec['errors']['jsontableschema-error']['message']
                message = message.format(
                    error_message='{problem} [{source}]'.format(
                        problem=str(error).splitlines()[0],
                        source=str(descriptor)))
                errors.append({
                    'code': 'jsontableschema-error',
                    'message': message,
                    'row-number': None,
                    'column-number': None,
                })

    # Add table
    if not errors:
        options.setdefault('headers', 1)
        tables.append({
            'source': str(source),
            'stream': Stream(source, **options),
            'schema': schema,
            'extra': {},
        })

    return errors, tables
Example no. 19
    def fetch_from_datapackage(self, **kwargs):
        schema = Schema(self.resource.descriptor["schema"])
        path = self.get_path("{}.csv".format(self.get_path()))
        with open(path) as f:
            csv_reader = csv.reader(f)
            next(csv_reader)  # skip header line
            for row in csv_reader:
                cast_row = OrderedDict()
                for i, val in enumerate(row):
                    field = schema.fields[i]
                    if field.type == "string":
                        val = val.decode("utf-8")
                    elif field.type == "datetime" and val != "":
                        val = "{}Z".format(val)
                    try:
                        val = field.cast_value(val)
                    except Exception as e:
                        raise Exception(
                            "Failed to cast value for field '{}' ({}) with value '{}': {}"
                            .format(field.name, field.type, val, e.message))
                    cast_row[field.name] = val
                yield cast_row
Example no. 20
def extra_header(errors, columns, sample, infer_fields=False):
    for column in copy(columns):
        if 'field' not in column:
            # Infer field
            if infer_fields:
                column_sample = []
                for row in sample:
                    value = None
                    if len(row) > column['number']:
                        value = row[column['number']]
                    column_sample.append(value)
                descriptor = infer([column['header']], column_sample)
                column['field'] = Schema(descriptor).fields[0]
            # Add error/remove column
            else:
                message = spec['errors']['extra-header']['message']
                message = message.format(column_number=column['number'])
                errors.append({
                    'code': 'extra-header',
                    'message': message,
                    'row-number': None,
                    'column-number': column['number'],
                })
                columns.remove(column)
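When infer_fields is set, the missing field is derived from the header and the sampled column values with jsontableschema's infer(), roughly as sketched here (assuming infer and Schema are importable as in the module above):

descriptor = infer(['extra'], [['1'], ['2'], ['3']])
field = Schema(descriptor).fields[0]
print(field.name, field.type)  # 'extra', with an inferred type such as 'integer'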
def test_primary_key():
    assert Schema(DESCRIPTOR_MIN).primary_key == []
    assert Schema(DESCRIPTOR_MAX).primary_key == ['id']
def sync_rows(descriptor, rows):
    result = []
    schema = Schema(descriptor)
    for row in rows:
        result.append(schema.cast_row(row))
    return result
Example no. 23
def pull_datapackage(descriptor, name, backend, **backend_options):
    """Pull Data Package from storage.

    All parameters should be used as keyword arguments.

    Args:
        descriptor (str): path where the descriptor should be stored
        name (str): name of the pulled datapackage
        backend (str): backend name like `sql` or `bigquery`
        backend_options (dict): backend options mentioned in backend docs

    """

    # Save datapackage name
    datapackage_name = name

    # Get storage
    plugin = import_module('jsontableschema.plugins.%s' % backend)
    storage = plugin.Storage(**backend_options)

    # Iterate over tables
    resources = []
    for table in storage.buckets:

        # Prepare
        schema = storage.describe(table)
        base = os.path.dirname(descriptor)
        path, name = mappers.restore_path(table)
        fullpath = os.path.join(base, path)

        # Write data
        helpers.ensure_dir(fullpath)
        with io.open(fullpath, 'wb') as file:
            model = Schema(deepcopy(schema))
            data = storage.iter(table)
            writer = csv.writer(file, encoding='utf-8')
            writer.writerow(model.headers)
            for row in data:
                writer.writerow(row)

        # Add resource
        resource = {'schema': schema, 'path': path}
        if name is not None:
            resource['name'] = name
        resources.append(resource)

    # Write descriptor
    mode = 'w'
    encoding = 'utf-8'
    if six.PY2:
        mode = 'wb'
        encoding = None
    resources = mappers.restore_resources(resources)
    helpers.ensure_dir(descriptor)
    with io.open(descriptor,
                 mode=mode,
                 encoding=encoding) as file:
        descriptor = {
            'name': datapackage_name,
            'resources': resources,
        }
        json.dump(descriptor, file, indent=4)
    return storage
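A hypothetical invocation of pull_datapackage; the backend options are illustrative and depend on the chosen plugin (for the sql backend they include an SQLAlchemy engine):

from sqlalchemy import create_engine

engine = create_engine('sqlite:///data.db')
pull_datapackage(
    descriptor='datapackage/datapackage.json',
    name='my-dataset',
    backend='sql',
    engine=engine)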
def test_fields():
    expect = ['id', 'height']
    actual = [field.name for field in Schema(DESCRIPTOR_MIN).fields]
    assert expect == actual
def test_cast_row_null_values():
    schema = Schema(DESCRIPTOR_MAX)
    source = ['string', '', '-', 'string', 'null']
    target = ['string', None, None, 'string', None]
    assert schema.cast_row(source) == target
def test_get_field():
    schema = Schema(DESCRIPTOR_MIN)
    assert schema.get_field('id').name == 'id'
    assert schema.get_field('height').name == 'height'
    assert schema.get_field('undefined') is None
def test_cast_row_too_short():
    schema = Schema(DESCRIPTOR_MAX)
    source = ['string', '10.0', '1', 'string']
    with pytest.raises(exceptions.InvalidCastError):
        schema.cast_row(source)
def test_has_field():
    schema = Schema(DESCRIPTOR_MIN)
    assert schema.has_field('id')
    assert schema.has_field('height')
    assert not schema.has_field('undefined')
def test_cast_row():
    schema = Schema(DESCRIPTOR_MAX)
    source = ['string', '10.0', '1', 'string', 'string']
    target = ['string', Decimal(10.0), 1, 'string', 'string']
    assert schema.cast_row(source) == target
def test_cast_row_wrong_type_multiple_errors():
    schema = Schema(DESCRIPTOR_MAX)
    source = ['string', 'notdecimal', '10.6', 'string', 'string']
    with pytest.raises(exceptions.MultipleInvalid) as excinfo:
        schema.cast_row(source, no_fail_fast=True)
    assert len(excinfo.value.errors) == 2
def test_foreign_keys():
    assert Schema(DESCRIPTOR_MIN).foreign_keys == []
    assert Schema(DESCRIPTOR_MAX).foreign_keys == DESCRIPTOR_MAX['foreignKeys']
def test_headers():
    assert Schema(DESCRIPTOR_MIN).headers == ['id', 'height']
def test_save(tmpdir):
    path = str(tmpdir.join('schema.json'))
    Schema(DESCRIPTOR_MIN).save(path)
    assert DESCRIPTOR_MIN == json.load(io.open(path, encoding='utf-8'))