def table(self):
    """Build and cache a table from query results"""
    if self._table is None:
        schema_obj = Schema(self._schema)
        table = []
        if 'results' in self.raw_data:
            field_names = [field.name for field in schema_obj.fields]
            result_vars = self.raw_data['head']['vars']
            for binding in self.raw_data['results']['bindings']:
                rdf_terms = table_schema.order_terms_in_binding(
                    result_vars, binding)
                values = []
                for rdf_term in rdf_terms:
                    if rdf_term is not None:
                        values.append(rdf_term['value'])
                    else:
                        values.append(None)
                table_row = schema_obj.cast_row(values)
                table.append(OrderedDict(zip(field_names, table_row)))
        elif 'boolean' in self.raw_data:
            # Results of an ASK query
            table = [{'boolean': self.raw_data['boolean']}]
        self._table = table
    return self._table
def iter(self, bucket):
    # Get response
    descriptor = self.describe(bucket)
    schema = Schema(descriptor)
    tablename = mappers.bucket_to_tablename(self.__prefix, bucket)
    response = self.__service.tabledata().list(
        projectId=self.__project,
        datasetId=self.__dataset,
        tableId=tablename).execute()

    # Yield rows
    for fields in response['rows']:
        row = []
        values = [field['v'] for field in fields['f']]
        for index, field in enumerate(schema.fields):
            value = values[index]
            # Here we fix bigquery "1.234234E9" like datetimes
            if field.type == 'date':
                value = datetime.datetime.utcfromtimestamp(
                    int(float(value)))
                fmt = '%Y-%m-%d'
                if field.format.startswith('fmt:'):
                    fmt = field.format.replace('fmt:', '')
                value = value.strftime(fmt)
            elif field.type == 'datetime':
                value = datetime.datetime.utcfromtimestamp(
                    int(float(value)))
                value = '%sZ' % value.isoformat()
            row.append(value)
        yield schema.cast_row(row)
def test_init():
    # Valid
    assert Schema(DESCRIPTOR_MIN)
    assert Schema(DESCRIPTOR_MAX)
    assert Schema('data/schema_valid_full.json')
    assert Schema('data/schema_valid_simple.json')
    # Invalid
    with pytest.raises(exceptions.SchemaValidationError) as exception:
        Schema('data/schema_invalid_multiple_errors.json')
def descriptor_and_rows_to_dataframe(descriptor, rows):
    # Prepare
    primary_key = None
    schema = Schema(descriptor)
    if len(schema.primary_key) == 1:
        primary_key = schema.primary_key[0]
    elif len(schema.primary_key) > 1:
        raise RuntimeError('Multi-column primary keys are not supported')

    # Get data/index
    data_rows = []
    index_rows = []
    jtstypes_map = {}
    for row in rows:
        values = []
        index = None
        for field, value in zip(schema.fields, row):
            try:
                value = field.cast_value(value)
            except InvalidObjectType:
                value = json.loads(value)
            if value is None and field.type in ('number', 'integer'):
                jtstypes_map[field.name] = 'number'
                value = np.NaN
            if field.name == primary_key:
                index = value
            else:
                values.append(value)
        data_rows.append(tuple(values))
        index_rows.append(index)

    # Get dtypes
    dtypes = []
    for field in schema.fields:
        if field.name != primary_key:
            field_name = field.name
            if six.PY2:
                field_name = field.name.encode('utf-8')
            dtype = jtstype_to_dtype(jtstypes_map.get(field.name, field.type))
            dtypes.append((field_name, dtype))

    # Create dataframe
    index = None
    columns = schema.headers
    array = np.array(data_rows, dtype=dtypes)
    if primary_key:
        index_field = schema.get_field(primary_key)
        index_dtype = jtstype_to_dtype(index_field.type)
        index_class = pd.Index
        if index_field.type in ['datetime', 'date']:
            index_class = pd.DatetimeIndex
        index = index_class(index_rows, name=primary_key, dtype=index_dtype)
        columns = filter(lambda column: column != primary_key, schema.headers)
    dataframe = pd.DataFrame(array, index=index, columns=columns)

    return dataframe
def sync_rows(descriptor, rows):
    result = []
    schema = Schema(descriptor)
    for row in rows:
        row = schema.cast_row(row)
        values = []
        for index, field in enumerate(descriptor['fields']):
            value = row[index]
            if field['type'] == 'date':
                value = datetime.datetime.fromordinal(value.toordinal())
            values.append(value)
        result.append(values)
    return result
def test_descriptor():
    # Dict
    assert Schema(DESCRIPTOR_MIN).descriptor == DESCRIPTOR_MIN
    assert Schema(DESCRIPTOR_MAX).descriptor == DESCRIPTOR_MAX
    # Path
    path = 'data/schema_valid_simple.json'
    expect = Schema(path).descriptor
    actual = json.load(io.open(path, encoding='utf-8'))
    assert expect == actual
    # Url
    url = BASE_URL % 'data/schema_valid_simple.json'
    expect = Schema(url).descriptor
    actual = requests.get(url).json()
    assert expect == actual
def datapackage(source, **options):
    errors = []
    tables = []

    # Prepare datapackage
    datapackage = DataPackage(source, **options)
    for exception in datapackage.iter_errors():
        # Error message should contain the datapackage source (often it's a path)
        message = spec['errors']['datapackage-error']['message']
        message = message.format(
            error_message='{problem} [{source}]'.format(
                problem=str(exception).splitlines()[0],
                source=str(source)))
        errors.append({
            'code': 'datapackage-error',
            'message': message,
            'row-number': None,
            'column-number': None,
        })

    # Add tables
    if not errors:
        for resource in datapackage.resources:
            path = resource.remote_data_path or resource.local_data_path
            tables.append({
                'source': path,
                'stream': Stream(path, headers=1),
                'schema': Schema(resource.descriptor['schema']),
                'extra': {
                    'datapackage': str(source),
                },
            })

    return errors, tables
def __write_rows_buffer(self, bucket, rows_buffer):
    # Process data to byte stream csv
    descriptor = self.describe(bucket)
    schema = Schema(descriptor)
    bytes = io.BufferedRandom(io.BytesIO())
    writer = unicodecsv.writer(bytes, encoding='utf-8')
    for values in rows_buffer:
        row = []
        values = schema.cast_row(values)
        for index, field in enumerate(schema.fields):
            value = values[index]
            # Here we convert date to datetime
            if field.type == 'date':
                value = datetime.datetime.fromordinal(value.toordinal())
                value = '%sZ' % value.isoformat()
            row.append(value)
        writer.writerow(row)
    bytes.seek(0)

    # Prepare job body
    tablename = mappers.bucket_to_tablename(self.__prefix, bucket)
    body = {
        'configuration': {
            'load': {
                'destinationTable': {
                    'projectId': self.__project,
                    'datasetId': self.__dataset,
                    'tableId': tablename
                },
                'sourceFormat': 'CSV',
            }
        }
    }

    # Prepare job media body
    mimetype = 'application/octet-stream'
    media_body = MediaIoBaseUpload(bytes, mimetype=mimetype)

    # Make request to Big Query
    response = self.__service.jobs().insert(
        projectId=self.__project,
        body=body,
        media_body=media_body).execute()
    self.__wait_response(response)
def _iter_rows(self):
    if self._schema is not None:
        # Not empty results
        schema_obj = Schema(self._schema)
        if 'results' in self.raw_data:
            field_names = [field.name for field in schema_obj.fields]
            result_vars = self.raw_data['head']['vars']
            for binding in self.raw_data['results']['bindings']:
                rdf_terms = table_schema.order_terms_in_binding(
                    result_vars, binding)
                values = []
                for rdf_term in rdf_terms:
                    if rdf_term is not None:
                        values.append(rdf_term['value'])
                    else:
                        values.append(None)
                table_row = schema_obj.cast_row(values)
                # When the column is a string field, the jsontableschema
                # library incorrectly maps several literal string values
                # ('null', 'none', '-', etc.) to the Python `None` value.
                # A deeper fix might be to reconsider using that library,
                # or to fix the issue upstream (it's probably not a good
                # idea to render a number of strings un-representable),
                # but this fixes the problem for our result sets: it zips
                # over each result row and checks whether a non-null value
                # in a string field was mapped to `None`, and if so
                # restores the non-null value before continuing.
                table_row = map(
                    lambda field, original, mapped: original
                    if (not mapped) and original and field.type == 'string'
                    else mapped,
                    schema_obj.fields, values, table_row)
                yield OrderedDict(zip(field_names, table_row))
        elif 'boolean' in self.raw_data:
            # Results of an ASK query
            yield {'boolean': self.raw_data['boolean']}
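# A minimal, self-contained sketch (not from the source) of the restoration
# step described in the comment above: values that the cast maps to `None`
# in a string field (e.g. the literal 'null') are recovered from the raw
# values. The `Field` namedtuple is a hypothetical stand-in for a schema
# field; only its `type` attribute is used here.
from collections import namedtuple

Field = namedtuple('Field', ['name', 'type'])

fields = [Field('label', 'string'), Field('count', 'integer')]
values = ['null', '7']   # raw values from the SPARQL binding
cast = [None, 7]         # what cast_row returned; the literal 'null' was lost

restored = list(map(
    lambda field, original, mapped: original
    if (not mapped) and original and field.type == 'string'
    else mapped,
    fields, values, cast))

assert restored == ['null', 7]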
def _iter_rows(self):
    if self._schema is not None:
        # Not empty results
        schema_obj = Schema(self._schema)
        if 'results' in self.raw_data:
            field_names = [field.name for field in schema_obj.fields]
            result_vars = self.raw_data['head']['vars']
            for binding in self.raw_data['results']['bindings']:
                rdf_terms = table_schema.order_terms_in_binding(
                    result_vars, binding)
                values = []
                for rdf_term in rdf_terms:
                    if rdf_term is not None:
                        values.append(rdf_term['value'])
                    else:
                        values.append(None)
                table_row = schema_obj.cast_row(values)
                yield OrderedDict(zip(field_names, table_row))
        elif 'boolean' in self.raw_data:
            # Results of an ASK query
            yield {'boolean': self.raw_data['boolean']}
def validate_data(datapackage):
    # Start timer
    start = datetime.datetime.now()

    tables = []
    for resource in datapackage.resources:
        is_tabular = resource.descriptor.get('format', None) == 'csv' \
            or resource.descriptor.get('mediatype', None) == 'text/csv' \
            or resource.local_data_path.endswith('csv')
        if is_tabular:
            path = resource.remote_data_path or resource.local_data_path
            tables.append({
                'source': path,
                'stream': Stream(path, headers=1),
                'schema': Schema(resource.descriptor['schema']),
                'extra': {}
            })

    inspector = Inspector()
    reports = []
    errors = []
    for table in tables:
        report = inspector._Inspector__inspect_table(table)
        errors.extend(report['errors'])
        reports.append(report)

    # Stop timer
    stop = datetime.datetime.now()

    errors = errors[:1000]
    report = {
        'time': round((stop - start).total_seconds(), 3),
        'valid': True if len(reports) == 0 else all(
            report['valid'] for report in reports),
        'table-count': len(tables),
        'error-count': sum(len(report['errors']) for report in reports),
        'errors': errors,
        'tables': reports,
    }
    return report
def iter(self, bucket):
    # Check existence
    if bucket not in self.buckets:
        raise RuntimeError('Bucket "%s" doesn\'t exist.' % bucket)

    # Prepare
    descriptor = self.describe(bucket)
    schema = Schema(descriptor)

    # Yield rows
    for pk, row in self.__dataframes[bucket].iterrows():
        rdata = []
        for field in schema.fields:
            if schema.primary_key and schema.primary_key[0] == field.name:
                rdata.append(field.cast_value(pk))
            else:
                value = row[field.name]
                rdata.append(field.cast_value(value))
        yield rdata
def assert_conforms_to_schema(schema, doc):
    assert isinstance(doc, dict), "invalid doc: {}".format(doc)
    row = [doc[field["name"]] for field in schema["fields"]]
    try:
        Schema(schema).cast_row(row)
    except Exception as e:
        logging.exception(e)
        raise Exception(
            "row does not conform to schema\nrow='{}'\nschema='{}'".format(
                json.dumps(row), json.dumps(schema)))
    schema_model = SchemaModel(schema)
    res = {}
    for k, v in doc.items():
        try:
            res[k] = schema_model.cast(k, v)
        except Exception as e:
            logging.exception(e)
            raise Exception("doc attribute '{}' with value '{}' "
                            "does not conform to schema '{}'".format(
                                *map(json.dumps, [k, v, schema])))
    return res
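# A hedged usage sketch for `assert_conforms_to_schema` above; the descriptor
# and document below are illustrative (a minimal Table Schema), not from the
# source.
schema = {'fields': [
    {'name': 'id', 'type': 'integer'},
    {'name': 'name', 'type': 'string'},
]}
doc = {'id': '1', 'name': 'foo'}

# Returns the doc with values cast to the schema types,
# e.g. {'id': 1, 'name': 'foo'}
cast_doc = assert_conforms_to_schema(schema, doc)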
def opener():
    _params = dict(headers=1)
    _params.update(
        dict(x for x in __resource.items()
             if x[0] not in {'path', 'name', 'schema',
                             'mediatype', 'skip_rows'}))
    skip_rows = __resource.get('skip_rows', 0)
    _stream = tabulator.Stream(__url, **_params,
                               post_parse=[row_skipper(skip_rows)])
    try:
        _stream.open()
        _headers = dedupe(_stream.headers)
        _schema = __resource.get('schema')
        if _schema is not None:
            _schema = Schema(_schema)
        return _schema, _headers, _stream, _stream.close
    except tabulator.exceptions.TabulatorException as e:
        logging.warning("Error while opening resource from url %s: %r",
                        _url, e)
        if not _ignore_missing:
            raise
        return {}, [], [], lambda: None
def table(source, schema=None, **options):
    errors = []
    tables = []

    # Prepare schema
    if schema is not None:
        descriptor = schema
        try:
            # https://github.com/frictionlessdata/jsontableschema-py/issues/113
            from jsontableschema.helpers import load_json_source
            loaded_descriptor = load_json_source(schema)
            validate(loaded_descriptor, no_fail_fast=True)
            schema = Schema(loaded_descriptor)
        except jsontableschema.exceptions.MultipleInvalid as exception:
            for error in exception.errors:
                # Error message should contain the schema source (often it's a path)
                message = spec['errors']['jsontableschema-error']['message']
                message = message.format(
                    error_message='{problem} [{source}]'.format(
                        problem=str(error).splitlines()[0],
                        source=str(descriptor)))
                errors.append({
                    'code': 'jsontableschema-error',
                    'message': message,
                    'row-number': None,
                    'column-number': None,
                })

    # Add table
    if not errors:
        options.setdefault('headers', 1)
        tables.append({
            'source': str(source),
            'stream': Stream(source, **options),
            'schema': schema,
            'extra': {},
        })

    return errors, tables
def fetch_from_datapackage(self, **kwargs):
    schema = Schema(self.resource.descriptor["schema"])
    path = self.get_path("{}.csv".format(self.get_path()))
    with open(path) as f:
        csv_reader = csv.reader(f)
        next(csv_reader)  # skip header line
        for row in csv_reader:
            cast_row = OrderedDict()
            for i, val in enumerate(row):
                field = schema.fields[i]
                if field.type == "string":
                    val = val.decode("utf-8")
                elif field.type == "datetime" and val != "":
                    val = "{}Z".format(val)
                try:
                    val = field.cast_value(val)
                except Exception as e:
                    raise Exception(
                        "Failed to cast value for field '{}' ({}) with value '{}': {}"
                        .format(field.name, field.type, val, e.message))
                cast_row[field.name] = val
            yield cast_row
def extra_header(errors, columns, sample, infer_fields=False):
    for column in copy(columns):
        if 'field' not in column:
            # Infer field
            if infer_fields:
                column_sample = []
                for row in sample:
                    value = None
                    if len(row) > column['number']:
                        value = row[column['number']]
                    column_sample.append(value)
                descriptor = infer([column['header']], column_sample)
                column['field'] = Schema(descriptor).fields[0]
            # Add error/remove column
            else:
                message = spec['errors']['extra-header']['message']
                message = message.format(column_number=column['number'])
                errors.append({
                    'code': 'extra-header',
                    'message': message,
                    'row-number': None,
                    'column-number': column['number'],
                })
                columns.remove(column)
def test_primary_key():
    assert Schema(DESCRIPTOR_MIN).primary_key == []
    assert Schema(DESCRIPTOR_MAX).primary_key == ['id']
def sync_rows(descriptor, rows):
    result = []
    schema = Schema(descriptor)
    for row in rows:
        result.append(schema.cast_row(row))
    return result
def pull_datapackage(descriptor, name, backend, **backend_options):
    """Pull Data Package from storage.

    All parameters should be used as keyword arguments.

    Args:
        descriptor (str): path where to store descriptor
        name (str): name of the pulled datapackage
        backend (str): backend name like `sql` or `bigquery`
        backend_options (dict): backend options mentioned in backend docs

    """
    # Save datapackage name
    datapackage_name = name

    # Get storage
    plugin = import_module('jsontableschema.plugins.%s' % backend)
    storage = plugin.Storage(**backend_options)

    # Iterate over tables
    resources = []
    for table in storage.buckets:

        # Prepare
        schema = storage.describe(table)
        base = os.path.dirname(descriptor)
        path, name = mappers.restore_path(table)
        fullpath = os.path.join(base, path)

        # Write data
        helpers.ensure_dir(fullpath)
        with io.open(fullpath, 'wb') as file:
            model = Schema(deepcopy(schema))
            data = storage.iter(table)
            writer = csv.writer(file, encoding='utf-8')
            writer.writerow(model.headers)
            for row in data:
                writer.writerow(row)

        # Add resource
        resource = {'schema': schema, 'path': path}
        if name is not None:
            resource['name'] = name
        resources.append(resource)

    # Write descriptor
    mode = 'w'
    encoding = 'utf-8'
    if six.PY2:
        mode = 'wb'
        encoding = None
    resources = mappers.restore_resources(resources)
    helpers.ensure_dir(descriptor)
    with io.open(descriptor, mode=mode, encoding=encoding) as file:
        descriptor = {
            'name': datapackage_name,
            'resources': resources,
        }
        json.dump(descriptor, file, indent=4)

    return storage
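# A hedged usage sketch for `pull_datapackage` above, assuming the `sql`
# backend plugin is installed; the connection string, paths, and datapackage
# name are illustrative, not from the source.
from sqlalchemy import create_engine

engine = create_engine('sqlite:///example.db')  # hypothetical database
storage = pull_datapackage(
    descriptor='datapackage/datapackage.json',  # where the descriptor is written
    name='example-datapackage',
    backend='sql',
    engine=engine)  # forwarded to the backend's Storage as a backend option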
def test_fields():
    expect = ['id', 'height']
    actual = [field.name for field in Schema(DESCRIPTOR_MIN).fields]
    assert expect == actual
def test_cast_row_null_values():
    schema = Schema(DESCRIPTOR_MAX)
    source = ['string', '', '-', 'string', 'null']
    target = ['string', None, None, 'string', None]
    assert schema.cast_row(source) == target
def test_get_field():
    schema = Schema(DESCRIPTOR_MIN)
    assert schema.get_field('id').name == 'id'
    assert schema.get_field('height').name == 'height'
    assert schema.get_field('undefined') is None
def test_cast_row_too_short():
    schema = Schema(DESCRIPTOR_MAX)
    source = ['string', '10.0', '1', 'string']
    with pytest.raises(exceptions.InvalidCastError):
        schema.cast_row(source)
def test_has_field():
    schema = Schema(DESCRIPTOR_MIN)
    assert schema.has_field('id')
    assert schema.has_field('height')
    assert not schema.has_field('undefined')
def test_cast_row():
    schema = Schema(DESCRIPTOR_MAX)
    source = ['string', '10.0', '1', 'string', 'string']
    target = ['string', Decimal(10.0), 1, 'string', 'string']
    assert schema.cast_row(source) == target
def test_cast_row_wrong_type_multiple_errors():
    schema = Schema(DESCRIPTOR_MAX)
    source = ['string', 'notdecimal', '10.6', 'string', 'string']
    with pytest.raises(exceptions.MultipleInvalid) as excinfo:
        schema.cast_row(source, no_fail_fast=True)
    assert len(excinfo.value.errors) == 2
def test_foreign_keys():
    assert Schema(DESCRIPTOR_MIN).foreign_keys == []
    assert Schema(DESCRIPTOR_MAX).foreign_keys == DESCRIPTOR_MAX['foreignKeys']
def test_headers():
    assert Schema(DESCRIPTOR_MIN).headers == ['id', 'height']
def test_save(tmpdir):
    path = str(tmpdir.join('schema.json'))
    Schema(DESCRIPTOR_MIN).save(path)
    assert DESCRIPTOR_MIN == json.load(io.open(path, encoding='utf-8'))