def create(self, bucket, descriptor, force=False):
    """https://github.com/frictionlessdata/tableschema-pandas-py#storage
    """
    # Make lists: accept a single bucket/descriptor or parallel lists
    buckets = bucket
    if isinstance(bucket, six.string_types):
        buckets = [bucket]
    descriptors = descriptor
    if isinstance(descriptor, dict):
        descriptors = [descriptor]

    # BUG FIX: zip() below silently truncated mismatched lists;
    # check dimensions explicitly (as the SQL storage backends do)
    if len(buckets) != len(descriptors):
        raise tableschema.exceptions.StorageError('Wrong argument dimensions')

    # Check buckets for existence; only overwrite when forced
    for bucket in buckets:
        if bucket in self.buckets:
            if not force:
                message = 'Bucket "%s" already exists' % bucket
                raise tableschema.exceptions.StorageError(message)
            self.delete(bucket)

    # Define dataframes: validate each descriptor, then register an
    # empty DataFrame per bucket
    for bucket, descriptor in zip(buckets, descriptors):
        tableschema.validate(descriptor)
        self.__descriptors[bucket] = descriptor
        self.__dataframes[bucket] = pd.DataFrame()
def create(self, bucket, descriptor, force=False):
    """Create one or more buckets as CKAN datastore resources."""
    # Accept scalar or list arguments uniformly
    buckets = [bucket] if isinstance(bucket, six.string_types) else bucket
    descriptors = [descriptor] if isinstance(descriptor, dict) else descriptor

    # Refuse to overwrite existing buckets unless force=True
    for existing in reversed(self.buckets):
        if existing in buckets:
            if not force:
                raise tableschema.exceptions.StorageError(
                    'Bucket "%s" already exists.' % existing)
            self.delete(existing)

    # Validate each descriptor and create its datastore resource
    for bucket, descriptor in zip(buckets, descriptors):
        tableschema.validate(descriptor)
        self.__descriptors[bucket] = descriptor
        payload = self.__mapper.descriptor_to_datastore_dict(descriptor, bucket)
        endpoint = "{}/datastore_create".format(self.__base_endpoint)
        self._make_ckan_request(endpoint, method='POST', json=payload)

    # Invalidate cache so the bucket list is refetched on next access
    self.__bucket_cache = None
def validate(schema):
    """Validate that a supposed schema is in fact a Table Schema.

    Echoes True when the schema is valid; echoes False followed by the
    validation errors otherwise.
    """
    try:
        tableschema.validate(schema)
        # BUG FIX: output was inverted — False was echoed on success
        # and True on failure
        click.echo(True)
    except tableschema.exceptions.ValidationError as exception:
        click.echo(False)
        click.echo(exception.errors)
def test_schema_multiple_errors_no_fail_fast_true(self):
    """All three errors are collected when no_fail_fast is enabled."""
    filepath = os.path.join(self.data_dir, 'schema_invalid_multiple_errors.json')
    with io.open(filepath) as stream:
        schema = json.load(stream)
    try:
        tableschema.validate(schema, no_fail_fast=True)
    except exceptions.MultipleInvalid as exception:
        # assertEqual: assertEquals is a deprecated alias
        self.assertEqual(3, len(exception.errors))
    else:
        # BUG FIX: the test used to pass silently when nothing raised
        self.fail('MultipleInvalid was not raised')
def validate(schema):
    """Validate that a supposed schema is in fact a Table Schema."""
    try:
        tableschema.validate(schema)
    except tableschema.exceptions.ValidationError as exception:
        # Report failure and each collected error, then exit non-zero
        click.echo("Schema is not valid")
        click.echo(exception.errors)
        sys.exit(1)
    click.echo("Schema is valid")
    sys.exit(0)
def check_schema(self, filename):
    """Raise InvalidSchemaException if *filename* is not a valid Table Schema."""
    try:
        tableschema.validate(self.filepath(filename))
    except tableschema.exceptions.ValidationError as exc:
        # Use distinct names: the original comprehension shadowed the
        # exception variable `e`, which is confusing and would rebind
        # it under Python 2 list-comprehension scoping
        errors = "; ".join(repr(error) for error in exc.errors)
        message = "Schema %s is not a valid TableSchema schema. Errors: %s" % (
            filename,
            errors,
        )
        raise exceptions.InvalidSchemaException(self.repo, message)
def create(self, bucket, descriptor, force=False, indexes_fields=None):
    """Create bucket

    # Arguments
        indexes_fields (str[]):
            list of tuples containing field names, or list of such lists
    """
    # Make lists: accept scalars or parallel lists
    buckets = bucket
    if isinstance(bucket, six.string_types):
        buckets = [bucket]
    descriptors = descriptor
    if isinstance(descriptor, dict):
        descriptors = [descriptor]
    if indexes_fields is None or len(indexes_fields) == 0:
        indexes_fields = [()] * len(descriptors)
    elif type(indexes_fields[0][0]) not in {list, tuple}:
        # Single index spec supplied: wrap in a one-element list
        indexes_fields = [indexes_fields]

    # Check dimensions
    if not (len(buckets) == len(descriptors) == len(indexes_fields)):
        raise tableschema.exceptions.StorageError('Wrong argument dimensions')

    # Check buckets for existence; drop them only when forced
    for existing in reversed(self.buckets):
        if existing in buckets:
            if not force:
                message = 'Bucket "%s" already exists.' % existing
                raise tableschema.exceptions.StorageError(message)
            self.delete(existing)

    # Define buckets: register a Table per bucket on shared metadata
    for bucket, descriptor, index_fields in zip(buckets, descriptors, indexes_fields):
        tableschema.validate(descriptor)
        table_name = self.__mapper.convert_bucket(bucket)
        autoincrement = self.__get_autoincrement_for_bucket(bucket)
        columns, constraints, indexes, fallbacks, table_comment = self.__mapper \
            .convert_descriptor(bucket, descriptor, index_fields, autoincrement)
        Table(table_name, self.__metadata, *(columns + constraints + indexes),
              comment=table_comment)
        self.__descriptors[bucket] = descriptor
        self.__fallbacks[bucket] = fallbacks

    # Create tables, update metadata
    try:
        self.__metadata.create_all()
    except sqlalchemy.exc.ProgrammingError as exception:
        if 'there is no unique constraint matching given keys' in str(exception):
            message = 'Foreign keys can only reference primary key or unique fields\n%s'
            six.raise_from(
                tableschema.exceptions.ValidationError(message % str(exception)),
                None)
        # BUG FIX: any other ProgrammingError was silently swallowed;
        # re-raise so callers see the real database failure
        raise
def resource_schema_validator(value, context):
    # Validate (and normalize) a resource's Table Schema descriptor.
    # Accepts a JSON string, UTF-8 bytes, an http(s) URL, or an
    # already-parsed dict; returns the descriptor as a JSON string.
    # Raises Invalid on malformed JSON or a failing schema validation.
    if not value:
        return
    msg = None
    if isinstance(value, string_types):
        if value.lower().startswith('http'):
            # Remote schema: pass the URL through untouched
            return value
        try:
            descriptor = json.loads(str(value))
            if not isinstance(descriptor, dict):
                msg = u'Invalid Table Schema descriptor: {}'.format(value)
                raise Invalid(msg)
        except ValueError as e:
            msg = u'JSON error in Table Schema descriptor: {}'.format(e)
            raise Invalid(msg)
    elif isinstance(value, binary_type):
        try:
            # Decode UTF-8 bytes to Unicode, and convert single quotes
            # to double quotes to make it valid JSON
            decoded_value = value.decode('utf8').replace("'", '"')
            descriptor = json.loads(decoded_value)
            if not isinstance(descriptor, dict):
                msg = u'Invalid Table Schema descriptor: {}'.format(value)
                raise Invalid(msg)
        except ValueError as e:
            msg = u'JSON error in Table Schema descriptor: {}'.format(e)
            raise Invalid(msg)
    else:
        # Already a parsed object — presumably a dict; tableschema
        # validation below will reject anything else
        descriptor = value
    try:
        tableschema.validate(descriptor)
    except tableschema.exceptions.ValidationError as e:
        errors = []
        for error in e.errors:
            errors.append(str(error))
        msg = u'Invalid Table Schema: {}'.format(u', '.join(errors))
    # msg is only set when validation failed without raising above
    if msg:
        raise Invalid(msg)
    return json.dumps(descriptor)
def test_validate_error_message():
    """An unknown field type yields a descriptive validation error."""
    bad_descriptor = {
        'fields': [{'name': 'name', 'type': 'other'}],
    }
    with pytest.raises(exceptions.ValidationError) as excinfo:
        validate(bad_descriptor)
    first_error = str(excinfo.value.errors[0])
    # The message must point at both the descriptor and profile paths
    for fragment in (
        'Descriptor validation error',
        'at "fields/0" in descriptor',
        'at "properties/fields/items/anyOf" in profile',
    ):
        assert fragment in first_error
def _filter_row(self, row, **kwargs):
    # Upsert a single row, lazily creating the backing DB table from
    # the configured Table Schema the first time a row is seen.
    # Yields the upsert result when _upsert returns something truthy.
    # NOTE(review): `id` shadows the builtin; presumably the row's
    # primary-key value when present — confirm against callers.
    id = int(row.pop(self._id_field_name)) if self._id_field_name in row else None
    values = self._get_values(row)
    if self.db_table is None:
        # First row: validate the schema, then create the table
        tableschema.validate(self._table_schema)
        prefix, bucket = "", self.table_name
        index_fields = []
        autoincrement = None
        tablename = mappers.bucket_to_tablename(prefix, bucket)
        columns, constraints, indexes = mappers.descriptor_to_columns_and_constraints(prefix, bucket, self._table_schema, index_fields, autoincrement)
        self.db_table = Table(tablename, self.db_meta, *(columns + constraints + indexes))
        self.db_table.create()
        logging.info("Created DB table {}".format(tablename))
    res = self._upsert(id, values)
    if res:
        yield res
def create(self, bucket, descriptor, force=False, indexes_fields=None):
    """https://github.com/frictionlessdata/tableschema-sql-py#storage
    """
    # Accept scalars or parallel lists for bucket/descriptor
    buckets = [bucket] if isinstance(bucket, six.string_types) else bucket
    descriptors = [descriptor] if isinstance(descriptor, dict) else descriptor
    if not indexes_fields:
        indexes_fields = [()] * len(descriptors)
    elif type(indexes_fields[0][0]) not in {list, tuple}:
        # Single index spec supplied: wrap it
        indexes_fields = [indexes_fields]

    # All three argument lists must line up
    if len({len(buckets), len(descriptors), len(indexes_fields)}) != 1:
        raise tableschema.exceptions.StorageError(
            'Wrong argument dimensions')

    # Refuse to touch pre-existing buckets unless forced
    for existing in reversed(self.buckets):
        if existing in buckets:
            if not force:
                raise tableschema.exceptions.StorageError(
                    'Bucket "%s" already exists.' % existing)
            self.delete(existing)

    # Register one Table per bucket on the shared metadata
    for bucket, descriptor, index_fields in zip(buckets, descriptors, indexes_fields):
        tableschema.validate(descriptor)
        table_name = self.__mapper.convert_bucket(bucket)
        columns, constraints, indexes, fallbacks = self.__mapper.convert_descriptor(
            bucket, descriptor, index_fields, self.__autoincrement)
        Table(table_name, self.__metadata, *(columns + constraints + indexes))
        self.__descriptors[bucket] = descriptor
        self.__fallbacks[bucket] = fallbacks

    # Materialize all registered tables
    self.__metadata.create_all()
def _filter_row(self, row, **kwargs):
    """Upsert one row, creating the backing DB table on first use."""
    # Pop the primary-key value out of the row when present
    if self._id_field_name in row:
        id = int(row.pop(self._id_field_name))
    else:
        id = None
    values = self._get_values(row)

    if self.db_table is None:
        # First row seen: validate the schema and build the table
        tableschema.validate(self._table_schema)
        prefix = ""
        bucket = self.table_name
        tablename = mappers.bucket_to_tablename(prefix, bucket)
        # No index fields and no autoincrement column are requested
        columns, constraints, indexes = mappers.descriptor_to_columns_and_constraints(
            prefix, bucket, self._table_schema, [], None)
        self.db_table = Table(tablename, self.db_meta,
                              *(columns + constraints + indexes))
        self.db_table.create()
        logging.info("Created DB table {}".format(tablename))

    res = self._upsert(id, values)
    if res:
        yield res
def query(
    self,
    data_model,
    data_resource_name,
    restricted_fields,
    table_schema,
    request_obj,
):
    """Query the data resource.

    Args:
        data_model (object): SQLAlchemy ORM model.
        data_resource_name (str): Name of the data resource.
        restricted_fields (list): Fields that may not be queried.
        table_schema (dict): The Table Schema object to use for validation.
        request_obj (object): HTTP request carrying the JSON filter body.

    Return:
        dict, int: The response object and the HTTP status code.
    """
    try:
        request_obj = request_obj.json
    except Exception:
        raise ApiError("No request body found.", 400)

    errors = []
    _ = Schema(table_schema)
    accepted_fields = []
    response = OrderedDict()
    response["results"] = []

    # Guard clause instead of wrapping the whole body in if/else
    if not validate(table_schema):
        raise SchemaValidationFailure()

    # Only non-restricted schema fields may appear in the filter
    for field in table_schema["fields"]:
        if field["name"] not in restricted_fields:
            accepted_fields.append(field["name"])
    for field in request_obj.keys():
        if field not in accepted_fields:
            errors.append(
                "Unknown or restricted field '{}' found.".format(field))
    if len(errors) > 0:
        raise ApiUnhandledError("Invalid request body.", 400, errors)

    session = None
    try:
        session = Session()
        results = session.query(data_model).filter_by(**request_obj)
        for row in results:
            response["results"].append(
                self.build_json_from_object(row, restricted_fields))
        if len(response["results"]) == 0:
            return {"message": "No matches found"}, 404
        return response, 200
    except Exception:
        # BUG FIX: message previously said "Failed to create new
        # resource." — copy-pasted from the insert handler
        raise ApiUnhandledError("Failed to query the data resource.", 400)
    finally:
        # BUG FIX: session was unbound in finally when Session() raised
        if session is not None:
            session.close()
def test_primary_key_is_not_a_valid_type(self):
    """Both primary-key type errors are collected in fail-slow mode."""
    filepath = os.path.join(self.data_dir, 'schema_invalid_pk_is_wrong_type.json')
    with io.open(filepath) as stream:
        schema = json.load(stream)
    try:
        # list() instead of the original pointless comprehension
        list(tableschema.validate(schema, no_fail_fast=True))
    except exceptions.MultipleInvalid as error:
        # assertEqual: assertEquals is a deprecated alias
        self.assertEqual(2, len(error.errors))
    else:
        # BUG FIX: the test used to pass silently when nothing raised
        self.fail('MultipleInvalid was not raised')
def resource_schema_validator(value, context):
    '''
    Validate (and normalize) a resource's Table Schema descriptor.
    Accepts a JSON string, an http(s) URL, or an already-parsed dict;
    returns the descriptor serialized as a JSON string, or raises
    Invalid on malformed JSON / failed schema validation.
    NOTE(review): uses `basestring` and `error.message`, so this is
    Python 2 only — confirm before reusing under Python 3.
    '''
    if not value:
        return
    msg = None
    if isinstance(value, basestring):
        if value.lower().startswith('http'):
            # Remote schema: pass the URL through untouched
            return value
        try:
            descriptor = json.loads(str(value))
            if not isinstance(descriptor, dict):
                msg = u'Invalid Table Schema descriptor: {}'.format(value)
                raise Invalid(msg)
        except ValueError as e:
            msg = u'JSON error in Table Schema descriptor: {}'.format(e)
            raise Invalid(msg)
    else:
        # Already a parsed object — presumably a dict
        descriptor = value
    try:
        tableschema.validate(descriptor)
    except tableschema.exceptions.ValidationError as e:
        errors = []
        for error in e.errors:
            errors.append(error.message)
        msg = u'Invalid Table Schema: {}'.format(u', '.join(errors))
    # msg is only set when validation failed without raising above
    if msg:
        raise Invalid(msg)
    return json.dumps(descriptor)
def load_data_from_local_csv(csv_file=ASSET_DATA_FILE):
    """Yield keyed rows from a local CSV once its schema validates."""
    table = Table(csv_file, schema=SCHEMA_FILE)
    try:
        if validate(table.schema.descriptor):
            for keyed_row in table.iter(keyed=True):
                yield keyed_row
    except exceptions.ValidationError as exception:
        # Schema is invalid: report every error, yield nothing
        for error in exception.errors:
            print(error)
    except exceptions.CastError as exception:
        # Row-level cast failure: log skipped assets per error
        if not exception.errors:
            print(exception)
        for error in exception.errors:
            write_skipped_assets(error, [])
def update_one(
    self,
    id,
    data_model,
    data_resource_name,
    table_schema,
    restricted_fields,
    request_obj,
    mode="PATCH",
):
    """Update a single object from the data model based on it's primary key.

    Args:
        id (any): The primary key for the specific object.
        data_model (object): SQLAlchemy ORM model.
        data_resource_name (str): Name of the data resource.
        table_schema (dict): The Table Schema object to use for validation.
        restricted_fields (list): Fields that may not be updated.
        request_obj (object): HTTP request carrying the JSON body.
        mode (str): "PATCH" for partial update, "PUT" for full replace.

    Return:
        dict, int: The response object and the HTTP status code.
    """
    try:
        request_obj = request_obj.json
    except Exception:
        raise ApiError("No request body found.", 400)

    session = None
    try:
        primary_key = table_schema["primaryKey"]
        session = Session()
        data_obj = (session.query(data_model).filter(
            getattr(data_model, primary_key) == id).first())
        if data_obj is None:
            raise LookupError
    except Exception:
        # BUG FIX: close the session on every lookup-failure path
        # (it used to leak when the primaryKey access or the query
        # itself raised)
        if session is not None:
            session.close()
        raise ApiUnhandledError(f"Resource with id '{id}' not found.", 404)

    _ = Schema(table_schema)
    errors = []
    accepted_fields = []
    if validate(table_schema):
        for field in table_schema["fields"]:
            accepted_fields.append(field["name"])
        for field in request_obj.keys():
            if field not in accepted_fields:
                errors.append(f"Unknown field '{field}' found.")
            elif field in restricted_fields:
                errors.append(f"Cannot update restricted field '{field}'.")
    else:
        session.close()
        raise ApiError("Data schema validation error.", 400)
    if len(errors) > 0:
        session.close()
        raise ApiError("Invalid request body.", 400, errors)

    if mode == "PATCH":
        # Partial update: apply only the supplied fields
        for key, value in request_obj.items():
            setattr(data_obj, key, value)
        session.commit()
    elif mode == "PUT":
        # Full replace: every required field must be supplied
        # NOTE(review): assumes each field declares 'required'; a
        # schema omitting it would raise KeyError here — confirm
        for field in table_schema["fields"]:
            if field["required"] and field["name"] not in request_obj.keys():
                errors.append(
                    f"Required field '{field['name']}' is missing.")
        if len(errors) > 0:
            session.close()
            raise ApiError("Invalid request body.", 400, errors)
        for key, value in request_obj.items():
            setattr(data_obj, key, value)
        session.commit()
    session.close()
    # BUG FIX: reassembled the success message that was broken in two
    return {"message": f"Successfully updated resource '{id}'."}, 201
def test_schema_invalid_fk_no_reference():
    """A foreign key without a reference must fail validation."""
    # Removed unused local binding of the return value
    with pytest.raises(exceptions.ValidationError):
        validate('data/schema_invalid_fk_no_reference.json')
def _validate_schema(schema):
    """Validate *schema*, re-raising the first validation error, if any."""
    try:
        validate(schema)
    except exceptions.ValidationError as exception:
        # Equivalent to the original loop, which always raised on its
        # first iteration (and raised nothing for an empty error list)
        if exception.errors:
            raise exception.errors[0]
def insert_one(self, data_model, data_resource_name, table_schema, request_obj):
    """Insert a new object.

    Args:
        data_model (object): SQLAlchemy ORM model.
        data_resource_name (str): Name of the data resource.
        table_schema (dict): The Table Schema object to use for validation.
        request_obj (dict): HTTP request object.

    Return:
        dict, int: The response object and associated HTTP status code.
    """
    try:
        request_obj = request_obj.json
    except Exception:
        raise ApiError("No request body found.", 400)

    _ = Schema(table_schema)
    errors = []
    accepted_fields = []
    if not validate(table_schema):
        raise SchemaValidationFailure()

    # Check for required fields
    for field in table_schema["fields"]:
        accepted_fields.append(field["name"])
        if field["required"] and not field["name"] in request_obj.keys():
            errors.append(f"Required field '{field['name']}' is missing.")

    # Split incoming fields into direct columns and junction-table data
    valid_fields = []
    many_query = []
    for field in request_obj.keys():
        if field in accepted_fields:
            valid_fields.append(field)
        else:
            junc_table = JuncHolder.lookup_table(field, data_resource_name)
            if junc_table is not None:
                values = request_obj[field]
                if not isinstance(values, list):
                    values = [values]
                many_query.append([field, values, junc_table])
            else:
                errors.append(f"Unknown field '{field}' found.")

    if len(errors) > 0:
        raise ApiError("Invalid request body.", 400, errors)

    session = None
    try:
        session = Session()
        new_object = data_model()
        for field in valid_fields:
            setattr(new_object, field, request_obj[field])
        session.add(new_object)
        session.commit()
        id_value = getattr(new_object, table_schema["primaryKey"])

        # process the many_query
        for field, values, table in many_query:
            self.process_many_query(session, table, id_value, field,
                                    data_resource_name, values)

        return {
            "message": "Successfully added new resource.",
            "id": id_value
        }, 201
    except Exception:
        raise ApiUnhandledError("Failed to create new resource.", 400)
    finally:
        # BUG FIX: session was unbound here when Session() itself raised
        if session is not None:
            session.close()
def test_schema_invalid_fk_reference_array_number_mismatch():
    """A FK whose array lengths mismatch must fail validation."""
    # Removed unused local binding of the return value
    with pytest.raises(exceptions.ValidationError):
        validate('data/schema_invalid_fk_array_wrong_number.json')
def test_schema_invalid_pk_string():
    """An invalid string primary key must fail validation."""
    # Removed unused local binding of the return value
    with pytest.raises(exceptions.ValidationError):
        validate('data/schema_invalid_pk_string.json')
def test_primary_key_is_not_a_valid_type():
    """Both primary-key type errors are reported."""
    # Removed unused local binding of the return value
    with pytest.raises(exceptions.ValidationError) as excinfo:
        validate('data/schema_invalid_pk_is_wrong_type.json')
    assert len(excinfo.value.errors) == 2
def test_schema_valid_full():
    """A fully-specified valid schema validates cleanly."""
    assert validate('data/schema_valid_full.json')
def test_schema_invalid_wrong_type():
    """A non-dict descriptor (a list) must fail validation."""
    # Removed unused local binding of the return value
    with pytest.raises(exceptions.ValidationError):
        validate([])
def test_schema_invalid_fk_reference_is_a_string_fields_is_an_array():
    """A string FK reference paired with array fields must fail."""
    # Removed unused local binding of the return value
    with pytest.raises(exceptions.ValidationError):
        validate('data/schema_invalid_fk_array_string_ref.json')
def test_schema_invalid_fk_array():
    """An invalid array foreign key must fail validation."""
    # Removed unused local binding of the return value
    with pytest.raises(exceptions.ValidationError):
        validate('data/schema_invalid_fk_array.json')
def test_schema_valid_fk_array(self):
    """A schema with an array foreign key validates cleanly."""
    filepath = os.path.join(self.data_dir, 'schema_valid_fk_array.json')
    with io.open(filepath) as stream:
        descriptor = json.load(stream)
    self.assertTrue(tableschema.validate(descriptor))
def test_schema_valid_fk_array():
    """A schema with an array foreign key validates cleanly."""
    assert validate('data/schema_valid_fk_array.json')
"""Schema validator — use this to validate the schema before using it."""
import sys

from tableschema import exceptions, validate

try:
    # Validate the schema file; ValidationError means it is invalid
    validate('ugms_inbound_table_schema_swt_v0.01.json')
except exceptions.ValidationError as exception:
    for error in exception.errors:
        print(error)
    sys.exit(1)
print('OK')
sys.exit(0)
def test_schema_multiple_errors_no_fail_fast_true():
    """All five validation errors are collected and reported."""
    # Removed unused local binding of the return value
    with pytest.raises(exceptions.ValidationError) as excinfo:
        validate('data/schema_invalid_multiple_errors.json')
    assert len(excinfo.value.errors) == 5