def test_check_type_or_format_error(log):
    """A castable value produces no error and the cell value is replaced by the cast result."""
    integer_field = Field({'name': 'name', 'type': 'integer'})
    cell = {
        'number': 1,
        'header': 'name1',
        'value': '1',
        'field': integer_field,
    }
    errors, cells = [], [cell]

    type_or_format_error(errors, cells, 1)

    assert log(errors) == []
    assert len(cells) == 1
    # The string '1' is expected to have been cast in place to the integer 1.
    assert cells[0]['value'] == 1
def _process_field(field: Field, rules, ret, prefix): schema_type = field['type'] if schema_type == 'array': field = copy(field) field['type'] = field['es:itemType'] return _process_field(field, rules, ret, prefix) enabled = field.get('es:index', True) subschema = {'fields': []} if enabled and schema_type == 'object': subschema = field['es:schema'] _process_schema(subschema, rules, ret, prefix + field['name'] + '.') elif schema_type == 'string': if field['name'] not in ('doc_id', ): search_field = prefix + field['name'] for suffix in rules.get( ('es:title' in field, 'es:keyword' in field), ['']): ret.append(search_field + suffix)
def test_check_extra_header_infer_with_empty_data(log):
    """With ``infer_fields=True`` an extra header over empty sample data gets a 'string' field."""
    known_cell = goodtables.cells.create_cell(
        'name1', field=Field({'name': 'name1'}), column_number=1)
    extra_cell = goodtables.cells.create_cell('name2', column_number=2)
    cells = [known_cell, extra_cell]
    # The second column is empty in every sample row.
    sample = [[first, ''] for first in ('123', '456', '789')]

    checker = ExtraHeader(infer_fields=True)
    errors = checker.check_headers(cells, sample=sample)

    assert log(errors) == []
    assert len(cells) == 2
    inferred = cells[1]['field']
    assert inferred.name == 'name2'
    assert inferred.type == 'string'
def test_check_type_or_format_error_problem(log):
    """A value that cannot be cast is reported and its cell is removed from ``cells``."""
    integer_field = Field({'name': 'name', 'type': 'integer'})
    cell = {
        'number': 1,
        'header': 'name1',
        'value': 'value1',
        'field': integer_field,
    }
    errors, cells = [], [cell]

    type_or_format_error(errors, cells, 1)

    expected_log = [(1, 1, 'type-or-format-error')]
    assert log(errors) == expected_log
    assert len(cells) == 0
def test_check_extra_header_infer_with_empty_data(log):
    """With ``infer_fields=True`` an extra header over empty sample data gets a 'string' field."""
    known_cell = {'number': 1, 'header': 'name1', 'field': Field({'name': 'name1'})}
    extra_cell = {'number': 2, 'header': 'name2'}
    errors, cells = [], [known_cell, extra_cell]
    # The second column is empty in every sample row.
    sample = [[first, ''] for first in ('123', '456', '789')]

    checker = ExtraHeader(infer_fields=True)
    checker.check_headers(errors, cells, sample=sample)

    assert log(errors) == []
    assert len(cells) == 2
    inferred = cells[1]['field']
    assert inferred.name == 'name2'
    assert inferred.type == 'string'
def test_test_value_required():
    """Check the ``required`` constraint against custom ``missing_values``.

    Only the configured missing values ('', 'NA', 'N/A') and ``None`` are
    expected to fail; look-alike strings such as 'null' or 'nan' must pass.
    """
    descriptor = {
        'name': 'name',
        'type': 'string',
        'constraints': {'required': True},
    }
    field = Field(descriptor, missing_values=['', 'NA', 'N/A'])
    test = partial(field.test_value, constraints=['required'])

    expectations = [
        ('test', True),
        ('null', True),
        ('none', True),
        ('nil', True),
        ('nan', True),
        ('NA', False),
        ('N/A', False),
        ('-', True),
        ('', False),
        (None, False),
    ]
    for value, expected in expectations:
        # Identity comparison: a merely truthy/falsy result must not pass.
        assert test(value) is expected
def test_test_value_constraints_false():
    """An empty value is accepted by ``test_value`` when ``constraints=False``."""
    field = Field(DESCRIPTOR_MIN)
    # Identity comparison: a merely truthy result must not pass.
    assert field.test_value('', constraints=False) is True
def test_test_value():
    """``test_value`` on DESCRIPTOR_MAX accepts '1' and rejects 'string' and ''."""
    # Identity comparisons: merely truthy/falsy results must not pass.
    assert Field(DESCRIPTOR_MAX).test_value('1') is True
    assert Field(DESCRIPTOR_MAX).test_value('string') is False
    assert Field(DESCRIPTOR_MAX).test_value('') is False
def test_cast_value_null_with_missing_values():
    """A value listed in ``missing_values`` is cast to ``None``."""
    field = Field({'name': 'name', 'type': 'number'}, missing_values=['null'])
    # Compare to the None singleton by identity, not equality.
    assert field.cast_value('null') is None
def test_cast_value_constraint_error():
    """Casting '' with DESCRIPTOR_MAX raises ``exceptions.CastError``."""
    expected_error = exceptions.CastError
    with pytest.raises(expected_error):
        Field(DESCRIPTOR_MAX).cast_value('')
def test_format():
    """Both reference descriptors report the 'default' format."""
    minimal = Field(DESCRIPTOR_MIN)
    assert minimal.format == 'default'
    maximal = Field(DESCRIPTOR_MAX)
    assert maximal.format == 'default'
def test_type():
    """DESCRIPTOR_MIN yields type 'string'; DESCRIPTOR_MAX yields 'integer'."""
    minimal = Field(DESCRIPTOR_MIN)
    assert minimal.type == 'string'
    maximal = Field(DESCRIPTOR_MAX)
    assert maximal.type == 'integer'
def test_name():
    """The field built from DESCRIPTOR_MIN is named 'id'."""
    field = Field(DESCRIPTOR_MIN)
    assert field.name == 'id'
def test_descriptor(apply_defaults):
    """``Field.descriptor`` equals the input descriptor after ``apply_defaults``."""
    actual = Field(DESCRIPTOR_MIN).descriptor
    assert actual == apply_defaults(DESCRIPTOR_MIN)
def caster(v):
    """Cast ``v`` with a ``Field`` built from ``__field``.

    NOTE(review): ``__field`` is not defined in this block; presumably it is
    bound in an enclosing scope -- confirm against the surrounding code.
    """
    return Field(__field).cast_value(v)
def test_missing_values():
    """``missing_values`` defaults to [''] and can be overridden at construction."""
    default_field = Field(DESCRIPTOR_MIN)
    assert default_field.missing_values == ['']
    custom_field = Field(DESCRIPTOR_MIN, missing_values=['-'])
    assert custom_field.missing_values == ['-']
def test_constraints():
    """DESCRIPTOR_MIN has no constraints; DESCRIPTOR_MAX has ``required: True``."""
    minimal = Field(DESCRIPTOR_MIN)
    assert minimal.constraints == {}
    maximal = Field(DESCRIPTOR_MAX)
    assert maximal.constraints == {'required': True}
def test_required():
    """``required`` is False for DESCRIPTOR_MIN and True for DESCRIPTOR_MAX."""
    # Identity comparisons: merely truthy/falsy values must not pass.
    assert Field(DESCRIPTOR_MIN).required is False
    assert Field(DESCRIPTOR_MAX).required is True
def test_cast_value():
    """Casting '1' with DESCRIPTOR_MAX gives the integer 1."""
    field = Field(DESCRIPTOR_MAX)
    assert field.cast_value('1') == 1
class SchemaField:
    """
    Utility class for a field in a schema.
    Uses a tableschema.Field
    (https://github.com/frictionlessdata/tableschema-py/blob/master/tableschema/field.py)
    for help. It doesn't extend that class but composes with it, mostly for
    the use of the cast_value method.
    """
    DATETIME_TYPES = ['date', 'datetime']
    # Default accepted spellings for booleans (used when the descriptor does
    # not declare its own 'trueValues' / 'falseValues').
    TRUE_VALUES = ['TRUE', 'true', 'True', 'YES', 'yes', 'y', 'Y', 'Yes']
    FALSE_VALUES = ['FALSE', 'false', 'False', 'NO', 'no', 'n', 'N', 'No']

    def __init__(self, descriptor):
        """
        :param descriptor: the field descriptor (dict). It is not modified.
        :raises FieldSchemaError: if the descriptor has no (or an empty) name.
        """
        self.descriptor = self.__curate_descriptor(descriptor)
        self.name = self.descriptor.get('name')
        # We want to throw an exception if there is no name
        if not self.name:
            raise FieldSchemaError("A field without a name: {}".format(
                json.dumps(descriptor)))
        # the tableschema field.
        self.tableschema_field = TableField(self.descriptor)
        # biosys specific
        self.biosys = BiosysSchema(
            self.descriptor.get(BiosysSchema.BIOSYS_KEY_NAME))
        self.constraints = SchemaConstraints(
            self.descriptor.get('constraints', {}))

    # implement some dict like methods
    def __getitem__(self, item):
        return self.descriptor.__getitem__(item)

    def get(self, k, d=None):
        return self.descriptor.get(k, d)

    @property
    def title(self):
        return self.descriptor.get('title')

    @property
    def type(self):
        return self.descriptor.get('type')

    @property
    def column_name(self):
        return self.name

    @property
    def required(self):
        return self.constraints.required

    @property
    def aliases(self):
        return self.descriptor.get('aliases', [])

    @property
    def is_datetime_types(self):
        return self.type in self.DATETIME_TYPES

    @property
    def is_date_type(self):
        return self.type == 'date'

    @property
    def is_numeric(self):
        return self.type in ['number', 'integer']

    @property
    def format(self):
        # 'format' is optional in a descriptor; fall back to 'default' rather
        # than raising KeyError (which would break cast() for date/datetime
        # fields declared without a format).
        return self.descriptor.get('format', 'default')

    def has_alias(self, name, icase=False):
        """
        Test if one of the field aliases is name.
        :param name:
        :param icase: if True the comparison ignores case
        :return: True or False
        """
        for alias in self.aliases:
            if (alias == name) or (icase and alias.lower() == name.lower()):
                return True
        return False

    def has_name_or_alias(self, name, alias, icase=False):
        """
        Test if the field has the name 'name' or an alias 'alias'.
        :param name:
        :param alias:
        :param icase: if True the comparisons ignore case
        :return: True or False
        """
        has_name = (self.name == name) or (
            icase and self.name.lower() == name.lower())
        return has_name or self.has_alias(alias, icase=icase)

    def cast(self, value):
        """
        Returns a native Python object of the expected format. Will throw an
        exception if the value doesn't comply with any constraints.
        This method delegates most of the cast to the
        tableschema.Field.cast_value. Except for
        - date and dateTime with format='any'.
          This is because the tableschema.Field.cast_value interprets an
          ambiguous day/month/year date as month/day/year (american way).
        :param value:
        :return: the casted value
        """
        # we want to strip strings
        if isinstance(value, six.string_types):
            value = value.strip()
            # TODO: remove that when running in Python3
            if not isinstance(value, six.text_type):
                # ensure only unicode values
                value = six.u(value).strip()
        # date or datetime with format='any'
        if self.is_datetime_types and self.format == 'any' and value:
            if self.is_date_type:
                return cast_date_any_format(value)
            return cast_datetime_any_format(value)
        # delegates to tableschema.Field.cast_value
        return self.tableschema_field.cast_value(value, constraints=True)

    def validation_error(self, value):
        """
        Return an error message if the value is not valid according to the
        schema. It relies on the exception thrown by the 'cast' method.
        :param value:
        :return: None if value is valid or an error message string
        """
        error = None
        # override the integer validation. The default message is a bit
        # cryptic if there's an error casting a string like '1.2' into an int.
        if self.type == 'integer':
            if not is_blank_value(value):
                not_integer = False
                try:
                    casted = self.cast(value)
                    # there's also the case where a float 1.2 is successfully
                    # casted into 1 (ex: int(1.2) = 1)
                    if str(casted) != str(value):
                        not_integer = True
                except Exception:
                    not_integer = True
                if not_integer:
                    return 'The field "{}" must be a whole number.'.format(
                        self.name)
        try:
            self.cast(value)
        except Exception as e:
            error = "{}".format(e)
            # Override the default enum exception message to include all
            # possible values.
            # NOTE(review): str.find returns -1 (truthy) when the text is
            # absent, so this test is true for every message that does not
            # *start* with 'enum array'. Kept unchanged because tightening it
            # could alter the messages users see -- confirm the intended
            # condition against the installed tableschema error text.
            if error.find('enum array') and self.constraints.enum:
                values = [str(v) for v in self.constraints.enum]
                error = "The value must be one the following: {}".format(
                    values)
        return error

    def __curate_descriptor(self, descriptor):
        """
        Return a curated copy of the descriptor (the input is left untouched):
        - Change default values for boolean (adding 'yes' and 'no').
          Since TableSchema V1.0 the default true values are
          [ "true", "True", "TRUE", "1" ].
          We want to be sure that 'yes' and 'no' (and variations) are included
          by default.
          The schema specifications allows to override the true and false
          values with 'trueValues' and 'falseValues'
          (see https://frictionlessdata.io/specs/table-schema/)
        """
        # Shallow copy so the caller's descriptor is not modified.
        curated = dict(descriptor)
        if curated.get('type') == 'boolean':
            # Copy the class-level defaults so they cannot be altered through
            # a field descriptor.
            if 'trueValues' not in curated:
                curated['trueValues'] = list(self.TRUE_VALUES)
            if 'falseValues' not in curated:
                curated['falseValues'] = list(self.FALSE_VALUES)
        return curated

    def __str__(self):
        return '{}'.format(self.name)