def add_data_format_row(self, row_data):
    """
    Apply the data format property described by ``row_data`` to
    :py:attr:`~cutplace.interface.Cid.data_format`.

    The first such row must declare the format itself, which creates the
    :py:class:`~cutplace.data.DataFormat`. Any later row is passed on to
    :py:meth:`~cutplace.data.DataFormat.set_property`.

    :param list row_data: a list with at least 2 items for name and value
      that can be passed to
      :py:meth:`cutplace.data.DataFormat.set_property()`.
    :raises cutplace.errors.InterfaceError: if the name is empty, the
      first row does not set the format, or the format is set again
    """
    assert row_data is not None
    assert len(row_data) >= 2

    property_name = row_data[0]
    property_value = row_data[1]
    normalized_name = property_name.lower()

    self._location.advance_cell()
    if property_name == '':
        raise errors.InterfaceError(
            'name of data format property must be specified', self._location)

    self._location.advance_cell()
    has_data_format = self._data_format is not None
    sets_format = (normalized_name == data.KEY_FORMAT)
    if not has_data_format and not sets_format:
        raise errors.InterfaceError(
            'first data format row must set property %s instead of %s'
            % (_compat.text_repr(data.KEY_FORMAT), _compat.text_repr(property_name)),
            self._location)
    if has_data_format and sets_format:
        raise errors.InterfaceError(
            'data format already is %s and must be set only once'
            % _compat.text_repr(self._data_format.format),
            self._location)

    # The format name is matched case insensitively; other property values
    # are passed on as they are.
    normalized_value = property_value.lower()
    if has_data_format:
        self._data_format.set_property(normalized_name, property_value, self._location)
    else:
        self._data_format = data.DataFormat(normalized_value, self._location)
def __init__(self, field_name, is_allowed_to_be_empty, length, rule, data_format):
    """
    Create a field format that accepts only the single constant described
    by ``rule``.

    :raises cutplace.errors.InterfaceError: if ``rule`` consists of more
      than one token, if the empty mark and an empty rule do not go
      together, or if the length does not match the constant
    """
    super(ConstantFieldFormat, self).__init__(
        field_name, is_allowed_to_be_empty, length, rule, data_format, empty_value='')

    # Extract the constant from the first token of the rule.
    rule_tokens = _tools.tokenize_without_space(rule)
    first_token = next(rule_tokens)
    has_constant_token = not _tools.is_eof_token(first_token)
    if has_constant_token:
        self._constant = _tools.token_text(first_token)
    else:
        # No rule means that the field must always be empty.
        self._constant = ''
    if has_constant_token:
        surplus_token = next(rule_tokens)
        if not _tools.is_eof_token(surplus_token):
            raise errors.InterfaceError(
                _('constant rule must be a single Python token but also found: %s')
                % _compat.text_repr(_tools.token_text(surplus_token)))

    # The "empty" mark and an empty rule must always come together.
    rule_is_empty = (rule == '')
    if self.is_allowed_to_be_empty:
        if not rule_is_empty:
            raise errors.InterfaceError(
                _('to describe a Constant that can be empty, use a Choice field with a single choice'))
    elif rule_is_empty:
        raise errors.InterfaceError(
            _('field must be marked as empty to describe a constant empty value'))

    # The declared length must permit the length of the constant.
    constant_length = len(self._constant)
    try:
        self.length.validate(
            _('rule of constant field %s') % _compat.text_repr(self.field_name),
            constant_length)
    except errors.RangeValueError:
        raise errors.InterfaceError(
            _('length is %s but must be %d to match constant %s')
            % (self.length, constant_length, _compat.text_repr(self._constant)))
def validated_field_name(supposed_field_name, location=None):
    """
    Same as ``supposed_field_name`` except with surrounding white space
    removed.

    :param str supposed_field_name: the field name to validate
    :param cutplace.errors.Location location: location used in case of errors
    :raise cutplace.errors.InterfaceError: if ``supposed_field_name`` is
      empty, a Python keyword, does not start with a character from
      ``_ASCII_LETTERS`` or contains a character outside
      ``_ASCII_LETTERS_DIGITS_AND_UNDERSCORE``
    """
    field_name = supposed_field_name.strip()
    basic_requirements_text = 'field name must be a valid Python name consisting of ASCII letters, ' \
                              'underscore (_) and digits'
    if field_name == '':
        # Fix: separate the requirement text from the actual finding with a
        # blank instead of gluing them together ("...digitsbut is empty").
        raise errors.InterfaceError(basic_requirements_text + ' but is empty', location)
    if keyword.iskeyword(field_name):
        raise errors.InterfaceError(
            "field name must not be a Python keyword but is: '%s'" % field_name, location)

    # NOTE(review): the message asks for a lower-case letter while the check
    # uses ``_ASCII_LETTERS``; confirm whether that set includes upper case.
    if field_name[0] not in _ASCII_LETTERS:
        raise errors.InterfaceError(
            "field name must begin with a lower-case letter but is: %s"
            % _compat.text_repr(field_name), location)
    for character in field_name[1:]:
        if character not in _ASCII_LETTERS_DIGITS_AND_UNDERSCORE:
            raise errors.InterfaceError(
                '%s but is: %s' % (basic_requirements_text, _compat.text_repr(field_name)),
                location)
    return field_name
def _validated_character(key, value, location):
    r"""
    A single character intended as value for data format property ``key``
    derived from ``value``, which can be:

    * a decimal or hex number (prefixed with ``'0x'``) referring to the
      ASCII/Unicode of the character
    * a string containing a single character such as ``'\t'``.
    * a symbolic name from :py:const:`cutplace.errors.NAME_TO_ASCII_CODE_MAP`
      such as ``tab``.

    :raises cutplace.errors.InterfaceError: on any broken ``value``
    """
    assert key
    assert value is not None

    name_for_errors = 'data format property %s' % _compat.text_repr(key)
    stripped_value = value.strip()
    is_single_non_digit = (len(stripped_value) == 1) and (stripped_value not in string.digits)
    if is_single_non_digit:
        # A lone character that is not a digit stands for itself.
        result_code = ord(stripped_value)
    else:
        value_tokens = tokenize.generate_tokens(io.StringIO(value).readline)
        first_token = next(value_tokens)
        if _tools.is_eof_token(first_token):
            raise errors.InterfaceError(
                _("value for %s must be specified") % name_for_errors, location)
        first_type = first_token[0]
        first_text = first_token[1]

        # Names, numbers and strings each have a dedicated converter.
        code_for_token_type = {
            token.NAME: ranges.code_for_symbolic_token,
            token.NUMBER: ranges.code_for_number_token,
            token.STRING: ranges.code_for_string_token,
        }
        code_for_token = code_for_token_type.get(first_type)
        if code_for_token is not None:
            result_code = code_for_token(name_for_errors, first_text, location)
        elif (len(first_text) == 1) and not _tools.is_eof_token(first_token):
            # Any other token made of a single character, for example an
            # operator such as ``;``.
            result_code = ord(first_text)
        else:
            raise errors.InterfaceError(
                _('value for %s must a number, a single character or a symbolic name but is: %s'
                  ) % (name_for_errors, _compat.text_repr(value)), location)

        # Ensure there are no further tokens.
        following_token = next(value_tokens)
        is_end = _tools.is_eof_token(following_token) or (following_token[0] == tokenize.NEWLINE)
        if not is_end:
            raise errors.InterfaceError(
                _('value for %s must be a single character but is: %s')
                % (name_for_errors, _compat.text_repr(value)), location)

    # TODO: Handle 'none' properly.
    assert result_code is not None
    assert result_code >= 0
    result = six.unichr(result_code)
    assert result is not None
    return result
def add_check_row(self, possibly_incomplete_items):
    """
    Add a check as declared in ``possibly_incomplete_items``, which ideally
    is a list composed of 3 elements:

    1. description ('customer_id_must_be_unique')
    2. type (e.g. 'IsUnique' mapping to :py:class:`cutplace.checks.IsUniqueCheck`)
    3. rule (e.g. 'customer_id')

    Missing items are interpreted as empty string (``''``), additional items
    are ignored.

    :raises cutplace.errors.InterfaceError: on broken
      ``possibly_incomplete_items``
    """
    assert possibly_incomplete_items is not None

    cells = list(possibly_incomplete_items)
    # HACK: Ignore possible concatenated (empty) cells between description and type.
    while (len(cells) > 1) and not cells[1].strip():
        cells.pop(1)
    cells.extend([''] * 3)
    check_description, check_type, check_rule = cells[:3]

    self._location.advance_cell()
    if check_description == '':
        raise errors.InterfaceError('check description must be specified', self._location)

    self._location.advance_cell()
    check_class_name = check_type + "Check"
    if check_class_name not in self._check_name_to_class_map:
        available_check_types = _tools.human_readable_list(
            sorted(self._check_name_to_class_map))
        raise errors.InterfaceError(
            "check type is '%s' but must be one of: %s" % (check_type, available_check_types),
            self._location)

    _log.debug("create check: %s(%r, %r)", check_type, check_description, check_rule)
    check_class = self._create_check_class(check_type)
    init_arguments = (check_description, check_rule, self._field_names, self._location)
    # Allocation and initialization are two separate, explicit steps.
    check = check_class.__new__(check_class, *init_arguments)
    check.__init__(*init_arguments)

    # A duplicate description is reported at the description cell.
    self._location.set_cell(1)
    existing_check = self._check_name_to_check_map.get(check_description)
    if existing_check is not None:
        raise errors.InterfaceError(
            "check description must be used only once: %s" % _compat.text_repr(check_description),
            self._location, "first declaration", existing_check.location)

    self._check_name_to_check_map[check_description] = check
    self._check_names.append(check_description)
    assert len(self.check_names) == len(self._check_name_to_check_map)
def _validated_int_at_least_0(key, value, location): assert key assert value is not None try: result = int(value) except ValueError: raise errors.InterfaceError( _('data format property %s is %s but must be a number') % (_compat.text_repr(key), _compat.text_repr(value)), location) if result < 0: raise errors.InterfaceError( _('data format property %s is %d but must be at least 0') % (_compat.text_repr(key), result), location) return result
def __init__(self, description, rule, available_field_names, location=None):
    """
    Create a check for unique values in the fields named by ``rule``,
    which is a comma separated list of field names.

    :raises cutplace.errors.InterfaceError: if ``rule`` is empty, not a
      comma separated list of names, or refers to an unknown or duplicate
      field name
    """
    super(IsUniqueCheck, self).__init__(description, rule, available_field_names, location)
    self._field_names_to_check = []
    self._row_key_to_location_map = None
    self.reset()

    # Extract field names to check from rule.
    rule_tokens = tokenize.generate_tokens(_compat.token_io_readline(rule))
    expects_field_name = True
    seen_field_names = set()
    current_token = next(rule_tokens)
    while True:
        if _tools.is_eof_token(current_token) or (current_token[0] == tokenize.NEWLINE):
            break
        token_type, token_value = current_token[0], current_token[1]
        if expects_field_name:
            if token_type != tokenize.NAME:
                raise errors.InterfaceError(
                    _("field name must contain only ASCII letters, numbers and underscores (_) "
                      "but found: %r [token type=%r]") % (token_value, token_type),
                    self.location_of_rule)
            try:
                fields.field_name_index(token_value, available_field_names, location)
                if token_value in seen_field_names:
                    raise errors.InterfaceError(
                        _("duplicate field name for unique check must be removed: %s"
                          ) % token_value, self.location_of_rule)
                seen_field_names.add(token_value)
            except errors.InterfaceError as error:
                # Both an unknown and a duplicate field name are reported
                # again using only the text of the original error.
                raise errors.InterfaceError(six.text_type(error))
            self._field_names_to_check.append(token_value)
        elif not _tools.is_comma_token(current_token):
            raise errors.InterfaceError(
                _("after field name a comma (,) must follow but found: %r") % token_value,
                self.location_of_rule)
        # Field names and commas must alternate.
        expects_field_name = not expects_field_name
        current_token = next(rule_tokens)

    if not self._field_names_to_check:
        raise errors.InterfaceError(
            _("rule must contain at least one field name to check for uniqueness"
              ), self.location_of_rule)
def __init__(self, field_name, is_allowed_to_be_empty, length_text, rule, data_format, empty_value=None):
    """
    Create a field format for decimal numbers using the decimal and
    thousands separators of ``data_format``.

    :raises cutplace.errors.InterfaceError: if ``rule`` contains anything
      but white space
    """
    super(DecimalFieldFormat, self).__init__(
        field_name, is_allowed_to_be_empty, length_text, rule, data_format, empty_value)
    rule_without_blanks = rule.strip()
    if rule_without_blanks:
        raise errors.InterfaceError("decimal rule must be empty")
    # Remember the separators from the data format.
    self.decimalSeparator = data_format.decimal_separator
    self.thousandsSeparator = data_format.thousands_separator
def code_for_string_token(name, value, location):
    """
    The numeric code for text representing a string with a single character
    in ``value``.

    :param str name: the name of the value as it is known to the end user
    :param str value: the text that represents a string with a single
      character, including the surrounding quotes; the character can be
      written as an escape sequence such as ``\\t`` or ``\\x09``
    :param cutplace.errors.Location location: the location of ``value`` or
      ``None``
    :return: the code of the single character as computed by :py:func:`ord`
    :raises cutplace.errors.InterfaceError: if the text between the quotes
      is not exactly one character or is a broken escape sequence
    """
    assert name is not None
    assert value is not None
    assert len(value) >= 2
    left_quote = value[0]
    right_quote = value[-1]
    assert left_quote in "\"\'", "left_quote=%r" % left_quote
    assert right_quote in "\"\'", "right_quote=%r" % right_quote

    value_without_quotes = value[1:-1]
    if len(value_without_quotes) != 1:
        # More than one character might still be a single escaped character.
        try:
            value_without_quotes = value_without_quotes.encode('utf-8').decode(
                'unicode_escape')
        except UnicodeDecodeError:
            # Fix: a malformed escape sequence such as '\x' used to escape
            # as UnicodeDecodeError; treat it like any other text that is
            # not a single character so the error below is raised.
            value_without_quotes = ''
    if len(value_without_quotes) != 1:
        raise errors.InterfaceError(
            _('text for %s must be a single character but is: %s')
            % (name, _compat.text_repr(value)), location)
    return ord(value_without_quotes)
def __init__(self, description, rule, available_field_names, location_of_definition=None):
    r"""
    Create a check.

    :param str description: human readable description of the check
    :param str rule: the check conditions to validate
    :param list available_field_names: the names of the fields available
      for the check (typically referring to
      :py:attr:`cutplace.interface.Cid.field_names`)
    :param location_of_definition: location in the CID where the check was
      declared to be (used by error messages); if ``None``, use
      ``cutplace.errors.create_caller_location(['checks'])``
    :type location_of_definition: :py:class:`~cutplace.errors.Location` or None
    :raises cutplace.errors.InterfaceError: if ``available_field_names`` is
      empty
    """
    assert description
    assert rule is not None
    assert available_field_names is not None

    if len(available_field_names) == 0:
        raise errors.InterfaceError(
            _("field names must be specified before check"), location_of_definition)

    self._description = description
    self._rule = rule
    self._field_names = available_field_names

    if location_of_definition is not None:
        # Work on copies so the location of the caller remains untouched;
        # the check itself refers to cell 1 and its rule to cell 3.
        location_of_check = copy.copy(location_of_definition)
        location_of_check.set_cell(1)
        location_of_rule = copy.copy(location_of_definition)
        location_of_rule.set_cell(3)
    else:
        location_of_check = errors.create_caller_location(['checks'])
        location_of_rule = location_of_check
    self._location = location_of_check
    self._location_of_rule = location_of_rule
def __init__(self, description, rule, available_field_names, location=None):
    """
    Create a check on the number of distinct values in a field. ``rule``
    starts with the name of the field to count, followed by the remainder
    of a Python expression to evaluate against that count.

    :raises cutplace.errors.InterfaceError: if ``rule`` does not start with
      the name of an available field or the resulting expression cannot be
      evaluated
    """
    super(DistinctCountCheck, self).__init__(description, rule, available_field_names, location)

    # Obtain and validate field to count.
    rule_tokens = tokenize.generate_tokens(_compat.token_io_readline(rule))
    name_token = next(rule_tokens)
    name_token_type = name_token[0]
    name_token_text = name_token[1]
    if name_token_type != tokenize.NAME:
        raise errors.InterfaceError(
            _("rule must start with a field name but found: %r") % name_token_text,
            self.location_of_rule)
    self._field_name_to_count = name_token_text
    fields.field_name_index(self._field_name_to_count, available_field_names, location)
    end_line, end_column = name_token[3]
    assert end_column > 0
    assert end_line == 1

    # Build and test Python expression for validation: the field name at
    # the beginning of the rule is replaced by the name of the counter.
    remainder_of_rule = rule[end_column:]
    self._expression = DistinctCountCheck._COUNT_NAME + remainder_of_rule
    self._distinct_value_to_count_map = None
    self.reset()
    self._eval()
def __init__(self, field_name, is_allowed_to_be_empty, length, rule, data_format):
    """
    Create a field format that accepts only the choices listed in ``rule``,
    separated by comma.

    The accepted values are collected in ``self.choices``.

    :raises cutplace.errors.InterfaceError: if ``rule`` contains a comma
      without a choice before it, an empty choice, two choices without a
      comma in between or a trailing comma, or if there are no choices
      although the field is not allowed to be empty
    """
    super(ChoiceFieldFormat, self).__init__(field_name, is_allowed_to_be_empty, length, rule, data_format,
                                            empty_value='')
    self.choices = []

    # Split rule into tokens, ignoring white space.
    tokens = _tools.tokenize_without_space(rule)

    # Extract choices from rule tokens.
    # Fix: ``previous_toky`` is now updated whenever the next token is
    # fetched. Before, it always remained ``None`` so the error for a
    # misplaced comma could never show the token preceding it.
    previous_toky = None
    toky = next(tokens)
    while not _tools.is_eof_token(toky):
        if _tools.is_comma_token(toky):
            # Handle comma after comma without choice, or a comma at the
            # very beginning of the rule (nothing precedes it).
            previous_toky_text = previous_toky[1] if previous_toky is not None else None
            raise errors.InterfaceError(
                "choice value must precede a comma (,) but found: %s"
                % _compat.text_repr(previous_toky_text))
        choice = _tools.token_text(toky)
        if not choice:
            raise errors.InterfaceError(
                "choice field must be allowed to be empty instead of containing an empty choice")
        self.choices.append(choice)
        previous_toky, toky = toky, next(tokens)
        if not _tools.is_eof_token(toky):
            if not _tools.is_comma_token(toky):
                raise errors.InterfaceError(
                    "comma (,) must follow choice value %s but found: %s"
                    % (_compat.text_repr(choice), _compat.text_repr(toky[1])))
            # Process next choice after comma.
            previous_toky, toky = toky, next(tokens)
            if _tools.is_eof_token(toky):
                raise errors.InterfaceError(
                    "trailing comma (,) must be removed")
    if not self.is_allowed_to_be_empty and not self.choices:
        raise errors.InterfaceError(
            "choice field without any choices must be allowed to be empty")
def read(self, cid_path, rows):
    """
    Provided no ``cid_path`` has already been specified for
    :py:class:`~cutplace.interface.Cid.__init__()`, process ``rows`` using
    :py:meth:`~cutplace.interface.Cid.add_data_format_row()`,
    :py:meth:`~cutplace.interface.Cid.add_field_format_row()` and
    :py:meth:`~cutplace.interface.Cid.add_check_row()`. Report any errors
    by referring to ``cid_path``.

    :param str cid_path: the path from which ``rows`` were obtained
    :param sequence rows: sequence of lists where each list either
      describes a data format, field format, check or comment for a CID.
    :raises cutplace.errors.InterfaceError: in case any row in ``rows``
      cannot be processed
    """
    assert cid_path is not None
    assert self.data_format is None, 'CID must be read only once'

    # TODO: Detect format and use proper reader.
    self._location = errors.Location(cid_path, has_cell=True)
    if self._cid_path is None:
        self._cid_path = cid_path

    # The first cell of a row decides how the remaining cells are processed.
    row_type_to_add_row_map = {
        'd': self.add_data_format_row,
        'f': self.add_field_format_row,
        'c': self.add_check_row,
    }
    for row in rows:
        if row:
            row_type = row[0].lower().strip()
            # Ensure there are exactly 6 cells after the row type.
            row_data = (row[1:] + [''] * 6)[:6]
            add_row = row_type_to_add_row_map.get(row_type)
            if add_row is not None:
                add_row(row_data)
            elif row_type != '':
                # Raise error when value is not supported.
                raise errors.InterfaceError(
                    'CID row type is "%s" but must be empty or one of: C, D, or F' % row_type,
                    self._location)
        self._location.advance_line()

    if self.data_format is None:
        raise errors.InterfaceError('data format must be specified', self._location)
    self.data_format.validate()
    if len(self.field_names) == 0:
        raise errors.InterfaceError('fields must be specified', self._location)
def _eval(self):
    """
    The current result of `self._expression`, which is either ``True`` or
    ``False``.

    :raises cutplace.errors.InterfaceError: if the expression cannot be
      evaluated or results in something other than ``True`` or ``False``
    """
    # NOTE(review): the expression is built from the rule of the check and
    # run with eval(); confirm that rules only come from trusted sources.
    count_name = DistinctCountCheck._COUNT_NAME
    names_for_expression = {count_name: self._distinct_count()}
    try:
        result = eval(self._expression, {}, names_for_expression)
    except Exception as message:
        raise errors.InterfaceError(
            _("cannot evaluate count expression %r: %s") % (self._expression, message),
            self.location_of_rule)
    if result in (True, False):
        return result
    raise errors.InterfaceError(
        _("count expression %r must result in %r or %r, but test resulted in: %r"
          ) % (self._expression, True, False, result), self.location_of_rule)
def check_distinct(name1, name2):
    """
    Ensure that the attributes ``'_' + name1`` and ``'_' + name2`` of
    ``self`` (taken from the enclosing scope) have different values.

    :raises cutplace.errors.InterfaceError: if both values are equal
    """
    assert name1 is not None
    assert name2 is not None
    assert name1 < name2, 'names must be sorted for consistent error message: %r, %r' % (
        name1, name2)
    attributes = self.__dict__
    first_value = attributes['_' + name1]
    second_value = attributes['_' + name2]
    if first_value != second_value:
        return
    raise errors.InterfaceError(
        _("'%s' and '%s' are both %s but must be different from each other"
          ) % (name1, name2, _compat.text_repr(first_value)))
def __init__(self, field_name, is_allowed_to_be_empty, length_text, rule, data_format, empty_value=None):
    """
    Create a field format for integer numbers and derive
    ``self.valid_range`` from the rule, the length or a default range.

    :raises cutplace.errors.InterfaceError: if both a length and a rule are
      specified and a limit of the rule does not fit the length
    """
    super(IntegerFieldFormat, self).__init__(
        field_name, is_allowed_to_be_empty, length_text, rule, data_format, empty_value)
    is_fixed_format = (data_format.format == data.FORMAT_FIXED)

    has_length = (length_text is not None) and (length_text.strip() != '')
    if has_length:
        if is_fixed_format:
            # For fixed data format, use an implicit range starting from
            # 1 to take into account that leading and trailing blanks
            # might be missing from the rule parts.
            assert self.length.lower_limit == self.length.upper_limit
            length = ranges.Range('1...%d' % self.length.upper_limit)
        else:
            length = self.length
        length_range = ranges.create_range_from_length(length)

    has_rule = (rule is not None) and (rule.strip() != '')
    if has_rule:
        rule_range = ranges.Range(rule)

    if has_length and has_rule:
        # Both a length and a rule have been specified: check if all
        # non ``None`` parts of each item of the rule fit within the
        # range of the length. Then use the rule as valid range.
        for rule_item in rule_range.items:
            for limit in rule_item:
                if limit is None:
                    continue
                length_of_limit = _tools.length_of_int(limit)
                try:
                    length.validate(
                        "length of partial rule limit '%d'" % limit, length_of_limit)
                except errors.RangeValueError as error:
                    raise errors.InterfaceError(
                        "length must be consistent with rule: %s" % error)
        self.valid_range = rule_range
    elif has_length:
        # A length but no rule has been specified: derive a valid
        # range from the length.
        self.valid_range = length_range
    elif has_rule:
        # No length but a rule has been specified: use the rule as
        # valid range.
        self.valid_range = rule_range
    else:
        # No length and no rule has been specified: use a default
        # range of signed 32 bit integer. If the user wants a bigger
        # range, he has to specify it. Python's ``int`` scales to any
        # range as long as there is enough memory available to
        # represent it.
        self.valid_range = ranges.Range(ranges.DEFAULT_INTEGER_RANGE_TEXT)
def _create_class(self, name_to_class_map, class_qualifier, class_name_appendix, type_name): assert name_to_class_map assert class_qualifier assert class_name_appendix assert type_name class_name = class_qualifier.split(".")[-1] + class_name_appendix result = name_to_class_map.get(class_name) if result is None: raise errors.InterfaceError( "cannot find class for %s %s: related class is %s but must be one of: %s" % ( type_name, class_qualifier, class_name, _tools.human_readable_list(sorted(name_to_class_map.keys()))), self._location) return result
def rows(self):
    """
    Data rows of ``source_path``.

    Header rows are not yielded. Rows after the header are validated with
    :py:meth:`validate_row` unless they come after ``validate_until``, in
    which case they are yielded without validation.

    Even with ``on_error`` set to 'continue' or 'yield' certain errors
    still cause a stop, for example checks at the end of the file still
    raise a :py:exc:`cutplace.errors.CheckError` and generally broken files
    result in a :py:exc:`cutplace.errors.DataFormatError`.

    :raises cutplace.errors.DataError: on broken data
    :raises cutplace.errors.InterfaceError: if the header row should be
      validated against the field names but there is more than one header
      row
    """
    # Start counting and checking from scratch.
    self.accepted_rows_count = 0
    self.rejected_rows_count = 0
    for check in self.cid.check_map.values():
        check.reset()
    header_row_count = self._cid.data_format.header
    # Rows are counted starting with 1, so the first ``header_row_count``
    # rows are header rows.
    for row_count, row in enumerate(self._raw_rows(), 1):
        try:
            is_after_header_row = (row_count > header_row_count)
            is_before_validate_until = (self._validate_until is None) or (row_count <= self._validate_until)
            if is_after_header_row:
                if is_before_validate_until:
                    self.validate_row(row)
                self.accepted_rows_count += 1
                yield row
            else:
                if self.cid.data_format.validate_header_row_against_field_names:
                    # we don't know, which header row to validate if there are multiple ones
                    if header_row_count > 1:
                        raise errors.InterfaceError(
                            _("Cannot validate the header row, when 'Header' is set to '{count}'. "
                              "Either set 'Header' to '1' or disable header validation with "
                              "'Validate header row against field names' set to 'False'.").format(
                                count=header_row_count
                            )
                        )
                    self.validate_header(row)
        except errors.DataError as error:
            # Depending on ``on_error``, pass the error on to the caller,
            # hand it out instead of the row or silently skip the row.
            if self.on_error == 'raise':
                raise
            self.rejected_rows_count += 1
            if self.on_error == 'yield':
                yield error
            else:
                assert self.on_error == 'continue'
        self._location.advance_line()
def __init__(self, format_name, location=None):
    r"""
    Create a new data format with default values for all properties
    available for ``format_name``.

    :param str format_name: the data format, which must be one of
      :py:const:`FORMAT_DELIMITED`, :py:const:`FORMAT_EXCEL`,
      :py:const:`FORMAT_FIXED` or :py:const:`FORMAT_ODS`; ``'csv'`` is
      accepted as synonym for :py:const:`FORMAT_DELIMITED`
    :param cutplace.errors.Location location: location where the data format
      was declared
    :raises cutplace.errors.InterfaceError: if ``format_name`` is not a
      valid format
    """
    assert format_name == format_name.lower(), \
        'format_name must be lower case: %r' % format_name

    accepted_format_names = _VALID_FORMATS + ['csv']
    if format_name not in accepted_format_names:
        message = _('format is %s but must be on of: %s') % (format_name, _VALID_FORMATS)
        if location is not None:
            error_location = location
        else:
            error_location = errors.create_caller_location(['data'])
        raise errors.InterfaceError(message, error_location)

    # HACK: Treat ``format_name`` 'csv' as synonym for ``FORMAT_DELIMITED``.
    if format_name == 'csv':
        self._format = FORMAT_DELIMITED
    else:
        self._format = format_name

    # Properties available for all formats.
    self._header = 0
    self._validate_header_row_against_field_names = False
    self._is_valid = False
    self._allowed_characters = None
    self._encoding = 'cp1252'
    self._quoting = csv.QUOTE_MINIMAL
    self._strict_field_names = True

    # Properties available only for certain formats. The order in which
    # the attributes are created is the same as before the restyling.
    is_delimited = (self.format == FORMAT_DELIMITED)
    if is_delimited:
        self._escape_character = '"'
        self._item_delimiter = ','
        self._quote_character = '"'
        self._skip_initial_space = False
    is_text_based = self.format in (FORMAT_DELIMITED, FORMAT_FIXED)
    if is_text_based:
        self._decimal_separator = '.'
        self._line_delimiter = ANY
        self._thousands_separator = ''
        # Valid values for property 'line delimiter', which is only available for delimited and fixed data
        # with no line delimiter only allowed for fixed data.
        is_fixed = (self.format == FORMAT_FIXED)
        valid_line_delimiter_texts = []
        for line_delimiter, line_delimiter_text in LINE_DELIMITER_TO_TEXT_MAP.items():
            if (line_delimiter is not None) or is_fixed:
                valid_line_delimiter_texts.append(line_delimiter_text)
        self._VALID_LINE_DELIMITER_TEXTS = sorted(valid_line_delimiter_texts)
    elif self.format in (FORMAT_EXCEL, FORMAT_ODS):
        self._sheet = 1
def _validated_choice(key, value, choices, location, ignore_case=False): """ Same as ``value`` or ``value.lower()`` in case ``ignore_case`` is set to ``True``. If the supposed result is not on of the available ``choices``, raise `errors.InterfaceError`. """ assert key assert value is not None assert choices result = value if not ignore_case else value.lower() if result not in choices: raise errors.InterfaceError( _('data format property %s is %s but must be one of: %s') % (_compat.text_repr(key), _compat.text_repr(value), _tools.human_readable_list(choices)), location) return result
def code_for_number_token(name, value, location):
    """
    The numeric code for text representing an :py:class:`int` in ``value``.

    :param str name: the name of the value as it is known to the end user
    :param str value: the text that represents an :py:class:`int`,
      possibly with a base prefix such as ``0x``
    :param cutplace.errors.Location location: the location of ``value`` or
      ``None``
    :raises cutplace.errors.InterfaceError: if ``value`` is not an integer
      number
    """
    assert name is not None
    assert value is not None

    # Note: base 0 automatically handles prefixes like 0x.
    automatic_base = 0
    try:
        return int(value, automatic_base)
    except ValueError:
        raise errors.InterfaceError(
            'numeric value for %s must be an integer number but is: %s'
            % (name, _compat.text_repr(value)), location)
def code_for_symbolic_token(name, value, location):
    """
    The numeric code for text representing a symbolic name in ``value``,
    which has to be one of the keys in
    :py:const:`cutplace.errors.NAME_TO_ASCII_CODE_MAP` (ignoring case).

    :param str name: the name of the value as it is known to the end user
    :param str value: the text that represents a symbolic name
    :param cutplace.errors.Location location: the location of ``value`` or
      ``None``
    :raises cutplace.errors.InterfaceError: if ``value`` is not a known
      symbolic name
    """
    assert name is not None
    assert value is not None

    symbol = value.lower()
    try:
        return errors.NAME_TO_ASCII_CODE_MAP[symbol]
    except KeyError:
        valid_symbols = _tools.human_readable_list(sorted(errors.NAME_TO_ASCII_CODE_MAP))
        raise errors.InterfaceError(
            'symbolic name %s for %s must be one of: %s'
            % (_compat.text_repr(value), name, valid_symbols), location)
def field_name_index(field_name_to_look_up, available_field_names, location):
    """
    The index of ``field_name_to_look_up`` (without leading or trailing
    white space) in ``available_field_names``.

    :param str field_name_to_look_up: the name of the field to find
    :param list available_field_names: the names to search
    :param cutplace.errors.Location location: location used in case of errors
    :raise cutplace.errors.InterfaceError: if ``field_name_to_look_up`` is
      not part of ``available_field_names``
    """
    assert field_name_to_look_up is not None
    assert field_name_to_look_up == field_name_to_look_up.strip()
    assert available_field_names

    # Strip again in case assertions are disabled.
    stripped_field_name = field_name_to_look_up.strip()
    try:
        return available_field_names.index(stripped_field_name)
    except ValueError:
        raise errors.InterfaceError(
            'unknown field name %s must be replaced by one of: %s'
            % (_compat.text_repr(stripped_field_name),
               _tools.human_readable_list(available_field_names)),
            location)
def add_field_format_row(self, possibly_incomplete_items):
    """
    Add field as described by `possibly_incomplete_items`, which is a
    list consisting of:

    1) field name
    2) optional: example value (can be empty)
    3) optional: empty flag ("X" = field is allowed to be empty)
    4) optional: length (using the syntax of :py:class:`cutplace.ranges.Range`)
    5) optional: field type (e.g. 'Integer' for :py:class:`cutplace.fields.IntegerFieldFormat`);
       an empty type results in 'Text'
    6) optional: rule to validate field (depending on type)

    Any missing items are interpreted as empty string (``''``).
    Additional items are ignored.

    While the items are processed, the cell of the current location is
    advanced so that errors refer to the item that caused them.

    :raises cutplace.errors.InterfaceError: on broken
      ``possibly_incomplete_items`` or if no data format has been
      specified yet
    """
    assert possibly_incomplete_items is not None
    assert self._location is not None

    if self._data_format is None:
        raise errors.InterfaceError(
            "data format must be specified before first field", self._location)

    # Assert that the various lists and maps related to fields are in a consistent state.
    # Ideally this would be a class invariant, but this is Python, not Eiffel.
    field_count = len(self.field_names)
    assert len(self._field_formats) == field_count
    assert len(self._field_name_to_format_map) == field_count
    assert len(self._field_name_to_index_map) == field_count

    # Pad or cut the row to exactly 6 items.
    items = (possibly_incomplete_items + 6 * [''])[:6]

    # Obtain field name.
    field_name = fields.validated_field_name(items[0], self._location)
    if field_name in self._field_name_to_format_map:
        # TODO: Add see_also_location pointing to previous declaration.
        raise errors.InterfaceError(
            'duplicate field name must be changed to a unique one: %s' % field_name,
            self._location)

    # Obtain example.
    self._location.advance_cell()
    field_example = items[1]

    # Obtain "empty" mark.
    self._location.advance_cell()
    field_is_allowed_to_be_empty_text = items[2].strip().lower()
    if field_is_allowed_to_be_empty_text == '':
        field_is_allowed_to_be_empty = False
    elif field_is_allowed_to_be_empty_text == self._EMPTY_INDICATOR:
        field_is_allowed_to_be_empty = True
    else:
        raise errors.InterfaceError(
            "mark for empty field must be %s or empty but is %s"
            % (self._EMPTY_INDICATOR, field_is_allowed_to_be_empty_text), self._location)

    # Obtain length.
    self._location.advance_cell()
    field_length = items[3]

    # Obtain field type and rule.
    self._location.advance_cell()
    field_type_item = items[4].strip()
    if field_type_item == '':
        field_type = 'Text'
    else:
        # Validate each part of a possibly dotted type name and join the
        # validated parts again.
        field_type = ''
        field_type_parts = field_type_item.split(".")
        try:
            for part in field_type_parts:
                if field_type:
                    field_type += "."
                field_type += _tools.validated_python_name(
                    "field type part", part)
            assert field_type, "empty field type must be detected by validated_python_name()"
        except NameError as error:
            raise errors.InterfaceError(six.text_type(error), self._location)
    field_class = self._create_field_format_class(field_type)
    self._location.advance_cell()
    field_rule = items[5].strip()
    _log.debug("create field: %s(%r, %r, %r)", field_class.__name__, field_name, field_type, field_rule)
    try:
        # Allocate and initialize the field format in two explicit steps.
        field_format = field_class.__new__(field_class, field_name, field_is_allowed_to_be_empty, field_length,
                                           field_rule)
        field_format.__init__(field_name, field_is_allowed_to_be_empty, field_length, field_rule,
                              self._data_format)
    except errors.InterfaceError as error:
        # Add the field name to the message and fall back to the current
        # location in case the error has none of its own.
        error_location = error.location if error.location is not None else self._location
        error.prepend_message(
            'cannot declare field %s' % _compat.text_repr(field_name), error_location)
        raise error

    # Validate field length.
    # TODO #82: Cleanup validation for declared field formats.
    self._location.set_cell(4)
    # From here on ``field_length`` is the length of the field format
    # instead of the text from the row.
    field_length = field_format.length
    if self._data_format.format == data.FORMAT_FIXED:
        if field_length.items is None:
            raise errors.InterfaceError(
                "length of field %s must be specified with fixed data format"
                % _compat.text_repr(field_name), self._location)
        if field_length.lower_limit != field_length.upper_limit:
            raise errors.InterfaceError(
                "length of field %s for fixed data format must be a specific number but is: %s"
                % (_compat.text_repr(field_name), field_format.length), self._location)
        if field_length.lower_limit < 1:
            raise errors.InterfaceError(
                "length of field %s for fixed data format must be at least 1 but is: %d"
                % (_compat.text_repr(field_name), field_format.length.lower_limit), self._location)
    elif field_length.lower_limit is not None:
        if field_length.lower_limit < 0:
            raise errors.InterfaceError(
                "lower limit for length of field %s must be at least 0 but is: %d"
                % (_compat.text_repr(field_name), field_format.length.lower_limit), self._location)
    elif field_length.upper_limit is not None:
        # This branch is reached only if there is no lower limit.
        # Note: 0 as upper limit is valid for a field that must always be empty.
        if field_length.upper_limit < 0:
            raise errors.InterfaceError(
                "upper limit for length of field %s must be at least 0 but is: %d"
                % (_compat.text_repr(field_name), field_format.length.upper_limit), self._location)

    # Set and validate example in case there is one.
    if field_example != '':
        try:
            field_format.example = field_example
        except errors.FieldValueError as error:
            # Report the error at the cell containing the example.
            self._location.set_cell(2)
            raise errors.InterfaceError(
                "cannot validate example for field %s: %s"
                % (_compat.text_repr(field_name), error), self._location)

    # Reset the location to the cell with the field name.
    self._location.set_cell(1)

    assert field_name
    assert field_type
    assert field_rule is not None

    self.add_field_format(field_format)
def set_property(self, name, value, location=None):
    r"""
    Set data format property ``name`` to ``value`` possibly translating
    ``value`` from a human readable representation to an internal one.

    :param str name: any of the ``KEY_*`` constants; must be lower case, \
        blanks are treated like underscores
    :param value: the value to set the property to as it would show up in a CID. \
        In some cases, the value will be translated to an internal representation. \
        For example ``set_property(KEY_LINE_DELIMITER, 'lf')`` results in \
        :py:attr:`cutplace.data.line_delimiter` being ``'\n'``.
    :type value: str or None
    :param location: passed on to any \
        :py:exc:`cutplace.errors.InterfaceError` raised by this method
    :raises cutplace.errors.InterfaceError: if ``name`` is not a valid property name for this data format
    :raises cutplace.errors.InterfaceError: if ``value`` is invalid for the specified property
    """
    assert not self.is_valid, 'after validate() has been called property %r cannot be set anymore' % name
    assert name is not None
    assert name == name.lower(), 'property name must be lower case: %r' % name
    assert (value is not None) or (name in (KEY_ALLOWED_CHARACTERS, KEY_LINE_DELIMITER))

    name = name.replace(' ', '_')
    # A property is only valid for this format if a matching underscore
    # prefixed attribute already exists on the instance.
    property_attribute_name = '_' + name
    if property_attribute_name not in self.__dict__:
        valid_property_names = _tools.human_readable_list(list(self.__dict__.keys()))
        raise errors.InterfaceError(
            _('data format property %s for format %s is %s but must be one of %s')
            % (_compat.text_repr(name), self.format, _compat.text_repr(value), valid_property_names),
            location)

    if name == KEY_ENCODING:
        try:
            codecs.lookup(value)
        except LookupError:
            # Report the rejected ``value``, not the encoding currently set.
            raise errors.InterfaceError(
                _('value for data format property %s is %s but must be a valid encoding')
                % (_compat.text_repr(KEY_ENCODING), _compat.text_repr(value)),
                location)
        self.encoding = value
    elif name == KEY_HEADER:
        self.header = DataFormat._validated_int_at_least_0(name, value, location)
    elif name == KEY_VALIDATE_HEADER_ROW_AGAINST_FIELD_NAMES:
        self.validate_header_row_against_field_names = DataFormat._validated_bool(
            KEY_VALIDATE_HEADER_ROW_AGAINST_FIELD_NAMES, value, location)
    elif name == KEY_ALLOWED_CHARACTERS:
        try:
            self._allowed_characters = ranges.Range(value)
        except errors.InterfaceError as error:
            raise errors.InterfaceError(
                _('data format property %s must be a valid range: %s')
                % (_compat.text_repr(KEY_ALLOWED_CHARACTERS), error),
                location)
    elif name == KEY_DECIMAL_SEPARATOR:
        self.decimal_separator = DataFormat._validated_choice(
            KEY_DECIMAL_SEPARATOR, value, _VALID_DECIMAL_SEPARATORS, location)
    elif name == KEY_ESCAPE_CHARACTER:
        self.escape_character = DataFormat._validated_choice(
            KEY_ESCAPE_CHARACTER, value, _VALID_ESCAPE_CHARACTERS, location)
    elif name == KEY_ITEM_DELIMITER:
        item_delimiter = DataFormat._validated_character(KEY_ITEM_DELIMITER, value, location)
        if item_delimiter == '\x00':
            raise errors.InterfaceError(
                _("data format property %s must not be 0 (to avoid zero termindated strings in Python's C based CSV reader)")
                % _compat.text_repr(KEY_ITEM_DELIMITER),
                location)
        self.item_delimiter = item_delimiter
    elif name == KEY_LINE_DELIMITER:
        # NOTE(review): the assertion at the top admits ``value=None`` for
        # this property, but ``value.lower()`` would then raise
        # AttributeError -- confirm whether ``None`` must be supported here.
        try:
            self.line_delimiter = _TEXT_TO_LINE_DELIMITER_MAP[value.lower()]
        except KeyError:
            raise errors.InterfaceError(
                _('line delimiter %s must be changed to one of: %s')
                % (_compat.text_repr(value), _tools.human_readable_list(self._VALID_LINE_DELIMITER_TEXTS)),
                location)
    elif name == KEY_QUOTE_CHARACTER:
        self.quote_character = DataFormat._validated_choice(
            KEY_QUOTE_CHARACTER, value, _VALID_QUOTE_CHARACTERS, location)
    elif name == KEY_SHEET:
        self.sheet = DataFormat._validated_int_at_least_0(KEY_SHEET, value, location)
    elif name == KEY_SKIP_INITIAL_SPACE:
        self.skip_initial_space = DataFormat._validated_bool(KEY_SKIP_INITIAL_SPACE, value, location)
    elif name == KEY_THOUSANDS_SEPARATOR:
        self.thousands_separator = DataFormat._validated_choice(
            KEY_THOUSANDS_SEPARATOR, value, _VALID_THOUSANDS_SEPARATORS, location)
    elif name == KEY_QUOTING:
        # The human readable choice is mapped to the internal quoting value.
        result = DataFormat._validated_choice(
            KEY_QUOTING, value, _VALID_QUOTING, location, ignore_case=True)
        self.quoting = READABLE_TO_CSV_QUOTING_FORMAT[result]
    elif name == KEY_STRICT_FIELD_NAMES:
        self.strict_field_names = DataFormat._validated_bool(KEY_STRICT_FIELD_NAMES, value, location)
    else:
        # The attribute exists but no branch above handles it.
        assert False, 'name=%r' % name
def __init__(self, description, default=None, location=None):
    """
    Setup a decimal range as specified by ``description``.

    :param str description: a range description of the form \
        ``lower...upper`` or ``limit``, possibly consisting of multiple \
        items separated by comma. The numbers in the description also \
        determine the \
        :py:attr:`~cutplace.ranges.DecimalRange.scale` and \
        :py:attr:`~cutplace.ranges.DecimalRange.precision` of the range.
    :param str default: an alternative to use in case ``description`` is \
        ``None`` or empty; in case both ``description`` and \
        ``default`` are ``None`` or empty, the range has no items and \
        keeps :py:const:`DEFAULT_SCALE` and :py:const:`DEFAULT_PRECISION`.
    :param location: passed on to any \
        :py:exc:`cutplace.errors.InterfaceError` raised for a broken \
        ``description``
    :raises cutplace.errors.InterfaceError: if ``description`` cannot be \
        parsed or contains overlapping items
    """
    assert default is None or (default.strip() != ''), "default=%r" % default

    self._precision = DEFAULT_PRECISION
    self._scale = DEFAULT_SCALE

    # Find out if a `description` has been specified and if not, use optional `default` instead.
    has_description = (description is not None) and (description.strip() != '')
    if not has_description and default is not None:
        description = default
        has_description = True

    if not has_description:
        # Use empty ranges.
        self._description = None
        self._items = None
        self._lower_limit = None
        self._upper_limit = None
    else:
        self._description = description.replace('...', ELLIPSIS)
        self._items = []
        tokens = _tools.tokenize_without_space(self._description)
        end_reached = False
        max_digits_after_dot = 0
        max_digits_before_dot = 0
        # Each pass of the outer loop parses one comma separated item.
        while not end_reached:
            lower = None
            upper = None
            ellipsis_found = False
            after_hyphen = False
            next_token = next(tokens)
            while not _tools.is_eof_token(next_token) and not _tools.is_comma_token(next_token):
                next_type = next_token[0]
                next_value = next_token[1]
                if next_type == token.NUMBER:
                    try:
                        decimal_value = decimal.Decimal(next_value)
                        _sign, digits, exponent = decimal_value.as_tuple()
                        # Track the widest number seen so far.
                        digits_after_dot = max(0, -exponent)
                        if digits_after_dot > max_digits_after_dot:
                            max_digits_after_dot = digits_after_dot
                        digits_before_dot = len(digits) + exponent
                        if digits_before_dot > max_digits_before_dot:
                            max_digits_before_dot = digits_before_dot
                    except decimal.DecimalException:
                        raise errors.InterfaceError(
                            _("number must be an decimal or integer but is: %s")
                            % _compat.text_repr(next_value),
                            location)
                    if after_hyphen:
                        decimal_value = decimal_value.copy_negate()
                        after_hyphen = False

                    if ellipsis_found:
                        if upper is None:
                            upper = decimal_value
                        else:
                            raise errors.InterfaceError(
                                _("range must have at most lower and upper limit but found another number: %s")
                                % _compat.text_repr(next_value),
                                location)
                    elif lower is None:
                        lower = decimal_value
                    else:
                        raise errors.InterfaceError(
                            _("number must be followed by ellipsis (...) but found: %s")
                            % _compat.text_repr(next_value),
                            location)
                elif after_hyphen:
                    raise errors.InterfaceError(
                        _("hyphen (-) must be followed by number but found: %s")
                        % _compat.text_repr(next_value),
                        location)
                elif (next_type == token.OP) and (next_value == "-"):
                    after_hyphen = True
                elif next_value in (ELLIPSIS, ':'):
                    ellipsis_found = True
                else:
                    message = "range must be specified using decimal or integer numbers" \
                              " and ellipsis (...) but found: %s [token type: %d]" \
                              % (_compat.text_repr(next_value), next_type)
                    raise errors.InterfaceError(message, location)
                next_token = next(tokens)

            if after_hyphen:
                raise errors.InterfaceError(
                    _("hyphen (-) at end must be followed by number"), location)

            # Decide upon the result.
            if lower is None:
                if upper is None:
                    if ellipsis_found:
                        # Handle "...".
                        # TODO: Handle "..." same as ""?
                        raise errors.InterfaceError(
                            _("ellipsis (...) must be preceded and/or succeeded by number"),
                            location)
                    else:
                        # Handle "": without this branch ``range_item``
                        # would be unbound, or still hold the previous
                        # item and be reported as overlapping itself.
                        range_item = None
                else:
                    assert ellipsis_found
                    # Handle "...y".
                    range_item = (None, upper)
            elif ellipsis_found:
                # Handle "x..." and "x...y".
                if (upper is not None) and (lower > upper):
                    raise errors.InterfaceError(
                        _("lower limit %s must be less or equal than upper limit %s")
                        % (_decimal_as_text(lower, self.precision), _decimal_as_text(upper, self.precision)),
                        location)
                range_item = (lower, upper)
            else:
                # Handle "x".
                range_item = (lower, lower)

            if range_item is not None:
                # Precision is the number of digits after the dot, scale
                # the total number of digits.
                self._precision = max_digits_after_dot
                self._scale = max_digits_before_dot + max_digits_after_dot
                for item in self._items:
                    if self._items_overlap(item, range_item):
                        item_text = _compat.text_repr(self._repr_item(item))
                        result_text = _compat.text_repr(self._repr_item(range_item))
                        raise errors.InterfaceError(
                            _("overlapping parts in decimal range must be cleaned up: %s and %s")
                            % (item_text, result_text),
                            location)
                self._items.append(range_item)
            if _tools.is_eof_token(next_token):
                end_reached = True

        assert self.precision >= 0
        assert self.scale >= self.precision

        # Derive the overall limits; ``None`` means unbounded on that side
        # and, once reached, is never replaced by a later item.
        self._lower_limit = None
        self._upper_limit = None
        is_first_item = True
        for lower_item, upper_item in self._items:
            if is_first_item:
                self._lower_limit = lower_item
                self._upper_limit = upper_item
                is_first_item = False

            if lower_item is None:
                self._lower_limit = None
            elif (self._lower_limit is not None) and (lower_item < self._lower_limit):
                self._lower_limit = lower_item

            if upper_item is None:
                self._upper_limit = None
            elif (self._upper_limit is not None) and (upper_item > self._upper_limit):
                self._upper_limit = upper_item
def __init__(self, description, default=None):
    """
    Setup a range as specified by ``description``.

    :param str description: a range description of the form \
        ``lower...upper`` or ``limit``, possibly consisting of multiple \
        items separated by comma. Limits can be integer numbers, \
        symbolic names or Python strings. In case it is empty (``''``), any \
        value will be accepted by \
        :py:meth:`~cutplace.ranges.Range.validate()`. For example, \
        ``1...40`` accepts values between 1 and 40.
    :param str default: an alternative to use in case ``description`` is \
        ``None`` or empty.
    :raises cutplace.errors.InterfaceError: if ``description`` cannot be \
        parsed or contains overlapping items
    """
    assert default is None or (default.strip() != ''), "default=%r" % default

    # Find out if a `description` has been specified and if not, use optional `default` instead.
    has_description = (description is not None) and (description.strip() != '')
    if not has_description and default is not None:
        description = default
        has_description = True

    if not has_description:
        # Use empty ranges.
        self._description = None
        self._items = None
        self._lower_limit = None
        self._upper_limit = None
    else:
        self._description = description.replace('...', ELLIPSIS)
        self._items = []

        name_for_code = 'range'
        location = None  # TODO: Add location where range is declared.

        tokens = _tools.tokenize_without_space(self._description)
        end_reached = False
        # Each pass of the outer loop parses one comma separated item.
        while not end_reached:
            lower = None
            upper = None
            ellipsis_found = False
            after_hyphen = False
            next_token = next(tokens)
            while not _tools.is_eof_token(next_token) and not _tools.is_comma_token(next_token):
                next_type = next_token[0]
                next_value = next_token[1]
                if next_type in (token.NAME, token.NUMBER, token.STRING):
                    if after_hyphen and (next_type != token.NUMBER):
                        # Only numbers can be negated; reject a hyphen
                        # before a name or string right here instead of
                        # failing later with a misleading message.
                        raise errors.InterfaceError(
                            _("hyphen (-) must be followed by number but found: %s")
                            % _compat.text_repr(next_value),
                            location)
                    if next_type == token.NAME:
                        # Symbolic names, e.g. ``tab``.
                        value_as_int = code_for_symbolic_token(name_for_code, next_value, location)
                    elif next_type == token.NUMBER:
                        # Numbers, e.g. ``123``.
                        value_as_int = code_for_number_token(name_for_code, next_value, location)
                        if after_hyphen:
                            value_as_int *= -1
                            after_hyphen = False
                    else:
                        # Python strings, e.g. ``'abc'``.
                        value_as_int = code_for_string_token(name_for_code, next_value, location)

                    if ellipsis_found:
                        if upper is None:
                            upper = value_as_int
                        else:
                            raise errors.InterfaceError(
                                _("range must have at most lower and upper limit but found another number: %s")
                                % _compat.text_repr(next_value),
                                location)
                    elif lower is None:
                        lower = value_as_int
                    else:
                        raise errors.InterfaceError(
                            _("number must be followed by ellipsis (...) but found: %s")
                            % _compat.text_repr(next_value),
                            location)
                elif after_hyphen:
                    raise errors.InterfaceError(
                        _("hyphen (-) must be followed by number but found: %s")
                        % _compat.text_repr(next_value),
                        location)
                elif (next_type == token.OP) and (next_value == "-"):
                    after_hyphen = True
                elif next_value in (ELLIPSIS, ':'):
                    ellipsis_found = True
                else:
                    raise errors.InterfaceError(
                        _("range must be specified using integer numbers, text, "
                          "symbols and ellipsis (...) but found: %s [token type: %d]")
                        % (_compat.text_repr(next_value), next_type),
                        location)
                next_token = next(tokens)

            if after_hyphen:
                raise errors.InterfaceError(
                    _("hyphen (-) at end must be followed by number"), location)

            # Decide upon the result.
            if lower is None:
                if upper is None:
                    if ellipsis_found:
                        # Handle "...".
                        raise errors.InterfaceError(
                            _('ellipsis (...) must be preceded and/or succeeded by number'),
                            location)
                    else:
                        # Handle "".
                        result = None
                else:
                    assert ellipsis_found
                    # Handle "...y".
                    result = (None, upper)
            elif ellipsis_found:
                # Handle "x..." and "x...y".
                if (upper is not None) and (lower > upper):
                    raise errors.InterfaceError(
                        _("lower range %d must be less or equal than upper range %d")
                        % (lower, upper),
                        location)
                result = (lower, upper)
            else:
                # Handle "x".
                result = (lower, lower)

            if result is not None:
                for item in self._items:
                    if self._items_overlap(item, result):
                        item_text = _compat.text_repr(self._repr_item(item))
                        result_text = _compat.text_repr(self._repr_item(result))
                        raise errors.InterfaceError(
                            _("overlapping parts in range must be cleaned up: %s and %s")
                            % (item_text, result_text),
                            location)
                self._items.append(result)
            if _tools.is_eof_token(next_token):
                end_reached = True

        # Derive the overall limits; ``None`` means unbounded on that side
        # and, once reached, is never replaced by a later item.
        self._lower_limit = None
        self._upper_limit = None
        is_first_item = True
        for lower_item, upper_item in self._items:
            if is_first_item:
                self._lower_limit = lower_item
                self._upper_limit = upper_item
                is_first_item = False

            if lower_item is None:
                self._lower_limit = None
            elif (self._lower_limit is not None) and (lower_item < self._lower_limit):
                self._lower_limit = lower_item

            if upper_item is None:
                self._upper_limit = None
            elif (self._upper_limit is not None) and (upper_item > self._upper_limit):
                self._upper_limit = upper_item