def __init__(
    self,
    ngram,  # type: int
    encoding=_DEFAULT_ENCODING,  # type: str
    weight=_DEFAULT_WEIGHT,  # type: Union[int, float]
    positional=_DEFAULT_POSITIONAL,  # type: bool
    missing_value=None  # type: Optional[MissingValueSpec]
):
    # type: (...) -> None
    """ Build a :class:`FieldHashingProperties` object from keyword
        arguments, checking them before they are stored.

        :param ngram: Must be 0, 1, or 2.
        :param encoding: Name of a codec accepted by `str.encode`.
        :param weight: Must not be negative.
        :param positional: Stored unchanged.
        :param missing_value: Stored unchanged.
        :raises ValueError: When `ngram` is not 0, 1, or 2, when
            `encoding` is rejected with a `LookupError`, or when
            `weight` is negative.
    """
    if ngram not in range(3):
        raise ValueError(
            'ngram is {} but is expected to be 0, 1, or 2.'.format(ngram))

    # Encoding the empty string is a cheap probe for whether the codec
    # name is accepted.
    try:
        ''.encode(encoding)
    except LookupError as lookup_error:
        raise_from(
            ValueError(
                '{} is not a valid Python encoding.'.format(encoding)),
            lookup_error)

    if weight < 0:
        raise ValueError(
            'weight should be non-negative but is {}.'.format(weight))

    self.ngram = ngram
    self.encoding = encoding
    self.positional = positional
    self.weight = weight
    self.missing_value = missing_value
def validate_schema_dict(schema):
    # type: (Dict[str, Any]) -> None
    """ Check a parsed schema against the master schema of its version.

        Returns nothing on success; every failure is reported by
        raising.

        :param schema: The schema to validate, as parsed by `json`.
        :raises SchemaError: When the schema is not a dict, has no
            `'version'` key, or fails validation.
        :raises MasterSchemaError: When the master schema itself is
            invalid.
    """
    if not isinstance(schema, dict):
        raise SchemaError(
            'The top level of the schema file is a {}, whereas a dict is '
            'expected.'.format(type(schema).__name__))

    if 'version' not in schema:
        raise SchemaError('A format version is expected in the schema.')

    reference_schema = _get_master_schema(schema['version'])

    try:
        jsonschema.validate(schema, reference_schema)
    except jsonschema.exceptions.ValidationError as invalid:
        # The validator's explanation is appended to the message.
        raise_from(
            SchemaError('The schema is not valid.\n\n' + str(invalid)),
            invalid)
    except jsonschema.exceptions.SchemaError as broken_master:
        raise_from(
            MasterSchemaError(
                'The master schema is not valid. The schema cannot be '
                'validated. Please file a bug report.'),
            broken_master)
def validate_entries(
    fields,  # type: Sequence[FieldSpec]
    data  # type: Sequence[Sequence[str]]
):
    # type: (...) -> None
    """ Run every entry of `data` through the validator of its column.

        Entries are paired with `fields` positionally using `zip`, so a
        row is only checked up to the shorter of the row and `fields`.

        :param fields: The `FieldSpec` objects forming the
            specification.
        :param data: The rows to validate.
        :raises EntryError: When a validator raises
            `InvalidEntryError`. The new error carries the offending
            `field_spec` and the `row_index`, and is chained to the
            original error.
    """
    checks = [spec.validate for spec in fields]

    for row_number, record in enumerate(data):
        for value, check in zip(record, checks):
            try:
                check(value)
            except InvalidEntryError as invalid:
                offending_spec = invalid.field_spec
                template = ('Invalid entry in row {row_index}, column '
                            "'{column_name}'. {original_message}")
                wrapped = EntryError(template.format(
                    row_index=row_number,
                    column_name=cast(FieldSpec, offending_spec).identifier,
                    original_message=invalid.args[0]))
                wrapped.field_spec = offending_spec
                wrapped.row_index = row_number
                raise_from(wrapped, invalid)
def validate(self, str_in):
    # type: (Text) -> None
    """ Check that an entry is a date written in `self.format`.

        Missing values are accepted without further checks. Otherwise
        the parent class's validation runs first, and then the string
        is parsed with `datetime.strptime`. Any `ValueError` raised by
        the parser is reported as an `InvalidEntryError`.

        :param str str_in: String to validate.
        :raises InvalidEntryError: When `str_in` cannot be parsed with
            `self.format`.
    """
    if self.is_missing_value(str_in):
        return

    # noinspection PyCompatibility
    super().validate(str_in)

    try:
        datetime.strptime(str_in, self.format)
    except ValueError as parse_error:
        failure = InvalidEntryError(
            "Validation error for date type: {}".format(parse_error))
        failure.field_spec = self
        raise_from(failure, parse_error)
def get_master_schema(version):
    # type: (Hashable) -> bytes
    """ Load the master schema for `version` as raw bytes.

        The file name is looked up in `MASTER_SCHEMA_FILE_NAMES` and
        the file is read from the `master-schemas` directory of the
        `clkhash` package.

        :param version: The version of the master schema to load.
        :raises SchemaError: When `version` has no entry in
            `MASTER_SCHEMA_FILE_NAMES` (or cannot be used as a key).
        :raises MasterSchemaError: When the file cannot be read or no
            data is returned for it.
        :return: Bytes of the schema.
    """
    try:
        file_name = MASTER_SCHEMA_FILE_NAMES[version]
    except (TypeError, KeyError) as lookup_error:
        raise_from(
            SchemaError(
                ('Schema version {} is not supported. '
                 'Consider updating clkhash.').format(version)),
            lookup_error)

    resource = 'master-schemas/{}'.format(file_name)
    try:
        schema_bytes = pkgutil.get_data('clkhash', resource)
    except IOError as io_error:
        # FileNotFoundError would be more specific in Python 3, but it
        # doesn't exist in Python 2.
        raise_from(
            MasterSchemaError(
                'The master schema could not be found. The schema cannot be '
                'validated. Please file a bug report.'),
            io_error)

    if schema_bytes is None:
        raise MasterSchemaError(
            'The master schema could not be loaded. The schema cannot be '
            'validated. Please file a bug report.')

    return schema_bytes
def __init__(
    self,
    comparator,  # type: AbstractComparison
    strategy,  # type: StrategySpec
    encoding=_DEFAULT_ENCODING,  # type: str
    hash_type='blakeHash',  # type: str
    prevent_singularity=None,  # type: Optional[bool]
    missing_value=None  # type: Optional[MissingValueSpec]
):
    # type: (...) -> None
    """ Build a :class:`FieldHashingProperties` object from keyword
        arguments, checking them before they are stored.

        :param comparator: Must not be None.
        :param strategy: Must not be None.
        :param encoding: Name of a codec accepted by `str.encode`.
        :param hash_type: Stored unchanged; `prevent_singularity` may
            only be given when this is `'doubleHash'`.
        :param prevent_singularity: Leave as None unless `hash_type`
            is `'doubleHash'`.
        :param missing_value: Stored unchanged.
        :raises ValueError: When `comparator` or `strategy` is None,
            when `encoding` is rejected with a `LookupError`, or when
            `prevent_singularity` is given with another hash type.
    """
    if comparator is None:
        raise ValueError('no comparator specified')

    # Encoding the empty string is a cheap probe for whether the codec
    # name is accepted.
    try:
        ''.encode(encoding)
    except LookupError as lookup_error:
        raise_from(
            ValueError(
                '{} is not a valid Python encoding.'.format(encoding)),
            lookup_error)

    singularity_given = prevent_singularity is not None
    if singularity_given and hash_type != 'doubleHash':
        raise ValueError("Prevent_singularity must only be specified"
                         " with hash_type doubleHash.")

    if strategy is None:
        raise ValueError('no strategy specified')

    self.comparator = comparator
    self.encoding = encoding
    self.hash_type = hash_type
    self.prevent_singularity = prevent_singularity
    self.strategy = strategy
    self.missing_value = missing_value
def convert_schema(schema_json, output):
    """ Convert the given schema file to the latest version.

        The schema is parsed from `schema_json`, validated, converted
        with `convert_to_latest_version` (with validation of the
        result requested), and written to `output` as JSON.

        :param schema_json: File-like object to read the schema from.
        :param output: File-like object the converted schema is
            written to.
        :raises SchemaError: When `schema_json` does not contain valid
            JSON.
    """
    try:
        original = json.load(schema_json)
    except ValueError as decode_error:
        # json.decoder.JSONDecodeError would be more specific in
        # Python 3, but it doesn't exist in Python 2.
        raise_from(
            SchemaError('The provided schema is not a valid JSON file.'),
            decode_error)

    validate_schema_dict(original)
    upgraded = convert_to_latest_version(original, validate_result=True)
    json.dump(upgraded, output)
def _format_regular_value(self, str_in):
    # type: (Text) -> Text
    """ Re-render a date string in `DateSpec.OUTPUT_FORMAT`.

        The default behaviour is overridden because only the numbers
        should be hashed, not fillers like '-' or '/'.

        :param str str_in: Date string written in `self.format`.
        :raises InvalidEntryError: When parsing or re-formatting raises
            a `ValueError`.
        :return: The date as a string in `DateSpec.OUTPUT_FORMAT`.
    """
    try:
        # Both steps stay inside the `try` so a ValueError from either
        # one is reported the same way.
        return strftime(datetime.strptime(str_in, self.format),
                        DateSpec.OUTPUT_FORMAT)
    except ValueError as conversion_error:
        failure = InvalidEntryError(
            "Unable to format date value '{}'. Reason: {}".format(
                str_in, conversion_error))
        failure.field_spec = self
        raise_from(failure, conversion_error)
def plot(clk_json):
    """ Read CLKs from a JSON file and pass their popcounts to
        `plot_hist`.

        :param clk_json: File-like object holding a JSON object with a
            `'clks'` entry.
        :raises DescribeError: When the input is not valid JSON or the
            `'clks'` entry is empty.
    """
    try:
        # The data was written with
        # json.dump({'clks': clk_data}, output), hence the 'clks' key.
        clks = json.load(clk_json)['clks']
    except ValueError as decode_error:
        # json.decoder.JSONDecodeError would be more specific in
        # Python 3, but it doesn't exist in Python 2.
        raise_from(
            DescribeError('The input is not a valid JSON file.'),
            decode_error)

    if len(clks) == 0:
        raise DescribeError('No clks found')

    popcounts = [deserialize_bitarray(encoded).count() for encoded in clks]
    plot_hist(popcounts,
              bincount=60,
              title='popcounts',
              xlab=True,
              showSummary=True)
def _format_regular_value(self, str_in): # type: (Text) -> Text """ we need to reformat integer strings, as there can be different strings for the same integer. The strategy of unification here is to first parse the integer string to an Integer type. Thus all of '+13', ' 13', '13' will be parsed to 13. We then convert the integer value to an unambiguous string (no whitespaces, leading '-' for negative numbers, no leading '+'). :param str_in: integer string :return: integer string without whitespaces, leading '-' for negative numbers, no leading '+' """ try: value = int(str_in, base=10) return str(value) except ValueError as e: msg = "Invalid integer. Read '{}'.".format(str_in) e_new = InvalidEntryError(msg) e_new.field_spec = self raise_from(e_new, e)
def from_json_file(schema_file, validate=True):
    # type: (TextIO, bool) -> Schema
    """ Load a Schema object from a JSON file.

        The file is parsed as JSON and the result is handed to
        `from_json_dict` together with the `validate` flag.

        :param schema_file: A JSON file containing the schema.
        :param validate: (default True) Passed on to `from_json_dict`.
        :raises SchemaError: When the file does not contain valid JSON.
        :return: The value returned by `from_json_dict`.
    """
    try:
        parsed = json.load(schema_file)
    except ValueError as decode_error:
        # json.decoder.JSONDecodeError would be more specific in
        # Python 3, but it doesn't exist in Python 2.
        raise_from(
            SchemaError('The schema is not a valid JSON file.'),
            decode_error)

    return from_json_dict(parsed, validate=validate)
def from_json_dict(
    cls,
    json_dict  # type: Dict[str, Any]
):
    # type: (...) -> StringSpec
    """ Make a StringSpec object from a dictionary containing its
        properties.

        The parent class's `from_json_dict` builds the object; this
        method then fills in the string-specific attributes from
        `json_dict['format']`.

        :param dict json_dict: Must contain a `'format'` key holding a
            dictionary. Within it, `'encoding'` is optional and, when
            present, replaces the encoding of the hashing properties
            (if the spec has any). If `'pattern'` is present the spec
            is regex based; otherwise `'case'`, `'minLength'` and
            `'maxLength'` are read, each being optional.
        :raises InvalidSchemaError: When a regular expression is
            provided but is not a valid pattern. The offending
            `json_dict` is attached as `json_field_spec`.
        :return: The populated spec.
    """
    # noinspection PyCompatibility
    result = cast(
        StringSpec,  # Go away, Mypy.
        super().from_json_dict(json_dict))

    format_ = json_dict['format']
    if 'encoding' in format_ and result.hashing_properties:
        result.hashing_properties.encoding = format_['encoding']

    if 'pattern' in format_:
        pattern = format_['pattern']
        try:
            result.regex = re_compile_full(pattern)
        except (SyntaxError, re.error) as e:
            # The full stop goes after the closing quote. Inside the
            # quotes it would read as part of the pattern, and '.' is
            # a regex metacharacter.
            msg = "Invalid regular expression '{}'.".format(pattern)
            e_new = InvalidSchemaError(msg)
            e_new.json_field_spec = json_dict
            raise_from(e_new, e)
        result.regex_based = True

    else:
        result.case = format_.get('case', StringSpec._DEFAULT_CASE)
        result.min_length = format_.get('minLength')
        result.max_length = format_.get('maxLength')
        result.regex_based = False

    return result
def validate(self, str_in):
    # type: (Text) -> None
    """ Check that an entry is a base-10 integer within the allowed
        range.

        Missing values are accepted without further checks. Otherwise
        the parent class's validation runs first. The entry is then
        invalid if (1) it does not parse as a base-10 integer; (2) it
        is smaller than `self.minimum`, when that is set; or (3) it is
        larger than `self.maximum`, when that is set.

        :param str str_in: String to validate.
        :raises InvalidEntryError: When entry is invalid.
    """
    if self.is_missing_value(str_in):
        return

    # noinspection PyCompatibility
    super().validate(str_in)

    def rejection(message):
        # Every failure is reported the same way: an InvalidEntryError
        # tagged with this spec.
        failure = InvalidEntryError(message)
        failure.field_spec = self
        return failure

    try:
        number = int(str_in, base=10)
    except ValueError as parse_error:
        raise_from(
            rejection("Invalid integer. Read '{}'.".format(str_in)),
            parse_error)
    else:
        # Range checks only run once the entry has parsed.
        if self.minimum is not None and number < self.minimum:
            raise rejection(
                "Expected integer value of at least {}. Read '{}'.".format(
                    self.minimum, number))

        if self.maximum is not None and number > self.maximum:
            raise rejection(
                "Expected integer value of at most {}. Read '{}'.".format(
                    self.maximum, number))
def __init__(
    self,
    ngram,  # type: int
    encoding=_DEFAULT_ENCODING,  # type: str
    positional=_DEFAULT_POSITIONAL,  # type: bool
    hash_type='blakeHash',  # type: str
    prevent_singularity=None,  # type: Optional[bool]
    num_bits=None,  # type: Optional[int]
    k=None,  # type: Optional[int]
    missing_value=None  # type: Optional[MissingValueSpec]
):
    # type: (...) -> None
    """ Build a :class:`FieldHashingProperties` object from keyword
        arguments, checking them before they are stored.

        :param ngram: Must be 0, 1, or 2.
        :param encoding: Name of a codec accepted by `str.encode`.
        :param positional: Stored unchanged.
        :param hash_type: Stored unchanged; `prevent_singularity` may
            only be given when this is `'doubleHash'`.
        :param prevent_singularity: Leave as None unless `hash_type`
            is `'doubleHash'`.
        :param num_bits: At least one of `num_bits` and `k` must be
            truthy.
        :param k: See `num_bits`.
        :param missing_value: Stored unchanged.
        :raises ValueError: When any of the checks above fails.
    """
    if ngram not in range(3):
        raise ValueError(
            'ngram is {} but is expected to be 0, 1, or 2.'.format(ngram))

    # Encoding the empty string is a cheap probe for whether the codec
    # name is accepted.
    try:
        ''.encode(encoding)
    except LookupError as lookup_error:
        raise_from(
            ValueError(
                '{} is not a valid Python encoding.'.format(encoding)),
            lookup_error)

    singularity_given = prevent_singularity is not None
    if singularity_given and hash_type != 'doubleHash':
        raise ValueError("Prevent_singularity must only be specified"
                         " with hash_type doubleHash.")

    # The check is on truthiness, so 0 counts as "not specified".
    if not (num_bits or k):
        raise ValueError('One of num_bits or k must be specified.')

    self.ngram = ngram
    self.encoding = encoding
    self.positional = positional
    self.hash_type = hash_type
    self.prevent_singularity = prevent_singularity
    self.num_bits = num_bits
    self.k = k
    self.missing_value = missing_value
def validate(self, str_in):
    # type: (Text) -> None
    """ Check that an entry can be encoded with the field's encoding.

        Subclasses must override this method with their own
        validation and should call this one via `super`.

        :param str str_in: String to validate.
        :raises InvalidEntryError: When `str_in` cannot be encoded
            with the encoding of the hashing properties.
    """
    properties = self.hashing_properties
    if not properties:
        # Without hashing properties there is nothing to check (the
        # original comment describes this case as an Ignore field).
        return

    codec = properties.encoding
    try:
        str_in.encode(encoding=codec)
    except UnicodeEncodeError as encode_error:
        failure = InvalidEntryError(
            "Expected entry that can be encoded in {}. Read '{}'.".format(
                codec, str_in))
        failure.field_spec = self
        raise_from(failure, encode_error)
def validate_schema_dict(schema):
    # type: (Dict[str, Any]) -> None
    """ Validate the schema.

        This raises iff either the schema or the master schema are
        invalid. If it's successful, it returns nothing.

        The master schema for the schema's `'version'` is loaded as
        bytes, decoded as UTF-8 JSON, and used to validate `schema`
        with `jsonschema`.

        :param schema: The schema to validate, as parsed by `json`.
        :raises SchemaError: When the schema is not a dict, has no
            `'version'` key, or fails validation. In the last case the
            message includes the validator's explanation.
        :raises MasterSchemaError: When the master schema is not valid
            JSON or is itself an invalid schema.
    """
    if not isinstance(schema, dict):
        msg = ('The top level of the schema file is a {}, whereas a dict is '
               'expected.'.format(type(schema).__name__))
        raise SchemaError(msg)

    if 'version' in schema:
        version = schema['version']
    else:
        raise SchemaError('A format version is expected in the schema.')

    master_schema_bytes = get_master_schema(version)
    try:
        master_schema = json.loads(master_schema_bytes.decode('utf-8'))
    except ValueError as e:
        # In Python 3 we can be more specific with
        # json.decoder.JSONDecodeError, but that doesn't exist in
        # Python 2. A failed UTF-8 decode is also a ValueError, so it
        # is reported here too.
        msg = ('The master schema is not a valid JSON file. The schema cannot '
               'be validated. Please file a bug report.')
        raise_from(MasterSchemaError(msg), e)

    try:
        jsonschema.validate(schema, master_schema)
    except jsonschema.exceptions.ValidationError as e:
        # Include the validator's explanation in the message. Callers
        # that only print the SchemaError would otherwise get no hint
        # about which part of the schema is wrong.
        raise_from(SchemaError('The schema is not valid.\n\n' + str(e)), e)
    except jsonschema.exceptions.SchemaError as e:
        msg = ('The master schema is not valid. The schema cannot be '
               'validated. Please file a bug report.')
        raise_from(MasterSchemaError(msg), e)
# type checker thinks max_length is of type None # noinspection PyTypeChecker if max_length is not None and max_length <= 0: msg = 'max_length must be positive, but is {}' raise ValueError(msg.format(max_length)) if regex_based: regex_str = cast(str, regex) try: compiled_regex = re_compile_full(regex_str) self.regex = compiled_regex except (SyntaxError, re.error) as e: msg = "invalid regular expression '{}.'".format(regex_str) e_new = InvalidEntryError(msg) e_new.field_spec = self raise_from(e_new, e) else: self.case = case self.min_length = min_length self.max_length = max_length self.regex_based = regex_based @classmethod def from_json_dict( cls, json_dict # type: Dict[str, Any] ): # type: (...) -> StringSpec """ Make a StringSpec object from a dictionary containing its properties.