Exemple #1
0
    def __init__(
            self,
            ngram,  # type: int
            encoding=_DEFAULT_ENCODING,  # type: str
            weight=_DEFAULT_WEIGHT,  # type: Union[int, float]
            positional=_DEFAULT_POSITIONAL,  # type: bool
            missing_value=None  # type: Optional[MissingValueSpec]
    ):
        # type: (...) -> None
        """ Build a :class:`FieldHashingProperties` object, storing the
            given arguments as attributes after checking them.

            :param ngram: Must be 0, 1, or 2.
            :param encoding: Must name a codec that Python can look up.
            :param weight: Must not be negative.
            :param positional: Stored as given.
            :param missing_value: Stored as given.
            :raises ValueError: When `ngram` is not 0, 1, or 2; when
                `encoding` is not a known codec; or when `weight` is
                negative.
        """
        if ngram not in range(3):
            raise ValueError(
                'ngram is {} but is expected to be 0, 1, or 2.'.format(ngram))

        # Encoding an empty string is a cheap way to find out whether
        # the codec name can be resolved.
        try:
            ''.encode(encoding)
        except LookupError as lookup_error:
            bad_encoding = ValueError(
                '{} is not a valid Python encoding.'.format(encoding))
            raise_from(bad_encoding, lookup_error)

        if weight < 0:
            raise ValueError(
                'weight should be non-negative but is {}.'.format(weight))

        self.ngram, self.encoding = ngram, encoding
        self.positional, self.weight = positional, weight
        self.missing_value = missing_value
Exemple #2
0
def validate_schema_dict(schema):
    # type: (Dict[str, Any]) -> None
    """ Check a parsed schema against the master schema of its version.

        Nothing is returned when the schema is valid.

        :param schema: The schema to validate, as parsed by `json`.
        :raises SchemaError: When `schema` is not a dict, has no
            `'version'` key, or does not validate against the master
            schema.
        :raises MasterSchemaError: When the master schema itself is
            invalid.
    """
    if not isinstance(schema, dict):
        type_name = type(schema).__name__
        raise SchemaError(
            'The top level of the schema file is a {}, whereas a dict is '
            'expected.'.format(type_name))

    if 'version' not in schema:
        raise SchemaError('A format version is expected in the schema.')

    master_schema = _get_master_schema(schema['version'])

    try:
        jsonschema.validate(schema, master_schema)
    except jsonschema.exceptions.ValidationError as invalid:
        # The user's schema is at fault; pass the validator's report on.
        details = 'The schema is not valid.\n\n' + str(invalid)
        raise_from(SchemaError(details), invalid)
    except jsonschema.exceptions.SchemaError as broken_master:
        # The master schema is at fault, which the user cannot fix.
        raise_from(
            MasterSchemaError(
                'The master schema is not valid. The schema cannot be '
                'validated. Please file a bug report.'),
            broken_master)
Exemple #3
0
def validate_entries(
        fields,  # type: Sequence[FieldSpec]
        data  # type: Sequence[Sequence[str]]
):
    # type: (...) -> None
    """ Check every entry of `data` with the validator of its column.

        The entries of a row are paired positionally with `fields`;
        pairing stops at the shorter of the two.

        :param fields: The `FieldSpec` objects forming the
            specification; each one's `validate` method is called.
        :param data: The rows to validate.
        :raises EntryError: When a validator raises
            `InvalidEntryError`. The new error carries the offending
            `field_spec` and the zero-based `row_index`.
    """
    validators = [spec.validate for spec in fields]
    template = ('Invalid entry in row {row_index}, column '
                "'{column_name}'. {original_message}")

    for row_index, row in enumerate(data):
        for entry, check in zip(row, validators):
            try:
                check(entry)
            except InvalidEntryError as cause:
                failing_spec = cast(FieldSpec, cause.field_spec)
                entry_error = EntryError(template.format(
                    row_index=row_index,
                    column_name=failing_spec.identifier,
                    original_message=cause.args[0]))
                entry_error.field_spec = cause.field_spec
                entry_error.row_index = row_index
                raise_from(entry_error, cause)
    def validate(self, str_in):
        # type: (Text) -> None
        """ Check that an entry is a date in this field's format.

            Missing values are accepted without further checks.
            Otherwise the parent's validation runs first, and the entry
            must then parse with `datetime.strptime` using
            `self.format`.

            :param str str_in: String to validate.
            :raises InvalidEntryError: When `str_in` cannot be parsed
                with `self.format`.
        """
        if not self.is_missing_value(str_in):
            # noinspection PyCompatibility
            super().validate(str_in)
            try:
                datetime.strptime(str_in, self.format)
            except ValueError as parse_error:
                invalid = InvalidEntryError(
                    "Validation error for date type: {}".format(parse_error))
                invalid.field_spec = self
                raise_from(invalid, parse_error)
Exemple #5
0
def get_master_schema(version):
    # type: (Hashable) -> bytes
    """ Fetch the raw bytes of the master schema for `version`.

        :param version: Key looked up in `MASTER_SCHEMA_FILE_NAMES` to
            find the schema's file name.
        :raises SchemaError: When that lookup raises `TypeError` or
            `KeyError`, i.e. the version is not supported.
        :raises MasterSchemaError: When reading the packaged file
            raises `IOError`, or `pkgutil.get_data` returns `None`.
        :return: Bytes of the schema.
    """
    try:
        file_name = MASTER_SCHEMA_FILE_NAMES[version]
    except (TypeError, KeyError) as lookup_error:
        unsupported = SchemaError(
            ('Schema version {} is not supported. '
             'Consider updating clkhash.').format(version))
        raise_from(unsupported, lookup_error)

    resource = 'master-schemas/{}'.format(file_name)
    try:
        schema_bytes = pkgutil.get_data('clkhash', resource)
    except IOError as io_error:
        # FileNotFoundError would be more specific, but it does not
        # exist in Python 2.
        raise_from(
            MasterSchemaError(
                'The master schema could not be found. The schema cannot be '
                'validated. Please file a bug report.'),
            io_error)

    if schema_bytes is None:
        raise MasterSchemaError(
            'The master schema could not be loaded. The schema cannot be '
            'validated. Please file a bug report.')

    return schema_bytes
Exemple #6
0
    def __init__(
        self,
        comparator,  # type: AbstractComparison
        strategy,  # type: StrategySpec
        encoding=_DEFAULT_ENCODING,  # type: str
        hash_type='blakeHash',  # type: str
        prevent_singularity=None,  # type: Optional[bool]
        missing_value=None  # type: Optional[MissingValueSpec]
    ):
        # type: (...) -> None
        """ Build a :class:`FieldHashingProperties` object, storing the
            given arguments as attributes after checking them.

            :param comparator: Must not be `None`.
            :param strategy: Must not be `None`.
            :param encoding: Must name a codec that Python can look up.
            :param hash_type: Stored as given.
            :param prevent_singularity: May only be given (non-`None`)
                when `hash_type` is `'doubleHash'`.
            :param missing_value: Stored as given.
            :raises ValueError: When any of the above conditions is
                violated.
        """
        if comparator is None:
            raise ValueError('no comparator specified')

        # Encoding an empty string is a cheap way to find out whether
        # the codec name can be resolved.
        try:
            ''.encode(encoding)
        except LookupError as lookup_error:
            raise_from(
                ValueError(
                    '{} is not a valid Python encoding.'.format(encoding)),
                lookup_error)

        singularity_given = prevent_singularity is not None
        if singularity_given and hash_type != 'doubleHash':
            raise ValueError("Prevent_singularity must only be specified"
                             " with hash_type doubleHash.")

        if strategy is None:
            raise ValueError('no strategy specified')

        self.comparator, self.encoding = comparator, encoding
        self.hash_type = hash_type
        self.prevent_singularity = prevent_singularity
        self.strategy, self.missing_value = strategy, missing_value
Exemple #7
0
def convert_schema(schema_json, output):
    """ Convert the given schema file to the latest version.

        The JSON read from `schema_json` is validated, converted with
        `convert_to_latest_version`, and written as JSON to `output`.

        :param schema_json: Readable file-like object holding the
            schema as JSON.
        :param output: Writable file-like object receiving the
            converted schema.
        :raises SchemaError: When the input is not valid JSON.
    """
    try:
        original = json.load(schema_json)
    except ValueError as decode_error:
        # json.decoder.JSONDecodeError would be more specific, but it
        # does not exist in Python 2.
        raise_from(
            SchemaError('The provided schema is not a valid JSON file.'),
            decode_error)

    validate_schema_dict(original)
    latest = convert_to_latest_version(original, validate_result=True)
    json.dump(latest, output)
Exemple #8
0
    def _format_regular_value(self, str_in):
        # type: (Text) -> Text
        """ Re-render a date string in `DateSpec.OUTPUT_FORMAT`.

            The default behaviour is overridden because the intent is
            to hash the numbers only, without fillers like '-' or '/'.

            :param str str_in: Date string in the format `self.format`.
            :raises InvalidEntryError: When parsing or re-formatting
                the date raises `ValueError`.
            :return: The date as a string in `DateSpec.OUTPUT_FORMAT`.
        """
        try:
            parsed = datetime.strptime(str_in, self.format)
            formatted = strftime(parsed, DateSpec.OUTPUT_FORMAT)
        except ValueError as cause:
            invalid = InvalidEntryError(
                "Unable to format date value '{}'. Reason: {}".format(
                    str_in, cause))
            invalid.field_spec = self
            raise_from(invalid, cause)
        else:
            return formatted
Exemple #9
0
def plot(clk_json):
    """ Plot a histogram of the popcounts of the CLKs in a JSON file.

        :param clk_json: Readable file-like object holding JSON of the
            form ``{'clks': [...]}``.
        :raises DescribeError: When the input is not valid JSON, when
            it has no `'clks'` entry, or when that entry is empty.
    """
    try:
        # data was written with: json.dump({'clks': clk_data}, output)
        document = json.load(clk_json)
    except ValueError as e:  # In Python 3 we can be more specific
        # with json.decoder.JSONDecodeError,
        # but that doesn't exist in Python 2.
        msg = 'The input is not a valid JSON file.'
        raise_from(DescribeError(msg), e)

    try:
        clks = document['clks']
    except (KeyError, TypeError) as e:
        # KeyError: a JSON object without the key. TypeError: the top
        # level is not an object (e.g. a list, number or null). Report
        # both as DescribeError, like every other bad-input case here,
        # instead of leaking a raw lookup error.
        msg = "The input JSON does not contain a 'clks' entry."
        raise_from(DescribeError(msg), e)

    if len(clks) == 0:
        msg = 'No clks found'
        raise DescribeError(msg)

    popcounts = [deserialize_bitarray(clk).count() for clk in clks]
    plot_hist(popcounts, bincount=60, title='popcounts', xlab=True,
              showSummary=True)
Exemple #10
0
    def _format_regular_value(self, str_in):
        # type: (Text) -> Text
        """ Normalise an integer string to one canonical spelling.

            Different strings can denote the same integer: '+13',
            ' 13' and '13' all parse to 13. The string is therefore
            parsed as a base-10 integer and converted back to text,
            which yields no whitespace, a leading '-' for negative
            numbers and no leading '+'.

            :param str_in: Integer string.
            :raises InvalidEntryError: When `str_in` is not a base-10
                integer.
            :return: The canonical integer string.
        """
        try:
            canonical = str(int(str_in, base=10))
        except ValueError as cause:
            invalid = InvalidEntryError(
                "Invalid integer. Read '{}'.".format(str_in))
            invalid.field_spec = self
            raise_from(invalid, cause)
        else:
            return canonical
Exemple #11
0
def from_json_file(schema_file, validate=True):
    # type: (TextIO, bool) -> Schema
    """ Read a JSON file and build a Schema object from it.

        :param schema_file: A JSON file containing the schema.
        :param validate: (default True) Passed on to `from_json_dict`;
            raise an exception if the schema does not conform to the
            master schema.
        :raises SchemaError: When the file is not valid JSON, or the
            schema is invalid.
        :return: the Schema
    """
    try:
        parsed = json.load(schema_file)
    except ValueError as decode_error:
        # json.decoder.JSONDecodeError would be more specific, but it
        # does not exist in Python 2.
        raise_from(
            SchemaError('The schema is not a valid JSON file.'),
            decode_error)

    return from_json_dict(parsed, validate=validate)
    def from_json_dict(
            cls,
            json_dict  # type: Dict[str, Any]
    ):
        # type: (...) -> StringSpec
        """ Build a StringSpec from a dictionary of its properties.

            The parent's `from_json_dict` creates the object; this
            method then applies the settings under
            `json_dict['format']`.

            :param dict json_dict: Must contain a `'format'` key. Its
                value may hold an `'encoding'`, which overrides the
                encoding of the hashing properties when those exist.
                It may also hold a `'pattern'`; only when no pattern
                is given are `'case'`, `'minLength'` and `'maxLength'`
                read.
            :raises InvalidSchemaError: When a regular expression is
                provided but is not a valid pattern.
            :return: The configured StringSpec.
        """
        # noinspection PyCompatibility
        spec = cast(
            StringSpec,  # Go away, Mypy.
            super().from_json_dict(json_dict))

        settings = json_dict['format']
        if 'encoding' in settings and spec.hashing_properties:
            spec.hashing_properties.encoding = settings['encoding']

        if 'pattern' not in settings:
            # Length- and case-based specification.
            spec.case = settings.get('case', StringSpec._DEFAULT_CASE)
            spec.min_length = settings.get('minLength')
            spec.max_length = settings.get('maxLength')
            spec.regex_based = False
            return spec

        # Regex-based specification.
        pattern = settings['pattern']
        try:
            spec.regex = re_compile_full(pattern)
        except (SyntaxError, re.error) as cause:
            schema_error = InvalidSchemaError(
                "Invalid regular expression '{}.'".format(pattern))
            schema_error.json_field_spec = json_dict
            raise_from(schema_error, cause)
        spec.regex_based = True
        return spec
    def validate(self, str_in):
        # type: (Text) -> None
        """ Check that an entry is an integer within this field's
            bounds.

            Missing values are accepted without further checks.
            Otherwise the parent's validation runs first. The entry is
            then invalid if (1) it does not parse as a base-10 integer;
            (2) `self.minimum` is set and the integer is below it; or
            (3) `self.maximum` is set and the integer is above it.

            :param str str_in: String to validate.
            :raises InvalidEntryError: When entry is invalid.
        """
        if self.is_missing_value(str_in):
            return
        # noinspection PyCompatibility
        super().validate(str_in)

        try:
            parsed = int(str_in, base=10)
        except ValueError as cause:
            not_an_int = InvalidEntryError(
                "Invalid integer. Read '{}'.".format(str_in))
            not_an_int.field_spec = self
            raise_from(not_an_int, cause)
            return  # keeps static analysers from flagging `parsed` below

        lower = self.minimum
        if lower is not None and parsed < lower:
            too_small = InvalidEntryError(
                "Expected integer value of at least {}. Read '{}'.".format(
                    lower, parsed))
            too_small.field_spec = self
            raise too_small

        upper = self.maximum
        if upper is not None and parsed > upper:
            too_large = InvalidEntryError(
                "Expected integer value of at most {}. Read '{}'.".format(
                    upper, parsed))
            too_large.field_spec = self
            raise too_large
    def __init__(
        self,
        ngram,  # type: int
        encoding=_DEFAULT_ENCODING,  # type: str
        positional=_DEFAULT_POSITIONAL,  # type: bool
        hash_type='blakeHash',  # type: str
        prevent_singularity=None,  # type: Optional[bool]
        num_bits=None,  # type: Optional[int]
        k=None,  # type: Optional[int]
        missing_value=None  # type: Optional[MissingValueSpec]
    ):
        # type: (...) -> None
        """ Build a :class:`FieldHashingProperties` object, storing the
            given arguments as attributes after checking them.

            :param ngram: Must be 0, 1, or 2.
            :param encoding: Must name a codec that Python can look up.
            :param positional: Stored as given.
            :param hash_type: Stored as given.
            :param prevent_singularity: May only be given (non-`None`)
                when `hash_type` is `'doubleHash'`.
            :param num_bits: At least one of `num_bits` and `k` must be
                truthy.
            :param k: See `num_bits`.
            :param missing_value: Stored as given.
            :raises ValueError: When any of the above conditions is
                violated.
        """
        if ngram not in range(3):
            raise ValueError(
                'ngram is {} but is expected to be 0, 1, or 2.'.format(ngram))

        # Encoding an empty string is a cheap way to find out whether
        # the codec name can be resolved.
        try:
            ''.encode(encoding)
        except LookupError as lookup_error:
            raise_from(
                ValueError(
                    '{} is not a valid Python encoding.'.format(encoding)),
                lookup_error)

        singularity_given = prevent_singularity is not None
        if singularity_given and hash_type != 'doubleHash':
            raise ValueError("Prevent_singularity must only be specified"
                             " with hash_type doubleHash.")

        if not (num_bits or k):
            raise ValueError('One of num_bits or k must be specified.')

        self.ngram, self.encoding = ngram, encoding
        self.positional, self.hash_type = positional, hash_type
        self.prevent_singularity = prevent_singularity
        self.num_bits, self.k = num_bits, k
        self.missing_value = missing_value
    def validate(self, str_in):
        # type: (Text) -> None
        """ Check that an entry can be encoded with the field's
            encoding.

            Nothing is checked when the field has no hashing
            properties.

            Subclasses must override this method with their own
            validation. They should call the parent's `validate` method
            via `super`.

            :param str str_in: String to validate.
            :raises InvalidEntryError: When `str_in` cannot be encoded
                in `self.hashing_properties.encoding`.
        """
        if not self.hashing_properties:
            # Per the original note, such a field is an Ignore field.
            return

        try:
            str_in.encode(encoding=self.hashing_properties.encoding)
        except UnicodeEncodeError as encode_error:
            invalid = InvalidEntryError(
                "Expected entry that can be encoded in {}. Read '{}'.".format(
                    self.hashing_properties.encoding, str_in))
            invalid.field_spec = self
            raise_from(invalid, encode_error)
Exemple #16
0
def validate_schema_dict(schema):
    # type: (Dict[str, Any]) -> None
    """ Check a parsed schema against the master schema of its version.

        The master schema is loaded as bytes, decoded as UTF-8 and
        parsed as JSON before the check. Nothing is returned when the
        schema is valid.

        :param schema: The schema to validate, as parsed by `json`.
        :raises SchemaError: When `schema` is not a dict, has no
            `'version'` key, or does not validate against the master
            schema.
        :raises MasterSchemaError: When the master schema cannot be
            parsed or is itself invalid.
    """
    if not isinstance(schema, dict):
        type_name = type(schema).__name__
        raise SchemaError(
            'The top level of the schema file is a {}, whereas a dict is '
            'expected.'.format(type_name))

    if 'version' not in schema:
        raise SchemaError('A format version is expected in the schema.')

    raw_master = get_master_schema(schema['version'])
    try:
        master_schema = json.loads(raw_master.decode('utf-8'))
    except ValueError as decode_error:
        # json.decoder.JSONDecodeError would be more specific, but it
        # does not exist in Python 2. UnicodeDecodeError is a ValueError
        # too, so a badly encoded master schema also ends up here.
        raise_from(
            MasterSchemaError(
                'The master schema is not a valid JSON file. The schema cannot '
                'be validated. Please file a bug report.'),
            decode_error)

    try:
        jsonschema.validate(schema, master_schema)
    except jsonschema.exceptions.ValidationError as invalid:
        # The user's schema is at fault.
        raise_from(SchemaError('The schema is not valid.'), invalid)
    except jsonschema.exceptions.SchemaError as broken_master:
        # The master schema is at fault, which the user cannot fix.
        raise_from(
            MasterSchemaError(
                'The master schema is not valid. The schema cannot be '
                'validated. Please file a bug report.'),
            broken_master)
        # type checker thinks max_length is of type None
        # noinspection PyTypeChecker
        if max_length is not None and max_length <= 0:
            msg = 'max_length must be positive, but is {}'
            raise ValueError(msg.format(max_length))

        if regex_based:
            regex_str = cast(str, regex)
            try:
                compiled_regex = re_compile_full(regex_str)
                self.regex = compiled_regex
            except (SyntaxError, re.error) as e:
                msg = "invalid regular expression '{}.'".format(regex_str)
                e_new = InvalidEntryError(msg)
                e_new.field_spec = self
                raise_from(e_new, e)
        else:
            self.case = case
            self.min_length = min_length
            self.max_length = max_length

        self.regex_based = regex_based

    @classmethod
    def from_json_dict(
            cls,
            json_dict  # type: Dict[str, Any]
    ):
        # type: (...) -> StringSpec
        """ Make a StringSpec object from a dictionary containing its
            properties.