Ejemplo n.º 1
0
    def from_line(cls, line, line_number=None):
        """Parses a single line of the MAF file header.

        Returns a ``(record, error)`` tuple in which exactly one element is
        ``None``: ``(record, None)`` when the line was parsed, or
        ``(None, error)`` when a formatting error was encountered.
        Formatting errors include:
        - the line does not start with the correct symbol (i.e. #)
        - the line is missing a space separator for the key and value
        - the line has an empty key
        - the line has an empty value
        - the sort order record could not be built from the value

        :param line: the header line to parse.
        :param line_number: the optional line number, attached to any error.
        :return: a tuple ``(record, error)``.
        """
        if not line.startswith(MafHeader.HeaderLineStartSymbol):
            return None, MafValidationError(
                MafValidationErrorType.HEADER_LINE_MISSING_START_SYMBOL,
                "Header line did not start with a '#'",
                line_number=line_number)

        # Drop the leading start symbol and split on the first space only, so
        # that the value itself may contain spaces.
        tokens = line[1:].split(" ", 1)
        if len(tokens) != 2:
            return None, MafValidationError(
                MafValidationErrorType.HEADER_LINE_MISSING_SEPARATOR,
                "Header line did not have a key and value separated by a "
                "space",
                line_number=line_number)

        key, value = tokens
        value = value.rstrip()
        if not key:
            return None, MafValidationError(
                MafValidationErrorType.HEADER_LINE_EMPTY_KEY,
                "Header line had an empty key",
                line_number=line_number)
        if not value:
            return None, MafValidationError(
                MafValidationErrorType.HEADER_LINE_EMPTY_VALUE,
                "Header line had an empty value",
                line_number=line_number)

        # Keys with a dedicated record type get that type; anything else
        # becomes a generic header record.
        if key == MafHeader.VersionKey:
            return MafHeaderVersionRecord(value=value), None
        if key == MafHeader.AnnotationSpecKey:
            return MafHeaderAnnotationSpecRecord(value=value), None
        if key == MafHeader.SortOrderKey:
            try:
                return MafHeaderSortOrderRecord(value=value), None
            except Exception:
                # Any failure to build the sort order record is reported as
                # an unsupported sort order.  ``Exception`` (rather than a
                # bare ``except``) lets KeyboardInterrupt/SystemExit through.
                return None, MafValidationError(
                    MafValidationErrorType.
                    HEADER_UNSUPPORTED_SORT_ORDER,
                    "Sort order '%s' was not recognized" % value,
                    line_number=line_number)
        if key == MafHeader.ContigKey:
            return MafHeaderContigRecord(value=value), None
        return MafHeaderRecord(key=key, value=value), None
Ejemplo n.º 2
0
 def test_process_validation_errors_strict(self):
     """Strict stringency must raise a MafFormatException for the errors."""
     logger = Logger.get_logger("test_process_validation_errors_strict")
     with self.assertRaises(MafFormatException) as context:
         MafValidationError.process_validation_errors(
             validation_errors=TestMafValidationError.__errors,
             validation_stringency=ValidationStringency.Strict,
             logger=logger,
         )
     self.assertIn("Error with line number", str(context.exception))
     # assertTrue(a, b) treats ``b`` as the failure message and only checks
     # that ``a`` is truthy; assertEqual actually compares the error type.
     self.assertEqual(context.exception.tpe,
                      MafValidationErrorType.HEADER_LINE_EMPTY_KEY)
Ejemplo n.º 3
0
    def validate(self, reset_errors=True, scheme=None, line_number=None):
        """
        Runs the column-specific check and then the super-class validation.

        This method should not be overridden by sub-classes.

        If the value is one of the values returned by
        ``__nullable_values__`` the column-specific check is skipped.
        Otherwise ``__validate__`` is called, and a non-``None`` return value
        is recorded as a ``RECORD_COLUMN_WRONG_FORMAT`` error.  Finally
        ``validate`` on the super-class is called without resetting errors.
        :param reset_errors: clear previously collected errors first.
        :param scheme: passed through to the super-class ``validate``.
        :param line_number: the optional line number attached to errors.
        :return: whatever the super-class ``validate`` returns.
        """
        if reset_errors:
            self.validation_errors = []

        allowed_nulls = self.__nullable_values__()
        is_null = allowed_nulls is not None and self.value in allowed_nulls
        problem = None if is_null else self.__validate__()

        if problem is not None:
            self.validation_errors.append(
                MafValidationError(
                    MafValidationErrorType.RECORD_COLUMN_WRONG_FORMAT,
                    "%s in column with name '%s'" % (problem, self.key),
                    line_number=line_number))

        # The errors were (optionally) reset above, so the super-class must
        # not reset them again or the error just added would be lost.
        parent = super(MafCustomColumnRecord, self)
        return parent.validate(reset_errors=False,
                               scheme=scheme,
                               line_number=line_number)
Ejemplo n.º 4
0
    def __update_scheme__(self, scheme=None, column_names=None):
        """
        Determines the scheme to use and stores it on this instance.

        The scheme starts as the one reported by the header.  A given
        ``scheme`` overrides it; if both exist and their versions differ, a
        ``HEADER_MISMATCH_SCHEME`` error is recorded.  If column names are
        given and the resulting scheme is missing or is a
        ``NoRestrictionsScheme``, a ``NoRestrictionsScheme`` built from those
        column names is used instead.
        :param scheme: an optional scheme that overrides the header's scheme.
        :param column_names: the optional column names read from the file.
        """
        def add_error(error):
            self.validation_errors.append(error)

        self.__scheme = self.__header.scheme()

        # Set the scheme if given, but check that they match, otherwise,
        # add an error
        if scheme is not None:
            if self.__scheme is not None \
                    and scheme.version() != self.__scheme.version():
                add_error(MafValidationError(
                    MafValidationErrorType.HEADER_MISMATCH_SCHEME,
                    "Version in the header '%s' did not match the expected "
                    "version '%s'" %
                    (self.__scheme.version(), scheme.version())
                ))
            self.__scheme = scheme

        # If there are column names, and either there is no scheme or the scheme
        # is the "no restrictions anything goes" scheme, then use the "no
        # restrictions" scheme with the given column names.
        if column_names is not None and \
                (self.__scheme is None or isinstance(self.__scheme,
                                                     NoRestrictionsScheme)):
            if self.validation_stringency is not ValidationStringency.Silent:
                # ``Logger.warn`` is a deprecated alias that was removed in
                # Python 3.13; ``warning`` is the supported spelling.
                self.__logger.warning(
                    "No matching scheme was found in the header, defaulting "
                    "to the least restrictive scheme.")
            self.__scheme = NoRestrictionsScheme(column_names=column_names)
Ejemplo n.º 5
0
    def validate(self, reset_errors=True, scheme=None, line_number=None):
        """
        Checks this column against the given scheme, if any.

        With a scheme, at most one error is recorded, chosen in this order:
        the column name is unknown to the scheme, the column sits at a
        different index than the scheme expects, or this object is not an
        instance of the class the scheme declares for the column.  Without a
        scheme nothing is checked.
        :param reset_errors: clear previously collected errors first.
        :param scheme: the optional scheme to validate against.
        :param line_number: the optional line number attached to errors.
        :return: a list of validation errors found, if any.
        """
        if reset_errors:
            self.validation_errors = []

        # Nothing to compare against.
        if not scheme:
            return self.validation_errors

        expected_index = scheme.column_index(name=self.key)
        expected_class = scheme.column_class(name=self.key)

        problem = None
        if expected_index is None:
            problem = MafValidationError(
                MafValidationErrorType.SCHEME_MISMATCHING_COLUMN_NAMES,
                "No column '%s' present in the scheme '%s'" %
                (self.key, scheme.version()),
                line_number=line_number)
        elif self.column_index is not None and expected_index != \
                self.column_index:
            problem = MafValidationError(
                MafValidationErrorType.RECORD_COLUMN_OUT_OF_ORDER,
                "Column with name '%s' was found in the %dth column"
                ", but expected the %dth column with scheme "
                "'%s''" % (self.key, self.column_index,
                           expected_index, scheme.version()),
                line_number=line_number)
        elif not isinstance(self, expected_class):
            problem = MafValidationError(
                MafValidationErrorType.RECORD_COLUMN_WRONG_FORMAT,
                "Column with name '%s' is in the wrong format. "
                "Found '%s' expected '%s'" %
                (self.key, str(self.__class__), str(expected_class)),
                line_number=line_number)

        if problem is not None:
            self.validation_errors.append(problem)

        return self.validation_errors
Ejemplo n.º 6
0
    def test_process_validation_errors_lenient(self):
        """Lenient stringency must log one 'ignore' message per error."""
        import os  # local import: only needed to clean up the temp file

        err_stream = tempfile.NamedTemporaryFile(delete=False, mode="w")
        err_file_name = err_stream.name
        # The file is created with delete=False so it can be re-opened for
        # reading below; make sure it does not outlive the test.
        self.addCleanup(os.remove, err_file_name)
        try:
            logger = Logger.get_logger(err_file_name, stream=err_stream)

            MafValidationError.process_validation_errors(
                validation_errors=TestMafValidationError.__errors,
                validation_stringency=ValidationStringency.Lenient,
                logger=logger,
            )
        finally:
            # Close (and thereby flush) the stream even if processing fails.
            err_stream.close()

        with open(err_file_name, "r") as reader:
            actual_lines = reader.readlines()
        expected_lines = [
            MafValidationError.ignore_message(error)
            for error in TestMafValidationError.__errors
        ]

        self.assertEqual(len(actual_lines), len(expected_lines))
        for actual_line, expected_line in zip(actual_lines, expected_lines):
            self.assertIn(expected_line, actual_line)
Ejemplo n.º 7
0
    def from_lines(
        cls,
        lines: List[str],
        validation_stringency: ValidationStringency = None,
        logger: logging.Logger = Logger.RootLogger,
    ) -> 'MafHeader':
        """
        Builds a header from its lines, collecting any validation errors.

        Each line is parsed with ``MafHeaderRecord.from_line``.  Parse errors
        and duplicate keys are recorded on the header; for duplicate keys the
        first record is kept.  Finally the header is validated without
        resetting the errors collected here.
        :param lines: a sequence of lines
        :param validation_stringency: optionally the validation stringency
        passed to the header constructor
        :param logger: the logger to which to write errors
        :return: a MafHeader
        """
        header = cls(validation_stringency=validation_stringency)

        # Line numbers reported in errors are 1-based.
        for line_number, line in enumerate(lines, start=1):
            record, error = MafHeaderRecord.from_line(line, line_number)

            if error:
                assert record is None
                header.validation_errors.append(error)
                continue

            assert record is not None
            if record.key not in header:
                header[record.key] = record
                continue

            header.validation_errors.append(
                MafValidationError(
                    MafValidationErrorType.HEADER_DUPLICATE_KEYS,
                    "Multiple header lines with key '%s' found" % record.key,
                    line_number=line_number,
                )
            )

        # When contigs are present and the sort order is a Coordinate, rebuild
        # the sort order record so that it is given the contigs.
        if (header.contigs() and header.sort_order()
                and issubclass(header.sort_order().__class__, Coordinate)):
            sort_order_name = header[MafHeader.SortOrderKey].value.name()
            header[MafHeader.SortOrderKey] = MafHeaderSortOrderRecord(
                value=sort_order_name, contigs=header.contigs()
            )

        header.validate(logger=logger, reset_errors=False)

        return header
Ejemplo n.º 8
0
class TestMafValidationError(unittest.TestCase):
    """Tests for the string form and processing of MafValidationError."""

    # Shared fixture: one error without a line number and one with line 42.
    # NOTE: the message texts are only labels and are the wrong way round
    # relative to the presence of a line number; the tests below match on the
    # texts as written.
    __errors = (
        MafValidationError(
            tpe=MafValidationErrorType.HEADER_LINE_EMPTY_KEY,
            message="Error with line number",
        ),
        MafValidationError(
            tpe=MafValidationErrorType.HEADER_LINE_MISSING_START_SYMBOL,
            message="Error without line number",
            line_number=42,
        ),
    )

    def test_str(self):
        """str() includes the line number prefix only when one was given."""
        actual_strings = [
            str(error) for error in TestMafValidationError.__errors
        ]
        expected_strings = [
            "Error with line number",
            "On line number 42: Error without line number",
        ]
        self.assertEqual(len(actual_strings), len(expected_strings))
        for actual, expect in zip(actual_strings, expected_strings):
            self.assertIn(expect, actual)

    def test_process_validation_errors_strict(self):
        """Strict stringency must raise a MafFormatException."""
        logger = Logger.get_logger("test_process_validation_errors_strict")
        with self.assertRaises(MafFormatException) as context:
            MafValidationError.process_validation_errors(
                validation_errors=TestMafValidationError.__errors,
                validation_stringency=ValidationStringency.Strict,
                logger=logger,
            )
        self.assertIn("Error with line number", str(context.exception))
        # assertTrue(a, b) treats ``b`` as the failure message and only checks
        # that ``a`` is truthy; assertEqual actually compares the error type.
        self.assertEqual(context.exception.tpe,
                         MafValidationErrorType.HEADER_LINE_EMPTY_KEY)

    def test_process_validation_errors_lenient(self):
        """Lenient stringency must log one 'ignore' message per error."""
        import os  # local import: only needed to clean up the temp file

        err_stream = tempfile.NamedTemporaryFile(delete=False, mode="w")
        err_file_name = err_stream.name
        # The file is created with delete=False so it can be re-opened for
        # reading below; make sure it does not outlive the test.
        self.addCleanup(os.remove, err_file_name)
        try:
            logger = Logger.get_logger(err_file_name, stream=err_stream)

            MafValidationError.process_validation_errors(
                validation_errors=TestMafValidationError.__errors,
                validation_stringency=ValidationStringency.Lenient,
                logger=logger,
            )
        finally:
            # Close (and thereby flush) the stream even if processing fails.
            err_stream.close()

        with open(err_file_name, "r") as reader:
            actual_lines = reader.readlines()
        expected_lines = [
            MafValidationError.ignore_message(error)
            for error in TestMafValidationError.__errors
        ]

        self.assertEqual(len(actual_lines), len(expected_lines))
        for actual_line, expected_line in zip(actual_lines, expected_lines):
            self.assertIn(expected_line, actual_line)
Ejemplo n.º 9
0
    def __init__(self, lines,
                 closeable=None,
                 validation_stringency=None,
                 scheme=None):
        """Initializes a MAF reader and reads in the header and column
        definitions.

        If no scheme is provided, the scheme will be determined from the
        version and annotation pragmas in the header, and matched against the
        known set of schemes.  If the scheme is not recognized, then the
        column names will determine a custom scheme and no assumption is made
        about the values of each column.

        Validation errors found in the header, and when comparing the column
        names against the scheme, are collected in ``validation_errors`` and
        then handed to ``MafValidationError.process_validation_errors`` with
        the reader's validation stringency.

        :param lines: the lines (iterable) from the MAF file.
        :param closeable: any closeable object (has a ``close()`` method) that
        will be closed when ``close()`` is called.
        :param validation_stringency: the validation stringency; defaults to
        ``ValidationStringency.Silent`` when ``None``.
        :param scheme: a scheme that should be used to override the scheme in
        the header.
        """
        self.__iter = iter(lines)
        self.__closeable = closeable
        self.validation_stringency = \
            ValidationStringency.Silent if (validation_stringency is None) \
                else validation_stringency
        self.__logger = Logger.get_logger(self.__class__.__name__)
        self.validation_errors = list()

        # Look-ahead state.  NOTE(review): presumably ``__next_line__()``
        # stores the next line (or None at end of input) in ``__next_line``
        # and advances ``__line_number`` -- confirm against its definition.
        self.__next_line = None
        self.__line_number = 0

        def add_error(error):
            self.validation_errors.append(error)

        # read in the header lines: every leading line that starts with the
        # header start symbol; the first other line (or end of input) stops
        # the loop and is left in ``__next_line``
        header_lines = list()
        while True:
            self.__next_line__()
            if self.__next_line is not None \
                    and self.__next_line.startswith(MafHeader.HeaderLineStartSymbol):
                header_lines.append(self.__next_line)
            else:
                break
        self.__header = \
            MafHeader.from_lines(
                lines=header_lines,
                validation_stringency=self.validation_stringency)

        # carry the header's validation errors over to the reader
        for error in self.__header.validation_errors:
            add_error(error)

        # get the column names: the first non-header line, split on the
        # column separator; then advance to the line after it
        if self.__next_line is not None:
            column_names = self.__next_line.split(MafRecord.ColumnSeparator)
            self.__next_line__()
        else:
            column_names = None

        # update the scheme
        self.__update_scheme__(scheme=scheme, column_names=column_names)

        # validate the column names against the scheme
        if column_names is not None:
            # match the column names against the scheme
            # NOTE(review): the errors below use ``__line_number - 1``,
            # presumably because the reader has already advanced past the
            # column-names line -- confirm against ``__next_line__``.
            scheme_column_names = self.__scheme.column_names()
            if len(column_names) != len(scheme_column_names):
                add_error(MafValidationError(
                    MafValidationErrorType.SCHEME_MISMATCHING_NUMBER_OF_COLUMN_NAMES,
                    "Found '%d' columns but expected '%d'" %
                    (len(column_names), len(scheme_column_names)),
                    line_number=self.__line_number - 1
                ))
            else:
                # same count: compare the names position by position
                for i, (column_name, scheme_column_name) in \
                        enumerate(zip(column_names, scheme_column_names)):
                    if column_name != scheme_column_name:
                        add_error(MafValidationError(
                            MafValidationErrorType.SCHEME_MISMATCHING_COLUMN_NAMES,
                            "Found column with name '%s' but expected '%s' for "
                            "the '%d'th column" %
                            (column_name, scheme_column_name, i + 1),
                            line_number=self.__line_number - 1
                        ))
        else:
            # the input ended right after the header (or was empty)
            add_error(MafValidationError(
                MafValidationErrorType.HEADER_MISSING_COLUMN_NAMES,
                "Found no column names",
                line_number=self.__line_number+1
            ))

        # process validation errors so far
        MafValidationError.process_validation_errors(
            validation_errors=self.validation_errors,
            validation_stringency=self.validation_stringency,
            name=self.__class__.__name__,
            logger=self.__logger
        )
Ejemplo n.º 10
0
    def validate(
        self,
        validation_stringency: ValidationStringency = None,
        logger: logging.Logger = Logger.RootLogger,
        reset_errors: bool = True,
    ) -> List[MafValidationError]:
        """Validates the header and returns a list of errors.
        Ensures that:
        - there is a version line in the header
        - the version is supported
        - the annotation specification is not in the header if the scheme is
          basic
        - the annotation specification is in the header if the scheme is not
          basic (or no scheme could be determined)
        - the annotation specification, when required, is supported

        :param validation_stringency: optionally the validation stringency to
        use, otherwise the header's own stringency is used
        :param logger: the logger to which to write errors
        :param reset_errors: clear previously collected errors first
        :return: the list of validation errors, if any
        """

        if reset_errors:
            self.validation_errors = list()

        def add_error(error: MafValidationError) -> None:
            self.validation_errors.append(error)

        # get the scheme!
        scheme = self.scheme()

        # Only fall back to the header's stringency when none was given; an
        # explicit value must be honoured even if it happens to be falsy.
        if validation_stringency is None:
            validation_stringency = self.validation_stringency

        # ensure there's a version record
        if MafHeader.VersionKey not in self:
            add_error(
                MafValidationError(
                    MafValidationErrorType.HEADER_MISSING_VERSION,
                    "No version line found in the header",
                )
            )
        else:
            # ensure that the version is a supported version
            version = self[MafHeader.VersionKey].value
            if version not in MafHeader.SupportedVersions:
                add_error(
                    MafValidationError(
                        MafValidationErrorType.HEADER_UNSUPPORTED_VERSION,
                        "The version '%s' is not supported" % version,
                    )
                )

        # Check the annotation spec
        # 1. basic annotation specs should not be in the header
        # 2. non-basic annotation specs should be present (in the header) and
        # have a known value
        if scheme is not None and scheme.is_basic():
            if MafHeader.AnnotationSpecKey in self:
                add_error(
                    MafValidationError(
                        MafValidationErrorType.HEADER_UNSUPPORTED_ANNOTATION_SPEC,
                        "Unexpected annotation.spec line found in the header",
                    )
                )
        else:
            if MafHeader.AnnotationSpecKey not in self:
                add_error(
                    MafValidationError(
                        MafValidationErrorType.HEADER_MISSING_ANNOTATION_SPEC,
                        "No annotation.spec line found in the header",
                    )
                )
            else:
                # ensure that the annotation spec is a supported annotation spec
                annotation = self[MafHeader.AnnotationSpecKey].value
                if annotation not in MafHeader.SupportedAnnotationSpecs:
                    add_error(
                        MafValidationError(
                            MafValidationErrorType.HEADER_UNSUPPORTED_ANNOTATION_SPEC,
                            "The annotation.spec '%s' is not supported" % annotation,
                        )
                    )

        # process validation errors
        MafValidationError.process_validation_errors(
            validation_errors=self.validation_errors,
            validation_stringency=validation_stringency,
            logger=logger,
        )

        return self.validation_errors
Ejemplo n.º 11
0
    def from_line(
        cls,
        line: str,
        column_names: Optional[List[str]] = None,
        scheme: Optional['MafScheme'] = None,
        line_number: Optional[int] = None,
        validation_stringency: ValidationStringency = ValidationStringency.
        Strict,
        logger: logging.Logger = Logger.RootLogger,
    ) -> 'MafRecord':
        """
        Parses a record from a single tab-delimited line.

        Columns that cannot be built, or that fail their own validation, are
        reported as validation errors and are not added to the record.
        :param line: the line to parse.
        :param column_names: the expected names of the columns, in order,
        otherwise will use the scheme.
        :param scheme: an optional MafScheme
        :param line_number: the optional line number.
        :param validation_stringency: the optional validation stringency for
        the record
        :param logger: the logger to which to write errors
        :return: the parsed record, carrying any validation errors found.
        :raises ValueError: if neither column_names nor scheme is given.
        """
        record = cls(line_number=line_number,
                     validation_stringency=validation_stringency)

        if column_names is None:
            if scheme is None:
                raise ValueError("Either column_names or scheme must be given")
            column_names = scheme.column_names()

        def add_errors(error: MafValidationError) -> None:
            record.validation_errors.append(error)

        column_values = line.rstrip("\r\n").split(cls.ColumnSeparator)

        # With a different number of names and values the columns cannot be
        # paired up, so report the mismatch and stop.
        if len(column_names) != len(column_values):
            add_errors(
                MafValidationError(
                    MafValidationErrorType.RECORD_MISMATCH_NUMBER_OF_COLUMNS,
                    f"Found '{len(column_values)}' columns but expected '{len(column_names)}'",
                    line_number=line_number,
                ))
            record.validate(logger=logger, reset_errors=False)

            return record

        for column_index, (column_name, column_value) in enumerate(
                zip(column_names, column_values)):
            column = None

            # Looked up once per column and reused for building below.
            scheme_column_class = (scheme.column_class(
                name=column_name) if scheme else None)

            # A validation error will be found later if we don't find the
            # column name
            if scheme_column_class is None:
                column = MafColumnRecord(key=column_name,
                                         value=column_value,
                                         column_index=column_index)
            else:
                try:
                    column = scheme_column_class.build(
                        name=column_name,
                        value=column_value,
                        column_index=column_index,
                    )
                except Exception as error:
                    add_errors(
                        MafValidationError(
                            MafValidationErrorType.RECORD_INVALID_COLUMN_VALUE,
                            f"Could not build column '{column_index+1}' with name '{column_name}' scheme '{scheme.version()}': {error}",  # type: ignore
                            line_number=line_number,
                        ))

            if column is not None:
                column_validation_errors = column.validate(
                    scheme=scheme, line_number=line_number)
                record.validation_errors.extend(
                    column_validation_errors)  # type: ignore
                # only columns without errors become part of the record
                if len(column_validation_errors) == 0:
                    record[column_name] = column

        # process validation errors
        record.validate(logger=logger, reset_errors=False)

        return record
Ejemplo n.º 12
0
    def validate(
        self,
        validation_stringency: Optional[ValidationStringency] = None,
        logger: logging.Logger = Logger.RootLogger,
        reset_errors: bool = True,
        scheme: Optional['MafScheme'] = None,
    ) -> List[MafValidationError]:
        """
        Collects a list of validation errors.

        Checks the number of columns against the scheme (if given), reports
        columns that are missing, gathers the validation errors of every
        present column, and finally hands all errors to
        ``MafValidationError.process_validation_errors``.
        :param validation_stringency: optionally the validation stringency to
        use, otherwise the record's own stringency is used
        :param logger: the logger to which to write errors
        :param reset_errors: clear previously collected errors first
        :param scheme: an optional scheme to validate against
        :return: the list of validation errors, if any.
        """
        if reset_errors:
            self.validation_errors = list()

        found_none_column = False

        # Only fall back to the record's stringency when none was given; an
        # explicit value must be honoured even if it happens to be falsy.
        if validation_stringency is None:
            validation_stringency = self.validation_stringency

        def add_errors(error: MafValidationError) -> None:
            self.validation_errors.append(error)

        # Validate the # of columns against the given scheme

        if scheme and len(scheme) != len(self):
            add_errors(
                MafValidationError(
                    MafValidationErrorType.RECORD_MISMATCH_NUMBER_OF_COLUMNS,
                    f"Found '{len(self)}' columns but expected '{len(scheme)}'",
                    line_number=self.__line_number,
                ))

        # find any columns that have None in the list or dictionary
        for i, column in enumerate(self.__columns_list):
            if not column:
                # NB: I am not sure if this that useful of an error to report
                # when the column could not be built successfully?
                found_none_column = True
                add_errors(
                    MafValidationError(
                        MafValidationErrorType.RECORD_COLUMN_WITH_NO_VALUE,
                        f"Column '{i+1}' had no value",
                        line_number=self.__line_number,
                    ))
            else:
                # add any validation errors from the column itself, tagged
                # with this record's line number so they can be located.
                self.validation_errors.extend(
                    column.validate(reset_errors=reset_errors,
                                    scheme=scheme,
                                    line_number=self.__line_number)  # type: ignore
                )

        # if we did not find any None columns, then do a bunch of internal
        #  self-consistency checking.
        if not found_none_column:
            # double-check the dictionary for columns with None values.
            for name in self.__columns_dict:
                assert self.__columns_dict[name] is not None
            # validate we have the same # of columns in the list as in the dict
            assert len(self.__columns_dict) == len(self.__columns_list)
            # validate we have the same columns in the list as in the dict
            assert (sorted(self.__columns_dict.values(),
                           key=lambda r: r.column_index)  # type: ignore
                    == self.__columns_list)
            # ensure that all records' column_index match the index in the list
            for (column_index, column) in enumerate(self.__columns_list):
                assert column_index == column.column_index  # type: ignore

        # TODO: validate cross-column constraints (ex. Mutation_Status)
        # TODO: validate that chromosome/start/end are defined

        # process validation errors
        MafValidationError.process_validation_errors(
            validation_errors=self.validation_errors,
            validation_stringency=validation_stringency,
            logger=logger,
        )

        return self.validation_errors
Ejemplo n.º 13
0
    def from_line(cls,
                  line,
                  column_names=None,
                  scheme=None,
                  line_number=None,
                  validation_stringency=None,
                  logger=Logger.RootLogger):
        """
        Parses a record from a single tab-delimited line.

        Columns that cannot be built, or that fail their own validation, are
        reported as validation errors and are not added to the record.
        :param line: the line to parse.
        :param column_names: the expected names of the columns, in order,
        otherwise will use the scheme.
        :param scheme: an optional MafScheme
        :param line_number: the optional line number.
        :param validation_stringency: the optional validation stringency for
        the record
        :param logger: the logger to which to write errors
        :return: the parsed record, carrying any validation errors found.
        :raises ValueError: if neither column_names nor scheme is given.
        """
        # Build through ``cls`` so that sub-classes get instances of their
        # own type.
        record = cls(line_number=line_number,
                     validation_stringency=validation_stringency)

        if column_names is None:
            if scheme is None:
                raise ValueError("Either column_names or scheme must be given")
            column_names = scheme.column_names()

        def add_errors(error):
            record.validation_errors.append(error)

        column_values = line.rstrip("\r\n").split(cls.ColumnSeparator)

        if len(column_names) != len(column_values):
            # The columns cannot be paired up, so only report the mismatch.
            add_errors(
                MafValidationError(
                    MafValidationErrorType.RECORD_MISMATCH_NUMBER_OF_COLUMNS,
                    "Found '%d' columns but expected '%d'" %
                    (len(column_values), len(column_names)),
                    line_number=line_number))
        else:
            for column_index, (column_name, column_value) in \
                    enumerate(zip(column_names, column_values)):
                column = None

                # Looked up once per column and reused for building below.
                scheme_column_class = \
                    scheme.column_class(name=column_name) if scheme else None

                # A validation error will be found later if we don't find the
                # column name
                if scheme_column_class is None:
                    column = MafColumnRecord(key=column_name,
                                             value=column_value,
                                             column_index=column_index)
                else:
                    try:
                        column = scheme_column_class.build(
                            name=column_name,
                            value=column_value,
                            column_index=column_index)
                    except Exception as error:
                        add_errors(
                            MafValidationError(
                                MafValidationErrorType.
                                RECORD_INVALID_COLUMN_VALUE,
                                "Could not build column '%d' with name '%s' "
                                "with the scheme '%s': %s" %
                                (column_index + 1, column_name,
                                 scheme.version(), str(error)),
                                line_number=line_number,
                            ))

                if column is not None:
                    column_validation_errors = \
                        column.validate(scheme=scheme, line_number=line_number)
                    record.validation_errors.extend(column_validation_errors)
                    # only columns without errors become part of the record
                    if len(column_validation_errors) == 0:
                        record[column_name] = column

        # process validation errors
        record.validate(logger=logger, reset_errors=False)

        return record