Esempio n. 1
0
    def validate_patient(patient: Patient, organism=ORGANISM_HOMO_SAPIENS):
        """
        Validate a patient: model format, identifier and MHC I / MHC II entries.

        :param patient: the patient model to validate
        :param organism: organism forwarded to the MHC I and MHC II validators
        :raises NeofoxDataValidationException: if the identifier is missing or padded with white spaces,
            or if any of the MHC validators raises an AssertionError
        """
        # checks format consistency first
        ModelValidator.validate(patient)

        try:
            # NOTE: the checks raise AssertionError explicitly instead of using assert statements, because
            # assert statements are removed when Python runs with -O and the validation would be skipped.
            # AssertionError is kept so that the handler below logs and wraps every failure the same way.
            identifier = patient.identifier

            # checks that patient id is not empty considering white spaces
            if not identifier or not identifier.strip():
                raise AssertionError("A patient identifier is missing")
            if identifier != identifier.strip():
                raise AssertionError(
                    "Patient identifier contains white spaces at start or end: {}".format(identifier))

            # checks MHC I
            if patient.mhc1:
                for m in patient.mhc1:
                    ModelValidator._validate_mhc1(m, organism=organism)
            # checks MHC II
            if patient.mhc2:
                for m in patient.mhc2:
                    ModelValidator._validate_mhc2(m, organism=organism)

        except AssertionError as e:
            # logs the offending patient before reporting the validation failure
            logger.error(patient.to_json(indent=3))
            raise NeofoxDataValidationException(e) from e
Esempio n. 2
0
    def build_mhc1_alleles(alleles: List[str],
                           mhc_database: MhcDatabase) -> List[Mhc1]:
        """
        Build the MHC I models from a list of allele names.

        Empty strings are skipped, every remaining name is parsed with the parser obtained for
        the given database and validated as an MHC I gene. One Mhc1 is built per gene in
        ``mhc_database.mhc1_genes``; entries whose zygosity is LOSS are left out of the result.

        :param alleles: allele names, possibly containing empty strings
        :param mhc_database: database providing the parser and the list of MHC I genes
        :return: the Mhc1 models whose zygosity is not LOSS
        :raises NeofoxDataValidationException: if an AssertionError is raised while building the models
        """
        mhc1_models = []
        try:
            parse = MhcParser.get_mhc_parser(mhc_database).parse_mhc_allele
            # NOTE: during the pandas parsing of empty columns empty lists become a list with one empty string
            parsed = [parse(allele_name) for allele_name in alleles if allele_name != ""]
            for parsed_allele in parsed:
                ModelValidator.validate_mhc1_gene(parsed_allele)

            # do we need to validate genes anymore? add test creating MhcAllele with bad gene and see what happens
            for gene in mhc_database.mhc1_genes:
                alleles_of_gene = [p for p in parsed if p.gene == gene.name]
                zygosity = MhcFactory._get_zygosity_from_alleles(alleles_of_gene)
                if zygosity == Zygosity.HOMOZYGOUS:
                    # we don't want repeated instances of the same allele
                    alleles_of_gene = [alleles_of_gene[0]]
                mhc1_models.append(
                    Mhc1(name=gene, zygosity=zygosity, alleles=alleles_of_gene))
        except AssertionError as e:
            raise NeofoxDataValidationException(e)
        return [model for model in mhc1_models if model.zygosity != Zygosity.LOSS]
Esempio n. 3
0
    def parse_mhc_allele(self,
                         allele: str,
                         pattern=H2_ALLELE_PATTERN) -> MhcAllele:
        """
        Parse an H2 allele name into an MhcAllele with a normalized name.

        Names matching the netMHCpan pattern are first rewritten to the ``H2{gene}{protein}``
        form. A warning is logged when the parsed allele is not found in the database.

        :param allele: the allele name to parse
        :param pattern: compiled pattern used to extract gene (group 1) and protein (group 2)
        :return: the parsed allele with ``name`` and ``full_name`` set to the normalized name
        :raises NeofoxDataValidationException: if the allele does not match the pattern
        """
        match = H2_NETMHCPAN_ALLELE_PATTERN.match(allele)
        if match:
            # this ensures that netmhcpan output is normalized
            allele = "H2{gene}{protein}".format(gene=match.group(1),
                                                protein=match.group(2))
        # honours the pattern given by the caller; it was previously ignored in favour of the default
        match = pattern.match(allele)
        if match is None:
            raise NeofoxDataValidationException(
                "Allele does not match H2 allele pattern {}".
                format(allele) if allele != "" else
                "Please check the format of provided alleles. An empty allele is provided"
            )

        gene = match.group(1)
        protein = match.group(2)

        # controls for existence in the H2 database and warns the user
        mhc_allele = MhcAllele(gene=gene, protein=protein)
        if not self.mhc_database.exists(mhc_allele):
            logger.warning(
                "Allele {} does not exist in the H2 database".format(allele))

        # builds a normalized representation of the allele
        name = "{gene}{protein}".format(gene=gene, protein=protein)

        # full name is the same as name in this case as the pattern does not allow variability
        mhc_allele.name = name
        mhc_allele.full_name = name
        return mhc_allele
Esempio n. 4
0
    def _validate_input_data(self):
        """
        Check that every neoantigen refers to a known, non-empty patient identifier.

        :raises NeofoxDataValidationException: if a neoantigen has an empty or None patient
            identifier, or refers to an identifier that is not among the patients
        """
        referenced_ids = {neoantigen.patient_identifier for neoantigen in self.neoantigens}
        known_ids = {patient.identifier for patient in self.patients.values()}

        # checks that no neoantigen is referring to an empty patient
        if referenced_ids & {"", None}:
            raise NeofoxDataValidationException(
                "There are neoantigens missing a reference to a patient")

        # checks that there is no neoantigen referring to a non existing patient
        unknown_ids = referenced_ids - known_ids
        if unknown_ids:
            raise NeofoxDataValidationException(
                "There are neoantigens referring to missing patients: {}".
                format(unknown_ids))
Esempio n. 5
0
    def parse_mhc_allele(self, allele: str) -> MhcAllele:
        """
        Parse an HLA allele name into an MhcAllele with normalized names.

        The name is first matched against the pattern without separator; in that case the split
        between group and protein is ambiguous and is resolved with the database. Otherwise the
        regular HLA pattern is used. A warning is logged when the resulting allele is not found
        in the database.

        :param allele: the allele name to parse
        :return: the parsed allele; ``name`` is ``HLA-{gene}*{group}:{protein}`` and ``full_name``
            appends any further fields captured by the pattern
        :raises NeofoxDataValidationException: if the name matches neither pattern
        """
        match = HLA_ALLELE_PATTERN_WITHOUT_SEPARATOR.match(allele)
        without_separator = match is not None
        if not without_separator:
            # infers gene, group and protein from the name
            match = HLA_ALLELE_PATTERN.match(allele)
            if match is None:
                if allele != "":
                    message = "Allele does not match HLA allele pattern {}".format(allele)
                else:
                    message = "Please check the format of provided alleles. An empty allele is provided"
                raise NeofoxDataValidationException(message)

        gene, group, protein = match.group(1), match.group(2), match.group(3)
        if without_separator:
            # allele without separator, controls for ambiguities
            default_allele = MhcAllele(gene=gene, group=group, protein=protein)
            if not self.mhc_database.exists(default_allele):
                # if default allele does not exist, tries alternative:
                # the last character of the group is moved to the front of the protein
                protein = group[-1:] + protein
                group = group[:-1]

        # controls for existence in the HLA database and warns the user
        mhc_allele = MhcAllele(gene=gene, group=group, protein=protein)
        if not self.mhc_database.exists(mhc_allele):
            logger.warning(
                "Allele {} does not exist in the HLA database".format(allele))

        # builds a normalized representation of the allele
        name = "HLA-{gene}*{serotype}:{protein}".format(gene=gene,
                                                        serotype=group,
                                                        protein=protein)

        # ensures that full name stores the complete allele as provided but normalizes
        # its representation; each further field is only read when the previous one is present
        full_name = name
        six_digits_id = match.group(4)
        if six_digits_id:
            full_name += ":{}".format(six_digits_id)
            eight_digits_id = match.group(5)
            if eight_digits_id:
                full_name += ":{}".format(eight_digits_id)
                expression_change = match.group(6)
                if expression_change:
                    full_name += expression_change

        mhc_allele.name = name
        mhc_allele.full_name = full_name
        return mhc_allele
Esempio n. 6
0
    def validate_mhc2_isoform_representation(isoform: Mhc2Isoform, organism: str):
        """
        Validate the name and the chains of an MHC II isoform for the given organism.

        :param isoform: the MHC II isoform to validate
        :param organism: the organism the isoform belongs to
        :raises NeofoxDataValidationException: if the isoform name does not follow the expected
            pattern, if the organism is not supported, or if an AssertionError is raised while
            validating the chains
        """
        try:
            if organism == ORGANISM_HOMO_SAPIENS:
                match_molecule = HLA_MOLECULE_PATTERN.match(isoform.name)
                match_single_allele = HLA_DR_MOLECULE_PATTERN.match(isoform.name)
                # explicit raise instead of an assert statement, which would be removed under python -O;
                # AssertionError is kept so that the handler below logs and wraps it
                if not (match_molecule or match_single_allele):
                    raise AssertionError("MHC II isoform not following molecule pattern")
                ModelValidator.validate_mhc_allele_representation(isoform.beta_chain, organism)
                if match_molecule:
                    # the DR molecule does not have alpha chain
                    ModelValidator.validate_mhc_allele_representation(isoform.alpha_chain, organism)
            elif organism == ORGANISM_MUS_MUSCULUS:
                match = H2_MOLECULE_PATTERN.match(isoform.name)
                if match:
                    ModelValidator.validate_mhc_allele_representation(isoform.alpha_chain, organism)
                    # NOTE(review): beta chain validation is disabled for H2 isoforms - confirm this is intended
                    #ModelValidator.validate_mhc_allele_representation(isoform.beta_chain, organism)
                else:
                    raise NeofoxDataValidationException(
                        "Transformed MHC II molecule name does not match H2 isoform pattern {}".format(isoform.name))
            else:
                raise NeofoxDataValidationException("Not supported organism {}".format(organism))

        except AssertionError as e:
            # logs the offending isoform before reporting the validation failure
            logger.error(isoform.to_json(indent=3))
            raise NeofoxDataValidationException(e) from e
Esempio n. 7
0
    def validate_mhc_allele_representation(allele: MhcAllele, organism: str):
        """
        Validate name, gene, protein and group of an MHC allele for the given organism.

        :param allele: the MHC allele to validate
        :param organism: the organism the allele belongs to
        :raises NeofoxDataValidationException: if the organism is not supported or if any of the
            checks on the allele fails
        """
        allele_pattern = ALLELE_PATTERN_BY_ORGANISM.get(organism)
        mhc1_genes = MHC_I_GENES_BY_ORGANISM.get(organism)
        mhc2_genes = MHC_II_GENES_BY_ORGANISM.get(organism)
        # an unknown organism yields None from the lookups above; it is reported here because
        # concatenating the gene lists below would otherwise fail with a TypeError first
        if allele_pattern is None or mhc1_genes is None or mhc2_genes is None:
            raise NeofoxDataValidationException("Not supported organism {}".format(organism))

        try:
            valid_genes = [g.name for g in mhc1_genes + mhc2_genes]

            # NOTE: the checks raise AssertionError explicitly instead of using assert statements, because
            # assert statements are removed when Python runs with -O and the validation would be skipped
            if allele_pattern.match(allele.name) is None:
                raise AssertionError(
                    "Allele name does not match expected pattern: {}".format(allele.name))
            if allele.gene not in valid_genes:
                raise AssertionError("MHC gene {} not from classic MHC for organism {}".format(
                    allele.gene, organism))
            if not isinstance(allele.protein, str):
                raise AssertionError(
                    "The field protein in MHC allele model has the value {} and wrong type but must be a character "
                    "instead of {}".format(allele.protein, type(allele.protein)))
            if organism == ORGANISM_HOMO_SAPIENS:
                if not isinstance(allele.group, str):
                    raise AssertionError(
                        "The field group in MHC allele model has the value {} and wrong type but must be a character "
                        "instead of {}".format(allele.group, type(allele.group)))
            elif organism == ORGANISM_MUS_MUSCULUS:
                if not (allele.group is None or allele.group == ""):
                    raise AssertionError("Provided group for H2 allele")
            else:
                raise NeofoxDataValidationException("Not supported organism {}".format(organism))
        except AssertionError as e:
            # logs the offending allele before reporting the validation failure
            logger.error(allele.to_json(indent=3))
            raise NeofoxDataValidationException(e) from e
Esempio n. 8
0
    def build_mhc2_alleles(alleles: List[str],
                           mhc_database: MhcDatabase) -> List[Mhc2]:
        """
        Build the MHC II models from a list of allele names.

        Empty strings are skipped, every remaining name is parsed with the parser obtained for
        the given database and validated as an MHC II gene. One Mhc2 is built per molecule in
        ``mhc_database.mhc2_molecules``, with one Mhc2Gene for each gene of the molecule.
        Molecules having any gene with zygosity LOSS are left out of the result.

        :param alleles: allele names, possibly containing empty strings
        :param mhc_database: database providing the parser and the list of MHC II molecules
        :return: the Mhc2 models in which no gene has zygosity LOSS
        :raises NeofoxDataValidationException: if an AssertionError is raised while building the models
        """
        mhc2_models = []
        try:
            parse = MhcParser.get_mhc_parser(mhc_database).parse_mhc_allele
            # NOTE: during the pandas parsing of empty columns empty lists become a list with one empty string
            parsed = [parse(allele_name) for allele_name in alleles if allele_name != ""]
            for parsed_allele in parsed:
                ModelValidator.validate_mhc2_gene(parsed_allele)

            # do we need to validate genes anymore? add test creating MhcAllele with bad gene and see what happens
            for molecule_name in mhc_database.mhc2_molecules:
                molecule_genes = GENES_BY_MOLECULE.get(molecule_name)
                # alleles belonging to any of the genes of this molecule
                molecule_alleles = [
                    p for p in parsed
                    if any(p.gene == g.name for g in molecule_genes)
                ]

                genes = []
                for gene in molecule_genes:
                    alleles_of_gene = [
                        p for p in molecule_alleles if p.gene == gene.name
                    ]
                    zygosity = MhcFactory._get_zygosity_from_alleles(alleles_of_gene)
                    if zygosity == Zygosity.HOMOZYGOUS:
                        # we don't want repeated instances of the same allele
                        alleles_of_gene = [alleles_of_gene[0]]
                    genes.append(
                        Mhc2Gene(name=gene,
                                 zygosity=zygosity,
                                 alleles=alleles_of_gene))

                isoforms = MhcFactory._get_mhc2_isoforms(molecule_name, genes)
                mhc2_models.append(
                    Mhc2(name=molecule_name, genes=genes, isoforms=isoforms))
        except AssertionError as e:
            raise NeofoxDataValidationException(e)
        return [
            model for model in mhc2_models
            if all(gene.zygosity != Zygosity.LOSS for gene in model.genes)
        ]
Esempio n. 9
0
    def validate_neoantigen(neoantigen: Neoantigen):
        """
        Validate a neoantigen: model format, patient identifier, mutation and expression values.

        :param neoantigen: the neoantigen model to validate
        :raises NeofoxDataValidationException: if the patient identifier is missing or if an
            AssertionError is raised by the mutation or expression checks
        """
        # checks format consistency first
        ModelValidator.validate(neoantigen)

        try:
            # explicit raise instead of an assert statement, which would be removed under python -O;
            # AssertionError is kept so that the handler below logs and wraps it
            if not neoantigen.patient_identifier:
                raise AssertionError(
                    "A patient identifier is missing. Please provide patientIdentifier in the input file")

            # checks mutation
            ModelValidator._validate_mutation(neoantigen.mutation)

            # check the expression values
            ModelValidator._validate_expression_values(neoantigen)
        except AssertionError as e:
            # logs the offending neoantigen before reporting the validation failure
            logger.error(neoantigen.to_json(indent=3))
            raise NeofoxDataValidationException(e) from e
Esempio n. 10
0
 def validate(model: betterproto.Message):
     """
     Check a model by serializing it to bytes.

     :param model: the message to check
     :raises NeofoxDataValidationException: wrapping any exception raised during serialization
     """
     # TODO: make this method capture appropriately validation issues when dealing with int and float
     try:
         # the serialized value itself is discarded, only a failure matters
         model.__bytes__()
     except Exception as serialization_error:
         raise NeofoxDataValidationException(serialization_error)