Ejemplo n.º 1
0
class PeptideMassValidatorTestCase(unittest.TestCase):
    """
    Tests the peptide mass validator with modified peptides.

    Each plain sequence of ``PEPTIDE_SEQUENCES`` is wrapped in a ``Peptide``
    and handed to a ``PeptideMassValidator`` that is built from the
    class-level modification collection, variable modification limit and
    precursor range. The validator is expected to accept every one of them.
    """

    PRECURSOR = mass_to_int(1751.868942379)
    """Precursor mass, converted with ``mass_to_int``
    """

    PRECURSOR_RANGE = PrecursorRange(PRECURSOR, 5, 5)
    """Precursor range around ``PRECURSOR`` with a tolerance of 5 on both sides
    (the unit is defined by ``PrecursorRange``; presumably ppm - TODO confirm)
    """

    PEPTIDE_SEQUENCES = [
        ("VYMGWJKGVYTTYR", "VYM[v:any:15994915000]GWJKGVYTTYR"),
        ("AVMQCVTVQSKPYNK", "AVMQC[s:any:57021464000]VTVQSKPYNK"),
        ("MTEYPDVJWGTRIR", "M[v:any:15994915000]TEYPDVJWGTRIR"),
        ("EPEHJDVJMPRMAAK", "EPEHJDVJMPRM[v:any:15994915000]AAK"),
        ("WYMAZLVWJIZER", "WYM[v:any:15994915000]AZLVWJIZER"),
        ("CGCJVHJJMFFJAR",
         "C[s:any:57021464000]GC[s:any:57021464000]JVHJJM[v:any:15994915000]FFJAR"),
        ("CVPPPQSATDJQNVAR", "C[s:any:57021464000]VPPPQSATDJQNVAR"),
        ("LJPYRVPFTPMCDK",
         "LJPYRVPFTPM[v:any:15994915000]C[s:any:57021464000]DK"),
        ("VFFTWESJTVHCVK", "VFFTWESJTVHC[s:any:57021464000]VK"),
        ("JSYNCDIEJRASRR", "JSYNC[s:any:57021464000]DIEJRASRR"),
        ("QYECRRVOWYR", "QYEC[s:any:57021464000]RRVOWYR"),
        ("AYYWJNRGFYJMR", "AYYWJNRGFYJMR"),
        ("JQATYMTSGGTSPPITK", "JQATYMTSGGTSPPITK"),
        ("ODGANVQZRTBPMAJ", "ODGANVQZRTBPMAJ"),
        ("QJMVFGKQQCQLEK",
         "QJM[v:any:15994915000]VFGKQQC[s:any:57021464000]QLEK"),
        ("PAPVHCDYPPYPVJK", "PAPVHC[s:any:57021464000]DYPPYPVJK"),
        ("VBLVVTBMDHVHMVK", "VBLVVTBMDHVHM[v:any:15994915000]VK"),
        ("MIHCPAFYRIMAVK",
         "MIHC[s:any:57021464000]PAFYRIM[v:any:15994915000]AVK"),
        ("QMYFARHJHDGLHK", "QMYFARHJHDGLHK"),
        ("QJDJRMBTTFDBKR", "QJDJRMBTTFDBKR"),
        ("QRTEFCREIGEVTK", "QRTEFC[s:any:57021464000]REIGEVTK"),
        ("HKDDRTVQLFAMYL", "HKDDRTVQLFAM[v:any:15994915000]YL"),
        ("VAEPMFDRVMRMVR", "VAEPMFDRVM[v:any:15994915000]RMVR"),
        ("WTEQAPSJYJMGGRK", "WTEQAPSJYJM[v:any:15994915000]GGRK"),
        ("ATVFPQLKEQJVTUK", "ATVFPQLKEQJVTUK"),
        ("FQSVVBFVEBIHYR", "FQSVVBFVEBIHYR"),
        ("ZVVCCJVJSABIYGR",
         "ZVVC[s:any:57021464000]C[s:any:57021464000]JVJSABIYGR"),
        ("EGRSAAAETCVVFSLR", "EGRSAAAETC[s:any:57021464000]VVFSLR"),
        ("HQQMDAVTKSPGTQPK", "HQQMDAVTKSPGTQPK"),
        ("VDFAFVQRPKCEEK", "VDFAFVQRPKC[s:any:57021464000]EEK"),
        ("DQMCPFRJCKJIR",
         "DQM[v:any:15994915000]C[s:any:57021464000]PFRJC[s:any:57021464000]KJIR"),
        ("MCMHTJRIVEFKR",
         "M[v:any:15994915000]C[s:any:57021464000]M[v:any:15994915000]HTJRIVEFKR"),
        ("JJSPGDYTPHVTHGMK", "JJSPGDYTPHVTHGMK"),
        ("FVKMYGRFJCYPR",
         "FVKM[v:any:15994915000]YGRFJC[s:any:57021464000]YPR"),
        ("CIMPYRGRTIQWR",
         "C[s:any:57021464000]IM[v:any:15994915000]PYRGRTIQWR"),
        ("FYVDVHMFTJQQPK", "FYVDVHMFTJQQPK"),
        ("ODBRVBFMYAAIGK", "ODBRVBFM[v:any:15994915000]YAAIGK"),
        ("FFVMCRPNDRVVGR", "FFVMC[s:any:57021464000]RPNDRVVGR"),
        ("UPMVTQPAGPPIIPKR", "UPMVTQPAGPPIIPKR"),
        ("FTBDTIVVTNZFPQK", "FTBDTIVVTNZFPQK"),
        ("JMQWKCFVPJVCR",
         "JM[v:any:15994915000]QWKC[s:any:57021464000]FVPJVC[s:any:57021464000]R"),
        ("YKQFTFFMGJAEVR", "YKQFTFFM[v:any:15994915000]GJAEVR"),
        ("HTVVJTFJJSKUVSTG", "HTVVJTFJJSKUVSTG"),
        ("WAKQJSNRCTFWR", "WAKQJSNRC[s:any:57021464000]TFWR"),
        ("JSJGWZBJCWAYJK", "JSJGWZBJC[s:any:57021464000]WAYJK"),
        ("YYSNIHNQAIVRQF", "YYSNIHNQAIVRQF"),
        ("REVJRDIMFPMGEK",
         "REVJRDIM[v:any:15994915000]FPM[v:any:15994915000]GEK"),
        ("MJNCMTWAGKQKLR",
         "MJNC[s:any:57021464000]M[v:any:15994915000]TWAGKQKLR"),
        ("FGILPJSVRWQUTGK", "FGILPJSVRWQUTGK"),
        ("YYPGKPEPMKRENK", "YYPGKPEPM[v:any:15994915000]KRENK"),
        ("JTMVDENNWAJKYR", "JTMVDENNWAJKYR"),
        ("QQYMICAJAPMVRR",
         "QQYMIC[s:any:57021464000]AJAPM[v:any:15994915000]VRR"),
        ("CIGFEQKIKBZQMK", "C[s:any:57021464000]IGFEQKIKBZQMK"),
        ("FQAIBSPMVKTMBVR", "FQAIBSPMVKTM[v:any:15994915000]BVR"),
        ("LIDFIZAZLAVTIUR", "LIDFIZAZLAVTIUR"),
        ("YAMKPASAMJKMJGPAG", "YAMKPASAM[v:any:15994915000]JKMJGPAG"),
        ("PQNGPZQPJZTCKQK", "PQNGPZQPJZTC[s:any:57021464000]KQK"),
        ("MCIQMQJKYPPRR",
         "M[v:any:15994915000]C[s:any:57021464000]IQM[v:any:15994915000]QJKYPPRR"),
        ("CWNHPAKJVWWQK", "C[s:any:57021464000]WNHPAKJVWWQK"),
        ("WZGQTAVGZAQQOGR", "WZGQTAVGZAQQOGR"),
        ("MQCDJGHSORKQR", "MQC[s:any:57021464000]DJGHSORKQR"),
        ("TSFFHFVINNKDQR", "TSFFHFVINNKDQR"),
        ("WLQQJAGTEQPYYR", "WLQQJAGTEQPYYR"),
        ("TCMTDDRIRPVJJY", "TC[s:any:57021464000]MTDDRIRPVJJY"),
        ("MKGBMIPSJAZVYQR", "MKGBM[v:any:15994915000]IPSJAZVYQR"),
        ("KGAHJQQQDADAAPFR", "KGAHJQQQDADAAPFR"),
        ("EVWVGYTDGRJVCAK", "EVWVGYTDGRJVC[s:any:57021464000]AK"),
        ("OAYIJRPUFRSNK", "OAYIJRPUFRSNK"),
        ("BCDAVMAVBAJPIVHK", "BC[s:any:57021464000]DAVMAVBAJPIVHK"),
        ("QJPMCKFPYEPAKK",
         "QJPM[v:any:15994915000]C[s:any:57021464000]KFPYEPAKK"),
        ("UHRFRQVFJFPVR", "UHRFRQVFJFPVR"),
        ("QMPJOAASZWBPQK", "QM[v:any:15994915000]PJOAASZWBPQK"),
        ("CMGJEJLDVKKMDGK",
         "C[s:any:57021464000]M[v:any:15994915000]GJEJLDVKKMDGK"),
        ("VDCDJTQJJEQKYK", "VDC[s:any:57021464000]DJTQJJEQKYK"),
        ("NVVNFDVPVMVJMEF", "NVVNFDVPVMVJMEF"),
        ("MJVQVCFIJNDTQR",
         "M[v:any:15994915000]JVQVC[s:any:57021464000]FIJNDTQR"),
        ("DVJACKGQSRTQGJGY", "DVJAC[s:any:57021464000]KGQSRTQGJGY"),
        ("JQCPJQCRYVKCK",
         "JQC[s:any:57021464000]PJQC[s:any:57021464000]RYVKC[s:any:57021464000]K"),
        ("MNPASVJEMTJFJMR", "MNPASVJEMTJFJMR"),
        ("IPDTQRYKMAJCEK", "IPDTQRYKMAJC[s:any:57021464000]EK"),
        ("HAOSJPJVUGKMPAK", "HAOSJPJVUGKM[v:any:15994915000]PAK"),
        ("VSTPVRJFMVACGCR",
         "VSTPVRJFMVAC[s:any:57021464000]GC[s:any:57021464000]R"),
        ("CVGGAKALDYHYJSAK", "C[s:any:57021464000]VGGAKALDYHYJSAK"),
        ("RVVEPFAYCJEDVR", "RVVEPFAYC[s:any:57021464000]JEDVR"),
        ("CJMPMMVPMKVQKK",
         "C[s:any:57021464000]JMPM[v:any:15994915000]M[v:any:15994915000]VPMKVQKK"),
        ("ZQRYJFVYMTFZK", "ZQRYJFVYMTFZK"),
        ("JGTWPASJHDSLYHR", "JGTWPASJHDSLYHR"),
        ("HTBDTEZJCROIK", "HTBDTEZJC[s:any:57021464000]ROIK"),
        ("ALWJEYSRCJEANK", "ALWJEYSRC[s:any:57021464000]JEANK"),
        ("WGDSCDIGAJJPPVPR", "WGDSC[s:any:57021464000]DIGAJJPPVPR"),
        ("MJREDFJIEIWCK", "MJREDFJIEIWC[s:any:57021464000]K"),
        ("VGHQMAMGPPJVDQJK",
         "VGHQM[v:any:15994915000]AM[v:any:15994915000]GPPJVDQJK"),
        ("WGAYRRJYWYYR", "WGAYRRJYWYYR"),
        ("KGQRVYZMNBQTJR", "KGQRVYZM[v:any:15994915000]NBQTJR"),
        ("SWMQEKSPVFWAIK", "SWM[v:any:15994915000]QEKSPVFWAIK"),
        ("FKQAGTVMYMYJJR",
         "FKQAGTVM[v:any:15994915000]YM[v:any:15994915000]YJJR"),
        ("BJPVSFPQBHGTWVR", "BJPVSFPQBHGTWVR"),
        ("QDGLJJPFWNMYQK", "QDGLJJPFWNMYQK"),
        ("DCITAMHPAKPMPKR", "DC[s:any:57021464000]ITAMHPAKPMPKR"),
        ("BMJZMJVDYJPRMK", "BMJZMJVDYJPRMK"),
    ]
    """Pairs of (plain peptide sequence, PTM annotated peptide sequence).

    In this data the ``[s:any:57021464000]`` marker only follows ``C`` and the
    ``[v:any:15994915000]`` marker only follows ``M`` (presumably static and
    variable modifications - TODO confirm against the modifications file).
    The annotated sequence is only used in the failure message of the test.
    """

    MODIFICATION_COLLECTION: ClassVar[
        ModificationCollection] = ModificationCollection.read_from_csv_file(
            pathlib.Path("./test_files/modifications.csv"))
    """Modification collection, read once at class creation from a path that is
    relative to the current working directory
    """

    NUMBER_OF_VARIABLE_MODIFICATIONS: ClassVar[int] = 3
    """Number of variable modifications passed to the validator
    """

    def test_validation(self):
        """
        Checks that mass validation accepts every peptide of PEPTIDE_SEQUENCES.

        Each peptide is checked in its own sub-test, so a single rejected
        peptide does not hide the result of the remaining ones and every
        failure is reported together with its plain sequence.
        """
        test_case_class = self.__class__
        peptide_mass_validator = PeptideMassValidator(
            test_case_class.MODIFICATION_COLLECTION,
            test_case_class.NUMBER_OF_VARIABLE_MODIFICATIONS,
            test_case_class.PRECURSOR_RANGE)
        for plain_sequence, annotated_sequence in test_case_class.PEPTIDE_SEQUENCES:
            with self.subTest(plain_sequence=plain_sequence):
                peptide = Peptide(
                    plain_sequence,
                    Trypsin.count_missed_cleavages(plain_sequence))
                # The second argument is passed as `True` so that the message
                # below can show the markers the validator produced
                # (presumably it enables marker annotation - TODO confirm).
                self.assertTrue(
                    peptide_mass_validator.validate(peptide, True),
                    f"expected: {annotated_sequence}; is: {peptide.sequence_with_modification_markers}"
                )
Ejemplo n.º 2
0
    def test_with_real_data(self):
        """
        Cross-checks the database peptide query against manual validation.

        The test fills the database through ``DatabaseMaintenance``, then
        determines the matching peptide sequences in two independent ways:

        1. by streaming every stored peptide through a
           ``PeptideMassValidator``,
        2. by querying the peptides with the WHERE condition generated by a
           ``ModificationCombinationList``.

        Both ways must yield exactly the same set of sequences.
        """
        VARIABLE_MODIFICATION_LIMIT = 2
        # Mass of MFPVTJEDTEGNVJTVSPPCYGFJQJR
        PRECURSOR = 3025492916648
        PRECURSOR_TOLERANCE = 20
        # With the given mass, tolerance and modifications 3 peptides should be found.
        # NOTE(review): this count is never asserted below; the test only
        # checks that both approaches agree, so it also passes when both
        # find nothing.

        # Number of rows fetched from the cursor per round trip.
        FETCH_BATCH_SIZE = 1000

        modifications_file_path = pathlib.Path(
            './test_files/modifications.csv')

        work_dir = pathlib.Path(f"./tmp/{self.id()}")
        test_files_path = pathlib.Path('./test_files')
        protein_data_test_file_path = test_files_path.joinpath('proteins.txt')

        self.prepare_workdir(work_dir, test_files_path,
                             protein_data_test_file_path)

        # The numeric arguments are passed through unchanged; see
        # DatabaseMaintenance for their meaning.
        maintenance = DatabaseMaintenance(os.getenv("TEST_MACPEPDB_URL"),
                                          work_dir, 4, 5, 'Trypsin', 2, 5, 40)
        maintenance.start()

        modification_collection = ModificationCollection.read_from_csv_file(
            modifications_file_path)
        peptide_mass_validator = PeptideMassValidator(
            modification_collection, VARIABLE_MODIFICATION_LIMIT,
            PrecursorRange(PRECURSOR, PRECURSOR_TOLERANCE,
                           PRECURSOR_TOLERANCE))

        # Approach 1: run through all peptides (in batches) and collect those
        # which match the precursor and modification requirements.
        validated_matching_peptide_sequences = set()
        with self.database_connection:
            with self.database_connection.cursor() as database_cursor:
                database_cursor.execute(
                    f"SELECT sequence, number_of_missed_cleavages FROM {Peptide.TABLE_NAME};"
                )
                while True:
                    rows = database_cursor.fetchmany(FETCH_BATCH_SIZE)
                    if not rows:
                        break
                    for sequence, number_of_missed_cleavages in rows:
                        peptide = Peptide(sequence, number_of_missed_cleavages)
                        if peptide_mass_validator.validate(peptide):
                            validated_matching_peptide_sequences.add(
                                peptide.sequence)

        # Approach 2: let the database select the matching peptides with the
        # generated WHERE condition.
        with self.database_connection:
            with self.database_connection.cursor() as database_cursor:
                modification_combination_list = ModificationCombinationList(
                    modification_collection, PRECURSOR, PRECURSOR_TOLERANCE,
                    PRECURSOR_TOLERANCE, VARIABLE_MODIFICATION_LIMIT)
                where_conditions = modification_combination_list.to_where_condition(
                )
                peptides = Peptide.select(database_cursor,
                                          where_conditions,
                                          fetchall=True)
                queried_matching_peptide_sequences = {
                    peptide.sequence
                    for peptide in peptides
                }

        # Set equality covers the former length check and both membership
        # cross checks at once, and reports the differing sequences on failure.
        self.assertSetEqual(validated_matching_peptide_sequences,
                            queried_matching_peptide_sequences)