Ejemplo n.º 1
0
    def setUp(self):
        """Create a one-peptide ``DataSet`` fixture.

        The fixture holds a single protein (accession P03995), the
        sequence extracted for the peptide 'QEADEATLAR' carrying one
        N-terminal TMT6plex modification, six quantification channels
        and two channel groups ('base' and 'stim').
        """
        gfap = data_sets.Protein(
            accession='P03995',
            gene='Gfap',
            description='Glial fibrillary acidic protein',
            full_sequence=(
                'MERRRITSARRSYASETVVRGLGPSRQLGTMPRFSLSRMTPPLPARVDFSLAG'
                'ALNAGFKETRASERAEMMELNDRFASYIEKVRFLEQQNKALAAELNQLRAKEP'
                'TKLADVYQAELRELRLRLDQLTANSARLEVERDNFAQDLGTLRQKLQDETNLR'
                'LEAENNLAAYRQEADEATLARVDLERKVESLEEEIQFLRKIYEEEVRELREQL'
                'AQQQVHVEMDVAKPDLTAALREIRTQYEAVATSNMQETEEWYRSKFADLTDAA'
                'SRNAELLRQAKHEANDYRRQLQALTCDLESLRGTNESLERQMREQEERHARES'
                'ASYQEALARLEEEGQSLKEEMARHLQEYQDLLNVKLALDIEIATYRKLLEGEE'
                'NRITIPVQTFSNLQIRETSLDTKSVSEGHLKRNIVVKTVEMRDGEVIKDSKQE'
                'HKDVVM'
            ),
        )
        self.prots = data_sets.Proteins(proteins=(gfap,))
        self.seq = data_sets.extract_sequence(self.prots, 'QEADEATLAR')

        # The modification and the sequence reference each other: the
        # modification is created with the sequence, then the sequence is
        # given the container that holds it.
        nterm_tag = data_sets.Modification(
            rel_pos=0,
            mod_type='TMT6plex',
            nterm=True,
            sequence=self.seq,
        )
        self.mods = data_sets.Modifications(mods=[nterm_tag])
        self.seq.modifications = self.mods

        # Channel name -> reporter label, in insertion order.
        channel_names = ('low1', 'low2', 'low3', 'med', 'high', 'norm')
        channel_labels = ('126', '127', '128', '129', '130', '131')
        self.channels = OrderedDict(zip(channel_names, channel_labels))

        self.groups = OrderedDict()
        self.groups['base'] = ['low1', 'low2', 'low3']
        self.groups['stim'] = ['med', 'high']

        # Peptide record: identifying objects first, then one value per
        # reporter label ('128' is deliberately missing, i.e. NaN).
        insert = {
            'Proteins': self.prots,
            'Sequence': self.seq,
            'Modifications': self.mods,
        }
        insert.update([
            ('126', 1e4),
            ('127', 1e4),
            ('128', np.nan),
            ('129', 4e4),
            ('130', 4e4),
            ('131', 1e4),
        ])

        self.data = data_sets.DataSet(
            channels=self.channels,
            groups=self.groups,
        )
        self.data.add_peptide(insert)
Ejemplo n.º 2
0
 def setUp(self):
     """Create a doubly phosphorylated peptide and its n-mers.

     The peptide "GEPNVsyICSR" is matched exactly at relative position
     209 of protein Q9WV60 and carries two "Phospho" modifications. The
     n-mers generated from it are used as both foreground and background.
     """
     gsk3b = data_sets.Protein(
         accession="Q9WV60",
         gene="Gsk3b",
         description="Glycogen synthase kinase-3 beta",
         full_sequence=(
             "MSGRPRTTSFAESCKPVQQPSAFGSMKVSRDKDGSKVTTVVATPGQGPD"
             "RPQEVSYTDTKVIGNGSFGVVYQAKLCDSGELVAIKKVLQDKRFKNREL"
             "QIMRKLDHCNIVRLRYFFYSSGEKKDEVYLNLVLDYVPETVYRVARHYS"
             "RAKQTLPVIYVKLYMYQLFRSLAYIHSFGICHRDIKPQNLLLDPDTAVL"
             "KLCDFGSAKQLVRGEPNVSYICSRYYRAPELIFGATDYTSSIDVWSAGC"
             "VLAELLLGQPIFPGDSGVDQLVEIIKVLGTPTREQIREMNPNYTEFKFP"
             "QIKAHPWTKVFRPRTPPEAIALCSRLLEYTPTARLTPLEACAHSFFDEL"
             "RDPNVKLPNGRDTPALFNFTTQELSSNPPLATILIPPHARIQAAASPPA"
             "NATAASDTNAGDRGQTNNAASASASNST"
         ),
     )
     match = data_sets.ProteinMatch(
         protein=gsk3b,
         rel_pos=209,
         exact=True,
     )
     self.sequence = data_sets.Sequence(
         pep_seq="GEPNVsyICSR",
         protein_matches=(match,),
     )

     # Peptide offsets 5 and 6 are the lowercase "s" and "y" of the
     # peptide string, i.e. the S215-p and Y216-p sites.
     self.sequence.modifications = data_sets.Modifications(
         tuple(
             data_sets.Modification(
                 rel_pos=offset,
                 mod_type="Phospho",
                 sequence=self.sequence,
             )
             for offset in (5, 6)
         ),
     )

     # Foreground and background deliberately share one list object.
     self.sequences = list(motif.generate_n_mers(self.sequence))
     self.foreground = self.background = self.sequences
Ejemplo n.º 3
0
def _reassign_mods(mods, psp_val, probability_cutoff=75):
    """Reconcile phospho modifications with phosphoRS site probabilities.

    Args:
        mods: Iterable of modification objects for one peptide.
        psp_val: phosphoRS site-probability string, or None. Entries are
            separated by ';' and parsed with ``RE_PSP``; entries that do
            not match (for example error messages) are ignored.
        probability_cutoff: A site is kept only when its probability is
            strictly greater than this value.

    Returns:
        A ``(mods, reassigned, ambiguous)`` tuple. ``mods`` is the input
        object unless the sites were reassigned, in which case it is a
        new ``data_sets.Modifications``. ``ambiguous`` is True when the
        number of confident sites differs from the number of phospho
        modifications.
    """
    # phosphoRS example format: 'T(4): 99.6; S(6): 0.4; S(10): 0.0'
    # Error messages include: 'Too many isoforms'
    raw = '' if psp_val is None else psp_val

    # Parse every matching entry into (residue, 1-based position, probability).
    site_probs = []
    for entry in raw.split(';'):
        hit = RE_PSP.match(entry.strip())
        if not hit:
            continue
        fields = hit.groups()
        site_probs.append((fields[0], int(fields[1]), float(fields[2])))

    other_mods = [mod for mod in mods if not _is_pmod(mod)]
    phospho_mods = [mod for mod in mods if _is_pmod(mod)]
    confident = [
        site
        for site in site_probs
        if site[2] > probability_cutoff
    ]

    # A count mismatch means the sites cannot be assigned one-to-one.
    if len(phospho_mods) != len(confident):
        LOGGER.debug(
            'Not enough info to assign phophosite: {}'.format(site_probs)
        )
        return mods, False, True

    # phosphoRS positions are 1-based, rel_pos is 0-based.
    current_sites = set(mod.rel_pos + 1 for mod in phospho_mods)
    predicted_sites = set(site[1] for site in confident)

    if current_sites == predicted_sites:
        return mods, False, False

    # Both lists are non-empty here: the lengths match and the sets differ.
    owner = phospho_mods[0].sequence
    moved = [
        data_sets.Modification(
            rel_pos=site[1] - 1,
            mod_type='Phospho',
            nterm=False,
            cterm=False,
            sequence=owner,
        )
        for site in confident
    ]

    updated = data_sets.Modifications(
        mods=_sort_mods(other_mods + moved),
    )

    # Point every owning sequence at the rebuilt container.
    for mod in updated.mods:
        mod.sequence.modifications = updated

    return updated, True, False
Ejemplo n.º 4
0
 def setUp(self):
     """Create a doubly phosphorylated peptide fixture.

     The peptide 'GEPNVsyICSR' is matched exactly at relative position
     209 of protein Q9WV60 and carries two 'Phospho' modifications.
     """
     gsk3b = data_sets.Protein(
         accession='Q9WV60',
         gene='Gsk3b',
         description='Glycogen synthase kinase-3 beta',
         full_sequence=(
             'MSGRPRTTSFAESCKPVQQPSAFGSMKVSRDKDGSKVTTVVATPGQGPD'
             'RPQEVSYTDTKVIGNGSFGVVYQAKLCDSGELVAIKKVLQDKRFKNREL'
             'QIMRKLDHCNIVRLRYFFYSSGEKKDEVYLNLVLDYVPETVYRVARHYS'
             'RAKQTLPVIYVKLYMYQLFRSLAYIHSFGICHRDIKPQNLLLDPDTAVL'
             'KLCDFGSAKQLVRGEPNVSYICSRYYRAPELIFGATDYTSSIDVWSAGC'
             'VLAELLLGQPIFPGDSGVDQLVEIIKVLGTPTREQIREMNPNYTEFKFP'
             'QIKAHPWTKVFRPRTPPEAIALCSRLLEYTPTARLTPLEACAHSFFDEL'
             'RDPNVKLPNGRDTPALFNFTTQELSSNPPLATILIPPHARIQAAASPPA'
             'NATAASDTNAGDRGQTNNAASASASNST'
         ),
     )
     match = data_sets.ProteinMatch(
         protein=gsk3b,
         rel_pos=209,
         exact=True,
     )
     self.sequence = data_sets.Sequence(
         pep_seq='GEPNVsyICSR',
         protein_matches=(match,),
     )

     # Peptide offsets 5 and 6 are the lowercase 's' and 'y' of the
     # peptide string, i.e. the S215-p and Y216-p sites.
     self.sequence.modifications = data_sets.Modifications(
         tuple(
             data_sets.Modification(
                 rel_pos=offset,
                 mod_type='Phospho',
                 sequence=self.sequence,
             )
             for offset in (5, 6)
         ),
     )
Ejemplo n.º 5
0
def _get_modifications(df, cursor, pd_version):
    """Add a 'Modifications' column to ``df`` from a results database.

    Residue-level and terminal modifications are queried through
    ``cursor`` with the table layout matching ``pd_version``, grouped by
    peptide ID, sorted with ``_sort_mods`` and linked to the object in
    each row's 'Sequence' column.

    Args:
        df: Data frame whose index holds peptide IDs and whose 'Sequence'
            column holds objects accepting a ``modifications`` attribute.
            Database rows whose peptide ID is not in the index are
            skipped.
        cursor: Database cursor; ``execute`` must return an iterable of
            result rows.
        pd_version: Version sequence; only its first two items are
            inspected. ``(1, 4)`` and ``(2, 2)`` are supported.

    Returns:
        ``df`` with the 'Modifications' column set.

    Raises:
        Exception: If ``pd_version`` is not a supported version.
    """
    mod_dict = defaultdict(list)

    def _add_residue_mods(rows, offset=0):
        # Record (peptide_id, name, position) rows as non-terminal mods,
        # shifting the stored position by ``offset`` when one is given.
        for peptide_id, name, pos in rows:
            if peptide_id not in df.index:
                continue

            if offset:
                pos += offset

            mod_dict[peptide_id].append(
                data_sets.Modification(
                    rel_pos=pos,
                    mod_type=name,
                    nterm=False,
                    cterm=False,
                )
            )

    def _add_terminal_mods(rows):
        # Record (peptide_id, sequence, name, position_type) rows as
        # terminal mods.
        #
        # PositionType rules taken from:
        #
        # https://github.com/compomics/thermo-msf-parser/blob/
        # 697a2fe94de2e960a9bb962d1f263dc983461999/thermo_msf_parser_API/
        # src/main/java/com/compomics/thermo_msf_parser_API/highmeminstance/
        # Parser.java#L1022
        for peptide_id, pep_seq, name, pos_type in rows:
            if peptide_id not in df.index:
                continue

            # Type 1 is N-terminal (position 0); anything else is treated
            # as C-terminal and placed at the end of the peptide.
            nterm = pos_type == 1

            mod_dict[peptide_id].append(
                data_sets.Modification(
                    rel_pos=0 if nterm else len(pep_seq),
                    mod_type=name,
                    nterm=nterm,
                    cterm=not nterm,
                )
            )

    # NOTE(review): each result set is consumed before the next query is
    # issued, as in the original code; presumably ``execute`` hands back
    # the cursor itself, so the order must be kept -- confirm.
    if pd_version[:2] in [(1, 4)]:
        aa_mods = cursor.execute(
            '''
            SELECT
            Peptides.PeptideID,
            AminoAcidModifications.Abbreviation,
            PeptidesAminoAcidModifications.Position

            FROM Peptides

            JOIN PeptidesAminoAcidModifications
            ON Peptides.PeptideID=PeptidesAminoAcidModifications.PeptideID

            JOIN AminoAcidModifications
            ON PeptidesAminoAcidModifications.AminoAcidModificationID=
            AminoAcidModifications.AminoAcidModificationID
            ''',
        )
        # NOTE(review): positions from this schema are used unchanged,
        # while the 2.2 schema is shifted by -1; presumably these are
        # already 0-based -- confirm.
        _add_residue_mods(aa_mods)

        term_mods = cursor.execute(
            '''
            SELECT
            Peptides.PeptideID,
            Peptides.Sequence,
            AminoAcidModifications.Abbreviation,
            AminoAcidModifications.PositionType

            FROM Peptides

            JOIN PeptidesTerminalModifications
            ON Peptides.PeptideID=PeptidesTerminalModifications.PeptideID

            JOIN AminoAcidModifications
            ON PeptidesTerminalModifications.TerminalModificationID=
            AminoAcidModifications.AminoAcidModificationID
            ''',
        )
        _add_terminal_mods(term_mods)
    elif pd_version[:2] in [(2, 2)]:
        aa_mods = cursor.execute(
            '''
            SELECT
            TargetPsms.PeptideID,
            FoundModifications.Abbreviation,
            TargetPsmsFoundModifications.Position

            FROM TargetPsms

            JOIN TargetPsmsFoundModifications
            ON
            TargetPsmsFoundModifications.TargetPsmsPeptideID=TargetPsms.PeptideID

            JOIN FoundModifications
            ON
            TargetPsmsFoundModifications.FoundModificationsModificationID=
            FoundModifications.ModificationID

            WHERE
            FoundModifications.PositionType NOT IN (1, 2)
            ''',
        )
        # Stored positions are shifted down by one for this schema.
        _add_residue_mods(aa_mods, offset=-1)

        term_mods = cursor.execute(
            '''
            SELECT
            TargetPsms.PeptideID,
            TargetPsms.Sequence,
            FoundModifications.Abbreviation,
            FoundModifications.PositionType

            FROM TargetPsms

            JOIN TargetPsmsFoundModifications
            ON
            TargetPsmsFoundModifications.TargetPsmsPeptideID=TargetPsms.PeptideID

            JOIN FoundModifications
            ON
            TargetPsmsFoundModifications.FoundModificationsModificationID=
            FoundModifications.ModificationID

            WHERE
            FoundModifications.PositionType IN (1, 2)
            ''',
        )
        _add_terminal_mods(term_mods)
    else:
        raise Exception(
            'Unsupported Proteome Discoverer Version: {}'.format(pd_version)
        )

    sorted_mods = {
        key: _sort_mods(val)
        for key, val in mod_dict.items()
    }

    def _get_mods(row):
        # ``row.name`` is the row's index label, i.e. the peptide ID.
        peptide_id = row.name

        mods = data_sets.Modifications(
            mods=sorted_mods.get(peptide_id, tuple()),
        )

        # Link each modification and its sequence in both directions.
        for mod in mods.mods:
            # Modifications were created above without a sequence.
            assert mod.sequence is None
            mod.sequence = row['Sequence']

        row['Sequence'].modifications = mods

        return mods

    df['Modifications'] = df.apply(_get_mods, axis=1)

    return df
Ejemplo n.º 6
0
def _get_modifications(df, cursor):
    """Add a "Modifications" column to ``df`` from a results database.

    Residue-level and terminal modifications are queried through
    ``cursor``, grouped by peptide ID, sorted with ``_sort_mods`` and
    linked to the object in each row's "Sequence" column.

    Args:
        df: Data frame whose index holds peptide IDs and whose "Sequence"
            column holds objects accepting a ``modifications`` attribute.
            Database rows whose peptide ID is not in the index are
            skipped.
        cursor: Database cursor; ``execute`` must return an iterable of
            result rows.

    Returns:
        ``df`` with the "Modifications" column set.
    """
    residue_sql = """
        SELECT
        Peptides.PeptideID,
        AminoAcidModifications.Abbreviation,
        PeptidesAminoAcidModifications.Position
        FROM Peptides
        JOIN PeptidesAminoAcidModifications
        ON Peptides.PeptideID=PeptidesAminoAcidModifications.PeptideID
        JOIN AminoAcidModifications
        ON PeptidesAminoAcidModifications.AminoAcidModificationID=
        AminoAcidModifications.AminoAcidModificationID
        """
    terminal_sql = """
        SELECT
        Peptides.PeptideID,
        Peptides.Sequence,
        AminoAcidModifications.Abbreviation,
        AminoAcidModifications.PositionType
        FROM Peptides
        JOIN PeptidesTerminalModifications
        ON Peptides.PeptideID=PeptidesTerminalModifications.PeptideID
        JOIN AminoAcidModifications
        ON PeptidesTerminalModifications.TerminalModificationID=
        AminoAcidModifications.AminoAcidModificationID
        """

    collected = defaultdict(list)

    # Residue-level modifications: the stored position is used as-is.
    # This result set is fully consumed before the next query is issued.
    for pep_id, abbreviation, position in cursor.execute(residue_sql):
        if pep_id in df.index:
            collected[pep_id].append(
                data_sets.Modification(
                    rel_pos=position,
                    mod_type=abbreviation,
                    nterm=False,
                    cterm=False,
                )
            )

    # PositionType rules taken from:
    #
    # https://github.com/compomics/thermo-msf-parser/blob/
    # 697a2fe94de2e960a9bb962d1f263dc983461999/thermo_msf_parser_API/
    # src/main/java/com/compomics/thermo_msf_parser_API/highmeminstance/
    # Parser.java#L1022
    for pep_id, peptide, abbreviation, kind in cursor.execute(terminal_sql):
        if pep_id in df.index:
            # Type 1 is N-terminal (position 0); anything else is treated
            # as C-terminal and placed at the end of the peptide.
            is_nterm = kind == 1
            collected[pep_id].append(
                data_sets.Modification(
                    rel_pos=0 if is_nterm else len(peptide),
                    mod_type=abbreviation,
                    nterm=is_nterm,
                    cterm=not is_nterm,
                )
            )

    ordered = {}
    for pep_id, found in collected.items():
        ordered[pep_id] = _sort_mods(found)

    def _attach(row):
        # ``row.name`` is the row's index label, i.e. the peptide ID.
        sequence = row["Sequence"]
        container = data_sets.Modifications(
            mods=ordered.get(row.name, ()),
        )

        # Link each modification and its sequence in both directions.
        for entry in container.mods:
            # Modifications were created above without a sequence.
            assert entry.sequence is None
            entry.sequence = row["Sequence"]

        sequence.modifications = container

        return container

    df["Modifications"] = df.apply(_attach, axis=1)

    return df