def setUp(self):
    """
    Build a single-peptide ``DataSet`` fixture.

    Creates one protein (Gfap, P03995), extracts the peptide
    ``QEADEATLAR`` from it, tags that peptide with an N-terminal
    ``TMT6plex`` modification, and adds one quantified row to a data set
    with six channels split into a ``base`` and a ``stim`` group.
    Channel ``'128'`` is deliberately left as NaN.
    """
    gfap = data_sets.Protein(
        accession='P03995',
        gene='Gfap',
        description='Glial fibrillary acidic protein',
        full_sequence=(
            'MERRRITSARRSYASETVVRGLGPSRQLGTMPRFSLSRMTPPLPARVDFSLAG'
            'ALNAGFKETRASERAEMMELNDRFASYIEKVRFLEQQNKALAAELNQLRAKEP'
            'TKLADVYQAELRELRLRLDQLTANSARLEVERDNFAQDLGTLRQKLQDETNLR'
            'LEAENNLAAYRQEADEATLARVDLERKVESLEEEIQFLRKIYEEEVRELREQL'
            'AQQQVHVEMDVAKPDLTAALREIRTQYEAVATSNMQETEEWYRSKFADLTDAA'
            'SRNAELLRQAKHEANDYRRQLQALTCDLESLRGTNESLERQMREQEERHARES'
            'ASYQEALARLEEEGQSLKEEMARHLQEYQDLLNVKLALDIEIATYRKLLEGEE'
            'NRITIPVQTFSNLQIRETSLDTKSVSEGHLKRNIVVKTVEMRDGEVIKDSKQE'
            'HKDVVM'
        ),
    )
    self.prots = data_sets.Proteins(proteins=(gfap,))
    self.seq = data_sets.extract_sequence(self.prots, 'QEADEATLAR')

    # The peptide and its modification list reference each other.
    nterm_label = data_sets.Modification(
        rel_pos=0,
        mod_type='TMT6plex',
        nterm=True,
        sequence=self.seq,
    )
    self.mods = data_sets.Modifications(mods=[nterm_label])
    self.seq.modifications = self.mods

    # Channel name -> reporter label, in acquisition order.
    self.channels = OrderedDict(
        zip(
            ('low1', 'low2', 'low3', 'med', 'high', 'norm'),
            ('126', '127', '128', '129', '130', '131'),
        )
    )

    # Group name -> channel names; 'norm' belongs to neither group.
    self.groups = OrderedDict()
    self.groups['base'] = ['low1', 'low2', 'low3']
    self.groups['stim'] = ['med', 'high']

    peptide_row = {
        'Proteins': self.prots,
        'Sequence': self.seq,
        'Modifications': self.mods,
    }
    peptide_row.update({
        '126': 1e4,
        '127': 1e4,
        '128': np.nan,
        '129': 4e4,
        '130': 4e4,
        '131': 1e4,
    })

    self.data = data_sets.DataSet(
        channels=self.channels,
        groups=self.groups,
    )
    self.data.add_peptide(peptide_row)
def setUp(self):
    """
    Build a doubly phosphorylated Gsk3b peptide and its motif n-mers.

    ``self.sequence`` is the peptide ``GEPNVsyICSR`` matched exactly to
    Gsk3b (Q9WV60) at ``rel_pos=209`` and carrying two ``Phospho``
    modifications.  The n-mers generated from it are stored once and
    shared as ``self.sequences``, ``self.foreground`` and
    ``self.background``.
    """
    gsk3b = data_sets.Protein(
        accession="Q9WV60",
        gene="Gsk3b",
        description="Glycogen synthase kinase-3 beta",
        full_sequence=(
            "MSGRPRTTSFAESCKPVQQPSAFGSMKVSRDKDGSKVTTVVATPGQGPD"
            "RPQEVSYTDTKVIGNGSFGVVYQAKLCDSGELVAIKKVLQDKRFKNREL"
            "QIMRKLDHCNIVRLRYFFYSSGEKKDEVYLNLVLDYVPETVYRVARHYS"
            "RAKQTLPVIYVKLYMYQLFRSLAYIHSFGICHRDIKPQNLLLDPDTAVL"
            "KLCDFGSAKQLVRGEPNVSYICSRYYRAPELIFGATDYTSSIDVWSAGC"
            "VLAELLLGQPIFPGDSGVDQLVEIIKVLGTPTREQIREMNPNYTEFKFP"
            "QIKAHPWTKVFRPRTPPEAIALCSRLLEYTPTARLTPLEACAHSFFDEL"
            "RDPNVKLPNGRDTPALFNFTTQELSSNPPLATILIPPHARIQAAASPPA"
            "NATAASDTNAGDRGQTNNAASASASNST"
        ),
    )
    match = data_sets.ProteinMatch(
        protein=gsk3b,
        rel_pos=209,
        exact=True,
    )
    self.sequence = data_sets.Sequence(
        pep_seq="GEPNVsyICSR",
        protein_matches=(match,),
    )

    # Peptide-relative positions 5 and 6 correspond to S215-p and Y216-p.
    phospho_sites = tuple(
        data_sets.Modification(
            rel_pos=site,
            mod_type="Phospho",
            sequence=self.sequence,
        )
        for site in (5, 6)
    )
    self.sequence.modifications = data_sets.Modifications(phospho_sites)

    # Foreground and background intentionally alias the same list object.
    self.sequences = list(motif.generate_n_mers(self.sequence))
    self.foreground = self.background = self.sequences
def _reassign_mods(mods, psp_val, probability_cutoff=75):
    """
    Re-localize phosphorylation sites from a phosphoRS probability string.

    Parameters
    ----------
    mods : iterable of :class:`data_sets.Modification`
        Current modifications of one peptide.  Iterated twice.
    psp_val : str or None
        Site probabilities, e.g. ``'T(4): 99.6; S(6): 0.4; S(10): 0.0'``.
        ``None`` is treated as an empty string, and ``;``-separated
        entries that ``RE_PSP`` does not match are ignored.
    probability_cutoff : float, optional
        Only sites with a probability strictly above this value are used.

    Returns
    -------
    mods : :class:`data_sets.Modifications`
        Sorted non-phospho modifications plus the (possibly replaced)
        phospho modifications.  Every contained modification's sequence
        has its ``modifications`` attribute pointed at this object.
    reassigned : bool
        True if the phospho modifications were rebuilt at new positions.
    ambiguous : bool
        True if the number of confident sites differs from the number of
        phospho modifications, in which case nothing is changed.
    """
    # phosphoRS example format: 'T(4): 99.6; S(6): 0.4; S(10): 0.0'
    # Error messages include: 'Too many isoforms'
    raw = '' if psp_val is None else psp_val

    site_probs = []
    for entry in raw.split(';'):
        hit = RE_PSP.match(entry.strip())
        if not hit:
            continue
        fields = hit.groups()
        site_probs.append((fields[0], int(fields[1]), float(fields[2])))

    confident = [site for site in site_probs if site[2] > probability_cutoff]

    other_mods = [mod for mod in mods if not _is_pmod(mod)]
    phospho_mods = [mod for mod in mods if _is_pmod(mod)]

    reassigned = False
    ambiguous = False

    if len(phospho_mods) != len(confident):
        LOGGER.debug(
            'Not enough info to assign phophosite: {}'.format(site_probs)
        )
        ambiguous = True
    else:
        # phosphoRS positions are compared against rel_pos + 1.
        current_sites = set(mod.rel_pos + 1 for mod in phospho_mods)
        confident_sites = set(site[1] for site in confident)

        if current_sites != confident_sites:
            # Unequal sets with equal lengths imply at least one phospho
            # modification exists, so index 0 is safe here.
            owner = phospho_mods[0].sequence
            phospho_mods = [
                data_sets.Modification(
                    rel_pos=site[1] - 1,
                    mod_type='Phospho',
                    nterm=False,
                    cterm=False,
                    sequence=owner,
                )
                for site in confident
            ]
            reassigned = True

    merged = data_sets.Modifications(
        mods=_sort_mods(other_mods + phospho_mods),
    )

    for mod in merged.mods:
        mod.sequence.modifications = merged

    return merged, reassigned, ambiguous
def setUp(self):
    """
    Build a doubly phosphorylated Gsk3b peptide fixture.

    ``self.sequence`` is the peptide ``GEPNVsyICSR`` matched exactly to
    Gsk3b (Q9WV60) at ``rel_pos=209``, with ``Phospho`` modifications at
    peptide-relative positions 5 and 6.
    """
    parent_protein = data_sets.Protein(
        accession='Q9WV60',
        gene='Gsk3b',
        description='Glycogen synthase kinase-3 beta',
        full_sequence=(
            'MSGRPRTTSFAESCKPVQQPSAFGSMKVSRDKDGSKVTTVVATPGQGPD'
            'RPQEVSYTDTKVIGNGSFGVVYQAKLCDSGELVAIKKVLQDKRFKNREL'
            'QIMRKLDHCNIVRLRYFFYSSGEKKDEVYLNLVLDYVPETVYRVARHYS'
            'RAKQTLPVIYVKLYMYQLFRSLAYIHSFGICHRDIKPQNLLLDPDTAVL'
            'KLCDFGSAKQLVRGEPNVSYICSRYYRAPELIFGATDYTSSIDVWSAGC'
            'VLAELLLGQPIFPGDSGVDQLVEIIKVLGTPTREQIREMNPNYTEFKFP'
            'QIKAHPWTKVFRPRTPPEAIALCSRLLEYTPTARLTPLEACAHSFFDEL'
            'RDPNVKLPNGRDTPALFNFTTQELSSNPPLATILIPPHARIQAAASPPA'
            'NATAASDTNAGDRGQTNNAASASASNST'
        ),
    )
    protein_match = data_sets.ProteinMatch(
        protein=parent_protein,
        rel_pos=209,
        exact=True,
    )
    self.sequence = data_sets.Sequence(
        pep_seq='GEPNVsyICSR',
        protein_matches=(protein_match,),
    )

    # S215-p
    phospho_ser = data_sets.Modification(
        rel_pos=5,
        mod_type='Phospho',
        sequence=self.sequence,
    )
    # Y216-p
    phospho_tyr = data_sets.Modification(
        rel_pos=6,
        mod_type='Phospho',
        sequence=self.sequence,
    )
    self.sequence.modifications = data_sets.Modifications(
        (phospho_ser, phospho_tyr),
    )
def _get_modifications(df, cursor, pd_version):
    """
    Attach a ``Modifications`` object to every peptide row of ``df``.

    Residue-level and terminal modifications are read from the database
    behind ``cursor``, grouped by peptide ID, sorted with ``_sort_mods``
    and linked to the ``Sequence`` object of the matching row.

    Parameters
    ----------
    df : pandas.DataFrame
        Indexed by peptide ID, with a ``'Sequence'`` column holding the
        sequence objects to link.  Rows for peptide IDs missing from the
        index are ignored.
    cursor
        Database cursor whose ``execute`` returns an iterable of rows.
    pd_version : tuple
        Proteome Discoverer version; only its first two items are
        inspected.  ``(1, 4)`` and ``(2, 2)`` are supported.

    Returns
    -------
    df : pandas.DataFrame
        The same frame, with a new ``'Modifications'`` column.  Each
        row's ``Sequence.modifications`` is set to the same object.

    Raises
    ------
    Exception
        If ``pd_version`` is not a supported version.
    AssertionError
        If a freshly built modification already has a sequence attached.
    """
    aa_query, term_query, one_based = _pd_modification_queries(pd_version)

    mod_dict = defaultdict(list)

    # Each query is consumed fully before the next one is issued, because
    # both go through the same cursor.
    _collect_residue_mods(
        mod_dict, cursor.execute(aa_query), df.index, one_based,
    )
    _collect_terminal_mods(
        mod_dict, cursor.execute(term_query), df.index,
    )

    mod_dict = {
        key: _sort_mods(val)
        for key, val in mod_dict.items()
    }

    def _get_mods(row):
        peptide_id = row.name

        mods = data_sets.Modifications(
            mods=mod_dict.get(peptide_id, tuple()),
        )

        for mod in mods.mods:
            assert mod.sequence is None
            mod.sequence = row['Sequence']

        row['Sequence'].modifications = mods

        return mods

    df['Modifications'] = df.apply(_get_mods, axis=1)

    return df


def _pd_modification_queries(pd_version):
    """Return ``(aa_query, term_query, one_based)`` for a PD version.

    ``one_based`` tells the caller whether residue positions returned by
    ``aa_query`` must be decremented by one before use.
    """
    if pd_version[:2] in [(1, 4)]:
        aa_query = '''
        SELECT
        Peptides.PeptideID,
        AminoAcidModifications.Abbreviation,
        PeptidesAminoAcidModifications.Position
        FROM
        Peptides JOIN
        PeptidesAminoAcidModifications
        ON Peptides.PeptideID=PeptidesAminoAcidModifications.PeptideID
        JOIN AminoAcidModifications
        ON PeptidesAminoAcidModifications.AminoAcidModificationID=
        AminoAcidModifications.AminoAcidModificationID
        '''
        term_query = '''
        SELECT
        Peptides.PeptideID,
        Peptides.Sequence,
        AminoAcidModifications.Abbreviation,
        AminoAcidModifications.PositionType
        FROM
        Peptides JOIN
        PeptidesTerminalModifications
        ON Peptides.PeptideID=PeptidesTerminalModifications.PeptideID
        JOIN AminoAcidModifications
        ON PeptidesTerminalModifications.TerminalModificationID=
        AminoAcidModifications.AminoAcidModificationID
        '''
        return aa_query, term_query, False

    if pd_version[:2] in [(2, 2)]:
        aa_query = '''
        SELECT
        TargetPsms.PeptideID,
        FoundModifications.Abbreviation,
        TargetPsmsFoundModifications.Position
        FROM
        TargetPsms JOIN
        TargetPsmsFoundModifications
        ON TargetPsmsFoundModifications.TargetPsmsPeptideID=TargetPsms.PeptideID
        JOIN FoundModifications
        ON TargetPsmsFoundModifications.FoundModificationsModificationID=
        FoundModifications.ModificationID
        WHERE
        FoundModifications.PositionType NOT IN (1, 2)
        '''
        term_query = '''
        SELECT
        TargetPsms.PeptideID,
        TargetPsms.Sequence,
        FoundModifications.Abbreviation,
        FoundModifications.PositionType
        FROM
        TargetPsms JOIN
        TargetPsmsFoundModifications
        ON TargetPsmsFoundModifications.TargetPsmsPeptideID=TargetPsms.PeptideID
        JOIN FoundModifications
        ON TargetPsmsFoundModifications.FoundModificationsModificationID=
        FoundModifications.ModificationID
        WHERE
        FoundModifications.PositionType IN (1, 2)
        '''
        # Positions from this schema are shifted down by one before use
        # (presumably stored 1-based -- TODO confirm against the schema).
        return aa_query, term_query, True

    raise Exception(
        'Unsupported Proteome Discoverer Version: {}'.format(pd_version)
    )


def _collect_residue_mods(mod_dict, rows, index, one_based):
    """Append a non-terminal Modification per ``(id, name, pos)`` row."""
    for peptide_id, name, pos in rows:
        if peptide_id not in index:
            continue

        if one_based:
            pos -= 1

        mod_dict[peptide_id].append(
            data_sets.Modification(
                rel_pos=pos,
                mod_type=name,
                nterm=False,
                cterm=False,
            )
        )


def _collect_terminal_mods(mod_dict, rows, index):
    """Append a terminal Modification per ``(id, seq, name, type)`` row."""
    # PositionType rules taken from:
    #
    # https://github.com/compomics/thermo-msf-parser/blob/
    # 697a2fe94de2e960a9bb962d1f263dc983461999/thermo_msf_parser_API/
    # src/main/java/com/compomics/thermo_msf_parser_API/highmeminstance/
    # Parser.java#L1022
    for peptide_id, pep_seq, name, pos_type in rows:
        if peptide_id not in index:
            continue

        # PositionType 1 is treated as N-terminal; anything else as
        # C-terminal, which is placed at rel_pos == len(pep_seq).
        nterm = pos_type == 1
        pos = 0 if nterm else len(pep_seq)

        mod_dict[peptide_id].append(
            data_sets.Modification(
                rel_pos=pos,
                mod_type=name,
                nterm=nterm,
                cterm=not nterm,
            )
        )
def _get_modifications(df, cursor):
    """
    Attach a ``Modifications`` object to every peptide row of ``df``.

    Residue-level and terminal modifications are read through ``cursor``,
    grouped by peptide ID, sorted with ``_sort_mods`` and linked to the
    ``Sequence`` object of the matching row.  Query rows whose peptide ID
    is not in ``df.index`` are skipped.

    Parameters
    ----------
    df : pandas.DataFrame
        Indexed by peptide ID, with a ``"Sequence"`` column.
    cursor
        Database cursor whose ``execute`` returns an iterable of rows.

    Returns
    -------
    df : pandas.DataFrame
        The same frame, with a new ``"Modifications"`` column.
    """
    known_ids = df.index
    found = defaultdict(list)

    residue_rows = cursor.execute(
        """
        SELECT
        Peptides.PeptideID,
        AminoAcidModifications.Abbreviation,
        PeptidesAminoAcidModifications.Position
        FROM
        Peptides JOIN
        PeptidesAminoAcidModifications
        ON Peptides.PeptideID=PeptidesAminoAcidModifications.PeptideID
        JOIN AminoAcidModifications
        ON PeptidesAminoAcidModifications.AminoAcidModificationID=
        AminoAcidModifications.AminoAcidModificationID
        """,
    )

    for pep_id, abbreviation, position in residue_rows:
        if pep_id in known_ids:
            found[pep_id].append(
                data_sets.Modification(
                    rel_pos=position,
                    mod_type=abbreviation,
                    nterm=False,
                    cterm=False,
                )
            )

    terminal_rows = cursor.execute(
        """
        SELECT
        Peptides.PeptideID,
        Peptides.Sequence,
        AminoAcidModifications.Abbreviation,
        AminoAcidModifications.PositionType
        FROM
        Peptides JOIN
        PeptidesTerminalModifications
        ON Peptides.PeptideID=PeptidesTerminalModifications.PeptideID
        JOIN AminoAcidModifications
        ON PeptidesTerminalModifications.TerminalModificationID=
        AminoAcidModifications.AminoAcidModificationID
        """,
    )

    # PositionType rules taken from:
    #
    # https://github.com/compomics/thermo-msf-parser/blob/
    # 697a2fe94de2e960a9bb962d1f263dc983461999/thermo_msf_parser_API/
    # src/main/java/com/compomics/thermo_msf_parser_API/highmeminstance/
    # Parser.java#L1022
    for pep_id, peptide, abbreviation, position_type in terminal_rows:
        if pep_id in known_ids:
            # PositionType 1 is N-terminal; anything else is treated as
            # C-terminal and placed at rel_pos == len(peptide).
            is_nterm = position_type == 1
            found[pep_id].append(
                data_sets.Modification(
                    rel_pos=0 if is_nterm else len(peptide),
                    mod_type=abbreviation,
                    nterm=is_nterm,
                    cterm=not is_nterm,
                )
            )

    sorted_mods = {}
    for pep_id, pep_mods in found.items():
        sorted_mods[pep_id] = _sort_mods(pep_mods)

    def _link_row(row):
        # row.name is the row's index label, i.e. the peptide ID.
        linked = data_sets.Modifications(
            mods=sorted_mods.get(row.name, ()),
        )

        for mod in linked.mods:
            assert mod.sequence is None
            mod.sequence = row["Sequence"]

        row["Sequence"].modifications = linked

        return linked

    df["Modifications"] = df.apply(_link_row, axis=1)

    return df