def setUp(self):
    """
    Build a one-peptide data set around the Gfap protein (P03995).

    Populates ``self.prots``, ``self.seq``, ``self.mods``,
    ``self.channels``, ``self.groups`` and ``self.data``.
    """
    gfap = data_sets.Protein(
        accession='P03995',
        gene='Gfap',
        description='Glial fibrillary acidic protein',
        full_sequence=(
            'MERRRITSARRSYASETVVRGLGPSRQLGTMPRFSLSRMTPPLPARVDFSLAG'
            'ALNAGFKETRASERAEMMELNDRFASYIEKVRFLEQQNKALAAELNQLRAKEP'
            'TKLADVYQAELRELRLRLDQLTANSARLEVERDNFAQDLGTLRQKLQDETNLR'
            'LEAENNLAAYRQEADEATLARVDLERKVESLEEEIQFLRKIYEEEVRELREQL'
            'AQQQVHVEMDVAKPDLTAALREIRTQYEAVATSNMQETEEWYRSKFADLTDAA'
            'SRNAELLRQAKHEANDYRRQLQALTCDLESLRGTNESLERQMREQEERHARES'
            'ASYQEALARLEEEGQSLKEEMARHLQEYQDLLNVKLALDIEIATYRKLLEGEE'
            'NRITIPVQTFSNLQIRETSLDTKSVSEGHLKRNIVVKTVEMRDGEVIKDSKQE'
            'HKDVVM'
        ),
    )
    self.prots = data_sets.Proteins(proteins=(gfap,))

    # 'QEADEATLAR' is a substring of the full sequence given above.
    self.seq = data_sets.extract_sequence(self.prots, 'QEADEATLAR')

    # A single TMT6plex modification flagged as N-terminal. The sequence and
    # its modifications hold references to each other.
    nterm_tag = data_sets.Modification(
        sequence=self.seq,
        rel_pos=0,
        mod_type='TMT6plex',
        nterm=True,
    )
    self.mods = data_sets.Modifications(mods=[nterm_tag])
    self.seq.modifications = self.mods

    # Channel name -> channel label, in insertion order.
    channel_names = ('low1', 'low2', 'low3', 'med', 'high', 'norm')
    channel_labels = ('126', '127', '128', '129', '130', '131')
    self.channels = OrderedDict(zip(channel_names, channel_labels))

    # Group name -> channel names; 'norm' belongs to neither group.
    self.groups = OrderedDict()
    self.groups['base'] = ['low1', 'low2', 'low3']
    self.groups['stim'] = ['med', 'high']

    peptide_row = {
        'Proteins': self.prots,
        'Sequence': self.seq,
        'Modifications': self.mods,
    }
    # One value per channel label; the '128' channel is deliberately NaN.
    peptide_row.update(
        zip(channel_labels, (1e4, 1e4, np.nan, 4e4, 4e4, 1e4))
    )

    self.data = data_sets.DataSet(
        channels=self.channels,
        groups=self.groups,
    )
    self.data.add_peptide(peptide_row)
def setUp(self):
    """
    Build a doubly phosphorylated Gsk3b peptide and its n-mers.

    Populates ``self.sequence`` (with its modifications attached),
    ``self.sequences``, ``self.foreground`` and ``self.background``.
    """
    gsk3b = data_sets.Protein(
        accession="Q9WV60",
        gene="Gsk3b",
        description="Glycogen synthase kinase-3 beta",
        full_sequence=(
            "MSGRPRTTSFAESCKPVQQPSAFGSMKVSRDKDGSKVTTVVATPGQGPD"
            "RPQEVSYTDTKVIGNGSFGVVYQAKLCDSGELVAIKKVLQDKRFKNREL"
            "QIMRKLDHCNIVRLRYFFYSSGEKKDEVYLNLVLDYVPETVYRVARHYS"
            "RAKQTLPVIYVKLYMYQLFRSLAYIHSFGICHRDIKPQNLLLDPDTAVL"
            "KLCDFGSAKQLVRGEPNVSYICSRYYRAPELIFGATDYTSSIDVWSAGC"
            "VLAELLLGQPIFPGDSGVDQLVEIIKVLGTPTREQIREMNPNYTEFKFP"
            "QIKAHPWTKVFRPRTPPEAIALCSRLLEYTPTARLTPLEACAHSFFDEL"
            "RDPNVKLPNGRDTPALFNFTTQELSSNPPLATILIPPHARIQAAASPPA"
            "NATAASDTNAGDRGQTNNAASASASNST"
        ),
    )

    # The peptide starts at 0-based offset 209 of the protein sequence.
    match = data_sets.ProteinMatch(
        protein=gsk3b,
        rel_pos=209,
        exact=True,
    )
    peptide = data_sets.Sequence(
        pep_seq="GEPNVsyICSR",
        protein_matches=(match,),
    )
    self.sequence = peptide

    # Peptide-relative positions of the two phospho sites: S215-p, Y216-p.
    phospho_positions = (5, 6)
    peptide.modifications = data_sets.Modifications(
        tuple(
            data_sets.Modification(
                rel_pos=position,
                mod_type="Phospho",
                sequence=peptide,
            )
            for position in phospho_positions
        ),
    )

    # Foreground and background intentionally share the very same list.
    self.sequences = list(motif.generate_n_mers(peptide))
    self.foreground = self.background = self.sequences
def setUp(self):
    """
    Build a doubly phosphorylated Gsk3b peptide.

    Populates ``self.sequence`` and attaches its two Phospho modifications.
    """
    gsk3b = data_sets.Protein(
        accession='Q9WV60',
        gene='Gsk3b',
        description='Glycogen synthase kinase-3 beta',
        full_sequence=(
            'MSGRPRTTSFAESCKPVQQPSAFGSMKVSRDKDGSKVTTVVATPGQGPD'
            'RPQEVSYTDTKVIGNGSFGVVYQAKLCDSGELVAIKKVLQDKRFKNREL'
            'QIMRKLDHCNIVRLRYFFYSSGEKKDEVYLNLVLDYVPETVYRVARHYS'
            'RAKQTLPVIYVKLYMYQLFRSLAYIHSFGICHRDIKPQNLLLDPDTAVL'
            'KLCDFGSAKQLVRGEPNVSYICSRYYRAPELIFGATDYTSSIDVWSAGC'
            'VLAELLLGQPIFPGDSGVDQLVEIIKVLGTPTREQIREMNPNYTEFKFP'
            'QIKAHPWTKVFRPRTPPEAIALCSRLLEYTPTARLTPLEACAHSFFDEL'
            'RDPNVKLPNGRDTPALFNFTTQELSSNPPLATILIPPHARIQAAASPPA'
            'NATAASDTNAGDRGQTNNAASASASNST'
        ),
    )

    # The peptide starts at 0-based offset 209 of the protein sequence.
    match = data_sets.ProteinMatch(
        protein=gsk3b,
        rel_pos=209,
        exact=True,
    )
    peptide = data_sets.Sequence(
        pep_seq='GEPNVsyICSR',
        protein_matches=(match,),
    )
    self.sequence = peptide

    # Peptide-relative positions of the two phospho sites: S215-p, Y216-p.
    phospho_positions = (5, 6)
    peptide.modifications = data_sets.Modifications(
        tuple(
            data_sets.Modification(
                rel_pos=position,
                mod_type='Phospho',
                sequence=peptide,
            )
            for position in phospho_positions
        ),
    )
def _get_proteins(df, cursor, pd_version):
    """
    Attach protein annotations to every peptide row of ``df``.

    Runs a version-specific query on ``cursor`` that yields
    ``(peptide ID, FASTA title line(s), protein sequence)`` rows, parses the
    title lines and adds three columns to ``df``, each looked up through
    ``df.index``:

    - ``'Protein Descriptions'``: descriptions joined with ``'; '``.
    - ``'Protein Group Accessions'``: accessions joined with ``'; '``.
    - ``'Proteins'``: a ``data_sets.Proteins`` built from one
      ``data_sets.Protein`` per parsed title line.

    Index values with no matching query rows get empty strings and an empty
    ``Proteins`` collection, because the lookups are ``defaultdict(list)``.

    Parameters
    ----------
    df : object
        Indexed by peptide ID; must support ``df.index.map`` and column
        assignment (presumably a :class:`pandas.DataFrame` - confirm against
        the caller). It is modified in place.
    cursor : object
        Database cursor whose ``execute(sql)`` result iterates as 3-tuples
        (presumably a :mod:`sqlite3` cursor - confirm against the caller).
    pd_version : sequence
        Proteome Discoverer version; only its first two items are inspected.
        ``(1, 4)`` and ``(2, 2)`` are supported.

    Returns
    -------
    df
        The same object that was passed in.

    Raises
    ------
    Exception
        If ``pd_version[:2]`` is not a supported version.
    AttributeError
        If a FASTA title line does not match the accession pattern (the
        message names the offending line), or if the gene or description
        patterns do not match.
    """
    version = pd_version[:2]

    if version in [(1, 4)]:
        prots = cursor.execute(
            '''
            SELECT
            Peptides.PeptideID,
            ProteinAnnotations.Description,
            Proteins.Sequence

            FROM Peptides

            JOIN PeptidesProteins
            ON Peptides.PeptideID=PeptidesProteins.PeptideID

            JOIN ProteinAnnotations
            ON ProteinAnnotations.ProteinID=PeptidesProteins.ProteinID

            JOIN Proteins
            ON Proteins.ProteinID=PeptidesProteins.ProteinID
            ''',
        )
    elif version in [(2, 2)]:
        prots = cursor.execute(
            '''
            SELECT
            TargetPsms.PeptideID,
            TargetProteins.FastaTitleLines,
            TargetProteins.Sequence

            FROM TargetPsms

            JOIN TargetProteinsTargetPsms
            ON TargetPsms.PeptideID=
            TargetProteinsTargetPsms.TargetPsmsPeptideID

            JOIN TargetProteins
            ON TargetProteins.UniqueSequenceID=
            TargetProteinsTargetPsms.TargetProteinsUniqueSequenceID
            ''',
        )
    else:
        raise Exception(
            'Unsupported Proteome Discoverer Version: {}'.format(pd_version)
        )

    # Peptide ID -> parallel lists, one entry per parsed FASTA title line.
    accessions = defaultdict(list)
    genes = defaultdict(list)
    descriptions = defaultdict(list)
    sequences = defaultdict(list)

    for peptide_id, prot_string, seq in prots:
        # A single row may carry several newline-separated title lines.
        # NOTE(review): all four appends are kept inside this per-line loop
        # so the lists stay the same length for the zip() below - confirm
        # this matches the intended nesting.
        for fasta_line in prot_string.split('\n'):
            accession = pypuniprot.RE_DISCOVERER_ACCESSION.match(fasta_line)

            if accession is None:
                # Report the unparseable line in the error itself rather
                # than printing it to stdout. AttributeError is kept because
                # that is what dereferencing the failed match used to raise.
                raise AttributeError(
                    'Unable to parse a protein accession from FASTA title '
                    'line: {!r}'.format(fasta_line)
                )

            accessions[peptide_id].append(accession.group(2))

            # NOTE(review): gene and description are matched against the
            # whole prot_string, not fasta_line, so every title line of a
            # row receives the values parsed from the start of the string.
            # Verify whether fasta_line was intended.
            gene = RE_GENE.match(prot_string)

            if gene:
                gene = gene.group(1)
            else:
                gene = RE_GENE_BACKUP.match(prot_string).group(2)

            genes[peptide_id].append(gene)
            descriptions[peptide_id].append(
                RE_DESCRIPTION.match(prot_string).group(2)
            )
            sequences[peptide_id].append(seq)

    df['Protein Descriptions'] = df.index.map(
        lambda peptide_id: '; '.join(descriptions[peptide_id])
    )
    df['Protein Group Accessions'] = df.index.map(
        lambda peptide_id: '; '.join(accessions[peptide_id])
    )
    df['Proteins'] = df.index.map(
        lambda peptide_id: data_sets.Proteins(
            proteins=tuple(
                data_sets.Protein(
                    accession=accession,
                    gene=gene,
                    full_sequence=seq,
                    description=desc,
                )
                for accession, gene, seq, desc in zip(
                    accessions[peptide_id],
                    genes[peptide_id],
                    sequences[peptide_id],
                    descriptions[peptide_id],
                )
            )
        )
    )

    return df
def _get_proteins(df, cursor, pd_version):
    """
    Attach protein annotations to every peptide row of ``df``.

    Runs a version-specific query on ``cursor`` that yields
    ``(peptide ID, title line(s), protein sequence)`` rows. Each
    newline-separated title line, with every ``">"`` removed, is used as both
    the accession and the gene name; descriptions are always empty strings.

    Three columns are added to ``df``, each looked up through ``df.index``:
    ``'Protein Descriptions'`` and ``'Protein Group Accessions'`` (entries
    joined with ``'; '``) and ``'Proteins'`` (a ``data_sets.Proteins``
    holding one ``data_sets.Protein`` per title line). Index values with no
    matching query rows get empty values, because the lookups are
    ``defaultdict(list)``.

    Parameters
    ----------
    df : object
        Indexed by peptide ID; must support ``df.index.map`` and column
        assignment (presumably a :class:`pandas.DataFrame` - confirm against
        the caller). It is modified in place.
    cursor : object
        Database cursor whose ``execute(sql)`` result iterates as 3-tuples.
    pd_version : sequence
        Proteome Discoverer version; only its first two items are inspected.
        ``(1, 4)``, ``(2, 1)`` and ``(2, 2)`` are supported.

    Returns
    -------
    df
        The same object that was passed in.

    Raises
    ------
    Exception
        If ``pd_version[:2]`` is not a supported version.
    """
    version = pd_version[:2]

    # 1.4 and 2.1 share one query; 2.2 uses the Target* tables.
    if version in ((1, 4), (2, 1)):
        query = '''
            SELECT
            Peptides.PeptideID,
            ProteinAnnotations.Description,
            Proteins.Sequence

            FROM Peptides

            JOIN PeptidesProteins
            ON Peptides.PeptideID=PeptidesProteins.PeptideID

            JOIN ProteinAnnotations
            ON ProteinAnnotations.ProteinID=PeptidesProteins.ProteinID

            JOIN Proteins
            ON Proteins.ProteinID=PeptidesProteins.ProteinID
            '''
    elif version in ((2, 2),):
        query = '''
            SELECT
            TargetPsms.PeptideID,
            TargetProteins.FastaTitleLines,
            TargetProteins.Sequence

            FROM TargetPsms

            JOIN TargetProteinsTargetPsms
            ON TargetPsms.PeptideID=
            TargetProteinsTargetPsms.TargetPsmsPeptideID

            JOIN TargetProteins
            ON TargetProteins.UniqueSequenceID=
            TargetProteinsTargetPsms.TargetProteinsUniqueSequenceID
            '''
    else:
        raise Exception(
            'Unsupported Proteome Discoverer Version: {}'.format(pd_version))

    prots = cursor.execute(query)

    # Peptide ID -> parallel lists, one entry per title line.
    accessions = defaultdict(list)
    genes = defaultdict(list)
    descriptions = defaultdict(list)
    sequences = defaultdict(list)

    for peptide_id, prot_string, seq in prots:
        for title in prot_string.split('\n'):
            # The stripped title line doubles as accession and gene name.
            stripped = title.replace(">", "")
            accessions[peptide_id].append(stripped)
            genes[peptide_id].append(stripped)
            descriptions[peptide_id].append("")
            sequences[peptide_id].append(seq)

    def joined(lookup):
        # Column builder: the '; '-joined entries stored for a peptide ID.
        return lambda peptide_id: '; '.join(lookup[peptide_id])

    def build_proteins(peptide_id):
        # Column builder: one Protein per recorded title line.
        records = zip(
            accessions[peptide_id],
            genes[peptide_id],
            sequences[peptide_id],
            descriptions[peptide_id],
        )
        return data_sets.Proteins(proteins=tuple(
            data_sets.Protein(
                accession=accession,
                gene=gene_name,
                full_sequence=full_seq,
                description=desc,
            )
            for accession, gene_name, full_seq, desc in records
        ))

    df['Protein Descriptions'] = df.index.map(joined(descriptions))
    df['Protein Group Accessions'] = df.index.map(joined(accessions))
    df['Proteins'] = df.index.map(build_proteins)

    return df
def _get_proteins(df, cursor):
    """
    Attach protein annotations to every peptide row of ``df``.

    Queries ``cursor`` for ``(peptide ID, protein description, protein
    sequence)`` rows, parses accession, gene and description out of each
    description string, and adds three columns to ``df``, each looked up
    through ``df.index``:

    - ``"Protein Descriptions"``: descriptions joined with ``"; "``.
    - ``"Protein Group Accessions"``: accessions joined with ``"; "``.
    - ``"Proteins"``: a ``data_sets.Proteins`` built from one
      ``data_sets.Protein`` per query row.

    Index values with no matching query rows get empty values, because the
    lookups are ``defaultdict(list)``.

    Parameters
    ----------
    df : object
        Indexed by peptide ID; must support ``df.index.map`` and column
        assignment (presumably a :class:`pandas.DataFrame` - confirm against
        the caller). It is modified in place.
    cursor : object
        Database cursor whose ``execute(sql)`` result iterates as 3-tuples.

    Returns
    -------
    df
        The same object that was passed in.

    Raises
    ------
    AttributeError
        If a description string matches none of the expected patterns (the
        failed match is ``None`` and has no ``group``).
    """
    query = """
        SELECT
        Peptides.PeptideID,
        ProteinAnnotations.Description,
        Proteins.Sequence

        FROM Peptides

        JOIN PeptidesProteins
        ON Peptides.PeptideID=PeptidesProteins.PeptideID

        JOIN ProteinAnnotations
        ON ProteinAnnotations.ProteinID=PeptidesProteins.ProteinID

        JOIN Proteins
        ON Proteins.ProteinID=PeptidesProteins.ProteinID
        """
    prots = cursor.execute(query)

    # Peptide ID -> parallel lists, one entry per query row.
    accessions = defaultdict(list)
    genes = defaultdict(list)
    descriptions = defaultdict(list)
    sequences = defaultdict(list)

    for peptide_id, prot_string, seq in prots:
        accession_match = pypuniprot.RE_DISCOVERER_ACCESSION.match(prot_string)
        accessions[peptide_id].append(accession_match.group(1))

        # Fall back to the backup pattern only when the primary one fails.
        gene_match = (
            RE_GENE.match(prot_string) or RE_GENE_BACKUP.match(prot_string)
        )
        genes[peptide_id].append(gene_match.group(1))

        description_match = RE_DESCRIPTION.match(prot_string)
        descriptions[peptide_id].append(description_match.group(1))

        sequences[peptide_id].append(seq)

    def joined(lookup):
        # Column builder: the "; "-joined entries stored for a peptide ID.
        return lambda peptide_id: "; ".join(lookup[peptide_id])

    def build_proteins(peptide_id):
        # Column builder: one Protein per row recorded for the peptide ID.
        records = zip(
            accessions[peptide_id],
            genes[peptide_id],
            sequences[peptide_id],
            descriptions[peptide_id],
        )
        return data_sets.Proteins(
            proteins=tuple(
                data_sets.Protein(
                    accession=accession,
                    gene=gene_name,
                    full_sequence=full_seq,
                    description=desc,
                )
                for accession, gene_name, full_seq, desc in records
            )
        )

    df["Protein Descriptions"] = df.index.map(joined(descriptions))
    df["Protein Group Accessions"] = df.index.map(joined(accessions))
    df["Proteins"] = df.index.map(build_proteins)

    return df