def prot_peptides(prot_seq, enzyme, mc, minlen, maxlen, is_decoy, dont_use_seen_peptides=False):
    """Yield cleavage products of `prot_seq` whose length is within range.

    The protein is cleaved with ``parser.cleave(prot_seq, enzyme, mc)`` and
    only products with ``minlen <= len(pep) <= maxlen`` are considered.  A
    product is accepted when the whole protein passes ``parser.fast_valid``,
    when it is already present in ``seen_target`` or ``seen_decoy``, or when
    it passes ``parser.fast_valid`` on its own.

    With `dont_use_seen_peptides` true, every accepted peptide is yielded and
    the module-level ``seen_target`` / ``seen_decoy`` sets are left untouched.
    Otherwise an accepted peptide is yielded only if it is in neither set; it
    is first recorded in ``seen_decoy`` (when `is_decoy` is true) or in
    ``seen_target``.
    """
    # A valid protein makes the per-peptide validity check unnecessary.
    protein_is_valid = parser.fast_valid(prot_seq)

    for candidate in parser.cleave(prot_seq, enzyme, mc):
        if not (minlen <= len(candidate) <= maxlen):
            continue

        accepted = (
            protein_is_valid
            or candidate in seen_target
            or candidate in seen_decoy
            or parser.fast_valid(candidate)
        )
        if not accepted:
            continue

        if dont_use_seen_peptides:
            yield candidate
            continue

        if candidate in seen_target or candidate in seen_decoy:
            continue

        # Record the peptide before handing it out so it is never repeated.
        registry = seen_decoy if is_decoy else seen_target
        registry.add(candidate)
        yield candidate
def test_fast_valid(self):
    """Check validators on random peptides and on 'Z'-corrupted copies.

    Fifty random peptides of length 1..10 are built from ``self.labels``.
    Each must be accepted by ``parser.fast_valid`` and ``parser.valid``;
    replacing any one of its residues with 'Z' must make both reject it.
    """
    for _trial in range(50):
        length = random.randint(1, 10)
        peptide = ''.join(random.choice(self.labels) for _ in range(length))

        self.assertTrue(parser.fast_valid(peptide, labels=self.labels))
        self.assertTrue(parser.valid(peptide, labels=self.labels))
        self.assertTrue(parser.valid(peptide))

        for residue in set(peptide):
            corrupted = peptide.replace(residue, 'Z')
            for validator in (parser.fast_valid, parser.valid):
                self.assertFalse(validator(corrupted, labels=self.labels))
def fraction_of_by(peptide_seq, precursor_mz, precursor_charge, mz, intensity):
    """Return the share of spectrum intensity carried by annotated b/y peaks.

    A ``sus.MsmsSpectrum`` is built from the arguments, restricted to the
    100-1400 m/z range, stripped of its precursor peak and annotated with
    b and y fragments of `peptide_seq` (50 ppm tolerance for both steps).

    Parameters
    ----------
    peptide_seq : str
        Peptide sequence; also used as the spectrum identifier.
    precursor_mz, precursor_charge
        Precursor values passed through to ``sus.MsmsSpectrum``.
    mz, intensity
        Peak lists passed through to ``sus.MsmsSpectrum``.

    Returns
    -------
    float
        Sum of intensities of annotated peaks divided by the sum of all
        remaining peak intensities.  ``0.0`` if `peptide_seq` fails
        ``parser.fast_valid`` (a message is printed to stderr) or if the
        total intensity is not positive.
    """
    if not parser.fast_valid(peptide_seq):
        print("Invalid peptide sequence encountered", file=sys.stderr)
        return 0.0

    spec = sus.MsmsSpectrum(peptide_seq,
                            precursor_mz=precursor_mz,
                            precursor_charge=precursor_charge,
                            mz=mz,
                            intensity=intensity,
                            peptide=peptide_seq)

    fragment_tol_mass = 50
    fragment_tol_mode = 'ppm'
    # The processing chain must start from the freshly built `spec`.
    spectrum = (
        spec.set_mz_range(min_mz=100, max_mz=1400)
        .remove_precursor_peak(fragment_tol_mass, fragment_tol_mode)
        .annotate_peptide_fragments(fragment_tol_mass, fragment_tol_mode,
                                    ion_types='by')
    )

    current, by_current = 0., 0.
    for peak_intensity, peak_annotation in zip(spectrum.intensity,
                                               spectrum.annotation):
        current += peak_intensity
        # Unannotated peaks carry None as their annotation.
        if peak_annotation is not None:
            by_current += peak_intensity

    if current > 0.:
        return by_current / current
    return 0.0
def prot_peptides(prot_seq, enzyme, mc, minlen, maxlen, is_decoy, dont_use_seen_peptides=False, snp=False, desc=False, position=False, semitryptic=False):
    """Yield peptides of `prot_seq` produced by ``get_peptides``.

    Peptides come from ``get_peptides(prot_seq, enzyme, mc, minlen, maxlen,
    semitryptic)``, which supplies ``(peptide, start position, length)``
    triples.  A peptide is processed only if it is in neither ``seen_target``
    nor ``seen_decoy`` and either the whole protein or the peptide itself
    passes ``parser.fast_valid``.

    If the protein starts with 'M' and the peptide starts at position 0,
    N-terminally truncated forms are emitted as well: ``pep[2:]``, ``pep[1:]``
    and ``pep`` when ``minlen <= plen - 2``; ``pep[1:]`` and ``pep`` when only
    ``minlen <= plen - 1`` holds.  Forms are emitted shortest first.

    Parameters
    ----------
    is_decoy
        Selects which set (``seen_decoy`` or ``seen_target``) records a form
        when `dont_use_seen_peptides` is false.
    dont_use_seen_peptides
        When true, forms are yielded without consulting or updating the
        ``seen_*`` sets inside the emission loop.
    snp
        ``1``: each form is expanded through ``custom_snp(f, startposition)``.
        ``2``: `desc` is parsed for a position and a change string and a
        matching form gets an ``'snp..to..at..snp'`` tag substituted in.
        Any other value: forms are yielded unchanged.
    desc
        Description string used only when ``snp == 2``; reset to ``False``
        if it cannot be parsed.
    position
        When true, ``(peptide, startposition)`` tuples are yielded instead of
        bare peptides.
    semitryptic
        Passed through to ``get_peptides``.
    """
    # True when the whole protein is valid, in which case individual peptides
    # are not re-validated.
    dont_use_fast_valid = parser.fast_valid(prot_seq)
    methionine_check = prot_seq[0] == 'M'
    if snp == 2:
        if desc:
            try:
                # First space-separated field of desc, split on '|':
                # field 1 is a 1-based position, field 2 the change string.
                tmp = desc.split(' ')[0].split('|')
                pos = int(tmp[1]) - 1
                aach = tmp[2]
            except:
                # Any failure while parsing disables the snp == 2 tagging.
                # NOTE(review): bare except also swallows KeyboardInterrupt.
                desc = False
    # peptides = cparser._cleave(prot_seq, enzyme, mc)
    # for pep, startposition in peptides:
    #     plen = len(pep)
    for pep, startposition, plen in get_peptides(prot_seq, enzyme, mc, minlen, maxlen, semitryptic):
        # loopcnt is the number of forms to emit for this peptide; form k is
        # pep[k-1:], so the countdown goes from the shortest form to pep itself.
        loopcnt = 0
        if pep not in seen_target and pep not in seen_decoy and (dont_use_fast_valid or parser.fast_valid(pep)):
            loopcnt = 1
            if methionine_check and startposition == 0:
                if minlen <= plen - 2:
                    loopcnt = 3
                elif minlen <= plen - 1:
                    loopcnt = 2
        while loopcnt:
            f = pep[loopcnt-1:]
            if dont_use_seen_peptides:
                if snp == 1:
                    for ff, seq_new in custom_snp(f, startposition):
                        # Both branches yield the same value.
                        if not seq_new:
                            yield ff if not position else (ff, startposition)
                        else:
                            yield ff if not position else (ff, startposition)
                else:
                    yield f if not position else (f, startposition)
            else:
                if f not in seen_target and f not in seen_decoy:
                    # Record the unmodified form before anything is yielded.
                    if is_decoy:
                        seen_decoy.add(f)
                    else:
                        seen_target.add(f)
                    if snp == 1:
                        for ff, seq_new in custom_snp(f, startposition):
                            if not seq_new:
                                yield ff if not position else (ff, startposition)
                            # NOTE(review): read as an independent `if`; a falsy
                            # seq_new absent from both sets would then yield ff
                            # twice - confirm nesting and intent.
                            if seq_new not in seen_decoy and seq_new not in seen_target:
                                yield ff if not position else (ff, startposition)
                    elif snp == 2:
                        # The range test uses plen of the full peptide, not of
                        # the truncated form f.
                        if desc and startposition <= pos <= startposition + plen:
                            # aach is presumably of the form 'X>Y' - TODO confirm.
                            if len(aach) == 3 and aach[0] in parser.std_amino_acids and aach[2] in parser.std_amino_acids:
                                pos_diff = pos - startposition
                                # Replace the residue at pos_diff with the tag.
                                f = f[:pos_diff] + 'snp%sto%sat%ssnp' % (aach.split('>')[0], aach.split('>')[-1], pos) + f[pos_diff+1:]
                                yield f if not position else (f, startposition)
                        # NOTE(review): this else is read as pairing with the
                        # `desc and ...` test, so a covered position with an
                        # unusable aach yields nothing - confirm nesting.
                        else:
                            yield f if not position else (f, startposition)
                    else:
                        yield f if not position else (f, startposition)
            loopcnt -= 1
def fraction_of_by_seq(peptide_seq, precursor_mz, precursor_charge, mz, intensity):
    """Build a spectrum for `peptide_seq` and delegate to ``fraction_of_by``.

    If `peptide_seq` fails ``parser.fast_valid`` a message is printed to
    stderr and ``0.0`` is returned.  Otherwise a ``sus.MsmsSpectrum`` is
    constructed (with `peptide_seq` as both identifier and peptide) and the
    result of ``fraction_of_by`` for that spectrum is returned.

    NOTE(review): ``fraction_of_by`` is called here with a single spectrum
    argument - confirm this matches the signature of the function in scope.
    """
    if parser.fast_valid(peptide_seq):
        candidate_spectrum = sus.MsmsSpectrum(
            peptide_seq,
            precursor_mz=precursor_mz,
            precursor_charge=precursor_charge,
            mz=mz,
            intensity=intensity,
            peptide=peptide_seq,
        )
        return fraction_of_by(candidate_spectrum)

    print("Invalid peptide sequence encountered", file=sys.stderr)
    return 0.0
def test_valid(self):
    """Check ``parser.valid`` on isoforms of random peptides.

    For fifty random peptides of length 1..10 built from ``self.labels``:
    the peptide prefixed with 'H-' must be rejected under ``self.labels``;
    every isoform from ``parser.isoforms`` must be accepted under
    ``self.extlabels``; and every isoform with one residue type replaced by
    'Z' must be rejected by both ``parser.fast_valid`` and ``parser.valid``.
    """
    for _trial in range(50):
        length = random.randint(1, 10)
        peptide = ''.join(random.choice(self.labels) for _ in range(length))
        isoform_iter = parser.isoforms(
            peptide,
            variable_mods=self.potential,
            fixed_mods=self.constant,
            labels=self.labels,
        )
        self.assertFalse(parser.valid('H-' + peptide, labels=self.labels))

        for modseq in isoform_iter:
            self.assertTrue(parser.valid(modseq, labels=self.extlabels))
            for residue in set(peptide):
                corrupted = modseq.replace(residue, 'Z')
                for validator in (parser.fast_valid, parser.valid):
                    self.assertFalse(validator(corrupted, labels=self.labels))
def _legend_tokens(seq, legend, std_labels):
    """Expand `seq` into a list of labels using `legend` for non-standard characters.

    Characters found in `std_labels` are kept as they are.  For any other
    character ``legend[char]`` is unpacked into ``(mod, res, term)``: if
    ``res`` is ``'-'`` the entry is treated as a terminal group (``mod + '-'``
    when ``term == '['``, ``'-' + mod`` otherwise), else ``mod + res`` is
    emitted.  ``parser.std_nterm`` / ``parser.std_cterm`` are appended at the
    end when no explicit N- / C-terminal group was seen.
    """
    tokens = []
    # Dedicated flag names: they must not be shared with the loop variable,
    # otherwise the C-terminal flag is overwritten on every iteration.
    has_nterm = False
    has_cterm = False
    for char in seq:
        if char in std_labels:
            tokens.append(char)
            continue
        mod, res, term = legend[char]
        if res != '-':
            tokens.append(mod + res)
        elif term == '[':
            tokens.append(mod + '-')
            has_nterm = True
        else:
            tokens.append('-' + mod)
            has_cterm = True
    if not has_nterm:
        tokens.append(parser.std_nterm)
    if not has_cterm:
        tokens.append(parser.std_cterm)
    return tokens


def rt_filtering(results, settings):
    """Fit retention coefficients on `results` and install an RT filter.

    For every result the experimental RT (``utils.get_RT(res['spectrum'])``)
    and the sequence ``res['candidates'][0][1]`` are collected.  If the
    settings contain ``('misc', 'legend')``, sequences are converted to lists
    of labels first.  Retention coefficients are fitted with
    ``utils.get_RCs_vary_lcp``; labels present in ``achrom.RCs_gilar_rp`` but
    absent from the fit are filled in through a linear regression between the
    default and the fitted coefficients.  Legend entries are then aliased to
    the coefficient of the label they stand for.

    The accepted RT window is the 0.05 .. 99.95 percentile range of
    (experimental - theoretical) RT over `results`.

    Parameters
    ----------
    results
        Iterable of mappings with ``'spectrum'`` and ``'candidates'`` keys.
    settings
        Configuration object; it is copied, never modified in place.

    Returns
    -------
    A copy of `settings`.  Unless all experimental RTs are close to zero (a
    warning is logged and the plain copy is returned), the copy has
    ``('scoring', 'condition')`` set to a callable
    ``condition(spectrum, cand, _, stored_value=False)`` returning
    ``(is_within_window, stored_value)``.
    """
    settings = settings.copy()
    if settings.has_option('misc', 'legend'):
        legend = settings.get('misc', 'legend')
    else:
        legend = None

    RTexp, seqs = zip(*[(utils.get_RT(res['spectrum']), res['candidates'][0][1])
                        for res in results])

    if legend is not None:
        stdl = set(parser.std_labels)
        seqs = [list(s) if parser.fast_valid(s) else _legend_tokens(s, legend, stdl)
                for s in seqs]

    RTexp = [float(x) for x in RTexp]
    if np.allclose(RTexp, 0):
        logger.warning('RT is missing. Skipping RT optimization.')
        return settings

    # xdict maps label -> [default coefficient, fitted coefficient or None].
    RC_def = achrom.RCs_gilar_rp
    xdict = {key: [val, None] for key, val in RC_def['aa'].items()}
    RC_dict = utils.get_RCs_vary_lcp(seqs, RTexp)
    RC_dict_new = dict()
    for key, val in RC_dict['aa'].items():
        # Labels unknown to the defaults use their fitted value for both slots.
        xdict.setdefault(key, [val, None])[1] = val

    fitted = [x for x in xdict.values() if x[1] is not None]
    a, b, _, _ = aux.linear_regression([x[0] for x in fitted],
                                       [x[1] for x in fitted])
    for key, x in xdict.items():
        if x[1] is None:
            # Extrapolate a coefficient for labels that were not fitted.
            x[1] = x[0] * a + b
        RC_dict_new[key] = x[1]

    if legend is not None:
        for k, v in legend.items():
            if len(k) == 1:
                continue
            # Strip the trailing '[' / ']' marker from the legend key.
            # NOTE(review): the ']' case drops the first character of k
            # (k[1:-1]) - confirm against the legend key format.
            if k[-1] in '[]':
                if k[-2] == '-':
                    kk = ('-' + k[1:-1]) if k[-1] == ']' else (k[:-1])
                else:
                    kk = k[:-1]
            else:
                kk = k
            logger.debug('%s -> %s', k, kk)
            if kk in RC_dict_new:
                RC_dict_new[v] = RC_dict_new[kk]
            else:
                if kk[-1].isupper():
                    kkk = kk[-1]
                elif kk[-1] == '-':
                    kkk = parser.std_nterm
                elif kk[0] == '-':
                    kkk = parser.std_cterm
                else:
                    # No sensible fallback label: use 0 instead of an unbound
                    # or stale value from a previous iteration.
                    kkk = None
                RC_dict_new[v] = RC_dict_new.get(kkk, 0)
                logger.info('No RC for %s, using %s or 0: %s', kk, kkk, RC_dict_new[v])

    RC_dict['aa'] = RC_dict_new
    logger.debug('RC dict: %s', RC_dict)

    rtexp = np.array([np.mean(x) for x in RTexp])
    rttheor = np.array(
        [calculate_RT(pep, RC_dict, raise_no_mod=False) for pep in seqs])
    deltaRT = rtexp - rttheor
    logger.debug('Linear regression: %s', aux.linear_regression(rtexp, rttheor))
    best_RT_l = scoreatpercentile(deltaRT, 0.05)
    best_RT_r = scoreatpercentile(deltaRT, 99.95)

    def condition(spectrum, cand, _, stored_value=False):
        # stored_value caches the theoretical RT of `cand` between calls.
        if not stored_value:
            stored_value = calculate_RT(cand, RC_dict)
        rtd = spectrum['RT'] - stored_value
        return best_RT_l <= rtd <= best_RT_r, stored_value

    settings.set('scoring', 'condition', condition)
    return settings