def filter_unique_peptides(peptides, score, ns):
    """Yield the best-scoring stored entry for each distinct peptide sequence.

    This is a generator over already-parsed peptide elements; it does not
    open files itself.

    Args:
        peptides: iterable of peptide elements that support
            ``xpath(expr, namespaces=...)``. Every element is passed to
            ``formatting.clear_el`` once it has been inspected.
        score: one of ``'q'``, ``'pep'``, ``'p'`` or ``'svm'``. Selects the
            child tag (``q_value``, ``pep``, ``p_value`` or ``svm_score``)
            whose text is parsed as a float. For ``'svm'`` a higher value
            wins; for every other score type a lower value wins.
        ns: namespace mapping handed to ``xpath`` and to the project helpers;
            the xpath query uses the ``xmlns`` prefix.

    Yields:
        For each sequence returned by ``reader.get_peptide_seq``, the value
        that ``formatting.stringify_strip_namespace_declaration`` produced
        for the winning element (presumably its serialized XML - confirm
        against that helper). On equal scores the first element seen is kept.

    Raises:
        KeyError: if ``score`` is not a supported score type. Raised on the
            first iteration, before any element is consumed, so a bad score
            type is reported even when ``peptides`` is empty.
    """
    score_tags = {'q': 'q_value',
                  'pep': 'pep',
                  'p': 'p_value',
                  'svm': 'svm_score'}
    # Resolve the query once: it does not depend on the element.
    score_xpath = 'xmlns:%s' % score_tags[score]
    higher_is_better = score == 'svm'

    best = {}
    for el in peptides:
        featscore = float(el.xpath(score_xpath, namespaces=ns)[0].text)
        seq = reader.get_peptide_seq(el, ns)

        current = best.get(seq)
        if current is None:
            improved = True
        elif higher_is_better:
            improved = featscore > current['score']
        else:
            improved = featscore < current['score']

        if improved:
            # Stringify before clearing: the element is emptied just below.
            best[seq] = {
                'pep_el': formatting.stringify_strip_namespace_declaration(
                    el, ns),
                'score': featscore,
            }
        formatting.clear_el(el)

    for entry in best.values():
        yield entry['pep_el']
def filter_whole_proteins(elements, protein_fasta, lookup, seqtype, ns,
                          deamidation, minpeplen, enforce_tryp):
    """Yield elements none of whose sequences occur in the given proteins.

    Args:
        elements: iterable of parsed elements to filter.
        protein_fasta: passed to ``fasta.parse_fasta``; the records it yields
            must expose ``.seq`` and ``.id``.
        lookup: object providing ``get_protein_from_pep(prefix)``, which must
            return an iterable of ``(protein_id, pos)`` pairs.
        seqtype, ns, deamidation: forwarded unchanged to
            ``get_seqs_from_element``.
        minpeplen: number of leading residues of each sequence used as the
            lookup key.
        enforce_tryp: when true, a sequence found inside a protein only
            counts as a match if ``pos == 0`` or if both the sequence's last
            residue and the protein residue at ``pos - 1`` are K or R.

    Yields:
        ``formatting.string_and_clear(element, ns)`` for every element that
        has no matching protein. Matching elements are cleared with
        ``formatting.clear_el`` and dropped.

    Raises:
        KeyError: if the lookup returns a protein ID that is absent from the
            de-duplicated FASTA mapping built below.
    """
    # Built in two steps on purpose: keying on the L->I normalised sequence
    # first collapses proteins with identical sequences (the last ID wins),
    # and the inversion then gives ID -> sequence.
    # NOTE(review): presumably the lookup was populated with the same
    # de-duplication, otherwise whole_proteins[prot_id] can fail - confirm.
    seq_to_id = {str(prot.seq).replace('L', 'I'): prot.id
                 for prot in fasta.parse_fasta(protein_fasta)}
    whole_proteins = {prot_id: protseq
                      for protseq, prot_id in seq_to_id.items()}

    def matches_protein(pepseq, prot_id, pos):
        # True when pepseq counts as contained in the protein prot_id.
        protseq = whole_proteins[prot_id]
        if pepseq not in protseq:
            return False
        if not enforce_tryp:
            return True
        # NOTE(review): pos is presumably the offset of the looked-up prefix
        # inside the protein, so pos == 0 means a protein N-terminal peptide.
        return pos == 0 or {pepseq[-1], protseq[pos - 1]} <= {'K', 'R'}

    for element in elements:
        element_seqs = get_seqs_from_element(element, seqtype, ns, deamidation)
        element_prots = {
            seq: [(protid, pos) for protid, pos in
                  lookup.get_protein_from_pep(seq[:minpeplen])]
            for seq in element_seqs
        }
        # any() stops at the first hit across *all* sequences; one match is
        # enough to discard the element, so scanning further is wasted work.
        seq_matches_protein = any(
            matches_protein(pepseq, prot_id, pos)
            for pepseq, proteins in element_prots.items()
            for prot_id, pos in proteins
        )
        if seq_matches_protein:
            formatting.clear_el(element)
        else:
            yield formatting.string_and_clear(element, ns)
def filter_whole_proteins(elements, protein_fasta, lookup, seqtype, ns,
                          deamidation, minpeplen, enforce_tryp):
    """Yield elements none of whose sequences occur in the given proteins.

    Args:
        elements: iterable of parsed elements to filter.
        protein_fasta: passed to ``fasta.parse_fasta``; the records it yields
            must expose ``.seq`` and ``.id``.
        lookup: object providing ``get_protein_from_pep(prefix)``, which must
            return an iterable of ``(protein_id, pos)`` pairs.
        seqtype, ns, deamidation: forwarded unchanged to
            ``get_seqs_from_element``.
        minpeplen: number of leading residues of each sequence used as the
            lookup key.
        enforce_tryp: when true, a sequence found inside a protein only
            counts as a match if ``pos == 0`` or if both the sequence's last
            residue and the protein residue at ``pos - 1`` are K or R.

    Yields:
        ``formatting.string_and_clear(element, ns)`` for every element that
        has no matching protein. Matching elements are cleared with
        ``formatting.clear_el`` and dropped.

    Raises:
        KeyError: if the lookup returns a protein ID that is absent from the
            de-duplicated FASTA mapping built below.
    """
    # Built in two steps on purpose: keying on the L->I normalised sequence
    # first collapses proteins with identical sequences (the last ID wins),
    # and the inversion then gives ID -> sequence.
    # NOTE(review): presumably the lookup was populated with the same
    # de-duplication, otherwise whole_proteins[prot_id] can fail - confirm.
    seq_to_id = {str(prot.seq).replace('L', 'I'): prot.id
                 for prot in fasta.parse_fasta(protein_fasta)}
    whole_proteins = {prot_id: protseq
                      for protseq, prot_id in seq_to_id.items()}

    def matches_protein(pepseq, prot_id, pos):
        # True when pepseq counts as contained in the protein prot_id.
        protseq = whole_proteins[prot_id]
        if pepseq not in protseq:
            return False
        if not enforce_tryp:
            return True
        # NOTE(review): pos is presumably the offset of the looked-up prefix
        # inside the protein, so pos == 0 means a protein N-terminal peptide.
        return pos == 0 or {pepseq[-1], protseq[pos - 1]} <= {'K', 'R'}

    for element in elements:
        element_seqs = get_seqs_from_element(element, seqtype, ns, deamidation)
        element_prots = {
            seq: [(protid, pos) for protid, pos in
                  lookup.get_protein_from_pep(seq[:minpeplen])]
            for seq in element_seqs
        }
        # any() stops at the first hit across *all* sequences; one match is
        # enough to discard the element, so scanning further is wasted work.
        seq_matches_protein = any(
            matches_protein(pepseq, prot_id, pos)
            for pepseq, proteins in element_prots.items()
            for prot_id, pos in proteins
        )
        if seq_matches_protein:
            formatting.clear_el(element)
        else:
            yield formatting.string_and_clear(element, ns)
def filter_peptide_length(features, elementtype, ns, minlen=0, maxlen=None):
    """Yield features whose modification-stripped sequence length is in range.

    Args:
        features: iterable of parsed elements.
        elementtype: forwarded to ``get_either_seq`` to pick the sequence.
        ns: namespace mapping forwarded to the helpers.
        minlen: inclusive lower bound; converted with ``int()``.
        maxlen: inclusive upper bound; converted with ``int()``, or no upper
            bound at all when ``None``.

    Yields:
        ``formatting.string_and_clear(feat, ns)`` for each feature inside the
        bounds. Features outside the bounds are cleared with
        ``formatting.clear_el`` and skipped.
    """
    lower = int(minlen)
    upper = float('inf') if maxlen is None else int(maxlen)
    for feat in features:
        bare_seq = strip_modifications(get_either_seq(elementtype, feat, ns))
        if lower <= len(bare_seq) <= upper:
            yield formatting.string_and_clear(feat, ns)
            continue
        formatting.clear_el(feat)
def protein_header_split_generator(elements, headers, ns):
    """Keep only elements whose every protein ID matches one of ``headers``.

    Each ``protein_id`` child (in the ``ns['xmlns']`` namespace) of an
    element is tested with ``re.search`` against the patterns in ``headers``.
    The first protein that matches none of them disqualifies the element,
    which is then cleared with ``formatting.clear_el``. Elements whose
    proteins all match (including elements without any ``protein_id``
    child) are yielded via ``formatting.string_and_clear``.
    """
    for el in elements:
        protein_ids = el.findall('{%s}protein_id' % ns['xmlns'])
        every_protein_matches = all(
            any(re.search(pattern, protein.text) for pattern in headers)
            for protein in protein_ids
        )
        if every_protein_matches:
            yield formatting.string_and_clear(el, ns)
        else:
            formatting.clear_el(el)
def generate_xmltags(fn, returntag, ignore_tags, ns=None):
    """Base generator for percolator xml psm, peptide, protein output,
    as well as for mzML, mzIdentML.

    Args:
        fn: file handed to ``etree.iterparse``.
        returntag: local tag name of the elements to yield.
        ignore_tags: local tag names of elements that are cleared with
            ``formatting.clear_el`` as soon as the parser emits them.
        ns: forwarded to ``create_namespace``; its result is prefixed to
            ``returntag`` and to every entry of ``ignore_tags``.

    Yields:
        Each parsed element whose tag equals the prefixed ``returntag``.
        Yielded elements are NOT cleared here; releasing them is left to the
        consumer. Elements matching neither ``returntag`` nor
        ``ignore_tags`` are left untouched.
    """
    xmlns = create_namespace(ns)
    # Both lookups are loop-invariant: build them once instead of once per
    # parsed element, and use a set so the ignore check is a hash lookup.
    wanted_tag = '{0}{1}'.format(xmlns, returntag)
    ignored_tags = frozenset(
        '{0}{1}'.format(xmlns, tag) for tag in ignore_tags)
    for _event, el in etree.iterparse(fn):
        if el.tag == wanted_tag:
            yield el
        elif el.tag in ignored_tags:
            formatting.clear_el(el)
def filter_known_searchspace(elements, seqtype, lookup, ns, ntermwildcards,
                             deamidation):
    """Drop elements that have a sequence present in the known search space.

    For each element the sequences from ``get_seqs_from_element`` are checked
    one by one with ``lookup.check_seq_exists(seq, ntermwildcards)``;
    checking stops at the first hit. Elements with a hit are cleared with
    ``formatting.clear_el``. All others are yielded via
    ``formatting.string_and_clear``. Useful for excluding peptides that are
    found in e.g. ENSEMBL or similar.
    """
    for element in elements:
        candidate_seqs = get_seqs_from_element(element, seqtype, ns,
                                               deamidation)
        already_known = any(
            lookup.check_seq_exists(seq, ntermwildcards)
            for seq in candidate_seqs
        )
        if not already_known:
            yield formatting.string_and_clear(element, ns)
            continue
        formatting.clear_el(element)
def mzmlfn_ms2_spectra_generator(mzmlfiles):
    """Yield ``(fn, info)`` for each spectrum whose 'ms level' is ``'2'``.

    ``fn`` is the first item produced by ``mzmlfn_spectra_generator``
    (presumably the mzML file name - confirm against that generator).
    ``info`` is a dict with the keys ``scan``, ``rt``, ``iit``, ``mz`` and
    ``charge``. ``rt`` and ``iit`` are the first values returned for
    'scan start time' and 'ion injection time'. The spectrum element is
    cleared with ``formatting.clear_el`` once the consumer resumes the
    generator.
    """
    for fn, spec, ns in mzmlfn_spectra_generator(mzmlfiles):
        ms_level = fetch_cvparam_value_by_name(
            get_all_cvparams(spec, ns), 'ms level')
        if ms_level != '2':
            # NOTE(review): skipped spectra are not cleared here; confirm
            # whether the upstream generator releases them.
            continue
        scannr = get_spec_scan_nr(spec)
        start_times = fetch_cvparams_values_from_subel(
            spec, 'scan', ['scan start time'], ns)
        injection_times = fetch_cvparams_values_from_subel(
            spec, 'scan', ['ion injection time'], ns)
        mz, charge = fetch_cvparams_values_from_subel(
            spec, 'selectedIon', ['selected ion m/z', 'charge state'], ns)
        info = {
            'scan': scannr,
            'rt': start_times[0],
            'iit': injection_times[0],
            'mz': mz,
            'charge': charge,
        }
        yield fn, info
        formatting.clear_el(spec)
def protein_header_split_generator(elements, ns, can_headers, headers):
    """Yield elements of one header class, giving canonical hits priority.

    The ``protein_id`` children (in the ``ns['xmlns']`` namespace) of each
    element are scanned in the order ``findall`` returns them, and scanning
    stops at the first protein whose text matches any pattern in
    ``can_headers`` (a "canonical" hit).

    An element is yielded via ``formatting.string_and_clear`` when either

    * no protein matched ``can_headers`` and at least one protein matched a
      pattern in ``headers``, or
    * ``headers == can_headers`` (the canonical class itself was requested)
      and a canonical hit was found.

    Every other element is cleared with ``formatting.clear_el`` and dropped.

    Args:
        elements: iterable of parsed PSM/peptide elements.
        ns: namespace mapping; ``ns['xmlns']`` qualifies the ``protein_id``
            tag.
        can_headers: regex patterns identifying canonical proteins.
        headers: regex patterns identifying the requested protein class.
    """
    def matches_any(patterns, text):
        # True when at least one regex in patterns is found in text.
        return any(re.search(pattern, text) for pattern in patterns)

    for el in elements:
        header_matching = False
        can = False
        for protein in el.findall('{%s}protein_id' % ns['xmlns']):
            if matches_any(can_headers, protein.text):
                can = True
                break  # as soon as a canonical match was found break
            # For classes other than known, check if there is at least one
            # protein matching the specified header; elements with matches
            # to the canonical proteins will not be used.
            if matches_any(headers, protein.text):
                header_matching = True
        noncanonical_hit = header_matching and not can
        if noncanonical_hit or ((headers == can_headers) and can):
            yield formatting.string_and_clear(el, ns)
        else:
            formatting.clear_el(el)
def filter_unique_peptides(peptides, score, ns):
    """Yield the best-scoring stored entry for each distinct peptide sequence.

    This is a generator over already-parsed peptide elements; it does not
    open files itself.

    Args:
        peptides: iterable of peptide elements that support
            ``xpath(expr, namespaces=...)``. Every element is passed to
            ``formatting.clear_el`` once it has been inspected.
        score: one of ``'q'``, ``'pep'``, ``'p'`` or ``'svm'``. Selects the
            child tag (``q_value``, ``pep``, ``p_value`` or ``svm_score``)
            whose text is parsed as a float. For ``'svm'`` a higher value
            wins; for every other score type a lower value wins.
        ns: namespace mapping handed to ``xpath`` and to the project helpers;
            the xpath query uses the ``xmlns`` prefix.

    Yields:
        For each sequence returned by ``reader.get_peptide_seq``, the value
        that ``formatting.stringify_strip_namespace_declaration`` produced
        for the winning element (presumably its serialized XML - confirm
        against that helper). On equal scores the first element seen is kept.

    Raises:
        KeyError: if ``score`` is not a supported score type. Raised on the
            first iteration, before any element is consumed, so a bad score
            type is reported even when ``peptides`` is empty.
    """
    score_tags = {'q': 'q_value',
                  'pep': 'pep',
                  'p': 'p_value',
                  'svm': 'svm_score'}
    # Resolve the query once: it does not depend on the element.
    score_xpath = 'xmlns:%s' % score_tags[score]
    higher_is_better = score == 'svm'

    best = {}
    for el in peptides:
        featscore = float(el.xpath(score_xpath, namespaces=ns)[0].text)
        seq = reader.get_peptide_seq(el, ns)

        current = best.get(seq)
        if current is None:
            improved = True
        elif higher_is_better:
            improved = featscore > current['score']
        else:
            improved = featscore < current['score']

        if improved:
            # Stringify before clearing: the element is emptied just below.
            best[seq] = {
                'pep_el': formatting.stringify_strip_namespace_declaration(
                    el, ns),
                'score': featscore,
            }
        formatting.clear_el(el)

    for entry in best.values():
        yield entry['pep_el']
def mzmlfn_ms2_spectra_generator(mzmlfiles):
    """Yield ``(fn, info)`` for each spectrum whose 'ms level' is ``'2'``.

    ``fn`` is the first item produced by ``mzmlfn_spectra_generator``
    (presumably the mzML file name - confirm against that generator).
    ``info`` is a dict with the keys ``specscanid`` (the spectrum's ``id``
    attribute), ``ionmob``, ``rt``, ``iit``, ``mz`` and ``charge``. The
    spectrum element is cleared with ``formatting.clear_el`` once the
    consumer resumes the generator.
    """
    scan_params = [
        'scan start time',
        'ion injection time',
        'inverse reduced ion mobility',
    ]
    precursor_params = ['selected ion m/z', 'charge state']
    for fn, spec, ns in mzmlfn_spectra_generator(mzmlfiles):
        ms_level = fetch_cvparam_value_by_name(
            get_all_cvparams(spec, ns), 'ms level')
        if ms_level != '2':
            # NOTE(review): skipped spectra are not cleared here; confirm
            # whether the upstream generator releases them.
            continue
        specscanid = spec.attrib['id']
        rt, iit, ionmob = fetch_cvparams_values_from_subel(
            spec, 'scan', list(scan_params), ns)
        mz, charge = fetch_cvparams_values_from_subel(
            spec, 'selectedIon', list(precursor_params), ns)
        info = {
            'specscanid': specscanid,
            'ionmob': ionmob,
            'rt': rt,
            'iit': iit,
            'mz': mz,
            'charge': charge,
        }
        yield fn, info
        formatting.clear_el(spec)
def target_decoy_generator(element_generator, decoy, ns):
    """Yield only elements whose namespaced ``decoy`` attribute equals ``decoy``.

    The attribute key is ``decoy`` qualified with ``ns['xmlns']``. Matching
    elements are yielded via ``formatting.string_and_clear``. Non-matching
    elements are cleared with ``formatting.clear_el`` and skipped. An
    element lacking the attribute raises ``KeyError``.
    """
    for el in element_generator:
        decoy_key = '{%s}decoy' % ns['xmlns']
        is_requested = el.attrib[decoy_key] == decoy
        if not is_requested:
            formatting.clear_el(el)
            continue
        yield formatting.string_and_clear(el, ns)
def get_score(elements, ns, scoretype='svm_score'):
    """Yield the text of the first ``scoretype`` child of every element.

    The child is located with the xpath ``xmlns:<scoretype>`` using the
    namespace mapping ``ns``. The text is yielded as found, without numeric
    conversion. Each element is cleared with ``formatting.clear_el`` before
    its score is yielded. An element without such a child raises
    ``IndexError``.
    """
    for el in elements:
        query = 'xmlns:{0}'.format(scoretype)
        first_hit = el.xpath(query, namespaces=ns)[0]
        # Read the text first: the element is cleared before yielding.
        value = first_hit.text
        formatting.clear_el(el)
        yield value