def filter_whole_proteins(elements, protein_fasta, lookup, seqtype, ns,
                          deamidation, minpeplen, enforce_tryp):
    """Yield stringified elements whose peptides do not match a whole protein.

    Protein sequences are read from ``protein_fasta`` with every 'L' replaced
    by 'I'. For each element, every sequence returned by
    ``get_seqs_from_element`` is looked up via
    ``lookup.get_protein_from_pep`` using its first ``minpeplen`` residues,
    which yields ``(protein_id, pos)`` pairs. The element is discarded
    (cleared, not yielded) when a sequence is a substring of one of those
    proteins and either:

    * ``enforce_tryp`` is false, or
    * ``pos`` is 0, or
    * both the last residue of the peptide and the protein residue at
      ``pos - 1`` are K or R.

    NOTE(review): ``pos`` is presumably the offset of the looked-up peptide
    prefix inside the protein; confirm against the lookup implementation.

    Raises:
        KeyError: if the lookup returns a protein id that is absent from
            ``protein_fasta``.
    """
    # Key by protein id directly. Building a sequence -> id map first and
    # inverting it silently drops all but one id for proteins that share an
    # identical sequence, which made the id lookup below raise KeyError.
    whole_proteins = {
        prot.id: str(prot.seq).replace('L', 'I')
        for prot in fasta.parse_fasta(protein_fasta)
    }
    tryptic_residues = {'K', 'R'}
    for element in elements:
        seq_matches_protein = False
        element_seqs = get_seqs_from_element(element, seqtype, ns, deamidation)
        element_prots = {
            seq: [(protid, pos) for protid, pos in
                  lookup.get_protein_from_pep(seq[:minpeplen])]
            for seq in element_seqs
        }
        for pepseq, proteins in element_prots.items():
            for prot_id, pos in proteins:
                protseq = whole_proteins[prot_id]
                if pepseq not in protseq:
                    continue
                # Without tryptic enforcement any substring hit counts.
                # Otherwise accept a hit at the protein start, or one where
                # both the peptide end and the preceding protein residue
                # are K/R.
                if (not enforce_tryp or pos == 0 or
                        {pepseq[-1], protseq[pos - 1]} <= tryptic_residues):
                    seq_matches_protein = True
                    break
            if seq_matches_protein:
                # A single hit is enough to discard the element.
                break
        if seq_matches_protein:
            formatting.clear_el(element)
        else:
            yield formatting.string_and_clear(element, ns)
def filter_whole_proteins(elements, protein_fasta, lookup, seqtype, ns,
                          deamidation, minpeplen, enforce_tryp):
    """Yield stringified elements whose peptides do not match a whole protein.

    Protein sequences are read from ``protein_fasta`` with every 'L' replaced
    by 'I'. For each element, every sequence returned by
    ``get_seqs_from_element`` is looked up via
    ``lookup.get_protein_from_pep`` using its first ``minpeplen`` residues,
    which yields ``(protein_id, pos)`` pairs. The element is discarded
    (cleared, not yielded) when a sequence is a substring of one of those
    proteins and either:

    * ``enforce_tryp`` is false, or
    * ``pos`` is 0, or
    * both the last residue of the peptide and the protein residue at
      ``pos - 1`` are K or R.

    NOTE(review): ``pos`` is presumably the offset of the looked-up peptide
    prefix inside the protein; confirm against the lookup implementation.

    Raises:
        KeyError: if the lookup returns a protein id that is absent from
            ``protein_fasta``.
    """
    # Key by protein id directly. Building a sequence -> id map first and
    # inverting it silently drops all but one id for proteins that share an
    # identical sequence, which made the id lookup below raise KeyError.
    whole_proteins = {
        prot.id: str(prot.seq).replace('L', 'I')
        for prot in fasta.parse_fasta(protein_fasta)
    }
    tryptic_residues = {'K', 'R'}
    for element in elements:
        seq_matches_protein = False
        element_seqs = get_seqs_from_element(element, seqtype, ns, deamidation)
        element_prots = {
            seq: [(protid, pos) for protid, pos in
                  lookup.get_protein_from_pep(seq[:minpeplen])]
            for seq in element_seqs
        }
        for pepseq, proteins in element_prots.items():
            for prot_id, pos in proteins:
                protseq = whole_proteins[prot_id]
                if pepseq not in protseq:
                    continue
                # Without tryptic enforcement any substring hit counts.
                # Otherwise accept a hit at the protein start, or one where
                # both the peptide end and the preceding protein residue
                # are K/R.
                if (not enforce_tryp or pos == 0 or
                        {pepseq[-1], protseq[pos - 1]} <= tryptic_residues):
                    seq_matches_protein = True
                    break
            if seq_matches_protein:
                # A single hit is enough to discard the element.
                break
        if seq_matches_protein:
            formatting.clear_el(element)
        else:
            yield formatting.string_and_clear(element, ns)
def reassign_elements(elements, stats, ns):
    """Yield stringified elements with q-value and PEP replaced.

    For every element, the text of its first ``xmlns:svm_score`` child is
    parsed as a float and rounded to 5 decimals. That score is passed with
    ``stats`` to ``lookup_statistic``, which returns a
    ``(q_value, pep, warning)`` triple. A warning that is not None is written
    to stdout. The text of the first ``xmlns:q_value`` and ``xmlns:pep``
    children is then overwritten with the returned values.
    """
    for element in elements:
        score_text = element.xpath('xmlns:svm_score', namespaces=ns)[0].text
        svm_score = round(float(score_text), 5)
        qvalue_node = element.xpath('xmlns:q_value', namespaces=ns)[0]
        pep_node = element.xpath('xmlns:pep', namespaces=ns)[0]
        new_qvalue, new_pep, warning = lookup_statistic(svm_score, stats)
        if warning is not None:
            sys.stdout.write(warning)
        qvalue_node.text = new_qvalue
        pep_node.text = new_pep
        yield formatting.string_and_clear(element, ns)
def filter_peptide_length(features, elementtype, ns, minlen=0, maxlen=None):
    """Yield stringified features whose sequence length is within bounds.

    ``minlen`` and ``maxlen`` are coerced with ``int``; a ``maxlen`` of None
    means no upper bound. The sequence is obtained with ``get_either_seq``
    and passed through ``strip_modifications`` before its length is
    measured. Both bounds are inclusive. Features outside the bounds are
    cleared and not yielded.
    """
    shortest = int(minlen)
    longest = float('inf') if maxlen is None else int(maxlen)
    for feature in features:
        bare_seq = strip_modifications(
            get_either_seq(elementtype, feature, ns))
        if not shortest <= len(bare_seq) <= longest:
            formatting.clear_el(feature)
            continue
        yield formatting.string_and_clear(feature, ns)
def protein_header_split_generator(elements, headers, ns):
    """Yield stringified elements whose proteins all match a header pattern.

    Each ``protein_id`` child of an element (in the ``ns['xmlns']``
    namespace) has its text searched with every regex in ``headers``. As
    soon as one protein matches none of the patterns, the element is cleared
    and skipped. Elements where every protein matches at least one pattern
    are yielded.
    """
    for element in elements:
        protein_nodes = element.findall('{%s}protein_id' % ns['xmlns'])
        every_protein_matches = all(
            any(re.search(pattern, node.text) for pattern in headers)
            for node in protein_nodes
        )
        if every_protein_matches:
            yield formatting.string_and_clear(element, ns)
        else:
            formatting.clear_el(element)
def merge_peptides(fns, ns):
    """Yield stringified peptides with their PSM ids replaced from a merged map.

    The peptides of all files in ``fns`` are read once to build a
    sequence -> PSM ids map with ``create_merge_psm_map``. They are then
    read a second time, and for each peptide the existing PSM id container
    is emptied and refilled with one ``psm_id`` child per id stored for that
    peptide's sequence.
    """
    psm_ids_by_seq = create_merge_psm_map(
        reader.generate_peptides_multiple_fractions(fns, ns), ns)
    for peptide in reader.generate_peptides_multiple_fractions(fns, ns):
        sequence = reader.get_peptide_seq(peptide, ns)
        id_container = reader.get_psm_ids_from_peptide(peptide, ns)
        # Drop the ids currently on the peptide before adding the merged ones.
        id_container.clear()
        for merged_id in psm_ids_by_seq[sequence]:
            id_node = etree.SubElement(id_container, 'psm_id')
            id_node.text = merged_id
        yield formatting.string_and_clear(peptide, ns)
def filter_known_searchspace(elements, seqtype, lookup, ns, ntermwildcards,
                             deamidation):
    """Yield stringified elements that have no sequence in the known lookup.

    Every sequence returned by ``get_seqs_from_element`` is checked with
    ``lookup.check_seq_exists(seq, ntermwildcards)``. Checking stops at the
    first sequence that exists. Such elements are cleared and not yielded;
    all others are yielded.
    """
    for element in elements:
        candidate_seqs = get_seqs_from_element(element, seqtype, ns,
                                               deamidation)
        found_in_lookup = any(
            lookup.check_seq_exists(candidate, ntermwildcards)
            for candidate in candidate_seqs
        )
        if not found_in_lookup:
            yield formatting.string_and_clear(element, ns)
        else:
            formatting.clear_el(element)
def protein_header_split_generator(elements, ns, can_headers, headers):
    """Yield stringified elements selected by protein header patterns.

    The ``protein_id`` children of each element (in the ``ns['xmlns']``
    namespace) are scanned in order. Scanning stops at the first protein
    whose text matches any regex in ``can_headers`` (a canonical match).
    Proteins seen before that point are also tested against ``headers``.

    An element is yielded when either:

    * no canonical match was found and at least one scanned protein matched
      ``headers``, or
    * a canonical match was found and ``headers == can_headers``.

    All other elements are cleared and not yielded.
    """
    for element in elements:
        matches_headers = False
        matches_canonical = False
        for node in element.findall('{%s}protein_id' % ns['xmlns']):
            if any(re.search(pattern, node.text) for pattern in can_headers):
                # Stop as soon as a canonical match is found.
                matches_canonical = True
                break
            # For classes other than the canonical one, a single protein
            # matching the requested headers is enough; elements that also
            # match a canonical protein are rejected below.
            if any(re.search(pattern, node.text) for pattern in headers):
                matches_headers = True
        if matches_canonical:
            keep = headers == can_headers
        else:
            keep = matches_headers
        if keep:
            yield formatting.string_and_clear(element, ns)
        else:
            formatting.clear_el(element)
def generate_tags_multiple_files_strings(input_files, ns, tag, ignore_tags):
    """Yield the stringified form of each element with ``tag``.

    Elements are produced by ``generate_tags_multiple_files`` for the given
    input files and each is passed through ``formatting.string_and_clear``.
    """
    tagged_elements = generate_tags_multiple_files(
        input_files, tag, ignore_tags, ns)
    for element in tagged_elements:
        yield formatting.string_and_clear(element, ns)
def target_decoy_generator(element_generator, decoy, ns):
    """Yield stringified elements whose ``decoy`` attribute equals ``decoy``.

    The attribute is read under the ``ns['xmlns']`` namespace. Elements with
    a different value are cleared and not yielded.
    """
    for element in element_generator:
        decoy_key = '{%s}decoy' % ns['xmlns']
        is_requested_type = element.attrib[decoy_key] == decoy
        if not is_requested_type:
            formatting.clear_el(element)
            continue
        yield formatting.string_and_clear(element, ns)