Ejemplo n.º 1
0
def filter_whole_proteins(elements, protein_fasta, lookup, seqtype, ns,
                          deamidation, minpeplen, enforce_tryp):
    """Yield elements whose sequences do not occur inside a whole protein.

    Protein sequences are read from ``protein_fasta`` via
    ``fasta.parse_fasta`` with every ``L`` replaced by ``I``. For each
    element, every sequence returned by ``get_seqs_from_element`` is looked
    up by its first ``minpeplen`` residues with
    ``lookup.get_protein_from_pep``, which is unpacked here as
    (protein id, position) pairs.

    An element is considered matching when one of its sequences is a
    substring of a candidate protein and, if ``enforce_tryp`` is truthy,
    the position is 0 or both the last peptide residue and the protein
    residue just before the position are K or R. Matching elements are
    handed to ``formatting.clear_el`` and not yielded; all others are
    yielded as the result of ``formatting.string_and_clear(element, ns)``.
    """
    # Key on the I/L-collapsed sequence first, then flip to id -> sequence.
    # NOTE(review): keying on sequence first means proteins that share an
    # identical sequence keep only one id; an id from the lookup that was
    # dropped this way would raise KeyError below -- confirm intended.
    id_by_seq = {}
    for record in fasta.parse_fasta(protein_fasta):
        id_by_seq[str(record.seq).replace('L', 'I')] = record.id
    seq_by_id = {}
    for protein_seq, protein_id in id_by_seq.items():
        seq_by_id[protein_id] = protein_seq

    for element in elements:
        # Materialize all lookup results for this element before scanning.
        candidates = {}
        for seq in get_seqs_from_element(element, seqtype, ns, deamidation):
            candidates[seq] = [
                (protid, pos) for protid, pos in
                lookup.get_protein_from_pep(seq[:minpeplen])]

        matched = False
        for peptide, hits in candidates.items():
            for protein_id, position in hits:
                protein_seq = seq_by_id[protein_id]
                if peptide not in protein_seq:
                    continue
                # Without tryptic enforcement any substring hit counts.
                # Otherwise the peptide must be N-terminal (position 0) or
                # tryptic on both ends (K/R before it and as last residue).
                if (not enforce_tryp or position == 0 or
                        {peptide[-1],
                         protein_seq[position - 1]} <= {'K', 'R'}):
                    matched = True
                    break

        if not matched:
            yield formatting.string_and_clear(element, ns)
        else:
            formatting.clear_el(element)
Ejemplo n.º 2
0
def filter_whole_proteins(elements, protein_fasta, lookup, seqtype, ns,
                          deamidation, minpeplen, enforce_tryp):
    """Drop elements that match a whole-protein sequence, yield the rest.

    Proteins come from ``fasta.parse_fasta(protein_fasta)``; their
    sequences are compared with ``L`` replaced by ``I``. The sequences of
    each element (from ``get_seqs_from_element``) are looked up through
    ``lookup.get_protein_from_pep`` using their first ``minpeplen``
    residues; results are unpacked as (protein id, position) pairs.

    A hit requires the element sequence to be contained in the protein
    sequence. When ``enforce_tryp`` is truthy the hit additionally needs
    position 0, or K/R as both the peptide's last residue and the protein
    residue preceding the position. Elements with a hit are passed to
    ``formatting.clear_el``; the others are yielded via
    ``formatting.string_and_clear(element, ns)``.
    """
    # Built sequence -> id first and then inverted to id -> sequence.
    # NOTE(review): identical protein sequences collapse to a single id in
    # the first step, so some ids may be absent here -- verify with caller.
    seq_to_id = dict(
        (str(entry.seq).replace('L', 'I'), entry.id)
        for entry in fasta.parse_fasta(protein_fasta))
    protein_seqs = dict((pid, pseq) for pseq, pid in seq_to_id.items())

    for element in elements:
        peptides = get_seqs_from_element(element, seqtype, ns, deamidation)
        # All lookups for the element are resolved into lists up front.
        hits_by_peptide = dict(
            (pep, [(pid, start) for pid, start in
                   lookup.get_protein_from_pep(pep[:minpeplen])])
            for pep in peptides)

        found = False
        for pep, hits in hits_by_peptide.items():
            for pid, start in hits:
                pseq = protein_seqs[pid]
                if pep in pseq:
                    if not enforce_tryp:
                        found = True
                        break
                    # N-terminal peptide, or tryptic on both ends.
                    if start == 0 or all(
                            aa in ('K', 'R')
                            for aa in (pep[-1], pseq[start - 1])):
                        found = True
                        break

        if found:
            formatting.clear_el(element)
            continue
        yield formatting.string_and_clear(element, ns)
Ejemplo n.º 3
0
def reassign_elements(elements, stats, ns):
    """Replace q-value and PEP text of each element, then yield it as string.

    For every element the text of its first ``xmlns:svm_score`` child is
    converted to a float rounded to 5 decimals and passed, together with
    ``stats``, to ``lookup_statistic``. That call returns a new q-value, a
    new PEP and an optional warning. A non-None warning is written to
    stdout. The new values are assigned to the text of the first
    ``xmlns:q_value`` and ``xmlns:pep`` children before the element is
    yielded through ``formatting.string_and_clear(el, ns)``.
    """
    for element in elements:
        raw_score = element.xpath('xmlns:svm_score', namespaces=ns)[0].text
        score = round(float(raw_score), 5)
        q_node = element.xpath('xmlns:q_value', namespaces=ns)[0]
        pep_node = element.xpath('xmlns:pep', namespaces=ns)[0]
        new_q, new_pep, warning = lookup_statistic(score, stats)
        if warning is not None:
            sys.stdout.write(warning)
        q_node.text = new_q
        pep_node.text = new_pep
        yield formatting.string_and_clear(element, ns)
Ejemplo n.º 4
0
def filter_peptide_length(features, elementtype, ns, minlen=0, maxlen=None):
    """Yield features whose stripped sequence length is within bounds.

    ``minlen`` and ``maxlen`` are converted with ``int``; a ``maxlen`` of
    None means no upper bound. The sequence of each feature is obtained
    with ``get_either_seq`` and passed through ``strip_modifications``
    before its length is checked (bounds are inclusive). Features inside
    the bounds are yielded via ``formatting.string_and_clear(feat, ns)``;
    the rest are passed to ``formatting.clear_el``.
    """
    lower = int(minlen)
    upper = float('inf') if maxlen is None else int(maxlen)
    for feature in features:
        bare_seq = strip_modifications(
            get_either_seq(elementtype, feature, ns))
        if lower <= len(bare_seq) <= upper:
            yield formatting.string_and_clear(feature, ns)
        else:
            formatting.clear_el(feature)
Ejemplo n.º 5
0
def filter_peptide_length(features, elementtype, ns, minlen=0, maxlen=None):
    """Filter features by the length of their modification-stripped sequence.

    Both limits are inclusive and are coerced with ``int``; when ``maxlen``
    is None the upper limit is infinite. Each feature's sequence comes from
    ``get_either_seq(elementtype, feat, ns)`` followed by
    ``strip_modifications``. Features outside the limits are given to
    ``formatting.clear_el``; features inside are yielded as the result of
    ``formatting.string_and_clear(feat, ns)``.
    """
    shortest = int(minlen)
    if maxlen is not None:
        longest = int(maxlen)
    else:
        longest = float('inf')
    for feat in features:
        sequence = get_either_seq(elementtype, feat, ns)
        length = len(strip_modifications(sequence))
        if length < shortest or length > longest:
            # Outside the allowed range: discard without yielding.
            formatting.clear_el(feat)
            continue
        yield formatting.string_and_clear(feat, ns)
Ejemplo n.º 6
0
def protein_header_split_generator(elements, headers, ns):
    """Yield only elements whose proteins all match one of ``headers``.

    Every ``protein_id`` child (in the ``ns['xmlns']`` namespace) of an
    element has its text searched with each pattern in ``headers`` using
    ``re.search``. As soon as one protein matches none of the patterns the
    element is discarded with ``formatting.clear_el``; otherwise it is
    yielded via ``formatting.string_and_clear(el, ns)``.
    """
    for element in elements:
        # Checking stops at the first protein without any matching header.
        every_protein_matches = all(
            any(re.search(pattern, protein.text) for pattern in headers)
            for protein in element.findall('{%s}protein_id' % ns['xmlns']))
        if every_protein_matches:
            yield formatting.string_and_clear(element, ns)
        else:
            formatting.clear_el(element)
Ejemplo n.º 7
0
def protein_header_split_generator(elements, headers, ns):
    """Keep PSMs/peptides for which every protein matches a header pattern.

    For each element, the text of every ``protein_id`` child (namespace
    ``ns['xmlns']``) is tested against the patterns in ``headers`` with
    ``re.search``. The first protein that matches no pattern causes the
    element to be passed to ``formatting.clear_el`` and skipped. Elements
    without such a protein are yielded as the result of
    ``formatting.string_and_clear(el, ns)``.
    """
    for el in elements:
        for protein in el.findall('{%s}protein_id' % ns['xmlns']):
            matches_some_header = any(
                re.search(header, protein.text) for header in headers)
            if not matches_some_header:
                # One unmatched protein is enough to discard the element.
                formatting.clear_el(el)
                break
        else:
            yield formatting.string_and_clear(el, ns)
Ejemplo n.º 8
0
def merge_peptides(fns, ns):
    """Yield peptides from several files with their PSM ids replaced.

    The peptides of ``fns`` are read twice through
    ``reader.generate_peptides_multiple_fractions``: the first pass feeds
    ``create_merge_psm_map`` to build a map indexed by peptide sequence,
    the second pass produces the output. For each peptide the element
    returned by ``reader.get_psm_ids_from_peptide`` is cleared and refilled
    with one ``psm_id`` sub-element per entry stored under the peptide's
    sequence. The peptide is then yielded via
    ``formatting.string_and_clear(peptide, ns)``.
    """
    first_pass = reader.generate_peptides_multiple_fractions(fns, ns)
    psms_by_seq = create_merge_psm_map(first_pass, ns)
    second_pass = reader.generate_peptides_multiple_fractions(fns, ns)
    for peptide in second_pass:
        sequence = reader.get_peptide_seq(peptide, ns)
        psm_container = reader.get_psm_ids_from_peptide(peptide, ns)
        # Drop the ids currently on the peptide; the stored ones replace them.
        psm_container.clear()
        for stored_id in psms_by_seq[sequence]:
            psm_node = etree.SubElement(psm_container, 'psm_id')
            psm_node.text = stored_id
        yield formatting.string_and_clear(peptide, ns)
Ejemplo n.º 9
0
def merge_peptides(fns, ns):
    """Re-map PSM ids onto peptides read from multiple files.

    A sequence-indexed PSM map is first built by ``create_merge_psm_map``
    from all peptides in ``fns``. The files are then read again and, for
    every peptide, the PSM id element obtained from
    ``reader.get_psm_ids_from_peptide`` is emptied and repopulated with a
    ``psm_id`` child for each id found in the map under that peptide's
    sequence. Each peptide is yielded as the result of
    ``formatting.string_and_clear(peptide, ns)``.
    """
    psmmap = create_merge_psm_map(
        reader.generate_peptides_multiple_fractions(fns, ns), ns)
    for pep_el in reader.generate_peptides_multiple_fractions(fns, ns):
        pep_seq = reader.get_peptide_seq(pep_el, ns)
        id_parent = reader.get_psm_ids_from_peptide(pep_el, ns)
        # Existing ids are removed before the mapped ones are attached.
        id_parent.clear()
        for mapped_id in psmmap[pep_seq]:
            etree.SubElement(id_parent, 'psm_id').text = mapped_id
        yield formatting.string_and_clear(pep_el, ns)
Ejemplo n.º 10
0
def filter_known_searchspace(elements, seqtype, lookup, ns, ntermwildcards,
                             deamidation):
    """Yield elements none of whose sequences exist in the lookup.

    Each sequence returned by ``get_seqs_from_element`` for an element is
    checked with ``lookup.check_seq_exists(seq, ntermwildcards)``; checking
    stops at the first sequence that exists. Elements with a known
    sequence are passed to ``formatting.clear_el``; all others are yielded
    via ``formatting.string_and_clear(element, ns)``. Useful for excluding
    peptides that are found in e.g. ENSEMBL or similar.
    """
    for element in elements:
        known = any(
            lookup.check_seq_exists(candidate, ntermwildcards)
            for candidate in get_seqs_from_element(element, seqtype, ns,
                                                   deamidation))
        if not known:
            yield formatting.string_and_clear(element, ns)
        else:
            formatting.clear_el(element)
Ejemplo n.º 11
0
def filter_known_searchspace(elements, seqtype, lookup, ns, ntermwildcards,
                             deamidation):
    """Discard elements that have a sequence in the known search space.

    For every element the sequences from ``get_seqs_from_element`` are
    tested one by one with ``lookup.check_seq_exists(seq, ntermwildcards)``.
    On the first existing sequence the element is handed to
    ``formatting.clear_el`` and skipped. Elements for which no sequence
    exists are yielded as the result of
    ``formatting.string_and_clear(element, ns)``. Useful for excluding
    peptides that are found in e.g. ENSEMBL or similar.
    """
    for element in elements:
        sequences = get_seqs_from_element(element, seqtype, ns, deamidation)
        for sequence in sequences:
            if lookup.check_seq_exists(sequence, ntermwildcards):
                # Known sequence found: discard and stop checking.
                formatting.clear_el(element)
                break
        else:
            yield formatting.string_and_clear(element, ns)
Ejemplo n.º 12
0
def protein_header_split_generator(elements, ns, can_headers, headers):
    """Yield elements selected by header patterns, excluding canonical hits.

    The ``protein_id`` children (namespace ``ns['xmlns']``) of each element
    are scanned in order. A protein whose text matches any pattern in
    ``can_headers`` marks the element as canonical and ends the scan.
    Proteins seen before that which match any pattern in ``headers`` mark
    the element as header-matching.

    An element is yielded via ``formatting.string_and_clear(el, ns)`` when
    either:
      * no canonical match was found and at least one protein matched
        ``headers``, or
      * a canonical match was found and ``headers == can_headers``.
    Every other element is passed to ``formatting.clear_el``.
    """
    for element in elements:
        has_canonical = False
        has_wanted = False
        for protein in element.findall('{%s}protein_id' % ns['xmlns']):
            if any(re.search(pat, protein.text) for pat in can_headers):
                # As soon as a canonical match is found, stop scanning.
                has_canonical = True
                break
            # For classes other than known: note whether at least one
            # protein matches the specified headers. Elements that also
            # match canonical proteins will not be used.
            if any(re.search(pat, protein.text) for pat in headers):
                has_wanted = True
        if has_canonical:
            # Canonical elements are kept only when canonical is requested.
            keep = headers == can_headers
        else:
            keep = has_wanted
        if keep:
            yield formatting.string_and_clear(element, ns)
        else:
            formatting.clear_el(element)
Ejemplo n.º 13
0
def generate_tags_multiple_files_strings(input_files, ns, tag, ignore_tags):
    """
    Yields, for every element produced by generate_tags_multiple_files
    for the given tag, the result of formatting.string_and_clear.
    """
    tagged_elements = generate_tags_multiple_files(input_files, tag,
                                                   ignore_tags, ns)
    for element in tagged_elements:
        xml_string = formatting.string_and_clear(element, ns)
        yield xml_string
Ejemplo n.º 14
0
def target_decoy_generator(element_generator, decoy, ns):
    """Yield elements whose namespaced ``decoy`` attribute equals ``decoy``.

    Matching elements are yielded via ``formatting.string_and_clear(el, ns)``;
    non-matching ones are passed to ``formatting.clear_el``.
    """
    for element in element_generator:
        decoy_flag = element.attrib['{%s}decoy' % ns['xmlns']]
        if decoy_flag != decoy:
            formatting.clear_el(element)
            continue
        yield formatting.string_and_clear(element, ns)
Ejemplo n.º 15
0
def generate_tags_multiple_files_strings(input_files, ns, tag, ignore_tags):
    """
    Passes each element that generate_tags_multiple_files produces for
    the requested tag through formatting.string_and_clear and yields
    the result.
    """
    source = generate_tags_multiple_files(input_files, tag, ignore_tags, ns)
    stringified = (formatting.string_and_clear(item, ns) for item in source)
    for output in stringified:
        yield output
Ejemplo n.º 16
0
def target_decoy_generator(element_generator, decoy, ns):
    """Split elements on their ``decoy`` attribute.

    The attribute is read under the ``ns['xmlns']`` namespace. Elements
    where it equals ``decoy`` are yielded as the result of
    ``formatting.string_and_clear(el, ns)``; the others are handed to
    ``formatting.clear_el`` and skipped.
    """
    for candidate in element_generator:
        wanted = candidate.attrib['{%s}decoy' % ns['xmlns']] == decoy
        if wanted:
            yield formatting.string_and_clear(candidate, ns)
        else:
            formatting.clear_el(candidate)