def parse_scan(scan_elem, nsmap):
    """Parse a scan element into a dict holding its peptide matches.

    The scan's own attributes (as returned by ``parse.parse_attrib``) form
    the result; ``scan['matches']`` receives one dict per ``search_result``
    child, built from that child's first sub-element.

    Each match contains the hit's attributes plus:

    * ``modified_sequence`` -- the ``peptide`` attribute, overridden by
      ``modified_peptide`` when a ``modification_info`` child is present.
    * ``other_seqids`` -- ``protein`` attribute of each
      ``alternative_protein`` child.
    * ``modifications`` -- attributes of each ``mod_aminoacid_mass``
      element, with ``position`` replaced by ``i = position - 1``.
    * the name/value pairs of each ``search_score`` child.
    * attributes and parameters of a ``peptideprophet_result`` found under
      ``analysis_result``, if any.

    A ``search_result`` without children is skipped, and a hit without an
    ``analysis_result`` child is accepted as-is.

    Args:
        scan_elem: XML element describing one scan.
        nsmap: namespace mapping forwarded to ``parse.fixtag``.

    Returns:
        The scan dict described above.
    """
    def tag(tag_id):
        # Qualified tag name for the default namespace.
        return parse.fixtag('', tag_id, nsmap)

    scan = parse.parse_attrib(scan_elem)
    scan['matches'] = []
    peptideprophet_tag = tag('peptideprophet_result')

    for search_elem in scan_elem.findall(tag('search_result')):
        if len(search_elem) == 0:
            # No hit recorded for this search; indexing [0] would raise.
            continue
        search_hit_elem = search_elem[0]
        match = parse.parse_attrib(search_hit_elem)
        match['modified_sequence'] = match['peptide']

        match['other_seqids'] = [
            alt_protein.attrib['protein']
            for alt_protein in search_hit_elem.findall(
                tag('alternative_protein'))
        ]

        match['modifications'] = []
        for modified_elem in search_hit_elem.findall(tag('modification_info')):
            info = parse.parse_attrib(modified_elem)
            match['modified_sequence'] = info['modified_peptide']
            for modification_elem in modified_elem.findall(
                    tag('mod_aminoacid_mass')):
                modification = parse.parse_attrib(modification_elem)
                # Store a 0-based index instead of the 1-based position;
                # presumably parse_attrib already converted it to a number.
                modification['i'] = modification.pop('position') - 1
                match['modifications'].append(modification)

        for score_elem in search_hit_elem.findall(tag('search_score')):
            match.update(parse.parse_name_value(score_elem))

        # find() returns None when the hit carries no analysis_result;
        # iterating None would raise TypeError.
        analysis_result_elem = search_hit_elem.find(tag('analysis_result'))
        if analysis_result_elem is not None:
            for analysis_elem in analysis_result_elem:
                if analysis_elem.tag != peptideprophet_tag:
                    continue
                match.update(parse.parse_attrib(analysis_elem))
                # Parameters live under the result's first child.
                for param_elem in analysis_elem[0]:
                    match.update(parse.parse_name_value(param_elem))

        scan['matches'].append(match)
    return scan
def parse_protein_group(elem, nsmap):
    """Parse a protein-group element into a dict with its proteins.

    The group's attributes (via ``parse.parse_attrib``) form the result;
    ``group['proteins']`` receives one dict per ``protein`` child.

    Each protein dict contains the protein's attributes plus:

    * ``group_number`` -- copied from the group.
    * ``description`` -- ``protein_description`` of the ``annotation``
      child, only when that child exists.
    * ``other_seqids`` -- ``protein_name`` of each
      ``indistinguishable_protein`` child.
    * ``peptides`` -- one dict per ``peptide`` child, each with
      ``modified_sequence`` (``peptide_sequence`` unless a
      ``modification_info`` child supplies ``modified_peptide``) and
      ``modifications`` (attributes of each ``mod_aminoacid_mass``).

    Args:
        elem: XML element of the protein group.
        nsmap: namespace mapping forwarded to ``parse.fixtag``.

    Returns:
        The group dict described above.
    """
    def tag(tag_id):
        # Qualified tag name for the default namespace.
        return parse.fixtag('', tag_id, nsmap)

    group = parse.parse_attrib(elem)
    group['proteins'] = []
    for protein_elem in elem.findall(tag('protein')):
        protein = parse.parse_attrib(protein_elem)
        protein['group_number'] = group['group_number']

        annotation_elem = protein_elem.find(tag('annotation'))
        if annotation_elem is not None:
            protein['description'] = \
                annotation_elem.attrib['protein_description']

        protein['other_seqids'] = [
            alt_protein.attrib['protein_name']
            for alt_protein in protein_elem.findall(
                tag('indistinguishable_protein'))
        ]

        protein['peptides'] = []
        for peptide_elem in protein_elem.findall(tag('peptide')):
            peptide = parse.parse_attrib(peptide_elem)
            peptide['modifications'] = []
            # Default to the plain sequence; overridden below if modified.
            peptide['modified_sequence'] = peptide['peptide_sequence']
            for modified_elem in peptide_elem.findall(
                    tag('modification_info')):
                info = parse.parse_attrib(modified_elem)
                peptide['modified_sequence'] = info['modified_peptide']
                for modification_elem in modified_elem.findall(
                        tag('mod_aminoacid_mass')):
                    peptide['modifications'].append(
                        parse.parse_attrib(modification_elem))
            protein['peptides'].append(peptide)

        group['proteins'].append(protein)
    return group
def parse_scan(group, nsmap):
    """Parse a scan ``group`` element into a scan dict and its sequences.

    The scan dict starts from the group's attributes (via
    ``parse.parse_attrib``) and gains:

    * ``matches`` -- one dict per ``protein`` child, holding ``seqid``
      (first word of the ``label`` attribute) plus the attributes of the
      ``peptide/domain`` element.
    * ``Description`` -- stripped text of the ``note`` element inside the
      ``fragment ion mass spectrum`` sub-group, or ``''`` if there is none.
    * ``masses``, ``intensities``, ``charge``, ``mass`` -- stripped text of
      the corresponding GAML elements of that sub-group.

    Args:
        group: XML element of one scan.
        nsmap: namespace mapping used to resolve the ``GAML:`` prefix.

    Returns:
        ``(scan, fastas)`` where ``fastas`` maps each seqid seen in this
        scan to ``{'description': ..., 'sequence': ...}``.
    """
    scan = {}
    fastas = {}
    scan.update(parse.parse_attrib(group))
    scan['matches'] = []
    for protein in group.findall('protein'):
        words = protein.attrib['label'].split()
        seqid = words[0]
        description = ' '.join(words[1:])
        peptide_elem = protein.find('peptide')
        match = {'seqid': seqid}
        if seqid not in fastas:
            # Record each sequence only the first time its seqid appears.
            fastas[seqid] = {
                'description': description,
                'sequence': strip_whitespace(peptide_elem.text),
            }
        domain_elem = peptide_elem.find('domain')
        match.update(parse.parse_attrib(domain_elem))
        scan['matches'].append(match)

    elem = group.find('group[@label="fragment ion mass spectrum"]')
    note = elem.find('note')
    # Compare with None explicitly: an Element without child elements is
    # falsy, so a plain truth test would drop a text-only <note>.
    if note is not None:
        scan['Description'] = note.text.strip()
    else:
        scan['Description'] = ''

    masses_elem = elem.find(
        'GAML:trace/GAML:Xdata/GAML:values', namespaces=nsmap)
    scan['masses'] = masses_elem.text.strip()

    intensities_elem = elem.find(
        'GAML:trace/GAML:Ydata/GAML:values', namespaces=nsmap)
    scan['intensities'] = intensities_elem.text.strip()

    charge_elem = elem.find(
        'GAML:trace/GAML:attribute[@type="charge"]', namespaces=nsmap)
    scan['charge'] = charge_elem.text.strip()

    mass_elem = elem.find(
        'GAML:trace/GAML:attribute[@type="M+H"]', namespaces=nsmap)
    scan['mass'] = mass_elem.text.strip()

    return scan, fastas
def parse_scan(group, nsmap):
    """Parse a scan ``group`` element into a scan dict and its sequences.

    The scan dict starts from the group's attributes (via
    ``parse.parse_attrib``) and gains:

    * ``matches`` -- one dict per ``protein`` child, holding ``seqid``
      (first word of the ``label`` attribute) plus the attributes of the
      ``peptide/domain`` element.
    * ``Description`` -- stripped text of the ``note`` element inside the
      ``fragment ion mass spectrum`` sub-group, or ``''`` if there is none.
    * ``masses``, ``intensities``, ``charge``, ``mass`` -- stripped text of
      the corresponding GAML elements of that sub-group.

    Args:
        group: XML element of one scan.
        nsmap: namespace mapping used to resolve the ``GAML:`` prefix.

    Returns:
        ``(scan, fastas)`` where ``fastas`` maps each seqid seen in this
        scan to ``{'description': ..., 'sequence': ...}``.
    """
    # Scan key -> path of the GAML element whose text supplies it.
    spectrum_fields = (
        ('masses', 'GAML:trace/GAML:Xdata/GAML:values'),
        ('intensities', 'GAML:trace/GAML:Ydata/GAML:values'),
        ('charge', 'GAML:trace/GAML:attribute[@type="charge"]'),
        ('mass', 'GAML:trace/GAML:attribute[@type="M+H"]'),
    )

    scan = {}
    fastas = {}
    scan.update(parse.parse_attrib(group))
    scan['matches'] = []
    for protein in group.findall('protein'):
        words = protein.attrib['label'].split()
        seqid = words[0]
        description = ' '.join(words[1:])
        peptide_elem = protein.find('peptide')
        match = {'seqid': seqid}
        if seqid not in fastas:
            # Record each sequence only the first time its seqid appears.
            fastas[seqid] = {
                'description': description,
                'sequence': strip_whitespace(peptide_elem.text),
            }
        match.update(parse.parse_attrib(peptide_elem.find('domain')))
        scan['matches'].append(match)

    elem = group.find('group[@label="fragment ion mass spectrum"]')
    note = elem.find('note')
    # ``is not None`` rather than a truth test: an Element with no child
    # elements is falsy, which would wrongly discard a text-only <note>.
    scan['Description'] = note.text.strip() if note is not None else ''

    for key, path in spectrum_fields:
        scan[key] = elem.find(path, namespaces=nsmap).text.strip()

    return scan, fastas
def parse_scan(top_elem, nsmap):
    """Build a scan dict from a scan element and its protein matches.

    The result starts from the attributes of ``top_elem`` (via
    ``parse.parse_attrib``) and is extended with:

    * ``matches`` -- one dict per ``protein`` child with ``seqid`` and
      ``description`` (split from the ``label`` attribute), ``sequence``
      (whitespace-stripped peptide text), ``modifications`` (attributes of
      each ``aa`` element under ``peptide/domain``) and the attributes of
      the ``domain`` element itself.
    * ``Description`` -- stripped ``note`` text of the
      ``fragment ion mass spectrum`` sub-group, or ``''`` when absent.
    * ``masses``, ``intensities``, ``charge``, ``mass`` -- stripped text of
      the matching GAML elements.

    Args:
        top_elem: XML element of one scan.
        nsmap: namespace mapping used to resolve the ``GAML:`` prefix.

    Returns:
        The scan dict.
    """
    scan = {}
    scan.update(parse.parse_attrib(top_elem))
    matches = scan['matches'] = []

    for protein_elem in top_elem.findall('protein'):
        label_words = protein_elem.attrib['label'].split()
        seqid = label_words[0]
        description = ' '.join(label_words[1:])
        peptide = protein_elem.find('peptide')
        entry = {
            'seqid': seqid,
            'sequence': strip_whitespace(peptide.text),
            'description': description,
            'modifications': [],
        }
        domain = peptide.find('domain')
        for aa_elem in domain.findall('aa'):
            entry['modifications'].append(parse.parse_attrib(aa_elem))
        # Domain attributes are merged last, on top of the keys set above.
        entry.update(parse.parse_attrib(domain))
        matches.append(entry)

    spectrum = top_elem.find('group[@label="fragment ion mass spectrum"]')
    note = spectrum.find('note')
    scan['Description'] = '' if note is None else note.text.strip()

    # Scan key -> path of the GAML element whose text supplies it.
    gaml_paths = (
        ('masses', 'GAML:trace/GAML:Xdata/GAML:values'),
        ('intensities', 'GAML:trace/GAML:Ydata/GAML:values'),
        ('charge', 'GAML:trace/GAML:attribute[@type="charge"]'),
        ('mass', 'GAML:trace/GAML:attribute[@type="M+H"]'),
    )
    for key, path in gaml_paths:
        scan[key] = spectrum.find(path, namespaces=nsmap).text.strip()

    return scan
def parse_protein_group(elem, nsmap):
    """Parse a protein-group element into a dict with its proteins.

    The group's attributes (via ``parse.parse_attrib``) form the result;
    ``group['proteins']`` receives one dict per ``protein`` child.

    Each protein dict contains the protein's attributes plus:

    * ``group_number`` -- copied from the group.
    * one key per ``parameter`` child, mapping its ``name`` attribute to
      its raw ``value`` attribute.
    * ``description`` -- ``protein_description`` of the ``annotation``
      child, only when that child exists.
    * ``other_seqids`` -- ``protein_name`` of each
      ``indistinguishable_protein`` child.
    * ``peptides`` -- one dict per ``peptide`` child, each with
      ``modified_sequence`` (``peptide_sequence`` unless a
      ``modification_info`` child supplies ``modified_peptide``) and
      ``modifications`` (attributes of each ``mod_aminoacid_mass``).

    Args:
        elem: XML element of the protein group.
        nsmap: namespace mapping forwarded to ``parse.fixtag``.

    Returns:
        The group dict described above.
    """
    def tag(tag_id):
        # Qualified tag name for the default namespace.
        return parse.fixtag('', tag_id, nsmap)

    group = parse.parse_attrib(elem)
    group['proteins'] = []
    for protein_elem in elem.findall(tag('protein')):
        protein = parse.parse_attrib(protein_elem)
        protein['group_number'] = group['group_number']

        for parameter_elem in protein_elem.findall(tag('parameter')):
            key = parameter_elem.attrib['name']
            val = parameter_elem.attrib['value']
            protein[key] = val

        annotation_elem = protein_elem.find(tag('annotation'))
        if annotation_elem is not None:
            protein['description'] = \
                annotation_elem.attrib['protein_description']

        protein['other_seqids'] = [
            alt_protein.attrib['protein_name']
            for alt_protein in protein_elem.findall(
                tag('indistinguishable_protein'))
        ]

        protein['peptides'] = []
        for peptide_elem in protein_elem.findall(tag('peptide')):
            peptide = parse.parse_attrib(peptide_elem)
            peptide['modifications'] = []
            # Default to the plain sequence; overridden below if modified.
            peptide['modified_sequence'] = peptide['peptide_sequence']
            for modified_elem in peptide_elem.findall(
                    tag('modification_info')):
                info = parse.parse_attrib(modified_elem)
                peptide['modified_sequence'] = info['modified_peptide']
                for modification_elem in modified_elem.findall(
                        tag('mod_aminoacid_mass')):
                    peptide['modifications'].append(
                        parse.parse_attrib(modification_elem))
            protein['peptides'].append(peptide)

        group['proteins'].append(protein)
    return group
def parse_protein_probabilities(elem, nsmap):
    """Collect the error/probability pairs listed under ``elem``.

    Every ``protein_summary_data_filter`` child contributes one dict with
    ``error`` (its ``false_positive_error_rate``) and ``prob`` (its
    ``min_probability``), both as returned by ``parse.parse_attrib``.

    Args:
        elem: XML element containing the data-filter children.
        nsmap: namespace mapping forwarded to ``parse.fixtag``.

    Returns:
        List of ``{'error': ..., 'prob': ...}`` dicts sorted by ``error``.
    """
    def as_point(filter_elem):
        fields = parse.parse_attrib(filter_elem)
        return {
            'error': fields['false_positive_error_rate'],
            'prob': fields['min_probability'],
        }

    filter_tag = parse.fixtag('', 'protein_summary_data_filter', nsmap)
    return sorted(
        map(as_point, elem.findall(filter_tag)),
        key=lambda point: point['error'])
def parse_protein_probabilities(elem, nsmap):
    """Return the error-rate table found under ``elem``, sorted by error.

    One entry is produced per ``protein_summary_data_filter`` child:
    ``error`` comes from ``false_positive_error_rate`` and ``prob`` from
    ``min_probability``, using the values ``parse.parse_attrib`` yields.

    Args:
        elem: XML element containing the data-filter children.
        nsmap: namespace mapping forwarded to ``parse.fixtag``.

    Returns:
        List of ``{'error': ..., 'prob': ...}`` dicts in ascending
        ``error`` order.
    """
    table = []
    wanted = parse.fixtag('', 'protein_summary_data_filter', nsmap)
    for filter_elem in elem.findall(wanted):
        values = parse.parse_attrib(filter_elem)
        entry = dict(
            error=values['false_positive_error_rate'],
            prob=values['min_probability'],
        )
        table.append(entry)
    return sorted(table, key=lambda entry: entry['error'])
def parse_protein_probabilities(self, elem):
    """Fill ``self.distribution`` with the error-rate table under ``elem``.

    ``self.distribution`` is reset to an empty list, then receives one
    ``{'error': ..., 'prob': ...}`` dict per ``protein_summary_data_filter``
    element returned by ``self.findall``, and is finally sorted in place
    by ``error``.

    Args:
        elem: XML element containing the data-filter children.

    Returns:
        None; the result is stored on ``self.distribution``.
    """
    points = self.distribution = []
    for filter_elem in self.findall(elem, 'protein_summary_data_filter'):
        values = parse.parse_attrib(filter_elem)
        points.append(dict(
            error=values['false_positive_error_rate'],
            prob=values['min_probability'],
        ))
    points.sort(key=lambda point: point['error'])
def parse_scan(scan_elem, nsmap):
    """Parse a scan element into a dict holding its peptide matches.

    The scan's own attributes (as returned by ``parse.parse_attrib``) form
    the result; ``scan['matches']`` receives one dict per ``search_result``
    child, built from that child's first sub-element.

    Each match contains the hit's attributes plus:

    * ``modified_sequence`` -- the ``peptide`` attribute, overridden by
      ``modified_peptide`` when a ``modification_info`` child is present.
    * ``other_seqids`` -- ``protein`` attribute of each
      ``alternative_protein`` child.
    * ``modifications`` -- attributes of each ``mod_aminoacid_mass``
      element, with ``position`` replaced by ``i = position - 1``.
    * the name/value pairs of each ``search_score`` child.
    * attributes and parameters of a ``peptideprophet_result`` found under
      ``analysis_result``, if any.

    A ``search_result`` without children is skipped, and a hit without an
    ``analysis_result`` child is accepted as-is.

    Args:
        scan_elem: XML element describing one scan.
        nsmap: namespace mapping forwarded to ``parse.fixtag``.

    Returns:
        The scan dict described above.
    """
    def tag(tag_id):
        # Qualified tag name for the default namespace.
        return parse.fixtag('', tag_id, nsmap)

    scan = parse.parse_attrib(scan_elem)
    scan['matches'] = []
    peptideprophet_tag = tag('peptideprophet_result')

    for search_elem in scan_elem.findall(tag('search_result')):
        if len(search_elem) == 0:
            # No hit recorded for this search; indexing [0] would raise.
            continue
        search_hit_elem = search_elem[0]
        match = parse.parse_attrib(search_hit_elem)
        match['modified_sequence'] = match['peptide']

        match['other_seqids'] = [
            alt_protein.attrib['protein']
            for alt_protein in search_hit_elem.findall(
                tag('alternative_protein'))
        ]

        match['modifications'] = []
        for modified_elem in search_hit_elem.findall(tag('modification_info')):
            info = parse.parse_attrib(modified_elem)
            match['modified_sequence'] = info['modified_peptide']
            for modification_elem in modified_elem.findall(
                    tag('mod_aminoacid_mass')):
                modification = parse.parse_attrib(modification_elem)
                # Store a 0-based index instead of the 1-based position;
                # presumably parse_attrib already converted it to a number.
                modification['i'] = modification.pop('position') - 1
                match['modifications'].append(modification)

        for score_elem in search_hit_elem.findall(tag('search_score')):
            match.update(parse.parse_name_value(score_elem))

        # find() returns None when the hit carries no analysis_result;
        # iterating None would raise TypeError.
        analysis_result_elem = search_hit_elem.find(tag('analysis_result'))
        if analysis_result_elem is not None:
            for analysis_elem in analysis_result_elem:
                if analysis_elem.tag != peptideprophet_tag:
                    continue
                match.update(parse.parse_attrib(analysis_elem))
                # Parameters live under the result's first child.
                for param_elem in analysis_elem[0]:
                    match.update(parse.parse_name_value(param_elem))

        scan['matches'].append(match)
    return scan
def parse_scan(self, scan_elem):
    """Parse a scan element into a dict holding its peptide matches.

    The scan's own attributes (as returned by ``parse.parse_attrib``) form
    the result; ``scan['matches']`` receives one dict per ``search_result``
    element, built from that element's first child. Element lookups go
    through ``self.findall`` / ``self.find`` and tag names through
    ``self.search_tag``, which are defined elsewhere on the class.

    Each match contains the hit's attributes plus:

    * ``modified_sequence`` -- the ``peptide`` attribute, overridden by
      ``modified_peptide`` when a ``modification_info`` element is present.
    * ``other_seqids`` -- ``protein`` attribute of each
      ``alternative_protein`` element.
    * ``modifications`` -- attributes of each ``mod_aminoacid_mass``
      element, with ``position`` replaced by ``i = position - 1``.
    * the name/value pairs of each ``search_score`` element.
    * attributes and parameters of a ``peptideprophet_result`` found under
      ``analysis_result``, if any.

    A ``search_result`` without children is skipped, and a hit for which
    ``self.find`` yields no ``analysis_result`` is accepted as-is.

    Args:
        scan_elem: XML element describing one scan.

    Returns:
        The scan dict described above.
    """
    scan = parse.parse_attrib(scan_elem)
    scan['matches'] = []
    for search_elem in self.findall(scan_elem, "search_result"):
        if len(search_elem) == 0:
            # No hit recorded for this search; indexing [0] would raise.
            continue
        search_hit_elem = search_elem[0]
        pepxml_match = parse.parse_attrib(search_hit_elem)
        pepxml_match['modified_sequence'] = pepxml_match['peptide']

        pepxml_match['other_seqids'] = [
            alt_protein.attrib['protein']
            for alt_protein in self.findall(
                search_hit_elem, 'alternative_protein')
        ]

        pepxml_match['modifications'] = []
        for modified_elem in self.findall(
                search_hit_elem, 'modification_info'):
            info = parse.parse_attrib(modified_elem)
            pepxml_match['modified_sequence'] = info['modified_peptide']
            for modification_elem in self.findall(
                    modified_elem, 'mod_aminoacid_mass'):
                modification = parse.parse_attrib(modification_elem)
                # Store a 0-based index instead of the 1-based position;
                # presumably parse_attrib already converted it to a number.
                modification['i'] = modification.pop('position') - 1
                pepxml_match['modifications'].append(modification)

        for score_elem in self.findall(search_hit_elem, 'search_score'):
            pepxml_match.update(parse.parse_name_value(score_elem))

        # NOTE(review): presumably self.find returns None when no
        # analysis_result exists (as ElementTree's find does) -- confirm.
        # Iterating None would raise TypeError, hence the guard.
        analysis_result_elem = self.find(search_hit_elem, 'analysis_result')
        if analysis_result_elem is not None:
            for analysis_elem in analysis_result_elem:
                if analysis_elem.tag != self.search_tag(
                        'peptideprophet_result'):
                    continue
                pepxml_match.update(parse.parse_attrib(analysis_elem))
                # Parameters live under the result's first child.
                for param_elem in analysis_elem[0]:
                    pepxml_match.update(parse.parse_name_value(param_elem))

        scan['matches'].append(pepxml_match)
    return scan
def parse_peptide_probabilities(elem, nsmap):
    """Collect the peptide error/probability pairs found under ``elem``.

    ``error_point`` children of ``elem`` are used directly. If there are
    none, the ``error_point`` children of the first ``roc_error_data``
    child whose ``charge`` attribute is ``'all'`` are used instead.

    Each point contributes ``{'error': ..., 'prob': ...}`` taken from its
    ``error`` and ``min_prob`` attributes as ``parse.parse_attrib``
    returns them.

    Args:
        elem: XML element containing the error points.
        nsmap: namespace mapping forwarded to ``parse.fixtag``.

    Returns:
        List of point dicts sorted by ``error``; empty if no points exist.
    """
    error_point_tag = parse.fixtag('', 'error_point', nsmap)

    # try with error_point directly under elem first
    error_points = elem.findall(error_point_tag)
    if not error_points:
        # Fall back to the table that covers all charge states.
        roc_tag = parse.fixtag('', 'roc_error_data', nsmap)
        for charge_elem in elem.findall(roc_tag):
            if charge_elem.attrib['charge'] == 'all':
                error_points = charge_elem.findall(error_point_tag)
                break

    probs = []
    for error_point in error_points:
        attrib = parse.parse_attrib(error_point)
        probs.append({
            'error': attrib['error'],
            'prob': attrib['min_prob'],
        })
    probs.sort(key=lambda d: d['error'])
    return probs
def parse_peptide_probabilities(elem, nsmap):
    """Collect the peptide error/probability pairs found under ``elem``.

    ``error_point`` children of ``elem`` are used directly. If there are
    none, the ``error_point`` children of the first ``roc_error_data``
    child whose ``charge`` attribute is ``'all'`` are used instead.

    Each point contributes ``{'error': ..., 'prob': ...}`` taken from its
    ``error`` and ``min_prob`` attributes as ``parse.parse_attrib``
    returns them.

    Args:
        elem: XML element containing the error points.
        nsmap: namespace mapping forwarded to ``parse.fixtag``.

    Returns:
        List of point dicts sorted by ``error``; empty if no points exist.
    """
    error_point_tag = parse.fixtag('', 'error_point', nsmap)

    # try with error_point directly under elem first
    error_points = elem.findall(error_point_tag)
    if not error_points:
        # Fall back to the table that covers all charge states.
        roc_tag = parse.fixtag('', 'roc_error_data', nsmap)
        for charge_elem in elem.findall(roc_tag):
            if charge_elem.attrib['charge'] == 'all':
                error_points = charge_elem.findall(error_point_tag)
                break

    probs = []
    for error_point in error_points:
        attrib = parse.parse_attrib(error_point)
        probs.append({
            'error': attrib['error'],
            'prob': attrib['min_prob'],
        })
    probs.sort(key=lambda d: d['error'])
    return probs
def parse_peptide_probabilities(self, elem):
    """Fill ``self.distribution`` with the peptide error points of ``elem``.

    ``error_point`` elements found directly via ``self.findall(elem, ...)``
    are used. If there are none, the ``error_point`` elements of the first
    ``roc_error_data`` element whose ``charge`` attribute is ``'all'`` are
    used instead.

    ``self.distribution`` becomes a list of ``{'error': ..., 'prob': ...}``
    dicts (from the ``error`` and ``min_prob`` attributes as
    ``parse.parse_attrib`` returns them), sorted by ``error``. The value
    ``error_to_probability`` returns for an error of 0.01 is then logged.

    Args:
        elem: XML element containing the error points.

    Returns:
        None; the result is stored on ``self.distribution``.
    """
    # try with error_point directly under elem first
    error_points = self.findall(elem, 'error_point')
    if len(error_points) == 0:
        # Fall back to the table that covers all charge states.
        for charge_elem in self.findall(elem, 'roc_error_data'):
            if charge_elem.attrib['charge'] == 'all':
                error_points = self.findall(charge_elem, 'error_point')
                break

    self.distribution = []
    for error_point in error_points:
        attrib = parse.parse_attrib(error_point)
        self.distribution.append({
            'error': attrib['error'],
            'prob': attrib['min_prob'],
        })
    self.distribution.sort(key=lambda d: d['error'])

    peptide_probability = error_to_probability(self.distribution, 0.01)
    # Pass the value as a logging argument so formatting happens only when
    # the record is emitted. NOTE(review): assumes `logger` is a standard
    # logging.Logger -- confirm.
    logger.info(
        'Peptide probability cutoff for 0.01 fpe: %f', peptide_probability)