Esempio n. 1
0
def parse_scan(scan_elem, nsmap):
  """Build a dict describing a scan element and its search hits.

  The attributes of `scan_elem` (as returned by `parse.parse_attrib`)
  form the result; a 'matches' list receives one entry per non-empty
  `search_result` child, taken from that child's first sub-element.

  Args:
    scan_elem: element whose `search_result` children are read. The
      tag names look like pepXML -- TODO confirm against the caller.
    nsmap: namespace map handed to `parse.fixtag` to qualify tag names.

  Returns:
    The scan dict. Each match holds the hit attributes plus
    'modified_sequence', 'other_seqids', 'modifications', the
    search-score name/value pairs and, when the hit has a
    `peptideprophet_result`, its attributes and parameters.
  """
  def tag(tag_id):
    return parse.fixtag('', tag_id, nsmap)

  scan = parse.parse_attrib(scan_elem)
  scan['matches'] = []
  for search_elem in scan_elem.findall(tag('search_result')):
    # A search_result without children has no hit to report; indexing
    # it would raise IndexError.
    if len(search_elem) == 0:
      continue
    search_hit_elem = search_elem[0]
    match = parse.parse_attrib(search_hit_elem)
    # Default to the plain peptide; replaced below when a
    # modification_info element supplies a modified form.
    match['modified_sequence'] = match['peptide']

    match['other_seqids'] = [
      alt_protein.attrib['protein']
      for alt_protein in search_hit_elem.findall(tag('alternative_protein'))
    ]

    match['modifications'] = []
    for modified_elem in search_hit_elem.findall(tag('modification_info')):
      modified_attr = parse.parse_attrib(modified_elem)
      match['modified_sequence'] = modified_attr['modified_peptide']
      for modification_elem in modified_elem.findall(tag('mod_aminoacid_mass')):
        attr = parse.parse_attrib(modification_elem)
        # Replace 'position' by 'i' = position - 1 (assumes parse_attrib
        # already converted 'position' to a number -- TODO confirm).
        attr['i'] = attr.pop('position') - 1
        match['modifications'].append(attr)

    for score_elem in search_hit_elem.findall(tag('search_score')):
      match.update(parse.parse_name_value(score_elem))

    # find() returns None when the hit has no analysis_result child;
    # iterating None would raise TypeError.
    analysis_result = search_hit_elem.find(tag('analysis_result'))
    if analysis_result is not None:
      for analysis_elem in analysis_result:
        if analysis_elem.tag == tag('peptideprophet_result'):
          match.update(parse.parse_attrib(analysis_elem))
          # Parameters are the children of the result's first child.
          for param_elem in analysis_elem[0]:
            match.update(parse.parse_name_value(param_elem))

    scan['matches'].append(match)
  return scan
Esempio n. 2
0
def parse_protein_group(elem, nsmap):
  """Build a dict for a protein group element and the proteins in it.

  Args:
    elem: element whose `protein` children are read. Its parsed
      attributes must contain 'group_number' whenever at least one
      protein is present.
    nsmap: namespace map handed to `parse.fixtag` to qualify tag names.

  Returns:
    The attributes of `elem` (via `parse.parse_attrib`) plus a
    'proteins' list. Each protein dict holds its own attributes,
    'group_number', an optional 'description', 'other_seqids' and a
    'peptides' list; each peptide has 'modifications' and
    'modified_sequence'.
  """
  def tag(tag_id):
    return parse.fixtag('', tag_id, nsmap)

  group = parse.parse_attrib(elem)
  group['proteins'] = []
  for protein_elem in elem.findall(tag('protein')):
    protein = parse.parse_attrib(protein_elem)
    protein['group_number'] = group['group_number']

    annotation_elem = protein_elem.find(tag('annotation'))
    if annotation_elem is not None:
      protein['description'] = annotation_elem.attrib['protein_description']

    protein['other_seqids'] = [
      alt_protein.attrib['protein_name']
      for alt_protein in protein_elem.findall(tag('indistinguishable_protein'))
    ]

    protein['peptides'] = []
    for peptide_elem in protein_elem.findall(tag('peptide')):
      peptide = parse.parse_attrib(peptide_elem)
      protein['peptides'].append(peptide)
      peptide['modifications'] = []
      # Default to the plain sequence; replaced below when a
      # modification_info element supplies a modified form.
      peptide['modified_sequence'] = peptide['peptide_sequence']
      for modified_elem in peptide_elem.findall(tag('modification_info')):
        modified_attr = parse.parse_attrib(modified_elem)
        peptide['modified_sequence'] = modified_attr['modified_peptide']
        for modification_elem in modified_elem.findall(tag('mod_aminoacid_mass')):
          peptide['modifications'].append(
            parse.parse_attrib(modification_elem))

    group['proteins'].append(protein)
  return group
Esempio n. 3
0
def parse_scan(group, nsmap):
  """Extract a scan dict and the proteins it references from `group`.

  Args:
    group: element with `protein` children and a nested
      `group[@label="fragment ion mass spectrum"]` element. The layout
      looks like X!Tandem output -- TODO confirm against the caller.
    nsmap: namespace map used to resolve the `GAML:` prefix in the
      spectrum lookups.

  Returns:
    A `(scan, fastas)` tuple. `scan` holds the attributes of `group`,
    a 'matches' list (one dict per protein with 'seqid' and the
    attributes of its `domain` element), 'Description', 'masses',
    'intensities', 'charge' and 'mass' (all stripped text). `fastas`
    maps each seqid to its 'description' and 'sequence'.
  """
  scan = {}
  fastas = {}
  scan.update(parse.parse_attrib(group))

  scan['matches'] = []
  for protein in group.findall('protein'):
    # The label's first word is the seqid, the rest the description.
    label_words = protein.attrib['label'].split()
    seqid = label_words[0]
    peptide_elem = protein.find('peptide')

    # Record each protein's description and sequence only once.
    if seqid not in fastas:
      fastas[seqid] = {
        'description': ' '.join(label_words[1:]),
        'sequence': strip_whitespace(peptide_elem.text),
      }

    match = {'seqid': seqid}
    match.update(parse.parse_attrib(peptide_elem.find('domain')))
    scan['matches'].append(match)

  elem = group.find('group[@label="fragment ion mass spectrum"]')

  # Compare against None: an Element without children is falsy, so a
  # plain truth test would discard a note that only contains text.
  note = elem.find('note')
  if note is not None:
    scan['Description'] = (note.text or '').strip()
  else:
    scan['Description'] = ''

  masses_elem = elem.find(
      'GAML:trace/GAML:Xdata/GAML:values', namespaces=nsmap)
  scan['masses'] = masses_elem.text.strip()

  intensities_elem = elem.find(
      'GAML:trace/GAML:Ydata/GAML:values', namespaces=nsmap)
  scan['intensities'] = intensities_elem.text.strip()

  charge_elem = elem.find(
      'GAML:trace/GAML:attribute[@type="charge"]', namespaces=nsmap)
  scan['charge'] = charge_elem.text.strip()

  mass_elem = elem.find(
      'GAML:trace/GAML:attribute[@type="M+H"]', namespaces=nsmap)
  scan['mass'] = mass_elem.text.strip()

  return scan, fastas
Esempio n. 4
0
def parse_scan(group, nsmap):
    """Extract a scan dict and the proteins it references from `group`.

    Args:
        group: element with `protein` children and a nested
            `group[@label="fragment ion mass spectrum"]` element. The
            layout looks like X!Tandem output -- TODO confirm.
        nsmap: namespace map used to resolve the `GAML:` prefix in the
            spectrum lookups.

    Returns:
        A `(scan, fastas)` tuple. `scan` holds the attributes of
        `group`, a 'matches' list (one dict per protein with 'seqid'
        and the attributes of its `domain` element), 'Description',
        'masses', 'intensities', 'charge' and 'mass' (all stripped
        text). `fastas` maps each seqid to its 'description' and
        'sequence'.
    """
    scan = {}
    fastas = {}
    scan.update(parse.parse_attrib(group))

    scan['matches'] = []
    for protein in group.findall('protein'):
        # The label's first word is the seqid, the rest the description.
        label_words = protein.attrib['label'].split()
        seqid = label_words[0]
        peptide_elem = protein.find('peptide')

        # Record each protein's description and sequence only once.
        if seqid not in fastas:
            fastas[seqid] = {
                'description': ' '.join(label_words[1:]),
                'sequence': strip_whitespace(peptide_elem.text),
            }

        match = {'seqid': seqid}
        match.update(parse.parse_attrib(peptide_elem.find('domain')))
        scan['matches'].append(match)

    elem = group.find('group[@label="fragment ion mass spectrum"]')

    # Compare against None: an Element without children is falsy, so a
    # plain truth test would discard a note that only contains text.
    note = elem.find('note')
    if note is not None:
        scan['Description'] = (note.text or '').strip()
    else:
        scan['Description'] = ''

    masses_elem = elem.find('GAML:trace/GAML:Xdata/GAML:values',
                            namespaces=nsmap)
    scan['masses'] = masses_elem.text.strip()

    intensities_elem = elem.find('GAML:trace/GAML:Ydata/GAML:values',
                                 namespaces=nsmap)
    scan['intensities'] = intensities_elem.text.strip()

    charge_elem = elem.find('GAML:trace/GAML:attribute[@type="charge"]',
                            namespaces=nsmap)
    scan['charge'] = charge_elem.text.strip()

    mass_elem = elem.find('GAML:trace/GAML:attribute[@type="M+H"]',
                          namespaces=nsmap)
    scan['mass'] = mass_elem.text.strip()

    return scan, fastas
Esempio n. 5
0
def parse_scan(top_elem, nsmap):
    """Collect the matches and spectrum data of `top_elem` into a dict.

    Args:
        top_elem: element with `protein` children and a nested
            `group[@label="fragment ion mass spectrum"]` element.
        nsmap: namespace map used to resolve the `GAML:` prefix in the
            spectrum lookups.

    Returns:
        A dict with the attributes of `top_elem`, a 'matches' list (one
        dict per protein: 'seqid', 'sequence', 'description',
        'modifications' and the attributes of its `domain` element),
        plus 'Description', 'masses', 'intensities', 'charge' and
        'mass' as stripped text.
    """
    scan = dict(parse.parse_attrib(top_elem))
    matches = scan['matches'] = []

    for protein in top_elem.findall('protein'):
        # The label's first word is the seqid, the rest the description.
        label_parts = protein.attrib['label'].split()
        seqid = label_parts[0]
        peptide_elem = protein.find('peptide')
        match = dict(
            seqid=seqid,
            sequence=strip_whitespace(peptide_elem.text),
            description=' '.join(label_parts[1:]),
        )
        domain_elem = peptide_elem.find('domain')
        match['modifications'] = [
            parse.parse_attrib(aa_elem)
            for aa_elem in domain_elem.findall('aa')
        ]
        # Domain attributes are merged last, so they win on key clashes.
        match.update(parse.parse_attrib(domain_elem))
        matches.append(match)

    spectrum_elem = top_elem.find(
        'group[@label="fragment ion mass spectrum"]')

    note = spectrum_elem.find('note')
    scan['Description'] = '' if note is None else note.text.strip()

    # Result key -> path of the element whose stripped text is stored.
    text_fields = (
        ('masses', 'GAML:trace/GAML:Xdata/GAML:values'),
        ('intensities', 'GAML:trace/GAML:Ydata/GAML:values'),
        ('charge', 'GAML:trace/GAML:attribute[@type="charge"]'),
        ('mass', 'GAML:trace/GAML:attribute[@type="M+H"]'),
    )
    for key, path in text_fields:
        scan[key] = spectrum_elem.find(path, namespaces=nsmap).text.strip()

    return scan
Esempio n. 6
0
def parse_protein_group(elem, nsmap):
    """Build a dict for a protein group element and the proteins in it.

    Args:
        elem: element whose `protein` children are read. Its parsed
            attributes must contain 'group_number' whenever at least
            one protein is present.
        nsmap: namespace map handed to `parse.fixtag` to qualify tag
            names.

    Returns:
        The attributes of `elem` (via `parse.parse_attrib`) plus a
        'proteins' list. Each protein dict holds its own attributes,
        'group_number', one key per `parameter` child (name -> raw
        value string), an optional 'description', 'other_seqids' and a
        'peptides' list; each peptide has 'modifications' and
        'modified_sequence'.
    """
    def tag(tag_id):
        return parse.fixtag('', tag_id, nsmap)

    group = parse.parse_attrib(elem)
    group['proteins'] = []
    for protein_elem in elem.findall(tag('protein')):
        protein = parse.parse_attrib(protein_elem)
        protein['group_number'] = group['group_number']

        # Parameters are stored under their own name and may therefore
        # overwrite attributes parsed above.
        for parameter_elem in protein_elem.findall(tag('parameter')):
            key = parameter_elem.attrib['name']
            protein[key] = parameter_elem.attrib['value']

        annotation_elem = protein_elem.find(tag('annotation'))
        if annotation_elem is not None:
            protein['description'] = annotation_elem.attrib[
                'protein_description']

        protein['other_seqids'] = [
            alt_protein.attrib['protein_name']
            for alt_protein in protein_elem.findall(
                tag('indistinguishable_protein'))
        ]

        protein['peptides'] = []
        for peptide_elem in protein_elem.findall(tag('peptide')):
            peptide = parse.parse_attrib(peptide_elem)
            protein['peptides'].append(peptide)
            peptide['modifications'] = []
            # Default to the plain sequence; replaced below when a
            # modification_info element supplies a modified form.
            peptide['modified_sequence'] = peptide['peptide_sequence']
            for modified_elem in peptide_elem.findall(
                    tag('modification_info')):
                modified_attr = parse.parse_attrib(modified_elem)
                peptide['modified_sequence'] = modified_attr[
                    'modified_peptide']
                for modification_elem in modified_elem.findall(
                        tag('mod_aminoacid_mass')):
                    peptide['modifications'].append(
                        parse.parse_attrib(modification_elem))

        group['proteins'].append(protein)
    return group
Esempio n. 7
0
def parse_protein_probabilities(elem, nsmap):
  """Return the error/probability points found under `elem`.

  Args:
    elem: element whose `protein_summary_data_filter` children are read.
    nsmap: namespace map handed to `parse.fixtag` to qualify the tag.

  Returns:
    A list of {'error', 'prob'} dicts sorted by ascending 'error',
    taken from each child's 'false_positive_error_rate' and
    'min_probability' attributes.
  """
  filter_tag = parse.fixtag('', 'protein_summary_data_filter', nsmap)

  def to_point(data_point):
    attrib = parse.parse_attrib(data_point)
    return {
      'error': attrib['false_positive_error_rate'],
      'prob': attrib['min_probability'],
    }

  return sorted(
    map(to_point, elem.findall(filter_tag)),
    key=lambda point: point['error'])
Esempio n. 8
0
def parse_protein_probabilities(elem, nsmap):
    """Return the error/probability points found under `elem`.

    Args:
        elem: element whose `protein_summary_data_filter` children are
            read.
        nsmap: namespace map handed to `parse.fixtag` to qualify the
            tag.

    Returns:
        A list of {'error', 'prob'} dicts sorted by ascending 'error',
        taken from each child's 'false_positive_error_rate' and
        'min_probability' attributes.
    """
    data_points = elem.findall(
        parse.fixtag('', 'protein_summary_data_filter', nsmap))
    # Lazy generator: each element is parsed right before its dict is
    # built.
    parsed = (parse.parse_attrib(data_point) for data_point in data_points)
    probs = [
        dict(error=fields['false_positive_error_rate'],
             prob=fields['min_probability'])
        for fields in parsed
    ]
    probs.sort(key=lambda entry: entry['error'])
    return probs
Esempio n. 9
0
 def parse_protein_probabilities(self, elem):
     """Fill `self.distribution` with error/probability points.

     Reads the `protein_summary_data_filter` elements that
     `self.findall` yields for `elem` and stores one {'error', 'prob'}
     dict per element, sorted by ascending 'error'. Returns None.
     """
     def to_point(data_point):
         fields = parse.parse_attrib(data_point)
         return dict(
             error=fields['false_positive_error_rate'],
             prob=fields['min_probability'],
         )

     self.distribution = []
     self.distribution.extend(
         to_point(data_point)
         for data_point in self.findall(elem, 'protein_summary_data_filter'))
     self.distribution.sort(key=lambda point: point['error'])
Esempio n. 10
0
def parse_scan(scan_elem, nsmap):
    """Build a dict describing a scan element and its search hits.

    The attributes of `scan_elem` (as returned by `parse.parse_attrib`)
    form the result; a 'matches' list receives one entry per non-empty
    `search_result` child, taken from that child's first sub-element.

    Args:
        scan_elem: element whose `search_result` children are read. The
            tag names look like pepXML -- TODO confirm against the
            caller.
        nsmap: namespace map handed to `parse.fixtag` to qualify tag
            names.

    Returns:
        The scan dict. Each match holds the hit attributes plus
        'modified_sequence', 'other_seqids', 'modifications', the
        search-score name/value pairs and, when the hit has a
        `peptideprophet_result`, its attributes and parameters.
    """
    def tag(tag_id):
        return parse.fixtag('', tag_id, nsmap)

    scan = parse.parse_attrib(scan_elem)
    scan['matches'] = []
    for search_elem in scan_elem.findall(tag('search_result')):
        # A search_result without children has no hit to report;
        # indexing it would raise IndexError.
        if len(search_elem) == 0:
            continue
        search_hit_elem = search_elem[0]
        match = parse.parse_attrib(search_hit_elem)
        # Default to the plain peptide; replaced below when a
        # modification_info element supplies a modified form.
        match['modified_sequence'] = match['peptide']

        match['other_seqids'] = [
            alt_protein.attrib['protein']
            for alt_protein in search_hit_elem.findall(
                tag('alternative_protein'))
        ]

        match['modifications'] = []
        for modified_elem in search_hit_elem.findall(
                tag('modification_info')):
            modified_attr = parse.parse_attrib(modified_elem)
            match['modified_sequence'] = modified_attr['modified_peptide']
            for modification_elem in modified_elem.findall(
                    tag('mod_aminoacid_mass')):
                attr = parse.parse_attrib(modification_elem)
                # Replace 'position' by 'i' = position - 1 (assumes
                # parse_attrib already converted 'position' to a
                # number -- TODO confirm).
                attr['i'] = attr.pop('position') - 1
                match['modifications'].append(attr)

        for score_elem in search_hit_elem.findall(tag('search_score')):
            match.update(parse.parse_name_value(score_elem))

        # find() returns None when the hit has no analysis_result
        # child; iterating None would raise TypeError.
        analysis_result = search_hit_elem.find(tag('analysis_result'))
        if analysis_result is not None:
            for analysis_elem in analysis_result:
                if analysis_elem.tag == tag('peptideprophet_result'):
                    match.update(parse.parse_attrib(analysis_elem))
                    # Parameters are the children of the result's
                    # first child.
                    for param_elem in analysis_elem[0]:
                        match.update(parse.parse_name_value(param_elem))

        scan['matches'].append(match)
    return scan
Esempio n. 11
0
    def parse_scan(self, scan_elem):
        """Build a dict describing a scan element and its search hits.

        The attributes of `scan_elem` (as returned by
        `parse.parse_attrib`) form the result; a 'matches' list
        receives one entry per non-empty `search_result` element,
        taken from that element's first sub-element.

        Args:
            scan_elem: element searched through `self.findall` for
                `search_result` elements.

        Returns:
            The scan dict. Each match holds the hit attributes plus
            'modified_sequence', 'other_seqids', 'modifications', the
            search-score name/value pairs and, when the hit has a
            `peptideprophet_result`, its attributes and parameters.
        """
        scan = parse.parse_attrib(scan_elem)
        scan['matches'] = []
        for search_elem in self.findall(scan_elem, "search_result"):
            # A search_result without children has no hit to report;
            # indexing it would raise IndexError.
            if len(search_elem) == 0:
                continue
            search_hit_elem = search_elem[0]
            pepxml_match = parse.parse_attrib(search_hit_elem)
            # Default to the plain peptide; replaced below when a
            # modification_info element supplies a modified form.
            pepxml_match['modified_sequence'] = pepxml_match['peptide']

            pepxml_match['other_seqids'] = [
                alt_protein.attrib['protein']
                for alt_protein in self.findall(search_hit_elem,
                                                'alternative_protein')
            ]

            pepxml_match['modifications'] = []
            for modified_elem in self.findall(search_hit_elem,
                                              'modification_info'):
                modified_attr = parse.parse_attrib(modified_elem)
                pepxml_match['modified_sequence'] = modified_attr[
                    'modified_peptide']
                for modification_elem in self.findall(modified_elem,
                                                      'mod_aminoacid_mass'):
                    attr = parse.parse_attrib(modification_elem)
                    # Replace 'position' by 'i' = position - 1 (assumes
                    # parse_attrib already converted 'position' to a
                    # number -- TODO confirm).
                    attr['i'] = attr.pop('position') - 1
                    pepxml_match['modifications'].append(attr)

            for score_elem in self.findall(search_hit_elem, 'search_score'):
                pepxml_match.update(parse.parse_name_value(score_elem))

            # NOTE(review): assumes self.find returns None when the hit
            # has no analysis_result, like Element.find -- confirm.
            # Iterating None would raise TypeError.
            analysis_result = self.find(search_hit_elem, 'analysis_result')
            if analysis_result is not None:
                for analysis_elem in analysis_result:
                    if analysis_elem.tag == self.search_tag(
                            'peptideprophet_result'):
                        pepxml_match.update(
                            parse.parse_attrib(analysis_elem))
                        # Parameters are the children of the result's
                        # first child.
                        for param_elem in analysis_elem[0]:
                            pepxml_match.update(
                                parse.parse_name_value(param_elem))

            scan['matches'].append(pepxml_match)

        return scan
Esempio n. 12
0
def parse_peptide_probabilities(elem, nsmap):
  """Return the error/probability points found under `elem`.

  Direct `error_point` children are used when there are any; otherwise
  the points come from the first `roc_error_data` child whose 'charge'
  attribute is 'all'.

  Args:
    elem: element to search.
    nsmap: namespace map handed to `parse.fixtag` to qualify tag names.

  Returns:
    A list of {'error', 'prob'} dicts sorted by ascending 'error',
    taken from each point's 'error' and 'min_prob' attributes.
  """
  def tag(tag_id):
    return parse.fixtag('', tag_id, nsmap)

  def to_point(point_elem):
    attrib = parse.parse_attrib(point_elem)
    return {
      'error': attrib['error'],
      'prob': attrib['min_prob'],
    }

  error_points = elem.findall(tag('error_point'))
  if not error_points:
    for roc_elem in elem.findall(tag('roc_error_data')):
      if roc_elem.attrib['charge'] != 'all':
        continue
      error_points = roc_elem.findall(tag('error_point'))
      break

  return sorted(map(to_point, error_points), key=lambda point: point['error'])
Esempio n. 13
0
def parse_peptide_probabilities(elem, nsmap):
    """Return the error/probability points found under `elem`.

    Direct `error_point` children are used when there are any;
    otherwise the points come from the first `roc_error_data` child
    whose 'charge' attribute is 'all'.

    Args:
        elem: element to search.
        nsmap: namespace map handed to `parse.fixtag` to qualify tag
            names.

    Returns:
        A list of {'error', 'prob'} dicts sorted by ascending 'error',
        taken from each point's 'error' and 'min_prob' attributes.
    """
    def tag(tag_id):
        return parse.fixtag('', tag_id, nsmap)

    error_points = elem.findall(tag('error_point'))
    if not error_points:
        # Fall back to the combined ('all' charges) ROC data, if any.
        all_charges = next(
            (roc_elem for roc_elem in elem.findall(tag('roc_error_data'))
             if roc_elem.attrib['charge'] == 'all'),
            None)
        if all_charges is not None:
            error_points = all_charges.findall(tag('error_point'))

    probs = []
    for point_elem in error_points:
        fields = parse.parse_attrib(point_elem)
        probs.append(dict(error=fields['error'], prob=fields['min_prob']))
    return sorted(probs, key=lambda entry: entry['error'])
Esempio n. 14
0
 def parse_peptide_probabilities(self, elem):
     """Fill `self.distribution` and log the 0.01 error cutoff.

     `error_point` elements found for `elem` are used when there are
     any; otherwise the points come from the first `roc_error_data`
     element whose 'charge' attribute is 'all'. One {'error', 'prob'}
     dict per point is stored in `self.distribution`, sorted by
     ascending 'error'. The value `error_to_probability` returns for
     0.01 is then logged. Returns None.
     """
     error_points = self.findall(elem, 'error_point')
     if len(error_points) == 0:
         # Fall back to the combined ('all' charges) ROC data, if any.
         for roc_elem in self.findall(elem, 'roc_error_data'):
             if roc_elem.attrib['charge'] != 'all':
                 continue
             error_points = self.findall(roc_elem, 'error_point')
             break

     self.distribution = []
     for point_elem in error_points:
         fields = parse.parse_attrib(point_elem)
         self.distribution.append(
             dict(error=fields['error'], prob=fields['min_prob']))
     self.distribution.sort(key=lambda point: point['error'])

     cutoff = error_to_probability(self.distribution, 0.01)
     logger.info('Peptide probability cutoff for 0.01 fpe: %f' % cutoff)