Beispiel #1
0
def get_proteins_and_sources(protxml,
                             pepxmls,
                             peptide_error=0.01,
                             protein_error=0.01):
    """
    Build the proteins dictionary from a protxml file and its pepxml files.

    The protxml file seeds the proteins; every entry of `pepxmls` is then
    loaded into them with `peptide_error` as the error cutoff. Finally the
    proteins are filtered with the probability that `error_to_probability`
    derives from `protein_error`.

    Returns a `(proteins, source_names)` tuple.
    """
    logger.info('Loading protxml ' + protxml)
    proteins, protein_probs = generate_proteins_from_protxml(protxml)

    # the same list is handed to every load_pepxml call
    # (presumably load_pepxml appends to it - confirm against load_pepxml)
    source_names = []
    for pepxml_fname in pepxmls:
        logger.info('Loading pepxml ' + pepxml_fname)
        load_pepxml(
            proteins,
            pepxml_fname,
            error_cutoff=peptide_error,
            source_names=source_names)

    count_tpp_indep_spectra(proteins)

    filter_proteins(
        proteins, error_to_probability(protein_probs, protein_error))

    # only written when the root logger is at DEBUG level or lower
    if logger.root.level <= logging.DEBUG:
        dump_fname = protxml.replace('prot.xml', 'proteins.dump')
        logger.debug('Dumping protein data structure to ' + dump_fname)
        parse.save_data_dict(proteins, dump_fname)

    return proteins, source_names
Beispiel #2
0
def get_proteins_and_sources(
    protxml, pepxmls, peptide_error=0.01, protein_error=0.01):
  """
  Assemble proteins from one protxml file plus any number of pepxml files.

  Returns the proteins dictionary together with the list of source names
  that was passed to load_pepxml for every pepxml file.
  """
  logger.info('Loading protxml ' + protxml)
  proteins, protein_probs = generate_proteins_from_protxml(protxml)

  source_names = []
  for pepxml_fname in pepxmls:
    logger.info('Loading pepxml ' + pepxml_fname)
    load_pepxml(
        proteins,
        pepxml_fname,
        error_cutoff=peptide_error,
        source_names=source_names)

  count_tpp_indep_spectra(proteins)

  # threshold computed by error_to_probability from protein_error
  min_probability = error_to_probability(protein_probs, protein_error)
  filter_proteins(proteins, min_probability)

  if logger.root.level <= logging.DEBUG:
    # debugging aid: save the final structure next to the protxml file
    dump_fname = protxml.replace('prot.xml', 'proteins.dump')
    logger.debug('Dumping protein data structure to ' + dump_fname)
    parse.save_data_dict(proteins, dump_fname)

  return proteins, source_names
Beispiel #3
0
def get_proteins_and_sources(protxml,
                             pepxmls,
                             peptide_error=0.01,
                             protein_error=0.01,
                             good_expect=1E-8,
                             cutoff_expect=1E-2):
    """
    Returns a proteins dictionary and list of source names.

    The proteins are created from `protxml`; each file in `pepxmls` is then
    loaded into them with `peptide_error` as the error cutoff, and with
    `good_expect` / `cutoff_expect` forwarded unchanged to
    `load_pepxml_into_proteins`. Proteins whose attr['probability'] is below
    the cutoff derived from `protein_error` are removed from the dictionary.
    """

    logger.info('Loading protxml ' + protxml)
    proteins, protein_probs = make_proteins_from_protxml(protxml)

    source_names = []
    for pepxml in pepxmls:
        logger.info('Loading pepxml ' + pepxml)
        load_pepxml_into_proteins(proteins,
                                  pepxml,
                                  error_cutoff=peptide_error,
                                  source_names=source_names,
                                  good_expect=good_expect,
                                  cutoff_expect=cutoff_expect)

    count_independent_spectra(proteins)

    prob_cutoff = error_to_probability(protein_probs, protein_error)
    # Iterate over a snapshot of the keys: deleting entries while looping
    # over the live key view raises RuntimeError on Python 3.
    for seqid in list(proteins):
        if proteins[seqid]['attr']['probability'] < prob_cutoff:
            del proteins[seqid]

    if logger.root.level <= logging.DEBUG:
        dump = protxml.replace('prot.xml', 'proteins.dump')
        logger.debug('Dumping protein data structure to ' + dump)
        parse.save_data_dict(proteins, dump)

    return proteins, source_names
Beispiel #4
0
  """
  scans = { int(scan['id']):scan for scan in xtandem_scans }
  for seqid in proteins.keys():
    source = proteins[seqid]['sources'][i_source]
    scan_ids = []
    for peptide in source['peptides']:
      scan_id = peptide['attr']['scan_id']
      sequence = peptide['sequence']
      modifications = peptide['attr']['modifications']
      if scan_id not in scans:
        logger.warning("Couldn't find xtadnem entry for scan {} in pepxml".format(scan_id))
        continue
      scan = scans[scan_id]
      x_vals = map(float, scan['masses'].split())
      y_vals = map(float, scan['intensities'].split())
      ions = [(x, y) for x, y in zip(x_vals, y_vals)]
      ions.sort(key=lambda i:-i[1])
      peptide['spectrum'] = ions[:n_peak]



if __name__ == "__main__":
  # Script entry point: read the example file and dump both results.
  scans, fastas = read('../example/xtandem/Seq23282_E1O1.tandem')
  for data, dump_fname in (
      (scans, '../example/xtandem/scans.dump'),
      (fastas, '../example/xtandem/fastas.dump')):
    parse.save_data_dict(data, dump_fname)





Beispiel #5
0
        ions = [(x, y) for x, y in zip(x_vals, y_vals)]
        ions.sort(key=lambda i:-i[1])
        peptide['spectrum'] = ions[:n_peak]

  proteins_module.load_fastas_into_proteins(proteins, fastas)
  for seqid in proteins.keys():
    protein = proteins[seqid]
    if 'sequence' not in protein:
      logger.debug("Protein {} not found in x!tandem".format(seqid))
      del proteins[seqid]
      continue
    n_peptide = sum([len(source['peptides']) for source in protein['sources']])
    if n_peptide == 0:
      del proteins[seqid]
      logger.debug("No peptide-spectra matches found in {}".format(seqid))
      continue

  # proteins_module.calculate_peptide_positions(proteins)



if __name__ == "__main__":
  # Script entry point: read the example file and dump both results.
  scans, fastas = read('../example/xtandem/Seq23282_E1O1.tandem')
  for data, dump_fname in (
      (scans, '../example/xtandem/scans.dump'),
      (fastas, '../example/xtandem/fastas.dump')):
    parse.save_data_dict(data, dump_fname)





Beispiel #6
0
def get_proteins(protein_groups_fname, psm_fname, modifications_fname=None):
    """
    Build the proteins dictionary from a protein-groups file and a PSM file.

    Both files are read row by row with `read_tsv_iter`. Each protein group
    becomes one entry of the returned dict, keyed by the first seqid found
    in its 'protein description' column; the remaining seqids are stored in
    attr['other_seqids']. Each PSM row is attached, as a peptide, to the
    first protein whose primary or alternate seqid appears in the row's
    'protein description'. Rows that match no protein, or whose base
    peptide sequence is not a substring of the protein sequence, are
    skipped.

    `modifications_fname`, when given, is read with
    `read_modification_dict` and the result is passed to `parse_peptide`.

    When the root logger is at DEBUG level the raw rows and the final
    structure are also dumped into the directory of `protein_groups_fname`.

    Returns the proteins dict; every protein has exactly one source whose
    'peptides' list holds the assigned peptides.
    """
    is_debug = logger.root.level <= logging.DEBUG
    dump_dir = os.path.dirname(protein_groups_fname)

    if modifications_fname is not None:
        modification_table = read_modification_dict(modifications_fname)
    else:
        modification_table = {}

    proteins = {}
    dict_dump_writer = DictListWriter(
        is_debug, os.path.join(dump_dir, 'protein_groups.dump'))
    for i_group, protein_group in enumerate(
            read_tsv_iter(protein_groups_fname)):
        descriptions = protein_group['protein description'].split(' / ')
        # the coverage column may hold several values separated by ';' or
        # '/'; only the first one is kept
        coverage_str = str(protein_group['protein sequence coverage (%)'])
        if ';' in coverage_str:
            coverage = float(get_first(coverage_str, ';'))
        else:
            coverage = float(get_first(coverage_str, '/'))
        seqs = protein_group['protein sequence'].split('/')
        seqids = [desc.split()[0] for desc in descriptions]
        for seqid in seqids:
            if seqid in proteins:
                # the seqid goes in as a lazy %-argument; without the
                # placeholder logging fails to format the record
                logger.warning(
                    "Different protein groups claim same first seqid %s",
                    seqid)
        protein = {
            'description': descriptions[0],
            'sequence': seqs[0],
            'other_sequences': seqs[1:],
            'attr': {
                'coverage':
                parse.round_decimal(coverage, 4),
                'morpheus-score':
                parse.round_decimal(protein_group['summed morpheus score'], 4),
                'i_group':
                i_group,
                'other_seqids':
                seqids[1:],
                'seqid':
                seqids[0],
            },
            'sources': [{
                'peptides': []
            }]
        }
        proteins[seqids[0]] = protein
        dict_dump_writer.dump_dict(protein_group)
    dict_dump_writer.close()

    # lookup from every seqid (primary and alternates) to its protein
    protein_by_seqid = {}
    for seqid in proteins:
        protein = proteins[seqid]
        protein_by_seqid[seqid] = protein
        for alt_seqid in protein['attr']['other_seqids']:
            protein_by_seqid[alt_seqid] = protein

    dict_dump_writer = DictListWriter(is_debug,
                                      os.path.join(dump_dir, 'peptides.dump'))
    n_peptide = 0
    n_peptide_matched = 0
    for src_peptide in read_tsv_iter(psm_fname):
        dict_dump_writer.dump_dict(src_peptide)
        descriptions = src_peptide['protein description'].split(' / ')
        peptide_seqids = [d.split()[0] for d in descriptions]
        protein = None
        for peptide_seqid in peptide_seqids:
            if peptide_seqid in protein_by_seqid:
                protein = protein_by_seqid[peptide_seqid]
                break
        n_peptide += 1
        if protein is None:
            continue
        n_peptide_matched += 1
        sequence = protein['sequence']
        extracted_peptide_sequence, modifications = parse_peptide(
            src_peptide['peptide sequence'], modification_table)
        peptide_sequence = src_peptide['base peptide sequence']
        if extracted_peptide_sequence != peptide_sequence:
            logger.warning("Peptide sequences don't match: " +
                           src_peptide['peptide sequence'] + " " +
                           extracted_peptide_sequence + " " + peptide_sequence)
        i = sequence.find(peptide_sequence)
        if i < 0:
            logger.warning(peptide_sequence + ' not found in ' +
                           protein['attr']['seqid'])
            continue
        q_value = float(src_peptide['q-value (%)'])
        # the scan and retention-time columns are looked up under two
        # alternative header names each, falling back to ''
        if 'scan number' in src_peptide:
            scan_id = src_peptide['scan number']
        elif 'spectrum number' in src_peptide:
            scan_id = src_peptide['spectrum number']
        else:
            scan_id = ''
        if 'retention time (min)' in src_peptide:
            retention_time = parse.round_decimal(
                src_peptide['retention time (min)'], 4)
        elif 'retention time (minutes)' in src_peptide:
            retention_time = parse.round_decimal(
                src_peptide['retention time (minutes)'], 4)
        else:
            retention_time = ''

        peptide = {
            'sequence': peptide_sequence,
            'attr': {
                'scan_id':
                scan_id,
                'retention_time':
                retention_time,
                'morpheus_score':
                parse.round_decimal(src_peptide['morpheus score'], 4),
                'mass':
                parse.round_decimal(src_peptide['precursor mass (da)'], 4),
                'mass_diff':
                parse.round_decimal(src_peptide['precursor mass error (da)'],
                                    4),
                'm/z':
                parse.round_decimal(src_peptide['precursor m/z'], 4),
                'source':
                parse.basename(src_peptide['filename']),
                'q_value':
                q_value,
            },
            # q-value is a percentage, so this maps 0% -> 1.0 and 100% -> 0.0
            'intensity': 1.0 - q_value / 100.0,
            'i': i,
        }
        if modifications:
            for modification in modifications:
                modification['mass'] = parse.round_decimal(
                    modification['mass'], 4)
            peptide['attr']['modifications'] = modifications

        protein['sources'][0]['peptides'].append(peptide)

    dict_dump_writer.close()

    dump = os.path.join(dump_dir, 'proteins.dump')
    if logger.root.level <= logging.DEBUG:
        logger.debug('Dumping proteins data structure to ' + dump)
        parse.save_data_dict(proteins, dump)

    logger.info("Assigned {}/{} of PSMs.tsv to protein_groups.tsv".format(
        n_peptide_matched, n_peptide))

    return proteins
Beispiel #7
0
def get_proteins_and_sources(
    protxml, pepxml,
    n_peptide_cutoff=1,
    is_skip_no_unique=True,
    errors=[0.01]):
  """
  Load a protxml/pepxml pair into a proteins dictionary.

  Returns (proteins, source_names). Layout of proteins, shown as YAML:
    "sample_seqid":
      sequence: "AAAAAAAAAA"
      description: "sample protein"
      attr:
        param: value
      sources:
        -
          peptides
            -
              sequence: "AAA"
              i: 0
              j: 3
              attr:
                is_unique: True
                param: value
  """
  max_error = max(errors)
  protein_groups, protein_probs = read_protxml(protxml)
  proteins = make_proteins_from_protxml(protein_groups)

  dump_dir = os.path.dirname(protxml)

  def dump_all_if_debug(*dumps):
    # each entry is (data, dump file name, log message prefix); the log
    # level is checked once per call, covering the whole batch
    if logger.root.level <= logging.DEBUG:
      for data, basename, message in dumps:
        dump = os.path.join(dump_dir, basename)
        logger.debug(message + dump)
        parse.save_data_dict(data, dump)

  dump_all_if_debug(
      (protein_groups, 'protxml.dump',
       'Dumping protxml data structure to '),
      (protein_probs, 'proterror.dump',
       'Dumping protein error distribution to '))

  scans_by_sources, peptide_probs = read_pepxml(pepxml)

  dump_all_if_debug(
      (scans_by_sources, 'pepxml.dump',
       'Dumping pepxml data structure to '),
      (peptide_probs, 'peperror.dump',
       'Dumping peptide error distribution to '))

  source_names = [scans['filename'] for scans in scans_by_sources]
  load_pepxml(proteins, scans_by_sources)

  # peptides are filtered with the largest requested error ...
  peptide_cutoff = error_to_probability(peptide_probs, max_error)
  filter_peptides(proteins, peptide_cutoff)
  # ... while make_mask receives one probability per requested error
  mask_cutoffs = [error_to_probability(peptide_probs, e) for e in errors]
  make_mask(proteins, mask_cutoffs)

  protein_cutoff = error_to_probability(protein_probs, max_error)
  filter_proteins(proteins, protein_cutoff)

  parse_proteins.determine_unique_peptides(proteins)
  parse_proteins.count_peptides(proteins, n_peptide_cutoff, is_skip_no_unique)

  dump_all_if_debug(
      (proteins, 'proteins.dump', 'Dumping protein data structure to '))

  return proteins, source_names
Beispiel #8
0
    for source in protein['sources']:
      for peptide in source['peptides']:
        if 'identity' in peptide['attr']:
          match_id = "%.2f%.2f%.2f%s" % \
              (peptide['attr']['score'],
               peptide['attr']['identity'],
               peptide['attr']['homology'],
               peptide['sequence'])
          peptide_by_match_id[match_id] = peptide
  scans, mascot_proteins = read_mascot_dat(mascot_dat)
  n_match = 0
  for scan in scans.values():
    for match in scan['matches']:
      match_id = "%.2f%.2f%.2f%s" % \
          (match['score'],
           scan['identity'],
           scan['homology'],
           match['sequence'])
      if match_id in peptide_by_match_id:
        n_match += 1
        peptide = peptide_by_match_id[match_id]
        peptide['spectrum'] = split_mascot_ion_str(scan['Ions1'])
  logger.info('%s: matched %d pepXML to %d mascot PSM' % \
      (mascot_dat, len(peptide_by_match_id), n_match))


if __name__ == '__main__':
  # Script entry point: read the example .dat file and dump both results.
  scans, proteins = read_mascot_dat('../example/mascot/F022045.dat')
  for data, dump_fname in (
      (scans, '../example/mascot/scans.dump'),
      (proteins, '../example/mascot/proteins.dump')):
    save_data_dict(data, dump_fname)
                    peptide_intensities1.extend(intensities[experiment1])
                elif experiment2 in intensities:
                    ratio = 0.0
                    intensity = -2 * max_ratio
                    std = 0
                    peptide_intensities2.extend(intensities[experiment2])
                else:
                    # neither experiment1 or experiment2 found in experiment column
                    ratio = None
                for peptide in peptides:
                    peptide['attr']['ratio'] = ratio
                    peptide['intensity'] = intensity
                    peptide['attr']['ratio_var'] = std
        sum2 = numpy.sum(peptide_intensities2)
        sum1 = numpy.sum(peptide_intensities1)
        if sum2 > 0.0:
            group_ratio = sum1 / sum2
        else:
            group_ratio = float('inf')
        protein['attr']['ratio'] = group_ratio


if __name__ == '__main__':
    # Script entry point: read the example directory and dump each table.
    peptides, scans, protein_groups, evidence = read(
        '../example/maxquant/silac')
    for table, dump_fname in (
            (peptides, '../example/maxquant/peptides.dump'),
            (scans, '../example/maxquant/scans.dump'),
            (protein_groups, '../example/maxquant/protein_groups.dump'),
            (evidence, '../example/maxquant/evidence.dump')):
        parse.save_data_dict(table, dump_fname)
Beispiel #10
0
            section = name[1:-1]
            if section == "summary":
                process_line = \
                    lambda l: process_summary(l, scans)
            if section == "peptides":
                process_line = \
                    lambda l: process_matches(l, scans, max_peptide_rank)
            if section == "proteins":
                process_line = \
                    lambda l: process_proteins(l, proteins)
            if "query" in section:
                scan_id = int(section[5:])
                process_line = \
                    lambda l: process_query(l, scan_id, scans)
            continue
        if process_line:
            process_line(l[:-1])
    return scans, proteins


def split_mascot_ion_str(s):
    """
    Parse an Ion entry string into a list of [x, y] float pairs.

    `s` is a comma-separated list of `x:y` entries, e.g. "1.5:20,3:4.5"
    gives [[1.5, 20.0], [3.0, 4.5]]. Blank entries (an empty string, or a
    stray leading/trailing comma) are skipped instead of failing on
    unpacking. Entries that are not of the form `x:y`, or whose parts are
    not numbers, still raise ValueError.
    """
    pairs = (piece.split(':') for piece in s.split(',') if piece.strip())
    return [[float(x), float(y)] for x, y in pairs]


if __name__ == '__main__':
    # Script entry point: read the example .dat file and dump both results.
    scans, proteins = read_mascot_dat('../example/mascot/F022045.dat')
    for data, dump_fname in (
            (scans, '../example/mascot/scans.dump'),
            (proteins, '../example/mascot/proteins.dump')):
        save_data_dict(data, dump_fname)
Beispiel #11
0
def get_proteins_and_sources(
      protein_groups_fname,
      psm_fname,
      modifications_fname=None,
      q_good=0.0,
      q_cutoff=10):
  """
  Build proteins from a protein-groups file and assign PSM matches to them.

  Every row of `protein_groups_fname` becomes a protein (via make_protein),
  keyed by its attr['seqid']. Every row of `psm_fname` becomes a match (via
  make_match); matches whose q_value exceeds `q_cutoff` are dropped. A
  match is attached to the first protein named in the row's
  'protein description' whose sequence contains the match sequence.

  Returns (proteins, sources); `sources` is the list handed to
  get_i_source for every assigned match.
  """
  is_debug = logger.root.level <= logging.DEBUG

  dump_dir = os.path.dirname(protein_groups_fname)

  modification_table = (
      read_modification_dict(modifications_fname)
      if modifications_fname else {})

  proteins = {}
  group_writer = parse.DictListWriter(
      is_debug, os.path.join(dump_dir, 'protein_groups.dump'))
  for i_group, protein_group in enumerate(parse.read_tsv(protein_groups_fname)):
    new_protein = make_protein(i_group, protein_group)
    proteins[new_protein['attr']['seqid']] = new_protein
    group_writer.dump_dict(protein_group)
  group_writer.close()

  # lookup from every seqid (primary and alternates) to its protein
  protein_by_seqid = {}
  for seqid, known_protein in proteins.items():
    protein_by_seqid[seqid] = known_protein
    for alt_seqid in known_protein['attr']['other_seqids']:
      protein_by_seqid[alt_seqid] = known_protein

  psm_writer = parse.DictListWriter(
      is_debug, os.path.join(dump_dir, 'peptides.dump'))

  n_match = 0
  n_match_assigned = 0
  # NOTE(review): i_source_from_source is never read in this function
  i_source_from_source = {}
  sources = []
  for psm in parse.read_tsv(psm_fname):
    psm_writer.dump_dict(psm)

    match = make_match(psm, modification_table)
    match['intensity'] = parse_proteins.calc_intensity(
        match['attr']['q_value'], q_good, q_cutoff)
    if match['attr']['q_value'] > q_cutoff:
      continue
    peptide_sequence = match['sequence']

    n_match += 1

    descriptions = psm['protein description'].split(' / ')
    candidate_seqids = [d.split()[0] for d in descriptions]

    # take the first listed protein that actually contains the peptide
    protein = None
    for candidate_seqid in candidate_seqids:
      candidate = protein_by_seqid.get(candidate_seqid)
      if candidate is None:
        continue
      if peptide_sequence in candidate['sequence']:
        protein = candidate
        break
    if protein is None:
      logger.debug("Couldn't find protein for %s" % (peptide_sequence))
      continue
    match['i'] = protein['sequence'].find(peptide_sequence)

    n_match_assigned += 1
    i_source = get_i_source(proteins, sources, psm['filename'])
    protein['sources'][i_source]['matches'].append(match)

  psm_writer.close()

  dump = os.path.join(dump_dir, 'proteins.dump')
  if logger.root.level <= logging.DEBUG:
    logger.debug('Dumping proteins data structure to ' + dump)
    parse.save_data_dict(proteins, dump)

  logger.info("Assigned {}/{} of PSMs.tsv to protein_groups.tsv".format(n_match_assigned, n_match))

  return proteins, sources
Beispiel #12
0
def get_proteins_and_sources(in_dir, is_leu_ile_isomeric=False,):
  """
  Build the proteins dictionary from the tables that read(in_dir) returns.

  The four tables (peptides, scans, protein groups, evidence) are indexed
  by integer id and dumped into `in_dir`. One protein is created per
  protein group, keyed by the first of its 'protein ids'. Every scan then
  adds one peptide to each protein group it lists, under the source that
  corresponds to the 'raw file' of the scan's evidence row.

  `is_leu_ile_isomeric` is accepted but not used in this function.

  Returns (proteins, sources), where sources is the sorted list of raw
  file names as strings.
  """
  peptide_list, scan_list, protein_group_list, evidence_list = read(in_dir)

  def index_by_id(rows):
    # key every row by the integer value of its 'id' field
    return {int(row['id']): row for row in rows}

  peptides = index_by_id(peptide_list)
  scans = index_by_id(scan_list)
  protein_groups = index_by_id(protein_group_list)
  evidence_dict = index_by_id(evidence_list)

  for table, dump_name in (
      (peptides, '/peptides.dump'),
      (scans, '/scans.dump'),
      (protein_groups, '/protein_groups.dump'),
      (evidence_dict, '/evidence.dump')):
    parse.save_data_dict(table, in_dir + dump_name)

  raw_files = sorted(set(e['raw file'] for e in evidence_dict.values()))
  sources = [str(raw_file) for raw_file in raw_files]
  i_sources = {}
  for k, source in enumerate(sources):
    i_sources[source] = k

  proteins = {}
  protein_by_group_id = {}
  for group_id, protein_group in protein_groups.items():
    protein = {
      'description': '',
      'attr': {
        'group_id': group_id,
        'other_seqids': [],
      },
      # one empty peptide list per known source
      'sources': [{'peptides': []} for _ in i_sources],
    }
    attr = protein['attr']
    transfer_attrs(protein_group, attr, protein_parse_list)

    seqids = parse.splitter(protein_group['protein ids'])
    proteins[seqids[0]] = protein
    attr['seqid'] = seqids[0]
    attr['other_seqids'] = seqids[1:]
    protein_by_group_id[group_id] = protein

  print("Matching sequences and scan in proteins")
  n_scan = len(scans)
  for i_scan, (scan_id, scan) in enumerate(scans.items(), 1):
    # progress report every 5000 scans
    if i_scan % 5000 == 0:
      print("{}/{} scans processed".format(i_scan, n_scan))
    evidence = evidence_dict[int(scan['evidence id'])]
    peptide = peptides[int(scan['peptide id'])]

    for group_id in parse.splitter(str(scan['protein group ids'])):
      new_peptide = {
        'sequence': scan['sequence'],
        'spectrum': get_labeled_spectrum(scan),
        'attr': {
          'modifications': [],
          'mq_scan_id': scan_id,
          'is_unique': peptide['unique (groups)'] == 'yes',
        },
      }
      peptide_attr = new_peptide['attr']
      for row, parse_list in (
          (scan, scan_parse_list),
          (evidence, evidence_parse_list),
          (peptide, peptide_parse_list)):
        transfer_attrs(row, peptide_attr, parse_list)
      change_key(peptide_attr, 'scan number', 'scan_id')
      change_key(peptide_attr, 'retention time', 'retention_time')

      protein = protein_by_group_id[int(group_id)]
      i_source = i_sources[evidence['raw file']]
      protein['sources'][i_source]['peptides'].append(new_peptide)

  parse_proteins.count_peptides(proteins)

  return proteins, sources
Beispiel #13
0
          peptide_intensities1.extend(intensities[experiment1])
        elif experiment2 in intensities:
          ratio = 0.0
          intensity = -2*max_ratio
          std = 0
          peptide_intensities2.extend(intensities[experiment2])
        else:
          # neither experiment1 or experiment2 found in experiment column
          ratio = None
        for peptide in peptides:
          peptide['attr']['ratio'] = ratio
          peptide['intensity'] = intensity
          peptide['attr']['ratio_var'] = std
    sum2 = numpy.sum(peptide_intensities2)
    sum1 = numpy.sum(peptide_intensities1)
    if sum2 > 0.0:
      group_ratio = sum1/sum2
    else:
      group_ratio = float('inf')
    protein['attr']['ratio'] = group_ratio


if __name__ == '__main__':
  # Script entry point: read the example directory and dump each table.
  peptides, scans, protein_groups, evidence = read('../example/maxquant/silac')
  for table, dump_fname in (
      (peptides, '../example/maxquant/peptides.dump'),
      (scans, '../example/maxquant/scans.dump'),
      (protein_groups, '../example/maxquant/protein_groups.dump'),
      (evidence, '../example/maxquant/evidence.dump')):
    parse.save_data_dict(table, dump_fname)


Beispiel #14
0
def get_proteins(protein_groups_fname, psm_fname, modifications_fname=None):
    """
    Build the proteins dictionary from a protein-groups file and a PSM file.

    Both files are read with `parse.read_tsv`. Each protein group becomes
    one entry of the returned dict, keyed by the first seqid found in its
    'protein description' column; the remaining seqids are stored in
    attr['other_seqids']. Each PSM row is attached, as a peptide, to the
    first protein whose primary or alternate seqid appears in the row's
    'protein description'. Rows that match no protein, or whose base
    peptide sequence is not a substring of the protein sequence, are
    skipped (the latter with a warning).

    `modifications_fname`, when given, is read with
    `read_modification_dict` and the result is passed to `parse_peptide`.

    When the root logger is at DEBUG level the raw tables and the final
    structure are also dumped into the directory of `protein_groups_fname`.

    Returns the proteins dict; every protein has exactly one source whose
    'peptides' list holds the assigned peptides.
    """
    dump_dir = os.path.dirname(protein_groups_fname)
    if modifications_fname is not None:
        modification_table = read_modification_dict(modifications_fname)
    else:
        modification_table = {}
    peptides = parse.read_tsv(psm_fname)
    protein_groups = parse.read_tsv(protein_groups_fname)

    if logger.root.level <= logging.DEBUG:
        dump = os.path.join(dump_dir, 'peptides.dump')
        logger.debug('Dumping peptides data structure to ' + dump)
        parse.save_data_dict(peptides, dump)
        dump = os.path.join(dump_dir, 'protein_groups.dump')
        logger.debug('Dumping protein_groups data structure to ' + dump)
        parse.save_data_dict(protein_groups, dump)

    proteins = {}
    for i_group, protein_group in enumerate(protein_groups):
        descriptions = protein_group['protein description'].split(' / ')
        seqids = [desc.split()[0] for desc in descriptions]
        for seqid in seqids:
            if seqid in proteins:
                # the seqid goes in as a lazy %-argument; without the
                # placeholder logging fails to format the record
                logger.warning(
                    "Different protein groups claim same first seqid %s",
                    seqid)
        protein = {
            'description': descriptions[0],
            'sequence': protein_group['protein sequence'],
            'attr': {
                'coverage':
                protein_group['protein sequence coverage (%)'],
                'morpheus-score':
                parse.round_decimal(protein_group['summed morpheus score'], 4),
                'i_group':
                i_group,
                'other_seqids':
                seqids[1:],
                'seqid':
                seqids[0],
            },
            'sources': [{
                'peptides': []
            }]
        }
        proteins[seqids[0]] = protein

    # lookup from every seqid (primary and alternates) to its protein
    protein_by_seqid = {}
    for seqid in proteins:
        protein = proteins[seqid]
        protein_by_seqid[seqid] = protein
        for alt_seqid in protein['attr']['other_seqids']:
            protein_by_seqid[alt_seqid] = protein

    n_peptide = 0
    n_peptide_matched = 0
    for src_peptide in peptides:
        n_peptide += 1
        descriptions = src_peptide['protein description'].split(' / ')
        peptide_seqids = [d.split()[0] for d in descriptions]
        protein = None
        for peptide_seqid in peptide_seqids:
            if peptide_seqid in protein_by_seqid:
                protein = protein_by_seqid[peptide_seqid]
                break
        if protein is None:
            continue
        n_peptide_matched += 1
        sequence = protein['sequence']
        # only the modifications are used from parse_peptide; the stored
        # sequence is taken from the 'base peptide sequence' column
        _, modifications = parse_peptide(
            src_peptide['peptide sequence'], modification_table)
        peptide_sequence = src_peptide['base peptide sequence']
        # find() rather than index(): one peptide that cannot be located
        # is reported and skipped instead of aborting the whole load
        i = sequence.find(peptide_sequence)
        if i < 0:
            logger.warning(peptide_sequence + ' not found in ' +
                           protein['attr']['seqid'])
            continue
        peptide = {
            'sequence': peptide_sequence,
            'attr': {
                'scan_id':
                src_peptide['scan number'],
                'retention_time':
                parse.round_decimal(src_peptide['retention time (min)'], 4),
                'morpheus_score':
                parse.round_decimal(src_peptide['morpheus score'], 4),
                'mass':
                parse.round_decimal(src_peptide['precursor mass (da)'], 4),
                'mass_diff':
                parse.round_decimal(src_peptide['precursor mass error (da)'],
                                    4),
                'm/z':
                parse.round_decimal(src_peptide['precursor m/z'], 4),
                'source':
                parse.basename(src_peptide['filename']),
            },
            'intensity': src_peptide['morpheus score'] / len(peptide_sequence),
            'i': i,
        }
        if modifications:
            for modification in modifications:
                modification['mass'] = parse.round_decimal(
                    modification['mass'], 4)
            peptide['attr']['modifications'] = modifications

        protein['sources'][0]['peptides'].append(peptide)

    dump = os.path.join(dump_dir, 'proteins.dump')
    if logger.root.level <= logging.DEBUG:
        logger.debug('Dumping proteins data structure to ' + dump)
        parse.save_data_dict(proteins, dump)

    # report matched rows against ALL rows read, not against the unmatched
    logger.info("Assigned {}/{} of PSMs.tsv to protein_groups.tsv".format(
        n_peptide_matched, n_peptide))

    return proteins
Beispiel #15
0
def get_proteins_and_sources(
    in_dir,
    is_leu_ile_isomeric=False,
):
    """
    Build the proteins dictionary from the tables that read(in_dir) returns.

    The four tables (peptides, scans, protein groups, evidence) are indexed
    by integer id and dumped into `in_dir`. One protein is created per
    protein group, keyed by the first of its 'protein ids'. Every scan then
    adds one peptide to each protein group it lists, under the source that
    corresponds to the 'raw file' of the scan's evidence row.

    `is_leu_ile_isomeric` is accepted but not used in this function.

    Returns (proteins, sources), where sources is the sorted list of raw
    file names as strings.
    """
    peptide_list, scan_list, protein_group_list, evidence_list = read(in_dir)

    def index_by_id(rows):
        # key every row by the integer value of its 'id' field
        return {int(row['id']): row for row in rows}

    peptides = index_by_id(peptide_list)
    scans = index_by_id(scan_list)
    protein_groups = index_by_id(protein_group_list)
    evidence_dict = index_by_id(evidence_list)

    for table, dump_name in (
            (peptides, '/peptides.dump'),
            (scans, '/scans.dump'),
            (protein_groups, '/protein_groups.dump'),
            (evidence_dict, '/evidence.dump')):
        parse.save_data_dict(table, in_dir + dump_name)

    raw_files = sorted(set(e['raw file'] for e in evidence_dict.values()))
    sources = [str(raw_file) for raw_file in raw_files]
    i_sources = {}
    for k, source in enumerate(sources):
        i_sources[source] = k

    proteins = {}
    protein_by_group_id = {}
    for group_id, protein_group in protein_groups.items():
        protein = {
            'description': '',
            'attr': {
                'group_id': group_id,
                'other_seqids': [],
            },
            # one empty peptide list per known source
            'sources': [{'peptides': []} for _ in i_sources],
        }
        attr = protein['attr']
        transfer_attrs(protein_group, attr, protein_parse_list)

        seqids = parse.splitter(protein_group['protein ids'])
        proteins[seqids[0]] = protein
        attr['seqid'] = seqids[0]
        attr['other_seqids'] = seqids[1:]
        protein_by_group_id[group_id] = protein

    print("Matching sequences and scan in proteins")
    n_scan = len(scans)
    for i_scan, (scan_id, scan) in enumerate(scans.items(), 1):
        # progress report every 5000 scans
        if i_scan % 5000 == 0:
            print("{}/{} scans processed".format(i_scan, n_scan))
        evidence = evidence_dict[int(scan['evidence id'])]
        peptide = peptides[int(scan['peptide id'])]

        for group_id in parse.splitter(str(scan['protein group ids'])):
            new_peptide = {
                'sequence': scan['sequence'],
                'spectrum': get_labeled_spectrum(scan),
                'attr': {
                    'modifications': [],
                    'mq_scan_id': scan_id,
                    'is_unique': peptide['unique (groups)'] == 'yes',
                },
            }
            peptide_attr = new_peptide['attr']
            for row, parse_list in (
                    (scan, scan_parse_list),
                    (evidence, evidence_parse_list),
                    (peptide, peptide_parse_list)):
                transfer_attrs(row, peptide_attr, parse_list)
            change_key(peptide_attr, 'scan number', 'scan_id')
            change_key(peptide_attr, 'retention time', 'retention_time')

            protein = protein_by_group_id[int(group_id)]
            i_source = i_sources[evidence['raw file']]
            protein['sources'][i_source]['peptides'].append(new_peptide)

    parse_proteins.count_peptides(proteins)

    return proteins, sources
Beispiel #16
0
def get_proteins_and_sources(protxml,
                             pepxml,
                             n_peptide_cutoff=1,
                             is_skip_no_unique=True,
                             errors=[0.01]):
    """
    Load a protxml/pepxml pair into a proteins dictionary.

    Returns (proteins, source_names). Layout of proteins, shown as YAML:
      "sample_seqid":
        sequence: "AAAAAAAAAA"
        description: "sample protein"
        attr:
          param: value
        sources:
          -
            peptides
              -
                sequence: "AAA"
                i: 0
                j: 3
                attr:
                  is_unique: True
                  param: value
    """
    max_error = max(errors)
    protein_groups, protein_probs = read_protxml(protxml)
    proteins = make_proteins_from_protxml(protein_groups)

    dump_dir = os.path.dirname(protxml)

    def dump_all_if_debug(*dumps):
        # each entry is (data, dump file name, log message prefix); the
        # log level is checked once per call, covering the whole batch
        if logger.root.level <= logging.DEBUG:
            for data, basename, message in dumps:
                dump = os.path.join(dump_dir, basename)
                logger.debug(message + dump)
                parse.save_data_dict(data, dump)

    dump_all_if_debug(
        (protein_groups, 'protxml.dump',
         'Dumping protxml data structure to '),
        (protein_probs, 'proterror.dump',
         'Dumping protein error distribution to '))

    scans_by_sources, peptide_probs = read_pepxml(pepxml)

    dump_all_if_debug(
        (scans_by_sources, 'pepxml.dump',
         'Dumping pepxml data structure to '),
        (peptide_probs, 'peperror.dump',
         'Dumping peptide error distribution to '))

    source_names = [scans['filename'] for scans in scans_by_sources]
    load_pepxml(proteins, scans_by_sources)

    # peptides are filtered with the largest requested error ...
    peptide_cutoff = error_to_probability(peptide_probs, max_error)
    filter_peptides(proteins, peptide_cutoff)
    # ... while make_mask receives one probability per requested error
    mask_cutoffs = [error_to_probability(peptide_probs, e) for e in errors]
    make_mask(proteins, mask_cutoffs)

    protein_cutoff = error_to_probability(protein_probs, max_error)
    filter_proteins(proteins, protein_cutoff)

    parse_proteins.determine_unique_peptides(proteins)
    parse_proteins.count_peptides(proteins, n_peptide_cutoff,
                                  is_skip_no_unique)

    dump_all_if_debug(
        (proteins, 'proteins.dump', 'Dumping protein data structure to '))

    return proteins, source_names
Beispiel #17
0
def get_proteins(protein_groups_fname, psm_fname, modifications_fname=None):
  """
  Builds a proteins dictionary from a protein-groups table and a PSM table.

  Each row read from `protein_groups_fname` becomes one protein entry, keyed
  by the first seqid found in its 'protein description' column; the remaining
  seqids of the group are kept in attr['other_seqids']. Each row read from
  `psm_fname` is turned into a peptide and appended to
  protein['sources'][0]['peptides'] of the first protein that any of the
  row's seqids maps to.

  PSM rows that map to no known protein are skipped silently. PSM rows whose
  'base peptide sequence' cannot be located in the protein sequence are
  logged and skipped (they are still counted as matched in the final summary
  log line).

  When the root logger is at DEBUG level or lower, the final proteins
  structure is saved to 'proteins.dump' next to `protein_groups_fname`;
  'protein_groups.dump' and 'peptides.dump' writers are created in the same
  directory with the debug flag passed in.

  Args:
    protein_groups_fname: path of the protein-groups table, read with
      read_tsv_iter.
    psm_fname: path of the PSM table, read with read_tsv_iter.
    modifications_fname: optional path handed to read_modification_dict; when
      None an empty modification table is used.

  Returns:
    dict mapping the first seqid of each protein group to its protein dict.
  """
  is_debug = logger.root.level <= logging.DEBUG
  dump_dir = os.path.dirname(protein_groups_fname)

  if modifications_fname is not None:
    modification_table = read_modification_dict(modifications_fname)
  else:
    modification_table = {}

  proteins = {}
  dict_dump_writer = DictListWriter(is_debug, os.path.join(dump_dir, 'protein_groups.dump'))
  # try/finally so the dump writer is closed even if a row fails to parse.
  try:
    for i_group, protein_group in enumerate(read_tsv_iter(protein_groups_fname)):
      descriptions = protein_group['protein description'].split(' / ')
      coverage_str = str(protein_group['protein sequence coverage (%)'])
      # Grouped rows carry several coverage values; only the first is kept.
      separator = ';' if ';' in coverage_str else '/'
      coverage = float(get_first(coverage_str, separator))
      seqs = protein_group['protein sequence'].split('/')
      seqids = [desc.split()[0] for desc in descriptions]
      for seqid in seqids:
        if seqid in proteins:
          # The placeholder is required: without it logging fails to format
          # the record and the offending seqid is never reported.
          logger.warning("Different protein groups claim same first seqid %s", seqid)
      protein = {
        'description': descriptions[0],
        'sequence': seqs[0],
        'other_sequences': seqs[1:],
        'attr': {
          'coverage': parse.round_decimal(coverage, 4),
          'morpheus-score': parse.round_decimal(protein_group['summed morpheus score'], 4),
          'i_group': i_group,
          'other_seqids': seqids[1:],
          'seqid': seqids[0],
        },
        'sources': [{ 'peptides':[] }]
      }
      proteins[seqids[0]] = protein
      dict_dump_writer.dump_dict(protein_group)
  finally:
    dict_dump_writer.close()

  # Lookup from every seqid of a group (first and alternates) to its protein.
  protein_by_seqid = {}
  for seqid, protein in proteins.items():
    protein_by_seqid[seqid] = protein
    for alt_seqid in protein['attr']['other_seqids']:
      protein_by_seqid[alt_seqid] = protein

  dict_dump_writer = DictListWriter(is_debug, os.path.join(dump_dir, 'peptides.dump'))
  n_peptide = 0
  n_peptide_matched = 0
  try:
    for src_peptide in read_tsv_iter(psm_fname):
      dict_dump_writer.dump_dict(src_peptide)
      n_peptide += 1

      descriptions = src_peptide['protein description'].split(' / ')
      protein = None
      for description in descriptions:
        peptide_seqid = description.split()[0]
        if peptide_seqid in protein_by_seqid:
          protein = protein_by_seqid[peptide_seqid]
          break
      if protein is None:
        continue
      n_peptide_matched += 1

      extracted_peptide_sequence, modifications = parse_peptide(
          src_peptide['peptide sequence'],
          modification_table)
      peptide_sequence = src_peptide['base peptide sequence']
      if extracted_peptide_sequence != peptide_sequence:
        logger.warning(
            "Peptide sequences don't match: %s %s %s",
            src_peptide['peptide sequence'],
            extracted_peptide_sequence,
            peptide_sequence)

      i = protein['sequence'].find(peptide_sequence)
      if i < 0:
        logger.warning('%s not found in %s', peptide_sequence, protein['attr']['seqid'])
        continue

      q_value = float(src_peptide['q-value (%)'])

      # The column names differ between input files; take the first present.
      scan_id = ''
      for key in ('scan number', 'spectrum number'):
        if key in src_peptide:
          scan_id = src_peptide[key]
          break

      time = ''
      for key in ('retention time (min)', 'retention time (minutes)'):
        if key in src_peptide:
          time = parse.round_decimal(src_peptide[key], 4)
          break

      peptide = {
        'sequence': peptide_sequence,
        'attr': {
          'scan_id': scan_id,
          'retention_time': time,
          'morpheus_score': parse.round_decimal(src_peptide['morpheus score'], 4),
          'mass': parse.round_decimal(src_peptide['precursor mass (da)'], 4),
          'mass_diff': parse.round_decimal(src_peptide['precursor mass error (da)'], 4),
          'm/z': parse.round_decimal(src_peptide['precursor m/z'], 4),
          'source': parse.basename(src_peptide['filename']),
          'q_value': q_value,
        },
        # q-value is a percentage; intensity is its complement as a fraction.
        'intensity': 1.0 - q_value/100.0,
        'i': i,
      }
      if modifications:
        for modification in modifications:
          modification['mass'] = parse.round_decimal(modification['mass'], 4)
        peptide['attr']['modifications'] = modifications

      protein['sources'][0]['peptides'].append(peptide)
  finally:
    dict_dump_writer.close()

  if is_debug:
    dump = os.path.join(dump_dir, 'proteins.dump')
    logger.debug('Dumping proteins data structure to %s', dump)
    parse.save_data_dict(proteins, dump)

  logger.info("Assigned %s/%s of PSMs.tsv to protein_groups.tsv", n_peptide_matched, n_peptide)

  return proteins