Esempio n. 1
0
def read_modification_dict(modifications_tsv):
    """Build a description -> modified-residue-mass table from a TSV file.

    Every row yielded by ``parse.read_tsv(modifications_tsv)`` is expected to
    carry a ``'description'`` and a ``'monoisotopic mass shift (da)'`` field.
    The residue is read from the ``'residue'`` column, falling back to
    ``'amino acid residue'``.  The stored mass is
    ``peptidemass.aa_monoisotopic_mass[residue] + float(mass shift)``.

    Rows whose residue is ``'n/a'``, or that have neither residue column,
    are skipped.

    Args:
        modifications_tsv: path handed to ``parse.read_tsv``.

    Returns:
        dict mapping each row's description to the computed mass.
    """
    result = {}
    for entry in parse.read_tsv(modifications_tsv):
        key = entry['description']
        if 'residue' in entry:
            aa = entry['residue']
        elif 'amino acid residue' in entry:
            aa = entry['amino acid residue']
        else:
            # Without a fallback ``aa`` would be unbound on the first row, or
            # silently reuse the residue of the previous row.
            aa = 'n/a'
        delta_mass = entry['monoisotopic mass shift (da)']
        if aa == 'n/a':
            continue
        result[key] = peptidemass.aa_monoisotopic_mass[aa] + float(delta_mass)
    return result
Esempio n. 2
0
def read_modification_dict(modifications_tsv):
  """Map each modification description to a mass read from a TSV file.

  For every row from ``parse.read_tsv`` the value starts as the row's
  ``'monoisotopic mass shift (da)'`` converted to float.  When the row names
  a residue (column ``'residue'``, else ``'amino acid residue'``) other than
  ``'n/a'``, that residue's entry in ``peptidemass.aa_monoisotopic_mass`` is
  added on top.

  Returns a dict keyed by the row's ``'description'``.
  """
  masses = {}
  for row in parse.read_tsv(modifications_tsv):
    description = row['description']
    shift = float(row['monoisotopic mass shift (da)'])

    # First residue column present in the row wins; no column means 'n/a'.
    residue = 'n/a'
    for column in ('residue', 'amino acid residue'):
      if column in row:
        residue = row[column]
        break

    if residue == 'n/a':
      masses[description] = shift
    else:
      masses[description] = shift + peptidemass.aa_monoisotopic_mass[residue]
  return masses
Esempio n. 3
0
def get_proteins(protein_groups_fname, psm_fname, modifications_fname=None):
    """Build a protein dictionary from a protein-groups table and a PSM table.

    One protein entry is created per protein group, keyed by the first seqid
    found in the group's ``'protein description'`` (descriptions are split on
    ``' / '`` and the first whitespace-delimited token of each is its seqid).
    Each PSM row is then attached to the first protein whose seqid, or
    alternative seqid, appears in the PSM's own ``'protein description'``.

    Args:
        protein_groups_fname: path of the protein-groups TSV; its directory
            is also where debug dumps are written.
        psm_fname: path of the PSM TSV.
        modifications_fname: optional modifications TSV handed to
            ``read_modification_dict``; when ``None`` an empty table is used.

    Returns:
        dict mapping first seqid -> protein dict with the keys
        ``'description'``, ``'sequence'``, ``'attr'`` and ``'sources'``
        (a one-element list whose ``'peptides'`` list holds the matched
        peptides).

    Raises:
        Whatever ``sequence.index`` raises when a PSM's
        ``'base peptide sequence'`` is not contained in the matched
        protein's sequence.
    """
    dump_dir = os.path.dirname(protein_groups_fname)
    if modifications_fname is not None:
        modification_table = read_modification_dict(modifications_fname)
    else:
        modification_table = {}
    peptides = parse.read_tsv(psm_fname)
    protein_groups = parse.read_tsv(protein_groups_fname)

    is_debug = logger.root.level <= logging.DEBUG
    if is_debug:
        dump = os.path.join(dump_dir, 'peptides.dump')
        logger.debug('Dumping peptides data structure to ' + dump)
        parse.save_data_dict(peptides, dump)
        dump = os.path.join(dump_dir, 'protein_groups.dump')
        logger.debug('Dumping protein_groups data structure to ' + dump)
        parse.save_data_dict(protein_groups, dump)

    proteins = {}
    for i_group, protein_group in enumerate(protein_groups):
        descriptions = protein_group['protein description'].split(' / ')
        seqids = [desc.split()[0] for desc in descriptions]
        for seqid in seqids:
            if seqid in proteins:
                # The message needs a placeholder: logging %-formats the
                # message with its extra arguments.
                logger.warning(
                    "Different protein groups claim same first seqid %s",
                    seqid)
        protein = {
            'description': descriptions[0],
            'sequence': protein_group['protein sequence'],
            'attr': {
                'coverage':
                protein_group['protein sequence coverage (%)'],
                'morpheus-score':
                parse.round_decimal(protein_group['summed morpheus score'], 4),
                'i_group':
                i_group,
                'other_seqids':
                seqids[1:],
                'seqid':
                seqids[0],
            },
            'sources': [{
                'peptides': []
            }]
        }
        proteins[seqids[0]] = protein

    # Index every protein under its primary and alternative seqids.
    protein_by_seqid = {}
    for seqid, protein in proteins.items():
        protein_by_seqid[seqid] = protein
        for alt_seqid in protein['attr']['other_seqids']:
            protein_by_seqid[alt_seqid] = protein

    n_peptide_matched = 0
    n_peptide_unmatched = 0
    for src_peptide in peptides:
        descriptions = src_peptide['protein description'].split(' / ')
        peptide_seqids = [d.split()[0] for d in descriptions]
        protein = None
        for peptide_seqid in peptide_seqids:
            if peptide_seqid in protein_by_seqid:
                protein = protein_by_seqid[peptide_seqid]
                break
        if protein is None:
            n_peptide_unmatched += 1
            continue
        n_peptide_matched += 1
        sequence = protein['sequence']
        # Only the modifications are taken from parse_peptide; the stored
        # sequence is the table's own 'base peptide sequence'.
        _, modifications = parse_peptide(
            src_peptide['peptide sequence'], modification_table)
        peptide_sequence = src_peptide['base peptide sequence']
        i = sequence.index(peptide_sequence)
        peptide = {
            'sequence': peptide_sequence,
            'attr': {
                'scan_id':
                src_peptide['scan number'],
                'retention_time':
                parse.round_decimal(src_peptide['retention time (min)'], 4),
                'morpheus_score':
                parse.round_decimal(src_peptide['morpheus score'], 4),
                'mass':
                parse.round_decimal(src_peptide['precursor mass (da)'], 4),
                'mass_diff':
                parse.round_decimal(src_peptide['precursor mass error (da)'],
                                    4),
                'm/z':
                parse.round_decimal(src_peptide['precursor m/z'], 4),
                'source':
                parse.basename(src_peptide['filename']),
            },
            'intensity': src_peptide['morpheus score'] / len(peptide_sequence),
            'i': i,
        }
        if modifications:
            for modification in modifications:
                modification['mass'] = parse.round_decimal(
                    modification['mass'], 4)
            peptide['attr']['modifications'] = modifications

        protein['sources'][0]['peptides'].append(peptide)

    if is_debug:
        dump = os.path.join(dump_dir, 'proteins.dump')
        logger.debug('Dumping proteins data structure to ' + dump)
        parse.save_data_dict(proteins, dump)

    # Report matched out of *all* PSMs seen, not matched versus unmatched.
    logger.info("Assigned {}/{} of PSMs.tsv to protein_groups.tsv".format(
        n_peptide_matched, n_peptide_matched + n_peptide_unmatched))

    return proteins
Esempio n. 4
0
def get_proteins_and_sources(in_dir, great_expect=1E-8, cutoff_expect=1E-2):
    """Load the result tables in *in_dir* and attach scan matches to proteins.

    Reads ``evidence.txt``, ``proteinGroups.txt``, ``peptides.txt`` and
    ``msms.txt`` from *in_dir* via ``parse.read_tsv``.  Every scan whose
    ``'pep'`` value does not exceed *cutoff_expect* produces one match per
    protein group listed in its ``'protein group ids'``; the match is stored
    under the source slot of the scan's evidence ``'raw file'``.

    Args:
        in_dir: directory containing the four tables.
        great_expect: passed to ``parse_proteins.calc_minus_log_intensity``.
        cutoff_expect: scans with ``pep`` above this are skipped; also passed
            to ``parse_proteins.calc_minus_log_intensity``.

    Returns:
        ``(proteins, sources)`` where ``proteins`` maps the first seqid of
        each protein group to its protein dict, and ``sources`` is the sorted
        list of raw-file names (as ``str``).
    """
    evidence_fname = os.path.join(in_dir, 'evidence.txt')
    logger.info('Loading evidence file: ' + evidence_fname)
    evidence_iter = parse.read_tsv(evidence_fname)
    evidence_dict = {int(e['id']): e for e in evidence_iter}

    sources_set = set(e['raw file'] for e in evidence_dict.values())
    sources = [str(s) for s in sorted(sources_set)]
    i_sources = {source: k for k, source in enumerate(sources)}

    protein_group_fname = os.path.join(in_dir, 'proteinGroups.txt')
    logger.info('Loading protein groups: ' + protein_group_fname)
    proteins = {}
    protein_by_group_id = {}
    for protein_group in parse.read_tsv(protein_group_fname):
        group_id = protein_group['id']
        protein = {
            'description': '',
            'attr': {
                'group_id': group_id,
                'other_seqids': [],
            },
            'sources': [{
                'matches': []
            } for _ in sources],
        }
        transfer_attrs(protein_group, protein['attr'], protein_parse_list)

        seqids = parse.splitter(protein_group['protein ids'])
        proteins[seqids[0]] = protein
        protein['attr']['seqid'] = seqids[0]
        protein['attr']['other_seqids'] = seqids[1:]
        # Key by int so the int(group_id) lookup below always agrees with it.
        protein_by_group_id[int(group_id)] = protein

    peptides_fname = os.path.join(in_dir, 'peptides.txt')
    logger.info('Loading peptides file: ' + peptides_fname)
    peptides_iter = parse.read_tsv(peptides_fname)
    peptides = {int(p['id']): p for p in peptides_iter}

    scans_fname = os.path.join(in_dir, 'msms.txt')
    logger.info('Loading scans and matching: ' + scans_fname)

    i_scan = 0
    for scan in parse.read_tsv(scans_fname):
        scan_id = int(scan['id'])
        i_scan += 1
        if i_scan % 5000 == 0:
            logger.info("{} scans processed".format(i_scan))
        evidence_id = int(scan['evidence id'])
        evidence = evidence_dict[evidence_id]

        peptide_id = int(scan['peptide id'])
        peptide = peptides[peptide_id]

        # The cutoff does not depend on the protein group, so reject the scan
        # before building its spectrum and modifications.
        if scan['pep'] > cutoff_expect:
            continue

        mod_seq = evidence['modified sequence']
        # i_sources is keyed by str(raw file); normalise the lookup likewise.
        i_source = i_sources[str(evidence['raw file'])]

        for group_id in parse.splitter(str(scan['protein group ids'])):
            match = {
                'sequence': scan['sequence'],
                'spectrum': get_labeled_spectrum(scan),
                'modifications': get_modifications(scan),
                'attr': {
                    'modified_sequence': mod_seq,
                    'mq_scan_id': scan_id,
                    'evidence_id': evidence_id,
                    'is_unique': peptide['unique (groups)'] == 'yes',
                }
            }

            match['intensity'] = parse_proteins.calc_minus_log_intensity(
                scan['pep'], great_expect, cutoff_expect)

            transfer_attrs(scan, match['attr'], scan_parse_list)
            transfer_attrs(evidence, match['attr'], evidence_parse_list)
            transfer_attrs(peptide, match['attr'], peptide_parse_list)
            change_key(match['attr'], 'scan number', 'scan_id')
            change_key(match['attr'], 'retention time', 'retention_time')

            protein = protein_by_group_id[int(group_id)]
            protein['sources'][i_source]['matches'].append(match)

    parse_proteins.count_matches(proteins)
    parse_proteins.delete_empty_proteins(proteins)

    return proteins, sources
Esempio n. 5
0
def read(in_dir):
    """Read the four result tables stored in *in_dir*.

    Returns a tuple ``(peptides, scans, protein_groups, evidence)`` holding
    whatever ``parse.read_tsv`` yields for ``peptides.txt``, ``msms.txt``,
    ``proteinGroups.txt`` and ``evidence.txt`` respectively.
    """
    def load(table_name):
        return parse.read_tsv(os.path.join(in_dir, table_name))

    return (
        load('peptides.txt'),
        load('msms.txt'),
        load('proteinGroups.txt'),
        load('evidence.txt'),
    )
Esempio n. 6
0
def get_proteins_and_sources(
    in_dir,
    great_expect=1E-8,
    cutoff_expect=1E-2):
  """Collect proteins and their scan matches from the tables in *in_dir*.

  The tables ``evidence.txt``, ``proteinGroups.txt``, ``peptides.txt`` and
  ``msms.txt`` are read with ``parse.read_tsv``.  A scan is kept only when
  its ``'pep'`` is not greater than *cutoff_expect*.  A kept scan yields one
  match for each id in its ``'protein group ids'``, filed under the source
  index of the raw file named by the scan's evidence row.

  Args:
    in_dir: directory holding the four tables.
    great_expect: forwarded to ``parse_proteins.calc_minus_log_intensity``.
    cutoff_expect: rejection threshold on ``pep``; also forwarded to
      ``parse_proteins.calc_minus_log_intensity``.

  Returns:
    ``(proteins, sources)``: a dict of protein dicts keyed by the first
    seqid of each group, and the sorted list of raw-file names as ``str``.
  """
  evidence_fname = os.path.join(in_dir, 'evidence.txt')
  logger.info('Loading evidence file: ' + evidence_fname)
  evidence_iter = parse.read_tsv(evidence_fname)
  evidence_dict = {int(e['id']): e for e in evidence_iter}

  sources_set = set(e['raw file'] for e in evidence_dict.values())
  sources = [str(s) for s in sorted(sources_set)]
  i_sources = {source: k for k, source in enumerate(sources)}

  protein_group_fname = os.path.join(in_dir, 'proteinGroups.txt')
  logger.info('Loading protein groups: ' + protein_group_fname)
  proteins = {}
  protein_by_group_id = {}
  for protein_group in parse.read_tsv(protein_group_fname):
    group_id = protein_group['id']
    protein = {
      'description': '',
      'attr': {
        'group_id': group_id,
        'other_seqids': [],
      },
      'sources': [{'matches': []} for _ in sources],
    }
    transfer_attrs(protein_group, protein['attr'], protein_parse_list)

    seqids = parse.splitter(protein_group['protein ids'])
    proteins[seqids[0]] = protein
    protein['attr']['seqid'] = seqids[0]
    protein['attr']['other_seqids'] = seqids[1:]
    # Scans look groups up with int(group_id); store the key the same way.
    protein_by_group_id[int(group_id)] = protein

  peptides_fname = os.path.join(in_dir, 'peptides.txt')
  logger.info('Loading peptides file: ' + peptides_fname)
  peptides_iter = parse.read_tsv(peptides_fname)
  peptides = {int(p['id']): p for p in peptides_iter}

  scans_fname = os.path.join(in_dir, 'msms.txt')
  logger.info('Loading scans and matching: ' + scans_fname)

  i_scan = 0
  for scan in parse.read_tsv(scans_fname):
    scan_id = int(scan['id'])
    i_scan += 1
    if i_scan % 5000 == 0:
      logger.info("{} scans processed".format(i_scan))
    evidence_id = int(scan['evidence id'])
    evidence = evidence_dict[evidence_id]

    peptide_id = int(scan['peptide id'])
    peptide = peptides[peptide_id]

    # Filter once per scan, before any per-group match is assembled.
    if scan['pep'] > cutoff_expect:
      continue

    mod_seq = evidence['modified sequence']
    # Source names were stringified when i_sources was built.
    i_source = i_sources[str(evidence['raw file'])]

    for group_id in parse.splitter(str(scan['protein group ids'])):
      match = {
        'sequence': scan['sequence'],
        'spectrum': get_labeled_spectrum(scan),
        'modifications': get_modifications(scan),
        'attr': {
          'modified_sequence': mod_seq,
          'mq_scan_id': scan_id,
          'evidence_id': evidence_id,
          'is_unique': peptide['unique (groups)'] == 'yes',
        }
      }

      match['intensity'] = parse_proteins.calc_minus_log_intensity(
        scan['pep'], great_expect, cutoff_expect)

      transfer_attrs(scan, match['attr'], scan_parse_list)
      transfer_attrs(evidence, match['attr'], evidence_parse_list)
      transfer_attrs(peptide, match['attr'], peptide_parse_list)
      change_key(match['attr'], 'scan number', 'scan_id')
      change_key(match['attr'], 'retention time', 'retention_time')

      protein = protein_by_group_id[int(group_id)]
      protein['sources'][i_source]['matches'].append(match)

  parse_proteins.count_matches(proteins)
  parse_proteins.delete_empty_proteins(proteins)

  return proteins, sources
Esempio n. 7
0
def get_proteins_and_sources(
      protein_groups_fname,
      psm_fname,
      modifications_fname=None,
      q_good=0.0,
      q_cutoff=10):
  """Build proteins from a protein-groups table and assign PSM matches.

  Each protein group becomes a protein via ``make_protein`` and is keyed by
  its ``attr['seqid']``.  Each PSM becomes a match via ``make_match``.
  Matches whose ``attr['q_value']`` exceeds *q_cutoff* are dropped.  A
  surviving match goes to the first protein that is named in the PSM's
  ``'protein description'`` and whose sequence contains the match sequence.
  PSMs with no such protein are logged at debug level and skipped.

  Args:
    protein_groups_fname: protein-groups TSV; its directory receives the
      ``*.dump`` files.
    psm_fname: PSM TSV.
    modifications_fname: optional TSV for ``read_modification_dict``.
    q_good: forwarded to ``parse_proteins.calc_intensity``.
    q_cutoff: q-value rejection threshold; also forwarded to
      ``parse_proteins.calc_intensity``.

  Returns:
    ``(proteins, sources)``: the protein dict and the ``sources`` list that
    is handed to ``get_i_source`` for every assigned match.
  """
  is_debug = logger.root.level <= logging.DEBUG

  dump_dir = os.path.dirname(protein_groups_fname)

  modification_table = {}
  if modifications_fname:
    modification_table = read_modification_dict(modifications_fname)

  proteins = {}
  dict_dump_writer = parse.DictListWriter(
      is_debug, os.path.join(dump_dir, 'protein_groups.dump'))
  try:
    for i_group, protein_group in enumerate(parse.read_tsv(protein_groups_fname)):
      protein = make_protein(i_group, protein_group)
      proteins[protein['attr']['seqid']] = protein
      dict_dump_writer.dump_dict(protein_group)
  finally:
    # Close the writer even when a row fails to parse.
    dict_dump_writer.close()

  # Index every protein under its primary and alternative seqids.
  protein_by_seqid = {}
  for seqid, protein in proteins.items():
    protein_by_seqid[seqid] = protein
    for alt_seqid in protein['attr']['other_seqids']:
      protein_by_seqid[alt_seqid] = protein

  n_match = 0
  n_match_assigned = 0
  sources = []
  dict_dump_writer = parse.DictListWriter(
      is_debug, os.path.join(dump_dir, 'peptides.dump'))
  try:
    for psm in parse.read_tsv(psm_fname):
      dict_dump_writer.dump_dict(psm)

      match = make_match(psm, modification_table)
      q_value = match['attr']['q_value']
      if q_value > q_cutoff:
        continue
      # Intensity is only needed for matches that survive the cutoff.
      match['intensity'] = parse_proteins.calc_intensity(
          q_value, q_good, q_cutoff)
      peptide_sequence = match['sequence']

      n_match += 1

      descriptions = psm['protein description'].split(' / ')
      peptide_seqids = [d.split()[0] for d in descriptions]
      protein = None
      for peptide_seqid in peptide_seqids:
        if peptide_seqid in protein_by_seqid:
          candidate = protein_by_seqid[peptide_seqid]
          if peptide_sequence in candidate['sequence']:
            protein = candidate
            break
      if protein is None:
        logger.debug("Couldn't find protein for %s" % (peptide_sequence))
        continue
      match['i'] = protein['sequence'].find(peptide_sequence)

      n_match_assigned += 1
      i_source = get_i_source(proteins, sources, psm['filename'])
      protein['sources'][i_source]['matches'].append(match)
  finally:
    dict_dump_writer.close()

  if is_debug:
    dump = os.path.join(dump_dir, 'proteins.dump')
    logger.debug('Dumping proteins data structure to ' + dump)
    parse.save_data_dict(proteins, dump)

  logger.info("Assigned {}/{} of PSMs.tsv to protein_groups.tsv".format(n_match_assigned, n_match))

  return proteins, sources
Esempio n. 8
0
def read(in_dir):
  """Load peptides, scans, protein groups and evidence from *in_dir*.

  Each table is read with ``parse.read_tsv``.  The result is the 4-tuple
  ``(peptides, scans, protein_groups, evidence)`` taken from
  ``peptides.txt``, ``msms.txt``, ``proteinGroups.txt`` and ``evidence.txt``.
  """
  table_names = ('peptides.txt', 'msms.txt', 'proteinGroups.txt', 'evidence.txt')
  return tuple(
    parse.read_tsv(os.path.join(in_dir, table_name))
    for table_name in table_names)
Esempio n. 9
0
def get_proteins_and_sources(in_dir, is_leu_ile_isomeric=False):
    """Load the tables in *in_dir* and attach every scan to its proteins.

    Reads ``evidence.txt``, ``proteinGroups.txt``, ``peptides.txt`` and
    ``msms.txt`` with ``parse.read_tsv``.  Each scan produces one peptide
    entry per id in its ``"protein group ids"``, appended to the
    ``"peptides"`` list of the source slot for the evidence's ``"raw file"``.

    Args:
        in_dir: directory containing the four tables.
        is_leu_ile_isomeric: accepted for interface compatibility; not used
            by this function.

    Returns:
        ``(proteins, sources)``: protein dicts keyed by the first seqid of
        each group, and the sorted list of raw-file names as ``str``.
    """
    evidence_fname = os.path.join(in_dir, "evidence.txt")
    logger.info("Loading evidence file: " + evidence_fname)
    evidence_iter = parse.read_tsv(evidence_fname)
    evidence_dict = {int(e["id"]): e for e in evidence_iter}

    sources_set = set(e["raw file"] for e in evidence_dict.values())
    sources = [str(s) for s in sorted(sources_set)]
    i_sources = {source: k for k, source in enumerate(sources)}

    protein_group_fname = os.path.join(in_dir, "proteinGroups.txt")
    logger.info("Loading protein groups: " + protein_group_fname)
    proteins = {}
    protein_by_group_id = {}
    for protein_group in parse.read_tsv(protein_group_fname):
        group_id = protein_group["id"]
        protein = {
            "description": "",
            "attr": {"group_id": group_id, "other_seqids": []},
            "sources": [{"peptides": []} for _ in sources],
        }
        transfer_attrs(protein_group, protein["attr"], protein_parse_list)

        seqids = parse.splitter(protein_group["protein ids"])
        proteins[seqids[0]] = protein
        protein["attr"]["seqid"] = seqids[0]
        protein["attr"]["other_seqids"] = seqids[1:]
        # Stored as int because scans resolve groups via int(group_id).
        protein_by_group_id[int(group_id)] = protein

    peptides_fname = os.path.join(in_dir, "peptides.txt")
    logger.info("Loading peptides file: " + peptides_fname)
    peptides_iter = parse.read_tsv(peptides_fname)
    peptides = {int(p["id"]): p for p in peptides_iter}

    scans_fname = os.path.join(in_dir, "msms.txt")
    logger.info("Loading scans and matching: " + scans_fname)
    i_scan = 0
    for scan in parse.read_tsv(scans_fname):
        scan_id = int(scan["id"])
        i_scan += 1
        if i_scan % 5000 == 0:
            logger.info("{} scans processed".format(i_scan))
        evidence_id = int(scan["evidence id"])
        evidence = evidence_dict[evidence_id]

        peptide_id = int(scan["peptide id"])
        peptide = peptides[peptide_id]
        for group_id in parse.splitter(str(scan["protein group ids"])):
            new_peptide = {
                "sequence": scan["sequence"],
                "spectrum": get_labeled_spectrum(scan),
                "attr": {
                    "modifications": [],
                    "mq_scan_id": scan_id,
                    "is_unique": peptide["unique (groups)"] == "yes",
                },
            }
            transfer_attrs(scan, new_peptide["attr"], scan_parse_list)
            transfer_attrs(evidence, new_peptide["attr"], evidence_parse_list)
            transfer_attrs(peptide, new_peptide["attr"], peptide_parse_list)
            change_key(new_peptide["attr"], "scan number", "scan_id")
            change_key(new_peptide["attr"], "retention time", "retention_time")

            protein = protein_by_group_id[int(group_id)]
            # i_sources keys are str(raw file); look up with the same form.
            i_source = i_sources[str(evidence["raw file"])]
            protein["sources"][i_source]["peptides"].append(new_peptide)

    parse_proteins.count_peptides(proteins)

    return proteins, sources