Exemple #1
0
def make_peptide(pepxml_match, pepxml_scan, source):
    """Assemble a peptide record from one pepXML search hit and its scan.

    Args:
        pepxml_match: mapping describing a single search hit.  Required keys
            are 'peptide', 'modified_sequence', 'probability', 'fpe',
            'expect', 'modifications', 'num_missed_cleavages', 'massdiff',
            'num_matched_ions' and 'tot_num_ions'; 'ionscore',
            'homologyscore' and 'identityscore' are copied when present.
        pepxml_scan: mapping describing the enclosing scan.  Required keys
            are 'index', 'start_scan', 'assumed_charge' and
            'precursor_neutral_mass'; 'retention_time_sec' is copied when
            present.
        source: name of the originating file, stored after being passed
            through ``parse.basename``.

    Returns:
        dict with the keys 'sequence', 'modified_sequence', 'intensity',
        'mask' and a nested 'attr' dict.

    Raises:
        KeyError: if any required key is absent.
    """
    peptide = dict(
        sequence=pepxml_match['peptide'],
        modified_sequence=pepxml_match['modified_sequence'],
        # The match probability doubles as the display intensity.
        intensity=pepxml_match['probability'],
        mask=pepxml_match['fpe'],
        attr=dict(
            pepxml_id=pepxml_scan['index'],
            scan_id=pepxml_scan['start_scan'],
            charge=pepxml_scan['assumed_charge'],
            expect=pepxml_match['expect'],
            modifications=pepxml_match['modifications'],
            probability=pepxml_match['probability'],
            missed_cleavages=pepxml_match['num_missed_cleavages'],
            mass=pepxml_scan['precursor_neutral_mass'],
            mass_diff=pepxml_match['massdiff'],
            source=parse.basename(source),
        ),
    )
    attr = peptide['attr']

    # (attr key, key in the source record, source record); each entry is
    # copied only when the source record actually carries the key.
    optional_fields = (
        ('retention_time', 'retention_time_sec', pepxml_scan),
        ('score', 'ionscore', pepxml_match),
        ('homology', 'homologyscore', pepxml_match),
        ('identity', 'identityscore', pepxml_match),
    )
    for attr_key, record_key, record in optional_fields:
        if record_key in record:
            attr[attr_key] = record[record_key]

    # Rendered as "<matched>/<total>".
    attr['matched_ions'] = '/'.join([
        str(pepxml_match['num_matched_ions']),
        str(pepxml_match['tot_num_ions']),
    ])
    return peptide
Exemple #2
0
def make_peptide(pepxml_match, pepxml_scan, source):
  """Turn a pepXML search hit plus its scan into a peptide dict.

  Args:
    pepxml_match: mapping for one search hit; must provide 'peptide',
      'modified_sequence', 'probability', 'fpe', 'expect', 'modifications',
      'num_missed_cleavages', 'massdiff', 'num_matched_ions' and
      'tot_num_ions'.  'ionscore', 'homologyscore' and 'identityscore' are
      optional.
    pepxml_scan: mapping for the scan; must provide 'index', 'start_scan',
      'assumed_charge' and 'precursor_neutral_mass'.  'retention_time_sec'
      is optional.
    source: file name recorded (via parse.basename) under attr['source'].

  Returns:
    dict with 'sequence', 'modified_sequence', 'intensity', 'mask' and
    'attr' entries.

  Raises:
    KeyError: when a required key is missing from either mapping.
  """
  hit, scan = pepxml_match, pepxml_scan

  peptide = {
    'sequence': hit['peptide'],
    'modified_sequence': hit['modified_sequence'],
    'intensity': hit['probability'],  # probability reused as intensity
    'mask': hit['fpe'],
  }
  attr = peptide['attr'] = {
    'pepxml_id': scan['index'],
    'scan_id': scan['start_scan'],
    'charge': scan['assumed_charge'],
    'expect': hit['expect'],
    'modifications': hit['modifications'],
    'probability': hit['probability'],
    'missed_cleavages': hit['num_missed_cleavages'],
    'mass': scan['precursor_neutral_mass'],
    'mass_diff': hit['massdiff'],
    'source': parse.basename(source),
  }

  # Optional fields are copied only if the underlying record has them.
  if 'retention_time_sec' in scan:
    attr['retention_time'] = scan['retention_time_sec']
  for attr_key, hit_key in (('score', 'ionscore'),
                            ('homology', 'homologyscore'),
                            ('identity', 'identityscore')):
    if hit_key in hit:
      attr[attr_key] = hit[hit_key]

  # "<matched>/<total>" ion count summary.
  attr['matched_ions'] = '%s/%s' % (hit['num_matched_ions'],
                                    hit['tot_num_ions'])
  return peptide
Exemple #3
0
def make_match(psm, modification_table):
    """Convert one PSM row into a match dict.

    Args:
        psm: mapping of column name to value for a single PSM row.  The
            columns read are 'peptide sequence', 'base peptide sequence',
            'q-value (%)', 'morpheus score', 'precursor charge',
            'precursor mass (da)', 'precursor mass error (da)',
            'precursor m/z', 'filename' and 'missed cleavages'.  The scan id
            comes from 'scan number' or 'spectrum number', and the retention
            time from 'retention time (min)' or 'retention time (minutes)',
            whichever is present first; both fall back to ''.
        modification_table: passed straight to ``parse_peptide``.

    Returns:
        dict with 'sequence', 'attr', 'modifications', 'intensity' (always
        1.0) and 'i' (always -1).  When ``parse_peptide`` reports
        modifications, their masses are rounded in place, they are stored
        under 'modifications', and attr['modified_sequence'] is set.
    """

    def rounded(value):
        # Every numeric attribute is kept to 4 decimals.
        return parse.round_decimal(value, 4)

    annotated_sequence = psm['peptide sequence']
    parsed_sequence, modifications = parse_peptide(
        annotated_sequence, modification_table)
    peptide_sequence = psm['base peptide sequence']
    if parsed_sequence != peptide_sequence:
        mismatch = ' '.join(
            [annotated_sequence, parsed_sequence, peptide_sequence])
        logger.debug("Peptide sequences don't match: " + mismatch)

    q_value = float(psm['q-value (%)'])

    # First column that exists wins; '' when neither is present.
    scan_id = next(
        (psm[key] for key in ('scan number', 'spectrum number')
         if key in psm),
        '')

    retention_time = ''
    for key in ('retention time (min)', 'retention time (minutes)'):
        if key in psm:
            retention_time = rounded(psm[key])
            break

    match = {
        'sequence': peptide_sequence,
        'attr': {
            'scan_id': scan_id,
            'retention_time': retention_time,
            'morpheus_score': rounded(psm['morpheus score']),
            'charge': int(psm['precursor charge']),
            'mass': rounded(psm['precursor mass (da)']),
            'mass_diff': rounded(psm['precursor mass error (da)']),
            'm/z': rounded(psm['precursor m/z']),
            'source': parse.basename(psm['filename']),
            'missed_cleavages': int(psm['missed cleavages']),
            'q_value': q_value,
        },
        'modifications': [],
        'intensity': 1.0,
        'i': -1,
    }

    if not modifications:
        return match

    for modification in modifications:
        modification['mass'] = rounded(modification['mass'])
    match['modifications'] = modifications
    # The annotated sequence is split on '.' and the second piece is kept
    # as the modified sequence.
    match['attr']['modified_sequence'] = psm['peptide sequence'].split('.')[1]
    return match
Exemple #4
0
def make_peptide(pepxml_peptide, pepxml_scan, source):
  """Build a peptide dict from a pepXML peptide hit and its scan.

  Args:
    pepxml_peptide: mapping for the hit; must provide 'peptide', 'expect',
      'modifications', 'num_matched_ions', 'tot_num_ions', 'probability',
      'num_missed_cleavages' and 'massdiff'.
    pepxml_scan: mapping for the scan; must provide 'index', 'start_scan',
      'retention_time_sec' and 'precursor_neutral_mass'.
    source: file name stored (via parse.basename) under attr['source'].

  Returns:
    dict with 'sequence', 'attr' and 'intensity'; the intensity is the
    hit's probability.

  Raises:
    KeyError: when a required key is missing.
  """
  hit, scan = pepxml_peptide, pepxml_scan
  return {
    'sequence': hit['peptide'],
    'attr': {
      'pepxml_id': scan['index'],
      'scan_id': scan['start_scan'],
      'expect': hit['expect'],
      'retention_time': scan['retention_time_sec'],
      'modifications': hit['modifications'],
      'source': parse.basename(source),
      # "<matched>/<total>" ion count summary.
      'matched_ions': '%s/%s' % (hit['num_matched_ions'],
                                 hit['tot_num_ions']),
      'probability': hit['probability'],
      'missed_cleavages': hit['num_missed_cleavages'],
      'mass': scan['precursor_neutral_mass'],
      'mass_diff': hit['massdiff'],
    },
    'intensity': hit['probability'],
  }
Exemple #5
0
def make_peptide(pepxml_peptide, pepxml_scan, source):
    """Create a peptide record from a pepXML peptide hit and its scan.

    Args:
        pepxml_peptide: mapping for the hit.  Keys read: 'peptide',
            'expect', 'modifications', 'num_matched_ions', 'tot_num_ions',
            'probability', 'num_missed_cleavages', 'massdiff'.
        pepxml_scan: mapping for the scan.  Keys read: 'index',
            'start_scan', 'retention_time_sec', 'precursor_neutral_mass'.
        source: file name; ``parse.basename(source)`` is stored under
            attr['source'].

    Returns:
        dict with 'sequence', 'attr' and 'intensity' (the hit probability).

    Raises:
        KeyError: if one of the keys listed above is missing.
    """
    peptide = {'sequence': pepxml_peptide['peptide'], 'attr': {}}
    attr = peptide['attr']

    # (attr key, source record, key in source record), in output order.
    leading_fields = (
        ('pepxml_id', pepxml_scan, 'index'),
        ('scan_id', pepxml_scan, 'start_scan'),
        ('expect', pepxml_peptide, 'expect'),
        ('retention_time', pepxml_scan, 'retention_time_sec'),
        ('modifications', pepxml_peptide, 'modifications'),
    )
    for attr_key, record, record_key in leading_fields:
        attr[attr_key] = record[record_key]

    attr['source'] = parse.basename(source)
    # Rendered as "<matched>/<total>".
    attr['matched_ions'] = '/'.join(
        str(pepxml_peptide[key])
        for key in ('num_matched_ions', 'tot_num_ions'))

    trailing_fields = (
        ('probability', pepxml_peptide, 'probability'),
        ('missed_cleavages', pepxml_peptide, 'num_missed_cleavages'),
        ('mass', pepxml_scan, 'precursor_neutral_mass'),
        ('mass_diff', pepxml_peptide, 'massdiff'),
    )
    for attr_key, record, record_key in trailing_fields:
        attr[attr_key] = record[record_key]

    peptide['intensity'] = pepxml_peptide['probability']
    return peptide
Exemple #6
0
def get_proteins(protein_groups_fname, psm_fname, modifications_fname=None):
    """Build a proteins dictionary from protein-group and PSM TSV files.

    Every protein group becomes one protein entry keyed by the first seqid
    of the group; the remaining seqids of the group act as aliases.  Each
    PSM row is attached to the first protein whose seqid (or alias) appears
    in the row's protein description.

    Args:
        protein_groups_fname: path of the protein-groups TSV file.  When the
            root logger is at DEBUG level, dump files are written to the
            same directory.
        psm_fname: path of the PSM TSV file.
        modifications_fname: optional path handed to
            ``read_modification_dict``; when ``None`` an empty modification
            table is used.

    Returns:
        dict mapping seqid -> protein dict with the keys 'description',
        'sequence', 'attr' and 'sources'; assigned peptides are appended to
        ``protein['sources'][0]['peptides']``.
    """
    is_debug = logger.root.level <= logging.DEBUG
    dump_dir = os.path.dirname(protein_groups_fname)
    if modifications_fname is not None:
        modification_table = read_modification_dict(modifications_fname)
    else:
        modification_table = {}
    peptides = parse.read_tsv(psm_fname)
    protein_groups = parse.read_tsv(protein_groups_fname)

    if is_debug:
        dump = os.path.join(dump_dir, 'peptides.dump')
        logger.debug('Dumping peptides data structure to ' + dump)
        parse.save_data_dict(peptides, dump)
        dump = os.path.join(dump_dir, 'protein_groups.dump')
        logger.debug('Dumping protein_groups data structure to ' + dump)
        parse.save_data_dict(protein_groups, dump)

    proteins = {}
    for i_group, protein_group in enumerate(protein_groups):
        descriptions = protein_group['protein description'].split(' / ')
        seqids = [desc.split()[0] for desc in descriptions]
        for seqid in seqids:
            if seqid in proteins:
                # The seqid needs a %s placeholder: an extra positional
                # argument without one makes logging fail to format the
                # record and the seqid is never shown.
                logger.warning(
                    "Different protein groups claim same first seqid %s",
                    seqid)
        protein = {
            'description': descriptions[0],
            'sequence': protein_group['protein sequence'],
            'attr': {
                'coverage':
                protein_group['protein sequence coverage (%)'],
                'morpheus-score':
                parse.round_decimal(protein_group['summed morpheus score'], 4),
                'i_group':
                i_group,
                'other_seqids':
                seqids[1:],
                'seqid':
                seqids[0],
            },
            'sources': [{
                'peptides': []
            }]
        }
        proteins[seqids[0]] = protein

    # Index every protein under its primary seqid and all of its aliases.
    protein_by_seqid = {}
    for seqid, protein in proteins.items():
        protein_by_seqid[seqid] = protein
        for alt_seqid in protein['attr']['other_seqids']:
            protein_by_seqid[alt_seqid] = protein

    n_peptide = 0
    n_peptide_matched = 0
    for src_peptide in peptides:
        n_peptide += 1
        descriptions = src_peptide['protein description'].split(' / ')
        peptide_seqids = [d.split()[0] for d in descriptions]
        protein = None
        for peptide_seqid in peptide_seqids:
            if peptide_seqid in protein_by_seqid:
                protein = protein_by_seqid[peptide_seqid]
                break
        if protein is None:
            continue

        # Only the modifications are taken from the annotated sequence; the
        # bare sequence comes from its own column.
        _, modifications = parse_peptide(
            src_peptide['peptide sequence'], modification_table)
        peptide_sequence = src_peptide['base peptide sequence']

        # find() rather than index(): one PSM that cannot be located in the
        # protein sequence is reported and skipped instead of aborting the
        # whole parse with a ValueError.
        i = protein['sequence'].find(peptide_sequence)
        if i < 0:
            logger.warning('%s not found in %s', peptide_sequence,
                           protein['attr']['seqid'])
            continue

        peptide = {
            'sequence': peptide_sequence,
            'attr': {
                'scan_id':
                src_peptide['scan number'],
                'retention_time':
                parse.round_decimal(src_peptide['retention time (min)'], 4),
                'morpheus_score':
                parse.round_decimal(src_peptide['morpheus score'], 4),
                'mass':
                parse.round_decimal(src_peptide['precursor mass (da)'], 4),
                'mass_diff':
                parse.round_decimal(src_peptide['precursor mass error (da)'],
                                    4),
                'm/z':
                parse.round_decimal(src_peptide['precursor m/z'], 4),
                'source':
                parse.basename(src_peptide['filename']),
            },
            'intensity': src_peptide['morpheus score'] / len(peptide_sequence),
            'i': i,
        }
        if modifications:
            for modification in modifications:
                modification['mass'] = parse.round_decimal(
                    modification['mass'], 4)
            peptide['attr']['modifications'] = modifications

        protein['sources'][0]['peptides'].append(peptide)
        n_peptide_matched += 1

    if is_debug:
        dump = os.path.join(dump_dir, 'proteins.dump')
        logger.debug('Dumping proteins data structure to ' + dump)
        parse.save_data_dict(proteins, dump)

    # Report assigned PSMs out of all PSMs read (not out of the unmatched).
    logger.info("Assigned {}/{} of PSMs.tsv to protein_groups.tsv".format(
        n_peptide_matched, n_peptide))

    return proteins
Exemple #7
0
def get_proteins(protein_groups_fname, psm_fname, modifications_fname=None):
    """Build a proteins dictionary by streaming protein-group and PSM TSVs.

    Protein groups are read row by row from ``protein_groups_fname``; each
    group becomes one protein entry keyed by the first seqid of the group,
    with the remaining seqids kept as aliases.  PSM rows are then streamed
    from ``psm_fname`` and each is attached to the first protein whose
    seqid (or alias) appears in the row's protein description.  Every row
    read is also handed to a ``DictListWriter`` dump next to the
    protein-groups file.

    Args:
        protein_groups_fname: path of the protein-groups TSV file; dump
            files are placed in its directory.
        psm_fname: path of the PSM TSV file.
        modifications_fname: optional path handed to
            ``read_modification_dict``; when ``None`` an empty modification
            table is used.

    Returns:
        dict mapping seqid -> protein dict with the keys 'description',
        'sequence', 'other_sequences', 'attr' and 'sources'; located
        peptides are appended to ``protein['sources'][0]['peptides']``.
    """
    is_debug = logger.root.level <= logging.DEBUG
    dump_dir = os.path.dirname(protein_groups_fname)

    if modifications_fname is not None:
        modification_table = read_modification_dict(modifications_fname)
    else:
        modification_table = {}

    proteins = {}
    dict_dump_writer = DictListWriter(
        is_debug, os.path.join(dump_dir, 'protein_groups.dump'))
    # try/finally so the dump writer is closed even when a row is malformed.
    try:
        for i_group, protein_group in enumerate(
                read_tsv_iter(protein_groups_fname)):
            descriptions = protein_group['protein description'].split(' / ')
            coverage_str = str(protein_group['protein sequence coverage (%)'])
            # The coverage cell may hold several values separated by ';' or
            # '/'; get_first presumably returns the first one (TODO confirm).
            delimiter = ';' if ';' in coverage_str else '/'
            coverage = float(get_first(coverage_str, delimiter))
            seqs = protein_group['protein sequence'].split('/')
            seqids = [desc.split()[0] for desc in descriptions]
            for seqid in seqids:
                if seqid in proteins:
                    # The seqid needs a %s placeholder: an extra positional
                    # argument without one makes logging fail to format the
                    # record and the seqid is never shown.
                    logger.warning(
                        "Different protein groups claim same first seqid %s",
                        seqid)
            protein = {
                'description': descriptions[0],
                'sequence': seqs[0],
                'other_sequences': seqs[1:],
                'attr': {
                    'coverage':
                    parse.round_decimal(coverage, 4),
                    'morpheus-score':
                    parse.round_decimal(
                        protein_group['summed morpheus score'], 4),
                    'i_group':
                    i_group,
                    'other_seqids':
                    seqids[1:],
                    'seqid':
                    seqids[0],
                },
                'sources': [{
                    'peptides': []
                }]
            }
            proteins[seqids[0]] = protein
            dict_dump_writer.dump_dict(protein_group)
    finally:
        dict_dump_writer.close()

    # Index every protein under its primary seqid and all of its aliases.
    protein_by_seqid = {}
    for seqid, protein in proteins.items():
        protein_by_seqid[seqid] = protein
        for alt_seqid in protein['attr']['other_seqids']:
            protein_by_seqid[alt_seqid] = protein

    dict_dump_writer = DictListWriter(is_debug,
                                      os.path.join(dump_dir, 'peptides.dump'))
    n_peptide = 0
    n_peptide_matched = 0
    try:
        for src_peptide in read_tsv_iter(psm_fname):
            dict_dump_writer.dump_dict(src_peptide)
            descriptions = src_peptide['protein description'].split(' / ')
            peptide_seqids = [d.split()[0] for d in descriptions]
            protein = None
            for peptide_seqid in peptide_seqids:
                if peptide_seqid in protein_by_seqid:
                    protein = protein_by_seqid[peptide_seqid]
                    break
            n_peptide += 1
            if protein is None:
                continue
            n_peptide_matched += 1
            sequence = protein['sequence']
            extracted_peptide_sequence, modifications = parse_peptide(
                src_peptide['peptide sequence'], modification_table)
            peptide_sequence = src_peptide['base peptide sequence']
            if extracted_peptide_sequence != peptide_sequence:
                logger.warning("Peptide sequences don't match: %s %s %s",
                               src_peptide['peptide sequence'],
                               extracted_peptide_sequence, peptide_sequence)
            # A PSM that cannot be located in the protein is skipped.
            i = sequence.find(peptide_sequence)
            if i < 0:
                logger.warning('%s not found in %s', peptide_sequence,
                               protein['attr']['seqid'])
                continue
            q_value = float(src_peptide['q-value (%)'])

            # Column names differ between input files; take whichever
            # exists, falling back to ''.
            if 'scan number' in src_peptide:
                scan_id = src_peptide['scan number']
            elif 'spectrum number' in src_peptide:
                scan_id = src_peptide['spectrum number']
            else:
                scan_id = ''
            if 'retention time (min)' in src_peptide:
                retention_time = parse.round_decimal(
                    src_peptide['retention time (min)'], 4)
            elif 'retention time (minutes)' in src_peptide:
                retention_time = parse.round_decimal(
                    src_peptide['retention time (minutes)'], 4)
            else:
                retention_time = ''

            peptide = {
                'sequence': peptide_sequence,
                'attr': {
                    'scan_id':
                    scan_id,
                    'retention_time':
                    retention_time,
                    'morpheus_score':
                    parse.round_decimal(src_peptide['morpheus score'], 4),
                    'mass':
                    parse.round_decimal(src_peptide['precursor mass (da)'],
                                        4),
                    'mass_diff':
                    parse.round_decimal(
                        src_peptide['precursor mass error (da)'], 4),
                    'm/z':
                    parse.round_decimal(src_peptide['precursor m/z'], 4),
                    'source':
                    parse.basename(src_peptide['filename']),
                    'q_value':
                    q_value,
                },
                # q-value is a percentage; lower q-value -> higher intensity.
                'intensity': 1.0 - q_value / 100.0,
                'i': i,
            }
            if modifications:
                for modification in modifications:
                    modification['mass'] = parse.round_decimal(
                        modification['mass'], 4)
                peptide['attr']['modifications'] = modifications

            protein['sources'][0]['peptides'].append(peptide)
    finally:
        dict_dump_writer.close()

    if is_debug:
        dump = os.path.join(dump_dir, 'proteins.dump')
        logger.debug('Dumping proteins data structure to ' + dump)
        parse.save_data_dict(proteins, dump)

    logger.info("Assigned {}/{} of PSMs.tsv to protein_groups.tsv".format(
        n_peptide_matched, n_peptide))

    return proteins
Exemple #8
0
def get_proteins(xtandem_fname,
                 n_peak=50,
                 good_expect=1E-8,
                 cutoff_expect=1E-2):
    """Collect peptide matches from an X!Tandem file, grouped by protein.

    Args:
        xtandem_fname: path handed to ``read_xtandem``; its basename is
            recorded as the source of every match.
        n_peak: number of most intense (mass, intensity) peaks of the scan
            stored with each match.
        good_expect: forwarded to
            ``proteins_module.calc_minus_log_intensity``.
        cutoff_expect: matches whose expect value is greater than this are
            skipped; also forwarded to ``calc_minus_log_intensity``.

    Returns:
        dict mapping seqid -> protein, as created by
        ``proteins_module.new_protein`` and filled with 'sequence',
        'description' and the matches appended to
        ``protein['sources'][0]['matches']``.
        ``proteins_module.calculate_peptide_positions`` is applied before
        returning.
    """
    proteins = {}
    i_source = 0
    # Same for every match, so computed once instead of per match.
    source_name = parse.basename(xtandem_fname)

    for scan in read_xtandem(xtandem_fname):
        scan_id = scan['id']
        x_vals = map(float, scan['masses'].split())
        y_vals = map(float, scan['intensities'].split())
        # (mass, intensity) pairs, most intense first.
        ions = sorted(zip(x_vals, y_vals), key=lambda ion: -ion[1])

        for xtandem_match in scan['matches']:
            expect = xtandem_match['expect']
            if cutoff_expect < expect:
                continue

            intensity = proteins_module.calc_minus_log_intensity(
                expect, good_expect, cutoff_expect)

            seqid = xtandem_match['seqid']
            if seqid not in proteins:
                protein = proteins_module.new_protein(seqid)
                protein.update({
                    'sequence': xtandem_match['sequence'],
                    'description': xtandem_match['description'],
                })
                proteins[seqid] = protein
            source = proteins[seqid]['sources'][i_source]

            match = {
                'sequence': xtandem_match['seq'],
                'intensity': intensity,
                'modifications': [],
                # Fresh slice per match so matches never share a list.
                'spectrum': ions[:n_peak],
                'attr': {
                    'scan_id': scan_id,
                    'charge': scan['charge'],
                    'expect': expect,
                    'missed_cleavages': xtandem_match['missed_cleavages'],
                    'mass': scan['mass'],
                    'source': source_name,
                }
            }

            if xtandem_match['modifications']:
                # 'start' and 'at' are 1-based; convert to 0-based so the
                # modification index is relative to the peptide start.
                i_pep_seq = int(xtandem_match['start']) - 1
                for mod in xtandem_match['modifications']:
                    i_mod_in_full_seq = int(mod['at']) - 1
                    aa = mod['type']
                    # The residue mass is added to the modification delta;
                    # unknown residue types contribute nothing.
                    if aa in peptidemass.aa_monoisotopic_mass:
                        mass = peptidemass.aa_monoisotopic_mass[aa]
                    else:
                        mass = 0.0
                    match['modifications'].append({
                        'i': i_mod_in_full_seq - i_pep_seq,
                        'mass': mod['modified'] + mass,
                    })

            source['matches'].append(match)

    proteins_module.calculate_peptide_positions(proteins)

    return proteins
Exemple #9
0
def get_proteins(protein_groups_fname, psm_fname, modifications_fname=None):
  """Build a proteins dictionary by streaming protein-group and PSM TSVs.

  Protein groups are read row by row from ``protein_groups_fname``; each
  group becomes one protein entry keyed by the first seqid of the group,
  with the remaining seqids kept as aliases.  PSM rows are then streamed
  from ``psm_fname`` and each is attached to the first protein whose seqid
  (or alias) appears in the row's protein description.  Every row read is
  also handed to a ``DictListWriter`` dump next to the protein-groups file.

  Args:
    protein_groups_fname: path of the protein-groups TSV file; dump files
      are placed in its directory.
    psm_fname: path of the PSM TSV file.
    modifications_fname: optional path handed to
      ``read_modification_dict``; when ``None`` an empty modification table
      is used.

  Returns:
    dict mapping seqid -> protein dict with the keys 'description',
    'sequence', 'other_sequences', 'attr' and 'sources'; located peptides
    are appended to ``protein['sources'][0]['peptides']``.
  """
  is_debug = logger.root.level <= logging.DEBUG
  dump_dir = os.path.dirname(protein_groups_fname)

  if modifications_fname is not None:
    modification_table = read_modification_dict(modifications_fname)
  else:
    modification_table = {}

  proteins = {}
  dict_dump_writer = DictListWriter(
      is_debug, os.path.join(dump_dir, 'protein_groups.dump'))
  # try/finally so the dump writer is closed even when a row is malformed.
  try:
    for i_group, protein_group in enumerate(
        read_tsv_iter(protein_groups_fname)):
      descriptions = protein_group['protein description'].split(' / ')
      coverage_str = str(protein_group['protein sequence coverage (%)'])
      # The coverage cell may hold several values separated by ';' or '/';
      # get_first presumably returns the first one (TODO confirm).
      if ';' in coverage_str:
        coverage = float(get_first(coverage_str, ';'))
      else:
        coverage = float(get_first(coverage_str, '/'))
      seqs = protein_group['protein sequence'].split('/')
      seqids = [desc.split()[0] for desc in descriptions]
      for seqid in seqids:
        if seqid in proteins:
          # The seqid needs a %s placeholder: an extra positional argument
          # without one makes logging fail to format the record and the
          # seqid is never shown.
          logger.warning(
              "Different protein groups claim same first seqid %s", seqid)
      protein = {
        'description': descriptions[0],
        'sequence': seqs[0],
        'other_sequences': seqs[1:],
        'attr': {
          'coverage': parse.round_decimal(coverage, 4),
          'morpheus-score': parse.round_decimal(
              protein_group['summed morpheus score'], 4),
          'i_group': i_group,
          'other_seqids': seqids[1:],
          'seqid': seqids[0],
        },
        'sources': [{'peptides': []}]
      }
      proteins[seqids[0]] = protein
      dict_dump_writer.dump_dict(protein_group)
  finally:
    dict_dump_writer.close()

  # Index every protein under its primary seqid and all of its aliases.
  protein_by_seqid = {}
  for seqid in proteins:
    protein = proteins[seqid]
    protein_by_seqid[seqid] = protein
    for alt_seqid in protein['attr']['other_seqids']:
      protein_by_seqid[alt_seqid] = protein

  dict_dump_writer = DictListWriter(
      is_debug, os.path.join(dump_dir, 'peptides.dump'))
  n_peptide = 0
  n_peptide_matched = 0
  try:
    for src_peptide in read_tsv_iter(psm_fname):
      dict_dump_writer.dump_dict(src_peptide)
      descriptions = src_peptide['protein description'].split(' / ')
      peptide_seqids = [d.split()[0] for d in descriptions]
      protein = None
      for peptide_seqid in peptide_seqids:
        if peptide_seqid in protein_by_seqid:
          protein = protein_by_seqid[peptide_seqid]
          break
      n_peptide += 1
      if protein is None:
        continue
      n_peptide_matched += 1
      sequence = protein['sequence']
      extracted_peptide_sequence, modifications = parse_peptide(
          src_peptide['peptide sequence'],
          modification_table)
      peptide_sequence = src_peptide['base peptide sequence']
      if extracted_peptide_sequence != peptide_sequence:
        logger.warning(
            "Peptide sequences don't match: %s %s %s",
            src_peptide['peptide sequence'],
            extracted_peptide_sequence,
            peptide_sequence)
      # A PSM that cannot be located in the protein is skipped.
      i = sequence.find(peptide_sequence)
      if i < 0:
        logger.warning(
            '%s not found in %s', peptide_sequence, protein['attr']['seqid'])
        continue
      q_value = float(src_peptide['q-value (%)'])

      # Column names differ between input files; take whichever exists,
      # falling back to ''.
      if 'scan number' in src_peptide:
        scan_id = src_peptide['scan number']
      elif 'spectrum number' in src_peptide:
        scan_id = src_peptide['spectrum number']
      else:
        scan_id = ''
      if 'retention time (min)' in src_peptide:
        retention_time = parse.round_decimal(
            src_peptide['retention time (min)'], 4)
      elif 'retention time (minutes)' in src_peptide:
        retention_time = parse.round_decimal(
            src_peptide['retention time (minutes)'], 4)
      else:
        retention_time = ''

      peptide = {
        'sequence': peptide_sequence,
        'attr': {
          'scan_id': scan_id,
          'retention_time': retention_time,
          'morpheus_score': parse.round_decimal(
              src_peptide['morpheus score'], 4),
          'mass': parse.round_decimal(src_peptide['precursor mass (da)'], 4),
          'mass_diff': parse.round_decimal(
              src_peptide['precursor mass error (da)'], 4),
          'm/z': parse.round_decimal(src_peptide['precursor m/z'], 4),
          'source': parse.basename(src_peptide['filename']),
          'q_value': q_value,
        },
        # q-value is a percentage; lower q-value -> higher intensity.
        'intensity': 1.0 - q_value/100.0,
        'i': i,
      }
      if modifications:
        for modification in modifications:
          modification['mass'] = parse.round_decimal(modification['mass'], 4)
        peptide['attr']['modifications'] = modifications

      protein['sources'][0]['peptides'].append(peptide)
  finally:
    dict_dump_writer.close()

  if is_debug:
    dump = os.path.join(dump_dir, 'proteins.dump')
    logger.debug('Dumping proteins data structure to ' + dump)
    parse.save_data_dict(proteins, dump)

  logger.info("Assigned {}/{} of PSMs.tsv to protein_groups.tsv".format(
      n_peptide_matched, n_peptide))

  return proteins