def get_proteins_and_sources(protxml, pepxmls, peptide_error=0.01, protein_error=0.01):
    """
    Build the proteins dictionary from a protXML file plus its pepXML
    files, filter proteins by the protein-level error rate, and return
    (proteins, source_names).
    """
    logger.info('Loading protxml ' + protxml)
    proteins, protein_probs = generate_proteins_from_protxml(protxml)

    # Each pepXML load appends its source names to this list.
    source_names = []
    for pepxml in pepxmls:
        logger.info('Loading pepxml ' + pepxml)
        load_pepxml(
            proteins,
            pepxml,
            error_cutoff=peptide_error,
            source_names=source_names)

    count_tpp_indep_spectra(proteins)

    # Translate the requested error rate into a probability cutoff.
    prob_cutoff = error_to_probability(protein_probs, protein_error)
    filter_proteins(proteins, prob_cutoff)

    is_debug = logger.root.level <= logging.DEBUG
    if is_debug:
        dump_fname = protxml.replace('prot.xml', 'proteins.dump')
        logger.debug('Dumping protein data structure to ' + dump_fname)
        parse.save_data_dict(proteins, dump_fname)

    return proteins, source_names
def get_proteins_and_sources(
        protxml, pepxmls, peptide_error=0.01, protein_error=0.01):
    """
    Load a protXML file and its associated pepXML files.

    Returns (proteins, source_names): proteins maps seqid to a protein
    entry; source_names lists the pepXML sources in load order.
    """
    logger.info('Loading protxml ' + protxml)
    proteins, protein_probs = generate_proteins_from_protxml(protxml)

    source_names = []
    # Merge peptides from every pepXML into the same proteins dict.
    for fname in pepxmls:
        logger.info('Loading pepxml ' + fname)
        load_pepxml(
            proteins, fname,
            error_cutoff=peptide_error,
            source_names=source_names)

    count_tpp_indep_spectra(proteins)

    # Keep only proteins above the probability implied by protein_error.
    filter_proteins(
        proteins, error_to_probability(protein_probs, protein_error))

    if logger.root.level <= logging.DEBUG:
        dump = protxml.replace('prot.xml', 'proteins.dump')
        logger.debug('Dumping protein data structure to ' + dump)
        parse.save_data_dict(proteins, dump)

    return proteins, source_names
def get_proteins_and_sources(protxml, pepxmls, peptide_error=0.01, protein_error=0.01, good_expect=1E-8, cutoff_expect=1E-2):
    """
    Returns a proteins dictionary and list of source names.

    Peptides are loaded from the pepXML files with an expect-based
    intensity scale (good_expect..cutoff_expect); proteins whose
    probability falls below the cutoff implied by protein_error are
    removed.
    """
    logger.info('Loading protxml ' + protxml)
    proteins, protein_probs = make_proteins_from_protxml(protxml)
    source_names = []
    for pepxml in pepxmls:
        logger.info('Loading pepxml ' + pepxml)
        load_pepxml_into_proteins(
            proteins,
            pepxml,
            error_cutoff=peptide_error,
            source_names=source_names,
            good_expect=good_expect,
            cutoff_expect=cutoff_expect)
    count_independent_spectra(proteins)
    prob_cutoff = error_to_probability(protein_probs, protein_error)
    # BUGFIX: iterate over a snapshot of the keys -- deleting from a
    # dict while iterating its live .keys() view raises RuntimeError
    # on Python 3.
    for seqid in list(proteins.keys()):
        if proteins[seqid]['attr']['probability'] < prob_cutoff:
            del proteins[seqid]
    if logger.root.level <= logging.DEBUG:
        dump = protxml.replace('prot.xml', 'proteins.dump')
        logger.debug('Dumping protein data structure to ' + dump)
        parse.save_data_dict(proteins, dump)
    return proteins, source_names
""" scans = { int(scan['id']):scan for scan in xtandem_scans } for seqid in proteins.keys(): source = proteins[seqid]['sources'][i_source] scan_ids = [] for peptide in source['peptides']: scan_id = peptide['attr']['scan_id'] sequence = peptide['sequence'] modifications = peptide['attr']['modifications'] if scan_id not in scans: logger.warning("Couldn't find xtadnem entry for scan {} in pepxml".format(scan_id)) continue scan = scans[scan_id] x_vals = map(float, scan['masses'].split()) y_vals = map(float, scan['intensities'].split()) ions = [(x, y) for x, y in zip(x_vals, y_vals)] ions.sort(key=lambda i:-i[1]) peptide['spectrum'] = ions[:n_peak] if __name__ == "__main__": scans, fastas = read('../example/xtandem/Seq23282_E1O1.tandem') parse.save_data_dict(scans, '../example/xtandem/scans.dump') parse.save_data_dict(fastas, '../example/xtandem/fastas.dump')
ions = [(x, y) for x, y in zip(x_vals, y_vals)] ions.sort(key=lambda i:-i[1]) peptide['spectrum'] = ions[:n_peak] proteins_module.load_fastas_into_proteins(proteins, fastas) for seqid in proteins.keys(): protein = proteins[seqid] if 'sequence' not in protein: logger.debug("Protein {} not found in x!tandem".format(seqid)) del proteins[seqid] continue n_peptide = sum([len(source['peptides']) for source in protein['sources']]) if n_peptide == 0: del proteins[seqid] logger.debug("No peptide-spectra matches found in {}".format(seqid)) continue # proteins_module.calculate_peptide_positions(proteins) if __name__ == "__main__": scans, fastas = read('../example/xtandem/Seq23282_E1O1.tandem') parse.save_data_dict(scans, '../example/xtandem/scans.dump') parse.save_data_dict(fastas, '../example/xtandem/fastas.dump')
def get_proteins(protein_groups_fname, psm_fname, modifications_fname=None):
    """
    Parse Morpheus protein_groups.tsv and PSMs.tsv into a proteins dict.

    Returns a dict mapping the first seqid of each protein group to a
    protein entry; matched PSMs are appended to the single source's
    'peptides' list.  modifications_fname optionally supplies the table
    used to decode modification annotations in the peptide sequences.
    """
    is_debug = logger.root.level <= logging.DEBUG
    dump_dir = os.path.dirname(protein_groups_fname)

    if modifications_fname is not None:
        modification_table = read_modification_dict(modifications_fname)
    else:
        modification_table = {}

    proteins = {}
    dict_dump_writer = DictListWriter(
        is_debug, os.path.join(dump_dir, 'protein_groups.dump'))
    for i_group, protein_group in enumerate(
            read_tsv_iter(protein_groups_fname)):
        descriptions = protein_group['protein description'].split(' / ')
        coverage_str = str(protein_group['protein sequence coverage (%)'])
        # Coverage may be a ';'- or '/'-separated list; take the first.
        if ';' in coverage_str:
            coverage = float(get_first(coverage_str, ';'))
        else:
            coverage = float(get_first(coverage_str, '/'))
        seqs = protein_group['protein sequence'].split('/')
        seqids = [desc.split()[0] for desc in descriptions]
        for seqid in seqids:
            if seqid in proteins:
                # BUGFIX: the message had no %s placeholder, so the
                # extra argument made the logging call itself error.
                logger.warning(
                    "Different protein groups claim same first seqid %s",
                    seqid)
        protein = {
            'description': descriptions[0],
            'sequence': seqs[0],
            'other_sequences': seqs[1:],
            'attr': {
                'coverage': parse.round_decimal(coverage, 4),
                'morpheus-score': parse.round_decimal(
                    protein_group['summed morpheus score'], 4),
                'i_group': i_group,
                'other_seqids': seqids[1:],
                'seqid': seqids[0],
            },
            'sources': [{'peptides': []}],
        }
        proteins[seqids[0]] = protein
        dict_dump_writer.dump_dict(protein_group)
    dict_dump_writer.close()

    # Index proteins by every seqid, primary and alternates alike.
    protein_by_seqid = {}
    for seqid in proteins:
        protein = proteins[seqid]
        protein_by_seqid[seqid] = protein
        for alt_seqid in protein['attr']['other_seqids']:
            protein_by_seqid[alt_seqid] = protein

    dict_dump_writer = DictListWriter(
        is_debug, os.path.join(dump_dir, 'peptides.dump'))
    n_peptide = 0
    n_peptide_matched = 0
    for src_peptide in read_tsv_iter(psm_fname):
        dict_dump_writer.dump_dict(src_peptide)
        descriptions = src_peptide['protein description'].split(' / ')
        peptide_seqids = [d.split()[0] for d in descriptions]
        protein = None
        for peptide_seqid in peptide_seqids:
            if peptide_seqid in protein_by_seqid:
                protein = protein_by_seqid[peptide_seqid]
                break
        n_peptide += 1
        if protein is None:
            continue
        n_peptide_matched += 1
        sequence = protein['sequence']
        extracted_peptide_sequence, modifications = parse_peptide(
            src_peptide['peptide sequence'], modification_table)
        peptide_sequence = src_peptide['base peptide sequence']
        if extracted_peptide_sequence != peptide_sequence:
            logger.warning(
                "Peptide sequences don't match: " +
                src_peptide['peptide sequence'] + " " +
                extracted_peptide_sequence + " " +
                peptide_sequence)
        i = sequence.find(peptide_sequence)
        if i < 0:
            logger.warning(
                peptide_sequence + ' not found in ' +
                protein['attr']['seqid'])
            continue
        q_value = float(src_peptide['q-value (%)'])
        # Column names vary across Morpheus versions.
        if 'scan number' in src_peptide:
            scan_id = src_peptide['scan number']
        elif 'spectrum number' in src_peptide:
            scan_id = src_peptide['spectrum number']
        else:
            scan_id = ''
        if 'retention time (min)' in src_peptide:
            time = parse.round_decimal(
                src_peptide['retention time (min)'], 4)
        elif 'retention time (minutes)' in src_peptide:
            time = parse.round_decimal(
                src_peptide['retention time (minutes)'], 4)
        else:
            time = ''
        peptide = {
            'sequence': peptide_sequence,
            'attr': {
                'scan_id': scan_id,
                'retention_time': time,
                'morpheus_score': parse.round_decimal(
                    src_peptide['morpheus score'], 4),
                'mass': parse.round_decimal(
                    src_peptide['precursor mass (da)'], 4),
                'mass_diff': parse.round_decimal(
                    src_peptide['precursor mass error (da)'], 4),
                'm/z': parse.round_decimal(
                    src_peptide['precursor m/z'], 4),
                'source': parse.basename(src_peptide['filename']),
                'q_value': q_value,
            },
            # Map the q-value percentage onto a 0..1 display intensity.
            'intensity': 1.0 - q_value / 100.0,
            'i': i,
        }
        if modifications:
            for modification in modifications:
                modification['mass'] = parse.round_decimal(
                    modification['mass'], 4)
            peptide['attr']['modifications'] = modifications
        protein['sources'][0]['peptides'].append(peptide)
    dict_dump_writer.close()

    dump = os.path.join(dump_dir, 'proteins.dump')
    if logger.root.level <= logging.DEBUG:
        logger.debug('Dumping proteins data structure to ' + dump)
        parse.save_data_dict(proteins, dump)
    logger.info("Assigned {}/{} of PSMs.tsv to protein_groups.tsv".format(
        n_peptide_matched, n_peptide))
    return proteins
def get_proteins_and_sources(
        protxml, pepxml, n_peptide_cutoff=1, is_skip_no_unique=True,
        errors=(0.01,)):
    """
    Load proteins from protXML/pepXML and filter them by error rates.

    Returns (proteins, source_names).  Basic structure of proteins in
    YAML format:

    "sample_seqid":
      sequence: "AAAAAAAAAA"
      description: "sample protein"
      attr:
        param: value
      sources:
        - peptides:
          - sequence: "AAA"
            i: 0
            j: 3
            attr:
              is_unique: True
              param: value

    errors is an iterable of error rates; the largest one sets the
    hard filtering cutoff, the rest build the peptide mask.
    """
    # BUGFIX: the default was a mutable list ([0.01]); a tuple default
    # avoids the shared mutable-default pitfall with identical behavior.
    max_error = max(errors)
    protein_groups, protein_probs = read_protxml(protxml)
    proteins = make_proteins_from_protxml(protein_groups)
    dump_dir = os.path.dirname(protxml)
    if logger.root.level <= logging.DEBUG:
        dump = os.path.join(dump_dir, 'protxml.dump')
        logger.debug('Dumping protxml data structure to ' + dump)
        parse.save_data_dict(protein_groups, dump)
        dump = os.path.join(dump_dir, 'proterror.dump')
        logger.debug('Dumping protein error distribution to ' + dump)
        parse.save_data_dict(protein_probs, dump)
    scans_by_sources, peptide_probs = read_pepxml(pepxml)
    if logger.root.level <= logging.DEBUG:
        dump = os.path.join(dump_dir, 'pepxml.dump')
        logger.debug('Dumping pepxml data structure to ' + dump)
        parse.save_data_dict(scans_by_sources, dump)
        dump = os.path.join(dump_dir, 'peperror.dump')
        logger.debug('Dumping peptide error distribution to ' + dump)
        parse.save_data_dict(peptide_probs, dump)
    source_names = [scans['filename'] for scans in scans_by_sources]
    load_pepxml(proteins, scans_by_sources)
    probability = error_to_probability(peptide_probs, max_error)
    filter_peptides(proteins, probability)
    probabilities = [error_to_probability(peptide_probs, e) for e in errors]
    make_mask(proteins, probabilities)
    probability = error_to_probability(protein_probs, max_error)
    filter_proteins(proteins, probability)
    parse_proteins.determine_unique_peptides(proteins)
    parse_proteins.count_peptides(
        proteins, n_peptide_cutoff, is_skip_no_unique)
    if logger.root.level <= logging.DEBUG:
        dump = os.path.join(dump_dir, 'proteins.dump')
        logger.debug('Dumping protein data structure to ' + dump)
        parse.save_data_dict(proteins, dump)
    return proteins, source_names
for source in protein['sources']: for peptide in source['peptides']: if 'identity' in peptide['attr']: match_id = "%.2f%.2f%.2f%s" % \ (peptide['attr']['score'], peptide['attr']['identity'], peptide['attr']['homology'], peptide['sequence']) peptide_by_match_id[match_id] = peptide scans, mascot_proteins = read_mascot_dat(mascot_dat) n_match = 0 for scan in scans.values(): for match in scan['matches']: match_id = "%.2f%.2f%.2f%s" % \ (match['score'], scan['identity'], scan['homology'], match['sequence']) if match_id in peptide_by_match_id: n_match += 1 peptide = peptide_by_match_id[match_id] peptide['spectrum'] = split_mascot_ion_str(scan['Ions1']) logger.info('%s: matched %d pepXML to %d mascot PSM' % \ (mascot_dat, len(peptide_by_match_id), n_match)) if __name__ == '__main__': scans, proteins = read_mascot_dat('../example/mascot/F022045.dat') save_data_dict(scans, '../example/mascot/scans.dump') save_data_dict(proteins,'../example/mascot/proteins.dump')
peptide_intensities1.extend(intensities[experiment1]) elif experiment2 in intensities: ratio = 0.0 intensity = -2 * max_ratio std = 0 peptide_intensities2.extend(intensities[experiment2]) else: # neither experiment1 or experiment2 found in experiment column ratio = None for peptide in peptides: peptide['attr']['ratio'] = ratio peptide['intensity'] = intensity peptide['attr']['ratio_var'] = std sum2 = numpy.sum(peptide_intensities2) sum1 = numpy.sum(peptide_intensities1) if sum2 > 0.0: group_ratio = sum1 / sum2 else: group_ratio = float('inf') protein['attr']['ratio'] = group_ratio if __name__ == '__main__': peptides, scans, protein_groups, evidence = read( '../example/maxquant/silac') parse.save_data_dict(peptides, '../example/maxquant/peptides.dump') parse.save_data_dict(scans, '../example/maxquant/scans.dump') parse.save_data_dict(protein_groups, '../example/maxquant/protein_groups.dump') parse.save_data_dict(evidence, '../example/maxquant/evidence.dump')
section = name[1:-1] if section == "summary": process_line = \ lambda l: process_summary(l, scans) if section == "peptides": process_line = \ lambda l: process_matches(l, scans, max_peptide_rank) if section == "proteins": process_line = \ lambda l: process_proteins(l, proteins) if "query" in section: scan_id = int(section[5:]) process_line = \ lambda l: process_query(l, scan_id, scans) continue if process_line: process_line(l[:-1]) return scans, proteins def split_mascot_ion_str(s): "Parses an Ion entry string into a dictionary" pairs = [piece.split(':') for piece in s.split(',')] return [[float(x), float(y)] for x, y in pairs] if __name__ == '__main__': scans, proteins = read_mascot_dat('../example/mascot/F022045.dat') save_data_dict(scans, '../example/mascot/scans.dump') save_data_dict(proteins, '../example/mascot/proteins.dump')
def get_proteins_and_sources(
        protein_groups_fname, psm_fname, modifications_fname=None,
        q_good=0.0, q_cutoff=10):
    """
    Parse Morpheus protein_groups.tsv and PSMs.tsv.

    Returns (proteins, sources): proteins maps seqid to a protein entry
    whose per-source 'matches' lists hold the assigned PSMs; sources
    lists the PSM filenames in first-seen order.  PSMs with q-value
    above q_cutoff are dropped; q_good..q_cutoff scales the display
    intensity via parse_proteins.calc_intensity.
    """
    is_debug = logger.root.level <= logging.DEBUG
    dump_dir = os.path.dirname(protein_groups_fname)

    modification_table = {}
    if modifications_fname:
        modification_table = read_modification_dict(modifications_fname)

    proteins = {}
    dict_dump_writer = parse.DictListWriter(
        is_debug, os.path.join(dump_dir, 'protein_groups.dump'))
    for i_group, protein_group in enumerate(
            parse.read_tsv(protein_groups_fname)):
        protein = make_protein(i_group, protein_group)
        proteins[protein['attr']['seqid']] = protein
        dict_dump_writer.dump_dict(protein_group)
    dict_dump_writer.close()

    # Index proteins by every seqid, including alternates.
    protein_by_seqid = {}
    for seqid in proteins:
        protein = proteins[seqid]
        protein_by_seqid[seqid] = protein
        for alt_seqid in protein['attr']['other_seqids']:
            protein_by_seqid[alt_seqid] = protein

    dict_dump_writer = parse.DictListWriter(
        is_debug, os.path.join(dump_dir, 'peptides.dump'))
    n_match = 0
    n_match_assigned = 0
    # FIX: removed the unused local i_source_from_source = {} -- the
    # source index is maintained inside get_i_source via `sources`.
    sources = []
    for psm in parse.read_tsv(psm_fname):
        dict_dump_writer.dump_dict(psm)
        match = make_match(psm, modification_table)
        match['intensity'] = parse_proteins.calc_intensity(
            match['attr']['q_value'], q_good, q_cutoff)
        if match['attr']['q_value'] > q_cutoff:
            continue
        peptide_sequence = match['sequence']
        n_match += 1
        protein = None
        descriptions = psm['protein description'].split(' / ')
        peptide_seqids = [d.split()[0] for d in descriptions]
        for peptide_seqid in peptide_seqids:
            if peptide_seqid in protein_by_seqid:
                test_protein = protein_by_seqid[peptide_seqid]
                sequence = protein_by_seqid[peptide_seqid]['sequence']
                if peptide_sequence in sequence:
                    protein = test_protein
                    break
        else:
            # for/else: no candidate protein's sequence contained the
            # peptide, so this PSM cannot be assigned.
            logger.debug("Couldn't find protein for %s" % (peptide_sequence))
            continue
        match['i'] = sequence.find(peptide_sequence)
        n_match_assigned += 1
        i_source = get_i_source(proteins, sources, psm['filename'])
        protein['sources'][i_source]['matches'].append(match)
    dict_dump_writer.close()

    dump = os.path.join(dump_dir, 'proteins.dump')
    if logger.root.level <= logging.DEBUG:
        logger.debug('Dumping proteins data structure to ' + dump)
        parse.save_data_dict(proteins, dump)
    logger.info("Assigned {}/{} of PSMs.tsv to protein_groups.tsv".format(
        n_match_assigned, n_match))
    return proteins, sources
def get_proteins_and_sources(in_dir, is_leu_ile_isomeric=False):
    """
    Build (proteins, sources) from a MaxQuant output directory.

    Reads the peptides/scans/protein-group/evidence tables, indexes
    them by id, and attaches one labeled-spectrum peptide entry per
    (scan, protein group) pair under the matching raw-file source.
    """
    peptide_list, scan_list, protein_group_list, evidence_list = \
        read(in_dir)

    # Index each table by its integer id column.
    peptides = {int(p['id']): p for p in peptide_list}
    scans = {int(s['id']): s for s in scan_list}
    protein_groups = {int(p['id']): p for p in protein_group_list}
    evidence_dict = {int(e['id']): e for e in evidence_list}

    parse.save_data_dict(peptides, in_dir + '/peptides.dump')
    parse.save_data_dict(scans, in_dir + '/scans.dump')
    parse.save_data_dict(protein_groups, in_dir + '/protein_groups.dump')
    parse.save_data_dict(evidence_dict, in_dir + '/evidence.dump')

    # Sources are the sorted raw-file names from the evidence table.
    raw_files = {e['raw file'] for e in evidence_dict.values()}
    sources = [str(raw_file) for raw_file in sorted(raw_files)]
    i_sources = {source: k for k, source in enumerate(sources)}

    proteins = {}
    protein_by_group_id = {}
    for group_id, protein_group in protein_groups.items():
        protein = {
            'description': '',
            'attr': {
                'group_id': group_id,
                'other_seqids': [],
            },
            'sources': [{'peptides': []} for _ in i_sources],
        }
        transfer_attrs(protein_group, protein['attr'], protein_parse_list)
        seqids = parse.splitter(protein_group['protein ids'])
        proteins[seqids[0]] = protein
        protein['attr']['seqid'] = seqids[0]
        protein['attr']['other_seqids'] = seqids[1:]
        protein_by_group_id[group_id] = protein

    print("Matching sequences and scan in proteins")
    n_scan = len(scans)
    for i_scan, (scan_id, scan) in enumerate(scans.items(), start=1):
        if i_scan % 5000 == 0:
            print("{}/{} scans processed".format(i_scan, n_scan))
        evidence = evidence_dict[int(scan['evidence id'])]
        peptide = peptides[int(scan['peptide id'])]
        for group_id in parse.splitter(str(scan['protein group ids'])):
            new_peptide = {
                'sequence': scan['sequence'],
                'spectrum': get_labeled_spectrum(scan),
                'attr': {
                    'modifications': [],
                    'mq_scan_id': scan_id,
                    'is_unique': peptide['unique (groups)'] == 'yes',
                }
            }
            transfer_attrs(scan, new_peptide['attr'], scan_parse_list)
            transfer_attrs(evidence, new_peptide['attr'], evidence_parse_list)
            transfer_attrs(peptide, new_peptide['attr'], peptide_parse_list)
            change_key(new_peptide['attr'], 'scan number', 'scan_id')
            change_key(new_peptide['attr'], 'retention time', 'retention_time')
            protein = protein_by_group_id[int(group_id)]
            i_source = i_sources[evidence['raw file']]
            protein['sources'][i_source]['peptides'].append(new_peptide)

    parse_proteins.count_peptides(proteins)
    return proteins, sources
peptide_intensities1.extend(intensities[experiment1]) elif experiment2 in intensities: ratio = 0.0 intensity = -2*max_ratio std = 0 peptide_intensities2.extend(intensities[experiment2]) else: # neither experiment1 or experiment2 found in experiment column ratio = None for peptide in peptides: peptide['attr']['ratio'] = ratio peptide['intensity'] = intensity peptide['attr']['ratio_var'] = std sum2 = numpy.sum(peptide_intensities2) sum1 = numpy.sum(peptide_intensities1) if sum2 > 0.0: group_ratio = sum1/sum2 else: group_ratio = float('inf') protein['attr']['ratio'] = group_ratio if __name__ == '__main__': peptides, scans, protein_groups, evidence = read('../example/maxquant/silac') parse.save_data_dict(peptides, '../example/maxquant/peptides.dump') parse.save_data_dict(scans, '../example/maxquant/scans.dump') parse.save_data_dict(protein_groups, '../example/maxquant/protein_groups.dump') parse.save_data_dict(evidence, '../example/maxquant/evidence.dump')
def get_proteins(protein_groups_fname, psm_fname, modifications_fname=None):
    """
    Parse Morpheus protein_groups.tsv and PSMs.tsv into a proteins dict.

    Returns a dict mapping each protein group's first seqid to a
    protein entry; matched PSMs are appended to the single source's
    'peptides' list.  modifications_fname optionally supplies the table
    used to decode modification annotations in the peptide sequences.
    """
    dump_dir = os.path.dirname(protein_groups_fname)
    if modifications_fname is not None:
        modification_table = read_modification_dict(modifications_fname)
    else:
        modification_table = {}
    peptides = parse.read_tsv(psm_fname)
    protein_groups = parse.read_tsv(protein_groups_fname)
    if logger.root.level <= logging.DEBUG:
        dump = os.path.join(dump_dir, 'peptides.dump')
        logger.debug('Dumping peptides data structure to ' + dump)
        parse.save_data_dict(peptides, dump)
        dump = os.path.join(dump_dir, 'protein_groups.dump')
        logger.debug('Dumping protein_groups data structure to ' + dump)
        parse.save_data_dict(protein_groups, dump)
    proteins = {}
    for i_group, protein_group in enumerate(protein_groups):
        descriptions = protein_group['protein description'].split(' / ')
        seqids = [desc.split()[0] for desc in descriptions]
        for seqid in seqids:
            if seqid in proteins:
                # BUGFIX: the message had no %s placeholder, so the
                # extra argument made the logging call itself error.
                logger.warning(
                    "Different protein groups claim same first seqid %s",
                    seqid)
        protein = {
            'description': descriptions[0],
            'sequence': protein_group['protein sequence'],
            'attr': {
                'coverage': protein_group['protein sequence coverage (%)'],
                'morpheus-score': parse.round_decimal(
                    protein_group['summed morpheus score'], 4),
                'i_group': i_group,
                'other_seqids': seqids[1:],
                'seqid': seqids[0],
            },
            'sources': [{'peptides': []}],
        }
        proteins[seqids[0]] = protein
    # Index proteins by every seqid, primary and alternates alike.
    protein_by_seqid = {}
    for seqid in proteins:
        protein = proteins[seqid]
        protein_by_seqid[seqid] = protein
        for alt_seqid in protein['attr']['other_seqids']:
            protein_by_seqid[alt_seqid] = protein
    unmatched_peptides = []
    n_peptide_matched = 0
    for src_peptide in peptides:
        descriptions = src_peptide['protein description'].split(' / ')
        peptide_seqids = [d.split()[0] for d in descriptions]
        protein = None
        for peptide_seqid in peptide_seqids:
            if peptide_seqid in protein_by_seqid:
                protein = protein_by_seqid[peptide_seqid]
                break
        if protein is None:
            unmatched_peptides.append(src_peptide)
            continue
        n_peptide_matched += 1
        sequence = protein['sequence']
        # NOTE: the parsed sequence is immediately replaced by the
        # 'base peptide sequence' column, as in the original code.
        peptide_sequence, modifications = parse_peptide(
            src_peptide['peptide sequence'], modification_table)
        peptide_sequence = src_peptide['base peptide sequence']
        # BUGFIX: .index raised ValueError when the peptide was not a
        # substring of the protein sequence; skip such PSMs instead.
        i = sequence.find(peptide_sequence)
        if i < 0:
            logger.warning(
                peptide_sequence + ' not found in ' +
                protein['attr']['seqid'])
            continue
        peptide = {
            'sequence': peptide_sequence,
            'attr': {
                'scan_id': src_peptide['scan number'],
                'retention_time': parse.round_decimal(
                    src_peptide['retention time (min)'], 4),
                'morpheus_score': parse.round_decimal(
                    src_peptide['morpheus score'], 4),
                'mass': parse.round_decimal(
                    src_peptide['precursor mass (da)'], 4),
                'mass_diff': parse.round_decimal(
                    src_peptide['precursor mass error (da)'], 4),
                'm/z': parse.round_decimal(
                    src_peptide['precursor m/z'], 4),
                'source': parse.basename(src_peptide['filename']),
            },
            # Score normalized by peptide length as display intensity.
            'intensity': src_peptide['morpheus score'] / len(peptide_sequence),
            'i': i,
        }
        if modifications:
            for modification in modifications:
                modification['mass'] = parse.round_decimal(
                    modification['mass'], 4)
            peptide['attr']['modifications'] = modifications
        protein['sources'][0]['peptides'].append(peptide)
    dump = os.path.join(dump_dir, 'proteins.dump')
    if logger.root.level <= logging.DEBUG:
        logger.debug('Dumping proteins data structure to ' + dump)
        parse.save_data_dict(proteins, dump)
    # BUGFIX: the denominator was len(unmatched_peptides); report
    # matched out of the *total* number of PSMs instead.
    logger.info("Assigned {}/{} of PSMs.tsv to protein_groups.tsv".format(
        n_peptide_matched, n_peptide_matched + len(unmatched_peptides)))
    return proteins
def get_proteins_and_sources(
        in_dir,
        is_leu_ile_isomeric=False,
):
    """
    Convert a MaxQuant output directory into (proteins, sources).

    The peptides/scans/protein-group/evidence tables are read and keyed
    by id; every (scan, protein group) pairing contributes one peptide
    entry, filed under the source matching the scan's raw file.
    """
    peptide_list, scan_list, protein_group_list, evidence_list = \
        read(in_dir)

    by_id = lambda rows: {int(row['id']): row for row in rows}
    peptides = by_id(peptide_list)
    scans = by_id(scan_list)
    protein_groups = by_id(protein_group_list)
    evidence_dict = by_id(evidence_list)

    parse.save_data_dict(peptides, in_dir + '/peptides.dump')
    parse.save_data_dict(scans, in_dir + '/scans.dump')
    parse.save_data_dict(protein_groups, in_dir + '/protein_groups.dump')
    parse.save_data_dict(evidence_dict, in_dir + '/evidence.dump')

    # One source per distinct raw file, in sorted order.
    sources = [
        str(s)
        for s in sorted(set(e['raw file'] for e in evidence_dict.values()))]
    i_sources = {source: k for k, source in enumerate(sources)}

    proteins = {}
    protein_by_group_id = {}
    for group_id, protein_group in protein_groups.items():
        protein = {
            'description': '',
            'attr': {
                'group_id': group_id,
                'other_seqids': [],
            },
            'sources': [{'peptides': []} for _ in sources],
        }
        transfer_attrs(protein_group, protein['attr'], protein_parse_list)
        seqids = parse.splitter(protein_group['protein ids'])
        proteins[seqids[0]] = protein
        protein['attr']['seqid'] = seqids[0]
        protein['attr']['other_seqids'] = seqids[1:]
        protein_by_group_id[group_id] = protein

    print("Matching sequences and scan in proteins")
    i_scan = 0
    n_scan = len(scans)
    for scan_id, scan in scans.items():
        i_scan += 1
        if i_scan % 5000 == 0:
            print("{}/{} scans processed".format(i_scan, n_scan))
        evidence = evidence_dict[int(scan['evidence id'])]
        peptide = peptides[int(scan['peptide id'])]
        for group_id in parse.splitter(str(scan['protein group ids'])):
            new_peptide = {
                'sequence': scan['sequence'],
                'spectrum': get_labeled_spectrum(scan),
                'attr': {
                    'modifications': [],
                    'mq_scan_id': scan_id,
                    'is_unique': peptide['unique (groups)'] == 'yes',
                }
            }
            transfer_attrs(scan, new_peptide['attr'], scan_parse_list)
            transfer_attrs(evidence, new_peptide['attr'], evidence_parse_list)
            transfer_attrs(peptide, new_peptide['attr'], peptide_parse_list)
            change_key(new_peptide['attr'], 'scan number', 'scan_id')
            change_key(new_peptide['attr'], 'retention time', 'retention_time')
            target = protein_by_group_id[int(group_id)]
            i_source = i_sources[evidence['raw file']]
            target['sources'][i_source]['peptides'].append(new_peptide)

    parse_proteins.count_peptides(proteins)
    return proteins, sources
def get_proteins_and_sources(protxml, pepxml, n_peptide_cutoff=1, is_skip_no_unique=True, errors=(0.01,)):
    """
    Load proteins from protXML/pepXML and filter them by error rates.

    Returns (proteins, source_names).  Basic structure of proteins in
    YAML format:

    "sample_seqid":
      sequence: "AAAAAAAAAA"
      description: "sample protein"
      attr:
        param: value
      sources:
        - peptides:
          - sequence: "AAA"
            i: 0
            j: 3
            attr:
              is_unique: True
              param: value

    errors is an iterable of error rates; the largest one sets the
    hard filtering cutoff, the rest build the peptide mask.
    """
    # BUGFIX: the default was a mutable list ([0.01]); a tuple default
    # avoids the shared mutable-default pitfall with identical behavior.
    max_error = max(errors)
    protein_groups, protein_probs = read_protxml(protxml)
    proteins = make_proteins_from_protxml(protein_groups)
    dump_dir = os.path.dirname(protxml)
    if logger.root.level <= logging.DEBUG:
        dump = os.path.join(dump_dir, 'protxml.dump')
        logger.debug('Dumping protxml data structure to ' + dump)
        parse.save_data_dict(protein_groups, dump)
        dump = os.path.join(dump_dir, 'proterror.dump')
        logger.debug('Dumping protein error distribution to ' + dump)
        parse.save_data_dict(protein_probs, dump)
    scans_by_sources, peptide_probs = read_pepxml(pepxml)
    if logger.root.level <= logging.DEBUG:
        dump = os.path.join(dump_dir, 'pepxml.dump')
        logger.debug('Dumping pepxml data structure to ' + dump)
        parse.save_data_dict(scans_by_sources, dump)
        dump = os.path.join(dump_dir, 'peperror.dump')
        logger.debug('Dumping peptide error distribution to ' + dump)
        parse.save_data_dict(peptide_probs, dump)
    source_names = [scans['filename'] for scans in scans_by_sources]
    load_pepxml(proteins, scans_by_sources)
    probability = error_to_probability(peptide_probs, max_error)
    filter_peptides(proteins, probability)
    probabilities = [error_to_probability(peptide_probs, e) for e in errors]
    make_mask(proteins, probabilities)
    probability = error_to_probability(protein_probs, max_error)
    filter_proteins(proteins, probability)
    parse_proteins.determine_unique_peptides(proteins)
    parse_proteins.count_peptides(proteins, n_peptide_cutoff, is_skip_no_unique)
    if logger.root.level <= logging.DEBUG:
        dump = os.path.join(dump_dir, 'proteins.dump')
        logger.debug('Dumping protein data structure to ' + dump)
        parse.save_data_dict(proteins, dump)
    return proteins, source_names
def get_proteins(protein_groups_fname, psm_fname, modifications_fname=None):
    """
    Parse Morpheus protein_groups.tsv and PSMs.tsv into a proteins dict.

    Returns a dict mapping the first seqid of each protein group to a
    protein entry; matched PSMs are appended to the single source's
    'peptides' list.  modifications_fname optionally supplies the table
    used to decode modification annotations in the peptide sequences.
    """
    is_debug = logger.root.level <= logging.DEBUG
    dump_dir = os.path.dirname(protein_groups_fname)
    if modifications_fname is not None:
        modification_table = read_modification_dict(modifications_fname)
    else:
        modification_table = {}
    proteins = {}
    dict_dump_writer = DictListWriter(
        is_debug, os.path.join(dump_dir, 'protein_groups.dump'))
    for i_group, protein_group in enumerate(
            read_tsv_iter(protein_groups_fname)):
        descriptions = protein_group['protein description'].split(' / ')
        coverage_str = str(protein_group['protein sequence coverage (%)'])
        # Coverage may be a ';'- or '/'-separated list; take the first.
        if ';' in coverage_str:
            coverage = float(get_first(coverage_str, ';'))
        else:
            coverage = float(get_first(coverage_str, '/'))
        seqs = protein_group['protein sequence'].split('/')
        seqids = [desc.split()[0] for desc in descriptions]
        for seqid in seqids:
            if seqid in proteins:
                # BUGFIX: the message had no %s placeholder, so the
                # extra argument made the logging call itself error.
                logger.warning(
                    "Different protein groups claim same first seqid %s",
                    seqid)
        protein = {
            'description': descriptions[0],
            'sequence': seqs[0],
            'other_sequences': seqs[1:],
            'attr': {
                'coverage': parse.round_decimal(coverage, 4),
                'morpheus-score': parse.round_decimal(
                    protein_group['summed morpheus score'], 4),
                'i_group': i_group,
                'other_seqids': seqids[1:],
                'seqid': seqids[0],
            },
            'sources': [{'peptides': []}],
        }
        proteins[seqids[0]] = protein
        dict_dump_writer.dump_dict(protein_group)
    dict_dump_writer.close()
    # Index proteins by every seqid, primary and alternates alike.
    protein_by_seqid = {}
    for seqid in proteins:
        protein = proteins[seqid]
        protein_by_seqid[seqid] = protein
        for alt_seqid in protein['attr']['other_seqids']:
            protein_by_seqid[alt_seqid] = protein
    dict_dump_writer = DictListWriter(
        is_debug, os.path.join(dump_dir, 'peptides.dump'))
    n_peptide = 0
    n_peptide_matched = 0
    for src_peptide in read_tsv_iter(psm_fname):
        dict_dump_writer.dump_dict(src_peptide)
        descriptions = src_peptide['protein description'].split(' / ')
        peptide_seqids = [d.split()[0] for d in descriptions]
        protein = None
        for peptide_seqid in peptide_seqids:
            if peptide_seqid in protein_by_seqid:
                protein = protein_by_seqid[peptide_seqid]
                break
        n_peptide += 1
        if protein is None:
            continue
        n_peptide_matched += 1
        sequence = protein['sequence']
        extracted_peptide_sequence, modifications = parse_peptide(
            src_peptide['peptide sequence'], modification_table)
        peptide_sequence = src_peptide['base peptide sequence']
        if extracted_peptide_sequence != peptide_sequence:
            logger.warning(
                "Peptide sequences don't match: " +
                src_peptide['peptide sequence'] + " " +
                extracted_peptide_sequence + " " +
                peptide_sequence)
        i = sequence.find(peptide_sequence)
        if i < 0:
            logger.warning(
                peptide_sequence + ' not found in ' +
                protein['attr']['seqid'])
            continue
        q_value = float(src_peptide['q-value (%)'])
        # Column names vary across Morpheus versions.
        if 'scan number' in src_peptide:
            scan_id = src_peptide['scan number']
        elif 'spectrum number' in src_peptide:
            scan_id = src_peptide['spectrum number']
        else:
            scan_id = ''
        if 'retention time (min)' in src_peptide:
            time = parse.round_decimal(
                src_peptide['retention time (min)'], 4)
        elif 'retention time (minutes)' in src_peptide:
            time = parse.round_decimal(
                src_peptide['retention time (minutes)'], 4)
        else:
            time = ''
        peptide = {
            'sequence': peptide_sequence,
            'attr': {
                'scan_id': scan_id,
                'retention_time': time,
                'morpheus_score': parse.round_decimal(
                    src_peptide['morpheus score'], 4),
                'mass': parse.round_decimal(
                    src_peptide['precursor mass (da)'], 4),
                'mass_diff': parse.round_decimal(
                    src_peptide['precursor mass error (da)'], 4),
                'm/z': parse.round_decimal(
                    src_peptide['precursor m/z'], 4),
                'source': parse.basename(src_peptide['filename']),
                'q_value': q_value,
            },
            # Map the q-value percentage onto a 0..1 display intensity.
            'intensity': 1.0 - q_value / 100.0,
            'i': i,
        }
        if modifications:
            for modification in modifications:
                modification['mass'] = parse.round_decimal(
                    modification['mass'], 4)
            peptide['attr']['modifications'] = modifications
        protein['sources'][0]['peptides'].append(peptide)
    dict_dump_writer.close()
    dump = os.path.join(dump_dir, 'proteins.dump')
    if logger.root.level <= logging.DEBUG:
        logger.debug('Dumping proteins data structure to ' + dump)
        parse.save_data_dict(proteins, dump)
    logger.info("Assigned {}/{} of PSMs.tsv to protein_groups.tsv".format(
        n_peptide_matched, n_peptide))
    return proteins