def parse_morpheus_psm(fname):
    """Summarise the precursor mass error of a Morpheus PSM table.

    Rows are read with ``datafile.read_csv`` and kept only when the
    'Target?' column is "true" (case-insensitive), 'Q-Value (%)' is not
    above 1 and the absolute 'Precursor Mass Error (ppm)' is not above 20.

    Returns an empty dict when no row survives the filters. Otherwise
    returns a dict with two entries, both prefixed by the cell type taken
    from the file name ('Hela' or 'Ecoli'): the first value returned by
    ``datafile.get_avg_std`` and, under the ' Upper' suffix, the sum of
    both returned values (presumably mean and mean + standard deviation).
    """
    # NOTE(review): when fname contains neither 'hela' nor 'ecoli' the
    # cell type stays unbound and building the result raises
    # UnboundLocalError; this is left exactly as it was.
    if 'hela' in fname:
        celltype = 'Hela'
    elif 'ecoli' in fname:
        celltype = 'Ecoli'

    key = 'Precursor Mass Error (ppm)'
    errors = []
    for row in datafile.read_csv(fname):
        is_target = row['Target?'].lower() == "true"
        if not is_target or float(row['Q-Value (%)']) > 1:
            continue
        ppm = float(row[key])
        if abs(ppm) > 20:
            continue
        errors.append(ppm)

    if not errors:
        return {}

    label = celltype + ' ' + key
    avg, std = datafile.get_avg_std(errors)
    return {label: avg, label + ' Upper': avg + std}
def read_peptides_from_csv(fname):
    """Read peptide records from a CSV file, shortest sequence first.

    Every row that has a 'Sequence' column becomes a dict holding the
    sequence, empty 'overlaps', 'subsets', 'supersets' and 'groups'
    lists, and an 'attr' dict with all remaining columns of the row.
    Rows without a 'Sequence' column are skipped. Duplicate sequences
    are NOT removed; each row yields its own record.

    The records are ordered by sequence length and then alphabetically
    by sequence, and each one gets its final position as 'i_peptide'.

    Returns the sorted list of peptide dicts.
    """
    peptides = []
    for entry in datafile.read_csv(fname):
        if 'Sequence' not in entry:
            continue
        peptides.append({
            'sequence': entry['Sequence'],
            'overlaps': [],
            'subsets': [],
            'supersets': [],
            'groups': [],
            'attr': {k: v for k, v in entry.items() if k != 'Sequence'},
        })

    # Sort on an explicit key: a bare list.sort() would compare the dicts
    # themselves, which raises TypeError on Python 3 and yields an
    # arbitrary order on Python 2.
    peptides.sort(key=lambda p: (len(p['sequence']), p['sequence']))

    for i, peptide in enumerate(peptides):
        peptide['i_peptide'] = i
    return peptides
def add_modifications(modifications_tsv, extra_modifications_tsv):
    """Append modifications missing from one TSV file to another.

    The first tab-separated field of every non-header line in
    ``extra_modifications_tsv`` is compared with the 'Description'
    values already present in ``modifications_tsv``; lines whose first
    field is not yet known are appended verbatim to ``modifications_tsv``.

    Args:
        modifications_tsv: path of the file that is extended in place.
        extra_modifications_tsv: path of the file providing candidates.
    """
    known = set(
        mod['Description'] for mod in datafile.read_csv(modifications_tsv))
    logger.debug("Adding %s to %s" % (extra_modifications_tsv, modifications_tsv))

    # Plain 'r' instead of 'Ur': the 'U' flag was removed in Python 3.11
    # and universal newlines are the text-mode default on Python 3.
    with open(extra_modifications_tsv, 'r') as f:
        next(f, None)  # skip the header line
        new_mod_lines = [
            line for line in f if line.split('\t')[0] not in known]

    with open(modifications_tsv, 'a') as f:
        f.writelines(new_mod_lines)
def read_freqs_csv(freqs_csv):
    """Load per-position amino-acid frequencies from a CSV file.

    Each row names a residue in its "Amino Acid" column; the row called
    "Total" is ignored. Every column whose name starts with "P" is read
    as a position (the integer following the "P") and its value is
    divided by 100.

    Returns a nested dict ``{position: {amino_acid: fraction}}``.
    """
    freqs = {}
    for row in datafile.read_csv(freqs_csv):
        residue = row["Amino Acid"]
        if residue == "Total":
            continue
        for column, percent in row.items():
            if not column.startswith("P"):
                continue
            position = int(column[1:])
            freqs.setdefault(position, {})[residue] = float(percent)/100.
    return freqs
def parse_morpheus_summary(fname):
    """Parse the first row of a Morpheus summary table into a dict.

    Only the first row yielded by ``datafile.read_csv`` is used. Each
    named column is stored under '<celltype> <column>' with its value
    converted by ``datafile.parse_string``; a column with an empty name
    is stored under that empty name with the value None. The cell type
    ('Hela' or 'Ecoli') is taken from the file name.

    Returns the resulting dict, which is empty when the file has no rows.
    """
    # NOTE(review): a file name containing neither 'hela' nor 'ecoli'
    # leaves the cell type unbound, so the first named column raises
    # UnboundLocalError; this is left exactly as it was.
    if 'hela' in fname:
        celltype = 'Hela'
    elif 'ecoli' in fname:
        celltype = 'Ecoli'

    result = {}
    first_row = next(iter(datafile.read_csv(fname)), None)
    if first_row is None:
        return result

    for column, raw in first_row.items():
        if column:
            result[celltype + ' ' + column] = datafile.parse_string(raw)
        else:
            result[column] = None
    return result
def read_peptides_from_csv(fname):
    """Collect peptides and their protein ids from a CSV file.

    Rows whose 'Uni' column is 0 are skipped. The peptide sequence is
    the part of the 'Sequence' column between the first and second '.',
    with one leading 'n' stripped if present. The protein ids come from
    the comma-separated 'protein' column.

    Returns a dict keyed by sequence; each value is a dict with the
    'sequence' and its 'seqids'. 'seqids' is the list from the first
    row seen for that sequence and becomes a set once a later row
    contributes ids that were not present yet.
    """
    peptides = {}
    for entry in datafile.read_csv(fname):
        if int(entry['Uni']) == 0:
            continue
        seqids = entry['protein'].split(',')
        sequence = entry['Sequence'].split('.')[1]
        if sequence.startswith('n'):
            sequence = sequence[1:]

        if sequence not in peptides:
            peptides[sequence] = {
                'sequence': sequence,
                'seqids': seqids,
            }
            continue

        # Merge only when this row brings new ids. Comparing the set to
        # the raw list (as before) was always unequal and replaced the
        # stored ids on every repeated sequence.
        peptide = peptides[sequence]
        merged = set(peptide['seqids'])
        if not merged.issuperset(seqids):
            merged.update(seqids)
            peptide['seqids'] = merged
    return peptides
def get_peptide_by_seq(fnames):
    """Aggregate per-peptide match statistics over several CSV files.

    For every row of every file, the 'Fraction of Intensity Matching'
    and 'Ratio of Matching Products' values are collected under the
    row's 'Peptide Sequence'. 'n_log' counts the files in which the
    sequence occurs at least once.

    After all files are read, each peptide record additionally receives
    'ion_avg'/'ion_stdv' and 'intensity_avg'/'intensity_stdv' (the pairs
    returned by ``datafile.get_avg_std``) and 'n_psm', the number of
    rows collected for that sequence.

    Returns a dict mapping each peptide sequence to its record.
    """
    peptide_by_seq = {}
    for fname in fnames:
        date = datafile.get_date_from_fname(fname)
        logger.debug("Reading peptides in %s" % date)

        seen_in_file = set()
        for row in datafile.read_csv(fname):
            seq = row['Peptide Sequence']
            record = peptide_by_seq.get(seq)
            if record is None:
                record = {
                    'sequence': seq,
                    'base_sequence': row['Base Peptide Sequence'],
                    'intensity_fractions': [],
                    'ion_fractions': [],
                    'n_log': 0
                }
                peptide_by_seq[seq] = record
            intensity = float(row['Fraction of Intensity Matching'])
            record['intensity_fractions'].append(intensity)
            ion = float(row['Ratio of Matching Products'])
            record['ion_fractions'].append(ion)
            seen_in_file.add(seq)

        # One count per file, however many rows the sequence had in it.
        for seq in seen_in_file:
            peptide_by_seq[seq]['n_log'] += 1

    for record in peptide_by_seq.values():
        ion_stats = datafile.get_avg_std(record['ion_fractions'])
        intensity_stats = datafile.get_avg_std(record['intensity_fractions'])
        record['ion_avg'], record['ion_stdv'] = ion_stats
        record['intensity_avg'], record['intensity_stdv'] = intensity_stats
        record['n_psm'] = len(record['intensity_fractions'])

    return peptide_by_seq
def read_peptides_from_csv(fname):
    """Read peptide records from a CSV file, shortest sequence first.

    Every row with a non-empty 'Sequence' column becomes a dict holding
    the sequence, the 'Modifications' column, the 'Protein ID' column
    (stored as 'protein') and empty 'overlaps' and 'groups' lists.
    Duplicate sequences are NOT removed; each row yields its own record.

    The records are ordered by sequence length, then by sequence,
    modifications and protein, and each one gets its final position as
    'i_peptide'.

    Returns the sorted list of peptide dicts.
    """
    peptides = []
    for entry in datafile.read_csv(fname):
        sequence = entry['Sequence']
        if not sequence:
            continue
        peptides.append({
            'sequence': sequence,
            'modifications': entry['Modifications'],
            'protein': entry['Protein ID'],
            'overlaps': [],
            'groups': [],
        })

    # Sort on an explicit key: a bare list.sort() would compare the dicts
    # themselves, which raises TypeError on Python 3. Missing text fields
    # are treated as '' so that None never meets a string in a comparison.
    peptides.sort(key=lambda p: (
        len(p['sequence']),
        p['sequence'],
        p['modifications'] or '',
        p['protein'] or '',
    ))

    for i, peptide in enumerate(peptides):
        peptide['i_peptide'] = i
    return peptides
('DR', 'green', 2, list(range(1, 5)))] source_sets = [] for cell_type, color, i_source, repeats in exp_sets: for i_repeat in repeats: if skip == '_skip_dr4': if cell_type == 'DR' and i_repeat == 4: continue source_sets.append( ('%s%d' % (cell_type, i_repeat), color, i_source)) print source_sets protein = {} for exp, color, i_source in source_sets: fname = 'Data_PeptideOverlay/%s_motif.csv' % exp for entry in datafile.read_csv(fname): seqid = uniprot.parse_fasta_header(entry['Accessions'])[0] if seqid not in protein: protein[seqid] = default_protein() protein[seqid]['description'] = entry['Names'] source = protein[seqid]['sources'][i_source] source['color'] = color source['peptides'].append(entry['sequence']) seqids, fasta = uniprot.read_fasta('../db/uniprot_sprot.fasta') for seqid in protein: sequence = fasta[seqid]['sequence'] protein[seqid]['sequence'] = sequence protein[seqid]['length'] = len(sequence) for source in protein[seqid]['sources']:
def print_modifications(modifications_tsv):
    """Log a modifications file's relative path and its descriptions.

    Emits two debug messages through ``logger``: the path of
    ``modifications_tsv`` relative to the current directory, and the
    list of 'Description' values of all rows read from it.
    """
    rel_path = os.path.relpath(modifications_tsv)
    logger.debug('modifications.tsv: ' + rel_path)

    mods = []
    for row in datafile.read_csv(modifications_tsv):
        mods.append(row['Description'])
    logger.debug('modifications: %s' % mods)