def parse_morpheus_psm(fname):
    """
    Summarise the precursor mass error of the accepted rows in a PSM table.

    The table is read with datafile.read_csv. A row is used only when its
    'Target?' column is "true" (case-insensitive), its 'Q-Value (%)' is
    not greater than 1, and the absolute value of its
    'Precursor Mass Error (ppm)' is not greater than 20.

    The label prefixed to the result keys is derived from the file name:
    'Hela' if it contains 'hela', otherwise 'Ecoli' if it contains 'ecoli'.

    Returns an empty dict when no row is accepted. Otherwise returns a dict
    with two entries built from the pair returned by datafile.get_avg_std
    (unpacked here as avg, std):
        '<label> Precursor Mass Error (ppm)'        -> avg
        '<label> Precursor Mass Error (ppm) Upper'  -> avg + std

    Raises ValueError when rows were accepted but the file name contains
    neither 'hela' nor 'ecoli' (previously this surfaced as an
    UnboundLocalError on the unset label).
    """
    if 'hela' in fname:
        celltype = 'Hela'
    elif 'ecoli' in fname:
        celltype = 'Ecoli'
    else:
        # Checked only once a label is actually needed, so an unrecognised
        # file with no accepted rows still yields {} as before.
        celltype = None

    key = 'Precursor Mass Error (ppm)'
    values = []
    for entry in datafile.read_csv(fname):
        if entry['Target?'].lower() != "true":
            continue
        if float(entry['Q-Value (%)']) > 1:
            continue
        value = float(entry[key])
        if abs(value) > 20:
            continue
        values.append(value)

    if not values:
        return {}
    if celltype is None:
        raise ValueError(
            "Cannot determine cell type from file name %r: "
            "expected it to contain 'hela' or 'ecoli'" % (fname,))

    label = celltype + ' ' + key
    avg, std = datafile.get_avg_std(values)
    return {
        label: avg,
        label + ' Upper': avg + std,
    }
Exemple #2
0
def read_peptides_from_csv(fname):
    """
    Read peptide records from a CSV file, shortest sequence first.

    Rows are read with datafile.read_csv; rows that have no 'Sequence'
    column are skipped. Each remaining row becomes a dict with:
        'sequence'   - the value of the 'Sequence' column
        'attr'       - every other column of the row, keyed by column name
        'overlaps', 'subsets', 'supersets', 'groups' - fresh empty lists
        'i_peptide'  - the record's index in the returned list

    Records are ordered by sequence length, then by sequence text; records
    with identical sequences keep their order from the file. Repeated
    sequences are NOT removed.

    Returns the list of peptide dicts.
    """
    peptides = []
    for entry in datafile.read_csv(fname):
        if 'Sequence' not in entry:
            continue
        peptides.append({
            'sequence': entry['Sequence'],
            'overlaps': [],
            'subsets': [],
            'supersets': [],
            'groups': [],
            'attr': {key: entry[key] for key in entry if key != 'Sequence'},
        })
    # Sort on an explicit key: a bare sort() would compare the dicts
    # themselves, which raises TypeError on Python 3.
    peptides.sort(key=lambda p: (len(p['sequence']), p['sequence']))
    for i, peptide in enumerate(peptides):
        peptide['i_peptide'] = i
    return peptides
Exemple #3
0
def add_modifications(modifications_tsv, extra_modifications_tsv):
    """
    Append to modifications_tsv the rows of extra_modifications_tsv whose
    modification is not already listed there.

    The existing modifications are the 'Description' values of the rows
    that datafile.read_csv yields for modifications_tsv. The extra file is
    read as plain text: its first line is skipped (treated as a header)
    and the text before the first tab on each following line is compared
    against the existing descriptions. Lines that do not match are appended
    unchanged to modifications_tsv.

    Returns None.
    """
    # A set gives constant-time membership tests for every extra line.
    known_mods = set(
        mod['Description'] for mod in datafile.read_csv(modifications_tsv))
    logger.debug("Adding %s to %s" % (extra_modifications_tsv, modifications_tsv))

    # Plain text mode: the old 'Ur' mode string is rejected by Python 3.11+,
    # and text mode on Python 3 already translates all newline styles.
    with open(extra_modifications_tsv, 'r') as f:
        next(f, None)  # skip the header line (no-op on an empty file)
        new_mod_lines = [
            line for line in f
            if line.split('\t')[0] not in known_mods
        ]

    with open(modifications_tsv, 'a') as f:
        f.writelines(new_mod_lines)
Exemple #4
0
def read_freqs_csv(freqs_csv):
    """
    Load a table of per-column amino-acid fractions from a CSV file.

    Rows come from datafile.read_csv. The row whose 'Amino Acid' value is
    "Total" is ignored. For every other row, each column whose name starts
    with "P" is used: the rest of the column name is parsed as an integer
    index and the cell value is divided by 100.

    Returns a dict of dicts: result[index][amino_acid] -> fraction.
    """
    table = {}
    for row in datafile.read_csv(freqs_csv):
        residue = row["Amino Acid"]
        if residue != "Total":
            for column, percent in row.items():
                if not column.startswith("P"):
                    continue
                position = int(column[1:])
                table.setdefault(position, {})[residue] = float(percent)/100.
    return table
def parse_morpheus_summary(fname):
    """
    Read the first row of a summary table into a dict of labelled values.

    Only the first row yielded by datafile.read_csv is used. Each column
    with a non-empty name is stored under '<label> <column name>' with its
    value converted by datafile.parse_string; a column with an empty name
    is stored under that empty name with the value None.

    The label is derived from the file name: 'Hela' if it contains 'hela',
    otherwise 'Ecoli' if it contains 'ecoli'.

    Returns the resulting dict (empty when the file yields no rows).

    Raises ValueError when a named column has to be stored but the file
    name contains neither 'hela' nor 'ecoli' (previously this surfaced as
    an UnboundLocalError on the unset label).
    """
    if 'hela' in fname:
        celltype = 'Hela'
    elif 'ecoli' in fname:
        celltype = 'Ecoli'
    else:
        # Checked only when a label is actually needed, see below.
        celltype = None

    result = {}
    for entry in datafile.read_csv(fname):
        for key, val in entry.items():
            if not key:
                result[key] = None
                continue
            if celltype is None:
                raise ValueError(
                    "Cannot determine cell type from file name %r: "
                    "expected it to contain 'hela' or 'ecoli'" % (fname,))
            result[celltype + ' ' + key] = datafile.parse_string(val)
        # Only the first row is of interest.
        break
    return result
Exemple #6
0
def read_peptides_from_csv(fname):
  """
  Collect peptides and the ids listed for them from a CSV file.

  Rows come from datafile.read_csv. Rows whose 'Uni' column parses to 0
  are skipped. The peptide sequence is the part of the 'Sequence' column
  between its first and second '.'; a single leading 'n' is dropped
  (presumably a terminal marker - confirm against the data source). The
  ids are the comma-separated items of the 'protein' column.

  Returns a dict keyed by sequence; each value is a dict with
    'sequence' - the sequence itself
    'seqids'   - list of ids; ids from later rows with the same sequence
                 are appended if not already present

  'seqids' is always a list. Previously it silently became a set for
  repeated sequences, because a set was compared against a list and so
  the merge branch always ran.
  """
  peptides = {}
  for entry in datafile.read_csv(fname):
    if int(entry['Uni']) == 0:
      continue
    seqids = entry['protein'].split(',')
    sequence = entry['Sequence'].split('.')[1]
    if sequence.startswith('n'):
      sequence = sequence[1:]
    if sequence not in peptides:
      peptides[sequence] = {
        'sequence': sequence,
        'seqids': seqids,
      }
      continue
    # Merge in place, keeping first-seen order and the list type.
    known_seqids = peptides[sequence]['seqids']
    for seqid in seqids:
      if seqid not in known_seqids:
        known_seqids.append(seqid)
  return peptides
def read_peptides_from_csv(fname):
    """
    Collect peptides and the ids listed for them from a CSV file.

    Rows come from datafile.read_csv. Rows whose 'Uni' column parses to 0
    are skipped. The peptide sequence is the part of the 'Sequence' column
    between its first and second '.'; a single leading 'n' is dropped
    (presumably a terminal marker - confirm against the data source). The
    ids are the comma-separated items of the 'protein' column.

    Returns a dict keyed by sequence; each value is a dict with
        'sequence' - the sequence itself
        'seqids'   - list of ids; ids from later rows with the same
                     sequence are appended if not already present

    'seqids' is always a list. Previously it silently became a set for
    repeated sequences, because a set was compared against a list and so
    the merge branch always ran.
    """
    peptides = {}
    for entry in datafile.read_csv(fname):
        if int(entry['Uni']) == 0:
            continue
        seqids = entry['protein'].split(',')
        sequence = entry['Sequence'].split('.')[1]
        if sequence.startswith('n'):
            sequence = sequence[1:]
        if sequence not in peptides:
            peptides[sequence] = {
                'sequence': sequence,
                'seqids': seqids,
            }
            continue
        # Merge in place, keeping first-seen order and the list type.
        known_seqids = peptides[sequence]['seqids']
        for seqid in seqids:
            if seqid not in known_seqids:
                known_seqids.append(seqid)
    return peptides
Exemple #8
0
def get_peptide_by_seq(fnames):
    """
    Pool the rows of several CSV files into one record per peptide sequence.

    Every file in fnames is read with datafile.read_csv. Rows are grouped
    by their 'Peptide Sequence' column; each group's record holds:
        'sequence'            - the peptide sequence
        'base_sequence'       - 'Base Peptide Sequence' of the first row seen
        'intensity_fractions' - 'Fraction of Intensity Matching' per row
        'ion_fractions'       - 'Ratio of Matching Products' per row
        'n_log'               - number of files containing the sequence
        'n_psm'               - number of rows with the sequence
        'ion_avg', 'ion_stdv', 'intensity_avg', 'intensity_stdv'
                              - pairs returned by datafile.get_avg_std for
                                the two fraction lists

    Returns a dict mapping sequence -> record.
    """
    peptide_by_seq = {}
    for fname in fnames:
        date = datafile.get_date_from_fname(fname)
        logger.debug("Reading peptides in %s" % date)

        # Sequences already counted towards n_log for this file.
        counted = set()
        for row in datafile.read_csv(fname):
            seq = row['Peptide Sequence']
            record = peptide_by_seq.get(seq)
            if record is None:
                record = {
                    'sequence': seq,
                    'base_sequence': row['Base Peptide Sequence'],
                    'intensity_fractions': [],
                    'ion_fractions': [],
                    'n_log': 0
                }
                peptide_by_seq[seq] = record
            intensity = float(row['Fraction of Intensity Matching'])
            record['intensity_fractions'].append(intensity)
            ions = float(row['Ratio of Matching Products'])
            record['ion_fractions'].append(ions)
            if seq not in counted:
                counted.add(seq)
                record['n_log'] += 1

    for record in peptide_by_seq.values():
        ion_stats = datafile.get_avg_std(record['ion_fractions'])
        intensity_stats = datafile.get_avg_std(record['intensity_fractions'])
        record['ion_avg'], record['ion_stdv'] = ion_stats
        record['intensity_avg'], record['intensity_stdv'] = intensity_stats
        record['n_psm'] = len(record['intensity_fractions'])

    return peptide_by_seq
Exemple #9
0
def read_peptides_from_csv(fname):
    """
    Read peptide records from a CSV file, shortest sequence first.

    Rows are read with datafile.read_csv; rows with an empty 'Sequence'
    value are skipped. Each remaining row becomes a dict with:
        'sequence'      - the 'Sequence' column
        'modifications' - the 'Modifications' column
        'protein'       - the 'Protein ID' column
        'overlaps', 'groups' - fresh empty lists
        'i_peptide'     - the record's index in the returned list

    Records are ordered by sequence length, then by sequence text; records
    with identical sequences keep their order from the file. Repeated
    sequences are NOT removed.

    Returns the list of peptide dicts.
    """
    peptides = []
    for entry in datafile.read_csv(fname):
        sequence = entry['Sequence']
        if not sequence:
            continue
        peptides.append({
            'sequence': sequence,
            'modifications': entry['Modifications'],
            'protein': entry['Protein ID'],
            'overlaps': [],
            'groups': [],
        })
    # Sort on an explicit key: a bare sort() would compare the dicts
    # themselves, which raises TypeError on Python 3.
    peptides.sort(key=lambda p: (len(p['sequence']), p['sequence']))
    for i, peptide in enumerate(peptides):
        peptide['i_peptide'] = i
    return peptides
Exemple #10
0
def read_peptides_from_csv(fname):
    """
    Read peptide records from a CSV file, shortest sequence first.

    Rows are read with datafile.read_csv; rows with an empty 'Sequence'
    value are skipped. Each remaining row becomes a dict with:
        'sequence'      - the 'Sequence' column
        'modifications' - the 'Modifications' column
        'protein'       - the 'Protein ID' column
        'overlaps', 'groups' - fresh empty lists
        'i_peptide'     - the record's index in the returned list

    Records are ordered by sequence length, then by sequence text; records
    with identical sequences keep their order from the file. Repeated
    sequences are NOT removed.

    Returns the list of peptide dicts.
    """
    peptides = []
    for entry in datafile.read_csv(fname):
        sequence = entry['Sequence']
        if not sequence:
            continue
        peptides.append({
            'sequence': sequence,
            'modifications': entry['Modifications'],
            'protein': entry['Protein ID'],
            'overlaps': [],
            'groups': [],
        })
    # Sort on an explicit key: a bare sort() would compare the dicts
    # themselves, which raises TypeError on Python 3.
    peptides.sort(key=lambda p: (len(p['sequence']), p['sequence']))
    for i, peptide in enumerate(peptides):
        peptide['i_peptide'] = i
    return peptides
                ('DR', 'green', 2, list(range(1, 5)))]

    source_sets = []
    for cell_type, color, i_source, repeats in exp_sets:
        for i_repeat in repeats:
            if skip == '_skip_dr4':
                if cell_type == 'DR' and i_repeat == 4:
                    continue
            source_sets.append(
                ('%s%d' % (cell_type, i_repeat), color, i_source))
    print source_sets

    protein = {}
    for exp, color, i_source in source_sets:
        fname = 'Data_PeptideOverlay/%s_motif.csv' % exp
        for entry in datafile.read_csv(fname):
            seqid = uniprot.parse_fasta_header(entry['Accessions'])[0]
            if seqid not in protein:
                protein[seqid] = default_protein()
                protein[seqid]['description'] = entry['Names']
            source = protein[seqid]['sources'][i_source]
            source['color'] = color
            source['peptides'].append(entry['sequence'])

    seqids, fasta = uniprot.read_fasta('../db/uniprot_sprot.fasta')

    for seqid in protein:
        sequence = fasta[seqid]['sequence']
        protein[seqid]['sequence'] = sequence
        protein[seqid]['length'] = len(sequence)
        for source in protein[seqid]['sources']:
Exemple #12
0
def print_modifications(modifications_tsv):
    """
    Log, at debug level, the relative path of a modifications table and
    the 'Description' value of every row datafile.read_csv yields for it.
    """
    rel_path = os.path.relpath(modifications_tsv)
    logger.debug('modifications.tsv: ' + rel_path)
    descriptions = []
    for row in datafile.read_csv(modifications_tsv):
        descriptions.append(row['Description'])
    logger.debug('modifications: %s' % descriptions)