def fragments(peptide, types=('b', 'y'), max_charge=1):
    '''
    Return the theoretical fragments of a peptide, grouped by ion type.
    Modeled from : https://pyteomics.readthedocs.io/en/latest/examples/example_msms.html

    :param peptide: (str) peptide sequence
    :param types: (tuple) types of fragments desired
    :param max_charge: (int) maximum charge state of fragment ions
    :return: (dict) maps every requested ion type to the list of
        ``mass.fast_mass`` results, ordered by cleavage position and, for
        each position, by charge from 1 to ``max_charge``
    '''
    d = {}
    for ion_type in types:
        # Ion types starting with a, b or c are computed from the prefix
        # peptide[:i]; every other ion type from the suffix peptide[i:].
        n_terminal = ion_type[0] in 'abc'
        series = []
        # The cleavage index starts at 1, so no empty-prefix guard is needed.
        for i in range(1, len(peptide)):
            piece = peptide[:i] if n_terminal else peptide[i:]
            for charge in range(1, max_charge + 1):
                series.append(
                    mass.fast_mass(piece, ion_type=ion_type, charge=charge))
        d[ion_type] = series
    return d
Ejemplo n.º 2
0
def getIonMasses(peptide, types=('b', 'y'), maxcharge=2):
    """
    Generate m/z values for fragments of types `types` and of charges from
    1 to `maxcharge`.

    Results are grouped under keys of the form
    ``<ion type><loss tag><charge>``, where the loss tag is one of
    ``''``, ``'n'`` or ``'o'``. The nine keys that were always present
    (b1, b2, bn1, bn2, bo1, bo2, y1, y2, yn1) are still pre-seeded; any other
    requested combination (e.g. ``yo1`` or ``yn2``) gets its own list instead
    of raising ``KeyError``.

    Cleavage positions run from 1 to ``len(peptide) - 2``.

    :param peptide: (str) peptide sequence
    :param types: (tuple) ion types to generate
    :param maxcharge: (int) highest charge state to generate
    :return: (dict) key -> list of ``massC.fast_mass`` results
    """
    ions = {
        name: []
        for name in ("b1", "b2", "bn1", "bn2", "bo1", "bo2", "y1", "y2", "yn1")
    }
    losses = ['', 'n', 'o']
    for ion_type in types:
        for charge in range(1, maxcharge + 1):
            for lossT in losses:
                key = ion_type + lossT + str(charge)
                # NOTE(review): the value returned by lossConvert has never
                # been applied to the computed masses, so the 'n'/'o' series
                # currently equal the loss-free series. The call is kept
                # as-is; confirm the intended use of its result.
                lossConvert(lossT, charge)
                series = ions.setdefault(key, [])
                for i in range(1, len(peptide) - 1):
                    # a/b/c-type ions use the prefix, all others the suffix.
                    if ion_type[0] in 'abc':
                        fragment = peptide[:i]
                    else:
                        fragment = peptide[i:]
                    series.append(
                        massC.fast_mass(fragment,
                                        ion_type=ion_type,
                                        charge=charge))
    return ions
Ejemplo n.º 3
0
 def _fragments(self, peptide, types=("b", "y"), maxcharge=1):
     for i in range(1, len(peptide) - 1):
         for ion_type in types:
             for charge in range(1, maxcharge + 1):
                 if ion_type[0] in "abc":
                     yield mass.fast_mass(
                         peptide[:i], ion_type=ion_type, charge=charge
                     )
                 else:
                     yield mass.fast_mass(
                         peptide[i:], ion_type=ion_type, charge=charge
                     )
def create_theoretical_peak_map(peptide, ion_type_list, charge_set=[1]):
    """
    Map fragment-ion annotations to theoretical m/z values for a peptide
    whose modifications are written inline as signed mass shifts.

    Annotations have the form ``<ion type>:<ion number>:<charge>``. Ion types
    whose first character is a, b or c are computed from prefixes of the
    sequence; all other ion types from suffixes. An ion type ending in
    ``-iso`` is computed as the ion type without that suffix, plus
    1.007276 / charge.

    :param peptide: peptide string; its alphabetic characters are taken as
        the residues, and the per-residue tokens come from
        ``get_peptide_modification_list_inspect_format``
    :param ion_type_list: iterable of ion type strings
    :param charge_set: iterable of charge states (only iterated, never
        modified)
    :return: dict mapping annotation string to m/z
    """
    amino_acid_list = get_peptide_modification_list_inspect_format(peptide)
    only_letters_list = [letter for letter in peptide if letter.isalpha()]

    # Signed decimal numbers. The decimal point is escaped so that it cannot
    # match the sign of a directly following modification ("+16-17" is two
    # tokens, not one unparsable token).
    mod_pattern = re.compile(r'[+-](?:\d+\.?\d*|\.\d+)')

    only_mods_mass_add_list = []
    for amino_acid in amino_acid_list:
        mod_mass_to_add = 0.0
        for mod_tokenized in mod_pattern.findall(
                re.sub("[A-Z]", "", amino_acid)):
            mod_mass_to_add += float(mod_tokenized)
        only_mods_mass_add_list.append(mod_mass_to_add)

    residue_count = len(amino_acid_list)
    ion_to_mass_mapping = {}
    for charge in charge_set:
        for ion_type in ion_type_list:
            iso_topic_added_mass = 0.0
            real_ion_type = ion_type
            if ion_type[-4:] == "-iso":
                iso_topic_added_mass = 1.007276 / float(charge)
                real_ion_type = ion_type[:-4]

            n_terminal = real_ion_type[0] in "abc"
            for i in range(residue_count):
                if n_terminal:
                    ion_number = i + 1
                    letters = only_letters_list[:i + 1]
                    mod_masses = only_mods_mass_add_list[:i + 1]
                else:
                    ion_number = residue_count - i
                    letters = only_letters_list[i:]
                    mod_masses = only_mods_mass_add_list[i:]

                peak_annotation = ion_type + ":" + str(
                    ion_number) + ":" + str(charge)
                # Modification masses are mass shifts, so they are divided by
                # the charge before being added to the m/z value.
                peak_mass = mass.fast_mass(
                    "".join(letters),
                    ion_type=real_ion_type,
                    charge=charge) + sum(mod_masses) / (
                        float(charge)) + iso_topic_added_mass
                ion_to_mass_mapping[peak_annotation] = peak_mass

    return ion_to_mass_mapping
Ejemplo n.º 5
0
def fragments(peptide, types=('b', 'y'), maxcharge=1):
    """
    Generate m/z values for fragments of types `types` and of charges from
    1 to `maxcharge`.

    Cleavage positions run from 1 to ``len(peptide) - 2``. Ion types starting
    with a, b or c use the prefix up to the cleavage position; all other ion
    types use the suffix starting at it. Values are yielded position by
    position, then by ion type, then by charge.

    :param peptide: (str) peptide sequence
    :param types: (tuple) ion types to generate
    :param maxcharge: (int) highest charge state to generate
    """
    # `range` instead of `xrange`: works on both Python 2 and Python 3.
    for i in range(1, len(peptide) - 1):
        for ion_type in types:
            for charge in range(1, maxcharge + 1):
                piece = peptide[:i] if ion_type[0] in 'abc' else peptide[i:]
                yield mass.fast_mass(piece,
                                     ion_type=ion_type,
                                     charge=charge)
Ejemplo n.º 6
0
    def calc_precursor_mz(self, peptide, modifications, charge):
        """
        Compute the precursor mass and m/z of a peptide, including the mass
        shifts of its modifications.

        Note: The built-in Pyteomics modification handling is not used here,
        as that would require a known atomic composition of the modification.

        Parameters
        ----------
        peptide: str
            stripped peptide sequence

        modifications: str
            MS2PIP-style formatted modifications list (e.g. `0|Acetyl|2|Oxidation`)

        charge: int
            precursor charge

        Returns
        -------
        prec_mass, prec_mz: tuple(float, float)
        """
        z = int(charge)
        bare_mass = mass.fast_mass(peptide)

        # Fields alternate position|name|position|name..., so the
        # odd-indexed fields are the modification names.
        mod_names = modifications.split("|")[1::2]
        shift_total = sum(self.mass_shifts[name] for name in mod_names)

        prec_mass = bare_mass + shift_total
        return prec_mass, (prec_mass + z * PROTON_MASS) / z
def mgf_library_upload_quant(fileName, scanDict, digDict, aaDict, maxPeaks):
    """
    Build light/heavy y-ion peak lists from an MGF spectral library.

    For every spectrum whose ``(round(precursor m/z, 2), sequence)`` key is
    present in `scanDict`, singly charged y-ion m/z values are computed for
    the suffixes ``sequence[x:]`` with x from 1 to ``len(sequence) - 2`` and
    looked up in the spectrum via ``smf.approx_list``. Matched fragments are
    ranked by intensity, optionally truncated to `maxPeaks`, and each is
    emitted twice: once as the light peak and once with the m/z returned by
    ``smf.calculate_heavy_mz``.

    :param fileName: path of the MGF file, read with the pyteomics mgf module
    :param scanDict: dict keyed by ``(rounded precursor m/z, sequence)``; its
        values become the keys of the returned dict
    :param digDict: dict mapping each ``+<digits>.<digits>`` substring of a
        sequence to its replacement text
    :param aaDict: amino acid mass table passed to ``mass.fast_mass`` as
        ``aa_mass``
    :param maxPeaks: number of top-intensity fragments to keep; 0 keeps all
    :return: defaultdict(list) mapping ``scanDict[key]`` to a list of
        ``(m/z, intensity, (tag, rank, sequence))`` tuples sorted by m/z,
        where tag is 0 for the light peak and 1 for its heavy counterpart
    """
    lib = defaultdict(list)

    # The reader is used as a context manager so the file is closed even if
    # processing a spectrum raises.
    with mgf.read(fileName) as libMGF:
        for spec in libMGF:
            seq = spec['params']['seq']
            precMz = spec['params']['pepmass'][0]

            key = (round(precMz, 2), seq)
            if key not in scanDict:
                continue

            # Decimal values are replaced with the placeholders from digDict
            # so that they can be included in the analysis.
            # NOTE(review): a value missing from digDict makes re.sub raise
            # TypeError, because .get() then returns None.
            sequence = re.sub(r'\+\d+\.\d+',
                              lambda m: digDict.get(m.group()), seq)

            mz = list(spec['m/z array'])
            intensity = list(spec['intensity array'])

            # Only y-ions are collected: they should contain at least one
            # lysine or arginine and therefore have a heavy counterpart;
            # b-ions do not have that guarantee.
            fragList = []
            for x in range(1, len(sequence) - 1):
                fragseq = sequence[x:]
                lightfragmz = mass.fast_mass(sequence=fragseq,
                                             ion_type='y',
                                             charge=1,
                                             aa_mass=aaDict)
                i = smf.approx_list(lightfragmz, mz)
                if i == -1:  # treated as "no matching library peak"
                    continue
                fragList.append((intensity[i], lightfragmz, fragseq))

            # Highest intensity first; slicing is a no-op when fewer than
            # maxPeaks fragments matched.
            fragList.sort(reverse=True)
            if maxPeaks != 0:
                fragList = fragList[:maxPeaks]

            # Each light peak is paired with its heavy counterpart; both are
            # tagged with the intensity rank of the fragment.
            peaks = []
            for rank, (fragInt, fragMz, fragseq) in enumerate(fragList):
                peaks.append((fragMz, fragInt, (0, rank, seq)))
                peaks.append((smf.calculate_heavy_mz(fragseq, fragMz, 1),
                              fragInt, (1, rank, seq)))

            peaks.sort(key=lambda x: x[0])

            lib[scanDict[key]] += peaks
    return lib
Ejemplo n.º 8
0
def getIons(sequence, charge):
    '''
    Return the theoretical b and y ion entries for an MS2 spectrum.

    For every item returned by ``function.bIon(sequence)`` and then by
    ``function.yIon(sequence)``, the item itself is appended, followed by
    its ``mass.fast_mass`` value (as float) for each charge from 1 to
    ``charge - 1``. The result is meant to be compared with observed data to
    list potential ions when predicting a peptide sequence.
    '''
    series_by_type = (
        ('b', function.bIon(sequence)),
        ('y', function.yIon(sequence)),
    )
    outcome = []
    for ion_type, series in series_by_type:
        for ion in series:
            outcome.append(ion)
            outcome.extend(
                float(mass.fast_mass(ion, ion_type=ion_type, charge=z))
                for z in range(1, charge))
    return outcome
Ejemplo n.º 9
0
 def generateAminoAcidDeltaList(self, path_dir, length, starter_mass=0.0):
     """
     Append one CSV row per peptide produced by ``generatePeptides(length)``.

     The file ``exclusionListDelta_<length>_<starter_mass>.csv`` is opened
     in append mode under `path_dir` (joined by plain concatenation). A
     header row ("mz", "comment", "position") is written on every call,
     followed by one row per peptide holding its b-ion ``mass.fast_mass``
     value at charge 0, an empty comment and "absolute".
     """
     path = path_dir + "exclusionListDelta" + "_" + str(length) + "_" + str(
         starter_mass) + ".csv"
     with open(path, "a") as csvfile:
         writr = csv.writer(csvfile, lineterminator=os.linesep)
         writr.writerow(("mz", "comment", "position"))
         for batch in generatePeptides(length):
             writr.writerows(
                 (mass.fast_mass(pep, charge=0, ion_type='b'), "", "absolute")
                 for pep in batch)
Ejemplo n.º 10
0
def fastmass(pep, ion_type, charge, mod=None, cam=True):
    """
    Return ``mass.fast_mass`` of `pep` adjusted for modifications.

    :param pep: peptide sequence
    :param ion_type: ion type passed to ``mass.fast_mass``
    :param charge: charge passed to ``mass.fast_mass``; also the divisor of
        the cam and ``mod == 1`` terms
    :param mod: optional array supporting elementwise comparison and boolean
        indexing (presumably a numpy array - confirm with callers); entries
        equal to 1 each add 15.995 / charge, and the sum of the negative
        entries is subtracted without dividing by charge
    :param cam: when true, add 57.021 / charge per 'C' in `pep`
    """
    total = mass.fast_mass(pep, ion_type=ion_type, charge=charge)

    if cam:
        total += 57.021 * pep.count('C') / charge

    if mod is None:
        return total

    total += 15.995 * np.sum(mod == 1) / charge
    total += -np.sum(mod[mod < 0])
    return total
Ejemplo n.º 11
0
def bIon_db(sequence, charge):
    '''
    Create a lookup of b ion masses to compare with observed data.

    Maps ``float(mass.fast_mass(ion, ion_type='b', charge=z))`` to the ion
    it was computed from, for every charge z from 1 to ``charge - 1`` and
    every ion returned by ``function.bIon(sequence)``. On equal keys the
    later entry wins.
    '''
    b = function.bIon(sequence)
    return {
        float(mass.fast_mass(ion, ion_type='b', charge=z)): ion
        for z in range(1, charge)
        for ion in b
    }
Ejemplo n.º 12
0
def yIon_db(sequence, charge):
    '''
    Create a lookup of y ion masses to compare with observed data.

    Maps ``float(mass.fast_mass(ion, ion_type='y', charge=z))`` to the ion
    it was computed from, for every charge z from 1 to ``charge - 1`` and
    every ion returned by ``function.yIon(sequence)``. On equal keys the
    later entry wins.
    '''
    y = function.yIon(sequence)
    return {
        float(mass.fast_mass(ion, ion_type='y', charge=z)): ion
        for z in range(1, charge)
        for ion in y
    }
Ejemplo n.º 13
0
def create_theoretical_peak_map(peptide, ion_type_list, charge_set=[1]):
    """
    Map fragment-ion annotations to theoretical m/z values for a peptide
    whose modifications are written inline as signed mass shifts.

    Annotations have the form ``<ion type>:<ion number>:<charge>``. Ion types
    whose first character is a, b or c are computed from prefixes of the
    sequence; all other ion types from suffixes. An ion type ending in
    ``-iso`` is computed as the ion type without that suffix, plus
    1.007276 / charge.

    :param peptide: peptide string; its alphabetic characters are taken as
        the residues, and the per-residue tokens come from
        ``get_peptide_modification_list_inspect_format``
    :param ion_type_list: iterable of ion type strings
    :param charge_set: iterable of charge states (only iterated, never
        modified)
    :return: dict mapping annotation string to m/z
    """
    amino_acid_list = get_peptide_modification_list_inspect_format(peptide)
    only_letters_list = [letter for letter in peptide if letter.isalpha()]

    # Signed decimal numbers. The decimal point is escaped so that it cannot
    # match the sign of a directly following modification ("+16-17" is two
    # tokens, not one unparsable token).
    mod_pattern = re.compile(r'[+-](?:\d+\.?\d*|\.\d+)')

    only_mods_mass_add_list = []
    for amino_acid in amino_acid_list:
        mod_mass_to_add = 0.0
        for mod_tokenized in mod_pattern.findall(re.sub("[A-Z]", "", amino_acid)):
            mod_mass_to_add += float(mod_tokenized)
        only_mods_mass_add_list.append(mod_mass_to_add)

    residue_count = len(amino_acid_list)
    ion_to_mass_mapping = {}
    for charge in charge_set:
        for ion_type in ion_type_list:
            iso_topic_added_mass = 0.0
            real_ion_type = ion_type
            if ion_type[-4:] == "-iso":
                iso_topic_added_mass = 1.007276 / float(charge)
                real_ion_type = ion_type[:-4]

            n_terminal = real_ion_type[0] in "abc"
            for i in range(residue_count):
                if n_terminal:
                    ion_number = i + 1
                    letters = only_letters_list[:i+1]
                    mod_masses = only_mods_mass_add_list[:i+1]
                else:
                    ion_number = residue_count - i
                    letters = only_letters_list[i:]
                    mod_masses = only_mods_mass_add_list[i:]

                peak_annotation = ion_type + ":" + str(ion_number) + ":" + str(charge)
                # Modification masses are mass shifts, so they are divided by
                # the charge before being added to the m/z value.
                peak_mass = (mass.fast_mass("".join(letters), ion_type=real_ion_type, charge=charge)
                             + sum(mod_masses)/(float(charge))
                             + iso_topic_added_mass)
                ion_to_mass_mapping[peak_annotation] = peak_mass

    return ion_to_mass_mapping
Ejemplo n.º 14
0
def return_frag_mzs(peptide, z):
    """
    Return y- then b-ion m/z values for a peptide with inline ``+<mass>``
    modifications.

    Modifications matching ``+<digits>.<digits>`` are stripped from the
    sequence; each one is stored as ``mass / z`` under the 1-based position
    of the residue it follows. The y series covers the suffixes ``seq[i:]``
    for i from 1 to ``len(seq) - 2``; the b series covers the prefixes
    ``seq[:i]`` for i from ``len(seq) - 1`` down to 2. Each value is the
    ``mass.fast_mass`` result at charge `z` plus the stored shifts of the
    modifications located inside the fragment.
    """
    digPat = r'\+\d+\.\d+'
    shifts = re.findall(digPat, peptide)
    pieces = re.split(digPat, peptide)

    # Rebuild the bare sequence, recording each shift at the length reached
    # just before it (a repeated position overwrites the earlier entry).
    modValues = {}
    seq = ''
    for shift, piece in zip(shifts, pieces):
        seq += piece
        modValues[len(seq)] = float(shift[1:]) / z
    seq += pieces[len(shifts)]

    y_values = [
        mass.fast_mass(sequence=seq[i:], ion_type='y', charge=z) +
        sum([modValues[pos] for pos in modValues if pos > i])
        for i in range(1, len(seq) - 1)
    ]
    b_values = [
        mass.fast_mass(sequence=seq[:i], ion_type='b', charge=z) +
        sum([modValues[pos] for pos in modValues if pos <= i])
        for i in range(len(seq) - 1, 1, -1)
    ]
    return y_values + b_values
Ejemplo n.º 15
0
def fragments(peptide, types, max_charge):
    '''Generate m/z values for fragments of types `types` and of charges
    from 1 to `max_charge`.

    Cleavage positions run from 1 to ``len(peptide) - 2``. Ion types starting
    with a, b or c use the prefix up to the cleavage position; all other ion
    types use the suffix starting at it.

    :param peptide: (str) peptide sequence
    :param types: iterable of ion types
    :param max_charge: (int) highest charge state to generate
    :return: list of ``(m/z, ion_type, charge)`` tuples ordered by cleavage
        position, then ion type, then charge
    '''
    frags = []

    # `range` instead of `xrange`: works on both Python 2 and Python 3.
    for i in range(1, len(peptide) - 1):
        for ion_type in types:
            for charge in range(1, max_charge + 1):
                sub_pep = peptide[:i] if ion_type[0] in 'abc' else peptide[i:]
                frags.append((mass.fast_mass(sub_pep, ion_type=ion_type,
                                             charge=charge),
                              ion_type,
                              charge))

    return frags
Ejemplo n.º 16
0
def create_theoretical_peak_map(peptide, ion_type_list, charge_set=[1]):
    """
    Map fragment-ion annotations to theoretical m/z values for a peptide
    whose modifications are written inline as signed mass shifts.

    Annotations have the form ``<ion type>:<ion number>:<charge>``. Ion types
    whose first character is a, b or c are computed from prefixes of the
    sequence; all other ion types from suffixes.

    :param peptide: peptide string; its alphabetic characters are taken as
        the residues, and the per-residue tokens come from
        ``get_peptide_modification_list_inspect_format``
    :param ion_type_list: iterable of ion type strings
    :param charge_set: iterable of charge states (only iterated, never
        modified)
    :return: dict mapping annotation string to m/z
    """
    amino_acid_list = get_peptide_modification_list_inspect_format(peptide)

    only_letters_list = [letter for letter in peptide if letter.isalpha()]

    # Full signed decimal numbers. The previous pattern ([+-][1-9]*) stopped
    # at the first zero or decimal point, truncating shifts such as "+15.995"
    # to "+15".
    mod_pattern = re.compile(r'[+-](?:\d+\.?\d*|\.\d+)')

    only_mods_mass_add_list = []
    for amino_acid in amino_acid_list:
        mod_mass_to_add = 0.0
        for mod_tokenized in mod_pattern.findall(
                re.sub("[A-Z]", "", amino_acid)):
            mod_mass_to_add += float(mod_tokenized)
        only_mods_mass_add_list.append(mod_mass_to_add)

    residue_count = len(amino_acid_list)
    ion_to_mass_mapping = {}
    for charge in charge_set:
        for ion_type in ion_type_list:
            # Test the first character rather than substring membership, so
            # multi-character ion types are classified by their series letter.
            n_terminal = ion_type[0] in "abc"
            for i in range(residue_count):
                if n_terminal:
                    ion_number = i + 1
                    letters = only_letters_list[:i + 1]
                    mod_masses = only_mods_mass_add_list[:i + 1]
                else:
                    ion_number = residue_count - i
                    letters = only_letters_list[i:]
                    mod_masses = only_mods_mass_add_list[i:]

                peak_annotation = ion_type + ":" + str(
                    ion_number) + ":" + str(charge)
                # fast_mass returns m/z for the given charge, so the
                # modification mass shift is divided by the charge as well.
                peak_mass = mass.fast_mass(
                    "".join(letters), ion_type=ion_type,
                    charge=charge) + sum(mod_masses) / float(charge)
                ion_to_mass_mapping[peak_annotation] = peak_mass

    return ion_to_mass_mapping
Ejemplo n.º 17
0
def calculate_theoretical_peptide_mass(peptide_sequence, charge):
    """
    Return the theoretical m/z of a peptide with inline signed mass shifts.

    The alphabetic characters of `peptide_sequence` are passed to
    ``mass.fast_mass`` with the given charge; the signed numbers found in the
    tokens from ``get_peptide_modification_list_inspect_format`` are summed
    and added after division by the charge.

    :param peptide_sequence: peptide string with inline modifications
    :param charge: charge state; also the divisor of the modification mass,
        so it must be non-zero
    :return: float
    """
    amino_acid_list = get_peptide_modification_list_inspect_format(peptide_sequence)
    only_letters_list = [letter for letter in peptide_sequence if letter.isalpha()]

    # Signed decimal numbers. The decimal point is escaped so that it cannot
    # match the sign of a directly following modification ("+16-17" is two
    # tokens, not one unparsable token).
    mod_pattern = re.compile(r'[+-](?:\d+\.?\d*|\.\d+)')

    only_mods_mass_add_list = []
    for amino_acid in amino_acid_list:
        mod_mass_to_add = 0.0
        for mod_tokenized in mod_pattern.findall(re.sub("[A-Z]", "", amino_acid)):
            mod_mass_to_add += float(mod_tokenized)
        only_mods_mass_add_list.append(mod_mass_to_add)

    total_peptide_mass = (mass.fast_mass("".join(only_letters_list), charge=charge) + sum(only_mods_mass_add_list)/(float(charge)))

    return total_peptide_mass
Ejemplo n.º 18
0
def get_mass(peptide, mass_dic={}, fixed={"C": 57.021464}):
    """
    Compute the mass of a peptide, either from its sequence or from a
    dictionary look-up.

    :param peptide: (str) peptide sequence
    :param mass_dic: (dict) cache mapping peptide -> mass. A hit is returned
        unchanged; a miss is computed and stored. The default dict is shared
        between calls, so it acts as a process-wide memo. The cache is keyed
        on the peptide only, so a cached value does not reflect a different
        `fixed` argument.
    :param fixed: (dict) residue -> mass added once per occurrence of that
        residue in the peptide
    :return: mass of the peptide including all fixed modifications
    """
    if peptide in mass_dic:
        return mass_dic[peptide]

    # Accumulate over *all* fixed modifications; a plain assignment here
    # would keep only the contribution of the last one.
    add = sum(
        peptide.count(residue) * shift for residue, shift in fixed.items())

    pep_mass = mass.fast_mass(peptide) + add
    mass_dic[peptide] = pep_mass
    return pep_mass
Ejemplo n.º 19
0
    def __init__(self, sequence, settings, pcharge=0, evalue=0, note='unknown',
                 mass_exp=0, modifications=None, modification_list=None,
                 custom_aa_mass=None, sumI=0, mc=None, infile='unknown',
                 frag_mt=None, tags=None):
        """
        Initialise the peptide record and compute its theoretical mass.

        ``pmass`` starts from ``mass.fast_mass(sequence, charge=0)`` minus
        18.0105646837 plus the 'protein nterm cleavage' and
        'protein cterm cleavage' values of the 'modifications' settings
        section. Each entry of `modifications` then adds its 'mass' and
        removes what it replaces: the standard residue mass for an internal
        position, or the matching terminal cleavage value for position 0
        (N-terminus) or ``len(sequence) + 1`` (C-terminus).

        :param sequence: peptide sequence
        :param settings: object providing ``getfloat(section, option)``
        :param pcharge: precursor charge
        :param evalue: e-value, stored as float
        :param note: free-form note
        :param mass_exp: experimental mass
        :param modifications: iterable of dicts with 'mass' and 'position'
            keys; defaults to an empty list
        :param modification_list: stored as ``self.modification_list``;
            defaults to a fresh dict per instance
        :param custom_aa_mass: stored as ``self.aa_mass``
        :param sumI: stored as ``self.sumI``
        :param mc: stored as ``self.mc``
        :param infile: stored as ``self.infile``
        :param frag_mt: stored as ``self.fragment_mt``
        :param tags: stored as ``self.tags``; ``None`` or an empty container
            is stored as ``None``
        """
        # Fresh containers per call: the former mutable defaults were shared
        # by every instance created without these arguments.
        if modifications is None:
            modifications = []
        if modification_list is None:
            modification_list = {}

        self.sequence = sequence
        self.modified_sequence = sequence
        self.modification_out_str = ''
        self.modification_list = modification_list
        self.pcharge = int(pcharge)
        self.aa_mass = custom_aa_mass

        nterm_cleavage = settings.getfloat('modifications', 'protein nterm cleavage')
        cterm_cleavage = settings.getfloat('modifications', 'protein cterm cleavage')
        self.pmass = (float(mass.fast_mass(sequence=self.sequence, charge=0))
                      - 18.0105646837 + nterm_cleavage + cterm_cleavage)
        for modif in modifications:
            self.pmass += modif['mass']
            if modif['position'] not in [0, len(self.sequence) + 1]:
                aminoacid = self.sequence[modif['position'] - 1]
                self.pmass -= mass.std_aa_mass[aminoacid]
            elif modif['position'] == 0:
                self.pmass -= nterm_cleavage
            else:
                self.pmass -= cterm_cleavage

        # NOTE(review): raises ZeroDivisionError for the default pcharge=0;
        # the intended m/z for an unknown charge is not visible here.
        self.mz = (mass_exp + pcharge * 1.007276) / pcharge
        self.modified_peptide(modifications)
        self.evalue = float(evalue)
        self.massdiff = float(mass_exp) - float(self.pmass)
        self.num_missed_cleavages = dict()
        self.mc = mc
        self.note = note
        self.note2 = ''
        self.note3 = ''
        self.protscore2 = 1
        self.peptscore = 1
        self.peptscore2 = 1
        self.spectrum_mz = None
        self.fragment_mt = frag_mt
        self.sumI = sumI
        self.it = 1.0
        self.infile = infile
        self.fragments = defaultdict(dict)
        self.valid_sequence = dict()
        # len(None) raised TypeError for the default tags=None.
        self.tags = tags if tags is not None and len(tags) else None
Ejemplo n.º 20
0
def calculate_theoretical_peptide_mass(peptide_sequence, charge):
    """
    Return the theoretical m/z of a peptide with inline signed mass shifts.

    The alphabetic characters of `peptide_sequence` are passed to
    ``mass.fast_mass`` with the given charge; the signed numbers found in the
    tokens from ``get_peptide_modification_list_inspect_format`` are summed
    and added after division by the charge.

    :param peptide_sequence: peptide string with inline modifications
    :param charge: charge state; also the divisor of the modification mass,
        so it must be non-zero
    :return: float
    """
    amino_acid_list = get_peptide_modification_list_inspect_format(
        peptide_sequence)
    only_letters_list = [
        letter for letter in peptide_sequence if letter.isalpha()
    ]

    # Signed decimal numbers. The decimal point is escaped so that it cannot
    # match the sign of a directly following modification ("+16-17" is two
    # tokens, not one unparsable token).
    mod_pattern = re.compile(r'[+-](?:\d+\.?\d*|\.\d+)')

    only_mods_mass_add_list = []
    for amino_acid in amino_acid_list:
        mod_mass_to_add = 0.0
        for mod_tokenized in mod_pattern.findall(
                re.sub("[A-Z]", "", amino_acid)):
            mod_mass_to_add += float(mod_tokenized)
        only_mods_mass_add_list.append(mod_mass_to_add)

    total_peptide_mass = (
        mass.fast_mass("".join(only_letters_list), charge=charge) +
        sum(only_mods_mass_add_list) / (float(charge)))

    return total_peptide_mass
Ejemplo n.º 21
0
def generateMascotIons(length, starter_mass):
    """
    Yield all Mascot ions used for scoring.

    For every peptide produced by ``generatePeptides(length)`` and for the
    ion types 'b' and 'y', one tuple of six ``[m/z, label]`` lists is
    yielded: the singly charged ion (``mass.fast_mass`` at charge 1 plus
    `starter_mass`), that ion minus 3*H + N ("_star"), that ion minus
    2*H + O ("_o"), and ``calculateDoubleCharged`` applied to each of the
    three. Labels are ``<peptide>_ion_<type>_<1|2>[_star|_o]``.
    """
    water_mass = 2.0 * Nist_mass('H') + Nist_mass('O')
    amin_mass = 3.0 * Nist_mass('H') + Nist_mass('N')

    for batch in generatePeptides(length):
        for peptides in batch:
            for ion_type in ('b', 'y'):
                singly = mass.fast_mass(sequence=peptides,
                                        charge=1,
                                        ion_type=ion_type) + starter_mass
                singly_star = singly - amin_mass
                singly_o = singly - water_mass
                doubly = calculateDoubleCharged(singly)
                doubly_star = calculateDoubleCharged(singly_star)
                doubly_o = calculateDoubleCharged(singly_o)

                label = "".join(peptides) + "_ion_" + str(ion_type)
                yield (
                    [singly, label + "_1"],
                    [singly_star, label + "_1_star"],
                    [singly_o, label + "_1_o"],
                    [doubly, label + "_2"],
                    [doubly_star, label + "_2_star"],
                    [doubly_o, label + "_2_o"],
                )
Ejemplo n.º 22
0
def transform_sequence_to_masssequence(sequence, mods):
    """
    Convert an amino acid sequence into a sequence of masses.

    Parameters
    ----------
    sequence: str
        Sequence of a peptide
    mods: list
        Modifications of the peptide; ``mods[k]`` is added to the
        ``mass.fast_mass`` value of the k-th residue, so it needs at least
        one entry per residue

    Returns
    -------
    list
        one mass per position of the sequence
    """
    return [
        mass.fast_mass(residue) + mods[position]
        for position, residue in enumerate(sequence)
    ]
 def add_mass(self):
     """
     Add a 'mass' column to ``self.data_frame``.

     Each value is ``mass.fast_mass`` of the row's 'sequence' with every
     'X' character removed.
     """
     def sequence_mass(sequence):
         return mass.fast_mass(sequence.replace('X', ''))

     self.data_frame['mass'] = self.data_frame['sequence'].apply(
         sequence_mass)
Ejemplo n.º 24
0
def handcrafted_features(data, tags):
    """
    Build the handcrafted feature table for the alpha ('tra') and beta
    ('trb') chains.

    DOI 10.1007/s00251-017-1023-5
    Code from https://github.com/bittremieux/TCR-Classifier/blob/master/tcr_classifier.ipynb
    Modified to apply the handcrafted features twice, once to the alpha
    chain and again to the beta chain; to handle the split into training,
    validation and test cohorts; and for multinomial classification.

    For each chain the following frames are produced, in this order:
    one-hot encoded V/J genes, CDR3 length, amino acid counts, average
    basicity / hydrophobicity / helicity, pI, average mutation stability,
    mass, and then the positional features (amino acid occurrence,
    basicity, hydrophobicity, helicity, pI and mutation stability at each
    position counted from the center of the CDR3).

    :param data: DataFrame with the columns ``<chain>_vgene``,
        ``<chain>_jgene`` and ``<chain>_cdr3`` for both chains, plus
        ``weights``, ``split`` and one ``labels_<tag>`` column per tag
    :param tags: iterable of label tags
    :return: DataFrame with all feature frames followed by the weights,
        label and split columns, concatenated column-wise
    """

    # physicochemical amino acid properties
    basicity = {
        'A': 206.4, 'B': 210.7, 'C': 206.2, 'D': 208.6, 'E': 215.6,
        'F': 212.1, 'G': 202.7, 'H': 223.7, 'I': 210.8, 'K': 221.8,
        'L': 209.6, 'M': 213.3, 'N': 212.8, 'P': 214.4, 'Q': 214.2,
        'R': 237.0, 'S': 207.6, 'T': 211.7, 'V': 208.7, 'W': 216.1,
        'X': 210.2, 'Y': 213.1, 'Z': 214.9,
    }

    hydrophobicity = {
        'A': 0.16, 'B': -3.14, 'C': 2.50, 'D': -2.49, 'E': -1.50,
        'F': 5.00, 'G': -3.31, 'H': -4.63, 'I': 4.41, 'K': -5.00,
        'L': 4.76, 'M': 3.23, 'N': -3.79, 'P': -4.92, 'Q': -2.76,
        'R': -2.77, 'S': -2.85, 'T': -1.08, 'V': 3.02, 'W': 4.88,
        'X': 4.59, 'Y': 2.00, 'Z': -2.13,
    }

    helicity = {
        'A': 1.24, 'B': 0.92, 'C': 0.79, 'D': 0.89, 'E': 0.85,
        'F': 1.26, 'G': 1.15, 'H': 0.97, 'I': 1.29, 'K': 0.88,
        'L': 1.28, 'M': 1.22, 'N': 0.94, 'P': 0.57, 'Q': 0.96,
        'R': 0.95, 'S': 1.00, 'T': 1.09, 'V': 1.27, 'W': 1.07,
        'X': 1.29, 'Y': 1.11, 'Z': 0.91,
    }

    # No entries for 'B', 'X' and 'Z' in this table.
    mutation_stability = {
        'A': 13, 'C': 52, 'D': 11, 'E': 12, 'F': 32,
        'G': 27, 'H': 15, 'I': 10, 'K': 24, 'L': 34,
        'M': 6, 'N': 6, 'P': 20, 'Q': 10, 'R': 17,
        'S': 10, 'T': 11, 'V': 17, 'W': 55, 'Y': 31,
    }

    def averaged(table):
        # Per-sequence mean of a property table, normalised by parser.length.
        def per_sequence(seq):
            return sum([table[aa] for aa in seq]) / parser.length(seq)
        return per_sequence

    # feature conversion and generation
    features_list = []

    for chain in ['tra', 'trb']:

        # one-hot encoding of the V and J genes
        onehot_encoder = feature_extraction.DictVectorizer(sparse=False)
        gene_records = data[[chain + '_vgene',
                             chain + '_jgene']].to_dict(orient='records')
        encoded_genes = onehot_encoder.fit_transform(gene_records)
        features_list.append(
            pd.DataFrame(encoded_genes,
                         columns=onehot_encoder.feature_names_))

        cdr3_column = chain + '_cdr3'
        cdr3 = data[cdr3_column]

        def cdr3_feature(func, name):
            # Apply func to every CDR3 and return a one-column frame `name`.
            return cdr3.apply(func).to_frame().rename(
                columns={cdr3_column: name})

        # sequence length
        features_list.append(
            cdr3_feature(lambda sequence: parser.length(sequence), 'length'))

        # number of occurences of each amino acid
        compositions = [
            parser.amino_acid_composition(sequence) for sequence in cdr3
        ]
        aa_counts = pd.DataFrame.from_records(compositions).fillna(0)
        aa_counts.columns = [
            chain + '_count_{}'.format(column) for column in aa_counts.columns
        ]
        features_list.append(aa_counts)

        # physicochemical properties: (average) basicity, (average) hydrophobicity,
        #                             (average) helicity, pI, (average) mutation stability
        features_list.append(cdr3_feature(averaged(basicity), 'avg_basicity'))
        features_list.append(
            cdr3_feature(averaged(hydrophobicity), 'avg_hydrophobicity'))
        features_list.append(cdr3_feature(averaged(helicity), 'avg_helicity'))
        features_list.append(
            cdr3_feature(lambda seq: electrochem.pI(seq), 'pI'))
        features_list.append(
            cdr3_feature(averaged(mutation_stability),
                         'avg_mutation_stability'))

        # peptide mass
        features_list.append(
            cdr3_feature(lambda seq: mass.fast_mass(seq), 'mass'))

        # positional features
        # amino acid occurence and physicochemical properties at a given position from the center
        pos_aa = []
        pos_basicity = []
        pos_hydro = []
        pos_helicity = []
        pos_pI = []
        pos_mutation = []
        for sequence in cdr3:
            length = parser.length(sequence)
            start_pos = -1 * (length // 2)
            if length % 2 == 1:
                pos_range = list(range(start_pos, start_pos + length))
            else:
                # Even lengths have no center residue, so position 0 is skipped.
                pos_range = list(range(start_pos, 0)) + list(
                    range(1, start_pos + length + 1))
            placed = list(zip(pos_range, sequence))

            pos_aa.append({
                chain + '_pos_{}_{}'.format(pos, aa): 1
                for pos, aa in placed
            })
            pos_basicity.append({
                chain + '_pos_{}_basicity'.format(pos): basicity[aa]
                for pos, aa in placed
            })
            pos_hydro.append({
                chain + '_pos_{}_hydrophobicity'.format(pos):
                hydrophobicity[aa]
                for pos, aa in placed
            })
            pos_helicity.append({
                chain + '_pos_{}_helicity'.format(pos): helicity[aa]
                for pos, aa in placed
            })
            pos_pI.append({
                chain + '_pos_{}_pI'.format(pos): electrochem.pI(aa)
                for pos, aa in placed
            })
            pos_mutation.append({
                chain + '_pos_{}_mutation_stability'.format(pos):
                mutation_stability[aa]
                for pos, aa in placed
            })

        for records in (pos_aa, pos_basicity, pos_hydro, pos_helicity,
                        pos_pI, pos_mutation):
            features_list.append(pd.DataFrame.from_records(records).fillna(0))

    features_list.append(data['weights'])
    features_list.extend(data['labels_' + tag] for tag in tags)
    features_list.append(data['split'])

    # combine all features
    return pd.concat(features_list, axis=1)
Ejemplo n.º 25
0
    def generate_fragments_from_peptide(self,
                                        peptide,
                                        ion_types,
                                        label_format=None,
                                        min_charge=1,
                                        max_charge=1,
                                        aa_mass_dict=None,
                                        polarity="+",
                                        ion_composition=None,
                                        modification_dict=None,
                                        verbose=False):
        """Generate theoretical fragment ions for a peptide sequence.

        For every requested ion type an m/z value is computed with
        ``mass.fast_mass`` for each charge in ``min_charge..max_charge``:

        * a type equal to ``self._M_all_`` uses the whole peptide;
        * a type in ``self._all_abc_all_`` uses the prefixes ``peptide[:i]``;
        * a type in ``self._all_xyz_all_`` uses the suffixes ``peptide[i:]``.

        Prefixes/suffixes (and whole peptides, for abc/xyz types) rejected by
        ``self.check_peptide_rules`` are skipped.

        :param peptide: peptide sequence
        :param ion_types: ion types to generate; first passed through
            ``self.replace_ion_composition_shortcut``
        :param label_format: not used by this method
        :param min_charge: lowest charge state; values below 1 are raised to 1
        :param max_charge: highest charge state; swapped with ``min_charge``
            when it is the smaller of the two
        :param aa_mass_dict: not used by this method
        :param polarity: polarity marker used when building ion labels
        :param ion_composition: ion composition forwarded to
            ``mass.fast_mass`` as ``ion_comp``; defaults to
            ``self.ion_composition``
        :param modification_dict: modifications forwarded to
            ``self.check_modification``; ``None`` or empty disables
            modification handling
        :param verbose: print a one-line summary when true
        :return: dict mapping an ion label to a dict with the keys ``mz``,
            ``z`` and ``seq`` (abc/xyz entries also carry ``full_label`` and
            ``label``)
        """
        tstart = ttime()

        # a fresh dict per call instead of a shared mutable default
        if modification_dict is None:
            modification_dict = {}

        # specify charges
        if min_charge < 1:
            min_charge = 1

        if max_charge < min_charge:
            max_charge, min_charge = min_charge, max_charge

        charges = range(min_charge, max_charge + 1)

        include_modifications = len(modification_dict) > 0

        # determine ion composition
        if ion_composition is None:
            ion_composition = self.ion_composition

        # check if shortcuts were used
        ion_types = self.replace_ion_composition_shortcut(ion_types)

        fragment_dict = {}
        for ion_type in ion_types:
            # whole-peptide (precursor-like) ions
            if ion_type in [self._M_all_]:
                for charge in charges:
                    ion_mz = mass.fast_mass(peptide,
                                            ion_type=ion_type,
                                            charge=charge,
                                            ion_comp=ion_composition)

                    ion_label = "{}{}{}".format(ion_type, polarity, charge)
                    fragment_dict[ion_label] = {
                        'mz': ion_mz,
                        'z': charge,
                        'seq': peptide
                    }

            # prefix fragments
            if ion_type in self._all_abc_all_:
                if not self.check_peptide_rules(ion_type, peptide):
                    continue
                for i in range(1, len(peptide)):
                    peptide_seq = peptide[:i]
                    if not self.check_peptide_rules(ion_type, peptide_seq):
                        continue
                    mod_peptide_seq, modification_mass = peptide_seq, 0
                    if include_modifications:
                        mod_peptide_seq, modification_mass = self.check_modification(
                            i, peptide_seq, modification_dict)

                    for charge in charges:
                        ion_mz = mass.fast_mass(peptide_seq,
                                                ion_type=ion_type,
                                                charge=charge,
                                                ion_comp=ion_composition)

                        # float() keeps integer modification masses from
                        # being truncated by integer division
                        ion_mz = ion_mz + (modification_mass / float(charge))

                        ion_label, ion_label_full = self.generate_label(
                            ion_type[0], i, polarity, charge, mod_peptide_seq)
                        # NOTE(review): 'seq' holds the unmodified sequence
                        # here but the modified one for xyz ions below -
                        # confirm which one consumers expect.
                        fragment_dict[ion_label_full] = {
                            'mz': ion_mz,
                            'z': charge,
                            'seq': peptide_seq,
                            'full_label': ion_label_full,
                            'label': ion_label
                        }

            # suffix fragments
            if ion_type in self._all_xyz_all_:
                if not self.check_peptide_rules(ion_type, peptide):
                    continue
                # fragment number of the suffix starting at index i
                # (len(peptide) - i)
                _frag_label_length = np.arange(len(peptide), 0, -1)
                for i in range(1, len(peptide)):
                    peptide_seq = peptide[i:]
                    if not self.check_peptide_rules(ion_type, peptide_seq):
                        continue
                    mod_peptide_seq, modification_mass = peptide_seq, 0
                    if include_modifications:
                        mod_peptide_seq, modification_mass = self.check_modification(
                            i + 1, peptide_seq, modification_dict)

                    for charge in charges:
                        ion_mz = mass.fast_mass(peptide_seq,
                                                ion_type=ion_type,
                                                charge=charge,
                                                ion_comp=ion_composition)

                        # modify ion mass with modification mass
                        ion_mz = ion_mz + (modification_mass / float(charge))

                        # generate label
                        ion_label, ion_label_full = self.generate_label(
                            ion_type[0],
                            _frag_label_length[i],
                            polarity,
                            charge,
                            mod_peptide_seq,
                            full_ion_type=ion_type)

                        fragment_dict[ion_label_full] = {
                            'mz': ion_mz,
                            'z': charge,
                            'seq': mod_peptide_seq,
                            'full_label': ion_label_full,
                            'label': ion_label
                        }

        # print verbose information
        if verbose:
            msg = "Peptide length: {} | # Fragments: {} | Time to generate: {:.4f}".format(
                len(peptide), len(fragment_dict),
                ttime() - tstart)
            print(msg)

        return fragment_dict
Ejemplo n.º 26
0
def _read_var_mods(params):
    '''
    Collect the variable modifications declared in the search parameters.

    Indices start at 1; the scan stops at the first index for which
    params.getVarModsName returns a falsy value.

    Return value:
        list of varMod objects, in index order
    '''
    var_mods = []
    mod_index = 1
    while True:
        mod_name = params.getVarModsName(mod_index)
        if not mod_name:
            break
        var_mods.append(
            varMod(mod_index, mod_name, params.getVarModsDelta(mod_index),
                   params.getVarModsNeutralLoss(mod_index)))
        mod_index += 1
    return var_mods


def _read_expt_fragments(queryData):
    '''
    Return the peaks of a query as a list of [mass, intensity] pairs.

    Peaks are read with 1 as the first argument of the msparser peak getters
    (presumably the ions block number - confirm against the msparser docs).
    '''
    num_peaks = queryData.getNumberOfPeaks(1)
    return [[queryData.getPeakMass(1, j),
             queryData.getPeakIntensity(1, j)]
            for j in range(1, 1 + num_peaks)]


def get_peptide_results(resfile, mgfDataArray, options):
    '''
    Retrieve peptide assignments and PTM specifications from mascot .dat file.

    Only peptide matches that pass all of the following are reported:
        - ions score at or above the identity threshold
        - exactly one 'C' in the sequence and precursor charge 2
        - exactly one 'R' or 'K' in the sequence
        - exactly one occurrence of str(options.targetMod) in the variable
          modification string
        - intensity of the closest MGF entry (by retention time) of at least
          options.minIntensity

    If options.printVarMods is set, the variable modifications are printed
    and the process exits via sys.exit().

    Parameters:
        resfile      - msparser results file object
        mgfDataArray - 2D array; column 0 is read as retention time and
                       column 2 as intensity
        options      - object providing printVarMods, targetMod and
                       minIntensity

    Return values:
        1) list of mascotHit objects
        2) list of varMod objects
        3) number of hits reported
    '''

    # get file header data
    params = resfile.params()

    # get mgf rt vector
    mgfRTs = mgfDataArray[:, 0]

    # build list of variable modifications and associated mass offsets
    var_mods = _read_var_mods(params)

    if options.printVarMods:
        for var_mod in var_mods:
            print(var_mod)
        sys.exit()

    (_script_name, flags, _min_probability, _max_hits, ignoreIonsScoreBelow,
     minPepLenInPepSummary, _use_peptide_summary,
     flags2) = resfile.get_ms_mascotresults_params(msparser.ms_mascotoptions())

    # One peptide summary serves both the protein/peptide walk and the
    # threshold lookups.
    pepsum = msparser.ms_peptidesummary(
        resfile,  # results file object
        flags,  # MSRES_group_proteins
        1,  # minProbability
        999999999,  # maxHits
        '',  # unigeneIndexFile
        ignoreIonsScoreBelow,  # ignore hits below
        minPepLenInPepSummary,  # minPepLenINPepSummary
        '',  # singleHit
        flags2)  # flags2

    mascot_hits = []
    total_index = 0

    # protein hits are numbered from 1; getHit returning None ends the walk
    max_protein_hits = 10000000
    prot_index = 1
    while prot_index < max_protein_hits:
        prot = pepsum.getHit(prot_index)
        if prot is None:
            break

        prot_acc = prot.getAccession()

        for pep_number in range(1, prot.getNumPeptides() + 1):
            pep = pepsum.getPeptide(prot.getPeptideQuery(pep_number),
                                    prot.getPeptideP(pep_number))

            # skip queries without any assigned peptide
            if not pep.getAnyMatch():
                continue

            query = pep.getQuery()  # returns index of query
            queryData = msparser.ms_inputquery(resfile, query)

            rank = pep.getRank()
            charge = pep.getCharge()
            mz = pep.getObserved()
            seq = pep.getPeptideStr()
            score = pep.getIonsScore()
            mod_string = pep.getVarModsStr()
            prot_matches = pep.getProteins()
            rt = queryData.getRetentionTimes()
            miss = pep.getMissedCleavages()

            identity = pepsum.getPeptideIdentityThreshold(query, 20)
            homology = pepsum.getHomologyThreshold(query, 20)
            pep_score = pep.getIonsScore()

            # TODO: connect UI threshold setting to this conditional
            if float(score) < float(identity):
                continue

            # get 2+ peptides with 1 cysteine
            if seq.count('C') != 1 or int(charge) != 2:
                continue

            # exclude missed cleavages and terminal peptides
            if seq.count('R') + seq.count('K') != 1:
                continue

            # make sure the target modification occurs exactly once
            if mod_string.count(str(options.targetMod)) != 1:
                continue

            # np.argmin raises on an empty array, so check before the lookup
            if len(mgfRTs) < 1:
                print('Warning, no MGF intensity found for entry: mz: %s, rt: %s' % (
                    mz, rt))
                continue

            # closest MGF entry by retention time; it must agree with the
            # query retention time in either direction
            index = np.argmin(np.absolute(mgfRTs - float(rt)))
            assert abs(float(rt) - float(mgfDataArray[index][0])) < 0.001
            intensity = mgfDataArray[index][2]

            if intensity < options.minIntensity:
                continue

            hit = mascotHit(float(mz), charge, float(rt), miss, score,
                            seq, mod_string, query, rank, total_index,
                            prot_index, prot_acc, prot_matches,
                            identity, homology, pep_score)

            hit.exptFragments = _read_expt_fragments(queryData)
            hit.intensity = intensity
            hit.sequence_mass = float(mass.fast_mass(seq, charge=2))
            hit.index = total_index
            mascot_hits.append(hit)
            total_index += 1

        prot_index += 1

    return mascot_hits, var_mods, total_index
Ejemplo n.º 27
0
def get_fragments(peptide,
                  mod_string,
                  var_mods,
                  types=('b', 'y'),
                  maxcharge=2):
    '''
    Generate theoretical sequence ions for a given peptide sequence string
        - types argument to be replaced by user specified match ions
        - maxcharge to be replaced by user specified value

    Ion types starting with a, b or c are built from the prefixes
    peptide[:i]; ion types starting with x, y or z are built from the
    suffixes peptide[i:], for i in 1..len(peptide)-1. The summed modification
    mass of the residues in the fragment, divided by the charge, is added to
    the m/z returned by mass.fast_mass.

    Fragments are generated for charges 1..maxcharge-1.

    Parameters:
        peptide    - peptide sequence string
        mod_string - modification string with one entry per residue plus one
                     entry for each terminus (len(peptide) + 2 entries); the
                     two terminal entries are dropped
        var_mods   - variable modifications, forwarded to
                     get_fragment_mod_masses
        types      - ion types to generate
        maxcharge  - exclusive upper bound on the fragment charge

    Return Value:
        - PeptideFragments object. For each ion series (a, b, c, x, y, z)
          with at least one fragment, the attribute of the same name is set
          to a list of entries with the structure
          ['FRAGMENT SEQUENCE', m/z of fragment, ion type, charge]
    '''

    # drop the N- and C-terminal modification entries
    mod_string = mod_string[1:-1]

    assert len(peptide) == len(mod_string)

    # per-residue masses to add/subtract
    frag_mod_mass = get_fragment_mod_masses(list(mod_string), var_mods)

    pepFrags = PeptideFragments()

    series_names = 'abcxyz'
    n_terminal = ('a', 'b', 'c')
    c_terminal = ('x', 'y', 'z')
    series = dict((name, []) for name in series_names)

    # generate fragment ions and apply mods (native or CRM) as needed
    for i in range(1, len(peptide)):
        for ion_type in types:
            # first letter selects the series, so e.g. 'b-H2O' joins 'b'
            series_name = ion_type[:1]
            if series_name in n_terminal:
                frag, mods = peptide[:i], frag_mod_mass[:i]
            elif series_name in c_terminal:
                frag, mods = peptide[i:], frag_mod_mass[i:]
            else:
                continue

            # NOTE(review): maxcharge itself is excluded; confirm whether it
            # is meant as the precursor charge or as the highest fragment
            # charge.
            for charge in range(1, maxcharge):
                # mass of base fragment
                mz = mass.fast_mass(frag, ion_type=ion_type, charge=charge)

                # add total mass of modifications - including CRM - scaled
                # by charge because mz is a mass-to-charge ratio
                mz = mz + sum(mods) / float(charge)

                series[series_name].append(
                    [frag, float(mz), ion_type, charge])

    # only series that produced fragments are set on the result
    for name in series_names:
        if series[name]:
            setattr(pepFrags, name, series[name])

    return pepFrags
Ejemplo n.º 28
0
def fastmass(pep, ion_type, charge):
    """Return ``mass.fast_mass`` of ``pep`` plus 57.021 per 'C' residue.

    The per-residue shift is presumably the carbamidomethylation of cysteine
    (TODO confirm). It is divided by ``charge`` so that it matches the m/z
    scale of ``mass.fast_mass``. When ``charge`` is ``None`` or 0 the shift
    is added undivided.

    :param pep: peptide sequence string
    :param ion_type: ion type forwarded to ``mass.fast_mass``
    :param charge: charge state forwarded to ``mass.fast_mass``
    :return: shifted mass or m/z
    """
    base = mass.fast_mass(pep, ion_type=ion_type, charge=charge)
    shift = 57.021 * pep.count('C')
    if charge:
        shift = shift / charge
    return base + shift
Ejemplo n.º 29
0
def fragments_b(peptide, maxcharge=1):
    """Yield the b-ion ``mass.fast_mass`` value of every prefix of ``peptide``.

    Prefixes run from the first residue up to and including the full
    sequence. ``maxcharge`` is passed as the single charge state used for all
    fragments.
    """
    for end, _ in enumerate(peptide, 1):
        prefix = peptide[:end]
        yield mass.fast_mass(prefix, ion_type='b', charge=maxcharge)
Ejemplo n.º 30
0
 def getNativeMass(self):
     """Return ``mass.fast_mass`` of ``self.sequence`` with no ion type or charge given."""
     native_sequence = self.sequence
     return mass.fast_mass(sequence=native_sequence)
Ejemplo n.º 31
0
 def test_fast_mass(self):
     """fast_mass with a custom aa_mass equals the residue sum plus 2 H and 1 O."""
     for peptide in self.random_peptides:
         observed = mass.fast_mass(peptide, aa_mass=self.test_aa_mass)
         residue_total = sum(
             peptide.count(residue) * residue_mass
             for residue, residue_mass in self.test_aa_mass.items())
         expected = residue_total + self.mass_H * 2.0 + self.mass_O
         self.assertAlmostEqual(observed, expected)
Ejemplo n.º 32
0
 def test_aa_mass(self):
     """fast_mass of a single standard residue equals its residue mass plus H2O."""
     water = mass.calculate_mass(formula='H2O')
     for residue, residue_mass in mass.std_aa_mass.items():
         expected = residue_mass + water
         self.assertEqual(expected, mass.fast_mass(residue))
Ejemplo n.º 33
0
def fragments_y(peptide, maxcharge=1):
    """Yield the y-ion ``mass.fast_mass`` value of every suffix of ``peptide``.

    Suffixes run from the full sequence down to the last residue alone.
    ``maxcharge`` is passed as the single charge state used for all
    fragments.
    """
    for start, _ in enumerate(peptide):
        suffix = peptide[start:]
        yield mass.fast_mass(suffix, ion_type='y', charge=maxcharge)