def Model_2(train, test):
    '''
    Train a bagged XGBoost classifier and save its test-set predictions.

    Class-1 probabilities are written to "Submission_2.csv" and hard labels
    to "Prediction_2.csv"; both files carry the test set's "ID" column.

    train : Training set (read through its 'Sequence' and ' Label' columns)
    test  : Test set (read through its ' Sequence' and 'ID' columns)
    '''
    # NOTE(review): the sequence column is addressed as 'Sequence' for train
    # but ' Sequence' (leading space) for test -- confirm against the inputs.

    def featurize(seq):
        # AAC features, then mass per residue, then electrochem.charge called
        # with the sequence length as its second argument, then the
        # isoelectric point.
        return (AAC(seq)
                + [mass.calculate_mass(sequence=seq) / len(seq)]
                + [electrochem.charge(seq, len(seq))]
                + [ProteinAnalysis(seq).isoelectric_point()])

    # Preprocessing
    train_rows = [featurize(seq) for seq in train['Sequence']]
    test_rows = [featurize(seq) for seq in test[' Sequence']]
    labels = train[' Label']
    X_train = np.array(train_rows)
    Y_train = np.array(labels)
    X_test = np.array(test_rows)
    X_train, Y_train = shuffle(X_train, Y_train, random_state=3)

    # Training
    # NOTE(review): 'n_thread' is passed through unchanged; verify that the
    # installed xgboost version recognises this parameter name.
    xgb_settings = {
        'max_depth': 25,
        'objective': 'reg:logistic',
        'n_estimators': 100,
        'booster': 'gbtree',
        'colsample_bylevel': 0.7,
        'colsample_bytree': 1,
        'n_thread': 2,
    }
    xgb = XGBClassifier(**xgb_settings, random_state=3)
    clf = BaggingClassifier(base_estimator=xgb, n_estimators=23,
                            random_state=3, n_jobs=-1)
    clf.fit(X_train, Y_train)

    # Predicting
    Y_prob = [row[1] for row in clf.predict_proba(X_test)]
    Y_pred = clf.predict(X_test)

    result = pd.DataFrame()
    result["ID"] = test["ID"]
    for column_values, out_name in ((Y_prob, "Submission_2.csv"),
                                    (Y_pred, "Prediction_2.csv")):
        result["Label"] = column_values
        result.to_csv(out_name, index=False)
def total_mz(self):
    """Return the combined m/z of ``self.peptides``, caching it on first use.

    The value is stored in ``self._total_mz``; once that is not ``None`` it
    is returned as-is without recomputation.
    """
    if self._total_mz is not None:
        return self._total_mz

    # One H2 is subtracted for every peptide beyond the first.
    h2_correction = (len(self.peptides) - 1) * mass.calculate_mass(formula="H2")
    summed_mass = sum(p.mass for p in self.peptides)
    neutral_mass = summed_mass - h2_correction
    self._total_mz = (neutral_mass / self.total_charge) + mass.calculate_mass(formula="H")
    return self._total_mz
def fragments_multi(prot_seq, obs_mass, cal_type, dataframe, tolerance):
    """Find sub-sequences of ``prot_seq`` whose mass matches ``obs_mass``.

    For every start position a window of candidate end positions is tried;
    the window is ``int(obs_mass) // 107 .. int(obs_mass) // 95`` (exclusive)
    shifted by the start position.  A candidate is reported when its
    calculated mass, rounded to one decimal, is within ``tolerance``
    (absolute) of ``obs_mass``.

    Parameters
    ----------
    prot_seq : sequence of residues (indexable, sliceable)
    obs_mass : observed mass to match
    cal_type : 'mono' selects ``mass.std_aa_mass`` with ``average=False``;
        anything else selects ``mass.std_aa_comp`` with ``average=True``
    dataframe : unused; kept for interface compatibility
    tolerance : absolute tolerance passed to ``math.isclose``

    Returns
    -------
    list of rows ``[label, first_residue, start (1-based), last_residue, end,
    obs_mass, calculated_mass, obs_mass - calculated_mass]`` where ``label``
    is 'Single' when the fragment ends at the last residue of ``prot_seq``
    and 'Double' otherwise.
    """
    # NOTE(review): for 'mono' a table of masses (std_aa_mass) is handed to
    # the ``aa_comp`` argument -- confirm this is what calculate_mass expects.
    if cal_type == 'mono':
        aa_comp = dict(mass.std_aa_mass)
        ave_cal = False
    else:
        aa_comp = dict(mass.std_aa_comp)
        ave_cal = True

    seq_len = len(prot_seq)
    # 107 and 95 presumably bound the average residue mass -- TODO confirm.
    first_end = int(obs_mass) // 107
    last_end = int(obs_mass) // 95

    found = []
    for start in range(seq_len):
        for end in range(first_end + start, last_end + start):
            if end > seq_len:
                break
            # Computed once per candidate (the mass was previously
            # recalculated three times for every hit).
            calc_mass = round(
                mass.calculate_mass(prot_seq[start:end], average=ave_cal,
                                    aa_comp=aa_comp), 1)
            if not math.isclose(calc_mass, obs_mass, abs_tol=tolerance):
                continue
            label = 'Single' if end == seq_len else 'Double'
            found.append([
                label,
                prot_seq[start],
                int(start + 1),
                prot_seq[end - 1],
                int(end),
                obs_mass,
                calc_mass,
                round(obs_mass - calc_mass, 1),
            ])
    return found
def in_silico_fragmentation(fn):
    """Build singly charged b/y product-ion lists for each peptide in a table.

    ``fn`` is read with ``pandas.read_table`` and must provide the columns
    'Precursor Charge', 'Base Peptide Sequence' and 'Peptide Sequence'.

    Returns a dict keyed by 'Peptide Sequence'.  Each value is a flat list
    ``[b1, y1, b2, y2, ...]`` of m/z values, or ``[0.0, 0.0]`` when
    ``calc_precursor_theoretical`` could not parse the sequence.  Only the
    first row seen for a given sequence is processed.
    """
    df = pandas.read_table(fn)
    products = {}
    for _, row in df.iterrows():
        xchg = row['Precursor Charge']
        bseq = row['Base Peptide Sequence']
        seq = row['Peptide Sequence']
        # `in` replaces dict.has_key, which no longer exists in Python 3.
        if seq in products:
            continue

        parseq, _theomass, _theomz = calc_precursor_theoretical(seq, int(xchg))
        if parseq is None:
            products[seq] = [0.0, 0.0]
            continue

        theoSpec = []
        # Only charge 1 fragments are generated.
        for c in (1,):
            for n in range(1, len(bseq)):
                # b: leading elements of the parsed sequence plus its last
                # element; y: 'H-' plus the remaining elements.
                bproduct = parseq[:n + 1] + [parseq[-1]]
                yproduct = ['H-'] + parseq[n + 1:]
                bp = mass.calculate_mass(parsed_sequence=bproduct,
                                         ion_type='b', aa_comp=composition,
                                         charge=c)
                yp = mass.calculate_mass(parsed_sequence=yproduct,
                                         ion_type='y', aa_comp=composition,
                                         charge=c)
                theoSpec.append(bp)
                theoSpec.append(yp)
        products[seq] = theoSpec
    return products
def lossConvert(loss, charge):
    """Translate a neutral-loss code into a mass value.

    '' gives 0; 'n' gives ``massC.calculate_mass`` of NH3 and 'o' of H2O,
    both at ``charge``.  Any other code gives ``None``.
    """
    if loss == '':
        return 0
    for code, formula in (('n', 'NH3'), ('o', 'H2O')):
        if loss == code:
            return massC.calculate_mass(formula=formula, charge=charge)
    return None
def calc_precursor_theoretical(seq, z):
    """Parse ``seq`` and compute its theoretical mass and m/z at charge ``z``.

    The sequence is passed through ``seqModX`` and parsed with ``modLabels``
    (termini kept); masses use the ``composition`` table.

    Returns ``(parsed_sequence, mass, mz)``, or ``(None, None, None)`` when
    any step raises.
    """
    try:
        parseq = parser.parse(seqModX(seq), labels=modLabels,
                              show_unmodified_termini=True)
        theomass = mass.calculate_mass(parsed_sequence=parseq,
                                       aa_comp=composition)
        theomz = mass.calculate_mass(parsed_sequence=parseq,
                                     aa_comp=composition, charge=z)
    except Exception:
        # Best-effort by design, but no longer a bare `except:` so that
        # KeyboardInterrupt / SystemExit still propagate.
        return (None, None, None)
    return (parseq, theomass, theomz)
def test_calculate_mass(get_mass):
    """Check ``mass.calculate_mass`` against fixed values and fixture data.

    ``get_mass`` supplies items whose element 0 is a sequence and element 1
    the expected mass; all comparisons use the relative tolerance ``REL``.
    """
    approx = pytest.approx

    acde_mass = approx(436.12639936, REL)
    assert mass.calculate_mass("ACDE") == acde_mass
    assert mass.calculate_mass(mass.Composition("ACDE")) == approx(
        436.12639936, REL)
    assert mass.calculate_mass(parsed_sequence="ACDE") == approx(
        418.115834, REL)
    assert mass.calculate_mass("A") == approx(89.04767846841, REL)

    for entry in get_mass:
        sequence, expected = entry[0], entry[1]
        assert mass.calculate_mass(sequence) == approx(expected, REL)
def calc_precursor_theoretical(seq, z):
    """Parse ``seq`` and compute its theoretical mass and m/z at charge ``z``.

    The sequence is passed through ``seqModX`` and parsed with ``modLabels``
    (termini kept); masses use the ``composition`` table.

    Returns ``(parsed_sequence, mass, mz)``, or ``(None, None, None)`` when
    any step raises.
    """
    try:
        parseq = parser.parse(seqModX(seq), labels=modLabels,
                              show_unmodified_termini=True)
        theomass = mass.calculate_mass(parsed_sequence=parseq,
                                       aa_comp=composition)
        theomz = mass.calculate_mass(parsed_sequence=parseq,
                                     aa_comp=composition, charge=z)
    except Exception:
        # Best-effort by design, but no longer a bare `except:` so that
        # KeyboardInterrupt / SystemExit still propagate.
        return (None, None, None)
    return (parseq, theomass, theomz)
def __init__(self, sequence='', z=1, ion='H+', modification=''):
    """
    Constructor.

    Assigns masses computed from fixed chemical formulas into
    ``self.DNA_in_base`` (which must already exist) and sets
    ``self.std_aa_mass`` to a table of mass constants keyed by one-letter
    amino-acid code.

    ``sequence``, ``z``, ``ion`` and ``modification`` are accepted but not
    read in this constructor body.
    """
    # (key, formula) pairs, assigned in this order.
    base_formulas = (
        ('A', 'C5H4N5'),
        ('G', 'C5H4N5O'),
        ('C', 'C4H4N3O'),
        ('T', 'C5H5N2O2'),
        ('U', 'C4H3N2O2'),
        ('DeoxyRibose', 'C5H7O2'),
        ('H', 'H'),
        ('H+', 'H+'),
        ('O', 'O'),
        ('P', 'P'),
    )
    for key, formula in base_formulas:
        self.DNA_in_base[key] = mass.calculate_mass(formula=formula)

    self.std_aa_mass = {
        'G': 57.02146,
        'A': 71.03711,
        'S': 87.03203,
        'P': 97.05276,
        'V': 99.06841,
        'T': 101.04768,
        'C': 103.00919,
        'L': 113.08406,
        'I': 113.08406,
        'N': 114.04293,
        'D': 115.02694,
        'Q': 128.05858,
        'K': 128.09496,
        'E': 129.04259,
        'M': 131.04049,
        'H': 137.05891,
        'F': 147.06841,
        'R': 156.10111,
        'Y': 163.06333,
        'W': 186.07931,
    }
def parse_data(self):
    """Collect masses of the losses listed under "standard" rules.

    Walks every rule in ``self.data``; for rules whose "annotation" is
    "standard", the mass of each formula in "losses" is appended to
    ``self.masses``.  Finally ``self.masses`` is replaced by a de-duplicated
    list (order not preserved).
    """
    for key in self.data:
        for rule in self.data[key]:
            if rule["annotation"] != "standard":
                continue
            self.masses.extend(
                mass.calculate_mass(formula=loss) for loss in rule["losses"])
    self.masses = list(set(self.masses))
def test_Unimod_mass(self):
    """Every Unimod entry's 'mono_mass' must match its computed mass.

    For each modification in 'unimod.xml.gz' the mass calculated from its
    'composition' (using the database's own ``mass_data``) has to agree with
    the stored 'mono_mass' to within 0.00001.
    """
    # The archive is opened in a `with` block so the handle is closed even
    # when an assertion fails (it used to be left open).
    with gzip.open('unimod.xml.gz') as source:
        db = mass.Unimod(source)
        for mod in db.mods:
            computed = mass.calculate_mass(mod['composition'],
                                           mass_data=db.mass_data)
            self.assertGreater(0.00001, abs(mod['mono_mass'] - computed))
def ntps_updateMzBand(self):
    """Refresh the m/z band entry from the current per-atom count ranges.

    Takes the minimum and maximum of every entry returned by
    ``self.get_atom_range_dict(silent=True)``, computes the mass of both
    extreme compositions and writes ``"<min>-<max>"`` (truncated to ints)
    into ``self.NTPS_mz_band_entry``.  Does nothing if the range dict cannot
    be obtained.  Always returns ``None``.
    """
    try:
        atom_dict = self.get_atom_range_dict(silent=True)
    except Exception:
        # Best-effort: leave the entry untouched.  No longer a bare
        # `except:`, so KeyboardInterrupt / SystemExit propagate.
        return

    # .items() works on Python 2 and 3 (iteritems is Python 2 only).
    minMassD = {atom: min(counts) for atom, counts in atom_dict.items()}
    maxMassD = {atom: max(counts) for atom, counts in atom_dict.items()}

    minMass = mass.calculate_mass(composition=minMassD)
    maxMass = mass.calculate_mass(composition=maxMassD)
    self.NTPS_mz_band_entry.setText('%s-%s' % (int(minMass), int(maxMass)))
    return
def write_to_csv(output_mapping_dict):
    """Write all information about hits into "spectra_map.csv".

    ``output_mapping_dict`` maps a chemical formula to a sequence of
    per-hit columns.  The data is written column-wise to a temporary CSV
    (formulas, their calculated masses, then the zipped values), transposed
    into a second temporary CSV under fixed headers, re-read with pandas,
    reordered and saved as 'spectra_map.csv' in the working directory.

    Both temporary files are removed afterwards, including when an error
    occurs part-way through.
    """
    temp1_path = 'spectra_map_temp1.csv'
    temp2_path = 'spectra_map_temp2.csv'
    headers = [
        "Antibase Chemical Formula", "Antibase Molecular Weight", "Adduct",
        "Scan/Alignment Number", "RT", "Scan/Alignment M/Z", "PPM"
    ]
    try:
        # newline='' is what the csv module requires for files it reads or
        # writes; without it blank rows appear on some platforms.
        with open(temp1_path, "w", newline='') as temp1_file:
            writer = csv.writer(temp1_file)
            writer.writerow(output_mapping_dict.keys())
            writer.writerow([mass.calculate_mass(formula=key)
                             for key in output_mapping_dict])
            writer.writerows(zip(*output_mapping_dict.values()))

        # Read back inside a `with` so the handle is closed.
        with open(temp1_path, "rt", newline='') as temp1_file:
            transpose = list(zip(*csv.reader(temp1_file)))

        with open(temp2_path, "w", newline='') as temp2_file:
            writer2 = csv.writer(temp2_file)
            writer2.writerow(headers)
            writer2.writerows(transpose)

        df = pd.read_csv(temp2_path)
        # rearrange columns here
        df_reorder = df[[
            'Scan/Alignment Number', 'Scan/Alignment M/Z', 'Adduct',
            'Antibase Chemical Formula', 'Antibase Molecular Weight', 'PPM',
            'RT'
        ]]
        df_reorder.to_csv('spectra_map.csv', index=False)
    finally:
        # os.remove replaces the shell call to `rm` (portable, no subshell).
        for path in (temp1_path, temp2_path):
            if os.path.exists(path):
                os.remove(path)
def test_annotate_peptide_fragments():
    """All theoretical fragments hidden among random peaks get annotated.

    For each peptide the theoretical fragment m/z values are jittered by at
    most 90% of the tolerance, placed at the front of a 150-peak random
    spectrum, and the number of non-zero annotations must equal the number
    of fragments.
    """
    fragment_tol_mass = 0.02
    fragment_tol_mode = 'Da'
    num_peaks = 150
    charge = 2
    max_jitter = 0.9 * fragment_tol_mass

    for peptide in ('SYELPDGQVITIGNER', 'MFLSFPTTK', 'DLYANTVLSGGTTMYPGIADR',
                    'YLYEIAR', 'VAPEEHPVLLTEAPLNPK'):
        theoretical = spectrum._get_theoretical_peptide_fragments(peptide)
        fragment_mz = np.asarray(
            [fragment.calc_mz for fragment in theoretical])
        fragment_mz += np.random.uniform(-max_jitter, max_jitter,
                                         len(fragment_mz))

        mz = np.random.uniform(100, 1400, num_peaks)
        mz[:len(fragment_mz)] = fragment_mz
        intensity = np.random.lognormal(0, 1, num_peaks)

        precursor_mz = mass.calculate_mass(sequence=peptide, charge=charge)
        spec = spectrum.MsmsSpectrum('test_spectrum', precursor_mz, charge,
                                     mz, intensity, peptide=peptide)
        spec.annotate_peptide_fragments(fragment_tol_mass, fragment_tol_mode)
        assert np.count_nonzero(spec.annotation) == len(fragment_mz)
def compute_mass_spectrum(sequence, charge=1):
    """Return an array of b-ion m/z values for ``sequence``.

    The array has ``len(sequence) - 1`` slots; slot *i* holds the b-ion m/z
    (at ``charge``) of the *i*-th item produced by ``b_ionts(sequence)``.
    """
    n_slots = len(sequence) - 1
    spectrum = numpy.zeros(n_slots)
    for slot, fragment in enumerate(b_ionts(sequence)):
        fragment_mz = mass.calculate_mass(sequence=fragment, ion_type='b',
                                          charge=charge)
        spectrum[slot] = fragment_mz
    return spectrum
def main():
    """Compute m/z values for peptides and their splits, then check uniqueness.

    Command line: ``<input table> <ppm tolerance>``.  Every entry of the
    table's "Peptides" column is collected together with its prefix/suffix
    pairs split at positions 4..13.  For each distinct string the m/z at
    charges 2, 3 and 4 is stored under the key ``"<peptide>.<charge>"`` and
    the map is handed to ``find_resolveable_peptides``.
    """
    input_filename = sys.argv[1]
    ppm_tolerance = float(sys.argv[2])
    line_counts, table_data = ming_fileio_library.parse_table_with_headers(
        input_filename)

    candidates = []
    for row in range(line_counts):
        peptide = table_data["Peptides"][row]
        candidates.append(peptide)
        for offset in range(10):
            split_at = offset + 4
            # NOTE(review): for short peptides the suffix is an empty string
            # and is still added to the candidates.
            candidates += [peptide[:split_at], peptide[split_at:]]

    unique_peptides = list(set(candidates))

    peptide_mass_map = {}
    for peptide in unique_peptides:
        for charge in (2, 3, 4):
            peptide_key = peptide + "." + str(charge)
            peptide_mass_map[peptide_key] = mass.calculate_mass(
                sequence=peptide, ion_type='M', charge=charge)

    # Determine uniqueness
    find_resolveable_peptides(peptide_mass_map, ppm_tolerance)
def calculate_b_y_ion(sequence, ion_charge):
    """Return ``(b_ions, y_ions)`` m/z tuples for every cleavage of ``sequence``.

    The standard residue compositions are used except for 'C', which is
    replaced by C5 H8 N2 O2 S (presumably carbamidomethylated cysteine --
    TODO confirm).  b ions run from the shortest prefix to the longest; y
    ions are reversed so they run from the shortest suffix to the longest.
    """
    residue_comp = dict(mass.std_aa_comp)
    residue_comp['C'] = mass.Composition(
        {'H': 8, 'C': 5, 'S': 1, 'O': 2, 'N': 2})

    def ion_mz(fragment, kind):
        return mass.calculate_mass(fragment, ion_type=kind,
                                   charge=ion_charge, aa_comp=residue_comp)

    cut_sites = range(1, len(sequence))  # cut after residue 1 .. n-1
    b_ion = [ion_mz(sequence[:cut], 'b') for cut in cut_sites]
    y_ion = [ion_mz(sequence[cut:], 'y') for cut in cut_sites]
    # record from small to big
    return (tuple(b_ion), tuple(y_ion[::-1]))
def mass_diff(amino_acid, mass):
    """
    Return ``mass`` minus the calculated mass of the bare ``amino_acid``.

    >>> round(mass_diff("M", 147.04), 2)
    16.0
    """
    residue = Composition(parsed_sequence=[amino_acid])
    return mass - calculate_mass(composition=residue)
def main():
    """Compute m/z values for peptides and their splits, then check uniqueness.

    Command line: ``<input table> <ppm tolerance>``.  Every entry of the
    table's "Peptides" column is collected together with its prefix/suffix
    pairs split at positions 4..13.  For each distinct string the m/z at
    charges 2, 3 and 4 is stored under the key ``"<peptide>.<charge>"`` and
    the map is handed to ``find_resolveable_peptides``.
    """
    input_filename = sys.argv[1]
    ppm_tolerance = float(sys.argv[2])
    line_counts, table_data = ming_fileio_library.parse_table_with_headers(
        input_filename)

    candidates = []
    for row in range(line_counts):
        peptide = table_data["Peptides"][row]
        candidates.append(peptide)
        for offset in range(10):
            split_at = offset + 4
            # NOTE(review): for short peptides the suffix is an empty string
            # and is still added to the candidates.
            candidates += [peptide[:split_at], peptide[split_at:]]

    unique_peptides = list(set(candidates))

    peptide_mass_map = {}
    for peptide in unique_peptides:
        for charge in (2, 3, 4):
            peptide_key = peptide + "." + str(charge)
            peptide_mass_map[peptide_key] = mass.calculate_mass(
                sequence=peptide, ion_type='M', charge=charge)

    # Determine uniqueness
    find_resolveable_peptides(peptide_mass_map, ppm_tolerance)
def get_peptide_data(peptide):
    """Return a dict with the sequence, its parsed form and its mass.

    Keys: 'sequence' (the input), 'parsed_sequence' (``parser.parse`` output
    with termini kept, as needed for the mass calculation) and 'mass'.
    """
    parsed = parser.parse(peptide, show_unmodified_termini=True)
    return {
        'sequence': peptide,
        'parsed_sequence': parsed,
        'mass': mass.calculate_mass(parsed),
    }
def get_seq_mass(self, start=None, end=None, term="n", **kwds):
    """Return the mass of a slice of ``self.sequence`` including modifications.

    ``start``, ``end`` and ``term`` define the slice (via ``_Slice``).  Extra
    keyword arguments (average, ion_type, ...) are forwarded to
    ``calculate_mass`` and to the modification helpers; 'ion_comp' is always
    overridden with ``ION_COMP``.
    """
    seq_slice = _Slice(self.sequence, start, end, term)
    residues = self.sub_sequence(slice=seq_slice)
    residue_comp = Composition(list(residues))

    # Why is adding H2O needed, is there a more pyteomics way of doing this?
    kwds['ion_comp'] = ION_COMP

    total = calculate_mass(composition=residue_comp, **kwds)
    total = self.__add_res_mod_masses(total, seq_slice, term, **kwds)
    total = self.__add_term_mod_mass(total, seq_slice, term, **kwds)
    return total
def main():
    """Print m/z values at charges 2-4 for all sub-peptides of lengths 4..13.

    Command line: ``<input table>``.  For every entry of the table's
    "Peptides" column, ``find_all_substring_of_length`` is called for lengths
    4 through 13; each distinct substring is printed as
    ``<peptide>\\t<charge>\\t<m/z>`` for charges 2, 3 and 4.
    """
    input_filename = sys.argv[1]
    line_counts, table_data = ming_fileio_library.parse_table_with_headers(
        input_filename)

    all_sub_peptides = []
    for i in range(line_counts):
        peptide = table_data["Peptides"][i]
        for length in range(10):
            all_sub_peptides += find_all_substring_of_length(peptide,
                                                             length + 4)

    all_sub_peptides = list(set(all_sub_peptides))

    for peptide in all_sub_peptides:
        for charge in (2, 3, 4):
            mz = mass.calculate_mass(sequence=peptide, ion_type='M',
                                     charge=charge)
            # Single-argument print(...) is valid on Python 2 and 3; the
            # former print statement is a syntax error on Python 3.
            print(peptide + "\t" + str(charge) + "\t" + str(mz))
def _get_theoretical_peptide_fragments(peptide: str, types: str = 'by',
                                       max_charge: int = 1):
    """
    Get theoretical fragments for the given peptide.

    Parameters
    ----------
    peptide : str
        The peptide sequence for which the fragments will be generated.
    types : str, optional
        The fragment type. Can be any combination of 'a', 'b', 'c', 'x', 'y',
        and 'z' (the default is 'by', which means that b-ions and y-ions will
        be generated).
    max_charge : int, optional
        All fragments up to and including the given charge will be generated
        (the default is 1 to only generate singly-charged fragments).

    Returns
    -------
    A list of all fragments as (`FragmentAnnotation`, m/z) tuples sorted in
    ascending m/z order.
    """
    ions = []
    amino_acids = parser.parse(peptide)
    n_residues = len(amino_acids)
    for i in range(1, n_residues):
        prefix = ''.join(amino_acids[:i])
        suffix = ''.join(amino_acids[i:])
        for ion_type in types:
            for charge in range(1, max_charge + 1):
                if ion_type in 'abc':
                    # Types 'a', 'b', 'c': index counts residues in the prefix.
                    annotation = FragmentAnnotation(ion_type, i, charge)
                    fragment_seq = prefix
                else:
                    # Other types: index counts residues in the suffix. The
                    # parsed residue count is used, not len(peptide), so the
                    # index stays right when the string holds more
                    # characters than residues.
                    annotation = FragmentAnnotation(ion_type, n_residues - i,
                                                    charge)
                    fragment_seq = suffix
                ions.append((
                    annotation,
                    mass.calculate_mass(sequence=fragment_seq,
                                        ion_type=ion_type, charge=charge)))
    return sorted(ions, key=operator.itemgetter(1))
def getCIDFragmentIons(sequence, charge):
    """
    Generate CID fragments for a given peptide sequence and charge, and
    calculate their monoisotopic m/z values.

    All suffixes of the sequence are used as y fragments and all prefixes as
    b fragments (both include the full sequence); their m/z values are
    computed with the pyteomics package. For more information, please refer
    to: https://pythonhosted.org/pyteomics/mass.html

    Parameters
    ----------
    sequence : str
        The peptide which will be fragmented.
    charge: int
        The charge of the b and y ions that will be computed.

    Returns
    -------
    yFragmentMasses : ndarray
        A numpy array containing the monoisotopic m/z values of the y ion
        fragments.
    bFragmentMasses : ndarray
        A numpy array containing the monoisotopic m/z values of the b ion
        fragments.
    """
    # generate y and b fragment sequences in a list
    yFragments = [sequence[i:] for i in range(len(sequence))]
    bFragments = [sequence[:i + 1] for i in range(len(sequence))]

    # calculate masses for sequences in y/b-lists
    # The builtin `float` is used as dtype: the `np.float` alias was removed
    # in NumPy 1.24 and raises AttributeError there.
    yFragmentMasses = np.fromiter(
        (mass.calculate_mass(sequence=yIon, ion_type='y', charge=charge)
         for yIon in yFragments), float)
    bFragmentMasses = np.fromiter(
        (mass.calculate_mass(sequence=bIon, ion_type='b', charge=charge)
         for bIon in bFragments), float)
    return yFragmentMasses, bFragmentMasses
def __init__(self):
    """
    Constructor.

    Starts with an empty ``self.modifications`` list and builds
    ``self.my_mods``, a lookup from modification label to mass shift.
    Most shifts are computed from chemical formulas; '-dHA' and '+thio'
    are fixed integers.
    """
    calc = mass.calculate_mass

    def gain(formula):
        # Mass added by the given formula.
        return calc(formula=formula)

    def loss(formula):
        # Mass removed by the given formula.
        return -calc(formula=formula)

    self.modifications = []
    self.my_mods = {
        '': 0,
        '+BS3': gain('C8H10O2'),
        'BS3x2': gain('C16H20O4'),
        '-H2O': loss('H2O'),
        '-NH3': loss('NH3'),
        'S-S': loss('H2'),
        'S-Sx2': loss('H4'),
        'S-Sx3': loss('H6'),
        '-H20x2': loss('H4O2'),
        '-H20x3': loss('H6O3'),
        '-H2O-NH3': loss('H5ON'),
        '-dHA': -34,
        '+thio': +32,
    }
def _create_mgf_entry(peptide, charge=2):
    """Create a MassIVE-KB style MGF entry for a single PSM.

    b and y fragment m/z values are generated for every prefix/suffix of the
    peptide at charge states 1 .. ``charge - 1``; each is written as a peak
    with intensity 1.

    Parameters
    ----------
    peptide : str
        A peptide sequence.
    charge : int, optional
        The peptide charge state. Anything accepted by ``int()`` is allowed.

    Returns
    -------
    str
        The PSM entry in an MGF file format.
    """
    # Normalise once: previously only the precursor used int(charge) while
    # range() received the raw value and failed for e.g. "2" or 2.0.
    charge = int(charge)
    mz = calculate_mass(peptide, charge=charge)

    peaks = []
    for idx in range(len(peptide)):
        b_pep = peptide[:idx + 1]
        y_pep = peptide[idx:]
        for zstate in range(1, charge):
            b_mz = calculate_mass(b_pep, charge=zstate, ion_type="b")
            peaks.append(f"{b_mz} 1")
            y_mz = calculate_mass(y_pep, charge=zstate, ion_type="y")
            peaks.append(f"{y_mz} 1")

    # With no fragments (charge 1 or empty peptide) no peak line is written;
    # the old join-based code emitted a stray " 1" line in that case.
    mgf = [
        "BEGIN IONS",
        f"SEQ={peptide}",
        f"PEPMASS={mz}",
        f"CHARGE={charge}+",
        *peaks,
        "END IONS",
    ]
    return "\n".join(mgf)
def read_compounds(filename, separator="\t", calculate=True, lib_adducts=None,
                   filename_atoms=""):
    """Read a compound table and return one record per usable row.

    Parameters
    ----------
    filename : path of the delimited text file; it must provide the columns
        molecular_formula, compound_id and compound_name, plus exact_mass
        when ``calculate`` is false. Optional columns: retention_time or rt,
        and adduct.
    separator : column delimiter passed to ``read_csv``.
    calculate : when true, exact masses are computed from the formula using
        the bundled 'data/nist_database.txt'; otherwise the exact_mass column
        is used.
    lib_adducts : optional adduct library; when given together with
        ``calculate`` and an adduct column, the adduct's "mass" entry is
        added to the exact mass. ``None`` (default) or any falsy value
        disables this.
    filename_atoms : unused; kept for interface compatibility.

    Returns
    -------
    list of ``collections.OrderedDict`` records. Rows whose formula yields
    an empty composition are skipped with a warning.
    """
    import warnings

    if calculate:
        path_nist_database = os.path.join(
            os.path.dirname(os.path.abspath(__file__)), 'data',
            'nist_database.txt')
        nist_database = nist_database_to_pyteomics(path_nist_database)

    df = read_csv(filename, sep=separator, float_precision="round_trip")
    has_adduct = "adduct" in df.columns

    records = []
    for index, row in df.iterrows():
        formula = str(row.molecular_formula)
        comp = pyteomics_mass.Composition(formula)
        if not comp:
            # Actually emit the warning: the previous code only constructed
            # a Warning object and discarded it.
            warnings.warn("{} Skipped".format(row))
            continue

        record = collections.OrderedDict()
        record["composition"] = collections.OrderedDict(
            (k, comp[k]) for k in order_composition_by_hill(comp.keys()))
        sum_CHNOPS = sum(
            comp[e] for e in comp if e in ["C", "H", "N", "O", "P", "S"])
        # True when the formula contains nothing but C, H, N, O, P and S.
        record["CHNOPS"] = sum_CHNOPS == sum(comp.values())

        if calculate:
            record["exact_mass"] = round(
                pyteomics_mass.calculate_mass(formula=formula,
                                              mass_data=nist_database), 6)
        else:
            record["exact_mass"] = float(row.exact_mass)

        record["compound_id"] = row.compound_id
        record["compound_name"] = row.compound_name
        record["molecular_formula"] = composition_to_string(comp)

        if "retention_time" in df.columns:
            record["retention_time"] = row.retention_time
        elif "rt" in df.columns:
            record["retention_time"] = row.rt

        if has_adduct:
            record["adduct"] = row.adduct
            if lib_adducts and calculate:
                record["exact_mass"] += lib_adducts.lib[row.adduct]["mass"]

        records.append(record)
    return records
def in_silico_fragmentation(fn):
    """Build singly charged b/y product-ion lists for each peptide in a table.

    ``fn`` is read with ``pandas.read_table`` and must provide the columns
    'Precursor Charge', 'Base Peptide Sequence' and 'Peptide Sequence'.

    Returns a dict keyed by 'Peptide Sequence'.  Each value is a flat list
    ``[b1, y1, b2, y2, ...]`` of m/z values, or ``[0.0, 0.0]`` when
    ``calc_precursor_theoretical`` could not parse the sequence.  Only the
    first row seen for a given sequence is processed.
    """
    df = pandas.read_table(fn)
    products = {}
    for _, row in df.iterrows():
        xchg = row['Precursor Charge']
        bseq = row['Base Peptide Sequence']
        seq = row['Peptide Sequence']
        # `in` replaces dict.has_key, which no longer exists in Python 3.
        if seq in products:
            continue

        parseq, _theomass, _theomz = calc_precursor_theoretical(seq, int(xchg))
        if parseq is None:
            products[seq] = [0.0, 0.0]
            continue

        theoSpec = []
        # Only charge 1 fragments are generated.
        for c in (1,):
            for n in range(1, len(bseq)):
                # b: leading elements of the parsed sequence plus its last
                # element; y: 'H-' plus the remaining elements.
                bproduct = parseq[:n + 1] + [parseq[-1]]
                yproduct = ['H-'] + parseq[n + 1:]
                bp = mass.calculate_mass(parsed_sequence=bproduct,
                                         ion_type='b', aa_comp=composition,
                                         charge=c)
                yp = mass.calculate_mass(parsed_sequence=yproduct,
                                         ion_type='y', aa_comp=composition,
                                         charge=c)
                theoSpec.append(bp)
                theoSpec.append(yp)
        products[seq] = theoSpec
    return products
def getCIDFragmentIons(sequence, charge):
    """
    Generate CID fragments for a given peptide sequence and charge, and
    calculate their monoisotopic m/z values.

    All suffixes of the sequence are used as y fragments and all prefixes as
    b fragments (both include the full sequence); their m/z values are
    computed with the pyteomics package. For more information, please refer
    to: https://pythonhosted.org/pyteomics/mass.html

    Parameters
    ----------
    sequence : str
        The peptide which will be fragmented.
    charge: int
        The charge of the b and y ions that will be computed.

    Returns
    -------
    yFragmentMasses : ndarray
        A numpy array containing the monoisotopic m/z values of the y ion
        fragments.
    bFragmentMasses : ndarray
        A numpy array containing the monoisotopic m/z values of the b ion
        fragments.
    """
    # generate y and b fragment sequences in a list
    yFragments = [sequence[i:] for i in range(len(sequence))]
    bFragments = [sequence[:i + 1] for i in range(len(sequence))]

    # calculate masses for sequences in y/b-lists
    # The builtin `float` is used as dtype: the `np.float` alias was removed
    # in NumPy 1.24 and raises AttributeError there.
    yFragmentMasses = np.fromiter(
        (mass.calculate_mass(sequence=yIon, ion_type='y', charge=charge)
         for yIon in yFragments), float)
    bFragmentMasses = np.fromiter(
        (mass.calculate_mass(sequence=bIon, ion_type='b', charge=charge)
         for bIon in bFragments), float)
    return yFragmentMasses, bFragmentMasses
def createLinks(antiBase_map):
    """Build a list of link dicts from a formula-to-hits mapping.

    For each formula key, one link is produced per entry of element 1 of its
    value. A link has a "source" label (formula and its calculated mass), a
    "target" label (taken from elements 1, 3 and 0 of the value at the same
    position) and a "value" of ``1 / (element_4[j] * 10**7)``.
    """
    links_list = []
    for formula in antiBase_map.keys():
        columns = antiBase_map[formula]
        for j in range(0, len(columns[1])):
            source = ("Antibase Chem Formula: " + formula + " , "
                      + "Antibase MW: "
                      + str(mass.calculate_mass(formula=formula)))
            target = ("Scan/alignment num: " + str(columns[1][j]) + " , "
                      + "M/Z: " + str(columns[3][j]) + " , "
                      + "Adduct type: " + str(columns[0][j]))
            links_list.append({
                "source": source,
                "target": target,
                "value": 1 / (columns[4][j] * 10**7),
            })
    return links_list
def Model_1(train, test):
    '''
    Train a bagged random-forest classifier and save its test-set predictions.

    Class-1 probabilities are written to "Submission_1.csv" and hard labels
    to "Prediction_1.csv"; both files carry the test set's "ID" column.

    train : Training set (read through its 'Sequence' and ' Label' columns)
    test  : Test set (read through its ' Sequence' and 'ID' columns)
    '''
    # NOTE(review): the sequence column is addressed as 'Sequence' for train
    # but ' Sequence' (leading space) for test -- confirm against the inputs.

    def featurize(seq):
        # AAC and DPC features, then mass per residue, then electrochem.charge
        # called with the sequence length as its second argument, then the
        # isoelectric point.
        return (AAC(seq)
                + DPC(seq)
                + [mass.calculate_mass(sequence=seq) / len(seq)]
                + [electrochem.charge(seq, len(seq))]
                + [ProteinAnalysis(seq).isoelectric_point()])

    # Preprocessing
    X_train = [featurize(seq) for seq in train['Sequence']]
    X_test = [featurize(seq) for seq in test[' Sequence']]
    Y_train = train[' Label']

    # Training
    forest = RandomForestClassifier(random_state=2)
    clf = BaggingClassifier(base_estimator=forest, n_estimators=100,
                            random_state=2, n_jobs=-1)
    clf.fit(X_train, Y_train)

    # Predicting
    Y_prob = [row[1] for row in clf.predict_proba(X_test)]
    Y_pred = clf.predict(X_test)

    result = pd.DataFrame()
    result["ID"] = test["ID"]
    for column_values, out_name in ((Y_prob, "Submission_1.csv"),
                                    (Y_pred, "Prediction_1.csv")):
        result["Label"] = column_values
        result.to_csv(out_name, index=False)
def process_stack(stack, topN=None):
    """Turn one stack of text lines into transition rows.

    The first seven entries of ``stack`` are header lines, the rest are
    fragment lines of the form ``"<mz>\\t<intensity>"``. From the header,
    line 0 gives the sum formula, line 1 the charge and line 2 the retention
    time (each taken after ``": "``).

    One row is built per fragment; rows are sorted by descending intensity
    and, when ``topN`` is given, truncated to the first ``topN``.

    Uses and increments the module-level counters ``cnt`` and ``cnt_trgr``.
    Prints ``stack`` as a side effect.
    """
    assert len(stack) > 7
    print(stack)
    global cnt, cnt_trgr
    header = stack[:7]
    fragments = stack[7:]
    # Parsed but not used further in this function.
    nr_samples = stack[3].split(":")[1].split("/")[0]
    nr_isotopes = stack[5].split(":")[1]
    nr_spectra = stack[6].split(":")[1]
    sumformula = header[0].split(": ")[1]
    charge = int(header[1].split(": ")[1])
    rt = float(header[2].split(": ")[1])
    precursor_mz = abs(mass.calculate_mass(formula=sumformula, charge=charge))
    # NOTE(review): the line layout of this file was lost; `res = []` is
    # assumed to be live code rather than part of a comment -- confirm.
    res = []
    tmpres = []
    for f in fragments:
        mz, inten = f.split("\t")
        mz = float(mz)
        tmpres.append([
            precursor_mz,
            mz,
            rt,
            "%s_%s" % (cnt, sumformula),
            -1,
            inten,
            "%s_%s" % (cnt_trgr, sumformula),  # transition_group_id
            1,  # decoy
            "",  # sum-formula
            "",  # protein name
            sumformula,
            sumformula,
            charge,
            "light"]
        )
        # NOTE(review): assumed to advance once per fragment -- confirm.
        cnt += 1
    # NOTE(review): assumed to advance once per stack -- confirm.
    cnt_trgr += 1
    # Sort by intensity
    tmpres.sort(key=lambda x: float(x[5]),reverse = True)
    if topN:
        tmpres = tmpres[:topN]
    res.extend(tmpres)
    return res
def loadDB(self, peptideList, minlen=1, maxlen=100):
    '''Insert peptide fragments into the peptide_fragment table.

    Should take a list such as that generated by ms1pep.digestpeptidedb():
    each item is a dict with the keys "sequence", "start", "end" and
    "proteinID". The mass of every sequence is calculated, and items whose
    sequence length lies within [minlen, maxlen] are inserted.

    Items that are malformed or fail for any other reason are skipped with a
    warning. A final warning reports how many peptides were uploaded.
    '''
    sql = '''INSERT INTO peptide_fragment (protein_accession,fragment_database_id, fragment_sequence, fragment_start, fragment_end, fragment_mono_mass) values (%s,%s,%s,%s,%s,%s)'''
    required_keys = ("sequence", "start", "end", "proteinID")
    with database.ConnectMySQL(self.host, self.user, self.password,
                               self.database) as sqlCon:
        cursor = sqlCon.cursor
        peps = 0
        for p in peptideList:
            try:
                # Explicit check instead of `assert` (stripped under -O)
                # and of dict.has_key (removed in Python 3).
                missing = [k for k in required_keys if k not in p]
                if missing:
                    raise KeyError("missing keys: %s" % ", ".join(missing))
                mr = mass.calculate_mass(p['sequence'])
                if minlen <= len(p['sequence']) <= maxlen:
                    cursor.execute(sql, (p['proteinID'], self.dbid,
                                         p['sequence'], p['start'], p['end'],
                                         mr))
                    peps = peps + 1
            except Exception as e:
                # .get() keeps the handler from failing on the very keys
                # whose absence triggered the error.
                warnings.warn("error including sequence %s::%s : %s"
                              % (p.get('proteinID'), p.get('sequence'), e))
        warnings.warn("Uploaded %s peptides to database %s"
                      % (peps, self.dbtag))
def __init__(self, peptide, charge, mods_mass):
    """Set up the target m/z window and empty bookkeeping dicts.

    ``self.target`` is the calculated m/z of ``peptide`` at ``charge`` plus
    ``mods_mass / charge``. ``self.targetLL`` / ``self.targetHL`` are that
    value minus / plus ``options.ppm`` parts per million.
    """
    self.charge = charge
    self.peptide = peptide

    # m/z of the peptide at the given charge, plus the per-charge share of
    # the modification mass
    peptide_mz = mass.calculate_mass(sequence=self.peptide, ion_type='M',
                                     charge=charge)
    self.target = peptide_mz + float(mods_mass) / float(charge)

    # upper and lower m/z limits
    ppm_window = self.target / 1000000 * options.ppm
    self.targetLL = self.target - ppm_window
    self.targetHL = self.target + ppm_window

    self.targetIntensityDIct = {}
    self.targetScanCounter = {}
    return
def test_annotate_peaks_most_intense():
    """With 'most_intense' assignment only the stronger of two peaks is annotated.

    Two peaks are placed 0.01 below and above the first theoretical fragment
    m/z with intensities 10 and 20; only the second may carry an annotation.
    """
    tol_mass, tol_mode = 0.02, 'Da'
    peptide = 'YLYEIAR'
    charge = 2

    theoretical = spectrum._get_theoretical_peptide_fragments(peptide)
    fragment_mz = np.asarray([frag_mz for _, frag_mz in theoretical])
    first_mz = fragment_mz[0]

    mz = np.asarray([first_mz - 0.01, first_mz + 0.01])
    intensity = np.asarray([10, 20])
    precursor_mz = mass.calculate_mass(sequence=peptide, charge=charge)
    spec = spectrum.MsmsSpectrum('test_spectrum', precursor_mz, charge, mz,
                                 intensity, peptide=peptide)
    spec.annotate_peaks(tol_mass, tol_mode, peak_assignment='most_intense')

    assert spec.annotation[0] is None
    assert spec.annotation[1] is not None
def create_search_space_peptide_fragments(self, fragments, charges,
                                          modifications):
    """
    Tabulate the theoretical m/z of every fragment at every charge.

    Args:
        fragments: An array of Fragment objects (``sequence`` and ``ion``
            attributes are read; the first character of ``ion`` is the ion
            type)
        charges: The charges to consider
        modifications: The modifications to consider; currently not applied
            (the 'Modifications' column is always an empty string)

    Returns:
        A pandas dataframe with the columns 'Sequence', 'Ion', 'Charge',
        'Mass_Theor' and 'Modifications', one row per fragment/charge pair
    """
    # TODO implement modifications!
    columns = ['Sequence', 'Ion', 'Charge', 'Mass_Theor', 'Modifications']
    rows = [
        [
            fragment.sequence,
            fragment.ion,
            z,
            mass.calculate_mass(sequence=fragment.sequence,
                                ion_type=fragment.ion[0], charge=z),
            '',
        ]
        for fragment in fragments
        for z in charges
    ]
    return pd.DataFrame(rows, columns=columns)
def get_frag_mz(one_hot, ion_position, ion_type, ion_charge):
    """Return the m/z of one b or y fragment of a one-hot encoded peptide.

    Parameters
    ----------
    one_hot : encoded peptide, decoded with ``reverse_one_hot_encode`` and
        ``amino_acid_modified_codes``
    ion_position : number of residues taken from the start ('b') or from the
        end ('y') of the decoded sequence
    ion_type : 'b' or 'y'
    ion_charge : fragment charge (converted with ``int``)

    Raises
    ------
    ValueError
        If ``ion_type`` is neither 'b' nor 'y'.
    """
    # Fail early and clearly: an unsupported ion type used to surface as an
    # UnboundLocalError further down.
    if ion_type not in ('b', 'y'):
        raise ValueError(
            "unsupported ion_type %r (expected 'b' or 'y')" % (ion_type,))

    pep_seq = reverse_one_hot_encode(one_hot, amino_acid_modified_codes)
    if ion_type == 'b':
        ion_seq = pep_seq[:ion_position]
    else:
        ion_seq = pep_seq[-ion_position:]

    # TODO: Figure out how to use aacomp
    # Apply every substitution listed in ``dumb_reversal`` before the mass
    # calculation.
    for key, value in dumb_reversal.items():
        ion_seq = ion_seq.replace(key, value)

    mz = mass.calculate_mass(sequence=ion_seq, ion_type=ion_type,
                             charge=int(ion_charge), aa_comp=aa_comp)
    return mz
def test_annotate_peaks_nearest_mz():
    """With 'nearest_mz' assignment only the closer of two peaks is annotated.

    Two peaks are placed 0.005 below and 0.015 above the first theoretical
    fragment m/z; the first must be annotated as b1 (charge 1) and the
    second must stay unannotated.
    """
    tol_mass, tol_mode = 0.02, 'Da'
    peptide = 'YLYEIAR'
    charge = 2

    theoretical = spectrum._get_theoretical_peptide_fragments(peptide)
    fragment_mz = np.asarray([fragment.calc_mz for fragment in theoretical])
    first_mz = fragment_mz[0]

    mz = np.asarray([first_mz - 0.005, first_mz + 0.015])
    intensity = np.asarray([10, 20])
    precursor_mz = mass.calculate_mass(sequence=peptide, charge=charge)
    spec = spectrum.MsmsSpectrum('test_spectrum', precursor_mz, charge, mz,
                                 intensity, peptide=peptide)
    spec.annotate_peaks(tol_mass, tol_mode, peak_assignment='nearest_mz')

    expected = spectrum.FragmentAnnotation('b', 1, 1, fragment_mz[0])
    assert spec.annotation[0] == expected
    assert spec.annotation[1] is None
def listmz(peptide, charges=(2, 3, 4), fixedmods=None, modifications=()):
    '''
    Calculates the mz values for a given peptide with modifications for each
    of the charges listed in charges.

    Default is to calculate 2+, 3+ and 4+.

    peptide       : peptide sequence
    charges       : iterable of charge states
    fixedmods     : optional dict mapping a residue letter to a mass that is
                    added once for every occurrence of that residue
    modifications : iterable of strings "<position> <label> (<residues>)";
                    the label's Unimod 'delta_mono_mass' is added when the
                    residue at the 1-based position is one of <residues>.
                    Strings that do not match this pattern are ignored.

    Returns a list with one value per charge, computed as
    hydrogen mass + (modified mass / charge).

    listmz(peptide, charges=[2,3,4], fixedmods={"C": 56.0987},
           modifications=['3 Phospho (STY)'])
    '''
    # Immutable defaults replace the former shared list/dict defaults.
    if fixedmods is None:
        fixedmods = {}

    hmass = float(Unimod.unimod.database.get_element('H')['mono_mass'])
    mz = float(mass.calculate_mass(peptide))

    # Compiled once instead of on every loop pass.
    mod_pattern = re.compile(r'(\d+) +(.*) +\(([^\)]*)\) *$')
    for p in modifications:
        m = mod_pattern.match(p)
        if m:
            pos = int(m.group(1))
            label = m.group(2)
            aa = m.group(3)
            if peptide[pos - 1] in aa:
                mz = mz + float(
                    Unimod.unimod.database.get_label(label)['delta_mono_mass'])

    for k in fixedmods.keys():
        for a in peptide:
            if a == k:
                mz = mz + float(fixedmods[k])

    return [hmass + (mz / c) for c in charges]
sys.exit() inputfile01 = open(input_file, "r") # outputfile1 = open(output_file,'w') from pyteomics import parser from pyteomics import mass # gene_list = ['SAA1'] # gene_list = open(gene_list,'r') counter = 0 errcounter = 0 pepinput = "MALTSEYWIILR" ps0 = parser.parse(pepinput, show_unmodified_termini=True) referencemass = mass.calculate_mass(parsed_sequence=ps0) mass_tolerance = 7 # unit: ppm targetmass = 1422.730378 total_pep_list = [] for num, x in enumerate(SeqIO.parse(inputfile01, "fasta")): if num % 10000 == 0: print num # if num > 5000: # break pro = str(x.seq) peplist = digest(pro, enzyme, missed_cleavage, min_pep_length, max_pep_length) if len(peplist) > 0: for p in peplist: total_pep_list.append(p) sort_list = list(set(total_pep_list)) for num1, pep in enumerate(sort_list):