def Model_2(train, test):
    '''
    Fits a bagged XGBoost classifier on per-sequence descriptors and saves
    the test-set outputs in two CSV files

    train : Training set (its 'Sequence' and ' Label' columns are read)
    test : Test set (its 'ID' and ' Sequence' columns are read)

    "Submission_2.csv" holds the second column of predict_proba per test row,
    "Prediction_2.csv" holds the output of predict; both carry the test IDs.
    '''

    def describe(seq):
        # Descriptor vector: AAC(seq) followed by mass per residue, the
        # electrochem.charge value and the isoelectric point.
        n_res = len(seq)
        # NOTE(review): the sequence length is what gets passed as the second
        # argument of electrochem.charge - confirm that this is intended.
        return (AAC(seq)
                + [mass.calculate_mass(sequence=seq) / n_res]
                + [electrochem.charge(seq, n_res)]
                + [ProteinAnalysis(seq).isoelectric_point()])

    # Preprocessing
    train_rows = [describe(seq) for seq in train['Sequence']]
    test_rows = [describe(seq) for seq in test[' Sequence']]
    train_labels = train[' Label']

    feats_train = np.array(train_rows)
    labels_train = np.array(train_labels)
    feats_test = np.array(test_rows)
    # Features and labels are shuffled together with a fixed seed.
    feats_train, labels_train = shuffle(feats_train, labels_train,
                                        random_state=3)

    # Training
    booster_params = {
        'max_depth': 25,
        'objective': 'reg:logistic',
        'n_estimators': 100,
        'booster': 'gbtree',
        'colsample_bylevel': 0.7,
        'colsample_bytree': 1,
        'n_thread': 2,
    }
    booster = XGBClassifier(**booster_params, random_state=3)
    ensemble = BaggingClassifier(base_estimator=booster, n_estimators=23,
                                 random_state=3, n_jobs=-1)
    ensemble.fit(feats_train, labels_train)

    # Predicting
    probabilities = [row[1] for row in ensemble.predict_proba(feats_test)]
    predictions = ensemble.predict(feats_test)

    # The same frame is written twice, swapping the "Label" column in between.
    output = pd.DataFrame()
    output["ID"] = test["ID"]
    output["Label"] = probabilities
    output.to_csv("Submission_2.csv", index=False)
    output["Label"] = predictions
    output.to_csv("Prediction_2.csv", index=False)
def test_charge_calculations_dict(self):
    """Exercise charge() with composition dicts (label -> count).

    Covers inputs for which PyteomicsError is expected and inputs whose
    charge at pH 14 must be within 0.01 of -1.
    """
    # A composition given together with a pK_nterm table must be rejected.
    with self.assertRaises(PyteomicsError):
        charge({'H-': 1, '-OH': 1, 'E': 1}, 7,
               pK_nterm={'H-': {'A': [(9., 1)]}})

    # Default pK values: three alanines plus both terminal groups.
    self.assertLess(
        abs(charge({'A': 3, 'H-': 1, '-OH': 1}, 14.0) + 1.0), 0.01)

    # Custom pK tables with 'ntermB' / 'ctermA' entries in the composition.
    accepted = charge(
        {'A': 1, 'H-': 1, '-OH': 1, 'ntermB': 1, 'ctermA': 1},
        14.0,
        pK={'H-': [(9., 1)], '-OH': [(8., -1)]},
        pK_nterm={'H-': {'A': [(3., 1)], 'B': [(3., 1)]}})
    self.assertLess(abs(accepted + 1.0), 0.01)

    # Each of these compositions must raise with the same pK / pK_nterm
    # tables; the tables are rebuilt for every call.
    rejected_compositions = (
        {'A': 1, 'H-': 1, '-OH': 1, 'ctermA': 1},
        {'A': 1, 'H-': 1, '-OH': 1, 'ntermA': 1},
        {'A': 1, 'H-': 1, '-OH': 1, 'ntermA': 2, 'ctermA': 1},
        {'A': 1, 'H-': 1, 'ntermA': 1, 'ctermA': 1},
    )
    for composition in rejected_compositions:
        with self.assertRaises(PyteomicsError):
            charge(composition, 14.0,
                   pK={'H-': [(9., 1)], '-OH': [(8., -1)]},
                   pK_nterm={'H-': {'A': [(3., 1)]}})
def test_charge_input(self):
    """Check that charge() agrees across the str, list and dict input forms.

    The peptide 'H-ACDEFGH-OH' is evaluated at every integer pH from 0 to 13
    and the string result is compared with assertAlmostEqual, first against
    the list form and then against the dict form.
    """
    as_string = 'H-ACDEFGH-OH'
    residues = ('H-', 'A', 'C', 'D', 'E', 'F', 'G', 'H', '-OH')

    # String form versus list of labels (a fresh list for every call).
    for ph in range(14):
        self.assertAlmostEqual(charge(as_string, ph),
                               charge(list(residues), ph))

    # String form versus composition dict with every label counted once.
    for ph in range(14):
        self.assertAlmostEqual(charge(as_string, ph),
                               charge(dict.fromkeys(residues, 1), ph))
def test_charge_calculations_list(self):
    """Exercise charge() with sequences given as lists of labels.

    A list without terminal groups combined with custom pK tables must raise
    PyteomicsError; 'H-AAA-OH' as a list must give a charge within 0.01 of
    +1, -1 and 0 at the three pH values checked below.
    """
    with self.assertRaises(PyteomicsError):
        charge(['A', 'A', 'A'], 5.0,
               pK={'H-': [(9., 1)], '-OH': [(8., -1)]},
               pK_nterm={'H-': {'A': [(3., 1)]}})

    # (pH, expected charge) pairs; the last pH is the mean of 2.34 and 9.69.
    expectations = (
        (0.0, 1.0),
        (14.0, -1.0),
        ((2.34 + 9.69) / 2.0, 0.0),
    )
    for ph, expected in expectations:
        observed = charge(['H-', 'A', 'A', 'A', '-OH'], ph)
        self.assertLess(abs(observed - expected), 0.01)
def test_charge_calculations_str(self):
    """Exercise charge() with sequences given as plain strings.

    'AAA' with custom pK / pK_nterm tables must be within 0.01 of zero at
    pH 5; 'H-AAA-OH' must be within 0.01 of +1, -1 and 0 at the three pH
    values checked below.
    """
    custom = charge('AAA', 5.0,
                    pK={'H-': [(9., 1)], '-OH': [(8., -1)]},
                    pK_nterm={'H-': {'A': [(3., 1)]}})
    self.assertLess(abs(custom), 0.01)

    # (pH, expected charge) pairs; the last pH is the mean of 2.34 and 9.69.
    expectations = (
        (0.0, 1.0),
        (14.0, -1.0),
        ((2.34 + 9.69) / 2.0, 0.0),
    )
    for ph, expected in expectations:
        self.assertLess(abs(charge('H-AAA-OH', ph) - expected), 0.01)
def get_theor_spectrum(peptide, acc_frag, types=('b', 'y'), maxcharge=None, **kwargs):
    """
    Builds the theoretical fragment spectrum of a peptide in two forms:
    plain m/z values and integer bins (m/z divided by `acc_frag`, cast to int).

    `peptide` - peptide sequence.
    `acc_frag` - accuracy of matching; used as the bin width.
    `types` - ion types. A type whose first character is one of 'abc' is
        built from `peptide[:-1]`, any other type from `peptide[1:]`.
    `maxcharge` - maximum charge. When falsy it is set to
        1 + int(ec.charge(peptide, pH=2)).
    `kwargs` - passed on to cmass.fast_mass.
    ----------
    Returns a tuple (peaks, bins): `peaks` maps (ion_type, charge) to a sorted
    float array of m/z values, `bins` maps ion_type to a set of integer bins
    collected over all charges.
    """
    n_frag = len(peptide) - 1
    if not maxcharge:
        maxcharge = 1 + int(ec.charge(peptide, pH=2))

    spectra = {}
    binned = defaultdict(set)

    for z in range(1, maxcharge + 1):
        for series in types:
            if series[0] in 'abc':
                # Start from the longest fragment and strip one residue at a
                # time from its end.
                longest = peptide[:-1]
                top = cmass.fast_mass(longest, ion_type=series, charge=z,
                                      **kwargs)
                mz = np.zeros((n_frag, ), dtype=float)
                mz[0] = top
                for k in range(1, n_frag):
                    # marked "recalculate" in the original code
                    mz[k] = mz[k - 1] - mass.fast_mass2([longest[-k]]) / z
            else:
                # Same idea for the other series: the longest fragment goes
                # in the last slot and the array is filled backwards.
                longest = peptide[1:]
                top = cmass.fast_mass(longest, ion_type=series, charge=z,
                                      **kwargs)
                mz = np.zeros((n_frag, ), dtype=float)
                mz[n_frag - 1] = top
                for k in range(n_frag - 2, -1, -1):
                    # marked "recalculate" in the original code
                    mz[k] = mz[k + 1] - mass.fast_mass2(
                        [longest[-(k + 2)]]) / z

            binned[series].update((mz / acc_frag).astype(int))
            mz.sort()
            spectra[series, z] = mz

    return spectra, binned
def Model_1(train, test):
    '''
    Fits a bagged random-forest classifier on per-sequence descriptors and
    saves the test-set outputs in two CSV files

    train : Training set (its 'Sequence' and ' Label' columns are read)
    test : Test set (its 'ID' and ' Sequence' columns are read)

    "Submission_1.csv" holds the second column of predict_proba per test row,
    "Prediction_1.csv" holds the output of predict; both carry the test IDs.
    '''

    def describe(seq):
        # Descriptor vector: AAC(seq) and DPC(seq) followed by mass per
        # residue, the electrochem.charge value and the isoelectric point.
        n_res = len(seq)
        # NOTE(review): the sequence length is what gets passed as the second
        # argument of electrochem.charge - confirm that this is intended.
        return (AAC(seq)
                + DPC(seq)
                + [mass.calculate_mass(sequence=seq) / n_res]
                + [electrochem.charge(seq, n_res)]
                + [ProteinAnalysis(seq).isoelectric_point()])

    # Preprocessing
    feats_train = [describe(seq) for seq in train['Sequence']]
    feats_test = [describe(seq) for seq in test[' Sequence']]
    labels_train = train[' Label']

    # Training
    forest = RandomForestClassifier(random_state=2)
    ensemble = BaggingClassifier(base_estimator=forest, n_estimators=100,
                                 random_state=2, n_jobs=-1)
    ensemble.fit(feats_train, labels_train)

    # Predicting
    probabilities = [row[1] for row in ensemble.predict_proba(feats_test)]
    predictions = ensemble.predict(feats_test)

    # The same frame is written twice, swapping the "Label" column in between.
    output = pd.DataFrame()
    output["ID"] = test["ID"]
    output["Label"] = probabilities
    output.to_csv("Submission_1.csv", index=False)
    output["Label"] = predictions
    output.to_csv("Prediction_1.csv", index=False)
# One dict per unique peptide; the loops below add computed properties to it.
peptides = [{'sequence': i} for i in unique_peptides]

# print is called with a single parenthesised argument throughout so that the
# script emits the same text under Python 2 and is valid syntax in Python 3.
print('Parsing peptide sequences...')
for peptide in peptides:
    peptide['parsed_sequence'] = parser.parse(
        peptide['sequence'], show_unmodified_termini=True)
    peptide['length'] = parser.length(peptide['parsed_sequence'])
print('Done!')

# Only peptides of at most 100 residues are kept for the calculations below.
peptides = [peptide for peptide in peptides if peptide['length'] <= 100]

print('Calculating the mass, charge and m/z...')
for peptide in peptides:
    # Charge is evaluated at pH 2.0 and rounded to the nearest integer.
    peptide['charge'] = int(
        round(electrochem.charge(peptide['parsed_sequence'], pH=2.0)))
    peptide['mass'] = mass.calculate_mass(peptide['parsed_sequence'])
    # m/z reuses the rounded charge computed above.
    peptide['m/z'] = mass.calculate_mass(peptide['parsed_sequence'],
                                         charge=peptide['charge'])
print('Done!')

print('Calculating the retention time...')
for peptide in peptides:
    # Two retention times per peptide, one for each set of retention
    # coefficients.
    peptide['RT_RP'] = achrom.calculate_RT(peptide['parsed_sequence'],
                                           achrom.RCs_zubarev)
    peptide['RT_normal'] = achrom.calculate_RT(peptide['parsed_sequence'],
                                               achrom.RCs_yoshida_lc)
print('Done!')

# Histogram of m/z: 2000 bins over the range 0-4000.
plt.figure()
plt.hist([peptide['m/z'] for peptide in peptides],
         bins=2000, range=(0, 4000))
# Copy of the standard amino-acid composition table, extended with the extra
# labels 'Ac-', 'cam' and 'ox' so that mass.calculate_mass can resolve them.
aa_comp = dict(mass.std_aa_comp)
aa_comp['Ac-'] = mass.Composition({'C': 2, 'H': 3, 'N': 0, 'O': 1, 'P': 0})
aa_comp['cam'] = mass.Composition({'C': 2, 'H': 3, 'N': 1, 'O': 1, 'P': 0})
aa_comp['ox'] = mass.Composition({'O':1})

# Calculate peptide isoelectric points, masses, and charge at pH = 7.
# Note that the isoelectric point and charge are not used from this point on;
# they were used for examining other predictive components of apparent
# cofragmentation bias.
print('Calculating peptide physicochemical properties...')
iso_electric_points = []
pep_charges = []
pep_mass = []
# NOTE(review): i is incremented in the loop but not read in the visible code.
i = 0
for peptide in mod_pep:
    peptide_isoelectric_point = electrochem.pI(peptide)
    peptide_charge = electrochem.charge(peptide, 7)
    # The extended table is needed here for the non-standard labels.
    peptide_mass = mass.calculate_mass(sequence = peptide, aa_comp = aa_comp)
    pep_charges.append(peptide_charge)
    iso_electric_points.append(peptide_isoelectric_point)
    pep_mass.append(peptide_mass)
    i += 1

print('LC-retention time prediction with the following parameters:')
print(lc_params)

# Column length: taken from the first row of the parameter table and required
# to be numeric.
column_length = lc_params['column_length'][0]
if isinstance(column_length, numbers.Number) != True:
    raise NameError('Error in parameter input file, column_length takes only Numeric.')
import pylab
import numpy as np
from pyteomics import electrochem

# pH axis: 1 up to (but not including) 14 in steps of 0.5.
ph_grid = np.arange(1, 14, 0.5)
# The whole array of pH values is handed to electrochem.charge in one call.
peptide_charges = electrochem.charge('PEPTIDE', ph_grid)

# Draw the charge curve, label it and show the figure.
pylab.figure()
pylab.plot(ph_grid, peptide_charges)
pylab.title("Charge of peptide 'PEPTIDE' vs pH")
pylab.xlabel('pH')
pylab.ylabel('Charge')
pylab.show()
The hyperparameters of the SVM model were tuned using GridSearchCV.
The model was tested by using 5-fold cross validation.
'''

import pandas as pd
from sklearn import svm
from sklearn.metrics import roc_auc_score,accuracy_score,confusion_matrix,make_scorer
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold
from Bio.SeqUtils.ProtParam import ProteinAnalysis
from pyteomics import electrochem,mass,parser

# Training Set Data
Train = pd.read_csv("train.csv")
Train_AAC = pd.read_csv("AAC_train.csv")
Train_DP2 = pd.read_csv("DP2F_train.csv")
# Per-sequence descriptors: electrochem.charge value, mass per residue and
# isoelectric point.
# NOTE(review): the sequence length is what gets passed as the second argument
# of electrochem.charge - confirm that this is intended.
Train_C = [electrochem.charge(x,len(x)) for x in Train["Sequence"]]
Train_M = [mass.calculate_mass(sequence=x)/len(x) for x in Train["Sequence"]]
Train_PI = [ProteinAnalysis(x).isoelectric_point() for x in Train["Sequence"]]

# Test Set Data (same descriptors as for the training set)
Test = pd.read_csv("test.csv")
Test_AAC = pd.read_csv("AAC_test.csv")
Test_DP2 = pd.read_csv("DP2F_test.csv")
Test_C = [electrochem.charge(x,len(x)) for x in Test["Sequence"]]
Test_M = [mass.calculate_mass(sequence=x)/len(x) for x in Test["Sequence"]]
Test_PI = [ProteinAnalysis(x).isoelectric_point() for x in Test["Sequence"]]

# NOTE(review): the column name is spelled "Lable"; presumably this matches
# the header in train.csv - verify before changing it.
Labels = Train["Lable"]

# Assembling Parameters Of Training set
def _lc_numeric_param(lc_params, name, terminator='.'):
    """Return the first-row value of column ``name`` after a numeric check.

    Raises NameError when the value is not a ``numbers.Number``; this is the
    exception type the entry point already raises for parameter-file
    problems. ``terminator`` exists only so that every message can be
    reproduced exactly as it was written before.
    """
    value = lc_params[name][0]
    if not isinstance(value, numbers.Number):
        raise NameError('Error in parameter input file, ' + name +
                        ' takes only Numeric' + terminator)
    return value


def _lc_has_unusable_residue(peptide):
    """Tell whether ``peptide`` contains '*', 'X' or 'U' and must be dropped."""
    return any(symbol in peptide for symbol in ('*', 'X', 'U'))


def _lc_digest_sequence_line(line, code_format):
    """Translate one FASTA sequence line if needed and digest it with trypsin.

    Note that code_format == 'genes' and code_format == 'rna' are not
    functional yet according to the original author; they are kept as
    written, apart from the ``replace`` typo fix.

    Raises NameError for a ``code_format`` other than 'genes', 'rna' or
    'aas'. Before, this surfaced as an UnboundLocalError (a NameError
    subclass) without any message.
    """
    if code_format == 'genes':
        protein = Codon_to_Aminoacid(line)
    elif code_format == 'rna':
        # Was ``line.relace``, which raised AttributeError on every call.
        protein = Codon_to_Aminoacid(line.replace('U', 'T'))
    elif code_format == 'aas':
        protein = line
    else:
        raise NameError(
            "Error in parameter LC input file, code_format must be "
            "'genes', 'rna' or 'aas'.")
    return list(
        pyteomics.parser.cleave(str(protein),
                                pyteomics.parser.expasy_rules['trypsin']))


def _lc_read_tryptic_peptides(fasta_file_name, code_format):
    """Read a FASTA file and return (peptides, contigs) as parallel lists.

    Each sequence line is digested on its own; peptides shorter than 5
    characters are skipped. ``contigs[k]`` is the text of the most recent
    header line (without '>') seen before ``peptides[k]``, or None when no
    header came first.
    """
    peptides = []
    contigs = []
    last_seq = None
    # The context manager closes the file even when digestion raises.
    with open(fasta_file_name, 'r') as fasta_in:
        for line in fasta_in:
            line = line.strip()
            if not line:
                continue
            if line[0] == ">":
                last_seq = line[1:]
                continue
            for tryp_pep in _lc_digest_sequence_line(line, code_format):
                if len(tryp_pep) < 5:
                    continue
                peptides.append(tryp_pep)
                contigs.append(last_seq)
    return peptides, contigs


def peptide_mod_biolccc_rt_prediction(lc_params_file, fasta_file_name,
                                      custom_gradient, output_name):
    """Digest a FASTA file with trypsin and predict retention times with BioLCCC.

    Parameters
    ----------
    lc_params_file : path of a CSV file whose columns are exactly the
        required LC parameters listed below; the first row is used.
    fasta_file_name : path of the FASTA file to digest.
    custom_gradient : path of a CSV gradient file. It is only read when the
        'linear' parameter is falsy; each column supplies one gradient point
        taken from its first two rows.
    output_name : prefix of the output file.

    Writes ``output_name + '_lc-retention-times.csv'`` with the columns
    peptide_sequence, rts, iso_point, charge, mass and contig. Returns None.

    Raises
    ------
    NameError
        If the parameter columns do not match the required set, if a numeric
        parameter is not a number, if 'code_format' is not one of 'genes',
        'rna' or 'aas' (raised at the first sequence line), or if 'model' is
        not 'TFA' or 'FA'.
    """
    lc_params = pd.read_csv(lc_params_file)
    all_required_params = [
        'column_length', 'column_diameter', 'column_pore_size',
        'second_solvent_concentration_a', 'second_solvent_concentration_b',
        'gradient_0', 'gradient_1', 'gradient_2', 'flow_rate', 'code_format',
        'linear', 'model'
    ]
    # Check if all parameters are in the parameter input file.
    if sorted(all_required_params) != sorted(lc_params.keys()):
        raise NameError(
            'Error in parameter LC input file, check for typos or missing parameter.'
        )

    # Whether the fasta file holds genes, rna or amino acids.
    code_format = lc_params['code_format'][0]
    linear_gradient = lc_params['linear'][0]
    # If not a linear gradient, a gradient file must be supplied.
    gradient_file = None
    if not linear_gradient:
        gradient_file = pd.read_csv(custom_gradient)

    # Which type of model to use for prediction (from TFA or FA).
    model_type = lc_params['model'][0]
    if model_type == 'FA':
        print('formic acid')
    elif model_type == 'TFA':
        print('tri')

    seq_vec, contig_vec = _lc_read_tryptic_peptides(fasta_file_name,
                                                    code_format)

    print('Removing xs and *s from seqs...')
    # Add the C-terminal group label, then drop peptides with a stop ('*'),
    # an unknown residue ('X') or selenocysteine ('U'). Peptide and contig
    # are filtered as a pair so the two columns cannot drift apart.
    seq_vec_terms = [central_pep + '-OH' for central_pep in seq_vec]
    kept = [(pep, contig)
            for pep, contig in zip(seq_vec_terms, contig_vec)
            if not _lc_has_unusable_residue(pep)]
    # Changing B to asparagine and Z to glutamine.
    z_removed_peps = [pep.replace('B', 'N').replace('Z', 'Q')
                      for pep, _ in kept]
    contig_vec_no_x = [contig for _, contig in kept]

    # Modifying peptides: oxidation of methionine, carbamidomethylation of
    # cysteine (acetylation of the N terminal was done upstream).
    print('Modifying peptides...')
    mod_pep = []
    for tryp_pep in z_removed_peps:
        mod_pep.extend(
            pyteomics.parser.isoforms(tryp_pep,
                                      fixed_mods={
                                          'ox': ['M'],
                                          'cam': ['C']
                                      },
                                      show_unmodified_termini=True))

    # Modified amino acid dictionary for mass calculation.
    aa_comp = dict(mass.std_aa_comp)
    aa_comp['Ac-'] = mass.Composition({'C': 2, 'H': 3, 'N': 0, 'O': 1, 'P': 0})
    aa_comp['cam'] = mass.Composition({'C': 2, 'H': 3, 'N': 1, 'O': 1, 'P': 0})
    aa_comp['ox'] = mass.Composition({'O': 1})

    # Isoelectric point, charge at pH = 7 and mass of every modified peptide.
    # The isoelectric point and charge are only reported, not used further.
    print('Calculating peptide physicochemical properties...')
    iso_electric_points = []
    pep_charges = []
    pep_mass = []
    for peptide in mod_pep:
        iso_electric_points.append(electrochem.pI(peptide))
        pep_charges.append(electrochem.charge(peptide, 7))
        pep_mass.append(
            mass.calculate_mass(sequence=peptide, aa_comp=aa_comp))

    print('LC-retention time prediction with the following parameters:')
    print(lc_params)

    # Numeric parameters, validated in the same order as before.
    column_length = _lc_numeric_param(lc_params, 'column_length')
    column_diameter = _lc_numeric_param(lc_params, 'column_diameter')
    column_pore_size = _lc_numeric_param(lc_params, 'column_pore_size')
    second_solvent_concentration_a = _lc_numeric_param(
        lc_params, 'second_solvent_concentration_a')
    second_solvent_concentration_b = _lc_numeric_param(
        lc_params, 'second_solvent_concentration_b')
    gradient_0 = _lc_numeric_param(lc_params, 'gradient_0')
    gradient_1 = _lc_numeric_param(lc_params, 'gradient_1')
    gradient_2 = _lc_numeric_param(lc_params, 'gradient_2')
    # This one message never had a trailing period; kept verbatim.
    flow_rate = _lc_numeric_param(lc_params, 'flow_rate', terminator='')

    # BioLCCC chromatographic conditions.
    myChromoConditions = biolccc.ChromoConditions()
    # The column length in mm.
    myChromoConditions.setColumnLength(column_length)
    # The internal column diameter in mm.
    myChromoConditions.setColumnDiameter(column_diameter)
    # The average pore size in A.
    myChromoConditions.setColumnPoreSize(column_pore_size)
    # The concentration of the eluting solvent (ACN for the reversed
    # phase) in component A in %.
    myChromoConditions.setSecondSolventConcentrationA(
        second_solvent_concentration_a)
    # The concentration of the eluting solvent (ACN for the reversed
    # phase) in component B in %.
    myChromoConditions.setSecondSolventConcentrationB(
        second_solvent_concentration_b)
    # The shape of the gradient: either linear from gradient_0% to
    # gradient_1% of component B over gradient_2 minutes, or the set points
    # of the custom gradient file (one point per column, built from the
    # values in its first two rows).
    if linear_gradient:
        myChromoConditions.setGradient(
            biolccc.Gradient(gradient_0, gradient_1, gradient_2))
    else:
        myGradient = biolccc.Gradient()
        for set_point in range(len(gradient_file.columns)):
            myGradient.addPoint(gradient_file.iloc[0, set_point],
                                gradient_file.iloc[1, set_point])
        myChromoConditions.setGradient(myGradient)
    # The flow rate in ml/min.
    myChromoConditions.setFlowRate(flow_rate)

    print('Calculating retention times...')
    # Designating the BioLCCC model to use. An unknown model is reported
    # explicitly instead of failing later on an unbound local name.
    if model_type == 'TFA':
        model_to_use = biolccc.rpAcnTfaChain
    elif model_type == 'FA':
        model_to_use = biolccc.rpAcnFaRod
    else:
        raise NameError(
            "Error in parameter LC input file, model must be 'TFA' or 'FA'.")

    peptide_rts = [
        biolccc.calculateRT(tryp_pep, model_to_use, myChromoConditions)
        for tryp_pep in mod_pep
    ]

    # Combining the sequences, times, and physicochemical characteristics.
    peptide_dataframe = pd.concat([
        pd.Series(z_removed_peps, name='peptide_sequence'),
        pd.Series(peptide_rts, name='rts'),
        pd.Series(iso_electric_points, name='iso_point'),
        pd.Series(pep_charges, name='charge'),
        pd.Series(pep_mass, name='mass'),
        pd.Series(contig_vec_no_x, name='contig'),
    ], axis=1)

    file_name = output_name + '_lc-retention-times.csv'
    peptide_dataframe.to_csv(file_name)
def openms_modelled_rt(rtfilename, output_name):
    """Attach physicochemical properties to peptides with modelled retention times.

    Parameters
    ----------
    rtfilename : path of a text file read with ``pd.read_csv`` as one column;
        every row is split once on the first space into peptide sequence and
        retention time.
    output_name : prefix of the output file.

    Peptides containing '*', 'X' or 'U' are dropped, B is replaced by N and
    Z by Q, and the fixed modifications 'ox' (on M) and 'cam' (on C) are
    applied before the isoelectric point, the charge at pH 7 and the mass
    are calculated.

    Writes ``output_name + '_lc-retention-times.csv'`` with the columns
    peptide_sequence, rts, iso_point, charge and mass, and prints the file
    name. Returns None.

    The retention time is filtered together with its peptide. Previously the
    unfiltered retention-time list was placed next to the filtered peptide
    list, so every row after the first dropped peptide carried the wrong
    time.
    """
    seq_rt_df = pd.read_csv(rtfilename, names=['seq_rt'])
    # n is passed by keyword: newer pandas releases no longer accept it
    # positionally, while older ones accept the keyword as well.
    df = pd.DataFrame(seq_rt_df.seq_rt.str.split(' ', n=1).tolist(),
                      columns=['pep_seq', 'rts'])
    seq_vec = df['pep_seq'].tolist()
    all_rts = df['rts'].tolist()

    print('Removing xs and *s from seqs...')
    # Add the C-terminal group label, then drop peptides with a stop ('*'),
    # an unknown residue ('X') or selenocysteine ('U'), keeping each
    # retention time attached to its peptide.
    seq_vec_terms = [central_pep + '-OH' for central_pep in seq_vec]
    kept = [(pep, rt)
            for pep, rt in zip(seq_vec_terms, all_rts)
            if not any(symbol in pep for symbol in ('*', 'X', 'U'))]
    # Changing B to asparagine and Z to glutamine.
    z_removed_peps = [pep.replace('B', 'N').replace('Z', 'Q')
                      for pep, _ in kept]
    kept_rts = [rt for _, rt in kept]

    # Modifying peptides: oxidation of methionine, carbamidomethylation of
    # cysteine (acetylation of the N terminal was done upstream).
    print('Modifying peptides...')
    mod_pep = []
    for tryp_pep in z_removed_peps:
        mod_pep.extend(
            pyteomics.parser.isoforms(tryp_pep,
                                      fixed_mods={
                                          'ox': ['M'],
                                          'cam': ['C']
                                      },
                                      show_unmodified_termini=True))

    # Modified amino acid dictionary for mass calculation.
    aa_comp = dict(mass.std_aa_comp)
    aa_comp['Ac-'] = mass.Composition({'C': 2, 'H': 3, 'N': 0, 'O': 1, 'P': 0})
    aa_comp['cam'] = mass.Composition({'C': 2, 'H': 3, 'N': 1, 'O': 1, 'P': 0})
    aa_comp['ox'] = mass.Composition({'O': 1})

    # Isoelectric point, charge at pH = 7 and mass of every modified peptide.
    print('Calculating peptide physicochemical properties...')
    iso_electric_points = []
    pep_charges = []
    pep_mass = []
    for peptide in mod_pep:
        iso_electric_points.append(electrochem.pI(peptide))
        pep_charges.append(electrochem.charge(peptide, 7))
        pep_mass.append(
            mass.calculate_mass(sequence=peptide, aa_comp=aa_comp))

    # Combining the sequences, times, and physicochemical characteristics.
    peptide_dataframe = pd.concat([
        pd.Series(z_removed_peps, name='peptide_sequence'),
        pd.Series(kept_rts, name='rts'),
        pd.Series(iso_electric_points, name='iso_point'),
        pd.Series(pep_charges, name='charge'),
        pd.Series(pep_mass, name='mass'),
    ], axis=1)

    file_name = output_name + '_lc-retention-times.csv'
    print(file_name)
    peptide_dataframe.to_csv(file_name)