def compute_sequence_features(peptides_df):
    """
    Compute sequence-derived features to evaluate a database.

    Parameters:
        peptides_df : df
            dataframe with peptides; must contain a "sequence" column of
            strings.

    Returns:
        df, the same ``peptides_df`` object. The feature columns are added
        in place, so the caller's dataframe is modified as well.
    """
    sequences = peptides_df["sequence"]

    peptides_df["length"] = sequences.apply(len)

    # Amino acid group counts. ``Series.str.count`` takes a regular
    # expression, so one character class per group gives the same total as
    # summing one count per residue letter.
    residue_groups = (
        ("KR", "[KR]"),
        ("aromatic", "[FWY]"),
        ("acids", "[DE]"),
        ("aliphatic", "[AILMV]"),
        ("HGP", "[GPH]"),
    )
    for column, pattern in residue_groups:
        peptides_df[column] = sequences.str.count(pattern)

    # Sequence properties computed by pyteomics, one value per peptide.
    peptides_df["isoelectric_point"] = [
        electrochem.pI(sequence) for sequence in sequences.values
    ]
    peptides_df["gravy"] = [
        electrochem.gravy(sequence) for sequence in sequences.values
    ]
    return peptides_df
# Walk every record of the FASTA file, collecting the isoelectric point and
# the per-residue frequencies of each sequence that pyteomics can process.
for seq_record in SeqIO.parse(fasta_file, "fasta"):
    # Plain-string copy of the record's sequence, trailing whitespace removed.
    aa_sequence = str(seq_record.seq.rstrip())
    count += 1
    # Progress indicator: echo the running total every 1000 records.
    # print() with a single argument behaves the same on Python 2 and 3.
    if count % 1000 == 0:
        print(count)
    try:
        # pI is computed first, so a sequence pyteomics rejects contributes
        # nothing to the residue tallies below.
        pI_count.append(electrochem.pI(aa_sequence))
        for aa in aa_sequence:
            aa_freqs[aa] += 1
            residue_count += 1
    except auxiliary.PyteomicsError:
        # Count the failure and move on to the next record.
        failed_count += 1

# Per-dataset summary, written once all records have been processed.
logfile.write("For dataset " + name + "\n")
logfile.write("A total of %d entries were found\n" % count)
logfile.write("A total of %d entries had errors\n" % failed_count)
logfile.write("A total of %d entries were used in the analysis\n\n" %
              (count - failed_count))
freq_results[name] = aa_freqs
# Composition table for mass calculation: a copy of the standard pyteomics
# table extended with the modification labels 'Ac-', 'cam' and 'ox'.
aa_comp = dict(mass.std_aa_comp)
aa_comp.update({
    'Ac-': mass.Composition({'C': 2, 'H': 3, 'N': 0, 'O': 1, 'P': 0}),
    'cam': mass.Composition({'C': 2, 'H': 3, 'N': 1, 'O': 1, 'P': 0}),
    'ox': mass.Composition({'O': 1}),
})

# Isoelectric point, charge at pH 7 and mass for every modified peptide.
# The isoelectric point and charge are not used from this point on; they
# were used for examining other predictive components of apparent
# cofragmentation bias.
print('Calculating peptide physicochemical properties...')
iso_electric_points = []
pep_charges = []
pep_mass = []
i = 0  # ends up equal to the number of peptides processed
for i, peptide in enumerate(mod_pep, start=1):
    peptide_isoelectric_point = electrochem.pI(peptide)
    peptide_charge = electrochem.charge(peptide, 7)
    peptide_mass = mass.calculate_mass(sequence=peptide, aa_comp=aa_comp)
    iso_electric_points.append(peptide_isoelectric_point)
    pep_charges.append(peptide_charge)
    pep_mass.append(peptide_mass)

print('LC-retention time prediction with the following parameters:')
print(lc_params)

# Column length: taken from the first row of the parameter table and
# required to be numeric.
column_length = lc_params['column_length'][0]
if not isinstance(column_length, numbers.Number):
    raise NameError(
        'Error in parameter input file, column_length takes only Numeric.')
def add_p_i(self):
    """Add a 'pI' column to ``self.data_frame``.

    Each value is ``electrochem.pI`` applied to the matching entry of the
    'sequence' column.
    """
    sequences = self.data_frame['sequence']
    self.data_frame['pI'] = sequences.apply(electrochem.pI)
def test_pI_precision(self):
    """pI at each requested precision stays within that precision of the
    value computed with precision_pI=1e-15."""
    reference = pI('PEPTIDE', precision_pI=1e-15)
    for exponent in range(16):
        # Tolerances run from 1 (10**0) down to 1e-15.
        tolerance = 10 ** (-exponent)
        estimate = pI('PEPTIDE', precision_pI=tolerance)
        deviation = abs(estimate - reference)
        self.assertTrue(deviation < tolerance)
def test_pI_calculations(self):
    """pI of 'H-AAA-OH' lies within 0.01 of the mean of 2.34 and 9.69."""
    # NOTE(review): 2.34 and 9.69 are presumably the terminal-group pK
    # values of the default pK table -- confirm against pyteomics.
    expected = (2.34 + 9.69) / 2.0
    deviation = abs(pI('H-AAA-OH') - expected)
    self.assertTrue(deviation < 0.01)
def _require_numeric_lc_param(lc_params, name, terminator='.'):
    """Return row 0 of LC parameter ``name``; raise NameError if not numeric."""
    value = lc_params[name][0]
    if not isinstance(value, numbers.Number):
        raise NameError('Error in parameter input file, ' + name +
                        ' takes only Numeric' + terminator)
    return value


def peptide_mod_biolccc_rt_prediction(lc_params_file, fasta_file_name,
                                      custom_gradient, output_name):
    """Digest a FASTA file with trypsin and predict peptide retention times.

    Steps: read the LC parameters, digest every sequence line with trypsin,
    drop peptides shorter than 5 residues or containing '*', 'X' or 'U',
    map B -> N and Z -> Q, apply the fixed modifications 'ox' (on M) and
    'cam' (on C), compute pI / charge at pH 7 / mass with pyteomics, predict
    retention times with BioLCCC, and write everything to
    ``output_name + '_lc-retention-times.csv'``.

    Parameters:
        lc_params_file : path of a CSV file whose columns are exactly the
            names in ``all_required_params`` below; only the first row is
            used.
        fasta_file_name : path of the FASTA file to digest.
        custom_gradient : path of a CSV file with the gradient set points;
            only read when the ``linear`` parameter is false.
        output_name : prefix of the output CSV file name.

    Returns:
        None. The result is written to disk.

    Raises:
        NameError : if the parameter file has missing or unexpected columns,
            if ``model`` is not 'FA' or 'TFA', if ``code_format`` is not
            'genes', 'rna' or 'aas', or if a numeric parameter is not
            numeric.
    """
    lc_params = pd.read_csv(lc_params_file)

    all_required_params = [
        'column_length', 'column_diameter', 'column_pore_size',
        'second_solvent_concentration_a', 'second_solvent_concentration_b',
        'gradient_0', 'gradient_1', 'gradient_2', 'flow_rate', 'code_format',
        'linear', 'model'
    ]

    # Check if all parameters are in parameter input file.
    if sorted(all_required_params) != sorted(lc_params.keys()):
        raise NameError(
            'Error in parameter LC input file, check for typos or missing parameter.'
        )

    # Whether the fasta file holds codons or amino acids. Note that
    # code_format == 'genes' and code_format == 'rna' are not functional yet.
    code_format = lc_params['code_format'][0]

    # If not a linear gradient, a gradient file must be supplied.
    linear_gradient = lc_params['linear'][0]
    if not linear_gradient:
        gradient_file = pd.read_csv(custom_gradient)

    # Which type of model to use for prediction (from TFA or FA). Rejecting
    # an unknown value here avoids an unbound-variable error much later.
    model_type = lc_params['model'][0]
    if model_type == 'FA':
        print('formic acid')
    elif model_type == 'TFA':
        print('tri')
    else:
        raise NameError(
            "Error in parameter LC input file, model must be 'FA' or 'TFA'.")

    # Tryptic peptides and, in parallel, the header of the record each one
    # came from.
    seq_vec = []
    contig_vec = []
    last_seq = None
    trypsin = pyteomics.parser.expasy_rules['trypsin']

    # NOTE(review): every sequence line is digested on its own, so a record
    # wrapped over several lines would be cut at the line breaks -- confirm
    # that the input FASTA keeps one sequence per line.
    with open(fasta_file_name, 'r') as fasta_in:
        for line in fasta_in:
            line = line.strip()
            if not line:
                # Blank line.
                continue
            if line[0] == ">":
                # Header line: remember it for the sequence lines below it.
                last_seq = line[1:]
                continue

            # Sequence line: translate to amino acids if necessary.
            if code_format == 'genes':
                protein = Codon_to_Aminoacid(line)
            elif code_format == 'rna':
                protein = Codon_to_Aminoacid(line.replace('U', 'T'))
            elif code_format == 'aas':
                protein = line
            else:
                raise NameError(
                    "Error in parameter LC input file, code_format must be "
                    "'genes', 'rna' or 'aas'.")

            # Digest with trypsin; peptides shorter than 5 amino acids are
            # removed from the dataset.
            for tryp_pep in pyteomics.parser.cleave(str(protein), trypsin):
                if len(tryp_pep) < 5:
                    continue
                seq_vec.append(tryp_pep)
                contig_vec.append(last_seq)

    print('Removing xs and *s from seqs...')
    # One pass keeps peptides and contigs paired: drop anything with a stop
    # ('*'), an unknown amino acid ('X') or selenocysteine ('U'); change B to
    # asparagine and Z to glutamine; add the C-terminal '-OH' label.
    z_removed_peps = []
    contig_vec_no_x = []
    for central_pep, contig in zip(seq_vec, contig_vec):
        if '*' in central_pep or 'X' in central_pep or 'U' in central_pep:
            continue
        cleaned = central_pep.replace('B', 'N').replace('Z', 'Q')
        z_removed_peps.append(cleaned + '-OH')
        contig_vec_no_x.append(contig)

    # Modifying peptides: oxidation of methionine, carbamidomethylation of
    # cysteine, acetylation of N terminal (this one was done upstream).
    # NOTE(review): the output table pairs rows by position, which relies on
    # isoforms() yielding exactly one form per peptide when only fixed
    # modifications are given -- confirm.
    print('Modifying peptides...')
    mod_pep = []
    for tryp_pep in z_removed_peps:
        mod_pep.extend(
            pyteomics.parser.isoforms(tryp_pep,
                                      fixed_mods={
                                          'ox': ['M'],
                                          'cam': ['C']
                                      },
                                      show_unmodified_termini=True))

    # Modified amino acid dictionary for mass calculation.
    aa_comp = dict(mass.std_aa_comp)
    aa_comp['Ac-'] = mass.Composition({'C': 2, 'H': 3, 'N': 0, 'O': 1, 'P': 0})
    aa_comp['cam'] = mass.Composition({'C': 2, 'H': 3, 'N': 1, 'O': 1, 'P': 0})
    aa_comp['ox'] = mass.Composition({'O': 1})

    # Peptide isoelectric points, masses, and charge at pH = 7. The
    # isoelectric point and charge are not used from this point on, but were
    # used for examining other predictive components of apparent
    # cofragmentation bias.
    print('Calculating peptide physicochemical properties...')
    iso_electric_points = [electrochem.pI(peptide) for peptide in mod_pep]
    pep_charges = [electrochem.charge(peptide, 7) for peptide in mod_pep]
    pep_mass = [
        mass.calculate_mass(sequence=peptide, aa_comp=aa_comp)
        for peptide in mod_pep
    ]

    print('LC-retention time prediction with the following parameters:')
    print(lc_params)

    # Numeric parameters, validated in the same order as before.
    column_length = _require_numeric_lc_param(lc_params, 'column_length')
    column_diameter = _require_numeric_lc_param(lc_params, 'column_diameter')
    column_pore_size = _require_numeric_lc_param(lc_params,
                                                 'column_pore_size')
    second_solvent_concentration_a = _require_numeric_lc_param(
        lc_params, 'second_solvent_concentration_a')
    second_solvent_concentration_b = _require_numeric_lc_param(
        lc_params, 'second_solvent_concentration_b')
    gradient_0 = _require_numeric_lc_param(lc_params, 'gradient_0')
    gradient_1 = _require_numeric_lc_param(lc_params, 'gradient_1')
    gradient_2 = _require_numeric_lc_param(lc_params, 'gradient_2')
    # The flow_rate message has never ended with a period; kept verbatim.
    flow_rate = _require_numeric_lc_param(lc_params, 'flow_rate',
                                          terminator='')

    # BioLCCC chromatographic conditions.
    myChromoConditions = biolccc.ChromoConditions()
    # The column length in mm.
    myChromoConditions.setColumnLength(column_length)
    # The internal column diameter in mm.
    myChromoConditions.setColumnDiameter(column_diameter)
    # The average pore size in A.
    myChromoConditions.setColumnPoreSize(column_pore_size)
    # The concentration of the eluting solvent (ACN for the reversed
    # phase) in component A in %.
    myChromoConditions.setSecondSolventConcentrationA(
        second_solvent_concentration_a)
    # The concentration of the eluting solvent (ACN for the reversed
    # phase) in component B in %.
    myChromoConditions.setSecondSolventConcentrationB(
        second_solvent_concentration_b)

    # The shape of the gradient. The linear case goes from gradient_0% to
    # gradient_1% of component B over gradient_2 minutes.
    if linear_gradient:
        myChromoConditions.setGradient(
            biolccc.Gradient(gradient_0, gradient_1, gradient_2))
    else:
        # Custom gradient: one set point per column of the gradient file,
        # built from rows 0 and 1 of that column.
        # NOTE(review): presumably row 0 is the time and row 1 the % of
        # component B -- confirm against the gradient file format.
        myGradient = biolccc.Gradient()
        for set_point in range(len(gradient_file.columns)):
            myGradient.addPoint(gradient_file.iloc[0, set_point],
                                gradient_file.iloc[1, set_point])
        myChromoConditions.setGradient(myGradient)

    # The flow rate in ml/min.
    myChromoConditions.setFlowRate(flow_rate)

    # Designating BioLCCC model to use (model_type was validated above).
    if model_type == 'TFA':
        model_to_use = biolccc.rpAcnTfaChain
    else:
        model_to_use = biolccc.rpAcnFaRod

    print('Calculating retention times...')
    peptide_rts = [
        biolccc.calculateRT(tryp_pep, model_to_use, myChromoConditions)
        for tryp_pep in mod_pep
    ]

    # Combining the sequences, times, and physicochemical characteristics.
    # Series are concatenated (rather than built from a dict) so columns of
    # different length are padded instead of raising.
    peptide_dataframe = pd.concat([
        pd.Series(z_removed_peps, name='peptide_sequence'),
        pd.Series(peptide_rts, name='rts'),
        pd.Series(iso_electric_points, name='iso_point'),
        pd.Series(pep_charges, name='charge'),
        pd.Series(pep_mass, name='mass'),
        pd.Series(contig_vec_no_x, name='contig'),
    ],
                                  axis=1)

    file_name = output_name + '_lc-retention-times.csv'
    peptide_dataframe.to_csv(file_name)
def handcrafted_features(data, tags):
    """Build the handcrafted feature table for the alpha and beta chains.

    DOI 10.1007/s00251-017-1023-5
    Code from https://github.com/bittremieux/TCR-Classifier/blob/master/tcr_classifier.ipynb
    Modified to apply handcrafted features twice, once to the alpha chain
    and again to the beta chain.
    Modified to handle split for training, validation, and test cohorts.
    Modified for multinomial classification.

    Parameters:
        data : DataFrame with the columns '<chain>_vgene', '<chain>_jgene'
            and '<chain>_cdr3' for chain in ('tra', 'trb'), plus 'weights',
            'split' and one 'labels_<tag>' column per entry of ``tags``.
        tags : iterable of label names.

    Returns:
        DataFrame with one row per row of ``data`` (same index): the features
        of both chains followed by the weights, label and split columns.

    Every intermediate frame is given ``data.index`` so that the final
    column-wise concat lines rows up even when ``data`` does not carry a
    default 0..n-1 index.

    NOTE(review): the scalar feature columns ('length', 'avg_basicity',
    'avg_hydrophobicity', 'avg_helicity', 'pI', 'avg_mutation_stability',
    'mass') are not chain-prefixed, so each name appears twice in the result.
    Left unchanged because renaming would alter the output schema.
    NOTE(review): ``mutation_stability`` has no entry for 'B', 'X' or 'Z',
    unlike the other tables, so such residues raise KeyError.
    """
    # physicochemical amino acid properties
    basicity = {
        'A': 206.4, 'B': 210.7, 'C': 206.2, 'D': 208.6, 'E': 215.6,
        'F': 212.1, 'G': 202.7, 'H': 223.7, 'I': 210.8, 'K': 221.8,
        'L': 209.6, 'M': 213.3, 'N': 212.8, 'P': 214.4, 'Q': 214.2,
        'R': 237.0, 'S': 207.6, 'T': 211.7, 'V': 208.7, 'W': 216.1,
        'X': 210.2, 'Y': 213.1, 'Z': 214.9
    }
    hydrophobicity = {
        'A': 0.16, 'B': -3.14, 'C': 2.50, 'D': -2.49, 'E': -1.50,
        'F': 5.00, 'G': -3.31, 'H': -4.63, 'I': 4.41, 'K': -5.00,
        'L': 4.76, 'M': 3.23, 'N': -3.79, 'P': -4.92, 'Q': -2.76,
        'R': -2.77, 'S': -2.85, 'T': -1.08, 'V': 3.02, 'W': 4.88,
        'X': 4.59, 'Y': 2.00, 'Z': -2.13
    }
    helicity = {
        'A': 1.24, 'B': 0.92, 'C': 0.79, 'D': 0.89, 'E': 0.85,
        'F': 1.26, 'G': 1.15, 'H': 0.97, 'I': 1.29, 'K': 0.88,
        'L': 1.28, 'M': 1.22, 'N': 0.94, 'P': 0.57, 'Q': 0.96,
        'R': 0.95, 'S': 1.00, 'T': 1.09, 'V': 1.27, 'W': 1.07,
        'X': 1.29, 'Y': 1.11, 'Z': 0.91
    }
    mutation_stability = {
        'A': 13, 'C': 52, 'D': 11, 'E': 12, 'F': 32, 'G': 27, 'H': 15,
        'I': 10, 'K': 24, 'L': 34, 'M': 6, 'N': 6, 'P': 20, 'Q': 10,
        'R': 17, 'S': 10, 'T': 11, 'V': 17, 'W': 55, 'Y': 31
    }

    def average_of(table):
        # Per-sequence mean of a per-residue property table.
        def average(seq):
            return sum([table[aa] for aa in seq]) / parser.length(seq)
        return average

    def records_frame(records):
        # Frame from per-row dicts; missing keys become 0 and rows are
        # labelled like ``data`` so the final concat aligns them.
        frame = pd.DataFrame.from_records(records).fillna(0)
        frame.index = data.index
        return frame

    # pI of a single residue depends only on the residue, so compute it once
    # per distinct letter instead of once per position.
    residue_pI = {}

    def single_residue_pI(aa):
        if aa not in residue_pI:
            residue_pI[aa] = electrochem.pI(aa)
        return residue_pI[aa]

    # feature conversion and generation
    features_list = []
    for chain in ['tra', 'trb']:
        cdr3 = data[chain + '_cdr3']

        # V and J gene encoding via DictVectorizer
        onehot_encoder = feature_extraction.DictVectorizer(sparse=False)
        gene_records = data[[chain + '_vgene',
                             chain + '_jgene']].to_dict(orient='records')
        features_list.append(
            pd.DataFrame(onehot_encoder.fit_transform(gene_records),
                         columns=onehot_encoder.feature_names_,
                         index=data.index))

        # sequence length
        features_list.append(cdr3.apply(parser.length).to_frame('length'))

        # number of occurences of each amino acid
        aa_counts = records_frame(
            [parser.amino_acid_composition(sequence) for sequence in cdr3])
        aa_counts.columns = [
            chain + '_count_{}'.format(column) for column in aa_counts.columns
        ]
        features_list.append(aa_counts)

        # physicochemical properties: (average) basicity, (average)
        # hydrophobicity, (average) helicity, pI, (average) mutation
        # stability
        features_list.append(
            cdr3.apply(average_of(basicity)).to_frame('avg_basicity'))
        features_list.append(
            cdr3.apply(
                average_of(hydrophobicity)).to_frame('avg_hydrophobicity'))
        features_list.append(
            cdr3.apply(average_of(helicity)).to_frame('avg_helicity'))
        features_list.append(cdr3.apply(electrochem.pI).to_frame('pI'))
        features_list.append(
            cdr3.apply(average_of(mutation_stability)).to_frame(
                'avg_mutation_stability'))

        # peptide mass
        features_list.append(cdr3.apply(mass.fast_mass).to_frame('mass'))

        # positional features
        # amino acid occurence and physicochemical properties at a given
        # position from the center
        pos_aa, pos_basicity, pos_hydro, pos_helicity, pos_pI, pos_mutation = [
            [] for _ in range(6)
        ]
        for sequence in cdr3:
            length = parser.length(sequence)
            start_pos = -1 * (length // 2)
            if length % 2 == 1:
                # Odd length: the middle residue sits at position 0.
                pos_range = list(range(start_pos, start_pos + length))
            else:
                # Even length: there is no position 0.
                pos_range = (list(range(start_pos, 0)) +
                             list(range(1, start_pos + length + 1)))

            aa_row, basicity_row, hydro_row = {}, {}, {}
            helicity_row, pI_row, mutation_row = {}, {}, {}
            for pos, aa in zip(pos_range, sequence):
                prefix = chain + '_pos_{}'.format(pos)
                aa_row[prefix + '_' + aa] = 1
                basicity_row[prefix + '_basicity'] = basicity[aa]
                hydro_row[prefix + '_hydrophobicity'] = hydrophobicity[aa]
                helicity_row[prefix + '_helicity'] = helicity[aa]
                pI_row[prefix + '_pI'] = single_residue_pI(aa)
                mutation_row[prefix +
                             '_mutation_stability'] = mutation_stability[aa]

            pos_aa.append(aa_row)
            pos_basicity.append(basicity_row)
            pos_hydro.append(hydro_row)
            pos_helicity.append(helicity_row)
            pos_pI.append(pI_row)
            pos_mutation.append(mutation_row)

        for records in (pos_aa, pos_basicity, pos_hydro, pos_helicity,
                        pos_pI, pos_mutation):
            features_list.append(records_frame(records))

    # sample weights, labels and cohort split are passed through unchanged
    features_list.append(data['weights'])
    for tag in tags:
        features_list.append(data['labels_' + tag])
    features_list.append(data['split'])

    # combine all features
    data_processed = pd.concat(features_list, axis=1)

    return data_processed
def openms_modelled_rt(rtfilename, output_name):
    """Combine modelled retention times with peptide properties.

    Each line of ``rtfilename`` is split at its first space into a peptide
    sequence and a retention time. Peptides containing '*', 'X' or 'U' are
    dropped together with their retention time, B is changed to N and Z to
    Q, the fixed modifications 'ox' (on M) and 'cam' (on C) are applied, and
    pI / charge at pH 7 / mass are computed with pyteomics. The table is
    written to ``output_name + '_lc-retention-times.csv'``.

    Parameters:
        rtfilename : path of the retention-time file.
        output_name : prefix of the output CSV file name.

    Returns:
        None. The result is written to disk and its file name is printed.
    """
    seq_rt_df = pd.read_csv(rtfilename, names=['seq_rt'])
    # ``n`` must be passed by keyword: newer pandas rejects it positionally.
    split_rows = seq_rt_df['seq_rt'].str.split(' ', n=1).tolist()
    df = pd.DataFrame(split_rows, columns=['pep_seq', 'rts'])

    print('Removing xs and *s from seqs...')
    # Filter peptides and retention times together so that rows stay paired:
    # drop peptides with '*', an unknown amino acid (X) or selenocysteine
    # (U), change B to asparagine and Z to glutamine, and add the C-terminal
    # '-OH' label.
    z_removed_peps = []
    kept_rts = []
    for central_pep, rt in zip(df['pep_seq'], df['rts']):
        if '*' in central_pep or 'X' in central_pep or 'U' in central_pep:
            continue
        cleaned = central_pep.replace('B', 'N').replace('Z', 'Q')
        z_removed_peps.append(cleaned + '-OH')
        kept_rts.append(rt)

    # Modifying peptides: oxidation of methionine, carbamidomethylation of
    # cysteine, acetylation of N terminal (this one was done upstream).
    # NOTE(review): the output table pairs rows by position, which relies on
    # isoforms() yielding exactly one form per peptide when only fixed
    # modifications are given -- confirm.
    print('Modifying peptides...')
    mod_pep = []
    for tryp_pep in z_removed_peps:
        mod_pep.extend(
            pyteomics.parser.isoforms(tryp_pep,
                                      fixed_mods={
                                          'ox': ['M'],
                                          'cam': ['C']
                                      },
                                      show_unmodified_termini=True))

    # Modified amino acid dictionary for mass calculation.
    aa_comp = dict(mass.std_aa_comp)
    aa_comp['Ac-'] = mass.Composition({'C': 2, 'H': 3, 'N': 0, 'O': 1, 'P': 0})
    aa_comp['cam'] = mass.Composition({'C': 2, 'H': 3, 'N': 1, 'O': 1, 'P': 0})
    aa_comp['ox'] = mass.Composition({'O': 1})

    # Peptide isoelectric points, masses, and charge at pH = 7.
    print('Calculating peptide physicochemical properties...')
    iso_electric_points = [electrochem.pI(peptide) for peptide in mod_pep]
    pep_charges = [electrochem.charge(peptide, 7) for peptide in mod_pep]
    pep_mass = [
        mass.calculate_mass(sequence=peptide, aa_comp=aa_comp)
        for peptide in mod_pep
    ]

    # Combining the sequences, times, and physicochemical characteristics.
    peptide_dataframe = pd.concat([
        pd.Series(z_removed_peps, name='peptide_sequence'),
        pd.Series(kept_rts, name='rts'),
        pd.Series(iso_electric_points, name='iso_point'),
        pd.Series(pep_charges, name='charge'),
        pd.Series(pep_mass, name='mass'),
    ],
                                  axis=1)

    file_name = output_name + '_lc-retention-times.csv'
    print(file_name)
    peptide_dataframe.to_csv(file_name)