def get_peptide_values(list_peptides, descriptor_name):
    """Return one moment value per peptide for the requested modlAMP scale.

    :param list_peptides: list of amino acid peptide sequences
    :param descriptor_name: modlAMP-prescribed descriptor (scale) name
    :return: list with the first descriptor column of every peptide, in
        the order of the input list
    """
    peptide_desc = PeptideDescriptor(list_peptides, descriptor_name)
    peptide_desc.calculate_moment()

    # ``descriptor`` holds one row per peptide; only its first entry is kept.
    values = []
    for row in peptide_desc.descriptor:
        values.append(row[0])
    return values
def insert_phycs(seq_df):
    """Add physico-chemical descriptor columns to a peptide DataFrame.

    The frame is modified in place and also returned. The columns are added
    in this order: ``IEP``, ``Net Charge``, ``Hydrophobic Moment``,
    ``Hydrophobicity``, ``Transmembrane Propensity``,
    ``Alpha Helical Propensity``, ``Aliphatic Index``, ``Boman Index``.

    :param seq_df: DataFrame with a ``'Sequence'`` column of peptide strings
    :return: the same DataFrame with the descriptor columns added
    """

    def get_ieq_nc(seq, is_iep=True):
        """Return the isoelectric point, or the charge at pH 7.0, of *seq*."""
        protparam = PA(seq)
        if is_iep:
            return protparam.isoelectric_point()
        return protparam.charge_at_pH(7.0)

    # Iterate the column directly. The previous code padded ``map`` with
    # ``[flag] * seq_df.size``; ``DataFrame.size`` is rows * columns, not the
    # row count, and only worked because ``map`` stops at the shortest input.
    seq_df['IEP'] = [get_ieq_nc(seq) for seq in seq_df['Sequence']]
    seq_df['Net Charge'] = [
        get_ieq_nc(seq, is_iep=False) for seq in seq_df['Sequence']
    ]

    sequences = seq_df['Sequence'].values

    # Hydrophobic moment (assumes all peptides are alpha-helical: angle=100)
    descrpt = PeptideDescriptor(sequences, 'eisenberg')
    descrpt.calculate_moment(window=1000, angle=100, modality='max')
    seq_df['Hydrophobic Moment'] = descrpt.descriptor.reshape(-1)

    # Global (whole-sequence) values for individual amino acid scales
    scale_columns = (
        ('Hydrophobicity', 'hopp-woods'),
        ('Transmembrane Propensity', 'tm_tend'),
        ('Alpha Helical Propensity', 'levitt_alpha'),
    )
    for column, scale in scale_columns:
        descrpt = PeptideDescriptor(sequences, scale)
        descrpt.calculate_global()
        seq_df[column] = descrpt.descriptor.reshape(-1)

    # Aliphatic index
    descrpt = GlobalDescriptor(sequences)
    descrpt.aliphatic_index()
    seq_df['Aliphatic Index'] = descrpt.descriptor.reshape(-1)

    # Boman index
    descrpt = GlobalDescriptor(sequences)
    descrpt.boman_index()
    seq_df['Boman Index'] = descrpt.descriptor.reshape(-1)

    return seq_df
def calc_uH(self, window=1000, angle=100, modality='max'):
    """Calculate Eisenberg hydrophobic moments for every sequence set in the library.

    :param window: {int} amino acid window in which the moment is calculated.
        For sequences shorter than the window the sequence length is used, so
        with the default of 1000 the **global** hydrophobic moment is obtained
        for all sequences shorter than 1000. Otherwise the maximal hydrophobic
        moment found for the chosen window size is returned.
    :param angle: {int} angle in which to calculate the moment. **100** for
        alpha helices, **180** for beta sheets.
    :param modality: {'max' or 'mean'} calculate the maximum or the mean
        hydrophobic moment, respectively.
    :return: nothing; one array of moments per library entry is appended to
        the attribute :py:attr:`uH`.

    .. seealso:: :func:`modlamp.descriptors.PeptideDescriptor.calculate_moment()`
    """
    moment_options = dict(window=window, angle=angle, modality=modality)
    n_entries = self.library.shape[0]
    for index in range(n_entries):
        eisenberg = PeptideDescriptor(self.library[index], 'eisenberg')
        eisenberg.calculate_moment(**moment_options)
        # keep only the first descriptor column of this entry
        first_column = eisenberg.descriptor[:, 0]
        self.uH.append(first_column)
def predict():
    """Handle the single-sequence prediction form.

    On GET the input form (``predictor.html``) is rendered. On POST the
    submitted ``seq`` form field is written to a FASTA file, modlAMP peptide
    and global descriptors are computed from it, the stored model
    (``ml_model.pkl``) predicts the activity classes, and ``seq.html`` is
    rendered with the predicted labels and a base64-encoded PNG pie chart of
    the class probabilities.

    :return: the rendered template for the request
    """
    if request.method != 'POST':
        return render_template('predictor.html')

    # One path for writing and reading. The file used to be written to the
    # relative "random.fasta" but read from this absolute location, which
    # only matched when the working directory happened to be the project
    # directory.
    # NOTE(review): the file is shared by all requests - confirm whether
    # concurrent submissions are possible.
    fasta_path = '/home/sanika/proj/random.fasta'

    seq = request.form['seq']
    with open(fasta_path, "w") as fp:
        fp.write(seq)

    pepdesc = PeptideDescriptor(fasta_path, 'eisenberg')  # use Eisenberg consensus scale
    globdesc = GlobalDescriptor(fasta_path)

    # --------------- Peptide Descriptor (AA scales) Calculations ---------------
    pepdesc.calculate_global()  # calculate global Eisenberg hydrophobicity
    pepdesc.calculate_moment(append=True)  # calculate Eisenberg hydrophobic moment

    # load other AA scales
    pepdesc.load_scale('gravy')  # load GRAVY scale
    pepdesc.calculate_global(append=True)  # calculate global GRAVY hydrophobicity
    pepdesc.calculate_moment(append=True)  # calculate GRAVY hydrophobic moment
    pepdesc.load_scale('z3')  # load old Z scale
    pepdesc.calculate_autocorr(1, append=True)  # calculate global Z scale (=window1 autocorrelation)

    # --------------- Global Descriptor Calculations ---------------
    globdesc.length()  # sequence length
    globdesc.boman_index(append=True)  # Boman index
    globdesc.aromaticity(append=True)  # global aromaticity
    globdesc.aliphatic_index(append=True)  # aliphatic index
    globdesc.instability_index(append=True)  # instability index
    globdesc.calculate_charge(ph=7.4, amide=False, append=True)  # net charge
    globdesc.calculate_MW(amide=False, append=True)  # molecular weight

    # Feature matrix: global descriptors first, then the AA-scale descriptors.
    f1 = pepdesc.descriptor
    f2 = globdesc.descriptor
    result = np.concatenate((f2, f1), axis=1)

    clf = joblib.load('ml_model.pkl')
    pred = clf.predict(result)
    proba = clf.predict_proba(result).tocoo()
    out = pred.tocoo().col  # column indices of the predicted classes

    labels = ['antiviral', 'antibacterial', 'antifungal']
    # NOTE(review): ``.data`` of a COO matrix holds only the stored entries;
    # presumably three values are expected here - verify against the model.
    values = proba.data

    figfile = BytesIO()
    try:
        plt.pie(values, labels=labels, autopct='%.0f%%', shadow=True,
                radius=0.5)
        plt.savefig('/home/sanika/proj/pie_chart.jpg')
        plt.savefig(figfile, format='png')
    finally:
        # always release the pyplot figure, even if drawing/saving failed
        plt.close()
    figfile.seek(0)
    figdata_png = base64.b64encode(figfile.getvalue()).decode('ascii')

    # Class index -> label; any index other than 0 or 1 means "antifungal".
    class_names = {0: "antiviral", 1: "antibacterial"}
    res = [class_names.get(int(index), "antifungal") for index in out]

    return render_template('seq.html', seq=res, result=figdata_png)
def upload():
    """Handle the FASTA upload form and write batch predictions to CSV.

    On GET ``predictor.html`` is rendered. On POST the uploaded file is
    validated with ``allowed_file``, stored in ``app.config['UPLOAD_FOLDER']``,
    parsed as FASTA, described with modlAMP peptide and global descriptors and
    classified row by row with the stored model (``ml_model.pkl``). The
    predicted labels of each sequence are joined with ``'-'`` and written,
    together with the record ids and sequences, to ``result.csv``. The
    uploaded file is removed afterwards.

    :return: the rendered template for the request
    """
    if request.method != 'POST':
        # This will be executed on GET request.
        return render_template('predictor.html')

    upfile = request.files['file']
    if not (upfile and allowed_file(upfile.filename)):
        error = "PLEASE CHECK THE FORMAT OF FILE TO UPLOAD"
        return render_template('upload.html', error=error)

    filename = secure_filename(upfile.filename)
    # Use the saved location for every later access. The file used to be
    # saved here but re-opened via the bare ``filename`` (relative to the
    # working directory), which only worked when both directories coincided.
    saved_path = os.path.join(app.config['UPLOAD_FOLDER'], filename)
    upfile.save(saved_path)

    try:
        identifiers = []
        sequence = []
        with open(saved_path) as fasta_file:  # Will close handle cleanly
            for seq_record in SeqIO.parse(fasta_file, 'fasta'):  # (generator)
                identifiers.append(seq_record.id)
                sequence.append(seq_record.seq)

        pepdesc = PeptideDescriptor(saved_path, 'eisenberg')  # use Eisenberg consensus scale
        globdesc = GlobalDescriptor(saved_path)

        # --------------- Peptide Descriptor (AA scales) Calculations ---------------
        pepdesc.calculate_global()  # calculate global Eisenberg hydrophobicity
        pepdesc.calculate_moment(append=True)  # calculate Eisenberg hydrophobic moment

        # load other AA scales
        pepdesc.load_scale('gravy')  # load GRAVY scale
        pepdesc.calculate_global(append=True)  # calculate global GRAVY hydrophobicity
        pepdesc.calculate_moment(append=True)  # calculate GRAVY hydrophobic moment
        pepdesc.load_scale('z3')  # load old Z scale
        pepdesc.calculate_autocorr(1, append=True)  # calculate global Z scale (=window1 autocorrelation)

        # --------------- Global Descriptor Calculations ---------------
        globdesc.length()  # sequence length
        globdesc.boman_index(append=True)  # Boman index
        globdesc.aromaticity(append=True)  # global aromaticity
        globdesc.aliphatic_index(append=True)  # aliphatic index
        globdesc.instability_index(append=True)  # instability index
        globdesc.calculate_charge(ph=7.4, amide=False, append=True)  # net charge
        globdesc.calculate_MW(amide=False, append=True)  # molecular weight

        # Feature matrix: global descriptors first, then the AA-scale ones.
        f1 = pepdesc.descriptor
        f2 = globdesc.descriptor
        result = np.concatenate((f2, f1), axis=1)

        # Load the model once instead of once per sequence.
        clf = joblib.load('ml_model.pkl')
        # Class index -> label; any index other than 0 or 1 is "antifungal".
        class_names = {0: "antiviral", 1: "antibacterial"}

        rs = []
        for features in result:
            prt = np.reshape(features, (-1, 14))
            pred = clf.predict(prt)
            predicted_classes = pred.tocoo().col
            rs.append([
                class_names.get(int(index), "antifungal")
                for index in predicted_classes
            ])

        a = ['-'.join(labels) for labels in rs]

        df = pd.DataFrame(data={
            "id": identifiers,
            "sequence": sequence,
            "activity": a
        }, columns=['id', 'sequence', 'activity'])
        df.to_csv("result.csv", sep=',', index=False)
    finally:
        # Do not leave the upload behind, even when processing failed.
        os.remove(saved_path)

    return render_template('up.html', mimetype="text/csv")
def helical_wheel(sequence, colorcoding='rainbow', lineweights=True, filename=None, seq=False, moment=False):
    """A function to project a given peptide sequence onto a helical wheel plot. It can be useful to illustrate the
    properties of alpha-helices, like positioning of charged and hydrophobic residues along the sequence.

    :param sequence: {str} the peptide sequence for which the helical wheel should be drawn
    :param colorcoding: {str} the color coding to be used, available: *rainbow*, *charge*, *polar*, *simple*,
        *amphipathic*, *none*. Unknown values fall back to *rainbow* after printing a notice.
    :param lineweights: {boolean} defines whether connection lines decrease in thickness along the sequence
    :param filename: {str} filename where to save the plot. *default = None* --> show the plot
    :param seq: {bool} whether the amino acid sequence should be plotted as a title
    :param moment: {bool} whether the Eisenberg hydrophobic moment should be calculated and plotted
    :return: a helical wheel projection plot of the given sequence (interactively or in **filename**)
    :Example:

    >>> helical_wheel('GLFDIVKKVVGALG')
    >>> helical_wheel('KLLKLLKKLLKLLK', colorcoding='charge')
    >>> helical_wheel('AKLWLKAGRGFGRG', colorcoding='none', lineweights=False)
    >>> helical_wheel('ACDEFGHIKLMNPQRSTVWY')

    .. image:: ../docs/static/wheel1.png
        :height: 300px
    .. image:: ../docs/static/wheel2.png
        :height: 300px
    .. image:: ../docs/static/wheel3.png
        :height: 300px
    .. image:: ../docs/static/wheel4.png
        :height: 300px

    .. versionadded:: v2.1.5
    """
    # color mappings: one face color (f_*) and one text color (t_*) per amino acid, in the order of ``aa``
    aa = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']
    f_rainbow = ['#3e3e28', '#ffcc33', '#b30047', '#b30047', '#ffcc33', '#3e3e28', '#80d4ff', '#ffcc33', '#0047b3',
                 '#ffcc33', '#ffcc33', '#b366ff', '#29a329', '#b366ff', '#0047b3', '#ff66cc', '#ff66cc', '#ffcc33',
                 '#ffcc33', '#ffcc33']
    f_charge = ['#000000', '#000000', '#ff4d94', '#ff4d94', '#000000', '#000000', '#80d4ff', '#000000', '#80d4ff',
                '#000000', '#000000', '#000000', '#000000', '#000000', '#80d4ff', '#000000', '#000000', '#000000',
                '#000000', '#000000']
    f_polar = ['#000000', '#000000', '#80d4ff', '#80d4ff', '#000000', '#000000', '#80d4ff', '#000000', '#80d4ff',
               '#000000', '#000000', '#80d4ff', '#000000', '#80d4ff', '#80d4ff', '#80d4ff', '#80d4ff', '#000000',
               '#000000', '#000000']
    f_simple = ['#ffcc33', '#ffcc33', '#0047b3', '#0047b3', '#ffcc33', '#7f7f7f', '#0047b3', '#ffcc33', '#0047b3',
                '#ffcc33', '#ffcc33', '#0047b3', '#ffcc33', '#0047b3', '#0047b3', '#0047b3', '#0047b3', '#ffcc33',
                '#ffcc33', '#ffcc33']
    f_none = ['#ffffff'] * 20
    f_amphi = ['#ffcc33', '#29a329', '#b30047', '#b30047', '#f79318', '#80d4ff', '#0047b3', '#ffcc33', '#0047b3',
               '#ffcc33', '#ffcc33', '#80d4ff', '#29a329', '#80d4ff', '#0047b3', '#80d4ff', '#80d4ff', '#ffcc33',
               '#f79318', '#f79318']
    t_rainbow = ['w', 'k', 'w', 'w', 'k', 'w', 'k', 'k', 'w', 'k', 'k', 'k', 'k', 'k', 'w', 'k', 'k', 'k', 'k', 'k']
    t_charge = ['w', 'w', 'k', 'k', 'w', 'w', 'k', 'w', 'k', 'w', 'w', 'w', 'w', 'w', 'k', 'w', 'w', 'w', 'w', 'w']
    t_polar = ['w', 'w', 'k', 'k', 'w', 'w', 'k', 'w', 'k', 'w', 'w', 'k', 'w', 'k', 'k', 'k', 'k', 'w', 'w', 'w']
    t_simple = ['k', 'k', 'w', 'w', 'k', 'w', 'w', 'k', 'w', 'k', 'k', 'k', 'k', 'w', 'w', 'w', 'w', 'k', 'k', 'k']
    t_none = ['k'] * 20
    t_amphi = ['k', 'k', 'w', 'w', 'w', 'k', 'w', 'k', 'w', 'k', 'k', 'k', 'w', 'k', 'w', 'k', 'k', 'k', 'w', 'w']
    palettes = {
        'rainbow': (f_rainbow, t_rainbow),
        'charge': (f_charge, t_charge),
        'polar': (f_polar, t_polar),
        'simple': (f_simple, t_simple),
        'none': (f_none, t_none),
        'amphipathic': (f_amphi, t_amphi),
    }

    d_eisberg = load_scale('eisenberg')[1]  # eisenberg hydrophobicity values for HM

    if lineweights:
        if len(sequence) > 1:
            lw = np.arange(0.1, 5.5, 5. / (len(sequence) - 1))  # line thickness array
            lw = lw[::-1]  # inverse order
        else:
            # A single residue has no connecting lines; the step above would divide by zero.
            lw = np.array([])
    else:
        lw = [2.] * (len(sequence) - 1)

    # check which color coding to use
    if colorcoding not in palettes:
        print("Unknown color coding, 'rainbow' used instead")
        colorcoding = 'rainbow'
    face_colors, text_colors = palettes[colorcoding]
    df = dict(zip(aa, face_colors))
    dt = dict(zip(aa, text_colors))

    # degree to radian
    deg = np.arange(float(len(sequence))) * -100.
    deg = [d + 90. for d in deg]  # start at 270 degree in unit circle (on top)
    rad = np.radians(deg)

    # create figure
    fig = plt.figure(frameon=False, figsize=(10, 10))
    ax = fig.add_subplot(111)

    def connect(start, end, width):
        """Draw a connecting line between two residue positions, behind the circles."""
        line = lines.Line2D((start[0], end[0]), (start[1], end[1]), transform=ax.transData, color='k',
                            linewidth=width)
        line.set_zorder(1)  # 1 = level behind circles
        ax.add_line(line)

    old = None
    hm = list()

    # iterate over sequence
    for i, r in enumerate(rad):
        new = (np.cos(r), np.sin(r))  # new AA coordinates
        if i < 18:
            # plot the connecting lines
            if old is not None:
                connect(old, new, lw[i - 1])
        elif 17 < i < 36:
            connect(old, new, lw[i - 1])
            new = (np.cos(r) * 1.2, np.sin(r) * 1.2)  # residues 19-36 sit on a wider ring
        elif i == 36:
            connect(old, new, lw[i - 1])
            new = (np.cos(r) * 1.4, np.sin(r) * 1.4)
        else:
            new = (np.cos(r) * 1.4, np.sin(r) * 1.4)  # no connecting lines beyond residue 37

        # plot circles
        circ = patches.Circle(new, radius=0.1, transform=ax.transData, edgecolor='k', facecolor=df[sequence[i]])
        circ.set_zorder(2)  # level in front of lines
        ax.add_patch(circ)

        # check if N- or C-terminus and add subscript, then plot AA letter
        if i == 0:
            ax.text(new[0], new[1], sequence[i] + '$_N$', va='center', ha='center', transform=ax.transData,
                    size=32, color=dt[sequence[i]], fontweight='bold')
        elif i == len(sequence) - 1:
            ax.text(new[0], new[1], sequence[i] + '$_C$', va='center', ha='center', transform=ax.transData,
                    size=32, color=dt[sequence[i]], fontweight='bold')
        else:
            ax.text(new[0], new[1], sequence[i], va='center', ha='center', transform=ax.transData,
                    size=36, color=dt[sequence[i]], fontweight='bold')

        eb = d_eisberg[sequence[i]][0]  # eisenberg value for this AA
        hm.append([eb * new[0], eb * new[1]])  # save eisenberg hydrophobicity vector value to later calculate HM

        old = (np.cos(r), np.sin(r))  # save as previous coordinates (always on the unit circle)

    # draw hydrophobic moment arrow if moment option
    if moment:
        v_hm = np.sum(np.array(hm), 0)
        x = .0333 * v_hm[0]
        y = .0333 * v_hm[1]
        ax.arrow(0., 0., x, y, head_width=0.04, head_length=0.03, transform=ax.transData,
                 color='k', linewidth=6.)
        desc = PeptideDescriptor(sequence)  # calculate hydrophobic moment
        desc.calculate_moment()
        if abs(x) < 0.2 and y > 0.:  # right positioning of HM text so arrow does not cover it
            z = -0.2
        else:
            z = 0.2
        plt.text(0., z, str(round(desc.descriptor[0][0], 3)),
                 fontdict={'fontsize': 20, 'fontweight': 'bold', 'ha': 'center'})

    # plot shape
    if len(sequence) < 19:
        ax.set_xlim(-1.2, 1.2)
        ax.set_ylim(-1.2, 1.2)
    else:
        ax.set_xlim(-1.4, 1.4)
        ax.set_ylim(-1.4, 1.4)
    ax.spines['right'].set_visible(False)
    ax.spines['top'].set_visible(False)
    ax.spines['left'].set_visible(False)
    ax.spines['bottom'].set_visible(False)
    cur_axes = plt.gca()
    cur_axes.axes.get_xaxis().set_visible(False)
    cur_axes.axes.get_yaxis().set_visible(False)
    plt.tight_layout()

    if seq:
        plt.title(sequence, fontweight='bold', fontsize=20)

    # show or save plot
    if filename:
        plt.savefig(filename, dpi=150)
    else:
        plt.show()
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Compute modlAMP descriptors for the sequences of a FASTA file.

The amino-acid-scale descriptors (Eisenberg, GRAVY and Z3) are written to a
.csv file; afterwards the global descriptor calculations start with the
sequence length.
"""

from modlamp.descriptors import PeptideDescriptor, GlobalDescriptor

fasta_path = '/path/to/sequences.fasta'
csv_path1 = '/path/to/descriptors1.csv'

# Both descriptor objects read the same sequence file
pepdesc = PeptideDescriptor(fasta_path, 'Eisenberg')  # Eisenberg consensus scale
globdesc = GlobalDescriptor(fasta_path)

# --------------- Peptide Descriptor (AA scales) Calculations ---------------
# Eisenberg: global hydrophobicity, then hydrophobic moment
pepdesc.calculate_global()
pepdesc.calculate_moment(append=True)

# GRAVY: global hydrophobicity, then hydrophobic moment
pepdesc.load_scale('gravy')
pepdesc.calculate_global(append=True)
pepdesc.calculate_moment(append=True)

# old Z scale: global values via a window-1 autocorrelation
pepdesc.load_scale('z3')
pepdesc.calculate_autocorr(1, append=True)

# save descriptor data to .csv file, one header entry per output column
col_names1 = ','.join([
    'ID', 'Sequence',
    'H_Eisenberg', 'uH_Eisenberg',
    'H_GRAVY', 'uH_GRAVY',
    'Z3_1', 'Z3_2', 'Z3_3',
])
pepdesc.save_descriptor(csv_path1, header=col_names1)

# --------------- Global Descriptor Calculations ---------------
globdesc.length()  # sequence length
def analyze_generated(self, num, fname='analysis.txt', plot=False):
    """Method to analyze the generated sequences located in `self.generated`.

    The report written to ``fname`` contains duplicate and training-overlap
    counts, the length distribution, euclidean distances to the training
    data in 'pepcats' autocorrelation space and in scaled global descriptor
    space, and Eisenberg hydrophobic moments. Random (``self.ran``) and
    amphipathic helical (``self.hel``) comparison sets are generated along
    the way and stored on the instance.

    :param num: {int} wanted number of sequences to sample; used to report how many sequences were too short
    :param fname: {str} filename to save analysis info to
    :param plot: {bool} whether to plot an overview of descriptors; the plot is saved next to ``fname`` with a
        ``.png`` extension
    :return: file with analysis info (distances)
    """
    import os  # function-scope import: the module's own import block is not visible from this block

    with open(fname, 'w') as f:
        print("Analyzing...")
        f.write("ANALYSIS OF SAMPLED SEQUENCES\n==============================\n\n")
        f.write("Nr. of duplicates in generated sequences: %i\n" % (len(self.generated) - len(set(self.generated))))
        count = len(set(self.generated) & set(self.sequences))  # get shared entries in both lists
        f.write("%.1f percent of generated sequences are present in the training data.\n" %
                ((count / len(self.generated)) * 100))

        d = GlobalDescriptor(self.generated)
        len1 = len(d.sequences)
        d.filter_aa('B')
        len2 = len(d.sequences)
        d.length()
        min_len = np.min(d.descriptor)
        max_len = np.max(d.descriptor)
        f.write("\n\nLENGTH DISTRIBUTION OF GENERATED DATA:\n\n")
        f.write("Number of sequences too short:\t%i\n" % (num - len1))
        f.write("Number of invalid (with 'B'):\t%i\n" % (len1 - len2))
        f.write("Number of valid unique seqs:\t%i\n" % len2)
        f.write("Mean sequence length: \t\t%.1f ± %.1f\n" % (np.mean(d.descriptor), np.std(d.descriptor)))
        f.write("Median sequence length: \t\t%i\n" % np.median(d.descriptor))
        f.write("Minimal sequence length: \t\t%i\n" % min_len)
        f.write("Maximal sequence length: \t\t%i\n" % max_len)

        descriptor = 'pepcats'
        # the first character of every training sequence is dropped and trailing whitespace stripped
        seq_desc = PeptideDescriptor([s[1:].rstrip() for s in self.sequences], descriptor)
        seq_desc.calculate_autocorr(7)
        gen_desc = PeptideDescriptor(d.sequences, descriptor)
        gen_desc.calculate_autocorr(7)

        # random comparison set
        self.ran = Random(len(self.generated), min_len, max_len)  # generate rand seqs
        probas = count_aas(''.join(seq_desc.sequences)).values()  # get the aa distribution of training seqs
        self.ran.generate_sequences(proba=probas)
        ran_desc = PeptideDescriptor(self.ran.sequences, descriptor)
        ran_desc.calculate_autocorr(7)

        # amphipathic helices comparison set
        self.hel = Helices(len(self.generated), min_len, max_len)
        self.hel.generate_sequences()
        hel_desc = PeptideDescriptor(self.hel.sequences, descriptor)
        hel_desc.calculate_autocorr(7)

        # distance calculation
        f.write("\n\nDISTANCE CALCULATION IN '%s' DESCRIPTOR SPACE\n\n" % descriptor.upper())
        desc_dist = distance.cdist(gen_desc.descriptor, seq_desc.descriptor, metric='euclidean')
        f.write("Average euclidean distance of sampled to training data:\t%.3f +/- %.3f\n" %
                (np.mean(desc_dist), np.std(desc_dist)))
        ran_dist = distance.cdist(ran_desc.descriptor, seq_desc.descriptor, metric='euclidean')
        f.write("Average euclidean distance if randomly sampled seqs:\t%.3f +/- %.3f\n" %
                (np.mean(ran_dist), np.std(ran_dist)))
        hel_dist = distance.cdist(hel_desc.descriptor, seq_desc.descriptor, metric='euclidean')
        f.write("Average euclidean distance if amphipathic helical seqs:\t%.3f +/- %.3f\n" %
                (np.mean(hel_dist), np.std(hel_dist)))

        # more simple descriptors
        g_seq = GlobalDescriptor(seq_desc.sequences)
        g_gen = GlobalDescriptor(gen_desc.sequences)
        g_ran = GlobalDescriptor(ran_desc.sequences)
        g_hel = GlobalDescriptor(hel_desc.sequences)
        g_seq.calculate_all()
        g_gen.calculate_all()
        g_ran.calculate_all()
        g_hel.calculate_all()
        sclr = StandardScaler()
        sclr.fit(g_seq.descriptor)
        scaled_training = sclr.transform(g_seq.descriptor)  # scaled once, reused for all three comparisons
        f.write("\n\nDISTANCE CALCULATION FOR SCALED GLOBAL DESCRIPTORS\n\n")
        desc_dist = distance.cdist(sclr.transform(g_gen.descriptor), scaled_training, metric='euclidean')
        f.write("Average euclidean distance of sampled to training data:\t%.2f +/- %.2f\n" %
                (np.mean(desc_dist), np.std(desc_dist)))
        ran_dist = distance.cdist(sclr.transform(g_ran.descriptor), scaled_training, metric='euclidean')
        f.write("Average euclidean distance if randomly sampled seqs:\t%.2f +/- %.2f\n" %
                (np.mean(ran_dist), np.std(ran_dist)))
        hel_dist = distance.cdist(sclr.transform(g_hel.descriptor), scaled_training, metric='euclidean')
        f.write("Average euclidean distance if amphipathic helical seqs:\t%.2f +/- %.2f\n" %
                (np.mean(hel_dist), np.std(hel_dist)))

        # hydrophobic moments
        uh_seq = PeptideDescriptor(seq_desc.sequences, 'eisenberg')
        uh_seq.calculate_moment()
        uh_gen = PeptideDescriptor(gen_desc.sequences, 'eisenberg')
        uh_gen.calculate_moment()
        uh_ran = PeptideDescriptor(ran_desc.sequences, 'eisenberg')
        uh_ran.calculate_moment()
        uh_hel = PeptideDescriptor(hel_desc.sequences, 'eisenberg')
        uh_hel.calculate_moment()
        f.write("\n\nHYDROPHOBIC MOMENTS\n\n")
        f.write("Hydrophobic moment of training seqs:\t%.3f +/- %.3f\n" %
                (np.mean(uh_seq.descriptor), np.std(uh_seq.descriptor)))
        f.write("Hydrophobic moment of sampled seqs:\t\t%.3f +/- %.3f\n" %
                (np.mean(uh_gen.descriptor), np.std(uh_gen.descriptor)))
        f.write("Hydrophobic moment of random seqs:\t\t%.3f +/- %.3f\n" %
                (np.mean(uh_ran.descriptor), np.std(uh_ran.descriptor)))
        f.write("Hydrophobic moment of amphipathic seqs:\t%.3f +/- %.3f\n" %
                (np.mean(uh_hel.descriptor), np.std(uh_hel.descriptor)))

    if plot:
        if self.refs:
            a = GlobalAnalysis([uh_seq.sequences, uh_gen.sequences, uh_hel.sequences, uh_ran.sequences],
                               ['training', 'sampled', 'hel', 'ran'])
        else:
            a = GlobalAnalysis([uh_seq.sequences, uh_gen.sequences], ['training', 'sampled'])
        # Swap the real extension for '.png'. Slicing off the last four characters (``fname[:-4]``) mangled
        # names whose extension is not exactly three letters, or that have no extension at all.
        a.plot_summary(filename=os.path.splitext(fname)[0] + '.png')
def describe_sequences(): path = r"C:\Users\Patrick\OneDrive - University College Dublin\Bioinformatics\HemolyticStudies\BOTH_peptides.json" aa_letters = [ 'A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y' ] di_letters = ["%s%s" % (a, b) for a in aa_letters for b in aa_letters] tri_letters = [ "%s%s%s" % (a, b, c) for a in aa_letters for b in aa_letters for c in aa_letters ] conjoint_letters = ["A", "I", "Y", "H", "R", "D", "C"] letters = { 1: aa_letters, 2: di_letters, 3: tri_letters, 4: conjoint_letters } #Conjoint src = https://bmcbioinformatics.biomedcentral.com/articles/10.1186/s12859-015-0828-1 conjoint_dict = { "A": "A", "G": "A", "V": "A", "I": "I", "L": "I", "F": "I", "P": "I", "Y": "Y", "M": "Y", "T": "Y", "S": "Y", "H": "H", "N": "H", "Q": "H", "W": "H", "R": "R", "K": "R", "D": "D", "E": "D", "C": "C", } def counter(string, seq_type): ''' A function for counting the number of letters present. Returns a list of (letter, #occurances) tuples. ''' l = len(string) d = {i: 0 for i in letters[seq_type]} if seq_type == 1: for s in string: try: d[s] += 1.0 except KeyError: d[s] = 1.0 d = {k: d[k] / l for k in d} if seq_type == 2: for a in range(l - 1): s = string[a:a + seq_type] try: d[s] += 1.0 except KeyError: d[s] = 1.0 d = {k: d[k] / (l - 1) for k in d} if seq_type == 3: for a in range(l - 2): s = string[a:a + seq_type] try: d[s] += 1.0 except KeyError: d[s] = 1.0 d = {k: d[k] / (l - 2) for k in d} return d def counter_boolean(string, seq_type): ''' A function for counting the number of letters present. Returns a list of (letter, #occurances) tuples. ''' l = len(string) d = {i: 0 for i in letters[seq_type]} if seq_type == 1: for s in string: try: d[s] = 1.0 except KeyError: d[s] = 1.0 if seq_type == 2: for a in range(l - 1): s = string[a:a + seq_type] try: d[s] = 1.0 except KeyError: d[s] = 1.0 return d def counter_abs(string, seq_type): ''' A function for counting the number of letters present. 
Returns a list of (letter, #occurances) tuples. ''' l = len(string) d = {i: 0 for i in letters[seq_type]} if seq_type == 1: for s in string: try: d[s] = d[s] + 1.0 except KeyError: d[s] = 1.0 if seq_type == 2: for a in range(l - 1): s = string[a:a + seq_type] try: d[s] = d[s] + 1.0 except KeyError: d[s] = 1.0 return d def residue_distribution(all_residues, seq_type, dp): ''' Takes as arguments a string with letters, and the type of sequence represented. Returns an alphabetically ordered string of relative frequencies, correct to three decimal places. ''' d = counter(all_residues, seq_type) if seq_type == 1: residue_counts = list( sorted([(i, d[i]) for i in letters[seq_type] ])) ##Removes ambiguous letters elif seq_type == 2: residue_counts = list( sorted([(i, d[i]) for i in letters[seq_type] if dp[i] >= 50])) elif seq_type == 3: residue_counts = list( sorted([(i, d[i]) for i in letters[seq_type] if tp[i] >= 20])) elif seq_type == 4: residue_counts = list( sorted([(i, d[i]) for i in letters[seq_type]])) r_c = [i[1] for i in residue_counts] dis = np.array([ r_c, ]) return dis def residue_boolean(all_residues, seq_type, dp): ''' Takes as arguments a string with letters, and the type of sequence represented. Returns an alphabetically ordered string of relative frequencies, correct to three decimal places. ''' d = counter_boolean(all_residues, seq_type) if seq_type == 1: residue_counts = list( sorted([(i, d[i]) for i in letters[seq_type] ])) ##Removes ambiguous letters elif seq_type == 2: residue_counts = list( sorted([(i, d[i]) for i in letters[seq_type] if dp[i] >= 50])) r_c = [i[1] for i in residue_counts] dis = np.array([ r_c, ]) return dis def residue_abs(all_residues, seq_type, dp): ''' Takes as arguments a string with letters, and the type of sequence represented. Returns an alphabetically ordered string of relative frequencies, correct to three decimal places. 
''' d = counter_abs(all_residues, seq_type) if seq_type == 1: residue_counts = list( sorted([(i, d[i]) for i in letters[seq_type] ])) ##Removes ambiguous letters elif seq_type == 2: residue_counts = list( sorted([(i, d[i]) for i in letters[seq_type] if dp[i] >= 50])) r_c = [i[1] for i in residue_counts] dis = np.array([ r_c, ]) return dis with open(path, "r") as f: text = f.read() peptides = eval(text)["Peptides"] train_peptides, test_peptides = train_test_split(peptides, test_size=0.15, random_state=42) train_peptides_seqs = [peptide["seq"] for peptide in train_peptides] for peptide in peptides: if peptide["seq"] in train_peptides_seqs: peptide["train"] = True else: peptide["train"] = False print(len([p for p in peptides if p["train"] == True])) print(len([p for p in peptides if p["train"] == False])) new_peptides = [] for peptide in peptides: if peptide["train"] == True: new_peptide = peptide.copy() new_seq = ''.join(reversed(peptide["seq"])) new_peptide["seq"] = new_seq new_peptides.append(new_peptide) #peptides.extend(new_peptides) random.shuffle(peptides) print(len([p for p in peptides if p["train"] == True])) print(len([p for p in peptides if p["train"] == False])) print("doubling complete") dp = {i: 0 for i in letters[2]} tp = {i: 0 for i in letters[3]} name_i = 0 for peptide in peptides: temp_set = set() seq = peptide["seq"] l = len(seq) for a in range(l - 1): s = seq[a:a + 2] temp_set.add(s) for s in temp_set: dp[s] = dp[s] + 1 for peptide in peptides: temp_set = set() seq = peptide["seq"] l = len(seq) for a in range(l - 2): s = seq[a:a + 3] temp_set.add(s) for s in temp_set: tp[s] = tp[s] + 1 for peptide in peptides: peptide["conjoint_seq"] = "".join( [conjoint_dict[letter] for letter in peptide["seq"]]) for peptide in peptides: globdesc = GlobalDescriptor(peptide["seq"]) globdesc.calculate_all(amide=peptide["cTer"] == "Amidation") ctdc = CTD.CalculateC(peptide["seq"]) ctdc_keys = list(sorted(list([key for key in ctdc]))) ctdc_vals = np.array([[ctdc[key] 
for key in ctdc_keys]]) conjointtriad = ConjointTriad.CalculateConjointTriad(peptide["seq"]) conjointtriad_keys = list(sorted(list([key for key in conjointtriad]))) conjointtriad_vals = np.array( [[conjointtriad[key] for key in conjointtriad_keys]]) conjoint_dis = residue_distribution(peptide["conjoint_seq"], 4, None) #peptide["GlobalDescriptor"] = globdesc #print(peptide["GlobalDescriptor"].descriptor) #Eisenberg hydrophobicity consensus #Take most of the values from here pepdesc = PeptideDescriptor(peptide["seq"], "eisenberg") pepdesc.calculate_global(modality="mean", append=True) pepdesc.calculate_global(modality="max", append=True) pepdesc.calculate_moment(modality="max", append=True) pepdesc.calculate_moment(modality="mean", append=True) #pepdesc.calculate_profile(append=True, prof_type = "uH") pepdesc.load_scale("Ez") pepdesc.calculate_global(modality="mean", append=True) pepdesc.calculate_global(modality="max", append=True) pepdesc.load_scale("aasi") pepdesc.calculate_global(append=True) pepdesc.calculate_moment(modality="max", append=True) pepdesc.calculate_moment(modality="mean", append=True) pepdesc.load_scale("abhprk") pepdesc.calculate_global(modality="mean", append=True) pepdesc.calculate_global(modality="max", append=True) pepdesc.load_scale("charge_acid") pepdesc.calculate_global(modality="mean", append=True) pepdesc.calculate_global(modality="max", append=True) pepdesc.calculate_moment(modality="max", append=True) pepdesc.calculate_moment(modality="mean", append=True) pepdesc.load_scale("cougar") pepdesc.calculate_global(modality="mean", append=True) pepdesc.calculate_global(modality="max", append=True) pepdesc.load_scale("gravy") pepdesc.calculate_global(modality="mean", append=True) pepdesc.calculate_global(modality="max", append=True) pepdesc.calculate_moment(modality="max", append=True) pepdesc.calculate_moment(modality="mean", append=True) pepdesc.load_scale("hopp-woods") pepdesc.calculate_global(modality="mean", append=True) 
pepdesc.calculate_global(modality="max", append=True) pepdesc.calculate_moment(modality="max", append=True) pepdesc.calculate_moment(modality="mean", append=True) pepdesc.load_scale("kytedoolittle") pepdesc.calculate_global(modality="mean", append=True) pepdesc.calculate_global(modality="max", append=True) pepdesc.calculate_moment(modality="max", append=True) pepdesc.calculate_moment(modality="mean", append=True) pepdesc.load_scale("ppcali") pepdesc.calculate_global(modality="mean", append=True) pepdesc.calculate_global(modality="max", append=True) pepdesc.load_scale("msw") pepdesc.calculate_global(modality="mean", append=True) pepdesc.calculate_global(modality="max", append=True) pepdesc.load_scale("charge_phys") pepdesc.calculate_moment(modality="max", append=True) pepdesc.calculate_moment(modality="mean", append=True) pepdesc.calculate_global(modality="mean", append=True) pepdesc.calculate_global(modality="max", append=True) pepdesc.load_scale("flexibility") pepdesc.calculate_moment(modality="max", append=True) pepdesc.calculate_moment(modality="mean", append=True) pepdesc.calculate_global(modality="mean", append=True) pepdesc.calculate_global(modality="max", append=True) pepdesc.load_scale("bulkiness") pepdesc.calculate_moment(modality="max", append=True) pepdesc.calculate_moment(modality="mean", append=True) pepdesc.calculate_global(modality="mean", append=True) pepdesc.calculate_global(modality="max", append=True) pepdesc.load_scale("TM_tend") pepdesc.calculate_moment(modality="max", append=True) pepdesc.calculate_moment(modality="mean", append=True) pepdesc.calculate_global(modality="mean", append=True) pepdesc.calculate_global(modality="max", append=True) pepdesc.load_scale("mss") pepdesc.calculate_moment(modality="max", append=True) pepdesc.calculate_moment(modality="mean", append=True) pepdesc.calculate_global(modality="mean", append=True) pepdesc.calculate_global(modality="max", append=True) pepdesc.load_scale("t_scale") 
pepdesc.calculate_global(modality="mean", append=True) pepdesc.calculate_global(modality="max", append=True) pepdesc.load_scale("peparc") pepdesc.calculate_arc(modality="max", append=True) pepdesc.calculate_arc(modality="mean", append=True) pepdesc.load_scale("msw") pepdesc.calculate_global(modality="mean", append=True) pepdesc.calculate_global(modality="max", append=True) pepdesc.load_scale("polarity") pepdesc.calculate_moment(modality="max", append=True) pepdesc.calculate_moment(modality="mean", append=True) pepdesc.calculate_global(modality="mean", append=True) pepdesc.calculate_global(modality="max", append=True) pepdesc.load_scale("pepcats") pepdesc.calculate_global(modality="mean", append=True) pepdesc.calculate_global(modality="max", append=True) pepdesc.load_scale("isaeci") pepdesc.calculate_global(modality="mean", append=True) pepdesc.calculate_global(modality="max", append=True) pepdesc.load_scale("refractivity") pepdesc.calculate_moment(modality="max", append=True) pepdesc.calculate_moment(modality="mean", append=True) pepdesc.calculate_global(modality="mean", append=True) pepdesc.calculate_global(modality="max", append=True) pepdesc.load_scale("z3") pepdesc.calculate_global(modality="mean", append=True) pepdesc.calculate_global(modality="max", append=True) pepdesc.load_scale("z5") pepdesc.calculate_global(modality="mean", append=True) pepdesc.calculate_global(modality="max", append=True) #pepdesc.load_scale("PPCALI") #pepdesc.calculate_autocorr(2) #peptide["PeptideDescriptor"] = pepdesc protein = PyPro() protein.ReadProteinSequence(peptide["seq"]) paac = protein.GetPAAC(lamda=1, weight=0.05) paac2 = [[ paac[a] for a in list( sorted([k for k in paac], key=lambda x: int(x.replace("PAAC", "")))) ]] cTer = np.array([[1 if peptide["cTer"] == "Amidation" else 0]]) paac = np.array(paac2) analysed_seq = ProteinAnalysis(peptide["seq"]) secondary_structure_fraction = np.array( [analysed_seq.secondary_structure_fraction()]) peptide["TotalDescriptor"] = str( 
np.concatenate((pepdesc.descriptor, globdesc.descriptor), axis=1)) try: pepid = np.array([[ int(peptide["id"].replace("HEMOLYTIK", "").replace( "DRAMP", "").replace("DBAASP", "")) ]]) except KeyError: pepid = 0 pep_train = np.array([[1 if peptide["train"] == True else 0]]) freq_1d = residue_distribution(peptide["seq"], 1, dp) freq_2d = residue_distribution(peptide["seq"], 2, dp) freq_3d = residue_distribution(peptide["seq"], 3, dp) freq_1dbool = residue_boolean(peptide["seq"], 1, dp) freq_2dbool = residue_boolean(peptide["seq"], 2, dp) freq_1dabs = residue_abs(peptide["seq"], 1, dp) freq_2dabs = residue_abs(peptide["seq"], 2, dp) len_peptide = np.array([[len(peptide["seq"])]]) if peptide["activity"] == "YES": pepact = 1 else: pepact = 0 pepact = np.array([[pepact]]) peptide_di2 = di2(peptide["seq"]) peptide_di3 = di3(peptide["conjoint_seq"]) ####################### AAindex ######################### to_get = [ ("CHAM810101", "mean"), #Steric Hinderance ("CHAM810101", "total"), #Steric Hinderance ("KYTJ820101", "mean"), #Hydropathy ("KLEP840101", "total"), #Charge ("KLEP840101", "mean"), #Charge ("MITS020101", "mean"), #Amphiphilicity ("FAUJ830101", "mean"), #Hydrophobic parameter pi ("GOLD730102", "total"), #Residue volume ("MEEJ800101", "mean"), #Retention coefficient in HPLC ("OOBM850105", "mean"), #Optimized side chain interaction parameter ("OOBM850105", "total"), #Optimized side chain interaction parameter ("VELV850101", "total"), #Electron-ion interaction parameter ("VELV850101", "mean"), #Electron-ion interaction parameter ("PUNT030102", "mean"), #Knowledge-based membrane-propensity scale from 3D_Helix ("BHAR880101", "mean"), #Average flexibility indeces ("KRIW790102", "mean"), #Fraction of site occupied by water ("PLIV810101", "mean"), #Partition coefficient ("ZIMJ680102", "mean"), #Bulkiness ("ZIMJ680102", "total"), #Bulkiness ("ZHOH040101", "mean"), #Stability scale ("CHAM820102", "total"), #Free energy solubility in water #From HemoPi: src = 
https://github.com/riteshcanfly/Hemopi/blob/master/pcCalculator.java ("HOPT810101", "mean"), #Hydrophilicity ("EISD840101", "mean"), #Hydrophobicity ("FAUJ880109", "total"), #Net Hydrogen ("EISD860101", "mean"), #Solvation ] tetra_peptides = [ "KLLL", # src = https://github.com/riteshcanfly/Hemopi/blob/master/tetrapos.txt "GCSC", "AAAK", "KLLS", "LGKL", "VLKA", "LLGK", "LVGA", "LSDF", "SDFK", "SWLR", "WLRD", ] tp_bin = [] for t_p in tetra_peptides: if t_p in peptide["seq"]: tp_bin.append(1) else: tp_bin.append(0) tp_bin = np.array([tp_bin]) for identifier, mode in to_get: x = aaf(peptide["seq"], identifier, mode) aminoacidindeces = np.array([[ aaf(peptide["seq"], identifier, mode) for identifier, mode in to_get ]]) peptide["array"] = np.concatenate( ( pepid, pep_train, pepdesc.descriptor, globdesc.descriptor, len_peptide, cTer, secondary_structure_fraction, aminoacidindeces, ctdc_vals, conjointtriad_vals, tp_bin, freq_1d, freq_2d, freq_3d, freq_1dbool, freq_2dbool, freq_1dabs, freq_2dabs, peptide_di2, peptide_di3, #Conjoint Alphabet paac, pepact, ), axis=1) #print(peptide["TotalDescriptor"]) x = np.concatenate([peptide["array"] for peptide in peptides], axis=0) np.save("peptides_array", x, allow_pickle=False)
def _add_features_to_peptide_series(self, peptide, index, n_cluster=-1, lpvs=None):
    """Assemble the feature and annotation values for one peptide.

    A NaN-filled ``pd.Series`` indexed by ``index`` is created and then
    filled group by group under ``(group type, feature name)`` keys:
    MS intensity deltas, MS boolean flags, MS frequencies, MS counts,
    chemical descriptors and, last, the annotations.

    :param peptide: peptide to describe; its ``start``/``stop`` positions,
        ``slice``, ``seq`` and length are read here
    :param index: index used for the returned series
    :param n_cluster: cluster number; used to look up
        ``self.cluster_coverage`` and stored as the "Cluster" annotation
    :param lpvs: optional collection of peptides; membership is stored as
        the "LPV" annotation. ``None`` is treated as an empty set
    :return: the populated ``pd.Series``
    """
    # Intensity weights: d = delta, pd = penalty delta.
    #
    # TODO: only d_start and d_stop depend on imp_val. pd_start and pd_stop
    # do not, because they always lie between a d_start and a d_stop and
    # should therefore be above imp_val. d_start and d_stop could thus be
    # written out as the pairs
    #     [before_start, after_start], [before_stop, after_stop]
    # Example, with raw data = [0, 0, 5, 5, 7, 7, 5, 5, 0, 0]:
    #     peptide 3--------------8  ->  before_start, after_start = [0, 5]
    #     peptide 5--6              ->  before_start, after_start = [5, 7]
    # A non-linear model could then formulate the w_start term as
    #     w_start * (after_start - max(before_start, imp_val))
    # which matches how the grid search currently works (imp_val=4):
    #     d_start = 5 - max(0, 4) = 1
    #     d_start = 7 - max(5, 4) = 2
    known_lpvs = set() if lpvs is None else lpvs

    first_idx = peptide.start.index
    last_idx = peptide.stop.index

    # --- MS delta (intensity) -------------------------------------------
    row = pd.Series(np.full(len(index), np.nan), index=index)
    intensity = self.ms_intensity_features.type
    row[intensity, 'start'] = self.start_scores[first_idx]
    row[intensity, 'stop'] = self.stop_scores[last_idx]
    if len(peptide) > 4:
        # Penalties are summed over the interior of the peptide only.
        interior = SequenceRange(peptide.start + 1, peptide.stop - 1,
                                 validate=False)
        row[intensity, 'penalty_start'] = self.start_scores[interior.slice].sum()
        row[intensity, 'penalty_stop'] = self.stop_scores[interior.slice].sum()
    else:
        row[intensity, 'penalty_start'] = 0
        row[intensity, 'penalty_stop'] = 0

    # --- MS bool ----------------------------------------------------------
    observed_flag, observed_freq = self._calc_observed(peptide)
    bool_group = self.ms_bool_features.type
    row[bool_group, "first"] = self.h_first[first_idx]
    row[bool_group, "last"] = self.h_last[last_idx]
    row[bool_group, "observed"] = observed_flag

    # --- MS frequency (incl. PTM weights) ----------------------------------
    # TODO: should it get extra penalties if there are PTMs between start
    # and end?
    frequency = self.ms_frequency_features.type
    row[frequency, 'acetylation'] = self.ac_freq[first_idx]
    row[frequency, 'amidation'] = self.am_freq[last_idx]
    row[frequency, 'start'] = self.h_start_freq[first_idx]
    row[frequency, 'stop'] = self.h_stop_freq[last_idx]
    row[frequency, 'observed'] = observed_freq
    row[frequency, 'sample'] = self.h_sample[peptide.slice].min()
    ladder_start = self.h_ladder_start[first_idx]
    row[frequency, 'ladder'] = ladder_start * self.h_ladder_stop[last_idx]
    row[frequency, 'protein_coverage'] = self.protein_coverage
    row[frequency, 'cluster_coverage'] = self.cluster_coverage[n_cluster]
    # These are good features, but there may be better ways to extract them.
    bond_slice = self.get_bond_slice(peptide)
    row[frequency, 'bond'] = self.h_bond[bond_slice].min()

    # --- MS counts ----------------------------------------------------------
    counts = self.ms_count_features.type
    row[counts, 'start'] = self.start_counts[peptide.start]
    row[counts, 'stop'] = self.stop_counts[peptide.stop]
    # A 'ladder' count (h_ladder_start + h_ladder_stop) is currently disabled.

    # --- Chemical -----------------------------------------------------------
    sequence = self.protein_sequence[peptide.slice]
    global_desc = GlobalDescriptor(sequence)
    # The peptide is treated as amidated when its amidation frequency
    # exceeds 0.05.
    amidated = row[frequency, 'amidation'] > 0.05
    global_desc.calculate_all(amide=amidated)
    chemical = self.chemical_features.type
    for column, feature_name in enumerate(global_desc.featurenames):
        if feature_name not in self.chemical_features.features:
            continue
        row[chemical, feature_name] = global_desc.descriptor[0, column]

    moment_desc = PeptideDescriptor(sequence, 'eisenberg')
    moment_desc.calculate_moment()
    row[chemical, 'eisenberg'] = moment_desc.descriptor.flatten()[0]

    # --- Annotations ----------------------------------------------------------
    annotation = self.annotations.type
    row[annotation, "Known"] = peptide in self.known_peptides
    # A "Type" annotation is currently disabled.
    row[annotation, "Cluster"] = n_cluster
    row[annotation, "Sequence"] = peptide.seq
    row[annotation, "LPV"] = False  # TODO! placeholder, overwritten below
    row[annotation, "N Flanking"] = self.get_nflanking_region(
        peptide.start, self.protein_sequence)
    row[annotation, "C Flanking"] = self.get_cflanking_region(
        peptide.stop, self.protein_sequence)
    row[annotation, "LPV"] = peptide in known_lpvs

    if observed_freq != 0:
        # Total intensity over every row of self.df matching this peptide's
        # start/stop positions.
        selector = (slice(None), slice(None),
                    peptide.start.pos, peptide.stop.pos)
        row[annotation, "Intensity"] = self.df.loc[selector, :].sum().sum()

    return row
def describe_sequences():
    """Build and save descriptor rows for a fixed set of test peptides.

    For every hard-coded peptide one feature row is assembled from: the
    numeric peptide id (0 when the entry has no "id" key), the accumulated
    ``PeptideDescriptor`` values for several scales, the output of
    ``GlobalDescriptor.calculate_all``, the sequence length, the relative
    single-residue frequencies and the activity label (1 for "YES", else 0).
    The rows are stacked, printed and written with ``np.save`` to
    "hemolytik_array_custom_tests".
    """
    amino_acids = list("ACDEFGHIKLMNPQRSTVWY")
    residue_pairs = [first + second
                     for first in amino_acids
                     for second in amino_acids]
    alphabet = {1: amino_acids, 2: residue_pairs}

    def counter(string, seq_type):
        """Return a dict of relative frequencies for ``string``.

        With ``seq_type == 1`` single letters are counted and divided by
        the string length; with ``seq_type == 2`` overlapping letter pairs
        are counted and divided by ``length - 1``. Keys outside the known
        alphabet are added to the dict as they are encountered.
        """
        length = len(string)
        tally = dict.fromkeys(alphabet[seq_type], 0)
        if seq_type == 1:
            for letter in string:
                tally[letter] = tally.get(letter, 0) + 1.0
            tally = {key: value / length for key, value in tally.items()}
        if seq_type == 2:
            for offset in range(length - 1):
                pair = string[offset:offset + seq_type]
                tally[pair] = tally.get(pair, 0) + 1.0
            tally = {key: value / (length - 1)
                     for key, value in tally.items()}
        return tally

    def residue_distribution(all_residues, seq_type):
        """Return a 1 x N array of relative frequencies for ``all_residues``.

        The columns follow the sorted alphabet for ``seq_type``; anything
        counted outside that alphabet (ambiguous letters) is left out.
        """
        frequencies = counter(all_residues, seq_type)
        ordered = [frequencies[key] for key in sorted(alphabet[seq_type])]
        return np.array([ordered])

    peptides = [
        {"seq": "FLPILASLAAKFGPKLFCLVTKKC", "cTer": None, "activity": "YES"},
        {"seq": "ILGPVISTIGGVLGGLLKNL", "cTer": "Amidation", "activity": "YES"},
        {"seq": "GIGGKILSGLKTALKGAAKELASTYLH", "cTer": None, "activity": "NO"},
        {"seq": "GIGSAILSAGKSALKGLAKGLAEHFAN", "cTer": None, "activity": "NO"},
        {"seq": "FLSLIPHAINAVSAIAKHF", "cTer": "Amidation", "activity": "NO"},
    ]

    # Scales loaded after the initial "eisenberg" pass. The flag says whether
    # calculate_moment is run (before calculate_global) for that scale.
    follow_up_scales = (
        ("Ez", False),
        ("charge_phys", True),
        ("flexibility", True),
        ("polarity", True),
        ("isaeci", False),
        ("refractivity", True),
        ("z5", False),
    )

    for entry in peptides:
        sequence = entry["seq"]

        globdesc = GlobalDescriptor(sequence)
        globdesc.calculate_all(amide=entry["cTer"] == "Amidation")

        # Eisenberg hydrophobicity consensus comes first; every later
        # calculation appends to the same descriptor array.
        pepdesc = PeptideDescriptor(sequence, "eisenberg")
        pepdesc.calculate_global()
        pepdesc.calculate_moment(append=True)
        # (A "uH" profile via calculate_profile is currently disabled.)
        for scale_name, with_moment in follow_up_scales:
            pepdesc.load_scale(scale_name)
            if with_moment:
                pepdesc.calculate_moment(append=True)
            pepdesc.calculate_global(append=True)

        combined = np.concatenate(
            (pepdesc.descriptor, globdesc.descriptor), axis=1)
        entry["TotalDescriptor"] = str(combined)

        # Entries without an "id" key get id 0.
        try:
            raw_id = entry["id"]
        except KeyError:
            numeric_id = 0
        else:
            numeric_id = int(raw_id.replace("HEMOLYTIK", ""))
        pepid = np.array([[numeric_id]])

        freq_1d = residue_distribution(sequence, 1)
        # freq_2d and peptide_di2 are still computed but are currently left
        # out of the final feature row.
        freq_2d = residue_distribution(sequence, 2)
        len_peptide = np.array([[len(sequence)]])
        pepact = np.array([[1 if entry["activity"] == "YES" else 0]])
        peptide_di2 = di2(sequence)

        feature_blocks = (
            pepid,
            pepdesc.descriptor,
            globdesc.descriptor,
            len_peptide,
            freq_1d,
            pepact,
        )
        entry["array"] = np.concatenate(feature_blocks, axis=1)

    stacked = np.concatenate([entry["array"] for entry in peptides], axis=0)
    print(stacked)
    np.save("hemolytik_array_custom_tests", stacked, allow_pickle=False)