def calc_len(self):
    """Method to get the sequence length of all sequences in the library.

    :return: {numpy.ndarray} sequence lengths in the attribute :py:attr:`len`.
    """
    for l in range(self.library.shape[0]):
        d = GlobalDescriptor(self.library[l])
        d.length()
        self.len.append(d.descriptor[:, 0])
def analyze_training(self):
    """Method to analyze the length distribution of the training data.

    :return: prints out information about the length distribution of the sequences in ``self.sequences``
    """
    d = GlobalDescriptor(self.sequences)
    d.length()
    print("\nLENGTH DISTRIBUTION OF TRAINING DATA:\n")
    print("Number of sequences:    \t%i" % len(self.sequences))
    print("Mean sequence length:   \t%.1f ± %.1f" % (np.mean(d.descriptor), np.std(d.descriptor)))
    print("Median sequence length: \t%i" % np.median(d.descriptor))
    print("Minimal sequence length:\t%i" % np.min(d.descriptor))
    print("Maximal sequence length:\t%i" % np.max(d.descriptor))
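# For reference, a minimal, self-contained sketch of the modlamp GlobalDescriptor
# length workflow that calc_len() and analyze_training() rely on. The example
# peptide sequences below are made up purely for illustration.
import numpy as np
from modlamp.descriptors import GlobalDescriptor

example_seqs = ['KLLKLLKKLLKLLK', 'GLFDIVKKVVGALG', 'FFHHIFRGIVHVGKTIHRLVTG']
d = GlobalDescriptor(example_seqs)
d.length()  # fills d.descriptor with one length value per sequence
print("Mean sequence length: %.1f ± %.1f" % (np.mean(d.descriptor), np.std(d.descriptor)))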
def makeintlistdic_from_allep(dir_name, run_dir):
    """Collect the per-epoch mean and population variance of sequence length,
    isoelectric point and hydrophobic ratio from the files <dir_name><run_dir><epoch>.txt."""
    i = 1
    intlistdic = {}
    len_ave_list, pi_ave_list, hyd_ave_list = [], [], []
    len_var_list, pi_var_list, hyd_var_list = [], [], []
    while True:
        if os.path.exists(dir_name + run_dir + str(i) + '.txt'):
            len_list_ep, pi_list_ep, hyd_list_ep = [], [], []
            with open(dir_name + run_dir + str(i) + '.txt') as f:
                for line in f:
                    seq = line.rstrip('\n')  # one sequence per line
                    desc = GlobalDescriptor(seq)
                    desc.length()
                    len_list_ep.append(desc.descriptor[0][0])
                    desc.isoelectric_point()
                    pi_list_ep.append(desc.descriptor[0][0])
                    desc.hydrophobic_ratio()
                    hyd_list_ep.append(desc.descriptor[0][0])
            # per-epoch means
            len_ave_list.append(round(statistics.mean(len_list_ep), 3))
            pi_ave_list.append(round(statistics.mean(pi_list_ep), 3))
            hyd_ave_list.append(round(statistics.mean(hyd_list_ep), 3))
            # per-epoch population variances
            len_var_list.append(round(statistics.pvariance(len_list_ep), 3))
            pi_var_list.append(round(statistics.pvariance(pi_list_ep), 3))
            hyd_var_list.append(round(statistics.pvariance(hyd_list_ep), 3))
            i += 1
        else:
            break
    intlistdic["len_ave"] = len_ave_list
    intlistdic["pi_ave"] = pi_ave_list
    intlistdic["hyd_ave"] = hyd_ave_list
    intlistdic["len_var"] = len_var_list
    intlistdic["pi_var"] = pi_var_list
    intlistdic["hyd_var"] = hyd_var_list
    return intlistdic
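# Hypothetical usage sketch for makeintlistdic_from_allep(). It assumes the epoch
# files live at results/run1_1.txt, results/run1_2.txt, ... with one generated
# sequence per line; the directory and run names are illustrative only. The module
# defining the function needs `import os`, `import statistics` and
# `from modlamp.descriptors import GlobalDescriptor`.
stats = makeintlistdic_from_allep('results/', 'run1_')
print(stats['len_ave'])  # mean sequence length per epoch
print(stats['hyd_var'])  # population variance of the hydrophobic ratio per epoch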
def predict():
    if request.method == 'POST':
        seq = request.form['seq']
        with open("random.fasta", "w") as fp:
            fp.write(seq)

        pepdesc = PeptideDescriptor('/home/sanika/proj/random.fasta', 'eisenberg')  # use Eisenberg consensus scale
        globdesc = GlobalDescriptor('/home/sanika/proj/random.fasta')

        # --------------- Peptide Descriptor (AA scales) Calculations ---------------
        pepdesc.calculate_global()               # calculate global Eisenberg hydrophobicity
        pepdesc.calculate_moment(append=True)    # calculate Eisenberg hydrophobic moment

        # load other AA scales
        pepdesc.load_scale('gravy')              # load GRAVY scale
        pepdesc.calculate_global(append=True)    # calculate global GRAVY hydrophobicity
        pepdesc.calculate_moment(append=True)    # calculate GRAVY hydrophobic moment
        pepdesc.load_scale('z3')                 # load old Z scale
        pepdesc.calculate_autocorr(1, append=True)  # calculate global Z scale (= window-1 autocorrelation)

        # --------------- Global Descriptor Calculations ---------------
        globdesc.length()                        # sequence length
        globdesc.boman_index(append=True)        # Boman index
        globdesc.aromaticity(append=True)        # global aromaticity
        globdesc.aliphatic_index(append=True)    # aliphatic index
        globdesc.instability_index(append=True)  # instability index
        globdesc.calculate_charge(ph=7.4, amide=False, append=True)  # net charge
        globdesc.calculate_MW(amide=False, append=True)              # molecular weight

        f1 = pepdesc.descriptor
        f2 = globdesc.descriptor
        result = np.concatenate((f2, f1), axis=1)

        clf = joblib.load('ml_model.pkl')
        pred = clf.predict(result)                 # sparse multilabel prediction
        proba = clf.predict_proba(result).tocoo()
        mc = pred.tocoo()
        out = mc.col

        res = []
        labels = ['antiviral', 'antibacterial', 'antifungal']
        values = proba.data
        plt.pie(values, labels=labels, autopct='%.0f%%', shadow=True, radius=0.5)
        plt.savefig('/home/sanika/proj/pie_chart.jpg')
        figfile = BytesIO()
        plt.savefig(figfile, format='png')
        figfile.seek(0)
        figdata_png = base64.b64encode(figfile.getvalue()).decode('ascii')
        plt.close()

        for i in range(len(out)):
            if out[i] == 0:
                res.append("antiviral")
            elif out[i] == 1:
                res.append("antibacterial")
            else:
                res.append("antifungal")
        return render_template('seq.html', seq=res, result=figdata_png)
    return render_template('predictor.html')
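# Sketch (an assumption, not the original training code) of how a compatible
# 'ml_model.pkl' could be produced. The route above only tells us that
# clf.predict() / clf.predict_proba() return scipy sparse matrices (hence .tocoo()),
# which matches scikit-multilearn's problem-transformation wrappers. X_train and
# Y_train below are random placeholders.
import joblib
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from skmultilearn.problem_transform import BinaryRelevance

X_train = np.random.rand(100, 14)             # 14 descriptor features, as used above
Y_train = np.random.randint(0, 2, (100, 3))   # antiviral / antibacterial / antifungal labels

clf = BinaryRelevance(classifier=RandomForestClassifier())
clf.fit(X_train, Y_train)
joblib.dump(clf, 'ml_model.pkl')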
def upload():
    if request.method == 'POST':  # executed on POST request
        upfile = request.files['file']
        if upfile and allowed_file(upfile.filename):
            filename = secure_filename(upfile.filename)
            filepath = os.path.join(app.config['UPLOAD_FOLDER'], filename)
            upfile.save(filepath)

            with open(filepath) as fasta_file:  # closes the handle cleanly
                identifiers = []
                sequence = []
                for seq_record in SeqIO.parse(fasta_file, 'fasta'):  # generator
                    identifiers.append(seq_record.id)
                    sequence.append(str(seq_record.seq))

            pepdesc = PeptideDescriptor(filepath, 'eisenberg')  # use Eisenberg consensus scale
            globdesc = GlobalDescriptor(filepath)

            # --------------- Peptide Descriptor (AA scales) Calculations ---------------
            pepdesc.calculate_global()               # calculate global Eisenberg hydrophobicity
            pepdesc.calculate_moment(append=True)    # calculate Eisenberg hydrophobic moment

            # load other AA scales
            pepdesc.load_scale('gravy')              # load GRAVY scale
            pepdesc.calculate_global(append=True)    # calculate global GRAVY hydrophobicity
            pepdesc.calculate_moment(append=True)    # calculate GRAVY hydrophobic moment
            pepdesc.load_scale('z3')                 # load old Z scale
            pepdesc.calculate_autocorr(1, append=True)  # calculate global Z scale (= window-1 autocorrelation)

            # --------------- Global Descriptor Calculations ---------------
            globdesc.length()                        # sequence length
            globdesc.boman_index(append=True)        # Boman index
            globdesc.aromaticity(append=True)        # global aromaticity
            globdesc.aliphatic_index(append=True)    # aliphatic index
            globdesc.instability_index(append=True)  # instability index
            globdesc.calculate_charge(ph=7.4, amide=False, append=True)  # net charge
            globdesc.calculate_MW(amide=False, append=True)              # molecular weight

            f1 = pepdesc.descriptor
            f2 = globdesc.descriptor
            result = np.concatenate((f2, f1), axis=1)

            clf = joblib.load('ml_model.pkl')  # load the model once, not per sequence
            rs = []
            for i in range(len(result)):
                prt = np.reshape(result[i], (-1, 14))
                pred = clf.predict(prt)
                proba = clf.predict_proba(prt).tocoo()  # class probabilities (not used further in this route)
                mc = pred.tocoo()
                out = mc.col
                res = []
                for j in range(len(out)):
                    if out[j] == 0:
                        res.append("antiviral")
                    elif out[j] == 1:
                        res.append("antibacterial")
                    else:
                        res.append("antifungal")
                rs.append(res)

            a = []
            for i in range(len(rs)):
                a.append('-'.join(rs[i]))

            df = pd.DataFrame(data={"id": identifiers, "sequence": sequence, "activity": a},
                              columns=['id', 'sequence', 'activity'])
            df.to_csv("result.csv", sep=',', index=False)
            os.remove(filepath)
            return render_template('up.html', mimetype="text/csv")
        else:
            error = "PLEASE CHECK THE FORMAT OF FILE TO UPLOAD"
            return render_template('upload.html', error=error)
    # executed on GET request
    return render_template('predictor.html')
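# Both routes assume Flask scaffolding that is not part of these excerpts: an `app`
# with UPLOAD_FOLDER configured, an allowed_file() helper and the route registration.
# A minimal sketch of that wiring; the extension whitelist and upload path are
# assumptions, not taken from the original.
import os
from flask import Flask

ALLOWED_EXTENSIONS = {'fasta', 'fa', 'txt'}  # assumed whitelist

app = Flask(__name__)
app.config['UPLOAD_FOLDER'] = os.path.join(os.getcwd(), 'uploads')

def allowed_file(filename):
    """Return True if the file has an allowed extension."""
    return '.' in filename and filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS

# The view functions above would then be registered roughly like this:
# app.add_url_rule('/predict', view_func=predict, methods=['GET', 'POST'])
# app.add_url_rule('/upload', view_func=upload, methods=['GET', 'POST'])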
from modlamp.descriptors import PeptideDescriptor, GlobalDescriptor

pepdesc = PeptideDescriptor('/path/to/sequences.fasta', 'eisenberg')  # use Eisenberg consensus scale
globdesc = GlobalDescriptor('/path/to/sequences.fasta')

# --------------- Peptide Descriptor (AA scales) Calculations ---------------
pepdesc.calculate_global()               # calculate global Eisenberg hydrophobicity
pepdesc.calculate_moment(append=True)    # calculate Eisenberg hydrophobic moment

# load other AA scales
pepdesc.load_scale('gravy')              # load GRAVY scale
pepdesc.calculate_global(append=True)    # calculate global GRAVY hydrophobicity
pepdesc.calculate_moment(append=True)    # calculate GRAVY hydrophobic moment
pepdesc.load_scale('z3')                 # load old Z scale
pepdesc.calculate_autocorr(1, append=True)  # calculate global Z scale (= window-1 autocorrelation)

# save descriptor data to .csv file
col_names1 = 'ID,Sequence,H_Eisenberg,uH_Eisenberg,H_GRAVY,uH_GRAVY,Z3_1,Z3_2,Z3_3'
pepdesc.save_descriptor('/path/to/descriptors1.csv', header=col_names1)

# --------------- Global Descriptor Calculations ---------------
globdesc.length()                        # sequence length
globdesc.boman_index(append=True)        # Boman index
globdesc.aromaticity(append=True)        # global aromaticity
globdesc.aliphatic_index(append=True)    # aliphatic index
globdesc.instability_index(append=True)  # instability index
globdesc.calculate_charge(ph=7.4, amide=False, append=True)  # net charge
globdesc.calculate_MW(amide=False, append=True)              # molecular weight

# save descriptor data to .csv file
col_names2 = 'ID,Sequence,Length,BomanIndex,Aromaticity,AliphaticIndex,InstabilityIndex,Charge,MW'
globdesc.save_descriptor('/path/to/descriptors2.csv', header=col_names2)
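# If a single feature table is needed (as in the web routes above), the two saved
# CSV files can be joined on their shared ID and Sequence columns. A small sketch,
# assuming the output paths used above; the combined output path is hypothetical.
import pandas as pd

pep_df = pd.read_csv('/path/to/descriptors1.csv')
glob_df = pd.read_csv('/path/to/descriptors2.csv')
features = pep_df.merge(glob_df, on=['ID', 'Sequence'])
features.to_csv('/path/to/descriptors_combined.csv', index=False)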
def analyze_generated(self, num, fname='analysis.txt', plot=False):
    """Method to analyze the generated sequences located in `self.generated`.

    :param num: {int} wanted number of sequences to sample
    :param fname: {str} filename to save analysis info to
    :param plot: {bool} whether to plot an overview of descriptors
    :return: file with analysis info (distances)
    """
    with open(fname, 'w') as f:
        print("Analyzing...")
        f.write("ANALYSIS OF SAMPLED SEQUENCES\n==============================\n\n")
        f.write("Nr. of duplicates in generated sequences: %i\n" % (len(self.generated) - len(set(self.generated))))
        count = len(set(self.generated) & set(self.sequences))  # get shared entries in both lists
        f.write("%.1f percent of generated sequences are present in the training data.\n" %
                ((count / len(self.generated)) * 100))
        d = GlobalDescriptor(self.generated)
        len1 = len(d.sequences)
        d.filter_aa('B')
        len2 = len(d.sequences)
        d.length()
        f.write("\n\nLENGTH DISTRIBUTION OF GENERATED DATA:\n\n")
        f.write("Number of sequences too short:\t%i\n" % (num - len1))
        f.write("Number of invalid (with 'B'):\t%i\n" % (len1 - len2))
        f.write("Number of valid unique seqs:\t%i\n" % len2)
        f.write("Mean sequence length:   \t\t%.1f ± %.1f\n" % (np.mean(d.descriptor), np.std(d.descriptor)))
        f.write("Median sequence length: \t\t%i\n" % np.median(d.descriptor))
        f.write("Minimal sequence length:\t\t%i\n" % np.min(d.descriptor))
        f.write("Maximal sequence length:\t\t%i\n" % np.max(d.descriptor))

        descriptor = 'pepcats'
        seq_desc = PeptideDescriptor([s[1:].rstrip() for s in self.sequences], descriptor)
        seq_desc.calculate_autocorr(7)
        gen_desc = PeptideDescriptor(d.sequences, descriptor)
        gen_desc.calculate_autocorr(7)

        # random comparison set
        self.ran = Random(len(self.generated), np.min(d.descriptor), np.max(d.descriptor))  # generate rand seqs
        probas = count_aas(''.join(seq_desc.sequences)).values()  # get the aa distribution of training seqs
        self.ran.generate_sequences(proba=probas)
        ran_desc = PeptideDescriptor(self.ran.sequences, descriptor)
        ran_desc.calculate_autocorr(7)

        # amphipathic helices comparison set
        self.hel = Helices(len(self.generated), np.min(d.descriptor), np.max(d.descriptor))
        self.hel.generate_sequences()
        hel_desc = PeptideDescriptor(self.hel.sequences, descriptor)
        hel_desc.calculate_autocorr(7)

        # distance calculation
        f.write("\n\nDISTANCE CALCULATION IN '%s' DESCRIPTOR SPACE\n\n" % descriptor.upper())
        desc_dist = distance.cdist(gen_desc.descriptor, seq_desc.descriptor, metric='euclidean')
        f.write("Average euclidean distance of sampled to training data:\t%.3f +/- %.3f\n" %
                (np.mean(desc_dist), np.std(desc_dist)))
        ran_dist = distance.cdist(ran_desc.descriptor, seq_desc.descriptor, metric='euclidean')
        f.write("Average euclidean distance of randomly sampled seqs:\t%.3f +/- %.3f\n" %
                (np.mean(ran_dist), np.std(ran_dist)))
        hel_dist = distance.cdist(hel_desc.descriptor, seq_desc.descriptor, metric='euclidean')
        f.write("Average euclidean distance of amphipathic helical seqs:\t%.3f +/- %.3f\n" %
                (np.mean(hel_dist), np.std(hel_dist)))

        # more simple descriptors
        g_seq = GlobalDescriptor(seq_desc.sequences)
        g_gen = GlobalDescriptor(gen_desc.sequences)
        g_ran = GlobalDescriptor(ran_desc.sequences)
        g_hel = GlobalDescriptor(hel_desc.sequences)
        g_seq.calculate_all()
        g_gen.calculate_all()
        g_ran.calculate_all()
        g_hel.calculate_all()
        sclr = StandardScaler()
        sclr.fit(g_seq.descriptor)
        f.write("\n\nDISTANCE CALCULATION FOR SCALED GLOBAL DESCRIPTORS\n\n")
        desc_dist = distance.cdist(sclr.transform(g_gen.descriptor), sclr.transform(g_seq.descriptor),
                                   metric='euclidean')
        f.write("Average euclidean distance of sampled to training data:\t%.2f +/- %.2f\n" %
                (np.mean(desc_dist), np.std(desc_dist)))
        ran_dist = distance.cdist(sclr.transform(g_ran.descriptor), sclr.transform(g_seq.descriptor),
                                  metric='euclidean')
        f.write("Average euclidean distance of randomly sampled seqs:\t%.2f +/- %.2f\n" %
                (np.mean(ran_dist), np.std(ran_dist)))
        hel_dist = distance.cdist(sclr.transform(g_hel.descriptor), sclr.transform(g_seq.descriptor),
                                  metric='euclidean')
        f.write("Average euclidean distance of amphipathic helical seqs:\t%.2f +/- %.2f\n" %
                (np.mean(hel_dist), np.std(hel_dist)))

        # hydrophobic moments
        uh_seq = PeptideDescriptor(seq_desc.sequences, 'eisenberg')
        uh_seq.calculate_moment()
        uh_gen = PeptideDescriptor(gen_desc.sequences, 'eisenberg')
        uh_gen.calculate_moment()
        uh_ran = PeptideDescriptor(ran_desc.sequences, 'eisenberg')
        uh_ran.calculate_moment()
        uh_hel = PeptideDescriptor(hel_desc.sequences, 'eisenberg')
        uh_hel.calculate_moment()
        f.write("\n\nHYDROPHOBIC MOMENTS\n\n")
        f.write("Hydrophobic moment of training seqs:\t%.3f +/- %.3f\n" %
                (np.mean(uh_seq.descriptor), np.std(uh_seq.descriptor)))
        f.write("Hydrophobic moment of sampled seqs:\t\t%.3f +/- %.3f\n" %
                (np.mean(uh_gen.descriptor), np.std(uh_gen.descriptor)))
        f.write("Hydrophobic moment of random seqs:\t\t%.3f +/- %.3f\n" %
                (np.mean(uh_ran.descriptor), np.std(uh_ran.descriptor)))
        f.write("Hydrophobic moment of amphipathic seqs:\t%.3f +/- %.3f\n" %
                (np.mean(uh_hel.descriptor), np.std(uh_hel.descriptor)))

        if plot:
            if self.refs:
                a = GlobalAnalysis([uh_seq.sequences, uh_gen.sequences, uh_hel.sequences, uh_ran.sequences],
                                   ['training', 'sampled', 'hel', 'ran'])
            else:
                a = GlobalAnalysis([uh_seq.sequences, uh_gen.sequences], ['training', 'sampled'])
            a.plot_summary(filename=fname[:-4] + '.png')
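# Hypothetical driver for the analysis methods above. `SequenceModel` stands in for
# whatever class actually defines analyze_training() and analyze_generated(); the
# constructor and sample() call are assumptions for illustration only.
model = SequenceModel(training_file='training_sequences.fasta')  # assumed constructor
model.analyze_training()                              # length statistics of self.sequences
model.sample(num=1000)                                # assumed step that fills self.generated
model.analyze_generated(num=1000, fname='analysis.txt', plot=True)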