def upload():
    if request.method == 'POST':  # executed on POST requests
        upfile = request.files['file']
        if upfile and allowed_file(upfile.filename):
            filename = secure_filename(upfile.filename)
            filepath = os.path.join(app.config['UPLOAD_FOLDER'], filename)
            upfile.save(filepath)

            # parse the uploaded FASTA file; the with-block closes the handle cleanly
            identifiers = []
            sequence = []
            with open(filepath) as fasta_file:
                for seq_record in SeqIO.parse(fasta_file, 'fasta'):  # lazy generator of SeqRecords
                    identifiers.append(seq_record.id)
                    sequence.append(str(seq_record.seq))

            pepdesc = PeptideDescriptor(filepath, 'eisenberg')  # use Eisenberg consensus scale
            globdesc = GlobalDescriptor(filepath)

            # --------------- Peptide Descriptor (AA scales) Calculations ---------------
            pepdesc.calculate_global()             # calculate global Eisenberg hydrophobicity
            pepdesc.calculate_moment(append=True)  # calculate Eisenberg hydrophobic moment

            # load other AA scales
            pepdesc.load_scale('gravy')            # load GRAVY scale
            pepdesc.calculate_global(append=True)  # calculate global GRAVY hydrophobicity
            pepdesc.calculate_moment(append=True)  # calculate GRAVY hydrophobic moment
            pepdesc.load_scale('z3')               # load old Z scale
            pepdesc.calculate_autocorr(1, append=True)  # calculate global Z scale (= window-1 autocorrelation)

            # --------------- Global Descriptor Calculations ---------------
            globdesc.length()                        # sequence length
            globdesc.boman_index(append=True)        # Boman index
            globdesc.aromaticity(append=True)        # global aromaticity
            globdesc.aliphatic_index(append=True)    # aliphatic index
            globdesc.instability_index(append=True)  # instability index
            globdesc.calculate_charge(ph=7.4, amide=False, append=True)  # net charge
            globdesc.calculate_MW(amide=False, append=True)              # molecular weight

            f1 = pepdesc.descriptor
            f2 = globdesc.descriptor
            result = np.concatenate((f2, f1), axis=1)

            clf = joblib.load('ml_model.pkl')  # load the model once, not per sequence
            rs = []
            for row in result:
                prt = np.reshape(row, (-1, 14))
                pred = clf.predict(prt)   # sparse multilabel prediction
                out = pred.tocoo().col    # column indices of the predicted labels
                res = []
                for label in out:
                    if label == 0:
                        res.append("antiviral")
                    elif label == 1:
                        res.append("antibacterial")
                    else:
                        res.append("antifungal")
                rs.append(res)

            a = ['-'.join(r) for r in rs]

            df = pd.DataFrame(data={"id": identifiers, "sequence": sequence, "activity": a},
                              columns=['id', 'sequence', 'activity'])
            df.to_csv("result.csv", sep=',', index=False)
            os.remove(filepath)
            return render_template('up.html', mimetype="text/csv")
        else:
            error = "PLEASE CHECK THE FORMAT OF FILE TO UPLOAD"
            return render_template('upload.html', error=error)
    # executed on GET requests
    return render_template('predictor.html')
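# --- Hedged sketch: the views in this file assume module-level scaffolding that
# is not part of the snippet. The upload folder, extension whitelist, and the
# allowed_file helper below are assumptions (the conventional Flask pattern),
# not code from this project.
import os
import base64
from io import BytesIO

import numpy as np
import pandas as pd
import joblib
import matplotlib
matplotlib.use('Agg')  # headless backend for server-side plotting
import matplotlib.pyplot as plt
from flask import Flask, request, render_template
from werkzeug.utils import secure_filename
from Bio import SeqIO
from modlamp.descriptors import PeptideDescriptor, GlobalDescriptor

app = Flask(__name__)
app.config['UPLOAD_FOLDER'] = 'uploads'  # assumed location
ALLOWED_EXTENSIONS = {'fasta', 'fa'}     # assumed whitelist


def allowed_file(filename):
    """Accept only filenames with a whitelisted extension."""
    return '.' in filename and filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS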
def predict():
    if request.method == 'POST':
        seq = request.form['seq']
        # write the pasted sequence to a FASTA file; one absolute path is used
        # throughout (the original mixed a relative and an absolute path)
        fasta_path = '/home/sanika/proj/random.fasta'
        with open(fasta_path, "w") as fp:
            fp.write(seq)

        pepdesc = PeptideDescriptor(fasta_path, 'eisenberg')  # use Eisenberg consensus scale
        globdesc = GlobalDescriptor(fasta_path)

        # --------------- Peptide Descriptor (AA scales) Calculations ---------------
        pepdesc.calculate_global()             # calculate global Eisenberg hydrophobicity
        pepdesc.calculate_moment(append=True)  # calculate Eisenberg hydrophobic moment

        # load other AA scales
        pepdesc.load_scale('gravy')            # load GRAVY scale
        pepdesc.calculate_global(append=True)  # calculate global GRAVY hydrophobicity
        pepdesc.calculate_moment(append=True)  # calculate GRAVY hydrophobic moment
        pepdesc.load_scale('z3')               # load old Z scale
        pepdesc.calculate_autocorr(1, append=True)  # calculate global Z scale (= window-1 autocorrelation)

        # --------------- Global Descriptor Calculations ---------------
        globdesc.length()                        # sequence length
        globdesc.boman_index(append=True)        # Boman index
        globdesc.aromaticity(append=True)        # global aromaticity
        globdesc.aliphatic_index(append=True)    # aliphatic index
        globdesc.instability_index(append=True)  # instability index
        globdesc.calculate_charge(ph=7.4, amide=False, append=True)  # net charge
        globdesc.calculate_MW(amide=False, append=True)              # molecular weight

        f1 = pepdesc.descriptor
        f2 = globdesc.descriptor
        result = np.concatenate((f2, f1), axis=1)

        clf = joblib.load('ml_model.pkl')
        pred = clf.predict(result)                 # sparse multilabel prediction
        proba = clf.predict_proba(result).tocoo()  # per-class probabilities
        out = pred.tocoo().col                     # column indices of the predicted labels

        # pie chart of the class probabilities, embedded as base64 PNG
        labels = ['antiviral', 'antibacterial', 'antifungal']
        values = proba.data
        plt.pie(values, labels=labels, autopct='%.0f%%', shadow=True, radius=0.5)
        plt.savefig('/home/sanika/proj/pie_chart.jpg')
        figfile = BytesIO()
        plt.savefig(figfile, format='png')
        figfile.seek(0)
        figdata_png = base64.b64encode(figfile.getvalue()).decode('ascii')
        plt.close()

        res = []
        for label in out:
            if label == 0:
                res.append("antiviral")
            elif label == 1:
                res.append("antibacterial")
            else:
                res.append("antifungal")
        return render_template('seq.html', seq=res, result=figdata_png)
    return render_template('predictor.html')
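# --- Hedged sketch: both views above decode the sparse multilabel prediction
# with the same if/elif chain. A hypothetical helper (my naming, not the
# project's) could factor that out:
LABELS = ('antiviral', 'antibacterial', 'antifungal')


def decode_labels(pred):
    """Map column indices of a sparse multilabel prediction to label names.

    `pred` is the sparse matrix returned by clf.predict(); indices beyond 1
    fall back to 'antifungal', mirroring the original if/elif chain.
    """
    return [LABELS[min(i, 2)] for i in pred.tocoo().col]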
def main(infolder, outfolder):
    descriptor = 'PPCALI'
    print("RF Peptide Learning Info\n========================\n")
    print(datetime.now().strftime("%Y-%m-%d_%H-%M") + "\n")
    print("INPUT:\nInputfolder is\t%s\nOutputfolder is\t%s\nDescriptor is\t%s , auto-correlated (window 7)\n"
          % (infolder, outfolder, descriptor))

    # -------------------------------- TRAINING --------------------------------
    print("LOG:\nLoading data...")
    Pos = PeptideDescriptor(infolder + '/Pos.fasta', descriptor)
    Pos.filter_duplicates()
    Neg = PeptideDescriptor(infolder + '/Neg.fasta', descriptor)
    Neg.filter_duplicates()
    targets = np.array(len(Pos.sequences) * [1] + len(Neg.sequences) * [0])  # target vector

    # Descriptor calculation
    print("Calculating %s descriptor..." % descriptor)
    Data = PeptideDescriptor(Pos.sequences + Neg.sequences, descriptor)
    Data.calculate_autocorr(7)

    # Standard scaling
    print("Loading prefitted scaler and standard scaling %s descriptor..." % descriptor)
    scaler = pickle.load(open(infolder + '/scaler.p', 'rb'))  # binary mode for pickles
    Data = scaler.transform(Data.descriptor)

    # Classifier
    print("Loading pretrained classifier...")
    clf = pickle.load(open(infolder + '/classifier.p', 'rb'))

    # fitting classifier
    print("Fitting Random Forest classifier...")
    clf.fit(Data, targets)
    fit_leafs = clf.apply(Data)
    print("\tRF out-of-bag score: %.2f" % clf.oob_score_)

    # -------------------------------- LIBRARY --------------------------------
    # Loading library
    print("Loading sequence library...")
    Lib = PeptideDescriptor(infolder + '/Lib.fasta', descriptor)
    class_labels = [l[:3] for l in Lib.names]  # extract class labels from sequence names
    print("\tLibrary size: %i" % len(Lib.sequences))
    print("\tLibrary composition is:\n\t\thel: %i\n\t\tasy: %i\n\t\tnCM: %i"
          % (class_labels.count('hel'), class_labels.count('asy'), class_labels.count('nCM')))

    # Calculating descriptors for library members
    print("Calculating %s descriptor for library..." % descriptor)
    D = PeptideDescriptor(Lib.sequences, descriptor)
    D.calculate_autocorr(7)

    # scaling the library descriptor with the prefitted scaler
    print("Standard scaling %s descriptor for library..." % descriptor)
    X = scaler.transform(D.descriptor)

    # -------------------------------- PREDICTING --------------------------------
    # get single-tree predictions and calculate their spread
    print("Predicting single tree results, standard deviation and entropy for library...")
    start = time.time()
    preds = get_tree_pred(clf, X)
    print("Predicting class probabilities for library...")
    probas = clf.predict_proba(X)
    probas = probas[:, 1].tolist()
    variance = np.var(preds, axis=1)
    print("\tPredictions took %.1f s" % (time.time() - start))

    # calculate similarity of library members to training data
    print("Calculating Random Forest similarity (cosine)...")
    start = time.time()
    lib_leafs = clf.apply(X)  # leaf indices where library samples end up -> RF-intrinsic similarity measure
    D_RF = pairwise_distances(lib_leafs, fit_leafs, metric='cosine')
    RF_dist = D_RF.mean(axis=1).tolist()
    print("\tDistance calculation took %.1f s" % (time.time() - start))

    # scaling all output features
    print("Min-Max scaling outputs...")
    sclr = MinMaxScaler()
    # reshape lists to column vectors, min-max scale, and squeeze back to lists:
    variance = np.squeeze(sclr.fit_transform(variance.reshape(-1, 1))).tolist()
    RF_dist = np.squeeze(sclr.fit_transform(np.array(RF_dist).reshape(-1, 1))).tolist()

    # construct final list with all values (prediction, RF_dist, var, sum)
    print("Creating result dictionaries...")
    sums = [0.5 * (x * (1 - y) + z) for x, y, z in zip(variance, RF_dist, probas)]  # density-weight + proba

    # create data frame with all values
    d = pd.DataFrame({'Class': class_labels, 'Prediction': probas, 'RFSimilarity': RF_dist,
                      'TreeVariance': variance, 'WeighedSum': sums}, index=Lib.sequences)
    d.index.name = 'Sequence'
    d = d[['Class', 'Prediction', 'RFSimilarity', 'TreeVariance', 'WeighedSum']].sort_values('WeighedSum',
                                                                                             ascending=False)

    # get top 10 predictions according to the weighted sum
    synth_sele = d[:10]

    # writing output
    print("Saving output files to output directory...")
    synth_sele.to_csv(outfolder + '/' + datetime.now().strftime("%Y-%m-%d_%H-%M") + 'synthesis_selection.csv')
    d.to_csv(outfolder + '/library_pred.csv')

    # saving scaler and classifier to pickle files for later usage (binary mode)
    pickle.dump(sclr, open(outfolder + '/' + datetime.now().strftime("%Y-%m-%d_%H-%M") + '-scaler.p', 'wb'))
    pickle.dump(clf, open(outfolder + '/' + datetime.now().strftime("%Y-%m-%d_%H-%M") + '-classifier.p', 'wb'))

    # `globstart` is assumed to be a module-level start timestamp
    print("Total runtime: %.1f s\n" % (time.time() - globstart))
    print("\nALL DONE SUCCESSFULLY")
    print("Look for your results file in %s\nAnd maybe save this terminal output to a logfile ;-)" % outfolder)
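# --- Hedged sketch: main() calls a get_tree_pred() helper whose body is not
# included here (only its trailing `return preds` survives below). A plausible
# implementation, assuming it collects the vote of every tree in the fitted
# forest so main() can take the per-sample variance as an uncertainty measure:
def get_tree_pred(clf, X):
    """Return an (n_samples, n_trees) array of single-tree predictions."""
    return np.stack([tree.predict(X) for tree in clf.estimators_], axis=1)


# Hypothetical invocation, assuming the folder layout used elsewhere in this repo:
# if __name__ == '__main__':
#     main('./input/B', './output/B')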
    return preds


Pos = PeptideDescriptor('/Users/modlab/y/pycharm/activelearning/retrospective/input/B/Pos.fasta', 'PPCALI')
Pos.keep_natural_aa()
Neg = PeptideDescriptor('/Users/modlab/y/pycharm/activelearning/retrospective/input/B/Neg.fasta', 'PPCALI')
Neg.keep_natural_aa()
y = np.array(len(Pos.sequences) * [1] + len(Neg.sequences) * [0])  # target vector

Data = PeptideDescriptor(Pos.sequences + Neg.sequences, 'PPCALI')
Data.calculate_autocorr(7)

# Scaler
scaler = StandardScaler()
X = scaler.fit_transform(Data.descriptor)

# Classifier
clf = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini', max_depth=None,
                             max_features='auto', max_leaf_nodes=None, min_samples_leaf=1,
                             min_samples_split=2, min_weight_fraction_leaf=0.0,
# Load sequence file into descriptor objects
pepdesc = PeptideDescriptor('/path/to/sequences.fasta', 'Eisenberg')  # use Eisenberg consensus scale
globdesc = GlobalDescriptor('/path/to/sequences.fasta')

# --------------- Peptide Descriptor (AA scales) Calculations ---------------
pepdesc.calculate_global()             # calculate global Eisenberg hydrophobicity
pepdesc.calculate_moment(append=True)  # calculate Eisenberg hydrophobic moment

# load other AA scales
pepdesc.load_scale('gravy')            # load GRAVY scale
pepdesc.calculate_global(append=True)  # calculate global GRAVY hydrophobicity
pepdesc.calculate_moment(append=True)  # calculate GRAVY hydrophobic moment
pepdesc.load_scale('z3')               # load old Z scale
pepdesc.calculate_autocorr(1, append=True)  # calculate global Z scale (= window-1 autocorrelation)

# save descriptor data to .csv file
col_names1 = 'ID,Sequence,H_Eisenberg,uH_Eisenberg,H_GRAVY,uH_GRAVY,Z3_1,Z3_2,Z3_3'
pepdesc.save_descriptor('/path/to/descriptors1.csv', header=col_names1)

# --------------- Global Descriptor Calculations ---------------
globdesc.length()                        # sequence length
globdesc.boman_index(append=True)        # Boman index
globdesc.aromaticity(append=True)        # global aromaticity
globdesc.aliphatic_index(append=True)    # aliphatic index
globdesc.instability_index(append=True)  # instability index
globdesc.calculate_charge(ph=7.4, amide=False, append=True)  # net charge
globdesc.calculate_MW(amide=False, append=True)              # molecular weight

# save descriptor data to .csv file (mirrors the pepdesc save above)
col_names2 = 'ID,Sequence,Length,BomanIndex,Aromaticity,AliphaticIndex,InstabilityIndex,Charge,MW'
globdesc.save_descriptor('/path/to/descriptors2.csv', header=col_names2)
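# --- Note (not in the original): the two descriptor blocks above yield 7
# AA-scale features (H/uH Eisenberg, H/uH GRAVY, Z3_1..Z3_3) plus 7 global
# features (length, Boman, aromaticity, aliphatic, instability, charge, MW):
# exactly the 14 columns the Flask routes reshape with (-1, 14). A quick
# sanity check, assuming numpy is imported as np:
f1 = pepdesc.descriptor   # shape (n_sequences, 7)
f2 = globdesc.descriptor  # shape (n_sequences, 7)
features = np.concatenate((f2, f1), axis=1)
assert features.shape[1] == 14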
def analyze_generated(self, num, fname='analysis.txt', plot=False):
    """Method to analyze the generated sequences located in `self.generated`.

    :param num: {int} wanted number of sequences to sample
    :param fname: {str} filename to save analysis info to
    :param plot: {bool} whether to plot an overview of descriptors
    :return: file with analysis info (distances)
    """
    with open(fname, 'w') as f:
        print("Analyzing...")
        f.write("ANALYSIS OF SAMPLED SEQUENCES\n==============================\n\n")
        f.write("Nr. of duplicates in generated sequences: %i\n" %
                (len(self.generated) - len(set(self.generated))))
        count = len(set(self.generated) & set(self.sequences))  # shared entries in both lists
        f.write("%.1f percent of generated sequences are present in the training data.\n" %
                ((count / len(self.generated)) * 100))
        d = GlobalDescriptor(self.generated)
        len1 = len(d.sequences)
        d.filter_aa('B')
        len2 = len(d.sequences)
        d.length()
        f.write("\n\nLENGTH DISTRIBUTION OF GENERATED DATA:\n\n")
        f.write("Number of sequences too short:\t%i\n" % (num - len1))
        f.write("Number of invalid (with 'B'):\t%i\n" % (len1 - len2))
        f.write("Number of valid unique seqs:\t%i\n" % len2)
        f.write("Mean sequence length:\t\t%.1f ± %.1f\n" % (np.mean(d.descriptor), np.std(d.descriptor)))
        f.write("Median sequence length:\t\t%i\n" % np.median(d.descriptor))
        f.write("Minimal sequence length:\t\t%i\n" % np.min(d.descriptor))
        f.write("Maximal sequence length:\t\t%i\n" % np.max(d.descriptor))

        descriptor = 'pepcats'
        seq_desc = PeptideDescriptor([s[1:].rstrip() for s in self.sequences], descriptor)
        seq_desc.calculate_autocorr(7)
        gen_desc = PeptideDescriptor(d.sequences, descriptor)
        gen_desc.calculate_autocorr(7)

        # random comparison set
        self.ran = Random(len(self.generated), np.min(d.descriptor), np.max(d.descriptor))  # random seqs
        probas = list(count_aas(''.join(seq_desc.sequences)).values())  # aa distribution of training seqs
        self.ran.generate_sequences(proba=probas)
        ran_desc = PeptideDescriptor(self.ran.sequences, descriptor)
        ran_desc.calculate_autocorr(7)

        # amphipathic helices comparison set
        self.hel = Helices(len(self.generated), np.min(d.descriptor), np.max(d.descriptor))
        self.hel.generate_sequences()
        hel_desc = PeptideDescriptor(self.hel.sequences, descriptor)
        hel_desc.calculate_autocorr(7)

        # distance calculation
        f.write("\n\nDISTANCE CALCULATION IN '%s' DESCRIPTOR SPACE\n\n" % descriptor.upper())
        desc_dist = distance.cdist(gen_desc.descriptor, seq_desc.descriptor, metric='euclidean')
        f.write("Average euclidean distance of sampled to training data:\t%.3f +/- %.3f\n" %
                (np.mean(desc_dist), np.std(desc_dist)))
        ran_dist = distance.cdist(ran_desc.descriptor, seq_desc.descriptor, metric='euclidean')
        f.write("Average euclidean distance of randomly sampled seqs:\t%.3f +/- %.3f\n" %
                (np.mean(ran_dist), np.std(ran_dist)))
        hel_dist = distance.cdist(hel_desc.descriptor, seq_desc.descriptor, metric='euclidean')
        f.write("Average euclidean distance of amphipathic helical seqs:\t%.3f +/- %.3f\n" %
                (np.mean(hel_dist), np.std(hel_dist)))

        # simpler global descriptors
        g_seq = GlobalDescriptor(seq_desc.sequences)
        g_gen = GlobalDescriptor(gen_desc.sequences)
        g_ran = GlobalDescriptor(ran_desc.sequences)
        g_hel = GlobalDescriptor(hel_desc.sequences)
        g_seq.calculate_all()
        g_gen.calculate_all()
        g_ran.calculate_all()
        g_hel.calculate_all()
        sclr = StandardScaler()
        sclr.fit(g_seq.descriptor)
        f.write("\n\nDISTANCE CALCULATION FOR SCALED GLOBAL DESCRIPTORS\n\n")
        desc_dist = distance.cdist(sclr.transform(g_gen.descriptor), sclr.transform(g_seq.descriptor),
                                   metric='euclidean')
        f.write("Average euclidean distance of sampled to training data:\t%.2f +/- %.2f\n" %
                (np.mean(desc_dist), np.std(desc_dist)))
        ran_dist = distance.cdist(sclr.transform(g_ran.descriptor), sclr.transform(g_seq.descriptor),
                                  metric='euclidean')
        f.write("Average euclidean distance of randomly sampled seqs:\t%.2f +/- %.2f\n" %
                (np.mean(ran_dist), np.std(ran_dist)))
        hel_dist = distance.cdist(sclr.transform(g_hel.descriptor), sclr.transform(g_seq.descriptor),
                                  metric='euclidean')
        f.write("Average euclidean distance of amphipathic helical seqs:\t%.2f +/- %.2f\n" %
                (np.mean(hel_dist), np.std(hel_dist)))

        # hydrophobic moments
        uh_seq = PeptideDescriptor(seq_desc.sequences, 'eisenberg')
        uh_seq.calculate_moment()
        uh_gen = PeptideDescriptor(gen_desc.sequences, 'eisenberg')
        uh_gen.calculate_moment()
        uh_ran = PeptideDescriptor(ran_desc.sequences, 'eisenberg')
        uh_ran.calculate_moment()
        uh_hel = PeptideDescriptor(hel_desc.sequences, 'eisenberg')
        uh_hel.calculate_moment()
        f.write("\n\nHYDROPHOBIC MOMENTS\n\n")
        f.write("Hydrophobic moment of training seqs:\t%.3f +/- %.3f\n" %
                (np.mean(uh_seq.descriptor), np.std(uh_seq.descriptor)))
        f.write("Hydrophobic moment of sampled seqs:\t\t%.3f +/- %.3f\n" %
                (np.mean(uh_gen.descriptor), np.std(uh_gen.descriptor)))
        f.write("Hydrophobic moment of random seqs:\t\t%.3f +/- %.3f\n" %
                (np.mean(uh_ran.descriptor), np.std(uh_ran.descriptor)))
        f.write("Hydrophobic moment of amphipathic seqs:\t%.3f +/- %.3f\n" %
                (np.mean(uh_hel.descriptor), np.std(uh_hel.descriptor)))

    if plot:
        if self.refs:
            a = GlobalAnalysis([uh_seq.sequences, uh_gen.sequences, uh_hel.sequences, uh_ran.sequences],
                               ['training', 'sampled', 'hel', 'ran'])
        else:
            a = GlobalAnalysis([uh_seq.sequences, uh_gen.sequences], ['training', 'sampled'])
        a.plot_summary(filename=fname[:-4] + '.png')
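# --- Hypothetical usage of analyze_generated(), assuming `model` is an
# instance of the (unshown) sampler class whose `generated` list has been
# filled by its sampling routine:
# model.analyze_generated(num=1000, fname='analysis.txt', plot=True)
# -> writes length/distance statistics to analysis.txt and, with plot=True,
#    an overview figure to analysis.png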
from sklearn.metrics import matthews_corrcoef, accuracy_score
from progressbar import ProgressBar
from modlamp.core import read_fasta
from modlamp.descriptors import PeptideDescriptor

seed = np.random.RandomState(seed=42)

for d in os.listdir('./output'):
    if os.path.isdir('./output/' + d):
        print("\nRunning %s..." % d)
        sclr = pickle.load(open('./output/' + d + '/scaler.p', 'rb'))  # binary mode for pickles
        pos = read_fasta('./input/' + d + '/Pos.fasta')[0]
        neg = read_fasta('./input/' + d + '/Neg.fasta')[0]
        desc = PeptideDescriptor(pos + neg, 'PPCALI')
        desc.calculate_autocorr(7)
        X = sclr.transform(desc.descriptor)
        y = np.array(len(pos) * [1] + len(neg) * [0])
        skf = StratifiedKFold(n_splits=10)  # modern sklearn API; was StratifiedKFold(y, n_folds=10)
        synth = pd.read_csv('./output/' + d + '/synthesis_selection.csv')

        print("\tPerforming 10-fold cross-validation")
        mcc = list()
        acc = list()
        pbar = ProgressBar()
        for train, test in pbar(list(skf.split(X, y))):  # list() so the progress bar knows the length
            clf = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini', max_depth=None,