def main(libsize=1000, n_top=50):
    """Train a Random Forest on PEPCATS descriptors and screen a generated library.

    The training data comes from ``load_AMPvsUniProt()``. Sequences are
    described with the 'pepcats' scale followed by ``calculate_crosscorr(7)``.
    After fitting, the classifier's out-of-bag score is printed, a
    ``MixedLibrary`` is generated and scored, and the highest-ranked sequences
    are printed to stdout as ``sequence,probability`` lines below a header row.

    :param libsize: number of sequences requested from ``MixedLibrary``; the
        size actually obtained is printed.
    :param n_top: number of top-ranked sequences to print (defaults to 50, the
        value that was previously hard-coded).
    :return: None
    """
    # load training sequences
    data = load_AMPvsUniProt()

    # describe sequences with PEPCATS descriptor
    X = PeptideDescriptor(data.sequences, 'pepcats')
    X.calculate_crosscorr(7)

    # initialize Random Forest classifier and fit it on the PEPCATS data
    clf = RandomForestClassifier(n_estimators=500, oob_score=True, n_jobs=-1)
    clf.fit(X.descriptor, data.target)

    # evaluate classifier performance as RF out of bag score
    print("RandomForest OOB classification score: %.3f" % clf.oob_score_)

    # generate a virtual peptide library of `libsize` sequences to screen
    Lib = MixedLibrary(libsize)
    Lib.generate_sequences()
    print("Actual virtual library size (without duplicates): %i" % len(Lib.sequences))

    # describe library with PEPCATS descriptor
    X_lib = PeptideDescriptor(Lib.sequences, 'pepcats')
    X_lib.calculate_crosscorr(7)

    # predict class probabilities for sequences in Library
    proba = clf.predict_proba(X_lib.descriptor)

    # Map each sequence to its column-1 probability. Going through a dict keeps
    # the original behaviour of collapsing repeated sequences to one entry.
    # NOTE(review): column 1 is treated as the AMP class - presumably AMPs are
    # labelled 1 in data.target; confirm against clf.classes_.
    scores = dict(zip(Lib.sequences, proba[:, 1]))

    # rank by predicted probability (highest first) and keep the best n_top
    ranked = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)[:n_top]

    # print the top ranked predictions with their predicted probabilities
    print("Sequence,Predicted_AMP_Probability")
    for sequence, probability in ranked:
        print(sequence + "," + str(probability))
def main():
    """Train a SOM on generated peptide descriptors and plot labelled maps.

    Three sub-libraries (``Helices``, ``Random`` and ``AMPngrams``) of
    ``libnum`` requested sequences each are generated and described with the
    'pepcats' scale and ``calculate_crosscorr(window=7)``. A 12 x 12 ``SOM`` is
    fitted on those descriptors; its fit error is printed and its error
    history is plotted. The data set from ``load_AMPvsTM()`` is described the
    same way and used for the point, density, distance and per-class density
    maps, and finally for a neighbour lookup of one query sequence.

    Output file names are passed to the plotting methods: ``som_error.png``,
    ``peptidesom.png``, ``density.png``, ``distances.png`` and one
    ``class<k>.png`` per class.

    :return: None
    """
    # generate some virtual peptide sequences
    libnum = 1000  # 1000 sequences per sublibrary
    h = Helices(seqnum=libnum)
    r = Random(seqnum=libnum)
    n = AMPngrams(seqnum=libnum, n_min=4)
    h.generate_sequences()
    r.generate_sequences(proba='AMP')
    n.generate_sequences()

    # calculate molecular descriptors for the peptides
    d = PeptideDescriptor(seqs=np.hstack((h.sequences, r.sequences, n.sequences)),
                          scalename='pepcats')
    d.calculate_crosscorr(window=7)

    # train a som on the descriptors and print / plot the training error
    som = SOM(x=12, y=12)
    som.fit(data=d.descriptor, epochs=100000, decay='hill')
    print("Fit error: %.4f" % som.error)
    som.plot_error_history(filename="som_error.png")

    # load known antimicrobial peptides (AMPs) and transmembrane sequences
    dataset = load_AMPvsTM()
    d2 = PeptideDescriptor(dataset.sequences, 'pepcats')
    d2.calculate_crosscorr(7)

    # One label per row of `labelled` below: 0/1/2 for the three generated
    # sub-libraries, 3 for the data set rows taken as AMPs.
    # NOTE(review): 206 is hard-coded both as the slice start and as the label
    # count - presumably the data set holds 206 non-AMP rows followed by 206
    # AMP rows; confirm against load_AMPvsTM.
    targets = np.array(libnum * [0] + libnum * [1] + libnum * [2] + 206 * [3])
    names = ['Helices', 'Random', 'nGrams', 'AMP']

    # Rows that have a label in `targets`, in the same order as the labels.
    labelled = np.vstack((d.descriptor, d2.descriptor[206:]))
    # Every available row; only used where no labels are involved.
    everything = np.vstack((d.descriptor, d2.descriptor))

    # plot som maps with location of AMPs
    som.plot_point_map(labelled, targets, names, filename="peptidesom.png")
    som.plot_density_map(everything, filename="density.png")
    som.plot_distance_map(colormap='Reds', filename="distances.png")

    colormaps = ['Oranges', 'Purples', 'Greens', 'Reds']
    # np.unique returns the classes sorted, so class k is always paired with
    # colormaps[k]; iterating a set gave no such ordering guarantee.
    for i, c in enumerate(np.unique(targets)):
        # Use the labelled matrix here: the previous full stack had more rows
        # than `targets` has labels, so class 3 selected the wrong data set
        # rows.
        # NOTE(review): the whole `names` list is passed, as before; if
        # plot_class_density expects a single label this should probably be
        # names[c] - confirm against the SOM API.
        som.plot_class_density(labelled, targets, c, names,
                               colormap=colormaps[i], filename='class%i.png' % c)

    # get neighboring peptides (AMPs / TMs) for a sequence of interest
    my_d = PeptideDescriptor(seqs='GLFDIVKKVVGALLAG', scalename='pepcats')
    my_d.calculate_crosscorr(window=7)
    som.get_neighbors(datapoint=my_d.descriptor, data=d2.descriptor,
                      labels=dataset.sequences, d=0)
if y == '1': class_in = 1 elif y == '-1': class_in = 0 out.write(x + ', ' + str(class_in)) out.write('\n') out.close() # load the reformatted data data = load_custom(os.getcwd() + '/formatted.csv') # create descriptors for peptide sequences descr_temp = PeptideDescriptor(data.sequences, scalename='pepArc') descr_temp.calculate_crosscorr(window=4) # develop best model and print out score with cross validation best_RF = train_best_model('RF', descr_temp.descriptor, data.target) score_cv(best_RF, descr_temp.descriptor, data.target, cv=10) y_pred = [] # get predictions for values for i in range(0, 392): try: pep_descr = PeptideDescriptor(samples_test[i], scalename='pepArc') pep_descr.calculate_crosscorr(window=4) proba = best_RF.predict_proba(pep_descr.descriptor) y_pred.append(proba)
from modlamp.descriptors import PeptideDescriptor
from modlamp.datasets import load_AMPvsTM
from som import SOM

# NOTE(review): np, Helices, Random and AMPngrams are used below but not
# imported in the visible lines - presumably imported earlier in the file.

# build three virtual peptide sub-libraries
libnum = 1000  # requested sequences per sub-library
h = Helices(seqnum=libnum)
r = Random(seqnum=libnum)
n = AMPngrams(seqnum=libnum, n_min=4)
h.generate_sequences()
r.generate_sequences(proba='AMP')
n.generate_sequences()

# describe all generated peptides at once with the PEPCATS scale
library_seqs = np.hstack([h.sequences, r.sequences, n.sequences])
d = PeptideDescriptor(seqs=library_seqs, scalename='pepcats')
d.calculate_crosscorr(window=7)

# fit a 12 x 12 SOM on the descriptors, then report and plot the training error
som = SOM(x=12, y=12)
som.fit(data=d.descriptor, epochs=100000, decay='hill')
print("Fit error: %.4f" % som.error)
som.plot_error_history(filename="som_error.png")

# describe the known antimicrobial (AMP) / transmembrane sequences the same way
dataset = load_AMPvsTM()
d2 = PeptideDescriptor(seqs=dataset.sequences, scalename='pepcats')
d2.calculate_crosscorr(window=7)

# class labels: 0, 1 and 2 repeated libnum times each, then 3 repeated 206 times
targets = np.repeat([0, 1, 2, 3], [libnum, libnum, libnum, 206])
names = ['Helices', 'Random', 'nGrams', 'AMP']

# plot som maps with location of AMPs