def main(libsize=1000):
    """Train a Random Forest on AMP-vs-UniProt data and screen a virtual library.

    The training sequences and the generated library are both encoded with
    the 'pepcats' scale followed by a cross-correlation of window 7. The
    function prints the out-of-bag score of the fitted forest, the number of
    sequences actually present in the library, and a CSV-style listing of the
    50 library sequences with the highest value in column 1 of
    ``predict_proba`` (reported as the AMP probability).

    :param libsize: value handed to ``MixedLibrary`` as the library size.
    :return: None; all results are written to stdout.
    """
    # --- training data -> descriptors ------------------------------------
    training = load_AMPvsUniProt()
    train_desc = PeptideDescriptor(training.sequences, 'pepcats')
    train_desc.calculate_crosscorr(7)

    # --- model -------------------------------------------------------------
    # oob_score=True so the forest exposes `oob_score_` after fitting
    forest = RandomForestClassifier(n_estimators=500, oob_score=True, n_jobs=-1)
    forest.fit(train_desc.descriptor, training.target)

    # NOTE(review): "classifcation" here and "lirutal" below look like typos
    # in the emitted text; kept unchanged so the program output stays the same.
    oob_report = "RandomForest OOB classifcation score: %.3f" % forest.oob_score_
    print(oob_report)

    # --- virtual screening library -----------------------------------------
    library = MixedLibrary(libsize)
    library.generate_sequences()
    n_generated = len(library.sequences)
    print("Actual lirutal library size (without duplicates): %i" % n_generated)

    # the library must be encoded exactly like the training data
    lib_desc = PeptideDescriptor(library.sequences, 'pepcats')
    lib_desc.calculate_crosscorr(7)

    # --- prediction and ranking --------------------------------------------
    class_proba = forest.predict_proba(lib_desc.descriptor)
    amp_score = dict(zip(library.sequences, class_proba[:, 1]))
    ranking = sorted(amp_score.items(), key=lambda pair: pair[1], reverse=True)
    best50 = OrderedDict(ranking[:50])  # keep only the 50 highest scores

    # --- report --------------------------------------------------------------
    print("Sequence,Predicted_AMP_Probability")
    for sequence, score in best50.items():
        print(sequence + "," + str(score))
Example #2
0
def main():
    """Train a SOM on virtual peptide libraries and project known AMPs onto it.

    Steps:
      1. Generate three virtual sub-libraries (Helices, Random, AMPngrams).
      2. Encode them with the 'pepcats' scale and a cross-correlation of
         window 7, then fit a 12x12 SOM on the result.
      3. Load the AMP-vs-TM dataset, encode it the same way, and write point,
         density, distance and per-class density maps to PNG files.
      4. Query the SOM for neighbours of one sequence of interest.

    The class labels in ``targets`` describe the rows of one specific matrix
    (virtual libraries stacked on the AMP rows). That same matrix is used for
    every label-dependent plot, so labels and rows always line up.

    :return: None; results are printed / written to image files.
    """
    # generate some virtual peptide sequences
    libnum = 1000  # 1000 sequences per sublibrary
    h = Helices(seqnum=libnum)
    r = Random(seqnum=libnum)
    n = AMPngrams(seqnum=libnum, n_min=4)
    h.generate_sequences()
    r.generate_sequences(proba='AMP')
    n.generate_sequences()

    # calculate molecular descriptors for the peptides
    d = PeptideDescriptor(seqs=np.hstack(
        (h.sequences, r.sequences, n.sequences)),
                          scalename='pepcats')
    d.calculate_crosscorr(window=7)

    # train a som on the descriptors and print / plot the training error
    som = SOM(x=12, y=12)
    som.fit(data=d.descriptor, epochs=100000, decay='hill')
    print("Fit error: %.4f" % som.error)
    som.plot_error_history(filename="som_error.png")

    # load known antimicrobial peptides (AMPs) and transmembrane sequences
    dataset = load_AMPvsTM()
    d2 = PeptideDescriptor(dataset.sequences, 'pepcats')
    d2.calculate_crosscorr(7)

    # NOTE(review): assumes the first 206 rows of the dataset are the
    # transmembrane sequences and the rest are AMPs - confirm against the
    # modlamp dataset documentation.
    n_tm = 206
    amp_descriptor = d2.descriptor[n_tm:]

    # One label per row of `labelled`; counts are taken from the data actually
    # present rather than from hard-coded sizes, so the two cannot drift apart.
    labelled = np.vstack((d.descriptor, amp_descriptor))
    targets = np.repeat(
        np.arange(4),
        [len(h.sequences), len(r.sequences), len(n.sequences),
         len(amp_descriptor)])
    names = ['Helices', 'Random', 'nGrams', 'AMP']

    # plot som maps with location of AMPs
    som.plot_point_map(labelled,
                       targets,
                       names,
                       filename="peptidesom.png")
    # the overall density map needs no labels, so it uses every sequence
    som.plot_density_map(np.vstack((d.descriptor, d2.descriptor)),
                         filename="density.png")
    som.plot_distance_map(colormap='Reds', filename="distances.png")

    # Per-class density maps. The labelled matrix is reused here: passing the
    # full AMP+TM matrix together with `targets` would attach the class-3
    # labels to the first 206 dataset rows instead of the rows shown in the
    # point map. np.unique yields the class ids in ascending order, so every
    # class is paired with a fixed colormap.
    colormaps = ['Oranges', 'Purples', 'Greens', 'Reds']
    for c, cmap in zip(np.unique(targets), colormaps):
        som.plot_class_density(labelled,
                               targets,
                               c,
                               names,
                               colormap=cmap,
                               filename='class%i.png' % c)

    # get neighboring peptides (AMPs / TMs) for a sequence of interest
    # NOTE(review): the return value of get_neighbors is discarded here.
    my_d = PeptideDescriptor(seqs='GLFDIVKKVVGALLAG', scalename='pepcats')
    my_d.calculate_crosscorr(window=7)
    som.get_neighbors(datapoint=my_d.descriptor,
                      data=d2.descriptor,
                      labels=dataset.sequences,
                      d=0)
Example #3
0
    if y == '1':
        class_in = 1
    elif y == '-1':
        class_in = 0

    out.write(x + ', ' + str(class_in))
    out.write('\n')
out.close()

# Load the reformatted data from 'formatted.csv' in the current working
# directory.
# NOTE(review): presumably the file written through `out` just above - confirm.
data = load_custom(os.getcwd() + '/formatted.csv')

# Create descriptors for the peptide sequences: 'pepArc' scale followed by a
# cross-correlation with window 4. The same encoding is applied again to the
# individual test samples further down, so both must stay in sync.
descr_temp = PeptideDescriptor(data.sequences, scalename='pepArc')
descr_temp.calculate_crosscorr(window=4)

# Develop the best 'RF' model on the full descriptor matrix and report its
# score with cross validation (cv=10).
# NOTE(review): presumably 'RF' selects a random forest and score_cv prints
# its results - verify against the helper implementations.
best_RF = train_best_model('RF', descr_temp.descriptor, data.target)
score_cv(best_RF, descr_temp.descriptor, data.target, cv=10)

# Collects one predict_proba result per test sample in the loop below.
y_pred = []

# get predictions for values
for i in range(0, 392):
    try:
        pep_descr = PeptideDescriptor(samples_test[i], scalename='pepArc')
        pep_descr.calculate_crosscorr(window=4)
        proba = best_RF.predict_proba(pep_descr.descriptor)
        y_pred.append(proba)
Example #4
0
from modlamp.descriptors import PeptideDescriptor
from modlamp.datasets import load_AMPvsTM
from som import SOM

# Generate three virtual peptide sub-libraries of equal requested size.
# NOTE(review): Helices, Random, AMPngrams and np are not imported in the
# visible lines - confirm they are imported elsewhere in the file.
libnum = 1000  # 1000 sequences per sublibrary
h = Helices(seqnum=libnum)
r = Random(seqnum=libnum)
n = AMPngrams(seqnum=libnum, n_min=4)
h.generate_sequences()
r.generate_sequences(proba='AMP')
n.generate_sequences()

# Calculate molecular descriptors for the peptides: all three sub-libraries
# are concatenated in the order Helices, Random, AMPngrams, then encoded with
# the 'pepcats' scale and a cross-correlation of window 7.
d = PeptideDescriptor(seqs=np.hstack((h.sequences, r.sequences, n.sequences)), scalename='pepcats')
d.calculate_crosscorr(window=7)

# Train a 12x12 SOM on the descriptors and print / plot the training error.
som = SOM(x=12, y=12)
som.fit(data=d.descriptor, epochs=100000, decay='hill')
print("Fit error: %.4f" % som.error)
som.plot_error_history(filename="som_error.png")

# Load known antimicrobial peptides (AMPs) and transmembrane sequences and
# encode them exactly like the training libraries.
dataset = load_AMPvsTM()
d2 = PeptideDescriptor(dataset.sequences, 'pepcats')
d2.calculate_crosscorr(7)
# Class labels: libnum entries each of 0, 1 and 2 (same order as the stacked
# sub-libraries in `d`), followed by 206 entries of class 3.
# NOTE(review): the labels assume each sub-library holds exactly libnum
# sequences and that 206 dataset rows get plotted as class 3 - confirm against
# the matrix these labels are paired with below.
targets = np.array(libnum*[0] + libnum*[1] + libnum*[2] + 206*[3])
names = ['Helices', 'Random', 'nGrams', 'AMP']

# plot som maps with location of AMPs