Example #1
import numpy as np
from modlamp.sequences import Helices, Random, AMPngrams
from modlamp.descriptors import PeptideDescriptor
from modlamp.datasets import load_AMPvsTM

from som import SOM


def main():
    # generate some virtual peptide sequences
    libnum = 1000  # 1000 sequences per sublibrary
    h = Helices(seqnum=libnum)
    r = Random(seqnum=libnum)
    n = AMPngrams(seqnum=libnum, n_min=4)
    h.generate_sequences()
    r.generate_sequences(proba='AMP')
    n.generate_sequences()

    # calculate molecular descriptors for the peptides
    d = PeptideDescriptor(seqs=np.hstack(
        (h.sequences, r.sequences, n.sequences)),
                          scalename='pepcats')
    d.calculate_crosscorr(window=7)

    # train a SOM on the descriptors and print / plot the training error
    som = SOM(x=12, y=12)
    som.fit(data=d.descriptor, epochs=100000, decay='hill')
    print("Fit error: %.4f" % som.error)
    som.plot_error_history(filename="som_error.png")

    # load known antimicrobial peptides (AMPs) and transmembrane sequences
    dataset = load_AMPvsTM()
    d2 = PeptideDescriptor(dataset.sequences, 'pepcats')
    d2.calculate_crosscorr(7)
    targets = np.array(libnum * [0] + libnum * [1] + libnum * [2] + 206 * [3])
    names = ['Helices', 'Random', 'nGrams', 'AMP']

    # plot som maps with location of AMPs
    som.plot_point_map(np.vstack((d.descriptor, d2.descriptor[206:])),
                       targets,
                       names,
                       filename="peptidesom.png")
    som.plot_density_map(np.vstack((d.descriptor, d2.descriptor)),
                         filename="density.png")
    som.plot_distance_map(colormap='Reds', filename="distances.png")

    colormaps = ['Oranges', 'Purples', 'Greens', 'Reds']
    for i, c in enumerate(set(targets)):
        som.plot_class_density(np.vstack((d.descriptor, d2.descriptor)),
                               targets,
                               c,
                               names,
                               colormap=colormaps[i],
                               filename='class%i.png' % c)

    # get neighboring peptides (AMPs / TMs) for a sequence of interest
    my_d = PeptideDescriptor(seqs='GLFDIVKKVVGALLAG', scalename='pepcats')
    my_d.calculate_crosscorr(window=7)
    som.get_neighbors(datapoint=my_d.descriptor,
                      data=d2.descriptor,
                      labels=dataset.sequences,
                      d=0)
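
# run the example when this file is executed as a script
if __name__ == '__main__':
    main()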
Example #2
import random

import numpy as np
from scipy.spatial import distance
from sklearn.preprocessing import StandardScaler

from modlamp.analysis import GlobalAnalysis
from modlamp.core import count_aas
from modlamp.descriptors import GlobalDescriptor, PeptideDescriptor
from modlamp.sequences import Helices, Random

# NOTE: _onehotencode is a project-local helper (not shown in this snippet); it is assumed
# to return the encoded input, a char-to-one-hot translation dict and the vocabulary.


class SequenceHandler(object):
    """ Class for handling peptide sequences, e.g. loading, one-hot encoding or decoding and saving """
    
    def __init__(self, window=0, step=2, refs=True):
        """
        :param window: {int} size of the window used for chopping up sequences; if 0, whole sequences are used
        :param step: {int} size of the steps to move the window forward
        :param refs: {bool} whether to generate reference sequence sets for analysis
        """
        self.sequences = None
        self.generated = None
        self.ran = None
        self.hel = None
        self.X = list()
        self.y = list()
        self.window = window
        self.step = step
        self.refs = refs
        # generate translation dictionary for one-hot encoding
        _, self.to_one_hot, self.vocab = _onehotencode('A')
    
    def load_sequences(self, filename):
        """ Method to load peptide sequences from a csv file

        :param filename: {str} filename of the sequence file to be read (``csv``, one sequence per line)
        :return: sequences in self.sequences
        """
        with open(filename) as f:
            self.sequences = [s.strip() for s in f]
        self.sequences = random.sample(self.sequences, len(self.sequences))  # shuffle sequences randomly
    
    def pad_sequences(self, pad_char=' ', padlen=0):
        """ Pad all sequences to the longest length (default, padlen=0) or a given length

        :param pad_char: {str} Character to pad sequences with
        :param padlen: {int} Custom length of padding to add to all sequences to (optional), default: 0. If
        0, sequences are padded to the length of the longest sequence in the training set. If a window is used and the
        padded sequence is shorter than the window size, it is padded to fit the window.
        """
        if padlen:
            padded_seqs = []
            for seq in self.sequences:
                if len(seq) < self.window:
                    padded_seq = seq + pad_char * (self.step + self.window - len(seq))
                else:
                    padded_seq = seq + pad_char * padlen
                padded_seqs.append(padded_seq)
        else:
            length = max([len(seq) for seq in self.sequences])
            padded_seqs = []
            for seq in self.sequences:
                padded_seq = 'B' + seq + pad_char * (length - len(seq))
                padded_seqs.append(padded_seq)
        
        if pad_char not in self.vocab:
            self.vocab += [pad_char]
        
        self.sequences = padded_seqs  # overwrite sequences with padded sequences
    
    def one_hot_encode(self, target='all'):
        """ Chop up loaded sequences into patterns of length ``window`` by moving by stepsize ``step`` and translate
        them with a one-hot vector encoding

        :param target: {str} whether all following amino acids should be learned or just the last ``step`` characters of each sequence (`all`, `one`)
        :return: one-hot encoded sequence patterns in self.X and corresponding target amino acids in self.y
        """
        if self.window == 0:
            for s in self.sequences:
                self.X.append([self.to_one_hot[char] for char in s[:-self.step]])
                if target == 'all':
                    self.y.append([self.to_one_hot[char] for char in s[self.step:]])
                elif target == 'one':
                    # one-hot encode only the last ``step`` characters as the target
                    self.y.append([self.to_one_hot[char] for char in s[-self.step:]])
            
            self.X = np.reshape(self.X, (len(self.X), len(self.sequences[0]) - self.step, len(self.vocab)))
            y_len = (len(self.sequences[0]) - self.step) if target == 'all' else self.step
            self.y = np.reshape(self.y, (len(self.y), y_len, len(self.vocab)))
        
        else:
            for s in self.sequences:
                for i in range(0, len(s) - self.window, self.step):
                    self.X.append([self.to_one_hot[char] for char in s[i: i + self.window]])
                    if target == 'all':
                        self.y.append([self.to_one_hot[char] for char in s[i + 1: i + self.window + 1]])
                    elif target == 'one':
                        # one-hot encode only the last ``step`` characters as the target
                        self.y.append([self.to_one_hot[char] for char in s[-self.step:]])
            
            self.X = np.reshape(self.X, (len(self.X), self.window, len(self.vocab)))
            y_len = self.window if target == 'all' else self.step
            self.y = np.reshape(self.y, (len(self.y), y_len, len(self.vocab)))
        
        print("\nData shape:\nX: " + str(self.X.shape) + "\ny: " + str(self.y.shape))
    
    def analyze_training(self):
        """ Method to analyze the distribution of the training data

        :return: prints out information about the length distribution of the sequences in ``self.sequences``
        """
        d = GlobalDescriptor(self.sequences)
        d.length()
        print("\nLENGTH DISTRIBUTION OF TRAINING DATA:\n")
        print("Number of sequences:    \t%i" % len(self.sequences))
        print("Mean sequence length:   \t%.1f ± %.1f" % (np.mean(d.descriptor), np.std(d.descriptor)))
        print("Median sequence length: \t%i" % np.median(d.descriptor))
        print("Minimal sequence length:\t%i" % np.min(d.descriptor))
        print("Maximal sequence length:\t%i" % np.max(d.descriptor))
    
    def analyze_generated(self, num, fname='analysis.txt', plot=False):
        """ Method to analyze the generated sequences located in `self.generated`.

        :param num: {int} wanted number of sequences to sample
        :param fname: {str} filename to save analysis info to
        :param plot: {bool} whether to plot an overview of descriptors
        :return: file with analysis info (distances)
        """
        with open(fname, 'w') as f:
            print("Analyzing...")
            f.write("ANALYSIS OF SAMPLED SEQUENCES\n==============================\n\n")
            f.write("Nr. of duplicates in generated sequences: %i\n" % (len(self.generated) - len(set(self.generated))))
            count = len(set(self.generated) & set(self.sequences))  # get shared entries in both lists
            f.write("%.1f percent of generated sequences are present in the training data.\n" %
                    ((count / len(self.generated)) * 100))
            d = GlobalDescriptor(self.generated)
            len1 = len(d.sequences)
            d.filter_aa('B')
            len2 = len(d.sequences)
            d.length()
            f.write("\n\nLENGTH DISTRIBUTION OF GENERATED DATA:\n\n")
            f.write("Number of sequences too short:\t%i\n" % (num - len1))
            f.write("Number of invalid (with 'B'):\t%i\n" % (len1 - len2))
            f.write("Number of valid unique seqs:\t%i\n" % len2)
            f.write("Mean sequence length:     \t\t%.1f ± %.1f\n" % (np.mean(d.descriptor), np.std(d.descriptor)))
            f.write("Median sequence length:   \t\t%i\n" % np.median(d.descriptor))
            f.write("Minimal sequence length:  \t\t%i\n" % np.min(d.descriptor))
            f.write("Maximal sequence length:  \t\t%i\n" % np.max(d.descriptor))
            
            descriptor = 'pepcats'
            seq_desc = PeptideDescriptor([s[1:].rstrip() for s in self.sequences], descriptor)
            seq_desc.calculate_autocorr(7)
            gen_desc = PeptideDescriptor(d.sequences, descriptor)
            gen_desc.calculate_autocorr(7)
            
            # random comparison set
            self.ran = Random(len(self.generated), np.min(d.descriptor), np.max(d.descriptor))  # generate rand seqs
            probas = count_aas(''.join(seq_desc.sequences)).values()  # get the aa distribution of training seqs
            self.ran.generate_sequences(proba=probas)
            ran_desc = PeptideDescriptor(self.ran.sequences, descriptor)
            ran_desc.calculate_autocorr(7)
            
            # amphipathic helices comparison set
            self.hel = Helices(len(self.generated), np.min(d.descriptor), np.max(d.descriptor))
            self.hel.generate_sequences()
            hel_desc = PeptideDescriptor(self.hel.sequences, descriptor)
            hel_desc.calculate_autocorr(7)
            
            # distance calculation
            f.write("\n\nDISTANCE CALCULATION IN '%s' DESCRIPTOR SPACE\n\n" % descriptor.upper())
            desc_dist = distance.cdist(gen_desc.descriptor, seq_desc.descriptor, metric='euclidean')
            f.write("Average euclidean distance of sampled to training data:\t%.3f +/- %.3f\n" %
                    (np.mean(desc_dist), np.std(desc_dist)))
            ran_dist = distance.cdist(ran_desc.descriptor, seq_desc.descriptor, metric='euclidean')
            f.write("Average euclidean distance if randomly sampled seqs:\t%.3f +/- %.3f\n" %
                    (np.mean(ran_dist), np.std(ran_dist)))
            hel_dist = distance.cdist(hel_desc.descriptor, seq_desc.descriptor, metric='euclidean')
            f.write("Average euclidean distance if amphipathic helical seqs:\t%.3f +/- %.3f\n" %
                    (np.mean(hel_dist), np.std(hel_dist)))
            
            # simpler global descriptors
            g_seq = GlobalDescriptor(seq_desc.sequences)
            g_gen = GlobalDescriptor(gen_desc.sequences)
            g_ran = GlobalDescriptor(ran_desc.sequences)
            g_hel = GlobalDescriptor(hel_desc.sequences)
            g_seq.calculate_all()
            g_gen.calculate_all()
            g_ran.calculate_all()
            g_hel.calculate_all()
            sclr = StandardScaler()
            sclr.fit(g_seq.descriptor)
            f.write("\n\nDISTANCE CALCULATION FOR SCALED GLOBAL DESCRIPTORS\n\n")
            desc_dist = distance.cdist(sclr.transform(g_gen.descriptor), sclr.transform(g_seq.descriptor),
                                       metric='euclidean')
            f.write("Average euclidean distance of sampled to training data:\t%.2f +/- %.2f\n" %
                    (np.mean(desc_dist), np.std(desc_dist)))
            ran_dist = distance.cdist(sclr.transform(g_ran.descriptor), sclr.transform(g_seq.descriptor),
                                      metric='euclidean')
            f.write("Average euclidean distance if randomly sampled seqs:\t%.2f +/- %.2f\n" %
                    (np.mean(ran_dist), np.std(ran_dist)))
            hel_dist = distance.cdist(sclr.transform(g_hel.descriptor), sclr.transform(g_seq.descriptor),
                                      metric='euclidean')
            f.write("Average euclidean distance if amphipathic helical seqs:\t%.2f +/- %.2f\n" %
                    (np.mean(hel_dist), np.std(hel_dist)))
            
            # hydrophobic moments
            uh_seq = PeptideDescriptor(seq_desc.sequences, 'eisenberg')
            uh_seq.calculate_moment()
            uh_gen = PeptideDescriptor(gen_desc.sequences, 'eisenberg')
            uh_gen.calculate_moment()
            uh_ran = PeptideDescriptor(ran_desc.sequences, 'eisenberg')
            uh_ran.calculate_moment()
            uh_hel = PeptideDescriptor(hel_desc.sequences, 'eisenberg')
            uh_hel.calculate_moment()
            f.write("\n\nHYDROPHOBIC MOMENTS\n\n")
            f.write("Hydrophobic moment of training seqs:\t%.3f +/- %.3f\n" %
                    (np.mean(uh_seq.descriptor), np.std(uh_seq.descriptor)))
            f.write("Hydrophobic moment of sampled seqs:\t\t%.3f +/- %.3f\n" %
                    (np.mean(uh_gen.descriptor), np.std(uh_gen.descriptor)))
            f.write("Hydrophobic moment of random seqs:\t\t%.3f +/- %.3f\n" %
                    (np.mean(uh_ran.descriptor), np.std(uh_ran.descriptor)))
            f.write("Hydrophobic moment of amphipathic seqs:\t%.3f +/- %.3f\n" %
                    (np.mean(uh_hel.descriptor), np.std(uh_hel.descriptor)))
        
        if plot:
            if self.refs:
                a = GlobalAnalysis([uh_seq.sequences, uh_gen.sequences, uh_hel.sequences, uh_ran.sequences],
                                   ['training', 'sampled', 'hel', 'ran'])
            else:
                a = GlobalAnalysis([uh_seq.sequences, uh_gen.sequences], ['training', 'sampled'])
            a.plot_summary(filename=fname[:-4] + '.png')
    
    def save_generated(self, logdir, filename):
        """ Save all sequences in `self.generated` to file

        :param logdir: {str} current log directory (used for comparison sequences)
        :param filename: {str} filename to save the sequences to
        :return: saved file
        """
        with open(filename, 'w') as f:
            for s in self.generated:
                f.write(s + '\n')
        
        if self.ran and self.hel:  # reference sets only exist after analyze_generated has been run
            self.ran.save_fasta(logdir + '/random_sequences.fasta')
            self.hel.save_fasta(logdir + '/helical_sequences.fasta')
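
A minimal usage sketch for this class; the csv file name and the window/step values below are hypothetical:

handler = SequenceHandler(window=20, step=1)
handler.load_sequences('training_sequences.csv')  # hypothetical file with one sequence per line
handler.pad_sequences()                           # pad to the longest sequence, prefix with 'B'
handler.one_hot_encode(target='all')              # fills handler.X and handler.y
handler.analyze_training()                        # print the length distribution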
Example #3
#! /usr/bin/env python
# -*- coding: utf-8 -*-

import numpy as np
from modlamp.sequences import Helices, Random, AMPngrams
from modlamp.descriptors import PeptideDescriptor
from modlamp.datasets import load_AMPvsTM
from som import SOM

# generate some virtual peptide sequences
libnum = 1000  # 1000 sequences per sublibrary
h = Helices(seqnum=libnum)
r = Random(seqnum=libnum)
n = AMPngrams(seqnum=libnum, n_min=4)
h.generate_sequences()
r.generate_sequences(proba='AMP')
n.generate_sequences()

# calculate molecular descriptors for the peptides
d = PeptideDescriptor(seqs=np.hstack((h.sequences, r.sequences, n.sequences)), scalename='pepcats')
d.calculate_crosscorr(window=7)

# train a SOM on the descriptors and print / plot the training error
som = SOM(x=12, y=12)
som.fit(data=d.descriptor, epochs=100000, decay='hill')
print("Fit error: %.4f" % som.error)
som.plot_error_history(filename="som_error.png")

# load known antimicrobial peptides (AMPs) and transmembrane sequences
dataset = load_AMPvsTM()
d2 = PeptideDescriptor(dataset.sequences, 'pepcats')