def get_peptide_values(list_peptides, descriptor_name):
    """
    Calculate the moment of a descriptor scale for every peptide.

    A ``PeptideDescriptor`` is built from the inputs, ``calculate_moment()``
    is run on it, and the first column of the resulting descriptor is
    collected.

    :param list_peptides: List of amino acid peptides
    :param descriptor_name: MODLamp-prescribed descriptor name
    :return: list with the first descriptor value of each peptide in the input list
    """
    peptide_descriptor = PeptideDescriptor(list_peptides, descriptor_name)
    peptide_descriptor.calculate_moment()

    moments = []
    for row in peptide_descriptor.descriptor:
        moments.append(row[0])
    return moments
Beispiel #2
0
 def calc_H(self, scale='eisenberg'):
     """Calculate the global hydrophobicity of every entry in the library.

     For each index along the first axis of ``self.library`` a
     :class:`modlamp.descriptors.PeptideDescriptor` is created with the given
     scale and its ``calculate_global()`` result is collected.

     :param scale: {str} hydrophobicity scale to use. For available scales,
         see :class:`modlamp.descriptors.PeptideDescriptor`.
     :return: None; the first descriptor column of each library entry is appended
         to the attribute :py:attr:`H`.

     .. seealso:: :func:`modlamp.descriptors.PeptideDescriptor.calculate_global()`
     """
     n_entries = self.library.shape[0]
     for idx in range(n_entries):
         global_desc = PeptideDescriptor(self.library[idx], scale)
         global_desc.calculate_global()
         hydrophobicity = global_desc.descriptor[:, 0]
         self.H.append(hydrophobicity)
Beispiel #3
0
class TestCore(unittest.TestCase):
    """Tests for the sequence helpers shared by BaseSequence / BaseDescriptor / PeptideDescriptor.

    All fixtures below are class attributes, created once when the class body is
    executed and shared by every test method. Several tests mutate these shared
    objects in place.
    """
    # Fixture with hand-written sequences: a duplicated entry, a 26-letter
    # alphabet string and a lower-case entry.
    b = BaseSequence(1, 10, 20)
    b.sequences = [
        'GLFDIVKKVVGALG', 'GLFDIVKKVVGALG', 'GLFDIVKKVVGALK',
        'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'AGGURST', 'aggo'
    ]
    # Single sequence used for n-gram counting.
    n = BaseDescriptor('GLFDIVKKVVGALGSLGLFDIVKKVVGALGSL')
    b.names = ['1', '2', '3', '4', '5', '6']
    # Descriptor built from (almost) the same sequences as ``b``; shares its names.
    s = PeptideDescriptor([
        'GLFDIVKKVVGALG', 'GLFDIVKKVVGALG', 'GLFDIVKKVVGALK',
        'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'AGGURST', 'aggorst'
    ])
    s.names = b.names
    # Randomly generated sequences with an Eisenberg moment descriptor.
    l = Random(100, 7, 28)
    l.generate_sequences()
    d = PeptideDescriptor(l.sequences, 'eisenberg')
    d.calculate_moment()

    def test_ngrams(self):
        # Count 2- and 3-grams and check the count stored under 'ALG'.
        self.n.count_ngrams([2, 3])
        self.assertEqual(self.n.descriptor['ALG'], 2)

    def test_filter_aa(self):
        # Mutates the shared ``b`` fixture.
        self.b.filter_aa(['C'])
        self.assertEqual(len(self.b.sequences), 5)

    def test_filter_duplicates(self):
        # NOTE(review): the expected length of 4 appears to rely on
        # test_filter_aa having already shrunk the shared ``b`` fixture
        # (unittest runs methods in alphabetical order) -- confirm.
        self.b.filter_duplicates()
        self.assertEqual(len(self.b.sequences), 4)

    def test_keep_natural_aa(self):
        # The alphabet string must be present before and absent after filtering.
        self.assertIn('ABCDEFGHIJKLMNOPQRSTUVWXYZ', self.s.sequences)
        self.s.keep_natural_aa()
        self.assertNotIn('ABCDEFGHIJKLMNOPQRSTUVWXYZ', self.s.sequences)

    def test_mutate(self):
        # After mutation the first sequence must differ from its initial value.
        self.b.mutate_AA(2, 1.)
        self.assertNotEqual('GLFDIVKKVVGALG', self.b.sequences[0])

    def test_rand_selection(self):
        # Sequences and descriptor rows must both be reduced to 10 entries.
        self.d.random_selection(10)
        self.assertEqual(len(self.d.sequences), 10)
        self.assertEqual(len(self.d.descriptor), 10)

    def test_safe_fasta(self):
        # Smoke test only: no assertions, passes as long as neither call raises.
        # Both calls write to the same file next to this test module.
        self.d.save_fasta(join(dirname(__file__), 'files/saved.fasta'),
                          names=True)
        self.d.save_fasta(join(dirname(__file__), 'files/saved.fasta'),
                          names=False)
Beispiel #4
0
 def calc_uH(self, window=1000, angle=100, modality='max'):
     """Calculate the hydrophobic moment (Eisenberg scale) of every entry in the library.

     For each index along the first axis of ``self.library`` a
     :class:`modlamp.descriptors.PeptideDescriptor` with the ``'eisenberg'`` scale is
     created and ``calculate_moment()`` is called with the arguments given here.

     :param window: {int} amino acid window in which to calculate the moment. If the sequence is shorter than the
         window, the length of the sequence is taken. So with the default window of 1000, the **global**
         hydrophobic moment is calculated for all sequences shorter than 1000. Otherwise, the maximal
         hydrophobic moment for the chosen window size found in the sequence is returned.
     :param angle: {int} angle in which to calculate the moment. **100** for alpha helices, **180** for beta sheets.
     :param modality: {'max' or 'mean'} calculate respectively maximum or mean hydrophobic moment.
     :return: None; the first descriptor column of each library entry is appended
         to the attribute :py:attr:`uH`.

     .. seealso:: :func:`modlamp.descriptors.PeptideDescriptor.calculate_moment()`
     """
     n_entries = self.library.shape[0]
     for idx in range(n_entries):
         moment_desc = PeptideDescriptor(self.library[idx], 'eisenberg')
         moment_desc.calculate_moment(window=window, angle=angle, modality=modality)
         moments = moment_desc.descriptor[:, 0]
         self.uH.append(moments)
def main(libsize=1000):
    """Train a Random Forest on PEPCATS descriptors and screen a generated library.

    The classifier is fitted on the data returned by ``load_AMPvsUniProt()``,
    a ``MixedLibrary`` is generated and scored, and the 50 sequences with the
    highest predicted probability for class index 1 are printed as CSV lines.

    :param libsize: value passed to ``MixedLibrary`` when creating the library
    :return: None
    """
    # training sequences and their targets
    training = load_AMPvsUniProt()

    # PEPCATS cross-correlation descriptor (window 7) of the training sequences
    train_desc = PeptideDescriptor(training.sequences, 'pepcats')
    train_desc.calculate_crosscorr(7)

    # Random Forest with out-of-bag scoring, fitted on the training descriptor
    forest = RandomForestClassifier(n_estimators=500, oob_score=True, n_jobs=-1)
    forest.fit(train_desc.descriptor, training.target)
    print("RandomForest OOB classifcation score: %.3f" % forest.oob_score_)

    # library to screen
    library = MixedLibrary(libsize)
    library.generate_sequences()
    n_library = len(library.sequences)
    print("Actual lirutal library size (without duplicates): %i" % n_library)

    # same descriptor for the library members
    lib_desc = PeptideDescriptor(library.sequences, 'pepcats')
    lib_desc.calculate_crosscorr(7)

    # class probabilities; column 1 is used as the ranking score
    class_proba = forest.predict_proba(lib_desc.descriptor)
    score_by_sequence = dict(zip(library.sequences, class_proba[:, 1]))

    # keep the 50 highest-scoring sequences, best first
    ranked = sorted(score_by_sequence.items(),
                    key=lambda pair: pair[1],
                    reverse=True)
    top50 = OrderedDict(ranked[:50])

    print("Sequence,Predicted_AMP_Probability")
    for sequence, score in top50.items():
        print(sequence + "," + str(score))
Beispiel #6
0
def main(infolder, outfolder):
    """Refit a pickled Random Forest on Pos/Neg peptides and rank a peptide library.

    Files read from ``infolder``: ``Pos.fasta``, ``Neg.fasta``, ``Lib.fasta``,
    ``scaler.p`` and ``classifier.p``. Files written to ``outfolder``:
    ``library_pred.csv``, a timestamped ``...synthesis_selection.csv`` holding the
    10 top-ranked library members, and timestamped ``-scaler.p`` /
    ``-classifier.p`` pickles.

    Every ``print`` call uses the single-argument function form, which produces
    the same output under Python 2 and Python 3.

    :param infolder: input directory; file names are appended with ``'/'``
    :param outfolder: output directory; file names are appended with ``'/'``
    :return: None
    """
    descriptor = 'PPCALI'

    print("RF Peptide Learning Info\n========================\n")
    print(datetime.now().strftime("%Y-%m-%d_%H-%M") + "\n")
    print("INPUT:\nInputfolder is\t%s\nOutputfolder is\t%s\nDescriptor is\t%s , auto-correlated (window 7)\n" %
          (infolder, outfolder, descriptor))

    # -------------------------------- TRAINING --------------------------------
    print("LOG:\nLoading data...")
    Pos = PeptideDescriptor(infolder + '/Pos.fasta', descriptor)
    Pos.filter_duplicates()
    Neg = PeptideDescriptor(infolder + '/Neg.fasta', descriptor)
    Neg.filter_duplicates()
    # target vector: 1 for every Pos sequence, 0 for every Neg sequence
    targets = np.array(len(Pos.sequences) * [1] + len(Neg.sequences) * [0])

    # Descriptor calculation
    print("Calculating %s descriptor..." % descriptor)
    train_desc = PeptideDescriptor(Pos.sequences + Neg.sequences, descriptor)
    train_desc.calculate_autocorr(7)

    # Standard Scaling
    # NOTE(review): unpickling executes arbitrary code; only point ``infolder``
    # at trusted files.
    print("Loading prefitted scaler and standard scaling %s descriptor..." % descriptor)
    # pickle streams are bytes: open in binary mode and close the handle again
    with open(infolder + '/scaler.p', 'rb') as handle:
        scaler = pickle.load(handle)
    train_X = scaler.transform(train_desc.descriptor)

    # Classifier
    print("Loading pretrained classifier...")
    with open(infolder + '/classifier.p', 'rb') as handle:
        clf = pickle.load(handle)

    # fitting classifier
    print("Fitting Random Forest classifier...")
    clf.fit(train_X, targets)
    fit_leafs = clf.apply(train_X)
    print("\tRF out-of-bag score: %.2f" % clf.oob_score_)

    # -------------------------------- LIBRARY --------------------------------
    # Loading library
    print("Loading sequence library...")
    Lib = PeptideDescriptor(infolder + '/Lib.fasta', descriptor)
    # the first three characters of each sequence name are used as class label
    class_labels = [name[:3] for name in Lib.names]

    print("\tLibrary size: %i" % len(Lib.sequences))
    print("\tLibrary composition is:\n\t\thel: %i\n\t\tasy: %i\n\t\tnCM: %i" % (class_labels.count('hel'),
                                                                                class_labels.count('asy'),
                                                                                class_labels.count('nCM')))

    # Calculating descriptors for library members
    print("Calculating %s descriptor for library..." % descriptor)
    lib_desc = PeptideDescriptor(Lib.sequences, descriptor)
    lib_desc.calculate_autocorr(7)

    # scaling the library descriptor with the same prefitted scaler
    print("Standard scaling %s descriptor for library..." % descriptor)
    X = scaler.transform(lib_desc.descriptor)

    # -------------------------------- PREDICTING --------------------------------
    # get single tree predictions and their variance across trees
    print("Predicting single tree results, standard deviation and entropy for library...")
    start = time.time()
    preds = get_tree_pred(clf, X)

    print("Predicting class probabilities for library...")
    probas = clf.predict_proba(X)
    probas = probas[:, 1].tolist()
    variance = np.var(preds, axis=1)
    print("\tPredictions took %.1f s" % (time.time() - start))

    # calculate similarity of library members to training data
    print("Calculating Random Forest similarity (cosine)...")
    start = time.time()
    # leaf indices the library samples end up in -> RF intrinsic similarity measure
    lib_leafs = clf.apply(X)
    D_RF = pairwise_distances(lib_leafs, fit_leafs, metric='cosine')
    RF_dist = D_RF.mean(axis=1).tolist()
    print("\tDistance calculation took %.1f s" % (time.time() - start))

    # scaling all output features
    print("Min-Max scaling outputs...")
    sclr = MinMaxScaler()
    # ravel() (instead of squeeze()) keeps a one-element list for a library of
    # size 1, so the zip() below still works in that case
    variance = sclr.fit_transform(variance.reshape(-1, 1)).ravel().tolist()
    RF_dist = sclr.fit_transform(np.array(RF_dist).reshape(-1, 1)).ravel().tolist()

    # construct final list with all values (prediction, RF_dist, var, sum)
    print("Creating result dictionaries...")
    # dens-weight + proba
    sums = [0.5 * (var * (1 - dist) + proba) for var, dist, proba in zip(variance, RF_dist, probas)]

    # create data frame with all values, best weighted sum first
    d = pd.DataFrame({'Class': class_labels, 'Prediction': probas, 'RFSimilarity': RF_dist, 'TreeVariance': variance,
                      'WeighedSum': sums}, index=Lib.sequences)
    d.index.name = 'Sequence'
    d = d[['Class', 'Prediction', 'RFSimilarity', 'TreeVariance', 'WeighedSum']].sort_values('WeighedSum',
                                                                                           ascending=False)

    # get top 10 predictions according to the weighted sum
    synth_sele = d[:10]

    # writing output; one timestamp so all files of this run share the same prefix
    print("Saving output files to output directory...")
    timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M")
    synth_sele.to_csv(outfolder + '/' + timestamp + 'synthesis_selection.csv')
    d.to_csv(outfolder + '/library_pred.csv')

    # saving scaler and classifier to pickle file for later usage; the '/'
    # separator keeps the pickles inside ``outfolder`` like the CSV files
    # NOTE(review): ``sclr`` is the MinMaxScaler last fitted on RF_dist, not the
    # standard ``scaler`` loaded above -- confirm this is the object meant to be saved.
    with open(outfolder + '/' + timestamp + '-scaler.p', 'wb') as handle:
        pickle.dump(sclr, handle)
    with open(outfolder + '/' + timestamp + '-classifier.p', 'wb') as handle:
        pickle.dump(clf, handle)

    # ``globstart`` is not defined in this function; presumably a module-level
    # start time -- verify it is set before main() is called.
    print("Total runtime: %.1f s\n" % (time.time() - globstart))
    print("\nALL DONE SUCCESSFULLY")
    print("Look for your results file in %s\nAnd maybe save this terminal output to a logfile ;-)" % outfolder)
Beispiel #7
0
args = parser.parse_args()

# Read the whole input file up front; the context manager closes the handle
# as soon as the lines are in memory.
with open(args.InFile) as file:
    lines = file.readlines()

Index = []  # lines containing '>' (FASTA headers)
Pep = []  # all remaining lines, each treated as one peptide sequence

for line in lines:
    if '>' in line:
        Index.append(line.strip('\n'))
    else:
        # drop line terminators (LF and CR) around the sequence
        line = line.strip('\n')
        line = line.strip('\r')
        Pep.append(line)

df = pd.DataFrame()

# One row of n-gram counts per sequence; columns are the n-grams seen so far.
for i, l in enumerate(Pep):

    D = PeptideDescriptor(l)
    D.count_ngrams([int(args.Ngrams)])

    df1 = pd.DataFrame(D.descriptor, index=[
        "sequence" + str(i),
    ])
    df = pd.concat([df, df1], axis=0)

# n-grams missing from a sequence show up as NaN after concatenation -> count 0
df = df.fillna(0)
df.to_csv(args.OutFile, sep='\t', index=None)
from sklearn.metrics.pairwise import pairwise_distances

from modlamp.descriptors import PeptideDescriptor


def get_tree_pred(model, X):
    """Collect the class-1 probability predicted by every single tree of an ensemble.

    :param model: fitted ensemble exposing its members in ``estimators_``; each
        member must provide ``predict_proba(X, check_input=False)``
    :param X: array-like with ``shape`` and ``astype``; one row per sample
    :return: array of shape ``(X.shape[0], len(model.estimators_))`` where column
        ``i`` holds column 1 of the probabilities predicted by tree ``i``
    """
    preds = np.empty((X.shape[0], len(model.estimators_)))
    # Convert once instead of once per tree; input checking is disabled below,
    # so the trees receive exactly this float32 array.
    X32 = X.astype('float32')
    for i, tree in enumerate(model.estimators_):
        preds[:, i] = tree.predict_proba(X32, check_input=False)[:, 1]
    return preds


# Load the Pos and Neg FASTA files with the PPCALI scale and keep only
# sequences made of natural amino acids.
Pos = PeptideDescriptor('/Users/modlab/y/pycharm/activelearning/retrospective/input/B/Pos.fasta', 'PPCALI')
Pos.keep_natural_aa()
Neg = PeptideDescriptor('/Users/modlab/y/pycharm/activelearning/retrospective/input/B/Neg.fasta', 'PPCALI')
Neg.keep_natural_aa()

# Target vector: one 1 per Pos sequence followed by one 0 per Neg sequence.
y = np.array([1] * len(Pos.sequences) + [0] * len(Neg.sequences))

# Auto-correlated (window 7) PPCALI descriptor of all sequences, Pos first.
Data = PeptideDescriptor(Pos.sequences + Neg.sequences, 'PPCALI')
Data.calculate_autocorr(7)

# Fit a standard scaler on the descriptor and scale it in one step.
scaler = StandardScaler()
X = scaler.fit_transform(Data.descriptor)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Script to calculate different peptide descriptors for a given sequences.fasta file and save them to two files.
"""

from modlamp.descriptors import PeptideDescriptor, GlobalDescriptor

# Build both descriptor objects from the same sequence file
pepdesc = PeptideDescriptor('/path/to/sequences.fasta', 'Eisenberg')  # Eisenberg consensus scale
globdesc = GlobalDescriptor('/path/to/sequences.fasta')

# --------------- Peptide Descriptor (AA scales) Calculations ---------------
# Eisenberg scale: global hydrophobicity first, then the hydrophobic moment appended
pepdesc.calculate_global()
pepdesc.calculate_moment(append=True)

# GRAVY scale: global hydrophobicity and hydrophobic moment, both appended
pepdesc.load_scale('gravy')
pepdesc.calculate_global(append=True)
pepdesc.calculate_moment(append=True)

# old Z scale: window-1 autocorrelation (= global Z scale values), appended
pepdesc.load_scale('z3')
pepdesc.calculate_autocorr(1, append=True)

# write everything calculated so far to a .csv file with the header below
col_names1 = 'ID,Sequence,H_Eisenberg,uH_Eisenberg,H_GRAVY,uH_GRAVY,Z3_1,Z3_2,Z3_3'
pepdesc.save_descriptor('/path/to/descriptors1.csv', header=col_names1)

# --------------- Global Descriptor Calculations ---------------
globdesc.length()  # sequence length
Beispiel #10
0
_SERVICE1_DIR = "../src/public/jobs/service1/"
_SERVICE1_FASTA = _SERVICE1_DIR + "service1.fasta"

# (result key, GlobalDescriptor method, keyword arguments, format as 4-decimal float?)
_GLOBAL_PROPERTIES = (
    ("formula", "formula", {"amide": True}, False),
    ("molecular_weigth", "calculate_MW", {"amide": True}, True),
    ("boman_index", "boman_index", {}, True),
    ("charge", "calculate_charge", {"ph": 7, "amide": True}, True),
    ("charge_density", "charge_density", {"ph": 7, "amide": True}, True),
    ("isoelectric_point", "isoelectric_point", {}, True),
    ("instability_index", "instability_index", {}, True),
    ("aromaticity", "aromaticity", {}, True),
    ("aliphatic_index", "aliphatic_index", {}, True),
    ("hydrophobic_ratio", "hydrophobic_ratio", {}, True),
)

# (result key, PeptideDescriptor method, keyword arguments); all on the Eisenberg scale
_PEPTIDE_PROPERTIES = (
    ("hydrophobicity_profile", "calculate_profile", {"prof_type": "H"}),
    ("hydrophobic_profile", "calculate_profile", {"prof_type": "uH"}),
    ("calculate_moment", "calculate_moment", {}),
)


def _first_descriptor_value(make_descriptor, method_name, kwargs, as_float=True):
    """Run one descriptor calculation and return ``descriptor[0][0]``, or "-" if anything fails."""
    try:
        desc = make_descriptor()
        getattr(desc, method_name)(**kwargs)
        value = desc.descriptor[0][0]
        # round-trip through "%.4f" to keep four decimals
        return float("%.4f" % value) if as_float else value
    except Exception:
        # best effort: a failing property is reported as "-" instead of aborting
        return "-"


def _describe_record(record):
    """Return the property dict (length plus all descriptor values) for one FASTA record."""
    props = {"length": len(record.seq)}
    for key, method_name, kwargs, as_float in _GLOBAL_PROPERTIES:
        props[key] = _first_descriptor_value(
            lambda: GlobalDescriptor(str(record.seq)), method_name, kwargs, as_float)
    for key, method_name, kwargs in _PEPTIDE_PROPERTIES:
        props[key] = _first_descriptor_value(
            lambda: PeptideDescriptor(str(record.seq), scalename='Eisenberg'), method_name, kwargs)
    return props


def exec(peptide, time_node):
    """Calculate peptide properties for FASTA text and, for a single record, plot it.

    The text is written to a fixed job file, parsed as FASTA and every record is
    described by its length and a set of GlobalDescriptor / PeptideDescriptor
    values. A property whose calculation fails is reported as ``"-"``.
    If the input holds exactly one record, the directory
    ``../src/public/jobs/service1/<time_node>`` is created and ``profile.png``
    and ``helical.png`` are written into it.

    NOTE(review): all calls share the same job file path; confirm that this
    function is never run concurrently.

    :param peptide: FASTA-formatted text
    :param time_node: name of the per-job output directory (string)
    :return: ``"error"`` if no FASTA record could be parsed, otherwise a dict
        mapping each record id to its property dict
    """
    with open(_SERVICE1_FASTA, "w") as handle:
        handle.write(peptide)

    # parse once and reuse the records for the emptiness check, counting and processing
    records = list(SeqIO.parse(_SERVICE1_FASTA, "fasta"))
    if not records:
        return "error"

    properties = {}
    for record in records:
        properties[str(record.id)] = _describe_record(record)

    if len(records) == 1:
        record = records[0]
        job_dir = _SERVICE1_DIR + time_node
        try:
            os.mkdir(job_dir)
        except OSError:
            # e.g. the directory already exists; plotting is still attempted
            print("Error")

        # generate plot profile
        plot_profile(str(record.seq), scalename='eisenberg', filename=job_dir + "/profile.png")

        # generate helical wheel
        helical_wheel(str(record.seq), colorcoding='charge', lineweights=False,
                      filename=job_dir + "/helical.png")

    return properties
def describe_sequences():
    """Build a feature matrix for a fixed list of test peptides, print it and save it.

    Each row is the concatenation of: peptide id (0 when the entry has no "id"),
    the PeptideDescriptor values, the GlobalDescriptor values, the sequence
    length, the single-residue frequencies and the activity flag (1 for "YES",
    else 0). The matrix is saved with ``np.save`` under the name
    ``hemolytik_array_custom_tests``.
    """
    aa_letters = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']
    di_letters = [first + second for first in aa_letters for second in aa_letters]
    letters = {1: aa_letters, 2: di_letters}

    def counter(string, seq_type):
        '''
        Count residues (seq_type 1) or overlapping residue pairs (seq_type 2).

        Returns a dict mapping each symbol to its count divided by the number
        of positions examined. Symbols outside ``letters[seq_type]`` are
        counted as well.
        '''
        length = len(string)
        counts = dict.fromkeys(letters[seq_type], 0)
        if seq_type == 1:
            for symbol in string:
                counts[symbol] = counts.get(symbol, 0) + 1.0
            counts = {key: value / length for key, value in counts.items()}
        if seq_type == 2:
            for start in range(length - 1):
                pair = string[start:start + seq_type]
                counts[pair] = counts.get(pair, 0) + 1.0
            counts = {key: value / (length - 1) for key, value in counts.items()}
        return counts

    def residue_distribution(all_residues, seq_type):
        '''
        Takes as arguments a string with letters, and the type of sequence represented.
        Returns a 1 x N array of relative frequencies, ordered alphabetically by symbol.
        '''
        freqs = counter(all_residues, seq_type)
        # only the known symbols are kept, which drops ambiguous letters
        ordered = [freqs[symbol] for symbol in sorted(letters[seq_type])]
        return np.array([ordered])

    peptides = [
        {"seq": "FLPILASLAAKFGPKLFCLVTKKC", "cTer": None, "activity": "YES"},
        {"seq": "ILGPVISTIGGVLGGLLKNL", "cTer": "Amidation", "activity": "YES"},
        {"seq": "GIGGKILSGLKTALKGAAKELASTYLH", "cTer": None, "activity": "NO"},
        {"seq": "GIGSAILSAGKSALKGLAKGLAEHFAN", "cTer": None, "activity": "NO"},
        {"seq": "FLSLIPHAINAVSAIAKHF", "cTer": "Amidation", "activity": "NO"},
    ]

    # scales loaded after "eisenberg": (scale name, also append the moment?)
    extra_scales = (
        ("Ez", False),
        ("charge_phys", True),
        ("flexibility", True),
        ("polarity", True),
        ("isaeci", False),
        ("refractivity", True),
        ("z5", False),
    )

    for peptide in peptides:
        sequence = peptide["seq"]

        globdesc = GlobalDescriptor(sequence)
        globdesc.calculate_all(amide=peptide["cTer"] == "Amidation")

        # Eisenberg scale first: global value, then the moment appended
        pepdesc = PeptideDescriptor(sequence, "eisenberg")
        pepdesc.calculate_global()
        pepdesc.calculate_moment(append=True)

        # every further scale appends its moment (where requested) and its global value
        for scale_name, with_moment in extra_scales:
            pepdesc.load_scale(scale_name)
            if with_moment:
                pepdesc.calculate_moment(append=True)
            pepdesc.calculate_global(append=True)

        combined = np.concatenate((pepdesc.descriptor, globdesc.descriptor), axis=1)
        peptide["TotalDescriptor"] = str(combined)

        if "id" in peptide:
            pepid = np.array([[int(peptide["id"].replace("HEMOLYTIK", ""))]])
        else:
            pepid = np.array([[0]])

        freq_1d = residue_distribution(sequence, 1)
        # pair frequencies and di2() are still evaluated but not part of the row
        freq_2d = residue_distribution(sequence, 2)

        len_peptide = np.array([[len(sequence)]])

        pepact = np.array([[1 if peptide["activity"] == "YES" else 0]])

        peptide_di2 = di2(sequence)

        row_parts = (pepid, pepdesc.descriptor, globdesc.descriptor, len_peptide, freq_1d, pepact)
        peptide["array"] = np.concatenate(row_parts, axis=1)

    rows = [peptide["array"] for peptide in peptides]
    x = np.concatenate(rows, axis=0)
    print(x)

    np.save("hemolytik_array_custom_tests", x, allow_pickle=False)
class TestPeptideDescriptor(unittest.TestCase):
    """Checks PeptideDescriptor calculations against stored reference values.

    The descriptor objects and reference arrays are class attributes that are
    created once and shared by all test methods.
    """

    # descriptors of the same sequence on two different scales
    D = PeptideDescriptor('GLFDIVKKVVGALG', 'pepcats')
    A = PeptideDescriptor('GLFDIVKKVVGALG', 'peparc')

    # reference values for calculate_autocorr(7) on D
    data_ac = np.array([
        0.714285714286, 0.0714285714286, 0.0714285714286, 0.142857142857,
        0.142857142857, 0.0714285714286, 0.538461538462, 0.0, 0.0,
        0.0769230769231, 0.0769230769231, 0.0, 0.5, 0.0, 0.0, 0.0, 0.0, 0.0,
        0.636363636364, 0.0, 0.0, 0.0, 0.0, 0.0, 0.6, 0.0, 0.0, 0.0, 0.0, 0.0,
        0.555555555556, 0.0, 0.0, 0.0, 0.0, 0.0, 0.5, 0.0, 0.0, 0.0, 0.0, 0.0
    ])
    # reference values for calculate_crosscorr(7) on D
    data_cc = np.array([
        0.714285714286, 0.538461538462, 0.5, 0.636363636364, 0.6,
        0.555555555556, 0.5, 0.0714285714286, 0.0769230769231, 0.0, 0.0, 0.0,
        0.0, 0.0, 0.0, 0.0769230769231, 0.0833333333333, 0.0, 0.0, 0.0, 0.0,
        0.142857142857, 0.153846153846, 0.166666666667, 0.0909090909091, 0.1,
        0.222222222222, 0.125, 0.142857142857, 0.153846153846, 0.166666666667,
        0.0909090909091, 0.1, 0.222222222222, 0.125, 0.0, 0.0769230769231,
        0.0833333333333, 0.0, 0.0, 0.0, 0.0, 0.0714285714286, 0.0, 0.0, 0.0,
        0.0, 0.0, 0.0, 0.0, 0.0769230769231, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
        0.0, 0.0, 0.1, 0.111111111111, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1,
        0.111111111111, 0.0, 0.0, 0.0769230769231, 0.0, 0.0, 0.0, 0.0, 0.0,
        0.0714285714286, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
        0.0909090909091, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0909090909091, 0.1,
        0.0, 0.0, 0.0714285714286, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
        0.142857142857, 0.0769230769231, 0.0, 0.0, 0.0, 0.0, 0.0,
        0.142857142857, 0.0769230769231, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
        0.0, 0.0, 0.0, 0.0, 0.0, 0.142857142857, 0.0769230769231, 0.0, 0.0,
        0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0714285714286, 0.0,
        0.0, 0.0, 0.0, 0.0, 0.0
    ])
    # reference values for count_aa() on D
    data_aa = np.array([
        0.07142857, 0., 0.07142857, 0., 0.07142857, 0.21428571, 0., 0.07142857,
        0.14285714, 0.14285714, 0., 0., 0., 0., 0., 0., 0., 0.21428571, 0., 0.
    ])
    # reference values for calculate_arc() on A
    data_arc = [200, 60, 30, 30, 0]

    # Eisenberg descriptor whose sequences are read from the test FASTA file
    E = PeptideDescriptor('X', 'eisenberg')
    E.read_fasta(join(dirname(__file__), 'files/test.fasta'))
    data_mom = np.array([])
    data_glob = np.array([])

    def test_filereader(self):
        # the first sequence read from file must equal the one given directly
        expected = self.D.sequences[0]
        from_file = self.E.sequences[0]
        self.assertEqual(expected, from_file)

    def test_autocorr_size(self):
        self.D.calculate_autocorr(7)
        n_values = len(self.D.descriptor[0])
        self.assertEqual(n_values, 42)

    def test_crosscorr_size(self):
        self.D.calculate_crosscorr(7)
        n_values = len(self.D.descriptor[0])
        self.assertEqual(n_values, 147)

    def test_autocorr_values(self):
        self.D.calculate_autocorr(7)
        for idx, value in enumerate(self.D.descriptor[0]):
            self.assertAlmostEqual(value, self.data_ac[idx], places=8)

    def test_crosscorr_values(self):
        self.D.calculate_crosscorr(7)
        for idx, value in enumerate(self.D.descriptor[0]):
            self.assertAlmostEqual(value, self.data_cc[idx], places=8)

    def test_global_value(self):
        self.D.calculate_global()
        pepcats_global = self.D.descriptor[0][0]
        self.assertEqual(pepcats_global, 1)

        self.E.calculate_global()
        eisenberg_global = self.E.descriptor[0][0]
        self.assertAlmostEqual(eisenberg_global, 0.44714285714285723, places=8)

    def test_moment_value(self):
        self.E.calculate_moment()
        moment = self.E.descriptor[0][0]
        self.assertAlmostEqual(moment, 0.49723753135551985, places=8)

    def test_count_aa(self):
        self.D.count_aa()
        for idx, value in enumerate(self.D.descriptor[0]):
            self.assertAlmostEqual(value, self.data_aa[idx], places=8)

    def test_arc_size(self):
        self.A.calculate_arc()
        first_row = self.A.descriptor.tolist()[0]
        self.assertEqual(first_row, self.data_arc)
Beispiel #13
0
def upload():
    """Handle the prediction form: render it on GET, classify an uploaded FASTA on POST.

    On POST the file from the ``file`` form field is stored in
    ``app.config['UPLOAD_FOLDER']``, its sequences are described with modlamp
    (7 global descriptors followed by 7 amino-acid-scale descriptors, i.e.
    14 columns per sequence), each row is classified with the model stored
    in ``ml_model.pkl`` and the predictions are written to ``result.csv``
    (columns ``id``, ``sequence``, ``activity``). The stored upload is
    removed afterwards, also when processing fails.

    :return: the rendered template: ``predictor.html`` for GET, ``up.html``
        after a successful prediction, ``upload.html`` with an error message
        when the upload is missing or rejected by ``allowed_file``.
    """
    if request.method != 'POST':
        # This will be executed on GET request.
        return render_template('predictor.html')

    upfile = request.files['file']
    if not (upfile and allowed_file(upfile.filename)):
        error = "PLEASE CHECK THE FORMAT OF FILE TO UPLOAD"
        return render_template('upload.html', error=error)

    filename = secure_filename(upfile.filename)
    # Every later step uses the path the file was actually saved to; the bare
    # file name would be resolved against the current working directory.
    saved_path = os.path.join(app.config['UPLOAD_FOLDER'], filename)
    upfile.save(saved_path)

    try:
        identifiers = []
        sequence = []
        with open(saved_path) as fasta_file:  # Will close handle cleanly
            for seq_record in SeqIO.parse(fasta_file, 'fasta'):  # (generator)
                identifiers.append(seq_record.id)
                # Plain strings keep the DataFrame column free of Seq objects.
                sequence.append(str(seq_record.seq))

        pepdesc = PeptideDescriptor(
            saved_path, 'eisenberg')  # use Eisenberg consensus scale
        globdesc = GlobalDescriptor(saved_path)

        # --------------- Peptide Descriptor (AA scales) Calculations ---------------
        pepdesc.calculate_global()  # global Eisenberg hydrophobicity
        pepdesc.calculate_moment(append=True)  # Eisenberg hydrophobic moment

        # load other AA scales
        pepdesc.load_scale('gravy')  # load GRAVY scale
        pepdesc.calculate_global(append=True)  # global GRAVY hydrophobicity
        pepdesc.calculate_moment(append=True)  # GRAVY hydrophobic moment
        pepdesc.load_scale('z3')  # load old Z scale
        # global Z scale (= window 1 autocorrelation)
        pepdesc.calculate_autocorr(1, append=True)

        # --------------- Global Descriptor Calculations ---------------
        globdesc.length()  # sequence length
        globdesc.boman_index(append=True)  # Boman index
        globdesc.aromaticity(append=True)  # global aromaticity
        globdesc.aliphatic_index(append=True)  # aliphatic index
        globdesc.instability_index(append=True)  # instability index
        globdesc.calculate_charge(ph=7.4, amide=False,
                                  append=True)  # net charge
        globdesc.calculate_MW(amide=False, append=True)  # molecular weight

        # Global descriptors first, then the scale-based ones.
        features = np.concatenate(
            (globdesc.descriptor, pepdesc.descriptor), axis=1)

        # Load the classifier once instead of once per sequence.
        clf = joblib.load('ml_model.pkl')
        label_names = {0: "antiviral", 1: "antibacterial"}

        activities = []
        for row in features:
            pred = clf.predict(np.reshape(row, (-1, 14)))
            # The prediction is a sparse label matrix; the column indices of
            # its stored entries are the predicted label ids. Any id other
            # than 0 or 1 is reported as "antifungal".
            predicted_ids = pred.tocoo().col
            activities.append('-'.join(
                label_names.get(int(label_id), "antifungal")
                for label_id in predicted_ids))

        df = pd.DataFrame(data={
            "id": identifiers,
            "sequence": sequence,
            "activity": activities
        },
                          columns=['id', 'sequence', 'activity'])
        # NOTE(review): a fixed output name is shared by all requests; confirm
        # that concurrent uploads are not expected.
        df.to_csv("result.csv", sep=',', index=False)
    finally:
        # Remove the stored upload even when parsing or prediction fails.
        os.remove(saved_path)

    return render_template('up.html', mimetype="text/csv")
Beispiel #14
0
def describe_sequences(
        path=r"C:\Users\Patrick\OneDrive - University College Dublin\Bioinformatics\HemolyticStudies\BOTH_peptides.json",
        out_file="peptides_array"):
    """Compute one feature row per peptide and save all rows as a NumPy array.

    The peptide list is read from ``path`` (key ``"Peptides"``), every
    peptide is flagged as train/test via a fixed 85/15 split, the list is
    shuffled, and for each peptide a single-row array is assembled from
    (in this column order): numeric id, train flag, amino-acid-scale
    descriptors, global descriptors, length, C-terminal amidation flag,
    secondary-structure fractions, AAindex values, CTD composition,
    conjoint triad, tetrapeptide presence flags, relative 1/2/3-gram
    frequencies, boolean 1/2-gram presence, absolute 1/2-gram counts,
    ``di2``/``di3`` features, PAAC and the activity label. Di- and
    tripeptide columns are restricted to n-grams that occur in at least
    50 (resp. 20) peptides.

    :param path: file holding the peptide collection.
    :param out_file: target passed to ``np.save``.
    :return: None; the stacked array is written to ``out_file``.
    """
    aa_letters = [
        'A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q',
        'R', 'S', 'T', 'V', 'W', 'Y'
    ]
    di_letters = [a + b for a in aa_letters for b in aa_letters]
    tri_letters = [
        a + b + c for a in aa_letters for b in aa_letters for c in aa_letters
    ]

    #Conjoint src = https://bmcbioinformatics.biomedcentral.com/articles/10.1186/s12859-015-0828-1
    conjoint_dict = {
        "A": "A", "G": "A", "V": "A",
        "I": "I", "L": "I", "F": "I", "P": "I",
        "Y": "Y", "M": "Y", "T": "Y", "S": "Y",
        "H": "H", "N": "H", "Q": "H", "W": "H",
        "R": "R", "K": "R",
        "D": "D", "E": "D",
        "C": "C",
    }

    # Ordered descriptor plan: (scale, ((method, modality), ...)). The order
    # of scales and of the calls within a scale fixes the column order of the
    # descriptor matrix, so it must not be rearranged. A modality of None
    # means the method's own default is used.
    global_steps = (("calculate_global", "mean"), ("calculate_global", "max"))
    moment_steps = (("calculate_moment", "max"), ("calculate_moment", "mean"))
    arc_steps = (("calculate_arc", "max"), ("calculate_arc", "mean"))
    scale_plan = [
        ("eisenberg", global_steps + moment_steps),
        ("Ez", global_steps),
        ("aasi", (("calculate_global", None),) + moment_steps),
        ("abhprk", global_steps),
        ("charge_acid", global_steps + moment_steps),
        ("cougar", global_steps),
        ("gravy", global_steps + moment_steps),
        ("hopp-woods", global_steps + moment_steps),
        ("kytedoolittle", global_steps + moment_steps),
        ("ppcali", global_steps),
        ("msw", global_steps),
        ("charge_phys", moment_steps + global_steps),
        ("flexibility", moment_steps + global_steps),
        ("bulkiness", moment_steps + global_steps),
        ("TM_tend", moment_steps + global_steps),
        ("mss", moment_steps + global_steps),
        ("t_scale", global_steps),
        ("peparc", arc_steps),
        ("msw", global_steps),  # intentionally kept: appears twice in the feature layout
        ("polarity", moment_steps + global_steps),
        ("pepcats", global_steps),
        ("isaeci", global_steps),
        ("refractivity", moment_steps + global_steps),
        ("z3", global_steps),
        ("z5", global_steps),
    ]

    ####################### AAindex #########################
    to_get = [
        ("CHAM810101", "mean"),  #Steric Hinderance
        ("CHAM810101", "total"),  #Steric Hinderance
        ("KYTJ820101", "mean"),  #Hydropathy
        ("KLEP840101", "total"),  #Charge
        ("KLEP840101", "mean"),  #Charge
        ("MITS020101", "mean"),  #Amphiphilicity
        ("FAUJ830101", "mean"),  #Hydrophobic parameter pi
        ("GOLD730102", "total"),  #Residue volume
        ("MEEJ800101", "mean"),  #Retention coefficient in HPLC
        ("OOBM850105", "mean"),  #Optimized side chain interaction parameter
        ("OOBM850105", "total"),  #Optimized side chain interaction parameter
        ("VELV850101", "total"),  #Electron-ion interaction parameter
        ("VELV850101", "mean"),  #Electron-ion interaction parameter
        ("PUNT030102", "mean"),  #Knowledge-based membrane-propensity scale from 3D_Helix
        ("BHAR880101", "mean"),  #Average flexibility indeces
        ("KRIW790102", "mean"),  #Fraction of site occupied by water
        ("PLIV810101", "mean"),  #Partition coefficient
        ("ZIMJ680102", "mean"),  #Bulkiness
        ("ZIMJ680102", "total"),  #Bulkiness
        ("ZHOH040101", "mean"),  #Stability scale
        ("CHAM820102", "total"),  #Free energy solubility in water
        #From HemoPi: src = https://github.com/riteshcanfly/Hemopi/blob/master/pcCalculator.java
        ("HOPT810101", "mean"),  #Hydrophilicity
        ("EISD840101", "mean"),  #Hydrophobicity
        ("FAUJ880109", "total"),  #Net Hydrogen
        ("EISD860101", "mean"),  #Solvation
    ]

    tetra_peptides = [
        "KLLL",  # src = https://github.com/riteshcanfly/Hemopi/blob/master/tetrapos.txt
        "GCSC", "AAAK", "KLLS", "LGKL", "VLKA", "LLGK", "LVGA", "LSDF", "SDFK",
        "SWLR", "WLRD",
    ]

    def window_counts(sequence, n):
        """Return {n-gram: count} over all overlapping length-n windows."""
        counts = {}
        for start in range(len(sequence) - n + 1):
            gram = sequence[start:start + n]
            counts[gram] = counts.get(gram, 0.0) + 1.0
        return counts

    def relative(counts, n_windows):
        """Scale counts to frequencies; sequences too short for a window give none."""
        if n_windows <= 0:
            return {}
        return {gram: count / n_windows for gram, count in counts.items()}

    def as_row(values, keys):
        """Return a (1, len(keys)) array of values in key order, 0 where absent."""
        return np.array([[values.get(key, 0.0) for key in keys]])

    with open(path, "r") as f:
        text = f.read()

    # NOTE(security): eval() executes whatever the file contains. Only use
    # trusted input; if the file is strict JSON, json.loads(text) is the safe
    # replacement.
    peptides = eval(text)["Peptides"]

    train_peptides, test_peptides = train_test_split(peptides,
                                                     test_size=0.15,
                                                     random_state=42)

    # A peptide counts as training data when its sequence occurs in the
    # training split (set lookup instead of scanning a list per peptide).
    train_seqs = {peptide["seq"] for peptide in train_peptides}
    for peptide in peptides:
        peptide["train"] = peptide["seq"] in train_seqs

    n_train = sum(1 for peptide in peptides if peptide["train"])
    n_test = len(peptides) - n_train
    print(n_train)
    print(n_test)

    # Augmentation with reversed training sequences is disabled, so the
    # counts reported after shuffling are the same as above.
    random.shuffle(peptides)

    print(n_train)
    print(n_test)
    print("doubling complete")

    # Number of peptides in which each di-/tripeptide occurs at least once.
    dp = {gram: 0 for gram in di_letters}
    tp = {gram: 0 for gram in tri_letters}
    for peptide in peptides:
        for gram in window_counts(peptide["seq"], 2):
            dp[gram] += 1
    for peptide in peptides:
        for gram in window_counts(peptide["seq"], 3):
            tp[gram] += 1

    # Column keys, alphabetically ordered; rare n-grams are dropped.
    aa_keys = sorted(aa_letters)
    di_keys = sorted(gram for gram in di_letters if dp[gram] >= 50)
    tri_keys = sorted(gram for gram in tri_letters if tp[gram] >= 20)

    for peptide in peptides:
        peptide["conjoint_seq"] = "".join(
            conjoint_dict[letter] for letter in peptide["seq"])

    for peptide in peptides:
        seq = peptide["seq"]
        amidated = peptide["cTer"] == "Amidation"

        globdesc = GlobalDescriptor(seq)
        globdesc.calculate_all(amide=amidated)

        ctdc = CTD.CalculateC(seq)
        ctdc_vals = np.array([[ctdc[key] for key in sorted(ctdc)]])

        conjointtriad = ConjointTriad.CalculateConjointTriad(seq)
        conjointtriad_vals = np.array(
            [[conjointtriad[key] for key in sorted(conjointtriad)]])

        # The first scale is loaded by the constructor, every later one via
        # load_scale(); each step appends its columns to the descriptor.
        pepdesc = PeptideDescriptor(seq, scale_plan[0][0])
        for position, (scale, steps) in enumerate(scale_plan):
            if position:
                pepdesc.load_scale(scale)
            for method_name, modality in steps:
                kwargs = {"append": True}
                if modality is not None:
                    kwargs["modality"] = modality
                getattr(pepdesc, method_name)(**kwargs)

        protein = PyPro()
        protein.ReadProteinSequence(seq)
        paac_by_name = protein.GetPAAC(lamda=1, weight=0.05)
        # Order PAAC1, PAAC2, ... numerically rather than lexicographically.
        paac_names = sorted(paac_by_name,
                            key=lambda name: int(name.replace("PAAC", "")))
        paac = np.array([[paac_by_name[name] for name in paac_names]])

        cTer = np.array([[1 if amidated else 0]])

        analysed_seq = ProteinAnalysis(seq)
        secondary_structure_fraction = np.array(
            [analysed_seq.secondary_structure_fraction()])

        peptide["TotalDescriptor"] = str(
            np.concatenate((pepdesc.descriptor, globdesc.descriptor), axis=1))

        try:
            raw_id = peptide["id"]
        except KeyError:
            # Must be 2-D like every other block: np.concatenate rejects
            # zero-dimensional inputs.
            pepid = np.array([[0]])
        else:
            pepid = np.array([[
                int(raw_id.replace("HEMOLYTIK", "").replace(
                    "DRAMP", "").replace("DBAASP", ""))
            ]])

        pep_train = np.array([[1 if peptide["train"] else 0]])

        mono_counts = window_counts(seq, 1)
        di_counts = window_counts(seq, 2)
        tri_counts = window_counts(seq, 3)

        freq_1d = as_row(relative(mono_counts, len(seq)), aa_keys)
        freq_2d = as_row(relative(di_counts, len(seq) - 1), di_keys)
        freq_3d = as_row(relative(tri_counts, len(seq) - 2), tri_keys)
        freq_1dbool = as_row(dict.fromkeys(mono_counts, 1.0), aa_keys)
        freq_2dbool = as_row(dict.fromkeys(di_counts, 1.0), di_keys)
        freq_1dabs = as_row(mono_counts, aa_keys)
        freq_2dabs = as_row(di_counts, di_keys)

        len_peptide = np.array([[len(seq)]])

        pepact = np.array([[1 if peptide["activity"] == "YES" else 0]])

        peptide_di2 = di2(seq)
        peptide_di3 = di3(peptide["conjoint_seq"])

        tp_bin = np.array(
            [[1 if tetra in seq else 0 for tetra in tetra_peptides]])

        aminoacidindeces = np.array(
            [[aaf(seq, identifier, mode) for identifier, mode in to_get]])

        peptide["array"] = np.concatenate(
            (
                pepid,
                pep_train,
                pepdesc.descriptor,
                globdesc.descriptor,
                len_peptide,
                cTer,
                secondary_structure_fraction,
                aminoacidindeces,
                ctdc_vals,
                conjointtriad_vals,
                tp_bin,
                freq_1d,
                freq_2d,
                freq_3d,
                freq_1dbool,
                freq_2dbool,
                freq_1dabs,
                freq_2dabs,
                peptide_di2,
                peptide_di3,  #Conjoint Alphabet
                paac,
                pepact,
            ),
            axis=1)

    x = np.concatenate([peptide["array"] for peptide in peptides], axis=0)

    np.save(out_file, x, allow_pickle=False)
Beispiel #15
0
    def _add_features_to_peptide_series(self,
                                        peptide,
                                        index,
                                        n_cluster=-1,
                                        lpvs=None):
        """Build the feature series of a single peptide.

        A NaN-filled series over ``index`` is created and filled with the
        MS intensity, boolean, frequency and count features, the chemical
        descriptors of the peptide's sequence and its annotations. Entries
        are addressed with ``(feature type, feature name)`` keys.

        :param peptide: the peptide; its ``start``/``stop`` positions,
            ``slice`` and ``seq`` are read.
        :param index: index of the returned series.
        :param n_cluster: cluster number, stored as annotation and used to
            look up the cluster coverage.
        :param lpvs: collection tested for membership of ``peptide`` to set
            the "LPV" annotation; ``None`` is treated as an empty set.
        :return: the filled ``pd.Series``.
        """
        # primary intensity weights d = delta, pd = penalty delta
        # TODO only d_start and d_stop depends on impval, pd_start and pd_stop does not because
        # they are always between a d_start and d_stop, and should thus be above imp_val!
        # therefore we can write out d_start as and d_stop as:
        #   [before_start, after_start], [befrore_stop, after_stop]
        # thus if we have
        #       raw data     = [0, 0, 5, 5, 7, 7, 5, 5, 0, 0]
        # then for the peptide        3--------------8
        #       before_start, after_start = [ 0, 5 ]
        # but for the peptide               5--6
        #       before_start, after_start = [ 5, 7 ]
        # by making a none linear model we could formulate the w_start parameter as follows:
        # w_start * (after_start - max(before_start, imp_val))
        # which is consistent with how we currently do the grid search (imp_val=4):
        #       d_start = 5 - max(0, 4) = 1
        #       d_start = 7 - max(5, 4) = 2
        if lpvs is None:
            lpvs = set()
        i_start = peptide.start.index
        i_stop = peptide.stop.index

        # MS Delta
        series = pd.Series(np.full(len(index), np.nan), index=index)
        ms_int = self.ms_intensity_features.type
        series[ms_int, 'start'] = self.start_scores[i_start]
        series[ms_int, 'stop'] = self.stop_scores[i_stop]

        if 4 < len(peptide):
            # Penalties are summed over the peptide with one position
            # trimmed from each end.
            penalty = SequenceRange(peptide.start + 1,
                                    peptide.stop - 1,
                                    validate=False)
            series[ms_int,
                   'penalty_start'] = self.start_scores[penalty.slice].sum()
            series[ms_int,
                   'penalty_stop'] = self.stop_scores[penalty.slice].sum()
        else:
            series[ms_int, 'penalty_start'] = 0
            series[ms_int, 'penalty_stop'] = 0

        # MS Bool
        b_obs, f_obs = self._calc_observed(peptide)
        ms_bool = self.ms_bool_features.type
        series[ms_bool, "first"] = self.h_first[i_start]
        series[ms_bool, "last"] = self.h_last[i_stop]
        series[ms_bool, "observed"] = b_obs

        # MS Frequency
        # ptm weights
        # TODO: should it get extra penalties if there are PTM's between start and end?
        ms_freq = self.ms_frequency_features.type
        series[ms_freq, 'acetylation'] = self.ac_freq[i_start]
        series[ms_freq, 'amidation'] = self.am_freq[i_stop]

        series[ms_freq, 'start'] = self.h_start_freq[i_start]
        series[ms_freq, 'stop'] = self.h_stop_freq[i_stop]
        series[ms_freq, 'observed'] = f_obs
        series[ms_freq, 'sample'] = self.h_sample[peptide.slice].min()
        series[ms_freq, 'ladder'] = \
            self.h_ladder_start[i_start] * self.h_ladder_stop[i_stop]
        series[ms_freq, 'protein_coverage'] = self.protein_coverage
        series[ms_freq, 'cluster_coverage'] = self.cluster_coverage[n_cluster]

        # these are good features, but there may be better ways to extract them
        series[ms_freq,
               'bond'] = self.h_bond[self.get_bond_slice(peptide)].min()

        # MS Counts
        ms_count = self.ms_count_features.type
        series[ms_count, 'start'] = self.start_counts[peptide.start]
        series[ms_count, 'stop'] = self.stop_counts[peptide.stop]

        ############################################################

        # Chemical
        sequence = self.protein_sequence[peptide.slice]
        peptide_features = GlobalDescriptor(sequence)

        # The peptide is treated as amidated when its amidation frequency
        # exceeds 0.05.
        is_amidated = series[ms_freq, 'amidation'] > 0.05
        peptide_features.calculate_all(amide=is_amidated)

        chem = self.chemical_features.type
        for i, name in enumerate(peptide_features.featurenames):
            if name in self.chemical_features.features:
                series[chem, name] = peptide_features.descriptor[0, i]

        # The Eisenberg moment depends only on the sequence, so it is
        # computed once instead of once per global feature name.
        eisenberg = PeptideDescriptor(sequence, 'eisenberg')
        eisenberg.calculate_moment()
        series[chem, 'eisenberg'] = eisenberg.descriptor.flatten()[0]

        # Annotations
        annotation = self.annotations.type
        series[annotation, "Known"] = peptide in self.known_peptides
        series[annotation, "Cluster"] = n_cluster
        series[annotation, "Sequence"] = peptide.seq
        series[annotation, "LPV"] = peptide in lpvs

        series[annotation, "N Flanking"] = \
            self.get_nflanking_region(peptide.start, self.protein_sequence)
        series[annotation, "C Flanking"] = \
            self.get_cflanking_region(peptide.stop, self.protein_sequence)
        if f_obs != 0:
            # Total intensity over all rows of self.df that match this
            # peptide's start and stop positions.
            _pep_index = (slice(None), slice(None), peptide.start.pos,
                          peptide.stop.pos)
            series[annotation,
                   "Intensity"] = self.df.loc[_pep_index, :].sum().sum()
        return series
Beispiel #16
0
def insert_phycs(seq_df):
    """Add physico-chemical property columns for the peptides in ``seq_df``.

    Reads the ``'Sequence'`` column and adds, in this order, the columns
    ``'IEP'``, ``'Net Charge'`` (charge at pH 7.0), ``'Hydrophobic Moment'``,
    ``'Hydrophobicity'``, ``'Transmembrane Propensity'``,
    ``'Alpha Helical Propensity'``, ``'Aliphatic Index'`` and
    ``'Boman Index'``.

    :param seq_df: DataFrame with a ``'Sequence'`` column; modified in place.
    :return: the same DataFrame, for convenience.
    """
    #  Function for compute Isoelectric Point or net_charge of peptide
    def get_ieq_nc(seq, is_iep=True):
        protparam = PA(seq)
        if is_iep:
            return protparam.isoelectric_point()
        return protparam.charge_at_pH(7.0)

    def global_scale_value(scale):
        """Return the global descriptor of every sequence for ``scale``."""
        descriptor = PeptideDescriptor(seq_df['Sequence'].values, scale)
        descriptor.calculate_global()
        return descriptor.descriptor.reshape(-1)

    # Calculating IsoElectricPoints and NeutralCharge.
    # One value per row; the former flag lists were sized with
    # DataFrame.size (rows x columns), which is not the number of rows.
    seq_df['IEP'] = [get_ieq_nc(seq) for seq in seq_df['Sequence']]
    seq_df['Net Charge'] = [
        get_ieq_nc(seq, is_iep=False) for seq in seq_df['Sequence']
    ]

    # Calculating hydrophobic moment (assumes all peptides are alpha-helical)
    descrpt = PeptideDescriptor(seq_df['Sequence'].values, 'eisenberg')
    descrpt.calculate_moment(window=1000, angle=100, modality='max')
    seq_df['Hydrophobic Moment'] = descrpt.descriptor.reshape(-1)

    # Calculating "Hopp-Woods" hydrophobicity
    seq_df['Hydrophobicity'] = global_scale_value('hopp-woods')

    # Calculating Energy of Transmembrane Propensity
    seq_df['Transmembrane Propensity'] = global_scale_value('tm_tend')

    # Calculating Levitt_alpha_helical Propensity
    seq_df['Alpha Helical Propensity'] = global_scale_value('levitt_alpha')

    # Calculating Aliphatic Index
    descrpt = GlobalDescriptor(seq_df['Sequence'].values)
    descrpt.aliphatic_index()
    seq_df['Aliphatic Index'] = descrpt.descriptor.reshape(-1)

    # Calculating Boman Index
    descrpt = GlobalDescriptor(seq_df['Sequence'].values)
    descrpt.boman_index()
    seq_df['Boman Index'] = descrpt.descriptor.reshape(-1)

    return seq_df
Beispiel #17
0
    i += 1

    if y == '1':
        class_in = 1
    elif y == '-1':
        class_in = 0

    out.write(x + ', ' + str(class_in))
    out.write('\n')
out.close()

# Load the reformatted data from 'formatted.csv' in the current working
# directory.
data = load_custom(os.getcwd() + '/formatted.csv')

# Describe all loaded sequences with the 'pepArc' scale using
# cross-correlation over a window of 4.
descr_temp = PeptideDescriptor(data.sequences, scalename='pepArc')
descr_temp.calculate_crosscorr(window=4)

# Train the best 'RF' model on the descriptors and targets, then score it with
# 10-fold cross validation ('RF' presumably selects a random forest; confirm
# against train_best_model).
best_RF = train_best_model('RF', descr_temp.descriptor, data.target)
score_cv(best_RF, descr_temp.descriptor, data.target, cv=10)

y_pred = []

# get predictions for values
for i in range(0, 392):
    try:
        pep_descr = PeptideDescriptor(samples_test[i], scalename='pepArc')
        pep_descr.calculate_crosscorr(window=4)
        proba = best_RF.predict_proba(pep_descr.descriptor)
        y_pred.append(proba)
Beispiel #18
0
def plot_profile(sequence,
                 window=5,
                 scalename='Eisenberg',
                 filename=None,
                 color='red',
                 seq=False,
                 ylim=None):
    """ Function to generate sequence profile plots of a given amino acid scale or a moment thereof.

    Every plotted point is the mean scale value of exactly ``window`` consecutive residues, drawn at the
    position of the window's center residue.

    .. note::
        :func:`plot_profile` can only plot one-dimensional amino acid scales given in
        :class:`modlamp.descriptors.PeptideDescriptor`.

    :param sequence: {str} Peptide sequence for which the profile should be plotted.
    :param window: {int, uneven} Window size for which the average value is plotted for the center amino acid.
        Must be between 1 and the sequence length.
    :param scalename: {str} Amino acid scale to be used to describe the sequence.
    :param filename: {str} Filename where to save the plot. *default = None* --> show the plot
    :param color: {str} Color of the plot line.
    :param seq: {bool} Whether the amino acid sequence should be plotted as the title.
    :param ylim: {tuple of float} Y-Axis limits. Provide as tuple, e.g. (0.5, -0.2)
    :return: a profile plot of the input sequence interactively or with the specified *filename*
    :raises ValueError: if ``window`` is smaller than 1 or larger than the sequence length.
    :raises KeyError: if the chosen scale has more than one dimension.
    :Example:

    >>> plot_profile('GLFDIVKKVVGALGSL', scalename='eisenberg')

    .. image:: ../docs/static/profileplot.png
        :height: 300px

    .. versionadded:: v2.1.5
    """
    # a window that does not fit into the sequence yields no profile points at all; fail early with a
    # clear message instead of an opaque error from max() / matplotlib further down
    if window < 1 or window > len(sequence):
        raise ValueError(
            "window must be between 1 and the sequence length (%i), got %r"
            % (len(sequence), window))

    # check if given scale is defined in PeptideDescriptor
    d = PeptideDescriptor(sequence, scalename)
    if len(d.scale['A']) > 1:
        raise KeyError(
            "\nSorry\nThis function can only calculate profiles for 1D scales. '%s' has more than one "
            "dimension" % scalename)

    # describe sequence by given scale
    seq_data = [d.scale[a] for a in sequence]

    # average over exactly `window` residues per position (the slice end is exclusive, so no "+ 1")
    seq_profile = [
        np.mean(seq_data[i:i + window])
        for i in range(len(sequence) - window + 1)
    ]

    # plot
    fig, ax = plt.subplots()
    # x positions are the window centers; deriving the range from the profile keeps both the same length
    half = int(window / 2)
    x_range = range(half, half + len(seq_profile))
    line = ax.plot(x_range, seq_profile)
    plt.setp(line, color=color, linewidth=2.0)

    # axis labels and title
    ax.set_xlabel('sequence position', fontweight='bold')
    ax.set_ylabel(scalename + ' value', fontweight='bold')
    ax.text(max(x_range) / 2 + 1,
            1.05 * max(seq_profile),
            'window size: ' + str(window),
            fontsize=16,
            fontweight='bold')
    if seq:
        ax.set_title(sequence, fontsize=16, fontweight='bold', y=1.02)
    if ylim:
        ax.set_ylim(ylim)
    else:
        # the larger value is passed first, i.e. as the lower axis limit (same orientation as the
        # ``ylim`` example in the docstring)
        ax.set_ylim(1.2 * max(seq_profile), 1.2 * min(seq_profile))

    # only left and bottom axes, no box
    ax.spines['right'].set_visible(False)
    ax.spines['top'].set_visible(False)
    ax.xaxis.set_ticks_position('bottom')
    ax.yaxis.set_ticks_position('left')

    # show or save plot
    if filename:
        plt.savefig(filename, dpi=150)
    else:
        plt.show()
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import matthews_corrcoef, accuracy_score
from progressbar import ProgressBar

from modlamp.core import read_fasta
from modlamp.descriptors import PeptideDescriptor

seed = np.random.RandomState(seed=42)

for d in os.listdir('./output'):
    if os.path.isdir('./output/' + d):
        print("\nRunning %s..." % d)
        sclr = pickle.load(open('./output/' + d + '/scaler.p', 'r'))
        pos = read_fasta('./input/' + d + '/Pos.fasta')[0]
        neg = read_fasta('./input/' + d + '/Neg.fasta')[0]
        desc = PeptideDescriptor(pos + neg, 'PPCALI')
        desc.calculate_autocorr(7)
        X = sclr.transform(desc.descriptor)
        y = np.array(len(pos) * [1] + len(neg) * [0])
        skf = StratifiedKFold(y, n_folds=10)

        synth = pd.read_csv('./output/' + d + '/synthesis_selection.csv')

        print("\tPerforming 10-fold cross-validation")
        mcc = list()
        acc = list()
        pbar = ProgressBar()
        for train, test in pbar(skf):
            clf = RandomForestClassifier(bootstrap=True,
                                         class_weight=None,
                                         criterion='gini',
Beispiel #20
0
def helical_wheel(sequence,
                  colorcoding='rainbow',
                  lineweights=True,
                  filename=None,
                  seq=False,
                  moment=False):
    """A function to project a given peptide sequence onto a helical wheel plot. It can be useful to illustrate the
    properties of alpha-helices, like positioning of charged and hydrophobic residues along the sequence.

    Consecutive residues are placed 100 degrees apart, starting at the top of the wheel. The first 18 residues
    sit on the unit circle, residues 19 to 36 on a circle of radius 1.2 and all further residues on a circle of
    radius 1.4. Connecting lines are only drawn up to the 37th residue.

    :param sequence: {str} the peptide sequence for which the helical wheel should be drawn
    :param colorcoding: {str} the color coding to be used, available: *rainbow*, *charge*, *polar*, *simple*,
        *amphipathic*, *none*. An unknown value prints a notice and falls back to *rainbow*.
    :param lineweights: {boolean} defines whether connection lines decrease in thickness along the sequence
    :param filename: {str} filename where to save the plot. *default = None* --> show the plot
    :param seq: {bool} whether the amino acid sequence should be plotted as a title
    :param moment: {bool} whether the Eisenberg hydrophobic moment should be calculated and plotted
    :return: a helical wheel projection plot of the given sequence (interactively or in **filename**)
    :raises KeyError: if the sequence contains a letter that is not one of the 20 amino acids listed in ``aa``.
    :Example:

    >>> helical_wheel('GLFDIVKKVVGALG')
    >>> helical_wheel('KLLKLLKKLLKLLK', colorcoding='charge')
    >>> helical_wheel('AKLWLKAGRGFGRG', colorcoding='none', lineweights=False)
    >>> helical_wheel('ACDEFGHIKLMNPQRSTVWY')

    .. image:: ../docs/static/wheel1.png
        :height: 300px
    .. image:: ../docs/static/wheel2.png
        :height: 300px
    .. image:: ../docs/static/wheel3.png
        :height: 300px
    .. image:: ../docs/static/wheel4.png
        :height: 300px

    .. versionadded:: v2.1.5
    """
    # color mappings: every f_* list holds circle fill colors, every t_* list the letter colors, both in the
    # order of the amino acids in `aa`
    aa = [
        'A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q',
        'R', 'S', 'T', 'V', 'W', 'Y'
    ]
    f_rainbow = [
        '#3e3e28', '#ffcc33', '#b30047', '#b30047', '#ffcc33', '#3e3e28',
        '#80d4ff', '#ffcc33', '#0047b3', '#ffcc33', '#ffcc33', '#b366ff',
        '#29a329', '#b366ff', '#0047b3', '#ff66cc', '#ff66cc', '#ffcc33',
        '#ffcc33', '#ffcc33'
    ]
    f_charge = [
        '#000000', '#000000', '#ff4d94', '#ff4d94', '#000000', '#000000',
        '#80d4ff', '#000000', '#80d4ff', '#000000', '#000000', '#000000',
        '#000000', '#000000', '#80d4ff', '#000000', '#000000', '#000000',
        '#000000', '#000000'
    ]
    f_polar = [
        '#000000', '#000000', '#80d4ff', '#80d4ff', '#000000', '#000000',
        '#80d4ff', '#000000', '#80d4ff', '#000000', '#000000', '#80d4ff',
        '#000000', '#80d4ff', '#80d4ff', '#80d4ff', '#80d4ff', '#000000',
        '#000000', '#000000'
    ]
    f_simple = [
        '#ffcc33', '#ffcc33', '#0047b3', '#0047b3', '#ffcc33', '#7f7f7f',
        '#0047b3', '#ffcc33', '#0047b3', '#ffcc33', '#ffcc33', '#0047b3',
        '#ffcc33', '#0047b3', '#0047b3', '#0047b3', '#0047b3', '#ffcc33',
        '#ffcc33', '#ffcc33'
    ]
    f_none = ['#ffffff'] * 20
    f_amphi = [
        '#ffcc33', '#29a329', '#b30047', '#b30047', '#f79318', '#80d4ff',
        '#0047b3', '#ffcc33', '#0047b3', '#ffcc33', '#ffcc33', '#80d4ff',
        '#29a329', '#80d4ff', '#0047b3', '#80d4ff', '#80d4ff', '#ffcc33',
        '#f79318', '#f79318'
    ]
    t_rainbow = [
        'w', 'k', 'w', 'w', 'k', 'w', 'k', 'k', 'w', 'k', 'k', 'k', 'k', 'k',
        'w', 'k', 'k', 'k', 'k', 'k'
    ]
    t_charge = [
        'w', 'w', 'k', 'k', 'w', 'w', 'k', 'w', 'k', 'w', 'w', 'w', 'w', 'w',
        'k', 'w', 'w', 'w', 'w', 'w'
    ]
    t_polar = [
        'w', 'w', 'k', 'k', 'w', 'w', 'k', 'w', 'k', 'w', 'w', 'k', 'w', 'k',
        'k', 'k', 'k', 'w', 'w', 'w'
    ]
    t_simple = [
        'k', 'k', 'w', 'w', 'k', 'w', 'w', 'k', 'w', 'k', 'k', 'k', 'k', 'w',
        'w', 'w', 'w', 'k', 'k', 'k'
    ]
    t_none = ['k'] * 20
    t_amphi = [
        'k', 'k', 'w', 'w', 'w', 'k', 'w', 'k', 'w', 'k', 'k', 'k', 'w', 'k',
        'w', 'k', 'k', 'k', 'w', 'w'
    ]
    schemes = {
        'rainbow': (f_rainbow, t_rainbow),
        'charge': (f_charge, t_charge),
        'polar': (f_polar, t_polar),
        'simple': (f_simple, t_simple),
        'none': (f_none, t_none),
        'amphipathic': (f_amphi, t_amphi),
    }
    d_eisberg = load_scale('eisenberg')[
        1]  # eisenberg hydrophobicity values for HM

    # line thickness array: one entry per connecting line, i.e. one fewer than residues
    n_lines = len(sequence) - 1
    if lineweights and n_lines > 0:
        # thick at the N-terminus, thinning out along the sequence
        lw = np.arange(0.1, 5.5, 5. / n_lines)[::-1]
    else:
        # constant width; also covers sequences with fewer than two residues, which have no connecting
        # line and would otherwise divide by zero in the step size above
        lw = [2.] * n_lines

    # check which color coding to use
    if colorcoding not in schemes:
        print("Unknown color coding, 'rainbow' used instead")
        colorcoding = 'rainbow'
    fill_colors, text_colors = schemes[colorcoding]
    df = dict(zip(aa, fill_colors))
    dt = dict(zip(aa, text_colors))

    # degree to radian: 100 degrees clockwise per residue, starting at 90 degrees (top of the unit circle)
    deg = np.arange(float(len(sequence))) * -100. + 90.
    rad = np.radians(deg)

    # create figure
    fig = plt.figure(frameon=False, figsize=(10, 10))
    ax = fig.add_subplot(111)
    old = None  # unit-circle coordinates of the previous residue
    hm = list()
    last = len(sequence) - 1

    # iterate over sequence
    for i, r in enumerate(rad):
        on_circle = (np.cos(r), np.sin(r))  # unit-circle coordinates of this residue

        # plot the connecting line to the previous residue; lines always run on the unit circle and are
        # only drawn up to index 36
        if old is not None and i <= 36:
            line = lines.Line2D((old[0], on_circle[0]), (old[1], on_circle[1]),
                                transform=ax.transData,
                                color='k',
                                linewidth=lw[i - 1])
            line.set_zorder(1)  # 1 = level behind circles
            ax.add_line(line)

        # residues beyond the first turn(s) are pushed outwards so their circles do not overlap
        if i < 18:
            ring = 1.
        elif i < 36:
            ring = 1.2
        else:
            ring = 1.4
        new = (on_circle[0] * ring, on_circle[1] * ring)  # new AA coordinates

        # plot circles
        circ = patches.Circle(new,
                              radius=0.1,
                              transform=ax.transData,
                              edgecolor='k',
                              facecolor=df[sequence[i]])
        circ.set_zorder(2)  # level in front of lines
        ax.add_patch(circ)

        # check if N- or C-terminus and add subscript (smaller font to fit it), then plot AA letter
        if i == 0:
            label, size = sequence[i] + '$_N$', 32
        elif i == last:
            label, size = sequence[i] + '$_C$', 32
        else:
            label, size = sequence[i], 36
        ax.text(new[0],
                new[1],
                label,
                va='center',
                ha='center',
                transform=ax.transData,
                size=size,
                color=dt[sequence[i]],
                fontweight='bold')

        eb = d_eisberg[sequence[i]][0]  # eisenberg value for this AA
        # save eisenberg hydrophobicity vector value to later calculate HM
        # NOTE(review): this uses the plotted (ring-scaled) coordinates, so residues on the outer rings
        # weigh more in the arrow direction - confirm this is intended
        hm.append([eb * new[0], eb * new[1]])

        old = on_circle  # save as previous coordinates

    # draw hydrophobic moment arrow if moment option
    if moment:
        v_hm = np.sum(np.array(hm), 0)
        x = .0333 * v_hm[0]
        y = .0333 * v_hm[1]
        ax.arrow(0.,
                 0.,
                 x,
                 y,
                 head_width=0.04,
                 head_length=0.03,
                 transform=ax.transData,
                 color='k',
                 linewidth=6.)
        desc = PeptideDescriptor(sequence)  # calculate hydrophobic moment
        desc.calculate_moment()
        # right positioning of HM text so arrow does not cover it
        if abs(x) < 0.2 and y > 0.:
            z = -0.2
        else:
            z = 0.2
        plt.text(0.,
                 z,
                 str(round(desc.descriptor[0][0], 3)),
                 fontdict={
                     'fontsize': 20,
                     'fontweight': 'bold',
                     'ha': 'center'
                 })

    # plot shape: widen the view as soon as residues are placed on the outer rings
    if len(sequence) < 19:
        ax.set_xlim(-1.2, 1.2)
        ax.set_ylim(-1.2, 1.2)
    else:
        ax.set_xlim(-1.4, 1.4)
        ax.set_ylim(-1.4, 1.4)
    ax.spines['right'].set_visible(False)
    ax.spines['top'].set_visible(False)
    ax.spines['left'].set_visible(False)
    ax.spines['bottom'].set_visible(False)
    cur_axes = plt.gca()
    cur_axes.axes.get_xaxis().set_visible(False)
    cur_axes.axes.get_yaxis().set_visible(False)
    plt.tight_layout()

    if seq:
        plt.title(sequence, fontweight='bold', fontsize=20)

    # show or save plot
    if filename:
        plt.savefig(filename, dpi=150)
    else:
        plt.show()
Beispiel #21
0
from modlamp.sequences import Helices, Random, AMPngrams
from modlamp.descriptors import PeptideDescriptor
from modlamp.datasets import load_AMPvsTM
from som import SOM

# generate some virtual peptide sequences: three sub-libraries (helices,
# random sequences, AMP n-grams) of `libnum` sequences each
libnum = 1000  # 1000 sequences per sublibrary
h = Helices(seqnum=libnum)
r = Random(seqnum=libnum)
n = AMPngrams(seqnum=libnum, n_min=4)
h.generate_sequences()
r.generate_sequences(proba='AMP')
n.generate_sequences()

# calculate molecular descriptors for the peptides: 'pepcats' cross-correlation
# (window 7) over the three sub-libraries stacked in the order h, r, n
d = PeptideDescriptor(seqs=np.hstack((h.sequences, r.sequences, n.sequences)), scalename='pepcats')
d.calculate_crosscorr(window=7)

# train a som on the descriptors and print / plot the training error
som = SOM(x=12, y=12)
som.fit(data=d.descriptor, epochs=100000, decay='hill')
print("Fit error: %.4f" % som.error)
som.plot_error_history(filename="som_error.png")

# load known antimicrobial peptides (AMPs) and transmembrane sequences and
# describe them with the same descriptor as the virtual libraries
dataset = load_AMPvsTM()
d2 = PeptideDescriptor(dataset.sequences, 'pepcats')
d2.calculate_crosscorr(7)
# class labels: 0 / 1 / 2 for the three sub-libraries (same order as stacked
# above) followed by 206 entries of class 3
# NOTE(review): 206 is presumably the number of AMP sequences in the loaded
# data set - confirm against load_AMPvsTM()
targets = np.array(libnum*[0] + libnum*[1] + libnum*[2] + 206*[3])
names = ['Helices', 'Random', 'nGrams', 'AMP']
Beispiel #22
0
def predict():
    """Handle the sequence prediction form.

    On a POST request the submitted ``seq`` form field is written to a FASTA file, peptide and global
    descriptors are calculated from that file, the pickled model ``ml_model.pkl`` predicts the classes and
    class probabilities, and ``seq.html`` is rendered with the predicted class names and a base64-encoded
    PNG pie chart of the probabilities. Any other request method renders ``predictor.html``.

    NOTE(review): presumably registered as a Flask view (``request`` / ``render_template``) - the route
    decorator is not visible here.

    :return: the rendered template
    """
    if request.method == 'POST':
        # single source of truth for the FASTA file: it is written here and read by both descriptor
        # classes below, so both sides must use the very same path
        fasta_path = '/home/sanika/proj/random.fasta'

        seq = request.form['seq']
        with open(fasta_path, "w") as fp:
            fp.write(seq)

        pepdesc = PeptideDescriptor(
            fasta_path, 'eisenberg')  # use Eisenberg consensus scale
        globdesc = GlobalDescriptor(fasta_path)

        # --------------- Peptide Descriptor (AA scales) Calculations ---------------
        pepdesc.calculate_global()  # calculate global Eisenberg hydrophobicity
        pepdesc.calculate_moment(
            append=True)  # calculate Eisenberg hydrophobic moment

        # load other AA scales
        pepdesc.load_scale('gravy')  # load GRAVY scale
        pepdesc.calculate_global(
            append=True)  # calculate global GRAVY hydrophobicity
        pepdesc.calculate_moment(
            append=True)  # calculate GRAVY hydrophobic moment
        pepdesc.load_scale('z3')  # load old Z scale
        pepdesc.calculate_autocorr(
            1,
            append=True)  # calculate global Z scale (=window1 autocorrelation)

        # --------------- Global Descriptor Calculations ---------------
        globdesc.length()  # sequence length
        globdesc.boman_index(append=True)  # Boman index
        globdesc.aromaticity(append=True)  # global aromaticity
        globdesc.aliphatic_index(append=True)  # aliphatic index
        globdesc.instability_index(append=True)  # instability index
        globdesc.calculate_charge(ph=7.4, amide=False,
                                  append=True)  # net charge
        globdesc.calculate_MW(amide=False, append=True)  # molecular weight

        # feature matrix: global descriptors first, then the AA scale descriptors
        f1 = pepdesc.descriptor
        f2 = globdesc.descriptor
        result = np.concatenate((f2, f1), axis=1)

        clf = joblib.load('ml_model.pkl')
        pred = clf.predict(result)
        proba = clf.predict_proba(result)
        out = pred.tocoo().col  # column index of every predicted label

        labels = ['antiviral', 'antibacterial', 'antifungal']
        # densify before plotting: the sparse ``.data`` attribute only holds the stored (non-zero)
        # entries, so a class with probability 0 would leave fewer values than labels
        values = np.asarray(proba.toarray()).ravel()
        plt.pie(values,
                labels=labels,
                autopct='%.0f%%',
                shadow=True,
                radius=0.5)
        plt.savefig('/home/sanika/proj/pie_chart.jpg')

        # second copy of the chart, kept in memory and embedded into the page as base64 PNG
        figfile = BytesIO()
        plt.savefig(figfile, format='png')
        figfile.seek(0)
        figdata_png = base64.b64encode(figfile.getvalue()).decode('ascii')
        plt.close()

        # translate label columns to class names; anything that is not 0 or 1 counts as antifungal
        res = [labels[col] if col in (0, 1) else labels[2] for col in out]

        return render_template('seq.html', seq=res, result=figdata_png)

    return render_template('predictor.html')
    try:
        desc = GlobalDescriptor([database['Sequence'][i]])
        desc.hydrophobic_ratio()
        hydrophobic_ratio_array.append(desc.descriptor[0][0])
    except:
        hydrophobic_ratio_array.append('')

database['hydrophobic_ratio'] = hydrophobic_ratio_array

print("Estimate hydrophobicity_profile_array")

# Hydrophobicity profile (prof_type='H') on the Eisenberg scale, one value per
# row of `database`; rows that cannot be processed get an empty string.
hydrophobicity_profile_array = []
for i in range(len(database)):
    try:
        desc = PeptideDescriptor([database['Sequence'][i]],
                                 scalename='Eisenberg')
        desc.calculate_profile(prof_type='H')
        hydrophobicity_profile_array.append(desc.descriptor[0][0])
    except Exception:
        # Best effort: keep the column aligned with the rows when a sequence
        # fails, but let KeyboardInterrupt / SystemExit stop the run (a bare
        # `except:` would swallow those as well).
        hydrophobicity_profile_array.append('')

database['hydrophobicity_profile'] = hydrophobicity_profile_array

print("Estimate hydrophobic_profile_array")
#profile hydrophobicity
hydrophobic_profile_array = []
for i in range(len(database)):
    try:
        desc = PeptideDescriptor([database['Sequence'][i]],
                                 scalename='Eisenberg')
        desc.calculate_profile(prof_type='uH')
Beispiel #24
0
    def analyze_generated(self, num, fname='analysis.txt', plot=False):
        """ Method to analyze the generated sequences located in `self.generated`.

        The report written to ``fname`` compares the generated sequences with the training sequences in
        `self.sequences` and with two reference sets that are created here and stored in `self.ran`
        (random sequences) and `self.hel` (amphipathic helices). It contains duplicate / overlap counts,
        the length distribution, euclidean distances in 'pepcats' autocorrelation space and in scaled
        global descriptor space, and the Eisenberg hydrophobic moments.

        :param num: {int} wanted number of sequences to sample
        :param fname: {str} filename to save analysis info to
        :param plot: {bool} whether to plot an overview of descriptors; the plot is saved next to ``fname``
            with the extension replaced by ``.png``
        :return: file with analysis info (distances)
        """
        import os  # local import, only needed to derive the plot filename

        with open(fname, 'w') as f:
            print("Analyzing...")
            f.write("ANALYSIS OF SAMPLED SEQUENCES\n==============================\n\n")
            f.write("Nr. of duplicates in generated sequences: %i\n" % (len(self.generated) - len(set(self.generated))))
            count = len(set(self.generated) & set(self.sequences))  # get shared entries in both lists
            f.write("%.1f percent of generated sequences are present in the training data.\n" %
                    ((count / len(self.generated)) * 100))

            # length statistics; sequences containing 'B' are filtered out before measuring
            d = GlobalDescriptor(self.generated)
            len1 = len(d.sequences)
            d.filter_aa('B')
            len2 = len(d.sequences)
            d.length()
            f.write("\n\nLENGTH DISTRIBUTION OF GENERATED DATA:\n\n")
            f.write("Number of sequences too short:\t%i\n" % (num - len1))
            f.write("Number of invalid (with 'B'):\t%i\n" % (len1 - len2))
            f.write("Number of valid unique seqs:\t%i\n" % len2)
            f.write("Mean sequence length:     \t\t%.1f ± %.1f\n" % (np.mean(d.descriptor), np.std(d.descriptor)))
            f.write("Median sequence length:   \t\t%i\n" % np.median(d.descriptor))
            f.write("Minimal sequence length:  \t\t%i\n" % np.min(d.descriptor))
            f.write("Maximal sequence length:  \t\t%i\n" % np.max(d.descriptor))

            descriptor = 'pepcats'
            # training sequences are used without their first character and trailing whitespace
            # NOTE(review): presumably a start token and padding - confirm against the training code
            seq_desc = PeptideDescriptor([s[1:].rstrip() for s in self.sequences], descriptor)
            seq_desc.calculate_autocorr(7)
            gen_desc = PeptideDescriptor(d.sequences, descriptor)
            gen_desc.calculate_autocorr(7)

            # random comparison set, same size and length range as the generated sequences
            self.ran = Random(len(self.generated), np.min(d.descriptor), np.max(d.descriptor))  # generate rand seqs
            probas = count_aas(''.join(seq_desc.sequences)).values()  # get the aa distribution of training seqs
            self.ran.generate_sequences(proba=probas)
            ran_desc = PeptideDescriptor(self.ran.sequences, descriptor)
            ran_desc.calculate_autocorr(7)

            # amphipathic helices comparison set, same size and length range as the generated sequences
            self.hel = Helices(len(self.generated), np.min(d.descriptor), np.max(d.descriptor))
            self.hel.generate_sequences()
            hel_desc = PeptideDescriptor(self.hel.sequences, descriptor)
            hel_desc.calculate_autocorr(7)

            # distance calculation: every set against the training data
            f.write("\n\nDISTANCE CALCULATION IN '%s' DESCRIPTOR SPACE\n\n" % descriptor.upper())
            desc_dist = distance.cdist(gen_desc.descriptor, seq_desc.descriptor, metric='euclidean')
            f.write("Average euclidean distance of sampled to training data:\t%.3f +/- %.3f\n" %
                    (np.mean(desc_dist), np.std(desc_dist)))
            ran_dist = distance.cdist(ran_desc.descriptor, seq_desc.descriptor, metric='euclidean')
            f.write("Average euclidean distance if randomly sampled seqs:\t%.3f +/- %.3f\n" %
                    (np.mean(ran_dist), np.std(ran_dist)))
            hel_dist = distance.cdist(hel_desc.descriptor, seq_desc.descriptor, metric='euclidean')
            f.write("Average euclidean distance if amphipathic helical seqs:\t%.3f +/- %.3f\n" %
                    (np.mean(hel_dist), np.std(hel_dist)))

            # more simple descriptors
            g_seq = GlobalDescriptor(seq_desc.sequences)
            g_gen = GlobalDescriptor(gen_desc.sequences)
            g_ran = GlobalDescriptor(ran_desc.sequences)
            g_hel = GlobalDescriptor(hel_desc.sequences)
            g_seq.calculate_all()
            g_gen.calculate_all()
            g_ran.calculate_all()
            g_hel.calculate_all()
            # the scaler is fitted on the training data only and then applied to every set
            sclr = StandardScaler()
            sclr.fit(g_seq.descriptor)
            f.write("\n\nDISTANCE CALCULATION FOR SCALED GLOBAL DESCRIPTORS\n\n")
            desc_dist = distance.cdist(sclr.transform(g_gen.descriptor), sclr.transform(g_seq.descriptor),
                                       metric='euclidean')
            f.write("Average euclidean distance of sampled to training data:\t%.2f +/- %.2f\n" %
                    (np.mean(desc_dist), np.std(desc_dist)))
            ran_dist = distance.cdist(sclr.transform(g_ran.descriptor), sclr.transform(g_seq.descriptor),
                                      metric='euclidean')
            f.write("Average euclidean distance if randomly sampled seqs:\t%.2f +/- %.2f\n" %
                    (np.mean(ran_dist), np.std(ran_dist)))
            hel_dist = distance.cdist(sclr.transform(g_hel.descriptor), sclr.transform(g_seq.descriptor),
                                      metric='euclidean')
            f.write("Average euclidean distance if amphipathic helical seqs:\t%.2f +/- %.2f\n" %
                    (np.mean(hel_dist), np.std(hel_dist)))

            # hydrophobic moments
            uh_seq = PeptideDescriptor(seq_desc.sequences, 'eisenberg')
            uh_seq.calculate_moment()
            uh_gen = PeptideDescriptor(gen_desc.sequences, 'eisenberg')
            uh_gen.calculate_moment()
            uh_ran = PeptideDescriptor(ran_desc.sequences, 'eisenberg')
            uh_ran.calculate_moment()
            uh_hel = PeptideDescriptor(hel_desc.sequences, 'eisenberg')
            uh_hel.calculate_moment()
            f.write("\n\nHYDROPHOBIC MOMENTS\n\n")
            f.write("Hydrophobic moment of training seqs:\t%.3f +/- %.3f\n" %
                    (np.mean(uh_seq.descriptor), np.std(uh_seq.descriptor)))
            f.write("Hydrophobic moment of sampled seqs:\t\t%.3f +/- %.3f\n" %
                    (np.mean(uh_gen.descriptor), np.std(uh_gen.descriptor)))
            f.write("Hydrophobic moment of random seqs:\t\t%.3f +/- %.3f\n" %
                    (np.mean(uh_ran.descriptor), np.std(uh_ran.descriptor)))
            f.write("Hydrophobic moment of amphipathic seqs:\t%.3f +/- %.3f\n" %
                    (np.mean(uh_hel.descriptor), np.std(uh_hel.descriptor)))

        if plot:
            if self.refs:
                a = GlobalAnalysis([uh_seq.sequences, uh_gen.sequences, uh_hel.sequences, uh_ran.sequences],
                                   ['training', 'sampled', 'hel', 'ran'])
            else:
                a = GlobalAnalysis([uh_seq.sequences, uh_gen.sequences], ['training', 'sampled'])
            # swap the real extension for '.png'; slicing off a fixed 4 characters mangles names whose
            # extension is not exactly three letters long (or that have none at all)
            a.plot_summary(filename=os.path.splitext(fname)[0] + '.png')
Beispiel #25
0
def main():
    """Train a self-organizing map on virtual peptide libraries and plot known peptides on it.

    Generates three sub-libraries of virtual peptides, describes them with 'pepcats' cross-correlation
    descriptors, fits a 12 x 12 SOM on them, writes several map plots to PNG files in the working
    directory and finally queries the map for neighbors of one example sequence.
    """
    # generate some virtual peptide sequences
    libnum = 1000  # 1000 sequences per sublibrary
    h = Helices(seqnum=libnum)
    r = Random(seqnum=libnum)
    n = AMPngrams(seqnum=libnum, n_min=4)
    h.generate_sequences()
    r.generate_sequences(proba='AMP')
    n.generate_sequences()

    # calculate molecular descriptors for the peptides (sub-libraries stacked in the order h, r, n)
    d = PeptideDescriptor(seqs=np.hstack(
        (h.sequences, r.sequences, n.sequences)),
                          scalename='pepcats')
    d.calculate_crosscorr(window=7)

    # train a som on the descriptors and print / plot the training error
    som = SOM(x=12, y=12)
    som.fit(data=d.descriptor, epochs=100000, decay='hill')
    print("Fit error: %.4f" % som.error)
    som.plot_error_history(filename="som_error.png")

    # load known antimicrobial peptides (AMPs) and transmembrane sequences
    dataset = load_AMPvsTM()
    d2 = PeptideDescriptor(dataset.sequences, 'pepcats')
    d2.calculate_crosscorr(7)
    # class labels: 0 / 1 / 2 for the three sub-libraries, followed by 206 entries of class 3
    # NOTE(review): 206 is presumably the number of AMP sequences in the data set - confirm
    targets = np.array(libnum * [0] + libnum * [1] + libnum * [2] + 206 * [3])
    names = ['Helices', 'Random', 'nGrams', 'AMP']

    # plot som maps with location of AMPs
    # NOTE(review): only the rows of d2.descriptor from index 206 onwards are used for the point map,
    # presumably the AMP part of the data set - confirm the row order of load_AMPvsTM()
    som.plot_point_map(np.vstack((d.descriptor, d2.descriptor[206:])),
                       targets,
                       names,
                       filename="peptidesom.png")
    som.plot_density_map(np.vstack((d.descriptor, d2.descriptor)),
                         filename="density.png")
    som.plot_distance_map(colormap='Reds', filename="distances.png")

    # one class density map per target class, each with its own colormap
    # NOTE(review): all of d2.descriptor is stacked here while `targets` only has 206 labels beyond
    # the virtual libraries - verify that plot_class_density accepts the differing lengths
    colormaps = ['Oranges', 'Purples', 'Greens', 'Reds']
    for i, c in enumerate(set(targets)):
        som.plot_class_density(np.vstack((d.descriptor, d2.descriptor)),
                               targets,
                               c,
                               names,
                               colormap=colormaps[i],
                               filename='class%i.png' % c)

    # get neighboring peptides (AMPs / TMs) for a sequence of interest
    my_d = PeptideDescriptor(seqs='GLFDIVKKVVGALLAG', scalename='pepcats')
    my_d.calculate_crosscorr(window=7)
    som.get_neighbors(datapoint=my_d.descriptor,
                      data=d2.descriptor,
                      labels=dataset.sequences,
                      d=0)