def test_filter_aa(self):
     """Filtering on 'D' must drop matching sequences and their descriptor rows."""
     peptides = ['GLFDIVKKVVGALG', 'LLLLLL', 'KKKKKKKKKK', 'DDDDDDDDDDDD']
     survivors = ['LLLLLL', 'KKKKKKKKKK']  # the only entries without a 'D'

     desc = GlobalDescriptor(peptides)
     desc.calculate_charge()
     desc.filter_aa(['D'])

     self.assertEqual(desc.sequences, survivors)
     self.assertEqual(len(desc.descriptor), len(survivors))
 def test_filter_values(self):
     """Filtering on charge >= 1 must keep only the poly-lysine sequence."""
     peptides = ['GLFDIVKKVVGALG', 'LLLLLL', 'KKKKKKKKKK', 'DDDDDDDDDDDD']
     survivors = ['KKKKKKKKKK']

     desc = GlobalDescriptor(peptides)
     desc.calculate_charge()
     desc.filter_values(values=[1.], operator='>=')

     self.assertEqual(desc.sequences, survivors)
     self.assertEqual(len(desc.descriptor), len(survivors))
Ejemplo n.º 3
0
def insert_phycs(seq_df):
    """Add physicochemical property columns to a peptide table, in place.

    The following columns are added (in this order): ``'IEP'``,
    ``'Net Charge'``, ``'Hydrophobic Moment'``, ``'Hydrophobicity'``,
    ``'Transmembrane Propensity'``, ``'Alpha Helical Propensity'``,
    ``'Aliphatic Index'`` and ``'Boman Index'``.

    :param seq_df: DataFrame with a ``'Sequence'`` column holding the
        peptide sequences.
    :return: the same ``seq_df`` object, with the new columns attached.
    """

    def get_ieq_nc(seq, is_iep=True):
        # Isoelectric point, or net charge at pH 7.0, of a single peptide.
        protparam = PA(seq)
        if is_iep:
            return protparam.isoelectric_point()
        return protparam.charge_at_pH(7.0)

    def scale_global(values, scale_name):
        # Global (whole-sequence) value of one amino-acid scale, flattened.
        desc = PeptideDescriptor(values, scale_name)
        desc.calculate_global()
        return desc.descriptor.reshape(-1)

    # Iterate the column directly. The previous code sized helper lists with
    # ``seq_df.size`` (rows * columns), which only worked because ``map``
    # stops at the shortest input.
    sequences = seq_df['Sequence']
    seq_df['IEP'] = [get_ieq_nc(s, True) for s in sequences]
    seq_df['Net Charge'] = [get_ieq_nc(s, False) for s in sequences]

    seq_values = sequences.values

    # Hydrophobic moment on the Eisenberg scale; angle=100 treats every
    # peptide as an alpha-helix, and the large window covers whole sequences.
    moment = PeptideDescriptor(seq_values, 'eisenberg')
    moment.calculate_moment(window=1000, angle=100, modality='max')
    seq_df['Hydrophobic Moment'] = moment.descriptor.reshape(-1)

    # Scale-based global descriptors.
    seq_df['Hydrophobicity'] = scale_global(seq_values, 'hopp-woods')
    seq_df['Transmembrane Propensity'] = scale_global(seq_values, 'tm_tend')
    seq_df['Alpha Helical Propensity'] = scale_global(
        seq_values, 'levitt_alpha')

    # Aliphatic index
    aliphatic = GlobalDescriptor(seq_values)
    aliphatic.aliphatic_index()
    seq_df['Aliphatic Index'] = aliphatic.descriptor.reshape(-1)

    # Boman index
    boman = GlobalDescriptor(seq_values)
    boman.boman_index()
    seq_df['Boman Index'] = boman.descriptor.reshape(-1)

    return seq_df
Ejemplo n.º 4
0
 def calc_len(self):
     """Compute the sequence lengths for every sub-library in ``self.library``.

     One array of lengths per entry along the first axis of ``self.library``
     is appended to the attribute :py:attr:`len`; nothing is returned.
     """
     n_sublibs = self.library.shape[0]
     for idx in range(n_sublibs):
         desc = GlobalDescriptor(self.library[idx])
         desc.length()
         lengths = desc.descriptor[:, 0]  # first (only) descriptor column
         self.len.append(lengths)
Ejemplo n.º 5
0
 def calc_charge(self, ph=7.0, amide=True):
     """Compute the total molecular charge for every sub-library in ``self.library``.

     :param ph: {float} pH at which the peptide charge is calculated.
     :param amide: {boolean} whether the sequences have an amidated C-terminus.
     :return: nothing; one array of charges per entry along the first axis of
         ``self.library`` is appended to the attribute :py:attr:`charge`.
     """
     n_sublibs = self.library.shape[0]
     for idx in range(n_sublibs):
         desc = GlobalDescriptor(self.library[idx])
         desc.calculate_charge(ph=ph, amide=amide)
         charges = desc.descriptor[:, 0]  # first (only) descriptor column
         self.charge.append(charges)
Ejemplo n.º 6
0
    def analyze_training(self):
        """Print a summary of the sequence-length distribution of the training data.

        :return: nothing; count, mean +/- std, median, minimum and maximum of the
            lengths of ``self.sequences`` are printed to stdout.
        """
        length_desc = GlobalDescriptor(self.sequences)
        length_desc.length()
        lengths = length_desc.descriptor

        print("\nLENGTH DISTRIBUTION OF TRAINING DATA:\n")
        print("Number of sequences:    \t%i" % len(self.sequences))
        # each statistic is computed right before it is printed, so the output
        # emitted before any failure is unchanged
        mean_len = np.mean(lengths)
        std_len = np.std(lengths)
        print("Mean sequence length:   \t%.1f ± %.1f" % (mean_len, std_len))
        median_len = np.median(lengths)
        print("Median sequence length: \t%i" % median_len)
        shortest = np.min(lengths)
        print("Minimal sequence length:\t%i" % shortest)
        longest = np.max(lengths)
        print("Maximal sequence length:\t%i" % longest)
Ejemplo n.º 7
0
    def generate_features(seq):
        """Build the unscaled feature table for a list of sequences.

        ``seq`` is expected to be a list of sequences (a one-element list for a
        single sequence). The returned pandas DataFrame holds the modlamp
        global descriptors first, followed column-wise by the custom features.
        """
        from modlamp.descriptors import GlobalDescriptor

        # custom features are computed first, one row per input sequence
        custom_part = pd.Series(seq).apply(generate_custom_features)

        descriptor = GlobalDescriptor(seq)
        descriptor.calculate_all()
        modlamp_part = pd.DataFrame(descriptor.descriptor)
        modlamp_part.columns = descriptor.featurenames

        return pd.concat([modlamp_part, custom_part], axis=1)
Ejemplo n.º 8
0
def calculate_peptide_props(fasta_dict):
    '''
    Compute the modlamp global descriptors for every record of a sequence
    dictionary (made from get_sequence_dict).

    Returns a list with one dictionary per record: descriptor names map to
    the calculated values, and 'Peptide_name' holds the record's header.
    The result is suitable for writing to a csv file with DictWriter.
    '''

    def _props_for(header):
        # Describe one record and tag the result with its header.
        sequence = str(fasta_dict[header].seq)
        desc = GlobalDescriptor([sequence])
        desc.calculate_all()
        props = dict(zip(desc.featurenames, desc.descriptor[0]))
        props['Peptide_name'] = header
        return props

    return [_props_for(header) for header in fasta_dict]
Ejemplo n.º 9
0
def makeintlistdic_from_allep(dir_name, run_dir):
    """Collect per-epoch statistics of the sequences written by a run.

    Files named ``dir_name + run_dir + str(i) + '.txt'`` are read for
    i = 1, 2, ... until the first index whose file does not exist. Every
    non-empty line of a file is taken as one sequence, for which length,
    isoelectric point and hydrophobic ratio are calculated.

    :param dir_name: path prefix of the epoch files (concatenated as-is).
    :param run_dir: second part of the file prefix (concatenated as-is).
    :return: dict with the keys ``len_ave``, ``pi_ave``, ``hyd_ave``,
        ``len_var``, ``pi_var`` and ``hyd_var``; each maps to a list with one
        value per epoch file, rounded to 3 decimals (mean or population
        variance of the respective property).
    :raises ZeroDivisionError: if an epoch file contains no sequences.
    """
    intlistdic = {
        "len_ave": [],
        "pi_ave": [],
        "hyd_ave": [],
        "len_var": [],
        "pi_var": [],
        "hyd_var": [],
    }

    i = 1
    while True:
        path = dir_name + run_dir + str(i) + '.txt'
        if not os.path.exists(path):
            break

        len_list_ep, pi_list_ep, hyd_list_ep = [], [], []
        with open(path) as f:
            for line in f:
                # Strip only the line terminator: slicing with [:-1] cut off
                # the last residue when the final line had no newline.
                seq = line.rstrip('\r\n')
                if not seq:
                    continue  # ignore blank lines (e.g. a trailing one)
                desc = GlobalDescriptor(seq)
                desc.length()
                len_list_ep.append(desc.descriptor[0][0])
                desc.isoelectric_point()
                pi_list_ep.append(desc.descriptor[0][0])
                desc.hydrophobic_ratio()
                hyd_list_ep.append(desc.descriptor[0][0])

        seq_size = len(len_list_ep)

        # Mean = sum / count. The previous code divided len(list) by the
        # count, which always produced 1.0.
        intlistdic["len_ave"].append(round(sum(len_list_ep) / seq_size, 3))
        intlistdic["pi_ave"].append(round(sum(pi_list_ep) / seq_size, 3))
        intlistdic["hyd_ave"].append(round(sum(hyd_list_ep) / seq_size, 3))
        intlistdic["len_var"].append(
            round(statistics.pvariance(len_list_ep), 3))
        intlistdic["pi_var"].append(
            round(statistics.pvariance(pi_list_ep), 3))
        intlistdic["hyd_var"].append(
            round(statistics.pvariance(hyd_list_ep), 3))

        i += 1

    return intlistdic
Ejemplo n.º 10
0
    def _read_header(self):
        """Private method called by ``__init__`` to read all file headers into the class attributes and calculate
        sequence dependent values.

        For every file in ``self.filenames`` the first four lines are read as header (name, sequence,
        concentration in umol, solvent) and the following ``self.wmax - self.wmin + 1`` lines as comma separated
        ``wavelength,ellipticity`` data.

        :return: nothing; values are appended to the attributes ``names``, ``sequences``, ``conc_umol``,
            ``solvent``, ``circular_dichroism``, ``mw``, ``conc_mgml`` and ``meanres_mw``.
        :raises StopIteration: if a file has fewer lines than expected.
        """

        d = GlobalDescriptor('X')  # template, sequences are swapped in per file

        # loop through all files in the directory
        for fname in self.filenames:
            with open(join(self.directory, fname)) as f:  # read first 4 lines as header, rest as data
                head = [next(f) for _ in range(4)]
                data = [next(f) for _ in range(4, (self.wmax - self.wmin) + 5)]

            # read headers into class attributes
            # rstrip() removes the line terminator whatever its form; split('\r\n') never matched when the
            # file was opened in text mode (newlines arrive as '\n'), leaving a trailing newline behind.
            name = head[0].rstrip('\r\n')
            self.names.append(name)
            sequence = head[1].rstrip('\r\n').strip()
            self.sequences.append(sequence)
            umol = float(head[2])  # float() ignores surrounding whitespace
            self.conc_umol.append(umol)
            self.solvent.append(head[3].rstrip('\r\n'))

            # read CD data as (wavelength, ellipticity) pairs
            columns = [line.split(',') for line in data]
            self.circular_dichroism.append(
                np.array([(int(cols[0]), float(cols[1])) for cols in columns]))

            # calculate MW and transform concentration to mg/ml
            d.sequences = [sequence]
            d.calculate_MW(amide=self.amide)
            mw = d.descriptor[0][0]  # use the fresh value instead of indexing self.mw by file position
            self.mw.append(mw)
            self.conc_mgml.append(mw * umol / 10**6)
            self.meanres_mw.append(
                mw / (len(sequence) - 1))  # mean residue molecular weight (MW / n-1)
Ejemplo n.º 11
0
    def analyze_generated(self, num, fname='analysis.txt', plot=False):
        """ Method to analyze the generated sequences located in `self.generated`.

        A text report is written to ``fname`` containing, in this order: the number of duplicates and the
        overlap with ``self.sequences``, the length statistics of the generated sequences after removing those
        containing 'B', euclidean distances to the training data in 'pepcats' autocorrelation space and in
        standard-scaled global descriptor space, and the hydrophobic moments. Every distance / moment is also
        reported for a random and an amphipathic-helix comparison set, which are stored in ``self.ran`` and
        ``self.hel``.

        :param num: {int} wanted number of sequences to sample; only used for the "too short" count in the report
        :param fname: {str} filename to save analysis info to
        :param plot: {bool} whether to plot an overview of descriptors (saved as ``fname[:-4] + '.png'``)
        :return: file with analysis info (distances)
        """
        with open(fname, 'w') as f:
            print("Analyzing...")
            f.write("ANALYSIS OF SAMPLED SEQUENCES\n==============================\n\n")
            f.write("Nr. of duplicates in generated sequences: %i\n" % (len(self.generated) - len(set(self.generated))))
            count = len(set(self.generated) & set(self.sequences))  # get shared entries in both lists
            # NOTE(review): divides by len(self.generated); an empty list raises ZeroDivisionError here
            f.write("%.1f percent of generated sequences are present in the training data.\n" %
                    ((count / len(self.generated)) * 100))
            d = GlobalDescriptor(self.generated)
            len1 = len(d.sequences)  # number of sequences before filtering
            d.filter_aa('B')
            len2 = len(d.sequences)  # number of sequences left after removing those with 'B'
            d.length()  # from here on d.descriptor holds the sequence lengths
            f.write("\n\nLENGTH DISTRIBUTION OF GENERATED DATA:\n\n")
            f.write("Number of sequences too short:\t%i\n" % (num - len1))
            f.write("Number of invalid (with 'B'):\t%i\n" % (len1 - len2))
            f.write("Number of valid unique seqs:\t%i\n" % len2)
            f.write("Mean sequence length:     \t\t%.1f ± %.1f\n" % (np.mean(d.descriptor), np.std(d.descriptor)))
            f.write("Median sequence length:   \t\t%i\n" % np.median(d.descriptor))
            f.write("Minimal sequence length:  \t\t%i\n" % np.min(d.descriptor))
            f.write("Maximal sequence length:  \t\t%i\n" % np.max(d.descriptor))
            
            descriptor = 'pepcats'
            # training sequences are used without their first character and trailing whitespace
            # (presumably a start token / padding -- TODO confirm against where self.sequences is built)
            seq_desc = PeptideDescriptor([s[1:].rstrip() for s in self.sequences], descriptor)
            seq_desc.calculate_autocorr(7)
            gen_desc = PeptideDescriptor(d.sequences, descriptor)
            gen_desc.calculate_autocorr(7)
            
            # random comparison set
            self.ran = Random(len(self.generated), np.min(d.descriptor), np.max(d.descriptor))  # generate rand seqs
            probas = count_aas(''.join(seq_desc.sequences)).values()  # get the aa distribution of training seqs
            self.ran.generate_sequences(proba=probas)
            ran_desc = PeptideDescriptor(self.ran.sequences, descriptor)
            ran_desc.calculate_autocorr(7)
            
            # amphipathic helices comparison set
            self.hel = Helices(len(self.generated), np.min(d.descriptor), np.max(d.descriptor))
            self.hel.generate_sequences()
            hel_desc = PeptideDescriptor(self.hel.sequences, descriptor)
            hel_desc.calculate_autocorr(7)
            
            # distance calculation
            f.write("\n\nDISTANCE CALCULATION IN '%s' DESCRIPTOR SPACE\n\n" % descriptor.upper())
            desc_dist = distance.cdist(gen_desc.descriptor, seq_desc.descriptor, metric='euclidean')
            f.write("Average euclidean distance of sampled to training data:\t%.3f +/- %.3f\n" %
                    (np.mean(desc_dist), np.std(desc_dist)))
            ran_dist = distance.cdist(ran_desc.descriptor, seq_desc.descriptor, metric='euclidean')
            f.write("Average euclidean distance if randomly sampled seqs:\t%.3f +/- %.3f\n" %
                    (np.mean(ran_dist), np.std(ran_dist)))
            hel_dist = distance.cdist(hel_desc.descriptor, seq_desc.descriptor, metric='euclidean')
            f.write("Average euclidean distance if amphipathic helical seqs:\t%.3f +/- %.3f\n" %
                    (np.mean(hel_dist), np.std(hel_dist)))
            
            # more simple descriptors
            g_seq = GlobalDescriptor(seq_desc.sequences)
            g_gen = GlobalDescriptor(gen_desc.sequences)
            g_ran = GlobalDescriptor(ran_desc.sequences)
            g_hel = GlobalDescriptor(hel_desc.sequences)
            g_seq.calculate_all()
            g_gen.calculate_all()
            g_ran.calculate_all()
            g_hel.calculate_all()
            # the scaler is fitted on the training descriptors only and then applied to every set
            sclr = StandardScaler()
            sclr.fit(g_seq.descriptor)
            f.write("\n\nDISTANCE CALCULATION FOR SCALED GLOBAL DESCRIPTORS\n\n")
            desc_dist = distance.cdist(sclr.transform(g_gen.descriptor), sclr.transform(g_seq.descriptor),
                                       metric='euclidean')
            f.write("Average euclidean distance of sampled to training data:\t%.2f +/- %.2f\n" %
                    (np.mean(desc_dist), np.std(desc_dist)))
            ran_dist = distance.cdist(sclr.transform(g_ran.descriptor), sclr.transform(g_seq.descriptor),
                                      metric='euclidean')
            f.write("Average euclidean distance if randomly sampled seqs:\t%.2f +/- %.2f\n" %
                    (np.mean(ran_dist), np.std(ran_dist)))
            hel_dist = distance.cdist(sclr.transform(g_hel.descriptor), sclr.transform(g_seq.descriptor),
                                      metric='euclidean')
            f.write("Average euclidean distance if amphipathic helical seqs:\t%.2f +/- %.2f\n" %
                    (np.mean(hel_dist), np.std(hel_dist)))
            
            # hydrophobic moments
            uh_seq = PeptideDescriptor(seq_desc.sequences, 'eisenberg')
            uh_seq.calculate_moment()
            uh_gen = PeptideDescriptor(gen_desc.sequences, 'eisenberg')
            uh_gen.calculate_moment()
            uh_ran = PeptideDescriptor(ran_desc.sequences, 'eisenberg')
            uh_ran.calculate_moment()
            uh_hel = PeptideDescriptor(hel_desc.sequences, 'eisenberg')
            uh_hel.calculate_moment()
            f.write("\n\nHYDROPHOBIC MOMENTS\n\n")
            f.write("Hydrophobic moment of training seqs:\t%.3f +/- %.3f\n" %
                    (np.mean(uh_seq.descriptor), np.std(uh_seq.descriptor)))
            f.write("Hydrophobic moment of sampled seqs:\t\t%.3f +/- %.3f\n" %
                    (np.mean(uh_gen.descriptor), np.std(uh_gen.descriptor)))
            f.write("Hydrophobic moment of random seqs:\t\t%.3f +/- %.3f\n" %
                    (np.mean(uh_ran.descriptor), np.std(uh_ran.descriptor)))
            f.write("Hydrophobic moment of amphipathic seqs:\t%.3f +/- %.3f\n" %
                    (np.mean(uh_hel.descriptor), np.std(uh_hel.descriptor)))
        
        # the report file is closed at this point; plotting reuses the sequence sets built above
        if plot:
            if self.refs:
                # include the helical and random reference sets in the overview
                a = GlobalAnalysis([uh_seq.sequences, uh_gen.sequences, uh_hel.sequences, uh_ran.sequences],
                                   ['training', 'sampled', 'hel', 'ran'])
            else:
                a = GlobalAnalysis([uh_seq.sequences, uh_gen.sequences], ['training', 'sampled'])
            # NOTE(review): fname[:-4] assumes fname ends in a 4-character extension such as '.txt'
            a.plot_summary(filename=fname[:-4] + '.png')
Ejemplo n.º 12
0
def predict():
    """Predict the activity class of a single submitted sequence.

    On POST the submitted ``seq`` form field is written to a FASTA file,
    peptide and global descriptors are calculated from that file, and the
    pickled model ``ml_model.pkl`` is used to predict the activity classes.
    The class probabilities are drawn as a pie chart that is saved to disk
    and also passed to the ``seq.html`` template as a base64 encoded PNG.
    Any other request method renders ``predictor.html``.

    :return: the rendered template.
    """
    if request.method != 'POST':
        return render_template('predictor.html')

    # Write and read the SAME file. The previous code wrote 'random.fasta'
    # relative to the working directory but read the absolute path below,
    # which only matched when the app was started from that directory.
    # NOTE(review): the file is shared by all requests, so concurrent
    # submissions can overwrite each other.
    fasta_path = '/home/sanika/proj/random.fasta'

    seq = request.form['seq']
    with open(fasta_path, "w") as fp:
        fp.write(seq)

    pepdesc = PeptideDescriptor(fasta_path,
                                'eisenberg')  # use Eisenberg consensus scale
    globdesc = GlobalDescriptor(fasta_path)

    # --------------- Peptide Descriptor (AA scales) Calculations ---------------
    pepdesc.calculate_global()  # calculate global Eisenberg hydrophobicity
    pepdesc.calculate_moment(
        append=True)  # calculate Eisenberg hydrophobic moment

    # load other AA scales
    pepdesc.load_scale('gravy')  # load GRAVY scale
    pepdesc.calculate_global(
        append=True)  # calculate global GRAVY hydrophobicity
    pepdesc.calculate_moment(
        append=True)  # calculate GRAVY hydrophobic moment
    pepdesc.load_scale('z3')  # load old Z scale
    pepdesc.calculate_autocorr(
        1, append=True)  # calculate global Z scale (=window1 autocorrelation)

    # --------------- Global Descriptor Calculations ---------------
    globdesc.length()  # sequence length
    globdesc.boman_index(append=True)  # Boman index
    globdesc.aromaticity(append=True)  # global aromaticity
    globdesc.aliphatic_index(append=True)  # aliphatic index
    globdesc.instability_index(append=True)  # instability index
    globdesc.calculate_charge(ph=7.4, amide=False,
                              append=True)  # net charge
    globdesc.calculate_MW(amide=False, append=True)  # molecular weight

    # feature matrix: global descriptors first, then the AA-scale descriptors
    result = np.concatenate((globdesc.descriptor, pepdesc.descriptor), axis=1)

    # NOTE(review): joblib.load unpickles the model file; only load trusted files.
    clf = joblib.load('ml_model.pkl')
    pred = clf.predict(result)
    proba = clf.predict_proba(result).tocoo()
    out = pred.tocoo().col  # column indices of the predicted classes

    labels = ['antiviral', 'antibacterial', 'antifungal']
    plt.pie(proba.data,
            labels=labels,
            autopct='%.0f%%',
            shadow=True,
            radius=0.5)
    plt.savefig('/home/sanika/proj/pie_chart.jpg')

    # second copy of the chart, kept in memory and embedded in the page
    figfile = BytesIO()
    plt.savefig(figfile, format='png')
    figfile.seek(0)
    figdata_png = base64.b64encode(figfile.getvalue()).decode('ascii')
    plt.close()

    # class 0 -> antiviral, class 1 -> antibacterial, anything else -> antifungal
    res = [labels[idx] if idx in (0, 1) else labels[2] for idx in out]

    return render_template('seq.html', seq=res, result=figdata_png)
Ejemplo n.º 13
0
def propi():
    """Build the complete descriptor table for the module-level ``seq`` collection.

    Calculated per sequence: modlamp global descriptors, amino acid and
    dipeptide composition, Moreau-Broto / Moran / Geary autocorrelation, CTD,
    type I pseudo amino acid composition, quasi-sequence-order and
    sequence-order-coupling numbers.

    :return: DataFrame with all descriptor groups concatenated column-wise.
    """

    def _to_frame(records):
        # Turn per-sequence records into a DataFrame with a fresh 0..n-1 index.
        frame = pd.DataFrame.from_dict(records)
        frame.reset_index(drop=True, inplace=True)
        return frame

    def _quasi_order(pep):
        return qua.GetQuasiSequenceOrder(pep, maxlag=5, weight=0.1)

    def _coupling_number(pep):
        return qua.GetSequenceOrderCouplingNumber(pep, d=1)

    # Physicochemical (global) properties
    physchem = GlobalDescriptor(seq)
    physchem.calculate_all()
    physchem_values = physchem.descriptor

    # Lazy per-sequence calculations; nothing is evaluated until a frame is built.
    lazy_aa = map(AC.CalculateAAComposition, seq)  # amino acid composition
    lazy_dipep = map(AC.CalculateDipeptideComposition, seq)  # dipeptides
    lazy_moreau = map(auto.CalculateNormalizedMoreauBrotoAutoTotal, seq)
    lazy_moran = map(auto.CalculateMoranAutoTotal, seq)
    lazy_geary = map(auto.CalculateGearyAutoTotal, seq)
    lazy_ctd = map(CTD.CalculateCTD, seq)  # composition - distribution - transition
    lazy_quasi = map(_quasi_order, seq)
    lazy_coupling = map(_coupling_number, seq)

    aa_frame = _to_frame(lazy_aa)
    dipep_frame = _to_frame(lazy_dipep)
    moreau_frame = _to_frame(lazy_moreau)
    moran_frame = _to_frame(lazy_moran)
    geary_frame = _to_frame(lazy_geary)
    ctd_frame = _to_frame(lazy_ctd)

    # PseudoAAC - type I, using six amino acid property tables
    aa_properties = [
        PAAC._Hydrophobicity,
        PAAC._hydrophilicity,
        PAAC._residuemass,
        PAAC._pK1,
        PAAC._pK2,
        PAAC._pI,
    ]

    def _pseudo_aac(pep):
        return PAAC.GetPseudoAAC(pep, lamda=3, weight=0.7, AAP=aa_properties)

    pseudo_frame = _to_frame(map(_pseudo_aac, seq))

    quasi_frame = _to_frame(lazy_quasi)
    coupling_frame = _to_frame(lazy_coupling)

    physchem_frame = pd.DataFrame(physchem_values)
    physchem_frame.columns = [
        'Longitud', 'MW', 'Carga', 'DensCarga', 'pIso', 'InestInd', 'Aroma',
        'Alifa', 'Boman', 'HidroRa'
    ]

    blocks = [
        aa_frame, dipep_frame, moreau_frame, moran_frame, ctd_frame,
        pseudo_frame, quasi_frame, coupling_frame, geary_frame, physchem_frame
    ]
    return pd.concat(blocks, axis=1)
class TestGlobalDescriptor(unittest.TestCase):
    """Unit tests for ``GlobalDescriptor``: loading sequences, the individual descriptor
    calculations, filtering and feature scaling.

    Most tests overwrite the descriptor of the shared fixture ``G`` and then check the value
    calculated for its first sequence ('GLFDIVKKVVGALG').
    """

    # Fixtures are class attributes: they are created once, when the class body is executed,
    # and the same instances are shared (and mutated) by all tests.
    G = GlobalDescriptor(
        ['GLFDIVKKVVGALG', 'LLLLLL', 'KKKKKKKKKK', 'DDDDDDDDDDDD'])  # from a list of sequences
    G2 = GlobalDescriptor(join(dirname(__file__), 'files/lib.fasta'))  # from a FASTA file path
    G3 = GlobalDescriptor(join(dirname(__file__), 'files/lib.csv'))  # from a CSV file path

    def test_load(self):
        # every input type (list, FASTA path, CSV path) must populate ``sequences``
        self.assertEqual('GLFDIVKKVVGALG', self.G.sequences[0])
        self.assertEqual('LASKSTSGIGVFGRIRAGLKLKST', self.G2.sequences[2])
        self.assertEqual('NPGKSTTRRI', self.G3.sequences[-1])

    def test_charge(self):
        # default call, amidated C-terminus, and a pH at which the net charge is ~0
        self.G.calculate_charge()
        self.assertAlmostEqual(self.G.descriptor[0, 0], 0.996, 3)
        self.G.calculate_charge(amide=True)
        self.assertAlmostEqual(self.G.descriptor[0, 0], 1.996, 3)
        self.G.calculate_charge(ph=9.84)
        self.assertAlmostEqual(self.G.descriptor[0, 0], -0.000, 3)

    def test_isoelectric(self):
        # isoelectric point with and without amidation
        self.G.isoelectric_point()
        self.assertAlmostEqual(self.G.descriptor[0, 0], 9.840, 3)
        self.G.isoelectric_point(amide=True)
        self.assertAlmostEqual(self.G.descriptor[0, 0], 10.7090, 4)

    def test_charge_density(self):
        self.G.charge_density()
        self.assertAlmostEqual(self.G.descriptor[0, 0], 0.00070, 4)
        # NOTE(review): the amidated call below is executed but its result is never asserted
        self.G.charge_density(amide=True)

    def test_aliphatic_index(self):
        self.G.aliphatic_index()
        self.assertAlmostEqual(self.G.descriptor[0, 0], 152.857, 3)

    def test_boman_index(self):
        self.G.boman_index()
        self.assertAlmostEqual(self.G.descriptor[0, 0], -1.0479, 4)

    def test_filter_aa(self):
        # uses its own instance because filtering removes sequences
        D = GlobalDescriptor(
            ['GLFDIVKKVVGALG', 'LLLLLL', 'KKKKKKKKKK', 'DDDDDDDDDDDD'])
        D.calculate_charge()
        D.filter_aa(['D'])
        # sequences containing 'D' are gone, and so are their descriptor rows
        self.assertEqual(D.sequences, ['LLLLLL', 'KKKKKKKKKK'])
        self.assertEqual(len(D.descriptor), 2)

    def test_filter_values(self):
        # uses its own instance because filtering removes sequences
        E = GlobalDescriptor(
            ['GLFDIVKKVVGALG', 'LLLLLL', 'KKKKKKKKKK', 'DDDDDDDDDDDD'])
        E.calculate_charge()
        E.filter_values(values=[1.], operator='>=')
        # only one sequence (and one descriptor row) is expected to remain
        self.assertEqual(E.sequences, ['KKKKKKKKKK'])
        self.assertEqual(len(E.descriptor), 1)

    def test_instability_index(self):
        self.G.instability_index()
        self.assertAlmostEqual(self.G.descriptor[0, 0], -8.214, 3)

    def test_length(self):
        self.G.length()
        self.assertEqual(self.G.descriptor[0, 0], 14)

    def test_molweight(self):
        self.G.calculate_MW()
        # NOTE(review): exact equality on a floating point value; assertAlmostEqual may be more robust
        self.assertEqual(self.G.descriptor[0, 0], 1415.72)

    def test_featurescaling(self):
        # two descriptor columns (charge, MW) are calculated and then scaled;
        # the first column must end up with mean ~0 and standard deviation 1
        self.G.calculate_charge()
        self.G.calculate_MW(append=True)
        self.G.feature_scaling()
        self.assertAlmostEqual(-5.55111512e-17,
                               np.mean(self.G.descriptor, axis=0).tolist()[0])
        self.assertAlmostEqual(1.,
                               np.std(self.G.descriptor, axis=0).tolist()[0])

    def test_hydroratio(self):
        self.G.hydrophobic_ratio()
        self.assertAlmostEqual(0.57142857, self.G.descriptor[0][0])

    def test_aromaticity(self):
        self.G.aromaticity()
        self.assertAlmostEqual(0.07142857142857142, self.G.descriptor[0][0])

    def test_formula(self):
        # append=True adds the formula to whatever descriptor ``G`` currently holds,
        # hence the check on the last column
        self.G.formula(amide=True, append=True)
        self.assertEqual('C67 H115 N17 O16', self.G.descriptor[0, -1])
Ejemplo n.º 15
0
def upload():
    """Predict the activity classes for every sequence of an uploaded FASTA file.

    On POST with an allowed file, the upload is stored in the configured
    ``UPLOAD_FOLDER``, descriptors are calculated for all of its sequences,
    each sequence is classified with the pickled model ``ml_model.pkl`` and
    the table (id, sequence, activity) is written to ``result.csv``. The
    uploaded file is removed afterwards and ``up.html`` is rendered.

    A POST without an allowed file renders ``upload.html`` with an error
    message; any other request method renders ``predictor.html``.

    :return: the rendered template.
    """
    if request.method != 'POST':
        # This will be executed on GET request.
        return render_template('predictor.html')

    upfile = request.files['file']
    if not (upfile and allowed_file(upfile.filename)):
        error = "PLEASE CHECK THE FORMAT OF FILE TO UPLOAD"
        return render_template('upload.html', error=error)

    # Use the path the file was actually saved to for every later access.
    # The previous code saved into UPLOAD_FOLDER but opened the bare file
    # name, which only matched when UPLOAD_FOLDER was the working directory.
    filename = secure_filename(upfile.filename)
    saved_path = os.path.join(app.config['UPLOAD_FOLDER'], filename)
    upfile.save(saved_path)

    identifiers = []
    sequence = []
    with open(saved_path) as fasta_file:  # Will close handle cleanly
        for seq_record in SeqIO.parse(fasta_file, 'fasta'):  # (generator)
            identifiers.append(seq_record.id)
            # plain strings keep the table column a simple text column
            sequence.append(str(seq_record.seq))

    pepdesc = PeptideDescriptor(saved_path,
                                'eisenberg')  # use Eisenberg consensus scale
    globdesc = GlobalDescriptor(saved_path)

    # --------------- Peptide Descriptor (AA scales) Calculations ---------------
    pepdesc.calculate_global()  # calculate global Eisenberg hydrophobicity
    pepdesc.calculate_moment(
        append=True)  # calculate Eisenberg hydrophobic moment

    # load other AA scales
    pepdesc.load_scale('gravy')  # load GRAVY scale
    pepdesc.calculate_global(
        append=True)  # calculate global GRAVY hydrophobicity
    pepdesc.calculate_moment(
        append=True)  # calculate GRAVY hydrophobic moment
    pepdesc.load_scale('z3')  # load old Z scale
    pepdesc.calculate_autocorr(
        1, append=True)  # calculate global Z scale (=window1 autocorrelation)

    # --------------- Global Descriptor Calculations ---------------
    globdesc.length()  # sequence length
    globdesc.boman_index(append=True)  # Boman index
    globdesc.aromaticity(append=True)  # global aromaticity
    globdesc.aliphatic_index(append=True)  # aliphatic index
    globdesc.instability_index(append=True)  # instability index
    globdesc.calculate_charge(ph=7.4, amide=False,
                              append=True)  # net charge
    globdesc.calculate_MW(amide=False, append=True)  # molecular weight

    # feature matrix: global descriptors first, then the AA-scale descriptors
    result = np.concatenate((globdesc.descriptor, pepdesc.descriptor), axis=1)

    # Load the model once instead of once per sequence.
    # NOTE(review): joblib.load unpickles the model file; only load trusted files.
    clf = joblib.load('ml_model.pkl')

    activities = []
    for features in result:
        prt = np.reshape(features, (-1, 14))  # one row with the 14 features
        class_ids = clf.predict(prt).tocoo().col
        # class 0 -> antiviral, class 1 -> antibacterial, anything else -> antifungal
        names = [
            "antiviral" if class_id == 0 else
            "antibacterial" if class_id == 1 else "antifungal"
            for class_id in class_ids
        ]
        activities.append('-'.join(names))

    df = pd.DataFrame(data={
        "id": identifiers,
        "sequence": sequence,
        "activity": activities
    },
                      columns=['id', 'sequence', 'activity'])
    df.to_csv("result.csv", sep=',', index=False)

    os.remove(saved_path)

    return render_template('up.html', mimetype="text/csv")
Ejemplo n.º 16
0
def describe_sequences():
    '''
    Build a feature array for a fixed set of example peptides, print it and
    save it to "hemolytik_array_custom_tests" with numpy.

    Each row holds: peptide id (0 when the peptide has no "id"), the
    AA-scale descriptors, the global descriptors, the sequence length, the
    relative frequencies of the 20 amino acids and the activity flag
    (1 for "YES", 0 otherwise).
    '''
    aa_letters = list("ACDEFGHIKLMNPQRSTVWY")
    di_letters = [first + second for first in aa_letters for second in aa_letters]
    letters = {1: aa_letters, 2: di_letters}

    def counter(string, seq_type):
        '''
        Count the letters (seq_type 1) or overlapping letter pairs (seq_type 2)
        of a string.

        Returns a dict mapping each letter / pair to its relative frequency.
        '''
        total = len(string)
        tally = dict.fromkeys(letters[seq_type], 0)
        if seq_type == 1:
            for residue in string:
                # unknown letters get their own entry, like the known ones
                tally[residue] = tally.get(residue, 0) + 1.0
            tally = {key: value / total for key, value in tally.items()}
        if seq_type == 2:
            for start in range(total - 1):
                pair = string[start:start + seq_type]
                tally[pair] = tally.get(pair, 0) + 1.0
            tally = {key: value / (total - 1) for key, value in tally.items()}
        return tally

    def residue_distribution(all_residues, seq_type):
        '''
        Takes as arguments a string with letters, and the type of sequence represented.
        Returns a 1 x n array with the relative frequencies in alphabetical
        order of the standard letters (ambiguous letters are left out).
        '''
        frequencies = counter(all_residues, seq_type)
        ordered_letters = sorted(letters[seq_type])
        return np.array([[frequencies[letter] for letter in ordered_letters]])

    peptides = [
        {"seq": "FLPILASLAAKFGPKLFCLVTKKC", "cTer": None, "activity": "YES"},
        {"seq": "ILGPVISTIGGVLGGLLKNL", "cTer": "Amidation", "activity": "YES"},
        {"seq": "GIGGKILSGLKTALKGAAKELASTYLH", "cTer": None, "activity": "NO"},
        {"seq": "GIGSAILSAGKSALKGLAKGLAEHFAN", "cTer": None, "activity": "NO"},
        {"seq": "FLSLIPHAINAVSAIAKHF", "cTer": "Amidation", "activity": "NO"},
    ]

    # Scales loaded after the Eisenberg scale, in column order. The flag says
    # whether the moment is appended before the global value.
    extra_scales = (
        ("Ez", False),
        ("charge_phys", True),
        ("flexibility", True),
        ("polarity", True),
        ("isaeci", False),
        ("refractivity", True),
        ("z5", False),
    )

    for peptide in peptides:
        sequence = peptide["seq"]

        is_amidated = peptide["cTer"] == "Amidation"
        globdesc = GlobalDescriptor(sequence)
        globdesc.calculate_all(amide=is_amidated)

        # Eisenberg hydrophobicity consensus: global value, then moment
        pepdesc = PeptideDescriptor(sequence, "eisenberg")
        pepdesc.calculate_global()
        pepdesc.calculate_moment(append=True)

        for scale_name, with_moment in extra_scales:
            pepdesc.load_scale(scale_name)
            if with_moment:
                pepdesc.calculate_moment(append=True)
            pepdesc.calculate_global(append=True)

        combined = np.concatenate((pepdesc.descriptor, globdesc.descriptor), axis=1)
        peptide["TotalDescriptor"] = str(combined)

        try:
            numeric_id = int(peptide["id"].replace("HEMOLYTIK", ""))
        except KeyError:
            numeric_id = 0
        pepid = np.array([[numeric_id]])

        freq_1d = residue_distribution(sequence, 1)
        freq_2d = residue_distribution(sequence, 2)  # computed, not part of the array

        len_peptide = np.array([[len(sequence)]])

        pepact = np.array([[1 if peptide["activity"] == "YES" else 0]])

        peptide_di2 = di2(sequence)  # computed, not part of the array

        row_parts = (pepid, pepdesc.descriptor, globdesc.descriptor,
                     len_peptide, freq_1d, pepact)
        peptide["array"] = np.concatenate(row_parts, axis=1)

    x = np.concatenate([entry["array"] for entry in peptides], axis=0)
    print(x)

    np.save("hemolytik_array_custom_tests", x, allow_pickle=False)
Ejemplo n.º 17
0
# (property key, GlobalDescriptor method, keyword arguments, round to 4 decimals?)
# The order is the insertion order of the keys in the returned dictionaries.
_SERVICE1_GLOBAL_PROPERTIES = (
	("formula", "formula", {"amide": True}, False),
	("molecular_weigth", "calculate_MW", {"amide": True}, True),
	("boman_index", "boman_index", {}, True),
	("charge", "calculate_charge", {"ph": 7, "amide": True}, True),
	("charge_density", "charge_density", {"ph": 7, "amide": True}, True),
	("isoelectric_point", "isoelectric_point", {}, True),
	("instability_index", "instability_index", {}, True),
	("aromaticity", "aromaticity", {}, True),
	("aliphatic_index", "aliphatic_index", {}, True),
	("hydrophobic_ratio", "hydrophobic_ratio", {}, True),
)

# (property key, PeptideDescriptor method, keyword arguments); always rounded.
_SERVICE1_PEPTIDE_PROPERTIES = (
	("hydrophobicity_profile", "calculate_profile", {"prof_type": "H"}),
	("hydrophobic_profile", "calculate_profile", {"prof_type": "uH"}),
	("calculate_moment", "calculate_moment", {}),
)


def _service1_value(make_descriptor, method, kwargs, rounded=True):
	"""Compute one descriptor value, or return "-" if anything goes wrong.

	A fresh descriptor object is built for every property (via
	``make_descriptor``), the named method is called on it and the first
	element of the resulting ``descriptor`` array is returned.
	"""
	try:
		desc = make_descriptor()
		getattr(desc, method)(**kwargs)
		value = desc.descriptor[0][0]
		# Round-trip through "%.4f" to keep four decimals, as a float.
		return float("%.4f" % value) if rounded else value
	except Exception:
		# Best effort: a property that cannot be computed is reported as "-".
		return "-"


def _service1_properties(sequence):
	"""Return the property dictionary (without "length") for one sequence string."""
	values = {}
	for key, method, kwargs, rounded in _SERVICE1_GLOBAL_PROPERTIES:
		values[key] = _service1_value(
			lambda: GlobalDescriptor(sequence), method, kwargs, rounded)
	for key, method, kwargs in _SERVICE1_PEPTIDE_PROPERTIES:
		values[key] = _service1_value(
			lambda: PeptideDescriptor(sequence, scalename='Eisenberg'), method, kwargs)
	return values


def exec(peptide, time_node):
	"""Compute physico-chemical properties for the FASTA text in ``peptide``.

	The text is written to ``../src/public/jobs/service1/service1.fasta`` and
	parsed back as FASTA. For every record a dictionary with its length and
	the descriptor values listed in ``_SERVICE1_GLOBAL_PROPERTIES`` and
	``_SERVICE1_PEPTIDE_PROPERTIES`` is built; a value that cannot be computed
	is reported as "-".

	When the input holds exactly one record, a profile plot and a helical
	wheel plot are additionally written to
	``../src/public/jobs/service1/<time_node>/``.

	NOTE: the name shadows the ``exec`` builtin; it is kept because callers
	use it.
	NOTE(review): every call writes the same fixed FASTA path, so concurrent
	calls would overwrite each other's input -- confirm calls are serialised.

	:param peptide: FASTA-formatted text
	:param time_node: name of the per-job output directory (used for plots)
	:return: ``"error"`` if no FASTA record could be read, otherwise a dict
		mapping record id -> property dictionary
	"""
	base_dir = "../src/public/jobs/service1/"
	fasta_path = base_dir + "service1.fasta"

	# "with" guarantees the handle is flushed and closed even if write() fails.
	with open(fasta_path, "w") as handle:
		handle.write(peptide)

	# Parse once instead of re-reading the file for emptiness check, count and data.
	records = list(SeqIO.parse(fasta_path, "fasta"))
	if not records:
		return "error"

	properties = {}
	for record in records:
		entry = {"length": len(record.seq)}
		entry.update(_service1_properties(str(record.seq)))
		properties[str(record.id)] = entry

	if len(records) == 1:
		sequence = str(records[0].seq)
		job_dir = base_dir + time_node
		try:
			os.mkdir(job_dir)
		except OSError:
			# e.g. the directory already exists; plotting is still attempted.
			print("Error")

		# generate plot profile
		plot_profile(sequence, scalename='eisenberg', filename=job_dir + "/profile.png")

		# generate helical wheel
		helical_wheel(sequence, colorcoding='charge', lineweights=False, filename=job_dir + "/helical.png")

	return properties
Ejemplo n.º 18
0
Archivo: protein.py Proyecto: jancr/ppv
    def _add_features_to_peptide_series(self,
                                        peptide,
                                        index,
                                        n_cluster=-1,
                                        lpvs=None):
        """Build the feature series for one peptide.

        A NaN-initialised ``pd.Series`` over ``index`` is created and filled
        with MS intensity, MS bool, MS frequency, MS count, chemical and
        annotation entries, each addressed as
        ``series[<feature type>, <feature name>]``.

        :param peptide: peptide range object; this method reads its ``start``
            and ``stop`` (each with ``.index`` and ``.pos``), ``slice``,
            ``seq`` and ``len()``
        :param index: index of the returned series
        :param n_cluster: key into ``self.cluster_coverage``; also stored as
            the "Cluster" annotation
        :param lpvs: optional collection used for the "LPV" annotation
            (``peptide in lpvs``); ``None`` is treated as an empty set
        :return: the filled ``pd.Series``
        """
        # primary intensity weights d = delta, pd = penalty delta
        # TODO only d_start and d_stop depend on imp_val, pd_start and pd_stop do not because
        # they are always between a d_start and d_stop, and should thus be above imp_val!
        # therefore we can write out d_start and d_stop as:
        #   [before_start, after_start], [before_stop, after_stop]
        # thus if we have
        #       raw data     = [0, 0, 5, 5, 7, 7, 5, 5, 0, 0]
        # then for the peptide        3--------------8
        #       before_start, after_start = [ 0, 5 ]
        # but for the peptide               5--6
        #       before_start, after_start = [ 5, 7 ]
        # by making a non-linear model we could formulate the w_start parameter as follows:
        # w_start * (after_start - max(before_start, imp_val))
        # which is consistent with how we currently do the grid search (imp_val=4):
        #       d_start = 5 - max(0, 4) = 1
        #       d_start = 7 - max(5, 4) = 2
        if lpvs is None:
            lpvs = set()
        i_start = peptide.start.index
        i_stop = peptide.stop.index

        # MS Delta
        series = pd.Series(np.full(len(index), np.nan), index=index)
        ms_int = self.ms_intensity_features.type
        series[ms_int, 'start'] = self.start_scores[i_start]
        series[ms_int, 'stop'] = self.stop_scores[i_stop]

        if 4 < len(peptide):
            # Penalise start/stop scores that fall strictly inside the peptide.
            penalty = SequenceRange(peptide.start + 1,
                                    peptide.stop - 1,
                                    validate=False)
            series[ms_int,
                   'penalty_start'] = self.start_scores[penalty.slice].sum()
            series[ms_int,
                   'penalty_stop'] = self.stop_scores[penalty.slice].sum()
        else:
            series[ms_int, 'penalty_start'] = series[ms_int,
                                                     'penalty_stop'] = 0

        # MS Bool
        b_obs, f_obs = self._calc_observed(peptide)
        series[self.ms_bool_features.type, "first"] = self.h_first[i_start]
        series[self.ms_bool_features.type, "last"] = self.h_last[i_stop]
        series[self.ms_bool_features.type, "observed"] = b_obs

        # MS Frequency
        # ptm weights
        # TODO: should it get extra penalties if there are PTM's between start and end?
        ms_freq = self.ms_frequency_features.type
        series[ms_freq, 'acetylation'] = self.ac_freq[i_start]
        series[ms_freq, 'amidation'] = self.am_freq[i_stop]

        series[ms_freq, 'start'] = self.h_start_freq[i_start]
        series[ms_freq, 'stop'] = self.h_stop_freq[i_stop]
        series[ms_freq, 'observed'] = f_obs
        series[ms_freq, 'sample'] = self.h_sample[peptide.slice].min()
        series[ms_freq, 'ladder'] = \
            self.h_ladder_start[i_start] * self.h_ladder_stop[i_stop]
        series[ms_freq, 'protein_coverage'] = self.protein_coverage
        series[ms_freq, 'cluster_coverage'] = self.cluster_coverage[n_cluster]

        # these are good features, but there may be better ways to extract them
        series[ms_freq,
               'bond'] = self.h_bond[self.get_bond_slice(peptide)].min()

        # MS Counts
        ms_count = self.ms_count_features.type
        series[ms_count, 'start'] = self.start_counts[peptide.start]
        series[ms_count, 'stop'] = self.stop_counts[peptide.stop]
        #  series[ms_count, 'ladder'] = \
        #      self.h_ladder_start[i_start] + self.h_ladder_stop[i_stop]

        ############################################################

        # Chemical
        sequence = self.protein_sequence[peptide.slice]
        peptide_features = GlobalDescriptor(sequence)

        # The peptide is treated as amidated when the amidation frequency at
        # its stop position exceeds 0.05.
        is_amidated = series[ms_freq, 'amidation'] > 0.05
        peptide_features.calculate_all(amide=is_amidated)

        chem = self.chemical_features.type
        for i, name in enumerate(peptide_features.featurenames):
            if name in self.chemical_features.features:
                series[chem, name] = peptide_features.descriptor[0, i]

        # The Eisenberg moment does not depend on the feature loop above, so it
        # is computed once here instead of once per feature name.
        eisenberg = PeptideDescriptor(sequence, 'eisenberg')
        eisenberg.calculate_moment()
        series[chem, 'eisenberg'] = eisenberg.descriptor.flatten()[0]

        # Annotations
        series[self.annotations.type, "Known"] = peptide in self.known_peptides
        #  series[self.annotations.type, "Type"] = peptide in self.known_peptides
        series[self.annotations.type, "Cluster"] = n_cluster
        series[self.annotations.type, "Sequence"] = peptide.seq
        # Placeholder; overwritten with the real value further down.
        series[self.annotations.type, "LPV"] = False  # TODO!

        series[self.annotations.type, "N Flanking"] = \
            self.get_nflanking_region(peptide.start, self.protein_sequence)
        series[self.annotations.type, "C Flanking"] = \
            self.get_cflanking_region(peptide.stop, self.protein_sequence)
        series[self.annotations.type, "LPV"] = peptide in lpvs
        if f_obs != 0:
            # Sum everything in self.df recorded for exactly this start/stop pair.
            _pep_index = (slice(None), slice(None), peptide.start.pos,
                          peptide.stop.pos)
            series[self.annotations.type,
                   "Intensity"] = self.df.loc[_pep_index, :].sum().sum()
        return series
Ejemplo n.º 19
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Script to calculate different peptide descriptors for a given sequences.fasta file and save them to two files.
"""

from modlamp.descriptors import PeptideDescriptor, GlobalDescriptor

# Load the same sequence file into both descriptor objects.
pepdesc = PeptideDescriptor('/path/to/sequences.fasta',
                            'Eisenberg')  # use Eisenberg consensus scale
globdesc = GlobalDescriptor('/path/to/sequences.fasta')

# --------------- Peptide Descriptor (AA scales) Calculations ---------------
# Every call after the first passes append=True so that its result is added to
# what pepdesc already holds instead of replacing it. The call order below
# therefore has to match the column order in `col_names1`.
pepdesc.calculate_global()  # calculate global Eisenberg hydrophobicity
pepdesc.calculate_moment(append=True)  # calculate Eisenberg hydrophobic moment

# load other AA scales
pepdesc.load_scale('gravy')  # load GRAVY scale
pepdesc.calculate_global(append=True)  # calculate global GRAVY hydrophobicity
pepdesc.calculate_moment(append=True)  # calculate GRAVY hydrophobic moment
pepdesc.load_scale('z3')  # load old Z scale
pepdesc.calculate_autocorr(
    1, append=True)  # calculate global Z scale (=window1 autocorrelation)

# save descriptor data to .csv file
# NOTE(review): the header assumes save_descriptor writes ID and Sequence
# columns first and that the z3 autocorrelation yields three values
# (Z3_1..Z3_3) -- confirm against the modlAMP documentation.
col_names1 = 'ID,Sequence,H_Eisenberg,uH_Eisenberg,H_GRAVY,uH_GRAVY,Z3_1,Z3_2,Z3_3'
pepdesc.save_descriptor('/path/to/descriptors1.csv', header=col_names1)

# --------------- Global Descriptor Calculations ---------------
globdesc.length()  # sequence length
Ejemplo n.º 20
0
def describe_sequences(path=None, out_file="peptides_array"):
    """Build one feature row per peptide and save the stacked matrix with ``np.save``.

    The input file is read as text and must evaluate to a mapping with a
    ``"Peptides"`` list. Each peptide is a dict; this function reads its
    ``"seq"``, ``"cTer"``, ``"activity"`` and (optionally) ``"id"`` entries and
    adds ``"train"``, ``"conjoint_seq"``, ``"TotalDescriptor"`` and ``"array"``.

    Row layout (in column order): numeric id, train flag, amino-acid-scale
    descriptors, global descriptors, length, C-terminal amidation flag,
    secondary structure fractions, AAindex values, CTD composition, conjoint
    triad, tetrapeptide presence flags, 1/2/3-gram relative frequencies,
    1/2-gram presence flags, 1/2-gram absolute counts, ``di2``/``di3``
    output, PAAC values and the activity label (1 if ``"activity" == "YES"``).

    Args:
        path: input file; defaults to the original hard-coded location.
        out_file: file name handed to ``np.save`` (defaults to the original
            ``"peptides_array"``).
    """
    if path is None:
        path = r"C:\Users\Patrick\OneDrive - University College Dublin\Bioinformatics\HemolyticStudies\BOTH_peptides.json"

    aa_letters = [
        'A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q',
        'R', 'S', 'T', 'V', 'W', 'Y'
    ]
    di_letters = [a + b for a in aa_letters for b in aa_letters]
    tri_letters = [
        a + b + c for a in aa_letters for b in aa_letters for c in aa_letters
    ]
    conjoint_letters = ["A", "I", "Y", "H", "R", "D", "C"]
    # Alphabet per sequence type: 1 = residues, 2 = dipeptides,
    # 3 = tripeptides, 4 = conjoint classes.
    letters = {
        1: aa_letters,
        2: di_letters,
        3: tri_letters,
        4: conjoint_letters
    }
    # An n-gram becomes a feature column only if at least this many peptides
    # contain it (applies to dipeptides and tripeptides).
    min_support = {2: 50, 3: 20}

    #Conjoint src = https://bmcbioinformatics.biomedcentral.com/articles/10.1186/s12859-015-0828-1

    conjoint_dict = {
        "A": "A",
        "G": "A",
        "V": "A",
        "I": "I",
        "L": "I",
        "F": "I",
        "P": "I",
        "Y": "Y",
        "M": "Y",
        "T": "Y",
        "S": "Y",
        "H": "H",
        "N": "H",
        "Q": "H",
        "W": "H",
        "R": "R",
        "K": "R",
        "D": "D",
        "E": "D",
        "C": "C",
    }

    def ngrams(string, size):
        """Return the overlapping substrings of length *size* of *string*."""
        return [string[a:a + size] for a in range(len(string) - size + 1)]

    def counter(string, seq_type):
        '''
        Relative n-gram frequencies of *string* (n = seq_type, for 1, 2, 3).

        Returns a dict n-gram -> count / number of n-gram positions. If the
        string is too short to contain any n-gram all frequencies are 0.0
        instead of dividing by zero. Other sequence types get all-zero counts.
        '''
        d = {i: 0 for i in letters[seq_type]}
        if seq_type in (1, 2, 3):
            for s in ngrams(string, seq_type):
                d[s] = d.get(s, 0) + 1.0
            positions = len(string) - seq_type + 1
            d = {
                k: (v / positions if positions > 0 else 0.0)
                for k, v in d.items()
            }
        return d

    def counter_boolean(string, seq_type):
        '''
        Presence flags: dict n-gram -> 1.0 if it occurs in *string*, else 0.

        Only sequence types 1 and 2 are evaluated.
        '''
        d = {i: 0 for i in letters[seq_type]}
        if seq_type in (1, 2):
            for s in ngrams(string, seq_type):
                d[s] = 1.0
        return d

    def counter_abs(string, seq_type):
        '''
        Absolute counts: dict n-gram -> number of occurrences in *string*.

        Only sequence types 1 and 2 are evaluated.
        '''
        d = {i: 0 for i in letters[seq_type]}
        if seq_type in (1, 2):
            for s in ngrams(string, seq_type):
                d[s] = d.get(s, 0) + 1.0
        return d

    def feature_row(counts, seq_type, support):
        '''
        Turn a counts dict into a (1, n) array ordered alphabetically by n-gram.

        n-grams outside the alphabet (ambiguous letters) are dropped; for
        dipeptides/tripeptides only n-grams whose *support* reaches the
        ``min_support`` threshold are kept.
        '''
        keys = letters[seq_type]
        if seq_type in min_support:
            threshold = min_support[seq_type]
            keys = [k for k in keys if support[k] >= threshold]
        return np.array([[counts[k] for k in sorted(keys)]])

    def residue_distribution(all_residues, seq_type, support):
        '''Relative frequencies of *all_residues* as a (1, n) array.'''
        return feature_row(counter(all_residues, seq_type), seq_type, support)

    def residue_boolean(all_residues, seq_type, support):
        '''Presence flags of *all_residues* as a (1, n) array.'''
        return feature_row(
            counter_boolean(all_residues, seq_type), seq_type, support)

    def residue_abs(all_residues, seq_type, support):
        '''Absolute counts of *all_residues* as a (1, n) array.'''
        return feature_row(
            counter_abs(all_residues, seq_type), seq_type, support)

    with open(path, "r") as f:
        text = f.read()

    # SECURITY(review): eval() executes whatever the file contains. This is
    # only acceptable for trusted, locally produced files; consider json.loads
    # or ast.literal_eval once the file format has been confirmed.
    peptides = eval(text)["Peptides"]

    train_peptides, _ = train_test_split(peptides,
                                         test_size=0.15,
                                         random_state=42)

    # A set makes the membership test below O(1) per peptide.
    train_seqs = {peptide["seq"] for peptide in train_peptides}
    for peptide in peptides:
        peptide["train"] = peptide["seq"] in train_seqs

    def print_split_sizes():
        n_train = sum(1 for p in peptides if p["train"])
        print(n_train)
        print(len(peptides) - n_train)

    print_split_sizes()

    # NOTE: augmenting the training set with reversed sequences is currently
    # disabled, so the list is only shuffled here.
    random.shuffle(peptides)

    print_split_sizes()
    print("doubling complete")

    # dp / tp: number of peptides that contain each dipeptide / tripeptide.
    dp = {i: 0 for i in letters[2]}
    tp = {i: 0 for i in letters[3]}
    for peptide in peptides:
        seq = peptide["seq"]
        for s in set(ngrams(seq, 2)):
            dp[s] = dp[s] + 1
        for s in set(ngrams(seq, 3)):
            tp[s] = tp[s] + 1

    for peptide in peptides:
        peptide["conjoint_seq"] = "".join(
            [conjoint_dict[letter] for letter in peptide["seq"]])

    # Calculation steps as (method name, modality); None = library default.
    GLOBAL = (("calculate_global", "mean"), ("calculate_global", "max"))
    MOMENT = (("calculate_moment", "max"), ("calculate_moment", "mean"))
    ARC = (("calculate_arc", "max"), ("calculate_arc", "mean"))

    # Scales loaded after the initial "eisenberg" scale, with their steps.
    # The order defines the descriptor column order and must not be changed.
    # NOTE(review): "msw" is listed twice, so its two columns appear twice in
    # the output; kept as-is because removing it would shift later columns.
    scale_plan = (
        ("Ez", GLOBAL),
        ("aasi", (("calculate_global", None), ) + MOMENT),
        ("abhprk", GLOBAL),
        ("charge_acid", GLOBAL + MOMENT),
        ("cougar", GLOBAL),
        ("gravy", GLOBAL + MOMENT),
        ("hopp-woods", GLOBAL + MOMENT),
        ("kytedoolittle", GLOBAL + MOMENT),
        ("ppcali", GLOBAL),
        ("msw", GLOBAL),
        ("charge_phys", MOMENT + GLOBAL),
        ("flexibility", MOMENT + GLOBAL),
        ("bulkiness", MOMENT + GLOBAL),
        ("TM_tend", MOMENT + GLOBAL),
        ("mss", MOMENT + GLOBAL),
        ("t_scale", GLOBAL),
        ("peparc", ARC),
        ("msw", GLOBAL),
        ("polarity", MOMENT + GLOBAL),
        ("pepcats", GLOBAL),
        ("isaeci", GLOBAL),
        ("refractivity", MOMENT + GLOBAL),
        ("z3", GLOBAL),
        ("z5", GLOBAL),
    )

    def run_steps(descriptor, steps):
        """Call each (method, modality) step on *descriptor* with append=True."""
        for method, modality in steps:
            kwargs = {"append": True}
            if modality is not None:
                kwargs["modality"] = modality
            getattr(descriptor, method)(**kwargs)

    ####################### AAindex #########################
    to_get = [
        ("CHAM810101", "mean"),  #Steric Hinderance
        ("CHAM810101", "total"),  #Steric Hinderance
        ("KYTJ820101", "mean"),  #Hydropathy
        ("KLEP840101", "total"),  #Charge
        ("KLEP840101", "mean"),  #Charge
        ("MITS020101", "mean"),  #Amphiphilicity
        ("FAUJ830101", "mean"),  #Hydrophobic parameter pi
        ("GOLD730102", "total"),  #Residue volume
        ("MEEJ800101", "mean"),  #Retention coefficient in HPLC
        ("OOBM850105",
         "mean"),  #Optimized side chain interaction parameter
        ("OOBM850105",
         "total"),  #Optimized side chain interaction parameter
        ("VELV850101", "total"),  #Electron-ion interaction parameter
        ("VELV850101", "mean"),  #Electron-ion interaction parameter
        ("PUNT030102",
         "mean"),  #Knowledge-based membrane-propensity scale from 3D_Helix
        ("BHAR880101", "mean"),  #Average flexibility indeces
        ("KRIW790102", "mean"),  #Fraction of site occupied by water
        ("PLIV810101", "mean"),  #Partition coefficient
        ("ZIMJ680102", "mean"),  #Bulkiness
        ("ZIMJ680102", "total"),  #Bulkiness
        ("ZHOH040101", "mean"),  #Stability scale
        ("CHAM820102", "total"),  #Free energy solubility in water
        #From HemoPi: src = https://github.com/riteshcanfly/Hemopi/blob/master/pcCalculator.java
        ("HOPT810101", "mean"),  #Hydrophilicity
        ("EISD840101", "mean"),  #Hydrophobicity
        ("FAUJ880109", "total"),  #Net Hydrogen
        ("EISD860101", "mean"),  #Solvation
    ]

    tetra_peptides = [
        "KLLL",  # src = https://github.com/riteshcanfly/Hemopi/blob/master/tetrapos.txt
        "GCSC",
        "AAAK",
        "KLLS",
        "LGKL",
        "VLKA",
        "LLGK",
        "LVGA",
        "LSDF",
        "SDFK",
        "SWLR",
        "WLRD",
    ]

    for peptide in peptides:
        seq = peptide["seq"]
        amidated = peptide["cTer"] == "Amidation"

        globdesc = GlobalDescriptor(seq)
        globdesc.calculate_all(amide=amidated)

        ctdc = CTD.CalculateC(seq)
        ctdc_vals = np.array([[ctdc[key] for key in sorted(ctdc)]])

        conjointtriad = ConjointTriad.CalculateConjointTriad(seq)
        conjointtriad_vals = np.array(
            [[conjointtriad[key] for key in sorted(conjointtriad)]])

        #Eisenberg hydrophobicity consensus
        #Take most of the values from here
        pepdesc = PeptideDescriptor(seq, "eisenberg")
        run_steps(pepdesc, GLOBAL + MOMENT)
        for scale, steps in scale_plan:
            pepdesc.load_scale(scale)
            run_steps(pepdesc, steps)

        protein = PyPro()
        protein.ReadProteinSequence(seq)
        paac_by_name = protein.GetPAAC(lamda=1, weight=0.05)
        # Order the values numerically by the suffix of their "PAAC<n>" key.
        paac_names = sorted(paac_by_name,
                            key=lambda name: int(name.replace("PAAC", "")))
        paac = np.array([[paac_by_name[name] for name in paac_names]])

        cTer = np.array([[1 if amidated else 0]])

        analysed_seq = ProteinAnalysis(seq)
        secondary_structure_fraction = np.array(
            [analysed_seq.secondary_structure_fraction()])

        peptide["TotalDescriptor"] = str(
            np.concatenate((pepdesc.descriptor, globdesc.descriptor), axis=1))

        try:
            pepid = np.array([[
                int(peptide["id"].replace("HEMOLYTIK", "").replace(
                    "DRAMP", "").replace("DBAASP", ""))
            ]])
        except KeyError:
            # Must be a (1, 1) array: a bare scalar cannot be concatenated
            # along axis=1 below.
            pepid = np.array([[0]])

        pep_train = np.array([[1 if peptide["train"] else 0]])

        freq_1d = residue_distribution(seq, 1, None)
        freq_2d = residue_distribution(seq, 2, dp)
        freq_3d = residue_distribution(seq, 3, tp)
        freq_1dbool = residue_boolean(seq, 1, None)
        freq_2dbool = residue_boolean(seq, 2, dp)
        freq_1dabs = residue_abs(seq, 1, None)
        freq_2dabs = residue_abs(seq, 2, dp)

        len_peptide = np.array([[len(seq)]])

        pepact = np.array([[1 if peptide["activity"] == "YES" else 0]])

        peptide_di2 = di2(seq)
        peptide_di3 = di3(peptide["conjoint_seq"])

        # 1 if the tetrapeptide occurs anywhere in the sequence, else 0.
        tp_bin = np.array([[1 if t_p in seq else 0 for t_p in tetra_peptides]])

        aminoacidindeces = np.array(
            [[aaf(seq, identifier, mode) for identifier, mode in to_get]])

        peptide["array"] = np.concatenate(
            (
                pepid,
                pep_train,
                pepdesc.descriptor,
                globdesc.descriptor,
                len_peptide,
                cTer,
                secondary_structure_fraction,
                aminoacidindeces,
                ctdc_vals,
                conjointtriad_vals,
                tp_bin,
                freq_1d,
                freq_2d,
                freq_3d,
                freq_1dbool,
                freq_2dbool,
                freq_1dabs,
                freq_2dabs,
                peptide_di2,
                peptide_di3,  #Conjoint Alphabet
                paac,
                pepact,
            ),
            axis=1)

    x = np.concatenate([peptide["array"] for peptide in peptides], axis=0)

    np.save(out_file, x, allow_pickle=False)
Ejemplo n.º 21
0
from modlamp.descriptors import GlobalDescriptor

# Parallel per-row columns parsed from the CSV, plus the resulting
# mapping of sequence -> MIC converted to uM.
sequences = []
MIC = []
units = []
actives = {}

# read the file with 3 columns containing MIC values
with open('Saureus.csv', 'r') as f:
    for line in f:
        if not line.strip():
            # A blank (e.g. trailing) line has no columns to index into.
            continue
        fields = line.split(',')
        sequences.append(fields[0])
        MIC.append(fields[1])
        units.append(fields[2])

D = GlobalDescriptor(sequences)
D.calculate_MW()
MW = D.descriptor.tolist()

for seq, raw_mic, unit, mw_row in zip(sequences, MIC, units, MW):
    # The unit is the last column, so it still carries the line terminator.
    # Compare the stripped value: in text mode Python 3 translates '\r\n' to
    # '\n', so a literal comparison against 'ug/ml\r\n' would never match
    # (and the final line of a file may have no terminator at all).
    if unit.strip() != 'ug/ml':  # only keep MIC values given in ug/mL
        continue

    if '+' in raw_mic:
        # "value+stdev": take the upper bound (value + stdev)
        parts = raw_mic.split('+')
        mic = float(parts[0]) + float(parts[1])
    elif '-' in raw_mic:
        # "low-high": be conservative and take the upper bound
        mic = float(raw_mic.split('-')[1])
    else:
        mic = float(raw_mic)

    # convert ug/mL to uM using the molecular weight of the sequence
    actives[seq] = round((mic / float(mw_row[0])) * 1000., 1)

# sequences whose converted MIC exceeds 100 uM
s_inactive = [s for s, v in actives.items() if v > 100.0]
# For every property two containers are kept: a mapping keyed by group
# holding that group's values, and a flat list of [group, value] rows.
charges, charges_long = coll.defaultdict(list), []
charge_densities, charge_densities_long = coll.defaultdict(list), []
polarities, polarities_long = coll.defaultdict(list), []
gravy, gravy_long = coll.defaultdict(list), []

for gp in peptides:
    group_seqs = peptides[gp]

    # values returned by get_peptide_values for the 'eisenberg' key
    eisenbergs[gp] = get_peptide_values(group_seqs, 'eisenberg')
    eisenbergs_long.extend([gp, val] for val in eisenbergs[gp])

    # charge at pH 7.4 (amide=True); first column of every descriptor row
    properties = GlobalDescriptor(group_seqs)
    properties.calculate_charge(ph=7.4, amide=True)
    charges[gp] = [row[0] for row in properties.descriptor]
    charges_long.extend([gp, val] for val in charges[gp])

    # charge density at pH 7.4 (amide=True), computed on a fresh descriptor
    properties = GlobalDescriptor(group_seqs)
    properties.charge_density(ph=7.4, amide=True)
    charge_densities[gp] = [row[0] for row in properties.descriptor]
    charge_densities_long.extend([gp, val] for val in charge_densities[gp])

    # values returned by get_peptide_values for the 'polarity' key
    polarities[gp] = get_peptide_values(group_seqs, 'polarity')
    polarities_long.extend([gp, val] for val in polarities[gp])
    #
Ejemplo n.º 23
0
# Accumulators for per-motif descriptor values.
ch_den = []
ip = []
ii = []
bi = []
hr = []
ar = []
al = []

# Record each motif (first element of every arr_motifs entry) and its length.
for entry in arr_motifs:
    seq = entry[0]
    print(seq)
    motif.append(seq)
    arr_len.append(len(seq))


# charge: descriptor at pH 7.4 (amide=True) divided by the paired arr_len value
for entry, length in zip(arr_motifs, arr_len):
    desc = GlobalDescriptor(entry[0])
    desc.calculate_charge(ph=7.4, amide=True)
    ch.append(desc.descriptor / length)


# hydrophobic ratio: descriptor divided by the paired arr_len value
for entry, length in zip(arr_motifs, arr_len):
    desc = GlobalDescriptor(entry[0])
    desc.hydrophobic_ratio()
    hr.append(desc.descriptor / length)

# aromaticity: descriptor divided by the paired arr_len value
for entry, length in zip(arr_motifs, arr_len):
    desc = GlobalDescriptor(entry[0])
    desc.aromaticity()
    ar.append(desc.descriptor / length)
import pandas as pd
import sys
from modlamp.descriptors import PeptideDescriptor, GlobalDescriptor

# Command line: argv[1] is the input CSV (must contain a 'Sequence' column),
# argv[2] is the output path.
database = pd.read_csv(sys.argv[1])
path_output = sys.argv[2]

print("Estimate formula")

# get formula for each sequence
formula_array = []

# Iterate over the column values rather than positional labels so the loop
# does not depend on the DataFrame having a default 0..n-1 index.
for sequence in database['Sequence']:
    try:
        desc = GlobalDescriptor([sequence])
        desc.formula(amide=True)
        formula = desc.descriptor[0][0]
    except Exception:
        # Best effort: a sequence that cannot be processed gets an empty
        # formula. Catching Exception (not a bare except) keeps
        # KeyboardInterrupt / SystemExit working.
        formula = ''
    # Exactly one entry per row, so the new column always matches in length.
    formula_array.append(formula)

database['formula'] = formula_array

print("Estimate molecular_weigth")
#get MW for each sequence
molecular_weigth_array = []

for i in range(len(database)):
    try:
        desc = GlobalDescriptor([database['Sequence'][i]])
Ejemplo n.º 25
0
            len_list = []
            aacomp_diclist = []
            pi_list = []
            hyd_list = []
            ctd_diclist = []
            header_list = []
            # row_list.append("\n")
            row_list.append("\n" + run_dir + file_name)

            for line in f:
                seq = line[:-1]
                DesObject = PyPro.GetProDes(seq)
                if do_length:
                    len_list.append(len(seq))
                if do_pi:
                    glob_seq = GlobalDescriptor(seq)
                    glob_seq.isoelectric_point()
                    pi_list.append(glob_seq.descriptor[0][0])
                if do_hyd:
                    glob_seq = GlobalDescriptor(seq)
                    glob_seq.hydrophobic_ratio()
                    hyd_list.append(glob_seq.descriptor[0][0])
                if do_aacomp:
                    aacomp_diclist.append(DesObject.GetAAComp())
                if do_ctd:
                    # calculate 147 CTD descriptors
                    # Default: False
                    ctd_diclist.append(DesObject.GetCTD())

        if file_name == real_file:
            run_dir = real_file + "/"
Ejemplo n.º 26
0
"""Some more features other than amino acid composition of each amino acid  in sequence"""

# Global descriptors to export. Each name is looked up in
# GlobalDescriptor.featurenames after calculate_all(), so the header and the
# value written below it always refer to the same descriptor.
# NOTE(review): per the modlAMP documentation calculate_all() yields
# Length, MW, Charge, ChargeDensity, pI, InstabilityInd, Aromaticity,
# AliphaticInd, BomanInd, HydRatio -- reading columns 1..8 positionally would
# therefore put Charge under 'ChargeDensity' and never export HydRatio.
# Confirm against the installed modlAMP version.
newFeatures = [
    'MW', 'ChargeDensity', 'pI', 'InstabilityInd', 'Aromaticity',
    'AliphaticInd', 'BomanInd', 'HydRatio'
]

# First worksheet column used for the new features (placed after the
# existing columns and the amino-acid composition columns).
first_feature_col = cols + len(aminoAcid) + 1

# writing feature names in excel sheet (header row)
for offset, feature_name in enumerate(newFeatures):
    writingSheet.cell(
        row=1, column=first_feature_col + offset).value = feature_name

# filling feature values in excel sheet, one worksheet row per peptide
for i in range(2, rows + 1):
    pepSequencee = readingSheet.cell(row=i, column=cols).value
    desc = GlobalDescriptor(pepSequencee)
    desc.calculate_all(amide=True)
    # Map descriptor name -> value for this single sequence; a name missing
    # from featurenames raises KeyError instead of silently shifting columns.
    values_by_name = dict(zip(desc.featurenames, desc.descriptor[0]))
    for offset, feature_name in enumerate(newFeatures):
        writingSheet.cell(
            row=i, column=first_feature_col + offset).value = float(
                values_by_name[feature_name])

writingBook.save(str(outputFile))  # saving all data to output file

##################################################################TESTING DATA####################################################

# Load test.csv and write the same table to test.xlsx (header row kept,
# index not written). The variable keeps its original name even though the
# file read here is test.csv.
trainingData = pd.read_csv(r"test.csv")
trainingData.to_excel(r"test.xlsx", header=True, index=None)