Example #1
0
 def calc_len(self):
     """Method to get the sequence length of all sequences in the library.
     
     :return: {numpy.ndarray} sequence lengths in the attribute :py:attr:`len`.
     """
     for l in range(self.library.shape[0]):
         d = GlobalDescriptor(self.library[l])
         d.length()
         self.len.append(d.descriptor[:, 0])
Example #2
0
    def analyze_training(self):
        """ Method to analyze the distribution of the training data

        :return: prints out information about the length distribution of the sequences in ``self.sequences``
        """
        d = GlobalDescriptor(self.sequences)
        d.length()
        print("\nLENGTH DISTRIBUTION OF TRAINING DATA:\n")
        print("Number of sequences:    \t%i" % len(self.sequences))
        print("Mean sequence length:   \t%.1f ± %.1f" % (np.mean(d.descriptor), np.std(d.descriptor)))
        print("Median sequence length: \t%i" % np.median(d.descriptor))
        print("Minimal sequence length:\t%i" % np.min(d.descriptor))
        print("Maximal sequence length:\t%i" % np.max(d.descriptor))
Example #3
0
def makeintlistdic_from_allep(dir_name, run_dir):
    i = 1
    intlistdic = {}
    len_ave_list, pi_ave_list, hyd_ave_list, len_var_list, pi_var_list, hyd_var_list = [
    ], [], [], [], [], []

    while True:
        if os.path.exists(dir_name + run_dir + str(i) + '.txt'):
            len_list_ep, pi_list_ep, hyd_list_ep = [], [], []
            seq_size = 0

            with open(dir_name + run_dir + str(i) + '.txt') as f:
                for line in f:
                    seq = line[:-1]
                    seq = GlobalDescriptor(seq)
                    seq.length()
                    len_list_ep.append(seq.descriptor[0][0])
                    seq.isoelectric_point()
                    pi_list_ep.append(seq.descriptor[0][0])
                    seq.hydrophobic_ratio()
                    hyd_list_ep.append(seq.descriptor[0][0])
                    seq_size += 1

                len_ave_list.append(round(len(len_list_ep) / seq_size, 3))
                pi_ave_list.append(round(len(pi_list_ep) / seq_size, 3))
                hyd_ave_list.append(round(len(hyd_list_ep) / seq_size, 3))
                len_var_list.append(round(statistics.pvariance(len_list_ep),
                                          3))
                pi_var_list.append(round(statistics.pvariance(pi_list_ep), 3))
                hyd_var_list.append(round(statistics.pvariance(hyd_list_ep),
                                          3))

                i += 1
        else:
            break

    intlistdic["len_ave"] = len_ave_list
    intlistdic["pi_ave"] = pi_ave_list
    intlistdic["hyd_ave"] = hyd_ave_list
    intlistdic["len_var"] = len_var_list
    intlistdic["pi_var"] = pi_var_list
    intlistdic["hyd_var"] = hyd_var_list
    # print(intlistdic, len(len_ave_list))

    return intlistdic
Example #4
0
def predict():

    if request.method == 'POST':

        seq = request.form['seq']
        with open("random.fasta", "w") as fp:
            fp.write(seq)

        pepdesc = PeptideDescriptor(
            '/home/sanika/proj/random.fasta',
            'eisenberg')  # use Eisenberg consensus scale
        globdesc = GlobalDescriptor('/home/sanika/proj/random.fasta')

        # --------------- Peptide Descriptor (AA scales) Calculations ---------------
        pepdesc.calculate_global()  # calculate global Eisenberg hydrophobicity
        pepdesc.calculate_moment(
            append=True)  # calculate Eisenberg hydrophobic moment

        # load other AA scales
        pepdesc.load_scale('gravy')  # load GRAVY scale
        pepdesc.calculate_global(
            append=True)  # calculate global GRAVY hydrophobicity
        pepdesc.calculate_moment(
            append=True)  # calculate GRAVY hydrophobic moment
        pepdesc.load_scale('z3')  # load old Z scale
        pepdesc.calculate_autocorr(
            1,
            append=True)  # calculate global Z scale (=window1 autocorrelation)

        # --------------- Global Descriptor Calculations ---------------
        globdesc.length()  # sequence length
        globdesc.boman_index(append=True)  # Boman index
        globdesc.aromaticity(append=True)  # global aromaticity
        globdesc.aliphatic_index(append=True)  # aliphatic index
        globdesc.instability_index(append=True)  # instability index
        globdesc.calculate_charge(ph=7.4, amide=False,
                                  append=True)  # net charge
        globdesc.calculate_MW(amide=False, append=True)  # molecular weight

        f1 = pepdesc.descriptor
        f2 = globdesc.descriptor
        result = np.concatenate((f2, f1), axis=1)

        clf = joblib.load('ml_model.pkl')
        pred = clf.predict(result)
        proba = clf.predict_proba(result).tocoo()
        mc = pred.tocoo()
        out = mc.col
        res = []
        labels = ['antiviral', 'antibacterial', 'antifungal']
        values = proba.data
        plt.pie(values,
                labels=labels,
                autopct='%.0f%%',
                shadow=True,
                radius=0.5)
        plt.savefig('/home/sanika/proj/pie_chart.jpg')

        figfile = BytesIO()
        plt.savefig(figfile, format='png')
        figfile.seek(0)
        figdata_png = base64.b64encode(figfile.getvalue()).decode('ascii')
        plt.close()

        for i in range(len(out)):
            if out[i] == 0:
                res.append("antiviral")
            elif out[i] == 1:
                res.append("antibacterial")
            else:
                res.append("antifungal")

        return render_template('seq.html', seq=res, result=figdata_png)

    return render_template('predictor.html')
Example #5
0
def upload():

    if request.method == 'POST':
        # This will be executed on POST request.
        upfile = request.files['file']
        if upfile and allowed_file(upfile.filename):

            filename = secure_filename(upfile.filename)
            upfile.save(os.path.join(app.config['UPLOAD_FOLDER'], filename))
            #return render_template('upload.html')
            #flash("File uploaded", "success")
            #with open("/home/sanika/proj/uploads/aa.fasta") as f:
            #lines = f.readlines()
            #lines = [l for l in lines if "ROW" in l]

            #with open("/home/sanika/proj/uploads/out.fasta", "w") as f1:
            #f1.writelines(lines)

            #f = open(filename)
            #prot_seq = ReadFasta(f)

            with open(filename) as fasta_file:  # Will close handle cleanly
                identifiers = []
                sequence = []
                for seq_record in SeqIO.parse(fasta_file,
                                              'fasta'):  # (generator)
                    identifiers.append(seq_record.id)
                    sequence.append(seq_record.seq)

            pepdesc = PeptideDescriptor(
                filename, 'eisenberg')  # use Eisenberg consensus scale
            globdesc = GlobalDescriptor(filename)

            # --------------- Peptide Descriptor (AA scales) Calculations ---------------
            pepdesc.calculate_global(
            )  # calculate global Eisenberg hydrophobicity
            pepdesc.calculate_moment(
                append=True)  # calculate Eisenberg hydrophobic moment

            # load other AA scales
            pepdesc.load_scale('gravy')  # load GRAVY scale
            pepdesc.calculate_global(
                append=True)  # calculate global GRAVY hydrophobicity
            pepdesc.calculate_moment(
                append=True)  # calculate GRAVY hydrophobic moment
            pepdesc.load_scale('z3')  # load old Z scale
            pepdesc.calculate_autocorr(
                1, append=True
            )  # calculate global Z scale (=window1 autocorrelation)

            # --------------- Global Descriptor Calculations ---------------
            globdesc.length()  # sequence length
            globdesc.boman_index(append=True)  # Boman index
            globdesc.aromaticity(append=True)  # global aromaticity
            globdesc.aliphatic_index(append=True)  # aliphatic index
            globdesc.instability_index(append=True)  # instability index
            globdesc.calculate_charge(ph=7.4, amide=False,
                                      append=True)  # net charge
            globdesc.calculate_MW(amide=False, append=True)  # molecular weight

            f1 = pepdesc.descriptor
            f2 = globdesc.descriptor
            result = np.concatenate((f2, f1), axis=1)
            rs = []
            for i in range(len(result)):
                prt = np.reshape(result[i], (-1, 14))
                clf = joblib.load('ml_model.pkl')
                pred = clf.predict(prt)
                out = pred.toarray()
                #print(clf.predict_proba(result))
                proba = clf.predict_proba(prt).tocoo()
                mc = pred.tocoo()
                out = mc.col
                res = []
                for i in range(len(out)):
                    if out[i] == 0:
                        res.append("antiviral")
                    elif out[i] == 1:
                        res.append("antibacterial")
                    else:
                        res.append("antifungal")
                rs.append(res)
            a = []
            for i in range(len(rs)):
                a.append('-'.join(rs[i]))

            df = pd.DataFrame(data={
                "id": identifiers,
                "sequence": sequence,
                "activity": a
            },
                              columns=['id', 'sequence', 'activity'])
            df.to_csv("result.csv", sep=',', index=False)

            os.remove(os.path.join(app.config['UPLOAD_FOLDER'], filename))

            #return render_template('seq.html', seq = rs)
            return render_template('up.html', mimetype="text/csv")

            #flash("File uploaded: Thanks!", "success")
        else:
            error = "PLEASE CHECK THE FORMAT OF FILE TO UPLOAD"
            return render_template('upload.html', error=error)

    # This will be executed on GET request.
    return render_template('predictor.html')
globdesc = GlobalDescriptor('/path/to/sequences.fasta')

# --------------- Peptide Descriptor (AA scales) Calculations ---------------
pepdesc.calculate_global()  # calculate global Eisenberg hydrophobicity
pepdesc.calculate_moment(append=True)  # calculate Eisenberg hydrophobic moment

# load other AA scales
pepdesc.load_scale('gravy')  # load GRAVY scale
pepdesc.calculate_global(append=True)  # calculate global GRAVY hydrophobicity
pepdesc.calculate_moment(append=True)  # calculate GRAVY hydrophobic moment
pepdesc.load_scale('z3')  # load old Z scale
pepdesc.calculate_autocorr(
    1, append=True)  # calculate global Z scale (=window1 autocorrelation)

# save descriptor data to .csv file
col_names1 = 'ID,Sequence,H_Eisenberg,uH_Eisenberg,H_GRAVY,uH_GRAVY,Z3_1,Z3_2,Z3_3'
pepdesc.save_descriptor('/path/to/descriptors1.csv', header=col_names1)

# --------------- Global Descriptor Calculations ---------------
globdesc.length()  # sequence length
globdesc.boman_index(append=True)  # Boman index
globdesc.aromaticity(append=True)  # global aromaticity
globdesc.aliphatic_index(append=True)  # aliphatic index
globdesc.instability_index(append=True)  # instability index
globdesc.calculate_charge(ph=7.4, amide=False, append=True)  # net charge
globdesc.calculate_MW(amide=False, append=True)  # molecular weight

# save descriptor data to .csv file
col_names2 = 'ID,Sequence,Length,BomanIndex,Aromaticity,AliphaticIndex,InstabilityIndex,Charge,MW'
globdesc.save_descriptor('/path/to/descriptors2.csv', header=col_names2)
Example #7
0
    def analyze_generated(self, num, fname='analysis.txt', plot=False):
        """ Method to analyze the generated sequences located in `self.generated`.

        :param num: {int} wanted number of sequences to sample
        :param fname: {str} filename to save analysis info to
        :param plot: {bool} whether to plot an overview of descriptors
        :return: file with analysis info (distances)
        """
        with open(fname, 'w') as f:
            print("Analyzing...")
            f.write("ANALYSIS OF SAMPLED SEQUENCES\n==============================\n\n")
            f.write("Nr. of duplicates in generated sequences: %i\n" % (len(self.generated) - len(set(self.generated))))
            count = len(set(self.generated) & set(self.sequences))  # get shared entries in both lists
            f.write("%.1f percent of generated sequences are present in the training data.\n" %
                    ((count / len(self.generated)) * 100))
            d = GlobalDescriptor(self.generated)
            len1 = len(d.sequences)
            d.filter_aa('B')
            len2 = len(d.sequences)
            d.length()
            f.write("\n\nLENGTH DISTRIBUTION OF GENERATED DATA:\n\n")
            f.write("Number of sequences too short:\t%i\n" % (num - len1))
            f.write("Number of invalid (with 'B'):\t%i\n" % (len1 - len2))
            f.write("Number of valid unique seqs:\t%i\n" % len2)
            f.write("Mean sequence length:     \t\t%.1f ± %.1f\n" % (np.mean(d.descriptor), np.std(d.descriptor)))
            f.write("Median sequence length:   \t\t%i\n" % np.median(d.descriptor))
            f.write("Minimal sequence length:  \t\t%i\n" % np.min(d.descriptor))
            f.write("Maximal sequence length:  \t\t%i\n" % np.max(d.descriptor))
            
            descriptor = 'pepcats'
            seq_desc = PeptideDescriptor([s[1:].rstrip() for s in self.sequences], descriptor)
            seq_desc.calculate_autocorr(7)
            gen_desc = PeptideDescriptor(d.sequences, descriptor)
            gen_desc.calculate_autocorr(7)
            
            # random comparison set
            self.ran = Random(len(self.generated), np.min(d.descriptor), np.max(d.descriptor))  # generate rand seqs
            probas = count_aas(''.join(seq_desc.sequences)).values()  # get the aa distribution of training seqs
            self.ran.generate_sequences(proba=probas)
            ran_desc = PeptideDescriptor(self.ran.sequences, descriptor)
            ran_desc.calculate_autocorr(7)
            
            # amphipathic helices comparison set
            self.hel = Helices(len(self.generated), np.min(d.descriptor), np.max(d.descriptor))
            self.hel.generate_sequences()
            hel_desc = PeptideDescriptor(self.hel.sequences, descriptor)
            hel_desc.calculate_autocorr(7)
            
            # distance calculation
            f.write("\n\nDISTANCE CALCULATION IN '%s' DESCRIPTOR SPACE\n\n" % descriptor.upper())
            desc_dist = distance.cdist(gen_desc.descriptor, seq_desc.descriptor, metric='euclidean')
            f.write("Average euclidean distance of sampled to training data:\t%.3f +/- %.3f\n" %
                    (np.mean(desc_dist), np.std(desc_dist)))
            ran_dist = distance.cdist(ran_desc.descriptor, seq_desc.descriptor, metric='euclidean')
            f.write("Average euclidean distance if randomly sampled seqs:\t%.3f +/- %.3f\n" %
                    (np.mean(ran_dist), np.std(ran_dist)))
            hel_dist = distance.cdist(hel_desc.descriptor, seq_desc.descriptor, metric='euclidean')
            f.write("Average euclidean distance if amphipathic helical seqs:\t%.3f +/- %.3f\n" %
                    (np.mean(hel_dist), np.std(hel_dist)))
            
            # more simple descriptors
            g_seq = GlobalDescriptor(seq_desc.sequences)
            g_gen = GlobalDescriptor(gen_desc.sequences)
            g_ran = GlobalDescriptor(ran_desc.sequences)
            g_hel = GlobalDescriptor(hel_desc.sequences)
            g_seq.calculate_all()
            g_gen.calculate_all()
            g_ran.calculate_all()
            g_hel.calculate_all()
            sclr = StandardScaler()
            sclr.fit(g_seq.descriptor)
            f.write("\n\nDISTANCE CALCULATION FOR SCALED GLOBAL DESCRIPTORS\n\n")
            desc_dist = distance.cdist(sclr.transform(g_gen.descriptor), sclr.transform(g_seq.descriptor),
                                       metric='euclidean')
            f.write("Average euclidean distance of sampled to training data:\t%.2f +/- %.2f\n" %
                    (np.mean(desc_dist), np.std(desc_dist)))
            ran_dist = distance.cdist(sclr.transform(g_ran.descriptor), sclr.transform(g_seq.descriptor),
                                      metric='euclidean')
            f.write("Average euclidean distance if randomly sampled seqs:\t%.2f +/- %.2f\n" %
                    (np.mean(ran_dist), np.std(ran_dist)))
            hel_dist = distance.cdist(sclr.transform(g_hel.descriptor), sclr.transform(g_seq.descriptor),
                                      metric='euclidean')
            f.write("Average euclidean distance if amphipathic helical seqs:\t%.2f +/- %.2f\n" %
                    (np.mean(hel_dist), np.std(hel_dist)))
            
            # hydrophobic moments
            uh_seq = PeptideDescriptor(seq_desc.sequences, 'eisenberg')
            uh_seq.calculate_moment()
            uh_gen = PeptideDescriptor(gen_desc.sequences, 'eisenberg')
            uh_gen.calculate_moment()
            uh_ran = PeptideDescriptor(ran_desc.sequences, 'eisenberg')
            uh_ran.calculate_moment()
            uh_hel = PeptideDescriptor(hel_desc.sequences, 'eisenberg')
            uh_hel.calculate_moment()
            f.write("\n\nHYDROPHOBIC MOMENTS\n\n")
            f.write("Hydrophobic moment of training seqs:\t%.3f +/- %.3f\n" %
                    (np.mean(uh_seq.descriptor), np.std(uh_seq.descriptor)))
            f.write("Hydrophobic moment of sampled seqs:\t\t%.3f +/- %.3f\n" %
                    (np.mean(uh_gen.descriptor), np.std(uh_gen.descriptor)))
            f.write("Hydrophobic moment of random seqs:\t\t%.3f +/- %.3f\n" %
                    (np.mean(uh_ran.descriptor), np.std(uh_ran.descriptor)))
            f.write("Hydrophobic moment of amphipathic seqs:\t%.3f +/- %.3f\n" %
                    (np.mean(uh_hel.descriptor), np.std(uh_hel.descriptor)))
        
        if plot:
            if self.refs:
                a = GlobalAnalysis([uh_seq.sequences, uh_gen.sequences, uh_hel.sequences, uh_ran.sequences],
                                   ['training', 'sampled', 'hel', 'ran'])
            else:
                a = GlobalAnalysis([uh_seq.sequences, uh_gen.sequences], ['training', 'sampled'])
            a.plot_summary(filename=fname[:-4] + '.png')