Beispiel #1
0
    def train(self, data, verbose=True):
        """ Train all models and return the best one.

        Models are evaluated and ranked according to their ROC-AUC on a validation data set.
        The best model seen so far is kept in a temporary file; that file (and its
        ".h5" companion) is removed again before this method returns or raises.

        Parameters
        ----------
        data: pysster.Data
            A Data object providing training and validation data sets.
        
        verbose: bool
            If True, progress information (train/val loss) will be printed throughout the training.

        Returns
        -------
        results: tuple(pysster.Model, str)
            The best performing model and an overview table of all models are returned.

        Raises
        ------
        RuntimeError
            If no model could be selected, i.e. there are no candidates or no
            candidate reached a score greater than -1 (e.g. the score was NaN).
        """
        best_model_path = "{}/{}".format(
            gettempdir(),
            ''.join(random.choice(string.ascii_uppercase) for _ in range(20)))
        aucs = []
        max_auroc = -1
        # becomes True once save_model succeeded, i.e. once temp files exist
        model_saved = False
        try:
            for i, candidate in enumerate(self.candidates):
                model = Model(candidate, data)
                model.train(data, verbose)
                predictions = model.predict(data, "val")
                labels = data.get_labels("val")
                report = utils.performance_report(labels, predictions)
                # average of the report rows, weighted by the report's last column
                # (presumably the per-class support -- confirm in utils)
                roc_auc = np.sum(report[:, 0:-1] * report[:, -1, np.newaxis],
                                 axis=0)
                # column 3 of the averaged report is used as the ROC-AUC
                roc_auc = (roc_auc / np.sum(report[:, -1]))[3]
                aucs.append(roc_auc)
                if roc_auc > max_auroc:
                    max_auroc = roc_auc
                    utils.save_model(model, best_model_path)
                    model_saved = True
                K.clear_session()
                K.reset_uids()
                if verbose:
                    print("\n=== Summary ===")
                    print("Model {}/{} = {:.5f} weighted avg roc-auc".format(
                        i + 1, len(self.candidates), roc_auc))
                    for param in candidate:
                        if param not in ["input_shape"]:
                            print(" - {}: {}".format(param, candidate[param]))
            if not model_saved:
                # fail with a clear message instead of a confusing error from
                # load_model about a file that was never written
                raise RuntimeError(
                    "No model could be selected: there are no candidates or "
                    "no candidate reached a valid score.")
            # load the best model
            model = utils.load_model(best_model_path)
        finally:
            # remove the temporary files even if training or loading failed
            if model_saved:
                remove(best_model_path)
                remove("{}.h5".format(best_model_path))
        # save a formatted summary of all trained models
        table = self._grid_search_table(aucs)
        return model, table
Beispiel #2
0
 def test_utils_save_load_model(self):
     """Check that a model survives a save_model/load_model round trip.

     The model is written to the temp directory, loaded again, and its
     params, Keras config and weights are compared with the original.
     The files are removed afterwards, even if an assertion fails.
     """
     model_path = gettempdir() + "/model"
     weights_path = gettempdir() + "/model.h5"
     try:
         utils.save_model(self.m1, model_path)
         # save_model is expected to write the given path plus a ".h5" file
         self.assertTrue(isfile(model_path))
         self.assertTrue(isfile(weights_path))
         model = utils.load_model(model_path)
         # assertEqual reports both operands on failure
         self.assertEqual(self.m1.params, model.params)
         self.assertEqual(self.m1.model.get_config(),
                          model.model.get_config())
         # fetch each weight list once and compare every array
         expected_weights = self.m1.model.get_weights()
         loaded_weights = model.model.get_weights()
         self.assertEqual(len(expected_weights), len(loaded_weights))
         for expected, loaded in zip(expected_weights, loaded_weights):
             self.assertTrue(np.allclose(expected, loaded))
     finally:
         # do not leave files behind for other tests when an assertion fails
         for leftover in (model_path, weights_path):
             if isfile(leftover):
                 remove(leftover)
def measure_rbp(entry):
    """Run and time a complete pysster analysis for one data set.

    Structures are predicted for four input files, a model is trained on
    the first two and evaluated on the last two. ROC and precision-recall
    plots, kernel visualizations and the trained model are written to the
    folder "<entry[4]>_pysster/".

    Parameters
    ----------
    entry: sequence
        entry[0] and entry[1] are the training input files, entry[2] and
        entry[3] the test input files (presumably positive/negative fasta
        files -- confirm with the caller), entry[4] is the name used for
        the output folder and the timing message.
    """
    import os
    from time import time
    from pysster import utils

    rbp_name = entry[4]
    output_folder = rbp_name + "_pysster/"
    if not os.path.isdir(output_folder):
        os.makedirs(output_folder)

    start = time()

    # predict secondary structures for all four input files
    struct_files = []
    for idx in range(4):
        struct_file = entry[idx] + ".struct"
        utils.predict_structures(entry[idx], struct_file, annotate=True)
        struct_files.append(struct_file)

    # imported only after the structure prediction, as in the original workflow
    from pysster.Data import Data
    from pysster.Model import Model

    alphabets = ("ACGU", "HIMS")

    # load the training data; the split must leave at least one test
    # sequence, even though a separate test object is used below
    data = Data(struct_files[:2], alphabets)
    data.train_val_test_split(0.8, 0.1999)

    # train a model with a kernel length of 8
    model = Model({"kernel_len": 8}, data)
    model.train(data)

    # load the separate test data and predict all of it
    data_test = Data(struct_files[2:], alphabets)
    predictions = model.predict(data_test, "all")

    stop = time()
    print("{}, time in seconds: {}".format(rbp_name, stop - start))

    # performance evaluation
    labels = data_test.get_labels("all")
    utils.plot_roc(labels, predictions, output_folder + "roc.pdf")
    utils.plot_prec_recall(labels, predictions, output_folder + "prec.pdf")

    # motif visualization (the returned value is not needed here)
    activations = model.get_max_activations(data_test, "all")
    model.visualize_all_kernels(activations, data_test, output_folder)

    # save the trained model to drive
    utils.save_model(model, output_folder + "model.pkl")
Beispiel #4
0
## Performance evaluation
# Notebook-style script: it relies on `model`, `data`, `utils`, `output_folder`
# and `Image` being defined earlier (Image is presumably IPython.display.Image
# -- confirm). Bare expressions such as `predictions` only display a value
# when run as the last statement of a notebook cell.

# predict the "test" subset of the data
predictions = model.predict(data, "test")
predictions

# matching labels for the same subset
labels = data.get_labels("test")
labels

# write ROC and precision-recall plots to the output folder and print the report
utils.plot_roc(labels, predictions, output_folder + "roc.png")
utils.plot_prec_recall(labels, predictions, output_folder + "prec.png")
print(utils.get_performance_report(labels, predictions))

# show the two plots that were just written
Image(output_folder + "roc.png")
Image(output_folder + "prec.png")

# compute the maximum activations on the test subset and visualize all kernels;
# the images referenced below are presumably written by visualize_all_kernels
activations = model.get_max_activations(data, "test")
logos = model.visualize_all_kernels(activations, data, output_folder)
Image(output_folder + "motif_kernel_13.png")
Image(output_folder + "activations_kernel_13.png")
Image(output_folder + "position_kernel_13.png")
Image(output_folder + "data/alu.png")

# each entry of `logos` is indexed with 0 and 1; judging by the file names
# these are the sequence and structure motifs -- TODO confirm
utils.save_as_meme([logo[0] for logo in logos],
                   output_folder + "motifs_seq.meme")
utils.save_as_meme([logo[1] for logo in logos],
                   output_folder + "motifs_struct.meme")
# plot a clustering based on the activations and show it
model.plot_clustering(activations, output_folder + "clustering.png")
Image(output_folder + "clustering.png")

# store the data object and the trained model in the output folder
utils.save_data(data, output_folder + "data.pkl")
utils.save_model(model, output_folder + "model.pkl")
Beispiel #5
0
    def train(self, data, pr_auc=False, verbose=True):
        """ Train all models and return the best one.

        Models are evaluated and ranked according to their ROC-AUC or PR-AUC (precision-recall)
        on a validation data set. The best model seen so far is kept in a temporary
        file; that file (and its ".h5" companion) is removed again before this
        method returns or raises.

        Parameters
        ----------
        data: pysster.Data
            A Data object providing training and validation data sets.
        
        pr_auc: bool
            If True, the area under the precision-recall curve will be maximized instead of the area under the ROC curve

        verbose: bool
            If True, progress information (train/val loss) will be printed throughout the training.

        Returns
        -------
        results: tuple(pysster.Model, str)
            The best performing model and an overview table of all models are returned.

        Raises
        ------
        RuntimeError
            If no model could be selected, i.e. there are no candidates or no
            candidate reached a score greater than -1 (e.g. the score was NaN).
        """
        best_model_path = "{}/{}".format(
            gettempdir(),
            ''.join(random.choice(string.ascii_uppercase) for _ in range(20)))
        # column of the averaged performance report that is used for ranking
        if pr_auc:
            metric_idx = 4
            # NOTE(review): "pre-auc" presumably abbreviates precision-recall
            # AUC; kept verbatim because it is printed and passed on below
            metric_name = "pre-auc"
        else:
            metric_idx = 3
            metric_name = "roc-auc"
        metric = []
        max_metric = -1
        # becomes True once save_model succeeded, i.e. once temp files exist
        model_saved = False
        try:
            for i, candidate in enumerate(self.candidates):
                model = Model(candidate, data)
                model.train(data, verbose)
                predictions = model.predict(data, "val")
                labels = data.get_labels("val")
                report = utils.performance_report(labels, predictions)
                # average of the report rows, weighted by the report's last column
                # (presumably the per-class support -- confirm in utils)
                metric_val = np.sum(report[:, 0:-1] * report[:, -1, np.newaxis],
                                    axis=0)
                metric_val = (metric_val / np.sum(report[:, -1]))[metric_idx]
                metric.append(metric_val)
                if metric_val > max_metric:
                    max_metric = metric_val
                    utils.save_model(model, best_model_path)
                    model_saved = True
                K.clear_session()
                K.reset_uids()
                if verbose:
                    print("\n=== Summary ===")
                    print("Model {}/{} = {:.5f} weighted avg {}".format(
                        i + 1, len(self.candidates), metric_val, metric_name))
                    for param in candidate:
                        if param not in ["input_shape"]:
                            print(" - {}: {}".format(param, candidate[param]))
            if not model_saved:
                # fail with a clear message instead of a confusing error from
                # load_model about a file that was never written
                raise RuntimeError(
                    "No model could be selected: there are no candidates or "
                    "no candidate reached a valid score.")
            # load the best model
            model = utils.load_model(best_model_path)
        finally:
            # remove the temporary files even if training or loading failed
            if model_saved:
                remove(best_model_path)
                remove("{}.h5".format(best_model_path))
        # save a formatted summary of all trained models
        table = self._grid_search_table(metric, metric_name)
        return model, table
def main():
    """Train, evaluate and interpret one pysster model per RNA-binding protein.

    For each protein, structures are predicted for the four fasta files
    under "data/", a model is trained on the training files and evaluated
    on the test files. Plots, kernel visualizations, a sorted list of
    kernel scores and the trained model are written to "<NAME>_pysster/".
    """
    rbp_names = ("PUM2", "QKI", "IGF2BP123", "SRSF1", "TAF2N", "NOVA")
    subsets = ("train.positive", "train.negative",
               "test.positive", "test.negative")
    alphabets = ("ACGU", "HIMS")

    # one entry per protein: the four fasta files followed by the name
    rbp_entries = [
        tuple("data/{}.{}.fasta".format(name.lower(), subset)
              for subset in subsets) + (name,)
        for name in rbp_names
    ]

    for entry in rbp_entries:
        fasta_files, rbp_name = entry[:4], entry[4]
        output_folder = rbp_name + "_pysster/"
        if not os.path.isdir(output_folder):
            os.makedirs(output_folder)

        start = time()

        # predict secondary structures for all four fasta files
        struct_files = [fasta + ".struct.gz" for fasta in fasta_files]
        for fasta, struct_file in zip(fasta_files, struct_files):
            utils.predict_structures(fasta, struct_file, annotate=True)

        # load the training data; the split must leave at least one test
        # sequence, even though it is not needed
        data = Data(struct_files[:2], alphabets)
        data.train_val_test_split(0.8, 0.1999)
        print(data.get_summary())

        # train a model with a kernel length of 8
        model = Model({"kernel_len": 8}, data)
        model.train(data)

        # load the separate test data and predict all of it
        data_test = Data(struct_files[2:], alphabets)
        predictions = model.predict(data_test, "all")

        stop = time()
        print("{}, time in seconds: {}".format(rbp_name, stop - start))

        # performance evaluation
        labels = data_test.get_labels("all")
        utils.plot_roc(labels, predictions, output_folder + "roc.pdf")
        utils.plot_prec_recall(labels, predictions, output_folder + "prec.pdf")
        print(utils.get_performance_report(labels, predictions))

        # visualize every kernel and collect its score
        activations = model.get_max_activations(data_test, "all")
        scores = []
        for kernel in range(model.params["kernel_num"]):
            _, score = model.visualize_kernel(activations, data_test, kernel,
                                              output_folder)
            scores.append(score)

        # report the kernels in ascending order of their score
        ranking = sorted(range(len(scores)), key=scores.__getitem__)
        with open(output_folder + "kernel_scores.txt", "wt") as handle:
            for kernel in ranking:
                line = "kernel {:>3}: {:.3f}".format(kernel, scores[kernel])
                print(line)
                handle.write(line + "\n")

        # save the trained model to drive
        utils.save_model(model, output_folder + "model.pkl")