Example #1
0
def main():
    """Train, evaluate and interpret one model per RNA-binding protein.

    For each dataset tuple (train positive, train negative, test positive,
    test negative, protein name) this:

    1. creates the output folder "<name>_pysster/" if it does not exist,
    2. predicts annotated structures for all four FASTA files,
    3. trains a model on the training pair and predicts the test pair,
    4. prints the elapsed time for steps 2-3,
    5. writes ROC / precision-recall plots and prints a performance report,
    6. visualizes every kernel, then prints and saves the kernel scores
       in ascending score order,
    7. saves the trained model into the output folder.
    """
    rbp_datasets = [
        ("data/pum2.train.positive.fasta",
         "data/pum2.train.negative.fasta",
         "data/pum2.test.positive.fasta",
         "data/pum2.test.negative.fasta",
         "PUM2"),
        ("data/qki.train.positive.fasta",
         "data/qki.train.negative.fasta",
         "data/qki.test.positive.fasta",
         "data/qki.test.negative.fasta",
         "QKI"),
        ("data/igf2bp123.train.positive.fasta",
         "data/igf2bp123.train.negative.fasta",
         "data/igf2bp123.test.positive.fasta",
         "data/igf2bp123.test.negative.fasta",
         "IGF2BP123"),
        ("data/srsf1.train.positive.fasta",
         "data/srsf1.train.negative.fasta",
         "data/srsf1.test.positive.fasta",
         "data/srsf1.test.negative.fasta",
         "SRSF1"),
        ("data/taf2n.train.positive.fasta",
         "data/taf2n.train.negative.fasta",
         "data/taf2n.test.positive.fasta",
         "data/taf2n.test.negative.fasta",
         "TAF2N"),
        ("data/nova.train.positive.fasta",
         "data/nova.train.negative.fasta",
         "data/nova.test.positive.fasta",
         "data/nova.test.negative.fasta",
         "NOVA"),
    ]

    for train_pos, train_neg, test_pos, test_neg, protein in rbp_datasets:
        out_dir = protein + "_pysster/"
        if not os.path.isdir(out_dir):
            os.makedirs(out_dir)

        started_at = time()

        # Predict structures for the four FASTA files, in the listed order;
        # each result is written next to its input as "<fasta>.struct.gz".
        struct_files = []
        for fasta in (train_pos, train_neg, test_pos, test_neg):
            struct_path = fasta + ".struct.gz"
            utils.predict_structures(fasta, struct_path, annotate=True)
            struct_files.append(struct_path)
        train_structs = struct_files[:2]
        test_structs = struct_files[2:]

        # Load the training data. At least one test sequence is required
        # by the split even though it is never used here, hence 0.1999.
        data = Data(train_structs, ("ACGU", "HIMS"))
        data.train_val_test_split(0.8, 0.1999)
        print(data.get_summary())

        # Train the model.
        params = {"kernel_len": 8}
        model = Model(params, data)
        model.train(data)

        # Load the held-out data and predict it.
        data_test = Data(test_structs, ("ACGU", "HIMS"))
        predictions = model.predict(data_test, "all")

        finished_at = time()
        print("{}, time in seconds: {}".format(protein,
                                               finished_at - started_at))

        # Evaluate the predictions against the true labels.
        labels = data_test.get_labels("all")
        utils.plot_roc(labels, predictions, out_dir + "roc.pdf")
        utils.plot_prec_recall(labels, predictions, out_dir + "prec.pdf")
        print(utils.get_performance_report(labels, predictions))

        # Visualize every kernel and collect its logo and score.
        activations = model.get_max_activations(data_test, "all")
        motif_logos = []
        scores = []
        for kernel_idx in range(model.params["kernel_num"]):
            logo, score = model.visualize_kernel(activations, data_test,
                                                 kernel_idx, out_dir)
            motif_logos.append(logo)
            scores.append(score)

        # Report kernels ordered by ascending score (stable for ties).
        ranking = sorted(range(len(scores)), key=lambda idx: scores[idx])
        with open(out_dir + "kernel_scores.txt", "wt") as handle:
            for kernel_idx in ranking:
                line = "kernel {:>3}: {:.3f}".format(kernel_idx,
                                                     scores[kernel_idx])
                print(line)
                handle.write(line + "\n")

        # Persist the trained model.
        utils.save_model(model, "{}model.pkl".format(out_dir))
Example #2
0
class Test_Model(unittest.TestCase):
    """Tests for Model: construction, training, prediction and plotting."""

    def setUp(self):
        """Load the RNA fixture and build three small models.

        m1 and m3 share seed 2 while m2 uses seed 13, so the tests can
        check that equal seeds give equal weights and different seeds
        do not.
        """
        folder = dirname(__file__)
        file_name = folder + "/data/rna.fasta"
        self.data = Data(file_name, ("ACGU", "()."))
        self.params = {
            "conv_num": 1,
            "kernel_num": 3,
            "kernel_len": 5,
            "neuron_num": 2,
            "epochs": 3
        }
        self.m1 = Model(self.params, self.data, seed=2)
        self.m2 = Model(self.params, self.data, seed=13)
        self.m3 = Model(self.params, self.data, seed=2)

    @staticmethod
    def _layer_weights(model, layer):
        """Return the first weight array of layer `layer` of `model`."""
        return model.model.layers[layer].get_weights()[0]

    def _assert_weights_follow_seed(self, **tolerance):
        """Check layers 2 and 6: m1 matches m3, m2 differs from m3.

        `tolerance` is forwarded to `np.allclose` (e.g. ``atol=0.001``).
        """
        for layer in (2, 6):
            reference = self._layer_weights(self.m3, layer)
            self.assertTrue(
                np.allclose(self._layer_weights(self.m1, layer), reference,
                            **tolerance))
            self.assertFalse(
                np.allclose(self._layer_weights(self.m2, layer), reference,
                            **tolerance))

    def _assert_kernel_plots_then_remove(self, folder, kernel):
        """Assert the three per-kernel plots exist in `folder`, then delete them."""
        plots = [
            folder + "motif_kernel_{}.png".format(kernel),
            folder + "position_kernel_{}.png".format(kernel),
            folder + "activations_kernel_{}.png".format(kernel),
        ]
        # Check all files before removing any, so a failure reports the
        # first missing plot without having deleted the others.
        for plot in plots:
            self.assertTrue(isfile(plot))
        for plot in plots:
            remove(plot)

    def test_model_init(self):
        """Parameters are stored and initial weights depend only on the seed."""
        self.assertEqual(self.m1.params["conv_num"], 1)
        self.assertEqual(self.m1.params["kernel_num"], 3)
        self.assertEqual(self.m1.params["kernel_len"], 5)
        self.assertEqual(self.m1.params["neuron_num"], 2)
        self.assertEqual(self.m1.params["activation"], "sigmoid")
        # (kernel_len, 12, kernel_num); 12 is presumably the combined
        # sequence/structure alphabet size -- TODO confirm.
        self.assertEqual(self._layer_weights(self.m1, 2).shape, (5, 12, 3))
        self._assert_weights_follow_seed()

    def test_model_train_predict(self):
        """Training and prediction yield the expected shapes and value range."""
        for obj in [self.m1, self.m2, self.m3]:
            obj.train(self.data, verbose=False)
            for group, expected_shape in (("test", (3, 3)),
                                          ("all", (20, 3))):
                predictions = obj.predict(self.data, group)
                self.assertEqual(predictions.shape, expected_shape)
                self.assertTrue((predictions > 0.49).all())
                self.assertTrue((predictions < 0.51).all())
        # After training, equal seeds must still give (nearly) equal weights.
        self._assert_weights_follow_seed(atol=0.001)

    def test_model_get_max_activations(self):
        """get_max_activations returns per-sequence activations and labels."""
        acts = self.m1.get_max_activations(self.data, 'test')
        self.assertEqual(acts['activations'].shape, (3, 3))
        self.assertEqual(acts['labels'].shape, (3, 3))
        self.assertEqual(acts['group'], 'test')

    def test_model_visualize_kernel(self):
        """Kernel visualization writes its plots and returns motif tuples."""
        acts = self.m1.get_max_activations(self.data, 'all')
        folder = gettempdir() + '/'
        # individual kernels
        for kernel in range(self.params['kernel_num']):
            motif, score = self.m1.visualize_kernel(acts, self.data, kernel,
                                                    folder)
            self._assert_kernel_plots_then_remove(folder, kernel)
            self.assertIsInstance(motif, tuple)
            self.assertIsInstance(motif[0], Motif)
            # The score must be non-negative, allowing for float noise at 0.
            self.assertTrue(np.isclose(score, 0) or score > 0)
        # all kernels
        motifs = self.m1.visualize_all_kernels(acts, self.data, folder)
        self.assertEqual(len(motifs), 3)
        for x in range(3):
            self.assertIsInstance(motifs[x], tuple)
            self.assertIsInstance(motifs[x][0], Motif)
        for kernel in range(self.params['kernel_num']):
            self._assert_kernel_plots_then_remove(folder, kernel)
        self.assertTrue(isfile(folder + "summary.html"))
        remove(folder + "summary.html")

    def test_model_plot_clustering(self):
        """plot_clustering writes the requested image file."""
        acts = self.m1.get_max_activations(self.data, 'test')
        plot_file = gettempdir() + "/clust.png"
        self.m1.plot_clustering(acts, plot_file)
        # The plot must exist after the call; remove it afterwards so the
        # temp folder is left clean, as the other plotting tests do.
        self.assertTrue(isfile(plot_file))
        remove(plot_file)

    def test_model_optimized_inputs(self):
        """Optimized-input plots have the expected pixel dimensions."""
        self.m1.visualize_optimized_inputs(self.data,
                                           self.m1.model.layers[2].name,
                                           gettempdir() + "/test.png")
        self.m1.visualize_optimized_inputs(self.data,
                                           self.m1.model.layers[2].name,
                                           gettempdir() + "/test2.png",
                                           nodes=[0])
        with Image.open(gettempdir() + "/test.png") as img:
            self.assertEqual(img.size, (1998, 1128))
        # Restricting to a single node gives a shorter image of equal width.
        with Image.open(gettempdir() + "/test2.png") as img:
            self.assertEqual(img.size, (1998, 376))
        remove(gettempdir() + "/test.png")
        remove(gettempdir() + "/test2.png")
Example #3
0
    def test_data_additional(self):
        """Check additional (meta) and position-wise data of `self.data_pwm`.

        Covers the stored meta features, the rows returned by
        `_get_additional_data`, the arrays produced by `_data_generator`,
        and finally trains a small model to verify prediction shape and
        the additional-features kernel plot.
        """
        pwm = self.data_pwm

        # --- per-sequence meta features ---
        self.assertTrue(len(pwm.meta) == 2)
        self.assertTrue(pwm.meta[0]['is_categorical'] == False)
        self.assertTrue(pwm.meta[1]['is_categorical'] == True)
        # Numeric feature counts up 1..16 and back down 16..1.
        expected_numeric = list(range(1, 17)) + list(range(16, 0, -1))
        self.assertTrue(pwm.meta[0]['data'] == expected_numeric)
        categorical = pwm.meta[1]['data']
        self.assertTrue(len(categorical) == 32)
        for encoded in categorical:
            self.assertTrue(sum(encoded) == 1)
        for left, right in ((0, 31), (13, 18)):
            self.assertTrue((categorical[left] == categorical[right]).all())

        # Each returned row is the numeric value followed by the
        # categorical encoding of the same sequence.
        addi = pwm._get_additional_data([0, 1, 15, 16], 0, 4)
        self.assertTrue(len(addi) == 4)
        expected_rows = ((0, 1, 0), (1, 2, 1), (2, 16, 15), (3, 16, 16))
        for row, numeric, seq_idx in expected_rows:
            self.assertTrue(
                np.allclose(addi[row], [numeric, *categorical[seq_idx]]))

        # check position-wise additional data
        self.assertTrue(len(pwm.positionwise) == 2)
        feature_names = list(pwm.positionwise.keys())
        self.assertTrue(feature_names == ["feat1", "feat2"])
        batch = next(pwm._data_generator("all", 32, False, False))
        self.assertTrue(batch[1].shape == (32, 17))
        self.assertTrue(batch[0].shape == (32, 10, 14))
        # (sample, channel, expected values along the middle axis)
        position_checks = (
            (0, 12, [0.9, 0.8, 0.7, 0.9, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]),
            (31, 13, [0.1, 0.2, 0.3, 0.1, 1.0, 1.0, 0.8, 0.3, 0.2, 0.1]),
            (31, 12, [2.1, 2.2, 2.3, 2.1, 2.0, 2.0, 2.8, 2.3, 2.2, 2.1]),
        )
        for sample, channel, expected in position_checks:
            self.assertTrue(
                np.allclose(batch[0][sample, :, channel], expected))

        # --- train a small model on the data ---
        model_params = {
            "conv_num": 1,
            "kernel_num": 2,
            "kernel_len": 4,
            "neuron_num": 2,
            "epochs": 2
        }
        mod = Model(model_params, pwm)
        mod.train(pwm, verbose=True)
        predictions = mod.predict(pwm, "all")
        self.assertTrue(predictions.shape == (32, 2))

        # check kernel output plot for position-wise data
        folder = gettempdir() + '/'
        acts = mod.get_max_activations(pwm, 'all')
        _motif, _score = mod.visualize_kernel(acts, pwm, 0, folder)
        with Image.open(folder + "additional_features_kernel_0.png") as img:
            self.assertTrue(img.size == (500, 1400))
        # Clean up every plot written for kernel 0.
        for plot_name in ("additional_features_kernel_0.png",
                          "motif_kernel_0.png",
                          "position_kernel_0.png",
                          "activations_kernel_0.png"):
            remove(folder + plot_name)