def setUp(self):
    """Create the shared fixture: one data set and three small models.

    ``m1`` and ``m3`` are built with the same seed (2), ``m2`` with a
    different one (13).
    """
    fasta_path = "{}/data/rna.fasta".format(dirname(__file__))
    self.data = Data(fasta_path, ("ACGU", "()."))
    self.params = dict(conv_num=1, kernel_num=3, kernel_len=5,
                       neuron_num=2, epochs=3)
    # Models are created in the order m1, m2, m3.
    self.m1, self.m2, self.m3 = (
        Model(self.params, self.data, seed=seed) for seed in (2, 13, 2)
    )
def train(self, data, verbose=True):
    """ Train every candidate model and return the best one.

    Each candidate is scored on the validation data set. The score is
    entry 3 of the column-wise average of the performance report, weighted
    by the report's last column (printed as "weighted avg roc-auc").

    Parameters
    ----------
    data: pysster.Data
        A Data object providing training and validation data sets.

    verbose: bool
        If True, progress information (train/val loss) and a short summary
        per model will be printed throughout the training.

    Returns
    -------
    results: tuple(pysster.Model, str)
        The best performing model and an overview table of all models.
    """
    # The best model so far is parked in a randomly named temporary file.
    random_name = ''.join(
        random.choice(string.ascii_uppercase) for _ in range(20))
    best_model_path = "{}/{}".format(gettempdir(), random_name)

    scores = []
    best_score = -1
    for number, candidate in enumerate(self.candidates, start=1):
        model = Model(candidate, data)
        model.train(data, verbose)
        predictions = model.predict(data, "val")
        labels = data.get_labels("val")
        report = utils.performance_report(labels, predictions)

        # Average all metric columns, weighted by the last report column.
        weights = report[:, -1]
        weighted_avg = np.sum(
            report[:, 0:-1] * weights[:, np.newaxis], axis=0) / np.sum(weights)
        score = weighted_avg[3]
        scores.append(score)

        if score > best_score:
            best_score = score
            utils.save_model(model, best_model_path)

        # Reset the backend state before building the next candidate.
        K.clear_session()
        K.reset_uids()

        if verbose:
            print("\n=== Summary ===")
            print("Model {}/{} = {:.5f} weighted avg roc-auc".format(
                number, len(self.candidates), score))
            for param in candidate:
                if param != "input_shape":
                    print(" - {}: {}".format(param, candidate[param]))

    # load the best model (and remove it from disc)
    best_model = utils.load_model(best_model_path)
    for temp_file in (best_model_path, "{}.h5".format(best_model_path)):
        remove(temp_file)

    # save a formatted summary of all trained models
    return best_model, self._grid_search_table(scores)
def test_data_additional(self):
    """Check the additional (meta) data and train a small model on it."""
    meta = self.data_pwm.meta
    self.assertTrue(len(meta) == 2)
    self.assertTrue(meta[0]['is_categorical'] == False)
    self.assertTrue(meta[1]['is_categorical'] == True)

    # The numerical feature ramps up from 1 to 16 and back down to 1.
    ramp = list(range(1, 17)) + list(range(16, 0, -1))
    self.assertTrue(meta[0]['data'] == ramp)

    categorical = meta[1]['data']
    self.assertTrue(len(categorical) == 32)
    for encoded in categorical:
        self.assertTrue(sum(encoded) == 1)
    self.assertTrue((categorical[0] == categorical[31]).all())
    self.assertTrue((categorical[13] == categorical[18]).all())

    addi = self.data_pwm._get_additional_data([0, 1, 15, 16], 0, 4)
    self.assertTrue(len(addi) == 4)
    # (expected numerical value, index of the expected categorical entry)
    expected_rows = [(1, 0), (2, 1), (16, 15), (16, 16)]
    for row, (value, source_idx) in enumerate(expected_rows):
        self.assertTrue(
            np.allclose(addi[row], [value, *categorical[source_idx]]))

    model_params = dict(conv_num=1, kernel_num=2, kernel_len=4,
                        neuron_num=2, epochs=2)
    mod = Model(model_params, self.data_pwm)
    mod.train(self.data_pwm, verbose=True)
    predictions = mod.predict(self.data_pwm, "all")
    self.assertTrue(predictions.shape == (32, 2))
def load_model(file_path):
    """ Load a pysster.Model object.

    The gzip-compressed pickle at file_path supplies the parameters used to
    construct the Model; the network itself is read from "file_path.h5".

    NOTE: the file is unpickled, so only load files from trusted sources.

    Parameters
    ----------
    file_path : str
        A file containing a pickled pysster.Model object (file_path.h5 must
        also exist, see save_model()).

    Returns
    -------
    model : pysster.Model
        A Model object.

    Raises
    ------
    RuntimeError
        If file_path or file_path.h5 does not exist.
    """
    from pysster.Model import Model

    weights_path = "{}.h5".format(file_path)
    required_files = (
        (file_path, "Path not found."),
        (weights_path, "HDF5 file not found."),
    )
    for path, message in required_files:
        if not os.path.exists(path):
            raise RuntimeError(message)

    with gzip.open(file_path, "rb") as handle:
        params = pickle.load(handle)

    loaded = Model(params, None)
    loaded.model = keras.models.load_model(weights_path)
    return loaded
def measure_rbp(entry):
    """Run the full pysster workflow for one RBP and print the elapsed time.

    Parameters
    ----------
    entry : sequence
        Five items: the training positive, training negative, test positive
        and test negative FASTA paths, followed by a name. The name is used
        for the output folder ("<name>_pysster/") and in the timing message.
    """
    import os
    from time import time
    from pysster import utils

    name = entry[4]
    output_folder = name + "_pysster/"
    if not os.path.isdir(output_folder):
        os.makedirs(output_folder)

    start = time()

    # predict secondary structures
    for idx in range(4):
        fasta = entry[idx]
        utils.predict_structures(fasta, fasta + ".struct", annotate=True)

    from pysster.Data import Data
    from pysster.Model import Model

    alphabets = ("ACGU", "HIMS")

    # load data
    data = Data([entry[0] + ".struct", entry[1] + ".struct"], alphabets)
    # we need to have at least one test sequence, even though we have a
    # separate test object
    data.train_val_test_split(0.8, 0.1999)

    # training
    model = Model({"kernel_len": 8}, data)
    model.train(data)

    # load and predict test data
    data_test = Data([entry[2] + ".struct", entry[3] + ".struct"], alphabets)
    predictions = model.predict(data_test, "all")

    stop = time()
    print("{}, time in seconds: {}".format(name, stop - start))

    # performance evaluation
    labels = data_test.get_labels("all")
    utils.plot_roc(labels, predictions, output_folder + "roc.pdf")
    utils.plot_prec_recall(labels, predictions, output_folder + "prec.pdf")

    # get motifs
    activations = model.get_max_activations(data_test, "all")
    model.visualize_all_kernels(activations, data_test, output_folder)

    # save model to drive
    utils.save_model(model, "{}model.pkl".format(output_folder))
def train(self, data, pr_auc=False, verbose=True):
    """ Train every candidate model and return the best one.

    Each candidate is scored on the validation data set. The score is one
    entry (index 3 for ROC-AUC, index 4 for PR-AUC) of the column-wise
    average of the performance report, weighted by the report's last column.

    Parameters
    ----------
    data: pysster.Data
        A Data object providing training and validation data sets.

    pr_auc: bool
        If True, the area under the precision-recall curve will be maximized
        instead of the area under the ROC curve

    verbose: bool
        If True, progress information (train/val loss) and a short summary
        per model will be printed throughout the training.

    Returns
    -------
    results: tuple(pysster.Model, str)
        The best performing model and an overview table of all models.
    """
    # The best model so far is parked in a randomly named temporary file.
    random_name = ''.join(
        random.choice(string.ascii_uppercase) for _ in range(20))
    best_model_path = "{}/{}".format(gettempdir(), random_name)

    # Explicit comparison against True is kept on purpose so that only
    # values equal to True select the precision-recall metric.
    metric_idx, metric_name = (
        (4, "pre-auc") if pr_auc == True else (3, "roc-auc"))

    scores = []
    best_score = -1
    for number, candidate in enumerate(self.candidates, start=1):
        model = Model(candidate, data)
        model.train(data, verbose)
        predictions = model.predict(data, "val")
        labels = data.get_labels("val")
        report = utils.performance_report(labels, predictions)

        # Average all metric columns, weighted by the last report column.
        weights = report[:, -1]
        weighted_avg = np.sum(
            report[:, 0:-1] * weights[:, np.newaxis], axis=0) / np.sum(weights)
        score = weighted_avg[metric_idx]
        scores.append(score)

        if score > best_score:
            best_score = score
            utils.save_model(model, best_model_path)

        # Reset the backend state before building the next candidate.
        K.clear_session()
        K.reset_uids()

        if verbose:
            print("\n=== Summary ===")
            print("Model {}/{} = {:.5f} weighted avg {}".format(
                number, len(self.candidates), score, metric_name))
            for param in candidate:
                if param != "input_shape":
                    print(" - {}: {}".format(param, candidate[param]))

    # load the best model (and remove it from disc)
    best_model = utils.load_model(best_model_path)
    for temp_file in (best_model_path, "{}.h5".format(best_model_path)):
        remove(temp_file)

    # save a formatted summary of all trained models
    return best_model, self._grid_search_table(scores, metric_name)
class Test_Model(unittest.TestCase):
    """Tests for Model construction, training, prediction and visualization."""

    # File name templates of the plots written per kernel.
    _KERNEL_PLOTS = (
        "motif_kernel_{}.png",
        "position_kernel_{}.png",
        "activations_kernel_{}.png",
    )

    def setUp(self):
        """Create one data set and three models (m1 and m3 share a seed)."""
        fasta_path = "{}/data/rna.fasta".format(dirname(__file__))
        self.data = Data(fasta_path, ("ACGU", "()."))
        self.params = dict(conv_num=1, kernel_num=3, kernel_len=5,
                           neuron_num=2, epochs=3)
        self.m1, self.m2, self.m3 = (
            Model(self.params, self.data, seed=seed) for seed in (2, 13, 2)
        )

    @staticmethod
    def _first_weights(model, layer_idx):
        """Return the first weight array of the given keras layer."""
        return model.model.layers[layer_idx].get_weights()[0]

    def _assert_seed_effect(self, **allclose_kwargs):
        """Same seed -> close weights (m1 vs m3); other seed -> not (m2 vs m3)."""
        for layer_idx in (2, 6):
            reference = self._first_weights(self.m3, layer_idx)
            self.assertTrue(np.allclose(
                self._first_weights(self.m1, layer_idx), reference,
                **allclose_kwargs))
            self.assertFalse(np.allclose(
                self._first_weights(self.m2, layer_idx), reference,
                **allclose_kwargs))

    def _assert_and_remove_kernel_plots(self, folder, kernel):
        """Assert that all plots of one kernel exist, then delete them."""
        paths = [folder + template.format(kernel)
                 for template in self._KERNEL_PLOTS]
        for path in paths:
            self.assertTrue(isfile(path))
        for path in paths:
            remove(path)

    def test_model_init(self):
        expected_params = (
            ("conv_num", 1),
            ("kernel_num", 3),
            ("kernel_len", 5),
            ("neuron_num", 2),
            ("activation", "sigmoid"),
        )
        for key, expected in expected_params:
            self.assertTrue(self.m1.params[key] == expected)
        self.assertTrue(self._first_weights(self.m1, 2).shape == (5, 12, 3))
        self._assert_seed_effect()

    def test_model_train_predict(self):
        expected_shapes = (("test", (3, 3)), ("all", (20, 3)))
        for obj in [self.m1, self.m2, self.m3]:
            obj.train(self.data, verbose=False)
            for group, shape in expected_shapes:
                predictions = obj.predict(self.data, group)
                self.assertTrue(predictions.shape == shape)
                self.assertTrue((predictions > 0.49).all())
                self.assertTrue((predictions < 0.51).all())
        # After training, identically seeded models must still agree.
        self._assert_seed_effect(atol=0.001)

    def test_model_get_max_activations(self):
        acts = self.m1.get_max_activations(self.data, 'test')
        self.assertTrue(acts['activations'].shape == (3, 3))
        self.assertTrue(acts['labels'].shape == (3, 3))
        self.assertTrue(acts['group'] == 'test')

    def test_model_visualize_kernel(self):
        acts = self.m1.get_max_activations(self.data, 'all')
        folder = gettempdir() + '/'
        kernel_count = self.params['kernel_num']

        # individual kernels
        for kernel in range(kernel_count):
            motif, score = self.m1.visualize_kernel(
                acts, self.data, kernel, folder)
            self._assert_and_remove_kernel_plots(folder, kernel)
            self.assertTrue(isinstance(motif, tuple))
            self.assertTrue(isinstance(motif[0], Motif))
            self.assertTrue(np.isclose(score, 0) or score > 0)

        # all kernels
        motifs = self.m1.visualize_all_kernels(acts, self.data, folder)
        self.assertTrue(len(motifs) == 3)
        for idx in range(3):
            self.assertTrue(isinstance(motifs[idx], tuple))
            self.assertTrue(isinstance(motifs[idx][0], Motif))
        for kernel in range(kernel_count):
            self._assert_and_remove_kernel_plots(folder, kernel)
        summary_path = folder + "summary.html"
        self.assertTrue(isfile(summary_path))
        remove(summary_path)

    def test_model_plot_clustering(self):
        plot_path = gettempdir() + "/clust.png"
        acts = self.m1.get_max_activations(self.data, 'test')
        self.m1.plot_clustering(acts, plot_path)
        # NOTE(review): this expects that NO file is written for the 'test'
        # group -- presumably too few sequences to cluster; confirm.
        self.assertFalse(isfile(plot_path))

    def test_model_optimized_inputs(self):
        all_nodes_png = gettempdir() + "/test.png"
        one_node_png = gettempdir() + "/test2.png"
        layer_name = self.m1.model.layers[2].name

        self.m1.visualize_optimized_inputs(self.data, layer_name,
                                           all_nodes_png)
        self.m1.visualize_optimized_inputs(self.data, layer_name,
                                           one_node_png, nodes=[0])

        with Image.open(all_nodes_png) as img:
            self.assertTrue(img.size == (1998, 1128))
        with Image.open(one_node_png) as img:
            self.assertTrue(img.size == (1998, 376))
        remove(all_nodes_png)
        remove(one_node_png)
class Test_utils(unittest.TestCase):
    """Tests for the save/load, structure and MEME helpers in ``utils``."""

    def setUp(self):
        """Load the RNA test data and train one small model on it."""
        self.folder = dirname(__file__)
        file_name = self.folder + "/data/rna.fasta"
        self.data = Data(file_name, ("ACGU", "()."))
        self.params = {
            "conv_num": 1,
            "kernel_num": 3,
            "kernel_len": 5,
            "neuron_num": 2,
            "epochs": 3
        }
        self.m1 = Model(self.params, self.data, seed=2)
        self.m1.train(self.data, verbose=False)

    @staticmethod
    def _read_text(path):
        """Return the complete text content of ``path``."""
        with open(path, 'rt') as handle:
            return handle.read()

    def test_utils_save_load_model(self):
        """A saved model must come back with equal params, config and weights."""
        model_path = gettempdir() + "/model"
        utils.save_model(self.m1, model_path)
        self.assertTrue(isfile(model_path))
        self.assertTrue(isfile(model_path + ".h5"))
        model = utils.load_model(model_path)
        self.assertTrue(self.m1.params == model.params)
        self.assertTrue(self.m1.model.get_config() == model.model.get_config())
        for x in range(6):
            self.assertTrue(
                np.allclose(self.m1.model.get_weights()[x],
                            model.model.get_weights()[x]))
        remove(model_path)
        remove(model_path + ".h5")

    def test_utils_save_load_data(self):
        """A saved Data object must load back as a Data instance."""
        data_path = gettempdir() + "/data"
        utils.save_data(self.data, data_path)
        self.assertTrue(isfile(data_path))
        data = utils.load_data(data_path)
        self.assertTrue(isinstance(data, Data))
        remove(data_path)

    def test_utils_annotate_structures(self):
        """Annotated output must match the stored reference file."""
        out_path = gettempdir() + "/test.fasta"
        utils.annotate_structures(self.folder + "/data/rna_annot.fasta",
                                  out_path)
        ref = self._read_text(self.folder + "/data/rna_annot_ref.fasta")
        comp = self._read_text(out_path)
        self.assertTrue(ref == comp)
        remove(out_path)

    def test_utils_predict_structures(self):
        """Predicted structures must match the stored reference files.

        If neither the RNAlib python bindings nor the RNAfold executable is
        available, predict_structures() is required to raise and the rest of
        the test is skipped.
        """
        fasta_in = self.folder + "/data/rna_pred.fasta"
        fasta_out = gettempdir() + "/test2.fasta"

        try:
            # Only probing whether the RNAlib bindings can be imported.
            from RNA import fold  # noqa: F401
        except ImportError:
            if which("RNAfold") is None:
                # RNAfold and RNAlib bindings not available. The previous
                # try/raise/bare-except construct swallowed its own
                # "should have raised" error and therefore could never fail;
                # assertRaises makes the expectation effective.
                with self.assertRaises(Exception):
                    utils.predict_structures(fasta_in, fasta_out, 2, False)
                return

        # (annotate flag, reference file holding the expected output)
        cases = (
            (False, "/data/rna_pred_ref.fasta"),
            (True, "/data/rna_pred_ref_annot.fasta"),
        )
        for annotate, reference in cases:
            utils.predict_structures(fasta_in, fasta_out, 2, annotate)
            if not isfile(fasta_out):
                return
            ref = self._read_text(self.folder + reference)
            comp = self._read_text(fasta_out)
            self.assertTrue(ref == comp)
            remove(fasta_out)

    def test_utils_save_as_meme(self):
        """The written MEME file must match the stored reference file."""
        out_path = gettempdir() + "/test.meme"
        logos = [Motif('ACGT', ['GATTACA']), Motif('ACGT', ['AAAA'])]
        utils.save_as_meme(logos, out_path)
        ref = self._read_text(self.folder + "/data/ref.meme")
        comp = self._read_text(out_path)
        self.assertTrue(ref == comp)
        remove(out_path)
def main():
    """Run the pysster workflow for six RBP data sets.

    For every RBP: predict structures, train a model, predict the test data,
    print the elapsed time, write ROC / precision-recall plots, visualize
    all kernels, write the kernel scores and save the model.
    """
    rbp_names = ["PUM2", "QKI", "IGF2BP123", "SRSF1", "TAF2N", "NOVA"]
    fasta_parts = ["train.positive", "train.negative",
                   "test.positive", "test.negative"]
    # One tuple per RBP: four FASTA paths followed by the RBP name.
    RBPs = [
        tuple("data/{}.{}.fasta".format(name.lower(), part)
              for part in fasta_parts) + (name,)
        for name in rbp_names
    ]
    alphabets = ("ACGU", "HIMS")

    for entry in RBPs:
        train_pos, train_neg, test_pos, test_neg, name = entry
        output_folder = name + "_pysster/"
        if not os.path.isdir(output_folder):
            os.makedirs(output_folder)

        start = time()

        # predict secondary structures
        for fasta in (train_pos, train_neg, test_pos, test_neg):
            utils.predict_structures(fasta, fasta + ".struct.gz",
                                     annotate=True)

        # load data
        data = Data([train_pos + ".struct.gz", train_neg + ".struct.gz"],
                    alphabets)
        # we need to have at least one test sequence, even though we don't
        # need it
        data.train_val_test_split(0.8, 0.1999)
        print(data.get_summary())

        # training
        model = Model({"kernel_len": 8}, data)
        model.train(data)

        # load and predict test data
        data_test = Data([test_pos + ".struct.gz", test_neg + ".struct.gz"],
                         alphabets)
        predictions = model.predict(data_test, "all")

        stop = time()
        print("{}, time in seconds: {}".format(name, stop - start))

        # performance evaluation
        labels = data_test.get_labels("all")
        utils.plot_roc(labels, predictions, output_folder + "roc.pdf")
        utils.plot_prec_recall(labels, predictions,
                               output_folder + "prec.pdf")
        print(utils.get_performance_report(labels, predictions))

        # get motifs
        activations = model.get_max_activations(data_test, "all")
        logos, scores = [], []
        for kernel in range(model.params["kernel_num"]):
            logo, score = model.visualize_kernel(activations, data_test,
                                                 kernel, output_folder)
            logos.append(logo)
            scores.append(score)

        # sort motifs by importance score (ascending, ties keep kernel order)
        ranking = sorted(range(len(scores)), key=scores.__getitem__)
        with open(output_folder + "kernel_scores.txt", "wt") as handle:
            for kernel in ranking:
                line = "kernel {:>3}: {:.3f}".format(kernel, scores[kernel])
                print(line)
                handle.write(line + "\n")

        # save model to drive
        utils.save_model(model, "{}model.pkl".format(output_folder))
def test_data_additional(self):
    """Check additional and position-wise data, then train and plot."""
    meta = self.data_pwm.meta
    self.assertTrue(len(meta) == 2)
    self.assertTrue(meta[0]['is_categorical'] == False)
    self.assertTrue(meta[1]['is_categorical'] == True)

    # The numerical feature ramps up from 1 to 16 and back down to 1.
    ramp = list(range(1, 17)) + list(range(16, 0, -1))
    self.assertTrue(meta[0]['data'] == ramp)

    categorical = meta[1]['data']
    self.assertTrue(len(categorical) == 32)
    for encoded in categorical:
        self.assertTrue(sum(encoded) == 1)
    self.assertTrue((categorical[0] == categorical[31]).all())
    self.assertTrue((categorical[13] == categorical[18]).all())

    addi = self.data_pwm._get_additional_data([0, 1, 15, 16], 0, 4)
    self.assertTrue(len(addi) == 4)
    # (expected numerical value, index of the expected categorical entry)
    expected_rows = [(1, 0), (2, 1), (16, 15), (16, 16)]
    for row, (value, source_idx) in enumerate(expected_rows):
        self.assertTrue(
            np.allclose(addi[row], [value, *categorical[source_idx]]))

    # check position-wise additional data
    self.assertTrue(len(self.data_pwm.positionwise) == 2)
    self.assertTrue(
        list(self.data_pwm.positionwise.keys()) == ["feat1", "feat2"])
    dat = next(self.data_pwm._data_generator("all", 32, False, False))
    self.assertTrue(dat[1].shape == (32, 17))
    batch = dat[0]
    self.assertTrue(batch.shape == (32, 10, 14))
    # (sample index, feature channel, expected values along the positions)
    expected_tracks = [
        (0, 12, [0.9, 0.8, 0.7, 0.9, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]),
        (31, 13, [0.1, 0.2, 0.3, 0.1, 1.0, 1.0, 0.8, 0.3, 0.2, 0.1]),
        (31, 12, [2.1, 2.2, 2.3, 2.1, 2.0, 2.0, 2.8, 2.3, 2.2, 2.1]),
    ]
    for sample, channel, values in expected_tracks:
        self.assertTrue(np.allclose(batch[sample, :, channel], values))

    model_params = dict(conv_num=1, kernel_num=2, kernel_len=4,
                        neuron_num=2, epochs=2)
    mod = Model(model_params, self.data_pwm)
    mod.train(self.data_pwm, verbose=True)
    predictions = mod.predict(self.data_pwm, "all")
    self.assertTrue(predictions.shape == (32, 2))

    # check kernel output plot for position-wise data
    folder = gettempdir() + '/'
    acts = mod.get_max_activations(self.data_pwm, 'all')
    _motif, _score = mod.visualize_kernel(acts, self.data_pwm, 0, folder)
    with Image.open(folder + "additional_features_kernel_0.png") as img:
        self.assertTrue(img.size == (500, 1400))
    written_plots = (
        "additional_features_kernel_0.png",
        "motif_kernel_0.png",
        "position_kernel_0.png",
        "activations_kernel_0.png",
    )
    for plot_name in written_plots:
        remove(folder + plot_name)