Example #1
    def train(self, data, verbose=True):
        """ Train all models and return the best one.

        Models are evaluated and ranked according to their ROC-AUC on a validation data set.

        Parameters
        ----------
        data: pysster.Data
            A Data object providing training and validation data sets.
        
        verbose: bool
            If True, progress information (train/val loss) will be printed throughout the training.

        Returns
        -------
        results: tuple(pysster.Model, str)
            The best performing model and an overview table of all models are returned.
        """
        best_model_path = "{}/{}".format(
            gettempdir(),
            ''.join(random.choice(string.ascii_uppercase) for _ in range(20)))
        aucs = []
        max_auroc = -1
        for i, candidate in enumerate(self.candidates):
            model = Model(candidate, data)
            model.train(data, verbose)
            predictions = model.predict(data, "val")
            labels = data.get_labels("val")
            report = utils.performance_report(labels, predictions)
            # support-weighted average over all classes; column 3 holds the ROC-AUC
            roc_auc = np.sum(report[:, 0:-1] * report[:, -1, np.newaxis],
                             axis=0)
            roc_auc = (roc_auc / np.sum(report[:, -1]))[3]
            aucs.append(roc_auc)
            if aucs[-1] > max_auroc:
                max_auroc = aucs[-1]
                utils.save_model(model, best_model_path)
            K.clear_session()
            K.reset_uids()
            if not verbose: continue
            print("\n=== Summary ===")
            print("Model {}/{} = {:.5f} weighted avg roc-auc".format(
                i + 1, len(self.candidates), aucs[i]))
            for param in candidate:
                if param not in ["input_shape"]:
                    print(" - {}: {}".format(param, candidate[param]))
        # load the best model (and remove it from disk)
        model = utils.load_model(best_model_path)
        remove(best_model_path)
        remove("{}.h5".format(best_model_path))
        # build a formatted summary of all trained models
        table = self._grid_search_table(aucs)
        return model, table
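
A minimal usage sketch for the method above, assuming the surrounding pysster Grid_Search class and placeholder FASTA files; the parameter grid and the train_val_test_split call are illustrative assumptions, not taken from the examples on this page:

from pysster.Data import Data
from pysster.Grid_Search import Grid_Search

# placeholder input files, one FASTA file per class
data = Data(["class0.fasta", "class1.fasta"], "ACGT")
data.train_val_test_split(0.7, 0.15)  # assumed helper providing the "train"/"val" handles

# hypothetical parameter grid; every combination becomes one candidate model
params = {"conv_num": [1, 2], "kernel_num": [20, 50]}
search = Grid_Search(params)
model, table = search.train(data, verbose=True)  # best candidate by weighted ROC-AUC
print(table)
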
Example #2
    def test_utils_save_load_model(self):
        # round trip: save_model writes a params file plus a companion .h5 weights file
        utils.save_model(self.m1, gettempdir() + "/model")
        self.assertTrue(isfile(gettempdir() + "/model"))
        self.assertTrue(isfile(gettempdir() + "/model.h5"))
        model = utils.load_model(gettempdir() + "/model")
        self.assertTrue(self.m1.params == model.params)
        self.assertTrue(self.m1.model.get_config() == model.model.get_config())
        for x in range(6):
            self.assertTrue(
                np.allclose(self.m1.model.get_weights()[x],
                            model.model.get_weights()[x]))
        remove(gettempdir() + "/model")
        remove(gettempdir() + "/model.h5")
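
For reference, the round trip exercised by this test looks as follows outside the test harness, continuing with the best model returned by the Grid_Search sketch after Example #1; the path is a placeholder, but the two files written (a parameter file plus a companion .h5 weights file) are exactly what the assertions above check for:

from tempfile import gettempdir
from pysster import utils

path = gettempdir() + "/my_model"  # placeholder location
utils.save_model(model, path)      # writes "my_model" (params) and "my_model.h5" (weights)
restored = utils.load_model(path)  # rebuilds the model from both files
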
Example #3
    def train(self, data, pr_auc=False, verbose=True):
        """ Train all models and return the best one.

        Models are evaluated and ranked according to their ROC-AUC or PR-AUC (precision-recall)
        on a validation data set.

        Parameters
        ----------
        data: pysster.Data
            A Data object providing training and validation data sets.
        
        pr_auc: bool
            If True, the area under the precision-recall curve will be maximized instead of the area under the ROC curve.

        verbose: bool
            If True, progress information (train/val loss) will be printed throughout the training.

        Returns
        -------
        results: tuple(pysster.Model, str)
            The best performing model and an overview table of all models are returned.
        """
        best_model_path = "{}/{}".format(
            gettempdir(),
            ''.join(random.choice(string.ascii_uppercase) for _ in range(20)))
        if pr_auc:
            metric_idx = 4
            metric_name = "pre-auc"
        else:
            metric_idx = 3
            metric_name = "roc-auc"
        metric = []
        max_metric = -1
        for i, candidate in enumerate(self.candidates):
            model = Model(candidate, data)
            model.train(data, verbose)
            predictions = model.predict(data, "val")
            labels = data.get_labels("val")
            report = utils.performance_report(labels, predictions)
            # support-weighted average over all classes, then pick the requested metric column
            metric_val = np.sum(report[:, 0:-1] * report[:, -1, np.newaxis],
                                axis=0)
            metric_val = (metric_val / np.sum(report[:, -1]))[metric_idx]
            metric.append(metric_val)
            if metric[-1] > max_metric:
                max_metric = metric[-1]
                utils.save_model(model, best_model_path)
            K.clear_session()
            K.reset_uids()
            if not verbose: continue
            print("\n=== Summary ===")
            print("Model {}/{} = {:.5f} weighted avg {}".format(
                i + 1, len(self.candidates), metric[i], metric_name))
            for param in candidate:
                if param not in ["input_shape"]:
                    print(" - {}: {}".format(param, candidate[param]))
        # load the best model (and remove it from disk)
        model = utils.load_model(best_model_path)
        remove(best_model_path)
        remove("{}.h5".format(best_model_path))
        # build a formatted summary of all trained models
        table = self._grid_search_table(metric, metric_name)
        return model, table
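
Usage is identical to Example #1 except that pr_auc=True ranks candidates by the weighted precision-recall AUC; a short sketch continuing the hypothetical search object and data set from the sketch after Example #1:

model, table = search.train(data, pr_auc=True, verbose=False)
print(table)  # candidates ranked by weighted PR-AUC instead of ROC-AUC
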
Example #4
import os
from pysster.Data import Data
from pysster import utils
from IPython.display import Image
DATA = "/mnt/isilon/dbhi_bfx/perry/brian/"
#establish output directory
output_folder = DATA + "explore-cgi/data/interim/cgi_ind_exp/pysster_output/model_test_run_10_17_18_2_feats/"
if not os.path.isdir(output_folder):
    os.makedirs(output_folder)

#load the pysster prediction model
model = utils.load_model(
    "/mnt/isilon/dbhi_bfx/perry/brian/explore_cgi/data/interim/cgi_ind_exp/pysster_output/run_10_17_18_2_feats/model.pkl"
)

add_cgi_features = [
    DATA +
    "explore-cgi/data/interim/cgi_ind_exp/add_feat/cgi.indel.unsample__microsat.out"
]

add_both_features = [x.replace('cgi.', 'both.') for x in add_cgi_features]

indel_len_feat = [
    DATA +
    "explore-cgi/data/interim/cgi_ind_exp/add_feat/cgi.indel.unsample__indel_length.out",
    DATA +
    "explore-cgi/data/interim/cgi_ind_exp/add_feat/both.indel.unsample__indel_length.out"
]

#load the data set into a pysster Data object
data = Data([
Example #5
import os
from pysster.Data import Data
from pysster import utils
from IPython.display import Image
DATA = "/mnt/isilon/dbhi_bfx/perry/brian/"
#establish output directory
output_folder = DATA + "explore-cgi/data/interim/cgi_ind_exp/pysster_output/model_test_run_1_4_19_kav_8k_each_all_feats/"
if not os.path.isdir(output_folder):
    os.makedirs(output_folder)

#load the pysster prediction model
model = utils.load_model(
    "/mnt/isilon/dbhi_bfx/perry/brian/explore_cgi/data/interim/cgi_ind_exp/pysster_output/train_run_1_4_19_kav_8k_each/model.pkl"
)

add_cgi_features = [
    DATA +
    "explore-cgi/data/interim/cgi_ind_exp/add_feat/cgi.indel.unsample__microsat.out",
    DATA +
    "explore-cgi/data/interim/cgi_ind_exp/add_feat/cgi.indel.unsample__lowmappabilityall.out",
    DATA +
    "explore-cgi/data/interim/cgi_ind_exp/add_feat/cgi.indel.unsample__notinlowmappabilityall.out",
    DATA +
    "explore-cgi/data/interim/cgi_ind_exp/add_feat/cgi.indel.unsample__siren_similarRegions_dist1.out",
    DATA +
    "explore-cgi/data/interim/cgi_ind_exp/add_feat/cgi.indel.unsample__segdupall.out",
    DATA +
    "explore-cgi/data/interim/cgi_ind_exp/add_feat/cgi.indel.unsample__notinsegdupall.out",
    DATA +
    "explore-cgi/data/interim/cgi_ind_exp/add_feat/cgi.indel.unsample__notinrefseq_union_cds.sort.out",
    DATA +
Example #6
import os
from pysster.Data import Data
from pysster import utils
from IPython.display import Image
DATA = "/mnt/isilon/dbhi_bfx/perry/brian/"
#establish output directory
output_folder = DATA + "explore-cgi/data/interim/cgi_ind_exp/pysster_output/model_run_12_5_18_tar_cgi_kav_both/"
if not os.path.isdir(output_folder):
    os.makedirs(output_folder)

#load the pysster prediction model
model = utils.load_model(
    "/mnt/isilon/dbhi_bfx/perry/brian/explore_cgi/data/interim/cgi_ind_exp/pysster_output/train_run_10_18_18_all_add_feats_back/model.pkl"
)

add_cgi_features = [
    DATA +
    "explore-cgi/data/interim/target_exp/add_feats/cgi.indel.microsat.out",
    DATA +
    "explore-cgi/data/interim/target_exp/add_feats/cgi.indel.lowmappabilityall.out",
    DATA +
    "explore-cgi/data/interim/target_exp/add_feats/cgi.indel.notinlowmappabilityall.out",
    DATA +
    "explore-cgi/data/interim/target_exp/add_feats/cgi.indel.siren_similarRegions_dist1.out",
    DATA +
    "explore-cgi/data/interim/target_exp/add_feats/cgi.indel.segdupall.out",
    DATA +
    "explore-cgi/data/interim/target_exp/add_feats/cgi.indel.notinsegdupall.out",
    DATA +
    "explore-cgi/data/interim/target_exp/add_feats/cgi.indel.notinrefseq_union_cds.sort.out",
    DATA +
Example #7
import os
from pysster.Data import Data
from pysster import utils
from IPython.display import Image
DATA = "/mnt/isilon/dbhi_bfx/perry/brian/"
#establish output directory
output_folder = DATA + "explore-cgi/data/interim/cgi_ind_exp/pysster_output/model_test_run_ref_fa_10_24_18/"
if not os.path.isdir(output_folder):
    os.makedirs(output_folder)

#load the pysster prediction model
model = utils.load_model(
    "/mnt/isilon/dbhi_bfx/perry/brian/explore_cgi/data/interim/cgi_ind_exp/pysster_output/train_run_ref_seq_only_sampled_10_22_18/model.pkl"
)

#load the data set into a pysster Data object
data = Data([
    DATA +
    "explore-cgi/data/interim/cgi_ind_exp/pysster_ref_fa/cgi.indel.unsample.fa.gz",
    DATA +
    "explore-cgi/data/interim/cgi_ind_exp/pysster_ref_fa/both.indel.unsample.fa.gz"
], ("ACGT"))

#run the pysster model on the whole data set
predictions = model.predict(data, "all")
predictions

labels = data.get_labels("all")
labels

utils.plot_roc(labels, predictions, output_folder + "roc.png")
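
The same labels and predictions can also be summarized numerically with utils.performance_report, applying the support-weighted averaging used by the grid search in Examples #1 and #3 (column 3 of the report holds the per-class ROC-AUC, the last column the class support):

import numpy as np

report = utils.performance_report(labels, predictions)
# support-weighted average over all classes, then pick the ROC-AUC column
weighted = np.sum(report[:, 0:-1] * report[:, -1, np.newaxis], axis=0)
weighted_roc_auc = (weighted / np.sum(report[:, -1]))[3]
print("weighted avg roc-auc: {:.5f}".format(weighted_roc_auc))
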

#establish the output directory and derive the CSV file name for labels and predictions from it
output_folder = DATA + "explore-cgi/data/interim/cgi_ind_exp/pysster_output/model_run_12_21_18_kav_cgi_8k_samp_kav_both_8k_samp/"
if not os.path.isdir(output_folder):
    os.makedirs(output_folder)

class_file_name = output_folder.split('/')[-2]

#load the pysster prediction model
model = utils.load_model(
    "/mnt/isilon/dbhi_bfx/perry/brian/explore_cgi/data/interim/target_exp/pysster_output/target_train_model_no_add_feats_12_21_18/model.pkl"
)

#load the data set into a pysster Data object
data = Data([
    DATA +
    "explore-cgi/data/interim/cgi_ind_exp/pysster_fa/cgi.indel.sample.fa.gz",
    DATA +
    "explore-cgi/data/interim/cgi_ind_exp/pysster_fa/both.indel.sample.fa.gz"
], ("ACGT", "XDI"))

#run the pysster model on the whole data set
predictions = model.predict(data, "all")

labels = data.get_labels("all")
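
The comment above mentions deriving a name for a CSV file of labels and predictions; a minimal sketch of that final step, assuming a plain side-by-side column layout (one-hot labels first, predicted class probabilities second):

import numpy as np

# hypothetical CSV dump named after the run folder: one row per sequence
label_mat = np.asarray(labels, dtype=float)  # one-hot labels from data.get_labels("all")
np.savetxt(output_folder + class_file_name + ".csv",
           np.hstack([label_mat, np.asarray(predictions)]),
           delimiter=",", fmt="%.6f")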