def sample_from_model(kwargs, model, size, x):
    K.clear_session()
    log = logging.getLogger("sample_from_model")
    try:
        model_folder = resolve_dir(kwargs['model_dir'])

        log.info(
            "sampling with parameters, conf: %s, kwargs: %s, model_folder: %s, "
            "size: %d", conf, kwargs, model_folder, size)

        device = get_device()

        log.info("device to be used: %s", device)

        log.info("model: %s, x.shape: %s", model, x.shape)

        with tf.device(device):
            model = rm_utils.load_model_and_params(
                os.path.join(model_folder, "best_model"))

        log.info("model loaded")

        samples = model.sample(size, tf.convert_to_tensor(x))

        log.info("model samples.shape: %s", samples.shape)
        return samples
    except Exception:
        error_msg = traceback.format_exc()
        log.error("Sampling failed: %s", error_msg)
    finally:
        log.info("Sampling completed")
Example #2
    def generate_data(self):
        data_train, data_validate, data_test = self.load_data_no_discrete_normalised_as_array(
            resolve_dir(self.file))
        self._train_size = data_train.shape[0]
        self._validate_size = data_validate.shape[0]
        self._test_size = data_test.shape[0]
        return np.r_[data_train, data_validate, data_test]
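np.r_ stacks the three splits along the first axis, so the sizes recorded just above can slice the combined array apart again later. A quick self-contained illustration:

import numpy as np

a = np.zeros((2, 3))
b = np.ones((1, 3))
assert np.r_[a, b].shape == (3, 3)  # rows of a followed by rows of b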
def predict_from_model(y, x, kwargs, op_names, model):
    K.clear_session()
    log = logging.getLogger("predict_from_model")
    try:
        model_folder = resolve_dir(kwargs['model_dir'])
        device = get_device()

        log.info(
            "using dev %s, kwargs: %s, conf: %s, model: %s, model_folder: %s, op_names: %s",
            device, kwargs, conf, model, model_folder, op_names)

        log.info("date received y.shape: %s, x.shape: %s", y.shape, x.shape)

        with tf.device(device):
            model = rm_utils.load_model_and_params(
                os.path.join(model_folder, "best_model"))

        marginals = kwargs['marginals']

        log.info("preparing dataset start")
        dataset = tf.data.Dataset.from_tensor_slices((y, x))
        dataset = dataset.repeat(1)
        dataset = dataset.prefetch(3 * conf.eval_batch_size)
        dataset = dataset.batch(conf.eval_batch_size)
        log.info("preparing dataset completed")

        collector = rm_utils.InMemoryCollector()
        log.info("Starting collecting prediction")
        for i, (y, x) in enumerate(dataset):
            results = {}
            for op_name in op_names:
                res = None
                log.info(
                    "Running at %d, op_name: %s, y.shape: %s, x.shape: %s", i,
                    op_name, y.shape, x.shape)
                if op_name == "log_likelihood":
                    ll = model.log_prob(y, x, marginals=marginals)
                    log.info(
                        "log_prob completed with %s",
                        "list of len: {}, shapes: {}".format(
                            len(ll), ", ".join([str(l.shape) for l in ll])) if
                        isinstance(ll, list) else "array of shape: {}".format(
                            str(ll.shape)))
                    if isinstance(ll, list):
                        ll = np.concatenate(ll, axis=1)
                    log.info("concatenate completed")
                    results["log_likelihood"] = ll
                else:
                    raise ValueError(
                        "not recognized op name: {}".format(op_name))
            collector.collect(results)
            log.info("collect completed")

        return collector.result()
    except Exception:
        error_msg = traceback.format_exc()
        log.error("Error in predict: %s", error_msg)
    def load_estimator(self, model_dir, params, **kwargs):
        estimator = TfEstimator(self.model,
                                resolve_dir(model_dir),
                                params,
                                x_size=kwargs["x_size"],
                                y_size=kwargs["y_size"])
        estimator.restore(not_exists_ok=kwargs.get('not_exists_ok', False))
        return estimator
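A hedged usage sketch for load_estimator; the factory instance, directory, and x_size/y_size values are assumptions for illustration:

# restore(not_exists_ok=False) is expected to raise when no checkpoint exists.
estimator = factory.load_estimator('{ROOT}/my_experiment/model', params={},
                                   x_size=4, y_size=2)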
Example #5
    def generate_data(self):
        # read the splits inside a context manager so the file is closed;
        # np.r_ materializes the h5py datasets into memory before it exits
        with h5py.File(resolve_dir(self.file), 'r') as f:
            data_train = f['train']
            data_validate = f['validation']
            data_test = f['test']

            self._train_size = data_train.shape[0]
            self._validate_size = data_validate.shape[0]
            self._test_size = data_test.shape[0]
            return np.r_[data_train, data_validate, data_test]
    def __call__(self, *varg, **kwargs):
        from models.tensorboard import Tensorboard

        conf.visible_device_list = self.conf_override.visible_device_list

        model_dir = self.model_dir(**kwargs)
        resolved_dir = resolve_dir(model_dir)

        self.log = logging.getLogger(self.__class__.__name__)

        stats_model_dir = resolve_dir(os.path.join(model_dir, "stats"))
        tb = Tensorboard(stats_model_dir)
        try:
            os.makedirs(stats_model_dir, exist_ok=True)

            tb.log_dict_as_table("conf", conf.__dict__)
            tb.log_dict_as_table("model_conf", conf.__dict__)
            tb.log_dict_as_table("early_stopping",
                                 self.early_stopping.__dict__)

            model_dir, train_stats, validation_stats, test_stats = self.call_me(
                model_dir, *varg, **kwargs)

            tb.log_dict_as_table("train_stats", train_stats)
            tb.log_dict_as_table("validation_stats", validation_stats)
            tb.log_dict_as_table("test_stats", test_stats)

            return model_dir, train_stats, validation_stats, test_stats

        except Exception:
            error_msg = traceback.format_exc()
            self.log.error(error_msg)
            tb.log_dict_as_table("message", {'error': error_msg})
            self.log_error(kwargs, resolved_dir)
            return None, None, None, None
        finally:
            tb.close()
    def call_me(self, model_dir, *varg, **kwargs):
        resolved_model_dir = resolve_dir(model_dir)
        kwargs['model_dir'] = resolved_model_dir

        self.data_loader.load_data()

        self.log.info("About to train the model")

        create_model_and_train(kwargs, resolved_model_dir, self.data_loader,
                               self.model.name(), self.early_stopping)

        self.log.info("Train done")

        train = create_model_and_validate(kwargs,
                                          resolved_model_dir, self.data_loader,
                                          self.model.name(), "train")
        valid = create_model_and_validate(kwargs,
                                          resolved_model_dir, self.data_loader,
                                          self.model.name(), "valid")
        test = create_model_and_validate(kwargs,
                                         resolved_model_dir, self.data_loader,
                                         self.model.name(), "test")

        def ll_stats(result):
            # pack the (ll_mean, ll_se) pair, NaN when evaluation returned nothing
            if result is None:
                return {'ll_mean': np.nan, 'll_se': np.nan}
            return {'ll_mean': result[0], 'll_se': result[1]}

        res_train = ll_stats(train)
        res_valid = ll_stats(valid)
        res_test = ll_stats(test)

        self.log.info("Training and validation complete")
        return model_dir, res_train, res_valid, res_test
Example #8
    def generate_data(self):
        data = pd.read_pickle(resolve_dir(self.file))
        data.drop("Meth", axis=1, inplace=True)
        data.drop("Eth", axis=1, inplace=True)
        data.drop("Time", axis=1, inplace=True)

        B = self.get_correlation_numbers(data)

        while np.any(B > 1):
            col_to_remove = np.where(B > 1)[0][0]
            col_name = data.columns[col_to_remove]
            data.drop(col_name, axis=1, inplace=True)
            B = self.get_correlation_numbers(data)
        # print(data.corr())
        data = (data - data.mean()) / data.std()

        # pandas removed DataFrame.as_matrix(); to_numpy() is the replacement
        return data.to_numpy()
Example #9
def save_best_model_exp(name, opt):
    with open(
            os.path.join(gen_utils.resolve_dir(conf.dir),
                         experiment_file(name)), 'w') as f:
        data = OrderedDict()
        data["model_factory"] = opt.model_factory.state
        data["data_set_info"] = opt.data_set_info
        data["best_model_valid_ll"] = opt.best_model_valid_ll
        data["best_model_train_ll"] = opt.best_model_train_ll
        data["best_model_test_ll"] = opt.best_model_test_ll
        data["best_model_params"] = opt.best_model_params
        data["best_model_dir"] = opt.best_model_dir

        data["hyper_param_search_results"] = opt.hyper_param_search.results
        data["hyper_param_search"] = opt.hyper_param_search.state
        data["conf"] = conf.__dict__

        json.dump(data, f, cls=NumpyEncoder, indent=4)
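The counterpart read uses the same conf.dir and experiment_file(name) path, as the load() method in Example #15 below does:

with open(os.path.join(gen_utils.resolve_dir(conf.dir),
                       experiment_file(name)), 'r') as f:
    data = json.load(f)  # the OrderedDict written above, as plain dicts/lists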
Example #10
    def __init__(self, experiment_name, persist_load_exp=True):
        self._data_loader = None
        self._model_factory = None
        self._hyper_param_search = None
        self._valid_batch_size = None
        self._early_stopping = None
        self._persist_load_exp = persist_load_exp

        self.best_model_valid_ll = None
        self.best_model_train_ll = None
        self.best_model_test_ll = None
        self.best_model_params = None
        self.best_model_dir = None

        self.data_set_info = None

        conf.dir = os.path.join('{ROOT}', experiment_name)
        resolved_dir = resolve_dir(conf.dir)
        os.makedirs(resolved_dir, exist_ok=True)
        self.log = None
Example #11
def compute_mi(x, params, kwargs, y_size):

    model_folder = resolve_dir(params['model_dir'])

    all_y_vars = ["y%d" % i for i in range(y_size)]

    mi = np.full((y_size, y_size), np.nan, dtype=np.float32)

    def run_for_combination(y_i, y_j):
        device = get_device()

        def prob(**point):
            # one keyword per y-variable; renamed from **kwargs to avoid
            # shadowing the outer kwargs that is forwarded to tf_utils.mi
            y_list = [point[y_var] for y_var in all_y_vars]
            y = tf.concat(y_list, axis=-1)
            x_broadcasted = tf.broadcast_to(
                x, [tf.shape(y)[0], tf.shape(x)[-1]])
            return model.prob(y, x=x_broadcasted, training=False)

        with tf.device(device):
            model = utils.load_model_and_params(
                os.path.join(model_folder, "best_model"))

        integrate_out = [
            "y%d" % i for i in set(range(y_size)) - set([y_i, y_j])
        ]
        return tf_utils.mi(prob,
                           var1="y%d" % y_i,
                           var2="y%d" % y_j,
                           integrate_out=integrate_out,
                           **kwargs)

    all_combinations = [(y_i, y_j) for y_i in range(y_size)
                        for y_j in range(y_i + 1, y_size)]

    with multiprocessing.Pool(maxtasksperchild=1) as pool:
        mis = pool.starmap(run_for_combination, all_combinations)

    for (y_i, y_j), value in zip(all_combinations, mis):
        mi[y_i, y_j] = value

    return mi
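An illustrative call; the conditioning point, model directory, and y_size are made up. The result is upper-triangular: mi[i, j] holds the mutual information of y_i and y_j for i < j, NaN elsewhere:

mi = compute_mi(x=np.zeros((1, 2), dtype=np.float32),
                params={'model_dir': '{ROOT}/my_experiment'},
                kwargs={}, y_size=3)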
Example #12
    def generate_data(self):
        rng = np.random.RandomState(42)

        data = np.load(resolve_dir(self.file))
        rng.shuffle(data)
        N = data.shape[0]

        data = np.delete(data, 3, axis=1)
        data = np.delete(data, 1, axis=1)
        ############################
        # Add noise
        ############################
        # global_intensity_noise = 0.1*rng.rand(N, 1)
        voltage_noise = 0.01 * rng.rand(N, 1)
        # grp_noise = 0.001*rng.rand(N, 1)
        gap_noise = 0.001 * rng.rand(N, 1)
        sm_noise = rng.rand(N, 3)
        time_noise = np.zeros((N, 1))
        # noise = np.hstack((gap_noise, grp_noise, voltage_noise, global_intensity_noise, sm_noise, time_noise))
        # noise = np.hstack((gap_noise, grp_noise, voltage_noise, sm_noise, time_noise))
        noise = np.hstack((gap_noise, voltage_noise, sm_noise, time_noise))
        data = data + noise
        return data
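The uniform noise added above appears to be the usual dequantization trick: the raw measurements are discretized, and adding sub-grid uniform jitter spreads each value over its quantization bin so a continuous density model is well-defined. A minimal self-contained illustration:

import numpy as np

rng = np.random.RandomState(0)
quantized = np.round(rng.rand(5, 1), 2)          # values on a 0.01 grid
dequantized = quantized + 0.01 * rng.rand(5, 1)  # jitter within each bin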
Example #13
    def generate_data(self):
        return np.load(resolve_dir(self.file))
Example #14
    def data_file(self, type, ext):
        return os.path.join(resolve_dir(conf.dir), self._file_name(type, ext))
Example #15
    def load(self, complain_on_diff=False, model_name=None):
        if not self._persist_load_exp:
            raise IOError()

        name = model_name if model_name is not None else self.model_factory.model_name
        with open(os.path.join(resolve_dir(conf.dir), experiment_file(name)), 'r') as f:
            data = json.load(f)

        self.best_model_valid_ll = data["best_model_valid_ll"]
        self.best_model_train_ll = data["best_model_train_ll"]
        self.best_model_test_ll = data["best_model_test_ll"]
        self.best_model_params = data["best_model_params"]
        self.best_model_dir = data["best_model_dir"]
        self.data_set_info = data["data_set_info"]
        hyper_param_search_results = data["hyper_param_search_results"]
        hyper_param_search_state = data["hyper_param_search"]
        model_factory_state = data["model_factory"]

        # JSON stores tuples as lists; restore tuple bounds in the search space
        for v in hyper_param_search_state["space"]:
            if 'bounds' in v and isinstance(v['bounds'][0], list):
                v['bounds'] = [tuple(element) for element in v['bounds']]

        if complain_on_diff:
            # only settings that affect the experiment results are compared,
            # so e.g. the number of workers is allowed to change between runs
            affecting_experiment = conf.values_affecting_experiment()
            if affecting_experiment != {key: value for key, value in data["conf"].items() if
                                        key in affecting_experiment}:
                raise ValueError("Running conf different than loaded one")

            if self.hyper_param_search.state != hyper_param_search_state:
                raise ValueError("hyper_param_search")

            if self.model_factory.state != model_factory_state:
                raise ValueError("model_factory")

            if self.data_loader.state != self.data_set_info:
                raise ValueError("data_loader")

        if self.hyper_param_search is None:
            self.hyper_param_search = HyperParamSearch.from_state(hyper_param_search_state)

        if self.model_factory is None:
            self.model_factory = TrainEvalModelFactory.from_state(model_factory_state)

        if self.data_loader is None:
            data_loader = DataLoader.from_state(self.data_set_info)
            data_loader.load_from_file()
            self.data_loader = data_loader

        for hyper_param_step in hyper_param_search_results:
            point = tuple(
                tuple(hyper_param_step['x'][dim.name])
                if isinstance(hyper_param_step['x'][dim.name], list)
                else hyper_param_step['x'][dim.name]
                for dim in self.hyper_param_search.space)
            self.hyper_param_search.tell(
                point,
                hyper_param_step['train'],
                hyper_param_step['validation'],
                hyper_param_step['test'],
                hyper_param_step.get('model_dir', ""))
Example #16
    def run(self):
        init_logging(os.path.join(resolve_dir(conf.dir), "output_{}.log".format(self.model_factory.model.name())))
        self.log = logging.getLogger("experiment")
        try:
            self.load(complain_on_diff=True)
            print("Loaded %s" % os.path.join(resolve_dir(conf.dir), experiment_file(self.model_factory.model_name)))
        except IOError:
            print("Clean run")

        futures = []
        if conf.print_progress:
            progress_mon = ProgressMonitor(
                self.hyper_param_search.num_samples,
                "{model}/{data_set}".format(model=self.model_factory.model_name,
                                            data_set=self.data_set_name()))
        else:
            progress_mon = NoOpProgressMonitor()

        done = self.hyper_param_search.done
        progress_mon.progress(done)

        device_assignment = {device: 0 for device in conf.visible_device_list}
        self.data_loader.free()

        tasks = []
        try:
            while True:
                x = self.hyper_param_search.ask()
                objective_fun = self.model_factory.create_train_eval(self.data_loader,
                                                                     self.hyper_param_search.space_names,
                                                                     self.early_stopping)
                objective_fun.conf_override = deepcopy(conf)
                args_named = self.hyper_param_search.to_named_params(x)
                tasks.append(WorkItem(objective_fun, x, args_named, None))
        except StopIteration:
            pass

        with (SameProcessExecutor() if conf.num_workers <= 0 else concurrent.futures.ProcessPoolExecutor(
                conf.num_workers)) as executor:
            task_id = 0
            submitted_during_round = False
            while len(futures) > 0 or len(tasks) > 0:
                if task_id == 0:
                    submitted_during_round = False
                    # bring tasks that are restricted wrt device to the front of the queue
                    for i in range(len(tasks)):
                        task = tasks[i]
                        if np.any([re.search(pattern, task.name) for pattern, dev in conf.device_placement_mapping]):
                            del tasks[i]
                            tasks.insert(0, task)

                made_round = task_id == len(tasks)
                submit = False
                if (len(futures) < conf.num_workers or conf.num_workers <= 0) and len(tasks) > 0:
                    task_id = task_id % len(tasks)
                    next_wi = tasks[task_id]
                    allowed_devices = list(device_assignment.keys())
                    if len(device_assignment) > 0:

                        for pattern, dev in conf.device_placement_mapping:
                            if re.search(pattern, next_wi.name):
                                allowed_devices = [dev]
                                break

                        allowed_device_assignment = {k: v for k, v in device_assignment.items() if
                                                     k in allowed_devices}
                        if len(allowed_device_assignment) > 0:
                            device = sorted([(t[1], t[0]) for t in list(allowed_device_assignment.items())])[0][1]
                            device_assignment[device] += 1
                            next_wi.objective_fun.conf_override.visible_device_list = [device]
                            submit = True

                    else:
                        next_wi.objective_fun.conf_override.visible_device_list = []
                        submit = True

                    if submit:
                        time.sleep(10)
                        submitted_during_round = True
                        next_wi.future = executor.submit(next_wi.objective_fun, **next_wi.args_named)
                        futures.append(next_wi)

                        del tasks[task_id]
                    else:
                        task_id += 1

                for wi in list(futures):
                    try:
                        model_dir, train_eval, validation_eval, test_eval = wi.future.result(0)

                        if len(device_assignment) > 0:
                            # release the device slot held by this work item; the
                            # counts are integers, so no epsilon test is needed
                            device = wi.objective_fun.conf_override.visible_device_list[0]
                            device_assignment[device] -= 1

                        self.train_eval_task_finished(
                            futures, wi, model_dir, train_eval, validation_eval, test_eval)
                        done += 1

                        progress_mon.progress(done)
                    except concurrent.futures.TimeoutError:
                        pass

                if (len(futures) != 0 and len(futures) == conf.num_workers) or (
                        made_round and not submitted_during_round):
                    time.sleep(5)

        for wi in list(futures):
            model_dir, train_eval, validation_eval, test_eval = wi.future.result()
            self.train_eval_task_finished(futures, wi, model_dir, train_eval, validation_eval, test_eval)
            done += 1
            progress_mon.progress(done)

        self.save()
Example #17
    def load_data(self, generator_fn=None):
        try:
            self.load_from_file()
            print("loaded data: %s" % self._file_name("data", "memmap"))
        except IOError:
            os.makedirs(resolve_dir(conf.dir), exist_ok=True)
            data = (self.generate_data()
                    if generator_fn is None else generator_fn())
            data = data.astype(getattr(np, "float%s" % conf.precision))
            if self.uniqueness_threshold is not None:
                cat_columns = []
                for i in range(data.shape[1]):
                    uniqueness = len(np.unique(data[:, i])) / len(data[:, i])
                    if uniqueness < self.uniqueness_threshold:
                        cat_columns.append(i)
                        print("cat column: %d" % i)
                non_cat_columns = [
                    i for i in range(data.shape[1]) if i not in cat_columns
                ]
                data = data[:, non_cat_columns]

            if self.y_transforms is not None:
                for y_transform in self.y_transforms:
                    y_transform.transform(data)

            def save_split(name, array):
                # persist one split as a float memmap plus a JSON sidecar
                # recording the dtype and shape needed to re-open it later
                memmap = np.memmap(self.data_file(name, "memmap"),
                                   dtype=getattr(np, "float" + conf.precision),
                                   mode="w+",
                                   shape=array.shape)
                memmap[:] = array[:]
                with open(self.data_file(name, "json"), 'w') as out:
                    out.write(
                        json.dumps(
                            {
                                'dtype': "float" + conf.precision,
                                'shape': array.shape
                            }, indent=4))
                return memmap

            self._data = save_split('data', data)
            del data

            train, validation, test = self.sample_cross_validation()

            self._train_data = save_split('train', train)
            del train
            self._validation_data = save_split('validation', validation)
            del validation
            self._test_data = save_split('test', test)
            del test

            print("generated and saved data: %s" %
                  self._file_name("data", "memmap"))