Example #1
    def optimize_policy(self, itr, samples_data):
        policy_opt_input_values = self._policy_opt_input_values(samples_data)

        # Train policy network
        logger.log("Computing loss before")
        loss_before = self.optimizer.loss(policy_opt_input_values)
        logger.log("Computing KL before")
        policy_kl_before = self.f_policy_kl(*policy_opt_input_values)
        logger.log("Optimizing")
        self.optimizer.optimize(policy_opt_input_values)
        logger.log("Computing KL after")
        policy_kl = self.f_policy_kl(*policy_opt_input_values)
        logger.log("Computing loss after")
        loss_after = self.optimizer.loss(policy_opt_input_values)
        logger.record_tabular("{}/LossBefore".format(self.policy.name),
                              loss_before)
        logger.record_tabular("{}/LossAfter".format(self.policy.name),
                              loss_after)
        logger.record_tabular("{}/dLoss".format(self.policy.name),
                              loss_before - loss_after)
        logger.record_tabular("{}/KLBefore".format(self.policy.name),
                              policy_kl_before)
        logger.record_tabular("{}/KL".format(self.policy.name), policy_kl)
        pol_ent = self.f_policy_entropy(*policy_opt_input_values)
        logger.record_tabular("{}/Entropy".format(self.policy.name),
                              np.mean(pol_ent))

        self._fit_baseline(samples_data)
Example #2
 def _iam_terrain_generator(self, regen):
     """
     When parallel processing, don't want each worker to generate its own
     terrain. This method ensures that one worker generates the terrain,
     which is then used by other workers. It's still possible to have each
     worker use their own terrain by passing each worker a different hfield
     and texture dir.
     """
     if not os.path.exists(self.hfield_dir):
         os.makedirs(self.hfield_dir)
     terrain_path = os.path.join(self.hfield_dir, self.HFIELD_FNAME)
     lock_path = self._get_lock_path()
     if regen or not os.path.exists(terrain_path):
         # use a simple lock file to prevent different workers overwriting
         # the file, and/or running their own unique terrains
         if not os.path.exists(lock_path):
             with open(lock_path, 'w') as f:
                 f.write(str(os.getpid()))
             return True
         else:
             # wait for the worker that's generating the terrain to finish
             total = 0
             logger.log(
                 "Process {0} waiting for terrain generation...".format(
                     os.getpid()))
             while os.path.exists(lock_path) and total < 120:
                 time.sleep(5)
                 total += 5
             if os.path.exists(lock_path):
                 raise ("Process {0} timed out waiting for terrain "
                        "generation, or stale lock file").format(
                            os.getpid())
             logger.log("Done.")
             return False
     return False
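
The docstring above describes a simple lock-file handshake: the first worker to create the lock generates the shared terrain, and everyone else waits for the lock to disappear. Below is a minimal, self-contained sketch of the same pattern; the paths, timeout, and helper name are hypothetical, and it uses O_CREAT | O_EXCL so that creating the lock is atomic rather than a separate exists()/open() pair.

import os
import time

LOCK_PATH = "/tmp/terrain.lock"      # hypothetical lock-file location
TERRAIN_PATH = "/tmp/terrain.bin"    # hypothetical shared artifact

def i_am_generator(timeout=30, poll=1):
    """Return True if this process should generate the shared terrain."""
    if os.path.exists(TERRAIN_PATH):
        return False
    try:
        # Atomic create-if-absent: exactly one worker wins the lock.
        fd = os.open(LOCK_PATH, os.O_CREAT | os.O_EXCL | os.O_WRONLY)
        os.write(fd, str(os.getpid()).encode())
        os.close(fd)
        return True
    except FileExistsError:
        waited = 0
        while os.path.exists(LOCK_PATH) and waited < timeout:
            time.sleep(poll)
            waited += poll
        if os.path.exists(LOCK_PATH):
            raise RuntimeError("Timed out waiting for terrain generation, "
                               "or stale lock file")
        return False

The worker that wins the lock would then generate the terrain (as _gen_terrain does in a later example) and presumably remove the lock file so the waiting workers can proceed.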
Example #3
    def train(self):
        plotter = Plotter()
        if self.plot:
            plotter.init_plot(self.env, self.policy)
        self.start_worker()
        self.init_opt()
        for itr in range(self.current_itr, self.n_itr):
            with logger.prefix('itr #%d | ' % itr):
                paths = self.sampler.obtain_samples(itr)
                samples_data = self.sampler.process_samples(itr, paths)
                self.log_diagnostics(paths)
                self.optimize_policy(itr, samples_data)
                logger.log("saving snapshot...")
                params = self.get_itr_snapshot(itr, samples_data)
                self.current_itr = itr + 1
                params["algo"] = self
                if self.store_paths:
                    params["paths"] = samples_data["paths"]
                logger.save_itr_params(itr, params)
                logger.log("saved")
                logger.dump_tabular(with_prefix=False)
                if self.plot:
                    plotter.update_plot(self.policy, self.max_path_length)
                    if self.pause_for_plot:
                        input("Plotting evaluation run: Press Enter to "
                              "continue...")

        plotter.close()
        self.shutdown_worker()
Example #4
    def optimize_policy(self, itr, **kwargs):
        paths = self.obtain_samples(itr)

        samples_data = self.process_samples(itr, paths)
        if self._save_sample_frequency > 0 and itr % self._save_sample_frequency == 0:
            self.save_samples(itr, samples_data)
        self.log_diagnostics(paths)

        policy_opt_input_values = self._policy_opt_input_values(samples_data)
        inference_opt_input_values = self._inference_opt_input_values(
            samples_data)

        self.train_policy_and_embedding_networks(policy_opt_input_values)
        self.train_inference_network(inference_opt_input_values)

        samples_data = self.evaluate(policy_opt_input_values, samples_data)
        self.visualize_distribution(samples_data)

        # Fit baseline
        logger.log("Fitting baseline...")
        if hasattr(self.baseline, 'fit_with_samples'):
            self.baseline.fit_with_samples(paths, samples_data)
        else:
            self.baseline.fit(paths)

        return self.get_itr_snapshot(itr, samples_data)
Example #5
    def optimize_policy(self, itr, samples_data):
        policy_opt_input_values = self._policy_opt_input_values(samples_data)

        # Train policy network
        logger.log("Computing loss before")
        loss_before = self.optimizer.loss(policy_opt_input_values)
        logger.log("Computing KL before")
        policy_kl_before = self.f_policy_kl(*policy_opt_input_values)
        logger.log("Optimizing")
        self.optimizer.optimize(policy_opt_input_values)
        logger.log("Computing KL after")
        policy_kl = self.f_policy_kl(*policy_opt_input_values)
        logger.log("Computing loss after")
        loss_after = self.optimizer.loss(policy_opt_input_values)
        logger.record_tabular("{}/LossBefore".format(self.policy.name),
                              loss_before)
        logger.record_tabular("{}/LossAfter".format(self.policy.name),
                              loss_after)
        logger.record_tabular("{}/dLoss".format(self.policy.name),
                              loss_before - loss_after)
        logger.record_tabular("{}/KLBefore".format(self.policy.name),
                              policy_kl_before)
        logger.record_tabular("{}/KL".format(self.policy.name), policy_kl)

        pol_ent = self.f_policy_entropy(*policy_opt_input_values)
        logger.record_tabular("{}/Entropy".format(self.policy.name),
                              np.mean(pol_ent))

        num_traj = self.batch_size // self.max_path_length
        actions = samples_data["actions"][:num_traj, ...]
        logger.record_histogram("{}/Actions".format(self.policy.name), actions)

        self._fit_baseline(samples_data)
Example #6
    def train(self, sess=None):
        created_session = sess is None
        if sess is None:
            sess = tf.Session()
            sess.__enter__()

        sess.run(tf.global_variables_initializer())

        self.start_worker(sess)
        start_time = time.time()
        for itr in range(self.start_itr, self.n_itr):
            itr_start_time = time.time()
            with logger.prefix('itr #%d | ' % itr):
                params = self.optimize_policy(itr)
                if self.plot:
                    self.plotter.update_plot(self.policy, self.max_path_length)
                    if self.pause_for_plot:
                        input("Plotting evaluation run: Press Enter to "
                              "continue...")
                logger.log("Saving snapshot...")
                logger.save_itr_params(itr, params)
                logger.log("Saved")
                logger.record_tabular('IterTime', time.time() - itr_start_time)
                logger.record_tabular('Time', time.time() - start_time)
                logger.dump_tabular()
        self.shutdown_worker()
        if created_session:
            sess.close()
Example #7
 def _memory_selection(self, itr):
     logger.log('Memory selection...')
     self.model_paths = self.memory_selection.select_paths_subset(
         self.all_paths, self.all_Rs)
     self.softmax_ids = self.softmax_selection.select_paths_subset(
         self.all_paths, self.all_Rs, return_indices=True)
     self.feed = self.model.get_feed(self.model_paths)
Example #8
    def train_policy_and_embedding_networks(self, policy_opt_input_values):
        """ Joint optimization of policy and embedding networks """

        logger.log("Computing loss before")
        loss_before = self.optimizer.loss(policy_opt_input_values)

        logger.log("Computing KL before")
        policy_kl_before = self.f_policy_kl(*policy_opt_input_values)
        embed_kl_before = self.f_embedding_kl(*policy_opt_input_values)

        logger.log("Optimizing")
        self.optimizer.optimize(policy_opt_input_values)

        logger.log("Computing KL after")
        policy_kl = self.f_policy_kl(*policy_opt_input_values)
        embed_kl = self.f_embedding_kl(*policy_opt_input_values)

        logger.log("Computing loss after")
        loss_after = self.optimizer.loss(policy_opt_input_values)

        logger.record_tabular('Policy/LossBefore', loss_before)
        logger.record_tabular('Policy/LossAfter', loss_after)
        logger.record_tabular('Policy/KLBefore', policy_kl_before)
        logger.record_tabular('Policy/KL', policy_kl)
        logger.record_tabular('Policy/dLoss', loss_before - loss_after)
        logger.record_tabular('Embedding/KLBefore', embed_kl_before)
        logger.record_tabular('Embedding/KL', embed_kl)

        return loss_after
Example #9
File: npo.py Project: gntoni/garage
 def optimize_policy(self, itr, samples_data):
     all_input_values = tuple(
         ext.extract(samples_data, "observations", "actions", "advantages"))
     agent_infos = samples_data["agent_infos"]
     state_info_list = [agent_infos[k] for k in self.policy.state_info_keys]
     dist_info_list = [
         agent_infos[k] for k in self.policy.distribution.dist_info_keys
     ]
     all_input_values += tuple(state_info_list) + tuple(dist_info_list)
     if self.policy.recurrent:
         all_input_values += (samples_data["valids"], )
     logger.log("Computing loss before")
     loss_before = self.optimizer.loss(all_input_values)
     logger.log("Computing KL before")
     mean_kl_before = self.optimizer.constraint_val(all_input_values)
     logger.log("Optimizing")
     self.optimizer.optimize(all_input_values)
     logger.log("Computing KL after")
     mean_kl = self.optimizer.constraint_val(all_input_values)
     logger.log("Computing loss after")
     loss_after = self.optimizer.loss(all_input_values)
     logger.record_tabular('LossBefore', loss_before)
     logger.record_tabular('LossAfter', loss_after)
     logger.record_tabular('MeanKLBefore', mean_kl_before)
     logger.record_tabular('MeanKL', mean_kl)
     logger.record_tabular('dLoss', loss_before - loss_after)
     return dict()
Example #10
    def outer_optimize(self, samples_data):
        logger.log("optimizing policy")
        observations = ext.extract(samples_data, "observations")
        actions = ext.extract(samples_data, "actions")
        advantages = ext.extract(samples_data, "advantages")

        num_traj = len(samples_data["paths"])

        observations = observations[0].reshape(
            -1, self.env.spec.observation_space.shape[0])
        actions = actions[0].reshape(-1, self.env.spec.action_space.shape[0])
        advantages = advantages[0].reshape(-1)
        inputs = tuple([observations, actions, advantages])

        s_g = self._opt_fun["f_train"](*(list(inputs)))
        #s_g = [x / num_traj for x in s_g]
        self.gradient_backup = copy.deepcopy(s_g)
        g_flat = self.flatten_parameters(s_g)

        loss_before = self._opt_fun["f_loss"](*(list(inputs)))
        self.backup_policy.set_param_values(
            self.policy.get_param_values(trainable=True), trainable=True)
        self.optimizer.optimize(inputs, g_flat)
        loss_after = self._opt_fun["f_loss"](*(list(inputs)))
        logger.record_tabular("LossBefore", loss_before)
        logger.record_tabular("LossAfter", loss_after)

        mean_kl, max_kl = self._opt_fun['f_kl'](*(list(inputs)))
        logger.record_tabular('MeanKL', mean_kl)
        logger.record_tabular('MaxKL', max_kl)
Example #11
 def train_once(self, itr, paths):
     itr_start_time = time.time()
     with logger.prefix('itr #%d | ' % itr):
         self.log_diagnostics(paths)
         logger.log("Optimizing policy...")
         self.optimize_policy(itr, paths)
         logger.record_tabular('IterTime', time.time() - itr_start_time)
         logger.dump_tabular()
Example #12
def worker_init_envs(G, alloc, scope, env):
    logger.log("initializing environment on worker %d" % G.worker_id)
    if not hasattr(G, 'parallel_vec_envs'):
        G.parallel_vec_envs = dict()
        G.parallel_vec_env_template = dict()
    G.parallel_vec_envs[scope] = [(idx, pickle.loads(pickle.dumps(env)))
                                  for idx in alloc]
    G.parallel_vec_env_template[scope] = env
Example #13
    def train(self):
        self._initiate_training()

        for itr in range(self.n_itr):
            self._training_step(itr)

        logger.log('Terminate training')
        self.sess.close()
Example #14
    def optimize(self, inputs, extra_inputs=None, callback=None, name=None):
        with tf.name_scope(name, "optimize", values=[inputs, extra_inputs]):

            if not inputs:
                # Assumes that we should always sample mini-batches
                raise NotImplementedError

            f_loss = self._opt_fun["f_loss"]

            if extra_inputs is None:
                extra_inputs = tuple()

            last_loss = f_loss(*(tuple(inputs) + extra_inputs))

            start_time = time.time()

            dataset = BatchDataset(inputs,
                                   self._batch_size,
                                   extra_inputs=extra_inputs)

            sess = tf.get_default_session()

            for epoch in range(self._max_epochs):
                if self._verbose:
                    logger.log("Epoch %d" % (epoch))
                    progbar = pyprind.ProgBar(len(inputs[0]))

                for batch in dataset.iterate(update=True):
                    sess.run(self._train_op,
                             dict(list(zip(self._input_vars, batch))))
                    if self._verbose:
                        progbar.update(len(batch[0]))

                if self._verbose:
                    if progbar.active:
                        progbar.stop()

                new_loss = f_loss(*(tuple(inputs) + extra_inputs))

                if self._verbose:
                    logger.log("Epoch: %d | Loss: %f" % (epoch, new_loss))
                if self._callback or callback:
                    elapsed = time.time() - start_time
                    callback_args = dict(
                        loss=new_loss,
                        params=self._target.get_param_values(
                            trainable=True) if self._target else None,
                        itr=epoch,
                        elapsed=elapsed,
                    )
                    if self._callback:
                        self._callback(callback_args)
                    if callback:
                        callback(**callback_args)

                if abs(last_loss - new_loss) < self._tolerance:
                    break
                last_loss = new_loss
Example #15
    def optimize(self, inputs, extra_inputs=None):

        if not inputs:
            # Assumes that we should always sample mini-batches
            raise NotImplementedError

        f_loss = self._opt_fun["f_loss"]
        f_grad = self._opt_fun["f_grad"]
        f_grad_tilde = self._opt_fun["f_grad_tilde"]

        inputs = tuple(inputs)
        if extra_inputs is None:
            extra_inputs = tuple()
        else:
            extra_inputs = tuple(extra_inputs)

        param = np.copy(self._target.get_param_values(trainable=True))
        logger.log("Start SVRPG optimization: #parameters: %d, #inputs %d" %
                   (len(param), len(inputs[0])))
        dataset = BatchDataset(inputs,
                               self._batch_size,
                               extra_inputs=extra_inputs)
        start_time = time.time()

        for epoch in range(self._max_epochs):
            if self._verbose:
                logger.log("Epoch %d" % (epoch))
                progbar = pyprind.ProgBar(len(inputs[0]))
            grad_sum = np.zeros_like(param)
            g_mean_tilde = f_grad_tilde(*(inputs + extra_inputs))
            logger.record_tabular('g_mean_tilde', LA.norm(g_mean_tilde))
            print("-------------mini-batch-------------------")
            num_batch = 0
            while num_batch < self._max_batch:
                batch = dataset.random_batch()
                g = f_grad(*(batch)) - f_grad_tilde(*(batch)) + g_mean_tilde
                grad_sum += g
                prev_w = np.copy(self._target.get_param_values(trainable=True))
                step = self._alpha * g
                cur_w = prev_w + step
                self._target.set_param_values(cur_w, trainable=True)
                num_batch += 1
            print("max batch achieved {:}".format(num_batch))
            grad_sum /= 1.0 * num_batch
            logger.record_tabular('gdist', LA.norm(grad_sum - g_mean_tilde))
            cur_w = np.copy(self._target.get_param_values(trainable=True))
            w_tilde = self._target_tilde.get_param_values(trainable=True)
            self._target_tilde.set_param_values(cur_w, trainable=True)
            logger.record_tabular('wnorm', LA.norm(cur_w))
            logger.record_tabular('w_dist',
                                  LA.norm(cur_w - w_tilde) / LA.norm(cur_w))

            if self._verbose:
                if progbar.active:
                    progbar.stop()
            if abs(LA.norm(cur_w - w_tilde) /
                   LA.norm(cur_w)) < self._tolerance:
                break
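
The inner loop above applies the SVRG-style control variate used by SVRPG: the mini-batch gradient at the current parameters is corrected by subtracting the same mini-batch gradient at the snapshot parameters and adding the full snapshot gradient, after which the snapshot is refreshed. A minimal NumPy sketch of that correction on a toy least-squares objective (all names hypothetical; the toy descends on a loss, whereas the optimizer above ascends on the policy objective):

import numpy as np

rng = np.random.default_rng(0)
A = rng.normal(size=(100, 5))
b = rng.normal(size=100)

def grad(w, idx):
    """Average gradient of 0.5 * ||A w - b||^2 over the rows in idx."""
    return A[idx].T @ (A[idx] @ w - b[idx]) / len(idx)

w = np.zeros(5)           # current parameters (cur_w above)
w_tilde = w.copy()        # snapshot parameters (w_tilde above)
alpha = 1e-2

for epoch in range(10):
    g_mean_tilde = grad(w_tilde, np.arange(100))   # full gradient at the snapshot
    for _ in range(20):
        idx = rng.choice(100, size=10, replace=False)
        # variance-reduced gradient: g(w) - g(w_tilde) + g_mean_tilde
        g = grad(w, idx) - grad(w_tilde, idx) + g_mean_tilde
        w = w - alpha * g
    w_tilde = w.copy()    # refresh the snapshot, as set_param_values does above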
Example #16
 def _gen_terrain(self, regen=True):
     logger.log("Process {0} generating terrain...".format(os.getpid()))
     x, y, hfield = terrain.generate_hills(40, 40, 500)
     hfield = self._mod_hfield(hfield)
     terrain.save_heightfield(
         x, y, hfield, self.HFIELD_FNAME, path=self.hfield_dir)
     terrain.save_texture(
         x, y, hfield, self.TEXTURE_FNAME, path=self.texturedir)
     logger.log("Generated.")
Example #17
    def optimize_gen(self,
                     inputs,
                     extra_inputs=None,
                     callback=None,
                     yield_itr=None):

        if len(inputs) == 0:
            # Assumes that we should always sample mini-batches
            raise NotImplementedError

        f_opt = self._opt_fun["f_opt"]
        f_loss = self._opt_fun["f_loss"]

        if extra_inputs is None:
            extra_inputs = tuple()

        last_loss = f_loss(*(tuple(inputs) + extra_inputs))

        start_time = time.time()

        dataset = BatchDataset(
            inputs,
            self._batch_size,
            extra_inputs=extra_inputs
            #, randomized=self._randomized
        )

        itr = 0
        for epoch in pyprind.prog_bar(list(range(self._max_epochs))):
            for batch in dataset.iterate(update=True):
                f_opt(*batch)
                if yield_itr is not None and (itr % (yield_itr + 1)) == 0:
                    yield
                itr += 1

            new_loss = f_loss(*(tuple(inputs) + extra_inputs))
            if self._verbose:
                logger.log("Epoch %d, loss %s" % (epoch, new_loss))

            if self._callback or callback:
                elapsed = time.time() - start_time
                callback_args = dict(
                    loss=new_loss,
                    params=self._target.get_param_values(trainable=True)
                    if self._target else None,
                    itr=epoch,
                    elapsed=elapsed,
                )
                if self._callback:
                    self._callback(callback_args)
                if callback:
                    callback(**callback_args)

            if abs(last_loss - new_loss) < self._tolerance:
                break
            last_loss = new_loss
Example #18
    def decide_new_skill(self, samples_data):
        """
        Decide if new skill should be made. If yes, return also start and end observations for training.
        :param samples_data: processed sampled data:
                dict(observations, actions, advantages, rewards, returns, valids, agent_infos, env_infos, paths)
        :return: (bool: make new skill, start_obss, end_obss)
        """
        min_length = 2
        max_length = 5
        action_map = None  # {0: 's', 1: 'L', 2: 'R'}
        min_f_score = 2
        max_results = 10
        aggregations = []  # sublist of ['mean', 'most_freq', 'nearest_mean', 'medoid'] or 'all'
        f_score_step_factor = 1.5

        paths = samples_data['paths']
        path_trie = PathTrie(self._hrl_policy.num_skills)
        for path in paths:
            actions = path['actions'].argmax(axis=1).tolist()
            observations = path['observations']
            path_trie.add_all_subpaths(
                actions,
                observations,
                min_length=min_length,
                max_length=max_length
            )
        logger.log('Searched {} rollouts'.format(len(paths)))

        frequent_paths = path_trie.items(
            action_map=action_map,
            min_count=10,
            min_f_score=min_f_score,
            max_results=max_results,
            aggregations=aggregations
        )
        logger.log('Found {} frequent paths: [index, actions, count, f-score]'.format(len(frequent_paths)))
        for i, f_path in enumerate(frequent_paths):
            logger.log('    {:2}: {:{pad}}\t{}\t{:.3f}'.format(
                i,
                str(f_path['actions']),
                f_path['count'],
                f_path['f_score'],
                pad=max_length*3))

        # if self._added_skills > 0:
        #     return False, None  # DEBUG add only one skill
        if len(frequent_paths) == 0:
            return False, None
        top_subpath = frequent_paths[0]
        prev_f_score, self._last_f_score = self._last_f_score, top_subpath['f_score']
        if self._last_f_score > prev_f_score * f_score_step_factor:
            logger.log('Decided to make new skill, since its f-score {} > {} * {}'.format(self._last_f_score, f_score_step_factor, prev_f_score))
            logger.log('New skill is based on subpath: {}'.format(top_subpath['actions']))
            self._added_skills += 1
            return True, top_subpath
        return False, None
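
The acceptance test at the end of the method is a relative-improvement check on the best subpath's f-score. A tiny worked example of the same rule, with made-up numbers:

f_score_step_factor = 1.5
prev_f_score = 2.0   # f-score of the previous top subpath (hypothetical)
top_f_score = 3.4    # f-score of the current top subpath (hypothetical)

make_new_skill = top_f_score > prev_f_score * f_score_step_factor
# 3.4 > 2.0 * 1.5 = 3.0, so a new skill would be created in this case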
Example #19
def populate_task(env, policy, scope=None):
    logger.log("Populating workers...")
    if singleton_pool.n_parallel > 1:
        singleton_pool.run_each(_worker_populate_task, [
            (pickle.dumps(env), pickle.dumps(policy), scope)
        ] * singleton_pool.n_parallel)
    else:
        # avoid unnecessary copying
        g = _get_scoped_g(singleton_pool.G, scope)
        g.env = env
        g.policy = policy
    logger.log("Populated")
Example #20
    def _policy_optimization(self, itr):
        logger.log('Policy optimization...')
        self._log_progress('Pre/')

        # Choose optimization starting parameters
        if itr == 0:
            x = self.all_params[np.argmax(self.all_Rs)]
        else:
            x = self.rollout_params.copy()

        # Define target objective based on surrogate model
        if itr == 0 or (np.max(self.all_Rs) / self.max_R) > 1.1:
            self.max_R = np.max(np.abs(self.all_Rs))

            var_list = self.model.policy.get_params(trainable=True)

            target = self.model.J_test / self.max_R + (
                self.model.ess_test / self.max_paths) * self.ess_penalty
            target_grad = flatgrad(target, var_list)

            def fun_and_jac(x):
                self.model.policy.set_param_values(x, trainable=True)
                return self.sess.run([target, target_grad], self.feed)

            self.fun_and_jac = fun_and_jac

        # Hard reset policy if stuck in bad local minimum
        if self.reset and itr > 10:
            sorted_R = sorted(self.all_Rs)
            cutoff = sorted_R[int(len(self.all_Rs) * self.explore_thresh)]
            print("Cutoff: ", cutoff)
            if self.all_Rs[-1] <= cutoff:
                loc = np.mean(
                    [self.all_params[idx] for idx in self.softmax_ids], axis=0)
                scale = np.std(self.all_params[-10:],
                               axis=0) * self.explore_width
                scale = np.minimum(scale, self.explore_limit)
                x = np.random.normal(loc=loc,
                                     scale=scale,
                                     size=(self.model.n_params))
                print(
                    "_______________________________________________EXPLORING with mean scale: ",
                    np.mean(scale))

        # SGD-based optimization
        self.optimizer.fun_and_jac = self.fun_and_jac
        x, f, fs = self.optimizer.optimize(x,
                                           step_size=1e-2 *
                                           np.exp(self.model.log_std))
        self.rollout_params = x.copy()

        self._log_progress('Post/')
Example #21
    def train_inference_network(self, inference_opt_input_values):
        """ Optimize inference network """

        logger.log("Optimizing inference network...")
        infer_loss_before = self.inference_optimizer.loss(
            inference_opt_input_values)
        logger.record_tabular('Inference/Loss', infer_loss_before)
        self.inference_optimizer.optimize(inference_opt_input_values)
        infer_loss_after = self.inference_optimizer.loss(
            inference_opt_input_values)
        logger.record_tabular('Inference/dLoss',
                              infer_loss_before - infer_loss_after)

        return infer_loss_after
Example #22
def worker_run_reset(G, flags, scope):
    if not hasattr(G, 'parallel_vec_envs'):
        logger.log("on worker %d" % G.worker_id)
        import traceback
        for line in traceback.format_stack():
            logger.log(line)
        # log the stacktrace at least
        logger.log("oops")
        for k, v in G.__dict__.items():
            logger.log(str(k) + " : " + str(v))
        assert hasattr(G, 'parallel_vec_envs')

    assert scope in G.parallel_vec_envs
    N = len(G.parallel_vec_envs[scope])
    env_template = G.parallel_vec_env_template[scope]
    obs_dim = env_template.observation_space.flat_dim
    ret_arr = np.zeros((N, obs_dim))
    ids = []
    flat_obs = []
    reset_ids = []
    for itr_idx, (idx, env) in enumerate(G.parallel_vec_envs[scope]):
        flag = flags[idx]
        if flag:
            flat_obs.append(env.reset())
            reset_ids.append(itr_idx)
        ids.append(idx)
    if len(reset_ids) > 0:
        ret_arr[reset_ids] = env_template.observation_space.flatten_n(flat_obs)
    return ids, ret_arr
Example #23
    def optimize_policy(self, itr, samples_data):
        policy_opt_input_values = self._policy_opt_input_values(samples_data)
        inference_opt_input_values = self._inference_opt_input_values(
            samples_data)

        self.train_policy_and_embedding_networks(policy_opt_input_values)
        self.train_inference_network(inference_opt_input_values)

        samples_data = self.evaluate(policy_opt_input_values, samples_data)
        self.visualize_distribution(samples_data)

        # Fit baseline
        logger.log("Fitting baseline...")
        if hasattr(self.baseline, 'fit_with_samples'):
            self.baseline.fit_with_samples(samples_data['paths'], samples_data)
        else:
            self.baseline.fit(samples_data['paths'])
Example #24
    def _bookkeeping(self):
        logger.log('Processing samples...')

        Rs = [sum(path['rewards']) for path in self.paths]
        Hs = [len(path['rewards']) for path in self.paths]
        current_params = [path['policy_params'] for path in self.paths]

        self.all_paths += self.paths
        self.all_params += current_params
        self.all_Rs += Rs
        self.all_Hs += Hs

        idx = np.argmax(Rs)
        if Rs[idx] > self.best_R:
            logger.log('Found new best policy parameters!...')
            self.best_R = Rs[idx]
            self.best_policy_params = current_params[idx].copy()
Example #25
    def _log_diagnostics(self, itr):
        logger.log('Logging diagnostics...')

        undisc_returns = [sum(path["rewards"]) for path in self.paths]

        logger.record_tabular('Iteration', itr)
        logger.record_tabular('Paths/NumTrajs', len(self.paths))
        logger.record_tabular(
            'Paths/Steps',
            np.mean([len(path['rewards']) for path in self.paths]))
        logger.record_tabular('Paths/Return/Best_Policy_Avg',
                              self.avg_ret_best_policy)
        logger.record_tabular('Paths/Return/Mean', np.mean(undisc_returns))
        logger.record_tabular('Paths/Return/Std', np.std(undisc_returns))
        logger.record_tabular('Paths/Return/Max', np.max(undisc_returns))
        logger.record_tabular('Paths/Return/Min', np.min(undisc_returns))

        self.model.policy.log_diagnostics(self.paths)
Example #26
    def _initiate_training(self):
        """
        Initiates the training process by creating a tensorflow session
        and sampling 5 differnt policies which are generated by perturbing
        the initial policies parameters with random gaussian noise.
        """

        logger.log('Initiate training...')

        self.start_time = time.time()

        self.sess = tf.Session()
        self.sess.__enter__()
        self.sess.run(tf.global_variables_initializer())

        current_mean = self.model.policy.get_param_values(trainable=True)
        current_std = 1.0
        self.rollout_params = np.random.normal(loc=current_mean,
                                               scale=current_std,
                                               size=(5, self.model.n_params))
Example #27
    def _fit_baseline(self, samples_data):
        """ Update baselines from samples. """

        policy_opt_input_values = self._policy_opt_input_values(samples_data)

        # Augment reward from baselines
        rewards_tensor = self.f_rewards(*policy_opt_input_values)
        returns_tensor = self.f_returns(*policy_opt_input_values)
        returns_tensor = np.squeeze(returns_tensor)

        paths = samples_data["paths"]
        valids = samples_data["valids"]
        baselines = [path["baselines"] for path in paths]

        # Recompute parts of samples_data
        aug_rewards = []
        aug_returns = []
        for rew, ret, val, path in zip(rewards_tensor, returns_tensor, valids,
                                       paths):
            path["rewards"] = rew[val.astype(np.bool)]
            path["returns"] = ret[val.astype(np.bool)]
            aug_rewards.append(path["rewards"])
            aug_returns.append(path["returns"])
        aug_rewards = tensor_utils.concat_tensor_list(aug_rewards)
        aug_returns = tensor_utils.concat_tensor_list(aug_returns)
        samples_data["rewards"] = aug_rewards
        samples_data["returns"] = aug_returns

        # Calculate explained variance
        ev = special.explained_variance_1d(np.concatenate(baselines),
                                           aug_returns)
        logger.record_tabular(
            "{}/ExplainedVariance".format(self.baseline.name), ev)

        # Fit baseline
        logger.log("Fitting baseline...")
        if hasattr(self.baseline, "fit_with_samples"):
            self.baseline.fit_with_samples(paths, samples_data)
        else:
            self.baseline.fit(paths)
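
The explained-variance figure recorded above measures how much of the variance of the recomputed returns the baseline predictions account for (1.0 is a perfect fit, 0 or less means the baseline is no better than a constant). A minimal sketch of the usual 1-D definition, which is what special.explained_variance_1d is assumed to compute:

import numpy as np

def explained_variance_1d(ypred, y):
    """1 - Var(y - ypred) / Var(y), with a guard for constant y."""
    vary = np.var(y)
    if np.isclose(vary, 0):
        return 0.0
    return 1 - np.var(y - ypred) / vary

baselines = np.array([1.0, 2.0, 3.0])   # predicted values (hypothetical)
returns = np.array([1.1, 1.9, 3.2])     # empirical returns (hypothetical)
print(explained_variance_1d(baselines, returns))  # ~0.98: the baseline tracks the returns closely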
Example #28
    def train(self):
        address = ("localhost", 6000)
        conn = Client(address)
        try:
            plotter = Plotter()
            if self.plot:
                plotter.init_plot(self.env, self.policy)
            conn.send(ExpLifecycle.START)
            self.start_worker()
            self.init_opt()
            for itr in range(self.current_itr, self.n_itr):
                with logger.prefix('itr #%d | ' % itr):
                    conn.send(ExpLifecycle.OBTAIN_SAMPLES)
                    paths = self.sampler.obtain_samples(itr)
                    conn.send(ExpLifecycle.PROCESS_SAMPLES)
                    samples_data = self.sampler.process_samples(itr, paths)
                    self.log_diagnostics(paths)
                    conn.send(ExpLifecycle.OPTIMIZE_POLICY)
                    self.optimize_policy(itr, samples_data)
                    logger.log("saving snapshot...")
                    params = self.get_itr_snapshot(itr, samples_data)
                    self.current_itr = itr + 1
                    params["algo"] = self
                    if self.store_paths:
                        params["paths"] = samples_data["paths"]
                    logger.save_itr_params(itr, params)
                    logger.log("saved")
                    logger.dump_tabular(with_prefix=False)
                    if self.plot:
                        conn.send(ExpLifecycle.UPDATE_PLOT)
                        plotter.update_plot(self.policy, self.max_path_length)
                        if self.pause_for_plot:
                            input("Plotting evaluation run: Press Enter to "
                                  "continue...")

            conn.send(ExpLifecycle.SHUTDOWN)
            plotter.close()
            self.shutdown_worker()
        finally:
            conn.close()
Example #29
File: vpg.py Project: gntoni/garage
    def optimize_policy(self, itr, samples_data):
        logger.log("optimizing policy")
        inputs = ext.extract(samples_data, "observations", "actions",
                             "advantages")
        agent_infos = samples_data["agent_infos"]
        state_info_list = [agent_infos[k] for k in self.policy.state_info_keys]
        inputs += tuple(state_info_list)
        if self.policy.recurrent:
            inputs += (samples_data["valids"], )
        dist_info_list = [
            agent_infos[k] for k in self.policy.distribution.dist_info_keys
        ]
        loss_before = self.optimizer.loss(inputs)
        self.optimizer.optimize(inputs)
        loss_after = self.optimizer.loss(inputs)
        logger.record_tabular("LossBefore", loss_before)
        logger.record_tabular("LossAfter", loss_after)

        mean_kl, max_kl = self.opt_info['f_kl'](
            *(list(inputs) + dist_info_list))
        logger.record_tabular('MeanKL', mean_kl)
        logger.record_tabular('MaxKL', max_kl)
Example #30
 def _reset_real(self):
     """
     reset the real
     """
     # randomize start position of blocks
     for block in self._blocks:
         block_random_delta = np.zeros(2)
         new_pos = block.initial_pos
         while np.linalg.norm(block_random_delta) < 0.1:
             block_random_delta = np.random.uniform(
                 -block.random_delta_range,
                 block.random_delta_range,
                 size=2)
         new_pos.x += block_random_delta[0]
         new_pos.y += block_random_delta[1]
         logger.log('new position for {} is x = {}, y = {}, z = {}'.format(
             block.name, new_pos.x, new_pos.y, new_pos.z))
         ready = False
         while not ready:
             ans = input(
                 'Have you finished setting up {}?[Yes/No]\n'.format(
                     block.name))
             if ans.lower() == 'yes' or ans.lower() == 'y':
                 ready = True
Example #31
    def __init__(self, env_name, record_video=False, video_schedule=None, log_dir=None, record_log=False,
                 force_reset=True):
        if log_dir is None:
            if logger.get_snapshot_dir() is None:
                logger.log("Warning: skipping Gym environment monitoring since snapshot_dir not configured.")
            else:
                log_dir = os.path.join(logger.get_snapshot_dir(), "gym_log")
        Serializable.quick_init(self, locals())

        env = gym.envs.make(env_name)

        # HACK: Gets rid of the TimeLimit wrapper that sets 'done = True' when
        # the time limit specified for each environment has been passed and
        # therefore the environment is not Markovian (terminal condition depends
        # on time rather than state).
        env = env.env

        self.env = env
        self.env_id = env.spec.id

        assert not (not record_log and record_video)

        if log_dir is None or record_log is False:
            self.monitoring = False
        else:
            if not record_video:
                video_schedule = NoVideoSchedule()
            else:
                if video_schedule is None:
                    video_schedule = CappedCubicVideoSchedule()
            self.env = gym.wrappers.Monitor(self.env, log_dir, video_callable=video_schedule, force=True)
            self.monitoring = True

        self._observation_space = convert_gym_space(env.observation_space)
        logger.log("observation space: {}".format(self._observation_space))
        self._action_space = convert_gym_space(env.action_space)
        logger.log("action space: {}".format(self._action_space))
        self._horizon = env.spec.tags['wrapper_config.TimeLimit.max_episode_steps']
        self._log_dir = log_dir
        self._force_reset = force_reset
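
The HACK comment above relies on classic gym behaviour: gym.envs.make returns the environment wrapped in a TimeLimit object, and the raw environment sits on its .env attribute. A minimal sketch of the same unwrapping, assuming an old-style gym installation and a hypothetical environment id:

import gym

wrapped = gym.make("CartPole-v0")   # in classic gym this is TimeLimit(CartPoleEnv)
raw = wrapped.env                   # strip the wrapper, as env = env.env does above
print(type(wrapped).__name__, "->", type(raw).__name__)

Without the unwrapping, the wrapper sets done = True purely because the step limit was hit, which is the non-Markovian termination the comment warns about.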