Code example #1
    def optimize_policy(self, itr, samples_data):
        policy_opt_input_values = self._policy_opt_input_values(samples_data)

        # Train policy network
        logger.log("Computing loss before")
        loss_before = self.optimizer.loss(policy_opt_input_values)
        logger.log("Computing KL before")
        policy_kl_before = self.f_policy_kl(*policy_opt_input_values)
        logger.log("Optimizing")
        self.optimizer.optimize(policy_opt_input_values)
        logger.log("Computing KL after")
        policy_kl = self.f_policy_kl(*policy_opt_input_values)
        logger.log("Computing loss after")
        loss_after = self.optimizer.loss(policy_opt_input_values)
        logger.record_tabular("{}/LossBefore".format(self.policy.name),
                              loss_before)
        logger.record_tabular("{}/LossAfter".format(self.policy.name),
                              loss_after)
        logger.record_tabular("{}/dLoss".format(self.policy.name),
                              loss_before - loss_after)
        logger.record_tabular("{}/KLBefore".format(self.policy.name),
                              policy_kl_before)
        logger.record_tabular("{}/KL".format(self.policy.name), policy_kl)
        pol_ent = self.f_policy_entropy(*policy_opt_input_values)
        logger.record_tabular("{}/Entropy".format(self.policy.name),
                              np.mean(pol_ent))

        self._fit_baseline(samples_data)
Code example #2
File: hill_env.py Project: Mee321/HAPG_exp
 def _iam_terrain_generator(self, regen):
     """
     When running in parallel, we don't want each worker to generate its
     own terrain. This method ensures that a single worker generates the
     terrain, which is then used by the other workers. It is still possible
     for each worker to use its own terrain by passing each worker a
     different hfield and texture dir.
     """
     if not os.path.exists(self.hfield_dir):
         os.makedirs(self.hfield_dir)
     terrain_path = os.path.join(self.hfield_dir, self.HFIELD_FNAME)
     lock_path = self._get_lock_path()
     if regen or not os.path.exists(terrain_path):
         # use a simple lock file to prevent different workers overwriting
         # the file, and/or running their own unique terrains
         if not os.path.exists(lock_path):
             with open(lock_path, 'w') as f:
                 f.write(str(os.getpid()))
             return True
         else:
             # wait for the worker that's generating the terrain to finish
             total = 0
             logger.log(
                 "Process {0} waiting for terrain generation...".format(
                     os.getpid()))
             while os.path.exists(lock_path) and total < 120:
                 time.sleep(5)
                 total += 5
             if os.path.exists(lock_path):
                 raise ("Process {0} timed out waiting for terrain "
                        "generation, or stale lock file").format(
                            os.getpid())
             logger.log("Done.")
             return False
     return False
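
The snippet above only acquires or waits on the lock file; a minimal companion sketch for releasing it once generation is done (the _clear_lock name is hypothetical, and it assumes the same _get_lock_path helper and os import) could look like:

 def _clear_lock(self):
     """Hypothetical helper: remove the lock file after terrain generation
     so that workers waiting in _iam_terrain_generator can proceed."""
     lock_path = self._get_lock_path()
     if os.path.exists(lock_path):
         os.remove(lock_path)
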
Code example #3
File: batch_polopt.py Project: vincentzhang/garage
    def train(self):
        plotter = Plotter()
        if self.plot:
            plotter.init_plot(self.env, self.policy)
        self.start_worker()
        self.init_opt()
        for itr in range(self.current_itr, self.n_itr):
            with logger.prefix('itr #%d | ' % itr):
                paths = self.sampler.obtain_samples(itr)
                samples_data = self.sampler.process_samples(itr, paths)
                self.log_diagnostics(paths)
                self.optimize_policy(itr, samples_data)
                logger.log("saving snapshot...")
                params = self.get_itr_snapshot(itr, samples_data)
                self.current_itr = itr + 1
                params["algo"] = self
                if self.store_paths:
                    params["paths"] = samples_data["paths"]
                logger.save_itr_params(itr, params)
                logger.log("saved")
                logger.dump_tabular(with_prefix=False)
                if self.plot:
                    plotter.update_plot(self.policy, self.max_path_length)
                    if self.pause_for_plot:
                        input("Plotting evaluation run: Press Enter to "
                              "continue...")

        plotter.close()
        self.shutdown_worker()
Code example #4
    def optimize_policy(self, itr, **kwargs):
        paths = self.obtain_samples(itr)

        samples_data = self.process_samples(itr, paths)
        if self._save_sample_frequency > 0 and itr % self._save_sample_frequency == 0:
            self.save_samples(itr, samples_data)
        self.log_diagnostics(paths)

        policy_opt_input_values = self._policy_opt_input_values(samples_data)
        inference_opt_input_values = self._inference_opt_input_values(
            samples_data)

        self.train_policy_and_embedding_networks(policy_opt_input_values)
        self.train_inference_network(inference_opt_input_values)

        samples_data = self.evaluate(policy_opt_input_values, samples_data)
        self.visualize_distribution(samples_data)

        # Fit baseline
        logger.log("Fitting baseline...")
        if hasattr(self.baseline, 'fit_with_samples'):
            self.baseline.fit_with_samples(paths, samples_data)
        else:
            self.baseline.fit(paths)

        return self.get_itr_snapshot(itr, samples_data)
Code example #5
File: npo_v1.py Project: maliesa96/fyra
    def optimize_policy(self, itr, samples_data):
        policy_opt_input_values = self._policy_opt_input_values(samples_data)

        # Train policy network
        logger.log("Computing loss before")
        loss_before = self.optimizer.loss(policy_opt_input_values)
        logger.log("Computing KL before")
        policy_kl_before = self.f_policy_kl(*policy_opt_input_values)
        logger.log("Optimizing")
        self.optimizer.optimize(policy_opt_input_values)
        logger.log("Computing KL after")
        policy_kl = self.f_policy_kl(*policy_opt_input_values)
        logger.log("Computing loss after")
        loss_after = self.optimizer.loss(policy_opt_input_values)
        logger.record_tabular("{}/LossBefore".format(self.policy.name),
                              loss_before)
        logger.record_tabular("{}/LossAfter".format(self.policy.name),
                              loss_after)
        logger.record_tabular("{}/dLoss".format(self.policy.name),
                              loss_before - loss_after)
        logger.record_tabular("{}/KLBefore".format(self.policy.name),
                              policy_kl_before)
        logger.record_tabular("{}/KL".format(self.policy.name), policy_kl)

        pol_ent = self.f_policy_entropy(*policy_opt_input_values)
        logger.record_tabular("{}/Entropy".format(self.policy.name),
                              np.mean(pol_ent))

        num_traj = self.batch_size // self.max_path_length
        actions = samples_data["actions"][:num_traj, ...]
        logger.record_histogram("{}/Actions".format(self.policy.name), actions)

        self._fit_baseline(samples_data)
Code example #6
    def train(self, sess=None):
        created_session = sess is None
        if sess is None:
            sess = tf.Session()
            sess.__enter__()

        sess.run(tf.global_variables_initializer())

        self.start_worker(sess)
        start_time = time.time()
        for itr in range(self.start_itr, self.n_itr):
            itr_start_time = time.time()
            with logger.prefix('itr #%d | ' % itr):
                params = self.optimize_policy(itr)
                if self.plot:
                    self.plotter.update_plot(self.policy, self.max_path_length)
                    if self.pause_for_plot:
                        input("Plotting evaluation run: Press Enter to "
                              "continue...")
                logger.log("Saving snapshot...")
                logger.save_itr_params(itr, params)
                logger.log("Saved")
                logger.record_tabular('IterTime', time.time() - itr_start_time)
                logger.record_tabular('Time', time.time() - start_time)
                logger.dump_tabular()
        self.shutdown_worker()
        if created_session:
            sess.close()
Code example #7
File: ddopg.py Project: stjordanis/DD_OPG
 def _memory_selection(self, itr):
     logger.log('Memory selection...')
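     # Select a subset of stored paths for the surrogate model and a
     # softmax-weighted subset of the same memory (returned as indices)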
     self.model_paths = self.memory_selection.select_paths_subset(
         self.all_paths, self.all_Rs)
     self.softmax_ids = self.softmax_selection.select_paths_subset(
         self.all_paths, self.all_Rs, return_indices=True)
     self.feed = self.model.get_feed(self.model_paths)
Code example #8
    def train_policy_and_embedding_networks(self, policy_opt_input_values):
        """ Joint optimization of policy and embedding networks """

        logger.log("Computing loss before")
        loss_before = self.optimizer.loss(policy_opt_input_values)

        logger.log("Computing KL before")
        policy_kl_before = self.f_policy_kl(*policy_opt_input_values)
        embed_kl_before = self.f_embedding_kl(*policy_opt_input_values)

        logger.log("Optimizing")
        self.optimizer.optimize(policy_opt_input_values)

        logger.log("Computing KL after")
        policy_kl = self.f_policy_kl(*policy_opt_input_values)
        embed_kl = self.f_embedding_kl(*policy_opt_input_values)

        logger.log("Computing loss after")
        loss_after = self.optimizer.loss(policy_opt_input_values)

        logger.record_tabular('Policy/LossBefore', loss_before)
        logger.record_tabular('Policy/LossAfter', loss_after)
        logger.record_tabular('Policy/KLBefore', policy_kl_before)
        logger.record_tabular('Policy/KL', policy_kl)
        logger.record_tabular('Policy/dLoss', loss_before - loss_after)
        logger.record_tabular('Embedding/KLBefore', embed_kl_before)
        logger.record_tabular('Embedding/KL', embed_kl)

        return loss_after
Code example #9
File: npo.py Project: gntoni/garage
 def optimize_policy(self, itr, samples_data):
     all_input_values = tuple(
         ext.extract(samples_data, "observations", "actions", "advantages"))
     agent_infos = samples_data["agent_infos"]
     state_info_list = [agent_infos[k] for k in self.policy.state_info_keys]
     dist_info_list = [
         agent_infos[k] for k in self.policy.distribution.dist_info_keys
     ]
     all_input_values += tuple(state_info_list) + tuple(dist_info_list)
     if self.policy.recurrent:
         all_input_values += (samples_data["valids"], )
     logger.log("Computing loss before")
     loss_before = self.optimizer.loss(all_input_values)
     logger.log("Computing KL before")
     mean_kl_before = self.optimizer.constraint_val(all_input_values)
     logger.log("Optimizing")
     self.optimizer.optimize(all_input_values)
     logger.log("Computing KL after")
     mean_kl = self.optimizer.constraint_val(all_input_values)
     logger.log("Computing loss after")
     loss_after = self.optimizer.loss(all_input_values)
     logger.record_tabular('LossBefore', loss_before)
     logger.record_tabular('LossAfter', loss_after)
     logger.record_tabular('MeanKLBefore', mean_kl_before)
     logger.record_tabular('MeanKL', mean_kl)
     logger.record_tabular('dLoss', loss_before - loss_after)
     return dict()
Code example #10
File: catrpo.py Project: Mee321/HAPG_exp
    def outer_optimize(self, samples_data):
        logger.log("optimizing policy")
        observations = ext.extract(samples_data, "observations")
        actions = ext.extract(samples_data, "actions")
        advantages = ext.extract(samples_data, "advantages")

        num_traj = len(samples_data["paths"])

        observations = observations[0].reshape(
            -1, self.env.spec.observation_space.shape[0])
        actions = actions[0].reshape(-1, self.env.spec.action_space.shape[0])
        advantages = advantages[0].reshape(-1)
        inputs = tuple([observations, actions, advantages])

        s_g = self._opt_fun["f_train"](*(list(inputs)))
        #s_g = [x / num_traj for x in s_g]
        self.gradient_backup = copy.deepcopy(s_g)
        g_flat = self.flatten_parameters(s_g)

        loss_before = self._opt_fun["f_loss"](*(list(inputs)))
        self.backup_policy.set_param_values(
            self.policy.get_param_values(trainable=True), trainable=True)
        self.optimizer.optimize(inputs, g_flat)
        loss_after = self._opt_fun["f_loss"](*(list(inputs)))
        logger.record_tabular("LossBefore", loss_before)
        logger.record_tabular("LossAfter", loss_after)

        mean_kl, max_kl = self._opt_fun['f_kl'](*(list(inputs)))
        logger.record_tabular('MeanKL', mean_kl)
        logger.record_tabular('MaxKL', max_kl)
Code example #11
 def train_once(self, itr, paths):
     itr_start_time = time.time()
     with logger.prefix('itr #%d | ' % itr):
         self.log_diagnostics(paths)
         logger.log("Optimizing policy...")
         self.optimize_policy(itr, paths)
         logger.record_tabular('IterTime', time.time() - itr_start_time)
         logger.dump_tabular()
Code example #12
def worker_init_envs(G, alloc, scope, env):
    logger.log("initializing environment on worker %d" % G.worker_id)
    if not hasattr(G, 'parallel_vec_envs'):
        G.parallel_vec_envs = dict()
        G.parallel_vec_env_template = dict()
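    # A pickle round trip creates an independent copy of env for every
    # allocated index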
    G.parallel_vec_envs[scope] = [(idx, pickle.loads(pickle.dumps(env)))
                                  for idx in alloc]
    G.parallel_vec_env_template[scope] = env
Code example #13
File: ddopg.py Project: stjordanis/DD_OPG
    def train(self):
        self._initiate_training()

        for itr in range(self.n_itr):
            self._training_step(itr)

        logger.log('Terminate training')
        self.sess.close()
Code example #14
    def optimize(self, inputs, extra_inputs=None, callback=None, name=None):
        with tf.name_scope(name, "optimize", values=[inputs, extra_inputs]):

            if not inputs:
                # Assumes that we should always sample mini-batches
                raise NotImplementedError

            f_loss = self._opt_fun["f_loss"]

            if extra_inputs is None:
                extra_inputs = tuple()

            last_loss = f_loss(*(tuple(inputs) + extra_inputs))

            start_time = time.time()

            dataset = BatchDataset(inputs,
                                   self._batch_size,
                                   extra_inputs=extra_inputs)

            sess = tf.get_default_session()

            for epoch in range(self._max_epochs):
                if self._verbose:
                    logger.log("Epoch %d" % (epoch))
                    progbar = pyprind.ProgBar(len(inputs[0]))

                for batch in dataset.iterate(update=True):
                    sess.run(self._train_op,
                             dict(list(zip(self._input_vars, batch))))
                    if self._verbose:
                        progbar.update(len(batch[0]))

                if self._verbose:
                    if progbar.active:
                        progbar.stop()

                new_loss = f_loss(*(tuple(inputs) + extra_inputs))

                if self._verbose:
                    logger.log("Epoch: %d | Loss: %f" % (epoch, new_loss))
                if self._callback or callback:
                    elapsed = time.time() - start_time
                    callback_args = dict(
                        loss=new_loss,
                        params=self._target.get_param_values(
                            trainable=True) if self._target else None,
                        itr=epoch,
                        elapsed=elapsed,
                    )
                    if self._callback:
                        self._callback(callback_args)
                    if callback:
                        callback(**callback_args)

                if abs(last_loss - new_loss) < self._tolerance:
                    break
                last_loss = new_loss
Code example #15
File: svrg_optimizer.py Project: Mee321/HAPG_exp
    def optimize(self, inputs, extra_inputs=None):

        if not inputs:
            # Assumes that we should always sample mini-batches
            raise NotImplementedError

        f_loss = self._opt_fun["f_loss"]
        f_grad = self._opt_fun["f_grad"]
        f_grad_tilde = self._opt_fun["f_grad_tilde"]

        inputs = tuple(inputs)
        if extra_inputs is None:
            extra_inputs = tuple()
        else:
            extra_inputs = tuple(extra_inputs)

        param = np.copy(self._target.get_param_values(trainable=True))
        logger.log("Start SVRPG optimization: #parameters: %d, #inputs %d" %
                   (len(param), len(inputs[0])))
        dataset = BatchDataset(inputs,
                               self._batch_size,
                               extra_inputs=extra_inputs)
        start_time = time.time()

        for epoch in range(self._max_epochs):
            if self._verbose:
                logger.log("Epoch %d" % (epoch))
                progbar = pyprind.ProgBar(len(inputs[0]))
            grad_sum = np.zeros_like(param)
            g_mean_tilde = f_grad_tilde(*(inputs + extra_inputs))
            logger.record_tabular('g_mean_tilde', LA.norm(g_mean_tilde))
            print("-------------mini-batch-------------------")
            num_batch = 0
            while num_batch < self._max_batch:
                batch = dataset.random_batch()
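                # SVRG control variate: mini-batch gradient corrected by the
                # snapshot ("tilde") gradient on the same batch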
                g = f_grad(*(batch)) - f_grad_tilde(*(batch)) + g_mean_tilde
                grad_sum += g
                prev_w = np.copy(self._target.get_param_values(trainable=True))
                step = self._alpha * g
                cur_w = prev_w + step
                self._target.set_param_values(cur_w, trainable=True)
                num_batch += 1
            print("max batch achieved {:}".format(num_batch))
            grad_sum /= 1.0 * num_batch
            logger.record_tabular('gdist', LA.norm(grad_sum - g_mean_tilde))
            cur_w = np.copy(self._target.get_param_values(trainable=True))
            w_tilde = self._target_tilde.get_param_values(trainable=True)
            self._target_tilde.set_param_values(cur_w, trainable=True)
            logger.record_tabular('wnorm', LA.norm(cur_w))
            logger.record_tabular('w_dist',
                                  LA.norm(cur_w - w_tilde) / LA.norm(cur_w))

            if self._verbose:
                if progbar.active:
                    progbar.stop()
            if abs(LA.norm(cur_w - w_tilde) /
                   LA.norm(cur_w)) < self._tolerance:
                break
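
For reference, the variance-reduced estimate built in the inner loop above reduces to this standalone sketch; grad_fn, grad_fn_snapshot, and full_grad_snapshot are hypothetical stand-ins for f_grad, f_grad_tilde, and g_mean_tilde:

def svrg_gradient(grad_fn, grad_fn_snapshot, full_grad_snapshot, batch):
    # Mini-batch gradient at the current parameters, corrected by the same
    # batch's gradient at the snapshot parameters plus the full-data snapshot
    # gradient; the corrections cancel in expectation, so the estimate stays
    # unbiased while its variance shrinks when the iterate is near the snapshot.
    return grad_fn(*batch) - grad_fn_snapshot(*batch) + full_grad_snapshot
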
Code example #16
File: hill_env.py Project: Mee321/HAPG_exp
 def _gen_terrain(self, regen=True):
     logger.log("Process {0} generating terrain...".format(os.getpid()))
     x, y, hfield = terrain.generate_hills(40, 40, 500)
     hfield = self._mod_hfield(hfield)
     terrain.save_heightfield(
         x, y, hfield, self.HFIELD_FNAME, path=self.hfield_dir)
     terrain.save_texture(
         x, y, hfield, self.TEXTURE_FNAME, path=self.texturedir)
     logger.log("Generated.")
Code example #17
    def optimize_gen(self,
                     inputs,
                     extra_inputs=None,
                     callback=None,
                     yield_itr=None):

        if len(inputs) == 0:
            # Assumes that we should always sample mini-batches
            raise NotImplementedError

        f_opt = self._opt_fun["f_opt"]
        f_loss = self._opt_fun["f_loss"]

        if extra_inputs is None:
            extra_inputs = tuple()

        last_loss = f_loss(*(tuple(inputs) + extra_inputs))

        start_time = time.time()

        dataset = BatchDataset(
            inputs,
            self._batch_size,
            extra_inputs=extra_inputs
            #, randomized=self._randomized
        )

        itr = 0
        for epoch in pyprind.prog_bar(list(range(self._max_epochs))):
            for batch in dataset.iterate(update=True):
                f_opt(*batch)
                if yield_itr is not None and (itr % (yield_itr + 1)) == 0:
                    yield
                itr += 1

            new_loss = f_loss(*(tuple(inputs) + extra_inputs))
            if self._verbose:
                logger.log("Epoch %d, loss %s" % (epoch, new_loss))

            if self._callback or callback:
                elapsed = time.time() - start_time
                callback_args = dict(
                    loss=new_loss,
                    params=self._target.get_param_values(trainable=True)
                    if self._target else None,
                    itr=epoch,
                    elapsed=elapsed,
                )
                if self._callback:
                    self._callback(callback_args)
                if callback:
                    callback(**callback_args)

            if abs(last_loss - new_loss) < self._tolerance:
                break
            last_loss = new_loss
Code example #18
    def decide_new_skill(self, samples_data):
        """
        Decide whether a new skill should be created. If so, also return start and end observations for training.
        :param samples_data: processed sampled data:
                dict(observations, actions, advantages, rewards, returns, valids, agent_infos, env_infos, paths)
        :return: (bool: make new skill, start_obss, end_obss)
        """
        min_length = 2
        max_length = 5
        action_map = None  # {0: 's', 1: 'L', 2: 'R'}
        min_f_score = 2
        max_results = 10
        aggregations = []  # sublist of ['mean', 'most_freq', 'nearest_mean', 'medoid'] or 'all'
        f_score_step_factor = 1.5

        paths = samples_data['paths']
        path_trie = PathTrie(self._hrl_policy.num_skills)
        for path in paths:
            actions = path['actions'].argmax(axis=1).tolist()
            observations = path['observations']
            path_trie.add_all_subpaths(
                actions,
                observations,
                min_length=min_length,
                max_length=max_length
            )
        logger.log('Searched {} rollouts'.format(len(paths)))

        frequent_paths = path_trie.items(
            action_map=action_map,
            min_count=10,
            min_f_score=min_f_score,
            max_results=max_results,
            aggregations=aggregations
        )
        logger.log('Found {} frequent paths: [index, actions, count, f-score]'.format(len(frequent_paths)))
        for i, f_path in enumerate(frequent_paths):
            logger.log('    {:2}: {:{pad}}\t{}\t{:.3f}'.format(
                i,
                str(f_path['actions']),
                f_path['count'],
                f_path['f_score'],
                pad=max_length*3))

        # if self._added_skills > 0:
        #     return False, None  # DEBUG add only one skill
        if len(frequent_paths) == 0:
            return False, None
        top_subpath = frequent_paths[0]
        prev_f_score, self._last_f_score = self._last_f_score, top_subpath['f_score']
        if self._last_f_score > prev_f_score * f_score_step_factor:
            logger.log('Decided to make new skill, since its f-score {} > {} * {}'.format(self._last_f_score, f_score_step_factor, prev_f_score))
            logger.log('New skill is based on subpath: {}'.format(top_subpath['actions']))
            self._added_skills += 1
            return True, top_subpath
        return False, None
Code example #19
def populate_task(env, policy, scope=None):
    logger.log("Populating workers...")
    if singleton_pool.n_parallel > 1:
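        # Ship pickled copies of env and policy to every worker process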
        singleton_pool.run_each(_worker_populate_task, [
            (pickle.dumps(env), pickle.dumps(policy), scope)
        ] * singleton_pool.n_parallel)
    else:
        # avoid unnecessary copying
        g = _get_scoped_g(singleton_pool.G, scope)
        g.env = env
        g.policy = policy
    logger.log("Populated")
Code example #20
File: ddopg.py Project: stjordanis/DD_OPG
    def _policy_optimization(self, itr):
        logger.log('Policy optimization...')
        self._log_progress('Pre/')

        # Choose optimization starting parameters
        if itr == 0:
            x = self.all_params[np.argmax(self.all_Rs)]
        else:
            x = self.rollout_params.copy()

        # Define target objective based on surrogate model
        if itr == 0 or (np.max(self.all_Rs) / self.max_R) > 1.1:
            self.max_R = np.max(np.abs(self.all_Rs))

            var_list = self.model.policy.get_params(trainable=True)

            target = self.model.J_test / self.max_R + (
                self.model.ess_test / self.max_paths) * self.ess_penalty
            target_grad = flatgrad(target, var_list)

            def fun_and_jac(x):
                self.model.policy.set_param_values(x, trainable=True)
                return self.sess.run([target, target_grad], self.feed)

            self.fun_and_jac = fun_and_jac

        # Hard reset policy if stuck in bad local minimum
        if self.reset and itr > 10:
            sorted_R = sorted(self.all_Rs)
            cutoff = sorted_R[int(len(self.all_Rs) * self.explore_thresh)]
            print("Cutoff: ", cutoff)
            if self.all_Rs[-1] <= cutoff:
                loc = np.mean(
                    [self.all_params[idx] for idx in self.softmax_ids], axis=0)
                scale = np.std(self.all_params[-10:],
                               axis=0) * self.explore_width
                scale = np.minimum(scale, self.explore_limit)
                x = np.random.normal(loc=loc,
                                     scale=scale,
                                     size=(self.model.n_params))
                print(
                    "_______________________________________________EXPLORING with mean scale: ",
                    np.mean(scale))

        # SGD-based optimization
        self.optimizer.fun_and_jac = self.fun_and_jac
        x, f, fs = self.optimizer.optimize(x,
                                           step_size=1e-2 *
                                           np.exp(self.model.log_std))
        self.rollout_params = x.copy()

        self._log_progress('Post/')
Code example #21
    def train_inference_network(self, inference_opt_input_values):
        """ Optimize inference network """

        logger.log("Optimizing inference network...")
        infer_loss_before = self.inference_optimizer.loss(
            inference_opt_input_values)
        logger.record_tabular('Inference/Loss', infer_loss_before)
        self.inference_optimizer.optimize(inference_opt_input_values)
        infer_loss_after = self.inference_optimizer.loss(
            inference_opt_input_values)
        logger.record_tabular('Inference/dLoss',
                              infer_loss_before - infer_loss_after)

        return infer_loss_after
Code example #22
def worker_run_reset(G, flags, scope):
    if not hasattr(G, 'parallel_vec_envs'):
        logger.log("on worker %d" % G.worker_id)
        import traceback
        for line in traceback.format_stack():
            logger.log(line)
        # log the stacktrace at least
        logger.log("oops")
        for k, v in G.__dict__.items():
            logger.log(str(k) + " : " + str(v))
        assert hasattr(G, 'parallel_vec_envs')

    assert scope in G.parallel_vec_envs
    N = len(G.parallel_vec_envs[scope])
    env_template = G.parallel_vec_env_template[scope]
    obs_dim = env_template.observation_space.flat_dim
    ret_arr = np.zeros((N, obs_dim))
    ids = []
    flat_obs = []
    reset_ids = []
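    # Reset only the envs whose flag is set and collect their flattened
    # observations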
    for itr_idx, (idx, env) in enumerate(G.parallel_vec_envs[scope]):
        flag = flags[idx]
        if flag:
            flat_obs.append(env.reset())
            reset_ids.append(itr_idx)
        ids.append(idx)
    if len(reset_ids) > 0:
        ret_arr[reset_ids] = env_template.observation_space.flatten_n(flat_obs)
    return ids, ret_arr
Code example #23
    def optimize_policy(self, itr, samples_data):
        policy_opt_input_values = self._policy_opt_input_values(samples_data)
        inference_opt_input_values = self._inference_opt_input_values(
            samples_data)

        self.train_policy_and_embedding_networks(policy_opt_input_values)
        self.train_inference_network(inference_opt_input_values)

        samples_data = self.evaluate(policy_opt_input_values, samples_data)
        self.visualize_distribution(samples_data)

        # Fit baseline
        logger.log("Fitting baseline...")
        if hasattr(self.baseline, 'fit_with_samples'):
            self.baseline.fit_with_samples(samples_data['paths'], samples_data)
        else:
            self.baseline.fit(samples_data['paths'])
Code example #24
File: ddopg.py Project: stjordanis/DD_OPG
    def _bookkeeping(self):
        logger.log('Processing samples...')

        Rs = [sum(path['rewards']) for path in self.paths]
        Hs = [len(path['rewards']) for path in self.paths]
        current_params = [path['policy_params'] for path in self.paths]

        self.all_paths += self.paths
        self.all_params += current_params
        self.all_Rs += Rs
        self.all_Hs += Hs

        idx = np.argmax(Rs)
        if Rs[idx] > self.best_R:
            logger.log('Found new best policy parameters!...')
            self.best_R = Rs[idx]
            self.best_policy_params = current_params[idx].copy()
Code example #25
File: ddopg.py Project: stjordanis/DD_OPG
    def _log_diagnostics(self, itr):
        logger.log('Logging diagnostics...')

        undisc_returns = [sum(path["rewards"]) for path in self.paths]

        logger.record_tabular('Iteration', itr)
        logger.record_tabular('Paths/NumTrajs', len(self.paths))
        logger.record_tabular(
            'Paths/Steps',
            np.mean([len(path['rewards']) for path in self.paths]))
        logger.record_tabular('Paths/Return/Best_Policy_Avg',
                              self.avg_ret_best_policy)
        logger.record_tabular('Paths/Return/Mean', np.mean(undisc_returns))
        logger.record_tabular('Paths/Return/Std', np.std(undisc_returns))
        logger.record_tabular('Paths/Return/Max', np.max(undisc_returns))
        logger.record_tabular('Paths/Return/Min', np.min(undisc_returns))

        self.model.policy.log_diagnostics(self.paths)
Code example #26
File: ddopg.py Project: stjordanis/DD_OPG
    def _initiate_training(self):
        """
        Initiates the training process by creating a tensorflow session
        and sampling 5 differnt policies which are generated by perturbing
        the initial policies parameters with random gaussian noise.
        """

        logger.log('Initiate training...')

        self.start_time = time.time()

        self.sess = tf.Session()
        self.sess.__enter__()
        self.sess.run(tf.global_variables_initializer())

        current_mean = self.model.policy.get_param_values(trainable=True)
        current_std = 1.0
        self.rollout_params = np.random.normal(loc=current_mean,
                                               scale=current_std,
                                               size=(5, self.model.n_params))
Code example #27
    def _fit_baseline(self, samples_data):
        """ Update baselines from samples. """

        policy_opt_input_values = self._policy_opt_input_values(samples_data)

        # Augment reward from baselines
        rewards_tensor = self.f_rewards(*policy_opt_input_values)
        returns_tensor = self.f_returns(*policy_opt_input_values)
        returns_tensor = np.squeeze(returns_tensor)

        paths = samples_data["paths"]
        valids = samples_data["valids"]
        baselines = [path["baselines"] for path in paths]

        # Recompute parts of samples_data
        aug_rewards = []
        aug_returns = []
        for rew, ret, val, path in zip(rewards_tensor, returns_tensor, valids,
                                       paths):
            path["rewards"] = rew[val.astype(np.bool)]
            path["returns"] = ret[val.astype(np.bool)]
            aug_rewards.append(path["rewards"])
            aug_returns.append(path["returns"])
        aug_rewards = tensor_utils.concat_tensor_list(aug_rewards)
        aug_returns = tensor_utils.concat_tensor_list(aug_returns)
        samples_data["rewards"] = aug_rewards
        samples_data["returns"] = aug_returns

        # Calculate explained variance
        ev = special.explained_variance_1d(np.concatenate(baselines),
                                           aug_returns)
        logger.record_tabular(
            "{}/ExplainedVariance".format(self.baseline.name), ev)

        # Fit baseline
        logger.log("Fitting baseline...")
        if hasattr(self.baseline, "fit_with_samples"):
            self.baseline.fit_with_samples(paths, samples_data)
        else:
            self.baseline.fit(paths)
Code example #28
    def train(self):
        address = ("localhost", 6000)
        conn = Client(address)
        try:
            plotter = Plotter()
            if self.plot:
                plotter.init_plot(self.env, self.policy)
            conn.send(ExpLifecycle.START)
            self.start_worker()
            self.init_opt()
            for itr in range(self.current_itr, self.n_itr):
                with logger.prefix('itr #%d | ' % itr):
                    conn.send(ExpLifecycle.OBTAIN_SAMPLES)
                    paths = self.sampler.obtain_samples(itr)
                    conn.send(ExpLifecycle.PROCESS_SAMPLES)
                    samples_data = self.sampler.process_samples(itr, paths)
                    self.log_diagnostics(paths)
                    conn.send(ExpLifecycle.OPTIMIZE_POLICY)
                    self.optimize_policy(itr, samples_data)
                    logger.log("saving snapshot...")
                    params = self.get_itr_snapshot(itr, samples_data)
                    self.current_itr = itr + 1
                    params["algo"] = self
                    if self.store_paths:
                        params["paths"] = samples_data["paths"]
                    logger.save_itr_params(itr, params)
                    logger.log("saved")
                    logger.dump_tabular(with_prefix=False)
                    if self.plot:
                        conn.send(ExpLifecycle.UPDATE_PLOT)
                        plotter.update_plot(self.policy, self.max_path_length)
                        if self.pause_for_plot:
                            input("Plotting evaluation run: Press Enter to "
                                  "continue...")

            conn.send(ExpLifecycle.SHUTDOWN)
            plotter.close()
            self.shutdown_worker()
        finally:
            conn.close()
Code example #29
File: vpg.py Project: gntoni/garage
    def optimize_policy(self, itr, samples_data):
        logger.log("optimizing policy")
        inputs = ext.extract(samples_data, "observations", "actions",
                             "advantages")
        agent_infos = samples_data["agent_infos"]
        state_info_list = [agent_infos[k] for k in self.policy.state_info_keys]
        inputs += tuple(state_info_list)
        if self.policy.recurrent:
            inputs += (samples_data["valids"], )
        dist_info_list = [
            agent_infos[k] for k in self.policy.distribution.dist_info_keys
        ]
        loss_before = self.optimizer.loss(inputs)
        self.optimizer.optimize(inputs)
        loss_after = self.optimizer.loss(inputs)
        logger.record_tabular("LossBefore", loss_before)
        logger.record_tabular("LossAfter", loss_after)

        mean_kl, max_kl = self.opt_info['f_kl'](
            *(list(inputs) + dist_info_list))
        logger.record_tabular('MeanKL', mean_kl)
        logger.record_tabular('MaxKL', max_kl)
Code example #30
File: block_world.py Project: wwxFromTju/garage
 def _reset_real(self):
     """
     reset the real
     """
     # randomize start position of blocks
     for block in self._blocks:
         block_random_delta = np.zeros(2)
         new_pos = block.initial_pos
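         # Re-sample until the random offset is at least 0.1 away from the
         # initial position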
         while np.linalg.norm(block_random_delta) < 0.1:
             block_random_delta = np.random.uniform(
                 -block.random_delta_range,
                 block.random_delta_range,
                 size=2)
         new_pos.x += block_random_delta[0]
         new_pos.y += block_random_delta[1]
         logger.log('new position for {} is x = {}, y = {}, z = {}'.format(
             block.name, new_pos.x, new_pos.y, new_pos.z))
         ready = False
         while not ready:
             ans = input(
                 'Have you finished setting up {}?[Yes/No]\n'.format(
                     block.name))
             if ans.lower() == 'yes' or ans.lower() == 'y':
                 ready = True
Code example #31
File: gym_env.py Project: sra4077/softqlearning
    def __init__(self, env_name, record_video=False, video_schedule=None, log_dir=None, record_log=False,
                 force_reset=True):
        if log_dir is None:
            if logger.get_snapshot_dir() is None:
                logger.log("Warning: skipping Gym environment monitoring since snapshot_dir not configured.")
            else:
                log_dir = os.path.join(logger.get_snapshot_dir(), "gym_log")
        Serializable.quick_init(self, locals())

        env = gym.envs.make(env_name)

        # HACK: Gets rid of the TimeLimit wrapper that sets 'done = True' when
        # the time limit specified for each environment has been passed and
        # therefore the environment is not Markovian (terminal condition depends
        # on time rather than state).
        env = env.env

        self.env = env
        self.env_id = env.spec.id

        assert not (not record_log and record_video)

        if log_dir is None or record_log is False:
            self.monitoring = False
        else:
            if not record_video:
                video_schedule = NoVideoSchedule()
            else:
                if video_schedule is None:
                    video_schedule = CappedCubicVideoSchedule()
            self.env = gym.wrappers.Monitor(self.env, log_dir, video_callable=video_schedule, force=True)
            self.monitoring = True

        self._observation_space = convert_gym_space(env.observation_space)
        logger.log("observation space: {}".format(self._observation_space))
        self._action_space = convert_gym_space(env.action_space)
        logger.log("action space: {}".format(self._action_space))
        self._horizon = env.spec.tags['wrapper_config.TimeLimit.max_episode_steps']
        self._log_dir = log_dir
        self._force_reset = force_reset
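
A minimal usage sketch for this wrapper (the class name GymEnv is inferred from the file name gym_env.py and may differ; no monitoring is configured):

env = GymEnv("CartPole-v0", record_video=False, record_log=False)
print(env.env_id)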