Example 1
    def _train_step(self):
        super()._train_step()

        critic_loss = self._update_critic()
        actor_loss = self._update_actor()
        critic_loss = MPIUtil.reduce_avg(critic_loss)
        actor_loss = MPIUtil.reduce_avg(actor_loss)

        critic_stepsize = self.critic_solver.get_stepsize()
        actor_stepsize = self.actor_solver.get_stepsize()

        self.logger.log_tabular('Critic_Loss', critic_loss)
        self.logger.log_tabular('Critic_Stepsize', critic_stepsize)
        self.logger.log_tabular('Actor_Loss', actor_loss)
        self.logger.log_tabular('Actor_Stepsize', actor_stepsize)

        return
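
The `_train_step` above averages its critic and actor losses across MPI workers with `MPIUtil.reduce_avg` before logging them. As a rough illustration of what such helpers can look like (a minimal sketch assuming mpi4py is available; the project's actual MPIUtil module is not shown here and may differ), an allreduce-based sum and average could be written as:

import numpy as np
from mpi4py import MPI

def reduce_sum(x):
    # Sum a scalar (or array) across all MPI workers via an allreduce.
    buf = np.atleast_1d(np.asarray(x, dtype=np.float64))
    out = np.zeros_like(buf)
    MPI.COMM_WORLD.Allreduce(buf, out, op=MPI.SUM)
    return out[0] if out.size == 1 else out

def reduce_avg(x):
    # Average across workers: global sum divided by the number of processes.
    return reduce_sum(x) / MPI.COMM_WORLD.Get_size()

With helpers of this kind, `reduce_avg(critic_loss)` returns the same value on every worker, which is what keeps the logged losses consistent across processes.
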
Example 2
    def _train(self):
        with self.sess.as_default(), self.graph.as_default():
            samples = self.replay_buffer.total_count
            self._total_sample_count = int(MPIUtil.reduce_sum(samples)) + self._init_total_sample_count
            end_training = False

            if (self.replay_buffer_initialized):
                if (self._valid_train_step()):
                    prev_iter = self.iter
                    iters = self._get_iters_per_update()
                    avg_train_return = MPIUtil.reduce_avg(self.train_return)
                    avg_train_mean_reward = MPIUtil.reduce_avg(self.train_mean_reward)
                    avg_train_pathlen = MPIUtil.reduce_avg(self.train_pathlen)
                    avg_train_pathlen /= 30 # policy is executed at 30 Hz

                    for _ in range(iters):
                        curr_iter = self.iter
                        wall_time = time.time() - self.start_time
                        wall_time /= 60 * 60 # store time in hours

                        has_goal = self.has_goal()
                        s_mean = np.mean(self.s_norm.mean)
                        s_std = np.mean(self.s_norm.std)
                        g_mean = np.mean(self.g_norm.mean) if has_goal else 0
                        g_std = np.mean(self.g_norm.std) if has_goal else 0

                        self.logger.log_tabular("Iteration", self.iter)
                        self.logger.log_tabular("Wall_Time", wall_time)
                        self.logger.log_tabular("Samples", self._total_sample_count)
                        self.logger.log_tabular("Train_Path_Length", avg_train_pathlen)
                        self.logger.log_tabular("Train_Mean_Reward", avg_train_mean_reward)
                        self.logger.log_tabular("Train_Return", avg_train_return)
                        self.logger.log_tabular("Test_Return", self.avg_test_return)
                        self.logger.log_tabular("State_Mean", s_mean)
                        self.logger.log_tabular("State_Std", s_std)
                        self.logger.log_tabular("Goal_Mean", g_mean)
                        self.logger.log_tabular("Goal_Std", g_std)
                        self._log_exp_params()

                        self._update_iter(self.iter + 1)

                        train_start_time = time.time()
                        self._train_step()
                        train_time = time.time() - train_start_time

                        if self.iter == 1:
                            iteration_time = time.time() - self.start_time
                        else:
                            iteration_time = time.time() - self.iter_start_time
                        self.iter_start_time = time.time()

                        self.logger.log_tabular("Train_time", train_time)
                        self.logger.log_tabular("Simulation_time",  iteration_time - train_time)

                        Logger.print("Agent " + str(self.id))
                        self.logger.print_tabular()
                        Logger.print("")

                        if (self._enable_output() and curr_iter % self.int_output_iters == 0):
                            self.logger.dump_tabular()


                    if (prev_iter // self.int_output_iters != self.iter // self.int_output_iters):
                        end_training = self.enable_testing()

            else:

                Logger.print("Agent " + str(self.id))
                Logger.print("Samples: " + str(self._total_sample_count))
                Logger.print("")

                if (self._total_sample_count >= self.init_samples):
                    self.replay_buffer_initialized = True
                    end_training = self.enable_testing()

            if self._need_normalizer_update and self._enable_update_normalizer:
                self._update_normalizers()
                self._need_normalizer_update = self.normalizer_samples > self._total_sample_count

            if end_training:
                self._init_mode_train_end()
        return
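
The `_train` loop logs the running mean and standard deviation held by `self.s_norm` and `self.g_norm`, and keeps calling `_update_normalizers` until `normalizer_samples` samples have been gathered. A minimal sketch of such a running state normalizer (illustrative only; the class name, the `record`/`normalize` methods, and the exact update rule are assumptions, not the project's implementation) could look like:

import numpy as np

class RunningNorm:
    # Tracks a running mean/std of observed vectors so states can be
    # standardized before being fed to the networks.
    def __init__(self, size, eps=1e-4):
        self.count = eps
        self.mean = np.zeros(size)
        self._m2 = np.ones(size) * eps  # sum of squared deviations

    @property
    def std(self):
        return np.sqrt(self._m2 / self.count)

    def record(self, batch):
        # Incorporate a batch of samples (shape [n, size]) into the statistics.
        batch = np.atleast_2d(batch)
        n = batch.shape[0]
        batch_mean = batch.mean(axis=0)
        delta = batch_mean - self.mean
        total = self.count + n
        self.mean = self.mean + delta * n / total
        self._m2 = self._m2 + batch.var(axis=0) * n + delta**2 * self.count * n / total
        self.count = total

    def normalize(self, x):
        return (x - self.mean) / (self.std + 1e-8)

The batch update uses the standard parallel-variance combination, so the statistics can be accumulated incrementally as new rollouts arrive.
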
Example 3
    def _train_step(self):
        adv_eps = 1e-5

        start_idx = self.replay_buffer.buffer_tail
        end_idx = self.replay_buffer.buffer_head
        assert (start_idx == 0)
        assert (self.replay_buffer.get_current_size() <=
                self.replay_buffer.buffer_size)  # must avoid overflow
        assert (start_idx < end_idx)

        # Indices of all samples currently in the buffer; mask out path-end
        # entries, which have no valid successor for the value targets.
        idx = np.array(list(range(start_idx, end_idx)))
        end_mask = self.replay_buffer.is_path_end(idx)
        end_mask = np.logical_not(end_mask)

        vals = self._compute_batch_vals(start_idx, end_idx)
        new_vals = self._compute_batch_new_vals(start_idx, end_idx, vals)

        valid_idx = idx[end_mask]
        exp_idx = self.replay_buffer.get_idx_filtered(
            self.EXP_ACTION_FLAG).copy()
        num_valid_idx = valid_idx.shape[0]
        num_exp_idx = exp_idx.shape[0]
        # Pair each exploration-sample index with its position (0..num_exp_idx-1)
        # so the matching advantage can still be looked up after shuffling.
        exp_idx = np.column_stack(
            [exp_idx,
             np.array(list(range(0, num_exp_idx)), dtype=np.int32)])

        local_sample_count = valid_idx.size
        global_sample_count = int(MPIUtil.reduce_sum(local_sample_count))
        mini_batches = int(np.ceil(global_sample_count / self.mini_batch_size))

        adv = new_vals[exp_idx[:, 0]] - vals[exp_idx[:, 0]]
        new_vals = np.clip(new_vals, self.val_min, self.val_max)

        # Standardize the advantages and clip them to a fixed range.
        adv_mean = np.mean(adv)
        adv_std = np.std(adv)
        adv = (adv - adv_mean) / (adv_std + adv_eps)
        adv = np.clip(adv, -self.norm_adv_clip, self.norm_adv_clip)

        critic_loss = 0
        actor_loss = 0
        actor_clip_frac = 0

        for e in range(self.epochs):
            np.random.shuffle(valid_idx)
            np.random.shuffle(exp_idx)

            for b in range(mini_batches):
                batch_idx_beg = b * self._local_mini_batch_size
                batch_idx_end = batch_idx_beg + self._local_mini_batch_size

                critic_batch = np.array(range(batch_idx_beg, batch_idx_end),
                                        dtype=np.int32)
                actor_batch = critic_batch.copy()
                critic_batch = np.mod(critic_batch, num_valid_idx)
                actor_batch = np.mod(actor_batch, num_exp_idx)
                # Reshuffle the exploration indices once this minibatch wraps
                # around (or reaches) the end of exp_idx.
                shuffle_actor = (actor_batch[-1] < actor_batch[0]) or (
                    actor_batch[-1] == num_exp_idx - 1)

                critic_batch = valid_idx[critic_batch]
                actor_batch = exp_idx[actor_batch]
                critic_batch_vals = new_vals[critic_batch]
                actor_batch_adv = adv[actor_batch[:, 1]]

                critic_s = self.replay_buffer.get('states', critic_batch)
                critic_g = self.replay_buffer.get(
                    'goals', critic_batch) if self.has_goal() else None
                curr_critic_loss = self._update_critic(critic_s, critic_g,
                                                       critic_batch_vals)

                actor_s = self.replay_buffer.get("states", actor_batch[:, 0])
                actor_g = self.replay_buffer.get(
                    "goals", actor_batch[:, 0]) if self.has_goal() else None
                actor_a = self.replay_buffer.get("actions", actor_batch[:, 0])
                actor_logp = self.replay_buffer.get("logps", actor_batch[:, 0])
                curr_actor_loss, curr_actor_clip_frac = self._update_actor(
                    actor_s, actor_g, actor_a, actor_logp, actor_batch_adv)

                critic_loss += curr_critic_loss
                actor_loss += np.abs(curr_actor_loss)
                actor_clip_frac += curr_actor_clip_frac

                if (shuffle_actor):
                    np.random.shuffle(exp_idx)

        total_batches = mini_batches * self.epochs
        critic_loss /= total_batches
        actor_loss /= total_batches
        actor_clip_frac /= total_batches

        critic_loss = MPIUtil.reduce_avg(critic_loss)
        actor_loss = MPIUtil.reduce_avg(actor_loss)
        actor_clip_frac = MPIUtil.reduce_avg(actor_clip_frac)

        critic_stepsize = self.critic_solver.get_stepsize()
        actor_stepsize = self.update_actor_stepsize(actor_clip_frac)

        self.logger.log_tabular('Critic_Loss', critic_loss)
        self.logger.log_tabular('Critic_Stepsize', critic_stepsize)
        self.logger.log_tabular('Actor_Loss', actor_loss)
        self.logger.log_tabular('Actor_Stepsize', actor_stepsize)
        self.logger.log_tabular('Clip_Frac', actor_clip_frac)
        self.logger.log_tabular('Adv_Mean', adv_mean)
        self.logger.log_tabular('Adv_Std', adv_std)

        self.replay_buffer.clear()

        return
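
Example 3 delegates the optimization itself to `_update_actor` and `_update_critic`, and adapts the actor learning rate from the observed clip fraction via `update_actor_stepsize`. Those methods are not shown here; as a hedged, NumPy-only sketch of the PPO-style pieces they presumably correspond to (the function names, the 0.2 clip range, and the stepsize thresholds below are assumptions), a clipped surrogate loss and a clip-fraction-driven stepsize rule might look like:

import numpy as np

def ppo_surrogate_loss(new_logp, old_logp, adv, clip_eps=0.2):
    # Clipped PPO objective: ratio of new to old action probabilities,
    # with the advantage-weighted term clipped to [1 - eps, 1 + eps].
    ratio = np.exp(new_logp - old_logp)
    unclipped = ratio * adv
    clipped = np.clip(ratio, 1.0 - clip_eps, 1.0 + clip_eps) * adv
    loss = -np.mean(np.minimum(unclipped, clipped))
    clip_frac = np.mean((np.abs(ratio - 1.0) > clip_eps).astype(np.float64))
    return loss, clip_frac

def update_actor_stepsize(stepsize, clip_frac, target_frac=0.25,
                          scale=1.2, min_step=1e-6, max_step=1e-2):
    # Shrink the stepsize when too many samples are clipped, grow it otherwise.
    if clip_frac > target_frac * 1.5:
        stepsize /= scale
    elif clip_frac < target_frac * 0.5:
        stepsize *= scale
    return float(np.clip(stepsize, min_step, max_step))

The clip fraction is the quantity driving the stepsize adjustment: a high fraction suggests the policy is moving too far per update, so the learning rate is reduced.
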
Example 4
    def _train(self):
        samples = self.replay_buffer.total_count
        self._total_sample_count = int(MPIUtil.reduce_sum(samples))
        end_training = False

        if (self.replay_buffer_initialized):
            if (self._valid_train_step()):
                prev_iter = self.iter
                iters = self._get_iters_per_update()
                avg_train_return = MPIUtil.reduce_avg(self.train_return)

                for i in range(iters):
                    curr_iter = self.iter
                    wall_time = time.time() - self.start_time
                    wall_time /= 60 * 60  # store time in hours

                    has_goal = self.has_goal()
                    s_mean = np.mean(self._s_norm.mean)
                    s_std = np.mean(self._s_norm.std)
                    g_mean = np.mean(self._g_norm.mean) if has_goal else 0
                    g_std = np.mean(self._g_norm.std) if has_goal else 0

                    self.logger.log_tabular("Iteration", self.iter)
                    self.logger.log_tabular("Wall_Time", wall_time)
                    self.logger.log_tabular("Samples",
                                            self._total_sample_count)
                    self.logger.log_tabular("Train_Return", avg_train_return)
                    self.logger.log_tabular("Test_Return",
                                            self.avg_test_return)
                    self.logger.log_tabular("State_Mean", s_mean)
                    self.logger.log_tabular("State_Std", s_std)
                    self.logger.log_tabular("Goal_Mean", g_mean)
                    self.logger.log_tabular("Goal_Std", g_std)
                    self._log_exp_params()

                    self._update_iter(self.iter + 1)
                    self._train_step()

                    Logger.print("Agent " + str(self.id))
                    self.logger.print_tabular()
                    Logger.print("")

                    if (self._enable_output()
                            and curr_iter % self.int_output_iters == 0):
                        self.logger.dump_tabular()

                if (prev_iter // self.int_output_iters !=
                        self.iter // self.int_output_iters):
                    end_training = self.enable_testing()

        else:
            Logger.print("Agent " + str(self.id))
            Logger.print("Samples: " + str(self._total_sample_count))
            Logger.print("")

            if (self._total_sample_count >= self.init_samples):
                self.replay_buffer_initialized = True
                end_training = self.enable_testing()

        if self._need_normalizer_update:
            self._update_normalizers()
            self._need_normalizer_update = self.normalizer_samples > self._total_sample_count

        if end_training:
            self._init_mode_test()

        return