Example #1
 def collect_transitions_sim_env(self):
     """
     Generate transitions using dynamics model
     """
     self.replay_buffer.clear()
     n_episodes = 0
     ave_episode_return = 0
     while self.replay_buffer.get_stored_size() < self._policy.horizon:
         obs = self._env.reset()
         episode_return = 0.
         for _ in range(self._episode_max_steps):
             act, logp, val = self._policy.get_action_and_val(obs)
             if not is_discrete(self._env.action_space):
                 env_act = np.clip(act, self._env.action_space.low, self._env.action_space.high)
             else:
                 env_act = act
             if self._debug:
                 next_obs, rew, _, _ = self._env.step(env_act)
             else:
                 next_obs = self.predict_next_state(obs, env_act)
                 rew = self._reward_fn(obs, act)[0]
             self.local_buffer.add(obs=obs, act=act, next_obs=next_obs, rew=rew,
                                   done=False, logp=logp, val=val)
             obs = next_obs
             episode_return += rew
         self.finish_horizon(last_val=val)
         ave_episode_return += episode_return
         n_episodes += 1
     return ave_episode_return / n_episodes
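
These model-based snippets evaluate a reward function self._reward_fn(obs, act) on predicted states instead of querying the environment. As a hedged illustration (not taken from these examples), a batched reward function written for gym's Pendulum-v0 observation convention [cos(theta), sin(theta), theta_dot] could look like the sketch below; the function name is a placeholder.

# Hedged sketch only: a batched reward function in the style expected by
# self._reward_fn above, using Pendulum-v0's observation layout.
import numpy as np

def pendulum_reward_fn(obses, acts):
    obses = np.atleast_2d(obses)
    acts = np.atleast_2d(acts)
    theta = np.arctan2(obses[:, 1], obses[:, 0])  # recover the angle
    theta_dot = obses[:, 2]
    torque = np.clip(acts[:, 0], -2.0, 2.0)
    # Negative quadratic cost on angle, angular velocity, and control effort
    costs = theta ** 2 + 0.1 * theta_dot ** 2 + 0.001 * torque ** 2
    return -costs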
Example #2
    def _evaluate_model(self):
        ret_real_env, ret_sim_env = 0., 0.
        n_episodes = 10
        for _ in range(n_episodes):
            real_obs = self._env.reset()
            sim_obs = real_obs.copy()
            for _ in range(self._episode_max_steps):
                act, _ = self._policy.get_action(real_obs)
                if not is_discrete(self._env.action_space):
                    env_act = np.clip(act, self._env.action_space.low,
                                      self._env.action_space.high)
                else:
                    env_act = act

                next_real_obs, rew, _, _ = self._env.step(env_act)
                ret_real_env += rew
                real_obs = next_real_obs

                next_sim_obs = self.predict_next_state(sim_obs, env_act)
                ret_sim_env += self._reward_fn(sim_obs, act)[0]
                sim_obs = next_sim_obs

        ret_real_env /= n_episodes
        ret_sim_env /= n_episodes
        return ret_real_env, ret_sim_env
Example #3
    def evaluate_policy(self, total_steps):
        """
        Evaluate policy

        Args:
            total_steps (int): Current total steps of training
        """
        avg_test_return = 0.
        avg_test_steps = 0
        if self._save_test_path:
            replay_buffer = get_replay_buffer(self._policy,
                                              self._test_env,
                                              size=self._episode_max_steps)
        for i in range(self._test_episodes):
            episode_return = 0.
            frames = []
            obs = self._test_env.reset()
            avg_test_steps += 1
            for _ in range(self._episode_max_steps):
                if self._normalize_obs:
                    obs = self._obs_normalizer(obs, update=False)
                act, _ = self._policy.get_action(obs, test=True)
                act = (act if is_discrete(self._env.action_space) else np.clip(
                    act, self._env.action_space.low,
                    self._env.action_space.high))
                next_obs, reward, done, _ = self._test_env.step(act)
                avg_test_steps += 1
                if self._save_test_path:
                    replay_buffer.add(obs=obs,
                                      act=act,
                                      next_obs=next_obs,
                                      rew=reward,
                                      done=done)

                if self._save_test_movie:
                    frames.append(self._test_env.render(mode='rgb_array'))
                elif self._show_test_progress:
                    self._test_env.render()
                episode_return += reward
                obs = next_obs
                if done:
                    break
            prefix = "step_{0:08d}_epi_{1:02d}_return_{2:010.4f}".format(
                total_steps, i, episode_return)
            if self._save_test_path:
                save_path(replay_buffer.sample(self._episode_max_steps),
                          os.path.join(self._output_dir, prefix + ".pkl"))
                replay_buffer.clear()
            if self._save_test_movie:
                frames_to_gif(frames, prefix, self._output_dir)
            avg_test_return += episode_return
        if self._show_test_images:
            images = tf.cast(
                tf.expand_dims(np.array(obs).transpose(2, 0, 1), axis=3),
                tf.uint8)
            tf.summary.image(
                'train/input_img',
                images,
            )
        return avg_test_return / self._test_episodes, avg_test_steps / self._test_episodes
Example #4
    def __call__(self):
        total_steps = 0
        n_episode = 0

        # TODO: clean up this code
        # Prepare buffer
        self.replay_buffer = get_replay_buffer(self._policy, self._env)
        kwargs_local_buf = get_default_rb_dict(size=self._episode_max_steps,
                                               env=self._env)
        kwargs_local_buf["env_dict"]["logp"] = {}
        kwargs_local_buf["env_dict"]["val"] = {}
        if is_discrete(self._env.action_space):
            kwargs_local_buf["env_dict"]["act"]["dtype"] = np.int32
        self.local_buffer = ReplayBuffer(**kwargs_local_buf)

        tf.summary.experimental.set_step(total_steps)
        while total_steps < self._max_steps:
            # Collect samples
            n_episode, total_rewards = self._collect_sample(
                n_episode, total_steps)
            total_steps += self._policy.horizon
            tf.summary.experimental.set_step(total_steps)

            if len(total_rewards) > 0:
                avg_training_return = sum(total_rewards) / len(total_rewards)
                tf.summary.scalar(name="Common/training_return",
                                  data=avg_training_return)

            # Train actor critic
            for _ in range(self._policy.n_epoch):
                samples = self.replay_buffer.sample(self._policy.horizon)
                if self._policy.normalize_adv:
                    adv = (samples["adv"] - np.mean(samples["adv"])) / np.std(
                        samples["adv"])
                else:
                    adv = samples["adv"]
                for idx in range(
                        int(self._policy.horizon / self._policy.batch_size)):
                    target = slice(idx * self._policy.batch_size,
                                   (idx + 1) * self._policy.batch_size)
                    self._policy.train(states=samples["obs"][target],
                                       actions=samples["act"][target],
                                       advantages=adv[target],
                                       logp_olds=samples["logp"][target],
                                       returns=samples["ret"][target])

            if total_steps % self._test_interval == 0:
                avg_test_return = self.evaluate_policy(total_steps)
                self.logger.info(
                    "Evaluation Total Steps: {0: 7} Average Reward {1: 5.4f} over {2: 2} episodes"
                    .format(total_steps, avg_test_return, self._test_episodes))
                tf.summary.scalar(name="Common/average_test_return",
                                  data=avg_test_return)
                self.writer.flush()

            if total_steps % self._model_save_interval == 0:
                self.checkpoint_manager.save()

        tf.summary.flush()
Example #5
def get_replay_buffer(policy,
                      env,
                      use_prioritized_rb=False,
                      use_nstep_rb=False,
                      n_step=1,
                      size=None):
    if policy is None or env is None:
        return None

    obs_shape = get_space_size(env.observation_space)
    kwargs = get_default_rb_dict(policy.memory_capacity, env)

    if size is not None:
        kwargs["size"] = size

    # On-policy agent (e.g., PPO): store one horizon of transitions with logp/ret/adv
    if not issubclass(type(policy), OffPolicyAgent):
        kwargs["size"] = policy.horizon
        kwargs["env_dict"].pop("next_obs")
        kwargs["env_dict"].pop("rew")
        # TODO: Remove done. Currently cannot remove because of cpprb implementation
        # kwargs["env_dict"].pop("done")
        kwargs["env_dict"]["logp"] = {}
        kwargs["env_dict"]["ret"] = {}
        kwargs["env_dict"]["adv"] = {}
        if is_discrete(env.action_space):
            kwargs["env_dict"]["act"]["dtype"] = np.int32
        return ReplayBuffer(**kwargs)

    # N-step prioritized
    if use_prioritized_rb and use_nstep_rb:
        kwargs["Nstep"] = {
            "size": n_step,
            "gamma": policy.discount,
            "rew": "rew",
            "next": "next_obs"
        }
        return PrioritizedReplayBuffer(**kwargs)

    if len(obs_shape) == 3:
        kwargs["env_dict"]["obs"]["dtype"] = np.ubyte
        kwargs["env_dict"]["next_obs"]["dtype"] = np.ubyte

    # prioritized
    if use_prioritized_rb:
        return PrioritizedReplayBuffer(**kwargs)

    # N-step
    if use_nstep_rb:
        kwargs["Nstep"] = {
            "size": n_step,
            "gamma": policy.discount,
            "rew": "rew",
            "next": "next_obs"
        }
        return ReplayBuffer(**kwargs)

    return ReplayBuffer(**kwargs)
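
A brief usage sketch of the helper above. The policy object is a placeholder for any agent exposing the attributes referenced in get_replay_buffer (memory_capacity, discount, and horizon for the on-policy branch); the environment id is only an example.

# Illustrative usage only; `policy` is a placeholder agent instance.
import gym

env = gym.make("Pendulum-v0")

# Off-policy agent: plain FIFO buffer sized by policy.memory_capacity
rb = get_replay_buffer(policy, env)

# Off-policy agent with prioritization and 3-step returns
per_nstep_rb = get_replay_buffer(policy, env,
                                 use_prioritized_rb=True,
                                 use_nstep_rb=True,
                                 n_step=3)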
Example #6
    def _visualize_current_performance(self):
        obs = self._env.reset()
        for _ in range(self._episode_max_steps):
            act, _ = self._policy.get_action(obs)
            if not is_discrete(self._env.action_space):
                env_act = np.clip(act, self._env.action_space.low, self._env.action_space.high)
            else:
                env_act = act
            next_obs = self.predict_next_state(obs, env_act)

            self._env.state = np.array([np.arctan2(next_obs[1], next_obs[0]), next_obs[2]], dtype=np.float32)
            # print(obs, act, next_obs, self._env.state)
            self._env.render()
            obs = next_obs
Example #7
 def collect_transitions_real_env(self):
     total_steps = 0
     episode_steps = 0
     obs = self._env.reset()
     while total_steps < self._n_collect_steps:
         episode_steps += 1
         total_steps += 1
         act, _ = self._policy.get_action(obs)
         if not is_discrete(self._env.action_space):
             env_act = np.clip(act, self._env.action_space.low, self._env.action_space.high)
         else:
             env_act = act
         next_obs, _, done, _ = self._env.step(env_act)
         self.dynamics_buffer.add(
             obs=obs, act=env_act, next_obs=next_obs)
         obs = next_obs
         if done or episode_steps == self._episode_max_steps:
             episode_steps = 0
             obs = self._env.reset()
Example #8
    def _evaluate_current_return(self, init_states):
        n_episodes = self._n_dynamics_model * self._n_eval_episodes_per_model
        assert init_states.shape[0] == n_episodes

        obses = init_states.copy()
        next_obses = np.zeros_like(obses)
        returns = np.zeros(shape=(n_episodes,), dtype=np.float32)

        for _ in range(self._episode_max_steps):
            acts, _ = self._policy.get_action(obses)
            for i in range(n_episodes):
                model_idx = i // self._n_eval_episodes_per_model
                if not is_discrete(self._env.action_space):
                    env_act = np.clip(acts[i], self._env.action_space.low, self._env.action_space.high)
                else:
                    env_act = acts[i]
                next_obses[i] = self.predict_next_state(obses[i], env_act, idx=model_idx)
            returns += self._reward_fn(obses, acts)
            obses = next_obses

        return returns
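
predict_next_state itself is not listed in these examples. Below is a hedged sketch of what such a method might do, assuming a hypothetical attribute self._dynamics_models (a list of regressors trained to predict the change in state from a concatenated (obs, act) input); the idx argument selects one ensemble member, matching the call above.

# Hypothetical sketch: attribute names and the delta-state parameterization
# are assumptions, not taken from these examples.
import numpy as np

def predict_next_state(self, obs, act, idx=0):
    inputs = np.concatenate([obs, np.atleast_1d(act)], axis=-1)[np.newaxis]
    # Each ensemble member predicts the state difference, not the next state
    delta = self._dynamics_models[idx].predict(inputs)[0]
    return obs + delta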
Example #9
 def test_is_discrete(self):
     discrete_space = gym.make('CartPole-v0').action_space
     continuous_space = gym.make('Pendulum-v0').action_space
     self.assertTrue(is_discrete(discrete_space))
     self.assertFalse(is_discrete(continuous_space))
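
The test above pins down the expected behaviour of is_discrete. A minimal sketch consistent with it is shown below; this is an assumption, not necessarily the library's actual implementation, which may also handle MultiDiscrete or MultiBinary spaces.

import gym

def is_discrete(space):
    # True for discrete action spaces such as CartPole's, False for Box spaces
    return isinstance(space, gym.spaces.Discrete)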
Example #10
#parser.set_defaults(horizon=1024)
#parser.set_defaults(batch_size=512)
parser.set_defaults(gpu=-1)
parser.set_defaults(max_steps=100000000)
parser.set_defaults(n_warmup=0)
#parser.set_defaults(enable_gae=True)
args = parser.parse_args()

env = ArmEnvironment(static_goal=True, slow_step=slow_step)
test_env = ArmEnvironment(static_goal=True, slow_step=slow_step)

policy = PPO(
        state_shape=env.observation_space.shape,
        action_dim=get_act_dim(env.action_space),
        is_discrete=is_discrete(env.action_space),
        max_action=None if is_discrete(
            env.action_space) else env.action_space.high[0],
        batch_size=args.batch_size,
        actor_units=(64, 64),
        critic_units=(64, 64),
        n_epoch=10,
        lr_actor=3e-4,
        lr_critic=3e-4,
        hidden_activation_actor="tanh",
        hidden_activation_critic="tanh",
        discount=0.99,
        lam=0.95,
        entropy_coef=0.001,
        horizon=args.horizon,
        normalize_adv=args.normalize_adv,
        enable_gae=args.enable_gae,
        gpu=args.gpu)
Example #11
    def __call__(self):
        # Prepare buffer
        self.replay_buffer = get_replay_buffer(self._policy, self._env)
        kwargs_local_buf = get_default_rb_dict(size=self._policy.horizon,
                                               env=self._env)
        kwargs_local_buf["env_dict"]["logp"] = {}
        kwargs_local_buf["env_dict"]["val"] = {}
        if is_discrete(self._env.action_space):
            kwargs_local_buf["env_dict"]["act"]["dtype"] = np.int32
        self.local_buffer = ReplayBuffer(**kwargs_local_buf)

        episode_steps = 0
        episode_return = 0
        episode_start_time = time.time()
        total_steps = np.array(0, dtype=np.int32)
        n_episode = 0
        obs = self._env.reset()

        tf.summary.experimental.set_step(total_steps)
        while total_steps < self._max_steps:
            # Collect samples
            for _ in range(self._policy.horizon):
                act, logp, val = self._policy.get_action_and_val(obs)
                next_obs, reward, done, _ = self._env.step(act)
                if self._show_progress:
                    self._env.render()

                episode_steps += 1
                total_steps += 1
                episode_return += reward

                done_flag = done
                if hasattr(self._env, "_max_episode_steps") and \
                        episode_steps == self._env._max_episode_steps:
                    done_flag = False
                self.local_buffer.add(obs=obs,
                                      act=act,
                                      next_obs=next_obs,
                                      rew=reward,
                                      done=done_flag,
                                      logp=logp,
                                      val=val)
                obs = next_obs

                if done or episode_steps == self._episode_max_steps:
                    tf.summary.experimental.set_step(total_steps)
                    self.finish_horizon()
                    obs = self._env.reset()
                    n_episode += 1
                    fps = episode_steps / (time.time() - episode_start_time)
                    self.logger.info(
                        "Total Epi: {0: 5} Steps: {1: 7} Episode Steps: {2: 5} Return: {3: 5.4f} FPS: {4:5.2f}"
                        .format(n_episode, int(total_steps), episode_steps,
                                episode_return, fps))
                    tf.summary.scalar(name="Common/training_return",
                                      data=episode_return)
                    tf.summary.scalar(name="Common/fps", data=fps)
                    episode_steps = 0
                    episode_return = 0
                    episode_start_time = time.time()
            self.finish_horizon(last_val=val)

            tf.summary.experimental.set_step(total_steps)

            # Train actor critic
            if self._policy.normalize_adv:
                samples = self.replay_buffer._encode_sample(
                    np.arange(self._policy.horizon))
                mean_adv = np.mean(samples["adv"])
                std_adv = np.std(samples["adv"])
            with tf.summary.record_if(total_steps %
                                      self._save_summary_interval == 0):
                for _ in range(self._policy.n_epoch):
                    samples = self.replay_buffer._encode_sample(
                        np.random.permutation(self._policy.horizon))
                    if self._policy.normalize_adv:
                        adv = (samples["adv"] - mean_adv) / (std_adv + 1e-8)
                    else:
                        adv = samples["adv"]
                    for idx in range(
                            int(self._policy.horizon /
                                self._policy.batch_size)):
                        target = slice(idx * self._policy.batch_size,
                                       (idx + 1) * self._policy.batch_size)
                        self._policy.train(states=samples["obs"][target],
                                           actions=samples["act"][target],
                                           advantages=adv[target],
                                           logp_olds=samples["logp"][target],
                                           returns=samples["ret"][target])

            if total_steps % self._test_interval == 0:
                avg_test_return = self.evaluate_policy(total_steps)
                self.logger.info(
                    "Evaluation Total Steps: {0: 7} Average Reward {1: 5.4f} over {2: 2} episodes"
                    .format(total_steps, avg_test_return, self._test_episodes))
                tf.summary.scalar(name="Common/average_test_return",
                                  data=avg_test_return)
                self.writer.flush()

            if total_steps % self._save_model_interval == 0:
                self.checkpoint_manager.save()

        tf.summary.flush()
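
finish_horizon is called throughout these trainers but not listed. A hedged sketch of what it typically computes, assuming cpprb-style local and replay buffers and discount/lam attributes on the policy: GAE-lambda advantages and discounted returns for the transitions accumulated in self.local_buffer, which are then pushed into self.replay_buffer.

# Hedged sketch, not the library's verbatim implementation.
import numpy as np

def discount_cumsum(x, discount):
    # Reverse discounted cumulative sum: out[i] = x[i] + discount * out[i + 1]
    out = np.zeros(len(x), dtype=np.float32)
    running = 0.0
    for i in reversed(range(len(x))):
        running = x[i] + discount * running
        out[i] = running
    return out

def finish_horizon(self, last_val=0):
    samples = self.local_buffer.get_all_transitions()
    rews = np.append(samples["rew"], last_val)
    vals = np.append(samples["val"], last_val)
    # GAE-lambda advantages from one-step TD residuals
    deltas = rews[:-1] + self._policy.discount * vals[1:] - vals[:-1]
    advs = discount_cumsum(deltas, self._policy.discount * self._policy.lam)
    rets = discount_cumsum(rews, self._policy.discount)[:-1]
    self.replay_buffer.add(obs=samples["obs"], act=samples["act"],
                           done=samples["done"], logp=samples["logp"],
                           adv=advs, ret=rets)
    self.local_buffer.clear()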
Example #12
    def __call__(self):
        # Prepare buffer
        self.replay_buffer = get_replay_buffer(self._policy, self._env)
        kwargs_local_buf = get_default_rb_dict(size=self._policy.horizon,
                                               env=self._env)
        kwargs_local_buf["env_dict"]["logp"] = {}
        kwargs_local_buf["env_dict"]["val"] = {}
        if is_discrete(self._env.action_space):
            kwargs_local_buf["env_dict"]["act"]["dtype"] = np.int32
        self.local_buffer = ReplayBuffer(**kwargs_local_buf)

        episode_steps = 0
        episode_return = 0
        episode_cost = 0
        episode_start_time = time.time()
        total_steps = np.array(0, dtype=np.int32)
        n_episode = 0
        obs = self._env.reset()

        tf.summary.experimental.set_step(total_steps)
        while total_steps < self._max_steps:
            # Collect samples
            for _ in range(self._policy.horizon):
                if self._normalize_obs:
                    obs = self._obs_normalizer(obs, update=False)
                act, logp, val = self._policy.get_action_and_val(obs)
                if not is_discrete(self._env.action_space):
                    env_act = np.clip(act, self._env.action_space.low,
                                      self._env.action_space.high)
                else:
                    env_act = act
                next_obs, reward, done, info = self._env.step(env_act)
                # print('[DEBUG]  COST:', info['cost'])
                try:
                    cost = info['cost']
                except (TypeError, KeyError):
                    cost = 0
                if self._show_progress:
                    self._env.render()

                episode_steps += 1
                total_steps += 1
                episode_return += reward
                episode_cost += cost

                done_flag = done
                if (hasattr(self._env, "_max_episode_steps")
                        and episode_steps == self._env._max_episode_steps):
                    done_flag = False
                self.local_buffer.add(obs=obs,
                                      act=act,
                                      next_obs=next_obs,
                                      rew=reward,
                                      done=done_flag,
                                      logp=logp,
                                      val=val)
                obs = next_obs

                if done or episode_steps == self._episode_max_steps:
                    tf.summary.experimental.set_step(total_steps)
                    self.finish_horizon()
                    obs = self._env.reset()
                    n_episode += 1
                    fps = episode_steps / (time.time() - episode_start_time)
                    self.logger.info(
                        "Total Epi: {0: 5} Steps: {1: 7} Episode Steps: {2: 5} Return: {3: 6.4f} Cost: {4: 5.4f} FPS: {5:5.2f}"
                        .format(n_episode, int(total_steps), episode_steps,
                                episode_return, episode_cost, fps))
                    tf.summary.scalar(name="Common/training_return",
                                      data=episode_return)
                    tf.summary.scalar(name="Common/fps", data=fps)
                    self.total_cost += episode_cost
                    cost_rate = self.total_cost / total_steps

                    wandb.log(
                        {
                            'Training_Return': episode_return,
                            'Training_Cost': episode_cost,
                            'Cost_Rate': cost_rate,
                            'FPS': fps
                        },
                        step=n_episode)
                    episode_steps = 0
                    episode_return = 0
                    episode_cost = 0
                    episode_start_time = time.time()

                if total_steps % self._test_interval == 0:
                    avg_test_return, avg_test_cost = self.evaluate_policy(
                        total_steps)
                    self.logger.info(
                        "Evaluation Total Steps: {0: 7} Average Reward {1: 6.4f} Average Cost {2: 5.4f} over {3: 2} episodes"
                        .format(total_steps, avg_test_return, avg_test_cost,
                                self._test_episodes))
                    wandb.log(
                        {
                            'Evaluation_Return': avg_test_return,
                            'Evaluation_Cost': avg_test_cost
                        },
                        step=n_episode)
                    # wandb.log({'Evaluation_Step': total_steps})

                    tf.summary.scalar(name="Common/average_test_return",
                                      data=avg_test_return)
                    self.writer.flush()

                if total_steps % self._save_model_interval == 0:
                    self.checkpoint_manager.save()

            self.finish_horizon(last_val=val)

            tf.summary.experimental.set_step(total_steps)

            # Train actor critic
            if self._policy.normalize_adv:
                samples = self.replay_buffer.get_all_transitions()
                mean_adv = np.mean(samples["adv"])
                std_adv = np.std(samples["adv"])
                # Update normalizer
                if self._normalize_obs:
                    self._obs_normalizer.experience(samples["obs"])
            with tf.summary.record_if(total_steps %
                                      self._save_summary_interval == 0):
                for _ in range(self._policy.n_epoch):
                    samples = self.replay_buffer._encode_sample(
                        np.random.permutation(self._policy.horizon))
                    if self._normalize_obs:
                        samples["obs"] = self._obs_normalizer(samples["obs"],
                                                              update=False)
                    if self._policy.normalize_adv:
                        adv = (samples["adv"] - mean_adv) / (std_adv + 1e-8)
                    else:
                        adv = samples["adv"]
                    for idx in range(
                            int(self._policy.horizon /
                                self._policy.batch_size)):
                        target = slice(idx * self._policy.batch_size,
                                       (idx + 1) * self._policy.batch_size)
                        self._policy.train(states=samples["obs"][target],
                                           actions=samples["act"][target],
                                           advantages=adv[target],
                                           logp_olds=samples["logp"][target],
                                           returns=samples["ret"][target])

        tf.summary.flush()
Example #13
def run(parser):

    args = parser.parse_args()

    if args.gpu < 0:
        tf.config.experimental.set_visible_devices([], 'GPU')
    else:
        physical_devices = tf.config.list_physical_devices('GPU')
        tf.config.set_visible_devices(physical_devices[args.gpu], 'GPU')
        tf.config.experimental.set_virtual_device_configuration(
            physical_devices[args.gpu], [
                tf.config.experimental.VirtualDeviceConfiguration(
                    memory_limit=1024 * 3)
            ])

    if args.env == 200:
        envname = 'ScratchItchPR2X'
    elif args.env == 201:
        envname = 'DressingPR2X'
    elif args.env == 202:
        envname = 'BedBathingPR2X'

    logdir = 'MFBox_Assistive'
    if args.SAC:
        wandb.init(config=vars(args),
                   project="Assistive Gym",
                   name=f'SAC on {envname}')
    elif args.PPO:
        wandb.init(config=vars(args),
                   project="Assistive Gym",
                   name=f'PPO on {envname}')
    elif args.TD3:
        wandb.init(config=vars(args),
                   project="Assistive Gym",
                   name=f'TD3 on {envname}')
    elif args.DEBUG:
        logdir = 'DEBUG_Assistive'
        wandb.init(config=vars(args),
                   project="Assistive Gym",
                   name=f'DEBUG on {envname}')
    else:
        print('PLEASE INDICATE THE ALGORITHM !!')
        # No algorithm flag was given, so return before trainer() is ever referenced
        return

    if not os.path.exists(logdir):
        os.makedirs(logdir)
    parser.set_defaults(logdir=logdir)
    args = parser.parse_args()

    env = gym.make(f'{envname}-v0')
    #test_env = Monitor(env,logdir,force=True)
    test_env = gym.make(f'{envname}-v0')

    if args.SAC:

        policy = SAC(state_shape=env.observation_space.shape,
                     action_dim=env.action_space.high.size,
                     gpu=args.gpu,
                     memory_capacity=args.memory_capacity,
                     max_action=env.action_space.high[0],
                     batch_size=args.batch_size,
                     n_warmup=args.n_warmup,
                     alpha=args.alpha,
                     auto_alpha=args.auto_alpha)
        trainer = Trainer(policy, env, args, test_env=test_env)

    elif args.PPO:
        policy = PPO(state_shape=env.observation_space.shape,
                     action_dim=get_act_dim(env.action_space),
                     is_discrete=is_discrete(env.action_space),
                     max_action=None if is_discrete(env.action_space) else
                     env.action_space.high[0],
                     batch_size=args.batch_size,
                     actor_units=(64, 64),
                     critic_units=(64, 64),
                     n_epoch=10,
                     lr_actor=3e-4,
                     lr_critic=3e-4,
                     hidden_activation_actor="tanh",
                     hidden_activation_critic="tanh",
                     discount=0.99,
                     lam=0.95,
                     entropy_coef=0.,
                     horizon=args.horizon,
                     normalize_adv=args.normalize_adv,
                     enable_gae=args.enable_gae,
                     gpu=args.gpu)
        trainer = OnPolicyTrainer(policy, env, args, test_env=test_env)

    elif args.TD3:
        policy = TD3(state_shape=env.observation_space.shape,
                     action_dim=env.action_space.high.size,
                     gpu=args.gpu,
                     memory_capacity=args.memory_capacity,
                     max_action=env.action_space.high[0],
                     batch_size=args.batch_size,
                     n_warmup=args.n_warmup)
        trainer = Trainer(policy, env, args, test_env=test_env)

    elif args.DEBUG:

        policy = SAC(state_shape=env.observation_space.shape,
                     action_dim=env.action_space.high.size,
                     gpu=args.gpu,
                     memory_capacity=args.memory_capacity,
                     max_action=env.action_space.high[0],
                     batch_size=args.batch_size,
                     n_warmup=100,
                     alpha=args.alpha,
                     auto_alpha=args.auto_alpha)
        parser.set_defaults(test_interval=200)
        args = parser.parse_args()

        trainer = Trainer(policy, env, args, test_env=None)

    trainer()
Example #14
    def __call__(self):
        total_steps = 0
        episode_steps = 0
        episode_return = 0
        episode_start_time = time.time()
        n_episode = 0
        test_step_threshold = self._test_interval

        # TODO: clean up this code
        self.replay_buffer = get_replay_buffer(self._policy, self._env)
        kwargs_local_buf = get_default_rb_dict(size=self._episode_max_steps,
                                               env=self._env)
        kwargs_local_buf["env_dict"]["logp"] = {}
        kwargs_local_buf["env_dict"]["val"] = {}
        if is_discrete(self._env.action_space):
            kwargs_local_buf["env_dict"]["act"]["dtype"] = np.int32
        self.local_buffer = ReplayBuffer(**kwargs_local_buf)

        obs = self._env.reset()
        while total_steps < self._max_steps:
            for _ in range(self._policy.horizon):
                action, log_pi, val = self._policy.get_action_and_val(obs)
                next_obs, reward, done, _ = self._env.step(action)
                if self._show_progress:
                    self._env.render()
                episode_steps += 1
                episode_return += reward
                total_steps += 1

                done_flag = done
                if hasattr(self._env, "_max_episode_steps") and \
                        episode_steps == self._env._max_episode_steps:
                    done_flag = False
                self.local_buffer.add(obs=obs,
                                      act=action,
                                      next_obs=next_obs,
                                      rew=reward,
                                      done=done_flag,
                                      logp=log_pi,
                                      val=val)
                obs = next_obs

                if done or episode_steps == self._episode_max_steps:
                    self.finish_horizon()
                    obs = self._env.reset()
                    n_episode += 1
                    fps = episode_steps / (time.time() - episode_start_time)
                    self.logger.info(
                        "Total Epi: {0: 5} Steps: {1: 7} Episode Steps: {2: 5} Return: {3: 5.4f} FPS: {4:5.2f}"
                        .format(n_episode, int(total_steps), episode_steps,
                                episode_return, fps))

                    episode_steps = 0
                    episode_return = 0
                    episode_start_time = time.time()

            self.finish_horizon(last_val=val)
            tf.summary.experimental.set_step(total_steps)
            samples = self.replay_buffer.sample(self._policy.horizon)
            # Normalize advantages
            if self._policy.normalize_adv:
                adv = (samples["adv"] - np.mean(samples["adv"])) / np.std(
                    samples["adv"])
            else:
                adv = samples["adv"]
            for _ in range(1):
                self._policy.train_actor(samples["obs"], samples["act"], adv,
                                         samples["logp"])
            # Train Critic
            for _ in range(5):
                self._policy.train_critic(samples["obs"], samples["ret"])
            if total_steps > test_step_threshold:
                test_step_threshold += self._test_interval
                avg_test_return = self.evaluate_policy(total_steps)
                self.logger.info(
                    "Evaluation Total Steps: {0: 7} Average Reward {1: 5.4f} over {2: 2} episodes"
                    .format(total_steps, avg_test_return, self._test_episodes))
                tf.summary.scalar(name="Common/average_test_return",
                                  data=avg_test_return)
                tf.summary.scalar(name="Common/fps", data=fps)

                self.writer.flush()

            if total_steps % self._model_save_interval == 0:
                self.checkpoint_manager.save()

        tf.summary.flush()