Esempio n. 1
0
        # Skipped when args.symbolic_env is set (presumably non-image
        # observations -- confirm): dump the collected frames as a video and
        # save the last frame as a PNG.
        if not args.symbolic_env:
            # Zero-pad the episode number to the digit width of args.episodes.
            episode_str = str(episode).zfill(len(str(args.episodes)))
            write_video(video_frames, 'test_episode_%s' % episode_str,
                        results_dir)  # Lossy compression
            save_image(
                torch.as_tensor(video_frames[-1]),
                os.path.join(results_dir, 'test_episode_%s.png' % episode_str))
        # Persist the metrics object to <results_dir>/metrics.pth.
        torch.save(metrics, os.path.join(results_dir, 'metrics.pth'))

        # Set models to train mode
        transition_model.train()
        observation_model.train()
        reward_model.train()
        encoder.train()
        actor_model.train()
        value_model.train()
        # Close test environments
        test_envs.close()

    # Log the most recent training reward and loss values. `writer` is
    # presumably a TensorBoard SummaryWriter -- TODO confirm.
    # The same reward is logged under two tags: once against the raw step
    # counter and once against the step counter scaled by args.action_repeat.
    writer.add_scalar("train_reward", metrics['train_rewards'][-1],
                      metrics['steps'][-1])
    writer.add_scalar("train/episode_reward", metrics['train_rewards'][-1],
                      metrics['steps'][-1] * args.action_repeat)
    # NOTE(review): the loss series below are read as [0][-1], i.e. the last
    # element of the FIRST stored entry, whereas rewards/steps use the LAST
    # entry ([-1]). Verify against how the metrics lists are filled that this
    # is intended and not a swapped index ([-1][0]).
    writer.add_scalar("observation_loss", metrics['observation_loss'][0][-1],
                      metrics['steps'][-1])
    writer.add_scalar("reward_loss", metrics['reward_loss'][0][-1],
                      metrics['steps'][-1])
    writer.add_scalar("kl_loss", metrics['kl_loss'][0][-1],
                      metrics['steps'][-1])
    writer.add_scalar("actor_loss", metrics['actor_loss'][0][-1],
                      metrics['steps'][-1])
Esempio n. 2
0
        # Skipped when args.symbolic is set (presumably non-image
        # observations -- confirm): dump the collected frames as a video and
        # save the last frame as a PNG.
        if not args.symbolic:
            # Zero-pad the episode number to the digit width of args.episodes.
            episode_str = str(episode).zfill(len(str(args.episodes)))
            write_video(video_frames, 'test_episode_%s' % episode_str,
                        results_dir)  # Lossy compression
            save_image(
                torch.as_tensor(video_frames[-1]),
                os.path.join(results_dir, 'test_episode_%s.png' % episode_str))
        # Persist the metrics object to <results_dir>/metrics.pth.
        torch.save(metrics, os.path.join(results_dir, 'metrics.pth'))

        # Set models to train mode
        transition_model.train()
        observation_model.train()
        reward_model.train()
        encoder.train()
        actor_model.train()
        # Two separate value models are switched back to train mode here.
        value_model1.train()
        value_model2.train()
        # Close test environments
        # NOTE(review): both close() calls are commented out, so nothing is
        # closed at this point; confirm the environments are released elsewhere.
        # test_envs.close()
        # env.close()

    # Log the most recent training reward and loss values. `writer` is
    # presumably a TensorBoard SummaryWriter -- TODO confirm.
    # The same reward is logged under two tags: once against the raw step
    # counter and once against the step counter scaled by args.action_repeat.
    writer.add_scalar("train_reward", metrics['train_rewards'][-1],
                      metrics['steps'][-1])
    writer.add_scalar("train/episode_reward", metrics['train_rewards'][-1],
                      metrics['steps'][-1] * args.action_repeat)
    # NOTE(review): the loss series below are read as [0][-1], i.e. the last
    # element of the FIRST stored entry, whereas rewards/steps use the LAST
    # entry ([-1]). Verify against how the metrics lists are filled that this
    # is intended and not a swapped index ([-1][0]).
    writer.add_scalar("observation_loss", metrics['observation_loss'][0][-1],
                      metrics['steps'][-1])
    writer.add_scalar("reward_loss", metrics['reward_loss'][0][-1],
                      metrics['steps'][-1])
    writer.add_scalar("kl_loss", metrics['kl_loss'][0][-1],
                      metrics['steps'][-1])
class Algorithms(object):
    """Trainer for a "merge" actor/critic that sits on top of a pool of sub-actors.

    The instance owns:

    * the shared world-model parts handed in by the caller (transition,
      encoder, reward and observation models),
    * ``args.pool_len`` sub-actor / sub-value model pairs, each driven by its
      own ``Worker_actor`` and reachable through a dedicated pipe,
    * one merge actor and one merge value model with their Adam optimizers.

    The class relies on names defined elsewhere in the file (``args``,
    ``ValueModel``, ``MergeModel``, ``ActorModel``, ``Worker_actor``,
    ``get_modules``, ``FreezeParameters``, ``bottle``, ``lambda_return``,
    ``Save_Txt``).
    """

    def __init__(self, action_size, transition_model, encoder, reward_model,
                 observation_model):
        """Build all models and optimizers, then create and start the workers.

        Args:
            action_size: Action dimensionality forwarded to the actor models.
            transition_model: World-model transition network (shared with workers).
            encoder: Observation encoder (shared with workers).
            reward_model: Reward predictor (shared with workers).
            observation_model: Observation decoder (shared with workers).
        """
        self.encoder = encoder
        self.reward_model = reward_model
        self.transition_model = transition_model
        self.observation_model = observation_model

        # Merge critic and merge actor; construction order is kept as in the
        # original so that parameter initialisation is unchanged.
        self.merge_value_model = ValueModel(
            args.belief_size, args.state_size, args.hidden_size,
            args.dense_activation_function).to(device=args.device)
        self.merge_actor_model = MergeModel(
            args.belief_size, args.state_size, args.hidden_size, action_size,
            args.pool_len,
            args.dense_activation_function).to(device=args.device)
        self.merge_actor_model.share_memory()
        self.merge_value_model.share_memory()

        # Pool of sub-actors and their value models, one pair per worker.
        self.actor_pool = [
            ActorModel(args.belief_size, args.state_size, args.hidden_size,
                       action_size,
                       args.dense_activation_function).to(device=args.device)
            for _ in range(args.pool_len)
        ]
        self.value_pool = [
            ValueModel(args.belief_size, args.state_size, args.hidden_size,
                       args.dense_activation_function).to(device=args.device)
            for _ in range(args.pool_len)
        ]
        # Put the pool parameters into shared memory before handing the
        # models to the workers.
        for actor in self.actor_pool:
            actor.share_memory()
        for value in self.value_pool:
            value.share_memory()

        # Module groups later passed to FreezeParameters.
        self.env_model_modules = get_modules([
            self.transition_model, self.encoder, self.observation_model,
            self.reward_model
        ])
        self.actor_pool_modules = get_modules(self.actor_pool)
        self.model_modules = self.env_model_modules + self.actor_pool_modules
        self.merge_value_model_modules = get_modules([self.merge_value_model])

        # The learning rate starts at 0 whenever a schedule is configured
        # (presumably ramped up elsewhere -- confirm against the training loop).
        self.merge_actor_optimizer = optim.Adam(
            self.merge_actor_model.parameters(),
            lr=0
            if args.learning_rate_schedule != 0 else args.actor_learning_rate,
            eps=args.adam_epsilon)
        self.merge_value_optimizer = optim.Adam(
            self.merge_value_model.parameters(),
            lr=0
            if args.learning_rate_schedule != 0 else args.value_learning_rate,
            eps=args.adam_epsilon)

        # One (parent, child) pipe per sub-actor; the worker gets the child end.
        self.actor_pipes = [Pipe() for _ in self.actor_pool]
        self.workers_actor = [
            Worker_actor(actor_l=self.actor_pool[i],
                         value_l=self.value_pool[i],
                         transition_model=self.transition_model,
                         encoder=self.encoder,
                         observation_model=self.observation_model,
                         reward_model=self.reward_model,
                         child_conn=child,
                         results_dir=args.results_dir,
                         id=i + 1)
            for i, (_parent, child) in enumerate(self.actor_pipes)
        ]

        # Start every worker (each one is a separate process according to the
        # original author's note).
        for worker in self.workers_actor:
            worker.start()

        self.metrics = {
            'episodes': [],
            'merge_actor_loss': [],
            'merge_value_loss': []
        }
        # Per-update [actor_loss, value_loss] pairs, flushed by save_loss_data.
        self.merge_losses = []

    @staticmethod
    def _gather_pool_actions(policy_pool, belief, state):
        """Concatenate every sub-actor's (mean, std) along dim 1, in pool order."""
        stats = []
        for actor in policy_pool:
            mean, std = actor.get_action_mean_std(belief, state)
            stats.append(mean)
            stats.append(std)
        return torch.cat(stats, dim=1)

    @staticmethod
    def _merge_leading_dims(tensor):
        """View ``tensor`` with its first two dimensions collapsed into one."""
        return tensor.view([-1] + list(tensor.size()[2:]))

    def get_action(self, belief, posterior_state, explore=False):
        """Return the merge actor's action for the given latent state.

        Args:
            belief: Belief tensor passed to every sub-actor and the merge actor.
            posterior_state: Posterior state tensor, passed alongside ``belief``.
            explore: When False (default) the merge actor is called with
                ``det=True``; when True with ``det=False``.

        Returns:
            Whatever ``merge_actor_model.get_merge_action`` returns.
        """
        merge_actions = self._gather_pool_actions(self.actor_pool, belief,
                                                  posterior_state)
        return self.merge_actor_model.get_merge_action(merge_actions,
                                                       belief,
                                                       posterior_state,
                                                       det=not explore)

    def train_algorithm(self, actor_states, actor_beliefs):
        """Run one worker round, then one merge-actor and one merge-value update.

        Args:
            actor_states: Starting states for imagination (first two dims are
                flattened by ``imagine_merge_ahead``).
            actor_beliefs: Starting beliefs, same layout as ``actor_states``.

        Returns:
            None. The scalar losses are appended to ``self.merge_losses``.
        """
        # Signal every worker first, then block until each one has answered,
        # so the workers can run concurrently. (The meaning of the payload
        # ``1`` is defined by Worker_actor -- not visible here.)
        for parent_conn, _child in self.actor_pipes:
            parent_conn.send(1)
        for parent_conn, _child in self.actor_pipes:
            parent_conn.recv()

        # Imagine trajectories with the world model and sub-actors wrapped in
        # FreezeParameters (presumably disabling their gradients -- confirm).
        with FreezeParameters(self.model_modules):
            imagination_traj = self.imagine_merge_ahead(
                prev_state=actor_states,
                prev_belief=actor_beliefs,
                policy_pool=self.actor_pool,
                transition_model=self.transition_model,
                merge_model=self.merge_actor_model)
        (imged_beliefs, imged_prior_states, imged_prior_means,
         imged_prior_std_devs) = imagination_traj

        with FreezeParameters(self.model_modules +
                              self.merge_value_model_modules):
            imged_reward = bottle(self.reward_model,
                                  (imged_beliefs, imged_prior_states))
            value_pred = bottle(self.merge_value_model,
                                (imged_beliefs, imged_prior_states))

        with FreezeParameters(self.actor_pool_modules):
            returns = lambda_return(imged_reward,
                                    value_pred,
                                    bootstrap=value_pred[-1],
                                    discount=args.discount,
                                    lambda_=args.disclam)
            # The merge actor maximises the imagined lambda-returns.
            merge_actor_loss = -torch.mean(returns)
            self.merge_actor_optimizer.zero_grad()
            merge_actor_loss.backward()
            nn.utils.clip_grad_norm_(self.merge_actor_model.parameters(),
                                     args.grad_clip_norm,
                                     norm_type=2)
            self.merge_actor_optimizer.step()

        # Dreamer implementation: value loss calculation and optimization.
        # Inputs and target are detached so that only the value model
        # receives gradients from this loss.
        value_beliefs = imged_beliefs.detach()
        value_prior_states = imged_prior_states.detach()
        target_return = returns.detach()

        value_dist = Normal(
            bottle(self.merge_value_model,
                   (value_beliefs, value_prior_states)), 1)
        merge_value_loss = -value_dist.log_prob(target_return).mean(dim=(0, 1))
        self.merge_value_optimizer.zero_grad()
        merge_value_loss.backward()
        nn.utils.clip_grad_norm_(self.merge_value_model.parameters(),
                                 args.grad_clip_norm,
                                 norm_type=2)
        self.merge_value_optimizer.step()

        self.merge_losses.append(
            [merge_actor_loss.item(),
             merge_value_loss.item()])

    def save_loss_data(self, metrics_episodes):
        """Flush buffered merge losses to ``self.metrics`` and to text files.

        Args:
            metrics_episodes: Sequence of episode identifiers; only the last
                element is used for the merge-loss files, the whole sequence
                is forwarded to each worker's ``save_loss_data``.

        If no merge update has been recorded since the last flush, the merge
        metrics and files are left untouched (previously this raised
        ``IndexError``); the workers are still asked to save their own data.
        """
        if self.merge_losses:
            # Transpose [[actor, value], ...] into two per-update tuples.
            actor_losses, value_losses = zip(*self.merge_losses)
            self.metrics['merge_actor_loss'].append(actor_losses)
            self.metrics['merge_value_loss'].append(value_losses)
            Save_Txt(metrics_episodes[-1], actor_losses, 'merge_actor_loss',
                     args.results_dir)
            Save_Txt(metrics_episodes[-1], value_losses, 'merge_value_loss',
                     args.results_dir)

        # NOTE(review): this runs on the parent-side Worker_actor objects;
        # confirm the losses gathered inside the workers are visible here.
        for sub_actor in self.workers_actor:
            sub_actor.save_loss_data(metrics_episodes)
        self.merge_losses = []

    def imagine_merge_ahead(self,
                            prev_state,
                            prev_belief,
                            policy_pool,
                            transition_model,
                            merge_model,
                            planning_horizon=12):
        """Roll the transition model forward under the merged policy.

        Args:
            prev_state: Starting states; the first two dims are flattened.
            prev_belief: Starting beliefs; the first two dims are flattened.
            policy_pool: Iterable of sub-actors exposing ``get_action_mean_std``.
            transition_model: Transition model, possibly wrapped so that the
                real module lives under ``.module`` (used when
                ``args.MultiGPU`` is set and more than one CUDA device exists).
            merge_model: Model exposing ``get_merge_action``.
            planning_horizon: Number of time slots including the start; the
                result holds ``planning_horizon - 1`` imagined steps.

        Returns:
            ``[beliefs, prior_states, prior_means, prior_std_devs]``, each
            stacked along a new leading dim of length ``planning_horizon - 1``
            (the starting step is not included).

        Raises:
            ValueError: If ``planning_horizon`` is smaller than 2, because no
                step could be imagined.
        """
        if planning_horizon < 2:
            raise ValueError(
                'planning_horizon must be at least 2, got %r' %
                (planning_horizon, ))

        # Resolve the wrapped module once instead of on every step.
        if args.MultiGPU and torch.cuda.device_count() > 1:
            dynamics = transition_model.module
        else:
            dynamics = transition_model

        # Hidden states are kept in Python lists (not one preallocated
        # tensor) because autograd does not work with in-place writes.
        beliefs = [self._merge_leading_dims(prev_belief)]
        prior_states = [self._merge_leading_dims(prev_state)]
        prior_means, prior_std_devs = [], []

        for _ in range(planning_horizon - 1):
            belief, state = beliefs[-1], prior_states[-1]
            # The policies only see detached latents, so no gradient flows
            # back into earlier steps through the policy inputs.
            policy_belief, policy_state = belief.detach(), state.detach()

            merge_actions = self._gather_pool_actions(policy_pool,
                                                      policy_belief,
                                                      policy_state)
            actions = merge_model.get_merge_action(merge_actions,
                                                   policy_belief,
                                                   policy_state)

            # Compute belief (deterministic hidden state)
            hidden = dynamics.act_fn(
                dynamics.fc_embed_state_action(
                    torch.cat([state, actions], dim=1)))
            next_belief = dynamics.rnn(hidden, belief)

            # Compute state prior by applying transition dynamics
            hidden = dynamics.act_fn(
                dynamics.fc_embed_belief_prior(next_belief))
            prior_mean, raw_std_dev = torch.chunk(
                dynamics.fc_state_prior(hidden), 2, dim=1)
            prior_std_dev = F.softplus(raw_std_dev) + dynamics.min_std_dev

            # Sample the next state as mean + std * standard-normal noise.
            next_state = prior_mean + prior_std_dev * torch.randn_like(
                prior_mean)

            beliefs.append(next_belief)
            prior_states.append(next_state)
            prior_means.append(prior_mean)
            prior_std_devs.append(prior_std_dev)

        # Drop the starting step from beliefs/states so that all four
        # outputs cover the same imagined steps.
        return [
            torch.stack(beliefs[1:], dim=0),
            torch.stack(prior_states[1:], dim=0),
            torch.stack(prior_means, dim=0),
            torch.stack(prior_std_devs, dim=0)
        ]

    def train_to_eval(self):
        """Switch every pool model and both merge models to eval mode."""
        for actor_model in self.actor_pool:
            actor_model.eval()
        for value_model in self.value_pool:
            value_model.eval()
        self.merge_actor_model.eval()
        self.merge_value_model.eval()

    def eval_to_train(self):
        """Switch every pool model and both merge models back to train mode."""
        for actor_model in self.actor_pool:
            actor_model.train()
        for value_model in self.value_pool:
            value_model.train()
        self.merge_actor_model.train()
        self.merge_value_model.train()
Esempio n. 4
0
    # NOTE(review): the condition is a constant True, so the video and the
    # final-frame PNG are always written here.
    if True:
      # Zero-pad the episode number to the digit width of args.episodes.
      episode_str = str(episode).zfill(len(str(args.episodes)))
      write_video(video_frames, 'test_episode_%s' % episode_str, results_dir)  # Lossy compression
      save_image(torch.as_tensor(video_frames[-1]), os.path.join(results_dir, 'test_episode_%s.png' % episode_str))
    # Persist the metrics object to <results_dir>/metrics.pth.
    torch.save(metrics, os.path.join(results_dir, 'metrics.pth'))
    # Log the latest test reward entry, summed and divided by args.test_episodes,
    # against the step counter scaled by args.action_repeat.
    test_reward_sum = sum(metrics['test_rewards'][-1])
    writer.add_scalar("test/episode_reward", test_reward_sum/args.test_episodes, metrics['steps'][-1]*args.action_repeat)

    # Set models to train mode
    transition_model.train()
    observation_model.train()
    reward_model.train()
    encoder.train()
    # Actor/value models are only switched for the "p2e" and "dreamer"
    # algorithms; the curious pair only for "p2e".
    if args.algo=="p2e" or args.algo=="dreamer":
      actor_model.train()
      value_model.train()
      if args.algo=="p2e":
        curious_actor_model.train()
        curious_value_model.train()
    # Close test environments

  # Log the most recent training reward and loss values. `writer` is
  # presumably a TensorBoard SummaryWriter -- TODO confirm.
  # The same reward is logged under two tags: once against the raw step
  # counter and once against the step counter scaled by args.action_repeat.
  writer.add_scalar("train_reward", metrics['train_rewards'][-1], metrics['steps'][-1])
  writer.add_scalar("train/episode_reward", metrics['train_rewards'][-1], metrics['steps'][-1]*args.action_repeat)
  # Each loss series is read as [-1][0]: first element of the latest entry.
  writer.add_scalar("observation_loss", metrics['observation_loss'][-1][0], metrics['steps'][-1])
  writer.add_scalar("reward_loss", metrics['reward_loss'][-1][0], metrics['steps'][-1])
  writer.add_scalar("kl_loss", metrics['kl_loss'][-1][0], metrics['steps'][-1])
  # NOTE(review): the actor/value/onestep/curious series are read regardless
  # of args.algo, although the matching models are only used for some
  # algorithms above; verify these metrics are always populated.
  writer.add_scalar("actor_loss", metrics['actor_loss'][-1][0], metrics['steps'][-1])
  writer.add_scalar("value_loss", metrics['value_loss'][-1][0], metrics['steps'][-1])
  writer.add_scalar("onestep_loss", metrics['onestep_loss'][-1][0], metrics['steps'][-1]) 
  writer.add_scalar("curious_actor_loss", metrics['curious_actor_loss'][-1][0], metrics['steps'][-1]) 
  writer.add_scalar("curious_value_loss", metrics['curious_value_loss'][-1][0], metrics['steps'][-1])