Code Example #1
File: envs_manager.py Project: tbienhoff/carla-rl
    def __init__(self, venv, nstack, device=None):

        self.venv = venv
        self.nstack = nstack

        wos = venv.observation_space  # wrapped ob space
        wos = obs_to_dict(wos)
        self.stacked_obs = {}
        new_observation_spaces = {}
        self.shape_dim0 = {}
        if device is None:
            device = torch.device('cpu')

        for k in wos.spaces:
            # Frames are stacked along the first (channel) dimension of each sub-observation.
            self.shape_dim0[k] = wos.spaces[k].shape[0]
            low = np.repeat(wos.spaces[k].low, self.nstack, axis=0)
            high = np.repeat(wos.spaces[k].high, self.nstack, axis=0)

            # Rolling buffer holding the last `nstack` frames for every environment.
            self.stacked_obs[k] = torch.zeros((venv.num_envs, ) +
                                              low.shape).to(device)

            new_observation_spaces[k] = gym.spaces.Box(low=low,
                                                       high=high,
                                                       dtype=np.float32)

        if set(new_observation_spaces.keys()) == {None}:
            VecEnvWrapper.__init__(
                self, venv, observation_space=new_observation_spaces[None])
        else:
            VecEnvWrapper.__init__(
                self,
                venv,
                observation_space=gym.spaces.Dict(new_observation_spaces))
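The examples on this page all pass observations through a pair of helpers, obs_to_dict and dict_to_obs, whose definitions are not shown here. Judging only from the call sites (including the {None} check in the constructor above), non-dictionary spaces and observations appear to be wrapped under the single key None. The following is a minimal sketch of helpers with that assumed behavior, not the repository's actual implementation:

import gym


def obs_to_dict(obs):
    """Normalize observations (or observation spaces) to a dict keyed by sensor name.

    Assumed convention, inferred from the call sites above: dict-like inputs pass
    through unchanged, everything else is wrapped under the single key None.
    """
    if isinstance(obs, gym.spaces.Dict):
        return obs                            # already keyed; .spaces maps key -> sub-space
    if isinstance(obs, gym.spaces.Space):
        return gym.spaces.Dict({None: obs})   # plain space wrapped under None
    if isinstance(obs, dict):
        return obs
    return {None: obs}                        # plain array / tensor wrapped under None


def dict_to_obs(obs_dict):
    """Inverse of obs_to_dict: unwrap a single None-keyed entry, otherwise return the dict."""
    if set(obs_dict.keys()) == {None}:
        return obs_dict[None]
    return obs_dict

With helpers like these, the `set(new_observation_spaces.keys()) == {None}` branch above simply detects whether the wrapped env originally exposed a plain (non-Dict) observation space.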
Code Example #2
File: storage.py Project: zhangchi0605/carla-rl-1
    def feed_forward_generator(self, advantages, num_mini_batch):
        num_steps, num_processes = self.rewards.size()[0:2]
        batch_size = num_processes * num_steps
        assert batch_size >= num_mini_batch, (
            "PPO requires the number of processes ({}) "
            "* number of steps ({}) = {} "
            "to be greater than or equal to the number of PPO mini batches ({})."
            "".format(num_processes, num_steps, num_processes * num_steps,
                      num_mini_batch))
        mini_batch_size = batch_size // num_mini_batch
        sampler = BatchSampler(SubsetRandomSampler(range(batch_size)),
                               mini_batch_size,
                               drop_last=False)
        for indices in sampler:
            obs_batch = {}
            self.obs = obs_to_dict(self.obs)
            for k in self.obs:
                obs_batch[k] = self.obs[k][:-1].view(
                    -1, *self.obs[k].size()[2:])[indices]
            self.obs = dict_to_obs(self.obs)
            recurrent_hidden_states_batch = self.recurrent_hidden_states[:-1].view(
                -1, self.recurrent_hidden_states.size(-1))[indices]
            actions_batch = self.actions.view(-1,
                                              self.actions.size(-1))[indices]
            value_preds_batch = self.value_preds[:-1].view(-1, 1)[indices]
            return_batch = self.returns[:-1].view(-1, 1)[indices]
            masks_batch = self.masks[:-1].view(-1, 1)[indices]
            old_action_log_probs_batch = self.action_log_probs.view(-1,
                                                                    1)[indices]
            adv_targ = advantages.view(-1, 1)[indices]

            yield obs_batch, recurrent_hidden_states_batch, actions_batch, \
                value_preds_batch, return_batch, masks_batch, old_action_log_probs_batch, adv_targ
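For context, a PPO update would typically consume this generator once per epoch, re-evaluating each mini-batch under the current policy and applying the clipped surrogate loss. The sketch below illustrates such a consumer; `policy.evaluate_actions`, `optimizer`, `clip_param`, and the loss coefficients are placeholders and assumptions, not names taken from this repository.

import torch


def ppo_update(rollouts, advantages, policy, optimizer,
               clip_param=0.2, ppo_epoch=4, num_mini_batch=8):
    # Hypothetical consumer of feed_forward_generator.
    for _ in range(ppo_epoch):
        generator = rollouts.feed_forward_generator(advantages, num_mini_batch)
        for (obs_batch, recurrent_hidden_states_batch, actions_batch,
             value_preds_batch, return_batch, masks_batch,
             old_action_log_probs_batch, adv_targ) in generator:

            # Re-evaluate the stored actions under the current policy parameters.
            values, action_log_probs, dist_entropy = policy.evaluate_actions(
                obs_batch, recurrent_hidden_states_batch, masks_batch,
                actions_batch)

            # Standard clipped surrogate objective.
            ratio = torch.exp(action_log_probs - old_action_log_probs_batch)
            surr1 = ratio * adv_targ
            surr2 = torch.clamp(ratio, 1.0 - clip_param,
                                1.0 + clip_param) * adv_targ
            action_loss = -torch.min(surr1, surr2).mean()
            value_loss = 0.5 * (return_batch - values).pow(2).mean()

            optimizer.zero_grad()
            (value_loss + action_loss - 0.01 * dist_entropy).backward()
            optimizer.step()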
Code Example #3
File: envs_manager.py Project: tbienhoff/carla-rl
    def step_wait(self):
        obs, reward, done, info = self.venv.step_wait()
        obs = obs_to_dict(obs)
        for k in obs:
            obs[k] = torch.from_numpy(obs[k]).float().to(self.device)
        reward = torch.from_numpy(reward).unsqueeze(dim=1).float()
        return dict_to_obs(obs), reward, done, info
Code Example #4
File: envs_manager.py Project: tbienhoff/carla-rl
    def reset(self):

        obs = self.venv.reset()
        obs = obs_to_dict(obs)
        for k in obs:
            self.stacked_obs[k].zero_()
            self.stacked_obs[k][:, -self.shape_dim0[k]:] = obs[k]
        return dict_to_obs(self.stacked_obs)
Code Example #5
File: storage.py Project: zhangchi0605/carla-rl-1
    def insert(self, obs, recurrent_hidden_states, actions, action_log_probs,
               value_preds, rewards, masks):
        obs = obs_to_dict(obs)
        self.obs = obs_to_dict(self.obs)
        for k in self.obs:
            self.obs[k][self.step + 1].copy_(obs[k])
        self.obs = dict_to_obs(self.obs)
        # self.obs[self.step + 1].copy_(obs)
        self.recurrent_hidden_states[self.step + 1].copy_(recurrent_hidden_states)
        self.actions[self.step].copy_(actions)
        self.action_log_probs[self.step].copy_(action_log_probs)
        self.value_preds[self.step].copy_(value_preds)
        self.rewards[self.step].copy_(rewards)
        self.masks[self.step + 1].copy_(masks)

        self.step = (self.step + 1) % self.num_steps
Code Example #6
File: envs_manager.py Project: tbienhoff/carla-rl
    def step_wait(self):
        obs, rews, news, infos = self.venv.step_wait()
        obs = obs_to_dict(obs)
        for k in obs:
            # Slide the stack left by one frame, dropping the oldest observation.
            self.stacked_obs[k][:, :-self.shape_dim0[k]] = \
                self.stacked_obs[k][:, self.shape_dim0[k]:]
            # Zero the whole stack for environments that just started a new episode.
            for (i, new) in enumerate(news):
                if new:
                    self.stacked_obs[k][i] = 0
            # Write the newest frame into the last slot.
            self.stacked_obs[k][:, -self.shape_dim0[k]:] = obs[k]
        return dict_to_obs(self.stacked_obs), rews, news, infos
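The first two statements in the loop body above implement the frame-stack shift: the buffer slides left along the channel dimension and the newest frame lands in the freed rightmost slot, while environments that just reported `done` have their whole stack zeroed. A standalone toy version of that shift, with made-up sizes and a defensive `.clone()` on the right-hand side, looks like this:

import torch

num_envs, shape_dim0, nstack = 2, 3, 3
stacked = torch.arange(num_envs * shape_dim0 * nstack, dtype=torch.float32)
stacked = stacked.view(num_envs, shape_dim0 * nstack)  # last `nstack` frames per env
new_frame = torch.full((num_envs, shape_dim0), -1.0)   # freshly observed frame

# Drop the oldest frame by sliding everything left by shape_dim0 channels
# (the RHS is cloned here to avoid an overlapping in-place copy).
stacked[:, :-shape_dim0] = stacked[:, shape_dim0:].clone()
# Write the new frame into the now-free rightmost slot.
stacked[:, -shape_dim0:] = new_frame

print(stacked[0])  # tensor([ 3.,  4.,  5.,  6.,  7.,  8., -1., -1., -1.])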
Code Example #7
File: storage.py Project: zhangchi0605/carla-rl-1
    def to(self, device):
        self.obs = obs_to_dict(self.obs)
        for k in self.obs:
            self.obs[k] = self.obs[k].to(device)
        self.obs = dict_to_obs(self.obs)
        self.recurrent_hidden_states = self.recurrent_hidden_states.to(device)
        self.rewards = self.rewards.to(device)
        self.value_preds = self.value_preds.to(device)
        self.returns = self.returns.to(device)
        self.action_log_probs = self.action_log_probs.to(device)
        self.actions = self.actions.to(device)
        self.masks = self.masks.to(device)
Code Example #8
File: storage.py Project: zhangchi0605/carla-rl-1
    def after_update(self):

        self.obs = obs_to_dict(self.obs)

        if self.her:
            for k in self.obs:
                self.obs[k] = self.obs[k][:, :self.num_processes]
            self.masks = self.masks[:, :self.num_processes]
            self.returns = self.returns[:, :self.num_processes]
            self.value_preds = self.value_preds[:, :self.num_processes]
            self.rewards = self.rewards[:, :self.num_processes]
            self.action_log_probs = \
                self.action_log_probs[:, :self.num_processes]
            self.actions = self.actions[:, :self.num_processes]
            self.recurrent_hidden_states = \
                self.recurrent_hidden_states[:, :self.num_processes]

        for k in self.obs:
            self.obs[k][0].copy_(self.obs[k][-1])
        self.obs = dict_to_obs(self.obs)
        self.recurrent_hidden_states[0].copy_(self.recurrent_hidden_states[-1])
        self.masks[0].copy_(self.masks[-1])
Code Example #9
File: envs_manager.py Project: tbienhoff/carla-rl
    def reset(self):
        obs = self.venv.reset()
        obs = obs_to_dict(obs)
        for k in obs:
            obs[k] = torch.from_numpy(obs[k]).float().to(self.device)
        return dict_to_obs(obs)
Code Example #10
def main():
    config = None
    args = get_args()
    config, checkpoint = get_config_and_checkpoint(args)

    set_random_seeds(args, config)
    eval_log_dir = args.save_dir + "_eval"
    try:
        os.makedirs(args.save_dir)
        os.makedirs(eval_log_dir)
    except OSError:
        pass

    now = datetime.datetime.now()
    experiment_name = args.experiment_name + '_' + now.strftime(
        "%Y-%m-%d_%H-%M-%S")

    # Create checkpoint file
    save_dir_model = os.path.join(args.save_dir, 'model', experiment_name)
    save_dir_config = os.path.join(args.save_dir, 'config', experiment_name)
    try:
        os.makedirs(save_dir_model)
        os.makedirs(save_dir_config)
    except OSError as e:
        # The experiment logger is only created further below, so report directly.
        print(e)
        exit()

    if args.config:
        shutil.copy2(args.config, save_dir_config)

    # Tensorboard Logging
    writer = SummaryWriter(
        os.path.join(args.save_dir, 'tensorboard', experiment_name))

    # Logger that writes to STDOUT and a file in the save_dir
    logger = setup_carla_logger(args.save_dir, experiment_name)

    device = torch.device("cuda:0" if args.cuda else "cpu")
    norm_reward = not config.no_reward_norm
    norm_obs = not config.no_obs_norm

    assert not (config.num_virtual_goals > 0) or \
        config.reward_class == 'SparseReward', "Can't use HER with dense reward"
    obs_converter = CarlaObservationConverter(
        h=84, w=84, rel_coord_system=config.rel_coord_system)
    action_converter = CarlaActionsConverter(config.action_type)
    envs = make_vec_envs(obs_converter,
                         action_converter,
                         args.starting_port,
                         config.seed,
                         config.num_processes,
                         config.gamma,
                         device,
                         config.reward_class,
                         num_frame_stack=1,
                         subset=config.experiments_subset,
                         norm_reward=norm_reward,
                         norm_obs=norm_obs,
                         apply_her=config.num_virtual_goals > 0,
                         video_every=args.video_interval,
                         video_dir=os.path.join(args.save_dir, 'video',
                                                experiment_name))

    if config.agent == 'forward':
        agent = agents.ForwardCarla()

    elif config.agent == 'a2c':
        agent = agents.A2CCarla(obs_converter,
                                action_converter,
                                config.value_loss_coef,
                                config.entropy_coef,
                                lr=config.lr,
                                eps=config.eps,
                                alpha=config.alpha,
                                max_grad_norm=config.max_grad_norm)

    elif config.agent == 'acktr':
        agent = agents.A2CCarla(obs_converter,
                                action_converter,
                                config.value_loss_coef,
                                config.entropy_coef,
                                lr=config.lr,
                                eps=config.eps,
                                alpha=config.alpha,
                                max_grad_norm=config.max_grad_norm,
                                acktr=True)

    elif config.agent == 'ppo':
        agent = agents.PPOCarla(obs_converter,
                                action_converter,
                                config.clip_param,
                                config.ppo_epoch,
                                config.num_mini_batch,
                                config.value_loss_coef,
                                config.entropy_coef,
                                lr=config.lr,
                                eps=config.eps,
                                max_grad_norm=config.max_grad_norm)

    if checkpoint is not None:
        load_modules(agent.optimizer, agent.model, checkpoint)

    rollouts = RolloutStorage(config.num_steps, config.num_processes,
                              envs.observation_space, envs.action_space, 20,
                              config.num_virtual_goals,
                              config.rel_coord_system, obs_converter)

    obs = envs.reset()
    # Store the first observation in the rollout buffer
    obs = obs_to_dict(obs)
    rollouts.obs = obs_to_dict(rollouts.obs)
    for k in rollouts.obs:
        rollouts.obs[k][rollouts.step + 1].copy_(obs[k])
    rollouts.obs = dict_to_obs(rollouts.obs)
    rollouts.to(device)

    start = time.time()

    total_steps = 0
    total_episodes = 0
    total_reward = 0

    episode_reward = torch.zeros(config.num_processes)

    for j in range(config.num_updates):

        for step in range(config.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = agent.act(
                    rollouts.get_obs(step),
                    rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, info = envs.step(action)

            # For logging purposes
            carla_rewards = torch.tensor([i['carla-reward'] for i in info],
                                         dtype=torch.float)
            episode_reward += carla_rewards
            total_reward += carla_rewards.sum().item()
            total_steps += config.num_processes

            if done.any():
                total_episodes += done.sum()
                torch_done = torch.tensor(done.astype(int)).byte()
                mean_episode_reward = episode_reward[torch_done].mean().item()
                logger.info('{} episode(s) finished with reward {}'.format(
                    done.sum(), mean_episode_reward))
                writer.add_scalar('train/mean_ep_reward_vs_steps',
                                  mean_episode_reward, total_steps)
                writer.add_scalar('train/mean_ep_reward_vs_episodes',
                                  mean_episode_reward, total_episodes)
                episode_reward[torch_done] = 0

            # If done then clean the history of observations.
            masks = torch.FloatTensor(1 - done)

            rollouts.insert(obs, recurrent_hidden_states,
                            action, action_log_prob, value, reward,
                            masks.unsqueeze(-1))

        if config.num_virtual_goals > 0:
            rollouts.apply_her(config.num_virtual_goals,
                               device,
                               beta=config.beta)

        with torch.no_grad():
            next_value = agent.get_value(
                rollouts.get_obs(-1),  # Get last observation
                rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, config.use_gae, config.gamma,
                                 config.tau)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        if j % args.save_interval == 0 and args.save_dir != "" and config.agent != 'forward':
            save_path = os.path.join(save_dir_model, str(j) + '.pth.tar')
            save_modules(agent.optimizer, agent.model, args, config, save_path)

        total_num_steps = (j + 1) * config.num_processes * config.num_steps

        if j % args.log_interval == 0:

            # Logging to the stdout/our logs
            end = time.time()
            logger.info('------------------------------------')
            logger.info('Episodes {}, Updates {}, num timesteps {}, FPS {}'\
                .format(total_episodes, j + 1, total_num_steps, total_num_steps / (end - start)))
            logger.info('------------------------------------')

            # Logging to tensorboard
            writer.add_scalar('train/cum_reward_vs_steps', total_reward,
                              total_steps)
            writer.add_scalar('train/cum_reward_vs_updates', total_reward,
                              j + 1)

            if config.agent in ['a2c', 'acktr', 'ppo']:
                writer.add_scalar('debug/value_loss_vs_steps', value_loss,
                                  total_steps)
                writer.add_scalar('debug/value_loss_vs_updates', value_loss,
                                  j + 1)
                writer.add_scalar('debug/action_loss_vs_steps', action_loss,
                                  total_steps)
                writer.add_scalar('debug/action_loss_vs_updates', action_loss,
                                  j + 1)
                writer.add_scalar('debug/dist_entropy_vs_steps', dist_entropy,
                                  total_steps)
                writer.add_scalar('debug/dist_entropy_vs_updates',
                                  dist_entropy, j + 1)

            # Sample the last reward
            writer.add_scalar('debug/sampled_normalized_reward_vs_steps',
                              reward.mean(), total_steps)
            writer.add_scalar('debug/sampled_normalized_reward_vs_updates',
                              reward.mean(), j + 1)
            writer.add_scalar('debug/sampled_carla_reward_vs_steps',
                              carla_rewards.mean(), total_steps)
            writer.add_scalar('debug/sampled_carla_reward_vs_updates',
                              carla_rewards.mean(), j + 1)

        if (args.eval_interval is not None and j % args.eval_interval == 0):
            eval_envs = make_vec_envs(args.env_name, args.starting_port,
                                      obs_converter,
                                      args.x + config.num_processes,
                                      config.num_processes, config.gamma,
                                      eval_log_dir, config.add_timestep,
                                      device, True)

            vec_norm = get_vec_normalize(eval_envs)
            if vec_norm is not None:
                vec_norm.ob_rms = get_vec_normalize(envs).ob_rms

            eval_episode_rewards = []

            obs = eval_envs.reset()
            eval_recurrent_hidden_states = torch.zeros(config.num_processes,
                                                       20,
                                                       device=device)
            eval_masks = torch.zeros(config.num_processes, 1, device=device)

            while len(eval_episode_rewards) < 10:
                with torch.no_grad():
                    _, action, _, eval_recurrent_hidden_states = agent.act(
                        obs,
                        eval_recurrent_hidden_states,
                        eval_masks,
                        deterministic=True)

                # Observe reward and next obs
                carla_obs, reward, done, infos = eval_envs.step(action)

                eval_masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                                for done_ in done])
                for info in infos:
                    if 'episode' in info.keys():
                        eval_episode_rewards.append(info['episode']['r'])

            eval_envs.close()

            logger.info(
                " Evaluation using {} episodes: mean reward {:.5f}\n".format(
                    len(eval_episode_rewards), np.mean(eval_episode_rewards)))