Example #1
def evaluate(actor_critic, ob_rms, env_name, seed, num_processes, eval_log_dir,
             device, custom_gym, save_path):
    eval_envs = make_vec_envs(env_name, seed + num_processes, num_processes,
                              None, eval_log_dir, device, True, custom_gym)

    vec_norm = utils.get_vec_normalize(eval_envs)
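    # If the eval envs are wrapped in VecNormalize, freeze them and reuse the
    # running observation statistics (ob_rms) gathered during training.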
    if vec_norm is not None:
        vec_norm.eval()
        vec_norm.ob_rms = ob_rms

    eval_episode_rewards = []
    eval_episode_length = []
    eval_episode_success_rate = []

    obs = eval_envs.reset()
    eval_recurrent_hidden_states = torch.zeros(
        num_processes, actor_critic.recurrent_hidden_state_size, device=device)
    eval_masks = torch.zeros(num_processes, 1, device=device)

    while len(eval_episode_rewards) < num_processes * 10:
        with torch.no_grad():
            _, action, _, eval_recurrent_hidden_states = actor_critic.act(
                obs,
                eval_recurrent_hidden_states,
                eval_masks,
                deterministic=True)

        # Observe reward and next obs
        obs, _, done, infos = eval_envs.step(action)

        eval_masks = torch.tensor([[0.0] if done_ else [1.0]
                                   for done_ in done],
                                  dtype=torch.float32,
                                  device=device)

        for info in infos:
            if 'episode' in info.keys():
                eval_episode_rewards.append(info['episode']['r'])
                eval_episode_length.append(info['episode']['l'])
                eval_episode_success_rate.append(
                    info['was_successful_trajectory'])

    eval_envs.close()

    print(
        " Evaluation using {} episodes: mean reward {:.5f}, mean_length {:.2f}, mean_success {:.2f} \n"
        .format(len(eval_episode_rewards), np.mean(eval_episode_rewards),
                np.mean(eval_episode_length),
                np.mean(eval_episode_success_rate)))
    if actor_critic.max_eval_success_rate <= np.mean(
            eval_episode_success_rate):
        actor_critic.max_eval_success_rate = np.mean(eval_episode_success_rate)
        torch.save([
            actor_critic,
            getattr(utils.get_vec_normalize(eval_envs), 'ob_rms', None)
        ], os.path.join(save_path,
                        str(seed) + "_best_test.pt"))
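A minimal usage sketch (not part of the original example) of how this evaluate helper might be called periodically from a training loop; envs, args, j, eval_log_dir and save_path are assumed to exist as in the training examples further down:

# Hedged sketch: periodic evaluation inside a training loop (assumed names).
if args.eval_interval is not None and j % args.eval_interval == 0:
    ob_rms = utils.get_vec_normalize(envs).ob_rms
    evaluate(actor_critic, ob_rms, args.env_name, args.seed,
             args.num_processes, eval_log_dir, device,
             args.custom_gym, save_path)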
Example #2
def __init__(self, args, actor_critic, device):
    eval_args = args
    #eval_args.render = True
    self.device = device
    #if args.model == 'fractal':
    #    for i in range(-1, args.n_recs):
    #        eval_log_dir = args.log_dir + "_eval_col_{}".format(i)
    #        try:
    #            os.makedirs(eval_log_dir)
    #        except OSError:
    #            files = glob.glob(os.path.join(eval_log_dir, '*.monitor.csv'))
    #            for f in files:
    #                os.remove(f)
    #        setattr(self, 'eval_log_dir_col_{}'.format(i), eval_log_dir)

    self.eval_log_dir = args.log_dir + "_eval"
    try:
        os.makedirs(self.eval_log_dir)
    except OSError:
        files = glob.glob(os.path.join(self.eval_log_dir, '*.monitor.csv'))
        for f in files:
            os.remove(f)
    self.num_eval_processes = 2
    self.eval_envs = make_vec_envs(
        eval_args.env_name, eval_args.seed + self.num_eval_processes,
        self.num_eval_processes, eval_args.gamma, self.eval_log_dir,
        eval_args.add_timestep, self.device, True, args=eval_args)
    self.vec_norm = get_vec_normalize(self.eval_envs)
    if self.vec_norm is not None:
        self.vec_norm.eval()
        self.vec_norm.ob_rms = get_vec_normalize(self.eval_envs).ob_rms
    self.actor_critic = actor_critic
    self.tstart = time.time()
    fieldnames = ['r', 'l', 't']
    if args.model == 'fractal':
        n_cols = actor_critic.base.n_cols
        for i in range(-1, n_cols):
            log_file_col = open('{}/col_{}_eval.csv'.format(self.eval_log_dir, i), mode='w')
            setattr(self, 'log_file_col_{}'.format(i), log_file_col)
            writer_col = csv.DictWriter(log_file_col, fieldnames=fieldnames)
            setattr(self, 'writer_col_{}'.format(i), writer_col)
            writer_col.writeheader()
            log_file_col.flush()
    else:
        self.log_file = open('{}/col_evals.csv'.format(self.eval_log_dir), mode='w')
        self.writer = csv.DictWriter(self.log_file, fieldnames=fieldnames)
        self.writer.writeheader()
        self.log_file.flush()
    self.args = eval_args
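A hedged sketch of how one of the per-column CSV writers created above might record a finished evaluation episode; the method name log_episode and its arguments are hypothetical, only the 'r'/'l'/'t' fieldnames and the writer/log-file attributes come from the snippet:

def log_episode(self, col, ep_reward, ep_length):
    # Hypothetical helper: write one Monitor-style row ('r', 'l', 't') for column `col`.
    writer_col = getattr(self, 'writer_col_{}'.format(col))
    log_file_col = getattr(self, 'log_file_col_{}'.format(col))
    writer_col.writerow({'r': ep_reward, 'l': ep_length,
                         't': round(time.time() - self.tstart, 6)})
    log_file_col.flush()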
Example #3
def evaluate(actor_critic, ob_rms, env_name, seed, num_processes, eval_log_dir,
             device):
    eval_envs = make_vec_envs(env_name, seed + num_processes, num_processes,
                              None, eval_log_dir, device, True)

    vec_norm = utils.get_vec_normalize(eval_envs)
    if vec_norm is not None:
        vec_norm.eval()
        vec_norm.ob_rms = ob_rms

    eval_episode_rewards = []

    obs = eval_envs.reset()
    eval_recurrent_hidden_states = torch.zeros(
        num_processes, actor_critic.recurrent_hidden_state_size, device=device)
    eval_masks = torch.zeros(num_processes, 1, device=device)

    while len(eval_episode_rewards) < 10:
        with torch.no_grad():
            _, action, _, eval_recurrent_hidden_states = actor_critic.act(
                obs,
                eval_recurrent_hidden_states,
                eval_masks,
                deterministic=True)

        # Observe reward and next obs
        obs, _, done, infos = eval_envs.step(action)

        eval_masks = torch.tensor(
            [[0.0] if done_ else [1.0] for done_ in done],
            dtype=torch.float32,
            device=device)

        for info in infos:
            if 'episode' in info.keys():
                eval_episode_rewards.append(info['episode']['r'])

    eval_envs.close()

    print(" Evaluation using {} episodes: mean reward {:.5f}\n".format(
        len(eval_episode_rewards), np.mean(eval_episode_rewards)))
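The zero/one eval_masks built above are what reset the recurrent hidden state at episode boundaries; a minimal, self-contained illustration of the convention a recurrent policy base is typically assumed to follow internally:

import torch

# Illustrative only: a GRU-style base usually multiplies the incoming hidden
# state by the mask, so it is zeroed wherever the previous step ended an episode.
hxs = torch.ones(2, 4)                # pretend hidden states for 2 parallel envs
masks = torch.tensor([[0.0], [1.0]])  # env 0 just finished, env 1 did not
hxs = hxs * masks                     # env 0's hidden state is reset to zeros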
Example #4
def main():
    config = None
    args = get_args()
    config, checkpoint = get_config_and_checkpoint(args)

    set_random_seeds(args, config)
    eval_log_dir = args.save_dir + "_eval"
    try:
        os.makedirs(args.save_dir)
        os.makedirs(eval_log_dir)
    except OSError:
        pass

    now = datetime.datetime.now()
    experiment_name = args.experiment_name + '_' + now.strftime(
        "%Y-%m-%d_%H-%M-%S")

    # Logger that writes to STDOUT and a file in the save_dir (created before
    # the checkpoint directories so it is available in the except-clause below)
    logger = setup_carla_logger(args.save_dir, experiment_name)

    # Create checkpoint file
    save_dir_model = os.path.join(args.save_dir, 'model', experiment_name)
    save_dir_config = os.path.join(args.save_dir, 'config', experiment_name)
    try:
        os.makedirs(save_dir_model)
        os.makedirs(save_dir_config)
    except OSError as e:
        logger.error(e)
        exit()

    if args.config:
        shutil.copy2(args.config, save_dir_config)

    # Tensorboard Logging
    writer = SummaryWriter(
        os.path.join(args.save_dir, 'tensorboard', experiment_name))

    device = torch.device("cuda:0" if args.cuda else "cpu")
    norm_reward = not config.no_reward_norm
    norm_obs = not config.no_obs_norm

    assert not (config.num_virtual_goals > 0) or (
        config.reward_class
        == 'SparseReward'), 'Cannot use HER with dense reward'
    obs_converter = CarlaObservationConverter(
        h=84, w=84, rel_coord_system=config.rel_coord_system)
    action_converter = CarlaActionsConverter(config.action_type)
    envs = make_vec_envs(obs_converter,
                         action_converter,
                         args.starting_port,
                         config.seed,
                         config.num_processes,
                         config.gamma,
                         device,
                         config.reward_class,
                         num_frame_stack=1,
                         subset=config.experiments_subset,
                         norm_reward=norm_reward,
                         norm_obs=norm_obs,
                         apply_her=config.num_virtual_goals > 0,
                         video_every=args.video_interval,
                         video_dir=os.path.join(args.save_dir, 'video',
                                                experiment_name))

    if config.agent == 'forward':
        agent = agents.ForwardCarla()

    if config.agent == 'a2c':
        agent = agents.A2CCarla(obs_converter,
                                action_converter,
                                config.value_loss_coef,
                                config.entropy_coef,
                                lr=config.lr,
                                eps=config.eps,
                                alpha=config.alpha,
                                max_grad_norm=config.max_grad_norm)

    elif config.agent == 'acktr':
        agent = agents.A2CCarla(obs_converter,
                                action_converter,
                                config.value_loss_coef,
                                config.entropy_coef,
                                lr=config.lr,
                                eps=config.eps,
                                alpha=config.alpha,
                                max_grad_norm=config.max_grad_norm,
                                acktr=True)

    elif config.agent == 'ppo':
        agent = agents.PPOCarla(obs_converter,
                                action_converter,
                                config.clip_param,
                                config.ppo_epoch,
                                config.num_mini_batch,
                                config.value_loss_coef,
                                config.entropy_coef,
                                lr=config.lr,
                                eps=config.eps,
                                max_grad_norm=config.max_grad_norm)

    if checkpoint is not None:
        load_modules(agent.optimizer, agent.model, checkpoint)

    rollouts = RolloutStorage(config.num_steps, config.num_processes,
                              envs.observation_space, envs.action_space, 20,
                              config.num_virtual_goals,
                              config.rel_coord_system, obs_converter)

    obs = envs.reset()
    # Save the first observation
    obs = obs_to_dict(obs)
    rollouts.obs = obs_to_dict(rollouts.obs)
    for k in rollouts.obs:
        rollouts.obs[k][rollouts.step + 1].copy_(obs[k])
    rollouts.obs = dict_to_obs(rollouts.obs)
    rollouts.to(device)

    start = time.time()

    total_steps = 0
    total_episodes = 0
    total_reward = 0

    episode_reward = torch.zeros(config.num_processes)

    for j in range(config.num_updates):

        for step in range(config.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = agent.act(
                    rollouts.get_obs(step),
                    rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, info = envs.step(action)

            # For logging purposes
            carla_rewards = torch.tensor([i['carla-reward'] for i in info],
                                         dtype=torch.float)
            episode_reward += carla_rewards
            total_reward += carla_rewards.sum().item()
            total_steps += config.num_processes

            if done.any():
                total_episodes += done.sum()
                torch_done = torch.tensor(done.astype(int)).byte()
                mean_episode_reward = episode_reward[torch_done].mean().item()
                logger.info('{} episode(s) finished with reward {}'.format(
                    done.sum(), mean_episode_reward))
                writer.add_scalar('train/mean_ep_reward_vs_steps',
                                  mean_episode_reward, total_steps)
                writer.add_scalar('train/mean_ep_reward_vs_episodes',
                                  mean_episode_reward, total_episodes)
                episode_reward[torch_done] = 0

            # If done then clean the history of observations.
            masks = torch.FloatTensor(1 - done)

            rollouts.insert(obs, recurrent_hidden_states,
                            action, action_log_prob, value, reward,
                            masks.unsqueeze(-1))

        if config.num_virtual_goals > 0:
            rollouts.apply_her(config.num_virtual_goals,
                               device,
                               beta=config.beta)

        with torch.no_grad():
            next_value = agent.get_value(
                rollouts.get_obs(-1),  # Get last observation
                rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, config.use_gae, config.gamma,
                                 config.tau)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        if j % args.save_interval == 0 and args.save_dir != "" and config.agent != 'forward':
            save_path = os.path.join(save_dir_model, str(j) + '.pth.tar')
            save_modules(agent.optimizer, agent.model, args, config, save_path)

        total_num_steps = (j + 1) * config.num_processes * config.num_steps

        if j % args.log_interval == 0:

            # Logging to the stdout/our logs
            end = time.time()
            logger.info('------------------------------------')
            logger.info('Episodes {}, Updates {}, num timesteps {}, FPS {}'\
                .format(total_episodes, j + 1, total_num_steps, total_num_steps / (end - start)))
            logger.info('------------------------------------')

            # Logging to tensorboard
            writer.add_scalar('train/cum_reward_vs_steps', total_reward,
                              total_steps)
            writer.add_scalar('train/cum_reward_vs_updates', total_reward,
                              j + 1)

            if config.agent in ['a2c', 'acktr', 'ppo']:
                writer.add_scalar('debug/value_loss_vs_steps', value_loss,
                                  total_steps)
                writer.add_scalar('debug/value_loss_vs_updates', value_loss,
                                  j + 1)
                writer.add_scalar('debug/action_loss_vs_steps', action_loss,
                                  total_steps)
                writer.add_scalar('debug/action_loss_vs_updates', action_loss,
                                  j + 1)
                writer.add_scalar('debug/dist_entropy_vs_steps', dist_entropy,
                                  total_steps)
                writer.add_scalar('debug/dist_entropy_vs_updates',
                                  dist_entropy, j + 1)

            # Sample the last reward
            writer.add_scalar('debug/sampled_normalized_reward_vs_steps',
                              reward.mean(), total_steps)
            writer.add_scalar('debug/sampled_normalized_reward_vs_updates',
                              reward.mean(), j + 1)
            writer.add_scalar('debug/sampled_carla_reward_vs_steps',
                              carla_rewards.mean(), total_steps)
            writer.add_scalar('debug/sampled_carla_reward_vs_updates',
                              carla_rewards.mean(), j + 1)

        if (args.eval_interval is not None and j % args.eval_interval == 0):
            eval_envs = make_vec_envs(args.env_name, args.starting_port,
                                      obs_converter,
                                      args.x + config.num_processes,
                                      config.num_processes, config.gamma,
                                      eval_log_dir, config.add_timestep,
                                      device, True)

            vec_norm = get_vec_normalize(eval_envs)
            if vec_norm is not None:
                vec_norm.ob_rms = get_vec_normalize(envs).ob_rms

            eval_episode_rewards = []

            obs = eval_envs.reset()
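            # NOTE: the hidden-state size of 20 below mirrors the hard-coded
            # value passed to RolloutStorage earlier in this function.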
            eval_recurrent_hidden_states = torch.zeros(config.num_processes,
                                                       20,
                                                       device=device)
            eval_masks = torch.zeros(config.num_processes, 1, device=device)

            while len(eval_episode_rewards) < 10:
                with torch.no_grad():
                    _, action, _, eval_recurrent_hidden_states = agent.act(
                        obs,
                        eval_recurrent_hidden_states,
                        eval_masks,
                        deterministic=True)

                # Observe reward and next obs
                carla_obs, reward, done, infos = eval_envs.step(action)

                eval_masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                                for done_ in done])
                for info in infos:
                    if 'episode' in info.keys():
                        eval_episode_rewards.append(info['episode']['r'])

            eval_envs.close()

            logger.info(
                " Evaluation using {} episodes: mean reward {:.5f}\n".format(
                    len(eval_episode_rewards), np.mean(eval_episode_rewards)))
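Example #4 builds its masks directly from the vectorised done array (torch.FloatTensor(1 - done) plus an unsqueeze); a small self-contained check that this matches the list-comprehension form used in the other examples:

import numpy as np
import torch

done = np.array([True, False, False, True])   # one flag per worker process
masks_a = torch.FloatTensor(1 - done)                                # Example #4 style
masks_b = torch.FloatTensor([[0.0] if d else [1.0] for d in done])   # style used elsewhere
assert torch.equal(masks_a.unsqueeze(-1), masks_b)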
Example #5
def main():
    args = get_args()

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    if args.cuda and torch.cuda.is_available() and args.cuda_deterministic:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    log_dir = os.path.expanduser(args.log_dir)
    eval_log_dir = log_dir + "_eval"
    utils.cleanup_log_dir(log_dir)
    utils.cleanup_log_dir(eval_log_dir)

    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, device, False,
                         args.custom_gym)

    base = SEVN

    actor_critic = Policy(envs.observation_space.shape,
                          envs.action_space,
                          base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic.to(device)

    if args.algo == 'ppo':
        agent = PPO(actor_critic,
                    args.clip_param,
                    args.ppo_epoch,
                    args.num_mini_batch,
                    args.value_loss_coef,
                    args.entropy_coef,
                    lr=args.lr,
                    eps=args.eps,
                    max_grad_norm=args.max_grad_norm)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)
    episode_length = deque(maxlen=10)
    episode_success_rate = deque(maxlen=100)
    episode_total = 0

    start = time.time()
    num_updates = int(
        args.num_env_steps) // args.num_steps // args.num_processes
    for j in range(num_updates):

        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            utils.update_linear_schedule(agent.optimizer, j, num_updates,
                                         args.lr)

        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])
                    episode_length.append(info['episode']['l'])
                    episode_success_rate.append(
                        info['was_successful_trajectory'])
                    episode_total += 1

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks, bad_masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.gae_lambda, args.use_proper_time_limits)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        # save for every interval-th episode or for the last epoch
        if (j % args.save_interval == 0
                or j == num_updates - 1) and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            torch.save([
                actor_critic,
                getattr(utils.get_vec_normalize(envs), 'ob_rms', None)
            ], os.path.join(save_path, args.env_name + ".pt"))

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            end = time.time()
            writer.add_scalars('Train/Episode Reward', {
                "Reward Mean": np.mean(episode_rewards),
                "Reward Min": np.min(episode_rewards),
                "Reward Max": np.max(episode_rewards)
            },
                               global_step=total_num_steps)
            writer.add_scalars('Train/Episode Length', {
                "Episode Length Mean": np.mean(episode_length),
                "Episode Length Min": np.min(episode_length),
                "Episode Length Max": np.max(episode_length)
            },
                               global_step=total_num_steps)
            writer.add_scalar("Train/Episode Reward Mean",
                              np.mean(episode_rewards),
                              global_step=total_num_steps)
            writer.add_scalar("Train/Episode Length Mean",
                              np.mean(episode_length),
                              global_step=total_num_steps)
            writer.add_scalar("Train/Episode Success Rate",
                              np.mean(episode_success_rate),
                              global_step=total_num_steps)

            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n dist_entropy {:.1f}, value/action loss {:.1f}/{:.1f}\n"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), dist_entropy, value_loss,
                        action_loss))

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            ob_rms = utils.get_vec_normalize(envs).ob_rms
            evaluate(actor_critic, ob_rms, args.env_name, args.seed,
                     args.num_processes, eval_log_dir, device)
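Example #5 logs to a writer that is never constructed in the snippet; a minimal sketch of the TensorBoard setup it appears to assume (the exact log directory is illustrative):

from torch.utils.tensorboard import SummaryWriter

# Assumed to be created near the top of main(), before the update loop;
# `log_dir` is the directory expanded from args.log_dir above.
writer = SummaryWriter(log_dir=os.path.join(log_dir, 'tensorboard'))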
Example #6
def main():
    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, args.add_timestep, device,
                         False)

    actor_critic = Policy(envs.observation_space.shape,
                          envs.action_space,
                          base_kwargs={'recurrent': args.recurrent_policy})
    average_actor_critic = Policy(
        envs.observation_space.shape,
        envs.action_space,
        base_kwargs={'recurrent': args.recurrent_policy})
    average_actor_critic.load_state_dict(actor_critic.state_dict())
    actor_critic.to(device)
    average_actor_critic.to(device)

    agent = algo.ACER_AGENT(actor_critic,
                            average_actor_critic,
                            args.value_loss_coef,
                            args.entropy_coef,
                            args.gamma,
                            args.clip,
                            args.no_trust_region,
                            args.alpha,
                            args.delta,
                            lr=args.lr,
                            eps=args.eps,
                            rms_alpha=args.rms_alpha,
                            max_grad_norm=args.max_grad_norm)

    buffer = Buffer(args.num_steps, args.num_processes,
                    envs.observation_space.shape, envs.action_space,
                    actor_critic.recurrent_hidden_state_size, args.buffer_size)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    off_rollouts = RolloutStorage(args.num_steps, args.num_processes,
                                  envs.observation_space.shape,
                                  envs.action_space,
                                  actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)
    off_rollouts.to(device)

    episode_rewards = deque(maxlen=10)

    acer = algo.ACER(actor_critic, rollouts, off_rollouts, buffer,
                     episode_rewards, agent, envs)

    start = time.time()
    for j in range(num_updates):
        # On-policy ACER
        value_loss, action_loss, dist_entropy = acer.call(on_policy=True)
        if args.replay_ratio > 0 and buffer.has_atleast(args.replay_start):
            # Off-policy ACER
            n = np.random.poisson(args.replay_ratio)
            for _ in range(n):
                acer.call(on_policy=False)

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            save_model = [
                save_model,
                getattr(get_vec_normalize(envs), 'ob_rms', None)
            ]

            torch.save(save_model,
                       os.path.join(save_path, args.env_name + ".pt"))

        total_num_steps = (j + 1) * args.num_processes * args.num_steps

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            end = time.time()
            print(
                "Updates {}, num timesteps {}, FPS {} \nLast {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\ndist_entropy {:.1f}, value/action loss {:.1f}/{:.1f}\n"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), dist_entropy, value_loss,
                        action_loss))

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            eval_envs = make_vec_envs(args.env_name,
                                      args.seed + args.num_processes,
                                      args.num_processes, args.gamma,
                                      eval_log_dir, args.add_timestep, device,
                                      True)

            eval_episode_rewards = []

            obs = eval_envs.reset().to(device)
            eval_recurrent_hidden_states = torch.zeros(
                args.num_processes,
                actor_critic.recurrent_hidden_state_size,
                device=device)
            eval_masks = torch.zeros(args.num_processes, 1, device=device)

            while len(eval_episode_rewards) < 10:
                with torch.no_grad():
                    _, _, _, action, _, eval_recurrent_hidden_states = actor_critic.act(
                        obs,
                        eval_recurrent_hidden_states,
                        eval_masks,
                        deterministic=True)

                # Observe reward and next obs
                obs, _, done, infos = eval_envs.step(action)

                obs = obs.to(device)
                eval_masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                                for done_ in done]).to(device)
                for info in infos:
                    if 'episode' in info.keys():
                        eval_episode_rewards.append(info['episode']['r'])

            eval_envs.close()

            print(" Evaluation using {} episodes: mean reward {:.5f}\n".format(
                len(eval_episode_rewards), np.mean(eval_episode_rewards)))
Example #7
def main():
    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args_iko.cuda else "cpu")

    if args_iko.vis:
        from visdom import Visdom
        viz = Visdom(port=args_iko.port)
        win = None

    envs = make_vec_envs(args_iko.env_name, args_iko.seed,
                         args_iko.num_processes, args_iko.gamma,
                         args_iko.log_dir, args_iko.add_timestep, device,
                         False)

    actor_critic = Policy(envs.observation_space.shape,
                          envs.action_space,
                          base_kwargs={'recurrent': args_iko.recurrent_policy})
    actor_critic.to(device)

    action_shape = 3
    reward_model = RewardModel(11 * 11 * 6, 1, 64, 64)
    reward_model.to(device)

    if args_iko.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args_iko.value_loss_coef,
                               args_iko.entropy_coef,
                               lr=args_iko.lr,
                               eps=args_iko.eps,
                               alpha=args_iko.alpha,
                               max_grad_norm=args_iko.max_grad_norm)
    elif args_iko.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args_iko.clip_param,
                         args_iko.ppo_epoch,
                         args_iko.num_mini_batch,
                         args_iko.value_loss_coef,
                         args_iko.entropy_coef,
                         args_iko.use_singh,
                         reward_model,
                         lr=args_iko.lr,
                         eps=args_iko.eps,
                         max_grad_norm=args_iko.max_grad_norm)
    elif args_iko.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args_iko.value_loss_coef,
                               args_iko.entropy_coef,
                               acktr=True)

    rollouts = RolloutStorage(args_iko.num_steps, args_iko.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)

    start = time.time()
    for j in range(num_updates):

        if args_iko.use_linear_lr_decay:
            # decrease learning rate linearly
            if args_iko.algo == "acktr":
                # use optimizer's learning rate since it's hard-coded in kfac.py
                update_linear_schedule(agent.optimizer, j, num_updates,
                                       agent.optimizer.lr)
            else:
                update_linear_schedule(agent.optimizer, j, num_updates,
                                       args_iko.lr)

        if args_iko.algo == 'ppo' and args_iko.use_linear_clip_decay:
            agent.clip_param = args_iko.clip_param * (1 -
                                                      j / float(num_updates))

        reward_train = []
        reward_block_penalty = []
        reward_bel_gt = []
        reward_bel_gt_nonlog = []
        reward_infogain = []
        reward_bel_ent = []
        reward_hit = []
        reward_dist = []
        reward_inv_dist = []

        for step in range(args_iko.num_steps):
            # Sample actions
            # print(step, args_iko.num_steps)
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)
            reward_train.append(reward)
            # print("infos is ", infos)
            # reward_b.append(infos[0]['auxillary_reward'])
            # print("infos is ",infos[0]['auxillary_reward'])
            reward_block_penalty.append(infos[0]['reward_block_penalty'])
            reward_bel_gt.append(infos[0]['reward_bel_gt'])
            reward_bel_gt_nonlog.append(infos[0]['reward_bel_gt_nonlog'])
            reward_infogain.append(infos[0]['reward_infogain'])
            reward_bel_ent.append(infos[0]['reward_bel_ent'])
            reward_hit.append(infos[0]['reward_hit'])
            reward_dist.append(infos[0]['reward_dist'])
            reward_inv_dist.append(infos[0]['reward_inv_dist'])
            # print(reward)

            reward.to(device)
            reward_model.to(device)
            if args_iko.use_singh:
                # print("using learning IR")
                my_reward = reward_model(obs.clone().to(device),
                                         action.clone().float()).detach()
                my_reward.to(device)
                reward = reward + args_iko.singh_coef * my_reward.type(
                    torch.FloatTensor)

            # for info in infos:
            #     if 'episode' in info.keys():
            #         episode_rewards.append(info['episode']['r'])
            #         print("infos is ",infos[0]['auxillary_reward'])
            #         print("info is",info['episode']['r'] )

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks)

        # print("mean reward_a", np.mean(reward_train))
        # print("mean reward_block_penalty", np.mean(reward_block_penalty))
        # print("mean reward_bel_gt", np.mean(reward_bel_gt))
        # print("mean reward_bel_gt_nonlog", np.mean(reward_bel_gt_nonlog))
        # print("mean reward_infogain", np.mean(reward_infogain))
        # print("mean reward_bel_ent", np.mean(reward_bel_ent))
        # print("mean reward_hit", np.mean(reward_hit))
        # print("mean reward_dist", np.mean(reward_dist))
        # print("mean reward_inv_dist", np.mean(reward_inv_dist))

        total_num_steps = (j + 1) * args_iko.num_processes * args_iko.num_steps
        writer.add_scalar('mean_reward_train', np.mean(reward_train),
                          total_num_steps)
        writer.add_scalar('mean_reward_block_penalty',
                          np.mean(reward_block_penalty), total_num_steps)
        writer.add_scalar('mean_reward_bel_gt', np.mean(reward_bel_gt),
                          total_num_steps)
        writer.add_scalar('mean_reward_bel_gt_nonlog',
                          np.mean(reward_bel_gt_nonlog), total_num_steps)
        writer.add_scalar('mean_reward_infogain', np.mean(reward_infogain),
                          total_num_steps)
        writer.add_scalar('mean_reward_bel_ent', np.mean(reward_bel_ent),
                          total_num_steps)
        writer.add_scalar('mean_reward_hit', np.mean(reward_hit),
                          total_num_steps)
        writer.add_scalar('mean_reward_dist', np.mean(reward_dist),
                          total_num_steps)
        writer.add_scalar('mean_reward_inv_dist', np.mean(reward_inv_dist),
                          total_num_steps)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args_iko.use_gae, args_iko.gamma,
                                 args_iko.tau)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        # save for every interval-th episode or for the last epoch
        if (j % args_iko.save_interval == 0
                or j == num_updates - 1) and args_iko.save_dir != "":
            save_path = os.path.join(args_iko.save_dir, args_iko.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args_iko.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            save_model = [
                save_model,
                getattr(get_vec_normalize(envs), 'ob_rms', None)
            ]

            torch.save(
                save_model,
                os.path.join(
                    save_path, 'ugl' + str(args_iko.use_gt_likelihood) +
                    'block-pen-' + str(args_iko.penalty_for_block) + '_' +
                    'explore-' + str(args_iko.rew_explore) + '_' + 'bel-new-' +
                    str(args_iko.rew_bel_new) + '_' + 'bel-ent-' +
                    str(args_iko.rew_bel_ent) + '_' + 'infogain-' +
                    str(args_iko.rew_infogain) + '_' + 'bel-gt-nolog-' +
                    str(args_iko.rew_bel_gt_nonlog) + '_' + 'bel-gt-' +
                    str(args_iko.rew_bel_gt) + '_' + 'dist-' +
                    str(args_iko.rew_dist) + '_' + 'hit-' +
                    str(args_iko.rew_hit) + '_' + 'inv-dist-' +
                    str(args_iko.rew_inv_dist) + args_iko.algo + ".pt"))

        total_num_steps = (j + 1) * args_iko.num_processes * args_iko.num_steps

        if j % args_iko.log_interval == 0 and len(episode_rewards) > 1:
            end = time.time()
            print("mean reward_a", np.mean(reward_a))
            print("mean_reward_b", np.mean(reward_b))
            # print("Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n".
            #     format(j, total_num_steps,
            #            int(total_num_steps / (end - start)),
            #            len(episode_rewards),
            #            np.mean(episode_rewards),
            #            np.median(episode_rewards),
            #            np.min(episode_rewards),
            #            np.max(episode_rewards), dist_entropy,
            #            value_loss, action_loss))
            # writer.add_scalar('mean_reward', np.mean(episode_rewards), total_num_steps)
            # writer.add_scalar('min_reward', np.min(episode_rewards), total_num_steps)
            # writer.add_scalar('max_reward', np.max(episode_rewards), total_num_steps)
            # writer.add_scalar('success_rate', np.mean(episode_successes), total_num_steps)

        if (args_iko.eval_interval is not None and len(episode_rewards) > 1
                and j % args_iko.eval_interval == 0):
            eval_envs = make_vec_envs(args_iko.env_name,
                                      args_iko.seed + args_iko.num_processes,
                                      args_iko.num_processes, args_iko.gamma,
                                      eval_log_dir, args_iko.add_timestep,
                                      device, True)

            vec_norm = get_vec_normalize(eval_envs)
            if vec_norm is not None:
                vec_norm.eval()
                vec_norm.ob_rms = get_vec_normalize(envs).ob_rms

            eval_episode_rewards = []

            obs = eval_envs.reset()
            eval_recurrent_hidden_states = torch.zeros(
                args_iko.num_processes,
                actor_critic.recurrent_hidden_state_size,
                device=device)
            eval_masks = torch.zeros(args_iko.num_processes, 1, device=device)

            while len(eval_episode_rewards) < 10:
                with torch.no_grad():
                    _, action, _, eval_recurrent_hidden_states = actor_critic.act(
                        obs,
                        eval_recurrent_hidden_states,
                        eval_masks,
                        deterministic=True)

                # Observe reward and next obs
                obs, reward, done, infos = eval_envs.step(action)

                eval_masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                                for done_ in done])
                for info in infos:
                    if 'episode' in info.keys():
                        eval_episode_rewards.append(info['episode']['r'])

            eval_envs.close()

            print(" Evaluation using {} episodes: mean reward {:.5f}\n".format(
                len(eval_episode_rewards), np.mean(eval_episode_rewards)))

        if args_iko.vis and j % args_iko.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args_iko.log_dir,
                                  args_iko.env_name, args_iko.algo,
                                  args_iko.num_env_steps)
            except IOError:
                pass
    writer.close()
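When use_singh is enabled, Example #7 shapes the environment reward with a learned intrinsic reward from reward_model; a condensed sketch of that shaping step with the device handling made explicit (all names as assumed in the snippet above):

# Hedged sketch of the intrinsic-reward shaping used in Example #7.
with torch.no_grad():
    intrinsic = reward_model(obs.to(device), action.float().to(device))
# Keep the shaped reward on CPU, like the reward tensor returned by the vec envs.
reward = reward + args_iko.singh_coef * intrinsic.cpu().float()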
Example #8
def main():
  device = 'cpu'
  acc_steps = []
  acc_scores = []
  torch.set_num_threads(1)

  envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                       args.gamma, args.log_dir, args.add_timestep,
                       device, False)

  # get cloned policy and recovered reward function
  policy_reward_dir = args.rewards_dir
  policy_dir = args.policies_dir

  policy_reward = Policy(envs.observation_space.shape, envs.action_space)

  policy_reward_file_name = policy_reward_dir + '/reward_' + args.expe + '.pth'
  policy_reward_sd = torch.load(policy_reward_file_name)
  policy_reward.load_state_dict(policy_reward_sd)

  actor_critic = Policy(envs.observation_space.shape, envs.action_space)

  policy_file_name = policy_dir + '/last_policy_' + args.expe + '.pth'
  policy_sd = torch.load(policy_file_name)
  actor_critic.load_state_dict(policy_sd)
  actor_critic.to(device)

  agent = PPO(actor_critic, args.clip_param, args.ppo_epoch,
              args.num_mini_batch, args.value_loss_coef, args.entropy_coef,
              lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm)

  rollouts = RolloutStorage(args.num_steps, args.num_processes,
                            envs.observation_space.shape, envs.action_space)

  obs = envs.reset()
  rollouts.obs[0].copy_(obs)
  rollouts.to(device)

  episode_rewards = collections.deque(maxlen=10)

  for j in range(num_updates):

    if args.use_linear_lr_decay:
      # decrease learning rate linearly
      update_linear_schedule(agent.optimizer, j, num_updates, args.lr)
      agent.clip_param = args.clip_param  * (1 - j / float(num_updates))

    for step in range(args.num_steps):
      # Sample actions
      with torch.no_grad():
        value, action, action_log_prob = actor_critic.act(
            rollouts.obs[step],
            rollouts.masks[step])

      obs, _, done, infos = envs.step(action)
      if step > 1 and step % 1000 == 0:
        # Force an episode boundary in every process; `done` must stay an
        # array so the mask comprehension below can iterate over it.
        done = np.ones_like(done, dtype=bool)

      # use infered reward:
      with torch.no_grad():
        # _, reward = shapes(rollouts.obs[step], 0)
        _, action_log_probs, _, _ = policy_reward.evaluate_actions(
            rollouts.obs[step], None, None, action)
        reward = action_log_probs

      for info in infos:
        # if 'episode' in info.keys():
        #  episode_rewards.append(info['episode']['r'])
        r = 0
        for key, val in info.items():
          if 'reward' in key:
            r += val
        episode_rewards.append(r)

      # If done then clean the history of observations.
      masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                 for done_ in done])

      rollouts.insert(obs, action, action_log_prob,
                      value, reward, masks)

    with torch.no_grad():
      next_value = actor_critic.get_value(rollouts.obs[-1],
                                          rollouts.masks[-1]).detach()

    rollouts.compute_returns(next_value, args.gamma, args.tau)

    value_loss, action_loss, dist_entropy = agent.update(rollouts)

    rollouts.after_update()

    # save for every interval-th episode or for the last epoch
    if (j % args.save_interval == 0 or j == num_updates - 1) and args.save_dir:
      save_path = os.path.join(args.save_dir, 'ppo')
      try:
        os.makedirs(save_path)
      except OSError:
        pass

      # A really ugly way to save a model to CPU
      save_model = actor_critic

      save_model = [save_model,
                    getattr(get_vec_normalize(envs), 'ob_rms', None)]

      torch.save(save_model, os.path.join(save_path, args.env_name + '.pt'))

    total_num_steps = (j + 1) * args.num_processes * args.num_steps

    if j % args.log_interval == 0 and len(episode_rewards) > 1:
      print('Updates', j,
            'num timesteps', total_num_steps,
            '\n Last training episodes: mean/median reward',
            '{:.1f}'.format(np.mean(episode_rewards)),
            '/{:.1f}'.format(np.median(episode_rewards)),
            'min/max reward',
            '{:.1f}'.format(np.min(episode_rewards)),
            '/{:.1f}'.format(np.max(episode_rewards)),
            'dist entropy', dist_entropy,
            'value loss', value_loss,
            'action loss', action_loss)

    if len(episode_rewards) > 1:
      acc_steps.append(total_num_steps)
      acc_scores.append(np.mean(episode_rewards))

    if (args.eval_interval is not None
        and len(episode_rewards) > 1
        and j % args.eval_interval == 0):
      eval_envs = make_vec_envs(args.env_name, args.seed + args.num_processes,
                                args.num_processes, args.gamma, eval_log_dir,
                                args.add_timestep, device, True)

      vec_norm = get_vec_normalize(eval_envs)
      if vec_norm is not None:
        vec_norm.eval()
        vec_norm.ob_rms = get_vec_normalize(envs).ob_rms

      eval_episode_rewards = []

      obs = eval_envs.reset()
      eval_masks = torch.zeros(args.num_processes, 1, device=device)

      while len(eval_episode_rewards) < 10:
        with torch.no_grad():
          _, action, _ = actor_critic.act(
              obs, eval_masks, deterministic=True)

        # Observe reward and next obs
        obs, reward, done, infos = eval_envs.step(action)

        eval_masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                        for done_ in done])
        for info in infos:
          if 'episode' in info.keys():
            eval_episode_rewards.append(info['episode']['r'])

      eval_envs.close()

      print('Evaluation using',
            len(eval_episode_rewards),
            'episodes: mean reward',
            '{:.5f}\n'.format(np.mean(eval_episode_rewards)))

  scores_file_name = args.scores_dir + '/learner_scores_' + args.expe + '.npy'
  steps_file_name = args.scores_dir + '/learner_steps_' + args.expe + '.npy'
  np.save(scores_file_name, np.array(acc_scores))
  np.save(steps_file_name, np.array(acc_steps))
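The learner score and step arrays saved above can be reloaded later, e.g. for plotting; a small sketch using the same file-name scheme (args.scores_dir and args.expe as in the snippet):

import numpy as np

# Read back the curves written by np.save above.
acc_scores = np.load(args.scores_dir + '/learner_scores_' + args.expe + '.npy')
acc_steps = np.load(args.scores_dir + '/learner_steps_' + args.expe + '.npy')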
Example #9
    def __init__(self, args, im_log_dir):
        self.im_log_dir = im_log_dir
        self.log_dir = args.load_dir
        env_name = args.env_name
        if torch.cuda.is_available() and not args.no_cuda:
            args.cuda = True
            device = torch.device('cuda')
            map_location = torch.device('cuda')
        else:
            args.cuda = False
            device = torch.device('cpu')
            map_location = torch.device('cpu')
        try:
            checkpoint = torch.load(os.path.join(args.load_dir, env_name + '.tar'),
                                    map_location=map_location)
        except FileNotFoundError:
            print('load-dir does not start with valid gym environment id, using command line args')
            env_name = args.env_name
            checkpoint = torch.load(os.path.join(args.load_dir, env_name + '.tar'),
                                map_location=map_location)
        saved_args = checkpoint['args']
        past_frames = checkpoint['n_frames']
        args.past_frames = past_frames
        env_name = saved_args.env_name

        if 'Micropolis' in env_name:
            args.power_puzzle = saved_args.power_puzzle

        if not args.evaluate and not 'GoLMulti' in env_name:
            # assume we just want to observe/interact w/ a single env.
            args.num_proc = 1
        dummy_args = args
        envs = make_vec_envs(env_name, args.seed + 1000, args.num_processes,
                            None, args.load_dir, args.add_timestep, device=device,
                            allow_early_resets=False,
                            args=dummy_args)
        print(args.load_dir)

        if isinstance(envs.observation_space, gym.spaces.Discrete):
            in_w = 1
            in_h = 1
            num_inputs = envs.observation_space.n
        elif isinstance(envs.observation_space, gym.spaces.Box):
            if len(envs.observation_space.shape) == 3:
                in_w = envs.observation_space.shape[1]
                in_h = envs.observation_space.shape[2]
            else:
                in_w = 1
                in_h = 1
            num_inputs = envs.observation_space.shape[0]

        if isinstance(envs.action_space, gym.spaces.Discrete):
            out_w = 1
            out_h = 1
            num_actions = int(envs.action_space.n // (in_w * in_h))
           #if 'Micropolis' in env_name:
           #    num_actions = env.venv.venv.envs[0].num_tools
           #elif 'GameOfLife' in env_name:
           #    num_actions = 1
           #else:
           #    num_actions = env.action_space.n
        elif isinstance(envs.action_space, gym.spaces.Box):
            out_w = envs.action_space.shape[0]
            out_h = envs.action_space.shape[1]
            num_actions = envs.action_space.shape[-1]
        # We need to use the same statistics for normalization as used in training
        #actor_critic, ob_rms = \
        #            torch.load(os.path.join(args.load_dir, args.env_name + ".pt"))

        if saved_args.model == 'fractal':
            saved_args.model = 'FractalNet'
        actor_critic = Policy(envs.observation_space.shape, envs.action_space,
                base_kwargs={'map_width': args.map_width,
                             'recurrent': args.recurrent_policy,
                            'in_w': in_w, 'in_h': in_h, 'num_inputs': num_inputs,
                    'out_w': out_w, 'out_h': out_h },
                             curiosity=args.curiosity, algo=saved_args.algo,
                             model=saved_args.model, args=saved_args)
        actor_critic.to(device)
        torch.nn.Module.dump_patches = True
        actor_critic.load_state_dict(checkpoint['model_state_dict'])
        ob_rms = checkpoint['ob_rms']

        if 'fractal' in args.model.lower():
            new_recs = args.n_recs - saved_args.n_recs

            for nr in range(new_recs):
                actor_critic.base.auto_expand()
            print('expanded network:\n', actor_critic.base)

            if args.active_column is not None \
                    and hasattr(actor_critic.base, 'set_active_column'):
                actor_critic.base.set_active_column(args.active_column)
        vec_norm = get_vec_normalize(envs)

        if vec_norm is not None:
            vec_norm.eval()
            vec_norm.ob_rms = ob_rms
        self.actor_critic = actor_critic
        self.envs = envs
        self.args = args
Example #10
def main():
    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.port)
        win = None

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, args.add_timestep, device,
                         False)

    actor_critic = Policy(envs.observation_space.shape,
                          envs.action_space,
                          base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic.to(device)

    print('args.lr')
    print(args.lr)

    #     print('args.stat_decay')
    #     print(args.stat_decay)

    #     sys.exit()

    if args.algo == 'a2c':

        #         print('args.eps')
        #         print(args.eps)

        #         sys.exit()

        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo in ['acktr']:
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               acktr=True,
                               stat_decay=args.stat_decay)
    elif args.algo in ['acktr-homo']:
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               acktr=True,
                               if_homo=True,
                               stat_decay=args.stat_decay)
    elif args.algo in ['acktr-homo-noEigen']:
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               acktr=True,
                               if_homo=True,
                               stat_decay=args.stat_decay,
                               if_eigen=False)
    elif args.algo in ['kbfgs']:

        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               kbfgs=True,
                               stat_decay=args.stat_decay)
    elif args.algo in ['kbfgs-homo']:

        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               kbfgs=True,
                               if_homo=True,
                               stat_decay=args.stat_decay)
    elif args.algo in ['kbfgs-homo-invertA']:

        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               kbfgs=True,
                               if_homo=True,
                               stat_decay=args.stat_decay,
                               if_invert_A=True)

    elif args.algo in ['kbfgs-homo-invertA-decoupledDecay']:

        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               kbfgs=True,
                               if_homo=True,
                               stat_decay_A=args.stat_decay_A,
                               stat_decay_G=args.stat_decay_G,
                               if_invert_A=True,
                               if_decoupled_decay=True)
    elif args.algo in ['kbfgs-homo-momentumGrad']:

        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               kbfgs=True,
                               if_homo=True,
                               if_momentumGrad=True,
                               stat_decay=args.stat_decay)
    elif args.algo in ['kbfgs-homo-noClip']:

        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               kbfgs=True,
                               if_homo=True,
                               if_clip=False,
                               stat_decay=args.stat_decay)
    else:
        print('unknown args.algo: ' + args.algo)
        sys.exit()

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)

    record_rewards = []

    record_num_steps = []

    print('num_updates')
    print(num_updates)

    total_num_steps = 0

    start = time.time()
    for j in range(num_updates):

        print('j')
        print(j)

        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)

            for info in infos:

                #                 print('info.keys()')
                #                 print(info.keys())

                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

                    print('info[episode][r]')
                    print(info['episode']['r'])

                    record_rewards.append(info['episode']['r'])

                    #                     print('total_num_steps')
                    #                     print(total_num_steps)

                    #                     print('total_num_steps + (step + 1) * args.num_processes')
                    #                     print(total_num_steps + (step + 1) * args.num_processes)

                    record_num_steps.append(total_num_steps +
                                            (step + 1) * args.num_processes)

                    # sys.exit()

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)

        value_loss, action_loss, dist_entropy, update_signal = agent.update(
            rollouts)

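        # update_signal is assumed to flag a failed optimizer step with -1,
        # in which case training for this run stops early.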
        if update_signal == -1:
            #             sys.exit()
            break

        rollouts.after_update()

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            save_model = [
                save_model,
                getattr(get_vec_normalize(envs), 'ob_rms', None)
            ]

            torch.save(save_model,
                       os.path.join(save_path, args.env_name + ".pt"))
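            # A less fragile alternative (illustrative sketch, not part of the
            # original example) would be to save only the state_dict together
            # with the normalization statistics:
            #   torch.save({'model_state_dict': actor_critic.state_dict(),
            #               'ob_rms': getattr(get_vec_normalize(envs), 'ob_rms', None)},
            #              os.path.join(save_path, args.env_name + ".pt"))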

        total_num_steps = (j + 1) * args.num_processes * args.num_steps

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            end = time.time()
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), dist_entropy, value_loss,
                        action_loss))

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            eval_envs = make_vec_envs(args.env_name,
                                      args.seed + args.num_processes,
                                      args.num_processes, args.gamma,
                                      eval_log_dir, args.add_timestep, device,
                                      True)

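            # Reuse the observation-normalization statistics gathered during
            # training for the evaluation envs, and freeze them (eval mode).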
            vec_norm = get_vec_normalize(eval_envs)
            if vec_norm is not None:
                vec_norm.eval()
                vec_norm.ob_rms = get_vec_normalize(envs).ob_rms

            eval_episode_rewards = []

            obs = eval_envs.reset()
            eval_recurrent_hidden_states = torch.zeros(
                args.num_processes,
                actor_critic.recurrent_hidden_state_size,
                device=device)
            eval_masks = torch.zeros(args.num_processes, 1, device=device)

            while len(eval_episode_rewards) < 10:
                with torch.no_grad():
                    _, action, _, eval_recurrent_hidden_states = actor_critic.act(
                        obs,
                        eval_recurrent_hidden_states,
                        eval_masks,
                        deterministic=True)

                # Observe reward and next obs
                obs, reward, done, infos = eval_envs.step(action)

                eval_masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                                for done_ in done])
                for info in infos:
                    if 'episode' in info.keys():
                        eval_episode_rewards.append(info['episode']['r'])

            eval_envs.close()

            print(" Evaluation using {} episodes: mean reward {:.5f}\n".format(
                len(eval_episode_rewards), np.mean(eval_episode_rewards)))

        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.env_name,
                                  args.algo, args.num_frames)
            except IOError:
                pass

    print('record_rewards')
    print(record_rewards)

    dir_with_params = args.env_name + '/' +\
        args.algo + '/' +\
        'eps_' + str(args.eps) + '/' +\
        'lr_' + str(args.lr) + '/' +\
        'stat_decay_' + str(args.stat_decay) + '/'

    #     saving_dir = './result/' + args.env_name + '/' + args.algo + '/'
    saving_dir = './result/' + dir_with_params

    if not os.path.isdir(saving_dir):
        os.makedirs(saving_dir)

    import pickle

    with open(saving_dir + 'result.pkl', 'wb') as handle:
        pickle.dump(
            {
                'record_rewards': record_rewards,
                'record_num_steps': record_num_steps
            }, handle)

    print('args.log_dir')
    print(args.log_dir)

    print('os.listdir(args.log_dir)')
    print(os.listdir(args.log_dir))

    #     saving_dir_monitor = './result_monitor/' + args.env_name + '/' + args.algo + '/'

    saving_dir_monitor = './result_monitor/' + dir_with_params

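    # Start from a clean monitor-results directory so stale monitor files from
    # earlier runs are not mixed into this run's copies.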
    if os.path.isdir(saving_dir_monitor):
        import shutil

        shutil.rmtree(saving_dir_monitor)

    if not os.path.isdir(saving_dir_monitor):
        os.makedirs(saving_dir_monitor)

    print('saving_dir_monitor')
    print(saving_dir_monitor)

    import shutil

    for file_name in os.listdir(args.log_dir):

        full_file_name = os.path.join(args.log_dir, file_name)

        print('full_file_name')
        print(full_file_name)

        print('os.path.isfile(full_file_name)')
        print(os.path.isfile(full_file_name))

        if os.path.isfile(full_file_name):
            shutil.copy(full_file_name, saving_dir_monitor)

#     print('os.listdir(saving_dir_monitor)')
#     print(os.listdir(saving_dir_monitor))

#     print('len(os.listdir(saving_dir_monitor))')
#     print(len(os.listdir(saving_dir_monitor)))

#     print('args.num_processes')
#     print(args.num_processes)

    assert len(os.listdir(saving_dir_monitor)) == args.num_processes
Exemple #11
0
def main():
    args = get_args()

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    envs = make_env(args.env_name, args.seed, args.gamma)

    model = MujocoModel(envs.observation_space.shape[0],
                        envs.action_space.shape[0])
    model.to(device)

    algorithm = PPO(model,
                    args.clip_param,
                    args.value_loss_coef,
                    args.entropy_coef,
                    initial_lr=args.lr,
                    eps=args.eps,
                    max_grad_norm=args.max_grad_norm)

    agent = MujocoAgent(algorithm, device)

    rollouts = RolloutStorage(args.num_steps, envs.observation_space.shape[0],
                              envs.action_space.shape[0])

    obs = envs.reset()
    rollouts.obs[0] = np.copy(obs)

    episode_rewards = deque(maxlen=10)

    num_updates = int(args.num_env_steps) // args.num_steps
    for j in range(num_updates):

        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            utils.update_linear_schedule(algorithm.optimizer, j, num_updates,
                                         args.lr)

        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob = agent.sample(
                    rollouts.obs[step])  # why use obs from rollouts??? this seems wrong

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            rollouts.append(obs, action, action_log_prob, value, reward, masks,
                            bad_masks)

        with torch.no_grad():
            next_value = agent.value(rollouts.obs[-1])

        value_loss, action_loss, dist_entropy = agent.learn(
            next_value, args.gamma, args.gae_lambda, args.ppo_epoch,
            args.num_mini_batch, rollouts)

        rollouts.after_update()

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args.num_steps
            print(
                "Updates {}, num timesteps {},\n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n"
                .format(j, total_num_steps, len(episode_rewards),
                        np.mean(episode_rewards), np.median(episode_rewards),
                        np.min(episode_rewards), np.max(episode_rewards),
                        dist_entropy, value_loss, action_loss))

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            ob_rms = utils.get_vec_normalize(envs).ob_rms
            eval_mean_reward = evaluate(agent, ob_rms, args.env_name,
                                        args.seed, device)
def main():
    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.port)
        win = None

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, args.add_timestep, device,
                         False)

    actor_critic = Policy(envs.observation_space.shape,
                          envs.action_space,
                          base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic.to(device)

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)

    start = time.time()
    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            save_model = [
                save_model,
                getattr(get_vec_normalize(envs), 'ob_rms', None)
            ]

            torch.save(save_model,
                       os.path.join(save_path, args.env_name + ".pt"))

        total_num_steps = (j + 1) * args.num_processes * args.num_steps

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            end = time.time()
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), dist_entropy, value_loss,
                        action_loss))

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            eval_envs = make_vec_envs(args.env_name,
                                      args.seed + args.num_processes,
                                      args.num_processes, args.gamma,
                                      eval_log_dir, args.add_timestep, device,
                                      True)

            vec_norm = get_vec_normalize(eval_envs)
            if vec_norm is not None:
                vec_norm.eval()
                vec_norm.ob_rms = get_vec_normalize(envs).ob_rms

            eval_episode_rewards = []

            obs = eval_envs.reset()
            eval_recurrent_hidden_states = torch.zeros(
                args.num_processes,
                actor_critic.recurrent_hidden_state_size,
                device=device)
            eval_masks = torch.zeros(args.num_processes, 1, device=device)

            while len(eval_episode_rewards) < 10:
                with torch.no_grad():
                    _, action, _, eval_recurrent_hidden_states = actor_critic.act(
                        obs,
                        eval_recurrent_hidden_states,
                        eval_masks,
                        deterministic=True)

                # Observe reward and next obs
                obs, reward, done, infos = eval_envs.step(action)

                eval_masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                                for done_ in done])
                for info in infos:
                    if 'episode' in info.keys():
                        eval_episode_rewards.append(info['episode']['r'])

            eval_envs.close()

            print(" Evaluation using {} episodes: mean reward {:.5f}\n".format(
                len(eval_episode_rewards), np.mean(eval_episode_rewards)))

        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.env_name,
                                  args.algo, args.num_frames)
            except IOError:
                pass
Exemple #13
0
def main():
    args = get_args()

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    if args.cuda and torch.cuda.is_available() and args.cuda_deterministic:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    log_dir = os.path.expanduser(args.log_dir)
    utils.cleanup_log_dir(log_dir)

    with open(log_dir + 'extras.csv', "w") as file:
        file.write("n, value_loss\n")

    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, device, False)

    model = Policy(envs.observation_space.shape,
                   envs.action_space.n,
                   extra_kwargs={'use_backpack': args.algo == 'tdprop'})
    model.to(device)

    if args.algo == 'tdprop':
        from algo.sarsa_tdprop import SARSA
        agent = SARSA(model,
                      lr=args.lr,
                      eps=args.eps,
                      max_grad_norm=args.max_grad_norm,
                      beta_1=args.beta_1,
                      beta_2=args.beta_2,
                      n=args.num_steps,
                      num_processes=args.num_processes,
                      gamma=args.gamma)
    else:
        from algo.sarsa import SARSA
        agent = SARSA(model,
                      lr=args.lr,
                      eps=args.eps,
                      max_grad_norm=args.max_grad_norm,
                      beta_1=args.beta_1,
                      beta_2=args.beta_2,
                      algo=args.algo)

    explore_policy = utils.eps_greedy
    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              model.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)
    start = time.time()
    num_updates = int(
        args.num_env_steps) // args.num_steps // args.num_processes
    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                qs = model(rollouts.obs[step])
                _, dist = explore_policy(qs, args.exploration)
                actions = dist.sample().unsqueeze(-1)
                value = qs.gather(-1, actions)

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(actions)
            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
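            # Placeholder slots: a dummy tensor stands in for the recurrent
            # hidden state and the Q-value doubles as the (unused) log-prob
            # entry, since this SARSA setup apparently needs neither.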
            rollouts.insert(obs, torch.FloatTensor([0.0]), actions, value,
                            value, reward, masks, bad_masks)
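        # Expected-SARSA style bootstrap: the next state's value is the
        # exploration policy's probability-weighted average of its Q-values.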
        with torch.no_grad():
            next_qs = model(rollouts.obs[-1])
            next_probs, _ = explore_policy(next_qs, args.exploration)
            next_value = (next_probs * next_qs).sum(-1).unsqueeze(-1)

        rollouts.compute_returns(next_value, args.gamma)

        value_loss = agent.update(rollouts, explore_policy, args.exploration)

        rollouts.after_update()

        # save for every interval-th episode or for the last epoch
        if (j % args.save_interval == 0 or j == num_updates - 1):
            save_path = os.path.join(args.log_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass
            torch.save([
                list(model.parameters()),
                getattr(utils.get_vec_normalize(envs), 'ob_rms', None)
            ], os.path.join(save_path, args.env_name + ".pt"))

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            end = time.time()
            print(
                    ("Updates {}, num timesteps {}, FPS {}\n" + \
                            "Last {} training episodes: mean/median reward {:.1f}/{:.1f}" + \
                            ", min/max reward {:.1f}/{:.1f}\n" + \
                            "entropy {:.2f}, value loss {:.4f}")
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), dist.entropy().mean().item(), value_loss))
            with open(log_dir + 'extras.csv', "a") as file:
                file.write(
                    str(total_num_steps) + ", " + str(value_loss) + "\n")
Exemple #14
0
    def __init__(self, args, actor_critic, device, envs=None, vec_norm=None,
            frozen=False):
        '''frozen: we are not in the main training loop, but are evaluating a frozen model separately.'''
        if frozen:
            self.win_eval = None
            past_steps = args.past_steps
        self.frozen = frozen
       #eval_args.render = True
        self.device = device
       #if args.model == 'fractal':
       #    for i in range(-1, args.n_recs):
       #        eval_log_dir = args.log_dir + "_eval_col_{}".format(i)
       #        try:
       #            os.makedirs(eval_log_dir)
       #        except OSError:
       #            files = glob.glob(os.path.join(eval_log_dir,  '*.monitor.csv'))
       #            for f in files:
       #                os.remove(f)
       #        setattr(self, 'eval_log_dir_col_{}'.format(i), eval_log_dir)
        if frozen:
            if 'GameOfLife' in args.env_name:
                self.eval_log_dir = args.log_dir + "/eval_{}-steps_w{}_{}rec_{}s_{}pl".format(past_steps,
                        args.map_width, args.n_recs, args.max_step, args.prob_life, '.1f')
            else:
                self.eval_log_dir = args.log_dir + "/eval_{}-steps_w{}_{}rec_{}s".format(past_steps,
                        args.map_width, args.n_recs, args.max_step, '.1f')
            merge_col_logs = True
        else:
            self.eval_log_dir = args.log_dir + "_eval"
            merge_col_logs = False
        try:
            os.makedirs(self.eval_log_dir)
        except OSError:
            files = glob.glob(os.path.join(self.eval_log_dir,  '*.monitor.csv'))
            files += glob.glob(os.path.join(self.eval_log_dir, '*_eval.csv'))
            if args.overwrite:
                for f in files:
                    os.remove(f)
            elif files:
                merge_col_logs = True

        self.args = args
        self.actor_critic = actor_critic
        self.num_eval_processes = args.num_processes
        if envs:
            self.eval_envs = envs
            self.vec_norm = vec_norm
        else:

           #print('making envs in Evaluator: ', self.args.env_name, self.args.seed + self.num_eval_processes, self.num_eval_processes,
           #            self.args.gamma, self.eval_log_dir, self.args.add_timestep, self.device, True, self.args)
            self.eval_envs = make_vec_envs(
                        self.args.env_name, self.args.seed + self.num_eval_processes, self.num_eval_processes,
                        self.args.gamma, self.eval_log_dir, self.args.add_timestep, self.device, False, args=self.args)
            self.vec_norm = get_vec_normalize(self.eval_envs)
        if self.vec_norm is not None:
            self.vec_norm.eval()
            self.vec_norm.ob_rms = get_vec_normalize(self.eval_envs).ob_rms
        self.tstart = time.time()
        fieldnames = ['r', 'l', 't']
        model = actor_critic.base
        if args.model == 'FractalNet' or args.model =='fractal':
            n_cols = model.n_cols
        else:
            n_cols = 0
        self.plotter = Plotter(n_cols, self.eval_log_dir, self.num_eval_processes, max_steps=self.args.max_step)
        eval_cols = range(-1, n_cols)
        if args.model == 'fixed' and model.RAND:
            eval_cols = model.eval_recs
        if eval_cols is not None:
            for i in eval_cols:
                log_file = '{}/col_{}_eval.csv'.format(self.eval_log_dir, i)
                if merge_col_logs and os.path.exists(log_file):
                    merge_col_log = True
                else:
                    merge_col_log = False
                if merge_col_log:
                    if len(eval_cols) > 1 and i == eval_cols[-2] and self.args.auto_expand: # problem if we saved model after auto-expanding, without first evaluating!
                        # for the newly added column, we duplicate the last col.'s records
                        new_col_log_file = '{}/col_{}_eval.csv'.format(self.eval_log_dir, i + 1)
                        copyfile(log_file, new_col_log_file)
                    old_log = '{}_old'.format(log_file)
                    os.rename(log_file, old_log)
                log_file_col = open(log_file, mode='w')
                setattr(self, 'log_file_col_{}'.format(i), log_file_col)
                writer_col = csv.DictWriter(log_file_col, fieldnames=fieldnames)
                setattr(self, 'writer_col_{}'.format(i), writer_col)
                if merge_col_log:
                    with open(old_log, newline='') as old:
                        reader = csv.DictReader(old, fieldnames=('r', 'l', 't'))
                        h = 0
                        try: # in case of null bytes resulting from interrupted logging
                            for row in reader:
                                if h > 1:
                                    row['t'] = 0.0001 * h # HACK: false times for past logs to maintain order
                                    writer_col.writerow(row)
                                h += 1
                        except csv.Error:
                            h_i = 0
                            for row in reader:
                                if h_i > h:
                                    row['t'] = 0.0001 * h_i # HACK: false times for past logs to maintain order
                                    writer_col.writerow(row)
                                h_i += 1
                    os.remove(old_log)

                else:
                    writer_col.writeheader()
                    log_file_col.flush()
def main():
    '''
    Train PPO policies on each of the training environments.
    '''
    args = get_args()

    try:
        os.makedirs(args.log_dir)
    except OSError:
        pass

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    if args.cuda and torch.cuda.is_available() and args.cuda_deterministic:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    envs = make_vec_envs(args, device)

    actor_critic = Policy(envs.observation_space.shape,
                          envs.action_space,
                          base_kwargs={'recurrent': False})
    actor_critic.to(device)

    agent = algo.PPO(actor_critic,
                     args.clip_param,
                     args.ppo_epoch,
                     args.num_mini_batch,
                     args.value_loss_coef,
                     args.entropy_coef,
                     lr=args.lr,
                     eps=args.eps,
                     max_grad_norm=args.max_grad_norm)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    ep_reward = np.zeros(args.num_processes)
    episode_rewards = deque(maxlen=100)
    num_updates = int(
        args.num_env_steps) // args.num_steps // args.num_processes
    for j in range(num_updates):
        # decrease learning rate linearly
        utils.update_linear_schedule(agent.optimizer, j, num_updates, args.lr)

        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)
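            # Episode returns: 'spaceship' envs appear to report the episodic
            # reward on the terminal step, while the other envs accumulate
            # per-process rewards until done.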
            if 'spaceship' in args.env_name:  # spaceship, swimmer
                for i in range(len(done)):
                    if done[i]:
                        episode_rewards.append(reward[i].item())
            # elif 'swimmer' in args.env_name:
            else:
                for i in range(len(done)):
                    ep_reward[i] += reward[i].numpy().item()
                    if done[i]:
                        episode_rewards.append(ep_reward[i])
                        ep_reward[i] = 0
            # if 'ant' in args.env_name:
            #     for info in infos:
            #         if 'episode' in info.keys():
            #             episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks, bad_masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, True, args.gamma, args.gae_lambda,
                                 True)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        if (j % args.save_interval == 0
                or j == num_updates - 1) and args.save_dir != "":
            try:
                os.makedirs(args.save_dir)
            except OSError:
                pass


            torch.save(
                actor_critic.state_dict(),
                os.path.join(args.save_dir, "ppo.{}.env{}.seed{}.pt"\
                    .format(args.env_name, args.default_ind, args.seed))
            )

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            print("\nUpdates {}, num timesteps {}, Last {} training episodes: \
                \n mean/median reward {:.2f}/{:.2f}, min/max reward {:.2f}/{:.2f}"
                  .format(j, total_num_steps, len(episode_rewards),
                          np.mean(episode_rewards), np.median(episode_rewards),
                          np.min(episode_rewards), np.max(episode_rewards)))

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            ob_rms = utils.get_vec_normalize(envs).ob_rms
            evaluate(actor_critic, ob_rms, args.env_name, args.seed,
                     args.num_processes, device)

    envs.close()
Exemple #16
0
def main():

    actor_critic = False
    agent = False
    past_steps = 0
    try:
        os.makedirs(args.log_dir)
    except OSError:
        files = glob.glob(os.path.join(args.log_dir, '*.monitor.csv'))
        for f in files:
            if args.overwrite:
                os.remove(f)
            else:
                pass
    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.port)
        win = None
        win_eval = None

    import torch.multiprocessing as multiprocessing
    multiprocessing.set_start_method('spawn')
    torch.manual_seed(args.seed)
    envs = make_vec_envs(args.env_name,
                         args.seed,
                         args.num_processes,
                         args.gamma,
                         args.log_dir,
                         args.add_timestep,
                         device,
                         False,
                         None,
                         args=args)
    if args.cuda:
        torch.cuda.manual_seed(args.seed)
    num_actions = 1
    actor_critic = Policy(envs.observation_space.shape,
                          envs.action_space,
                          base_kwargs={
                              'map_width': args.map_width,
                              'num_actions': num_actions,
                              'recurrent': args.recurrent_policy
                          },
                          curiosity=args.curiosity,
                          algo=args.algo,
                          model=args.model,
                          args=args)

    evaluator = None

    if not agent:
        if args.algo == 'a2c':
            agent = algo.A2C_ACKTR_NOREWARD(actor_critic,
                                            args.value_loss_coef,
                                            args.entropy_coef,
                                            lr=args.lr,
                                            eps=args.eps,
                                            alpha=args.alpha,
                                            max_grad_norm=args.max_grad_norm,
                                            curiosity=args.curiosity,
                                            args=args)
        elif args.algo == 'ppo':
            agent = algo.PPO(actor_critic,
                             args.clip_param,
                             args.ppo_epoch,
                             args.num_mini_batch,
                             args.value_loss_coef,
                             args.entropy_coef,
                             lr=args.lr,
                             eps=args.eps,
                             max_grad_norm=args.max_grad_norm)
        elif args.algo == 'acktr':
            agent = algo.A2C_ACKTR_NOREWARD(actor_critic,
                                            args.value_loss_coef,
                                            args.entropy_coef,
                                            lr=args.lr,
                                            eps=args.eps,
                                            alpha=args.alpha,
                                            max_grad_norm=args.max_grad_norm,
                                            acktr=True,
                                            curiosity=args.curiosity,
                                            args=args)

#saved_model = os.path.join(args.save_dir, args.env_name + '.pt')
    saved_model = os.path.join(args.save_dir, args.env_name + '.tar')
    if os.path.exists(saved_model) and not args.overwrite:
        checkpoint = torch.load(saved_model)
        actor_critic.load_state_dict(checkpoint['model_state_dict'])
        actor_critic.to(device)
        agent.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        past_steps = checkpoint['past_steps']
        ob_rms = checkpoint['ob_rms']
        past_steps = next(iter(
            agent.optimizer.state_dict()['state'].values()))['step']
        saved_args = checkpoint['args']
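        # If this run requests more fractal recursions than the checkpoint was
        # trained with, expand the loaded network to match before continuing.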
        new_recs = args.n_recs - saved_args.n_recs
        for nr in range(new_recs):
            actor_critic.base.auto_expand()
        if new_recs > 0:
            print('applying {} fractal expansions to network'.format(new_recs))
        print('Resuming from step {}'.format(past_steps))

        #print(type(next(iter((torch.load(saved_model))))))
        #actor_critic, ob_rms = \
        #        torch.load(saved_model)
        #agent = \
        #    torch.load(os.path.join(args.save_dir, args.env_name + '_agent.pt'))
        #if not agent.optimizer.state_dict()['state'].values():
        #    past_steps = 0
        #else:

        #    raise Exception

        vec_norm = get_vec_normalize(envs)
        if vec_norm is not None:
            vec_norm.eval()
            vec_norm.ob_rms = ob_rms
    actor_critic.to(device)

    if 'LSTM' in args.model:
        recurrent_hidden_state_size = actor_critic.base.get_recurrent_state_size(
        )
    else:
        recurrent_hidden_state_size = actor_critic.recurrent_hidden_state_size
    if args.curiosity:
        rollouts = CuriosityRolloutStorage(
            args.num_steps,
            args.num_processes,
            envs.observation_space.shape,
            envs.action_space,
            recurrent_hidden_state_size,
            actor_critic.base.feature_state_size(),
            args=args)
    else:
        rollouts = RolloutStorage(args.num_steps,
                                  args.num_processes,
                                  envs.observation_space.shape,
                                  envs.action_space,
                                  recurrent_hidden_state_size,
                                  args=args)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)

    start = time.time()
    model = actor_critic.base
    done = False
    for j in range(past_steps, num_updates):
        if args.model == 'fractal' and args.drop_path:
            model.set_drop_path()
        if args.model == 'fixed' and model.RAND:
            model.num_recursions = random.randint(1, model.map_width * 2)

        player_act = None
        for step in range(args.num_steps):
            # if type(done) is not bool:
            #     if done.any():
            #         obs = envs.reset()
            # elif done:
            #     obs = env.reset()
            # Sample actions
            with torch.no_grad():

                value, action, action_log_probs, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step],
                    rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step],
                    player_act=player_act,
                    icm_enabled=args.curiosity,
                    deterministic=False)

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)

            player_act = None
            if args.render:

                if infos[0]:
                    if 'player_move' in infos[0].keys():
                        player_act = infos[0]['player_move']

            if args.curiosity:
                # run icm
                with torch.no_grad():

                    feature_state, feature_state_pred, action_dist_pred = actor_critic.icm_act(
                        (rollouts.obs[step], obs, action_bin))

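                # Curiosity bonus: eta-scaled squared error between the ICM's
                # predicted and actual next-state features.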
                intrinsic_reward = args.eta * (
                    (feature_state - feature_state_pred).pow(2)).sum() / 2.
                if args.no_reward:
                    reward = 0
                reward += intrinsic_reward.cpu()

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            if args.curiosity:
                rollouts.insert(obs, recurrent_hidden_states, action,
                                action_log_probs, value, reward, masks,
                                feature_state, feature_state_pred, action_bin,
                                action_dist_pred)
            else:
                rollouts.insert(obs, recurrent_hidden_states, action,
                                action_log_probs, value, reward, masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)
        if args.curiosity:
            value_loss, action_loss, dist_entropy, fwd_loss, inv_loss = agent.update(
                rollouts)
        else:
            value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            ob_rms = getattr(get_vec_normalize(envs), 'ob_rms', None)
            save_model = copy.deepcopy(actor_critic)
            save_agent = copy.deepcopy(agent)
            if args.cuda:
                save_model.cpu()
            optim_save = save_agent.optimizer.state_dict()

            # experimental:
            torch.save(
                {
                    'past_steps': step,
                    'model_state_dict': save_model.state_dict(),
                    'optimizer_state_dict': optim_save,
                    'ob_rms': ob_rms,
                    'args': args
                }, os.path.join(save_path, args.env_name + ".tar"))

        #save_model = [save_model,
        #              getattr(get_vec_normalize(envs), 'ob_rms', None)]

        #torch.save(save_model, os.path.join(save_path, args.env_name + ".pt"))
        #save_agent = copy.deepcopy(agent)

        #torch.save(save_agent, os.path.join(save_path, args.env_name + '_agent.pt'))
        #torch.save(actor_critic.state_dict(), os.path.join(save_path, args.env_name + "_weights.pt"))

        total_num_steps = (j + 1) * args.num_processes * args.num_steps

        if not dist_entropy:
            dist_entropy = 0
        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            end = time.time()
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n \
dist entropy {:.1f}, val/act loss {:.1f}/{:.1f},".format(
                    j, total_num_steps,
                    int((total_num_steps -
                         past_steps * args.num_processes * args.num_steps) /
                        (end - start)), len(episode_rewards),
                    np.mean(episode_rewards), np.median(episode_rewards),
                    np.min(episode_rewards), np.max(episode_rewards),
                    dist_entropy, value_loss, action_loss))
            if args.curiosity:
                print("fwd/inv icm loss {:.1f}/{:.1f}\n".format(
                    fwd_loss, inv_loss))

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            if evaluator is None:
                evaluator = Evaluator(args, actor_critic, device)

            model = evaluator.actor_critic.base
            if args.model == 'fractal':
                n_cols = model.n_cols
                if args.rule == 'wide1' and args.n_recs > 3:
                    col_step = 3
                else:
                    col_step = 1
                col_idx = [-1, *range(0, n_cols, col_step)]
                for i in col_idx:
                    evaluator.evaluate(column=i)
            #num_eval_frames = (args.num_frames // (args.num_steps * args.eval_interval * args.num_processes)) * args.num_processes *  args.max_step
            # making sure the evaluator plots the '-1'st column (the overall net)
                win_eval = visdom_plot(viz,
                                       win_eval,
                                       evaluator.eval_log_dir,
                                       graph_name,
                                       args.algo,
                                       args.num_frames,
                                       n_graphs=col_idx)
            elif args.model == 'fixed' and model.RAND:
                for i in model.eval_recs:
                    evaluator.evaluate(num_recursions=i)
                win_eval = visdom_plot(viz,
                                       win_eval,
                                       evaluator.eval_log_dir,
                                       graph_name,
                                       args.algo,
                                       args.num_frames,
                                       n_graphs=model.eval_recs)
            else:
                evaluator.evaluate(column=None)
                win_eval = visdom_plot(
                    viz, win_eval, evaluator.eval_log_dir, graph_name,
                    args.algo, args.num_frames * 20 /
                    (args.eval_interval * args.num_steps))

        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, graph_name,
                                  args.algo, args.num_frames)
            except IOError:
                pass
Exemple #17
0
    def __init__(self, args, actor_critic, device):
        eval_args = args
        #eval_args.render = True
        self.device = device
        #if args.model == 'fractal':
        #    for i in range(-1, args.n_recs):
        #        eval_log_dir = args.log_dir + "_eval_col_{}".format(i)
        #        try:
        #            os.makedirs(eval_log_dir)
        #        except OSError:
        #            files = glob.glob(os.path.join(eval_log_dir,  '*.monitor.csv'))
        #            for f in files:
        #                os.remove(f)
        #        setattr(self, 'eval_log_dir_col_{}'.format(i), eval_log_dir)

        self.eval_log_dir = args.log_dir + "_eval"
        merge_col_logs = False
        try:
            os.makedirs(self.eval_log_dir)
        except OSError:
            files = glob.glob(os.path.join(self.eval_log_dir, '*.monitor.csv'))
            if args.overwrite:
                for f in files:
                    os.remove(f)
            elif files:
                merge_col_logs = True

        self.num_eval_processes = 20
        self.eval_envs = make_vec_envs(eval_args.env_name,
                                       eval_args.seed +
                                       self.num_eval_processes,
                                       self.num_eval_processes,
                                       eval_args.gamma,
                                       self.eval_log_dir,
                                       eval_args.add_timestep,
                                       self.device,
                                       True,
                                       args=eval_args)
        self.vec_norm = get_vec_normalize(self.eval_envs)
        if self.vec_norm is not None:
            self.vec_norm.eval()
            self.vec_norm.ob_rms = get_vec_normalize(self.eval_envs).ob_rms
        self.actor_critic = actor_critic
        self.tstart = time.time()
        fieldnames = ['r', 'l', 't']
        model = actor_critic.base
        if args.model == 'fractal':
            n_cols = model.n_cols
            eval_cols = range(-1, n_cols)
        elif args.model == 'fixed' and model.RAND:
            eval_cols = model.eval_recs
        else:
            eval_cols = None
        if eval_cols is not None:
            for i in eval_cols:
                log_file = '{}/col_{}_eval.csv'.format(self.eval_log_dir, i)
                if merge_col_logs:
                    old_log = '{}_old'.format(log_file)
                    os.rename(log_file, old_log)
                log_file_col = open(log_file, mode='w')
                setattr(self, 'log_file_col_{}'.format(i), log_file_col)
                writer_col = csv.DictWriter(log_file_col,
                                            fieldnames=fieldnames)
                setattr(self, 'writer_col_{}'.format(i), writer_col)
                if merge_col_logs:
                    with open(old_log, newline='') as old:
                        reader = csv.DictReader(old,
                                                fieldnames=('r', 'l', 't'))
                        h = 0
                        for row in reader:
                            if h > 1:
                                row['t'] = 0.0001 * h  # HACK: false times for past logs to maintain order
                                writer_col.writerow(row)
                            h += 1
                    os.remove(old_log)

                else:
                    writer_col.writeheader()
                    log_file_col.flush()
        self.args = eval_args
Exemple #18
0
    def __init__(self):
        import random
        import gym_city
        import game_of_life

        self.fieldnames = self.get_fieldnames()
        self.n_frames = 0

        args = get_args()
        args.log_dir = args.save_dir + '/logs'
        assert args.algo in ['a2c', 'ppo', 'acktr']
        if args.recurrent_policy:
            assert args.algo in ['a2c', 'ppo'], \
                    'Recurrent policy is not implemented for ACKTR'

        torch.manual_seed(args.seed)
        if args.cuda:
            print('CUDA ENABLED')
            torch.cuda.manual_seed(args.seed)

        graph_name = args.save_dir.split('trained_models/')[1].replace(
            '/', ' ')
        self.graph_name = graph_name

        actor_critic = False
        agent = False
        past_frames = 0
        try:
            os.makedirs(args.log_dir)
        except OSError:
            files = glob.glob(os.path.join(args.log_dir, '*.monitor.csv'))
            for f in files:
                if args.overwrite:
                    os.remove(f)
                else:
                    pass
        torch.set_num_threads(1)
        device = torch.device("cuda:0" if args.cuda else "cpu")
        self.device = device

        if args.vis:
            from visdom import Visdom
            viz = Visdom(port=args.port)
            self.viz = viz
            win = None
            self.win = win
            win_eval = None
            self.win_eval = win_eval
        print('env name: {}'.format(args.env_name))
        if 'GameOfLife' in args.env_name:
            num_actions = 1
        envs = make_vec_envs(args.env_name,
                             args.seed,
                             args.num_processes,
                             args.gamma,
                             args.log_dir,
                             args.add_timestep,
                             device,
                             False,
                             None,
                             args=args)

        if isinstance(envs.observation_space, gym.spaces.Discrete):
            num_inputs = envs.observation_space.n
        elif isinstance(envs.observation_space, gym.spaces.Box):
            if 'golmulti' in args.env_name.lower():
                multi_env = True
                observation_space_shape = envs.observation_space.shape[1:]
            else:
                multi_env = False
                observation_space_shape = envs.observation_space.shape
            self.multi_env = multi_env
            if len(observation_space_shape) == 3:
                in_w = observation_space_shape[1]
                in_h = observation_space_shape[2]
            else:
                in_w = 1
                in_h = 1
            num_inputs = observation_space_shape[0]
        if isinstance(envs.action_space, gym.spaces.Discrete) or\
            isinstance(envs.action_space, gym.spaces.Box):
            out_w = args.map_width
            out_h = args.map_width
            if 'Micropolis' in args.env_name:  #otherwise it's set
                if args.power_puzzle:
                    num_actions = 1
                else:
                    num_actions = 19  # TODO: have this already from env
            elif 'GameOfLife' in args.env_name:
                num_actions = 1
            else:
                num_actions = envs.action_space.n
        elif isinstance(envs.action_space, gym.spaces.Box):
            if len(envs.action_space.shape) == 3:
                out_w = envs.action_space.shape[1]
                out_h = envs.action_space.shape[2]
            elif len(envs.action_space.shape) == 1:
                out_w = 1
                out_h = 1
            num_actions = envs.action_space.shape[-1]
        print('num actions {}'.format(num_actions))

        if args.auto_expand:
            args.n_recs -= 1
        actor_critic = Policy(envs.observation_space.shape,
                              envs.action_space,
                              base_kwargs={
                                  'map_width': args.map_width,
                                  'num_actions': num_actions,
                                  'recurrent': args.recurrent_policy,
                                  'prebuild': args.prebuild,
                                  'in_w': in_w,
                                  'in_h': in_h,
                                  'num_inputs': num_inputs,
                                  'out_w': out_w,
                                  'out_h': out_h
                              },
                              curiosity=args.curiosity,
                              algo=args.algo,
                              model=args.model,
                              args=args)
        if not agent:
            agent = init_agent(actor_critic, args)
        if args.auto_expand:
            args.n_recs += 1

        evaluator = None
        self.evaluator = evaluator

        vec_norm = get_vec_normalize(envs)
        self.vec_norm = vec_norm
        #saved_model = os.path.join(args.save_dir, args.env_name + '.pt')
        if args.load_dir:
            saved_model = os.path.join(args.load_dir, args.env_name + '.tar')
        else:
            saved_model = os.path.join(args.save_dir, args.env_name + '.tar')
        self.checkpoint = None
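        # Resume from a saved .tar checkpoint (model weights, optimizer state,
        # frame count, and observation-normalization stats) unless --overwrite is set.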
        if os.path.exists(saved_model) and not args.overwrite:
            #print('current actor_critic params: {}'.format(actor_critic.parameters()))
            checkpoint = torch.load(saved_model)
            self.checkpoint = checkpoint
            saved_args = checkpoint['args']
            actor_critic.load_state_dict(checkpoint['model_state_dict'])
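            # The loop below only walks the two optimizer state dicts for
            # inspection/debugging; it does not modify any optimizer state.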
            opt = agent.optimizer.state_dict()
            opt_load = checkpoint['optimizer_state_dict']
            for o, l in zip(opt, opt_load):
                #print(o, l)
                param = opt[o]
                param_load = opt_load[l]
            #print('current: {}'.format(param), 'load: {}'.format(param_load))
            #print(param_load.keys())
            #params = param_load[0]['params']
            #param[0]['params'] = params
            #for m, n in zip(param, param_load):
            #    for p, q in zip(m, n):
            #        print(p, q)
            #        if type(m[p]) == list:
            #            print(len(m[p]), len(n[q]))
            #            agent.optimizer[m][p] = m[q]

        #print(agent.optimizer.state_dict()['param_groups'])
        #print('\n')
        #print(checkpoint['model_state_dict'])
            actor_critic.to(self.device)
            #actor_critic.cuda()
            #agent = init_agent(actor_critic, saved_args)
            agent.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
            if args.auto_expand:
                if not args.n_recs - saved_args.n_recs == 1:
                    print('can expand by 1 rec only from saved model, not {}'.
                          format(args.n_recs - saved_args.n_recs))
                    raise Exception
                actor_critic.base.auto_expand()
                print('expanded net: \n{}'.format(actor_critic.base))
                # TODO: Are we losing something crucial here? Probably not ideal.
                agent = init_agent(actor_critic, args)

            past_frames = checkpoint['n_frames']
            ob_rms = checkpoint['ob_rms']
            #past_steps = next(iter(agent.optimizer.state_dict()['state'].values()))['step']
            print('Resuming from frame {}'.format(past_frames))

            #print(type(next(iter((torch.load(saved_model))))))
            #actor_critic, ob_rms = \
            #        torch.load(saved_model)
            #agent = \
            #    torch.load(os.path.join(args.save_dir, args.env_name + '_agent.pt'))
            #if not agent.optimizer.state_dict()['state'].values():
            #    past_steps = 0
            #else:

            #    raise Exception

            if vec_norm is not None:
                vec_norm.eval()
                vec_norm.ob_rms = ob_rms
            saved_args.num_frames = args.num_frames
            saved_args.vis_interval = args.vis_interval
            saved_args.eval_interval = args.eval_interval
            saved_args.overwrite = args.overwrite
            saved_args.n_recs = args.n_recs
            saved_args.intra_shr = args.intra_shr
            saved_args.inter_shr = args.inter_shr
            saved_args.map_width = args.map_width
            saved_args.render = args.render
            saved_args.print_map = args.print_map
            saved_args.load_dir = args.load_dir
            saved_args.experiment_name = args.experiment_name
            saved_args.log_dir = args.log_dir
            saved_args.save_dir = args.save_dir
            saved_args.num_processes = args.num_processes
            saved_args.n_chan = args.n_chan
            saved_args.prebuild = args.prebuild
            args = saved_args
        actor_critic.to(device)

        updates_remaining = int(args.num_frames - past_frames) // (
            args.num_steps * args.num_processes)
        self.n_frames = self.n_frames + past_frames
        self.past_frames = past_frames
        if 'LSTM' in args.model:
            recurrent_hidden_state_size = actor_critic.base.get_recurrent_state_size(
            )
        else:
            recurrent_hidden_state_size = actor_critic.recurrent_hidden_state_size
        if args.curiosity:
            rollouts = CuriosityRolloutStorage(
                args.num_steps,
                args.num_processes,
                envs.observation_space.shape,
                envs.action_space,
                recurrent_hidden_state_size,
                actor_critic.base.feature_state_size(),
                args=args)
        else:
            rollouts = RolloutStorage(args.num_steps,
                                      args.num_processes,
                                      envs.observation_space.shape,
                                      envs.action_space,
                                      recurrent_hidden_state_size,
                                      args=args)

        obs = envs.reset()
        rollouts.obs[0].copy_(obs)
        rollouts.to(device)

        episode_rewards = deque(maxlen=10)

        start = time.time()
        self.model = model = actor_critic.base
        self.reset_eval = False
        plotter = None
        env_param_bounds = envs.get_param_bounds()
        # in case we want to change this dynamically in the future (e.g., we may
        # not know how much traffic the agent can possibly produce in Micropolis)
        envs.set_param_bounds(env_param_bounds)  # start with default bounds

        if args.model == 'FractalNet' or args.model == 'fractal':
            n_cols = model.n_cols
            if args.rule == 'wide1' and args.n_recs > 3:
                col_step = 3
            else:
                col_step = 1
        else:
            n_cols = 0
            col_step = 1
        self.col_step = col_step
        self.updates_remaining = updates_remaining
        self.envs = envs
        self.start = start
        self.rollouts = rollouts
        self.args = args
        self.actor_critic = actor_critic
        self.plotter = plotter
        self.agent = agent
        self.episode_rewards = episode_rewards
        self.n_cols = n_cols
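A note on the checkpoint format the constructor above resumes from: it is the dictionary written by the training loops later in these examples (model weights, optimizer state, frame count, observation-normalization stats, and the argument namespace). A minimal, self-contained sketch of that save/load round trip, using stand-in objects rather than the repo's Policy and agent classes:

import torch
import torch.nn as nn
import torch.optim as optim

# Hypothetical stand-ins for the actor-critic and its optimizer.
model = nn.Linear(4, 2)
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# Save: one .tar file holding everything needed to resume.
checkpoint = {
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'n_frames': 12345,       # called 'past_steps' in some of the older examples
    'ob_rms': None,          # running obs statistics from VecNormalize, if any
    'args': {'lr': 1e-3},    # stored as an argparse Namespace in the real code
}
torch.save(checkpoint, 'MyEnv-v0.tar')

# Resume: restore weights and optimizer state, then keep counting frames.
ckpt = torch.load('MyEnv-v0.tar')
model.load_state_dict(ckpt['model_state_dict'])
optimizer.load_state_dict(ckpt['optimizer_state_dict'])
past_frames = ckpt['n_frames']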
Exemple #19
0
    def train(self):
        evaluator = self.evaluator
        episode_rewards = self.episode_rewards
        args = self.args
        actor_critic = self.actor_critic
        rollouts = self.rollouts
        agent = self.agent
        envs = self.envs
        plotter = self.plotter
        n_train = self.n_train
        start = self.start
        n_cols = self.n_cols
        model = self.model
        device = self.device
        vec_norm = self.vec_norm
        n_frames = self.n_frames
        if self.reset_eval:
            obs = envs.reset()
            rollouts.obs[0].copy_(obs)
            rollouts.to(device)
            self.reset_eval = False
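        # Per-update stochasticity: re-sample FractalNet's drop-path mask, and a
        # random recursion depth for the 'fixed' model when it runs in RAND mode.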
        if args.model == 'FractalNet' and args.drop_path:
            model.set_drop_path()
        if args.model == 'fixed' and model.RAND:
            model.num_recursions = random.randint(1, model.map_width * 2)
        self.player_act = None
        for self.n_step in range(args.num_steps):
            # Sample actions
            self.step()

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)
        if args.curiosity:
            value_loss, action_loss, dist_entropy, fwd_loss, inv_loss = agent.update(
                rollouts)
        else:
            value_loss, action_loss, dist_entropy = agent.update(rollouts)
        envs.dist_entropy = dist_entropy

        rollouts.after_update()

        total_num_steps = (n_train + 1) * args.num_processes * args.num_steps

        if not dist_entropy:
            dist_entropy = 0

    #print(episode_rewards)
    #if torch.max(rollouts.rewards) > 0:
    #    print(rollouts.rewards)
        if args.log and n_train % args.log_interval == 0 and len(
                episode_rewards) > 1:
            end = time.time()
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.6f}/{:.6f}, min/max reward {:.6f}/{:.6f}\n \
dist entropy {:.6f}, val/act loss {:.6f}/{:.6f},".format(
                    n_train, total_num_steps,
                    int((self.n_frames - self.past_frames) / (end - start)),
                    len(episode_rewards), round(np.mean(episode_rewards), 6),
                    round(np.median(episode_rewards), 6),
                    round(np.min(episode_rewards), 6),
                    round(np.max(episode_rewards), 6), round(dist_entropy, 6),
                    round(value_loss, 6), round(action_loss, 6)))
            if args.curiosity:
                print("fwd/inv icm loss {:.1f}/{:.1f}\n".format(
                    fwd_loss, inv_loss))

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and n_train % args.eval_interval == 0):
            if evaluator is None:
                evaluator = Evaluator(args,
                                      actor_critic,
                                      device,
                                      envs=envs,
                                      vec_norm=vec_norm,
                                      fieldnames=self.fieldnames)
                self.evaluator = evaluator

            col_idx = [-1, *[i for i in range(0, n_cols, self.col_step)]]
            for i in col_idx:
                evaluator.evaluate(column=i)
        #num_eval_frames = (args.num_frames // (args.num_steps * args.eval_interval * args.num_processes)) * args.num_processes *  args.max_step
        # making sure the evaluator plots the '-1'st column (the overall net)
            viz = self.viz
            win_eval = self.win_eval
            graph_name = self.graph_name
            if args.vis:  #and n_train % args.vis_interval == 0:
                try:
                    # Sometimes monitor doesn't properly flush the outputs
                    win_eval = evaluator.plotter.visdom_plot(
                        viz,
                        win_eval,
                        evaluator.eval_log_dir,
                        graph_name,
                        args.algo,
                        args.num_frames,
                        n_graphs=col_idx)
                except IOError:
                    pass
        #elif args.model == 'fixed' and model.RAND:
        #    for i in model.eval_recs:
        #        evaluator.evaluate(num_recursions=i)
        #    win_eval = visdom_plot(viz, win_eval, evaluator.eval_log_dir, graph_name,
        #                           args.algo, args.num_frames, n_graphs=model.eval_recs)
        #else:
        #    evaluator.evaluate(column=-1)
        #    win_eval = visdom_plot(viz, win_eval, evaluator.eval_log_dir, graph_name,
        #                  args.algo, args.num_frames)
            self.reset_eval = True

        if args.save and n_train % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            ob_rms = getattr(get_vec_normalize(envs), 'ob_rms', None)
            save_model = copy.deepcopy(actor_critic)
            save_agent = copy.deepcopy(agent)
            if args.cuda:
                save_model.cpu()
            optim_save = save_agent.optimizer.state_dict()
            self.agent = agent
            self.save_model = save_model
            self.optim_save = optim_save
            self.args = args
            self.ob_rms = ob_rms
            torch.save(self.get_save_dict(),
                       os.path.join(save_path, args.env_name + ".tar"))

            #save_model = [save_model,
            #              getattr(get_vec_normalize(envs), 'ob_rms', None)]

            #torch.save(save_model, os.path.join(save_path, args.env_name + ".pt"))
            #save_agent = copy.deepcopy(agent)

            #torch.save(save_agent, os.path.join(save_path, args.env_name + '_agent.pt'))
            #torch.save(actor_critic.state_dict(), os.path.join(save_path, args.env_name + "_weights.pt"))

            print('model saved at {}'.format(save_path))

        if args.vis and n_train % args.vis_interval == 0:
            if plotter is None:
                plotter = Plotter(n_cols, args.log_dir, args.num_processes)
            try:
                # Sometimes monitor doesn't properly flush the outputs
                viz = self.viz
                win = self.win
                graph_name = self.graph_name
                win = plotter.visdom_plot(viz, win, args.log_dir, graph_name,
                                          args.algo, args.num_frames)
            except IOError:
                pass
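The call to rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau) above relies on a storage class that is not shown in these snippets. Under the usual generalized-advantage-estimation formulation it would look roughly like the sketch below (an approximation of what the repo's RolloutStorage presumably does, not its actual implementation; note that the repo keeps a masks array of length num_steps + 1, so its indexing is shifted by one relative to this sketch):

import torch

def compute_returns(rewards, values, masks, next_value, use_gae, gamma, tau):
    # Shapes: rewards/values/masks are [num_steps, num_processes, 1];
    # next_value is [num_processes, 1]. Here masks[t] is 0.0 if the episode
    # ended at step t, 1.0 otherwise.
    num_steps = rewards.size(0)
    returns = torch.zeros_like(rewards)
    if use_gae:
        gae = torch.zeros_like(next_value)
        value_next = next_value
        for t in reversed(range(num_steps)):
            delta = rewards[t] + gamma * value_next * masks[t] - values[t]
            gae = delta + gamma * tau * masks[t] * gae
            returns[t] = gae + values[t]
            value_next = values[t]
    else:
        ret = next_value
        for t in reversed(range(num_steps)):
            ret = rewards[t] + gamma * ret * masks[t]
            returns[t] = ret
    return returns

# Tiny usage example with random data.
T, N = 5, 2
rew, val, msk = torch.rand(T, N, 1), torch.rand(T, N, 1), torch.ones(T, N, 1)
ret = compute_returns(rew, val, msk, torch.zeros(N, 1), True, 0.99, 0.95)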
Exemple #20
0
def main():
    import random
    import gym_micropolis
    import game_of_life

    args = get_args()
    args.log_dir = args.save_dir + '/logs'
    assert args.algo in ['a2c', 'ppo', 'acktr']
    if args.recurrent_policy:
        assert args.algo in ['a2c', 'ppo'], \
            'Recurrent policy is not implemented for ACKTR'

    num_updates = int(args.num_frames) // args.num_steps // args.num_processes

    torch.manual_seed(args.seed)
    if args.cuda:
        torch.cuda.manual_seed(args.seed)

    graph_name = args.save_dir.split('trained_models/')[1].replace('/', ' ')

    actor_critic = False
    agent = False
    past_steps = 0
    try:
        os.makedirs(args.log_dir)
    except OSError:
        files = glob.glob(os.path.join(args.log_dir, '*.monitor.csv'))
        for f in files:
            if args.overwrite:
                os.remove(f)
            else:
                pass
    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.port)
        win = None
        win_eval = None
    if 'GameOfLife' in args.env_name:
        print('env name: {}'.format(args.env_name))
        num_actions = 1
    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                        args.gamma, args.log_dir, args.add_timestep, device,
                        False, None, args=args)

    if isinstance(envs.observation_space, gym.spaces.Discrete):
        num_inputs = envs.observation_space.n
    elif isinstance(envs.observation_space, gym.spaces.Box):
        if len(envs.observation_space.shape) == 3:
            in_w = envs.observation_space.shape[1]
            in_h = envs.observation_space.shape[2]
        else:
            in_w = 1
            in_h = 1
        num_inputs = envs.observation_space.shape[0]
    if isinstance(envs.action_space, gym.spaces.Discrete):
        out_w = 1
        out_h = 1
        if 'Micropolis' in args.env_name: #otherwise it's set
            if args.power_puzzle:
                num_actions = 1
            else:
                num_actions = 19 # TODO: have this already from env
        elif 'GameOfLife' in args.env_name:
            num_actions = 1
        else:
            num_actions = envs.action_space.n
    elif isinstance(envs.action_space, gym.spaces.Box):
        if len(envs.action_space.shape) == 3:
            out_w = envs.action_space.shape[1]
            out_h = envs.action_space.shape[2]
        elif len(envs.action_space.shape) == 1:
            out_w = 1
            out_h = 1
        num_actions = envs.action_space.shape[-1]
    print('num actions {}'.format(num_actions))

    if args.auto_expand:
        args.n_recs -= 1
    actor_critic = Policy(envs.observation_space.shape, envs.action_space,
        base_kwargs={'map_width': args.map_width, 'num_actions': num_actions,
            'recurrent': args.recurrent_policy,
            'in_w': in_w, 'in_h': in_h, 'num_inputs': num_inputs,
            'out_w': out_w, 'out_h': out_h},
                     curiosity=args.curiosity, algo=args.algo,
                     model=args.model, args=args)
    if args.auto_expand:
        args.n_recs += 1

    evaluator = None

    if not agent:
        agent = init_agent(actor_critic, args)

   #saved_model = os.path.join(args.save_dir, args.env_name + '.pt')
    if args.load_dir:
        saved_model = os.path.join(args.load_dir, args.env_name + '.tar')
    else:
        saved_model = os.path.join(args.save_dir, args.env_name + '.tar')
    vec_norm = get_vec_normalize(envs)
    if os.path.exists(saved_model) and not args.overwrite:
        checkpoint = torch.load(saved_model)
        saved_args = checkpoint['args']
        actor_critic.load_state_dict(checkpoint['model_state_dict'])
       #for o, l in zip(agent.optimizer.state_dict, checkpoint['optimizer_state_dict']):
       #    print(o, l)
       #print(agent.optimizer.state_dict()['param_groups'])
       #print('\n')
       #print(checkpoint['model_state_dict'])
        actor_critic.to(device)
        actor_critic.cuda()
       #agent = init_agent(actor_critic, saved_args)
        agent.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        if args.auto_expand:
            if not args.n_recs - saved_args.n_recs == 1:
                print('can expand by 1 rec only from saved model, not {}'.format(args.n_recs - saved_args.n_recs))
                raise Exception
            actor_critic.base.auto_expand()
            print('expanded net: \n{}'.format(actor_critic.base))
        past_steps = checkpoint['past_steps']
        ob_rms = checkpoint['ob_rms']

        past_steps = next(iter(agent.optimizer.state_dict()['state'].values()))['step']
        print('Resuming from step {}'.format(past_steps))

       #print(type(next(iter((torch.load(saved_model))))))
       #actor_critic, ob_rms = \
       #        torch.load(saved_model)
       #agent = \
       #    torch.load(os.path.join(args.save_dir, args.env_name + '_agent.pt'))
       #if not agent.optimizer.state_dict()['state'].values():
       #    past_steps = 0
       #else:

       #    raise Exception

        if vec_norm is not None:
            vec_norm.eval()
            vec_norm.ob_rms = ob_rms
        saved_args.num_frames = args.num_frames
        saved_args.vis_interval = args.vis_interval
        saved_args.eval_interval = args.eval_interval
        saved_args.overwrite = args.overwrite
        saved_args.n_recs = args.n_recs
        saved_args.intra_shr = args.intra_shr
        saved_args.inter_shr = args.inter_shr
        saved_args.map_width = args.map_width
        saved_args.render = args.render
        saved_args.print_map = args.print_map
        saved_args.load_dir = args.load_dir
        saved_args.experiment_name = args.experiment_name
        saved_args.log_dir = args.log_dir
        saved_args.save_dir = args.save_dir
        args = saved_args
    actor_critic.to(device)

    if 'LSTM' in args.model:
        recurrent_hidden_state_size = actor_critic.base.get_recurrent_state_size()
    else:
        recurrent_hidden_state_size = actor_critic.recurrent_hidden_state_size
    if args.curiosity:
        rollouts = CuriosityRolloutStorage(args.num_steps, args.num_processes,
                            envs.observation_space.shape, envs.action_space,
                            recurrent_hidden_state_size, actor_critic.base.feature_state_size(), args=args)
    else:
        rollouts = RolloutStorage(args.num_steps, args.num_processes,
                            envs.observation_space.shape, envs.action_space,
                            recurrent_hidden_state_size, args=args)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)

    start = time.time()
    model = actor_critic.base
    reset_eval = False
    plotter = None
    if args.model == 'FractalNet' or args.model == 'fractal':
        n_cols = model.n_cols
        if args.rule == 'wide1' and args.n_recs > 3:
            col_step = 3
        else:
            col_step = 1
    else:
        n_cols = 0
        col_step = 1
    for j in range(past_steps, num_updates):
        if reset_eval:
            print('post eval reset')
            obs = envs.reset()
            rollouts.obs[0].copy_(obs)
            rollouts.to(device)
            reset_eval = False
       #if np.random.rand(1) < 0.1:
       #    envs.venv.venv.remotes[1].send(('setRewardWeights', None))
        if args.model == 'FractalNet' and args.drop_path:
           #if args.intra_shr and args.inter_shr:
           #    n_recs = np.randint
           #    model.set_n_recs()
            model.set_drop_path()
        if args.model == 'fixed' and model.RAND:
            model.num_recursions = random.randint(1, model.map_width * 2)
        player_act = None
        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                if args.render:
                    if args.num_processes == 1:
                        if not ('Micropolis' in args.env_name or 'GameOfLife' in args.env_name):
                            envs.venv.venv.render()
                        else:
                            pass
                    else:
                        if not ('Micropolis' in args.env_name or 'GameOfLife' in args.env_name):
                            envs.render()
                            envs.venv.venv.render()
                        else:
                            pass
                           #envs.venv.venv.remotes[0].send(('render', None))
                           #envs.venv.venv.remotes[0].recv()
                value, action, action_log_probs, recurrent_hidden_states = actor_critic.act(
                        rollouts.obs[step],
                        rollouts.recurrent_hidden_states[step],
                        rollouts.masks[step],
                        player_act=player_act,
                        icm_enabled=args.curiosity,
                        deterministic=False)

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)

            player_act = None
            if args.render:
                if infos[0]:
                    if 'player_move' in infos[0].keys():
                        player_act = infos[0]['player_move']
            if args.curiosity:
                # run icm
                with torch.no_grad():
                    feature_state, feature_state_pred, action_dist_pred = actor_critic.icm_act(
                            (rollouts.obs[step], obs, action_bin)
                            )

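                # ICM-style curiosity: the forward model's prediction error in
                # feature space becomes the intrinsic reward, scaled by args.eta.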
                intrinsic_reward = args.eta * ((feature_state - feature_state_pred).pow(2)).sum() / 2.
                if args.no_reward:
                    reward = 0
                reward += intrinsic_reward.cpu()

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            if args.curiosity:
                rollouts.insert(obs, recurrent_hidden_states, action, action_log_probs, value, reward, masks,
                                feature_state, feature_state_pred, action_bin, action_dist_pred)
            else:
                rollouts.insert(obs, recurrent_hidden_states, action, action_log_probs, value, reward, masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(rollouts.obs[-1],
                                                rollouts.recurrent_hidden_states[-1],
                                                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau)
        if args.curiosity:
            value_loss, action_loss, dist_entropy, fwd_loss, inv_loss = agent.update(rollouts)
        else:
            value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()



        total_num_steps = (j + 1) * args.num_processes * args.num_steps

        if not dist_entropy:
            dist_entropy = 0
        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            end = time.time()
            print("Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n \
dist entropy {:.1f}, val/act loss {:.1f}/{:.1f},".
                format(j, total_num_steps,
                       int((total_num_steps - past_steps * args.num_processes * args.num_steps) / (end - start)),
                       len(episode_rewards),
                       np.mean(episode_rewards),
                       np.median(episode_rewards),
                       np.min(episode_rewards),
                       np.max(episode_rewards), dist_entropy,
                       value_loss, action_loss))
            if args.curiosity:
                print("fwd/inv icm loss {:.1f}/{:.1f}\n".
                format(
                       fwd_loss, inv_loss))

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            if evaluator is None:
                evaluator = Evaluator(args, actor_critic, device, envs=envs, vec_norm=vec_norm)


            model = evaluator.actor_critic.base

            col_idx = [-1, *range(0, n_cols, col_step)]
            for i in col_idx:
                evaluator.evaluate(column=i)
           #num_eval_frames = (args.num_frames // (args.num_steps * args.eval_interval * args.num_processes)) * args.num_processes *  args.max_step
           # making sure the evaluator plots the '-1'st column (the overall net)

            if args.vis: #and j % args.vis_interval == 0:
                try:
                    # Sometimes monitor doesn't properly flush the outputs
                    win_eval = evaluator.plotter.visdom_plot(viz, win_eval, evaluator.eval_log_dir, graph_name,
                                  args.algo, args.num_frames, n_graphs= col_idx)
                except IOError:
                    pass
           #elif args.model == 'fixed' and model.RAND:
           #    for i in model.eval_recs:
           #        evaluator.evaluate(num_recursions=i)
           #    win_eval = visdom_plot(viz, win_eval, evaluator.eval_log_dir, graph_name,
           #                           args.algo, args.num_frames, n_graphs=model.eval_recs)
           #else:
           #    evaluator.evaluate(column=-1)
           #    win_eval = visdom_plot(viz, win_eval, evaluator.eval_log_dir, graph_name,
           #                  args.algo, args.num_frames)
            reset_eval = True

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            ob_rms = getattr(get_vec_normalize(envs), 'ob_rms', None)
            save_model = copy.deepcopy(actor_critic)
            save_agent = copy.deepcopy(agent)
            if args.cuda:
                save_model.cpu()
            optim_save = save_agent.optimizer.state_dict()

            # experimental:
            torch.save({
                'past_steps': next(iter(agent.optimizer.state_dict()['state'].values()))['step'],
                'model_state_dict': save_model.state_dict(),
                'optimizer_state_dict': optim_save,
                'ob_rms': ob_rms,
                'args': args
                }, os.path.join(save_path, args.env_name + ".tar"))

           #save_model = [save_model,
           #              getattr(get_vec_normalize(envs), 'ob_rms', None)]

           #torch.save(save_model, os.path.join(save_path, args.env_name + ".pt"))
           #save_agent = copy.deepcopy(agent)

           #torch.save(save_agent, os.path.join(save_path, args.env_name + '_agent.pt'))
           #torch.save(actor_critic.state_dict(), os.path.join(save_path, args.env_name + "_weights.pt"))

        if args.vis and j % args.vis_interval == 0:
            if plotter is None:
                plotter = Plotter(n_cols, args.log_dir, args.num_processes)
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = plotter.visdom_plot(viz, win, args.log_dir, graph_name,
                                  args.algo, args.num_frames)
            except IOError:
                pass
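A note on the masks built from the done flags throughout these examples: a 0.0 entry marks the step at which an episode ended, so recurrent hidden states and bootstrapped returns are zeroed across episode boundaries. A minimal sketch of the conversion:

import torch

done = [False, True, False, False]               # one flag per parallel env
masks = torch.tensor([[0.0] if d else [1.0] for d in done],
                     dtype=torch.float32)        # shape [num_processes, 1]
print(masks.squeeze(1))                          # tensor([1., 0., 1., 1.])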
Exemple #21
0
def main():
    device = 'cpu'
    acc_steps = []
    acc_scores = []
    torch.set_num_threads(1)
    print('here')

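    # Hand-crafted reward features are only defined for Reacher-v2 and Hopper-v2;
    # len_features would be undefined for any other environment.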
    if args.env_name == 'Reacher-v2':
        rbf1 = build_features_reacher2(.2, 5, 2)
        len_rbf = rbf1._K
        len_features = len_rbf + 1
    if args.env_name == 'Hopper-v2':
        len_features = 3
    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, args.add_timestep, device,
                         False)

    actor_critic = Policy(envs.observation_space.shape, envs.action_space)

    actor_critic.to(device)

    agent = PPO(actor_critic,
                args.clip_param,
                args.ppo_epoch,
                args.num_mini_batch,
                args.value_loss_coef,
                args.entropy_coef,
                lr=args.lr,
                eps=args.eps,
                max_grad_norm=args.max_grad_norm)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              len_features)
    print('here2')
    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)
    episode_rewards = collections.deque(maxlen=10)
    num_updates = 20
    for j in range(num_updates):

        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            update_linear_schedule(agent.optimizer, j, num_updates, args.lr)
            agent.clip_param = args.clip_param * (1 - j / float(num_updates))

        # Prepare demos
        demo_actions = np.zeros(
            (1, args.num_processes, envs.action_space.shape[0]))
        demo_states = np.zeros(
            (1, args.num_processes, envs.observation_space.shape[0]))

        demo_features = np.zeros((1, args.num_processes, len_features))
        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob = actor_critic.act(
                    rollouts.obs[step], rollouts.masks[step])

            # Record the demo action and current observation before stepping the env
            demo_actions = np.concatenate(
                [demo_actions,
                 action.reshape(1, args.num_processes, -1)], 0)
            demo_states = np.concatenate([
                demo_states, rollouts.obs[step].reshape(
                    1, args.num_processes, -1)
            ], 0)
            feat_rewards = np.zeros((args.num_processes, len_features))
            if args.env_name == 'Hopper-v2':
                if args.num_processes > 1:
                    pos_before = envs.get_sim_data()
            obs, reward, done, infos = envs.step(action)
            if args.env_name == 'Hopper-v2':
                if args.num_processes > 1:
                    pos_after = envs.get_sim_data()
                    for num_p in range(args.num_processes):
                        feat_1 = pos_after[num_p] - pos_before[num_p]
                        feat_2 = 0
                        if not done[num_p]:
                            feat_2 = 1
                        # feat_2 = np.array([1 for _ in range(args.num_processes)])
                        feat_3 = np.array(
                            [np.linalg.norm(action[num_p],
                                            ord=2)**2]).flatten()
                        feat_rewards[num_p] = np.array(
                            [feat_1, feat_2, feat_3])
            if args.env_name == 'Reacher-v2':
                if args.num_processes > 1:
                    body_data = envs.get_body_data()
                    for num_p in range(args.num_processes):
                        rbf1_ = rbf1(body_data[num_p][:-1])
                        rbf4_ = np.array(
                            [np.linalg.norm(action[num_p], ord=2)**2])
                        feat_rewards[num_p] = np.concatenate(
                            (rbf1_.reshape(-1), rbf4_))
                else:
                    rbf1_ = rbf1(
                        (envs.envs[0].env.env.get_body_com("fingertip") -
                         envs.envs[0].env.env.get_body_com("target"))[:-1])
                    rbf4_ = np.array([-np.square(action[0]).sum()])
                    feat_rewards[0] = np.concatenate(
                        (rbf1_.reshape(-1), rbf4_))
            demo_features = np.concatenate([
                demo_features,
                feat_rewards.reshape(1, args.num_processes, -1)
            ], 0)
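            # Overwrite the done flags every 1000 steps, forcing an episode
            # boundary in the recorded masks.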
            if step > 1 and step % 1000 == 0:
                done = [True for _ in range(args.num_processes)]

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            rollouts.insert(obs, action, action_log_prob, \
                            value, reward, masks, feat_rewards)

        # Save demos:
        action_file_name = demos_expe_dir + '/actions_step_' + str(j) + '.npy'
        state_file_name = demos_expe_dir + '/states_step_' + str(j) + '.npy'
        rew_feat_file_name = demos_expe_dir + '/rew_feat_step_' + str(
            j) + '.npy'
        policy_file_name = demos_expe_dir + '/policy_step_' + str(j) + '.pth'
        np.save(action_file_name, demo_actions)
        np.save(state_file_name, demo_states)
        np.save(rew_feat_file_name, demo_features)
        torch.save(actor_critic.state_dict(), policy_file_name)

        with torch.no_grad():
            next_value = actor_critic.get_value(rollouts.obs[-1],
                                                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.gamma, args.tau)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        # save for every interval-th episode or for the last epoch
        if (j % args.save_interval == 0
                or j == num_updates - 1) and args.save_dir:
            save_path = os.path.join(args.save_dir, 'ppo')
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic

            save_model = [
                save_model,
                getattr(get_vec_normalize(envs), 'ob_rms', None)
            ]

            torch.save(save_model,
                       os.path.join(save_path, args.env_name + '.pt'))

        total_num_steps = (j + 1) * args.num_processes * args.num_steps

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            print('Updates', j, 'num timesteps', total_num_steps,
                  '\n Last', len(episode_rewards),
                  'training episodes: mean/median reward',
                  '{:.1f}'.format(np.mean(episode_rewards)),
                  '/{:.1f}'.format(np.median(episode_rewards)),
                  'min/max reward', '{:.1f}'.format(np.min(episode_rewards)),
                  '/{:.1f}'.format(np.max(episode_rewards)), 'dist entropy',
                  dist_entropy, 'value loss', value_loss, 'action loss',
                  action_loss)

        if len(episode_rewards) > 1:
            acc_steps.append(total_num_steps)
            acc_scores.append(np.mean(episode_rewards))
            #print(acc_scores)

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            eval_envs = make_vec_envs(args.env_name,
                                      args.seed + args.num_processes,
                                      args.num_processes, args.gamma,
                                      eval_log_dir, args.add_timestep, device,
                                      True)

            vec_norm = get_vec_normalize(eval_envs)
            if vec_norm is not None:
                vec_norm.eval()
                vec_norm.ob_rms = get_vec_normalize(envs).ob_rms

            eval_episode_rewards = []

            obs = eval_envs.reset()
            eval_masks = torch.zeros(args.num_processes, 1, device=device)

            while len(eval_episode_rewards) < 10:
                with torch.no_grad():
                    _, action, _ = actor_critic.act(obs,
                                                    eval_masks,
                                                    deterministic=True)

                # Observe reward and next obs
                obs, reward, done, infos = eval_envs.step(action)
                eval_masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                                for done_ in done])
                for info in infos:
                    if 'episode' in info.keys():
                        eval_episode_rewards.append(info['episode']['r'])

            eval_envs.close()

            print('Evaluation using', len(eval_episode_rewards),
                  'episodes: mean reward',
                  '{:.5f}\n'.format(np.mean(eval_episode_rewards)))

    scores_file_name = args.scores_dir + '/learner_scores_' + args.env_name + '_' + args.expe + '.npy'
    steps_file_name = args.scores_dir + '/learner_steps_' + args.env_name + '_' + args.expe + '.npy'
    np.save(scores_file_name, np.array(acc_scores))
    np.save(steps_file_name, np.array(acc_steps))
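update_linear_schedule is used in the loop above but not defined in these snippets. A common implementation (an assumption here, mirroring the helper of the same name in the pytorch-a2c-ppo-acktr baselines) decays the learning rate linearly to zero over the run, alongside the PPO clip range:

import torch.nn as nn
import torch.optim as optim

def update_linear_schedule(optimizer, epoch, total_num_epochs, initial_lr):
    # Decay the learning rate linearly from initial_lr down to 0.
    lr = initial_lr - (initial_lr * (epoch / float(total_num_epochs)))
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr

# Usage mirroring the per-update calls above.
optimizer = optim.Adam(nn.Linear(4, 2).parameters(), lr=3e-4)
num_updates, clip_param = 20, 0.2
for j in range(num_updates):
    update_linear_schedule(optimizer, j, num_updates, 3e-4)
    current_clip = clip_param * (1 - j / float(num_updates))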
Exemple #22
0
def main():
    saved_model = os.path.join(args.save_dir, args.env_name + '.pt')
    if os.path.exists(saved_model) and not args.overwrite:
        actor_critic, ob_rms = \
                torch.load(saved_model)
        agent = \
            torch.load(os.path.join(args.save_dir, args.env_name + '_agent.pt'))
        for i in agent.optimizer.state_dict():
            print(dir(agent.optimizer))
            print(getattr(agent.optimizer, 'steps'))
            print(agent.optimizer.state_dict()[i])
        past_steps = agent.optimizer.steps
    else: 
        actor_critic = False
        agent = False
        past_steps = 0
        try:
            os.makedirs(args.log_dir)
        except OSError:
            files = glob.glob(os.path.join(args.log_dir, '*.monitor.csv'))
            for f in files:
                os.remove(f)
    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.port)
        win = None
        win_eval = None

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                        args.gamma, args.log_dir, args.add_timestep, device,
                        False, None, args=args)

    if actor_critic:
        pass
      # vec_norm = get_vec_normalize(envs)
      # if vec_norm is not None:
      #     vec_norm.eval()
      #     vec_norm.ob_rms = ob_rms
        
    else:
        actor_critic = Policy(envs.observation_space.shape, envs.action_space,
            base_kwargs={'map_width': args.map_width, 'num_actions': 18, 'recurrent': args.recurrent_policy},
            curiosity=args.curiosity, algo=args.algo, model=args.model, args=args)
    actor_critic.to(device)

    evaluator = None

    if not agent:
        if args.algo == 'a2c':
            agent = algo.A2C_ACKTR_NOREWARD(actor_critic, args.value_loss_coef,
                                   args.entropy_coef, lr=args.lr,
                                   eps=args.eps, alpha=args.alpha,
                                   max_grad_norm=args.max_grad_norm,
                                   curiosity=args.curiosity, args=args)
        elif args.algo == 'ppo':
            agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch,
                             args.value_loss_coef, args.entropy_coef, lr=args.lr,
                                   eps=args.eps,
                                   max_grad_norm=args.max_grad_norm)
        elif args.algo == 'acktr':
            agent = algo.A2C_ACKTR_NOREWARD(actor_critic, args.value_loss_coef,
                                   args.entropy_coef, lr=args.lr,
                                   eps=args.eps, alpha=args.alpha,
                                   max_grad_norm=args.max_grad_norm,
                                   acktr=True,
                                   curiosity=args.curiosity, args=args)

    if args.curiosity:
        rollouts = CuriosityRolloutStorage(args.num_steps, args.num_processes,
                            envs.observation_space.shape, envs.action_space,
                            actor_critic.recurrent_hidden_state_size, actor_critic.base.feature_state_size(), args=args)
    else:
        rollouts = RolloutStorage(args.num_steps, args.num_processes,
                            envs.observation_space.shape, envs.action_space,
                            actor_critic.recurrent_hidden_state_size, args=args)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)

    start = time.time()
    for j in range(num_updates - past_steps):
        if args.drop_path:
            actor_critic.base.get_drop_path()
        player_act = None
        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():

                value, action, action_log_probs, recurrent_hidden_states = actor_critic.act(
                        rollouts.obs[step],
                        rollouts.recurrent_hidden_states[step],
                        rollouts.masks[step],
                        player_act=player_act,
                        icm_enabled=args.curiosity)

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)

            player_act = None
            if args.render:

                if infos[0]:
                    if 'player_move' in infos[0].keys():
                        player_act = infos[0]['player_move']
            

            if args.curiosity:
                # run icm
                with torch.no_grad():
                    feature_state, feature_state_pred, action_dist_pred = actor_critic.icm_act(
                            (rollouts.obs[step], obs, action_bin)
                            )

                intrinsic_reward = args.eta * ((feature_state - feature_state_pred).pow(2)).sum() / 2.
                if args.no_reward:
                    reward = 0
                reward += intrinsic_reward.cpu()

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            if args.curiosity:
                rollouts.insert(obs, recurrent_hidden_states, action, action_log_probs, value, reward, masks,
                                feature_state, feature_state_pred, action_bin, action_dist_pred)
            else:
                rollouts.insert(obs, recurrent_hidden_states, action, action_log_probs, value, reward, masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(rollouts.obs[-1],
                                                rollouts.recurrent_hidden_states[-1],
                                                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau)
        
        if args.curiosity:
            value_loss, action_loss, dist_entropy, fwd_loss, inv_loss = agent.update(rollouts)
        else:
            value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            save_model = [save_model,
                          getattr(get_vec_normalize(envs), 'ob_rms', None)]

            torch.save(save_model, os.path.join(save_path, args.env_name + ".pt"))
            save_agent = copy.deepcopy(agent)

            torch.save(save_agent, os.path.join(save_path, args.env_name + '_agent.pt'))
           #torch.save(actor_critic.state_dict(), os.path.join(save_path, args.env_name + "_weights.pt"))

        total_num_steps = (j + 1) * args.num_processes * args.num_steps

        if not dist_entropy:
            dist_entropy = 0
        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            end = time.time()
            print("Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n \
dist entropy {:.1f}, val/act loss {:.1f}/{:.1f},".
                format(j, total_num_steps,
                       int(total_num_steps / (end - start)),
                       len(episode_rewards),
                       np.mean(episode_rewards),
                       np.median(episode_rewards),
                       np.min(episode_rewards),
                       np.max(episode_rewards), dist_entropy,
                       value_loss, action_loss))
            if args.curiosity:
                print("fwd/inv icm loss {:.1f}/{:.1f}\n".
                format(
                       fwd_loss, inv_loss))

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            if evaluator is None:
                evaluator = Evaluator(args, actor_critic, device)


            if args.model == 'fractal':
                n_cols = evaluator.actor_critic.base.n_cols
                for i in range(-1, n_cols):
                    evaluator.evaluate(column=i)
               #num_eval_frames = (args.num_frames // (args.num_steps * args.eval_interval * args.num_processes)) * args.num_processes *  args.max_step
                win_eval = visdom_plot(viz, win_eval, evaluator.eval_log_dir, args.env_name,
                              args.algo, args.num_frames, n_graphs=args.n_recs)
            else:
                evaluator.evaluate(column=None)



        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.env_name,
                                  args.algo, args.num_frames)
            except IOError:
                pass
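The "really ugly way to save a model to CPU" that recurs in these examples amounts to deep-copying the network, moving the copy off the GPU so the live model keeps training, and saving it alongside the observation-normalization statistics. A condensed sketch with a stand-in module:

import copy
import os
import torch
import torch.nn as nn

actor_critic = nn.Linear(4, 2)          # stand-in for the Policy network above
use_cuda = torch.cuda.is_available()
if use_cuda:
    actor_critic = actor_critic.cuda()

save_model = actor_critic
if use_cuda:
    # Deep-copy first so the GPU model used for training is left untouched.
    save_model = copy.deepcopy(actor_critic).cpu()

# In the real code ob_rms comes from getattr(get_vec_normalize(envs), 'ob_rms', None).
ob_rms = None
torch.save([save_model, ob_rms], os.path.join('.', 'MyEnv-v0.pt'))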
Exemple #23
0
def main():
    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    experiment_name = args.env_name + '-' + args.algo + '-' + datetime.datetime.now(
    ).strftime("%Y-%m-%d-%H-%M-%S-%f")
    log_dir, eval_log_dir, save_dir = setup_dirs(experiment_name, args.log_dir,
                                                 args.save_dir)

    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.port)
        win = None

    envs = make_vec_envs(args.env_name,
                         args.seed,
                         args.num_processes,
                         args.gamma,
                         log_dir,
                         args.add_timestep,
                         device,
                         False,
                         frame_skip=args.frame_skip)

    if args.load_path:
        actor_critic, _ob_rms = torch.load(args.load_path)
        vec_norm = get_vec_normalize(envs)
        if vec_norm is not None:
            vec_norm.train()
            vec_norm.ob_rms = _ob_rms
        actor_critic.train()
    else:
        actor_critic = Policy(envs.observation_space.shape,
                              envs.action_space,
                              beta=args.beta_dist,
                              base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic.to(device)

    if args.algo.startswith('a2c'):
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               lr_schedule=args.lr_schedule,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo.startswith('ppo'):
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         lr_schedule=args.lr_schedule,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)

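    # Optional self-imitation learning: wrap the base agent and keep a replay
    # buffer of past transitions for the SIL updates.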
    if args.algo.endswith('sil'):
        agent = algo.SIL(agent,
                         update_ratio=args.sil_update_ratio,
                         epochs=args.sil_epochs,
                         batch_size=args.sil_batch_size,
                         beta=args.sil_beta,
                         value_loss_coef=args.sil_value_loss_coef,
                         entropy_coef=args.sil_entropy_coef)
        replay = ReplayStorage(10000,
                               num_processes=args.num_processes,
                               gamma=args.gamma,
                               prio_alpha=args.sil_alpha,
                               obs_shape=envs.observation_space.shape,
                               action_space=envs.action_space,
                               recurrent_hidden_state_size=actor_critic.
                               recurrent_hidden_state_size,
                               device=device)
    else:
        replay = None

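    # Pre-compute the Box action bounds and their midpoint once; they are used
    # below to optionally shift and clip sampled actions before stepping the env.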
    action_high = torch.from_numpy(envs.action_space.high).to(device)
    action_low = torch.from_numpy(envs.action_space.low).to(device)
    action_mid = 0.5 * (action_high + action_low)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)
    benchmark_rewards = deque(maxlen=10)

    start = time.time()
    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                # sample actions
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

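            # Optionally shrink the raw Gaussian sample toward the action-space
            # midpoint and clip it into [low, high] before stepping the env.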
            if args.clip_action and isinstance(envs.action_space,
                                               gym.spaces.Box):
                clipped_action = action.clone()
                if args.shift_action:
                    # FIXME experimenting with this; so far it results in
                    # faster learning when clipping Gaussian continuous
                    # output (vs. leaving it centred at 0 and unscaled)
                    clipped_action = 0.5 * clipped_action + action_mid
                clipped_action = torch.max(
                    torch.min(clipped_action, action_high), action_low)
            else:
                clipped_action = action

            # act in environment and observe
            obs, reward, done, infos = envs.step(clipped_action)

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])
                    if 'rb' in info['episode']:
                        benchmark_rewards.append(info['episode']['rb'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks)
            if replay is not None:
                replay.insert(rollouts.obs[step],
                              rollouts.recurrent_hidden_states[step], action,
                              reward, done)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)

        value_loss, action_loss, dist_entropy = agent.update(
            rollouts, j, replay)

        rollouts.after_update()

        total_num_steps = (j + 1) * args.num_processes * args.num_steps

        train_eprew = np.mean(episode_rewards)
        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            end = time.time()
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} episodes: mean/med {:.1f}/{:.1f}, min/max reward {:.2f}/{:.2f}, dist entropy {:.2f}, val/act loss {:.2f}/{:.2f}"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), train_eprew,
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), dist_entropy, value_loss,
                        action_loss),
                end='')
            if len(benchmark_rewards):
                print(", benchmark {:.1f}/{:.1f}, {:.1f}/{:.1f}".format(
                    np.mean(benchmark_rewards), np.median(benchmark_rewards),
                    np.min(benchmark_rewards), np.max(benchmark_rewards)),
                      end='')
            print()

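        # Periodically evaluate the deterministic policy in separate envs
        # that reuse the training envs' observation-normalization statistics.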
        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            eval_envs = make_vec_envs(args.env_name,
                                      args.seed + args.num_processes,
                                      args.num_processes, args.gamma,
                                      eval_log_dir, args.add_timestep, device,
                                      True)

            vec_norm = get_vec_normalize(eval_envs)
            if vec_norm is not None:
                vec_norm.eval()
                vec_norm.ob_rms = get_vec_normalize(envs).ob_rms

            eval_episode_rewards = []

            obs = eval_envs.reset()
            eval_recurrent_hidden_states = torch.zeros(
                args.num_processes,
                actor_critic.recurrent_hidden_state_size,
                device=device)
            eval_masks = torch.zeros(args.num_processes, 1, device=device)

            while len(eval_episode_rewards) < 10:
                with torch.no_grad():
                    _, action, _, eval_recurrent_hidden_states = actor_critic.act(
                        obs,
                        eval_recurrent_hidden_states,
                        eval_masks,
                        deterministic=True)

                clipped_action = action
                if args.clip_action and isinstance(envs.action_space,
                                                   gym.spaces.Box):
                    if args.shift_action:
                        clipped_action = 0.5 * clipped_action + action_mid
                    clipped_action = torch.max(
                        torch.min(clipped_action, action_high), action_low)

                obs, reward, done, infos = eval_envs.step(clipped_action)

                eval_masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                                for done_ in done])
                for info in infos:
                    if 'episode' in info.keys():
                        eval_episode_rewards.append(info['episode']['r'])

            eval_envs.close()

            eval_eprew = np.mean(eval_episode_rewards)
            print(" Evaluation using {} episodes: mean reward {:.5f}\n".format(
                len(eval_episode_rewards), eval_eprew))

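        # Periodically checkpoint the policy (moved to CPU) together with the
        # observation-normalization statistics.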
        if (len(episode_rewards) and j % args.save_interval == 0
                and save_dir != ""):
            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            save_model = [
                save_model,
                getattr(get_vec_normalize(envs), 'ob_rms', None)
            ]

            ep_rewstr = ("%d" % train_eprew).replace("-", "n")
            save_filename = os.path.join(
                save_dir, 'checkpoint-%d-%s.pt' % (j, ep_rewstr))

            torch.save(save_model, save_filename)

        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, log_dir, args.env_name, args.algo,
                                  args.num_frames)
            except IOError:
                pass
Exemple #24
0
args = parser.parse_args()

args.det = not args.non_det

env = make_vec_envs(args.env_name, args.seed + 1000, 1,
                    None, None, args.add_timestep, device='cpu',
                    allow_early_resets=False)

# Get a render function
render_func = get_render_func(env)

# We need to use the same statistics for normalization as used in training
actor_critic, ob_rms = torch.load(
    os.path.join(args.load_dir, args.env_name + ".pt"))

vec_norm = get_vec_normalize(env)
if vec_norm is not None:
    vec_norm.eval()
    vec_norm.ob_rms = ob_rms

recurrent_hidden_states = torch.zeros(1, actor_critic.recurrent_hidden_state_size)
masks = torch.zeros(1, 1)

if render_func is not None:
    render_func('human')

obs = env.reset()

if args.env_name.find('Bullet') > -1:
    import pybullet as p
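
# The listing is truncated above; what follows is a minimal sketch (added
# here, not part of the original example) of the act/step/render loop that
# such an enjoy script typically runs. The Bullet branch would normally also
# set up camera tracking, which is omitted in this sketch.
while True:
    with torch.no_grad():
        value, action, _, recurrent_hidden_states = actor_critic.act(
            obs, recurrent_hidden_states, masks, deterministic=args.det)

    # Step the single (vectorized) environment with the chosen action
    obs, reward, done, _ = env.step(action)
    masks.fill_(0.0 if done else 1.0)

    if render_func is not None:
        render_func('human')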
Exemple #25
0
def main():
    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.port)
        vizz = Visdom(port=args.port)
        win = None
        winloss = None

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, args.add_timestep, device,
                         False)

    actor_critic = Policy(envs.observation_space.shape,
                          envs.action_space,
                          base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic.to(device)

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)
    # Initialize the backward (BW) model
    if args.bw:
        bw_model = bw_module(actor_critic, args, agent.optimizer,
                             envs.action_space, envs.observation_space)
    vis_timesteps = []
    vis_loss = []

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)

    start = time.time()
    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks)

            # Add the transition to the BW model's replay buffer
            if args.bw:
                bw_model.step(rollouts.obs[step].detach().cpu().numpy(),
                              action.detach().cpu().numpy(),
                              reward.detach().cpu().numpy(), done,
                              obs.detach().cpu().numpy())

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        # Train the BW model (and the imitation/consistency losses) every n_a2c updates
        if args.bw and (j % args.n_a2c == 0):
            if not args.consistency:
                l_bw, l_imi = 0.0, 0.0
                for _ in range(args.n_bw):
                    l_bw += bw_model.train_bw_model(j)
                l_bw /= args.n_bw
                for _ in range(args.n_imi):
                    l_imi += bw_model.train_imitation(j)
                l_imi /= args.n_imi
            else:
                l_bw, l_fw = 0.0, 0.0
                for _ in range(args.n_bw):
                    l_bw_, l_fw_ = bw_model.train_bw_model(j)
                    l_bw += l_bw_
                    l_fw += l_fw_
                l_bw /= args.n_bw
                l_fw /= args.n_bw
                l_imi, l_cons = 0.0, 0.0
                for _ in range(args.n_imi):
                    l_imi_, l_cons_ = bw_model.train_imitation(j)
                    l_imi += l_imi_
                    l_cons += l_cons_
                l_imi /= args.n_imi
                l_cons /= args.n_imi

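        # Checkpoint the policy (on CPU) together with ob_rms so downstream
        # evaluation scripts can reuse the same normalization statistics.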
        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            save_model = [
                save_model,
                getattr(get_vec_normalize(envs), 'ob_rms', None)
            ]

            torch.save(save_model,
                       os.path.join(save_path, args.env_name + ".pt"))

        total_num_steps = (j + 1) * args.num_processes * args.num_steps

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            end = time.time()
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.2f}, value/action loss {:.3f}/{:.3f}\n"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), dist_entropy, value_loss,
                        action_loss))

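        # Evaluate the deterministic policy in separate envs that reuse the
        # training observation-normalization statistics.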
        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            eval_envs = make_vec_envs(args.env_name,
                                      args.seed + args.num_processes,
                                      args.num_processes, args.gamma,
                                      eval_log_dir, args.add_timestep, device,
                                      True)

            vec_norm = get_vec_normalize(eval_envs)
            if vec_norm is not None:
                vec_norm.eval()
                vec_norm.ob_rms = get_vec_normalize(envs).ob_rms

            eval_episode_rewards = []

            obs = eval_envs.reset()
            eval_recurrent_hidden_states = torch.zeros(
                args.num_processes,
                actor_critic.recurrent_hidden_state_size,
                device=device)
            eval_masks = torch.zeros(args.num_processes, 1, device=device)

            while len(eval_episode_rewards) < 10:
                with torch.no_grad():
                    _, action, _, eval_recurrent_hidden_states = actor_critic.act(
                        obs,
                        eval_recurrent_hidden_states,
                        eval_masks,
                        deterministic=True)

                # Observe reward and next obs
                obs, reward, done, infos = eval_envs.step(action)

                eval_masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                                for done_ in done])
                for info in infos:
                    if 'episode' in info.keys():
                        eval_episode_rewards.append(info['episode']['r'])

            eval_envs.close()

            print(" Evaluation using {} episodes: mean reward {:.5f}\n".format(
                len(eval_episode_rewards), np.mean(eval_episode_rewards)))

        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                env_name = args.env_name
                if args.bw:
                    env_name += 'BW'
                win = visdom_plot(viz, win, args.log_dir, env_name, args.algo,
                                  args.num_frames)
            except IOError:
                pass

        # Save to Visdom Plots
        if args.vis and (j % args.vis_interval == 0):
            if args.bw and args.consistency:
                vis_loss.append(
                    [value_loss, action_loss, l_bw, l_imi, l_fw, l_cons])
                legend = [
                    'Value loss', 'Action loss', 'BW Loss', 'IMI loss',
                    'FW Loss', 'CONST loss'
                ]
                title = args.env_name + '-' + 'bw' + '-' + 'consistency' + args.title
            elif args.bw:
                vis_loss.append([value_loss, action_loss, l_bw, l_imi])
                legend = ['Value loss', 'Action loss', 'BW Loss', 'IMI loss']
                title = args.env_name + '-' + 'bw' + args.title
            else:
                vis_loss.append([value_loss, action_loss])
                legend = ['Value loss', 'Action loss']
                title = args.env_name + '-' + 'vanilla'
            vis_timesteps.append(
                (j + 1) * (args.num_processes * args.num_steps))
            # vis_rewards.append(final_rewards.mean())
            # vis_rewards.append(np.mean(reward_queue))

            # if win is None:
            #     win = vizz.line(Y=np.array(vis_rewards), X=np.array(vis_timesteps), opts=dict(title=title, xlabel='Timesteps',
            #                 ylabel='Avg Rewards'))
            # vizz.line(Y=np.array(vis_rewards), X=np.array(vis_timesteps), win=win, update='replace', opts=dict(title=title, xlabel='Timesteps',
            #                 ylabel='Avg Rewards'))
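            # Create the Visdom loss plot on the first logging step, then
            # redraw it in place on subsequent steps.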
            if winloss is None:
                winloss = vizz.line(Y=np.array(vis_loss),
                                    X=np.array(vis_timesteps),
                                    opts=dict(title=title,
                                              xlabel='Timesteps',
                                              ylabel='Losses',
                                              legend=legend))
            vizz.line(Y=np.array(vis_loss),
                      X=np.array(vis_timesteps),
                      win=winloss,
                      update='replace',
                      opts=dict(title=title,
                                xlabel='Timesteps',
                                ylabel='Losses',
                                legend=legend))