Example #1
 def __init__(self, new_constants=None):
     # Update constants (use None instead of a mutable default dict)
     if new_constants is None:
         new_constants = {}
     assert isinstance(new_constants, dict)
     for c in new_constants:
         if c in self.constants:
             self.constants[c] = new_constants[c]
     # Clear existing logs
     if self.constants["clear_logs"]:
         for fname in glob.glob("*_log_*.txt"):
             os.remove(fname)
             print('Removed: %s' % fname)
     # set device
     self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
     torch.manual_seed(self.constants["seed"])
     torch.cuda.manual_seed_all(self.constants["seed"])
     # construct envs
     args = self.args_k("seed", "num_processes", "gamma", "log_dir")
     assert self.constants["env"] is not None
     if isinstance(self.constants["env"], str):  # Gym env
         self.envs = make_vec_envs(self.constants["env"], *args, self.device, False)
     else: # Custom env (TODO: multi-process?)
         self.envs = make_vec_envs_custom(self.constants, self.device, self.constants["env"])
     # construct actor critic
     env_args = (self.envs.observation_space.shape, self.envs.action_space)
     self.actor_critic = Policy(*env_args, base_kwargs={'recurrent': self.constants["recurrent_policy"]}).to(self.device)
     # construct PPO
     args = self.args_k("clip_param", "ppo_epoch", "minibatch_size", "value_loss_coef", "entropy_coef")
     kwargs = self.kwargs_k("lr", "eps", "max_grad_norm")
     self.agent = algo.PPO(self.actor_critic, *args, **kwargs)
     # rollout storage / experiences
     args = self.args_k("forward_steps", "num_processes")
     self.rollouts = RolloutStorage(*args, *env_args, self.actor_critic.recurrent_hidden_state_size)
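
The constructor above relies on `args_k` and `kwargs_k` helpers that are not shown in this snippet. A minimal sketch of what they might look like, assuming they simply pull values out of `self.constants` in the order the keys are listed (the helper bodies below are an assumption, not part of the original source):

 def args_k(self, *keys):
     # Hypothetical helper: return the named constants as a positional tuple,
     # in the order the keys are listed.
     return tuple(self.constants[k] for k in keys)

 def kwargs_k(self, *keys):
     # Hypothetical helper: return the named constants as a keyword dict.
     return {k: self.constants[k] for k in keys}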
Example #2
def load_alg():
    actor_critic_2, ob_rms = torch.load(model_path)
    actor_critic = Policy(envs.observation_space.shape,
                          envs.action_space,
                          base_kwargs={
                              'recurrent': args.recurrent_policy
                          }).to(device)
    custom_loader(actor_critic, actor_critic_2)
    del actor_critic_2

    agent = algo.PPO(actor_critic,
                     args.clip_param,
                     args.ppo_epoch,
                     args.num_mini_batch,
                     args.value_loss_coef,
                     args.entropy_coef,
                     lr=args.lr,
                     eps=args.eps,
                     max_grad_norm=args.max_grad_norm,
                     weight_decay=args.weight_decay)
    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)
    return actor_critic, ob_rms, agent, rollouts
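
`custom_loader` is not defined in this example. A plausible minimal sketch, assuming it copies every parameter whose name and shape match from the loaded network into the freshly constructed one (an assumption; the original implementation may differ):

def custom_loader(dst_model, src_model):
    # Hypothetical helper: transfer all compatible parameters/buffers and
    # leave the rest of dst_model untouched.
    dst_state = dst_model.state_dict()
    src_state = src_model.state_dict()
    compatible = {k: v for k, v in src_state.items()
                  if k in dst_state and dst_state[k].shape == v.shape}
    dst_state.update(compatible)
    dst_model.load_state_dict(dst_state)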
Example #3
    def __init__(
        self,
        env,
        policy,
        device,
        max_num_epoch_paths_saved=None,
        render=False,
        render_kwargs=None,
        num_processes=1,
    ):
        super().__init__(env, policy, max_num_epoch_paths_saved, render,
                         render_kwargs, num_processes)
        self.num_processes = num_processes

        self.device = device
        self.is_json = isinstance(env.observation_space, Json)
        self.is_tuple = False
        if self.is_json:
            self.json_to_screen = env.observation_space.converter
            self.is_tuple = isinstance(env.observation_space.image, Tuple)

        # Resolve the observation shape used for rollout storage.
        if self.is_json and self.is_tuple:
            self.shape = (env.observation_space.image[0].shape,
                          env.observation_space.image[1].shape)
        elif self.is_json:
            self.shape = env.observation_space.image.shape
        else:
            self.shape = env.observation_space.shape
        self._rollouts = RolloutStorage(
            max_num_epoch_paths_saved,
            num_processes,
            self.shape,
            env.action_space,
            1,  # hardcoding recurrent hidden state off for now.
        )

        raw_obs = env.reset()
        action_obs = self._convert_to_torch(raw_obs)
        stored_obs = _flatten_tuple(
            action_obs) if self.is_tuple else action_obs
        self.obs = (raw_obs if isinstance(self, HierarchicalStepCollector) or
                    isinstance(self, ThreeTierStepCollector) else action_obs)

        # print(raw_obs.shape)
        # print(action_obs.shape)
        # print(stored_obs.shape)

        self._rollouts.obs[0].copy_(stored_obs)
        self._rollouts.to(self.device)
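
The helpers `_convert_to_torch` and `_flatten_tuple` are used above but not shown. A rough sketch of `_flatten_tuple`, assuming it merges a tuple observation (e.g. image plus state vector) into one flat tensor per process (this behavior is inferred, not the source's code):

import torch

def _flatten_tuple(observation):
    # Hypothetical helper: flatten each element of a tuple observation and
    # concatenate along the feature dimension, keeping the batch dimension.
    return torch.cat([o.reshape(o.shape[0], -1) for o in observation], dim=-1)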
Example #4
def ppo_experiment(args):


    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    device = torch.device("cuda:0" if args.cuda else "cpu")


    if args.cuda and torch.cuda.is_available() and args.cuda_deterministic:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    env_kwargs = dict()

    env_kwargs['timestep'] = args.timestep

    if "push" in args.env_name:
        env_kwargs['params'] = 'random_goal_unconstrained'

    if "soccer" in args.env_name:
        env_kwargs['params'] = 'random_goal_unconstrained'

    if "faucet" in args.env_name:
        secondary_output = True


    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                             args.gamma, args.log_dir, device, False, env_kwargs=env_kwargs)

    test_envs = make_vec_envs(args.env_name, args.seed + args.num_processes, args.num_processes,
                             None, args.log_dir, device, False, env_kwargs=env_kwargs)

    actor_critic = Policy(
            envs.observation_space.shape,
            envs.action_space,
            base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic.to(device)

    agent = algo.PPO(
        actor_critic,
        args.clip_param,
        args.ppo_epoch,
        args.num_mini_batch,
        args.value_loss_coef,
        args.entropy_coef,
        lr=args.lr,
        eps=args.eps,
        max_grad_norm=args.max_grad_norm)



    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    train_ppo(actor_critic, agent, rollouts, envs, test_envs, args)
Example #5
def job(rank, args, device, shared_model):
    episode_rewards = deque(maxlen=10)
    envs = gym.make(args.env_name)
    envs.seed(args.seed + rank)

    actor_critic = Policy(envs.observation_space.shape,
                          envs.action_space,
                          base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic.to(device)
    actor_critic.load_state_dict(shared_model.state_dict())

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)
    obs = envs.reset()
    obs = torch.from_numpy(obs)
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)
    acc_r = 0
    done = [False]

    for step in range(args.num_steps):
        if done[0]:
            episode_rewards.append(acc_r)
            obs = envs.reset()
            obs = torch.from_numpy(obs)
            rollouts.obs[0].copy_(obs)
            rollouts.to(device)
            acc_r = 0

        # Sample actions
        with torch.no_grad():
            value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                rollouts.masks[step])

        # Observe reward and next obs
        target_action = action.numpy()[0]
        obs, reward, done, infos = envs.step(target_action)

        acc_r += reward
        obs = torch.from_numpy(obs).float().to(device)
        reward = torch.from_numpy(np.array([reward])).unsqueeze(dim=1).float()
        done = [done]

        masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                   for done_ in done])
        bad_masks = torch.FloatTensor([[1.0] for done_ in done])
        rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob,
                        value, reward, masks, bad_masks)
    print(rank, np.mean(episode_rewards))
    return rollouts
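
The `job` worker above expects a `shared_model` whose weights live in shared memory. A minimal launch sketch, assuming `torch.multiprocessing` is used to fan out one worker per rank (the launcher below is an assumption and is not part of the original example):

import torch.multiprocessing as mp

def launch_workers(args, device, shared_model, n_workers=4):
    # Hypothetical launcher: put the model parameters in shared memory and
    # start one rollout worker per rank, then wait for all of them.
    shared_model.share_memory()
    workers = []
    for rank in range(n_workers):
        p = mp.Process(target=job, args=(rank, args, device, shared_model))
        p.start()
        workers.append(p)
    for p in workers:
        p.join()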
Example #6
    def make_agent(is_leaf=True):
        ## AGENT CONSTRUCTION:
        ## Modularize this and allow for cascading (obs dim for child policy should be cat of obs and parent's output)
        actor_critic = OpsPolicy(
            envs.observation_space.shape,
            envs.action_space if is_leaf else gym.spaces.Discrete(2),
            is_leaf=is_leaf,
            base_kwargs=dict(recurrent=True,
                             partial_obs=args.partial_obs,
                             gate_input=args.gate_input))

        actor_critic.to(device)
        # wandb.watch(actor_critic.base)

        if args.algo == 'a2c':
            agent = algo.A2C_ACKTR(actor_critic,
                                   args.value_loss_coef,
                                   args.entropy_coef,
                                   args.pred_loss_coef,
                                   lr=args.lr,
                                   eps=args.eps,
                                   alpha=args.alpha,
                                   max_grad_norm=args.max_grad_norm)
        elif args.algo == 'ppo':
            agent = algo.PPO(actor_critic,
                             args.clip_param,
                             args.ppo_epoch,
                             args.num_mini_batch,
                             args.value_loss_coef,
                             args.entropy_coef,
                             args.pred_loss_coef,
                             lr=args.lr,
                             eps=args.eps,
                             max_grad_norm=args.max_grad_norm)
        elif args.algo == 'acktr':
            agent = algo.A2C_ACKTR(actor_critic,
                                   args.value_loss_coef,
                                   args.entropy_coef,
                                   acktr=True)

        rollouts = RolloutStorage(args.num_steps,
                                  args.num_processes,
                                  envs.observation_space.shape,
                                  envs.action_space,
                                  actor_critic.recurrent_hidden_state_size,
                                  info_size=2 if is_leaf else 0)

        return actor_critic, agent, rollouts
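
For context, the factory above can build either a gating agent (`is_leaf=False`, emitting a binary `Discrete(2)` decision) or a leaf agent that acts in the environment. A short, hypothetical wiring of the two (the source only shows the factory itself):

# Hypothetical usage: one non-leaf "gate" agent and one leaf agent built
# from the same factory; how their rollouts are combined is not shown here.
gate_policy, gate_agent, gate_rollouts = make_agent(is_leaf=False)
leaf_policy, leaf_agent, leaf_rollouts = make_agent(is_leaf=True)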
Example #7
def make_agent():

    # Make the device
    device = torch.device('cpu')

    # Number of parallel environments to generate games in
    n_envs = 50

    # Number of steps per environment to simulate
    n_steps = 400

    # The gym environment
    env = BattlesnakeEnv(n_threads=1, n_envs=n_envs)

    # Storage for rollouts (game turns played and the rewards)
    rollouts = RolloutStorage(n_steps, n_envs, env.observation_space.shape,
                              env.action_space, n_steps)
    env.close()

    # Create our policy as defined above
    policy = create_policy(env.observation_space.shape, env.action_space,
                           SnakePolicyBase)

    # Load old state dictionary from training
    policy.load_state_dict(
        torch.load("weights/battlesnakeWeights.pt", map_location=device))
    policy.eval()

    best_old_policy = create_policy(env.observation_space.shape,
                                    env.action_space, SnakePolicyBase)

    # Let's make the old policy the same as the current one
    best_old_policy.load_state_dict(policy.state_dict())

    agent = PPO(policy,
                value_loss_coef=0.5,
                entropy_coef=0.01,
                max_grad_norm=0.5,
                clip_param=0.2,
                ppo_epoch=4,
                num_mini_batch=32,
                eps=1e-5,
                lr=1e-3)

    return agent, policy
Example #8
def train_ppo_from_scratch(args):

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    if args.cuda and torch.cuda.is_available() and args.cuda_deterministic:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    log_dir = os.path.expanduser(args.log_dir)
    eval_log_dir = log_dir + "_eval"
    utils.cleanup_log_dir(log_dir)
    utils.cleanup_log_dir(eval_log_dir)

    torch.set_num_threads(2)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, device, True)

    actor_critic = Policy(  # 2-layer fully connected network
        envs.observation_space.shape,
        envs.action_space,
        base_kwargs={
            'recurrent': False,
            'hidden_size': 32
        })
    actor_critic.to(device)

    agent = algo.PPO(actor_critic,
                     args.clip_param,
                     args.ppo_epoch,
                     args.num_mini_batch,
                     args.value_loss_coef,
                     args.entropy_coef,
                     lr=args.lr,
                     eps=args.eps,
                     max_grad_norm=args.max_grad_norm)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)

    start = time.time()
    num_updates = int(
        args.num_env_steps) // args.num_steps // args.num_processes

    episode_reward_means = []
    episode_reward_times = []

    for j in range(num_updates):

        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            utils.update_linear_schedule(agent.optimizer, j, num_updates,
                                         args.lr)

        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks, bad_masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.gae_lambda, args.use_proper_time_limits)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        # save for every interval-th episode or for the last epoch
        if (j % args.save_interval == 0
                or j == num_updates - 1) and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            torch.save([
                actor_critic,
                getattr(utils.get_vec_normalize(envs), 'ob_rms', None)
            ], os.path.join(save_path, args.env_name + ".pt"))

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            end = time.time()
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), dist_entropy, value_loss,
                        action_loss))

            episode_reward_means.append(np.mean(episode_rewards))
            episode_reward_times.append(total_num_steps)

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            ob_rms = utils.get_vec_normalize(envs).ob_rms
            evaluate(actor_critic, ob_rms, args.env_name, args.seed,
                     args.num_processes, eval_log_dir, device)

    print(episode_reward_means, episode_reward_times)
    return episode_reward_means, episode_reward_times
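
A small usage sketch for the values returned by `train_ppo_from_scratch` (the plotting code below is illustrative and assumes `matplotlib` is available; it is not part of the original example):

import matplotlib.pyplot as plt

means, times = train_ppo_from_scratch(args)
plt.plot(times, means)
plt.xlabel('environment steps')
plt.ylabel('mean episode reward (last 10 episodes)')
plt.savefig('ppo_learning_curve.png')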
Example #9
def main():
    args = get_args()

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    if args.cuda and torch.cuda.is_available() and args.cuda_deterministic:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    log_dir = os.path.expanduser(args.log_dir)
    eval_log_dir = log_dir + "_eval"
    utils.cleanup_log_dir(log_dir)
    utils.cleanup_log_dir(eval_log_dir)

    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    if args.env_name.startswith("lab_"):
        gym_name, flow_json = make_lab_env(args.env_name)

        args.env_name = gym_name

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, device, False)

    actor_critic = Policy(
        envs.observation_space.shape,
        envs.action_space,
        base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic.to(device)

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(
            actor_critic,
            args.value_loss_coef,
            args.entropy_coef,
            lr=args.lr,
            eps=args.eps,
            alpha=args.alpha,
            max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(
            actor_critic,
            args.clip_param,
            args.ppo_epoch,
            args.num_mini_batch,
            args.value_loss_coef,
            args.entropy_coef,
            lr=args.lr,
            eps=args.eps,
            max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(
            actor_critic, args.value_loss_coef, args.entropy_coef, acktr=True)

    if args.gail:
        assert len(envs.observation_space.shape) == 1
        discr = gail.Discriminator(
            envs.observation_space.shape[0] + envs.action_space.shape[0], 100,
            device)
        file_name = os.path.join(
            args.gail_experts_dir, "trajs_{}.pt".format(
                args.env_name.split('-')[0].lower()))
        
        expert_dataset = gail.ExpertDataset(
            file_name, num_trajectories=4, subsample_frequency=20)
        drop_last = len(expert_dataset) > args.gail_batch_size
        gail_train_loader = torch.utils.data.DataLoader(
            dataset=expert_dataset,
            batch_size=args.gail_batch_size,
            shuffle=True,
            drop_last=drop_last)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)

    start = time.time()
    num_updates = int(
        args.num_env_steps) // args.num_steps // args.num_processes

    for j in range(num_updates):

        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            utils.update_linear_schedule(
                agent.optimizer, j, num_updates,
                agent.optimizer.lr if args.algo == "acktr" else args.lr)

        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor(
                [[0.0] if done_ else [1.0] for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks, bad_masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        if args.gail:
            if j >= 10:
                envs.venv.eval()

            gail_epoch = args.gail_epoch
            if j < 10:
                gail_epoch = 100  # Warm up
            for _ in range(gail_epoch):
                discr.update(gail_train_loader, rollouts,
                             utils.get_vec_normalize(envs)._obfilt)

            for step in range(args.num_steps):
                rollouts.rewards[step] = discr.predict_reward(
                    rollouts.obs[step], rollouts.actions[step], args.gamma,
                    rollouts.masks[step])

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.gae_lambda, args.use_proper_time_limits)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        # save for every interval-th episode or for the last epoch
        if (j % args.save_interval == 0
                or j == num_updates - 1) and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            torch.save([
                actor_critic,
                getattr(utils.get_vec_normalize(envs), 'ob_rms', None)
            ], os.path.join(save_path, args.env_name + ".pt"))

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            end = time.time()
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: "
                "mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), dist_entropy, value_loss,
                        action_loss))

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            ob_rms = utils.get_vec_normalize(envs).ob_rms
            evaluate(actor_critic, ob_rms, args.env_name, args.seed,
                     args.num_processes, eval_log_dir, device)
Example #10
def train():
    processes = []
    if os.path.isdir(args.log_dir):
        ans = input('{} exists\ncontinue and overwrite? y/n: '.format(
            args.log_dir))
        if ans == 'n':
            return

    logger.configure(dir=args.log_dir, format_strs=['stdout', 'log', 'csv'])
    logger.log(args)
    json.dump(vars(args), open(os.path.join(args.log_dir, 'params.json'), 'w'))

    torch.set_num_threads(2)

    start = time.time()
    policy_update_time, policy_forward_time = 0, 0
    step_time_env, step_time_total, step_time_rewarder = 0, 0, 0
    visualize_time = 0
    rewarder_fit_time = 0

    envs = ContextualEnvInterface(args)
    if args.look:
        looker = Looker(args.log_dir)

    actor_critic, agent = initialize_policy(envs)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.obs_shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)
    rollouts.to(args.device)

    def copy_obs_into_beginning_of_storage(obs):
        rollouts.obs[0].copy_(obs)

    for j in range(args.num_updates):

        # have to reset here to use updated rewarder to sample tasks
        obs = envs.reset()
        copy_obs_into_beginning_of_storage(obs)

        if args.use_linear_lr_decay:
            update_linear_schedule(agent.optimizer, j, args.num_updates,
                                   args.lr)

        if args.algo == 'ppo' and args.use_linear_clip_decay:
            agent.clip_param = args.clip_param * (1 -
                                                  j / float(args.num_updates))

        log_marginal = 0
        lambda_log_s_given_z = 0

        for step in range(args.num_steps):
            # Sample actions
            policy_forward_start = time.time()
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])
            policy_forward_time += time.time() - policy_forward_start

            # Observe reward and next obs
            step_total_start = time.time()
            obs, reward, done, info = envs.step(action)
            step_time_total += time.time() - step_total_start
            step_time_env += info['step_time_env']
            step_time_rewarder += info['reward_time']
            if args.rewarder == 'unsupervised' and args.clusterer == 'vae':
                log_marginal += info['log_marginal'].sum().item()
                lambda_log_s_given_z += info['lambda_log_s_given_z'].sum(
                ).item()

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks)

        assert all(done)

        # policy update
        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)

        policy_update_start = time.time()
        if args.rewarder != 'supervised' and envs.rewarder.fit_counter == 0:
            value_loss, action_loss, dist_entropy = 0, 0, 0
        else:
            value_loss, action_loss, dist_entropy = agent.update(rollouts)
        policy_update_time += time.time() - policy_update_start
        rollouts.after_update()

        # metrics
        trajectories = envs.trajectories_current_update
        state_entropy = calculate_state_entropy(args, trajectories)

        return_avg = rollouts.rewards.sum() / args.trials_per_update
        reward_avg = return_avg / (args.trial_length * args.episode_length)
        log_marginal_avg = log_marginal / args.trials_per_update / (
            args.trial_length * args.episode_length)
        lambda_log_s_given_z_avg = lambda_log_s_given_z / args.trials_per_update / (
            args.trial_length * args.episode_length)

        num_steps = (j + 1) * args.num_steps * args.num_processes
        num_episodes = num_steps // args.episode_length
        num_trials = num_episodes // args.trial_length

        logger.logkv('state_entropy', state_entropy)
        logger.logkv('value_loss', value_loss)
        logger.logkv('action_loss', action_loss)
        logger.logkv('dist_entropy', dist_entropy)
        logger.logkv('return_avg', return_avg.item())
        logger.logkv('reward_avg', reward_avg.item())
        logger.logkv('steps', num_steps)
        logger.logkv('episodes', num_episodes)
        logger.logkv('trials', num_trials)
        logger.logkv('policy_updates', (j + 1))
        logger.logkv('time', time.time() - start)
        logger.logkv('policy_forward_time', policy_forward_time)
        logger.logkv('policy_update_time', policy_update_time)
        logger.logkv('step_time_rewarder', step_time_rewarder)
        logger.logkv('step_time_env', step_time_env)
        logger.logkv('step_time_total', step_time_total)
        logger.logkv('visualize_time', visualize_time)
        logger.logkv('rewarder_fit_time', rewarder_fit_time)
        if args.rewarder == 'unsupervised' and args.clusterer == 'vae':
            logger.logkv('log_marginal_avg', log_marginal_avg)
            logger.logkv('lambda_log_s_given_z_avg', lambda_log_s_given_z_avg)
        logger.dumpkvs()

        if (j % args.save_period == 0
                or j == args.num_updates - 1) and args.log_dir != '':
            save_model(args, actor_critic, envs, iteration=j)

        if j % args.rewarder_fit_period == 0:
            rewarder_fit_start = time.time()
            envs.fit_rewarder()
            rewarder_fit_time += time.time() - rewarder_fit_start

        if (j % args.vis_period == 0
                or j == args.num_updates - 1) and args.log_dir != '':
            visualize_start = time.time()
            if args.look:
                looker.look(iteration=j)
            if args.plot:
                p = Popen('python visualize.py --log-dir {}'.format(
                    args.log_dir),
                          shell=True)
                processes.append(p)
            visualize_time += time.time() - visualize_start
Example #11
def main():
    args = get_args()

    torch.manual_seed(args.seed)

    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    actor_critic = Policy(STATE_DIM, ACTION_DIM, USER_DIM)
    actor_critic.to(device)

    agent = algo.PPO(actor_critic,
                     args.clip_param,
                     args.ppo_epoch,
                     args.num_mini_batch,
                     args.value_loss_coef,
                     args.entropy_coef,
                     lr=args.lr,
                     eps=args.eps,
                     max_grad_norm=args.max_grad_norm)
    if args.cgail:
        discr = cgail.Discriminator(STATE_DIM,
                                    ACTION_DIM,
                                    USER_DIM,
                                    device,
                                    lr=args.D_lr)

    train_file_name = os.path.join(args.experts_dir, "expert_traj.pkl")
    test_file_name = os.path.join(args.experts_dir, "test_traj.pkl")
    ground_file_name = os.path.join(args.experts_dir, "exp_loc.pkl")

    expert_st, expert_ur, expert_ac = pickle.load(open(train_file_name, 'rb'))
    train_load = data_utils.TensorDataset(
        torch.from_numpy(np.asarray(expert_st)),
        torch.from_numpy(np.asarray(expert_ur)),
        torch.from_numpy(np.asarray(expert_ac)))
    gail_train_loader = torch.utils.data.DataLoader(
        train_load, batch_size=args.gail_batch_size, shuffle=True)

    test_st, test_ur, test_ac = pickle.load(open(test_file_name, 'rb'))
    test_load = data_utils.TensorDataset(torch.from_numpy(np.asarray(test_st)),
                                         torch.from_numpy(np.asarray(test_ur)),
                                         torch.from_numpy(np.asarray(test_ac)))
    test_loader = torch.utils.data.DataLoader(test_load,
                                              batch_size=args.gail_batch_size,
                                              shuffle=True)
    exp_loc = pickle.load(open(ground_file_name, 'rb'))

    envs = make_vec_envs(expert_st, expert_ur, args.seed, args.num_processes,
                         args.gamma, device)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              STATE_DIM * 5, USER_DIM, ACTION_DIM)

    obs, user = envs.reset()
    rollouts.obs[0].copy_(obs[0])
    rollouts.user[0].copy_(user[0])
    rollouts.to(device)

    result_log = []

    start = time.time()
    num_updates = int(
        args.num_env_steps) // args.num_steps // args.num_processes
    for j in range(num_updates):

        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            utils.update_linear_schedule(agent.optimizer, j, num_updates,
                                         args.lr)

        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob = actor_critic.act(
                    rollouts.obs[step], rollouts.user[step])

            # Observe reward and next obs
            if action.item() != 9:
                obs = decide_next_state(action, rollouts.obs[step][0], 1)
                if obs is not None:
                    rollouts.insert(obs, rollouts.user[step], action,
                                    action_log_prob, value)

                with torch.no_grad():
                    next_value = actor_critic.get_value(
                        rollouts.obs[-1], rollouts.user[-1]).detach()
        gail_epoch = args.gail_epoch
        if j < 10:
            gail_epoch = 100  # Warm up
        for _ in range(gail_epoch):
            discr.update(gail_train_loader, rollouts)

        for step in range(args.num_steps):
            if args.cgail:
                rollouts.rewards[step] = discr.predict_reward(
                    rollouts.obs[step], rollouts.user[step],
                    rollouts.actions[step], args.gamma)
            else:
                rollouts.rewards[step] = discr.predict_reward(
                    rollouts.obs[step], rollouts.actions[step], args.gamma)

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.gae_lambda)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        # save for every interval-th episode or for the last epoch
        if (j % args.save_interval == 0
                or j == num_updates - 1) and args.save_dir != "":
            save_path = os.path.join(args.save_dir, str(args.lr),
                                     str(args.gail_batch_size),
                                     "entropy_" + str(args.entropy_coef),
                                     "D_lr" + str(args.D_lr))
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            torch.save(actor_critic,
                       os.path.join(save_path, "ac_{}.pt".format(j)))
            torch.save(discr, os.path.join(save_path, "D_{}.pt".format(j)))

        if j % args.log_interval == 0:
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            end = time.time()
            print("Updates {}, num timesteps {}, FPS {}".format(
                j, total_num_steps, int(total_num_steps / (end - start))))

            out_loc = {}
            for i, data in enumerate(test_loader, 0):
                inputs, user, labels = data
                inputs = inputs.float()
                user = user.float()
                labels = labels.long()
                output = actor_critic.act(inputs, user)[1].tolist()

                for i in range(inputs.size(0)):
                    x = int(inputs[i][0].item())
                    y = int(inputs[i][1].item())

                    if (x, y) not in out_loc:
                        out_loc[(x, y)] = np.zeros(10)
                    out_loc[(x, y)][output[i]] += 1
            target = []
            ground = []
            for key in out_loc:
                o1 = out_loc[key].copy()
                o1 /= sum(o1)
                if key in exp_loc:
                    o2 = np.zeros(10)
                    for b, w in exp_loc[key].items():
                        o2[b] += w
                    o2 /= sum(o2)
                    target.append(o1)
                    ground.append(o2)
            k, kls = cross_entropy(target, ground)
            print(k)
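
`cross_entropy` is not defined in this example. A rough sketch under the assumption that it averages the KL divergence between each predicted action distribution and the matching expert distribution (the metric's exact definition in the source may differ):

import numpy as np

def cross_entropy(target, ground, eps=1e-12):
    # Hypothetical metric: per-location KL(expert || policy) and its mean.
    kls = []
    for p, q in zip(target, ground):
        p = np.clip(p, eps, None)
        q = np.clip(q, eps, None)
        kls.append(float(np.sum(q * np.log(q / p))))
    return float(np.mean(kls)), kls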
Example #12
def MOPG_worker(args, task_id, task, device, iteration, num_updates, start_time, results_queue, done_event):
    scalarization = task.scalarization
    env_params, actor_critic, agent = task.sample.env_params, task.sample.actor_critic, task.sample.agent
    
    weights_str = (args.obj_num * '_{:.3f}').format(*task.scalarization.weights)

    # make envs
    envs = make_vec_envs(env_name=args.env_name, seed=args.seed, num_processes=args.num_processes, \
                        gamma=args.gamma, log_dir=None, device=device, allow_early_resets=False, \
                        obj_rms=args.obj_rms, ob_rms = args.ob_rms)
    if env_params['ob_rms'] is not None:
        envs.venv.ob_rms = deepcopy(env_params['ob_rms'])
    if env_params['ret_rms'] is not None:
        envs.venv.ret_rms = deepcopy(env_params['ret_rms'])
    if env_params['obj_rms'] is not None:
        envs.venv.obj_rms = deepcopy(env_params['obj_rms'])

    # build rollouts data structure
    rollouts = RolloutStorage(num_steps = args.num_steps, num_processes = args.num_processes,
                              obs_shape = envs.observation_space.shape, action_space = envs.action_space,
                              recurrent_hidden_state_size = actor_critic.recurrent_hidden_state_size, obj_num=args.obj_num)
    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)
    episode_lens = deque(maxlen=10)
    episode_objs = deque(maxlen=10)   # for each cost component we care about
    episode_obj = np.array([None] * args.num_processes)

    total_num_updates = int(args.num_env_steps) // args.num_steps // args.num_processes

    offspring_batch = []

    start_iter, final_iter = iteration, min(iteration + num_updates, total_num_updates)
    for j in range(start_iter, final_iter):
        torch.manual_seed(j)
        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            utils.update_linear_schedule( \
                agent.optimizer, j * args.lr_decay_ratio, \
                total_num_updates, args.lr)
        
        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])
            
            obs, _, done, infos = envs.step(action)
            obj_tensor = torch.zeros([args.num_processes, args.obj_num])

            for idx, info in enumerate(infos):
                obj_tensor[idx] = torch.from_numpy(info['obj'])
                episode_obj[idx] = info['obj_raw'] if episode_obj[idx] is None else episode_obj[idx] + info['obj_raw']
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])
                    episode_lens.append(info['episode']['l'])
                    if episode_obj[idx] is not None:
                        episode_objs.append(episode_obj[idx])
                        episode_obj[idx] = None

            # If done then clean the history of observations.
            masks = torch.FloatTensor(
                [[0.0] if done_ else [1.0] for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, obj_tensor, masks, bad_masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.gae_lambda, args.use_proper_time_limits)

        obj_rms_var = envs.obj_rms.var if envs.obj_rms is not None else None

        value_loss, action_loss, dist_entropy = agent.update(rollouts, scalarization, obj_rms_var)

        rollouts.after_update()

        env_params = {}
        env_params['ob_rms'] = deepcopy(envs.ob_rms) if envs.ob_rms is not None else None
        env_params['ret_rms'] = deepcopy(envs.ret_rms) if envs.ret_rms is not None else None
        env_params['obj_rms'] = deepcopy(envs.obj_rms) if envs.obj_rms is not None else None

        # evaluate new sample
        sample = Sample(env_params, deepcopy(actor_critic), deepcopy(agent))
        objs = evaluation(args, sample)
        sample.objs = objs
        offspring_batch.append(sample)

        if args.rl_log_interval > 0 and (j + 1) % args.rl_log_interval == 0 and len(episode_rewards) > 1:
            if task_id == 0:
                total_num_steps = (j + 1) * args.num_processes * args.num_steps
                end = time.time()
                print(
                    "[RL] Updates {}, num timesteps {}, FPS {}, time {:.2f} seconds"
                    .format(j + 1, total_num_steps,
                            int(total_num_steps / (end - start_time)),
                            end - start_time))

        # put results back every update_iter iterations, to avoid the multi-processing crash
        if (j + 1) % args.update_iter == 0 or j == final_iter - 1:
            offspring_batch = np.array(offspring_batch)
            results = {}
            results['task_id'] = task_id
            results['offspring_batch'] = offspring_batch
            if j == final_iter - 1:
                results['done'] = True
            else:
                results['done'] = False
            results_queue.put(results)
            offspring_batch = []

    envs.close()   
    
    done_event.wait()
Example #13
def main():
    tb_path = os.path.join(os.path.expanduser(args.log_dir), "tensorboard_log")
    makedir_if_not_exists(tb_path)
    writer = SummaryWriter(tb_path)

    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")
    # p = multiprocessing.Process(target=_tb_task,args=(tb_path,5013) ,daemon=True)
    # p.start()
    if args.start_tb:
        _tb_task(tb_path, port=5013)
    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.port)
        win = None

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, args.add_timestep, device,
                         False)
    actor_critic = Policy(envs.observation_space.shape,
                          envs.action_space,
                          base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic.to(device)

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)
    start = time.time()

    num_eps = 0  # num training eps
    num_steps = 0  # num training steps

    for j in range(num_updates):

        # list of all values all eps in num updates
        num_steps_basline_info = defaultdict(list)
        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            if args.algo == "acktr":
                # use optimizer's learning rate since it's hard-coded in kfac.py
                update_linear_schedule(agent.optimizer, j, num_updates,
                                       agent.optimizer.lr)
            else:
                update_linear_schedule(agent.optimizer, j, num_updates,
                                       args.lr)

        if args.algo == 'ppo' and args.use_linear_clip_decay:
            agent.clip_param = args.clip_param * (1 - j / float(num_updates))

        env_basline_info = defaultdict(list)
        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)
            for info in infos:
                # episode is done
                # add additional baseline rw info in infos:
                if 'basline_rw_mse' in info:
                    env_basline_info['rw_mse'].append(info['basline_rw_mse'])
                    env_basline_info['rw_rec'].append(info['basline_rw_rec'])
                if 'basline_rw_tcn' in info:
                    env_basline_info['rw_tcn'].append(info['basline_rw_tcn'])

                if 'episode' in info.keys():
                    # end of episode
                    episode_rewards.append(info['episode']['r'])

                    num_steps_basline_info['len_episode'].append(
                        info['episode']['l'])
                    # distance of the pushed block
                    num_steps_basline_info['push_distance'].append(
                        info['basline_rw_push_dist'])
                    # take mean over eps
                    for k, step_vals in env_basline_info.items():
                        num_steps_basline_info[k].append(np.sum(step_vals))
                    # add baseline infos
                    num_eps += 1
                    env_basline_info = defaultdict(list)

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        # save for every interval-th episode or for the last epoch
        if (j % args.save_interval == 0
                or j == num_updates - 1) and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            save_model = [
                save_model,
                getattr(get_vec_normalize(envs), 'ob_rms', None)
            ]

            torch.save(save_model,
                       os.path.join(save_path, args.env_name + ".pt"))

        total_num_steps = (j + 1) * args.num_processes * args.num_steps
        # write baseline infos for tcn
        writer_step = total_num_steps
        for k, vals_step_eps in num_steps_basline_info.items():
            writer.add_scalar('basline/' + k, np.mean(vals_step_eps),
                              writer_step)
        writer.add_scalar('basline/episodes', num_eps, writer_step)
        len_eps = np.mean(num_steps_basline_info['len_episode'])

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            end = time.time()
            log.info(
                "Updates {}, num timesteps {}, FPS {}  Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, len eps {}"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), int(len_eps), dist_entropy,
                        value_loss, action_loss))

        if j == num_updates or (args.eval_interval is not None
                                and len(episode_rewards) > 1
                                and j % args.eval_interval == 0):

            vid_log_dir = os.getenv('TCN_ENV_VID_LOG_FOLDER',
                                    '/tmp/env_tcn/train_vid')
            vid_log_inter = os.getenv('TCN_ENV_VID_LOG_INTERVAL',
                                      train_vid_log_iter)
            os.environ[
                'TCN_ENV_VID_LOG_FOLDER'] = "eval_vid"  # os.path.join(vid_log_dir,"../eval_vid/","interval_"+str(j))
            os.environ['TCN_ENV_VID_LOG_INTERVAL'] = '1'
            os.environ['TCN_ENV_EVAL_EPISODE'] = '1'
            with redirect_stdout(open(os.devnull, "w")):  # no stdout
                with suppress_logging():
                    # eval envs
                    eval_envs = make_vec_envs(args.env_name,
                                              args.seed + args.num_processes,
                                              1, args.gamma, eval_log_dir,
                                              args.add_timestep, device, True)

                    vec_norm = get_vec_normalize(eval_envs)
                    if vec_norm is not None:
                        vec_norm.eval()
                        vec_norm.ob_rms = get_vec_normalize(envs).ob_rms

                    eval_episode_rewards = []

                    obs = eval_envs.reset()
                    eval_recurrent_hidden_states = torch.zeros(
                        args.num_processes,
                        actor_critic.recurrent_hidden_state_size,
                        device=device)
                    eval_masks = torch.zeros(args.num_processes,
                                             1,
                                             device=device)

                    while len(eval_episode_rewards) < 1:
                        with torch.no_grad():
                            _, action, _, eval_recurrent_hidden_states = actor_critic.act(
                                obs,
                                eval_recurrent_hidden_states,
                                eval_masks,
                                deterministic=True)

                        # Observe reward and next obs
                        obs, reward, done, infos = eval_envs.step(action)

                        eval_masks = torch.tensor([[0.0] if done_ else [1.0]
                                                   for done_ in done],
                                                  dtype=torch.float32,
                                                  device=device)

                        for info in infos:
                            if 'episode' in info.keys():
                                eval_episode_rewards.append(
                                    info['episode']['r'])

                    eval_envs.close()
            os.environ['TCN_ENV_VID_LOG_FOLDER'] = vid_log_dir
            os.environ['TCN_ENV_EVAL_EPISODE'] = '0'
            os.environ['TCN_ENV_VID_LOG_INTERVAL'] = vid_log_inter

            writer.add_scalar('eval/rw', np.mean(eval_episode_rewards), j)
            log.info(
                " Evaluation using {} episodes: mean reward {:.5f}\n".format(
                    len(eval_episode_rewards), np.mean(eval_episode_rewards)))

        if j % args.vis_interval == 0:
            try:
                td_plot(writer, args.log_dir)
                # Sometimes monitor doesn't properly flush the outputs
                # win = visdom_plot(viz, win, args.log_dir, args.env_name,
                # args.algo, args.num_env_steps)
            except IOError:
                print("plt error")
                pass
Example #14
def main():
    if not os.path.exists("./plots"):
        os.makedirs("./plots")

    gbench = read_gbench('./data/gbench.txt')

    args = my_get_args()
    print(args)

    config = dict(sigma=args.sim_sigma,
                  momentum=args.sim_momentum,
                  pump_bins=args.sim_bins,
                  lag=1000 // args.num_steps,
                  rshift=args.sim_rshift,
                  pump_scale=args.sim_scale,
                  reward_kind=args.sim_reward,
                  continuous=args.sim_continuous,
                  span=args.sim_span,
                  percentile=args.sim_percentile,
                  last_runs=args.sim_perc_len,
                  add_linear=not args.sim_no_linear,
                  start_pump=args.sim_start,
                  static_features=not args.sim_no_static,
                  extra_features=not args.sim_no_extra,
                  curiosity_num=args.curiosity)

    base_kwargs = {
        'hidden_size': args.hidden_size,
        'film_size': 800 * (not args.sim_no_static)
    }
    if args.relu:
        base_kwargs['activation'] = 'relu'
    base = FILMBase  #FILMBase

    if args.gset > 0:
        test_graphs = [args.gset]
    else:
        test_graphs = [1, 2, 3, 4, 5]

    #---------------------------------------------------------

    assert args.algo in ['a2c', 'ppo', 'acktr']
    if args.recurrent_policy:
        assert args.algo in ['a2c', 'ppo'], \
            'Recurrent policy is not implemented for ACKTR'

    num_updates = int(
        args.num_env_steps) // args.num_steps // args.num_processes
    print('Num updates: ', num_updates)

    if args.dry_run:
        return

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    if args.cuda and torch.cuda.is_available() and args.cuda_deterministic:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    logdata = defaultdict(list)

    if args.gset > 0:
        envs = []
        for g in test_graphs:
            g_ = read_gset('./data/G{}.txt'.format(g), negate=True)
            s = SIMCIM(g_,
                       device=device,
                       batch_size=args.num_processes,
                       **config)
            s.runpump()
            envs.append(s)
        envs = SIMCollection(envs, [gbench[g] for g in test_graphs])
        logdata['bls_bench'] = [gbench[g] for g in test_graphs]
    else:
        envs = SIMGeneratorRandom(800,
                                  0.06,
                                  args.num_processes,
                                  config,
                                  keep=args.sim_keep,
                                  n_sims=args.sim_nsim,
                                  device=device)

    if args.snapshot is None:
        actor_critic = Policy(envs.observation_space.shape,
                              envs.action_space,
                              base=base,
                              base_kwargs=base_kwargs)
    else:
        actor_critic, _ = torch.load(
            os.path.join(args.save_dir, args.algo, args.snapshot + ".pt"))

    actor_critic.to(device)
    print(actor_critic)

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()

    print(rollouts.obs.shape, obs.shape)

    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    eval_envs = []
    for g in test_graphs:
        g_ = read_gset('./data/G{}.txt'.format(g), negate=True)
        s = SIMCIM(g_,
                   device=device,
                   batch_size=args.num_val_processes,
                   **config)
        s.runpump()
        eval_envs.append(s)
    eval_envs = SIMCollection(eval_envs, [gbench[g] for g in test_graphs])
    ref_cuts = [s.lastcuts for s in eval_envs.envs]
    logdata['ref_cuts'] = [e.tolist() for e in ref_cuts]

    stoch_cuts = None

    start = time.time()
    for j in range(num_updates):
        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            if args.algo == "acktr":
                # use optimizer's learning rate since it's hard-coded in kfac.py
                update_linear_schedule(agent.optimizer, j, num_updates,
                                       agent.optimizer.lr)
            else:
                update_linear_schedule(agent.optimizer, j, num_updates,
                                       args.lr)

        if args.algo == 'ppo' and args.use_linear_clip_decay:
            agent.clip_param = args.clip_param * (1 - j / float(num_updates))

        # ROLLOUT DATA
        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)

            if 'episode' in infos[0].keys():
                rw = np.mean([e['episode']['r'] for e in infos])
                logdata['episode_rewards'].append(rw.item())
                if args.gset > 0:
                    cuts = [e.lastcuts for e in envs.envs]
                    logdata['train_median'].append(
                        [np.median(e).item() for e in cuts])
                    logdata['train_max'].append(
                        [np.max(e).item() for e in cuts])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks)

        # UPDATE AGENT
        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)

        value_loss, action_loss, _ = agent.update(rollouts)
        logdata['alosses'].append(action_loss)
        logdata['vlosses'].append(value_loss)

        logdata['train_percentiles'].append(envs.perc.tolist())

        rollouts.after_update()

        # CHECKPOINTS
        # save for every interval-th episode or for the last epoch
        if (j % args.save_interval == 0
                or j == num_updates - 1) and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            save_model = [
                save_model,
                getattr(get_vec_normalize(envs), 'ob_rms', None)
            ]

            torch.save(
                save_model,
                os.path.join(save_path, args.env_name + '-' + str(j) + ".pt"))

        total_num_steps = (j + 1) * args.num_processes * args.num_steps

        # LOGGING
        if j % args.log_interval == 0 and len(logdata['episode_rewards']) > 1:
            end = time.time()
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: \
                mean/median reward {:.3f}/{:.3f}, min/max reward {:.3f}/{:.3f}\n"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(logdata['episode_rewards']),
                        np.mean(logdata['episode_rewards'][-10:]),
                        np.median(logdata['episode_rewards'][-10:]),
                        np.min(logdata['episode_rewards'][-10:]),
                        np.max(logdata['episode_rewards'][-10:])))

        # EVALUATION
        if (args.eval_interval is not None and j % args.eval_interval == 0):
            logdata['spumps'] = []

            vec_norm = get_vec_normalize(eval_envs)
            if vec_norm is not None:
                vec_norm.eval()
                vec_norm.ob_rms = get_vec_normalize(envs).ob_rms

            obs = eval_envs.reset()
            eval_recurrent_hidden_states = torch.zeros(
                args.num_val_processes,
                actor_critic.recurrent_hidden_state_size,
                device=device)
            eval_masks = torch.zeros(args.num_val_processes, 1, device=device)

            eval_done = False

            while not eval_done:
                p = eval_envs.envs[0].old_p
                logdata['spumps'].append(p[:10].cpu().numpy().tolist())

                with torch.no_grad():
                    _, action, _, eval_recurrent_hidden_states = actor_critic.act(
                        obs,
                        eval_recurrent_hidden_states,
                        eval_masks,
                        deterministic=False)

                # Observe reward and next obs
                obs, reward, done, infos = eval_envs.step(action)

                eval_done = np.all(done)

                eval_masks = torch.tensor([[0.0] if done_ else [1.0]
                                           for done_ in done],
                                          dtype=torch.float32,
                                          device=device)

            stoch_cuts = [e.lastcuts for e in eval_envs.envs]
            logdata['stoch_cuts'] = [e.tolist() for e in stoch_cuts]
            logdata['eval_median'].append(
                [np.median(e).item() for e in stoch_cuts])
            logdata['eval_max'].append([np.max(e).item() for e in stoch_cuts])

            logdata['test_percentiles'].append(eval_envs.perc.tolist())

            rw = np.mean([e['episode']['r'] for e in infos])
            logdata['eval_episode_rewards'].append(rw.item())

            print(" Evaluation using {} episodes: mean reward {:.5f}\n".format(
                len(logdata['eval_episode_rewards']),
                np.mean(logdata['eval_episode_rewards'])))

        if j % args.log_interval == 0:
            fn = os.path.join(save_path, args.env_name + ".res")
            with open(fn, 'w') as f:
                json.dump(logdata, f, sort_keys=True, indent=2)

        # VISUALIZATION
        if j % args.vis_interval == 0:
            #if False:
            plt.figure(figsize=(15, 10))

            plt.subplot(231)
            plt.title('Rewards')
            plt.xlabel('SIM runs')
            plt.plot(logdata['episode_rewards'], c='r', label='mean train')
            plt.plot(np.linspace(0, len(logdata['episode_rewards']),
                                 len(logdata['eval_episode_rewards'])),
                     logdata['eval_episode_rewards'],
                     'b',
                     label='mean eval')
            plt.legend()

            plt.subplot(232)
            plt.plot(logdata['alosses'])
            plt.title('Policy loss')

            plt.subplot(233)
            plt.plot(logdata['vlosses'])
            plt.title('Value loss')

            plt.subplot(234)
            plt.title('Pumps')
            plt.xlabel('SIM iterations / 10')
            plt.plot(np.array(logdata['spumps']))
            plt.ylim(-0.05, 1.1)

            plt.subplot(235)
            plt.plot(logdata['train_percentiles'])
            plt.title('Train average percentile')

            plt.subplot(236)
            plt.title('Test percentiles')
            plt.plot(logdata['test_percentiles'])
            plt.legend([str(e) for e in test_graphs])

            plt.tight_layout()
            plt.savefig('./plots/agent_' + args.env_name + '.pdf')
            plt.clf()
            plt.close()
            gc.collect()
            #plt.show()

            if stoch_cuts is not None:
                fig, axs = plt.subplots(len(ref_cuts),
                                        1,
                                        sharex=False,
                                        tight_layout=True)
                if len(ref_cuts) == 1:
                    axs = [axs]
                for gi in range(len(ref_cuts)):
                    mn = min(ref_cuts[gi])
                    axs[gi].hist(ref_cuts[gi], bins=100, alpha=0.7)
                    dc = stoch_cuts[gi][stoch_cuts[gi] >= mn]
                    if dc.size > 0:
                        axs[gi].hist(dc, bins=100, alpha=0.7)
                plt.savefig('./plots/cuts_' + args.env_name + '.pdf')
                plt.clf()
                plt.close()
                gc.collect()
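Example #14 anneals the learning rate with update_linear_schedule and, for PPO, shrinks clip_param toward zero over num_updates. A minimal sketch of such a linear scheduler is shown below, assuming a standard torch.optim optimizer; the project's own utility may differ in detail.

# Sketch of a linear learning-rate schedule, assuming a torch.optim optimizer
# whose param_groups expose 'lr'. Illustrative only.
def update_linear_schedule(optimizer, epoch, total_num_epochs, initial_lr):
    lr = initial_lr - initial_lr * (epoch / float(total_num_epochs))
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr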
Example #15
0
def main():
    args = get_args()

    # Record trajectories
    if args.record_trajectories:
        record_trajectories()
        return

    print(args)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    if args.cuda and torch.cuda.is_available() and args.cuda_deterministic:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    # Append the model name
    log_dir = os.path.expanduser(args.log_dir)
    log_dir = os.path.join(log_dir, args.model_name, str(args.seed))

    eval_log_dir = log_dir + "_eval"
    utils.cleanup_log_dir(log_dir)
    utils.cleanup_log_dir(eval_log_dir)

    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, log_dir, device, False)

    # Take activation for carracing
    print("Loaded env...")
    activation = None
    if args.env_name == 'CarRacing-v0' and args.use_activation:
        activation = torch.tanh
    print(activation)

    actor_critic = Policy(envs.observation_space.shape,
                          envs.action_space,
                          base_kwargs={
                              'recurrent': args.recurrent_policy,
                              'env': args.env_name
                          },
                          activation=activation)
    actor_critic.to(device)
    # Load from previous model
    if args.load_model_name:
        state = torch.load(
            os.path.join(args.save_dir, args.load_model_name,
                         args.load_model_name + '_{}.pt'.format(args.seed)))[0]
        try:
            actor_critic.load_state_dict(state)
        except Exception:
            # Some checkpoints store the whole Policy object rather than a state_dict
            actor_critic = state

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)

    if args.gail:
        if len(envs.observation_space.shape) == 1:
            discr = gail.Discriminator(
                envs.observation_space.shape[0] + envs.action_space.shape[0],
                100, device)
            file_name = os.path.join(
                args.gail_experts_dir,
                "trajs_{}.pt".format(args.env_name.split('-')[0].lower()))

            expert_dataset = gail.ExpertDataset(file_name,
                                                num_trajectories=3,
                                                subsample_frequency=1)
            expert_dataset_test = gail.ExpertDataset(file_name,
                                                     num_trajectories=1,
                                                     start=3,
                                                     subsample_frequency=1)
            drop_last = len(expert_dataset) > args.gail_batch_size
            gail_train_loader = torch.utils.data.DataLoader(
                dataset=expert_dataset,
                batch_size=args.gail_batch_size,
                shuffle=True,
                drop_last=drop_last)
            gail_test_loader = torch.utils.data.DataLoader(
                dataset=expert_dataset_test,
                batch_size=args.gail_batch_size,
                shuffle=False,
                drop_last=False)
            print(len(expert_dataset), len(expert_dataset_test))
        else:
            # observation has a 3-dim shape => it's an image
            assert len(envs.observation_space.shape) == 3
            discr = gail.CNNDiscriminator(envs.observation_space.shape,
                                          envs.action_space, 100, device)
            file_name = os.path.join(args.gail_experts_dir, 'expert_data.pkl')

            expert_dataset = gail.ExpertImageDataset(file_name, train=True)
            test_dataset = gail.ExpertImageDataset(file_name, train=False)
            gail_train_loader = torch.utils.data.DataLoader(
                dataset=expert_dataset,
                batch_size=args.gail_batch_size,
                shuffle=True,
                drop_last=len(expert_dataset) > args.gail_batch_size,
            )
            gail_test_loader = torch.utils.data.DataLoader(
                dataset=test_dataset,
                batch_size=args.gail_batch_size,
                shuffle=False,
                drop_last=len(test_dataset) > args.gail_batch_size,
            )
            print('Dataloader size', len(gail_train_loader))

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)
    start = time.time()
    #num_updates = int(
    #args.num_env_steps) // args.num_steps // args.num_processes
    num_updates = args.num_steps
    print(num_updates)

    # count the number of times validation loss increases
    val_loss_increase = 0
    prev_val_action = np.inf
    best_val_loss = np.inf

    for j in range(num_updates):
        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            utils.update_linear_schedule(
                agent.optimizer, j, num_updates,
                agent.optimizer.lr if args.algo == "acktr" else args.lr)

        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)
            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks, bad_masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        if args.gail:
            if j >= 10:
                try:
                    envs.venv.eval()
                except Exception:
                    pass

            gail_epoch = args.gail_epoch
            #if j < 10:
            #gail_epoch = 100  # Warm up
            for _ in range(gail_epoch):
                #discr.update(gail_train_loader, rollouts,
                #None)
                pass

            for step in range(args.num_steps):
                rollouts.rewards[step] = discr.predict_reward(
                    rollouts.obs[step], rollouts.actions[step], args.gamma,
                    rollouts.masks[step])

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.gae_lambda, args.use_proper_time_limits)

        #value_loss, action_loss, dist_entropy = agent.update(rollouts)
        value_loss = 0
        dist_entropy = 0
        for data in gail_train_loader:
            expert_states, expert_actions = data
            expert_states = Variable(expert_states).to(device)
            expert_actions = Variable(expert_actions).to(device)
            loss = agent.update_bc(expert_states, expert_actions)
            action_loss = loss.data.cpu().numpy()
        print("Epoch: {}, Loss: {}".format(j, action_loss))

        with torch.no_grad():
            cnt = 0
            val_action_loss = 0
            for data in gail_test_loader:
                expert_states, expert_actions = data
                expert_states = Variable(expert_states).to(device)
                expert_actions = Variable(expert_actions).to(device)
                loss = agent.get_action_loss(expert_states, expert_actions)
                val_action_loss += loss.data.cpu().numpy()
                cnt += 1
            val_action_loss /= cnt
            print("Val Loss: {}".format(val_action_loss))

        #rollouts.after_update()

        # save for every interval-th episode or for the last epoch
        if (j % args.save_interval == 0
                or j == num_updates - 1) and args.save_dir != "":

            if val_action_loss < best_val_loss:
                val_loss_increase = 0
                best_val_loss = val_action_loss
                save_path = os.path.join(args.save_dir, args.model_name)
                try:
                    os.makedirs(save_path)
                except OSError:
                    pass

                torch.save([
                    actor_critic.state_dict(),
                    getattr(utils.get_vec_normalize(envs), 'ob_rms', None),
                    getattr(utils.get_vec_normalize(envs), 'ret_rms', None)
                ],
                           os.path.join(
                               save_path,
                               args.model_name + "_{}.pt".format(args.seed)))
            elif val_action_loss > prev_val_action:
                val_loss_increase += 1
                if val_loss_increase == 10:
                    print("Val loss increasing too much, breaking here...")
                    break
            elif val_action_loss < prev_val_action:
                val_loss_increase = 0

            # Update prev val action
            prev_val_action = val_action_loss

        # log interval
        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            end = time.time()
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), dist_entropy, value_loss,
                        action_loss))

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            ob_rms = utils.get_vec_normalize(envs).ob_rms
            evaluate(actor_critic, ob_rms, args.env_name, args.seed,
                     args.num_processes, eval_log_dir, device)
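Example #15 swaps the PPO update for a behavior-cloning step over expert batches (agent.update_bc) plus a held-out validation loss (agent.get_action_loss). Below is a hedged sketch of what such a BC step could look like; it assumes an evaluate_actions method in the pytorch-a2c-ppo-acktr style and is not the repository's actual implementation.

# Hypothetical behavior-cloning step: maximize the policy's log-likelihood of
# expert actions. Assumes actor_critic.evaluate_actions(obs, rnn_hxs, masks,
# actions) returns (value, action_log_probs, dist_entropy, rnn_hxs).
import torch


def bc_update(actor_critic, optimizer, expert_states, expert_actions,
              max_grad_norm=0.5):
    hxs = torch.zeros(expert_states.size(0),
                      actor_critic.recurrent_hidden_state_size,
                      device=expert_states.device)
    masks = torch.ones(expert_states.size(0), 1, device=expert_states.device)
    _, action_log_probs, _, _ = actor_critic.evaluate_actions(
        expert_states, hxs, masks, expert_actions)
    loss = -action_log_probs.mean()
    optimizer.zero_grad()
    loss.backward()
    torch.nn.utils.clip_grad_norm_(actor_critic.parameters(), max_grad_norm)
    optimizer.step()
    return loss.item()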
Example #16
0
def onpolicy_main():
    print("onpolicy main")

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    if args.cuda and torch.cuda.is_available() and args.cuda_deterministic:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True
    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    summary_name = args.log_dir + '{0}_{1}'
    writer = SummaryWriter(summary_name.format(args.env_name, args.save_name))

    # Make vector env
    envs = make_vec_envs(
        args.env_name,
        args.seed,
        args.num_processes,
        args.gamma,
        args.log_dir,
        device,
        False,
        env_kwargs=env_kwargs,
    )

    # ugly way to access the environment attributes
    if args.env_name.find('doorenv') > -1:
        if args.num_processes > 1:
            visionnet_input = envs.venv.venv.visionnet_input
            nn = envs.venv.venv.nn
            env_name = envs.venv.venv.xml_path
        else:
            visionnet_input = envs.venv.venv.envs[
                0].env.env.env.visionnet_input
            nn = envs.venv.venv.envs[0].env.env.env.nn
            env_name = envs.venv.venv.envs[0].env.env.env.xml_path
        dummy_obs = np.zeros(nn * 2 + 3)
    else:
        dummy_obs = envs.observation_space
        visionnet_input = None
        nn = None

    if pretrained_policy_load:
        print("loading", pretrained_policy_load)
        actor_critic, ob_rms = torch.load(pretrained_policy_load)
    else:
        actor_critic = Policy(dummy_obs.shape,
                              envs.action_space,
                              base_kwargs={'recurrent': args.recurrent_policy})

    if visionnet_input:
        visionmodel = load_visionmodel(env_name, args.visionmodel_path,
                                       VisionModelXYZ())
        actor_critic.visionmodel = visionmodel.eval()
    actor_critic.nn = nn
    actor_critic.to(device)

    #disable normalizer
    vec_norm = get_vec_normalize(envs)
    vec_norm.eval()

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              dummy_obs.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    full_obs = envs.reset()
    initial_state = full_obs[:, :envs.action_space.shape[0]]

    if args.env_name.find('doorenv') > -1 and visionnet_input:
        obs = actor_critic.obs2inputs(full_obs, 0)
    else:
        if knob_noisy:
            obs = add_noise(full_obs, 0)
        else:
            obs = full_obs

    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)

    start = time.time()
    num_updates = int(
        args.num_env_steps) // args.num_steps // args.num_processes

    for j in range(num_updates):

        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            utils.update_linear_schedule(agent.optimizer, j, num_updates,
                                         args.lr)

        pos_control = False
        total_switches = 0
        prev_selection = ""
        for step in range(args.num_steps):
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])
                next_action = action

            if pos_control:
                frame_skip = 2
                if step % (512 // frame_skip - 1) == 0:
                    current_state = initial_state
                next_action = current_state + next_action
                for kk in range(frame_skip):
                    full_obs, reward, done, infos = envs.step(next_action)

                current_state = full_obs[:, :envs.action_space.shape[0]]
            else:
                full_obs, reward, done, infos = envs.step(next_action)

            # convert img to obs if door_env and using visionnet
            if args.env_name.find('doorenv') > -1 and visionnet_input:
                obs = actor_critic.obs2inputs(full_obs, j)
            else:
                if knob_noisy:
                    obs = add_noise(full_obs, j)
                else:
                    obs = full_obs

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks, bad_masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.gae_lambda, args.use_proper_time_limits)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)
        rollouts.after_update()

        writer.add_scalar("Value loss", value_loss, j)
        writer.add_scalar("action loss", action_loss, j)
        writer.add_scalar("dist entropy loss", dist_entropy, j)
        writer.add_scalar("Episode rewards", np.mean(episode_rewards), j)

        # save for every interval-th episode or for the last epoch
        if (j % args.save_interval == 0
                or j == num_updates - 1) and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass
            torch.save([
                actor_critic,
                getattr(utils.get_vec_normalize(envs), 'ob_rms', None)
            ],
                       os.path.join(
                           save_path, args.env_name +
                           "_{}.{}.pt".format(args.save_name, j)))

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            end = time.time()
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), dist_entropy, value_loss,
                        action_loss))

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            ob_rms = utils.get_vec_normalize(envs).ob_rms
            evaluate(actor_critic, ob_rms, args.env_name, args.seed,
                     args.num_processes, eval_log_dir, device)

        DR = True  #Domain Randomization
        ################## for multiprocess world change ######################
        if DR:
            print("changing world")

            envs.close_extras()
            envs.close()
            del envs

            envs = make_vec_envs(
                args.env_name,
                args.seed,
                args.num_processes,
                args.gamma,
                args.log_dir,
                device,
                False,
                env_kwargs=env_kwargs,
            )

            full_obs = envs.reset()
            if args.env_name.find('doorenv') > -1 and visionnet_input:
                obs = actor_critic.obs2inputs(full_obs, j)
            else:
                obs = full_obs
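Example #16 tears down and rebuilds the vectorized environments at the end of every update to randomize the door world. A small sketch of factoring that block into a helper follows; it reuses only calls that already appear above (make_vec_envs, close_extras/close, obs2inputs) and is purely illustrative.

# Sketch: rebuild the vec env for domain randomization and return the first
# converted observation, mirroring the block at the end of the update loop.
def randomize_world(envs, actor_critic, args, env_kwargs, device,
                    visionnet_input, update_idx):
    envs.close_extras()
    envs.close()
    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, device, False,
                         env_kwargs=env_kwargs)
    full_obs = envs.reset()
    if args.env_name.find('doorenv') > -1 and visionnet_input:
        obs = actor_critic.obs2inputs(full_obs, update_idx)
    else:
        obs = full_obs
    return envs, obs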
Example #17
0
 def get_one_agent_rollout(self, agent_idx, is_cen=False, is_aug=False):
     st = RolloutStorage(None, None, None, None, None)
     st.obs = self.obs[:, :, agent_idx]
     st.recurrent_hidden_states = self.recurrent_hidden_states
     st.rewards = self.rewards[:, :, agent_idx]
     st.value_preds = self.value_preds[:, :, agent_idx]
     st.returns = self.returns[:, :, agent_idx]
     st.action_log_probs = self.action_log_probs[:, :, agent_idx]
     st.actions = self.actions[:, :, agent_idx]
     st.masks = self.masks
     st.bad_masks = self.bad_masks
     self.num_steps = 0
     if is_cen:
         start = time.time()
         if agent_idx != 0 and agent_idx != self.num_agents - 1:
             obs_layer4_1 = torch.narrow(self.obs_layer4,
                                         dim=2,
                                         start=0,
                                         length=agent_idx)
             obs_layer4_2 = torch.narrow(self.obs_layer4,
                                         dim=2,
                                         start=agent_idx + 1,
                                         length=self.num_agents - 1 -
                                         agent_idx)
             st.other_obs = torch.cat([obs_layer4_1, obs_layer4_2], dim=2)
         elif agent_idx == 0:
             st.other_obs = self.obs_layer4[:, :, 1:]
         else:
             st.other_obs = self.obs_layer4[:, :, :-1]
         #print('other_obs {} s'.format(time.time() - start))
         idx = [i for i in range(self.num_agents) if i != agent_idx]
         other_actions = self.actions[:, :, idx]
         st.other_actions = torch.zeros(
             (*list(other_actions.size()[:-1]), self.num_actions))
         st.other_actions.scatter_(3, other_actions, 1)
     if is_aug:
         st.aug_obs = self.aug_obs[:, :, agent_idx]
     st.aug_size = self.aug_size
     return st
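Example #17 one-hot encodes the other agents' actions with scatter_. The self-contained sketch below reproduces that dimension bookkeeping on dummy shapes (steps x processes x agents x 1 integer index) so the scatter_ call is easier to follow; the sizes are illustrative.

# Sketch of the scatter_-based one-hot encoding used for st.other_actions.
import torch

num_actions = 5
actions = torch.randint(num_actions, (4, 2, 3, 1))       # integer action indices
one_hot = torch.zeros(*actions.shape[:-1], num_actions)  # ... x num_actions
one_hot.scatter_(3, actions, 1)                          # write a 1 at each index along dim 3
assert one_hot.sum(dim=3).eq(1).all()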
Example #18
0
def main(args):
    try:
        os.makedirs(args.log_dir)
    except OSError:
        files = glob.glob(os.path.join(args.log_dir, '*.monitor.csv'))
        for f in files:
            os.remove(f)

    eval_log_dir = args.log_dir + "_eval"

    try:
        os.makedirs(eval_log_dir)
    except OSError:
        files = glob.glob(os.path.join(eval_log_dir, '*.monitor.csv'))
        for f in files:
            os.remove(f)

    assert args.algo in ['a2c', 'ppo', 'acktr']
    if args.recurrent_policy:
        assert args.algo in ['a2c', 'ppo'], \
            'Recurrent policy is not implemented for ACKTR'

    if args.eval_render:
        render_env = make_vec_envs(args.env_name,
                                   args.seed,
                                   1,
                                   None,
                                   None,
                                   args.add_timestep,
                                   device='cpu',
                                   allow_early_resets=False)

    torch.set_num_threads(1)
    num_updates = int(
        args.num_env_steps) // args.num_steps // args.num_processes

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    if args.cuda and torch.cuda.is_available() and args.cuda_deterministic:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    # Uses gpu/cuda by default
    device = torch.device("cuda:0" if args.cuda else "cpu")

    # Only if using visdom
    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.port)
        win = None

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, args.add_timestep, device,
                         False)

    # Set up actor_critic
    actor_critic = Policy(envs.observation_space.shape,
                          envs.action_space,
                          base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic.to(device)

    # Set algorithm with actor critic and use to learn
    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)

    start = time.time()
    for j in range(num_updates):

        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            if args.algo == "acktr":
                # use optimizer's learning rate since it's hard-coded in kfac.py
                update_linear_schedule(agent.optimizer, j, num_updates,
                                       agent.optimizer.lr)
            else:
                update_linear_schedule(agent.optimizer, j, num_updates,
                                       args.lr)

        if args.algo == 'ppo' and args.use_linear_clip_decay:
            agent.clip_param = args.clip_param * (1 - j / float(num_updates))

        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        # save for every interval-th episode or for the last epoch
        if (j % args.save_interval == 0
                or j == num_updates - 1) and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            save_model = [
                save_model,
                getattr(get_vec_normalize(envs), 'ob_rms', None)
            ]

            torch.save(
                save_model,
                os.path.join(
                    save_path, args.env_name + "-AvgRwrd" +
                    str(int(np.mean(episode_rewards))) + ".pt"))
            print("Saving Model")

        total_num_steps = (j + 1) * args.num_processes * args.num_steps

        # Logs every log_interval steps
        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            end = time.time()
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), dist_entropy, value_loss,
                        action_loss))

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            eval_envs = make_vec_envs(args.env_name,
                                      args.seed + args.num_processes,
                                      args.num_processes, args.gamma,
                                      eval_log_dir, args.add_timestep, device,
                                      True)

            vec_norm = get_vec_normalize(eval_envs)
            if vec_norm is not None:
                vec_norm.eval()
                vec_norm.ob_rms = get_vec_normalize(envs).ob_rms

            eval_episode_rewards = []

            obs = eval_envs.reset()
            eval_recurrent_hidden_states = torch.zeros(
                args.num_processes,
                actor_critic.recurrent_hidden_state_size,
                device=device)
            eval_masks = torch.zeros(args.num_processes, 1, device=device)

            while len(eval_episode_rewards) < 10:
                with torch.no_grad():
                    _, action, _, eval_recurrent_hidden_states = actor_critic.act(
                        obs,
                        eval_recurrent_hidden_states,
                        eval_masks,
                        deterministic=True)

                # Observe reward and next obs
                obs, reward, done, infos = eval_envs.step(action)

                eval_masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                                for done_ in done])
                for info in infos:
                    if 'episode' in info.keys():
                        eval_episode_rewards.append(info['episode']['r'])

            if args.eval_render:
                show_model(render_env, actor_critic)

            eval_envs.close()

            print(" Evaluation using {} episodes: mean reward {:.5f}\n".format(
                len(eval_episode_rewards), np.mean(eval_episode_rewards)))

        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.env_name,
                                  args.algo, args.num_env_steps)
            except IOError:
                pass
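Example #18 (like the other training loops here) relies on rollouts.compute_returns(next_value, use_gae, gamma, tau) to build value targets. For reference, here is a minimal sketch of the GAE(lambda) recursion that call performs when use_gae is set, assuming the usual RolloutStorage layout (rewards: T x N x 1; value_preds, masks, returns: (T+1) x N x 1); the storage class itself may differ in detail.

# Sketch of the GAE(lambda) recursion, under the layout assumptions above.
import torch


def compute_gae_returns(rewards, value_preds, masks, next_value, gamma, tau):
    value_preds[-1] = next_value
    returns = torch.zeros_like(value_preds)
    gae = 0
    for step in reversed(range(rewards.size(0))):
        delta = (rewards[step] + gamma * value_preds[step + 1] * masks[step + 1]
                 - value_preds[step])
        gae = delta + gamma * tau * masks[step + 1] * gae
        returns[step] = gae + value_preds[step]
    return returns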
Example #19
0
def main():
    args = get_args()

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    if args.cuda and torch.cuda.is_available() and args.cuda_deterministic:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    log_dir = os.path.expanduser(args.log_dir)
    eval_log_dir = log_dir + "_eval"
    utils.cleanup_log_dir(log_dir)
    utils.cleanup_log_dir(eval_log_dir)

    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    #envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
    #                    args.gamma, args.log_dir, device, False)

    envs = make_parallel_env(args.env_name, args.num_processes, args.seed, True)

    '''
    actor_critic = Policy(
        envs.observation_space[0].shape,
        envs.action_space[0],
        agent_num=args.agent_num, 
        base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic.to(device)
    '''
    actor_critic = []
    for i in range(args.agent_num):
        ac = Policy(
            envs.observation_space[0].shape,
            envs.action_space[0],
            agent_num=args.agent_num, 
            agent_i = i,
            base_kwargs={'recurrent': args.recurrent_policy})
        ac.to(device)
        actor_critic.append(ac)

    
    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(
            actor_critic,
            args.value_loss_coef,
            args.entropy_coef,
            lr=args.lr,
            eps=args.eps,
            alpha=args.alpha,
            max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        '''
        agent = algo.PPO(
            actor_critic,
            args.clip_param,
            args.ppo_epoch,
            args.num_mini_batch,
            args.value_loss_coef,
            args.entropy_coef,
            lr=args.lr,
            eps=args.eps,
            max_grad_norm=args.max_grad_norm)
        '''
        agent = []
        for i in range(args.agent_num):
            agent.append(algo.PPO(
                actor_critic[i],
                i,
                args.clip_param,
                args.ppo_epoch,
                args.num_mini_batch,
                args.value_loss_coef,
                args.entropy_coef,
                lr=args.lr,
                eps=args.eps,
                max_grad_norm=args.max_grad_norm,
                model_dir = args.model_dir))
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(
            actor_critic, args.value_loss_coef, args.entropy_coef, acktr=True)

    if args.gail:
        assert len(envs.observation_space.shape) == 1
        discr = gail.Discriminator(
            envs.observation_space.shape[0] + envs.action_space.shape[0], 100,
            device)
        file_name = os.path.join(
            args.gail_experts_dir, "trajs_{}.pt".format(
                args.env_name.split('-')[0].lower()))
        
        expert_dataset = gail.ExpertDataset(
            file_name, num_trajectories=4, subsample_frequency=20)
        drop_last = len(expert_dataset) > args.gail_batch_size
        gail_train_loader = torch.utils.data.DataLoader(
            dataset=expert_dataset,
            batch_size=args.gail_batch_size,
            shuffle=True,
            drop_last=drop_last)
    '''   
    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space[0].shape, envs.action_space[0],
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(torch.tensor(obs[:,0,:]))
    rollouts.to(device)
    '''

    rollouts = []
    for i in range(args.agent_num):
        rollout = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space[0].shape, envs.action_space[0],
                              actor_critic[i].recurrent_hidden_state_size,
                              args.agent_num, i)
        rollouts.append(rollout)

    obs = envs.reset()
    # pdb.set_trace()
    
    for i in range(args.agent_num):
        rollouts[i].share_obs[0].copy_(torch.tensor(obs.reshape(args.num_processes, -1)))
        rollouts[i].obs[0].copy_(torch.tensor(obs[:,i,:]))
        rollouts[i].to(device)
        
    episode_rewards = deque(maxlen=10)

    num_updates = int(
        args.num_env_steps) // args.num_steps // args.num_processes
    print(num_updates)
    for j in range(num_updates):
        #pdb.set_trace()
        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            for i in range(args.agent_num):
                utils.update_linear_schedule(agent[i].optimizer, j, num_updates, agent[i].optimizer.lr if args.algo == "acktr" else args.lr)

        for step in range(args.num_steps):
            # Sample actions
            value_list, action_list, action_log_prob_list, recurrent_hidden_states_list = [], [], [], []
            with torch.no_grad():
                for i in range(args.agent_num):
                    #pdb.set_trace()
                    value, action, action_log_prob, recurrent_hidden_states = actor_critic[i].act(
                        rollouts[i].share_obs[step],
                        rollouts[i].obs[step], rollouts[i].recurrent_hidden_states[step],
                        rollouts[i].masks[step])
                    # import pdb; pdb.set_trace()
                    value_list.append(value)
                    action_list.append(action)
                    action_log_prob_list.append(action_log_prob)
                    recurrent_hidden_states_list.append(recurrent_hidden_states)
            # Observe reward and next obs
            action = []
            for i in range(args.num_processes):
                one_env_action = []
                for k in range(args.agent_num):
                    one_hot_action = np.zeros(envs.action_space[0].n)
                    one_hot_action[action_list[k][i]] = 1
                    one_env_action.append(one_hot_action)
                action.append(one_env_action)
            #start = time.time()
            #pdb.set_trace()            
            obs, reward, done, infos = envs.step(action)
            # print(obs[0][0])
            # pdb.set_trace()
            #end = time.time()
            #print("step time: ", end-start)
            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            '''
            masks = torch.FloatTensor(
                [[0.0] if done_ else [1.0] for done_ in done[0]])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos[0]])
            '''
            masks = torch.ones(args.num_processes, 1)
            bad_masks = torch.ones(args.num_processes, 1)
            '''
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks, bad_masks)
            '''
            #import pdb; pdb.set_trace()
            for i in range(args.agent_num):
                rollouts[i].insert(torch.tensor(obs.reshape(args.num_processes, -1)), torch.tensor(obs[:,i,:]), 
                            recurrent_hidden_states, action_list[i],
                            action_log_prob_list[i], value_list[i], torch.tensor(reward[:, i].reshape(-1,1)), masks, bad_masks)
        #import pdb; pdb.set_trace()
        with torch.no_grad():
            next_value_list = []
            for i in range(args.agent_num):
                next_value = actor_critic[i].get_value(
                    rollouts[i].share_obs[-1],
                    rollouts[i].obs[-1], rollouts[i].recurrent_hidden_states[-1],
                    rollouts[i].masks[-1]).detach()
                next_value_list.append(next_value)

        if args.gail:
            if j >= 10:
                envs.venv.eval()

            gail_epoch = args.gail_epoch
            if j < 10:
                gail_epoch = 100  # Warm up
            for _ in range(gail_epoch):
                discr.update(gail_train_loader, rollouts,
                             utils.get_vec_normalize(envs)._obfilt)

            for step in range(args.num_steps):
                rollouts.rewards[step] = discr.predict_reward(
                    rollouts.obs[step], rollouts.actions[step], args.gamma,
                    rollouts.masks[step])
        for i in range(args.agent_num):
            rollouts[i].compute_returns(next_value_list[i], args.use_gae, args.gamma,
                                    args.gae_lambda, args.use_proper_time_limits)

        #import pdb; pdb.set_trace()
        for i in range(args.agent_num):
            value_loss, action_loss, dist_entropy = agent[i].update(rollouts[i])
            if (i == 0):
                print("value loss: " + str(value_loss))
        # print(value_loss)
            # pdb.set_trace()

        #rollouts.after_update()
        obs = envs.reset()
        # pdb.set_trace()
        for i in range(args.agent_num):
            rollouts[i].share_obs[0].copy_(torch.tensor(obs.reshape(args.num_processes, -1)))
            rollouts[i].obs[0].copy_(torch.tensor(obs[:,i,:]))
            rollouts[i].to(device)

        # save for every interval-th episode or for the last epoch     
        #pdb.set_trace()   
        if (j % args.save_interval == 0
                or j == num_updates - 1) and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            if not os.path.exists(save_path + args.model_dir):
                os.makedirs(save_path + args.model_dir)
            for i in range(args.agent_num):
                torch.save([
                    actor_critic[i],
                    getattr(utils.get_vec_normalize(envs), 'ob_rms', None)
                ], save_path + args.model_dir + '/agent_%i' % (i+1) + ".pt")
        '''
        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            end = time.time()
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), dist_entropy, value_loss,
                        action_loss))
        '''
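Example #19 assembles each environment's joint action with nested Python loops over processes and agents, building NumPy one-hot vectors by hand. A hedged sketch of a vectorized equivalent is below; it assumes action_list[k] is a LongTensor of shape [num_processes, 1] and a shared Discrete(n) action space, and is only an illustration, not a required interface.

# Sketch: vectorized one-hot construction of the joint action passed to
# envs.step(action). Assumptions as stated above.
import torch
import torch.nn.functional as F


def joint_one_hot_actions(action_list, n):
    stacked = torch.stack([a.squeeze(-1) for a in action_list], dim=1)  # [procs, agents]
    one_hot = F.one_hot(stacked.long(), num_classes=n).float()          # [procs, agents, n]
    return [list(env_actions.cpu().numpy()) for env_actions in one_hot]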
Example #20
0
def main():
    ARGUMENTS.update(vars(args))
    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.port)
        win = None

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                        args.gamma, args.log_dir, args.add_timestep, device, False)

    actor_critic = Policy(envs.observation_space.shape, envs.action_space,
        base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic.to(device)

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef,
                               args.entropy_coef, lr=args.lr,
                               eps=args.eps, alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch,
                         args.value_loss_coef, args.entropy_coef, lr=args.lr,
                               eps=args.eps,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef,
                               args.entropy_coef, acktr=True)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                        envs.observation_space.shape, envs.action_space,
                        actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)

    start = time.time()
    for j in range(num_updates):

        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            if args.algo == "acktr":
                # use optimizer's learning rate since it's hard-coded in kfac.py
                update_linear_schedule(agent.optimizer, j, num_updates, agent.optimizer.lr)
            else:
                update_linear_schedule(agent.optimizer, j, num_updates, args.lr)

        if args.algo == 'ppo' and args.use_linear_clip_decay:
            agent.clip_param = args.clip_param * (1 - j / float(num_updates))

        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                        rollouts.obs[step],
                        rollouts.recurrent_hidden_states[step],
                        rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob, value, reward, masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(rollouts.obs[-1],
                                                rollouts.recurrent_hidden_states[-1],
                                                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        # save for every interval-th episode or for the last epoch
        if (j % args.save_interval == 0 or j == num_updates - 1) and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            save_model = [save_model,
                          getattr(get_vec_normalize(envs), 'ob_rms', None)]

            torch.save(save_model, os.path.join(save_path, args.env_name + ".pt"))

        total_num_steps = (j + 1) * args.num_processes * args.num_steps

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            end = time.time()
            print("Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n".
                format(j, total_num_steps,
                       int(total_num_steps / (end - start)),
                       len(episode_rewards),
                       np.mean(episode_rewards),
                       np.median(episode_rewards),
                       np.min(episode_rewards),
                       np.max(episode_rewards), dist_entropy,
                       value_loss, action_loss))
            ALL_UPDATES.append(j)
            ALL_TIMESTEPS.append(total_num_steps)
            ALL_FPS.append(int(total_num_steps / (end - start)))
            ALL_MEAN_REWARDS.append(np.mean(episode_rewards))
            ALL_MEDIAN_REWARDS.append(np.median(episode_rewards))
            ALL_MIN_REWARDS.append(np.min(episode_rewards))
            ALL_MAX_REWARDS.append(np.max(episode_rewards))
            ALL_DIST_ENTROPY.append(dist_entropy)
            ALL_VALUE_LOSS.append(value_loss)
            ALL_ACTION_LOSS.append(action_loss)

        if (args.eval_interval is not None
                and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            eval_envs = make_vec_envs(
                args.env_name, args.seed + args.num_processes, args.num_processes,
                args.gamma, eval_log_dir, args.add_timestep, device, True)

            vec_norm = get_vec_normalize(eval_envs)
            if vec_norm is not None:
                vec_norm.eval()
                vec_norm.ob_rms = get_vec_normalize(envs).ob_rms

            eval_episode_rewards = []

            obs = eval_envs.reset()
            eval_recurrent_hidden_states = torch.zeros(args.num_processes,
                            actor_critic.recurrent_hidden_state_size, device=device)
            eval_masks = torch.zeros(args.num_processes, 1, device=device)

            while len(eval_episode_rewards) < 10:
                with torch.no_grad():
                    _, action, _, eval_recurrent_hidden_states = actor_critic.act(
                        obs, eval_recurrent_hidden_states, eval_masks, deterministic=True)

                # Observe reward and next obs
                obs, reward, done, infos = eval_envs.step(action)

                eval_masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                                for done_ in done])
                for info in infos:
                    if 'episode' in info.keys():
                        eval_episode_rewards.append(info['episode']['r'])

            eval_envs.close()

            print(" Evaluation using {} episodes: mean reward {:.5f}\n".
                format(len(eval_episode_rewards),
                       np.mean(eval_episode_rewards)))

        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.env_name,
                                  args.algo, args.num_env_steps)
            except IOError:
                pass
    # Save the results
    name = ARGUMENTS['env_name'] + '-' + ARGUMENTS['algo'] + '-' + ARGUMENTS['experiment'] + '-grad_noise' + str(ARGUMENTS['gradient_noise'])
    experiment = ro.Experiment(name, directory='results')
    data = {
        'updates': ALL_UPDATES,
        'timesteps': ALL_TIMESTEPS,
        'fps': ALL_FPS,
        'mean_rewards': ALL_MEAN_REWARDS,
        'median_rewards': ALL_MEDIAN_REWARDS,
        'min_rewards': ALL_MIN_REWARDS,
        'max_rewards': ALL_MAX_REWARDS,
        'dist_entropy': ALL_DIST_ENTROPY,
        'value_loss': ALL_VALUE_LOSS,
        'action_loss': ALL_ACTION_LOSS,
    }
    data.update(ARGUMENTS)
    result = data['mean_rewards'][-1]
    experiment.add_result(result, data)
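Several of these examples (#20, #23, #24) decay the learning rate through an update_linear_schedule(optimizer, update, total_updates, initial_lr) helper that is not shown. A minimal sketch consistent with how it is called here; the exact upstream implementation may differ slightly:

def update_linear_schedule(optimizer, update, total_num_updates, initial_lr):
    # Anneal the learning rate linearly from initial_lr down to 0 over all updates.
    lr = initial_lr - (initial_lr * (update / float(total_num_updates)))
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr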
Example #21
0
def instinct_loop_ppo(
        args,
        learning_rate,
        num_steps,
        num_updates,
        inst_on,
        visualize,
        save_dir
):
    torch.set_num_threads(1)
    log_writer = SummaryWriter(save_dir, max_queue=1, filename_suffix="log")
    device = torch.device("cpu")

    env_name = ENV_NAME_BOX #"Safexp-PointGoal1-v0"
    envs = make_vec_envs(env_name, np.random.randint(2 ** 32), NUM_PROC,
                         args.gamma, None, device, allow_early_resets=True, normalize=args.norm_vectors)
    eval_envs = make_vec_envs(env_name, np.random.randint(2 ** 32), 1,
                         args.gamma, None, device, allow_early_resets=True, normalize=args.norm_vectors)

    actor_critic_policy = init_default_ppo(envs, log(args.init_sigma))

    # Prepare modified observation shape for instinct
    obs_shape = envs.observation_space.shape
    inst_action_space = deepcopy(envs.action_space)
    inst_obs_shape = list(obs_shape)
    inst_obs_shape[0] = inst_obs_shape[0] + envs.action_space.shape[0]
    # Prepare modified action space for instinct
    inst_action_space.shape = list(inst_action_space.shape)
    inst_action_space.shape[0] = inst_action_space.shape[0] + 1
    inst_action_space.shape = tuple(inst_action_space.shape)
    actor_critic_instinct = torch.load("pretrained_instinct_h100.pt")

    actor_critic_policy.to(device)
    actor_critic_instinct.to(device)

    agent_policy = algo.PPO(
        actor_critic_policy,
        args.clip_param,
        args.ppo_epoch,
        args.num_mini_batch,
        args.value_loss_coef,
        args.entropy_coef,
        lr=learning_rate,
        eps=args.eps,
        max_grad_norm=args.max_grad_norm)

    rollouts = RolloutStorage(num_steps, NUM_PROC,
                                   obs_shape, envs.action_space,
                                   actor_critic_policy.recurrent_hidden_state_size)

    obs = envs.reset()
    i_obs = make_instinct_input(obs, torch.zeros((NUM_PROC, envs.action_space.shape[0])))  # Add zero action to the observation
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    fitnesses = []
    best_fitness_so_far = float("-Inf")

    masks = torch.ones(num_steps + 1, NUM_PROC, 1)
    instinct_recurrent_hidden_states = torch.zeros(num_steps + 1, NUM_PROC, actor_critic_instinct.recurrent_hidden_state_size)

    for j in range(num_updates):
        training_collisions_current_update = 0
        for step in range(num_steps):
            # Sample actions
            with torch.no_grad():
                # (value, action, action_log_probs, rnn_hxs), (instinct_value, instinct_action, instinct_outputs_log_prob, i_rnn_hxs), final_action
                value, action, action_log_probs, recurrent_hidden_states = actor_critic_policy.act(
                    rollouts.obs[step],
                    rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step],
                    deterministic=False
                )
                instinct_value, instinct_action, instinct_outputs_log_prob, instinct_recurrent_hidden_states = actor_critic_instinct.act(
                    i_obs,
                    instinct_recurrent_hidden_states,
                    masks,
                    deterministic=False,
                )

            # Combine two networks
            final_action, i_control = policy_instinct_combinator(action, instinct_action)
            obs, reward, done, infos = envs.step(final_action)
            #envs.render()

            training_collisions_current_update += sum([i['cost'] for i in infos])
            modded_reward, violation_cost = reward_cost_combinator(reward, infos, NUM_PROC, i_control)

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done])
            bad_masks = torch.FloatTensor([[0.0] if 'bad_transition' in info.keys() else [1.0] for info in infos])
            # i_obs = torch.cat([obs, action], dim=1)
            i_obs = make_instinct_input(obs, action)
            rollouts.insert(obs, recurrent_hidden_states, action, action_log_probs,
                                 value, modded_reward, masks, bad_masks)

        with torch.no_grad():
            next_value_policy = actor_critic_policy.get_value(rollouts.obs[-1],
                                                                rollouts.recurrent_hidden_states[-1],
                                                                rollouts.masks[-1].detach())

        rollouts.compute_returns(next_value_policy, args.use_gae, args.gamma,
                                args.gae_lambda, args.use_proper_time_limits)

        print("training policy")
        # Policy training phase (the instinct network must stay frozen; the snapshots below verify it)
        p_before = deepcopy(actor_critic_instinct)
        val_loss, action_loss, dist_entropy = agent_policy.update(rollouts)
        p_after = deepcopy(actor_critic_instinct)
        assert compare_two_models(p_before, p_after), "instinct changed when it shouldn't"

        rollouts.after_update()

        ob_rms = utils.get_vec_normalize(envs)
        if ob_rms is not None:
            ob_rms = ob_rms.ob_rms

        fits, info = evaluate(EvalActorCritic(actor_critic_policy, actor_critic_instinct), ob_rms, eval_envs, NUM_PROC,
                                    reward_cost_combinator, device, instinct_on=inst_on, visualise=visualize)
        instinct_reward = info['instinct_reward']
        hazard_collisions = info['hazard_collisions']
        print(
            f"Step {j}, Fitness {fits.item()}, value_loss instinct = {val_loss}, action_loss instinct= {action_loss}, "
            f"dist_entropy instinct = {dist_entropy}")
        print(
            f"Step {j}, Cost {instinct_reward}")
        print("-----------------------------------------------------------------")

        # Tensorboard logging
        log_writer.add_scalar("Task reward", fits.item(), j)
        log_writer.add_scalar("cost/Training hazard collisions", training_collisions_current_update, j)
        log_writer.add_scalar("cost/Instinct reward", instinct_reward, j)
        log_writer.add_scalar("cost/Eval hazard collisions", hazard_collisions, j)
        log_writer.add_scalar("value loss", val_loss, j)
        log_writer.add_scalar("action loss", action_loss, j)
        log_writer.add_scalar("dist entropy", dist_entropy, j)

        fitnesses.append(fits)
        if fits.item() > best_fitness_so_far:
            best_fitness_so_far = fits.item()
            torch.save(actor_critic_instinct, join(save_dir, "model_rl_instinct.pt"))
            torch.save(actor_critic_policy, join(save_dir, "model_rl_policy.pt"))
        torch.save(actor_critic_instinct, join(save_dir, "model_rl_instinct_latest.pt"))
        torch.save(actor_critic_policy, join(save_dir, "model_rl_policy_latest.pt"))
        torch.save(actor_critic_policy, join(save_dir, f"model_rl_policy_latest_{j}.pt"))
        pickle.dump(ob_rms, open(join(save_dir, "ob_rms.p"), "wb"))
    return (fitnesses[-1]), 0, 0
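Example #21 depends on a make_instinct_input helper that is not shown; the commented-out torch.cat([obs, action], dim=1) line and the inst_obs_shape arithmetic suggest it simply appends the policy action to the observation. A sketch under that assumption:

import torch

def make_instinct_input(obs, action):
    # Assumed behaviour: concatenate the last policy action onto the observation
    # along the feature dimension (inst_obs_shape[0] == obs_dim + action_dim above).
    return torch.cat([obs, action], dim=1)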
Example #22
0
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--gpu', type=str, default="")
    parser.add_argument('--epochs', type=int, default=30)
    parser.add_argument('--optimizer', type=str, default="mixture")
    parser.add_argument('--batch-size', type=int, default=1000)
    parser.add_argument('--worker', type=int, default=8)
    parser.add_argument('--dataset', type=str, default="CIFAR10")
    parser.add_argument('--log-dir', type=str, default="logs")
    parser.add_argument('--num-classes', type=int, help="number of classes")
    parser.add_argument('--use-log-loss', action="store_true")
    parser.add_argument('--lr-meta',
                        type=float,
                        default=7e-4,
                        help='learning rate (default: 7e-4)')
    parser.add_argument('--meta-epochs',
                        type=int,
                        default=30,
                        help='meta epochs')
    parser.add_argument('--eps',
                        type=float,
                        default=1e-5,
                        help='RMSprop optimizer epsilon (default: 1e-5)')
    parser.add_argument('--alpha',
                        type=float,
                        default=0.99,
                        help='RMSprop optimizer alpha (default: 0.99)')
    parser.add_argument('--gamma',
                        type=float,
                        default=0.99,
                        help='discount factor for rewards (default: 0.99)')
    parser.add_argument('--use-gae',
                        action='store_true',
                        default=False,
                        help='use generalized advantage estimation')
    parser.add_argument('--gae-lambda',
                        type=float,
                        default=0.95,
                        help='gae lambda parameter (default: 0.95)')
    parser.add_argument('--entropy-coef',
                        type=float,
                        default=0.01,
                        help='entropy term coefficient (default: 0.01)')
    parser.add_argument('--value-loss-coef',
                        type=float,
                        default=0.5,
                        help='value loss coefficient (default: 0.5)')
    parser.add_argument('--max-grad-norm',
                        type=float,
                        default=0.5,
                        help='max norm of gradients (default: 0.5)')

    parser.add_argument('--pretrained', action="store_true")
    parser.add_argument('--name', type=str, default="")
    parser.add_argument('--data', type=str, default="")

    parser.add_argument('--num-steps', type=int, default=3)

    parser.add_argument('--val-percent', type=float, default=0.0)
    args = parser.parse_args()

    task_name = "{}_da{}_ep{}_bs{}_{}".format(args.optimizer, args.dataset,
                                              args.epochs, args.batch_size,
                                              args.name)
    writer = tensorboardX.SummaryWriter(os.path.join(args.log_dir, task_name))

    data_transforms = {
        'train':
        transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
        ]),
        'val':
        transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
        ])
    }

    if args.dataset == 'CIFAR10':
        train_dataset = torchvision.datasets.CIFAR10(
            './cifar', transform=data_transforms['train'])
        val_dataset = torchvision.datasets.CIFAR10(
            './cifar', transform=data_transforms['val'])
    elif args.dataset == 'CIFAR100':
        train_dataset = torchvision.datasets.CIFAR100(
            './cifar-100', transform=data_transforms['train'], download=True)
        val_dataset = torchvision.datasets.CIFAR100(
            './cifar-100', transform=data_transforms['val'], download=True)
    elif args.dataset == 'tiny':
        data_transforms = {
            'train':
            transforms.Compose([
                transforms.RandomSizedCrop(224),
                transforms.RandomHorizontalFlip(),
                transforms.ToTensor(),
                transforms.Normalize([0.485, 0.456, 0.406],
                                     [0.229, 0.224, 0.225]),
            ]),
            'val':
            transforms.Compose([
                transforms.Scale(256),
                transforms.CenterCrop(224),
                transforms.ToTensor(),
                transforms.Normalize([0.485, 0.456, 0.406],
                                     [0.229, 0.224, 0.225]),
            ])
        }
        train_dataset = torchvision.datasets.ImageFolder(
            './tiny-imagenet-200/train', transform=data_transforms['train'])
        val_dataset = torchvision.datasets.ImageFolder(
            './tiny-imagenet-200/val', transform=data_transforms['val'])
    elif args.dataset == 'CUB':
        train_transforms, val_transforms, evaluate_transforms = preprocess_strategy(
            'CUB')
        traindir = os.path.join(args.data, 'train')
        valdir = os.path.join(args.data, 'val')
        train_dataset = torchvision.datasets.ImageFolder(
            traindir, train_transforms)
        val_dataset = torchvision.datasets.ImageFolder(valdir, val_transforms)

    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               args.batch_size,
                                               num_workers=args.worker,
                                               shuffle=True)
    val_loader = torch.utils.data.DataLoader(val_dataset,
                                             args.batch_size,
                                             num_workers=args.worker,
                                             shuffle=False)

    #model = SimpleModel()
    model = resnet18(pretrained=args.pretrained)
    model.fc = nn.Linear(512, args.num_classes)

    if args.optimizer == 'mixture':
        action_space = np.array([0, 1, 2])
        coord_size = len(model.layers())
        ob_name_lstm = ["loss", "val_loss", "step"]
        ob_name_scalar = []
        num_steps = args.num_steps
        obs_shape = (len(ob_name_lstm) + len(ob_name_scalar) + coord_size, )
        _hidden_size = 20
        hidden_size = _hidden_size * len(ob_name_lstm)

        actor_critic = Policy(coord_size, input_size=(len(ob_name_lstm), len(ob_name_scalar)), \
        action_space=len(action_space), hidden_size=_hidden_size, window_size=1)

        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr_meta,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
        meta_info = {'coord_size': coord_size, 'action_space': action_space, 'ob_name_lstm': ob_name_lstm, \
            'ob_name_scalar': ob_name_scalar, 'obs_shape': obs_shape, 'hidden_size': hidden_size, \
            'actor_critic': actor_critic}
    if args.optimizer == 'mixture':
        rollouts = RolloutStorage(
            num_steps,
            obs_shape,
            action_shape=coord_size,
            hidden_size=hidden_size,
            num_recurrent_layers=actor_critic.net.num_recurrent_layers)
        names = list(map(lambda x: x[0], list(model.named_parameters())))
        optimizer = MixtureOptimizer(model.parameters(),
                                     0.001,
                                     writer=writer,
                                     layers=model.layers(),
                                     names=names)
    elif args.optimizer == 'adam':
        optimizer = torch.optim.Adam(model.parameters(), 0.001)
    elif args.optimizer == 'sgd':
        optimizer = torch.optim.SGD(model.parameters(), 0.001)
    else:
        raise NotImplementedError

    if len(args.gpu) == 0:
        use_cuda = False
    else:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
        use_cuda = True

    runner_config = {
        'USE_CUDA': use_cuda,
        'writer': writer,
        'epochs': args.meta_epochs,
        'val_percent': args.val_percent,
        'num_steps': args.num_steps,
        'use_gae': True,
        'savepath': 'models/' + task_name
    }

    trainer_config = {
        'train_loader': train_loader,
        'val_loader': val_loader,
        'USE_CUDA': use_cuda,
        'writer': writer,
        'use_log_loss': args.use_log_loss,
        'print_freq': 5,
        'epochs': args.epochs
    }

    if args.optimizer == 'mixture':
        trainer = MetaTrainer(model, nn.CrossEntropyLoss(), optimizer,
                              **trainer_config)

        runner = MetaRunner(trainer, rollouts, agent, actor_critic,
                            **runner_config)
        runner.run()
    else:
        trainer = Trainer(model, nn.CrossEntropyLoss(), optimizer,
                          **trainer_config)
        runner = Runner(trainer, **runner_config)
        runner.run()
Example #23
0
def main():
    args = get_args()

    torch.manual_seed(config.seed)
    torch.cuda.manual_seed_all(config.seed)

    if config.cuda and torch.cuda.is_available() and config.cuda_deterministic:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    logger, final_output_dir, tb_log_dir = create_logger(config,
                                                         args.cfg,
                                                         'train',
                                                         seed=config.seed)

    eval_log_dir = final_output_dir + "_eval"

    utils.cleanup_log_dir(final_output_dir)
    utils.cleanup_log_dir(eval_log_dir)

    logger.info(pprint.pformat(args))
    logger.info(pprint.pformat(config))

    writer = SummaryWriter(tb_log_dir)

    torch.set_num_threads(1)
    device = torch.device("cuda:" + config.GPUS if config.cuda else "cpu")

    width = height = 84
    envs = make_vec_envs(config.env_name,
                         config.seed,
                         config.num_processes,
                         config.gamma,
                         final_output_dir,
                         device,
                         False,
                         width=width,
                         height=height,
                         ram_wrapper=False)
    # create agent
    actor_critic = Policy(envs.observation_space.shape,
                          envs.action_space,
                          base_kwargs={
                              'recurrent':
                              config.recurrent_policy,
                              'hidden_size':
                              config.hidden_size,
                              'feat_from_selfsup_attention':
                              config.feat_from_selfsup_attention,
                              'feat_add_selfsup_attention':
                              config.feat_add_selfsup_attention,
                              'feat_mul_selfsup_attention_mask':
                              config.feat_mul_selfsup_attention_mask,
                              'selfsup_attention_num_keypoints':
                              config.SELFSUP_ATTENTION.NUM_KEYPOINTS,
                              'selfsup_attention_gauss_std':
                              config.SELFSUP_ATTENTION.GAUSS_STD,
                              'selfsup_attention_fix':
                              config.selfsup_attention_fix,
                              'selfsup_attention_fix_keypointer':
                              config.selfsup_attention_fix_keypointer,
                              'selfsup_attention_pretrain':
                              config.selfsup_attention_pretrain,
                              'selfsup_attention_keyp_maps_pool':
                              config.selfsup_attention_keyp_maps_pool,
                              'selfsup_attention_image_feat_only':
                              config.selfsup_attention_image_feat_only,
                              'selfsup_attention_feat_masked':
                              config.selfsup_attention_feat_masked,
                              'selfsup_attention_feat_masked_residual':
                              config.selfsup_attention_feat_masked_residual,
                              'selfsup_attention_feat_load_pretrained':
                              config.selfsup_attention_feat_load_pretrained,
                              'use_layer_norm':
                              config.use_layer_norm,
                              'selfsup_attention_keyp_cls_agnostic':
                              config.SELFSUP_ATTENTION.KEYPOINTER_CLS_AGNOSTIC,
                              'selfsup_attention_feat_use_ln':
                              config.SELFSUP_ATTENTION.USE_LAYER_NORM,
                              'selfsup_attention_use_instance_norm':
                              config.SELFSUP_ATTENTION.USE_INSTANCE_NORM,
                              'feat_mul_selfsup_attention_mask_residual':
                              config.feat_mul_selfsup_attention_mask_residual,
                              'bottom_up_form_objects':
                              config.bottom_up_form_objects,
                              'bottom_up_form_num_of_objects':
                              config.bottom_up_form_num_of_objects,
                              'gaussian_std':
                              config.gaussian_std,
                              'train_selfsup_attention':
                              config.train_selfsup_attention,
                              'block_selfsup_attention_grad':
                              config.block_selfsup_attention_grad,
                              'sep_bg_fg_feat':
                              config.sep_bg_fg_feat,
                              'mask_threshold':
                              config.mask_threshold,
                              'fix_feature':
                              config.fix_feature
                          })

    # init / load parameter
    if config.MODEL_FILE:
        logger.info('=> loading model from {}'.format(config.MODEL_FILE))
        state_dict = torch.load(config.MODEL_FILE)

        state_dict = OrderedDict(
            (_k, _v) for _k, _v in state_dict.items() if 'dist' not in _k)

        actor_critic.load_state_dict(state_dict, strict=False)
    elif config.RESUME:
        checkpoint_file = os.path.join(final_output_dir, 'checkpoint.pth')
        if os.path.exists(checkpoint_file):
            logger.info("=> loading checkpoint '{}'".format(checkpoint_file))
            checkpoint = torch.load(checkpoint_file)
            actor_critic.load_state_dict(checkpoint['state_dict'])

            logger.info("=> loaded checkpoint '{}' (epoch {})".format(
                checkpoint_file, checkpoint['epoch']))

    actor_critic.to(device)

    if config.algo == 'a2c':
        agent = algo.A2C_ACKTR(
            actor_critic,
            config.value_loss_coef,
            config.entropy_coef,
            lr=config.lr,
            eps=config.eps,
            alpha=config.alpha,
            max_grad_norm=config.max_grad_norm,
            train_selfsup_attention=config.train_selfsup_attention)
    elif config.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         config.clip_param,
                         config.ppo_epoch,
                         config.num_mini_batch,
                         config.value_loss_coef,
                         config.entropy_coef,
                         lr=config.lr,
                         eps=config.eps,
                         max_grad_norm=config.max_grad_norm)
    elif config.algo == 'acktr':
        agent = algo.A2C_ACKTR(
            actor_critic,
            config.value_loss_coef,
            config.entropy_coef,
            acktr=True,
            train_selfsup_attention=config.train_selfsup_attention,
            max_grad_norm=config.max_grad_norm)

    # rollouts: environment
    rollouts = RolloutStorage(
        config.num_steps,
        config.num_processes,
        envs.observation_space.shape,
        envs.action_space,
        actor_critic.recurrent_hidden_state_size,
        keep_buffer=config.train_selfsup_attention,
        buffer_size=config.train_selfsup_attention_buffer_size)

    if config.RESUME:
        if os.path.exists(checkpoint_file):
            agent.optimizer.load_state_dict(checkpoint['optimizer'])
    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)

    start = time.time()
    num_updates = int(
        config.num_env_steps) // config.num_steps // config.num_processes
    best_perf = 0.0
    best_model = False
    print('num updates', num_updates, 'num steps', config.num_steps)

    for j in range(num_updates):

        if config.use_linear_lr_decay:
            # decrease learning rate linearly
            utils.update_linear_schedule(
                agent.optimizer, j, num_updates,
                agent.optimizer.lr if config.algo == "acktr" else config.lr)

        for step in range(config.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            recurrent_hidden_states, meta = recurrent_hidden_states

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)

            objects_locs = []
            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            if objects_locs:
                objects_locs = torch.FloatTensor(objects_locs)
                objects_locs = objects_locs * 2 - 1  # -1, 1
            else:
                objects_locs = None
            rollouts.insert(obs,
                            recurrent_hidden_states,
                            action,
                            action_log_prob,
                            value,
                            reward,
                            masks,
                            bad_masks,
                            objects_loc=objects_locs)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1],
                rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1],
            ).detach()

        rollouts.compute_returns(next_value, config.use_gae, config.gamma,
                                 config.gae_lambda,
                                 config.use_proper_time_limits)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        if config.train_selfsup_attention and j > 15:
            for _iter in range(config.num_steps // 5):
                frame_x, frame_y = rollouts.generate_pair_image()
                selfsup_attention_loss, selfsup_attention_output, image_b_keypoints_maps = \
                    agent.update_selfsup_attention(frame_x, frame_y, config.SELFSUP_ATTENTION)

        if j % config.log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * config.num_processes * config.num_steps
            end = time.time()
            msg = 'Updates {}, num timesteps {}, FPS {} \n' \
                  'Last {} training episodes: mean/median reward {:.1f}/{:.1f} ' \
                  'min/max reward {:.1f}/{:.1f} ' \
                  'dist entropy {:.1f}, value loss {:.1f}, action loss {:.1f}\n'. \
                format(j, total_num_steps,
                       int(total_num_steps / (end - start)),
                       len(episode_rewards), np.mean(episode_rewards),
                       np.median(episode_rewards), np.min(episode_rewards),
                       np.max(episode_rewards), dist_entropy, value_loss,
                       action_loss)
            if config.train_selfsup_attention and j > 15:
                msg = msg + 'selfsup attention loss {:.5f}\n'.format(
                    selfsup_attention_loss)
            logger.info(msg)

        if (config.eval_interval is not None and len(episode_rewards) > 1
                and j % config.eval_interval == 0):
            total_num_steps = (j + 1) * config.num_processes * config.num_steps
            ob_rms = getattr(utils.get_vec_normalize(envs), 'ob_rms', None)
            eval_mean_score, eval_max_score, eval_scores = evaluate(
                actor_critic,
                ob_rms,
                config.env_name,
                config.seed,
                config.num_processes,
                eval_log_dir,
                device,
                width=width,
                height=height)
            perf_indicator = eval_mean_score
            if perf_indicator > best_perf:
                best_perf = perf_indicator
                best_model = True
            else:
                best_model = False

            # record test scores
            with open(os.path.join(final_output_dir, 'test_scores'),
                      'a+') as f:
                out_s = "TEST: {}, {}, {}, {}\n".format(
                    str(total_num_steps), str(eval_mean_score),
                    str(eval_max_score),
                    [str(_eval_scores) for _eval_scores in eval_scores])
                print(out_s, end="", file=f)
                logger.info(out_s)
            writer.add_scalar('data/mean_score', eval_mean_score,
                              total_num_steps)
            writer.add_scalar('data/max_score', eval_max_score,
                              total_num_steps)

            writer.add_scalars('test', {'mean_score': eval_mean_score},
                               total_num_steps)

            # save for every interval-th episode or for the last epoch
            if (j % config.save_interval == 0
                    or j == num_updates - 1) and config.save_dir != "":

                logger.info(
                    "=> saving checkpoint to {}".format(final_output_dir))
                epoch = j / config.save_interval
                save_checkpoint(
                    {
                        'epoch':
                        epoch + 1,
                        'model':
                        get_model_name(config),
                        'state_dict':
                        actor_critic.state_dict(),
                        'perf':
                        perf_indicator,
                        'optimizer':
                        agent.optimizer.state_dict(),
                        'ob_rms':
                        getattr(utils.get_vec_normalize(envs), 'ob_rms', None)
                    }, best_model, final_output_dir)

    final_model_state_file = os.path.join(final_output_dir, 'final_state.pth')
    logger.info(
        '=> saving final model state to {}'.format(final_model_state_file))
    torch.save(actor_critic.state_dict(), final_model_state_file)

    # export_scalars_to_json needs results from add scalars
    writer.export_scalars_to_json(os.path.join(tb_log_dir, 'all_scalars.json'))
    writer.close()
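The update/timestep bookkeeping is the same across these training loops: the number of updates is the environment-step budget divided by the rollout size, and the reported timestep count is updates times rollout size. A small worked example with illustrative values (not taken from any config above):

num_env_steps = 10_000_000
num_steps = 128        # rollout length per process
num_processes = 8

num_updates = num_env_steps // num_steps // num_processes   # 9765 updates
j = 100                                                      # some update index
total_num_steps = (j + 1) * num_processes * num_steps        # 103424 frames so far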
Example #24
0
def train(train_states,
          run_dir,
          num_env_steps,
          eval_env_steps,
          writer,
          writer_name,
          args,
          init_model=None):
    envs = make_vec_envs(train_states, args.seed, args.num_processes,
                         args.gamma, 'cpu', 'train', args)

    if init_model:
        actor_critic, env_step, model_name = init_model
        obs_space = actor_critic.obs_space
        obs_process = actor_critic.obs_process
        obs_module = actor_critic.obs_module
        print(f"  [load] Loaded model {model_name} at step {env_step}")
    else:
        obs_space = envs.observation_space
        actor_critic = Policy(obs_space,
                              args.obs_process,
                              args.obs_module,
                              envs.action_space,
                              base_kwargs={'recurrent': args.recurrent_policy})
        env_step = 0
    actor_critic.to(args.device)
    #print(actor_critic)

    run_name = run_dir.replace('/', '_')
    vid_save_dir = f"{run_dir}/videos/"
    try:
        os.makedirs(vid_save_dir)
    except OSError:
        pass
    ckpt_save_dir = f"{run_dir}/ckpts/"
    try:
        os.makedirs(ckpt_save_dir)
    except OSError:
        pass

    if args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         args.device,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm,
                               acktr=False)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm,
                               acktr=True)
    else:
        raise NotImplementedError

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    actor_critic.eval()
    """
    try:
        writer.add_graph(actor_critic, obs)
    except ValueError:
        print("Unable to write model graph to tensorboard.")
    """
    actor_critic.train()

    for k in rollouts.obs.keys():
        rollouts.obs[k][0].copy_(obs[k][0])

    episode_rewards = deque(maxlen=10)

    num_updates = num_env_steps // args.num_steps // args.num_processes
    batch_size = args.num_steps * args.num_processes
    start = time.time()
    while env_step < num_env_steps:
        s = time.time()
        if args.use_linear_lr_decay:
            # decrease learning rate linearly (the update index is derived from
            # env_step, since this loop tracks environment steps rather than updates)
            update_idx = env_step // batch_size
            utils.update_linear_schedule(
                agent.optimizer, update_idx, num_updates,
                agent.optimizer.lr if args.algo == "acktr" else args.lr)
        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states, _ = actor_critic.act(
                    {
                        k: rollouts.obs[k][step].float().to(args.device)
                        for k in rollouts.obs.keys()
                    }, rollouts.recurrent_hidden_states[step].to(args.device),
                    rollouts.masks[step].to(args.device))
                value = value.cpu()
                action = action.cpu()
                action_log_prob = action_log_prob.cpu()
                recurrent_hidden_states = recurrent_hidden_states.cpu()
            # Observe reward and next obs
            obs, reward, dones, infos = envs.step(action)

            for done, info in zip(dones, infos):
                env_state = info['env_state'][1]
                if done:
                    writer.add_scalar(f'train_episode_x/{env_state}',
                                      info['max_x'], env_step)
                    writer.add_scalar(f'train_episode_%/{env_state}',
                                      info['max_x'] / info['lvl_max_x'] * 100,
                                      env_step)
                    writer.add_scalar(f'train_episode_r/{env_state}',
                                      info['sum_r'], env_step)

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done else [1.0]
                                       for done in dones])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks, bad_masks)
        with torch.no_grad():
            next_value = actor_critic.get_value(
                {
                    k: rollouts.obs[k][-1].float().to(args.device)
                    for k in rollouts.obs.keys()
                }, rollouts.recurrent_hidden_states[-1].to(args.device),
                rollouts.masks[-1].to(args.device)).detach().cpu()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.gae_lambda, args.use_proper_time_limits)
        value_loss, action_loss, dist_entropy = agent.update(rollouts)
        rollouts.after_update()

        env_step += batch_size
        fps = batch_size / (time.time() - s)
        #res = nvidia_smi.nvmlDeviceGetUtilizationRates(handle)
        #writer.add_scalar(f'gpu_usage/{writer_name}', res.gpu, env_step)
        #writer.add_scalar(f'gpu_mem/{writer_name}', res.memory, env_step)
        total_norm = 0
        for p in list(
                filter(lambda p: p.grad is not None,
                       actor_critic.parameters())):
            param_norm = p.grad.data.norm(2)
            total_norm += param_norm.item()**2
        total_norm = total_norm**(1. / 2)
        obs_norm = {}
        for obs_name in args.obs_keys:
            t_norm = 0
            if obs_name == 'video':
                md = actor_critic.base.video_module
            elif obs_name == 'audio':
                md = actor_critic.base.audio_module
            else:
                raise NotImplementedError
            for p in list(filter(lambda p: p.grad is not None,
                                 md.parameters())):
                param_norm = p.grad.data.norm(2)
                t_norm += param_norm.item()**2
            obs_norm[obs_name] = t_norm**(1. / 2)

        prev_env_step = max(0, env_step + 1 - batch_size)
        # write training metrics for this batch, usually takes 0.003s
        if (env_step + 1
            ) // args.write_interval > prev_env_step // args.write_interval:
            writer.add_scalar(f'grad_norm/{writer_name}', total_norm, env_step)
            writer.add_scalar(f'fps/{writer_name}', fps, env_step)
            writer.add_scalar(f'value_loss/{writer_name}',
                              value_loss / batch_size, env_step)
            writer.add_scalar(f'action_loss/{writer_name}',
                              action_loss / batch_size, env_step)
            writer.add_scalar(f'dist_entropy/{writer_name}',
                              dist_entropy / batch_size, env_step)
            writer.add_scalar(f'cpu_usage/{writer_name}', psutil.cpu_percent(),
                              env_step)
            writer.add_scalar(f'cpu_mem/{writer_name}',
                              psutil.virtual_memory()._asdict()['percent'],
                              env_step)
            for obs_name in args.obs_keys:
                writer.add_scalar(f'grad_norm_{obs_name}/{writer_name}',
                                  obs_norm[obs_name], env_step)

        # print log to console
        if (env_step +
                1) // args.log_interval > prev_env_step // args.log_interval:
            end = time.time()
            print("  [log] Env step {} of {}: {:.1f}s, {:.1f}fps".format(
                env_step + 1, num_env_steps, end - start, fps))
            if len(episode_rewards) > 0:
                print(
                    "    Last {} episodes: mean/med reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}"
                    .format(len(episode_rewards), np.mean(episode_rewards),
                            np.median(episode_rewards),
                            np.min(episode_rewards), np.max(episode_rewards)))
            print(
                "    dist_entropy {:.5f}, value_loss {:.6f}, action_loss {:.6f}, grad_norm {:.6f}"
                .format(dist_entropy, value_loss, action_loss, total_norm))
            start = time.time()

        # save model to ckpt
        if ((env_step + 1) // args.save_interval >
                prev_env_step // args.save_interval):
            torch.save([
                actor_critic,
                env_step,
                run_name,
            ], os.path.join(ckpt_save_dir, f"{run_name}-{env_step}.pt"))
            print(f"  [save] Saved model at step {env_step+1}.")

        # save model to ckpt and run evaluation if eval_interval and not final iteration in training loop
        if ((env_step + 1) // args.eval_interval >
                prev_env_step // args.eval_interval
            ) and env_step < num_env_steps and eval_env_steps > 0:
            torch.save([
                actor_critic,
                env_step,
                run_name,
            ], os.path.join(ckpt_save_dir, f"{run_name}-{env_step}.pt"))
            print(f"  [save] Saved model at step {env_step+1}.")

            envs.close()
            del envs  # close does not actually get rid of envs, need to del
            actor_critic.eval()
            eval_score, e_dict = evaluate(train_states, actor_critic,
                                          eval_env_steps, env_step, writer,
                                          vid_save_dir, args.vid_tb_steps,
                                          args.vid_file_steps,
                                          args.obs_viz_layer, args)
            print(f"  [eval] Evaluation score: {eval_score}")
            writer.add_scalar('eval_score', eval_score, env_step)

            actor_critic.train()
            envs = make_vec_envs(train_states, args.seed, args.num_processes,
                                 args.gamma, 'cpu', 'train', args)
            obs = envs.reset()
            # TODO: does this work? do we need to increment env step or something? why are hidden_states inserted at index 0?
            for k in rollouts.obs.keys():
                rollouts.obs[k][0].copy_(obs[k][0])

    # final model save
    final_model_path = os.path.join(ckpt_save_dir, f"{run_name}-{env_step}.pt")
    torch.save([
        actor_critic,
        env_step,
        run_name,
    ], final_model_path)
    print(
        f"  [save] Final model saved at step {env_step+1} to {final_model_path}"
    )

    # final model eval
    envs.close()
    del envs
    eval_score = None
    eval_dict = None
    if eval_env_steps > 0:
        eval_score, eval_dict = evaluate(train_states, actor_critic,
                                         eval_env_steps, env_step, writer,
                                         vid_save_dir, args.vid_tb_steps,
                                         args.vid_file_steps,
                                         args.obs_viz_layer, args)
        print(f"  [eval] Final model evaluation score: {eval_score:.3f}")

    return (actor_critic, env_step, run_name), eval_score, eval_dict
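Most of these loops call rollouts.compute_returns(next_value, use_gae, gamma, gae_lambda, ...); with use_gae enabled this is the generalized advantage estimation recursion. A self-contained sketch of that recursion (not the RolloutStorage implementation itself):

import torch

def gae_returns(rewards, values, masks, next_value, gamma, gae_lambda):
    # rewards, values, masks: tensors of shape [T, N, 1]; next_value: [N, 1].
    # masks[t] is 0.0 where the episode ended at step t, so nothing is
    # bootstrapped across an episode boundary.
    T = rewards.size(0)
    values = torch.cat([values, next_value.unsqueeze(0)], dim=0)
    returns = torch.zeros_like(rewards)
    gae = torch.zeros_like(next_value)
    for t in reversed(range(T)):
        delta = rewards[t] + gamma * values[t + 1] * masks[t] - values[t]
        gae = delta + gamma * gae_lambda * masks[t] * gae
        returns[t] = gae + values[t]
    return returns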
Example #25
0
def train(args, envs, encoder, agent, actor_critic, device):
    rollouts = RolloutStorage(
        args.num_steps,
        args.num_processes,
        envs.observation_space.shape,
        envs.action_space,
        actor_critic.recurrent_hidden_state_size,
    )

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)

    start = time.time()
    num_updates = int(args.num_env_steps) // args.num_steps // args.num_processes
    for j in range(num_updates):

        if args.ppo_use_linear_lr_decay:
            # decrease learning rate linearly
            utils.update_linear_schedule(agent.optimizer, j, num_updates, args.ppo_lr)

        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_probs, recurrent_hidden_states, actor_features, dist_entropy = actor_critic.act(
                    rollouts.obs[step],
                    rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step],
                )

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)

            # TODO: Check that the encoder is not updated
            # TODO: Analyze features of vae and infonce-st encoder

            for info in infos:
                if "episode" in info.keys():
                    episode_rewards.append(info["episode"]["r"])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if "bad_transition" in info.keys() else [1.0] for info in infos]
            )
            rollouts.insert(
                obs,
                recurrent_hidden_states,
                action,
                action_log_probs,
                value,
                reward,
                masks,
                bad_masks,
            )

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1],
                rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1],
            )

        rollouts.compute_returns(
            next_value, False, args.ppo_gamma, 0.0, args.use_proper_time_limits
        )

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        # save for every interval-th episode or for the last epoch
        if j % args.save_interval == 0 or j == num_updates - 1:
            torch.save(
                [actor_critic, getattr(utils.get_vec_normalize(envs), "ob_rms", None)],
                os.path.join(wandb.run.dir, args.env_name + ".pt"),
            )

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            end = time.time()
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n".format(
                    j,
                    total_num_steps,
                    int(total_num_steps / (end - start)),
                    len(episode_rewards),
                    np.mean(episode_rewards),
                    np.median(episode_rewards),
                    np.min(episode_rewards),
                    np.max(episode_rewards),
                )
            )
            wandb.log(
                {
                    "updates": j,
                    "total_num_steps": total_num_steps,
                    "fps": int(total_num_steps / (end - start)),
                    "episode_rewards_mean": np.mean(episode_rewards),
                    "episode_rewards_median": np.median(episode_rewards),
                    "episode_rewards_min": np.min(episode_rewards),
                    "episode_rewards_max": np.max(episode_rewards),
                    "entropy": dist_entropy,
                    "value_loss": value_loss,
                    "policy_loss": action_loss,
                }
            )
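Example #25 passes use_gae=False and a lambda of 0.0 to compute_returns, so the targets reduce to plain bootstrapped discounted returns. A sketch of that simpler recursion, using the same tensor layout as the GAE sketch above:

import torch

def discounted_returns(rewards, masks, next_value, gamma):
    # rewards, masks: [T, N, 1]; masks[t] is 0.0 where the episode ended at step t.
    T = rewards.size(0)
    returns = torch.zeros_like(rewards)
    running = next_value
    for t in reversed(range(T)):
        running = rewards[t] + gamma * masks[t] * running
        returns[t] = running
    return returns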
Example #26
0
def main():
    args = get_args()

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    if args.cuda and torch.cuda.is_available() and args.cuda_deterministic:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    log_dir = os.path.expanduser(args.log_dir + args.env_name)
    eval_log_dir = log_dir + "_eval"
    utils.cleanup_log_dir(log_dir)
    utils.cleanup_log_dir(eval_log_dir)

    log_dir2 = os.path.expanduser(args.log_dir2 + args.env_name2)
    eval_log_dir2 = log_dir2 + "_eval"
    utils.cleanup_log_dir(log_dir2)
    utils.cleanup_log_dir(eval_log_dir2)

    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    import json
    file_path = "config.json"
    setup_json = json.load(open(file_path, 'r'))
    env_conf = setup_json["Default"]
    for i in setup_json.keys():
        if i in args.env_name:
            env_conf = setup_json[i]


    # 1 game
    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, device, env_conf, False)
    # 2 game
    envs2 = make_vec_envs(args.env_name2, args.seed, args.num_processes,
                          args.gamma, args.log_dir2, device, env_conf, False)

    save_model, ob_rms = torch.load('./trained_models/PongNoFrameskip-v4.pt')

    from a2c_ppo_acktr.cnn import CNNBase

    a = CNNBase(envs.observation_space.shape[0], recurrent=False)

    actor_critic = Policy(
        envs.observation_space.shape,
        envs.action_space,
        #(obs_shape[0], ** base_kwargs)
        base=a,
        #base_kwargs={'recurrent': args.recurrent_policy}
    )
    #actor_critic.load_state_dict(save_model.state_dict())
    actor_critic.to(device)

    actor_critic2 = Policy(envs2.observation_space.shape,
                           envs2.action_space,
                           base=a)
    #base_kwargs={'recurrent': args.recurrent_policy})
    #actor_critic2.load_state_dict(save_model.state_dict())
    actor_critic2.to(device)

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               actor_critic2,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    rollouts2 = RolloutStorage(args.num_steps, args.num_processes,
                               envs2.observation_space.shape,
                               envs2.action_space,
                               actor_critic2.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    obs2 = envs2.reset()
    rollouts2.obs[0].copy_(obs2)
    rollouts2.to(device)

    episode_rewards = deque(maxlen=10)
    episode_rewards2 = deque(maxlen=10)

    start = time.time()
    num_updates = int(
        args.num_env_steps) // args.num_steps // args.num_processes

    for j in range(num_updates):
        # if args.use_linear_lr_decay:
        #     # decrease learning rate linearly
        #     utils.update_linear_schedule(
        #         agent.optimizer, j, num_updates,
        #         agent.optimizer.lr if args.algo == "acktr" else args.lr)

        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states, _ = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])
                value2, action2, action_log_prob2, recurrent_hidden_states2, _ = actor_critic2.act(
                    rollouts2.obs[step],
                    rollouts2.recurrent_hidden_states[step],
                    rollouts2.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)
            obs2, reward2, done2, infos2 = envs2.step(action2)

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])
            for info2 in infos2:
                if 'episode' in info2.keys():
                    episode_rewards2.append(info2['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])

            masks2 = torch.FloatTensor([[0.0] if done_ else [1.0]
                                        for done_ in done2])
            bad_masks2 = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info2.keys() else [1.0]
                 for info2 in infos2])

            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks, bad_masks)
            rollouts2.insert(obs2, recurrent_hidden_states2, action2,
                             action_log_prob2, value2, reward2, masks2,
                             bad_masks2)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()
            next_value2 = actor_critic2.get_value(
                rollouts2.obs[-1], rollouts2.recurrent_hidden_states[-1],
                rollouts2.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.gae_lambda, args.use_proper_time_limits)

        rollouts2.compute_returns(next_value2, args.use_gae, args.gamma,
                                  args.gae_lambda, args.use_proper_time_limits)

        value_loss, action_loss, dist_entropy, value_loss2, action_loss2, dist_entropy2 = agent.update(
            rollouts, rollouts2)

        rollouts.after_update()
        rollouts2.after_update()

        # save for every interval-th episode or for the last epoch
        if (j % args.save_interval == 0
                or j == num_updates - 1) and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            torch.save([
                actor_critic,
                getattr(utils.get_vec_normalize(envs), 'ob_rms', None)
            ], os.path.join(save_path, args.env_name + ".pt"))
            torch.save([
                actor_critic2,
                getattr(utils.get_vec_normalize(envs2), 'ob_rms', None)
            ], os.path.join(save_path, args.env_name2 + ".pt"))

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            end = time.time()
            print(
                "[game 1] Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), dist_entropy, value_loss,
                        action_loss))
            print(
                "[game 2] Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards2), np.mean(episode_rewards2),
                        np.median(episode_rewards2), np.min(episode_rewards2),
                        np.max(episode_rewards2), dist_entropy2, value_loss2,
                        action_loss2))

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            ob_rms = utils.get_vec_normalize(envs).ob_rms
            evaluate(actor_critic, ob_rms, args.env_name, args.seed,
                     args.num_processes, eval_log_dir, device)

            ob_rms2 = utils.get_vec_normalize(envs2).ob_rms
            evaluate(actor_critic2, ob_rms2, args.env_name2, args.seed,
                     args.num_processes, eval_log_dir2, device)
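The dual-game loop above repeats the same cycle for each environment: collect num_steps transitions, bootstrap the value of the last observation, turn rewards into returns, take one gradient step, and shift the rollout buffer. The sketch below condenses that cycle for a single environment. The helper name run_one_update is illustrative, and it assumes the standard a2c_ppo_acktr API in which actor_critic.act returns four values and agent.update takes a single rollout buffer (the example above uses variants of both), so treat it as an outline rather than the exact code.

import torch

def run_one_update(envs, actor_critic, agent, rollouts, args):
    # 1) collect args.num_steps transitions with the current policy
    for step in range(args.num_steps):
        with torch.no_grad():
            value, action, action_log_prob, rnn_states = actor_critic.act(
                rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                rollouts.masks[step])
        obs, reward, done, infos = envs.step(action)
        # masks stop bootstrapping at episode ends; bad_masks flag time-limit terminations
        masks = torch.FloatTensor([[0.0] if d else [1.0] for d in done])
        bad_masks = torch.FloatTensor(
            [[0.0] if 'bad_transition' in info.keys() else [1.0]
             for info in infos])
        rollouts.insert(obs, rnn_states, action, action_log_prob,
                        value, reward, masks, bad_masks)
    # 2) bootstrap the value of the observation after the last step
    with torch.no_grad():
        next_value = actor_critic.get_value(
            rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
            rollouts.masks[-1]).detach()
    # 3) compute returns/advantages and 4) update the policy
    rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                             args.gae_lambda, args.use_proper_time_limits)
    losses = agent.update(rollouts)
    rollouts.after_update()
    return losses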
Example #27
0
def main():
    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.port)
        win = None

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, args.add_timestep, device,
                         False)
    if args.load_policy is not None:
        actor_critic, ob_rms = torch.load(args.load_policy)
        vec_norm = get_vec_normalize(envs)
        if vec_norm is not None:
            vec_norm.eval()
            vec_norm.ob_rms = ob_rms
    else:
        actor_critic = Policy(envs.observation_space.shape,
                              envs.action_space,
                              base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic.to(device)

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(
        maxlen=(args.num_processes if args.num_processes > 10 else 10))

    start = time.time()
    snapshot_counter = 0
    last_delete = -1
    try:
        os.makedirs(os.path.join(args.save_dir, args.algo))
    except OSError:
        pass
    log_out_file = open(os.path.join(args.save_dir, args.algo, 'log_info.txt'),
                        'w')
    for j in range(num_updates):

        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            if args.algo == "acktr":
                # use optimizer's learning rate since it's hard-coded in kfac.py
                update_linear_schedule(agent.optimizer, j, num_updates,
                                       agent.optimizer.lr)
            else:
                update_linear_schedule(agent.optimizer, j, num_updates,
                                       args.lr)

        if args.algo == 'ppo' and args.use_linear_clip_decay:
            agent.clip_param = args.clip_param * (1 - j / float(num_updates))

        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        # save for every interval-th episode or for the last epoch
        if (j % args.save_interval == 0
                or j == num_updates - 1) and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            save_model = [
                save_model,
                getattr(get_vec_normalize(envs), 'ob_rms', None)
            ]

            torch.save(
                save_model,
                os.path.join(save_path,
                             args.env_name + "epoch_{:07d}.pt".format(j)))
            snapshot_counter += 1
            last_delete += 1
            if snapshot_counter > 100:
                os.system('rm ' + os.path.join(
                    save_path, args.env_name +
                    'epoch_{:07d}.pt'.format(last_delete)))
                snapshot_counter -= 1

        total_num_steps = (j + 1) * args.num_processes * args.num_steps

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            end = time.time()
            log_info = "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n".\
                format(j, total_num_steps,
                       int(total_num_steps / (end - start)),
                       len(episode_rewards),
                       np.mean(episode_rewards),
                       np.median(episode_rewards),
                       np.min(episode_rewards),
                       np.max(episode_rewards), dist_entropy,
                       value_loss, action_loss)
            print(log_info)
            sys.stdout.flush()
            log_out_file.write(log_info)
            log_out_file.flush()

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            eval_envs = make_vec_envs(args.env_name,
                                      args.seed + args.num_processes,
                                      args.num_processes, args.gamma,
                                      eval_log_dir, args.add_timestep, device,
                                      True)

            vec_norm = get_vec_normalize(eval_envs)
            if vec_norm is not None:
                vec_norm.eval()
                vec_norm.ob_rms = get_vec_normalize(envs).ob_rms

            eval_episode_rewards = []

            obs = eval_envs.reset()
            eval_recurrent_hidden_states = torch.zeros(
                args.num_processes,
                actor_critic.recurrent_hidden_state_size,
                device=device)
            eval_masks = torch.zeros(args.num_processes, 1, device=device)

            while len(eval_episode_rewards) < 10:
                with torch.no_grad():
                    _, action, _, eval_recurrent_hidden_states = actor_critic.act(
                        obs,
                        eval_recurrent_hidden_states,
                        eval_masks,
                        deterministic=True)

                # Observe reward and next obs
                obs, reward, done, infos = eval_envs.step(action)

                eval_masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                                for done_ in done])
                for info in infos:
                    if 'episode' in info.keys():
                        eval_episode_rewards.append(info['episode']['r'])

            eval_envs.close()

            print(" Evaluation using {} episodes: mean reward {:.5f}\n".format(
                len(eval_episode_rewards), np.mean(eval_episode_rewards)))
            log_out_file.write(
                " Evaluation using {} episodes: mean reward {:.5f}\n".format(
                    len(eval_episode_rewards), np.mean(eval_episode_rewards)))
            log_out_file.flush()
            sys.stdout.flush()

        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.env_name,
                                  args.algo, args.num_env_steps)
            except IOError:
                pass
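Example #27 (and most of the examples that follow) anneals the learning rate with update_linear_schedule(optimizer, j, num_updates, lr). The helper's body is not shown in these snippets; the sketch below is an assumption consistent with how it is called, not necessarily the library's exact implementation.

def update_linear_schedule(optimizer, epoch, total_num_epochs, initial_lr):
    """Linearly anneal the optimizer's learning rate from initial_lr toward 0."""
    lr = initial_lr - initial_lr * (epoch / float(total_num_epochs))
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr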
Example #28
0
def main():
    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.port)
        win = None
    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, args.add_timestep, device,
                         True)

    frame_skip = 4  # frame skip
    if args.tb_dir[-1] != '/':
        args.tb_dir = args.tb_dir + '/'
    logger = Logger(args.tb_dir)
    logger.write_settings(args)
    if args.use_tdm:

        # beta scheduler
        if args.beta_schedule == 'const':
            beta_func = lambda x: float(args.beta_int)
        elif args.beta_schedule == 'sqrt':
            beta_func = lambda x: 1. / np.sqrt(x + 2)
        elif args.beta_schedule == 'log':
            beta_func = lambda x: 1. / np.log(x + 2)
        elif args.beta_schedule == 'linear':
            beta_func = lambda x: 1. / (x + 2)

        # bonus function variations
        if args.bonus_func == 'linear':
            bonus_func = lambda x: x + 1
        elif args.bonus_func == 'square':
            bonus_func = lambda x: (x + 1)**2
        elif args.bonus_func == 'sqrt':
            bonus_func = lambda x: (x + 1)**(1 / 2)
        elif args.bonus_func == 'log':
            bonus_func = lambda x: np.log(x + 1)
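        # Note: beta_func maps the global step count to the weight of the
        # intrinsic reward (constant, or decaying like 1/sqrt(t), 1/log(t), 1/t),
        # while bonus_func shapes the raw bonus inside the TemporalDifferenceModule
        # it is passed to below. beta_func is applied in the step loop further down,
        # where reward += beta_func(step + j * num_steps) * reward_int.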

        # temporal difference module
        tdm = TemporalDifferenceModule(
            inputSize=2 * int(envs.observation_space.shape[0]),
            outputSize=args.time_intervals,
            num_fc_layers=int(args.num_layers),
            depth_fc_layers=int(args.fc_width),
            lr=float(args.opt_lr),
            buffer_max_length=args.buffer_max_length,
            buffer_RL_ratio=args.buffer_RL_ratio,
            frame_skip=frame_skip,
            tdm_epoch=args.tdm_epoch,
            tdm_batchsize=args.tdm_batchsize,
            logger=logger,
            bonus_func=bonus_func).to(device)

        #collect random trajectories
        sample_collector = CollectSamples(envs,
                                          args.num_processes,
                                          initial=True)
        tdm.buffer_rand = sample_collector.collect_trajectories(
            args.num_rollouts, args.steps_per_rollout)

        # initial training
        tdm.update()

    actor_critic = Policy(envs.observation_space.shape,
                          envs.action_space,
                          base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic.to(device)

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)
    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)
    episode_rewards = deque(maxlen=10)
    start = time.time()
    for j in range(num_updates):

        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            if args.algo == "acktr":
                # use optimizer's learning rate since it's hard-coded in kfac.py
                update_linear_schedule(agent.optimizer, j, num_updates,
                                       agent.optimizer.lr)
            else:
                update_linear_schedule(agent.optimizer, j, num_updates,
                                       args.lr)

        if args.algo == 'ppo' and args.use_linear_clip_decay:
            agent.clip_param = args.clip_param * (1 - j / float(num_updates))

        # acting
        for step in range(args.num_steps):

            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Observe reward and next obs
            # envs.render()

            obs_old = obs.clone()
            obs, reward, done, infos = envs.step(action)

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])

            # compute intrinsic bonus and mix it into the extrinsic reward
            if args.use_tdm:
                tdm.symm_eval = True if step == args.num_steps - 1 else False
                reward_int = tdm.compute_bonus(obs_old, obs).float()
                reward += beta_func(
                    step + j * args.num_steps) * reward_int.cpu().unsqueeze(1)

                if (j % args.log_interval == 0) and (step
                                                     == args.num_steps - 1):
                    logger.add_reward_intrinsic(reward_int,
                                                (j + 1) * args.num_steps *
                                                args.num_processes)

            # save the transition to the rollout buffer
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks)

        # append the rollout to the TDM buffer and periodically update the TDM
        if (args.use_tdm):
            tdm.buffer_RL_temp.append((rollouts.obs, rollouts.masks))
            if (j % args.num_steps == 0 and j > 0):
                tdm.update()

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        # save every 1 million environment steps, and at the last update
        if (((j + 1) * args.num_steps * args.num_processes) % 1e6 == 0
                or j == num_updates - 1) and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            save_model = [
                save_model,
                getattr(get_vec_normalize(envs), 'ob_rms', None)
            ]

            if j == num_updates - 1:
                save_here = os.path.join(save_path,
                                         args.env_name + "_final.pt")
            else:
                save_here = os.path.join(
                    save_path, args.env_name + "_step_{}M.pt".format(
                        int((j + 1) * args.num_steps * args.num_processes //
                            1e6)))
            torch.save(save_model, save_here)  # saved policy.

        total_num_steps = (j + 1) * args.num_processes * args.num_steps

        # printing outputs
        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            end = time.time()
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), dist_entropy, value_loss,
                        action_loss))
            logger.add_reward(episode_rewards,
                              (j + 1) * args.num_steps * args.num_processes)

        #
        # if j % args.tb_interval == 0:
        #     # mean/std or median/1stqt?
        #     logger.add_tdm_loss(loss, self.epoch_count*i)

        # evaluation process
        # if (args.eval_interval is not None
        #         and len(episode_rewards) > 1
        #         and j % args.eval_interval == 0):
        #     eval_envs = make_vec_envs(
        #         args.env_name, args.seed + args.num_processes, args.num_processes,
        #         args.gamma, eval_log_dir, args.add_timestep, device, True)
        #
        #     vec_norm = get_vec_normalize(eval_envs)
        #     if vec_norm is not None:
        #         vec_norm.eval()
        #         vec_norm.ob_rms = get_vec_normalize(envs).ob_rms
        #
        #     eval_episode_rewards = []
        #
        #     obs = eval_envs.reset()
        #     eval_recurrent_hidden_states = torch.zeros(args.num_processes,
        #                     actor_critic.recurrent_hidden_state_size, device=device)
        #     eval_masks = torch.zeros(args.num_processes, 1, device=device)
        #
        #     while len(eval_episode_rewards) < 10:
        #         with torch.no_grad():
        #             _, action, _, eval_recurrent_hidden_states = actor_critic.act(
        #                 obs, eval_recurrent_hidden_states, eval_masks, deterministic=True)
        #
        #         # Obser reward and next obs
        #         # envs.render()
        #         obs, reward, done, infos = eval_envs.step(action)
        #
        #         eval_masks = torch.FloatTensor([[0.0] if done_ else [1.0]
        #                                         for done_ in done])
        #         for info in infos:
        #             if 'episode' in info.keys():
        #                 eval_episode_rewards.append(info['episode']['r'])
        #
        #     eval_envs.close()
        #
        #     print(" Evaluation using {} episodes: mean reward {:.5f}\n".
        #         format(len(eval_episode_rewards),
        #                np.mean(eval_episode_rewards)))

        # # plotting
        # if args.vis and j % args.vis_interval == 0:
        #     try:
        #         # Sometimes monitor doesn't properly flush the outputs
        #         win = visdom_plot(viz, win, args.log_dir, args.env_name,
        #                           args.algo, args.num_env_steps)
        #     except IOError:
        #         pass
    # save logger outputs once training is done
    logger.save()
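Every example ends an update with rollouts.compute_returns(next_value, args.use_gae, ...) before calling agent.update. The sketch below shows what that call is assumed to compute when use_gae is enabled: standard Generalized Advantage Estimation, written here as a standalone function with explicit tensor shapes (the real RolloutStorage also handles proper time limits and the non-GAE return path).

import torch

def compute_gae_returns(rewards, values, masks, next_value, gamma, gae_lambda):
    """rewards, values, masks: [T, N, 1]; next_value: [N, 1].
    masks[t] is 0.0 when the episode ended at step t, 1.0 otherwise."""
    T = rewards.size(0)
    values = torch.cat([values, next_value.unsqueeze(0)], dim=0)  # [T+1, N, 1]
    returns = torch.zeros_like(rewards)
    gae = torch.zeros_like(next_value)
    for t in reversed(range(T)):
        # TD error, with bootstrapping suppressed at episode boundaries
        delta = rewards[t] + gamma * values[t + 1] * masks[t] - values[t]
        gae = delta + gamma * gae_lambda * masks[t] * gae
        returns[t] = gae + values[t]
    return returns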
Example #29
0
def main():

    args = get_args()
    writer = SummaryWriter(os.path.join('logs', args.save_name), )
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    if args.cuda and torch.cuda.is_available() and args.cuda_deterministic:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    log_dir = os.path.expanduser(args.log_dir)
    eval_log_dir = log_dir + "_eval"
    utils.cleanup_log_dir(log_dir)
    utils.cleanup_log_dir(eval_log_dir)

    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    envs = make_vec_envs(
        basic_env.BasicFlatDiscreteEnv,
        args.seed,
        args.num_processes,
        args.gamma,
        args.log_dir,
        device,
        False,
        task='lift',
        gripper_type='RobotiqThreeFingerDexterousGripper',
        robot='Panda',
        controller='JOINT_TORQUE' if args.vel else 'JOINT_POSITION',
        horizon=1000,
        reward_shaping=True)

    actor_critic = Policy(
        envs.observation_space.shape,
        envs.action_space,
        base=Surreal,
        # base=OpenAI,
        # base=MLP_ATTN,
        base_kwargs={
            'recurrent':
            args.recurrent_policy,
            # 'dims': basic_env.BasicFlatEnv().modality_dims
            'config':
            dict(act='relu' if args.relu else 'tanh', rec=args.rec, fc=args.fc)
        })
    print(actor_critic)
    actor_critic.to(device)

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)

    if args.gail:
        assert len(envs.observation_space.shape) == 1
        discr = gail.Discriminator(
            envs.observation_space.shape[0] + envs.action_space.shape[0], 100,
            device)
        file_name = os.path.join(
            args.gail_experts_dir,
            "trajs_{}.pt".format(args.env_name.split('-')[0].lower()))

        expert_dataset = gail.ExpertDataset(file_name,
                                            num_trajectories=4,
                                            subsample_frequency=20)
        drop_last = len(expert_dataset) > args.gail_batch_size
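        # Drop the trailing incomplete batch only when the expert dataset holds
        # more than one batch; otherwise keep it so the loader yields at least one batch.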
        gail_train_loader = torch.utils.data.DataLoader(
            dataset=expert_dataset,
            batch_size=args.gail_batch_size,
            shuffle=True,
            drop_last=drop_last)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=100)

    start = time.time()
    num_updates = int(
        args.num_env_steps) // args.num_steps // args.num_processes

    best_reward = 0
    for j in range(num_updates):

        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            utils.update_linear_schedule(
                agent.optimizer, j, num_updates,
                agent.optimizer.lr if args.algo == "acktr" else args.lr)
            writer.add_scalar('lr', agent.optimizer.param_groups[0]['lr'])

        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks, bad_masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        if args.gail:
            if j >= 10:
                envs.venv.eval()

            gail_epoch = args.gail_epoch
            if j < 10:
                gail_epoch = 100  # Warm up
            for _ in range(gail_epoch):
                discr.update(gail_train_loader, rollouts,
                             utils.get_vec_normalize(envs)._obfilt)

            for step in range(args.num_steps):
                rollouts.rewards[step] = discr.predict_reward(
                    rollouts.obs[step], rollouts.actions[step], args.gamma,
                    rollouts.masks[step])

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.gae_lambda, args.use_proper_time_limits)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)
        end = time.time()
        total_num_steps = (j + 1) * args.num_processes * args.num_steps
        if len(episode_rewards) > 1:
            writer.add_scalar('loss/value', value_loss, total_num_steps)
            writer.add_scalar('loss/policy', action_loss, total_num_steps)
            writer.add_scalar('experiment/num_updates', j, total_num_steps)
            writer.add_scalar('experiment/FPS',
                              int(total_num_steps / (end - start)),
                              total_num_steps)
            writer.add_scalar('experiment/EPISODE MEAN',
                              np.mean(episode_rewards), total_num_steps)
            writer.add_scalar('experiment/EPISODE MEDIAN',
                              np.median(episode_rewards), total_num_steps)
            writer.add_scalar('experiment/EPISODE MIN',
                              np.min(episode_rewards), total_num_steps)
            writer.add_scalar('experiment/EPISODE MAX',
                              np.max(episode_rewards), total_num_steps)

        rollouts.after_update()

        # save for every interval-th episode or for the last epoch
        if len(episode_rewards) > 1 and args.save_dir != "":
            rew = np.mean(episode_rewards)
            if rew > best_reward:
                best_reward = rew
                print('saved with best reward', rew)

                save_path = os.path.join(args.save_dir, args.algo)
                try:
                    os.makedirs(save_path)
                except OSError:
                    pass

                torch.save([
                    actor_critic,
                    getattr(utils.get_vec_normalize(envs), 'obs_rms', None)
                ], os.path.join(save_path, args.save_name + ".pt"))

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), dist_entropy, value_loss,
                        action_loss))

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            obs_rms = utils.get_vec_normalize(envs).obs_rms
            evaluate(actor_critic, obs_rms, args.env_name, args.seed,
                     args.num_processes, eval_log_dir, device)

    writer.close()
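When args.gail is set, Examples #29 and #30 train a discriminator on expert versus policy transitions and then overwrite the environment rewards stored in the rollout buffer before PPO computes returns. The sketch below isolates that relabelling step; discr.predict_reward is the library call used in the examples, and the surrounding loop simply mirrors their structure (a common choice for the learned reward is derived from the discriminator's output, but the exact formula is internal to the library).

def relabel_rewards_with_gail(discr, rollouts, gamma, num_steps):
    # replace the stored environment rewards with the discriminator's learned reward
    for step in range(num_steps):
        rollouts.rewards[step] = discr.predict_reward(
            rollouts.obs[step], rollouts.actions[step], gamma,
            rollouts.masks[step])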
Example #30
0
def main():
    all_episode_rewards = []  # record of episode rewards (6/29)
    all_temp_rewards = []  # record of per-update step rewards (6/29)
    args = get_args()

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    if args.cuda and torch.cuda.is_available() and args.cuda_deterministic:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    log_dir = os.path.expanduser(args.log_dir)
    eval_log_dir = log_dir + "_eval"
    utils.cleanup_log_dir(log_dir)
    utils.cleanup_log_dir(eval_log_dir)

    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, device, False)

    actor_critic = Policy(envs.observation_space.shape,
                          envs.action_space,
                          base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic.to(device)

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)

    if args.gail:
        assert len(envs.observation_space.shape) == 1
        discr = gail.Discriminator(
            envs.observation_space.shape[0] + envs.action_space.shape[0], 100,
            device)
        file_name = os.path.join(
            args.gail_experts_dir,
            "trajs_{}.pt".format(args.env_name.split('-')[0].lower()))

        gail_train_loader = torch.utils.data.DataLoader(
            gail.ExpertDataset(file_name,
                               num_trajectories=4,
                               subsample_frequency=20),
            batch_size=args.gail_batch_size,
            shuffle=True,
            drop_last=True)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)

    start = time.time()
    num_updates = int(
        args.num_env_steps) // args.num_steps // args.num_processes
    print('num_updates ', num_updates)
    print('num_steps ', args.num_steps)
    count = 0
    h5_path = './data/' + args.env_name
    if not os.path.exists(h5_path):
        os.makedirs(h5_path)
    h5_filename = h5_path + '/trajs_' + args.env_name + '_%05d.h5' % (count)
    data = {}
    data['states'] = []
    data['actions'] = []
    data['rewards'] = []
    data['done'] = []
    data['lengths'] = []
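    # Each entry appended below corresponds to one update's rollout:
    # 'states' and 'actions' hold the concatenated per-step arrays,
    # 'rewards' the custom 'myrewards' signal taken from infos[0],
    # 'done' the per-step done flags, and 'lengths' the number of steps.
    # Once 100 rollouts accumulate (and args.save_expert is set), they are
    # flushed to an HDF5 file and the buffers are cleared.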

    episode_step = 0

    for j in range(num_updates):  ### num-steps

        temp_states = []
        temp_actions = []
        temp_rewards = []
        temp_done = []
        temp_lengths = []

        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            utils.update_linear_schedule(
                agent.optimizer, j, num_updates,
                agent.optimizer.lr if args.algo == "acktr" else args.lr)

        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            if j == 0 and step == 0:
                print('obs ', type(rollouts.obs[step]),
                      rollouts.obs[step].shape)
                print('hidden_states ',
                      type(rollouts.recurrent_hidden_states[step]),
                      rollouts.recurrent_hidden_states[step].shape)
                print('action ', type(action), action.shape)
                print('action prob ', type(action_log_prob),
                      action_log_prob.shape)
                print('-' * 20)

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)

            #print(infos)
            #print(reward)
            temp_states += [np.array(rollouts.obs[step].cpu())]
            temp_actions += [np.array(action.cpu())]
            #temp_rewards += [np.array(reward.cpu())]
            # for HalfCheetah, `reward` cannot be used directly (6/29)
            temp_rewards += [np.array([infos[0]['myrewards']])]
            temp_done += [np.array(done)]

            if j == 0 and step == 0:
                print('obs ', type(obs), obs.shape)
                print('reward ', type(reward), reward.shape)
                print('done ', type(done), done.shape)
                print('infos ', len(infos))
                for k, v in infos[0].items():
                    print(k, v.shape)
                print()

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])
                    all_episode_rewards += [info['episode']['r']]  # record (6/29)

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks, bad_masks)

        temp_lengths = len(temp_states)
        temp_states = np.concatenate(temp_states)
        temp_actions = np.concatenate(temp_actions)
        temp_rewards = np.concatenate(temp_rewards)
        temp_done = np.concatenate(temp_done)
        #print('temp_lengths',temp_lengths)
        #print('temp_states', temp_states.shape)
        #print('temp_actions', temp_actions.shape)
        #print('temp_rewards', temp_rewards.shape)
        if j > int(0.4 * num_updates):
            data['states'] += [temp_states]
            data['actions'] += [temp_actions]
            data['rewards'] += [temp_rewards]
            data['lengths'] += [temp_lengths]
            data['done'] += [temp_done]
            #print('temp_lengths',data['lengths'].shape)
            #print('temp_states', data['states'].shape)
            #print('temp_actions', data['actions'].shape)
            #print('temp_rewards', data['rewards'].shape)

            if args.save_expert and len(data['states']) >= 100:
                with h5py.File(h5_filename, 'w') as f:
                    f['states'] = np.array(data['states'])
                    f['actions'] = np.array(data['actions'])
                    f['rewards'] = np.array(data['rewards'])
                    f['done'] = np.array(data['done'])
                    f['lengths'] = np.array(data['lengths'])
                    #print('f_lengths',f['lengths'].shape)
                    #print('f_states', f['states'].shape)
                    #print('f_actions', f['actions'].shape)
                    #print('f_rewards', f['rewards'].shape)

                count += 1
                h5_filename = h5_path + '/trajs_' + args.env_name + '_%05d.h5' % (
                    count)
                data['states'] = []
                data['actions'] = []
                data['rewards'] = []
                data['done'] = []
                data['lengths'] = []

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        if args.gail:
            if j >= 10:
                envs.venv.eval()

            gail_epoch = args.gail_epoch
            if j < 10:
                gail_epoch = 100  # Warm up
            for _ in range(gail_epoch):
                discr.update(gail_train_loader, rollouts,
                             utils.get_vec_normalize(envs)._obfilt)

            for step in range(args.num_steps):
                rollouts.rewards[step] = discr.predict_reward(
                    rollouts.obs[step], rollouts.actions[step], args.gamma,
                    rollouts.masks[step])

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.gae_lambda, args.use_proper_time_limits)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        # save for every interval-th episode or for the last epoch
        if (j % args.save_interval == 0
                or j == num_updates - 1) and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            torch.save([
                actor_critic,
                getattr(utils.get_vec_normalize(envs), 'ob_rms', None)
            ], os.path.join(save_path, args.env_name + "_%d.pt" % (args.seed)))

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            end = time.time()
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), dist_entropy, value_loss,
                        action_loss))
            #np.save(os.path.join(save_path, args.env_name+"_%d"%(args.seed)), all_episode_rewards)  # save records (6/29)
            #print(temp_rewards)
            print("temp rewards size", temp_rewards.shape, "mean",
                  np.mean(temp_rewards), "min", np.min(temp_rewards), "max",
                  np.max(temp_rewards))
            all_temp_rewards += [temp_rewards]
            np.savez(os.path.join(save_path,
                                  args.env_name + "_%d" % (args.seed)),
                     episode=all_episode_rewards,
                     timestep=all_temp_rewards)

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            ob_rms = utils.get_vec_normalize(envs).ob_rms
            evaluate(actor_critic, ob_rms, args.env_name, args.seed,
                     args.num_processes, eval_log_dir, device)
    '''data['states'] = np.array(data['states'])