Example #1
def main():
    args = get_args()

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    if args.cuda and torch.cuda.is_available() and args.cuda_deterministic:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    log_dir = os.path.expanduser(args.log_dir)
    eval_log_dir = log_dir + "_eval"
    utils.cleanup_log_dir(log_dir)
    utils.cleanup_log_dir(eval_log_dir)

    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, device, False,
                         args.custom_gym)

    base = SEVN

    actor_critic = Policy(envs.observation_space.shape,
                          envs.action_space,
                          base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic.to(device)

    if args.algo == 'ppo':
        agent = PPO(actor_critic,
                    args.clip_param,
                    args.ppo_epoch,
                    args.num_mini_batch,
                    args.value_loss_coef,
                    args.entropy_coef,
                    lr=args.lr,
                    eps=args.eps,
                    max_grad_norm=args.max_grad_norm)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)
    episode_length = deque(maxlen=10)
    episode_success_rate = deque(maxlen=100)
    episode_total = 0

    start = time.time()
    num_updates = int(
        args.num_env_steps) // args.num_steps // args.num_processes
    for j in range(num_updates):

        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            utils.update_linear_schedule(agent.optimizer, j, num_updates,
                                         args.lr)

        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])
                    episode_length.append(info['episode']['l'])
                    episode_success_rate.append(
                        info['was_successful_trajectory'])
                    episode_total += 1

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks, bad_masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.gae_lambda, args.use_proper_time_limits)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        # save for every interval-th episode or for the last epoch
        if (j % args.save_interval == 0
                or j == num_updates - 1) and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            torch.save([
                actor_critic,
                getattr(utils.get_vec_normalize(envs), 'ob_rms', None)
            ], os.path.join(save_path, args.env_name + ".pt"))

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            end = time.time()
            writer.add_scalars('Train/Episode Reward', {
                "Reward Mean": np.mean(episode_rewards),
                "Reward Min": np.min(episode_rewards),
                "Reward Max": np.max(episode_rewards)
            },
                               global_step=total_num_steps)
            writer.add_scalars('Train/Episode Length', {
                "Episode Length Mean": np.mean(episode_length),
                "Episode Length Min": np.min(episode_length),
                "Episode Length Max": np.max(episode_length)
            },
                               global_step=total_num_steps)
            writer.add_scalar("Train/Episode Reward Mean",
                              np.mean(episode_rewards),
                              global_step=total_num_steps)
            writer.add_scalar("Train/Episode Length Mean",
                              np.mean(episode_length),
                              global_step=total_num_steps)
            writer.add_scalar("Train/Episode Success Rate",
                              np.mean(episode_success_rate),
                              global_step=total_num_steps)

            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, dist entropy {:.3f}, value loss {:.3f}, action loss {:.3f}\n"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), dist_entropy, value_loss,
                        action_loss))

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            ob_rms = utils.get_vec_normalize(envs).ob_rms
            evaluate(actor_critic, ob_rms, args.env_name, args.seed,
                     args.num_processes, eval_log_dir, device)
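
The evaluate helper called at the end of Example #1 is not shown on this page. Below is a minimal sketch of such a helper, assuming the same module-level imports (torch, np, make_vec_envs, utils) and the Policy.act / VecEnv interface used in the evaluation blocks of Examples #5 and #6; the exact argument order of make_vec_envs varies between the examples here, so adjust the call accordingly.

def evaluate(actor_critic, ob_rms, env_name, seed, num_processes, eval_log_dir,
             device):
    # Build evaluation envs and reuse the training observation-normalization stats.
    eval_envs = make_vec_envs(env_name, seed + num_processes, num_processes,
                              None, eval_log_dir, device, True)
    vec_norm = utils.get_vec_normalize(eval_envs)
    if vec_norm is not None:
        vec_norm.eval()
        vec_norm.ob_rms = ob_rms

    eval_episode_rewards = []
    obs = eval_envs.reset()
    eval_recurrent_hidden_states = torch.zeros(
        num_processes, actor_critic.recurrent_hidden_state_size, device=device)
    eval_masks = torch.zeros(num_processes, 1, device=device)

    # Act deterministically until ten evaluation episodes have finished.
    while len(eval_episode_rewards) < 10:
        with torch.no_grad():
            _, action, _, eval_recurrent_hidden_states = actor_critic.act(
                obs, eval_recurrent_hidden_states, eval_masks,
                deterministic=True)
        obs, _, done, infos = eval_envs.step(action)
        eval_masks = torch.tensor(
            [[0.0] if done_ else [1.0] for done_ in done],
            dtype=torch.float32, device=device)
        for info in infos:
            if 'episode' in info.keys():
                eval_episode_rewards.append(info['episode']['r'])

    eval_envs.close()
    print(" Evaluation using {} episodes: mean reward {:.5f}\n".format(
        len(eval_episode_rewards), np.mean(eval_episode_rewards)))
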
Example #2
        def run(self, time, S_time_interval, S_send_data_size, S_chunk_len, S_rebuf, S_buffer_size, S_play_time_len,
                S_end_delay, S_decision_flag, S_buffer_flag, S_cdn_flag, end_of_video, cdn_newest_id, download_id,
                cdn_has_frame, IntialVars):
            torch.set_num_threads(1)
            device = torch.device("cuda:0" if args.cuda else "cpu")

            if args.vis:
                from visdom import Visdom
                viz = Visdom(port=args.port)
                win = None

            # The online env in AItrans: it should expose an observation space,
            # an action space, and the other attributes of a gym-style VecEnv.
            # See envs.py in the reference repository for the expected
            # observation and action space formats.
            envs = None  # TODO: construct the AItrans online env here (left unassigned in the original)

            actor_critic = Policy(envs.observation_space.shape, envs.action_space,
                                  base_kwargs={'recurrent': args.recurrent_policy})
            actor_critic.to(device)

            # choose the algorithm, now we only have a2c
            if args.algo == 'a2c':
                agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef,
                                       args.entropy_coef, lr=args.lr,
                                       eps=args.eps, alpha=args.alpha,
                                       max_grad_norm=args.max_grad_norm)
            elif args.algo == 'ppo':
                agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch,
                                 args.value_loss_coef, args.entropy_coef, lr=args.lr,
                                 eps=args.eps,
                                 max_grad_norm=args.max_grad_norm)
            elif args.algo == 'acktr':
                agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef,
                                       args.entropy_coef, acktr=True)

            rollouts = RolloutStorage(args.num_steps, args.num_processes,
                                      envs.observation_space.shape, envs.action_space,
                                      actor_critic.recurrent_hidden_state_size)

            # The initial observation (left unassigned in the original)
            obs = None  # TODO: replace with the online env's initial observation
            rollouts.obs[0].copy_(obs)
            rollouts.to(device)

            episode_reward = deque(maxlen=10)
            start = time.time()
            for j in range(num_updates):

                if args.use_linear_lr_decay:
                    # decrease learning rate linearly
                    if args.algo == "acktr":
                        # use optimizer's learning rate since it's hard-coded in kfac.py
                        update_linear_schedule(agent.optimizer, j, num_updates, agent.optimizer.lr)
                    else:
                        update_linear_schedule(agent.optimizer, j, num_updates, args.lr)

                if args.algo == 'ppo' and args.use_linear_lr_decay:
                    agent.clip_param = args.clip_param * (1 - j / float(num_updates))

                for step in range(args.num_steps):
                    # Sample actions
                    with torch.no_grad():
                        value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                            rollouts.obs[step],
                            rollouts.recurrent_hidden_states[step],
                            rollouts.masks[step])
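
Examples #1, #2, and most of the later ones anneal the optimizer's learning rate through update_linear_schedule (or utils.update_linear_schedule). A minimal sketch of such a helper, assuming a standard torch.optim optimizer, is:

def update_linear_schedule(optimizer, epoch, total_num_epochs, initial_lr):
    """Decay the learning rate linearly from initial_lr to 0 over training."""
    lr = initial_lr - (initial_lr * (epoch / float(total_num_epochs)))
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
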
Example #3
def main(args, idx):
    # Create summary writer
    writer_path = os.path.join(args.log_dir, args.task_id, args.run_id + '-' + str(idx))
    writer = SummaryWriter(log_dir=writer_path)

    # Create training envs
    envs = make_vec_envs(args.task_id, args.seed, args.num_processes,
                         args.gamma, args.monitor_dir, args.device)
    obs_size = envs.observation_space.shape[0]
    act_size = envs.action_space.shape[0]

    # Create NN
    actor_critic = Policy(obs_size, act_size,
                          action_range=[envs.action_space.low[0], envs.action_space.high[0]])
    actor_critic.to(args.device)

    # Create ppo agent
    agent = PPO(
        actor_critic=actor_critic,
        device=args.device,
        lr=args.lr,
        eps=args.eps,
        max_grad_norm=args.max_grad_norm,
        clip_param=args.clip_param,
        ppo_epoch=args.ppo_epoch,
        num_mini_batch=args.num_mini_batch,
        value_loss_coef=args.value_loss_coef,
        entropy_coef=args.entropy_coef,
    )

    # Create replay buffer
    buffer = ReplayBuffer(args.num_steps, args.num_processes, obs_size, act_size)
    buffer.to(args.device)

    # Reset envs
    obs = envs.reset()
    buffer.obs[0].copy_(obs)

    episode_rewards = deque(maxlen=10)

    start = time.time()
    num_updates = int(args.num_env_steps) // args.num_steps // args.num_processes
    for j in tqdm(range(num_updates)):

        if args.use_linear_lr_decay:
            update_linear_schedule(agent.optimizer, j, num_updates, args.lr)

        # Collect trajectories and compute returns
        with torch.no_grad():
            for step in range(args.num_steps):
                # Sample actions
                action = actor_critic(buffer.obs[step])

                # Get trajectories from envs
                obs, reward, done, infos = envs.step(action)
                mask = torch.tensor(
                    [[0.0] if done_ else [1.0] for done_ in done],
                    dtype=torch.float, device=args.device)
                for info in infos:
                    if 'episode' in info.keys():
                        episode_rewards.append(info['episode']['r'])

                # Store trajectories
                buffer.insert(obs, action, reward, mask)

            # Compute returns
            batch_obs = buffer.obs.view(-1, obs_size)
            value = actor_critic.get_value(batch_obs).view(args.num_steps + 1, args.num_processes, 1)
            batch_obs = buffer.obs[:-1].view(-1, obs_size)
            batch_action = buffer.actions.view(-1, act_size)
            action_log_prob = actor_critic.get_act_log_prob(batch_obs, batch_action).view(args.num_steps,
                                                                                          args.num_processes, 1)
            buffer.update_value_log_prob(value, action_log_prob)
            buffer.compute_returns(args.gamma, args.gae_lambda)

        # Update policy
        agent_output = agent.update(buffer)
        buffer.after_update()

        # Log stuff
        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            end = time.time()
            speed = int(total_num_steps / (end - start))
            print(
                "Updates {}, num timesteps {}, FPS {} \n "
                "Last {} training episodes: mean/median reward {:.1f}/{:.1f}, "
                "min/max reward {:.1f}/{:.1f}\n"
                    .format(j, total_num_steps,
                            speed,
                            len(episode_rewards), np.mean(episode_rewards),
                            np.median(episode_rewards), np.min(episode_rewards),
                            np.max(episode_rewards),
                            ))
            writer.add_scalar('mean_reward', np.mean(episode_rewards), total_num_steps)
            writer.add_scalar('speed', speed, total_num_steps)
            for key in agent_output.keys():
                writer.add_scalar(key, agent_output[key], total_num_steps)

            if args.task_id == 'Pendulum-v0' and np.mean(episode_rewards) > -250:
                break

    envs.close()
    writer.close()
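
The buffer.compute_returns(args.gamma, args.gae_lambda) call above is where Generalized Advantage Estimation (GAE) would normally happen. A sketch of a GAE-style return computation over a stored rollout is shown below; the [T, N, 1] tensor shapes and the T+1-length values/masks are assumptions modeled on the RolloutStorage layout used elsewhere on this page.

import torch


def compute_gae_returns(rewards, values, masks, gamma, gae_lambda):
    """rewards: [T, N, 1]; values, masks: [T + 1, N, 1]; returns: [T, N, 1]."""
    T = rewards.size(0)
    returns = torch.zeros_like(rewards)
    gae = torch.zeros_like(values[0])
    for step in reversed(range(T)):
        # TD error; masks zero out bootstrapping across episode boundaries.
        delta = (rewards[step] + gamma * values[step + 1] * masks[step + 1]
                 - values[step])
        gae = delta + gamma * gae_lambda * masks[step + 1] * gae
        returns[step] = gae + values[step]
    return returns
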
def main():
  device = 'cpu'
  acc_steps = []
  acc_scores = []
  torch.set_num_threads(1)

  envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                       args.gamma, args.log_dir, args.add_timestep,
                       device, False)

  # get cloned policy and recovered reward function
  policy_reward_dir = args.rewards_dir
  policy_dir = args.policies_dir

  policy_reward = Policy(envs.observation_space.shape, envs.action_space)

  policy_reward_file_name = policy_reward_dir + '/reward_' + args.expe + '.pth'
  policy_reward_sd = torch.load(policy_reward_file_name)
  policy_reward.load_state_dict(policy_reward_sd)

  actor_critic = Policy(envs.observation_space.shape, envs.action_space)

  policy_file_name = policy_dir + '/last_policy_' + args.expe + '.pth'
  policy_sd = torch.load(policy_file_name)
  actor_critic.load_state_dict(policy_sd)
  actor_critic.to(device)

  agent = PPO(actor_critic, args.clip_param, args.ppo_epoch,
              args.num_mini_batch, args.value_loss_coef, args.entropy_coef,
              lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm)

  rollouts = RolloutStorage(args.num_steps, args.num_processes,
                            envs.observation_space.shape, envs.action_space)

  obs = envs.reset()
  rollouts.obs[0].copy_(obs)
  rollouts.to(device)

  episode_rewards = collections.deque(maxlen=10)

  for j in range(num_updates):

    if args.use_linear_lr_decay:
      # decrease learning rate linearly
      update_linear_schedule(agent.optimizer, j, num_updates, args.lr)
      agent.clip_param = args.clip_param  * (1 - j / float(num_updates))

    for step in range(args.num_steps):
      # Sample actions
      with torch.no_grad():
        value, action, action_log_prob = actor_critic.act(
            rollouts.obs[step],
            rollouts.masks[step])

      obs, _, done, infos = envs.step(action)
      if step > 1 and step % 1000 == 0:
        # Force an episode boundary; keep done a per-process list so the
        # mask construction below can iterate over it.
        done = [True for _ in range(args.num_processes)]

      # use infered reward:
      with torch.no_grad():
        # _, reward = shapes(rollouts.obs[step], 0)
        _, action_log_probs, _, _ = policy_reward.evaluate_actions(
            rollouts.obs[step], None, None, action)
        reward = action_log_probs

      for info in infos:
        # if 'episode' in info.keys():
        #  episode_rewards.append(info['episode']['r'])
        r = 0
        for key, val in info.items():
          if 'reward' in key:
            r += val
        episode_rewards.append(r)

      # If done then clean the history of observations.
      masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                 for done_ in done])

      rollouts.insert(obs, action, action_log_prob,
                      value, reward, masks)

    with torch.no_grad():
      next_value = actor_critic.get_value(rollouts.obs[-1],
                                          rollouts.masks[-1]).detach()

    rollouts.compute_returns(next_value, args.gamma, args.tau)

    value_loss, action_loss, dist_entropy = agent.update(rollouts)

    rollouts.after_update()

    # save for every interval-th episode or for the last epoch
    if (j % args.save_interval == 0 or j == num_updates - 1) and args.save_dir:
      save_path = os.path.join(args.save_dir, 'ppo')
      try:
        os.makedirs(save_path)
      except OSError:
        pass

      # A really ugly way to save a model to CPU
      save_model = actor_critic

      save_model = [save_model,
                    getattr(get_vec_normalize(envs), 'ob_rms', None)]

      torch.save(save_model, os.path.join(save_path, args.env_name + '.pt'))

    total_num_steps = (j + 1) * args.num_processes * args.num_steps

    if j % args.log_interval == 0 and len(episode_rewards) > 1:
      print('Updates', j,
            'num timesteps', total_num_steps,
            '\n Last', len(episode_rewards),
            'training episodes: mean/median reward',
            '{:.1f}'.format(np.mean(episode_rewards)),
            '/{:.1f}'.format(np.median(episode_rewards)),
            'min/max reward',
            '{:.1f}'.format(np.min(episode_rewards)),
            '/{:.1f}'.format(np.max(episode_rewards)),
            'dist entropy', dist_entropy,
            'value loss', value_loss,
            'action loss', action_loss)

    if len(episode_rewards) > 1:
      acc_steps.append(total_num_steps)
      acc_scores.append(np.mean(episode_rewards))

    if (args.eval_interval is not None
        and len(episode_rewards) > 1
        and j % args.eval_interval == 0):
      eval_envs = make_vec_envs(args.env_name, args.seed + args.num_processes,
                                args.num_processes, args.gamma, eval_log_dir,
                                args.add_timestep, device, True)

      vec_norm = get_vec_normalize(eval_envs)
      if vec_norm is not None:
        vec_norm.eval()
        vec_norm.ob_rms = get_vec_normalize(envs).ob_rms

      eval_episode_rewards = []

      obs = eval_envs.reset()
      eval_masks = torch.zeros(args.num_processes, 1, device=device)

      while len(eval_episode_rewards) < 10:
        with torch.no_grad():
          _, action, _ = actor_critic.act(
              obs, eval_masks, deterministic=True)

        # Observe reward and next obs
        obs, reward, done, infos = eval_envs.step(action)

        eval_masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                        for done_ in done])
        for info in infos:
          if 'episode' in info.keys():
            eval_episode_rewards.append(info['episode']['r'])

      eval_envs.close()

      print('Evaluation using',
            len(eval_episode_rewards),
            'episodes: mean reward',
            '{:.5f}\n'.format(np.mean(eval_episode_rewards)))

  scores_file_name = args.scores_dir + '/learner_scores_' + args.expe + '.npy'
  steps_file_name = args.scores_dir + '/learner_steps_' + args.expe + '.npy'
  np.save(scores_file_name, np.array(acc_scores))
  np.save(steps_file_name, np.array(acc_steps))
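
Several examples read the observation-normalization statistics through get_vec_normalize(envs).ob_rms. A small sketch of that helper, assuming the env stack is wrapped in a baselines-style VecNormalize somewhere in its wrapper chain:

def get_vec_normalize(venv):
    """Walk the VecEnv wrapper chain and return the VecNormalize layer, if any."""
    if isinstance(venv, VecNormalize):  # VecNormalize: the normalization wrapper applied by make_vec_envs
        return venv
    elif hasattr(venv, 'venv'):
        return get_vec_normalize(venv.venv)
    return None
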
Example #5
def main():
    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args_iko.cuda else "cpu")

    if args_iko.vis:
        from visdom import Visdom
        viz = Visdom(port=args_iko.port)
        win = None

    envs = make_vec_envs(args_iko.env_name, args_iko.seed,
                         args_iko.num_processes, args_iko.gamma,
                         args_iko.log_dir, args_iko.add_timestep, device,
                         False)

    actor_critic = Policy(envs.observation_space.shape,
                          envs.action_space,
                          base_kwargs={'recurrent': args_iko.recurrent_policy})
    actor_critic.to(device)

    action_shape = 3
    reward_model = RewardModel(11 * 11 * 6, 1, 64, 64)
    reward_model.to(device)

    if args_iko.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args_iko.value_loss_coef,
                               args_iko.entropy_coef,
                               lr=args_iko.lr,
                               eps=args_iko.eps,
                               alpha=args_iko.alpha,
                               max_grad_norm=args_iko.max_grad_norm)
    elif args_iko.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args_iko.clip_param,
                         args_iko.ppo_epoch,
                         args_iko.num_mini_batch,
                         args_iko.value_loss_coef,
                         args_iko.entropy_coef,
                         args_iko.use_singh,
                         reward_model,
                         lr=args_iko.lr,
                         eps=args_iko.eps,
                         max_grad_norm=args_iko.max_grad_norm)
    elif args_iko.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args_iko.value_loss_coef,
                               args_iko.entropy_coef,
                               acktr=True)

    rollouts = RolloutStorage(args_iko.num_steps, args_iko.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)

    start = time.time()
    for j in range(num_updates):

        if args_iko.use_linear_lr_decay:
            # decrease learning rate linearly
            if args_iko.algo == "acktr":
                # use optimizer's learning rate since it's hard-coded in kfac.py
                update_linear_schedule(agent.optimizer, j, num_updates,
                                       agent.optimizer.lr)
            else:
                update_linear_schedule(agent.optimizer, j, num_updates,
                                       args_iko.lr)

        if args_iko.algo == 'ppo' and args_iko.use_linear_clip_decay:
            agent.clip_param = args_iko.clip_param * (1 -
                                                      j / float(num_updates))

        reward_train = []
        reward_block_penalty = []
        reward_bel_gt = []
        reward_bel_gt_nonlog = []
        reward_infogain = []
        reward_bel_ent = []
        reward_hit = []
        reward_dist = []
        reward_inv_dist = []

        for step in range(args_iko.num_steps):
            # Sample actions
            # print(step, args_iko.num_steps)
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)
            reward_train.append(reward)
            # print("infos is ", infos)
            # reward_b.append(infos[0]['auxillary_reward'])
            # print("infos is ",infos[0]['auxillary_reward'])
            reward_block_penalty.append(infos[0]['reward_block_penalty'])
            reward_bel_gt.append(infos[0]['reward_bel_gt'])
            reward_bel_gt_nonlog.append(infos[0]['reward_bel_gt_nonlog'])
            reward_infogain.append(infos[0]['reward_infogain'])
            reward_bel_ent.append(infos[0]['reward_bel_ent'])
            reward_hit.append(infos[0]['reward_hit'])
            reward_dist.append(infos[0]['reward_dist'])
            reward_inv_dist.append(infos[0]['reward_inv_dist'])
            # print(reward)

            reward.to(device)
            reward_model.to(device)
            if args_iko.use_singh:
                # print("using learning IR")
                my_reward = reward_model(obs.clone().to(device),
                                         action.clone().float()).detach()
                my_reward.to(device)
                reward = reward + args_iko.singh_coef * my_reward.type(
                    torch.FloatTensor)

            # for info in infos:
            #     if 'episode' in info.keys():
            #         episode_rewards.append(info['episode']['r'])
            #         print("infos is ",infos[0]['auxillary_reward'])
            #         print("info is",info['episode']['r'] )

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks)

        # print("mean reward_a", np.mean(reward_train))
        # print("mean reward_block_penalty", np.mean(reward_block_penalty))
        # print("mean reward_bel_gt", np.mean(reward_bel_gt))
        # print("mean reward_bel_gt_nonlog", np.mean(reward_bel_gt_nonlog))
        # print("mean reward_infogain", np.mean(reward_infogain))
        # print("mean reward_bel_ent", np.mean(reward_bel_ent))
        # print("mean reward_hit", np.mean(reward_hit))
        # print("mean reward_dist", np.mean(reward_dist))
        # print("mean reward_inv_dist", np.mean(reward_inv_dist))

        total_num_steps = (j + 1) * args_iko.num_processes * args_iko.num_steps
        writer.add_scalar('mean_reward_train', np.mean(reward_train),
                          total_num_steps)
        writer.add_scalar('mean_reward_block_penalty',
                          np.mean(reward_block_penalty), total_num_steps)
        writer.add_scalar('mean_reward_bel_gt', np.mean(reward_bel_gt),
                          total_num_steps)
        writer.add_scalar('mean_reward_bel_gt_nonlog',
                          np.mean(reward_bel_gt_nonlog), total_num_steps)
        writer.add_scalar('mean_reward_infogain', np.mean(reward_infogain),
                          total_num_steps)
        writer.add_scalar('mean_reward_bel_ent', np.mean(reward_bel_ent),
                          total_num_steps)
        writer.add_scalar('mean_reward_hit', np.mean(reward_hit),
                          total_num_steps)
        writer.add_scalar('mean_reward_dist', np.mean(reward_dist),
                          total_num_steps)
        writer.add_scalar('mean_reward_inv_dist', np.mean(reward_inv_dist),
                          total_num_steps)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args_iko.use_gae, args_iko.gamma,
                                 args_iko.tau)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        # save for every interval-th episode or for the last epoch
        if (j % args_iko.save_interval == 0
                or j == num_updates - 1) and args_iko.save_dir != "":
            save_path = os.path.join(args_iko.save_dir, args_iko.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args_iko.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            save_model = [
                save_model,
                getattr(get_vec_normalize(envs), 'ob_rms', None)
            ]

            torch.save(
                save_model,
                os.path.join(
                    save_path, 'ugl' + str(args_iko.use_gt_likelihood) +
                    'block-pen-' + str(args_iko.penalty_for_block) + '_' +
                    'explore-' + str(args_iko.rew_explore) + '_' + 'bel-new-' +
                    str(args_iko.rew_bel_new) + '_' + 'bel-ent-' +
                    str(args_iko.rew_bel_ent) + '_' + 'infogain-' +
                    str(args_iko.rew_infogain) + '_' + 'bel-gt-nolog-' +
                    str(args_iko.rew_bel_gt_nonlog) + '_' + 'bel-gt-' +
                    str(args_iko.rew_bel_gt) + '_' + 'dist-' +
                    str(args_iko.rew_dist) + '_' + 'hit-' +
                    str(args_iko.rew_hit) + '_' + 'inv-dist-' +
                    str(args_iko.rew_inv_dist) + args_iko.algo + ".pt"))

        total_num_steps = (j + 1) * args_iko.num_processes * args_iko.num_steps

        if j % args_iko.log_interval == 0 and len(episode_rewards) > 1:
            end = time.time()
            print("mean reward_train", np.mean(reward_train))
            print("mean reward_block_penalty", np.mean(reward_block_penalty))
            # print("Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n".
            #     format(j, total_num_steps,
            #            int(total_num_steps / (end - start)),
            #            len(episode_rewards),
            #            np.mean(episode_rewards),
            #            np.median(episode_rewards),
            #            np.min(episode_rewards),
            #            np.max(episode_rewards), dist_entropy,
            #            value_loss, action_loss))
            # writer.add_scalar('mean_reward', np.mean(episode_rewards), total_num_steps)
            # writer.add_scalar('min_reward', np.min(episode_rewards), total_num_steps)
            # writer.add_scalar('max_reward', np.max(episode_rewards), total_num_steps)
            # writer.add_scalar('success_rate', np.mean(episode_successes), total_num_steps)

        if (args_iko.eval_interval is not None and len(episode_rewards) > 1
                and j % args_iko.eval_interval == 0):
            eval_envs = make_vec_envs(args_iko.env_name,
                                      args_iko.seed + args_iko.num_processes,
                                      args_iko.num_processes, args_iko.gamma,
                                      eval_log_dir, args_iko.add_timestep,
                                      device, True)

            vec_norm = get_vec_normalize(eval_envs)
            if vec_norm is not None:
                vec_norm.eval()
                vec_norm.ob_rms = get_vec_normalize(envs).ob_rms

            eval_episode_rewards = []

            obs = eval_envs.reset()
            eval_recurrent_hidden_states = torch.zeros(
                args_iko.num_processes,
                actor_critic.recurrent_hidden_state_size,
                device=device)
            eval_masks = torch.zeros(args_iko.num_processes, 1, device=device)

            while len(eval_episode_rewards) < 10:
                with torch.no_grad():
                    _, action, _, eval_recurrent_hidden_states = actor_critic.act(
                        obs,
                        eval_recurrent_hidden_states,
                        eval_masks,
                        deterministic=True)

                # Observe reward and next obs
                obs, reward, done, infos = eval_envs.step(action)

                eval_masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                                for done_ in done])
                for info in infos:
                    if 'episode' in info.keys():
                        eval_episode_rewards.append(info['episode']['r'])

            eval_envs.close()

            print(" Evaluation using {} episodes: mean reward {:.5f}\n".format(
                len(eval_episode_rewards), np.mean(eval_episode_rewards)))

        if args_iko.vis and j % args_iko.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args_iko.log_dir,
                                  args_iko.env_name, args_iko.algo,
                                  args_iko.num_env_steps)
            except IOError:
                pass
    writer.close()
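
Example #5's learned intrinsic reward relies on a RewardModel whose definition is not shown. A plausible minimal sketch consistent with how it is constructed and called above (RewardModel(11 * 11 * 6, 1, 64, 64), invoked as reward_model(obs, action)) is given below; the MLP layout and the flatten-and-concatenate step are assumptions.

import torch
import torch.nn as nn


class RewardModel(nn.Module):
    """Sketch of a small MLP mapping (observation, action) to a scalar reward."""

    def __init__(self, obs_dim, act_dim, hidden1, hidden2):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(obs_dim + act_dim, hidden1),
            nn.ReLU(),
            nn.Linear(hidden1, hidden2),
            nn.ReLU(),
            nn.Linear(hidden2, 1),
        )

    def forward(self, obs, action):
        # Flatten image-like observations and append the action before scoring.
        x = torch.cat([obs.view(obs.size(0), -1),
                       action.view(action.size(0), -1).float()], dim=1)
        return self.net(x)
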
Example #6
def main():
    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.port)
        win = None

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                        args.gamma, args.log_dir, args.add_timestep, device, False)

    actor_critic = Policy(envs.observation_space.shape, envs.action_space,
        base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic.to(device)

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef,
                               args.entropy_coef, lr=args.lr,
                               eps=args.eps, alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch,
                         args.value_loss_coef, args.entropy_coef, lr=args.lr,
                               eps=args.eps,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef,
                               args.entropy_coef, acktr=True)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                        envs.observation_space.shape, envs.action_space,
                        actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)

    start = time.time()
    for j in range(num_updates):

        if args.use_linear_lr_decay:            
            # decrease learning rate linearly
            if args.algo == "acktr":
                # use optimizer's learning rate since it's hard-coded in kfac.py
                update_linear_schedule(agent.optimizer, j, num_updates, agent.optimizer.lr)
            else:
                update_linear_schedule(agent.optimizer, j, num_updates, args.lr)

        if args.algo == 'ppo' and args.use_linear_lr_decay:      
            agent.clip_param = args.clip_param  * (1 - j / float(num_updates))
                
        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                        rollouts.obs[step],
                        rollouts.recurrent_hidden_states[step],
                        rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob, value, reward, masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(rollouts.obs[-1],
                                                rollouts.recurrent_hidden_states[-1],
                                                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            save_model = [save_model,
                          getattr(get_vec_normalize(envs), 'ob_rms', None)]

            torch.save(save_model, os.path.join(save_path, args.env_name + ".pt"))

        total_num_steps = (j + 1) * args.num_processes * args.num_steps

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            end = time.time()
            print("Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n".
                format(j, total_num_steps,
                       int(total_num_steps / (end - start)),
                       len(episode_rewards),
                       np.mean(episode_rewards),
                       np.median(episode_rewards),
                       np.min(episode_rewards),
                       np.max(episode_rewards), dist_entropy,
                       value_loss, action_loss))

        if (args.eval_interval is not None
                and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            eval_envs = make_vec_envs(
                args.env_name, args.seed + args.num_processes, args.num_processes,
                args.gamma, eval_log_dir, args.add_timestep, device, True)

            vec_norm = get_vec_normalize(eval_envs)
            if vec_norm is not None:
                vec_norm.eval()
                vec_norm.ob_rms = get_vec_normalize(envs).ob_rms

            eval_episode_rewards = []

            obs = eval_envs.reset()
            eval_recurrent_hidden_states = torch.zeros(args.num_processes,
                            actor_critic.recurrent_hidden_state_size, device=device)
            eval_masks = torch.zeros(args.num_processes, 1, device=device)

            while len(eval_episode_rewards) < 10:
                with torch.no_grad():
                    _, action, _, eval_recurrent_hidden_states = actor_critic.act(
                        obs, eval_recurrent_hidden_states, eval_masks, deterministic=True)

                # Observe reward and next obs
                obs, reward, done, infos = eval_envs.step(action)

                eval_masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                                for done_ in done])
                for info in infos:
                    if 'episode' in info.keys():
                        eval_episode_rewards.append(info['episode']['r'])

            eval_envs.close()

            print(" Evaluation using {} episodes: mean reward {:.5f}\n".
                format(len(eval_episode_rewards),
                       np.mean(eval_episode_rewards)))

        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.env_name,
                                  args.algo, args.num_frames)
            except IOError:
                pass
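
Each of the PPO agents above ultimately calls agent.update(rollouts), which optimizes the clipped surrogate objective. A condensed sketch of a single PPO minibatch step, assuming advantages and old log-probabilities have already been gathered from the rollout storage and using the evaluate_actions interface seen elsewhere on this page:

import torch


def ppo_minibatch_step(actor_critic, optimizer, obs, actions, old_log_probs,
                       returns, advantages, clip_param, value_loss_coef,
                       entropy_coef, max_grad_norm):
    """One clipped-surrogate PPO update on a single minibatch (sketch)."""
    values, log_probs, dist_entropy, _ = actor_critic.evaluate_actions(
        obs, None, None, actions)

    ratio = torch.exp(log_probs - old_log_probs)
    surr1 = ratio * advantages
    surr2 = torch.clamp(ratio, 1.0 - clip_param, 1.0 + clip_param) * advantages
    action_loss = -torch.min(surr1, surr2).mean()
    value_loss = 0.5 * (returns - values).pow(2).mean()

    optimizer.zero_grad()
    (value_loss * value_loss_coef + action_loss
     - dist_entropy * entropy_coef).backward()
    torch.nn.utils.clip_grad_norm_(actor_critic.parameters(), max_grad_norm)
    optimizer.step()
    return value_loss.item(), action_loss.item(), dist_entropy.item()
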
Example #7
    agent.rollouts.obs[0].copy_(obs[1])
    agent.rollouts.to(device)

    # start training
    agent.train()
    start = time.time()

    num_updates = int(args.num_env_steps // args.num_processes //
                      args.num_steps)

    for update in range(num_updates):
        # decrease learning rate linearly
        if args.use_linear_lr_decay:
            if args.share_optim:
                utils.update_linear_schedule(optimizer=agent.optimizer,
                                             update=update,
                                             total_num_updates=num_updates,
                                             initial_lr=args.pi_lr)
            else:
                utils.update_linear_schedule(optimizer=agent.policy_optimizer,
                                             update=update,
                                             total_num_updates=num_updates,
                                             initial_lr=args.pi_lr)

                utils.update_linear_schedule(
                    optimizer=agent.value_fn_optimizer,
                    update=update,
                    total_num_updates=num_updates,
                    initial_lr=args.v_lr)

        extrinsic_rewards = []
        episode_length = []
Example #8
def main():
    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    run_id = "alpha{}".format(args.gcn_alpha)
    if args.use_logger:
        from utils import Logger
        folder = "{}/{}".format(args.folder, run_id)
        logger = Logger(algo_name=args.algo,
                        environment_name=args.env_name,
                        folder=folder,
                        seed=args.seed)
        logger.save_args(args)

        print("---------------------------------------")
        print('Saving to', logger.save_folder)
        print("---------------------------------------")

    else:
        print("---------------------------------------")
        print('NOTE : NOT SAVING RESULTS')
        print("---------------------------------------")
    all_rewards = []

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, args.add_timestep, device,
                         False)

    actor_critic = Policy(envs.observation_space.shape,
                          envs.action_space,
                          args.env_name,
                          base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic.to(device)

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size,
                              actor_critic.base.output_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    ############################
    # GCN Model and optimizer
    from pygcn.train import update_graph
    from pygcn.models import GCN, GAT, SAGE
    assert args.gnn in ['gcn', 'gat', 'sage']

    if args.gnn == 'gat':
        gcn_model = GAT(nfeat=actor_critic.base.output_size,
                        nhid=args.gcn_hidden)
    elif args.gnn == 'sage':
        gcn_model = SAGE(nfeat=actor_critic.base.output_size,
                         nhid=args.gcn_hidden)
    elif args.gnn == 'gcn':
        gcn_model = GCN(nfeat=actor_critic.base.output_size,
                        nhid=args.gcn_hidden)

    gcn_model.to(device)
    gcn_optimizer = optim.Adam(gcn_model.parameters(),
                               lr=args.gcn_lr,
                               weight_decay=args.gcn_weight_decay)
    gcn_loss = nn.NLLLoss()
    gcn_states = [[] for _ in range(args.num_processes)]
    Gs = [nx.Graph() for _ in range(args.num_processes)]
    node_ptrs = [0 for _ in range(args.num_processes)]
    rew_states = [[] for _ in range(args.num_processes)]
    ############################

    episode_rewards = deque(maxlen=100)
    avg_fwdloss = deque(maxlen=100)
    rew_rms = RunningMeanStd(shape=())
    delay_rew = torch.zeros([args.num_processes, 1])
    delay_step = torch.zeros([args.num_processes])

    start = time.time()
    for j in range(num_updates):

        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            update_linear_schedule(
                agent.optimizer, j, num_updates,
                agent.optimizer.lr if args.algo == "acktr" else args.lr)

        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                (value, action, action_log_prob,
                 recurrent_hidden_states, hidden_states) = actor_critic.act(
                     rollouts.obs[step],
                     rollouts.recurrent_hidden_states[step],
                     rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)
            delay_rew += reward
            delay_step += 1

            for idx, (info, hid,
                      eps_done) in enumerate(zip(infos, hidden_states, done)):

                if eps_done or delay_step[idx] == args.reward_freq:
                    reward[idx] = delay_rew[idx]
                    delay_rew[idx] = delay_step[idx] = 0
                else:
                    reward[idx] = 0

                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

                if args.gcn_alpha < 1.0:
                    gcn_states[idx].append(hid)
                    node_ptrs[idx] += 1
                    if not eps_done:
                        Gs[idx].add_edge(node_ptrs[idx] - 1, node_ptrs[idx])
                    if reward[idx] != 0. or eps_done:
                        rew_states[idx].append(
                            [node_ptrs[idx] - 1, reward[idx]])
                    if eps_done:
                        adj = (nx.adjacency_matrix(Gs[idx])
                               if len(Gs[idx].nodes)
                               else sp.csr_matrix(np.eye(1, dtype='int64')))
                        update_graph(gcn_model, gcn_optimizer,
                                     torch.stack(gcn_states[idx]), adj,
                                     rew_states[idx], gcn_loss, args, envs)
                        gcn_states[idx] = []
                        Gs[idx] = nx.Graph()
                        node_ptrs[idx] = 0
                        rew_states[idx] = []

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks,
                            hidden_states)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau, gcn_model, args.gcn_alpha)
        agent.update(rollouts)
        rollouts.after_update()

        ####################### Saving and book-keeping #######################
        if (j % int(num_updates / 5.) == 0
                or j == num_updates - 1) and args.save_dir != "":
            print('Saving model')
            print()

            save_dir = "{}/{}/{}".format(args.save_dir, args.folder, run_id)
            save_path = os.path.join(save_dir, args.algo, 'seed' +
                                     str(args.seed)) + '_iter' + str(j)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            save_gcn = gcn_model
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()
                save_gcn = copy.deepcopy(gcn_model).cpu()

            save_model = [
                save_gcn, save_model,
                hasattr(envs.venv, 'ob_rms') and envs.venv.ob_rms or None
            ]

            torch.save(save_model,
                       os.path.join(save_path, args.env_name + "ac.pt"))

        total_num_steps = (j + 1) * args.num_processes * args.num_steps

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            end = time.time()
            print("Updates {}, num timesteps {}, FPS {} \n Last {}\
             training episodes: mean/median reward {:.2f}/{:.2f},\
              min/max reward {:.2f}/{:.2f}, success rate {:.2f}\n".format(
                j,
                total_num_steps,
                int(total_num_steps / (end - start)),
                len(episode_rewards),
                np.mean(episode_rewards),
                np.median(episode_rewards),
                np.min(episode_rewards),
                np.max(episode_rewards),
                np.count_nonzero(np.greater(episode_rewards, 0)) /
                len(episode_rewards),
            ))

            all_rewards.append(np.mean(episode_rewards))
            if args.use_logger:
                logger.save_task_results(all_rewards)
        ####################### Saving and book-keeping #######################

    envs.close()
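
Example #8 pushes the policy's hidden states through a graph network (pygcn's GCN/GAT/SAGE) built over the episode transition graph before mixing its output into compute_returns via args.gcn_alpha. The pygcn models themselves are not shown; a standard single graph-convolution layer over a normalized adjacency matrix looks roughly like this sketch (the real pygcn classes may differ):

import torch
import torch.nn as nn


class GraphConvolution(nn.Module):
    """One GCN layer computing H' = A_hat @ H @ W (sketch)."""

    def __init__(self, in_features, out_features):
        super().__init__()
        self.weight = nn.Parameter(torch.empty(in_features, out_features))
        nn.init.xavier_uniform_(self.weight)

    def forward(self, x, adj):
        # x: [num_nodes, in_features]; adj: dense normalized adjacency matrix.
        support = x @ self.weight
        return adj @ support
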
Example #9
def main():
    device = 'cpu'
    acc_steps = []
    acc_scores = []
    torch.set_num_threads(1)
    print('here')

    if args.env_name == 'Reacher-v2':
        rbf1 = build_features_reacher2(.2, 5, 2)
        len_rbf = rbf1._K
        len_features = len_rbf + 1
    if args.env_name == 'Hopper-v2':
        len_features = 3
    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, args.add_timestep, device,
                         False)

    actor_critic = Policy(envs.observation_space.shape, envs.action_space)

    actor_critic.to(device)

    agent = PPO(actor_critic,
                args.clip_param,
                args.ppo_epoch,
                args.num_mini_batch,
                args.value_loss_coef,
                args.entropy_coef,
                lr=args.lr,
                eps=args.eps,
                max_grad_norm=args.max_grad_norm)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              len_features)
    print('here2')
    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)
    episode_rewards = collections.deque(maxlen=10)
    num_updates = 20
    for j in range(num_updates):

        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            update_linear_schedule(agent.optimizer, j, num_updates, args.lr)
            agent.clip_param = args.clip_param * (1 - j / float(num_updates))

        # Prepare demos
        demo_actions = np.zeros(
            (1, args.num_processes, envs.action_space.shape[0]))
        demo_states = np.zeros(
            (1, args.num_processes, envs.observation_space.shape[0]))

        demo_features = np.zeros((1, args.num_processes, len_features))
        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob = actor_critic.act(
                    rollouts.obs[step], rollouts.masks[step])

            # obs, reward and next obs
            demo_actions = np.concatenate(
                [demo_actions,
                 action.reshape(1, args.num_processes, -1)], 0)
            demo_states = np.concatenate([
                demo_states, rollouts.obs[step].reshape(
                    1, args.num_processes, -1)
            ], 0)
            feat_rewards = np.zeros((args.num_processes, len_features))
            if args.env_name == 'Hopper-v2':
                if args.num_processes > 1:
                    pos_before = envs.get_sim_data()
            obs, reward, done, infos = envs.step(action)
            if args.env_name == 'Hopper-v2':
                if args.num_processes > 1:
                    pos_after = envs.get_sim_data()
                    for num_p in range(args.num_processes):
                        feat_1 = pos_after[num_p] - pos_before[num_p]
                        feat_2 = 0
                        if not done[num_p]:
                            feat_2 = 1
                        # feat_2 = np.array([1 for _ in range(args.num_processes)])
                        feat_3 = np.array(
                            [np.linalg.norm(action[num_p],
                                            ord=2)**2]).flatten()
                        feat_rewards[num_p] = np.array(
                            [feat_1, feat_2, feat_3])
            if args.env_name == 'Reacher-v2':
                if args.num_processes > 1:
                    body_data = envs.get_body_data()
                    for num_p in range(args.num_processes):
                        rbf1_ = rbf1(body_data[num_p][:-1])
                        rbf4_ = np.array(
                            [np.linalg.norm(action[num_p], ord=2)**2])
                        feat_rewards[num_p] = np.concatenate(
                            (rbf1_.reshape(-1), rbf4_))
                else:
                    rbf1_ = rbf1(
                        (envs.envs[0].env.env.get_body_com("fingertip") -
                         envs.envs[0].env.env.get_body_com("target"))[:-1])
                    rbf4_ = np.array([-np.square(action[0]).sum()])
                    feat_rewards[0] = np.concatenate(
                        (rbf1_.reshape(-1), rbf4_))
            demo_features = np.concatenate([
                demo_features,
                feat_rewards.reshape(1, args.num_processes, -1)
            ], 0)
            if step > 1 and step % 1000 == 0:
                done = [True for _ in range(args.num_processes)]

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            rollouts.insert(obs, action, action_log_prob, \
                            value, reward, masks, feat_rewards)

        # Save demos:
        action_file_name = demos_expe_dir + '/actions_step_' + str(j) + '.npy'
        state_file_name = demos_expe_dir + '/states_step_' + str(j) + '.npy'
        rew_feat_file_name = demos_expe_dir + '/rew_feat_step_' + str(
            j) + '.npy'
        policy_file_name = demos_expe_dir + '/policy_step_' + str(j) + '.pth'
        np.save(action_file_name, demo_actions)
        np.save(state_file_name, demo_states)
        np.save(rew_feat_file_name, demo_features)
        torch.save(actor_critic.state_dict(), policy_file_name)

        with torch.no_grad():
            next_value = actor_critic.get_value(rollouts.obs[-1],
                                                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.gamma, args.tau)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        # save for every interval-th episode or for the last epoch
        if (j % args.save_interval == 0
                or j == num_updates - 1) and args.save_dir:
            save_path = os.path.join(args.save_dir, 'ppo')
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic

            save_model = [
                save_model,
                getattr(get_vec_normalize(envs), 'ob_rms', None)
            ]

            torch.save(save_model,
                       os.path.join(save_path, args.env_name + '.pt'))

        total_num_steps = (j + 1) * args.num_processes * args.num_steps

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            print('Updates', j, 'num timesteps', total_num_steps,
                  '\n Last', len(episode_rewards),
                  'training episodes: mean/median reward',
                  '{:.1f}'.format(np.mean(episode_rewards)),
                  '/{:.1f}'.format(np.median(episode_rewards)),
                  'min/max reward', '{:.1f}'.format(np.min(episode_rewards)),
                  '/{:.1f}'.format(np.max(episode_rewards)), 'dist entropy',
                  dist_entropy, 'value loss', value_loss, 'action loss',
                  action_loss)

        if len(episode_rewards) > 1:
            acc_steps.append(total_num_steps)
            acc_scores.append(np.mean(episode_rewards))
            #print(acc_scores)

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            eval_envs = make_vec_envs(args.env_name,
                                      args.seed + args.num_processes,
                                      args.num_processes, args.gamma,
                                      eval_log_dir, args.add_timestep, device,
                                      True)

            vec_norm = get_vec_normalize(eval_envs)
            if vec_norm is not None:
                vec_norm.eval()
                vec_norm.ob_rms = get_vec_normalize(envs).ob_rms

            eval_episode_rewards = []

            obs = eval_envs.reset()
            eval_masks = torch.zeros(args.num_processes, 1, device=device)

            while len(eval_episode_rewards) < 10:
                with torch.no_grad():
                    _, action, _ = actor_critic.act(obs,
                                                    eval_masks,
                                                    deterministic=True)

                # Observe reward and next obs
                obs, reward, done, infos = eval_envs.step(action)
                eval_masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                                for done_ in done])
                for info in infos:
                    if 'episode' in info.keys():
                        eval_episode_rewards.append(info['episode']['r'])

            eval_envs.close()

            print('Evaluation using', len(eval_episode_rewards),
                  'episodes: mean reward',
                  '{:.5f}\n'.format(np.mean(eval_episode_rewards)))

    scores_file_name = args.scores_dir + '/learner_scores_{}_{}.npy'.format(
        args.env_name, args.expe)
    steps_file_name = args.scores_dir + '/learner_steps_{}_{}.npy'.format(
        args.env_name, args.expe)
    np.save(scores_file_name, np.array(acc_scores))
    np.save(steps_file_name, np.array(acc_steps))
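

# The .npy/.pth files written in the update loop above can be reloaded later,
# e.g. to inspect the collected demonstrations. A minimal sketch, assuming the
# naming scheme used above; `load_demos` is a hypothetical helper and not part
# of the original code.
def load_demos(demos_expe_dir, step_id):
    import numpy as np
    import torch

    actions = np.load(demos_expe_dir + '/actions_step_{}.npy'.format(step_id))
    states = np.load(demos_expe_dir + '/states_step_{}.npy'.format(step_id))
    rew_feats = np.load(
        demos_expe_dir + '/rew_feat_step_{}.npy'.format(step_id))
    # The policy checkpoint is a plain state_dict keyed by parameter name; it
    # can be loaded into a Policy instance with the same architecture.
    policy_state = torch.load(
        demos_expe_dir + '/policy_step_{}.pth'.format(step_id),
        map_location='cpu')
    return actions, states, rew_feats, policy_state
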
def main():
    '''
    Train PPO policies on each of the training environments.
    '''
    args = get_args()

    try:
        os.makedirs(args.log_dir)
    except OSError:
        pass

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    if args.cuda and torch.cuda.is_available() and args.cuda_deterministic:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    envs = make_vec_envs(args, device)

    actor_critic = Policy(envs.observation_space.shape,
                          envs.action_space,
                          base_kwargs={'recurrent': False})
    actor_critic.to(device)

    agent = algo.PPO(actor_critic,
                     args.clip_param,
                     args.ppo_epoch,
                     args.num_mini_batch,
                     args.value_loss_coef,
                     args.entropy_coef,
                     lr=args.lr,
                     eps=args.eps,
                     max_grad_norm=args.max_grad_norm)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    ep_reward = np.zeros(args.num_processes)
    episode_rewards = deque(maxlen=100)
    num_updates = int(
        args.num_env_steps) // args.num_steps // args.num_processes
    for j in range(num_updates):
        # decrease learning rate linearly
        utils.update_linear_schedule(agent.optimizer, j, num_updates, args.lr)

        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)
            # spaceship: record only the terminal-step reward as the episode return
            if 'spaceship' in args.env_name:
                for i in range(len(done)):
                    if done[i]:
                        episode_rewards.append(reward[i].item())
            # elif 'swimmer' in args.env_name:
            else:
                for i in range(len(done)):
                    ep_reward[i] += reward[i].item()
                    if done[i]:
                        episode_rewards.append(ep_reward[i])
                        ep_reward[i] = 0
            # if 'ant' in args.env_name:
            #     for info in infos:
            #         if 'episode' in info.keys():
            #             episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks, bad_masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, True, args.gamma, args.gae_lambda,
                                 True)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        if (j % args.save_interval == 0
                or j == num_updates - 1) and args.save_dir != "":
            try:
                os.makedirs(args.save_dir)
            except OSError:
                pass


            torch.save(
                actor_critic.state_dict(),
                os.path.join(
                    args.save_dir, "ppo.{}.env{}.seed{}.pt".format(
                        args.env_name, args.default_ind, args.seed)))

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            print("\nUpdates {}, num timesteps {}, Last {} training episodes: \
                \n mean/median reward {:.2f}/{:.2f}, min/max reward {:.2f}/{:.2f}"
                  .format(j, total_num_steps, len(episode_rewards),
                          np.mean(episode_rewards), np.median(episode_rewards),
                          np.min(episode_rewards), np.max(episode_rewards)))

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            ob_rms = utils.get_vec_normalize(envs).ob_rms
            evaluate(actor_critic, ob_rms, args.env_name, args.seed,
                     args.num_processes, device)

    envs.close()
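
# For reference: the linear learning-rate decay invoked at the top of the
# update loop (utils.update_linear_schedule) is conventionally implemented as
# below in this family of PPO code bases. This is a sketch; the project's own
# helper may differ in detail.
def update_linear_schedule(optimizer, epoch, total_num_epochs, initial_lr):
    # Decay the learning rate linearly from initial_lr down to 0 over training
    lr = initial_lr - (initial_lr * (epoch / float(total_num_epochs)))
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
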
Example #11
0
def main():
    args = get_args()

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    envs = make_env(args.env_name, args.seed, args.gamma)

    model = MujocoModel(envs.observation_space.shape[0],
                        envs.action_space.shape[0])
    model.to(device)

    algorithm = PPO(model,
                    args.clip_param,
                    args.value_loss_coef,
                    args.entropy_coef,
                    initial_lr=args.lr,
                    eps=args.eps,
                    max_grad_norm=args.max_grad_norm)

    agent = MujocoAgent(algorithm, device)

    rollouts = RolloutStorage(args.num_steps, envs.observation_space.shape[0],
                              envs.action_space.shape[0])

    obs = envs.reset()
    rollouts.obs[0] = np.copy(obs)

    episode_rewards = deque(maxlen=10)

    num_updates = int(args.num_env_steps) // args.num_steps
    for j in range(num_updates):

        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            utils.update_linear_schedule(algorithm.optimizer, j, num_updates,
                                         args.lr)

        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob = agent.sample(
                    rollouts.obs[step])  # why use obs from rollouts??? (this seems wrong)

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            rollouts.append(obs, action, action_log_prob, value, reward, masks,
                            bad_masks)

        with torch.no_grad():
            next_value = agent.value(rollouts.obs[-1])

        value_loss, action_loss, dist_entropy = agent.learn(
            next_value, args.gamma, args.gae_lambda, args.ppo_epoch,
            args.num_mini_batch, rollouts)

        rollouts.after_update()

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args.num_steps
            print(
                "Updates {}, num timesteps {},\n Last {} training episodes: "
                "mean/median reward {:.1f}/{:.1f}, "
                "min/max reward {:.1f}/{:.1f}, dist entropy {}, "
                "value loss {}, action loss {}\n".format(
                    j, total_num_steps, len(episode_rewards),
                    np.mean(episode_rewards), np.median(episode_rewards),
                    np.min(episode_rewards), np.max(episode_rewards),
                    dist_entropy, value_loss, action_loss))

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            ob_rms = utils.get_vec_normalize(envs).ob_rms
            eval_mean_reward = evaluate(agent, ob_rms, args.env_name,
                                        args.seed, device)
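
# A sketch of what the evaluate() call above could look like: run a few
# deterministic episodes in a freshly created env, applying the training-time
# observation normalization (ob_rms) by hand. The real helper is defined
# elsewhere, so the gym-based env construction and the agent.predict() call
# used here are assumptions.
def evaluate(agent, ob_rms, env_name, seed, device, num_episodes=10):
    # `device` is kept only to mirror the call signature above
    import gym
    import numpy as np

    env = gym.make(env_name)
    env.seed(seed + 1000)
    returns = []
    for _ in range(num_episodes):
        obs, done, ep_ret = env.reset(), False, 0.0
        while not done:
            # Same normalization that VecNormalize applies during training
            norm_obs = np.clip(
                (obs - ob_rms.mean) / np.sqrt(ob_rms.var + 1e-8), -10.0, 10.0)
            action = agent.predict(norm_obs.astype('float32'))  # assumed API
            obs, reward, done, _ = env.step(action)
            ep_ret += reward
        returns.append(ep_ret)
    return np.mean(returns)
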
Example #12
0
    print("start training")
    obs = agent.world.reset()
    print("initial", obs)
    rollouts.obs[0].copy_(torch.from_numpy(obs))
    rollouts.to(device)

    start = time.time()
    num_updates = int(total_supposed_steps) // num_steps // num_processes
    all_return = []
    all_length = []
    for j in range(num_updates):
        print("runs", j + 1)

        if use_linear_lr_decay:
            # decrease learning rate linearly
            update_linear_schedule(agent.optimizer, j, num_updates, initial_lr)
        cumul_return = []
        episo_length = []
        episode_rewards = []
        episode_lengths = 0.0
        for step in range(num_steps):
            # Sample actions
            with torch.no_grad():
                agent.model.eval()
                value, action, action_log_prob = agent.sample_actions(
                    rollouts.obs[step], rollouts.masks[step], device)

            # Observe reward and next obs
            action_world = action.cpu().numpy().reshape(-1)
            obs, reward, done, success = agent.world.step(action_world)
            episode_lengths += 1.0
Example #13
0
def main(algorithm, opt, loss, ppo, normalization, alpha, seed, num_processes,
         num_steps, num_test_steps, num_stack, log_interval, test_log_interval,
         num_frames, reset_encoder_in_test, freeze_in_test, environment, tasks,
         test_tasks, architecture, num_env_restarts, warmup_period_frames,
         final_period_frames, load_id, testing_frames, option_init,
         num_simultaneous_restarts, save_dir, cuda, add_timestep, _run):

    import os
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    # ACKTR currently broken
    assert algorithm in ['a2c', 'ppo']

    # If all tasks are ints, convert them to actual ints
    try:
        tasks = list(map(int, tasks))
        test_tasks = list(map(int, test_tasks))
    except (ValueError, TypeError):
        # tasks are given as names rather than ints; keep them as strings
        pass

    num_tasks = len(tasks)
    num_processes_per_task = num_processes // num_tasks
    # note: num_frames is specified PER TASK
    num_updates = int(num_frames) * num_tasks // num_steps // num_processes
    print('Num updates:{}\n'.format(num_updates))
    assert num_updates > 0, 'num_updates is 0, increase number of frames'

    # There will be `num_env_restarts` environment restarts between
    # warmup_updates and (num_updates - final_updates). This leaves a warmup
    # period and a final training period in which to inspect the fully
    # trained options.
    warmup_updates = int(warmup_period_frames) * \
        num_tasks // num_steps // num_processes
    final_updates = int(final_period_frames) * \
        num_tasks // num_steps // num_processes
    testing_updates = int(testing_frames) * \
        num_tasks // num_test_steps // num_processes

    restart_interval = (num_updates - warmup_updates -
                        final_updates) // (num_env_restarts + 1)

    print('Num tasks:{}\nNum processes per task:{}\n'.format(
        num_tasks, num_processes_per_task))

    torch.manual_seed(seed)
    if cuda:
        torch.cuda.manual_seed(seed)

    print("#######")
    print(
        "WARNING: All rewards are clipped or normalized, but we are plotting the average return after clipping. Sacred plots will be inaccurate if per-timestep rewards are out of the range [-1, 1]"
    )
    print("#######")

    torch.set_num_threads(1)

    envs = [
        make_env(environment, seed, i, add_timestep)
        for i in range(num_tasks * num_processes_per_task)
    ]
    testing_envs = [
        make_env(environment, seed, i, add_timestep)
        for i in range(num_tasks * num_processes_per_task)
    ]
    constraint = []
    test_constraint = []
    task_seed = []
    for task in tasks:
        constraint += [task] * num_processes_per_task
        task_seed += [np.random.randint(LONG_NUMBER)] * num_processes_per_task
    for task in test_tasks:
        test_constraint += [task] * num_processes_per_task

    if num_processes > 1:
        envs = MTSubprocVecEnv(envs)
        testing_envs = MTSubprocVecEnv(testing_envs)
    else:
        envs = DummyVecEnv(envs)
        testing_envs = DummyVecEnv(testing_envs)

    if len(envs.observation_space.shape) == 1:
        envs = MTVecNormalize(envs,
                              ob=normalization['ob'],
                              ret=normalization['ret'],
                              gamma=loss['gamma'])
        testing_envs = MTVecNormalize(testing_envs,
                                      ob=normalization['ob'],
                                      ret=False,
                                      gamma=loss['gamma'])

    returned_task_seed = envs.draw_and_set_task(constraint=constraint,
                                                seed=task_seed)
    testing_envs.draw_and_set_task(constraint=constraint,
                                   seed=returned_task_seed)

    print("Task seeds: {}".format(returned_task_seed))

    obs_shape = envs.observation_space.shape
    obs_shape = (obs_shape[0] * num_stack, *obs_shape[1:])

    hierarchical_actor_critic = HierarchicalPolicy(num_tasks,
                                                   num_processes_per_task,
                                                   alpha,
                                                   obs_shape,
                                                   envs.action_space,
                                                   loss,
                                                   architecture,
                                                   option_init=option_init)

    if load_id is not None:
        docs = get_docs(db_uri, db_name, 'runs')
        doc = docs.find_one({'_id': load_id})
        name = "model_after_training"
        # config = doc['config']
        # config.update({'num_processes': len(config['tasks']), 'cuda': False})
        file_id = get_file_id(doc=doc, file_name=name)
        save_file_from_db(file_id=file_id,
                          destination='model_tmp_{}.pyt'.format(_run._id),
                          db_uri=db_uri,
                          db_name=db_name)
        state_dict = torch.load("model_tmp_{}.pyt".format(_run._id),
                                map_location=lambda storage, loc: storage)
        hierarchical_actor_critic.load_state_dict(state_dict)
        os.remove('model_tmp_{}.pyt'.format(_run._id))
        print("Loading model parameters complete.")

        if isinstance(envs, MTVecNormalize) and envs.ob_rms is not None:
            print("Loading ob_rms normalization")
            ob_name = name + ".npy"
            file_id = get_file_id(doc=doc, file_name=ob_name)
            save_file_from_db(file_id=file_id,
                              destination='ob_rms_tmp.npy',
                              db_uri=db_uri,
                              db_name=db_name)
            rms_dict = np.load("ob_rms_tmp.npy", allow_pickle=True)[()]
            print(rms_dict)
            envs.ob_rms.mean = rms_dict['mean']
            envs.ob_rms.var = rms_dict['var']
            envs.ob_rms.count = rms_dict['count']
            testing_envs.ob_rms.mean = rms_dict['mean']
            testing_envs.ob_rms.var = rms_dict['var']
            testing_envs.ob_rms.count = rms_dict['count']
            os.remove("ob_rms_tmp.npy")

    num_parameters = 0
    for p in hierarchical_actor_critic.parameters():
        num_parameters += p.nelement()

    num_params_master = 0
    for p in hierarchical_actor_critic.masters[0].parameters():
        num_params_master += p.nelement()

    num_params_option = 0
    for p in hierarchical_actor_critic.options[0].parameters():
        num_params_option += p.nelement()

    print(hierarchical_actor_critic)
    print("Total Number parameters: {}".format(num_parameters))
    print("Number parameters master: {}".format(num_params_master))
    print("Number parameters option: {}".format(num_params_option))

    if envs.action_space.__class__.__name__ == "Discrete":
        action_shape = 1
    else:
        action_shape = envs.action_space.shape[0]

    if cuda:
        hierarchical_actor_critic.cuda()

    if algorithm == 'a2c':
        agent = algo.A2C(hierarchical_actor_critic, loss=loss, opt=opt)
    elif algorithm == 'ppo':
        agent = algo.PPO(hierarchical_actor_critic, loss, opt, ppo)
    elif algorithm == 'acktr':
        raise NotImplementedError("ACKTR not implemented with HRL")
        # agent = algo.A2C_ACKTR(hierarchical_actor_critic, value_loss_coef,
        #                        entropy_coef, acktr=True)

    def reset_envs(storage_length):
        rollouts = RolloutStorage(num_tasks, storage_length,
                                  num_processes_per_task, obs_shape,
                                  envs.action_space, loss)
        current_obs = torch.zeros(num_tasks, num_processes_per_task,
                                  *obs_shape)

        obs = envs.reset()

        update_current_obs(obs, current_obs, obs_shape, num_stack, num_tasks,
                           num_processes_per_task)
        for task in range(num_tasks):
            rollouts.obs[task, 0].copy_(current_obs[task])
        if cuda:
            current_obs = current_obs.cuda()
            rollouts.cuda()

        # These variables are used to compute average rewards for all processes.
        episode_rewards = torch.zeros([num_tasks, num_processes_per_task, 1])
        final_rewards = torch.zeros([num_tasks, num_processes_per_task, 1])
        episode_length = torch.zeros([num_tasks, num_processes_per_task, 1])
        final_length = torch.zeros([num_tasks, num_processes_per_task, 1])
        episode_terminations = torch.zeros(
            [num_tasks, num_processes_per_task, 1])
        final_terminations = torch.zeros(
            [num_tasks, num_processes_per_task, 1])
        master_terminations = torch.zeros(
            [num_tasks, num_processes_per_task, 1])
        final_master_terminations = torch.zeros(
            [num_tasks, num_processes_per_task, 1])
        return (rollouts, current_obs, episode_rewards, final_rewards,
                episode_length, final_length, episode_terminations,
                final_terminations, master_terminations,
                final_master_terminations)

    rollouts, current_obs, episode_rewards, final_rewards, episode_length, final_length, \
        episode_terminations, final_terminations, master_terminations, final_master_terminations = reset_envs(
            storage_length=num_steps)

    start = time.time()
    hierarchical_actor_critic.train()
    rollout_length = num_steps
    assert num_tasks >= num_simultaneous_restarts
    randomSampler = data.sampler.BatchSampler(
        data.sampler.RandomSampler(range(num_tasks)),
        batch_size=num_simultaneous_restarts,
        drop_last=True)
    rndSampler_iter = iter(randomSampler)
    iterator = iter(range(num_updates + testing_updates))

    for j in iterator:

        # Load old model if load_id is given
        if load_id is not None and j == 0:
            # Skip to j == num_updates - 1
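            # islice(iterator, k, k) consumes k items from the shared iterator
            # without yielding any, so the next(iterator) call below resumes at
            # j == num_updates - 1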
            next(islice(iterator, num_updates - 2, num_updates - 2), None)
            j = next(iterator)
            ppo['use_linear_clip_decay'] = False
            opt['use_lr_decay'] = False

        # Updated Learning rate
        j_mod = j % num_updates
        lr_schedule_length = num_updates if j <= num_updates else testing_updates
        if opt['use_lr_decay']:
            update_linear_schedule(agent.optimizer, j_mod, lr_schedule_length,
                                   opt['lr'])

        # Update clip param
        if algorithm == 'ppo' and ppo['use_linear_clip_decay']:
            agent.clip_param = ppo['clip_param'] * \
                (1 - j_mod / float(lr_schedule_length))

        # Update c_kl_b
        if loss['c_kl_b_1'] is not None:
            per = np.clip((j - warmup_updates) / (num_updates - final_updates),
                          0, 1)
            cur_val = (1 - per) * loss['c_kl_b_orig'] + per * loss['c_kl_b_1']
            rollouts.loss['c_kl_b'] = cur_val
            if not loss['fixed_a']:
                rollouts.loss['c_kl_a'] = cur_val

        # Update c_kl_a
        if loss['c_kl_a_1'] is not None:
            per = np.clip((j - warmup_updates) / (num_updates - final_updates),
                          0, 1)
            cur_val = (1 - per) * loss['c_kl_a_orig'] + per * loss['c_kl_a_1']
            rollouts.loss['c_kl_a'] = cur_val
            # if not loss['fixed_b']:
            #     rollouts.loss['c_kl_a'] = cur_val

        # Update entropy_coef
        train_progress = j / (num_updates - final_updates)
        if not agent.hierarchical_actor_critic.training:
            # Testing
            elc = loss['entropy_loss_coef_test']
        elif loss['entropy_loss_coef_1'] is not None:
            factor = max(0, 1 - train_progress)
            elc = (loss['entropy_loss_coef_0'] * factor +
                   loss['entropy_loss_coef_1'] * (1 - factor))
        else:
            elc = loss['entropy_loss_coef_0']
        loss['elc'] = elc

        for step in range(rollout_length):
            # Sample actions
            """
            Note regarding z:
            z_t is treated the same way as s_t with regards to saving because at t=0 we need access to
            s_{-1} and z_{t-1}. HOWEVER, that means that the code is off by one compared to the
            equations:
            In equations: z_t depends on s_t and z_{t-1}
            Here: z_t depends on s_{t-1} and z_{t-1}
            """

            with torch.no_grad():
                b, b_log_prob, _ = hierarchical_actor_critic.executePolicy(
                    obs=rollouts.obs[:, step],
                    z=rollouts.z[:, step],
                    policy_type="termination",
                    masks=rollouts.masks[:, step])

                z, z_log_prob, _ = hierarchical_actor_critic.executePolicy(
                    obs=rollouts.obs[:, step],
                    z=rollouts.z[:, step],
                    policy_type="master",
                    b=b)

                action, action_log_prob, _ = hierarchical_actor_critic.executePolicy(
                    obs=rollouts.obs[:, step], z=z, policy_type="option")

                # Evaluate Log probs for regularized reward
                b_prior_log_prob = hierarchical_actor_critic.evaluatePrior(
                    obs=rollouts.obs[:, step],
                    z=rollouts.z[:, step],
                    action=b,
                    policy_type="termination",
                    masks=rollouts.masks[:, step])
                action_prior_log_prob = hierarchical_actor_critic.evaluatePrior(
                    obs=rollouts.obs[:, step],
                    z=z,
                    action=action,
                    policy_type="option")
                value_pred = hierarchical_actor_critic.get_U(
                    obs=rollouts.obs[:, step], previous_z=z)

            # Flatten actions:
            _, _, *action_shape = action.size()
            flat_action = action.view(num_tasks * num_processes_per_task,
                                      *action_shape)
            cpu_actions = flat_action.squeeze(1).cpu().numpy()

            # Observe reward and next obs
            obs, reward, done, info = envs.step(cpu_actions)

            single_obs_shape = obs.shape[1:]
            obs = np.reshape(np.stack(obs),
                             (num_tasks, num_processes_per_task) +
                             single_obs_shape)
            reward = np.reshape(np.stack(reward),
                                (num_tasks, num_processes_per_task))
            done = np.reshape(np.stack(done),
                              (num_tasks, num_processes_per_task))

            reward = torch.from_numpy(np.expand_dims(np.stack(reward),
                                                     2)).float()

            episode_rewards += reward
            episode_length += 1
            episode_terminations += b.cpu().float()

            # delta_b marks processes whose active option z changed this step
            # (a "master termination")
            delta_b = 1 - (z == rollouts.z[:, step]).int()
            master_terminations += delta_b.cpu().float()

            # If done then clean the history of observations.
            masks = torch.ones((num_tasks, num_processes_per_task, 1),
                               dtype=torch.float32)
            for task in range(num_tasks):
                for process in range(num_processes_per_task):
                    masks[task, process] = 0.0 if done[task][process] else 1.0

            # Episode bookkeeping: when an episode ends (mask == 0), copy the
            # accumulated totals into final_* and reset the running accumulators
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            final_length *= masks
            final_length += (1 - masks) * episode_length
            episode_length *= masks

            final_terminations *= masks
            # Each episode starts off with a termination, hence the -1
            final_terminations += (1 - masks) * (episode_terminations - 1)
            episode_terminations *= masks

            final_master_terminations *= masks
            # Each episode starts off with a termination, hence the -1
            final_master_terminations += (1 - masks) * \
                (master_terminations - 1)
            master_terminations *= masks

            # Mask observations
            if cuda:
                masks = masks.cuda()
            if current_obs.dim() == 5:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_obs *= masks

            update_current_obs(obs, current_obs, obs_shape, num_stack,
                               num_tasks, num_processes_per_task)

            rollouts.insert(current_obs=current_obs,
                            z=z,
                            b=b,
                            action=action,
                            value_pred=value_pred,
                            action_log_prob=action_log_prob,
                            action_prior_log_prob=action_prior_log_prob,
                            z_log_prob=z_log_prob,
                            b_log_prob=b_log_prob,
                            b_prior_log_prob=b_prior_log_prob,
                            reward=reward,
                            mask=masks)

        with torch.no_grad():
            # obs[-1] is s_{t+1} in equations
            # z[-1] is z_{t} in equations
            # Basically: Those are the last values we know which are s_{t+1} and z_t
            next_value_u = hierarchical_actor_critic.get_U(
                obs=rollouts.obs[:, -1], previous_z=rollouts.z[:, -1])

        rollouts.store_next_value(next_value_u)
        rollouts.compute_returns()
        losses = agent.update(rollouts)

        rollouts.after_update()

        # While still in training and in between warmup_updates and final_updates
        if warmup_updates < j < num_updates and j < (
                num_updates - final_updates) and (
                    j - warmup_updates) % restart_interval == 0:
            # Get tasks to reset
            try:
                next_restart_tasks = next(rndSampler_iter)
            except StopIteration:
                rndSampler_iter = iter(randomSampler)
                next_restart_tasks = next(rndSampler_iter)

            returned_task_seed = reset_task(next_restart_tasks,
                                            hierarchical_actor_critic,
                                            constraint, agent,
                                            returned_task_seed, envs,
                                            testing_envs)
            # load_master=train_load_master_params)

            # Unfortunately there is no simple way to restart only the
            # environments that were actually reset
            rollouts, current_obs, episode_rewards, final_rewards, episode_length, final_length,\
                episode_terminations, final_terminations, master_terminations, final_master_terminations = reset_envs(
                    storage_length=num_steps)

        # When we reached the end of the training phase, reset all tasks
        if j == num_updates - 1:
            save_model(hierarchical_actor_critic, "model_after_training", envs)
            print("Reset all tasks, stop updating prior, start testing")
            last_training_task_seed = returned_task_seed.copy()
            hierarchical_actor_critic.eval()
            returned_task_seed = reset_task(
                restart_tasks=range(num_tasks),
                hierarchical_actor_critic=hierarchical_actor_critic,
                constraint=test_constraint,
                agent=agent,
                returned_task_seed=returned_task_seed,
                envs=envs,
                testing_envs=testing_envs)

            print("Freezing and resetting for test")
            hierarchical_actor_critic.frozen['prior'] = freeze_in_test['prior']
            hierarchical_actor_critic.frozen['option'] = freeze_in_test[
                'option']

            if architecture['shared_encoder']:
                hierarchical_actor_critic.split_encoder()

            # This will create a new Encoder!
            if reset_encoder_in_test['option']:
                hierarchical_actor_critic.reset_encoder('option')

            if reset_encoder_in_test['master']:
                hierarchical_actor_critic.reset_encoder('master')
            agent.init_optimizer(hierarchical_actor_critic)

            # Unfortunately there is no simple way to restart only the
            # environments that were actually reset
            rollouts, current_obs, episode_rewards, final_rewards, episode_length, final_length,\
                episode_terminations, final_terminations, master_terminations, final_master_terminations = reset_envs(
                    storage_length=num_test_steps)
            rollout_length = num_test_steps

        if (j < num_updates
                and j % log_interval == 0) or (j >= num_updates
                                               and j % test_log_interval == 0):

            test_performance = test_policy(testing_envs,
                                           hierarchical_actor_critic)
            end = time.time()
            if j % (log_interval * 10) == 0:
                printHeader()

            if j < num_updates:
                total_num_steps = (j + 1) * num_processes * num_steps
            else:
                total_num_steps = (
                    num_updates * num_steps +
                    (j + 1 - num_updates) * num_test_steps) * num_processes

            # FPS PER TASK (because num_frames is also per task!)
            fps = int(total_num_steps / num_tasks / (end - start))

            logging.info('Updt: {:5} |{:5} {:5}|{:5}|{:5}|{:5}'.format(
                str(j / num_updates)[:5],
                str(fps),
                str(final_rewards.mean().item())[:5],
                str(final_rewards.median().item())[:5],
                str(final_rewards.min().item())[:5],
                str(final_rewards.max().item())[:5],
            ))

            for task in range(num_tasks):
                _run.log_scalar('return.avg.{}'.format(task),
                                float(final_rewards[task].mean()),
                                total_num_steps // num_tasks)
                _run.log_scalar('return.test.avg.{}'.format(task),
                                float(test_performance[task].mean()),
                                total_num_steps // num_tasks)

            _run.log_scalar('return.avg',
                            final_rewards.mean().item(),
                            total_num_steps // num_tasks)
            _run.log_scalar('return.test.avg',
                            test_performance.mean().item(),
                            total_num_steps // num_tasks)
            _run.log_scalar('episode.length',
                            final_length.mean().item(),
                            total_num_steps // num_tasks)
            _run.log_scalar('episode.terminations',
                            final_terminations.mean().item(),
                            total_num_steps // num_tasks)
            _run.log_scalar('episode.master_terminations',
                            final_master_terminations.mean().item(),
                            total_num_steps // num_tasks)
            _run.log_scalar('fps', fps, total_num_steps // num_tasks)

            _run.log_scalar('loss.value', losses['value_loss'],
                            total_num_steps // num_tasks)
            _run.log_scalar('loss.action_a', losses['action_loss_a'],
                            total_num_steps // num_tasks)
            _run.log_scalar('loss.action_z', losses['action_loss_z'],
                            total_num_steps // num_tasks)
            _run.log_scalar('loss.action_b', losses['action_loss_b'],
                            total_num_steps // num_tasks)
            _run.log_scalar('loss.action_prior', losses['action_prior_loss'],
                            total_num_steps // num_tasks)
            _run.log_scalar('loss.b_prior', losses['b_prior_loss'],
                            total_num_steps // num_tasks)
            _run.log_scalar('loss.entropy_a', losses['entropy_a'],
                            total_num_steps // num_tasks)
            _run.log_scalar('loss.entropy_b', losses['entropy_b'],
                            total_num_steps // num_tasks)
            _run.log_scalar('loss.entropy_z', losses['entropy_z'],
                            total_num_steps // num_tasks)

    _run.info["seeds_final"] = returned_task_seed
    # _run.info["last_training_task_seed"] = last_training_task_seed
    _run.info["constraints_final"] = constraint
    _run.info['test_constraints_final'] = test_constraint

    save_model(hierarchical_actor_critic, "final_model", envs)
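

# A sketch of what the save_model(...) helper used above might do: store the
# policy weights as <name>.pyt and the observation-normalization statistics as
# <name>.npy, mirroring the load_id branch earlier that reads these files back
# with the keys 'mean', 'var' and 'count'. The real helper is defined elsewhere
# (and presumably also attaches the files to the sacred run), so treat the
# details below as assumptions.
def save_model(policy, name, envs):
    import numpy as np
    import torch

    torch.save(policy.state_dict(), '{}.pyt'.format(name))
    ob_rms = getattr(envs, 'ob_rms', None)
    if ob_rms is not None:
        np.save('{}.npy'.format(name),
                {'mean': ob_rms.mean, 'var': ob_rms.var, 'count': ob_rms.count})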