Example #1
def main():
    ARGUMENTS.update(vars(args))
    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.port)
        win = None

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, args.add_timestep, device, False)

    actor_critic = Policy(envs.observation_space.shape, envs.action_space,
                          base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic.to(device)

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef,
                               args.entropy_coef, lr=args.lr,
                               eps=args.eps, alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch,
                         args.value_loss_coef, args.entropy_coef, lr=args.lr,
                         eps=args.eps, max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef,
                               args.entropy_coef, acktr=True)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)
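    # episode_rewards keeps a rolling window of the most recent full-episode returns,
    # as reported by the Monitor wrapper through info['episode']['r'].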

    start = time.time()
    for j in range(num_updates):

        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            if args.algo == "acktr":
                # use optimizer's learning rate since it's hard-coded in kfac.py
                update_linear_schedule(agent.optimizer, j, num_updates, agent.optimizer.lr)
            else:
                update_linear_schedule(agent.optimizer, j, num_updates, args.lr)

        if args.algo == 'ppo' and args.use_linear_lr_decay:
            agent.clip_param = args.clip_param * (1 - j / float(num_updates))

        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                        rollouts.obs[step],
                        rollouts.recurrent_hidden_states[step],
                        rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob, value, reward, masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(rollouts.obs[-1],
                                                rollouts.recurrent_hidden_states[-1],
                                                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        # save for every interval-th episode or for the last epoch
        if (j % args.save_interval == 0 or j == num_updates - 1) and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            save_model = [save_model,
                          getattr(get_vec_normalize(envs), 'ob_rms', None)]
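            # ob_rms is the VecNormalize running observation statistics; saving it next to
            # the policy lets evaluation code restore the same observation normalization.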

            torch.save(save_model, os.path.join(save_path, args.env_name + ".pt"))

        total_num_steps = (j + 1) * args.num_processes * args.num_steps

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            end = time.time()
            print("Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, dist entropy {:.1f}, value loss {:.1f}, action loss {:.1f}\n".
                format(j, total_num_steps,
                       int(total_num_steps / (end - start)),
                       len(episode_rewards),
                       np.mean(episode_rewards),
                       np.median(episode_rewards),
                       np.min(episode_rewards),
                       np.max(episode_rewards), dist_entropy,
                       value_loss, action_loss))
            ALL_UPDATES.append(j)
            ALL_TIMESTEPS.append(total_num_steps)
            ALL_FPS.append(int(total_num_steps / (end - start)))
            ALL_MEAN_REWARDS.append(np.mean(episode_rewards))
            ALL_MEDIAN_REWARDS.append(np.median(episode_rewards))
            ALL_MIN_REWARDS.append(np.min(episode_rewards))
            ALL_MAX_REWARDS.append(np.max(episode_rewards))
            ALL_DIST_ENTROPY.append(dist_entropy)
            ALL_VALUE_LOSS.append(value_loss)
            ALL_ACTION_LOSS.append(action_loss)

        if (args.eval_interval is not None
                and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            eval_envs = make_vec_envs(
                args.env_name, args.seed + args.num_processes, args.num_processes,
                args.gamma, eval_log_dir, args.add_timestep, device, True)

            vec_norm = get_vec_normalize(eval_envs)
            if vec_norm is not None:
                vec_norm.eval()
                vec_norm.ob_rms = get_vec_normalize(envs).ob_rms

            eval_episode_rewards = []

            obs = eval_envs.reset()
            eval_recurrent_hidden_states = torch.zeros(args.num_processes,
                            actor_critic.recurrent_hidden_state_size, device=device)
            eval_masks = torch.zeros(args.num_processes, 1, device=device)

            while len(eval_episode_rewards) < 10:
                with torch.no_grad():
                    _, action, _, eval_recurrent_hidden_states = actor_critic.act(
                        obs, eval_recurrent_hidden_states, eval_masks, deterministic=True)

                # Observe reward and next obs
                obs, reward, done, infos = eval_envs.step(action)

                eval_masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                                for done_ in done])
                for info in infos:
                    if 'episode' in info.keys():
                        eval_episode_rewards.append(info['episode']['r'])

            eval_envs.close()

            print(" Evaluation using {} episodes: mean reward {:.5f}\n".
                format(len(eval_episode_rewards),
                       np.mean(eval_episode_rewards)))

        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.env_name,
                                  args.algo, args.num_env_steps)
            except IOError:
                pass
    # Save the results
    name = ARGUMENTS['env_name'] + '-' + ARGUMENTS['algo'] + '-' + ARGUMENTS['experiment'] + '-grad_noise' + str(ARGUMENTS['gradient_noise'])
    experiment = ro.Experiment(name, directory='results')
    data = {
        'updates': ALL_UPDATES,
        'timesteps': ALL_TIMESTEPS,
        'fps': ALL_FPS,
        'mean_rewards': ALL_MEAN_REWARDS,
        'median_rewards': ALL_MEDIAN_REWARDS,
        'min_rewards': ALL_MIN_REWARDS,
        'max_rewards': ALL_MAX_REWARDS,
        'dist_entropy': ALL_DIST_ENTROPY,
        'value_loss': ALL_VALUE_LOSS,
        'action_loss': ALL_ACTION_LOSS,
    }
    data.update(ARGUMENTS)
    result = data['mean_rewards'][-1]
    experiment.add_result(result, data)
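
All of the training loops in these examples call update_linear_schedule, which is not shown in the snippets. A minimal sketch of what it presumably does, assuming a standard torch.optim optimizer whose param_groups expose an 'lr' entry (which is also why the acktr branch passes agent.optimizer.lr: the KFAC optimizer hard-codes its own initial rate):

def update_linear_schedule(optimizer, epoch, total_num_epochs, initial_lr):
    # Anneal the learning rate linearly from initial_lr down to 0 over total_num_epochs updates.
    lr = initial_lr - (initial_lr * (epoch / float(total_num_epochs)))
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
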
Example #2
def main():
    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.port)
        win = None

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, args.add_timestep, device,
                         False)
    if args.load_policy is not None:
        actor_critic, ob_rms = torch.load(args.load_policy)
        vec_norm = get_vec_normalize(envs)
        if vec_norm is not None:
            vec_norm.eval()
            vec_norm.ob_rms = ob_rms
    else:
        actor_critic = Policy(envs.observation_space.shape,
                              envs.action_space,
                              base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic.to(device)

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(
        maxlen=(args.num_processes if args.num_processes > 10 else 10))

    start = time.time()
    snapshot_counter = 0
    last_delete = -1
    try:
        os.makedirs(os.path.join(args.save_dir, args.algo))
    except OSError:
        pass
    log_out_file = open(os.path.join(args.save_dir, args.algo, 'log_info.txt'),
                        'w')
    for j in range(num_updates):

        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            if args.algo == "acktr":
                # use optimizer's learning rate since it's hard-coded in kfac.py
                update_linear_schedule(agent.optimizer, j, num_updates,
                                       agent.optimizer.lr)
            else:
                update_linear_schedule(agent.optimizer, j, num_updates,
                                       args.lr)

        if args.algo == 'ppo' and args.use_linear_clip_decay:
            agent.clip_param = args.clip_param * (1 - j / float(num_updates))

        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        # save for every interval-th episode or for the last epoch
        if (j % args.save_interval == 0
                or j == num_updates - 1) and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            save_model = [
                save_model,
                getattr(get_vec_normalize(envs), 'ob_rms', None)
            ]

            torch.save(
                save_model,
                os.path.join(save_path,
                             args.env_name + "epoch_{:07d}.pt".format(j)))
            snapshot_counter += 1
            last_delete += 1
            if snapshot_counter > 100:
                os.system('rm ' + os.path.join(
                    save_path, args.env_name +
                    'epoch_{:07d}.pt'.format(last_delete)))
                snapshot_counter -= 1

        total_num_steps = (j + 1) * args.num_processes * args.num_steps

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            end = time.time()
            log_info = "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, dist entropy {:.1f}, value loss {:.1f}, action loss {:.1f}\n".\
                format(j, total_num_steps,
                       int(total_num_steps / (end - start)),
                       len(episode_rewards),
                       np.mean(episode_rewards),
                       np.median(episode_rewards),
                       np.min(episode_rewards),
                       np.max(episode_rewards), dist_entropy,
                       value_loss, action_loss)
            print(log_info)
            sys.stdout.flush()
            log_out_file.write(log_info)
            log_out_file.flush()

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            eval_envs = make_vec_envs(args.env_name,
                                      args.seed + args.num_processes,
                                      args.num_processes, args.gamma,
                                      eval_log_dir, args.add_timestep, device,
                                      True)

            vec_norm = get_vec_normalize(eval_envs)
            if vec_norm is not None:
                vec_norm.eval()
                vec_norm.ob_rms = get_vec_normalize(envs).ob_rms

            eval_episode_rewards = []

            obs = eval_envs.reset()
            eval_recurrent_hidden_states = torch.zeros(
                args.num_processes,
                actor_critic.recurrent_hidden_state_size,
                device=device)
            eval_masks = torch.zeros(args.num_processes, 1, device=device)

            while len(eval_episode_rewards) < 10:
                with torch.no_grad():
                    _, action, _, eval_recurrent_hidden_states = actor_critic.act(
                        obs,
                        eval_recurrent_hidden_states,
                        eval_masks,
                        deterministic=True)

                # Observe reward and next obs
                obs, reward, done, infos = eval_envs.step(action)

                eval_masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                                for done_ in done])
                for info in infos:
                    if 'episode' in info.keys():
                        eval_episode_rewards.append(info['episode']['r'])

            eval_envs.close()

            print(" Evaluation using {} episodes: mean reward {:.5f}\n".format(
                len(eval_episode_rewards), np.mean(eval_episode_rewards)))
            log_out_file.write(
                " Evaluation using {} episodes: mean reward {:.5f}\n".format(
                    len(eval_episode_rewards), np.mean(eval_episode_rewards)))
            log_out_file.flush()
            sys.stdout.flush()

        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.env_name,
                                  args.algo, args.num_env_steps)
            except IOError:
                pass
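
The save/restore paths in Examples #1 and #2 rely on get_vec_normalize(envs) to reach the VecNormalize wrapper and its ob_rms statistics. A minimal sketch of such a helper, assuming a baselines-style wrapper chain where each wrapper exposes the wrapped vectorized env as .venv (the import path is an assumption; the project may define its own VecNormalize subclass):

from baselines.common.vec_env.vec_normalize import VecNormalize


def get_vec_normalize(venv):
    # Walk the wrapper chain until a VecNormalize instance is found; return None otherwise.
    if isinstance(venv, VecNormalize):
        return venv
    if hasattr(venv, 'venv'):
        return get_vec_normalize(venv.venv)
    return None
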
Example #3
def main():
    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args_iko.cuda else "cpu")

    if args_iko.vis:
        from visdom import Visdom
        viz = Visdom(port=args_iko.port)
        win = None

    envs = make_vec_envs(args_iko.env_name, args_iko.seed,
                         args_iko.num_processes, args_iko.gamma,
                         args_iko.log_dir, args_iko.add_timestep, device,
                         False)

    actor_critic = Policy(envs.observation_space.shape,
                          envs.action_space,
                          base_kwargs={'recurrent': args_iko.recurrent_policy})
    actor_critic.to(device)

    action_shape = 3
    reward_model = RewardModel(11 * 11 * 6, 1, 64, 64)
    reward_model.to(device)

    if args_iko.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args_iko.value_loss_coef,
                               args_iko.entropy_coef,
                               lr=args_iko.lr,
                               eps=args_iko.eps,
                               alpha=args_iko.alpha,
                               max_grad_norm=args_iko.max_grad_norm)
    elif args_iko.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args_iko.clip_param,
                         args_iko.ppo_epoch,
                         args_iko.num_mini_batch,
                         args_iko.value_loss_coef,
                         args_iko.entropy_coef,
                         args_iko.use_singh,
                         reward_model,
                         lr=args_iko.lr,
                         eps=args_iko.eps,
                         max_grad_norm=args_iko.max_grad_norm)
    elif args_iko.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args_iko.value_loss_coef,
                               args_iko.entropy_coef,
                               acktr=True)

    rollouts = RolloutStorage(args_iko.num_steps, args_iko.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)

    start = time.time()
    for j in range(num_updates):

        if args_iko.use_linear_lr_decay:
            # decrease learning rate linearly
            if args_iko.algo == "acktr":
                # use optimizer's learning rate since it's hard-coded in kfac.py
                update_linear_schedule(agent.optimizer, j, num_updates,
                                       agent.optimizer.lr)
            else:
                update_linear_schedule(agent.optimizer, j, num_updates,
                                       args_iko.lr)

        if args_iko.algo == 'ppo' and args_iko.use_linear_clip_decay:
            agent.clip_param = args_iko.clip_param * (1 -
                                                      j / float(num_updates))

        reward_train = []
        reward_block_penalty = []
        reward_bel_gt = []
        reward_bel_gt_nonlog = []
        reward_infogain = []
        reward_bel_ent = []
        reward_hit = []
        reward_dist = []
        reward_inv_dist = []

        for step in range(args_iko.num_steps):
            # Sample actions
            # print(step, args_iko.num_steps)
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)
            reward_train.append(reward)
            # print("infos is ", infos)
            # reward_b.append(infos[0]['auxillary_reward'])
            # print("infos is ",infos[0]['auxillary_reward'])
            reward_block_penalty.append(infos[0]['reward_block_penalty'])
            reward_bel_gt.append(infos[0]['reward_bel_gt'])
            reward_bel_gt_nonlog.append(infos[0]['reward_bel_gt_nonlog'])
            reward_infogain.append(infos[0]['reward_infogain'])
            reward_bel_ent.append(infos[0]['reward_bel_ent'])
            reward_hit.append(infos[0]['reward_hit'])
            reward_dist.append(infos[0]['reward_dist'])
            reward_inv_dist.append(infos[0]['reward_inv_dist'])
            # print(reward)

            # Note: Tensor.to() is not in-place, so the original `reward.to(device)` and
            # `my_reward.to(device)` calls were no-ops; the reward is combined on the CPU
            # and copied onto the device when it is inserted into rollout storage.
            reward_model.to(device)
            if args_iko.use_singh:
                # print("using learning IR")
                my_reward = reward_model(obs.clone().to(device),
                                         action.clone().float()).detach()
                reward = reward + args_iko.singh_coef * my_reward.type(
                    torch.FloatTensor)

            # for info in infos:
            #     if 'episode' in info.keys():
            #         episode_rewards.append(info['episode']['r'])
            #         print("infos is ",infos[0]['auxillary_reward'])
            #         print("info is",info['episode']['r'] )

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks)

        # print("mean reward_a", np.mean(reward_train))
        # print("mean reward_block_penalty", np.mean(reward_block_penalty))
        # print("mean reward_bel_gt", np.mean(reward_bel_gt))
        # print("mean reward_bel_gt_nonlog", np.mean(reward_bel_gt_nonlog))
        # print("mean reward_infogain", np.mean(reward_infogain))
        # print("mean reward_bel_ent", np.mean(reward_bel_ent))
        # print("mean reward_hit", np.mean(reward_hit))
        # print("mean reward_dist", np.mean(reward_dist))
        # print("mean reward_inv_dist", np.mean(reward_inv_dist))

        total_num_steps = (j + 1) * args_iko.num_processes * args_iko.num_steps
        writer.add_scalar('mean_reward_train', np.mean(reward_train),
                          total_num_steps)
        writer.add_scalar('mean_reward_block_penalty',
                          np.mean(reward_block_penalty), total_num_steps)
        writer.add_scalar('mean_reward_bel_gt', np.mean(reward_bel_gt),
                          total_num_steps)
        writer.add_scalar('mean_reward_bel_gt_nonlog',
                          np.mean(reward_bel_gt_nonlog), total_num_steps)
        writer.add_scalar('mean_reward_infogain', np.mean(reward_infogain),
                          total_num_steps)
        writer.add_scalar('mean_reward_bel_ent', np.mean(reward_bel_ent),
                          total_num_steps)
        writer.add_scalar('mean_reward_hit', np.mean(reward_hit),
                          total_num_steps)
        writer.add_scalar('mean_reward_dist', np.mean(reward_dist),
                          total_num_steps)
        writer.add_scalar('mean_reward_inv_dist', np.mean(reward_inv_dist),
                          total_num_steps)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args_iko.use_gae, args_iko.gamma,
                                 args_iko.tau)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        # save for every interval-th episode or for the last epoch
        if (j % args_iko.save_interval == 0
                or j == num_updates - 1) and args_iko.save_dir != "":
            save_path = os.path.join(args_iko.save_dir, args_iko.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args_iko.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            save_model = [
                save_model,
                getattr(get_vec_normalize(envs), 'ob_rms', None)
            ]

            torch.save(
                save_model,
                os.path.join(
                    save_path, 'ugl' + str(args_iko.use_gt_likelihood) +
                    'block-pen-' + str(args_iko.penalty_for_block) + '_' +
                    'explore-' + str(args_iko.rew_explore) + '_' + 'bel-new-' +
                    str(args_iko.rew_bel_new) + '_' + 'bel-ent-' +
                    str(args_iko.rew_bel_ent) + '_' + 'infogain-' +
                    str(args_iko.rew_infogain) + '_' + 'bel-gt-nolog-' +
                    str(args_iko.rew_bel_gt_nonlog) + '_' + 'bel-gt-' +
                    str(args_iko.rew_bel_gt) + '_' + 'dist-' +
                    str(args_iko.rew_dist) + '_' + 'hit-' +
                    str(args_iko.rew_hit) + '_' + 'inv-dist-' +
                    str(args_iko.rew_inv_dist) + args_iko.algo + ".pt"))

        total_num_steps = (j + 1) * args_iko.num_processes * args_iko.num_steps

        if j % args_iko.log_interval == 0 and len(episode_rewards) > 1:
            end = time.time()
            # reward_a / reward_b were never defined; report the per-step reward lists
            # collected above instead.
            print("mean reward_train", np.mean(reward_train))
            print("mean reward_block_penalty", np.mean(reward_block_penalty))
            # print("Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n".
            #     format(j, total_num_steps,
            #            int(total_num_steps / (end - start)),
            #            len(episode_rewards),
            #            np.mean(episode_rewards),
            #            np.median(episode_rewards),
            #            np.min(episode_rewards),
            #            np.max(episode_rewards), dist_entropy,
            #            value_loss, action_loss))
            # writer.add_scalar('mean_reward', np.mean(episode_rewards), total_num_steps)
            # writer.add_scalar('min_reward', np.min(episode_rewards), total_num_steps)
            # writer.add_scalar('max_reward', np.max(episode_rewards), total_num_steps)
            # writer.add_scalar('success_rate', np.mean(episode_successes), total_num_steps)

        if (args_iko.eval_interval is not None and len(episode_rewards) > 1
                and j % args_iko.eval_interval == 0):
            eval_envs = make_vec_envs(args_iko.env_name,
                                      args_iko.seed + args_iko.num_processes,
                                      args_iko.num_processes, args_iko.gamma,
                                      eval_log_dir, args_iko.add_timestep,
                                      device, True)

            vec_norm = get_vec_normalize(eval_envs)
            if vec_norm is not None:
                vec_norm.eval()
                vec_norm.ob_rms = get_vec_normalize(envs).ob_rms

            eval_episode_rewards = []

            obs = eval_envs.reset()
            eval_recurrent_hidden_states = torch.zeros(
                args_iko.num_processes,
                actor_critic.recurrent_hidden_state_size,
                device=device)
            eval_masks = torch.zeros(args_iko.num_processes, 1, device=device)

            while len(eval_episode_rewards) < 10:
                with torch.no_grad():
                    _, action, _, eval_recurrent_hidden_states = actor_critic.act(
                        obs,
                        eval_recurrent_hidden_states,
                        eval_masks,
                        deterministic=True)

                # Observe reward and next obs
                obs, reward, done, infos = eval_envs.step(action)

                eval_masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                                for done_ in done])
                for info in infos:
                    if 'episode' in info.keys():
                        eval_episode_rewards.append(info['episode']['r'])

            eval_envs.close()

            print(" Evaluation using {} episodes: mean reward {:.5f}\n".format(
                len(eval_episode_rewards), np.mean(eval_episode_rewards)))

        if args_iko.vis and j % args_iko.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args_iko.log_dir,
                                  args_iko.env_name, args_iko.algo,
                                  args_iko.num_env_steps)
            except IOError:
                pass
    writer.close()
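
Example #3 depends on a learned intrinsic-reward network, RewardModel(11 * 11 * 6, 1, 64, 64), whose definition is not included. The sketch below is only an illustration consistent with how it is called (observation and action tensors in, one reward per environment out); the architecture and the reading of the constructor arguments as (obs_dim, action_dim, hidden1, hidden2) are assumptions:

import torch
import torch.nn as nn


class RewardModel(nn.Module):
    # Hypothetical sketch: maps a flattened observation plus the action to a scalar reward.
    def __init__(self, obs_dim, action_dim, hidden1, hidden2):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(obs_dim + action_dim, hidden1), nn.ReLU(),
            nn.Linear(hidden1, hidden2), nn.ReLU(),
            nn.Linear(hidden2, 1))

    def forward(self, obs, action):
        # Flatten observations to [batch, obs_dim] and append the (float) action before scoring.
        x = torch.cat([obs.reshape(obs.size(0), -1), action.float()], dim=1)
        return self.net(x)
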
Example #4
def main(args):
    try:
        os.makedirs(args.log_dir)
    except OSError:
        files = glob.glob(os.path.join(args.log_dir, '*.monitor.csv'))
        for f in files:
            os.remove(f)

    eval_log_dir = args.log_dir + "_eval"

    try:
        os.makedirs(eval_log_dir)
    except OSError:
        files = glob.glob(os.path.join(eval_log_dir, '*.monitor.csv'))
        for f in files:
            os.remove(f)

    assert args.algo in ['a2c', 'ppo', 'acktr']
    if args.recurrent_policy:
        assert args.algo in ['a2c', 'ppo'], \
            'Recurrent policy is not implemented for ACKTR'

    if args.eval_render:
        render_env = make_vec_envs(args.env_name,
                                   args.seed,
                                   1,
                                   None,
                                   None,
                                   args.add_timestep,
                                   device='cpu',
                                   allow_early_resets=False)

    torch.set_num_threads(1)
    num_updates = int(
        args.num_env_steps) // args.num_steps // args.num_processes

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    if args.cuda and torch.cuda.is_available() and args.cuda_deterministic:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    # Use CUDA if requested, otherwise fall back to the CPU
    device = torch.device("cuda:0" if args.cuda else "cpu")

    # Only needed when visualizing with visdom
    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.port)
        win = None

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, args.add_timestep, device,
                         False)

    # Set up actor_critic
    actor_critic = Policy(envs.observation_space.shape,
                          envs.action_space,
                          base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic.to(device)

    # Set algorithm with actor critic and use to learn
    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)

    start = time.time()
    for j in range(num_updates):

        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            if args.algo == "acktr":
                # use optimizer's learning rate since it's hard-coded in kfac.py
                update_linear_schedule(agent.optimizer, j, num_updates,
                                       agent.optimizer.lr)
            else:
                update_linear_schedule(agent.optimizer, j, num_updates,
                                       args.lr)

        if args.algo == 'ppo' and args.use_linear_clip_decay:
            agent.clip_param = args.clip_param * (1 - j / float(num_updates))

        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        # save for every interval-th episode or for the last epoch
        if (j % args.save_interval == 0
                or j == num_updates - 1) and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            save_model = [
                save_model,
                getattr(get_vec_normalize(envs), 'ob_rms', None)
            ]

            torch.save(
                save_model,
                os.path.join(
                    save_path, args.env_name + "-AvgRwrd" +
                    str(int(np.mean(episode_rewards))) + ".pt"))
            print("Saving Model")

        total_num_steps = (j + 1) * args.num_processes * args.num_steps

        # Logs every log_interval steps
        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            end = time.time()
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, dist entropy {:.1f}, value loss {:.1f}, action loss {:.1f}\n"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), dist_entropy, value_loss,
                        action_loss))

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            eval_envs = make_vec_envs(args.env_name,
                                      args.seed + args.num_processes,
                                      args.num_processes, args.gamma,
                                      eval_log_dir, args.add_timestep, device,
                                      True)

            vec_norm = get_vec_normalize(eval_envs)
            if vec_norm is not None:
                vec_norm.eval()
                vec_norm.ob_rms = get_vec_normalize(envs).ob_rms

            eval_episode_rewards = []

            obs = eval_envs.reset()
            eval_recurrent_hidden_states = torch.zeros(
                args.num_processes,
                actor_critic.recurrent_hidden_state_size,
                device=device)
            eval_masks = torch.zeros(args.num_processes, 1, device=device)

            while len(eval_episode_rewards) < 10:
                with torch.no_grad():
                    _, action, _, eval_recurrent_hidden_states = actor_critic.act(
                        obs,
                        eval_recurrent_hidden_states,
                        eval_masks,
                        deterministic=True)

                # Observe reward and next obs
                obs, reward, done, infos = eval_envs.step(action)

                eval_masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                                for done_ in done])
                for info in infos:
                    if 'episode' in info.keys():
                        eval_episode_rewards.append(info['episode']['r'])

            if args.eval_render:
                show_model(render_env, actor_critic)

            eval_envs.close()

            print(" Evaluation using {} episodes: mean reward {:.5f}\n".format(
                len(eval_episode_rewards), np.mean(eval_episode_rewards)))

        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.env_name,
                                  args.algo, args.num_env_steps)
            except IOError:
                pass
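
Every training loop here bootstraps with actor_critic.get_value on the final observation and then calls rollouts.compute_returns(next_value, use_gae, gamma, tau). Below is a sketch of what that RolloutStorage method presumably computes, assuming storage tensors shaped [num_steps(+1), num_processes, 1] and masks[t + 1] equal to 0 wherever the episode ended at step t; with use_gae it accumulates generalized advantage estimates, otherwise plain discounted returns:

    def compute_returns(self, next_value, use_gae, gamma, tau):
        # Sketch of RolloutStorage.compute_returns; masks stop values and advantages
        # from leaking across episode boundaries.
        if use_gae:
            self.value_preds[-1] = next_value
            gae = 0
            for step in reversed(range(self.rewards.size(0))):
                delta = (self.rewards[step]
                         + gamma * self.value_preds[step + 1] * self.masks[step + 1]
                         - self.value_preds[step])
                gae = delta + gamma * tau * self.masks[step + 1] * gae
                self.returns[step] = gae + self.value_preds[step]
        else:
            self.returns[-1] = next_value
            for step in reversed(range(self.rewards.size(0))):
                self.returns[step] = (self.returns[step + 1] * gamma * self.masks[step + 1]
                                      + self.rewards[step])
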
Example #5
def main(env, scene_path):
    try:
        os.makedirs(args.log_dir)
    except OSError:
        files = glob.glob(os.path.join(args.log_dir, '*.monitor.csv'))
        for f in files:
            os.remove(f)
    save_path = os.path.join(args.save_dir, args.algo)

    eval_x = []
    eval_y = []

    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    initial_policies = torch.load(os.path.join(args.load_dir, args.algo,
                                               args.initial_policy + ".pt")) \
        if args.initial_policy else None

    if args.reuse_residual:
        residual, ob_rms, initial_policies = initial_policies
    else:
        residual = None
        ob_rms = None

    pose_estimator = torch.load(os.path.join(args.load_dir, "pe",
                                             args.pose_estimator + ".pt")) \
        if args.pose_estimator else None

    envs = make_vec_envs(env,
                         scene_path,
                         args.seed,
                         args.num_processes,
                         args.gamma,
                         args.log_dir,
                         device,
                         False,
                         initial_policies,
                         pose_estimator=pose_estimator,
                         init_control=not args.dense_ip)
    if args.reuse_residual:
        vec_norm = get_vec_normalize(envs)
        if vec_norm is not None:
            vec_norm.eval()
            vec_norm.ob_rms = ob_rms

    base_kwargs = {'recurrent': args.recurrent_policy}
    base = residual.base if args.reuse_residual else None
    dist = residual.dist if args.reuse_residual else None
    actor_critic = Policy(envs.observation_space.shape,
                          envs.action_space,
                          base_kwargs=base_kwargs,
                          zero_last_layer=True,
                          base=base,
                          dist=dist)
    actor_critic.to(device)

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm,
                         burn_in=initial_policies is not None
                         and not args.reuse_residual)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=64)

    num_updates = int(
        args.num_env_steps) // args.num_steps // args.num_processes
    total_num_steps = 0
    j = 0
    max_succ = -1
    max_mean_rew = -math.inf
    mean_ep_rew = -math.inf
    evals_without_improv = 0

    start = time.time()
    start_update = start
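    # Keep updating until the update budget runs out, or, when use_metric is set,
    # until the evaluated success rate reaches args.trg_succ_rate.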
    while (not use_metric
           and j < num_updates) or (use_metric
                                    and max_succ < args.trg_succ_rate):
        if args.eval_interval is not None and j % args.eval_interval == 0:
            print("Evaluating current policy...")
            i = 0
            total_successes = 0
            max_trials = 50
            eval_recurrent_hidden_states = torch.zeros(
                args.num_processes,
                actor_critic.recurrent_hidden_state_size,
                device=device)
            eval_masks = torch.zeros(args.num_processes, 1, device=device)
            while i + args.num_processes <= max_trials:

                with torch.no_grad():
                    _, action, _, eval_recurrent_hidden_states = actor_critic.act(
                        obs,
                        eval_recurrent_hidden_states,
                        eval_masks,
                        deterministic=True)

                obs, _, dones, infos = envs.step(action)

                if np.all(dones):  # Rigid - assumes episodes are fixed length
                    rews = []
                    for info in infos:
                        rews.append(info['rew_success'])
                    i += args.num_processes
                    rew = sum([int(rew > 0) for rew in rews])
                    total_successes += rew

            p_succ = (100 * total_successes / i)
            eval_x += [total_num_steps]
            eval_y += [p_succ]

            end = time.time()
            print(
                f"Evaluation: {total_successes} successful out of {i} episodes - "
                f"{p_succ:.2f}% successful. Eval length: {end - start_update}")
            torch.save([eval_x, eval_y],
                       os.path.join(args.save_as + "_eval.pt"))
            start_update = end

            if p_succ > max_succ:
                max_succ = p_succ
                max_mean_rew = mean_ep_rew
                evals_without_improv = 0
            elif mean_ep_rew > max_mean_rew:
                print("Unimproved success rate, higher reward")
                max_mean_rew = mean_ep_rew
                evals_without_improv = 0
            else:
                evals_without_improv += 1

            if evals_without_improv == 10 or max_succ >= args.trg_succ_rate:
                save_model = actor_critic
                if args.cuda:
                    save_model = copy.deepcopy(actor_critic).cpu()

                save_model = [
                    save_model,
                    getattr(get_vec_normalize(envs), 'ob_rms', None),
                    initial_policies
                ]
                extra = "_final" if evals_without_improv == 10 else ""
                torch.save(
                    save_model,
                    os.path.join(save_path, args.save_as + f"{extra}.pt"))
                break

        # save for every interval-th episode or for the last epoch
        if ((not use_metric and
             (j % args.save_interval == 0 or j == num_updates - 1)) or
            (use_metric
             and evals_without_improv == 0)) and args.save_dir != "":
            os.makedirs(save_path, exist_ok=True)

            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            if pose_estimator is not None:
                save_model = [save_model, pose_estimator, initial_policies]
            else:
                save_model = [
                    save_model,
                    getattr(get_vec_normalize(envs), 'ob_rms', None),
                    initial_policies
                ]

            torch.save(save_model, os.path.join(save_path,
                                                args.save_as + ".pt"))
            # torch.save(save_model, os.path.join(save_path, args.save_as + f"{j * args.num_processes * args.num_steps}.pt"))

        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            if args.algo == "acktr":
                # use optimizer's learning rate since it's hard-coded in kfac.py
                update_linear_schedule(agent.optimizer, j, num_updates,
                                       agent.optimizer.lr)
            else:
                update_linear_schedule(agent.optimizer, j, num_updates,
                                       args.lr)

        if args.algo == 'ppo' and args.use_linear_clip_decay:
            agent.clip_param = args.clip_param * (1 - j / float(num_updates))

        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        total_num_steps = (j + 1) * args.num_processes * args.num_steps

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            mean_ep_rew = np.mean(episode_rewards)
            if mean_ep_rew > max_mean_rew:
                print("Improved max mean reward")
                max_mean_rew = mean_ep_rew
                evals_without_improv = 0
            end = time.time()
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, dist entropy {:.1f}, value loss {:.1f}, action loss {:.1f}"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), mean_ep_rew,
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), dist_entropy, value_loss,
                        action_loss))
            print("Update length: ", end - start_update)
            start_update = end

        if args.vis and (j % args.vis_interval == 0 or
                         (not use_metric and j == num_updates - 1)):
            try:
                # Sometimes monitor doesn't properly flush the outputs
                visdom_plot(args.log_dir, args.save_as, args.algo,
                            total_num_steps)
            except IOError:
                pass

        j += 1

    if use_metric:
        if max_succ >= args.trg_succ_rate:
            print(
                f"Achieved greater than {args.trg_succ_rate}% success, advancing curriculum."
            )
        else:
            print(
                f"Policy converged with max success rate < {args.trg_succ_rate}%"
            )
    # Copy logs to permanent location so new graphs can be drawn.
    copy_tree(args.log_dir, os.path.join('logs', args.save_as))
    envs.close()
    return total_num_steps
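
Because this variant returns the number of environment steps it consumed and exits once the target success rate is reached, it lends itself to being called once per curriculum stage. A purely hypothetical driver loop (the stage names and scene paths below are made up for illustration):

if __name__ == '__main__':
    # Hypothetical curriculum: each stage is an (env id, scene file) pair.
    stages = [
        ('ReachEasy-v0', 'scenes/reach_easy.ttt'),
        ('ReachHard-v0', 'scenes/reach_hard.ttt'),
    ]
    steps_so_far = 0
    for env_name, scene_path in stages:
        steps_so_far += main(env_name, scene_path)
        print("Cumulative environment steps after {}: {}".format(env_name, steps_so_far))
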
Example #6
def main():
    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")
    print(device)
    print(save_folder)

    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.port)
        win = None

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, args.add_timestep, device,
                         False, args.reward_type)

    actor_critic = Policy(envs.observation_space.shape, envs.action_space)
    actor_critic.to(device)

    curiosity = None
    if use_curiosity:
        curiosity = ICM(envs.observation_space.shape[0], envs.action_space.n)
        curiosity.to(device)

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         curiosity,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm,
                         use_curiosity=use_curiosity)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    cum_rew = [0] * args.num_processes
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=args.num_processes * 2)
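    # Note: here episode_rewards stores level progress (info['x_pos'] / norm_pos) for
    # finished episodes rather than the Monitor-reported episode return.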

    start = time.time()
    for j in range(num_updates):

        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            if args.algo == "acktr":
                # use optimizer's learning rate since it's hard-coded in kfac.py
                update_linear_schedule(agent.optimizer, j, num_updates,
                                       agent.optimizer.lr)
            else:
                update_linear_schedule(agent.optimizer, j, num_updates,
                                       args.lr)

        if args.algo == 'ppo' and args.use_linear_clip_decay:
            agent.clip_param = args.clip_param * (1 - j / float(num_updates))

        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = agent.actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)
            envs.render()

            cur_reward = reward

            to_write = reward.cpu().numpy()
            for i in range(args.num_processes):
                cum_rew[i] += to_write[i][0]

            if use_curiosity:
                action_one_hot = (torch.eye(14)[action]).view(-1, 14).cuda()
                _, pred_phi, actual_phi = curiosity(
                    (rollouts.obs[step], obs, action_one_hot))
                cur_reward += 0.2 * ((pred_phi - actual_phi).pow(2)).sum(
                    -1, keepdim=True).cpu() / 2

            for i, finished in enumerate(done):
                if finished:
                    percentile = infos[i]['x_pos'] / norm_pos
                    episode_rewards.append(percentile)
                    print(cum_rew[i])
                    with open(train_file[:-4] + str(i) + train_file[-4:],
                              'a',
                              newline='') as sfile:
                        writer = csv.writer(sfile)
                        writer.writerows([[cum_rew[i], percentile]])
                    cum_rew[i] = 0

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])

            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, cur_reward.detach(), masks)

        with torch.no_grad():
            next_value = agent.actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        # save for every interval-th episode or for the last epoch
        if (j % args.save_interval == 0
                or j == num_updates - 1) and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = agent.actor_critic
            if args.cuda:
                save_model = copy.deepcopy(agent.actor_critic).cpu()

            save_model = [
                save_model,
                getattr(get_vec_normalize(envs), 'ob_rms', None)
            ]

            torch.save(save_model,
                       os.path.join(save_folder, args.env_name + ".pt"))

        total_num_steps = (j + 1) * args.num_processes * args.num_steps

        if j % args.log_interval == 0 and len(
                episode_rewards) > args.num_processes:
            end = time.time()
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, cumulative reward {:.3f}\n"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), np.mean(cum_rew)))


        # Evaluation

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):

            num_proc = 1
            eval_envs = make_vec_envs(args.env_name, args.seed + num_proc,
                                      num_proc, args.gamma, args.log_dir,
                                      args.add_timestep, device, True,
                                      args.reward_type)

            # Reuse the observation-normalization statistics from the training envs so the
            # evaluation env sees identically scaled observations.
            vec_norm = get_vec_normalize(eval_envs)
            if vec_norm is not None:
                vec_norm.eval()
                vec_norm.ob_rms = get_vec_normalize(envs).ob_rms

            eval_episode_rewards = []
            test_rew = 0
            finish_this = False

            obs = eval_envs.reset()
            eval_recurrent_hidden_states = torch.zeros(
                num_proc,
                agent.actor_critic.recurrent_hidden_state_size,
                device=device)

            eval_masks = torch.zeros(num_proc, 1, device=device)
            positions = deque(maxlen=400)

            while not finish_this:
                with torch.no_grad():

                    _, action, _, eval_recurrent_hidden_states = agent.actor_critic.act(
                        obs,
                        eval_recurrent_hidden_states,
                        eval_masks,
                        deterministic=True)

                # Observe reward and next obs
                obs, reward, done, infos = eval_envs.step(action)
                eval_envs.render()

                eval_masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                                for done_ in done]).to(device)

                test_rew += reward.cpu().numpy()[0, 0]

                for i, finished in enumerate(done):
                    if finished:
                        print('Evaluation: episode ended (agent died)')
                        percentile = infos[i]['x_pos'] / norm_pos
                        eval_episode_rewards.append(percentile)
                        with open(eval_file, 'a', newline='') as sfile:
                            writer = csv.writer(sfile)
                            writer.writerows([[test_rew, percentile]])
                        finish_this = True

                # End the episode early if the agent has not moved more than +/-20
                # in x over the last 200 recorded positions, i.e. it is stuck.
                positions.append(infos[0]['x_pos'])
                pos_ar = np.array(positions)
                if (len(positions) >= 200
                        and (pos_ar < pos_ar[-1] + 20).all()
                        and (pos_ar > pos_ar[-1] - 20).all()):
                    print('Evaluation: agent appears stuck; ending episode')
                    percentile = infos[0]['x_pos'] / norm_pos
                    eval_episode_rewards.append(percentile)
                    with open(eval_file, 'a', newline='') as sfile:
                        writer = csv.writer(sfile)
                        writer.writerows([[test_rew, percentile]])
                    finish_this = True

            eval_envs.close()
            positions.clear()

            print(
                " Evaluation using {} episodes:  reward {:.3f}, distance {:.3f}\n"
                .format(len(eval_episode_rewards), test_rew,
                        np.mean(eval_episode_rewards)))
            test_rew = 0
            finish_this = False

        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.env_name,
                                  args.algo, args.num_env_steps)
            except IOError:
                pass
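
# The `curiosity` module used in the example above is not defined here. A minimal ICM-style
# sketch that matches the call curiosity((obs, next_obs, action_one_hot)) and the unpacking
# `_, pred_phi, actual_phi = ...` could look like the following (hypothetical helper with a
# flat encoder; the example's real module presumably encodes image observations with convs):
import torch
import torch.nn as nn

class CuriosityForwardModel(nn.Module):
    def __init__(self, obs_dim, num_actions, feat_dim=256):
        super().__init__()
        # phi(s): shared feature encoder applied to current and next observation
        self.encoder = nn.Sequential(nn.Linear(obs_dim, feat_dim), nn.ReLU())
        # f(phi(s), a): predicts the feature embedding of the next observation
        self.forward_model = nn.Sequential(
            nn.Linear(feat_dim + num_actions, feat_dim), nn.ReLU(),
            nn.Linear(feat_dim, feat_dim))

    def forward(self, inputs):
        obs, next_obs, action_one_hot = inputs
        phi = self.encoder(obs.view(obs.size(0), -1).float())
        actual_phi = self.encoder(next_obs.view(next_obs.size(0), -1).float())
        pred_phi = self.forward_model(torch.cat([phi, action_one_hot], dim=-1))
        # The first slot is reserved for an inverse-model output in full ICM; unused here.
        return None, pred_phi, actual_phi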
Example #7
def main():
    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.port)
        win = None

    envs = make_vec_envs(args.env_name, args.seed, 1, args.gamma, args.log_dir,
                         args.add_timestep, device, False)

    # Determine the observation and action lengths for the robot and human, respectively
    obs = envs.reset()
    action = torch.tensor([envs.action_space.sample()])
    _, _, _, info = envs.step(action)
    obs_robot_len = info[0]['obs_robot_len']
    obs_human_len = info[0]['obs_human_len']
    action_robot_len = info[0]['action_robot_len']
    action_human_len = info[0]['action_human_len']
    obs_robot = obs[:, :obs_robot_len]
    obs_human = obs[:, obs_robot_len:]
    if len(obs_robot[0]) != obs_robot_len or len(
            obs_human[0]) != obs_human_len:
        print('robot obs shape:', obs_robot.shape, 'obs space robot shape:',
              (obs_robot_len, ))
        print('human obs shape:', obs_human.shape, 'obs space human shape:',
              (obs_human_len, ))
        exit()

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, args.add_timestep, device,
                         False)

    # Reset environment
    obs = envs.reset()
    obs_robot = obs[:, :obs_robot_len]
    obs_human = obs[:, obs_robot_len:]

    action_space_robot = spaces.Box(low=np.array([-1.0] * action_robot_len),
                                    high=np.array([1.0] * action_robot_len),
                                    dtype=np.float32)
    action_space_human = spaces.Box(low=np.array([-1.0] * action_human_len),
                                    high=np.array([1.0] * action_human_len),
                                    dtype=np.float32)

    if args.load_policy is not None:
        actor_critic_robot, actor_critic_human, ob_rms = torch.load(
            args.load_policy)
        vec_norm = get_vec_normalize(envs)
        if vec_norm is not None:
            vec_norm.eval()
            vec_norm.ob_rms = ob_rms
    else:
        actor_critic_robot = Policy(
            [obs_robot_len],
            action_space_robot,
            base_kwargs={'recurrent': args.recurrent_policy})
        actor_critic_human = Policy(
            [obs_human_len],
            action_space_human,
            base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic_robot.to(device)
    actor_critic_human.to(device)

    if args.algo == 'a2c':
        # Note: the a2c and acktr branches still build a single agent from `actor_critic`,
        # which is not defined in this dual robot/human setup; only 'ppo' is usable as-is.
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent_robot = algo.PPO(actor_critic_robot,
                               args.clip_param,
                               args.ppo_epoch,
                               args.num_mini_batch,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               max_grad_norm=args.max_grad_norm)
        agent_human = algo.PPO(actor_critic_human,
                               args.clip_param,
                               args.ppo_epoch,
                               args.num_mini_batch,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)

    rollouts_robot = RolloutStorage(
        args.num_steps, args.num_processes, [obs_robot_len],
        action_space_robot, actor_critic_robot.recurrent_hidden_state_size)
    rollouts_human = RolloutStorage(
        args.num_steps, args.num_processes, [obs_human_len],
        action_space_human, actor_critic_human.recurrent_hidden_state_size)
    rollouts_robot.obs[0].copy_(obs_robot)
    rollouts_robot.to(device)
    rollouts_human.obs[0].copy_(obs_human)
    rollouts_human.to(device)

    episode_rewards = deque(
        maxlen=(args.num_processes if args.num_processes > 10 else 10))

    start = time.time()
    for j in range(num_updates):

        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            if args.algo == "acktr":
                # use optimizer's learning rate since it's hard-coded in kfac.py
                update_linear_schedule(agent.optimizer, j, num_updates,
                                       agent.optimizer.lr)
            else:
                update_linear_schedule(agent_robot.optimizer, j, num_updates,
                                       args.lr)
                update_linear_schedule(agent_human.optimizer, j, num_updates,
                                       args.lr)

        if args.algo == 'ppo' and args.use_linear_clip_decay:
            agent_robot.clip_param = args.clip_param * (1 -
                                                        j / float(num_updates))
            agent_human.clip_param = args.clip_param * (1 -
                                                        j / float(num_updates))

        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value_robot, action_robot, action_log_prob_robot, recurrent_hidden_states_robot = actor_critic_robot.act(
                    rollouts_robot.obs[step],
                    rollouts_robot.recurrent_hidden_states[step],
                    rollouts_robot.masks[step])
                value_human, action_human, action_log_prob_human, recurrent_hidden_states_human = actor_critic_human.act(
                    rollouts_human.obs[step],
                    rollouts_human.recurrent_hidden_states[step],
                    rollouts_human.masks[step])

            # Observe reward and next obs
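            # The shared env consumes one joint action, built by concatenating the robot's
            # and the human's actions along the last dimension.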
            action = torch.cat((action_robot, action_human), dim=-1)
            obs, reward, done, infos = envs.step(action)
            obs_robot = obs[:, :obs_robot_len]
            obs_human = obs[:, obs_robot_len:]

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            rollouts_robot.insert(obs_robot, recurrent_hidden_states_robot,
                                  action_robot, action_log_prob_robot,
                                  value_robot, reward, masks)
            rollouts_human.insert(obs_human, recurrent_hidden_states_human,
                                  action_human, action_log_prob_human,
                                  value_human, reward, masks)

        with torch.no_grad():
            next_value_robot = actor_critic_robot.get_value(
                rollouts_robot.obs[-1],
                rollouts_robot.recurrent_hidden_states[-1],
                rollouts_robot.masks[-1]).detach()
            next_value_human = actor_critic_human.get_value(
                rollouts_human.obs[-1],
                rollouts_human.recurrent_hidden_states[-1],
                rollouts_human.masks[-1]).detach()

        rollouts_robot.compute_returns(next_value_robot, args.use_gae,
                                       args.gamma, args.tau)
        rollouts_human.compute_returns(next_value_human, args.use_gae,
                                       args.gamma, args.tau)

        value_loss_robot, action_loss_robot, dist_entropy_robot = agent_robot.update(
            rollouts_robot)
        value_loss_human, action_loss_human, dist_entropy_human = agent_human.update(
            rollouts_human)

        rollouts_robot.after_update()
        rollouts_human.after_update()

        # save for every interval-th episode or for the last epoch
        if (j % args.save_interval == 0
                or j == num_updates - 1) and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model_robot = actor_critic_robot
            save_model_human = actor_critic_human
            if args.cuda:
                save_model_robot = copy.deepcopy(actor_critic_robot).cpu()
                save_model_human = copy.deepcopy(actor_critic_human).cpu()

            save_model = [
                save_model_robot, save_model_human,
                getattr(get_vec_normalize(envs), 'ob_rms', None)
            ]

            torch.save(save_model,
                       os.path.join(save_path, args.env_name + ".pt"))

        total_num_steps = (j + 1) * args.num_processes * args.num_steps

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            end = time.time()
            print(
                "Robot/Human updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), dist_entropy_robot,
                        value_loss_robot, action_loss_robot))
            sys.stdout.flush()

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            eval_envs = make_vec_envs(args.env_name,
                                      args.seed + args.num_processes,
                                      args.num_processes, args.gamma,
                                      eval_log_dir, args.add_timestep, device,
                                      True)

            vec_norm = get_vec_normalize(eval_envs)
            if vec_norm is not None:
                vec_norm.eval()
                vec_norm.ob_rms = get_vec_normalize(envs).ob_rms

            eval_episode_rewards = []

            obs = eval_envs.reset()
            obs_robot = obs[:, :obs_robot_len]
            obs_human = obs[:, obs_robot_len:]
            eval_recurrent_hidden_states_robot = torch.zeros(
                args.num_processes,
                actor_critic_robot.recurrent_hidden_state_size,
                device=device)
            eval_recurrent_hidden_states_human = torch.zeros(
                args.num_processes,
                actor_critic_human.recurrent_hidden_state_size,
                device=device)
            eval_masks = torch.zeros(args.num_processes, 1, device=device)

            while len(eval_episode_rewards) < 10:
                with torch.no_grad():
                    _, action_robot, _, eval_recurrent_hidden_states_robot = actor_critic_robot.act(
                        obs_robot,
                        eval_recurrent_hidden_states_robot,
                        eval_masks,
                        deterministic=True)
                    _, action_human, _, eval_recurrent_hidden_states_human = actor_critic_human.act(
                        obs_human,
                        eval_recurrent_hidden_states_human,
                        eval_masks,
                        deterministic=True)

                # Observe reward and next obs
                action = torch.cat((action_robot, action_human), dim=-1)
                obs, reward, done, infos = eval_envs.step(action)
                obs_robot = obs[:, :obs_robot_len]
                obs_human = obs[:, obs_robot_len:]

                eval_masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                                for done_ in done])
                for info in infos:
                    if 'episode' in info.keys():
                        eval_episode_rewards.append(info['episode']['r'])

            eval_envs.close()

            print(" Evaluation using {} episodes: mean reward {:.5f}\n".format(
                len(eval_episode_rewards), np.mean(eval_episode_rewards)))
            sys.stdout.flush()

        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.env_name,
                                  args.algo, args.num_env_steps)
            except IOError:
                pass
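
# update_linear_schedule, called at the top of every update loop in these examples, is
# assumed to be the small helper from the pytorch-a2c-ppo-acktr utilities; a sketch (the
# return value is only needed by the examples that log the current learning rate):
def update_linear_schedule(optimizer, epoch, total_num_epochs, initial_lr):
    # Decay the learning rate linearly from initial_lr to zero over training.
    lr = initial_lr - (initial_lr * (epoch / float(total_num_epochs)))
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
    return lr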
Example #8
def main():
    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.port)
        win = None

    envs, is_minigrid = make_vec_envs(args.env_name, args.seed, args.num_processes,
                        args.gamma, args.log_dir, args.add_timestep, device, False, args.num_frame_stack)

    actor_critic = Policy(envs.observation_space.shape, envs.action_space,
        base_kwargs={'recurrent': args.recurrent_policy, 'est_beta_value':args.est_beta_value},
        is_minigrid=is_minigrid, use_rew=args.use_reward)

    actor_critic.to(device)

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef,
                               args.entropy_coef, lr=args.lr,
                               lr_beta=args.lr_beta, reg_beta=args.reg_beta,
                               delib_center=args.delib_center,
                               eps=args.eps, alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch,
                         args.value_loss_coef, args.entropy_coef, lr=args.lr,
                               eps=args.eps,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef,
                               args.entropy_coef, acktr=True)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                        envs.observation_space.shape, envs.action_space,
                        actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
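    # Optionally corrupt observations with zero-mean Gaussian noise of std args.noise_obs
    # (the same noise is applied to every observation collected in the rollout loop below).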
    obs = obs + torch.randn_like(obs) * args.noise_obs
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)
    
    all_rewards_local = deque()
    all_frame_local = deque()
    all_beta_mean_local = deque()
    all_beta_std_local = deque()

    start = time.time()

    prev_value = 0
    eval_prev_value = 0
    prev_rew = torch.zeros(args.num_processes).to(device)
    eval_prev_rew = torch.zeros(args.num_processes).to(device)
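    # prev_value / prev_rew (and the eval_* counterparts passed to agent.update) are fed back
    # into the policy each step so its beta estimate can condition on the previous value
    # prediction and reward.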

    for j in range(num_updates):
        beta_value_list = []

        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            if args.algo == "acktr":
                # use optimizer's learning rate since it's hard-coded in kfac.py
                update_linear_schedule(agent.optimizer, j, num_updates, agent.optimizer.lr)
            else:
                update_linear_schedule(agent.optimizer, j, num_updates, args.lr)

        if args.algo == 'ppo' and args.use_linear_clip_decay:
            agent.clip_param = args.clip_param  * (1 - j / float(num_updates))

        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states, prev_value, beta_value = actor_critic.act(
                        rollouts.obs[step],
                        rollouts.recurrent_hidden_states[step],
                        rollouts.masks[step],
                        prev_value=prev_value,
                        prev_rew=prev_rew)

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)
            obs = obs + torch.randn_like(obs) * args.noise_obs

            # .cpu() is a no-op for CPU tensors, so this covers both devices.
            beta_value_list.append(beta_value.cpu().numpy())

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])

            prev_rew = reward * masks

            rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob, value, reward, masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(rollouts.obs[-1],
                                                rollouts.recurrent_hidden_states[-1],
                                                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau)

        value_loss, action_loss, dist_entropy, eval_prev_value, beta_loss, eval_prev_rew = agent.update(rollouts, eval_prev_value, eval_prev_rew)

        rollouts.after_update()

        # save for every interval-th episode or for the last epoch
        if (j % args.save_interval == 0 or j == num_updates - 1) and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            save_model = [save_model,
                          getattr(get_vec_normalize(envs), 'ob_rms', None)]
                
            save_name = "{}_seed_{}_est_beta_{}_lr_beta_{}_beta_reg_{}_entropy_{}_steps_{}_noise_obs_{}.pt".format(
                args.env_name, args.seed, args.est_beta_value, args.lr_beta, args.reg_beta,
                args.entropy_coef, args.num_steps, args.noise_obs)
            torch.save(save_model, os.path.join(save_path, save_name))

        total_num_steps = (j + 1) * args.num_processes * args.num_steps

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            end = time.time()
            print("Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n".
                format(j, total_num_steps,
                       int(total_num_steps / (end - start)),
                       len(episode_rewards),
                       np.mean(episode_rewards),
                       np.median(episode_rewards),
                       np.min(episode_rewards),
                       np.max(episode_rewards), dist_entropy,
                       value_loss, action_loss))

            experiment.log_metrics({"mean reward": np.mean(episode_rewards),
                                 "Value loss": value_loss, "Action Loss": action_loss,
                                 "Beta loss": beta_loss,
                                 "Beta mean": np.array(beta_value_list).mean(),
                                 "Beta std": np.array(beta_value_list).std()}, 
                                 step=j * args.num_steps * args.num_processes)

            all_rewards_local.append(np.mean(episode_rewards))
            all_frame_local.append(total_num_steps)
            all_beta_mean_local.append(np.array(beta_value_list).mean())
            all_beta_std_local.append(np.array(beta_value_list).std())

        if (args.eval_interval is not None
                and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            eval_envs = make_vec_envs(
                args.env_name, args.seed + args.num_processes, args.num_processes,
                args.gamma, eval_log_dir, args.add_timestep, device, True)

            vec_norm = get_vec_normalize(eval_envs)
            if vec_norm is not None:
                vec_norm.eval()
                vec_norm.ob_rms = get_vec_normalize(envs).ob_rms

            eval_episode_rewards = []

            obs = eval_envs.reset()
            eval_recurrent_hidden_states = torch.zeros(args.num_processes,
                            actor_critic.recurrent_hidden_state_size, device=device)
            eval_masks = torch.zeros(args.num_processes, 1, device=device)

            while len(eval_episode_rewards) < 10:
                with torch.no_grad():
                    _, action, _, eval_recurrent_hidden_states = actor_critic.act(
                        obs, eval_recurrent_hidden_states, eval_masks, deterministic=True)

                # Observe reward and next obs
                obs, reward, done, infos = eval_envs.step(action)

                eval_masks = torch.tensor([[0.0] if done_ else [1.0]
                                           for done_ in done],
                                           dtype=torch.float32,
                                           device=device)
                
                for info in infos:
                    if 'episode' in info.keys():
                        eval_episode_rewards.append(info['episode']['r'])

            eval_envs.close()

            print(" Evaluation using {} episodes: mean reward {:.5f}\n".
                format(len(eval_episode_rewards),
                       np.mean(eval_episode_rewards)))

        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.env_name,
                                  args.algo, args.num_env_steps)
            except IOError:
                pass

    with open(args.save_local_dir+"Rewards_"+str(args.env_name)+"_seed_"+str(args.seed)+"_est_beta_"+str(args.est_beta_value)+"_lr_beta_"+str(args.lr_beta)+"_beta_reg_"+str(args.reg_beta)+"_entropy_"+str(args.entropy_coef)+"_steps_"+str(args.num_steps)+"_noise_obs_"+str(args.noise_obs)+".pkl", 'wb') as f:
        pickle.dump(all_rewards_local, f)

    with open(args.save_local_dir+"Frames_"+str(args.env_name)+"_seed_"+str(args.seed)+"_est_beta_"+str(args.est_beta_value)+"_lr_beta_"+str(args.lr_beta)+"_beta_reg_"+str(args.reg_beta)+"_entropy_"+str(args.entropy_coef)+"_steps_"+str(args.num_steps)+"_noise_obs_"+str(args.noise_obs)+".pkl", 'wb') as f:
        pickle.dump(all_frame_local, f)

    with open(args.save_local_dir+"beta_mean_"+str(args.env_name)+"_seed_"+str(args.seed)+"_est_beta_"+str(args.est_beta_value)+"_lr_beta_"+str(args.lr_beta)+"_beta_reg_"+str(args.reg_beta)+"_entropy_"+str(args.entropy_coef)+"_steps_"+str(args.num_steps)+"_noise_obs_"+str(args.noise_obs)+".pkl", 'wb') as f:
        pickle.dump(all_beta_mean_local, f)

    with open(args.save_local_dir+"beta_std_"+str(args.env_name)+"_seed_"+str(args.seed)+"_est_beta_"+str(args.est_beta_value)+"_lr_beta_"+str(args.lr_beta)+"_beta_reg_"+str(args.reg_beta)+"_entropy_"+str(args.entropy_coef)+"_steps_"+str(args.num_steps)+"_noise_obs_"+str(args.noise_obs)+".pkl", 'wb') as f:
        pickle.dump(all_beta_std_local, f)
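
# rollouts.compute_returns(next_value, use_gae, gamma, tau), used by every example, bootstraps
# the return targets from the final value estimate. A sketch of the usual RolloutStorage method
# (assuming the standard layout where masks[t + 1] zeroes out terminated environments):
def compute_returns(self, next_value, use_gae, gamma, tau):
    if use_gae:
        # Generalized Advantage Estimation: returns[t] = GAE advantage + value baseline.
        self.value_preds[-1] = next_value
        gae = 0
        for step in reversed(range(self.rewards.size(0))):
            delta = (self.rewards[step]
                     + gamma * self.value_preds[step + 1] * self.masks[step + 1]
                     - self.value_preds[step])
            gae = delta + gamma * tau * self.masks[step + 1] * gae
            self.returns[step] = gae + self.value_preds[step]
    else:
        # Plain discounted returns, bootstrapped from the last value prediction.
        self.returns[-1] = next_value
        for step in reversed(range(self.rewards.size(0))):
            self.returns[step] = (self.returns[step + 1] * gamma * self.masks[step + 1]
                                  + self.rewards[step])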
Example #9
def main():
    setup_logger(args.verbose, args.model_name)

    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.visdom_port)
        win = None

    envs = make_vec_envs(args.env_name,
                         args.seed,
                         args.num_processes,
                         args.gamma,
                         config.log_directory,
                         args.add_timestep,
                         device,
                         allow_early_resets=True,
                         num_frame_stack=None,
                         ip=args.ip,
                         start_port=args.port,
                         wait_action=args.wait_action,
                         reset_step=args.reset_step)

    actor_critic = Policy(envs.observation_space.shape,
                          envs.action_space,
                          base_kwargs={
                              'recurrent': args.recurrent_policy,
                              'hidden_size': args.hidden_layer_size
                          })
    # load model
    if args.load_path is not None:
        logger.info("loading model: {}".format(args.load_path))
        actor_critic = torch.load(args.load_path)

    actor_critic.to(device)

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm,
                         use_clipped_value_loss=True)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=20)
    episode_distance = deque(maxlen=20)

    if args.use_wandb:
        wandb.init(project='LumbarSpine',
                   config=args,
                   group=args.model_name,
                   resume=args.resume_wandb)
        # wandb.watch(actor_critic)

        if wandb.run.resumed:
            logger.info('Wandb resumed!')

    # --------------------- train ----------------------------
    start = time.time()
    for iter in range(num_updates):
        logger.info('Training {}/{} updates'.format(iter, num_updates))
        if args.test:
            break

        # todo: maybe this is what is making things confusing?!
        envs.reset()

        # HACKY way of reconnecting back the main env to avoid packet drop
        # for i in range(args.num_processes):
        #     envs.venv.envs[i].env.net.connect(args.ip, args.port)

        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            if args.algo == "acktr":
                # use optimizer's learning rate since it's hard-coded in kfac.py
                lr = update_linear_schedule(agent.optimizer, iter, num_updates,
                                            agent.optimizer.lr)
            else:
                lr = update_linear_schedule(agent.optimizer, iter, num_updates,
                                            args.lr)
        else:
            lr = args.lr

        if args.algo == 'ppo' and args.use_linear_clip_decay:
            agent.clip_param = args.clip_param * (1 -
                                                  iter / float(num_updates))

        distances = []
        vels = []
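        # distances / vels collect the per-step 'distance' and 'vel' diagnostics reported by
        # the env during this update; their means are logged via log_info below.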

        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])
                    episode_distance.append(info['episode_']['distance'])
                if 'distance' in info.keys():
                    distances.append(info['distance'])
                    vels.append(info['vel'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])

            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)
        value_loss, action_loss, dist_entropy = agent.update(rollouts)
        rollouts.after_update()

        # save for every interval-th episode or for the last epoch
        if iter % args.save_interval == 0 or iter == num_updates - 1:
            save_path = os.path.join(config.trained_directory,
                                     args.algo + "-" + args.env_name + ".pt")
            logger.info("Saving model: {}".format(save_path))
            torch.save(actor_critic, save_path)

        total_num_steps = (iter + 1) * args.num_processes * args.num_steps

        log_info = {
            'average_vel': np.mean(vels),
            'average_distance': np.mean(distances),
            'value_loss': value_loss,
            'action_loss': action_loss,
            'dist_entropy': dist_entropy,
            'lr': lr,
            'agent_clip_param': agent.clip_param,
        }

        if len(episode_rewards) > 1:
            log_info.update({
                'mean_episode_reward': np.mean(episode_rewards),
                'median_episode_reward': np.median(episode_rewards),
                'min_episode_reward': np.min(episode_rewards),
                'max_episode_reward': np.max(episode_rewards),
                'mean_episode_distance': np.mean(episode_distance)
            })

        # todo: switch to episodic and cover other locations. This log is only for episodic
        if iter % args.episode_log_interval == 0 and len(episode_rewards) > 1:
            end = time.time()
            logger.info(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median "
                "reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n".format(
                    iter,
                    total_num_steps, int(total_num_steps / (end - start)),
                    len(episode_rewards), np.mean(episode_rewards),
                    np.median(episode_rewards), np.min(episode_rewards),
                    np.max(episode_rewards), dist_entropy, value_loss,
                    action_loss))

        # --------------------- evaluate ----------------------------
        # Evaluate on a single environment
        if args.eval_interval is not None and iter % args.eval_interval == 0:
            logger.info('Evaluate')

            # todo: what is allow_early_resets? (False for main, True for eval)
            eval_envs = make_vec_envs(
                args.env_name,
                args.seed,
                1,  # args.num_processes,
                args.gamma,
                config.log_directory,
                args.add_timestep,
                device,
                allow_early_resets=False,
                num_frame_stack=None,
                ip=args.ip,
                start_port=args.port,
                wait_action=args.wait_action,
                eval_mode=True)

            eval_episode_rewards = []
            rewards = []

            obs = eval_envs.reset()
            # Only one evaluation env is created above, so size the recurrent state and
            # masks for a single process rather than args.num_processes.
            eval_recurrent_hidden_states = torch.zeros(
                1, actor_critic.recurrent_hidden_state_size, device=device)
            eval_masks = torch.zeros(1, 1, device=device)

            eval_distances = []

            # while len(eval_episode_rewards) < 10:
            for eval_step in range(args.num_steps_eval):
                with torch.no_grad():
                    _, action, _, eval_recurrent_hidden_states = actor_critic.act(
                        obs,
                        eval_recurrent_hidden_states,
                        eval_masks,
                        deterministic=True)

                # Observe reward and next obs
                obs, reward, done, infos = eval_envs.step(action)

                eval_masks = torch.tensor([[0.0] if done_ else [1.0]
                                           for done_ in done],
                                          dtype=torch.float32,
                                          device=device)
                logger.log(msg='eval step reward: {}'.format(reward), level=18)
                logger.log(msg='eval step obs: {}'.format(obs), level=18)

                for info in infos:
                    if 'episode' in info.keys():
                        eval_episode_rewards.append(info['episode']['r'])
                    if 'distance' in info.keys():
                        eval_distances.append(info['distance'])

                rewards.extend(reward)

            eval_envs.close()

            if args.episodic:
                logger.info(
                    "Evaluation using {} episodes: mean reward {:.5f}\n".
                    format(len(eval_episode_rewards),
                           np.mean(eval_episode_rewards)))
            else:
                logger.info(
                    "Evaluation using {} steps: mean reward {:.5f}\n".format(
                        args.num_steps_eval, np.mean(rewards)))

            # update info
            log_info.update({
                'mean_eval_reward': np.mean(rewards),
                'eval_average_distance': np.mean(eval_distances)
            })

        if args.vis and iter % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                logger.info("Visdom log update")
                win = visdom_plot(viz, win, config.visdom_log_directory,
                                  args.env_name, args.algo, args.num_env_steps)
            except IOError:
                pass

        if iter % args.log_interval == 0:
            logger.info('{}:{}  {}'.format(iter, num_updates, log_info))
            if args.use_wandb:
                wandb.log(log_info)

    # -------------------------------------- testing -------------------------------------
    if args.test:
        logger.info('Evaluate')

        # todo: what is allow_early_resets? (False for main, True for eval)
        eval_envs = make_vec_envs(args.env_name,
                                  args.seed,
                                  args.num_processes,
                                  args.gamma,
                                  config.log_directory,
                                  args.add_timestep,
                                  device,
                                  allow_early_resets=False,
                                  num_frame_stack=None,
                                  ip=args.ip,
                                  start_port=args.port,
                                  wait_action=args.wait_action,
                                  eval_mode=True)

        eval_episode_rewards = []
        rewards = []

        obs = eval_envs.reset()
        eval_recurrent_hidden_states = torch.zeros(
            args.num_processes,
            actor_critic.recurrent_hidden_state_size,
            device=device)
        eval_masks = torch.zeros(args.num_processes, 1, device=device)

        # while len(eval_episode_rewards) < 10:
        for eval_step in range(args.num_steps_eval):
            with torch.no_grad():
                _, action, _, eval_recurrent_hidden_states = actor_critic.act(
                    obs,
                    eval_recurrent_hidden_states,
                    eval_masks,
                    deterministic=True)

            # Observe reward and next obs
            obs, reward, done, infos = eval_envs.step(action)

            eval_masks = torch.tensor([[0.0] if done_ else [1.0]
                                       for done_ in done],
                                      dtype=torch.float32,
                                      device=device)
            logger.info('eval step reward: {}'.format(reward))
            logger.log(msg='eval step obs: {}'.format(obs), level=18)

            if args.episodic:
                for info in infos:
                    if 'episode' in info.keys():
                        eval_episode_rewards.append(info['episode']['r'])
            else:
                rewards.append(reward)

        eval_envs.close()

        if args.episodic:
            logger.info(
                "Evaluation using {} episodes: mean reward {:.5f}\n".format(
                    len(eval_episode_rewards), np.mean(eval_episode_rewards)))
        else:
            logger.info(
                "Evaluation using {} steps: mean reward {:.5f}\n".format(
                    args.num_steps_eval, np.mean(rewards)))
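
# get_vec_normalize, used throughout these examples to copy ob_rms between training and
# evaluation envs, is assumed to just unwrap the vectorized env until it reaches the
# VecNormalize layer; a duck-typed sketch under that assumption:
def get_vec_normalize(venv):
    if hasattr(venv, 'ob_rms'):
        # VecNormalize exposes the running observation statistics as `ob_rms`.
        return venv
    if hasattr(venv, 'venv'):
        # Unwrap one wrapper layer and keep looking.
        return get_vec_normalize(venv.venv)
    return None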