Example #1
def main():
    #Parse arguments
    #----------------------------
    parser = argparse.ArgumentParser()
    parser.add_argument("--env", default="CartPole-v0")
    parser.add_argument("--conti", action="store_true")
    parser.add_argument("--unwrap", action="store_true")
    args = parser.parse_args()

    #Parameters
    #----------------------------
    n_env = 8
    n_step = 128
    mb_size = n_env * n_step
    sample_mb_size = 64
    sample_n_epoch = 4
    clip_val = 0.2
    lamb = 0.95
    gamma = 0.99
    ent_weight = 0.0
    max_grad_norm = 0.5
    beta = 0.1
    lr = 1e-4
    n_iter = 30000
    disp_step = 30
    save_step = 300
    save_dir = "./save"
    device = "cuda:0"
    expert_path = "../save/{}_traj.pkl".format(args.env)

    #Create multiple environments
    #----------------------------
    env = MultiEnv([
        make_env(i,
                 env_id=args.env,
                 unwrap=args.unwrap,
                 rand_seed=int(time.time())) for i in range(n_env)
    ])

    if args.conti:
        s_dim = env.ob_space.shape[0]
        a_dim = env.ac_space.shape[0]
    else:
        s_dim = env.ob_space.shape[0]
        a_dim = env.ac_space.n

    runner = EnvRunner(env,
                       s_dim,
                       a_dim,
                       n_step,
                       gamma,
                       lamb,
                       device=device,
                       conti=args.conti)

    #Load expert trajectories
    #----------------------------
    if os.path.exists(expert_path):
        with open(expert_path, "rb") as f:
            s_real, a_real = pkl.load(f)
        sa_real = []

        if args.conti:
            for i in range(len(s_real)):
                sa_real.append(np.concatenate([s_real[i], a_real[i]], 1))
        else:
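            #Discrete case: one-hot encode actions before concatenating them with states for the discriminator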
            for i in range(len(s_real)):
                a_real_onehot = np.zeros((len(a_real[i]), a_dim),
                                         dtype=np.float32)

                for j in range(len(a_real[i])):
                    a_real_onehot[j, a_real[i][j]] = 1

                sa_real.append(np.concatenate([s_real[i], a_real_onehot], 1))

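        #Stack all expert (state, action) pairs into a single array of real samples for the discriminator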
        sa_real = np.concatenate(sa_real, 0)
    else:
        print("ERROR: No expert trajectory file found")
        sys.exit(1)

    #Create model
    #----------------------------
    policy_net = PolicyNet(s_dim, a_dim, conti=args.conti).to(device)
    value_net = ValueNet(s_dim).to(device)
    dis_net = DiscriminatorNet(s_dim + a_dim).to(device)
    agent = PPO(policy_net,
                value_net,
                dis_net,
                a_dim,
                beta,
                lr,
                max_grad_norm,
                ent_weight,
                clip_val,
                sample_n_epoch,
                sample_mb_size,
                mb_size,
                device=device,
                conti=args.conti)

    #Load model
    #----------------------------
    if not os.path.exists(save_dir):
        os.mkdir(save_dir)

    if os.path.exists(os.path.join(save_dir, "{}.pt".format(args.env))):
        print("Loading the model ... ", end="")
        checkpoint = torch.load(
            os.path.join(save_dir, "{}.pt".format(args.env)))
        policy_net.load_state_dict(checkpoint["PolicyNet"])
        value_net.load_state_dict(checkpoint["ValueNet"])
        dis_net.load_state_dict(checkpoint["DiscriminatorNet"])
        agent.beta = checkpoint["beta"]
        start_it = checkpoint["it"]
        print("Done.")
    else:
        start_it = 0

    #Start training
    #----------------------------
    t_start = time.time()
    policy_net.train()
    value_net.train()

    for it in range(start_it, n_iter):
        #Run the environment
        with torch.no_grad():
            mb_obs, mb_actions, mb_old_a_logps, mb_values, mb_returns = runner.run(
                policy_net, value_net, dis_net)
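            #Advantages = returns - value estimates, normalized to zero mean / unit std for stable updates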
            mb_advs = mb_returns - mb_values
            mb_advs = (mb_advs - mb_advs.mean()) / (mb_advs.std() + 1e-6)

        #Train
        pg_loss, v_loss, ent, dis_loss, dis_real, dis_fake, avg_kl = agent.train(
            policy_net, value_net, dis_net, mb_obs, mb_actions, mb_values,
            mb_advs, mb_returns, mb_old_a_logps, sa_real)

        #Print the result
        if it % disp_step == 0:
            agent.lr_decay(it, n_iter)
            policy_net.eval()
            value_net.eval()
            n_sec = time.time() - t_start
            fps = int((it - start_it) * n_env * n_step / n_sec)
            (mean_true_return, std_true_return, mean_return, std_return,
             mean_len) = runner.get_performance()
            policy_net.train()
            value_net.train()

            print("[{:5d} / {:5d}]".format(it, n_iter))
            print("----------------------------------")
            print("Timesteps        = {:d}".format((it - start_it) * mb_size))
            print("Elapsed time     = {:.2f} sec".format(n_sec))
            print("FPS              = {:d}".format(fps))
            print("actor loss       = {:.6f}".format(pg_loss))
            print("critic loss      = {:.6f}".format(v_loss))
            print("dis loss         = {:.6f}".format(dis_loss))
            print("entropy          = {:.6f}".format(ent))
            print("avg_kl           = {:.6f}".format(avg_kl))
            print("beta             = {:.6f}".format(agent.beta))
            print("mean true return = {:.6f}".format(mean_true_return))
            print("mean return      = {:.6f}".format(mean_return))
            print("mean length      = {:.2f}".format(mean_len))
            print("dis_real         = {:.3f}".format(dis_real))
            print("dis_fake         = {:.3f}".format(dis_fake))
            print()

        #Save model
        if it % save_step == 0:
            print("Saving the model ... ", end="")
            torch.save(
                {
                    "beta": agent.beta,
                    "it": it,
                    "PolicyNet": policy_net.state_dict(),
                    "ValueNet": value_net.state_dict(),
                    "DiscriminatorNet": dis_net.state_dict()
                }, os.path.join(save_dir, "{}.pt".format(args.env)))
            print("Done.")
            print()

    env.close()
Example #2
def main():
    env_name = 'BipedalWalker-v3'
    render = False
    solved_reward = 60
    log_interval = 20
    max_episodes = 10000
    max_timesteps = 1500

    update_timestep = 8000
    action_std = 0.5
    K_epochs = 80
    eps_clip = 0.2
    gamma = 0.99

    lr = 3e-4
    betas = (0.9, 0.999)

    env = gym.make(env_name)
    s_dim = env.observation_space.shape[0]
    a_dim = env.action_space.shape[0]
    # s_dim, a_dim = 24, 4

    # print(env.observation_space.high)
    # print(env.observation_space.low)
    # print(env.action_space.high)
    # print(env.action_space.low)
    # assert False

    device = torch.device('cuda')

    memory = Memory()
    ppo = PPO(s_dim, a_dim, action_std, lr, betas, gamma, K_epochs, eps_clip,
              device)

    running_reward = 0
    avg_length = 0
    time_step = 0
    total_time = time.time()

    for eps in range(max_episodes):
        state = env.reset()
        if eps % log_interval == 0:
            start_t = time.time()
        for t in range(max_timesteps):
            time_step += 1
            action = ppo.choose_action(state, memory)
            # action.shape = (a_dim,)
            state, reward, done, _ = env.step(action)

            memory.rewards.append(reward)
            memory.is_terminals.append(done)

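            # Update the policy every update_timestep environment steps, then clear the rollout memory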
            if time_step % update_timestep == 0:
                ppo.update(memory)
                memory.clear_memory()
                time_step = 0

            running_reward += reward

            if render:
                env.render()

            if done:
                break

        avg_length += (t + 1)

        # solved_reward = 60
        # log_interval = 20
        # max_episodes = 10000
        # max_timesteps = 1500
        # update_timestep = 8000

        if running_reward > log_interval * solved_reward:
            print("### Solved! ###")
            torch.save(ppo.policy.state_dict(), 'ckpt/ppo_solved.pt')
            break

        if (eps + 1) % 500 == 0:
            torch.save(ppo.policy.state_dict(), 'ckpt/ppo.pt')

        if (eps + 1) % log_interval == 0:
            end_t = time.time()
            m, s = epoch_time(start_t, end_t)
            t_m, t_s = epoch_time(total_time, end_t)

            avg_length = int(avg_length / log_interval)
            running_reward = int(running_reward / log_interval)

            print(f"Episode {eps+1}")
            print(f"\tTime: {m}m {s}s | Total Time: {t_m}m {t_s}s")
            print(f"\tAvg length: {avg_length}")
            print(f"\tAvg reward: {running_reward}")
            running_reward = 0
            avg_length = 0
Example #3
assert args.env_name in env_lst

env = NormalizedGymEnv(args.env_name, normalize_obs=True)
'''
#for pybullet envs
import pybullet_envs
env = NormalizedGymEnv("HopperBulletEnv-v0",normalize_obs=True)
'''
action_space = env.action_space.shape[0]
state_space = env.observation_space.shape[0]
device = 'cuda' if torch.cuda.is_available() and args.use_cuda else 'cpu'
agent = PPO(state_space, action_space, args.hidden_dim, args.learning_rate,
            args.entropy_coef, args.critic_coef, args.gamma, args.lmbda,
            args.eps_clip, args.K_epoch, args.minibatch_size, device).to(device)

if args.load != 'no':
    agent.load_state_dict(torch.load("./model_weights/" + args.load))

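#Optional TensorBoard logging (SummaryWriter writes to ./runs by default)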
if args.tensorboard:
    from torch.utils.tensorboard import SummaryWriter
    writer = SummaryWriter()
else:
    writer = None

score_lst = []
Example #4
def main():
    #Parse arguments
    #----------------------------
    parser = argparse.ArgumentParser()
    parser.add_argument("--env", default="CartPole-v0")
    parser.add_argument("--conti", action="store_true")
    parser.add_argument("--unwrap", action="store_true")
    args = parser.parse_args()

    #Parameters
    #----------------------------
    n_env = 8
    n_step = 128
    mb_size = n_env * n_step
    sample_mb_size = 64
    sample_n_epoch = 4
    clip_val = 0.2
    lamb = 0.95
    gamma = 0.99
    ent_weight = 0.0
    max_grad_norm = 0.5
    lr = 1e-4
    n_iter = 30000
    disp_step = 30
    save_step = 300
    save_dir = "./save"
    device = "cuda:0"

    #Create multiple environments
    #----------------------------
    env = MultiEnv([
        make_env(i,
                 env_id=args.env,
                 unwrap=args.unwrap,
                 rand_seed=int(time.time())) for i in range(n_env)
    ])

    if args.conti:
        s_dim = env.ob_space.shape[0]
        a_dim = env.ac_space.shape[0]
    else:
        s_dim = env.ob_space.shape[0]
        a_dim = env.ac_space.n

    runner = EnvRunner(env,
                       s_dim,
                       a_dim,
                       n_step,
                       gamma,
                       lamb,
                       device=device,
                       conti=args.conti)

    #Create model
    #----------------------------
    policy_net = PolicyNet(s_dim, a_dim, conti=args.conti).to(device)
    value_net = ValueNet(s_dim).to(device)
    agent = PPO(policy_net,
                value_net,
                lr,
                max_grad_norm,
                ent_weight,
                clip_val,
                sample_n_epoch,
                sample_mb_size,
                mb_size,
                device=device)

    #Load model
    #----------------------------
    if not os.path.exists(save_dir):
        os.mkdir(save_dir)

    if os.path.exists(os.path.join(save_dir, "{}.pt".format(args.env))):
        print("Loading the model ... ", end="")
        checkpoint = torch.load(
            os.path.join(save_dir, "{}.pt".format(args.env)))
        policy_net.load_state_dict(checkpoint["PolicyNet"])
        value_net.load_state_dict(checkpoint["ValueNet"])
        start_it = checkpoint["it"]
        print("Done.")
    else:
        start_it = 0

    #Start training
    #----------------------------
    t_start = time.time()
    policy_net.train()
    value_net.train()

    for it in range(start_it, n_iter):
        #Run the environment
        with torch.no_grad():
            mb_obs, mb_actions, mb_old_a_logps, mb_values, mb_returns = runner.run(
                policy_net, value_net)
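            #Advantages = returns - value estimates, normalized for stable PPO updates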
            mb_advs = mb_returns - mb_values
            mb_advs = (mb_advs - mb_advs.mean()) / (mb_advs.std() + 1e-6)

        #Train
        pg_loss, v_loss, ent = agent.train(policy_net, value_net, mb_obs,
                                           mb_actions, mb_values, mb_advs,
                                           mb_returns, mb_old_a_logps)

        #Print the result
        if it % disp_step == 0:
            agent.lr_decay(it, n_iter)
            policy_net.eval()
            value_net.eval()
            n_sec = time.time() - t_start
            fps = int((it - start_it) * n_env * n_step / n_sec)
            mean_return, std_return, mean_len = runner.get_performance()
            policy_net.train()
            value_net.train()

            print("[{:5d} / {:5d}]".format(it, n_iter))
            print("----------------------------------")
            print("Timesteps    = {:d}".format((it - start_it) * mb_size))
            print("Elapsed time = {:.2f} sec".format(n_sec))
            print("FPS          = {:d}".format(fps))
            print("actor loss   = {:.6f}".format(pg_loss))
            print("critic loss  = {:.6f}".format(v_loss))
            print("entropy      = {:.6f}".format(ent))
            print("mean return  = {:.6f}".format(mean_return))
            print("mean length  = {:.2f}".format(mean_len))
            print()

        #Save model
        if it % save_step == 0:
            print("Saving the model ... ", end="")
            torch.save(
                {
                    "it": it,
                    "PolicyNet": policy_net.state_dict(),
                    "ValueNet": value_net.state_dict()
                }, os.path.join(save_dir, "{}.pt".format(args.env)))
            print("Done.")
            print()

    env.close()
Example #5
def main():
    env = make_env()
    set_global_seeds(env, args.seed)

    agent = PPO(env=env)

    batch_steps = args.n_envs * args.batch_steps  # number of steps per update

    if args.save_interval and logger.get_dir():
        # some saving jobs
        pass

    ep_info_buffer = deque(maxlen=100)
    t_train_start = time.time()
    n_updates = args.n_steps // batch_steps
    runner = Runner(env, agent)

    for update in range(1, n_updates + 1):
        t_start = time.time()
        frac = 1.0 - (update - 1.0) / n_updates
        lr_now = args.lr  # kept constant here; could be annealed with frac
        clip_range_now = args.clip_range  # kept constant here; could be annealed with frac
        obs, returns, masks, acts, vals, neglogps, advs, rewards, ep_infos = \
            runner.run(args.batch_steps, frac)
        ep_info_buffer.extend(ep_infos)
        loss_infos = []

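        # Several epochs of minibatch SGD over the shuffled rollout data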
        idxs = np.arange(batch_steps)
        for _ in range(args.n_epochs):
            np.random.shuffle(idxs)
            for start in range(0, batch_steps, args.minibatch):
                end = start + args.minibatch
                mb_idxs = idxs[start:end]
                minibatch = [
                    arr[mb_idxs] for arr in
                    [obs, returns, masks, acts, vals, neglogps, advs]
                ]
                loss_infos.append(
                    agent.train(lr_now, clip_range_now, *minibatch))

        t_now = time.time()
        time_this_batch = t_now - t_start
        if update % args.log_interval == 0:
            ev = float(explained_variance(vals, returns))
            logger.logkv('updates', str(update) + '/' + str(n_updates))
            logger.logkv('serial_steps', update * args.batch_steps)
            logger.logkv('total_steps', update * batch_steps)
            logger.logkv('time', time_this_batch)
            logger.logkv('fps', int(batch_steps / (t_now - t_start)))
            logger.logkv('total_time', t_now - t_train_start)
            logger.logkv("explained_variance", ev)
            logger.logkv('avg_reward',
                         np.mean([e['r'] for e in ep_info_buffer]))
            logger.logkv('avg_ep_len',
                         np.mean([e['l'] for e in ep_info_buffer]))
            logger.logkv('adv_mean', np.mean(returns - vals))
            logger.logkv('adv_variance', np.std(returns - vals)**2)
            loss_infos = np.mean(loss_infos, axis=0)
            for loss_name, loss_info in zip(agent.loss_names, loss_infos):
                logger.logkv(loss_name, loss_info)
            logger.dumpkvs()

        if (args.save_interval and update % args.save_interval == 0
                and logger.get_dir()):
            pass
    env.close()
Example #6
def main():
    print("#######")
    print(
        "WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards"
    )
    print("#######")

    os.environ['OMP_NUM_THREADS'] = '1'

    if args.vis:
        from visdom import Visdom
        viz = Visdom()
        win = None

    envs = [
        make_env(args.env_name, args.seed, i, args.log_dir)
        for i in range(args.num_processes)
    ]

    if args.num_processes > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)

    # Maxime: commented this out because it very much changes the behavior
    # of the code for seemingly arbitrary reasons
    #if len(envs.observation_space.shape) == 1:
    #    envs = VecNormalize(envs)

    obs_shape = envs.observation_space.shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])

    obs_numel = reduce(operator.mul, obs_shape, 1)

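    # Pick a policy architecture: CNN for 3-D observations with more than 1024 elements, otherwise a (recurrent) MLP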
    if len(obs_shape) == 3 and obs_numel > 1024:
        actor_critic = CNNPolicy(obs_shape[0], envs.action_space,
                                 args.recurrent_policy)
    elif args.recurrent_policy:
        actor_critic = RecMLPPolicy(obs_numel, envs.action_space)
    else:
        actor_critic = MLPPolicy(obs_numel, envs.action_space)

    # Maxime: log some info about the model and its size
    # call function PPO.modelsize() for this to happen
    '''
	modelSize = 0
	for p in actor_critic.parameters():
		pSize = reduce(operator.mul, p.size(), 1)
		modelSize += pSize
	'''

    if envs.action_space.__class__.__name__ == "Discrete":
        action_shape = 1
    else:
        action_shape = envs.action_space.shape[0]

    if args.cuda:
        actor_critic.cuda()
    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape,
                              envs.action_space, actor_critic.state_size)

    if args.algo == 'a2c':
        Agent = A2C(actor_critic, rollouts, args.lr, args.eps,
                    args.num_processes, obs_shape, args.use_gae, args.gamma,
                    args.tau, args.recurrent_policy, args.num_mini_batch,
                    args.cuda, args.log_interval, args.vis, args.env_name,
                    args.log_dir, args.entropy_coef, args.num_stack,
                    args.num_steps, args.ppo_epoch, args.clip_param,
                    args.max_grad_norm, args.alpha, args.save_dir,
                    args.vis_interval, args.save_interval, num_updates,
                    action_shape, args.value_loss_coef)

    elif args.algo == 'ppo':
        Agent = PPO(actor_critic, rollouts, args.lr, args.eps,
                    args.num_processes, obs_shape, args.use_gae, args.gamma,
                    args.tau, args.recurrent_policy, args.num_mini_batch,
                    args.cuda, args.log_interval, args.vis, args.env_name,
                    args.log_dir, args.entropy_coef, args.num_stack,
                    args.num_steps, args.ppo_epoch, args.clip_param,
                    args.max_grad_norm, args.save_dir, args.vis_interval,
                    args.save_interval, num_updates, action_shape,
                    args.value_loss_coef)

    elif args.algo == 'acktr':
        Agent = ACKTR(actor_critic, rollouts, args.lr, args.eps,
                      args.num_processes, obs_shape, args.use_gae, args.gamma,
                      args.tau, args.recurrent_policy, args.num_mini_batch,
                      args.cuda, args.log_interval, args.vis, args.env_name,
                      args.log_dir, args.entropy_coef, args.num_stack,
                      args.num_steps, args.ppo_epoch, args.clip_param,
                      args.max_grad_norm, args.alpha, args.save_dir,
                      args.vis_interval, args.save_interval, num_updates,
                      action_shape, args.value_loss_coef)
    print(str(actor_critic))
    print('Total model size: %d' % Agent.modelsize())

    obs = envs.reset()
    Agent.update_current_obs(obs, envs)
    Agent.rollouts.observations[0].copy_(Agent.current_obs)

    # These variables are used to compute average rewards for all processes.
    Agent.train(envs)
Example #7
def main():
    #Parse arguments
    #----------------------------
    parser = argparse.ArgumentParser()
    parser.add_argument("--env", default="BipedalWalker-v3")
    parser.add_argument("--discrete", action="store_true")
    parser.add_argument("--unwrap", action="store_true")
    args = parser.parse_args()

    #Parameters
    #----------------------------
    clip_val = 0.2
    sample_mb_size = 64
    sample_n_epoch = 4
    lamb = 0.95
    gamma = 0.99
    ent_weight = 0.01
    max_grad_norm = 0.5
    lr = 1e-4
    n_iter = 10000
    disp_step = 30
    save_step = 300
    save_dir = "./save"
    device = "cuda:0" if torch.cuda.is_available() else "cpu"

    #Create environment
    #----------------------------
    env = gym.make(args.env)

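    #Discrete action spaces take the action dimension from action_space.n; continuous ones from action_space.shape[0]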
    if args.discrete:
        s_dim = env.observation_space.shape[0]
        a_dim = env.action_space.n
    else:
        s_dim = env.observation_space.shape[0]
        a_dim = env.action_space.shape[0]

    if args.unwrap:
        env = env.unwrapped

    runner = EnvRunner(s_dim,
                       a_dim,
                       gamma,
                       lamb,
                       max_step=2048,
                       device=device,
                       conti=not args.discrete)

    #Create model
    #----------------------------
    policy_net = PolicyNet(s_dim, a_dim, conti=not args.discrete).to(device)
    value_net = ValueNet(s_dim).to(device)
    agent = PPO(policy_net,
                value_net,
                lr,
                max_grad_norm,
                ent_weight,
                clip_val,
                sample_n_epoch,
                sample_mb_size,
                device=device)

    #Load model
    #----------------------------
    if not os.path.exists(save_dir):
        os.mkdir(save_dir)

    if os.path.exists(os.path.join(save_dir, "{}.pt".format(args.env))):
        print("Loading the model ... ", end="")
        checkpoint = torch.load(
            os.path.join(save_dir, "{}.pt".format(args.env)))
        policy_net.load_state_dict(checkpoint["PolicyNet"])
        value_net.load_state_dict(checkpoint["ValueNet"])
        start_it = checkpoint["it"]
        print("Done.")
    else:
        start_it = 0

    #Start training
    #----------------------------
    t_start = time.time()
    policy_net.train()
    value_net.train()
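    #Accumulators for the periodic (disp_step) performance report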
    mean_total_reward = 0
    mean_length = 0

    for it in range(start_it, n_iter):
        #Run the environment
        with torch.no_grad():
            mb_obs, mb_actions, mb_old_a_logps, mb_values, mb_returns, mb_rewards = runner.run(
                env, policy_net, value_net)
            mb_advs = mb_returns - mb_values
            mb_advs = (mb_advs - mb_advs.mean()) / (mb_advs.std() + 1e-6)

        #Train
        pg_loss, v_loss, ent = agent.train(policy_net, value_net, mb_obs,
                                           mb_actions, mb_values, mb_advs,
                                           mb_returns, mb_old_a_logps)
        mean_total_reward += mb_rewards.sum()
        mean_length += len(mb_obs)
        print("[Episode {:4d}] total reward = {:.6f}, length = {:d}".format(
            it, mb_rewards.sum(), len(mb_obs)))

        #Print the result
        if it % disp_step == 0:
            print("\n[{:5d} / {:5d}]".format(it, n_iter))
            print("----------------------------------")
            print("Elapsed time = {:.2f} sec".format(time.time() - t_start))
            print("actor loss   = {:.6f}".format(pg_loss))
            print("critic loss  = {:.6f}".format(v_loss))
            print("entropy      = {:.6f}".format(ent))
            print("mean return  = {:.6f}".format(mean_total_reward /
                                                 disp_step))
            print("mean length  = {:.2f}".format(mean_length / disp_step))
            print()

            agent.lr_decay(it, n_iter)
            mean_total_reward = 0
            mean_length = 0

        #Save model
        if it % save_step == 0:
            print("Saving the model ... ", end="")
            torch.save(
                {
                    "it": it,
                    "PolicyNet": policy_net.state_dict(),
                    "ValueNet": value_net.state_dict()
                }, os.path.join(save_dir, "{}.pt".format(args.env)))
            print("Done.")
            print()

    env.close()