Ejemplo n.º 1
0
def main():
    """Roll out a saved policy and store the visited states and actions.

    Command-line flags:
        --env      Gym environment id (default "CartPole-v0").
        --conti    Read the action dimension from ``action_space.shape[0]``
                   instead of ``action_space.n`` and store actions as float32.
        --render   Call ``env.render()`` before every step.
        --unwrap   Replace the environment with ``env.unwrapped``.
        --episode  Number of episodes to record (default 1000).

    The policy weights are read from ``./save/<env>.pt`` (checkpoint key
    "PolicyNet"). The result is pickled to ``./save/<env>_traj.pkl`` as the
    tuple ``(s_traj, a_traj)``, each a list holding one array per episode.

    Raises:
        SystemExit: with status 1 when no saved model exists, so that
            trajectories of an untrained network are never written out.
    """
    # Parse arguments
    #----------------------------
    parser = argparse.ArgumentParser()
    parser.add_argument("--env", default="CartPole-v0")
    parser.add_argument("--conti", action="store_true")
    parser.add_argument("--render", action="store_true")
    parser.add_argument("--unwrap", action="store_true")
    # type=int is required: without it a value given on the command line
    # stays a str and range(n_episode) raises TypeError.
    parser.add_argument("--episode", type=int, default=1000)
    args = parser.parse_args()

    # Parameters
    #----------------------------
    env_id = args.env
    save_dir = "./save"
    # Fall back to the CPU when CUDA is not available.
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    n_episode = args.episode
    model_path = os.path.join(save_dir, "{}.pt".format(env_id))
    traj_path = os.path.join(save_dir, "{}_traj.pkl".format(env_id))

    # Create environment
    #----------------------------
    env = gym.make(env_id)
    s_dim = env.observation_space.shape[0]

    if args.conti:
        a_dim = env.action_space.shape[0]
    else:
        a_dim = env.action_space.n

    if args.unwrap:
        env = env.unwrapped

    # Create model
    #----------------------------
    policy_net = PolicyNet(s_dim, a_dim, conti=args.conti).to(device)

    # Load model
    #----------------------------
    if not os.path.exists(model_path):
        print("Error: No model saved")
        env.close()
        raise SystemExit(1)

    print("Loading the model ... ", end="")
    # map_location lets a checkpoint saved on a GPU load on the chosen device.
    checkpoint = torch.load(model_path, map_location=device)
    policy_net.load_state_dict(checkpoint["PolicyNet"])
    print("Done.")

    # Start playing
    #----------------------------
    policy_net.eval()
    s_traj = []
    a_traj = []
    a_dtype = np.float32 if args.conti else np.int32

    # Pure inference: no autograd graph is needed.
    with torch.no_grad():
        for i_episode in range(n_episode):
            ob = env.reset()
            ret = 0
            ep_obs = []
            ep_acts = []
            done = False

            while not done:
                if args.render:
                    env.render()

                ob_tensor = torch.FloatTensor(np.expand_dims(ob, axis=0)).to(device)
                action = policy_net.action_step(ob_tensor, deterministic=True)
                action = action.cpu().detach().numpy()[0]

                # Store the observation the action was chosen for.
                ep_obs.append(ob)
                ep_acts.append(action)

                ob, reward, done, info = env.step(action)
                ret += reward

            s_traj.append(np.array(ep_obs, dtype=np.float32))
            a_traj.append(np.array(ep_acts, dtype=a_dtype))
            print("{:d}: return = {:.4f}, len = {:d}".format(i_episode, ret, len(ep_obs)))

    # s_traj: (n_episode, timesteps, s_dim)
    # a_traj: (n_episode, timesteps, a_dim) or (n_episode, timesteps)
    print("Saving the trajectories ... ", end="")
    with open(traj_path, "wb") as traj_file:
        pkl.dump((s_traj, a_traj), traj_file)
    print("Done.")
    env.close()
Ejemplo n.º 2
0
def _load_expert_pairs(expert_path, a_dim, conti):
    """Load expert trajectories and join them into one state-action array.

    Args:
        expert_path: Path of a pickle file holding the tuple
            ``(s_real, a_real)``, each a sequence with one array per episode.
        a_dim: Width of the one-hot action encoding (used when ``conti`` is
            false).
        conti: When true the stored actions are concatenated as they are;
            otherwise they are treated as integer indices and one-hot encoded.

    Returns:
        A single array in which every row is a state concatenated (axis 1)
        with its action features, with all episodes stacked along axis 0.
    """
    # NOTE(review): pickle can execute arbitrary code while loading; only
    # point this at trajectory files you produced yourself.
    with open(expert_path, "rb") as expert_file:
        s_real, a_real = pkl.load(expert_file)

    sa_real = []

    for s_ep, a_ep in zip(s_real, a_real):
        if conti:
            a_feat = a_ep
        else:
            # Flatten so that (T,) and (T, 1) index arrays are handled alike.
            a_idx = np.asarray(a_ep).reshape(-1)
            a_feat = np.zeros((len(a_idx), a_dim), dtype=np.float32)
            a_feat[np.arange(len(a_idx)), a_idx] = 1

        sa_real.append(np.concatenate([s_ep, a_feat], 1))

    return np.concatenate(sa_real, 0)


def main():
    """Train a policy with PPO, a discriminator and expert state-action pairs.

    Command-line flags:
        --env     Environment id (default "CartPole-v0").
        --conti   Read the action dimension from ``ac_space.shape[0]`` instead
                  of ``ac_space.n``.
        --unwrap  Forwarded to ``make_env``.

    Expert data is read from ``../save/<env>_traj.pkl``. Checkpoints are read
    from and written to ``./save/<env>.pt``; an existing checkpoint is resumed
    from its stored iteration counter.

    Raises:
        SystemExit: with status 1 when the expert trajectory file is missing.
    """
    # Parse arguments
    #----------------------------
    parser = argparse.ArgumentParser()
    parser.add_argument("--env", default="CartPole-v0")
    parser.add_argument("--conti", action="store_true")
    parser.add_argument("--unwrap", action="store_true")
    args = parser.parse_args()

    # Parameters
    #----------------------------
    n_env = 8
    n_step = 128
    mb_size = n_env * n_step
    sample_mb_size = 64
    sample_n_epoch = 4
    clip_val = 0.2
    lamb = 0.95
    gamma = 0.99
    ent_weight = 0.0
    max_grad_norm = 0.5
    beta = 0.1
    lr = 1e-4
    n_iter = 30000
    disp_step = 30
    save_step = 300
    save_dir = "./save"
    # Fall back to the CPU when CUDA is not available.
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    expert_path = "../save/{}_traj.pkl".format(args.env)
    model_path = os.path.join(save_dir, "{}.pt".format(args.env))

    # Create multiple environments
    #----------------------------
    env = MultiEnv([
        make_env(i,
                 env_id=args.env,
                 unwrap=args.unwrap,
                 rand_seed=int(time.time())) for i in range(n_env)
    ])

    s_dim = env.ob_space.shape[0]

    if args.conti:
        a_dim = env.ac_space.shape[0]
    else:
        a_dim = env.ac_space.n

    runner = EnvRunner(env,
                       s_dim,
                       a_dim,
                       n_step,
                       gamma,
                       lamb,
                       device=device,
                       conti=args.conti)

    # Load expert trajectories
    #----------------------------
    if not os.path.exists(expert_path):
        print("ERROR: No expert trajectory file found")
        env.close()
        sys.exit(1)

    sa_real = _load_expert_pairs(expert_path, a_dim, args.conti)

    # Create model
    #----------------------------
    policy_net = PolicyNet(s_dim, a_dim, conti=args.conti).to(device)
    value_net = ValueNet(s_dim).to(device)
    dis_net = DiscriminatorNet(s_dim + a_dim).to(device)
    agent = PPO(policy_net,
                value_net,
                dis_net,
                a_dim,
                beta,
                lr,
                max_grad_norm,
                ent_weight,
                clip_val,
                sample_n_epoch,
                sample_mb_size,
                mb_size,
                device=device,
                conti=args.conti)

    # Load model
    #----------------------------
    if not os.path.exists(save_dir):
        os.mkdir(save_dir)

    if os.path.exists(model_path):
        print("Loading the model ... ", end="")
        # map_location lets a checkpoint saved on a GPU load on the chosen device.
        checkpoint = torch.load(model_path, map_location=device)
        policy_net.load_state_dict(checkpoint["PolicyNet"])
        value_net.load_state_dict(checkpoint["ValueNet"])
        dis_net.load_state_dict(checkpoint["DiscriminatorNet"])
        agent.beta = checkpoint["beta"]
        start_it = checkpoint["it"]
        print("Done.")
    else:
        start_it = 0

    # Start training
    #----------------------------
    t_start = time.time()
    policy_net.train()
    value_net.train()

    # try/finally: close the environments even when training is interrupted.
    try:
        for it in range(start_it, n_iter):
            # Run the environment
            with torch.no_grad():
                mb_obs, mb_actions, mb_old_a_logps, mb_values, mb_returns = runner.run(
                    policy_net, value_net, dis_net)
                mb_advs = mb_returns - mb_values
                # Normalise the advantages; the epsilon guards a zero std.
                mb_advs = (mb_advs - mb_advs.mean()) / (mb_advs.std() + 1e-6)

            # Train
            pg_loss, v_loss, ent, dis_loss, dis_real, dis_fake, avg_kl = agent.train(
                policy_net, value_net, dis_net, mb_obs, mb_actions, mb_values,
                mb_advs, mb_returns, mb_old_a_logps, sa_real)

            # Print the result
            if it % disp_step == 0:
                agent.lr_decay(it, n_iter)
                policy_net.eval()
                value_net.eval()
                n_sec = time.time() - t_start
                fps = int((it - start_it) * n_env * n_step / n_sec)
                mean_true_return, std_true_return, mean_return, std_return, mean_len = runner.get_performance()
                policy_net.train()
                value_net.train()

                print("[{:5d} / {:5d}]".format(it, n_iter))
                print("----------------------------------")
                print("Timesteps        = {:d}".format((it - start_it) * mb_size))
                print("Elapsed time     = {:.2f} sec".format(n_sec))
                print("FPS              = {:d}".format(fps))
                print("actor loss       = {:.6f}".format(pg_loss))
                print("critic loss      = {:.6f}".format(v_loss))
                print("dis loss         = {:.6f}".format(dis_loss))
                print("entropy          = {:.6f}".format(ent))
                print("avg_kl           = {:.6f}".format(avg_kl))
                print("beta             = {:.6f}".format(agent.beta))
                print("mean true return = {:.6f}".format(mean_true_return))
                print("mean return      = {:.6f}".format(mean_return))
                print("mean length      = {:.2f}".format(mean_len))
                print("dis_real         = {:.3f}".format(dis_real))
                print("dis_fake         = {:.3f}".format(dis_fake))
                print()

            # Save model
            if it % save_step == 0:
                print("Saving the model ... ", end="")
                torch.save(
                    {
                        "beta": agent.beta,
                        "it": it,
                        "PolicyNet": policy_net.state_dict(),
                        "ValueNet": value_net.state_dict(),
                        "DiscriminatorNet": dis_net.state_dict()
                    }, model_path)
                print("Done.")
                print()
    finally:
        env.close()
Ejemplo n.º 3
0
def main():
    """Train a policy and a value network with PPO on parallel environments.

    Command-line flags:
        --env     Environment id (default "CartPole-v0").
        --conti   Read the action dimension from ``ac_space.shape[0]`` instead
                  of ``ac_space.n``.
        --unwrap  Forwarded to ``make_env``.

    Checkpoints are read from and written to ``./save/<env>.pt``; an existing
    checkpoint is resumed from its stored iteration counter.
    """
    # Parse arguments
    #----------------------------
    parser = argparse.ArgumentParser()
    parser.add_argument("--env", default="CartPole-v0")
    parser.add_argument("--conti", action="store_true")
    parser.add_argument("--unwrap", action="store_true")
    args = parser.parse_args()

    # Parameters
    #----------------------------
    n_env = 8
    n_step = 128
    mb_size = n_env * n_step
    sample_mb_size = 64
    sample_n_epoch = 4
    clip_val = 0.2
    lamb = 0.95
    gamma = 0.99
    ent_weight = 0.0
    max_grad_norm = 0.5
    lr = 1e-4
    n_iter = 30000
    disp_step = 30
    save_step = 300
    save_dir = "./save"
    # Fall back to the CPU when CUDA is not available.
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    model_path = os.path.join(save_dir, "{}.pt".format(args.env))

    # Create multiple environments
    #----------------------------
    env = MultiEnv([
        make_env(i,
                 env_id=args.env,
                 unwrap=args.unwrap,
                 rand_seed=int(time.time())) for i in range(n_env)
    ])

    s_dim = env.ob_space.shape[0]

    if args.conti:
        a_dim = env.ac_space.shape[0]
    else:
        a_dim = env.ac_space.n

    runner = EnvRunner(env,
                       s_dim,
                       a_dim,
                       n_step,
                       gamma,
                       lamb,
                       device=device,
                       conti=args.conti)

    # Create model
    #----------------------------
    policy_net = PolicyNet(s_dim, a_dim, conti=args.conti).to(device)
    value_net = ValueNet(s_dim).to(device)
    agent = PPO(policy_net,
                value_net,
                lr,
                max_grad_norm,
                ent_weight,
                clip_val,
                sample_n_epoch,
                sample_mb_size,
                mb_size,
                device=device)

    # Load model
    #----------------------------
    if not os.path.exists(save_dir):
        os.mkdir(save_dir)

    if os.path.exists(model_path):
        print("Loading the model ... ", end="")
        # map_location lets a checkpoint saved on a GPU load on the chosen device.
        checkpoint = torch.load(model_path, map_location=device)
        policy_net.load_state_dict(checkpoint["PolicyNet"])
        value_net.load_state_dict(checkpoint["ValueNet"])
        start_it = checkpoint["it"]
        print("Done.")
    else:
        start_it = 0

    # Start training
    #----------------------------
    t_start = time.time()
    policy_net.train()
    value_net.train()

    # try/finally: close the environments even when training is interrupted.
    try:
        for it in range(start_it, n_iter):
            # Run the environment
            with torch.no_grad():
                mb_obs, mb_actions, mb_old_a_logps, mb_values, mb_returns = runner.run(
                    policy_net, value_net)
                mb_advs = mb_returns - mb_values
                # Normalise the advantages; the epsilon guards a zero std.
                mb_advs = (mb_advs - mb_advs.mean()) / (mb_advs.std() + 1e-6)

            # Train
            pg_loss, v_loss, ent = agent.train(policy_net, value_net, mb_obs,
                                               mb_actions, mb_values, mb_advs,
                                               mb_returns, mb_old_a_logps)

            # Print the result
            if it % disp_step == 0:
                agent.lr_decay(it, n_iter)
                policy_net.eval()
                value_net.eval()
                n_sec = time.time() - t_start
                fps = int((it - start_it) * n_env * n_step / n_sec)
                mean_return, std_return, mean_len = runner.get_performance()
                policy_net.train()
                value_net.train()

                print("[{:5d} / {:5d}]".format(it, n_iter))
                print("----------------------------------")
                print("Timesteps    = {:d}".format((it - start_it) * mb_size))
                print("Elapsed time = {:.2f} sec".format(n_sec))
                print("FPS          = {:d}".format(fps))
                print("actor loss   = {:.6f}".format(pg_loss))
                print("critic loss  = {:.6f}".format(v_loss))
                print("entropy      = {:.6f}".format(ent))
                print("mean return  = {:.6f}".format(mean_return))
                print("mean length  = {:.2f}".format(mean_len))
                print()

            # Save model
            if it % save_step == 0:
                print("Saving the model ... ", end="")
                torch.save(
                    {
                        "it": it,
                        "PolicyNet": policy_net.state_dict(),
                        "ValueNet": value_net.state_dict()
                    }, model_path)
                print("Done.")
                print()
    finally:
        env.close()
Ejemplo n.º 4
0
def main():
    """Render 100 episodes played by a saved policy and print each return.

    Command-line flags:
        --env     Gym environment id (default "CartPole-v0").
        --conti   Read the action dimension from ``action_space.shape[0]``
                  instead of ``action_space.n``.
        --unwrap  Replace the environment with ``env.unwrapped``.

    The policy weights are read from ``./save/<env>.pt`` (checkpoint key
    "PolicyNet").

    Raises:
        SystemExit: with status 1 when no saved model exists, instead of
            rendering an untrained network.
    """
    # Parse arguments
    #----------------------------
    parser = argparse.ArgumentParser()
    parser.add_argument("--env", default="CartPole-v0")
    parser.add_argument("--conti", action="store_true")
    parser.add_argument("--unwrap", action="store_true")
    args = parser.parse_args()

    # Parameters
    #----------------------------
    env_id = args.env
    save_dir = "./save"
    # Fall back to the CPU when CUDA is not available.
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    model_path = os.path.join(save_dir, "{}.pt".format(env_id))

    # Create environment
    #----------------------------
    env = gym.make(env_id)
    s_dim = env.observation_space.shape[0]

    if args.conti:
        a_dim = env.action_space.shape[0]
    else:
        a_dim = env.action_space.n

    if args.unwrap:
        env = env.unwrapped

    # Create model
    #----------------------------
    policy_net = PolicyNet(s_dim, a_dim, conti=args.conti).to(device)

    # Load model
    #----------------------------
    if not os.path.exists(model_path):
        print("Error: No model saved")
        env.close()
        raise SystemExit(1)

    print("Loading the model ... ", end="")
    # map_location lets a checkpoint saved on a GPU load on the chosen device.
    checkpoint = torch.load(model_path, map_location=device)
    policy_net.load_state_dict(checkpoint["PolicyNet"])
    print("Done.")

    # Start playing
    #----------------------------
    policy_net.eval()

    # Pure inference: no autograd graph is needed.
    with torch.no_grad():
        for _ in range(100):
            ob = env.reset()
            ret = 0
            done = False

            while not done:
                env.render()
                # np.asarray converts array-like observations through the
                # public API rather than calling __array__() directly.
                ob_batch = np.expand_dims(np.asarray(ob), axis=0)
                ob_tensor = torch.from_numpy(ob_batch).float().to(device)
                action = policy_net.action_step(ob_tensor, deterministic=True)
                ob, reward, done, info = env.step(action.cpu().detach().numpy()[0])
                ret += reward

            print("return = {:.4f}".format(ret))

    env.close()
Ejemplo n.º 5
0
def main():
    """Render 10 episodes played by a saved policy and print reward and length.

    Command-line flags:
        --env       Gym environment id (default "BipedalWalker-v3").
        --discrete  Read the action dimension from ``action_space.n`` instead
                    of ``action_space.shape[0]``.
        --unwrap    Replace the environment with ``env.unwrapped``.

    The policy weights are read from ``./save/<env>.pt`` (checkpoint key
    "PolicyNet").

    Raises:
        SystemExit: with status 1 when no saved model exists.
    """
    # Parse arguments
    #----------------------------
    parser = argparse.ArgumentParser()
    parser.add_argument("--env", default="BipedalWalker-v3")
    parser.add_argument("--discrete", action="store_true")
    parser.add_argument("--unwrap", action="store_true")
    args = parser.parse_args()

    # Parameters
    #----------------------------
    save_dir = "./save"
    device = "cuda:0" if torch.cuda.is_available() else "cpu"

    # Create environment
    #----------------------------
    env = gym.make(args.env)
    s_dim = env.observation_space.shape[0]

    if args.discrete:
        a_dim = env.action_space.n
    else:
        a_dim = env.action_space.shape[0]

    if args.unwrap:
        env = env.unwrapped

    # Create model
    #----------------------------
    policy_net = PolicyNet(s_dim, a_dim, conti=not args.discrete).to(device)
    print(policy_net)

    # Load model
    #----------------------------
    model_path = os.path.join(save_dir, "{}.pt".format(args.env))

    if not os.path.exists(model_path):
        print("Error: No model saved")
        env.close()
        # The os module has no exit(); raising SystemExit terminates with
        # status 1 without needing an extra import.
        raise SystemExit(1)

    print("Loading the model ... ", end="")
    # map_location lets a checkpoint saved on a GPU load on the CPU fallback.
    checkpoint = torch.load(model_path, map_location=device)
    policy_net.load_state_dict(checkpoint["PolicyNet"])
    print("Done.")

    # Start playing
    #----------------------------
    policy_net.eval()

    with torch.no_grad():
        for _ in range(10):
            ob = env.reset()
            total_reward = 0
            length = 0
            done = False

            while not done:
                env.render()
                ob_tensor = torch.tensor(np.expand_dims(ob, axis=0), dtype=torch.float32, device=device)
                action = policy_net.action_step(ob_tensor, deterministic=True).cpu().numpy()
                ob, reward, done, info = env.step(action[0])
                total_reward += reward
                length += 1

            print("Total reward = {:.6f}, length = {:d}".format(total_reward, length))

    env.close()