def main():
    #Parse arguments
    #----------------------------
    parser = argparse.ArgumentParser()
    parser.add_argument("--env", default="CartPole-v0")
    parser.add_argument("--conti", action="store_true")
    parser.add_argument("--unwrap", action="store_true")
    args = parser.parse_args()

    #Parameters
    #----------------------------
    n_env = 8
    n_step = 128
    mb_size = n_env * n_step
    sample_mb_size = 64
    sample_n_epoch = 4
    clip_val = 0.2
    lamb = 0.95
    gamma = 0.99
    ent_weight = 0.0
    max_grad_norm = 0.5
    beta = 0.1
    lr = 1e-4
    n_iter = 30000
    disp_step = 30
    save_step = 300
    save_dir = "./save"
    device = "cuda:0"
    expert_path = "../save/{}_traj.pkl".format(args.env)

    #Create multiple environments
    #----------------------------
    env = MultiEnv([
        make_env(i, env_id=args.env, unwrap=args.unwrap, rand_seed=int(time.time()))
        for i in range(n_env)
    ])

    if args.conti:
        s_dim = env.ob_space.shape[0]
        a_dim = env.ac_space.shape[0]
    else:
        s_dim = env.ob_space.shape[0]
        a_dim = env.ac_space.n

    runner = EnvRunner(env, s_dim, a_dim, n_step, gamma, lamb, device=device, conti=args.conti)

    #Load expert trajectories
    #----------------------------
    if os.path.exists(expert_path):
        s_real, a_real = pkl.load(open(expert_path, "rb"))
        sa_real = []

        if args.conti:
            for i in range(len(s_real)):
                sa_real.append(np.concatenate([s_real[i], a_real[i]], 1))
        else:
            for i in range(len(s_real)):
                a_real_onehot = np.zeros((len(a_real[i]), a_dim), dtype=np.float32)

                for j in range(len(a_real[i])):
                    a_real_onehot[j, a_real[i][j]] = 1

                sa_real.append(np.concatenate([s_real[i], a_real_onehot], 1))

        sa_real = np.concatenate(sa_real, 0)
    else:
        print("ERROR: No expert trajectory file found")
        sys.exit(1)

    #Create model
    #----------------------------
    policy_net = PolicyNet(s_dim, a_dim, conti=args.conti).to(device)
    value_net = ValueNet(s_dim).to(device)
    dis_net = DiscriminatorNet(s_dim + a_dim).to(device)
    agent = PPO(policy_net, value_net, dis_net, a_dim, beta, lr, max_grad_norm,
                ent_weight, clip_val, sample_n_epoch, sample_mb_size, mb_size,
                device=device, conti=args.conti)

    #Load model
    #----------------------------
    if not os.path.exists(save_dir):
        os.mkdir(save_dir)

    if os.path.exists(os.path.join(save_dir, "{}.pt".format(args.env))):
        print("Loading the model ... ", end="")
        checkpoint = torch.load(os.path.join(save_dir, "{}.pt".format(args.env)))
        policy_net.load_state_dict(checkpoint["PolicyNet"])
        value_net.load_state_dict(checkpoint["ValueNet"])
        dis_net.load_state_dict(checkpoint["DiscriminatorNet"])
        agent.beta = checkpoint["beta"]
        start_it = checkpoint["it"]
        print("Done.")
    else:
        start_it = 0

    #Start training
    #----------------------------
    t_start = time.time()
    policy_net.train()
    value_net.train()

    for it in range(start_it, n_iter):
        #Run the environment
        with torch.no_grad():
            mb_obs, mb_actions, mb_old_a_logps, mb_values, mb_returns = runner.run(
                policy_net, value_net, dis_net)
            mb_advs = mb_returns - mb_values
            mb_advs = (mb_advs - mb_advs.mean()) / (mb_advs.std() + 1e-6)

        #Train
        pg_loss, v_loss, ent, dis_loss, dis_real, dis_fake, avg_kl = agent.train(
            policy_net, value_net, dis_net, mb_obs, mb_actions, mb_values,
            mb_advs, mb_returns, mb_old_a_logps, sa_real)

        #Print the result
        if it % disp_step == 0:
            agent.lr_decay(it, n_iter)
            policy_net.eval()
            value_net.eval()
            n_sec = time.time() - t_start
            fps = int((it - start_it) * n_env * n_step / n_sec)
            mean_true_return, std_true_return, mean_return, std_return, mean_len = runner.get_performance()
            policy_net.train()
            value_net.train()

            print("[{:5d} / {:5d}]".format(it, n_iter))
            print("----------------------------------")
            print("Timesteps = {:d}".format((it - start_it) * mb_size))
            print("Elapsed time = {:.2f} sec".format(n_sec))
            print("FPS = {:d}".format(fps))
            print("actor loss = {:.6f}".format(pg_loss))
            print("critic loss = {:.6f}".format(v_loss))
            print("dis loss = {:.6f}".format(dis_loss))
            print("entropy = {:.6f}".format(ent))
            print("avg_kl = {:.6f}".format(avg_kl))
            print("beta = {:.6f}".format(agent.beta))
            print("mean true return = {:.6f}".format(mean_true_return))
            print("mean return = {:.6f}".format(mean_return))
            print("mean length = {:.2f}".format(mean_len))
            print("dis_real = {:.3f}".format(dis_real))
            print("dis_fake = {:.3f}".format(dis_fake))
            print()

        #Save model
        if it % save_step == 0:
            print("Saving the model ... ", end="")
            torch.save({
                "beta": agent.beta,
                "it": it,
                "PolicyNet": policy_net.state_dict(),
                "ValueNet": value_net.state_dict(),
                "DiscriminatorNet": dis_net.state_dict()
            }, os.path.join(save_dir, "{}.pt".format(args.env)))
            print("Done.")
            print()

    env.close()
def main():
    env_name = 'BipedalWalker-v3'
    render = False
    solved_reward = 60
    log_interval = 20
    max_episodes = 10000
    max_timesteps = 1500
    update_timestep = 8000
    action_std = 0.5
    K_epochs = 80
    eps_clip = 0.2
    gamma = 0.99
    lr = 3e-4
    betas = (0.9, 0.999)

    env = gym.make(env_name)
    s_dim = env.observation_space.shape[0]
    a_dim = env.action_space.shape[0]
    # s_dim, a_dim = 24, 4

    # print(env.observation_space.high)
    # print(env.observation_space.low)
    # print(env.action_space.high)
    # print(env.action_space.low)
    # assert False

    device = torch.device('cuda')
    memory = Memory()
    ppo = PPO(s_dim, a_dim, action_std, lr, betas, gamma, K_epochs, eps_clip, device)

    running_reward = 0
    avg_length = 0
    time_step = 0
    total_time = time.time()

    for eps in range(max_episodes):
        state = env.reset()
        if eps % log_interval == 0:
            start_t = time.time()

        for t in range(max_timesteps):
            time_step += 1
            action = ppo.choose_action(state, memory)  # action.shape = (a_dim,)
            state, reward, done, _ = env.step(action)

            memory.rewards.append(reward)
            memory.is_terminals.append(done)

            if time_step % update_timestep == 0:
                ppo.update(memory)
                memory.clear_memory()
                time_step = 0

            running_reward += reward
            if render:
                env.render()
            if done:
                break

        avg_length += (t + 1)

        # solved_reward = 60
        # log_interval = 20
        # max_episodes = 10000
        # max_timesteps = 1500
        # update_timestep = 8000
        if running_reward > log_interval * solved_reward:
            print("### Solved! ###")
            torch.save(ppo.policy.state_dict(), 'ckpt/ppo_solved.pt')
            break

        if (eps + 1) % 500 == 0:
            torch.save(ppo.policy.state_dict(), 'ckpt/ppo.pt')

        if (eps + 1) % log_interval == 0:
            end_t = time.time()
            m, s = epoch_time(start_t, end_t)
            t_m, t_s = epoch_time(total_time, end_t)
            avg_length = int(avg_length / log_interval)
            running_reward = int(running_reward / log_interval)
            print(f"Episode {eps+1}")
            print(f"\tTime: {m}m {s}s | Total Time: {t_m}m {t_s}s")
            print(f"\tAvg length: {avg_length}")
            print(f"\tAvg reward: {running_reward}")
            running_reward = 0
            avg_length = 0
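#epoch_time is called in the logging block above but not defined in this excerpt.
#A minimal sketch consistent with how it is used (two time.time() stamps in, whole
#minutes and seconds out); the exact original helper may differ.
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - elapsed_mins * 60)
    return elapsed_mins, elapsed_secs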
assert args.env_name in env_lst

env = NormalizedGymEnv(args.env_name, normalize_obs=True)
'''
#for pybullet envs
import pybullet_envs
env = NormalizedGymEnv("HopperBulletEnv-v0",normalize_obs=True)
'''

action_space = env.action_space.shape[0]
state_space = env.observation_space.shape[0]

device = 'cuda' if torch.cuda.is_available() else 'cpu'
if args.use_cuda == False:
    device = 'cpu'

if torch.cuda.is_available() and args.use_cuda:
    agent = PPO(state_space, action_space, args.hidden_dim, args.learning_rate,
                args.entropy_coef, args.critic_coef, args.gamma, args.lmbda,
                args.eps_clip, args.K_epoch, args.minibatch_size, device).cuda()
else:
    agent = PPO(state_space, action_space, args.hidden_dim, args.learning_rate,
                args.entropy_coef, args.critic_coef, args.gamma, args.lmbda,
                args.eps_clip, args.K_epoch, args.minibatch_size, device)

if args.load != 'no':
    agent.load_state_dict(torch.load("./model_weights/" + args.load))

if args.tensorboard:
    from torch.utils.tensorboard import SummaryWriter
    writer = SummaryWriter()
else:
    writer = None

score_lst = []
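#The fragment above assumes an args namespace and an env_lst defined elsewhere.
#A minimal sketch of what that setup could look like: the attribute names are
#taken from the code above, but every default value and the contents of env_lst
#are assumptions, not the original configuration.
import argparse

def str2bool(v):
    return str(v).lower() in ("true", "1", "yes")

env_lst = ["Hopper-v2", "HalfCheetah-v2", "Walker2d-v2"]  # hypothetical contents

parser = argparse.ArgumentParser()
parser.add_argument("--env_name", type=str, default="Hopper-v2")
parser.add_argument("--use_cuda", type=str2bool, default=True)
parser.add_argument("--tensorboard", type=str2bool, default=False)
parser.add_argument("--load", type=str, default="no")
parser.add_argument("--hidden_dim", type=int, default=64)
parser.add_argument("--learning_rate", type=float, default=3e-4)
parser.add_argument("--entropy_coef", type=float, default=1e-2)
parser.add_argument("--critic_coef", type=float, default=0.5)
parser.add_argument("--gamma", type=float, default=0.99)
parser.add_argument("--lmbda", type=float, default=0.95)
parser.add_argument("--eps_clip", type=float, default=0.2)
parser.add_argument("--K_epoch", type=int, default=10)
parser.add_argument("--minibatch_size", type=int, default=64)
args = parser.parse_args()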
def main():
    #Parse arguments
    #----------------------------
    parser = argparse.ArgumentParser()
    parser.add_argument("--env", default="CartPole-v0")
    parser.add_argument("--conti", action="store_true")
    parser.add_argument("--unwrap", action="store_true")
    args = parser.parse_args()

    #Parameters
    #----------------------------
    n_env = 8
    n_step = 128
    mb_size = n_env * n_step
    sample_mb_size = 64
    sample_n_epoch = 4
    clip_val = 0.2
    lamb = 0.95
    gamma = 0.99
    ent_weight = 0.0
    max_grad_norm = 0.5
    lr = 1e-4
    n_iter = 30000
    disp_step = 30
    save_step = 300
    save_dir = "./save"
    device = "cuda:0"

    #Create multiple environments
    #----------------------------
    env = MultiEnv([
        make_env(i, env_id=args.env, unwrap=args.unwrap, rand_seed=int(time.time()))
        for i in range(n_env)
    ])

    if args.conti:
        s_dim = env.ob_space.shape[0]
        a_dim = env.ac_space.shape[0]
    else:
        s_dim = env.ob_space.shape[0]
        a_dim = env.ac_space.n

    runner = EnvRunner(env, s_dim, a_dim, n_step, gamma, lamb, device=device, conti=args.conti)

    #Create model
    #----------------------------
    policy_net = PolicyNet(s_dim, a_dim, conti=args.conti).to(device)
    value_net = ValueNet(s_dim).to(device)
    agent = PPO(policy_net, value_net, lr, max_grad_norm, ent_weight, clip_val,
                sample_n_epoch, sample_mb_size, mb_size, device=device)

    #Load model
    #----------------------------
    if not os.path.exists(save_dir):
        os.mkdir(save_dir)

    if os.path.exists(os.path.join(save_dir, "{}.pt".format(args.env))):
        print("Loading the model ... ", end="")
        checkpoint = torch.load(os.path.join(save_dir, "{}.pt".format(args.env)))
        policy_net.load_state_dict(checkpoint["PolicyNet"])
        value_net.load_state_dict(checkpoint["ValueNet"])
        start_it = checkpoint["it"]
        print("Done.")
    else:
        start_it = 0

    #Start training
    #----------------------------
    t_start = time.time()
    policy_net.train()
    value_net.train()

    for it in range(start_it, n_iter):
        #Run the environment
        with torch.no_grad():
            mb_obs, mb_actions, mb_old_a_logps, mb_values, mb_returns = runner.run(
                policy_net, value_net)
            mb_advs = mb_returns - mb_values
            mb_advs = (mb_advs - mb_advs.mean()) / (mb_advs.std() + 1e-6)

        #Train
        pg_loss, v_loss, ent = agent.train(policy_net, value_net, mb_obs, mb_actions,
                                           mb_values, mb_advs, mb_returns, mb_old_a_logps)

        #Print the result
        if it % disp_step == 0:
            agent.lr_decay(it, n_iter)
            policy_net.eval()
            value_net.eval()
            n_sec = time.time() - t_start
            fps = int((it - start_it) * n_env * n_step / n_sec)
            mean_return, std_return, mean_len = runner.get_performance()
            policy_net.train()
            value_net.train()

            print("[{:5d} / {:5d}]".format(it, n_iter))
            print("----------------------------------")
            print("Timesteps = {:d}".format((it - start_it) * mb_size))
            print("Elapsed time = {:.2f} sec".format(n_sec))
            print("FPS = {:d}".format(fps))
            print("actor loss = {:.6f}".format(pg_loss))
            print("critic loss = {:.6f}".format(v_loss))
            print("entropy = {:.6f}".format(ent))
            print("mean return = {:.6f}".format(mean_return))
            print("mean length = {:.2f}".format(mean_len))
            print()

        #Save model
        if it % save_step == 0:
            print("Saving the model ... ", end="")
            torch.save({
                "it": it,
                "PolicyNet": policy_net.state_dict(),
                "ValueNet": value_net.state_dict()
            }, os.path.join(save_dir, "{}.pt".format(args.env)))
            print("Done.")
            print()

    env.close()
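#agent.lr_decay(it, n_iter) is called above but its body is not shown. A minimal
#sketch of a linear learning-rate schedule consistent with that call signature;
#the optimizer handle (opt) and base_lr argument are assumptions, not the actual
#PPO implementation used here.
def lr_decay(opt, it, n_iter, base_lr=1e-4):
    #Linearly anneal the learning rate from base_lr toward 0 over n_iter iterations
    lr = base_lr * (1.0 - it / n_iter)
    for param_group in opt.param_groups:
        param_group["lr"] = lr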
def main():
    env = make_env()
    set_global_seeds(env, args.seed)
    agent = PPO(env=env)
    batch_steps = args.n_envs * args.batch_steps  # number of steps per update

    if args.save_interval and logger.get_dir():
        # some saving jobs
        pass

    ep_info_buffer = deque(maxlen=100)
    t_train_start = time.time()
    n_updates = args.n_steps // batch_steps
    runner = Runner(env, agent)

    for update in range(1, n_updates + 1):
        t_start = time.time()
        frac = 1.0 - (update - 1.0) / n_updates
        lr_now = args.lr  # maybe dynamic change
        clip_range_now = args.clip_range  # maybe dynamic change
        obs, returns, masks, acts, vals, neglogps, advs, rewards, ep_infos = \
            runner.run(args.batch_steps, frac)
        ep_info_buffer.extend(ep_infos)

        loss_infos = []
        idxs = np.arange(batch_steps)
        for _ in range(args.n_epochs):
            np.random.shuffle(idxs)
            for start in range(0, batch_steps, args.minibatch):
                end = start + args.minibatch
                mb_idxs = idxs[start:end]
                minibatch = [
                    arr[mb_idxs]
                    for arr in [obs, returns, masks, acts, vals, neglogps, advs]
                ]
                loss_infos.append(agent.train(lr_now, clip_range_now, *minibatch))

        t_now = time.time()
        time_this_batch = t_now - t_start

        if update % args.log_interval == 0:
            ev = float(explained_variance(vals, returns))
            logger.logkv('updates', str(update) + '/' + str(n_updates))
            logger.logkv('serial_steps', update * args.batch_steps)
            logger.logkv('total_steps', update * batch_steps)
            logger.logkv('time', time_this_batch)
            logger.logkv('fps', int(batch_steps / (t_now - t_start)))
            logger.logkv('total_time', t_now - t_train_start)
            logger.logkv("explained_variance", ev)
            logger.logkv('avg_reward', np.mean([e['r'] for e in ep_info_buffer]))
            logger.logkv('avg_ep_len', np.mean([e['l'] for e in ep_info_buffer]))
            logger.logkv('adv_mean', np.mean(returns - vals))
            logger.logkv('adv_variance', np.std(returns - vals)**2)
            loss_infos = np.mean(loss_infos, axis=0)
            for loss_name, loss_info in zip(agent.loss_names, loss_infos):
                logger.logkv(loss_name, loss_info)
            logger.dumpkvs()

        if args.save_interval and update % args.save_interval == 0 and logger.get_dir():
            pass

    env.close()
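#explained_variance is called above but not defined in this excerpt. A minimal
#sketch following the usual definition, 1 - Var(returns - values) / Var(returns):
#a value near 1 means the value function predicts the empirical returns well,
#0 means it does no better than a constant baseline.
import numpy as np

def explained_variance(ypred, y):
    vary = np.var(y)
    return np.nan if vary == 0 else 1 - np.var(y - ypred) / vary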
def main(): print("#######") print( "WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards" ) print("#######") os.environ['OMP_NUM_THREADS'] = '1' if args.vis: from visdom import Visdom viz = Visdom() win = None envs = [ make_env(args.env_name, args.seed, i, args.log_dir) for i in range(args.num_processes) ] if args.num_processes > 1: envs = SubprocVecEnv(envs) else: envs = DummyVecEnv(envs) # Maxime: commented this out because it very much changes the behavior # of the code for seemingly arbitrary reasons #if len(envs.observation_space.shape) == 1: # envs = VecNormalize(envs) obs_shape = envs.observation_space.shape obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:]) obs_numel = reduce(operator.mul, obs_shape, 1) if len(obs_shape) == 3 and obs_numel > 1024: actor_critic = CNNPolicy(obs_shape[0], envs.action_space, args.recurrent_policy) elif args.recurrent_policy: actor_critic = RecMLPPolicy(obs_numel, envs.action_space) else: actor_critic = MLPPolicy(obs_numel, envs.action_space) # Maxime: log some info about the model and its size # call function PPO.modelsize() for this to happen ''' modelSize = 0 for p in actor_critic.parameters(): pSize = reduce(operator.mul, p.size(), 1) modelSize += pSize ''' if envs.action_space.__class__.__name__ == "Discrete": action_shape = 1 else: action_shape = envs.action_space.shape[0] if args.cuda: actor_critic.cuda() rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape, envs.action_space, actor_critic.state_size) if args.algo == 'a2c': Agent = A2C(actor_critic, rollouts, args.lr, args.eps, args.num_processes, obs_shape, args.use_gae, args.gamma, args.tau, args.recurrent_policy, args.num_mini_batch, args.cuda, args.log_interval, args.vis, args.env_name, args.log_dir, args.entropy_coef, args.num_stack, args.num_steps, args.ppo_epoch, args.clip_param, args.max_grad_norm, args.alpha, args.save_dir, args.vis_interval, args.save_interval, num_updates, action_shape, args.value_loss_coef) elif args.algo == 'ppo': Agent = PPO(actor_critic, rollouts, args.lr, args.eps, args.num_processes, obs_shape, args.use_gae, args.gamma, args.tau, args.recurrent_policy, args.num_mini_batch, args.cuda, args.log_interval, args.vis, args.env_name, args.log_dir, args.entropy_coef, args.num_stack, args.num_steps, args.ppo_epoch, args.clip_param, args.max_grad_norm, args.save_dir, args.vis_interval, args.save_interval, num_updates, action_shape, args.value_loss_coef) elif args.algo == 'acktr': Agent = ACKTR(actor_critic, rollouts, args.lr, args.eps, args.num_processes, obs_shape, args.use_gae, args.gamma, args.tau, args.recurrent_policy, args.num_mini_batch, args.cuda, args.log_interval, args.vis, args.env_name, args.log_dir, args.entropy_coef, args.num_stack, args.num_steps, args.ppo_epoch, args.clip_param, args.max_grad_norm, args.alpha, args.save_dir, args.vis_interval, args.save_interval, num_updates, action_shape, args.value_loss_coef) print(str(actor_critic)) print('Total model size: %d' % Agent.modelsize()) obs = envs.reset() Agent.update_current_obs(obs, envs) Agent.rollouts.observations[0].copy_(Agent.current_obs) # These variables are used to compute average rewards for all processes. Agent.train(envs)
def main():
    #Parse arguments
    #----------------------------
    parser = argparse.ArgumentParser()
    parser.add_argument("--env", default="BipedalWalker-v3")
    parser.add_argument("--discrete", action="store_true")
    parser.add_argument("--unwrap", action="store_true")
    args = parser.parse_args()

    #Parameters
    #----------------------------
    clip_val = 0.2
    sample_mb_size = 64
    sample_n_epoch = 4
    lamb = 0.95
    gamma = 0.99
    ent_weight = 0.01
    max_grad_norm = 0.5
    lr = 1e-4
    n_iter = 10000
    disp_step = 30
    save_step = 300
    save_dir = "./save"
    device = "cuda:0" if torch.cuda.is_available() else "cpu"

    #Create environment
    #----------------------------
    env = gym.make(args.env)

    if args.discrete:
        s_dim = env.observation_space.shape[0]
        a_dim = env.action_space.n
    else:
        s_dim = env.observation_space.shape[0]
        a_dim = env.action_space.shape[0]

    if args.unwrap:
        env = env.unwrapped

    runner = EnvRunner(s_dim, a_dim, gamma, lamb, max_step=2048, device=device,
                       conti=not args.discrete)

    #Create model
    #----------------------------
    policy_net = PolicyNet(s_dim, a_dim, conti=not args.discrete).to(device)
    value_net = ValueNet(s_dim).to(device)
    agent = PPO(policy_net, value_net, lr, max_grad_norm, ent_weight, clip_val,
                sample_n_epoch, sample_mb_size, device=device)

    #Load model
    #----------------------------
    if not os.path.exists(save_dir):
        os.mkdir(save_dir)

    if os.path.exists(os.path.join(save_dir, "{}.pt".format(args.env))):
        print("Loading the model ... ", end="")
        checkpoint = torch.load(os.path.join(save_dir, "{}.pt".format(args.env)))
        policy_net.load_state_dict(checkpoint["PolicyNet"])
        value_net.load_state_dict(checkpoint["ValueNet"])
        start_it = checkpoint["it"]
        print("Done.")
    else:
        start_it = 0

    #Start training
    #----------------------------
    t_start = time.time()
    policy_net.train()
    value_net.train()
    mean_total_reward = 0
    mean_length = 0

    for it in range(start_it, n_iter):
        #Run the environment
        with torch.no_grad():
            mb_obs, mb_actions, mb_old_a_logps, mb_values, mb_returns, mb_rewards = runner.run(
                env, policy_net, value_net)
            mb_advs = mb_returns - mb_values
            mb_advs = (mb_advs - mb_advs.mean()) / (mb_advs.std() + 1e-6)

        #Train
        pg_loss, v_loss, ent = agent.train(policy_net, value_net, mb_obs, mb_actions,
                                           mb_values, mb_advs, mb_returns, mb_old_a_logps)
        mean_total_reward += mb_rewards.sum()
        mean_length += len(mb_obs)
        print("[Episode {:4d}] total reward = {:.6f}, length = {:d}".format(
            it, mb_rewards.sum(), len(mb_obs)))

        #Print the result
        if it % disp_step == 0:
            print("\n[{:5d} / {:5d}]".format(it, n_iter))
            print("----------------------------------")
            print("Elapsed time = {:.2f} sec".format(time.time() - t_start))
            print("actor loss = {:.6f}".format(pg_loss))
            print("critic loss = {:.6f}".format(v_loss))
            print("entropy = {:.6f}".format(ent))
            print("mean return = {:.6f}".format(mean_total_reward / disp_step))
            print("mean length = {:.2f}".format(mean_length / disp_step))
            print()

            agent.lr_decay(it, n_iter)
            mean_total_reward = 0
            mean_length = 0

        #Save model
        if it % save_step == 0:
            print("Saving the model ... ", end="")
            torch.save({
                "it": it,
                "PolicyNet": policy_net.state_dict(),
                "ValueNet": value_net.state_dict()
            }, os.path.join(save_dir, "{}.pt".format(args.env)))
            print("Done.")
            print()

    env.close()
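#The runners in these scripts are configured with gamma and lamb, i.e. the discount
#factor and the GAE(lambda) coefficient, and return per-step values and returns.
#A minimal sketch of how generalized advantage estimation and the matching returns
#can be computed from one rollout; the function name, array layout, and last_value
#argument are assumptions, not the actual EnvRunner implementation.
import numpy as np

def compute_gae(rewards, values, dones, last_value, gamma=0.99, lamb=0.95):
    #rewards, values, dones: arrays of length T collected from one rollout
    #last_value: value estimate of the state that follows the final step
    T = len(rewards)
    advs = np.zeros(T, dtype=np.float32)
    last_gae = 0.0

    for t in reversed(range(T)):
        next_value = last_value if t == T - 1 else values[t + 1]
        next_nonterminal = 1.0 - dones[t]
        delta = rewards[t] + gamma * next_value * next_nonterminal - values[t]
        last_gae = delta + gamma * lamb * next_nonterminal * last_gae
        advs[t] = last_gae

    returns = advs + values
    return advs, returns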