def main():
    #Parse arguments
    #----------------------------
    parser = argparse.ArgumentParser()
    parser.add_argument("--env", default="CartPole-v0")
    parser.add_argument("--conti", action="store_true")
    parser.add_argument("--unwrap", action="store_true")
    args = parser.parse_args()

    #Parameters
    #----------------------------
    n_env = 8
    n_step = 128
    mb_size = n_env * n_step
    sample_mb_size = 64
    sample_n_epoch = 4
    clip_val = 0.2
    lamb = 0.95
    gamma = 0.99
    ent_weight = 0.0
    max_grad_norm = 0.5
    beta = 0.1
    lr = 1e-4
    n_iter = 30000
    disp_step = 30
    save_step = 300
    save_dir = "./save"
    device = "cuda:0"
    expert_path = "../save/{}_traj.pkl".format(args.env)

    #Create multiple environments
    #----------------------------
    env = MultiEnv([
        make_env(i, env_id=args.env, unwrap=args.unwrap, rand_seed=int(time.time()))
        for i in range(n_env)
    ])

    if args.conti:
        s_dim = env.ob_space.shape[0]
        a_dim = env.ac_space.shape[0]
    else:
        s_dim = env.ob_space.shape[0]
        a_dim = env.ac_space.n

    runner = EnvRunner(
        env, s_dim, a_dim, n_step, gamma, lamb, device=device, conti=args.conti)

    #Load expert trajectories
    #----------------------------
    if os.path.exists(expert_path):
        s_real, a_real = pkl.load(open(expert_path, "rb"))
        sa_real = []

        if args.conti:
            for i in range(len(s_real)):
                sa_real.append(np.concatenate([s_real[i], a_real[i]], 1))
        else:
            for i in range(len(s_real)):
                a_real_onehot = np.zeros((len(a_real[i]), a_dim), dtype=np.float32)

                for j in range(len(a_real[i])):
                    a_real_onehot[j, a_real[i][j]] = 1

                sa_real.append(np.concatenate([s_real[i], a_real_onehot], 1))

        sa_real = np.concatenate(sa_real, 0)
    else:
        print("ERROR: No expert trajectory file found")
        sys.exit(1)

    #Create model
    #----------------------------
    policy_net = PolicyNet(s_dim, a_dim, conti=args.conti).to(device)
    value_net = ValueNet(s_dim).to(device)
    dis_net = DiscriminatorNet(s_dim + a_dim).to(device)
    agent = PPO(
        policy_net, value_net, dis_net, a_dim, beta, lr, max_grad_norm, ent_weight,
        clip_val, sample_n_epoch, sample_mb_size, mb_size, device=device,
        conti=args.conti)

    #Load model
    #----------------------------
    if not os.path.exists(save_dir):
        os.mkdir(save_dir)

    if os.path.exists(os.path.join(save_dir, "{}.pt".format(args.env))):
        print("Loading the model ... ", end="")
        checkpoint = torch.load(os.path.join(save_dir, "{}.pt".format(args.env)))
        policy_net.load_state_dict(checkpoint["PolicyNet"])
        value_net.load_state_dict(checkpoint["ValueNet"])
        dis_net.load_state_dict(checkpoint["DiscriminatorNet"])
        agent.beta = checkpoint["beta"]
        start_it = checkpoint["it"]
        print("Done.")
    else:
        start_it = 0

    #Start training
    #----------------------------
    t_start = time.time()
    policy_net.train()
    value_net.train()

    for it in range(start_it, n_iter):
        #Run the environment
        with torch.no_grad():
            mb_obs, mb_actions, mb_old_a_logps, mb_values, mb_returns = runner.run(
                policy_net, value_net, dis_net)
            mb_advs = mb_returns - mb_values
            mb_advs = (mb_advs - mb_advs.mean()) / (mb_advs.std() + 1e-6)

        #Train
        pg_loss, v_loss, ent, dis_loss, dis_real, dis_fake, avg_kl = agent.train(
            policy_net, value_net, dis_net, mb_obs, mb_actions, mb_values, mb_advs,
            mb_returns, mb_old_a_logps, sa_real)

        #Print the result
        if it % disp_step == 0:
            agent.lr_decay(it, n_iter)
            policy_net.eval()
            value_net.eval()
            n_sec = time.time() - t_start
            fps = int((it - start_it) * n_env * n_step / n_sec)
            mean_true_return, std_true_return, mean_return, std_return, mean_len = runner.get_performance()
            policy_net.train()
            value_net.train()

            print("[{:5d} / {:5d}]".format(it, n_iter))
            print("----------------------------------")
            print("Timesteps = {:d}".format((it - start_it) * mb_size))
            print("Elapsed time = {:.2f} sec".format(n_sec))
            print("FPS = {:d}".format(fps))
            print("actor loss = {:.6f}".format(pg_loss))
            print("critic loss = {:.6f}".format(v_loss))
            print("dis loss = {:.6f}".format(dis_loss))
            print("entropy = {:.6f}".format(ent))
            print("avg_kl = {:.6f}".format(avg_kl))
            print("beta = {:.6f}".format(agent.beta))
            print("mean true return = {:.6f}".format(mean_true_return))
            print("mean return = {:.6f}".format(mean_return))
            print("mean length = {:.2f}".format(mean_len))
            print("dis_real = {:.3f}".format(dis_real))
            print("dis_fake = {:.3f}".format(dis_fake))
            print()

        #Save model
        if it % save_step == 0:
            print("Saving the model ... ", end="")
            torch.save({
                "beta": agent.beta,
                "it": it,
                "PolicyNet": policy_net.state_dict(),
                "ValueNet": value_net.state_dict(),
                "DiscriminatorNet": dis_net.state_dict()
            }, os.path.join(save_dir, "{}.pt".format(args.env)))
            print("Done.")
            print()

    env.close()
def main():
    #Parse arguments
    #----------------------------
    parser = argparse.ArgumentParser()
    parser.add_argument("--env", default="CartPole-v0")
    parser.add_argument("--conti", action="store_true")
    parser.add_argument("--unwrap", action="store_true")
    args = parser.parse_args()

    #Parameters
    #----------------------------
    n_env = 8
    n_step = 128
    mb_size = n_env * n_step
    sample_mb_size = 64
    sample_n_epoch = 4
    clip_val = 0.2
    lamb = 0.95
    gamma = 0.99
    ent_weight = 0.0
    max_grad_norm = 0.5
    lr = 1e-4
    n_iter = 30000
    disp_step = 30
    save_step = 300
    save_dir = "./save"
    device = "cuda:0"

    #Create multiple environments
    #----------------------------
    env = MultiEnv([
        make_env(i, env_id=args.env, unwrap=args.unwrap, rand_seed=int(time.time()))
        for i in range(n_env)
    ])

    if args.conti:
        s_dim = env.ob_space.shape[0]
        a_dim = env.ac_space.shape[0]
    else:
        s_dim = env.ob_space.shape[0]
        a_dim = env.ac_space.n

    runner = EnvRunner(
        env, s_dim, a_dim, n_step, gamma, lamb, device=device, conti=args.conti)

    #Create model
    #----------------------------
    policy_net = PolicyNet(s_dim, a_dim, conti=args.conti).to(device)
    value_net = ValueNet(s_dim).to(device)
    agent = PPO(
        policy_net, value_net, lr, max_grad_norm, ent_weight, clip_val,
        sample_n_epoch, sample_mb_size, mb_size, device=device)

    #Load model
    #----------------------------
    if not os.path.exists(save_dir):
        os.mkdir(save_dir)

    if os.path.exists(os.path.join(save_dir, "{}.pt".format(args.env))):
        print("Loading the model ... ", end="")
        checkpoint = torch.load(os.path.join(save_dir, "{}.pt".format(args.env)))
        policy_net.load_state_dict(checkpoint["PolicyNet"])
        value_net.load_state_dict(checkpoint["ValueNet"])
        start_it = checkpoint["it"]
        print("Done.")
    else:
        start_it = 0

    #Start training
    #----------------------------
    t_start = time.time()
    policy_net.train()
    value_net.train()

    for it in range(start_it, n_iter):
        #Run the environment
        with torch.no_grad():
            mb_obs, mb_actions, mb_old_a_logps, mb_values, mb_returns = runner.run(
                policy_net, value_net)
            mb_advs = mb_returns - mb_values
            mb_advs = (mb_advs - mb_advs.mean()) / (mb_advs.std() + 1e-6)

        #Train
        pg_loss, v_loss, ent = agent.train(
            policy_net, value_net, mb_obs, mb_actions, mb_values, mb_advs,
            mb_returns, mb_old_a_logps)

        #Print the result
        if it % disp_step == 0:
            agent.lr_decay(it, n_iter)
            policy_net.eval()
            value_net.eval()
            n_sec = time.time() - t_start
            fps = int((it - start_it) * n_env * n_step / n_sec)
            mean_return, std_return, mean_len = runner.get_performance()
            policy_net.train()
            value_net.train()

            print("[{:5d} / {:5d}]".format(it, n_iter))
            print("----------------------------------")
            print("Timesteps = {:d}".format((it - start_it) * mb_size))
            print("Elapsed time = {:.2f} sec".format(n_sec))
            print("FPS = {:d}".format(fps))
            print("actor loss = {:.6f}".format(pg_loss))
            print("critic loss = {:.6f}".format(v_loss))
            print("entropy = {:.6f}".format(ent))
            print("mean return = {:.6f}".format(mean_return))
            print("mean length = {:.2f}".format(mean_len))
            print()

        #Save model
        if it % save_step == 0:
            print("Saving the model ... ", end="")
            torch.save({
                "it": it,
                "PolicyNet": policy_net.state_dict(),
                "ValueNet": value_net.state_dict()
            }, os.path.join(save_dir, "{}.pt".format(args.env)))
            print("Done.")
            print()

    env.close()
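
#EnvRunner is assumed to turn the collected rewards and value estimates into
#mb_returns via Generalized Advantage Estimation, which is what the gamma/lamb
#parameters above suggest. A minimal sketch of that computation (function and
#argument names are illustrative, not the repository's API):
import numpy as np

def compute_gae(rewards, values, last_value, dones, gamma=0.99, lamb=0.95):
    #rewards, values, dones: arrays of length T; last_value: V(s_T)
    T = len(rewards)
    advs = np.zeros(T, dtype=np.float32)
    last_gae = 0.0

    for t in reversed(range(T)):
        next_value = last_value if t == T - 1 else values[t + 1]
        non_terminal = 1.0 - dones[t]
        delta = rewards[t] + gamma * next_value * non_terminal - values[t]
        last_gae = delta + gamma * lamb * non_terminal * last_gae
        advs[t] = last_gae

    returns = advs + values
    return returns, advs
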
    #(Tail of train_step(): one policy-gradient update on a padded batch of episodes)
    opt1.zero_grad()
    opt2.zero_grad()

    observ_batch = observ_batch[:, :-1, :]
    policy_pr = policy_net(observ_batch).transpose(1, 2)
    value_pr = value_net(observ_batch)
    advantage = (discount_reward_batch - value_pr).detach()

    #F.cross_entropy returns the negative log-probability of the chosen action,
    #so minimizing (neg-logp * advantage) is the usual policy-gradient loss;
    #mask_batch zeroes out the padded timesteps
    logprob = F.cross_entropy(policy_pr, action_batch, reduction='none')
    loss1 = (logprob * mask_batch * advantage).mean()
    loss2 = ((value_pr - discount_reward_batch) * mask_batch).pow(2).mean()

    loss1.backward()
    loss2.backward()
    opt1.step()
    opt2.step()

    return avg_reward.item(), loss1.item(), loss2.item()


if __name__ == "__main__":
    for i in range(EPOCH):
        print(
            "Avg_reward: {:12.6f}, PolicyNet Loss: {:12.6f}, ValueNet Loss: {:12.6f}"
            .format(*train_step()),
            end="\r")

        torch.save({
            "policy": policy_net.state_dict(),
            "value": value_net.state_dict()
        }, "model.pt")
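
#discount_reward_batch and mask_batch above are assumed to come from padding a
#batch of variable-length episodes to a common length. A minimal sketch of how
#such a batch could be built (helper name and shapes are assumptions for
#illustration only):
import numpy as np

def build_return_and_mask(episode_rewards, gamma=0.99):
    #episode_rewards: list of 1-D reward arrays, one per episode
    max_len = max(len(r) for r in episode_rewards)
    returns = np.zeros((len(episode_rewards), max_len), dtype=np.float32)
    masks = np.zeros((len(episode_rewards), max_len), dtype=np.float32)

    for i, rewards in enumerate(episode_rewards):
        R = 0.0
        for t in reversed(range(len(rewards))):
            R = rewards[t] + gamma * R
            returns[i, t] = R
        masks[i, :len(rewards)] = 1.0  #1 for real steps, 0 for padding

    return returns, masks
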
def main():
    #Parse arguments
    #----------------------------
    parser = argparse.ArgumentParser()
    parser.add_argument("--env", default="CartPole-v0")
    parser.add_argument("--conti", action="store_true")
    args = parser.parse_args()

    #Parameters
    #----------------------------
    env_id = args.env
    mb_size = 256
    lr = 1e-5
    n_iter = 100000
    disp_step = 1000
    save_step = 10000
    save_dir = "./save"
    device = "cuda:0"
    expert_path = "../save/{}_traj.pkl".format(args.env)

    #Create environment
    #----------------------------
    env = gym.make(env_id)

    if args.conti:
        s_dim = env.observation_space.shape[0]
        a_dim = env.action_space.shape[0]
    else:
        s_dim = env.observation_space.shape[0]
        a_dim = env.action_space.n

    #Load expert trajectories
    #----------------------------
    if os.path.exists(expert_path):
        s_traj, a_traj = pkl.load(open(expert_path, "rb"))
        s_traj = np.concatenate(s_traj, 0)
        a_traj = np.concatenate(a_traj, 0)
    else:
        print("ERROR: No expert trajectory file found")
        sys.exit(1)

    #Create model
    #----------------------------
    policy_net = PolicyNet(s_dim, a_dim, conti=args.conti).to(device)
    opt = torch.optim.Adam(policy_net.parameters(), lr)

    #Load model
    #----------------------------
    if not os.path.exists(save_dir):
        os.mkdir(save_dir)

    if os.path.exists(os.path.join(save_dir, "{}.pt".format(args.env))):
        print("Loading the model ... ", end="")
        checkpoint = torch.load(os.path.join(save_dir, "{}.pt".format(args.env)))
        policy_net.load_state_dict(checkpoint["PolicyNet"])
        start_it = checkpoint["it"]
        print("Done.")
    else:
        start_it = 0

    #Start training
    #----------------------------
    t_start = time.time()
    policy_net.train()

    for it in range(start_it, n_iter + 1):
        #Train
        mb_obs, mb_actions = sample_batch(s_traj, a_traj, mb_size)
        mb_a_logps, mb_ents = policy_net.evaluate(
            torch.from_numpy(mb_obs).to(device),
            torch.from_numpy(mb_actions).to(device))
        loss = -mb_a_logps.mean()

        opt.zero_grad()
        loss.backward()
        opt.step()

        #Print the result
        if it % disp_step == 0:
            print("[{:5d} / {:5d}] Elapsed time = {:.2f}, actor loss = {:.6f}".format(
                it, n_iter, time.time() - t_start, loss.item()))

        #Save model
        if it % save_step == 0:
            print("Saving the model ... ", end="")
            torch.save({
                "it": it,
                "PolicyNet": policy_net.state_dict()
            }, os.path.join(save_dir, "{}.pt".format(args.env)))
            print("Done.")
            print()

    env.close()
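
#sample_batch() is called above but not defined in this snippet. A minimal
#sketch of a uniform mini-batch sampler over the concatenated expert arrays
#(the signature is inferred from the call site; the implementation is an
#assumption):
import numpy as np

def sample_batch(s_traj, a_traj, mb_size):
    #s_traj: (N, s_dim) expert states, a_traj: matching expert actions
    idx = np.random.randint(0, len(s_traj), size=mb_size)
    return s_traj[idx], a_traj[idx]
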
class PGAgent:
    def __init__(self, config):
        self.config = config

        # replay memory
        self.replay_memory = deque(maxlen=self.config.n_replay_memory)

        # Create the policy network
        self.model = PolicyNet(self.config.n_state, self.config.n_action)
        self.model.to(self.config.device)
        self.model_optimizer = torch.optim.Adam(self.model.parameters(),
                                                lr=self.config.learning_rate)

    # Sample an action stochastically from the policy network's output
    def get_action(self, state):
        state = torch.tensor(state, dtype=torch.float).to(self.config.device)
        policy = self.model(state)
        policy = policy.detach().cpu().numpy()[0]
        return np.random.choice(self.config.n_action, 1, p=policy)[0]

    # Append a transition to the history
    def append_replay(self, state, action, reward, next_state):
        act = np.zeros(self.config.n_action)
        act[action] = 1
        self.replay_memory.append((state, act, reward, next_state))

    # Compute the (normalized) discounted returns
    def get_returns(self, rewards):
        returns = torch.zeros(len(rewards), dtype=torch.float).to(self.config.device)
        R = 0

        for i in reversed(range(0, len(rewards))):
            R = rewards[i] + self.config.discount_factor * R
            returns[i] = R

        if 1 < len(returns):
            returns -= torch.mean(returns)
            returns /= (torch.std(returns) + 1.e-7)

        return returns

    # Update the policy network from the stored rollout
    def train_model(self):
        # Rearrange the history into arrays
        replay_memory = np.array(self.replay_memory)
        self.replay_memory.clear()

        states = np.vstack(replay_memory[:, 0])
        actions = list(replay_memory[:, 1])
        rewards = list(replay_memory[:, 2])
        next_states = list(replay_memory[:, 3])

        states = torch.tensor(states, dtype=torch.float).to(self.config.device)
        actions = torch.tensor(actions, dtype=torch.float).to(self.config.device)

        # Compute the returns
        returns = self.get_returns(rewards)

        loss = self.train_policy(states, actions, returns)
        return loss

    # Update the policy network (REINFORCE loss)
    def train_policy(self, states, actions, returns):
        policy = self.model(states)
        action_prob = torch.sum(actions * policy, dim=1)
        cross_entropy = torch.log(action_prob + 1.e-7) * returns
        loss = -torch.mean(cross_entropy)

        self.model_optimizer.zero_grad()
        loss.backward()
        self.model_optimizer.step()

        return loss.item()

    # Save the model weights to a file
    def save(self):
        torch.save(self.model.state_dict(), self.config.save_file)

    # Load the model weights from a file
    def load(self):
        self.model.load_state_dict(torch.load(self.config.save_file))

    # Release GPU memory
    def close(self):
        del self.model
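
# A minimal sketch of an episode loop that drives PGAgent (REINFORCE: one update
# per finished episode). The config fields match those referenced in the class;
# the CartPole environment and the classic pre-0.26 gym step/reset API are
# assumptions for illustration.
import gym
import numpy as np

def run_training(agent, config, n_episode=1000):
    env = gym.make("CartPole-v1")

    for ep in range(n_episode):
        state = np.reshape(env.reset(), [1, config.n_state])
        done = False
        score = 0.0

        while not done:
            action = agent.get_action(state)
            next_state, reward, done, _ = env.step(action)
            next_state = np.reshape(next_state, [1, config.n_state])
            agent.append_replay(state, action, reward, next_state)
            state = next_state
            score += reward

        # Update the policy once the episode is complete
        loss = agent.train_model()
        print("episode: {:4d}, score: {:6.1f}, loss: {:.6f}".format(ep, score, loss))

    env.close()
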
class A2CAgent:
    def __init__(self, config):
        self.config = config

        # replay memory
        self.replay_memory = deque(maxlen=self.config.n_replay_memory)

        # Create the policy network
        self.actor = PolicyNet(self.config.n_state, self.config.n_action)
        self.actor.to(self.config.device)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(),
                                                lr=self.config.actor_lr)

        # Create the value network
        self.critic = ValueNet(self.config.n_state, 1)
        self.critic.to(self.config.device)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(),
                                                 lr=self.config.critic_lr)

    # Sample an action stochastically from the policy network's output
    def get_action(self, state):
        state = torch.tensor(state, dtype=torch.float).to(self.config.device)
        policy = self.actor(state)
        policy = policy.detach().cpu().numpy()[0]
        return np.random.choice(self.config.n_action, 1, p=policy)[0]

    # Append a transition to the history
    def append_replay(self, state, action, reward, next_state):
        act = np.zeros(self.config.n_action)
        act[action] = 1
        self.replay_memory.append((state, act, reward, next_state))

    # Compute the discounted returns, bootstrapping from the last state value
    def get_returns(self, rewards, done, next_value):
        returns = torch.zeros(len(rewards), dtype=torch.float).to(self.config.device)
        R = 0 if done else next_value

        for i in reversed(range(0, len(rewards))):
            R = rewards[i] + self.config.discount_factor * R
            returns[i] = R

        return returns

    # Update the policy network and the value network
    def train_model(self, done):
        # Rearrange the history into arrays
        replay_memory = np.array(self.replay_memory)
        self.replay_memory.clear()

        states = np.vstack(replay_memory[:, 0])
        actions = list(replay_memory[:, 1])
        rewards = list(replay_memory[:, 2])
        next_states = list(replay_memory[:, 3])

        states = torch.tensor(states, dtype=torch.float).to(self.config.device)
        actions = torch.tensor(actions, dtype=torch.float).to(self.config.device)
        next_states = torch.tensor(next_states, dtype=torch.float).to(self.config.device)

        # Detach the bootstrap value so no gradient flows through the target
        next_values = self.critic(next_states).view(-1).detach()

        # Compute the returns
        returns = self.get_returns(rewards, done, next_values[-1])
        values = self.critic(states).view(-1)

        # Train the value network
        critic_loss = self.train_critic(values, returns)

        # Train the policy network
        actor_loss = self.train_actor(states, actions, returns - values)

        return actor_loss, critic_loss

    # Update the policy network
    def train_actor(self, states, actions, advantages):
        policy = self.actor(states)
        action_prob = torch.sum(actions * policy, dim=1)
        cross_entropy = torch.log(action_prob + 1.e-7) * advantages.detach()
        actor_loss = -torch.mean(cross_entropy)

        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        return actor_loss.item()

    # Update the value network
    def train_critic(self, values, targets):
        critic_loss = torch.mean(torch.pow(targets - values, 2))

        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        return critic_loss.item()

    # Save the model weights to files
    def save(self):
        torch.save(self.actor.state_dict(), self.config.save_file + ".actor")
        torch.save(self.critic.state_dict(), self.config.save_file + ".critic")

    # Load the model weights from files
    def load(self):
        self.actor.load_state_dict(torch.load(self.config.save_file + ".actor"))
        self.critic.load_state_dict(torch.load(self.config.save_file + ".critic"))

    # Release GPU memory
    def close(self):
        del self.actor
        del self.critic
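
# A minimal sketch of the loop that drives A2CAgent: transitions are collected
# into the replay memory and train_model(done) is called every few steps, so the
# last next_state value can bootstrap unfinished segments. The n_update constant,
# the CartPole environment and the classic gym API are assumptions for
# illustration.
import gym
import numpy as np

def run_training(agent, config, n_episode=1000, n_update=32):
    env = gym.make("CartPole-v1")

    for ep in range(n_episode):
        state = np.reshape(env.reset(), [1, config.n_state])
        done = False
        step = 0

        while not done:
            action = agent.get_action(state)
            next_state, reward, done, _ = env.step(action)
            next_state = np.reshape(next_state, [1, config.n_state])
            agent.append_replay(state, action, reward, next_state)
            state = next_state
            step += 1

            # Update on a fixed-length segment or at the end of the episode
            if step % n_update == 0 or done:
                actor_loss, critic_loss = agent.train_model(done)

        print("episode: {:4d}, actor loss: {:.6f}, critic loss: {:.6f}".format(
            ep, actor_loss, critic_loss))

    env.close()
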
def main():
    #Parse arguments
    #----------------------------
    parser = argparse.ArgumentParser()
    parser.add_argument("--env", default="BipedalWalker-v3")
    parser.add_argument("--discrete", action="store_true")
    parser.add_argument("--unwrap", action="store_true")
    args = parser.parse_args()

    #Parameters
    #----------------------------
    clip_val = 0.2
    sample_mb_size = 64
    sample_n_epoch = 4
    lamb = 0.95
    gamma = 0.99
    ent_weight = 0.01
    max_grad_norm = 0.5
    lr = 1e-4
    n_iter = 10000
    disp_step = 30
    save_step = 300
    save_dir = "./save"
    device = "cuda:0" if torch.cuda.is_available() else "cpu"

    #Create environment
    #----------------------------
    env = gym.make(args.env)

    if args.discrete:
        s_dim = env.observation_space.shape[0]
        a_dim = env.action_space.n
    else:
        s_dim = env.observation_space.shape[0]
        a_dim = env.action_space.shape[0]

    if args.unwrap:
        env = env.unwrapped

    runner = EnvRunner(
        s_dim, a_dim, gamma, lamb, max_step=2048, device=device,
        conti=not args.discrete)

    #Create model
    #----------------------------
    policy_net = PolicyNet(s_dim, a_dim, conti=not args.discrete).to(device)
    value_net = ValueNet(s_dim).to(device)
    agent = PPO(
        policy_net, value_net, lr, max_grad_norm, ent_weight, clip_val,
        sample_n_epoch, sample_mb_size, device=device)

    #Load model
    #----------------------------
    if not os.path.exists(save_dir):
        os.mkdir(save_dir)

    if os.path.exists(os.path.join(save_dir, "{}.pt".format(args.env))):
        print("Loading the model ... ", end="")
        checkpoint = torch.load(os.path.join(save_dir, "{}.pt".format(args.env)))
        policy_net.load_state_dict(checkpoint["PolicyNet"])
        value_net.load_state_dict(checkpoint["ValueNet"])
        start_it = checkpoint["it"]
        print("Done.")
    else:
        start_it = 0

    #Start training
    #----------------------------
    t_start = time.time()
    policy_net.train()
    value_net.train()
    mean_total_reward = 0
    mean_length = 0

    for it in range(start_it, n_iter):
        #Run the environment
        with torch.no_grad():
            mb_obs, mb_actions, mb_old_a_logps, mb_values, mb_returns, mb_rewards = runner.run(
                env, policy_net, value_net)
            mb_advs = mb_returns - mb_values
            mb_advs = (mb_advs - mb_advs.mean()) / (mb_advs.std() + 1e-6)

        #Train
        pg_loss, v_loss, ent = agent.train(
            policy_net, value_net, mb_obs, mb_actions, mb_values, mb_advs,
            mb_returns, mb_old_a_logps)
        mean_total_reward += mb_rewards.sum()
        mean_length += len(mb_obs)
        print("[Episode {:4d}] total reward = {:.6f}, length = {:d}".format(
            it, mb_rewards.sum(), len(mb_obs)))

        #Print the result
        if it % disp_step == 0:
            print("\n[{:5d} / {:5d}]".format(it, n_iter))
            print("----------------------------------")
            print("Elapsed time = {:.2f} sec".format(time.time() - t_start))
            print("actor loss = {:.6f}".format(pg_loss))
            print("critic loss = {:.6f}".format(v_loss))
            print("entropy = {:.6f}".format(ent))
            print("mean return = {:.6f}".format(mean_total_reward / disp_step))
            print("mean length = {:.2f}".format(mean_length / disp_step))
            print()

            agent.lr_decay(it, n_iter)
            mean_total_reward = 0
            mean_length = 0

        #Save model
        if it % save_step == 0:
            print("Saving the model ... ", end="")
            torch.save({
                "it": it,
                "PolicyNet": policy_net.state_dict(),
                "ValueNet": value_net.state_dict()
            }, os.path.join(save_dir, "{}.pt".format(args.env)))
            print("Done.")
            print()

    env.close()
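
#agent.train() above is assumed to optimize the standard PPO clipped surrogate
#objective controlled by clip_val. A minimal sketch of those losses for one
#mini-batch (tensor names are illustrative assumptions):
import torch

def ppo_losses(a_logps, old_a_logps, advs, values, returns, clip_val=0.2):
    #Policy: clipped probability-ratio surrogate
    ratio = torch.exp(a_logps - old_a_logps)
    surr1 = ratio * advs
    surr2 = torch.clamp(ratio, 1.0 - clip_val, 1.0 + clip_val) * advs
    pg_loss = -torch.min(surr1, surr2).mean()

    #Value: squared error against the empirical returns
    v_loss = (values - returns).pow(2).mean()

    return pg_loss, v_loss
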
policy_net = PolicyNet(NFRAMES)
policy_net.cuda()
policy_net.share_memory()  # make it store in shared memory
opt = optim.RMSprop(policy_net.parameters(), lr=5e-4, alpha=0.99, eps=1e-5)
samplers = [
    EnvSampler(env, policy_net, NFRAMES, MAXSTEP, GAMMA)
    for _ in range(NWORKERS)
]
global_step = 0

ctx = mp.get_context('spawn')
queue = ctx.Queue()
event = ctx.Event()
workers = []

for i in range(NWORKERS):
    worker = ctx.Process(target=sample, args=(samplers[i], queue, event), daemon=True)
    worker.start()
    workers.append(worker)

for i in range(NEPOCH):
    print("Step: {:6d}, Avg_reward: {:12.6f}, PolicyNet Loss: {:12.6f}".format(
        i + 1, *train_step(queue, event)),
        end="\r")

    if (i + 1) % EPOCHSTEP == 0:
        torch.save({"policy": policy_net.state_dict()}, "model.pt")
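
#sample() and train_step() are referenced above but not defined in this snippet.
#The sketch below shows the queue/event handshake they are assumed to implement:
#each worker pushes one rollout into the queue and then waits on the event,
#while the trainer gathers NWORKERS rollouts, runs one policy-gradient step on
#policy_net/opt from the script above, and sets the event to release the
#workers. The rollout layout (observations, int64 actions, returns, episode
#reward) and the logits output of PolicyNet are illustrative assumptions, not
#this repository's actual API.
import torch
import torch.nn.functional as F

def sample(sampler, queue, event):
    #Worker loop: produce one rollout, hand it to the trainer, then wait
    while True:
        queue.put(sampler.rollout())  #assumed sampler API
        event.wait()                  #roughly throttles workers to the trainer's pace

def train_step(queue, event):
    event.clear()

    #Gather one rollout per worker
    obs, actions, returns, rewards = [], [], [], []
    for _ in range(NWORKERS):
        o, a, r, ep_reward = queue.get()
        obs.append(o)
        actions.append(a)
        returns.append(r)
        rewards.append(ep_reward)

    obs = torch.cat(obs).cuda()
    actions = torch.cat(actions).cuda()
    returns = torch.cat(returns).cuda()

    #Vanilla policy-gradient update on the gathered batch
    logp = F.log_softmax(policy_net(obs), dim=1)
    logp = logp.gather(1, actions.view(-1, 1)).squeeze(1)
    loss = -(logp * returns).mean()

    opt.zero_grad()
    loss.backward()
    opt.step()

    event.set()  #let the workers start the next round
    return sum(rewards) / len(rewards), loss.item()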