def __init__(self, args):
    self.args = args
    self.pool = multiprocessing.Pool(args.worker)
    env = gym.make(args.env_name)
    o_dim = env.observation_space.shape[0]
    s_dim = 128
    a_dim = env.action_space.n
    self.population_status = []
    for _ in range(args.population):
        individual_status = {}
        name = ''.join(
            random.choice(string.ascii_letters + string.digits) for _ in range(8))
        individual_status['name'] = name
        env_name = self.args.env_name
        individual_status['env_name'] = env_name
        policy_net = PolicyNet(o_dim, s_dim, a_dim)
        policy_net.share_memory()
        individual_status['policy_net'] = policy_net
        evaluate_net = EvaluateNet(o_dim, s_dim, a_dim)
        evaluate_net.share_memory()
        individual_status['evolution_net'] = evaluate_net
        steps = self.args.step_per_generation
        individual_status['steps'] = steps
        self.population_status.append(individual_status)
class A3CLocal:
    def __init__(self, config):
        self.config = config
        # Replay memory
        self.replay_memory = deque(maxlen=self.config.n_replay_memory)
        # Create the policy network
        self.actor = PolicyNet(self.config.n_state, self.config.n_action)
        self.actor.to(device)
        # Create the value network
        self.critic = ValueNet(self.config.n_state, 1)
        self.critic.to(device)

    # Sample an action stochastically from the policy network's output
    def get_action(self, state):
        state = torch.tensor(state, dtype=torch.float).to(device)
        policy = self.actor(state)
        policy = policy.detach().cpu().numpy()[0]
        return np.random.choice(self.config.n_action, 1, p=policy)[0]

    # Append a transition to the replay memory
    def append_replay(self, state, action, reward, next_state):
        act = np.zeros(self.config.n_action)
        act[action] = 1
        self.replay_memory.append((state, act, reward, next_state))

    # Read and clear the replay memory
    def get_replay(self):
        # Arrange the history as arrays
        replay_memory = np.array(self.replay_memory)
        self.replay_memory.clear()
        states = np.vstack(replay_memory[:, 0])
        actions = list(replay_memory[:, 1])
        rewards = list(replay_memory[:, 2])
        next_states = list(replay_memory[:, 3])
        return states, actions, rewards, next_states

    # Copy the global networks' weights into the local networks
    def update_local_model(self, actor_dict, critic_dict):
        self.actor.load_state_dict(actor_dict)
        self.critic.load_state_dict(critic_dict)

    # Release GPU memory
    def close(self):
        del self.actor
        del self.critic
def load_checkpoint(file_dir, i_epoch, layer_sizes, input_size, device='cuda'):
    checkpoint = torch.load(os.path.join(file_dir, "ckpt_eps%d.pt" % i_epoch),
                            map_location=device)
    policy_net = PolicyNet(layer_sizes).to(device)
    value_net = ValueNet(input_size).to(device)
    policy_net.load_state_dict(checkpoint["policy_net"])
    policy_net.train()
    value_net.load_state_dict(checkpoint["value_net"])
    value_net.train()
    policy_lr = checkpoint["policy_lr"]
    valuenet_lr = checkpoint["valuenet_lr"]
    policynet_optim = optim.Adam(policy_net.parameters(), lr=policy_lr)
    policynet_optim.load_state_dict(checkpoint["policynet_optim"])
    valuenet_optim = optim.Adam(value_net.parameters(), lr=valuenet_lr)
    valuenet_optim.load_state_dict(checkpoint["valuenet_optim"])
    checkpoint.pop("policy_net")
    checkpoint.pop("value_net")
    checkpoint.pop("policynet_optim")
    checkpoint.pop("valuenet_optim")
    checkpoint.pop("i_epoch")
    checkpoint.pop("policy_lr")
    checkpoint.pop("valuenet_lr")
    return policy_net, value_net, policynet_optim, valuenet_optim, checkpoint
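# A minimal sketch of the matching writer side, assuming the same key layout that
# load_checkpoint() above expects. The function name save_checkpoint and the
# extra_info argument are assumptions for illustration, not from the original code.
import os
import torch

def save_checkpoint(file_dir, i_epoch, policy_net, value_net,
                    policynet_optim, valuenet_optim,
                    policy_lr, valuenet_lr, extra_info=None):
    checkpoint = {
        "i_epoch": i_epoch,
        "policy_net": policy_net.state_dict(),
        "value_net": value_net.state_dict(),
        "policynet_optim": policynet_optim.state_dict(),
        "valuenet_optim": valuenet_optim.state_dict(),
        "policy_lr": policy_lr,
        "valuenet_lr": valuenet_lr,
    }
    # Any remaining entries are returned untouched by load_checkpoint(),
    # so extra training statistics can simply be merged in.
    if extra_info:
        checkpoint.update(extra_info)
    torch.save(checkpoint, os.path.join(file_dir, "ckpt_eps%d.pt" % i_epoch))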
def load_checkpoint(file_dir, i_epoch, layer_sizes, device='cuda'):
    checkpoint = torch.load(os.path.join(file_dir, "ckpt_eps%d.pt" % i_epoch))
    policy_net = PolicyNet(layer_sizes).to(device)
    policy_net.load_state_dict(checkpoint["policy_net"])
    policy_net.train()
    learning_rate = checkpoint["learning_rate"]
    optimizer = optim.Adam(policy_net.parameters())
    # optimizer = optim.SGD(policy_net.parameters(), lr=learning_rate)
    optimizer.load_state_dict(checkpoint["optimizer"])
    checkpoint.pop("policy_net")
    checkpoint.pop("optimizer")
    checkpoint.pop("i_epoch")
    checkpoint.pop("learning_rate")
    return policy_net, optimizer, checkpoint
def main():
    #Parse arguments
    #----------------------------
    parser = argparse.ArgumentParser()
    parser.add_argument("--env", default="BipedalWalker-v3")
    parser.add_argument("--discrete", action="store_true")
    parser.add_argument("--unwrap", action="store_true")
    args = parser.parse_args()

    #Parameters
    #----------------------------
    save_dir = "./save"
    device = "cuda:0" if torch.cuda.is_available() else "cpu"

    #Create environment
    #----------------------------
    env = gym.make(args.env)

    if args.discrete:
        s_dim = env.observation_space.shape[0]
        a_dim = env.action_space.n
    else:
        s_dim = env.observation_space.shape[0]
        a_dim = env.action_space.shape[0]

    if args.unwrap:
        env = env.unwrapped

    #Create model
    #----------------------------
    policy_net = PolicyNet(s_dim, a_dim, conti=not args.discrete).to(device)
    print(policy_net)

    #Load model
    #----------------------------
    model_path = os.path.join(save_dir, "{}.pt".format(args.env))

    if os.path.exists(model_path):
        print("Loading the model ... ", end="")
        checkpoint = torch.load(model_path)
        policy_net.load_state_dict(checkpoint["PolicyNet"])
        start_it = checkpoint["it"]
        print("Done.")
    else:
        print("Error: No model saved")
        sys.exit(1)

    #Start evaluation
    #----------------------------
    policy_net.eval()

    with torch.no_grad():
        for it in range(10):
            ob = env.reset()
            total_reward = 0
            length = 0

            while True:
                env.render()
                ob_tensor = torch.tensor(np.expand_dims(ob, axis=0), dtype=torch.float32, device=device)
                action = policy_net.action_step(ob_tensor, deterministic=True).cpu().numpy()
                ob, reward, done, info = env.step(action[0])
                total_reward += reward
                length += 1

                if done:
                    print("Total reward = {:.6f}, length = {:d}".format(total_reward, length))
                    break

    env.close()
class Reinforce(object): # Implementation of the policy gradient method REINFORCE. def __init__(self, env, lr, gamma, save_path, load=False): self.mean = [] self.std = [] self.env = env s_len = self.env.observation_space_shape[0] a_len = self.env.action_space_n self.model = PolicyNet(lr, s_len, a_len) self.gamma = gamma self.save_path = save_path if load: self.model.load(self.save_path) return def train(self): # Trains the model on a single episode using REINFORCE. # method generate_episode() to generate training data. K = 100 print("pretrain test:") self.test() print("training") # generate an episode for i in range(10000000): s, ava, a, r = self.generate_episode() s = np.array(s) r = np.array(r) r /= 100.0 T = len(r) G = np.zeros(shape=(T, ), dtype=np.float32) G[T - 1] = r[T - 1] for t in range(T - 2, -1, -1): G[t] = self.gamma * G[t + 1] + r[t] for j in range(6): self.model.fit(s, ava, a, G) if (i + 1) % K == 0: mean, std = self.test() self.mean.append(mean) self.std.append(std) with open('mean_np_array.pickle', 'wb+') as f: pickle.dump(self.mean, f) with open('std_np_array.pickle', 'wb+') as f: pickle.dump(self.std, f) self.model.save(self.save_path) self.model.save(self.save_path) return def generate_episode_fast(self): s = self.env.reset() done = False states = [] avas = [] rewards = [] predict_fn = self.model.predict step_fn = self.env.step s_append_fn = states.append r_append_fn = rewards.append while not done: ava = self.env.ava() a = predict_fn(s.reshape([1, -1]), ava.reshape([1, -1])) s_append_fn(s) avas.append(ava) s, r, done, _ = step_fn(a) r_append_fn(r) return states, avas, rewards def generate_episode(self): s = self.env.reset() done = False states = [] rewards = [] actions = [] avas = [] predict_fn = self.model.predict step_fn = self.env.step s_append_fn = states.append r_append_fn = rewards.append a_append_fn = actions.append while not done: ava = self.env.ava() a = predict_fn(s.reshape([1, -1]), ava.reshape([1, -1])) if a >= 1352: p, p_norm = self.model.sess.run( [self.model.p, self.model.p_norm], { self.model.s: s.reshape([1, -1]), self.model.ava: ava.reshape([1, -1]) }) print(a) print(ava) print(sum(ava)) a_append_fn(a) s_append_fn(s) avas.append(ava) s, r, done, _ = step_fn(a) r_append_fn(r) return states, avas, actions, rewards def generate_episode_render(self): s = self.env.reset() done = False states = [] avas = [] rewards = [] predict_fn = self.model.predict step_fn = self.env.step s_append_fn = states.append r_append_fn = rewards.append while not done: ava = self.env.ava() a = predict_fn(s.reshape([1, -1]), ava.reshape([1, -1])) s_append_fn(s) avas.append(ava) s, r, done, _ = step_fn(a) r_append_fn(r) self.env.render() return states, avas, rewards def test(self): r = [] for i in range(100): _, _, ri = self.generate_episode_fast() ri = sum(ri) r.append(ri) std = np.std(r) mean = np.mean(r) print('r =', mean, "+-", std) return mean, std def test_render(self): r = [] for i in range(100): _, _, ri = self.generate_episode_render() ri = sum(ri) r.append(ri) steps = len(r) std = np.std(r) mean = np.mean(r) print('r =', mean, "+-", std, "steps", steps) return mean, std
def main():
    #Parse arguments
    #----------------------------
    parser = argparse.ArgumentParser()
    parser.add_argument("--env", default="CartPole-v0")
    parser.add_argument("--conti", action="store_true")
    parser.add_argument("--unwrap", action="store_true")
    args = parser.parse_args()

    #Parameters
    #----------------------------
    n_env = 8
    n_step = 128
    mb_size = n_env * n_step
    sample_mb_size = 64
    sample_n_epoch = 4
    clip_val = 0.2
    lamb = 0.95
    gamma = 0.99
    ent_weight = 0.0
    max_grad_norm = 0.5
    beta = 0.1
    lr = 1e-4
    n_iter = 30000
    disp_step = 30
    save_step = 300
    save_dir = "./save"
    device = "cuda:0"
    expert_path = "../save/{}_traj.pkl".format(args.env)

    #Create multiple environments
    #----------------------------
    env = MultiEnv([
        make_env(i, env_id=args.env, unwrap=args.unwrap, rand_seed=int(time.time()))
        for i in range(n_env)
    ])

    if args.conti:
        s_dim = env.ob_space.shape[0]
        a_dim = env.ac_space.shape[0]
    else:
        s_dim = env.ob_space.shape[0]
        a_dim = env.ac_space.n

    runner = EnvRunner(env, s_dim, a_dim, n_step, gamma, lamb, device=device, conti=args.conti)

    #Load expert trajectories
    #----------------------------
    if os.path.exists(expert_path):
        s_real, a_real = pkl.load(open(expert_path, "rb"))
        sa_real = []

        if args.conti:
            for i in range(len(s_real)):
                sa_real.append(np.concatenate([s_real[i], a_real[i]], 1))
        else:
            for i in range(len(s_real)):
                a_real_onehot = np.zeros((len(a_real[i]), a_dim), dtype=np.float32)

                for j in range(len(a_real[i])):
                    a_real_onehot[j, a_real[i][j]] = 1

                sa_real.append(np.concatenate([s_real[i], a_real_onehot], 1))

        sa_real = np.concatenate(sa_real, 0)
    else:
        print("ERROR: No expert trajectory file found")
        sys.exit(1)

    #Create model
    #----------------------------
    policy_net = PolicyNet(s_dim, a_dim, conti=args.conti).to(device)
    value_net = ValueNet(s_dim).to(device)
    dis_net = DiscriminatorNet(s_dim + a_dim).to(device)
    agent = PPO(policy_net, value_net, dis_net, a_dim, beta, lr, max_grad_norm, ent_weight,
                clip_val, sample_n_epoch, sample_mb_size, mb_size, device=device, conti=args.conti)

    #Load model
    #----------------------------
    if not os.path.exists(save_dir):
        os.mkdir(save_dir)

    if os.path.exists(os.path.join(save_dir, "{}.pt".format(args.env))):
        print("Loading the model ... ", end="")
        checkpoint = torch.load(os.path.join(save_dir, "{}.pt".format(args.env)))
        policy_net.load_state_dict(checkpoint["PolicyNet"])
        value_net.load_state_dict(checkpoint["ValueNet"])
        dis_net.load_state_dict(checkpoint["DiscriminatorNet"])
        agent.beta = checkpoint["beta"]
        start_it = checkpoint["it"]
        print("Done.")
    else:
        start_it = 0

    #Start training
    #----------------------------
    t_start = time.time()
    policy_net.train()
    value_net.train()

    for it in range(start_it, n_iter):
        #Run the environment
        with torch.no_grad():
            mb_obs, mb_actions, mb_old_a_logps, mb_values, mb_returns = runner.run(
                policy_net, value_net, dis_net)
            mb_advs = mb_returns - mb_values
            mb_advs = (mb_advs - mb_advs.mean()) / (mb_advs.std() + 1e-6)

        #Train
        pg_loss, v_loss, ent, dis_loss, dis_real, dis_fake, avg_kl = agent.train(
            policy_net, value_net, dis_net, mb_obs, mb_actions, mb_values,
            mb_advs, mb_returns, mb_old_a_logps, sa_real)

        #Print the result
        if it % disp_step == 0:
            agent.lr_decay(it, n_iter)
            policy_net.eval()
            value_net.eval()
            n_sec = time.time() - t_start
            fps = int((it - start_it) * n_env * n_step / n_sec)
            mean_true_return, std_true_return, mean_return, std_return, mean_len = runner.get_performance()
            policy_net.train()
            value_net.train()

            print("[{:5d} / {:5d}]".format(it, n_iter))
            print("----------------------------------")
            print("Timesteps = {:d}".format((it - start_it) * mb_size))
            print("Elapsed time = {:.2f} sec".format(n_sec))
            print("FPS = {:d}".format(fps))
            print("actor loss = {:.6f}".format(pg_loss))
            print("critic loss = {:.6f}".format(v_loss))
            print("dis loss = {:.6f}".format(dis_loss))
            print("entropy = {:.6f}".format(ent))
            print("avg_kl = {:.6f}".format(avg_kl))
            print("beta = {:.6f}".format(agent.beta))
            print("mean true return = {:.6f}".format(mean_true_return))
            print("mean return = {:.6f}".format(mean_return))
            print("mean length = {:.2f}".format(mean_len))
            print("dis_real = {:.3f}".format(dis_real))
            print("dis_fake = {:.3f}".format(dis_fake))
            print()

        #Save model
        if it % save_step == 0:
            print("Saving the model ... ", end="")
            torch.save({
                "beta": agent.beta,
                "it": it,
                "PolicyNet": policy_net.state_dict(),
                "ValueNet": value_net.state_dict(),
                "DiscriminatorNet": dis_net.state_dict()
            }, os.path.join(save_dir, "{}.pt".format(args.env)))
            print("Done.")
            print()

    env.close()
from model import PolicyNet, ValueNet
import gym
import torch
from torch import optim, distributions
import torch.nn.functional as F

env = gym.make("CartPole-v1")
# observation = env.reset()
# print(observation)
# print(env.observation_space)

MAXSTEP = 100
BATCHSIZE = 16
EPOCH = 1000
GAMMA = 0.99

policy_net = PolicyNet()
value_net = ValueNet()
policy_net.cuda()
value_net.cuda()
opt1 = optim.Adam(policy_net.parameters(), lr=1e-3)
opt2 = optim.Adam(value_net.parameters(), lr=1e-3)


# train one epoch
def train_step():
    observ_batch = []
    reward_batch = []
    action_batch = []
    mask_batch = []
def main(): #Parse arguments #---------------------------- parser = argparse.ArgumentParser() parser.add_argument("--env", default="CartPole-v0") parser.add_argument("--conti", action="store_true") args = parser.parse_args() #Parameters #---------------------------- env_id = args.env mb_size = 256 lr = 1e-5 n_iter = 100000 disp_step = 1000 save_step = 10000 save_dir = "./save" device = "cuda:0" expert_path = "../save/{}_traj.pkl".format(args.env) #Create environment #---------------------------- env = gym.make(env_id) if args.conti: s_dim = env.observation_space.shape[0] a_dim = env.action_space.shape[0] else: s_dim = env.observation_space.shape[0] a_dim = env.action_space.n #Load expert trajectories #---------------------------- if os.path.exists(expert_path): s_traj, a_traj = pkl.load(open(expert_path, "rb")) s_traj = np.concatenate(s_traj, 0) a_traj = np.concatenate(a_traj, 0) else: print("ERROR: No expert trajectory file found") sys.exit(1) #Create model #---------------------------- policy_net = PolicyNet(s_dim, a_dim, conti=args.conti).to(device) opt = torch.optim.Adam(policy_net.parameters(), lr) #Load model #---------------------------- if not os.path.exists(save_dir): os.mkdir(save_dir) if os.path.exists(os.path.join(save_dir, "{}.pt".format(args.env))): print("Loading the model ... ", end="") checkpoint = torch.load( os.path.join(save_dir, "{}.pt".format(args.env))) policy_net.load_state_dict(checkpoint["PolicyNet"]) start_it = checkpoint["it"] print("Done.") else: start_it = 0 #Start training #---------------------------- t_start = time.time() policy_net.train() for it in range(start_it, n_iter + 1): #Train mb_obs, mb_actions = sample_batch(s_traj, a_traj, mb_size) mb_a_logps, mb_ents = policy_net.evaluate( torch.from_numpy(mb_obs).to(device), torch.from_numpy(mb_actions).to(device)) loss = -mb_a_logps.mean() opt.zero_grad() loss.backward() opt.step() #Print the result if it % disp_step == 0: print("[{:5d} / {:5d}] Elapsed time = {:.2f}, actor loss = {:.6f}". format(it, n_iter, time.time() - t_start, loss.item())) #Save model if it % save_step == 0: print("Saving the model ... ", end="") torch.save({ "it": it, "PolicyNet": policy_net.state_dict() }, os.path.join(save_dir, "{}.pt".format(args.env))) print("Done.") print() env.close()
class A3CGlobal:
    def __init__(self, config):
        self.config = config
        # Create the policy network
        self.actor = PolicyNet(self.config.n_state, self.config.n_action)
        self.actor.to(device)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=self.config.actor_lr)
        # Create the value network
        self.critic = ValueNet(self.config.n_state, 1)
        self.critic.to(device)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=self.config.critic_lr)

    # Compute the discounted returns
    def get_returns(self, rewards, done, next_value):
        returns = torch.zeros(len(rewards), dtype=torch.float).to(self.config.device)
        R = 0 if done else next_value
        for i in reversed(range(0, len(rewards))):
            R = rewards[i] + self.config.discount_factor * R
            returns[i] = R
        return returns

    # Update the policy and value networks at each time step
    def train_model(self, states, actions, rewards, next_states, done):
        states = torch.tensor(states, dtype=torch.float).to(self.config.device)
        actions = torch.tensor(actions, dtype=torch.float).to(self.config.device)
        next_states = torch.tensor(next_states, dtype=torch.float).to(self.config.device)

        next_values = self.critic(next_states).view(-1)
        # Compute the returns
        returns = self.get_returns(rewards, done, next_values[-1])
        values = self.critic(states).view(-1)

        # Train the value network
        critic_loss = self.train_critic(values, returns)
        # Train the policy network
        actor_loss = self.train_actor(states, actions, returns - values)
        return actor_loss, critic_loss

    # Update the policy network
    def train_actor(self, states, actions, advantages):
        policy = self.actor(states)
        action_prob = torch.sum(actions * policy, dim=1)
        cross_entropy = torch.log(action_prob + 1.e-7) * advantages.detach()
        actor_loss = -torch.mean(cross_entropy)

        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()
        return actor_loss.item()

    # Update the value network
    def train_critic(self, values, targets):
        critic_loss = torch.mean(torch.pow(targets - values, 2))

        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()
        return critic_loss.item()

    # Release GPU memory
    def close(self):
        del self.actor
        del self.critic
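# A minimal single-worker sketch of how A3CLocal and A3CGlobal above might be wired
# together, assuming a gym environment and the config fields referenced in the two
# classes. config.env_name and config.n_rollout are hypothetical placeholder names,
# and the explicit train_model()/update_local_model() hand-off is one possible design,
# not necessarily the original one.
import gym

def run_worker(config, global_agent):
    local_agent = A3CLocal(config)
    env = gym.make(config.env_name)

    for episode in range(1000):
        state = env.reset()
        done = False
        while not done:
            # Collect a short rollout with the (possibly stale) local policy
            for _ in range(config.n_rollout):
                action = local_agent.get_action([state])
                next_state, reward, done, _ = env.step(action)
                local_agent.append_replay(state, action, reward, next_state)
                state = next_state
                if done:
                    break
            # Push the rollout into the global networks, then refresh the local copy
            states, actions, rewards, next_states = local_agent.get_replay()
            global_agent.train_model(states, actions, rewards, next_states, done)
            local_agent.update_local_model(global_agent.actor.state_dict(),
                                           global_agent.critic.state_dict())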
def main(): #Parse arguments #---------------------------- parser = argparse.ArgumentParser() parser.add_argument("--env", default="BipedalWalker-v3") parser.add_argument("--discrete", action="store_true") parser.add_argument("--unwrap", action="store_true") args = parser.parse_args() #Parameters #---------------------------- clip_val = 0.2 sample_mb_size = 64 sample_n_epoch = 4 lamb = 0.95 gamma = 0.99 ent_weight = 0.01 max_grad_norm = 0.5 lr = 1e-4 n_iter = 10000 disp_step = 30 save_step = 300 save_dir = "./save" device = "cuda:0" if torch.cuda.is_available() else "cpu" #Create environment #---------------------------- env = gym.make(args.env) if args.discrete: s_dim = env.observation_space.shape[0] a_dim = env.action_space.n else: s_dim = env.observation_space.shape[0] a_dim = env.action_space.shape[0] if args.unwrap: env = env.unwrapped runner = EnvRunner(s_dim, a_dim, gamma, lamb, max_step=2048, device=device, conti=not args.discrete) #Create model #---------------------------- policy_net = PolicyNet(s_dim, a_dim, conti=not args.discrete).to(device) value_net = ValueNet(s_dim).to(device) agent = PPO(policy_net, value_net, lr, max_grad_norm, ent_weight, clip_val, sample_n_epoch, sample_mb_size, device=device) #Load model #---------------------------- if not os.path.exists(save_dir): os.mkdir(save_dir) if os.path.exists(os.path.join(save_dir, "{}.pt".format(args.env))): print("Loading the model ... ", end="") checkpoint = torch.load( os.path.join(save_dir, "{}.pt".format(args.env))) policy_net.load_state_dict(checkpoint["PolicyNet"]) value_net.load_state_dict(checkpoint["ValueNet"]) start_it = checkpoint["it"] print("Done.") else: start_it = 0 #Start training #---------------------------- t_start = time.time() policy_net.train() value_net.train() mean_total_reward = 0 mean_length = 0 for it in range(start_it, n_iter): #Run the environment with torch.no_grad(): mb_obs, mb_actions, mb_old_a_logps, mb_values, mb_returns, mb_rewards = runner.run( env, policy_net, value_net) mb_advs = mb_returns - mb_values mb_advs = (mb_advs - mb_advs.mean()) / (mb_advs.std() + 1e-6) #Train pg_loss, v_loss, ent = agent.train(policy_net, value_net, mb_obs, mb_actions, mb_values, mb_advs, mb_returns, mb_old_a_logps) mean_total_reward += mb_rewards.sum() mean_length += len(mb_obs) print("[Episode {:4d}] total reward = {:.6f}, length = {:d}".format( it, mb_rewards.sum(), len(mb_obs))) #Print the result if it % disp_step == 0: print("\n[{:5d} / {:5d}]".format(it, n_iter)) print("----------------------------------") print("Elapsed time = {:.2f} sec".format(time.time() - t_start)) print("actor loss = {:.6f}".format(pg_loss)) print("critic loss = {:.6f}".format(v_loss)) print("entropy = {:.6f}".format(ent)) print("mean return = {:.6f}".format(mean_total_reward / disp_step)) print("mean length = {:.2f}".format(mean_length / disp_step)) print() agent.lr_decay(it, n_iter) mean_total_reward = 0 mean_length = 0 #Save model if it % save_step == 0: print("Saving the model ... ", end="") torch.save( { "it": it, "PolicyNet": policy_net.state_dict(), "ValueNet": value_net.state_dict() }, os.path.join(save_dir, "{}.pt".format(args.env))) print("Done.") print() env.close()
if __name__ == "__main__":
    writer = SummaryWriter("./log")
    env = gym.make("Pong-v0")

    MAXSTEP = 6
    NWORKERS = 4
    # 4000 * 1024 frames is roughly one epoch, as in the A3C paper
    EPOCHSTEP = 4000 * 1024 // (MAXSTEP * BATCHSIZE * NWORKERS)
    print("1 epoch contains {} steps".format(EPOCHSTEP))
    NEPOCH = 100 * EPOCHSTEP
    GAMMA = 0.99
    NFRAMES = 4

    policy_net = PolicyNet(NFRAMES)
    policy_net.cuda()
    policy_net.share_memory()  # keep the parameters in shared memory
    opt = optim.RMSprop(policy_net.parameters(), lr=5e-4, alpha=0.99, eps=1e-5)

    samplers = [
        EnvSampler(env, policy_net, NFRAMES, MAXSTEP, GAMMA)
        for _ in range(NWORKERS)
    ]

    global_step = 0
    ctx = mp.get_context('spawn')
    queue = ctx.Queue()
    event = ctx.Event()
    workers = []
def main(): #Parse arguments #---------------------------- parser = argparse.ArgumentParser() parser.add_argument("--env", default="CartPole-v0") parser.add_argument("--conti", action="store_true") parser.add_argument("--unwrap", action="store_true") args = parser.parse_args() #Parameters #---------------------------- env_id = args.env save_dir = "./save" device = "cuda:0" #Create environment #---------------------------- env = gym.make(env_id) if args.conti: s_dim = env.observation_space.shape[0] a_dim = env.action_space.shape[0] else: s_dim = env.observation_space.shape[0] a_dim = env.action_space.n if args.unwrap: env = env.unwrapped #Create model #---------------------------- policy_net = PolicyNet(s_dim, a_dim, conti=args.conti).to(device) #Load model #---------------------------- if os.path.exists(os.path.join(save_dir, "{}.pt".format(env_id))): print("Loading the model ... ", end="") checkpoint = torch.load(os.path.join(save_dir, "{}.pt".format(env_id))) policy_net.load_state_dict(checkpoint["PolicyNet"]) print("Done.") else: print("Error: No model saved") #Start playing #---------------------------- policy_net.eval() for it in range(100): ob = env.reset() ret = 0 while True: env.render() action = policy_net.action_step(torch.from_numpy(np.expand_dims(ob.__array__(), axis=0)).float().to(device), deterministic=True) ob, reward, done, info = env.step(action.cpu().detach().numpy()[0]) ret += reward if done: print("return = {:.4f}".format(ret)) break env.close()
def main(): #Parse arguments #---------------------------- parser = argparse.ArgumentParser() parser.add_argument("--env", default="CartPole-v0") parser.add_argument("--conti", action="store_true") parser.add_argument("--unwrap", action="store_true") args = parser.parse_args() #Parameters #---------------------------- n_env = 8 n_step = 128 mb_size = n_env * n_step sample_mb_size = 64 sample_n_epoch = 4 clip_val = 0.2 lamb = 0.95 gamma = 0.99 ent_weight = 0.0 max_grad_norm = 0.5 lr = 1e-4 n_iter = 30000 disp_step = 30 save_step = 300 save_dir = "./save" device = "cuda:0" #Create multiple environments #---------------------------- env = MultiEnv([ make_env(i, env_id=args.env, unwrap=args.unwrap, rand_seed=int(time.time())) for i in range(n_env) ]) if args.conti: s_dim = env.ob_space.shape[0] a_dim = env.ac_space.shape[0] else: s_dim = env.ob_space.shape[0] a_dim = env.ac_space.n runner = EnvRunner(env, s_dim, a_dim, n_step, gamma, lamb, device=device, conti=args.conti) #Create model #---------------------------- policy_net = PolicyNet(s_dim, a_dim, conti=args.conti).to(device) value_net = ValueNet(s_dim).to(device) agent = PPO(policy_net, value_net, lr, max_grad_norm, ent_weight, clip_val, sample_n_epoch, sample_mb_size, mb_size, device=device) #Load model #---------------------------- if not os.path.exists(save_dir): os.mkdir(save_dir) if os.path.exists(os.path.join(save_dir, "{}.pt".format(args.env))): print("Loading the model ... ", end="") checkpoint = torch.load( os.path.join(save_dir, "{}.pt".format(args.env))) policy_net.load_state_dict(checkpoint["PolicyNet"]) value_net.load_state_dict(checkpoint["ValueNet"]) start_it = checkpoint["it"] print("Done.") else: start_it = 0 #Start training #---------------------------- t_start = time.time() policy_net.train() value_net.train() for it in range(start_it, n_iter): #Run the environment with torch.no_grad(): mb_obs, mb_actions, mb_old_a_logps, mb_values, mb_returns = runner.run( policy_net, value_net) mb_advs = mb_returns - mb_values mb_advs = (mb_advs - mb_advs.mean()) / (mb_advs.std() + 1e-6) #Train pg_loss, v_loss, ent = agent.train(policy_net, value_net, mb_obs, mb_actions, mb_values, mb_advs, mb_returns, mb_old_a_logps) #Print the result if it % disp_step == 0: agent.lr_decay(it, n_iter) policy_net.eval() value_net.eval() n_sec = time.time() - t_start fps = int((it - start_it) * n_env * n_step / n_sec) mean_return, std_return, mean_len = runner.get_performance() policy_net.train() value_net.train() print("[{:5d} / {:5d}]".format(it, n_iter)) print("----------------------------------") print("Timesteps = {:d}".format((it - start_it) * mb_size)) print("Elapsed time = {:.2f} sec".format(n_sec)) print("FPS = {:d}".format(fps)) print("actor loss = {:.6f}".format(pg_loss)) print("critic loss = {:.6f}".format(v_loss)) print("entropy = {:.6f}".format(ent)) print("mean return = {:.6f}".format(mean_return)) print("mean length = {:.2f}".format(mean_len)) print() #Save model if it % save_step == 0: print("Saving the model ... ", end="") torch.save( { "it": it, "PolicyNet": policy_net.state_dict(), "ValueNet": value_net.state_dict() }, os.path.join(save_dir, "{}.pt".format(args.env))) print("Done.") print() env.close()
class A2CAgent:
    def __init__(self, config):
        self.config = config
        # Replay memory
        self.replay_memory = deque(maxlen=self.config.n_replay_memory)
        # Create the policy network
        self.actor = PolicyNet(self.config.n_state, self.config.n_action)
        self.actor.to(device)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=self.config.actor_lr)
        # Create the value network
        self.critic = ValueNet(self.config.n_state, 1)
        self.critic.to(device)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=self.config.critic_lr)

    # Sample an action stochastically from the policy network's output
    def get_action(self, state):
        state = torch.tensor(state, dtype=torch.float).to(device)
        policy = self.actor(state)
        policy = policy.detach().cpu().numpy()[0]
        return np.random.choice(self.config.n_action, 1, p=policy)[0]

    # Append a transition to the history
    def append_replay(self, state, action, reward, next_state):
        act = np.zeros(self.config.n_action)
        act[action] = 1
        self.replay_memory.append((state, act, reward, next_state))

    # Compute the discounted returns
    def get_returns(self, rewards, done, next_value):
        returns = torch.zeros(len(rewards), dtype=torch.float).to(self.config.device)
        R = 0 if done else next_value
        for i in reversed(range(0, len(rewards))):
            R = rewards[i] + self.config.discount_factor * R
            returns[i] = R
        return returns

    # Update the policy and value networks at each time step
    def train_model(self, done):
        # Arrange the history as arrays
        replay_memory = np.array(self.replay_memory)
        self.replay_memory.clear()

        states = np.vstack(replay_memory[:, 0])
        actions = list(replay_memory[:, 1])
        rewards = list(replay_memory[:, 2])
        next_states = list(replay_memory[:, 3])

        states = torch.tensor(states, dtype=torch.float).to(self.config.device)
        actions = torch.tensor(actions, dtype=torch.float).to(self.config.device)
        next_states = torch.tensor(next_states, dtype=torch.float).to(self.config.device)

        next_values = self.critic(next_states).view(-1)
        # Compute the returns
        returns = self.get_returns(rewards, done, next_values[-1])
        values = self.critic(states).view(-1)

        # Train the value network
        critic_loss = self.train_critic(values, returns)
        # Train the policy network
        actor_loss = self.train_actor(states, actions, returns - values)
        return actor_loss, critic_loss

    # Update the policy network
    def train_actor(self, states, actions, advantages):
        policy = self.actor(states)
        action_prob = torch.sum(actions * policy, dim=1)
        cross_entropy = torch.log(action_prob + 1.e-7) * advantages.detach()
        actor_loss = -torch.mean(cross_entropy)

        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()
        return actor_loss.item()

    # Update the value network
    def train_critic(self, values, targets):
        critic_loss = torch.mean(torch.pow(targets - values, 2))

        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()
        return critic_loss.item()

    # Save the model weights to files
    def save(self):
        torch.save(self.actor.state_dict(), self.config.save_file + ".actor")
        torch.save(self.critic.state_dict(), self.config.save_file + ".critic")

    # Load the model weights from files
    def load(self):
        self.actor.load_state_dict(torch.load(self.config.save_file + ".actor"))
        self.critic.load_state_dict(torch.load(self.config.save_file + ".critic"))

    # Release GPU memory
    def close(self):
        del self.actor
        del self.critic
input_size = 4
hidden_size = 128
num_layers = 2
dropout = 0.85
seq_len = 1024
state_size = 3
num_actions = 3
act_lim = 1
batch_size = 32

######################################################
encoder = Encoder(input_size, batch_size, hidden_size, num_layers, dropout)
policy_net = PolicyNet(state_size, num_actions, act_lim, batch_size, hidden_size)
print("Encoder network: ", encoder)
print("Policy network: ", policy_net)
print()

# Test encoder
test_input = torch.randn((batch_size, seq_len, input_size))
print("test_input shape: ", test_input.shape)
encoding = encoder(test_input)
print("encoding shape: ", encoding.shape)

# Test Policy Net
# One step forward propagation
state = torch.randn((batch_size, state_size))
# Turn on pyplot's interactive mode
# VERY IMPORTANT because otherwise the training stats plot will halt
plt.ion()

# Create OpenAI gym environment
env = gym.make(env_name)
if is_unwrapped:
    env = env.unwrapped

# Get device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Current usable device is: ", device)

# Create the model
policy_net = PolicyNet(layer_sizes, action_lim).to(device)  # Policy network
value_net = ValueNet(input_size).to(device)                 # Value network

# Set up memory
memory = Memory(capacity, device)

# Set up optimizer
policynet_optimizer = optim.Adam(policy_net.parameters(), lr=policy_lr)
valuenet_optimizer = optim.Adam(value_net.parameters(), lr=valuenet_lr)

###################################################################
# Start training

# Dictionary for extra training information to save to checkpoints
training_info = {
    "epoch mean durations": [],
def main(): #Parse arguments #---------------------------- parser = argparse.ArgumentParser() parser.add_argument("--env", default="CartPole-v0") parser.add_argument("--conti", action="store_true") parser.add_argument("--render", action="store_true") parser.add_argument("--unwrap", action="store_true") parser.add_argument("--episode", default=1000) args = parser.parse_args() #Parameters #---------------------------- env_id = args.env save_dir = "./save" device = "cuda:0" n_episode = args.episode #Create environment #---------------------------- env = gym.make(env_id) if args.conti: s_dim = env.observation_space.shape[0] a_dim = env.action_space.shape[0] else: s_dim = env.observation_space.shape[0] a_dim = env.action_space.n if args.unwrap: env = env.unwrapped #Create model #---------------------------- policy_net = PolicyNet(s_dim, a_dim, conti=args.conti).to(device) #Load model #---------------------------- if os.path.exists(os.path.join(save_dir, "{}.pt".format(env_id))): print("Loading the model ... ", end="") checkpoint = torch.load(os.path.join(save_dir, "{}.pt".format(env_id))) policy_net.load_state_dict(checkpoint["PolicyNet"]) print("Done.") else: print("Error: No model saved") #Start playing #---------------------------- policy_net.eval() s_traj = [] a_traj = [] for i_episode in range(n_episode): ob = env.reset() ret = 0 s_traj.append([]) a_traj.append([]) while True: if args.render: env.render() action = policy_net.action_step(torch.FloatTensor(np.expand_dims(ob, axis=0)).to(device), deterministic=True) action = action.cpu().detach().numpy()[0] s_traj[i_episode].append(ob) a_traj[i_episode].append(action) ob, reward, done, info = env.step(action) ret += reward if done: s_traj[i_episode] = np.array(s_traj[i_episode], dtype=np.float32) if args.conti: a_traj[i_episode] = np.array(a_traj[i_episode], dtype=np.float32) else: a_traj[i_episode] = np.array(a_traj[i_episode], dtype=np.int32) print("{:d}: return = {:.4f}, len = {:d}".format(i_episode, ret, len(s_traj[i_episode]))) break #s_traj: (n_episode, timesteps, s_dim) #a_traj: (n_episode, timesteps, a_dim) or (n_episode, timesteps) print("Saving the trajectories ... ", end="") pkl.dump((s_traj, a_traj), open(os.path.join(save_dir, "{}_traj.pkl".format(env_id)), "wb")) print("Done.") env.close()
def main():
    env = FrozenSeaEnv(H, W, P, 'uniform', SEED)
    net = PolicyNet(env.get_obs_size(), env.get_n_actions())
    print(net)
    test_play(env, net, 10)
class Trainer:
    actions = {
        'j': 0, 't': 1, 'by': 2, 'b': 3, 'bt': 4, 'fs': 5, 'fm': 6,
        'fg': 7, 'fk': 8, 's': 9, 'gs': 10, 'mj': 11, 'k': 12, 'xd': 13
    }  # maps each action name to its index

    def __init__(self, model_file=None):
        self.model_dir = "models/"
        if not model_file:
            model_file = self.find_latest_model("model")
        if model_file == "init":
            self.net = PolicyNet()
        else:
            self.net = PolicyNet(model_file)
            print("using model from", model_file)
        self.data = []
        self.data_buffer = [[], []]
        self.old_net = None
        self.best_net = self.net

    def find_latest_model(self, prefix):
        biggest_timestamp = 0
        latest_model = ""
        for filename in os.listdir(self.model_dir):
            pat = prefix + r"(.+)\.h5$"
            match = re.match(pat, filename)
            if match:
                timestamp = int(match.group(1))
                if timestamp > biggest_timestamp:
                    biggest_timestamp = timestamp
                    latest_model = filename
        return self.model_dir + latest_model if latest_model else None

    @staticmethod
    def one_hot(action):
        p = Player(0)
        coded_action = [0 for i in range(14)]
        coded_action[list(p.all_actions).index(action)] = 1
        return coded_action

    @staticmethod
    def abstract_state(game: Game, player: int):
        """
        jue(10) ta(5) bingYing(bool) bing(10) baoTou(5) defended_rush(5)
        then the last three steps of each player, one-hot coded, following the order
        "my resources, his resources, my steps, his steps"
        """
        state = []
        for i in [player, 1 - player]:  # own state first, then the opponent's
            p = game.players[i]
            state += [
                p.jue / 10, p.tower / 5, int(p.camp),
                p.soldier / 10, p.baotou / 5, p.defended_rush / 5
            ]
        for i in [player, 1 - player]:
            his = game.histories[i]
            processed_his = [his[0]] * 2 + his
            for action in processed_his[-3:]:
                state += Trainer.one_hot(action)
        return state

    def self_play(self):
        game = Game(2)
        game.do(0, "j")
        game.do(1, "j")
        game.settle()
        cache = [[[], []], [[], []]]
        while True:
            for i in range(2):
                # p0, p1 = game.players[0], game.players[1]
                act, state = self.choose_action(game, i, self.net)
                game.do(i, act, [1 - i])
                cache[i][0].append(state)
                cache[i][1].append(self.one_hot(act))
            game.settle()
            if len(game.players) == 1:
                winner = list(game.players.keys())[0]
                break
        self.data += cache[winner]
        self.data_buffer[0] += cache[winner][0]
        self.data_buffer[1] += cache[winner][1]

    def progress(self, data_size=512):
        print("progress")
        while len(self.data_buffer[0]) < data_size:
            self.self_play()
        self.old_net = self.net.copy()
        self.net.train_step(*self.data_buffer)
        self.data_buffer = [[], []]
        self.evaluate(self.old_net, self.net, 1, True)
        return self.evaluate(self.old_net, self.net, 20)

    def evaluate(self, old_net, new_net, count=100, verbose=0):
        winners = [0, 0]
        for t in range(count):
            game = Game(2)
            game.do(0, "j")
            game.do(1, "j")
            game.settle()
            while True:
                for i in range(2):
                    # p0, p1 = game.players[0], game.players[1]
                    net = [old_net, new_net][i]
                    act, _ = self.choose_action(game, i, net)
                    game.do(i, act, [1 - i])
                    if verbose:
                        print(i, act)
                game.settle()
                if verbose:
                    print(game.info())
                if len(game.players) == 1:
                    winner = list(game.players.keys())[0]
                    winners[winner] += 1
                    break
        if sum(winners) != count:
            raise ValueError
        return winners[1] / sum(winners), winners

    @staticmethod
    def choose_action(game, i, net):
        p = game.players[i]
        state = Trainer.abstract_state(game, i)
        actions = net.predict(state, noise=0.25)
        # choose the available action with the highest predicted value
        max_val = 0
        act = None
        available = p.aActions()
        for action in available:
            val = actions[Trainer.actions[action]]
            if val > max_val:
                max_val = val
                act = action
        return act, state

    def main(self):
        count = 0
        while True:
            result = self.progress(2048)
            print(result)
            count += 1
            if count % 5 == 0:
                self.net.save_model()
class PGAgent:
    def __init__(self, config):
        self.config = config
        # Replay memory
        self.replay_memory = deque(maxlen=self.config.n_replay_memory)
        # Create the policy network
        self.model = PolicyNet(self.config.n_state, self.config.n_action)
        self.model.to(device)
        self.model_optimizer = torch.optim.Adam(self.model.parameters(), lr=self.config.learning_rate)

    # Sample an action stochastically from the policy network's output
    def get_action(self, state):
        state = torch.tensor(state, dtype=torch.float).to(device)
        policy = self.model(state)
        policy = policy.detach().cpu().numpy()[0]
        return np.random.choice(self.config.n_action, 1, p=policy)[0]

    # Append a transition to the history
    def append_replay(self, state, action, reward, next_state):
        act = np.zeros(self.config.n_action)
        act[action] = 1
        self.replay_memory.append((state, act, reward, next_state))

    # Compute the discounted (and normalized) returns
    def get_returns(self, rewards):
        returns = torch.zeros(len(rewards), dtype=torch.float).to(self.config.device)
        R = 0
        for i in reversed(range(0, len(rewards))):
            R = rewards[i] + self.config.discount_factor * R
            returns[i] = R
        if 1 < len(returns):
            returns -= torch.mean(returns)
            returns /= (torch.std(returns) + 1.e-7)
        return returns

    # Update the policy network once per episode
    def train_model(self):
        # Arrange the history as arrays
        replay_memory = np.array(self.replay_memory)
        self.replay_memory.clear()

        states = np.vstack(replay_memory[:, 0])
        actions = list(replay_memory[:, 1])
        rewards = list(replay_memory[:, 2])
        next_states = list(replay_memory[:, 3])

        states = torch.tensor(states, dtype=torch.float).to(self.config.device)
        actions = torch.tensor(actions, dtype=torch.float).to(self.config.device)

        # Compute the returns
        returns = self.get_returns(rewards)

        loss = self.train_policy(states, actions, returns)
        return loss

    # Update the policy network
    def train_policy(self, states, actions, returns):
        policy = self.model(states)
        action_prob = torch.sum(actions * policy, dim=1)
        cross_entropy = torch.log(action_prob + 1.e-7) * returns
        loss = -torch.mean(cross_entropy)

        self.model_optimizer.zero_grad()
        loss.backward()
        self.model_optimizer.step()
        return loss.item()

    # Save the model weights to a file
    def save(self):
        torch.save(self.model.state_dict(), self.config.save_file)

    # Load the model weights from a file
    def load(self):
        self.model.load_state_dict(torch.load(self.config.save_file))

    # Release GPU memory
    def close(self):
        del self.model
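# A minimal REINFORCE training-loop sketch for the PGAgent above, assuming a
# discrete-action gym environment and a config object with the fields referenced in
# the class. gym.make("CartPole-v1") and n_episode are hypothetical placeholders,
# not taken from the original code.
import gym

def train(config, n_episode=1000):
    env = gym.make("CartPole-v1")
    agent = PGAgent(config)

    for episode in range(n_episode):
        state = env.reset()
        done = False
        score = 0
        while not done:
            action = agent.get_action([state])
            next_state, reward, done, _ = env.step(action)
            agent.append_replay(state, action, reward, next_state)
            state = next_state
            score += reward
        # Monte Carlo update once the episode has finished
        loss = agent.train_model()
        print("episode {:4d} | score {:6.1f} | loss {:.4f}".format(episode, score, loss))

    agent.save()
    agent.close()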
# Turn on pyplot's interactive mode
# VERY IMPORTANT because otherwise the training stats plot will halt
plt.ion()

# Create OpenAI gym environment
env = gym.make(env_name)
if is_unwrapped:
    env = env.unwrapped

# Get device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Current usable device is: ", device)

# Create the model
policy_net = PolicyNet(layer_sizes).to(device)

# Set up optimizer - Minimal
optimizer = optim.Adam(policy_net.parameters())
# optimizer = optim.SGD(policy_net.parameters(), lr=learning_rate)

###################################################################
# Start training

# Dictionary for extra training information to save to checkpoints
training_info = {
    "epoch mean durations": [],
    "epoch mean rewards": [],
    "max reward achieved": 0,
    "past %d epochs mean reward" % (num_avg_epoch): 0,
}

# Batch that records trajectories
class SAC:
    def __init__(self, env, gamma, tau, buffer_maxlen, value_lr, q_lr, policy_lr):
        self.env = env
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]
        self.action_range = [env.action_space.low, env.action_space.high]

        # hyperparameters
        self.gamma = gamma
        self.tau = tau

        # initialize networks
        self.value_net = ValueNet(self.state_dim).to(device)
        self.target_value_net = ValueNet(self.state_dim).to(device)
        self.q1_net = SoftQNet(self.state_dim, self.action_dim).to(device)
        self.q2_net = SoftQNet(self.state_dim, self.action_dim).to(device)
        self.policy_net = PolicyNet(self.state_dim, self.action_dim).to(device)

        # Load the target value network parameters
        for target_param, param in zip(self.target_value_net.parameters(), self.value_net.parameters()):
            target_param.data.copy_(self.tau * param + (1 - self.tau) * target_param)

        # Initialize the optimizers
        self.value_optimizer = optim.Adam(self.value_net.parameters(), lr=value_lr)
        self.q1_optimizer = optim.Adam(self.q1_net.parameters(), lr=q_lr)
        self.q2_optimizer = optim.Adam(self.q2_net.parameters(), lr=q_lr)
        self.policy_optimizer = optim.Adam(self.policy_net.parameters(), lr=policy_lr)

        # Initialize the replay buffer
        self.buffer = ReplayBeffer(buffer_maxlen)

    def get_action(self, state):
        action = self.policy_net.action(state)
        # Rescale the squashed action to the environment's action range
        action = action * (self.action_range[1] - self.action_range[0]) / 2.0 + \
                 (self.action_range[1] + self.action_range[0]) / 2.0
        return action

    def update(self, batch_size):
        state, action, reward, next_state, done = self.buffer.sample(batch_size)
        new_action, log_prob = self.policy_net.evaluate(state)

        # V value loss
        value = self.value_net(state)
        new_q1_value = self.q1_net(state, new_action)
        new_q2_value = self.q2_net(state, new_action)
        next_value = torch.min(new_q1_value, new_q2_value) - log_prob
        value_loss = F.mse_loss(value, next_value.detach())

        # Soft q loss
        q1_value = self.q1_net(state, action)
        q2_value = self.q2_net(state, action)
        target_value = self.target_value_net(next_state)
        target_q_value = reward + done * self.gamma * target_value
        q1_value_loss = F.mse_loss(q1_value, target_q_value.detach())
        q2_value_loss = F.mse_loss(q2_value, target_q_value.detach())

        # Policy loss
        policy_loss = (log_prob - torch.min(new_q1_value, new_q2_value)).mean()

        # Update v
        self.value_optimizer.zero_grad()
        value_loss.backward()
        self.value_optimizer.step()

        # Update Soft q
        self.q1_optimizer.zero_grad()
        self.q2_optimizer.zero_grad()
        q1_value_loss.backward()
        q2_value_loss.backward()
        self.q1_optimizer.step()
        self.q2_optimizer.step()

        # Update Policy
        self.policy_optimizer.zero_grad()
        policy_loss.backward()
        self.policy_optimizer.step()

        # Update target networks (Polyak averaging with coefficient tau)
        for target_param, param in zip(self.target_value_net.parameters(), self.value_net.parameters()):
            target_param.data.copy_(self.tau * param + (1 - self.tau) * target_param)
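# A minimal training-loop sketch for the SAC class above. Only sample() appears on
# the replay buffer in the class, so the push() call and its
# (state, action, reward, next_state, done) layout are assumptions; the environment
# name, max_episode, and batch_size are hypothetical placeholders as well.
import gym

def train_sac(max_episode=200, batch_size=128):
    env = gym.make("Pendulum-v0")
    agent = SAC(env, gamma=0.99, tau=0.01, buffer_maxlen=100000,
                value_lr=3e-3, q_lr=3e-3, policy_lr=3e-3)

    total_steps = 0
    for episode in range(max_episode):
        state = env.reset()
        episode_reward = 0
        done = False
        while not done:
            action = agent.get_action(state)
            next_state, reward, done, _ = env.step(action)
            agent.buffer.push((state, action, reward, next_state, done))  # hypothetical API
            state = next_state
            episode_reward += reward
            total_steps += 1
            # Start learning once enough transitions have been collected
            if total_steps > batch_size:
                agent.update(batch_size)
        print("episode {:3d} | reward {:8.2f}".format(episode, episode_reward))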