def load_checkpoint(file_dir, i_epoch, layer_sizes, input_size, device='cuda'):
    checkpoint = torch.load(os.path.join(file_dir, "ckpt_eps%d.pt" % i_epoch), map_location=device)

    policy_net = PolicyNet(layer_sizes).to(device)
    value_net = ValueNet(input_size).to(device)

    policy_net.load_state_dict(checkpoint["policy_net"])
    policy_net.train()
    value_net.load_state_dict(checkpoint["value_net"])
    value_net.train()

    policy_lr = checkpoint["policy_lr"]
    valuenet_lr = checkpoint["valuenet_lr"]

    policynet_optim = optim.Adam(policy_net.parameters(), lr=policy_lr)
    policynet_optim.load_state_dict(checkpoint["policynet_optim"])
    valuenet_optim = optim.Adam(value_net.parameters(), lr=valuenet_lr)
    valuenet_optim.load_state_dict(checkpoint["valuenet_optim"])

    checkpoint.pop("policy_net")
    checkpoint.pop("value_net")
    checkpoint.pop("policynet_optim")
    checkpoint.pop("valuenet_optim")
    checkpoint.pop("i_epoch")
    checkpoint.pop("policy_lr")
    checkpoint.pop("valuenet_lr")

    return policy_net, value_net, policynet_optim, valuenet_optim, checkpoint
def load_checkpoint(file_dir, i_epoch, layer_sizes, device='cuda'):
    checkpoint = torch.load(os.path.join(file_dir, "ckpt_eps%d.pt" % i_epoch))

    policy_net = PolicyNet(layer_sizes).to(device)
    policy_net.load_state_dict(checkpoint["policy_net"])
    policy_net.train()

    learning_rate = checkpoint["learning_rate"]

    optimizer = optim.Adam(policy_net.parameters())
    # optimizer = optim.SGD(policy_net.parameters(), lr=learning_rate)
    optimizer.load_state_dict(checkpoint["optimizer"])

    checkpoint.pop("policy_net")
    checkpoint.pop("optimizer")
    checkpoint.pop("i_epoch")
    checkpoint.pop("learning_rate")

    return policy_net, optimizer, checkpoint
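# A minimal usage sketch for the single-network load_checkpoint variant above,
# assuming training is being resumed. The checkpoint directory, epoch index,
# and layer sizes are hypothetical values, not taken from the original script.
import torch

resume_device = 'cuda' if torch.cuda.is_available() else 'cpu'
policy_net, optimizer, extra_info = load_checkpoint(
    file_dir="./checkpoints",    # hypothetical directory holding ckpt_eps*.pt files
    i_epoch=100,                 # hypothetical epoch to resume from
    layer_sizes=[4, 128, 2],     # hypothetical PolicyNet layout
    device=resume_device)
# Anything left in the checkpoint dict (e.g. saved training statistics) survives the pops
print("Extra checkpoint entries:", list(extra_info.keys()))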
plt.ion()

# Create OpenAI gym environment
env = gym.make(env_name)
if is_unwrapped:
    env = env.unwrapped

# Get device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Current usable device is: ", device)

# Create the model
policy_net = PolicyNet(layer_sizes).to(device)

# Set up optimizer - Minimal
optimizer = optim.Adam(policy_net.parameters())
# optimizer = optim.SGD(policy_net.parameters(), lr=learning_rate)

###################################################################
# Start training

# Dictionary for extra training information to save to checkpoints
training_info = {"epoch mean durations": [],
                 "epoch mean rewards": [],
                 "max reward achieved": 0,
                 "past %d epochs mean reward" % (num_avg_epoch): 0, }

# Batch that records trajectories
batch_log_prob = []
batch_rewards = []
class A3CGlobal:
    def __init__(self, config):
        self.config = config

        # Create the policy network (actor)
        self.actor = PolicyNet(self.config.n_state, self.config.n_action)
        self.actor.to(device)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=self.config.actor_lr)

        # Create the value network (critic)
        self.critic = ValueNet(self.config.n_state, 1)
        self.critic.to(device)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=self.config.critic_lr)

    # Compute discounted returns
    def get_returns(self, rewards, done, next_value):
        returns = torch.zeros(len(rewards), dtype=torch.float).to(self.config.device)
        R = 0 if done else next_value
        for i in reversed(range(0, len(rewards))):
            R = rewards[i] + self.config.discount_factor * R
            returns[i] = R
        return returns

    # Update the policy and value networks at every time step
    def train_model(self, states, actions, rewards, next_states, done):
        states = torch.tensor(states, dtype=torch.float).to(self.config.device)
        actions = torch.tensor(actions, dtype=torch.float).to(self.config.device)
        next_states = torch.tensor(next_states, dtype=torch.float).to(self.config.device)

        next_values = self.critic(next_states).view(-1)
        # Compute discounted returns
        returns = self.get_returns(rewards, done, next_values[-1])
        values = self.critic(states).view(-1)

        # Train the value network
        critic_loss = self.train_critic(values, returns)
        # Train the policy network
        actor_loss = self.train_actor(states, actions, returns - values)

        return actor_loss, critic_loss

    # Update the policy network
    def train_actor(self, states, actions, advantages):
        policy = self.actor(states)
        action_prob = torch.sum(actions * policy, dim=1)
        cross_entropy = torch.log(action_prob + 1.e-7) * advantages.detach()
        actor_loss = -torch.mean(cross_entropy)

        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        return actor_loss.item()

    # Update the value network
    def train_critic(self, values, targets):
        critic_loss = torch.mean(torch.pow(targets - values, 2))

        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        return critic_loss.item()

    # Release GPU memory
    def close(self):
        del self.actor
        del self.critic
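# A sketch of how a single worker update on A3CGlobal might look, using a fake
# three-step rollout. The SimpleNamespace config and the transition shapes are
# assumptions; PolicyNet (softmax output) and ValueNet are the networks defined
# elsewhere in this code.
import numpy as np
import torch
from types import SimpleNamespace

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
config = SimpleNamespace(n_state=4, n_action=2, actor_lr=1e-4, critic_lr=1e-3,
                         discount_factor=0.99, device=device)
agent = A3CGlobal(config)

# Fake rollout: states, one-hot actions (as train_actor expects), rewards
states = np.random.randn(3, config.n_state).astype(np.float32)
next_states = np.random.randn(3, config.n_state).astype(np.float32)
actions = np.eye(config.n_action, dtype=np.float32)[[0, 1, 0]]
rewards = [1.0, 1.0, 1.0]

actor_loss, critic_loss = agent.train_model(states, actions, rewards, next_states, done=False)
print("actor loss:", actor_loss, "critic loss:", critic_loss)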
env = gym.make("CartPole-v1") # observation = env.reset() # print(observation) # print(env.observation_space) MAXSTEP = 100 BATCHSIZE = 16 EPOCH = 1000 GAMMA = 0.99 policy_net = PolicyNet() value_net = ValueNet() policy_net.cuda() value_net.cuda() opt1 = optim.Adam(policy_net.parameters(), lr=1e-3) opt2 = optim.Adam(value_net.parameters(), lr=1e-3) # train one epoch def train_step(): observ_batch = [] reward_batch = [] action_batch = [] mask_batch = [] policy_net.cpu() value_net.cpu() for _ in range(BATCHSIZE): observ = []
class PGAgent:
    def __init__(self, config):
        self.config = config

        # replay memory
        self.replay_memory = deque(maxlen=self.config.n_replay_memory)

        # Create the policy network
        self.model = PolicyNet(self.config.n_state, self.config.n_action)
        self.model.to(device)
        self.model_optimizer = torch.optim.Adam(self.model.parameters(), lr=self.config.learning_rate)

    # Sample an action stochastically from the policy network output
    def get_action(self, state):
        state = torch.tensor(state, dtype=torch.float).to(device)
        policy = self.model(state)
        policy = policy.detach().cpu().numpy()[0]
        return np.random.choice(self.config.n_action, 1, p=policy)[0]

    # Append a transition to the replay memory
    def append_replay(self, state, action, reward, next_state):
        act = np.zeros(self.config.n_action)
        act[action] = 1
        self.replay_memory.append((state, act, reward, next_state))

    # Compute discounted returns
    def get_returns(self, rewards):
        returns = torch.zeros(len(rewards), dtype=torch.float).to(self.config.device)
        R = 0
        for i in reversed(range(0, len(rewards))):
            R = rewards[i] + self.config.discount_factor * R
            returns[i] = R
        # Normalize the returns for more stable updates
        if 1 < len(returns):
            returns -= torch.mean(returns)
            returns /= (torch.std(returns) + 1.e-7)
        return returns

    # Update the policy network from the accumulated history
    def train_model(self):
        # Convert the stored history into arrays
        replay_memory = np.array(self.replay_memory)
        self.replay_memory.clear()

        states = np.vstack(replay_memory[:, 0])
        actions = list(replay_memory[:, 1])
        rewards = list(replay_memory[:, 2])
        next_states = list(replay_memory[:, 3])

        states = torch.tensor(states, dtype=torch.float).to(self.config.device)
        actions = torch.tensor(actions, dtype=torch.float).to(self.config.device)

        # Compute discounted returns
        returns = self.get_returns(rewards)

        loss = self.train_policy(states, actions, returns)
        return loss

    # Update the policy network
    def train_policy(self, states, actions, returns):
        policy = self.model(states)
        action_prob = torch.sum(actions * policy, dim=1)
        cross_entropy = torch.log(action_prob + 1.e-7) * returns
        loss = -torch.mean(cross_entropy)

        self.model_optimizer.zero_grad()
        loss.backward()
        self.model_optimizer.step()

        return loss.item()

    # Save the model weights to a file
    def save(self):
        torch.save(self.model.state_dict(), self.config.save_file)

    # Load the model weights from a file
    def load(self):
        self.model.load_state_dict(torch.load(self.config.save_file))

    # Release GPU memory
    def close(self):
        del self.model
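# A sketch of an episode loop driving PGAgent on CartPole. The config fields
# mirror the attributes the class reads; the values (and the classic gym API
# with a 4-tuple step return) are assumptions.
import gym
import numpy as np
import torch
from types import SimpleNamespace

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
config = SimpleNamespace(n_state=4, n_action=2, n_replay_memory=10000,
                         learning_rate=1e-3, discount_factor=0.99,
                         device=device, save_file="./pg_cartpole.pt")  # hypothetical save path
env = gym.make("CartPole-v1")
agent = PGAgent(config)

for episode in range(10):                                # a few episodes for the sketch
    state = np.reshape(env.reset(), [1, config.n_state])
    done = False
    while not done:
        action = agent.get_action(state)
        next_state, reward, done, _ = env.step(action)   # classic gym API assumed
        next_state = np.reshape(next_state, [1, config.n_state])
        agent.append_replay(state, action, reward, next_state)
        state = next_state
    loss = agent.train_model()                           # one REINFORCE update per episode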
def main():
    # Parse arguments
    #----------------------------
    parser = argparse.ArgumentParser()
    parser.add_argument("--env", default="CartPole-v0")
    parser.add_argument("--conti", action="store_true")
    args = parser.parse_args()

    # Parameters
    #----------------------------
    env_id = args.env
    mb_size = 256
    lr = 1e-5
    n_iter = 100000
    disp_step = 1000
    save_step = 10000
    save_dir = "./save"
    device = "cuda:0"
    expert_path = "../save/{}_traj.pkl".format(args.env)

    # Create environment
    #----------------------------
    env = gym.make(env_id)

    if args.conti:
        s_dim = env.observation_space.shape[0]
        a_dim = env.action_space.shape[0]
    else:
        s_dim = env.observation_space.shape[0]
        a_dim = env.action_space.n

    # Load expert trajectories
    #----------------------------
    if os.path.exists(expert_path):
        s_traj, a_traj = pkl.load(open(expert_path, "rb"))
        s_traj = np.concatenate(s_traj, 0)
        a_traj = np.concatenate(a_traj, 0)
    else:
        print("ERROR: No expert trajectory file found")
        sys.exit(1)

    # Create model
    #----------------------------
    policy_net = PolicyNet(s_dim, a_dim, conti=args.conti).to(device)
    opt = torch.optim.Adam(policy_net.parameters(), lr)

    # Load model
    #----------------------------
    if not os.path.exists(save_dir):
        os.mkdir(save_dir)

    if os.path.exists(os.path.join(save_dir, "{}.pt".format(args.env))):
        print("Loading the model ... ", end="")
        checkpoint = torch.load(os.path.join(save_dir, "{}.pt".format(args.env)))
        policy_net.load_state_dict(checkpoint["PolicyNet"])
        start_it = checkpoint["it"]
        print("Done.")
    else:
        start_it = 0

    # Start training
    #----------------------------
    t_start = time.time()
    policy_net.train()

    for it in range(start_it, n_iter + 1):
        # Train
        mb_obs, mb_actions = sample_batch(s_traj, a_traj, mb_size)
        mb_a_logps, mb_ents = policy_net.evaluate(
            torch.from_numpy(mb_obs).to(device),
            torch.from_numpy(mb_actions).to(device))
        loss = -mb_a_logps.mean()

        opt.zero_grad()
        loss.backward()
        opt.step()

        # Print the result
        if it % disp_step == 0:
            print("[{:5d} / {:5d}] Elapsed time = {:.2f}, actor loss = {:.6f}".format(
                it, n_iter, time.time() - t_start, loss.item()))

        # Save model
        if it % save_step == 0:
            print("Saving the model ... ", end="")
            torch.save({
                "it": it,
                "PolicyNet": policy_net.state_dict()
            }, os.path.join(save_dir, "{}.pt".format(args.env)))
            print("Done.")

    print()
    env.close()
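# The behaviour-cloning loop above calls a sample_batch helper that is not shown
# in this file. A minimal sketch of what it might look like, assuming s_traj and
# a_traj are the concatenated expert arrays built in main():
import numpy as np

def sample_batch(s_traj, a_traj, mb_size):
    # Draw a random minibatch of expert (state, action) pairs with replacement
    idx = np.random.randint(0, len(s_traj), size=mb_size)
    return s_traj[idx], a_traj[idx]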
class A2CAgent:
    def __init__(self, config):
        self.config = config

        # replay memory
        self.replay_memory = deque(maxlen=self.config.n_replay_memory)

        # Create the policy network (actor)
        self.actor = PolicyNet(self.config.n_state, self.config.n_action)
        self.actor.to(device)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=self.config.actor_lr)

        # Create the value network (critic)
        self.critic = ValueNet(self.config.n_state, 1)
        self.critic.to(device)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=self.config.critic_lr)

    # Sample an action stochastically from the policy network output
    def get_action(self, state):
        state = torch.tensor(state, dtype=torch.float).to(device)
        policy = self.actor(state)
        policy = policy.detach().cpu().numpy()[0]
        return np.random.choice(self.config.n_action, 1, p=policy)[0]

    # Append a transition to the replay memory
    def append_replay(self, state, action, reward, next_state):
        act = np.zeros(self.config.n_action)
        act[action] = 1
        self.replay_memory.append((state, act, reward, next_state))

    # Compute discounted returns
    def get_returns(self, rewards, done, next_value):
        returns = torch.zeros(len(rewards), dtype=torch.float).to(self.config.device)
        R = 0 if done else next_value
        for i in reversed(range(0, len(rewards))):
            R = rewards[i] + self.config.discount_factor * R
            returns[i] = R
        return returns

    # Update the policy and value networks at every time step
    def train_model(self, done):
        # Convert the stored history into arrays
        replay_memory = np.array(self.replay_memory)
        self.replay_memory.clear()

        states = np.vstack(replay_memory[:, 0])
        actions = list(replay_memory[:, 1])
        rewards = list(replay_memory[:, 2])
        next_states = list(replay_memory[:, 3])

        states = torch.tensor(states, dtype=torch.float).to(self.config.device)
        actions = torch.tensor(actions, dtype=torch.float).to(self.config.device)
        next_states = torch.tensor(next_states, dtype=torch.float).to(self.config.device)

        next_values = self.critic(next_states).view(-1)
        # Compute discounted returns
        returns = self.get_returns(rewards, done, next_values[-1])
        values = self.critic(states).view(-1)

        # Train the value network
        critic_loss = self.train_critic(values, returns)
        # Train the policy network
        actor_loss = self.train_actor(states, actions, returns - values)

        return actor_loss, critic_loss

    # Update the policy network
    def train_actor(self, states, actions, advantages):
        policy = self.actor(states)
        action_prob = torch.sum(actions * policy, dim=1)
        cross_entropy = torch.log(action_prob + 1.e-7) * advantages.detach()
        actor_loss = -torch.mean(cross_entropy)

        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        return actor_loss.item()

    # Update the value network
    def train_critic(self, values, targets):
        critic_loss = torch.mean(torch.pow(targets - values, 2))

        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        return critic_loss.item()

    # Save the model weights to files
    def save(self):
        torch.save(self.actor.state_dict(), self.config.save_file + ".actor")
        torch.save(self.critic.state_dict(), self.config.save_file + ".critic")

    # Load the model weights from files
    def load(self):
        self.actor.load_state_dict(torch.load(self.config.save_file + ".actor"))
        self.critic.load_state_dict(torch.load(self.config.save_file + ".critic"))

    # Release GPU memory
    def close(self):
        del self.actor
        del self.critic
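# A sketch of driving A2CAgent episode-by-episode on CartPole, bootstrapping
# from the final next_state. Config values and the classic gym API are assumptions.
import gym
import numpy as np
import torch
from types import SimpleNamespace

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
config = SimpleNamespace(n_state=4, n_action=2, n_replay_memory=10000,
                         actor_lr=1e-4, critic_lr=1e-3, discount_factor=0.99,
                         device=device, save_file="./a2c_cartpole")  # hypothetical save path
env = gym.make("CartPole-v1")
agent = A2CAgent(config)

for episode in range(10):
    state = np.reshape(env.reset(), [1, config.n_state])
    done = False
    while not done:
        action = agent.get_action(state)
        next_state, reward, done, _ = env.step(action)   # classic gym API assumed
        next_state = np.reshape(next_state, [1, config.n_state])
        agent.append_replay(state, action, reward, next_state)
        state = next_state
    actor_loss, critic_loss = agent.train_model(done)    # update once the episode ends
agent.save()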
writer = SummaryWriter("./log")

env = gym.make("Pong-v0")
MAXSTEP = 6
NWORKERS = 4
EPOCHSTEP = 4000 * 1024 // (MAXSTEP * BATCHSIZE * NWORKERS)  # around 4000 * 1024 frames = 1 epoch in the A3C paper
print("1 epoch contains {} steps".format(EPOCHSTEP))
NEPOCH = 100 * EPOCHSTEP
GAMMA = 0.99
NFRAMES = 4

policy_net = PolicyNet(NFRAMES)
policy_net.cuda()
policy_net.share_memory()  # make it store in shared memory
opt = optim.RMSprop(policy_net.parameters(), lr=5e-4, alpha=0.99, eps=1e-5)

samplers = [
    EnvSampler(env, policy_net, NFRAMES, MAXSTEP, GAMMA) for _ in range(NWORKERS)
]

global_step = 0

ctx = mp.get_context('spawn')
queue = ctx.Queue()
event = ctx.Event()
workers = []
for i in range(NWORKERS):
    worker = ctx.Process(target=sample, args=(samplers[i], queue, event),
if is_unwrapped:
    env = env.unwrapped

# Get device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Current usable device is: ", device)

# Create the model
policy_net = PolicyNet(layer_sizes, action_lim).to(device)  # Policy network
value_net = ValueNet(input_size).to(device)                 # Value network

# Set up memory
memory = Memory(capacity, device)

# Set up optimizer
policynet_optimizer = optim.Adam(policy_net.parameters(), lr=policy_lr)
valuenet_optimizer = optim.Adam(value_net.parameters(), lr=valuenet_lr)

###################################################################
# Start training

# Dictionary for extra training information to save to checkpoints
training_info = {
    "epoch mean durations": [],
    "epoch mean rewards": [],
    "max reward achieved": 0,
    "past %d epochs mean reward" % num_avg_epoch: 0,
    "value net loss": []
}

# Batch that records trajectories
env = env.unwrapped

# Get device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Current usable device is: ", device)

# Create the model
policy_net = PolicyNet(layer_sizes).to(device)  # Policy network
value_net = ValueNet(input_size).to(device)     # Value network

# Set up memory
memory = Memory(capacity, device)

# Set up optimizer
# policynet_optimizer = optim.Adam(policy_net.parameters(), lr=policy_lr)
policynet_optimizer = optim.RMSprop(policy_net.parameters(), lr=policy_lr)
valuenet_optimizer = optim.Adam(value_net.parameters(), lr=valuenet_lr)

###################################################################
# Start training

# Dictionary for extra training information to save to checkpoints
training_info = {
    "epoch mean durations": [],
    "epoch mean rewards": [],
    "max reward achieved": 0,
    "past %d epochs mean reward" % num_avg_epoch: 0,
    "value net loss": []
}

# Batch that records trajectories
class SAC:
    def __init__(self, env, gamma, tau, buffer_maxlen, value_lr, q_lr, policy_lr):

        self.env = env
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]
        self.action_range = [env.action_space.low, env.action_space.high]

        # hyperparameters
        self.gamma = gamma
        self.tau = tau

        # initialize networks
        self.value_net = ValueNet(self.state_dim).to(device)
        self.target_value_net = ValueNet(self.state_dim).to(device)
        self.q1_net = SoftQNet(self.state_dim, self.action_dim).to(device)
        self.q2_net = SoftQNet(self.state_dim, self.action_dim).to(device)
        self.policy_net = PolicyNet(self.state_dim, self.action_dim).to(device)

        # Load the target value network parameters
        for target_param, param in zip(self.target_value_net.parameters(), self.value_net.parameters()):
            target_param.data.copy_(self.tau * param + (1 - self.tau) * target_param)

        # Initialize the optimizers
        self.value_optimizer = optim.Adam(self.value_net.parameters(), lr=value_lr)
        self.q1_optimizer = optim.Adam(self.q1_net.parameters(), lr=q_lr)
        self.q2_optimizer = optim.Adam(self.q2_net.parameters(), lr=q_lr)
        self.policy_optimizer = optim.Adam(self.policy_net.parameters(), lr=policy_lr)

        # Initialize the replay buffer
        self.buffer = ReplayBeffer(buffer_maxlen)

    def get_action(self, state):
        action = self.policy_net.action(state)
        # Rescale from [-1, 1] to the environment's action range
        action = action * (self.action_range[1] - self.action_range[0]) / 2.0 + \
                 (self.action_range[1] + self.action_range[0]) / 2.0

        return action

    def update(self, batch_size):
        state, action, reward, next_state, done = self.buffer.sample(batch_size)
        new_action, log_prob = self.policy_net.evaluate(state)

        # V value loss
        value = self.value_net(state)
        new_q1_value = self.q1_net(state, new_action)
        new_q2_value = self.q2_net(state, new_action)
        next_value = torch.min(new_q1_value, new_q2_value) - log_prob
        value_loss = F.mse_loss(value, next_value.detach())

        # Soft q loss (note: `done` is used as a continuation mask in this target)
        q1_value = self.q1_net(state, action)
        q2_value = self.q2_net(state, action)
        target_value = self.target_value_net(next_state)
        target_q_value = reward + done * self.gamma * target_value
        q1_value_loss = F.mse_loss(q1_value, target_q_value.detach())
        q2_value_loss = F.mse_loss(q2_value, target_q_value.detach())

        # Policy loss
        policy_loss = (log_prob - torch.min(new_q1_value, new_q2_value)).mean()

        # Update v
        self.value_optimizer.zero_grad()
        value_loss.backward()
        self.value_optimizer.step()

        # Update Soft q
        self.q1_optimizer.zero_grad()
        self.q2_optimizer.zero_grad()
        q1_value_loss.backward()
        q2_value_loss.backward()
        self.q1_optimizer.step()
        self.q2_optimizer.step()

        # Update Policy
        self.policy_optimizer.zero_grad()
        policy_loss.backward()
        self.policy_optimizer.step()

        # Update target networks
        for target_param, param in zip(self.target_value_net.parameters(), self.value_net.parameters()):
            target_param.data.copy_(self.tau * param + (1 - self.tau) * target_param)
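# A sketch of an off-policy loop around the SAC class above, on a continuous-control
# task. ReplayBeffer is not shown here, so its push() signature and len() support
# are assumptions; `done` is pushed as a continuation mask to match the
# reward + done * gamma * V(s') target used in update().
import gym

env = gym.make("Pendulum-v0")                            # classic gym API assumed
agent = SAC(env, gamma=0.99, tau=0.005, buffer_maxlen=100000,
            value_lr=3e-4, q_lr=3e-4, policy_lr=3e-4)
batch_size = 128

for episode in range(10):
    state = env.reset()
    done = False
    while not done:
        action = agent.get_action(state)
        next_state, reward, done, _ = env.step(action)
        mask = 0.0 if done else 1.0                      # continuation mask, not the raw done flag
        agent.buffer.push(state, action, reward, next_state, mask)  # assumed push signature
        state = next_state
        if len(agent.buffer) > batch_size:               # assumes ReplayBeffer supports len()
            agent.update(batch_size)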