def train(args):
    logger.warning(
        'This script is an example to showcase the MetaModule and '
        'data-loading features of Torchmeta, and as such has been '
        'very lightly tested. For a better tested implementation of '
        'Model-Agnostic Meta-Learning (MAML) using Torchmeta with '
        'more features (including multi-step adaptation and '
        'different datasets), please check `https://github.com/'
        'tristandeleu/pytorch-maml`.')
    inputs = [
        "new_models/210429/dataset/BedBathingBaxterHuman-v0217_0-v1-human-coop-robot-coop_10k"
    ]
    env_name = "BedBathingBaxterHuman-v0217_0-v1"
    env = gym.make('assistive_gym:' + env_name)
    dataset = behaviour(inputs, shots=400, test_shots=1)
    dataloader = BatchMetaDataLoader(dataset,
                                     batch_size=args.batch_size,
                                     shuffle=True,
                                     num_workers=args.num_workers)

    model = PolicyNetwork(env.observation_space_human.shape[0],
                          env.action_space_human.shape[0])
    for key, v in model.features.named_parameters():
        v.data = torch.nn.init.zeros_(v)
    model.to(device=args.device)
    model.train()
    meta_optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

    # Training loop
    pbar = tqdm(total=args.num_batches)
    batch_idx = 0
    while batch_idx < args.num_batches:
        for batch in dataloader:
            model.zero_grad()

            train_inputs, train_targets = batch['train']
            train_inputs = train_inputs.to(device=args.device).float()
            train_targets = train_targets.to(device=args.device).float()

            loss = torch.tensor(0., device=args.device)
            for task_idx, (train_input, train_target) in enumerate(
                    zip(train_inputs, train_targets)):
                train_output = model(train_input)
                loss += get_loss(train_output, train_target)

            model.zero_grad()
            loss.div_(len(dataloader))
            loss.backward()
            meta_optimizer.step()

            pbar.update(1)
            pbar.set_postfix(loss='{0:.4f}'.format(loss.item()))
            batch_idx += 1
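# A minimal sketch of how train() might be invoked from the command line. The
# argument names mirror the fields that train() reads (batch_size, num_workers,
# num_batches, device); the flag spellings and default values here are
# assumptions, not taken from the original script.
if __name__ == '__main__':
    import argparse
    import torch

    parser = argparse.ArgumentParser('Behaviour-cloning meta-training (sketch)')
    parser.add_argument('--batch-size', dest='batch_size', type=int, default=16,
                        help='number of tasks per meta-batch (assumed default)')
    parser.add_argument('--num-workers', dest='num_workers', type=int, default=4,
                        help='data-loading workers (assumed default)')
    parser.add_argument('--num-batches', dest='num_batches', type=int, default=1000,
                        help='number of meta-batches to train on (assumed default)')
    parser.add_argument('--use-cuda', action='store_true',
                        help='train on GPU when available')
    args = parser.parse_args()
    args.device = torch.device(
        'cuda' if args.use_cuda and torch.cuda.is_available() else 'cpu')

    train(args)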
class Agent():
    def __init__(self, state_size, action_size, num_agents):
        state_dim = state_size
        # agent_input_state_dim = state_size*2  # Previous state is passed in with the current state.
        action_dim = action_size
        self.num_agents = num_agents
        max_size = 100000
        self.replay = Replay(max_size)
        hidden_dim = 128

        self.critic_net = ValueNetwork(state_dim, action_dim, hidden_dim).to(device)
        self.target_critic_net = ValueNetwork(state_dim, action_dim, hidden_dim).to(device)
        self.actor_net = PolicyNetwork(state_dim, action_dim, hidden_dim).to(device)
        self.target_actor_net = PolicyNetwork(state_dim, action_dim, hidden_dim).to(device)

        for target_param, param in zip(self.target_critic_net.parameters(),
                                       self.critic_net.parameters()):
            target_param.data.copy_(param.data)
        for target_param, param in zip(self.target_actor_net.parameters(),
                                       self.actor_net.parameters()):
            target_param.data.copy_(param.data)

        self.critic_optimizer = optim.Adam(self.critic_net.parameters(), lr=CRITIC_LEARNING_RATE)
        self.actor_optimizer = optim.Adam(self.actor_net.parameters(), lr=ACTOR_LEARNING_RATE)

    def get_action(self, state):
        return self.actor_net.get_action(state)[0]

    def add_replay(self, state, action, reward, next_state, done):
        for i in range(self.num_agents):
            self.replay.add(state[i], action[i], reward[i], next_state[i], done[i])

    def learning_step(self):
        # Check if the replay buffer contains enough samples for one batch
        if self.replay.cursize < BATCH_SIZE:
            return

        # Get samples
        state, action, reward, next_state, done = self.replay.get(BATCH_SIZE)

        # Calculate losses
        actor_loss = self.critic_net(state, self.actor_net(state))
        actor_loss = -actor_loss.mean()

        next_action = self.target_actor_net(next_state)
        target_value = self.target_critic_net(next_state, next_action.detach())
        expected_value = reward + (1.0 - done) * DISCOUNT_RATE * target_value
        value = self.critic_net(state, action)
        critic_loss = F.mse_loss(value, expected_value.detach())

        # Backprop
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # Soft update
        self.soft_update(self.critic_net, self.target_critic_net, TAU)
        self.soft_update(self.actor_net, self.target_actor_net, TAU)

    def save(self, name):
        torch.save(self.critic_net.state_dict(), name + "_critic")
        torch.save(self.actor_net.state_dict(), name + "_actor")

    def load(self, name):
        self.critic_net.load_state_dict(torch.load(name + "_critic"))
        self.critic_net.eval()
        self.actor_net.load_state_dict(torch.load(name + "_actor"))
        self.actor_net.eval()
        for target_param, param in zip(self.target_critic_net.parameters(),
                                       self.critic_net.parameters()):
            target_param.data.copy_(param.data)
        for target_param, param in zip(self.target_actor_net.parameters(),
                                       self.actor_net.parameters()):
            target_param.data.copy_(param.data)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
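# A minimal sketch of a training loop for the Agent class above, assuming a
# vectorised environment that returns one state/reward/done per agent and a
# gym-like reset()/step() API. The env interface, episode/step counts, and the
# assumption that get_action accepts a single-agent state are illustrative only.
def run_training(env, num_agents, state_size, action_size, n_episodes=200, max_steps=1000):
    agent = Agent(state_size, action_size, num_agents)
    scores = []
    for episode in range(n_episodes):
        states = env.reset()                       # assumed shape: (num_agents, state_size)
        episode_score = 0.0
        for _ in range(max_steps):
            actions = [agent.get_action(s) for s in states]
            next_states, rewards, dones = env.step(actions)
            # store one transition per agent, then take a learning step
            agent.add_replay(states, actions, rewards, next_states, dones)
            agent.learning_step()
            states = next_states
            episode_score += sum(rewards) / len(rewards)
            if any(dones):
                break
        scores.append(episode_score)
    return scores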
class DDPG:
    def __init__(self, cfg):
        self.device = cfg.device
        self.gamma = cfg.gamma
        self.batch_size = cfg.batch_size
        self.value_net = ValueNetwork(cfg.state_dim, cfg.action_dim, cfg.hidden_dim).to(self.device)
        self.policy_net = PolicyNetwork(cfg.state_dim, cfg.action_dim, cfg.hidden_dim).to(self.device)
        self.target_value_net = ValueNetwork(cfg.state_dim, cfg.action_dim, cfg.hidden_dim).to(self.device)
        self.target_value_net.load_state_dict(self.value_net.state_dict())
        self.target_policy_net = PolicyNetwork(cfg.state_dim, cfg.action_dim, cfg.hidden_dim).to(self.device)
        self.target_policy_net.load_state_dict(self.policy_net.state_dict())
        self.soft_tau = cfg.soft_tau
        self.value_lr = cfg.value_lr
        self.policy_lr = cfg.policy_lr
        self.value_optimizer = optim.Adam(self.value_net.parameters(), lr=self.value_lr)
        self.policy_optimizer = optim.Adam(self.policy_net.parameters(), lr=self.policy_lr)
        # mean squared error
        self.value_criterion = nn.MSELoss()
        self.replay_buffer = ReplayBuffer(cfg.replay_buffer_size)

    def update(self, cfg):
        state, action, reward, next_state, done = self.replay_buffer.sample(cfg.batch_size)
        # print(np.shape(state), np.shape(action), np.shape(reward), np.shape(next_state), np.shape(done))
        # (128, 3) (128, 1) (128,) (128, 3) (128,)
        state = torch.FloatTensor(state).to(cfg.device)
        action = torch.FloatTensor(action).to(cfg.device)
        reward = torch.FloatTensor(reward).unsqueeze(1).to(cfg.device)
        next_state = torch.FloatTensor(next_state).to(cfg.device)
        done = torch.FloatTensor(done).unsqueeze(1).to(cfg.device)

        # Actor loss
        policy_loss = self.value_net(state, self.policy_net(state))
        policy_loss = -policy_loss.mean()

        next_action = self.target_policy_net(next_state)
        target_value = self.target_value_net(next_state, next_action.detach())
        TD_target = reward + (1.0 - done) * self.gamma * target_value
        value = self.value_net(state, action)

        # Critic loss
        value_loss = self.value_criterion(value, TD_target.detach())

        self.policy_optimizer.zero_grad()
        policy_loss.backward()
        self.policy_optimizer.step()

        self.value_optimizer.zero_grad()
        value_loss.backward()
        self.value_optimizer.step()

        # Update target networks
        for target_param, param in zip(self.target_value_net.parameters(),
                                       self.value_net.parameters()):
            target_param.data.copy_(target_param.data * (1.0 - self.soft_tau) +
                                    param.data * self.soft_tau)
        for target_param, param in zip(self.target_policy_net.parameters(),
                                       self.policy_net.parameters()):
            target_param.data.copy_(target_param.data * (1.0 - self.soft_tau) +
                                    param.data * self.soft_tau)
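# A minimal sketch of an environment-interaction loop for the DDPG class above.
# It assumes a gym-style env, that `cfg` carries the fields read by __init__ and
# update(), that ReplayBuffer exposes push(...) and __len__, and that torch is
# imported as in the rest of this file; none of these names are guaranteed by
# the original snippet.
def train_ddpg(cfg, env, n_episodes=100, max_steps=200):
    agent = DDPG(cfg)
    for episode in range(n_episodes):
        state = env.reset()
        episode_reward = 0.0
        for _ in range(max_steps):
            # deterministic action from the policy net; exploration noise
            # would normally be added here (omitted for brevity)
            action = agent.policy_net(torch.FloatTensor(state).to(cfg.device))
            action = action.detach().cpu().numpy()
            next_state, reward, done, _ = env.step(action)
            agent.replay_buffer.push(state, action, reward, next_state, done)
            if len(agent.replay_buffer) > cfg.batch_size:
                agent.update(cfg)
            state = next_state
            episode_reward += reward
            if done:
                break
        print(f"episode {episode}: reward {episode_reward:.2f}")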
class A2CAgent():
    """
    init function
    input:
        env, which is CartPole-v0
        gamma, 0.99 in this case
        lr, learning rate, 1e-4 in this case
    define:
        env = env, which is CartPole-v0
        obs_dim: 4 observations
            Observation:
                Type: Box(4)
                Num   Observation            Min       Max
                0     Cart Position          -4.8      4.8
                1     Cart Velocity          -Inf      Inf
                2     Pole Angle             -24 deg   24 deg
                3     Pole Velocity At Tip   -Inf      Inf
        action_dim: 2 actions
            Actions:
                Type: Discrete(2)
                Num   Action
                0     Push cart to the left
                1     Push cart to the right
        value_network: two-layer network with input 4 (observation dim) and output 1 (the state value)
        policy_network: two-layer network with input 4 (observation dim) and output 2 (action dim)
        value and policy optimizers use Adam with the given learning rate
    """

    def __init__(self, env, gamma, lr):
        self.env = env
        self.obs_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.n

        self.gamma = gamma
        self.lr = lr

        self.value_network = ValueNetwork(self.obs_dim, 1)
        self.policy_network = PolicyNetwork(self.obs_dim, self.action_dim)

        self.value_optimizer = optim.Adam(self.value_network.parameters(), lr=self.lr)
        self.policy_optimizer = optim.Adam(self.policy_network.parameters(), lr=self.lr)

    """
    Input a state and get the next action: the policy network outputs logits,
    softmax turns them into action probabilities, and the action is sampled
    from the resulting categorical distribution.
    """

    def get_action(self, state):
        state = torch.FloatTensor(state)
        logits = self.policy_network.forward(state)
        dist = F.softmax(logits, dim=0)
        probs = Categorical(dist)
        return probs.sample().cpu().detach().item()

    """
    Unpack the trajectory and calculate the discounted rewards; train the value
    network against the value targets with an MSE loss, and use the same targets
    to form the advantage for the policy loss.

    Reminder: FloatTensor is a float-typed array, and .view(-1, 1) reshapes a
    tensor into a single column, e.g.
        t = tensor([[1, 2, 3], [4, 5, 6]])
        t.view(-1, 1) -> tensor([[1], [2], [3], [4], [5], [6]])
    """

    def compute_loss(self, trajectory):
        states = torch.FloatTensor([sars[0] for sars in trajectory])
        actions = torch.LongTensor([sars[1] for sars in trajectory]).view(-1, 1)
        rewards = torch.FloatTensor([sars[2] for sars in trajectory])
        next_states = torch.FloatTensor([sars[3] for sars in trajectory])
        dones = torch.FloatTensor([sars[4] for sars in trajectory]).view(-1, 1)

        # compute value target
        ## two loops to calculate the discounted reward from each step onwards
        discounted_rewards = [torch.sum(torch.FloatTensor([self.gamma ** i for i in range(rewards[j:].size(0))])
                                        * rewards[j:]) for j in range(rewards.size(0))]  # sorry, not the most readable code.
        value_targets = rewards.view(-1, 1) + torch.FloatTensor(discounted_rewards).view(-1, 1)

        # compute value loss
        values = self.value_network.forward(states)
        value_loss = F.mse_loss(values, value_targets.detach())

        # compute policy loss with entropy bonus
        logits = self.policy_network.forward(states)
        dists = F.softmax(logits, dim=1)
        probs = Categorical(dists)

        # compute entropy bonus
        entropy = []
        for dist in dists:
            entropy.append(-torch.sum(dist.mean() * torch.log(dist)))
        entropy = torch.stack(entropy).sum()

        advantage = value_targets - values
        policy_loss = -probs.log_prob(actions.view(actions.size(0))).view(-1, 1) * advantage.detach()
        policy_loss = policy_loss.mean() - 0.001 * entropy

        return value_loss, policy_loss

    """
    zero_grad clears old gradients from the last step (otherwise you'd just
    accumulate the gradients from all loss.backward() calls).
    loss.backward() computes the derivative of the loss w.r.t. the parameters
    (or anything requiring gradients) using backpropagation.
    opt.step() causes the optimizer to take a step based on the gradients of
    the parameters.
    """

    def update(self, trajectory):
        value_loss, policy_loss = self.compute_loss(trajectory)

        self.value_optimizer.zero_grad()
        value_loss.backward()
        self.value_optimizer.step()

        self.policy_optimizer.zero_grad()
        policy_loss.backward()
        self.policy_optimizer.step()
class PPOAgent():
    def __init__(self, state_size, action_size, seed, hidden_layers,
                 opt_epoch=OPT_EPOCH, use_gae=True, batch_size=BATCH_SIZE):
        """Initialize a PPO Agent object.

        Arguments
        ---------
            state_size (int): Dimension of state
            action_size (int): Total number of possible actions
            seed (int): Random seed
            hidden_layers (list): List of integers, each element represents the size of a hidden layer
            opt_epoch (int): Number of updates using the collected trajectories
            use_gae (bool): Indicator of using GAE, True or False
        """
        self.state_size = state_size
        self.action_size = action_size
        self.state_normalizer = MeanStdNormalizer()
        self.opt_epoch = opt_epoch
        self.use_gae = use_gae
        self.batch_size = batch_size
        self.seed = random.seed(seed)

        # networks
        if use_gae:
            # self.network = ActorCriticNetwork(state_size, action_size, seed, hidden_layers).to(device)
            self.network = ActorCriticNetwork(state_size, action_size, seed, hidden_layers)
        else:
            # self.network = PolicyNetwork(state_size, action_size, seed, hidden_layers).to(device)
            self.network = PolicyNetwork(state_size, action_size, seed, hidden_layers)
        self.optimizer = optim.Adam(self.network.parameters(), lr=LR)

    # collect trajectories for a parallelized parallelEnv object
    def collect_trajectories(self, envs, brain_name, tmax=ROLLOUT_LEN, nrand=5,
                             n_agents=N, discount=GAMMA, lamda=LAMDA):
        """Collect trajectories.

        Arguments
        ---------
            envs: Environment
            brain_name: Brain name of the given environment
            tmax: Maximum length of collected trajectories
            nrand: Random steps performed before collecting trajectories
            n_agents: Number of parallel agents in the environment
        """
        # number of parallel instances
        n = n_agents

        # initialize returning lists and start the game!
        state_list = []
        reward_list = []
        log_prob_list = []
        action_list = []
        done_list = []
        prediction_list = []

        # reset environment
        env_info = envs.reset(train_mode=True)[brain_name]
        '''
        # perform nrand random steps
        for _ in range(nrand):
            actions = np.random.randn(n, self.action_size)
            actions = np.clip(actions, -1, 1)
            env_info = envs.step(actions)[brain_name]
        '''
        states = env_info.vector_observations
        # states = self.state_normalizer(states)

        for _ in range(tmax):
            # probs will only be used as the pi_old
            # no gradient propagation is needed
            # so we move it to the cpu
            # states_input = torch.tensor(states, dtype=torch.float, device=device)
            states_input = torch.tensor(states, dtype=torch.float)
            predictions = self.network(states_input)
            actions = to_np(predictions['a'])
            actions = np.clip(actions, -1, 1)

            env_info = envs.step(actions)[brain_name]
            next_states = env_info.vector_observations
            # next_states = self.state_normalizer(next_states)
            rewards = env_info.rewards
            dones = env_info.local_done

            # store the result
            state_list.append(states)
            reward_list.append(rewards)
            log_prob_list.append(to_np(predictions['log_pi_a']))
            action_list.append(actions)
            done_list.append(dones)
            prediction_list.append(predictions)

            states = next_states.copy()

            # stop if any of the trajectories is done
            # we want all the lists to be rectangular
            if np.stack(dones).any():
                break

        # store one more step's prediction
        # states_input = torch.tensor(states, dtype=torch.float, device=device)
        states_input = torch.tensor(states, dtype=torch.float)
        predictions = self.network(states_input)
        prediction_list.append(predictions)

        # # return pi_theta, states, actions, rewards, probability
        # return np.stack(log_prob_list), np.stack(state_list), np.stack(action_list), \
        #        np.stack(reward_list), np.stack(done_list), np.stack(prediction_list)

        # calculate accumulated discounted rewards and advantage values
        log_old_probs = np.stack(log_prob_list)
        states = np.stack(state_list)
        actions = np.stack(action_list)
        rewards = np.stack(reward_list)
        dones = np.stack(done_list)
        predictions = np.stack(prediction_list)

        # calculate accumulated discounted rewards and advantage functions
        if not self.use_gae:
            discount_seq = discount**np.arange(len(rewards))
            rewards_discounted = np.asarray(rewards) * discount_seq[:, np.newaxis]
            rewards_future = rewards_discounted[::-1].cumsum(axis=0)[::-1]
            advantages = rewards_future.copy()
        else:
            T = log_old_probs.shape[0]
            rewards_future = np.zeros_like(log_old_probs)
            advantages = np.zeros_like(log_old_probs)
            tmp_adv = np.zeros(log_old_probs.shape[1])
            for i in reversed(range(T)):
                td_error = rewards[i, :] + discount * (1 - dones[i, :]) * to_np(predictions[i + 1]['v']) - \
                           to_np(predictions[i]['v'])
                tmp_adv = tmp_adv * lamda * discount * (1 - dones[i, :]) + td_error
                advantages[i] = tmp_adv.copy()
                rewards_future[i] = tmp_adv + to_np(predictions[i]['v'])

        mean = np.mean(advantages)
        std = np.std(advantages) + 1.0e-10
        adv_normalized = (advantages - mean) / std

        return log_old_probs, states, actions, rewards, rewards_future, adv_normalized

    # clipped surrogate function
    # similar to -policy_loss for REINFORCE, but for PPO
    def clipped_surrogate(self, log_old_probs, states, actions, rewards_future,
                          adv_normalized, epsilon=EPSILON, beta=BETA):
        """Clipped surrogate function.

        Arguments
        ---------
            log_old_probs: Log probability of old policy, array with dim batch_size * 1
            states: States, array with dim batch_size * state_size
            actions: Actions, array with dim batch_size * action_size
            rewards_future: Accumulated discounted rewards, array with dim batch_size * 1
            adv_normalized: Advantage values, array with dim batch_size * 1
        """
        # convert everything into pytorch tensors and move to gpu if available
        # state_count = (states.shape[0], states.shape[1])
        '''
        log_old_probs = torch.tensor(log_old_probs.copy(), dtype=torch.float, device=device)
        adv = torch.tensor(adv_normalized.copy(), dtype=torch.float, device=device)
        rewards_future = torch.tensor(rewards_future.copy(), dtype=torch.float, device=device)
        states = torch.tensor(states.copy(), dtype=torch.float, device=device)
        actions = torch.tensor(actions.copy(), dtype=torch.float, device=device)
        '''
        log_old_probs = torch.tensor(log_old_probs.copy(), dtype=torch.float)
        adv = torch.tensor(adv_normalized.copy(), dtype=torch.float)
        rewards_future = torch.tensor(rewards_future.copy(), dtype=torch.float)
        states = torch.tensor(states.copy(), dtype=torch.float)
        actions = torch.tensor(actions.copy(), dtype=torch.float)

        # convert states to policy (or probability)
        # states_input = states.view(-1, self.state_size)
        # actions_input = actions.view(-1, self.action_size)
        new_predictions = self.network(states, actions)
        # log_new_probs = new_predictions['log_pi_a'].view(state_count)
        log_new_probs = new_predictions['log_pi_a'].view(-1, 1)

        # ratio for clipping
        ratio = (log_new_probs - log_old_probs).exp()

        # clipped function
        clip = torch.clamp(ratio, 1 - epsilon, 1 + epsilon)
        clipped_surrogate = torch.min(ratio * adv, clip * adv)

        # include entropy as a regularization term
        # entropy = new_predictions['entropy'].view(state_count)
        entropy = new_predictions['entropy'].view(-1, 1)

        # policy/actor loss
        policy_loss = -clipped_surrogate.mean() - beta * entropy.mean()

        # value/critic loss, if using GAE
        if self.use_gae:
            # value_loss = (rewards_future - new_predictions['v'].view(state_count)).pow(2).mean()
            value_loss = 1.0 * (rewards_future - new_predictions['v'].view(-1, 1)).pow(2).mean()
            loss = policy_loss + value_loss
        else:
            loss = policy_loss

        # this returns an average of all the loss entries
        return loss

    def step(self, envs, brain_name, grad_clip=10):
        # first, collect trajectories
        log_old_probs, states, actions, rewards, rewards_future, adv_normalized = \
            self.collect_trajectories(envs, brain_name)

        # reshape the data
        log_old_probs_flat = log_old_probs.reshape(-1, 1)
        states_flat = states.reshape(-1, self.state_size)
        actions_flat = actions.reshape(-1, self.action_size)
        rewards_future_flat = rewards_future.reshape(-1, 1)
        adv_normalized_flat = adv_normalized.reshape(-1, 1)

        # update parameters using collected trajectories
        for _ in range(self.opt_epoch):
            # randomly sample from the collected trajectories by mini-batches
            sampler = random_sample(np.arange(states_flat.shape[0]), self.batch_size)
            # then update parameters using the sampled mini-batch
            for batch_indices in sampler:
                self.network.train()
                L = self.clipped_surrogate(log_old_probs_flat[batch_indices],
                                           states_flat[batch_indices],
                                           actions_flat[batch_indices],
                                           rewards_future_flat[batch_indices],
                                           adv_normalized_flat[batch_indices])
                self.optimizer.zero_grad()
                L.backward()
                torch.nn.utils.clip_grad_norm_(self.network.parameters(), grad_clip)
                self.optimizer.step()
                # del L

        return rewards
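# A minimal sketch of an outer training loop for the PPOAgent above. It assumes
# a Unity ML-Agents-style environment object (reset/step keyed by brain_name, as
# used in collect_trajectories) and that numpy is imported as np, as elsewhere
# in this file; the iteration count and score bookkeeping are illustrative only.
def train_ppo(envs, brain_name, state_size, action_size, seed=0,
              hidden_layers=None, n_iterations=300):
    hidden_layers = hidden_layers or [128, 128]    # assumed network sizes
    agent = PPOAgent(state_size, action_size, seed, hidden_layers)
    mean_scores = []
    for it in range(n_iterations):
        # each call collects trajectories and performs opt_epoch PPO updates
        rewards = agent.step(envs, brain_name)
        # rewards has shape (T, n_agents): sum over time, average over agents
        mean_scores.append(np.sum(np.asarray(rewards), axis=0).mean())
        print(f"iteration {it}: mean score {mean_scores[-1]:.2f}")
    return mean_scores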
class SAC(object):
    def __init__(self, config, env):
        self.device = config.device
        self.gamma = config.gamma                               # discount factor
        self.tau = config.tau                                   # soft-update coefficient
        self.value_lr = config.value_lr
        self.soft_q_lr = config.soft_q_lr
        self.policy_lr = config.policy_lr
        self.replace_target_iter = config.replace_target_iter   # target-network update frequency
        self.replay_size = config.replay_size                   # replay buffer size
        self.batch_size = config.batch_size                     # batch size

        self.num_states = env.observation_space.shape[0]        # state-space dimension
        self.num_actions = env.action_space.shape[0]            # action-space dimension

        self.learn_start = self.batch_size * 3                  # number of transitions before learning starts
        self.learn_step_counter = 0                              # total number of learning steps

        self.memory = ReplayMemory(self.replay_size)             # initialize the replay buffer

        # initialize the V network
        self.value_net = ValueNetwork(self.num_states, 256).to(self.device)
        # initialize the target V network
        self.target_value_net = ValueNetwork(self.num_states, 256).to(self.device)
        # the target V network starts with the same parameters as the V network
        for target_param, param in zip(self.target_value_net.parameters(),
                                       self.value_net.parameters()):
            target_param.data.copy_(param.data)

        # initialize the Q network
        self.soft_q_net = SoftQNetwork(self.num_states, self.num_actions, 256).to(self.device)
        # initialize the policy network
        self.policy_net = PolicyNetwork(self.num_states, self.num_actions, 256).to(self.device)

        # optimizers for training
        self.value_optimizer = optim.Adam(self.value_net.parameters(), lr=self.value_lr)
        self.soft_q_optimizer = optim.Adam(self.soft_q_net.parameters(), lr=self.soft_q_lr)
        self.policy_optimizer = optim.Adam(self.policy_net.parameters(), lr=self.policy_lr)

        # mean-squared-error losses
        self.value_criterion = nn.MSELoss()
        self.soft_q_criterion = nn.MSELoss()

    # store a transition
    def store_transition(self, state, action, reward, next_state, done):
        self.memory.push((state, action, reward, next_state, done))

    # choose an action
    def choose_action(self, s):
        s = torch.FloatTensor(s).to(self.device)
        mean, log_std = self.policy_net(s)
        std = log_std.exp()
        normal = Normal(mean, std)
        z = normal.sample()
        action = torch.tanh(z)
        action = action.detach().cpu().numpy()
        return action[0]

    # get an action and its log_prob
    def get_action_log_prob(self, s, epsilon=1e-6):
        mean, log_std = self.policy_net(s)
        std = log_std.exp()
        normal = Normal(mean, std)
        z = normal.sample()
        action = torch.tanh(z)
        log_prob = normal.log_prob(z) - torch.log(1 - action.pow(2) + epsilon)
        log_prob = log_prob.sum(-1, keepdim=True)
        # log_prob = Normal(mean, std).log_prob(mean + std * z.to(self.device)) - torch.log(1 - action.pow(2) + epsilon)  # reparameterization
        return action, log_prob, z, mean, log_std

    # sample a batch from the replay buffer
    def get_batch(self):
        transitions, _, _ = self.memory.sample(self.batch_size)  # batch of transitions
        # unzip the batch,
        # e.g. zipped = [(1, 4), (2, 5), (3, 6)], then zip(*zipped) gives [(1, 2, 3), (4, 5, 6)]
        batch_state, batch_action, batch_reward, batch_next_state, batch_done = zip(*transitions)

        # convert the samples to tensors
        batch_state = torch.tensor(batch_state, device=self.device, dtype=torch.float)
        batch_action = torch.tensor(batch_action, device=self.device,
                                    dtype=torch.float).squeeze().view(-1, 1)  # view as a column tensor
        batch_reward = torch.tensor(batch_reward, device=self.device,
                                    dtype=torch.float).squeeze().view(-1, 1)
        batch_next_state = torch.tensor(batch_next_state, device=self.device, dtype=torch.float)
        batch_done = torch.tensor(batch_done, device=self.device,
                                  dtype=torch.float).squeeze().view(-1, 1)
        # print("state:", batch_state.shape)   # (128, 4)
        # print("action:", batch_action.shape)
        # print("reward:", batch_reward.shape)
        # print("done:", batch_done.shape)
        return batch_state, batch_action, batch_reward, batch_next_state, batch_done, _, _

    # learn
    def learn(self):
        # get a batch of samples
        batch_state, batch_action, batch_reward, batch_next_state, batch_done, _, _ = self.get_batch()
        # print("state:", batch_state)
        # print("action:", batch_action)
        # print("done:", batch_done)

        expected_q_value = self.soft_q_net(batch_state, batch_action)   # q(s,a)
        expected_value = self.value_net(batch_state)                    # v(s)
        new_action, log_prob, z, mean, log_std = self.get_action_log_prob(
            batch_state)                                                # a~, log pi(a~|s), z, mean, std

        target_value = self.target_value_net(batch_next_state)          # v_tar(s')
        next_q_value = batch_reward + (
            1 - batch_done) * self.gamma * target_value                 # r + gamma*(1-d)*v_tar(s')
        q_value_loss = self.soft_q_criterion(expected_q_value, next_q_value.detach()).mean()

        expected_new_q_value = self.soft_q_net(batch_state, new_action)  # q(s,a~)
        next_value = expected_new_q_value - log_prob
        value_loss = self.value_criterion(expected_value, next_value.detach()).mean()

        log_prob_target = expected_new_q_value - expected_value          # q(s,a~) - v(s)
        policy_loss = (log_prob * (log_prob - log_prob_target).detach()).mean()

        self.soft_q_optimizer.zero_grad()
        q_value_loss.backward()
        self.soft_q_optimizer.step()

        self.value_optimizer.zero_grad()
        value_loss.backward()
        self.value_optimizer.step()

        self.policy_optimizer.zero_grad()
        policy_loss.backward()
        self.policy_optimizer.step()

        # soft-update the target V network
        for target_param, param in zip(self.target_value_net.parameters(),
                                       self.value_net.parameters()):
            target_param.data.copy_(target_param.data * (1.0 - self.tau) + param.data * self.tau)

        # increment the learning step counter
        self.learn_step_counter += 1

    # save the models
    def save(self):
        torch.save(self.soft_q_net, 'sac1_q.pkl')
        torch.save(self.value_net, 'sac1_v.pkl')
        torch.save(self.policy_net, 'sac1_policy.pkl')

    # load the models
    def load(self):
        self.soft_q_net = torch.load('sac1_q.pkl')
        self.value_net = torch.load('sac1_v.pkl')
        self.policy_net = torch.load('sac1_policy.pkl')
class SAC:
    def __init__(self, env_name, n_states, n_actions, memory_size, batch_size,
                 gamma, alpha, lr, action_bounds, reward_scale):
        self.env_name = env_name
        self.n_states = n_states
        self.n_actions = n_actions
        self.memory_size = memory_size
        self.batch_size = batch_size
        self.gamma = gamma
        self.alpha = alpha
        self.lr = lr
        self.action_bounds = action_bounds
        self.reward_scale = reward_scale
        self.memory = Memory(memory_size=self.memory_size)

        self.device = "cuda" if torch.cuda.is_available() else "cpu"

        self.policy_network = PolicyNetwork(n_states=self.n_states,
                                            n_actions=self.n_actions,
                                            action_bounds=self.action_bounds).to(self.device)
        self.q_value_network1 = QvalueNetwork(n_states=self.n_states,
                                              n_actions=self.n_actions).to(self.device)
        self.q_value_network2 = QvalueNetwork(n_states=self.n_states,
                                              n_actions=self.n_actions).to(self.device)
        self.value_network = ValueNetwork(n_states=self.n_states).to(self.device)
        self.value_target_network = ValueNetwork(n_states=self.n_states).to(self.device)
        self.value_target_network.load_state_dict(self.value_network.state_dict())
        self.value_target_network.eval()

        self.value_loss = torch.nn.MSELoss()
        self.q_value_loss = torch.nn.MSELoss()

        self.value_opt = Adam(self.value_network.parameters(), lr=self.lr)
        self.q_value1_opt = Adam(self.q_value_network1.parameters(), lr=self.lr)
        self.q_value2_opt = Adam(self.q_value_network2.parameters(), lr=self.lr)
        self.policy_opt = Adam(self.policy_network.parameters(), lr=self.lr)

    def store(self, state, reward, done, action, next_state):
        state = from_numpy(state).float().to("cpu")
        reward = torch.Tensor([reward]).to("cpu")
        done = torch.Tensor([done]).to("cpu")
        action = torch.Tensor([action]).to("cpu")
        next_state = from_numpy(next_state).float().to("cpu")
        self.memory.add(state, reward, done, action, next_state)

    def unpack(self, batch):
        batch = Transition(*zip(*batch))

        states = torch.cat(batch.state).view(self.batch_size, self.n_states).to(self.device)
        rewards = torch.cat(batch.reward).view(self.batch_size, 1).to(self.device)
        dones = torch.cat(batch.done).view(self.batch_size, 1).to(self.device)
        actions = torch.cat(batch.action).view(-1, self.n_actions).to(self.device)
        next_states = torch.cat(batch.next_state).view(self.batch_size, self.n_states).to(self.device)

        return states, rewards, dones, actions, next_states

    def train(self):
        if len(self.memory) < self.batch_size:
            return 0, 0, 0
        else:
            batch = self.memory.sample(self.batch_size)
            states, rewards, dones, actions, next_states = self.unpack(batch)

            # Calculating the value target
            reparam_actions, log_probs = self.policy_network.sample_or_likelihood(states)
            q1 = self.q_value_network1(states, reparam_actions)
            q2 = self.q_value_network2(states, reparam_actions)
            q = torch.min(q1, q2)
            target_value = q.detach() - self.alpha * log_probs.detach()

            value = self.value_network(states)
            value_loss = self.value_loss(value, target_value)

            # Calculating the Q-Value target
            with torch.no_grad():
                target_q = self.reward_scale * rewards + \
                    self.gamma * self.value_target_network(next_states) * (1 - dones)
            q1 = self.q_value_network1(states, actions)
            q2 = self.q_value_network2(states, actions)
            q1_loss = self.q_value_loss(q1, target_q)
            q2_loss = self.q_value_loss(q2, target_q)

            policy_loss = (self.alpha * log_probs - q).mean()

            self.policy_opt.zero_grad()
            policy_loss.backward()
            self.policy_opt.step()

            self.value_opt.zero_grad()
            value_loss.backward()
            self.value_opt.step()

            self.q_value1_opt.zero_grad()
            q1_loss.backward()
            self.q_value1_opt.step()

            self.q_value2_opt.zero_grad()
            q2_loss.backward()
            self.q_value2_opt.step()

            self.soft_update_target_network(self.value_network, self.value_target_network)

            return value_loss.item(), 0.5 * (q1_loss + q2_loss).item(), policy_loss.item()

    def choose_action(self, states):
        states = np.expand_dims(states, axis=0)
        states = from_numpy(states).float().to(self.device)
        action, _ = self.policy_network.sample_or_likelihood(states)
        return action.detach().cpu().numpy()[0]

    @staticmethod
    def soft_update_target_network(local_network, target_network, tau=0.005):
        for target_param, local_param in zip(target_network.parameters(),
                                             local_network.parameters()):
            target_param.data.copy_(tau * local_param.data + (1 - tau) * target_param.data)

    def save_weights(self):
        torch.save(self.policy_network.state_dict(), self.env_name + "_weights.pth")

    def load_weights(self):
        self.policy_network.load_state_dict(torch.load(self.env_name + "_weights.pth"))

    def set_to_eval_mode(self):
        self.policy_network.eval()
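# A minimal sketch of a training loop for the SAC class above (the variant using
# Memory/store/train). The gym-style env, episode counts, and logging are
# assumptions; note that store() takes its arguments in the order
# (state, reward, done, action, next_state), matching the signature above.
def run_sac(env, agent, n_episodes=300, max_steps=1000):
    for episode in range(n_episodes):
        state = env.reset()
        episode_reward = 0.0
        for _ in range(max_steps):
            action = agent.choose_action(state)
            next_state, reward, done, _ = env.step(action)
            agent.store(state, reward, done, action, next_state)
            # train() returns zeros until the buffer holds one full batch
            value_loss, q_loss, policy_loss = agent.train()
            state = next_state
            episode_reward += reward
            if done:
                break
        print(f"episode {episode}: reward {episode_reward:.2f}, "
              f"value loss {value_loss:.3f}, q loss {q_loss:.3f}, policy loss {policy_loss:.3f}")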
class SACAgent:
    def __init__(self, env, gamma, tau, v_lr, q_lr, policy_lr, buffer_maxlen):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.env = env
        # self.action_range = [env.action_space.low, env.action_space.high]
        # TODO: as a simple demo, I changed this; for the full implementation, we should pass it as a parameter
        self.action_range = [[-1, 1], [-1, 1]]
        self.obs_dim = env.observation_space.shape[0]
        self.action_dim = 2
        # self.action_dim = 1

        # hyperparameters
        self.gamma = gamma
        self.tau = tau
        self.update_step = 0
        self.delay_step = 2

        # initialize networks
        self.value_net = ValueNetwork(self.obs_dim, 1).to(self.device)
        self.target_value_net = ValueNetwork(self.obs_dim, 1).to(self.device)
        self.q_net1 = SoftQNetwork(self.obs_dim, self.action_dim).to(self.device)
        self.q_net2 = SoftQNetwork(self.obs_dim, self.action_dim).to(self.device)
        self.policy_net = PolicyNetwork(self.obs_dim, self.action_dim).to(self.device)

        # copy params to target params
        for target_param, param in zip(self.target_value_net.parameters(),
                                       self.value_net.parameters()):
            target_param.data.copy_(param)

        # initialize optimizers
        self.value_optimizer = optim.Adam(self.value_net.parameters(), lr=v_lr)
        self.q1_optimizer = optim.Adam(self.q_net1.parameters(), lr=q_lr)
        self.q2_optimizer = optim.Adam(self.q_net2.parameters(), lr=q_lr)
        self.policy_optimizer = optim.Adam(self.policy_net.parameters(), lr=policy_lr)

        self.replay_buffer = BasicBuffer(buffer_maxlen)

    # pi: state -> action
    def get_action(self, state):
        state = torch.FloatTensor(state).unsqueeze(0).to(self.device)
        mean, log_std = self.policy_net.forward(state)
        std = log_std.exp()

        normal = Normal(mean, std)
        z = normal.sample()
        action = torch.tanh(z)
        action = action.cpu().detach().squeeze(0).numpy()

        return self.rescale_action(action)

    def rescale_action(self, action):
        '''if action < 0.5:
            return 0
        else:
            return 1'''
        # map each action component from [-1, 1] to its action range
        scaled_action = []
        for idx, a in enumerate(action):
            action_range = self.action_range[idx]
            a = a * (action_range[1] - action_range[0]) / 2.0 + \
                (action_range[1] + action_range[0]) / 2.0
            scaled_action.append(a)
        return scaled_action

    def update(self, batch_size):
        states, actions, rewards, next_states, dones = self.replay_buffer.sample(batch_size)
        states = torch.FloatTensor(states).to(self.device)
        actions = torch.FloatTensor(actions).to(self.device)
        rewards = torch.FloatTensor(rewards).to(self.device)
        next_states = torch.FloatTensor(next_states).to(self.device)
        dones = torch.FloatTensor(dones).to(self.device)
        dones = dones.view(dones.size(0), -1)

        next_actions, next_log_pi = self.policy_net.sample(next_states)
        next_q1 = self.q_net1(next_states, next_actions)
        next_q2 = self.q_net2(next_states, next_actions)
        next_v = self.target_value_net(next_states)

        # value loss
        next_v_target = torch.min(next_q1, next_q2) - next_log_pi
        curr_v = self.value_net.forward(states)
        v_loss = F.mse_loss(curr_v, next_v_target.detach())

        # TODO: Question: why use 2 Q-networks?
        # To reduce overestimation bias in training.

        # q loss
        curr_q1 = self.q_net1.forward(states, actions)
        curr_q2 = self.q_net2.forward(states, actions)
        expected_q = rewards + (1 - dones) * self.gamma * next_v
        q1_loss = F.mse_loss(curr_q1, expected_q.detach())
        q2_loss = F.mse_loss(curr_q2, expected_q.detach())

        # update value network and q networks
        self.value_optimizer.zero_grad()
        v_loss.backward()
        self.value_optimizer.step()

        self.q1_optimizer.zero_grad()
        q1_loss.backward()
        self.q1_optimizer.step()

        self.q2_optimizer.zero_grad()
        q2_loss.backward()
        self.q2_optimizer.step()

        # delayed update for the policy net and target value net
        # TODO: Question: what does this part do?
        # The original paper mentions 2 ways to update the target value function:
        #   1. an exponentially moving average of the value-network weights
        #   2. periodically copying the value-network weights into the target network
        # This code performs the target/policy update only every delay_step steps.
        if self.update_step % self.delay_step == 0:
            new_actions, log_pi = self.policy_net.sample(states)
            min_q = torch.min(self.q_net1.forward(states, new_actions),
                              self.q_net2.forward(states, new_actions))
            policy_loss = (log_pi - min_q).mean()

            self.policy_optimizer.zero_grad()
            policy_loss.backward()
            self.policy_optimizer.step()

            # target networks
            for target_param, param in zip(self.target_value_net.parameters(),
                                           self.value_net.parameters()):
                target_param.data.copy_(self.tau * param + (1 - self.tau) * target_param)

        self.update_step += 1
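# A minimal sketch of a training loop for SACAgent above, assuming a gym-style
# env and that BasicBuffer exposes push(state, action, reward, next_state, done)
# and __len__; the batch size, learning rates, and episode counts here are
# illustrative, not taken from the original code.
def train_sac_agent(env, n_episodes=200, max_steps=500, batch_size=64):
    agent = SACAgent(env, gamma=0.99, tau=0.005,
                     v_lr=3e-4, q_lr=3e-4, policy_lr=3e-4, buffer_maxlen=100000)
    for episode in range(n_episodes):
        state = env.reset()
        episode_reward = 0.0
        for _ in range(max_steps):
            action = agent.get_action(state)
            next_state, reward, done, _ = env.step(action)
            agent.replay_buffer.push(state, action, reward, next_state, done)
            if len(agent.replay_buffer) > batch_size:
                agent.update(batch_size)
            state = next_state
            episode_reward += reward
            if done:
                break
        print(f"episode {episode}: reward {episode_reward:.2f}")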