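# The classes below rely on torch, numpy, and the policy/value/discriminator
# networks defined elsewhere in this repository, plus a handful of
# flat-parameter utilities (get_flat_params, get_flat_grads, set_params,
# conjugate_gradient, rescale_and_linesearch). A minimal import header is
# sketched here; the module path "models.nets" and the FloatTensor alias are
# assumptions, not confirmed by this section. Sketches of the helper
# utilities themselves are given after the GAE, GAIL, and TRPO classes below.
import numpy as np
import torch
from torch.nn import Module

from models.nets import PolicyNetwork, ValueNetwork, Discriminator

# Assumed alias: tensors are created on the GPU when one is available, which
# is consistent with the .cpu() and .to("cuda") calls in the training loops.
if torch.cuda.is_available():
    from torch.cuda import FloatTensor
else:
    from torch import FloatTensor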
class GAE(Module):
    def __init__(self, state_dim, action_dim, discrete, train_config=None) -> None:
        super().__init__()

        self.state_dim = state_dim
        self.action_dim = action_dim
        self.discrete = discrete
        self.train_config = train_config

        self.pi = PolicyNetwork(self.state_dim, self.action_dim, self.discrete)
        self.v = ValueNetwork(self.state_dim)

    def get_networks(self):
        return [self.pi, self.v]

    def act(self, state):
        self.pi.eval()

        state = FloatTensor(state)
        distb = self.pi(state)

        action = distb.sample().detach().cpu().numpy()

        return action

    def train(self, env, render=False):
        num_iters = self.train_config["num_iters"]
        num_steps_per_iter = self.train_config["num_steps_per_iter"]
        horizon = self.train_config["horizon"]
        gamma_ = self.train_config["gamma"]
        lambda_ = self.train_config["lambda"]
        eps = self.train_config["epsilon"]
        max_kl = self.train_config["max_kl"]
        cg_damping = self.train_config["cg_damping"]
        normalize_advantage = self.train_config["normalize_advantage"]

        rwd_iter_means = []
        for i in range(num_iters):
            rwd_iter = []

            obs = []
            acts = []
            rets = []
            advs = []
            gms = []

            steps = 0
            while steps < num_steps_per_iter:
                ep_obs = []
                ep_rwds = []
                ep_disc_rwds = []
                ep_gms = []
                ep_lmbs = []

                t = 0
                done = False

                ob = env.reset()

                while not done and steps < num_steps_per_iter:
                    act = self.act(ob)

                    ep_obs.append(ob)
                    obs.append(ob)
                    acts.append(act)

                    if render:
                        env.render()
                    ob, rwd, done, info = env.step(act)

                    ep_rwds.append(rwd)
                    ep_disc_rwds.append(rwd * (gamma_ ** t))
                    ep_gms.append(gamma_ ** t)
                    ep_lmbs.append(lambda_ ** t)

                    t += 1
                    steps += 1

                    if horizon is not None:
                        if t >= horizon:
                            done = True
                            break

                if done:
                    rwd_iter.append(np.sum(ep_rwds))

                ep_obs = FloatTensor(np.array(ep_obs))
                ep_rwds = FloatTensor(ep_rwds)
                ep_disc_rwds = FloatTensor(ep_disc_rwds)
                ep_gms = FloatTensor(ep_gms)
                ep_lmbs = FloatTensor(ep_lmbs)

                ep_disc_rets = FloatTensor(
                    [sum(ep_disc_rwds[i:]) for i in range(t)])
                ep_rets = ep_disc_rets / ep_gms

                rets.append(ep_rets)

                self.v.eval()
                curr_vals = self.v(ep_obs).detach()
                next_vals = torch.cat(
                    (self.v(ep_obs)[1:], FloatTensor([[0.]]))).detach()
                ep_deltas = ep_rwds.unsqueeze(-1)\
                    + gamma_ * next_vals\
                    - curr_vals

                ep_advs = FloatTensor([
                    ((ep_gms * ep_lmbs)[:t - j].unsqueeze(-1) * ep_deltas[j:])
                    .sum()
                    for j in range(t)
                ])
                advs.append(ep_advs)

                gms.append(ep_gms)

            rwd_iter_means.append(np.mean(rwd_iter))
            print("Iterations: {}, Reward Mean: {}".format(
                i + 1, np.mean(rwd_iter)))

            obs = FloatTensor(np.array(obs))
            acts = FloatTensor(np.array(acts))
            rets = torch.cat(rets)
            advs = torch.cat(advs)
            gms = torch.cat(gms)

            if normalize_advantage:
                advs = (advs - advs.mean()) / advs.std()

            self.v.train()
            old_params = get_flat_params(self.v).detach()
            old_v = self.v(obs).detach()

            def constraint():
                return ((old_v - self.v(obs)) ** 2).mean()

            grad_diff = get_flat_grads(constraint(), self.v)

            def Hv(v):
                hessian = get_flat_grads(torch.dot(grad_diff, v), self.v)\
                    .detach()
                return hessian

            g = get_flat_grads(
                ((-1) * (self.v(obs).squeeze() - rets) ** 2).mean(), self.v
            ).detach()
            s = conjugate_gradient(Hv, g).detach()
            Hs = Hv(s).detach()
            alpha = torch.sqrt(2 * eps / torch.dot(s, Hs))

            new_params = old_params + alpha * s
            set_params(self.v, new_params)

            self.pi.train()
            old_params = get_flat_params(self.pi).detach()
            old_distb = self.pi(obs)

            def L():
                distb = self.pi(obs)
                return (advs * torch.exp(
                    distb.log_prob(acts) - old_distb.log_prob(acts).detach()
                )).mean()

            def kld():
                distb = self.pi(obs)

                if self.discrete:
                    old_p = old_distb.probs.detach()
                    p = distb.probs

                    return (old_p * (torch.log(old_p) - torch.log(p)))\
                        .sum(-1)\
                        .mean()
                else:
                    old_mean = old_distb.mean.detach()
                    old_cov = old_distb.covariance_matrix.sum(-1).detach()
                    mean = distb.mean
                    cov = distb.covariance_matrix.sum(-1)

                    return (0.5) * (
                        (old_cov / cov).sum(-1)
                        + (((old_mean - mean) ** 2) / cov).sum(-1)
                        - self.action_dim
                        + torch.log(cov).sum(-1)
                        - torch.log(old_cov).sum(-1)
                    ).mean()

            grad_kld_old_param = get_flat_grads(kld(), self.pi)

            def Hv(v):
                hessian = get_flat_grads(
                    torch.dot(grad_kld_old_param, v), self.pi
                ).detach()
                return hessian + cg_damping * v

            g = get_flat_grads(L(), self.pi).detach()
            s = conjugate_gradient(Hv, g).detach()
            Hs = Hv(s).detach()

            new_params = rescale_and_linesearch(
                g, s, Hs, max_kl, L, kld, old_params, self.pi
            )
            set_params(self.pi, new_params)

        return rwd_iter_means
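# get_flat_params, get_flat_grads, and set_params are referenced above but
# not defined in this section. A minimal sketch of what they are expected to
# do (flatten parameters or gradients into a single vector, and write a flat
# vector back into a network) follows; the exact signatures in the
# repository's utility module may differ.
def get_flat_params(net):
    # Concatenate all parameters of `net` into one 1-D tensor.
    return torch.cat([p.view(-1) for p in net.parameters()])


def set_params(net, flat_params):
    # Copy consecutive slices of `flat_params` back into the parameters of `net`.
    start = 0
    for p in net.parameters():
        n = p.numel()
        p.data.copy_(flat_params[start:start + n].view(p.shape))
        start += n


def get_flat_grads(scalar, net):
    # Gradient of a scalar w.r.t. all parameters of `net`, flattened.
    # create_graph=True so that the Hessian-vector products (the Hv closures
    # above) can differentiate through this gradient a second time.
    grads = torch.autograd.grad(scalar, net.parameters(), create_graph=True)
    return torch.cat([g.view(-1) for g in grads])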
class GAIL:
    def __init__(self, state_dim, action_dim, discrete, train_config=None):
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.discrete = discrete
        self.train_config = train_config

        self.pi = PolicyNetwork(self.state_dim, self.action_dim, self.discrete)
        self.v = ValueNetwork(self.state_dim)
        self.d = Discriminator(self.state_dim, self.action_dim, self.discrete)

        if torch.cuda.is_available():
            for net in self.get_networks():
                net.to(torch.device("cuda"))

    def get_networks(self):
        return [self.pi, self.v]

    def act(self, state):
        self.pi.eval()

        state = FloatTensor(state)
        distb = self.pi(state)

        action = distb.sample().detach().cpu().numpy()

        return action

    def train(self, env, expert, render=False):
        num_iters = self.train_config["num_iters"]
        num_steps_per_iter = self.train_config["num_steps_per_iter"]
        horizon = self.train_config["horizon"]
        lambda_ = self.train_config["lambda"]
        gae_gamma = self.train_config["gae_gamma"]
        gae_lambda = self.train_config["gae_lambda"]
        eps = self.train_config["epsilon"]
        max_kl = self.train_config["max_kl"]
        cg_damping = self.train_config["cg_damping"]
        normalize_advantage = self.train_config["normalize_advantage"]

        opt_d = torch.optim.Adam(self.d.parameters())

        exp_rwd_iter = []

        exp_obs = []
        exp_acts = []

        steps = 0
        while steps < num_steps_per_iter:
            ep_obs = []
            ep_rwds = []

            t = 0
            done = False

            ob = env.reset()

            while not done and steps < num_steps_per_iter:
                act = expert.act(ob)

                ep_obs.append(ob)
                exp_obs.append(ob)
                exp_acts.append(act)

                if render:
                    env.render()
                ob, rwd, done, info = env.step(act)

                ep_rwds.append(rwd)

                t += 1
                steps += 1

                if horizon is not None:
                    if t >= horizon:
                        break

            if done:
                exp_rwd_iter.append(np.sum(ep_rwds))

            ep_obs = FloatTensor(ep_obs)
            ep_rwds = FloatTensor(ep_rwds)

        exp_rwd_mean = np.mean(exp_rwd_iter)
        print("Expert Reward Mean: {}".format(exp_rwd_mean))

        exp_obs = FloatTensor(exp_obs)
        exp_acts = FloatTensor(np.array(exp_acts))

        rwd_iter_means = []
        for i in range(num_iters):
            rwd_iter = []

            obs = []
            acts = []
            rets = []
            advs = []
            gms = []

            steps = 0
            while steps < num_steps_per_iter:
                ep_obs = []
                ep_acts = []
                ep_rwds = []
                ep_costs = []
                ep_disc_costs = []
                ep_gms = []
                ep_lmbs = []

                t = 0
                done = False

                ob = env.reset()

                while not done and steps < num_steps_per_iter:
                    act = self.act(ob)

                    ep_obs.append(ob)
                    obs.append(ob)

                    ep_acts.append(act)
                    acts.append(act)

                    if render:
                        env.render()
                    ob, rwd, done, info = env.step(act)

                    ep_rwds.append(rwd)
                    ep_gms.append(gae_gamma ** t)
                    ep_lmbs.append(gae_lambda ** t)

                    t += 1
                    steps += 1

                    if horizon is not None:
                        if t >= horizon:
                            break

                if done:
                    rwd_iter.append(np.sum(ep_rwds))

                ep_obs = FloatTensor(ep_obs)
                # ep_acts = FloatTensor(np.array(ep_acts)).to(torch.device("cuda"))
                ep_acts = FloatTensor(np.array(ep_acts))
                ep_rwds = FloatTensor(ep_rwds)
                # ep_disc_rwds = FloatTensor(ep_disc_rwds)
                ep_gms = FloatTensor(ep_gms)
                ep_lmbs = FloatTensor(ep_lmbs)

                ep_costs = (-1) * torch.log(self.d(ep_obs, ep_acts))\
                    .squeeze().detach()
                ep_disc_costs = ep_gms * ep_costs

                ep_disc_rets = FloatTensor(
                    [sum(ep_disc_costs[i:]) for i in range(t)])
                ep_rets = ep_disc_rets / ep_gms

                rets.append(ep_rets)

                self.v.eval()
                curr_vals = self.v(ep_obs).detach()
                next_vals = torch.cat(
                    (self.v(ep_obs)[1:], FloatTensor([[0.]]))).detach()
                ep_deltas = ep_costs.unsqueeze(-1)\
                    + gae_gamma * next_vals\
                    - curr_vals

                ep_advs = torch.FloatTensor([
                    ((ep_gms * ep_lmbs)[:t - j].unsqueeze(-1) * ep_deltas[j:])
                    .sum()
                    for j in range(t)
                ])
                advs.append(ep_advs)

                gms.append(ep_gms)

            rwd_iter_means.append(np.mean(rwd_iter))
            print("Iterations: {}, Reward Mean: {}".format(
                i + 1, np.mean(rwd_iter)))

            obs = FloatTensor(obs)
            # acts = FloatTensor(np.array(acts)).to(torch.device("cuda"))
            acts = FloatTensor(np.array(acts))
            rets = torch.cat(rets)
            advs = torch.cat(advs)
            gms = torch.cat(gms)

            if normalize_advantage:
                advs = (advs - advs.mean()) / advs.std()

            self.d.train()
            exp_scores = self.d.get_logits(exp_obs, exp_acts)
            nov_scores = self.d.get_logits(obs, acts)

            opt_d.zero_grad()
            loss = torch.nn.functional.binary_cross_entropy_with_logits(
                exp_scores, torch.zeros_like(exp_scores)
            ) \
                + torch.nn.functional.binary_cross_entropy_with_logits(
                    nov_scores, torch.ones_like(nov_scores)
                )
            loss.backward()
            opt_d.step()

            self.v.train()
            old_params = get_flat_params(self.v).detach()
            old_v = self.v(obs).detach()

            def constraint():
                return ((old_v - self.v(obs)) ** 2).mean()

            grad_diff = get_flat_grads(constraint(), self.v)

            def Hv(v):
                hessian = get_flat_grads(torch.dot(grad_diff, v), self.v)\
                    .detach()
                return hessian

            g = get_flat_grads(
                ((-1) * (self.v(obs).squeeze() - rets) ** 2).mean(), self.v
            ).detach()
            s = conjugate_gradient(Hv, g).detach()
            Hs = Hv(s).detach()
            alpha = torch.sqrt(2 * eps / torch.dot(s, Hs))

            new_params = old_params + alpha * s
            set_params(self.v, new_params)

            self.pi.train()
            old_params = get_flat_params(self.pi).detach()
            old_distb = self.pi(obs)

            def L():
                distb = self.pi(obs)
                return (advs.to(torch.device("cuda")) * torch.exp(
                    distb.log_prob(acts) - old_distb.log_prob(acts).detach()
                )).mean()

            def kld():
                distb = self.pi(obs)

                if self.discrete:
                    old_p = old_distb.probs.detach()
                    p = distb.probs

                    return (old_p * (torch.log(old_p) - torch.log(p)))\
                        .sum(-1)\
                        .mean()
                else:
                    old_mean = old_distb.mean.detach()
                    old_cov = old_distb.covariance_matrix.sum(-1).detach()
                    mean = distb.mean
                    cov = distb.covariance_matrix.sum(-1)

                    return (0.5) * (
                        (old_cov / cov).sum(-1)
                        + (((old_mean - mean) ** 2) / cov).sum(-1)
                        - self.action_dim
                        + torch.log(cov).sum(-1)
                        - torch.log(old_cov).sum(-1)
                    ).mean()

            grad_kld_old_param = get_flat_grads(kld(), self.pi)

            def Hv(v):
                hessian = get_flat_grads(
                    torch.dot(grad_kld_old_param, v), self.pi
                ).detach()
                return hessian + cg_damping * v

            g = get_flat_grads(L(), self.pi).detach()
            s = conjugate_gradient(Hv, g).detach()
            Hs = Hv(s).detach()

            new_params = rescale_and_linesearch(
                g, s, Hs, max_kl, L, kld, old_params, self.pi
            )

            disc_causal_entropy = ((-1) * gms * self.pi(obs).log_prob(acts))\
                .mean()
            grad_disc_causal_entropy = get_flat_grads(
                disc_causal_entropy, self.pi
            )
            new_params += lambda_ * grad_disc_causal_entropy

            set_params(self.pi, new_params)

        return exp_rwd_mean, rwd_iter_means
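# conjugate_gradient is used by GAE, GAIL, and TRPO to solve H x = g without
# ever materialising the Hessian H; only Hessian-vector products (the Hv
# closures above) are needed. A standard sketch is given below; the iteration
# count and tolerance are assumptions rather than the repository's defaults.
def conjugate_gradient(Av_func, b, max_iter=10, residual_tol=1e-10):
    x = torch.zeros_like(b)
    r = b.clone()            # residual b - A x (x starts at zero)
    p = r.clone()            # search direction
    rs_old = torch.dot(r, r)

    for _ in range(max_iter):
        Ap = Av_func(p)
        alpha = rs_old / torch.dot(p, Ap)
        x += alpha * p
        r -= alpha * Ap
        rs_new = torch.dot(r, r)
        if rs_new < residual_tol:
            break
        p = r + (rs_new / rs_old) * p
        rs_old = rs_new

    return x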
class TRPO(Module):
    def __init__(
        self, state_dim, action_dim, discrete, train_config=None
    ) -> None:
        super().__init__()

        self.state_dim = state_dim
        self.action_dim = action_dim
        self.discrete = discrete
        self.train_config = train_config

        self.pi = PolicyNetwork(self.state_dim, self.action_dim, self.discrete)
        if self.train_config["use_baseline"]:
            self.v = ValueNetwork(self.state_dim)

    def get_networks(self):
        if self.train_config["use_baseline"]:
            return [self.pi, self.v]
        else:
            return [self.pi]

    def act(self, state):
        self.pi.eval()

        state = FloatTensor(state)
        distb = self.pi(state)

        action = distb.sample().detach().cpu().numpy()

        return action

    def train(self, env, render=False):
        lr = self.train_config["lr"]
        num_iters = self.train_config["num_iters"]
        num_steps_per_iter = self.train_config["num_steps_per_iter"]
        horizon = self.train_config["horizon"]
        discount = self.train_config["discount"]
        max_kl = self.train_config["max_kl"]
        cg_damping = self.train_config["cg_damping"]
        normalize_return = self.train_config["normalize_return"]
        use_baseline = self.train_config["use_baseline"]

        if use_baseline:
            opt_v = torch.optim.Adam(self.v.parameters(), lr)

        rwd_iter_means = []
        for i in range(num_iters):
            rwd_iter = []

            obs = []
            acts = []
            rets = []
            disc = []

            steps = 0
            while steps < num_steps_per_iter:
                ep_rwds = []
                ep_disc_rwds = []
                ep_disc = []

                t = 0
                done = False

                ob = env.reset()

                while not done and steps < num_steps_per_iter:
                    act = self.act(ob)

                    obs.append(ob)
                    acts.append(act)

                    if render:
                        env.render()
                    ob, rwd, done, info = env.step(act)

                    ep_rwds.append(rwd)
                    ep_disc_rwds.append(rwd * (discount ** t))
                    ep_disc.append(discount ** t)

                    t += 1
                    steps += 1

                    if horizon is not None:
                        if t >= horizon:
                            done = True
                            break

                ep_disc = FloatTensor(ep_disc)

                ep_disc_rets = FloatTensor(
                    [sum(ep_disc_rwds[i:]) for i in range(t)]
                )
                ep_rets = ep_disc_rets / ep_disc

                rets.append(ep_rets)
                disc.append(ep_disc)

                if done:
                    rwd_iter.append(np.sum(ep_rwds))

            rwd_iter_means.append(np.mean(rwd_iter))
            print(
                "Iterations: {}, Reward Mean: {}"
                .format(i + 1, np.mean(rwd_iter))
            )

            obs = FloatTensor(np.array(obs))
            acts = FloatTensor(np.array(acts))
            rets = torch.cat(rets)
            disc = torch.cat(disc)

            if normalize_return:
                rets = (rets - rets.mean()) / rets.std()

            if use_baseline:
                self.v.eval()
                delta = (rets - self.v(obs).squeeze()).detach()

                self.v.train()

                opt_v.zero_grad()
                loss = (-1) * disc * delta * self.v(obs).squeeze()
                loss.mean().backward()
                opt_v.step()

            self.pi.train()
            old_params = get_flat_params(self.pi).detach()
            old_distb = self.pi(obs)

            def L():
                distb = self.pi(obs)

                if use_baseline:
                    return (disc * delta * torch.exp(
                        distb.log_prob(acts)
                        - old_distb.log_prob(acts).detach()
                    )).mean()
                else:
                    return (disc * rets * torch.exp(
                        distb.log_prob(acts)
                        - old_distb.log_prob(acts).detach()
                    )).mean()

            def kld():
                distb = self.pi(obs)

                if self.discrete:
                    old_p = old_distb.probs.detach()
                    p = distb.probs

                    return (old_p * (torch.log(old_p) - torch.log(p)))\
                        .sum(-1)\
                        .mean()
                else:
                    old_mean = old_distb.mean.detach()
                    old_cov = old_distb.covariance_matrix.sum(-1).detach()
                    mean = distb.mean
                    cov = distb.covariance_matrix.sum(-1)

                    return (0.5) * (
                        (old_cov / cov).sum(-1)
                        + (((old_mean - mean) ** 2) / cov).sum(-1)
                        - self.action_dim
                        + torch.log(cov).sum(-1)
                        - torch.log(old_cov).sum(-1)
                    ).mean()

            grad_kld_old_param = get_flat_grads(kld(), self.pi)

            def Hv(v):
                hessian = get_flat_grads(
                    torch.dot(grad_kld_old_param, v), self.pi
                ).detach()
                return hessian + cg_damping * v

            g = get_flat_grads(L(), self.pi).detach()
            s = conjugate_gradient(Hv, g).detach()
            Hs = Hv(s).detach()

            new_params = rescale_and_linesearch(
                g, s, Hs, max_kl, L, kld, old_params, self.pi
            )
            set_params(self.pi, new_params)

        return rwd_iter_means
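# rescale_and_linesearch turns the conjugate-gradient direction s into an
# actual parameter update: it rescales s to the largest step allowed by the
# quadratic KL constraint and then backtracks until the surrogate objective L
# improves while kld stays below max_kl. This is the usual TRPO line search;
# the backtracking coefficient, iteration count, and success ratio below are
# assumptions, not necessarily the repository's values.
def rescale_and_linesearch(
    g, s, Hs, max_kl, L, kld, old_params, pi,
    max_iter=10, success_ratio=0.1
):
    set_params(pi, old_params)
    L_old = L().detach()

    # Largest step size satisfying 0.5 * (beta*s)^T H (beta*s) <= max_kl.
    beta = torch.sqrt(2 * max_kl / torch.dot(s, Hs))

    for i in range(max_iter):
        step = (0.5 ** i) * beta * s
        new_params = old_params + step
        set_params(pi, new_params)

        kld_new = kld().detach()
        L_new = L().detach()

        actual_improv = L_new - L_old
        approx_improv = torch.dot(g, step)
        ratio = actual_improv / approx_improv

        if ratio > success_ratio and actual_improv > 0 and kld_new < max_kl:
            return new_params

    # No acceptable step found; keep the old parameters.
    set_params(pi, old_params)
    return old_params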
class ActorCritic(Module):
    def __init__(self, state_dim, action_dim, discrete, train_config=None) -> None:
        super().__init__()

        self.state_dim = state_dim
        self.action_dim = action_dim
        self.discrete = discrete
        self.train_config = train_config

        self.pi = PolicyNetwork(self.state_dim, self.action_dim, self.discrete)
        self.v = ValueNetwork(self.state_dim)

    def get_networks(self):
        return [self.pi, self.v]

    def act(self, state):
        self.pi.eval()

        state = FloatTensor(state)
        distb = self.pi(state)

        action = distb.sample().detach().cpu().numpy()

        return action

    def train(self, env, render=False):
        lr = self.train_config["lr"]
        num_iters = self.train_config["num_iters"]
        num_steps_per_iter = self.train_config["num_steps_per_iter"]
        horizon = self.train_config["horizon"]
        discount = self.train_config["discount"]
        normalize_advantage = self.train_config["normalize_advantage"]

        opt_pi = torch.optim.Adam(self.pi.parameters(), lr)
        opt_v = torch.optim.Adam(self.v.parameters(), lr)

        rwd_iter_means = []
        rwd_iter = []

        i = 0
        steps = 0
        while i < num_iters:
            obs = []
            acts = []
            rwds = []
            disc_rwds = []
            disc = []

            t = 0
            done = False

            ob = env.reset()

            while not done:
                act = self.act(ob)

                obs.append(ob)
                acts.append(act)

                if render:
                    env.render()
                ob, rwd, done, info = env.step(act)

                rwds.append(rwd)
                disc_rwds.append(rwd * (discount ** t))
                disc.append(discount ** t)

                t += 1
                steps += 1
                if steps == num_steps_per_iter:
                    rwd_iter_means.append(np.mean(rwd_iter))
                    print("Iterations: {}, Reward Mean: {}".format(
                        i + 1, np.mean(rwd_iter)))

                    i += 1
                    steps = 0
                    rwd_iter = []

                if horizon is not None:
                    if t >= horizon:
                        done = True
                        break

            rwd_iter.append(np.sum(rwds))

            obs = FloatTensor(np.array(obs))
            acts = FloatTensor(np.array(acts))
            rwds = FloatTensor(rwds)
            disc = FloatTensor(disc)

            ###
            disc_rets = FloatTensor(
                [sum(disc_rwds[i:]) for i in range(len(disc_rwds))])
            rets = disc_rets / disc
            ###

            self.v.eval()
            curr_vals = self.v(obs)
            next_vals = torch.cat((self.v(obs)[1:], FloatTensor([[0.]])))
            advantage = (rwds.unsqueeze(-1) + discount * next_vals - curr_vals)\
                .detach()
            if normalize_advantage:
                advantage = (advantage - advantage.mean()) / advantage.std()
            # print(advantage.shape, obs.shape, disc.shape)

            delta = (rets - self.v(obs).squeeze()).detach()

            self.v.train()

            opt_v.zero_grad()
            # loss = (0.5) * (
            #     rwds.unsqueeze(-1)
            #     + discount * next_vals.detach()
            #     - self.v(obs)
            # ) ** 2
            loss = (-1) * disc * delta * self.v(obs).squeeze()
            # loss = (0.5) * ((rets - self.v(obs).squeeze()) ** 2)
            # loss = (-1) * disc.unsqueeze(-1) * advantage * self.v(obs)
            # print(loss.shape)
            loss.mean().backward()
            opt_v.step()

            self.pi.train()
            distb = self.pi(obs)

            opt_pi.zero_grad()
            loss = (-1) * disc.unsqueeze(-1) * advantage * distb.log_prob(acts)
            loss.mean().backward()
            opt_pi.step()

        return rwd_iter_means
class PPO(Module):
    def __init__(self, state_dim, action_dim, discrete, train_config=None) -> None:
        super().__init__()

        self.state_dim = state_dim
        self.action_dim = action_dim
        self.discrete = discrete
        self.train_config = train_config

        self.pi = PolicyNetwork(self.state_dim, self.action_dim, self.discrete)
        self.v = ValueNetwork(self.state_dim)

    def get_networks(self):
        return [self.pi, self.v]

    def act(self, state):
        self.pi.eval()

        state = FloatTensor(state)
        distb = self.pi(state)

        action = distb.sample().detach().cpu().numpy()

        return action

    def train(self, env, render=False):
        lr = self.train_config["lr"]
        num_iters = self.train_config["num_iters"]
        num_steps_per_iter = self.train_config["num_steps_per_iter"]
        num_epochs = self.train_config["num_epochs"]
        minibatch_size = self.train_config["minibatch_size"]
        horizon = self.train_config["horizon"]
        gamma_ = self.train_config["gamma"]
        lambda_ = self.train_config["lambda"]
        eps = self.train_config["epsilon"]
        c1 = self.train_config["vf_coeff"]
        c2 = self.train_config["entropy_coeff"]
        normalize_advantage = self.train_config["normalize_advantage"]

        opt_pi = torch.optim.Adam(self.pi.parameters(), lr)
        opt_v = torch.optim.Adam(self.v.parameters(), lr)

        rwd_iter_means = []
        for i in range(num_iters):
            rwd_iter = []

            obs = []
            acts = []
            rets = []
            advs = []
            gms = []

            steps = 0
            while steps < num_steps_per_iter:
                ep_obs = []
                ep_rwds = []
                ep_disc_rwds = []
                ep_gms = []
                ep_lmbs = []

                t = 0
                done = False

                ob = env.reset()

                while not done and steps < num_steps_per_iter:
                    act = self.act(ob)

                    ep_obs.append(ob)
                    obs.append(ob)
                    acts.append(act)

                    if render:
                        env.render()
                    ob, rwd, done, info = env.step(act)

                    ep_rwds.append(rwd)
                    ep_disc_rwds.append(rwd * (gamma_ ** t))
                    ep_gms.append(gamma_ ** t)
                    ep_lmbs.append(lambda_ ** t)

                    t += 1
                    steps += 1

                    if horizon is not None:
                        if t >= horizon:
                            done = True
                            break

                if done:
                    rwd_iter.append(np.sum(ep_rwds))

                ep_obs = FloatTensor(np.array(ep_obs))
                ep_rwds = FloatTensor(ep_rwds)
                ep_disc_rwds = FloatTensor(ep_disc_rwds)
                ep_gms = FloatTensor(ep_gms)
                ep_lmbs = FloatTensor(ep_lmbs)

                ep_disc_rets = FloatTensor(
                    [sum(ep_disc_rwds[i:]) for i in range(t)])
                ep_rets = ep_disc_rets / ep_gms

                rets.append(ep_rets)

                self.v.eval()
                curr_vals = self.v(ep_obs).detach()
                next_vals = torch.cat(
                    (self.v(ep_obs)[1:], FloatTensor([[0.]]))).detach()
                ep_deltas = ep_rwds.unsqueeze(-1)\
                    + gamma_ * next_vals\
                    - curr_vals

                ep_advs = FloatTensor([
                    ((ep_gms * ep_lmbs)[:t - j].unsqueeze(-1) * ep_deltas[j:])
                    .sum()
                    for j in range(t)
                ])
                advs.append(ep_advs)

                gms.append(ep_gms)

            rwd_iter_means.append(np.mean(rwd_iter))
            print("Iterations: {}, Reward Mean: {}".format(
                i + 1, np.mean(rwd_iter)))

            obs = FloatTensor(np.array(obs))
            acts = FloatTensor(np.array(acts))
            rets = torch.cat(rets)
            advs = torch.cat(advs)
            gms = torch.cat(gms)

            if normalize_advantage:
                advs = (advs - advs.mean()) / advs.std()

            self.pi.eval()
            old_log_pi = self.pi(obs).log_prob(acts).detach()

            self.pi.train()
            self.v.train()

            max_steps = num_epochs * (num_steps_per_iter // minibatch_size)

            for _ in range(max_steps):
                minibatch_indices = np.random.choice(
                    range(steps), minibatch_size, False)
                mb_obs = obs[minibatch_indices]
                mb_acts = acts[minibatch_indices]
                mb_advs = advs[minibatch_indices]
                mb_rets = rets[minibatch_indices]

                mb_distb = self.pi(mb_obs)
                mb_log_pi = mb_distb.log_prob(mb_acts)
                mb_old_log_pi = old_log_pi[minibatch_indices]

                r = torch.exp(mb_log_pi - mb_old_log_pi)

                L_clip = torch.minimum(
                    r * mb_advs, torch.clip(r, 1 - eps, 1 + eps) * mb_advs
                )
                L_vf = (self.v(mb_obs).squeeze() - mb_rets) ** 2
                S = mb_distb.entropy()

                opt_pi.zero_grad()
                opt_v.zero_grad()
                loss = (-1) * (L_clip - c1 * L_vf + c2 * S).mean()
                loss.backward()
                opt_pi.step()
                opt_v.step()

        return rwd_iter_means
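# A short usage sketch for the PPO agent above. It assumes an OpenAI Gym
# environment with the old (obs, reward, done, info) step API, which is the
# API these training loops are written against. The environment name and
# hyperparameter values are illustrative assumptions, not the repository's
# defaults.
def run_ppo_cartpole():
    import gym

    env = gym.make("CartPole-v1")
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.n  # discrete action space

    train_config = {
        "lr": 3e-4,
        "num_iters": 100,
        "num_steps_per_iter": 2048,
        "num_epochs": 10,
        "minibatch_size": 64,
        "horizon": None,
        "gamma": 0.99,
        "lambda": 0.95,
        "epsilon": 0.2,
        "vf_coeff": 0.5,
        "entropy_coeff": 0.01,
        "normalize_advantage": True,
    }

    agent = PPO(state_dim, action_dim, discrete=True, train_config=train_config)
    return agent.train(env)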
class PolicyGradient(Module):
    def __init__(self, state_dim, action_dim, discrete, train_config=None) -> None:
        super().__init__()

        self.state_dim = state_dim
        self.action_dim = action_dim
        self.discrete = discrete
        self.train_config = train_config

        self.pi = PolicyNetwork(self.state_dim, self.action_dim, self.discrete)
        if self.train_config["use_baseline"]:
            self.v = ValueNetwork(self.state_dim)

    def get_networks(self):
        if self.train_config["use_baseline"]:
            return [self.pi, self.v]
        else:
            return [self.pi]

    def act(self, state):
        self.pi.eval()

        state = FloatTensor(state)
        distb = self.pi(state)

        action = distb.sample().detach().cpu().numpy()

        return action

    def train(self, env, render=False):
        lr = self.train_config["lr"]
        num_iters = self.train_config["num_iters"]
        num_steps_per_iter = self.train_config["num_steps_per_iter"]
        horizon = self.train_config["horizon"]
        discount = self.train_config["discount"]
        normalize_return = self.train_config["normalize_return"]
        use_baseline = self.train_config["use_baseline"]

        opt_pi = torch.optim.Adam(self.pi.parameters(), lr)
        if use_baseline:
            opt_v = torch.optim.Adam(self.v.parameters(), lr)

        rwd_iter_means = []
        rwd_iter = []

        i = 0
        steps = 0
        while i < num_iters:
            obs = []
            acts = []
            rwds = []
            disc_rwds = []
            disc = []

            t = 0
            done = False

            ob = env.reset()

            while not done:
                act = self.act(ob)

                obs.append(ob)
                acts.append(act)

                if render:
                    env.render()
                ob, rwd, done, info = env.step(act)

                rwds.append(rwd)
                disc_rwds.append(rwd * (discount ** t))
                disc.append(discount ** t)

                t += 1
                steps += 1
                if steps == num_steps_per_iter:
                    rwd_iter_means.append(np.mean(rwd_iter))
                    print("Iterations: {}, Reward Mean: {}".format(
                        i + 1, np.mean(rwd_iter)))

                    i += 1
                    steps = 0
                    rwd_iter = []

                if horizon is not None:
                    if t >= horizon:
                        done = True
                        break

            rwd_iter.append(np.sum(rwds))

            obs = FloatTensor(np.array(obs))
            acts = FloatTensor(np.array(acts))
            disc = FloatTensor(disc)

            disc_rets = FloatTensor(
                [sum(disc_rwds[i:]) for i in range(len(disc_rwds))])
            rets = disc_rets / disc

            if normalize_return:
                rets = (rets - rets.mean()) / rets.std()

            if use_baseline:
                self.v.eval()
                delta = (rets - self.v(obs).squeeze()).detach()

                self.v.train()

                opt_v.zero_grad()
                loss = (-1) * disc * delta * self.v(obs).squeeze()
                loss.mean().backward()
                opt_v.step()

            self.pi.train()
            distb = self.pi(obs)

            opt_pi.zero_grad()
            if use_baseline:
                loss = (-1) * disc * delta * distb.log_prob(acts)
            else:
                loss = (-1) * disc * distb.log_prob(acts) * rets
            loss.mean().backward()
            opt_pi.step()

        return rwd_iter_means
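# A sketch of how the GAIL class is meant to be driven: the expert passed to
# GAIL.train only needs an act(state) method, so any of the agents defined
# above (for example, a PPO policy trained on the true reward) can serve as
# the expert. The environment name, the config values, and the idea of
# reusing a PPO agent as the expert are assumptions for illustration only.
def run_gail_cartpole(expert):
    import gym

    env = gym.make("CartPole-v1")
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.n

    train_config = {
        "num_iters": 100,
        "num_steps_per_iter": 2048,
        "horizon": None,
        "lambda": 1e-3,          # causal-entropy coefficient
        "gae_gamma": 0.99,
        "gae_lambda": 0.99,
        "epsilon": 0.01,
        "max_kl": 0.025,
        "cg_damping": 0.1,
        "normalize_advantage": True,
    }

    gail = GAIL(state_dim, action_dim, discrete=True, train_config=train_config)
    exp_rwd_mean, rwd_iter_means = gail.train(env, expert)
    return exp_rwd_mean, rwd_iter_means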