def arrayflatgrad(self, f, symmetric=True):
    # Flat gradient of every entry of a square tensor of losses f;
    # only the upper triangle (j >= i) is filled in.
    shape = f.shape + (self.total_size, )
    Res = U.torchify(np.zeros(shape))
    # assert shape[0] == shape[1]
    for i in range(shape[0]):
        for j in range(i, shape[0]):
            Res[i, j] = self.flatgrad(f[i, j], retain=True)
    return Res
def prob_predict(self, x):
    self.eval()
    x = U.torchify(x)
    if len(x.shape) == len(self.input_shape):
        x.unsqueeze_(0)  # add a batch dimension for a single sample
    y = U.get(self.logsoftmax(x).squeeze().exp())  # class probabilities
    self.train()
    return y
def __init__(self, k, alpha=1e-2):
    super(Cholesky, self).__init__()
    self.layer = nn.Linear(k, k, bias=False)
    self.k = k
    self.alpha = alpha
    self.layer.weight.requires_grad = False
    self.Sig = U.torchify(np.eye(k))
    self.min_eig = U.queue(50)
    self.update_weight()
def predict(self, x):
    self.eval()
    x = U.torchify(x)
    if len(x.shape) == len(self.input_shape):
        x.unsqueeze_(0)
    y = U.get(self.forward(x).squeeze())
    self.train()
    return y
def transform(self, state0):
    # crop, resize to (size, size, 3), scale to [0, 1], then reorder the axes
    # according to self.axis before feeding the frame through self.spin
    state = state0[self.crop['up']:self.crop['down'],
                   self.crop['left']:self.crop['right'], :].astype(int)
    state = skimage.transform.resize(state.astype(float),
                                     (self.size, self.size, 3),
                                     mode="constant") / 255.0
    self.last_frame = state.copy()
    self.episode.append(self.last_frame)
    state = state.astype(float).transpose(self.axis)
    return U.get(self.spin(U.torchify(state).unsqueeze(0))).squeeze()
def predict(self, x):
    x = U.torchify(x)
    if len(x.shape) == len(self.input_shape):
        x.unsqueeze_(0)
    return U.get(self.forward(x).squeeze())
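# Illustrative, self-contained sketch (not part of the original repo) of the
# single-sample batching convention the predict/prob_predict methods above rely
# on: when the input has the same rank as `input_shape`, a batch dimension is
# added before the forward pass and squeezed out of the result. `TinyNet` and
# its sizes are hypothetical; torch.as_tensor / .numpy() stand in for
# U.torchify / U.get.
import numpy as np
import torch
import torch.nn as nn


class TinyNet(nn.Module):
    def __init__(self, input_shape=(4, ), n_out=2):
        super(TinyNet, self).__init__()
        self.input_shape = input_shape
        self.fc = nn.Linear(input_shape[0], n_out)

    def forward(self, x):
        return self.fc(x)

    def predict(self, x):
        self.eval()
        x = torch.as_tensor(x, dtype=torch.float32)
        if len(x.shape) == len(self.input_shape):
            x = x.unsqueeze(0)          # add batch dim for a single sample
        with torch.no_grad():
            y = self.forward(x).squeeze()
        self.train()
        return y.numpy()


# single unbatched observation -> vector of 2 outputs
print(TinyNet().predict(np.zeros(4)))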
def set_grad(self, d_theta):
    assert d_theta.shape == (self.total_size, )
    for i, v in enumerate(self.variables()):
        v.grad = U.torchify(d_theta[self.idx[i]:self.idx[i + 1]].view(
            self.shapes[i])).detach()
def set(self, theta):
    assert theta.shape == (self.total_size, )
    for i, v in enumerate(self.variables()):
        v.data = U.torchify(theta[self.idx[i]:self.idx[i + 1]].view(
            self.shapes[i])).detach()
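# Illustrative sketch (an assumption, not the repo's actual flattening helper):
# the bookkeeping that `set` / `set_grad` above rely on -- per-parameter shapes,
# cumulative offsets `idx`, and the companion `get` / `flatgrad` used elsewhere
# (e.g. self.policy.flaten.get()) to flatten parameters and gradients into a
# single vector. `FlatParams` is a hypothetical name.
import numpy as np
import torch


class FlatParams(object):
    def __init__(self, module):
        self.module = module
        self.shapes = [p.shape for p in module.parameters()]
        sizes = [int(np.prod(s)) for s in self.shapes]
        self.idx = np.cumsum([0] + sizes)     # offsets into the flat vector
        self.total_size = int(self.idx[-1])

    def variables(self):
        return list(self.module.parameters())

    def get(self):
        # concatenate all parameters into one flat vector
        return torch.cat([p.data.view(-1) for p in self.variables()])

    def flatgrad(self, loss, retain=False, create=False):
        grads = torch.autograd.grad(loss, self.variables(),
                                    retain_graph=retain, create_graph=create)
        return torch.cat([g.contiguous().view(-1) for g in grads])

    def set(self, theta):
        assert theta.shape == (self.total_size, )
        for i, v in enumerate(self.variables()):
            v.data = theta[self.idx[i]:self.idx[i + 1]].view(self.shapes[i]).detach()


# usage: round-trip a parameter vector through get / set
net = torch.nn.Linear(3, 2)
flat = FlatParams(net)
theta = flat.get()
flat.set(theta + 0.0)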
import numpy as np
import torch
import collections
from base.baseagent import BaseAgent
from core.console import Progbar
import core.math as m_utils
import core.utils as U
from Option import OptionTRPO
import core.console as C


path = self.path_generator.__next__()
self.oldpolicy.copy(self.policy)
for p in self.options:
    p.oldpolicy.copy(p.policy)

states = U.torchify(path["states"])
options = U.torchify(path["options"]).long()
actions = U.torchify(path["actions"]).long()
advantages = U.torchify(path["baseline"])
tdlamret = U.torchify(path["tdlamret"])
vpred = U.torchify(path["vf"])  # predicted value function before update
advantages = (advantages - advantages.mean()) / advantages.std()  # standardized advantage function estimate

losses = self.calculate_losses(states, options, actions, advantages)
kl = losses["meankl"]
optimization_gain = losses["gain"]
loss_grad = self.policy.flaten.flatgrad(optimization_gain, retain=True)
grad_kl = self.policy.flaten.flatgrad(kl, create=True, retain=True)
theta_before = self.policy.flaten.get()

self.log("Init param sum", theta_before.sum())
self.log("explained variance", (vpred - tdlamret).var() / tdlamret.var())

if np.allclose(loss_grad.detach().cpu().numpy(), 0, atol=1e-15):
    print("Got zero gradient. not updating")
def _train(self):
    # Prepare for rollouts
    # ----------------------------------------
    self.oldpolicy.copy(self.policy)
    path = self.path_generator.__next__()

    states = U.torchify(path["state"])
    actions = U.torchify(path["action"]).long()
    advantages = U.torchify(path["advantage"])
    tdlamret = U.torchify(path["tdlamret"])
    vpred = U.torchify(path["vf"])  # predicted value function before update
    advantages = (advantages - advantages.mean()) / advantages.std()  # standardized advantage function estimate

    losses = self.calculate_losses(states, actions, advantages, tdlamret)
    kl = losses["meankl"]
    optimization_gain = losses["gain"]
    loss_grad = self.policy.flaten.flatgrad(optimization_gain, retain=True)
    grad_kl = self.policy.flaten.flatgrad(kl, create=True, retain=True)
    theta_before = self.policy.flaten.get()

    self.log("Init param sum", theta_before.sum())
    self.log("explained variance", (vpred - tdlamret).var() / tdlamret.var())

    if np.allclose(loss_grad.detach().cpu().numpy(), 0, atol=1e-15):
        print("Got zero gradient. not updating")
    else:
        print("Conjugate Gradient", end="")
        start = time.time()
        stepdir = m_utils.conjugate_gradient(self.Fvp(grad_kl),
                                             loss_grad,
                                             cg_iters=self.cg_iters)
        elapsed = time.time() - start
        print(", Done in %.3f" % elapsed)
        self.log("Conjugate Gradient in s", elapsed)
        assert stepdir.sum() != float("Inf")

        shs = .5 * stepdir.dot(self.Fvp(grad_kl)(stepdir))
        lm = torch.sqrt(shs / self.max_kl)
        self.log("lagrange multiplier:", lm)
        self.log("gnorm:", np.linalg.norm(loss_grad.cpu().detach().numpy()))
        fullstep = stepdir / lm
        expected_improve = loss_grad.dot(fullstep)
        surrogate_before = losses["surrogate"]
        stepsize = 1.0

        print("Line Search", end="")
        start = time.time()
        for _ in range(10):
            theta_new = theta_before + fullstep * stepsize
            self.policy.flaten.set(theta_new)
            losses = self.calculate_losses(states, actions, advantages, tdlamret)
            surr = losses["surrogate"]
            improve = surr - surrogate_before
            kl = losses["meankl"]
            if surr == float("Inf") or kl == float("Inf"):
                print("Infinite value of losses")
            elif kl > self.max_kl:
                print("Violated KL")
            elif improve < 0:
                print("Surrogate didn't improve. shrinking step.")
            else:
                print("Expected: %.3f Actual: %.3f" % (expected_improve, improve))
                print("Stepsize OK!")
                self.log("Line Search", "OK")
                break
            stepsize *= .5
        else:
            print("couldn't compute a good step")
            self.log("Line Search", "NOPE")
            self.policy.flaten.set(theta_before)
        elapsed = time.time() - start
        print(", Done in %.3f" % elapsed)
        self.log("Line Search in s", elapsed)
        self.log("KL", kl)
        self.log("Surrogate", surr)

    start = time.time()
    print("Value Function Update", end="")
    self.value_function.fit(states[::5],
                            tdlamret[::5],
                            batch_size=50,
                            epochs=self.vf_iters)
    elapsed = time.time() - start
    print(", Done in %.3f" % elapsed)
    self.log("Value Function Fitting in s", elapsed)
    self.log("TDlamret mean", tdlamret.mean())
    self.log("Last 50 rolls mean rew", np.mean(self.episodes_reward))
    self.log("Last 50 rolls mean len", np.mean(self.episodes_len))
    self.print()
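# Illustrative sketch (an assumption about what m_utils.conjugate_gradient
# implements): standard conjugate gradient for solving A x = b when A is only
# available through a matrix-vector product callable `Avp`, as in the TRPO
# update above where Avp is the Fisher-vector product self.Fvp(grad_kl).
import torch


def conjugate_gradient(Avp, b, cg_iters=10, residual_tol=1e-10):
    x = torch.zeros_like(b)
    r = b.clone()                     # residual (x starts at zero)
    p = b.clone()                     # search direction
    rdotr = r.dot(r)
    for _ in range(cg_iters):
        Ap = Avp(p)
        alpha = rdotr / p.dot(Ap)
        x += alpha * p
        r -= alpha * Ap
        new_rdotr = r.dot(r)
        p = r + (new_rdotr / rdotr) * p
        rdotr = new_rdotr
        if rdotr < residual_tol:
            break
    return x


# usage on a small symmetric positive-definite system: A = M M^T + I
M = torch.randn(5, 5)
A = M @ M.t() + torch.eye(5)
b = torch.randn(5)
x = conjugate_gradient(lambda v: A @ v, b, cg_iters=50)
print(torch.allclose(A @ x, b, atol=1e-4))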
def train(self):
    self.progbar.__init__(self.memory_min)
    # fill the replay memory up to the minimum size before learning starts
    while self.memory.size < self.memory_min:
        self.path_generator.__next__()

    while self.done < self.train_steps:
        to_log = 0
        self.progbar.__init__(self.update_double)
        old_theta = self.Q.flaten.get()
        self.target_Q.copy(self.Q)
        while to_log < self.update_double:
            self.path_generator.__next__()
            rollout = self.memory.sample(self.batch_size)
            state_batch = U.torchify(rollout["state"])
            action_batch = U.torchify(rollout["action"]).long()
            reward_batch = U.torchify(rollout["reward"])
            non_final_batch = U.torchify(1 - rollout["terminated"])
            next_state_batch = U.torchify(rollout["next_state"])

            # Q-values of the actions actually taken
            # current_q = self.Q(state_batch)
            current_q = self.Q(state_batch).gather(
                1, action_batch.unsqueeze(1)).view(-1)
            # double DQN: the online network picks the argmax action on s'
            _, a_prime = self.Q(next_state_batch).max(1)
            # Compute the target of the current Q values:
            # the target network evaluates the chosen action
            next_max_q = self.target_Q(next_state_batch).gather(
                1, a_prime.unsqueeze(1)).view(-1)
            target_q = reward_batch + self.discount * non_final_batch * next_max_q.squeeze()

            # Compute loss
            loss = self.Q.loss(current_q, target_q.detach())
            # loss = self.Q.total_loss(current_q, target_q)

            # Optimize the model
            self.Q.optimize(loss, clip=True)
            self.progbar.add(self.batch_size, values=[("Loss", U.get(loss))])
            to_log += self.batch_size

        self.target_Q.copy(self.Q)
        new_theta = self.Q.flaten.get()
        self.log("Delta Theta L1", U.get((new_theta - old_theta).abs().mean()))
        self.log("Av 50ep rew", np.mean(self.past_rewards))
        self.log("Max 50ep rew", np.max(self.past_rewards))
        self.log("Min 50ep rew", np.min(self.past_rewards))
        self.log("Epsilon", self.eps)
        self.log("Done", self.done)
        self.log("Total", self.train_steps)
        self.target_Q.copy(self.Q)
        self.print()
        # self.play()
        self.save()
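# Illustrative, self-contained example (dummy tensors, not the repo's classes)
# of the double-DQN target computed in the inner loop above: the online network
# chooses the argmax action on s', the target network evaluates it, and
# terminal transitions are masked out by `non_final`.
import torch

batch, n_actions, discount = 4, 3, 0.99
q_online_next = torch.randn(batch, n_actions)    # Q(s', .) from the online net
q_target_next = torch.randn(batch, n_actions)    # Q(s', .) from the target net
reward = torch.randn(batch)
non_final = torch.tensor([1., 1., 0., 1.])       # 0 where the episode terminated

_, a_prime = q_online_next.max(1)                                    # argmax action
next_max_q = q_target_next.gather(1, a_prime.unsqueeze(1)).view(-1)  # evaluated by target net
target_q = reward + discount * non_final * next_max_q
print(target_q)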
def _train(self, path):
    states = U.torchify(path["states"])
    options = U.torchify(path["options"]).long()
    actions = U.torchify(path["actions"]).long()
    advantages = U.torchify(path["baseline"])
    tdlamret = U.torchify(path["tdlamret"])
    vpred = U.torchify(path["vf"])  # predicted value function before update
    # advantages = (advantages - advantages.mean()) / advantages.std()  # standardized advantage function estimate

    losses = self.calculate_losses(states, options, actions, advantages)
    kl = losses["gate_meankl"]
    optimization_gain = losses["gain"]
    loss_grad = self.policy.flaten.flatgrad(optimization_gain, retain=True)
    grad_kl = self.policy.flaten.flatgrad(kl, create=True, retain=True)
    theta_before = self.policy.flaten.get()

    self.log("Init param sum", theta_before.sum())
    self.log("explained variance", (vpred - tdlamret).var() / tdlamret.var())

    if np.allclose(loss_grad.detach().cpu().numpy(), 0, atol=1e-19):
        print("Got zero gradient. not updating")
    else:
        with C.timeit("Conjugate Gradient"):
            stepdir = m_utils.conjugate_gradient(self.Fvp(grad_kl),
                                                 loss_grad,
                                                 cg_iters=self.cg_iters)
        self.log("Conjugate Gradient in s", C.elapsed)
        assert stepdir.sum() != float("Inf")

        shs = .5 * stepdir.dot(self.Fvp(grad_kl)(stepdir))
        lm = torch.sqrt(shs / self.gate_max_kl)
        self.log("lagrange multiplier:", lm)
        self.log("gnorm:", np.linalg.norm(loss_grad.cpu().detach().numpy()))
        fullstep = stepdir / lm
        expected_improve = loss_grad.dot(fullstep)
        surrogate_before = losses["gain"].detach()

        with C.timeit("Line Search"):
            stepsize = 1.0
            for i in range(10):
                theta_new = theta_before + fullstep * stepsize
                self.policy.flaten.set(theta_new)
                surr = losses["surr_get"]()
                improve = surr - surrogate_before
                kl = losses["KL_gate_get"]()
                if surr == float("Inf") or kl == float("Inf"):
                    C.warning("Infinite value of losses")
                elif kl > self.gate_max_kl:
                    C.warning("Violated KL")
                elif improve < 0:
                    stepsize *= self.ls_step
                else:
                    self.log("Line Search", "OK")
                    break
            else:
                improve = 0
                self.log("Line Search", "NOPE")
                self.policy.flaten.set(theta_before)

        for op in self.options:
            losses["gain"] = losses["surr_get"](grad=True)
            op.train(states, options, actions, advantages, tdlamret, losses)

        surr = losses["surr_get"]()
        improve = surr - surrogate_before
        self.log("Expected", expected_improve)
        self.log("Actual", improve)
        self.log("Line Search in s", C.elapsed)
        self.log("LS Steps", i)
        self.log("KL", kl)
        self.log("MI", -losses["MI"])
        self.log("MI improve", -losses["MI_get"]()[0] + losses["MI"])
        self.log("Surrogate", surr)
        self.log("Gate KL", losses["KL_gate_get"]())
        self.log("HRL KL", losses["KL_get"]())
        self.log("TDlamret mean", tdlamret.mean())
        del (improve, surr, kl)

    self.log("Last %i rolls mean rew" % len(self.episodes_reward),
             np.mean(self.episodes_reward))
    self.log("Last %i rolls mean len" % len(self.episodes_len),
             np.mean(self.episodes_len))
    del (losses, states, options, actions, advantages, tdlamret, vpred,
         optimization_gain, loss_grad, grad_kl)
    for _ in range(10):
        gc.collect()
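# Illustrative sketch (an assumption about what self.Fvp(grad_kl) returns in the
# TRPO-style updates above): a closure that computes the KL-Hessian / Fisher
# vector product by double backprop, typically with a small damping term added.
# `make_fvp` and `damping` are hypothetical names.
import torch


def make_fvp(flat_kl_grad, params, damping=1e-2):
    # flat_kl_grad: flattened gradient of the mean KL w.r.t. params,
    #               built with create_graph=True so it can be differentiated again
    def fvp(v):
        grad_v = (flat_kl_grad * v).sum()
        hvp = torch.autograd.grad(grad_v, params, retain_graph=True)
        flat_hvp = torch.cat([g.contiguous().view(-1) for g in hvp])
        return flat_hvp + damping * v
    return fvp


# usage on a toy "KL" surrogate 0.5 * ||w||^2, whose Hessian is the identity
w = torch.nn.Parameter(torch.randn(4))
kl = 0.5 * (w ** 2).sum()
flat_kl_grad = torch.autograd.grad(kl, [w], create_graph=True)[0].view(-1)
fvp = make_fvp(flat_kl_grad, [w])
v = torch.randn(4)
print(fvp(v))        # approximately (1 + damping) * v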