import copy
import os

import numpy as np
import torch

# NOTE: BasePolicy, BaseNetwork, Actor, and the CEMOptimizer /
# UniformOptimizer / SupervisedCEMOptimizer action-selection helpers are
# assumed to be defined or imported elsewhere in this package.


class DDQN(BasePolicy):

    def __init__(self, config):
        self.action_size = config['action_size']
        self.device = config['device']
        self.bounds = config['bounds']

        self.model = BaseNetwork(**config).to(config['device'])
        self.target = copy.deepcopy(self.model)
        self.target.eval()

        self.action_select_eval = CEMOptimizer(**config)
        self.action_select_train = UniformOptimizer(**config)

        self.optimizer = torch.optim.Adam(self.model.parameters(),
                                          config['lrate'],
                                          eps=1e-3,
                                          weight_decay=config['decay'])

    def get_weights(self):
        return (self.model.state_dict(), self.target.state_dict())

    def set_weights(self, weights):
        self.model.load_state_dict(weights[0])
        self.target.load_state_dict(weights[1])

    def load_checkpoint(self, checkpoint_dir):
        """Loads a model from a directory containing a checkpoint."""
        if not os.path.exists(checkpoint_dir):
            raise Exception('No checkpoint directory <%s>' % checkpoint_dir)

        path = os.path.join(checkpoint_dir, 'model.pt')
        self.model.load_state_dict(torch.load(path, self.device))
        self.update()

    def save_checkpoint(self, checkpoint_dir):
        """Saves a model to a directory containing a single checkpoint."""
        if not os.path.exists(checkpoint_dir):
            os.makedirs(checkpoint_dir)

        path = os.path.join(checkpoint_dir, 'model.pt')
        torch.save(self.model.state_dict(), path)

    @torch.no_grad()
    def sample_action(self, state, timestep, explore_prob):
        """Samples an action to perform in the environment."""
        if np.random.random() < explore_prob:
            return np.random.uniform(*self.bounds, size=(self.action_size,))
        return self.action_select_eval(self.model, state, timestep)[0].detach()

    def train(self, memory, gamma, batch_size, **kwargs):
        """Performs a single step of Q-Learning."""
        self.model.train()

        # Sample a minibatch from the memory buffer
        s0, act, r, s1, done, timestep = memory.sample(batch_size)

        s0 = torch.from_numpy(s0).to(self.device)
        act = torch.from_numpy(act).to(self.device)
        r = torch.from_numpy(r).to(self.device)
        s1 = torch.from_numpy(s1).to(self.device)
        done = torch.from_numpy(done).to(self.device)
        t0 = torch.from_numpy(timestep).to(self.device)
        t1 = torch.from_numpy(timestep + 1).to(self.device)

        pred = self.model(s0, t0, act).view(-1)

        with torch.no_grad():
            # DDQN finds the maximal action for the current policy ...
            aopt, _ = self.action_select_train(self.model, s1, t1)

            # ... but uses the q-value from the target network
            target = r + (1. - done) * gamma * self.target(s1, t1, aopt).view(-1)

        loss = torch.mean((pred - target) ** 2)

        self.optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(self.model.parameters(), 10.)
        self.optimizer.step()

        return loss.item()

    def update(self):
        """Copy the network weights every few epochs."""
        self.target.load_state_dict(self.model.state_dict())
        self.target.eval()
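
# Illustrative sketch: the double-DQN target computed in DDQN.train above,
# pulled out as a standalone helper. The function name and signature are
# hypothetical; q_target_next stands for target(s1, t1, aopt), i.e. the
# target network evaluated at the online network's argmax action.
def ddqn_target_sketch(r, done, gamma, q_target_next):
    # r + (1 - done) * gamma * Q_target(s', argmax_a Q_online(s', a))
    return r + (1. - done) * gamma * q_target_next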

class CMCRE(BasePolicy):

    def __init__(self, config):
        self.model = BaseNetwork(**config).to(config['device'])

        self.action_size = config['action_size']
        self.device = config['device']
        self.bounds = config['bounds']

        self.action_select_eval = CEMOptimizer(**config)
        self.action_select_train = UniformOptimizer(**config)

        self.optimizer = torch.optim.Adam(self.model.parameters(),
                                          config['lrate'],
                                          eps=1e-3,
                                          weight_decay=config['decay'])

    def get_weights(self):
        return (self.model.state_dict(),)

    def set_weights(self, weights):
        self.model.load_state_dict(weights[0])

    def load_checkpoint(self, checkpoint_dir):
        """Loads a model from a directory containing a checkpoint."""
        if not os.path.exists(checkpoint_dir):
            raise Exception('No checkpoint directory <%s>' % checkpoint_dir)

        path = os.path.join(checkpoint_dir, 'model.pt')
        self.model.load_state_dict(torch.load(path, self.device))

    def save_checkpoint(self, checkpoint_dir):
        """Saves a model to a directory containing a single checkpoint."""
        if not os.path.exists(checkpoint_dir):
            os.makedirs(checkpoint_dir)

        path = os.path.join(checkpoint_dir, 'model.pt')
        torch.save(self.model.state_dict(), path)

    @torch.no_grad()
    def sample_action(self, state, timestep, explore_prob):
        """Samples an action to perform in the environment."""
        if np.random.random() < explore_prob:
            return np.random.uniform(*self.bounds, size=(self.action_size,))
        return self.action_select_eval(self.model, state, timestep)[0].detach()

    def _loss(self, Vstar, Qstar, r, gamma):
        """Calculates the corrected loss over a single episode.

        Assumes that all inputs (Vstar, Qstar, r) belong to a single
        episode only; episodes are obtained by slicing the minibatch
        wherever timestep == 0.
        """
        advantage = Qstar - Vstar

        out = torch.zeros_like(r, requires_grad=False)
        for i in reversed(range(r.shape[0] - 1)):
            out[i] = gamma * (out[i + 1] + (r[i + 1] - advantage[i + 1]))

        # Note that we later normalize over the batch size
        loss = ((Qstar - (r + out)) ** 2).sum()
        return loss

    def train(self, memory, gamma, batch_size, **kwargs):
        # Sample full episodes from memory
        s0, act, r, _, _, timestep = memory.sample(batch_size // 8)

        # Episode boundaries, used to compute the loss per episode
        starts = np.hstack((np.where(timestep == 0)[0], r.shape[0]))

        s0 = torch.from_numpy(s0).to(self.device)
        act = torch.from_numpy(act).to(self.device)
        r = torch.from_numpy(r).to(self.device)
        t0 = torch.from_numpy(timestep).to(self.device)

        # Need both Q and V
        Q = self.model(s0, t0, act).view(-1)
        _, V = self.action_select_train(self.model, s0, t0)

        # Sum the loss over each of the episodes
        loss = 0
        for s, e in zip(starts[:-1], starts[1:]):
            loss = loss + self._loss(V[s:e], Q[s:e], r[s:e], gamma)
        loss = loss / s0.shape[0]

        self.optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(self.model.parameters(), 10.)
        self.optimizer.step()

        return loss.item()

    def update(self):
        pass
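
# Illustrative sketch: the recursion in CMCRE._loss above regresses each
# Q*(s_i, a_i) towards a corrected Monte Carlo return,
#
#     target[i] = r[i] + sum_{k > i} gamma**(k - i) * (r[k] - A[k]),
#
# where A[k] = Q*(s_k, a_k) - V*(s_k) is the advantage. This helper computes
# the same target directly (unvectorized); its name is hypothetical.
def corrected_return_sketch(r, advantage, gamma):
    T = len(r)
    return [r[i] + sum(gamma ** (k - i) * (r[k] - advantage[k])
                       for k in range(i + 1, T))
            for i in range(T)]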

class MCRE(BasePolicy):

    def __init__(self, config):
        self.action_size = config['action_size']
        self.device = config['device']
        self.bounds = config['bounds']

        self.model = BaseNetwork(**config).to(config['device'])
        self.cem = CEMOptimizer(**config)

        self.optimizer = torch.optim.Adam(self.model.parameters(),
                                          config['lrate'],
                                          eps=1e-3,
                                          weight_decay=config['decay'])

    def get_weights(self):
        return (self.model.state_dict(),)

    def set_weights(self, weights):
        self.model.load_state_dict(weights[0])

    def load_checkpoint(self, checkpoint_dir):
        """Loads a model from a directory containing a checkpoint."""
        if not os.path.exists(checkpoint_dir):
            raise Exception('No checkpoint directory <%s>' % checkpoint_dir)

        weights = torch.load(os.path.join(checkpoint_dir, 'model.pt'),
                             self.device)
        self.model.load_state_dict(weights)

    def save_checkpoint(self, checkpoint_dir):
        """Saves a model to a directory containing a single checkpoint."""
        if not os.path.exists(checkpoint_dir):
            os.makedirs(checkpoint_dir)

        path = os.path.join(checkpoint_dir, 'model.pt')
        torch.save(self.model.state_dict(), path)

    @torch.no_grad()
    def sample_action(self, state, timestep, explore_prob):
        """Samples an action to perform using CEM."""
        if np.random.random() < explore_prob:
            return np.random.uniform(*self.bounds, size=(self.action_size,))
        return self.cem(self.model, state, timestep)[0].detach()

    def train(self, memory, gamma, batch_size, **kwargs):
        del gamma  # unused

        # Sample a minibatch from the memory buffer. Note that we sample
        # full grasping episodes in this method, so the output of
        # memory.sample will be episode_length * num_episodes
        s0, act, r, _, _, timestep = memory.sample(batch_size // 8)

        s0 = torch.from_numpy(s0).to(self.device)
        act = torch.from_numpy(act).to(self.device)
        r = torch.from_numpy(r).to(self.device)
        t0 = torch.from_numpy(timestep).to(self.device)

        pred = self.model(s0, t0, act).view(-1)

        # Note that the reward 'r' has been discounted in memory.load
        loss = torch.mean((pred - r) ** 2)

        self.optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(self.model.parameters(), 10.)
        self.optimizer.step()

        return loss.item()

    def update(self):
        pass
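
# Illustrative sketch: MCRE regresses Q(s_t, a_t) directly onto the
# discounted Monte Carlo return, which the comment in MCRE.train above says
# is precomputed when episodes are loaded into memory. Assuming per-step
# rewards, that return would be (helper name is hypothetical):
def discounted_return_sketch(rewards, gamma):
    G, out = 0.0, []
    for rew in reversed(rewards):
        G = rew + gamma * G
        out.append(G)
    return out[::-1]  # G_t = sum_{k >= t} gamma**(k - t) * r_k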

class Supervised(BasePolicy):

    def __init__(self, config):
        self.action_size = config['action_size']
        self.device = config['device']
        self.bounds = config['bounds']

        self.model = BaseNetwork(**config).to(config['device'])

        # Note this optimizer is slightly different from the one
        # used for the other models
        self.action_select_eval = SupervisedCEMOptimizer(**config)

        self.optimizer = torch.optim.Adam(self.model.parameters(),
                                          config['lrate'],
                                          eps=1e-3,
                                          weight_decay=config['decay'])

    def get_weights(self):
        return (self.model.state_dict(),)  # as tuple

    def set_weights(self, weights):
        self.model.load_state_dict(weights[0])

    def load_checkpoint(self, checkpoint_dir):
        """Loads a model from a directory containing a checkpoint."""
        if not os.path.exists(checkpoint_dir):
            raise Exception('No checkpoint directory <%s>' % checkpoint_dir)

        path = os.path.join(checkpoint_dir, 'model.pt')
        self.model.load_state_dict(torch.load(path, self.device))

    def save_checkpoint(self, checkpoint_dir):
        """Saves a model to a directory containing a single checkpoint."""
        if not os.path.exists(checkpoint_dir):
            os.makedirs(checkpoint_dir)

        path = os.path.join(checkpoint_dir, 'model.pt')
        torch.save(self.model.state_dict(), path)

    @torch.no_grad()
    def sample_action(self, state, timestep, explore_prob):
        """Samples an action to perform in the environment."""
        if np.random.random() < explore_prob:
            return np.random.uniform(*self.bounds, size=(self.action_size,))
        return self.action_select_eval(self.model, state, timestep)[0].detach()

    def train(self, memory, batch_size, **kwargs):
        """Performs a single training step."""
        s0, act, r, _, _, timestep = memory.sample(batch_size)

        # The dataset contains more failures than successes, so we
        # balance the minibatch loss by weighting it by class frequency
        weight = np.sum(r) / (batch_size - np.sum(r))
        weight = np.where(r == 0, weight, 1).astype(np.float32)
        weight = torch.from_numpy(weight).to(self.device).view(-1)

        s0 = torch.from_numpy(s0).to(self.device)
        act = torch.from_numpy(act).to(self.device)
        r = torch.from_numpy(r).to(self.device)
        t0 = torch.from_numpy(timestep).to(self.device)

        pred = self.model(s0, t0, act).clamp(1e-8, 1 - 1e-8).view(-1)

        # Uses the outcome of the episode as the label for each step
        loss = torch.nn.BCELoss(weight=weight)(pred, r)

        self.optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(self.model.parameters(), 10.)
        self.optimizer.step()

        return loss.item()

    def update(self):
        pass
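
# Illustrative sketch: the class-balance weights used in Supervised.train.
# With n_pos successes in a batch of size B, each failure is weighted by
# n_pos / (B - n_pos) and each success by 1, so both classes contribute
# equally to the BCE loss in total. Checked numerically with B = 64 and
# n_pos = 16, where each of the 48 failures gets weight 16 / 48 = 1/3:
#
#   >>> import numpy as np
#   >>> r = np.array([1.] * 16 + [0.] * 48)
#   >>> w = np.where(r == 0, r.sum() / (64 - r.sum()), 1.)
#   >>> (w[r == 0].sum(), w[r == 1].sum())
#   (16.0, 16.0)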

class DDPG(BasePolicy):

    def __init__(self, config):
        # Needed for sampling actions
        self.action_size = config['action_size']
        self.device = config['device']
        self.bounds = config['bounds']

        self.critic = BaseNetwork(**config).to(config['device'])
        self.critic_target = copy.deepcopy(self.critic)
        self.critic_target.eval()

        self.actor = Actor(**config).to(config['device'])
        self.actor_target = copy.deepcopy(self.actor)
        self.actor_target.eval()

        self.aopt = torch.optim.Adam(self.actor.parameters(),
                                     config['lrate'],
                                     eps=1e-3,
                                     weight_decay=config['decay'])
        self.copt = torch.optim.Adam(self.critic.parameters(),
                                     config['lrate'],
                                     eps=1e-3,
                                     weight_decay=config['decay'])

    def get_weights(self):
        return (self.actor.state_dict(),
                self.critic.state_dict(),
                self.actor_target.state_dict(),
                self.critic_target.state_dict())

    def set_weights(self, weights):
        self.actor.load_state_dict(weights[0])
        self.critic.load_state_dict(weights[1])
        self.actor_target.load_state_dict(weights[2])
        self.critic_target.load_state_dict(weights[3])

    def load_checkpoint(self, checkpoint_dir):
        """Loads a model from a directory containing a checkpoint."""
        if not os.path.exists(checkpoint_dir):
            raise Exception('No checkpoint directory <%s>' % checkpoint_dir)

        path = os.path.join(checkpoint_dir, 'actor.pt')
        self.actor.load_state_dict(torch.load(path, self.device))

        path = os.path.join(checkpoint_dir, 'critic.pt')
        self.critic.load_state_dict(torch.load(path, self.device))
        self.update()

    def save_checkpoint(self, checkpoint_dir):
        """Saves a model to a directory containing a single checkpoint."""
        if not os.path.exists(checkpoint_dir):
            os.makedirs(checkpoint_dir)

        path = os.path.join(checkpoint_dir, 'actor.pt')
        torch.save(self.actor.state_dict(), path)

        path = os.path.join(checkpoint_dir, 'critic.pt')
        torch.save(self.critic.state_dict(), path)

    @torch.no_grad()
    def sample_action(self, state, timestep, explore_prob):
        """Samples an action to perform in the environment."""
        if np.random.random() < explore_prob:
            return np.random.uniform(-1, 1, self.action_size)

        self.actor.eval()
        if isinstance(state, np.ndarray):
            state = torch.from_numpy(state).to(self.device)
        if isinstance(timestep, float):
            timestep = torch.tensor([timestep], device=self.device)
        return self.actor(state, timestep).detach()

    def train(self, memory, gamma, batch_size, **kwargs):
        self.actor.train()

        s0, act, r, s1, term, timestep = memory.sample(batch_size)

        s0 = torch.from_numpy(s0).to(self.device)
        act = torch.from_numpy(act).to(self.device)
        s1 = torch.from_numpy(s1).to(self.device)
        r = torch.from_numpy(r).to(self.device)
        term = torch.from_numpy(term).to(self.device)
        t0 = torch.from_numpy(timestep).to(self.device)
        t1 = torch.from_numpy(timestep + 1.).to(self.device)

        # Train the critic against the frozen target networks
        pred = self.critic(s0, t0, act).view(-1)

        with torch.no_grad():
            at = self.actor_target(s1, t1)
            qt = self.critic_target(s1, t1, at).view(-1)
            target = r + (1. - term) * gamma * qt

        loss = torch.mean((pred - target) ** 2)

        self.aopt.zero_grad()
        self.copt.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic.parameters(), 10.)
        self.copt.step()
        self.copt.zero_grad()

        # Train the actor by following the policy gradient: compute dQ/da
        # through the critic, then backpropagate it through the actor via
        # action.backward(gradient=...)
        self.aopt.zero_grad()

        action = self.actor(s0, t0)

        q_pred = -self.critic(s0, t0, action).mean()
        q_grad = torch.autograd.grad(q_pred, action)[0]

        action.backward(gradient=q_grad)
        torch.nn.utils.clip_grad_norm_(self.actor.parameters(), 10.)
        self.aopt.step()

        return loss.item()

    def update(self):
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.critic_target.load_state_dict(self.critic.state_dict())
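
# Illustrative sketch: the two-stage actor update in DDPG.train above
# (autograd.grad through the critic, then action.backward) is equivalent to
# minimizing -Q(s, pi(s)) directly. A minimal version over generic modules;
# the function name and signature are hypothetical.
def ddpg_actor_step_sketch(actor, critic, aopt, s0, t0):
    aopt.zero_grad()
    loss = -critic(s0, t0, actor(s0, t0)).mean()
    # Gradients flow through the critic into the actor; any grads that
    # accumulate in the critic here are cleared by copt.zero_grad()
    # before the next critic update.
    loss.backward()
    aopt.step()
    return loss.item()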