Example 1
    def __init__(self, config):

        self.action_size = config['action_size']
        self.device = config['device']
        self.bounds = config['bounds']

        self.model = BaseNetwork(**config).to(config['device'])

        self.cem = CEMOptimizer(**config)

        self.optimizer = torch.optim.Adam(self.model.parameters(),
                                          config['lrate'],
                                          eps=1e-3,
                                          weight_decay=config['decay'])
Example 2
    def __init__(self, config):

        self.action_size = config['action_size']
        self.device = config['device']
        self.bounds = config['bounds']

        self.model = BaseNetwork(**config).to(config['device'])

        # Note this optimizer is slightly different from the one
        # used for the other models
        self.action_select_eval = SupervisedCEMOptimizer(**config)

        self.optimizer = torch.optim.Adam(self.model.parameters(),
                                          config['lrate'],
                                          eps=1e-3,
                                          weight_decay=config['decay'])
Example 3
    def __init__(self, config):

        self.action_size = config['action_size']
        self.device = config['device']
        self.bounds = config['bounds']

        self.model = BaseNetwork(**config).to(config['device'])
        self.target = copy.deepcopy(self.model)
        self.target.eval()

        self.action_select_eval = CEMOptimizer(**config)
        self.action_select_train = UniformOptimizer(**config)

        self.optimizer = torch.optim.Adam(self.model.parameters(),
                                          config['lrate'],
                                          eps=1e-3,
                                          weight_decay=config['decay'])
Example 4
    def __init__(self, config):

        # Needed for sampling actions
        self.action_size = config['action_size']
        self.device = config['device']
        self.bounds = config['bounds']

        self.critic = BaseNetwork(**config).to(config['device'])
        self.critic_target = copy.deepcopy(self.critic)
        self.critic_target.eval()

        self.actor = Actor(**config).to(config['device'])
        self.actor_target = copy.deepcopy(self.actor)
        self.actor_target.eval()

        self.aopt = torch.optim.Adam(self.actor.parameters(),
                                     config['lrate'],
                                     eps=1e-3,
                                     weight_decay=config['decay'])

        self.copt = torch.optim.Adam(self.critic.parameters(),
                                     config['lrate'],
                                     eps=1e-3,
                                     weight_decay=config['decay'])
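
Each constructor above reads the same handful of keys from config. The sketch below is restricted to the keys that are visibly read in these snippets; since BaseNetwork, Actor, and the action-selection optimizers are all built with **config, they may expect additional entries that are not shown here, and the concrete values are illustrative assumptions.

import torch

# Illustrative only: covers the keys read directly in the constructors above.
# BaseNetwork / Actor / CEMOptimizer also receive **config and may require
# further entries not listed here.
config = {
    'action_size': 4,      # dimensionality of the sampled action vector
    'device': 'cuda' if torch.cuda.is_available() else 'cpu',
    'bounds': (-1., 1.),   # unpacked as np.random.uniform(*bounds, ...)
    'lrate': 1e-3,         # Adam learning rate
    'decay': 1e-5,         # Adam weight_decay
}
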
Example 5
class DDQN(BasePolicy):
    def __init__(self, config):

        self.action_size = config['action_size']
        self.device = config['device']
        self.bounds = config['bounds']

        self.model = BaseNetwork(**config).to(config['device'])
        self.target = copy.deepcopy(self.model)
        self.target.eval()

        self.action_select_eval = CEMOptimizer(**config)
        self.action_select_train = UniformOptimizer(**config)

        self.optimizer = torch.optim.Adam(self.model.parameters(),
                                          config['lrate'],
                                          eps=1e-3,
                                          weight_decay=config['decay'])

    def get_weights(self):
        return (self.model.state_dict(), self.target.state_dict())

    def set_weights(self, weights):
        self.model.load_state_dict(weights[0])
        self.target.load_state_dict(weights[1])

    def load_checkpoint(self, checkpoint_dir):
        """Loads a model from a directory containing a checkpoint."""

        if not os.path.exists(checkpoint_dir):
            raise Exception('No checkpoint directory <%s>' % checkpoint_dir)

        path = os.path.join(checkpoint_dir, 'model.pt')
        self.model.load_state_dict(torch.load(path, self.device))
        self.update()

    def save_checkpoint(self, checkpoint_dir):
        """Saves a model to a directory containing a single checkpoint."""

        if not os.path.exists(checkpoint_dir):
            os.makedirs(checkpoint_dir)

        path = os.path.join(checkpoint_dir, 'model.pt')
        torch.save(self.model.state_dict(), path)

    @torch.no_grad()
    def sample_action(self, state, timestep, explore_prob):
        """Samples an action to perform in the environment."""

        if np.random.random() < explore_prob:
            return np.random.uniform(*self.bounds, size=(self.action_size, ))
        return self.action_select_eval(self.model, state, timestep)[0].detach()

    def train(self, memory, gamma, batch_size, **kwargs):
        """Performs a single step of Q-Learning."""

        self.model.train()

        # Sample a minibatch from the memory buffer
        s0, act, r, s1, done, timestep = memory.sample(batch_size)

        s0 = torch.from_numpy(s0).to(self.device)
        act = torch.from_numpy(act).to(self.device)
        r = torch.from_numpy(r).to(self.device)
        s1 = torch.from_numpy(s1).to(self.device)
        done = torch.from_numpy(done).to(self.device)
        t0 = torch.from_numpy(timestep).to(self.device)
        t1 = torch.from_numpy(timestep + 1).to(self.device)

        pred = self.model(s0, t0, act).view(-1)

        with torch.no_grad():

            # DDQN finds the maximal action for the current policy
            aopt, _ = self.action_select_train(self.model, s1, t1)

            # but uses the q-value from the target network
            target = r + (1. - done) * gamma * self.target(s1, t1,
                                                           aopt).view(-1)

        loss = torch.mean((pred - target)**2)

        self.optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(self.model.parameters(), 10.)
        self.optimizer.step()

        return loss.item()

    def update(self):
        """Copy the network weights every few epochs."""
        self.target.load_state_dict(self.model.state_dict())
        self.target.eval()
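
For reference, the double-Q bootstrap computed in train can be isolated as the self-contained sketch below. The toy Q-functions, the candidate-action tensor (standing in for the samples drawn by UniformOptimizer), and the omission of the timestep argument are all illustrative assumptions rather than part of the original code.

import torch

def ddqn_target(q_online, q_target, s1, candidates, r, done, gamma):
    """Double-Q bootstrap: the online network picks the action, the target
    network evaluates it. q_online / q_target map (state, action) -> (B,)
    values; candidates is a (B, K, A) tensor of K candidate actions."""
    B, K, A = candidates.shape
    with torch.no_grad():
        flat_s = s1.unsqueeze(1).expand(-1, K, -1).reshape(B * K, -1)
        flat_a = candidates.reshape(B * K, A)
        # The online network chooses the best candidate action ...
        best = q_online(flat_s, flat_a).reshape(B, K).argmax(dim=1)
        a_star = candidates[torch.arange(B), best]
        # ... but the bootstrap value comes from the target network.
        return r + (1. - done) * gamma * q_target(s1, a_star)

torch.manual_seed(0)
q_online = lambda s, a: -(a - s[:, :1]).pow(2).sum(dim=1)   # toy online Q
q_target = lambda s, a: 0.9 * q_online(s, a)                # toy target Q
target = ddqn_target(q_online, q_target, torch.randn(4, 3),
                     torch.rand(4, 16, 1), r=torch.ones(4),
                     done=torch.zeros(4), gamma=0.9)
print(target.shape)   # torch.Size([4])
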
Example 6
class CMCRE(BasePolicy):
    def __init__(self, config):

        self.model = BaseNetwork(**config).to(config['device'])

        self.action_size = config['action_size']
        self.device = config['device']
        self.bounds = config['bounds']

        self.action_select_eval = CEMOptimizer(**config)
        self.action_select_train = UniformOptimizer(**config)

        self.optimizer = torch.optim.Adam(self.model.parameters(),
                                          config['lrate'],
                                          eps=1e-3,
                                          weight_decay=config['decay'])

    def get_weights(self):
        return (self.model.state_dict(), )

    def set_weights(self, weights):
        self.model.load_state_dict(weights[0])

    def load_checkpoint(self, checkpoint_dir):
        """Loads a model from a directory containing a checkpoint."""

        if not os.path.exists(checkpoint_dir):
            raise Exception('No checkpoint directory <%s>' % checkpoint_dir)

        path = os.path.join(checkpoint_dir, 'model.pt')
        self.model.load_state_dict(torch.load(path, self.device))

    def save_checkpoint(self, checkpoint_dir):
        """Saves a model to a directory containing a single checkpoint."""

        if not os.path.exists(checkpoint_dir):
            os.makedirs(checkpoint_dir)

        path = os.path.join(checkpoint_dir, 'model.pt')
        torch.save(self.model.state_dict(), path)

    @torch.no_grad()
    def sample_action(self, state, timestep, explore_prob):
        """Samples an action to perform in the environment."""

        if np.random.random() < explore_prob:
            return np.random.uniform(*self.bounds, size=(self.action_size, ))

        return self.action_select_eval(self.model, state, timestep)[0].detach()

    def _loss(self, Vstar, Qstar, r, gamma):
        """Calculates corrected loss over a single episode.

        Assumes that all inputs (Vstar, Qstar, r) belong to a single episode
        only. These are obtained by slicing the batch wherever timestep == 0.
        """

        advantage = Qstar - Vstar

        out = torch.zeros_like(r, requires_grad=False)
        for i in reversed(range(r.shape[0] - 1)):
            out[i] = gamma * (out[i + 1] + (r[i + 1] - advantage[i + 1]))

        # Note that we later normalize over batch size
        loss = ((Qstar - (r + out))**2).sum()

        return loss

    def train(self, memory, gamma, batch_size, **kwargs):

        # Sample full episodes from memory
        s0, act, r, _, _, timestep = memory.sample(batch_size // 8)

        # Used to help compute proper loss per episode
        starts = np.hstack((np.where(timestep == 0)[0], r.shape[0]))

        s0 = torch.from_numpy(s0).to(self.device)
        act = torch.from_numpy(act).to(self.device)
        r = torch.from_numpy(r).to(self.device)
        t0 = torch.from_numpy(timestep).to(self.device)

        # Need both the Q-value of the taken action and the state value V
        Q = self.model(s0, t0, act).view(-1)

        _, V = self.action_select_train(self.model, s0, t0)

        # Sum the loss for each of the episodes
        loss = 0
        for s, e in zip(starts[:-1], starts[1:]):
            loss = loss + self._loss(V[s:e], Q[s:e], r[s:e], gamma)

        loss = loss / s0.shape[0]

        self.optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(self.model.parameters(), 10.)
        self.optimizer.step()

        return loss.item()

    def update(self):
        pass
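
To make the recursion in _loss concrete, the self-contained toy run below uses made-up values for Qstar, Vstar, and the rewards of a three-step episode; it mirrors the loop above and is purely illustrative.

import torch

def corrected_loss(Vstar, Qstar, r, gamma):
    """Same recursion as CMCRE._loss: out[i] accumulates discounted future
    rewards minus future advantages, and the target becomes r + out."""
    advantage = Qstar - Vstar
    out = torch.zeros_like(r)
    for i in reversed(range(r.shape[0] - 1)):
        out[i] = gamma * (out[i + 1] + (r[i + 1] - advantage[i + 1]))
    return ((Qstar - (r + out))**2).sum()

# Three-step toy episode; all numbers are invented for illustration
Q = torch.tensor([0.5, 0.4, 0.9])
V = torch.tensor([0.6, 0.4, 0.9])
r = torch.tensor([0.0, 0.0, 1.0])
print(corrected_loss(V, Q, r, gamma=0.9))   # ~0.356
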
Example 7
class Supervised(BasePolicy):

    def __init__(self, config):

        self.action_size = config['action_size']
        self.device = config['device']
        self.bounds = config['bounds']

        self.model = BaseNetwork(**config).to(config['device'])

        # Note this optimizer is slightly different from the one
        # used for the other models
        self.action_select_eval = SupervisedCEMOptimizer(**config)

        self.optimizer = torch.optim.Adam(self.model.parameters(),
                                          config['lrate'],
                                          eps=1e-3,
                                          weight_decay=config['decay'])

    def get_weights(self):
        return (self.model.state_dict(),)  # as tuple

    def set_weights(self, weights):
        self.model.load_state_dict(weights[0])

    def load_checkpoint(self, checkpoint_dir):
        """Loads a model from a directory containing a checkpoint."""

        if not os.path.exists(checkpoint_dir):
            raise Exception('No checkpoint directory <%s>' % checkpoint_dir)

        path = os.path.join(checkpoint_dir, 'model.pt')
        self.model.load_state_dict(torch.load(path, self.device))

    def save_checkpoint(self, checkpoint_dir):
        """Saves a model to a directory containing a single checkpoint."""

        if not os.path.exists(checkpoint_dir):
            os.makedirs(checkpoint_dir)

        path = os.path.join(checkpoint_dir, 'model.pt')
        torch.save(self.model.state_dict(), path)

    @torch.no_grad()
    def sample_action(self, state, timestep, explore_prob):
        """Samples an action to perform in the environment."""

        if np.random.random() < explore_prob:
            return np.random.uniform(*self.bounds, size=(self.action_size,))
        return self.action_select_eval(self.model, state, timestep)[0].detach()

    def train(self, memory, batch_size, **kwargs):
        """Performs a single training step."""

        s0, act, r, _, _, timestep = memory.sample(batch_size)

        # The dataset contains more failures than successes, so we'll
        # balance the minibatch loss by weighting it by class frequency
        weight = np.sum(r) / (batch_size - np.sum(r))
        weight = np.where(r == 0, weight, 1).astype(np.float32)
        weight = torch.from_numpy(weight).to(self.device).view(-1)

        s0 = torch.from_numpy(s0).to(self.device)
        act = torch.from_numpy(act).to(self.device)
        r = torch.from_numpy(r).to(self.device)
        t0 = torch.from_numpy(timestep).to(self.device)

        pred = self.model(s0, t0, act).clamp(1e-8, 1-1e-8).view(-1)

        # Uses the outcome of the episode as individual step label
        loss = torch.nn.BCELoss(weight=weight)(pred, r)

        self.optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(self.model.parameters(), 10.)
        self.optimizer.step()

        return loss.item()

    def update(self):
        pass
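
The class-frequency weighting in train can be checked in isolation. Below is a minimal sketch with a made-up batch of 8 outcomes containing 2 successes; the numbers are illustrative only.

import numpy as np

# Made-up batch: 2 successes, 6 failures
r = np.array([0., 1., 0., 0., 0., 1., 0., 0.], dtype=np.float32)
batch_size = r.shape[0]

# Same rule as Supervised.train: failures are down-weighted by the
# success/failure ratio, while successes keep a weight of 1
weight = np.sum(r) / (batch_size - np.sum(r))            # 2 / 6
weight = np.where(r == 0, weight, 1).astype(np.float32)
print(weight)   # failures -> 1/3, successes -> 1
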
Example 8
class MCRE(BasePolicy):

    def __init__(self, config):

        self.action_size = config['action_size']
        self.device = config['device']
        self.bounds = config['bounds']

        self.model = BaseNetwork(**config).to(config['device'])

        self.cem = CEMOptimizer(**config)

        self.optimizer = torch.optim.Adam(self.model.parameters(),
                                          config['lrate'],
                                          eps=1e-3,
                                          weight_decay=config['decay'])

    def get_weights(self):
        return (self.model.state_dict(),)

    def set_weights(self, weights):
        self.model.load_state_dict(weights[0])

    def load_checkpoint(self, checkpoint_dir):
        """Loads a model from a directory containing a checkpoint."""

        if not os.path.exists(checkpoint_dir):
            raise Exception('No checkpoint directory <%s>' % checkpoint_dir)

        weights = torch.load(os.path.join(checkpoint_dir, 'model.pt'), self.device)
        self.model.load_state_dict(weights)

    def save_checkpoint(self, checkpoint_dir):
        """Saves a model to a directory containing a single checkpoint."""

        if not os.path.exists(checkpoint_dir):
            os.makedirs(checkpoint_dir)

        path = os.path.join(checkpoint_dir, 'model.pt')
        torch.save(self.model.state_dict(), path)

    @torch.no_grad()
    def sample_action(self, state, timestep, explore_prob):
        """Samples an action to perform using CEM."""

        if np.random.random() < explore_prob:
            return np.random.uniform(*self.bounds, size=(self.action_size,))
        return self.cem(self.model, state, timestep)[0].detach()

    def train(self, memory, gamma, batch_size, **kwargs):

        del gamma  # unused

        # Sample a minibatch from the memory buffer. Note that we sample
        # full grasping episodes in this method, so the output of
        # memory.sample will be episode_length * num_episodes
        s0, act, r, _, _, timestep = memory.sample(batch_size // 8)

        s0 = torch.from_numpy(s0).to(self.device)
        act = torch.from_numpy(act).to(self.device)
        r = torch.from_numpy(r).to(self.device)
        t0 = torch.from_numpy(timestep).to(self.device)

        pred = self.model(s0, t0, act).view(-1)

        # Note that the reward 'r' has been discounted in memory.load
        loss = torch.mean((pred - r) ** 2)

        self.optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(self.model.parameters(), 10.)
        self.optimizer.step()

        return loss.item()

    def update(self):
        pass
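
The comment in train notes that the rewards were already discounted in memory.load, whose implementation is not part of this listing. A plausible per-episode discounting, given purely as an illustrative sketch, would look like the following.

import numpy as np

def discounted_returns(rewards, gamma):
    """Monte Carlo return G_t = r_t + gamma * G_{t+1} for one episode.

    Illustrative only: the actual discounting lives in memory.load.
    """
    out = np.zeros_like(rewards, dtype=np.float32)
    running = 0.
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        out[t] = running
    return out

print(discounted_returns(np.array([0., 0., 1.]), gamma=0.9))   # 0.81, 0.9, 1.0
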
Example 9
class DDPG(BasePolicy):
    def __init__(self, config):

        # Needed for sampling actions
        self.action_size = config['action_size']
        self.device = config['device']
        self.bounds = config['bounds']

        self.critic = BaseNetwork(**config).to(config['device'])
        self.critic_target = copy.deepcopy(self.critic)
        self.critic_target.eval()

        self.actor = Actor(**config).to(config['device'])
        self.actor_target = copy.deepcopy(self.actor)
        self.actor_target.eval()

        self.aopt = torch.optim.Adam(self.actor.parameters(),
                                     config['lrate'],
                                     eps=1e-3,
                                     weight_decay=config['decay'])

        self.copt = torch.optim.Adam(self.critic.parameters(),
                                     config['lrate'],
                                     eps=1e-3,
                                     weight_decay=config['decay'])

    def get_weights(self):
        return (self.actor.state_dict(), self.critic.state_dict(),
                self.actor_target.state_dict(),
                self.critic_target.state_dict())

    def set_weights(self, weights):
        self.actor.load_state_dict(weights[0])
        self.critic.load_state_dict(weights[1])
        self.actor_target.load_state_dict(weights[2])
        self.critic_target.load_state_dict(weights[3])

    def load_checkpoint(self, checkpoint_dir):
        """Loads a model from a directory containing a checkpoint."""

        if not os.path.exists(checkpoint_dir):
            raise Exception('No checkpoint directory <%s>' % checkpoint_dir)

        path = os.path.join(checkpoint_dir, 'actor.pt')
        self.actor.load_state_dict(torch.load(path, self.device))

        path = os.path.join(checkpoint_dir, 'critic.pt')
        self.critic.load_state_dict(torch.load(path, self.device))
        self.update()

    def save_checkpoint(self, checkpoint_dir):
        """Saves a model to a directory containing a single checkpoint."""

        if not os.path.exists(checkpoint_dir):
            os.makedirs(checkpoint_dir)

        path = os.path.join(checkpoint_dir, 'actor.pt')
        torch.save(self.actor.state_dict(), path)

        path = os.path.join(checkpoint_dir, 'critic.pt')
        torch.save(self.critic.state_dict(), path)

    @torch.no_grad()
    def sample_action(self, state, timestep, explore_prob):
        """Samples an action to perform in the environment."""

        if np.random.random() < explore_prob:
            return np.random.uniform(-1, 1, self.action_size)

        self.actor.eval()

        if isinstance(state, np.ndarray):
            state = torch.from_numpy(state).to(self.device)
        if isinstance(timestep, float):
            timestep = torch.tensor([timestep], device=self.device)

        return self.actor(state, timestep).detach()

    def train(self, memory, gamma, batch_size, **kwargs):

        self.actor.train()

        s0, act, r, s1, term, timestep = memory.sample(batch_size)

        s0 = torch.from_numpy(s0).to(self.device)
        act = torch.from_numpy(act).to(self.device)
        s1 = torch.from_numpy(s1).to(self.device)
        r = torch.from_numpy(r).to(self.device)
        term = torch.from_numpy(term).to(self.device)

        t0 = torch.from_numpy(timestep).to(self.device)
        t1 = torch.from_numpy(timestep + 1.).to(self.device)

        # Train the critic
        pred = self.critic(s0, t0, act).view(-1)

        with torch.no_grad():

            at = self.actor_target(s1, t1)
            qt = self.critic_target(s1, t1, at).view(-1)
            target = r + (1. - term) * gamma * qt

        loss = torch.mean((pred - target)**2)

        self.aopt.zero_grad()
        self.copt.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic.parameters(), 10.)
        self.copt.step()
        self.copt.zero_grad()

        # Train the actor by following the policy gradient
        self.aopt.zero_grad()

        action = self.actor(s0, t0)
        q_pred = -self.critic(s0, t0, action).mean()

        q_grad = torch.autograd.grad(q_pred, action)[0]

        action.backward(gradient=q_grad)
        torch.nn.utils.clip_grad_norm_(self.actor.parameters(), 10.)
        self.aopt.step()

        return loss.item()

    def update(self):
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.critic_target.load_state_dict(self.critic.state_dict())
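
The actor update above applies the chain rule in two explicit steps: it takes the gradient of -Q with respect to the action, then pushes that gradient back through the actor with action.backward(gradient=...). The sketch below, built on a toy nn.Linear actor and critic with made-up shapes, checks that this is equivalent to calling backward() on -Q directly; it is an illustration, not part of the original code.

import torch

torch.manual_seed(0)
actor = torch.nn.Linear(3, 2)    # toy actor: state (B, 3) -> action (B, 2)
critic = torch.nn.Linear(5, 1)   # toy critic: [state, action] -> value
s = torch.randn(4, 3)

# Variant A: the chained form used in DDPG.train above
a = actor(s)
loss = -critic(torch.cat([s, a], dim=1)).mean()
q_grad = torch.autograd.grad(loss, a)[0]
a.backward(gradient=q_grad)
chained = [p.grad.clone() for p in actor.parameters()]

# Variant B: backpropagate -Q through actor and critic in one call
actor.zero_grad()
a = actor(s)
loss = -critic(torch.cat([s, a], dim=1)).mean()
loss.backward()
direct = [p.grad.clone() for p in actor.parameters()]

print(all(torch.allclose(x, y) for x, y in zip(chained, direct)))   # True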