Example
File: ddpg.py, Project: ozcell/sldr
class DDPG_BD(object):
    def __init__(self,
                 observation_space,
                 action_space,
                 optimizer,
                 Actor,
                 Critic,
                 loss_func,
                 gamma,
                 tau,
                 out_func=K.sigmoid,
                 discrete=True,
                 regularization=False,
                 normalized_rewards=False,
                 agent_id=0,
                 object_Qfunc=None,
                 backward_dyn=None,
                 object_policy=None,
                 reward_fun=None,
                 masked_with_r=False,
                 clip_Q_neg=None,
                 dtype=K.float32,
                 device="cuda"):

        super(DDPG_BD, self).__init__()

        optimizer, lr = optimizer
        actor_lr, critic_lr = lr

        self.loss_func = loss_func
        self.gamma = gamma
        self.tau = tau
        self.out_func = out_func
        self.discrete = discrete
        self.regularization = regularization
        self.normalized_rewards = normalized_rewards
        self.dtype = dtype
        self.device = device
        self.observation_space = observation_space
        self.action_space = action_space
        self.agent_id = agent_id
        self.object_Qfunc = object_Qfunc
        self.object_policy = object_policy
        self.masked_with_r = masked_with_r
        if clip_Q_neg is not None:
            self.clip_Q_neg = clip_Q_neg
        else:
            if self.object_Qfunc is None:
                self.clip_Q_neg = -1. / (1. - self.gamma)
            else:
                self.clip_Q_neg = -2. / (1. - self.gamma)

        # model initialization
        self.entities = []

        # actors
        self.actors = []
        self.actors_target = []
        self.actors_optim = []

        self.actors.append(
            Actor(observation_space, action_space[agent_id], discrete,
                  out_func).to(device))
        self.actors_target.append(
            Actor(observation_space, action_space[agent_id], discrete,
                  out_func).to(device))
        self.actors_optim.append(
            optimizer(self.actors[0].parameters(), lr=actor_lr))

        hard_update(self.actors_target[0], self.actors[0])

        self.entities.extend(self.actors)
        self.entities.extend(self.actors_target)
        self.entities.extend(self.actors_optim)

        # critics
        self.critics = []
        self.critics_target = []
        self.critics_optim = []

        self.critics.append(
            Critic(observation_space, action_space[agent_id]).to(device))
        self.critics_target.append(
            Critic(observation_space, action_space[agent_id]).to(device))
        self.critics_optim.append(
            optimizer(self.critics[0].parameters(), lr=critic_lr))

        hard_update(self.critics_target[0], self.critics[0])

        self.entities.extend(self.critics)
        self.entities.extend(self.critics_target)
        self.entities.extend(self.critics_optim)

        # backward dynamics model
        if backward_dyn is None:
            self.backward = BackwardDyn(observation_space,
                                        action_space[1]).to(device)
            self.backward_optim = optimizer(self.backward.parameters(),
                                            lr=critic_lr)
            self.entities.append(self.backward)
            self.entities.append(self.backward_optim)

            self.backward_otw = BackwardDyn(observation_space,
                                            action_space[1]).to(device)
            self.backward_otw_optim = optimizer(self.backward_otw.parameters(),
                                                lr=critic_lr)
            self.entities.append(self.backward_otw)
            self.entities.append(self.backward_otw_optim)
        else:
            self.backward = backward_dyn
            self.backward.eval()
            self.entities.append(self.backward)

        # Learnt Q function for object
        if self.object_Qfunc is not None:
            self.object_Qfunc.eval()
            self.entities.append(self.object_Qfunc)

        # Learnt policy for object
        if self.object_policy is not None:
            self.object_policy.eval()
            self.entities.append(self.object_policy)

        if reward_fun is not None:
            self.get_obj_reward = reward_fun
        else:
            self.get_obj_reward = self.reward_fun

        print('clipped + r')

    def to_cpu(self):
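        # move all networks to the CPU, skipping optimizer objects (they have no .cpu()/.cuda() methods)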
        for entity in self.entities:
            if type(entity) != type(self.actors_optim[0]):
                entity.cpu()
        self.device = 'cpu'

    def to_cuda(self):
        for entity in self.entities:
            if type(entity) != type(self.actors_optim[0]):
                entity.cuda()
        self.device = 'cuda'

    def select_action(self, state, exploration=False):
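        # deterministic actor output, optionally perturbed by exploration noise, clamped to the action bounds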
        self.actors[0].eval()
        with K.no_grad():
            mu = self.actors[0](state.to(self.device))
        self.actors[0].train()
        if exploration:
            mu = K.tensor(exploration.get_noisy_action(mu.cpu().numpy()),
                          dtype=self.dtype,
                          device=self.device)
        mu = mu.clamp(int(self.action_space[self.agent_id].low[0]),
                      int(self.action_space[self.agent_id].high[0]))

        return mu

    def update_parameters(self,
                          batch,
                          normalizer=None,
                          running_rintr_mean=None):

        # per-agent slice widths used below: the stacked batch['o'] holds both agents'
        # observations side by side, and batch['u'] both agents' actions
        observation_space = self.observation_space - K.tensor(
            batch['g'], dtype=self.dtype, device=self.device).shape[1]
        action_space = self.action_space[0].shape[0]

        V = K.zeros((len(batch['o']), 1), dtype=self.dtype, device=self.device)

        s1 = K.cat([
            K.tensor(batch['o'], dtype=self.dtype,
                     device=self.device)[:, 0:observation_space],
            K.tensor(batch['g'], dtype=self.dtype, device=self.device)
        ],
                   dim=-1)
        s2 = K.cat([
            K.tensor(batch['o'], dtype=self.dtype,
                     device=self.device)[:, observation_space:],
            K.tensor(batch['g'], dtype=self.dtype, device=self.device)
        ],
                   dim=-1)

        a1 = K.tensor(batch['u'], dtype=self.dtype,
                      device=self.device)[:, 0:action_space]
        a2 = K.tensor(batch['u'], dtype=self.dtype,
                      device=self.device)[:, action_space:]

        s1_ = K.cat([
            K.tensor(batch['o_2'], dtype=self.dtype,
                     device=self.device)[:, 0:observation_space],
            K.tensor(batch['g'], dtype=self.dtype, device=self.device)
        ],
                    dim=-1)
        s2_ = K.cat([
            K.tensor(batch['o_2'], dtype=self.dtype,
                     device=self.device)[:, observation_space:],
            K.tensor(batch['g'], dtype=self.dtype, device=self.device)
        ],
                    dim=-1)

        if normalizer is not None and normalizer[0] is not None:
            s1 = normalizer[0].preprocess(s1)
            s1_ = normalizer[0].preprocess(s1_)

        if normalizer is not None and normalizer[1] is not None:
            s2 = normalizer[1].preprocess(s2)
            s2_ = normalizer[1].preprocess(s2_)

        s, s_, a = (s1, s1_, a1) if self.agent_id == 0 else (s2, s2_, a2)
        a_ = self.actors_target[0](s_)

        # environment reward, augmented with the intrinsic object reward when a
        # learnt object Q-function is available
        r = K.tensor(batch['r'], dtype=self.dtype,
                     device=self.device).unsqueeze(1)
        if self.object_Qfunc is not None:
            r_intr = self.get_obj_reward(s2, s2_)
            if self.masked_with_r:
                r = r_intr * K.abs(r) + r
            else:
                r = r_intr + r

        Q = self.critics[0](s, a)
        V = self.critics_target[0](s_, a_).detach()

        # clipped TD target: rewards are non-positive, so the target Q is clamped to [clip_Q_neg, 0]
        target_Q = (V * self.gamma) + r
        target_Q = target_Q.clamp(self.clip_Q_neg, 0.)

        loss_critic = self.loss_func(Q, target_Q)

        self.critics_optim[0].zero_grad()
        loss_critic.backward()
        self.critics_optim[0].step()

        a = self.actors[0](s)

        loss_actor = -self.critics[0](s, a).mean()

        if self.regularization:
            # action-magnitude (L2) regularization on the actor output
            loss_actor += (self.actors[0](s)**2).mean()

        self.actors_optim[0].zero_grad()
        loss_actor.backward()
        self.actors_optim[0].step()

        return loss_critic.item(), loss_actor.item()

    def update_target(self):

        soft_update(self.actors_target[0], self.actors[0], self.tau)
        soft_update(self.critics_target[0], self.critics[0], self.tau)

    def estimate_obj_action(self, state, next_state):
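        # infer the object action that explains the transition (state -> next_state)
        # using the backward dynamics model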
        with K.no_grad():
            action = self.backward(state.to(self.device),
                                   next_state.to(self.device))

        return action

    def get_obj_action(self, state):
        with K.no_grad():
            action = self.object_policy(state.to(self.device))

        return action

    def reward_fun(self, state, next_state):
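        # intrinsic reward: Q-value of the object action inferred by the backward model
        # minus the Q-value of the object policy's action, clipped to [-1, 0]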
        with K.no_grad():
            action = self.backward(state.to(self.device),
                                   next_state.to(self.device))
            opt_action = self.object_policy(state.to(self.device))

            reward = self.object_Qfunc(state.to(self.device),
                                       action) - self.object_Qfunc(
                                           state.to(self.device), opt_action)
        return reward.clamp(min=-1.0, max=0.0)

    def update_backward(self, batch, normalizer=None):
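        # supervised update of the backward dynamics model: predict the object action a2 from (s2, s2_)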

        observation_space = self.observation_space - K.tensor(
            batch['g'], dtype=self.dtype, device=self.device).shape[1]
        action_space = self.action_space[0].shape[0]

        s2 = K.cat([
            K.tensor(batch['o'], dtype=self.dtype,
                     device=self.device)[:, observation_space:],
            K.tensor(batch['g'], dtype=self.dtype, device=self.device)
        ],
                   dim=-1)

        a2 = K.tensor(batch['u'], dtype=self.dtype,
                      device=self.device)[:, action_space:]

        s2_ = K.cat([
            K.tensor(batch['o_2'], dtype=self.dtype,
                     device=self.device)[:, observation_space:],
            K.tensor(batch['g'], dtype=self.dtype, device=self.device)
        ],
                    dim=-1)

        if normalizer is not None and normalizer[1] is not None:
            s2 = normalizer[1].preprocess(s2)
            s2_ = normalizer[1].preprocess(s2_)

        a2_pred = self.backward(s2, s2_)

        loss_backward = self.loss_func(a2_pred, a2)

        self.backward_optim.zero_grad()
        loss_backward.backward()
        self.backward_optim.step()

        return loss_backward.item()

    def update_backward_otw(self, batch, normalizer=None):
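        # same supervised update, applied to the second backward dynamics model (backward_otw)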

        observation_space = self.observation_space - K.tensor(
            batch['g'], dtype=self.dtype, device=self.device).shape[1]
        action_space = self.action_space[0].shape[0]

        s2 = K.cat([
            K.tensor(batch['o'], dtype=self.dtype,
                     device=self.device)[:, observation_space:],
            K.tensor(batch['g'], dtype=self.dtype, device=self.device)
        ],
                   dim=-1)

        a2 = K.tensor(batch['u'], dtype=self.dtype,
                      device=self.device)[:, action_space:]

        s2_ = K.cat([
            K.tensor(batch['o_2'], dtype=self.dtype,
                     device=self.device)[:, observation_space:],
            K.tensor(batch['g'], dtype=self.dtype, device=self.device)
        ],
                    dim=-1)

        if normalizer is not None and normalizer[1] is not None:
            s2 = normalizer[1].preprocess(s2)
            s2_ = normalizer[1].preprocess(s2_)

        a2_pred = self.backward_otw(s2, s2_)

        loss_backward_otw = self.loss_func(a2_pred, a2)

        self.backward_otw_optim.zero_grad()
        loss_backward_otw.backward()
        self.backward_otw_optim.step()

        return loss_backward_otw.item()
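
A minimal construction sketch follows, for orientation only. It is not taken from the project: the models import path, the dimensions, the learning rates, and the loss function are illustrative assumptions; Actor and Critic stand in for the project's own network classes, whose constructors are called exactly as in __init__ above.

# Minimal usage sketch (assumptions: the 'models' module path, the dimensions,
# and the hyperparameters below are illustrative, not taken from the project).
import torch as K
import gym.spaces

from ddpg import DDPG_BD            # the file shown above
from models import Actor, Critic    # hypothetical module exposing the project's networks

obs_dim = 16                                              # per-agent observation width, goal included
act_box = gym.spaces.Box(low=-1.0, high=1.0, shape=(4,))
action_space = [act_box, act_box]                         # one Box per agent, as indexed in __init__

agent = DDPG_BD(obs_dim,
                action_space,
                (K.optim.Adam, (1e-4, 1e-3)),             # (optimizer class, (actor_lr, critic_lr))
                Actor,
                Critic,
                K.nn.functional.mse_loss,
                gamma=0.98,
                tau=0.05,
                device='cuda' if K.cuda.is_available() else 'cpu')

state = K.rand(1, obs_dim)                                # a single normalized state
action = agent.select_action(state)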