Example #1
class DDPG_BD(object):
    def __init__(self,
                 observation_space,
                 action_space,
                 optimizer,
                 Actor,
                 Critic,
                 loss_func,
                 gamma,
                 tau,
                 out_func=K.sigmoid,
                 discrete=True,
                 regularization=False,
                 normalized_rewards=False,
                 agent_id=0,
                 object_Qfunc=None,
                 backward_dyn=None,
                 object_policy=None,
                 reward_fun=None,
                 masked_with_r=False,
                 clip_Q_neg=None,
                 goal_space=None,
                 dtype=K.float32,
                 device="cuda"):

        super(DDPG_BD, self).__init__()

        optimizer, lr = optimizer
        actor_lr, critic_lr = lr

        self.loss_func = loss_func
        self.gamma = gamma
        self.tau = tau
        self.out_func = out_func
        self.discrete = discrete
        self.regularization = regularization
        self.normalized_rewards = normalized_rewards
        self.dtype = dtype
        self.device = device
        self.observation_space = observation_space
        self.action_space = action_space
        self.agent_id = agent_id
        self.object_Qfunc = object_Qfunc
        self.object_policy = object_policy
        self.masked_with_r = masked_with_r
        if clip_Q_neg is not None:
            self.clip_Q_neg = clip_Q_neg
        else:
            if self.object_Qfunc is None:
                self.clip_Q_neg = -1. / (1. - self.gamma)
            else:
                self.clip_Q_neg = -2. / (1. - self.gamma)

        # model initialization
        self.entities = []

        # actors
        self.actors = []
        self.actors_target = []
        self.actors_optim = []

        self.actors.append(
            Actor(observation_space, action_space[agent_id], discrete,
                  out_func).to(device))
        self.actors_target.append(
            Actor(observation_space, action_space[agent_id], discrete,
                  out_func).to(device))
        self.actors_optim.append(
            optimizer(self.actors[0].parameters(), lr=actor_lr))

        hard_update(self.actors_target[0], self.actors[0])

        self.entities.extend(self.actors)
        self.entities.extend(self.actors_target)
        self.entities.extend(self.actors_optim)

        # critics
        self.critics = []
        self.critics_target = []
        self.critics_optim = []

        self.critics.append(
            Critic(observation_space, action_space[agent_id]).to(device))
        self.critics_target.append(
            Critic(observation_space, action_space[agent_id]).to(device))
        self.critics_optim.append(
            optimizer(self.critics[0].parameters(), lr=critic_lr))

        hard_update(self.critics_target[0], self.critics[0])

        self.entities.extend(self.critics)
        self.entities.extend(self.critics_target)
        self.entities.extend(self.critics_optim)

        # backward dynamics model
        if backward_dyn is None:
            self.backward = BackwardDyn(observation_space,
                                        action_space[1]).to(device)
            self.backward_optim = optimizer(self.backward.parameters(),
                                            lr=critic_lr)
            self.entities.append(self.backward)
            self.entities.append(self.backward_optim)

        else:
            self.backward = backward_dyn
            self.backward.eval()
            self.entities.append(self.backward)

        # Learnt Q function for object
        if self.object_Qfunc is not None:
            self.object_Qfunc.eval()
            self.entities.append(self.object_Qfunc)

        # Learnt policy for object
        if self.object_policy is not None:
            self.object_policy.eval()
            self.entities.append(self.object_policy)

        if reward_fun is not None:
            self.get_obj_reward = reward_fun
        else:
            self.get_obj_reward = self.reward_fun

        print('clipped + r')

    def to_cpu(self):
        for entity in self.entities:
            if type(entity) != type(self.actors_optim[0]):
                entity.cpu()
        self.device = 'cpu'

    def to_cuda(self):
        for entity in self.entities:
            if type(entity) != type(self.actors_optim[0]):
                entity.cuda()
        self.device = 'cuda'

    def select_action(self, state, exploration=False):
        self.actors[0].eval()
        with K.no_grad():
            mu = self.actors[0](state.to(self.device))
        self.actors[0].train()
        if exploration:
            mu = K.tensor(exploration.get_noisy_action(mu.cpu().numpy()),
                          dtype=self.dtype,
                          device=self.device)
        mu = mu.clamp(int(self.action_space[self.agent_id].low[0]),
                      int(self.action_space[self.agent_id].high[0]))

        return mu

    def update_parameters(self,
                          batch,
                          normalizer=None,
                          running_rintr_mean=None):

        observation_space = self.observation_space - K.tensor(
            batch['g'], dtype=self.dtype, device=self.device).shape[1]
        action_space = self.action_space[0].shape[0]

        V = K.zeros((len(batch['o']), 1), dtype=self.dtype, device=self.device)

        s1 = K.cat([
            K.tensor(batch['o'], dtype=self.dtype,
                     device=self.device)[:, 0:observation_space],
            K.tensor(batch['g'], dtype=self.dtype, device=self.device)
        ],
                   dim=-1)
        s2 = K.cat([
            K.tensor(batch['o'], dtype=self.dtype,
                     device=self.device)[:, observation_space:],
            K.tensor(batch['g'], dtype=self.dtype, device=self.device)
        ],
                   dim=-1)

        a1 = K.tensor(batch['u'], dtype=self.dtype,
                      device=self.device)[:, 0:action_space]
        a2 = K.tensor(batch['u'], dtype=self.dtype,
                      device=self.device)[:, action_space:]

        s1_ = K.cat([
            K.tensor(batch['o_2'], dtype=self.dtype,
                     device=self.device)[:, 0:observation_space],
            K.tensor(batch['g'], dtype=self.dtype, device=self.device)
        ],
                    dim=-1)
        s2_ = K.cat([
            K.tensor(batch['o_2'], dtype=self.dtype,
                     device=self.device)[:, observation_space:],
            K.tensor(batch['g'], dtype=self.dtype, device=self.device)
        ],
                    dim=-1)

        if normalizer[0] is not None:
            s1 = normalizer[0].preprocess(s1)
            s1_ = normalizer[0].preprocess(s1_)

        if normalizer[1] is not None:
            s2 = normalizer[1].preprocess(s2)
            s2_ = normalizer[1].preprocess(s2_)

        s, s_, a = (s1, s1_, a1) if self.agent_id == 0 else (s2, s2_, a2)
        a_ = self.actors_target[0](s_)

        if self.object_Qfunc is None:
            r = K.tensor(batch['r'], dtype=self.dtype,
                         device=self.device).unsqueeze(1)
        else:
            r = K.tensor(batch['r'], dtype=self.dtype,
                         device=self.device).unsqueeze(1)
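            # intrinsic reward from the object's learnt Q-function (see reward_fun below),
            # optionally masked by |r| before being added to the environment reward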
            r_intr = self.get_obj_reward(s2, s2_)
            if self.masked_with_r:
                r = r_intr * K.abs(r) + r
            else:
                r = r_intr + r

        Q = self.critics[0](s, a)
        V = self.critics_target[0](s_, a_).detach()

        target_Q = (V * self.gamma) + r
        target_Q = target_Q.clamp(self.clip_Q_neg, 0.)

        loss_critic = self.loss_func(Q, target_Q)

        self.critics_optim[0].zero_grad()
        loss_critic.backward()
        self.critics_optim[0].step()

        a = self.actors[0](s)

        loss_actor = -self.critics[0](s, a).mean()

        if self.regularization:
            loss_actor += (self.actors[0](s)**2).mean() * 1

        self.actors_optim[0].zero_grad()
        loss_actor.backward()
        self.actors_optim[0].step()

        return loss_critic.item(), loss_actor.item()

    def update_target(self):

        soft_update(self.actors_target[0], self.actors[0], self.tau)
        soft_update(self.critics_target[0], self.critics[0], self.tau)

    def estimate_obj_action(self, state, next_state):
        with K.no_grad():
            action = self.backward(state.to(self.device),
                                   next_state.to(self.device))

        return action

    def get_obj_action(self, state):
        with K.no_grad():
            action = self.object_policy(state.to(self.device))

        return action

    def reward_fun(self, state, next_state):
        with K.no_grad():
            action = self.backward(state.to(self.device),
                                   next_state.to(self.device))
            opt_action = self.object_policy(state.to(self.device))

            reward = self.object_Qfunc(state.to(self.device),
                                       action) - self.object_Qfunc(
                                           state.to(self.device), opt_action)
        return reward.clamp(min=-1.0, max=0.0)

    def update_backward(self, batch, normalizer=None):

        observation_space = self.observation_space - K.tensor(
            batch['g'], dtype=self.dtype, device=self.device).shape[1]
        action_space = self.action_space[0].shape[0]

        s2 = K.cat([
            K.tensor(batch['o'], dtype=self.dtype,
                     device=self.device)[:, observation_space:],
            K.tensor(batch['g'], dtype=self.dtype, device=self.device)
        ],
                   dim=-1)

        a2 = K.tensor(batch['u'], dtype=self.dtype,
                      device=self.device)[:, action_space:]

        s2_ = K.cat([
            K.tensor(batch['o_2'], dtype=self.dtype,
                     device=self.device)[:, observation_space:],
            K.tensor(batch['g'], dtype=self.dtype, device=self.device)
        ],
                    dim=-1)

        if normalizer[1] is not None:
            s2 = normalizer[1].preprocess(s2)
            s2_ = normalizer[1].preprocess(s2_)

        a2_pred = self.backward(s2, s2_)

        loss_backward = self.loss_func(a2_pred, a2)

        self.backward_optim.zero_grad()
        loss_backward.backward()
        self.backward_optim.step()

        return loss_backward.item()
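
The examples in this listing call two small helpers, hard_update and soft_update, that are defined elsewhere in the project. As a rough sketch of what such helpers typically do (an assumption, not code from this listing; K is taken to be PyTorch, i.e. import torch as K):

def hard_update(target, source):
    # exact parameter copy, used once right after the networks are constructed
    for t, s in zip(target.parameters(), source.parameters()):
        t.data.copy_(s.data)

def soft_update(target, source, tau):
    # Polyak averaging, target <- (1 - tau) * target + tau * source,
    # which is what update_target() relies on to track the online networks
    for t, s in zip(target.parameters(), source.parameters()):
        t.data.copy_(t.data * (1.0 - tau) + s.data * tau)
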
Example #2
class DDPG_BD(object):
    def __init__(self,
                 observation_space,
                 action_space,
                 optimizer,
                 Actor,
                 Critic,
                 loss_func,
                 gamma,
                 tau,
                 out_func=K.sigmoid,
                 discrete=True,
                 regularization=False,
                 normalized_rewards=False,
                 object_Qfunc=None,
                 backward_dyn=None,
                 dtype=K.float32,
                 device="cuda"):

        super(DDPG_BD, self).__init__()

        optimizer, lr = optimizer
        actor_lr, critic_lr = lr

        self.loss_func = loss_func
        self.gamma = gamma
        self.tau = tau
        self.out_func = out_func
        self.discrete = discrete
        self.regularization = regularization
        self.normalized_rewards = normalized_rewards
        self.dtype = dtype
        self.device = device
        self.action_space = action_space

        # model initialization
        self.entities = []

        # actors
        self.actors = []
        self.actors_target = []
        self.actors_optim = []

        self.actors.append(
            Actor(observation_space, action_space, discrete,
                  out_func).to(device))
        self.actors_target.append(
            Actor(observation_space, action_space, discrete,
                  out_func).to(device))
        self.actors_optim.append(
            optimizer(self.actors[0].parameters(), lr=actor_lr))

        hard_update(self.actors_target[0], self.actors[0])

        self.entities.extend(self.actors)
        self.entities.extend(self.actors_target)
        self.entities.extend(self.actors_optim)

        # critics
        self.critics = []
        self.critics_target = []
        self.critics_optim = []

        self.critics.append(Critic(observation_space, action_space).to(device))
        self.critics_target.append(
            Critic(observation_space, action_space).to(device))
        self.critics_optim.append(
            optimizer(self.critics[0].parameters(), lr=critic_lr))

        hard_update(self.critics_target[0], self.critics[0])

        self.entities.extend(self.critics)
        self.entities.extend(self.critics_target)
        self.entities.extend(self.critics_optim)

        # backward dynamics model
        if backward_dyn is None:
            self.backward = BackwardDyn(observation_space,
                                        action_space).to(device)
            self.backward_optim = optimizer(self.backward.parameters(),
                                            lr=critic_lr)
            self.entities.append(self.backward)
            self.entities.append(self.backward_optim)
        else:
            self.backward = backward_dyn.to(device)
            self.backward.eval()
            self.entities.append(self.backward)

        # Learnt Q function for object
        if object_Qfunc is not None:
            self.object_Qfunc = object_Qfunc
            self.object_Qfunc.eval()
            self.entities.append(self.object_Qfunc)

    def to_cpu(self):
        for entity in self.entities:
            if type(entity) != type(self.actors_optim[0]):
                entity.cpu()
        self.device = 'cpu'

    def to_cuda(self):
        for entity in self.entities:
            if type(entity) != type(self.actors_optim[0]):
                entity.cuda()
        self.device = 'cuda'

    def select_action(self, state, exploration=False):
        self.actors[0].eval()
        with K.no_grad():
            mu = self.actors[0](state.to(self.device))
        self.actors[0].train()
        if exploration:
            mu = K.tensor(exploration.get_noisy_action(mu.cpu().numpy()),
                          dtype=self.dtype,
                          device=self.device)
        mu = mu.clamp(int(self.action_space.low[0]),
                      int(self.action_space.high[0]))

        return mu

    def update_parameters(self,
                          batch,
                          normalizer=None,
                          use_object_Qfunc=False):

        V = K.zeros((len(batch['o']), 1), dtype=self.dtype, device=self.device)

        s = K.cat([
            K.tensor(batch['o'], dtype=self.dtype, device=self.device),
            K.tensor(batch['g'], dtype=self.dtype, device=self.device)
        ],
                  dim=-1)
        a = K.tensor(batch['u'], dtype=self.dtype, device=self.device)
        r = K.tensor(batch['r'], dtype=self.dtype,
                     device=self.device).unsqueeze(1)
        s_ = K.cat([
            K.tensor(batch['o_2'], dtype=self.dtype, device=self.device),
            K.tensor(batch['g'], dtype=self.dtype, device=self.device)
        ],
                   dim=-1)
        a_ = K.zeros_like(a)

        if normalizer is not None:
            s = normalizer.preprocess(s)
            s_ = normalizer.preprocess(s_)

        Q = self.critics[0](s, a)

        a_ = self.actors_target[0](s_)
        V = self.critics_target[0](s_, a_).detach()

        if use_object_Qfunc:
            r = self.get_obj_reward(s, s_)
            target_Q = (V * self.gamma) + r
        else:
            target_Q = (V * self.gamma) + r
            target_Q = target_Q.clamp(-1. / (1. - self.gamma), 0.)

        loss_critic = self.loss_func(Q, target_Q)

        self.critics_optim[0].zero_grad()
        loss_critic.backward()
        self.critics_optim[0].step()

        a = self.actors[0](s)

        loss_actor = -self.critics[0](s, a).mean()

        if self.regularization:
            loss_actor += (self.actors[0](s)**2).mean() * 1

        self.actors_optim[0].zero_grad()
        loss_actor.backward()
        self.actors_optim[0].step()

        return loss_critic.item(), loss_actor.item()

    def update_target(self):

        soft_update(self.actors_target[0], self.actors[0], self.tau)
        soft_update(self.critics_target[0], self.critics[0], self.tau)

    def estimate_obj_action(self, state, next_state):
        with K.no_grad():
            action = self.backward(state.to(self.device),
                                   next_state.to(self.device))

        return action

    def get_obj_reward(self, state, next_state):
        with K.no_grad():
            action = self.backward(state.to(self.device),
                                   next_state.to(self.device))
            reward = self.object_Qfunc(state.to(self.device), action)

        return reward

    def update_backward(self, batch, normalizer=None):

        s = K.cat([
            K.tensor(batch['o'], dtype=self.dtype, device=self.device),
            K.tensor(batch['g'], dtype=self.dtype, device=self.device)
        ],
                  dim=-1)
        a = K.tensor(batch['u'], dtype=self.dtype, device=self.device)
        s_ = K.cat([
            K.tensor(batch['o_2'], dtype=self.dtype, device=self.device),
            K.tensor(batch['g'], dtype=self.dtype, device=self.device)
        ],
                   dim=-1)

        if normalizer is not None:
            s = normalizer.preprocess(s)
            s_ = normalizer.preprocess(s_)

        a_pred = self.backward(s, s_)

        loss_backward = self.loss_func(a_pred, a)

        self.backward_optim.zero_grad()
        loss_backward.backward()
        self.backward_optim.step()

        return loss_backward.item()
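
A hypothetical construction sketch for this variant of DDPG_BD (Actor, Critic, env, obs_dim, goal_dim, state and noise are placeholder names, not part of the listing; only the argument layout follows from __init__ above, which unpacks optimizer as a (class, (actor_lr, critic_lr)) pair):

import torch as K
import torch.nn.functional as F

agent = DDPG_BD(observation_space=obs_dim + goal_dim,     # flat observation size incl. goal
                action_space=env.action_space,            # gym Box; .low/.high used in select_action
                optimizer=(K.optim.Adam, (1e-4, 1e-3)),    # (optimizer class, (actor_lr, critic_lr))
                Actor=Actor,
                Critic=Critic,
                loss_func=F.mse_loss,
                gamma=0.98,
                tau=0.05,
                out_func=K.tanh,
                discrete=False,
                device="cuda")

action = agent.select_action(state, exploration=noise)    # state: tensor batch, noise: exploration helper
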
Example #3
class MADDPG_RAE(object):
    def __init__(self,
                 observation_space,
                 action_space,
                 optimizer,
                 Actor,
                 Critic,
                 loss_func,
                 gamma,
                 tau,
                 out_func=K.sigmoid,
                 discrete=True,
                 regularization=False,
                 normalized_rewards=False,
                 dtype=K.float32,
                 device="cuda"):

        super(MADDPG_RAE, self).__init__()

        optimizer, lr = optimizer
        actor_lr, critic_lr = lr

        self.loss_func = loss_func
        self.gamma = gamma
        self.tau = tau
        self.out_func = out_func
        self.discrete = discrete
        self.regularization = regularization
        self.normalized_rewards = normalized_rewards
        self.dtype = dtype
        self.device = device
        self.observation_space = observation_space
        self.action_space = action_space

        # model initialization
        self.entities = []

        # actors
        self.actors = []
        self.actors_target = []
        self.actors_optim = []

        for i in range(2):
            self.actors.append(
                Actor(observation_space, action_space[i], discrete,
                      out_func).to(device))
            self.actors_target.append(
                Actor(observation_space, action_space[i], discrete,
                      out_func).to(device))
            self.actors_optim.append(
                optimizer(self.actors[i].parameters(), lr=actor_lr))

        for i in range(2):
            hard_update(self.actors_target[i], self.actors[i])

        self.entities.extend(self.actors)
        self.entities.extend(self.actors_target)
        self.entities.extend(self.actors_optim)

        # critics
        self.critics = []
        self.critics_target = []
        self.critics_optim = []

        for i in range(2):
            self.critics.append(
                Critic(observation_space, action_space[2]).to(device))
            self.critics_target.append(
                Critic(observation_space, action_space[2]).to(device))
            self.critics_optim.append(
                optimizer(self.critics[i].parameters(), lr=critic_lr))

        for i in range(2):
            hard_update(self.critics_target[i], self.critics[i])

        self.entities.extend(self.critics)
        self.entities.extend(self.critics_target)
        self.entities.extend(self.critics_optim)

        # backward dynamics model
        self.backward = BackwardDyn(3, action_space[1]).to(device)
        self.backward_optim = optimizer(self.backward.parameters(),
                                        lr=critic_lr)

        self.entities.append(self.backward)
        self.entities.append(self.backward_optim)

    def to_cpu(self):
        for entity in self.entities:
            if type(entity) != type(self.actors_optim[0]):
                entity.cpu()
        self.device = 'cpu'

    def to_cuda(self):
        for entity in self.entities:
            if type(entity) != type(self.actors_optim[0]):
                entity.cuda()
        self.device = 'cuda'

    def select_action(self, state, i_agent, exploration=False):
        self.actors[i_agent].eval()
        with K.no_grad():
            mu = self.actors[i_agent](state.to(self.device))
        self.actors[i_agent].train()
        if exploration:
            mu = K.tensor(exploration.get_noisy_action(mu.cpu().numpy()),
                          dtype=self.dtype,
                          device=self.device)
        mu = mu.clamp(int(self.action_space[i_agent].low[0]),
                      int(self.action_space[i_agent].high[0]))

        return mu

    def update_parameters(self, batch, i_agent, normalizer=None):

        observation_space = self.observation_space - K.tensor(
            batch['g'], dtype=self.dtype, device=self.device).shape[1]
        action_space = self.action_space[0].shape[0]

        mask = K.tensor(tuple(
            map(lambda ai_object: ai_object == 0, K.tensor(batch['o'][:,
                                                                      -1]))),
                        dtype=K.uint8,
                        device=self.device)

        V = K.zeros((len(batch['o']), 1), dtype=self.dtype, device=self.device)

        s1 = K.cat([
            K.tensor(batch['o'], dtype=self.dtype,
                     device=self.device)[:, 0:observation_space],
            K.tensor(batch['g'], dtype=self.dtype, device=self.device)
        ],
                   dim=-1)
        s2 = K.cat([
            K.tensor(batch['o'], dtype=self.dtype,
                     device=self.device)[:, observation_space:],
            K.tensor(batch['g'], dtype=self.dtype, device=self.device)
        ],
                   dim=-1)

        a1 = K.tensor(batch['u'], dtype=self.dtype,
                      device=self.device)[:, 0:action_space]
        a2 = K.tensor(batch['u'], dtype=self.dtype,
                      device=self.device)[:, action_space:]

        r = K.tensor(batch['r'], dtype=self.dtype,
                     device=self.device).unsqueeze(1)

        s1_ = K.cat([
            K.tensor(batch['o_2'], dtype=self.dtype,
                     device=self.device)[:, 0:observation_space],
            K.tensor(batch['g'], dtype=self.dtype, device=self.device)
        ],
                    dim=-1)
        s2_ = K.cat([
            K.tensor(batch['o_2'], dtype=self.dtype,
                     device=self.device)[:, observation_space:],
            K.tensor(batch['g'], dtype=self.dtype, device=self.device)
        ],
                    dim=-1)
        s2__ = K.cat([
            K.tensor(batch['o_3'], dtype=self.dtype,
                     device=self.device)[:, observation_space:],
            K.tensor(batch['g'], dtype=self.dtype, device=self.device)
        ],
                     dim=-1)

        ag = K.tensor(batch['ag'], dtype=self.dtype, device=self.device)
        ag_2 = K.tensor(batch['ag_2'], dtype=self.dtype, device=self.device)
        ag_3 = K.tensor(batch['ag_3'], dtype=self.dtype, device=self.device)

        if normalizer[0] is not None:
            s1 = normalizer[0].preprocess(s1)
            s1_ = normalizer[0].preprocess(s1_)

        if normalizer[1] is not None:
            s2 = normalizer[1].preprocess(s2)
            s2_ = normalizer[1].preprocess(s2_)
            s2__ = normalizer[1].preprocess(s2__)

        a1_ = self.actors_target[0](s1_)
        a2_ = self.actors_target[1](s2_)
        #a2_[mask] = self.estimate_obj_action(s2_[mask], s2__[mask])
        a2_[mask] = self.estimate_obj_action(ag_2[mask], ag_3[mask])

        s = [s1, s2]
        s_ = [s1_, s2_]

        # Critics
        Q = self.critics[i_agent](s[i_agent], K.cat([a1, a2], dim=1))
        V = self.critics_target[i_agent](s_[i_agent], K.cat([a1_, a2_],
                                                            dim=1)).detach()

        target_Q = (V * self.gamma) + r
        target_Q = target_Q.clamp(-1. / (1. - self.gamma), 0.)
        loss_critic = self.loss_func(Q, target_Q)

        self.critics_optim[i_agent].zero_grad()
        loss_critic.backward()
        self.critics_optim[i_agent].step()

        # Actors
        a1 = self.actors[0](s1)
        a2 = self.actors[1](s2)
        #a2_[mask] = self.estimate_obj_action(s2[mask], s2_[mask])
        a2[mask] = self.estimate_obj_action(ag[mask], ag_2[mask])

        loss_actor = -self.critics[i_agent](s[i_agent], K.cat([a1, a2],
                                                              dim=1)).mean()

        if self.regularization:
            loss_actor += (self.actors[i_agent](s[i_agent])**2).mean() * 1

        self.actors_optim[i_agent].zero_grad()
        loss_actor.backward()
        self.actors_optim[i_agent].step()

        return loss_critic.item(), loss_actor.item()

    def update_target(self):

        soft_update(self.actors_target[0], self.actors[0], self.tau)
        soft_update(self.critics_target[0], self.critics[0], self.tau)

        soft_update(self.actors_target[1], self.actors[1], self.tau)
        soft_update(self.critics_target[1], self.critics[1], self.tau)

    def estimate_obj_action(self, state2, next_state2):
        self.backward.eval()
        with K.no_grad():
            action2 = self.backward(state2.to(self.device),
                                    next_state2.to(self.device))
        self.backward.train()

        return action2

    def update_backward(self, batch, normalizer=None):

        observation_space = self.observation_space - K.tensor(
            batch['g'], dtype=self.dtype, device=self.device).shape[1]
        action_space = self.action_space[0].shape[0]

        mask = K.tensor(tuple(
            map(lambda ai_object: ai_object > 0, K.tensor(batch['o'][:, -1]))),
                        dtype=K.uint8,
                        device=self.device)

        #s2 = K.cat([K.tensor(batch['o'], dtype=self.dtype, device=self.device)[:, observation_space:],
        #            K.tensor(batch['g'], dtype=self.dtype, device=self.device)], dim=-1)

        a2 = K.tensor(batch['u'], dtype=self.dtype,
                      device=self.device)[:, action_space:]

        #s2_ = K.cat([K.tensor(batch['o_2'], dtype=self.dtype, device=self.device)[:, observation_space:],
        #             K.tensor(batch['g'], dtype=self.dtype, device=self.device)], dim=-1)

        #if normalizer[1] is not None:
        #    s2 = normalizer[1].preprocess(s2)
        #    s2_ = normalizer[1].preprocess(s2_)

        #a2_pred = self.backward(s2[mask], s2_[mask])

        ag = K.tensor(batch['ag'], dtype=self.dtype, device=self.device)
        ag_2 = K.tensor(batch['ag_2'], dtype=self.dtype, device=self.device)

        a2_pred = self.backward(ag[mask], ag_2[mask])

        loss_backward = self.loss_func(a2_pred, a2[mask])

        self.backward_optim.zero_grad()
        loss_backward.backward()
        #K.nn.utils.clip_grad_norm_(self.forward.parameters(), 0.5)
        self.backward_optim.step()

        return loss_backward.item()
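
BackwardDyn, the backward dynamics model instantiated above, is defined elsewhere in the project. A minimal sketch of such an inverse-dynamics module, matching the way estimate_obj_action and update_backward call it (an assumption, not the project's actual implementation):

import torch as K
import torch.nn as nn

class BackwardDyn(nn.Module):
    """Predicts the action that moved `state` to `next_state` (inverse dynamics)."""

    def __init__(self, observation_space, action_space, hidden=256):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(2 * observation_space, hidden), nn.ReLU(),
            nn.Linear(hidden, hidden), nn.ReLU(),
            nn.Linear(hidden, action_space.shape[0]))

    def forward(self, state, next_state):
        return self.net(K.cat([state, next_state], dim=-1))
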
Example #4
class MADDPG_BD(object):
    def __init__(self,
                 observation_space,
                 action_space,
                 optimizer,
                 Actor,
                 Critic,
                 loss_func,
                 gamma,
                 tau,
                 out_func=K.sigmoid,
                 discrete=True,
                 regularization=False,
                 normalized_rewards=False,
                 agent_id=0,
                 object_Qfunc=None,
                 backward_dyn=None,
                 object_policy=None,
                 dtype=K.float32,
                 device="cuda"):

        super(MADDPG_BD, self).__init__()

        optimizer, lr = optimizer
        actor_lr, critic_lr = lr

        self.loss_func = loss_func
        self.gamma = gamma
        self.tau = tau
        self.out_func = out_func
        self.discrete = discrete
        self.regularization = regularization
        self.normalized_rewards = normalized_rewards
        self.dtype = dtype
        self.device = device
        self.observation_space = observation_space
        self.action_space = action_space
        self.agent_id = agent_id
        self.object_Qfunc = object_Qfunc
        self.object_policy = object_policy

        # model initialization
        self.entities = []

        # actors
        self.actors = []
        self.actors_target = []
        self.actors_optim = []

        self.actors.append(
            Actor(observation_space, action_space[agent_id], discrete,
                  out_func).to(device))
        self.actors_target.append(
            Actor(observation_space, action_space[agent_id], discrete,
                  out_func).to(device))
        self.actors_optim.append(
            optimizer(self.actors[0].parameters(), lr=actor_lr))

        hard_update(self.actors_target[0], self.actors[0])

        self.entities.extend(self.actors)
        self.entities.extend(self.actors_target)
        self.entities.extend(self.actors_optim)

        # critics
        self.critics = []
        self.critics_target = []
        self.critics_optim = []

        if agent_id == 0:
            critic_action_space = action_space[2]
        else:
            critic_action_space = action_space[1]

        self.critics.append(
            Critic(observation_space, critic_action_space).to(device))
        self.critics_target.append(
            Critic(observation_space, critic_action_space).to(device))
        self.critics_optim.append(
            optimizer(self.critics[0].parameters(), lr=critic_lr))

        hard_update(self.critics_target[0], self.critics[0])

        self.entities.extend(self.critics)
        self.entities.extend(self.critics_target)
        self.entities.extend(self.critics_optim)

        # backward dynamics model
        if backward_dyn is None:
            self.backward = BackwardDyn(observation_space,
                                        action_space[1]).to(device)
            self.backward_optim = optimizer(self.backward.parameters(),
                                            lr=critic_lr)
            self.entities.append(self.backward)
            self.entities.append(self.backward_optim)
        else:
            self.backward = backward_dyn
            self.backward.eval()
            self.entities.append(self.backward)

        # Learnt Q function for object
        if self.object_Qfunc is not None:
            self.object_Qfunc.eval()
            self.entities.append(self.object_Qfunc)

        # Learnt policy for object
        if self.object_policy is not None:
            self.object_policy.eval()
            self.entities.append(self.object_policy)

        print('maddpg-main12algo7-clamp-both-plusR')

    def to_cpu(self):
        for entity in self.entities:
            if type(entity) != type(self.actors_optim[0]):
                entity.cpu()
        self.device = 'cpu'

    def to_cuda(self):
        for entity in self.entities:
            if type(entity) != type(self.actors_optim[0]):
                entity.cuda()
        self.device = 'cuda'

    def select_action(self, state, exploration=False):
        self.actors[0].eval()
        with K.no_grad():
            mu = self.actors[0](state.to(self.device))
        self.actors[0].train()
        if exploration:
            mu = K.tensor(exploration.get_noisy_action(mu.cpu().numpy()),
                          dtype=self.dtype,
                          device=self.device)
        mu = mu.clamp(int(self.action_space[self.agent_id].low[0]),
                      int(self.action_space[self.agent_id].high[0]))

        return mu

    def update_parameters(self, batch, normalizer=None):

        observation_space = self.observation_space - K.tensor(
            batch['g'], dtype=self.dtype, device=self.device).shape[1]
        action_space = self.action_space[0].shape[0]

        V = K.zeros((len(batch['o']), 1), dtype=self.dtype, device=self.device)

        mask = K.tensor(tuple(
            map(lambda ai_object: ai_object == 0, K.tensor(batch['o'][:,
                                                                      -1]))),
                        dtype=K.uint8,
                        device=self.device)

        s1 = K.cat([
            K.tensor(batch['o'], dtype=self.dtype,
                     device=self.device)[:, 0:observation_space],
            K.tensor(batch['g'], dtype=self.dtype, device=self.device)
        ],
                   dim=-1)
        s2 = K.cat([
            K.tensor(batch['o'], dtype=self.dtype,
                     device=self.device)[:, observation_space:],
            K.tensor(batch['g'], dtype=self.dtype, device=self.device)
        ],
                   dim=-1)

        a1 = K.tensor(batch['u'], dtype=self.dtype,
                      device=self.device)[:, 0:action_space]
        a2 = K.tensor(batch['u'], dtype=self.dtype,
                      device=self.device)[:, action_space:]

        s1_ = K.cat([
            K.tensor(batch['o_2'], dtype=self.dtype,
                     device=self.device)[:, 0:observation_space],
            K.tensor(batch['g'], dtype=self.dtype, device=self.device)
        ],
                    dim=-1)
        s2_ = K.cat([
            K.tensor(batch['o_2'], dtype=self.dtype,
                     device=self.device)[:, observation_space:],
            K.tensor(batch['g'], dtype=self.dtype, device=self.device)
        ],
                    dim=-1)
        # s2__ = K.cat([K.tensor(batch['o_3'], dtype=self.dtype, device=self.device)[:, observation_space:],
        #              K.tensor(batch['g'], dtype=self.dtype, device=self.device)], dim=-1)

        if normalizer[0] is not None:
            s1 = normalizer[0].preprocess(s1)
            s1_ = normalizer[0].preprocess(s1_)

        if normalizer[1] is not None:
            s2 = normalizer[1].preprocess(s2)
            s2_ = normalizer[1].preprocess(s2_)
            # s2__ = normalizer[1].preprocess(s2__)

        if self.agent_id == 0:
            s, s_, a = s1, s1_, K.cat([a1, a2], dim=1)
        else:
            s, s_, a = s2, s2_, a2

        if self.agent_id == 0:
            a_ = self.get_obj_action(s2_)
            #a_[mask] = self.estimate_obj_action(s2_[mask], s2__[mask])
            a_ = K.cat([self.actors_target[0](s_), a_], dim=1)
        else:
            a_ = self.actors_target[0](s_)

        if self.object_Qfunc is None:
            r = K.tensor(batch['r'], dtype=self.dtype,
                         device=self.device).unsqueeze(1)
        else:
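            # shaped reward: intrinsic term from the object's Q-function (get_obj_reward_v4 below)
            # added on top of the environment reward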
            r = self.get_obj_reward_v4(s2, s2_) + K.tensor(
                batch['r'], dtype=self.dtype, device=self.device).unsqueeze(1)
            #r = self.get_obj_reward(s2, s2_, s2__)

        Q = self.critics[0](s, a)
        V = self.critics_target[0](s_, a_).detach()

        target_Q = (V * self.gamma) + r
        if self.object_Qfunc is None:
            target_Q = target_Q.clamp(-1. / (1. - self.gamma), 0.)
        else:
            target_Q = target_Q.clamp(-2. / (1. - self.gamma), 0.)

        loss_critic = self.loss_func(Q, target_Q)

        self.critics_optim[0].zero_grad()
        loss_critic.backward()
        self.critics_optim[0].step()

        if self.agent_id == 0:
            a = self.get_obj_action(s2)
            #a[mask] = self.estimate_obj_action(s2[mask], s2_[mask])
            a = K.cat([self.actors[0](s), a], dim=1)
        else:
            a = self.actors[0](s)

        loss_actor = -self.critics[0](s, a).mean()

        if self.regularization:
            loss_actor += (self.actors[0](s)**2).mean() * 1

        self.actors_optim[0].zero_grad()
        loss_actor.backward()
        self.actors_optim[0].step()

        return loss_critic.item(), loss_actor.item()

    def update_target(self):

        soft_update(self.actors_target[0], self.actors[0], self.tau)
        soft_update(self.critics_target[0], self.critics[0], self.tau)

    def estimate_obj_action(self, state, next_state):
        with K.no_grad():
            action = self.backward(state.to(self.device),
                                   next_state.to(self.device))

        return action

    def get_obj_action(self, state):
        with K.no_grad():
            action = self.object_policy(state.to(self.device))

        return action

    def get_obj_reward(self, state, next_state):
        with K.no_grad():
            action = self.backward(state.to(self.device),
                                   next_state.to(self.device))

            reward = self.object_Qfunc(state.to(self.device), action) / 10.

        return reward

    def get_obj_reward_v2(self, state, next_state, next_next_state):
        with K.no_grad():
            action = self.backward(state.to(self.device),
                                   next_state.to(self.device))
            next_action = self.backward(next_state.to(self.device),
                                        next_next_state.to(self.device))

            reward = self.object_Qfunc(next_state.to(self.device),
                                       next_action) - self.object_Qfunc(
                                           state.to(self.device), action)

        return reward

    def get_obj_reward_v3(self, state, next_state):
        with K.no_grad():
            action = self.backward(state.to(self.device),
                                   next_state.to(self.device))

            reward = self.object_Qfunc(next_state.to(self.device),
                                       action) - self.object_Qfunc(
                                           state.to(self.device), action)

        return reward

    def get_obj_reward_v4(self, state, next_state):
        with K.no_grad():
            action = self.backward(state.to(self.device),
                                   next_state.to(self.device))
            opt_action = self.object_policy(state.to(self.device))

            reward = self.object_Qfunc(state.to(self.device),
                                       action) - self.object_Qfunc(
                                           state.to(self.device), opt_action)

            reward = reward.clamp(min=-1.0, max=0.)
        return reward

    def get_obj_reward_v5(self, state, next_state):
        with K.no_grad():
            action = self.backward(state.to(self.device),
                                   next_state.to(self.device))
            opt_action = self.object_policy(state.to(self.device))

            reward = self.object_Qfunc(state.to(self.device),
                                       action) - self.object_Qfunc(
                                           state.to(self.device), opt_action)
        return reward

    def update_backward(self, batch, normalizer=None):

        observation_space = self.observation_space - K.tensor(
            batch['g'], dtype=self.dtype, device=self.device).shape[1]
        action_space = self.action_space[0].shape[0]

        s2 = K.cat([
            K.tensor(batch['o'], dtype=self.dtype,
                     device=self.device)[:, observation_space:],
            K.tensor(batch['g'], dtype=self.dtype, device=self.device)
        ],
                   dim=-1)

        a2 = K.tensor(batch['u'], dtype=self.dtype,
                      device=self.device)[:, action_space:]

        s2_ = K.cat([
            K.tensor(batch['o_2'], dtype=self.dtype,
                     device=self.device)[:, observation_space:],
            K.tensor(batch['g'], dtype=self.dtype, device=self.device)
        ],
                    dim=-1)

        if normalizer[1] is not None:
            s2 = normalizer[1].preprocess(s2)
            s2_ = normalizer[1].preprocess(s2_)

        a2_pred = self.backward(s2, s2_)

        loss_backward = self.loss_func(a2_pred, a2)

        self.backward_optim.zero_grad()
        loss_backward.backward()
        self.backward_optim.step()

        return loss_backward.item()
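
For reference, the intrinsic reward used in update_parameters above (get_obj_reward_v4 here, reward_fun in Example #1) can be written compactly, with B the backward inverse-dynamics model, \pi_{\text{obj}} the pre-trained object policy and Q_{\text{obj}} its Q-function:

r_{\text{intr}}(s^o_t, s^o_{t+1}) = \operatorname{clip}\!\big(Q_{\text{obj}}(s^o_t, B(s^o_t, s^o_{t+1})) - Q_{\text{obj}}(s^o_t, \pi_{\text{obj}}(s^o_t)),\ -1,\ 0\big)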