Example 1
class DDPG_BD(object):
    def __init__(self,
                 observation_space,
                 action_space,
                 optimizer,
                 Actor,
                 Critic,
                 loss_func,
                 gamma,
                 tau,
                 out_func=K.sigmoid,
                 discrete=True,
                 regularization=False,
                 normalized_rewards=False,
                 agent_id=0,
                 object_Qfunc=None,
                 backward_dyn=None,
                 object_policy=None,
                 reward_fun=None,
                 masked_with_r=False,
                 n_objects=1,
                 dtype=K.float32,
                 device="cuda"):

        super(DDPG_BD, self).__init__()

        optimizer, lr = optimizer
        actor_lr, critic_lr = lr

        self.loss_func = loss_func
        self.gamma = gamma
        self.tau = tau
        self.out_func = out_func
        self.discrete = discrete
        self.regularization = regularization
        self.normalized_rewards = normalized_rewards
        self.dtype = dtype
        self.device = device
        self.observation_space = observation_space
        self.action_space = action_space
        self.agent_id = agent_id
        self.object_Qfunc = object_Qfunc
        self.object_policy = object_policy
        self.masked_with_r = masked_with_r
        self.n_objects = n_objects

        # model initialization
        self.entities = []

        # actors
        self.actors = []
        self.actors_target = []
        self.actors_optim = []

        self.actors.append(
            Actor(observation_space, action_space[agent_id], discrete,
                  out_func).to(device))
        self.actors_target.append(
            Actor(observation_space, action_space[agent_id], discrete,
                  out_func).to(device))
        self.actors_optim.append(
            optimizer(self.actors[0].parameters(), lr=actor_lr))

        hard_update(self.actors_target[0], self.actors[0])

        self.entities.extend(self.actors)
        self.entities.extend(self.actors_target)
        self.entities.extend(self.actors_optim)

        # critics
        self.critics = []
        self.critics_target = []
        self.critics_optim = []

        self.critics.append(
            Critic(observation_space, action_space[agent_id]).to(device))
        self.critics_target.append(
            Critic(observation_space, action_space[agent_id]).to(device))
        self.critics_optim.append(
            optimizer(self.critics[0].parameters(), lr=critic_lr))

        hard_update(self.critics_target[0], self.critics[0])

        self.entities.extend(self.critics)
        self.entities.extend(self.critics_target)
        self.entities.extend(self.critics_optim)

        # backward dynamics model
        if backward_dyn is None:
            self.backward = BackwardDyn(observation_space,
                                        action_space[1]).to(device)
            self.backward_optim = optimizer(self.backward.parameters(),
                                            lr=critic_lr)
            self.entities.append(self.backward)
            self.entities.append(self.backward_optim)
        else:
            self.backward = backward_dyn
            self.backward_optim = optimizer(self.backward.parameters(),
                                            lr=critic_lr)
            self.entities.append(self.backward)
            self.entities.append(self.backward_optim)

        # Learnt Q function for object
        if self.object_Qfunc is not None:
            self.object_Qfunc_target = copy.deepcopy(self.object_Qfunc)
            self.object_Qfunc_optim = optimizer(self.object_Qfunc.parameters(),
                                                lr=critic_lr)
            self.entities.append(self.object_Qfunc)
            self.entities.append(self.object_Qfunc_target)
            self.entities.append(self.object_Qfunc_optim)

        # Learnt policy for object
        if self.object_policy is not None:
            self.object_policy_target = copy.deepcopy(self.object_policy)
            self.object_policy_optim = optimizer(
                self.object_policy.parameters(), lr=actor_lr)
            self.entities.append(self.object_policy)
            self.entities.append(self.object_policy_target)
            self.entities.append(self.object_policy_optim)

        if reward_fun is not None:
            self.get_obj_reward = reward_fun
        else:
            self.get_obj_reward = self.reward_fun

        print('clipped between -1 and 0, and masked with abs(r), and + r')

    def to_cpu(self):
        for entity in self.entities:
            if type(entity) != type(self.actors_optim[0]):
                entity.cpu()
        self.device = 'cpu'

    def to_cuda(self):
        for entity in self.entities:
            if type(entity) != type(self.actors_optim[0]):
                entity.cuda()
        self.device = 'cuda'

    def select_action(self, state, exploration=False):
        self.actors[0].eval()
        with K.no_grad():
            mu = self.actors[0](state.to(self.device))
        self.actors[0].train()
        if exploration:
            mu = K.tensor(exploration.get_noisy_action(mu.cpu().numpy()),
                          dtype=self.dtype,
                          device=self.device)
        mu = mu.clamp(int(self.action_space[self.agent_id].low[0]),
                      int(self.action_space[self.agent_id].high[0]))

        return mu

    def update_parameters(self, batch, normalizer=None):

        observation_space = self.observation_space - K.tensor(
            batch['g'], dtype=self.dtype, device=self.device).shape[1]
        action_space = self.action_space[0].shape[0]

        V = K.zeros((len(batch['o']), 1), dtype=self.dtype, device=self.device)

        s1 = K.cat([
            K.tensor(batch['o'], dtype=self.dtype,
                     device=self.device)[:, 0:observation_space],
            K.tensor(batch['g'], dtype=self.dtype, device=self.device)
        ],
                   dim=-1)

        if self.n_objects <= 1:
            s2 = K.cat([
                K.tensor(batch['o'], dtype=self.dtype,
                         device=self.device)[:, observation_space:],
                K.tensor(batch['g'], dtype=self.dtype, device=self.device)
            ],
                       dim=-1)
        else:
            s2 = get_obj_obs(
                K.tensor(batch['o'], dtype=self.dtype,
                         device=self.device)[:, observation_space:],
                K.tensor(batch['g'], dtype=self.dtype, device=self.device),
                n_object=self.n_objects)

        a1 = K.tensor(batch['u'], dtype=self.dtype,
                      device=self.device)[:, 0:action_space]
        a2 = K.tensor(batch['u'], dtype=self.dtype,
                      device=self.device)[:, action_space:]

        s1_ = K.cat([
            K.tensor(batch['o_2'], dtype=self.dtype,
                     device=self.device)[:, 0:observation_space],
            K.tensor(batch['g'], dtype=self.dtype, device=self.device)
        ],
                    dim=-1)

        if self.n_objects <= 1:
            s2_ = K.cat([
                K.tensor(batch['o_2'], dtype=self.dtype,
                         device=self.device)[:, observation_space:],
                K.tensor(batch['g'], dtype=self.dtype, device=self.device)
            ],
                        dim=-1)
        else:
            s2_ = get_obj_obs(
                K.tensor(batch['o_2'], dtype=self.dtype,
                         device=self.device)[:, observation_space:],
                K.tensor(batch['g'], dtype=self.dtype, device=self.device),
                n_object=self.n_objects)

        if normalizer[0] is not None:
            s1 = normalizer[0].preprocess(s1)
            s1_ = normalizer[0].preprocess(s1_)

        if normalizer[1] is not None:
            if self.n_objects <= 1:
                s2 = normalizer[1].preprocess(s2)
                s2_ = normalizer[1].preprocess(s2_)
            else:
                for i_object in range(self.n_objects):
                    s2[:, :, i_object] = normalizer[1].preprocess(
                        s2[:, :, i_object])
                    s2_[:, :, i_object] = normalizer[1].preprocess(
                        s2_[:, :, i_object])

        s, s_, a = (s1, s1_, a1) if self.agent_id == 0 else (s2, s2_, a2)
        a_ = self.actors_target[0](s_)

        if self.object_Qfunc is None:
            r = K.tensor(batch['r'], dtype=self.dtype,
                         device=self.device).unsqueeze(1)
        else:
            r = K.tensor(batch['r'], dtype=self.dtype,
                         device=self.device).unsqueeze(1)
            if self.n_objects <= 1:
                if self.masked_with_r:
                    r = self.get_obj_reward(s2, s2_) * K.abs(r) + r
                else:
                    r = self.get_obj_reward(s2, s2_) + r
            else:
                r_intr = K.zeros_like(r)
                for i_object in range(self.n_objects):
                    r_intr += self.get_obj_reward(s2[:, :, i_object],
                                                  s2_[:, :, i_object])
                if self.masked_with_r:
                    r = r_intr * K.abs(r) + r
                else:
                    r = r_intr + r

        Q = self.critics[0](s, a)
        V = self.critics_target[0](s_, a_).detach()

        target_Q = (V * self.gamma) + r
        if self.object_Qfunc is None:
            target_Q = target_Q.clamp(-1. / (1. - self.gamma), 0.)
        else:
            target_Q = target_Q.clamp(
                -(1 + self.n_objects) / (1. - self.gamma), 0.)

        loss_critic = self.loss_func(Q, target_Q)

        self.critics_optim[0].zero_grad()
        loss_critic.backward()
        self.critics_optim[0].step()

        a = self.actors[0](s)

        loss_actor = -self.critics[0](s, a).mean()

        if self.regularization:
            loss_actor += (self.actors[0](s)**2).mean() * 1

        self.actors_optim[0].zero_grad()
        loss_actor.backward()
        self.actors_optim[0].step()

        return loss_critic.item(), loss_actor.item()

    def update_target(self):

        soft_update(self.actors_target[0], self.actors[0], self.tau)
        soft_update(self.critics_target[0], self.critics[0], self.tau)

    def estimate_obj_action(self, state, next_state):
        with K.no_grad():
            action = self.backward(state.to(self.device),
                                   next_state.to(self.device))

        return action

    def get_obj_action(self, state, exploration=False):
        self.object_policy.eval()
        with K.no_grad():
            mu = self.object_policy(state.to(self.device))
        self.object_policy.train()
        if exploration:
            mu = K.tensor(exploration.get_noisy_action(mu.cpu().numpy()),
                          dtype=self.dtype,
                          device=self.device)
        mu = mu.clamp(int(self.action_space[1].low[0]),
                      int(self.action_space[1].high[0]))

        return mu

    def reward_fun(self, state, next_state):
        with K.no_grad():
            action = self.backward(state.to(self.device),
                                   next_state.to(self.device))
            opt_action = self.object_policy(state.to(self.device))

            reward = self.object_Qfunc(state.to(self.device),
                                       action) - self.object_Qfunc(
                                           state.to(self.device), opt_action)
        return reward.clamp(min=-1.0, max=0.0)

    def update_backward(self, batch, normalizer=None):

        observation_space = self.observation_space - K.tensor(
            batch['g'], dtype=self.dtype, device=self.device).shape[1]
        action_space = self.action_space[0].shape[0]

        if self.n_objects <= 1:
            s2 = K.cat([
                K.tensor(batch['o'], dtype=self.dtype,
                         device=self.device)[:, observation_space:],
                K.tensor(batch['g'], dtype=self.dtype, device=self.device)
            ],
                       dim=-1)
        else:
            s2 = get_obj_obs(
                K.tensor(batch['o'], dtype=self.dtype,
                         device=self.device)[:, observation_space:],
                K.tensor(batch['g'], dtype=self.dtype, device=self.device),
                n_object=self.n_objects)

        a2 = K.tensor(batch['u'], dtype=self.dtype,
                      device=self.device)[:, action_space:]

        if self.n_objects <= 1:
            s2_ = K.cat([
                K.tensor(batch['o_2'], dtype=self.dtype,
                         device=self.device)[:, observation_space:],
                K.tensor(batch['g'], dtype=self.dtype, device=self.device)
            ],
                        dim=-1)
        else:
            s2_ = get_obj_obs(
                K.tensor(batch['o_2'], dtype=self.dtype,
                         device=self.device)[:, observation_space:],
                K.tensor(batch['g'], dtype=self.dtype, device=self.device),
                n_object=self.n_objects)

        if normalizer[1] is not None:
            if self.n_objects <= 1:
                s2 = normalizer[1].preprocess(s2)
                s2_ = normalizer[1].preprocess(s2_)
            else:
                for i_object in range(self.n_objects):
                    s2[:, :, i_object] = normalizer[1].preprocess(
                        s2[:, :, i_object])
                    s2_[:, :, i_object] = normalizer[1].preprocess(
                        s2_[:, :, i_object])

        if self.n_objects <= 1:
            a2_pred = self.backward(s2, s2_)
            loss_backward = self.loss_func(a2_pred, a2)
        else:
            loss_backward = 0.
            n_obj_actions = a2.shape[1] // self.n_objects
            for i_object in range(self.n_objects):
                act_slice = slice(i_object * n_obj_actions,
                                  (i_object + 1) * n_obj_actions)
                a2_pred = self.backward(s2[:, :, i_object],
                                        s2_[:, :, i_object])
                loss_backward += self.loss_func(a2_pred, a2[:, act_slice])

        self.backward_optim.zero_grad()
        loss_backward.backward()
        self.backward_optim.step()

        return loss_backward.item()

    def update_object_parameters(self, batch, normalizer=None):

        observation_space = self.observation_space - K.tensor(
            batch['g'], dtype=self.dtype, device=self.device).shape[1]
        action_space = self.action_space[0].shape[0]

        V = K.zeros((len(batch['o']), 1), dtype=self.dtype, device=self.device)

        s2 = K.cat([
            K.tensor(batch['o'], dtype=self.dtype,
                     device=self.device)[:, observation_space:],
            K.tensor(batch['g'], dtype=self.dtype, device=self.device)
        ],
                   dim=-1)

        a2 = K.tensor(batch['u'], dtype=self.dtype,
                      device=self.device)[:, action_space:]

        s2_ = K.cat([
            K.tensor(batch['o_2'], dtype=self.dtype,
                     device=self.device)[:, observation_space:],
            K.tensor(batch['g'], dtype=self.dtype, device=self.device)
        ],
                    dim=-1)

        if normalizer[1] is not None:
            s2 = normalizer[1].preprocess(s2)
            s2_ = normalizer[1].preprocess(s2_)

        s, s_, a = s2, s2_, a2
        a_ = self.object_policy_target(s_)

        r = K.tensor(batch['r'], dtype=self.dtype,
                     device=self.device).unsqueeze(1)

        Q = self.object_Qfunc(s, a)
        V = self.object_Qfunc_target(s_, a_).detach()

        target_Q = (V * self.gamma) + r
        target_Q = target_Q.clamp(-1. / (1. - self.gamma), 0.)

        loss_critic = self.loss_func(Q, target_Q)

        self.object_Qfunc_optim.zero_grad()
        loss_critic.backward()
        self.object_Qfunc_optim.step()

        a = self.object_policy(s)

        loss_actor = -self.object_Qfunc(s, a).mean()

        if self.regularization:
            loss_actor += (self.object_policy(s)**2).mean() * 1

        self.object_policy_optim.zero_grad()
        loss_actor.backward()
        self.object_policy_optim.step()

        return loss_critic.item(), loss_actor.item()

    def update_object_target(self):

        soft_update(self.object_policy_target, self.object_policy, self.tau)
        soft_update(self.object_Qfunc_target, self.object_Qfunc, self.tau)
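
The hard_update and soft_update helpers called throughout these listings are not shown. Below is a minimal sketch of the standard target-network copy and Polyak-averaging update they are assumed to perform; it is inferred from how the examples use them, not taken from the repository itself.

def hard_update(target, source):
    # copy every source parameter into the target network verbatim
    for t, s in zip(target.parameters(), source.parameters()):
        t.data.copy_(s.data)


def soft_update(target, source, tau):
    # Polyak averaging: target <- (1 - tau) * target + tau * source
    for t, s in zip(target.parameters(), source.parameters()):
        t.data.copy_(t.data * (1.0 - tau) + s.data * tau)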
Example 2
class DDPG_BD(object):
    def __init__(self,
                 observation_space,
                 action_space,
                 optimizer,
                 Actor,
                 Critic,
                 loss_func,
                 gamma,
                 tau,
                 out_func=K.sigmoid,
                 discrete=True,
                 regularization=False,
                 normalized_rewards=False,
                 agent_id=0,
                 object_Qfunc=None,
                 backward_dyn=None,
                 object_policy=None,
                 reward_fun=None,
                 n_objects=1,
                 clip_Q_neg=None,
                 dtype=K.float32,
                 device="cuda"):

        super(DDPG_BD, self).__init__()

        optimizer, lr = optimizer
        actor_lr, critic_lr = lr

        self.loss_func = loss_func
        self.gamma = gamma
        self.tau = tau
        self.out_func = out_func
        self.discrete = discrete
        self.regularization = regularization
        self.normalized_rewards = normalized_rewards
        self.dtype = dtype
        self.device = device
        self.observation_space = observation_space
        self.action_space = action_space
        self.agent_id = agent_id
        self.object_Qfunc = object_Qfunc
        self.object_policy = object_policy
        self.object_backward = backward_dyn
        self.n_objects = n_objects
        self.clip_Q_neg = clip_Q_neg if clip_Q_neg is not None else -1. / (
            1. - self.gamma)

        # model initialization
        self.entities = []

        # actors
        self.actors = []
        self.actors_target = []
        self.actors_optim = []

        self.actors.append(
            Actor(observation_space, action_space[agent_id], discrete,
                  out_func).to(device))
        self.actors_target.append(
            Actor(observation_space, action_space[agent_id], discrete,
                  out_func).to(device))
        self.actors_optim.append(
            optimizer(self.actors[0].parameters(), lr=actor_lr))

        hard_update(self.actors_target[0], self.actors[0])

        self.entities.extend(self.actors)
        self.entities.extend(self.actors_target)
        self.entities.extend(self.actors_optim)

        # critics
        self.critics = []
        self.critics_target = []
        self.critics_optim = []

        self.critics.append(
            Critic(observation_space, action_space[agent_id]).to(device))
        self.critics_target.append(
            Critic(observation_space, action_space[agent_id]).to(device))
        self.critics_optim.append(
            optimizer(self.critics[0].parameters(), lr=critic_lr))

        hard_update(self.critics_target[0], self.critics[0])

        for i_object in range(self.n_objects):
            self.critics.append(
                Critic(observation_space, action_space[agent_id]).to(device))
            self.critics_target.append(
                Critic(observation_space, action_space[agent_id]).to(device))
            self.critics_optim.append(
                optimizer(self.critics[i_object + 1].parameters(),
                          lr=critic_lr))

            hard_update(self.critics_target[i_object + 1],
                        self.critics[i_object + 1])

        self.entities.extend(self.critics)
        self.entities.extend(self.critics_target)
        self.entities.extend(self.critics_optim)

        # backward dynamics model for object actions
        if self.object_backward is not None:
            self.entities.append(self.object_backward)

        # Learnt Q function for object
        if self.object_Qfunc is not None:
            self.entities.append(self.object_Qfunc)

        # Learnt policy for object
        if self.object_policy is not None:
            self.entities.append(self.object_policy)

        if reward_fun is not None:
            self.get_obj_reward = reward_fun
        else:
            self.get_obj_reward = self.reward_fun

        # backward dynamics models for the agent's own actions
        self.backward = BackwardDyn(observation_space,
                                    action_space[0]).to(device)
        self.backward_optim = optimizer(self.backward.parameters(),
                                        lr=critic_lr)
        self.entities.append(self.backward)
        self.entities.append(self.backward_optim)

        self.backward_otw = BackwardDyn(observation_space,
                                        action_space[0]).to(device)
        self.backward_otw_optim = optimizer(self.backward_otw.parameters(),
                                            lr=critic_lr)
        self.entities.append(self.backward_otw)
        self.entities.append(self.backward_otw_optim)

        print('separate Qs')

    def to_cpu(self):
        for entity in self.entities:
            if type(entity) != type(self.actors_optim[0]):
                entity.cpu()
        self.device = 'cpu'

    def to_cuda(self):
        for entity in self.entities:
            if type(entity) != type(self.actors_optim[0]):
                entity.cuda()
        self.device = 'cuda'

    def select_action(self, state, exploration=False):
        self.actors[0].eval()
        with K.no_grad():
            mu = self.actors[0](state.to(self.device))
        self.actors[0].train()
        if exploration:
            mu = K.tensor(exploration.get_noisy_action(mu.cpu().numpy()),
                          dtype=self.dtype,
                          device=self.device)
        mu = mu.clamp(int(self.action_space[self.agent_id].low[0]),
                      int(self.action_space[self.agent_id].high[0]))

        return mu

    def update_parameters(self, batch, normalizer=None):

        observation_space = self.observation_space - K.tensor(
            batch['g'], dtype=self.dtype, device=self.device).shape[1]
        action_space = self.action_space[0].shape[0]

        V = K.zeros((len(batch['o']), 1), dtype=self.dtype, device=self.device)

        s1 = K.cat([
            K.tensor(batch['o'], dtype=self.dtype,
                     device=self.device)[:, 0:observation_space],
            K.tensor(batch['g'], dtype=self.dtype, device=self.device)
        ],
                   dim=-1)

        if self.n_objects <= 1:
            s2 = K.cat([
                K.tensor(batch['o'], dtype=self.dtype,
                         device=self.device)[:, observation_space:],
                K.tensor(batch['g'], dtype=self.dtype, device=self.device)
            ],
                       dim=-1)
        else:
            s2 = get_obj_obs(
                K.tensor(batch['o'], dtype=self.dtype,
                         device=self.device)[:, observation_space:],
                K.tensor(batch['g'], dtype=self.dtype, device=self.device),
                n_object=self.n_objects)

        a1 = K.tensor(batch['u'], dtype=self.dtype,
                      device=self.device)[:, 0:action_space]
        a2 = K.tensor(batch['u'], dtype=self.dtype,
                      device=self.device)[:, action_space:]

        s1_ = K.cat([
            K.tensor(batch['o_2'], dtype=self.dtype,
                     device=self.device)[:, 0:observation_space],
            K.tensor(batch['g'], dtype=self.dtype, device=self.device)
        ],
                    dim=-1)

        if self.n_objects <= 1:
            s2_ = K.cat([
                K.tensor(batch['o_2'], dtype=self.dtype,
                         device=self.device)[:, observation_space:],
                K.tensor(batch['g'], dtype=self.dtype, device=self.device)
            ],
                        dim=-1)
        else:
            s2_ = get_obj_obs(
                K.tensor(batch['o_2'], dtype=self.dtype,
                         device=self.device)[:, observation_space:],
                K.tensor(batch['g'], dtype=self.dtype, device=self.device),
                n_object=self.n_objects)

        if normalizer[0] is not None:
            s1 = normalizer[0].preprocess(s1)
            s1_ = normalizer[0].preprocess(s1_)

        if normalizer[1] is not None:
            if self.n_objects <= 1:
                s2 = normalizer[1].preprocess(s2)
                s2_ = normalizer[1].preprocess(s2_)
            else:
                for i_object in range(self.n_objects):
                    s2[:, :, i_object] = normalizer[1].preprocess(
                        s2[:, :, i_object])
                    s2_[:, :, i_object] = normalizer[1].preprocess(
                        s2_[:, :, i_object])

        s, s_, a = (s1, s1_, a1) if self.agent_id == 0 else (s2, s2_, a2)
        a_ = self.actors_target[0](s_)

        r_all = []
        if self.object_Qfunc is None:
            r = K.tensor(batch['r'], dtype=self.dtype,
                         device=self.device).unsqueeze(1)
            r_all.append(r)
        else:
            r = K.tensor(batch['r'], dtype=self.dtype,
                         device=self.device).unsqueeze(1)
            r_all.append(r)
            for i_object in range(self.n_objects):
                r_all.append(
                    self.get_obj_reward(s2[:, :, i_object],
                                        s2_[:, :, i_object]))

        # first critic for main rewards
        Q = self.critics[0](s, a)
        V = self.critics_target[0](s_, a_).detach()

        target_Q = (V * self.gamma) + r_all[0]
        target_Q = target_Q.clamp(self.clip_Q_neg, 0.)

        loss_critic = self.loss_func(Q, target_Q)

        self.critics_optim[0].zero_grad()
        loss_critic.backward()
        self.critics_optim[0].step()

        # other critics for intrinsic
        for i_object in range(self.n_objects):
            Q = self.critics[i_object + 1](s, a)
            V = self.critics_target[i_object + 1](s_, a_).detach()

            target_Q = (V * self.gamma) + r_all[i_object + 1]
            target_Q = target_Q.clamp(self.clip_Q_neg, 0.)

            loss_critic = self.loss_func(Q, target_Q)

            self.critics_optim[i_object + 1].zero_grad()
            loss_critic.backward()
            self.critics_optim[i_object + 1].step()

        # actor update
        a = self.actors[0](s)

        loss_actor = -self.critics[0](s, a).mean()
        for i_object in range(self.n_objects):
            loss_actor += -self.critics[i_object + 1](s, a).mean()

        if self.regularization:
            loss_actor += (self.actors[0](s)**2).mean() * 1

        self.actors_optim[0].zero_grad()
        loss_actor.backward()
        self.actors_optim[0].step()

        return loss_critic.item(), loss_actor.item()

    def update_target(self):

        soft_update(self.actors_target[0], self.actors[0], self.tau)
        soft_update(self.critics_target[0], self.critics[0], self.tau)
        for i_object in range(self.n_objects):
            soft_update(self.critics_target[i_object + 1],
                        self.critics[i_object + 1], self.tau)

    def reward_fun(self, state, next_state):
        with K.no_grad():
            action = self.object_backward(state.to(self.device),
                                          next_state.to(self.device))
            opt_action = self.object_policy(state.to(self.device))

            reward = self.object_Qfunc(state.to(self.device),
                                       action) - self.object_Qfunc(
                                           state.to(self.device), opt_action)
        return reward.clamp(min=-1.0, max=0.0)

    def update_backward(self, batch, normalizer=None):

        observation_space = self.observation_space - K.tensor(
            batch['g'], dtype=self.dtype, device=self.device).shape[1]
        action_space = self.action_space[0].shape[0]

        s1 = K.cat([
            K.tensor(batch['o'], dtype=self.dtype,
                     device=self.device)[:, 0:observation_space],
            K.tensor(batch['g'], dtype=self.dtype, device=self.device)
        ],
                   dim=-1)

        a1 = K.tensor(batch['u'], dtype=self.dtype,
                      device=self.device)[:, 0:action_space]

        s1_ = K.cat([
            K.tensor(batch['o_2'], dtype=self.dtype,
                     device=self.device)[:, 0:observation_space],
            K.tensor(batch['g'], dtype=self.dtype, device=self.device)
        ],
                    dim=-1)

        if normalizer[0] is not None:
            s1 = normalizer[0].preprocess(s1)
            s1_ = normalizer[0].preprocess(s1_)

        a1_pred = self.backward(s1, s1_)

        loss_backward = self.loss_func(a1_pred, a1)

        self.backward_optim.zero_grad()
        loss_backward.backward()
        self.backward_optim.step()

        return loss_backward.item()

    def update_backward_otw(self, batch, normalizer=None):

        observation_space = self.observation_space - K.tensor(
            batch['g'], dtype=self.dtype, device=self.device).shape[1]
        action_space = self.action_space[0].shape[0]

        s1 = K.cat([
            K.tensor(batch['o'], dtype=self.dtype,
                     device=self.device)[:, 0:observation_space],
            K.tensor(batch['g'], dtype=self.dtype, device=self.device)
        ],
                   dim=-1)

        a1 = K.tensor(batch['u'], dtype=self.dtype,
                      device=self.device)[:, 0:action_space]

        s1_ = K.cat([
            K.tensor(batch['o_2'], dtype=self.dtype,
                     device=self.device)[:, 0:observation_space],
            K.tensor(batch['g'], dtype=self.dtype, device=self.device)
        ],
                    dim=-1)

        if normalizer[0] is not None:
            s1 = normalizer[0].preprocess(s1)
            s1_ = normalizer[0].preprocess(s1_)

        a1_pred = self.backward_otw(s1, s1_)

        loss_backward_otw = self.loss_func(a1_pred, a1)

        self.backward_otw_optim.zero_grad()
        loss_backward_otw.backward()
        self.backward_otw_optim.step()

        return loss_backward_otw.item()
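
For reference, the intrinsic reward computed by reward_fun in Examples 1 and 2 can be written compactly, with b the backward (inverse) dynamics model and pi_obj the learnt object policy; this restates the code above rather than adding anything new:

    r_intr(s, s') = clip( Q_obj(s, b(s, s')) - Q_obj(s, pi_obj(s)), -1, 0 )

In Example 1 this term is added to (and optionally masked by the magnitude of) the environment reward, which is why the critic target there is clamped to [-(1 + n_objects)/(1 - gamma), 0]. Example 2 instead keeps one extra critic per object, trained on its own intrinsic reward, and clamps every target with clip_Q_neg.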
Example 3
class MADDPG_RAE(object):
    def __init__(self,
                 observation_space,
                 action_space,
                 optimizer,
                 Actor,
                 Critic,
                 loss_func,
                 gamma,
                 tau,
                 out_func=K.sigmoid,
                 discrete=True,
                 regularization=False,
                 normalized_rewards=False,
                 dtype=K.float32,
                 device="cuda"):

        super(MADDPG_RAE, self).__init__()

        optimizer, lr = optimizer
        actor_lr, critic_lr = lr

        self.loss_func = loss_func
        self.gamma = gamma
        self.tau = tau
        self.out_func = out_func
        self.discrete = discrete
        self.regularization = regularization
        self.normalized_rewards = normalized_rewards
        self.dtype = dtype
        self.device = device
        self.observation_space = observation_space
        self.action_space = action_space

        # model initialization
        self.entities = []

        # actors
        self.actors = []
        self.actors_target = []
        self.actors_optim = []

        for i in range(2):
            self.actors.append(
                Actor(observation_space, action_space[i], discrete,
                      out_func).to(device))
            self.actors_target.append(
                Actor(observation_space, action_space[i], discrete,
                      out_func).to(device))
            self.actors_optim.append(
                optimizer(self.actors[i].parameters(), lr=actor_lr))

        for i in range(2):
            hard_update(self.actors_target[i], self.actors[i])

        self.entities.extend(self.actors)
        self.entities.extend(self.actors_target)
        self.entities.extend(self.actors_optim)

        # critics
        self.critics = []
        self.critics_target = []
        self.critics_optim = []

        for i in range(2):
            self.critics.append(
                Critic(observation_space, action_space[2]).to(device))
            self.critics_target.append(
                Critic(observation_space, action_space[2]).to(device))
            self.critics_optim.append(
                optimizer(self.critics[i].parameters(), lr=critic_lr))

        for i in range(2):
            hard_update(self.critics_target[i], self.critics[i])

        self.entities.extend(self.critics)
        self.entities.extend(self.critics_target)
        self.entities.extend(self.critics_optim)

        # backward (inverse) dynamics model; the hard-coded input size 3 matches
        # the achieved-goal dimension it is trained on in update_backward
        self.backward = BackwardDyn(3, action_space[1]).to(device)
        self.backward_optim = optimizer(self.backward.parameters(),
                                        lr=critic_lr)

        self.entities.append(self.backward)
        self.entities.append(self.backward_optim)

    def to_cpu(self):
        for entity in self.entities:
            if type(entity) != type(self.actors_optim[0]):
                entity.cpu()
        self.device = 'cpu'

    def to_cuda(self):
        for entity in self.entities:
            if type(entity) != type(self.actors_optim[0]):
                entity.cuda()
        self.device = 'cuda'

    def select_action(self, state, i_agent, exploration=False):
        self.actors[i_agent].eval()
        with K.no_grad():
            mu = self.actors[i_agent](state.to(self.device))
        self.actors[i_agent].train()
        if exploration:
            mu = K.tensor(exploration.get_noisy_action(mu.cpu().numpy()),
                          dtype=self.dtype,
                          device=self.device)
        mu = mu.clamp(int(self.action_space[i_agent].low[0]),
                      int(self.action_space[i_agent].high[0]))

        return mu

    def update_parameters(self, batch, i_agent, normalizer=None):

        observation_space = self.observation_space - K.tensor(
            batch['g'], dtype=self.dtype, device=self.device).shape[1]
        action_space = self.action_space[0].shape[0]

        mask = K.tensor(tuple(map(lambda ai_object: ai_object == 0,
                                  K.tensor(batch['o'][:, -1]))),
                        dtype=K.uint8,
                        device=self.device)

        V = K.zeros((len(batch['o']), 1), dtype=self.dtype, device=self.device)

        s1 = K.cat([
            K.tensor(batch['o'], dtype=self.dtype,
                     device=self.device)[:, 0:observation_space],
            K.tensor(batch['g'], dtype=self.dtype, device=self.device)
        ],
                   dim=-1)
        s2 = K.cat([
            K.tensor(batch['o'], dtype=self.dtype,
                     device=self.device)[:, observation_space:],
            K.tensor(batch['g'], dtype=self.dtype, device=self.device)
        ],
                   dim=-1)

        a1 = K.tensor(batch['u'], dtype=self.dtype,
                      device=self.device)[:, 0:action_space]
        a2 = K.tensor(batch['u'], dtype=self.dtype,
                      device=self.device)[:, action_space:]

        r = K.tensor(batch['r'], dtype=self.dtype,
                     device=self.device).unsqueeze(1)

        s1_ = K.cat([
            K.tensor(batch['o_2'], dtype=self.dtype,
                     device=self.device)[:, 0:observation_space],
            K.tensor(batch['g'], dtype=self.dtype, device=self.device)
        ],
                    dim=-1)
        s2_ = K.cat([
            K.tensor(batch['o_2'], dtype=self.dtype,
                     device=self.device)[:, observation_space:],
            K.tensor(batch['g'], dtype=self.dtype, device=self.device)
        ],
                    dim=-1)
        s2__ = K.cat([
            K.tensor(batch['o_3'], dtype=self.dtype,
                     device=self.device)[:, observation_space:],
            K.tensor(batch['g'], dtype=self.dtype, device=self.device)
        ],
                     dim=-1)

        ag = K.tensor(batch['ag'], dtype=self.dtype, device=self.device)
        ag_2 = K.tensor(batch['ag_2'], dtype=self.dtype, device=self.device)
        ag_3 = K.tensor(batch['ag_3'], dtype=self.dtype, device=self.device)

        if normalizer[0] is not None:
            s1 = normalizer[0].preprocess(s1)
            s1_ = normalizer[0].preprocess(s1_)

        if normalizer[1] is not None:
            s2 = normalizer[1].preprocess(s2)
            s2_ = normalizer[1].preprocess(s2_)
            s2__ = normalizer[1].preprocess(s2__)

        a1_ = self.actors_target[0](s1_)
        a2_ = self.actors_target[1](s2_)
        #a2_[mask] = self.estimate_obj_action(s2_[mask], s2__[mask])
        a2_[mask] = self.estimate_obj_action(ag_2[mask], ag_3[mask])

        s = [s1, s2]
        s_ = [s1_, s2_]

        # Critics
        Q = self.critics[i_agent](s[i_agent], K.cat([a1, a2], dim=1))
        V = self.critics_target[i_agent](s_[i_agent], K.cat([a1_, a2_],
                                                            dim=1)).detach()

        target_Q = (V * self.gamma) + r
        target_Q = target_Q.clamp(-1. / (1. - self.gamma), 0.)
        loss_critic = self.loss_func(Q, target_Q)

        self.critics_optim[i_agent].zero_grad()
        loss_critic.backward()
        self.critics_optim[i_agent].step()

        # Actors
        a1 = self.actors[0](s1)
        a2 = self.actors[1](s2)
        #a2_[mask] = self.estimate_obj_action(s2[mask], s2_[mask])
        a2[mask] = self.estimate_obj_action(ag[mask], ag_2[mask])

        loss_actor = -self.critics[i_agent](s[i_agent], K.cat([a1, a2],
                                                              dim=1)).mean()

        if self.regularization:
            loss_actor += (self.actors[i_agent](s[i_agent])**2).mean() * 1

        self.actors_optim[i_agent].zero_grad()
        loss_actor.backward()
        self.actors_optim[i_agent].step()

        return loss_critic.item(), loss_actor.item()

    def update_target(self):

        soft_update(self.actors_target[0], self.actors[0], self.tau)
        soft_update(self.critics_target[0], self.critics[0], self.tau)

        soft_update(self.actors_target[1], self.actors[1], self.tau)
        soft_update(self.critics_target[1], self.critics[1], self.tau)

    def estimate_obj_action(self, state2, next_state2):
        self.backward.eval()
        with K.no_grad():
            action2 = self.backward(state2.to(self.device),
                                    next_state2.to(self.device))
        self.backward.train()

        return action2

    def update_backward(self, batch, normalizer=None):

        observation_space = self.observation_space - K.tensor(
            batch['g'], dtype=self.dtype, device=self.device).shape[1]
        action_space = self.action_space[0].shape[0]

        mask = K.tensor(tuple(
            map(lambda ai_object: ai_object > 0, K.tensor(batch['o'][:, -1]))),
                        dtype=K.uint8,
                        device=self.device)

        #s2 = K.cat([K.tensor(batch['o'], dtype=self.dtype, device=self.device)[:, observation_space:],
        #            K.tensor(batch['g'], dtype=self.dtype, device=self.device)], dim=-1)

        a2 = K.tensor(batch['u'], dtype=self.dtype,
                      device=self.device)[:, action_space:]

        #s2_ = K.cat([K.tensor(batch['o_2'], dtype=self.dtype, device=self.device)[:, observation_space:],
        #             K.tensor(batch['g'], dtype=self.dtype, device=self.device)], dim=-1)

        #if normalizer[1] is not None:
        #    s2 = normalizer[1].preprocess(s2)
        #    s2_ = normalizer[1].preprocess(s2_)

        #a2_pred = self.backward(s2[mask], s2_[mask])

        ag = K.tensor(batch['ag'], dtype=self.dtype, device=self.device)
        ag_2 = K.tensor(batch['ag_2'], dtype=self.dtype, device=self.device)

        a2_pred = self.backward(ag[mask], ag_2[mask])

        loss_backward = self.loss_func(a2_pred, a2[mask])

        self.backward_optim.zero_grad()
        loss_backward.backward()
        #K.nn.utils.clip_grad_norm_(self.forward.parameters(), 0.5)
        self.backward_optim.step()

        return loss_backward.item()
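
BackwardDyn is constructed in every example but never defined in these listings. The sketch below is a hypothetical inverse-dynamics module with the interface the code above relies on (an integer input size and a Box-like action space in the constructor, and a forward pass mapping a (state, next_state) pair to a predicted action); the hidden sizes and layer layout are assumptions, only the call signature comes from the listings.

import torch as K
import torch.nn as nn


class BackwardDynSketch(nn.Module):
    """Inverse dynamics: regress the action that moved state to next_state."""

    def __init__(self, observation_space, action_space, hidden=256):
        super().__init__()
        act_dim = action_space.shape[0]  # Box-like action space, as in the examples
        self.net = nn.Sequential(nn.Linear(2 * observation_space, hidden),
                                 nn.ReLU(),
                                 nn.Linear(hidden, hidden),
                                 nn.ReLU(),
                                 nn.Linear(hidden, act_dim))

    def forward(self, state, next_state):
        # concatenate the two observations and predict the intervening action
        return self.net(K.cat([state, next_state], dim=-1))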
Example 4
class MADDPG_BD(object):
    def __init__(self, observation_space, action_space, optimizer, Actor, Critic, loss_func, gamma, tau, out_func=K.sigmoid,
                 discrete=True, regularization=False, normalized_rewards=False, agent_id=0, object_Qfunc=None, backward_dyn=None, 
                 object_policy=None, dtype=K.float32, device="cuda"):

        super(MADDPG_BD, self).__init__()

        optimizer, lr = optimizer
        actor_lr, critic_lr = lr

        self.loss_func = loss_func
        self.gamma = gamma
        self.tau = tau
        self.out_func = out_func
        self.discrete = discrete
        self.regularization = regularization
        self.normalized_rewards = normalized_rewards
        self.dtype = dtype
        self.device = device
        self.observation_space = observation_space
        self.action_space = action_space
        self.agent_id = agent_id
        self.object_Qfunc = object_Qfunc
        self.object_policy = object_policy

        # model initialization
        self.entities = []
        
        # actors
        self.actors = []
        self.actors_target = []
        self.actors_optim = []
        
        self.actors.append(Actor(observation_space, action_space[agent_id], discrete, out_func).to(device))
        self.actors_target.append(Actor(observation_space, action_space[agent_id], discrete, out_func).to(device))
        self.actors_optim.append(optimizer(self.actors[0].parameters(), lr = actor_lr))

        hard_update(self.actors_target[0], self.actors[0])

        self.entities.extend(self.actors)
        self.entities.extend(self.actors_target)
        self.entities.extend(self.actors_optim) 
        
        # critics   
        self.critics = []
        self.critics_target = []
        self.critics_optim = []
        
        if agent_id == 0:
            critic_action_space = action_space[2]
        else:
            critic_action_space = action_space[1]

        self.critics.append(Critic(observation_space, critic_action_space).to(device))
        self.critics_target.append(Critic(observation_space, critic_action_space).to(device))
        self.critics_optim.append(optimizer(self.critics[0].parameters(), lr = critic_lr))

        hard_update(self.critics_target[0], self.critics[0])
            
        self.entities.extend(self.critics)
        self.entities.extend(self.critics_target)
        self.entities.extend(self.critics_optim)

        # backward dynamics model
        if backward_dyn is None:
            self.backward = BackwardDyn(observation_space, action_space[1]).to(device)
            self.backward_optim = optimizer(self.backward.parameters(), lr = critic_lr)
            self.entities.append(self.backward)
            self.entities.append(self.backward_optim)
        else:
            self.backward = backward_dyn
            self.backward.eval()
            self.entities.append(self.backward)

        # Learnt Q function for object
        if self.object_Qfunc is not None:
            self.object_Qfunc.eval()
            self.entities.append(self.object_Qfunc)

        # Learnt policy for object
        if self.object_policy is not None:
            self.object_policy.eval()
            self.entities.append(self.object_policy)

    def to_cpu(self):
        for entity in self.entities:
            if type(entity) != type(self.actors_optim[0]):
                entity.cpu()
        self.device = 'cpu'

    def to_cuda(self):        
        for entity in self.entities:
            if type(entity) != type(self.actors_optim[0]):
                entity.cuda()
        self.device = 'cuda'    

    def select_action(self, state, exploration=False):
        self.actors[0].eval()
        with K.no_grad():
            mu = self.actors[0](state.to(self.device))
        self.actors[0].train()
        if exploration:
            mu = K.tensor(exploration.get_noisy_action(mu.cpu().numpy()), dtype=self.dtype, device=self.device)
        mu = mu.clamp(int(self.action_space[self.agent_id].low[0]), int(self.action_space[self.agent_id].high[0]))

        return mu

    def update_parameters(self, batch, normalizer=None):

        observation_space = self.observation_space - K.tensor(batch['g'], dtype=self.dtype, device=self.device).shape[1]
        action_space = self.action_space[0].shape[0]

        V = K.zeros((len(batch['o']), 1), dtype=self.dtype, device=self.device)

        mask = K.tensor(tuple(map(lambda ai_object: ai_object==0, K.tensor(batch['o'][:,-1]))), dtype=K.uint8, device=self.device)
        
        s1 = K.cat([K.tensor(batch['o'], dtype=self.dtype, device=self.device)[:, 0:observation_space],
                    K.tensor(batch['g'], dtype=self.dtype, device=self.device)], dim=-1)
        s2 = K.cat([K.tensor(batch['o'], dtype=self.dtype, device=self.device)[:, observation_space:],
                    K.tensor(batch['g'], dtype=self.dtype, device=self.device)], dim=-1)

        a1 = K.tensor(batch['u'], dtype=self.dtype, device=self.device)[:, 0:action_space]
        a2 = K.tensor(batch['u'], dtype=self.dtype, device=self.device)[:, action_space:]

        s1_ = K.cat([K.tensor(batch['o_2'], dtype=self.dtype, device=self.device)[:, 0:observation_space],
                     K.tensor(batch['g'], dtype=self.dtype, device=self.device)], dim=-1)
        s2_ = K.cat([K.tensor(batch['o_2'], dtype=self.dtype, device=self.device)[:, observation_space:],
                     K.tensor(batch['g'], dtype=self.dtype, device=self.device)], dim=-1)
        s2__ = K.cat([K.tensor(batch['o_3'], dtype=self.dtype, device=self.device)[:, observation_space:],
                     K.tensor(batch['g'], dtype=self.dtype, device=self.device)], dim=-1)
        
        if normalizer[0] is not None:
            s1 = normalizer[0].preprocess(s1)
            s1_ = normalizer[0].preprocess(s1_)

        if normalizer[1] is not None:
            s2 = normalizer[1].preprocess(s2)
            s2_ = normalizer[1].preprocess(s2_)
            s2__ = normalizer[1].preprocess(s2__)

        s, s_, a = (s1, s1_, K.cat([a1, a2],dim=1)) if self.agent_id == 0 else (s2, s2_, a2)
        
        if self.agent_id == 0:
            a_ = self.get_obj_action(s2_)
            a_[mask] = self.estimate_obj_action(s2_[mask], s2__[mask])
            a_ = K.cat([self.actors_target[0](s_), a_],dim=1)
        else:
            a_ = self.actors_target[0](s_)

        if self.object_Qfunc is None:
            r = K.tensor(batch['r'], dtype=self.dtype, device=self.device).unsqueeze(1)
        else:
            r = self.get_obj_reward(s2, s2_, s2__)

        Q = self.critics[0](s, a)
        V = self.critics_target[0](s_, a_).detach()
        
        target_Q = (V * self.gamma) + r
        if self.object_Qfunc is None:
            target_Q = target_Q.clamp(-1./(1.-self.gamma), 0.)

        loss_critic = self.loss_func(Q, target_Q)

        self.critics_optim[0].zero_grad()
        loss_critic.backward()
        self.critics_optim[0].step()

        if self.agent_id == 0:
            a = self.get_obj_action(s2)
            a[mask] = self.estimate_obj_action(s2[mask], s2_[mask])
            a = K.cat([self.actors[0](s), a],dim=1)
        else:
            a = self.actors[0](s)

        loss_actor = -self.critics[0](s, a).mean()
        
        if self.regularization:
            loss_actor += (self.actors[0](s)**2).mean()*1

        self.actors_optim[0].zero_grad()        
        loss_actor.backward()
        self.actors_optim[0].step()
                
        return loss_critic.item(), loss_actor.item()

    def update_target(self):

        soft_update(self.actors_target[0], self.actors[0], self.tau)
        soft_update(self.critics_target[0], self.critics[0], self.tau)

    def estimate_obj_action(self, state, next_state):
        with K.no_grad():
            action = self.backward(state.to(self.device), next_state.to(self.device))

        return action

    def get_obj_action(self, state):
        with K.no_grad():
            action = self.object_policy(state.to(self.device))

        return action

    def get_obj_reward(self, state, next_state, next_next_state):
        with K.no_grad():
            action = self.backward(state.to(self.device), next_state.to(self.device))
            next_action = self.backward(next_state.to(self.device), next_next_state.to(self.device))

            #reward = self.object_Qfunc(state.to(self.device), action) - self.gamma*self.object_Qfunc(next_state.to(self.device), next_action)
            reward = self.object_Qfunc(next_state.to(self.device), next_action) - self.object_Qfunc(state.to(self.device), action)
            #reward = self.object_Qfunc(state.to(self.device), action)
        
        return reward

    def update_backward(self, batch, normalizer=None):

        observation_space = self.observation_space - K.tensor(batch['g'], dtype=self.dtype, device=self.device).shape[1]
        action_space = self.action_space[0].shape[0]
        
        s2 = K.cat([K.tensor(batch['o'], dtype=self.dtype, device=self.device)[:, observation_space:],
                    K.tensor(batch['g'], dtype=self.dtype, device=self.device)], dim=-1)

        a2 = K.tensor(batch['u'], dtype=self.dtype, device=self.device)[:, action_space:]

        s2_ = K.cat([K.tensor(batch['o_2'], dtype=self.dtype, device=self.device)[:, observation_space:],
                     K.tensor(batch['g'], dtype=self.dtype, device=self.device)], dim=-1)

        if normalizer[1] is not None:
            s2 = normalizer[1].preprocess(s2)
            s2_ = normalizer[1].preprocess(s2_)

        a2_pred = self.backward(s2, s2_)

        loss_backward = self.loss_func(a2_pred, a2)

        self.backward_optim.zero_grad()
        loss_backward.backward()
        self.backward_optim.step()

        return loss_backward.item()
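
Note that get_obj_reward in Example 4 shapes the reward differently from Examples 1 and 2: the backward model infers the object action for two consecutive transitions, and the reward is the resulting change in the object's Q-value, returned without clipping (the commented-out lines above show earlier variants that were tried):

    r(s, s', s'') = Q_obj(s', b(s', s'')) - Q_obj(s, b(s, s'))

Consistent with this, update_parameters in Example 4 only clamps the critic target when object_Qfunc is None, i.e. when the raw environment reward is used.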
Example 5
class DDPG_BD(object):
    def __init__(self,
                 observation_space,
                 action_space,
                 optimizer,
                 Actor,
                 Critic,
                 loss_func,
                 gamma,
                 tau,
                 out_func=K.sigmoid,
                 discrete=True,
                 regularization=False,
                 normalized_rewards=False,
                 object_Qfunc=None,
                 backward_dyn=None,
                 dtype=K.float32,
                 device="cuda"):

        super(DDPG_BD, self).__init__()

        optimizer, lr = optimizer
        actor_lr, critic_lr = lr

        self.loss_func = loss_func
        self.gamma = gamma
        self.tau = tau
        self.out_func = out_func
        self.discrete = discrete
        self.regularization = regularization
        self.normalized_rewards = normalized_rewards
        self.dtype = dtype
        self.device = device
        self.action_space = action_space

        # model initialization
        self.entities = []

        # actors
        self.actors = []
        self.actors_target = []
        self.actors_optim = []

        self.actors.append(
            Actor(observation_space, action_space, discrete,
                  out_func).to(device))
        self.actors_target.append(
            Actor(observation_space, action_space, discrete,
                  out_func).to(device))
        self.actors_optim.append(
            optimizer(self.actors[0].parameters(), lr=actor_lr))

        hard_update(self.actors_target[0], self.actors[0])

        self.entities.extend(self.actors)
        self.entities.extend(self.actors_target)
        self.entities.extend(self.actors_optim)

        # critics
        self.critics = []
        self.critics_target = []
        self.critics_optim = []

        self.critics.append(Critic(observation_space, action_space).to(device))
        self.critics_target.append(
            Critic(observation_space, action_space).to(device))
        self.critics_optim.append(
            optimizer(self.critics[0].parameters(), lr=critic_lr))

        hard_update(self.critics_target[0], self.critics[0])

        self.entities.extend(self.critics)
        self.entities.extend(self.critics_target)
        self.entities.extend(self.critics_optim)

        # backward dynamics model
        if backward_dyn is None:
            self.backward = BackwardDyn(observation_space,
                                        action_space).to(device)
            self.backward_optim = optimizer(self.backward.parameters(),
                                            lr=critic_lr)
            self.entities.append(self.backward)
            self.entities.append(self.backward_optim)
        else:
            self.backward = backward_dyn.to(device)
            self.backward.eval()
            self.entities.append(self.backward)

        # Learnt Q function for object
        if object_Qfunc is not None:
            self.object_Qfunc = object_Qfunc
            self.object_Qfunc.eval()
            self.entities.append(self.object_Qfunc)

    def to_cpu(self):
        for entity in self.entities:
            if type(entity) != type(self.actors_optim[0]):
                entity.cpu()
        self.device = 'cpu'

    def to_cuda(self):
        for entity in self.entities:
            if type(entity) != type(self.actors_optim[0]):
                entity.cuda()
        self.device = 'cuda'

    def select_action(self, state, exploration=False):
        self.actors[0].eval()
        with K.no_grad():
            mu = self.actors[0](state.to(self.device))
        self.actors[0].train()
        if exploration:
            mu = K.tensor(exploration.get_noisy_action(mu.cpu().numpy()),
                          dtype=self.dtype,
                          device=self.device)
        mu = mu.clamp(int(self.action_space.low[0]),
                      int(self.action_space.high[0]))

        return mu

    def update_parameters(self,
                          batch,
                          normalizer=None,
                          use_object_Qfunc=False):

        V = K.zeros((len(batch['o']), 1), dtype=self.dtype, device=self.device)

        s = K.cat([
            K.tensor(batch['o'], dtype=self.dtype, device=self.device),
            K.tensor(batch['g'], dtype=self.dtype, device=self.device)
        ],
                  dim=-1)
        a = K.tensor(batch['u'], dtype=self.dtype, device=self.device)
        r = K.tensor(batch['r'], dtype=self.dtype,
                     device=self.device).unsqueeze(1)
        s_ = K.cat([
            K.tensor(batch['o_2'], dtype=self.dtype, device=self.device),
            K.tensor(batch['g'], dtype=self.dtype, device=self.device)
        ],
                   dim=-1)
        a_ = K.zeros_like(a)

        if normalizer is not None:
            s = normalizer.preprocess(s)
            s_ = normalizer.preprocess(s_)

        Q = self.critics[0](s, a)

        a_ = self.actors_target[0](s_)
        V = self.critics_target[0](s_, a_).detach()

        if use_object_Qfunc:
            r = self.get_obj_reward(s, s_)
            target_Q = (V * self.gamma) + r
        else:
            target_Q = (V * self.gamma) + r
            target_Q = target_Q.clamp(-1. / (1. - self.gamma), 0.)

        loss_critic = self.loss_func(Q, target_Q)

        self.critics_optim[0].zero_grad()
        loss_critic.backward()
        self.critics_optim[0].step()

        a = self.actors[0](s)

        loss_actor = -self.critics[0](s, a).mean()

        if self.regularization:
            loss_actor += (self.actors[0](s)**2).mean() * 1

        self.actors_optim[0].zero_grad()
        loss_actor.backward()
        self.actors_optim[0].step()

        return loss_critic.item(), loss_actor.item()

    def update_target(self):

        soft_update(self.actors_target[0], self.actors[0], self.tau)
        soft_update(self.critics_target[0], self.critics[0], self.tau)

    def estimate_obj_action(self, state, next_state):
        with K.no_grad():
            action = self.backward(state.to(self.device),
                                   next_state.to(self.device))

        return action

    def get_obj_reward(self, state, next_state):
        with K.no_grad():
            action = self.backward(state.to(self.device),
                                   next_state.to(self.device))
            reward = self.object_Qfunc(state.to(self.device), action)

        return reward

    def update_backward(self, batch, normalizer=None):

        s = K.cat([
            K.tensor(batch['o'], dtype=self.dtype, device=self.device),
            K.tensor(batch['g'], dtype=self.dtype, device=self.device)
        ],
                  dim=-1)
        a = K.tensor(batch['u'], dtype=self.dtype, device=self.device)
        s_ = K.cat([
            K.tensor(batch['o_2'], dtype=self.dtype, device=self.device),
            K.tensor(batch['g'], dtype=self.dtype, device=self.device)
        ],
                   dim=-1)

        if normalizer is not None:
            s = normalizer.preprocess(s)
            s_ = normalizer.preprocess(s_)

        a_pred = self.backward(s, s_)

        loss_backward = self.loss_func(a_pred, a)

        self.backward_optim.zero_grad()
        loss_backward.backward()
        self.backward_optim.step()

        return loss_backward.item()
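
Across all five examples the critic is trained on the same clipped one-step TD target whenever the environment reward is used directly. Since the sparse rewards and the clipped intrinsic rewards are confined to [-1, 0], the discounted return is bounded below by -1/(1 - gamma), and the target is clamped to that range:

    target_Q = clip( r + gamma * Q_target(s', pi_target(s')), -1/(1 - gamma), 0 )

Example 5 skips this clamp when use_object_Qfunc is set, presumably because the shaped reward Q_obj(s, b(s, s')) returned by its get_obj_reward is not restricted to [-1, 0].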