Example #1
    def __init__(self,
                 observation_space,
                 action_space,
                 optimizer,
                 Actor,
                 Critic,
                 loss_func,
                 gamma,
                 tau,
                 out_func=K.sigmoid,
                 discrete=True,
                 regularization=False,
                 normalized_rewards=False,
                 agent_id=0,
                 object_Qfunc=None,
                 backward_dyn=None,
                 object_policy=None,
                 reward_fun=None,
                 masked_with_r=False,
                 n_objects=1,
                 dtype=K.float32,
                 device="cuda"):

        super(DDPG_BD, self).__init__()

        optimizer, lr = optimizer
        actor_lr, critic_lr = lr

        self.loss_func = loss_func
        self.gamma = gamma
        self.tau = tau
        self.out_func = out_func
        self.discrete = discrete
        self.regularization = regularization
        self.normalized_rewards = normalized_rewards
        self.dtype = dtype
        self.device = device
        self.observation_space = observation_space
        self.action_space = action_space
        self.agent_id = agent_id
        self.object_Qfunc = object_Qfunc
        self.object_policy = object_policy
        self.masked_with_r = masked_with_r
        self.n_objects = n_objects

        # model initialization
        self.entities = []

        # actors
        self.actors = []
        self.actors_target = []
        self.actors_optim = []

        self.actors.append(
            Actor(observation_space, action_space[agent_id], discrete,
                  out_func).to(device))
        self.actors_target.append(
            Actor(observation_space, action_space[agent_id], discrete,
                  out_func).to(device))
        self.actors_optim.append(
            optimizer(self.actors[0].parameters(), lr=actor_lr))

        hard_update(self.actors_target[0], self.actors[0])

        self.entities.extend(self.actors)
        self.entities.extend(self.actors_target)
        self.entities.extend(self.actors_optim)

        # critics
        self.critics = []
        self.critics_target = []
        self.critics_optim = []

        self.critics.append(
            Critic(observation_space, action_space[agent_id]).to(device))
        self.critics_target.append(
            Critic(observation_space, action_space[agent_id]).to(device))
        self.critics_optim.append(
            optimizer(self.critics[0].parameters(), lr=critic_lr))

        hard_update(self.critics_target[0], self.critics[0])

        self.entities.extend(self.critics)
        self.entities.extend(self.critics_target)
        self.entities.extend(self.critics_optim)

        # backward dynamics model
        if backward_dyn is None:
            self.backward = BackwardDyn(observation_space,
                                        action_space[1]).to(device)
            self.backward_optim = optimizer(self.backward.parameters(),
                                            lr=critic_lr)
            self.entities.append(self.backward)
            self.entities.append(self.backward_optim)
        else:
            self.backward = backward_dyn
            self.backward_optim = optimizer(self.backward.parameters(),
                                            lr=critic_lr)
            self.entities.append(self.backward)
            self.entities.append(self.backward_optim)

        # Learnt Q function for object
        if self.object_Qfunc is not None:
            self.object_Qfunc_target = copy.deepcopy(self.object_Qfunc)
            self.object_Qfunc_optim = optimizer(self.object_Qfunc.parameters(),
                                                lr=critic_lr)
            self.entities.append(self.object_Qfunc)
            self.entities.append(self.object_Qfunc_target)
            self.entities.append(self.object_Qfunc_optim)

        # Learnt policy for object
        if self.object_policy is not None:
            self.object_policy_target = copy.deepcopy(self.object_policy)
            self.object_policy_optim = optimizer(
                self.object_policy.parameters(), lr=actor_lr)
            self.entities.append(self.object_policy)
            self.entities.append(self.object_policy_target)
            self.entities.append(self.object_policy_optim)

        if reward_fun is not None:
            self.get_obj_reward = reward_fun
        else:
            self.get_obj_reward = self.reward_fun

        print('clipped between -1 and 0, and masked with abs(r), and + r')
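
The constructor above expects its optimizer argument as a tuple of an optimizer class and a pair of learning rates, which it unpacks in its first two lines. A minimal instantiation sketch, assuming the repository's DDPG_BD, Actor and Critic classes are importable, with placeholder dimensions and hyperparameters:

import torch
import torch.nn.functional as F
import torch.optim as optim

# Placeholder sizes; real values come from the environment.
observation_space = 10
action_space = [4, 3]        # indexed by agent_id inside the constructor

agent = DDPG_BD(
    observation_space,
    action_space,
    optimizer=(optim.Adam, (1e-4, 1e-3)),   # (optimizer class, (actor_lr, critic_lr))
    Actor=Actor,                            # actor network class from this repository
    Critic=Critic,                          # critic network class from this repository
    loss_func=F.mse_loss,
    gamma=0.98,
    tau=0.05,
    agent_id=0,
    device="cuda" if torch.cuda.is_available() else "cpu",
)
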
Example #2
    def __init__(self, observation_space, action_space, optimizer, Actor, Critic, loss_func, gamma, tau, out_func=K.sigmoid,
                 discrete=True, regularization=False, normalized_rewards=False, agent_id=0, object_Qfunc=None, backward_dyn=None, 
                 object_policy=None, dtype=K.float32, device="cuda"):

        super(MADDPG_BD, self).__init__()

        optimizer, lr = optimizer
        actor_lr, critic_lr = lr

        self.loss_func = loss_func
        self.gamma = gamma
        self.tau = tau
        self.out_func = out_func
        self.discrete = discrete
        self.regularization = regularization
        self.normalized_rewards = normalized_rewards
        self.dtype = dtype
        self.device = device
        self.observation_space = observation_space
        self.action_space = action_space
        self.agent_id = agent_id
        self.object_Qfunc = object_Qfunc
        self.object_policy = object_policy

        # model initialization
        self.entities = []
        
        # actors
        self.actors = []
        self.actors_target = []
        self.actors_optim = []
        
        self.actors.append(Actor(observation_space, action_space[agent_id], discrete, out_func).to(device))
        self.actors_target.append(Actor(observation_space, action_space[agent_id], discrete, out_func).to(device))
        self.actors_optim.append(optimizer(self.actors[0].parameters(), lr=actor_lr))

        hard_update(self.actors_target[0], self.actors[0])

        self.entities.extend(self.actors)
        self.entities.extend(self.actors_target)
        self.entities.extend(self.actors_optim) 
        
        # critics   
        self.critics = []
        self.critics_target = []
        self.critics_optim = []
        
        if agent_id == 0:
            critic_action_space = action_space[2]
        else:
            critic_action_space = action_space[1]

        self.critics.append(Critic(observation_space, critic_action_space).to(device))
        self.critics_target.append(Critic(observation_space, critic_action_space).to(device))
        self.critics_optim.append(optimizer(self.critics[0].parameters(), lr=critic_lr))

        hard_update(self.critics_target[0], self.critics[0])
            
        self.entities.extend(self.critics)
        self.entities.extend(self.critics_target)
        self.entities.extend(self.critics_optim)

        # backward dynamics model
        if backward_dyn is None:
            self.backward = BackwardDyn(observation_space, action_space[1]).to(device)
            self.backward_optim = optimizer(self.backward.parameters(), lr=critic_lr)
            self.entities.append(self.backward)
            self.entities.append(self.backward_optim)
        else:
            self.backward = backward_dyn
            self.backward.eval()
            self.entities.append(self.backward)

        # Learnt Q function for object
        if self.object_Qfunc is not None:
            self.object_Qfunc.eval()
            self.entities.append(self.object_Qfunc)

        # Learnt policy for object
        if self.object_policy is not None:
            self.object_policy.eval()
            self.entities.append(self.object_policy)
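
Each of these constructors calls a hard_update helper to initialize the target networks as exact copies of the online networks. The helper's implementation is not part of these excerpts; a minimal sketch of the usual pattern, offered as an assumption rather than the repository's verbatim code:

def hard_update(target, source):
    # Copy every parameter of `source` into `target` so the target network
    # starts out as an exact clone of the online network.
    for target_param, source_param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(source_param.data)
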
Example #3
    def __init__(self,
                 observation_space,
                 action_space,
                 optimizer,
                 Actor,
                 Critic,
                 loss_func,
                 gamma,
                 tau,
                 out_func=K.sigmoid,
                 discrete=True,
                 regularization=False,
                 normalized_rewards=False,
                 agent_id=0,
                 object_Qfunc=None,
                 backward_dyn=None,
                 object_policy=None,
                 reward_fun=None,
                 n_objects=1,
                 clip_Q_neg=None,
                 dtype=K.float32,
                 device="cuda"):

        super(DDPG_BD, self).__init__()

        optimizer, lr = optimizer
        actor_lr, critic_lr = lr

        self.loss_func = loss_func
        self.gamma = gamma
        self.tau = tau
        self.out_func = out_func
        self.discrete = discrete
        self.regularization = regularization
        self.normalized_rewards = normalized_rewards
        self.dtype = dtype
        self.device = device
        self.observation_space = observation_space
        self.action_space = action_space
        self.agent_id = agent_id
        self.object_Qfunc = object_Qfunc
        self.object_policy = object_policy
        self.object_backward = backward_dyn
        self.n_objects = n_objects
        self.clip_Q_neg = clip_Q_neg if clip_Q_neg is not None else -1. / (
            1. - self.gamma)

        # model initialization
        self.entities = []

        # actors
        self.actors = []
        self.actors_target = []
        self.actors_optim = []

        self.actors.append(
            Actor(observation_space, action_space[agent_id], discrete,
                  out_func).to(device))
        self.actors_target.append(
            Actor(observation_space, action_space[agent_id], discrete,
                  out_func).to(device))
        self.actors_optim.append(
            optimizer(self.actors[0].parameters(), lr=actor_lr))

        hard_update(self.actors_target[0], self.actors[0])

        self.entities.extend(self.actors)
        self.entities.extend(self.actors_target)
        self.entities.extend(self.actors_optim)

        # critics
        self.critics = []
        self.critics_target = []
        self.critics_optim = []

        self.critics.append(
            Critic(observation_space, action_space[agent_id]).to(device))
        self.critics_target.append(
            Critic(observation_space, action_space[agent_id]).to(device))
        self.critics_optim.append(
            optimizer(self.critics[0].parameters(), lr=critic_lr))

        hard_update(self.critics_target[0], self.critics[0])

        for i_object in range(self.n_objects):
            self.critics.append(
                Critic(observation_space, action_space[agent_id]).to(device))
            self.critics_target.append(
                Critic(observation_space, action_space[agent_id]).to(device))
            self.critics_optim.append(
                optimizer(self.critics[i_object + 1].parameters(),
                          lr=critic_lr))

            hard_update(self.critics_target[i_object + 1],
                        self.critics[i_object + 1])

        self.entities.extend(self.critics)
        self.entities.extend(self.critics_target)
        self.entities.extend(self.critics_optim)

        # backward dynamics model for object actions
        if self.object_backward is not None:
            self.entities.append(self.object_backward)

        # Learnt Q function for object
        if self.object_Qfunc is not None:
            self.entities.append(self.object_Qfunc)

        # Learnt policy for object
        if self.object_policy is not None:
            self.entities.append(self.object_policy)

        if reward_fun is not None:
            self.get_obj_reward = reward_fun
        else:
            self.get_obj_reward = self.reward_fun

        # backward dynamics models trained by this agent
        self.backward = BackwardDyn(observation_space,
                                    action_space[0]).to(device)
        self.backward_optim = optimizer(self.backward.parameters(),
                                        lr=critic_lr)
        self.entities.append(self.backward)
        self.entities.append(self.backward_optim)

        self.backward_otw = BackwardDyn(observation_space,
                                        action_space[0]).to(device)
        self.backward_otw_optim = optimizer(self.backward_otw.parameters(),
                                            lr=critic_lr)
        self.entities.append(self.backward_otw)
        self.entities.append(self.backward_otw_optim)

        print('separate Qs')
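
Two details of this variant are worth noting. First, clip_Q_neg defaults to -1 / (1 - gamma), the discounted return of receiving a reward of -1 at every step (for example, -50 when gamma = 0.98). Second, tau is stored but never used inside the constructor itself; in DDPG-style agents it normally parameterizes the Polyak (soft) update of the target networks during training. A hedged sketch of that update, an assumption about how tau is typically used rather than code from these excerpts:

def soft_update(target, source, tau):
    # Polyak averaging: target <- (1 - tau) * target + tau * source.
    for target_param, source_param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_((1.0 - tau) * target_param.data
                                + tau * source_param.data)
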
Example #4
    def __init__(self,
                 observation_space,
                 action_space,
                 optimizer,
                 Actor,
                 Critic,
                 loss_func,
                 gamma,
                 tau,
                 out_func=K.sigmoid,
                 discrete=True,
                 regularization=False,
                 normalized_rewards=False,
                 dtype=K.float32,
                 device="cuda"):

        super(MADDPG_RAE, self).__init__()

        optimizer, lr = optimizer
        actor_lr, critic_lr = lr

        self.loss_func = loss_func
        self.gamma = gamma
        self.tau = tau
        self.out_func = out_func
        self.discrete = discrete
        self.regularization = regularization
        self.normalized_rewards = normalized_rewards
        self.dtype = dtype
        self.device = device
        self.observation_space = observation_space
        self.action_space = action_space

        # model initialization
        self.entities = []

        # actors
        self.actors = []
        self.actors_target = []
        self.actors_optim = []

        for i in range(2):
            self.actors.append(
                Actor(observation_space, action_space[i], discrete,
                      out_func).to(device))
            self.actors_target.append(
                Actor(observation_space, action_space[i], discrete,
                      out_func).to(device))
            self.actors_optim.append(
                optimizer(self.actors[i].parameters(), lr=actor_lr))

        for i in range(2):
            hard_update(self.actors_target[i], self.actors[i])

        self.entities.extend(self.actors)
        self.entities.extend(self.actors_target)
        self.entities.extend(self.actors_optim)

        # critics
        self.critics = []
        self.critics_target = []
        self.critics_optim = []

        for i in range(2):
            self.critics.append(
                Critic(observation_space, action_space[2]).to(device))
            self.critics_target.append(
                Critic(observation_space, action_space[2]).to(device))
            self.critics_optim.append(
                optimizer(self.critics[i].parameters(), lr=critic_lr))

        for i in range(2):
            hard_update(self.critics_target[i], self.critics[i])

        self.entities.extend(self.critics)
        self.entities.extend(self.critics_target)
        self.entities.extend(self.critics_optim)

        # backward dynamics model
        self.backward = BackwardDyn(3, action_space[1]).to(device)
        self.backward_optim = optimizer(self.backward.parameters(),
                                        lr=critic_lr)

        self.entities.append(self.backward)
        self.entities.append(self.backward_optim)
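
All four variants build a BackwardDyn model from an input size and an action size and optimize it with the critic learning rate, but its definition is not included in these excerpts. A hypothetical inverse-dynamics sketch consistent with that two-argument constructor; the hidden size, the forward interface, and the assumption that both arguments are plain integer dimensions are illustrative guesses:

import torch
import torch.nn as nn

class BackwardDyn(nn.Module):
    # Hypothetical inverse-dynamics model: predict the action that links a
    # state to its successor. Only the (input_dim, action_dim) constructor
    # arguments are taken from the excerpts above; everything else is a guess.
    def __init__(self, input_dim, action_dim, hidden_dim=256):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(2 * input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, action_dim),
        )

    def forward(self, state, next_state):
        # Concatenate consecutive states and regress the intervening action.
        return self.net(torch.cat([state, next_state], dim=-1))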