Example 1
def __init__(self, gate, critic, gate_lr, critic_lr, gamma, tau,
             memory_buffer, sample_size, forward_model, meta_critic):
    self._gate = gate
    self._ddpg = DDPG(gate, critic, gate_lr, critic_lr, gamma, tau,
                      memory_buffer, sample_size)
    self._forward_model = forward_model
    self._meta_critic = meta_critic
Example 2
def __init__(self, state_dim, action_dim, config):
    super().__init__(state_dim, action_dim, config)
    self.network = DDPGBulletNetworkSURND(state_dim, action_dim, config)
    self.memory = ExperienceReplayBuffer(config.memory_size)
    self.motivation = MetaCriticRNDMotivation(self.network.metacritic_model, config.motivation_lr, config.motivation_variant,
                                              config.motivation_eta, self.memory, config.motivation_batch_size, config.device)
    self.algorithm = DDPG(self.network, config.actor_lr, config.critic_lr, config.gamma, config.tau, self.memory, config.batch_size, self.motivation)
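
Example 2 and the agent examples that follow all construct DDPG from a combined actor-critic network, the two learning rates, the discount and soft-update factors, a replay buffer, a batch size, and an optional intrinsic-motivation module. The stub below is a minimal sketch of the constructor signature inferred from those call sites; the parameter names and the motivation default are assumptions, not the original implementation.

class DDPG:
    """Sketch of the constructor inferred from the call sites in these examples (assumed names)."""

    def __init__(self, network, actor_lr, critic_lr, gamma, tau,
                 memory, batch_size, motivation=None):
        self.network = network        # combined actor/critic container
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr
        self.gamma = gamma            # discount factor
        self.tau = tau                # soft target-update coefficient
        self.memory = memory          # replay buffer shared with the agent
        self.batch_size = batch_size
        self.motivation = motivation  # optional intrinsic-reward module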
Example 3
def __init__(self, state_dim, action_dim, config):
    super().__init__(state_dim, action_dim, config)
    self.network = DDPGAerisNetworkFIM(state_dim, action_dim, config)
    self.memory = ExperienceReplayBuffer(config.memory_size)
    self.motivation = ForwardInverseModelMotivation(self.network.forward_model, config.forward_model_lr,
                                                    self.network.inverse_model, config.forward_model_lr,
                                                    0.5, config.forward_model_eta, config.forward_model_variant,
                                                    self.memory, config.forward_model_batch_size, config.device)
    self.algorithm = DDPG(self.network, config.actor_lr, config.critic_lr, config.gamma, config.tau, self.memory, config.batch_size, self.motivation)
Example 4
class DDPGBulletAgent(DDPGAgent):
    def __init__(self, state_dim, action_dim, config):
        super().__init__(state_dim, action_dim, config)
        self.network = DDPGBulletNetwork(state_dim, action_dim, config)
        self.memory = ExperienceReplayBuffer(config.memory_size)
        self.algorithm = DDPG(self.network, config.actor_lr, config.critic_lr, config.gamma, config.tau, self.memory, config.batch_size)

    def train(self, state0, action0, state1, reward, mask):
        self.memory.add(state0, action0, state1, reward, mask)
        indices = self.memory.indices(self.config.batch_size)
        self.algorithm.train_sample(indices)
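
For context, the sketch below shows how an agent like the one in Example 4 might be driven from an environment loop. The gym-style environment and the agent's get_action method are assumptions; only train() appears in the snippet above.

def run_episode(env, agent, max_steps=1000):
    # Sketch only: assumes a gym-style env and that the agent exposes
    # get_action(); neither is part of the snippet above.
    state0 = env.reset()
    for _ in range(max_steps):
        action0 = agent.get_action(state0)                 # assumed agent API
        state1, reward, done, _ = env.step(action0)
        mask = 0.0 if done else 1.0                        # terminal mask consumed by train()
        agent.train(state0, action0, state1, reward, mask)
        state0 = state1
        if done:
            break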
Example 5
class DDPGBulletRNDModelAgent(DDPGAgent):
    def __init__(self, state_dim, action_dim, config):
        super().__init__(state_dim, action_dim, config)
        self.network = DDPGBulletNetworkRND(state_dim, action_dim, config)
        self.memory = ExperienceReplayBuffer(config.memory_size)
        self.motivation = RNDMotivation(self.network.rnd_model, config.motivation_lr, config.motivation_eta, self.memory, config.motivation_batch_size, config.device)
        self.algorithm = DDPG(self.network, config.actor_lr, config.critic_lr, config.gamma, config.tau, self.memory, config.batch_size, self.motivation)

    def train(self, state0, action0, state1, reward, mask):
        self.memory.add(state0, action0, state1, reward, mask)
        self.algorithm.train_sample(self.memory.indices(self.config.batch_size))
        self.motivation.train(self.memory.indices(self.config.motivation_batch_size))
Example 6
class DDPGAerisM2SModelAgent(DDPGAgent):
    def __init__(self, state_dim, action_dim, config):
        super().__init__(state_dim, action_dim, config)
        self.network = DDPGAerisNetworkFM(state_dim, action_dim, config)
        self.memory = ExperienceReplayBuffer(config.memory_size)
        self.motivation = M2SMotivation(self.network, config.forward_model_lr, config.forward_model_eta, self.memory, config.forward_model_batch_size, config.steps * 1e6)
        self.algorithm = DDPG(self.network, config.actor_lr, config.critic_lr, config.gamma, config.tau, self.memory, config.batch_size, self.motivation)

    def train(self, state0, action0, state1, reward, mask):
        self.memory.add(state0, action0, state1, reward, mask)
        self.algorithm.train_sample(self.memory.indices(self.config.batch_size))
        self.motivation.train(self.memory.indices(self.config.forward_model_batch_size))
Example 7
class DDPGAerisM2ModelAgent(DDPGAgent):
    def __init__(self, state_dim, action_dim, config):
        super().__init__(state_dim, action_dim, config)
        self.network = DDPGAerisNetworkM2(state_dim, action_dim, config)
        self.memory = M2ReplayBuffer(config.memory_size)
        self.motivation = M2Motivation(self.network, config.forward_model_lr, config.gamma, config.tau, config.forward_model_eta, self.memory, config.forward_model_batch_size)
        self.algorithm = DDPG(self.network, config.actor_lr, config.critic_lr, config.gamma, config.tau, self.memory, config.batch_size, self.motivation)

    def train(self, state0, action0, state1, im0, error0, weight, im1, error1, reward, mask):
        gate_state0 = self.compose_gate_state(im0, error0)
        gate_state1 = self.compose_gate_state(im1, error1)
        self.memory.add(state0, action0, state1, gate_state0, weight, gate_state1, reward, mask)
        self.algorithm.train_sample(self.memory.indices(self.config.batch_size))
        self.motivation.train(self.memory.indices(self.config.forward_model_batch_size))

    def compose_gate_state(self, im, error):
        return torch.cat([im, error], dim=1)
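
A quick shape check of compose_gate_state from Example 7: it concatenates the intrinsic-motivation signal and the error estimate along the feature dimension. The tensor sizes below are arbitrary illustrations.

import torch

im = torch.zeros(4, 2)       # batch of 4 gate inputs (sizes are illustrative)
error = torch.zeros(4, 1)    # batch of 4 scalar error estimates
gate_state = torch.cat([im, error], dim=1)
print(gate_state.shape)      # torch.Size([4, 3])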
Example 8
class DDPGBulletDOPModelAgent(DDPGAgent):
    def __init__(self, state_dim, action_dim, config):
        super().__init__(state_dim, action_dim, config)
        self.network = DDPGBulletNetworkDOP(state_dim, action_dim, config)
        self.memory = DOPReplayBuffer(config.memory_size)
        self.motivation = DOPMotivation(self.network.dop_model, config.motivation_lr, config.motivation_eta, self.memory, config.motivation_batch_size, config.device)
        self.algorithm = DDPG(self.network, config.actor_lr, config.critic_lr, config.gamma, config.tau, self.memory, config.batch_size)

    def get_action(self, state):
        action = self.network.action(state)
        noise, index = self.network.noise(state, action)
        return action.detach(), noise.detach(), index.detach()

    def train(self, state0, action0, noise0, index, state1, reward, mask):
        self.memory.add(state0, action0, noise0, index, state1, reward, mask)
        self.algorithm.train_sample(self.memory.indices(self.config.batch_size))
        self.motivation.train(self.memory.indices(self.config.motivation_batch_size))
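
Below is a hedged sketch of an interaction loop for the DOP agent in Example 8, whose get_action returns an (action, noise, index) triple that train() later stores. The gym-style environment and the way the noise is applied to the action are assumptions.

def run_dop_episode(env, agent, max_steps=1000):
    # Sketch only: a gym-style env is assumed, and adding the noise to the
    # action for exploration is an assumption not shown in the snippet.
    state0 = env.reset()
    for _ in range(max_steps):
        action0, noise0, index = agent.get_action(state0)
        state1, reward, done, _ = env.step(action0 + noise0)
        mask = 0.0 if done else 1.0
        agent.train(state0, action0, noise0, index, state1, reward, mask)
        state0 = state1
        if done:
            break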
Example 9
    def __init__(self, network, lr, gamma, tau, eta, memory, sample_size):
        self.network = network
        self._optimizer = torch.optim.Adam(self.network.forward_model.parameters(), lr=lr)
        self.eta = eta
        self._memory = memory
        self._sample_size = sample_size
        self._weight = torch.tensor([[0.9, 0.1]], dtype=torch.float32)

        self.gate_algorithm = DDPG(self.network.gate, lr, lr, gamma, tau, memory, 32)
Example 10
def __init__(self, state_dim, action_dim, config):
    super().__init__(state_dim, action_dim, config)
    self.network = DDPGBulletNetwork(state_dim, action_dim, config)
    self.memory = ExperienceReplayBuffer(config.memory_size)
    self.algorithm = DDPG(self.network, config.actor_lr, config.critic_lr, config.gamma, config.tau, self.memory, config.batch_size)
Example 11
def __init__(self, state_dim, action_dim, config):
    super().__init__(state_dim, action_dim, config)
    self.network = DDPGAerisNetworkQRND(state_dim, action_dim, config)
    self.memory = ExperienceReplayBuffer(config.memory_size)
    self.motivation = QRNDMotivation(self.network.qrnd_model, config.forward_model_lr, config.forward_model_eta, self.memory)
    self.algorithm = DDPG(self.network, config.actor_lr, config.critic_lr, config.gamma, config.tau, self.memory, config.batch_size, self.motivation)
Example 12
def __init__(self, state_dim, action_dim, config):
    super().__init__(state_dim, action_dim, config)
    self.network = DDPGBulletNetworkDOP(state_dim, action_dim, config)
    self.memory = DOPReplayBuffer(config.memory_size)
    self.motivation = DOPMotivation(self.network.dop_model, config.motivation_lr, config.motivation_eta, self.memory, config.motivation_batch_size, config.device)
    self.algorithm = DDPG(self.network, config.actor_lr, config.critic_lr, config.gamma, config.tau, self.memory, config.batch_size)
Example 13
                                                int(time.time()))).make()
    else:
        environment = gym.make(args.env)
        is_env_pool = False

    tf.reset_default_graph()
    with tf.device(device):
        if args.eval:
            replay_memory = None
        else:
            replay_memory = Replay_Memory(memory_size=args.memory_size)
        if args.model == 'DDPG':
            agent = DDPG(environment,
                         args.hidden_dims,
                         replay_memory=replay_memory,
                         gamma=args.gamma,
                         actor_lr=args.actor_lr,
                         critic_lr=args.critic_lr,
                         tau=args.tau,
                         N=args.N)
        elif args.model == 'QRDDPG':
            agent = QRDDPG(environment,
                           args.hidden_dims,
                           replay_memory=replay_memory,
                           gamma=args.gamma,
                           actor_lr=args.actor_lr,
                           critic_lr=args.critic_lr,
                           tau=args.tau,
                           N=args.N,
                           kappa=args.kappa,
                           n_quantile=args.n_quantile)
        elif args.model == 'D3PG':
Example 14
import numpy as np
import time

MAX_EPISODES = 900
MAX_EP_STEPS = 200
ON_TRAIN = False

# set env
env = ArmEnv()
s_dim = env.state_dim
a_dim = env.action_dim
a_bound = env.action_bound

# set RL method
pd = PD()
rl = DDPG(a_dim, s_dim, a_bound)

steps = []


def runpd():
    for i in range(MAX_EPISODES):
        s = env.reset()
        ep_r = 0.
        for j in range(MAX_EP_STEPS):

            a = pd.cal(s, np.array([0, 0, -4, 0, 0]))
            s, r, done, safe = env.step(a)
            if done or j == MAX_EP_STEPS - 1 or safe is False:
                print('Ep: %i | %s | %s | ep_r: %.1f | step: %i' %
                      (i, '---' if not done else 'done',
Example 15
class M3Motivation:
    def __init__(self, gate, critic, gate_lr, critic_lr, gamma, tau,
                 memory_buffer, sample_size, forward_model, meta_critic):
        self._gate = gate
        self._ddpg = DDPG(gate, critic, gate_lr, critic_lr, gamma, tau,
                          memory_buffer, sample_size)
        self._forward_model = forward_model
        self._meta_critic = meta_critic

    def train(self, state0, action0, state1, action1, reward, done):
        error_estimate0 = self._meta_critic.error(state0, action0)
        error_estimate1 = self._meta_critic.error(state1, action1)
        m3_state0 = torch.cat([state0, error_estimate0.expand_as(state0)])
        m3_state1 = torch.cat([state1, error_estimate1.expand_as(state1)])
        m3_action = self._gate(m3_state0).detach()

        self._meta_critic.train(state0, action0, state1)
        self._ddpg.train(m3_state0, m3_action, m3_state1, reward, done)

    def reward(self, state0, action, state1):
        error = self._forward_model.error(state0, action, state1)
        error_estimate = self._meta_critic.error(state0, action)
        m3_state0 = torch.cat(
            [state0, error_estimate.expand_as(state0)], dim=-1)
        gate = torch.softmax(self._gate(m3_state0).detach(), dim=-1)

        rewards = [
            self._curious_reward(error, error_estimate),
            self._familiar_reward(error, error_estimate),
            self._surprise_reward(error, error_estimate),
            self._predictive_penalty(error, error_estimate)
        ]
        rewards = torch.stack(rewards).squeeze(state0.ndim)
        if state0.ndim == 2:
            rewards = rewards.T
        reward = torch.sum(rewards * gate,
                           state0.ndim - 1).unsqueeze(state0.ndim - 1)

        return reward

    def get_forward_model(self):
        return self._forward_model

    def get_metacritic(self):
        return self._meta_critic

    @staticmethod
    def _curious_reward(error, error_estimate):
        return torch.tanh(error)

    @staticmethod
    def _familiar_reward(error, error_estimate):
        return torch.tanh(1 / error)

    @staticmethod
    def _surprise_reward(error, error_estimate):
        sigma = 1e-2
        mask = torch.gt(torch.abs(error - error_estimate),
                        torch.ones_like(error) * sigma).type(torch.float32)
        return torch.tanh(error / error_estimate + error_estimate / error -
                          2) * mask

    @staticmethod
    def _predictive_penalty(error, error_estimate):
        return torch.tanh(error_estimate - error)
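
The four static reward heads at the end of Example 15 can be probed directly. The snippet below is a small illustrative check with arbitrary error values; it assumes the M3Motivation class above is in scope.

import torch

error = torch.tensor([0.5])           # forward-model prediction error (arbitrary sample value)
error_estimate = torch.tensor([0.1])  # meta-critic estimate of that error (arbitrary sample value)

print(M3Motivation._curious_reward(error, error_estimate))      # grows with the raw error
print(M3Motivation._familiar_reward(error, error_estimate))     # grows as the error shrinks
print(M3Motivation._surprise_reward(error, error_estimate))     # non-zero only when |error - error_estimate| > 1e-2
print(M3Motivation._predictive_penalty(error, error_estimate))  # positive when the error is over-estimated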