def __init__(self, state_dim, action_dim, config):
    super().__init__(state_dim, action_dim, config)
    self.network = DDPGBulletNetworkSURND(state_dim, action_dim, config)
    self.memory = ExperienceReplayBuffer(config.memory_size)
    self.motivation = MetaCriticRNDMotivation(self.network.metacritic_model, config.motivation_lr, config.motivation_variant,
                                              config.motivation_eta, self.memory, config.motivation_batch_size, config.device)
    self.algorithm = DDPG(self.network, config.actor_lr, config.critic_lr, config.gamma, config.tau,
                          self.memory, config.batch_size, self.motivation)
def __init__(self, state_dim, action_dim, config):
    super().__init__(state_dim, action_dim, config)
    self.network = DDPGAerisNetworkFIM(state_dim, action_dim, config)
    self.memory = ExperienceReplayBuffer(config.memory_size)
    self.motivation = ForwardInverseModelMotivation(self.network.forward_model, config.forward_model_lr,
                                                    self.network.inverse_model, config.forward_model_lr,
                                                    0.5, config.forward_model_eta, config.forward_model_variant,
                                                    self.memory, config.forward_model_batch_size, config.device)
    self.algorithm = DDPG(self.network, config.actor_lr, config.critic_lr, config.gamma, config.tau,
                          self.memory, config.batch_size, self.motivation)
class DDPGBulletAgent(DDPGAgent):
    """Baseline DDPG agent for the Bullet environments (extrinsic reward only)."""

    def __init__(self, state_dim, action_dim, config):
        super().__init__(state_dim, action_dim, config)
        self.network = DDPGBulletNetwork(state_dim, action_dim, config)
        self.memory = ExperienceReplayBuffer(config.memory_size)
        self.algorithm = DDPG(self.network, config.actor_lr, config.critic_lr, config.gamma, config.tau,
                              self.memory, config.batch_size)

    def train(self, state0, action0, state1, reward, mask):
        # store the transition, then update actor and critic on a sampled minibatch
        self.memory.add(state0, action0, state1, reward, mask)
        indices = self.memory.indices(self.config.batch_size)
        self.algorithm.train_sample(indices)
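# --- Illustration only: the DDPG object above consumes actor_lr, critic_lr, gamma and tau
# from the config. This repo's DDPG implementation is not shown here, so the sketch below
# is a generic reminder of what tau and gamma usually do in DDPG: tau is the coefficient
# of the soft (Polyak) update of the target networks, gamma discounts the bootstrapped
# critic target. Function names are made up for the sketch.
import torch


def soft_update(target_net, online_net, tau):
    # target <- tau * online + (1 - tau) * target, applied parameter-wise
    for t_param, o_param in zip(target_net.parameters(), online_net.parameters()):
        t_param.data.copy_(tau * o_param.data + (1.0 - tau) * t_param.data)


def critic_target(reward, mask, next_q, gamma):
    # y = r + gamma * mask * Q_target(s', pi_target(s')); mask is assumed to be 0 at terminal states
    return reward + gamma * mask * next_q


online, target = torch.nn.Linear(4, 1), torch.nn.Linear(4, 1)
target.load_state_dict(online.state_dict())
soft_update(target, online, tau=0.005)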
class DDPGBulletRNDModelAgent(DDPGAgent):
    """DDPG agent augmented with a Random Network Distillation (RND) intrinsic reward."""

    def __init__(self, state_dim, action_dim, config):
        super().__init__(state_dim, action_dim, config)
        self.network = DDPGBulletNetworkRND(state_dim, action_dim, config)
        self.memory = ExperienceReplayBuffer(config.memory_size)
        self.motivation = RNDMotivation(self.network.rnd_model, config.motivation_lr, config.motivation_eta,
                                        self.memory, config.motivation_batch_size, config.device)
        self.algorithm = DDPG(self.network, config.actor_lr, config.critic_lr, config.gamma, config.tau,
                              self.memory, config.batch_size, self.motivation)

    def train(self, state0, action0, state1, reward, mask):
        self.memory.add(state0, action0, state1, reward, mask)
        # the DDPG update and the RND predictor update use independently sampled minibatches
        self.algorithm.train_sample(self.memory.indices(self.config.batch_size))
        self.motivation.train(self.memory.indices(self.config.motivation_batch_size))
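# --- Illustration only: a minimal, self-contained sketch of the Random Network Distillation
# idea behind network.rnd_model / RNDMotivation. A frozen random target network and a trained
# predictor network; the predictor's error on a state serves as the intrinsic reward. The
# actual RNDMotivation class in this repo may scale the error by eta and normalise it; the
# class and dimensions below are invented for the sketch.
import torch
import torch.nn as nn


class TinyRND(nn.Module):
    def __init__(self, state_dim, feature_dim=32):
        super().__init__()
        self.target = nn.Sequential(nn.Linear(state_dim, feature_dim), nn.ReLU(), nn.Linear(feature_dim, feature_dim))
        self.predictor = nn.Sequential(nn.Linear(state_dim, feature_dim), nn.ReLU(), nn.Linear(feature_dim, feature_dim))
        for p in self.target.parameters():
            p.requires_grad = False  # the target stays random and fixed

    def error(self, state):
        # per-sample prediction error, usable as a curiosity reward
        return (self.predictor(state) - self.target(state)).pow(2).mean(dim=1, keepdim=True)


rnd = TinyRND(state_dim=8)
optimizer = torch.optim.Adam(rnd.predictor.parameters(), lr=1e-4)
states = torch.randn(16, 8)
intrinsic_reward = rnd.error(states).detach()
loss = rnd.error(states).mean()  # training the predictor shrinks the reward for familiar states
optimizer.zero_grad()
loss.backward()
optimizer.step()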
class DDPGAerisM2SModelAgent(DDPGAgent):
    def __init__(self, state_dim, action_dim, config):
        super().__init__(state_dim, action_dim, config)
        self.network = DDPGAerisNetworkFM(state_dim, action_dim, config)
        self.memory = ExperienceReplayBuffer(config.memory_size)
        self.motivation = M2SMotivation(self.network, config.forward_model_lr, config.forward_model_eta,
                                        self.memory, config.forward_model_batch_size, config.steps * 1e6)
        self.algorithm = DDPG(self.network, config.actor_lr, config.critic_lr, config.gamma, config.tau,
                              self.memory, config.batch_size, self.motivation)

    def train(self, state0, action0, state1, reward, mask):
        self.memory.add(state0, action0, state1, reward, mask)
        self.algorithm.train_sample(self.memory.indices(self.config.batch_size))
        self.motivation.train(self.memory.indices(self.config.forward_model_batch_size))
class DDPGAerisM2ModelAgent(DDPGAgent):
    def __init__(self, state_dim, action_dim, config):
        super().__init__(state_dim, action_dim, config)
        self.network = DDPGAerisNetworkM2(state_dim, action_dim, config)
        self.memory = M2ReplayBuffer(config.memory_size)
        self.motivation = M2Motivation(self.network, config.forward_model_lr, config.gamma, config.tau,
                                       config.forward_model_eta, self.memory, config.forward_model_batch_size)
        self.algorithm = DDPG(self.network, config.actor_lr, config.critic_lr, config.gamma, config.tau,
                              self.memory, config.batch_size, self.motivation)

    def train(self, state0, action0, state1, im0, error0, weight, im1, error1, reward, mask):
        # the gate state combines the intrinsic-motivation signal with the forward-model error
        gate_state0 = self.compose_gate_state(im0, error0)
        gate_state1 = self.compose_gate_state(im1, error1)
        self.memory.add(state0, action0, state1, gate_state0, weight, gate_state1, reward, mask)
        self.algorithm.train_sample(self.memory.indices(self.config.batch_size))
        self.motivation.train(self.memory.indices(self.config.forward_model_batch_size))

    def compose_gate_state(self, im, error):
        return torch.cat([im, error], dim=1)
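# --- Illustration only: compose_gate_state concatenates the motivation signal and the
# forward-model error along the feature dimension, so the gate sees both at once. A toy
# shape check with hypothetical 1-dimensional signals and a batch of 4:
import torch

im = torch.randn(4, 1)      # stand-in for the motivation signal
error = torch.randn(4, 1)   # stand-in for the forward-model error
gate_state = torch.cat([im, error], dim=1)
assert gate_state.shape == (4, 2)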
class DDPGBulletDOPModelAgent(DDPGAgent):
    def __init__(self, state_dim, action_dim, config):
        super().__init__(state_dim, action_dim, config)
        self.network = DDPGBulletNetworkDOP(state_dim, action_dim, config)
        self.memory = DOPReplayBuffer(config.memory_size)
        self.motivation = DOPMotivation(self.network.dop_model, config.motivation_lr, config.motivation_eta,
                                        self.memory, config.motivation_batch_size, config.device)
        self.algorithm = DDPG(self.network, config.actor_lr, config.critic_lr, config.gamma, config.tau,
                              self.memory, config.batch_size)

    def get_action(self, state):
        # the DOP network returns, alongside the action, a noise tensor and a selection index;
        # both are stored in the replay buffer and consumed by DOPMotivation
        action = self.network.action(state)
        noise, index = self.network.noise(state, action)
        return action.detach(), noise.detach(), index.detach()

    def train(self, state0, action0, noise0, index, state1, reward, mask):
        self.memory.add(state0, action0, noise0, index, state1, reward, mask)
        self.algorithm.train_sample(self.memory.indices(self.config.batch_size))
        self.motivation.train(self.memory.indices(self.config.motivation_batch_size))
def __init__(self, network, lr, gamma, tau, eta, memory, sample_size):
    self.network = network
    self._optimizer = torch.optim.Adam(self.network.forward_model.parameters(), lr=lr)
    self.eta = eta
    self._memory = memory
    self._sample_size = sample_size
    self._weight = torch.tensor([[0.9, 0.1]], dtype=torch.float32)
    self.gate_algorithm = DDPG(self.network.gate, lr, lr, gamma, tau, memory, 32)
def __init__(self, state_dim, action_dim, config):
    super().__init__(state_dim, action_dim, config)
    self.network = DDPGAerisNetworkQRND(state_dim, action_dim, config)
    self.memory = ExperienceReplayBuffer(config.memory_size)
    self.motivation = QRNDMotivation(self.network.qrnd_model, config.forward_model_lr, config.forward_model_eta, self.memory)
    self.algorithm = DDPG(self.network, config.actor_lr, config.critic_lr, config.gamma, config.tau,
                          self.memory, config.batch_size, self.motivation)
int(time.time()))).make()
else:
    environment = gym.make(args.env)
    is_env_pool = False

tf.reset_default_graph()
with tf.device(device):
    if args.eval:
        replay_memory = None
    else:
        replay_memory = Replay_Memory(memory_size=args.memory_size)

    if args.model == 'DDPG':
        agent = DDPG(environment, args.hidden_dims, replay_memory=replay_memory, gamma=args.gamma,
                     actor_lr=args.actor_lr, critic_lr=args.critic_lr, tau=args.tau, N=args.N)
    elif args.model == 'QRDDPG':
        agent = QRDDPG(environment, args.hidden_dims, replay_memory=replay_memory, gamma=args.gamma,
                       actor_lr=args.actor_lr, critic_lr=args.critic_lr, tau=args.tau, N=args.N,
                       kappa=args.kappa, n_quantile=args.n_quantile)
    elif args.model == 'D3PG':
import numpy as np
import time

MAX_EPISODES = 900
MAX_EP_STEPS = 200
ON_TRAIN = False

# set env
env = ArmEnv()
s_dim = env.state_dim
a_dim = env.action_dim
a_bound = env.action_bound

# set RL method
pd = PD()
rl = DDPG(a_dim, s_dim, a_bound)

steps = []


def runpd():
    for i in range(MAX_EPISODES):
        s = env.reset()
        ep_r = 0.
        for j in range(MAX_EP_STEPS):
            a = pd.cal(s, np.array([0, 0, -4, 0, 0]))
            s, r, done, safe = env.step(a)
            if done or j == MAX_EP_STEPS - 1 or safe is False:
                print('Ep: %i | %s | %s | ep_r: %.1f | step: %i' % (i, '---' if not done else 'done',
import torch


class M3Motivation:
    def __init__(self, gate, critic, gate_lr, critic_lr, gamma, tau, memory_buffer, sample_size, forward_model, meta_critic):
        self._gate = gate
        self._ddpg = DDPG(gate, critic, gate_lr, critic_lr, gamma, tau, memory_buffer, sample_size)
        self._forward_model = forward_model
        self._meta_critic = meta_critic

    def train(self, state0, action0, state1, action1, reward, done):
        # augment the state with the meta-critic's error estimate before training the gate policy
        error_estimate0 = self._meta_critic.error(state0, action0)
        error_estimate1 = self._meta_critic.error(state1, action1)
        m3_state0 = torch.cat([state0, error_estimate0.expand_as(state0)])
        m3_state1 = torch.cat([state1, error_estimate1.expand_as(state1)])
        m3_action = self._gate(m3_state0).detach()
        self._meta_critic.train(state0, action0, state1)
        self._ddpg.train(m3_state0, m3_action, m3_state1, reward, done)

    def reward(self, state0, action, state1):
        error = self._forward_model.error(state0, action, state1)
        error_estimate = self._meta_critic.error(state0, action)
        m3_state0 = torch.cat([state0, error_estimate.expand_as(state0)], dim=-1)
        # the gate outputs a softmax weighting over the four intrinsic reward terms below
        gate = torch.softmax(self._gate(m3_state0).detach(), dim=-1)
        rewards = [
            self._curious_reward(error, error_estimate),
            self._familiar_reward(error, error_estimate),
            self._surprise_reward(error, error_estimate),
            self._predictive_penalty(error, error_estimate)
        ]
        rewards = torch.stack(rewards).squeeze(state0.ndim)
        if state0.ndim == 2:
            rewards = rewards.T
        reward = torch.sum(rewards * gate, state0.ndim - 1).unsqueeze(state0.ndim - 1)
        return reward

    def get_forward_model(self):
        return self._forward_model

    def get_metacritic(self):
        return self._meta_critic

    @staticmethod
    def _curious_reward(error, error_estimate):
        return torch.tanh(error)

    @staticmethod
    def _familiar_reward(error, error_estimate):
        return torch.tanh(1 / error)

    @staticmethod
    def _surprise_reward(error, error_estimate):
        sigma = 1e-2
        mask = torch.gt(torch.abs(error - error_estimate), torch.ones_like(error) * sigma).type(torch.float32)
        return torch.tanh(error / error_estimate + error_estimate / error - 2) * mask

    @staticmethod
    def _predictive_penalty(error, error_estimate):
        return torch.tanh(error_estimate - error)
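# --- Illustration only: a self-contained numeric sketch of how M3Motivation.reward mixes
# its four intrinsic terms in the batched (2-D state) case. The gate produces a softmax
# weighting over (curious, familiar, surprise, predictive-penalty) rewards computed from
# the forward-model error and the meta-critic's error estimate. The tensors below are
# dummies standing in for the repo's model outputs.
import torch

error = torch.rand(5, 1) + 1e-3           # stand-in for forward_model.error(state0, action, state1)
error_estimate = torch.rand(5, 1) + 1e-3  # stand-in for meta_critic.error(state0, action)
gate = torch.softmax(torch.randn(5, 4), dim=-1)  # stand-in for softmax(gate(m3_state0))

sigma = 1e-2
mask = torch.gt(torch.abs(error - error_estimate), torch.ones_like(error) * sigma).type(torch.float32)
rewards = torch.cat([
    torch.tanh(error),                                                        # curious
    torch.tanh(1 / error),                                                    # familiar
    torch.tanh(error / error_estimate + error_estimate / error - 2) * mask,   # surprise
    torch.tanh(error_estimate - error),                                       # predictive penalty
], dim=1)                                                                     # shape (5, 4)
reward = torch.sum(rewards * gate, dim=1, keepdim=True)                       # shape (5, 1)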