class DDPGBulletAgent(DDPGAgent):
    """Vanilla DDPG agent for Bullet environments (no intrinsic motivation)."""

    def __init__(self, state_dim, action_dim, config):
        super().__init__(state_dim, action_dim, config)
        self.network = DDPGBulletNetwork(state_dim, action_dim, config)
        self.memory = ExperienceReplayBuffer(config.memory_size)
        self.algorithm = DDPG(
            self.network,
            config.actor_lr,
            config.critic_lr,
            config.gamma,
            config.tau,
            self.memory,
            config.batch_size,
        )

    def train(self, state0, action0, state1, reward, mask):
        """Store one transition, then run a single DDPG update on a sampled batch."""
        self.memory.add(state0, action0, state1, reward, mask)
        self.algorithm.train_sample(self.memory.indices(self.config.batch_size))
class DDPGBulletRNDModelAgent(DDPGAgent):
    """DDPG agent for Bullet environments with RND intrinsic motivation.

    The RND module is trained alongside the policy and is also handed to
    the DDPG algorithm so intrinsic reward can be folded into its updates.
    """

    def __init__(self, state_dim, action_dim, config):
        super().__init__(state_dim, action_dim, config)
        self.network = DDPGBulletNetworkRND(state_dim, action_dim, config)
        self.memory = ExperienceReplayBuffer(config.memory_size)
        self.motivation = RNDMotivation(
            self.network.rnd_model,
            config.motivation_lr,
            config.motivation_eta,
            self.memory,
            config.motivation_batch_size,
            config.device,
        )
        self.algorithm = DDPG(
            self.network,
            config.actor_lr,
            config.critic_lr,
            config.gamma,
            config.tau,
            self.memory,
            config.batch_size,
            self.motivation,
        )

    def train(self, state0, action0, state1, reward, mask):
        """Store one transition, update the policy, then update the RND model."""
        self.memory.add(state0, action0, state1, reward, mask)
        policy_batch = self.memory.indices(self.config.batch_size)
        self.algorithm.train_sample(policy_batch)
        motivation_batch = self.memory.indices(self.config.motivation_batch_size)
        self.motivation.train(motivation_batch)
class DDPGAerisM2SModelAgent(DDPGAgent):
    """DDPG agent for Aeris environments with M2S forward-model motivation."""

    def __init__(self, state_dim, action_dim, config):
        super().__init__(state_dim, action_dim, config)
        self.network = DDPGAerisNetworkFM(state_dim, action_dim, config)
        self.memory = ExperienceReplayBuffer(config.memory_size)
        # config.steps is expressed in millions of environment steps — TODO confirm
        total_steps = config.steps * 1e6
        self.motivation = M2SMotivation(
            self.network,
            config.forward_model_lr,
            config.forward_model_eta,
            self.memory,
            config.forward_model_batch_size,
            total_steps,
        )
        self.algorithm = DDPG(
            self.network,
            config.actor_lr,
            config.critic_lr,
            config.gamma,
            config.tau,
            self.memory,
            config.batch_size,
            self.motivation,
        )

    def train(self, state0, action0, state1, reward, mask):
        """Store one transition, update the policy, then update the forward model."""
        self.memory.add(state0, action0, state1, reward, mask)
        policy_batch = self.memory.indices(self.config.batch_size)
        self.algorithm.train_sample(policy_batch)
        fm_batch = self.memory.indices(self.config.forward_model_batch_size)
        self.motivation.train(fm_batch)
class DDPGAerisM2ModelAgent(DDPGAgent):
    """DDPG agent for Aeris environments with M2 gated motivation.

    Transitions are augmented with a "gate state" built from the intrinsic
    reward signal and its error before being stored in the M2 replay buffer.
    """

    def __init__(self, state_dim, action_dim, config):
        super().__init__(state_dim, action_dim, config)
        self.network = DDPGAerisNetworkM2(state_dim, action_dim, config)
        self.memory = M2ReplayBuffer(config.memory_size)
        self.motivation = M2Motivation(
            self.network,
            config.forward_model_lr,
            config.gamma,
            config.tau,
            config.forward_model_eta,
            self.memory,
            config.forward_model_batch_size,
        )
        self.algorithm = DDPG(
            self.network,
            config.actor_lr,
            config.critic_lr,
            config.gamma,
            config.tau,
            self.memory,
            config.batch_size,
            self.motivation,
        )

    def train(self, state0, action0, state1, im0, error0, weight, im1, error1, reward, mask):
        """Compose gate states, store the augmented transition, and run both updates."""
        gate_state0 = self.compose_gate_state(im0, error0)
        gate_state1 = self.compose_gate_state(im1, error1)
        self.memory.add(
            state0, action0, state1, gate_state0, weight, gate_state1, reward, mask
        )
        policy_batch = self.memory.indices(self.config.batch_size)
        self.algorithm.train_sample(policy_batch)
        fm_batch = self.memory.indices(self.config.forward_model_batch_size)
        self.motivation.train(fm_batch)

    def compose_gate_state(self, im, error):
        """Concatenate intrinsic reward and error along the feature axis (dim 1)."""
        parts = [im, error]
        return torch.cat(parts, dim=1)
class DDPGBulletDOPModelAgent(DDPGAgent):
    """DDPG agent for Bullet environments with a DOP exploration model.

    NOTE(review): unlike the RND/M2 agents above, the motivation object is
    NOT passed into the DDPG constructor here — it is only trained
    separately. Confirm this asymmetry is intentional.
    """

    def __init__(self, state_dim, action_dim, config):
        super().__init__(state_dim, action_dim, config)
        self.network = DDPGBulletNetworkDOP(state_dim, action_dim, config)
        self.memory = DOPReplayBuffer(config.memory_size)
        self.motivation = DOPMotivation(
            self.network.dop_model,
            config.motivation_lr,
            config.motivation_eta,
            self.memory,
            config.motivation_batch_size,
            config.device,
        )
        self.algorithm = DDPG(
            self.network,
            config.actor_lr,
            config.critic_lr,
            config.gamma,
            config.tau,
            self.memory,
            config.batch_size,
        )

    def get_action(self, state):
        """Return (action, noise, head index), each detached from the graph."""
        action = self.network.action(state)
        noise, index = self.network.noise(state, action)
        return action.detach(), noise.detach(), index.detach()

    def train(self, state0, action0, noise0, index, state1, reward, mask):
        """Store one DOP transition, update the policy, then update the DOP model."""
        self.memory.add(state0, action0, noise0, index, state1, reward, mask)
        policy_batch = self.memory.indices(self.config.batch_size)
        self.algorithm.train_sample(policy_batch)
        motivation_batch = self.memory.indices(self.config.motivation_batch_size)
        self.motivation.train(motivation_batch)