Example #1
class DDPGBulletForwardModelAgent(DDPGAgent):
    def __init__(self, state_dim, action_dim, config):
        super().__init__(state_dim, action_dim, config)
        # Actor-critic network extended with a forward-model head.
        self.network = DDPGBulletNetworkFM(state_dim, action_dim, config)
        self.memory = ExperienceReplayBuffer(config.memory_size)
        # Intrinsic motivation: the forward model's prediction error
        # serves as an exploration bonus.
        self.motivation = ForwardModelMotivation(self.network.forward_model,
                                                 config.motivation_lr,
                                                 config.motivation_eta,
                                                 config.motivation_variant,
                                                 self.memory, config.device)
        self.algorithm = DDPG(self.network, config.actor_lr, config.critic_lr,
                              config.gamma, config.tau, self.memory,
                              config.batch_size, self.motivation)

    def train(self, state0, action0, state1, reward, mask):
        # Store the transition, then update the DDPG networks and the
        # forward model on independently sampled mini-batches.
        self.memory.add(state0, action0, state1, reward, mask)
        self.algorithm.train_sample(self.memory.indices(self.config.batch_size))
        self.motivation.train(self.memory.indices(self.config.motivation_batch_size))
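
For context, a minimal driving loop for this agent might look like the sketch below; the gym-style environment, the get_action helper, and config.steps are assumptions for illustration only, not part of the example above.

# Hypothetical driving loop; env, agent.get_action and config.steps
# are assumptions, not part of the example above.
agent = DDPGBulletForwardModelAgent(state_dim, action_dim, config)

state0 = env.reset()
for step in range(config.steps):
    action0 = agent.get_action(state0)           # assumed actor API
    state1, reward, done, _ = env.step(action0)
    mask = 0.0 if done else 1.0                  # 0 marks a terminal transition
    agent.train(state0, action0, state1, reward, mask)
    state0 = env.reset() if done else state1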
Example #2
class PPOAtariForwardModelAgent(PPOAgent):
    def __init__(self, input_shape, action_dim, config, action_type):
        # The parent constructor is expected to set up self.memory.
        super().__init__(input_shape, action_dim, config)
        # PPO network extended with a forward-model head.
        self.network = PPOAtariNetworkFM(input_shape,
                                         action_dim,
                                         config,
                                         head=action_type).to(config.device)
        # Intrinsic motivation: the forward model's prediction error
        # serves as an exploration bonus for PPO.
        self.motivation = ForwardModelMotivation(self.network.forward_model,
                                                 config.forward_model_lr,
                                                 config.forward_model_eta,
                                                 config.forward_model_variant,
                                                 self.memory, config.device)
        self.algorithm = self.init_algorithm(config, self.memory, action_type,
                                             self.motivation)

    def train(self, state0, value, action0, probs0, state1, reward, mask):
        # Move tensors to CPU before storing them in the rollout buffer.
        self.memory.add(state0.cpu(), value.cpu(), action0.cpu(), probs0.cpu(),
                        state1.cpu(), reward.cpu(), mask.cpu())
        # indices() returns None until a full rollout has been collected;
        # the buffer is cleared only after a completed update.
        indices = self.memory.indices()
        self.algorithm.train(indices)
        self.motivation.train(indices)
        if indices is not None:
            self.memory.clear()
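
Both examples share one pattern: the motivation module is trained on the same buffer as the policy and is also handed to the RL algorithm, which can add its intrinsic reward to the environment reward. As a rough illustration, the class below is a hypothetical, minimal sketch of what such a forward-model motivation might compute (an ICM-style prediction-error bonus scaled by eta); it only approximates the ForwardModelMotivation interface used above.

import torch
import torch.nn as nn

class SimpleForwardModelMotivation:
    """Hypothetical sketch: intrinsic reward = eta * ||f(s, a) - s'||^2."""

    def __init__(self, forward_model, lr, eta, device):
        self.forward_model = forward_model          # nn.Module predicting the next state
        self.eta = eta                              # scale of the exploration bonus
        self.device = device
        self.optimizer = torch.optim.Adam(forward_model.parameters(), lr=lr)

    def reward(self, state0, action, state1):
        # Exploration bonus: squared prediction error per transition.
        with torch.no_grad():
            prediction = self.forward_model(state0, action)
            error = (prediction - state1).pow(2).mean(dim=-1)
        return self.eta * error

    def train(self, state0, action, state1):
        # Fit the forward model by minimizing the same prediction error.
        prediction = self.forward_model(state0, action)
        loss = nn.functional.mse_loss(prediction, state1)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()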