Example No. 1
 def __init__(self,
              input_dims,
              num_actions,
              learning_rate=2e-4,
              discount_factor=0.99,
              eps=1.0,
              eps_decrement_factor=1e-5,
              eps_min=0.1,
              replay_memory_size=10000,
              mini_batch_size=32):
     self.input_dims = input_dims
     self.num_actions = num_actions
     self.discount_factor = discount_factor
     self.eps_min = eps_min
     self.eps = eps
     self.eps_decrement_factor = eps_decrement_factor
     self.mini_batch_size = mini_batch_size
     #self.Q = LinearDQN(learning_rate, num_actions, input_dims)
     self.online_network = DualDeepQCNN(input_dims,
                                        self.num_actions,
                                        name='OnlineNetwork')
     self.target_network = DualDeepQCNN(input_dims,
                                        self.num_actions,
                                        name='TargetNetwork')
     self.replay_memory_size = replay_memory_size
     self.memory_bank = AgentMemory(self.replay_memory_size)
Example No. 2
 def setUp(self):
     self.agent = Mock(["send_chat"])
     self.memory = AgentMemory()
     self.dialogue_stack = DialogueStack(self.agent, self.memory)
     self.dialogue_stack.append(
         BotStackStatus(agent=self.agent,
                        memory=self.memory,
                        dialogue_stack=self.dialogue_stack))
Example No. 3
class MethodsTests(unittest.TestCase):
    def setUp(self):
        self.memory = AgentMemory()

    def test_peek_empty(self):
        self.assertEqual(self.memory.task_stack_peek(), None)

    def test_add_mob(self):
        # add mob
        chicken = {v: k for k, v in MOBS_BY_ID.items()}["chicken"]
        mob_id, mob_type, pos = 42, chicken, Pos(3, 4, 5)
        self.memory.set_mob_position(Mob(mob_id, mob_type, pos))

        # get mob
        self.assertIsNotNone(self.memory.get_mob_by_eid(mob_id))

        # update mob
        pos = Pos(6, 7, 8)
        self.memory.set_mob_position(Mob(mob_id, mob_type, pos))

        # get mob
        mob_node = self.memory.get_mob_by_eid(mob_id)
        self.assertIsNotNone(mob_node)
        self.assertEqual(mob_node.pos, (6, 7, 8))

    def test_add_guardian_mob(self):
        guardian = {v: k for k, v in MOBS_BY_ID.items()}["guardian"]
        mob_id, mob_type, pos = 42, guardian, Pos(3, 4, 5)
        self.memory.set_mob_position(Mob(mob_id, mob_type, pos))
Example No. 4
class Agent():
    '''Base class implementing functionality for different Deep Q Learning methods'''
    def __init__(self, env_name, input_dims, num_actions, learning_rate=2e-4, discount_factor=0.99,  eps=1.0, eps_decrement_factor=1e-5, eps_min=0.1, replay_memory_size=10000, mini_batch_size=32):
        self.input_dims = input_dims
        self.num_actions = num_actions
        self.discount_factor = discount_factor
        self.eps_min = eps_min
        self.eps = eps
        self.eps_decrement_factor = eps_decrement_factor
        self.mini_batch_size = mini_batch_size
        self.replay_memory_size = replay_memory_size
        self.memory_bank = AgentMemory(self.replay_memory_size)
        self.env_name = env_name

    def get_greedy_action(self, observation):
        raise NotImplementedError

    def learn(self):
        raise NotImplementedError

    def save_models(self):
        # assumes the subclass defines self.online_network and self.target_network
        self.online_network.save_checkpoint()
        self.target_network.save_checkpoint()

    def load_models(self):
        self.online_network.load_checkpoint()
        self.target_network.load_checkpoint()

    def store_memory(self, memory):
        self.memory_bank.remember(memory)

    def make_memory(self, state, action, reward, new_state, done):
        # dtype=object keeps the mixed-type transition (arrays + scalars) intact;
        # int() replaces np.long, which recent NumPy no longer provides
        return np.array([state,
                         int(action),
                         float(reward),
                         new_state,
                         bool(done)],
                        dtype=object)
    
    def get_random_action(self, observation):
        # randint return is inclusive of final value
        return random.randint(0, self.num_actions - 1)

    def decrement_epsilon(self):
        new_eps = self.eps - self.eps_decrement_factor
        self.eps = new_eps if new_eps > self.eps_min else self.eps_min
    
    def sample_memory(self):
        # return the sampled batch so callers can unpack it
        return self.memory_bank.recall_batch(self.mini_batch_size)

    def copy_online_nn_to_target_nn(self):
        self.target_network.load_state_dict(self.online_network.state_dict())
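
The base class above leaves get_greedy_action abstract and implements get_random_action and decrement_epsilon separately; a minimal epsilon-greedy wrapper over that interface might look like the sketch below (the epsilon_greedy_action helper is illustrative, not part of the source):

import random

def epsilon_greedy_action(agent, observation):
    # Hypothetical helper: explore with probability agent.eps, otherwise
    # defer to the subclass's greedy policy, then decay epsilon.
    if random.random() < agent.eps:
        action = agent.get_random_action(observation)
    else:
        action = agent.get_greedy_action(observation)
    agent.decrement_epsilon()
    return action
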
Example No. 5
 def __init__(self, env_name, input_dims, num_actions, learning_rate=2e-4, discount_factor=0.99,  eps=1.0, eps_decrement_factor=1e-5, eps_min=0.1, replay_memory_size=10000, mini_batch_size=32):
     self.input_dims = input_dims
     self.num_actions = num_actions
     self.discount_factor = discount_factor
     self.eps_min = eps_min
     self.eps = eps
     self.eps_decrement_factor = eps_decrement_factor
     self.mini_batch_size = mini_batch_size
     self.replay_memory_size = replay_memory_size
     self.memory_bank = AgentMemory(self.replay_memory_size)
     self.env_name = env_name
Example No. 6
    def __init__(self, env, brain_name, config, policy=None, critic=None):
        """
        Constructor method to create the controller

        Parameters
        ----------
        env - Unity environment for the agent to solve
        brain_name, string, brain name used in conjunction with the environment
        config - Dictionary containing the following keys:
        - 'num_episodes', int, number of episodes to run the agent for
        - 'epsilon_start', float, initial value for epsilon used in the PPO algorithm to clip the surrogate
        - 'epsilon_decay', float, rate of decay for epsilon, applied after every episode
        - 'gamma', float, discount rate for future rewards
        - 'tau', float, rate for the soft update of the target network
        - 'max_memory', int, size of the replay buffer in number of samples
        - 'update_every', int, update frequency, in number of steps
        - 'train_iterations', int, number of training passes over a data batch
        - 'mlp_layers', int tuple, shape of the multilayer perceptron model
        - 'learning_rate', float, learning rate for the training of the model
        - 'std', float, standard deviation used for the Normal distribution of the policy
        - 'state_size', int
        - 'action_size', int
        - 'num_agents', int, number of agents running in parallel in the environment

        policy - optional, used to pass a mock policy for testing purposes
        critic - optional, used to pass a mock critic for testing purposes

        """
        self.env = env
        self.brain_name = brain_name
        self.__dict__.update(config.as_dict())
        self.policy = Policy(config, self.state_size,
                             self.action_size) if policy is None else policy
        self.trained_critic = Critic(
            config, self.state_size) if critic is None else critic
        self.target_critic = Critic(
            config, self.state_size) if critic is None else critic
        self.target_critic.eval()
        self.memory = AgentMemory(
            ((self.num_agents, self.state_size),
             (self.num_agents, self.action_size), (self.num_agents, ),
             (self.num_agents, self.state_size), (self.num_agents, ),
             (self.num_agents, )), int(self.max_memory))
        self.epsilon = config.epsilon_start
        self.scores = []
        self.surrogates = []

        self.optimizer = optim.Adam([{
            'params': self.policy.parameters()
        }, {
            'params': self.trained_critic.parameters()
        }],
                                    lr=config.learning_rate)
Example No. 7
    def setUp(self):
        self.memory = AgentMemory(
            load_minecraft_specs=False)  # don't load specs, it's slow
        self.agent = FakeAgent(self.memory)
        self.dialogue_manager = TtadModelDialogueManager(
            self.agent, None, None, None, no_ground_truth_actions=True)

        # More helpful error message to encourage test writers to use self.set_looking_at()
        self.agent.get_player_line_of_sight = Mock(
            side_effect=NotImplementedError(
                "Cannot call into C++ function in this unit test. " +
                "Call self.set_looking_at() to set the return value"))

        # Add a speaker at position (5, 63, 5) looking in the +x direction
        self.memory.update(self.agent)
        self.speaker = list(self.memory.other_players.values())[0].name
Example No. 8
class BotStackStatusTest(unittest.TestCase):
    def setUp(self):
        self.agent = Mock(["send_chat"])
        self.memory = AgentMemory()
        self.dialogue_stack = DialogueStack(self.agent, self.memory)
        self.dialogue_stack.append(
            BotStackStatus(agent=self.agent,
                           memory=self.memory,
                           dialogue_stack=self.dialogue_stack))

    def test_move(self):
        self.memory.task_stack_push(
            tasks.Move(self.agent, {"target": (42, 42, 42)}))
        self.memory.add_chat("test_agent", "test chat: where are you going?")
        self.dialogue_stack.step()
        self.agent.send_chat.assert_called()
Example No. 9
    def __init__(self, env, brain_name, config):
        """
        Constructor method to create the controller

        Parameters
        ----------
        env - Unity environment for the agent to solve
        brain_name, string, brain name used in conjunction with the environment
        config - Dictionary containing the following keys:
        - 'num_episodes', int, number of episodes to run the agent for
        - 'gamma', float, discount rate for future rewards
        - 'tau', float, rate for the soft update of the target network
        - 'max_memory', int, size of the replay buffer in number of samples
        - 'batch_size', int, size of the batches sampled to train the model on each update
        - 'update_every', int, update frequency, in number of steps
        - 'mlp_layers', int tuple, shape of the multilayer perceptron model
        - 'learning_rate', float, learning rate for the training of the model
        - 'state_size', int
        - 'action_size', int
        - 'num_agents', int, number of agents running in parallel in the environment

        """
        self.env = env
        self.brain_name = brain_name
        self.__dict__.update(config.as_dict())
        self.trained_policy = Policy(config, self.state_size, self.action_size)
        self.target_policy = Policy(config, self.state_size, self.action_size)
        self.trained_critic = Critic(config, self.state_size, self.action_size)
        self.target_critic = Critic(config, self.state_size, self.action_size)
        # those networks will never be trained
        self.target_policy.eval()
        self.target_critic.eval()
        self.memory = AgentMemory(((self.num_agents, self.state_size),
                                   (self.num_agents, self.action_size),
                                   (self.num_agents, self.state_size),
                                   (self.num_agents, ), (self.num_agents, )),
                                  int(self.max_memory))
        self.scores = []
        self.critic_losses = []
        self.surrogates = []

        self.critic_optimizer = optim.Adam(self.trained_critic.parameters(),
                                           lr=config.learning_rate)
        self.policy_optimizer = optim.Adam(self.trained_policy.parameters(),
                                           lr=config.learning_rate)
Example No. 10
 def setUp(self):
     self.memory = AgentMemory()
Example No. 11
class BaseCraftassistTestCase(unittest.TestCase):
    def setUp(self):
        self.memory = AgentMemory(
            load_minecraft_specs=False)  # don't load specs, it's slow
        self.agent = FakeAgent(self.memory)
        self.dialogue_manager = TtadModelDialogueManager(
            self.agent, None, None, None, no_ground_truth_actions=True)

        # More helpful error message to encourage test writers to use self.set_looking_at()
        self.agent.get_player_line_of_sight = Mock(
            side_effect=NotImplementedError(
                "Cannot call into C++ function in this unit test. " +
                "Call self.set_looking_at() to set the return value"))

        # Add a speaker at position (5, 63, 5) looking in the +x direction
        self.memory.update(self.agent)
        self.speaker = list(self.memory.other_players.values())[0].name

    def handle_action_dict(self,
                           d,
                           answer: str = None,
                           stop_on_chat=False,
                           max_steps=10000) -> Dict[XYZ, IDM]:
        """Handle an action dict and call self.flush()

        If "answer" is specified and a question is asked by the agent, respond
        with this string.

        If "stop_on_chat" is specified, stop iterating if the agent says anything
        """
        self.add_incoming_chat("TEST {}".format(d))
        obj = self.dialogue_manager.handle_action_dict(self.speaker, d)
        if obj is not None:
            self.dialogue_manager.dialogue_stack.append(obj)
        changes = self.flush(max_steps, stop_on_chat=stop_on_chat)
        if len(self.dialogue_manager.dialogue_stack
               ) != 0 and answer is not None:
            self.add_incoming_chat(answer)
            changes.update(self.flush(max_steps, stop_on_chat=stop_on_chat))
        return changes

    def flush(self, max_steps=10000, stop_on_chat=False) -> Dict[XYZ, IDM]:
        """Update memory and step the dialogue and task stacks until they are empty

        If "stop_on_chat" is specified, stop iterating if the agent says anything

        Return the set of blocks that were changed.
        """
        if stop_on_chat:
            self.agent.clear_outgoing_chats()

        world_before = self.agent._world.copy()

        for _ in range(max_steps):
            if (len(self.dialogue_manager.dialogue_stack) == 0
                    and not self.memory.task_stack_peek()):
                break
            self.memory.update(self.agent)
            self.dialogue_manager.dialogue_stack.step()
            self.agent.task_step()
            if (isinstance(self.dialogue_manager.dialogue_stack.peek(),
                           AwaitResponse) and
                    not self.dialogue_manager.dialogue_stack.peek().finished
                ) or (stop_on_chat and self.agent.get_last_outgoing_chat()):
                break
        self.memory.update(self.agent)

        # get changes
        world_after = self.agent._world.copy()
        changes = dict(set(world_after.items()) - set(world_before.items()))
        changes.update({
            k: (0, 0)
            for k in set(world_before.keys()) - set(world_after.keys())
        })
        return changes

    def set_looking_at(self, xyz: XYZ):
        """Set the return value for C++ call to get_player_line_of_sight"""
        self.agent.get_player_line_of_sight = Mock(return_value=Pos(*xyz))

    def set_blocks(self, xyzbms: List[Block], origin: XYZ = (0, 0, 0)):
        """Change the state of the world, block by block"""
        for xyz, idm in xyzbms:
            abs_xyz = tuple(np.array(xyz) + origin)
            self.memory.on_block_changed(abs_xyz, idm)
            self.agent._world[abs_xyz] = idm

    def add_object(self, xyzbms: List[Block],
                   origin: XYZ = (0, 0, 0)) -> ObjectNode:
        """Add an object to memory as if it was placed block by block

        Args:
        - xyzbms: a list of relative (xyz, idm)
        - origin: (x, y, z) of the corner

        Returns an ObjectNode
        """
        self.set_blocks(xyzbms, origin)
        abs_xyz = tuple(np.array(xyzbms[0][0]) + origin)
        memid = self.memory.get_block_object_ids_by_xyz(abs_xyz)[0]
        return self.memory.get_object_by_id(memid)

    def get_blocks(self, xyzs: Sequence[XYZ]) -> Dict[XYZ, IDM]:
        """Return the ground truth block state"""
        d = {}
        for (x, y, z) in xyzs:
            B = self.agent.get_blocks(x, x, y, y, z, z)
            d[(x, y, z)] = tuple(B[0, 0, 0, :])
        return d

    def add_incoming_chat(self, chat: str):
        """Add a chat to memory as if it was just spoken by SPEAKER"""
        self.memory.add_chat(
            self.memory.get_player_by_name(self.speaker).memid, chat)

    def assert_schematics_equal(self, a, b):
        """Check equality between two list[(xyz, idm)] schematics

        N.B. this compares the shapes and idms, but ignores absolute position offsets.
        """
        a, _ = to_relative_pos(a)
        b, _ = to_relative_pos(b)
        self.assertEqual(set(a), set(b))

    def last_outgoing_chat(self) -> str:
        return self.agent.get_last_outgoing_chat()

    def get_speaker_pos(self) -> XYZ:
        return tuple(
            pos_to_np(self.memory.get_player_struct_by_name(self.speaker).pos))
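
A hypothetical subclass, sketched only to show how the helpers above combine (the test name is mine; the MOVE dict mirrors the "move_speaker_pos" template in Example No. 13):

import unittest

class MoveCommandTest(BaseCraftassistTestCase):
    def test_move_to_speaker_pos(self):
        # Issue a MOVE action dict and drain the dialogue/task stacks
        d = {"action_type": "MOVE",
             "location": {"location_type": "SPEAKER_POS"}}
        changes = self.handle_action_dict(d)
        # handle_action_dict returns the blocks that changed
        self.assertIsInstance(changes, dict)

if __name__ == "__main__":
    unittest.main()
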
Example No. 12
class Agent():
    def __init__(self, input_dims, num_actions, learning_rate=2e-4, discount_factor=0.99,  eps=1.0, eps_decrement_factor=1e-5, eps_min=0.1, replay_memory_size=10000, mini_batch_size=32):
        self.input_dims = input_dims
        self.num_actions = num_actions
        self.discount_factor = discount_factor
        self.eps_min = eps_min
        self.eps = eps
        self.eps_decrement_factor = eps_decrement_factor
        self.mini_batch_size = mini_batch_size
        #self.Q = LinearDQN(learning_rate, num_actions, input_dims)
        self.online_network = DeepQCNN(
            input_dims, self.num_actions, name='OnlineNetwork')
        self.target_network = DeepQCNN(
            input_dims, self.num_actions, name='TargetNetwork')
        self.replay_memory_size = replay_memory_size
        self.memory_bank = AgentMemory(self.replay_memory_size)

    def decrement_epsilon(self):
        new_eps = self.eps - self.eps_decrement_factor
        self.eps = new_eps if new_eps > self.eps_min else self.eps_min

    def store_memory(self, memory):
        self.memory_bank.remember(memory)

    def make_memory(self, state, action, reward, new_state, done):
        # dtype=object keeps the mixed-type transition (arrays + scalars) intact;
        # int() replaces np.long, which recent NumPy no longer provides
        return np.array([state,
                         int(action),
                         float(reward),
                         new_state,
                         bool(done)],
                        dtype=object)

    def get_greedy_action(self, observation):
        # convert obs to tensor, pass to device, forward pass, argmax
        obs_t = T.tensor(observation).to(
            self.online_network.device, dtype=T.float)
        action = self.target_network.forward(obs_t)

        return action.argmax().item()

    def get_random_action(self, observation):
        # randint return is inclusive of final value
        return random.randint(0, self.num_actions - 1)

    def train_online_network(self):
        pass

    def save_models(self):
        self.online_network.save_checkpoint()
        self.target_network.save_checkpoint()

    def load_models(self):
        self.online_network.load_checkpoint()
        self.target_network.load_checkpoint()

        #replay_memory_training_data = self.memory_bank.recall_batch(mini_batch_size)
        # need is an array of arrays outer array (batchsize, 2), inner array(training data, targets)
        # self.online_network.fit()

    def update_target_network(self):
        pass

    def copy_online_nn_to_target_nn(self):
        self.target_network.load_state_dict(self.online_network.state_dict())
Example No. 13
    def setUp(self):
        self.memory = AgentMemory(load_minecraft_specs=False)  # don't load specs, it's slow
        self.agent = FakeAgent(self.memory)
        self.dialogue_manager = TtadModelDialogueManager(
            self.agent, None, None, None, None, None, no_ground_truth_actions=True
        )

        # More helpful error message to encourage test writers to use self.set_looking_at()
        self.agent.get_player_line_of_sight = Mock(
            side_effect=NotImplementedError(
                "Cannot call into C++ function in this unit test. "
                + "Call self.set_looking_at() to set the return value"
            )
        )

        # Add a speaker at position (5, 63, 5) looking in the +x direction
        self.memory.update(self.agent)
        self.speaker = list(self.memory.other_players.values())[0].name

        # Combinable actions to be used in test cases
        self.possible_actions = {
            "destroy_speaker_look": {
                "action_type": "DESTROY",
                "reference_object": {"location": {"location_type": "SPEAKER_LOOK"}},
            },
            "copy_speaker_look_to_agent_pos": {
                "action_type": "BUILD",
                "reference_object": {"location": {"location_type": "SPEAKER_LOOK"}},
                "location": {"location_type": "AGENT_POS"},
            },
            "build_small_sphere": {
                "action_type": "BUILD",
                "schematic": {"has_name": "sphere", "has_size": "small"},
            },
            "build_1x1x1_cube": {
                "action_type": "BUILD",
                "schematic": {"has_name": "cube", "has_size": "1 x 1 x 1"},
            },
            "move_speaker_pos": {
                "action_type": "MOVE",
                "location": {"location_type": "SPEAKER_POS"},
            },
            "build_diamond": {"action_type": "BUILD", "schematic": {"has_name": "diamond"}},
            "build_gold_cube": {
                "action_type": "BUILD",
                "schematic": {"has_block_type": "gold", "has_name": "cube"},
            },
            "fill_all_holes_speaker_look": {
                "action_type": "FILL",
                "location": {"location_type": "SPEAKER_LOOK"},
                "repeat": {"repeat_key": "ALL"},
            },
            "go_to_tree": {
                "action_type": "MOVE",
                "location": {
                    "location_type": "REFERENCE_OBJECT",
                    "reference_object": {"has_name": "tree"},
                },
            },
            "build_square_height_1": {
                "action_type": "BUILD",
                "schematic": {"has_name": "square", "has_height": "1"},
            },
            "stop": {"action_type": "STOP"},
            "fill_speaker_look": {
                "action_type": "FILL",
                "location": {"location_type": "SPEAKER_LOOK"},
            },
            "fill_speaker_look_gold": {
                "action_type": "FILL",
                "has_block_type": "gold",
                "location": {"location_type": "SPEAKER_LOOK"},
            },
        }
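
These dicts are templates presumably consumed by a helper such as handle_action_dict from the base test case in Example No. 11; a hypothetical test method using one of them (with a placeholder assertion) might read:

    def test_build_small_sphere(self):
        # Hypothetical usage of one of the templates defined in setUp above
        changes = self.handle_action_dict(
            self.possible_actions["build_small_sphere"])
        self.assertIsInstance(changes, dict)
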
Example No. 14
class DDPGController:
    """
    Deep learning agent based on Deep Deterministic Policy Gradient described in https://arxiv.org/pdf/1509.02971.pdf
    
    """
    def __init__(self, env, brain_name, config):
        """
        Constructor method to create the controller

        Parameters
        ----------
        env - Unity environment for the agent to solve
        brain_name, string, brain name used in conjunction with the environment
        config - Dictionary containing the following keys:
        - 'num_episodes', int, number of episodes to run the agent for
        - 'gamma', float, discount rate for future rewards
        - 'tau', float, rate for the soft update of the target network
        - 'max_memory', int, size of the replay buffer in number of samples
        - 'batch_size', int, size of the batches sampled to train the model on each update
        - 'update_every', int, update frequency, in number of steps
        - 'mlp_layers', int tuple, shape of the multilayer perceptron model
        - 'learning_rate', float, learning rate for the training of the model
        - 'state_size', int
        - 'action_size', int
        - 'num_agents', int, number of agents running in parallel in the environment

        """
        self.env = env
        self.brain_name = brain_name
        self.__dict__.update(config.as_dict())
        self.trained_policy = Policy(config, self.state_size, self.action_size)
        self.target_policy = Policy(config, self.state_size, self.action_size)
        self.trained_critic = Critic(config, self.state_size, self.action_size)
        self.target_critic = Critic(config, self.state_size, self.action_size)
        # those networks will never be trained
        self.target_policy.eval()
        self.target_critic.eval()
        self.memory = AgentMemory(((self.num_agents, self.state_size),
                                   (self.num_agents, self.action_size),
                                   (self.num_agents, self.state_size),
                                   (self.num_agents, ), (self.num_agents, )),
                                  int(self.max_memory))
        self.scores = []
        self.critic_losses = []
        self.surrogates = []

        self.critic_optimizer = optim.Adam(self.trained_critic.parameters(),
                                           lr=config.learning_rate)
        self.policy_optimizer = optim.Adam(self.trained_policy.parameters(),
                                           lr=config.learning_rate)

    def solve(self):
        """
        Main method to launch the environment loop

        """
        step = 1

        for i_episode in range(1, self.num_episodes + 1):
            env_info = self.env.reset(train_mode=True)[self.brain_name]
            state = env_info.vector_observations
            rewards = []
            surrogates = []
            critic_losses = []
            while True:
                action = self.act(state)
                env_info = self.env.step(action)[self.brain_name]
                next_state = env_info.vector_observations
                reward = env_info.rewards
                done = env_info.local_done
                self.memory.add((state, action, next_state, reward, done))
                state = next_state
                rewards.append(reward)
                if self.memory.size >= self.batch_size and not step % self.update_every:
                    surrogate_buffer, critic_loss = self.train()
                    surrogates.append(surrogate_buffer)
                    critic_losses.append(critic_loss)
                step += 1
                if np.any(done):
                    break

            self.scores.append(np.mean(np.sum(rewards, axis=0)))
            self.surrogates.append(np.mean(surrogates))
            self.critic_losses.append(np.mean(critic_losses))

            self.print_status(i_episode)

        return self.scores, self.surrogates, self.critic_losses

    def act(self, states):
        """
        Based on states, returns the on-policy actions
        
        Parameter
        ---------
        states - float array shape=(num_agents, state_size)
        
        Return
        ---------
        Float array shape=(num_agents, action_size), chosen action

        """
        states = torch.from_numpy(states).float().to(device)
        self.trained_policy.eval()
        with torch.no_grad():
            actions = self.trained_policy(states)
        # TODO: add exploration noise
        return actions.cpu().data.numpy()

    def train(self):
        """
        Training routine to update the policy and critic

        """
        states, actions, next_states, rewards, dones = self.memory.sample(
            self.batch_size)

        states = torch.from_numpy(states).float().to(device)
        actions = torch.from_numpy(actions).float().to(device)
        next_states = torch.from_numpy(next_states).float().to(device)
        rewards = torch.from_numpy(rewards).float().to(device)
        dones = torch.from_numpy(dones).float().to(device)

        # critic update
        next_actions = self.target_policy(next_states)
        self.trained_critic.train()
        self.critic_optimizer.zero_grad()
        done_mask = 1 - dones
        target_states_values = rewards + self.gamma * \
            self.target_critic(next_states, next_actions) * done_mask
        predicted_states_values = self.trained_critic(states, actions)
        critic_loss = torch.mean(
            (target_states_values - predicted_states_values)**2)
        critic_loss.backward()
        self.critic_optimizer.step()

        # policy update
        self.trained_policy.train()
        self.policy_optimizer.zero_grad()
        action_values = self.trained_critic(states,
                                            self.trained_policy(states))
        surrogate = -torch.mean(action_values)
        surrogate.backward()
        self.policy_optimizer.step()

        self.target_network_update(self.trained_critic, self.target_critic)
        self.target_network_update(self.trained_policy, self.target_policy)

        return surrogate.cpu().data.numpy(), critic_loss.cpu().data.numpy()

    def target_network_update(self, trained_model, target_model):
        """
        Performs a soft update with rate tau from the trained_model to the target_model.

        """
        target_model_weights = target_model.get_weights()
        train_model_weights = trained_model.get_weights()
        new_weights = []
        for w1, w2 in zip(target_model_weights, train_model_weights):
            new_weights.append(w1 * (1 - self.tau) + w2 * self.tau)
        target_model.set_weights(new_weights)

    def print_status(self, i_episode):
        """
        Print the latest status of the agent

        Parameter
        ---------
        i_episode, int

        """
        print(
            "\rEpisode %d/%d | Average Score: %.2f | Surrogate: %.5f | Critic loss: %.5f  "
            % (i_episode, self.num_episodes, self.scores[-1],
               self.surrogates[-1], self.critic_losses[-1]),
            end="")
        sys.stdout.flush()
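
target_network_update above relies on Keras-style get_weights/set_weights helpers on the models. If Policy and Critic were plain torch.nn.Module subclasses without such helpers, the same soft update, theta_target <- (1 - tau) * theta_target + tau * theta_trained, is usually written over the parameters directly; a sketch under that assumption:

import torch

def soft_update(trained_model, target_model, tau):
    # Hypothetical PyTorch variant of the soft update shown above: blend each
    # target parameter towards the corresponding trained parameter with rate tau.
    with torch.no_grad():
        for target_param, trained_param in zip(target_model.parameters(),
                                               trained_model.parameters()):
            target_param.copy_((1.0 - tau) * target_param + tau * trained_param)
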
Example No. 15
class Agent():
    def __init__(self,
                 input_dims,
                 num_actions,
                 learning_rate=2e-4,
                 discount_factor=0.99,
                 eps=1.0,
                 eps_decrement_factor=1e-5,
                 eps_min=0.1,
                 replay_memory_size=10000,
                 mini_batch_size=32):
        self.input_dims = input_dims
        self.num_actions = num_actions
        self.discount_factor = discount_factor
        self.eps_min = eps_min
        self.eps = eps
        self.eps_decrement_factor = eps_decrement_factor
        self.mini_batch_size = mini_batch_size
        #self.Q = LinearDQN(learning_rate, num_actions, input_dims)
        self.online_network = DualDeepQCNN(input_dims,
                                           self.num_actions,
                                           name='OnlineNetwork')
        self.target_network = DualDeepQCNN(input_dims,
                                           self.num_actions,
                                           name='TargetNetwork')
        self.replay_memory_size = replay_memory_size
        self.memory_bank = AgentMemory(self.replay_memory_size)

    def decrement_epsilon(self):
        new_eps = self.eps - self.eps_decrement_factor
        self.eps = new_eps if new_eps > self.eps_min else self.eps_min

    def store_memory(self, memory):
        self.memory_bank.remember(memory)

    def make_memory(self, state, action, reward, new_state, done):
        # dtype=object keeps the mixed-type transition (arrays + scalars) intact;
        # int() replaces np.long, which recent NumPy no longer provides
        return np.array(
            [state,
             int(action),
             float(reward), new_state,
             bool(done)],
            dtype=object)

    def get_greedy_action(self, observation):
        # convert obs to tensor, pass to device, forward pass, argmax
        obs_t = T.tensor(observation).to(self.online_network.device,
                                         dtype=T.float)

        # Adding the state value and subtracting the mean advantage shift every
        # action's score by the same amount, so the argmax ordering is unchanged.
        action_v, action_a = self.target_network.forward(obs_t)
        return action_a.argmax().item()

    def get_random_action(self, observation):
        # randint return is inclusive of final value
        return random.randint(0, self.num_actions - 1)

    def train_online_network(self):
        pass

    def save_models(self):
        self.online_network.save_checkpoint()
        self.target_network.save_checkpoint()

    def load_models(self):
        self.online_network.load_checkpoint()
        self.target_network.load_checkpoint()

    def update_target_network(self):
        pass

    def copy_online_nn_to_target_nn(self):
        self.target_network.load_state_dict(self.online_network.state_dict())
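
The comment in get_greedy_action notes that adding the state value and subtracting the mean advantage leave the argmax unchanged, which is why the advantage stream alone suffices for action selection. The full dueling aggregation is still needed wherever actual Q-values are required (for example when forming TD targets); a sketch, assuming the network returns separate value and advantage tensors as above:

def dueling_q_values(value, advantage):
    # Hypothetical helper combining the two streams of a dueling network:
    # Q(s, a) = V(s) + A(s, a) - mean_a A(s, a)
    # value: tensor of shape (batch, 1); advantage: (batch, num_actions)
    return value + (advantage - advantage.mean(dim=1, keepdim=True))
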
Example No. 16
class PPOController:
    """
    Deep learning agent based on Proximal Policy Optimization (see https://arxiv.org/pdf/1506.02438.pdf)

    """
    def __init__(self, env, brain_name, config, policy=None, critic=None):
        """
        Constructor method to create the controller

        Parameters
        ----------
        env - Unity environment for the agent to solve
        brain_name, string, brain name used in conjunction with the environment
        config - Dictionary containing the following keys:
        - 'num_episodes', int, number of episodes to run the agent for
        - 'epsilon_start', float, initial value for epsilon used in the PPO algorithm to clip the surrogate
        - 'epsilon_decay', float, rate of decay for epsilon, applied after every episode
        - 'gamma', float, discount rate for future rewards
        - 'tau', float, rate for the soft update of the target network
        - 'max_memory', int, size of the replay buffer in number of samples
        - 'update_every', int, update frequency, in number of steps
        - 'train_iterations', int, number of training passes over a data batch
        - 'mlp_layers', int tuple, shape of the multilayer perceptron model
        - 'learning_rate', float, learning rate for the training of the model
        - 'std', float, standard deviation used for the Normal distribution of the policy
        - 'state_size', int
        - 'action_size', int
        - 'num_agents', int, number of agents running in parallel in the environment

        policy - optional, used to pass a mock policy for testing purposes
        critic - optional, used to pass a mock critic for testing purposes

        """
        self.env = env
        self.brain_name = brain_name
        self.__dict__.update(config.as_dict())
        self.policy = Policy(config, self.state_size,
                             self.action_size) if policy is None else policy
        self.trained_critic = Critic(
            config, self.state_size) if critic is None else critic
        self.target_critic = Critic(
            config, self.state_size) if critic is None else critic
        self.target_critic.eval()
        self.memory = AgentMemory(
            ((self.num_agents, self.state_size),
             (self.num_agents, self.action_size), (self.num_agents, ),
             (self.num_agents, self.state_size), (self.num_agents, ),
             (self.num_agents, )), int(self.max_memory))
        self.epsilon = config.epsilon_start
        self.scores = []
        self.surrogates = []

        self.optimizer = optim.Adam([{
            'params': self.policy.parameters()
        }, {
            'params': self.trained_critic.parameters()
        }],
                                    lr=config.learning_rate)

    def solve(self):
        """
        Main method to launch the environment loop

        """
        step = 1

        for i_episode in range(1, self.num_episodes + 1):
            env_info = self.env.reset(train_mode=True)[self.brain_name]
            state = env_info.vector_observations
            rewards = []
            surrogates = []
            while True:
                action, log_probability = self.act(state)
                env_info = self.env.step(action)[self.brain_name]
                next_state = env_info.vector_observations
                reward = env_info.rewards
                done = env_info.local_done
                self.memory.add(
                    (state, action, log_probability, next_state, reward, done))
                state = next_state
                rewards.append(reward)
                if not step % self.update_every:
                    surrogate_buffer = self.train_loop()
                    surrogates.append(surrogate_buffer)
                step += 1
                if np.any(done):
                    break

            self.scores.append(np.mean(np.sum(rewards, axis=0)))
            self.surrogates.append(np.mean(surrogates))

            self.epsilon *= self.epsilon_decay
            self.print_status(i_episode)

        return self.scores, self.surrogates

    def act(self, states):
        """
        Based on states, returns the on-policy actions
        
        Parameter
        ---------
        states - float array shape=(num_agents, state_size)
        
        Return
        ---------
        Tuple of float arrays: the chosen actions, shape=(num_agents, action_size),
        and their log probabilities

        """
        states = torch.from_numpy(states).float().to(device)
        self.policy.eval()
        actions, log_probabilities = self.policy.next_actions(states)
        return actions.cpu().data.numpy(), log_probabilities.cpu().data.numpy()

    def train_loop(self):
        """
        Training routine to update the policy and critic

        """
        surrogate_buffer = []
        states, actions, old_log_probabilities, next_states, rewards, dones = self.memory.get_latest(
            self.update_every)

        future_rewards = self.compute_discounted_future_rewards(rewards)

        old_log_probabilities = torch.from_numpy(
            old_log_probabilities).float().to(device)
        states = torch.from_numpy(states).float().to(device)
        actions = torch.from_numpy(actions).float().to(device)
        next_states = torch.from_numpy(next_states).float().to(device)
        future_rewards = torch.from_numpy(future_rewards).float().to(device)
        dones = torch.from_numpy(dones).bool().to(device)
        self.policy.train()
        self.trained_critic.train()
        for _ in range(self.train_iterations):
            surrogate = self.compute_surrogate(old_log_probabilities, states,
                                               actions, next_states,
                                               future_rewards, dones)
            surrogate_buffer.append(surrogate.cpu().data.numpy())
            self.optimizer.zero_grad()
            surrogate.backward()
            self.optimizer.step()
            self.target_network_update()
        return surrogate_buffer

    def compute_surrogate(self, old_log_probabilities, states, actions,
                          next_states, future_rewards, dones):
        """
        Compute the surrogate, i.e. the function optimized at training time

        Parameters
        ----------
        - old_log_probabilities, float Tensor shape=(batch_size, num_agents), original probabilities for the performed action
        - states, float Tensor shape=(batch_size, num_agents, state_size)
        - actions, float Tensor shape=(batch_size, num_agents, action_size)
        - next_states, float Tensor shape=(batch_size, num_agents, state_size)
        - future_rewards, float Tensor shape=(batch_size, num_agents), discounted sum of future rewards over the length of the trajectory
        - dones, float Tensor shape=(batch_size, num_agents)

        Return
        ---------
        Surrogate, float Tensor

        """
        new_log_probabilities, entropy = self.policy.get_log_probabilities_and_entropy(
            states, actions)
        ratio = torch.exp(new_log_probabilities - old_log_probabilities)

        with torch.no_grad():
            states_values = self.target_critic(states)
            next_states_values = self.target_critic(next_states[-1, :])
        if torch.any(dones):
            final_states_values = 0
        else:
            final_states_values = next_states_values.expand(
                states_values.shape)

        future_rewards = self.normalize(future_rewards)

        discount = self.gamma**torch.arange(len(states_values),
                                            0,
                                            -1,
                                            dtype=torch.float).unsqueeze(1)
        target_states_values = future_rewards + final_states_values * discount
        advantages = target_states_values - states_values

        clip = torch.clamp(ratio, 1 - self.epsilon, 1 + self.epsilon)
        clipped_surrogate = torch.min(ratio * advantages, clip * advantages)

        return -1 * torch.mean(
            clipped_surrogate) + 0.5 * self.trained_critic.mse(
                states_values, target_states_values) - 0.01 * entropy.mean()

    def normalize(self, a):
        """
        Normalize a torch Tensor

        Parameters
        ----------
        - a, float Tensor to normalize

        """
        mean = torch.mean(a, -1)
        std = torch.std(a, -1)
        b = a
        mask = std != 0
        b[mask] = (a[mask] - mean[mask].unsqueeze(1)) / std[mask].unsqueeze(1)
        # if the deviation is null set the normalized reward to 0
        mask = std == 0
        b[mask] = 0
        return b

    def compute_discounted_future_rewards(self, rewards):
        """
        Compute the discounted sum of future reward over the trajectory

        Parameters
        ----------
        - rewards, float array shape=(batch_size, num_agents)

        Return
        ----------
        Discounted future rewards, float array shape=(batch_size, num_agents)

        """
        # This is complex so giving an example with gamma = 0.5 and
        # rewards = [[1, 0],
        #            [1, 1]]
        main_dim = len(rewards)
        # discounts = [1, 0.5]
        discounts = (self.gamma**np.arange(main_dim))
        # discounts = [[1, 0.5],
        #              [1, 0.5]]
        discounts = np.tile(discounts, main_dim).reshape(main_dim, main_dim)
        # indexes = [[0, 1],
        #            [1, 2]]
        indexes = np.tile(np.arange(main_dim), main_dim).reshape(
            main_dim, main_dim) + np.arange(main_dim)[:, np.newaxis]
        # indexes = [[0, 1],
        #            [1, 0]]
        indexes = np.mod(indexes, main_dim)
        # discounts = [[1, 0.5],
        #              [0, 1]]
        discounts = np.triu(discounts[range(main_dim), indexes])
        # result = [[1.5, 0.5],
        #           [1, 1]]
        return np.dot(discounts, rewards)

    def target_network_update(self):
        """
        Performs a soft update with rate tau from the trained_model to the target_model.

        """
        target_model_weights = self.target_critic.get_weights()
        train_model_weights = self.trained_critic.get_weights()
        new_weights = []
        for w1, w2 in zip(target_model_weights, train_model_weights):
            new_weights.append(w1 * (1 - self.tau) + w2 * self.tau)
        self.target_critic.set_weights(new_weights)

    def print_status(self, i_episode):
        """
        Print the latest status of the agent

        Parameter
        ---------
        i_episode, int

        """
        print(
            "\rEpisode %d/%d | Average Score: %.2f | Model surrogate: %.5f   "
            % (i_episode, self.num_episodes, self.scores[-1],
               self.surrogates[-1]),
            end="")
        sys.stdout.flush()
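
compute_surrogate above folds the PPO clipped objective together with a critic loss and an entropy bonus; isolated from those terms, the core clipping step reduces to the sketch below (function and argument names are mine, not from the source):

import torch

def clipped_surrogate(ratio, advantages, epsilon):
    # Elementwise minimum of the unclipped and clipped probability-ratio terms;
    # the negative mean is what gets minimized, as in compute_surrogate above.
    clipped_ratio = torch.clamp(ratio, 1.0 - epsilon, 1.0 + epsilon)
    return -torch.mean(torch.min(ratio * advantages,
                                 clipped_ratio * advantages))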