Example 1
    def _swap_snapshots(self) -> None:
        """
        Swaps the appropriate weight to the policy and pushes it to respective policy queues
        """

        for team_id in self._team_to_name_to_policy_queue:
            # The learning team keeps its current policy; only opponents get swapped.
            if team_id == self._learning_team:
                continue
            # With probability (1 - play_against_latest_model_ratio), play against a
            # uniformly random past snapshot; otherwise play against the current policy.
            elif np.random.uniform() < (1 - self.play_against_latest_model_ratio):
                x = np.random.randint(len(self.policy_snapshots))
                snapshot = self.policy_snapshots[x]
            else:
                snapshot = self.current_policy_snapshot
                x = "current"

            # current_opponent of -1 marks the latest policy rather than a snapshot index.
            self.current_opponent = -1 if x == "current" else x
            name_to_policy_queue = self._team_to_name_to_policy_queue[team_id]
            for brain_name in self._team_to_name_to_policy_queue[team_id]:
                behavior_id = create_name_behavior_id(brain_name, team_id)
                policy = self.get_policy(behavior_id)
                policy.load_weights(snapshot[brain_name])
                name_to_policy_queue[brain_name].put(policy)
                logger.debug(
                    "Step {}: Swapping snapshot {} to id {} with team {} learning".format(
                        self.ghost_step, x, behavior_id, self._learning_team
                    )
                )
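
The opponent-selection rule above can be read in isolation: with probability play_against_latest_model_ratio the opponent plays the current policy, otherwise a uniformly random past snapshot. Below is a minimal standalone sketch of that rule (pick_opponent_snapshot is an illustrative helper, not part of GhostTrainer; it assumes policy_snapshots is a non-empty list of weight dicts):

import numpy as np


def pick_opponent_snapshot(policy_snapshots, current_policy_snapshot,
                           play_against_latest_model_ratio):
    # Illustrative restatement of the selection rule in _swap_snapshots:
    # with probability (1 - play_against_latest_model_ratio), pick a random
    # past snapshot; otherwise use the current policy and mark it "current".
    if np.random.uniform() < (1 - play_against_latest_model_ratio):
        x = np.random.randint(len(policy_snapshots))
        return x, policy_snapshots[x]
    return "current", current_policy_snapshot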
Example 2
    def advance(self) -> None:
        """
        Steps the trainer, passing trajectories to wrapped trainer and calling trainer advance
        """
        for trajectory_queue in self.trajectory_queues:
            parsed_behavior_id = self._name_to_parsed_behavior_id[
                trajectory_queue.behavior_id]
            if parsed_behavior_id.team_id == self._learning_team:
                # With a future multiagent trainer, this will be indexed by 'role'
                internal_trajectory_queue = self._internal_trajectory_queues[
                    parsed_behavior_id.brain_name]
                try:
                    # We grab at most the maximum length of the queue.
                    # This ensures that even if the queue is being filled faster than it is
                    # being emptied, the trajectories in the queue are on-policy.
                    for _ in range(trajectory_queue.maxlen):
                        t = trajectory_queue.get_nowait()
                        # Add to the wrapped trainer's queue.
                        internal_trajectory_queue.put(t)
                        self._process_trajectory(t)
                except AgentManagerQueue.Empty:
                    pass
            else:
                # Dump trajectories from non-learning policy
                try:
                    for _ in range(trajectory_queue.maxlen):
                        t = trajectory_queue.get_nowait()
                        # count ghost steps
                        self.ghost_step += len(t.steps)
                except AgentManagerQueue.Empty:
                    pass

        self.next_summary_step = self.trainer.next_summary_step
        self.trainer.advance()
        if self.get_step - self.last_team_change > self.steps_to_train_team:
            self.controller.change_training_team(self.get_step)
            self.last_team_change = self.get_step

        next_learning_team = self.controller.get_learning_team

        # CASE 1: The current learning team is managed by this GhostTrainer.
        # If the learning team changes, the loop over queues below pushes the
        # new policy into the policy queue for the new learning agent, provided
        # that policy is managed by this GhostTrainer. Otherwise, it saves the current snapshot.
        # CASE 2: The current learning team is managed by a different GhostTrainer.
        # If the learning team changes to a team managed by this GhostTrainer, the loop
        # pushes the current_policy_snapshot into the correct queue. Otherwise,
        # it keeps skipping, and _swap_snapshots continues to handle pushing
        # fixed snapshots.
        # CASE 3: No team change. The if statement keeps pushing the policy into the
        # correct queue (or skips it when the brain is not on the learning team).
        for brain_name in self._internal_policy_queues:
            internal_policy_queue = self._internal_policy_queues[brain_name]
            try:
                # Grab the latest weights pushed by the wrapped trainer and record
                # them as the current (in-memory) policy snapshot.
                policy = cast(TFPolicy, internal_policy_queue.get_nowait())
                self.current_policy_snapshot[brain_name] = policy.get_weights()
            except AgentManagerQueue.Empty:
                pass
            if next_learning_team in self._team_to_name_to_policy_queue:
                name_to_policy_queue = self._team_to_name_to_policy_queue[
                    next_learning_team]
                if brain_name in name_to_policy_queue:
                    behavior_id = create_name_behavior_id(
                        brain_name, next_learning_team)
                    policy = self.get_policy(behavior_id)
                    policy.load_weights(
                        self.current_policy_snapshot[brain_name])
                    name_to_policy_queue[brain_name].put(policy)

        # Note save and swap should be on different step counters.
        # We don't want to save unless the policy is learning.
        if self.get_step - self.last_save > self.steps_between_save:
            self._save_snapshot()
            self.last_save = self.get_step

        if (self._learning_team != next_learning_team
                or self.ghost_step - self.last_swap > self.steps_between_swap):
            self._learning_team = next_learning_team
            self._swap_snapshots()
            self.last_swap = self.ghost_step
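
Both trajectory loops above rely on the same non-blocking drain idiom: read at most maxlen items with get_nowait() and stop as soon as the queue reports Empty, so stale (off-policy) trajectories never accumulate. A minimal sketch of that idiom using the standard-library queue module (drain_queue is an illustrative helper, not ML-Agents code):

import queue


def drain_queue(q: queue.Queue, max_items: int) -> list:
    # Take at most max_items items without blocking; stop early if the queue
    # runs dry. Mirrors the get_nowait()/Empty pattern used in advance().
    drained = []
    for _ in range(max_items):
        try:
            drained.append(q.get_nowait())
        except queue.Empty:
            break
    return drained

For example, drain_queue(q, q.maxsize) empties a bounded queue.Queue in one pass without ever blocking the caller.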