Example no. 1
    def warmup(
        self,
        brain_set: BrainSet,
        n_episodes: int,
        max_t: int,
        step_agents_fn: Callable = default_step_agents_fn,
        preprocess_brain_actions_for_env_fn:
        Callable = default_preprocess_brain_actions_for_env_fn,
        end_episode_criteria=np.all,
    ) -> None:
        """
        Act randomly in the environment, storing experience tuples in trajectory/memory buffers.
        Used to initialize memory objects such as prioritized experience replay.
        :param brain_set: The agent brains to undergo warmup
        :param n_episodes: The number of warmup episodes to run
        :param max_t: The maximum number of time steps allowed in each episode
        :param step_agents_fn: Function used to update the agents with a new experience sampled from the environment
        :param preprocess_brain_actions_for_env_fn: Function used to preprocess actions from the agents before
            passing them to the environment
        :param end_episode_criteria: Function acting on a list of booleans
            (identifying whether that agent's episode has terminated) to determine whether the episode is finished
        :return: None
        """
        print("Performing warmup with {} episodes and max_t={}".format(
            n_episodes, max_t))
        for brain in brain_set.brains():
            for agent in brain.agents:
                agent.set_mode('train')
                agent.set_warmup(True)

        t1 = time.time()
        for i_episode in range(1, n_episodes + 1):
            self.reset_env(train_mode=True)
            brain_states = self.get_next_states(brain_set)
            for t in range(max_t):
                next_brain_environment = self.step(
                    brain_set=brain_set,
                    brain_states=brain_states,
                    random_actions=True,
                    preprocess_brain_actions_for_env_fn=
                    preprocess_brain_actions_for_env_fn)
                step_agents_fn(brain_set, next_brain_environment, t)
                brain_states = {
                    brain_name:
                    next_brain_environment[brain_name]['next_states']
                    for brain_name in brain_states
                }

                all_dones = []
                for brain_name in brain_set.names():
                    all_dones.extend(
                        next_brain_environment[brain_name]['dones'])

                if end_episode_criteria(all_dones):
                    break

                print('\rEpisode {}\tTimestep: {}'.format(i_episode, t),
                      end="")
        print("Finished warmup in {}s".format(round(time.time() - t1)))
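For context, a minimal usage sketch of how warmup is typically combined with train. The simulator object is assumed to be an instance of the class defining these methods, get_solution_brain_set is a helper like the ones in the examples below, and all numeric arguments are placeholders.

# Sketch only: pre-fill replay memory with random experience, then train.
brain_set = get_solution_brain_set()

# Random interaction to seed memory buffers (e.g. prioritized replay).
simulator.warmup(brain_set, n_episodes=10, max_t=1000)

# Regular training afterwards; arguments here are placeholders.
brain_set, scores, i_episode, training_time = simulator.train(
    brain_set,
    solved_score=30.0,
    n_episodes=300,
    max_t=1000,
)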
Example no. 2
def get_solution_brain_set():
    memory = PrioritizedMemory(
        capacity=REPLAY_BUFFER_SIZE,
        state_shape=(1, STATE_SIZE),
        # Anneal alpha linearly
        alpha_scheduler=ParameterScheduler(
            initial=0.6,
            lambda_fn=lambda i: 0.6 - 0.6 * i / NUM_EPISODES,
            final=0.),
        # Anneal beta linearly
        beta_scheduler=ParameterScheduler(
            initial=0.4,
            final=1,
            lambda_fn=lambda i: 0.4 + 0.6 * i / NUM_EPISODES
        ),
        seed=SEED,
        continuous_actions=True,
        min_priority=MIN_PRIORITY)

    reacher_brain = Brain(
        brain_name=BRAIN_NAME,
        action_size=ACTION_SIZE,
        state_shape=STATE_SIZE,
        observation_type='vector',
        agents=[get_agent(memory)],
    )

    brain_set = BrainSet(brains=[reacher_brain])
    return brain_set
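A quick standalone sketch of the two linear anneal schedules configured above, assuming ParameterScheduler simply evaluates lambda_fn at the episode index and clamps at the final value; num_episodes here is a placeholder standing in for NUM_EPISODES.

num_episodes = 300  # placeholder for NUM_EPISODES

def alpha_at(i: int) -> float:
    # Prioritization exponent annealed linearly from 0.6 down to 0
    return max(0.0, 0.6 - 0.6 * i / num_episodes)

def beta_at(i: int) -> float:
    # Importance-sampling exponent annealed linearly from 0.4 up to 1
    return min(1.0, 0.4 + 0.6 * i / num_episodes)

for episode in (0, num_episodes // 2, num_episodes):
    print(episode, round(alpha_at(episode), 2), round(beta_at(episode), 2))
# 0 0.6 0.4
# 150 0.3 0.7
# 300 0.0 1.0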
Example no. 3
    def step(
        self,
        brain_set: BrainSet,
        brain_states: Dict[str, np.ndarray],
        random_actions: bool = False,
        preprocess_brain_actions_for_env_fn:
        Callable = default_preprocess_brain_actions_for_env_fn
    ) -> Dict[str, dict]:
        """ Step the simulation, getting the next environment frame
        :param brain_set: The agent brains
        :param brain_states: Mapping from brain_name to a numpy ndarray of states
        :param random_actions: Whether to obtain random or learned actions
        :param preprocess_brain_actions_for_env_fn: Function for preprocessing brain actions prior to
            passing to the environment
        :return: Mapping from brain_name to the next environment frame, which includes:
            - states
            - actions
            - next_states
            - rewards
            - dones
        """
        if random_actions:
            brain_actions: Dict[
                str, List[Action]] = brain_set.get_random_actions(brain_states)
        else:
            brain_actions: Dict[str, List[Action]] = brain_set.get_actions(
                brain_states)

        actions: Dict[str, np.ndarray] = preprocess_brain_actions_for_env_fn(
            deepcopy(brain_actions))

        self.env_info = self.env.step(actions)

        next_brain_states = self.get_next_states(brain_set)

        output = {}
        for brain_name in brain_set.names():
            output[brain_name] = {
                'states': brain_states[brain_name],
                'actions': brain_actions[brain_name],
                'next_states': next_brain_states[brain_name],
                'rewards': self.env_info[brain_name].rewards,
                'dones': self.env_info[brain_name].local_done
            }
        return output
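An illustrative sketch (not the repository's default_step_agents_fn) of how a step_agents_fn can consume the per-brain frame returned by step() and hand each agent its experience tuple. The agent.step keyword signature used here is an assumption for illustration; the frame keys match the output documented above.

def example_step_agents_fn(brain_set, next_brain_environment, t):
    # Iterate (brain_name, brain) pairs, as elsewhere in these examples
    for brain_name, brain in brain_set:
        frame = next_brain_environment[brain_name]
        for i, agent in enumerate(brain.agents):
            # Hypothetical per-agent update with one experience tuple
            agent.step(
                state=frame['states'][i],
                action=frame['actions'][i],
                reward=frame['rewards'][i],
                next_state=frame['next_states'][i],
                done=frame['dones'][i],
            )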
Example no. 4
def get_solution_brain_set():
    params = deepcopy(default_cfg)
    update_params = {
        "MLP_FEATURES_HIDDEN": (512, ),
        "OUTPUT_FC_HIDDEN_SIZES": (128, ),
        "NUM_STACKED_FRAMES": 1,
        "MLP_FEATURES_DROPOUT": None,
        "OUTPUT_HIDDEN_DROPOUT": None,
        "DUELING": True,
    }

    params.update(update_params)

    policy = get_policy(ACTION_SIZE, params)

    featurizer = MLP(tuple([VECTOR_STATE_SHAPE[1]] +
                           list(params['MLP_FEATURES_HIDDEN'])),
                     dropout=params['MLP_FEATURES_DROPOUT'],
                     activation_function=nn.ReLU(),
                     output_function=nn.ReLU(),
                     seed=SEED)

    model = DQN(
        VECTOR_STATE_SHAPE,
        ACTION_SIZE,
        featurizer,
        params['MLP_FEATURES_HIDDEN'][-1],
        seed=SEED,
        grayscale=params["GRAYSCALE"],
        num_stacked_frames=params["NUM_STACKED_FRAMES"],
        output_hidden_layer_size=params["OUTPUT_FC_HIDDEN_SIZES"],
        OUTPUT_HIDDEN_DROPOUT=params["OUTPUT_HIDDEN_DROPOUT"],
        dueling_output=params["DUELING"],
        noisy_output=params['NOISY'],
        categorical_output=params['CATEGORICAL'],
    )

    print(model)

    optimizer = torch.optim.Adam(model.parameters(), lr=params['INITIAL_LR'])
    memory = get_memory(VECTOR_STATE_SHAPE, params)
    solution_agent = get_agent(VECTOR_STATE_SHAPE, ACTION_SIZE, model, policy,
                               memory, optimizer, params)

    banana_brain_ = Brain(
        brain_name=BRAIN_NAME,
        action_size=ACTION_SIZE,
        state_shape=VECTOR_STATE_SHAPE,
        observation_type='vector',
        agents=[solution_agent],
    )

    brain_set_ = BrainSet(brains=[banana_brain_])

    return brain_set_, params
Example no. 5
def get_solution_brain_set():
    agent = PPOAgent(
        state_size=STATE_SIZE,
        action_size=ACTION_SIZE,
        seed=SEED,
        actor_critic_factory=lambda: PPO_Actor_Critic(
            actor_model=MLP(layer_sizes=(STATE_SIZE, 128, 128, ACTION_SIZE),
                            seed=SEED,
                            output_function=torch.nn.Tanh(),
                            with_batchnorm=BATCHNORM,
                            output_layer_initialization_fn=lambda l:
                            init_layer_within_range(l),
                            hidden_layer_initialization_fn=lambda l:
                            init_layer_inverse_root_fan_in(l),
                            activation_function=torch.nn.LeakyReLU(True),
                            dropout=DROPOUT),
            critic_model=MLP(layer_sizes=(STATE_SIZE, 128, 128, 1),
                             seed=SEED,
                             output_function=torch.nn.Tanh(),
                             with_batchnorm=BATCHNORM,
                             output_layer_initialization_fn=lambda l:
                             init_layer_within_range(l),
                             hidden_layer_initialization_fn=lambda l:
                             init_layer_inverse_root_fan_in(l),
                             activation_function=torch.nn.LeakyReLU(True),
                             dropout=DROPOUT),
            action_size=ACTION_SIZE,
            continuous_actions=True,
        ),
        optimizer_factory=lambda params: torch.optim.Adam(
            params, lr=LR, weight_decay=WEIGHT_DECAY, eps=EPSILON),
        batch_size=BATCH_SIZE,
    )

    crawler_brain = Brain(
        brain_name=BRAIN_NAME,
        action_size=ACTION_SIZE,
        state_shape=STATE_SIZE,
        observation_type='vector',
        agents=[agent],
    )
    brain_set = BrainSet(brains=[crawler_brain])
    return brain_set
Example no. 6
    def evaluate(
            self,
            brain_set: BrainSet,
            n_episodes: int = 5,
            max_t: int = 1000,
            brain_reward_accumulation_fn: Callable = lambda rewards: np.array(
                rewards),
            episode_reward_accumulation_fn: Callable = lambda
        brain_episode_scores: float(
            np.mean([
                np.mean(brain_episode_scores[brain_name])
                for brain_name in brain_episode_scores
            ])),
            end_of_episode_score_display_fn: Callable = lambda i_episode,
        episode_aggregated_score, training_scores:
        '\rEpisode {}\tScore: {:.2f}\tAverage Score: {:.2f}'.format(
            i_episode, episode_aggregated_score,
            training_scores.get_mean_sliding_scores()),
            sliding_window_size: int = 100,
            end_episode_criteria: Callable = np.all) -> Tuple[BrainSet, float]:
        """
        Evaluate the agent in the environment
        :param brain_set: The agent brains to undergo training
        :param n_episodes: The number of episodes to train over
        :param max_t: The maximum number of time steps allowed in each episode
        :param brain_reward_accumulation_fn: Function used to accumulate rewards for each brain
        :param episode_reward_accumulation_fn: Function used to aggregate rewards across brains
        :param end_of_episode_score_display_fn: Function used to print out end-of-episode scalar score
        :param sliding_window_size: Size of the sliding window to average episode scores over
        :param end_episode_criteria: Function acting on a list of booleans
            (identifying whether that agent's episode has terminated) to determine whether the episode is finished
        :return: Tuple of (brain_set, average_score)
        """
        for brain in brain_set.brains():
            for agent in brain.agents:
                agent.set_mode('eval')
                agent.set_warmup(False)

        self.evaluation_scores = Scores(window_size=sliding_window_size)

        for i_episode in range(1, n_episodes + 1):
            self.reset_env(train_mode=False)
            brain_states = self.get_next_states(brain_set)

            brain_episode_scores = {
                brain_name: None
                for brain_name, brain in brain_set
            }

            for t in range(max_t):
                next_brain_environment = self.step(brain_set=brain_set,
                                                   brain_states=brain_states)

                brain_states = {
                    brain_name:
                    next_brain_environment[brain_name]['next_states']
                    for brain_name in brain_states
                }
                for brain_name in brain_episode_scores:
                    scores = brain_reward_accumulation_fn(
                        next_brain_environment[brain_name]['rewards'])
                    if brain_episode_scores[brain_name] is None:
                        brain_episode_scores[brain_name] = scores
                    else:
                        brain_episode_scores[brain_name] += scores

                all_dones = []
                for brain_name in brain_set.names():
                    all_dones.extend(
                        next_brain_environment[brain_name]['dones'])

                if end_episode_criteria(all_dones):
                    break

            episode_aggregated_score = episode_reward_accumulation_fn(
                brain_episode_scores)
            self.evaluation_scores.add(episode_aggregated_score)
            print(end_of_episode_score_display_fn(i_episode,
                                                  episode_aggregated_score,
                                                  self.evaluation_scores),
                  end='\n')
        average_score = self.evaluation_scores.get_mean_sliding_scores()
        return brain_set, average_score
Example no. 7
    def train(
        self,
        brain_set: BrainSet,
        solved_score: Optional[float] = None,
        n_episodes=2000,
        max_t=1000,
        sliding_window_size: int = 100,
        step_agents_fn: Callable = default_step_agents_fn,
        step_episode_agents_fn: Callable = default_step_episode_agents_fn,
        brain_reward_accumulation_fn: Callable = lambda rewards: np.array(
            rewards),
        episode_reward_accumulation_fn: Callable = lambda brain_episode_scores:
        float(
            np.mean([
                np.mean(brain_episode_scores[brain_name])
                for brain_name in brain_episode_scores
            ])),
        preprocess_brain_actions_for_env_fn:
        Callable = default_preprocess_brain_actions_for_env_fn,
        end_episode_criteria: Callable = np.all,
        end_of_episode_score_display_fn: Callable = lambda i_episode,
        episode_aggregated_score, training_scores:
        '\rEpisode {}\tScore: {:.2f}\tAverage Score: {:.2f}'.format(
            i_episode, episode_aggregated_score,
            training_scores.get_mean_sliding_scores()),
        aggregate_end_of_episode_score_fn: Callable = lambda training_scores:
        training_scores.get_mean_sliding_scores()
    ) -> Tuple[BrainSet, Scores, int, float]:
        """
        Train a set of agents (brain-set) in an environment
        :param brain_set: The agent brains to undergo training
        :param solved_score: The score (averaged over sliding_window_size episodes) required to consider the task solved
        :param n_episodes: The number of episodes to train over
        :param max_t: The maximum number of time steps allowed in each episode
        :param sliding_window_size: Size of the sliding window to average episode scores over
        :param step_agents_fn: Function used to update the agents with a new experience sampled from the environment
        :param step_episode_agents_fn: Function used to step the agents at the end of each episode
        :param preprocess_brain_actions_for_env_fn: Function used to preprocess actions from the agents before
         passing to the environment
        :param brain_reward_accumulation_fn: Function used to accumulate rewards for each brain
        :param episode_reward_accumulation_fn: Function used to aggregate rewards across brains
        :param end_of_episode_score_display_fn: Function used to print out end-of-episode scalar score
        :param end_episode_criteria: Function acting on a list of booleans
            (identifying whether that agent's episode has terminated) to determine whether the episode is finished
        :param aggregate_end_of_episode_score_fn: Function used to aggregate the end-of-episode score function.
            Defaults to averaging over the past sliding_window_size episode scores
        :return: Tuple of (brain_set, training_scores, i_episode, training_time)
            brain_set (BrainSet): The trained BrainSet
            training_scores (Scores): Scores object containing all historic and sliding-window scores
            i_episode (int): The number of episodes required to solve the task
            training_time (float): The total training time in seconds
        """

        for brain in brain_set.brains():
            for agent in brain.agents:
                agent.set_mode('train')
                agent.set_warmup(False)

        self.training_scores = Scores(window_size=sliding_window_size)

        t_start = time.time()
        for i_episode in range(1, n_episodes + 1):
            self.reset_env(train_mode=True)
            brain_states = self.get_next_states(brain_set)

            brain_episode_scores = OrderedDict([
                (brain_name, None) for brain_name, brain in brain_set
            ])

            for t in range(max_t):
                next_brain_environment = self.step(
                    brain_set=brain_set,
                    brain_states=brain_states,
                    preprocess_brain_actions_for_env_fn=
                    preprocess_brain_actions_for_env_fn)
                step_agents_fn(brain_set, next_brain_environment, t)

                brain_states = {
                    brain_name:
                    next_brain_environment[brain_name]['next_states']
                    for brain_name in brain_states
                }

                for brain_name in brain_episode_scores:
                    # Brain rewards are a scalar per agent, e.g.
                    # next_brain_environment[brain_name]['rewards'] == [0.0, 0.0]
                    brain_rewards = brain_reward_accumulation_fn(
                        next_brain_environment[brain_name]['rewards'])
                    if brain_episode_scores[brain_name] is None:
                        brain_episode_scores[brain_name] = brain_rewards
                    else:
                        brain_episode_scores[brain_name] += brain_rewards

                all_dones = []
                for brain_name in brain_set.names():
                    all_dones.extend(
                        next_brain_environment[brain_name]['dones'])

                if end_episode_criteria(all_dones):
                    break

            # Step episode for agents
            step_episode_agents_fn(brain_set, i_episode)

            # Brain episode scores are of form: {'<brain_name>': <output_of_brain_reward_accumulation_fn>}
            episode_aggregated_score = episode_reward_accumulation_fn(
                brain_episode_scores)
            self.training_scores.add(episode_aggregated_score)

            if i_episode % 100 == 0:
                end = '\n'
            else:
                end = ""

            print(end_of_episode_score_display_fn(i_episode,
                                                  episode_aggregated_score,
                                                  self.training_scores),
                  end=end)
            if solved_score and aggregate_end_of_episode_score_fn(
                    self.training_scores) >= solved_score:
                print("\nTotal Training time = {:.1f} min".format(
                    (time.time() - t_start) / 60))
                print(
                    '\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'
                    .format(i_episode,
                            self.training_scores.get_mean_sliding_scores()))
                break
        training_time = round(time.time() - t_start)

        return brain_set, self.training_scores, i_episode, training_time
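A sketch of the usual train-then-evaluate flow, assuming the simulator object exposes the train() and evaluate() methods shown in these examples; SOLVED_SCORE is a placeholder threshold.

brain_set, training_scores, i_episode, training_time = simulator.train(
    brain_set, solved_score=SOLVED_SCORE, n_episodes=2000, max_t=1000)

# Evaluate the trained brain-set without learning updates
brain_set, average_eval_score = simulator.evaluate(
    brain_set, n_episodes=5, max_t=1000)
print("Average evaluation score: {:.2f}".format(average_eval_score))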
Example no. 8
        brain_name=GOALIE_BRAIN_NAME,
        action_size=GOALIE_ACTION_SIZE,
        state_shape=GOALIE_STATE_SIZE,
        observation_type='vector',
        agents=goalie_agents,
    )

    striker_brain = Brain(
        brain_name=STRIKER_BRAIN_NAME,
        action_size=STRIKER_ACTION_SIZE,
        state_shape=STRIKER_STATE_SIZE,
        observation_type='vector',
        agents=striker_agents,
    )

    brain_set = BrainSet(brains=[goalie_brain, striker_brain])

    for brain_name, brain in brain_set:
        for agent_num, agent in enumerate(brain.agents):
            agent_id = "{}_{}".format(brain_name, agent_num)
            if brain_name == 'GoalieBrain':
                action_size = GOALIE_ACTION_SIZE
                action_range = GOALIE_ACTION_DISCRETE_RANGE
            elif brain_name == 'StrikerBrain':
                action_size = STRIKER_ACTION_SIZE
                action_range = STRIKER_ACTION_DISCRETE_RANGE
            else:
                raise ValueError("Unexpected brain name: {}".format(brain_name))

            agent.policy = IndependentMADDPGPolicy(
                brain_set=brain_set,
Example no. 9
def banana_tuning(update_params: dict):
    params = deepcopy(default_cfg)
    params.update(update_params)
    try:
        params['OUTPUT_FC_HIDDEN_SIZES'] = ast.literal_eval(
            params['OUTPUT_FC_HIDDEN_SIZES'])
        params['SUPPORT_RANGE'] = ast.literal_eval(params['SUPPORT_RANGE'])
        params['MLP_FEATURES_HIDDEN'] = ast.literal_eval(
            params['MLP_FEATURES_HIDDEN'])

        policy = get_policy(ACTION_SIZE, params)

        featurizer = MLP(tuple([VECTOR_STATE_SHAPE[1]] +
                               list(params['MLP_FEATURES_HIDDEN'])),
                         dropout=params['MLP_FEATURES_DROPOUT'],
                         activation_function=nn.ReLU(True),
                         output_function=nn.ReLU(True),
                         seed=SEED)

        model = DQN(
            VECTOR_STATE_SHAPE,
            ACTION_SIZE,
            featurizer,
            params['MLP_FEATURES_HIDDEN'][-1],
            seed=SEED,
            grayscale=params["GRAYSCALE"],
            num_stacked_frames=params["NUM_STACKED_FRAMES"],
            output_hidden_layer_size=params["OUTPUT_FC_HIDDEN_SIZES"],
            OUTPUT_HIDDEN_DROPOUT=params["OUTPUT_HIDDEN_DROPOUT"],
            dueling_output=params["DUELING"],
            noisy_output=params['NOISY'],
            categorical_output=params['CATEGORICAL'],
        )

        print(model)
        optimizer = torch.optim.Adam(model.parameters(),
                                     lr=params['INITIAL_LR'])

        memory = get_memory(VECTOR_STATE_SHAPE, params)

        agent = get_agent(VECTOR_STATE_SHAPE, ACTION_SIZE, model, policy,
                          memory, optimizer, params)

        banana_brain = Brain(
            brain_name=BRAIN_NAME,
            action_size=ACTION_SIZE,
            state_shape=VECTOR_STATE_SHAPE,
            observation_type='vector',
            agents=[agent],
        )

        brain_set = BrainSet(brains=[banana_brain])

        # Run performance evaluation
        performance, info = simulator.get_agent_performance(
            brain_set=brain_set,
            n_train_episodes=params["N_EPISODES"],
            n_eval_episodes=params["N_EVAL_EPISODES"],
            max_t=params["MAX_T"],
        )
        info['input_params'] = params

        write_tuning_data(info, performance)

        global TRIAL_COUNTER
        TRIAL_COUNTER += 1

        print("Performance is : {}".format(performance))
        return performance
    except Exception as e:
        print(e)
        return 0
Example no. 10
def visual_banana_tuning(update_params: dict):
    params = deepcopy(default_cfg)
    params.update(update_params)
    try:
        params['SUPPORT_RANGE'] = ast.literal_eval(params['SUPPORT_RANGE'])
        params['OUTPUT_FC_HIDDEN_SIZES'] = ast.literal_eval(params['OUTPUT_FC_HIDDEN_SIZES'])
        params['FILTERS'] = ast.literal_eval(params['FILTERS'])
        params['KERNEL_SIZES'] = [ast.literal_eval(i) for i in ast.literal_eval(params["KERNEL_SIZES"])]
        params['STRIDE_SIZES'] = [ast.literal_eval(i) for i in ast.literal_eval(params["STRIDE_SIZES"])]

        policy = get_policy(ACTION_SIZE, params)
        print(params)
        featurizer = CNN(
            image_shape=IMAGE_SHAPE,
            num_stacked_frames=params["NUM_STACKED_FRAMES"],
            grayscale=params["GRAYSCALE"],
            filters=params["FILTERS"],
            kernel_sizes=params["KERNEL_SIZES"],
            stride_sizes=params["STRIDE_SIZES"],
        )

        model = VisualDQN(
            VISUAL_STATE_SHAPE,
            ACTION_SIZE,
            featurizer,
            featurizer.output_size,
            seed=SEED,
            grayscale=params["GRAYSCALE"],
            num_stacked_frames=params["NUM_STACKED_FRAMES"],
            output_hidden_layer_size=params["OUTPUT_FC_HIDDEN_SIZES"],
            OUTPUT_HIDDEN_DROPOUT=params["OUTPUT_HIDDEN_DROPOUT"],
            dueling_output=params["DUELING"],
            noisy_output=params['NOISY'],
            categorical_output=params['CATEGORICAL'],
        )

        print(model)
        optimizer = torch.optim.Adam(model.parameters(), lr=params['INITIAL_LR'])

        memory = get_memory(VISUAL_STATE_SHAPE, params)

        agent = get_agent(VISUAL_STATE_SHAPE, ACTION_SIZE, model, policy, memory, optimizer, params)

        # Run performance evaluation
        banana_brain = Brain(
            brain_name=BRAIN_NAME,
            action_size=ACTION_SIZE,
            state_shape=VISUAL_STATE_SHAPE,
            observation_type='visual',
            agents=[agent],
            preprocess_state_fn=get_preprocess_state_fn(params)
        )

        brain_set = BrainSet(brains=[banana_brain])

        performance, info = simulator.get_agent_performance(
            brain_set=brain_set,
            n_train_episodes=params["N_EPISODES"],
            n_eval_episodes=params["N_EVAL_EPISODES"],
            max_t=params["MAX_T"],
        )
        info['input_params'] = params

        global TRIAL_COUNTER
        TRIAL_COUNTER += 1

        write_tuning_data(info, performance)

        print(f"Performance is : {performance}")
        return performance
    except Exception as e:
        # Failures can occur due to invalid CNN sizes
        print("FAILURE IN HYPERPARAMETER TUNING::: {}, {}".format(e, sys.exc_info()))
        return 0
Example no. 11
def get_solution_brain_set():
    tennis_agents = []
    for i in range(2):
        key = "TennisBrain_{}".format(i)
        agent = MAPPOAgent(
            agent_id=key,
            state_size=STATE_SIZE,
            action_size=ACTION_SIZE,
            map_agent_to_state_slice={
                "TennisBrain_0": lambda t: t[:, 0:24],
                "TennisBrain_1": lambda t: t[:, 24:48]
            },
            map_agent_to_action_slice={
                "TennisBrain_0": lambda t: t[:, 0:2],
                "TennisBrain_1": lambda t: t[:, 2:4]
            },
            actor_critic_factory=lambda: MAPPO_Actor_Critic(
                actor_model=MLP(
                    layer_sizes=(STATE_SIZE, 256, 128, ACTION_SIZE),
                    seed=SEED,
                    # output_function=BoundVectorNorm(),
                    output_function=torch.nn.Tanh(),
                    with_batchnorm=BATCHNORM,
                    activation_function=torch.nn.ReLU(True),
                    hidden_layer_initialization_fn=
                    init_layer_inverse_root_fan_in,
                    output_layer_initialization_fn=get_init_layer_within_rage(
                        limit_range=(-3e-4, 3e-4)),
                    dropout=DROPOUT),
                critic_model=MACritic(
                    state_featurizer=MLP(
                        layer_sizes=(STATE_SIZE * 2 + ACTION_SIZE, 256),
                        with_batchnorm=BATCHNORM,
                        dropout=DROPOUT,
                        seed=SEED,
                        output_function=torch.nn.ReLU(),
                    ),
                    output_module=MLP(
                        layer_sizes=(256 + ACTION_SIZE, 128, 1),
                        with_batchnorm=BATCHNORM,
                        dropout=DROPOUT,
                        seed=SEED,
                        output_layer_initialization_fn=
                        get_init_layer_within_rage(limit_range=(-3e-4, 3e-4)),
                        activation_function=torch.nn.ReLU(True),
                    ),
                ),
                action_size=ACTION_SIZE,
                continuous_actions=True,
            ),
            optimizer_factory=lambda params: torch.optim.AdamW(
                params, lr=LR, weight_decay=WEIGHT_DECAY, eps=EPSILON),
            continuous_action_range_clip=(-1, 1),
            batch_size=256,
            min_batches_for_training=16,
            num_learning_updates=10,
            beta_scheduler=ParameterScheduler(initial=0.01,
                                              lambda_fn=lambda i: 0.01,
                                              final=0.01),
            std_scale_scheduler=ParameterScheduler(
                initial=0.8, lambda_fn=lambda i: 0.8 * 0.999**i, final=0.2),
            seed=SEED)
        tennis_agents.append(agent)

    tennis_brain = Brain(
        brain_name="TennisBrain",
        action_size=ACTION_SIZE,
        state_shape=STATE_SIZE,
        observation_type='vector',
        agents=tennis_agents,
    )

    brain_set = BrainSet(brains=[tennis_brain])
    return brain_set
Example no. 12
def multi_agent_step_episode_agents_fn(brain_set: BrainSet, episode):
    for brain_name in brain_set.names():
        for _, agent in enumerate(brain_set[brain_name].agents):
            agent.step_episode(episode)
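A sketch of wiring this multi-agent episode hook into train(); the simulator object and the other arguments are placeholders.

simulator.train(
    brain_set,
    n_episodes=500,
    max_t=1000,
    step_episode_agents_fn=multi_agent_step_episode_agents_fn,
)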
Example no. 13
def get_solution_brain_set():
    # Define the solution hyperparameters
    params = deepcopy(default_cfg)

    update_params = {
        "INITIAL_LR": 5e-4,
        "NUM_STACKED_FRAMES": 4,
        "OUTPUT_HIDDEN_DROPOUT": 0.1,
        "DUELING": True,
        "NOISY": True,
        "BATCH_SIZE": 64,
        "N_FILTERS": (64, 128, 128),
        "EPS_DECAY_FACTOR": 0.995,
        "KERNEL_SIZES": [(1, 8, 8), (1, 4, 4), (4, 3, 3)],
        "STRIDE_SIZES": [(1, 4, 4), (1, 2, 2), (1, 3, 3)],
        "OUTPUT_FC_HIDDEN_SIZES": (1024, ),
        "WARMUP_STEPS": 10000,
    }

    params.update(update_params)

    print("Params are: {}".format(json.dumps(params, indent=2)))

    policy = get_policy(ACTION_SIZE, params)

    featurizer = CNN(
        image_shape=VISUAL_STATE_SHAPE[1:],
        num_stacked_frames=params["NUM_STACKED_FRAMES"],
        grayscale=params["GRAYSCALE"],
        nfilters=params["N_FILTERS"],
        kernel_sizes=params["KERNEL_SIZES"],
        stride_sizes=params["STRIDE_SIZES"],
    )

    model = VisualDQN(
        VISUAL_STATE_SHAPE,
        ACTION_SIZE,
        featurizer,
        featurizer.output_size,
        seed=SEED,
        grayscale=params["GRAYSCALE"],
        num_stacked_frames=params["NUM_STACKED_FRAMES"],
        output_hidden_layer_size=params["OUTPUT_FC_HIDDEN_SIZES"],
        OUTPUT_HIDDEN_DROPOUT=params["OUTPUT_HIDDEN_DROPOUT"],
        dueling_output=params["DUELING"],
        noisy_output=params['NOISY'],
        categorical_output=params['CATEGORICAL'],
    )

    print(model)

    optimizer = torch.optim.Adam(model.parameters(), lr=params['INITIAL_LR'])

    memory = get_memory(VISUAL_STATE_SHAPE, params)

    solution_agent = get_agent(VISUAL_STATE_SHAPE, ACTION_SIZE, model, policy,
                               memory, optimizer, params)

    banana_brain_ = Brain(brain_name=BRAIN_NAME,
                          action_size=ACTION_SIZE,
                          state_shape=VISUAL_STATE_SHAPE,
                          observation_type='visual',
                          agents=[solution_agent],
                          preprocess_state_fn=get_preprocess_state_fn(params))

    brain_set_ = BrainSet(brains=[banana_brain_])
    return brain_set_, params
Example no. 14
def get_solution_brain_set():
    tennis_agents = []

    state_featurizer = MLP(
        layer_sizes=(STATE_SIZE * 2 + ACTION_SIZE, 400),
        with_batchnorm=BATCHNORM,
        activation_function=torch.nn.ReLU(True),
    )
    output_module = MLP(
        layer_sizes=(400 + ACTION_SIZE, 300, 1),
        with_batchnorm=BATCHNORM,
        activation_function=torch.nn.ReLU(True),
        output_layer_initialization_fn=get_init_layer_within_rage(
            limit_range=(-3e-4, 3e-4)))

    memory_factory = lambda: PrioritizedMemory(
        capacity=BUFFER_SIZE,
        state_shape=(1, STATE_SIZE),
        alpha_scheduler=ParameterScheduler(
            initial=0.6,
            lambda_fn=lambda i: 0.6 - 0.6 * i / NUM_EPISODES,
            final=0.),
        # Anneal beta linearly
        beta_scheduler=ParameterScheduler(
            initial=0.4,
            final=1,
            lambda_fn=lambda i: 0.4 + 0.6 * i / NUM_EPISODES),
        seed=SEED,
        continuous_actions=True,
        min_priority=1e-4)

    if MATD3:
        critic_factory = lambda: MATD3Critic(
            critic_model_factory=lambda: MACritic(
                state_featurizer=state_featurizer,
                output_module=output_module,
                seed=SEED,
            ),
            seed=SEED)
    else:
        critic_factory = lambda: MACritic(
            state_featurizer=state_featurizer,
            output_module=output_module,
        )

    for i in range(2):
        key = "TennisBrain_{}".format(i)
        tennis_agent = MADDPGAgent(
            key,
            None,
            STATE_SIZE,
            ACTION_SIZE,
            critic_factory=critic_factory,
            actor_factory=lambda: MLP(
                layer_sizes=(STATE_SIZE, 400, 300, ACTION_SIZE),
                with_batchnorm=BATCHNORM,
                dropout=DROPOUT,
                output_function=BoundVectorNorm(),
                output_layer_initialization_fn=init_layer_within_range,
                hidden_layer_initialization_fn=init_layer_inverse_root_fan_in,
                seed=SEED),
            critic_optimizer_factory=lambda parameters: optim.Adam(
                parameters, lr=CRITIC_LR, weight_decay=1.e-5),
            actor_optimizer_factory=lambda parameters: optim.Adam(parameters,
                                                                  lr=ACTOR_LR),
            memory_factory=memory_factory,
            seed=0,
            batch_size=BATCH_SIZE,
            homogeneous_agents=False,
        )

        tennis_agents.append(tennis_agent)

    tennis_brain = Brain(
        brain_name=BRAIN_NAME,
        action_size=ACTION_SIZE,
        state_shape=STATE_SIZE,
        observation_type='vector',
        agents=tennis_agents,
    )

    brain_set = BrainSet(brains=[tennis_brain])

    # Update the policy with the independent MADDPG policy
    # This is done so that each agent will receive the other agents'
    # states/actions during training to guide actor learning.
    for i, agent in enumerate(tennis_agents):
        agent_id = "TennisBrain_{}".format(i)
        agent.policy = IndependentMADDPGPolicy(
            brain_set=brain_set,
            agent_id=agent_id,
            action_dim=ACTION_SIZE,
            epsilon_scheduler=ParameterScheduler(initial=1,
                                                 lambda_fn=lambda i: 0.99**i,
                                                 final=0.01),
            random_brain_action_factory=lambda: RandomBrainAction(
                ACTION_SIZE,
                1,
                continuous_actions=True,
                continuous_action_range=(-1, 1),
            ),
            map_agent_to_state_slice={
                "TennisBrain_0": lambda t: t[:, 0:24],
                "TennisBrain_1": lambda t: t[:, 24:48]
            },
            map_agent_to_action_slice={
                "TennisBrain_0": lambda t: t[:, 0:2],
                "TennisBrain_1": lambda t: t[:, 2:4]
            },
            matd3=MATD3,
            gaussian_noise_factory=lambda: GaussianNoise(),
            continuous_actions=True,
            continuous_actions_clip_range=(-1, 1))

    return brain_set
Example no. 15
def get_solution_brain_set():
    params = {
        'striker_actor_layer_size': (STRIKER_STATE_SIZE, 256, 256, len(range(*STRIKER_ACTION_DISCRETE_RANGE))),
        'goalie_actor_layer_size': (GOALIE_STATE_SIZE, 256, 256, len(range(*GOALIE_ACTION_DISCRETE_RANGE))),
        'striker_critic_state_featurizer_layer_size': (336*4 + 3, 256),
        'striker_critic_output_layer_size': (256 + 1, 256, 1),
        'goalie_critic_state_featurizer_layer_size': (336 * 4 + 3, 256),
        'goalie_critic_output_layer_size': (256 + 1, 256, 1),
        'batchnorm': True,
        'actor_dropout': 0.1,
        'critic_dropout': 0.2,
        'lr': 5e-3,
        'weight_decay': 1e-4,
        'eps': 1e-6,
        'num_ppo_epochs': 4,
        'minimum_training_batches': 32,
        'batch_size': 1024
    }

    goalie_agents = []
    for agent_num in range(NUM_GOALIE_AGENTS):
        key = 'GoalieBrain_{}'.format(agent_num)
        if agent_num == 1:
            goalie_agent = DummyMADDPGAgent(
                GOALIE_STATE_SIZE,
                len(range(*GOALIE_ACTION_DISCRETE_RANGE)),
                seed=SEED,
                map_agent_to_state_slice={
                    "GoalieBrain_0": lambda t: t[:, 0:336],
                    "GoalieBrain_1": lambda t: t[:, 336:672],
                    "StrikerBrain_0": lambda t: t[:, 672:1008],
                    "StrikerBrain_1": lambda t: t[:, 1008:]
                },
                map_agent_to_action_slice={
                    "GoalieBrain_0": lambda t: t[:, 0:1],
                    "GoalieBrain_1": lambda t: t[:, 1:2],
                    "StrikerBrain_0": lambda t: t[:, 2:3],
                    "StrikerBrain_1": lambda t: t[:, 3:4]
                },
            )
        else:
            goalie_agent = MAPPOAgent(
                agent_id=key,
                state_size=GOALIE_STATE_SIZE,
                action_size=len(range(*GOALIE_ACTION_DISCRETE_RANGE)),
                seed=SEED,
                map_agent_to_state_slice={
                    "GoalieBrain_0": lambda t: t[:, 0:336],
                    "GoalieBrain_1": lambda t: t[:, 336:672],
                    "StrikerBrain_0": lambda t: t[:, 672:1008],
                    "StrikerBrain_1": lambda t: t[:, 1008:]
                },
                map_agent_to_action_slice={
                    "GoalieBrain_0": lambda t: t[:, 0:1],
                    "GoalieBrain_1": lambda t: t[:, 1:2],
                    "StrikerBrain_0": lambda t: t[:, 2:3],
                    "StrikerBrain_1": lambda t: t[:, 3:4]
                },
                actor_critic_factory=lambda: MAPPO_Actor_Critic(
                    actor_model=MLP(
                        layer_sizes=params['goalie_actor_layer_size'],
                        seed=SEED,
                        output_function=torch.nn.Softmax(),
                        with_batchnorm=params['batchnorm'],
                        activation_function=torch.nn.LeakyReLU(True),
                        dropout=params['actor_dropout']
                    ),
                    critic_model=MACritic(
                        state_featurizer=MLP(
                            layer_sizes=params['goalie_critic_state_featurizer_layer_size'],
                            with_batchnorm=params['batchnorm'],
                            dropout=params['critic_dropout'],
                            seed=SEED
                        ),
                        output_module=MLP(
                            layer_sizes=params['goalie_critic_output_layer_size'],
                            with_batchnorm=params['batchnorm'],
                            dropout=params['critic_dropout'],
                            seed=SEED,
                        ),
                    ),
                    action_size=GOALIE_ACTION_SIZE,
                    continuous_actions=False,
                    seed=SEED
                ),
                min_batches_for_training=params['minimum_training_batches'],
                num_learning_updates=params['num_ppo_epochs'],
                optimizer_factory=lambda model_params: torch.optim.AdamW(
                    model_params, lr=params['lr'], weight_decay=params['weight_decay'], eps=params['eps']
                ),
                continuous_actions=False,
                batch_size=params['batch_size'],
                beta_scheduler=ParameterScheduler(initial=0.01, lambda_fn=lambda i: 0.01, final=0.01),
                std_scale_scheduler=ParameterScheduler(initial=0.8,
                                                       lambda_fn=lambda i: 0.8 * 0.999 ** i,
                                                       final=0.2),
            )
            print("Goalie is: {}".format(goalie_agent.online_actor_critic))
        goalie_agents.append(goalie_agent)

    striker_agents = []
    for agent_num in range(NUM_STRIKER_AGENTS):
        key = 'StrikerBrain_{}'.format(agent_num)
        if agent_num == 1:
            striker_agent = DummyMADDPGAgent(
                STRIKER_STATE_SIZE,
                len(range(*STRIKER_ACTION_DISCRETE_RANGE)),
                SEED,
                map_agent_to_state_slice={
                    "GoalieBrain_0": lambda t: t[:, 0:336],
                    "GoalieBrain_1": lambda t: t[:, 336:672],
                    "StrikerBrain_0": lambda t: t[:, 672:1008],
                    "StrikerBrain_1": lambda t: t[:, 1008:]
                },
                map_agent_to_action_slice={
                    "GoalieBrain_0": lambda t: t[:, 0:1],
                    "GoalieBrain_1": lambda t: t[:, 1:2],
                    "StrikerBrain_0": lambda t: t[:, 2:3],
                    "StrikerBrain_1": lambda t: t[:, 3:4]
                },
            )
        else:
            striker_agent = MAPPOAgent(
                agent_id=key,
                state_size=STRIKER_STATE_SIZE,
                action_size=len(range(*STRIKER_ACTION_DISCRETE_RANGE)),
                seed=SEED,
                map_agent_to_state_slice={
                    "GoalieBrain_0": lambda t: t[:, 0:336],
                    "GoalieBrain_1": lambda t: t[:, 336:672],
                    "StrikerBrain_0": lambda t: t[:, 672:1008],
                    "StrikerBrain_1": lambda t: t[:, 1008:]
                },
                map_agent_to_action_slice={
                    "GoalieBrain_0": lambda t: t[:, 0:1],
                    "GoalieBrain_1": lambda t: t[:, 1:2],
                    "StrikerBrain_0": lambda t: t[:, 2:3],
                    "StrikerBrain_1": lambda t: t[:, 3:4]
                },
                actor_critic_factory=lambda: MAPPO_Actor_Critic(
                    actor_model=MLP(
                        layer_sizes=params['striker_actor_layer_size'],
                        seed=SEED,
                        output_function=torch.nn.Softmax(),
                        with_batchnorm=params['batchnorm'],
                        activation_function=torch.nn.LeakyReLU(True),
                        dropout=params['actor_dropout']
                    ),
                    critic_model=MACritic(
                        state_featurizer=MLP(
                            layer_sizes=params['striker_critic_state_featurizer_layer_size'],
                            with_batchnorm=params['batchnorm'],
                            dropout=params['critic_dropout'],
                            seed=SEED,
                        ),
                        output_module=MLP(
                            layer_sizes=params['striker_critic_output_layer_size'],
                            with_batchnorm=params['batchnorm'],
                            dropout=params['critic_dropout'],
                            seed=SEED,
                        ),
                    ),
                    action_size=STRIKER_ACTION_SIZE,
                    continuous_actions=False,
                    seed=SEED
                ),
                optimizer_factory=lambda model_params: torch.optim.AdamW(
                    model_params, lr=params['lr'], weight_decay=params['weight_decay'], eps=params['eps']
                ),
                min_batches_for_training=params['minimum_training_batches'],
                num_learning_updates=params['num_ppo_epochs'],
                continuous_actions=False,
                batch_size=params['batch_size'],
                beta_scheduler=ParameterScheduler(initial=0.01, lambda_fn=lambda i: 0.01, final=0.01),
                std_scale_scheduler=ParameterScheduler(initial=0.8,
                                                       lambda_fn=lambda i: 0.8 * 0.999 ** i,
                                                       final=0.2),
            )
            print("Striker is: {}".format(striker_agent.online_actor_critic))
        striker_agents.append(striker_agent)

    goalie_brain = Brain(
        brain_name=GOALIE_BRAIN_NAME,
        action_size=GOALIE_ACTION_SIZE,
        state_shape=GOALIE_STATE_SIZE,
        observation_type='vector',
        agents=goalie_agents,
    )

    striker_brain = Brain(
        brain_name=STRIKER_BRAIN_NAME,
        action_size=STRIKER_ACTION_SIZE,
        state_shape=STRIKER_STATE_SIZE,
        observation_type='vector',
        agents=striker_agents,
    )

    brain_set = BrainSet(brains=[goalie_brain, striker_brain])
    return brain_set