def warmup(self,
           brain_set: BrainSet,
           n_episodes: int,
           max_t: int,
           step_agents_fn: Callable = default_step_agents_fn,
           preprocess_brain_actions_for_env_fn: Callable = default_preprocess_brain_actions_for_env_fn,
           end_episode_criteria=np.all) -> None:
    """Act randomly in the environment, storing experience tuples in trajectory/memory buffers.

    Used to initialize memory objects such as prioritized experience replay.

    :param brain_set: The agent brains to undergo warmup
    :param n_episodes: The number of warmup episodes to run
    :param max_t: The maximum number of time steps allowed in each episode
    :param step_agents_fn: Function used to update the agents with a new experience sampled
        from the environment
    :param preprocess_brain_actions_for_env_fn: Function used to preprocess actions from the
        agents before passing them to the environment
    :param end_episode_criteria: Function acting on a list of booleans (identifying whether
        each agent's episode has terminated) to determine whether the episode is finished
    :return: None
    """
    print("Performing warmup with {} episodes and max_t={}".format(n_episodes, max_t))
    for brain in brain_set.brains():
        for agent in brain.agents:
            agent.set_mode('train')
            agent.set_warmup(True)

    t1 = time.time()
    for i_episode in range(1, n_episodes + 1):
        self.reset_env(train_mode=True)
        brain_states = self.get_next_states(brain_set)
        for t in range(max_t):
            next_brain_environment = self.step(
                brain_set=brain_set,
                brain_states=brain_states,
                random_actions=True,
                preprocess_brain_actions_for_env_fn=preprocess_brain_actions_for_env_fn)
            step_agents_fn(brain_set, next_brain_environment, t)
            brain_states = {
                brain_name: next_brain_environment[brain_name]['next_states']
                for brain_name in brain_states
            }
            all_dones = []
            for brain_name in brain_set.names():
                all_dones.extend(next_brain_environment[brain_name]['dones'])
            if end_episode_criteria(all_dones):
                break
        print('\rEpisode {}\tTimestep: {}'.format(i_episode, t), end="")
    print("Finished warmup in {}s".format(round(time.time() - t1)))
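# A minimal usage sketch for the warmup hook. The `simulator` object (owner of warmup/train/
# evaluate) and the `get_solution_brain_set` helper are assumptions about the surrounding
# project, not part of this file; episode counts are illustrative only.
brain_set = get_solution_brain_set()
simulator.warmup(brain_set, n_episodes=10, max_t=1000)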
def get_solution_brain_set():
    memory = PrioritizedMemory(
        capacity=REPLAY_BUFFER_SIZE,
        state_shape=(1, STATE_SIZE),
        # Anneal alpha linearly
        alpha_scheduler=ParameterScheduler(
            initial=0.6,
            lambda_fn=lambda i: 0.6 - 0.6 * i / NUM_EPISODES,
            final=0.),
        # Anneal beta linearly
        beta_scheduler=ParameterScheduler(
            initial=0.4,
            final=1,
            lambda_fn=lambda i: 0.4 + 0.6 * i / NUM_EPISODES),
        seed=SEED,
        continuous_actions=True,
        min_priority=MIN_PRIORITY)

    reacher_brain = Brain(
        brain_name=BRAIN_NAME,
        action_size=ACTION_SIZE,
        state_shape=STATE_SIZE,
        observation_type='vector',
        agents=[get_agent(memory)],
    )

    brain_set = BrainSet(brains=[reacher_brain])
    return brain_set
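# A quick sanity check of the annealing lambdas above, in plain Python and independent of
# ParameterScheduler; the NUM_EPISODES value here is an assumed stand-in for the project constant.
NUM_EPISODES = 300

alpha = lambda i: 0.6 - 0.6 * i / NUM_EPISODES  # prioritization exponent: 0.6 -> 0.0
beta = lambda i: 0.4 + 0.6 * i / NUM_EPISODES   # importance-sampling exponent: 0.4 -> 1.0

for i in (0, NUM_EPISODES // 2, NUM_EPISODES):
    print("episode {}: alpha={:.2f}, beta={:.2f}".format(i, alpha(i), beta(i)))
# episode 0: alpha=0.60, beta=0.40
# episode 150: alpha=0.30, beta=0.70
# episode 300: alpha=0.00, beta=1.00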
def step(self,
         brain_set: BrainSet,
         brain_states: Dict[str, np.ndarray],
         random_actions: bool = False,
         preprocess_brain_actions_for_env_fn: Callable = default_preprocess_brain_actions_for_env_fn
         ) -> Dict[str, dict]:
    """Step the simulation, obtaining the next environment frame.

    :param brain_set: The agent brains
    :param brain_states: Mapping from brain_name to a numpy ndarray of states
    :param random_actions: Whether to obtain random or learned actions
    :param preprocess_brain_actions_for_env_fn: Function for preprocessing brain actions
        prior to passing them to the environment
    :return: Mapping from brain_name to the next environment frame, which includes:
        - states
        - actions
        - next_states
        - rewards
        - dones
    """
    if random_actions:
        brain_actions: Dict[str, List[Action]] = brain_set.get_random_actions(brain_states)
    else:
        brain_actions: Dict[str, List[Action]] = brain_set.get_actions(brain_states)

    actions: Dict[str, np.ndarray] = preprocess_brain_actions_for_env_fn(deepcopy(brain_actions))

    self.env_info = self.env.step(actions)
    next_brain_states = self.get_next_states(brain_set)

    output = {}
    for brain_name in brain_set.names():
        output[brain_name] = {
            'states': brain_states[brain_name],
            'actions': brain_actions[brain_name],
            'next_states': next_brain_states[brain_name],
            'rewards': self.env_info[brain_name].rewards,
            'dones': self.env_info[brain_name].local_done
        }
    return output
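# A sketch of a custom preprocessing hook layered on the project's default, based only on the
# type hints above (the default returns a Dict[str, np.ndarray]). The [-1, 1] clipping range is
# an illustrative choice, not something mandated by the project.
import numpy as np

def clipped_preprocess_fn(brain_actions):
    # Run the default conversion from Action objects to per-brain numpy arrays,
    # then clip continuous actions into [-1, 1] before they reach the environment.
    actions = default_preprocess_brain_actions_for_env_fn(brain_actions)
    return {brain_name: np.clip(a, -1.0, 1.0) for brain_name, a in actions.items()}

# Hypothetical usage with the simulator's step method:
# next_frame = simulator.step(brain_set, brain_states,
#                             preprocess_brain_actions_for_env_fn=clipped_preprocess_fn)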
def get_solution_brain_set():
    params = deepcopy(default_cfg)
    update_params = {
        "MLP_FEATURES_HIDDEN": (512,),
        "OUTPUT_FC_HIDDEN_SIZES": (128,),
        "NUM_STACKED_FRAMES": 1,
        "MLP_FEATURES_DROPOUT": None,
        "OUTPUT_HIDDEN_DROPOUT": None,
        "DUELING": True,
    }
    params.update(update_params)

    policy = get_policy(ACTION_SIZE, params)

    featurizer = MLP(
        tuple([VECTOR_STATE_SHAPE[1]] + list(params['MLP_FEATURES_HIDDEN'])),
        dropout=params['MLP_FEATURES_DROPOUT'],
        activation_function=nn.ReLU(),
        output_function=nn.ReLU(),
        seed=SEED)

    model = DQN(
        VECTOR_STATE_SHAPE,
        ACTION_SIZE,
        featurizer,
        params['MLP_FEATURES_HIDDEN'][-1],
        seed=SEED,
        grayscale=params["GRAYSCALE"],
        num_stacked_frames=params["NUM_STACKED_FRAMES"],
        output_hidden_layer_size=params["OUTPUT_FC_HIDDEN_SIZES"],
        OUTPUT_HIDDEN_DROPOUT=params["OUTPUT_HIDDEN_DROPOUT"],
        dueling_output=params["DUELING"],
        noisy_output=params['NOISY'],
        categorical_output=params['CATEGORICAL'],
    )
    print(model)

    optimizer = torch.optim.Adam(model.parameters(), lr=params['INITIAL_LR'])
    memory = get_memory(VECTOR_STATE_SHAPE, params)
    solution_agent = get_agent(VECTOR_STATE_SHAPE, ACTION_SIZE, model, policy, memory, optimizer, params)

    banana_brain_ = Brain(
        brain_name=BRAIN_NAME,
        action_size=ACTION_SIZE,
        state_shape=VECTOR_STATE_SHAPE,
        observation_type='vector',
        agents=[solution_agent],
    )

    brain_set_ = BrainSet(brains=[banana_brain_])
    return brain_set_, params
def get_solution_brain_set():
    agent = PPOAgent(
        state_size=STATE_SIZE,
        action_size=ACTION_SIZE,
        seed=SEED,
        actor_critic_factory=lambda: PPO_Actor_Critic(
            actor_model=MLP(
                layer_sizes=(STATE_SIZE, 128, 128, ACTION_SIZE),
                seed=SEED,
                output_function=torch.nn.Tanh(),
                with_batchnorm=BATCHNORM,
                output_layer_initialization_fn=lambda l: init_layer_within_range(l),
                hidden_layer_initialization_fn=lambda l: init_layer_inverse_root_fan_in(l),
                activation_function=torch.nn.LeakyReLU(True),
                dropout=DROPOUT),
            critic_model=MLP(
                layer_sizes=(STATE_SIZE, 128, 128, 1),
                seed=SEED,
                output_function=torch.nn.Tanh(),
                with_batchnorm=BATCHNORM,
                output_layer_initialization_fn=lambda l: init_layer_within_range(l),
                hidden_layer_initialization_fn=lambda l: init_layer_inverse_root_fan_in(l),
                activation_function=torch.nn.LeakyReLU(True),
                dropout=DROPOUT),
            action_size=ACTION_SIZE,
            continuous_actions=True,
        ),
        optimizer_factory=lambda params: torch.optim.Adam(
            params, lr=LR, weight_decay=WEIGHT_DECAY, eps=EPSILON),
        batch_size=BATCH_SIZE,
    )

    crawler_brain = Brain(
        brain_name=BRAIN_NAME,
        action_size=ACTION_SIZE,
        state_shape=STATE_SIZE,
        observation_type='vector',
        agents=[agent],
    )

    brain_set = BrainSet(brains=[crawler_brain])
    return brain_set
def evaluate(self,
             brain_set: BrainSet,
             n_episodes: int = 5,
             max_t: int = 1000,
             brain_reward_accumulation_fn: Callable = lambda rewards: np.array(rewards),
             episode_reward_accumulation_fn: Callable = lambda brain_episode_scores: float(
                 np.mean([np.mean(brain_episode_scores[brain_name])
                          for brain_name in brain_episode_scores])),
             end_of_episode_score_display_fn: Callable = lambda i_episode, episode_aggregated_score, training_scores:
                 '\rEpisode {}\tScore: {:.2f}\tAverage Score: {:.2f}'.format(
                     i_episode, episode_aggregated_score, training_scores.get_mean_sliding_scores()),
             sliding_window_size: int = 100,
             end_episode_criteria: Callable = np.all) -> Tuple[BrainSet, float]:
    """Evaluate the agents in the environment.

    :param brain_set: The agent brains to evaluate
    :param n_episodes: The number of episodes to evaluate over
    :param max_t: The maximum number of time steps allowed in each episode
    :param brain_reward_accumulation_fn: Function used to accumulate rewards for each brain
    :param episode_reward_accumulation_fn: Function used to aggregate rewards across brains
    :param end_of_episode_score_display_fn: Function used to print the end-of-episode scalar score
    :param sliding_window_size: Size of the sliding window to average episode scores over
    :param end_episode_criteria: Function acting on a list of booleans (identifying whether
        each agent's episode has terminated) to determine whether the episode is finished
    :return: Tuple of (brain_set, average_score)
    """
    for brain in brain_set.brains():
        for agent in brain.agents:
            agent.set_mode('eval')
            agent.set_warmup(False)

    self.evaluation_scores = Scores(window_size=sliding_window_size)

    for i_episode in range(1, n_episodes + 1):
        self.reset_env(train_mode=False)
        brain_states = self.get_next_states(brain_set)
        brain_episode_scores = {brain_name: None for brain_name, brain in brain_set}

        for t in range(max_t):
            next_brain_environment = self.step(brain_set=brain_set, brain_states=brain_states)
            brain_states = {
                brain_name: next_brain_environment[brain_name]['next_states']
                for brain_name in brain_states
            }

            for brain_name in brain_episode_scores:
                scores = brain_reward_accumulation_fn(next_brain_environment[brain_name]['rewards'])
                if brain_episode_scores[brain_name] is None:
                    brain_episode_scores[brain_name] = scores
                else:
                    brain_episode_scores[brain_name] += scores

            all_dones = []
            for brain_name in brain_set.names():
                all_dones.extend(next_brain_environment[brain_name]['dones'])
            if end_episode_criteria(all_dones):
                break

        episode_aggregated_score = episode_reward_accumulation_fn(brain_episode_scores)
        self.evaluation_scores.add(episode_aggregated_score)
        print(end_of_episode_score_display_fn(i_episode, episode_aggregated_score, self.evaluation_scores), end='\n')

    average_score = self.evaluation_scores.get_mean_sliding_scores()
    return brain_set, average_score
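# A hypothetical call, assuming `simulator` is the object that owns evaluate and `brain_set`
# holds already-trained agents; the episode counts are illustrative.
brain_set, avg_score = simulator.evaluate(brain_set, n_episodes=5, max_t=1000)
print("Average evaluation score: {:.2f}".format(avg_score))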
def train(self,
          brain_set: BrainSet,
          solved_score: Optional[float] = None,
          n_episodes=2000,
          max_t=1000,
          sliding_window_size: int = 100,
          step_agents_fn: Callable = default_step_agents_fn,
          step_episode_agents_fn: Callable = default_step_episode_agents_fn,
          brain_reward_accumulation_fn: Callable = lambda rewards: np.array(rewards),
          episode_reward_accumulation_fn: Callable = lambda brain_episode_scores: float(
              np.mean([np.mean(brain_episode_scores[brain_name])
                       for brain_name in brain_episode_scores])),
          preprocess_brain_actions_for_env_fn: Callable = default_preprocess_brain_actions_for_env_fn,
          end_episode_criteria: Callable = np.all,
          end_of_episode_score_display_fn: Callable = lambda i_episode, episode_aggregated_score, training_scores:
              '\rEpisode {}\tScore: {:.2f}\tAverage Score: {:.2f}'.format(
                  i_episode, episode_aggregated_score, training_scores.get_mean_sliding_scores()),
          aggregate_end_of_episode_score_fn: Callable = lambda training_scores:
              training_scores.get_mean_sliding_scores()
          ) -> Tuple[BrainSet, Scores, int, int]:
    """Train a set of agents (brain-set) in an environment.

    :param brain_set: The agent brains to undergo training
    :param solved_score: The score (averaged over sliding_window_size episodes) required to
        consider the task solved
    :param n_episodes: The number of episodes to train over
    :param max_t: The maximum number of time steps allowed in each episode
    :param sliding_window_size: Size of the sliding window to average episode scores over
    :param step_agents_fn: Function used to update the agents with a new experience sampled
        from the environment
    :param step_episode_agents_fn: Function used to step the agents at the end of each episode
    :param brain_reward_accumulation_fn: Function used to accumulate rewards for each brain
    :param episode_reward_accumulation_fn: Function used to aggregate rewards across brains
    :param preprocess_brain_actions_for_env_fn: Function used to preprocess actions from the
        agents before passing them to the environment
    :param end_episode_criteria: Function acting on a list of booleans (identifying whether
        each agent's episode has terminated) to determine whether the episode is finished
    :param end_of_episode_score_display_fn: Function used to print the end-of-episode scalar score
    :param aggregate_end_of_episode_score_fn: Function used to aggregate the end-of-episode score.
        Defaults to averaging over the past sliding_window_size episode scores
    :return: Tuple of (brain_set, training_scores, i_episode, training_time)
        brain_set (BrainSet): The trained BrainSet
        training_scores (Scores): Scores object containing all historic and sliding-window scores
        i_episode (int): The number of episodes required to solve the task
        training_time (int): The wall-clock training time in seconds
    """
    for brain in brain_set.brains():
        for agent in brain.agents:
            agent.set_mode('train')
            agent.set_warmup(False)

    self.training_scores = Scores(window_size=sliding_window_size)

    t_start = time.time()
    for i_episode in range(1, n_episodes + 1):
        self.reset_env(train_mode=True)
        brain_states = self.get_next_states(brain_set)
        brain_episode_scores = OrderedDict([(brain_name, None) for brain_name, brain in brain_set])

        for t in range(max_t):
            next_brain_environment = self.step(
                brain_set=brain_set,
                brain_states=brain_states,
                preprocess_brain_actions_for_env_fn=preprocess_brain_actions_for_env_fn)
            step_agents_fn(brain_set, next_brain_environment, t)
            brain_states = {
                brain_name: next_brain_environment[brain_name]['next_states']
                for brain_name in brain_states
            }

            for brain_name in brain_episode_scores:
                # Brain rewards are a scalar for each agent, of the form
                # next_brain_environment[brain_name]['rewards'] = [0.0, 0.0]
                brain_rewards = brain_reward_accumulation_fn(next_brain_environment[brain_name]['rewards'])
                if brain_episode_scores[brain_name] is None:
                    brain_episode_scores[brain_name] = brain_rewards
                else:
                    brain_episode_scores[brain_name] += brain_rewards

            all_dones = []
            for brain_name in brain_set.names():
                all_dones.extend(next_brain_environment[brain_name]['dones'])
            if end_episode_criteria(all_dones):
                break

        # Step episode for agents
        step_episode_agents_fn(brain_set, i_episode)

        # Brain episode scores are of the form:
        # {'<brain_name>': <output_of_brain_reward_accumulation_fn>}
        episode_aggregated_score = episode_reward_accumulation_fn(brain_episode_scores)
        self.training_scores.add(episode_aggregated_score)

        end = '\n' if i_episode % 100 == 0 else ""
        print(end_of_episode_score_display_fn(i_episode, episode_aggregated_score, self.training_scores), end=end)

        if solved_score and aggregate_end_of_episode_score_fn(self.training_scores) >= solved_score:
            print("\nTotal Training time = {:.1f} min".format((time.time() - t_start) / 60))
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(
                i_episode, self.training_scores.get_mean_sliding_scores()))
            break

    training_time = round(time.time() - t_start)
    return brain_set, self.training_scores, i_episode, training_time
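# Putting the pieces together: a sketch of the warmup/train/evaluate flow these methods imply.
# `get_simulator`, the solved-score threshold, and the episode counts are assumptions for
# illustration, not project constants; the brain-set factory is one of those defined below.
brain_set = get_solution_brain_set()
simulator = get_simulator()  # assumed factory for the environment wrapper owning these methods

simulator.warmup(brain_set, n_episodes=10, max_t=1000)
brain_set, scores, i_episode, training_time = simulator.train(
    brain_set, solved_score=13.0, n_episodes=2000, max_t=1000)
brain_set, avg_score = simulator.evaluate(brain_set, n_episodes=5)
print("Solved in {} episodes ({} s); evaluation score {:.2f}".format(i_episode, training_time, avg_score))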
        brain_name=GOALIE_BRAIN_NAME,
        action_size=GOALIE_ACTION_SIZE,
        state_shape=GOALIE_STATE_SIZE,
        observation_type='vector',
        agents=goalie_agents,
    )

    striker_brain = Brain(
        brain_name=STRIKER_BRAIN_NAME,
        action_size=STRIKER_ACTION_SIZE,
        state_shape=STRIKER_STATE_SIZE,
        observation_type='vector',
        agents=striker_agents,
    )

    brain_set = BrainSet(brains=[goalie_brain, striker_brain])

    for brain_name, brain in brain_set:
        for agent_num, agent in enumerate(brain.agents):
            agent_id = "{}_{}".format(brain_name, agent_num)
            if brain_name == 'GoalieBrain':
                action_size = GOALIE_ACTION_SIZE
                action_range = GOALIE_ACTION_DISCRETE_RANGE
            elif brain_name == 'StrikerBrain':
                action_size = STRIKER_ACTION_SIZE
                action_range = STRIKER_ACTION_DISCRETE_RANGE
            else:
                raise ValueError("Unrecognized brain name: {}".format(brain_name))
            agent.policy = IndependentMADDPGPolicy(
                brain_set=brain_set,
def banana_tuning(update_params: dict):
    params = deepcopy(default_cfg)
    params.update(update_params)
    try:
        params['OUTPUT_FC_HIDDEN_SIZES'] = ast.literal_eval(params['OUTPUT_FC_HIDDEN_SIZES'])
        params['SUPPORT_RANGE'] = ast.literal_eval(params['SUPPORT_RANGE'])
        params['MLP_FEATURES_HIDDEN'] = ast.literal_eval(params['MLP_FEATURES_HIDDEN'])

        policy = get_policy(ACTION_SIZE, params)

        featurizer = MLP(
            tuple([VECTOR_STATE_SHAPE[1]] + list(params['MLP_FEATURES_HIDDEN'])),
            dropout=params['MLP_FEATURES_DROPOUT'],
            activation_function=nn.ReLU(True),
            output_function=nn.ReLU(True),
            seed=SEED)

        model = DQN(
            VECTOR_STATE_SHAPE,
            ACTION_SIZE,
            featurizer,
            params['MLP_FEATURES_HIDDEN'][-1],
            seed=SEED,
            grayscale=params["GRAYSCALE"],
            num_stacked_frames=params["NUM_STACKED_FRAMES"],
            output_hidden_layer_size=params["OUTPUT_FC_HIDDEN_SIZES"],
            OUTPUT_HIDDEN_DROPOUT=params["OUTPUT_HIDDEN_DROPOUT"],
            dueling_output=params["DUELING"],
            noisy_output=params['NOISY'],
            categorical_output=params['CATEGORICAL'],
        )
        print(model)

        optimizer = torch.optim.Adam(model.parameters(), lr=params['INITIAL_LR'])
        memory = get_memory(VECTOR_STATE_SHAPE, params)
        agent = get_agent(VECTOR_STATE_SHAPE, ACTION_SIZE, model, policy, memory, optimizer, params)

        banana_brain = Brain(
            brain_name=BRAIN_NAME,
            action_size=ACTION_SIZE,
            state_shape=VECTOR_STATE_SHAPE,
            observation_type='vector',
            agents=[agent],
        )

        brain_set = BrainSet(brains=[banana_brain])

        # Run performance evaluation
        performance, info = simulator.get_agent_performance(
            brain_set=brain_set,
            n_train_episodes=params["N_EPISODES"],
            n_eval_episodes=params["N_EVAL_EPISODES"],
            max_t=params["MAX_T"],
        )

        info['input_params'] = params
        write_tuning_data(info, performance)

        global TRIAL_COUNTER
        TRIAL_COUNTER += 1

        print("Performance is: {}".format(performance))
        return performance
    except Exception as e:
        print(e)
        return 0
def visual_banana_tuning(update_params: dict):
    params = deepcopy(default_cfg)
    params.update(update_params)
    try:
        params['SUPPORT_RANGE'] = ast.literal_eval(params['SUPPORT_RANGE'])
        params['OUTPUT_FC_HIDDEN_SIZES'] = ast.literal_eval(params['OUTPUT_FC_HIDDEN_SIZES'])
        params['FILTERS'] = ast.literal_eval(params['FILTERS'])
        params['KERNEL_SIZES'] = [ast.literal_eval(i) for i in ast.literal_eval(params["KERNEL_SIZES"])]
        params['STRIDE_SIZES'] = [ast.literal_eval(i) for i in ast.literal_eval(params["STRIDE_SIZES"])]

        policy = get_policy(ACTION_SIZE, params)
        print(params)

        featurizer = CNN(
            image_shape=IMAGE_SHAPE,
            num_stacked_frames=params["NUM_STACKED_FRAMES"],
            grayscale=params["GRAYSCALE"],
            filters=params["FILTERS"],
            kernel_sizes=params["KERNEL_SIZES"],
            stride_sizes=params["STRIDE_SIZES"],
        )

        model = VisualDQN(
            VISUAL_STATE_SHAPE,
            ACTION_SIZE,
            featurizer,
            featurizer.output_size,
            seed=SEED,
            grayscale=params["GRAYSCALE"],
            num_stacked_frames=params["NUM_STACKED_FRAMES"],
            output_hidden_layer_size=params["OUTPUT_FC_HIDDEN_SIZES"],
            OUTPUT_HIDDEN_DROPOUT=params["OUTPUT_HIDDEN_DROPOUT"],
            dueling_output=params["DUELING"],
            noisy_output=params['NOISY'],
            categorical_output=params['CATEGORICAL'],
        )
        print(model)

        optimizer = torch.optim.Adam(model.parameters(), lr=params['INITIAL_LR'])
        memory = get_memory(VISUAL_STATE_SHAPE, params)
        agent = get_agent(VISUAL_STATE_SHAPE, ACTION_SIZE, model, policy, memory, optimizer, params)

        banana_brain = Brain(
            brain_name=BRAIN_NAME,
            action_size=ACTION_SIZE,
            state_shape=VISUAL_STATE_SHAPE,
            observation_type='visual',
            agents=[agent],
            preprocess_state_fn=get_preprocess_state_fn(params),
        )

        brain_set = BrainSet(brains=[banana_brain])

        # Run performance evaluation
        performance, info = simulator.get_agent_performance(
            brain_set=brain_set,
            n_train_episodes=params["N_EPISODES"],
            n_eval_episodes=params["N_EVAL_EPISODES"],
            max_t=params["MAX_T"],
        )

        info['input_params'] = params

        global TRIAL_COUNTER
        TRIAL_COUNTER += 1

        write_tuning_data(info, performance)
        print(f"Performance is: {performance}")
        return performance
    except Exception as e:
        # Failures can occur due to invalid CNN sizes
        print("FAILURE IN HYPERPARAMETER TUNING::: {}, {}".format(e, sys.exc_info()))
        return 0
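# The except branch above notes that trials can fail due to invalid CNN sizes. A quick way to
# check a kernel/stride combination before launching a trial is the standard convolution
# output-size formula. The 84x84 frame size is an assumption, and the kernel/stride pairs
# mirror the spatial parts of configurations used elsewhere in this project.
def conv_output_dim(size, kernel, stride, padding=0):
    # Standard no-dilation convolution output size: floor((size + 2*padding - kernel) / stride) + 1
    return (size + 2 * padding - kernel) // stride + 1

size = 84  # assumed input frame height/width
for kernel, stride in [(8, 4), (4, 2), (3, 3)]:
    size = conv_output_dim(size, kernel, stride)
    print(size)  # 20, 9, 3 -- a combination is invalid once this drops to zero or below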
def get_solution_brain_set():
    tennis_agents = []
    for i in range(2):
        key = "TennisBrain_{}".format(i)
        agent = MAPPOAgent(
            agent_id=key,
            state_size=STATE_SIZE,
            action_size=ACTION_SIZE,
            map_agent_to_state_slice={
                "TennisBrain_0": lambda t: t[:, 0:24],
                "TennisBrain_1": lambda t: t[:, 24:48]
            },
            map_agent_to_action_slice={
                "TennisBrain_0": lambda t: t[:, 0:2],
                "TennisBrain_1": lambda t: t[:, 2:4]
            },
            actor_critic_factory=lambda: MAPPO_Actor_Critic(
                actor_model=MLP(
                    layer_sizes=(STATE_SIZE, 256, 128, ACTION_SIZE),
                    seed=SEED,
                    # output_function=BoundVectorNorm(),
                    output_function=torch.nn.Tanh(),
                    with_batchnorm=BATCHNORM,
                    activation_function=torch.nn.ReLU(True),
                    hidden_layer_initialization_fn=init_layer_inverse_root_fan_in,
                    output_layer_initialization_fn=get_init_layer_within_rage(
                        limit_range=(-3e-4, 3e-4)),
                    dropout=DROPOUT),
                critic_model=MACritic(
                    state_featurizer=MLP(
                        layer_sizes=(STATE_SIZE * 2 + ACTION_SIZE, 256),
                        with_batchnorm=BATCHNORM,
                        dropout=DROPOUT,
                        seed=SEED,
                        output_function=torch.nn.ReLU(),
                    ),
                    output_module=MLP(
                        layer_sizes=(256 + ACTION_SIZE, 128, 1),
                        with_batchnorm=BATCHNORM,
                        dropout=DROPOUT,
                        seed=SEED,
                        output_layer_initialization_fn=get_init_layer_within_rage(
                            limit_range=(-3e-4, 3e-4)),
                        activation_function=torch.nn.ReLU(True),
                    ),
                ),
                action_size=ACTION_SIZE,
                continuous_actions=True,
            ),
            optimizer_factory=lambda params: torch.optim.AdamW(
                params, lr=LR, weight_decay=WEIGHT_DECAY, eps=EPSILON),
            continuous_action_range_clip=(-1, 1),
            batch_size=256,
            min_batches_for_training=16,
            num_learning_updates=10,
            beta_scheduler=ParameterScheduler(initial=0.01, lambda_fn=lambda i: 0.01, final=0.01),
            std_scale_scheduler=ParameterScheduler(
                initial=0.8, lambda_fn=lambda i: 0.8 * 0.999 ** i, final=0.2),
            seed=SEED)
        tennis_agents.append(agent)

    tennis_brain = Brain(
        brain_name="TennisBrain",
        action_size=ACTION_SIZE,
        state_shape=STATE_SIZE,
        observation_type='vector',
        agents=tennis_agents,
    )

    brain_set = BrainSet(brains=[tennis_brain])
    return brain_set
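# The map_agent_to_state_slice / map_agent_to_action_slice lambdas above carve each agent's
# share out of the concatenated joint state (2 x 24 dims) and joint action (2 x 2 dims).
# A standalone numpy check of those shapes, with an illustrative batch of 3:
import numpy as np

joint_state = np.zeros((3, 48))   # both agents' 24-dim observations stacked
joint_action = np.zeros((3, 4))   # both agents' 2-dim actions stacked

state_slices = {
    "TennisBrain_0": lambda t: t[:, 0:24],
    "TennisBrain_1": lambda t: t[:, 24:48],
}
action_slices = {
    "TennisBrain_0": lambda t: t[:, 0:2],
    "TennisBrain_1": lambda t: t[:, 2:4],
}

print(state_slices["TennisBrain_1"](joint_state).shape)    # (3, 24)
print(action_slices["TennisBrain_0"](joint_action).shape)  # (3, 2)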
def multi_agent_step_episode_agents_fn(brain_set: BrainSet, episode):
    for brain_name in brain_set.names():
        for agent in brain_set[brain_name].agents:
            agent.step_episode(episode)
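# A hypothetical way to plug this hook into training, so each agent's step_episode runs at the
# end of every episode; the `simulator` object and the solved-score threshold are assumptions.
brain_set, scores, i_episode, training_time = simulator.train(
    brain_set,
    solved_score=0.5,  # assumed threshold
    n_episodes=2000,
    max_t=1000,
    step_episode_agents_fn=multi_agent_step_episode_agents_fn,
)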
def get_solution_brain_set():
    # Define the solution hyperparameters
    params = deepcopy(default_cfg)
    update_params = {
        "INITIAL_LR": 5e-4,
        "NUM_STACKED_FRAMES": 4,
        "OUTPUT_HIDDEN_DROPOUT": 0.1,
        "DUELING": True,
        "NOISY": True,
        "BATCH_SIZE": 64,
        "N_FILTERS": (64, 128, 128),
        "EPS_DECAY_FACTOR": 0.995,
        "KERNEL_SIZES": [(1, 8, 8), (1, 4, 4), (4, 3, 3)],
        "STRIDE_SIZES": [(1, 4, 4), (1, 2, 2), (1, 3, 3)],
        "OUTPUT_FC_HIDDEN_SIZES": (1024,),
        "WARMUP_STEPS": 10000,
    }
    params.update(update_params)
    print("Params are: {}".format(json.dumps(params, indent=2)))

    policy = get_policy(ACTION_SIZE, params)

    featurizer = CNN(
        image_shape=VISUAL_STATE_SHAPE[1:],
        num_stacked_frames=params["NUM_STACKED_FRAMES"],
        grayscale=params["GRAYSCALE"],
        nfilters=params["N_FILTERS"],
        kernel_sizes=params["KERNEL_SIZES"],
        stride_sizes=params["STRIDE_SIZES"],
    )

    model = VisualDQN(
        VISUAL_STATE_SHAPE,
        ACTION_SIZE,
        featurizer,
        featurizer.output_size,
        seed=SEED,
        grayscale=params["GRAYSCALE"],
        num_stacked_frames=params["NUM_STACKED_FRAMES"],
        output_hidden_layer_size=params["OUTPUT_FC_HIDDEN_SIZES"],
        OUTPUT_HIDDEN_DROPOUT=params["OUTPUT_HIDDEN_DROPOUT"],
        dueling_output=params["DUELING"],
        noisy_output=params['NOISY'],
        categorical_output=params['CATEGORICAL'],
    )
    print(model)

    optimizer = torch.optim.Adam(model.parameters(), lr=params['INITIAL_LR'])
    memory = get_memory(VISUAL_STATE_SHAPE, params)
    solution_agent = get_agent(VISUAL_STATE_SHAPE, ACTION_SIZE, model, policy, memory, optimizer, params)

    banana_brain_ = Brain(
        brain_name=BRAIN_NAME,
        action_size=ACTION_SIZE,
        state_shape=VISUAL_STATE_SHAPE,
        observation_type='visual',
        agents=[solution_agent],
        preprocess_state_fn=get_preprocess_state_fn(params))

    brain_set_ = BrainSet(brains=[banana_brain_])
    return brain_set_, params
def get_solution_brain_set():
    tennis_agents = []

    state_featurizer = MLP(
        layer_sizes=(STATE_SIZE * 2 + ACTION_SIZE, 400),
        with_batchnorm=BATCHNORM,
        activation_function=torch.nn.ReLU(True),
    )
    output_module = MLP(
        layer_sizes=(400 + ACTION_SIZE, 300, 1),
        with_batchnorm=BATCHNORM,
        activation_function=torch.nn.ReLU(True),
        output_layer_initialization_fn=get_init_layer_within_rage(limit_range=(-3e-4, 3e-4)))

    memory_factory = lambda: PrioritizedMemory(
        capacity=BUFFER_SIZE,
        state_shape=(1, STATE_SIZE),
        alpha_scheduler=ParameterScheduler(
            initial=0.6,
            lambda_fn=lambda i: 0.6 - 0.6 * i / NUM_EPISODES,
            final=0.),
        beta_scheduler=ParameterScheduler(
            initial=0.4,
            final=1,
            lambda_fn=lambda i: 0.4 + 0.6 * i / NUM_EPISODES),  # Anneal beta linearly
        seed=SEED,
        continuous_actions=True,
        min_priority=1e-4)

    if MATD3:
        critic_factory = lambda: MATD3Critic(
            critic_model_factory=lambda: MACritic(
                state_featurizer=state_featurizer,
                output_module=output_module,
                seed=SEED,
            ),
            seed=SEED)
    else:
        critic_factory = lambda: MACritic(
            state_featurizer=state_featurizer,
            output_module=output_module,
        )

    for i in range(2):
        key = "TennisBrain_{}".format(i)
        tennis_agent = MADDPGAgent(
            key,
            None,
            STATE_SIZE,
            ACTION_SIZE,
            critic_factory=critic_factory,
            actor_factory=lambda: MLP(
                layer_sizes=(STATE_SIZE, 400, 300, ACTION_SIZE),
                with_batchnorm=BATCHNORM,
                dropout=DROPOUT,
                output_function=BoundVectorNorm(),
                output_layer_initialization_fn=init_layer_within_range,
                hidden_layer_initialization_fn=init_layer_inverse_root_fan_in,
                seed=SEED),
            critic_optimizer_factory=lambda parameters: optim.Adam(
                parameters, lr=CRITIC_LR, weight_decay=1.e-5),
            actor_optimizer_factory=lambda parameters: optim.Adam(parameters, lr=ACTOR_LR),
            memory_factory=memory_factory,
            seed=0,
            batch_size=BATCH_SIZE,
            homogeneous_agents=False,
        )
        tennis_agents.append(tennis_agent)

    tennis_brain = Brain(
        brain_name=BRAIN_NAME,
        action_size=ACTION_SIZE,
        state_shape=STATE_SIZE,
        observation_type='vector',
        agents=tennis_agents,
    )

    brain_set = BrainSet(brains=[tennis_brain])

    # Update the policy with the independent MADDPG policy.
    # This is done so that each agent will receive the other agents'
    # states/actions during training to guide actor learning.
    for i, agent in enumerate(tennis_agents):
        agent_id = "TennisBrain_{}".format(i)
        agent.policy = IndependentMADDPGPolicy(
            brain_set=brain_set,
            agent_id=agent_id,
            action_dim=ACTION_SIZE,
            epsilon_scheduler=ParameterScheduler(
                initial=1,
                lambda_fn=lambda i: 0.99 ** i,
                final=0.01),
            random_brain_action_factory=lambda: RandomBrainAction(
                ACTION_SIZE,
                1,
                continuous_actions=True,
                continuous_action_range=(-1, 1),
            ),
            map_agent_to_state_slice={
                "TennisBrain_0": lambda t: t[:, 0:24],
                "TennisBrain_1": lambda t: t[:, 24:48]
            },
            map_agent_to_action_slice={
                "TennisBrain_0": lambda t: t[:, 0:2],
                "TennisBrain_1": lambda t: t[:, 2:4]
            },
            matd3=MATD3,
            gaussian_noise_factory=lambda: GaussianNoise(),
            continuous_actions=True,
            continuous_actions_clip_range=(-1, 1))

    return brain_set
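# The exploration schedule above decays epsilon as 0.99**i with a floor of 0.01. A quick
# arithmetic check of how long that decay takes (plain Python, independent of ParameterScheduler):
import math

floor_episode = math.ceil(math.log(0.01) / math.log(0.99))
print(floor_episode)          # 459: first episode at which 0.99**i reaches the 0.01 floor

print(round(0.99 ** 200, 3))  # 0.134: still fairly exploratory at episode 200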
def get_solution_brain_set():
    params = {
        'striker_actor_layer_size': (STRIKER_STATE_SIZE, 256, 256, len(range(*STRIKER_ACTION_DISCRETE_RANGE))),
        'goalie_actor_layer_size': (GOALIE_STATE_SIZE, 256, 256, len(range(*GOALIE_ACTION_DISCRETE_RANGE))),
        'striker_critic_state_featurizer_layer_size': (336 * 4 + 3, 256),
        'striker_critic_output_layer_size': (256 + 1, 256, 1),
        'goalie_critic_state_featurizer_layer_size': (336 * 4 + 3, 256),
        'goalie_critic_output_layer_size': (256 + 1, 256, 1),
        'batchnorm': True,
        'actor_dropout': 0.1,
        'critic_dropout': 0.2,
        'lr': 5e-3,
        'weight_decay': 1e-4,
        'eps': 1e-6,
        'num_ppo_epochs': 4,
        'minimum_training_batches': 32,
        'batch_size': 1024
    }

    goalie_agents = []
    for agent_num in range(NUM_GOALIE_AGENTS):
        key = 'GoalieBrain_{}'.format(agent_num)
        if agent_num == 1:
            goalie_agent = DummyMADDPGAgent(
                GOALIE_STATE_SIZE,
                len(range(*GOALIE_ACTION_DISCRETE_RANGE)),
                seed=SEED,
                map_agent_to_state_slice={
                    "GoalieBrain_0": lambda t: t[:, 0:336],
                    "GoalieBrain_1": lambda t: t[:, 336:672],
                    "StrikerBrain_0": lambda t: t[:, 672:1008],
                    "StrikerBrain_1": lambda t: t[:, 1008:]
                },
                map_agent_to_action_slice={
                    "GoalieBrain_0": lambda t: t[:, 0:1],
                    "GoalieBrain_1": lambda t: t[:, 1:2],
                    "StrikerBrain_0": lambda t: t[:, 2:3],
                    "StrikerBrain_1": lambda t: t[:, 3:4]
                },
            )
        else:
            goalie_agent = MAPPOAgent(
                agent_id=key,
                state_size=GOALIE_STATE_SIZE,
                action_size=len(range(*GOALIE_ACTION_DISCRETE_RANGE)),
                seed=SEED,
                map_agent_to_state_slice={
                    "GoalieBrain_0": lambda t: t[:, 0:336],
                    "GoalieBrain_1": lambda t: t[:, 336:672],
                    "StrikerBrain_0": lambda t: t[:, 672:1008],
                    "StrikerBrain_1": lambda t: t[:, 1008:]
                },
                map_agent_to_action_slice={
                    "GoalieBrain_0": lambda t: t[:, 0:1],
                    "GoalieBrain_1": lambda t: t[:, 1:2],
                    "StrikerBrain_0": lambda t: t[:, 2:3],
                    "StrikerBrain_1": lambda t: t[:, 3:4]
                },
                actor_critic_factory=lambda: MAPPO_Actor_Critic(
                    actor_model=MLP(
                        layer_sizes=params['goalie_actor_layer_size'],
                        seed=SEED,
                        output_function=torch.nn.Softmax(),
                        with_batchnorm=params['batchnorm'],
                        activation_function=torch.nn.LeakyReLU(True),
                        dropout=params['actor_dropout']),
                    critic_model=MACritic(
                        state_featurizer=MLP(
                            layer_sizes=params['goalie_critic_state_featurizer_layer_size'],
                            with_batchnorm=params['batchnorm'],
                            dropout=params['critic_dropout'],
                            seed=SEED),
                        output_module=MLP(
                            layer_sizes=params['goalie_critic_output_layer_size'],
                            with_batchnorm=params['batchnorm'],
                            dropout=params['critic_dropout'],
                            seed=SEED,
                        ),
                    ),
                    action_size=GOALIE_ACTION_SIZE,
                    continuous_actions=False,
                    seed=SEED),
                min_batches_for_training=params['minimum_training_batches'],
                num_learning_updates=params['num_ppo_epochs'],
                optimizer_factory=lambda model_params: torch.optim.AdamW(
                    model_params,
                    lr=params['lr'],
                    weight_decay=params['weight_decay'],
                    eps=params['eps']),
                continuous_actions=False,
                batch_size=params['batch_size'],
                beta_scheduler=ParameterScheduler(initial=0.01, lambda_fn=lambda i: 0.01, final=0.01),
                std_scale_scheduler=ParameterScheduler(
                    initial=0.8, lambda_fn=lambda i: 0.8 * 0.999 ** i, final=0.2),
            )
        print("Goalie is: {}".format(goalie_agent.online_actor_critic))
        goalie_agents.append(goalie_agent)

    striker_agents = []
    for agent_num in range(NUM_STRIKER_AGENTS):
        key = 'StrikerBrain_{}'.format(agent_num)
        if agent_num == 1:
            striker_agent = DummyMADDPGAgent(
                STRIKER_STATE_SIZE,
                len(range(*STRIKER_ACTION_DISCRETE_RANGE)),
                SEED,
                map_agent_to_state_slice={
                    "GoalieBrain_0": lambda t: t[:, 0:336],
                    "GoalieBrain_1": lambda t: t[:, 336:672],
                    "StrikerBrain_0": lambda t: t[:, 672:1008],
                    "StrikerBrain_1": lambda t: t[:, 1008:]
                },
                map_agent_to_action_slice={
                    "GoalieBrain_0": lambda t: t[:, 0:1],
                    "GoalieBrain_1": lambda t: t[:, 1:2],
                    "StrikerBrain_0": lambda t: t[:, 2:3],
                    "StrikerBrain_1": lambda t: t[:, 3:4]
                },
            )
        else:
            striker_agent = MAPPOAgent(
                agent_id=key,
                state_size=STRIKER_STATE_SIZE,
                action_size=len(range(*STRIKER_ACTION_DISCRETE_RANGE)),
                seed=SEED,
                map_agent_to_state_slice={
                    "GoalieBrain_0": lambda t: t[:, 0:336],
                    "GoalieBrain_1": lambda t: t[:, 336:672],
                    "StrikerBrain_0": lambda t: t[:, 672:1008],
                    "StrikerBrain_1": lambda t: t[:, 1008:]
                },
                map_agent_to_action_slice={
                    "GoalieBrain_0": lambda t: t[:, 0:1],
                    "GoalieBrain_1": lambda t: t[:, 1:2],
                    "StrikerBrain_0": lambda t: t[:, 2:3],
                    "StrikerBrain_1": lambda t: t[:, 3:4]
                },
                actor_critic_factory=lambda: MAPPO_Actor_Critic(
                    actor_model=MLP(
                        layer_sizes=params['striker_actor_layer_size'],
                        seed=SEED,
                        output_function=torch.nn.Softmax(),
                        with_batchnorm=params['batchnorm'],
                        activation_function=torch.nn.LeakyReLU(True),
                        dropout=params['actor_dropout']),
                    critic_model=MACritic(
                        state_featurizer=MLP(
                            layer_sizes=params['striker_critic_state_featurizer_layer_size'],
                            with_batchnorm=params['batchnorm'],
                            dropout=params['critic_dropout'],
                            seed=SEED,
                        ),
                        output_module=MLP(
                            layer_sizes=params['striker_critic_output_layer_size'],
                            with_batchnorm=params['batchnorm'],
                            dropout=params['critic_dropout'],
                            seed=SEED,
                        ),
                    ),
                    action_size=STRIKER_ACTION_SIZE,
                    continuous_actions=False,
                    seed=SEED),
                optimizer_factory=lambda model_params: torch.optim.AdamW(
                    model_params,
                    lr=params['lr'],
                    weight_decay=params['weight_decay'],
                    eps=params['eps']),
                min_batches_for_training=params['minimum_training_batches'],
                num_learning_updates=params['num_ppo_epochs'],
                continuous_actions=False,
                batch_size=params['batch_size'],
                beta_scheduler=ParameterScheduler(initial=0.01, lambda_fn=lambda i: 0.01, final=0.01),
                std_scale_scheduler=ParameterScheduler(
                    initial=0.8, lambda_fn=lambda i: 0.8 * 0.999 ** i, final=0.2),
            )
        print("Striker is: {}".format(striker_agent.online_actor_critic))
        striker_agents.append(striker_agent)

    goalie_brain = Brain(
        brain_name=GOALIE_BRAIN_NAME,
        action_size=GOALIE_ACTION_SIZE,
        state_shape=GOALIE_STATE_SIZE,
        observation_type='vector',
        agents=goalie_agents,
    )

    striker_brain = Brain(
        brain_name=STRIKER_BRAIN_NAME,
        action_size=STRIKER_ACTION_SIZE,
        state_shape=STRIKER_STATE_SIZE,
        observation_type='vector',
        agents=striker_agents,
    )

    brain_set = BrainSet(brains=[goalie_brain, striker_brain])
    return brain_set