def test_identity_discrete(agent_number, prioritized_replay):
    """
    Check that MAD3PG agents can learn an identity transformation
    (i.e. return the observation as the action).

    :param agent_number: (int) number of agents in the identity environment
    :param prioritized_replay: (bool) whether prioritized replay is used
    """
    env = IdentityEnv(5, agent_number)
    # A slightly larger learning rate is used with prioritized replay.
    learning_rate = 0.02 if prioritized_replay else 0.01
    agents = []
    for idx in range(agent_number):
        agents.append(
            MAD3PGAgent(env.observation_space, env.action_space, idx,
                        batch_size=32, buff_size=10000, lr=learning_rate,
                        num_layer=2, num_units=32, gamma=0.9, tau=0.01,
                        prioritized_replay=prioritized_replay,
                        max_step=50000, _run=FakeRun(),
                        min_val=-25, max_val=0))
    combination = AgentEnvCombination(agents, env)
    target_reward = -1.0 * agent_number
    episode_rewards = combination.train(15000, 10, target_reward)  # 10000, 100)
    # The mean reward over the last 10 episodes must beat the target.
    assert np.mean(episode_rewards[-10:]) > target_reward
def get_agents(_run, env, num_adversaries, good_policy, adv_policy, lr,
               batch_size, buff_size, num_units, num_layers, gamma, tau,
               priori_replay, alpha, num_episodes, max_episode_len, beta,
               policy_update_rate, critic_action_noise_stddev, entropy_coeff,
               num_atoms, min_val, max_val) -> List[AbstractAgent]:
    """
    This function generates the agents for the environment.

    The parameters are meant to be filled by sacred, and are therefore
    documented in the configuration function train_config.

    Adversary agents (indices 0..num_adversaries-1) are built with
    ``adv_policy``; the remaining agents are built with ``good_policy``.

    :returns List[AbstractAgent] returns a list of instantiated agents
    :raises RuntimeError: if a selected policy name is not one of
        'maddpg', 'matd3', 'mad3pg', 'masac'
    """
    max_step = num_episodes * max_episode_len

    def _build(policy, agent_idx):
        # Instantiate a single agent of the given policy type at agent_idx.
        if policy == 'maddpg':
            return MADDPGAgent(env.observation_space, env.action_space,
                               agent_idx, batch_size, buff_size, lr,
                               num_layers, num_units, gamma, tau,
                               priori_replay, alpha=alpha, max_step=max_step,
                               initial_beta=beta, _run=_run)
        if policy == 'matd3':
            return MATD3Agent(
                env.observation_space, env.action_space, agent_idx,
                batch_size, buff_size, lr, num_layers, num_units, gamma, tau,
                priori_replay, alpha=alpha, max_step=max_step,
                initial_beta=beta, policy_update_freq=policy_update_rate,
                target_policy_smoothing_eps=critic_action_noise_stddev,
                _run=_run)
        if policy == 'mad3pg':
            return MAD3PGAgent(env.observation_space, env.action_space,
                               agent_idx, batch_size, buff_size, lr,
                               num_layers, num_units, gamma, tau,
                               priori_replay, alpha=alpha, max_step=max_step,
                               initial_beta=beta, num_atoms=num_atoms,
                               min_val=min_val, max_val=max_val, _run=_run)
        if policy == 'masac':
            return MASACAgent(env.observation_space, env.action_space,
                              agent_idx, batch_size, buff_size, lr,
                              num_layers, num_units, gamma, tau,
                              priori_replay, alpha=alpha, max_step=max_step,
                              initial_beta=beta, entropy_coeff=entropy_coeff,
                              policy_update_freq=policy_update_rate,
                              _run=_run)
        raise RuntimeError('Invalid Class')

    agents = []
    # Adversaries occupy the first ``num_adversaries`` indices.
    # BUGFIX: the original's adversary loop tested ``good_policy == 'masac'``
    # in its last branch, so adv_policy='masac' only worked if good_policy
    # happened to be 'masac' too; adversaries now use adv_policy throughout.
    for agent_idx in range(num_adversaries):
        agents.append(_build(adv_policy, agent_idx))
    # BUGFIX: both good-agent loops originally tested ``adv_policy ==
    # 'mad3pg'`` for the mad3pg branch; good agents now consistently use
    # good_policy for every algorithm.
    # NOTE(review): index num_adversaries + env.n_good_agents - 1 is covered
    # by neither good-agent range (preserved from the original) — confirm
    # whether skipping that agent index is intentional.
    for agent_idx in range(num_adversaries,
                           num_adversaries + env.n_good_agents - 1):
        agents.append(_build(good_policy, agent_idx))
    for agent_idx in range(num_adversaries + env.n_good_agents, env.n):
        agents.append(_build(good_policy, agent_idx))
    print('Using good policy {} and adv policy {}'.format(
        good_policy, adv_policy))
    return agents
def create_agent(alg_name):
    """
    Instantiate a single agent of the requested algorithm type.

    NOTE(review): this function reads env, agent_idx, batch_size, lr, etc.
    from the enclosing scope (presumably sacred-captured configuration) —
    confirm against the caller.

    :param alg_name: (str) one of 'maddpg', 'matd3', 'mad3pg', 'masac'
    :returns: the instantiated agent
    :raises RuntimeError: if ``alg_name`` is not a known algorithm
    """
    # Lazy builders: the constructor (and its free variables) is only
    # evaluated for the algorithm that was actually requested.
    builders = {
        'maddpg': lambda: MADDPGAgent(
            env.observation_space, env.action_space, agent_idx, batch_size,
            buff_size, lr, num_layers, num_units, gamma, tau, priori_replay,
            alpha=alpha, max_step=num_episodes * max_episode_len,
            initial_beta=beta, _run=_run),
        'matd3': lambda: MATD3Agent(
            env.observation_space, env.action_space, agent_idx, batch_size,
            buff_size, lr, num_layers, num_units, gamma, tau, priori_replay,
            alpha=alpha, max_step=num_episodes * max_episode_len,
            initial_beta=beta, policy_update_freq=policy_update_rate,
            target_policy_smoothing_eps=critic_action_noise_stddev,
            _run=_run),
        'mad3pg': lambda: MAD3PGAgent(
            env.observation_space, env.action_space, agent_idx, batch_size,
            buff_size, lr, num_layers, num_units, gamma, tau, priori_replay,
            alpha=alpha, max_step=num_episodes * max_episode_len,
            initial_beta=beta, num_atoms=num_atoms, min_val=min_val,
            max_val=max_val, _run=_run),
        'masac': lambda: MASACAgent(
            env.observation_space, env.action_space, agent_idx, batch_size,
            buff_size, lr, num_layers, num_units, gamma, tau, priori_replay,
            alpha=alpha, max_step=num_episodes * max_episode_len,
            initial_beta=beta, entropy_coeff=entropy_coeff,
            policy_update_freq=policy_update_rate, _run=_run),
    }
    if alg_name not in builders:
        raise RuntimeError('Invalid Class')
    return builders[alg_name]()
def test_save_load():
    """
    Tests saving and loading for two agents.
    """
    fp = '/tmp/unittestmaddpg'
    env = IdentityEnv(5, 2)

    def make_agents():
        # Build two identically configured MAD3PG agents.
        return [
            MAD3PGAgent(env.observation_space, env.action_space, idx,
                        batch_size=32, buff_size=10000, lr=0.01, num_layer=2,
                        num_units=32, gamma=0.9, tau=0.01,
                        prioritized_replay=True, max_step=5000)
            for idx in range(2)
        ]

    agents = make_agents()
    for idx, agent in enumerate(agents):
        agent.save(fp + str(idx))

    # Fresh agents with the same architecture, then restore from disk.
    load_agents = make_agents()
    for idx, agent in enumerate(load_agents):
        agent.load(fp + str(idx))

    def weights_equal(list_1, list_2):
        # True iff every corresponding pair of weight arrays matches.
        return all((el1 == el2).all() for el1, el2 in zip(list_1, list_2))

    # Loaded weights must match the saved ones for every network component.
    for component in ('critic', 'critic_target', 'policy', 'policy_target'):
        for idx in range(2):
            assert weights_equal(
                getattr(load_agents[idx], component).model.get_weights(),
                getattr(agents[idx], component).model.get_weights())
def create_agent(alg_name, index: int, env: MultiAgentEnv, exp):
    """
    Instantiate a single agent of the requested algorithm type, configured
    from an experiment's config.

    :param alg_name: (str) one of 'maddpg', 'matd3', 'mad3pg', 'masac'
    :param index: (int) index of the agent in the environment
    :param env: environment the agent acts in
    :param exp: experiment whose ``config`` carries the hyperparameters; the
        experiment itself is forwarded to the agent as ``_run``
    :returns: the instantiated agent
    :raises RuntimeError: if ``alg_name`` is not a known algorithm
    """
    conf = exp.config
    if alg_name not in ('maddpg', 'matd3', 'mad3pg', 'masac'):
        raise RuntimeError(f'Invalid Class - {alg_name} is unknown')
    # Arguments shared by every agent constructor.
    shared_args = (env.observation_space, env.action_space, index,
                   conf.batch_size, conf.buff_size, conf.lr, conf.num_layers,
                   conf.num_units, conf.gamma, conf.tau, conf.priori_replay)
    shared_kwargs = dict(alpha=conf.alpha,
                         max_step=conf.num_episodes * conf.max_episode_len,
                         initial_beta=conf.beta, _run=exp)
    if alg_name == 'maddpg':
        return MADDPGAgent(*shared_args, **shared_kwargs)
    if alg_name == 'matd3':
        return MATD3Agent(
            *shared_args,
            policy_update_freq=conf.policy_update_rate,
            target_policy_smoothing_eps=conf.critic_action_noise_stddev,
            **shared_kwargs)
    if alg_name == 'mad3pg':
        return MAD3PGAgent(*shared_args, num_atoms=conf.num_atoms,
                           min_val=conf.min_val, max_val=conf.max_val,
                           **shared_kwargs)
    # Only 'masac' remains after the validity check above.
    return MASACAgent(*shared_args, entropy_coeff=conf.entropy_coeff,
                      policy_update_freq=conf.policy_update_rate,
                      **shared_kwargs)