def make_env(scenario_name, benchmark=False):
    '''
    Creates a MultiAgentEnv object as env. This can be used similar to a gym
    environment by calling env.reset() and env.step().
    Use env.render() to view the environment on the screen.

    Input:
        scenario_name : name of the scenario from ./scenarios/ to be loaded
                        (without the .py extension)
        benchmark     : whether you want to produce benchmarking data
                        (usually only done during evaluation)

    Some useful env properties (see environment.py):
        .observation_space : Returns the observation space for each agent
        .action_space      : Returns the action space for each agent
        .n                 : Returns the number of agents
    '''
    from multiagent.environment import MultiAgentEnv
    import multiagent.scenarios as scenarios

    # load scenario from script
    scenario = scenarios.load(scenario_name + ".py").Scenario()
    # create world
    world = scenario.make_world()
    # disable communication channels
    world.dim_c = 0
    # create multiagent environment
    if benchmark:
        env = MultiAgentEnv(world, scenario.reset_world, scenario.reward,
                            scenario.observation, scenario.benchmark_data)
    else:
        env = MultiAgentEnv(world, scenario.reset_world, scenario.reward,
                            scenario.observation)
    env.discrete_action_space = False
    env.discrete_action_input = False
    scenario.reset_world(world)
    return env, scenario, world
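# A minimal usage sketch for make_env (not part of the original code). It
# assumes a 'simple_spread' scenario exists under ./scenarios/ and exercises
# the env API described in the docstring above: reset(), step(), render(),
# .n and .action_space. The helper name demo_random_rollout is illustrative.
def demo_random_rollout(scenario_name='simple_spread', steps=25):
    env, scenario, world = make_env(scenario_name)
    obs_n = env.reset()
    for _ in range(steps):
        # sample one random continuous action per agent from its action space
        act_n = [space.sample() for space in env.action_space]
        obs_n, reward_n, done_n, info_n = env.step(act_n)
        env.render()
    return obs_n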
# Assumes the module-level imports of this script provide argparse,
# numpy (as np), MultiAgentEnv, Scenario and RandomPolicy.
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--no_render", action='store_true')
    config = parser.parse_args()

    seed = np.random.randint(1e9)
    np.random.seed(seed)

    scenario = Scenario()
    # Create world
    world = scenario.make_world(obs_range=1.0)
    # Create multiagent environment
    env = MultiAgentEnv(world, scenario.reset_world, scenario.reward,
                        scenario.observation)
    env.discrete_action_space = False
    # env.render()

    # Create policies (one random policy per agent)
    policies = [RandomPolicy(env) for i in range(env.n)]

    obs_n = env.reset()
    it = 0
    while True:
        # Get each agent's action
        act_n = []
        for i, policy in enumerate(policies):
            act_n.append(policy.action(obs_n[i]))
            print(f'Agent {i}: \nobs:{obs_n[i]}\naction:{act_n[i]}')
        # Environment step (the original snippet ends here; the following
        # lines are an assumed continuation: step with the joint action and
        # render unless --no_render was passed)
        obs_n, reward_n, done_n, info_n = env.step(act_n)
        it += 1
        if not config.no_render:
            env.render()
def run(cnt):
    # Training loop: simple_spread with local observations, a single shared
    # actor-critic (Trainer) and a step-based replay buffer (MemoryBuffer).
    # load scenario from script
    scenario_name = 'simple_spread'
    scenario = scenarios.load(scenario_name + ".py").Scenario()
    # change to local observation
    scenario.observation = observation
    # create world
    world = scenario.make_world()
    # create multiagent environment
    env = MultiAgentEnv(world, scenario.reset_world, scenario.reward,
                        scenario.observation)
    print('observation shape: ', env.observation_space)
    print('action shape: ', env.action_space)
    env.discrete_action_input = True
    env.discrete_action_space = False

    actor = ActorNetwork(input_dim=10, out_dim=5)
    critic = CriticNetwork(input_dim=10 + 5, out_dim=1)
    memory = MemoryBuffer(size=1000000)
    agent = Trainer(actor, critic, memory)

    # def run():
    episode_rewards = [0.0]  # sum of rewards for all agents
    agent_rewards = [[0.0] for _ in range(env.n)]  # individual agent reward
    final_ep_rewards = []  # sum of rewards for training curve
    final_ep_ag_rewards = []  # agent rewards for training curve
    terminal_reward = []
    # history = []
    # history_rewards = []
    # episode_rewards = []  # sum of rewards for all agents
    episode_loss = []

    obs = env.reset()
    episode_step = 0
    train_step = 0
    nb_episode = 0

    verbose_step = False
    verbose_episode = True
    t_start = time.time()

    print('Starting iterations...')
    while True:
        # get action
        obs = agent.process_obs(obs)
        actions = agent.get_exploration_action(obs)
        actions = agent.process_action(actions)

        # environment step
        new_obs, rewards, done, info = env.step(actions)
        rewards = agent.process_reward(rewards)
        rewards = rewards.mean()
        episode_step += 1
        done = all(done)
        terminal = (episode_step >= arglist.max_episode_len)
        terminal = agent.process_done(done or terminal)

        # collect experience: obs, actions, rewards, new_obs, done
        actions = agent.to_onehot(actions)
        agent.memory.add(obs, actions, rewards, agent.process_obs(new_obs), terminal)
        obs = new_obs

        # episode_rewards.append(rewards)
        rewards = rewards.item()
        for i, rew in enumerate([rewards] * env.n):
            episode_rewards[-1] += rew
            agent_rewards[i][-1] += rew

        # for displaying learned policies
        if arglist.display:
            if terminal:
                time.sleep(0.1)
            env.render()
            # continue

        if terminal:
            obs = env.reset()
            episode_step = 0
            nb_episode += 1
            episode_rewards.append(0)
            terminal_reward.append(np.mean(rewards))

        # increment global step counter
        train_step += 1

        # update all trainers, if not in display or benchmark mode
        loss = [np.nan, np.nan]
        if (train_step > arglist.warmup_steps) and (train_step % 100 == 0):
            loss = agent.optimize()
            loss = [loss[0].data.item(), loss[1].data.item()]
            episode_loss.append(loss)

        if verbose_step:
            if loss == [np.nan, np.nan]:
                loss = ['--', '--']
            print('step: {}, actor_loss: {}, critic_loss: {}'.format(
                train_step, loss[0], loss[1]))
        elif verbose_episode:
            if terminal and (len(episode_rewards) % arglist.save_rate == 0):
                print("steps: {}, episodes: {}, mean episode reward: {}, reward: {}, time: {}".format(
                    train_step, len(episode_rewards),
                    round(np.mean(episode_rewards[-arglist.save_rate:]), 3),
                    round(np.mean(terminal_reward), 3),
                    round(time.time() - t_start, 3)))
                terminal_reward = []
                t_start = time.time()
                # Keep track of final episode reward
                final_ep_rewards.append(np.mean(episode_rewards[-arglist.save_rate:]))
                for rew in agent_rewards:
                    final_ep_ag_rewards.append(np.mean(rew[-arglist.save_rate:]))

        # saves final episode reward for plotting training curve later
        if nb_episode > arglist.num_episodes:
            np.save('experiments/iter_{}_episode_rewards.npy'.format(cnt), episode_rewards)
            # rew_file_name = 'experiments/' + arglist.exp_name + '{}_rewards.pkl'.format(cnt)
            # with open(rew_file_name, 'wb') as fp:
            #     pickle.dump(final_ep_rewards, fp)
            # agrew_file_name = 'experiments/' + arglist.exp_name + '{}_agrewards.pkl'.format(cnt)
            # with open(agrew_file_name, 'wb') as fp:
            #     pickle.dump(final_ep_ag_rewards, fp)
            print('...Finished total of {} episodes.'.format(len(episode_rewards)))
            break
def run(cnt):
    # Training loop: simple_spread with local observations, an actor-critic
    # that carries hidden/cell state (recurrent actor) and an episodic replay
    # buffer (EpisodicMemory); progress is logged to results/train_log.txt.
    # load scenario from script
    scenario_name = 'simple_spread'
    scenario = scenarios.load(scenario_name + ".py").Scenario()
    # change to local observation
    scenario.observation = observation
    # create world
    world = scenario.make_world()
    # create multiagent environment
    env = MultiAgentEnv(world, scenario.reset_world, scenario.reward,
                        scenario.observation)
    print('observation shape: ', env.observation_space)
    print('action shape: ', env.action_space)
    env.discrete_action_input = True
    env.discrete_action_space = False

    actor = ActorNetwork(nb_agents=env.n, input_dim=10, out_dim=5)
    critic = CriticNetwork(nb_agents=env.n, input_dim=10 + 5, out_dim=1)
    memory = EpisodicMemory(limit=1000000)
    agent = Trainer(actor, critic, memory)

    # initialize history
    episode_rewards = [0.0]  # sum of rewards for all agents
    agent_rewards = [[0.0] for _ in range(env.n)]  # individual agent reward
    final_ep_rewards = []  # sum of rewards for training curve
    final_ep_ag_rewards = []  # agent rewards for training curve
    terminal_reward = []
    episode_loss = []

    obs = env.reset()
    episode_step = 0
    train_step = 0
    nb_episode = 0

    verbose_step = False
    verbose_episode = True
    t_start = time.time()

    log = open('results/train_log.txt', 'w')
    log.write('train start... \n')
    log.close()

    print('Starting iterations...')
    while True:
        # get action
        obs = agent.process_obs(obs)
        actions = agent.get_exploration_action(obs)
        actions = agent.process_action(actions)

        # environment step
        new_obs, rewards, done, info = env.step(actions)
        rewards = agent.process_reward(rewards)
        rewards = rewards.mean()
        episode_step += 1
        done = all(done)
        terminal = (episode_step >= arglist.max_episode_len)
        terminal = agent.process_done(done or terminal)

        # collect experience: obs, actions, rewards, done
        actions = agent.to_onehot(actions)
        agent.memory.append(obs, actions, rewards, terminal, training=True)
        # next observation
        obs = deepcopy(new_obs)

        # episode_rewards.append(rewards)
        rewards = rewards.item()
        for i, rew in enumerate([rewards] * env.n):
            episode_rewards[-1] += rew
            agent_rewards[i][-1] += rew

        # for displaying learned policies
        if arglist.display:
            if terminal:
                time.sleep(0.1)
            env.render()
            # continue

        # for save & print history
        terminal_verbose = terminal
        if terminal:
            terminal_reward.append(np.mean(rewards))

            # save terminal state
            # process observation
            obs = agent.process_obs(obs)
            # get action & process action
            actions = agent.get_exploration_action(obs)
            actions = agent.process_action(actions)
            actions = agent.to_onehot(actions)
            # process rewards
            rewards = agent.process_reward(0.)
            rewards = rewards.mean().item()
            # process terminal
            terminal = agent.process_done(False)
            agent.memory.append(obs, actions, rewards, terminal, training=True)

            # reset environment
            obs = env.reset()
            episode_step = 0
            nb_episode += 1
            episode_rewards.append(0)
            # initialize hidden/cell states
            agent.actor.hState = None

        # increment global step counter
        train_step += 1

        # update all trainers, if not in display or benchmark mode
        loss = [np.nan, np.nan]
        if (train_step > arglist.warmup_steps) and (train_step % 600 == 0):
            # store hidden/cell state
            hState = agent.actor.hState
            # reset hidden/cell state
            agent.actor.hState = None
            # optimize actor-critic
            loss = agent.optimize()
            # recover hidden/cell state
            agent.actor.hState = hState
            loss = np.array([x.data.item() for x in loss])
            episode_loss.append(loss)

        if verbose_step:
            if loss == [np.nan, np.nan]:
                loss = ['--', '--']
            print('step: {}, actor_loss: {}, critic_loss: {}'.format(
                train_step, loss[0], loss[1]))
        elif verbose_episode:
            if terminal_verbose and (len(episode_rewards) % arglist.save_rate == 0):
                monitor_loss = np.mean(np.array(episode_loss)[-1000:], axis=0)
                msg1 = "steps: {}, episodes: {}, mean episode reward: {}, reward: {}, time: {}".format(
                    train_step, len(episode_rewards),
                    round(np.mean(episode_rewards[-arglist.save_rate:]), 3),
                    round(np.mean(terminal_reward), 3),
                    round(time.time() - t_start, 3))
                msg2 = "TD error: {}, c_model: {}, actorQ: {}, a_model: {}".format(
                    round(monitor_loss[2], 3), round(monitor_loss[3], 3),
                    round(monitor_loss[4], 3), round(monitor_loss[5], 3))
                msg = msg1 + ', ' + msg2
                print(msg)
                # save log
                log = open('results/train_log.txt', 'a')
                log.write(msg + '\n')
                log.close()

                terminal_reward = []
                t_start = time.time()
                # Keep track of final episode reward
                final_ep_rewards.append(np.mean(episode_rewards[-arglist.save_rate:]))
                for rew in agent_rewards:
                    final_ep_ag_rewards.append(np.mean(rew[-arglist.save_rate:]))

        # saves final episode reward for plotting training curve later
        if nb_episode > arglist.num_episodes:
            np.save('results/iter_{}_episode_rewards.npy'.format(cnt), episode_rewards)
            # rew_file_name = 'experiments/' + arglist.exp_name + '{}_rewards.pkl'.format(cnt)
            # with open(rew_file_name, 'wb') as fp:
            #     pickle.dump(final_ep_rewards, fp)
            # agrew_file_name = 'experiments/' + arglist.exp_name + '{}_agrewards.pkl'.format(cnt)
            # with open(agrew_file_name, 'wb') as fp:
            #     pickle.dump(final_ep_ag_rewards, fp)
            print('...Finished total of {} episodes.'.format(len(episode_rewards)))
            break