Example 1
import argparse

import numpy as np

# Scenario, MultiAgentEnv and RandomPolicy used below are assumed to come
# from the project's multiagent package; their imports are not shown in
# this excerpt.


def make_env(scenario_name, benchmark=False):
    '''
    Creates a MultiAgentEnv object as env. This can be used similar to a gym
    environment by calling env.reset() and env.step().
    Use env.render() to view the environment on the screen.

    Input:
        scenario_name   :   name of the scenario from ./scenarios/ to be loaded
                            (without the .py extension)
        benchmark       :   whether you want to produce benchmarking data
                            (usually only done during evaluation)

    Some useful env properties (see environment.py):
        .observation_space  :   Returns the observation space for each agent
        .action_space       :   Returns the action space for each agent
        .n                  :   Returns the number of agents

    A minimal usage sketch follows this example.
    '''
    from multiagent.environment import MultiAgentEnv
    import multiagent.scenarios as scenarios

    # load scenario from script
    scenario = scenarios.load(scenario_name + ".py").Scenario()
    # create world
    world = scenario.make_world()
    # disable the communication channel
    world.dim_c = 0
    # create multiagent environment
    if benchmark:
        env = MultiAgentEnv(world, scenario.reset_world, scenario.reward, scenario.observation, scenario.benchmark_data)
    else:
        env = MultiAgentEnv(world, scenario.reset_world, scenario.reward, scenario.observation)
    # use continuous action vectors rather than discrete action indices
    env.discrete_action_space = False
    env.discrete_action_input = False
    scenario.reset_world(world)
    return env, scenario, world
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--no_render", action='store_true')
    config = parser.parse_args()

    seed = np.random.randint(1e9)
    np.random.seed(seed)

    scenario = Scenario()
    # Create world
    world = scenario.make_world(obs_range=1.0)

    # Create multiagent environment
    env = MultiAgentEnv(world, scenario.reset_world, scenario.reward,
                        scenario.observation)
    env.discrete_action_space = False
    #env.render()

    # Create policies
    policies = [RandomPolicy(env) for i in range(env.n)]

    obs_n = env.reset()
    it = 0
    while True:
        # Get each agent's action
        act_n = []
        for i, policy in enumerate(policies):
            act_n.append(policy.action(obs_n[i]))
            print(f'Agent {i}: \nobs:{obs_n[i]}\naction:{act_n[i]}')

        # Environment step
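
The docstring of make_env() above summarizes the MultiAgentEnv interface: env.reset(), env.step(), env.render(), and the .observation_space, .action_space and .n properties. Below is a minimal usage sketch, not part of the original example, assuming the multiagent package is installed and a "simple_spread" scenario exists under ./scenarios/:

env, scenario, world = make_env("simple_spread")
print("number of agents :", env.n)
print("observation space:", env.observation_space)
print("action space     :", env.action_space)

obs_n = env.reset()  # list with one observation per agent
# env.step(act_n) takes a list with one action per agent and returns
# (obs_n, reward_n, done_n, info_n), each again a per-agent list, as the
# training loops in the next example show.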
Example 3
import time
from copy import deepcopy

import numpy as np

# scenarios, MultiAgentEnv, the actor/critic/memory classes, Trainer, the
# local observation function and arglist are assumed to be provided by the
# surrounding project; their imports are not part of this excerpt.


def run(cnt):
    # load scenario from script
    scenario_name = 'simple_spread'
    scenario = scenarios.load(scenario_name + ".py").Scenario()
    # change to local observation
    scenario.observation = observation

    # create world
    world = scenario.make_world()

    # create multiagent environment
    env = MultiAgentEnv(world, scenario.reset_world, scenario.reward,
                        scenario.observation)
    print('observation space: ', env.observation_space)
    print('action space: ', env.action_space)
    # actions are passed to env.step() as integer indices
    env.discrete_action_input = True
    env.discrete_action_space = False

    actor = ActorNetwork(input_dim=10, out_dim=5)
    critic = CriticNetwork(input_dim=10 + 5, out_dim=1)
    memory = MemoryBuffer(size=1000000)
    agent = Trainer(actor, critic, memory)

    # def run():
    episode_rewards = [0.0]  # sum of rewards for all agents
    agent_rewards = [[0.0] for _ in range(env.n)]  # individual agent reward
    final_ep_rewards = []  # sum of rewards for training curve
    final_ep_ag_rewards = []  # agent rewards for training curve
    terminal_reward = []

    # history = []
    # history_rewards = []
    # episode_rewards = []  # sum of rewards for all agents
    episode_loss = []
    obs = env.reset()
    episode_step = 0
    train_step = 0
    nb_episode = 0

    verbose_step = False
    verbose_episode = True
    t_start = time.time()

    print('Starting iterations...')
    while True:
        # get action
        obs = agent.process_obs(obs)
        actions = agent.get_exploration_action(obs)
        actions = agent.process_action(actions)

        # environment step
        new_obs, rewards, done, info = env.step(actions)
        rewards = agent.process_reward(rewards)
        rewards = rewards.mean()
        episode_step += 1
        done = all(done)
        terminal = (episode_step >= arglist.max_episode_len)
        terminal = agent.process_done(done or terminal)
        # collect experience
        # obs, actions, rewards, new_obs, done
        actions = agent.to_onehot(actions)
        agent.memory.add(obs, actions, rewards, agent.process_obs(new_obs),
                         terminal)
        obs = new_obs
        # episode_rewards.append(rewards)
        rewards = rewards.item()
        for i, rew in enumerate([rewards] * env.n):
            episode_rewards[-1] += rew
            agent_rewards[i][-1] += rew

        # for displaying learned policies
        if arglist.display:
            if terminal:
                time.sleep(0.1)
                env.render()
            # continue

        if terminal:
            obs = env.reset()
            episode_step = 0
            nb_episode += 1
            episode_rewards.append(0)
            terminal_reward.append(np.mean(rewards))

        # increment global step counter
        train_step += 1

        # update all trainers, if not in display or benchmark mode
        loss = [np.nan, np.nan]
        if (train_step > arglist.warmup_steps) and (train_step % 100 == 0):
            loss = agent.optimize()
            loss = [loss[0].data.item(), loss[1].data.item()]

        episode_loss.append(loss)

        if verbose_step:
            if np.any(np.isnan(loss)):  # a direct comparison with [np.nan, np.nan] is always False
                loss = ['--', '--']
            print('step: {}, actor_loss: {}, critic_loss: {}'.format(
                train_step, loss[0], loss[1]))

        elif verbose_episode:
            if terminal and (len(episode_rewards) % arglist.save_rate == 0):
                print(
                    "steps: {}, episodes: {}, mean episode reward: {}, reward: {}, time: {}"
                    .format(
                        train_step, len(episode_rewards),
                        round(np.mean(episode_rewards[-arglist.save_rate:]),
                              3), round(np.mean(terminal_reward), 3),
                        round(time.time() - t_start, 3)))
                terminal_reward = []
                t_start = time.time()
                # Keep track of final episode reward
                final_ep_rewards.append(
                    np.mean(episode_rewards[-arglist.save_rate:]))
                for rew in agent_rewards:
                    final_ep_ag_rewards.append(
                        np.mean(rew[-arglist.save_rate:]))

        # saves final episode reward for plotting training curve later
        if nb_episode > arglist.num_episodes:
            np.save('experiments/iter_{}_episode_rewards.npy'.format(cnt),
                    episode_rewards)

            # rew_file_name = 'experiments/' + arglist.exp_name + '{}_rewards.pkl'.format(cnt)
            # with open(rew_file_name, 'wb') as fp:
            #     pickle.dump(final_ep_rewards, fp)
            # agrew_file_name = 'experiments/' + arglist.exp_name + '{}_agrewards.pkl'.format(cnt)
            # with open(agrew_file_name, 'wb') as fp:
            #     pickle.dump(final_ep_ag_rewards, fp)
            print('...Finished total of {} episodes.'.format(
                len(episode_rewards)))

            break
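
Both run() variants read hyperparameters from a module-level arglist that the excerpt never defines (max_episode_len, warmup_steps, save_rate, num_episodes, display, exp_name). A minimal sketch of how such a namespace could be built with argparse; the attribute names come from the code above, while the defaults are placeholders only:

import argparse

def parse_args():
    # attribute names mirror the arglist accesses in run(); defaults are placeholders
    parser = argparse.ArgumentParser("training options for run()")
    parser.add_argument("--max-episode-len", type=int, default=25)
    parser.add_argument("--num-episodes", type=int, default=10000)
    parser.add_argument("--warmup-steps", type=int, default=10000)
    parser.add_argument("--save-rate", type=int, default=100)
    parser.add_argument("--exp-name", type=str, default="simple_spread")
    parser.add_argument("--display", action="store_true")
    return parser.parse_args()

arglist = parse_args()
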
def run(cnt):
    # load scenario from script
    scenario_name = 'simple_spread'
    scenario = scenarios.load(scenario_name + ".py").Scenario()
    # change to local observation
    scenario.observation = observation

    # create world
    world = scenario.make_world()

    # create multiagent environment
    env = MultiAgentEnv(world, scenario.reset_world, scenario.reward, scenario.observation)
    print('observation space: ', env.observation_space)
    print('action space: ', env.action_space)
    # actions are passed to env.step() as integer indices
    env.discrete_action_input = True
    env.discrete_action_space = False

    actor = ActorNetwork(nb_agents=env.n, input_dim=10, out_dim=5)
    critic = CriticNetwork(nb_agents=env.n, input_dim=10 + 5, out_dim=1)
    memory = EpisodicMemory(limit=1000000)
    agent = Trainer(actor, critic, memory)

    # initialize history
    episode_rewards = [0.0]  # sum of rewards for all agents
    agent_rewards = [[0.0] for _ in range(env.n)]  # individual agent reward
    final_ep_rewards = []  # sum of rewards for training curve
    final_ep_ag_rewards = []  # agent rewards for training curve
    terminal_reward = []
    episode_loss = []
    obs = env.reset()
    episode_step = 0
    train_step = 0
    nb_episode = 0

    verbose_step = False
    verbose_episode = True
    t_start = time.time()

    log = open('results/train_log.txt', 'w')
    log.write('train start... \n')
    log.close()

    print('Starting iterations...')
    while True:
        # get action
        obs = agent.process_obs(obs)
        actions = agent.get_exploration_action(obs)
        actions = agent.process_action(actions)

        # environment step
        new_obs, rewards, done, info = env.step(actions)
        rewards = agent.process_reward(rewards)
        rewards = rewards.mean()
        episode_step += 1
        done = all(done)
        terminal = (episode_step >= arglist.max_episode_len)
        terminal = agent.process_done(done or terminal)
        # collect experience
        # obs, actions, rewards, done
        actions = agent.to_onehot(actions)
        agent.memory.append(obs, actions, rewards, terminal, training=True)

        # next observation
        obs = deepcopy(new_obs)

        # episode_rewards.append(rewards)
        rewards = rewards.item()
        for i, rew in enumerate([rewards] * env.n):
            episode_rewards[-1] += rew
            agent_rewards[i][-1] += rew

        # for displaying learned policies
        if arglist.display:
            if terminal:
                time.sleep(0.1)
                env.render()
            # continue

        # for save & print history
        terminal_verbose = terminal
        if terminal:
            terminal_reward.append(np.mean(rewards))

            # save terminal state
            # process observation
            obs = agent.process_obs(obs)
            # get action & process action
            actions = agent.get_exploration_action(obs)
            actions = agent.process_action(actions)
            actions = agent.to_onehot(actions)
            # process rewards
            rewards = agent.process_reward(0.)
            rewards = rewards.mean().item()
            # process terminal
            terminal = agent.process_done(False)
            agent.memory.append(obs, actions, rewards, terminal, training=True)

            # reset environment
            obs = env.reset()
            episode_step = 0
            nb_episode += 1
            episode_rewards.append(0)

            # initialize hidden/cell states
            agent.actor.hState = None

        # increment global step counter
        train_step += 1

        # update all trainers, if not in display or benchmark mode
        loss = [np.nan, np.nan]
        if (train_step > arglist.warmup_steps) and (train_step % 600 == 0):
            # store hidden/cell state
            hState = agent.actor.hState
            # reset hidden/cell state
            agent.actor.hState = None
            # optimize actor-critic
            loss = agent.optimize()
            # recover hidden/cell state
            agent.actor.hState = hState
            loss = np.array([x.data.item() for x in loss])
            episode_loss.append(loss)

        if verbose_step:
            if np.any(np.isnan(loss)):  # a direct comparison with [np.nan, np.nan] is always False
                loss = ['--', '--']
            print('step: {}, actor_loss: {}, critic_loss: {}'.format(train_step, loss[0], loss[1]))

        elif verbose_episode:
            if terminal_verbose and (len(episode_rewards) % arglist.save_rate == 0):
                monitor_loss = np.mean(np.array(episode_loss)[-1000:], axis=0)

                msg1 = "steps: {}, episodes: {}, mean episode reward: {}, reward: {}, time: {}".format(
                    train_step, len(episode_rewards), round(np.mean(episode_rewards[-arglist.save_rate:]), 3),
                    round(np.mean(terminal_reward), 3), round(time.time() - t_start, 3))

                msg2 = "TD error: {}, c_model: {}, actorQ: {}, a_model: {}".format(
                    round(monitor_loss[2], 3),
                    round(monitor_loss[3], 3),
                    round(monitor_loss[4], 3),
                    round(monitor_loss[5], 3))
                msg = msg1 + ', ' + msg2
                print(msg)

                # save log
                log = open('results/train_log.txt', 'a')
                log.write(msg + '\n')
                log.close()

                terminal_reward = []
                t_start = time.time()
                # Keep track of final episode reward
                final_ep_rewards.append(np.mean(episode_rewards[-arglist.save_rate:]))
                for rew in agent_rewards:
                    final_ep_ag_rewards.append(np.mean(rew[-arglist.save_rate:]))

        # saves final episode reward for plotting training curve later
        if nb_episode > arglist.num_episodes:
            np.save('results/iter_{}_episode_rewards.npy'.format(cnt), episode_rewards)
            # rew_file_name = 'experiments/' + arglist.exp_name + '{}_rewards.pkl'.format(cnt)
            # with open(rew_file_name, 'wb') as fp:
            #     pickle.dump(final_ep_rewards, fp)
            # agrew_file_name = 'experiments/' + arglist.exp_name + '{}_agrewards.pkl'.format(cnt)
            # with open(agrew_file_name, 'wb') as fp:
            #     pickle.dump(final_ep_ag_rewards, fp)
            print('...Finished total of {} episodes.'.format(len(episode_rewards)))
            break
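
The cnt argument only tags the saved *_episode_rewards.npy file, so independent training iterations can be launched from a small driver. A sketch, assuming run() is defined as above and the output directory already exists:

if __name__ == "__main__":
    for cnt in range(5):  # number of repetitions is arbitrary
        run(cnt)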