Example #1
def run_agents(n_episodes=5):
    env = UnityEnvironment(file_name="envs/Tennis.app")
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    env_info = env.reset(train_mode=True)[brain_name]

    state_size = env_info.vector_observations.shape[1]
    action_size = brain.vector_action_space_size
    num_agents = env_info.vector_observations.shape[0]
    maddpg = MADDPG(state_size=state_size, action_size=action_size, num_agents=num_agents)

    for i, agent in enumerate(maddpg.agents):
        agent.actor_local.load_state_dict(torch.load(f'models/checkpoint_actor_local_{i}.pth'))
        agent.critic_local.load_state_dict(torch.load(f'models/checkpoint_critic_local_{i}.pth'))

    for i_episode in range(1, n_episodes + 1):
        env_info = env.reset(train_mode=True)[brain_name]
        states = env_info.vector_observations
        scores = np.zeros(num_agents)
        while True:
            actions = maddpg.act(states, add_noise=True)
            env_info = env.step(actions)[brain_name]
            states = env_info.vector_observations
            rewards = env_info.rewards
            dones = env_info.local_done
            scores += rewards
            if any(dones):
                break
        print(f"Epsiode {i_episode}. Rewards of two agents: {scores}")
def test(env, model_file='best.pt', num_ep=100):

    rewards_total = []
    dict_list = torch.load(model_file)

    maddpg = MADDPG()
    maddpg.maddpg_agent[0].actor.load_state_dict(torch.load('actor0.pt'))
    maddpg.maddpg_agent[1].actor.load_state_dict(torch.load('actor1.pt'))
    maddpg.maddpg_agent[0].critic.load_state_dict(torch.load('critic0.pt'))
    maddpg.maddpg_agent[1].critic.load_state_dict(torch.load('critic1.pt'))

    for i in range(1, num_ep + 1):  # play game for 100 episodes
        env_info = env.reset(
            train_mode=True)[brain_name]  # reset the environment
        states = env_info.vector_observations  # get the current state (for each agent)
        scores = np.zeros(2)  # initialize the score (for each agent)
        while True:
            actions = maddpg.act(states)  # select actions
            env_info = env.step(actions)[
                brain_name]  # send all actions to the environment
            next_states = env_info.vector_observations  # get next state (for each agent)
            rewards = env_info.rewards  # get reward (for each agent)
            dones = env_info.local_done  # see if episode finished
            scores += rewards  # update the score (for each agent)
            states = next_states  # roll over states to next time step
            if np.any(dones):  # exit loop if episode finished
                break
        rewards_total.append(np.max(scores))
        print('Scores from episode {}: {}'.format(i, scores))
    print('Average Score over {} episodes: {}'.format(num_ep,
                                                      np.mean(rewards_total)))
Example #3
def main(args):
    set_seed(args.seed)
    device = torch.device(
        'cuda') if torch.cuda.is_available() else torch.device('cpu')

    # initialize environment
    n_players = 3
    env = football_env.create_environment(
        env_name="academy_3_vs_1_with_keeper",
        representation="simple115",
        number_of_left_players_agent_controls=n_players,
        stacked=False,
        logdir="/tmp/football",
        write_goal_dumps=False,
        write_full_episode_dumps=False,
        render=False)

    # state and action space
    state_space_size = env.observation_space.shape[
        1]  # we are using simple115 representation
    action_space_size = env.action_space.nvec.tolist()[0]  # the three players share the same action space
    # state[98:100] indicates the three controlled players

    # model
    print("loading models")
    actors = [
        Actor(state_space_size=state_space_size,
              action_space_size=action_space_size) for _ in range(n_players)
    ]
    critics = [
        Critic(state_space_size=state_space_size,
               action_space_size=action_space_size,
               n_players=n_players) for _ in range(n_players)
    ]
    old_actors = [
        Actor(state_space_size=state_space_size,
              action_space_size=action_space_size) for _ in range(n_players)
    ]
    old_critics = [
        Critic(state_space_size=state_space_size,
               action_space_size=action_space_size,
               n_players=n_players) for _ in range(n_players)
    ]
    for old_actor, actor in zip(old_actors, actors):
        old_actor.load_state_dict(actor.state_dict())
    for old_critic, critic in zip(old_critics, critics):
        old_critic.load_state_dict(critic.state_dict())

    # maddpg
    maddpg = MADDPG(env=env,
                    action_list=list(range(action_space_size)),
                    actors=actors,
                    critics=critics,
                    old_actors=old_actors,
                    old_critics=old_critics,
                    args=args,
                    device=device)
    print("learn")
    maddpg.learn()
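For reference: the simple115 representation in Google Research Football encodes each controlled player's view of the pitch as a flat 115-value float vector, which is why state_space_size is read from env.observation_space.shape[1]; the MultiDiscrete action space repeats one identical discrete action set per controlled player, so only nvec.tolist()[0] is needed.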
Example #4
def play():
    env = UnityEnvironment(file_name='./Tennis.app')

    # get the default brain
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]

    # reset the environment
    env_info = env.reset(train_mode=False)[brain_name]

    # number of agents
    num_agents = len(env_info.agents)
    print('Number of agents:', num_agents)

    # size of each action
    action_size = brain.vector_action_space_size
    print('Size of each action:', action_size)

    # examine the state space
    states = env_info.vector_observations
    state_size = states.shape[1]
    print('There are {} agents. Each observes a state with length: {}'.format(
        states.shape[0], state_size))

    # create agent
    maddpg_agent = MADDPG(state_size=state_size,
                          action_size=action_size,
                          seed=0)

    # load weights
    for i, agent in enumerate(maddpg_agent.maddpg_agent):
        agent.policy_local.load_state_dict(
            torch.load('models/checkpoint_actor_{}.pth'.format(i)))

    # reverse weights so agent 1 is on the left instead
    # for i, agent in enumerate(reversed(maddpg_agent.maddpg_agent)):
    #     agent.policy_local.load_state_dict(torch.load('models/checkpoint_actor_{}.pth'.format(i)))

    env_info = env.reset(train_mode=False)[brain_name]  # reset the environment
    states = env_info.vector_observations  # get the current state (for each agent)
    scores = np.zeros(num_agents)  # initialize the score (for each agent)
    while True:
        actions = maddpg_agent.act(
            states, add_noise=False)  # select an action (for each agent)
        env_info = env.step(actions)[
            brain_name]  # send all actions to the environment
        next_states = env_info.vector_observations  # get next state (for each agent)
        rewards = env_info.rewards  # get reward (for each agent)
        dones = env_info.local_done  # see if episode finished
        scores += rewards  # update the score (for each agent)
        states = next_states  # roll over states to next time step
        if np.any(dones):  # exit loop if episode finished
            break

    print('Agent 0 score this episode: {}'.format(scores[0]))
    print('Agent 1 score this episode: {}'.format(scores[1]))

    env.close()
Example #5
def train_agents(n_episodes=10000, t_max=1000):
    env = UnityEnvironment(file_name="envs/Tennis.app")
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    env_info = env.reset(train_mode=True)[brain_name]

    seeding(seed=42)
    state_size = env_info.vector_observations.shape[1]
    action_size = brain.vector_action_space_size
    num_agents = env_info.vector_observations.shape[0]
    maddpg = MADDPG(state_size=state_size,
                    action_size=action_size,
                    num_agents=num_agents)

    scores_deque = deque(maxlen=100)
    scores_list = []
    for i_episode in range(1, n_episodes + 1):
        env_info = env.reset(train_mode=True)[brain_name]
        states = env_info.vector_observations
        scores = np.zeros(num_agents)
        for _ in range(t_max):
            actions = maddpg.act(states)
            env_info = env.step(actions)[brain_name]
            next_states = env_info.vector_observations
            rewards = env_info.rewards
            dones = env_info.local_done
            scores += rewards
            maddpg.step(states, actions, rewards, next_states, dones)
            states = next_states
            if np.any(dones):
                break

        scores_deque.append(np.max(scores))
        scores_list.append(np.max(scores))

        print(f'\rEpisode {i_episode}\tAverage Score: {np.mean(scores_deque)}',
              end="")
        if i_episode % PRINT_EVERY == 0:
            print(
                f'\rEpisode {i_episode}\tAverage Score: {np.mean(scores_deque) : .3f}'
            )

        if np.mean(scores_deque) >= 2.0 and len(scores_deque) >= 100:
            for i, agent in enumerate(maddpg.agents):
                torch.save(agent.actor_local.state_dict(),
                           f'models/checkpoint_actor_local_{i}.pth')
                torch.save(agent.critic_local.state_dict(),
                           f'models/checkpoint_critic_local_{i}.pth')
            print(
                f'\nSaved Model: Episode {i_episode}\tAverage Score: {np.mean(scores_deque) : .3f}'
            )
            break

    return scores_list
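The MADDPG wrapper class that the training loop above relies on is not part of this listing. As a rough orientation, a minimal sketch of the interface it assumes (an agent list, a delegating act(), and a step() that stores the joint transition and triggers learning) could look like the following; the class name MADDPGSketch and the agent/buffer interfaces are assumptions for illustration, not the original implementation:

import numpy as np


class MADDPGSketch:
    """Interface sketch only: one DDPG-style agent per player plus a shared replay buffer,
    both passed in so the sketch stays self-contained."""

    def __init__(self, agents, buffer, batch_size=256):
        self.agents = agents          # objects exposing .act(state, add_noise) and .learn(batch)
        self.buffer = buffer          # object exposing .push(transition), .sample(n) and __len__
        self.batch_size = batch_size

    def act(self, states, add_noise=True):
        # one action row per agent, each computed from that agent's own observation
        return np.vstack([a.act(s, add_noise) for a, s in zip(self.agents, states)])

    def step(self, states, actions, rewards, next_states, dones):
        # store the joint transition once, then let every agent learn from a shared sample
        self.buffer.push((states, actions, rewards, next_states, dones))
        if len(self.buffer) > self.batch_size:
            for agent in self.agents:
                agent.learn(self.buffer.sample(self.batch_size))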
Example #6
def test_ddpg(env, episodes=10):
    # reset
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    env_info = env.reset(train_mode=False)[brain_name]

    # action and state size
    action_size = brain.vector_action_space_size
    states = env_info.vector_observations
    state_size = states.shape[1]
    print('State size:', state_size)
    print('Action size: ', action_size)

    num_agents = len(env_info.agents)
    print('Number of agents:', num_agents)

    # load MADDPG agent
    maddpg = MADDPG(state_size, action_size, random_seed=0)
    for agent in maddpg.ddpg_agents:
        agent.actor_local.load_state_dict(
            torch.load('actor_agent_' + str(agent.id) + '.pth'))
        agent.critic_local.load_state_dict(
            torch.load('critic_agent_' + str(agent.id) + '.pth'))

    scores = []
    for n in range(episodes):
        # prepare for the current episode
        env_info = env.reset(train_mode=True)[brain_name]
        states = env_info.vector_observations
        score = 0
        dones = [False] * num_agents
        while not np.any(dones):
            actions = maddpg.act(states, add_noise=False)

            env_info = env.step(actions)[brain_name]
            next_states = env_info.vector_observations  # get next state (for each agent)
            rewards = env_info.rewards  # get reward (for each agent)
            dones = env_info.local_done  # see if episode finished
            states = next_states

            score += np.max(rewards)

        scores.append(score)

    print('Average score over {} episodes: {:.4f}'.format(
        episodes, np.mean(scores)))
    return scores
def maddpg(n_episodes=5000):

    # PARAMETERS:
    noise = 2
    batch_size = 256
    update_every = 1
    agent = MADDPG(discount_factor=0.99, tau=0.02, batch_size=batch_size)
    buff = ReplayBuffer(10000)

    for i_episode in range(n_episodes):
        env_info = env.reset(train_mode=True)[brain_name]
        state = env_info.vector_observations
        state = torch.from_numpy(np.array(state)).float().unsqueeze(0)
        score = np.zeros(num_agents)
        t = 0

        while True:
            actions = agent.act(state, noise)
            noise *= 0.9999
            actions_array = torch.stack(actions).detach().numpy()
            env_info = env.step(actions_array)[brain_name]
            next_state = env_info.vector_observations
            next_state = torch.from_numpy(np.array(next_state)).float().unsqueeze(0)
            reward = np.array(env_info.rewards).reshape(1, -1)
            dones = np.array(env_info.local_done).reshape(1, -1)
            actions_array = actions_array.reshape(1, -1)
            buff.push((state, actions_array, reward, next_state, dones))

            if len(buff) > batch_size and t % update_every == 0:
                for i in range(2):
                    samples = buff.sample(batch_size)
                    agent.update(samples, i, noise)
                agent.update_targets()

            t += 1
            score += reward[0]
            state = next_state
            if np.any(dones):
                break

        scores_window.append(np.max(score))
        scores.append(np.max(score))
        print('\rEpisode {}\tAverage Score: {:.3f}'.format(i_episode, np.mean(scores_window)), end="")

        if i_episode % 100 == 0:
            for i in range(2):
                torch.save(agent.maddpg_agent[i].actor.state_dict(), 'bin/checkpoint_actor{}.pth'.format(i))
                torch.save(agent.maddpg_agent[i].critic.state_dict(), 'bin/checkpoint_critic{}.pth'.format(i))

        if np.mean(scores_window) >= 0.5:
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.3f}'.format(i_episode - 100, np.mean(scores_window)))
            for i in range(2):
                torch.save(agent.maddpg_agent[i].actor.state_dict(), 'bin/actor{}_finished.pth'.format(i))
                torch.save(agent.maddpg_agent[i].critic.state_dict(), 'bin/critic{}_finished.pth'.format(i))
            break
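The ReplayBuffer used above (push / sample / len) is also not shown in this listing. A minimal stand-in with the same interface, assuming a plain FIFO buffer of whole transitions, would be:

import random
from collections import deque


class ReplayBuffer:
    """Minimal stand-in with the push/sample/len interface called above (a sketch, not the original class)."""

    def __init__(self, size):
        self.memory = deque(maxlen=int(size))

    def push(self, transition):
        self.memory.append(transition)

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)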
Example #8
env_info = env.reset(train_mode=True)[brain_name]
num_agents = len(env_info.agents)
action_size = brain.vector_action_space_size
states = env_info.vector_observations
state_size = states.shape[1]

import torch
import pickle

from maddpg import MADDPG

import logging
from collections import deque
import matplotlib.pyplot as plt
import time, os

maddpg = MADDPG(24, 2, 2, 1976)
scores_max_hist = []
scores_mean_hist = []

logger = logging.getLogger(__name__)

f_handle = logging.FileHandler("Log_File.txt")
f_format = logging.Formatter('%(levelname)s: %(asctime)s %(message)s')
f_handle.setFormatter(f_format)
f_handle.setLevel(logging.INFO)

logger.addHandler(f_handle)


def maddpg_train(n_episodes=2500):
Example #9
    print('Number of agents:', num_agents)

    # size of each action
    action_size = brain.vector_action_space_size
    print('Size of each action:', action_size)

    # examine the state space
    states = env_info.vector_observations
    state_size = states.shape[1]
    print('There are {} agents. Each observes a state with length: {}'.format(states.shape[0], state_size))
    print('The state for the first agent looks like:', states[0])

    full_action_size = num_agents * action_size
    full_state_size = num_agents * state_size

    maddpg = MADDPG(num_agents, state_size, action_size, buffer_size=0)
    maddpg.load(agent_id=1)

    for i_episode in range(10):
        env_info = env.reset(train_mode=False)[brain_name]
        states = env_info.vector_observations  # refresh the current state (for each agent)

        i_step = 0
        scores = np.zeros(num_agents)  # initialize the score (for each agent)
        while True:
            actions = maddpg.act(states)

            env_info = env.step(actions)[brain_name]  # send all actions to the environment
            rewards = env_info.rewards  # get reward (for each agent)
            next_states = env_info.vector_observations  # get next state (for each agent)
            dones = env_info.local_done  # see if episode finished
            scores += rewards  # update the score (for each agent)
Example #10
from unityagents import UnityEnvironment

from maddpg import MADDPG
from ddpg import ReplayBuffer
import numpy as np
import torch
import matplotlib.pyplot as plt
from collections import deque

env = UnityEnvironment(file_name="Tennis.app")

brain_name = env.brain_names[0]
brain = env.brains[brain_name]

agent = MADDPG(discount_factor=0.99, tau=0.02, batch_size=256)
agent.maddpg_agent[0].actor.load_state_dict(
    torch.load('bin/actor0_finished.pth',
               map_location=lambda storage, loc: storage))
agent.maddpg_agent[1].actor.load_state_dict(
    torch.load('bin/actor1_finished.pth',
               map_location=lambda storage, loc: storage))

env_info = env.reset(train_mode=False)[brain_name]
state = env_info.vector_observations
state = torch.from_numpy(np.array(state)).float().unsqueeze(0)
score = np.zeros(2)

while True:
    actions = agent.act(state, 0)
    actions_array = torch.stack(actions).detach().numpy()
    env_info = env.step(actions_array)[brain_name]
Example #11
from multiagent.environment import MultiAgentEnv
import multiagent.scenarios as scenarios
import torch
import numpy as np

from agent import DDPGAgent
from maddpg import MADDPG
from utils import MultiAgentReplayBuffer


def make_env(scenario_name, benchmark=False):
    # load scenario from script
    scenario = scenarios.load(scenario_name + ".py").Scenario()
    # create world
    world = scenario.make_world()
    # create multiagent environment
    if benchmark:
        env = MultiAgentEnv(world, scenario.reset_world, scenario.reward,
                            scenario.observation, scenario.benchmark_data)
    else:
        env = MultiAgentEnv(world, scenario.reset_world, scenario.reward,
                            scenario.observation)
    return env


env = make_env(scenario_name="simple_spread")

ma_controller = MADDPG(env, 1000000)
ma_controller.run(500, 300, 32)
Example #12
def main():
    seeding()
    # number of parallel agents
    parallel_envs = 4
    # number of training episodes.
    # change this to higher number to experiment. say 30000.
    number_of_episodes = 10000
    episode_length = 100
    batchsize = 1000
    # how many episodes to save policy and gif
    save_interval = 5000
    # global step counter (advances as the parallel environments step forward)
    t = 0

    # amplitude of OU noise
    # this slowly decreases to 0
    noise = 2
    noise_reduction = 0.9999

    # how many episodes before update
    episode_per_update = 2 * parallel_envs

    log_path = os.getcwd() + "/log"
    model_dir = os.getcwd() + "/model_dir"

    os.makedirs(model_dir, exist_ok=True)

    torch.set_num_threads(parallel_envs)
    # presumably a vectorised collection of parallel_envs environments
    env = envs.make_parallel_env(parallel_envs)

    # keep 5000 episodes worth of replay
    buffer = ReplayBuffer(int(5000 * episode_length))

    # initialize policy and critic
    # this creates a list of models, each element in the list refers to an agent in the simulation
    # [agent_one_ddpg, agent_two_ddpg, ...]
    # agent_one_ddpg contains the agent actor and critic models,e.g., agent_one_ddpg.actor, agent_one_ddpg.critic
    maddpg = MADDPG()
    logger = SummaryWriter(log_dir=log_path)
    agent0_reward = []
    agent1_reward = []
    agent2_reward = []

    # training loop
    # show progressbar
    import progressbar as pb
    widget = [
        'episode: ',
        pb.Counter(), '/',
        str(number_of_episodes), ' ',
        pb.Percentage(), ' ',
        pb.ETA(), ' ',
        pb.Bar(marker=pb.RotatingMarker()), ' '
    ]

    timer = pb.ProgressBar(widgets=widget, maxval=number_of_episodes).start()

    # use keep_awake to keep workspace from disconnecting
    # for episode in keep_awake(range(0, number_of_episodes, parallel_envs)):
    # notice we jump forward by number of parallel environments
    for episode in range(0, number_of_episodes, parallel_envs):
        timer.update(episode)

        # one reward per agent for each of the parallel environments
        reward_this_episode = np.zeros((parallel_envs, 3))
        # obs is the observation state space of all the three agents in the 4 parallel env.
        # for the Physical Deception environment with three agents it is of dimension 4x3x14.
        # obs_full is world state irrespective of the agents and its dimension is 4x14.
        # all_observation = array(number of environments 4, 2 elements)
        # element 0 : is a list that contains 3 arrays. contains the state for each agent, each state is of size 14
        # element 1 : global state from the perspective of the target/green for its environment. contains 14 elements
        all_obs = env.reset()
        # obs: a list with one element per environment; each element is a list of 3 arrays,
        # one per agent, giving that agent's state in that environment.
        # obs_full: the god's-eye view of each environment; a list with one element per environment,
        # each an array of 14 values describing that environment's global state.
        obs, obs_full = transpose_list(all_obs)

        #for calculating rewards for this particular episode - addition of all time steps

        # save info or not
        save_info = (episode % save_interval < parallel_envs
                     or episode == number_of_episodes - parallel_envs)
        frames = []
        tmax = 0

        if save_info:
            frames.append(env.render('rgb_array'))

        for episode_t in range(episode_length):
            # we finish the episode before sampling the buffer for training
            # t jumps forward in a multiple of environment
            t += parallel_envs

            # explore = only explore for a certain number of episodes
            # action input needs to be transposed
            # transpose_to_tensor(obs) regroups the data by agent:
            # with 4 environments there are 4 copies of agent 1, 4 of agent 2 and 4 of agent 3,
            # so each agent's states across the 4 environments form a 4x14 tensor.
            # transpose_to_tensor(obs) is therefore a list of 3 elements, one per agent;
            # each element is a 4x14 array of that agent's observations across the 4 environments.
            # maddpg.act loops over these elements and passes each one to the corresponding
            # agent's actor model to generate that agent's action.
            actions = maddpg.act(transpose_to_tensor(obs), noise=noise)
            noise *= noise_reduction
            # there are 4 actions per agent and 3 agents, 12 in total; each action has 2 elements: force in the x and y directions
            # actions_array is a tensor of shape (3 agent, 4 env, 2 action)
            actions_array = torch.stack(actions).detach().numpy()

            # transpose the list of list
            # flip the first two indices
            # input to step requires the first index to correspond to number of parallel agents
            # the shape of actions_for_env is (4 env, 3 agent, 2 action)
            actions_for_env = np.rollaxis(actions_array, 1)

            # step forward one frame
            # obs is the observation state space of all the three agents in the 4 parallel env.
            # for the Physical Deception environment with three agents it is of dimension 4x3x14.
            # obs_full is world state irrespective of the agents and its dimension is 4x14.
            # To gain more understanding, please see the code in the multiagent folder.
            next_obs, next_obs_full, rewards, dones, info = env.step(
                actions_for_env)

            # add data to buffer
            transition = (obs, obs_full, actions_for_env, rewards, next_obs,
                          next_obs_full, dones)

            buffer.push(transition)

            reward_this_episode += rewards

            obs, obs_full = next_obs, next_obs_full

            # save gif frame
            if save_info:
                frames.append(env.render('rgb_array'))
                tmax += 1

        # update once after every episode_per_update
        if len(buffer) > batchsize and episode % episode_per_update < parallel_envs:
            for a_i in range(3):
                # although samples are drawn randomly, each sample contains all 3 agents' data,
                # so we know which rewards and actions belong to which agent.
                # samples is a list of 7 elements: obs, obs_full, action, reward, next_obs, next_obs_full, done.
                # each element, e.g. samples[0], is a list of 3 elements, one per agent,
                # holding that agent's value (for obs this is a vector with 14 values).
                # asking for 2 samples therefore returns 2 transitions, each containing all 3 agents' states and rewards.
                samples = buffer.sample(batchsize)
                maddpg.update(samples, a_i, logger)
            maddpg.update_targets()  # soft update the target networks towards the local networks

        for i in range(parallel_envs):
            agent0_reward.append(reward_this_episode[i, 0])
            agent1_reward.append(reward_this_episode[i, 1])
            agent2_reward.append(reward_this_episode[i, 2])

        if episode % 100 == 0 or episode == number_of_episodes - 1:
            avg_rewards = [
                np.mean(agent0_reward),
                np.mean(agent1_reward),
                np.mean(agent2_reward)
            ]
            agent0_reward = []
            agent1_reward = []
            agent2_reward = []
            for a_i, avg_rew in enumerate(avg_rewards):
                logger.add_scalar('agent%i/mean_episode_rewards' % a_i,
                                  avg_rew, episode)

        #saving model
        save_dict_list = []
        if save_info:
            for i in range(3):

                save_dict = {
                    'actor_params':
                    maddpg.maddpg_agent[i].actor.state_dict(),
                    'actor_optim_params':
                    maddpg.maddpg_agent[i].actor_optimizer.state_dict(),
                    'critic_params':
                    maddpg.maddpg_agent[i].critic.state_dict(),
                    'critic_optim_params':
                    maddpg.maddpg_agent[i].critic_optimizer.state_dict()
                }
                save_dict_list.append(save_dict)

                torch.save(
                    save_dict_list,
                    os.path.join(model_dir, 'episode-{}.pt'.format(episode)))

            # save gif files
            imageio.mimsave(os.path.join(model_dir,
                                         'episode-{}.gif'.format(episode)),
                            frames,
                            duration=.04)

    env.close()
    logger.close()
    timer.finish()
Example #13
def main():
    seeding()
    # number of parallel agents
    parallel_envs = 4
    # number of training episodes.
    # change this to higher number to experiment. say 30000.
    number_of_episodes = 1000
    episode_length = 80
    batchsize = 1000
    # how many episodes to save policy and gif
    save_interval = 1000
    t = 0

    # amplitude of OU noise
    # this slowly decreases to 0
    noise = 2
    noise_reduction = 0.9999

    # how many episodes before update
    episode_per_update = 2 * parallel_envs

    log_path = os.getcwd() + "/log"
    model_dir = os.getcwd() + "/model_dir"

    os.makedirs(model_dir, exist_ok=True)

    torch.set_num_threads(parallel_envs)
    env = envs.make_parallel_env(parallel_envs)

    # keep 5000 episodes worth of replay
    buffer = ReplayBuffer(int(5000 * episode_length))

    # initialize policy and critic
    maddpg = MADDPG()
    logger = SummaryWriter(log_dir=log_path)
    agent0_reward = []
    agent1_reward = []
    agent2_reward = []

    # training loop
    # show progressbar
    widget = [
        'episode: ',
        pb.Counter(), '/',
        str(number_of_episodes), ' ',
        pb.Percentage(), ' ',
        pb.ETA(), ' ',
        pb.Bar(marker=pb.RotatingMarker()), ' '
    ]

    timer = pb.ProgressBar(widgets=widget, maxval=number_of_episodes).start()

    # use keep_awake to keep workspace from disconnecting
    for episode in range(0, number_of_episodes + parallel_envs, parallel_envs):

        timer.update(episode)

        reward_this_episode = np.zeros((parallel_envs, 3))
        all_obs = env.reset()
        obs, obs_full = transpose_list(all_obs)

        # for calculating rewards for this particular episode - addition of all time steps

        # save info or not
        save_info = (episode % save_interval < parallel_envs)
        frames = []
        tmax = 0

        if save_info:
            frames.append(env.render('rgb_array'))

        for episode_t in range(episode_length):

            t += parallel_envs

            # explore = only explore for a certain number of episodes
            # action input needs to be transposed
            actions = maddpg.act(transpose_to_tensor(obs), noise=noise)
            noise *= noise_reduction

            actions_array = torch.stack(actions).detach().numpy()

            # transpose the list of list
            # flip the first two indices
            # input to step requires the first index to correspond to number of parallel agents
            actions_for_env = np.rollaxis(actions_array, 1)

            # step forward one frame
            next_obs, next_obs_full, rewards, dones, info = env.step(
                actions_for_env)

            # add data to buffer
            transition = (obs, obs_full, actions_for_env, rewards, next_obs,
                          next_obs_full, dones)

            buffer.push(transition)

            reward_this_episode += rewards

            obs, obs_full = next_obs, next_obs_full

            # save gif frame
            if save_info:
                frames.append(env.render('rgb_array'))
                tmax += 1

        # update once after every episode_per_update
        if len(buffer) > batchsize and episode % episode_per_update < parallel_envs:
            for a_i in range(3):
                samples = buffer.sample(batchsize)
                maddpg.update(samples, a_i, logger)
            maddpg.update_targets()  # soft update the target networks towards the local networks

        for i in range(parallel_envs):
            agent0_reward.append(reward_this_episode[i, 0])
            agent1_reward.append(reward_this_episode[i, 1])
            agent2_reward.append(reward_this_episode[i, 2])

        if episode % 100 == 0 or episode == number_of_episodes - 1:
            avg_rewards = [
                np.mean(agent0_reward),
                np.mean(agent1_reward),
                np.mean(agent2_reward)
            ]
            agent0_reward = []
            agent1_reward = []
            agent2_reward = []
            for a_i, avg_rew in enumerate(avg_rewards):
                logger.add_scalar('agent%i/mean_episode_rewards' % a_i,
                                  avg_rew, episode)

        # saving model
        save_dict_list = []
        if save_info:
            for i in range(3):

                save_dict = {
                    'actor_params':
                    maddpg.maddpg_agent[i].actor.state_dict(),
                    'actor_optim_params':
                    maddpg.maddpg_agent[i].actor_optimizer.state_dict(),
                    'critic_params':
                    maddpg.maddpg_agent[i].critic.state_dict(),
                    'critic_optim_params':
                    maddpg.maddpg_agent[i].critic_optimizer.state_dict()
                }
                save_dict_list.append(save_dict)

                torch.save(
                    save_dict_list,
                    os.path.join(model_dir, 'episode-{}.pt'.format(episode)))

            # save gif files
            imageio.mimsave(os.path.join(model_dir,
                                         'episode-{}.gif'.format(episode)),
                            frames,
                            duration=.04)

    env.close()
    logger.close()
    timer.finish()
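maddpg.update_targets() in the two examples above performs the usual DDPG-style soft update; its body is not shown here, but it typically amounts to Polyak averaging of every target network toward its local counterpart, roughly as in this sketch:

def soft_update(target, source, tau):
    # target <- tau * source + (1 - tau) * target, applied parameter by parameter
    for t_param, s_param in zip(target.parameters(), source.parameters()):
        t_param.data.copy_(tau * s_param.data + (1.0 - tau) * t_param.data)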
Example #14
totalTime = 0

vis = visdom.Visdom(port=8097)
win = None
param = None

np.random.seed(1234)
th.manual_seed(1234)

n_episode = 20000
max_steps = 1200
episode_before_train = 100
obs = env.reset()
n_states = len(obs[0])

maddpg = MADDPG(n_agents, n_states, n_actions, batch_size, capacity, episode_before_train)

FloatTensor = th.cuda.FloatTensor if maddpg.use_cuda else th.FloatTensor

for i_episode in range(n_episode):
    startTime = datetime.datetime.now()
    obs = env.reset()
    obs = np.stack(obs)
    if isinstance(obs, np.ndarray):
        obs = th.from_numpy(obs).float()
    #obs = np.asarray(obs)
    reward_record = []
    adversaries_reward_record = []
    agent_reward_record = []

    # for i in range(len(obs)):
def main():

    seeding()

    number_of_episodes = 20000
    episode_length = 1000
    batchsize = 256
    save_interval = 1000
    rewards_deque = deque(maxlen=100)
    rewards_all = []
    noise = 1.0
    noise_reduction = 1.0

    log_path = os.getcwd() + "/log"
    model_dir = os.getcwd() + "/model_dir"

    os.makedirs(model_dir, exist_ok=True)
    """ Info about the UnityEnvironment
    brain_name: 'TennisBrain'
    brain: ['brain_name', 'camera_resolutions',
           'num_stacked_vector_observations', 'number_visual_observations',
           'vector_action_descriptions', 'vector_action_space_size',
           'vector_action_space_type', 'vector_observation_space_size',
           'vector_observation_space_type']
    """

    env = UnityEnvironment(file_name="Tennis.app")
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]

    buffer = ReplayBuffer(int(1e5))

    # initialize policy and critic
    maddpg = MADDPG()
    logger = SummaryWriter(log_dir=log_path)

    # ------------------------------ training ------------------------------ #
    # show progressbar
    import progressbar as pb
    widget = [
        'episode: ',
        pb.Counter(), '/',
        str(number_of_episodes), ' ',
        pb.Percentage(), ' ',
        pb.ETA(), ' ',
        pb.Bar(marker=pb.RotatingMarker()), ' '
    ]

    timer = pb.ProgressBar(widgets=widget, maxval=number_of_episodes).start()

    for episode in range(1, number_of_episodes + 1):

        timer.update(episode)
        rewards_this_episode = np.zeros((2, ))
        """ Info about the UnityEnvironment
        env_info: ['agents', 'local_done', 'max_reached', 'memories',
                  'previous_text_actions', 'previous_vector_actions', 'rewards',
                  'text_observations', 'vector_observations', 'visual_observations']
        actions: List(num_agents=2, action_size=2)
        states: List((24,), (24,))
        rewards: List(2,)
        dones: List(2,)
        """
        env_info = env.reset(train_mode=True)[brain_name]
        states = env_info.vector_observations

        for episode_t in range(episode_length):
            # reset the OUNoise for each agent.
            for i in range(2):
                maddpg.maddpg_agent[i].noise.reset()

            actions = maddpg.act(states, noise=noise)
            env_info = env.step(actions)[brain_name]
            noise *= noise_reduction

            next_states = env_info.vector_observations
            rewards = env_info.rewards
            dones = env_info.local_done

            # add data to buffer
            transition = (states, actions, rewards, next_states, dones)
            buffer.push(transition)

            rewards_this_episode += rewards

            states = next_states

            if any(dones):
                break

        # update the local and target network
        if len(buffer) > batchsize:
            # update the local network
            for _ in range(5):
                for a_i in range(2):
                    samples = buffer.sample(batchsize)
                    maddpg.update(samples, a_i, logger)
            # soft update the target network
            maddpg.update_targets()

        rewards_all.append(rewards_this_episode)
        rewards_deque.append(np.max(rewards_this_episode))
        average_score = np.mean(rewards_deque)

        # --------------------- Logging for TensorBoard --------------------- #
        logger.add_scalars('rewards', {
            'agent0': rewards_this_episode[0],
            'agent1': rewards_this_episode[1]
        }, episode)
        logger.add_scalars('global', {
            'score': np.max(rewards_this_episode),
            'average_score': average_score
        }, episode)
        # -------------------------- Save the model -------------------------- #
        save_dict_list = []

        if episode % save_interval == 0 or average_score >= 0.5:
            for i in range(2):
                save_dict = \
                    {'actor_params' : maddpg.maddpg_agent[i].actor.state_dict(),
                     'actor_optim_params': maddpg.maddpg_agent[i].actor_optimizer.state_dict(),
                     'critic_params' : maddpg.maddpg_agent[i].critic.state_dict(),
                     'critic_optim_params' : maddpg.maddpg_agent[i].critic_optimizer.state_dict()}
                save_dict_list.append(save_dict)

                torch.save(
                    save_dict_list,
                    os.path.join(model_dir, 'episode-{}.pt'.format(episode)))

            if average_score >= 3.0:
                print('\nEnvironment solved in {} episodes!'.format(episode - 100))
                print('\nAverage Score: {:.2f}'.format(average_score))
                break

    env.close()
    logger.close()
    timer.finish()
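The per-agent noise object that is reset at the start of every episode above is, as the comment says, an Ornstein-Uhlenbeck process; the class itself is not shown, so here is a small assumed sketch of what reset() and sample() usually do:

import numpy as np


class OUNoise:
    """Ornstein-Uhlenbeck exploration noise (assumed sketch of the object behind noise.reset())."""

    def __init__(self, size, mu=0.0, theta=0.15, sigma=0.2, seed=0):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.rng = np.random.default_rng(seed)
        self.reset()

    def reset(self):
        # start each episode from the long-run mean
        self.state = self.mu.copy()

    def sample(self):
        # mean-reverting step plus Gaussian perturbation
        dx = self.theta * (self.mu - self.state) + self.sigma * self.rng.standard_normal(len(self.state))
        self.state = self.state + dx
        return self.state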
Example #16
def main():
    env_info = env.reset(train_mode=False)[brain_name]
    num_agents = len(env_info.agents)
    print('Number of agents:', num_agents)

    # size of each action
    action_size = brain.vector_action_space_size
    print('Size of each action:', action_size)

    # examine the state space
    states = env_info.vector_observations
    state_size = states.shape[1]

    seeding()
    # number of parallel agents
    #parallel_envs = num_agents
    # number of training episodes.
    # change this to higher number to experiment. say 30000.

    number_of_episodes = 10000
    update_actor_after = 100
    update_actor_every = 2
    episode_length = 100
    batchsize = 100
    # how many episodes to save policy and gif
    save_interval = 1000
    t = 0

    LR_ACTOR = 1e-5
    LR_CRITIC = 3e-3

    # amplitude of OU noise
    # this slowly decreases to 0
    noise = 1.0
    noise_reduction = 0.999999

    # how many episodes before update
    episode_per_update = 1
    no_of_updates_perTime = 1

    log_path = os.getcwd() + "/log"
    model_dir = os.getcwd() + "/model_dir"

    os.makedirs(model_dir, exist_ok=True)

    #torch.set_num_threads(parallel_envs)
    #env = envs.make_parallel_env(parallel_envs)

    # keep 5000 episodes worth of replay
    buffer = ReplayBuffer(int(10 * episode_length))

    # initialize policy and critic
    maddpg = MADDPG(lr_actor=LR_ACTOR, lr_critic=LR_CRITIC)
    #logger = SummaryWriter(log_dir=log_path)
    agent0_reward = []
    agent1_reward = []
    #agent2_reward = []

    # training loop
    # show progressbar
    import progressbar as pb
    widget = [
        'episode: ',
        pb.Counter(), '/',
        str(number_of_episodes), ' ',
        pb.Percentage(), ' ',
        pb.ETA(), ' ',
        pb.Bar(marker=pb.RotatingMarker()), ' '
    ]

    timer = pb.ProgressBar(widgets=widget, maxval=number_of_episodes).start()

    # use keep_awake to keep workspace from disconnecting
    for episode in range(0, number_of_episodes):

        timer.update(episode)

        env_info = env.reset(
            train_mode=False)[brain_name]  # reset the environment
        states = env_info.vector_observations  # get the current state (for each agent)
        scores = np.zeros(num_agents)  # initialize the score (for each agent)
        reward_this_episode = np.zeros((1, num_agents))

        #all_obs = env.reset() #
        obs = states
        obs_full = np.concatenate((states[0], states[1]))

        #for calculating rewards for this particular episode - addition of all time steps

        # save info or not
        save_info = ((episode) % save_interval < 1
                     or episode == number_of_episodes - 1)
        tmax = 0

        #resetting noise
        for i in range(num_agents):
            maddpg.maddpg_agent[i].noise.reset()

        for episode_t in range(episode_length):

            t += 1

            update_act = episode > update_actor_after or episode % update_actor_every == 0
            # explore = only explore for a certain number of episodes
            # action input needs to be transposed
            actions = maddpg.act(transpose_to_tensorAsitis(obs),
                                 noise=noise,
                                 batch=False)
            noise *= noise_reduction

            actions_array = torch.stack(actions).cpu().detach().numpy()

            # transpose the list of list
            # flip the first two indices
            # input to step requires the first index to correspond to number of parallel agents
            actions_for_env = np.rollaxis(actions_array, 1)

            # step forward one frame
            env_info = env.step(actions_for_env)[brain_name]

            next_states = env_info.vector_observations  # get next state (for each agent)
            rewards = env_info.rewards  # get reward (for each agent)
            dones = env_info.local_done  # see if episode finished
            scores += env_info.rewards

            rewards_for_env = np.hstack(rewards)

            obs = states
            obs_full = np.concatenate((states[0], states[1]))
            next_obs = next_states
            next_obs_full = np.concatenate((next_states[0], next_states[1]))
            # add data to buffer
            transition = (np.array([obs]), np.array([obs_full]),
                          np.array([actions_for_env]),
                          np.array([rewards_for_env]), np.array([next_obs]),
                          np.array([next_obs_full]),
                          np.array([dones], dtype='float'))
            buffer.push(transition)

            reward_this_episode += rewards

            obs, obs_full = next_obs, next_obs_full
            states = next_states  # roll over states so the next iteration builds obs from the latest observations

            # update once after every episode_per_update
            if len(buffer) > batchsize and episode % episode_per_update == 0:
                for _ in range(no_of_updates_perTime):
                    for a_i in range(num_agents):
                        samples = buffer.sample(batchsize)
                        # update the network weights
                        maddpg.update(samples, a_i, update_actor=update_act)
                    maddpg.update_targets()  # soft update the target networks towards the local networks

            if np.any(dones):
                # if the episode is done, break out of the loop and move on to the next episode
                break

        agent0_reward.append(reward_this_episode[0, 0])
        agent1_reward.append(reward_this_episode[0, 1])

        if episode % 100 == 0 or episode == number_of_episodes - 1:
            avg_rewards = [np.mean(agent0_reward), np.mean(agent1_reward)]
            agent0_reward = []
            agent1_reward = []
            for a_i, avg_rew in enumerate(avg_rewards):
                #logger.add_scalar('agent%i/mean_episode_rewards' % a_i, avg_rew, episode)
                print('agent%i/mean_episode_rewards' % a_i, avg_rew, episode)

        #saving model
        save_dict_list = []
        if save_info:
            for i in range(num_agents):

                save_dict = {
                    'actor_params':
                    maddpg.maddpg_agent[i].actor.state_dict(),
                    'actor_optim_params':
                    maddpg.maddpg_agent[i].actor_optimizer.state_dict(),
                    'critic_params':
                    maddpg.maddpg_agent[i].critic.state_dict(),
                    'critic_optim_params':
                    maddpg.maddpg_agent[i].critic_optimizer.state_dict()
                }
                save_dict_list.append(save_dict)

                torch.save(
                    save_dict_list,
                    os.path.join(model_dir, 'episode-{}.pt'.format(episode)))

            # save gif files
            #imageio.mimsave(os.path.join(model_dir, 'episode-{}.gif'.format(episode)),
            #frames, duration=.04)
    timer.finish()
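The example above stores obs_full / next_obs_full (both agents' observations concatenated) next to the per-agent data because the MADDPG critic is centralised: it scores the joint observation together with all agents' actions. The critic network itself is not part of the listing; a minimal sketch of that idea, with illustrative layer sizes, is:

import torch
import torch.nn as nn


class CentralCritic(nn.Module):
    """Sketch of a centralised MADDPG critic: Q(obs_full, all actions) -> scalar."""

    def __init__(self, full_state_size, full_action_size, hidden=256):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(full_state_size + full_action_size, hidden),
            nn.ReLU(),
            nn.Linear(hidden, hidden),
            nn.ReLU(),
            nn.Linear(hidden, 1),
        )

    def forward(self, obs_full, actions_all):
        # concatenate the joint observation with the joint action before scoring
        return self.net(torch.cat([obs_full, actions_all], dim=-1))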
Example #17
print('The state for the first agent looks like:', states[0])

# config settings
config = Config()
config.update_every = 1
config.batch_size = 512
config.buffer_size = int(1e6)
config.discount = 0.99
config.tau = 0.2
config.seed = 2
config.lr_actor = 1e-4
config.lr_critic = 1e-4
config.action_size = action_size
config.state_size = state_size
config.num_agents = num_agents
ma = MADDPG(config)


def train(n_episode=30000):
    """
    Function to train the agent
    """
    scores = []
    scores_window = deque(maxlen=100)
    for i_episode in range(n_episode):
        env_info = env.reset(train_mode=True)[brain_name]
        states = env_info.vector_observations
        ma.reset()
        score = np.zeros(num_agents)
        while True:
            actions = ma.act(states)
Example #18
def main(arglist):
    ACTORS = 1
    env = EnvWrapper(arglist.scenario, ACTORS, arglist.saved_episode)
    if arglist.eval:
        current_time = strftime("%Y-%m-%d-%H-%M-%S", gmtime())
        writer = SummaryWriter(log_dir='./logs/' + current_time + '-' +
                               arglist.scenario)
    maddpg_wrapper = MADDPG(ACTORS)

    maddpg_wrapper.create_agents(env, arglist)

    j = 0
    for episode in range(arglist.max_episode):
        obs = env.reset()
        terminal = False
        maddpg_wrapper.reset()
        total_reward = [0 for i in maddpg_wrapper.workers]
        step = 0

        while not terminal and step < 25:
            if not arglist.eval:
                env.render(0)
                time.sleep(0.03)

            actions = maddpg_wrapper.take_actions(obs)
            obs2, reward, done = env.step(actions)

            for actor in range(ACTORS):
                for i, rew in enumerate(reward[actor]):
                    total_reward[i] += rew

            j += ACTORS
            #terminal = all(done)
            if arglist.eval:
                maddpg_wrapper.update(j, ACTORS, actions, reward, obs, obs2,
                                      done)

            obs = obs2
            step += 1

        if arglist.eval and episode % arglist.saved_episode == 0 and episode > 0:
            maddpg_wrapper.save(episode)

        if arglist.eval:
            for worker, ep_ave_max in zip(maddpg_wrapper.workers,
                                          maddpg_wrapper.ep_ave_max_q_value):
                print(worker.pos, ' => average_max_q: ',
                      ep_ave_max / float(step), ' Reward: ',
                      total_reward[worker.pos], ' Episode: ', episode)
                writer.add_scalar(
                    str(worker.pos) + '/Average_max_q',
                    ep_ave_max / float(step), episode)
                writer.add_scalar(
                    str(worker.pos) + '/Reward Agent',
                    total_reward[worker.pos], episode)

    env.close()
Example #19
# ##############################################################################
#                                                  ENVIRONMENT
# ##############################################################################
env = UnityEnvironment(file_name=ENV_FILE, seed=SEED)

# get the default brain
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

# ##############################################################################
#                                                  AGENT
# ##############################################################################
# INITIALIZE AGENT, AND LOAD WEIGHTS FROM BEST SNAPSHOT
maddpg = MADDPG(
    actor_layer_sizes=ACTOR_LAYER_SIZES,
    critic_layer_sizes=CRITIC_LAYER_SIZES,
    clamp_actions=CLAMP_ACTIONS,
    logger=None,
)
maddpg.load_model(os.path.join(snapshots_dir, "best_model.snapshot"))

# ##############################################################################
#                                                 INTERACT WITH ENVIRONMENT
# ##############################################################################
for episode_i in range(1, N_EPISODES + 1):
    print("{dec}\nEpisode {i}\n{dec}\n".format(dec="=" * 60, i=episode_i))
    # INITIALIZE FOR NEW EPISODE
    rewards_this_episode = np.zeros((N_AGENTS, ))
    env_info = env.reset(train_mode=False)[brain_name]
    states = process_agent_states(env_info.vector_observations)
    global_state = process_gobal_state(env_info.vector_observations)
Example #20
def update():
    if ALGORITHM == 'maddpg':
        ddpg = MADDPG(avs.n_actions, avs.n_features, 1, 'maddpg model',
                      RETRAIN)
    elif ALGORITHM == 'ddpg':
        ddpg = DDPG(avs.n_actions, avs.n_features, 1, 'ddpg model', RETRAIN)
    else:
        ddpg = DDPG(avs.n_actions, avs.n_features, 1, 'ddpg model', RETRAIN)
    t1 = time.time()
    rewards1 = 0
    rewards2 = 0
    var = VAR
    collision = 0
    avgreward1 = []
    avgreward2 = []
    collision_percentage = []
    for i in range(MAX_EPISODES):
        s1, s2 = avs.reset()
        ep_reward1 = 0
        ep_reward2 = 0
        if i % 100000 == 0 and i > IMITATION_EPISODE:
            plot(avgreward1, avgreward2, collision_percentage, i)
        for j in range(MAX_EP_STEPS):
            if RENDER:
                avs.render()

            # Add exploration noise
            if i < IMITATION_EPISODE or i % 4 == 0:
                a1 = imitation(avs.agent1, avs.agent2, avs.target1)
                a2 = imitation(avs.agent2, avs.agent1, avs.target2)
            else:
                # add randomness to action selection for exploration
                a1 = ddpg.choose_action(s1)
                a1 = [
                    np.clip(np.random.normal(a1[0], var), -1, 1),
                    np.clip(np.random.normal(a1[1], var), -1, 1)
                ]
                a2 = ddpg.choose_action(s2)
                a2 = [
                    np.clip(np.random.normal(a2[0], var), -1, 1),
                    np.clip(np.random.normal(a2[1], var), -1, 1)
                ]
                # a2 = imitation(avs.agent2, avs.agent1, avs.target2)

            if DEBUG:
                time.sleep(0.1)
            s_1, r1, s_2, r2, done, info = avs.step(a1, a2)
            if ALGORITHM == 'ddpg':
                ddpg.store_transition(s1, a1, r1, s_1)
                ddpg.store_transition(s2, a2, r2, s_2)
            else:
                ddpg.store_transition(s1, s2, a1, a2, r1, s_1, s_2)
                ddpg.store_transition(s2, s1, a2, a1, r2, s_2, s_1)

            s1 = s_1
            s2 = s_2
            ep_reward1 += r1
            ep_reward2 += r2

            if j == MAX_EP_STEPS - 1 or done:
                print("pt:", ddpg.pointer)
                print('Episode:', i,
                      'Step:', j, ' Reward: %i' % int(ep_reward1),
                      int(ep_reward2), 'Explore: %.2f' % var)

                if i >= IMITATION_EPISODE:
                    rewards1 += ep_reward1
                    rewards2 += ep_reward2
                    if r1 < -100:
                        collision += 1
                    if (i + 1) % 100 == 0:
                        avgreward1.append(rewards1 / 100)
                        avgreward2.append(rewards2 / 100)
                        collision_percentage.append(collision)
                        rewards1 = 0
                        rewards2 = 0
                        collision = 0
                break
        if ddpg.pointer > MEMORY_CAPACITY:
            ddpg.learn()
            ddpg.learn()
            if var > MIN_VAR and i > IMITATION_EPISODE:
                var *= DECAY  # decay the action randomness
        if i % 4 != 0 and ep_reward1 > 100 and ep_reward2 > 100 and i > IMITATION_EPISODE:
            ddpg.save(i)
    print('Running time: ', time.time() - t1)
# size of each action
action_size = brain.vector_action_space_size
print('Size of each action:', action_size)

# examine the state space
states = env_info.vector_observations
state_size = states.shape[1]
print('There are {} agents. Each observes a state with length: {}'.format(
    states.shape[0], state_size))
print('The state for the first agent looks like:', states[0])

from maddpg import MADDPG
from collections import deque
import torch
agent = MADDPG(24, 2, 0)

env_info = env.reset(train_mode=True)[brain_name]
env_info.vector_observations.shape


def maddpg(max_episodes=2000, print_every=10):
    scores_deque = deque(maxlen=100)
    scores = []
    for i_episode in range(1, max_episodes + 1):
        env_info = env.reset(train_mode=True)[brain_name]
        states = env_info.vector_observations
        agent.reset()
        score = 0
        while True:
            actions = agent.act(states)
Example #22
    print('Size of each action:', action_size)

    # examine the state space
    states = env_info.vector_observations
    state_size = states.shape[1]
    print('There are {} agents. Each observes a state with length: {}'.format(
        states.shape[0], state_size))
    print('The state for the first agent looks like:', states[0])

    full_action_size = num_agents * action_size
    full_state_size = num_agents * state_size

    writer = SummaryWriter(log_dir="logs/train", flush_secs=30)
    maddpg = MADDPG(num_agents,
                    state_size,
                    action_size,
                    buffer_size=BUFFER_SIZE,
                    writer=writer)

    scores_list, avg_scores_list = maddpg_training(maddpg, writer)

    fig = plt.figure()
    ax = fig.add_subplot(111)
    plt.plot(np.arange(1, len(scores_list) + 1), scores_list)
    plt.plot(np.arange(1, len(avg_scores_list) + 1), avg_scores_list)
    plt.ylabel('Score')
    plt.xlabel('Episode #')
    plt.savefig("Scores.png")
    plt.show()

    writer.close()
Example #23
            print(
                'Episode {}\tAverage Score: {:.3f} MaxReward: {:.3f} Buffer : {}/{} Noise: {:.3f} Timestep: {}.'
                .format(episode, avg_score, max_reward, len(agent.memory),
                        BUFFER_SIZE, agent.epsilon, agent.timestep_counter))

        if avg_score >= GOAL:
            print(
                '\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'
                .format(episode, avg_score))
            agent.checkpoint()
            break

    return global_scores, averaged_scores


# Init the Tennis environment and get agents, state and action info
env, brain_name, n_agents, state_size, action_size = init_environment(
    UNITY_EXE_PATH)
agent = MADDPG(state_size=state_size,
               action_size=action_size,
               n_agents=n_agents,
               random_seed=89)
# Train the agent and get the results
scores, averages = train()

# Plot Statistics (Global scores and averaged scores)
plt.subplot(2, 1, 2)
plt.plot(np.arange(1, len(scores) + 1), averages)
plt.ylabel('Tennis Environment Average Score')
plt.xlabel('Episode #')
plt.show()
Example #24
print('\nExample state for a single agent:\n', states[0])

agent_state_size = process_agent_states(states).shape[1]
global_state_size = process_gobal_state(states).shape[0]

# ##############################################################################
#                                                  AGENT
# ##############################################################################
# Create Multi agent Actor-Critic Model
maddpg = MADDPG(
    actor_layer_sizes=ACTOR_LAYER_SIZES,
    critic_layer_sizes=CRITIC_LAYER_SIZES,
    discount_factor=DISCOUNT_FACTOR,
    tau=TAU,
    lr_actor=LR_ACTOR,
    lr_critic=LR_CRITIC,
    gradient_clipping=GRADIENT_CLIPPING,
    clamp_actions=CLAMP_ACTIONS,
    logger=logger,
    log_losses=True,
    log_layers=False,
    log_weights=False,
)

# ##############################################################################
#                                                  TRAIN
# ##############################################################################
buffer = ReplayBuffer(int(BUFFER_SIZE), seed=SEED)

n_episodes = 10000
best_rolling_mean_score = -np.inf
hard_noise_reigime = True
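process_agent_states and process_gobal_state (the spelling is the listing's own) are not defined in these excerpts. Their usage above, reading .shape[1] of the former and .shape[0] of the latter, is consistent with a per-agent pass-through and a flatten into one global vector; a minimal assumed version:

import numpy as np


def process_agent_states(vector_observations):
    # per-agent local observations, shape (num_agents, state_size) (assumed behaviour)
    return np.asarray(vector_observations)


def process_gobal_state(vector_observations):
    # one global state vector: all agents' observations flattened (assumed behaviour)
    return np.asarray(vector_observations).reshape(-1)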
Example #25
if __name__ == "__main__":

    # Configuration
    n_episodes = 1
    checkpoint = "./checkpoints/checkpoint{}.pth"

    # Unity environment
    env = UnityEnvironment("./Tennis_Linux/Tennis.x86_64")

    # Agent
    agent = TennisMultiAgent(state_size=24, action_size=2, n_agents=2)
    agent.load(checkpoint)

    # DDPG
    maddpg = MADDPG(env=env, agent=agent)
    scores = maddpg.test(n_episodes=n_episodes)

    # Close the environment
    env.close()

    if n_episodes > 1:
        # Show results
        print(scores)
        print("Average score of {} episodes: {:.2f}".format(
            n_episodes, np.mean(scores)))

        # Plot scores
        fig, ax = plt.subplots(figsize=(10, 6))
        ax.plot(np.arange(1, n_episodes + 1), scores)
        ax.set_xlabel("Episodes")
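
agent.load(checkpoint) above takes a per-agent path template. A hypothetical helper showing how such per-agent loading could work; the attribute names (agent.agents, actor_local, critic_local) and the checkpoint dictionary keys are assumptions, not taken from this example:

import torch


def load_checkpoints(agent, checkpoint_template):
    # one file per sub-agent, e.g. ./checkpoints/checkpoint0.pth, checkpoint1.pth
    for i, sub_agent in enumerate(agent.agents):
        state = torch.load(checkpoint_template.format(i))
        sub_agent.actor_local.load_state_dict(state["actor"])
        sub_agent.critic_local.load_state_dict(state["critic"])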
Beispiel #26
0
num_agents = len(env_info.agents)
print('Number of agents:', num_agents)

# size of each action
action_size = brain.vector_action_space_size
print('Size of each action:', action_size)

# examine the state space
states = env_info.vector_observations
state_size = states.shape[1]
print('There are {} agents. Each observes a state with length: {}'.format(
    states.shape[0], state_size))
print('The state for the first agent looks like:', states[0])

agents = MADDPG(state_size=state_size,
                action_size=action_size,
                num_agents=num_agents,
                random_seed=2)
agents.actor_local.load_state_dict(torch.load('checkpoint_actor.pth'))
agents.critic_local.load_state_dict(torch.load('checkpoint_critic.pth'))


def play(n_episodes=5):
    for i_episode in range(1, n_episodes + 1):
        env_info = env.reset(train_mode=False)[brain_name]
        states = env_info.vector_observations
        agents.reset()
        scores = np.zeros(num_agents)
        while True:
            actions = agents.act(states)
            env_info = env.step(actions)[brain_name]
            next_states = env_info.vector_observations
Beispiel #27
0
max_steps = 100
# number of warm-up episodes used only to fill the replay buffer with all agents' experience before training starts
episode_before_train = 100
obs = env.reset()
n_states = len(obs[0])
initial_train = True
test_or_train = True

#vis = visdom.Visdom(port=8097)
win = None
param = None

np.random.seed(1234)
th.manual_seed(1234)
# initialize the original MADDPG agent; it is the core of this architecture
maddpg = MADDPG(n_agents, n_states, n_actions, batch_size, capacity,
                episode_before_train, initial_train, test_or_train)

FloatTensor = th.cuda.FloatTensor if maddpg.use_cuda else th.FloatTensor

for i in range(maddpg.n_agents):
    maddpg.critics[i] = th.load('new/model_initial/critic[' + str(i) +
                                '].pkl_episode' + str(3000))
    maddpg.actors[i] = th.load('new/model_initial/actors[' + str(i) +
                               '].pkl_episode' + str(3000))

for i_episode in range(n_episode):
    startTime = datetime.datetime.now()
    obs = env.reset()
    obs = np.stack(obs)
    if isinstance(obs, np.ndarray):
        obs = th.from_numpy(obs).float()
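
The loop above restores whole pickled modules with th.load. As an alternative sketch (not what this snippet does), checkpoints saved as state_dicts can be restored like this, which is more robust to code changes; the file layout below mirrors the paths used above but is otherwise assumed:

import torch as th


def load_state_dicts(maddpg, episode, folder='new/model_initial'):
    # assumes the checkpoints were written with .state_dict() rather than
    # as whole modules; the naming pattern follows the paths above
    for i in range(maddpg.n_agents):
        maddpg.critics[i].load_state_dict(
            th.load('{}/critic[{}].pkl_episode{}'.format(folder, i, episode)))
        maddpg.actors[i].load_state_dict(
            th.load('{}/actors[{}].pkl_episode{}'.format(folder, i, episode)))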
Beispiel #28
0
def main():
    seeding()
    # number of parallel agents
    parallel_envs = 4
    # number of training episodes.
    # change this to higher number to experiment. say 30000.
    number_of_episodes = 1000
    episode_length = 80
    batchsize = 1000
    # how many episodes to save policy and gif
    save_interval = 1000
    t = 0

    # amplitude of OU noise
    # this slowly decreases to 0
    noise = 2
    noise_reduction = 0.9999

    # how many episodes before update
    episode_per_update = 2 * parallel_envs

    log_path = os.getcwd() + '/log'
    model_dir = os.getcwd() + '/model_dir'

    os.makedirs(model_dir, exist_ok=True)

    torch.set_num_threads(parallel_envs)
    env = envs.make_parallel_env(parallel_envs)

    # keep 5000 episodes worth of replay
    buffer = ReplayBuffer(int(5000 * episode_length))

    # initialize policy and critic
    maddpg = MADDPG()
    logger = SummaryWriter(log_dir=log_path)
    agent0_reward = []
    agent1_reward = []
    agent2_reward = []

    # training loop
    # show progressbar
    import progressbar as pb
    widget = ['episode: ', pb.Counter(), '/', str(number_of_episodes), ' ',
              pb.Percentage(), ' ', pb.ETA(), ' ', pb.Bar(marker=pb.RotatingMarker()), ' ']

    timer = pb.ProgressBar(widgets=widget, maxval=number_of_episodes).start()

    # use keep_awake to keep workspace from disconnecting
    # for episode in keep_awake(range(0, number_of_episodes, parallel_envs)):
    for episode in range(0, number_of_episodes, parallel_envs):

        timer.update(episode)

        reward_this_episode = np.zeros((parallel_envs, 3))
        all_obs = env.reset()
        obs, obs_full = transpose_list(all_obs)

        # for calculating rewards for this particular episode - addition of all time steps

        # save info or not
        save_info = ((episode) % save_interval < parallel_envs
                     or episode == number_of_episodes - parallel_envs)
        frames = []
        tmax = 0

        if save_info:
            frames.append(env.render('rgb_array'))
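
The noise / noise_reduction pair above scales an Ornstein-Uhlenbeck process whose amplitude decays toward zero. The OU class itself is not shown; a minimal sketch with commonly used default parameters (mu, theta, sigma here are assumptions, not values from this example):

import numpy as np


class OUNoise:
    """Minimal Ornstein-Uhlenbeck process sketch for temporally correlated exploration noise."""

    def __init__(self, size, mu=0.0, theta=0.15, sigma=0.2, seed=0):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.rng = np.random.default_rng(seed)
        self.reset()

    def reset(self):
        # start each episode from the long-run mean
        self.state = np.copy(self.mu)

    def sample(self):
        # mean-reverting step plus Gaussian perturbation
        dx = self.theta * (self.mu - self.state) + self.sigma * self.rng.standard_normal(len(self.state))
        self.state = self.state + dx
        return self.state


# usage with the decaying amplitude from the snippet above (illustrative):
# action = actor(state) + noise * ou.sample()
# noise *= noise_reduction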
Beispiel #29
0
    'N_EPS_MIN': .01,                  # Normal noise min decay value

    'OU_THETA': 1e-2,                  # OU noise theta parameter
    'OU_SIGMA': 1e-2,                  # OU noise sigma parameters

    'SEED': 42,                       # Random seed
    'DEVICE': torch.device("cuda" if torch.cuda.is_available() else "cpu"),

    # Training Hyperparameters
    'N_EPISODES': 2000,
    'MAX_T': 2000,
    'SUCCESS_SCORE': .5,
    'PRINT_EVERY': 100,

    # Save on W&B
    'WANDB': True,
}

if PARAMETERS['WANDB']:
    # Save on wandb
    wandb.init(project="maddpg", config=PARAMETERS)

# Agent
agent = MADDPG(PARAMETERS)

# Training job
scores = train_MADDPG(env, agent, n_episodes=PARAMETERS['N_EPISODES'],
                      max_t=PARAMETERS['MAX_T'], success_score=PARAMETERS['SUCCESS_SCORE'],
                      print_every=PARAMETERS['PRINT_EVERY'], brain_name=brain_name,
                      use_wandb=PARAMETERS['WANDB'])
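
train_MADDPG receives use_wandb but its logging is not shown. A hypothetical per-episode logging helper built on the real wandb.log API; the metric names are illustrative only:

import numpy as np
import wandb


def log_episode_to_wandb(episode, scores_window, epsilon, use_wandb=True):
    """Send the latest score, its rolling mean, and the noise level to W&B."""
    if use_wandb:
        wandb.log({
            'episode': episode,
            'score': scores_window[-1],
            'avg_score_100': float(np.mean(scores_window)),
            'epsilon': epsilon,
        })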
Beispiel #30
0
def main():
    seeding()
    # number of parallel agents
    number_of_agents = 2
    # number of training episodes.
    # change this to higher number to experiment. say 30000.
    number_of_episodes = 3000
    batchsize = 128
    
    # amplitude of OU noise
    # this slowly decreases to 0
    noise = 1
    noise_reduction = 0.9999

    tau = 1e-3   # soft update factor
    gamma = 0.99 # reward discount factor

    print_every = 100
    # how many episodes before update
    episode_per_update = 2

    # directory for the model checkpoints saved at the end of training
    model_dir = os.getcwd() + "/model_dir"
    os.makedirs(model_dir, exist_ok=True)

    result_dir= os.getcwd()+"/result_dir"
    os.makedirs(result_dir, exist_ok=True)

    # do we need to set multi-thread for this env?
    torch.set_num_threads(number_of_agents*2)

    env = TennisEnv()
    
    # keep 5000 episodes worth of replay
    buffer = ReplayBuffer(int(1e5))
    
    num_agents, num_states, num_actions = env.get_shapes()

    # initialize policy and critic
    maddpg = MADDPG(num_agents, num_states, num_actions, discount_factor=gamma, tau=tau)

    # training loop
    scores_window = deque(maxlen=100)
    ep_scores = []


    agent0_reward = []
    agent1_reward = []

    for episode in range(0, number_of_episodes):
        reward_this_episode = np.zeros((1, number_of_agents))
        states, states_full, env_info = env.reset()

        for agent in maddpg.maddpg_agent:
            agent.noise.reset()

        while True:
            actions = maddpg.act(torch.tensor(states, dtype=torch.float), noise=noise)

            noise *= noise_reduction
            actions_for_env = torch.stack(actions).detach().numpy()

            # step forward one frame
            next_states, next_states_full, rewards, dones, info = env.step(actions_for_env)

            # add data to buffer
            buffer.push(states, states_full, actions_for_env, rewards, next_states, next_states_full, dones)

            reward_this_episode += rewards

            states = np.copy(next_states)
            states_full = np.copy(next_states_full)

            # learn as soon as the buffer holds more than one batch of samples
            # (episode_per_update is defined above but not used in this loop)
            if len(buffer) > batchsize:
                for a_i in range(number_of_agents):
                    samples = buffer.sample(batchsize)
                    maddpg.update(samples, a_i)

            if np.any(dones):
                break

        agent0_reward.append(reward_this_episode[0, 0])
        agent1_reward.append(reward_this_episode[0, 1])
        
        avg_rewards = max(reward_this_episode[0, 0], reward_this_episode[0, 1])

        scores_window.append(avg_rewards)
        cur_score = np.mean(scores_window)
        ep_scores.append(cur_score)
        
        save_dict_list = []

        if episode % print_every == 0 or avg_rewards > 2.5:
            print('\rEpisode: {}, Average score: {:.5f}, noise: {:.5f}'.format(episode, cur_score, noise))

            if avg_rewards > 2.5:
                # collect the state dicts of every agent's networks and optimizers
                for i in range(number_of_agents):
                    save_dict = {'actor_params': maddpg.maddpg_agent[i].actor.state_dict(),
                                 'actor_optim_params': maddpg.maddpg_agent[i].actor_optimizer.state_dict(),
                                 'critic_params': maddpg.maddpg_agent[i].critic.state_dict(),
                                 'critic_optim_params': maddpg.maddpg_agent[i].critic_optimizer.state_dict()}
                    save_dict_list.append(save_dict)

                # save once, after all agents have been collected
                torch.save(save_dict_list,
                           os.path.join(model_dir, 'episode-{}-{}.pt'.format(episode, cur_score)))
                print('model saved')
                # stop training only once the environment is solved
                break
    env.close()

    #print('main-ep_scores: ', ep_scores)
    fig = plt.figure()
    ax = fig.add_subplot(111)
    plt.plot(np.arange(1, len(ep_scores)+1), ep_scores)
    plt.ylabel('Score')
    plt.xlabel('Episode #')
    fig.savefig(result_dir + '/score_plot.png')
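
maddpg.update(samples, a_i) above is where the learning happens, but its body is not shown. A compressed sketch of a standard MADDPG update for one agent (centralized critic, decentralized actors), assuming samples unpacks into batched tensors of shape (batch, n_agents, ...) and that each agent also holds target_actor / target_critic networks; the actor, critic, and optimizer attribute names follow the save_dict above, everything else is assumed:

import torch
import torch.nn.functional as F


def maddpg_update(maddpg, samples, agent_i, gamma=0.99, tau=1e-3):
    """Sketch of one MADDPG learning step for agent agent_i."""
    obs, obs_full, actions, rewards, next_obs, next_obs_full, dones = samples
    agent = maddpg.maddpg_agent[agent_i]

    # critic update: regress Q(s_full, a_1..a_N) onto the TD target built
    # from all agents' target actors and this agent's target critic
    with torch.no_grad():
        target_actions = torch.cat(
            [a.target_actor(next_obs[:, i]) for i, a in enumerate(maddpg.maddpg_agent)], dim=1)
        q_next = agent.target_critic(next_obs_full, target_actions)
        q_target = (rewards[:, agent_i].unsqueeze(1)
                    + gamma * q_next * (1 - dones[:, agent_i].float().unsqueeze(1)))
    q_expected = agent.critic(obs_full, actions.view(actions.shape[0], -1))
    critic_loss = F.mse_loss(q_expected, q_target)
    agent.critic_optimizer.zero_grad()
    critic_loss.backward()
    agent.critic_optimizer.step()

    # actor update: maximize the centralized critic with respect to this
    # agent's own action; other agents' actions are detached
    current_actions = [a.actor(obs[:, i]) if i == agent_i else a.actor(obs[:, i]).detach()
                       for i, a in enumerate(maddpg.maddpg_agent)]
    actor_loss = -agent.critic(obs_full, torch.cat(current_actions, dim=1)).mean()
    agent.actor_optimizer.zero_grad()
    actor_loss.backward()
    agent.actor_optimizer.step()

    # soft-update the target networks toward the local ones
    for target, local in ((agent.target_actor, agent.actor), (agent.target_critic, agent.critic)):
        for t_param, l_param in zip(target.parameters(), local.parameters()):
            t_param.data.copy_(tau * l_param.data + (1.0 - tau) * t_param.data)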