Example 1
def ddpg(agent_instance, print_every=100):
    scores_deque = deque(maxlen=print_every)
    scores_deque.append(0)                                      # seed the deque so np.mean() is defined on the first check
    scores = np.zeros(num_agents)                          # initialize the score (for each agent)
    n_episodes = 0
    start_time = time.time()                                    # start time for printing
    history = []

    agent_obj = [ddpg_agent.Agent(**agent_instance) for _ in range(num_agents)]  # generate list of agents for each agent instance according to number of agents

    while np.mean(scores_deque) < 0.8:
        n_episodes += 1
        env_info = env.reset(train_mode=True)[brain_name]     # reset the environment
        states = env_info.vector_observations                  # get the current state (for each agent)
        scores = np.zeros(num_agents)                          # initialize the score (for each agent)
        for agent in agent_obj:
            agent.reset()

        learn_count = 0

        while True:
            learn_count += 1
            actions = np.array([agent_obj[i].act(states[i], add_noise=True) for i in range(num_agents)])                        # select an action (for each agent)

            #env_info = env.step(actions)[brain_name]           # send all actions to the environment

            env_info = env.step(np.reshape(np.concatenate(actions, axis=0), (1, action_tensor_size)))[brain_name]           # send all actions to the environment

            next_states = env_info.vector_observations        # get next state (for each agent)
            rewards = env_info.rewards                         # get reward (for each agent)
            dones = env_info.local_done                        # see if episode finished

            for i in range(num_agents):
                agent_obj[i].step(states[i], actions[i], rewards[i], next_states[i], dones[i], learn_count, update_count)

            states = next_states                               # roll over states to next time step

            scores += rewards                         # update the score (for each agent)

            if np.any(dones):                                  # exit loop if episode finished
                break

        scores_deque.append(np.max(scores))

        history.append(np.mean(scores))

        delta_time = str(timedelta(seconds=time.time() - start_time))  # elapsed time
        count = 0
        for agent in agent_obj:
            count += 1
            torch.save(agent.actor_local.state_dict(), 'checkpoint_actor_%s.pth' % count)
            torch.save(agent.critic_local.state_dict(), 'checkpoint_critic_%s.pth' % count)

        if n_episodes % (print_every) == 0:
            print("\rEpisode: {}\tHighest Score {: .2f}\tAverage score for last {} episodes was {: .2f}\tTime: {:.9}".format(n_episodes, np.max(scores_deque), print_every, np.mean(scores_deque), delta_time))

    print("\n\rLast Episode: {}\tHighest Score {: .2f}\tAverage score for last {} episodes was {: .2f}\tTime: {:.9}".format(n_episodes, np.max(scores_deque), print_every, np.mean(scores_deque), delta_time))
    return history
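
A minimal call sketch for Example 1. It assumes the notebook globals the function reads (env, brain_name, num_agents, action_tensor_size, update_count) are already defined, that state_size and action_size were obtained from the environment as in Example 9, that matplotlib is imported as plt, and that this Agent takes the keyword arguments shown; the seed value is a placeholder.

agent_instance = {"state_size": state_size,
                  "action_size": action_size,
                  "random_seed": 0}              # placeholder constructor kwargs

history = ddpg(agent_instance, print_every=100)

# plot the per-episode mean score, mirroring the plotting code in Example 11
fig = plt.figure()
ax = fig.add_subplot(111)
plt.plot(np.arange(1, len(history) + 1), history)
plt.ylabel('Score')
plt.xlabel('Episode #')
plt.show()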
Example 2
def ddpg(agent_instance, print_every=100):

    agent_obj = [
        ddpg_agent.Agent(**agent_instance) for _ in range(num_agents)
    ]  # generate list of agents for each agent instance according to number of agents
    count = 0

    for agent in agent_obj:
        count += 1
        agent_obj[count - 1].actor_local.load_state_dict(
            torch.load('checkpoint_actor_%s.pth' % count))
        agent_obj[count - 1].critic_local.load_state_dict(
            torch.load('checkpoint_critic_%s.pth' % count))

    while True:
        env_info = env.reset(
            train_mode=False)[brain_name]  # reset the environment
        states = env_info.vector_observations  # get the current state (for each agent)
        scores = np.zeros(num_agents)  # initialize the score (for each agent)
        for agent in agent_obj:
            agent.reset()

        learn_count = 0

        while True:
            learn_count += 1
            actions = np.array([
                agent_obj[i].act(states[i], add_noise=False)
                for i in range(num_agents)
            ])  # select an action (for each agent)

            #env_info = env.step(actions)[brain_name]           # send all actions to the environment

            env_info = env.step(
                np.reshape(np.concatenate(
                    (actions[0], actions[1]),
                    axis=0), (1, action_tensor_size)))[
                        brain_name]  # send all actions to the environment

            next_states = env_info.vector_observations  # get next state (for each agent)
            dones = env_info.local_done  # see if episode finished

            states = next_states  # roll over states to next time step

            if np.any(dones):  # exit loop if episode finished
                break
Example 3
def ddpg_runner(args):

    env = args['environment']
    brain_name = args['brain_name']
    scores = []
    agent = ddpg_agent.Agent(args['agent_args'])
    achievement = args['achievement']
    achievement_length = args['achievement_length']
    scores_deque = deque(maxlen=achievement_length)

    for i_episode in range(1, args['episodes'] + 1):
        env_info = env.reset(
            train_mode=True)[brain_name]  # reset the environment
        states = env_info.vector_observations  # get the current state (for each agent)
        score = 0
        while True:
            actions = agent.act(states)
            env_info = env.step(actions)[brain_name]
            next_states = env_info.vector_observations
            rewards = env_info.rewards
            dones = env_info.local_done  # see if episode finished
            agent.step(states, actions, rewards, next_states, dones)
            states = next_states
            score += np.mean(rewards)
            if np.any(dones):
                break

        scores_deque.append(score)
        scores.append(score)
        print('\rEpisode {}\tAverage Score: {:.2f}'.format(
            i_episode, np.mean(scores_deque)),
              end="")
        if i_episode % 100 == 0:
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(
                i_episode, np.mean(scores_deque)))
            torch.save(agent.actor_local.state_dict(), 'checkpoint_actor.pth')
            torch.save(agent.critic_local.state_dict(),
                       'checkpoint_critic.pth')

        if np.mean(scores_deque) > achievement:
            torch.save(agent.actor_local.state_dict(), 'checkpoint_actor.pth')
            torch.save(agent.critic_local.state_dict(),
                       'checkpoint_critic.pth')
            return scores

    return scores
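
A sketch of the args dictionary ddpg_runner expects, inferred from the keys it reads; the values shown are placeholders. Note that agent_args is forwarded to ddpg_agent.Agent as a single positional argument, so this Agent variant presumably takes a config object rather than keyword arguments.

args = {
    'environment': env,                 # Unity environment handle
    'brain_name': brain_name,
    'agent_args': {'state_size': state_size,
                   'action_size': action_size,
                   'random_seed': 0},   # placeholder agent config
    'achievement': 30.0,                # rolling average that counts as solved
    'achievement_length': 100,          # window length for the rolling average
    'episodes': 2000,
}

scores = ddpg_runner(args)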
Example 4
def ddpg(args):
    scores_deque = deque(maxlen=args['maxlen'])
    env = args['environment']
    brain_name = args['brain_name']
    scores = []
    agent = ddpg_agent.Agent(args['agent_args'])

    for i_episode in range(1, args['episodes'] + 1):
        env_info = env.reset(
            train_mode=True)[brain_name]  # reset the environment
        states = env_info.vector_observations  # get the current state (for each agent)
        score = 0
        while True:
            actions = agent.act(states)
            env_info = env.step(actions)[brain_name]
            next_states = env_info.vector_observations
            rewards = env_info.rewards
            learning_rewards = [
                -0.0001 if r == 0 else r for r in rewards
            ]  # shape the reward for learning: small penalty whenever no reward was received
            dones = env_info.local_done  # see if episode finished
            agent.step(states, actions, learning_rewards, next_states, dones)
            states = next_states
            score += np.mean(rewards)
            if np.any(dones):
                break

        scores_deque.append(score)
        scores.append(score)
        print('\rEpisode {}\tAverage Score: {:.2f}'.format(
            i_episode, np.mean(scores_deque)))
        torch.save(agent.actor_local.state_dict(), 'checkpoint_actor.pth')
        torch.save(agent.critic_local.state_dict(), 'checkpoint_critic.pth')
        if np.mean(scores_deque) > 30:
            return scores

    return scores
Example 5
def main(args):
    env = Tennis(args.env_path)

    config = {
        'state_size': env.state_size,
        'action_size': env.action_size,
        'reward_accum_steps': 1000,
        'random_seed': 1,
        'gamma': 0.99,
        'update_cycle': 400,
        'update_times': 10,
        'buffer_size': int(1e6),
        'batch_size': 1024,
        'warm_start_size': 1024,
        'n_episode': 1000000,
        'max_t': 1000,
        'window_size': 100,
        'ckpt_prefix': 'checkpoint',
        'reset_cycle': 20000,
    }

    agents = [ddpg_agent.Agent(**config) for _ in range(env.num_agents)]

    if args.train:
        scores = maddpg(agents, env, **config)
        plot(scores, args.png_path)

    if args.show:
        for agent_i, agent in enumerate(agents):
            agent.actor_local.load_state_dict(
                torch.load('checkpoint_actor_{}.pth'.format(agent_i),
                           lambda a, b: a))
            agent.critic_local.load_state_dict(
                torch.load('checkpoint_critic_{}.pth'.format(agent_i),
                           lambda a, b: a))
        show(agents, env)
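
A minimal sketch of the command-line wiring main() relies on in Example 5. The flag names mirror the attributes the function reads (env_path, png_path, train, show); the real parser in the source project may differ.

if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--env_path', type=str, required=True)    # path to the Tennis executable
    parser.add_argument('--png_path', type=str, default='scores.png')
    parser.add_argument('--train', action='store_true')           # run MADDPG training and plot scores
    parser.add_argument('--show', action='store_true')            # load checkpoints and watch the agents
    main(parser.parse_args())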
Example 6
    return agent


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--max_minutes', type=int, default=60)
    parser.add_argument('--max_episodes', type=int, default=2000)
    parser.add_argument('--name', type=str, default='player')
    parser.add_argument('--memory_size', type=int, default=int(1e5))
    parser.add_argument('--warm_up', type=int, default=int(1e4))
    parser.add_argument('--batch_size', type=int, default=1024)
    parser.add_argument('--discount', type=float, default=0.995)
    parser.add_argument('--tau', type=float, default=1e-3)
    parser.add_argument('--gradient_clip', type=float, default=1)
    parser.add_argument('--random_process', type=str, default='gaussian')
    parser.add_argument('--random_theta', type=float, default=0.1)
    parser.add_argument('--random_std', type=float, default=1)
    parser.add_argument('--random_std_decay', type=float, default=0.999)
    parser.add_argument('--update_every', type=int, default=5)
    parser.add_argument('--update_epochs', type=int, default=5)
    parser.add_argument('--h1_size', type=int, default=256)
    parser.add_argument('--h2_size', type=int, default=256)
    parser.add_argument('--actor_lr', type=float, default=1e-3)
    parser.add_argument('--critic_lr', type=float, default=1e-4)
    args = parser.parse_args()
    params = args.__dict__

    print(f'Train agent with params: {params}')
    agent = train(ddpg_agent.Agent(gym_env, state_size, action_size, params), args.max_minutes, args.max_episodes)
    agent.save()
Example 7
        if np.mean(scores_deque) >= checkpoint_score:
            checkpt = "Episode" + str(i_episode)
            print(
                '\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'
                .format(i_episode, np.mean(scores_deque)))
            agent.checkpoint(checkpt)
            break

    return scores


# In[49]:

agent = ddpg_agent.Agent(state_size=state_size,
                         action_size=action_size,
                         random_seed=0,
                         num_envs=num_envs,
                         checkpt_folder="MultiEnvCheckPt")

# In[50]:

rr_scores = train(env=env, agent=agent)  # Multiple parallel Env

# In[51]:

plot_scores(rr_scores)  # random replay scores

# When finished, you can close the environment.

# In[6]:
Example 8
from collections import deque
import matplotlib.pyplot as plt
#%matplotlib inline
import importlib
from ddpg_agent import Agent
import ddpg_agent
import model
av_reward = deque(maxlen=100)

importlib.reload(ddpg_agent)
#importlib.reload(model)

agent_list = []
#import ipdb; ipdb.set_trace()
agent_list.append(
    ddpg_agent.Agent(state_size=24, action_size=2, random_seed=100))
for a in range(1):
    agent = ddpg_agent.Agent(state_size=24, action_size=2, random_seed=a)
    #agent.memory = agent_list[0].memory
    agent_list.append(agent)
num_episodes = 10000
max_t = 10000
training_reward_list = []
best_score = 0
for episode in range(num_episodes):
    print(episode)
    env_info = env.reset(train_mode=True)[brain_name]  # reset the environment
    states = env_info.vector_observations  # get the current state (for each agent)
    scores = np.zeros(num_agents)  # initialize the score (for each agent)
    time_step = 0
    for agent in agent_list:
Example 9
# reset the environment
env_info = env.reset(train_mode=True)[brain_name]

# number of agents
num_agents = len(env_info.agents)

# size of each action
action_size = brain.vector_action_space_size

# examine the state space
states = env_info.vector_observations
state_size = states.shape[1]

agent = ddpg_agent.Agent(num_agents=num_agents,
                         state_size=state_size,
                         action_size=action_size,
                         random_seed=31337)


def tick_simulation(actions):
    env_info = env.step(actions)[brain_name]
    return env_info.vector_observations, env_info.rewards, env_info.local_done


def plot_scores(scores, mean_scores):
    fig = plt.figure()
    ax = fig.add_subplot(111)
    plt.plot(np.arange(1, len(scores) + 1), scores)
    plt.plot(np.arange(1, len(mean_scores) + 1), mean_scores)
    plt.ylabel('Score')
    plt.xlabel('Episode #')
Example 10
config.replay_fn = lambda: Replay(
    config.action_size, buffer_size=int(1e6), batch_size=128)
config.noise_fn = lambda: OUNoise(
    config.action_size, mu=0., theta=0.15, sigma=0.1, seed=config.seed)

config.discount = 0.99
config.target_mix = 3e-3

config.max_episodes = 3000
config.max_steps = int(1e6)
config.goal_score = 1

config.CHECKPOINT_FOLDER = "MultiAgentCheckPt"

maddpg_agent = ddpg_agent.Agent(config=config)

# In[21]:

ddpg_scores, ddpg_avg_scores = train(env=env,
                                     agent=maddpg_agent,
                                     config=config)  # Multiple parallel Env

# In[26]:

plot_scores(ddpg_scores, ddpg_avg_scores)  # random replay scores

# When finished, you can close the environment.

# In[6]:
Example 11
def ddpg(agent_instance, print_every=1000):
    scores_deque0 = deque(maxlen=print_every)
    scores_deque0.append(0)
    scores_deque1 = deque(maxlen=print_every)
    scores_deque1.append(0)
    scores = np.zeros(num_agents)                          # initialize the score (for each agent)
    n_episodes = 0
    start_time = time.time()                                    # start time for printing
    history0 = []
    history1 = []


    agent_obj = [ddpg_agent.Agent(**agent_instance) for _ in range(num_agents)]  # generate list of agents for each agent instance according to number of agents
    while np.mean(scores_deque0) < 0.8:
        n_episodes += 1
        env_info = env.reset(train_mode=True)[brain_name]     # reset the environment
        states = env_info.vector_observations                 # get the current state (for each agent)
        scores = np.zeros(num_agents)                          # initialize the score (for each agent)
        for agent in agent_obj:
            agent.reset()

        learn_count = 0

        while True:
            learn_count += 1
            actions = np.array([agent_obj[i].act(states[i], i) for i in range(num_agents)])                        # select an action (for each agent)
            env_info = env.step(np.reshape(np.concatenate((actions[0], actions[1]), axis = 0), (1, 4)))[brain_name]           # send all actions to the environment

            #env_info = env.step(actions)[brain_name]           # send all actions to the environment
            next_states = env_info.vector_observations        # get next state (for each agent)
            rewards = env_info.rewards                         # get reward (for each agent)
            dones = env_info.local_done                        # see if episode finished

            #(agent_obj[i].step(states[i], actions[i], rewards[i], next_states[i], dones[i], learn_count, update_count, i) for i in range(num_agents))
            for i in range(num_agents):
                agent_obj[i].step(states, actions[i], rewards[i], next_states, dones[i], learn_count, update_count, i)

            states = next_states                               # roll over states to next time step

            scores += rewards                         # update the score (for each agent)

            #print("\rEpisode: {}\tAvg. Agent Score: {:.2f}\t{} Episode Rolling Average Score: {:.2f}\tStep: {}".format(n_episodes, np.mean(scores), print_every, np.mean(scores_deque), learn_count), end =" ")

            if np.any(dones):                                  # exit loop if episode finished
                break
        print("\rEpisode {}\tActor 1's score was {: .2f}\tActor 2's score was {: .2f}".format(n_episodes, scores[0], scores[1]), end=" ")
        scores_deque0.append(np.max(scores))
        scores_deque1.append(scores[1])


        history0.append(scores[0])
        history1.append(scores[1])

        delta_time = str(timedelta(seconds=time.time() - start_time))   #  elapsed time

        for i in range(num_agents):
            torch.save(agent_obj[i].actor_local.state_dict(), 'checkpoint_actor%s.pth' % i)
            torch.save(agent_obj[i].critic_local.state_dict(), 'checkpoint_critic%s.pth' % i)

        if n_episodes % (print_every) == 0:
            print("\n\rEpisode: {}\tActor 1s avg score was {: .2f}\t Actor 2s avg score was {: .2f}\t A1 Max {}\tTime: {:.9}".format(n_episodes, np.mean(scores_deque0), np.mean(scores_deque1), np.max(scores_deque0), delta_time))

    print("\n\rEpisode: {}\tActor 1s avg score was {: .2f}\t Actor 2s avg score was {: .2f}\t A1 Max {}\tTime: {:.9}".format(n_episodes, np.mean(scores_deque0), np.mean(scores_deque1), np.max(scores_deque0), delta_time))
    return history0, history1

agent_instance = {"state_size": state_size, "action_size": action_size, "random_seed": random_seed, "clip_constant": clip_constant}

results0, results1 = ddpg(agent_instance)
fig = plt.figure()
ax = fig.add_subplot(111)
plt.plot(np.arange(1, len(results0)+1), results0)
plt.ylabel('Score')
plt.xlabel('Episode #')
plt.show()
Example 12
# get the default brain
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

# reset the environment
env_info = env.reset(train_mode=False)[brain_name]
num_agents = len(env_info.agents)
action_size = brain.vector_action_space_size
print('Size of each action:', action_size)

# examine the state space
states = env_info.vector_observations
state_size = states.shape[1]

# create agent from checkpoint
agent = ddpga.Agent(state_size, action_size, random_seed=15)
agent.actor_local.load_state_dict(torch.load('checkpoint_actor.pth'))
agent.critic_local.load_state_dict(torch.load('checkpoint_critic.pth'))

rounds = 5
scores_all = []

for r in range(rounds):
    env_info = env.reset(train_mode=False)[brain_name]
    states = env_info.vector_observations

    scores = np.zeros(num_agents)

    while True:
        actions = agent.act(
            states, add_noise=False)  # select an action (for each agent)
Example 13
        if np.mean(scores_deque) >= breakpoint_score:
            fname += str(i_episode)
            print(
                '\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'
                .format(i_episode - 100, np.mean(scores_deque)))
            torch.save(agent.qnetwork_local.state_dict(),
                       fname + 'checkpoint.pth')
            break

    return scores


#%%

agent = ddpg_agent.Agent(state_size=state_size,
                         action_size=action_size,
                         random_seed=0)

#%%
rr_scores = train(env=env, agent=agent)  # random replay training

#%%
import matplotlib.pyplot as plt


def plot_scores(scores):
    # plot the scores
    fig = plt.figure()
    ax = fig.add_subplot(111)
    plt.plot(np.arange(len(scores)), scores)
    plt.ylabel('Score')
Example 14
                                     batch_size=args.batch_size,
                                     seed=args.seed)
    else:
        memory = DeterministicReplayBuffer(action_size=action_size,
                                           state_size=state_size,
                                           buffer_size=args.buffer_size)

    # agent
    if args.algorithm == "ddpg":
        agent = ddpg.Agent(state_size=state_size,
                           action_size=action_size,
                           seed=args.seed,
                           batch_size=args.batch_size,
                           memory=memory,
                           lr_actor=args.lr_actor,
                           lr_critic=args.lr_critic,
                           clip_critic=args.clip_critic,
                           gamma=args.gamma,
                           tau=args.tau,
                           weight_decay=args.weight_decay,
                           update_network_steps=args.update_network_steps,
                           sgd_epoch=args.sgd_epoch,
                           checkpoint_prefix=args.checkpoint_prefix)
    else:
        agent = ppo.Agent(state_size=state_size,
                          action_size=action_size,
                          seed=args.seed,
                          batch_size=args.batch_size,
                          memory=memory,
                          lr_actor=args.lr_actor,
                          gamma=args.gamma,
                          eps=args.eps,
Example 15
from collections import deque
import matplotlib.pyplot as plt
#%matplotlib inline
import importlib
from ddpg_agent import Agent
import ddpg_agent
import model

av_reward = deque(maxlen=100)

importlib.reload(ddpg_agent)
#importlib.reload(model)

agent_list = []
#import ipdb; ipdb.set_trace()
agent_list.append(
    ddpg_agent.Agent(state_size=33, action_size=action_size, random_seed=100))
for a in range(19):
    agent = ddpg_agent.Agent(state_size=33,
                             action_size=action_size,
                             random_seed=a)
    agent.memory = agent_list[0].memory
    agent_list.append(agent)
num_episodes = 150
max_t = 10000
training_reward_list = []
best_score = 0
for episode in range(num_episodes):
    print(episode)
    env_info = env.reset(train_mode=True)[brain_name]  # reset the environment
    states = env_info.vector_observations  # get the current state (for each agent)
    scores = np.zeros(num_agents)  # initialize the score (for each agent)
Example 16
            break

    return scores


import sys
Reacher_path = sys.argv[1]
env_20 = Reacher(Reacher_path)

import ddpg_agent
reward_accum_steps = 20
agent_20 = ddpg_agent.Agent(state_size=33,
                            action_size=4,
                            random_seed=1,
                            gamma=0.99,
                            update_cycle=reward_accum_steps * 20,
                            update_times=reward_accum_steps * 20 // 40,
                            buffer_size=int(1e6),
                            batch_size=1024,
                            warm_start_size=1024)

scores = ddpg(agent_20,
              env_20,
              1000,
              is_20=True,
              ckpt_prefix='checkpoint_20',
              reward_accum_steps=reward_accum_steps)

fig = plt.figure()
ax = fig.add_subplot(111)
plt.plot(np.arange(1, len(scores) + 1), scores)