Example no. 1
    def __init__(self, state_size, action_size, seed):
        super(MADDPG, self).__init__()

        self.maddpg_agents = [
            DDPGAgent(state_size, action_size, 1 * seed),
            DDPGAgent(state_size, action_size, 2 * seed)
        ]

        self.memory = ReplayBuffer(BUFFER_SIZE, BATCH_SIZE, seed)
Example no. 2
    def __init__(self, num_agents=2, random_seed=1):  #np.random.randint(1000)
        super(MADDPG, self).__init__()

        self.maddpg_agent = [
            DDPGAgent(24, 16, 8, 2, 52, 42, 24, random_seed),
            DDPGAgent(24, 16, 8, 2, 52, 42, 24, random_seed)
        ]

        self.num_agents = num_agents

        # Replay memory
        action_size = 2
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed)
Example no. 3
    def __init__(self,
                 num_agents,
                 local_obs_dim,
                 local_action_size,
                 global_obs_dim,
                 global_action_size,
                 discount_factor=0.95,
                 tau=0.02,
                 device=device):
        super(MADDPG, self).__init__()

        # store configuration parameters
        self.device = device
        self.discount_factor = discount_factor
        self.tau = tau
        self.num_agents = num_agents

        # create the MADDPG agents
        self.maddpg_agent = [
            DDPGAgent(num_agents,
                      local_obs_dim,
                      local_action_size,
                      global_obs_dim,
                      global_action_size,
                      device=self.device) for _ in range(num_agents)
        ]

        # iteration counter
        self.iter = 0
Example no. 4
    def __init__(self,
                 numAgents,
                 state_size,
                 action_size,
                 random_seed,
                 batch_size=BATCH_SIZE,
                 buffer_size=BUFFER_SIZE,
                 use_batch_norm=USE_BATCH_NORM):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """

        self.agents = [
            DDPGAgent(i, numAgents, state_size, action_size, random_seed,
                      use_batch_norm) for i in range(numAgents)
        ]
        self.numAgents = numAgents
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, buffer_size, batch_size,
                                   random_seed)

        self.batch_size = batch_size
Example no. 5
    def __init__(self,
                 state_size,
                 action_size,
                 numAgent,
                 random_seed,
                 epsilon=1,
                 epsilonDecay=0.995,
                 minEpsilon=0.00):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        self.epsilon = epsilon
        self.epsilonDecay = epsilonDecay
        self.minEpsilon = minEpsilon
        self.numAgent = numAgent
        self.agents = [
            DDPGAgent(state_size, action_size, random_seed)
            for i in range(numAgent)
        ]
        self.sharedMemory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                         random_seed)
Example no. 6
    def __init__(self,
                 num_agents,
                 state_size,
                 action_size,
                 hidden_layers,
                 seed,
                 gamma=GAMMA,
                 tau=TAU,
                 lr_actor=LR_ACTOR,
                 lr_critic=LR_CRITIC,
                 weight_decay=WEIGHT_DECAY,
                 buffer_size=BUFFER_SIZE,
                 batch_size=BATCH_SIZE):
        """Initialize MADDPG agent."""
        super(MADDPG, self).__init__()

        self.seed = random.seed(seed)

        self.num_agents = num_agents
        self.state_size = state_size
        self.action_size = action_size
        self.gamma = gamma
        self.tau = tau
        self.lr_actor = lr_actor
        self.lr_critic = lr_critic
        self.weight_decay = weight_decay
        self.buffer_size = buffer_size
        self.batch_size = batch_size

        self.agents = [DDPGAgent(state_size, action_size, hidden_layers, gamma,
                                 tau, lr_actor, lr_critic, weight_decay, seed)
                       for _ in range(num_agents)]

        self.replay_buffer = ReplayBuffer(num_agents, buffer_size, batch_size)
Example no. 7
    def __init__(self, state_size, action_size, n_agents, random_seed=1):
        self.actor_local = Actor(state_size, action_size, random_seed)
        self.actor_target = Actor(state_size, action_size, random_seed)

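        # every DDPGAgent below is given the same local and target actor networks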
        self.ddpg_agents = [
            DDPGAgent(state_size, action_size, self.actor_local,
                      self.actor_target, random_seed) for _ in range(n_agents)
        ]
Example no. 8
    def __init__(self, config):
        self.config = config

        if config.shared_replay_buffer:
            self.memory = config.memory()
            self.config.memory = self.memory

        self.ddpg_agents = [
            DDPGAgent(self.config) for _ in range(config.num_agents)
        ]

        self.t_step = 0
Example no. 9
    def __init__(self, state_size, action_size, num, seed):
        # shared memory for all agents
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed,
                                   device)
        # define each agent
        self.ddpg_agents = [
            DDPGAgent(state_size, action_size, seed, self.memory, device)
            for _ in range(num)
        ]

        self.t_step = 0
        self.num_agents = num
Example no. 10
def test(args):
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    env_info = env.reset(train_mode=True)[brain_name]

    num_agents = len(env_info.agents)
    print('Number of agents:', num_agents)

    # dim of each action
    action_size = brain.vector_action_space_size
    print('Size of each action:', action_size)

    # dim of the state space
    states = env_info.vector_observations
    state_size = states.shape[1]

    agent_1 = DDPGAgent(state_size, action_size)
    #agent_2 = DDPGAgent(state_size, action_size)
    agent_2 = TD3Agent(state_size, action_size)

    agent_1_path = '../results/td3_opponent/00_best_td3_model.checkpoint'
    agent_2_path = '../results/ddgp_solo/01_best_model.checkpoint'

    agent = MAAC(state_size, action_size, agent_2, agent_1, False, False)
    agent.load(agent_1_path, 0)
    agent.load(agent_2_path, 1)

    test_scores = []
    for i_episode in tqdm(range(1, 1 + args.test_n_run)):
        # initialize the scores
        scores = np.zeros(num_agents)
        env_info = env.reset(
            train_mode=True)[brain_name]  # reset the environment
        states = env_info.vector_observations  # get the current states
        dones = [False] * num_agents
        while not np.any(dones):
            actions = agent.act(states)  # select actions
            # send the actions to the environment
            env_info = env.step(actions)[brain_name]
            next_states = env_info.vector_observations  # get the next states
            rewards = env_info.rewards  # get the rewards
            dones = env_info.local_done  # see if episode has finished
            scores += rewards  # update the scores
            # roll over the states to next time step
            states = next_states

        test_scores.append(np.max(scores))

    avg_score = sum(test_scores) / len(test_scores)
    print("Test Score: {}".format(avg_score))

    return avg_score
Example no. 11
def train_ddpg_agent_job(config):
    if config.render_game:
        env = UnityEnvironment(
            file_name="./resources/Tennis_Linux/Tennis.x86_64")
    else:
        env = UnityEnvironment(
            file_name="./resources/Tennis_Linux_NoVis/Tennis.x86_64")

    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]

    # reset the environment
    env_info = env.reset(train_mode=True)[brain_name]

    # number of agents
    num_agents = len(env_info.agents)
    print('Number of agents:', num_agents)

    # size of each action
    action_size = brain.vector_action_space_size
    print('Size of each action:', action_size)

    # examine the state space
    states = env_info.vector_observations
    print(states)
    state_size = states.shape[1]
    print('There are {} agents. Each observes a state with length: {}'.format(
        states.shape[0], state_size))
    print('The state for the first agent looks like:\n', states[0])

    # Train the agent
    agent = DDPGAgent(state_size, action_size, config.random_seed,
                      config.buffer_size, config.batch_size, config.gamma,
                      config.tau, config.lr_actor, config.lr_critic,
                      config.weight_decay, config.sigma, config.actor_nn_size,
                      config.critic_nn_size, config.batch_norm,
                      config.clip_grad_norm)

    scores, avg_scores, std, save_path = train(agent, env, config.n_episodes,
                                               config.score_window_size,
                                               config.print_every,
                                               config.max_score,
                                               config.damp_exploration_noise)

    config.dump(save_path + 'config.yml')

    env.close()

    return scores, avg_scores, std
Example no. 12
    def __init__(self, n_agents, state_size, action_size, seed=0):
        super(MADDPGAgent, self).__init__()

        self.n_agents = n_agents
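        # each agent is constructed with its own seed offset (seed + i)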
        self.maddpg_agent = [
            DDPGAgent(state_size, action_size, n_agents, seed + i)
            for i in range(n_agents)
        ]

        # Replay memory
        self.memory = ReplayBuffer((n_agents, action_size), BUFFER_SIZE,
                                   BATCH_SIZE, seed)

        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

        # Noise parameters with exponential decay
        self.noise = INITIAL_NOISE
        self.noise_decay = NOISE_DECAY
Example no. 13
def run():
    env = gym.make('Pendulum-v0')
    seed = 30
    env.seed(seed)

    agent = DDPGAgent(seed=seed,
                      n_state=env.observation_space.shape[0],
                      n_action=env.action_space.shape[0])
    ''' 
    agent = Agent(state_size=env.observation_space.shape[0], 
                  action_size=env.action_space.shape[0], random_seed=seed)
    '''

    episodes_n = 2000
    steps_max = 300
    scores = []
    print_every = 100

    scores_deque = deque(maxlen=print_every)

    for i_episode in range(1, episodes_n):
        state = env.reset()
        agent.reset()
        score = 0
        done_step = 0
        for step in range(steps_max):
            action = agent.act(state)
            state_next, reward, done, meta = env.step(action)
            agent.step(state, action, reward, state_next, done)
            state = state_next
            score += reward
            done_step += 1
            if done:
                break
        scores.append(score)
        scores_deque.append(score)

        print_line(i_episode, scores_deque, end="")
        if i_episode % print_every == 0:
            print_line(i_episode, scores_deque, end="\n")

    return scores
Example no. 14
    def __init__(self, num_agents, state_size, action_size):
        """Initialize a MADDPGAgent wrapper.
        Params
        ======
            num_agents (int): the number of agents in the environment
            state_size (int): dimension of each state
            action_size (int): dimension of each action
        """
        self.num_agents = num_agents
        self.state_size = state_size
        self.action_size = action_size
        
        self.agents = [DDPGAgent(state_size, action_size, i+1) for i in range(num_agents)]

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE)
        
        # Will help to decide when to update the model weights
        self.t_step = 0
        
        # Directory where to save the model
        self.model_dir = os.getcwd() + "/saved_models"
        os.makedirs(self.model_dir, exist_ok=True)
Example no. 15
    def __init__(self, num_agents, state_size, action_size, random_seed=None):
        super(MultiAgentDDPG, self).__init__()
        """Initialize an Agent object.
        
        Params
        ======
            num_agents (int): number of agents 
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """

        self.num_agents = num_agents
        self.state_size = state_size
        self.action_size = action_size
        if random_seed is None:
            random_seed = random.randint(1, 1000)
        self.seed = random_seed
        random.seed(random_seed)

        self.maddpg_agent = [DDPGAgent(state_size=self.state_size,
                                       action_size=self.action_size,
                                       random_seed=random_seed)
                             for _ in range(self.num_agents)]

        self.iter = 0
Example no. 16
brain_name = env.brain_names[0]
brain = env.brains[brain_name]
env_info = env.reset(train_mode=True)[brain_name]
num_agents = len(env_info.agents)
action_size = brain.vector_action_space_size
state = env_info.vector_observations
state_size = state.shape[1]

# create agent
agent1 = DDPGAgent(nS=state_size,
                   nA=action_size,
                   lr_actor=0.0005,
                   lr_critic=0.0005,
                   gamma=0.99,
                   batch_size=60,
                   tau=0.001,
                   memory_length=int(1e6),
                   no_op=int(1e3),
                   net_update_rate=1,
                   std_initial=0.15,
                   std_final=0.025,
                   std_decay_frames=200000)

# create agent
agent2 = DDPGAgent(nS=state_size,
                   nA=action_size,
                   lr_actor=0.0005,
                   lr_critic=0.0005,
                   gamma=0.99,
                   batch_size=60,
                   tau=0.001,
Example no. 17
def run(env,
        device,
        episodes,
        experiment_name,
        update_rate,
        action_size,
        state_size,
        brain_name,
        epsilon_start=1.0,
        epsilon_min=0.05,
        epsilon_decay=0.995,
        max_score=30.,
        num_agents=1):

    epsilon = epsilon_start

    agent = DDPGAgent(state_space=state_size,
                      action_space=action_size,
                      buffer_size=int(1e5),
                      batch_size=512,
                      learning_rate_actor=0.001,
                      learning_rate_critic=0.001,
                      update_rate=update_rate,
                      gamma=0.995,
                      tau=0.001,
                      device=device,
                      seed=5,
                      num_agents=num_agents)
    score_window = deque(maxlen=100)
    all_scores = []
    tb_writer = SummaryWriter('{}/{}'.format('logs', experiment_name))

    for episode in range(episodes):
        agent.reset()
        scores = np.zeros(num_agents)
        dones = np.zeros((num_agents), dtype=bool)
        env_info = env.reset(train_mode=True)[brain_name]
        states = env_info.vector_observations
        while not np.any(dones):
            actions = agent.act(states, epsilon)
            actions = np.clip(actions, -1, 1)  # all actions between -1 and 1
            env_info = env.step(actions)[
                brain_name]  # send all actions to the environment
            next_states = env_info.vector_observations  # get next state (for each agent)
            rewards = env_info.rewards  # get reward (for each agent)
            dones = env_info.local_done  # see if episode finished
            scores += env_info.rewards  # update the score (for each agent)

            for state, action, reward, next_state, done in zip(
                    states, actions, rewards, next_states, dones):
                agent.step(state, action, reward, next_state, done)

            states = next_states

        episode_score = np.mean(scores)
        score_window.append(episode_score)
        all_scores.append(episode_score)

        # decay epsilon every 10 episodes
        if episode % 10 == 0:
            epsilon = max(epsilon_min, epsilon * epsilon_decay)

        print('\rEpisode: {}\tAverage Score: {}'.format(
            episode, np.mean(score_window)),
              end="")
        if episode % 100 == 0:
            tb_writer.add_scalar('Episode_Accum_score', np.mean(score_window),
                                 episode)

            print('\rEpisode: {}\tAverage Score: {}'.format(
                episode, np.mean(score_window)))
        if np.mean(score_window) >= max_score:
            torch.save(agent.actor_local_network.state_dict(),
                       'actor_checkpoint_{}.pth'.format(experiment_name))
            torch.save(agent.critic_local_network.state_dict(),
                       'critic_checkpoint_{}.pth'.format(experiment_name))
            break
Example no. 18
            env = UnityEnvironment(
                file_name="./resources/Tennis_Linux/Tennis.x86_64")
            brain_name = env.brain_names[0]
            brain = env.brains[brain_name]

            env_info = env.reset(train_mode=False)[brain_name]
            num_agents = len(env_info.agents)
            action_size = brain.vector_action_space_size

            states = env_info.vector_observations
            state_size = states.shape[1]

            agent = DDPGAgent(state_size, action_size, conf.random_seed,
                              conf.buffer_size, conf.batch_size, conf.gamma,
                              conf.tau, conf.lr_actor, conf.lr_critic,
                              conf.weight_decay, conf.sigma,
                              conf.actor_nn_size, conf.critic_nn_size,
                              conf.batch_norm, conf.clip_grad_norm)

            agent.load_weights(
                actor_weights_file=checkpoint_path + "checkpoint_actor.pth",
                critic_weights_file=checkpoint_path + "checkpoint_critic.pth")

            demo(agent, env, num_episodes=100)

        finally:
            env.close()

    else:
        raise (ValueError, f"Unknown command {command}")
Example no. 19
# State space information
states = env_info.vector_observations
state_size = states.shape[1]
print('There are {} agents. Each observes a state with length: {}'.format(
    states.shape[0], state_size))
print('The state for the first agent looks like:', states[0])

print('========================================')
# ### Training the DDPG agent to solve the environment ### #
# In the following the training takes place.
# When training the environment, setting `train_mode=True` accelerates the
# simulation environment so that training can be done much quicker.

# Make a new agent from the DDPGAgent class in 'ddpg_agent.py'
agent = DDPGAgent(state_size=state_size,
                  action_size=action_size,
                  num_agents=num_agents,
                  random_seed=1)  # , seed=random.randint(1,100000000)

# Print information about the network architecture
print('====================')
print('Actor-Critic network architecture')
print(agent.actor_local)
print(agent.critic_local)
print('====================')


# Set up a function that takes in information for training the agent for n_episodes.
def trainDDPG(n_episodes=1000, print_every=10):
    '''Deep Deterministic Policy Gradient (DDPG) agent that can be trained for a set amount of episodes.

    Keyword Arguments:
Example no. 20
    def __init__(self, random_seed, num_agents, state_size, action_size):
        self.agents = [DDPGAgent(state_size, action_size, random_seed)
                       for _ in range(num_agents)]
Example no. 21
    env_info = env.reset(train_mode=True)[brain_name]

    # number of agents
    num_agents = len(env_info.agents)
    print('Number of agents:', num_agents)

    # size of each action
    action_size = brain.vector_action_space_size
    print('Size of each action:', action_size)

    # examine the state space
    states = env_info.vector_observations
    state_size = states.shape[1]
    print('There are {} agents. Each observes a state with length: {}'.format(
        states.shape[0], state_size))
    print('The state for the first agent looks like:', states[0])

    for config in generate_grid_config():
        agent = DDPGAgent(state_size,
                          action_size,
                          ddpg_config=config,
                          num_agents=num_agents)
        agent.ddpg_actor_local.load_state_dict(
            torch.load('ddpg_weights/actor_{}.pth'.format(config)))
        agent.ddpg_critic_local.load_state_dict(
            torch.load('ddpg_weights/critic_{}.pth'.format(config)))
        evaluate_current_weights(env, agent, brain_name, num_agents, 5)
        # torch.save(agent.actor_critic.state_dict(), 'weights/{}.pth'.format(agent.ddpg_config))
        # agent.actor_critic.load_state_dict(torch.load('weights/{}.pth'.format(agent.ddpg_config)))
        # test_agent(env, agent, brain_name, num_agents)
Example no. 22
MAX_ITERATIONS = 200


def save_rewards(rewards):
    with open("reward_data.pickle", "wb") as handle:
        pickle.dump(rewards, handle)


# with tf.Session() as sess:
env = gym.make(ENV_NAME)
# print(env.observation_space.shape)
# print(env.action_space.shape)
n_states = env.observation_space.shape[0]
n_actions = env.action_space.shape[0]

agent = DDPGAgent(env)

rewards = []

for episode in range(MAX_EPISODES):
    state = env.reset()
    total_reward = 0
    for itr in range(MAX_ITERATIONS):
        action = agent.getNoisyAction(state)
        # print("##### ", action, "####")
        state_, reward, done, _ = env.step(action[0])
        agent.observe(state, action, reward, state_, done)
        state = state_
        total_reward += reward
        if done:
            break
Example no. 23
import gym
from puckworld_env import PuckWorldEnv
from ddpg_agent import DDPGAgent
from utils import learning_curve
import numpy as np

env = PuckWorldEnv()
agent = DDPGAgent(env)
data = agent.learning(max_episode_num=200, display=True)
learning_curve(data, 2, 1, #title="DDPGAgent performance on PuckWorld with continuous action space",
               x_name="episodes", y_name="rewards of episode")
Example no. 24
def train_agent(episodes=100, model='DDPG', print_every=10):

    if model.lower() == 'd4pg':
        agent = D4PGAgent()
        print('Use D4PG agent......\n')
    else:
        agent = DDPGAgent()
        print('Use default DDPG agent......\n')

    print('Batch size: ', BATCH_SIZE)
    print('Actor learning rate: ', LR_ACTOR)
    print('Critic learning rate: ', LR_CRITIC)
    print('\n')

    env = EnvWrapper(file_name=r'Reacher_Windows_x86_64\Reacher.exe',
                     train_mode=True)

    scores = []
    scores_window = deque(maxlen=100)

    for ep in range(1, episodes + 1):
        agent.reset()
        agent.states = env.reset()

        for s in range(agent.max_steps):
            agent.actions = agent.act(add_noise=True)
            agent.rewards, agent.next_states, agent.dones = env.step(
                agent.actions)
            agent.step()
            agent.states = agent.next_states

        scores.append(agent.scores.mean())
        scores_window.append(agent.scores.mean())

        if ep % print_every == 0:
            print('Episode %d, avg score: %.2f' % (ep, agent.scores.mean()))

        if np.mean(scores_window) >= 30:
            print(
                '\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'
                .format(ep - 100, np.mean(scores_window)))
            torch.save(agent.actor.state_dict(),
                       'checkpoints/reacher_%s_actor_checkpoint.pth' % model)
            torch.save(agent.critic.state_dict(),
                       'checkpoints/reacher_%s_critic_checkpoint.pth' % model)

    env.close()

    return scores, agent
Example no. 25
print('Size of each action:', action_size)

# examine the state space 
states = env_info.vector_observations
state_size = states.shape[1]
print('There are {} agents. Each observes a state with length: {}'.format(states.shape[0], state_size))
print('The state for the first agent looks like:', states[0])

# Set device between cuda:0 and cpu
torch_device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print('Device =', torch_device)

memory = ReplayBuffer(action_size, memory_params['buffer_size'],
                      memory_params['batch_size'], memory_params['seed'], torch_device)

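# every agent receives the same shared replay buffer instance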
ddpg_agents = [DDPGAgent(state_size, action_size, memory, torch_device, params)
               for _ in range(num_agents)]

ddpg_scores = train(300, 5000, ddpg_agents, ["model_ddpg_actor.pth", "model_ddpg_critic.pth"],
                    benchmark_score, rolling_n_episodes)

plot_scores(ddpg_scores, benchmark_score, rolling_n_episodes)

# Test
ddpg_agents = [DDPGAgent(state_size, action_size, memory, torch_device, params)
               for _ in range(num_agents)]

for agent in ddpg_agents:
    agent.load_weights(["model_ddpg_actor.pth", "model_ddpg_critic.pth"])

test(ddpg_agents)
Example no. 26
# Set the minimum score the agent has to reach in order to solve this task
threshold = 0.5

max_episodes = 5000
max_t = 100000
threshold = 2.0
conseq_episodes = 5
print_every = 1

mode = False

if mode:
    #train(args)
    #exit()

    agent_1 = DDPGAgent(state_size, action_size)
    #agent_2 = DDPGAgent(state_size, action_size)
    agent_2 = TD3Agent(state_size, action_size)

    agent_1_path = 'results/ddgp_solo/00_best_model.checkpoint'
    #agent_2_path = 'results/temp/new_ddpg_model.checkpoint'
    agent_2_path = 'results/temp/new_td3_model.checkpoint'

    agent = MAAC(state_size, action_size, agent_1, agent_2, False, True)
    agent.load(agent_1_path, 0)


    df = train_CollabAndCompete(env, brain_name, max_episodes, max_t, threshold,
                                conseq_episodes, print_every, agent,
                                agent_1_path, agent_2_path)

    plot_minmax(df)
Example no. 27
def main():
    test_mode = len(sys.argv) >= 2 and sys.argv[1] == "test"

    env = get_swimmer6_env()

    max_action_val = env.action_space.high[0]
    min_action_val = env.action_space.low[0]

    agent = DDPGAgent(env.state_space_dim,
                      env.action_space_dim,
                      min_action_val,
                      max_action_val,
                      hidden_layer_size=512,
                      path_to_load=MODEL_PATH)

    episode_rewards = []
    episodes_count = 1000000

    with open("avgs.log", "w") as avgs_file:
        for episode_index in range(episodes_count):
            try:
                episode_reward = 0
                state = env.reset()
                current_reward = None
                while True:
                    if test_mode:
                        action = agent.get_best_action(state)
                    else:
                        action = agent.get_action(state)

                    next_state, reward, done, _ = env.step(action)
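                    # train on the change in reward between consecutive steps;
                    # the very first step only records the reward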
                    if current_reward is None:
                        current_reward = reward
                        continue

                    reward_diff = reward - current_reward
                    current_reward = reward

                    if not test_mode:
                        agent.remember_step(
                            (state, action, next_state, reward_diff))
                        agent.learn()
                        agent.update_targets()

                    episode_reward += reward_diff

                    if done:
                        break
                    state = next_state
                episode_rewards.append(episode_reward)
                avg = np.mean(episode_rewards[-100:])
                print(
                    f"Episode #{episode_index}, reward: {episode_reward}, avg: {avg}"
                )
                avgs_file.write(str(episode_reward) + "\n")
                avgs_file.flush()

                if episode_index % 10 == 0 and not test_mode:
                    agent.save(MODEL_PATH)
            except KeyboardInterrupt:
                if not test_mode:
                    agent.save(MODEL_PATH)
                cmd = input("Input: ")
                if cmd == "1":
                    view_render(env, agent)
            except Exception as e:
                print(e)
Example no. 28
    n_hid1 = 400
    n_hid2 = 300
    lr_alpha = 1e-4
    lr_beta = 1e-3
    gamma = 0.99
    tau = 0.99

    fname = 'lunarlandercontinuous_ngames' + str(n_games) + '_memsize' + str(mem_size) + '_batchsize' + str(batch_size) + '_nhid1' + str(n_hid1)\
        + '_nhid2' + str(n_hid2) + '_lralpha' + str(lr_alpha) + '_lrbeta' + str(lr_beta) + '_gamma' + str(gamma) +\
                '_tau' + str(tau)

    figure_file = 'plots/' + fname + '.png'
    checkpoint_file = 'models/' + fname

    agent = DDPGAgent(load_checkpoint, n_states, n_actions, checkpoint_file,
                      mem_size, batch_size, n_hid1, n_hid2, lr_alpha, lr_beta,
                      gamma, tau)

    env = gym.make('LunarLanderContinuous-v2')
    if not load_checkpoint:
        scores = []
        n_to_consider = 100  # number of previous score to consider in the avg
        best_score = env.reward_range[0]
        for i in range(n_games):
            done = False
            score = 0
            agent.noise.reset()
            obs = env.reset()
            while not done:
                action = agent.choose_action(obs)
                obs_, reward, done, info = env.step(action)
Example no. 29
def main(path=''):
    """ show the environment controlled by the 20 smart agents
    Args:
       param1: (string) pathname for saved network weights

    """
    env = UnityEnvironment(file_name='Reacher_Linux/Reacher.x86_64', no_graphics=False)
    # get the default brain
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    # reset the environment
    env_info = env.reset(train_mode=False)[brain_name]
    states = env_info.vector_observations
    # number of agents
    num_agents = len(env_info.agents)
    print('Number of agents:', num_agents)
    config = Config()
    config.n_agents = num_agents
    config.gamma = 0.99
    config.state_dim = states.shape[1]
    config.action_dim = brain.vector_action_space_size
    config.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    config.seed = 42
    config.leak = 0.001
    config.tau = 1e-3
    config.hdl1 = 256
    config.hdl2 = 128
    config.hdl3 = 128
    config.lr_actor = 0.001
    config.lr_critic = 0.001
    config.batch_size = 1024
    config.weight_decay = 0.99
    config.memory_capacity = int(1e6)
    agent = DDPGAgent(config)

    agent.actor_local.load_state_dict(torch.load(path + 'checkpoint_actor.pth'))
    agent.critic_local.load_state_dict(torch.load(path + 'checkpoint_critic.pth'))
    for _ in range(3):
        episode_reward = []
        scores = np.zeros(num_agents)
        env_info = env.reset(train_mode=False)[brain_name]
        states = env_info.vector_observations
        agent.reset_noise()
        total_steps = 0
        while True:
            total_steps += 1
            actions = agent.act(states, 0, False)
            env_info = env.step(actions)[brain_name]
            next_states = env_info.vector_observations
            reward = env_info.rewards
            done = np.array(env_info.local_done)
            episode_reward.append(np.mean(reward))
            scores += reward
            states = next_states
            if np.any(done):
                print("total steps", total_steps)
                print(sum(episode_reward))
                print('average: ', np.mean(scores))
                print('min: ', np.min(np.array(episode_reward)))
                print('max: ', np.max(np.array(episode_reward)))
                break
Example no. 30
def main():
    # Instantiate the specified environment.
    env = fe.FlockingEnv(size1, dynamic="first")

    # Get environment specs
    num_states = (size1 + 1) * dim * 2
    num_actions = size1 * dim

    # Print specs
    print("Number of states: %d" % num_states)
    print("Number of actions: %d" % num_actions)
    print("-----------------------------------------")

    # Instantiate the reinforcement learning agent, which contains the actor/critic DNNs.
    #agents =[]
    #for i in range(0,size):
    agent = DDPGAgent(ob_shape=num_states, ac_shape=dim)
    #    agents.append(agent)
    # Exploration noise generator based on an Ornstein-Uhlenbeck process.
    noise = OUNoise(1)

    for i in range(episodes_num):
        print("--------Episode %d--------" % i)
        reward_per_episode = 0
        observation = env.reset_mul()
        #observation = env.reset_full()

        for j in range(steps_limit):
            if is_movie_on: env.render()

            # Select action off-policy
            state = observation
            action = np.zeros((size1, dim), dtype=np.float32)

            # get individual ob states here
            for k in range(0, size1):
                ac = agent.feed_forward_actor(
                    np.reshape(state[k], [1, num_states]))
                # print(noise.generate())
                if i % 2 == 0:
                    action[k][0] = ac[0][0] + noise.generate()
                    action[k][1] = ac[0][1] + noise.generate()
                else:
                    action[k][0] = ac[0][0] + noise.generate()
                    action[k][1] = ac[0][1] + noise.generate()
            '''
            action = agent.feed_forward_actor(np.reshape(state, [1, num_states]))
            action = np.reshape(action, [size1,dim])
            for k in range(0, size1):
                if i % 2 == 0:
                    action[k][0] += noise.generate()
                    action[k][1] += noise.generate()
            '''
            # Throw action to environment
            observation, reward, done, info = env.step_mul(action)
            #observation, reward, done, info = env.step_full(action)

            for k in range(0, size1):
                agent.add_experience(np.reshape(state[k],
                                                [num_states]), action[k],
                                     np.reshape(observation[k], [num_states]),
                                     reward[k], done)
            #action=np.reshape(action,[num_actions])
            #agent.add_experience(np.reshape(state, [num_states]), action,
            #                                np.reshape(observation, [ num_states]), reward, done)

            # Train actor/critic network
            if len(agent.replay_buffer) > MINI_BATCH_SIZE: agent.train()

            reward_per_episode = reward.sum()

            if j % 100 == 0:
                print(j, "step finished. reward=", reward_per_episode, "info=",
                      info)
                # print("action=",action,"observation=",observation)
            if (done or j == steps_limit - 1):
                print("Steps count: %d" % j)
                print("Total reward: %d" % reward_per_episode)

                env.render()
                #noise.reset()

                with open("reward_log.csv", "a") as f:
                    f.write("%d,%f\n" % (i, reward_per_episode))

                break