Example 1
def train_agent(episodes=100, model='DDPG', print_every=10):

    if model.lower() == 'd4pg':
        agent = D4PGAgent()
        print('Use D4PG agent......\n')
    else:
        agent = DDPGAgent()
        print('Use default DDPG agent......\n')

    print('Batch size: ', BATCH_SIZE)
    print('Actor learning rate: ', LR_ACTOR)
    print('Critic learning rate: ', LR_CRITIC)
    print('\n')

    env = EnvWrapper(file_name=r'Reacher_Windows_x86_64\Reacher.exe',
                     train_mode=True)

    scores = []
    scores_window = deque(maxlen=100)

    for ep in range(1, episodes + 1):
        agent.reset()
        agent.states = env.reset()

        for s in range(agent.max_steps):
            agent.actions = agent.act(add_noise=True)
            agent.rewards, agent.next_states, agent.dones = env.step(
                agent.actions)
            agent.step()
            agent.states = agent.next_states

        scores.append(agent.scores.mean())
        scores_window.append(agent.scores.mean())

        if ep % print_every == 0:
            print('Episode %d, avg score: %.2f' % (ep, agent.scores.mean()))

        if np.mean(scores_window) >= 30:
            print(
                '\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'
                .format(ep - 100, np.mean(scores_window)))
            torch.save(agent.actor.state_dict(),
                       'checkpoints/reacher_%s_actor_checkpoint.pth' % model)
            torch.save(agent.critic.state_dict(),
                       'checkpoints/reacher_%s_critic_checkpoint.pth' % model)
            break  # stop training once the 100-episode average reaches 30

    env.close()

    return scores, agent
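A minimal usage sketch for train_agent, assuming the agent classes, the hyperparameter constants, and matplotlib are importable in the same session; the episode count and model choice are illustrative.

import matplotlib.pyplot as plt

# Train the D4PG variant and plot the per-episode score averaged over the 20 arms.
scores, agent = train_agent(episodes=200, model='d4pg', print_every=10)

plt.plot(range(1, len(scores) + 1), scores)
plt.xlabel('Episode')
plt.ylabel('Average score over agents')
plt.show()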
Example 2
def run():
    env = gym.make('Pendulum-v0')
    seed = 30
    env.seed(seed)

    agent = DDPGAgent(seed=seed,
                      n_state=env.observation_space.shape[0],
                      n_action=env.action_space.shape[0])
    ''' 
    agent = Agent(state_size=env.observation_space.shape[0], 
                  action_size=env.action_space.shape[0], random_seed=seed)
    '''

    episodes_n = 2000
    steps_max = 300
    scores = []
    print_every = 100

    scores_deque = deque(maxlen=print_every)

    for i_episode in range(1, episodes_n + 1):
        state = env.reset()
        agent.reset()
        score = 0
        done_step = 0
        for step in range(steps_max):
            action = agent.act(state)
            state_next, reward, done, meta = env.step(action)
            agent.step(state, action, reward, state_next, done)
            state = state_next
            score += reward
            done_step += 1
            if done:
                break
        scores.append(score)
        scores_deque.append(score)

        print_line(i_episode, scores_deque, end="")
        if i_episode % print_every == 0:
            print_line(i_episode, scores_deque, end="\n")

    return scores
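print_line is not defined in this example; a plausible minimal implementation, inferred only from how it is called above, could look like the following.

import numpy as np

def print_line(i_episode, scores_deque, end="\n"):
    # Print the running average over the most recent scores; end="" keeps
    # rewriting the same console line via the carriage return.
    print('\rEpisode {}\tAverage Score: {:.2f}'.format(
        i_episode, np.mean(scores_deque)), end=end)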
Example 3
def main(path=''):
    """ show the environment controlled by the 20 smart agents
    Args:
       param1: (string) pathname for saved network weights

    """
    env = UnityEnvironment(file_name='Reacher_Linux/Reacher.x86_64', no_graphics=False)
    # get the default brain
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    # reset the environment
    env_info = env.reset(train_mode=False)[brain_name]
    states = env_info.vector_observations
    # number of agents
    num_agents = len(env_info.agents)
    print('Number of agents:', num_agents)
    config = Config()
    config.n_agents = num_agents
    config.gamma = 0.99
    config.state_dim = states.shape[1]
    config.action_dim = brain.vector_action_space_size
    config.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    config.seed = 42
    config.leak = 0.001
    config.tau = 1e-3
    config.hdl1 = 256
    config.hdl2 = 128
    config.hdl3 = 128
    config.lr_actor = 0.001
    config.lr_critic = 0.001
    config.batch_size = 1024
    config.weight_decay = 0.99
    config.memory_capacity = int(1e6)
    agent = DDPGAgent(config)

    agent.actor_local.load_state_dict(torch.load(path + 'checkpoint_actor.pth', map_location=config.device))
    agent.critic_local.load_state_dict(torch.load(path + 'checkpoint_critic.pth', map_location=config.device))
    for _ in range(3):
        episode_reward = []
        scores = np.zeros(num_agents)
        env_info = env.reset(train_mode=False)[brain_name]
        states = env_info.vector_observations
        agent.reset_noise()
        total_steps = 0
        while True:
            total_steps += 1
            actions = agent.act(states, 0, False)
            env_info = env.step(actions)[brain_name]
            next_states = env_info.vector_observations
            reward = env_info.rewards
            done = np.array(env_info.local_done)
            episode_reward.append(np.mean(reward))
            scores += reward
            states = next_states
            if np.any(done):
                print("total steps", total_steps)
                print(sum(episode_reward))
                print('average: ', np.mean(scores))
                print('min: ', np.min(np.array(episode_reward)))
                print('max: ', np.max(np.array(episode_reward)))
                break
Example 4
def run(env,
        device,
        episodes,
        experiment_name,
        update_rate,
        action_size,
        state_size,
        brain_name,
        epsilon_start=1.0,
        epsilon_min=0.05,
        epsilon_decay=0.995,
        max_score=30.,
        num_agents=1):

    epsilon = epsilon_start

    agent = DDPGAgent(state_space=state_size,
                      action_space=action_size,
                      buffer_size=int(1e5),
                      batch_size=512,
                      learning_rate_actor=0.001,
                      learning_rate_critic=0.001,
                      update_rate=update_rate,
                      gamma=0.995,
                      tau=0.001,
                      device=device,
                      seed=5,
                      num_agents=num_agents)
    score_window = deque(maxlen=100)
    all_scores = []
    tb_writer = SummaryWriter('{}/{}'.format('logs', experiment_name))

    for episode in range(episodes):
        agent.reset()
        scores = np.zeros(num_agents)
        dones = np.zeros((num_agents), dtype=bool)
        env_info = env.reset(train_mode=True)[brain_name]
        states = env_info.vector_observations
        while not np.any(dones):
            actions = agent.act(states, epsilon)
            actions = np.clip(actions, -1, 1)  # all actions between -1 and 1
            env_info = env.step(actions)[
                brain_name]  # send all actions to the environment
            next_states = env_info.vector_observations  # get next state (for each agent)
            rewards = env_info.rewards  # get reward (for each agent)
            dones = env_info.local_done  # see if episode finished
            scores += env_info.rewards  # update the score (for each agent)

            for state, action, reward, next_state, done in zip(
                    states, actions, rewards, next_states, dones):
                agent.step(state, action, reward, next_state, done)

            states = next_states

        episode_score = np.mean(scores)
        score_window.append(episode_score)
        all_scores.append(episode_score)

        # decay epsilon every 10 episodes
        if episode % 10 == 0:
            epsilon = max(epsilon_min, epsilon * epsilon_decay)

        print('\rEpisode: {}\tAverage Score: {:.2f}'.format(
            episode, np.mean(score_window)),
              end="")
        if episode % 100 == 0:
            tb_writer.add_scalar('Episode_Accum_score', np.mean(score_window),
                                 episode)

            print('\rEpisode: {}\tAverage Score: {:.2f}'.format(
                episode, np.mean(score_window)))
        if np.mean(score_window) >= max_score:
            torch.save(agent.actor_local_network.state_dict(),
                       'actor_checkpoint_{}.pth'.format(experiment_name))
            torch.save(agent.critic_local_network.state_dict(),
                       'critic_checkpoint_{}.pth'.format(experiment_name))
            break

    tb_writer.close()
    return all_scores
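One possible way to invoke run(), borrowing the UnityEnvironment setup used in Examples 3 and 5; the executable path, import, device choice, update rate, and episode budget are assumptions.

from unityagents import UnityEnvironment
import torch

env = UnityEnvironment(file_name='Reacher_Linux/Reacher.x86_64', no_graphics=True)
brain_name = env.brain_names[0]
brain = env.brains[brain_name]
env_info = env.reset(train_mode=True)[brain_name]

# Train until the 100-episode average reaches max_score (30 by default).
scores = run(env=env,
             device=torch.device('cuda:0' if torch.cuda.is_available() else 'cpu'),
             episodes=500,
             experiment_name='reacher_ddpg',
             update_rate=20,
             action_size=brain.vector_action_space_size,
             state_size=env_info.vector_observations.shape[1],
             brain_name=brain_name,
             num_agents=len(env_info.agents))
env.close()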
Example 5
def main(arg):
    """

     Args:
         param1: (args)
    """
    env = UnityEnvironment(file_name='Reacher_Linux/Reacher.x86_64', no_graphics=True, seed=arg.seed)
    # get the default brain
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    
    # reset the environment
    env_info = env.reset(train_mode=True)[brain_name]
 
    # number of agents
    num_agents = len(env_info.agents)
    print('Number of agents:', num_agents)
    states = env_info.vector_observations
    
    print('Size of each action:', brain.vector_action_space_size)
    print(states.shape[1])
    
    epsilon = arg.epsilon
    epsilon_min = arg.epsilon_min
    epsilon_decay = arg.epsilon_decay
    config = Config()
    config.state_dim = states.shape[1]
    config.action_dim = brain.vector_action_space_size
    config.n_agents = num_agents
    set_config(config, arg)
    t_0 = time.time()
    n_episodes = arg.n_episodes
    train_every = arg.train_every
    scores_window = deque(maxlen=100)  # last 100 scores
    agent = DDPGAgent(config)
    scores = []
    print("Start training")
    for i_episode in range(1, n_episodes+1):
        env_info = env.reset(train_mode=True)[brain_name]
        states = env_info.vector_observations                  # get the current state (for each agent)
        agent.reset_noise()
        episode_reward = np.zeros(num_agents)
        for t in range(arg.t_max):
            actions = agent.act(states, epsilon)
            env_info = env.step(actions)[brain_name]
            next_states = env_info.vector_observations         # get next state (for each agent)
            rewards = env_info.rewards                         # get reward (for each agent)
            dones = env_info.local_done                        # see if episode finished
            episode_reward += np.array(env_info.rewards)       # update the score (for each agent)
            for state, action, reward, next_state, done in zip(states, actions, rewards, next_states, dones):
                agent.memory.add(state, action, reward, next_state, done)
            states = next_states
            if t % train_every == 0:
                for _ in range(arg.repeat_learning):
                    agent.learn()
            
            if np.any(dones):
                break
        epsilon = epsilon * epsilon_decay   
        epsilon = max(epsilon_min, epsilon)
        scores_window.append(np.mean(episode_reward))
        scores.append(np.mean(episode_reward))
        duration = time.time() - t_0
        sec = duration % 60
        minutes = duration // 60
        print('\rEpisode {}\tAverage Score (last 100): {:.2f}\tScore: {:.2f}\tTime: {:.0f} min {:.0f} sec'.format(i_episode, np.mean(scores_window), np.mean(episode_reward), minutes, sec))
        if np.mean(scores_window) >= 30:
            print("Enviroment solved save smart agent")
            torch.save(agent.actor_local.state_dict(), 'checkpoint_actor.pth')
            torch.save(agent.critic_local.state_dict(), 'checkpoint_critic.pth')
            break
    return scores
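The arg object passed to main is a parsed-arguments namespace; below is a minimal argparse sketch covering only the attributes read directly in this example (set_config may consume additional fields). The defaults shown are illustrative, not the values used by the original author.

import argparse

parser = argparse.ArgumentParser(description='Train DDPG on the Unity Reacher environment')
parser.add_argument('--seed', type=int, default=42)
parser.add_argument('--epsilon', type=float, default=1.0)
parser.add_argument('--epsilon_min', type=float, default=0.05)
parser.add_argument('--epsilon_decay', type=float, default=0.995)
parser.add_argument('--n_episodes', type=int, default=300)
parser.add_argument('--train_every', type=int, default=20)
parser.add_argument('--repeat_learning', type=int, default=10)
parser.add_argument('--t_max', type=int, default=1000)

scores = main(parser.parse_args())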