Example #1
def testAgent():
    print("Testing the Agent")
    agent = DDPGAgent(state_size=state_size,
                      action_size=action_size,
                      n_agents=n_agents,
                      seed=48,
                      train=False)
    env_info = env.reset(train_mode=False)[brain_name]  # reset the environment
    states = env_info.vector_observations  # get the current state
    score = np.zeros(n_agents)  # initialize the score
    while True:
        actions = agent.act(states)  # select an action
        env_info = env.step(actions)[
            brain_name]  # send the action to the environment
        next_states = env_info.vector_observations  # get the next state
        rewards = env_info.rewards  # get the reward
        dones = env_info.local_done  # see if episode has finished
        score += np.array(rewards)  # update the score
        states = next_states  # roll over the state to next time step
        if np.any(dones):  # exit loop if episode finished
            break
    print("Score: {}".format(np.mean(score)))
    return score
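The function above assumes that env, brain_name, state_size, action_size and n_agents already exist at module level. A minimal sketch of that setup with the Unity ML-Agents (unityagents) API; the executable path and the choice of environment are assumptions, not part of the original example:

import numpy as np
from unityagents import UnityEnvironment

# Hypothetical setup for testAgent(); the file_name path is illustrative only.
env = UnityEnvironment(file_name="Reacher_Linux/Reacher.x86_64")
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

env_info = env.reset(train_mode=False)[brain_name]
n_agents = len(env_info.agents)                      # number of parallel agents
state_size = env_info.vector_observations.shape[1]   # observation size per agent
action_size = brain.vector_action_space_size         # continuous action size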
Example #2
                      mem_size=50000,
                      n_actions=1,
                      batch_size=64)

    train_score_history = []
    avg_train_score_history = []
    test_score_history = []
    avg_test_score_history = []

    for i in range(5000):
        obs = env.reset()
        done = False
        train_score = 0

        while not done:
            act = agent.act(obs)
            new_state, reward, done, _ = env.step(act)
            agent.record(obs, act, reward, new_state, done)
            agent.learn()
            train_score += reward
            obs = new_state

        train_score_history.append(train_score)
        avg_train_score_history.append(np.mean(train_score_history[-100:]))
        print('episode %s score %d last 100 games avg reward %.2f' %
              (i, train_score, float(avg_train_score_history[-1])))

        # testing
        if i % 10 == 0:
            test_score_list = []
            for j in range(3):
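The listing is cut off inside the test block. A sketch of how the inner loop might continue, mirroring the training loop above and filling the test-history lists declared earlier; treating agent.act as greedy at evaluation time is an assumption:

                # Hypothetical continuation of the truncated test loop.
                obs = env.reset()
                done = False
                test_score = 0
                while not done:
                    act = agent.act(obs)  # assumed to act greedily during evaluation
                    obs, reward, done, _ = env.step(act)
                    test_score += reward
                test_score_list.append(test_score)
            test_score_history.append(np.mean(test_score_list))
            avg_test_score_history.append(np.mean(test_score_history[-100:]))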
Example #3
def main_single_agent():
    env = UnityEnvironment(file_name="Tennis_Linux/Tennis.x86_64",
                           worker_id=1,
                           seed=1)
    env_date = str(datetime.datetime.now())
    file_path = os.path.join('data_single', env_date)

    os.makedirs(file_path, exist_ok=True)
    save_config(file_path)

    brain_name = env.brain_names[0]

    buffer = ReplayBuffer(Config.buffer_size)
    agent = DDPGAgent(in_actor=48,
                      hidden_in_actor=Config.actor_hidden[0],
                      hidden_out_actor=Config.actor_hidden[1],
                      out_actor=2,
                      in_critic=50,
                      hidden_in_critic=Config.critic_hidden[0],
                      hidden_out_critic=Config.critic_hidden[1],
                      lr_actor=Config.actor_lr,
                      lr_critic=Config.critic_lr,
                      noise_dist=Config.noise_distribution,
                      checkpoint_path=Config.checkpoint_path)

    agent_reward, all_rewards_mean = [], []
    batchsize = Config.batchsize
    max_reward = Config.max_reward
    # amplitude of the exploration (OU) noise;
    # decayed linearly toward Config.min_noise inside the episode loop
    noise = Config.noise_beginning

    logger = logging.getLogger('Tennis MADDPG')
    all_rewards = []
    for episode in range(Config.n_episodes):
        reward_this_episode = 0
        env_info = env.reset(train_mode=True)[brain_name]
        states = torch.from_numpy(np.concatenate(env_info.vector_observations)
                                  )  # get the current state (for each agent)
        scores = np.zeros(2)  # initialize the score (for each agent)
        n_of_steps = 0
        noise = max(
            Config.min_noise,
            Config.noise_beginning * (1 - episode / Config.n_episodes))
        while True:
            n_of_steps += 1

            states_tensor = states.float()  # states is already a tensor; just cast to float32
            actions = agent.act(states_tensor, noise=noise)
            actions_array = actions.detach().numpy()
            actions_for_env = np.clip(actions_array, -1,
                                      1)  # all actions between -1 and 1

            env_info = env.step(np.array([
                actions_for_env, actions_for_env
            ]))[brain_name]  # send all actions to the environment

            states_next = torch.from_numpy(
                np.concatenate(env_info.vector_observations))

            # if Config.replay_buffer_raward_min is set, only store transitions once the
            # episode reward has reached that minimum; otherwise store every transition
            reward = np.sum(np.array(env_info.rewards))
            reward_this_episode += reward
            if (not Config.replay_buffer_raward_min
                    or reward_this_episode >= Config.replay_buffer_raward_min):
                buffer_data = (states, torch.from_numpy(actions_for_env),
                               reward, states_next, env_info.local_done[0])
                buffer.push(buffer_data)

            dones = env_info.local_done  # see if episode finished
            scores += np.array(env_info.rewards)  # update the score (for each agent)
            states = states_next  # roll over states to next time step
            if np.any(dones):  # exit loop if episode finished
                break

        all_rewards.append(reward_this_episode)
        all_rewards_mean.append(np.mean(all_rewards[-100:]))
        if len(buffer) > Config.warmup:
            agent.update(buffer,
                         batchsize=batchsize,
                         tau=Config.tau,
                         discount=Config.discount_factor)
            if episode % Config.update_episode_n == 0:
                agent.update_targets(tau=Config.tau)

        if (episode + 1) % 100 == 0 or episode == Config.n_episodes - 1:
            logger.info(
                f'Episode {episode}:  Average reward over 100 episodes is {all_rewards_mean[-1]}'
            )
            if all_rewards_mean and all_rewards_mean[-1] > max_reward:
                logger.info('Found best model. Saving model into file: ...')

                save_dict_list = []
                save_dict = {
                    'actor_params': agent.actor.state_dict(),
                    'actor_optim_params': agent.actor_optimizer.state_dict(),
                    'critic_params': agent.critic.state_dict(),
                    'critic_optim_params': agent.critic_optimizer.state_dict()
                }

                save_dict_list.append(save_dict)
                save_dict_list.append(save_dict)

                torch.save(
                    save_dict_list,
                    os.path.join(file_path, 'episode-{}.pt'.format(episode)))
                max_reward = all_rewards_mean[-1]
            plt.plot(all_rewards_mean)
            plt.xlabel('N of episodes')
            plt.ylabel('Reward')
            plt.title(
                'Final rewards of single agent for tennis collaboration task')
            plt.savefig(os.path.join(file_path, 'result_plot.png'))

    save_dict = {
        'actor_params': agent.actor.state_dict(),
        'actor_target_params': agent.target_actor.state_dict(),
        'actor_optim_params': agent.actor_optimizer.state_dict(),
        'critic_params': agent.critic.state_dict(),
        'critic_target_params': agent.target_critic.state_dict(),
        'critic_optim_params': agent.critic_optimizer.state_dict()
    }

    torch.save(save_dict,
               os.path.join(file_path, 'episode-{}.pt'.format(episode)))
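main_single_agent reads all of its hyperparameters from a Config object defined elsewhere in the project. A hypothetical sketch of such a class, listing only the attributes referenced above; every value is illustrative, and the misspelled replay_buffer_raward_min is kept so the names match the code:

class Config:
    # Hypothetical values; only the attribute names come from the example above.
    buffer_size = int(1e6)
    batchsize = 256
    warmup = 1000                     # minimum buffer size before updates start
    n_episodes = 5000
    actor_hidden = (256, 128)
    critic_hidden = (256, 128)
    actor_lr = 1e-4
    critic_lr = 1e-3
    tau = 1e-3                        # soft-update coefficient for the target networks
    discount_factor = 0.99
    update_episode_n = 1              # update target networks every n episodes
    noise_distribution = 'normal'
    noise_beginning = 1.0
    min_noise = 0.05
    max_reward = 0.5                  # checkpoint once the 100-episode average exceeds this
    replay_buffer_raward_min = None   # spelling kept to match the attribute used in the code
    checkpoint_path = None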
Example #4
                      in_critic=50,
                      hidden_in_critic=Config.critic_hidden[0],
                      hidden_out_critic=Config.critic_hidden[1],
                      lr_actor=Config.actor_lr,
                      lr_critic=Config.critic_lr,
                      noise_dist=Config.noise_distribution,
                      checkpoint_path=Config.checkpoint_path)

    for episode in range(args.n_episodes):
        env_info = env.reset(train_mode=False)[brain_name]
        states = torch.from_numpy(np.concatenate(env_info.vector_observations)
                                  )  # get the current state (for each agent)
        scores = np.zeros(2)  # initialize the score (for each agent)
        while True:
            states_tensor = states.float()  # states is already a tensor; just cast to float32
            actions = agent.act(states_tensor, noise=0)
            actions_array = actions.detach().numpy()
            actions_for_env = np.clip(actions_array, -1,
                                      1)  # all actions between -1 and 1

            env_info = env.step(np.array([
                actions_for_env, actions_for_env
            ]))[brain_name]  # send all actions to the environment

            states_next = torch.from_numpy(
                np.concatenate(env_info.vector_observations))

            # if replay_buffer_reward_min is defined, add to replay buffer only the observations higher than min_reward
            reward = np.sum(np.array(env_info.rewards))
            dones = env_info.local_done  # see if episode finished
            scores += np.array(env_info.rewards)  # update the score (for each agent)
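Example #4 is truncated at this point; based on the identical loop body in Example #3, it presumably continues like this (a sketch, not part of the original listing):

            states = states_next  # roll over states to next time step
            if np.any(dones):  # exit loop if episode finished
                break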
Example #5
def main(args):
    env = gym.make(args.env)
    outdir = '/tmp/ddpg'
    env = wrappers.Monitor(env, outdir, force=True)

    assert (env.action_space.high == -env.action_space.low
            ).all(), 'action_space bound should be symmetric'
    assert (env.action_space.high == env.action_space.high[0]
            ).all(), 'all action dims should have the same bound'

    agent = DDPGAgent(env.observation_space.shape[0],
                      env.action_space.shape[0],
                      float(env.action_space.high[0]))
    optimizer = DDPGOptimizer(agent, args.capacity, args.batch_size,
                              args.gamma, args.tau, args.init_lr,
                              args.weight_decay, args.crayon_vis)

    for episode in range(args.num_episode):
        agent.ou_noise.reset()
        state = env.reset().astype(NUMPY_PRECISION)

        running_loss = 0.
        training_total_reward = 0.
        for step in count():
            action = agent.noisy_act(state)

            next_state, reward, done, _ = env.step(action)
            # env.render()

            state, action, reward, next_state = map(
                lambda x: NUMPY_PRECISION(x),
                (state, action, reward, next_state))

            if done:
                next_state = None
            optimizer.memory.push_back(SARS(state, action, reward, next_state))

            if optimizer.memory.trainable:
                loss = optimizer.step()
                running_loss += loss

            state = next_state
            training_total_reward += reward

            if done:
                if args.crayon_vis:
                    optimizer.stats.add_scalar_value('average loss',
                                                     running_loss / step)
                    optimizer.stats.add_scalar_value('step', step)
                    optimizer.stats.add_scalar_value('training total reward',
                                                     training_total_reward)
                break

        if episode % 100 == 99:
            total_reward = 0.

            for _ in range(args.num_test):  # avoid shadowing the built-in eval
                # agent.ou_noise.reset()
                state = env.reset().astype(NUMPY_PRECISION)

                for step in count():
                    # action = agent.noisy_act(state)
                    action = agent.act(state)
                    # print(action)

                    next_state, reward, done, _ = env.step(action)
                    env.render()

                    state, action, reward, next_state = map(
                        lambda x: NUMPY_PRECISION(x),
                        (state, action, reward, next_state))

                    state = next_state
                    total_reward += reward

                    if done:
                        break
            print('[Eval] {}th episode, total reward: {}, average reward: {}'.
                  format(episode, total_reward, total_reward / args.num_test))

    env.close()
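main(args) expects an argparse namespace. A sketch of a matching parser that covers only the flags the function reads; the default values are assumptions:

import argparse

def parse_args():
    # Hypothetical parser for main(args); defaults are illustrative only.
    parser = argparse.ArgumentParser(description='DDPG on a continuous-control Gym task')
    parser.add_argument('--env', default='Pendulum-v0')
    parser.add_argument('--num_episode', type=int, default=1000)
    parser.add_argument('--num_test', type=int, default=10)
    parser.add_argument('--capacity', type=int, default=1000000)  # replay buffer capacity
    parser.add_argument('--batch_size', type=int, default=64)
    parser.add_argument('--gamma', type=float, default=0.99)
    parser.add_argument('--tau', type=float, default=1e-3)
    parser.add_argument('--init_lr', type=float, default=1e-3)
    parser.add_argument('--weight_decay', type=float, default=0.0)
    parser.add_argument('--crayon_vis', action='store_true')      # enable Crayon logging
    return parser.parse_args()

if __name__ == '__main__':
    main(parse_args())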