Example #1
def main_single_agent():
    env = UnityEnvironment(file_name="Tennis_Linux/Tennis.x86_64",
                           worker_id=1,
                           seed=1)
    env_date = str(datetime.datetime.now())
    file_path = os.path.join('data_single', env_date)

    os.makedirs(file_path, exist_ok=True)
    save_config(file_path)

    brain_name = env.brain_names[0]

    buffer = ReplayBuffer(Config.buffer_size)
    agent = DDPGAgent(in_actor=48,
                      hidden_in_actor=Config.actor_hidden[0],
                      hidden_out_actor=Config.actor_hidden[1],
                      out_actor=2,
                      in_critic=50,
                      hidden_in_critic=Config.critic_hidden[0],
                      hidden_out_critic=Config.critic_hidden[1],
                      lr_actor=Config.actor_lr,
                      lr_critic=Config.critic_lr,
                      noise_dist=Config.noise_distribution,
                      checkpoint_path=Config.checkpoint_path)

    agent_reward, all_rewards_mean = [], []
    batchsize = Config.batchsize
    max_reward = Config.max_reward
    # amplitude of OU noise
    # this slowly decreases to 0
    noise = Config.noise_beginning

    logger = logging.getLogger('Tennis MADDPG')
    all_rewards = []
    for episode in range(Config.n_episodes):
        reward_this_episode = 0
        env_info = env.reset(train_mode=True)[brain_name]
        states = torch.from_numpy(np.concatenate(env_info.vector_observations)
                                  )  # get the current state (for each agent)
        scores = np.zeros(2)  # initialize the score (for each agent)
        n_of_steps = 0
        noise = max(
            Config.min_noise,
            Config.noise_beginning * (1 - episode / Config.n_episodes))
        while True:
            n_of_steps += 1

            states_tensor = torch.tensor(states).float()
            actions = agent.act(states_tensor, noise=noise)
            actions_array = actions.detach().numpy()
            actions_for_env = np.clip(actions_array, -1,
                                      1)  # all actions between -1 and 1

            env_info = env.step(np.array([
                actions_for_env, actions_for_env
            ]))[brain_name]  # send all actions to the environment

            states_next = torch.from_numpy(
                np.concatenate(env_info.vector_observations))

            # if Config.replay_buffer_raward_min is set, only store transitions once the episode reward has reached that minimum
            reward = np.sum(np.array(env_info.rewards))
            reward_this_episode += reward
            if (not Config.replay_buffer_raward_min
                    or reward_this_episode >= Config.replay_buffer_raward_min):
                buffer_data = (states, torch.from_numpy(actions_for_env),
                               reward, states_next, env_info.local_done[0])
                buffer.push(buffer_data)

            dones = env_info.local_done  # see if episode finished
            scores += np.sum(
                env_info.rewards)  # update the score (for each agent)
            states = states_next  # roll over states to next time step
            if np.any(dones):  # exit loop if episode finished
                break

        all_rewards.append(reward_this_episode)
        all_rewards_mean.append(np.mean(all_rewards[-100:]))
        if len(buffer) > Config.warmup:
            agent.update(buffer,
                         batchsize=batchsize,
                         tau=Config.tau,
                         discount=Config.discount_factor)
            if episode % Config.update_episode_n == 0:
                agent.update_targets(tau=Config.tau)

        if (episode + 1) % 100 == 0 or episode == Config.n_episodes - 1:
            logger.info(
                f'Episode {episode}:  Average reward over 100 episodes is {all_rewards_mean[-1]}'
            )
            if all_rewards_mean and all_rewards_mean[-1] > max_reward:
                logger.info('Found best model. Saving model into file: ...')

                save_dict_list = []
                save_dict = {
                    'actor_params': agent.actor.state_dict(),
                    'actor_optim_params': agent.actor_optimizer.state_dict(),
                    'critic_params': agent.critic.state_dict(),
                    'critic_optim_params': agent.critic_optimizer.state_dict()
                }

                save_dict_list.append(save_dict)
                save_dict_list.append(save_dict)

                torch.save(
                    save_dict_list,
                    os.path.join(file_path, 'episode-{}.pt'.format(episode)))
                max_reward = all_rewards_mean[-1]
            plt.plot(all_rewards_mean)
            plt.xlabel('N of episodes')
            plt.ylabel('Reward')
            plt.title(
                'Final rewards of single agent for tennis collaboration task')
            plt.savefig(os.path.join(file_path, 'result_plot.png'))

    save_dict = {
        'actor_params': agent.actor.state_dict(),
        'actor_target_params': agent.target_actor.state_dict(),
        'actor_optim_params': agent.actor_optimizer.state_dict(),
        'critic_params': agent.critic.state_dict(),
        'critic_target_params': agent.target_critic.state_dict(),
        'critic_optim_params': agent.critic_optimizer.state_dict()
    }

    torch.save(save_dict,
               os.path.join(file_path, 'episode-{}.pt'.format(episode)))
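These examples all call a `ReplayBuffer` with `push`, `sample`, and `len()` but never show the class itself, and its constructor signature varies between repositories. A minimal uniform-sampling sketch that matches the way Example #1 uses it (single-tuple `push`, `sample(batchsize)` returning a list of transitions) might look like this:

import random
from collections import deque


class ReplayBuffer:
    """Minimal uniform-sampling replay buffer (interface inferred from the examples)."""

    def __init__(self, size):
        self.memory = deque(maxlen=size)   # oldest transitions are dropped once full

    def push(self, transition):
        """Store one transition tuple, e.g. (state, action, reward, next_state, done)."""
        self.memory.append(transition)

    def sample(self, batchsize):
        """Return `batchsize` transitions drawn uniformly at random."""
        return random.sample(self.memory, batchsize)

    def __len__(self):
        return len(self.memory)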
Example #2
def main():
    seeding(seed=SEED)
    # number of parallel agents
    parallel_envs = 1
    # number of agents per environment
    num_agents = 5
    # number of training episodes.
    # change this to higher number to experiment. say 30000.
    number_of_episodes = 60000
    episode_length = 35
    # how many episodes to save policy and gif
    save_interval = 1000
    t = 0
    scenario_name = "simple_spread_ivan"

    # amplitude of OU noise
    # this slowly decreases to 0
    noise = 0.5  # was 2, try 0.5, 0.2
    noise_reduction = 0.9999  # 0.999
    #### DECAY
    initial_noise = 0.1
    decay = 0.01

    # how many episodes before update
    # episode_per_update = UPDATE_EVERY * parallel_envs
    common_folder = time.strftime("/%m%d%y_%H%M%S")
    log_path = os.getcwd() + common_folder + "/log"
    model_dir = os.getcwd() + common_folder + "/model_dir"

    os.makedirs(model_dir, exist_ok=True)

    # initialize environment
    # torch.set_num_threads(parallel_envs)
    env = envs.make_parallel_env(parallel_envs, seed=3, benchmark=BENCHMARK)
    # env = envs.make_env("simple_spread_ivan")

    # initialize replay buffer
    buffer = ReplayBuffer(int(BUFFER_SIZE))

    # initialize policy and critic
    maddpg = MADDPG(num_agents=num_agents,
                    discount_factor=GAMMA,
                    tau=TAU,
                    lr_actor=LR_ACTOR,
                    lr_critic=LR_CRITIC,
                    weight_decay=WEIGHT_DECAY)
    logger = SummaryWriter(log_dir=log_path)

    agents_reward = []
    for n in range(num_agents):
        agents_reward.append([])
    # agent0_reward = []
    # agent1_reward = []
    # agent2_reward = []

    agent_info = [[[]]]  # placeholder for benchmarking info

    # training loop
    # show progressbar
    import progressbar as pb
    widget = [
        '\repisode: ',
        pb.Counter(), '/',
        str(number_of_episodes), ' ',
        pb.Percentage(), ' ',
        pb.ETA(), ' ',
        pb.Bar(marker=pb.RotatingMarker()), ' '
    ]
    timer = pb.ProgressBar(widgets=widget, maxval=number_of_episodes).start()

    print('Starting iterations...')
    for episode in range(0, number_of_episodes, parallel_envs):

        timer.update(episode)

        reward_this_episode = np.zeros((parallel_envs, num_agents))

        all_obs = env.reset()  #

        # flip the first two indices
        # ADD FOR WITHOUT PARALLEL ENV
        # all_obs = np.expand_dims(all_obs, axis=0)

        obs_roll = np.rollaxis(all_obs, 1)
        obs = transpose_list(obs_roll)

        # save info or not
        save_info = ((episode) % save_interval < parallel_envs
                     or episode == number_of_episodes - parallel_envs)
        frames = []
        tmax = 0

        # if save_info:
        # frames.append(env.render('rgb_array'))

        for episode_t in range(episode_length):

            # get actions
            # explore = only explore for a certain number of episodes
            # action input needs to be transposed
            actions = maddpg.act(transpose_to_tensor(obs), noise=noise)

            noise = max(initial_noise * decay**(episode_t / 20000), 0.001)
            # noise = max(noise*noise_reduction, 0.001)

            actions_array = torch.stack(actions).detach().numpy()

            # transpose the list of list
            # flip the first two indices
            # input to step requires the first index to correspond to number of parallel agents
            actions_for_env = np.rollaxis(actions_array, 1)

            # environment step
            # step forward one frame
            # next_obs, next_obs_full, rewards, dones, info = env.step(actions_for_env)

            # ADD FOR WITHOUT PARALLEL ENV
            # next_obs, rewards, dones, info = env.step(actions_for_env)
            next_obs, rewards, dones, info = env.step(actions_for_env)

            # rewards_sum += np.mean(rewards)

            # collect experience
            transition = (obs, actions_for_env, rewards, next_obs, dones)
            buffer.push(transition)

            reward_this_episode += rewards

            # obs, obs_full = next_obs, next_obs_full
            obs = next_obs

            # increment global step counter
            t += parallel_envs

            # save gif frame
            if save_info:
                # frames.append(env.render('rgb_array'))
                tmax += 1

            # for benchmarking learned policies
            if BENCHMARK:
                for i, inf in enumerate(info):
                    agent_info[-1][i].append(inf['n'])

        # update once after every episode_per_update
        # if len(buffer) > BATCH_SIZE and episode % episode_per_update < parallel_envs:
        if len(buffer) > BATCH_SIZE and episode % UPDATE_EVERY < parallel_envs:
            for _ in range(UPDATE_TIMES):
                for a_i in range(num_agents):
                    samples = buffer.sample(BATCH_SIZE)
                    maddpg.update(samples, a_i, logger)
                maddpg.update_targets(
                )  # soft update the target network towards the actual networks

        for i in range(parallel_envs):
            for n in range(num_agents):
                agents_reward[n].append(reward_this_episode[i, n])
            # agent0_reward.append(reward_this_episode[i,0])
            # agent1_reward.append(reward_this_episode[i,1])
            # agent2_reward.append(reward_this_episode[i,2])

        if episode % 100 == 0 or episode == number_of_episodes - 1:
            # avg_rewards = [np.mean(agent0_reward), np.mean(agent1_reward), np.mean(agent2_reward)]
            avg_rewards = []
            for n in range(num_agents):
                avg_rewards.append(np.mean(agents_reward[n]))
                # agent0_reward = []
            # agent1_reward = []
            # agent2_reward = []
            for a_i, avg_rew in enumerate(avg_rewards):
                logger.add_scalar('agent%i/mean_episode_rewards' % a_i,
                                  avg_rew, episode)

        # saving model
        save_dict_list = []
        if save_info:
            print('agent_info benchmark=', agent_info)
            for i in range(5):
                save_dict = {
                    'actor_params':
                    maddpg.maddpg_agent[i].actor.state_dict(),
                    'actor_optim_params':
                    maddpg.maddpg_agent[i].actor_optimizer.state_dict(),
                    'critic_params':
                    maddpg.maddpg_agent[i].critic.state_dict(),
                    'critic_optim_params':
                    maddpg.maddpg_agent[i].critic_optimizer.state_dict()
                }
                save_dict_list.append(save_dict)

                torch.save(
                    save_dict_list,
                    os.path.join(model_dir, 'episode-{}.pt'.format(episode)))

            # save gif files
            # imageio.mimsave(os.path.join(model_dir, 'episode-{}.gif'.format(episode)),
            #                 frames, duration=.04)

    env.close()
    logger.close()
    timer.finish()
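`seeding(...)` is called at the top of Examples #2, #5, #6, and #8 but is not shown; it presumably just pins the random number generators. A minimal sketch under that assumption:

import random

import numpy as np
import torch


def seeding(seed=1):
    """Pin the Python, NumPy and PyTorch RNGs so runs are repeatable (sketch)."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)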
Example #3
        ep_loss = []
        ep_error = []
        # Initialize the environment and state
        state = torch.tensor([env.reset()], device=device).float()
        done = False
        score = 0
        for t in count():
            # Select and perform an action
            action = select_action(state)
            next_state, reward, done, _ = env.step(action.item())
            score += reward

            next_state = torch.tensor([next_state], device=device).float()
            reward = torch.tensor([reward], device=device).float()
            # Store the transition in memory
            buffer.push(state, action, next_state, reward, not done)

            # Update state
            state = next_state

            # Perform one optimization step (on the policy network)
            loss, Q_estimation_error = train_model()

            # save results
            ep_loss.append(loss)
            ep_error.append(Q_estimation_error)

            # soft target update
            if params.target_update == 'soft':
                # print('in soft')
                # θ' ← τθ + (1 − τ)θ'
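Example #3 breaks off at the soft-update comment above; the rule θ' ← τθ + (1 − τ)θ' is the usual Polyak averaging, and the later `DQNAgent` and `MADDPGAgent` examples call a `soft_update(target, source, tau)` helper that is not shown. A minimal sketch of such a helper, with the name and argument order assumed from those call sites:

import torch


def soft_update(target_net, online_net, tau):
    """Polyak averaging: θ' ← τ·θ + (1 − τ)·θ' for each parameter pair (sketch)."""
    with torch.no_grad():
        for target_param, param in zip(target_net.parameters(),
                                       online_net.parameters()):
            target_param.data.copy_(tau * param.data +
                                    (1.0 - tau) * target_param.data)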
Example #4
        maddpg.reset_ounoise()

        # GET ACTIONS TO TAKE AND INTERACT WITH THE ENVIRONMENT
        actions = maddpg.act(tensorfy(states), noise=noise, stacked=True)
        env_info = env.step(actions)[brain_name]

        # EXTRACT AND PROCESS THE RETURNED VALUES FROM THE ENVIRONMENT
        next_states = process_agent_states(env_info.vector_observations)
        next_global_state = process_gobal_state(env_info.vector_observations)
        rewards = env_info.rewards
        dones = env_info.local_done

        # ADD EXPERIENCE TO THE BUFFER
        experience = (states, global_state, actions, rewards, next_states,
                      next_global_state, dones)
        buffer.push(experience)

        # UPDATE REWARDS
        rewards_this_episode += rewards

        # PREPARE FOR NEXT TIMESTEP
        states = next_states
        global_state = next_global_state
        noise = noise if hard_noise_reigime else noise * NOISE_DECAY

        # END EPISODE IF ANY AGENT IS DONE
        if any(dones):
            break

    if episode_i > HARD_NOISE_STEPS:
        hard_noise_reigime = False
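The `maddpg.reset_ounoise()` call above and the `OrnsteinUhlenbeckProcess` in Example #10 both refer to Ornstein-Uhlenbeck exploration noise whose implementation is not included here. A minimal sketch of such a process; the default parameters (μ=0, θ=0.15, σ=0.2) are assumptions, not values taken from the original code:

import numpy as np


class OUNoise:
    """Ornstein-Uhlenbeck process: dx = θ(μ − x) + σ·N(0, 1) per step (sketch)."""

    def __init__(self, size, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        """Return the internal state to the long-run mean."""
        self.state = self.mu.copy()

    def sample(self):
        """Advance the process one step and return the new noise vector."""
        dx = (self.theta * (self.mu - self.state) +
              self.sigma * np.random.standard_normal(len(self.mu)))
        self.state = self.state + dx
        return self.state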
Example #5
def main():
    seeding()
    # number of parallel agents
    parallel_envs = 4
    # number of training episodes.
    # change this to higher number to experiment. say 30000.
    number_of_episodes = 1000
    episode_length = 80
    batchsize = 1000
    # how many episodes to save policy and gif
    save_interval = 1000
    t = 0

    # amplitude of OU noise
    # this slowly decreases to 0
    noise = 2
    noise_reduction = 0.9999

    # how many episodes before update
    episode_per_update = 2 * parallel_envs

    log_path = os.getcwd() + "/log"
    model_dir = os.getcwd() + "/model_dir"

    os.makedirs(model_dir, exist_ok=True)

    torch.set_num_threads(parallel_envs)
    env = envs.make_parallel_env(parallel_envs)

    # keep 5000 episodes worth of replay
    buffer = ReplayBuffer(int(5000 * episode_length))

    # initialize policy and critic
    maddpg = MADDPG()
    logger = SummaryWriter(log_dir=log_path)
    agent0_reward = []
    agent1_reward = []
    agent2_reward = []

    # training loop
    # show progressbar
    widget = [
        'episode: ',
        pb.Counter(), '/',
        str(number_of_episodes), ' ',
        pb.Percentage(), ' ',
        pb.ETA(), ' ',
        pb.Bar(marker=pb.RotatingMarker()), ' '
    ]

    timer = pb.ProgressBar(widgets=widget, maxval=number_of_episodes).start()

    # use keep_awake to keep workspace from disconnecting
    for episode in range(0, number_of_episodes + parallel_envs, parallel_envs):

        timer.update(episode)

        reward_this_episode = np.zeros((parallel_envs, 3))
        all_obs = env.reset()
        obs, obs_full = transpose_list(all_obs)

        # for calculating rewards for this particular episode - addition of all time steps

        # save info or not
        save_info = (episode % save_interval < parallel_envs)
        frames = []
        tmax = 0

        if save_info:
            frames.append(env.render('rgb_array'))

        for episode_t in range(episode_length):

            t += parallel_envs

            # explore = only explore for a certain number of episodes
            # action input needs to be transposed
            actions = maddpg.act(transpose_to_tensor(obs), noise=noise)
            noise *= noise_reduction

            actions_array = torch.stack(actions).detach().numpy()

            # transpose the list of list
            # flip the first two indices
            # input to step requires the first index to correspond to number of parallel agents
            actions_for_env = np.rollaxis(actions_array, 1)

            # step forward one frame
            next_obs, next_obs_full, rewards, dones, info = env.step(
                actions_for_env)

            # add data to buffer
            transition = (obs, obs_full, actions_for_env, rewards, next_obs,
                          next_obs_full, dones)

            buffer.push(transition)

            reward_this_episode += rewards

            obs, obs_full = next_obs, next_obs_full

            # save gif frame
            if save_info:
                frames.append(env.render('rgb_array'))
                tmax += 1

        # update once after every episode_per_update
        if len(buffer
               ) > batchsize and episode % episode_per_update < parallel_envs:
            for a_i in range(3):
                samples = buffer.sample(batchsize)
                maddpg.update(samples, a_i, logger)
            maddpg.update_targets(
            )  # soft update the target network towards the actual networks

        for i in range(parallel_envs):
            agent0_reward.append(reward_this_episode[i, 0])
            agent1_reward.append(reward_this_episode[i, 1])
            agent2_reward.append(reward_this_episode[i, 2])

        if episode % 100 == 0 or episode == number_of_episodes - 1:
            avg_rewards = [
                np.mean(agent0_reward),
                np.mean(agent1_reward),
                np.mean(agent2_reward)
            ]
            agent0_reward = []
            agent1_reward = []
            agent2_reward = []
            for a_i, avg_rew in enumerate(avg_rewards):
                logger.add_scalar('agent%i/mean_episode_rewards' % a_i,
                                  avg_rew, episode)

        # saving model
        save_dict_list = []
        if save_info:
            for i in range(3):

                save_dict = {
                    'actor_params':
                    maddpg.maddpg_agent[i].actor.state_dict(),
                    'actor_optim_params':
                    maddpg.maddpg_agent[i].actor_optimizer.state_dict(),
                    'critic_params':
                    maddpg.maddpg_agent[i].critic.state_dict(),
                    'critic_optim_params':
                    maddpg.maddpg_agent[i].critic_optimizer.state_dict()
                }
                save_dict_list.append(save_dict)

                torch.save(
                    save_dict_list,
                    os.path.join(model_dir, 'episode-{}.pt'.format(episode)))

            # save gif files
            imageio.mimsave(os.path.join(model_dir,
                                         'episode-{}.gif'.format(episode)),
                            frames,
                            duration=.04)

    env.close()
    logger.close()
    timer.finish()
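`transpose_list` and `transpose_to_tensor` are imported helpers in Examples #2, #5, and #8 that flip the (env, agent) nesting of the observations into per-agent batches. A minimal sketch consistent with how they are called here; the actual helpers live in the respective repositories and may differ:

import numpy as np
import torch


def transpose_list(mylist):
    """Flip the first two levels of nesting, grouping the per-env items per agent."""
    return list(map(list, zip(*mylist)))


def transpose_to_tensor(input_list):
    """Same transpose, but each per-agent group is returned as a float tensor."""
    return [torch.tensor(np.array(group), dtype=torch.float)
            for group in transpose_list(input_list)]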
Example #6
def main():
    env_info = env.reset(train_mode=False)[brain_name]
    num_agents = len(env_info.agents)
    print('Number of agents:', num_agents)

    # size of each action
    action_size = brain.vector_action_space_size
    print('Size of each action:', action_size)

    # examine the state space
    states = env_info.vector_observations
    state_size = states.shape[1]

    seeding()
    # number of parallel agents
    #parallel_envs = num_agents
    # number of training episodes.
    # change this to higher number to experiment. say 30000.

    number_of_episodes = 10000
    update_actor_after = 100
    update_actor_every = 2
    episode_length = 100
    batchsize = 100
    # how many episodes to save policy and gif
    save_interval = 1000
    t = 0

    LR_ACTOR = 1e-5
    LR_CRITIC = 3e-3

    # amplitude of OU noise
    # this slowly decreases to 0
    noise = 1.0
    noise_reduction = 0.999999

    # how many episodes before update
    episode_per_update = 1
    no_of_updates_perTime = 1

    log_path = os.getcwd() + "/log"
    model_dir = os.getcwd() + "/model_dir"

    os.makedirs(model_dir, exist_ok=True)

    #torch.set_num_threads(parallel_envs)
    #env = envs.make_parallel_env(parallel_envs)

    # keep 10 episodes worth of replay
    buffer = ReplayBuffer(int(10 * episode_length))

    # initialize policy and critic
    maddpg = MADDPG(lr_actor=LR_ACTOR, lr_critic=LR_CRITIC)
    #logger = SummaryWriter(log_dir=log_path)
    agent0_reward = []
    agent1_reward = []
    #agent2_reward = []

    # training loop
    # show progressbar
    import progressbar as pb
    widget = [
        'episode: ',
        pb.Counter(), '/',
        str(number_of_episodes), ' ',
        pb.Percentage(), ' ',
        pb.ETA(), ' ',
        pb.Bar(marker=pb.RotatingMarker()), ' '
    ]

    timer = pb.ProgressBar(widgets=widget, maxval=number_of_episodes).start()

    # use keep_awake to keep workspace from disconnecting
    for episode in range(0, number_of_episodes):

        timer.update(episode)

        env_info = env.reset(
            train_mode=False)[brain_name]  # reset the environment
        states = env_info.vector_observations  # get the current state (for each agent)
        scores = np.zeros(num_agents)  # initialize the score (for each agent)
        reward_this_episode = np.zeros((1, num_agents))

        #all_obs = env.reset() #
        obs = states
        obs_full = np.concatenate((states[0], states[1]))

        #for calculating rewards for this particular episode - addition of all time steps

        # save info or not
        save_info = ((episode) % save_interval < 1
                     or episode == number_of_episodes - 1)
        tmax = 0

        #resetting noise
        for i in range(num_agents):
            maddpg.maddpg_agent[i].noise.reset()

        for episode_t in range(episode_length):

            t += 1

            update_act = (episode > update_actor_after
                          or episode % update_actor_every == 0)
            # explore = only explore for a certain number of episodes
            # action input needs to be transposed
            actions = maddpg.act(transpose_to_tensorAsitis(obs),
                                 noise=noise,
                                 batch=False)
            noise *= noise_reduction

            actions_array = torch.stack(actions).cpu().detach().numpy()

            # transpose the list of list
            # flip the first two indices
            # input to step requires the first index to correspond to number of parallel agents
            actions_for_env = np.rollaxis(actions_array, 1)

            # step forward one frame
            env_info = env.step(actions_for_env)[brain_name]

            next_states = env_info.vector_observations  # get next state (for each agent)
            rewards = env_info.rewards  # get reward (for each agent)
            dones = env_info.local_done  # see if episode finished
            scores += env_info.rewards

            rewards_for_env = np.hstack(rewards)

            next_obs = next_states
            next_obs_full = np.concatenate((next_states[0], next_states[1]))
            # add data to buffer
            transition = (np.array([obs]), np.array([obs_full]),
                          np.array([actions_for_env]),
                          np.array([rewards_for_env]), np.array([next_obs]),
                          np.array([next_obs_full]),
                          np.array([dones], dtype='float'))
            buffer.push(transition)

            reward_this_episode += rewards

            obs, obs_full = next_obs, next_obs_full

            # update once after every episode_per_update
            if len(buffer) > batchsize and episode % episode_per_update == 0:
                for _ in range(no_of_updates_perTime):
                    for a_i in range(num_agents):
                        samples = buffer.sample(batchsize)
                        #updating the weights of the n/w
                        maddpg.update(samples, a_i, update_actor=update_act)
                    maddpg.update_targets(
                    )  #soft update the target network towards the actual networks

            if np.any(dones):
                # if the episode is done, break out of the loop and move on to the next episode
                break

        agent0_reward.append(reward_this_episode[0, 0])
        agent1_reward.append(reward_this_episode[0, 1])

        if episode % 100 == 0 or episode == number_of_episodes - 1:
            avg_rewards = [np.mean(agent0_reward), np.mean(agent1_reward)]
            agent0_reward = []
            agent1_reward = []
            for a_i, avg_rew in enumerate(avg_rewards):
                #logger.add_scalar('agent%i/mean_episode_rewards' % a_i, avg_rew, episode)
                print('agent%i/mean_episode_rewards' % a_i, avg_rew, episode)

        #saving model
        save_dict_list = []
        if save_info:
            for i in range(num_agents):

                save_dict = {
                    'actor_params':
                    maddpg.maddpg_agent[i].actor.state_dict(),
                    'actor_optim_params':
                    maddpg.maddpg_agent[i].actor_optimizer.state_dict(),
                    'critic_params':
                    maddpg.maddpg_agent[i].critic.state_dict(),
                    'critic_optim_params':
                    maddpg.maddpg_agent[i].critic_optimizer.state_dict()
                }
                save_dict_list.append(save_dict)

                torch.save(
                    save_dict_list,
                    os.path.join(model_dir, 'episode-{}.pt'.format(episode)))

            # save gif files
            #imageio.mimsave(os.path.join(model_dir, 'episode-{}.gif'.format(episode)),
            #frames, duration=.04)
    timer.finish()
Example #7
class DQNAgent:
    """
    DQN Agent, valid for discrete action spaces
    """
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    #loss_fn = nn.MSELoss()
    loss_fn = nn.SmoothL1Loss()
    iter = 0

    def __init__(self,
                 net,
                 o_dim,
                 a_dim,
                 lr=1e-3,
                 batch_size=16,
                 algorithm="ddqn",
                 gamma=0.99,
                 tau=1e-3,
                 buffer_size=int(1e6)):
        """
        o_dim: observation space dim (or # of channels)
        a_dim: action space dimension
        """
        self.o_dim = o_dim
        self.a_dim = a_dim
        self.lr = lr
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.buffer_size = buffer_size

        if algorithm.lower() in ("dqn",):
            self.algorithm = "dqn"
        elif algorithm.lower() in ("ddqn", "double dqn", "doubledqn"):
            self.algorithm = "ddqn"
        else:
            raise TypeError("cannot recognize algorithm")

        self.buffer = ReplayBuffer(buffer_size, batch_size)

        self.online_net = net(o_dim, a_dim).to(self.device)
        self.target_net = net(o_dim, a_dim).to(self.device)

        self.optimizer = optim.Adam(self.online_net.parameters(), lr=lr)

    def get_action(self, state, eps=0.):
        """ Epsilon-greedy action selection """

        if random.random() > eps:
            state_tensor = torch.FloatTensor(state).unsqueeze(0).to(
                self.device)

            self.online_net.eval()
            with torch.no_grad():
                action = self.online_net(state_tensor).argmax(1).item()
            self.online_net.train()

            return action
        else:
            return random.choice(np.arange(self.a_dim))

    def update(self, experiences):

        states, actions, rewards, next_states, dones = experiences

        states = torch.FloatTensor(states).to(self.device)
        next_states = torch.FloatTensor(next_states).to(self.device)

        actions = torch.LongTensor(actions).view(-1, 1).to(self.device)
        rewards = torch.FloatTensor(rewards).view(-1, 1).to(self.device)
        dones = torch.FloatTensor(dones).view(-1, 1).to(self.device)

        if self.algorithm == "ddqn":
            max_actions = self.online_net(next_states).max(1)[1].view(-1, 1)
            Q_next = self.target_net(next_states).gather(1, max_actions)

        elif self.algorithm == "dqn":
            Q_next = self.target_net(next_states).max(1)[0].view(-1, 1)
        else:
            raise TypeError("cannot recognize algorithm")

        Q_targets = rewards + self.gamma * Q_next * (1. - dones)
        Q_expected = self.online_net(states).gather(1, actions)

        loss = self.loss_fn(Q_expected, Q_targets.detach())

        self.optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(self.online_net.parameters(), 10.)
        self.optimizer.step()

    def step(self, state, action, reward, next_state, done):
        self.buffer.push(state, action, reward, next_state, done)
        if len(self.buffer) > self.batch_size:
            experiences = self.buffer.sample()
            self.update(experiences)
            soft_update(self.target_net, self.online_net, self.tau)
            self.iter += 1
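A hedged usage sketch for the `DQNAgent` above: it wires the agent to a tiny MLP Q-network and a classic-API Gym environment (reset returning the observation, step returning four values). The `QNet` class, the `CartPole-v1` choice, and the epsilon schedule are illustrative only, and the agent's own `ReplayBuffer(buffer_size, batch_size)` and `soft_update` helpers are assumed to be importable from its repository.

import gym
import torch.nn as nn


class QNet(nn.Module):
    """Tiny MLP Q-network matching the `net(o_dim, a_dim)` constructor DQNAgent expects."""

    def __init__(self, o_dim, a_dim):
        super().__init__()
        self.layers = nn.Sequential(nn.Linear(o_dim, 64), nn.ReLU(),
                                    nn.Linear(64, a_dim))

    def forward(self, x):
        return self.layers(x)


env = gym.make("CartPole-v1")
agent = DQNAgent(QNet,
                 o_dim=env.observation_space.shape[0],
                 a_dim=env.action_space.n)

eps = 1.0
for episode in range(10):           # a few episodes, just to exercise the API
    state = env.reset()
    done = False
    while not done:
        action = agent.get_action(state, eps)
        next_state, reward, done, _ = env.step(action)
        agent.step(state, action, reward, next_state, done)
        state = next_state
    eps = max(0.05, eps * 0.95)     # simple epsilon decay, purely illustrative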
Example #8
def main():
    seeding()
    parallel_envs = 4
    number_of_episodes = 1000
    episode_length = 80
    batchsize = 1000
    save_interval = 1000
    t = 0

    # amplitude of OU noise, which slowly decreases to 0
    noise = 2
    noise_reduction = 0.9999

    # how many episodes before update
    episode_per_update = 2 * parallel_envs

    log_path = os.getcwd() + "/log"
    model_dir = os.getcwd() + "/model_dir"

    os.makedirs(model_dir, exist_ok=True)

    torch.set_num_threads(parallel_envs)
    """
    `env` controls three agents, two blue, one red.
    env.observation_space: [Box(14,), Box(14,), Box(14,)]
    env.action_space: [Box(2,), Box(2,), Box(2,)]
    Box(14,) can be broken down into 2+3*2+3*2=14
    (2) location coordinates of the target landmark
    (3*2) the three agents' positions w.r.t. the target landmark
    (3*2) the three agents' velocities w.r.t. the target landmark
    """
    env = envs.make_parallel_env(parallel_envs)

    # keep 5000 episodes worth of replay
    buffer = ReplayBuffer(int(5000 * episode_length))

    # initialize policy and critic
    maddpg = MADDPG()
    logger = SummaryWriter(log_dir=log_path)
    agent0_reward = []
    agent1_reward = []
    agent2_reward = []

    # training loop
    # show progressbar
    import progressbar as pb
    widget = [
        'episode: ',
        pb.Counter(), '/',
        str(number_of_episodes), ' ',
        pb.Percentage(), ' ',
        pb.ETA(), ' ',
        pb.Bar(marker=pb.RotatingMarker()), ' '
    ]

    timer = pb.ProgressBar(widgets=widget, maxval=number_of_episodes).start()

    # use keep_awake to keep workspace from disconnecting
    for episode in keep_awake(range(0, number_of_episodes, parallel_envs)):

        timer.update(episode)

        reward_this_episode = np.zeros((parallel_envs, 3))
        # Consult `env_wrapper.py` line 19.
        all_obs = env.reset()
        """
        `all_obs` is a list of size `parallel_envs`,
        each item in the list is another list of size two,
        first is env.observation_space: [Box(14,), Box(14,), Box(14,)],
        second is [Box(14,)], which is added to facilitate training
        https://goo.gl/Xtr6sF
        `obs` and `obs_full` are both lists of size `parallel_envs`,
        `obs` has the default observation space [Box(14,), Box(14,), Box(14,)]
        `obs_full` has the compounded observation space [Box(14,)]
        """
        obs, obs_full = transpose_list(all_obs)

        # for calculating rewards for one episode - addition of all time steps

        # save info or not
        save_info = ((episode) % save_interval < parallel_envs
                     or episode == number_of_episodes - parallel_envs)
        frames = []
        tmax = 0

        if save_info:
            frames.append(env.render('rgb_array'))

        for episode_t in range(episode_length):

            t += parallel_envs

            # explore = only explore for a certain number of steps
            # action input needs to be transposed
            actions = maddpg.act(transpose_to_tensor(obs), noise=noise)
            noise *= noise_reduction

            # `actions_array` has shape (3, parallel_envs, 2)
            actions_array = torch.stack(actions).detach().numpy()
            # `actions_for_env` has shape (parallel_envs, 3, 2), because
            # input to `step` requires the first index to be `parallel_envs`
            actions_for_env = np.rollaxis(actions_array, axis=1)

            # step forward one frame
            next_obs, next_obs_full, rewards, dones, info = \
                env.step(actions_for_env)

            # add data to buffer
            transition = (obs, obs_full, actions_for_env, rewards, next_obs,
                          next_obs_full, dones)

            buffer.push(transition)

            reward_this_episode += rewards

            obs, obs_full = next_obs, next_obs_full

            # save gif frame
            if save_info:
                frames.append(env.render('rgb_array'))
                tmax += 1

        # update the target network `parallel_envs`=4 times
        # after every `episode_per_update`=2*4
        if len(buffer
               ) > batchsize and episode % episode_per_update < parallel_envs:
            # update the local network for all agents, `a_i` refers to agent no.
            for a_i in range(3):
                samples = buffer.sample(batchsize)
                maddpg.update(samples, a_i, logger)
            # soft update the target network towards the actual networks
            maddpg.update_targets()

        for i in range(parallel_envs):
            agent0_reward.append(reward_this_episode[i, 0])
            agent1_reward.append(reward_this_episode[i, 1])
            agent2_reward.append(reward_this_episode[i, 2])

        if episode % 100 == 0 or episode == number_of_episodes - 1:
            avg_rewards = [
                np.mean(agent0_reward),
                np.mean(agent1_reward),
                np.mean(agent2_reward)
            ]
            agent0_reward = []
            agent1_reward = []
            agent2_reward = []
            for a_i, avg_rew in enumerate(avg_rewards):
                logger.add_scalar('agent%i/mean_episode_rewards' % a_i,
                                  avg_rew, episode)

        # Saves the model.
        save_dict_list = []
        if save_info:
            for i in range(3):
                save_dict = {
                    'actor_params':
                    maddpg.maddpg_agent[i].actor.state_dict(),
                    'actor_optim_params':
                    maddpg.maddpg_agent[i].actor_optimizer.state_dict(),
                    'critic_params':
                    maddpg.maddpg_agent[i].critic.state_dict(),
                    'critic_optim_params':
                    maddpg.maddpg_agent[i].critic_optimizer.state_dict()
                }
                save_dict_list.append(save_dict)

                torch.save(
                    save_dict_list,
                    os.path.join(model_dir, 'episode-{}.pt'.format(episode)))

            # Save gif files.
            imageio.mimsave(os.path.join(model_dir,
                                         'episode-{}.gif'.format(episode)),
                            frames,
                            duration=.04)

    env.close()
    logger.close()
    timer.finish()
Example #9
class MADDPGAgent:

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    iter = 0

    def __init__(self,
                 num_agents,
                 x_dim,
                 o_dim,
                 a_dim,
                 lr_actor=1e-3,
                 lr_critic=1e-3,
                 batch_size=16,
                 gamma=0.99,
                 tau=0.001,
                 buffer_size=int(1e5),
                 seed=1234):

        self.num_agents = num_agents
        self.x_dim = x_dim
        self.o_dim = o_dim
        self.a_dim = a_dim
        self.lr_actor = lr_actor
        self.lr_critic = lr_critic
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.buffer_size = buffer_size
        self.seed = seed

        self.buffer = ReplayBuffer(buffer_size, batch_size, seed)
        self.agents = [DDPGAgent(num_agents, id, x_dim, o_dim, a_dim, lr_actor, lr_critic, gamma, seed) \
                       for id in range(num_agents)]

    def get_actions(self, obs_full, eps=0.):
        """get actions from all agents in the MADDPG object"""
        actions = []
        for id, agent in enumerate(self.agents):
            actions.extend(agent.get_action2(obs_full[id, :], eps))
        return actions

    def update(self, experiences):

        obs_full, actions, rewards, next_obs_full, dones = experiences

        rewards = torch.FloatTensor(rewards).to(self.device)
        dones = torch.FloatTensor(dones).to(self.device)

        x = torch.FloatTensor(obs_full).to(self.device)
        a = torch.FloatTensor(actions).to(self.device)
        next_x = torch.FloatTensor(next_obs_full).to(self.device)

        with torch.no_grad():
            next_a = [
                agent.target_actor(next_x[:, agent.id, :])
                for agent in self.agents
            ]
        next_a = torch.cat(next_a, dim=1)

        for agent in self.agents:
            r = rewards[:, agent.id].view(-1, 1)
            d = dones[:, agent.id].view(-1, 1)

            pred_a = [ self.agents[i].actor(x[:, i, :]) if i == agent.id \
                       else self.agents[i].actor(x[:, i, :]).detach()
                       for i in range(self.num_agents) ]
            pred_a = torch.cat(pred_a, dim=1)

            agent.update(next_x, next_a, r, d, x, a, pred_a)

    def update_targets(self):
        """soft update targets"""
        for agent in self.agents:
            soft_update(agent.target_actor, agent.actor, self.tau)
            soft_update(agent.target_critic, agent.critic, self.tau)

    def step(self, state, action, reward, next_state, done):
        self.buffer.push(state, action, reward, next_state, done)
        if (len(self.buffer) > self.batch_size):
            experiences = self.buffer.sample()
            self.update(experiences)
            self.update_targets()
            self.iter += 1

    def reset(self):
        for agent in self.agents:
            agent.noise.reset()
Example #10
File: agent.py Project: mfsuve/TORCS-RL
class SAC_Agent:
    def __init__(self, load_from=None, will_train=True):
        self.env = TorcsEnv(
            path='/usr/local/share/games/torcs/config/raceman/quickrace.xml')
        self.args = SAC_args()
        self.buffer = ReplayBuffer(self.args.buffer_size)

        action_dim = self.env.action_space.shape[0]
        state_dim = self.env.observation_space.shape[0]
        hidden_dim = 256

        self.action_size = action_dim
        self.state_size = state_dim

        self.value_net = ValueNetwork(state_dim,
                                      hidden_dim).to(self.args.device)
        self.target_value_net = ValueNetwork(state_dim,
                                             hidden_dim).to(self.args.device)

        self.soft_q_net1 = SoftQNetwork(state_dim, action_dim,
                                        hidden_dim).to(self.args.device)
        self.soft_q_net2 = SoftQNetwork(state_dim, action_dim,
                                        hidden_dim).to(self.args.device)

        self.policy_net = PolicyNetwork(state_dim, action_dim,
                                        hidden_dim).to(self.args.device)

        self.target_value_net.load_state_dict(self.value_net.state_dict())

        self.value_criterion = nn.MSELoss()
        self.soft_q_loss1 = nn.MSELoss()
        self.soft_q_loss2 = nn.MSELoss()

        self.value_opt = optim.Adam(self.value_net.parameters(),
                                    lr=self.args.lr)
        self.soft_q_opt1 = optim.Adam(self.soft_q_net1.parameters(),
                                      lr=self.args.lr)
        self.soft_q_opt2 = optim.Adam(self.soft_q_net2.parameters(),
                                      lr=self.args.lr)
        self.policy_opt = optim.Adam(self.policy_net.parameters(),
                                     lr=self.args.lr)

        if will_train:
            current_time = time.strftime('%d-%b-%y-%H.%M.%S', time.localtime())
            self.plot_folder = f'plots/{current_time}'
            self.model_save_folder = f'model/{current_time}'
            make_sure_dir_exists(self.plot_folder)
            make_sure_dir_exists(self.model_save_folder)
            self.cp = Checkpoint(self.model_save_folder)

        if load_from is not None:
            try:
                self.load_checkpoint(load_from)
            except FileNotFoundError:
                print(f'{load_from} not found. Running default.')
        else:
            print('Starting from scratch.')

    def train(self):
        remove_log_file()
        clear_action_logs()
        eps_n = 0
        rewards = []
        test_rewards = []
        best_reward = -np.inf
        info = None
        for eps_n in range(1, self.args.max_eps + 1):  # Train loop
            self.set_mode('train')
            relaunch = (eps_n - 1) % (20 / self.args.test_rate) == 0
            state = self.env.reset(relaunch=relaunch,
                                   render=False,
                                   sampletrack=False)
            eps_r = 0
            sigma = (self.args.start_sigma - self.args.end_sigma) * (max(
                0, 1 - (eps_n - 1) / self.args.max_eps)) + self.args.end_sigma
            randomprocess = OrnsteinUhlenbeckProcess(self.args.theta, sigma,
                                                     self.action_size)

            for step in range(self.args.max_eps_time):  # Episode
                action = self.policy_net.get_train_action(state, randomprocess)
                next_state, reward, done, info = self.env.step(action)

                self.buffer.push(state, action, reward, next_state, done)

                state = next_state
                eps_r += reward

                if len(self.buffer) > self.args.batch_size:
                    self.update()

                if done:
                    break

            rewards.append(eps_r)

            test_reward = self.test(eps_n)
            test_rewards.append(test_reward)

            if test_reward > best_reward:
                best_reward = test_reward
                self.save_checkpoint(eps_n, best_reward)

            info_str = ', '.join(
                [key for key in info.keys() if key != 'place'])
            info_str += f", {info['place']}. place"
            log(f'Episode {eps_n:<4} Reward: {eps_r:>7.2f} Test Reward: {test_reward:>7.2f} Info: {info_str}'
                )

            if eps_n % self.args.plot_per == 0:
                self.plot(rewards, test_rewards, eps_n)

    def update(self):
        state, action, reward, next_state, done = self.buffer.sample(
            self.args.batch_size)

        state = FloatTensor(state).to(self.args.device)
        next_state = FloatTensor(next_state).to(self.args.device)
        action = FloatTensor(action).to(self.args.device)
        reward = FloatTensor(reward).unsqueeze(1).to(self.args.device)
        done = FloatTensor(np.float32(done)).unsqueeze(1).to(self.args.device)

        predicted_q_value1 = self.soft_q_net1(state, action)
        predicted_q_value2 = self.soft_q_net2(state, action)
        predicted_value = self.value_net(state)
        new_action, log_prob, epsilon, mean, log_std = self.policy_net.evaluate(
            state)

        # Training Q function
        target_value = self.target_value_net(next_state)
        target_q_value = reward + (1 - done) * self.args.gamma * target_value
        q_value_loss1 = self.soft_q_loss1(predicted_q_value1,
                                          target_q_value.detach())
        q_value_loss2 = self.soft_q_loss2(predicted_q_value2,
                                          target_q_value.detach())

        self.soft_q_opt1.zero_grad()
        q_value_loss1.backward()
        if self.args.clipgrad:
            self.clip_grad(self.soft_q_net1.parameters())
        self.soft_q_opt1.step()
        self.soft_q_opt2.zero_grad()
        q_value_loss2.backward()
        if self.args.clipgrad:
            self.clip_grad(self.soft_q_net2.parameters())
        self.soft_q_opt2.step()

        # Training Value function
        predicted_new_q_value = torch.min(self.soft_q_net1(state, new_action),
                                          self.soft_q_net2(state, new_action))
        target_value_func = predicted_new_q_value - self.args.alpha * log_prob.sum(
        )
        value_loss = self.value_criterion(predicted_value,
                                          target_value_func.detach())

        self.value_opt.zero_grad()
        value_loss.backward()
        if self.args.clipgrad:
            self.clip_grad(self.value_net.parameters())
        self.value_opt.step()

        # Training Policy function
        policy_loss = (log_prob - predicted_new_q_value).mean()

        self.policy_opt.zero_grad()
        policy_loss.backward()
        if self.args.clipgrad:
            self.clip_grad(self.policy_net.parameters())
        self.policy_opt.step()

        # Updating target value network
        for target_param, param in zip(self.target_value_net.parameters(),
                                       self.value_net.parameters()):
            target_param.data.copy_(target_param.data *
                                    (1.0 - self.args.soft_tau) +
                                    param.data * self.args.soft_tau)

    def test(self, eps_n):
        self.set_mode('eval')
        rewards = []
        for step in range(self.args.test_rate):
            render = (eps_n % 30 == 0) and (step == 0)
            relaunch = render or ((eps_n % 30 == 0) and (step == 1))
            state = self.env.reset(relaunch=relaunch,
                                   render=render,
                                   sampletrack=False)
            running_reward = 0
            for t in range(self.args.max_eps_time):
                action = self.policy_net.get_test_action(state)
                state, reward, done, info = self.env.step(action)
                store(action, eps_n, reward, info, t == 0)
                running_reward += reward
                if done:
                    break
            rewards.append(running_reward)
        avg_reward = sum(rewards) / self.args.test_rate
        return avg_reward

    def plot(self, rewards, test_rewards, eps_n):
        torch.save({
            'train_rewards': rewards,
            'test_rewards': test_rewards
        }, f'{self.plot_folder}/{eps_n}.pth')
        figure = plt.figure()
        plt.plot(rewards, label='Train Rewards')
        plt.plot(test_rewards, label='Test Rewards')
        plt.xlabel('Episode')
        plt.legend()
        plt.savefig(f'{self.plot_folder}/{eps_n}.png')
        try:
            send_mail(f'Improved Torcs SAC | Episode {eps_n}',
                      f'{self.plot_folder}/{eps_n}.png')
            log('Mail has been sent.')
        except (KeyboardInterrupt, SystemExit):
            print('KeyboardInterrupt or SystemExit')
            raise
        except Exception as e:
            print('Mail Exception occurred:', e)
            emsg = e.args[-1]
            emsg = emsg[:1].lower() + emsg[1:]
            log('Couldn\'t send mail because', emsg)

    def clip_grad(self, parameters):
        for param in parameters:
            param.grad.data.clamp_(-1, 1)

    def set_mode(self, mode):
        if mode == 'train':
            self.value_net.train()
            self.target_value_net.train()
            self.soft_q_net1.train()
            self.soft_q_net2.train()
            self.policy_net.train()
        elif mode == 'eval':
            self.value_net.eval()
            self.target_value_net.eval()
            self.soft_q_net1.eval()
            self.soft_q_net2.eval()
            self.policy_net.eval()
        else:
            raise ValueError('mode should be either train or eval')

    def save_checkpoint(self, eps_n, test_reward):
        self.cp.update(self.value_net, self.soft_q_net1, self.soft_q_net2,
                       self.policy_net)
        self.cp.save(f'e{eps_n}-r{test_reward:.4f}.pth')
        log(f'Saved checkpoint at episode {eps_n}.')

    def load_checkpoint(self, load_from):
        state_dicts = torch.load(load_from)
        self.value_net.load_state_dict(state_dicts['best_value'])
        self.soft_q_net1.load_state_dict(state_dicts['best_q1'])
        self.soft_q_net2.load_state_dict(state_dicts['best_q2'])
        self.policy_net.load_state_dict(state_dicts['best_policy'])
        print(f'Loaded from {load_from}.')

    def race(self, sampletrack=True):
        with torch.no_grad():
            state = self.env.reset(relaunch=True,
                                   render=True,
                                   sampletrack=sampletrack)
            running_reward = 0
            done = False
            while not done:
                action = self.policy_net.get_test_action(state)
                state, reward, done, info = self.env.step(action)
                running_reward += reward

            print('Reward:', running_reward)
Example #11
class FQFAgent:

    def __init__(self, env_name,
                 num_quantiles=32, fqf_factor=0.000001*0.1, ent_coef=0.001,
                 state_embedding_dim=3136, quantile_embedding_dim=64,
                 gamma=0.99, n_frames=4, batch_size=32,
                 buffer_size=1000000,
                 update_period=8,
                 target_update_period=10000):

        self.env_name = env_name

        self.num_quantiles = num_quantiles

        self.state_embedding_dim = state_embedding_dim

        self.quantile_embedding_dim = quantile_embedding_dim

        self.k = 1.0

        self.ent_coef = ent_coef

        self.n_frames = n_frames

        self.action_space = gym.make(self.env_name).action_space.n

        self.fqf_network = FQFNetwork(
            action_space=self.action_space,
            num_quantiles=self.num_quantiles,
            state_embedding_dim=self.state_embedding_dim,
            quantile_embedding_dim=self.quantile_embedding_dim)

        self.target_fqf_network = FQFNetwork(
            action_space=self.action_space,
            num_quantiles=self.num_quantiles,
            state_embedding_dim=self.state_embedding_dim,
            quantile_embedding_dim=self.quantile_embedding_dim)

        self._define_network()

        self.optimizer = tf.keras.optimizers.Adam(
            lr=0.00015, epsilon=0.01/32)

        #: fpl; fraction proposal layer
        self.optimizer_fpl = tf.keras.optimizers.Adam(
            learning_rate=0.00005 * fqf_factor,
            epsilon=0.0003125)

        self.gamma = gamma

        self.replay_buffer = ReplayBuffer(max_len=buffer_size)

        self.batch_size = batch_size

        self.update_period = update_period

        self.target_update_period = target_update_period

        self.steps = 0

    def _define_network(self):
        """ initialize network weights
        """
        env = gym.make(self.env_name)
        frames = collections.deque(maxlen=4)
        frame = frame_preprocess(env.reset())
        for _ in range(self.n_frames):
            frames.append(frame)

        state = np.stack(frames, axis=2)[np.newaxis, ...]
        self.fqf_network(state)
        self.target_fqf_network(state)
        self.target_fqf_network.set_weights(self.fqf_network.get_weights())

    @property
    def epsilon(self):
        if self.steps <= 1000000:
            return max(0.99 * (1000000 - self.steps) / 1000000, 0.1)
        elif self.steps <= 2000000:
            return 0.05 + 0.05 * (2000000 - self.steps) / 2000000
        else:
            return 0.05

    def learn(self, n_episodes, logdir="log"):

        logdir = Path(__file__).parent / logdir
        if logdir.exists():
            shutil.rmtree(logdir)
        self.summary_writer = tf.summary.create_file_writer(str(logdir))

        for episode in range(1, n_episodes+1):

            env = gym.make(self.env_name)

            frames = collections.deque(maxlen=4)
            frame = frame_preprocess(env.reset())
            for _ in range(self.n_frames):
                frames.append(frame)

            episode_rewards = 0
            episode_steps = 0
            done = False
            lives = 5
            while not done:
                self.steps += 1
                episode_steps += 1
                state = np.stack(frames, axis=2)[np.newaxis, ...]
                action = self.fqf_network.sample_action(state, epsilon=self.epsilon)
                next_frame, reward, done, info = env.step(action)
                episode_rewards += reward
                frames.append(frame_preprocess(next_frame))
                next_state = np.stack(frames, axis=2)[np.newaxis, ...]

                if done:
                    exp = Experience(state, action, reward, next_state, done)
                    self.replay_buffer.push(exp)
                    break
                else:
                    if info["ale.lives"] != lives:
                        #: treat loss of a life as the end of an episode
                        lives = info["ale.lives"]
                        exp = Experience(state, action, reward, next_state, True)
                    else:
                        exp = Experience(state, action, reward, next_state, done)

                    self.replay_buffer.push(exp)

                if (len(self.replay_buffer) > 50000) and (self.steps % self.update_period == 0):

                    loss, loss_fp, entropy = self.update_network()

                    with self.summary_writer.as_default():
                        tf.summary.scalar("loss", loss, step=self.steps)
                        tf.summary.scalar("loss_fp", loss_fp, step=self.steps)
                        tf.summary.scalar("entropy", entropy, step=self.steps)
                        tf.summary.scalar("epsilon", self.epsilon, step=self.steps)
                        tf.summary.scalar("buffer_size", len(self.replay_buffer), step=self.steps)
                        tf.summary.scalar("train_score", episode_rewards, step=self.steps)
                        tf.summary.scalar("train_steps", episode_steps, step=self.steps)

                #: Target update
                if self.steps % self.target_update_period == 0:
                    self.target_fqf_network.set_weights(
                        self.fqf_network.get_weights())

            print(f"Episode: {episode}, score: {episode_rewards}, steps: {episode_steps}")

            if episode % 20 == 0:
                test_scores, test_steps = self.test_play(n_testplay=1)
                with self.summary_writer.as_default():
                    tf.summary.scalar("test_score", test_scores[0], step=self.steps)
                    tf.summary.scalar("test_step", test_steps[0], step=self.steps)

            if episode % 500 == 0:
                self.fqf_network.save_weights("checkpoints/fqfnet")
                print("Model Saved")

    def update_network(self):

        (states, actions, rewards,
         next_states, dones) = self.replay_buffer.get_minibatch(self.batch_size)

        rewards = rewards.reshape((self.batch_size, 1, 1))
        dones = dones.reshape((self.batch_size, 1, 1))

        with tf.GradientTape() as tape:
            #: Compute F(τ^)
            state_embedded = self.fqf_network.state_embedding_layer(states)

            taus, taus_hat, taus_hat_probs = self.fqf_network.propose_fractions(state_embedded)
            taus_hat, taus_hat_probs = tf.stop_gradient(taus_hat), tf.stop_gradient(taus_hat_probs)

            quantiles = self.fqf_network.quantile_function(
                state_embedded, taus_hat)
            actions_onehot = tf.one_hot(
                actions.flatten().astype(np.int32), self.action_space)
            actions_mask = tf.expand_dims(actions_onehot, axis=2)
            quantiles = tf.reduce_sum(
                quantiles * actions_mask, axis=1, keepdims=True)

            #: Compute target F(τ^), use same taus proposed by online network
            next_actions, target_quantiles = self.target_fqf_network.greedy_action_on_given_taus(
                next_states, taus_hat, taus_hat_probs)

            next_actions_onehot = tf.one_hot(next_actions.numpy().flatten(), self.action_space)
            next_actions_mask = tf.expand_dims(next_actions_onehot, axis=2)
            target_quantiles = tf.reduce_sum(
                target_quantiles * next_actions_mask, axis=1, keepdims=True)

            #: TF(τ^)
            target_quantiles = rewards + self.gamma * (1-dones) * target_quantiles
            target_quantiles = tf.stop_gradient(target_quantiles)

            #: Compute Quantile regression loss
            target_quantiles = tf.repeat(
                target_quantiles, self.num_quantiles, axis=1)
            quantiles = tf.repeat(
                tf.transpose(quantiles, [0, 2, 1]), self.num_quantiles, axis=2)

            #: huberloss
            bellman_errors = target_quantiles - quantiles
            is_smaller_than_k = tf.abs(bellman_errors) < self.k
            squared_loss = 0.5 * tf.square(bellman_errors)
            linear_loss = self.k * (tf.abs(bellman_errors) - 0.5 * self.k)

            huberloss = tf.where(is_smaller_than_k, squared_loss, linear_loss)

            #: quantile loss
            indicator = tf.stop_gradient(tf.where(bellman_errors < 0, 1., 0.))
            _taus_hat = tf.repeat(
                tf.expand_dims(taus_hat, axis=2), self.num_quantiles, axis=2)

            quantile_factors = tf.abs(_taus_hat - indicator)
            quantile_huberloss = quantile_factors * huberloss

            loss = tf.reduce_mean(quantile_huberloss, axis=2)
            loss = tf.reduce_sum(loss, axis=1)
            loss = tf.reduce_mean(loss)

        state_embedding_vars = self.fqf_network.state_embedding_layer.trainable_variables
        quantile_function_vars = self.fqf_network.quantile_function.trainable_variables

        variables = state_embedding_vars + quantile_function_vars
        grads = tape.gradient(loss, variables)

        with tf.GradientTape() as tape2:
            taus_all = self.fqf_network.fraction_proposal_layer(state_embedded)
            taus = taus_all[:, 1:-1]

            quantiles = self.fqf_network.quantile_function(
                state_embedded, taus)
            taus_hat = (taus_all[:, 1:] + taus_all[:, :-1]) / 2.
            quantiles_hat = self.fqf_network.quantile_function(
                state_embedded, taus_hat)

            dw_dtau = 2 * quantiles - quantiles_hat[:, :, 1:] - quantiles_hat[:, :, :-1]
            dw_dtau = tf.reduce_sum(dw_dtau * actions_mask, axis=1)

            entropy = tf.reduce_sum(-1 * taus_hat * tf.math.log(taus_hat), axis=1)

            loss_fp = tf.reduce_mean(tf.square(dw_dtau), axis=1)
            loss_fp += -1 * self.ent_coef * entropy
            loss_fp = tf.reduce_mean(loss_fp)

        fp_variables = self.fqf_network.fraction_proposal_layer.trainable_variables
        grads_fp = tape2.gradient(loss_fp, fp_variables)

        self.optimizer.apply_gradients(zip(grads, variables))
        self.optimizer_fpl.apply_gradients(zip(grads_fp, fp_variables))

        return loss, loss_fp, tf.reduce_mean(entropy)

    def test_play(self, n_testplay=1, monitor_dir=None,
                  checkpoint_path=None):

        if checkpoint_path:
            env = gym.make(self.env_name)
            frames = collections.deque(maxlen=4)
            frame = frame_preprocess(env.reset())
            for _ in range(self.n_frames):
                frames.append(frame)
            state = np.stack(frames, axis=2)[np.newaxis, ...]
            self.fqf_network(state)
            self.fqf_network.load_weights(checkpoint_path)

        if monitor_dir:
            monitor_dir = Path(monitor_dir)
            if monitor_dir.exists():
                shutil.rmtree(monitor_dir)
            monitor_dir.mkdir()
            env = gym.wrappers.Monitor(
                gym.make(self.env_name), monitor_dir, force=True,
                video_callable=(lambda ep: True))
        else:
            env = gym.make(self.env_name)

        scores = []
        steps = []
        for _ in range(n_testplay):

            frames = collections.deque(maxlen=4)
            frame = frame_preprocess(env.reset())
            for _ in range(self.n_frames):
                frames.append(frame)

            done = False
            episode_steps = 0
            episode_rewards = 0

            while not done:
                state = np.stack(frames, axis=2)[np.newaxis, ...]
                action = self.fqf_network.sample_action(state, epsilon=0.01)
                next_frame, reward, done, _ = env.step(action)
                frames.append(frame_preprocess(next_frame))

                episode_rewards += reward
                episode_steps += 1
                if episode_steps > 500 and episode_rewards < 3:
                    #: guard against episodes that stall because the game is never started (agent keeps choosing action 0)
                    break

            scores.append(episode_rewards)
            steps.append(episode_steps)

        return scores, steps
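
The FQF update in update_network above pairs every online quantile with every target quantile and weights the Huber term by |tau_hat - 1{delta < 0}|. The following standalone sketch reproduces that pairwise quantile Huber loss for shape-checking only; the helper name quantile_huber_loss and its defaults are illustrative, not part of the example.

import tensorflow as tf

def quantile_huber_loss(quantiles, target_quantiles, taus, kappa=1.0):
    """quantiles, target_quantiles, taus: tensors of shape (batch, N)."""
    # Pairwise TD errors: deltas[b, i, j] = target_quantiles[b, j] - quantiles[b, i]
    deltas = target_quantiles[:, tf.newaxis, :] - quantiles[:, :, tf.newaxis]

    # Elementwise Huber loss with threshold kappa
    abs_deltas = tf.abs(deltas)
    huber = tf.where(abs_deltas < kappa,
                     0.5 * tf.square(deltas),
                     kappa * (abs_deltas - 0.5 * kappa))

    # Asymmetric weighting |tau_i - 1{delta < 0}|, tau indexed by the online quantile
    indicator = tf.cast(deltas < 0.0, tf.float32)
    weights = tf.abs(taus[:, :, tf.newaxis] - indicator)

    # Mean over target samples, sum over fractions, mean over the batch
    return tf.reduce_mean(tf.reduce_sum(tf.reduce_mean(weights * huber, axis=2), axis=1))

# quick shape check
q = tf.random.normal((32, 32))
t = tf.random.normal((32, 32))
taus = tf.sort(tf.random.uniform((32, 32)), axis=1)
print(quantile_huber_loss(q, t, taus).numpy())
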
예제 #12
0
def main():
    seeding()
    # number of training episodes.
    number_of_episodes = 5000
    episode_length = 1000
    batchsize = 2000
    t = 0

    # amplitude of OU noise
    # this slowly decreases to 0
    noise = 2
    noise_reduction = 0.9999

    # how many episodes before update
    episode_per_update = 2

    log_path = os.getcwd() + "/log"
    model_dir = os.getcwd() + "/model_dir"

    os.makedirs(model_dir, exist_ok=True)

    # env = UnityEnvironment('Tennis_Windows_x86_64/Tennis.exe')
    env = UnityEnvironment('Tennis_Windows_x86_64/Tennis.exe',
                           no_graphics=True)

    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]

    env_info = env.reset(train_mode=True)[brain_name]

    num_agents = len(env_info.agents)

    replay_episodes = 1000

    buffer = ReplayBuffer(int(replay_episodes * episode_length))

    # initialize policy and critic
    maddpg = MADDPG()
    # logger = SummaryWriter(log_dir=log_path)
    agent0_reward = []
    agent1_reward = []

    # training loop
    scores_deque = deque(maxlen=100)
    scores = []

    for episode in range(0, number_of_episodes):

        reward_this_episode = np.zeros(num_agents)
        env_info = env.reset(True)[brain_name]
        state = env_info.vector_observations

        obs = [[state[0], state[1]]]
        obs_full = np.concatenate((state[0], state[1]))

        # rewards for this episode are accumulated over all time steps

        frames = []
        tmax = 0

        for episode_t in range(episode_length):

            t += 1

            # explore = only explore for a certain number of episodes
            # action input needs to be transposed
            actions = maddpg.act(transpose_to_tensor(obs), noise=noise)
            noise *= noise_reduction

            actions_array = torch.stack(actions).detach().numpy()

            # transpose the list of list
            # flip the first two indices
            # input to step requires the first index to correspond to number of parallel agents
            # actions_for_env = np.rollaxis(actions_array,1)
            actions_for_env = np.clip(actions_array.flatten(), -1, 1)

            # print(actions_for_env)

            # step forward one frame
            # next_obs, next_obs_full, rewards, dones, info = env.step(actions_for_env)

            env_info = env.step(actions_for_env)[brain_name]
            next_state = env_info.vector_observations
            rewards = env_info.rewards
            dones = env_info.local_done
            next_obs = [[next_state[0], next_state[1]]]
            next_obs_full = np.concatenate((next_state[0], next_state[1]))

            # print(obs, obs_full, actions_for_env, rewards, next_obs, next_obs_full, dones

            # add data to buffer
            transition = ([obs], [obs_full], [actions_for_env], [rewards],
                          [next_obs], [next_obs_full], [dones])

            buffer.push(transition)

            reward_this_episode += rewards

            obs, obs_full = next_obs, next_obs_full

            if any(dones):
                break

        # update once after every episode_per_update
        if len(buffer) > batchsize and episode % episode_per_update == 0:
            for a_i in range(num_agents):
                samples = buffer.sample(batchsize)
                maddpg.update(samples, a_i)
            maddpg.update_targets(
            )  #soft update the target network towards the actual networks

        avg_rewards = np.mean(reward_this_episode, axis=0)
        episode_reward = np.max(avg_rewards)
        scores_deque.append(episode_reward)
        scores.append(episode_reward)

        print('\rEpisode {}\tAverage Score: {:.3f}\tEpisode Score: {:.3f}'.
              format(episode, np.mean(scores_deque), episode_reward),
              end="")

        if (episode > 0
                and episode % 100 == 0) or episode == number_of_episodes - 1:
            print('\rEpisode {}\tAverage Score: {:.3f}\tEpisode Score: {:.3f}'.
                  format(episode, np.mean(scores_deque), episode_reward))

        if np.mean(scores_deque) >= 0.5:
            print('\nSuccess!')
            break

    #saving model
    save_dict_list = []
    for i in range(num_agents):

        save_dict = {
            'actor_params':
            maddpg.maddpg_agent[i].actor.state_dict(),
            'actor_optim_params':
            maddpg.maddpg_agent[i].actor_optimizer.state_dict(),
            'critic_params':
            maddpg.maddpg_agent[i].critic.state_dict(),
            'critic_optim_params':
            maddpg.maddpg_agent[i].critic_optimizer.state_dict()
        }
        save_dict_list.append(save_dict)

        torch.save(save_dict_list,
                   os.path.join(model_dir, 'episode-{}.pt'.format(episode)))

    env.close()

    fig = plt.figure()
    ax = fig.add_subplot(111)
    plt.ylabel('Score')
    plt.xlabel('Episode #')
    plt.plot(np.arange(1, len(scores) + 1), scores)
    plt.savefig('tennis_score_history.png')

    return scores
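
The loop above calls transpose_to_tensor(obs) without showing it. Assuming obs is a list of time steps, each holding one observation array per agent, a minimal sketch of such a helper might look like this (hypothetical implementation, not the project's own code):

import numpy as np
import torch

def transpose_to_tensor(input_list):
    """Flip the (time, agent) nesting to (agent, time) and return one float tensor per agent."""
    return [torch.from_numpy(np.asarray(per_agent)).float() for per_agent in zip(*input_list)]

# e.g. obs = [[state[0], state[1]]]  ->  two tensors of shape (1, 24), one per agent
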
예제 #13
0
def main():
    seeding()
    # number of parallel agents
    number_of_agents = 2
    # number of training episodes.
    # change this to higher number to experiment. say 30000.
    number_of_episodes = 5000
    max_t = 1000
    batchsize = 128

    # amplitude of OU noise
    # this slowly decreases to 0
    noise = 1
    noise_reduction = 0.9999

    tau = 1e-3  # soft update factor
    gamma = 0.99  # reward discount factor

    # how many episodes before update
    episode_per_update = 2

    model_dir = os.getcwd() + "/model_dir"
    os.makedirs(model_dir, exist_ok=True)

    # do we need to set multi-thread for this env?
    torch.set_num_threads(number_of_agents * 2)

    env = TennisEnv()

    # replay buffer holding up to 1e5 transitions
    buffer = ReplayBuffer(int(1e5))

    # initialize policy and critic
    maddpg = MADDPG(discount_factor=gamma, tau=tau)

    # training loop
    scores_window = deque(maxlen=100)
    ep_scores = []

    # when to save: use a dictionary to track if a model at a given score (key/10) has been saved.
    save_on_scores = {
        5: False,
        6: False,
        9: False,
        10: False,
        11: False,
        12: False,
        13: False,
        14: False,
        15: False,
        16: False,
        17: False,
        18: False,
        19: False,
        20: False
    }

    agent0_reward = []
    agent1_reward = []

    for episode in range(0, number_of_episodes):
        reward_this_episode = np.zeros((1, number_of_agents))
        obs, obs_full, env_info = env.reset()

        for agent in maddpg.maddpg_agent:
            agent.noise.reset()

        for episode_t in range(max_t):
            # explore = only explore for a certain number of episodes
            # action input needs to be transposed
            #print('Obs:', obs)
            actions = maddpg.act(torch.tensor(obs, dtype=torch.float),
                                 noise=noise)
            #print(actions)

            #if noise>0.01:
            noise *= noise_reduction
            actions_for_env = torch.stack(actions).detach().numpy()

            # step forward one frame
            next_obs, next_obs_full, rewards, dones, info = env.step(
                actions_for_env)

            # add data to buffer
            buffer.push(obs, obs_full, actions_for_env, rewards, next_obs,
                        next_obs_full, dones)

            reward_this_episode += rewards

            obs = np.copy(next_obs)
            obs_full = np.copy(next_obs_full)

            # update once after every episode_per_update
            if len(
                    buffer
            ) > batchsize and episode > 0 and episode % episode_per_update == 0:
                for a_i in range(number_of_agents):
                    samples = buffer.sample(batchsize)
                    maddpg.update(samples, a_i)

            if np.any(dones):
                break

        agent0_reward.append(reward_this_episode[0, 0])
        agent1_reward.append(reward_this_episode[0, 1])
        avg_rewards = max(reward_this_episode[0, 0], reward_this_episode[0, 1])
        scores_window.append(avg_rewards)
        cur_score = np.mean(scores_window)
        ep_scores.append(cur_score)
        print(
            '\rEpisode:{}, Rwd:{:.3f} vs. {:.3f}, Average Score:{:.4f}, Noise:{:.4f}'
            .format(episode, reward_this_episode[0, 0],
                    reward_this_episode[0, 1], cur_score, noise))

        #saving model

        save_dict_list = []
        save_info = False
        score_code = int(cur_score * 10)
        if score_code in save_on_scores.keys():
            if not (save_on_scores[score_code]):
                save_on_scores[score_code] = True
                save_info = True

        if save_info:
            for i in range(number_of_agents):
                save_dict = {
                    'actor_params':
                    maddpg.maddpg_agent[i].actor.state_dict(),
                    'actor_optim_params':
                    maddpg.maddpg_agent[i].actor_optimizer.state_dict(),
                    'critic_params':
                    maddpg.maddpg_agent[i].critic.state_dict(),
                    'critic_optim_params':
                    maddpg.maddpg_agent[i].critic_optimizer.state_dict()
                }
                save_dict_list.append(save_dict)

                torch.save(
                    save_dict_list,
                    os.path.join(
                        model_dir,
                        'episode-{}-{}.pt'.format(episode, score_code)))

            np.savez('scores-{}-{}.npz'.format(episode, score_code),
                     agent0_reward=np.array(agent0_reward),
                     agent1_reward=np.array(agent1_reward),
                     avg_max_scores=np.array(ep_scores))

    env.close()
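
maddpg.update_targets() in these examples soft-updates every target network toward its online counterpart with factor tau; the DDPGAgent example below uses a soft_update(target, source, tau) helper with the same convention. A minimal sketch of that helper, assuming PyTorch modules:

import torch

def soft_update(target_net, online_net, tau):
    """target <- tau * online + (1 - tau) * target, parameter by parameter."""
    with torch.no_grad():
        for target_param, online_param in zip(target_net.parameters(), online_net.parameters()):
            target_param.copy_(tau * online_param + (1.0 - tau) * target_param)
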
예제 #14
0
class DDPGAgent:
    """
    DDPG Agent, valid for continuous action spaces
    """

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    #loss_fn = nn.MSELoss()
    loss_fn = nn.SmoothL1Loss()
    iter = 0

    def __init__(self, func1, func2, o_dim, a_dim, h_dim,
                 initialize_weights = False, lr_actor = 1e-3, lr_critic = 1e-3,
                 batch_size = 16, gamma = 0.99, tau = 0.001, buffer_size = int(1e5),
                 seed = 1234):

        """
        func1: actor model
        func2: critic model
        o_dim: observation space dimension (number of channels when the input is an image)
        a_dim: action space dimension
        """

        self.o_dim = o_dim
        self.a_dim = a_dim
        self.h_dim = h_dim
        self.initialize_weights = initialize_weights

        self.lr_actor = lr_actor
        self.lr_critic = lr_critic

        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.buffer_size = buffer_size
        self.seed = seed

        # Replay memory
        self.buffer = ReplayBuffer(buffer_size, batch_size, seed)

        # Actor Network (w/ Target Network)
        self.actor = func1(o_dim , a_dim, h_dim, initialize_weights, seed).to(self.device)
        self.target_actor = func1(o_dim , a_dim, h_dim, initialize_weights, seed).to(self.device)
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr = lr_actor)

        # Critic Network (w/ Target Network)
        self.critic = func2(o_dim , a_dim, h_dim, initialize_weights, seed).to(self.device)
        self.target_critic = func2(o_dim , a_dim, h_dim, initialize_weights, seed).to(self.device)
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr = lr_critic)

        # Noise process
        self.noise = OUNoise(a_dim)

    def get_action1(self, state, eps = 0.):
        """
        action value ranges from -1 to 1
        --
        eps = 0. no exploration
            > 0. add exploration
        """
        state_tensor = torch.FloatTensor(state).unsqueeze(0).to(self.device)
        self.actor.eval()
        with torch.no_grad():
            action = self.actor(state_tensor)[0].detach().cpu().numpy()
        self.actor.train()

        action += self.noise.sample() * eps

        return np.clip(action, -1, 1)

    def get_action2(self, state, eps = 0.):
        """
        slimevolleygym environment
        ---
        multi-binary action space (float action vectors are also accepted)
        forward  = True if action[0] > 0 else False
        backward = True if action[1] > 0 else False
        jump     = True if action[2] > 0 else False
        --
        eps = 0. no exploration
            > 0. add exploration
        """
        if random.random() > eps:
            state_tensor = torch.FloatTensor(state).unsqueeze(0).to(self.device)
            self.actor.eval()
            with torch.no_grad():
                logits = self.actor(state_tensor).squeeze()
                action = torch.where(logits>0,torch.ones_like(logits),torch.zeros_like(logits))
            self.actor.train()
            return action.detach().cpu().numpy()

        else:
            action = [random.choice([0,1]) for _ in range(self.a_dim)]
            return np.asarray(action, dtype = np.float32)


    def update(self, experiences):

        states, actions, rewards, next_states, dones = experiences

        states = torch.FloatTensor(states).to(self.device)
        next_states = torch.FloatTensor(next_states).to(self.device)
        actions = torch.FloatTensor(actions).to(self.device)
        rewards = torch.FloatTensor(rewards).view(-1, 1).to(self.device)
        dones = torch.FloatTensor(dones).view(-1, 1).to(self.device)
        self.iter += 1
        # ---------------------------- update critic ---------------------------- #
        next_actions = self.target_actor(next_states)
        Q_next = self.target_critic(next_states, next_actions)
        Q_targets = rewards + self.gamma * Q_next * (1. - dones)
        Q_expected = self.critic(states, actions)
        critic_loss = self.loss_fn(Q_expected, Q_targets.detach())
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()
        # ---------------------------- update actor ---------------------------- #
        pred_actions = self.actor(states)
        actor_loss = -self.critic(states, pred_actions).mean()
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

    def update_targets(self):
        soft_update(self.target_actor, self.actor, self.tau)
        soft_update(self.target_critic, self.critic, self.tau)

    def step(self, state, action, reward, next_state, done):
        self.buffer.push(state, action, reward, next_state, done)
        if (len(self.buffer) > self.batch_size):
            experiences = self.buffer.sample()
            self.update(experiences)
            self.update_targets()
            self.iter += 1

    def reset(self):
        self.noise.reset()
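
The DDPGAgent above instantiates OUNoise(a_dim) but the class is not shown. Below is a minimal Ornstein-Uhlenbeck noise sketch with commonly used defaults (theta=0.15, sigma=0.2); the project's own class may differ in parameters and seeding.

import numpy as np

class OUNoise:
    """Ornstein-Uhlenbeck process: dx = theta * (mu - x) + sigma * N(0, 1)."""

    def __init__(self, size, mu=0.0, theta=0.15, sigma=0.2, seed=0):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.rng = np.random.default_rng(seed)
        self.reset()

    def reset(self):
        # restart from the long-run mean
        self.state = np.copy(self.mu)

    def sample(self):
        dx = self.theta * (self.mu - self.state) + self.sigma * self.rng.standard_normal(len(self.state))
        self.state = self.state + dx
        return self.state
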
예제 #15
0
def main():
    seeding()
    # number of parallel agents
    number_of_agents = 2
    # number of training episodes.
    # change this to higher number to experiment. say 30000.
    number_of_episodes = 3000
    batchsize = 128
    
    # amplitude of OU noise
    # this slowly decreases to 0
    noise = 1
    noise_reduction = 0.9999

    tau = 1e-3   # soft update factor
    gamma = 0.99 # reward discount factor

    print_every = 100
    # how many episodes before update
    episode_per_update = 2

    model_dir = os.getcwd() + "/model_dir"
    os.makedirs(model_dir, exist_ok=True)

    result_dir= os.getcwd()+"/result_dir"
    os.makedirs(result_dir, exist_ok=True)

    # do we need to set multi-thread for this env?
    torch.set_num_threads(number_of_agents*2)

    env = TennisEnv()
    
    # replay buffer holding up to 1e5 transitions
    buffer = ReplayBuffer(int(1e5))
    
    num_agents, num_states, num_actions = env.get_shapes()

    # initialize policy and critic
    maddpg = MADDPG(num_agents, num_states, num_actions, discount_factor=gamma, tau=tau)

    # training loop
    scores_window = deque(maxlen=100)
    ep_scores = []


    agent0_reward = []
    agent1_reward = []

    for episode in range(0, number_of_episodes):
        reward_this_episode = np.zeros((1, number_of_agents))
        states, states_full, env_info = env.reset()

        for agent in maddpg.maddpg_agent:
            agent.noise.reset()

        while True:
            actions = maddpg.act(torch.tensor(states, dtype=torch.float), noise=noise)

            noise *= noise_reduction
            actions_for_env = torch.stack(actions).detach().numpy()

            # step forward one frame
            next_states, next_states_full, rewards, dones, info = env.step(actions_for_env)

            # add data to buffer
            buffer.push(states, states_full, actions_for_env, rewards, next_states, next_states_full, dones)

            reward_this_episode += rewards

            states = np.copy(next_states)
            states_full = np.copy(next_states_full)

            # update the agents every step once the buffer holds enough samples
            if len(buffer) > batchsize:
                for a_i in range(number_of_agents):
                    samples = buffer.sample(batchsize)
                    maddpg.update(samples, a_i)

            if np.any(dones):
                break

        agent0_reward.append(reward_this_episode[0, 0])
        agent1_reward.append(reward_this_episode[0, 1])
        
        avg_rewards = max(reward_this_episode[0, 0], reward_this_episode[0, 1])

        scores_window.append(avg_rewards)
        cur_score = np.mean(scores_window)
        ep_scores.append(cur_score)
        
        save_dict_list = []

        if episode % print_every == 0 or avg_rewards > 2.5:
            print('\rEpisode: {}, Average score: {:.5f}, noise: {:.5f}'.format(episode, cur_score, noise))

            # only stop training once the target score has been reached
            if avg_rewards > 2.5:
                for i in range(number_of_agents):
                    save_dict = {'actor_params': maddpg.maddpg_agent[i].actor.state_dict(),
                                 'actor_optim_params': maddpg.maddpg_agent[i].actor_optimizer.state_dict(),
                                 'critic_params': maddpg.maddpg_agent[i].critic.state_dict(),
                                 'critic_optim_params': maddpg.maddpg_agent[i].critic_optimizer.state_dict()}
                    save_dict_list.append(save_dict)

                    torch.save(save_dict_list,
                               os.path.join(model_dir, 'episode-{}-{}.pt'.format(episode, cur_score)))
                print('model saved')
                break
    env.close()

    #print('main-ep_scores: ', ep_scores)
    fig = plt.figure()
    ax = fig.add_subplot(111)
    plt.plot(np.arange(1, len(ep_scores)+1), ep_scores)
    plt.ylabel('Score')
    plt.xlabel('Episode #')
    fig.savefig(result_dir + '/score_plot.png')
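
Most examples in this collection use a ReplayBuffer that supports push(transition), sample(batchsize) and len(). A minimal deque-based sketch of that interface; the original buffers may collate the sampled transitions differently.

import random
from collections import deque

class ReplayBuffer:
    """Uniform replay buffer with the push / sample / len interface used above."""

    def __init__(self, size):
        self.memory = deque(maxlen=size)

    def push(self, transition):
        # store one transition tuple
        self.memory.append(transition)

    def sample(self, batchsize):
        # draw batchsize transitions uniformly at random (without replacement)
        return random.sample(self.memory, batchsize)

    def __len__(self):
        return len(self.memory)
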
예제 #16
0
def train():
    seeding()

    os.environ["CUDA_VISIBLE_DEVICES"] = "1"

    print("GPU available: {}".format(torch.cuda.is_available()))
    print("GPU tensor test: {}".format(torch.rand(3, 3).cuda()))

    env = UnityEnvironment(
        file_name=
        '/home/slavo/Dev/deep-rl-projects/ma_collab-compet/Tennis_Linux/Tennis.x86_64',
        no_graphics=True)

    # get the default brain
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]

    # reset the environment
    env_info = env.reset(train_mode=True)[brain_name]

    # number of agents in the environment
    print('Number of agents:', len(env_info.agents))

    # number of actions
    action_size = brain.vector_action_space_size
    print('Number of actions:', action_size)

    # examine the state space
    state = env_info.vector_observations[0]
    print('States look like:', state)
    state_size = len(state)
    print('States have length:', state_size)

    agents = len(env_info.agents)

    # number of training episodes.
    # change this to higher number to experiment. say 30000.
    number_of_episodes = 30000
    episode_length = 500

    # how many steps before update
    steps_per_update = 100

    # amplitude of OU noise
    # this slowly decreases to 0
    noise = 1
    noise_reduction = 0.9999

    torch.set_num_threads(4)

    buffer = ReplayBuffer(BUFFER_SIZE)

    # initialize policy and critic
    maddpg_agent = MADDPG(state_size, action_size, agents)

    scores = []
    scores_window = deque(maxlen=100)  # last 100 scores

    actor_losses = []
    critic_losses = []
    for i in range(len(env_info.agents)):
        actor_losses.append([])
        critic_losses.append([])

    for episode in range(0, number_of_episodes):

        episode_rewards = []

        env_info = env.reset(train_mode=True)[brain_name]
        state = env_info.vector_observations
        state_full = np.concatenate(state)

        # for calculating rewards for this particular episode - addition of all time steps
        for episode_t in range(episode_length + 1):

            actions = maddpg_agent.act(transpose_to_tensor(list(state)),
                                       noise=noise)
            noise *= noise_reduction

            actions = torch.stack(actions).view(-1).detach().cpu().numpy()
            env_info = env.step(actions)[brain_name]

            state_next = env_info.vector_observations  # get the next state
            state_next_full = np.concatenate(state_next)
            rewards = env_info.rewards  # get the reward
            dones = env_info.local_done  # see if episode has finished

            # add experiences to buffer
            transition = (state, state_full, actions, rewards, state_next,
                          state_next_full, dones)
            buffer.push(transition)

            episode_rewards.append(rewards)
            state, state_full = state_next, state_next_full

            # update once after every steps_per_update
            if len(buffer) > BATCH_SIZE and (episode_t > 0) and (
                    episode_t % steps_per_update == 0):
                # print('maddpg update after {} steps'.format(episode_t))
                for agent_idx in range(len(env_info.agents)):
                    samples = buffer.sample(BATCH_SIZE)
                    al, cl = maddpg_agent.update(samples, agent_idx)
                    actor_losses[agent_idx].append(al)
                    critic_losses[agent_idx].append(cl)
                maddpg_agent.update_targets(
                )  # soft update the target network towards the actual networks

        # calculate agent episode rewards
        agent_episode_rewards = []
        for i in range(len(env_info.agents)):
            agent_episode_reward = 0
            for step in episode_rewards:
                agent_episode_reward += step[i]
            agent_episode_rewards.append(agent_episode_reward)

        scores.append(np.max(agent_episode_rewards))
        scores_window.append(np.max(agent_episode_rewards))

        if episode > 10 and episode % 10 == 0:
            print(
                '\rEpisode {}\tAgent Rewards [{:.4f}\t{:.4f}]\tMax Reward {:.4f}'
                .format(episode,
                        agent_episode_rewards[0], agent_episode_rewards[1],
                        np.max(agent_episode_rewards)))

            print(
                '\rEpisode {}\tAverage Actor 1 Loss {:.6f}\tAverage Critic 1 Loss {:.6f}'
                '\tAverage Actor 2 Loss {:.6f}\tAverage Critic 2 Loss {:.6f}'.
                format(episode, np.mean(actor_losses[0]),
                       np.mean(critic_losses[0]), np.mean(actor_losses[1]),
                       np.mean(critic_losses[1])))

            print('\rEpisode {}\tAverage Score: {:.4f}'.format(
                episode, np.mean(scores_window)))

            # reset losses
            actor_losses = []
            critic_losses = []
            for i in range(len(env_info.agents)):
                actor_losses.append([])
                critic_losses.append([])

        if episode > 100 and episode % 100 == 0:
            print('\rEpisode {}\tAverage Score: {:.4f}'.format(
                episode, np.mean(scores_window)))

        if episode > 100 and np.mean(scores_window) >= 0.5:
            print(
                '\nEnvironment solved in {:d} episodes!\tAverage Score: {:.4f}'
                .format(episode - 100, np.mean(scores_window)))
            for i, save_agent in enumerate(maddpg_agent.agents):
                torch.save(save_agent.actor.state_dict(),
                           './checkpoints/checkpoint_actor_' + str(i) + '.pth')
                torch.save(
                    save_agent.critic.state_dict(),
                    './checkpoints/checkpoint_critic_' + str(i) + '.pth')
            break

    env.close()
    return scores
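
Several examples call seeding() before training without showing it. A minimal sketch that fixes the usual random sources; the original helper may take a seed argument or also seed CUDA.

import random
import numpy as np
import torch

def seeding(seed=1):
    # fix the usual sources of randomness so runs are repeatable
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
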
class DQNAgent:
    def __init__(self,
                 env_name="BreakoutDeterministic-v4",
                 gamma=0.99,
                 batch_size=32,
                 lr=0.00025,
                 update_period=4,
                 target_update_period=10000,
                 n_frames=4):

        self.env_name = env_name

        self.gamma = gamma

        self.batch_size = batch_size

        self.epsilon_scheduler = (
            lambda steps: max(1.0 - 0.9 * steps / 1000000, 0.1))

        self.update_period = update_period

        self.target_update_period = target_update_period

        env = gym.make(self.env_name)

        self.action_space = env.action_space.n

        self.qnet = QNetwork(self.action_space)

        self.target_qnet = QNetwork(self.action_space)

        self.optimizer = Adam(lr=lr, epsilon=0.01 / self.batch_size)

        self.n_frames = n_frames

        self.use_reward_clipping = True

        self.huber_loss = tf.keras.losses.Huber()

    def learn(self, n_episodes, buffer_size=1000000, logdir="log"):

        logdir = Path(__file__).parent / logdir
        if logdir.exists():
            shutil.rmtree(logdir)
        self.summary_writer = tf.summary.create_file_writer(str(logdir))

        self.replay_buffer = ReplayBuffer(max_len=buffer_size)

        steps = 0
        for episode in range(1, n_episodes + 1):
            env = gym.make(self.env_name)

            frame = preprocess_frame(env.reset())
            frames = collections.deque([frame] * self.n_frames,
                                       maxlen=self.n_frames)

            episode_rewards = 0
            episode_steps = 0
            done = False
            lives = 5

            while not done:

                steps, episode_steps = steps + 1, episode_steps + 1

                epsilon = self.epsilon_scheduler(steps)

                state = np.stack(frames, axis=2)[np.newaxis, ...]

                action = self.qnet.sample_action(state, epsilon=epsilon)

                next_frame, reward, done, info = env.step(action)

                episode_rewards += reward

                frames.append(preprocess_frame(next_frame))

                next_state = np.stack(frames, axis=2)[np.newaxis, ...]

                if info["ale.lives"] != lives:
                    lives = info["ale.lives"]
                    transition = (state, action, reward, next_state, True)
                else:
                    transition = (state, action, reward, next_state, done)

                self.replay_buffer.push(transition)

                if len(self.replay_buffer) > 50000:
                    if steps % self.update_period == 0:
                        loss = self.update_network()
                        with self.summary_writer.as_default():
                            tf.summary.scalar("loss", loss, step=steps)
                            tf.summary.scalar("epsilon", epsilon, step=steps)
                            tf.summary.scalar("buffer_size",
                                              len(self.replay_buffer),
                                              step=steps)
                            tf.summary.scalar("train_score",
                                              episode_rewards,
                                              step=steps)
                            tf.summary.scalar("train_steps",
                                              episode_steps,
                                              step=steps)

                    if steps % self.target_update_period == 0:
                        self.target_qnet.set_weights(self.qnet.get_weights())

                if done:
                    break

            print(
                f"Episode: {episode}, score: {episode_rewards}, steps: {episode_steps}"
            )
            if episode % 20 == 0:
                test_scores, test_steps = self.test_play(n_testplay=1)
                with self.summary_writer.as_default():
                    tf.summary.scalar("test_score", test_scores[0], step=steps)
                    tf.summary.scalar("test_step", test_steps[0], step=steps)

            if episode % 1000 == 0:
                self.qnet.save_weights("checkpoints/qnet")

    def update_network(self):

        #: build a minibatch from the replay buffer
        (states, actions, rewards, next_states,
         dones) = self.replay_buffer.get_minibatch(self.batch_size)

        if self.use_reward_clipping:
            rewards = np.clip(rewards, -1, 1)

        next_actions, next_qvalues = self.target_qnet.sample_actions(
            next_states)
        next_actions_onehot = tf.one_hot(next_actions, self.action_space)
        max_next_qvalues = tf.reduce_sum(next_qvalues * next_actions_onehot,
                                         axis=1,
                                         keepdims=True)

        target_q = rewards + self.gamma * (1 - dones) * max_next_qvalues

        with tf.GradientTape() as tape:

            qvalues = self.qnet(states)
            actions_onehot = tf.one_hot(actions.flatten().astype(np.int32),
                                        self.action_space)
            q = tf.reduce_sum(qvalues * actions_onehot, axis=1, keepdims=True)
            loss = self.huber_loss(target_q, q)

        grads = tape.gradient(loss, self.qnet.trainable_variables)
        self.optimizer.apply_gradients(
            zip(grads, self.qnet.trainable_variables))

        return loss

    def test_play(self, n_testplay=1, monitor_dir=None, checkpoint_path=None):

        if checkpoint_path:
            env = gym.make(self.env_name)
            frame = preprocess_frame(env.reset())
            frames = collections.deque([frame] * self.n_frames,
                                       maxlen=self.n_frames)

            state = np.stack(frames, axis=2)[np.newaxis, ...]
            self.qnet(state)
            self.qnet.load_weights(checkpoint_path)

        if monitor_dir:
            monitor_dir = Path(monitor_dir)
            if monitor_dir.exists():
                shutil.rmtree(monitor_dir)
            monitor_dir.mkdir()
            env = gym.wrappers.Monitor(gym.make(self.env_name),
                                       monitor_dir,
                                       force=True,
                                       video_callable=(lambda ep: True))
        else:
            env = gym.make(self.env_name)

        scores = []
        steps = []
        for _ in range(n_testplay):

            frame = preprocess_frame(env.reset())
            frames = collections.deque([frame] * self.n_frames,
                                       maxlen=self.n_frames)

            done = False
            episode_steps = 0
            episode_rewards = 0

            while not done:
                state = np.stack(frames, axis=2)[np.newaxis, ...]
                action = self.qnet.sample_action(state, epsilon=0.05)
                next_frame, reward, done, _ = env.step(action)
                frames.append(preprocess_frame(next_frame))

                episode_rewards += reward
                episode_steps += 1
                if episode_steps > 500 and episode_rewards < 3:
                    #: guard against episodes that stall because the game is never started (agent keeps choosing action 0)
                    break

            scores.append(episode_rewards)
            steps.append(episode_steps)

        return scores, steps
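
The Atari agents above assume a frame_preprocess / preprocess_frame helper that turns a raw RGB frame into the 84x84 grayscale array they stack along axis 2. A minimal Pillow-based sketch; the original projects may crop the playfield or keep uint8 values instead.

import numpy as np
from PIL import Image

def preprocess_frame(frame):
    """RGB Atari frame (210, 160, 3) uint8 -> (84, 84) float32 in [0, 1]."""
    image = Image.fromarray(frame).convert("L").resize((84, 84))
    return np.asarray(image, dtype=np.float32) / 255.0
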
예제 #18
0
def train(env,
          model_path='model_dir',
          number_of_episodes=50000,
          episode_length=500):

    noise = 1.0
    noise_reduction = 1.0
    batchsize = 256

    model_dir = os.getcwd() + "/" + model_path
    model_files = glob.glob(model_dir + "/*.pt")
    for file in model_files:
        os.remove(file)
    os.makedirs(model_dir, exist_ok=True)

    buffer = ReplayBuffer(int(1e5))
    rewards_deque = deque(maxlen=100)
    rewards_total = []

    # initialize policy and critic
    maddpg = MADDPG()

    for episode in range(1, number_of_episodes + 1):

        rewards_this_episode = np.asarray([0.0, 0.0])

        env_info = env.reset(train_mode=True)[brain_name]
        obs = env_info.vector_observations

        for episode_t in range(episode_length):

            actions = maddpg.act(obs, noise=noise)
            noise *= noise_reduction

            env_info = env.step(actions)[brain_name]

            next_obs = env_info.vector_observations
            rewards = env_info.rewards
            dones = env_info.local_done

            # add data to buffer
            transition = (obs, actions, rewards, next_obs, dones)
            buffer.push(transition)

            rewards_this_episode += rewards

            obs = next_obs

            if any(dones):
                break

        # run several update passes once the buffer holds enough samples
        if len(buffer) > batchsize * 4:
            for _ in range(4):
                for a_i in range(num_agents):
                    samples = buffer.sample(batchsize)
                    maddpg.update(samples, a_i)
            maddpg.update_targets(
            )  # soft update the target network towards the actual networks

        rewards_total.append(np.max(rewards_this_episode))
        rewards_deque.append(rewards_total[-1])
        average_score = np.mean(rewards_deque)

        print(episode, rewards_this_episode, rewards_total[-1], average_score)

        # saving model
        save_dict_list = []
        if episode % 1000 == 0:
            for i in range(2):
                save_dict = {
                    'actor_params':
                    maddpg.maddpg_agent[i].actor.state_dict(),
                    'actor_optim_params':
                    maddpg.maddpg_agent[i].actor_optimizer.state_dict(),
                    'critic_params':
                    maddpg.maddpg_agent[i].critic.state_dict(),
                    'critic_optim_params':
                    maddpg.maddpg_agent[i].critic_optimizer.state_dict()
                }
                save_dict_list.append(save_dict)

                torch.save(
                    save_dict_list,
                    os.path.join(model_dir, 'episode-{}.pt'.format(episode)))

                torch.save(maddpg.maddpg_agent[0].actor.state_dict(),
                           'actor0.pt')
                torch.save(maddpg.maddpg_agent[1].actor.state_dict(),
                           'actor1.pt')
                torch.save(maddpg.maddpg_agent[0].critic.state_dict(),
                           'critic0.pt')
                torch.save(maddpg.maddpg_agent[1].critic.state_dict(),
                           'critic1.pt')

    return rewards_total
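
The checkpoints written above are lists with one dict of state_dicts per agent. A minimal sketch of loading such a file back into an existing MADDPG instance; load_checkpoint is a hypothetical helper, and the key names follow the save code above.

import torch

def load_checkpoint(maddpg, path, map_location="cpu"):
    # the file holds a list with one dict of state_dicts per agent
    save_dict_list = torch.load(path, map_location=map_location)
    for agent, save_dict in zip(maddpg.maddpg_agent, save_dict_list):
        agent.actor.load_state_dict(save_dict['actor_params'])
        agent.actor_optimizer.load_state_dict(save_dict['actor_optim_params'])
        agent.critic.load_state_dict(save_dict['critic_params'])
        agent.critic_optimizer.load_state_dict(save_dict['critic_optim_params'])
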
예제 #19
0
def run(config):
    data_folder = Path(config.data_path)
    building_attributes = data_folder / 'building_attributes.json'
    solar_profile = data_folder / 'solar_generation_1kW.csv'
    building_state_actions = 'buildings_state_action_space.json'
    # building_ids = ["Building_" + str(i) for i in range(1, config.num_buildings + 1)]
    config.num_buildings = 6

    # customized log directory
    hidden = config.hidden_dim
    lr = config.lr
    tau = config.tau
    gamma = config.gamma
    batch_size = config.batch_size
    buffer_length = config.buffer_length
    log_path = (f"log_hidden{hidden}_lr{lr}_tau{tau}_gamma{gamma}"
                f"_batch_size{batch_size}_buffer_length{buffer_length}"
                "_TIME_PERIOD_1008_MAXACTION_25/")

    logger = SummaryWriter(log_dir=log_path)
    # TODO fix here
    building_ids = ["Building_" + str(i)
                    for i in [1, 2, 5, 6, 7, 8]]  #[1,2,5,6,7,8]
    env = CityLearn(building_attributes,
                    solar_profile,
                    building_ids,
                    buildings_states_actions=building_state_actions,
                    cost_function=[
                        'ramping', '1-load_factor', 'peak_to_valley_ratio',
                        'peak_demand', 'net_electricity_consumption'
                    ])
    observations_spaces, actions_spaces = env.get_state_action_spaces()

    # Instantiating the control agent(s)
    if config.agent_alg == 'MADDPG':
        agents = MA_DDPG(observations_spaces,
                         actions_spaces,
                         hyper_params=vars(config))
    else:
        raise NotImplementedError

    k, c = 0, 0
    cost, cum_reward = {}, {}
    buffer = ReplayBuffer(max_steps=config.buffer_length,
                          num_agents=config.num_buildings,
                          obs_dims=[s.shape[0] for s in observations_spaces],
                          ac_dims=[a.shape[0] for a in actions_spaces])
    # TODO: store np or tensor in buffer?
    start = time.time()
    for e in range(config.n_episodes):
        cum_reward[e] = 0
        rewards = []
        state = env.reset()
        statecast = lambda x: [torch.FloatTensor(s) for s in x]
        done = False
        ss = 0
        while not done:
            if k % (40000 * 4) == 0:
                print('hour: ' + str(k) + ' of ' +
                      str(TIME_PERIOD * config.n_episodes))
            action = agents.select_action(statecast(state), explore=False)
            action = [a.detach().numpy() for a in action]
            # if batch norm:
            action = [np.squeeze(a, axis=0) for a in action]
            ss += 1
            #print("action is ", action)
            #print(action[0].shape)
            #raise NotImplementedError
            next_state, reward, done, _ = env.step(action)
            reward = reward_function(
                reward)  # See comments in reward_function.py
            #buffer_reward = [-r for r in reward]
            # agents.add_to_buffer()
            buffer.push(statecast(state), action, reward,
                        statecast(next_state), done)
            # if (len(buffer) >= config.batch_size and
            #         (e % config.steps_per_update) < config.n_rollout_threads):
            if len(buffer) >= config.batch_size:
                if USE_CUDA:
                    agents.to_train(device='gpu')
                else:
                    agents.to_train(device='cpu')
                for a_i in range(agents.n_buildings):
                    sample = buffer.sample(config.batch_size, to_gpu=USE_CUDA)
                    agents.update(sample,
                                  a_i,
                                  logger=logger,
                                  global_step=e * TIME_PERIOD + ss)
            logger.add_scalar(tag='net electric consumption',
                              scalar_value=env.net_electric_consumption[-1],
                              global_step=e * TIME_PERIOD + ss)
            logger.add_scalar(tag='env cost total',
                              scalar_value=env.cost()['total'],
                              global_step=e * TIME_PERIOD + ss)
            logger.add_scalar(tag="1 load factor",
                              scalar_value=env.cost()['1-load_factor'],
                              global_step=e * TIME_PERIOD + ss)
            logger.add_scalar(tag="peak to valley ratio",
                              scalar_value=env.cost()['peak_to_valley_ratio'],
                              global_step=e * TIME_PERIOD + ss)
            logger.add_scalar(tag="peak demand",
                              scalar_value=env.cost()['peak_demand'],
                              global_step=e * TIME_PERIOD + ss)
            logger.add_scalar(
                tag="net energy consumption",
                scalar_value=env.cost()['net_electricity_consumption'],
                global_step=e * TIME_PERIOD + ss)
            net_energy_consumption_wo_storage = env.net_electric_consumption[
                -1] + env.electric_generation[
                    -1] - env.electric_consumption_cooling_storage[
                        -1] - env.electric_consumption_dhw_storage[-1]
            logger.add_scalar(tag="net energy consumption without storage",
                              scalar_value=net_energy_consumption_wo_storage,
                              global_step=e * TIME_PERIOD + ss)

            for id, r in enumerate(reward):
                logger.add_scalar(tag="agent {} reward ".format(id),
                                  scalar_value=r,
                                  global_step=e * TIME_PERIOD + ss)

            state = next_state
            cum_reward[e] += reward[0]
            k += 1
            cur_time = time.time()
            # print("average time : {}s/iteration at iteration {}".format((cur_time - start) / (60.0 * k), k))
        cost[e] = env.cost()
        if c % 1 == 0:
            print(cost[e])
        # add env total cost and reward logger
        logger.add_scalar(tag='env cost total final',
                          scalar_value=env.cost()['total'],
                          global_step=e)
        logger.add_scalar(tag="1 load factor final",
                          scalar_value=env.cost()['1-load_factor'],
                          global_step=e)
        logger.add_scalar(tag="peak to valley ratio final",
                          scalar_value=env.cost()['peak_to_valley_ratio'],
                          global_step=e)
        logger.add_scalar(tag="peak demand final",
                          scalar_value=env.cost()['peak_demand'],
                          global_step=e)
        logger.add_scalar(
            tag="net energy consumption final",
            scalar_value=env.cost()['net_electricity_consumption'],
            global_step=e)
        net_energy_consumption_wo_storage = env.net_electric_consumption[
            -1] + env.electric_generation[
                -1] - env.electric_consumption_cooling_storage[
                    -1] - env.electric_consumption_dhw_storage[-1]
        logger.add_scalar(tag="net energy consumption without storage",
                          scalar_value=net_energy_consumption_wo_storage,
                          global_step=e)
        c += 1
        rewards.append(reward)

    end = time.time()
    print((end - start) / 60.0)
    def maddpg(n_episodes=50000, max_t=1000, print_every=100, batchsize=128):
        seeding()
        buffer = ReplayBuffer(int(50000 * max_t))
        noise = 2
        noise_reduction = 0.9999
        scores_deque = deque(maxlen=print_every)
        scores = []
        for i_episode in range(1, n_episodes + 1):
            scores_agents = np.zeros(num_agents)
            env_info = env.reset(train_mode=True)[brain_name]
            states = env_info.vector_observations
            while True:
                # agent chooses actions
                states_converted_to_tensor = convert_to_tensor(states)
                actions = agent.act(states_converted_to_tensor, noise=noise)
                noise *= noise_reduction
                actions_array = torch.stack(actions).detach().numpy()

                # environment takes action and returns new states and rewards
                env_info = env.step(actions_array)[brain_name]
                next_states = env_info.vector_observations
                rewards = env_info.rewards
                dones = env_info.local_done

                # store in shared replay buffer
                experience = (states, actions_array, rewards, next_states,
                              dones)
                buffer.push(experience)

                # update agent with experience sample
                if len(buffer) > batchsize:
                    for a_i in range(2):
                        samples = buffer.sample(batchsize)
                        agent.update(samples, a_i)
                    agent.update_targets(
                    )  # soft update the target network towards the actual networks

                # update episode score with agent rewards
                scores_agents += rewards
                states = next_states
                if np.any(dones):
                    break
            scores_deque.append(np.max(scores_agents))
            scores.append(np.max(scores_agents))
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(
                i_episode, np.mean(scores_deque)),
                  end="")
            if i_episode % print_every == 0:
                print('\rEpisode {}\tAverage Score: {:.2f}'.format(
                    i_episode, np.mean(scores_deque)))
            if np.mean(scores_deque) >= 0.5 and i_episode >= 100:
                print(
                    '\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'
                    .format(i_episode - 100, np.mean(scores_deque)))
                for i, maddpg_agent in zip(range(num_agents),
                                           agent.maddpg_agent):
                    torch.save(maddpg_agent.actor.state_dict(),
                               'checkpoint_actor_{}.pth'.format(i))
                torch.save(agent.critic.state_dict(), 'checkpoint_critic.pth')
                break
        return scores
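
The maddpg() routine above calls convert_to_tensor(states) on the (num_agents, 24) observation array returned by the Unity environment. A minimal sketch of such a helper (hypothetical; the project's version may batch or reshape the observations differently):

import numpy as np
import torch

def convert_to_tensor(states):
    """(num_agents, obs_dim) array -> list of (1, obs_dim) float tensors, one per agent."""
    return [torch.from_numpy(np.asarray(s)).float().unsqueeze(0) for s in states]
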
def main():

    seeding()

    number_of_episodes = 20000
    episode_length = 1000
    batchsize = 256
    save_interval = 1000
    rewards_deque = deque(maxlen=100)
    rewards_all = []
    noise = 1.0
    noise_reduction = 1.0

    log_path = os.getcwd() + "/log"
    model_dir = os.getcwd() + "/model_dir"

    os.makedirs(model_dir, exist_ok=True)
    """ Info about the UnityEnvironment
    brain_name: 'TennisBrain'
    brain: ['brain_name', 'camera_resolutions',
           'num_stacked_vector_observations', 'number_visual_observations',
           'vector_action_descriptions', 'vector_action_space_size',
           'vector_action_space_type', 'vector_observation_space_size',
           'vector_observation_space_type']
    """

    env = UnityEnvironment(file_name="Tennis.app")
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]

    buffer = ReplayBuffer(int(1e5))

    # initialize policy and critic
    maddpg = MADDPG()
    logger = SummaryWriter(log_dir=log_path)

    # ------------------------------ training ------------------------------ #
    # show progressbar
    import progressbar as pb
    widget = [
        'episode: ',
        pb.Counter(), '/',
        str(number_of_episodes), ' ',
        pb.Percentage(), ' ',
        pb.ETA(), ' ',
        pb.Bar(marker=pb.RotatingMarker()), ' '
    ]

    timer = pb.ProgressBar(widgets=widget, maxval=number_of_episodes).start()

    for episode in range(1, number_of_episodes + 1):

        timer.update(episode)
        rewards_this_episode = np.zeros((2, ))
        """ Info about the UnityEnvironment
        env_info: ['agents', 'local_done', 'max_reached', 'memories',
                  'previous_text_actions', 'previous_vector_actions', 'rewards',
                  'text_observations', 'vector_observations', 'visual_observations']
        actions: List(num_agents=2, action_size=2)
        states: List((24,), (24,))
        rewards: List(2,)
        dones: List(2,)
        """
        env_info = env.reset(train_mode=True)[brain_name]
        states = env_info.vector_observations

        for episode_t in range(episode_length):
            # reset the OUNoise for each agent.
            for i in range(2):
                maddpg.maddpg_agent[i].noise.reset()

            actions = maddpg.act(states, noise=noise)
            env_info = env.step(actions)[brain_name]
            noise *= noise_reduction

            next_states = env_info.vector_observations
            rewards = env_info.rewards
            dones = env_info.local_done

            # add data to buffer
            transition = (states, actions, rewards, next_states, dones)
            buffer.push(transition)

            rewards_this_episode += rewards

            states = next_states

            if any(dones):
                break

        # update the local and target network
        if len(buffer) > batchsize:
            # update the local network
            for _ in range(5):
                for a_i in range(2):
                    samples = buffer.sample(batchsize)
                    maddpg.update(samples, a_i, logger)
            # soft update the target network
            maddpg.update_targets()

        rewards_all.append(rewards_this_episode)
        rewards_deque.append(np.max(rewards_this_episode))
        average_score = np.mean(rewards_deque)

        # --------------------- Logging for TensorBoard --------------------- #
        logger.add_scalars('rewards', {
            'agent0': rewards_this_episode[0],
            'agent1': rewards_this_episode[1]
        }, episode)
        logger.add_scalars('global', {
            'score': np.max(rewards_this_episode),
            'average_score': average_score
        }, episode)
        # -------------------------- Save the model -------------------------- #
        save_dict_list = []

        if episode % save_interval == 0 or average_score >= 0.5:
            for i in range(2):
                save_dict = \
                    {'actor_params' : maddpg.maddpg_agent[i].actor.state_dict(),
                     'actor_optim_params': maddpg.maddpg_agent[i].actor_optimizer.state_dict(),
                     'critic_params' : maddpg.maddpg_agent[i].critic.state_dict(),
                     'critic_optim_params' : maddpg.maddpg_agent[i].critic_optimizer.state_dict()}
                save_dict_list.append(save_dict)

                torch.save(
                    save_dict_list,
                    os.path.join(model_dir, 'episode-{}.pt'.format(episode)))

            if average_score >= 3.0:
                print('\nEnvironment solved in {} episodes!'.format(episode -
                                                                    100))
                print('\nAverage Score: {:.2f}'.format(average_score))
                break

    env.close()
    logger.close()
    timer.finish()
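
The checkpoint written above is a list of per-agent dictionaries holding actor/critic weights and optimizer states. A minimal reload sketch, assuming the no-argument MADDPG() constructor used elsewhere in these examples and a hypothetical episode-1000.pt file:

import os

import torch

# Hypothetical checkpoint produced by the loop above; adjust the episode number.
checkpoint_file = os.path.join(os.getcwd() + "/model_dir", 'episode-1000.pt')
save_dict_list = torch.load(checkpoint_file, map_location='cpu')

maddpg = MADDPG()  # assumed to build the same agents as during training
for i, save_dict in enumerate(save_dict_list):
    agent = maddpg.maddpg_agent[i]
    agent.actor.load_state_dict(save_dict['actor_params'])
    agent.actor_optimizer.load_state_dict(save_dict['actor_optim_params'])
    agent.critic.load_state_dict(save_dict['critic_params'])
    agent.critic_optimizer.load_state_dict(save_dict['critic_optim_params'])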
class SAC:

    MAX_EXPERIENCES = 100000

    MIN_EXPERIENCES = 512

    UPDATE_PERIOD = 4

    GAMMA = 0.99

    TAU = 0.005

    BATCH_SIZE = 256

    def __init__(self, env_id, action_space, action_bound):

        self.env_id = env_id

        self.action_space = action_space

        self.action_bound = action_bound

        self.env = gym.make(self.env_id)

        self.replay_buffer = ReplayBuffer(max_len=self.MAX_EXPERIENCES)

        self.policy = GaussianPolicy(action_space=self.action_space,
                                     action_bound=self.action_bound)

        self.dualqnet = DualQNetwork()

        self.target_dualqnet = DualQNetwork()

        self.log_alpha = tf.Variable(0.)  #: alpha=1

        self.alpha_optimizer = tf.keras.optimizers.Adam(3e-4)

        self.target_entropy = -0.5 * self.action_space

        self.global_steps = 0

        self._initialize_weights()

    def _initialize_weights(self):
        """1度callすることでネットワークの重みを初期化
        """

        env = gym.make(self.env_id)

        dummy_state = env.reset()
        dummy_state = (dummy_state[np.newaxis, ...]).astype(np.float32)

        dummy_action = np.random.normal(0, 0.1, size=self.action_space)
        dummy_action = (dummy_action[np.newaxis, ...]).astype(np.float32)

        self.policy(dummy_state)

        self.dualqnet(dummy_state, dummy_action)
        self.target_dualqnet(dummy_state, dummy_action)
        self.target_dualqnet.set_weights(self.dualqnet.get_weights())

    def play_episode(self):

        episode_reward = 0

        local_steps = 0

        done = False

        state = self.env.reset()

        while not done:

            action, _ = self.policy.sample_action(np.atleast_2d(state))

            action = action.numpy()[0]

            next_state, reward, done, _ = self.env.step(action)

            exp = Experience(state, action, reward, next_state, done)

            self.replay_buffer.push(exp)

            state = next_state

            episode_reward += reward

            local_steps += 1

            self.global_steps += 1

            if (len(self.replay_buffer) >= self.MIN_EXPERIENCES
               and self.global_steps % self.UPDATE_PERIOD == 0):

                self.update_networks()

        return episode_reward, local_steps, tf.exp(self.log_alpha)

    def update_networks(self):

        (states, actions, rewards,
         next_states, dones) = self.replay_buffer.get_minibatch(self.BATCH_SIZE)

        alpha = tf.math.exp(self.log_alpha)

        #: Update Q-function
        next_actions, next_logprobs = self.policy.sample_action(next_states)

        target_q1, target_q2 = self.target_dualqnet(next_states, next_actions)

        target = rewards + (1 - dones) * self.GAMMA * (
            tf.minimum(target_q1, target_q2) - alpha * next_logprobs)

        with tf.GradientTape() as tape:
            q1, q2 = self.dualqnet(states, actions)
            loss_1 = tf.reduce_mean(tf.square(target - q1))
            loss_2 = tf.reduce_mean(tf.square(target - q2))
            loss = 0.5 * loss_1 + 0.5 * loss_2

        variables = self.dualqnet.trainable_variables
        grads = tape.gradient(loss, variables)
        self.dualqnet.optimizer.apply_gradients(zip(grads, variables))

        #: Update policy
        with tf.GradientTape() as tape:
            selected_actions, logprobs = self.policy.sample_action(states)
            q1, q2 = self.dualqnet(states, selected_actions)
            q_min = tf.minimum(q1, q2)
            loss = -1 * tf.reduce_mean(q_min - alpha * logprobs)

        variables = self.policy.trainable_variables
        grads = tape.gradient(loss, variables)
        self.policy.optimizer.apply_gradients(zip(grads, variables))

        #: Adjust alpha (the entropy temperature)
        #: entropy_diff reuses the log-probs computed above and is a constant w.r.t. log_alpha
        entropy_diff = -1 * logprobs - self.target_entropy
        with tf.GradientTape() as tape:
            tape.watch(self.log_alpha)
            alpha_loss = tf.reduce_mean(tf.exp(self.log_alpha) * entropy_diff)

        grad = tape.gradient(alpha_loss, self.log_alpha)
        self.alpha_optimizer.apply_gradients([(grad, self.log_alpha)])

        #: Soft (Polyak) target update (dtype=object keeps the ragged per-layer weight lists intact)
        self.target_dualqnet.set_weights(
           (1 - self.TAU) * np.array(self.target_dualqnet.get_weights(), dtype=object)
           + self.TAU * np.array(self.dualqnet.get_weights(), dtype=object)
           )

    def save_model(self):

        self.policy.save_weights("checkpoints/actor")

        self.dualqnet.save_weights("checkpoints/critic")

    def load_model(self):

        self.policy.load_weights("checkpoints/actor")

        self.dualqnet.load_weights("checkpoints/critic")

        self.target_dualqnet.load_weights("checkpoints/critic")

    def testplay(self, n=1, monitordir=None):

        if monitordir:
            env = wrappers.Monitor(gym.make(self.env_id),
                                   monitordir, force=True,
                                   video_callable=(lambda ep: True))
        else:
            env = gym.make(self.env_id)

        total_rewards = []

        for _ in range(n):

            state = env.reset()

            done = False

            total_reward = 0

            while not done:

                action, _ = self.policy.sample_action(np.atleast_2d(state))

                action = action.numpy()[0]

                next_state, reward, done, _ = env.step(action)

                total_reward += reward

                if done:
                    break
                else:
                    state = next_state

            total_rewards.append(total_reward)
            print()
            print(total_reward)
            print()

        return total_rewards
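
A minimal driver sketch for the SAC class above, assuming a 1-dimensional continuous-control task such as Pendulum-v0 with action bound 2.0 (GaussianPolicy, DualQNetwork, ReplayBuffer and Experience come from the surrounding project); the episode budget is illustrative:

if __name__ == "__main__":
    agent = SAC(env_id="Pendulum-v0", action_space=1, action_bound=2.0)

    for episode in range(100):  # illustrative number of episodes
        episode_reward, steps, alpha = agent.play_episode()
        print(f"episode {episode}: reward={episode_reward:.1f}, "
              f"steps={steps}, alpha={float(alpha):.3f}")

    agent.save_model()
    agent.testplay(n=1, monitordir="mp4")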
Example #23
if args.train:
    # training loop
    for eps in range(max_episodes):
        state = env.reset()
        episode_reward = 0
        
        for step in range(max_steps):
            if frame_idx > explore_steps:
                action = sac_trainer.policy_net.get_action(state, deterministic=DETERMINISTIC, device=device)
            else:
                action = sac_trainer.policy_net.sample_action()
    
            next_state, reward, done, _ = env.step(action)

            replay_buffer.push(state, action, reward, next_state, done)
            
            state = next_state
            episode_reward += reward
            frame_idx += 1

            if len(replay_buffer) > batch_size:
                for i in range(update_itr):
                    _=sac_trainer.update(batch_size, device, reward_scale=10., auto_entropy=AUTO_ENTROPY, target_entropy=-1.*action_dim)

            if done:
                break

        rewards.append(episode_reward)  # `rewards` is a list assumed to be defined before this loop

        if eps % 20 == 0 and eps > 0:  # plot and model saving interval
            plot(rewards)
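
This excerpt depends on names defined outside it (sac_trainer, replay_buffer, max_episodes, explore_steps, rewards, plot, ...). A minimal matplotlib stand-in for plot(), assuming rewards is a plain list of episode returns:

import matplotlib.pyplot as plt

def plot(rewards, path='sac_rewards.png'):
    # Save the learning curve to disk rather than opening a window,
    # so the sketch also works on a headless machine.
    plt.figure(figsize=(8, 4))
    plt.plot(rewards)
    plt.xlabel('episode')
    plt.ylabel('episode reward')
    plt.savefig(path)
    plt.close()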
Example #24
def main():
    env = UnityEnvironment(file_name="Tennis_Linux/Tennis.x86_64",
                           worker_id=1,
                           seed=1)
    env_date = str(datetime.datetime.now())
    file_path = os.path.join('data', env_date)

    os.makedirs(file_path, exist_ok=True)
    save_config(file_path)

    brain_name = env.brain_names[0]

    buffer = ReplayBuffer(Config.buffer_size)
    maddpg = MADDPGUnity(cfg=Config,
                         tau=Config.tau,
                         discount_factor=Config.discount_factor,
                         checkpoint_path=Config.checkpoint_path)

    agent1_reward, agent0_reward, all_rewards_mean = [], [], []
    batchsize = Config.batchsize
    max_reward = Config.max_reward
    # amplitude of OU noise
    # this slowly decreases to 0
    noise = Config.noise_beginning

    logger = logging.getLogger('Tennis MADDPG')
    all_rewards = []
    for episode in range(Config.n_episodes):
        reward_this_episode = np.zeros(2)
        env_info = env.reset(train_mode=True)[brain_name]
        states = env_info.vector_observations  # get the current state (for each agent)
        scores = np.zeros(2)  # initialize the score (for each agent)
        n_of_steps = 0
        noise = max(
            Config.min_noise,
            Config.noise_beginning *
            (Config.n_episodes - episode) / Config.n_episodes)
        while True:
            n_of_steps += 1

            states_tensor = list(map(torch.tensor, states))
            states_tensor = [a.float() for a in states_tensor]
            actions = maddpg.act(states_tensor, noise=noise)
            actions_array = torch.stack(actions).detach().numpy()
            actions_for_env = np.rollaxis(actions_array, 1)
            actions_for_env = np.clip(actions_for_env, -1,
                                      1)  # all actions between -1 and 1

            env_info = env.step(actions_for_env)[
                brain_name]  # send all actions to the environment

            states_next = env_info.vector_observations

            # if Config.replay_buffer_raward_min is set, only add transitions once the episode reward reaches that minimum
            reward_this_episode += np.array(env_info.rewards)
            if Config.replay_buffer_raward_min and max(
                    reward_this_episode) >= Config.replay_buffer_raward_min:
                buffer_data = (states, actions_for_env, env_info.rewards,
                               states_next, env_info.local_done)
                buffer.push(buffer_data)

            if not Config.replay_buffer_raward_min:
                buffer_data = (states, actions_for_env, env_info.rewards,
                               states_next, env_info.local_done)

                buffer.push(buffer_data)

            dones = env_info.local_done  # see if episode finished
            scores += env_info.rewards  # update the score (for each agent)
            states = states_next  # roll over states to next time step
            if np.any(dones):  # exit loop if episode finished
                break

        all_rewards.append(max(reward_this_episode[0], reward_this_episode[1]))
        all_rewards_mean.append(np.mean(all_rewards[-100:]))
        agent0_reward.append(reward_this_episode[0])
        agent1_reward.append(reward_this_episode[1])
        if len(buffer) > Config.warmup:
            for i in range(2):
                samples = buffer.sample(batchsize)
                maddpg.update(samples, i, logger)
            if episode % Config.update_episode_n == 0:
                maddpg.update_targets(
                )  # soft update the target network towards the actual networks
            maddpg.iter += 1

        if (episode + 1) % 100 == 0 or episode == Config.n_episodes - 1:
            logger.info(
                f'Average reward of agent0 is {np.mean(agent0_reward)}')
            logger.info(
                f'Average reward of agent1 is {np.mean(agent1_reward)}')
            if all_rewards_mean and all_rewards_mean[-1] > max_reward:
                max_reward = max(np.mean(agent0_reward),
                                 np.mean(agent1_reward))
                logger.info('Found best model. Saving model into file: ...')

                save_dict_list = []
                for i in range(2):
                    save_dict = {
                        'actor_params':
                        maddpg.maddpg_agent[i].actor.state_dict(),
                        'actor_target_params':
                        maddpg.maddpg_agent[i].actor.state_dict(),
                        'actor_optim_params':
                        maddpg.maddpg_agent[i].actor_optimizer.state_dict(),
                        'critic_params':
                        maddpg.maddpg_agent[i].critic.state_dict(),
                        'critic_optim_params':
                        maddpg.maddpg_agent[i].critic_optimizer.state_dict()
                    }

                    save_dict_list.append(save_dict)

                    torch.save(
                        save_dict_list,
                        os.path.join(file_path,
                                     'episode-{}.pt'.format(episode)))
            agent0_reward = []
            agent1_reward = []
            plt.plot(all_rewards_mean)
            plt.savefig(os.path.join(file_path, 'result_plot.png'))
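
The ReplayBuffer that these examples construct from a single capacity argument is not shown. A minimal deque-based sketch of the push / sample / __len__ interface the training loops rely on; the project's real buffer may batch and transpose transitions differently before they reach maddpg.update:

import random
from collections import deque

class ReplayBuffer:
    """Fixed-capacity FIFO buffer of transitions with uniform random sampling."""

    def __init__(self, size):
        self.memory = deque(maxlen=size)

    def push(self, transition):
        # transition is whatever tuple the caller assembled, e.g.
        # (states, actions, rewards, next_states, dones)
        self.memory.append(transition)

    def sample(self, batchsize):
        # Uniform random sample of `batchsize` stored transitions.
        return random.sample(self.memory, batchsize)

    def __len__(self):
        return len(self.memory)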
Example #25
def main():
    seeding()
    # number of parallel agents
    parallel_envs = 4
    # number of training episodes.
    # change this to higher number to experiment. say 30000.
    number_of_episodes = 10000
    episode_length = 100
    batchsize = 1000
    # how many episodes to save policy and gif
    save_interval = 5000
    # t counts environment steps summed across all parallel environments
    t = 0

    # amplitude of OU noise
    # this slowly decreases to 0
    noise = 2
    noise_reduction = 0.9999

    # how many episodes before update
    episode_per_update = 2 * parallel_envs

    log_path = os.getcwd() + "/log"
    model_dir = os.getcwd() + "/model_dir"

    os.makedirs(model_dir, exist_ok=True)

    torch.set_num_threads(parallel_envs)
    # create the vectorized environment wrapping `parallel_envs` copies
    env = envs.make_parallel_env(parallel_envs)

    # keep 5000 episodes worth of replay
    buffer = ReplayBuffer(int(5000 * episode_length))

    # initialize policy and critic
    # this creates a list of models, each element in the list refers to an agent in the simulation
    # [agent_one_ddpg, agent_two_ddpg, ...]
    # agent_one_ddpg contains the agent actor and critic models,e.g., agent_one_ddpg.actor, agent_one_ddpg.critic
    maddpg = MADDPG()
    logger = SummaryWriter(log_dir=log_path)
    agent0_reward = []
    agent1_reward = []
    agent2_reward = []

    # training loop
    # show progressbar
    import progressbar as pb
    widget = [
        'episode: ',
        pb.Counter(), '/',
        str(number_of_episodes), ' ',
        pb.Percentage(), ' ',
        pb.ETA(), ' ',
        pb.Bar(marker=pb.RotatingMarker()), ' '
    ]

    timer = pb.ProgressBar(widgets=widget, maxval=number_of_episodes).start()

    # use keep_awake to keep workspace from disconnecting
    # for episode in keep_awake(range(0, number_of_episodes, parallel_envs)):
    # notice we jump forward by number of parallel environments
    for episode in range(0, number_of_episodes, parallel_envs):
        timer.update(episode)

        # rewards are accumulated per parallel environment and per agent: shape (parallel_envs, 3)
        reward_this_episode = np.zeros((parallel_envs, 3))
        # obs is the observation of all three agents in the 4 parallel envs;
        # for the Physical Deception environment with three agents it has dimension 4x3x14.
        # obs_full is the world state irrespective of the agents and its dimension is 4x14.
        # all_obs = array(4 environments, 2 elements):
        #   element 0: a list of 3 arrays, the state of each agent, each of size 14
        #   element 1: the global state from the perspective of the target/green agent, 14 elements
        all_obs = env.reset()
        # obs: a list with one element per environment; each element is a list of 3 arrays,
        #   one per agent, holding that agent's state.
        # obs_full: the god's-eye view of each environment; a list with one element per
        #   environment, each an array of 14 values giving that environment's global state.
        obs, obs_full = transpose_list(all_obs)

        # rewards for this particular episode are accumulated over all time steps

        # save info or not
        save_info = (episode % save_interval < parallel_envs
                     or episode == number_of_episodes - parallel_envs)
        frames = []
        tmax = 0

        if save_info:
            frames.append(env.render('rgb_array'))

        for episode_t in range(episode_length):
            # we finish the episode before sampling the buffer for training
            # t jumps forward by the number of parallel environments
            t += parallel_envs

            # explore = only explore for a certain number of episodes
            # the action input needs to be transposed:
            # transpose_to_tensor(obs) rearranges the data to each agent's point of view.
            # With 4 environments there are 4 copies of agent 1, agent 2 and agent 3, so agent 1's
            # states across the 4 environments form a 4x14 tensor.
            # transpose_to_tensor(obs) is therefore a list of 3 elements, one per agent, each a
            # 4x14 array of that agent's observations across the 4 environments.
            # maddpg.act loops over these elements and passes each one to the corresponding
            # agent's actor model to generate that agent's action.
            actions = maddpg.act(transpose_to_tensor(obs), noise=noise)
            noise *= noise_reduction
            # there are 4 actions per agent and 3 agents, 12 in total; each action has 2 elements (force in x and y)
            # actions_array is a tensor of shape (3 agents, 4 envs, 2 actions)
            actions_array = torch.stack(actions).detach().numpy()

            # transpose the list of list
            # flip the first two indices
            # input to step requires the first index to correspond to number of parallel agents
            # the shape of actions_for_env is (4 env, 3 agent, 2 action)
            actions_for_env = np.rollaxis(actions_array, 1)

            # step forward one frame
            # obs is the observation of all three agents in the 4 parallel envs;
            # for the Physical Deception environment with three agents it has dimension 4x3x14.
            # obs_full is the world state irrespective of the agents and its dimension is 4x14.
            # To gain more understanding, please see the code in the multiagent folder.
            next_obs, next_obs_full, rewards, dones, info = env.step(
                actions_for_env)

            # add data to buffer
            transition = (obs, obs_full, actions_for_env, rewards, next_obs,
                          next_obs_full, dones)

            buffer.push(transition)

            reward_this_episode += rewards

            obs, obs_full = next_obs, next_obs_full

            # save gif frame
            if save_info:
                frames.append(env.render('rgb_array'))
                tmax += 1

        # update once after every episode_per_update
        if len(buffer
               ) > batchsize and episode % episode_per_update < parallel_envs:
            for a_i in range(3):
                # Although samples are drawn randomly, each sample contains all 3 agents' data,
                # so we know which rewards and actions belong to which agent.
                # samples is a list of 7 elements: obs, obs_full, action, reward, next_obs, next_obs_full, done.
                # Each element, e.g. samples[0], is a list of 3 entries, one per agent; each entry holds
                # that agent's value (for obs, a vector with 14 values).
                # Asking for 2 samples therefore returns 2 transitions, each containing all 3 agents' states and rewards.
                samples = buffer.sample(batchsize)
                maddpg.update(samples, a_i, logger)
            maddpg.update_targets(
            )  # soft update the target networks towards the actual networks

        for i in range(parallel_envs):
            agent0_reward.append(reward_this_episode[i, 0])
            agent1_reward.append(reward_this_episode[i, 1])
            agent2_reward.append(reward_this_episode[i, 2])

        if episode % 100 == 0 or episode == number_of_episodes - 1:
            avg_rewards = [
                np.mean(agent0_reward),
                np.mean(agent1_reward),
                np.mean(agent2_reward)
            ]
            agent0_reward = []
            agent1_reward = []
            agent2_reward = []
            for a_i, avg_rew in enumerate(avg_rewards):
                logger.add_scalar('agent%i/mean_episode_rewards' % a_i,
                                  avg_rew, episode)

        # saving the model
        save_dict_list = []
        if save_info:
            for i in range(3):

                save_dict = {
                    'actor_params':
                    maddpg.maddpg_agent[i].actor.state_dict(),
                    'actor_optim_params':
                    maddpg.maddpg_agent[i].actor_optimizer.state_dict(),
                    'critic_params':
                    maddpg.maddpg_agent[i].critic.state_dict(),
                    'critic_optim_params':
                    maddpg.maddpg_agent[i].critic_optimizer.state_dict()
                }
                save_dict_list.append(save_dict)

                torch.save(
                    save_dict_list,
                    os.path.join(model_dir, 'episode-{}.pt'.format(episode)))

            # save gif files
            imageio.mimsave(os.path.join(model_dir,
                                         'episode-{}.gif'.format(episode)),
                            frames,
                            duration=.04)

    env.close()
    logger.close()
    timer.finish()
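
transpose_list and transpose_to_tensor come from a utilities module that is not part of this excerpt. A plausible sketch, assuming the per-environment / per-agent nesting described in the comments above:

import torch

def transpose_list(mylist):
    # [[env0_agent0, env0_agent1, ...], [env1_agent0, ...], ...]
    #   -> [[env0_agent0, env1_agent0, ...], [env0_agent1, env1_agent1, ...], ...]
    return list(map(list, zip(*mylist)))

def transpose_to_tensor(input_list):
    # Same transpose, but each per-agent group is stacked into a float tensor,
    # e.g. agent 1's observations across 4 environments become a 4x14 tensor.
    make_tensor = lambda x: torch.tensor(x, dtype=torch.float)
    return list(map(make_tensor, zip(*input_list)))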
class CategoricalDQNAgent:
    def __init__(self,
                 env_name="BreakoutDeterministic-v4",
                 n_atoms=51,
                 Vmin=-10,
                 Vmax=10,
                 gamma=0.98,
                 n_frames=4,
                 batch_size=32,
                 lr=0.00025,
                 init_epsilon=0.95,
                 update_period=8,
                 target_update_period=10000):

        self.env_name = env_name

        self.n_atoms = n_atoms

        self.Vmin, self.Vmax = Vmin, Vmax

        self.delta_z = (self.Vmax - self.Vmin) / (self.n_atoms - 1)

        self.Z = np.linspace(self.Vmin, self.Vmax, self.n_atoms)

        self.gamma = gamma

        self.n_frames = n_frames

        self.batch_size = batch_size

        self.init_epsilon = init_epsilon

        self.epsilon_scheduler = (
            lambda steps: max(0.98 * (500000 - steps) / 500000, 0.1)
            if steps < 500000 else max(
                0.05 + 0.05 * (1000000 - steps) / 500000, 0.05))

        self.update_period = update_period

        self.target_update_period = target_update_period

        env = gym.make(self.env_name)

        self.action_space = env.action_space.n

        self.qnet = CategoricalQNet(self.action_space, self.n_atoms, self.Z)

        self.target_qnet = CategoricalQNet(self.action_space, self.n_atoms,
                                           self.Z)

        self.optimizer = tf.keras.optimizers.Adam(learning_rate=lr,
                                                  epsilon=0.01 / batch_size)

    def learn(self, n_episodes, buffer_size=800000, logdir="log"):

        logdir = Path(__file__).parent / logdir
        if logdir.exists():
            shutil.rmtree(logdir)
        self.summary_writer = tf.summary.create_file_writer(str(logdir))

        self.replay_buffer = ReplayBuffer(max_len=buffer_size)

        steps = 0
        for episode in range(1, n_episodes + 1):
            env = gym.make(self.env_name)

            frames = collections.deque(maxlen=4)
            frame = frame_preprocess(env.reset())
            for _ in range(self.n_frames):
                frames.append(frame)

            #: initialize the network weights
            state = np.stack(frames, axis=2)[np.newaxis, ...]
            self.qnet(state)
            self.target_qnet(state)
            self.target_qnet.set_weights(self.qnet.get_weights())

            episode_rewards = 0
            episode_steps = 0

            done = False
            lives = 5
            while not done:

                steps += 1
                episode_steps += 1

                epsilon = self.epsilon_scheduler(steps)

                state = np.stack(frames, axis=2)[np.newaxis, ...]
                action = self.qnet.sample_action(state, epsilon=epsilon)
                next_frame, reward, done, info = env.step(action)
                episode_rewards += reward
                frames.append(frame_preprocess(next_frame))
                next_state = np.stack(frames, axis=2)[np.newaxis, ...]

                if done:
                    exp = Experience(state, action, reward, next_state, done)
                    self.replay_buffer.push(exp)
                    break
                else:
                    if info["ale.lives"] != lives:
                        lives = info["ale.lives"]
                        exp = Experience(state, action, reward, next_state,
                                         True)
                    else:
                        exp = Experience(state, action, reward, next_state,
                                         done)

                    self.replay_buffer.push(exp)

                if (len(self.replay_buffer) >
                        20000) and (steps % self.update_period == 0):
                    loss = self.update_network()

                    with self.summary_writer.as_default():
                        tf.summary.scalar("loss", loss, step=steps)
                        tf.summary.scalar("epsilon", epsilon, step=steps)
                        tf.summary.scalar("buffer_size",
                                          len(self.replay_buffer),
                                          step=steps)
                        tf.summary.scalar("train_score",
                                          episode_rewards,
                                          step=steps)
                        tf.summary.scalar("train_steps",
                                          episode_steps,
                                          step=steps)

                #: Hard target update
                if steps % self.target_update_period == 0:
                    self.target_qnet.set_weights(self.qnet.get_weights())

            print(
                f"Episode: {episode}, score: {episode_rewards}, steps: {episode_steps}"
            )

            if episode % 20 == 0:
                test_scores, test_steps = self.test_play(n_testplay=1)
                with self.summary_writer.as_default():
                    tf.summary.scalar("test_score", test_scores[0], step=steps)
                    tf.summary.scalar("test_step", test_steps[0], step=steps)

            if episode % 1000 == 0:
                print("Model Saved")
                self.qnet.save_weights("checkpoints/qnet")

    def update_network(self):

        #: build a minibatch
        (states, actions, rewards, next_states,
         dones) = self.replay_buffer.get_minibatch(self.batch_size)

        next_actions, next_probs = self.target_qnet.sample_actions(next_states)

        #: extract only the probability distribution of the selected action
        onehot_mask = self.create_mask(next_actions)
        next_dists = tf.reduce_sum(next_probs * onehot_mask, axis=1).numpy()

        #: apply the distributional Bellman operator
        target_dists = self.shift_and_projection(rewards, dones, next_dists)

        onehot_mask = self.create_mask(actions)
        with tf.GradientTape() as tape:
            probs = self.qnet(states)

            dists = tf.reduce_sum(probs * onehot_mask, axis=1)
            #: clip to avoid exploding gradients when taking the log
            dists = tf.clip_by_value(dists, 1e-6, 1.0)

            loss = tf.reduce_sum(-1 * target_dists * tf.math.log(dists),
                                 axis=1,
                                 keepdims=True)
            loss = tf.reduce_mean(loss)

        grads = tape.gradient(loss, self.qnet.trainable_variables)
        self.optimizer.apply_gradients(
            zip(grads, self.qnet.trainable_variables))

        return loss

    def shift_and_projection(self, rewards, dones, next_dists):

        target_dists = np.zeros((self.batch_size, self.n_atoms))

        for j in range(self.n_atoms):

            tZ_j = np.minimum(
                self.Vmax,
                np.maximum(self.Vmin, rewards + self.gamma * self.Z[j]))
            bj = (tZ_j - self.Vmin) / self.delta_z

            lower_bj = np.floor(bj).astype(np.int8)
            upper_bj = np.ceil(bj).astype(np.int8)

            eq_mask = lower_bj == upper_bj
            neq_mask = lower_bj != upper_bj

            lower_probs = 1 - (bj - lower_bj)
            upper_probs = 1 - (upper_bj - bj)

            next_dist = next_dists[:, [j]]
            indices = np.arange(self.batch_size).reshape(-1, 1)

            target_dists[indices[neq_mask],
                         lower_bj[neq_mask]] += (lower_probs *
                                                 next_dist)[neq_mask]
            target_dists[indices[neq_mask],
                         upper_bj[neq_mask]] += (upper_probs *
                                                 next_dist)[neq_mask]

            target_dists[indices[eq_mask],
                         lower_bj[eq_mask]] += (0.5 * next_dist)[eq_mask]
            target_dists[indices[eq_mask],
                         upper_bj[eq_mask]] += (0.5 * next_dist)[eq_mask]
        """ 2. doneへの対処
            doneのときは TZ(t) = R(t)
        """
        for batch_idx in range(self.batch_size):

            if not dones[batch_idx]:
                continue
            else:
                target_dists[batch_idx, :] = 0
                tZ = np.minimum(self.Vmax,
                                np.maximum(self.Vmin, rewards[batch_idx]))
                bj = (tZ - self.Vmin) / self.delta_z

                lower_bj = np.floor(bj).astype(np.int32)
                upper_bj = np.ceil(bj).astype(np.int32)

                if lower_bj == upper_bj:
                    target_dists[batch_idx, lower_bj] += 1.0
                else:
                    target_dists[batch_idx, lower_bj] += 1 - (bj - lower_bj)
                    target_dists[batch_idx, upper_bj] += 1 - (upper_bj - bj)

        return target_dists

    def create_mask(self, actions):

        mask = np.ones((self.batch_size, self.action_space, self.n_atoms))
        actions_onehot = tf.one_hot(tf.cast(actions, tf.int32),
                                    self.action_space,
                                    axis=1)

        for idx in range(self.batch_size):
            mask[idx, ...] = mask[idx, ...] * actions_onehot[idx, ...]

        return mask

    def test_play(self, n_testplay=1, monitor_dir=None, checkpoint_path=None):

        if checkpoint_path:
            env = gym.make(self.env_name)
            frames = collections.deque(maxlen=4)
            frame = frame_preprocess(env.reset())
            for _ in range(self.n_frames):
                frames.append(frame)
            state = np.stack(frames, axis=2)[np.newaxis, ...]
            self.qnet(state)
            self.qnet.load_weights(checkpoint_path)

        if monitor_dir:
            monitor_dir = Path(monitor_dir)
            if monitor_dir.exists():
                shutil.rmtree(monitor_dir)
            monitor_dir.mkdir()
            env = gym.wrappers.Monitor(gym.make(self.env_name),
                                       monitor_dir,
                                       force=True,
                                       video_callable=(lambda ep: True))
        else:
            env = gym.make(self.env_name)

        scores = []
        steps = []
        for _ in range(n_testplay):

            frames = collections.deque(maxlen=4)

            frame = frame_preprocess(env.reset())
            for _ in range(self.n_frames):
                frames.append(frame)

            done = False
            episode_steps = 0
            episode_rewards = 0

            while not done:
                state = np.stack(frames, axis=2)[np.newaxis, ...]
                action = self.qnet.sample_action(state, epsilon=0.1)
                next_frame, reward, done, info = env.step(action)
                frames.append(frame_preprocess(next_frame))

                episode_rewards += reward
                episode_steps += 1
                if episode_steps > 500 and episode_rewards < 3:
                    #: handle the case where play stalls without starting the game (action: 0)
                    break

            scores.append(episode_rewards)
            steps.append(episode_steps)

        return scores, steps
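
To make shift_and_projection concrete, here is a tiny standalone example of the same categorical projection with 5 atoms on [-2, 2]: a shifted atom value that falls between two support points splits its probability mass in proportion to its distance from each neighbour (the numbers are illustrative only):

import numpy as np

Vmin, Vmax, n_atoms = -2.0, 2.0, 5
delta_z = (Vmax - Vmin) / (n_atoms - 1)              # 1.0
Z = np.linspace(Vmin, Vmax, n_atoms)                 # [-2, -1, 0, 1, 2]

gamma, reward = 0.9, 0.5
next_dist = np.array([0.0, 0.0, 1.0, 0.0, 0.0])      # all mass on the atom z = 0

target = np.zeros(n_atoms)
for j in range(n_atoms):
    tz_j = np.clip(reward + gamma * Z[j], Vmin, Vmax)  # shifted atom value
    b_j = (tz_j - Vmin) / delta_z                      # fractional index on the support
    l, u = int(np.floor(b_j)), int(np.ceil(b_j))
    if l == u:                                         # lands exactly on an atom
        target[l] += next_dist[j]
    else:                                              # split between the two neighbours
        target[l] += next_dist[j] * (u - b_j)
        target[u] += next_dist[j] * (b_j - l)

print(target)  # [0. 0. 0.5 0.5 0.]: the shifted z=0 atom splits evenly between z=0 and z=1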