Example #1
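# NOTE: this snippet starts mid-function. Judging from the EnvWrapper call below,
# the indented lines that follow are likely the tail of the _end_episode callback,
# which logs per-episode metrics to TensorBoard.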
    global_step.assign_add(1)
    print('Episode: ', episode, ' entropy: ', data[0] / float(episode_step),
          ' reward', data[1], ' global_step: ', max_step, ' episode_step: ',
          episode_step)
    with writer.as_default(), tf.contrib.summary.always_record_summaries():
        tf.contrib.summary.scalar("entropy", data[0] / float(episode_step))
        tf.contrib.summary.scalar("reward", data[1])
        tf.contrib.summary.scalar("episode_step", episode_step)


envs = env_wrapper.EnvWrapper(ENV_GAME_NAME,
                              ACTORS,
                              update_obs=_process_obs,
                              update_reward=_clip_reward,
                              end_episode=_end_episode)
rollouts = [rollout.Rollout() for _ in range(ACTORS)]

global ep_ave_max_q_value
ep_ave_max_q_value = 0
global total_reward
total_reward = 0


def create_tensorboard():
    global_step = tf.train.get_or_create_global_step()

    logdir = "./logs/"
    writer = tf.contrib.summary.create_file_writer(logdir)
    writer.set_as_default()
    return global_step, writer
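
# A minimal sketch of how the helper above might be wired up (an assumption, not part
# of the original snippet): the callback fragment at the top of this example reads
# `writer` and `global_step` as globals, so they would be created once, e.g.:
global_step, writer = create_tensorboard()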
Example #2
    def generate_episode(self):
        """
        Generate and save a single rollout with the current policy

        Updates:
            self.rollouts

        Calls:
            _normalize
            _denormalize
            gym.environment.reset
            gym.environment.step
            gym.environment.render
            gym.environment.close

        :return: None
        """
        norm_prev_obs = self.env.reset()
        # Normalize the observations
        for i in range(0, np.shape(self.state_dimensions)[0]):
            norm_prev_obs[i] = self._normalize(norm_prev_obs[i],
                                               self.state_dimensions[i][0],
                                               self.state_dimensions[i][1])

        norm_states = ()
        norm_actions = ()
        norm_next_states = ()
        rewards = ()

        done = False
        steps = 0
        while not done and steps < MAX_EPISODE_LENGTH:
            if self.actor is None:
                # No trained actor yet: explore with a fixed Gaussian policy N(0, 1)
                action = self._denormalize(
                    np.clip(np.random.normal(0, 1), -1, 1),
                    self.action_dimension[0], self.action_dimension[1])
            else:
                action = self._denormalize(self.actor.act(norm_prev_obs),
                                           self.action_dimension[0],
                                           self.action_dimension[1])
            norm_obs, reward, done, info = self.env.step(np.array([action]))
            # print(done)
            # Normalize the observations
            for i in range(0, np.shape(self.state_dimensions)[0]):
                norm_obs[i] = self._normalize(norm_obs[i],
                                              self.state_dimensions[i][0],
                                              self.state_dimensions[i][1])
            norm_states += (norm_prev_obs, )
            norm_actions += (self._normalize(action, self.action_dimension[0],
                                             self.action_dimension[1]), )
            norm_next_states += (norm_obs, )
            rewards += (reward, )
            norm_prev_obs = norm_obs
            self.env.render()
            steps += 1
        self.env.close()

        number_of_samples = np.shape(norm_states)[0]
        # Reshape arrays to catch shape of (n,) and replace it with (n,1)
        rollout = rl.Rollout(
            np.array(np.reshape(norm_states, (number_of_samples, -1))),
            np.array(np.reshape(norm_actions, (number_of_samples, -1))),
            np.array(np.reshape(norm_next_states, (number_of_samples, -1))),
            np.array(np.reshape(rewards, (number_of_samples, -1))))
        self.rollouts += (rollout, )
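
    # The method above calls self._normalize / self._denormalize, which are not part
    # of this snippet. A minimal sketch of what they might look like, assuming a
    # symmetric min-max scaling of each dimension to [-1, 1] (an assumption, not the
    # original implementation):
    def _normalize(self, value, low, high):
        # Map a raw value in [low, high] to the normalized range [-1, 1].
        return 2.0 * (value - low) / (high - low) - 1.0

    def _denormalize(self, norm_value, low, high):
        # Inverse of _normalize: map a value in [-1, 1] back to [low, high].
        return (norm_value + 1.0) / 2.0 * (high - low) + low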
Example #3
def main(arglist):
    envs = env_wrapper.EnvWrapper(arglist.scenario,
                                  arglist.actors,
                                  arglist.saved_episode,
                                  end_episode=_end_episode)
    rollouts = [roll.Rollout()
                for _ in range(arglist.actors)]  #envs.observation_shape[0]
    #actorCriticOld = agent.ActorCritic(envs.action_shape, arglist.learning_rate, arglist.epsilon, arglist.final_step, envs.observation_shape, envs.continious, envs.upper_bound).to(device)
    actorCriticOld = agent.ActorCriticCNN(envs.action_shape,
                                          arglist.learning_rate,
                                          arglist.epsilon, arglist.final_step,
                                          [1, 41, 41], envs.continious,
                                          envs.upper_bound).to(device)
    try:
        actorCriticOld.load_model('./saved_actor/' + arglist.scenario + '_' +
                                  str(arglist.load_episode_saved))
        print('Successfully loaded!')
    except Exception as e:
        print('FAILED TO LOAD.')

    if not arglist.eval:
        envs.set_wall_size(5)
        envs.render()

    t = 0

    batch_obs = envs.reset()

    batch_stack = []

    for obs in batch_obs:
        stack = np.array(obs)  #np.concatenate([obs, obs], axis=1)
        batch_stack.append(stack)

    while t < arglist.final_step:
        if not arglist.eval:
            time.sleep(0.05)

        actions_t = []
        values_t = []
        dist_cat_t = []
        entropy_t = []

        for stack in batch_stack:
            state = torch.FloatTensor(stack).to(device)
            dist, value = actorCriticOld(state)
            action = dist.sample()[0]
            if envs.continious:
                action = action * envs.upper_bound
            entropy_t.append(dist.entropy().mean())
            actions_t.append(action.cpu().numpy())
            dist_cat_t.append(dist.log_prob(action).cpu().detach().numpy()[0])
            values_t.append(value.cpu().detach().numpy()[0][0])

        obs2s_t, rewards_t, dones_t = envs.step(actions_t)

        for i in range(arglist.actors):
            data = envs.get_variables_at_index(i)
            if len(data) < 4:
                data = [0, 0, 0, 0]
            envs.add_variables_at_index(i, [
                entropy_t[i].cpu().detach().numpy() + data[0],
                rewards_t[i] + data[1], actorCriticOld.learning_rate_decay,
                actorCriticOld.clip_param
            ])

        if t > 0 and (t / arglist.actors) % arglist.time_horizon == 0 and arglist.eval:
            next_values = np.reshape(values_t, [-1])
            train(next_values, actorCriticOld, rollouts, arglist)

        if arglist.eval:
            if envs.can_saved:
                actorCriticOld.save_model('./saved_actor/' + arglist.scenario +
                                          '_' + str(envs.episode))

        if arglist.eval:
            for i, rollout in enumerate(rollouts):
                rollout.add(batch_stack[i][0, :], actions_t[i], rewards_t[i],
                            values_t[i], dist_cat_t[i], 1 - dones_t[i])

        t += arglist.actors

        for i, stack in enumerate(batch_stack):
            #stack = stack[:,1:,:,:]
            batch_stack[i] = np.array(obs2s_t[i])  #np.concatenate([stack, obs2s_t[i]], axis=1)

        if arglist.learning_rate_decay == 'linear' and arglist.eval:
            progress = t / arglist.final_step
            actorCriticOld.decay_clip_param(progress)
            actorCriticOld.decay_learning_rate(progress)
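
# The train(...) helper called above is not shown. In a PPO loop like this one it
# would typically first turn each actor's rollout (rewards, values, masks) into
# advantage/return targets before running the policy and value updates. A hedged
# sketch of that step using generalized advantage estimation; gamma, lam and the
# function name are assumptions, not taken from the original code:
def compute_gae(rewards, values, masks, next_value, gamma=0.99, lam=0.95):
    # rewards, values, masks are per-step sequences for a single actor;
    # next_value bootstraps the state that follows the last step of the rollout.
    values = list(values) + [next_value]
    gae = 0.0
    returns = []
    for step in reversed(range(len(rewards))):
        delta = rewards[step] + gamma * values[step + 1] * masks[step] - values[step]
        gae = delta + gamma * lam * masks[step] * gae
        returns.insert(0, gae + values[step])
    return returns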
Example #4
def main(arglist):
    envs = env_wrapper.EnvWrapper(arglist.scenario, arglist.actors, update_obs=_process_obs, update_reward=_clip_reward, end_episode=_end_episode)
    rollouts = [roll.Rollout() for _ in range(arglist.actors)]
    actorCritic = ppo.ActorCritic(envs.action_shape, arglist.learning_rate, arglist.epsilon, arglist.final_step, envs.observation_shape)
    actorCriticOld = ppo.ActorCritic(envs.action_shape, arglist.learning_rate, arglist.epsilon, arglist.final_step, envs.observation_shape)

    print(envs.observation_shape)

    try:
        actorCriticOld.load('./saved_1100/actor.h5')
    except Exception as e:
        print('failed to load')

    t = 0
    batch_obs = envs.reset()
    update_episode = 0

    while True:
        if not TRAIN_MODE:
            envs.render(0)

        actions_t = []
        dists_t = []
        values_t = []
        dist_cat_t = []
        entropy_t = []

        for stack in batch_obs:
            dist, value = actorCriticOld.model.predict(stack)
            distCat = tf.distributions.Categorical(probs=tf.nn.softmax(dist))
            action = distCat.sample(1)[0]
            entropy_t.append(distCat.entropy())
            actions_t.append(action)
            dists_t.append(dist)
            dist_cat_t.append(distCat)
            values_t.append(value)

        obs2s_t, rewards_t, dones_t = envs.step(actions_t)

        for i in range(arglist.actors):
            data = envs.get_variables_at_index(i)
            if len(data) < 2:
                data = [0, 0]
            envs.add_variables_at_index(i, [np.mean(entropy_t[i]) + data[0], rewards_t[i] + data[1]])

        if t > 0 and (t / arglist.actors) % arglist.time_horizon == 0 and TRAIN_MODE:
            next_values = np.reshape(values_t, [-1])
            train(next_values, actorCritic, actorCriticOld, rollouts, arglist)

            #if update_episode % 50 == 0:
            actorCritic.save()
            #update_episode += 1

        if TRAIN_MODE:
            for i, rollout in enumerate(rollouts):
                log_prob = dist_cat_t[i].log_prob(actions_t[i])
                rollout.add(batch_obs[i][0,:], actions_t[i][0], rewards_t[i], values_t[i][0][0], log_prob[0], 1 - dones_t[i])

        t += arglist.actors

        for i, stack in enumerate(batch_obs):
            batch_obs[i] = obs2s_t[i]

        if arglist.learning_rate_decay == 'linear':
            actorCritic.decay_clip_param(t)
            actorCritic.decay_learning_rate(t)
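
# The _process_obs and _clip_reward callbacks passed to EnvWrapper above are not shown.
# A hedged sketch of common implementations for this kind of wrapper; the one-argument
# signatures and both bodies are assumptions, not the original code:
import numpy as np

def _clip_reward(reward):
    # Keep only the sign of the raw reward, a common per-step normalization.
    return np.sign(reward)

def _process_obs(obs):
    # Average the color channels to grayscale and rescale pixel values to [0, 1].
    gray = np.mean(obs, axis=-1, keepdims=True)
    return gray.astype(np.float32) / 255.0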