def test_agent_random(self, T, normalization_factors=[], n=10):
        """Estimate state-visitation statistics under uniformly random actions.

        Runs ``n`` rollouts of ``T`` steps each in ``self.test_env``, sampling
        every action from ``action_space.sample()``.  The environment's done
        flag is deliberately ignored and the environment is not reset inside a
        rollout, so every rollout contributes exactly ``T`` steps.

        Args:
            T: number of steps per rollout.
            normalization_factors: forwarded unchanged to
                ``ant_utils.discretize_state`` / ``discretize_state_2d``.
            n: number of rollouts.

        Returns:
            Tuple ``(p, p_xy, states_visited_baseline,
            states_visited_xy_baseline)``: the two visit histograms divided by
            the total number of steps taken, and, for each histogram, a list
            with one entry per step holding the cumulative number of distinct
            cells seen so far (accumulated across all rollouts).
        """
        p = np.zeros(shape=(tuple(ant_utils.num_states)))
        p_xy = np.zeros(shape=(tuple(ant_utils.num_states_2d)))

        cumulative_states_visited_baseline = 0
        states_visited_baseline = []
        cumulative_states_visited_xy_baseline = 0
        states_visited_xy_baseline = []

        denom = 0

        for _ in range(n):
            self.test_env.reset()
            ep_len = 0
            # CRITICAL: the done signal is ignored on purpose, so the rollout
            # length is governed by the step budget alone.
            while ep_len < T:
                a = self.test_env.action_space.sample()
                o, _, _, _ = self.test_env.step(a)
                o = get_state(self.test_env, o)

                # Discretize once per step and reuse the index for both the
                # first-visit test and the count update.
                idx = tuple(
                    ant_utils.discretize_state(o, normalization_factors,
                                               self.test_env))
                idx_xy = tuple(
                    ant_utils.discretize_state_2d(o, normalization_factors,
                                                  self.test_env))

                # if this is the first time you are seeing this state, increment.
                if p[idx] == 0:
                    cumulative_states_visited_baseline += 1
                states_visited_baseline.append(
                    cumulative_states_visited_baseline)
                if p_xy[idx_xy] == 0:
                    cumulative_states_visited_xy_baseline += 1
                states_visited_xy_baseline.append(
                    cumulative_states_visited_xy_baseline)

                p[idx] += 1
                p_xy[idx_xy] += 1

                denom += 1
                ep_len += 1

        # With no steps taken the histograms are all zero; dividing by 1
        # keeps them zero instead of producing NaNs from 0 / 0.
        p /= float(max(denom, 1))
        p_xy /= float(max(denom, 1))

        return p, p_xy, states_visited_baseline, states_visited_xy_baseline
Example #2
0
def execute_policy_internal(env, T, policies, state, render):
    """Roll out the average of ``policies`` for ``T`` steps and count visits.

    At every step the ``(probs, var)`` pairs returned by each policy's
    ``get_probs_and_var`` for the current ``env.env.state_vector()`` are
    averaged and passed to ``select_action``.  The state returned by
    ``env.step`` is discretized and counted.  The environment is reset
    whenever it reports done.

    Args:
        env: environment to step; ``env.env.state_vector()`` must be available.
        T: number of steps to execute.
        policies: policies to average over.
        state: overwritten by the first ``env.step`` result before being read.
        render: when truthy, ``env.render()`` is called after every step.

    Returns:
        Tuple ``(p, random_initial_state)``: the visit histogram divided by
        ``T``, and the state vector captured at one randomly chosen step
        (an empty list if no step matched).
    """
    snapshot_step = np.floor(random.random() * T)
    visit_counts = np.zeros(shape=(tuple(ant_utils.num_states)))
    random_initial_state = []

    for step in range(T):
        # Accumulate, then average, the per-policy outputs for this state.
        mean_probs = torch.zeros(1, ant_utils.action_dim, dtype=torch.float32)
        mean_var = torch.zeros(1, ant_utils.action_dim, dtype=torch.float32)
        for member in policies:
            member_probs, member_var = member.get_probs_and_var(
                env.env.state_vector())
            mean_probs += member_probs
            mean_var += member_var
        mean_probs /= len(policies)
        mean_var /= len(policies)

        chosen = select_action(mean_probs, mean_var)
        state, reward, done, _ = env.step(chosen)

        cell = tuple(ant_utils.discretize_state(state))
        visit_counts[cell] += 1

        if step == snapshot_step:
            random_initial_state = env.env.state_vector()

        if render:
            env.render()
        if done:
            env.reset()

    visit_counts /= float(T)
    return visit_counts, random_initial_state
Example #3
0
def execute_one_rollout(policies,
                        weights,
                        env,
                        start_obs,
                        T,
                        data,
                        norm,
                        wrapped=False):
    """Run one ``T``-step rollout and fold its visits into ``data``.

    Args:
        policies: forwarded to ``select_action``.
        weights: forwarded to ``select_action``.
        env: environment to step.
        start_obs: observation used to select the first action.
        T: number of steps; must be positive because it is the exclusive
            upper bound passed to ``np.random.randint``.
        data: 7-tuple ``(p, p_xy, cumulative_states_visited, states_visited,
            cumulative_states_visited_xy, states_visited_xy,
            random_initial_state)`` carrying the running statistics.  The
            arrays and lists are updated in place.
        norm: forwarded to the ``ant_utils`` discretization helpers.
        wrapped: forwarded to ``get_state``; when true the environment is also
            reset whenever it reports done.

    Returns:
        The updated 7-tuple, in the same layout as ``data``.
    """
    obs = start_obs

    (p, p_xy, cumulative_states_visited, states_visited,
     cumulative_states_visited_xy, states_visited_xy,
     random_initial_state) = data

    random_T = np.random.randint(0, T)

    for t in range(T):

        action = select_action(policies, weights, env, obs)

        # Count the cumulative number of new states visited as a function of t.
        obs, _, done, _ = env.step(action)
        obs = get_state(env, obs, wrapped)

        # Discretize once per step; the index serves both the first-visit
        # test and the count update.
        idx = tuple(ant_utils.discretize_state(obs, norm, env))
        idx_xy = tuple(ant_utils.discretize_state_2d(obs, norm, env))

        # if this is the first time you are seeing this state, increment.
        if p[idx] == 0:
            cumulative_states_visited += 1
        states_visited.append(cumulative_states_visited)
        if p_xy[idx_xy] == 0:
            cumulative_states_visited_xy += 1
        states_visited_xy.append(cumulative_states_visited_xy)

        p[idx] += 1
        p_xy[idx_xy] += 1

        if t == random_T:
            random_initial_state = obs

        # CRITICAL: the done signal does not end the rollout.  Only a wrapped
        # environment is reset so that stepping can continue.
        if done and wrapped:
            obs = env.reset()
            obs = get_state(env, obs, wrapped)

    return (p, p_xy, cumulative_states_visited, states_visited,
            cumulative_states_visited_xy, states_visited_xy,
            random_initial_state)
    def reward(self, env, r, o):
        """Return the reward for observation ``o``.

        With an empty ``self.reward_fn`` the environment reward ``r`` is
        passed through unchanged.  Otherwise ``o`` is discretized with
        ``self.normalization_factors`` and the matching ``self.reward_fn``
        entry is returned.
        """
        if len(self.reward_fn) != 0:
            # Look the reward up in the table, keyed by the discretized state.
            cell = ant_utils.discretize_state(o, self.normalization_factors,
                                              env)
            return self.reward_fn[tuple(cell)]

        return r
    def test_agent(self,
                   T,
                   n=10,
                   initial_state=[],
                   normalization_factors=[],
                   store_log=True,
                   deterministic=True,
                   reset=False):
        """Evaluate the agent for ``n`` episodes and histogram visited states.

        Args:
            T: maximum number of steps per episode.
            n: number of episodes.
            initial_state: when non-empty, split at ``len(ant_utils.qpos)``
                into qpos / qvel and installed with ``set_state`` after each
                environment reset.
            normalization_factors: forwarded to the ``ant_utils``
                discretization helpers.
            store_log: when true, episode return and length are stored on
                ``self.logger``.
            deterministic: forwarded to ``self.get_action``.
            reset: when true, a done signal from the environment is cleared
                so the episode keeps running until ``T`` steps.

        Returns:
            Tuple ``(p, p_xy)`` of visit histograms divided by the total
            number of steps taken over all episodes.
        """
        total_steps = 0

        visits = np.zeros(shape=(tuple(ant_utils.num_states)))
        visits_xy = np.zeros(shape=(tuple(ant_utils.num_states_2d)))

        for _ in range(n):
            obs = self.test_env.reset()
            finished = False
            episode_return = 0
            episode_steps = 0

            if len(initial_state) > 0:
                # Override the freshly reset simulator state.
                split = len(ant_utils.qpos)
                start_qpos = initial_state[:split]
                start_qvel = initial_state[split:]
                self.test_env.env.set_state(start_qpos, start_qvel)
                obs = self.test_env.env._get_obs()

            obs = get_state(self.test_env, obs)
            while not finished and episode_steps != T:
                # Take deterministic actions at test time
                action = self.get_action(obs, deterministic)
                obs, step_reward, finished, _ = self.test_env.step(action)
                obs = get_state(self.test_env, obs)

                episode_return += self.reward(self.test_env, step_reward, obs)
                episode_steps += 1
                total_steps += 1

                cell = tuple(
                    ant_utils.discretize_state(obs, normalization_factors,
                                               self.test_env))
                visits[cell] += 1
                cell_xy = tuple(
                    ant_utils.discretize_state_2d(obs, normalization_factors,
                                                  self.test_env))
                visits_xy[cell_xy] += 1

                if finished and reset:
                    finished = False

            if store_log:
                self.logger.store(TestEpRet=episode_return,
                                  TestEpLen=episode_steps)

        visits /= float(total_steps)
        visits_xy /= float(total_steps)
        return visits, visits_xy
Example #6
0
    def execute_internal(self, env, T, state, render):
        """Run this policy for ``T`` steps and count discretized state visits.

        Args:
            env: environment used for rendering, resetting and closing.
            T: number of steps to execute.
            state: only printed; the working state is re-read from
                ``self.get_obs()`` before the first step.
            render: when truthy, ``env.render()`` is called after every step.

        Returns:
            Array of raw (unnormalized) visit counts per discretized state.
        """
        # NOTE(review): steps are taken on self.env while render/reset/close
        # go through the ``env`` argument -- presumably the same underlying
        # environment; confirm against the caller.
        p = np.zeros(shape=(tuple(ant_utils.num_states)))
        print("Simulation starting at = " + str(state))
        state = self.get_obs()
        for t in range(T):
            action = self.select_action(state)
            _, reward, done, _ = self.env.step(action)
            state = self.get_obs()
            p[tuple(ant_utils.discretize_state(state))] += 1

            if render:
                env.render()
            if done:
                env.reset()
                # Re-read the observation so the next action is chosen from
                # the post-reset state rather than the stale terminal one.
                state = self.get_obs()
        env.close()
        return p
Example #7
0
    def learn_policy(self,
                     reward_fn,
                     initial_state=[],
                     episodes=1000,
                     train_steps=1000):
        """Train the policy against a tabular reward over discretized states.

        Each episode resets ``self.env``, runs ``train_steps`` steps, appends
        the looked-up reward of every step to ``self.rewards`` and then calls
        ``self.update_policy()``.  Progress is printed every 10 episodes.

        Args:
            reward_fn: array indexed by the tuple returned from
                ``ant_utils.discretize_state``.
            initial_state: only printed.  When empty it is replaced by the
                first 29 entries of a fresh ``self.env.reset()`` observation.
                Episodes always start from ``self.env.reset()``.
            episodes: number of training episodes.
            train_steps: number of environment steps per episode.
        """
        if len(initial_state) == 0:
            initial_state = self.env.reset()
            initial_state = initial_state[:29]
        print("init: " + str(initial_state))

        running_reward = 0
        running_loss = 0
        for i_episode in range(episodes):
            self.env.reset()
            state = self.get_obs()
            ep_reward = 0
            for t in range(train_steps):  # Don't infinite loop while learning
                action = self.select_action(state)
                _, _, done, _ = self.env.step(action)
                state = self.get_obs()
                reward = reward_fn[tuple(ant_utils.discretize_state(state))]
                ep_reward += reward
                self.rewards.append(reward)
                if done:
                    self.env.reset()
                    # Refresh the observation so the next action is selected
                    # from the post-reset state, not the terminal one.
                    state = self.get_obs()

            # Exponential moving average, seeded with the first episode.
            if i_episode == 0:
                running_reward = ep_reward
            else:
                running_reward = running_reward * 0.99 + ep_reward * 0.01

            loss = self.update_policy()
            running_loss = running_loss * 0.99 + loss * .01

            # Log to console.
            if i_episode % 10 == 0:
                print(
                    'Episode {}\tLast reward: {:.2f}\tAverage reward: {:.2f}\tLoss: {:.2f}'
                    .format(i_episode, ep_reward, running_reward,
                            running_loss))
 def get_discrete_distribution(self):
     """Return the discretized visitation distribution of the buffer.

     The result is cached on ``self.p``; later calls return the cached
     array.  On the first call the buffer is normalized if it has not been
     already, every stored observation is discretized and counted, and the
     counts are divided by the number of stored observations.
     """
     if self.p is None:
         # Normalize buffer experience before tabulating it.
         if not self.normalized:
             self.normalize()

         counts = np.zeros(shape=(tuple(ant_utils.num_states)))
         for sample in self.buffer:
             # Discretize the observation and add it to the tabulation.
             cell = tuple(ant_utils.discretize_state(sample))
             counts[cell] += 1

         counts /= len(self.buffer)
         self.p = counts
         return counts

     return self.p
Example #9
0
    def execute_random_internal(self, env, T, state, render):
        """Take ``T`` uniformly random discrete actions and count visits.

        Each step draws one of the actions ``0``, ``1`` or ``-1`` with equal
        probability, steps ``env`` with it wrapped in a one-element list, and
        counts the discretized ``self.get_obs()``.  The environment is reset
        when it reports done and closed once all steps have run.

        Returns:
            Array of raw (unnormalized) visit counts per discretized state.
        """
        visit_counts = np.zeros(shape=(tuple(ant_utils.num_states)))
        for _ in range(T):
            draw = random.random()
            # Split [0, 1) into thirds: lower -> 0, middle -> 1, upper -> -1.
            if draw < 1 / 3.:
                choice = 0
            elif draw < 2 / 3.:
                choice = 1
            else:
                choice = -1

            _, reward, done, _ = env.step([choice])
            state = self.get_obs()
            cell = tuple(ant_utils.discretize_state(state))
            visit_counts[cell] += 1

            if render:
                env.render()
            if done:
                env.reset()
        env.close()
        return visit_counts
Example #10
0
def collect_entropy_policies(env, epochs, T, MODEL_DIR=''):
    """Iteratively train policies against an entropy-gradient reward.

    Each epoch trains a fresh ``Policy`` on the current ``reward_fn``,
    estimates the state distribution of the average of all policies so far
    with ``execute_average_policy``, and sets the next ``reward_fn`` to
    ``grad_ent`` of that distribution.  A random-action baseline is averaged
    over the same number of rounds.  Cumulative and sliding-window running
    averages of both are tracked and the entropies are printed every epoch.

    Args:
        env: environment handed to ``init_state``, ``Policy`` and
            ``execute_average_policy``.
        epochs: number of policies to train.
        T: number of steps per rollout.
        MODEL_DIR: prefix used when building the video directory names; it is
            expected to end with a trailing slash.

    Returns:
        List of the trained policies, one per epoch.
    """

    def _running_mean(previous, sample, count):
        # Mean over count + 1 samples, given the mean of the first `count`.
        return previous * count / float(count + 1) + sample / float(count + 1)

    reward_fn = np.zeros(shape=(tuple(ant_utils.num_states)))

    # set initial state to base state.
    seed = init_state(env)
    reward_fn[tuple(ant_utils.discretize_state(seed))] = 1
    print(seed)
    print(tuple(ant_utils.discretize_state(seed)))

    avg_rounds = 10  # average over this many rounds
    window = 5  # length of the sliding window for the windowed averages

    running_avg_p = np.zeros(shape=(tuple(ant_utils.num_states)))
    running_avg_ent = 0
    window_running_avg_p = np.zeros(shape=(tuple(ant_utils.num_states)))
    window_running_avg_ent = 0

    running_avg_p_baseline = np.zeros(shape=(tuple(ant_utils.num_states)))
    running_avg_ent_baseline = 0
    window_running_avg_p_baseline = np.zeros(
        shape=(tuple(ant_utils.num_states)))
    window_running_avg_ent_baseline = 0

    # Per-epoch history.  The average_* and baseline_* lists feed the sliding
    # window below; the remaining lists are only read by the disabled
    # plotting calls listed at the end of this function.
    baseline_entropies = []
    baseline_ps = []
    entropies = []
    ps = []

    average_entropies = []
    average_ps = []

    running_avg_entropies = []
    running_avg_ps = []

    running_avg_entropies_baseline = []
    running_avg_ps_baseline = []

    window_running_avg_ents = []
    window_running_avg_ps = []
    window_running_avg_ents_baseline = []
    window_running_avg_ps_baseline = []

    policies = []
    initial_state = []

    for i in range(epochs):

        # Learn policy that maximizes current reward function.
        policy = Policy(env, args.gamma, args.lr, ant_utils.obs_dim,
                        ant_utils.action_dim)
        policy.learn_policy(reward_fn, initial_state, args.episodes,
                            args.train_steps)
        policies.append(policy)

        # Get next distribution p by executing pi for T steps.
        initial_state = []
        p = policy.execute(T, initial_state, render=args.render)

        baseline_videos = 'cmp_videos/%sbaseline_%d/' % (
            MODEL_DIR, i)  # note that MODEL_DIR has trailing slash
        entropy_videos = 'cmp_videos/%sentropy_%d/' % (MODEL_DIR, i)

        # Random baseline, averaged over avg_rounds rollouts.
        p_baseline = policy.execute_random(
            T, render=False, video_dir=baseline_videos)
        round_entropy_baseline = scipy.stats.entropy(p_baseline.flatten())
        for _ in range(avg_rounds - 1):
            next_p_baseline = policy.execute_random(T)
            p_baseline += next_p_baseline
            round_entropy_baseline += scipy.stats.entropy(
                next_p_baseline.flatten())
        p_baseline /= float(avg_rounds)
        # Mean of the per-rollout entropies.  This is not the same quantity
        # as the entropy of the averaged distribution p_baseline.
        round_entropy_baseline /= float(avg_rounds)

        round_entropy = scipy.stats.entropy(p.flatten())
        entropies.append(round_entropy)
        baseline_entropies.append(round_entropy_baseline)
        ps.append(p)
        baseline_ps.append(p_baseline)

        # Execute the cumulative average policy thus far.
        # Estimate distribution and entropy.
        average_p, round_avg_ent, initial_state = \
            execute_average_policy(env, policies, T,
                                   initial_state=initial_state,
                                   avg_runs=avg_rounds, render=False,
                                   video_dir=entropy_videos)

        reward_fn = grad_ent(average_p)

        average_ps.append(average_p)
        average_entropies.append(round_avg_ent)

        # Update windowed averages.
        if i < window:
            # Window not yet full: plain cumulative mean.
            window_running_avg_ent = _running_mean(window_running_avg_ent,
                                                   round_avg_ent, i)
            # Seeded from the distribution average itself; seeding it from
            # the scalar entropy average was a copy-paste error.
            window_running_avg_p = _running_mean(window_running_avg_p,
                                                 average_p, i)
            window_running_avg_ent_baseline = _running_mean(
                window_running_avg_ent_baseline, round_entropy_baseline, i)
            window_running_avg_p_baseline = _running_mean(
                window_running_avg_p_baseline, p_baseline, i)
        else:
            # Window full: add the newest sample, drop the one leaving it.
            oldest = i - window
            window_running_avg_ent = window_running_avg_ent + round_avg_ent / float(
                window) - average_entropies[oldest] / float(window)
            window_running_avg_p = window_running_avg_p + average_p / float(
                window) - average_ps[oldest] / float(window)

            window_running_avg_ent_baseline = window_running_avg_ent_baseline + round_entropy_baseline / float(
                window) - baseline_entropies[oldest] / float(window)
            window_running_avg_p_baseline = window_running_avg_p_baseline + p_baseline / float(
                window) - baseline_ps[oldest] / float(window)

        # Update cumulative running averages.
        running_avg_ent = _running_mean(running_avg_ent, round_avg_ent, i)
        running_avg_p = _running_mean(running_avg_p, average_p, i)
        running_avg_entropies.append(running_avg_ent)
        running_avg_ps.append(running_avg_p)

        # Update baseline running averages.
        running_avg_ent_baseline = _running_mean(running_avg_ent_baseline,
                                                 round_entropy_baseline, i)
        running_avg_p_baseline = _running_mean(running_avg_p_baseline,
                                               p_baseline, i)
        running_avg_entropies_baseline.append(running_avg_ent_baseline)
        running_avg_ps_baseline.append(running_avg_p_baseline)

        window_running_avg_ents.append(window_running_avg_ent)
        window_running_avg_ps.append(window_running_avg_p)
        window_running_avg_ents_baseline.append(
            window_running_avg_ent_baseline)
        window_running_avg_ps_baseline.append(window_running_avg_p_baseline)

        print("---------------------")

        print("round_avg_ent[%d] = %f" % (i, round_avg_ent))
        print("running_avg_ent = %s" % running_avg_ent)
        print("window_running_avg_ent = %s" % window_running_avg_ent)

        print("..........")

        print("round_entropy_baseline[%d] = %f" % (i, round_entropy_baseline))
        print("running_avg_ent_baseline = %s" % running_avg_ent_baseline)
        print("window_running_avg_ent_baseline = %s" %
              window_running_avg_ent_baseline)

        print("----------------------")

        #plotting.heatmap(running_avg_p, average_p, i)

    # Disabled plotting of the collected history:
    # plotting.smear_lines(running_avg_ps, running_avg_ps_baseline)
    # plotting.running_average_entropy(running_avg_entropies, running_avg_entropies_baseline)
    # plotting.running_average_entropy_window(window_running_avg_ents, window_running_avg_ents_baseline, window)
    # plotting.difference_heatmap(running_avg_ps, running_avg_ps_baseline)

    return policies
def collect_entropy_policies(env, epochs, T, MODEL_DIR=''):
    """Train a sequence of SAC policies that maximize state-visitation entropy.

    Before the first epoch, 10000 random-action steps are collected to fit
    state normalization factors.  Each epoch then:

    1. trains an ``AntSoftActorCritic`` on the current ``reward_fn`` (the
       first one is created with ``epochs=0``, i.e. left untrained),
    2. learns an autoencoder via ``learn_encoding``,
    3. executes the mixture of all policies so far to estimate the full and
       xy state distributions,
    4. updates the running averages and derives the next ``reward_fn``,
    5. collects a random-action baseline with ``test_agent_random``,
    6. logs a summary table and produces per-epoch plots.

    Cumulative plots are produced after the last epoch.

    Args:
        env: environment used for the normalization rollouts and passed to
            ``init_state``, ``learn_encoding``, ``execute_average_policy``
            and ``compute_states_visited_xy``.
        epochs: number of policies to train.
        T: number of steps per rollout.
        MODEL_DIR: not referenced in this function body.

    Returns:
        List of the trained ``AntSoftActorCritic`` instances, one per epoch.
    """

    direct = os.getcwd() + '/data/'
    experiment_directory = direct + args.exp_name
    print(experiment_directory)

    # Epochs at which snapshots are kept for the multi-panel plots.
    indexes = [1, 5, 10, 15]
    states_visited_indexes = [0, 5, 10, 15]

    states_visited_cumulative = []
    states_visited_cumulative_baseline = []

    running_avg_p = np.zeros(shape=(tuple(ant_utils.num_states)))
    running_avg_p_xy = np.zeros(shape=(tuple(ant_utils.num_states_2d)))
    running_avg_ent = 0
    running_avg_ent_xy = 0

    running_avg_p_baseline = np.zeros(shape=(tuple(ant_utils.num_states)))
    running_avg_p_baseline_xy = np.zeros(
        shape=(tuple(ant_utils.num_states_2d)))
    running_avg_ent_baseline = 0
    running_avg_ent_baseline_xy = 0

    pct_visited = []
    pct_visited_baseline = []
    pct_visited_xy = []
    pct_visited_xy_baseline = []

    running_avg_entropies = []
    running_avg_entropies_xy = []
    running_avg_ps_xy = []
    avg_ps_xy = []

    running_avg_entropies_baseline = []
    running_avg_entropies_baseline_xy = []
    running_avg_ps_baseline_xy = []
    avg_ps_baseline_xy = []

    policies = []
    initial_state = init_state(env)

    # Fill a buffer with random-action experience; it is used only to fit the
    # normalization factors below.
    prebuf = ExperienceBuffer()
    env.reset()
    for t in range(10000):
        action = env.action_space.sample()
        obs, reward, done, _ = env.step(action)
        prebuf.store(get_state(env, obs))
        if done:
            env.reset()
            done = False

    prebuf.normalize()
    normalization_factors = prebuf.normalization_factors
    utils.log_statement(normalization_factors)
    prebuf = None  # drop the reference to the buffer once the factors are extracted

    reward_fn = np.zeros(shape=(tuple(ant_utils.num_states)))

    for i in range(epochs):
        utils.log_statement("*** ------- EPOCH %d ------- ***" % i)

        # clear initial state if applicable.
        if not args.initial_state:
            initial_state = []
        else:
            utils.log_statement(initial_state)
            utils.log_statement(
                tuple(
                    ant_utils.discretize_state_2d(initial_state,
                                                  normalization_factors)))
            utils.log_statement(
                tuple(
                    ant_utils.discretize_state(initial_state,
                                               normalization_factors)))
        utils.log_statement("max reward: " + str(np.max(reward_fn)))

        logger_kwargs = setup_logger_kwargs("model" + str(i),
                                            data_dir=experiment_directory)

        # Learn policy that maximizes current reward function.
        print("Learning new oracle...")
        # A seed of -1 on the command line means "draw a random seed".
        if args.seed != -1:
            seed = args.seed
        else:
            seed = random.randint(1, 100000)

        sac = AntSoftActorCritic(lambda: gym.make(args.env),
                                 reward_fn=reward_fn,
                                 xid=i + 1,
                                 seed=seed,
                                 gamma=args.gamma,
                                 ac_kwargs=dict(hidden_sizes=[args.hid] *
                                                args.l),
                                 logger_kwargs=logger_kwargs,
                                 normalization_factors=normalization_factors,
                                 learn_reduced=args.learn_reduced)
        # TODO: start learning from initial state to add gradient?
        # The first policy is random
        if i == 0:
            sac.soft_actor_critic(epochs=0)
        else:
            sac.soft_actor_critic(epochs=args.episodes,
                                  initial_state=initial_state,
                                  start_steps=args.start_steps)
        policies.append(sac)

        print("Learning autoencoding....")
        autoencoder = learn_encoding(env, policies, i)

        # Execute the cumulative average policy thus far.
        # Estimate distribution and entropy.
        print("Executing mixed policy...")
        average_p, average_p_xy, initial_state, states_visited, states_visited_xy = \
            execute_average_policy(env, policies, T, autoencoder=autoencoder,
                                   reward_fn=reward_fn, norm=normalization_factors,
                                   initial_state=initial_state, n=args.n,
                                   render=False, epoch=i)

        print("Calculating maxEnt entropy...")
        round_entropy = entropy(average_p.ravel())
        round_entropy_xy = entropy(average_p_xy.ravel())

        # Update running averages for maxEnt.
        # Incremental mean over epochs: new = old * i / (i + 1) + x / (i + 1).
        print("Updating maxEnt running averages...")
        running_avg_ent = running_avg_ent * (
            i) / float(i + 1) + round_entropy / float(i + 1)
        running_avg_ent_xy = running_avg_ent_xy * (
            i) / float(i + 1) + round_entropy_xy / float(i + 1)
        running_avg_p *= (i) / float(i + 1)
        running_avg_p += average_p / float(i + 1)
        running_avg_p_xy *= (i) / float(i + 1)
        running_avg_p_xy += average_p_xy / float(i + 1)

        # update reward function
        print("Update reward function")
        eps = 1 / np.sqrt(ant_utils.total_state_space)
        if args.cumulative:
            reward_fn = grad_ent(running_avg_p)
        else:
            # reward_fn becomes 1 / (average_p + eps).  Note that average_p
            # is modified in place by the += below.
            reward_fn = 1.
            average_p += eps
            reward_fn /= average_p
        average_p = None  # delete big array

        # (save for plotting)
        running_avg_entropies.append(running_avg_ent)
        running_avg_entropies_xy.append(running_avg_ent_xy)
        if i in indexes:
            running_avg_ps_xy.append(np.copy(running_avg_p_xy))
            avg_ps_xy.append(np.copy(average_p_xy))

        print("Collecting baseline experience....")
        p_baseline, p_baseline_xy, states_visited_baseline, states_visited_xy_baseline = sac.test_agent_random(
            T, normalization_factors=normalization_factors, n=args.n)

        print('Random visits same # states....')
        print(len(states_visited))
        print(len(states_visited_baseline))
        print(len(states_visited_xy))
        print(len(states_visited_xy_baseline))

        plotting.states_visited_over_time(states_visited,
                                          states_visited_baseline, i)
        plotting.states_visited_over_time(states_visited_xy,
                                          states_visited_xy_baseline,
                                          i,
                                          ext='_xy')

        # save for cumulative plot.
        if i in states_visited_indexes:
            # average over a whole bunch of rollouts
            # slow: so only do this when needed.
            # The per-step lists plotted above are rebound here to the
            # results of compute_states_visited_xy.
            print("Averaging unique xy states visited....")
            states_visited_xy = compute_states_visited_xy(env,
                                                          policies,
                                                          T=T,
                                                          n=args.n,
                                                          N=args.avg_N)
            states_visited_xy_baseline = compute_states_visited_xy(
                env,
                policies,
                T=T,
                n=args.n,
                N=args.avg_N,
                initial_state=initial_state,
                baseline=True)
            states_visited_cumulative.append(states_visited_xy)
            states_visited_cumulative_baseline.append(
                states_visited_xy_baseline)

        print("Compute baseline entropy....")
        round_entropy_baseline = entropy(p_baseline.ravel())
        round_entropy_baseline_xy = entropy(p_baseline_xy.ravel())

        # Update baseline running averages.
        print("Updating baseline running averages...")
        running_avg_ent_baseline = running_avg_ent_baseline * (
            i) / float(i + 1) + round_entropy_baseline / float(i + 1)
        running_avg_ent_baseline_xy = running_avg_ent_baseline_xy * (
            i) / float(i + 1) + round_entropy_baseline_xy / float(i + 1)

        running_avg_p_baseline *= (i) / float(i + 1)
        running_avg_p_baseline += p_baseline / float(i + 1)
        running_avg_p_baseline_xy *= (i) / float(i + 1)
        running_avg_p_baseline_xy += p_baseline_xy / float(i + 1)

        p_baseline = None

        # (save for plotting)
        running_avg_entropies_baseline.append(running_avg_ent_baseline)
        running_avg_entropies_baseline_xy.append(running_avg_ent_baseline_xy)
        if i in indexes:
            running_avg_ps_baseline_xy.append(
                np.copy(running_avg_p_baseline_xy))
            avg_ps_baseline_xy.append(np.copy(p_baseline_xy))

        # The full distributions were cleared above; only the xy versions are
        # still bound here.
        utils.log_statement(average_p_xy)
        utils.log_statement(p_baseline_xy)

        # Calculate percent of state space visited.
        pct = np.count_nonzero(running_avg_p) / float(running_avg_p.size)
        pct_visited.append(pct)
        pct_xy = np.count_nonzero(running_avg_p_xy) / float(
            running_avg_p_xy.size)
        pct_visited_xy.append(pct_xy)

        pct_baseline = np.count_nonzero(running_avg_p_baseline) / float(
            running_avg_p_baseline.size)
        pct_visited_baseline.append(pct_baseline)
        pct_xy_baseline = np.count_nonzero(running_avg_p_baseline_xy) / float(
            running_avg_p_baseline_xy.size)
        pct_visited_xy_baseline.append(pct_xy_baseline)

        # Print round summary.
        col_headers = ["", "baseline", "maxEnt"]
        col1 = [
            "round_entropy_xy", "running_avg_ent_xy", "round_entropy",
            "running_avg_ent", "% state space xy", "% total state space"
        ]
        col2 = [
            round_entropy_baseline_xy, running_avg_ent_baseline_xy,
            round_entropy_baseline, running_avg_ent_baseline, pct_xy_baseline,
            pct_baseline
        ]
        col3 = [
            round_entropy_xy, running_avg_ent_xy, round_entropy,
            running_avg_ent, pct_xy, pct
        ]
        table = tabulate(np.transpose([col1, col2, col3]),
                         col_headers,
                         tablefmt="fancy_grid",
                         floatfmt=".4f")
        utils.log_statement(table)

        # Plot from round.
        plotting.heatmap(running_avg_p_xy, average_p_xy, i)
        plotting.heatmap1(running_avg_p_baseline_xy, i)

        # Once the fourth tracked epoch has been recorded, plot the
        # accumulated states-visited curves together.
        if i == states_visited_indexes[3]:
            plotting.states_visited_over_time_multi(
                states_visited_cumulative, states_visited_cumulative_baseline,
                states_visited_indexes)

    # cumulative plots.
    plotting.heatmap4(running_avg_ps_xy,
                      running_avg_ps_baseline_xy,
                      indexes,
                      ext="cumulative")
    plotting.heatmap4(avg_ps_xy, avg_ps_baseline_xy, indexes, ext="epoch")
    plotting.running_average_entropy(running_avg_entropies,
                                     running_avg_entropies_baseline)
    plotting.running_average_entropy(running_avg_entropies_xy,
                                     running_avg_entropies_baseline_xy,
                                     ext='_xy')
    plotting.percent_state_space_reached(pct_visited,
                                         pct_visited_baseline,
                                         ext='_total')
    plotting.percent_state_space_reached(pct_visited_xy,
                                         pct_visited_xy_baseline,
                                         ext="_xy")

    return policies
def execute_average_policy(env,
                           policies,
                           T,
                           autoencoder=None,
                           reward_fn=[],
                           norm=[],
                           initial_state=[],
                           n=10,
                           render=False,
                           epoch=0):
    """Execute the mixture of ``policies`` and estimate its state distribution.

    Runs ``n`` rollouts of ``T`` steps each.  With ``args.max_sigma`` set,
    each action is drawn from a normal distribution whose mean is the average
    of the policies' deterministic actions and whose scale is the
    element-wise maximum of their sigmas.  Otherwise one policy is picked
    uniformly at random per step, and index 0 always acts by sampling the
    action space.  The environment's done signal is ignored.

    Args:
        env: environment to step.
        policies: policies forming the mixture.
        T: number of steps per rollout.
        autoencoder: when given, the encoding of the first 29 entries of every
            raw observation is appended to ``logs/encoded/<exp_name>.txt``
            and printed.
        reward_fn: reward table indexed by discretized state.  When empty, no
            reward is accumulated and the plotted reward curve stays at zero.
        norm: forwarded to the ``ant_utils`` discretization helpers.
        initial_state: when non-empty, split at ``len(ant_utils.qpos)`` into
            qpos / qvel and installed after each environment reset.
        n: number of rollouts.
        render: when truthy, ``env.render()`` is called after every step.
        epoch: forwarded to ``plotting.reward_vs_t``.

    Returns:
        Tuple ``(p, p_xy, random_initial_state, states_visited,
        states_visited_xy)``: the two visit histograms divided by the total
        number of steps, the observation captured at a random step of the
        last rollout, and the per-step cumulative counts of distinct cells
        seen (accumulated across rollouts).
    """
    p = np.zeros(shape=(tuple(ant_utils.num_states)))
    p_xy = np.zeros(shape=(tuple(ant_utils.num_states_2d)))
    random_initial_state = []

    cumulative_states_visited = 0
    states_visited = []
    cumulative_states_visited_xy = 0
    states_visited_xy = []

    rewards = np.zeros(T)

    denom = 0
    max_idx = len(policies) - 1

    # An empty reward table cannot be indexed with a tuple, so reward
    # accumulation is skipped in that case instead of raising.
    has_reward_fn = len(reward_fn) > 0

    # average results over n rollouts
    for iteration in range(n):

        env.reset()

        # TODO: when testing, do not want initial state.
        if len(initial_state) > 0:
            qpos = initial_state[:len(ant_utils.qpos)]
            qvel = initial_state[len(ant_utils.qpos):]
            env.env.set_state(qpos, qvel)

        obs = get_state(env, env.env._get_obs())

        random_T = np.floor(random.random() * T)
        random_initial_state = []

        for t in range(T):

            if args.max_sigma:
                mu = np.zeros(shape=(1, ant_utils.action_dim))
                sigma = np.zeros(shape=(1, ant_utils.action_dim))
                for sac in policies:
                    mu += sac.get_action(obs, deterministic=True)
                    sigma = np.maximum(sigma, sac.get_sigma(obs))
                mu /= float(len(policies))

                action = np.random.normal(loc=mu, scale=sigma)
            else:
                # select random policy uniform distribution
                # take non-deterministic action for that policy
                idx = random.randint(0, max_idx)
                if idx == 0:
                    action = env.action_space.sample()
                else:
                    action = policies[idx].get_action(
                        obs, deterministic=args.deterministic)

            # Count the cumulative number of new states visited as a function of t.
            obs, _, done, _ = env.step(action)

            # log encoded data to file.
            if autoencoder is not None:
                encodedfile = 'logs/encoded/' + args.exp_name + '.txt'
                # Encode once and reuse the value for the file and the console.
                val = autoencoder.encode(obs[:29])
                with open(encodedfile, 'a') as f:
                    f.write(str(val) + '\n')
                print(val)

            obs = get_state(env, obs)

            # Discretize once per step and reuse the indices below.
            cell = tuple(ant_utils.discretize_state(obs, norm))
            cell_xy = tuple(ant_utils.discretize_state_2d(obs, norm))

            if has_reward_fn:
                rewards[t] += reward_fn[cell]

            # if this is the first time you are seeing this state, increment.
            if p[cell] == 0:
                cumulative_states_visited += 1
            states_visited.append(cumulative_states_visited)
            if p_xy[cell_xy] == 0:
                cumulative_states_visited_xy += 1
            states_visited_xy.append(cumulative_states_visited_xy)

            p[cell] += 1
            p_xy[cell_xy] += 1
            denom += 1

            if t == random_T:
                random_initial_state = obs

            if render:
                env.render()
            # CRITICAL: the done signal is ignored; the rollout always runs
            # for the full T steps.

    env.close()
    # With no rollouts or steps the accumulators are all zero; dividing by 1
    # keeps them zero instead of producing NaNs from 0 / 0.
    rewards /= float(max(n, 1))
    plotting.reward_vs_t(rewards, epoch)

    p /= float(max(denom, 1))
    p_xy /= float(max(denom, 1))

    return p, p_xy, random_initial_state, states_visited, states_visited_xy