# Imports assumed by these snippets. Project-local helpers (utils, ant_utils,
# plotting, ExperienceBuffer, AntSoftActorCritic, setup_logger_kwargs,
# init_state, execute_average_policy, compute_states_visited_xy, grad_ent,
# and the parsed `args`) are defined elsewhere in the repository.
import os
import random
import sys

import gym
import numpy as np
from tabulate import tabulate


def get_state(env, obs):
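    """Return the full MuJoCo state vector for `env`.

    Sanity-check that the observation agrees with the state once its first
    two entries (the x/y position, which Ant omits from observations) are
    dropped.
    """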
    state = env.env.state_vector()
    if not np.array_equal(obs[:len(state) - 2], state[2:]):
        utils.log_statement(obs)
        utils.log_statement(state)
        raise ValueError("state and observation are not equal")
    return state
def collect_avg_obs(env, policies, T, n=100):
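    """Roll out a uniform mixture over `policies` for `n` episodes of `T`
    steps each and return the list of (truncated) observations visited.
    Index 0 of the mixture corresponds to uniformly random actions.
    """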
    data = []
    max_idx = len(policies) - 1

    utils.log_statement('Collecting ' + str(n * T) + ' steps')

    for iteration in range(n):
        env.reset()
        obs = get_state(env, env.env._get_obs())

        for t in range(T):
            action = np.zeros(shape=(1, ant_utils.action_dim))

            # Select a policy uniformly at random: index 0 means a uniformly
            # random action, otherwise sample an action from that policy.
            idx = random.randint(0, max_idx)
            if idx == 0:
                action = env.action_space.sample()
            else:
                action = policies[idx].get_action(
                    obs, deterministic=args.deterministic)

            # Step the environment and record the truncated observation.
            obs, _, done, _ = env.step(action)
            data.append(obs[:29])
            obs = get_state(env, obs)

        print('Iteration %i/%i' % (iteration + 1, n))

    return data
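# Hypothetical usage sketch (assumes an Ant env and a list of trained SAC
# policies created elsewhere):
#   data = collect_avg_obs(env, policies, T=1000, n=10)
#   avg_obs = np.mean(np.array(data), axis=0)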
def entropy(pt):
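    """Shannon entropy -sum(p * log(p)) of a discrete distribution `pt`,
    skipping zero entries (0 * log 0 is taken as 0)."""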
    utils.log_statement("pt size %d" % pt.size)
    # entropy = -sum(pt*log(pt))
    entropy = 0.0
    for p in pt:
        if p == 0.0:
            continue
        entropy += p * np.log(p)
    return -entropy
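# Quick sanity check: a uniform distribution over k states has entropy log(k),
# e.g. entropy(np.full(4, 0.25)) ≈ 1.386.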
def collect_entropy_policies(env, epochs, T, MODEL_DIR=''):
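    """Main MaxEnt loop.

    Each epoch: train a SAC oracle against the current reward function, add
    it to the policy mixture, estimate the state distribution induced by the
    mixture, update running entropy statistics, recompute the reward from
    the estimated distribution, and log/plot comparisons against a
    random-action baseline. Returns the list of trained oracles.
    """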

    video_dir = 'videos/' + args.exp_name

    direct = os.getcwd() + '/data/'
    experiment_directory = direct + args.exp_name
    print(experiment_directory)

    print(sys.argv)
    if not os.path.exists(experiment_directory):
        os.makedirs(experiment_directory)
        # Record the command-line invocation; the context manager closes the file.
        with open(experiment_directory + '/args', 'w') as f:
            f.write(' '.join(sys.argv))

    indexes = [1, 5, 10, 15]
    states_visited_indexes = [0, 5, 10, 15]

    states_visited_cumulative = []
    states_visited_cumulative_baseline = []

    running_avg_p = np.zeros(shape=(tuple(ant_utils.num_states)))
    running_avg_p_xy = np.zeros(shape=(tuple(ant_utils.num_states_2d)))
    running_avg_ent = 0
    running_avg_ent_xy = 0

    running_avg_p_baseline = np.zeros(shape=(tuple(ant_utils.num_states)))
    running_avg_p_baseline_xy = np.zeros(
        shape=(tuple(ant_utils.num_states_2d)))
    running_avg_ent_baseline = 0
    running_avg_ent_baseline_xy = 0

    pct_visited = []
    pct_visited_baseline = []
    pct_visited_xy = []
    pct_visited_xy_baseline = []

    running_avg_entropies = []
    running_avg_entropies_xy = []
    running_avg_ps_xy = []
    avg_ps_xy = []

    running_avg_entropies_baseline = []
    running_avg_entropies_baseline_xy = []
    running_avg_ps_baseline_xy = []
    avg_ps_baseline_xy = []

    policies = []
    distributions = []
    initial_state = init_state(env)

    prebuf = ExperienceBuffer()
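    # Warm-up rollout with random actions, used to estimate per-dimension
    # normalization factors for the state-distribution estimates
    # (discarded below when args.gaussian is not set).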
    env.reset()
    for t in range(10000):
        action = env.action_space.sample()
        obs, reward, done, _ = env.step(action)
        prebuf.store(get_state(env, obs))
        if done:
            env.reset()
            done = False

    prebuf.normalize()
    normalization_factors = prebuf.normalization_factors
    utils.log_statement(normalization_factors)
    prebuf = None
    if not args.gaussian:
        normalization_factors = []

    reward_fn = np.zeros(shape=(tuple(ant_utils.num_states)))
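    # Reward over the discretized state space; starts at zero and is replaced
    # after each round from the estimated (running) state distribution.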

    for i in range(epochs):
        utils.log_statement("*** ------- EPOCH %d ------- ***" % i)

        # clear initial state if applicable.
        if not args.initial_state:
            initial_state = []
        else:
            utils.log_statement(initial_state)
        utils.log_statement("max reward: " + str(np.max(reward_fn)))

        logger_kwargs = setup_logger_kwargs("model%02d" % i,
                                            data_dir=experiment_directory)

        # Learn policy that maximizes current reward function.
        print("Learning new oracle...")
        seed = random.randint(1, 100000)
        sac = AntSoftActorCritic(lambda: gym.make(args.env),
                                 reward_fn=reward_fn,
                                 xid=i + 1,
                                 seed=seed,
                                 gamma=args.gamma,
                                 ac_kwargs=dict(hidden_sizes=[args.hid] *
                                                args.l),
                                 logger_kwargs=logger_kwargs,
                                 normalization_factors=normalization_factors)

        # The first policy is random
        if i == 0:
            sac.soft_actor_critic(epochs=0)
        else:
            sac.soft_actor_critic(epochs=args.episodes,
                                  initial_state=initial_state,
                                  start_steps=args.start_steps)
        policies.append(sac)

        p, _ = sac.test_agent(T, normalization_factors=normalization_factors)
        distributions.append(p)
        weights = utils.get_weights(distributions)
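        # Mixture weights over all oracles collected so far, used when
        # executing the averaged (mixed) policy below and saved at the end.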

        epoch = 'epoch_%02d' % (i)
        if args.render:
            if i < 10:
                sac.record(T=args.record_steps,
                           n=1,
                           video_dir=video_dir + '/baseline/' + epoch,
                           on_policy=False)
            sac.record(T=args.record_steps,
                       n=1,
                       video_dir=video_dir + '/entropy/' + epoch,
                       on_policy=True)

        # Execute the cumulative average policy thus far.
        # Estimate distribution and entropy.
        print("Executing mixed policy...")
        average_p, average_p_xy, initial_state, states_visited, states_visited_xy = \
            execute_average_policy(env, policies, T, weights,
                                   reward_fn=reward_fn, norm=normalization_factors,
                                   initial_state=initial_state, n=args.n,
                                   render=args.render, video_dir=video_dir+'/mixed/'+epoch, epoch=i,
                                   record_steps=args.record_steps)

        print("Calculating maxEnt entropy...")
        round_entropy = entropy(average_p.ravel())
        round_entropy_xy = entropy(average_p_xy.ravel())

        # Update running averages for maxEnt.
        print("Updating maxEnt running averages...")
        running_avg_ent = (running_avg_ent * i / float(i + 1)
                           + round_entropy / float(i + 1))
        running_avg_ent_xy = (running_avg_ent_xy * i / float(i + 1)
                              + round_entropy_xy / float(i + 1))
        running_avg_p *= i / float(i + 1)
        running_avg_p += average_p / float(i + 1)
        running_avg_p_xy *= i / float(i + 1)
        running_avg_p_xy += average_p_xy / float(i + 1)

        # Update the reward function.
        print("Updating reward function...")
        eps = 1 / np.sqrt(ant_utils.total_state_space)
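        # New reward: grad_ent of the running-average distribution when
        # --cumulative is set, otherwise (approximately) 1 / p estimated from
        # this round's mixture distribution alone.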
        if args.cumulative:
            reward_fn = grad_ent(running_avg_p)
        else:
            reward_fn = 1. / (average_p + eps)
        average_p = None  # delete big array

        # (save for plotting)
        running_avg_entropies.append(running_avg_ent)
        running_avg_entropies_xy.append(running_avg_ent_xy)
        if i in indexes:
            running_avg_ps_xy.append(np.copy(running_avg_p_xy))
            avg_ps_xy.append(np.copy(average_p_xy))

        print("Collecting baseline experience....")
        p_baseline, p_baseline_xy, states_visited_baseline, states_visited_xy_baseline = sac.test_agent_random(
            T, normalization_factors=normalization_factors, n=args.n)

        plotting.states_visited_over_time(states_visited,
                                          states_visited_baseline, i)
        plotting.states_visited_over_time(states_visited_xy,
                                          states_visited_xy_baseline,
                                          i,
                                          ext='_xy')

        # save for cumulative plot.
        if i in states_visited_indexes:
            # average over a whole bunch of rollouts
            # slow: so only do this when needed.
            print("Averaging unique xy states visited....")
            states_visited_xy = compute_states_visited_xy(
                env,
                policies,
                norm=normalization_factors,
                T=T,
                n=args.n,
                N=args.avg_N)
            states_visited_xy_baseline = compute_states_visited_xy(
                env,
                policies,
                norm=normalization_factors,
                T=T,
                n=args.n,
                N=args.avg_N,
                initial_state=initial_state,
                baseline=True)
            states_visited_cumulative.append(states_visited_xy)
            states_visited_cumulative_baseline.append(
                states_visited_xy_baseline)

        print("Compute baseline entropy....")
        round_entropy_baseline = entropy(p_baseline.ravel())
        round_entropy_baseline_xy = entropy(p_baseline_xy.ravel())

        # Update baseline running averages.
        print("Updating baseline running averages...")
        running_avg_ent_baseline = (
            running_avg_ent_baseline * i / float(i + 1)
            + round_entropy_baseline / float(i + 1))
        running_avg_ent_baseline_xy = (
            running_avg_ent_baseline_xy * i / float(i + 1)
            + round_entropy_baseline_xy / float(i + 1))

        running_avg_p_baseline *= i / float(i + 1)
        running_avg_p_baseline += p_baseline / float(i + 1)
        running_avg_p_baseline_xy *= i / float(i + 1)
        running_avg_p_baseline_xy += p_baseline_xy / float(i + 1)

        p_baseline = None

        # (save for plotting)
        running_avg_entropies_baseline.append(running_avg_ent_baseline)
        running_avg_entropies_baseline_xy.append(running_avg_ent_baseline_xy)
        if i in indexes:
            running_avg_ps_baseline_xy.append(
                np.copy(running_avg_p_baseline_xy))
            avg_ps_baseline_xy.append(np.copy(p_baseline_xy))

        utils.log_statement(average_p_xy)
        utils.log_statement(p_baseline_xy)

        # Calculate percent of state space visited.
        pct = np.count_nonzero(running_avg_p) / float(running_avg_p.size)
        pct_visited.append(pct)
        pct_xy = np.count_nonzero(running_avg_p_xy) / float(
            running_avg_p_xy.size)
        pct_visited_xy.append(pct_xy)

        pct_baseline = np.count_nonzero(running_avg_p_baseline) / float(
            running_avg_p_baseline.size)
        pct_visited_baseline.append(pct_baseline)
        pct_xy_baseline = np.count_nonzero(running_avg_p_baseline_xy) / float(
            running_avg_p_baseline_xy.size)
        pct_visited_xy_baseline.append(pct_xy_baseline)

        # Print round summary.
        col_headers = ["", "baseline", "maxEnt"]
        col1 = [
            "round_entropy_xy", "running_avg_ent_xy", "round_entropy",
            "running_avg_ent", "% state space xy", "% total state space"
        ]
        col2 = [
            round_entropy_baseline_xy, running_avg_ent_baseline_xy,
            round_entropy_baseline, running_avg_ent_baseline, pct_xy_baseline,
            pct_baseline
        ]
        col3 = [
            round_entropy_xy, running_avg_ent_xy, round_entropy,
            running_avg_ent, pct_xy, pct
        ]
        table = tabulate(np.transpose([col1, col2, col3]),
                         col_headers,
                         tablefmt="fancy_grid",
                         floatfmt=".4f")
        utils.log_statement(table)

        # Plot from round.
        plotting.heatmap(running_avg_p_xy, average_p_xy, i)
        plotting.heatmap1(running_avg_p_baseline_xy, i)

        if i == states_visited_indexes[3]:
            plotting.states_visited_over_time_multi(
                states_visited_cumulative, states_visited_cumulative_baseline,
                states_visited_indexes)

    # save final expert weights to use with the trained oracles.
    weights_file = experiment_directory + '/policy_weights'
    np.save(weights_file, weights)

    # cumulative plots.
    plotting.running_average_entropy(running_avg_entropies,
                                     running_avg_entropies_baseline)
    plotting.running_average_entropy(running_avg_entropies_xy,
                                     running_avg_entropies_baseline_xy,
                                     ext='_xy')

    plotting.heatmap4(running_avg_ps_xy,
                      running_avg_ps_baseline_xy,
                      indexes,
                      ext="cumulative")
    plotting.heatmap4(avg_ps_xy, avg_ps_baseline_xy, indexes, ext="epoch")

    plotting.percent_state_space_reached(pct_visited,
                                         pct_visited_baseline,
                                         ext='_total')
    plotting.percent_state_space_reached(pct_visited_xy,
                                         pct_visited_xy_baseline,
                                         ext="_xy")

    return policies