Example #1
0
def collect_entropy_policies(env, epochs, T, MODEL_DIR):
    """Train one policy per epoch on two reward schemes and compare to random.

    Two tracks are run side by side for ``epochs`` rounds:

    * the "normal" track, whose reward for the next epoch is
      ``grad_ent(average_p)``;
    * the "online" track, whose reward for the next epoch is
      ``online_rewards(online_average_p, online_average_ps, epochs)``.

    Each round also collects a baseline from ``policy.execute_random``
    averaged over ``a`` rollouts. Running averages of the entropies and
    distributions of all three are accumulated and handed to ``plotting``.

    Args:
        env: Environment handed to ``Policy`` and
            ``curiosity.execute_average_policy``.
        epochs: Number of rounds, i.e. number of policies trained per track.
        T: Rollout length forwarded to the ``execute*`` calls.
        MODEL_DIR: Not used by this variant; kept so callers need no change.

    Returns:
        The list of "normal"-track policies, one per epoch.
    """
    video_dir = 'videos/' + args.exp_name

    reward_fn = np.zeros(shape=(tuple(base_utils.num_states)))
    online_reward_fn = np.zeros(shape=(tuple(base_utils.num_states)))

    # set initial state to base, motionless state.
    # NOTE: `seed` is not read again in this function.
    seed = []
    if args.env == "Pendulum-v0":
        env.env.state = [np.pi, 0]
        seed = env.env._get_obs()
    elif args.env == "MountainCarContinuous-v0":
        env.env.state = [-0.50, 0]
        seed = env.env.state

    # Running averages for the normal (entropy-gradient) track.
    running_avg_p = np.zeros(shape=(tuple(base_utils.num_states)))
    running_avg_ent = 0
    running_avg_entropies = []
    running_avg_ps = []

    # Running averages for the online-reward track.
    running_avg_p_online = np.zeros(shape=(tuple(base_utils.num_states)))
    running_avg_ent_online = 0
    running_avg_entropies_online = []
    running_avg_ps_online = []

    # Running averages for the random baseline.
    running_avg_p_baseline = np.zeros(shape=(tuple(base_utils.num_states)))
    running_avg_ent_baseline = 0
    running_avg_entropies_baseline = []
    running_avg_ps_baseline = []

    # History of per-epoch online distributions, consumed by online_rewards.
    online_average_ps = []

    policies = []
    initial_state = init_state(args.env)

    online_policies = []
    online_initial_state = init_state(args.env)

    a = 10  # average over this many rounds

    for i in range(epochs):

        # Learn policy that maximizes current reward function.
        policy = Policy(env, args.gamma, args.lr, base_utils.obs_dim, base_utils.action_dim)
        online_policy = Policy(env, args.gamma, args.lr, base_utils.obs_dim, base_utils.action_dim)

        if i == 0:
            # First round: no training at all (zero episodes / train steps).
            policy.learn_policy(reward_fn,
                                episodes=0,
                                train_steps=0)
            online_policy.learn_policy(online_reward_fn,
                                       episodes=0,
                                       train_steps=0)
        else:
            policy.learn_policy(reward_fn,
                                initial_state=initial_state,
                                episodes=args.episodes,
                                train_steps=args.train_steps)
            online_policy.learn_policy(online_reward_fn,
                                       initial_state=online_initial_state,
                                       episodes=args.episodes,
                                       train_steps=args.train_steps)

        policies.append(policy)
        online_policies.append(online_policy)

        epoch = 'epoch_%02d/' % (i)

        # Baseline: average `a` random rollouts. Only the first one is
        # rendered / recorded.
        p_baseline = policy.execute_random(
            T, render=args.render, video_dir=video_dir + '/baseline/' + epoch)

        round_entropy_baseline = scipy.stats.entropy(p_baseline.flatten())
        for _ in range(a - 1):
            next_p_baseline = policy.execute_random(T)
            p_baseline += next_p_baseline
            round_entropy_baseline += scipy.stats.entropy(next_p_baseline.flatten())
        p_baseline /= float(a)
        round_entropy_baseline /= float(a)  # mean of the per-rollout entropies

        # Execute the cumulative average policy thus far.
        # Estimate distribution and entropy.
        average_p, round_avg_ent, initial_state = \
            curiosity.execute_average_policy(env, policies, T,
                                             initial_state=initial_state,
                                             avg_runs=a,
                                             render=False)
        online_average_p, online_round_avg_ent, online_initial_state = \
            curiosity.execute_average_policy(env, online_policies, T,
                                             initial_state=online_initial_state,
                                             avg_runs=a,
                                             render=False)

        # Get next distribution p by executing pi for T steps.
        # ALSO: Collect video of each policy
        p = policy.execute(T, initial_state=initial_state,
                           render=args.render, video_dir=video_dir + '/normal/' + epoch)
        # The online policy starts from its own track's state. The rollout is
        # kept for its rendering/video side effect; its result is not used.
        online_policy.execute(T, initial_state=online_initial_state,
                              render=args.render, video_dir=video_dir + '/online/' + epoch)

        # Force first round to be equal
        if i == 0:
            average_p = p_baseline
            round_avg_ent = round_entropy_baseline
            online_average_p = p_baseline
            online_round_avg_ent = round_entropy_baseline

        # If in pendulum, set velocity to 0 with some probability
        # NOTE(review): only the normal track's initial state is modified;
        # confirm whether the online track should get the same treatment.
        if args.env == "Pendulum-v0" and random.random() < 0.3:
            initial_state[1] = 0

        # goal: try online reward structure
        # The history passed in excludes the current round; it is appended
        # only after the reward has been computed.
        online_reward_fn = online_rewards(online_average_p, online_average_ps, epochs)
        online_average_ps.append(online_average_p)

        reward_fn = grad_ent(average_p)

        # Update experimental running averages.
        running_avg_ent = running_avg_ent * (i)/float(i+1) + round_avg_ent/float(i+1)
        running_avg_p = running_avg_p * (i)/float(i+1) + average_p/float(i+1)
        running_avg_entropies.append(running_avg_ent)
        running_avg_ps.append(running_avg_p)

        # Update online running averages.
        running_avg_ent_online = running_avg_ent_online * (i)/float(i+1) + online_round_avg_ent/float(i+1)
        running_avg_p_online = running_avg_p_online * (i)/float(i+1) + online_average_p/float(i+1)
        running_avg_entropies_online.append(running_avg_ent_online)
        running_avg_ps_online.append(running_avg_p_online)

        # Update baseline running averages.
        running_avg_ent_baseline = running_avg_ent_baseline * (i)/float(i+1) + round_entropy_baseline/float(i+1)
        running_avg_p_baseline = running_avg_p_baseline * (i)/float(i+1) + p_baseline/float(i+1)
        running_avg_entropies_baseline.append(running_avg_ent_baseline)
        running_avg_ps_baseline.append(running_avg_p_baseline)

        print("--------------------------------")
        print("p=")
        print(p)

        print("average_p =")
        print(average_p)

        print("online_average_p")
        print(online_average_p)

        print("---------------------")

        print("round_avg_ent[%d] = %f" % (i, round_avg_ent))
        print("running_avg_ent = %s" % running_avg_ent)

        print("..........")

        print("online_round_avg_ent[%d] = %f" % (i, online_round_avg_ent))
        print("running_avg_ent_online = %s" % running_avg_ent_online)

        print("..........")

        print("round_entropy_baseline[%d] = %f" % (i, round_entropy_baseline))
        print("running_avg_ent_baseline = %s" % running_avg_ent_baseline)

        print("--------------------------------")

        plotting.heatmap(running_avg_p, average_p, i, args.env)

    plotting.running_average_entropy(running_avg_entropies, running_avg_entropies_baseline)
    plotting.running_average_entropy3(running_avg_entropies, running_avg_entropies_baseline, running_avg_entropies_online)

    # Epoch indexes handed to the multi-panel heatmap helpers.
    indexes = [1, 2, 5, 10]
    plotting.heatmap4(running_avg_ps, running_avg_ps_baseline, indexes)
    plotting.heatmap3x4(running_avg_ps, running_avg_ps_online, running_avg_ps_baseline, indexes)

    return policies
Example #2
0
def collect_entropy_policies(env, epochs, T, MODEL_DIR=''):
    """Train one soft-actor-critic oracle per epoch and track state coverage.

    Each epoch trains an ``AntSoftActorCritic`` on the current reward
    function, evaluates the weighted mixture of all oracles so far with
    ``execute_average_policy``, updates the reward from the resulting state
    distribution, and collects a baseline via ``sac.test_agent_random``.
    Entropies, running-average distributions and the fraction of non-zero
    cells are accumulated for both and passed to ``plotting``.

    Side effects: creates ``<cwd>/data/<args.exp_name>`` if missing and
    writes the command line to its ``args`` file; saves the final mixture
    weights to ``policy_weights`` in that directory with ``np.save``.

    Args:
        env: Environment used for warm-up sampling and mixture execution.
        epochs: Number of oracles to train.
        T: Rollout length forwarded to the evaluation helpers.
        MODEL_DIR: Not used by this variant; kept so callers need no change.

    Returns:
        The list of trained ``AntSoftActorCritic`` objects, one per epoch.
    """
    video_dir = 'videos/' + args.exp_name

    direct = os.getcwd() + '/data/'
    experiment_directory = direct + args.exp_name
    print(experiment_directory)

    print(sys.argv)
    if not os.path.exists(experiment_directory):
        os.makedirs(experiment_directory)
        # Record the invoking command line; the context manager guarantees
        # the handle is flushed and closed.
        with open(experiment_directory + '/args', 'w') as f:
            f.write(' '.join(sys.argv))

    # Epochs at which snapshots are kept for the multi-panel plots.
    indexes = [1, 5, 10, 15]
    states_visited_indexes = [0, 5, 10, 15]

    states_visited_cumulative = []
    states_visited_cumulative_baseline = []

    running_avg_p = np.zeros(shape=(tuple(ant_utils.num_states)))
    running_avg_p_xy = np.zeros(shape=(tuple(ant_utils.num_states_2d)))
    running_avg_ent = 0
    running_avg_ent_xy = 0

    running_avg_p_baseline = np.zeros(shape=(tuple(ant_utils.num_states)))
    running_avg_p_baseline_xy = np.zeros(
        shape=(tuple(ant_utils.num_states_2d)))
    running_avg_ent_baseline = 0
    running_avg_ent_baseline_xy = 0

    pct_visited = []
    pct_visited_baseline = []
    pct_visited_xy = []
    pct_visited_xy_baseline = []

    running_avg_entropies = []
    running_avg_entropies_xy = []
    running_avg_ps_xy = []
    avg_ps_xy = []

    running_avg_entropies_baseline = []
    running_avg_entropies_baseline_xy = []
    running_avg_ps_baseline_xy = []
    avg_ps_baseline_xy = []

    policies = []
    distributions = []
    initial_state = init_state(env)

    # Warm-up: sample 10000 random steps to fit normalization factors.
    prebuf = ExperienceBuffer()
    env.reset()
    for _ in range(10000):
        action = env.action_space.sample()
        obs, _, done, _ = env.step(action)
        prebuf.store(get_state(env, obs))
        if done:
            env.reset()

    prebuf.normalize()
    normalization_factors = prebuf.normalization_factors
    utils.log_statement(normalization_factors)
    prebuf = None  # release the buffer
    if not args.gaussian:
        normalization_factors = []

    reward_fn = np.zeros(shape=(tuple(ant_utils.num_states)))

    for i in range(epochs):
        utils.log_statement("*** ------- EPOCH %d ------- ***" % i)

        # clear initial state if applicable.
        if not args.initial_state:
            initial_state = []
        else:
            utils.log_statement(initial_state)
        utils.log_statement("max reward: " + str(np.max(reward_fn)))

        logger_kwargs = setup_logger_kwargs("model%02d" % i,
                                            data_dir=experiment_directory)

        # Learn policy that maximizes current reward function.
        print("Learning new oracle...")
        seed = random.randint(1, 100000)
        sac = AntSoftActorCritic(lambda: gym.make(args.env),
                                 reward_fn=reward_fn,
                                 xid=i + 1,
                                 seed=seed,
                                 gamma=args.gamma,
                                 ac_kwargs=dict(hidden_sizes=[args.hid] *
                                                args.l),
                                 logger_kwargs=logger_kwargs,
                                 normalization_factors=normalization_factors)

        # The first policy is random
        if i == 0:
            sac.soft_actor_critic(epochs=0)
        else:
            sac.soft_actor_critic(epochs=args.episodes,
                                  initial_state=initial_state,
                                  start_steps=args.start_steps)
        policies.append(sac)

        p, _ = sac.test_agent(T, normalization_factors=normalization_factors)
        distributions.append(p)
        weights = utils.get_weights(distributions)

        epoch = 'epoch_%02d' % (i)
        if args.render:
            # Baseline videos are only recorded for the first ten epochs.
            if i < 10:
                sac.record(T=args.record_steps,
                           n=1,
                           video_dir=video_dir + '/baseline/' + epoch,
                           on_policy=False)
            sac.record(T=args.record_steps,
                       n=1,
                       video_dir=video_dir + '/entropy/' + epoch,
                       on_policy=True)

        # Execute the cumulative average policy thus far.
        # Estimate distribution and entropy.
        print("Executing mixed policy...")
        average_p, average_p_xy, initial_state, states_visited, states_visited_xy = \
            execute_average_policy(env, policies, T, weights,
                                   reward_fn=reward_fn, norm=normalization_factors,
                                   initial_state=initial_state, n=args.n,
                                   render=args.render, video_dir=video_dir+'/mixed/'+epoch, epoch=i,
                                   record_steps=args.record_steps)

        print("Calculating maxEnt entropy...")
        round_entropy = entropy(average_p.ravel())
        round_entropy_xy = entropy(average_p_xy.ravel())

        # Update running averages for maxEnt.
        print("Updating maxEnt running averages...")
        running_avg_ent = running_avg_ent * (
            i) / float(i + 1) + round_entropy / float(i + 1)
        running_avg_ent_xy = running_avg_ent_xy * (
            i) / float(i + 1) + round_entropy_xy / float(i + 1)
        # In-place updates avoid allocating another full-size array.
        running_avg_p *= (i) / float(i + 1)
        running_avg_p += average_p / float(i + 1)
        running_avg_p_xy *= (i) / float(i + 1)
        running_avg_p_xy += average_p_xy / float(i + 1)

        # update reward function
        print("Update reward function")
        eps = 1 / np.sqrt(ant_utils.total_state_space)
        if args.cumulative:
            reward_fn = grad_ent(running_avg_p)
        else:
            # reward = 1 / (average_p + eps); average_p is modified in place.
            reward_fn = 1.
            average_p += eps
            reward_fn /= average_p
        average_p = None  # delete big array

        # (save for plotting)
        running_avg_entropies.append(running_avg_ent)
        running_avg_entropies_xy.append(running_avg_ent_xy)
        if i in indexes:
            running_avg_ps_xy.append(np.copy(running_avg_p_xy))
            avg_ps_xy.append(np.copy(average_p_xy))

        print("Collecting baseline experience....")
        p_baseline, p_baseline_xy, states_visited_baseline, states_visited_xy_baseline = sac.test_agent_random(
            T, normalization_factors=normalization_factors, n=args.n)

        plotting.states_visited_over_time(states_visited,
                                          states_visited_baseline, i)
        plotting.states_visited_over_time(states_visited_xy,
                                          states_visited_xy_baseline,
                                          i,
                                          ext='_xy')

        # save for cumulative plot.
        if i in states_visited_indexes:
            # average over a whole bunch of rollouts
            # slow: so only do this when needed.
            print("Averaging unique xy states visited....")
            states_visited_xy = compute_states_visited_xy(
                env,
                policies,
                norm=normalization_factors,
                T=T,
                n=args.n,
                N=args.avg_N)
            states_visited_xy_baseline = compute_states_visited_xy(
                env,
                policies,
                norm=normalization_factors,
                T=T,
                n=args.n,
                N=args.avg_N,
                initial_state=initial_state,
                baseline=True)
            states_visited_cumulative.append(states_visited_xy)
            states_visited_cumulative_baseline.append(
                states_visited_xy_baseline)

        print("Compute baseline entropy....")
        round_entropy_baseline = entropy(p_baseline.ravel())
        round_entropy_baseline_xy = entropy(p_baseline_xy.ravel())

        # Update baseline running averages.
        print("Updating baseline running averages...")
        running_avg_ent_baseline = running_avg_ent_baseline * (
            i) / float(i + 1) + round_entropy_baseline / float(i + 1)
        running_avg_ent_baseline_xy = running_avg_ent_baseline_xy * (
            i) / float(i + 1) + round_entropy_baseline_xy / float(i + 1)

        running_avg_p_baseline *= (i) / float(i + 1)
        running_avg_p_baseline += p_baseline / float(i + 1)
        running_avg_p_baseline_xy *= (i) / float(i + 1)
        running_avg_p_baseline_xy += p_baseline_xy / float(i + 1)

        p_baseline = None  # delete big array

        # (save for plotting)
        running_avg_entropies_baseline.append(running_avg_ent_baseline)
        running_avg_entropies_baseline_xy.append(running_avg_ent_baseline_xy)
        if i in indexes:
            running_avg_ps_baseline_xy.append(
                np.copy(running_avg_p_baseline_xy))
            avg_ps_baseline_xy.append(np.copy(p_baseline_xy))

        utils.log_statement(average_p_xy)
        utils.log_statement(p_baseline_xy)

        # Calculate percent of state space visited.
        pct = np.count_nonzero(running_avg_p) / float(running_avg_p.size)
        pct_visited.append(pct)
        pct_xy = np.count_nonzero(running_avg_p_xy) / float(
            running_avg_p_xy.size)
        pct_visited_xy.append(pct_xy)

        pct_baseline = np.count_nonzero(running_avg_p_baseline) / float(
            running_avg_p_baseline.size)
        pct_visited_baseline.append(pct_baseline)
        pct_xy_baseline = np.count_nonzero(running_avg_p_baseline_xy) / float(
            running_avg_p_baseline_xy.size)
        pct_visited_xy_baseline.append(pct_xy_baseline)

        # Print round summary.
        col_headers = ["", "baseline", "maxEnt"]
        col1 = [
            "round_entropy_xy", "running_avg_ent_xy", "round_entropy",
            "running_avg_ent", "% state space xy", "% total state space"
        ]
        col2 = [
            round_entropy_baseline_xy, running_avg_ent_baseline_xy,
            round_entropy_baseline, running_avg_ent_baseline, pct_xy_baseline,
            pct_baseline
        ]
        col3 = [
            round_entropy_xy, running_avg_ent_xy, round_entropy,
            running_avg_ent, pct_xy, pct
        ]
        table = tabulate(np.transpose([col1, col2, col3]),
                         col_headers,
                         tablefmt="fancy_grid",
                         floatfmt=".4f")
        utils.log_statement(table)

        # Plot from round.
        plotting.heatmap(running_avg_p_xy, average_p_xy, i)
        plotting.heatmap1(running_avg_p_baseline_xy, i)

        # Once the last snapshot epoch is reached, draw the combined plot.
        if i == states_visited_indexes[3]:
            plotting.states_visited_over_time_multi(
                states_visited_cumulative, states_visited_cumulative_baseline,
                states_visited_indexes)

    # save final expert weights to use with the trained oracles.
    weights_file = experiment_directory + '/policy_weights'
    np.save(weights_file, weights)

    # cumulative plots.
    plotting.running_average_entropy(running_avg_entropies,
                                     running_avg_entropies_baseline)
    plotting.running_average_entropy(running_avg_entropies_xy,
                                     running_avg_entropies_baseline_xy,
                                     ext='_xy')

    plotting.heatmap4(running_avg_ps_xy,
                      running_avg_ps_baseline_xy,
                      indexes,
                      ext="cumulative")
    plotting.heatmap4(avg_ps_xy, avg_ps_baseline_xy, indexes, ext="epoch")

    plotting.percent_state_space_reached(pct_visited,
                                         pct_visited_baseline,
                                         ext='_total')
    plotting.percent_state_space_reached(pct_visited_xy,
                                         pct_visited_xy_baseline,
                                         ext="_xy")

    return policies
Example #3
0
def collect_entropy_policies(env, epochs, T, MODEL_DIR):
    """Train one policy per epoch on the entropy-gradient reward.

    Each epoch trains a ``Policy`` on the current reward, rolls it out,
    collects a random baseline averaged over ``a`` rollouts, evaluates the
    mixture of all policies so far with ``curiosity.execute_average_policy``
    and sets the next reward to ``grad_ent(average_p)``. Both a cumulative
    running average and a sliding-window average (last ``window`` epochs) of
    entropies and distributions are kept for the mixture and the baseline.

    After the loop the user is prompted on stdin for four epoch indexes to
    show in the final heatmap.

    Args:
        env: Environment handed to ``Policy`` and
            ``curiosity.execute_average_policy``.
        epochs: Number of policies to train.
        T: Rollout length forwarded to the ``execute*`` calls.
        MODEL_DIR: Prefix for saved model files and for the video
            directories; expected to end with a path separator.

    Returns:
        The list of trained policies, one per epoch.
    """
    reward_fn = np.zeros(shape=(tuple(utils.num_states)))

    # set initial state to base, motionless state.
    seed = []
    if args.env == "Pendulum-v0":
        env.env.state = [np.pi, 0]
        seed = env.env._get_obs()
    elif args.env == "MountainCarContinuous-v0":
        env.env.state = [-0.50, 0]
        seed = env.env.state

    # Initial reward: 1 at the discretized seed state, 0 elsewhere.
    reward_fn[tuple(utils.discretize_state(seed))] = 1

    running_avg_p = np.zeros(shape=(tuple(utils.num_states)))
    running_avg_ent = 0
    window_running_avg_p = np.zeros(shape=(tuple(utils.num_states)))
    window_running_avg_ent = 0

    running_avg_p_baseline = np.zeros(shape=(tuple(utils.num_states)))
    running_avg_ent_baseline = 0
    window_running_avg_p_baseline = np.zeros(shape=(tuple(utils.num_states)))
    window_running_avg_ent_baseline = 0

    # Per-epoch histories; the average_* and baseline_* lists are also read
    # back when evicting the oldest entry from the sliding window.
    baseline_entropies = []
    baseline_ps = []
    entropies = []
    ps = []

    average_entropies = []
    average_ps = []

    running_avg_entropies = []
    running_avg_ps = []

    running_avg_entropies_baseline = []
    running_avg_ps_baseline = []

    window_running_avg_ents = []
    window_running_avg_ps = []
    window_running_avg_ents_baseline = []
    window_running_avg_ps_baseline = []

    policies = []
    initial_state = init_state(args.env)

    a = 10  # average over this many rounds
    # Sliding-window length. Defined before the loop because it is also
    # needed by the plotting call after the loop, even when epochs == 0.
    window = 5

    for i in range(epochs):

        # Learn policy that maximizes current reward function.
        policy = Policy(env, args.gamma, args.lr, utils.obs_dim,
                        utils.action_dim)
        policy.learn_policy(reward_fn, initial_state, args.episodes,
                            args.train_steps)
        policies.append(policy)

        if args.save_models:
            policy.save(MODEL_DIR + 'model_' + str(i) + '.pt')

        # Get next distribution p by executing pi for T steps.
        p_videos = 'cmp_videos/%sp_%d/' % (MODEL_DIR, i)
        p = policy.execute(T,
                           initial_state,
                           render=args.record,
                           video_dir=p_videos)

        baseline_videos = 'cmp_videos/%sbaseline_%d/' % (
            MODEL_DIR, i)  # note that MODEL_DIR has trailing slash
        entropy_videos = 'cmp_videos/%sentropy_%d/' % (MODEL_DIR, i)
        p_baseline = policy.execute_random(
            T, render=False, video_dir=baseline_videos)
        round_entropy_baseline = scipy.stats.entropy(p_baseline.flatten())
        for _ in range(a - 1):
            next_p_baseline = policy.execute_random(T)
            p_baseline += next_p_baseline
            round_entropy_baseline += scipy.stats.entropy(
                next_p_baseline.flatten())
        p_baseline /= float(a)
        # Mean of the per-rollout entropies. Note this is not the same
        # quantity as the entropy of the averaged distribution p_baseline.
        round_entropy_baseline /= float(a)

        round_entropy = scipy.stats.entropy(p.flatten())
        entropies.append(round_entropy)
        baseline_entropies.append(round_entropy_baseline)
        ps.append(p)
        baseline_ps.append(p_baseline)

        # Execute the cumulative average policy thus far.
        # Estimate distribution and entropy.
        average_p, round_avg_ent, initial_state = \
            curiosity.execute_average_policy(env, policies, T, initial_state=initial_state, avg_runs=a, render=False, video_dir=entropy_videos)

        # If in pendulum, set velocity to 0 with some probability
        if args.env == "Pendulum-v0" and random.random() < 0.3:
            initial_state[1] = 0

        reward_fn = grad_ent(average_p)

        print(average_p)
        print("!  --------  !")
        print(reward_fn)

        average_ps.append(average_p)
        average_entropies.append(round_avg_ent)

        # Update windowed running averages.
        if i < window:
            # Window not yet full: plain cumulative mean.
            window_running_avg_ent = window_running_avg_ent * (
                i) / float(i + 1) + round_avg_ent / float(i + 1)
            window_running_avg_p = window_running_avg_p * (
                i) / float(i + 1) + average_p / float(i + 1)
            window_running_avg_ent_baseline = window_running_avg_ent_baseline * (
                i) / float(i + 1) + round_entropy_baseline / float(i + 1)
            window_running_avg_p_baseline = window_running_avg_p_baseline * (
                i) / float(i + 1) + p_baseline / float(i + 1)
        else:
            # Window full: add the newest entry and drop the one that just
            # left the window (epoch i - window).
            oldest = i - window
            window_running_avg_ent = window_running_avg_ent + round_avg_ent / float(
                window) - average_entropies[oldest] / float(window)
            window_running_avg_p = window_running_avg_p + average_p / float(
                window) - average_ps[oldest] / float(window)

            window_running_avg_ent_baseline = window_running_avg_ent_baseline + round_entropy_baseline / float(
                window) - baseline_entropies[oldest] / float(window)
            window_running_avg_p_baseline = window_running_avg_p_baseline + p_baseline / float(
                window) - baseline_ps[oldest] / float(window)

        # Update cumulative running averages.
        running_avg_ent = running_avg_ent * (
            i) / float(i + 1) + round_avg_ent / float(i + 1)
        running_avg_p = running_avg_p * (
            i) / float(i + 1) + average_p / float(i + 1)
        running_avg_entropies.append(running_avg_ent)
        running_avg_ps.append(running_avg_p)

        # Update baseline running averages.
        running_avg_ent_baseline = running_avg_ent_baseline * (
            i) / float(i + 1) + round_entropy_baseline / float(i + 1)
        running_avg_p_baseline = running_avg_p_baseline * (
            i) / float(i + 1) + p_baseline / float(i + 1)
        running_avg_entropies_baseline.append(running_avg_ent_baseline)
        running_avg_ps_baseline.append(running_avg_p_baseline)

        window_running_avg_ents.append(window_running_avg_ent)
        window_running_avg_ps.append(window_running_avg_p)
        window_running_avg_ents_baseline.append(
            window_running_avg_ent_baseline)
        window_running_avg_ps_baseline.append(window_running_avg_p_baseline)

        print("p=")
        print(p)
        print("..........")
        print("round_entropy = %f" % (round_entropy))

        print("---------------------")

        print("average_p =")
        print(average_p)

        print("..........")

        print("round_avg_ent[%d] = %f" % (i, round_avg_ent))
        print("running_avg_ent = %s" % running_avg_ent)
        print("window_running_avg_ent = %s" % window_running_avg_ent)

        print("..........")

        print("round_entropy_baseline[%d] = %f" % (i, round_entropy_baseline))
        print("running_avg_ent_baseline = %s" % running_avg_ent_baseline)
        print("window_running_avg_ent_baseline = %s" %
              window_running_avg_ent_baseline)

        print("----------------------")

        plotting.heatmap(running_avg_p, average_p, i)

    plotting.running_average_entropy(running_avg_entropies,
                                     running_avg_entropies_baseline)
    plotting.running_average_entropy_window(window_running_avg_ents,
                                            window_running_avg_ents_baseline,
                                            window)

    # Ask interactively which four epochs to show in the final heatmap.
    indexes = []
    print('which indexes?')
    for _ in range(4):
        idx = input("index :")
        indexes.append(int(idx))
    plotting.heatmap4(running_avg_ps, running_avg_ps_baseline, indexes)

    return policies