def collect_entropy_policies(env, epochs, T, MODEL_DIR):
    """Iteratively learn policies that maximize the entropy of the induced state
    distribution, comparing against a random baseline and an online-reward
    variant (Pendulum / MountainCar, base_utils discretization)."""

    video_dir = 'videos/' + args.exp_name

    reward_fn = np.zeros(shape=(tuple(base_utils.num_states)))
    online_reward_fn = np.zeros(shape=(tuple(base_utils.num_states)))

    # Set initial state to the base, motionless state.
    seed = []
    if args.env == "Pendulum-v0":
        env.env.state = [np.pi, 0]
        seed = env.env._get_obs()
    elif args.env == "MountainCarContinuous-v0":
        env.env.state = [-0.50, 0]
        seed = env.env.state

    # Running averages for the mixed (maxEnt) policy.
    running_avg_p = np.zeros(shape=(tuple(base_utils.num_states)))
    running_avg_ent = 0
    running_avg_entropies = []
    running_avg_ps = []

    # Running averages for the online-reward variant.
    running_avg_p_online = np.zeros(shape=(tuple(base_utils.num_states)))
    running_avg_ent_online = 0
    running_avg_entropies_online = []
    running_avg_ps_online = []

    # Running averages for the random baseline.
    running_avg_p_baseline = np.zeros(shape=(tuple(base_utils.num_states)))
    running_avg_ent_baseline = 0
    running_avg_entropies_baseline = []
    running_avg_ps_baseline = []

    online_average_ps = []

    policies = []
    initial_state = init_state(args.env)

    online_policies = []
    online_initial_state = init_state(args.env)

    for i in range(epochs):
        # Learn a policy that maximizes the current reward function.
        policy = Policy(env, args.gamma, args.lr,
                        base_utils.obs_dim, base_utils.action_dim)
        online_policy = Policy(env, args.gamma, args.lr,
                               base_utils.obs_dim, base_utils.action_dim)

        if i == 0:
            # The first policy is untrained (random).
            policy.learn_policy(reward_fn, episodes=0, train_steps=0)
            online_policy.learn_policy(online_reward_fn, episodes=0, train_steps=0)
        else:
            policy.learn_policy(reward_fn, initial_state=initial_state,
                                episodes=args.episodes,
                                train_steps=args.train_steps)
            online_policy.learn_policy(online_reward_fn,
                                       initial_state=online_initial_state,
                                       episodes=args.episodes,
                                       train_steps=args.train_steps)

        policies.append(policy)
        online_policies.append(online_policy)

        epoch = 'epoch_%02d/' % (i)

        # Baseline: a random policy, averaged over several rounds.
        a = 10  # average over this many rounds
        p_baseline = policy.execute_random(
            T, render=args.render,
            video_dir=video_dir + '/baseline/' + epoch)
        round_entropy_baseline = scipy.stats.entropy(p_baseline.flatten())
        for av in range(a - 1):
            next_p_baseline = policy.execute_random(T)
            p_baseline += next_p_baseline
            round_entropy_baseline += scipy.stats.entropy(next_p_baseline.flatten())
        p_baseline /= float(a)
        round_entropy_baseline /= float(a)  # running average of the entropy

        # Execute the cumulative average policy thus far.
        # Estimate distribution and entropy.
        average_p, round_avg_ent, initial_state = \
            curiosity.execute_average_policy(env, policies, T,
                                             initial_state=initial_state,
                                             avg_runs=a, render=False)
        online_average_p, online_round_avg_ent, online_initial_state = \
            curiosity.execute_average_policy(env, online_policies, T,
                                             initial_state=online_initial_state,
                                             avg_runs=a, render=False)

        # Get the next distribution p by executing pi for T steps.
        # ALSO: collect video of each policy.
        p = policy.execute(T, initial_state=initial_state, render=args.render,
                           video_dir=video_dir + '/normal/' + epoch)
        p_online = online_policy.execute(T, initial_state=initial_state,
                                         render=args.render,
                                         video_dir=video_dir + '/online/' + epoch)

        # Force the first round to be equal across variants.
        if i == 0:
            average_p = p_baseline
            round_avg_ent = round_entropy_baseline
            online_average_p = p_baseline
            online_round_avg_ent = round_entropy_baseline

        # If in Pendulum, set the velocity to 0 with some probability.
        if args.env == "Pendulum-v0" and random.random() < 0.3:
            initial_state[1] = 0

        # Goal: try the online reward structure.
        online_reward_fn = online_rewards(online_average_p, online_average_ps, epochs)
        online_average_ps.append(online_average_p)
        reward_fn = grad_ent(average_p)

        # Update experimental running averages.
        running_avg_ent = running_avg_ent * (i) / float(i + 1) + round_avg_ent / float(i + 1)
        running_avg_p = running_avg_p * (i) / float(i + 1) + average_p / float(i + 1)
        running_avg_entropies.append(running_avg_ent)
        running_avg_ps.append(running_avg_p)

        # Update online running averages.
        running_avg_ent_online = running_avg_ent_online * (i) / float(i + 1) + online_round_avg_ent / float(i + 1)
        running_avg_p_online = running_avg_p_online * (i) / float(i + 1) + online_average_p / float(i + 1)
        running_avg_entropies_online.append(running_avg_ent_online)
        running_avg_ps_online.append(running_avg_p_online)

        # Update baseline running averages.
        running_avg_ent_baseline = running_avg_ent_baseline * (i) / float(i + 1) + round_entropy_baseline / float(i + 1)
        running_avg_p_baseline = running_avg_p_baseline * (i) / float(i + 1) + p_baseline / float(i + 1)
        running_avg_entropies_baseline.append(running_avg_ent_baseline)
        running_avg_ps_baseline.append(running_avg_p_baseline)

        print("--------------------------------")
        print("p=")
        print(p)
        print("average_p =")
        print(average_p)
        print("online_average_p")
        print(online_average_p)
        print("---------------------")
        print("round_avg_ent[%d] = %f" % (i, round_avg_ent))
        print("running_avg_ent = %s" % running_avg_ent)
        print("..........")
        print("online_round_avg_ent[%d] = %f" % (i, online_round_avg_ent))
        print("running_avg_ent_online = %s" % running_avg_ent_online)
        print("..........")
        print("round_entropy_baseline[%d] = %f" % (i, round_entropy_baseline))
        print("running_avg_ent_baseline = %s" % running_avg_ent_baseline)
        print("--------------------------------")

        # Plot from round.
        plotting.heatmap(running_avg_p, average_p, i, args.env)

    # Cumulative plots.
    plotting.running_average_entropy(running_avg_entropies,
                                     running_avg_entropies_baseline)
    plotting.running_average_entropy3(running_avg_entropies,
                                      running_avg_entropies_baseline,
                                      running_avg_entropies_online)
    indexes = [1, 2, 5, 10]
    plotting.heatmap4(running_avg_ps, running_avg_ps_baseline, indexes)
    plotting.heatmap3x4(running_avg_ps, running_avg_ps_online,
                        running_avg_ps_baseline, indexes)

    return policies
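
# NOTE: `grad_ent` is called above (reward_fn = grad_ent(average_p)) but is not
# defined in this section. The sketch below is an illustrative assumption, not
# the repository's implementation: it treats the reward as the gradient of the
# entropy objective H(p) = -sum_s p(s) log p(s), i.e. roughly -log p(s), with a
# small epsilon and a cap so that unvisited states do not yield infinite reward.
def grad_ent_sketch(pt, cap=1000.0):
    eps = 1.0 / np.sqrt(pt.size)    # smoothing term for unvisited states
    grad_p = -np.log(pt + eps)      # d/dp of -p log p, up to an additive constant
    return np.minimum(grad_p, cap)  # cap very large rewards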

def collect_entropy_policies(env, epochs, T, MODEL_DIR=''):
    """MaxEnt policy collection for the Ant environment: train Soft Actor-Critic
    oracles against entropy-based rewards (ant_utils discretization) and compare
    the resulting mixture against a random baseline."""

    video_dir = 'videos/' + args.exp_name

    direct = os.getcwd() + '/data/'
    experiment_directory = direct + args.exp_name
    print(experiment_directory)
    print(sys.argv)
    if not os.path.exists(experiment_directory):
        os.makedirs(experiment_directory)
    # Record the command-line arguments for this experiment.
    with open(experiment_directory + '/args', 'w') as f:
        f.write(' '.join(sys.argv))
        f.flush()

    indexes = [1, 5, 10, 15]
    states_visited_indexes = [0, 5, 10, 15]

    states_visited_cumulative = []
    states_visited_cumulative_baseline = []

    running_avg_p = np.zeros(shape=(tuple(ant_utils.num_states)))
    running_avg_p_xy = np.zeros(shape=(tuple(ant_utils.num_states_2d)))
    running_avg_ent = 0
    running_avg_ent_xy = 0

    running_avg_p_baseline = np.zeros(shape=(tuple(ant_utils.num_states)))
    running_avg_p_baseline_xy = np.zeros(shape=(tuple(ant_utils.num_states_2d)))
    running_avg_ent_baseline = 0
    running_avg_ent_baseline_xy = 0

    pct_visited = []
    pct_visited_baseline = []
    pct_visited_xy = []
    pct_visited_xy_baseline = []

    running_avg_entropies = []
    running_avg_entropies_xy = []
    running_avg_ps_xy = []
    avg_ps_xy = []

    running_avg_entropies_baseline = []
    running_avg_entropies_baseline_xy = []
    running_avg_ps_baseline_xy = []
    avg_ps_baseline_xy = []

    policies = []
    distributions = []
    initial_state = init_state(env)

    # Collect a buffer of random experience to estimate normalization factors.
    prebuf = ExperienceBuffer()
    env.reset()
    for t in range(10000):
        action = env.action_space.sample()
        obs, reward, done, _ = env.step(action)
        prebuf.store(get_state(env, obs))
        if done:
            env.reset()
            done = False

    prebuf.normalize()
    normalization_factors = prebuf.normalization_factors
    utils.log_statement(normalization_factors)
    prebuf = None
    if not args.gaussian:
        normalization_factors = []

    reward_fn = np.zeros(shape=(tuple(ant_utils.num_states)))

    for i in range(epochs):
        utils.log_statement("*** ------- EPOCH %d ------- ***" % i)

        # Clear initial state if applicable.
        if not args.initial_state:
            initial_state = []
        else:
            utils.log_statement(initial_state)
        utils.log_statement("max reward: " + str(np.max(reward_fn)))

        logger_kwargs = setup_logger_kwargs("model%02d" % i,
                                            data_dir=experiment_directory)

        # Learn a policy that maximizes the current reward function.
        print("Learning new oracle...")
        seed = random.randint(1, 100000)
        sac = AntSoftActorCritic(lambda: gym.make(args.env),
                                 reward_fn=reward_fn, xid=i + 1,
                                 seed=seed, gamma=args.gamma,
                                 ac_kwargs=dict(hidden_sizes=[args.hid] * args.l),
                                 logger_kwargs=logger_kwargs,
                                 normalization_factors=normalization_factors)

        # The first policy is random.
        if i == 0:
            sac.soft_actor_critic(epochs=0)
        else:
            sac.soft_actor_critic(epochs=args.episodes,
                                  initial_state=initial_state,
                                  start_steps=args.start_steps)
        policies.append(sac)

        p, _ = sac.test_agent(T, normalization_factors=normalization_factors)
        distributions.append(p)
        weights = utils.get_weights(distributions)

        epoch = 'epoch_%02d' % (i)
        if args.render:
            if i < 10:
                sac.record(T=args.record_steps, n=1,
                           video_dir=video_dir + '/baseline/' + epoch,
                           on_policy=False)
            sac.record(T=args.record_steps, n=1,
                       video_dir=video_dir + '/entropy/' + epoch,
                       on_policy=True)

        # Execute the cumulative average policy thus far.
        # Estimate distribution and entropy.
print("Executing mixed policy...") average_p, average_p_xy, initial_state, states_visited, states_visited_xy = \ execute_average_policy(env, policies, T, weights, reward_fn=reward_fn, norm=normalization_factors, initial_state=initial_state, n=args.n, render=args.render, video_dir=video_dir+'/mixed/'+epoch, epoch=i, record_steps=args.record_steps) print("Calculating maxEnt entropy...") round_entropy = entropy(average_p.ravel()) round_entropy_xy = entropy(average_p_xy.ravel()) # Update running averages for maxEnt. print("Updating maxEnt running averages...") running_avg_ent = running_avg_ent * ( i) / float(i + 1) + round_entropy / float(i + 1) running_avg_ent_xy = running_avg_ent_xy * ( i) / float(i + 1) + round_entropy_xy / float(i + 1) running_avg_p *= (i) / float(i + 1) running_avg_p += average_p / float(i + 1) running_avg_p_xy *= (i) / float(i + 1) running_avg_p_xy += average_p_xy / float(i + 1) # update reward function print("Update reward function") eps = 1 / np.sqrt(ant_utils.total_state_space) if args.cumulative: reward_fn = grad_ent(running_avg_p) else: reward_fn = 1. average_p += eps reward_fn /= average_p average_p = None # delete big array # (save for plotting) running_avg_entropies.append(running_avg_ent) running_avg_entropies_xy.append(running_avg_ent_xy) if i in indexes: running_avg_ps_xy.append(np.copy(running_avg_p_xy)) avg_ps_xy.append(np.copy(average_p_xy)) print("Collecting baseline experience....") p_baseline, p_baseline_xy, states_visited_baseline, states_visited_xy_baseline = sac.test_agent_random( T, normalization_factors=normalization_factors, n=args.n) plotting.states_visited_over_time(states_visited, states_visited_baseline, i) plotting.states_visited_over_time(states_visited_xy, states_visited_xy_baseline, i, ext='_xy') # save for cumulative plot. if i in states_visited_indexes: # average over a whole bunch of rollouts # slow: so only do this when needed. print("Averaging unique xy states visited....") states_visited_xy = compute_states_visited_xy( env, policies, norm=normalization_factors, T=T, n=args.n, N=args.avg_N) states_visited_xy_baseline = compute_states_visited_xy( env, policies, norm=normalization_factors, T=T, n=args.n, N=args.avg_N, initial_state=initial_state, baseline=True) states_visited_cumulative.append(states_visited_xy) states_visited_cumulative_baseline.append( states_visited_xy_baseline) print("Compute baseline entropy....") round_entropy_baseline = entropy(p_baseline.ravel()) round_entropy_baseline_xy = entropy(p_baseline_xy.ravel()) # Update baseline running averages. print("Updating baseline running averages...") running_avg_ent_baseline = running_avg_ent_baseline * ( i) / float(i + 1) + round_entropy_baseline / float(i + 1) running_avg_ent_baseline_xy = running_avg_ent_baseline_xy * ( i) / float(i + 1) + round_entropy_baseline_xy / float(i + 1) running_avg_p_baseline *= (i) / float(i + 1) running_avg_p_baseline += p_baseline / float(i + 1) running_avg_p_baseline_xy *= (i) / float(i + 1) running_avg_p_baseline_xy += p_baseline_xy / float(i + 1) p_baseline = None # (save for plotting) running_avg_entropies_baseline.append(running_avg_ent_baseline) running_avg_entropies_baseline_xy.append(running_avg_ent_baseline_xy) if i in indexes: running_avg_ps_baseline_xy.append( np.copy(running_avg_p_baseline_xy)) avg_ps_baseline_xy.append(np.copy(p_baseline_xy)) utils.log_statement(average_p_xy) utils.log_statement(p_baseline_xy) # Calculate percent of state space visited. 
        pct = np.count_nonzero(running_avg_p) / float(running_avg_p.size)
        pct_visited.append(pct)
        pct_xy = np.count_nonzero(running_avg_p_xy) / float(running_avg_p_xy.size)
        pct_visited_xy.append(pct_xy)

        pct_baseline = np.count_nonzero(running_avg_p_baseline) / float(running_avg_p_baseline.size)
        pct_visited_baseline.append(pct_baseline)
        pct_xy_baseline = np.count_nonzero(running_avg_p_baseline_xy) / float(running_avg_p_baseline_xy.size)
        pct_visited_xy_baseline.append(pct_xy_baseline)

        # Print round summary.
        col_headers = ["", "baseline", "maxEnt"]
        col1 = [
            "round_entropy_xy", "running_avg_ent_xy",
            "round_entropy", "running_avg_ent",
            "% state space xy", "% total state space"
        ]
        col2 = [
            round_entropy_baseline_xy, running_avg_ent_baseline_xy,
            round_entropy_baseline, running_avg_ent_baseline,
            pct_xy_baseline, pct_baseline
        ]
        col3 = [
            round_entropy_xy, running_avg_ent_xy,
            round_entropy, running_avg_ent,
            pct_xy, pct
        ]
        table = tabulate(np.transpose([col1, col2, col3]), col_headers,
                         tablefmt="fancy_grid", floatfmt=".4f")
        utils.log_statement(table)

        # Plot from round.
        plotting.heatmap(running_avg_p_xy, average_p_xy, i)
        plotting.heatmap1(running_avg_p_baseline_xy, i)

        if i == states_visited_indexes[3]:
            plotting.states_visited_over_time_multi(
                states_visited_cumulative,
                states_visited_cumulative_baseline,
                states_visited_indexes)

    # Save final expert weights to use with the trained oracles.
    weights_file = experiment_directory + '/policy_weights'
    np.save(weights_file, weights)

    # Cumulative plots.
    plotting.running_average_entropy(running_avg_entropies,
                                     running_avg_entropies_baseline)
    plotting.running_average_entropy(running_avg_entropies_xy,
                                     running_avg_entropies_baseline_xy,
                                     ext='_xy')

    plotting.heatmap4(running_avg_ps_xy, running_avg_ps_baseline_xy,
                      indexes, ext="cumulative")
    plotting.heatmap4(avg_ps_xy, avg_ps_baseline_xy, indexes, ext="epoch")

    plotting.percent_state_space_reached(pct_visited, pct_visited_baseline,
                                         ext='_total')
    plotting.percent_state_space_reached(pct_visited_xy,
                                         pct_visited_xy_baseline, ext="_xy")

    return policies
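
# NOTE: `execute_average_policy` above rolls out a mixture of the oracles
# collected so far, weighted by `weights`. The helper below is a hypothetical,
# simplified sketch of that idea (sample an oracle at each step and act with it);
# the real routine additionally discretizes states into the ant_utils grid,
# tracks visitation counts, and records video. `get_action` is assumed to be
# the oracle's action-selection method.
def rollout_mixture_sketch(env, policies, T, weights=None):
    if weights is None:
        weights = np.ones(len(policies)) / len(policies)  # uniform mixture
    obs = env.reset()
    observations = []
    for _ in range(T):
        idx = np.random.choice(len(policies), p=weights)  # pick an oracle
        obs, _, done, _ = env.step(policies[idx].get_action(obs))
        observations.append(obs)
        if done:
            obs = env.reset()
    return observations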

def collect_entropy_policies(env, epochs, T, MODEL_DIR):
    """Iteratively learn policies that maximize the entropy of the induced state
    distribution (utils discretization), tracking both full and windowed running
    averages and comparing against a random baseline."""

    reward_fn = np.zeros(shape=(tuple(utils.num_states)))

    # Set initial state to the base, motionless state.
    seed = []
    if args.env == "Pendulum-v0":
        env.env.state = [np.pi, 0]
        seed = env.env._get_obs()
    elif args.env == "MountainCarContinuous-v0":
        env.env.state = [-0.50, 0]
        seed = env.env.state
    reward_fn[tuple(utils.discretize_state(seed))] = 1

    running_avg_p = np.zeros(shape=(tuple(utils.num_states)))
    running_avg_ent = 0
    window_running_avg_p = np.zeros(shape=(tuple(utils.num_states)))
    window_running_avg_ent = 0

    running_avg_p_baseline = np.zeros(shape=(tuple(utils.num_states)))
    running_avg_ent_baseline = 0
    window_running_avg_p_baseline = np.zeros(shape=(tuple(utils.num_states)))
    window_running_avg_ent_baseline = 0

    baseline_entropies = []
    baseline_ps = []
    entropies = []
    ps = []

    average_entropies = []
    average_ps = []

    running_avg_entropies = []
    running_avg_ps = []
    running_avg_entropies_baseline = []
    running_avg_ps_baseline = []

    window_running_avg_ents = []
    window_running_avg_ps = []
    window_running_avg_ents_baseline = []
    window_running_avg_ps_baseline = []

    policies = []
    initial_state = init_state(args.env)

    for i in range(epochs):
        # Learn a policy that maximizes the current reward function.
        policy = Policy(env, args.gamma, args.lr, utils.obs_dim, utils.action_dim)
        policy.learn_policy(reward_fn, initial_state, args.episodes, args.train_steps)
        policies.append(policy)

        if args.save_models:
            policy.save(MODEL_DIR + 'model_' + str(i) + '.pt')

        # Get the next distribution p by executing pi for T steps.
        p_videos = 'cmp_videos/%sp_%d/' % (MODEL_DIR, i)
        p = policy.execute(T, initial_state, render=args.record, video_dir=p_videos)

        a = 10  # average over this many rounds
        baseline_videos = 'cmp_videos/%sbaseline_%d/' % (MODEL_DIR, i)  # note that MODEL_DIR has a trailing slash
        entropy_videos = 'cmp_videos/%sentropy_%d/' % (MODEL_DIR, i)
        p_baseline = policy.execute_random(T, render=False,
                                           video_dir=baseline_videos)  # args.episodes?
        round_entropy_baseline = scipy.stats.entropy(p_baseline.flatten())
        for av in range(a - 1):
            next_p_baseline = policy.execute_random(T)
            p_baseline += next_p_baseline
            # print(scipy.stats.entropy(next_p_baseline.flatten()))
            round_entropy_baseline += scipy.stats.entropy(next_p_baseline.flatten())
        p_baseline /= float(a)
        round_entropy_baseline /= float(a)  # running average of the entropy

        # Note: the entropy of p_baseline is not the same as the computed average entropy.
        # print("baseline compare:")
        # print(round_entropy_baseline)  # running average
        # print(scipy.stats.entropy(p_baseline.flatten()))  # entropy of final

        # reward_fn = grad_ent(p)
        round_entropy = scipy.stats.entropy(p.flatten())
        entropies.append(round_entropy)
        baseline_entropies.append(round_entropy_baseline)
        ps.append(p)
        baseline_ps.append(p_baseline)

        # Execute the cumulative average policy thus far.
        # Estimate distribution and entropy.
        average_p, round_avg_ent, initial_state = \
            curiosity.execute_average_policy(env, policies, T,
                                             initial_state=initial_state,
                                             avg_runs=a, render=False,
                                             video_dir=entropy_videos)

        # If in Pendulum, set the velocity to 0 with some probability.
        if args.env == "Pendulum-v0" and random.random() < 0.3:
            initial_state[1] = 0

        reward_fn = grad_ent(average_p)
        print(average_p)
        print("! -------- !")
        print(reward_fn)

        average_ps.append(average_p)
        average_entropies.append(round_avg_ent)

        # Update running averages.
        window = 5
        if i < window:
            # The window is not yet full; accumulate normally.
            window_running_avg_ent = window_running_avg_ent * (i) / float(i + 1) + round_avg_ent / float(i + 1)
            window_running_avg_p = window_running_avg_p * (i) / float(i + 1) + average_p / float(i + 1)
            window_running_avg_ent_baseline = window_running_avg_ent_baseline * (i) / float(i + 1) + round_entropy_baseline / float(i + 1)
            window_running_avg_p_baseline = window_running_avg_p_baseline * (i) / float(i + 1) + p_baseline / float(i + 1)
        else:
            # Slide the window: add the newest round, drop the oldest.
            window_running_avg_ent = window_running_avg_ent + round_avg_ent / float(window) - average_entropies[i - window] / float(window)
            window_running_avg_p = window_running_avg_p + average_p / float(window) - average_ps[i - window] / float(window)
            window_running_avg_ent_baseline = window_running_avg_ent_baseline + round_entropy_baseline / float(window) - baseline_entropies[i - window] / float(window)
            window_running_avg_p_baseline = window_running_avg_p_baseline + p_baseline / float(window) - baseline_ps[i - window] / float(window)

        running_avg_ent = running_avg_ent * (i) / float(i + 1) + round_avg_ent / float(i + 1)
        running_avg_p = running_avg_p * (i) / float(i + 1) + average_p / float(i + 1)
        running_avg_entropies.append(running_avg_ent)
        running_avg_ps.append(running_avg_p)

        # Update baseline running averages.
        running_avg_ent_baseline = running_avg_ent_baseline * (i) / float(i + 1) + round_entropy_baseline / float(i + 1)
        running_avg_p_baseline = running_avg_p_baseline * (i) / float(i + 1) + p_baseline / float(i + 1)
        running_avg_entropies_baseline.append(running_avg_ent_baseline)
        running_avg_ps_baseline.append(running_avg_p_baseline)

        window_running_avg_ents.append(window_running_avg_ent)
        window_running_avg_ps.append(window_running_avg_p)
        window_running_avg_ents_baseline.append(window_running_avg_ent_baseline)
        window_running_avg_ps_baseline.append(window_running_avg_p_baseline)

        print("p=")
        print(p)
        print("..........")
        print("round_entropy = %f" % (round_entropy))
        print("---------------------")
        print("average_p =")
        print(average_p)
        print("..........")
        print("round_avg_ent[%d] = %f" % (i, round_avg_ent))
        print("running_avg_ent = %s" % running_avg_ent)
        print("window_running_avg_ent = %s" % window_running_avg_ent)
        print("..........")
        print("round_entropy_baseline[%d] = %f" % (i, round_entropy_baseline))
        print("running_avg_ent_baseline = %s" % running_avg_ent_baseline)
        print("window_running_avg_ent_baseline = %s" % window_running_avg_ent_baseline)
        # print("running_avg_p_baseline =")
        # print(running_avg_p_baseline)
        print("----------------------")

        plotting.heatmap(running_avg_p, average_p, i)
        # plotting.smear_lines(running_avg_ps, running_avg_ps_baseline)

    # Cumulative plots.
    plotting.running_average_entropy(running_avg_entropies,
                                     running_avg_entropies_baseline)
    plotting.running_average_entropy_window(window_running_avg_ents,
                                            window_running_avg_ents_baseline,
                                            window)
    # plotting.difference_heatmap(running_avg_ps, running_avg_ps_baseline)

    # Ask which rounds to include in the 4-panel heatmap.
    indexes = []
    print('which indexes?')
    for j in range(4):
        idx = input("index :")
        indexes.append(int(idx))
    plotting.heatmap4(running_avg_ps, running_avg_ps_baseline, indexes)

    return policies
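
# NOTE: a minimal, hypothetical usage sketch for collect_entropy_policies,
# assuming the module-level `args` from argparse and the gym/os imports that the
# functions above already rely on. Not part of the original script; argument
# names other than those used above (e.g. args.epochs, args.T) are illustrative.
def main_sketch():
    env = gym.make(args.env)
    model_dir = 'models/' + args.exp_name + '/'  # hypothetical layout; note the trailing slash
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)
    policies = collect_entropy_policies(env, args.epochs, args.T, model_dir)
    env.close()
    return policies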