def compute_states_visited_xy(env, policies, T, n, norm=[], N=20,
                              initial_state=[], baseline=False):
    states_visited_xy = np.zeros(T * n)
    max_idx = len(policies) - 1

    for it in range(N):
        p_xy = np.zeros(shape=(tuple(ant_utils.num_states_2d)))
        cumulative_states_visited_xy = 0

        # average results over n rollouts
        for iteration in range(n):
            env.reset()
            if len(initial_state) > 0:
                qpos = initial_state[:len(ant_utils.qpos)]
                qvel = initial_state[len(ant_utils.qpos):]
                env.env.set_state(qpos, qvel)
            obs = get_state(env, env.env._get_obs())

            for t in range(T):
                action = np.zeros(shape=(1, ant_utils.action_dim))
                idx = random.randint(0, max_idx)
                if idx == 0 or baseline:
                    action = env.action_space.sample()
                else:
                    action = policies[idx].get_action(obs, deterministic=args.deterministic)

                # Count the cumulative number of new states visited as a function of t.
                obs, _, done, _ = env.step(action)
                obs = get_state(env, obs)

                # if this is the first time you are seeing this state, increment.
                if p_xy[tuple(ant_utils.discretize_state_2d(obs, norm, env))] == 0:
                    cumulative_states_visited_xy += 1
                step = iteration * T + t
                states_visited_xy[step] += cumulative_states_visited_xy
                p_xy[tuple(ant_utils.discretize_state_2d(obs, norm, env))] += 1

                if done:  # CRITICAL: ignore done signal
                    done = False

    env.close()
    states_visited_xy /= float(N)
    return states_visited_xy
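# Illustrative sketch, not part of the original code: the counting pattern above
# bumps a cumulative counter only the first time a discretized bin is visited.
# The self-contained toy below (hypothetical 1-D random walk over integer bins)
# reproduces that pattern with plain numpy and a dict in place of p_xy.
def _demo_unique_state_counting(T=100, seed=0):
    rng = np.random.RandomState(seed)
    visits = {}                      # analogue of p_xy: bin -> visit count
    cumulative_unique = 0
    unique_over_time = np.zeros(T)   # analogue of states_visited_xy
    x = 0
    for t in range(T):
        x += rng.choice([-1, 1])     # random-walk step
        if visits.get(x, 0) == 0:    # first visit to this bin
            cumulative_unique += 1
        unique_over_time[t] = cumulative_unique
        visits[x] = visits.get(x, 0) + 1
    return unique_over_time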
def test_agent_random(self, T, normalization_factors=[], n=10):
    p = np.zeros(shape=(tuple(ant_utils.num_states)))
    p_xy = np.zeros(shape=(tuple(ant_utils.num_states_2d)))

    cumulative_states_visited_baseline = 0
    states_visited_baseline = []
    cumulative_states_visited_xy_baseline = 0
    states_visited_xy_baseline = []

    denom = 0

    for j in range(n):
        o, r, d, ep_ret, ep_len = self.test_env.reset(), 0, False, 0, 0
        o = get_state(self.test_env, o)

        while not (d or (ep_len == T)):
            a = self.test_env.action_space.sample()
            o, r, d, _ = self.test_env.step(a)
            o = get_state(self.test_env, o)
            r = self.reward(self.test_env, r, o)

            # if this is the first time you are seeing this state, increment.
            if p[tuple(ant_utils.discretize_state(o, normalization_factors, self.test_env))] == 0:
                cumulative_states_visited_baseline += 1
            states_visited_baseline.append(cumulative_states_visited_baseline)
            if p_xy[tuple(ant_utils.discretize_state_2d(o, normalization_factors, self.test_env))] == 0:
                cumulative_states_visited_xy_baseline += 1
            states_visited_xy_baseline.append(cumulative_states_visited_xy_baseline)

            p[tuple(ant_utils.discretize_state(o, normalization_factors, self.test_env))] += 1
            p_xy[tuple(ant_utils.discretize_state_2d(o, normalization_factors, self.test_env))] += 1
            denom += 1
            ep_len += 1

            if d:  # CRITICAL: ignore done signal
                d = False

    p /= float(denom)
    p_xy /= float(denom)

    return p, p_xy, states_visited_baseline, states_visited_xy_baseline
def execute_one_rollout(policies, weights, env, start_obs, T, data, norm, wrapped=False):
    obs = start_obs

    p, p_xy, cumulative_states_visited, states_visited, \
        cumulative_states_visited_xy, states_visited_xy, random_initial_state = data

    random_T = np.random.randint(0, T)

    for t in range(T):
        action = select_action(policies, weights, env, obs)

        # Count the cumulative number of new states visited as a function of t.
        obs, _, done, _ = env.step(action)
        obs = get_state(env, obs, wrapped)

        # if this is the first time you are seeing this state, increment.
        if p[tuple(ant_utils.discretize_state(obs, norm, env))] == 0:
            cumulative_states_visited += 1
        states_visited.append(cumulative_states_visited)
        if p_xy[tuple(ant_utils.discretize_state_2d(obs, norm, env))] == 0:
            cumulative_states_visited_xy += 1
        states_visited_xy.append(cumulative_states_visited_xy)

        p[tuple(ant_utils.discretize_state(obs, norm, env))] += 1
        p_xy[tuple(ant_utils.discretize_state_2d(obs, norm, env))] += 1

        if t == random_T:
            random_initial_state = obs

        if done:  # CRITICAL: ignore done signal
            done = False
            if wrapped:
                obs = env.reset()
                obs = get_state(env, obs, wrapped)

    data = (p, p_xy, cumulative_states_visited, states_visited,
            cumulative_states_visited_xy, states_visited_xy, random_initial_state)

    return data
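# Hypothetical usage sketch (not in the original source): execute_one_rollout
# threads its accumulators through the `data` tuple, so repeated calls keep
# extending the same visitation tables. The helper name `run_k_rollouts` and the
# reuse of a single `start_obs` are assumptions for illustration only; `policies`,
# `weights`, `env`, and `norm` are expected to come from the surrounding code.
def run_k_rollouts(policies, weights, env, start_obs, T, norm, k=5):
    data = (np.zeros(shape=(tuple(ant_utils.num_states))),     # p
            np.zeros(shape=(tuple(ant_utils.num_states_2d))),  # p_xy
            0, [],                                             # full-state counters
            0, [],                                             # xy counters
            [])                                                # random_initial_state
    for _ in range(k):
        data = execute_one_rollout(policies, weights, env, start_obs, T, data, norm)
    return data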
def test_agent(self, T, n=10, initial_state=[], normalization_factors=[],
               store_log=True, deterministic=True, reset=False):
    denom = 0
    p = np.zeros(shape=(tuple(ant_utils.num_states)))
    p_xy = np.zeros(shape=(tuple(ant_utils.num_states_2d)))

    for j in range(n):
        o, r, d, ep_ret, ep_len = self.test_env.reset(), 0, False, 0, 0

        if len(initial_state) > 0:
            qpos = initial_state[:len(ant_utils.qpos)]
            qvel = initial_state[len(ant_utils.qpos):]
            self.test_env.env.set_state(qpos, qvel)
            o = self.test_env.env._get_obs()
        o = get_state(self.test_env, o)

        while not (d or (ep_len == T)):
            # Take deterministic actions at test time
            a = self.get_action(o, deterministic)
            o, r, d, _ = self.test_env.step(a)
            o = get_state(self.test_env, o)
            r = self.reward(self.test_env, r, o)
            ep_ret += r
            ep_len += 1
            denom += 1

            p[tuple(ant_utils.discretize_state(o, normalization_factors, self.test_env))] += 1
            p_xy[tuple(ant_utils.discretize_state_2d(o, normalization_factors, self.test_env))] += 1

            if d and reset:
                d = False

        if store_log:
            self.logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

    p /= float(denom)
    p_xy /= float(denom)

    return p, p_xy
def get_discrete_distribution_2d(self):
    if self.p_2d is not None:
        return self.p_2d

    # normalize buffer experience
    if not self.normalized:
        self.normalize()

    p_2d = np.zeros(shape=(ant_utils.num_states_2d))
    for obs in self.buffer:
        # discretize obs, add to distribution tabulation.
        p_2d[tuple(ant_utils.discretize_state_2d(obs))] += 1
    p_2d /= len(self.buffer)

    self.p_2d = p_2d
    return p_2d
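# Hedged sketch, not original code: elsewhere in this code base the discrete
# distributions built this way are scored with scipy.stats.entropy on the
# flattened array (see collect_entropy_policies below, e.g.
# entropy(average_p.ravel())). A minimal, self-contained version of that
# calculation using a made-up 2x2 visitation table:
def _demo_distribution_entropy():
    from scipy.stats import entropy  # same scorer applied to average_p / p_baseline
    p_2d = np.array([[8., 2.],
                     [0., 0.]])
    p_2d /= p_2d.sum()               # normalize visit counts into probabilities
    return entropy(p_2d.ravel())     # natural-log entropy of the flattened table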
def collect_entropy_policies(env, epochs, T, MODEL_DIR=''):

    direct = os.getcwd() + '/data/'
    experiment_directory = direct + args.exp_name
    print(experiment_directory)

    indexes = [1, 5, 10, 15]
    states_visited_indexes = [0, 5, 10, 15]

    states_visited_cumulative = []
    states_visited_cumulative_baseline = []

    running_avg_p = np.zeros(shape=(tuple(ant_utils.num_states)))
    running_avg_p_xy = np.zeros(shape=(tuple(ant_utils.num_states_2d)))
    running_avg_ent = 0
    running_avg_ent_xy = 0

    running_avg_p_baseline = np.zeros(shape=(tuple(ant_utils.num_states)))
    running_avg_p_baseline_xy = np.zeros(shape=(tuple(ant_utils.num_states_2d)))
    running_avg_ent_baseline = 0
    running_avg_ent_baseline_xy = 0

    pct_visited = []
    pct_visited_baseline = []
    pct_visited_xy = []
    pct_visited_xy_baseline = []

    running_avg_entropies = []
    running_avg_entropies_xy = []
    running_avg_ps_xy = []
    avg_ps_xy = []

    running_avg_entropies_baseline = []
    running_avg_entropies_baseline_xy = []
    running_avg_ps_baseline_xy = []
    avg_ps_baseline_xy = []

    policies = []
    initial_state = init_state(env)

    prebuf = ExperienceBuffer()
    env.reset()
    for t in range(10000):
        action = env.action_space.sample()
        obs, reward, done, _ = env.step(action)
        prebuf.store(get_state(env, obs))
        if done:
            env.reset()
            done = False

    prebuf.normalize()
    normalization_factors = prebuf.normalization_factors
    utils.log_statement(normalization_factors)
    prebuf = None

    reward_fn = np.zeros(shape=(tuple(ant_utils.num_states)))

    for i in range(epochs):
        utils.log_statement("*** ------- EPOCH %d ------- ***" % i)

        # clear initial state if applicable.
        if not args.initial_state:
            initial_state = []
        else:
            utils.log_statement(initial_state)
            utils.log_statement(
                tuple(ant_utils.discretize_state_2d(initial_state, normalization_factors)))
            utils.log_statement(
                tuple(ant_utils.discretize_state(initial_state, normalization_factors)))

        utils.log_statement("max reward: " + str(np.max(reward_fn)))

        logger_kwargs = setup_logger_kwargs("model" + str(i), data_dir=experiment_directory)

        # Learn policy that maximizes current reward function.
        print("Learning new oracle...")
        if args.seed != -1:
            seed = args.seed
        else:
            seed = random.randint(1, 100000)

        sac = AntSoftActorCritic(lambda: gym.make(args.env),
                                 reward_fn=reward_fn, xid=i + 1,
                                 seed=seed, gamma=args.gamma,
                                 ac_kwargs=dict(hidden_sizes=[args.hid] * args.l),
                                 logger_kwargs=logger_kwargs,
                                 normalization_factors=normalization_factors,
                                 learn_reduced=args.learn_reduced)

        # TODO: start learning from initial state to add gradient?
        # The first policy is random
        if i == 0:
            sac.soft_actor_critic(epochs=0)
        else:
            sac.soft_actor_critic(epochs=args.episodes,
                                  initial_state=initial_state,
                                  start_steps=args.start_steps)
        policies.append(sac)

        print("Learning autoencoding....")
        autoencoder = learn_encoding(env, policies, i)

        # Execute the cumulative average policy thus far.
        # Estimate distribution and entropy.
        print("Executing mixed policy...")
        average_p, average_p_xy, initial_state, states_visited, states_visited_xy = \
            execute_average_policy(env, policies, T,
                                   autoencoder=autoencoder,
                                   reward_fn=reward_fn,
                                   norm=normalization_factors,
                                   initial_state=initial_state,
                                   n=args.n, render=False, epoch=i)

        print("Calculating maxEnt entropy...")
        round_entropy = entropy(average_p.ravel())
        round_entropy_xy = entropy(average_p_xy.ravel())

        # Update running averages for maxEnt.
print("Updating maxEnt running averages...") running_avg_ent = running_avg_ent * ( i) / float(i + 1) + round_entropy / float(i + 1) running_avg_ent_xy = running_avg_ent_xy * ( i) / float(i + 1) + round_entropy_xy / float(i + 1) running_avg_p *= (i) / float(i + 1) running_avg_p += average_p / float(i + 1) running_avg_p_xy *= (i) / float(i + 1) running_avg_p_xy += average_p_xy / float(i + 1) # update reward function print("Update reward function") eps = 1 / np.sqrt(ant_utils.total_state_space) if args.cumulative: reward_fn = grad_ent(running_avg_p) else: reward_fn = 1. average_p += eps reward_fn /= average_p average_p = None # delete big array # (save for plotting) running_avg_entropies.append(running_avg_ent) running_avg_entropies_xy.append(running_avg_ent_xy) if i in indexes: running_avg_ps_xy.append(np.copy(running_avg_p_xy)) avg_ps_xy.append(np.copy(average_p_xy)) print("Collecting baseline experience....") p_baseline, p_baseline_xy, states_visited_baseline, states_visited_xy_baseline = sac.test_agent_random( T, normalization_factors=normalization_factors, n=args.n) print('Random visits same # states....') print(len(states_visited)) print(len(states_visited_baseline)) print(len(states_visited_xy)) print(len(states_visited_xy_baseline)) plotting.states_visited_over_time(states_visited, states_visited_baseline, i) plotting.states_visited_over_time(states_visited_xy, states_visited_xy_baseline, i, ext='_xy') # save for cumulative plot. if i in states_visited_indexes: # average over a whole bunch of rollouts # slow: so only do this when needed. print("Averaging unique xy states visited....") states_visited_xy = compute_states_visited_xy(env, policies, T=T, n=args.n, N=args.avg_N) states_visited_xy_baseline = compute_states_visited_xy( env, policies, T=T, n=args.n, N=args.avg_N, initial_state=initial_state, baseline=True) states_visited_cumulative.append(states_visited_xy) states_visited_cumulative_baseline.append( states_visited_xy_baseline) print("Compute baseline entropy....") round_entropy_baseline = entropy(p_baseline.ravel()) round_entropy_baseline_xy = entropy(p_baseline_xy.ravel()) # Update baseline running averages. print("Updating baseline running averages...") running_avg_ent_baseline = running_avg_ent_baseline * ( i) / float(i + 1) + round_entropy_baseline / float(i + 1) running_avg_ent_baseline_xy = running_avg_ent_baseline_xy * ( i) / float(i + 1) + round_entropy_baseline_xy / float(i + 1) running_avg_p_baseline *= (i) / float(i + 1) running_avg_p_baseline += p_baseline / float(i + 1) running_avg_p_baseline_xy *= (i) / float(i + 1) running_avg_p_baseline_xy += p_baseline_xy / float(i + 1) p_baseline = None # (save for plotting) running_avg_entropies_baseline.append(running_avg_ent_baseline) running_avg_entropies_baseline_xy.append(running_avg_ent_baseline_xy) if i in indexes: running_avg_ps_baseline_xy.append( np.copy(running_avg_p_baseline_xy)) avg_ps_baseline_xy.append(np.copy(p_baseline_xy)) utils.log_statement(average_p_xy) utils.log_statement(p_baseline_xy) # Calculate percent of state space visited. 
        pct = np.count_nonzero(running_avg_p) / float(running_avg_p.size)
        pct_visited.append(pct)
        pct_xy = np.count_nonzero(running_avg_p_xy) / float(running_avg_p_xy.size)
        pct_visited_xy.append(pct_xy)

        pct_baseline = np.count_nonzero(running_avg_p_baseline) / float(running_avg_p_baseline.size)
        pct_visited_baseline.append(pct_baseline)
        pct_xy_baseline = np.count_nonzero(running_avg_p_baseline_xy) / float(running_avg_p_baseline_xy.size)
        pct_visited_xy_baseline.append(pct_xy_baseline)

        # Print round summary.
        col_headers = ["", "baseline", "maxEnt"]
        col1 = ["round_entropy_xy", "running_avg_ent_xy",
                "round_entropy", "running_avg_ent",
                "% state space xy", "% total state space"]
        col2 = [round_entropy_baseline_xy, running_avg_ent_baseline_xy,
                round_entropy_baseline, running_avg_ent_baseline,
                pct_xy_baseline, pct_baseline]
        col3 = [round_entropy_xy, running_avg_ent_xy,
                round_entropy, running_avg_ent,
                pct_xy, pct]
        table = tabulate(np.transpose([col1, col2, col3]),
                         col_headers, tablefmt="fancy_grid", floatfmt=".4f")
        utils.log_statement(table)

        # Plot from round.
        plotting.heatmap(running_avg_p_xy, average_p_xy, i)
        plotting.heatmap1(running_avg_p_baseline_xy, i)

        if i == states_visited_indexes[3]:
            plotting.states_visited_over_time_multi(states_visited_cumulative,
                                                    states_visited_cumulative_baseline,
                                                    states_visited_indexes)

    # cumulative plots.
    plotting.heatmap4(running_avg_ps_xy, running_avg_ps_baseline_xy, indexes, ext="cumulative")
    plotting.heatmap4(avg_ps_xy, avg_ps_baseline_xy, indexes, ext="epoch")
    plotting.running_average_entropy(running_avg_entropies, running_avg_entropies_baseline)
    plotting.running_average_entropy(running_avg_entropies_xy, running_avg_entropies_baseline_xy,
                                     ext='_xy')
    plotting.percent_state_space_reached(pct_visited, pct_visited_baseline, ext='_total')
    plotting.percent_state_space_reached(pct_visited_xy, pct_visited_xy_baseline, ext="_xy")

    return policies
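# Side note (sketch, not original code): the epoch-loop updates of the form
#     running_avg = running_avg * i / (i + 1) + new_value / (i + 1)
# used above for the entropies and the visitation tables are the standard
# incremental mean, i.e. after epoch i the running average equals the plain mean
# of the first i + 1 per-epoch values. A quick self-contained check with made-up
# numbers:
def _demo_incremental_mean(values=(0.3, 0.7, 0.4, 0.9)):
    running_avg = 0.0
    for i, v in enumerate(values):
        running_avg = running_avg * i / float(i + 1) + v / float(i + 1)
    assert np.isclose(running_avg, np.mean(values))
    return running_avg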
def execute_average_policy(env, policies, T, autoencoder=None, reward_fn=[], norm=[],
                           initial_state=[], n=10, render=False, epoch=0):

    p = np.zeros(shape=(tuple(ant_utils.num_states)))
    p_xy = np.zeros(shape=(tuple(ant_utils.num_states_2d)))
    random_initial_state = []

    cumulative_states_visited = 0
    states_visited = []
    cumulative_states_visited_xy = 0
    states_visited_xy = []

    rewards = np.zeros(T)

    denom = 0
    max_idx = len(policies) - 1

    # average results over n rollouts
    for iteration in range(n):

        env.reset()

        # TODO: when testing, do not want initial state.
        if len(initial_state) > 0:
            qpos = initial_state[:len(ant_utils.qpos)]
            qvel = initial_state[len(ant_utils.qpos):]
            env.env.set_state(qpos, qvel)

        obs = get_state(env, env.env._get_obs())

        random_T = np.floor(random.random() * T)
        random_initial_state = []

        for t in range(T):

            action = np.zeros(shape=(1, ant_utils.action_dim))

            if args.max_sigma:
                mu = np.zeros(shape=(1, ant_utils.action_dim))
                sigma = np.zeros(shape=(1, ant_utils.action_dim))
                mean_sigma = np.zeros(shape=(1, ant_utils.action_dim))
                for sac in policies:
                    mu += sac.get_action(obs, deterministic=True)
                    sigma = np.maximum(sigma, sac.get_sigma(obs))
                    mean_sigma += sac.get_sigma(obs)
                mu /= float(len(policies))
                mean_sigma /= float(len(policies))
                action = np.random.normal(loc=mu, scale=sigma)
            else:
                # select random policy uniform distribution
                # take non-deterministic action for that policy
                idx = random.randint(0, max_idx)
                if idx == 0:
                    action = env.action_space.sample()
                else:
                    action = policies[idx].get_action(obs, deterministic=args.deterministic)

            # Count the cumulative number of new states visited as a function of t.
            obs, _, done, _ = env.step(action)

            # log encoded data to file.
            if autoencoder is not None:
                encodedfile = 'logs/encoded/' + args.exp_name + '.txt'
                val = autoencoder.encode(obs[:29])
                with open(encodedfile, 'a') as f:
                    f.write(str(val) + '\n')
                print(autoencoder.encode(obs[:29]))

            obs = get_state(env, obs)

            reward = reward_fn[tuple(ant_utils.discretize_state(obs, norm))]
            rewards[t] += reward

            # if this is the first time you are seeing this state, increment.
            if p[tuple(ant_utils.discretize_state(obs, norm))] == 0:
                cumulative_states_visited += 1
            states_visited.append(cumulative_states_visited)
            if p_xy[tuple(ant_utils.discretize_state_2d(obs, norm))] == 0:
                cumulative_states_visited_xy += 1
            states_visited_xy.append(cumulative_states_visited_xy)

            p[tuple(ant_utils.discretize_state(obs, norm))] += 1
            p_xy[tuple(ant_utils.discretize_state_2d(obs, norm))] += 1
            denom += 1

            if t == random_T:
                random_initial_state = obs

            if render:
                env.render()
            if done:  # CRITICAL: ignore done signal
                done = False

    env.close()

    rewards /= float(n)
    plotting.reward_vs_t(rewards, epoch)

    p /= float(denom)
    p_xy /= float(denom)

    return p, p_xy, random_initial_state, states_visited, states_visited_xy
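# Sketch with hypothetical inputs, not original code: when args.max_sigma is set,
# the mixed action above is drawn from a Gaussian whose mean is the average of the
# per-policy means and whose scale is the elementwise maximum of the per-policy
# sigmas (mean_sigma is computed but the max is what feeds the sample). The helper
# below isolates that combination rule for lists of (1, action_dim) arrays.
def _demo_max_sigma_action(mus, sigmas, seed=0):
    rng = np.random.RandomState(seed)
    mu = np.mean(mus, axis=0)       # average of per-policy means
    sigma = np.max(sigmas, axis=0)  # elementwise max of per-policy sigmas
    return rng.normal(loc=mu, scale=sigma)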