def test_agent_random(self, T, normalization_factors=[], n=10):
    p = np.zeros(shape=(tuple(ant_utils.num_states)))
    p_xy = np.zeros(shape=(tuple(ant_utils.num_states_2d)))

    cumulative_states_visited_baseline = 0
    states_visited_baseline = []
    cumulative_states_visited_xy_baseline = 0
    states_visited_xy_baseline = []

    denom = 0

    for j in range(n):
        o, r, d, ep_ret, ep_len = self.test_env.reset(), 0, False, 0, 0
        o = get_state(self.test_env, o)
        while not (d or (ep_len == T)):
            a = self.test_env.action_space.sample()
            o, r, d, _ = self.test_env.step(a)
            o = get_state(self.test_env, o)
            r = self.reward(self.test_env, r, o)

            # if this is the first time you are seeing this state, increment.
            if p[tuple(ant_utils.discretize_state(
                    o, normalization_factors, self.test_env))] == 0:
                cumulative_states_visited_baseline += 1
            states_visited_baseline.append(cumulative_states_visited_baseline)
            if p_xy[tuple(ant_utils.discretize_state_2d(
                    o, normalization_factors, self.test_env))] == 0:
                cumulative_states_visited_xy_baseline += 1
            states_visited_xy_baseline.append(cumulative_states_visited_xy_baseline)

            p[tuple(ant_utils.discretize_state(o, normalization_factors, self.test_env))] += 1
            p_xy[tuple(ant_utils.discretize_state_2d(o, normalization_factors, self.test_env))] += 1
            denom += 1
            ep_len += 1

            if d:
                # CRITICAL: ignore done signal
                d = False

    p /= float(denom)
    p_xy /= float(denom)

    return p, p_xy, states_visited_baseline, states_visited_xy_baseline

def execute_policy_internal(env, T, policies, state, render):
    random_T = np.floor(random.random() * T)
    p = np.zeros(shape=(tuple(ant_utils.num_states)))
    random_initial_state = []

    for t in range(T):
        # Compute average probability over action space for state.
        probs = torch.tensor(np.zeros(shape=(1, ant_utils.action_dim))).float()
        var = torch.tensor(np.zeros(shape=(1, ant_utils.action_dim))).float()
        for policy in policies:
            prob, v = policy.get_probs_and_var(env.env.state_vector())
            probs += prob
            var += v
        probs /= len(policies)
        var /= len(policies)

        action = select_action(probs, var)
        state, reward, done, _ = env.step(action)
        p[tuple(ant_utils.discretize_state(state))] += 1

        if t == random_T:
            random_initial_state = env.env.state_vector()

        if render:
            env.render()
        if done:
            env.reset()

    p /= float(T)
    return p, random_initial_state

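# NOTE: select_action(probs, var) is called by execute_policy_internal above but is not
# defined in this excerpt. The sketch below is an assumption, not the project's code:
# it treats the averaged `probs` as Gaussian means and `var` as per-dimension variances
# and samples a continuous action from that distribution.
import torch

def select_action(probs, var):
    # Sample from N(mean, std) and drop the leading batch dimension.
    dist = torch.distributions.Normal(probs, var.sqrt())
    return dist.sample().numpy()[0]
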
def execute_one_rollout(policies, weights, env, start_obs, T, data, norm, wrapped=False):
    obs = start_obs

    p, p_xy, cumulative_states_visited, states_visited, \
        cumulative_states_visited_xy, states_visited_xy, random_initial_state = data

    random_T = np.random.randint(0, T)

    for t in range(T):
        action = select_action(policies, weights, env, obs)

        # Count the cumulative number of new states visited as a function of t.
        obs, _, done, _ = env.step(action)
        obs = get_state(env, obs, wrapped)

        # if this is the first time you are seeing this state, increment.
        if p[tuple(ant_utils.discretize_state(obs, norm, env))] == 0:
            cumulative_states_visited += 1
        states_visited.append(cumulative_states_visited)
        if p_xy[tuple(ant_utils.discretize_state_2d(obs, norm, env))] == 0:
            cumulative_states_visited_xy += 1
        states_visited_xy.append(cumulative_states_visited_xy)

        p[tuple(ant_utils.discretize_state(obs, norm, env))] += 1
        p_xy[tuple(ant_utils.discretize_state_2d(obs, norm, env))] += 1

        if t == random_T:
            random_initial_state = obs

        if done:
            # CRITICAL: ignore done signal
            done = False
            if wrapped:
                obs = env.reset()
                obs = get_state(env, obs, wrapped)

    data = (p, p_xy, cumulative_states_visited, states_visited,
            cumulative_states_visited_xy, states_visited_xy, random_initial_state)

    return data

def reward(self, env, r, o):
    # Fall back to the environment reward when no custom reward function is set.
    if len(self.reward_fn) == 0:
        return r
    # Use self.normalization_factors to normalize the state.
    tup = tuple(ant_utils.discretize_state(o, self.normalization_factors, env))
    return self.reward_fn[tup]

def test_agent(self, T, n=10, initial_state=[], normalization_factors=[],
               store_log=True, deterministic=True, reset=False):
    denom = 0
    p = np.zeros(shape=(tuple(ant_utils.num_states)))
    p_xy = np.zeros(shape=(tuple(ant_utils.num_states_2d)))

    for j in range(n):
        o, r, d, ep_ret, ep_len = self.test_env.reset(), 0, False, 0, 0

        if len(initial_state) > 0:
            qpos = initial_state[:len(ant_utils.qpos)]
            qvel = initial_state[len(ant_utils.qpos):]
            self.test_env.env.set_state(qpos, qvel)
            o = self.test_env.env._get_obs()

        o = get_state(self.test_env, o)

        while not (d or (ep_len == T)):
            # Take deterministic actions at test time.
            a = self.get_action(o, deterministic)
            o, r, d, _ = self.test_env.step(a)
            o = get_state(self.test_env, o)
            r = self.reward(self.test_env, r, o)

            ep_ret += r
            ep_len += 1
            denom += 1

            p[tuple(ant_utils.discretize_state(o, normalization_factors, self.test_env))] += 1
            p_xy[tuple(ant_utils.discretize_state_2d(o, normalization_factors, self.test_env))] += 1

            if d and reset:
                d = False

        if store_log:
            self.logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

    p /= float(denom)
    p_xy /= float(denom)

    return p, p_xy

def execute_internal(self, env, T, state, render):
    p = np.zeros(shape=(tuple(ant_utils.num_states)))

    print("Simulation starting at = " + str(state))
    state = self.get_obs()

    for t in range(T):
        action = self.select_action(state)
        _, reward, done, _ = self.env.step(action)
        state = self.get_obs()
        p[tuple(ant_utils.discretize_state(state))] += 1

        if render:
            env.render()
        if done:
            env.reset()

    env.close()
    return p

def learn_policy(self, reward_fn, initial_state=[], episodes=1000, train_steps=1000):

    if len(initial_state) == 0:
        # initial_state = self.init_state
        initial_state = self.env.reset()
        initial_state = initial_state[:29]

    print("init: " + str(initial_state))

    qpos = initial_state[:15]
    qvel = initial_state[15:]

    running_reward = 0
    running_loss = 0
    for i_episode in range(episodes):
        # if i_episode % 2 == 0:
        #     self.env.env.set_state(qpos, qvel)
        self.env.reset()
        state = self.get_obs()

        ep_reward = 0
        for t in range(train_steps):  # Don't infinite loop while learning
            action = self.select_action(state)
            _, _, done, _ = self.env.step(action)
            state = self.get_obs()
            reward = reward_fn[tuple(ant_utils.discretize_state(state))]
            ep_reward += reward
            self.rewards.append(reward)

            if done:
                self.env.reset()

        running_reward = running_reward * 0.99 + ep_reward * 0.01
        if i_episode == 0:
            running_reward = ep_reward

        loss = self.update_policy()
        running_loss = running_loss * 0.99 + loss * 0.01

        # Log to console.
        if i_episode % 10 == 0:
            print('Episode {}\tLast reward: {:.2f}\tAverage reward: {:.2f}\tLoss: {:.2f}'.format(
                i_episode, ep_reward, running_reward, running_loss))

def get_discrete_distribution(self):
    if self.p is not None:
        return self.p

    # Normalize buffer experience.
    if not self.normalized:
        self.normalize()

    p = np.zeros(shape=(tuple(ant_utils.num_states)))
    for obs in self.buffer:
        # Discretize obs and add it to the distribution tabulation.
        p[tuple(ant_utils.discretize_state(obs))] += 1
    p /= len(self.buffer)

    self.p = p
    return p

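# Usage sketch (hypothetical helper, not part of the original code): the tabulated
# distribution plugs straight into a discrete entropy estimate. scipy.stats.entropy
# normalizes its input, so the flattened array can be passed as-is.
from scipy.stats import entropy

def buffer_entropy(buf):
    # buf: an ExperienceBuffer whose get_discrete_distribution() is defined above.
    p = buf.get_discrete_distribution()
    return entropy(p.ravel())
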
def execute_random_internal(self, env, T, state, render):
    p = np.zeros(shape=(tuple(ant_utils.num_states)))

    for t in range(T):
        # Pick one of three action values (-1, 0, 1) uniformly at random.
        r = random.random()
        action = -1
        if r < 1 / 3.:
            action = 0
        elif r < 2 / 3.:
            action = 1
        # action = self.env.action_space.sample()  # continuous actions

        _, reward, done, _ = env.step([action])
        state = self.get_obs()
        p[tuple(ant_utils.discretize_state(state))] += 1

        if render:
            env.render()
        if done:
            env.reset()

    env.close()
    return p

def collect_entropy_policies(env, epochs, T, MODEL_DIR=''):

    reward_fn = np.zeros(shape=(tuple(ant_utils.num_states)))

    # Set initial state to base state.
    seed = init_state(env)
    reward_fn[tuple(ant_utils.discretize_state(seed))] = 1
    print(seed)
    print(tuple(ant_utils.discretize_state(seed)))

    running_avg_p = np.zeros(shape=(tuple(ant_utils.num_states)))
    running_avg_ent = 0
    window_running_avg_p = np.zeros(shape=(tuple(ant_utils.num_states)))
    window_running_avg_ent = 0

    running_avg_p_baseline = np.zeros(shape=(tuple(ant_utils.num_states)))
    running_avg_ent_baseline = 0
    window_running_avg_p_baseline = np.zeros(shape=(tuple(ant_utils.num_states)))
    window_running_avg_ent_baseline = 0

    baseline_entropies = []
    baseline_ps = []
    entropies = []
    ps = []

    average_entropies = []
    average_ps = []

    running_avg_entropies = []
    running_avg_ps = []
    running_avg_entropies_baseline = []
    running_avg_ps_baseline = []

    window_running_avg_ents = []
    window_running_avg_ps = []
    window_running_avg_ents_baseline = []
    window_running_avg_ps_baseline = []

    policies = []
    initial_state = []  # init_state(env)

    for i in range(epochs):
        # Learn policy that maximizes current reward function.
        policy = Policy(env, args.gamma, args.lr, ant_utils.obs_dim, ant_utils.action_dim)
        policy.learn_policy(reward_fn, initial_state, args.episodes, args.train_steps)
        policies.append(policy)
        # if args.save_models:
        #     policy.save(MODEL_DIR + 'model_' + str(i) + '.pt')

        # Get next distribution p by executing pi for T steps.
        # p_videos = 'cmp_videos/%sp_%d/' % (MODEL_DIR, i)
        initial_state = []
        p = policy.execute(T, initial_state, render=args.render)

        a = 10  # average over this many rounds
        baseline_videos = 'cmp_videos/%sbaseline_%d/' % (MODEL_DIR, i)  # note that MODEL_DIR has trailing slash
        entropy_videos = 'cmp_videos/%sentropy_%d/' % (MODEL_DIR, i)
        p_baseline = policy.execute_random(T, render=False, video_dir=baseline_videos)  # args.episodes?
        round_entropy_baseline = scipy.stats.entropy(p_baseline.flatten())
        for av in range(a - 1):
            next_p_baseline = policy.execute_random(T)
            p_baseline += next_p_baseline
            # print(scipy.stats.entropy(next_p_baseline.flatten()))
            round_entropy_baseline += scipy.stats.entropy(next_p_baseline.flatten())
        p_baseline /= float(a)
        round_entropy_baseline /= float(a)  # running average of the entropy

        # Note: the entropy of p_baseline is not the same as the computed average entropy.
        # print("baseline compare:")
        # print(round_entropy_baseline)  # running average
        # print(scipy.stats.entropy(p_baseline.flatten()))  # entropy of final

        # reward_fn = grad_ent(p)

        round_entropy = scipy.stats.entropy(p.flatten())
        entropies.append(round_entropy)
        baseline_entropies.append(round_entropy_baseline)
        ps.append(p)
        baseline_ps.append(p_baseline)

        # Execute the cumulative average policy thus far.
        # Estimate distribution and entropy.
        average_p, round_avg_ent, initial_state = \
            execute_average_policy(env, policies, T,
                                   initial_state=initial_state, avg_runs=a,
                                   render=False, video_dir=entropy_videos)

        reward_fn = grad_ent(average_p)

        average_ps.append(average_p)
        average_entropies.append(round_avg_ent)

        # Update running averages.
        window = 5
        if i < window:
            # Add normally.
            window_running_avg_ent = window_running_avg_ent * i / float(i + 1) + round_avg_ent / float(i + 1)
            window_running_avg_p = window_running_avg_p * i / float(i + 1) + average_p / float(i + 1)
            window_running_avg_ent_baseline = window_running_avg_ent_baseline * i / float(i + 1) + round_entropy_baseline / float(i + 1)
            window_running_avg_p_baseline = window_running_avg_p_baseline * i / float(i + 1) + p_baseline / float(i + 1)
        else:
            window_running_avg_ent = window_running_avg_ent + round_avg_ent / float(window) - average_entropies[i - window] / float(window)
            window_running_avg_p = window_running_avg_p + average_p / float(window) - average_ps[i - window] / float(window)
            window_running_avg_ent_baseline = window_running_avg_ent_baseline + round_entropy_baseline / float(window) - baseline_entropies[i - window] / float(window)
            window_running_avg_p_baseline = window_running_avg_p_baseline + p_baseline / float(window) - baseline_ps[i - window] / float(window)

        running_avg_ent = running_avg_ent * i / float(i + 1) + round_avg_ent / float(i + 1)
        running_avg_p = running_avg_p * i / float(i + 1) + average_p / float(i + 1)
        running_avg_entropies.append(running_avg_ent)
        running_avg_ps.append(running_avg_p)

        # Update baseline running averages.
        running_avg_ent_baseline = running_avg_ent_baseline * i / float(i + 1) + round_entropy_baseline / float(i + 1)
        running_avg_p_baseline = running_avg_p_baseline * i / float(i + 1) + p_baseline / float(i + 1)
        running_avg_entropies_baseline.append(running_avg_ent_baseline)
        running_avg_ps_baseline.append(running_avg_p_baseline)

        window_running_avg_ents.append(window_running_avg_ent)
        window_running_avg_ps.append(window_running_avg_p)
        window_running_avg_ents_baseline.append(window_running_avg_ent_baseline)
        window_running_avg_ps_baseline.append(window_running_avg_p_baseline)

        # print("p=")
        # print(p)
        # print("..........")
        # print("round_entropy = %f" % (round_entropy))

        print("---------------------")
        # print("average_p =")
        # print(average_p)
        # print("..........")
        print("round_avg_ent[%d] = %f" % (i, round_avg_ent))
        print("running_avg_ent = %s" % running_avg_ent)
        print("window_running_avg_ent = %s" % window_running_avg_ent)
        print("..........")
        print("round_entropy_baseline[%d] = %f" % (i, round_entropy_baseline))
        print("running_avg_ent_baseline = %s" % running_avg_ent_baseline)
        print("window_running_avg_ent_baseline = %s" % window_running_avg_ent_baseline)
        print("----------------------")

        # plotting.heatmap(running_avg_p, average_p, i)

    # plotting.smear_lines(running_avg_ps, running_avg_ps_baseline)
    # plotting.running_average_entropy(running_avg_entropies, running_avg_entropies_baseline)
    # plotting.running_average_entropy_window(window_running_avg_ents, window_running_avg_ents_baseline, window)
    # plotting.difference_heatmap(running_avg_ps, running_avg_ps_baseline)

    # indexes = []
    # print('which indexes?')
    # for i in range(4):
    #     idx = input("index :")
    #     indexes.append(int(idx))
    # plotting.heatmap4(running_avg_ps, running_avg_ps_baseline, indexes)

    return policies

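# NOTE: grad_ent is used above to turn the current state distribution into the next
# reward function, but it is not defined in this excerpt. A minimal sketch only,
# assuming it returns the gradient of the discrete entropy H(p) = -sum_s p(s) log p(s),
# i.e. -log p(s) - 1, with a small constant to avoid log(0). The project's version may
# differ (e.g. a 1/(p + eps) style reward, as in the non-cumulative branch of the
# second collect_entropy_policies below).
import numpy as np

def grad_ent(p, eps=1e-12):
    # Unvisited or rarely visited states receive the largest reward.
    return -np.log(p + eps) - 1.0
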
def collect_entropy_policies(env, epochs, T, MODEL_DIR=''):

    direct = os.getcwd() + '/data/'
    experiment_directory = direct + args.exp_name
    print(experiment_directory)

    indexes = [1, 5, 10, 15]
    states_visited_indexes = [0, 5, 10, 15]

    states_visited_cumulative = []
    states_visited_cumulative_baseline = []

    running_avg_p = np.zeros(shape=(tuple(ant_utils.num_states)))
    running_avg_p_xy = np.zeros(shape=(tuple(ant_utils.num_states_2d)))
    running_avg_ent = 0
    running_avg_ent_xy = 0

    running_avg_p_baseline = np.zeros(shape=(tuple(ant_utils.num_states)))
    running_avg_p_baseline_xy = np.zeros(shape=(tuple(ant_utils.num_states_2d)))
    running_avg_ent_baseline = 0
    running_avg_ent_baseline_xy = 0

    pct_visited = []
    pct_visited_baseline = []
    pct_visited_xy = []
    pct_visited_xy_baseline = []

    running_avg_entropies = []
    running_avg_entropies_xy = []
    running_avg_ps_xy = []
    avg_ps_xy = []

    running_avg_entropies_baseline = []
    running_avg_entropies_baseline_xy = []
    running_avg_ps_baseline_xy = []
    avg_ps_baseline_xy = []

    policies = []
    initial_state = init_state(env)

    # Collect a short random-action rollout to estimate state normalization factors.
    prebuf = ExperienceBuffer()
    env.reset()
    for t in range(10000):
        action = env.action_space.sample()
        obs, reward, done, _ = env.step(action)
        prebuf.store(get_state(env, obs))
        if done:
            env.reset()
            done = False

    prebuf.normalize()
    normalization_factors = prebuf.normalization_factors
    utils.log_statement(normalization_factors)
    prebuf = None

    reward_fn = np.zeros(shape=(tuple(ant_utils.num_states)))

    for i in range(epochs):
        utils.log_statement("*** ------- EPOCH %d ------- ***" % i)

        # Clear initial state if applicable.
        if not args.initial_state:
            initial_state = []
        else:
            utils.log_statement(initial_state)
            utils.log_statement(tuple(ant_utils.discretize_state_2d(initial_state, normalization_factors)))
            utils.log_statement(tuple(ant_utils.discretize_state(initial_state, normalization_factors)))

        utils.log_statement("max reward: " + str(np.max(reward_fn)))

        logger_kwargs = setup_logger_kwargs("model" + str(i), data_dir=experiment_directory)

        # Learn policy that maximizes current reward function.
        print("Learning new oracle...")
        if args.seed != -1:
            seed = args.seed
        else:
            seed = random.randint(1, 100000)

        sac = AntSoftActorCritic(lambda: gym.make(args.env),
                                 reward_fn=reward_fn, xid=i + 1,
                                 seed=seed, gamma=args.gamma,
                                 ac_kwargs=dict(hidden_sizes=[args.hid] * args.l),
                                 logger_kwargs=logger_kwargs,
                                 normalization_factors=normalization_factors,
                                 learn_reduced=args.learn_reduced)

        # TODO: start learning from initial state to add gradient?
        # The first policy is random.
        if i == 0:
            sac.soft_actor_critic(epochs=0)
        else:
            sac.soft_actor_critic(epochs=args.episodes,
                                  initial_state=initial_state,
                                  start_steps=args.start_steps)
        policies.append(sac)

        print("Learning autoencoding....")
        autoencoder = learn_encoding(env, policies, i)

        # Execute the cumulative average policy thus far.
        # Estimate distribution and entropy.
        print("Executing mixed policy...")
        average_p, average_p_xy, initial_state, states_visited, states_visited_xy = \
            execute_average_policy(env, policies, T,
                                   autoencoder=autoencoder,
                                   reward_fn=reward_fn, norm=normalization_factors,
                                   initial_state=initial_state,
                                   n=args.n, render=False, epoch=i)

        print("Calculating maxEnt entropy...")
        round_entropy = entropy(average_p.ravel())
        round_entropy_xy = entropy(average_p_xy.ravel())

        # Update running averages for maxEnt.
print("Updating maxEnt running averages...") running_avg_ent = running_avg_ent * ( i) / float(i + 1) + round_entropy / float(i + 1) running_avg_ent_xy = running_avg_ent_xy * ( i) / float(i + 1) + round_entropy_xy / float(i + 1) running_avg_p *= (i) / float(i + 1) running_avg_p += average_p / float(i + 1) running_avg_p_xy *= (i) / float(i + 1) running_avg_p_xy += average_p_xy / float(i + 1) # update reward function print("Update reward function") eps = 1 / np.sqrt(ant_utils.total_state_space) if args.cumulative: reward_fn = grad_ent(running_avg_p) else: reward_fn = 1. average_p += eps reward_fn /= average_p average_p = None # delete big array # (save for plotting) running_avg_entropies.append(running_avg_ent) running_avg_entropies_xy.append(running_avg_ent_xy) if i in indexes: running_avg_ps_xy.append(np.copy(running_avg_p_xy)) avg_ps_xy.append(np.copy(average_p_xy)) print("Collecting baseline experience....") p_baseline, p_baseline_xy, states_visited_baseline, states_visited_xy_baseline = sac.test_agent_random( T, normalization_factors=normalization_factors, n=args.n) print('Random visits same # states....') print(len(states_visited)) print(len(states_visited_baseline)) print(len(states_visited_xy)) print(len(states_visited_xy_baseline)) plotting.states_visited_over_time(states_visited, states_visited_baseline, i) plotting.states_visited_over_time(states_visited_xy, states_visited_xy_baseline, i, ext='_xy') # save for cumulative plot. if i in states_visited_indexes: # average over a whole bunch of rollouts # slow: so only do this when needed. print("Averaging unique xy states visited....") states_visited_xy = compute_states_visited_xy(env, policies, T=T, n=args.n, N=args.avg_N) states_visited_xy_baseline = compute_states_visited_xy( env, policies, T=T, n=args.n, N=args.avg_N, initial_state=initial_state, baseline=True) states_visited_cumulative.append(states_visited_xy) states_visited_cumulative_baseline.append( states_visited_xy_baseline) print("Compute baseline entropy....") round_entropy_baseline = entropy(p_baseline.ravel()) round_entropy_baseline_xy = entropy(p_baseline_xy.ravel()) # Update baseline running averages. print("Updating baseline running averages...") running_avg_ent_baseline = running_avg_ent_baseline * ( i) / float(i + 1) + round_entropy_baseline / float(i + 1) running_avg_ent_baseline_xy = running_avg_ent_baseline_xy * ( i) / float(i + 1) + round_entropy_baseline_xy / float(i + 1) running_avg_p_baseline *= (i) / float(i + 1) running_avg_p_baseline += p_baseline / float(i + 1) running_avg_p_baseline_xy *= (i) / float(i + 1) running_avg_p_baseline_xy += p_baseline_xy / float(i + 1) p_baseline = None # (save for plotting) running_avg_entropies_baseline.append(running_avg_ent_baseline) running_avg_entropies_baseline_xy.append(running_avg_ent_baseline_xy) if i in indexes: running_avg_ps_baseline_xy.append( np.copy(running_avg_p_baseline_xy)) avg_ps_baseline_xy.append(np.copy(p_baseline_xy)) utils.log_statement(average_p_xy) utils.log_statement(p_baseline_xy) # Calculate percent of state space visited. 
        pct = np.count_nonzero(running_avg_p) / float(running_avg_p.size)
        pct_visited.append(pct)
        pct_xy = np.count_nonzero(running_avg_p_xy) / float(running_avg_p_xy.size)
        pct_visited_xy.append(pct_xy)

        pct_baseline = np.count_nonzero(running_avg_p_baseline) / float(running_avg_p_baseline.size)
        pct_visited_baseline.append(pct_baseline)
        pct_xy_baseline = np.count_nonzero(running_avg_p_baseline_xy) / float(running_avg_p_baseline_xy.size)
        pct_visited_xy_baseline.append(pct_xy_baseline)

        # Print round summary.
        col_headers = ["", "baseline", "maxEnt"]
        col1 = ["round_entropy_xy", "running_avg_ent_xy",
                "round_entropy", "running_avg_ent",
                "% state space xy", "% total state space"]
        col2 = [round_entropy_baseline_xy, running_avg_ent_baseline_xy,
                round_entropy_baseline, running_avg_ent_baseline,
                pct_xy_baseline, pct_baseline]
        col3 = [round_entropy_xy, running_avg_ent_xy,
                round_entropy, running_avg_ent,
                pct_xy, pct]
        table = tabulate(np.transpose([col1, col2, col3]),
                         col_headers, tablefmt="fancy_grid", floatfmt=".4f")
        utils.log_statement(table)

        # Plot from round.
        plotting.heatmap(running_avg_p_xy, average_p_xy, i)
        plotting.heatmap1(running_avg_p_baseline_xy, i)

        if i == states_visited_indexes[3]:
            plotting.states_visited_over_time_multi(states_visited_cumulative,
                                                    states_visited_cumulative_baseline,
                                                    states_visited_indexes)

    # Cumulative plots.
    plotting.heatmap4(running_avg_ps_xy, running_avg_ps_baseline_xy, indexes, ext="cumulative")
    plotting.heatmap4(avg_ps_xy, avg_ps_baseline_xy, indexes, ext="epoch")
    plotting.running_average_entropy(running_avg_entropies, running_avg_entropies_baseline)
    plotting.running_average_entropy(running_avg_entropies_xy, running_avg_entropies_baseline_xy, ext='_xy')
    plotting.percent_state_space_reached(pct_visited, pct_visited_baseline, ext='_total')
    plotting.percent_state_space_reached(pct_visited_xy, pct_visited_xy_baseline, ext="_xy")

    return policies

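# Both collect_entropy_policies variants update their running averages with the same
# incremental-mean formula. A standalone reference sketch (hypothetical helper): if
# `avg` is the mean of the first i samples and `x` is sample i + 1, then
# mean_{i+1} = avg * i / (i + 1) + x / (i + 1), which is exactly the update applied to
# the entropies and distributions above.
def incremental_mean(avg, x, i):
    # Mean of i + 1 samples, given the mean of the first i and the new sample.
    return avg * i / float(i + 1) + x / float(i + 1)
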
def execute_average_policy(env, policies, T,
                           autoencoder=None, reward_fn=[], norm=[],
                           initial_state=[], n=10, render=False, epoch=0):

    p = np.zeros(shape=(tuple(ant_utils.num_states)))
    p_xy = np.zeros(shape=(tuple(ant_utils.num_states_2d)))

    random_initial_state = []

    cumulative_states_visited = 0
    states_visited = []
    cumulative_states_visited_xy = 0
    states_visited_xy = []

    rewards = np.zeros(T)

    denom = 0
    max_idx = len(policies) - 1

    # Average results over n rollouts.
    for iteration in range(n):

        env.reset()

        # TODO: when testing, do not want initial state.
        if len(initial_state) > 0:
            qpos = initial_state[:len(ant_utils.qpos)]
            qvel = initial_state[len(ant_utils.qpos):]
            env.env.set_state(qpos, qvel)

        obs = get_state(env, env.env._get_obs())

        random_T = np.floor(random.random() * T)
        random_initial_state = []

        for t in range(T):
            action = np.zeros(shape=(1, ant_utils.action_dim))

            if args.max_sigma:
                # Average the policy means and act with the largest per-dimension sigma.
                mu = np.zeros(shape=(1, ant_utils.action_dim))
                sigma = np.zeros(shape=(1, ant_utils.action_dim))
                mean_sigma = np.zeros(shape=(1, ant_utils.action_dim))
                for sac in policies:
                    mu += sac.get_action(obs, deterministic=True)
                    sigma = np.maximum(sigma, sac.get_sigma(obs))
                    mean_sigma += sac.get_sigma(obs)
                mu /= float(len(policies))
                mean_sigma /= float(len(policies))
                action = np.random.normal(loc=mu, scale=sigma)
            else:
                # Select a policy uniformly at random and take a
                # (possibly non-deterministic) action from it.
                idx = random.randint(0, max_idx)
                if idx == 0:
                    action = env.action_space.sample()
                else:
                    action = policies[idx].get_action(obs, deterministic=args.deterministic)

            # Count the cumulative number of new states visited as a function of t.
            obs, _, done, _ = env.step(action)

            # Log encoded data to file.
            if autoencoder is not None:
                encodedfile = 'logs/encoded/' + args.exp_name + '.txt'
                val = autoencoder.encode(obs[:29])
                with open(encodedfile, 'a') as f:
                    f.write(str(val) + '\n')
                print(autoencoder.encode(obs[:29]))

            obs = get_state(env, obs)
            reward = reward_fn[tuple(ant_utils.discretize_state(obs, norm))]
            rewards[t] += reward

            # if this is the first time you are seeing this state, increment.
            if p[tuple(ant_utils.discretize_state(obs, norm))] == 0:
                cumulative_states_visited += 1
            states_visited.append(cumulative_states_visited)
            if p_xy[tuple(ant_utils.discretize_state_2d(obs, norm))] == 0:
                cumulative_states_visited_xy += 1
            states_visited_xy.append(cumulative_states_visited_xy)

            p[tuple(ant_utils.discretize_state(obs, norm))] += 1
            p_xy[tuple(ant_utils.discretize_state_2d(obs, norm))] += 1
            denom += 1

            if t == random_T:
                random_initial_state = obs

            if render:
                env.render()
            if done:
                # CRITICAL: ignore done signal
                done = False

    env.close()

    rewards /= float(n)
    plotting.reward_vs_t(rewards, epoch)

    p /= float(denom)
    p_xy /= float(denom)

    return p, p_xy, random_initial_state, states_visited, states_visited_xy
