if __name__ == "__main__": '''Implement a basic version of the Q-learning algorithm and use it to solve the taxi domain. The agent should explore the MDP, collect data to learn the optimal policy and the optimal Q-value function. (Be mindful of how you handle terminal states, typically if St is a terminal state, V (St+1) = 0). Use gamma = 0.90. Also, you will see how an Epsilon-Greedy strategy can find the optimal policy despite finding sub-optimal Q-values. As we are looking for optimal Q-values you will have to carefully consider your exploration strategy. Evaluate your agent using the OpenAI gym 0.14.0 Taxi-v2 environment. Install OpenAI Gym 0.14.0 with pip install gym==0.14.0''' # env_HW4 = gym.make('Taxi-v2') # env_HW4 = gym.make('Taxi-v2').unwrapped env_HW4 = TaxiEnv() '''Taxi-v2 - Q-learning''' print("Taxi-v2") for i in range(1): QL_HW4 = QLearningTable(actions=list(range(env_HW4.nA)), # learning_rate=0.1, reward_decay=0.90, # gamma # epsilon=0.2, verbose=True) Q_output = Q_HW4(num_episode = 2500000, # Sammy runed 10000000 episodes to get stable updates. # Q table only updated for 2500000 episodes # sometimes only makes 90% score learning_rate=0.01) # function to execute the q-learner, shown above print(Q_output)
        phi = set()
        for var in variables:
            for i, node in enumerate(cat.trajectory):
                # singleton set containing the index of the last trajectory node
                if node.outgoing[var] == {len(cat.trajectory) - 1}:
                    phi.add(i)
        for i in reversed(range(len(cat.trajectory))):
            if cat.trajectory[i].get_all_outgoing().issubset(phi):
                phi.add(i)
        phi = sorted(list(phi))
        for var in variables:
            last_incoming = None
            for i in range(len(phi)):
                incoming_index = cat.trajectory[phi[i]].incoming[var]
                if incoming_index is not None and incoming_index not in phi:
                    last_incoming = i
            phi = phi[last_incoming:]
        ans.append(phi)
    return ans


if __name__ == "__main__":
    from taxi import TaxiEnv

    agent = HG()
    env = TaxiEnv()
    tr = agent.build_CAT(env)
    print(*tr, sep='\n')
    ct = HG.CAT_trajectory(tr[0])
    qt = set(q_table_lp)
    qfile = open("other.lp", "w")
    for (state, action) in qt:
        taxirow, taxicol, passidx, _ = env.decode(state)
        actionname = getActionName(action)
        # comment = "% taxi:" + str(taxirow) + "," + str(taxicol) + ",passenger:" + str(passidx) + "\n"
        qrule = ("q((" + str(taxirow) + "," + str(taxicol) + "," + str(passidx)
                 + ")," + actionname + "," + str(ro_table[state, action]) + ").\n")
        qfile.write(qrule)
    qfile.close()


if __name__ == '__main__':
    env = TaxiEnv()
    nA = env.nA
    nS = env.nS

    # Parameters
    LEARNING_RATE_Q = 1
    LEARNING_RATE_R = 1
    DISCOUNT = 0.001
    EPSILON = 0.01
    BETA = 0.3

    # Q and rewards
    # q_table = np.zeros((nS, nA))
    # q_table_lp = {}
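# `getActionName` and `ro_table` are defined elsewhere in this file. As a
# reference, an action-name mapping consistent with the standard Taxi action
# encoding (0=south, 1=north, 2=east, 3=west, 4=pickup, 5=dropoff) could look
# like the sketch below; the exact names written into other.lp are an
# assumption here.
_TAXI_ACTION_NAMES = ["south", "north", "east", "west", "pickup", "dropoff"]


def get_action_name_sketch(action):
    """Map a Taxi action index to a readable name for the ASP facts."""
    return _TAXI_ACTION_NAMES[action]

# Each emitted fact in other.lp then has the form
#   q((taxirow,taxicol,passidx),actionname,value).
# e.g. q((3,1,2),south,-1.05).   (values are illustrative)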
def game(N_episodes, AI_type, Intrinsic_type, clip_ratio):
    ############## Hyperparameters ##############
    env = TaxiEnv()
    # memory = Memory(max_size=300)
    # n_episodes = number_of_episodes
    # n_actions = env.action_space.n
    # intrinsic = intrinsic
    # print(n_actions)
    # n_agents = 1
    # n_episodes = number_of_episodes
    # state_size = env.observation_space.n
    # env_name = "LunarLander-v2"  # creating environment
    state_dim = env.observation_space.n
    action_dim = env.action_space.n
    render = False
    solved_reward = 230        # stop training if avg_reward > solved_reward
    log_interval = 20          # print avg reward in the interval
    max_episodes = N_episodes  # max training episodes
    max_timesteps = 250        # max timesteps in one episode
    n_latent_var = 64          # number of variables in hidden layer
    update_timestep = 2000     # update policy every n timesteps
    lr = 0.002
    betas = (0.9, 0.999)
    gamma = 0.99               # discount factor
    K_epochs = 4               # update policy for K epochs
    eps_clip = 0.2             # clip parameter for PPO
    random_seed = None
    samp_rewards = []
    avg_rewards = []
    best_avg_reward = -np.inf
    n_agents = 1
    #############################################

    if random_seed:
        torch.manual_seed(random_seed)
        env.seed(random_seed)

    memory = Memory()
    ppo = PPO(state_dim, action_dim, n_latent_var, lr, betas, gamma, K_epochs,
              eps_clip)
    print(lr, betas)

    # logging variables
    running_reward = 0
    avg_length = 0
    timestep = 0
    avg_reward = 0
    ppo.memcount.delete()
    state_size = env.observation_space.n

    reward_rms = RunningMeanStd()
    obs_rms = RunningMeanStd()
    norm_step = 5000

    # Pre-run with random actions to initialise observation normalisation
    next_obs = []
    for norm_step in range(norm_step):
        action_norm = np.random.randint(0, action_dim)
        state_norm, reward_norm, done_norm, _ = env.step(action_norm)
        state_norm = to_categorical(state_norm, state_size)  # optional
        next_obs.append(state_norm)
    obs_rms.update(next_obs)

    # training loop
    for i_episode in range(1, max_episodes + 1):
        state = env.reset()
        state = to_categorical(state, state_size)
        done = False
        t = 0
        episode_reward = 0
        intrinsic_rewards = 0
        reward = 0
        # for t in range(max_timesteps):
        # while not done:
        while t <= max_timesteps:
            timestep += 1
            t += 1
            # Running policy_old:
            action = ppo.policy_old.act(state, memory)
            state, reward, done, _ = env.step(action)
            state = to_categorical(state, state_size)
            # ========================================================
            if ((AI_type == "PPO" or AI_type == "A2C")
                    and Intrinsic_type == "1"):
                intrinsic_rewards = get_intrinsic_rewards(
                    AI_type, state, ppo, n_agents, 10)
                intrinsic_rewards = intrinsic_rewards.data.numpy()
                # print("intrinsic_rewards1", intrinsic_rewards)
            elif ((AI_type == "PPO" or AI_type == "A2C")
                  and Intrinsic_type == "2"):
                intrinsic_rewards = get_intrinsic_rewards2(
                    AI_type, state, action, ppo, n_agents, 10)
                intrinsic_rewards = intrinsic_rewards.data.numpy()
                # print("intrinsic_rewards2", intrinsic_rewards)
            elif ((AI_type == "PPO" or AI_type == "A2C")
                  and Intrinsic_type == "3"):
                intrinsic_rewards = get_intrinsic_rewards3(
                    AI_type, state, action, ppo, n_agents, reward, 1)
                intrinsic_rewards = intrinsic_rewards.data.numpy()
                # print("intrinsic_rewards3", intrinsic_rewards)
            elif ((AI_type == "PPO" or AI_type == "A2C")
                  and Intrinsic_type == "4"):
                intrinsic_rewards = get_intrinsic_rewards4(
                    AI_type, state, action, ppo, n_agents, reward, t, 1, 0.99)
            elif ((AI_type == "PPO" or AI_type == "A2C")
                  and Intrinsic_type == "5"):
                intrinsic_rewards = get_intrinsic_rewards5(
                    AI_type, state, ppo, n_agents, 1, 16)
                # print("intrinsic_rewards5", intrinsic_rewards)
            else:
                intrinsic_rewards = 0

            reward_sum = reward  # + intrinsic_rewards
            # ===========================================================
            memory.rewards.append(reward_sum)
            # temp_int = memory.intrinsic_rewards.data.numpy()
            # temp_int = memory.intrinsic_rewards
            # print(temp_int)
            memory.intrinsic_rewards.append(intrinsic_rewards)
            memory.is_terminals.append(done)
            """
            try:
                mean1, std1, count1 = np.mean(temp_int), np.std(temp_int), len(temp_int)
                reward_rms.update_from_moments(mean1, std1 ** 2, count1)
                adv_int = (memory.intrinsic_rewards - reward_rms.mean) / np.sqrt(reward_rms.var)
            except:
                adv_int = 0
            """

            # update the policy if it's time
            if timestep % update_timestep == 0:
                temp_int = memory.intrinsic_rewards
                mean1, std1, count1 = np.mean(temp_int), np.std(temp_int), len(temp_int)
                reward_rms.update_from_moments(mean1, std1**2, count1)
                adv_int = (temp_int) / np.sqrt(reward_rms.var)
                ppo.update(memory, adv_int, clip_ratio)
                memory.clear_memory()
                timestep = 0

            running_reward += reward
            episode_reward += reward
            if render:
                env.render()
            # if done:
            #     break

        avg_length += t

        # stop training if avg_reward > solved_reward
        if running_reward > (log_interval * solved_reward):
            print("########## Solved! ##########")
            # torch.save(ppo.policy.state_dict(), './PPO_{}.pth'.format(env_name))
            # break

        # logging
        if i_episode % log_interval == 0:
            avg_length = int(avg_length / log_interval)
            running_reward = int((running_reward / log_interval))
            print('Episode {} \t avg length: {} \t reward: {}'.format(
                i_episode, avg_length, running_reward))
            running_reward = 0
            avg_length = 0

        samp_rewards.append(episode_reward)
        if (i_episode >= 100):
            # get average reward from last 100 episodes
            avg_reward = np.mean(samp_rewards[-100:])
            # append to deque
            avg_rewards.append(avg_reward)
            # update best average reward
            if avg_reward > best_avg_reward:
                best_avg_reward = avg_reward

        print("Total reward in episode {} = {}".format(i_episode, episode_reward))
        print("Best_avg_reward =", np.round(best_avg_reward, 3),
              "Average_rewards =", np.round(avg_reward, 3))

    # env.save_replay()
    env.close()
    return avg_rewards, best_avg_reward, samp_rewards, "0"
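# The PPO class used above is defined elsewhere in this repo. As a point of
# reference, the standard clipped surrogate objective that `eps_clip` /
# `clip_ratio` controls is sketched below; the function name, argument shapes,
# and the usage comment are assumptions, not the repo's actual API.
import torch


def clipped_surrogate_loss_sketch(logprobs, old_logprobs, advantages,
                                  clip_ratio=0.2):
    """PPO clipped surrogate: -E[min(r_t * A_t, clip(r_t, 1-eps, 1+eps) * A_t)]."""
    ratios = torch.exp(logprobs - old_logprobs.detach())
    surr1 = ratios * advantages
    surr2 = torch.clamp(ratios, 1.0 - clip_ratio, 1.0 + clip_ratio) * advantages
    return -torch.min(surr1, surr2).mean()

# Illustrative entry point for the training function above (argument values
# are hypothetical):
# avg_rewards, best_avg_reward, samp_rewards, _ = game(
#     N_episodes=2000, AI_type="PPO", Intrinsic_type="1", clip_ratio=0.2)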
def main(cfg):
    initial_seed = cfg.initial_seed
    random.seed(initial_seed)
    np.random.seed(initial_seed)
    gamma = cfg.gamma
    n_trajectories = cfg.n_trajectories
    horizon = cfg.horizon
    horizon_normalization = (1 - gamma**horizon) / (1 - gamma)
    processor = lambda x: x

    seed_list = [
        initial_seed + np.random.randint(0, 10000) * i
        for i in range(cfg.n_experiments)
    ]  # generate a list of random seeds

    if cfg.env == 'grid_world':
        from gridworld import GridWorldEnv
        env = GridWorldEnv()
    elif cfg.env == 'taxi':
        from taxi import TaxiEnv
        env = TaxiEnv()

    n_states = env.nS
    n_actions = env.nA
    P = env.P_matrix
    R = env.R_matrix.copy()
    d0 = env.isd

    q_star_original = env.value_iteration()
    # pi_prob = gymEnv.extract_policy(q_star_original, temperature=0.05)
    # mu_prob = gymEnv.extract_policy(q_star_original, temperature=1)
    pi = env.extract_policy(q_star_original, temperature=0.1)
    # mu = env.extract_policy(q_star_original, temperature=0.3)
    # pi = env.extract_policy(q_star_original, temperature=0.15)
    # mu = env.extract_policy(q_star_original, temperature=0.3)
    # mu = pi.copy(); mu[:,1] = pi[:,2].copy(); mu[:,2] = pi[:,1].copy()
    mu = pi.copy()
    mu[:, 0] = pi[:, 1].copy()
    mu[:, 1] = pi[:, 2].copy()
    mu[:, 2] = pi[:, 3].copy()
    mu[:, 3] = pi[:, 0].copy()

    dpi, dpi_t, v_pi_s, P_pi = exact_calculation(env, pi, cfg.horizon, cfg.gamma)
    dmu, dmu_t, vmu_s, P_mu = exact_calculation(env, mu, cfg.horizon, cfg.gamma)

    #! Sanity-check the loss objective.
    #* Verify the claim that L(w*, f) = 0 for all f, where
    #* L(w, f) = \E_{(s,a,s') \sim d_mu} [ w(s) (gamma * rho(s,a) * f(s') - f(s)) ]
    #*           + 1/h \E_{s \sim d0} [f(s)] - 1/h * gamma^horizon \E_{s \sim d_{pi,H}} [f(s)],
    #* with h the horizon normalization and rho(s,a) = pi(a|s) / mu(a|s).

    # determine w_star
    w_star = np.nan_to_num(dpi / dmu)
    v_pi = np.sum(d0 * v_pi_s)
    v_mu = np.sum(d0 * vmu_s)
    dpi_H = np.dot(P_pi.T, dpi_t[:, horizon - 1])
    dmu_H = np.dot(P_mu.T, dmu_t[:, horizon - 1])

    if RUN_SANITY_CHECK:
        def L(w, f):
            loss = 0
            for s in range(n_states):
                for a in range(n_actions):
                    for sn in range(n_states):
                        loss += (w[s] * (-f[s] + gamma * pi[s, a] / mu[s, a] * f[sn])
                                 * dmu[s] * mu[s, a] * P[s, a, sn])
            loss += 1 / horizon_normalization * np.sum(d0 * f)
            loss -= 1 / horizon_normalization * gamma**horizon * np.sum(dpi_H * f)
            return loss

        f = np.random.rand(n_states)
        loss = L(w_star, f)
        assert abs(loss) < 1e-8

        #! Sanity-check the Bellman and TD errors.
        R_pi = np.sum(R * pi, axis=-1)
        bellman_original = v_pi_s - R_pi - gamma * np.dot(P_pi, v_pi_s)
        bellman_new = v_pi_s - np.dot(
            (np.identity(n_states) -
             np.linalg.matrix_power(gamma * P_pi, horizon)),
            R_pi) - gamma * np.dot(P_pi, v_pi_s)
        pdb.set_trace()

    ground_truth_info = AttrDict({})
    ground_truth_info.update({
        'd_pi': torch.tensor(dpi, dtype=dtype),
        'd_mu': torch.tensor(dmu, dtype=dtype),
        'v_pi': torch.tensor(v_pi_s, dtype=dtype),
        'v_star': v_pi
    })
    ground_truth_info.update({'w_pi': w_star})
    ground_truth_info.update({'P': torch.tensor(env.P_matrix, dtype=dtype)})
    ground_truth_info.update({
        'pi': torch.tensor(pi, dtype=dtype),
        'mu': torch.tensor(mu, dtype=dtype)
    })
    true_rho = torch.tensor(pi / mu, dtype=dtype)
    true_rho[true_rho != true_rho] = 0
    true_rho[torch.isinf(true_rho)] = 0
    ground_truth_info.update({'rho': true_rho})
    ground_truth_info.update({'d0': torch.tensor(env.isd, dtype=dtype)})
    ground_truth_info.update({'R': torch.tensor(env.R_matrix, dtype=dtype)})
    ground_truth_info.update({'d_pi_H': torch.tensor(dpi_H, dtype=dtype)})
    ground_truth_info.update({'d_mu_H': torch.tensor(dmu_H, dtype=dtype)})
    ground_truth_info.update({
        'd_pi_t': torch.tensor(dpi_t, dtype=dtype),
        'd_mu_t': torch.tensor(dmu_t, dtype=dtype)
    })

    estimate = {}
    squared_error = {}
    estimate.update({'True pi': [float(v_pi)]})
    squared_error.update({'True pi': [0]})
    estimate.update({'True mu': [float(v_mu)]})
    squared_error.update({'True mu': [float(v_mu - v_pi)**2]})

    #* Generate multiple sets of behavior data from mu
    training_data = []
    training_data_processed = []
    for _ in range(cfg.n_experiments):
        print('Experiment:', _)
        print('------------------------')
        np.random.seed(seed_list[_])
        env.seed(seed_list[_])
        # behavior_data = rollout(env, mu, processor, absorbing_state, pi_e=pi,
        #                         N=n_trajectories, T=horizon, frameskip=1,
        #                         frameheight=1, path=None, filename='tmp')
        behavior_data, _, _ = roll_out(env, mu, n_trajectories, horizon)
        behavior_data_processed = prepare_behavior_data(behavior_data)
        training_data.append(behavior_data)
        training_data_processed.append(behavior_data_processed)

    # pdb.set_trace()
    estimate['IS'], estimate['STEP IS'], estimate['WIS'], estimate['STEP WIS'], estimate['Mu hat'] = [], [], [], [], []
    squared_error['IS'] = []
    squared_error['STEP IS'] = []
    squared_error['WIS'] = []
    squared_error['STEP WIS'] = []
    squared_error['Mu hat'] = []
    estimate['IH_SN'] = []
    squared_error['IH_SN'] = []
    estimate['IH_no_SN'] = []
    squared_error['IH_no_SN'] = []
    estimate['MB'] = []
    squared_error['MB'] = []

    ###* Looping over the number of baseline experiments
    for _ in range(cfg.n_experiments):
        behavior_data = training_data[_]
        behavior_data_processed = training_data_processed[_]

        IS = importance_sampling_estimator(behavior_data, mu, pi, gamma)
        step_IS = importance_sampling_estimator_stepwise(
            behavior_data, mu, pi, gamma)
        WIS = weighted_importance_sampling_estimator(behavior_data, mu, pi, gamma)
        step_WIS = weighted_importance_sampling_estimator_stepwise(
            behavior_data, mu, pi, gamma)

        estimate['IS'].append(float(IS))
        squared_error['IS'].append(float((IS - v_pi)**2))
        estimate['STEP IS'].append(float(step_IS))
        squared_error['STEP IS'].append(float((step_IS - v_pi)**2))
        estimate['WIS'].append(float(WIS))
        squared_error['WIS'].append(float((WIS - v_pi)**2))
        estimate['STEP WIS'].append(float(step_WIS))
        squared_error['STEP WIS'].append(float((step_WIS - v_pi)**2))

        MB = model_based(n_states, n_actions, behavior_data, pi, gamma)
        estimate['MB'].append(float(MB))
        squared_error['MB'].append(float((MB - v_pi)**2))

        IH, IH_unnormalized = lihong_infinite_horizon(n_states, behavior_data,
                                                      mu, pi, gamma)
        estimate['IH_SN'].append(float(IH))
        squared_error['IH_SN'].append(float((IH - v_pi)**2))
        estimate['IH_no_SN'].append(float(IH_unnormalized))
        squared_error['IH_no_SN'].append(float((IH_unnormalized - v_pi)**2))

    display((estimate, squared_error))
    print('exp seed:', cfg.initial_seed)
    # pdb.set_trace()

    if RUN_SANITY_CHECK:
        #! Let's run some additional sanity checks.
        #* Check that the bias formula checks out.
        v_w = 0
        normalization = 0
        for trajectory in behavior_data:
            discounted_t = 1
            for s, a, sn, r in trajectory:
                v_w += w_star[s] * pi[s, a] / mu[s, a] * r * discounted_t
                normalization += discounted_t
                discounted_t *= gamma
        v_w = v_w / normalization

        on_policy_data, frequency, avg_reward = roll_out(env, pi, 4096, horizon)
        # pdb.set_trace()
        empirical_v_pi = np.zeros(n_states)
        empirical_d_pi = np.zeros(n_states)
        empirical_d0 = np.zeros(n_states)
        empirical_r_pi = np.zeros(n_states)
        empirical_frequency = np.zeros(n_states)
        empirical_P = np.zeros((n_states, n_actions, n_states))
        horizon_normalization = (1 - gamma**horizon) / (1 - gamma)

        num_traj = len(on_policy_data)
        for trajectory in on_policy_data:
            discounted_t = 1
            for s, a, sn, r in trajectory:
                empirical_v_pi[s] += r * discounted_t
                empirical_d_pi[s] += discounted_t
                # empirical_d0[s] += 1 - discounted_t
                discounted_t *= gamma
                empirical_r_pi[s] += r
                empirical_frequency[s] += 1
                empirical_P[s, a, sn] += 1
        empirical_v_pi = empirical_v_pi / num_traj
        empirical_d_pi = empirical_d_pi / horizon_normalization / num_traj
        empirical_P = np.nan_to_num(
            empirical_P / np.sum(empirical_P, axis=-1)[:, :, None])
        # T = np.nan_to_num(T / np.sum(T, axis=-1)[:, :, None])
        empirical_r_pi = np.nan_to_num(empirical_r_pi / empirical_frequency)
        empirical_P_pi = np.einsum('san,sa->sn', empirical_P, pi)

        empirical_d_mu = np.zeros(n_states)
        num_traj = len(behavior_data)
        for trajectory in behavior_data:
            discounted_t = 1
            for s, a, sn, r in trajectory:
                empirical_d_mu[s] += discounted_t
                discounted_t *= gamma
        empirical_d_mu = empirical_d_mu / horizon_normalization / num_traj
        empirical_w = np.nan_to_num(empirical_d_pi / empirical_d_mu)
        empirical_loss = L(empirical_w, empirical_v_pi)

        empirical_bellman_original = 0
        empirical_bellman_new = 0
        empirical_td_error = 0
        num_traj = len(on_policy_data)
        empirical_r_pi_adjusted = np.dot(
            (np.identity(n_states) -
             np.linalg.matrix_power(gamma * empirical_P_pi, horizon)),
            empirical_r_pi)
        for trajectory in on_policy_data:
            discounted_t = 1.0
            for s, a, sn, r in trajectory:
                empirical_bellman_original += discounted_t * (
                    v_pi_s[s] - empirical_r_pi[s] -
                    gamma * np.dot(empirical_P_pi[s, :], v_pi_s))**2
                empirical_bellman_new += discounted_t * (
                    v_pi_s[s] - empirical_r_pi_adjusted[s] -
                    gamma * np.dot(empirical_P_pi[s, :], v_pi_s))**2
                empirical_td_error += discounted_t * (
                    v_pi_s[s] - r - gamma * v_pi_s[sn])**2
                discounted_t *= gamma
        empirical_td_error = empirical_td_error / horizon_normalization / num_traj
        empirical_bellman_original = (empirical_bellman_original /
                                      horizon_normalization / num_traj)
        empirical_bellman_new = (empirical_bellman_new /
                                 horizon_normalization / num_traj)
        # empirical_bellman_original = empirical_v_pi - empirical_r_pi - gamma * np.dot(empirical_P_pi, empirical_v_pi)
        # bellman_original = v_pi_s - R_pi - gamma * np.dot(P_pi, v_pi_s)
        # bellman_new = v_pi_s - np.dot((np.identity(n_states) - np.linalg.matrix_power(gamma * P_pi, horizon)), R_pi) - gamma * np.dot(P_pi, v_pi_s)
        pdb.set_trace()

    for objective in cfg.objective:
        estimate[objective] = []
        squared_error[objective] = []
        objective_sn = objective + '-SN'
        estimate[objective_sn] = []
        squared_error[objective_sn] = []

    for i in range(cfg.n_experiments):
        training_set = training_data_processed[i]
        fixed_terminal_value = True
        logging = cfg.logging
        mvm = Tabular_State_MVM_Estimator(training_set, cfg, logging=logging,
                                          ground_truth=ground_truth_info)
        penalty = cfg.penalty_input
        horizon_normalization = (1 - gamma**horizon) / (1 - gamma)
        # penalty_base = 1 / mdp_calculator.horizon_normalization  # / cfg.n_trajectories
        penalty_base = 1 / horizon_normalization
        mvm.set_random_seed(seed_list[i])  # different random seed per experiment
        mvm.solve_closed_form_bias()
        mvm.generate_random_v_class(cfg.v_class_cardinality)
        mvm.generate_random_w_class(cfg.v_class_cardinality)
        # mvm.bias_check()
        for objective in cfg.objective:
            mvm.set_random_seed(seed_list[i])
            # w_estimator = mvm.optimize_finite_class(objective=objective, td_penalty=penalty * penalty_base)
            # w_estimator = mvm.optimize_discrete(objective=objective, td_penalty=penalty * penalty_base)
            w_estimator = mvm.optimize(objective, td_penalty=0.1)
            # w_estimator, w_estimator_sn = mvm.optimize_optimistic()
            # w_estimator, w_estimator_sn = mvm.optimize_optimistic_adam(objective=objective, td_penalty=penalty * penalty_base)
            # w_estimator = mvm.optimize_closed_form()
            estimate[objective].append(float(w_estimator))
            # objective_sn = objective + '-SN'
            # estimate[objective_sn].append(float(w_estimator_sn))
            squared_error[objective].append(float(w_estimator - v_pi)**2)
            # squared_error[objective_sn].append(float(w_estimator_sn - v_pi)**2)
        display((estimate, squared_error))

    display((estimate, squared_error))
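# The baseline estimators used above (importance_sampling_estimator and its
# weighted / step-wise variants) live in a separate module. For reference, a
# minimal trajectory-wise importance sampling estimator consistent with the
# (s, a, s', r) tuple format used in this file could look like the sketch
# below; the function name and the simple per-trajectory average are
# assumptions, not the repo's exact implementation.
import numpy as np


def is_estimator_sketch(behavior_data, mu, pi, gamma):
    """Average over trajectories of (product of pi/mu ratios) * (discounted return)."""
    values = []
    for trajectory in behavior_data:
        rho = 1.0                # cumulative importance ratio for the trajectory
        discounted_return = 0.0
        discount = 1.0
        for s, a, sn, r in trajectory:
            rho *= pi[s, a] / mu[s, a]
            discounted_return += discount * r
            discount *= gamma
        values.append(rho * discounted_return)
    return np.mean(values)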
def main(cfg):
    initial_seed = cfg.initial_seed
    random.seed(initial_seed)
    np.random.seed(initial_seed)
    gamma = cfg.gamma
    # n_trajectories_list = cfg.n_trajectories
    # for n_trajectories in n_trajectories_list:
    # n_trajectories = cfg.n_trajectories
    horizon = cfg.horizon
    horizon_normalization = (1 - gamma**horizon) / (1 - gamma) if gamma < 1 else horizon

    seed_list = [
        initial_seed + np.random.randint(0, 10000) * i
        for i in range(cfg.n_experiments)
    ]  # generate a list of random seeds

    if cfg.env == 'grid_world':
        from gridworld import GridWorldEnv
        env = GridWorldEnv()
    elif cfg.env == 'taxi':
        from taxi import TaxiEnv
        env = TaxiEnv()

    n_states = env.nS
    n_actions = env.nA
    P = env.P_matrix
    R = env.R_matrix.copy()
    d0 = env.isd

    q_star_original = env.value_iteration()
    pi = env.extract_policy(q_star_original, temperature=0.3)
    mu = env.extract_policy(q_star_original, temperature=0.1)
    # pi = env.extract_policy(q_star_original, temperature=0.1)
    # mu = env.extract_policy(q_star_original, temperature=0.3)
    # pi = env.extract_policy(q_star_original, temperature=0.3)
    # mu = env.extract_policy(q_star_original, temperature=0.15)
    # mu = pi.copy(); mu[:,1] = pi[:,2].copy(); mu[:,2] = pi[:,1].copy()
    #* 4 swapped cyclic
    # mu = pi.copy(); mu[:,0] = pi[:,1].copy(); mu[:,1] = pi[:,2].copy(); mu[:,2] = pi[:,3].copy(); mu[:,3] = pi[:,0].copy()
    #* D swapped with R, L swapped with U
    # mu = pi.copy(); mu[:,0] = pi[:,3].copy(); mu[:,1] = pi[:,2].copy(); mu[:,2] = pi[:,1].copy(); mu[:,3] = pi[:,0].copy()
    # mu = pi.copy(); mu[:,0] = pi[:,1].copy(); mu[:,1] = pi[:,2].copy(); mu[:,2] = pi[:,3].copy(); mu[:,3] = pi[:,4].copy(); mu[:,4] = pi[:,5].copy(); mu[:,5] = pi[:,0].copy()

    dpi, dpi_t, v_pi_s, q_pi_sa, P_pi = exact_calculation(
        env, pi, cfg.horizon, cfg.gamma)
    dmu, dmu_t, vmu_s, qmu_sa, P_mu = exact_calculation(
        env, mu, cfg.horizon, cfg.gamma)

    w_star = np.nan_to_num(dpi / dmu)
    v_pi = np.sum(d0 * v_pi_s)
    v_mu = np.sum(d0 * vmu_s)
    dpi_H = np.dot(P_pi.T, dpi_t[:, horizon - 1])
    dmu_H = np.dot(P_mu.T, dmu_t[:, horizon - 1])

    ground_truth_info = AttrDict({})
    ground_truth_info.update({
        'd_pi': torch.tensor(dpi, dtype=dtype),
        'd_mu': torch.tensor(dmu, dtype=dtype),
        'v_pi': torch.tensor(v_pi_s, dtype=dtype),
        'q_pi': torch.tensor(q_pi_sa, dtype=dtype),
        'v_star': v_pi
    })
    ground_truth_info.update({'w_pi': w_star})
    ground_truth_info.update({'P': torch.tensor(env.P_matrix, dtype=dtype)})
    ground_truth_info.update({
        'pi': torch.tensor(pi, dtype=dtype),
        'mu': torch.tensor(mu, dtype=dtype)
    })
    true_rho = torch.tensor(pi / mu, dtype=dtype)
    true_rho[true_rho != true_rho] = 0
    true_rho[torch.isinf(true_rho)] = 0
    ground_truth_info.update({'rho': true_rho})
    ground_truth_info.update({'d0': torch.tensor(env.isd, dtype=dtype)})
    ground_truth_info.update({'R': torch.tensor(env.R_matrix, dtype=dtype)})
    ground_truth_info.update({'d_pi_H': torch.tensor(dpi_H, dtype=dtype)})
    ground_truth_info.update({'d_mu_H': torch.tensor(dmu_H, dtype=dtype)})
    ground_truth_info.update({
        'd_pi_t': torch.tensor(dpi_t, dtype=dtype),
        'd_mu_t': torch.tensor(dmu_t, dtype=dtype)
    })

    estimate = {}
    squared_error = {}
    estimate.update({'True pi': [float(v_pi)]})
    squared_error.update({'True pi': [0]})
    estimate.update({'True mu': [float(v_mu)]})
    squared_error.update({'True mu': [float(v_mu - v_pi)**2]})

    results = {}
    results['trajectories'] = []
    results['IS'] = []
    results['IH'] = []
    results['MB'] = []
    results['WIS'] = []
    results['STEP WIS'] = []
    results['STEP IS'] = []
    results['True mu'] = []
    for objective in cfg.objective:
        results[objective] = []

    n_trajectories_list = cfg.n_trajectories
    for n_trajectories in n_trajectories_list:
        print('------------------------')
        #* Generate multiple sets of behavior data from mu
        training_data = []
        training_data_processed = []
        for _ in range(cfg.n_experiments):
            # print('Experiment:', _)
            # print('------------------------')
            np.random.seed(seed_list[_])
            env.seed(seed_list[_])
            behavior_data, _, _ = roll_out(env, mu, n_trajectories, horizon)
            behavior_data_processed = prepare_behavior_data(behavior_data)
            training_data.append(behavior_data)
            training_data_processed.append(behavior_data_processed)

        estimate['IS'], estimate['STEP IS'], estimate['WIS'], estimate['STEP WIS'], estimate['Mu hat'] = [], [], [], [], []
        squared_error['IS'] = []
        squared_error['STEP IS'] = []
        squared_error['WIS'] = []
        squared_error['STEP WIS'] = []
        squared_error['Mu hat'] = []
        estimate['IH_SN'] = []
        squared_error['IH_SN'] = []
        estimate['IH_no_SN'] = []
        squared_error['IH_no_SN'] = []
        estimate['MB'] = []
        squared_error['MB'] = []

        ###* Looping over the number of baseline experiments
        for _ in range(cfg.n_experiments):
            behavior_data = training_data[_]
            behavior_data_processed = training_data_processed[_]

            IS = importance_sampling_estimator(behavior_data, mu, pi, gamma)
            step_IS = importance_sampling_estimator_stepwise(
                behavior_data, mu, pi, gamma)
            WIS = weighted_importance_sampling_estimator(
                behavior_data, mu, pi, gamma)
            step_WIS = weighted_importance_sampling_estimator_stepwise(
                behavior_data, mu, pi, gamma)

            estimate['IS'].append(float(IS))
            squared_error['IS'].append(float((IS - v_pi)**2))
            estimate['STEP IS'].append(float(step_IS))
            squared_error['STEP IS'].append(float((step_IS - v_pi)**2))
            estimate['WIS'].append(float(WIS))
            squared_error['WIS'].append(float((WIS - v_pi)**2))
            estimate['STEP WIS'].append(float(step_WIS))
            squared_error['STEP WIS'].append(float((step_WIS - v_pi)**2))

            MB = model_based(n_states, n_actions, behavior_data, pi, gamma)
            estimate['MB'].append(float(MB))
            squared_error['MB'].append(float((MB - v_pi)**2))

            IH, IH_unnormalized = lihong_infinite_horizon(
                n_states, behavior_data, mu, pi, gamma)
            estimate['IH_SN'].append(float(IH))
            squared_error['IH_SN'].append(float((IH - v_pi)**2))
            estimate['IH_no_SN'].append(float(IH_unnormalized))
            squared_error['IH_no_SN'].append(float((IH_unnormalized - v_pi)**2))
            # display((estimate, squared_error))
            # print('exp seed:', cfg.initial_seed)
            # pdb.set_trace()

        results['trajectories'].append(np.log2(n_trajectories))
        results['IH'].append(
            np.log2(sum(squared_error['IH_SN']) / len(squared_error['IH_SN']) / v_pi**2))
        results['MB'].append(
            np.log2(sum(squared_error['MB']) / len(squared_error['IH_SN']) / v_pi**2))
        results['IS'].append(
            np.log2(sum(squared_error['IS']) / len(squared_error['IS']) / v_pi**2))
        results['WIS'].append(
            np.log2(sum(squared_error['WIS']) / len(squared_error['WIS']) / v_pi**2))
        results['STEP WIS'].append(
            np.log2(sum(squared_error['STEP WIS']) / len(squared_error['STEP WIS']) / v_pi**2))
        results['STEP IS'].append(
            np.log2(sum(squared_error['STEP IS']) / len(squared_error['STEP IS']) / v_pi**2))
        results['True mu'].append(
            np.log2(sum(squared_error['True mu']) / len(squared_error['True mu']) / v_pi**2))

        for objective in cfg.objective:
            estimate[objective] = []
            squared_error[objective] = []

        # for i in range(cfg.n_experiments):
        #     training_set = training_data_processed[i]
        #     mvm = Tabular_State_MVM_Estimator(training_set, cfg, ground_truth=ground_truth_info)
        #     for objective in cfg.objective:
        #         mvm.set_random_seed(seed_list[i])
        #         w_estimator = mvm.optimize(objective)
        #         estimate[objective].append(float(w_estimator))
        #         squared_error[objective].append(float(w_estimator - v_pi)**2)
        #     display((estimate, squared_error))

        for i in range(cfg.n_experiments):
            training_set = training_data_processed[i]
            mvm = Tabular_State_MVM_Estimator(training_set, cfg,
                                              ground_truth=ground_truth_info)
            for objective in cfg.objective:
                mvm.set_random_seed(seed_list[i])
                w_estimator = mvm.optimize(objective)
                estimate[objective].append(float(w_estimator))
                squared_error[objective].append(float(w_estimator - v_pi)**2)
            # display((estimate, squared_error))

        for objective in cfg.objective:
            results[objective].append(
                np.log2(sum(squared_error[objective]) / len(squared_error[objective]) / v_pi**2))

        display((estimate, squared_error), n_trajectories)
        print('\n')
        print('End of one set of experiments')
        # pdb.set_trace()

    df = pd.DataFrame(results)
    # plt.plot(results['trajectories'], results['IH'], marker='o', markerfacecolor='blue', markersize=12, color='blue', linewidth=4)
    # plt.plot(results['trajectories'], results['MB'], marker='o', markerfacecolor='red', markersize=12, color='red', linewidth=4)
    # plt.plot(results['trajectories'], results['STEP WIS'], marker='o', markerfacecolor='aqua', markersize=12, color='aqua', linewidth=4)
    # plt.plot(results['trajectories'], results['STEP IS'], marker='o', markerfacecolor='orange', markersize=12, color='orange', linewidth=4)
    markersize = 8
    linewidth = 4
    plt.plot('trajectories', 'STEP WIS', data=df, marker='o',
             markerfacecolor='slategrey', markersize=markersize,
             color='slategrey', linewidth=linewidth)
    plt.plot('trajectories', 'STEP IS', data=df, marker='o',
             markerfacecolor='rosybrown', markersize=markersize,
             color='rosybrown', linewidth=linewidth)
    plt.plot('trajectories', 'True mu', data=df, marker='o',
             markerfacecolor='black', markersize=markersize,
             color='black', linewidth=linewidth)
    # plt.plot('trajectories', 'MWL', data=df, marker='o', markerfacecolor='green', markersize=markersize, color='green', linewidth=linewidth)
    # plt.plot('trajectories', 'LSTDQ', data=df, marker='o', markerfacecolor='olive', markersize=markersize, color='olive', linewidth=linewidth)
    plt.plot('trajectories', 'IH', data=df, marker='o',
             markerfacecolor='purple', markersize=markersize,
             color='purple', linewidth=linewidth)
    plt.plot('trajectories', 'MB', data=df, marker='o',
             markerfacecolor='gold', markersize=markersize,
             color='gold', linewidth=linewidth)
    plt.plot('trajectories', 'TD-ball center', data=df, marker='p',
             markerfacecolor='cadetblue', markersize=markersize,
             color='cadetblue', linewidth=linewidth)
    plt.plot('trajectories', 'bias', data=df, marker='s',
             markerfacecolor='skyblue', markersize=markersize,
             color='skyblue', linewidth=linewidth)
    plt.plot('trajectories', 'bias_td', data=df, marker='s',
             markerfacecolor='darkred', markersize=markersize,
             color='darkred', linewidth=linewidth)
    plt.plot('trajectories', 'bias_td_var', data=df, marker='s',
             markerfacecolor='orange', markersize=markersize,
             color='orange', linewidth=linewidth)
    # plt.xticks(cfg.n_trajectories)
    plt.xticks(results['trajectories'])
    plt.xlabel('log number of trajectories')
    plt.ylabel('log MSE')
    plt.legend(loc='upper center', bbox_to_anchor=(0.5, 1.05), ncol=3,
               prop={'size': 8})
    plt.savefig('pi_03_mu_01_grid_misspecified_w.png')
    pdb.set_trace()