Code example #1
if __name__ == "__main__":

    '''Implement a basic version of the Q-learning algorithm and use it to solve the taxi domain. The agent should
    explore the MDP and collect data to learn the optimal policy and the optimal Q-value function. (Be mindful of
    how you handle terminal states: typically, if S_t is a terminal state, V(S_{t+1}) = 0.) Use gamma = 0.90.

    You will also see how an epsilon-greedy strategy can find the optimal policy despite learning sub-optimal
    Q-values. Since we are looking for optimal Q-values, you will have to consider your exploration strategy
    carefully. Evaluate your agent using the OpenAI Gym 0.14.0 Taxi-v2 environment. Install OpenAI Gym 0.14.0 with:
    pip install gym==0.14.0'''

    # env_HW4 = gym.make('Taxi-v2')

    # env_HW4 = gym.make('Taxi-v2').unwrapped
    env_HW4 = TaxiEnv()

    '''Taxi-v2 - Q-learning'''
    print("Taxi-v2")
    for i in range(1):
        QL_HW4 = QLearningTable(actions=list(range(env_HW4.nA)),
                                # learning_rate=0.1,
                                reward_decay=0.90,   # gamma
                                # epsilon=0.2,
                                verbose=True)

        Q_output = Q_HW4(num_episode=2500000,   # Sammy ran 10,000,000 episodes to get stable updates;
                                                # with only 2,500,000 episodes of Q-table updates the agent
                                                # sometimes reaches only ~90% of the score.
                         learning_rate=0.01)    # function that executes the Q-learner, shown above
        print(Q_output)
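
The Q_HW4 learner invoked above is not included in this excerpt. As a reference, here is a minimal, self-contained sketch of the tabular Q-learning update the docstring describes (epsilon-greedy exploration, gamma = 0.90, and zero bootstrap value at terminal states); the function name and defaults below are illustrative, not the original implementation.

import numpy as np

def q_learning_sketch(env, num_episodes=50000, alpha=0.01, gamma=0.90, epsilon=0.1):
    """Minimal tabular Q-learning against the gym 0.14.0 discrete env interface (a sketch, not Q_HW4)."""
    Q = np.zeros((env.observation_space.n, env.action_space.n))
    for _ in range(num_episodes):
        state = env.reset()
        done = False
        while not done:
            # epsilon-greedy action selection
            if np.random.rand() < epsilon:
                action = env.action_space.sample()
            else:
                action = int(np.argmax(Q[state]))
            next_state, reward, done, _ = env.step(action)
            # terminal states contribute no bootstrap value: V(S_{t+1}) = 0
            target = reward if done else reward + gamma * np.max(Q[next_state])
            Q[state, action] += alpha * (target - Q[state, action])
            state = next_state
    return Q

# Usage (assumes gym==0.14.0 as in the docstring above):
#     Q = q_learning_sketch(gym.make('Taxi-v2').unwrapped)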
Code example #2
            phi = set()
            for var in variables:
                for i, node in enumerate(cat.trajectory):
                    # the variable's only outgoing edge points to the last node of the trajectory
                    if node.outgoing[var] == {len(cat.trajectory) - 1}:
                        phi.add(i)

            for i in reversed(range(len(cat.trajectory))):
                if cat.trajectory[i].get_all_outgoing().issubset(phi):
                    phi.add(i)

            phi = sorted(list(phi))
            for var in variables:
                last_incoming = None
                for i in range(len(phi)):
                    incoming_index = cat.trajectory[phi[i]].incoming[var]
                    if incoming_index is not None and incoming_index not in phi:
                        last_incoming = i
                phi = phi[last_incoming:]

            ans.append(phi)
        return ans

    

if __name__ == "__main__":
    from taxi import TaxiEnv
    agent = HG()
    env = TaxiEnv()
    tr = agent.build_CAT(env)
    print(*tr, sep='\n')
    ct = HG.CAT_trajectory(tr[0])
Code example #3
    qt = set(q_table_lp)
    qfile = open("other.lp", "w")
    for (state, action) in qt:
        taxirow, taxicol, passidx, _ = env.decode(state)
        actionname = getActionName(action)
        #    comment = "% taxi:" + str(taxirow) + "," + str(taxicol) + ",passenger:" + str(passidx) + "\n"
        qrule = "q(({},{},{}),{},{}).\n".format(taxirow, taxicol, passidx,
                                                actionname, ro_table[state, action])
        qfile.write(qrule)
    qfile.close()
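
The getActionName helper used above is not shown in this excerpt. A plausible sketch, assuming the standard six Taxi actions (0=south, 1=north, 2=east, 3=west, 4=pickup, 5=dropoff); the names the original .lp encoding expects may differ.

def getActionName(action):
    # Assumed action-name mapping for Taxi's six discrete actions; the original helper is not shown here.
    names = {0: "south", 1: "north", 2: "east", 3: "west", 4: "pickup", 5: "dropoff"}
    return names[action]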


if __name__ == '__main__':

    env = TaxiEnv()

    nA = env.nA
    nS = env.nS

    # Parameters
    LEARNING_RATE_Q = 1
    LEARNING_RATE_R = 1
    DISCOUNT = 0.001
    EPSILON = 0.01
    BETA = 0.3

    # Q and rewards
    #    q_table = np.zeros((nS, nA))
    #    q_table_lp = {}
Code example #4
def game(N_episodes, AI_type, Intrinsic_type, clip_ratio):
    ############## Hyperparameters ##############

    env = TaxiEnv()
    #memory = Memory(max_size=300)

    #n_episodes = number_of_episodes
    #n_actions = env.action_space.n
    #intrinsic = intrinsic
    #print(n_actions)
    #n_agents = 1
    #n_episodes = number_of_episodes
    #state_size = env.observation_space.n

    #env_name = "LunarLander-v2"
    # creating environment
    state_dim = env.observation_space.n
    action_dim = env.action_space.n
    render = False
    solved_reward = 230  # stop training if avg_reward > solved_reward
    log_interval = 20  # print avg reward in the interval
    max_episodes = N_episodes  # max training episodes
    max_timesteps = 250  # max timesteps in one episode
    n_latent_var = 64  # number of variables in hidden layer
    update_timestep = 2000  # update policy every n timesteps
    lr = 0.002
    betas = (0.9, 0.999)
    gamma = 0.99  # discount factor
    K_epochs = 4  # update policy for K epochs
    eps_clip = 0.2  # clip parameter for PPO
    random_seed = None
    samp_rewards = []
    avg_rewards = []
    best_avg_reward = -np.inf
    n_agents = 1
    #############################################

    if random_seed:
        torch.manual_seed(random_seed)
        env.seed(random_seed)

    memory = Memory()
    ppo = PPO(state_dim, action_dim, n_latent_var, lr, betas, gamma, K_epochs,
              eps_clip)
    print(lr, betas)

    # logging variables
    running_reward = 0
    avg_length = 0
    timestep = 0
    avg_reward = 0
    ppo.memcount.delete()
    state_size = env.observation_space.n
    reward_rms = RunningMeanStd()
    obs_rms = RunningMeanStd()
    norm_step = 5000
    # Pre-run: take random actions to collect observations for the normalization statistics
    next_obs = []
    for _ in range(norm_step):   # avoid shadowing norm_step with the loop variable
        action_norm = np.random.randint(0, action_dim)
        state_norm, reward_norm, done_norm, _ = env.step(action_norm)
        state_norm = to_categorical(state_norm, state_size)  # optional one-hot encoding
        next_obs.append(state_norm)
    obs_rms.update(next_obs)

    # training loop
    for i_episode in range(1, max_episodes + 1):
        state = env.reset()
        state = to_categorical(state, state_size)
        done = False
        t = 0
        episode_reward = 0
        intrinsic_rewards = 0
        reward = 0
        #for t in range(max_timesteps):
        #while not done:
        while t <= max_timesteps:
            timestep += 1
            t += 1

            # Running policy_old:
            action = ppo.policy_old.act(state, memory)
            state, reward, done, _ = env.step(action)
            state = to_categorical(state, state_size)

            #========================================================
            if ((AI_type == "PPO" or AI_type == "A2C")
                    and Intrinsic_type == "1"):
                intrinsic_rewards = get_intrinsic_rewards(
                    AI_type, state, ppo, n_agents, 10)
                intrinsic_rewards = intrinsic_rewards.data.numpy()
                #print("intrinsic_rewards1",intrinsic_rewards)
            elif ((AI_type == "PPO" or AI_type == "A2C")
                  and Intrinsic_type == "2"):
                intrinsic_rewards = get_intrinsic_rewards2(
                    AI_type, state, action, ppo, n_agents, 10)
                intrinsic_rewards = intrinsic_rewards.data.numpy()
                #print("intrinsic_rewards2",intrinsic_rewards)

            elif ((AI_type == "PPO" or AI_type == "A2C")
                  and Intrinsic_type == "3"):
                intrinsic_rewards = get_intrinsic_rewards3(
                    AI_type, state, action, ppo, n_agents, reward, 1)
                intrinsic_rewards = intrinsic_rewards.data.numpy()
                #print("intrinsic_rewards3",intrinsic_rewards)
            elif ((AI_type == "PPO" or AI_type == "A2C")
                  and Intrinsic_type == "4"):
                intrinsic_rewards = get_intrinsic_rewards4(
                    AI_type, state, action, ppo, n_agents, reward, t, 1, 0.99)

            elif ((AI_type == "PPO" or AI_type == "A2C")
                  and Intrinsic_type == "5"):
                intrinsic_rewards = get_intrinsic_rewards5(
                    AI_type, state, ppo, n_agents, 1, 16)
                #print("intrinsic_rewards5",intrinsic_rewards)
            else:
                intrinsic_rewards = 0
            reward_sum = reward  #+ intrinsic_rewards
            #===========================================================
            memory.rewards.append(reward_sum)
            #temp_int = memory.intrinsic_rewards.data.numpy()
            #temp_int = memory.intrinsic_rewards
            #print(temp_int)
            memory.intrinsic_rewards.append(intrinsic_rewards)
            memory.is_terminals.append(done)
            """
            try:
                mean1, std1, count1 = np.mean(temp_int), np.std(temp_int), len(temp_int)
                reward_rms.update_from_moments(mean1, std1 ** 2, count1)
                adv_int = (memory.intrinsic_rewards-reward_rms.mean)/np.sqrt(reward_rms.var)
            except:
                adv_int = 0
                """

            # update if its time
            if timestep % update_timestep == 0:
                temp_int = memory.intrinsic_rewards
                mean1, std1, count1 = np.mean(temp_int), np.std(temp_int), len(
                    temp_int)
                reward_rms.update_from_moments(mean1, std1**2, count1)
                adv_int = (temp_int) / np.sqrt(reward_rms.var)
                ppo.update(memory, adv_int, clip_ratio)
                memory.clear_memory()
                timestep = 0

            running_reward += reward
            episode_reward += reward
            if render:
                env.render()
            #if done:
            #break

        avg_length += t

        # report when the running reward clears the solved threshold
        # (early stopping and checkpoint saving are disabled below)
        if running_reward > (log_interval * solved_reward):
            print("########## Solved! ##########")
            #torch.save(ppo.policy.state_dict(), './PPO_{}.pth'.format(env_name))
            #break

        # logging
        if i_episode % log_interval == 0:
            avg_length = int(avg_length / log_interval)
            running_reward = int((running_reward / log_interval))

            print('Episode {} \t avg length: {} \t reward: {}'.format(
                i_episode, avg_length, running_reward))
            running_reward = 0
            avg_length = 0

        samp_rewards.append(episode_reward)
        if (i_episode >= 100):
            # get average reward from last 100 episodes
            avg_reward = np.mean(samp_rewards[-100:])
            # append to deque
            avg_rewards.append(avg_reward)
            # update best average reward
            if avg_reward > best_avg_reward:
                best_avg_reward = avg_reward

        print("Total reward in episode {} = {}".format(i_episode,
                                                       episode_reward))
        print("Best_avg_reward =", np.round(best_avg_reward, 3),
              "Average_rewards =", np.round(avg_reward, 3))
    #env.save_replay()
    env.close()

    return avg_rewards, best_avg_reward, samp_rewards, "0"
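
The to_categorical helper called throughout game() is not defined in this excerpt; from its call sites it one-hot encodes the discrete Taxi state index before it is fed to the PPO policy. A minimal sketch under that assumption (keras.utils.to_categorical provides an equivalent):

import numpy as np

def to_categorical(state, num_classes):
    # One-hot encode a discrete state index (assumed behavior, inferred from the call sites above).
    one_hot = np.zeros(num_classes, dtype=np.float32)
    one_hot[state] = 1.0
    return one_hot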
Code example #5
def main(cfg):
    initial_seed = cfg.initial_seed
    random.seed(initial_seed)
    np.random.seed(initial_seed)
    gamma = cfg.gamma
    n_trajectories = cfg.n_trajectories
    horizon = cfg.horizon
    horizon_normalization = (1 - gamma**horizon) / (1 - gamma)
    processor = lambda x: x
    seed_list = [
        initial_seed + np.random.randint(0, 10000) * i
        for i in range(cfg.n_experiments)
    ]  # generate a list of random seeds
    if cfg.env == 'grid_world':
        from gridworld import GridWorldEnv
        env = GridWorldEnv()
    elif cfg.env == 'taxi':
        from taxi import TaxiEnv
        env = TaxiEnv()

    n_states = env.nS
    n_actions = env.nA
    P = env.P_matrix
    R = env.R_matrix.copy()
    d0 = env.isd

    q_star_original = env.value_iteration()
    # pi_prob = gymEnv.extract_policy(q_star_original, temperature=0.05)
    # mu_prob = gymEnv.extract_policy(q_star_original, temperature=1)
    pi = env.extract_policy(q_star_original, temperature=0.1)
    # mu = env.extract_policy(q_star_original, temperature=0.3)
    # pi = env.extract_policy(q_star_original, temperature=0.15)
    # mu = env.extract_policy(q_star_original, temperature=0.3)
    # mu = pi.copy(); mu[:,1] = pi[:,2].copy(); mu[:,2] = pi[:,1].copy()
    mu = pi.copy()
    mu[:, 0] = pi[:, 1].copy()
    mu[:, 1] = pi[:, 2].copy()
    mu[:, 2] = pi[:, 3].copy()
    mu[:, 3] = pi[:, 0].copy()

    dpi, dpi_t, v_pi_s, P_pi = exact_calculation(env, pi, cfg.horizon,
                                                 cfg.gamma)
    dmu, dmu_t, vmu_s, P_mu = exact_calculation(env, mu, cfg.horizon,
                                                cfg.gamma)
    #! sanity check the loss objective
    #* verify the claim that L(w*, f) = 0 for all f, where (matching the code below)
    #* L(w, f) = E_{(s,a,s')~d_mu}[ w(s) * (gamma*rho(s,a)*f(s') - f(s)) ]
    #*           + (1/h) * E_{s~d0}[f(s)] - (1/h) * gamma^horizon * E_{s~d_pi,H}[f(s)],  with h = (1 - gamma^horizon) / (1 - gamma)
    # determine w_star
    w_star = np.nan_to_num(dpi / dmu)
    v_pi = np.sum(d0 * v_pi_s)
    v_mu = np.sum(d0 * vmu_s)

    dpi_H = np.dot(P_pi.T, dpi_t[:, horizon - 1])
    dmu_H = np.dot(P_mu.T, dmu_t[:, horizon - 1])
    if RUN_SANITY_CHECK:

        def L(w, f):
            loss = 0
            for s in range(n_states):
                for a in range(n_actions):
                    for sn in range(n_states):
                        loss += w[s] * (-f[s] + gamma * pi[s, a] / mu[s, a] *
                                        f[sn]) * dmu[s] * mu[s, a] * P[s, a,
                                                                       sn]

            loss += 1 / horizon_normalization * np.sum(d0 * f)
            loss -= 1 / horizon_normalization * gamma**horizon * np.sum(
                dpi_H * f)
            return loss

        f = np.random.rand(n_states)
        loss = L(w_star, f)
        assert abs(loss) < 1e-8

        #! sanity check bellman and td error
        R_pi = np.sum(R * pi, axis=-1)
        bellman_original = v_pi_s - R_pi - gamma * np.dot(P_pi, v_pi_s)
        bellman_new = v_pi_s - np.dot(
            (np.identity(n_states) - np.linalg.matrix_power(
                gamma * P_pi, horizon)), R_pi) - gamma * np.dot(P_pi, v_pi_s)
        pdb.set_trace()

    ground_truth_info = AttrDict({})
    ground_truth_info.update({
        'd_pi': torch.tensor(dpi, dtype=dtype),
        'd_mu': torch.tensor(dmu, dtype=dtype),
        'v_pi': torch.tensor(v_pi_s, dtype=dtype),
        'v_star': v_pi
    })
    ground_truth_info.update({'w_pi': w_star})
    ground_truth_info.update({'P': torch.tensor(env.P_matrix, dtype=dtype)})
    ground_truth_info.update({
        'pi': torch.tensor(pi, dtype=dtype),
        'mu': torch.tensor(mu, dtype=dtype)
    })
    true_rho = torch.tensor(pi / mu, dtype=dtype)
    true_rho[true_rho != true_rho] = 0  # zero out NaN entries (0/0)
    true_rho[torch.isinf(true_rho)] = 0  # zero out inf entries (division by zero)
    ground_truth_info.update({'rho': true_rho})
    ground_truth_info.update({'d0': torch.tensor(env.isd, dtype=dtype)})
    ground_truth_info.update({'R': torch.tensor(env.R_matrix, dtype=dtype)})
    ground_truth_info.update({'d_pi_H': torch.tensor(dpi_H, dtype=dtype)})
    ground_truth_info.update({'d_mu_H': torch.tensor(dmu_H, dtype=dtype)})
    ground_truth_info.update({
        'd_pi_t': torch.tensor(dpi_t, dtype=dtype),
        'd_mu_t': torch.tensor(dmu_t, dtype=dtype)
    })

    estimate = {}
    squared_error = {}
    estimate.update({'True pi': [float(v_pi)]})
    squared_error.update({'True pi': [0]})
    estimate.update({'True mu': [float(v_mu)]})
    squared_error.update({'True mu': [float(v_mu - v_pi)**2]})

    #* Generate multiple sets of behavior data from mu
    training_data = []
    training_data_processed = []
    for _ in range(cfg.n_experiments):
        print('Experiment:', _)
        print('------------------------')
        np.random.seed(seed_list[_])
        env.seed(seed_list[_])
        # behavior_data = rollout(env, mu, processor, absorbing_state, pi_e = pi, N=n_trajectories, T=horizon, frameskip=1, frameheight=1, path=None, filename='tmp',)
        behavior_data, _, _ = roll_out(env, mu, n_trajectories, horizon)
        behavior_data_processed = prepare_behavior_data(behavior_data)
        training_data.append(behavior_data)
        training_data_processed.append(behavior_data_processed)
        # pdb.set_trace()
    estimate['IS'], estimate['STEP IS'], estimate['WIS'], estimate[
        'STEP WIS'], estimate['Mu hat'] = [], [], [], [], []
    squared_error['IS'] = []
    squared_error['STEP IS'] = []
    squared_error['WIS'] = []
    squared_error['STEP WIS'] = []
    squared_error['Mu hat'] = []
    estimate['IH_SN'] = []
    squared_error['IH_SN'] = []
    estimate['IH_no_SN'] = []
    squared_error['IH_no_SN'] = []
    estimate['MB'] = []
    squared_error['MB'] = []

    ###* Looping over the number of baseline experiments
    for _ in range(cfg.n_experiments):
        behavior_data = training_data[_]
        behavior_data_processed = training_data_processed[_]

        IS = importance_sampling_estimator(behavior_data, mu, pi, gamma)
        step_IS = importance_sampling_estimator_stepwise(
            behavior_data, mu, pi, gamma)
        WIS = weighted_importance_sampling_estimator(behavior_data, mu, pi,
                                                     gamma)
        step_WIS = weighted_importance_sampling_estimator_stepwise(
            behavior_data, mu, pi, gamma)
        estimate['IS'].append(float(IS))
        squared_error['IS'].append(float((IS - v_pi)**2))
        estimate['STEP IS'].append(float(step_IS))
        squared_error['STEP IS'].append(float((step_IS - v_pi)**2))
        estimate['WIS'].append(float(WIS))
        squared_error['WIS'].append(float((WIS - v_pi)**2))
        estimate['STEP WIS'].append(float(step_WIS))
        squared_error['STEP WIS'].append(float((step_WIS - v_pi)**2))
        MB = model_based(n_states, n_actions, behavior_data, pi, gamma)
        estimate['MB'].append(float(MB))
        squared_error['MB'].append(float((MB - v_pi)**2))
        IH, IH_unnormalized = lihong_infinite_horizon(n_states, behavior_data,
                                                      mu, pi, gamma)
        estimate['IH_SN'].append(float(IH))
        squared_error['IH_SN'].append(float((IH - v_pi)**2))
        estimate['IH_no_SN'].append(float(IH_unnormalized))
        squared_error['IH_no_SN'].append(float((IH_unnormalized - v_pi)**2))

    display((estimate, squared_error))
    print('exp seed:', cfg.initial_seed)

    # pdb.set_trace()
    if RUN_SANITY_CHECK:
        #! Let's run some additional sanity check
        #* check to see if bias formula checks out
        v_w = 0
        normalization = 0
        for trajectory in behavior_data:
            discounted_t = 1
            for s, a, sn, r in trajectory:
                v_w += w_star[s] * pi[s, a] / mu[s, a] * r * discounted_t
                normalization += discounted_t
                discounted_t *= gamma
        v_w = v_w / normalization

        on_policy_data, frequency, avg_reward = roll_out(
            env, pi, 4096, horizon)
        # pdb.set_trace()
        empirical_v_pi = np.zeros(n_states)
        empirical_d_pi = np.zeros(n_states)
        empirical_d0 = np.zeros(n_states)
        empirical_r_pi = np.zeros(n_states)
        empirical_frequency = np.zeros(n_states)
        empirical_P = np.zeros((n_states, n_actions, n_states))
        horizon_normalization = (1 - gamma**horizon) / (1 - gamma)
        num_traj = len(on_policy_data)
        for trajectory in on_policy_data:
            discounted_t = 1
            for s, a, sn, r in trajectory:
                empirical_v_pi[s] += r * discounted_t
                empirical_d_pi[s] += discounted_t
                # empirical_d0[s] += 1-discounted_t
                discounted_t *= gamma
                empirical_r_pi[s] += r
                empirical_frequency[s] += 1
                empirical_P[s, a, sn] += 1
        empirical_v_pi = empirical_v_pi / num_traj
        empirical_d_pi = empirical_d_pi / horizon_normalization / num_traj
        empirical_P = np.nan_to_num(empirical_P /
                                    np.sum(empirical_P, axis=-1)[:, :, None])
        # T = np.nan_to_num(T/np.sum(T, axis = -1)[:,:,None])
        empirical_r_pi = np.nan_to_num(empirical_r_pi / empirical_frequency)
        empirical_P_pi = np.einsum('san,sa->sn', empirical_P, pi)

        empirical_d_mu = np.zeros(n_states)
        num_traj = len(behavior_data)
        for trajectory in behavior_data:
            discounted_t = 1
            for s, a, sn, r in trajectory:
                empirical_d_mu[s] += discounted_t
                discounted_t *= gamma
        empirical_d_mu = empirical_d_mu / horizon_normalization / num_traj

        empirical_w = np.nan_to_num(empirical_d_pi / empirical_d_mu)
        empirical_loss = L(empirical_w, empirical_v_pi)

        empirical_bellman_original = 0
        empirical_bellman_new = 0
        empirical_td_error = 0
        num_traj = len(on_policy_data)
        empirical_r_pi_adjusted = np.dot(
            (np.identity(n_states) -
             np.linalg.matrix_power(gamma * empirical_P_pi, horizon)),
            empirical_r_pi)
        for trajectory in on_policy_data:
            discounted_t = 1.0
            for s, a, sn, r in trajectory:
                empirical_bellman_original += discounted_t * (
                    v_pi_s[s] - empirical_r_pi[s] -
                    gamma * np.dot(empirical_P_pi[s, :], v_pi_s))**2
                empirical_bellman_new += discounted_t * (
                    v_pi_s[s] - empirical_r_pi_adjusted[s] -
                    gamma * np.dot(empirical_P_pi[s, :], v_pi_s))**2
                empirical_td_error += discounted_t * (v_pi_s[s] - r -
                                                      gamma * v_pi_s[sn])**2
                discounted_t *= gamma
        empirical_td_error = empirical_td_error / horizon_normalization / num_traj
        empirical_bellman_original = empirical_bellman_original / horizon_normalization / num_traj
        empirical_bellman_new = empirical_bellman_new / horizon_normalization / num_traj
        # empirical_bellman_original = empirical_v_pi - empirical_r_pi - gamma*np.dot(empirical_P_pi, empirical_v_pi)

        # bellman_original = v_pi_s - R_pi - gamma * np.dot(P_pi, v_pi_s)
        # bellman_new = v_pi_s - np.dot((np.identity(n_states) - np.linalg.matrix_power(gamma*P_pi, horizon)),R_pi) - gamma*np.dot(P_pi, v_pi_s)
        pdb.set_trace()

    for objective in cfg.objective:
        estimate[objective] = []
        squared_error[objective] = []
        objective_sn = objective + '-SN'
        estimate[objective_sn] = []
        squared_error[objective_sn] = []

    for i in range(cfg.n_experiments):
        training_set = training_data_processed[i]
        fixed_terminal_value = True
        logging = cfg.logging
        mvm = Tabular_State_MVM_Estimator(training_set,
                                          cfg,
                                          logging=logging,
                                          ground_truth=ground_truth_info)
        penalty = cfg.penalty_input

        horizon_normalization = (1 - gamma**horizon) / (1 - gamma)
        # penalty_base = 1/mdp_calculator.horizon_normalization#/cfg.n_trajectories
        penalty_base = 1 / horizon_normalization
        mvm.set_random_seed(
            seed_list[i])  #different random seed per experiment
        mvm.solve_closed_form_bias()
        mvm.generate_random_v_class(cfg.v_class_cardinality)
        mvm.generate_random_w_class(cfg.v_class_cardinality)
        # mvm.bias_check()
        for objective in cfg.objective:
            mvm.set_random_seed(seed_list[i])
            # w_estimator = mvm.optimize_finite_class(objective = objective, td_penalty=penalty*penalty_base)
            # w_estimator = mvm.optimize_discrete(objective = objective, td_penalty=penalty*penalty_base)
            w_estimator = mvm.optimize(objective, td_penalty=0.1)
            # w_estimator, w_estimator_sn = mvm.optimize_optimistic()
            # w_estimator, w_estimator_sn = mvm.optimize_optimistic_adam(objective = objective, td_penalty=penalty*penalty_base)
            # w_estimator = mvm.optimize_closed_form()
            estimate[objective].append(float(w_estimator))
            # objective_sn = objective + '-SN'
            # estimate[objective_sn].append(float(w_estimator_sn))
            squared_error[objective].append(float(w_estimator - v_pi)**2)
            # squared_error[objective_sn].append(float(w_estimator_sn-v_pi)**2)
        display((estimate, squared_error))

    display((estimate, squared_error))
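
The importance sampling baselines compared above (importance_sampling_estimator and its stepwise/weighted variants) are imported from elsewhere. For orientation, here is a sketch of the plain trajectory-wise IS estimate over the (s, a, s', r) trajectories produced by roll_out; the exact normalization used in the original may differ.

import numpy as np

def importance_sampling_sketch(behavior_data, mu, pi, gamma):
    # Trajectory-wise (non-stepwise) importance sampling: weight each trajectory's
    # discounted return by the cumulative ratio pi(a|s)/mu(a|s) along the trajectory.
    estimates = []
    for trajectory in behavior_data:
        rho = 1.0
        ret = 0.0
        discount = 1.0
        for s, a, sn, r in trajectory:
            rho *= pi[s, a] / mu[s, a]
            ret += discount * r
            discount *= gamma
        estimates.append(rho * ret)
    return float(np.mean(estimates))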
Code example #6
def main(cfg):
    initial_seed = cfg.initial_seed
    random.seed(initial_seed)
    np.random.seed(initial_seed)
    gamma = cfg.gamma
    # n_trajectories_list = cfg.n_trajectories
    # for n_trajectories in n_trajectories_list:
    # n_trajectories = cfg.n_trajectories
    horizon = cfg.horizon
    horizon_normalization = (1 - gamma**horizon) / (
        1 - gamma) if gamma < 1 else horizon
    seed_list = [
        initial_seed + np.random.randint(0, 10000) * i
        for i in range(cfg.n_experiments)
    ]  # generate a list of random seeds
    if cfg.env == 'grid_world':
        from gridworld import GridWorldEnv
        env = GridWorldEnv()
    elif cfg.env == 'taxi':
        from taxi import TaxiEnv
        env = TaxiEnv()

    n_states = env.nS
    n_actions = env.nA
    P = env.P_matrix
    R = env.R_matrix.copy()
    d0 = env.isd
    q_star_original = env.value_iteration()
    pi = env.extract_policy(q_star_original, temperature=0.3)
    mu = env.extract_policy(q_star_original, temperature=0.1)
    # pi = env.extract_policy(q_star_original, temperature=0.1)
    # mu = env.extract_policy(q_star_original, temperature=0.3)
    # pi = env.extract_policy(q_star_original, temperature=0.3)
    # mu = env.extract_policy(q_star_original, temperature=0.15)
    # mu = pi.copy(); mu[:,1] = pi[:,2].copy(); mu[:,2] = pi[:,1].copy()
    #* 4 swapped cyclic
    # mu = pi.copy(); mu[:,0] = pi[:,1].copy(); mu[:,1] = pi[:,2].copy(); mu[:,2] = pi[:,3].copy(); mu[:,3] = pi[:,0].copy()
    #* D swapped with R, L swapped with U
    # mu = pi.copy(); mu[:,0] = pi[:,3].copy(); mu[:,1] = pi[:,2].copy(); mu[:,2] = pi[:,1].copy(); mu[:,3] = pi[:,0].copy()
    # mu = pi.copy(); mu[:,0] = pi[:,1].copy(); mu[:,1] = pi[:,2].copy(); mu[:,2] = pi[:,3].copy(); mu[:,3] = pi[:,4].copy();mu[:,4] = pi[:,5].copy();mu[:,5] = pi[:,0].copy()

    dpi, dpi_t, v_pi_s, q_pi_sa, P_pi = exact_calculation(
        env, pi, cfg.horizon, cfg.gamma)
    dmu, dmu_t, vmu_s, qmu_sa, P_mu = exact_calculation(
        env, mu, cfg.horizon, cfg.gamma)
    w_star = np.nan_to_num(dpi / dmu)
    v_pi = np.sum(d0 * v_pi_s)
    v_mu = np.sum(d0 * vmu_s)

    dpi_H = np.dot(P_pi.T, dpi_t[:, horizon - 1])
    dmu_H = np.dot(P_mu.T, dmu_t[:, horizon - 1])

    ground_truth_info = AttrDict({})
    ground_truth_info.update({
        'd_pi': torch.tensor(dpi, dtype=dtype),
        'd_mu': torch.tensor(dmu, dtype=dtype),
        'v_pi': torch.tensor(v_pi_s, dtype=dtype),
        'q_pi': torch.tensor(q_pi_sa, dtype=dtype),
        'v_star': v_pi
    })
    ground_truth_info.update({'w_pi': w_star})
    ground_truth_info.update({'P': torch.tensor(env.P_matrix, dtype=dtype)})
    ground_truth_info.update({
        'pi': torch.tensor(pi, dtype=dtype),
        'mu': torch.tensor(mu, dtype=dtype)
    })
    true_rho = torch.tensor(pi / mu, dtype=dtype)
    true_rho[true_rho != true_rho] = 0  # zero out NaN entries (0/0)
    true_rho[torch.isinf(true_rho)] = 0  # zero out inf entries (division by zero)
    ground_truth_info.update({'rho': true_rho})
    ground_truth_info.update({'d0': torch.tensor(env.isd, dtype=dtype)})
    ground_truth_info.update({'R': torch.tensor(env.R_matrix, dtype=dtype)})
    ground_truth_info.update({'d_pi_H': torch.tensor(dpi_H, dtype=dtype)})
    ground_truth_info.update({'d_mu_H': torch.tensor(dmu_H, dtype=dtype)})
    ground_truth_info.update({
        'd_pi_t': torch.tensor(dpi_t, dtype=dtype),
        'd_mu_t': torch.tensor(dmu_t, dtype=dtype)
    })

    estimate = {}
    squared_error = {}
    estimate.update({'True pi': [float(v_pi)]})
    squared_error.update({'True pi': [0]})
    estimate.update({'True mu': [float(v_mu)]})
    squared_error.update({'True mu': [float(v_mu - v_pi)**2]})

    results = {}
    results['trajectories'] = []
    results['IS'] = []
    results['IH'] = []
    results['MB'] = []
    results['WIS'] = []
    results['STEP WIS'] = []
    results['STEP IS'] = []
    results['True mu'] = []
    for objective in cfg.objective:
        results[objective] = []

    n_trajectories_list = cfg.n_trajectories
    for n_trajectories in n_trajectories_list:
        print('------------------------')
        #* Generate multiple sets of behavior data from mu
        training_data = []
        training_data_processed = []
        for _ in range(cfg.n_experiments):
            # print('Experiment:',_)
            # print('------------------------')
            np.random.seed(seed_list[_])
            env.seed(seed_list[_])
            behavior_data, _, _ = roll_out(env, mu, n_trajectories, horizon)
            behavior_data_processed = prepare_behavior_data(behavior_data)
            training_data.append(behavior_data)
            training_data_processed.append(behavior_data_processed)
        estimate['IS'], estimate['STEP IS'], estimate['WIS'], estimate[
            'STEP WIS'], estimate['Mu hat'] = [], [], [], [], []
        squared_error['IS'] = []
        squared_error['STEP IS'] = []
        squared_error['WIS'] = []
        squared_error['STEP WIS'] = []
        squared_error['Mu hat'] = []
        estimate['IH_SN'] = []
        squared_error['IH_SN'] = []
        estimate['IH_no_SN'] = []
        squared_error['IH_no_SN'] = []
        estimate['MB'] = []
        squared_error['MB'] = []
        ###* Looping over the number of baseline experiments
        for _ in range(cfg.n_experiments):
            behavior_data = training_data[_]
            behavior_data_processed = training_data_processed[_]

            IS = importance_sampling_estimator(behavior_data, mu, pi, gamma)
            step_IS = importance_sampling_estimator_stepwise(
                behavior_data, mu, pi, gamma)
            WIS = weighted_importance_sampling_estimator(
                behavior_data, mu, pi, gamma)
            step_WIS = weighted_importance_sampling_estimator_stepwise(
                behavior_data, mu, pi, gamma)
            estimate['IS'].append(float(IS))
            squared_error['IS'].append(float((IS - v_pi)**2))
            estimate['STEP IS'].append(float(step_IS))
            squared_error['STEP IS'].append(float((step_IS - v_pi)**2))
            estimate['WIS'].append(float(WIS))
            squared_error['WIS'].append(float((WIS - v_pi)**2))
            estimate['STEP WIS'].append(float(step_WIS))
            squared_error['STEP WIS'].append(float((step_WIS - v_pi)**2))
            MB = model_based(n_states, n_actions, behavior_data, pi, gamma)
            estimate['MB'].append(float(MB))
            squared_error['MB'].append(float((MB - v_pi)**2))
            IH, IH_unnormalized = lihong_infinite_horizon(
                n_states, behavior_data, mu, pi, gamma)
            estimate['IH_SN'].append(float(IH))
            squared_error['IH_SN'].append(float((IH - v_pi)**2))
            estimate['IH_no_SN'].append(float(IH_unnormalized))
            squared_error['IH_no_SN'].append(float(
                (IH_unnormalized - v_pi)**2))

        # display((estimate, squared_error))
        # print('exp seed:', cfg.initial_seed)
        # pdb.set_trace()
        results['trajectories'].append(np.log2(n_trajectories))
        results['IH'].append(
            np.log2(
                sum(squared_error['IH_SN']) / len(squared_error['IH_SN']) /
                v_pi**2))
        results['MB'].append(
            np.log2(
                sum(squared_error['MB']) / len(squared_error['MB']) /
                v_pi**2))
        results['IS'].append(
            np.log2(
                sum(squared_error['IS']) / len(squared_error['IS']) / v_pi**2))
        results['WIS'].append(
            np.log2(
                sum(squared_error['WIS']) / len(squared_error['WIS']) /
                v_pi**2))
        results['STEP WIS'].append(
            np.log2(
                sum(squared_error['STEP WIS']) /
                len(squared_error['STEP WIS']) / v_pi**2))
        results['STEP IS'].append(
            np.log2(
                sum(squared_error['STEP IS']) / len(squared_error['STEP IS']) /
                v_pi**2))
        results['True mu'].append(
            np.log2(
                sum(squared_error['True mu']) / len(squared_error['True mu']) /
                v_pi**2))

        for objective in cfg.objective:
            estimate[objective] = []
            squared_error[objective] = []

        # for i in range(cfg.n_experiments):
        #     training_set = training_data_processed[i]
        #     mvm = Tabular_State_MVM_Estimator(training_set, cfg, ground_truth = ground_truth_info)
        #     for objective in cfg.objective:
        #         mvm.set_random_seed(seed_list[i])
        #         w_estimator = mvm.optimize(objective)
        #         estimate[objective].append(float(w_estimator))
        #         squared_error[objective].append(float(w_estimator-v_pi)**2)
        #     display((estimate, squared_error))

        for i in range(cfg.n_experiments):
            training_set = training_data_processed[i]
            mvm = Tabular_State_MVM_Estimator(training_set,
                                              cfg,
                                              ground_truth=ground_truth_info)
            for objective in cfg.objective:
                mvm.set_random_seed(seed_list[i])
                w_estimator = mvm.optimize(objective)
                estimate[objective].append(float(w_estimator))
                squared_error[objective].append(float(w_estimator - v_pi)**2)
        # display((estimate, squared_error))
        for objective in cfg.objective:
            results[objective].append(
                np.log2(
                    sum(squared_error[objective]) /
                    len(squared_error[objective]) / v_pi**2))
        display((estimate, squared_error), n_trajectories)
        print('\n')
        print('End of one set of experiments')

    # pdb.set_trace()
    df = pd.DataFrame(results)
    # plt.plot(results['trajectories'], results['IH'],marker='o', markerfacecolor='blue', markersize=12, color='blue', linewidth=4)
    # plt.plot(results['trajectories'], results['MB'],marker='o', markerfacecolor='red', markersize=12, color='red', linewidth=4)
    # plt.plot(results['trajectories'], results['STEP WIS'],marker='o', markerfacecolor='aqua', markersize=12, color='aqua', linewidth=4)
    # plt.plot(results['trajectories'], results['STEP IS'],marker='o', markerfacecolor='orange', markersize=12, color='orange', linewidth=4)
    markersize = 8
    linewidth = 4
    plt.plot('trajectories',
             'STEP WIS',
             data=df,
             marker='o',
             markerfacecolor='slategrey',
             markersize=markersize,
             color='slategrey',
             linewidth=linewidth)
    plt.plot('trajectories',
             'STEP IS',
             data=df,
             marker='o',
             markerfacecolor='rosybrown',
             markersize=markersize,
             color='rosybrown',
             linewidth=linewidth)
    plt.plot('trajectories',
             'True mu',
             data=df,
             marker='o',
             markerfacecolor='black',
             markersize=markersize,
             color='black',
             linewidth=linewidth)
    # plt.plot('trajectories', 'MWL', data=df, marker='o', markerfacecolor='green', markersize=markersize, color='green', linewidth=linewidth)
    # plt.plot('trajectories', 'LSTDQ', data=df, marker='o', markerfacecolor='olive', markersize=markersize, color='olive', linewidth=linewidth)
    plt.plot('trajectories',
             'IH',
             data=df,
             marker='o',
             markerfacecolor='purple',
             markersize=markersize,
             color='purple',
             linewidth=linewidth)
    plt.plot('trajectories',
             'MB',
             data=df,
             marker='o',
             markerfacecolor='gold',
             markersize=markersize,
             color='gold',
             linewidth=linewidth)
    plt.plot('trajectories',
             'TD-ball center',
             data=df,
             marker='p',
             markerfacecolor='cadetblue',
             markersize=markersize,
             color='cadetblue',
             linewidth=linewidth)
    plt.plot('trajectories',
             'bias',
             data=df,
             marker='s',
             markerfacecolor='skyblue',
             markersize=markersize,
             color='skyblue',
             linewidth=linewidth)
    plt.plot('trajectories',
             'bias_td',
             data=df,
             marker='s',
             markerfacecolor='darkred',
             markersize=markersize,
             color='darkred',
             linewidth=linewidth)
    plt.plot('trajectories',
             'bias_td_var',
             data=df,
             marker='s',
             markerfacecolor='orange',
             markersize=markersize,
             color='orange',
             linewidth=linewidth)
    # plt.xticks(cfg.n_trajectories)
    plt.xticks(results['trajectories'])
    plt.xlabel('log number of trajectories')
    plt.ylabel('log MSE')
    plt.legend(loc='upper center',
               bbox_to_anchor=(0.5, 1.05),
               ncol=3,
               prop={'size': 8})
    plt.savefig('pi_03_mu_01_grid_misspecified_w.png')
    pdb.set_trace()
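
The roll_out function used to generate behavior data in both main() functions above is also not included. From its call sites it returns a list of trajectories of (state, action, next_state, reward) tuples together with state-visit counts and an average reward; a minimal sketch under those assumptions (early-termination handling in the original may differ):

import numpy as np

def roll_out_sketch(env, policy, n_trajectories, horizon):
    # Generate behavior data in the format consumed above: each trajectory is a list
    # of (s, a, s_next, r) tuples with at most `horizon` steps. `policy` is a tabular
    # distribution of shape (nS, nA).
    trajectories = []
    frequency = np.zeros(env.nS)
    total_reward = 0.0
    for _ in range(n_trajectories):
        s = env.reset()
        trajectory = []
        for _ in range(horizon):
            a = np.random.choice(env.nA, p=policy[s])
            s_next, r, done, _ = env.step(a)
            trajectory.append((s, a, s_next, r))
            frequency[s] += 1
            total_reward += r
            s = s_next
            if done:
                break
        trajectories.append(trajectory)
    avg_reward = total_reward / n_trajectories
    return trajectories, frequency, avg_reward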