Example #1
def test_scot(test, policy_opt, values_opt, seed, horizon, traj_limit):
    trajs = scot(test.env, test.env.weights, H=horizon, seed=seed, verbose=False)
    lens = [len(t) for t in trajs]
    print("traj_limit", traj_limit)
    if len(trajs) > traj_limit:
        trajs = sample(trajs, traj_limit)
    # student's reward function inferred from the SCOT demonstration trajectories
    r_weights = max_likelihood_irl(trajs, test.env, step_size=0.2, eps=1.0e-03, max_steps=1000, verbose=False)

    values_MLIRL, policy_MLIRL = value_iteration(mdp=test.env,
                                                 r_weights=r_weights)  # student's policy and values under the student's inferred reward function
    values_MLIRL, _ = value_iteration(mdp=test.env,
                                      policy=policy_MLIRL)  # value of the student's policy under the teacher's (true) reward function

    policy_similarity = np.sum(policy_MLIRL == policy_opt) / policy_MLIRL.shape[0]

    print("Policy similarity for SCOT: {}".format(policy_similarity))

    # FOR NOW, THE START STATE DISTRIBUTION START_DIST DOES NOT HAVE AN ELEMENT FOR THE STATE nS
    total_value_opt = np.dot(test.env.start_dist, values_opt)
    #print("Optimal expected value: {}".format(total_value_opt))

    total_value_MLIRL = np.dot(test.env.start_dist, values_MLIRL)
    print("Max Likelihood IRL expected value: {}".format(total_value_MLIRL))

    value_gain_MLIRL = total_value_MLIRL / total_value_opt
    print("Value gain of Max Likelihood IRL: {}".format(value_gain_MLIRL))
    print("num trajs:", len(trajs), "total steps:", sum(lens))
    return policy_similarity, total_value_MLIRL, value_gain_MLIRL
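As a reference for the two metrics reported above, here is a minimal, self-contained sketch (with made-up toy arrays, not the project's MDP) of how they are computed: policy similarity is the fraction of states where the learned and optimal policies pick the same action, and value gain is the ratio of the expected start-state value under the learned policy to the optimal expected value.

import numpy as np

# Toy illustration of the metrics above (hypothetical numbers, not from the MDP).
policy_opt = np.array([0, 1, 1, 2, 0])       # optimal action per state
policy_learned = np.array([0, 1, 2, 2, 0])   # learned action per state
start_dist = np.full(5, 0.2)                 # uniform start-state distribution
values_opt = np.array([1.0, 0.8, 0.6, 0.9, 1.2])
values_learned = np.array([0.9, 0.8, 0.5, 0.9, 1.1])

policy_similarity = np.sum(policy_learned == policy_opt) / policy_learned.shape[0]   # 0.8
value_gain = np.dot(start_dist, values_learned) / np.dot(start_dist, values_opt)     # ~0.93
print(policy_similarity, value_gain)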
Example #2
def test_QLearning(test, policy_opt, values_opt, horizon, traj_limit):
    value_function_est, _, Q_policy, num_trajs = q_learning(
        test.wrapper, **{'n_samp': traj_limit * horizon, 'step_size': 0.1,
                         'epsilon': 0.1, 'horizon': horizon, 'traj_limit': traj_limit})

    print("num_trajs", num_trajs)
    values_QL, _ = value_iteration(mdp=test.env,
                                   policy=Q_policy)  # value of the student's Q-learning policy under the teacher's (true) reward function

    policy_similarity = np.sum(Q_policy == policy_opt) / Q_policy.shape[0]
    print("Policy similarity for Q Learning: {}".format(policy_similarity))


    # FOR NOW, THE START STATE DISTRIBUTION START_DIST DOES NOT HAVE AN ELEMENT FOR THE STATE nS
    total_value_opt = np.dot(test.env.start_dist, values_opt)
    #print("Optimal expected value: {}".format(total_value_opt))

    total_value_QL = np.dot(test.env.start_dist, values_QL)
    print("True QL expected value: {}".format(total_value_QL))

    value_gain_QL = total_value_QL / total_value_opt
    print("Value gain of true QL: {}".format(value_gain_QL))

    total_value_est_QL = np.dot(test.env.start_dist, value_function_est)
    print("Estimated QL expected value: {}".format(total_value_est_QL))

    value_gain_est_QL = total_value_est_QL / total_value_opt
    print("Value gain of Est QL: {}".format(value_gain_est_QL))

    return policy_similarity, total_value_QL, total_value_est_QL, value_gain_QL, value_gain_est_QL
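q_learning is called above with a step size and an exploration rate epsilon, but its body is not shown in this snippet. For context, here is a generic sketch of the two pieces at the core of tabular Q-learning, epsilon-greedy action selection and the one-step bootstrapped update; this is not the repository's implementation.

import numpy as np

def epsilon_greedy(Q, s, epsilon=0.1):
    # Explore uniformly with probability epsilon, otherwise act greedily on the current Q estimate.
    if np.random.rand() < epsilon:
        return np.random.randint(Q.shape[1])
    return int(np.argmax(Q[s]))

def q_update(Q, s, a, r, s_next, step_size=0.1, gamma=0.95):
    # Move Q[s, a] toward the bootstrapped target r + gamma * max_a' Q[s', a'].
    target = r + gamma * np.max(Q[s_next])
    Q[s, a] += step_size * (target - Q[s, a])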
Example #3
def test_PI(test, policy_opt, values_opt, horizon, traj_limit):
    print("testPI")
    est_values_PI, policy_PI = policy_iteration(
        test.env, test.agent, every_visit_monte_carlo, kwargs={'n_eps': traj_limit, 'eps_len': horizon})

    #est_values_PI, policy_PI = policy_iteration(
    #    test.env, test.agent, temporal_difference, kwargs={'n_samp':1000, 'step_size': 0.1, 'horizon': horizon, 'traj_limit': traj_limit})

    #est_values_PI, policy_PI = policy_iteration(
    #    test.env, test.agent, first_visit_monte_carlo, kwargs={'n_eps': traj_limit, 'eps_len': horizon})
    print('here')


    values_PI, _ = value_iteration(mdp=test.env,
                                   policy=policy_PI)  # value of the student's PI policy under the teacher's (true) reward function

    policy_similarity = np.sum(policy_PI == policy_opt) / policy_PI.shape[0]
    print("Policy similarity for PI: {}".format(policy_similarity))

    # FOR NOW, THE START STATE DISTRIBUTION START_DIST DOES NOT HAVE AN ELEMENT FOR THE STATE nS
    total_value_opt = np.dot(test.env.start_dist, values_opt)
    #print("Optimal expected value: {}".format(total_value_opt))

    total_value_PI = np.dot(test.env.start_dist, values_PI)
    print("True PI expected value: {}".format(total_value_PI))

    value_gain_PI = total_value_PI / total_value_opt
    print("Value gain of true PI: {}".format(value_gain_PI))

    total_value_est_PI = np.dot(test.env.start_dist, est_values_PI)
    print("Estimated PI expected value: {}".format(total_value_est_PI))

    value_gain_est_PI = total_value_est_PI / total_value_opt
    print("Value gain of Est PI: {}".format(value_gain_est_PI))

    return policy_similarity, total_value_PI, total_value_est_PI, value_gain_PI, value_gain_est_PI
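test_PI evaluates candidate policies with Monte Carlo rollouts (every_visit_monte_carlo above). As a reference, here is a minimal every-visit Monte Carlo estimate of a state-value function from a batch of episodes given as (s, a, r, s') tuples; this is a generic sketch, not the project's implementation.

import numpy as np
from collections import defaultdict

def every_visit_mc_sketch(episodes, nS, gamma=0.95):
    # Average the discounted return observed from every visit to each state.
    returns = defaultdict(list)
    for episode in episodes:                   # episode: list of (s, a, r, s_next) tuples
        G = 0.0
        for s, a, r, s_next in reversed(episode):
            G = r + gamma * G                  # return from this visit onward
            returns[s].append(G)
    V = np.zeros(nS)
    for s, gs in returns.items():
        V[s] = np.mean(gs)
    return V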
Example #4
def scot(mdp, w, s_start=None, m=None, H=None, seed=None, verbose=False):
    """
    Implements the Set Cover Optimal Teaching (SCOT) algorithm from
    "Machine Teaching for Inverse Reinforcement Learning:
    Algorithms and Applications", Brown and Niekum (2019)

    Args:
        mdp: MDP environment
        w (np.array): weights of the linear reward function of the expert teacher agent
            (featurization computed by the MDP environment) as a numpy array
        s_start: list of possible initial states
        m (int): number of sample demonstration trajectories to draw per start state
        H (int): horizon (max length) of demonstration trajectories
        seed (int): random seed for NumPy (defaults to 2 if None)
        verbose (bool): whether to print out algorithmic runtime information

    Returns:
        D: list of maximally informative machine teaching trajectories
            represented as lists of (s, a, r, s') experience tuples
    """
    if seed is not None:
        np.random.seed(seed)
    else:
        np.random.seed(2)

    # compute optimal policy pi_opt
    _, teacher_pol_det = value_iteration(mdp)  # using a variation of the VI code from HW1
    # convert teacher policy to stochastic policy
    teacher_pol = det2stoch_policy(teacher_pol_det, mdp.nS, mdp.nA)

    # compute expected feature counts mu[s][a] under optimal policy
    mu, mu_sa = get_feature_counts(mdp, teacher_pol)

    # compute the BEC of the teacher's policy as a list of vectors defining the half-space
    # constraints on the linear reward function parameters implied by that policy
    BEC = np.empty((mdp.nS*(mdp.nA-1), w.shape[0]))
    i = 0
    for s in range(mdp.nS):
        for a in range(mdp.nA):
            if a != teacher_pol_det[s]:
                BEC[i] = mu[s] - mu_sa[s, a]
                i += 1

    # remove trivial, duplicate, and redundant half-space constraints
    BEC = refineBEC(w, BEC)

    if verbose:
        print("BEC", BEC)

    # (1) compute candidate demonstration trajectories

    # number of demonstration trajectories to sample per start state
    if m is None:
        m = int(np.ceil(1/(1.0 - 0.95*mdp.noise)))

    teacher = Agent(teacher_pol, mdp.nS, mdp.nA)
    wrapper = Wrapper(mdp, teacher, False)

    demo_trajs = []

    # limit trajectory length to guarantee SCOT termination,
    # increase max trajectory length for stochastic environments
    # may want to set trajectory limit to number of iterations required in VI/feature count computations
    if H is None:
        H = mdp.nS

    # sample demonstration trajectories from each start state (either from each state with non-zero
    # probability mass in the MDP's start state distribution, or from a given set of start states)
    # for s in range(mdp.nS):
    #     demo_trajs += wrapper.eval_episodes(m, s, horizon=H)[1]
    if s_start is None:
        for s in range(mdp.nS):
            if mdp.start_dist[s] > 0.0:
                demo_trajs += wrapper.eval_episodes(m, s, horizon=H)[1]
    else:
        for s in s_start:
            demo_trajs += wrapper.eval_episodes(m, s, horizon=H)[1]

    if verbose:
        print("number of demonstration trajectories:")
        print(len(demo_trajs))

    # (2) greedy set cover algorithm to compute maximally informative trajectories
    U = set()
    for i in range(BEC.shape[0]):
        U.add(tuple(BEC[i].tolist()))
    D = []
    C = set()

    U_sub_C = U - C
    if verbose:
        print("number of BEC constraints before set cover")
        print(len(U))

    # greedy set cover algorithm
    """
    The set cover problem: find the smallest sub-collection of S whose union equals the universe U.
    For example, for the universe U = {1,2,3,4,5} and the collection S = {{1,2,3},{2,4},{3,4},{4,5}},
    the two sets {1,2,3} and {4,5} already cover U.
    """
    BECs_trajs = []
    for traj in demo_trajs:
        BECs_trajs.append(compute_traj_BEC(traj, mu, mu_sa, mdp, w))

    while len(U_sub_C) > 0 and len(BECs_trajs) > 0:
        # cardinality of the intersection between BEC(traj|pi*) and U \ C for each candidate trajectory
        t_list = [len(BEC_traj.intersection(U_sub_C)) for BEC_traj in BECs_trajs]
        t_greedy_index = t_list.index(max(t_list))  # argmax over t_list to find the greedy trajectory
        t_greedy = demo_trajs[t_greedy_index]
        D.append(t_greedy)
        C = C.union(BECs_trajs[t_greedy_index])
        del BECs_trajs[t_greedy_index]
        del demo_trajs[t_greedy_index]
        U_sub_C = U - C
        if len(BECs_trajs) == 0:
            print("BEC_trajs empty")
            print(len(U_sub_C))
            print(U_sub_C)

    if verbose:
        print("trajectories", D)
        lens = [len(s) for s in D]
        print(len(D), lens)
    return D
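The greedy step in (2) is the standard set-cover heuristic: repeatedly pick the candidate whose constraint set covers the most still-uncovered elements of U. Below is a self-contained sketch of that heuristic on the toy universe from the comment above, independent of the MDP code.

def greedy_set_cover(universe, subsets):
    # Greedy heuristic: repeatedly take the subset covering the most uncovered elements.
    uncovered = set(universe)
    chosen = []
    while uncovered:
        best = max(subsets, key=lambda s: len(s & uncovered))
        if not (best & uncovered):   # no remaining subset adds anything new
            break
        chosen.append(best)
        uncovered -= best
    return chosen

U = {1, 2, 3, 4, 5}
S = [{1, 2, 3}, {2, 4}, {3, 4}, {4, 5}]
print(greedy_set_cover(U, S))   # [{1, 2, 3}, {4, 5}]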
Example #5
def max_likelihood_irl(D,
                       mdp,
                       step_size=0.01,
                       eps=1e-02,
                       max_steps=float("inf"),
                       verbose=False):
    """
    Maximum Likelihood IRL: returns the maximally likely reward weights under a Boltzmann
    policy for a given set of demonstration trajectories.
    See Vroman 2011 (http://www.icml-2011.org/papers/478_icmlpaper.pdf) for original paper,
    Ratia 2012 (https://arxiv.org/pdf/1202.1558.pdf) for likelihood gradient formula.

    :param D: list of trajectories of an optimal policy in the given MDP mdp
    :param mdp: MDP environment
    :param step_size: gradient ascent step size, float
    :param eps: convergence criterion, float
    :param max_steps: max number of steps in gradient ascent, int
    :param verbose: verbosity of algorithmic reporting
    :return: r_weights: reward weights as np array
    """

    # initialize reward weights
    r_weights = np.random.rand(*mdp.weights.shape)
    print("Initial reward weights:")
    print(r_weights)

    # get the state-action pairs observed in the trajectories
    # (technically these should be the only input to the algorithm; consider moving this code out into the calling code)
    sa_traj = []
    traj_states = []
    for traj in D:
        for i in range(len(traj)):
            sa_traj.append([traj[i][0], traj[i][1]])
            traj_states.append(traj[i][0])
    sa_traj = np.array(sa_traj)

    # convergence criteria
    iters = 0
    delta = eps + 1
    while delta > eps and iters < max_steps:
        iters += 1
        # compute value function and optimal policy under current reward estimate
        # Use max iterations of 100 in line with Vroman et al
        values, policy = value_iteration(mdp, r_weights=r_weights)
        policy = det2stoch_policy(policy, mdp.nS, mdp.nA)

        # compute Q-values
        Qvalues = np.array([[
            mdp.reward(s, r_weights) + mdp.gamma * np.sum(
                [mdp.P[s, a, succ] * values[succ] for succ in range(mdp.nS)])
            for a in range(mdp.nA)
        ] for s in traj_states])

        # compute state-action likelihoods for all trajectory state-action pairs
        # temperature value for Boltzmann exploration policy (softmax with each exponent multiplied by beta)
        # used by Vroman et al
        beta = 0.5

        # apply Boltzmann temperature. subtract max Q-value so no large intermediate exponential values in likelihoods
        Qvalues_normalized = beta * (Qvalues - np.max(Qvalues))
        likelihoods = np.exp(Qvalues_normalized)
        likelihoods /= likelihoods.sum(axis=1, keepdims=True)

        # compute state-action feature counts under current optimal policy
        mu, mu_sa = get_feature_counts(mdp, policy, tol=1.0e-03)

        # print(mdp.nA, likelihoods.shape, mu_sa.shape, sa_traj.shape)
        # print(sa_traj)
        # get likelihood gradient
        grad = beta * np.sum(np.array([
            mu_sa[sa_traj[sa_idx][0], sa_traj[sa_idx][1]]
            - np.sum(np.array([likelihoods[sa_idx, b] * mu_sa[sa_traj[sa_idx][0], b]
                               for b in range(mdp.nA)]), axis=0)
            for sa_idx in range(len(sa_traj))
        ]), axis=0)

        # perform gradient ascent step, use step size schedule of gradient ascent iterations
        r_weights_old = np.copy(r_weights)
        # L2_reg_factor = 0.001
        r_weights += step_size / iters * grad  #- L2_reg_factor * r_weights

        # normalize r_weights to constrain L2 norm and force a unique reward weight vector
        r_weights /= np.linalg.norm(r_weights, ord=2)

        # convergence criterion
        delta = np.linalg.norm(r_weights - r_weights_old, 1)
        if verbose:
            print(delta)

    if verbose:
        print("MLIRL iterations: {}".format(iters))
    return r_weights
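The state-action likelihoods above form a Boltzmann (softmax) policy with temperature beta, with a max-Q shift applied before exponentiation to avoid overflow. Written on its own, a standalone sketch with made-up Q-values (shifting by a per-row or a global maximum leaves the normalized probabilities unchanged):

import numpy as np

def boltzmann_likelihoods(Q, beta=0.5):
    # P(a | s) proportional to exp(beta * Q[s, a]); subtracting the max keeps exp() numerically stable.
    z = beta * (Q - Q.max(axis=1, keepdims=True))
    e = np.exp(z)
    return e / e.sum(axis=1, keepdims=True)

Q = np.array([[1.0, 0.5, -0.2],
              [0.3, 0.3, 0.0]])
print(boltzmann_likelihoods(Q))   # each row sums to 1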
Example #6
        else:
            curr_state = next_state

    V = np.max(Q, axis=1)
    policy = np.argmax(Q, axis=1)
    return V, Q, policy, num_trajs


if __name__ == '__main__':
    np.random.seed(2)
    from tests import BrownNiekum, Random
    from algorithms.value_iteration import value_iteration

    test = BrownNiekum()
    test.env.render()
    value_function_opt, policy = value_iteration(test.env)

    value_function_est, _, Q_policy = q_learning(
        test.wrapper, **{
            'n_samp': 50000,
            'step_size': 0.1,
            'epsilon': 0.1
        })

    # compare value functions
    print('Optimal policy: {}\nPolicy from Q-learning: {}'.format(
        policy, Q_policy))
    print('Optimal value function from Value Iteration: {}'.format(
        value_function_opt))
    print('Estimated value function from {}: {}'.format(
        q_learning.__name__, value_function_est))
Example #7
def main():

    # for i in range(50, 100):
    #     np.random.seed(i)
    #     test = get_env()  # default BrownNiekum()
    #     print("i")
    #     print(i)
    #     trajs = scot(test.env, test.env.weights, seed=i+1, verbose=False)


    horizon = 20
    traj_limit = 30
    #print(test_scot(test, policy_opt, values_opt, seed, horizon))
    #print(test_PI(test, policy_opt, values_opt, horizon))

    num_tests = 10
    MLIRL_policy_similarity_list = []
    total_value_MLIRL_list = []
    value_gain_MLIRL_list = []

    PI_policy_similarity_list = []
    total_value_PI_list = []
    total_value_est_PI_list = []
    value_gain_PI_list = []
    value_gain_est_PI_list = []

    QL_policy_similarity_list = []
    total_value_QL_list = []
    total_value_est_QL_list = []
    value_gain_QL_list = []
    value_gain_est_QL_list = []

    baseline_policy_similarity_list = []
    total_value_baseline_list = []
    value_gain_baseline_list = []

    for i in range(num_tests):
        np.random.seed(i)
        test = get_env()  # default BrownNiekum()
        test.env.render()
        values_opt, policy_opt = value_iteration(
            mdp=test.env)  # optimal value and policy under teacher's reward funct (true)

        print(i)
        #MLIRL_policy_similarity, total_value_MLIRL, value_gain_MLIRL = test_scot(test, policy_opt, values_opt, i, horizon, traj_limit)
        #exit(0)
        PI_policy_similarity, total_value_PI, total_value_est_PI, value_gain_PI, value_gain_est_PI = test_PI(test, policy_opt, values_opt, horizon, traj_limit)
        #QL_policy_similarity, total_value_QL, total_value_est_QL, value_gain_QL, value_gain_est_QL = test_QLearning(test, policy_opt, values_opt, horizon, traj_limit)
        #baseline_policy_similarity, total_value_baseline, value_gain_baseline  = test_baseline(test, policy_opt, values_opt, i, horizon, traj_limit)

        #MLIRL_policy_similarity_list.append(MLIRL_policy_similarity)
        #total_value_MLIRL_list.append(total_value_MLIRL)
        #value_gain_MLIRL_list.append(value_gain_MLIRL)

        PI_policy_similarity_list.append(PI_policy_similarity)
        total_value_PI_list.append(total_value_PI)
        total_value_est_PI_list.append(total_value_est_PI)
        value_gain_PI_list.append(value_gain_PI)
        value_gain_est_PI_list.append(value_gain_est_PI)
        """
        QL_policy_similarity_list.append(QL_policy_similarity)
        total_value_QL_list.append(total_value_QL)
        total_value_est_QL_list.append(total_value_est_QL)
        value_gain_QL_list.append(value_gain_QL)
        value_gain_est_QL_list.append(value_gain_est_QL)
        

        baseline_policy_similarity_list.append(baseline_policy_similarity)
        total_value_baseline_list.append(total_value_baseline)
        value_gain_baseline_list.append(value_gain_baseline)    
        """
    print("MLIRL_policy_similarity", np.mean(MLIRL_policy_similarity_list), np.var(MLIRL_policy_similarity_list))
    print("total_value_MLIRL", np.mean(total_value_MLIRL_list), np.var(total_value_MLIRL_list))
    print("value_gain_MLIRL", np.mean(value_gain_MLIRL_list), np.var(value_gain_MLIRL_list))

    print("PI_policy_similarity", np.mean(PI_policy_similarity_list), np.var(PI_policy_similarity_list))
    print("total_value_PI", np.mean(total_value_PI_list), np.var(total_value_PI_list))
    print("total_value_est_PI", np.mean(total_value_est_PI_list), np.var(total_value_est_PI_list))
    print("value_gain_PI", np.mean(value_gain_PI_list), np.var(value_gain_PI_list))
    print("value_gain_est_PI", np.mean(value_gain_est_PI_list), np.var(value_gain_est_PI_list))

    print("QL_policy_similarity", np.mean(QL_policy_similarity_list), np.var(QL_policy_similarity_list))
    print("total_value_QL", np.mean(total_value_QL_list), np.var(total_value_QL_list))
    print("total_value_est_QL", np.mean(total_value_est_QL_list), np.var(total_value_est_QL_list))
    print("value_gain_QL", np.mean(value_gain_QL_list), np.var(value_gain_QL_list))
    print("value_gain_est_QL", np.mean(value_gain_est_QL_list), np.var(value_gain_est_QL_list))

    print("baseline_policy_similarity", np.mean(baseline_policy_similarity_list), np.var(baseline_policy_similarity_list))
    print("total_value_baseline", np.mean(total_value_baseline_list), np.var(total_value_baseline_list))
    print("value_gain_baseline", np.mean(value_gain_baseline_list), np.var(value_gain_baseline_list))
Example #8
 def init_policy(self):
     _, policy = value_iteration(self.env)
     print('Policy from VI: {}'.format(policy))
     return policy
Example #9
 def init_policy(self):
     _, policy = value_iteration(self.env)
     print('Policy from VI: {}'.format(policy))
     policy = det2stoch_policy(policy, self.env.nS, self.env.nA)
     return policy
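The second init_policy variant above converts the deterministic VI policy with det2stoch_policy; presumably it one-hot encodes a deterministic policy (one action index per state) into an nS x nA action-probability matrix. A minimal sketch under that assumption follows; the actual helper in this repository may differ.

import numpy as np

def det2stoch_policy_sketch(policy_det, nS, nA):
    # Assumed behavior: one-hot encode a deterministic policy as an nS x nA probability matrix.
    policy = np.zeros((nS, nA))
    policy[np.arange(nS), policy_det] = 1.0
    return policy

print(det2stoch_policy_sketch(np.array([0, 2, 1]), nS=3, nA=3))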
Example #10
                break

            traj = []
        else:
            curr_state = next_state

    return V_pi

if __name__ == '__main__':
    np.random.seed(2)
    from tests import BrownNiekum, Random
    from algorithms.value_iteration import value_iteration
    test = BrownNiekum()
    test.env.render()

    value_function_opt, policy = value_iteration(test.env, verbose=True)
    
    # change this to test other functions
    policy_eval_func = temporal_difference

    if policy_eval_func == temporal_difference:
        n_samples = 500000
        step_size = 0.1
        horizon = 50
        value_function_est = policy_eval_func(test.wrapper, **{'n_samp': n_samples, 'step_size':step_size, 'horizon': horizon})
        print('Running {} with {} samples, step size {}, and horizon {}'.format(policy_eval_func.__name__, n_samples,
                                                                                step_size, horizon))
    elif policy_eval_func == every_visit_monte_carlo:
        eps_len = 500
        n_eps = 50000
        value_function_est = policy_eval_func(test.wrapper, **{'eps_len': eps_len, 'n_eps':n_eps})