Example #1
def get_expert_dataset(
    expert,
    venv,
    total_timesteps,
):
    filename = f"/tmp/{uuid.uuid4()}"
    n_episodes = total_timesteps // get_horizon(venv)

    generate_expert_traj(expert,
                         save_path=filename,
                         env=venv,
                         n_episodes=n_episodes)
    dataset = ExpertDataset(expert_path=f"{filename}.npz", verbose=0)

    return dataset
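A minimal usage sketch (not part of the original example): assuming `expert` is an already-trained stable-baselines model and `venv` is a fixed-horizon vectorized environment, the returned `ExpertDataset` can be used for behavioral cloning via stable-baselines' `pretrain`:

from stable_baselines import PPO2
from stable_baselines.common.policies import MlpPolicy

# Hypothetical usage: `expert` and `venv` are assumed to exist already.
dataset = get_expert_dataset(expert, venv, total_timesteps=10000)
student = PPO2(MlpPolicy, venv, verbose=0)
student.pretrain(dataset, n_epochs=100)  # behavioral cloning on the expert data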
Example #2
def maximum_entropy_irl(
    venv,
    expert=None,
    expert_venv=None,
    expert_trajectories=None,
    causal=True,
    total_timesteps=10000,
    **kwargs,
):
    if expert_trajectories is None:
        expert_trajectories = sample_trajectories(expert_venv,
                                                  expert,
                                                  n_timesteps=total_timesteps)

    nS = venv.observation_space.n

    expert_occupancy = np.zeros(nS)
    for trj in expert_trajectories:
        for ob in trj.obs:
            expert_occupancy[ob] += 1.0
    expert_occupancy /= expert_occupancy.sum()

    state_features = np.identity(nS)
    reward_model = LinearRewardModel(state_features)

    q_update_fn = mce_q_update_fn if causal else max_ent_q_update_fn

    horizon = get_horizon(venv)
    initial_state_distribution = get_initial_state_dist(venv)

    irl_reward, policy_matrix = occupancy_match_irl(
        dynamics=get_transition_matrix(venv),
        horizon=horizon,
        reward_model=reward_model,
        expert_occupancy=expert_occupancy,
        initial_state_distribution=initial_state_distribution,
        max_iterations=total_timesteps,
        q_update_fn=q_update_fn,
    )

    policy = LightweightRLModel.from_matrix(policy_matrix, env=venv)

    results = {}
    results["reward_model"] = irl_reward
    results["policy"] = policy
    return results
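For intuition, here is a self-contained toy version of the expert-occupancy computation above, with made-up trajectory data; the resulting distribution over states is what `occupancy_match_irl` tries to match:

import numpy as np

nS = 4
fake_trajectory_obs = [np.array([0, 1, 1, 3]), np.array([0, 2, 3, 3])]  # made up

expert_occupancy = np.zeros(nS)
for obs in fake_trajectory_obs:
    for ob in obs:
        expert_occupancy[ob] += 1.0
expert_occupancy /= expert_occupancy.sum()
print(expert_occupancy)  # [0.25  0.25  0.125 0.375]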
Example #3
def hard_value_iteration(venv, discount=1.0):
    horizon = get_horizon(venv)
    nS = venv.observation_space.n
    nA = venv.action_space.n

    reward_matrix = force_shape(get_reward_matrix(venv), (nS, nA, nS))
    dynamics = get_transition_matrix(venv)

    Q = np.empty((horizon, nS, nA))
    V = np.empty((horizon + 1, nS))
    V[-1] = np.zeros(nS)
    for t in reversed(range(horizon)):
        for s in range(nS):
            for a in range(nA):
                Q[t, s, a] = dynamics[s, a, :] @ (reward_matrix[s, a, :] +
                                                  discount * V[t + 1, :])
        V[t] = np.max(Q[t], axis=1)

    policy = np.eye(nA)[Q.argmax(axis=2)]

    return policy
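A quick sanity check of the same backward recursion on a hand-built deterministic MDP (everything below is illustrative, not from the original code):

import numpy as np

# Two states, two actions, horizon 3: action 0 leads to state 0, action 1 to
# state 1, and landing in state 1 is worth reward 1.
nS, nA, horizon, discount = 2, 2, 3, 1.0
dynamics = np.zeros((nS, nA, nS))
dynamics[:, 0, 0] = 1.0
dynamics[:, 1, 1] = 1.0
reward_matrix = np.zeros((nS, nA, nS))
reward_matrix[:, :, 1] = 1.0

Q = np.empty((horizon, nS, nA))
V = np.zeros((horizon + 1, nS))
for t in reversed(range(horizon)):
    for s in range(nS):
        for a in range(nA):
            Q[t, s, a] = dynamics[s, a, :] @ (reward_matrix[s, a, :] +
                                              discount * V[t + 1, :])
    V[t] = np.max(Q[t], axis=1)

policy = np.eye(nA)[Q.argmax(axis=2)]  # one-hot greedy policy, shape (horizon, nS, nA)
print(V[0])  # [3. 3.]: taking action 1 every step collects reward 1 per step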
Example #4
def soft_value_iteration(venv, beta=10):  # NOTE: the beta argument is not used below
    horizon = get_horizon(venv)
    nS = venv.observation_space.n
    nA = venv.action_space.n

    reward_matrix = force_shape(get_reward_matrix(venv), (nS, nA, nS))
    dynamics = get_transition_matrix(venv)

    Q = np.empty((horizon, nS, nA))
    V = np.empty((horizon + 1, nS))
    V[-1] = np.zeros(nS)
    for t in reversed(range(horizon)):
        for s in range(nS):
            for a in range(nA):
                Q[t, s, a] = dynamics[s, a, :] @ (reward_matrix[s, a, :] +
                                                  V[t + 1, :])
        V[t] = logsumexp(Q[t], axis=1)

    policy = np.exp(Q - V[:-1, :, None])

    return policy
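The final two lines convert the soft Q- and V-values into a Boltzmann (softmax) policy. A standalone check of that step with made-up numbers:

import numpy as np
from scipy.special import logsumexp

Q_t = np.array([[1.0, 2.0, 0.0],
                [0.5, 0.5, 0.5]])       # fake Q-values: 2 states, 3 actions
V_t = logsumexp(Q_t, axis=1)            # soft state values
policy_t = np.exp(Q_t - V_t[:, None])   # pi(a|s) proportional to exp(Q)
print(policy_t.sum(axis=1))             # [1. 1.]: each row is a distribution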
def preferences(
    venv,
    expert=None,
    evaluate_trajectories_fn=None,
    n_pairs_per_batch=50,
    n_timesteps_per_query=None,
    reward_lr=1e-3,
    policy_lr=1e-3,
    policy_epoch_timesteps=200,
    total_timesteps=10000,
    state_only=False,
    use_rnd_bonus=False,
    rnd_lr=1e-3,
    rnd_coeff=0.5,
    normalize_extrinsic=False,
    egreedy_sampling=False,
    **kwargs,
):
    if n_pairs_per_batch is None:
        horizon = get_horizon(venv)
        # Each query pair consumes two segments of roughly one horizon each.
        n_pairs_per_batch = int(n_timesteps_per_query / (2 * horizon))

    if evaluate_trajectories_fn is None:
        reward_eval_fn = reward_eval_path_fn(venv)
        evaluate_trajectories_fn = get_eval_trajectories_fn(reward_eval_fn)

    # Create reward model
    rn = BasicShapedRewardNet(
        venv.observation_space,
        venv.action_space,
        theta_units=[32, 32],
        phi_units=[32, 32],
        scale=True,
        state_only=state_only,
    )

    # Compute trajectory probabilities
    preferences_ph = tf.placeholder(
        shape=(None, 2),
        dtype=tf.float32,
        name="preferences",
    )
    num_segments = 2 * tf.shape(preferences_ph)[0]
    rewards_out = tf.reshape(rn.reward_output_train, [num_segments, -1])
    returns_out = tf.reduce_sum(rewards_out, axis=1)
    returns = tf.reshape(returns_out, shape=[-1, 2])
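    # Segments arrive in consecutive pairs, so `returns` has shape
    # (n_pairs, 2); preferences_ph marks which segment won each pair.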
    log_probs = tf.nn.log_softmax(returns, axis=1)

    # Write loss and optimizer op
    loss = -tf.reduce_sum(log_probs * preferences_ph)
    optimizer = tf.train.AdamOptimizer(learning_rate=reward_lr)
    reward_train_op = optimizer.minimize(loss)

    base_extrinsic_reward_fn = get_reward_fn_from_model(rn)

    if not use_rnd_bonus:
        reward_fn = base_extrinsic_reward_fn
    else:
        # Random network distillation bonus
        rnd_size = 50

        inputs = [rn.obs_inp, rn.act_inp]
        inputs = [tf.layers.flatten(x) for x in inputs]
        inputs = tf.concat(inputs, axis=1)

        rnd_target_net = build_mlp([32, 32, 32], output_size=rnd_size)
        rnd_target = sequential(inputs, rnd_target_net)

        rnd_pred_net = build_mlp([32, 32, 32], output_size=rnd_size)
        rnd_pred = sequential(inputs, rnd_pred_net)

        rnd_loss = tf.reduce_mean((tf.stop_gradient(rnd_target) - rnd_pred)**2)
        rnd_optimizer = tf.train.AdamOptimizer(learning_rate=rnd_lr)
        rnd_train_op = rnd_optimizer.minimize(rnd_loss)

        runn_rnd_rews = RunningMeanVar(alpha=0.01)

        def rnd_reward_fn(obs, acts=None, *args, **kwargs):
            # The RND prediction error (rnd_loss) is reused as the intrinsic reward.
            if acts is None:
                acts = [venv.action_space.sample()]
            int_rew = sess.run(rnd_loss,
                               feed_dict={
                                   rn.obs_ph: obs,
                                   rn.act_ph: acts
                               })
            int_rew = runn_rnd_rews.exp_update(int_rew)

            return int_rew

        if normalize_extrinsic:
            runn_ext_rews = RunningMeanVar(alpha=0.01)

        def extrinsic_reward_fn(*args, **kwargs):
            ext_rew = base_extrinsic_reward_fn(*args, **kwargs)
            if normalize_extrinsic:
                ext_rew = runn_ext_rews.exp_update(ext_rew)
            return ext_rew

        def reward_fn(*args, **kwargs):
            return (extrinsic_reward_fn(*args, **kwargs) +
                    rnd_coeff * rnd_reward_fn(*args, **kwargs))

    # Create learner from reward model
    venv_train = reward_wrapper.RewardVecEnvWrapper(venv, reward_fn)
    policy = PPO2(MlpPolicy, venv_train, learning_rate=policy_lr)

    # Start training
    sess = tf.get_default_session()
    sess.run(tf.global_variables_initializer())

    sampling_policy = (make_egreedy(policy, venv)
                       if egreedy_sampling else policy)

    num_epochs = int(np.ceil(total_timesteps / policy_epoch_timesteps))

    for epoch in range(num_epochs):
        trajectories = sample_trajectories(venv, sampling_policy,
                                           2 * n_pairs_per_batch)

        segments = get_segments(trajectories)

        seg_returns = evaluate_trajectories_fn(segments)
        seg_returns = seg_returns.reshape(-1, 2)
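        # One-hot preference labels: mark the segment with the higher
        # ground-truth return in each pair (ties produce [0, 0]).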
        preferences = np.stack(
            [
                seg_returns[:, 0] > seg_returns[:, 1],
                seg_returns[:, 1] > seg_returns[:, 0],
            ],
            axis=1,
        )

        obs = np.concatenate([seg.obs for seg in segments])
        acts = np.concatenate([seg.acts for seg in segments])
        next_obs = np.concatenate([seg.next_obs for seg in segments])

        ops = [reward_train_op]
        if use_rnd_bonus:
            ops.append(rnd_train_op)

        sess.run(
            ops,
            feed_dict={
                rn.obs_ph: obs,
                rn.act_ph: acts,
                rn.next_obs_ph: next_obs,
                preferences_ph: preferences,
            },
        )

        policy.learn(total_timesteps=policy_epoch_timesteps)

    results = {}
    results["reward_model"] = rn
    results["policy"] = policy

    return results
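The preference loss above is a cross-entropy under a Bradley-Terry-style model: segment returns in each pair are softmaxed and scored against the observed preference labels. A NumPy rendering of the same computation with made-up numbers:

import numpy as np

returns = np.array([[1.0, 3.0],          # fake per-segment returns, one row per pair
                    [2.0, 2.0]])
preferences = np.array([[0.0, 1.0],      # fake labels: preferred segment per pair
                        [1.0, 0.0]])
log_probs = returns - np.log(np.exp(returns).sum(axis=1, keepdims=True))
loss = -np.sum(log_probs * preferences)
print(loss)  # lower when the preferred segment already has the higher return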