import warnings

import numpy as np
# Assumed import: sp_lse is used below as a log-sum-exp over an axis, which
# matches SciPy's logsumexp.
from scipy.special import logsumexp as sp_lse


def max_causal_ent_policy(transition, reward, horizon, discount):
    """Soft Q-iteration, theorem 6.8 of Ziebart's PhD thesis (2010)."""
    nS, nA, _ = transition.shape
    V = np.zeros(nS)
    for i in range(horizon):
        # Soft Bellman backup: expectation over next states, soft max over actions.
        Q = reward.reshape(nS, 1) + discount * (transition * V).sum(2)
        V = sp_lse(Q, axis=1)
    # Stochastic policy pi(a|s) = exp(Q(s,a) - V(s)); rows sum to one.
    return np.exp(Q - V.reshape(nS, 1))
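

# Usage sketch (illustrative only, not part of the original module): soft
# Q-iteration on a made-up 2-state, 2-action MDP. Shapes assumed throughout
# this file: transition is (nS, nA, nS) with transition[s, a, s2] = P(s2 | s, a),
# reward is (nS,).
def _demo_max_causal_ent():
    transition = np.array([
        [[0.9, 0.1], [0.1, 0.9]],   # from state 0: action 0 mostly stays, action 1 mostly moves
        [[0.5, 0.5], [0.0, 1.0]],   # from state 1
    ])
    reward = np.array([0.0, 1.0])   # state 1 is rewarding
    policy = max_causal_ent_policy(transition, reward, horizon=20, discount=0.9)
    # Each row of the returned policy is a distribution over actions.
    assert policy.shape == (2, 2)
    assert np.allclose(policy.sum(axis=1), 1.0)
    return policy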


def max_ent_policy(transition, reward, horizon, discount):
    """Backward pass of algorithm 1 of Ziebart (2008).

    This corresponds to maximum entropy.

    WARNING: You probably want to use max_causal_ent_policy instead.
    See discussion in section 6.2.2 of Ziebart's PhD thesis (2010)."""
    nS = transition.shape[0]
    logsc = np.zeros(nS)  # TODO: terminal states only?
    with warnings.catch_warnings():
        # log(0) = -inf is expected for zero-probability transitions; suppress
        # the warning and let nan_to_num map -inf to a large negative number.
        warnings.filterwarnings('ignore', 'divide by zero encountered in log')
        logt = np.nan_to_num(np.log(transition))
    reward = reward.reshape(nS, 1, 1)
    for i in range(horizon):
        # Ziebart (2008) never describes how to handle discounting. This is a
        # backward pass: so on the i'th iteration, we are computing the
        # frequency a state/action is visited at the (horizon-i-1)'th position.
        # So we should multiply reward by discount ** (horizon - i - 1).
        cur_discount = discount ** (horizon - i - 1)
        x = logt + (cur_discount * reward) + logsc.reshape(1, 1, nS)
        logac = sp_lse(x, axis=2)
        logsc = sp_lse(logac, axis=1)
    return np.exp(logac - logsc.reshape(nS, 1))
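

# Illustrative comparison (a sketch on a made-up MDP, not part of the original
# module). For deterministic dynamics the two formulations coincide; with
# stochastic dynamics they generally differ, which is what the WARNING in the
# docstring above (and section 6.2.2 of the thesis) is about.
def _compare_ent_policies():
    transition = np.array([
        [[0.8, 0.2], [0.2, 0.8]],
        [[0.6, 0.4], [0.1, 0.9]],
    ])
    reward = np.array([0.0, 1.0])
    pi_causal = max_causal_ent_policy(transition, reward, horizon=15, discount=0.95)
    pi_ent = max_ent_policy(transition, reward, horizon=15, discount=0.95)
    # Both are (nS, nA) action distributions, but their entries differ in general.
    return pi_causal, pi_ent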


def logsumexp(q, alpha=1.0, axis=1):
    """Soft maximum of q along axis, with temperature alpha.

    As alpha -> 0 this approaches a hard max; alpha == 0 is handled
    explicitly to avoid dividing by zero."""
    if alpha == 0:
        return np.max(q, axis=axis)
    return alpha * sp_lse((1.0 / alpha) * q, axis=axis)
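

# Quick check (illustrative only): alpha acts as a temperature. alpha = 1 gives
# the standard log-sum-exp (a soft maximum), smaller alpha sharpens it, and
# alpha = 0 degenerates to a hard max, which is why that case is special-cased
# above rather than dividing by zero.
def _demo_logsumexp():
    q = np.array([[1.0, 2.0, 3.0]])
    soft = logsumexp(q, alpha=1.0, axis=1)   # slightly above 3.0
    sharp = logsumexp(q, alpha=0.1, axis=1)  # very close to 3.0
    hard = logsumexp(q, alpha=0.0, axis=1)   # exactly 3.0
    assert np.allclose(hard, 3.0)
    return soft, sharp, hard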