def point_get_action(theta, ob, rng=np.random):
    """Sample an action from a Gaussian centred on the linear policy output.

    :param theta: A matrix of size |A| * (|S|+1)
    :param ob: A vector of size |S|
    :param rng: Random source exposing ``normal(loc=..., scale=...)``;
        defaults to the ``np.random`` module
    :return: A vector of size |A|
    """
    # include_bias is defined elsewhere; presumably it appends a constant
    # bias entry so the observation lines up with theta's (|S|+1) columns.
    augmented_ob = include_bias(ob)
    action_mean = np.dot(theta, augmented_ob)
    # Unit standard deviation in every action dimension.
    return rng.normal(loc=action_mean, scale=1.0)
def compute_logits(theta, ob):
    """Compute one unnormalised score (logit) per action for a linear policy.

    :param theta: A matrix of size |A| * (|S|+1)
    :param ob: A vector of size |S|
    :return: A vector of size |A|
    """
    # include_bias is defined elsewhere; presumably it extends ob to
    # length |S|+1 so it can be multiplied against theta's columns.
    augmented_ob = include_bias(ob)
    return np.dot(augmented_ob, theta.T)
def point_get_grad_logp_action(theta, ob, action):
    """Return the outer product of the action residual and the biased observation.

    The residual is ``action - theta . include_bias(ob)``; per the function
    name, the result is used as the gradient of log p(action | ob) with
    respect to theta.

    :param theta: A matrix of size |A| * (|S|+1)
    :param ob: A vector of size |S|
    :param action: A vector of size |A|
    :return: A matrix of size |A| * (|S|+1)
    """
    augmented_ob = include_bias(ob)            # length |S|+1
    residual = action - np.dot(theta, augmented_ob)  # length |A|
    return np.outer(residual, augmented_ob)
def point_get_logp_action(theta, ob, action):
    """Return the log-density of ``action`` under a unit-variance Gaussian.

    The Gaussian is centred on ``theta . include_bias(ob)`` and has one
    independent dimension per row of theta.

    :param theta: A matrix of size |A| * (|S|+1)
    :param ob: A vector of size |S|
    :param action: A vector of size |A|
    :return: A scalar
    """
    augmented_ob = include_bias(ob)
    residual = action - theta.dot(augmented_ob)
    # Normalisation constant: one -0.5*log(2*pi) term per action dimension.
    log_norm = -0.5 * np.log(2 * np.pi) * theta.shape[0]
    # Quadratic term: half the squared distance from the mean.
    half_sq_dist = 0.5 * np.sum(np.square(residual))
    return log_norm - half_sq_dist
def cartpole_get_grad_logp_action(theta, ob, action):
    """Return outer(one_hot(action) - softmax(logits), biased observation).

    Per the function name, the result is used as the gradient of
    log p(action | ob) with respect to theta for a softmax policy.

    :param theta: A matrix of size |A| * (|S|+1)
    :param ob: A vector of size |S|
    :param action: An integer
    :return: A matrix of size |A| * (|S|+1)
    """
    augmented_ob = include_bias(ob)             # length |S|+1
    logits = augmented_ob.dot(theta.T)          # length |A|
    # Indicator vector with a single 1 at the chosen action's index.
    chosen = np.zeros(theta.shape[0])           # length |A|
    chosen[action] = 1.
    # softmax is defined elsewhere; presumably it maps the logits to
    # action probabilities.
    prob_gap = chosen - softmax(logits)         # length |A|
    return np.outer(prob_gap, augmented_ob)     # |A| * (|S|+1)
def point_get_grad_logp_action(theta, ob, action):
    """Return the outer product of the action residual and the biased observation.

    The residual is ``action - theta . include_bias(ob)``; per the function
    name, the result is used as the gradient of log p(action | ob) with
    respect to theta.

    :param theta: A matrix of size |A| * (|S|+1)
    :param ob: A vector of size |S|
    :param action: A vector of size |A|
    :return: A matrix of size |A| * (|S|+1)
    """
    # include_bias is defined elsewhere; presumably it appends a constant
    # bias entry so ob matches theta's (|S|+1) columns.
    ob_1 = include_bias(ob)
    mean = theta.dot(ob_1)
    zs = action - mean
    return np.outer(zs, ob_1)
def cartpole_get_grad_logp_action(theta, ob, action):
    """Return outer(one_hot(action) - action probabilities, biased observation).

    Per the function name, the result is used as the gradient of
    log p(action | ob) with respect to theta for a softmax policy.

    :param theta: A matrix of size |A| * (|S|+1)
    :param ob: A vector of size |S|
    :param action: An integer
    :return: A matrix of size |A| * (|S|+1)
    """
    # Indicator vector with a single 1 at the chosen action's index.
    one_hot = np.zeros(theta.shape[0])
    one_hot[action] = 1
    # softmax and compute_logits are defined elsewhere; presumably softmax
    # turns the per-action logits into probabilities.
    probs = softmax(compute_logits(theta, ob))
    ob_1 = include_bias(ob)
    return np.outer(one_hot - probs, ob_1)
def cartpole_get_action(theta, ob, rng=np.random):
    """Pick an action by sampling from the policy's per-action logits.

    :param theta: Policy weight matrix; presumably of size |A| * (|S|+1)
        -- confirm against callers
    :param ob: Observation vector; presumably of size |S|
    :param rng: Random source forwarded to ``weighted_sample``; defaults to
        the ``np.random`` module
    :return: Whatever ``weighted_sample`` returns (presumably an action index)
    """
    # include_bias and weighted_sample are defined elsewhere in the project.
    augmented_ob = include_bias(ob)
    action_scores = np.dot(augmented_ob, theta.T)
    return weighted_sample(action_scores, rng=rng)
def point_get_action(theta, ob, rng=np.random):
    """Sample an action from a Gaussian centred on the linear policy output.

    :param theta: Policy weight matrix; presumably of size |A| * (|S|+1)
        -- confirm against callers
    :param ob: Observation vector; presumably of size |S|
    :param rng: Random source exposing ``normal(loc=..., scale=...)``;
        defaults to the ``np.random`` module
    :return: A sample with the same shape as ``theta . include_bias(ob)``
    """
    # include_bias is defined elsewhere in the project.
    centre = np.dot(theta, include_bias(ob))
    # Unit standard deviation in every action dimension.
    return rng.normal(loc=centre, scale=1.0)