import numpy as np


def test_metrics(dim):
    y = np.zeros(dim)
    x = np.ones(dim)
    scaling_1 = np.ones(dim)
    scaling_2 = 0.5 * np.ones(dim)
    for p in range(1, 10):
        # with unit scaling, the distance between ones and zeros is dim ** (1 / p)
        assert np.abs(metric_lp(x, y, p, scaling_1) - np.power(dim, 1.0 / p)) < 1e-15
        # halving the scaling doubles every per-coordinate term, hence the distance
        assert (
            np.abs(metric_lp(x, y, p, scaling_2) - 2 * np.power(dim, 1.0 / p))
            < 1e-15
        )
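

# `metric_lp` is used but not defined in this file. Below is a minimal
# reference sketch, under a hypothetical name, of what the assertions above
# assume it computes (a coordinate-wise scaled L^p distance); the actual
# helper presumably lives in another module and may be JIT-compiled.
def _metric_lp_reference(x, y, p, scaling):
    """Hypothetical sketch: (sum_i |x_i - y_i|^p / scaling_i^p) ** (1 / p)."""
    d = 0.0
    for ii in range(x.shape[0]):
        d += np.power(np.abs(x[ii] - y[ii]) / scaling[ii], p)
    return np.power(d, 1.0 / p)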


def map_to_representative(
    state,
    lp_metric,
    representative_states,
    n_representatives,
    min_dist,
    scaling,
    accept_new_repr,
):
    """
    Map a state to its closest representative state.

    If the state is farther than min_dist from every stored representative,
    there is room left in the buffer, and accept_new_repr is True, the state
    is added as a new representative and (its index, 0.0) is returned.
    """
    dist_to_closest = np.inf
    argmin = -1
    for ii in range(n_representatives):
        dist = metric_lp(state, representative_states[ii, :], lp_metric, scaling)
        if dist < dist_to_closest:
            dist_to_closest = dist
            argmin = ii

    max_representatives = representative_states.shape[0]
    if (
        dist_to_closest > min_dist
        and n_representatives < max_representatives
        and accept_new_repr
    ):
        new_index = n_representatives
        representative_states[new_index, :] = state
        return new_index, 0.0
    return argmin, dist_to_closest
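

# An illustrative call, not part of the algorithm: buffer size, state
# dimension and min_dist below are arbitrary choices for the example.
def _example_map_to_representative():
    representative_states = np.zeros((5, 3))  # pre-allocated buffer, origin stored
    state = 0.2 * np.ones(3)
    idx, dist = map_to_representative(
        state,
        lp_metric=2,
        representative_states=representative_states,
        n_representatives=1,
        min_dist=0.1,
        scaling=np.ones(3),
        accept_new_repr=True,
    )
    # The distance to the origin is 0.2 * sqrt(3) > min_dist, so the state is
    # stored at index 1 and (idx, dist) == (1, 0.0); the caller is responsible
    # for incrementing n_representatives afterwards.
    return idx, dist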


def update_model(
    repr_state,
    action,
    repr_next_state,
    reward,
    n_representatives,
    repr_states,
    lp_metric,
    scaling,
    bandwidth,
    bonus_scale_factor,
    beta,
    v_max,
    bonus_type,
    kernel_type,
    N_sa,
    B_sa,
    P_hat,
    R_hat,
):
    """
    Model update function, lots of arguments so we can use JIT :)
    """
    # aux var for transition update
    dirac_next_s = np.zeros(n_representatives)
    dirac_next_s[repr_next_state] = 1.0

    for u_repr_state in range(n_representatives):
        # compute weight
        dist = metric_lp(
            repr_states[repr_state, :],
            repr_states[u_repr_state, :],
            lp_metric,
            scaling,
        )
        weight = kernel_func(dist / bandwidth, kernel_type=kernel_type)

        # aux variables
        prev_N_sa = beta + N_sa[u_repr_state, action]  # regularization beta
        current_N_sa = prev_N_sa + weight

        # update weights
        N_sa[u_repr_state, action] += weight

        # update transitions
        P_hat[u_repr_state, action, :n_representatives] = (
            dirac_next_s * weight / current_N_sa
            + (prev_N_sa / current_N_sa)
            * P_hat[u_repr_state, action, :n_representatives]
        )

        # update rewards
        R_hat[u_repr_state, action] = (
            weight * reward / current_N_sa
            + (prev_N_sa / current_N_sa) * R_hat[u_repr_state, action]
        )

        # update bonus
        B_sa[u_repr_state, action] = compute_bonus(
            N_sa[u_repr_state, action],
            beta,
            bonus_scale_factor,
            v_max,
            bonus_type,
        )
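

# An illustrative call to update_model, not part of the algorithm. Shapes,
# hyperparameters, and the "gaussian" / "simplified_bernstein" option strings
# below are assumptions chosen for the example; `kernel_func` and
# `compute_bonus` are expected to be provided elsewhere.
def _example_update_model():
    S_max, A, dim = 5, 2, 3  # buffer size, number of actions, state dimension
    repr_states = np.random.rand(S_max, dim)
    N_sa = np.zeros((S_max, A))
    B_sa = np.zeros((S_max, A))
    P_hat = np.zeros((S_max, A, S_max))
    R_hat = np.zeros((S_max, A))
    update_model(
        repr_state=0,
        action=1,
        repr_next_state=2,
        reward=1.0,
        n_representatives=3,
        repr_states=repr_states,
        lp_metric=2,
        scaling=np.ones(dim),
        bandwidth=0.1,
        bonus_scale_factor=1.0,
        beta=0.01,
        v_max=10.0,
        bonus_type="simplified_bernstein",
        kernel_type="gaussian",
        N_sa=N_sa,
        B_sa=B_sa,
        P_hat=P_hat,
        R_hat=R_hat,
    )
    # Each touched row of P_hat becomes a convex combination of its previous
    # value and a Dirac at repr_next_state, with weights prev_N_sa / current_N_sa
    # and weight / current_N_sa respectively.
    return N_sa, B_sa, P_hat, R_hat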