def test_sample_single(self):
    # values are already normalized
    values = np.array([1.0, 0.0])
    c = Categorical(values=values)
    self.assertEqual(0, c.sample())

    # values are not normalized
    values = np.array([0.0, 10.0])
    c = Categorical(values=values)
    self.assertEqual(1, c.sample())
def sample_action(q_pi, policies, n_control, sampling_type="marginal_action"):
    """
    Samples an action from the posterior over policies, using one of two methods.

    Parameters
    ----------
    q_pi [1D numpy.ndarray or Categorical]:
        Posterior beliefs about (possibly multi-step) policies.
    policies [list of numpy ndarrays]:
        List of arrays that indicate the policies under consideration. Each element
        within the list is a matrix that stores the indices of the actions along the
        separate hidden state factors, at each timestep (nStep x nControlFactor).
    n_control [list of integers]:
        List of the dimensionalities of the different (controllable) hidden state factors.
    sampling_type [string, 'marginal_action' or 'posterior_sample']:
        Indicates whether the sampled action for a given hidden state factor is given
        by the evidence for that action, marginalized across different policies
        ('marginal_action'), or is simply the action entailed by a sample from the
        posterior over policies ('posterior_sample').

    Returns
    ----------
    selected_policy [1D numpy ndarray]:
        Numpy array containing the indices of the actions along each control factor.
    """
    n_factors = len(n_control)

    if sampling_type == "marginal_action":
        if utils.is_distribution(q_pi):
            q_pi = utils.to_numpy(q_pi)
        action_marginals = np.empty(n_factors, dtype=object)
        for c_idx in range(n_factors):
            action_marginals[c_idx] = np.zeros(n_control[c_idx])

        # weight each action according to its integrated posterior probability
        # over policies and timesteps
        for pol_idx, policy in enumerate(policies):
            for t in range(policy.shape[0]):
                for factor_i, action_i in enumerate(policy[t, :]):
                    action_marginals[factor_i][action_i] += q_pi[pol_idx]

        action_marginals = Categorical(values=action_marginals)
        action_marginals.normalize()
        selected_policy = np.array(action_marginals.sample())

    elif sampling_type == "posterior_sample":
        if utils.is_distribution(q_pi):
            policy_index = q_pi.sample()
            selected_policy = policies[policy_index]
        else:
            q_pi = Categorical(values=q_pi)
            policy_index = q_pi.sample()
            selected_policy = policies[policy_index]

    return selected_policy
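A minimal usage sketch for the function above, with hypothetical values: two candidate two-step policies over a single control factor that has three possible actions. It assumes sample_action, Categorical, and utils are importable as in the module above.

import numpy as np

# Posterior over the two candidate policies (already normalized) -- hypothetical values.
q_pi = np.array([0.75, 0.25])

# Each policy is an (nStep x nControlFactor) array of action indices.
policies = [np.array([[0], [1]]), np.array([[2], [2]])]

# One control factor with three possible actions.
n_control = [3]

# Marginalize policy evidence over actions and sample one action per control factor.
action = sample_action(q_pi, policies, n_control, sampling_type="marginal_action")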
def test_sample_AoA(self):
    # values are already normalized; dtype=object is needed for the ragged
    # array-of-arrays under recent versions of numpy
    values_1 = np.array([1.0, 0.0])
    values_2 = np.array([0.0, 1.0, 0.0])
    values = np.array([values_1, values_2], dtype=object)
    c = Categorical(values=values)
    self.assertTrue(np.isclose(np.array([0, 1]), c.sample()).all())

    # values are not normalized
    values_1 = np.array([10.0, 0.0])
    values_2 = np.array([0.0, 10.0, 0.0])
    values = np.array([values_1, values_2], dtype=object)
    c = Categorical(values=values)
    self.assertTrue(np.isclose(np.array([0, 1]), c.sample()).all())
def sample_action(p_i, possible_policies, Nu, sampling_type="marginal_action"):
    """
    Samples an action from the posterior over policies, using one of two methods.

    @TODO: Needs to be amended for use with multi-step policies (where
    possible_policies is a list of np.arrays (nStep x nFactor), not just a list
    of tuples as it is now).

    Parameters
    ----------
    p_i [1D numpy.ndarray or Categorical]:
        Variational posterior over policies.
    possible_policies [list of tuples]:
        List of tuples that indicate the possible policies under consideration.
        Each tuple stores the actions taken upon the separate hidden state factors.
        Same length as p_i.
    Nu [list of integers]:
        List of the dimensionalities of the different (controllable) hidden states.
    sampling_type [string, 'marginal_action' or 'posterior_sample']:
        Indicates whether the sampled action for a given hidden state factor is given
        by the evidence for that action, marginalized across different policies
        ('marginal_action'), or is simply the action entailed by the policy sampled
        from the posterior ('posterior_sample').

    Returns
    ----------
    selected_policy [tuple]:
        Tuple containing the list of actions selected by the agent.
    """
    numControls = len(Nu)

    if sampling_type == "marginal_action":
        if isinstance(p_i, Categorical):
            p_i = p_i.values.squeeze()
        action_marginals = np.empty(numControls, dtype=object)
        for nu_i in range(numControls):
            action_marginals[nu_i] = np.zeros(Nu[nu_i])

        # weight each action according to the posterior probability it gets across policies
        for pol_i, policy in enumerate(possible_policies):
            for nu_i, a_i in enumerate(policy):
                action_marginals[nu_i][a_i] += p_i[pol_i]

        action_marginals = Categorical(values=action_marginals)
        action_marginals.normalize()
        selected_policy = action_marginals.sample()

    elif sampling_type == "posterior_sample":
        if isinstance(p_i, Categorical):
            policy_index = p_i.sample()
            selected_policy = possible_policies[policy_index]
        else:
            sample_onehot = np.random.multinomial(1, p_i.squeeze())
            policy_index = np.where(sample_onehot == 1)[0][0]
            selected_policy = possible_policies[policy_index]

    return selected_policy
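A similar hypothetical sketch for the single-step variant above, where each policy is a tuple holding one action index per controllable hidden state factor and Nu gives each factor's number of possible actions.

import numpy as np

# Posterior over two single-step policies -- hypothetical values.
p_i = np.array([0.6, 0.4])

# Each tuple stores one action index per (controllable) hidden state factor.
possible_policies = [(0, 1), (1, 0)]

# Two controllable factors, each with two possible actions.
Nu = [2, 2]

# Sample a full policy directly from the posterior over policies.
action = sample_action(p_i, possible_policies, Nu, sampling_type="posterior_sample")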