def vpi(self, state) -> 'float, >= -0.001':
    """Calculates the VPI (value of perfect information).

    All nodes of the branch are important; this is essentially
    vpi_action computed with the goal node selected.
    """
    option_dist = []
    for option in range(1, self.no_options + 1):
        action = self.goals[option - 1][0]
        # Observation set: the goal's subtree plus the path leading to it (root excluded)
        obs = (*self.subtree[action][0:], *self.path_to(action)[1:])
        obs = list(set(obs))
        op_dist = self.node_value_after_observe_option(option, state, obs)
        node_idx = self.goals[option - 1][0]
        if not hasattr(state[node_idx], 'sample'):
            # Already-observed goal node: wrap the known value in a point distribution
            goal_dist = Categorical(vals=[state[node_idx]], probs=[1])
        else:
            goal_dist = state[node_idx]
        dists = [op_dist, goal_dist]
        option_dist.append(cross_1(dists, sum))
    net_dist = self.shrink(option_dist)
    nvao = float(cmax(net_dist, default=ZERO).expectation())
    result = nvao - self.expected_term_reward_disc(state)
    # Clamp tiny numerical noise to zero so the '>= -0.001' contract holds
    if abs(result) < 0.001:
        result = 0.0
    return result
def vpi_action(self, action, state) -> 'float, >= -0.001':
    """Calculates the VPI of an action.

    The nodes of importance are those that are either ancestors
    (on the path to the selected node) or descendants (in its subtree).
    """
    option_dist = []
    # Observation set: the action's subtree plus the path leading to it (root excluded)
    obs = (*self.subtree[action][0:], *self.path_to(action)[1:])
    obs = list(set(obs))
    for option in range(1, self.no_options + 1):
        op_dist = self.node_value_after_observe_option(option, state, obs)
        node_idx = self.goals[option - 1][0]
        if not hasattr(state[node_idx], 'sample'):
            # Already-observed goal node: wrap the known value in a point distribution
            goal_dist = Categorical(vals=[state[node_idx]], probs=[1])
        else:
            goal_dist = state[node_idx]
        dists = [op_dist, goal_dist]
        option_dist.append(cross_1(dists, sum))
    net_dist = self.shrink(option_dist)
    nvao = float(cmax(net_dist, default=ZERO).expectation())
    result = nvao - self.expected_term_reward_disc(state)
    # Clamp tiny numerical noise to zero so the '>= -0.001' contract holds
    if abs(result) < 0.001:
        result = 0.0
    return result
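# The observation set above is the union of the selected node's descendants
# (self.subtree) and its ancestors (self.path_to, root excluded). A minimal
# standalone sketch of that construction; the tree layout, node indices, and
# the stand-in `subtree` / `path_to` below are hypothetical, for illustration only.
subtree = {3: [3, 4, 5]}          # node 3 and its descendants

def path_to(node):
    # nodes on the path from the root (0) to `node`
    return [0, 1, 3]

action = 3
obs = (*subtree[action][0:], *path_to(action)[1:])  # descendants + ancestors (root excluded)
obs = list(set(obs))                                # de-duplicate (node 3 appears in both)
print(sorted(obs))                                  # -> [1, 3, 4, 5]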
def high_vpi(self, state, bins=4):
    """Returns the high-level VPI.

    Arguments:
        state: high-level state for the computation
        bins: number of bins used to discretize continuous distributions
    """
    dists = []
    for option in range(1, self.no_options + 1):
        # Get the distribution of each option's goal node
        goal_clicked = self.goals[option - 1][0]
        node = self.low_state[goal_clicked]
        if hasattr(node, 'sample'):
            if hasattr(node, 'mu'):
                # Continuous node: discretize and round for a compact support
                dist = node.to_discrete(n=bins, max_sigma=4)
                dist.vals = tuple(round(val, 3) for val in dist.vals)
                dist.probs = tuple(round(p, 3) for p in dist.probs)
            else:
                dist = node
        else:
            # Already-observed node: wrap the known value in a point distribution
            dist = Categorical(vals=[node], probs=[1])
        dists.append(dist)
    net_dist = self.shrink(dists)
    expected_return = cmax(net_dist).expectation()
    return expected_return - self.expected_high_term_reward(state)
def shrink(self, option_dist):
    """Recursively reduces a list of option distributions to at most two
    by repeatedly folding the first two into their maximum distribution."""
    if len(option_dist) <= 2:
        # Nothing left to fold (also guards against fewer than two options)
        return option_dist
    else:
        two_dist = [option_dist[0], option_dist[1]]
        new_dist = [cmax(two_dist, default=ZERO)] + option_dist[2:]
        return self.shrink(new_dist)
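# The recursion folds the option list two at a time. A toy analogue of the
# same fold, with plain floats standing in for distributions and `max`
# standing in for `cmax` (values are made up for illustration):
def shrink_numbers(option_vals):
    """Toy analogue of shrink: floats replace distributions, max replaces cmax."""
    if len(option_vals) <= 2:
        return option_vals
    folded = [max(option_vals[0], option_vals[1])] + option_vals[2:]
    return shrink_numbers(folded)

print(shrink_numbers([1.0, 4.0, 2.0, 3.0]))  # -> [4.0, 3.0]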
def exact_node_value_after_observe(obs_tree):
    """A distribution over the expected value of a node after making an observation.

    Arguments:
        obs_tree: the tree of nodes used for the observation, given as
            nested (node_distribution, children) pairs
    """
    # Best achievable value over any path into the children of obs_tree
    children = tuple(exact_node_value_after_observe(c) + c[0]
                     for c in obs_tree[1])
    return cmax(children, default=ZERO)
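# A toy analogue of the recursion above, with plain numbers in place of
# distributions and `max(..., default=0)` standing in for `cmax(..., default=ZERO)`.
# The tree values are hypothetical, for illustration only.
def toy_node_value_after_observe(obs_tree):
    """Toy analogue: best achievable sum along any path below the root,
    falling back to 0 at a leaf."""
    children = tuple(toy_node_value_after_observe(c) + c[0]
                     for c in obs_tree[1])
    return max(children, default=0)

# (value, children) pairs
tree = (0, [(1, [(5, []), (2, [])]),
            (3, [(1, [])])])
print(toy_node_value_after_observe(tree))  # -> 6  (path 1 + 5)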