def __init__(self, obs_space, share_obs_space, action_space, gain=1, base=None, base_kwargs=None, device=torch.device("cpu")):
    super(Policy, self).__init__()
    self.mixed_action = False
    self.multi_discrete = False
    self.device = device
    if base_kwargs is None:
        base_kwargs = {}

    if obs_space.__class__.__name__ == "Box":
        obs_shape = obs_space.shape
        share_obs_shape = share_obs_space.shape
    elif obs_space.__class__.__name__ == "list":
        obs_shape = obs_space
        share_obs_shape = share_obs_space
    else:
        raise NotImplementedError

    if len(obs_shape) == 3:
        self.base = CNNBase(obs_shape, share_obs_shape, **base_kwargs)
    else:
        self.base = MLPBase(obs_shape, share_obs_shape, **base_kwargs)

    if action_space.__class__.__name__ == "Discrete":
        num_actions = action_space.n
        self.dist = Categorical(self.base.output_size, num_actions, gain)
    elif action_space.__class__.__name__ == "Box":
        num_actions = action_space.shape[0]
        self.dist = DiagGaussian(self.base.output_size, num_actions)
    elif action_space.__class__.__name__ == "MultiBinary":
        num_actions = action_space.shape[0]
        self.dist = Bernoulli(self.base.output_size, num_actions)
    elif action_space.__class__.__name__ == "MultiDiscrete":
        self.multi_discrete = True
        self.discrete_N = action_space.shape
        action_size = action_space.high - action_space.low + 1
        self.dists = []
        for num_actions in action_size:
            self.dists.append(Categorical(self.base.output_size, num_actions, gain))
        self.dists = nn.ModuleList(self.dists)
    else:  # mixed discrete + continuous action space
        self.mixed_action = True
        continous = action_space[0].shape[0]
        discrete = action_space[1].n
        self.dist = nn.ModuleList([
            DiagGaussian(self.base.output_size, continous),
            Categorical(self.base.output_size, discrete, gain)
        ])
def split_obs(obs, x, num_parents):
    node_var: Categorical = obs[x]
    obs_values = []
    for v in zip(node_var.vals):
        obs_copy = [o for o in obs]
        obs_copy[x] = Categorical(v)
        # Copy obs to new nodes
        for i in range(num_parents - 1):
            obs_copy.append(Categorical(v))
        obs_values.append(obs_copy)
    probs = node_var.probs
    return probs, obs_values
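# Illustrative use of split_obs (an assumption, not part of the original code): it relies
# on the project's Categorical class having .vals and .probs; the concrete values are made up.
# Splitting the distribution at index 1 of a two-node observation list yields one candidate
# observation per support value, each extended with num_parents - 1 copies of that value.
if __name__ == "__main__":
    obs = [Categorical([0]), Categorical([-5, 5])]
    probs, obs_values = split_obs(obs, x=1, num_parents=2)
    # probs mirrors the split node's probabilities; each candidate in obs_values has the
    # split node fixed to a single value plus one appended copy for the extra parent.
    print(probs, [[o.vals for o in candidate] for candidate in obs_values])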
def prepare_loss(self):
    with tf.device(self.device):
        print(" [{}]Preparing loss".format(self.id))
        # [Policy distribution]
        if self.is_continuous_control():
            # Old policy
            old_policy_batch = tf.transpose(self.old_policy_batch, [1, 0, 2])
            old_policy_distributions = Normal(old_policy_batch[0], old_policy_batch[1])
            # New policy
            new_policy_batch = tf.transpose(self.policy_batch, [1, 0, 2])
            new_policy_distributions = Normal(new_policy_batch[0], new_policy_batch[1])
        else:  # discrete control
            old_policy_distributions = Categorical(self.old_policy_batch)  # Old policy
            new_policy_distributions = Categorical(self.policy_batch)  # New policy
        # [Actor loss]
        policy_loss_builder = PolicyLoss(
            cliprange=self.clip,
            cross_entropy=new_policy_distributions.cross_entropy(self.old_action_batch),
            old_cross_entropy=old_policy_distributions.cross_entropy(self.old_action_batch),
            advantage=self.advantage_batch,
            # entropy=self.fentropy,
            entropy=new_policy_distributions.entropy(),
            beta=self.beta
        )
        self.policy_loss = policy_loss_builder.get()
        # [Critic loss]
        value_loss_builder = ValueLoss(
            cliprange=self.clip,
            value=self.value_batch,
            old_value=self.old_value_batch,
            reward=self.cumulative_reward_batch
        )
        self.value_loss = flags.value_coefficient * value_loss_builder.get()  # usually the critic has a lower learning rate
        # [Extra loss]
        self.extra_loss = tf.constant(0.)
        if self.predict_reward:
            self.extra_loss += self._reward_prediction_loss()
        # [Debug variables]
        self.policy_kl_divergence = policy_loss_builder.approximate_kullback_leibler_divergence()
        self.policy_clipping_frequency = policy_loss_builder.get_clipping_frequency()
        self.policy_entropy_contribution = policy_loss_builder.get_entropy_contribution()
        self.total_loss = self.policy_loss + self.value_loss + self.extra_loss
def get_goal_distribution(self, goal, n=4):
    """
    Returns the reward distribution for a goal state,
    taking the best path to the goal into account.
    """
    # max_path_reward = self.get_max_path_reward(goal)
    max_path = self.get_max_path(goal, include=True)
    path_reward = [
        self._state[node] if hasattr(self._state[node], "sample")
        else Categorical([self._state[node]])
        for node in max_path[1:]
    ]
    reward = Categorical([0])
    for r in path_reward:
        reward += r
    return shrink_categorical(reward, n=n)
def prepare_loss(self, global_step):
    self.global_step = global_step
    print("Preparing loss {}".format(self.id))
    self.state_value_batch = self.critic_batch
    # [Policy distribution]
    old_policy_distributions = []
    new_policy_distributions = []
    policy_loss_builder = []
    for h, policy_head in enumerate(self.policy_heads):
        if is_continuous_control(policy_head['depth']):
            # Old policy
            old_policy_batch = tf.transpose(self.old_policy_batch[h], [1, 0, 2])
            old_policy_distributions.append(Normal(old_policy_batch[0], old_policy_batch[1]))
            # New policy
            new_policy_batch = tf.transpose(self.actor_batch[h], [1, 0, 2])
            new_policy_distributions.append(Normal(new_policy_batch[0], new_policy_batch[1]))
        else:  # discrete control
            old_policy_distributions.append(Categorical(self.old_policy_batch[h]))  # Old policy
            new_policy_distributions.append(Categorical(self.actor_batch[h]))  # New policy
        builder = self._get_policy_loss_builder(
            new_policy_distributions[h],
            old_policy_distributions[h],
            self.old_action_batch[h],
            self.old_action_mask_batch[h] if self.has_masked_actions else None
        )
        policy_loss_builder.append(builder)
    # [Actor loss]
    self.policy_loss = sum(self._get_policy_loss(b) for b in policy_loss_builder)
    # [Debug variables]
    self.policy_kl_divergence = sum(b.approximate_kullback_leibler_divergence() for b in policy_loss_builder)
    # Take the average because the clipping frequency must be in [0, 1]
    self.policy_clipping_frequency = sum(b.get_clipping_frequency() for b in policy_loss_builder) / len(policy_loss_builder)
    self.policy_entropy_regularization = sum(b.get_entropy_regularization() for b in policy_loss_builder)
    # [Critic loss]
    value_loss_builder = self._get_value_loss_builder()
    self.value_loss = self._get_value_loss(value_loss_builder)
    # [Entropy regularization]
    if flags.entropy_regularization:
        self.policy_loss += -self.policy_entropy_regularization
    # [Constraining replay]
    if self.constrain_replay:
        constrain_loss = sum(
            0.5 * builder.reduce_function(tf.squared_difference(new_distribution.mean(), tf.stop_gradient(old_action)))
            for builder, new_distribution, old_action in zip(policy_loss_builder, new_policy_distributions, self.old_action_batch)
        )
        self.policy_loss += tf.cond(
            pred=self.is_replayed_batch[0],
            true_fn=lambda: constrain_loss,
            false_fn=lambda: tf.constant(0., dtype=self.parameters_type)
        )
    # [Total loss]
    self.total_loss = self.policy_loss + self.value_loss
    if flags.intrinsic_reward:
        self.total_loss += self.intrinsic_reward_loss
def __init__(self, obs_shape, action_space, model_type=0, base_kwargs=None):
    super(RL_Policy, self).__init__()
    if base_kwargs is None:
        base_kwargs = {}

    if model_type == 0:
        self.network = Global_Policy(obs_shape, **base_kwargs)
    else:
        raise NotImplementedError

    if action_space.__class__.__name__ == "Discrete":
        num_outputs = action_space.n
        self.dist = Categorical(self.network.output_size, num_outputs)
    elif action_space.__class__.__name__ == "Box":
        num_outputs = action_space.shape[0]
        self.dist = DiagGaussian(self.network.output_size, num_outputs)
    else:
        raise NotImplementedError

    self.model_type = model_type
def high_vpi(self, state, bins=4):
    """Returns the high level VPI.

    Arguments:
        state: high state for computation
        bins: number of bins to discretize the continuous distribution
    """
    dists = []
    for option in range(1, self.no_options + 1):
        # Get the node distribution of the option's goal
        goal_clicked = self.goals[option - 1][0]
        node = self.low_state[goal_clicked]
        if hasattr(node, 'sample'):
            if hasattr(node, 'mu'):
                dist = node.to_discrete(n=bins, max_sigma=4)
                dist.vals = tuple([round(val, 3) for val in dist.vals])
                dist.probs = tuple([round(p, 3) for p in dist.probs])
            else:
                dist = node
        else:
            dist = Categorical(vals=[node], probs=[1])
        dists.append(dist)
    net_dist = self.shrink(dists)
    expected_return = cmax(net_dist).expectation()
    return expected_return - self.expected_high_term_reward(state)
def __init__(self, obs_shape, action_space, num_agents, base=None, base_kwargs=None):
    super(Policy, self).__init__()
    if base_kwargs is None:
        base_kwargs = {}

    if base is None:
        if len(obs_shape) == 3:
            base = CNNBase
            self.base = base(num_agents, obs_shape, **base_kwargs)
        elif len(obs_shape) == 1:
            base = MLPBase
            self.base = base(num_agents, obs_shape[0], **base_kwargs)
        else:
            raise NotImplementedError

    if action_space.__class__.__name__ == "Discrete":
        num_outputs = action_space.n
        self.dist = Categorical(self.base.output_size, num_outputs)
    elif action_space.__class__.__name__ == "Box":
        num_outputs = action_space.shape[0]
        self.dist = DiagGaussian(self.base.output_size, num_outputs)
    elif action_space.__class__.__name__ == "MultiBinary":
        num_outputs = action_space.shape[0]
        self.dist = Bernoulli(self.base.output_size, num_outputs)
    else:
        raise NotImplementedError
def vpi_action(self, action, state) -> 'float, >= -0.001':
    """
    Calculates the VPI of an action.
    The nodes of importance are those that are either parents or children of the selected node.
    """
    option_dist = []
    obs = (*self.subtree[action][0:], *self.path_to(action)[1:])
    obs = list(set(obs))
    for option in range(1, self.no_options + 1):
        op_dist = self.node_value_after_observe_option(option, state, obs)
        node_idx = self.goals[option - 1][0]
        if not hasattr(state[node_idx], 'sample'):
            goal_dist = Categorical(vals=[state[node_idx]], probs=[1])
        else:
            goal_dist = state[node_idx]
        dists = [op_dist, goal_dist]
        option_dist.append(cross_1(dists, sum))
    net_dist = self.shrink(option_dist)
    nvao = float(cmax(net_dist, default=ZERO).expectation())
    result = nvao - self.expected_term_reward_disc(state)
    if abs(result) < 0.001:
        result = 0.0
    return result
def low_vpi(self, option, state) -> 'float, >= -0.001':
    """
    Calculates the VPI for a given option.
    All nodes of the branch in the option set are important; this is
    essentially vpi_action with the goal node selected.

    Arguments:
        option: option for computation
        state: state for computation
    """
    action = self.goals[option - 1][0]
    obs = (*self.subtree[action][0:], *self.path_to(action)[1:])
    obs = list(set(obs))
    op_dist = self.node_value_after_observe_option(option, 0, state, obs)
    node_idx = self.goals[option - 1][0]
    if not hasattr(state[node_idx], 'sample'):
        goal_dist = Categorical(vals=[state[node_idx]], probs=[1])
    else:
        goal_dist = state[node_idx]
    dists = [op_dist, goal_dist]
    nvao = float((cross_1(dists, sum)).expectation())
    result = nvao - self.expected_low_term_reward_disc(option, state)
    if abs(result) < 0.001:
        result = 0.0
    return result
def vpi(self, state) -> 'float, >= -0.001':
    """
    Calculates the VPI.
    All nodes of the branch are important; this is essentially
    vpi_action with the goal node selected.
    """
    option_dist = []
    for option in range(1, self.no_options + 1):
        action = self.goals[option - 1][0]
        obs = (*self.subtree[action][0:], *self.path_to(action)[1:])
        obs = list(set(obs))
        op_dist = self.node_value_after_observe_option(option, state, obs)
        node_idx = self.goals[option - 1][0]
        if not hasattr(state[node_idx], 'sample'):
            goal_dist = Categorical(vals=[state[node_idx]], probs=[1])
        else:
            goal_dist = state[node_idx]
        dists = [op_dist, goal_dist]
        option_dist.append(cross_1(dists, sum))
    net_dist = self.shrink(option_dist)
    nvao = float(cmax(net_dist, default=ZERO).expectation())
    result = nvao - self.expected_term_reward_disc(state)
    if abs(result) < 0.001:
        result = 0.0
    return result
def get_goal_myopic_distribution(self, goal, n=4):
    """Finds the best click of the goal subtree through the variance heuristic,
    then calculates the goal distribution based on the distribution of the best
    click and the expected reward of the other nodes on the best path.

    Args:
        goal (int): Node index of the goal node
        n (int): Number of bins the resulting distribution is shrunk to
    """
    max_path = self.get_max_path(goal, include=True)
    max_variance_node = max(max_path, key=self.variance)
    reward = Categorical([0])
    for node in max_path:
        value = self._state[node]
        if node == max_variance_node:
            reward += value
        elif hasattr(value, "sample"):
            reward += Categorical([value.expectation()])
        else:
            reward += Categorical([value])
    return shrink_categorical(reward, n=n)
def sample_actions(self):
    action_batch = []
    hot_action_batch = []
    for h, actor_head in enumerate(self.actor_batch):
        if is_continuous_control(self.policy_heads[h]['depth']):
            new_policy_batch = tf.transpose(actor_head, [1, 0, 2])
            sample_batch = Normal(new_policy_batch[0], new_policy_batch[1]).sample()
            action = tf.clip_by_value(sample_batch, -1, 1)
            # Sample the action batch in the forward direction, use the old action in the backward direction
            action_batch.append(action)
            hot_action_batch.append(action)
        else:  # discrete control
            distribution = Categorical(actor_head)
            action = distribution.sample(one_hot=False)
            # Sample the action batch in the forward direction, use the old action in the backward direction
            action_batch.append(action)
            hot_action_batch.append(distribution.get_sample_one_hot(action))
    # Give a self-explanatory name to the output so it can easily be retrieved from the frozen graph
    # tf.identity(action_batch, name="action")
    return action_batch, hot_action_batch
def __init__(self, obs_space, action_space, base=None, base_kwargs=None):
    super(Policy, self).__init__()
    pixel_shape, non_pixel_obs, non_pixel_shape = parse_obs_space(obs_space)
    action_spaces, action_spaces_name = parse_action_space(action_space)
    # pixel_shape: tuple (h, w, c)
    # non_pixel_obs = ['name']
    # non_pixel_shape: int = len(non_pixel_obs)
    # action_spaces: [2, 2, 1, 1, ...]
    # action_spaces_name = ['attack', ...]
    if base_kwargs is None:
        base_kwargs = {}
    base = Branch_CNNBase

    if non_pixel_shape == 0:
        add_non_pixel = False
    else:
        add_non_pixel = True

    # arguments
    non_pixel_layer = base_kwargs['non_pixel_layer']
    convs = base_kwargs['convs']
    in_channels = base_kwargs['frame_history_len'] * pixel_shape[2]
    in_feature = base_kwargs['in_feature']
    hidden_actions = base_kwargs['hidden_actions']
    hidden_value = base_kwargs['hidden_value']
    aggregator = base_kwargs['aggregator']

    self.num_branches = len(action_spaces)

    self.base = base(add_non_pixel, non_pixel_shape, non_pixel_layer, convs,
                     in_channels, in_feature, hidden_actions, hidden_value,
                     action_spaces, aggregator)

    self.dist_idxes = []
    dist_l = 1
    for i in range(self.num_branches):
        if action_spaces[i] == 1:
            # continuous action space
            num_outputs = action_spaces[i]
            num_inputs = hidden_actions[-1]
            setattr(self, "dist" + str(dist_l), DiagGaussian(num_inputs, num_outputs))
            self.dist_idxes.append(dist_l)
            dist_l += 1
        else:
            # discrete action space
            num_inputs = hidden_actions[-1]
            num_outputs = action_spaces[i]
            setattr(self, "dist" + str(dist_l), Categorical(num_inputs, num_outputs))
            self.dist_idxes.append(dist_l)
            dist_l += 1
def to_obs_tree(self, state, node, obs=()):
    """Updated obs tree computation for the tree contraction method."""
    state = [
        state[n] if n in obs else expectation(state[n])
        for n in range(len(state))
    ]
    state = [
        node if hasattr(node, "sample") else Categorical([node])
        for node in state
    ]
    return tuple(state)
def shrink_categorical(cat, n=4):
    '''
    Reduces a categorical distribution to a distribution of size n using k-means clustering.

    :param cat: categorical distribution
    :param n: number of bins/clusters to reduce to
    :return: the reduced categorical distribution, or the input if it is already small enough
    '''
    if (not hasattr(cat, "sample")) or (len(cat.vals) < n):
        return cat
    clusters, centroids = kmeans1d.cluster(cat.vals, n)
    probs = [0 for _ in range(n)]
    for cluster, prob in zip(clusters, cat.probs):
        probs[cluster] += prob
    return Categorical(centroids, probs=probs)
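# Usage sketch for shrink_categorical (illustrative only): it assumes the project's
# Categorical class and the kmeans1d package are importable as in the function above,
# and the concrete support values and probabilities below are made up.
if __name__ == "__main__":
    wide = Categorical([-10, -5, 0, 5, 10, 20],
                       probs=[0.1, 0.2, 0.3, 0.2, 0.1, 0.1])
    small = shrink_categorical(wide, n=3)
    # The result keeps the total probability mass but has at most 3 support points.
    print(small.vals, small.probs)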
def sample_actions(self):
    with tf.device(self.device):
        if self.is_continuous_control():
            new_policy_batch = tf.transpose(self.policy_batch, [1, 0, 2])
            sample_batch = Normal(new_policy_batch[0], new_policy_batch[1]).sample()
            # Sample the action batch in the forward direction, use the old action in the backward direction
            action_batch = tf.clip_by_value(sample_batch, -1, 1)
        else:  # discrete control
            # Sample the action batch in the forward direction, use the old action in the backward direction
            action_batch = Categorical(self.policy_batch).sample()
        # Give a self-explanatory name to the output so it can easily be retrieved from the frozen graph
        tf.identity(action_batch, name="action")
        return action_batch
def get_goal_reward(self, goal):
    """
    Returns the reward distribution for a goal state,
    taking the best path to the goal into account.
    """
    max_path_reward = self.get_max_path_reward(goal)
    # Update the distribution by the value along the path
    goal_state = self._state[goal]
    if hasattr(goal_state, "sample"):
        if hasattr(goal_state, "mu") and hasattr(goal_state, "sigma"):
            return Normal(goal_state.mu + max_path_reward, goal_state.sigma)
        elif hasattr(goal_state, "vals") and hasattr(goal_state, "probs"):
            vals = tuple([value + max_path_reward for value in goal_state.vals])
            return Categorical(vals, goal_state.probs)
        else:
            print(f"Type {type(goal_state)} not supported.")
            raise NotImplementedError()
    else:
        return goal_state + max_path_reward
def high_myopic_voc(self, state, action, bins=4):
    """Returns the high level myopic VOC.

    Arguments:
        state: high state for computation
        action: high level action for computation
        bins: number of bins to discretize the continuous distribution
    """
    option_to_explore = action - (len(self.init) + self.no_options - 1)
    goal_clicked = self.goals[option_to_explore - 1][0]
    node = self.low_state[goal_clicked]
    if hasattr(node, 'sample'):
        if hasattr(node, 'mu'):
            dist = node.to_discrete(n=bins, max_sigma=4)
            dist.vals = tuple([round(val, 3) for val in dist.vals])
            dist.probs = tuple([round(p, 3) for p in dist.probs])
        else:
            dist = node
    else:
        dist = Categorical(vals=[node], probs=[1])
    r, p = zip(*dist)
    expected_return = 0
    high_state = []
    for op in range(1, self.no_options + 1):
        val, _ = self.low_term_reward(op, self.low_state)
        high_state.append(val)
    # Find the best option to explore for each possible goal value
    for k in range(len(p)):
        state2 = list(self.low_state)
        state2[goal_clicked] = r[k]
        high_state[option_to_explore - 1], _ = self.high_belief_update(state2, option_to_explore)
        expected_return += p[k] * max(high_state)
    return expected_return - self.expected_high_term_reward(state)
def _reward_prediction_loss(self):
    self.reward_prediction_labels = self._reward_prediction_target_placeholder("reward_prediction_target", 1)
    return tf.reduce_sum(
        Categorical(self.reward_prediction_logits).cross_entropy(self.reward_prediction_labels)
    )
def exact_node_value_after_observe(state, operations):
    """
    Computes the categorical node value of the tree by applying the passed operations.

    Args:
        state ([Categorical]): Node value distribution based on the observation.
        operations ([(str, int, int)]): Operations to be applied.

    Returns:
        result (Categorical): Categorical node value of the final tree node.
    """

    def reduce_add(state, i, j):
        new_state = [state[k] for k in range(len(state)) if k != j]
        new_state[i] = new_state[i] + state[j]
        return new_state

    def reduce_mul(state, i, j):
        new_state = [state[k] for k in range(len(state)) if k != j]
        new_state[i] = cross([state[i], state[j]], max)
        return new_state

    def split_obs(obs, x, num_parents):
        node_var: Categorical = obs[x]
        obs_values = []
        for v in zip(node_var.vals):
            obs_copy = [o for o in obs]
            obs_copy[x] = Categorical(v)
            # Copy obs to new nodes
            for i in range(num_parents - 1):
                obs_copy.append(Categorical(v))
            obs_values.append(obs_copy)
        probs = node_var.probs
        return probs, obs_values

    for i in range(len(operations)):
        op, a, b = operations[i]
        if op == "add":
            state = reduce_add(state, a, b)
        elif op == "mul":
            state = reduce_mul(state, a, b)
        elif op == "split":
            # a = split node, b = num parents
            probs, obs_vals = split_obs(state, a, b)
            states = []
            # Solve partial trees recursively
            for val in obs_vals:
                states.append(exact_node_value_after_observe(val, operations[i + 1:]))
            # Combine partial responses
            total_p = []
            total_v = []
            for var_p, cat in zip(probs, states):
                assert len(cat) == 1
                p, v = cat[0].probs, cat[0].vals
                p = [x * var_p for x in p]
                total_p += p
                total_v += v
            state = [Categorical(total_v, probs=total_p)]
            break
        else:
            assert False
    assert len(state) == 1
    return state[0]
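# Illustrative call (an assumption, not part of the original code): the operation tuples
# follow the format documented above. With a three-node state, ("add", 0, 1) folds node 1
# into node 0 by summing distributions, and ("mul", 0, 1) then combines the two remaining
# nodes via an element-wise max through cross(). The distributions below are made up.
if __name__ == "__main__":
    example_state = [Categorical([0]), Categorical([-5, 5]), Categorical([0, 25])]
    example_ops = [("add", 0, 1), ("mul", 0, 1)]
    print(exact_node_value_after_observe(example_state, example_ops).expectation())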
# Environment parameters
SWITCH_COST = 0   # Cost of switching goals
HIGH_COST = 10    # Cost of computing a goal
LOW_COST = 10     # Cost of computing a low level node
SEED = 0          # Fixes generated environments for training
COST_FUNC = "Basic"

TREE = [[1, 16, 31, 46], [2, 3, 4, 5], [6], [6], [7], [7], [8], [8],
        [9, 10, 11, 12], [13], [13], [14], [14], [15], [15], [],
        [17, 18, 19, 20], [21], [21], [22], [22], [23], [23],
        [24, 25, 26, 27], [28], [28], [29], [29], [30], [30], [],
        [32, 33, 34, 35], [36], [36], [37], [37], [38], [38],
        [39, 40, 41, 42], [43], [43], [44], [44], [45], [45], [],
        [47, 48, 49, 50], [51], [51], [52], [52], [53], [53],
        [54, 55, 56, 57], [58], [58], [59], [59], [60], [60], []]

d0 = Categorical([0])
dr = Categorical([-1500, 0], probs=[0.1, 0.9])
di = Categorical([-10, -5, 5, 10])
dg = Categorical([0, 25, 75, 100])

node_types = [
    di,
    d0, di, di, di, di, di, di, dr, di, di, di, di, di, di, dg,
    d0, di, di, di, di, di, di, dr, di, di, di, di, di, di, dg,
    d0, di, di, di, di, di, di, dr, di, di, di, di, di, di, dg,
    d0, di, di, di, di, di, di, dr, di, di, di, di, di, di, dg,
]

INIT = tuple([r for r in node_types])

W = np.array([[0.45137647, 0.2288873, 9.26596405, 0.17091717, 2.24210099]])

high_risk_clicks = [8, 23, 38, 53]
goal_clicks = [15, 30, 45, 60]
term_click = 61
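# Quick sanity check (illustrative, not part of the original configuration): the tree
# layout and the node reward types are expected to line up, with the risky (dr) nodes at
# high_risk_clicks, the goal (dg) nodes at goal_clicks, and the terminating action index
# directly after the last node.
assert len(TREE) == len(node_types) == 61
assert all(node_types[i] is dr for i in high_risk_clicks)
assert all(node_types[i] is dg for i in goal_clicks)
assert term_click == len(TREE)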