class PolicyNetwork:

    def __init__(self,
                 state_shape,
                 action_space,
                 tasks,
                 shared_layers,
                 task_specific_layers,
                 state_transformer,
                 alpha=0.0001,
                 entropy_regularization=0.1,
                 q_network: "QNetwork" = None,
                 fixed_steps=1000):
        """
        A tasked policy network.

        :param state_shape: The shape of the state variable WITHOUT the batch dimension
        :param action_space: The Action space
        :param tasks: A list of Tasks
        :param shared_layers: The shared/common layers of the network as a function (using the Keras functional API)
        :param task_specific_layers: The task-specific layers of the network as a function (using the Keras functional API)
        :param state_transformer: A function that takes a state object and transforms it to a network input
        :param alpha: The learning rate
        :param entropy_regularization: The entropy regularization factor for the loss function
        :param q_network: The related Q-network instance. This can be left None if it is set later
            (for instance by the SACU actor)
        :param fixed_steps: The number of training steps that the fixed network is kept fixed.
            After these steps it is updated and the step counter is reset.
        """
        self.fixed_steps = fixed_steps
        self.steps = 0
        self.state_shape = state_shape
        self.action_space = action_space
        self.q_network = q_network
        self.tasks = tasks
        self.model = TaskedDualNeuralNet(
            state_shape,
            shared_layers,
            task_specific_layers,
            lambda model: model.compile(
                optimizer=ks.optimizers.Adam(alpha, clipnorm=1.0),
                loss=make_policy_loss(entropy_regularization)),
            tasks)
        self.state_transformer = state_transformer

    def distribution(self, state: State, task: Task) -> dict:
        s = self.state_transformer(state)
        s = np.expand_dims(s, axis=0)
        return self.model.predict(s, task)[0]

    def distribution_array(self, states, task: Task, live=True):
        # Expects transformed states
        return self.model.predict(states, task, live=live)

    def train(self, trajectories):
        # Collects the "initial" state of every trajectory and its Q-values, then fits the policy network.
        # TODO: This should iterate over all states in a trajectory, but that is not implemented
        #       in the Q-network yet.
        for task in self.tasks:
            xs = []
            for trajectory in trajectories:
                states = np.array(
                    [self.state_transformer(t[0]) for t in trajectory[:1]])
                xs.append(states)
            xs = np.concatenate(xs, axis=0)
            q_values = self.q_network.Qp_array(xs, task)
            self.model.fit(xs, q_values, task)

        # Related to the fixed-parameter update
        self.steps += 1
        if self.steps > self.fixed_steps:
            self.steps = 0
            self.sync()

    def sample(self, state: State, task: Task, dist=None) -> Action:
        if dist is None:
            dist = self.distribution(state, task)
        choice = np.random.random()
        p_cumulative = 0
        index = 0
        for i, p in enumerate(dist):
            p_cumulative += p
            if p_cumulative > choice:
                index = i
                break
        return self.action_space[index]

    def sample_greedy(self, state: State, task: Task) -> Action:
        raise NotImplementedError

    def sample_epsilon_greedy(self, state: State, task: Task) -> Action:
        raise NotImplementedError

    def sample_distribution(self, state: State, task: Task) -> tuple:
        dist = self.distribution(state, task)
        a = self.sample(state, task, dist)
        return a, dist

    def sync(self):
        self.model.sync()
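
# ---------------------------------------------------------------------------
# Illustration (added for clarity, not part of the original module):
# PolicyNetwork.sample draws an action by inverse-CDF sampling of the
# predicted action distribution. The helper below is a minimal,
# self-contained sketch of the same idea; the probability vector is made up
# for the example.
def _sample_index_example():
    dist = np.array([0.1, 0.6, 0.3])  # hypothetical action probabilities, summing to 1
    choice = np.random.random()       # uniform draw in [0, 1)
    # First index whose cumulative probability reaches the draw:
    index = int(np.searchsorted(np.cumsum(dist), choice))
    # Equivalent in distribution to: np.random.choice(len(dist), p=dist)
    return index
# ---------------------------------------------------------------------------
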
class QNetwork(generic.QNetwork):

    def __init__(self,
                 state_shape,
                 action_space,
                 tasks,
                 shared_layers,
                 task_specific_layers,
                 state_transformer,
                 alpha=0.0001,
                 gamma=0.9,
                 p_network: PolicyNetwork = None,
                 fixed_steps=1000,
                 reward_scale=1.0,
                 lambd=0.9,
                 lambd_min=0.0):
        """
        Initializes a tasked Q-network. This Q-network uses Retrace for Q-value calculation.

        :param state_shape: The shape of the state variable WITHOUT the batch dimension
        :param action_space: The Action space
        :param tasks: A list of Tasks
        :param shared_layers: The shared/common layers of the network as a function (using the Keras functional API)
        :param task_specific_layers: The task-specific layers of the network as a function (using the Keras functional API)
        :param state_transformer: A function that takes a state object and transforms it to a network input
        :param alpha: The learning rate
        :param gamma: The discount factor
        :param p_network: The policy network. This can be None at init as long as it is set later
            (this is done by the SACU agent)
        :param fixed_steps: The number of training steps that the fixed network is kept fixed.
            After these steps it is updated and the step counter is reset.
        :param reward_scale: A float that is used to scale rewards
        :param lambd: The lambda value used by the Retrace algorithm, similar to its use in SARSA(lambda)
        :param lambd_min: A cap on the lambda value similar to the one used in our SARSA(lambda)
            implementation. It cuts off trajectories once lambda^k drops below this value. Given that
            SAC-X often operates on trajectories with a mean length of ~500 (where lambda^k can easily
            be 1e-20), using this value yields large speed improvements at little cost.
        """
        self.lambd = lambd
        self.lambda_min = lambd_min
        self.max_trajectory_length = None
        if self.lambda_min != 0:
            self.max_trajectory_length = int(math.log(self.lambda_min, lambd))
            print("Capping trajectories at a max length of ",
                  self.max_trajectory_length)
        self.reward_scale = reward_scale
        self.fixed_steps = fixed_steps
        self.steps = 0
        self.state_transformer = state_transformer
        self.task_specific_layers = task_specific_layers
        self.shared_layers = shared_layers
        self.p_network = p_network
        self.tasks = tasks
        self.action_space = action_space
        self.inverse_action_lookup = dict()
        for i, a in enumerate(action_space):
            self.inverse_action_lookup[a] = i
        self.state_shape = state_shape
        self.gamma = gamma

        self.model = TaskedDualNeuralNet(
            state_shape,
            shared_layers,
            task_specific_layers,
            lambda model: model.compile(optimizer=ks.optimizers.Adam(alpha),
                                        loss=ks.losses.mean_squared_error),
            tasks)

    def Qs(self, state: State, task: Task):
        return self.model.predict(
            np.expand_dims(self.state_transformer(state), axis=0), task)[0]

    def Q(self, state: State, action: Action, task: Task):
        pass  # TODO

    def Q_array(self, states, task: Task):
        return self.model.predict(states, task, live=True)

    def Qp_array(self, states, task: Task):
        return self.model.predict(states, task, live=False)

    def train(self, trajectories):
        for task in self.tasks:
            initial_states = np.stack(
                [self.state_transformer(t[0][0]) for t in trajectories],
                axis=0)
            ys = self.model.predict(initial_states, task)
            for i, trajectory in enumerate(trajectories):
                _, a, _, _ = trajectory[0]
                ys[i, self.inverse_action_lookup[a]] += self.get_q_delta(
                    trajectory, task)
            self.model.fit(initial_states, ys, task)

        self.steps += 1
        if self.steps > self.fixed_steps:
            self.steps = 0
            self.sync()

    def get_q_delta(self, trajectory, task):
        """
        Calculates the Q-delta according to the Retrace algorithm (the implementation is based on the
        Retrace paper, not the SAC-X paper).

        :param trajectory: The trajectory to calculate the delta on
        :param task: The task to calculate the delta for
        :return: The Q-delta; the target Q-value is the fixed Q-value plus this delta
        """
        # Initialize the q_delta variable and get all Q(s, a) values for the trajectory
        q_delta = 0
        if self.max_trajectory_length is None:
            states = np.array(
                [self.state_transformer(e[0]) for e in trajectory])
        else:
            states = np.array([
                self.state_transformer(e[0])
                for e in trajectory[:self.max_trajectory_length + 1]
            ])
        q_values = self.model.predict(states, task, live=True)
        q_fixed_values = self.model.predict(states, task, live=False)

        # Calculate all values of pi(* | state, task_id) over the trajectory for our current task
        # (using the fixed network)
        policies = self.p_network.distribution_array(states, task, live=False)

        # Pick pi(a_t | state, task_id) from pi(* | state, task_id) for every action taken in the trajectory
        all_action_probabilities = np.array([
            a[self.inverse_action_lookup[i]]
            for i, a in zip([e[1] for e in trajectory], policies)
        ])

        # Pick all b(a_t | state, B) from the trajectory
        if self.max_trajectory_length is None:
            all_b_action_probabilities = np.array([
                experience[3][self.inverse_action_lookup[experience[1]]]
                for experience in trajectory
            ])
        else:
            all_b_action_probabilities = np.array([
                experience[3][self.inverse_action_lookup[experience[1]]]
                for experience in trajectory[:self.max_trajectory_length + 1]
            ])

        # Calculate the value of c_k for the whole trajectory
        c = all_action_probabilities / all_b_action_probabilities
        # Make sure that c is capped at 1, so c_k = min(1, c_k)
        c[c > 1] = 1
        c[0] = 1  # According to the Retrace paper

        # Keep the running product of c_k values in a variable
        c_product = 1

        iterations = len(trajectory)
        if self.max_trajectory_length is not None:
            iterations = min(iterations, self.max_trajectory_length)

        # Iterate over the trajectory to calculate the expected returns
        for j, (s, a, r, _) in enumerate(trajectory[:iterations]):
            # Multiply c_product with the next c-value; this makes any move done after a "dumb" move
            # (according to our policy) less significant
            c_product *= c[j] * self.lambd
            a_index = self.inverse_action_lookup[a]

            # Check if we're at the end of the trajectory
            if j != len(trajectory) - 1:
                # If we're not: calculate the next difference using the Q-values of step j+1 as well.
                # The expected value of Q(s_{j+1}, *) under the policy:
                expected_q_tp1 = np.sum(policies[j + 1] * q_fixed_values[j + 1])
                # The delta for this lookahead
                delta = (self.reward_scale * r[task]
                         + self.gamma * expected_q_tp1 - q_values[j, a_index])
            else:
                # If this is the last entry, we assume Q(s_{j+1}, *) to be 0 as the state is terminal
                delta = self.reward_scale * r[task] - q_values[j, a_index]

            # Add this term to the sum of Q-deltas, discounted by gamma^j and weighted by the c-product
            q_delta += c_product * self.gamma ** j * delta

        return q_delta

    def sync(self):
        self.model.sync()
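
# ---------------------------------------------------------------------------
# Note (added for clarity, not part of the original module): get_q_delta
# accumulates a Retrace-style correction over the (possibly truncated)
# trajectory,
#
#     delta_Q(s_0, a_0) = sum_{j>=0} gamma^j * (prod_{k=0}^{j} lambda * c_k) * delta_j,
#
# where c_0 = 1, c_k = min(1, pi(a_k | s_k, task) / b(a_k | s_k)) for k >= 1, and
#
#     delta_j = reward_scale * r_j[task]
#               + gamma * sum_a pi(a | s_{j+1}, task) * Q_fixed(s_{j+1}, a)
#               - Q(s_j, a_j),
#
# with the bootstrap term dropped on the final (terminal) step. The loop folds
# lambda into the product from j = 0 onwards, so each term appears to carry one
# extra factor of lambda compared to the Retrace paper's coefficient (which
# uses an empty product at j = 0).
#
# The lambd_min cap truncates trajectories once lambda^k < lambd_min, i.e.
# after int(log(lambd_min) / log(lambd)) steps. For example, lambd = 0.9 and
# lambd_min = 1e-3 give a cap of int(65.56) = 65 steps.
# ---------------------------------------------------------------------------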