def policy(self, inputs, states): """ We first compute the successor features and then use the goal vector to compute the Q values. """ srs, states = self.sr(inputs, states) goal = self.goal(inputs).unsqueeze(1).expand(-1, self.num_actions, -1) q_value = torch.sum(torch.mul(goal, srs), dim=-1).view(-1, self.num_actions) return dict(action=comf.q_categorical(q_value)), states
def policy(self, inputs, states):
    """Read the Q values from the value network and use them for
    action selection."""
    values, states = self.value(inputs, states)
    q_value = values["q_value"]
    return dict(action=comf.q_categorical(q_value)), states