def _build_graph(self, tf_graph, scope, model_dir):
  """Construct a TensorGraph containing the policy and loss calculations."""
  state_shape = self._env.state_shape
  state_dtype = self._env.state_dtype
  if not self._state_is_list:
    state_shape = [state_shape]
    state_dtype = [state_dtype]
  # Wrap each component of the environment state in a Feature input layer.
  features = []
  for s, d in zip(state_shape, state_dtype):
    features.append(Feature(shape=[None] + list(s), dtype=tf.as_dtype(d)))
  policy_layers = self._policy.create_layers(features)
  action_prob = policy_layers['action_prob']
  value = policy_layers['value']
  # Training targets computed from rollouts.
  rewards = Weights(shape=(None,))
  advantages = Weights(shape=(None,))
  actions = Label(shape=(None, self._env.n_actions))
  loss = A3CLoss(
      self.value_weight,
      self.entropy_weight,
      in_layers=[rewards, actions, action_prob, value, advantages])
  graph = TensorGraph(
      batch_size=self.max_rollout_length,
      use_queue=False,
      graph=tf_graph,
      model_dir=model_dir)
  for f in features:
    graph._add_layer(f)
  graph.add_output(action_prob)
  graph.add_output(value)
  graph.set_loss(loss)
  graph.set_optimizer(self._optimizer)
  # Build the graph inside the requested variable scope.
  with graph._get_tf("Graph").as_default():
    with tf.variable_scope(scope):
      graph.build()
  return graph, features, rewards, actions, action_prob, value, advantages
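# The function above wires the A3C loss into the graph. As a point of
# reference, the sketch below shows in plain NumPy the standard A3C objective
# that a layer like A3CLoss is assumed to compute: a policy-gradient term
# scaled by advantages, a value-regression term, and an entropy bonus. The
# exact reductions and weighting inside A3CLoss may differ; this is an
# illustration, not the library implementation.
import numpy as np


def a3c_loss_sketch(rewards, actions, action_prob, value, advantages,
                    value_weight=1.0, entropy_weight=0.01):
  eps = np.finfo(np.float32).eps
  log_prob = np.log(action_prob + eps)
  # Policy term: log-probability of the chosen (one-hot) action, scaled by
  # the advantage estimate for that step.
  policy_loss = -np.mean(advantages * np.sum(actions * log_prob, axis=1))
  # Value term: regress the predicted value toward the observed return.
  value_loss = np.mean((rewards - value)**2)
  # Entropy term: penalize overly peaked action distributions.
  entropy = -np.mean(np.sum(action_prob * log_prob, axis=1))
  return policy_loss + value_weight * value_loss - entropy_weight * entropy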
def _build_graph(self, tf_graph, scope, model_dir):
  """Construct a TensorGraph containing the policy and loss calculations."""
  state_shape = self._env.state_shape
  state_dtype = self._env.state_dtype
  if not self._state_is_list:
    state_shape = [state_shape]
    state_dtype = [state_dtype]
  # Wrap each component of the environment state in a Feature input layer.
  features = []
  for s, d in zip(state_shape, state_dtype):
    features.append(Feature(shape=[None] + list(s), dtype=tf.as_dtype(d)))
  policy_layers = self._policy.create_layers(features)
  action_prob = policy_layers['action_prob']
  value = policy_layers['value']
  # Training targets produced by the tree search.
  search_prob = Label(shape=(None, self._env.n_actions))
  search_value = Label(shape=(None,))
  loss = MCTSLoss(
      self.value_weight,
      in_layers=[action_prob, value, search_prob, search_value])
  graph = TensorGraph(
      batch_size=self.max_search_depth,
      use_queue=False,
      graph=tf_graph,
      model_dir=model_dir)
  for f in features:
    graph._add_layer(f)
  graph.add_output(action_prob)
  graph.add_output(value)
  graph.set_loss(loss)
  graph.set_optimizer(self._optimizer)
  # Build the graph inside the requested variable scope.
  with graph._get_tf("Graph").as_default():
    with tf.variable_scope(scope):
      graph.build()
  if len(graph.rnn_initial_states) > 0:
    raise ValueError('MCTS does not support policies with recurrent layers')
  return graph, features, action_prob, value, search_prob, search_value
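# MCTSLoss above is given the network's action probabilities and value along
# with the probabilities and value produced by the tree search. A rough NumPy
# sketch of the AlphaZero-style objective such a loss is assumed to implement
# follows; the exact form used by MCTSLoss may differ.
import numpy as np


def mcts_loss_sketch(action_prob, value, search_prob, search_value,
                     value_weight=1.0):
  eps = np.finfo(np.float32).eps
  # Regress the predicted value toward the value found by the search.
  value_loss = np.mean((search_value - value)**2)
  # Cross-entropy pushing the policy toward the search probabilities.
  policy_loss = -np.mean(
      np.sum(search_prob * np.log(action_prob + eps), axis=1))
  return policy_loss + value_weight * value_loss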
def _build_graph(self, tf_graph, scope, model_dir):
  """Construct a TensorGraph containing the policy and loss calculations."""
  state_shape = self._env.state_shape
  state_dtype = self._env.state_dtype
  if not self._state_is_list:
    state_shape = [state_shape]
    state_dtype = [state_dtype]
  # Wrap each component of the environment state in a Feature input layer.
  features = []
  for s, d in zip(state_shape, state_dtype):
    features.append(Feature(shape=[None] + list(s), dtype=tf.as_dtype(d)))
  policy_layers = self._policy.create_layers(features)
  value = policy_layers['value']
  # Training targets computed from rollouts.
  rewards = Weights(shape=(None,))
  advantages = Weights(shape=(None,))
  graph = TensorGraph(
      batch_size=self.max_rollout_length,
      use_queue=False,
      graph=tf_graph,
      model_dir=model_dir)
  for f in features:
    graph._add_layer(f)
  if 'action_prob' in policy_layers:
    # Discrete action space: the policy outputs a probability for each action.
    self.continuous = False
    action_prob = policy_layers['action_prob']
    actions = Label(shape=(None, self._env.n_actions))
    loss = A3CLossDiscrete(
        self.value_weight,
        self.entropy_weight,
        in_layers=[rewards, actions, action_prob, value, advantages])
    graph.add_output(action_prob)
  else:
    # Continuous action space: the policy outputs the mean and standard
    # deviation of the action distribution.
    self.continuous = True
    action_mean = policy_layers['action_mean']
    action_std = policy_layers['action_std']
    actions = Label(shape=[None] + list(self._env.action_shape))
    loss = A3CLossContinuous(
        self.value_weight,
        self.entropy_weight,
        in_layers=[
            rewards, actions, action_mean, action_std, value, advantages
        ])
    graph.add_output(action_mean)
    graph.add_output(action_std)
  graph.add_output(value)
  graph.set_loss(loss)
  graph.set_optimizer(self._optimizer)
  # Build the graph inside the requested variable scope.
  with graph._get_tf("Graph").as_default():
    with tf.variable_scope(scope):
      graph.build()
  if self.continuous:
    return graph, features, rewards, actions, action_mean, action_std, value, advantages
  else:
    return graph, features, rewards, actions, action_prob, value, advantages
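# For the continuous branch above, A3CLossContinuous receives the mean and
# standard deviation produced by the policy. A hypothetical sketch of the
# policy-gradient term under a diagonal Gaussian action distribution is shown
# below; whether A3CLossContinuous uses exactly this parameterization is an
# assumption.
import numpy as np


def gaussian_policy_loss_sketch(actions, action_mean, action_std, advantages):
  eps = np.finfo(np.float32).eps
  # Per-sample log-density of the taken action under a diagonal Gaussian.
  log_prob = -0.5 * np.sum(
      ((actions - action_mean) / (action_std + eps))**2 +
      2.0 * np.log(action_std + eps) + np.log(2.0 * np.pi),
      axis=1)
  # Policy-gradient term: maximize advantage-weighted log-probability.
  return -np.mean(advantages * log_prob)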