Beispiel #1
1
 def _build_graph(self, tf_graph, scope, model_dir):
   """Construct a TensorGraph containing the policy and loss calculations."""
   state_shape = self._env.state_shape
   state_dtype = self._env.state_dtype
   if not self._state_is_list:
     state_shape = [state_shape]
     state_dtype = [state_dtype]
   features = []
   for s, d in zip(state_shape, state_dtype):
     features.append(Feature(shape=[None] + list(s), dtype=tf.as_dtype(d)))
   policy_layers = self._policy.create_layers(features)
   action_prob = policy_layers['action_prob']
   value = policy_layers['value']
   search_prob = Label(shape=(None, self._env.n_actions))
   search_value = Label(shape=(None,))
   loss = MCTSLoss(
       self.value_weight,
       in_layers=[action_prob, value, search_prob, search_value])
   graph = TensorGraph(
       batch_size=self.max_search_depth,
       use_queue=False,
       graph=tf_graph,
       model_dir=model_dir)
   for f in features:
     graph._add_layer(f)
   graph.add_output(action_prob)
   graph.add_output(value)
   graph.set_loss(loss)
   graph.set_optimizer(self._optimizer)
   with graph._get_tf("Graph").as_default():
     with tf.variable_scope(scope):
       graph.build()
   if len(graph.rnn_initial_states) > 0:
     raise ValueError('MCTS does not support policies with recurrent layers')
   return graph, features, action_prob, value, search_prob, search_value
Beispiel #2
0
 def _build_graph(self, tf_graph, scope, model_dir):
   """Construct a TensorGraph containing the policy and loss calculations."""
   state_shape = self._env.state_shape
   state_dtype = self._env.state_dtype
   if not self._state_is_list:
     state_shape = [state_shape]
     state_dtype = [state_dtype]
   features = []
   for s, d in zip(state_shape, state_dtype):
     features.append(Feature(shape=[None] + list(s), dtype=tf.as_dtype(d)))
   policy_layers = self._policy.create_layers(features)
   action_prob = policy_layers['action_prob']
   value = policy_layers['value']
   rewards = Weights(shape=(None,))
   advantages = Weights(shape=(None,))
   actions = Label(shape=(None, self._env.n_actions))
   loss = A3CLoss(
       self.value_weight,
       self.entropy_weight,
       in_layers=[rewards, actions, action_prob, value, advantages])
   graph = TensorGraph(
       batch_size=self.max_rollout_length,
       use_queue=False,
       graph=tf_graph,
       model_dir=model_dir)
   for f in features:
     graph._add_layer(f)
   graph.add_output(action_prob)
   graph.add_output(value)
   graph.set_loss(loss)
   graph.set_optimizer(self._optimizer)
   with graph._get_tf("Graph").as_default():
     with tf.variable_scope(scope):
       graph.build()
   return graph, features, rewards, actions, action_prob, value, advantages
Beispiel #3
0
 def _build_graph(self, tf_graph, scope, model_dir):
   """Construct a TensorGraph containing the policy and loss calculations."""
   state_shape = self._env.state_shape
   state_dtype = self._env.state_dtype
   if not self._state_is_list:
     state_shape = [state_shape]
     state_dtype = [state_dtype]
   features = []
   for s, d in zip(state_shape, state_dtype):
     features.append(Feature(shape=[None] + list(s), dtype=tf.as_dtype(d)))
   policy_layers = self._policy.create_layers(features)
   value = policy_layers['value']
   rewards = Weights(shape=(None,))
   advantages = Weights(shape=(None,))
   graph = TensorGraph(
       batch_size=self.max_rollout_length,
       use_queue=False,
       graph=tf_graph,
       model_dir=model_dir)
   for f in features:
     graph._add_layer(f)
   if 'action_prob' in policy_layers:
     self.continuous = False
     action_prob = policy_layers['action_prob']
     actions = Label(shape=(None, self._env.n_actions))
     loss = A3CLossDiscrete(
         self.value_weight,
         self.entropy_weight,
         in_layers=[rewards, actions, action_prob, value, advantages])
     graph.add_output(action_prob)
   else:
     self.continuous = True
     action_mean = policy_layers['action_mean']
     action_std = policy_layers['action_std']
     actions = Label(shape=[None] + list(self._env.action_shape))
     loss = A3CLossContinuous(
         self.value_weight,
         self.entropy_weight,
         in_layers=[
             rewards, actions, action_mean, action_std, value, advantages
         ])
     graph.add_output(action_mean)
     graph.add_output(action_std)
   graph.add_output(value)
   graph.set_loss(loss)
   graph.set_optimizer(self._optimizer)
   with graph._get_tf("Graph").as_default():
     with tf.variable_scope(scope):
       graph.build()
   if self.continuous:
     return graph, features, rewards, actions, action_mean, action_std, value, advantages
   else:
     return graph, features, rewards, actions, action_prob, value, advantages
Beispiel #4
-1
 def _build_graph(self, tf_graph, scope, model_dir):
     """Construct a TensorGraph containing the policy and loss calculations."""
     state_shape = self._env.state_shape
     state_dtype = self._env.state_dtype
     if not self._state_is_list:
         state_shape = [state_shape]
         state_dtype = [state_dtype]
     features = []
     for s, d in zip(state_shape, state_dtype):
         features.append(
             Feature(shape=[None] + list(s), dtype=tf.as_dtype(d)))
     policy_layers = self._policy.create_layers(features)
     action_prob = policy_layers['action_prob']
     value = policy_layers['value']
     search_prob = Label(shape=(None, self._env.n_actions))
     search_value = Label(shape=(None, ))
     loss = MCTSLoss(
         self.value_weight,
         in_layers=[action_prob, value, search_prob, search_value])
     graph = TensorGraph(batch_size=self.max_search_depth,
                         use_queue=False,
                         graph=tf_graph,
                         model_dir=model_dir)
     for f in features:
         graph._add_layer(f)
     graph.add_output(action_prob)
     graph.add_output(value)
     graph.set_loss(loss)
     graph.set_optimizer(self._optimizer)
     with graph._get_tf("Graph").as_default():
         with tf.variable_scope(scope):
             graph.build()
     if len(graph.rnn_initial_states) > 0:
         raise ValueError(
             'MCTS does not support policies with recurrent layers')
     return graph, features, action_prob, value, search_prob, search_value