def __init__(self, state_shape: Sequence[int], n_actions: int, n_hidden: int,
             lstm_size: int = 256, summary: bool = True) -> None:
    """Build the convolutional + LSTM actor-critic graph for discrete actions.

    Args:
        state_shape: Shape of a single state, without the batch dimension.
            Any sequence of ints is accepted (list, tuple, ...).
        n_actions: Number of discrete actions; width of the logits layer.
        n_hidden: Stored on the instance; not otherwise used in this method.
        lstm_size: Number of units of the LSTM cell.
        summary: Stored on the instance; not otherwise used in this method.
    """
    super(ActorCriticNetworkDiscreteCNNRNN, self).__init__()
    self.state_shape: Sequence[int] = state_shape
    self.n_actions: int = n_actions
    self.n_hidden: int = n_hidden
    self.summary: bool = summary

    # list(...) so that a tuple (or any other sequence) can be concatenated
    # with the unknown batch dimension; `[None] + tuple` would raise TypeError.
    self.states = tf.placeholder(tf.float32, [None] + list(state_shape), name="states")
    self.actions_taken = tf.placeholder(tf.float32, name="actions_taken")

    x = self.states
    # Convolution layers
    for i in range(4):
        x = tf.nn.elu(conv2d(x, 32, "l{}".format(i + 1), [3, 3], [2, 2]))

    # Flatten, then add a leading axis of size 1 so that the LSTM below
    # (whose state has batch size 1) receives a single sequence.
    reshape = tf.expand_dims(flatten(x), [0])

    self.enc_cell = tf.contrib.rnn.BasicLSTMCell(lstm_size)
    lstm_state_size = self.enc_cell.state_size
    # All-zero numpy arrays with the shapes of the LSTM c and h states.
    c_init = np.zeros((1, lstm_state_size.c), np.float32)
    h_init = np.zeros((1, lstm_state_size.h), np.float32)
    self.state_init = [c_init, h_init]

    # Register the state tensors in graph collections under fixed keys.
    self.rnn_state_in = self.enc_cell.zero_state(1, tf.float32)
    tf.add_to_collection("rnn_state_in_c", self.rnn_state_in.c)
    tf.add_to_collection("rnn_state_in_h", self.rnn_state_in.h)
    rnn_out, self.rnn_state_out = tf.nn.dynamic_rnn(
        cell=self.enc_cell,
        inputs=reshape,
        initial_state=self.rnn_state_in,
        dtype=tf.float32)
    tf.add_to_collection("rnn_state_out_c", self.rnn_state_out.c)
    tf.add_to_collection("rnn_state_out_h", self.rnn_state_out.h)
    # Drop the leading axis again: one row of LSTM output per time step.
    rnn_out = tf.reshape(rnn_out, [-1, lstm_size])

    # Fully connected for actor and critic
    self.logits = linear(rnn_out, n_actions, "actionlogits",
                         normalized_columns_initializer(0.01))
    # Value head output reshaped to a 1-D tensor.
    self.value = tf.reshape(
        linear(rnn_out, 1, "value", normalized_columns_initializer(1.0)), [-1])

    self.probs = tf.nn.softmax(self.logits)

    # Sample one action per row; subtracting the row-wise maximum does not
    # change the softmax distribution that is sampled from.
    self.action = tf.squeeze(
        tf.multinomial(
            self.logits - tf.reduce_max(self.logits, [1], keepdims=True), 1),
        [1],
        name="action")
    # One-hot encode the samples and keep only the first row.
    self.action = tf.one_hot(self.action, n_actions)[0, :]

    # Log probabilities of all actions
    self.log_probs = tf.nn.log_softmax(self.logits)
    # Log prob of the action that was actually taken
    self.action_log_prob = tf.reduce_sum(self.log_probs * self.actions_taken, [1])

    # NOTE(review): this is the element-wise p * log(p) term; it is neither
    # negated nor summed here. Presumably the consumer does that; confirm.
    self.entropy = self.probs * self.log_probs
def __init__(self, state_shape, n_actions, n_hidden, summary=True):
    """Build the convolutional actor-critic graph, including its loss.

    Args:
        state_shape: Shape of a single state, without the batch dimension.
            Any sequence of ints is accepted (list, tuple, ...).
        n_actions: Number of discrete actions; width of the logits layer.
        n_hidden: Stored on the instance; not otherwise used in this method.
        summary: Stored on the instance; not otherwise used in this method.
    """
    super(ActorCriticNetworkDiscreteCNN, self).__init__()
    self.state_shape = state_shape
    self.n_actions = n_actions
    self.n_hidden = n_hidden
    self.summary = summary

    # list(...) so that a tuple (or any other sequence) can be concatenated
    # with the unknown batch dimension.
    self.states = tf.placeholder(tf.float32, [None] + list(state_shape), name="states")
    self.adv = tf.placeholder(tf.float32, name="advantage")
    self.actions_taken = tf.placeholder(tf.float32, [None, n_actions], name="actions_taken")
    self.r = tf.placeholder(tf.float32, [None], name="r")

    x = self.states
    # Convolution layers
    for i in range(4):
        x = tf.nn.elu(conv2d(x, 32, "l{}".format(i + 1), [3, 3], [2, 2]))

    # Flatten
    shape = x.get_shape().as_list()
    # -1 for the (unknown) batch size
    reshape = tf.reshape(x, [-1, shape[1] * shape[2] * shape[3]])

    # Fully connected for actor & critic
    self.logits = linear(reshape, n_actions, "actionlogits",
                         normalized_columns_initializer(0.01))
    # Value head output reshaped to a 1-D tensor.
    self.value = tf.reshape(
        linear(reshape, 1, "value", normalized_columns_initializer(1.0)), [-1])

    self.probs = tf.nn.softmax(self.logits)

    # Sample one action per row; subtracting the row-wise maximum does not
    # change the softmax distribution that is sampled from.
    # `keepdims` replaces the deprecated `keep_dims` spelling.
    self.action = tf.squeeze(
        tf.multinomial(
            self.logits - tf.reduce_max(self.logits, [1], keepdims=True), 1),
        [1],
        name="action")
    # One-hot encode the samples and keep only the first row.
    self.action = tf.one_hot(self.action, n_actions)[0, :]

    log_probs = tf.nn.log_softmax(self.logits)
    # Log prob of the taken action (selected through the `actions_taken`
    # mask), weighted by the advantage and summed.
    self.actor_loss = -tf.reduce_sum(
        tf.reduce_sum(log_probs * self.actions_taken, [1]) * self.adv)

    # Half the summed squared difference between value output and `r`.
    self.critic_loss = 0.5 * tf.reduce_sum(tf.square(self.value - self.r))

    entropy = -tf.reduce_sum(self.probs * log_probs)

    # The critic term is halved once more here; entropy enters with
    # coefficient 0.01 and a negative sign.
    self.loss = self.actor_loss + 0.5 * self.critic_loss - entropy * 0.01
    self.summary_loss = self.loss

    # Trainable variables under the variable scope active at build time.
    self.vars = tf.get_collection(
        tf.GraphKeys.TRAINABLE_VARIABLES, tf.get_variable_scope().name)
def __init__(self, state_shape: Sequence[int], n_actions: int, n_hidden: int,
             summary: bool = True) -> None:
    """Build the convolutional actor-critic graph for discrete actions.

    Args:
        state_shape: Shape of a single state, without the batch dimension.
            Any sequence of ints is accepted (list, tuple, ...).
        n_actions: Number of discrete actions; width of the logits layer.
        n_hidden: Stored on the instance; not otherwise used in this method.
        summary: Stored on the instance; not otherwise used in this method.
    """
    super(ActorCriticNetworkDiscreteCNN, self).__init__()
    self.state_shape = state_shape
    self.n_actions = n_actions
    self.n_hidden = n_hidden
    self.summary = summary

    # list(...) so that a tuple (or any other sequence) can be concatenated
    # with the unknown batch dimension; `[None] + tuple` would raise TypeError.
    self.states = tf.placeholder(tf.float32, [None] + list(state_shape), name="states")
    self.actions_taken = tf.placeholder(tf.float32, [None, n_actions], name="actions_taken")

    x = self.states
    # Convolution layers
    for i in range(4):
        x = tf.nn.elu(conv2d(x, 32, "l{}".format(i + 1), [3, 3], [2, 2]))

    # Flatten
    shape = x.get_shape().as_list()
    # -1 for the (unknown) batch size
    reshape = tf.reshape(x, [-1, shape[1] * shape[2] * shape[3]])

    # Fully connected for actor & critic
    self.logits = linear(reshape, n_actions, "actionlogits",
                         normalized_columns_initializer(0.01))
    # Value head output reshaped to a 1-D tensor.
    self.value = tf.reshape(
        linear(reshape, 1, "value", normalized_columns_initializer(1.0)), [-1])

    self.probs = tf.nn.softmax(self.logits)

    # Sample one action per row; subtracting the row-wise maximum does not
    # change the softmax distribution that is sampled from.
    self.action = tf.squeeze(
        tf.multinomial(
            self.logits - tf.reduce_max(self.logits, [1], keepdims=True), 1),
        [1],
        name="action")
    # One-hot encode the samples and keep only the first row.
    self.action = tf.one_hot(self.action, n_actions)[0, :]

    # Log probabilities of all actions
    self.log_probs = tf.nn.log_softmax(self.logits)
    # Log prob of the action that was actually taken
    self.action_log_prob = tf.reduce_sum(self.log_probs * self.actions_taken, [1])

    # NOTE(review): this is the element-wise p * log(p) term; it is neither
    # negated nor summed here. Presumably the consumer does that; confirm.
    self.entropy = self.probs * self.log_probs
def build_network(self):
    """Create placeholders and the conv + LSTM policy network.

    Sets `self.states`, `self.a_n`, `self.adv_n`, `self.N`, the LSTM cell and
    its in/out state tensors, `self.L3` (LSTM outputs), `self.probs` (softmax
    action probabilities) and `self.action` (a sampled action).
    """
    self.rnn_state = None
    self.a_n = tf.placeholder(tf.float32, name="a_n")  # Discrete action
    self.adv_n = tf.placeholder(tf.float32, name="adv_n")  # Advantage

    # Square grayscale frames: a single feature map (it would be 3 for RGB).
    frame_side = 80
    n_channels = 1
    self.states = tf.placeholder(
        tf.float32, [None, frame_side, frame_side, n_channels], name="states")
    self.N = tf.placeholder(tf.int32, name="N")

    # Convolution layers, passed the names "l1" .. "l4"
    features = self.states
    for layer_no in range(1, 5):
        conv_out = conv2d(features, 32, "l{}".format(layer_no), [3, 3], [2, 2])
        features = tf.nn.elu(conv_out)

    # Flatten; -1 stands for the (unknown) batch size
    dims = features.get_shape().as_list()
    flat_width = dims[1] * dims[2] * dims[3]
    flat = tf.reshape(features, [-1, flat_width])
    # Leading axis of size 1: the LSTM state below has batch size 1.
    rnn_input = tf.expand_dims(flatten(flat), [0])

    self.enc_cell = tf.contrib.rnn.BasicLSTMCell(self.config["n_hidden_units"])
    self.rnn_state_in = self.enc_cell.zero_state(1, tf.float32)
    rnn_result = tf.nn.dynamic_rnn(
        cell=self.enc_cell,
        inputs=rnn_input,
        initial_state=self.rnn_state_in,
        dtype=tf.float32)
    self.L3, self.rnn_state_out = rnn_result

    # Softmax layer over the actions, fed with the first (only) batch entry
    # of the LSTM outputs.
    self.probs = tf.contrib.layers.fully_connected(
        inputs=self.L3[0],
        num_outputs=self.env_runner.nA,
        activation_fn=tf.nn.softmax,
        weights_initializer=tf.truncated_normal_initializer(mean=0.0, stddev=0.02),
        biases_initializer=tf.zeros_initializer())

    # Draw one sample using the log-probabilities as logits.
    sampled = tf.multinomial(tf.log(self.probs), 1)
    self.action = tf.squeeze(sampled, name="action")
def build_network(self):
    """Create placeholders and the conv + fully connected policy network.

    Sets `self.states`, `self.a_n`, `self.N`, `self.adv_n`, `self.L3` (hidden
    layer), `self.probs` (softmax action probabilities) and `self.action`
    (a sampled action).
    """
    # Square grayscale frames: a single feature map (it would be 3 for RGB).
    frame_side = 80
    n_channels = 1
    self.states = tf.placeholder(
        tf.float32, [None, frame_side, frame_side, n_channels], name="states")
    self.a_n = tf.placeholder(tf.float32, name="a_n")
    self.N = tf.placeholder(tf.int32, name="N")
    self.adv_n = tf.placeholder(tf.float32, name="adv_n")  # Advantage

    # Convolution layers, passed the names "l1" .. "l4"
    features = self.states
    for layer_no in range(1, 5):
        conv_out = conv2d(features, 32, "l{}".format(layer_no), [3, 3], [2, 2])
        features = tf.nn.elu(conv_out)

    # Flatten; -1 stands for the (unknown) batch size
    dims = features.get_shape().as_list()
    flat_width = dims[1] * dims[2] * dims[3]
    flat = tf.reshape(features, [-1, flat_width])

    # Fully connected layer 1: ReLU hidden layer
    self.L3 = tf.contrib.layers.fully_connected(
        inputs=flat,
        num_outputs=self.config["n_hidden_units"],
        activation_fn=tf.nn.relu,
        weights_initializer=tf.random_normal_initializer(stddev=0.01),
        biases_initializer=tf.zeros_initializer())

    # Fully connected layer 2: softmax over the actions
    self.probs = tf.contrib.layers.fully_connected(
        inputs=self.L3,
        num_outputs=self.env_runner.nA,
        activation_fn=tf.nn.softmax,
        weights_initializer=tf.truncated_normal_initializer(mean=0.0, stddev=0.02),
        biases_initializer=tf.zeros_initializer())

    # Draw one sample using the log-probabilities as logits.
    sampled = tf.multinomial(tf.log(self.probs), 1)
    self.action = tf.squeeze(sampled, name="action")
def __init__(self, state_shape, n_actions, n_hidden, summary=True):
    """Build the convolutional + LSTM actor-critic graph, including its loss.

    Args:
        state_shape: Shape of a single state, without the batch dimension.
            Any sequence of ints is accepted (list, tuple, ...).
        n_actions: Number of discrete actions; width of the logits layer.
        n_hidden: Stored on the instance; not otherwise used in this method.
        summary: Stored on the instance; not otherwise used in this method.
    """
    super(ActorCriticNetworkDiscreteCNNRNN, self).__init__()
    self.state_shape = state_shape
    self.n_actions = n_actions
    self.n_hidden = n_hidden
    self.summary = summary

    # list(...) so that a tuple (or any other sequence) can be concatenated
    # with the unknown batch dimension.
    self.states = tf.placeholder(tf.float32, [None] + list(state_shape), name="states")
    self.adv = tf.placeholder(tf.float32, name="advantage")
    self.actions_taken = tf.placeholder(tf.float32, name="actions_taken")
    self.r = tf.placeholder(tf.float32, [None], name="r")

    x = self.states
    # Convolution layers
    for i in range(4):
        x = tf.nn.elu(conv2d(x, 32, "l{}".format(i + 1), [3, 3], [2, 2]))

    # Flatten, then add a leading axis of size 1 so that the LSTM below
    # (whose state has batch size 1) receives a single sequence.
    reshape = tf.expand_dims(flatten(x), [0])

    lstm_size = 256
    self.enc_cell = tf.contrib.rnn.BasicLSTMCell(lstm_size)
    lstm_state_size = self.enc_cell.state_size
    # All-zero numpy arrays with the shapes of the LSTM c and h states.
    c_init = np.zeros((1, lstm_state_size.c), np.float32)
    h_init = np.zeros((1, lstm_state_size.h), np.float32)
    self.state_init = [c_init, h_init]

    # Register the state tensors in graph collections under fixed keys.
    self.rnn_state_in = self.enc_cell.zero_state(1, tf.float32)
    tf.add_to_collection("rnn_state_in_c", self.rnn_state_in.c)
    tf.add_to_collection("rnn_state_in_h", self.rnn_state_in.h)
    rnn_out, self.rnn_state_out = tf.nn.dynamic_rnn(
        cell=self.enc_cell,
        inputs=reshape,
        initial_state=self.rnn_state_in,
        dtype=tf.float32)
    tf.add_to_collection("rnn_state_out_c", self.rnn_state_out.c)
    tf.add_to_collection("rnn_state_out_h", self.rnn_state_out.h)
    # Drop the leading axis again: one row of LSTM output per time step.
    rnn_out = tf.reshape(rnn_out, [-1, lstm_size])

    # Fully connected for actor and critic
    self.logits = linear(rnn_out, n_actions, "actionlogits",
                         normalized_columns_initializer(0.01))
    # Value head output reshaped to a 1-D tensor.
    self.value = tf.reshape(
        linear(rnn_out, 1, "value", normalized_columns_initializer(1.0)), [-1])

    self.probs = tf.nn.softmax(self.logits)

    # Sample one action per row; subtracting the row-wise maximum does not
    # change the softmax distribution that is sampled from.
    # `keepdims` replaces the deprecated `keep_dims` spelling.
    self.action = tf.squeeze(
        tf.multinomial(
            self.logits - tf.reduce_max(self.logits, [1], keepdims=True), 1),
        [1],
        name="action")
    # One-hot encode the samples and keep only the first row.
    self.action = tf.one_hot(self.action, n_actions)[0, :]

    log_probs = tf.nn.log_softmax(self.logits)
    # Log prob of the taken action (selected through the `actions_taken`
    # mask), weighted by the advantage and summed.
    self.actor_loss = -tf.reduce_sum(
        tf.reduce_sum(log_probs * self.actions_taken, [1]) * self.adv)

    # Half the summed squared difference between value output and `r`.
    self.critic_loss = 0.5 * tf.reduce_sum(tf.square(self.value - self.r))

    self.entropy = -tf.reduce_sum(self.probs * log_probs)

    # The critic term is halved once more here; entropy enters with
    # coefficient 0.01 and a negative sign.
    self.loss = self.actor_loss + 0.5 * self.critic_loss - self.entropy * 0.01

    # Trainable variables under the variable scope active at build time.
    self.vars = tf.get_collection(
        tf.GraphKeys.TRAINABLE_VARIABLES, tf.get_variable_scope().name)