def q_retrace(rewards, dones, q_i, values, rho_i, n_envs, n_steps, gamma):
    """
    Builds the Q-retrace target with a backward recursion over the rollout steps

    Starting from the bootstrap value (the extra, last entry of ``values``), each step computes
    ``reward + gamma * running`` as its target, then updates the running value to
    ``min(1, rho) * (target - q_i) + value`` for the next (earlier) step.

    :param rewards: ([TensorFlow Tensor]) The rewards
    :param dones: ([TensorFlow Tensor]) Converted to a per-step sequence and shape-checked only;
        it does not enter the recursion in this modified version
    :param q_i: ([TensorFlow Tensor]) The Q values for actions taken
    :param values: ([TensorFlow Tensor]) The output of the value functions (n_steps + 1 steps per environment)
    :param rho_i: ([TensorFlow Tensor]) The importance weight for each action
    :param n_envs: (int) The number of environments
    :param n_steps: (int) The number of steps to run for each environment
    :param gamma: (float) The discount value
    :return: ([TensorFlow Tensor]) the target Q-retrace
    """
    # Every sequence below is a list with one entry per step, each entry of shape [n_envs]
    clipped_rho_seq = batch_to_seq(tf.minimum(1.0, rho_i), n_envs, n_steps, True)
    reward_seq = batch_to_seq(rewards, n_envs, n_steps, True)
    done_seq = batch_to_seq(dones, n_envs, n_steps, True)
    q_taken_seq = batch_to_seq(q_i, n_envs, n_steps, True)
    # The value sequence holds one more step than the others: its last entry is the bootstrap value
    value_seq = batch_to_seq(values, n_envs, n_steps + 1, True)

    running = value_seq[-1]
    targets = []
    for step in reversed(range(n_steps)):
        check_shape(
            [running, done_seq[step], reward_seq[step], clipped_rho_seq[step], q_taken_seq[step], value_seq[step]],
            [[n_envs]] * 6)
        # my-stable-baselines modified: the previous form of this line was
        #   reward_seq[i] + gamma * qret * (1.0 - done_seq[i])
        # the (1.0 - done) factor is intentionally not applied here.
        target = reward_seq[step] + gamma * running
        targets.append(target)
        running = (clipped_rho_seq[step] * (target - q_taken_seq[step])) + value_seq[step]

    # Targets were collected from the last step to the first; restore chronological order
    targets.reverse()
    return seq_to_batch(targets, flat=True)
def strip(var, n_envs, n_steps, flat=False):
    """
    Drops the final step from a batch that holds n_steps + 1 steps per environment

    :param var: (TensorFlow Tensor) The input Tensor
    :param n_envs: (int) The number of environments
    :param n_steps: (int) The number of steps to run for each environment
    :param flat: (bool) If the input Tensor is flat
    :return: (TensorFlow Tensor) the input tensor, without the last step in the batch
    """
    # Split into a per-step sequence of length n_steps + 1, discard the last entry, then re-batch
    all_steps = batch_to_seq(var, n_envs, n_steps + 1, flat)
    without_last = all_steps[:-1]
    return seq_to_batch(without_last, flat)
def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, n_lstm=256, reuse=False, layers=None,
             net_arch=None, act_fun=tf.tanh, cnn_extractor=nature_cnn, layer_norm=False, feature_extraction="cnn",
             **kwargs):
    """
    Builds the recurrent policy / value graph inside the "model" variable scope

    Two construction modes exist: the legacy mode (``net_arch`` is None), driven by ``layers`` and
    ``feature_extraction``, and the ``net_arch`` mode, which requires exactly one "lstm" entry in the
    shared part of the architecture.

    :param sess: forwarded to the parent constructor
    :param ob_space: forwarded to the parent constructor
    :param ac_space: forwarded to the parent constructor
    :param n_env: forwarded to the parent constructor
    :param n_steps: (int) forwarded to the parent constructor and used to split batches into sequences
    :param n_batch: forwarded to the parent constructor
    :param n_lstm: (int) The number of hidden units of the LSTM; the state has 2 * n_lstm entries
    :param reuse: (bool) Passed to the parent constructor and to the "model" variable scope
    :param layers: ([int]) (deprecated, use net_arch instead) Sizes of the dense layers built before the LSTM
        in legacy mode when feature_extraction is not "cnn"; defaults to [64, 64]
    :param net_arch: (list) Architecture specification: ints are shared dense layers, "lstm" places the
        LSTM, and a dict with optional 'pi' / 'vf' lists gives the non-shared layers and ends the shared part
    :param act_fun: (callable) Activation applied after every dense layer
    :param cnn_extractor: (callable) Called as cnn_extractor(processed_obs, **kwargs) in legacy "cnn" mode
    :param layer_norm: (bool) Forwarded to the LSTM
    :param feature_extraction: (str) "cnn" selects the CNN extractor (legacy mode only); any other value
        flattens the observation
    :param kwargs: (dict) Extra keyword arguments, checked by _kwargs_check and forwarded to cnn_extractor
    """
    uses_cnn = feature_extraction == "cnn"
    # The recurrent state is 2 * n_lstm wide because it stores both the cell and hidden states of the LSTM
    super(LstmPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch,
                                     state_shape=(2 * n_lstm, ), reuse=reuse, scale=uses_cnn)

    self._kwargs_check(feature_extraction, kwargs)

    # Initialisation scale shared by every hidden dense layer built below
    dense_init_scale = np.sqrt(2)

    if net_arch is None:
        # Legacy mode: architecture is described by `layers` / `feature_extraction`
        if layers is not None:
            warnings.warn("The layers parameter is deprecated. Use the net_arch parameter instead.")
        else:
            layers = [64, 64]

        with tf.variable_scope("model", reuse=reuse):
            if uses_cnn:
                features = cnn_extractor(self.processed_obs, **kwargs)
            else:
                features = tf.layers.flatten(self.processed_obs)
                for layer_idx, width in enumerate(layers):
                    dense = linear(features, 'pi_fc' + str(layer_idx), n_hidden=width,
                                   init_scale=dense_init_scale)
                    features = act_fun(dense)

            step_inputs = batch_to_seq(features, self.n_env, n_steps)
            step_masks = batch_to_seq(self.dones_ph, self.n_env, n_steps)
            step_outputs, self.snew = lstm(step_inputs, step_masks, self.states_ph, 'lstm1', n_hidden=n_lstm,
                                           layer_norm=layer_norm)
            recurrent_latent = seq_to_batch(step_outputs)
            value_fn = linear(recurrent_latent, 'vf', 1)

            # The same recurrent latent feeds both inputs of the distribution builder
            self._proba_distribution, self._policy, self.q_value = \
                self.pdtype.proba_distribution_from_latent(recurrent_latent, recurrent_latent)

        self._value_fn = value_fn
    else:
        # net_arch mode: the new parameter takes precedence over the deprecated one
        if layers is not None:
            warnings.warn("The new net_arch parameter overrides the deprecated layers parameter.")
        if uses_cnn:
            raise NotImplementedError()

        with tf.variable_scope("model", reuse=reuse):
            shared_latent = tf.layers.flatten(self.processed_obs)
            pi_sizes = []  # Layer sizes of the network that only belongs to the policy network
            vf_sizes = []  # Layer sizes of the network that only belongs to the value network

            # Walk through the shared layers and build the shared part of the network
            lstm_built = False
            for idx, spec in enumerate(net_arch):
                if isinstance(spec, int):
                    # A plain integer is the size of a shared dense layer
                    shared_latent = act_fun(
                        linear(shared_latent, "shared_fc{}".format(idx), spec, init_scale=dense_init_scale))
                    continue

                if spec == "lstm":
                    if lstm_built:
                        raise ValueError("The net_arch parameter must only contain one occurrence of 'lstm'!")
                    step_inputs = batch_to_seq(shared_latent, self.n_env, n_steps)
                    step_masks = batch_to_seq(self.dones_ph, self.n_env, n_steps)
                    step_outputs, self.snew = lstm(step_inputs, step_masks, self.states_ph, 'lstm1',
                                                   n_hidden=n_lstm, layer_norm=layer_norm)
                    shared_latent = seq_to_batch(step_outputs)
                    lstm_built = True
                    continue

                assert isinstance(spec, dict), "Error: the net_arch list can only contain ints and dicts"
                if 'pi' in spec:
                    assert isinstance(spec['pi'], list), "Error: net_arch[-1]['pi'] must contain a list of integers."
                    pi_sizes = spec['pi']
                if 'vf' in spec:
                    assert isinstance(spec['vf'], list), "Error: net_arch[-1]['vf'] must contain a list of integers."
                    vf_sizes = spec['vf']
                # From here on the network splits up in policy and value network
                break

            # Non-shared part of the policy network
            latent_policy = shared_latent
            for idx, pi_size in enumerate(pi_sizes):
                if pi_size == "lstm":
                    raise NotImplementedError("LSTMs are only supported in the shared part of the policy network.")
                assert isinstance(pi_size, int), "Error: net_arch[-1]['pi'] must only contain integers."
                latent_policy = act_fun(
                    linear(latent_policy, "pi_fc{}".format(idx), pi_size, init_scale=dense_init_scale))

            # Non-shared part of the value network
            latent_value = shared_latent
            for idx, vf_size in enumerate(vf_sizes):
                if vf_size == "lstm":
                    raise NotImplementedError("LSTMs are only supported in the shared part of the value function "
                                              "network.")
                assert isinstance(vf_size, int), "Error: net_arch[-1]['vf'] must only contain integers."
                latent_value = act_fun(
                    linear(latent_value, "vf_fc{}".format(idx), vf_size, init_scale=dense_init_scale))

            # This check runs only after the heads are built, so an LSTM-less net_arch fails late
            if not lstm_built:
                raise ValueError("The net_arch parameter must contain at least one occurrence of 'lstm'!")

            self._value_fn = linear(latent_value, 'vf', 1)
            # TODO: why not init_scale = 0.001 here like in the feedforward
            self._proba_distribution, self._policy, self.q_value = \
                self.pdtype.proba_distribution_from_latent(latent_policy, latent_value)

    self._setup_init()
def lstm(extracted_features, dones_ph, cell_state_hidden, scope, n_hidden, n_env, n_steps, init_scale=1.0,
         layer_norm=False):
    """
    Unrolls a Long Short Term Memory (LSTM) cell for TensorFlow over a batch of features

    The features and the mask are first split into per-step sequences with ``batch_to_seq``; the cell is
    then applied step by step. Before each step the cell and hidden states are multiplied by
    ``(1 - mask)``, so a mask value of 1 zeroes the state.

    :param extracted_features: (TensorFlow Tensor) The input tensor for the LSTM cell (before converting
        into sequence); its second dimension is taken as the input size
    :param dones_ph: (TensorFlow Tensor) The mask tensor for the LSTM cell (before converting into sequence)
    :param cell_state_hidden: (TensorFlow Tensor) The state tensor for the LSTM cell (cell state and hidden
        state concatenated along axis 1)
    :param scope: (str) The TensorFlow variable scope
    :param n_hidden: (int) The number of hidden neurons
    :param n_env: (int) The number of environments, passed to batch_to_seq
    :param n_steps: (int) The number of steps, passed to batch_to_seq
    :param init_scale: (int) The initialization scale
    :param layer_norm: (bool) Whether to apply Layer Normalization or not
    :return: ([TensorFlow Tensor], TensorFlow Tensor) the hidden output of every step, and the final
        cell and hidden states concatenated along axis 1
    """
    feature_dim = extracted_features.get_shape()[1].value
    step_sequence = batch_to_seq(extracted_features, n_env, n_steps)
    mask_sequence = batch_to_seq(dones_ph, n_env, n_steps)

    # The four gates (input, forget, output, candidate) are computed with one fused projection
    gate_width = n_hidden * 4

    with tf.variable_scope(scope):
        weight_x = tf.get_variable("wx", [feature_dim, gate_width], initializer=ortho_init(init_scale))
        weight_h = tf.get_variable("wh", [n_hidden, gate_width], initializer=ortho_init(init_scale))
        bias = tf.get_variable("b", [gate_width], initializer=tf.constant_initializer(0.0))

        if layer_norm:
            # Gain and bias of layer norm
            gain_x = tf.get_variable("gx", [gate_width], initializer=tf.constant_initializer(1.0))
            bias_x = tf.get_variable("bx", [gate_width], initializer=tf.constant_initializer(0.0))

            gain_h = tf.get_variable("gh", [gate_width], initializer=tf.constant_initializer(1.0))
            bias_h = tf.get_variable("bh", [gate_width], initializer=tf.constant_initializer(0.0))

            gain_c = tf.get_variable("gc", [n_hidden], initializer=tf.constant_initializer(1.0))
            bias_c = tf.get_variable("bc", [n_hidden], initializer=tf.constant_initializer(0.0))

    cell_state, hidden = tf.split(axis=1, num_or_size_splits=2, value=cell_state_hidden)
    for step, (step_input, mask) in enumerate(zip(step_sequence, mask_sequence)):
        # Zero the carried state wherever the mask is set
        keep = 1 - mask
        cell_state = cell_state * keep
        hidden = hidden * keep

        input_projection = tf.matmul(step_input, weight_x)
        hidden_projection = tf.matmul(hidden, weight_h)
        if layer_norm:
            input_projection = _ln(input_projection, gain_x, bias_x)
            hidden_projection = _ln(hidden_projection, gain_h, bias_h)
        gates = input_projection + hidden_projection + bias

        in_gate, forget_gate, out_gate, cell_candidate = tf.split(axis=1, num_or_size_splits=4, value=gates)
        cell_state = tf.nn.sigmoid(forget_gate) * cell_state + tf.nn.sigmoid(in_gate) * tf.tanh(cell_candidate)

        cell_for_output = _ln(cell_state, gain_c, bias_c) if layer_norm else cell_state
        hidden = tf.nn.sigmoid(out_gate) * tf.tanh(cell_for_output)

        # Each step's input slot is overwritten with that step's hidden output
        step_sequence[step] = hidden

    cell_state_hidden = tf.concat(axis=1, values=[cell_state, hidden])
    return step_sequence, cell_state_hidden