def __init__(self, continuous, ob_dim, action_dim, n_layers):
    self.continuous = continuous
    self.state = tf.placeholder(shape=[None, ob_dim], name="observations", dtype=tf.float32)
    # Policy head: maps observations to action-distribution parameters.
    with tf.variable_scope('policy'):
        self.pi = pi = build_mlp(n_layers, self.state, action_dim)
        self.dist, self.sample, self.log_prob = dist_continuous(pi)
    # Value head: scalar state-value estimate, squeezed to shape [batch].
    with tf.variable_scope('v_pred'):
        vpred = build_mlp(n_layers, self.state, 1)
        self.vpred = tf.squeeze(vpred, axis=1)

def add_critic_network_op(self):
    """
    Build the critic networks and assign them to self.q and self.target_q.

    :return: None
    """
    self.q_scope = "q"
    self.target_q_scope = "target_q"
    with tf.variable_scope(self.critic_network_scope):
        # The centralized critic sees the full state and every agent's action.
        critic_input = tf.concat([tf.layers.flatten(self.state_placeholder),
                                  tf.layers.flatten(self.actions_n_placeholder)], axis=1)
        self.q = build_mlp(critic_input, 1, self.q_scope,
                           self.config.n_layers, self.config.layer_size)
        self.target_q = build_mlp(critic_input, 1, self.target_q_scope,
                                  self.config.n_layers, self.config.layer_size)
        if self.config.debug_logging:
            self.q = tf.Print(self.q, [self.q], message="q", summarize=20)
            self.target_q = tf.Print(self.target_q, [self.target_q],
                                     message="target_q", summarize=20)

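# The TF1 snippets above and below call a build_mlp helper that is not shown here.
# A minimal sketch of what such a helper could look like, assuming the positional
# signature used in this and the later TF snippets (mlp_input, output_size, scope,
# n_layers, size) plus the optional keyword arguments seen later; this is an
# illustration, not the repository's actual implementation.
import tensorflow as tf  # TF 1.x API


def build_mlp(mlp_input, output_size, scope, n_layers, size,
              output_activation=None, use_batch_normalization=False):
    """Stack of fully connected layers built under the given variable scope."""
    out = mlp_input
    with tf.variable_scope(scope):
        for _ in range(n_layers):
            out = tf.layers.dense(out, size, activation=tf.nn.relu)
            if use_batch_normalization:
                out = tf.layers.batch_normalization(out)
        out = tf.layers.dense(out, output_size, activation=output_activation)
    return out
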
def __init__(self, state_shape, hidden_units=(64, 64), hidden_activation=nn.Tanh()):
    super().__init__()
    # State-value network: maps a state to a single scalar V(s).
    self.net = build_mlp(
        input_dim=state_shape[0],
        output_dim=1,
        hidden_units=hidden_units,
        hidden_activation=hidden_activation,
    )

def __init__(self, state_shape, action_shape, hidden_units=(256, 256),
             hidden_activation=nn.ReLU(inplace=True)):
    super().__init__()
    # Outputs 2 * action_dim values, typically the means and log-stds of a Gaussian policy.
    self.net = build_mlp(
        input_dim=state_shape[0],
        output_dim=2 * action_shape[0],
        hidden_units=hidden_units,
        hidden_activation=hidden_activation,
    )

def __init__(self, state_shape, gamma, hidden_units_r=(64, 64), hidden_units_v=(64, 64),
             hidden_activation_r=nn.ReLU(inplace=True), hidden_activation_v=nn.ReLU(inplace=True)):
    super().__init__()
    # g approximates the reward term, h the shaping (value) term.
    self.g = build_mlp(
        input_dim=state_shape[0],
        output_dim=1,
        hidden_units=hidden_units_r,
        hidden_activation=hidden_activation_r,
    )
    self.h = build_mlp(
        input_dim=state_shape[0],
        output_dim=1,
        hidden_units=hidden_units_v,
        hidden_activation=hidden_activation_v,
    )
    self.gamma = gamma

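# The g, h and gamma attributes above suggest an AIRL-style discriminator, where g
# approximates the reward and h a shaping/value term. A minimal sketch of how the
# two heads are typically combined; _airl_f is a hypothetical helper name, and it
# only assumes the self.g, self.h and self.gamma attributes defined above.
def _airl_f(self, states, dones, next_states):
    # f(s, s') = g(s) + gamma * h(s') - h(s); the (1 - done) mask drops the
    # bootstrapped shaping term at episode ends.
    rewards = self.g(states)
    values = self.h(states)
    next_values = self.h(next_states)
    return rewards + self.gamma * (1 - dones) * next_values - values
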
def add_actor_loss_op(self):
    # Replace this agent's slot in the joint action with its current (noised)
    # policy output, keeping the other agents' actions fixed.
    slice_1 = tf.slice(self.actions_n_placeholder, [0, 0, 0],
                       [self.config.batch_size, self.agent_idx, self.action_dim])
    slice_2 = tf.slice(self.actions_n_placeholder, [0, self.agent_idx + 1, 0],
                       [self.config.batch_size, self.env.n - self.agent_idx - 1, self.action_dim])
    action_logits = tf.expand_dims(self.mu_noise, axis=1)
    actions_n = tf.concat([slice_1, action_logits, slice_2], axis=1)
    critic_input = tf.concat([tf.layers.flatten(self.state_placeholder),
                              tf.layers.flatten(actions_n)], axis=1)
    # Rebuild the Q network under the critic's scope so the actor loss can
    # backpropagate through it.
    combined_q_scope = self.critic_network_scope + "/" + self.q_scope
    self.q_reuse = build_mlp(critic_input, 1, combined_q_scope,
                             self.config.n_layers, self.config.layer_size)

def __init__(self, state_shape, action_shape, hidden_units=(64, 64), hidden_activation=nn.Tanh()):
    super().__init__()
    # The MLP produces the mean of the action distribution; the log standard
    # deviations are state-independent learned parameters.
    self.net = build_mlp(
        input_dim=state_shape[0],
        output_dim=action_shape[0],
        hidden_units=hidden_units,
        hidden_activation=hidden_activation,
    )
    self.log_stds = nn.Parameter(torch.zeros(1, action_shape[0]))

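# A state-independent log-std is a common choice for on-policy Gaussian actors.
# A minimal sketch of how such a policy could sample actions and score them,
# assuming only the self.net and self.log_stds attributes above; _sample_action
# is a hypothetical helper name, not the original class's method.
from torch.distributions import Normal


def _sample_action(self, states):
    means = self.net(states)
    dist = Normal(means, self.log_stds.exp())
    actions = dist.sample()
    # Sum per-dimension log-probabilities to get the joint action log-probability.
    log_pis = dist.log_prob(actions).sum(dim=-1, keepdim=True)
    return actions, log_pis
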
def build_policy_network_op(self):
    """
    Builds the actor (policy) network, its target network, and the exploration-noise variants.
    """
    self.mu_scope = "mu"
    self.target_mu_scope = "target_mu"
    with tf.variable_scope(self.actor_network_scope):
        self.mu = build_mlp(self.observation_placeholder, self.action_dim, self.mu_scope,
                            n_layers=self.config.n_layers, size=self.config.layer_size,
                            output_activation=None,
                            use_batch_normalization=self.config.use_batch_normalization)
        if self.config.debug_logging:
            self.mu = tf.Print(self.mu, [self.mu], message="mu", summarize=20)
        self.target_mu = build_mlp(self.observation_placeholder, self.action_dim, self.target_mu_scope,
                                   n_layers=self.config.n_layers, size=self.config.layer_size,
                                   output_activation=None,
                                   use_batch_normalization=self.config.use_batch_normalization)
        self.mu_normalized = tf.nn.softmax(self.mu, axis=-1)
        self.target_mu_normalized = tf.nn.softmax(self.target_mu, axis=-1)

        if self.config.param_noise:
            self.setup_param_noise(self.observation_placeholder)
            # Gumbel-softmax exploration: add Gumbel noise to the logits, then renormalize.
            self.mu_noise = tf.nn.softmax(
                self.mu - tf.log(-tf.log(tf.random_uniform(tf.shape(self.mu)))), axis=-1)
            if self.config.debug_logging:
                self.mu_noise = tf.Print(self.mu_noise, [self.mu_noise],
                                         summarize=10, message="action logits")
            self.target_mu_noise = tf.nn.softmax(
                self.target_mu - tf.log(-tf.log(tf.random_uniform(tf.shape(self.target_mu)))), axis=-1)
        elif self.config.random_process_exploration == 0:
            # Same Gumbel-softmax exploration, without parameter noise.
            self.mu_noise = tf.nn.softmax(
                self.mu - tf.log(-tf.log(tf.random_uniform(tf.shape(self.mu)))), axis=-1)
            if self.config.debug_logging:
                self.mu_noise = tf.Print(self.mu_noise, [self.mu_noise],
                                         summarize=10, message="action logits")
            self.target_mu_noise = tf.nn.softmax(
                self.target_mu - tf.log(-tf.log(tf.random_uniform(tf.shape(self.target_mu)))), axis=-1)
        elif self.config.random_process_exploration == 1:
            # No exploration noise: act from the normalized policy directly.
            self.mu_noise = self.mu_normalized
            self.target_mu_noise = self.target_mu_normalized
        elif self.config.random_process_exploration == 2:
            # Gaussian exploration with a learned, state-independent standard deviation.
            log_std = tf.get_variable("random_process_log_std", shape=[self.action_dim], dtype=tf.float32)
            std = tf.exp(log_std)
            dist = tf.contrib.distributions.MultivariateNormalDiag(self.mu_normalized, std)
            self.mu_noise = dist.sample()
            self.target_mu_noise = self.target_mu_normalized

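# The mu - log(-log(U)) term above adds Gumbel(0, 1) noise to the logits: taking an
# argmax over the noisy logits samples exactly from softmax(mu) (the Gumbel-max
# trick), and the softmax used in the code is its differentiable relaxation. A small
# self-contained numpy illustration of that sampling behaviour (not part of the
# original code):
import numpy as np


def gumbel_softmax_sample(logits, rng):
    # -log(-log(U)) with U ~ Uniform(0, 1) is a standard Gumbel sample.
    gumbel_noise = -np.log(-np.log(rng.uniform(size=logits.shape)))
    noisy_logits = logits + gumbel_noise
    exp = np.exp(noisy_logits - noisy_logits.max())
    return exp / exp.sum()


rng = np.random.default_rng(0)
logits = np.array([1.0, 0.0, -1.0])
counts = np.zeros(3)
for _ in range(10000):
    counts[np.argmax(gumbel_softmax_sample(logits, rng))] += 1
# Empirical argmax frequencies approach softmax(logits) ~ [0.665, 0.245, 0.090].
print(counts / counts.sum())
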
def __init__(self, state_shape, action_shape, hidden_units=(256, 256),
             hidden_activation=nn.ReLU(inplace=True)):
    super().__init__()
    # Twin Q-networks (clipped double-Q): each maps a (state, action) pair to a scalar.
    self.net1 = build_mlp(
        input_dim=state_shape[0] + action_shape[0],
        output_dim=1,
        hidden_units=hidden_units,
        hidden_activation=hidden_activation,
    )
    self.net2 = build_mlp(
        input_dim=state_shape[0] + action_shape[0],
        output_dim=1,
        hidden_units=hidden_units,
        hidden_activation=hidden_activation,
    )

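# The PyTorch snippets rely on a build_mlp helper with keyword arguments
# (input_dim, output_dim, hidden_units, hidden_activation). A minimal sketch of
# such a helper, assuming a plain fully connected stack; the repository's actual
# implementation may differ, e.g. in initialization or output activation.
import torch.nn as nn


def build_mlp(input_dim, output_dim, hidden_units=(64, 64), hidden_activation=nn.Tanh()):
    layers = []
    units = input_dim
    for next_units in hidden_units:
        layers.append(nn.Linear(units, next_units))
        layers.append(hidden_activation)
        units = next_units
    # Linear output layer; any output activation is left to the caller.
    layers.append(nn.Linear(units, output_dim))
    return nn.Sequential(*layers)
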
def build_policy_approx_networks(self):
    """
    Build one network per other agent to estimate what that agent would do.

    :return: None
    """
    policy_approximate_logits = []
    policy_approximate_actions = []
    with tf.variable_scope(self.policy_approx_networks_scope):
        for i in range(self.env.n):
            # This agent does not need an approximation of its own policy.
            if i == self.agent_idx:
                policy_approximate_logits.append(None)
                policy_approximate_actions.append(None)
                continue
            scope = "agent_" + str(i)
            logits = build_mlp(self.observation_placeholder, self.action_dim, scope,
                               self.config.n_layers, self.config.layer_size,
                               output_activation=None)
            policy_approximate_logits.append(logits)
            policy_approximate_actions.append(tf.nn.softmax(logits, axis=-1))
    self.policy_approximate_logits = policy_approximate_logits
    self.policy_approximate_actions = policy_approximate_actions