def __init__(self, brain, lr=1e-4, h_size=128, epsilon_start=1, epsilon_end=0.1,
             epsilon_decay_steps=1e5, tau=0.0001, max_step=5e6, normalize=False,
             use_recurrent=False, num_layers=2, m_size=None, frozen=False,
             update_frozen_freq=None):
    """
    Takes a Unity environment and model-specific hyper-parameters and returns the
    appropriate MADQN agent model for the environment.
    :param brain: BrainInfo used to generate specific network graph.
    :param lr: Learning rate.
    :param h_size: Size of hidden layers.
    :param epsilon_start: Initial value of epsilon for the epsilon-greedy exploration policy.
    :param epsilon_end: Final value of epsilon after decay.
    :param epsilon_decay_steps: Number of steps over which epsilon is decayed.
    :param tau: Soft-update rate for the target network (not used directly in this constructor).
    :param max_step: Total number of training steps.
    :param normalize: Whether to normalize vector observation input.
    :param use_recurrent: Whether to use an LSTM layer in the network.
    :param num_layers: Number of hidden layers between encoded input and policy & value layers.
    :param m_size: Size of brain memory.
    :param frozen: If True, no optimizer is created and the model is not trained.
    :param update_frozen_freq: How often the frozen model is updated.
    """
    LearningModel.__init__(self, m_size, normalize, use_recurrent, brain)
    if num_layers < 1:
        num_layers = 1
    self.last_reward, self.new_reward, self.update_reward = self.create_reward_encoder()
    if brain.vector_action_space_type == "continuous":
        raise ValueError("DQN only supports discrete action spaces")
    self.create_madqn_model(h_size, num_layers, epsilon_start, epsilon_end,
                            epsilon_decay_steps, frozen, update_frozen_freq)
    if not frozen:
        self.create_dqn_optimizer(lr, max_step)
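# -- Illustrative sketch (not code from this repo): how epsilon_start, epsilon_end and
# epsilon_decay_steps presumably drive the epsilon-greedy exploration schedule inside
# create_madqn_model. A common TF1 construction is a linear (power=1) polynomial decay
# from epsilon_start to epsilon_end over epsilon_decay_steps, held constant afterwards.
# The function and tensor names below are assumptions for illustration only.
import tensorflow as tf

def build_epsilon_schedule(global_step, epsilon_start=1.0, epsilon_end=0.1,
                           epsilon_decay_steps=1e5):
    # Linearly anneal epsilon; at act time the agent would take a random action
    # with probability sess.run(epsilon) and the greedy Q-action otherwise.
    return tf.train.polynomial_decay(epsilon_start, global_step,
                                     int(epsilon_decay_steps), epsilon_end,
                                     power=1.0, name="epsilon")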
def __init__(self, brain, lr=1e-4, h_size=128, epsilon=0.2, beta=1e-3, max_step=5e6,
             normalize=False, use_recurrent=False, num_layers=2, m_size=None, agent_cnt=1):
    """
    Takes a Unity environment and model-specific hyper-parameters and returns the
    appropriate PPO agent model for the environment.
    :param brain: BrainInfo used to generate specific network graph.
    :param lr: Learning rate.
    :param h_size: Size of hidden layers.
    :param epsilon: Value for policy-divergence threshold.
    :param beta: Strength of entropy regularization.
    :param max_step: Total number of training steps.
    :param normalize: Whether to normalize vector observation input.
    :param use_recurrent: Whether to use an LSTM layer in the network.
    :param num_layers: Number of hidden layers between encoded input and policy & value layers.
    :param m_size: Size of brain memory.
    :param agent_cnt: Number of agents controlled by this brain.
    :return: a sub-class of PPOAgent tailored to the environment.
    """
    LearningModel.__init__(self, m_size, normalize, use_recurrent, brain)
    if num_layers < 1:
        num_layers = 1
    self.num_layers = num_layers
    self.h_size = h_size
    self.lr = lr
    self.beta = beta
    self.max_step = max_step
    self.vepsilon = epsilon
    self.created_model = False
    self.create_model(agent_cnt, None, "")
def __init__(self, brain, h_size=128, lr=1e-4, n_layers=2, m_size=128,
             normalize=False, use_recurrent=False):
    LearningModel.__init__(self, m_size, normalize, use_recurrent, brain)

    num_streams = 1
    hidden_streams = self.create_new_obs(num_streams, h_size, n_layers)
    hidden = hidden_streams[0]
    self.dropout_rate = tf.placeholder(dtype=tf.float32, shape=[], name="dropout_rate")
    hidden_reg = tf.layers.dropout(hidden, self.dropout_rate)
    if self.use_recurrent:
        self.memory_in = tf.placeholder(shape=[None, self.m_size], dtype=tf.float32,
                                        name='recurrent_in')
        hidden_reg, self.memory_out = self.create_recurrent_encoder(hidden_reg, self.memory_in)
        self.memory_out = tf.identity(self.memory_out, name='recurrent_out')
    self.policy = tf.layers.dense(hidden_reg, self.a_size, activation=None, use_bias=False,
                                  kernel_initializer=c_layers.variance_scaling_initializer(factor=0.01))

    if brain.vector_action_space_type == "discrete":
        self.action_probs = tf.nn.softmax(self.policy)
        self.sample_action_float = tf.multinomial(self.policy, 1)
        self.sample_action_float = tf.identity(self.sample_action_float, name="action")
        self.sample_action = tf.cast(self.sample_action_float, tf.int32)
        self.true_action = tf.placeholder(shape=[None], dtype=tf.int32, name="teacher_action")
        self.action_oh = tf.one_hot(self.true_action, self.a_size)
        self.loss = tf.reduce_sum(-tf.log(self.action_probs + 1e-10) * self.action_oh)
        self.action_percent = tf.reduce_mean(
            tf.cast(tf.equal(tf.cast(tf.argmax(self.action_probs, axis=1), tf.int32),
                             self.sample_action), tf.float32))
    else:
        self.sample_action = tf.identity(self.policy, name="action")
        self.true_action = tf.placeholder(shape=[None, self.a_size], dtype=tf.float32,
                                          name="teacher_action")
        self.loss = tf.reduce_sum(tf.squared_difference(self.true_action, self.sample_action))

    optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    self.update = optimizer.minimize(self.loss)
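# -- Side note (illustrative check, not code from this repo): the discrete behavioral-
# cloning loss above, reduce_sum(-log(softmax(logits) + 1e-10) * one_hot(labels)), is,
# up to the 1e-10 stabilizer, the summed softmax cross-entropy, so it could equally be
# written with tf.nn.sparse_softmax_cross_entropy_with_logits. Quick TF1 check:
import numpy as np
import tensorflow as tf

logits = tf.constant(np.random.randn(4, 3).astype(np.float32))
labels = tf.constant([0, 2, 1, 1])
manual = tf.reduce_sum(-tf.log(tf.nn.softmax(logits) + 1e-10) * tf.one_hot(labels, 3))
builtin = tf.reduce_sum(
    tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels, logits=logits))
with tf.Session() as sess:
    print(sess.run([manual, builtin]))  # the two values agree up to float32 rounding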
def __init__(self, brain, lr=1e-4, h_size=128, epsilon=0.2, beta=1e-3, max_step=5e6,
             normalize=False, use_recurrent=False, num_layers=2, m_size=None):
    """
    Takes a Unity environment and model-specific hyper-parameters and returns the
    appropriate PPO agent model for the environment.
    :param brain: BrainInfo used to generate specific network graph.
    :param lr: Learning rate.
    :param h_size: Size of hidden layers.
    :param epsilon: Value for policy-divergence threshold.
    :param beta: Strength of entropy regularization.
    :param max_step: Total number of training steps.
    :param normalize: Whether to normalize vector observation input.
    :param use_recurrent: Whether to use an LSTM layer in the network.
    :param num_layers: Number of hidden layers between encoded input and policy & value layers.
    :param m_size: Size of brain memory.
    :return: a sub-class of PPOAgent tailored to the environment.
    """
    LearningModel.__init__(self, m_size, normalize, use_recurrent, brain)
    if num_layers < 1:
        num_layers = 1
    self.last_reward, self.new_reward, self.update_reward = self.create_reward_encoder()
    if brain.vector_action_space_type == "continuous":
        self.create_cc_actor_critic(h_size, num_layers)
        self.entropy = tf.ones_like(tf.reshape(self.value, [-1])) * self.entropy
    else:
        self.create_dc_actor_critic(h_size, num_layers)
    self.create_ppo_optimizer(self.probs, self.old_probs, self.value,
                              self.entropy, beta, epsilon, lr, max_step)
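# -- Illustrative sketch (not this repo's create_ppo_optimizer): the loss it presumably
# builds is the standard clipped PPO surrogate, where `epsilon` bounds the probability
# ratio and `beta` weights the entropy bonus. All tensor names below are assumptions.
import tensorflow as tf

def clipped_ppo_loss(probs, old_probs, advantages, value, returns, entropy,
                     beta=1e-3, epsilon=0.2):
    ratio = probs / (old_probs + 1e-10)
    # Clipped surrogate: take the pessimistic (minimum) of the unclipped and clipped terms.
    surrogate = tf.minimum(
        ratio * advantages,
        tf.clip_by_value(ratio, 1.0 - epsilon, 1.0 + epsilon) * advantages)
    policy_loss = -tf.reduce_mean(surrogate)
    value_loss = tf.reduce_mean(tf.squared_difference(returns, tf.reshape(value, [-1])))
    return policy_loss + 0.5 * value_loss - beta * tf.reduce_mean(entropy)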
def __init__(self, brain, lr=1e-4, h_size=128, epsilon=0.2, beta=1e-3, max_step=5e6,
             normalize=False, use_recurrent=False, num_layers=2, m_size=None, n_agents=None):
    """
    Takes a Unity environment and model-specific hyper-parameters and returns the
    appropriate multi-agent PPO (MAPPO) model for the environment.
    :param brain: BrainInfo used to generate specific network graph.
    :param lr: Learning rate.
    :param h_size: Size of hidden layers.
    :param epsilon: Value for policy-divergence threshold.
    :param beta: Strength of entropy regularization.
    :param max_step: Total number of training steps.
    :param normalize: Whether to normalize vector observation input.
    :param use_recurrent: Whether to use an LSTM layer in the network.
    :param num_layers: Number of hidden layers between encoded input and policy & value layers.
    :param m_size: Size of brain memory.
    :param n_agents: Number of agents controlled by this brain.
    :return: a sub-class of PPOAgent tailored to the environment.
    """
    LearningModel.__init__(self, m_size, normalize, use_recurrent, brain)
    if num_layers < 1:
        num_layers = 1
    self.last_reward, self.new_reward, self.update_reward = self.create_reward_encoder()
    if brain.vector_action_space_type == "continuous":
        self.create_cc_actor_critic(h_size, num_layers)
        self.entropy = tf.ones_like(tf.reshape(self.value, [-1])) * self.entropy
    else:
        self.create_dc_ma_actor_critic(h_size, num_layers, n_agents, 'mappo')
    self.create_mappo_optimizer(self.probs, self.old_probs, self.value,
                                self.entropy, beta, epsilon, lr, max_step)
def __init__(self, brain, h_size=128, lr=1e-4, n_layers=2, m_size=128,
             normalize=False, use_recurrent=False):
    LearningModel.__init__(self, m_size, normalize, use_recurrent, brain)

    num_streams = 1
    hidden_streams = self.create_observation_streams(num_streams, h_size, n_layers)
    hidden = hidden_streams[0]
    self.dropout_rate = tf.placeholder(dtype=tf.float32, shape=[], name="dropout_rate")
    hidden_reg = tf.layers.dropout(hidden, self.dropout_rate)
    if self.use_recurrent:
        tf.Variable(self.m_size, name="memory_size", trainable=False, dtype=tf.int32)
        self.memory_in = tf.placeholder(shape=[None, self.m_size], dtype=tf.float32,
                                        name='recurrent_in')
        hidden_reg, self.memory_out = self.create_recurrent_encoder(
            hidden_reg, self.memory_in, self.sequence_length)
        self.memory_out = tf.identity(self.memory_out, name='recurrent_out')

    if brain.vector_action_space_type == "discrete":
        policy_branches = []
        for size in self.a_size:
            policy_branches.append(
                tf.layers.dense(hidden, size, activation=None, use_bias=False,
                                kernel_initializer=c_layers.variance_scaling_initializer(factor=0.01)))
        self.action_probs = tf.concat(
            [tf.nn.softmax(branch) for branch in policy_branches],
            axis=1, name="action_probs")
        self.sample_action_float = tf.concat(
            [tf.multinomial(branch, 1) for branch in policy_branches], axis=1)
        self.sample_action_float = tf.identity(self.sample_action_float, name="action")
        self.sample_action = tf.cast(self.sample_action_float, tf.int32)
        self.true_action = tf.placeholder(shape=[None, len(policy_branches)],
                                          dtype=tf.int32, name="teacher_action")
        self.action_oh = tf.concat(
            [tf.one_hot(self.true_action[:, i], self.a_size[i]) for i in range(len(self.a_size))],
            axis=1)
        self.loss = tf.reduce_sum(-tf.log(self.action_probs + 1e-10) * self.action_oh)
        self.action_percent = tf.reduce_mean(
            tf.cast(tf.equal(tf.cast(tf.argmax(self.action_probs, axis=1), tf.int32),
                             self.sample_action), tf.float32))
    else:
        self.policy = tf.layers.dense(hidden_reg, self.a_size[0], activation=None,
                                      use_bias=False, name='pre_action',
                                      kernel_initializer=c_layers.variance_scaling_initializer(factor=0.01))
        self.clipped_sample_action = tf.clip_by_value(self.policy, -1, 1)
        self.sample_action = tf.identity(self.clipped_sample_action, name="action")
        self.true_action = tf.placeholder(shape=[None, self.a_size[0]],
                                          dtype=tf.float32, name="teacher_action")
        self.clipped_true_action = tf.clip_by_value(self.true_action, -1, 1)
        self.loss = tf.reduce_sum(tf.squared_difference(self.clipped_true_action,
                                                        self.sample_action))

    optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    self.update = optimizer.minimize(self.loss)
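# -- Illustrative usage sketch (names are assumptions, not from this repo): one supervised
# behavioral-cloning update for the branched-discrete model above. The expert's actions are
# fed into the `teacher_action` placeholder with shape [batch, num_branches], observations
# into the vector-observation placeholder created by LearningModel (assumed here to be
# `model.vector_in`), and a non-zero dropout rate is used only during training.
def run_bc_update(sess, model, batch_obs, batch_teacher_actions, dropout_rate=0.2):
    feed_dict = {
        model.vector_in: batch_obs,                # [batch, obs_size] vector observations
        model.true_action: batch_teacher_actions,  # [batch, num_branches] expert action indices
        model.dropout_rate: dropout_rate,          # regularization while fitting the teacher
    }
    # Run one Adam step on the cross-entropy imitation loss and return its value.
    loss, _ = sess.run([model.loss, model.update], feed_dict=feed_dict)
    return loss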