Example no. 1
 def __init__(self, brain, lr=1e-4, h_size=128, epsilon_start=1, epsilon_end = 0.1,
              epsilon_decay_steps=1e5, tau=0.0001, max_step=5e6,
              normalize=False, use_recurrent=False, num_layers=2, m_size=None,
              frozen=False, update_frozen_freq=None):
     """
     Takes a Unity environment and model-specific hyper-parameters and returns the
     appropriate MADQN agent model for the environment.
     :param brain: BrainInfo used to generate specific network graph.
     :param lr: Learning rate.
     :param h_size: Size of hidden layers
     :param epsilon_start: Initial value of epsilon for epsilon-greedy exploration.
     :param epsilon_end: Final value of epsilon after the decay period.
     :param epsilon_decay_steps: Number of steps over which epsilon is annealed from epsilon_start to epsilon_end.
     :param tau: Soft update coefficient for the target network.
     :param max_step: Total number of training steps.
     :param normalize: Whether to normalize vector observation input.
     :param use_recurrent: Whether to use an LSTM layer in the network.
     :param num_layers: Number of hidden layers between encoded input and policy & value layers.
     :param m_size: Size of brain memory.
     :param frozen: Whether this is a frozen copy of the model; if True, no optimizer is created and the model is not trained directly.
     :param update_frozen_freq: Frequency (in steps) at which the frozen model is updated.
     :return: an MADQN agent model tailored to the environment.
     """
     LearningModel.__init__(self, m_size, normalize, use_recurrent, brain)
     if num_layers < 1:
         num_layers = 1
     self.last_reward, self.new_reward, self.update_reward = self.create_reward_encoder()
     if brain.vector_action_space_type == "continuous":
         raise ValueError("DQN only supports discrete action spaces.")
     else:
         self.create_madqn_model(h_size, num_layers, epsilon_start, epsilon_end, epsilon_decay_steps, frozen, update_frozen_freq)
     if not frozen:
         self.create_dqn_optimizer(lr, max_step)
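
The epsilon_start, epsilon_end, and epsilon_decay_steps arguments point to an annealed epsilon-greedy exploration schedule. A minimal sketch of such a schedule, assuming linear decay (the function below is illustrative and not part of the repository):

def linear_epsilon(step, eps_start=1.0, eps_end=0.1, decay_steps=1e5):
    # Anneal epsilon linearly from eps_start to eps_end over decay_steps,
    # then hold it at eps_end for the remainder of training.
    frac = min(step / decay_steps, 1.0)
    return eps_start + frac * (eps_end - eps_start)

An agent would then take a random action with probability linear_epsilon(step) and the greedy (arg-max Q-value) action otherwise.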
Example no. 2
    def __init__(self, brain, lr=1e-4, h_size=128, epsilon=0.2, beta=1e-3, max_step=5e6,
                 normalize=False, use_recurrent=False, num_layers=2, m_size=None,agent_cnt=1):
        """
        Takes a Unity environment and model-specific hyper-parameters and returns the
        appropriate PPO agent model for the environment.
        :param brain: BrainInfo used to generate specific network graph.
        :param lr: Learning rate.
        :param h_size: Size of hidden layers
        :param epsilon: Value for policy-divergence threshold.
        :param beta: Strength of entropy regularization.
        :param max_step: Total number of training steps.
        :param normalize: Whether to normalize vector observation input.
        :param use_recurrent: Whether to use an LSTM layer in the network.
        :param num_layers: Number of hidden layers between encoded input and policy & value layers.
        :param m_size: Size of brain memory.
        :param agent_cnt: Number of agents.
        :return: a sub-class of PPOAgent tailored to the environment.
        """
        LearningModel.__init__(self, m_size, normalize, use_recurrent, brain)
        if num_layers < 1:
            num_layers = 1
       
        self.num_layers = num_layers
        self.h_size = h_size
        self.lr = lr
        self.beta = beta
        self.max_step = max_step
        self.vepsilon = epsilon
        self.created_model = False

        self.create_model(agent_cnt, None, "")
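
The beta hyper-parameter documented above weights an entropy bonus that discourages the policy from collapsing prematurely. A NumPy sketch of the categorical entropy term being scaled, assuming the policy is given as probabilities rather than logits (illustrative, not this repository's implementation):

import numpy as np

def entropy_bonus(action_probs, beta=1e-3):
    # action_probs: [batch, n_actions] rows of a softmax policy (each row sums to 1).
    # Returns beta times the mean per-sample entropy of the policy.
    entropy = -np.sum(action_probs * np.log(action_probs + 1e-10), axis=1)
    return beta * np.mean(entropy)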
Example no. 3
    def __init__(self,
                 brain,
                 h_size=128,
                 lr=1e-4,
                 n_layers=2,
                 m_size=128,
                 normalize=False,
                 use_recurrent=False):
        LearningModel.__init__(self, m_size, normalize, use_recurrent, brain)

        num_streams = 1
        hidden_streams = self.create_new_obs(num_streams, h_size, n_layers)
        hidden = hidden_streams[0]
        self.dropout_rate = tf.placeholder(dtype=tf.float32,
                                           shape=[],
                                           name="dropout_rate")
        hidden_reg = tf.layers.dropout(hidden, self.dropout_rate)
        if self.use_recurrent:
            self.memory_in = tf.placeholder(shape=[None, self.m_size],
                                            dtype=tf.float32,
                                            name='recurrent_in')
            hidden_reg, self.memory_out = self.create_recurrent_encoder(
                hidden_reg, self.memory_in)
            self.memory_out = tf.identity(self.memory_out,
                                          name='recurrent_out')
        self.policy = tf.layers.dense(
            hidden_reg,
            self.a_size,
            activation=None,
            use_bias=False,
            kernel_initializer=c_layers.variance_scaling_initializer(
                factor=0.01))

        if brain.vector_action_space_type == "discrete":
            self.action_probs = tf.nn.softmax(self.policy)
            self.sample_action_float = tf.multinomial(self.policy, 1)
            self.sample_action_float = tf.identity(self.sample_action_float,
                                                   name="action")
            self.sample_action = tf.cast(self.sample_action_float, tf.int32)
            self.true_action = tf.placeholder(shape=[None],
                                              dtype=tf.int32,
                                              name="teacher_action")
            self.action_oh = tf.one_hot(self.true_action, self.a_size)
            self.loss = tf.reduce_sum(-tf.log(self.action_probs + 1e-10) *
                                      self.action_oh)
            self.action_percent = tf.reduce_mean(
                tf.cast(
                    tf.equal(
                        tf.cast(tf.argmax(self.action_probs, axis=1),
                                tf.int32), self.sample_action), tf.float32))
        else:
            self.sample_action = tf.identity(self.policy, name="action")
            self.true_action = tf.placeholder(shape=[None, self.a_size],
                                              dtype=tf.float32,
                                              name="teacher_action")
            self.loss = tf.reduce_sum(
                tf.squared_difference(self.true_action, self.sample_action))

        optimizer = tf.train.AdamOptimizer(learning_rate=lr)
        self.update = optimizer.minimize(self.loss)
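
For the discrete branch, the loss above is a one-hot cross-entropy between the teacher's recorded action and the student policy's softmax output. The same computation in NumPy, mirroring the -tf.log(probs + 1e-10) * one_hot term (a sketch, not the repository's code):

import numpy as np

def bc_discrete_loss(action_probs, teacher_actions, a_size):
    # action_probs: [batch, a_size] softmax outputs of the student policy.
    # teacher_actions: [batch] integer action indices recorded from the teacher.
    one_hot = np.eye(a_size)[teacher_actions]
    return np.sum(-np.log(action_probs + 1e-10) * one_hot)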
Example no. 4
 def __init__(self, brain, lr=1e-4, h_size=128, epsilon=0.2, beta=1e-3, max_step=5e6,
              normalize=False, use_recurrent=False, num_layers=2, m_size=None):
     """
     Takes a Unity environment and model-specific hyper-parameters and returns the
     appropriate PPO agent model for the environment.
     :param brain: BrainInfo used to generate specific network graph.
     :param lr: Learning rate.
     :param h_size: Size of hidden layers
     :param epsilon: Value for policy-divergence threshold.
     :param beta: Strength of entropy regularization.
     :param max_step: Total number of training steps.
     :param normalize: Whether to normalize vector observation input.
     :param use_recurrent: Whether to use an LSTM layer in the network.
     :param num_layers: Number of hidden layers between encoded input and policy & value layers.
     :param m_size: Size of brain memory.
     :return: a sub-class of PPOAgent tailored to the environment.
     """
     LearningModel.__init__(self, m_size, normalize, use_recurrent, brain)
     if num_layers < 1:
         num_layers = 1
     self.last_reward, self.new_reward, self.update_reward = self.create_reward_encoder()
     if brain.vector_action_space_type == "continuous":
         self.create_cc_actor_critic(h_size, num_layers)
         self.entropy = tf.ones_like(tf.reshape(self.value, [-1])) * self.entropy
     else:
         self.create_dc_actor_critic(h_size, num_layers)
     self.create_ppo_optimizer(self.probs, self.old_probs, self.value,
                               self.entropy, beta, epsilon, lr, max_step)
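
Here epsilon is the PPO clipping range on the probability ratio between the new and old policies. A NumPy sketch of the clipped surrogate policy term that create_ppo_optimizer presumably builds (the repository's optimizer presumably also folds in the value loss, the beta-weighted entropy bonus, and learning-rate decay over max_step; this shows only the policy term):

import numpy as np

def ppo_policy_loss(probs, old_probs, advantages, epsilon=0.2):
    # Clipped surrogate objective: take the pessimistic minimum of the unclipped
    # and clipped ratio-weighted advantages, negated for gradient-descent minimization.
    ratio = probs / (old_probs + 1e-10)
    clipped = np.clip(ratio, 1.0 - epsilon, 1.0 + epsilon)
    return -np.mean(np.minimum(ratio * advantages, clipped * advantages))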
Example no. 5
 def __init__(self, brain, lr=1e-4, h_size=128, epsilon=0.2, beta=1e-3, max_step=5e6,
              normalize=False, use_recurrent=False, num_layers=2, m_size=None, n_agents=None):
     """
     Takes a Unity environment and model-specific hyper-parameters and returns the
      appropriate multi-agent PPO (MAPPO) agent model for the environment.
     :param brain: BrainInfo used to generate specific network graph.
     :param lr: Learning rate.
     :param h_size: Size of hidden layers
     :param epsilon: Value for policy-divergence threshold.
     :param beta: Strength of entropy regularization.
     :param max_step: Total number of training steps.
     :param normalize: Whether to normalize vector observation input.
     :param use_recurrent: Whether to use an LSTM layer in the network.
     :param num_layers: Number of hidden layers between encoded input and policy & value layers.
     :param m_size: Size of brain memory.
     :param n_agents: Number of agents.
     :return: a sub-class of PPOAgent tailored to the environment.
     """
     LearningModel.__init__(self, m_size, normalize, use_recurrent, brain)
     if num_layers < 1:
         num_layers = 1
     self.last_reward, self.new_reward, self.update_reward = self.create_reward_encoder()
     if brain.vector_action_space_type == "continuous":
         self.create_cc_actor_critic(h_size, num_layers)
         self.entropy = tf.ones_like(tf.reshape(self.value, [-1])) * self.entropy
     else:
         self.create_dc_ma_actor_critic(h_size, num_layers, n_agents, 'mappo')
     self.create_mappo_optimizer(self.probs, self.old_probs, self.value,
                                 self.entropy, beta, epsilon, lr, max_step)
Example no. 6
    def __init__(self, brain, h_size=128, lr=1e-4, n_layers=2, m_size=128,
                 normalize=False, use_recurrent=False):
        LearningModel.__init__(self, m_size, normalize, use_recurrent, brain)

        num_streams = 1
        hidden_streams = self.create_new_obs(num_streams, h_size, n_layers)
        hidden = hidden_streams[0]
        self.dropout_rate = tf.placeholder(dtype=tf.float32, shape=[], name="dropout_rate")
        hidden_reg = tf.layers.dropout(hidden, self.dropout_rate)
        if self.use_recurrent:
            self.memory_in = tf.placeholder(shape=[None, self.m_size], dtype=tf.float32, name='recurrent_in')
            hidden_reg, self.memory_out = self.create_recurrent_encoder(hidden_reg, self.memory_in)
            self.memory_out = tf.identity(self.memory_out, name='recurrent_out')
        self.policy = tf.layers.dense(hidden_reg, self.a_size, activation=None, use_bias=False,
                                      kernel_initializer=c_layers.variance_scaling_initializer(factor=0.01))

        if brain.vector_action_space_type == "discrete":
            self.action_probs = tf.nn.softmax(self.policy)
            self.sample_action_float = tf.multinomial(self.policy, 1)
            self.sample_action_float = tf.identity(self.sample_action_float, name="action")
            self.sample_action = tf.cast(self.sample_action_float, tf.int32)
            self.true_action = tf.placeholder(shape=[None], dtype=tf.int32, name="teacher_action")
            self.action_oh = tf.one_hot(self.true_action, self.a_size)
            self.loss = tf.reduce_sum(-tf.log(self.action_probs + 1e-10) * self.action_oh)
            self.action_percent = tf.reduce_mean(tf.cast(
                tf.equal(tf.cast(tf.argmax(self.action_probs, axis=1), tf.int32), self.sample_action), tf.float32))
        else:
            self.sample_action = tf.identity(self.policy, name="action")
            self.true_action = tf.placeholder(shape=[None, self.a_size], dtype=tf.float32, name="teacher_action")
            self.loss = tf.reduce_sum(tf.squared_difference(self.true_action, self.sample_action))

        optimizer = tf.train.AdamOptimizer(learning_rate=lr)
        self.update = optimizer.minimize(self.loss)

    def __init__(self,
                 brain,
                 h_size=128,
                 lr=1e-4,
                 n_layers=2,
                 m_size=128,
                 normalize=False,
                 use_recurrent=False):
        LearningModel.__init__(self, m_size, normalize, use_recurrent, brain)

        num_streams = 1
        hidden_streams = self.create_observation_streams(
            num_streams, h_size, n_layers)
        hidden = hidden_streams[0]
        self.dropout_rate = tf.placeholder(dtype=tf.float32,
                                           shape=[],
                                           name="dropout_rate")
        hidden_reg = tf.layers.dropout(hidden, self.dropout_rate)
        if self.use_recurrent:
            tf.Variable(self.m_size,
                        name="memory_size",
                        trainable=False,
                        dtype=tf.int32)
            self.memory_in = tf.placeholder(shape=[None, self.m_size],
                                            dtype=tf.float32,
                                            name='recurrent_in')
            hidden_reg, self.memory_out = self.create_recurrent_encoder(
                hidden_reg, self.memory_in, self.sequence_length)
            self.memory_out = tf.identity(self.memory_out,
                                          name='recurrent_out')

        if brain.vector_action_space_type == "discrete":
            policy_branches = []
            for size in self.a_size:
                policy_branches.append(
                    tf.layers.dense(hidden_reg,
                                    size,
                                    activation=None,
                                    use_bias=False,
                                    kernel_initializer=c_layers.
                                    variance_scaling_initializer(factor=0.01)))
            self.action_probs = tf.concat(
                [tf.nn.softmax(branch) for branch in policy_branches],
                axis=1,
                name="action_probs")
            self.sample_action_float = tf.concat(
                [tf.multinomial(branch, 1) for branch in policy_branches],
                axis=1)
            self.sample_action_float = tf.identity(self.sample_action_float,
                                                   name="action")
            self.sample_action = tf.cast(self.sample_action_float, tf.int32)
            self.true_action = tf.placeholder(
                shape=[None, len(policy_branches)],
                dtype=tf.int32,
                name="teacher_action")
            self.action_oh = tf.concat([
                tf.one_hot(self.true_action[:, i], self.a_size[i])
                for i in range(len(self.a_size))
            ],
                                       axis=1)
            self.loss = tf.reduce_sum(-tf.log(self.action_probs + 1e-10) *
                                      self.action_oh)
            self.action_percent = tf.reduce_mean(
                tf.cast(
                    tf.equal(
                        tf.cast(tf.argmax(self.action_probs, axis=1),
                                tf.int32), self.sample_action), tf.float32))
        else:
            self.policy = tf.layers.dense(
                hidden_reg,
                self.a_size[0],
                activation=None,
                use_bias=False,
                name='pre_action',
                kernel_initializer=c_layers.variance_scaling_initializer(
                    factor=0.01))
            self.clipped_sample_action = tf.clip_by_value(self.policy, -1, 1)
            self.sample_action = tf.identity(self.clipped_sample_action,
                                             name="action")
            self.true_action = tf.placeholder(shape=[None, self.a_size[0]],
                                              dtype=tf.float32,
                                              name="teacher_action")
            self.clipped_true_action = tf.clip_by_value(
                self.true_action, -1, 1)
            self.loss = tf.reduce_sum(
                tf.squared_difference(self.clipped_true_action,
                                      self.sample_action))

        optimizer = tf.train.AdamOptimizer(learning_rate=lr)
        self.update = optimizer.minimize(self.loss)
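
The second constructor handles a branched discrete action space: one softmax head per branch and one sampled index per branch, concatenated into a single action vector. A NumPy sketch of that sampling step, mirroring the per-branch tf.multinomial calls above (illustrative only):

import numpy as np

def sample_branched_actions(branch_logits, rng=None):
    # branch_logits: list of [batch, branch_size] logit arrays, one per action branch.
    # Returns an int array of shape [batch, n_branches], one sampled index per branch.
    rng = rng or np.random.default_rng()
    actions = []
    for logits in branch_logits:
        probs = np.exp(logits - logits.max(axis=1, keepdims=True))
        probs /= probs.sum(axis=1, keepdims=True)
        actions.append([rng.choice(p.shape[0], p=p) for p in probs])
    return np.array(actions, dtype=np.int32).T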