Example #1
def test_ppo_model_cc_visual(mock_communicator, mock_launcher):
    # Continuous-control PPO model with two visual observations; runs a single
    # forward pass against a mocked Unity environment.
    tf.reset_default_graph()
    with tf.Session() as sess:
        with tf.variable_scope("FakeGraphScope"):
            mock_communicator.return_value = MockCommunicator(
                discrete_action=False, visual_inputs=2)
            env = UnityEnvironment(' ')

            model = PPOModel(env.brains["RealFakeBrain"])
            init = tf.global_variables_initializer()
            sess.run(init)

            run_list = [
                model.output, model.log_probs, model.value, model.entropy,
                model.learning_rate
            ]
            feed_dict = {
                model.batch_size: 2,
                model.sequence_length: 1,
                model.vector_in: np.array([[1, 2, 3, 1, 2, 3],
                                           [3, 4, 5, 3, 4, 5]]),
                model.visual_in[0]: np.ones([2, 40, 30, 3]),
                model.visual_in[1]: np.ones([2, 40, 30, 3]),
                model.epsilon: np.array([[0, 1], [2, 3]])
            }
            sess.run(run_list, feed_dict=feed_dict)
            env.close()
Example #2
def test_ppo_model_cc_vector_rnn(mock_communicator, mock_launcher):
    # Continuous-control PPO model with vector observations and a recurrent
    # policy; feeds a zeroed memory state of size 128 and fetches memory_out.
    tf.reset_default_graph()
    with tf.Session() as sess:
        with tf.variable_scope("FakeGraphScope"):
            mock_communicator.return_value = MockCommunicator(
                discrete_action=False, visual_inputs=0)
            env = UnityEnvironment(' ')
            memory_size = 128
            model = PPOModel(env.brains["RealFakeBrain"],
                             use_recurrent=True,
                             m_size=memory_size)
            init = tf.global_variables_initializer()
            sess.run(init)

            run_list = [
                model.output, model.all_log_probs, model.value, model.entropy,
                model.learning_rate, model.memory_out
            ]
            feed_dict = {
                model.batch_size: 1,
                model.sequence_length: 2,
                model.memory_in: np.zeros((1, memory_size)),
                model.vector_in: np.array([[1, 2, 3, 1, 2, 3],
                                           [3, 4, 5, 3, 4, 5]]),
                model.epsilon: np.array([[0, 1]])
            }
            sess.run(run_list, feed_dict=feed_dict)
            env.close()
Example #3
def test_ppo_model_dc_vector_curio(mock_communicator, mock_launcher):
    # Discrete-control PPO model with vector observations and the curiosity
    # module enabled; fetches the intrinsic reward alongside the policy outputs.
    tf.reset_default_graph()
    with tf.Session() as sess:
        with tf.variable_scope("FakeGraphScope"):
            mock_communicator.return_value = MockCommunicator(
                discrete_action=True, visual_inputs=0)
            env = UnityEnvironment(' ')
            model = PPOModel(env.brains["RealFakeBrain"], use_curiosity=True)
            init = tf.global_variables_initializer()
            sess.run(init)

            run_list = [
                model.output, model.all_log_probs, model.value, model.entropy,
                model.learning_rate, model.intrinsic_reward
            ]
            feed_dict = {
                model.batch_size: 2,
                model.sequence_length: 1,
                model.vector_in: np.array([[1, 2, 3, 1, 2, 3],
                                           [3, 4, 5, 3, 4, 5]]),
                model.next_vector_in: np.array([[1, 2, 3, 1, 2, 3],
                                                [3, 4, 5, 3, 4, 5]]),
                model.action_holder: [[0], [0]],
                model.action_masks: np.ones([2, 2])
            }
            sess.run(run_list, feed_dict=feed_dict)
            env.close()
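
The three tests above share the same mocked-environment setup: the mock_communicator and mock_launcher arguments are injected by mock.patch decorators, and a MockCommunicator stands in for a running Unity executable, so UnityEnvironment(' ') exposes the fake "RealFakeBrain" without launching a build. A minimal sketch of the imports and decorators these tests assume is shown below; the exact module paths (unityagents, unitytrainers, and the MockCommunicator location) are assumptions and vary between ml-agents versions.

import tensorflow as tf
from unittest import mock

# Assumed import paths; adjust to the installed ml-agents version.
from unityagents import UnityEnvironment
from unitytrainers.ppo.models import PPOModel
from tests.mock_communicator import MockCommunicator


# The decorator closest to the function is passed first, so get_communicator
# maps to mock_communicator and executable_launcher to mock_launcher.
@mock.patch('unityagents.UnityEnvironment.executable_launcher')
@mock.patch('unityagents.UnityEnvironment.get_communicator')
def test_ppo_model_smoke(mock_communicator, mock_launcher):
    tf.reset_default_graph()
    with tf.Session() as sess:
        mock_communicator.return_value = MockCommunicator(
            discrete_action=False, visual_inputs=0)
        env = UnityEnvironment(' ')
        model = PPOModel(env.brains["RealFakeBrain"])
        sess.run(tf.global_variables_initializer())
        env.close()
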
Example #4
    def __init__(self, seed, brain, trainer_params, is_training, load,
                 use_depth, save_activations):
        """
        Policy for Proximal Policy Optimization Networks.
        :param seed: Random seed.
        :param brain: Assigned Brain object.
        :param trainer_params: Defined training parameters.
        :param is_training: Whether the model should be trained.
        :param load: Whether a pre-trained model will be loaded or a new one created.
        :param use_depth: Whether to augment the visual input with depth information.
        :param save_activations: Whether to save network activations.
        """
        super().__init__(seed, brain, trainer_params)
        self.has_updated = False
        self.use_curiosity = bool(trainer_params['use_curiosity'])
        self.save_activations = save_activations
        self.allGinis = []
        self.VisGinis = []

        with self.graph.as_default():
            self.model = PPOModel(
                brain,
                lr=float(trainer_params['learning_rate']),
                h_size=int(trainer_params['hidden_units']),
                h_size_vec=int(trainer_params['hidden_units_vec']),
                epsilon=float(trainer_params['epsilon']),
                beta=float(trainer_params['beta']),
                max_step=float(trainer_params['max_steps']),
                normalize=trainer_params['normalize'],
                use_recurrent=trainer_params['use_recurrent'],
                num_layers=int(trainer_params['num_layers']),
                m_size=self.m_size,
                use_curiosity=bool(trainer_params['use_curiosity']),
                curiosity_strength=float(trainer_params['curiosity_strength']),
                curiosity_enc_size=float(trainer_params['curiosity_enc_size']),
                seed=seed,
                forward_model_weight=trainer_params['forward_model_weight'],
                use_depth=use_depth,
                save_activations=save_activations)

        if load:
            self._load_graph()
        else:
            self._initialize_graph()

        self.inference_dict = {
            'action': self.model.output,
            'log_probs': self.model.all_log_probs,
            'value': self.model.value,
            'entropy': self.model.entropy,
            'learning_rate': self.model.learning_rate
        }
        if self.use_continuous_act:
            self.inference_dict['pre_action'] = self.model.output_pre
        if self.use_recurrent:
            self.inference_dict['memory_out'] = self.model.memory_out
        if is_training and self.use_vec_obs and trainer_params['normalize']:
            self.inference_dict['update_mean'] = self.model.update_mean
            self.inference_dict['update_variance'] = self.model.update_variance

        #if self.use_curiosity:
        #    self.inference_dict['pred_next_state'] = self.model.pred_next_state

        self.update_dict = {
            'value_loss': self.model.value_loss,
            'policy_loss': self.model.policy_loss,
            'update_batch': self.model.update_batch
        }

        self.inference_dict['encoded_state'] = self.model.encoding

        if self.save_activations:
            self.update_dict['gradients'] = self.model.hidden_grad
            self.encodings = []
            self.gradients = []
            self.values = []
            self.actions = []
            print(self.update_dict)
            if self.use_curiosity:
                self.enc_cur_state = []
                self.enc_next_state = []
                self.pred_state = []
                self.pred_act = []

        if self.use_curiosity:
            self.update_dict['forward_loss'] = self.model.forward_loss
            self.update_dict['inverse_loss'] = self.model.inverse_loss
            if self.save_activations:
                self.inference_dict['enc_cur_state'] = self.model.enc_cur_state
                self.inference_dict['pred_state'] = self.model.pred_next_state
                self.inference_dict['pred_act'] = self.model.pred_action
                self.inference_dict['forward_loss'] = self.model.forward_loss
                self.inference_dict['inverse_loss'] = self.model.inverse_loss
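
For reference, a hypothetical trainer_params dictionary covering the keys read in this constructor might look as follows. The values are illustrative placeholders rather than recommended settings, and the base-class constructor (super().__init__) may require additional entries that are not shown here.

# Hypothetical configuration; values are placeholders, not tuned hyperparameters.
trainer_params = {
    'learning_rate': 3.0e-4,
    'hidden_units': 128,
    'hidden_units_vec': 64,
    'epsilon': 0.2,
    'beta': 5.0e-3,
    'max_steps': 5.0e5,
    'normalize': True,
    'use_recurrent': False,
    'num_layers': 2,
    'use_curiosity': True,
    'curiosity_strength': 0.01,
    'curiosity_enc_size': 128,
    'forward_model_weight': 0.2,
}

# Assuming the enclosing class is a PPO policy (its name is not shown above):
# policy = PPOPolicy(seed=0, brain=brain, trainer_params=trainer_params,
#                    is_training=True, load=False,
#                    use_depth=False, save_activations=False)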