def test_ppo_model_cc_visual(mock_communicator, mock_launcher):
    # Continuous-action PPO model with two visual observation inputs.
    tf.reset_default_graph()
    with tf.Session() as sess:
        with tf.variable_scope("FakeGraphScope"):
            mock_communicator.return_value = MockCommunicator(
                discrete_action=False, visual_inputs=2)
            env = UnityEnvironment(' ')
            model = PPOModel(env.brains["RealFakeBrain"])
            init = tf.global_variables_initializer()
            sess.run(init)

            run_list = [model.output, model.log_probs, model.value,
                        model.entropy, model.learning_rate]
            feed_dict = {model.batch_size: 2,
                         model.sequence_length: 1,
                         model.vector_in: np.array([[1, 2, 3, 1, 2, 3],
                                                    [3, 4, 5, 3, 4, 5]]),
                         model.visual_in[0]: np.ones([2, 40, 30, 3]),
                         model.visual_in[1]: np.ones([2, 40, 30, 3]),
                         model.epsilon: np.array([[0, 1], [2, 3]])}
            sess.run(run_list, feed_dict=feed_dict)
            env.close()
def test_ppo_model_cc_vector_rnn(mock_communicator, mock_launcher):
    # Continuous-action PPO model with vector observations and a recurrent policy.
    tf.reset_default_graph()
    with tf.Session() as sess:
        with tf.variable_scope("FakeGraphScope"):
            mock_communicator.return_value = MockCommunicator(
                discrete_action=False, visual_inputs=0)
            env = UnityEnvironment(' ')
            memory_size = 128
            model = PPOModel(env.brains["RealFakeBrain"],
                             use_recurrent=True, m_size=memory_size)
            init = tf.global_variables_initializer()
            sess.run(init)

            run_list = [model.output, model.all_log_probs, model.value,
                        model.entropy, model.learning_rate, model.memory_out]
            # One agent (batch_size=1) unrolled for two steps (sequence_length=2),
            # so vector_in still contains two rows.
            feed_dict = {model.batch_size: 1,
                         model.sequence_length: 2,
                         model.memory_in: np.zeros((1, memory_size)),
                         model.vector_in: np.array([[1, 2, 3, 1, 2, 3],
                                                    [3, 4, 5, 3, 4, 5]]),
                         model.epsilon: np.array([[0, 1]])}
            sess.run(run_list, feed_dict=feed_dict)
            env.close()
def test_ppo_model_dc_vector_curio(mock_communicator, mock_launcher):
    # Discrete-action PPO model with the curiosity (intrinsic reward) module enabled.
    tf.reset_default_graph()
    with tf.Session() as sess:
        with tf.variable_scope("FakeGraphScope"):
            mock_communicator.return_value = MockCommunicator(
                discrete_action=True, visual_inputs=0)
            env = UnityEnvironment(' ')
            model = PPOModel(env.brains["RealFakeBrain"], use_curiosity=True)
            init = tf.global_variables_initializer()
            sess.run(init)

            run_list = [model.output, model.all_log_probs, model.value,
                        model.entropy, model.learning_rate,
                        model.intrinsic_reward]
            feed_dict = {model.batch_size: 2,
                         model.sequence_length: 1,
                         model.vector_in: np.array([[1, 2, 3, 1, 2, 3],
                                                    [3, 4, 5, 3, 4, 5]]),
                         model.next_vector_in: np.array([[1, 2, 3, 1, 2, 3],
                                                         [3, 4, 5, 3, 4, 5]]),
                         model.action_holder: [[0], [0]],
                         model.action_masks: np.ones([2, 2])}
            sess.run(run_list, feed_dict=feed_dict)
            env.close()
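# The three tests above rely on scaffolding that is not shown in this section:
# numpy/TensorFlow imports and unittest.mock patches that replace the Unity
# executable launcher and its communicator, so UnityEnvironment(' ') talks to
# MockCommunicator instead of a real build. A minimal sketch of that assumed
# scaffolding follows; the module paths and patch targets mirror the upstream
# ml-agents test suite and may differ in this repository.
#
# from unittest import mock
# import numpy as np
# import tensorflow as tf
# from mlagents.envs import UnityEnvironment
# from mlagents.trainers.ppo.models import PPOModel
# from mlagents.trainers.tests.mock_communicator import MockCommunicator
#
# @mock.patch('mlagents.envs.UnityEnvironment.executable_launcher')
# @mock.patch('mlagents.envs.UnityEnvironment.get_communicator')
# def test_ppo_model_cc_visual(mock_communicator, mock_launcher):
#     ...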
def __init__(self, seed, brain, trainer_params, is_training, load,
             use_depth, save_activations):
    """
    Policy for Proximal Policy Optimization Networks.
    :param seed: Random seed.
    :param brain: Assigned Brain object.
    :param trainer_params: Defined training parameters.
    :param is_training: Whether the model should be trained.
    :param load: Whether a pre-trained model will be loaded or a new one created.
    :param use_depth: Augment visual input with depth information.
    :param save_activations: Save network activations.
    """
    super().__init__(seed, brain, trainer_params)
    self.has_updated = False
    self.use_curiosity = bool(trainer_params['use_curiosity'])
    self.save_activations = save_activations
    self.allGinis = []
    self.VisGinis = []

    with self.graph.as_default():
        self.model = PPOModel(
            brain,
            lr=float(trainer_params['learning_rate']),
            h_size=int(trainer_params['hidden_units']),
            h_size_vec=int(trainer_params['hidden_units_vec']),
            epsilon=float(trainer_params['epsilon']),
            beta=float(trainer_params['beta']),
            max_step=float(trainer_params['max_steps']),
            normalize=trainer_params['normalize'],
            use_recurrent=trainer_params['use_recurrent'],
            num_layers=int(trainer_params['num_layers']),
            m_size=self.m_size,
            use_curiosity=bool(trainer_params['use_curiosity']),
            curiosity_strength=float(trainer_params['curiosity_strength']),
            curiosity_enc_size=float(trainer_params['curiosity_enc_size']),
            seed=seed,
            forward_model_weight=trainer_params['forward_model_weight'],
            use_depth=use_depth,
            save_activations=save_activations)

    if load:
        self._load_graph()
    else:
        self._initialize_graph()

    # Tensors fetched on every forward pass (action selection).
    self.inference_dict = {'action': self.model.output,
                           'log_probs': self.model.all_log_probs,
                           'value': self.model.value,
                           'entropy': self.model.entropy,
                           'learning_rate': self.model.learning_rate}
    if self.use_continuous_act:
        self.inference_dict['pre_action'] = self.model.output_pre
    if self.use_recurrent:
        self.inference_dict['memory_out'] = self.model.memory_out
    if is_training and self.use_vec_obs and trainer_params['normalize']:
        self.inference_dict['update_mean'] = self.model.update_mean
        self.inference_dict['update_variance'] = self.model.update_variance
    # if self.use_curiosity:
    #     self.inference_dict['pred_next_state'] = self.model.pred_next_state

    # Tensors fetched on every gradient update.
    self.update_dict = {'value_loss': self.model.value_loss,
                        'policy_loss': self.model.policy_loss,
                        'update_batch': self.model.update_batch}
    self.inference_dict['encoded_state'] = self.model.encoding

    if self.save_activations:
        self.update_dict['gradients'] = self.model.hidden_grad
        self.encodings = []
        self.gradients = []
        self.values = []
        self.actions = []
        print(self.update_dict)
        if self.use_curiosity:
            self.enc_cur_state = []
            self.enc_next_state = []
            self.pred_state = []
            self.pred_act = []

    if self.use_curiosity:
        self.update_dict['forward_loss'] = self.model.forward_loss
        self.update_dict['inverse_loss'] = self.model.inverse_loss
        if self.save_activations:
            self.inference_dict['enc_cur_state'] = self.model.enc_cur_state
            self.inference_dict['pred_state'] = self.model.pred_next_state
            self.inference_dict['pred_act'] = self.model.pred_action
            self.inference_dict['forward_loss'] = self.model.forward_loss
            self.inference_dict['inverse_loss'] = self.model.inverse_loss
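# A minimal sketch of how inference_dict above is typically consumed during
# action selection. It is illustrative only: it assumes the base Policy class
# exposes self.sess, self.use_vec_obs and self.use_recurrent, and that
# brain_info follows the ml-agents BrainInfo layout (agents,
# vector_observations, memories). Continuous-action models would additionally
# need model.epsilon fed, and discrete ones model.action_masks; both are
# omitted here for brevity.
def evaluate(self, brain_info):
    feed_dict = {self.model.batch_size: len(brain_info.agents),
                 self.model.sequence_length: 1}
    if self.use_vec_obs:
        feed_dict[self.model.vector_in] = brain_info.vector_observations
    if self.use_recurrent:
        feed_dict[self.model.memory_in] = brain_info.memories
    # Run every fetch registered in inference_dict in one session call and
    # return the results keyed by the same names ('action', 'value', ...).
    run_out = self.sess.run(list(self.inference_dict.values()),
                            feed_dict=feed_dict)
    return dict(zip(self.inference_dict.keys(), run_out))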