def __init__(self, env, ph_observations, params):
        self.ph_observations = ph_observations

        num_actions = env.action_space.n
        obs_space = main_observation_space(env)

        # Goal observation
        self.ph_goal_obs = None
        self.is_goal_env = is_goal_based_env(env)
        if self.is_goal_env:
            # goal obs has the same shape as main obs
            self.ph_goal_obs = placeholder_from_space(obs_space)

        make_encoder_func = make_encoder_with_goal if self.is_goal_env else make_encoder

        regularizer = None  # don't use L2 regularization

        actor_enc_params = get_enc_params(params, 'actor')

        # actor computation graph
        # use actor encoder as main observation encoder (including landmarks, etc.)
        if self.is_goal_env:
            actor_encoder = make_encoder_func(
                self.ph_observations, self.ph_goal_obs, obs_space, regularizer, actor_enc_params, name='act_enc',
            )
        else:
            actor_encoder = make_encoder_func(
                self.ph_observations, obs_space, regularizer, actor_enc_params, name='act_enc',
            )

        actor_model = make_model(actor_encoder.encoded_input, regularizer, params, 'act_mdl')

        actions_fc = dense(actor_model.latent, params.model_fc_size // 2, regularizer)
        action_logits = tf.contrib.layers.fully_connected(actions_fc, num_actions, activation_fn=None)
        self.best_action_deterministic = tf.argmax(action_logits, axis=1)
        self.actions_distribution = CategoricalProbabilityDistribution(action_logits)
        self.act = self.actions_distribution.sample()
        self.action_prob = self.actions_distribution.probability(self.act)

        critic_enc_params = get_enc_params(params, 'critic')

        # critic computation graph
        if self.is_goal_env:
            value_encoder = make_encoder_func(
                self.ph_observations, self.ph_goal_obs, obs_space, regularizer, critic_enc_params, 'val_enc',
            )
        else:
            value_encoder = make_encoder_func(
                self.ph_observations, obs_space, regularizer, critic_enc_params, 'val_enc',
            )

        value_model = make_model(value_encoder.encoded_input, regularizer, params, 'val_mdl')

        value_fc = dense(value_model.latent, params.model_fc_size // 2, regularizer)
        self.value = tf.squeeze(tf.contrib.layers.fully_connected(value_fc, 1, activation_fn=None), axis=[1])

        log.info('Total parameters in the model: %d', count_total_parameters())
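
# A minimal usage sketch for the actor-critic graph above, assuming a TF1 session and
# an `env`/`params` pair as produced by this repo's env factory and config (for
# goal-based envs, ph_goal_obs would have to be fed as well):
def sample_action_sketch(env, params):
    ph_obs = placeholder_from_space(main_observation_space(env))
    actor_critic = ActorCritic(env, ph_obs, params)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        obs = env.reset()
        # batch of one observation; `act` samples from the categorical policy
        action, value = sess.run(
            [actor_critic.act, actor_critic.value], feed_dict={ph_obs: [obs]},
        )
    return action[0], value[0]
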
    def test_normalize(self):
        env = make_doom_env(doom_env_by_name(TEST_ENV_NAME))
        obs_space = main_observation_space(env)

        env.reset()
        obs = [env.step(0)[0] for _ in range(10)]

        self.assertTrue(np.all(obs_space.low == 0))
        self.assertTrue(np.all(obs_space.high == 255))
        self.assertEqual(obs_space.dtype, np.uint8)

        self.assertFalse(is_normalized(obs_space))

        tf.reset_default_graph()

        ph_obs = placeholder_from_space(obs_space)
        obs_tensor = tf_normalize(ph_obs, obs_space)

        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            normalized_obs = sess.run(obs_tensor, feed_dict={ph_obs: obs})

            self.assertEqual(normalized_obs.dtype, np.float32)
            self.assertLessEqual(normalized_obs.max(), 1.0)
            self.assertGreaterEqual(normalized_obs.min(), -1.0)

        tf.reset_default_graph()
        gc.collect()
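
# For reference, a normalization with the properties asserted above can be written as a
# single affine op (a sketch, not necessarily this repo's tf_normalize implementation):
def tf_normalize_sketch(obs, obs_space):
    """Map observations in [low, high] (e.g. uint8 in [0, 255]) to float32 in [-1, 1]."""
    low, high = float(obs_space.low.flat[0]), float(obs_space.high.flat[0])
    return 2.0 * (tf.to_float(obs) - low) / (high - low) - 1.0
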
    def __init__(self, make_env_func, params):
        super(AgentCuriousPPO, self).__init__(make_env_func, params)

        env = self.make_env_func()  # we need it to query observation shape, number of actions, etc.
        self.ph_next_observations = placeholder_from_space(
            main_observation_space(env))
        self.num_actions = env.action_space.n
        env.close()

        if self.params.curiosity_type == 'icm':
            # create graph for curiosity module (ICM)
            self.curiosity = IntrinsicCuriosityModule(
                env,
                self.ph_observations,
                self.ph_next_observations,
                self.ph_actions,
                params.forward_fc,
                params,
            )
        elif self.params.curiosity_type == 'ecr':
            self.curiosity = ECRModule(env, params)
        elif self.params.curiosity_type == 'ecr_map':
            self.curiosity = ECRMapModule(env, params)
        elif self.params.curiosity_type == 'rnd':
            self.curiosity = RandomNetworkDistillation(env,
                                                       self.ph_observations,
                                                       params)
        else:
            raise Exception(
                f'Curiosity type {self.params.curiosity_type} not supported')

        self.previous_actions = np.random.randint(0, self.num_actions,
                                                  self.params.num_envs)
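
# A construction sketch, reusing the Doom env factory from the test above; assumes a
# `params` object with `curiosity_type` set to one of the values handled in the
# if/elif chain:
def make_curious_agent_sketch(params):
    def make_env_func():
        return make_doom_env(doom_env_by_name(TEST_ENV_NAME))

    assert params.curiosity_type in ('icm', 'ecr', 'ecr_map', 'rnd')
    return AgentCuriousPPO(make_env_func, params)
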
    def __init__(self, env, ph_obs, params=None):
        """
        :param env
        :param ph_obs - placeholder for observations
        """
        with tf.variable_scope('rnd'):
            self.params = params
            self.ph_obs = ph_obs

            reg = None  # don't use regularization

            obs_space = main_observation_space(env)

            # target network: randomly initialized and never trained (note the stop_gradient)
            target_enc_params = get_enc_params(params, 'rnd_target')
            target_encoder = make_encoder(ph_obs, obs_space, reg, target_enc_params, name='target_encoder')
            self.tgt_features = tf.stop_gradient(target_encoder.encoded_input)

            # predictor network: trained to match the frozen target's features
            predictor_enc_params = get_enc_params(params, 'rnd_predictor')
            predictor_encoder = make_encoder(ph_obs, obs_space, reg, predictor_enc_params, name='predictor_encoder')
            self.predicted_features = predictor_encoder.encoded_input

            self.feature_vector_size = self.predicted_features.get_shape().as_list()[-1]
            log.info('Feature vector size in RND module: %d', self.feature_vector_size)

            self.objectives = self._objectives()

            self._add_summaries()
            self.summaries = merge_summaries(collections=['rnd'])

            self.step = tf.Variable(0, trainable=False, dtype=tf.int64, name='rnd_step')

            opt = tf.train.AdamOptimizer(learning_rate=self.params.learning_rate, name='rnd_opt')
            self.train_rnd = opt.minimize(self.objectives.loss, global_step=self.step)
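
# RND's novelty bonus is the predictor's error against the frozen random target network;
# a sketch built from the tensors defined above (reward normalization is omitted here):
def rnd_bonus_tensor(rnd_module):
    # per-observation squared error between predictor and stop-gradient target features
    return tf.reduce_mean(
        tf.square(rnd_module.predicted_features - rnd_module.tgt_features), axis=1,
    )
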
    def __init__(self, make_env_func, params):
        """Initialize PPO computation graph and some auxiliary tensors."""
        super(AgentPPO, self).__init__(params)

        self.actor_step = tf.Variable(0, trainable=False, dtype=tf.int64, name='actor_step')
        self.critic_step = tf.Variable(0, trainable=False, dtype=tf.int64, name='critic_step')

        self.make_env_func = make_env_func
        env = make_env_func()  # we need the env to query observation shape, number of actions, etc.

        obs_space = main_observation_space(env)
        self.obs_shape = [-1] + list(obs_space.shape)
        self.ph_observations = placeholder_from_space(obs_space)
        self.ph_actions = placeholder_from_space(env.action_space)  # actions sampled from the policy
        self.ph_advantages, self.ph_returns, self.ph_old_action_probs = placeholders(None, None, None)

        self.actor_critic = ActorCritic(env, self.ph_observations, self.params)

        env.close()

        self.objectives = self.add_ppo_objectives(
            self.actor_critic,
            self.ph_actions, self.ph_old_action_probs, self.ph_advantages, self.ph_returns,
            self.params,
            self.actor_step,
        )

        # optimizers
        actor_opt = tf.train.AdamOptimizer(learning_rate=self.params.learning_rate, name='actor_opt')
        self.train_actor = actor_opt.minimize(self.objectives.actor_loss, global_step=self.actor_step)

        critic_opt = tf.train.AdamOptimizer(learning_rate=self.params.learning_rate, name='critic_opt')
        self.train_critic = critic_opt.minimize(self.objectives.critic_loss, global_step=self.critic_step)

        self.add_ppo_summaries()

        summary_dir = summaries_dir(self.params.experiment_dir())
        self.summary_writer = tf.summary.FileWriter(summary_dir)
        self.actor_summaries = merge_summaries(collections=['actor'])
        self.critic_summaries = merge_summaries(collections=['critic'])

        if self.params.use_env_map:
            self.map_img, self.coord_limits = generate_env_map(make_env_func)
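
# One alternating optimization step, as a sketch; the real training loop with rollout
# collection, advantage estimation and minibatching lives elsewhere in the agent:
def ppo_train_step_sketch(agent, sess, obs, actions, old_action_probs, advantages, returns):
    feed = {
        agent.ph_observations: obs,
        agent.ph_actions: actions,
        agent.ph_old_action_probs: old_action_probs,
        agent.ph_advantages: advantages,
        agent.ph_returns: returns,
    }
    # actor and critic have separate optimizers and global steps (see above)
    sess.run(agent.train_actor, feed_dict=feed)
    sess.run(agent.train_critic, feed_dict=feed)
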
    def __init__(self, env, params):
        obs_space = main_observation_space(env)
        self.ph_obs_first, self.ph_obs_second = placeholders_from_spaces(
            obs_space, obs_space)
        self.ph_labels = tf.placeholder(dtype=tf.int32, shape=(None, ))
        self.ph_is_training = tf.placeholder(dtype=tf.bool, shape=[])

        with tf.variable_scope('distance') as scope:
            log.info('Distance network graph...')

            self.step = tf.Variable(0,
                                    trainable=False,
                                    dtype=tf.int64,
                                    name='dist_step')
            reg = tf.contrib.layers.l2_regularizer(scale=1e-5)
            summary_collections = ['dist']

            enc_params = EncoderParams()
            enc_params.enc_name = params.distance_encoder
            enc_params.batch_norm = params.distance_use_batch_norm
            enc_params.ph_is_training = self.ph_is_training
            enc_params.summary_collections = summary_collections

            encoder = tf.make_template(
                'siamese_enc',
                make_encoder,
                create_scope_now_=True,
                obs_space=obs_space,
                regularizer=reg,
                enc_params=enc_params,
            )

            obs_first_enc = encoder(self.ph_obs_first)
            obs_second_enc = encoder(self.ph_obs_second)

            self.first_encoded = obs_first_enc.encoded_input
            self.second_encoded = obs_second_enc.encoded_input

            encoder_reg_loss = 0.0
            if hasattr(obs_first_enc, 'reg_loss'):
                encoder_reg_loss = obs_first_enc.reg_loss

            observations_encoded = tf.concat(
                [self.first_encoded, self.second_encoded], axis=1)

            fc_layers = [params.distance_fc_size] * params.distance_fc_num
            x = observations_encoded
            for fc_layer_size in fc_layers:
                x = dense(
                    x,
                    fc_layer_size,
                    reg,
                    batch_norm=params.distance_use_batch_norm,
                    is_training=self.ph_is_training,
                )

            logits = tf.contrib.layers.fully_connected(x,
                                                       2,
                                                       activation_fn=None)
            self.probabilities = tf.nn.softmax(logits)
            self.correct = tf.reduce_mean(
                tf.to_float(
                    tf.equal(self.ph_labels,
                             tf.cast(tf.argmax(logits, axis=1), tf.int32))), )

            self.dist_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=logits, labels=self.ph_labels)
            self.dist_loss = tf.reduce_mean(self.dist_loss)

            reg_losses = tf.losses.get_regularization_losses(scope=scope.name)
            self.reg_loss = tf.reduce_sum(reg_losses) + encoder_reg_loss

            self.loss = self.dist_loss + self.reg_loss

            # helpers to encode observations (saves time)
            # does not matter if we use first vs second here
            self.ph_obs = self.ph_obs_first
            self.encoded_observation = self.first_encoded

            self._add_summaries(summary_collections)
            self.summaries = merge_summaries(collections=summary_collections)

            opt = tf.train.AdamOptimizer(learning_rate=params.learning_rate,
                                         name='dist_opt')

            with tf.control_dependencies(
                    tf.get_collection(tf.GraphKeys.UPDATE_OPS,
                                      scope='distance')):
                self.train_op = opt.minimize(self.loss, global_step=self.step)

        # other stuff not related to computation graph
        self.obs_encoder = ObservationEncoder(
            encode_func=self.encode_observation)
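
# A sketch of one supervised step for the distance (reachability) classifier; labels
# are int32 class ids for observation pairs (the close/far labeling convention is an
# assumption about how batches are built elsewhere in this repo):
def distance_train_step_sketch(distance_net, sess, obs_first, obs_second, labels):
    loss, _ = sess.run(
        [distance_net.loss, distance_net.train_op],
        feed_dict={
            distance_net.ph_obs_first: obs_first,
            distance_net.ph_obs_second: obs_second,
            distance_net.ph_labels: labels,
            distance_net.ph_is_training: True,  # enables batch-norm updates via UPDATE_OPS
        },
    )
    return loss
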
    def __init__(self, env, params):
        obs_space = main_observation_space(env)
        self.ph_obs_prev, self.ph_obs_curr, self.ph_obs_goal = placeholders_from_spaces(
            obs_space, obs_space, obs_space)
        self.ph_actions = placeholder_from_space(env.action_space)
        self.ph_is_training = tf.placeholder(dtype=tf.bool, shape=[])

        with tf.variable_scope('loco') as scope:
            log.info('Locomotion network graph...')

            self.step = tf.Variable(0,
                                    trainable=False,
                                    dtype=tf.int64,
                                    name='loco_step')

            reg = tf.contrib.layers.l2_regularizer(scale=1e-5)

            enc_params = EncoderParams()
            enc_params.enc_name = params.locomotion_encoder
            enc_params.batch_norm = params.locomotion_use_batch_norm
            enc_params.ph_is_training = self.ph_is_training
            enc_params.summary_collections = ['loco']

            encoder = tf.make_template(
                'enc_loco',
                make_encoder,
                create_scope_now_=True,
                obs_space=obs_space,
                regularizer=reg,
                enc_params=enc_params,
            )

            if params.locomotion_siamese:
                obs_curr_encoder = encoder(self.ph_obs_curr)
                obs_curr_encoded = obs_curr_encoder.encoded_input

                obs_goal_encoder = encoder(self.ph_obs_goal)
                obs_goal_encoded = obs_goal_encoder.encoded_input

                obs_encoder = obs_curr_encoder  # any of the two
                obs_encoded = tf.concat([obs_curr_encoded, obs_goal_encoded],
                                        axis=1)
            else:
                if params.locomotion_use_prev:
                    obs_concat = tf.concat(
                        [self.ph_obs_prev, self.ph_obs_curr, self.ph_obs_goal],
                        axis=3)
                else:
                    obs_concat = tf.concat(
                        [self.ph_obs_curr, self.ph_obs_goal], axis=3)

                obs_encoder = encoder(obs_concat)
                obs_encoded = obs_encoder.encoded_input

            encoder_reg_loss = 0.0
            if hasattr(obs_encoder, 'reg_loss'):
                encoder_reg_loss = obs_encoder.reg_loss

            fc_layers = [params.locomotion_fc_size] * params.locomotion_fc_num
            x = obs_encoded
            for fc_layer_size in fc_layers:
                x = dense(
                    x,
                    fc_layer_size,
                    regularizer=reg,
                    batch_norm=params.locomotion_use_batch_norm,
                    is_training=self.ph_is_training,
                )

            action_logits = tf.layers.dense(x,
                                            env.action_space.n,
                                            activation=None)
            self.actions_distribution = CategoricalProbabilityDistribution(
                action_logits)
            self.best_action_deterministic = tf.argmax(action_logits, axis=1)
            self.act = self.actions_distribution.sample()

            self.correct = tf.reduce_mean(
                tf.to_float(
                    tf.equal(
                        self.ph_actions,
                        tf.cast(tf.argmax(action_logits, axis=1),
                                tf.int32))), )

            actions_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels=self.ph_actions, logits=action_logits)
            self.actions_loss = tf.reduce_mean(actions_loss)

            reg_losses = tf.losses.get_regularization_losses(scope=scope.name)
            self.reg_loss = tf.reduce_sum(reg_losses) + encoder_reg_loss

            self.loss = self.actions_loss + self.reg_loss

            loco_opt = tf.train.AdamOptimizer(
                learning_rate=params.learning_rate, name='loco_opt')

            with tf.control_dependencies(
                    tf.get_collection(tf.GraphKeys.UPDATE_OPS, scope='loco')):
                self.train_loco = loco_opt.minimize(self.loss,
                                                    global_step=self.step)
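
# A sketch of querying the locomotion policy for a single deterministic step toward a
# goal observation (when params.locomotion_siamese is set, the unused ph_obs_prev is
# still safe to feed in TF1):
def loco_step_sketch(loco, sess, obs_prev, obs_curr, obs_goal):
    return sess.run(
        loco.best_action_deterministic,
        feed_dict={
            loco.ph_obs_prev: [obs_prev],
            loco.ph_obs_curr: [obs_curr],
            loco.ph_obs_goal: [obs_goal],
            loco.ph_is_training: False,
        },
    )[0]
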
    def __init__(self,
                 env,
                 ph_obs,
                 ph_next_obs,
                 ph_actions,
                 forward_fc=256,
                 params=None):
        """
        :param ph_obs - placeholder for observations
        :param ph_actions - placeholder for selected actions
        """
        with tf.variable_scope('icm'):
            self.params = params

            self.ph_obs = ph_obs
            self.ph_next_obs = ph_next_obs
            self.ph_actions = ph_actions

            reg = None  # don't use regularization

            obs_space = main_observation_space(env)
            num_actions = env.action_space.n

            enc_params = get_enc_params(params, 'icm_enc')
            encoder_template = tf.make_template(
                'obs_encoder',
                make_encoder,
                create_scope_now_=True,
                obs_space=obs_space,
                regularizer=reg,
                enc_params=enc_params,
            )

            encoder_obs = encoder_template(ph_obs)
            encoder_next_obs = encoder_template(ph_next_obs)

            encoded_obs = encoder_obs.encoded_input
            self.encoded_next_obs = encoder_next_obs.encoded_input

            self.feature_vector_size = encoded_obs.get_shape().as_list()[-1]
            log.info('Feature vector size in ICM module: %d',
                     self.feature_vector_size)

            actions_one_hot = tf.one_hot(ph_actions, num_actions)

            # forward model
            forward_model_input = tf.concat([encoded_obs, actions_one_hot],
                                            axis=1)
            forward_model_hidden = dense(forward_model_input, forward_fc, reg)
            forward_model_hidden = dense(forward_model_hidden, forward_fc, reg)
            forward_model_output = tf.contrib.layers.fully_connected(
                forward_model_hidden,
                self.feature_vector_size,
                activation_fn=None,
            )
            self.predicted_features = forward_model_output

            # inverse model
            inverse_model_input = tf.concat(
                [encoded_obs, self.encoded_next_obs], axis=1)
            inverse_model_hidden = dense(inverse_model_input, 256, reg)
            inverse_model_output = tf.contrib.layers.fully_connected(
                inverse_model_hidden,
                num_actions,
                activation_fn=None,
            )
            self.predicted_actions = inverse_model_output

            self.objectives = self._objectives()

            self._add_summaries()
            self.summaries = merge_summaries(collections=['icm'])

            self.step = tf.Variable(0,
                                    trainable=False,
                                    dtype=tf.int64,
                                    name='icm_step')

            opt = tf.train.AdamOptimizer(
                learning_rate=self.params.learning_rate, name='icm_opt')
            self.train_icm = opt.minimize(self.objectives.loss,
                                          global_step=self.step)
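
# ICM's intrinsic reward is the forward model's prediction error in feature space; a
# sketch from the tensors defined above (the 0.5 scale follows the ICM paper, but the
# exact scale this repo uses is an assumption):
def icm_bonus_tensor(icm):
    prediction_error = tf.reduce_sum(
        tf.square(icm.encoded_next_obs - icm.predicted_features), axis=1,
    )
    return 0.5 * prediction_error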