Example No. 1
    def make_duel_critics(self,
                          obs=None,
                          action=None,
                          reuse=False,
                          scope="duel_values_fn"):
        if obs is None:
            obs = self.processed_obs

        with tf.variable_scope(scope, reuse=reuse):
            if self.feature_extraction == "cnn":
                critics_h_duel = self.cnn_extractor(obs, **self.cnn_kwargs)
            else:
                critics_h_duel = tf.layers.flatten(obs)

            # Concatenate preprocessed state and action
            qf_h_duel = tf.concat([critics_h_duel, action], axis=-1)

            # Double Q values to reduce overestimation
            with tf.variable_scope('qf1', reuse=reuse):
                qf1_h_duel = mlp(qf_h_duel, self.layers, self.activ_fn, layer_norm=self.layer_norm)
                qf1_duel = tf.layers.dense(qf1_h_duel, 1, name="qf1")

            with tf.variable_scope('qf2', reuse=reuse):
                qf2_h_duel = mlp(qf_h_duel, self.layers, self.activ_fn, layer_norm=self.layer_norm)
                qf2_duel = tf.layers.dense(qf2_h_duel, 1, name="qf2")

            self.qf1_duel = qf1_duel
            self.qf2_duel = qf2_duel

        return self.qf1_duel, self.qf2_duel
Example No. 2
    def make_critics(self, obs=None, action=None, reuse=False, scope="values_fn"):
        if obs is None:
            obs = self.processed_obs

        with tf.variable_scope(scope, reuse=reuse):
            if self.feature_extraction == "cnn":
                critics_h = self.cnn_extractor(obs, **self.cnn_kwargs)
            else:
                critics_h = tf.layers.flatten(obs)

            # Concatenate preprocessed state and action
            qf_h = tf.concat([critics_h, action], axis=-1)

            # Double Q values to reduce overestimation
            with tf.variable_scope('qf1', reuse=reuse):
                qf1_h = mlp(qf_h, self.layers, self.activ_fn, layer_norm=self.layer_norm)
                qf1 = tf.layers.dense(qf1_h, 1, name="qf1")

            with tf.variable_scope('qf2', reuse=reuse):
                qf2_h = mlp(qf_h, self.layers, self.activ_fn, layer_norm=self.layer_norm)
                qf2 = tf.layers.dense(qf2_h, 1, name="qf2")

            self.qf1 = qf1
            self.qf2 = qf2

        return self.qf1, self.qf2
Example No. 3
    def make_critics(self,
                     obs=None,
                     action=None,
                     reuse=False,
                     scope="values_fn",
                     create_vf=True,
                     create_qf=True):
        if obs is None:
            obs = self.processed_obs

        with tf.variable_scope(scope, reuse=reuse):
            if self.feature_extraction == "cnn":
                critics_h = self.cnn_extractor(obs, **self.cnn_kwargs)
            else:
                critics_h = tf.layers.flatten(obs)

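            # Both heads below output n_spt softmax probabilities instead of a
            # single scalar, i.e. a distributional (categorical) output,
            # presumably one entry per support atom.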
            if create_vf:
                # Value function
                with tf.variable_scope('vf', reuse=reuse):
                    vf_h = mlp(critics_h,
                               self.layers,
                               self.activ_fn,
                               layer_norm=self.layer_norm)
                    value_fn = tf.layers.dense(vf_h,
                                               self.n_spt,
                                               name="vf",
                                               activation="softmax")
                self.value_fn = value_fn

            if create_qf:
                # Concatenate preprocessed state and action
                qf_h = tf.concat([critics_h, action], axis=-1)

                # Double Q values to reduce overestimation
                with tf.variable_scope('qf1', reuse=reuse):
                    qf1_h = mlp(qf_h,
                                self.layers,
                                self.activ_fn,
                                layer_norm=self.layer_norm)
                    qf1 = tf.layers.dense(qf1_h,
                                          self.n_spt,
                                          name="qf1",
                                          activation="softmax")

                with tf.variable_scope('qf2', reuse=reuse):
                    qf2_h = mlp(qf_h,
                                self.layers,
                                self.activ_fn,
                                layer_norm=self.layer_norm)
                    qf2 = tf.layers.dense(qf2_h,
                                          self.n_spt,
                                          name="qf2",
                                          activation="softmax")

                self.qf1 = qf1
                self.qf2 = qf2

        return self.qf1, self.qf2, self.value_fn
Example No. 4
    def setup_model(self):
        self.graph = tf.Graph()

        with self.graph.as_default():
            self.sess = tf_util.make_session(num_cpu=None, graph=self.graph)
            self.observation_ph, self.processed_obs = observation_input(
                self.venv.observation_space,
                scale=(self.network_type == "cnn"))

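            # Random Network Distillation-style setup: a randomly initialized,
            # frozen target network and a trained predictor network; the
            # predictor's error in matching the target serves as the intrinsic
            # reward below.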
            with tf.variable_scope("target_model"):
                if self.network_type == 'cnn':
                    self.target_network = small_convnet(
                        self.processed_obs, tf.nn.leaky_relu)
                elif self.network_type == 'mlp':
                    self.target_network = tf_layers.mlp(
                        self.processed_obs, [1024, 512])
                    self.target_network = tf_layers.linear(
                        self.target_network, "out", 512)
                else:
                    raise ValueError("Unknown network type {}!".format(
                        self.network_type))

            with tf.variable_scope("predictor_model"):
                if self.network_type == 'cnn':
                    self.predictor_network = tf.nn.relu(
                        small_convnet(self.processed_obs, tf.nn.leaky_relu))
                elif self.network_type == 'mlp':
                    self.predictor_network = tf_layers.mlp(
                        self.processed_obs, [1024, 512])

                self.predictor_network = tf.nn.relu(
                    tf_layers.linear(self.predictor_network, "pred_fc1", 512))
                self.predictor_network = tf_layers.linear(
                    self.predictor_network, "out", 512)

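            # int_reward is the per-sample squared prediction error (the target
            # is frozen via stop_gradient); aux_loss is its batch mean and is
            # the only quantity minimized by the optimizer below.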
            with tf.name_scope("loss"):
                self.int_reward = tf.reduce_mean(tf.square(
                    tf.stop_gradient(self.target_network) -
                    self.predictor_network),
                                                 axis=1)
                self.aux_loss = tf.reduce_mean(
                    tf.square(
                        tf.stop_gradient(self.target_network) -
                        self.predictor_network))

            with tf.name_scope("train"):
                self.optimizer = tf.train.AdamOptimizer(self.learning_rate)
                self.training_op = self.optimizer.minimize(self.aux_loss)

            self.params = tf.trainable_variables()
            tf.global_variables_initializer().run(session=self.sess)
Example No. 5
    def make_quantile_function(self,
                               obs=None,
                               action=None,
                               reuse=False,
                               scope="quantile_fn",
                               n_support=64):
        if obs is None:
            obs = self.processed_obs

        with tf.variable_scope(scope, reuse=reuse):
            if self.feature_extraction == "cnn":
                critics_h = self.cnn_extractor(obs, **self.cnn_kwargs)
            else:
                critics_h = tf.layers.flatten(obs)

            # Concatenate preprocessed state and action
            qi_h = tf.concat([critics_h, action], axis=-1)

            qi_h = mlp(qi_h,
                       self.layers,
                       self.activ_fn,
                       layer_norm=self.layer_norm)
            #qi_h = tf.layers.dense(qi_h, n_support, name="qi")
            #logqi = tf.nn.log_softmax(qi_h,axis=1)
            #qi = tf.exp(logqi)
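            # Fraction-proposal head (FQF-style): qi is a softmax over n_support
            # probabilities, tau is their cumulative sum (monotone fractions in
            # [0, 1], prepended with 0), tau_hats are the midpoints between
            # consecutive fractions, and entropies is the entropy of qi.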
            qi = tf.layers.dense(qi_h, n_support, tf.nn.softmax, name="qi")
            logqi = tf.log(qi)
            tau = tf.math.cumsum(qi, axis=1)
            tau = tf.concat([tf.zeros([tf.shape(tau)[0], 1]), tau], axis=-1)
            tau_hats = tf.stop_gradient((tau[:, :-1] + tau[:, 1:]) / 2.0)
            entropies = -tf.reduce_sum(logqi * qi, axis=-1, keepdims=True)
        return qi, tau, tau_hats, entropies
Example No. 6
    def make_actor(self, obs=None, reuse=False, scope="pi"):
        if obs is None:
            obs = self.processed_obs

        with tf.variable_scope(scope, reuse=reuse):
            if self.feature_extraction == "cnn":
                pi_h = self.cnn_extractor(obs, **self.cnn_kwargs)
            else:
                pi_h = tf.layers.flatten(obs)

            pi_h = mlp(pi_h,
                       self.layers,
                       self.activ_fn,
                       layer_norm=self.layer_norm)

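            # Deterministic actor head (DDPG/TD3-style): one tanh-squashed
            # output per action dimension; on reuse, self.policy is not
            # overwritten.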
            if reuse:
                policy = tf.layers.dense(pi_h,
                                         self.ac_space.shape[0],
                                         activation=tf.tanh)
            else:
                self.policy = policy = tf.layers.dense(pi_h,
                                                       self.ac_space.shape[0],
                                                       activation=tf.tanh)

        return policy
Example No. 7
    def make_actor(self, obs=None, reuse=False, scope="pi"):
        if obs is None:
            obs = self.processed_obs

        if self.obs_module_indices is not None:
            obs = tf.gather(obs, self.obs_module_indices["pi"], axis=-1)

        with tf.variable_scope(scope, reuse=reuse):
            if self.feature_extraction == "cnn":
                pi_h = self.cnn_extractor(obs,
                                          name="pi_c1",
                                          act_fun=self.activ_fn,
                                          **self.cnn_kwargs)
            else:
                pi_h = tf.layers.flatten(obs)

            pi_h = mlp(pi_h,
                       self.layers,
                       self.activ_fn,
                       layer_norm=self.layer_norm)

            self.act_mu = mu_ = tf.layers.dense(pi_h,
                                                self.ac_space.shape[0],
                                                activation=None)
            # Important difference with SAC and other algo such as PPO:
            # the std depends on the state, so we cannot use stable_baselines.common.distribution
            log_std = tf.layers.dense(pi_h,
                                      self.ac_space.shape[0],
                                      activation=None)

        # Regularize policy output (not used for now)
        # reg_loss = self.reg_weight * 0.5 * tf.reduce_mean(log_std ** 2)
        # reg_loss += self.reg_weight * 0.5 * tf.reduce_mean(mu ** 2)
        # self.reg_loss = reg_loss

        # OpenAI Variation to cap the standard deviation
        # activation = tf.tanh # for log_std
        # log_std = LOG_STD_MIN + 0.5 * (LOG_STD_MAX - LOG_STD_MIN) * (log_std + 1)
        # Original Implementation
        log_std = tf.clip_by_value(log_std, LOG_STD_MIN, LOG_STD_MAX)

        self.std = std = tf.exp(log_std)
        # Reparameterization trick
        pi_ = mu_ + tf.random_normal(tf.shape(mu_)) * std
        logp_pi = gaussian_likelihood(pi_, mu_, log_std)
        self.entropy = gaussian_entropy(log_std)
        # MISSING: reg params for log and mu
        # Apply squashing and account for it in the probability
        deterministic_policy, policy, logp_pi = apply_squashing_func(
            mu_, pi_, logp_pi)
        self.policy = policy
        self.deterministic_policy = deterministic_policy

        return deterministic_policy, policy, logp_pi
Example No. 8
    def make_critics(self,
                     obs=None,
                     action=None,
                     reuse=False,
                     scope="values_fn",
                     model_type="QR",
                     iqn_tau=None,
                     n_support=64):
        if obs is None:
            obs = self.processed_obs

        with tf.variable_scope(scope, reuse=reuse):
            if self.feature_extraction == "cnn":
                critics_h = self.cnn_extractor(obs, **self.cnn_kwargs)
            else:
                critics_h = tf.layers.flatten(obs)

            # Concatenate preprocessed state and action
            qf_h = tf.concat([critics_h, action], axis=-1)

            # Double Q values to reduce overestimation
            if model_type == "QR":

                with tf.variable_scope('qf1', reuse=reuse):
                    qf1_h = mlp(qf_h,
                                self.layers,
                                self.activ_fn,
                                layer_norm=self.layer_norm)
                    #qf1_h = mlp_ficnn(qf_h, self.layers, self.activ_fn, layer_norm=self.layer_norm)
                    qf1 = tf.layers.dense(qf1_h, n_support, name="qf1")

                with tf.variable_scope('qf2', reuse=reuse):
                    qf2_h = mlp(qf_h,
                                self.layers,
                                self.activ_fn,
                                layer_norm=self.layer_norm)
                    #qf2_h = mlp_ficnn(qf_h, self.layers, self.activ_fn, layer_norm=self.layer_norm)
                    qf2 = tf.layers.dense(qf2_h, n_support, name="qf2")
            elif model_type == "IQN" or model_type == "FQF":
                embeding_size = self.layers[0]
                pi_mtx = tf.constant(np.expand_dims(np.pi * np.arange(0, 128),
                                                    axis=0),
                                     dtype=tf.float32)
                costau = tf.cos(tf.matmul(tf.reshape(iqn_tau, [-1, 1]),
                                          pi_mtx))
                with tf.variable_scope('qf1', reuse=reuse):
                    qf1_h_embeding = tf.layers.dense(
                        inputs=qf_h,
                        units=embeding_size,
                        activation=self.activ_fn,
                        name="qf1_embeding",
                        kernel_initializer=tf.initializers.random_normal(
                            stddev=0.01))
                    phi1 = tf.layers.dense(
                        costau,
                        embeding_size,
                        self.activ_fn,
                        name="cos_embeding",
                        kernel_initializer=tf.initializers.random_normal(
                            stddev=0.01))
                    if self.layer_norm:
                        qf1_h_embeding = tf.contrib.layers.layer_norm(
                            qf1_h_embeding, center=True, scale=True)
                        phi1 = tf.contrib.layers.layer_norm(phi1,
                                                            center=True,
                                                            scale=True)
                    critics1_h_embeding = tf.reshape(tf.tile(
                        qf1_h_embeding, [1, iqn_tau.shape[1]]),
                                                     shape=[-1, embeding_size])
                    qf1_h = tf.multiply(critics1_h_embeding, phi1)
                    if len(self.layers) > 1:
                        qf1_h = mlp(qf1_h,
                                    self.layers[1:],
                                    self.activ_fn,
                                    layer_norm=self.layer_norm)
                        #qf1_h = mlp(qf1_h, self.layers, self.activ_fn, layer_norm=self.layer_norm)
                    qf1 = tf.reshape(tf.layers.dense(qf1_h, 1),
                                     [-1, iqn_tau.shape[1]],
                                     name="qf1")

                with tf.variable_scope('qf2', reuse=reuse):
                    qf2_h_embeding = tf.layers.dense(
                        inputs=qf_h,
                        units=embeding_size,
                        activation=self.activ_fn,
                        name="qf2_embeding",
                        kernel_initializer=tf.initializers.random_normal(
                            stddev=0.01))
                    phi2 = tf.layers.dense(
                        costau,
                        embeding_size,
                        self.activ_fn,
                        name="cos_embeding",
                        kernel_initializer=tf.initializers.random_normal(
                            stddev=0.01))
                    if self.layer_norm:
                        qf2_h_embeding = tf.contrib.layers.layer_norm(
                            qf2_h_embeding, center=True, scale=True)
                        phi2 = tf.contrib.layers.layer_norm(phi2,
                                                            center=True,
                                                            scale=True)
                    critics2_h_embeding = tf.reshape(tf.tile(
                        qf2_h_embeding, [1, iqn_tau.shape[1]]),
                                                     shape=[-1, embeding_size])
                    qf2_h = tf.multiply(critics2_h_embeding, phi2)
                    if len(self.layers) > 1:
                        qf2_h = mlp(qf2_h,
                                    self.layers[1:],
                                    self.activ_fn,
                                    layer_norm=self.layer_norm)
                        #qf2_h = mlp(qf2_h, self.layers, self.activ_fn, layer_norm=self.layer_norm)
                    qf2 = tf.reshape(tf.layers.dense(qf2_h, 1),
                                     [-1, iqn_tau.shape[1]],
                                     name="qf2")
            else:
                raise ValueError("Unknown model type {}: "
                                 "expected 'QR', 'IQN' or 'FQF'".format(model_type))

            self.qf1 = qf1
            self.qf2 = qf2

        return self.qf1, self.qf2