Code example #1
File: learner2.py  Project: liyuan9988/IVOPEwithACME
    def update_final_weight(self, stage1_input, stage2_input):
        current_obs_1st, action_1st, _, discount_1st, next_obs_1st = stage1_input[:5]
        current_obs_2nd, action_2nd, reward_2nd = stage2_input[:3]
        next_action_1st = self.policy(next_obs_1st)
        discount_1st = tf.expand_dims(discount_1st, axis=1)

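        # Instrumental (stage-1) features for both stages of data;
        # training=False keeps the feature nets in inference mode.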
        instrumental_feature_1st = self.instrumental_feature(
            obs=current_obs_1st, action=action_1st, training=False)
        instrumental_feature_2nd = self.instrumental_feature(
            obs=current_obs_2nd, action=action_2nd, training=False)

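        # Stage 1: build the target feature phi(s, a) - gamma * d * phi(s', pi(s'))
        # (each with a constant column) and regress it on the instrumental features.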
        target_1st = discount_1st * add_const_col(
            self.value_feature(
                obs=next_obs_1st, action=next_action_1st, training=True))
        target_1st = add_const_col(
            self.value_feature(obs=current_obs_1st,
                               action=action_1st,
                               training=True)) - self.discount * target_1st
        stage1_weight = fit_linear(target_1st, instrumental_feature_1st,
                                   self.stage1_reg)
        self.stage1_weight.assign(stage1_weight)
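        # Stage 2: predict the target feature on the second-stage data
        # and regress rewards onto it.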
        predicted_feature = linear_reg_pred(instrumental_feature_2nd,
                                            stage1_weight)
        stage2_weight = fit_linear(tf.expand_dims(reward_2nd, -1),
                                   predicted_feature, self.stage2_reg)
        self.value_func.weight.assign(stage2_weight)

        return stage1_weight, stage2_weight
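These snippets rely on a few helpers that are not shown on this page: add_const_col, fit_linear, linear_reg_pred, and linear_reg_loss (the last appears in the following examples). Below is a minimal sketch of plausible definitions, assuming the standard closed-form ridge regression; the actual implementations live in the project's utility modules.

import tensorflow as tf

def add_const_col(feature):
    # Append a constant 1 column so the linear model has a bias term.
    return tf.concat([feature, tf.ones_like(feature[:, :1])], axis=1)

def fit_linear(target, feature, reg):
    # Closed-form ridge regression: (X^T X + reg * I)^{-1} X^T y.
    a = tf.matmul(feature, feature, transpose_a=True)
    a += reg * tf.eye(tf.shape(a)[0], dtype=a.dtype)
    b = tf.matmul(feature, target, transpose_a=True)
    return tf.linalg.solve(a, b)

def linear_reg_pred(feature, weight):
    # Linear prediction X @ w.
    return tf.matmul(feature, weight)

def linear_reg_loss(target, feature, reg):
    # Residual of the closed-form fit plus the ridge penalty on the weights.
    weight = fit_linear(target, feature, reg)
    pred = linear_reg_pred(feature, weight)
    return tf.reduce_sum(tf.square(target - pred)) + reg * tf.reduce_sum(tf.square(weight))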
Code example #2
File: learner2.py  Project: liyuan9988/IVOPEwithACME
    def update_value(self, stage1_input, stage2_input):
        current_obs_1st, action_1st, _, discount_1st, next_obs_1st = stage1_input[:5]
        current_obs_2nd, action_2nd, reward_2nd = stage2_input[:3]
        next_action_1st = self.policy(next_obs_1st)
        discount_1st = tf.expand_dims(discount_1st, axis=1)

        instrumental_feature_1st = self.instrumental_feature(
            obs=current_obs_1st, action=action_1st, training=False)
        instrumental_feature_2nd = self.instrumental_feature(
            obs=current_obs_2nd, action=action_2nd, training=False)
        l2 = snt.regularizers.L2(self.value_reg)
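        # The stage-1 weights are re-fit inside the tape, so the stage-2 loss is
        # differentiated end-to-end with respect to the value-feature network.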
        with tf.GradientTape() as tape:
            target_1st = discount_1st * add_const_col(
                self.value_feature(
                    obs=next_obs_1st, action=next_action_1st, training=True))
            target_1st = add_const_col(
                self.value_feature(obs=current_obs_1st,
                                   action=action_1st,
                                   training=True)) - self.discount * target_1st
            stage1_weight = fit_linear(target_1st, instrumental_feature_1st,
                                       self.stage1_reg)
            predicted_feature = linear_reg_pred(instrumental_feature_2nd,
                                                stage1_weight)
            loss = linear_reg_loss(tf.expand_dims(reward_2nd, -1),
                                   predicted_feature, self.stage2_reg)
            loss = loss + l2(self.value_feature.trainable_variables)
            loss /= action_2nd.shape[0]

        gradient = tape.gradient(loss, self.value_feature.trainable_variables)
        self._value_func_optimizer.apply(
            gradient, self.value_feature.trainable_variables)
        return loss
Code example #3
File: learner.py  Project: liyuan9988/IVOPEwithACME
    def cal_validation_err(self, valid_input):
        """Return prediction MSE on the validation dataset."""
        stage1_weight = self.stage1_weight
        stage2_weight = self.value_func.weight
        se_sum = 0.
        se2_sum = 0.
        weight_sum = 0.
        for sample in valid_input:
            data = sample.data
            current_obs, action, reward = data[:3]
            d_tm1 = self._get_d_tm1(data)
            d_tm1 = tf.expand_dims(d_tm1, axis=1)
            instrumental_feature = self.instrumental_feature(obs=current_obs,
                                                             action=action,
                                                             training=False)
            predicted_feature = linear_reg_pred(instrumental_feature,
                                                stage1_weight)
            current_feature = add_const_col(
                self.value_feature(obs=current_obs,
                                   action=action,
                                   training=True))
            predicted_feature = current_feature - d_tm1 * self.discount * predicted_feature
            predict = linear_reg_pred(predicted_feature, stage2_weight)

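            # Samples with d_tm1 == 0 are re-weighted by d_tm1_weight; the weight is
            # squared to match the weighted least-squares objective used in training.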
            weight = d_tm1 + (1.0 - d_tm1) * tf.convert_to_tensor(
                self.d_tm1_weight, dtype=tf.float32)
            weight = tf.square(weight)
            sq_err = tf.square(tf.expand_dims(reward, -1) - predict)
            se_sum += tf.reduce_sum(weight * sq_err)
            se2_sum += tf.reduce_sum(weight * tf.square(sq_err))
            weight_sum += tf.reduce_sum(weight)
        mse = se_sum / weight_sum
        mse_err_std = tf.sqrt((se2_sum / weight_sum - mse**2) / weight_sum)
        return mse, mse_err_std
Code example #4
File: learner.py  Project: liyuan9988/IVOPEwithACME
    def update_value(self, stage1_input, stage2_input):
        current_obs_1st, action_1st, _, discount_1st, next_obs_1st = stage1_input[:5]
        d_tm1_1st = self._get_d_tm1(stage1_input)
        current_obs_2nd, action_2nd, reward_2nd = stage2_input[:3]
        d_tm1_2nd = self._get_d_tm1(stage2_input)
        next_action_1st = self.policy(next_obs_1st)
        discount_1st = tf.expand_dims(discount_1st, axis=1)
        d_tm1_1st = tf.expand_dims(d_tm1_1st, axis=1)
        d_tm1_2nd = tf.expand_dims(d_tm1_2nd, axis=1)

        instrumental_feature_1st = self.instrumental_feature(
            obs=current_obs_1st, action=action_1st, training=False)
        instrumental_feature_1st = d_tm1_1st * instrumental_feature_1st
        instrumental_feature_2nd = self.instrumental_feature(
            obs=current_obs_2nd, action=action_2nd, training=False)
        l2 = snt.regularizers.L2(self.value_reg)
        with tf.GradientTape() as tape:
            # target_1st = discount_1st * self.value_feature(obs=next_obs_1st, action=next_action_1st, training=True)
            target_1st = d_tm1_1st * discount_1st * add_const_col(
                self.value_feature(
                    obs=next_obs_1st, action=next_action_1st, training=True))
            stage1_weight = fit_linear(target_1st, instrumental_feature_1st,
                                       self.stage1_reg)
            predicted_feature = linear_reg_pred(instrumental_feature_2nd,
                                                stage1_weight)
            # current_feature = self.value_feature(obs=current_obs_2nd, action=action_2nd, training=True)
            current_feature = add_const_col(
                self.value_feature(obs=current_obs_2nd,
                                   action=action_2nd,
                                   training=True))
            predicted_feature = current_feature - d_tm1_2nd * self.discount * predicted_feature
            # loss = linear_reg_loss(tf.expand_dims(reward_2nd, -1), predicted_feature, self.stage2_reg)

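            # Re-weight both the regression targets and the features by the
            # d_tm1-based weights before taking the stage-2 loss.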
            weight = d_tm1_2nd + (1.0 - d_tm1_2nd) * tf.convert_to_tensor(
                self.d_tm1_weight, dtype=tf.float32)
            loss = linear_reg_loss(weight * tf.expand_dims(reward_2nd, -1),
                                   weight * predicted_feature, self.stage2_reg)

            loss = loss + l2(self.value_feature.trainable_variables)
            loss /= action_2nd.shape[0]

        gradient = tape.gradient(loss, self.value_feature.trainable_variables)
        self._value_func_optimizer.apply(
            gradient, self.value_feature.trainable_variables)
        return loss
Code example #5
File: learner.py  Project: liyuan9988/IVOPEwithACME
    def update_final_weight(self, stage1_input, stage2_input):
        current_obs_1st, action_1st, _, discount_1st, next_obs_1st = stage1_input[:5]
        d_tm1_1st = self._get_d_tm1(stage1_input)
        current_obs_2nd, action_2nd, reward_2nd = stage2_input[:3]
        d_tm1_2nd = self._get_d_tm1(stage2_input)
        next_action_1st = self.policy(next_obs_1st)
        discount_1st = tf.expand_dims(discount_1st, axis=1)
        d_tm1_1st = tf.expand_dims(d_tm1_1st, axis=1)
        d_tm1_2nd = tf.expand_dims(d_tm1_2nd, axis=1)

        instrumental_feature_1st = self.instrumental_feature(
            obs=current_obs_1st, action=action_1st, training=False)
        instrumental_feature_1st = d_tm1_1st * instrumental_feature_1st
        instrumental_feature_2nd = self.instrumental_feature(
            obs=current_obs_2nd, action=action_2nd, training=False)

        # target_1st = discount_1st * self.value_feature(obs=next_obs_1st, action=next_action_1st, training=False)
        target_1st = d_tm1_1st * discount_1st * add_const_col(
            self.value_feature(
                obs=next_obs_1st, action=next_action_1st, training=False))
        stage1_weight = fit_linear(target_1st, instrumental_feature_1st,
                                   self.stage1_reg)
        self.stage1_weight.assign(stage1_weight)
        predicted_feature = linear_reg_pred(instrumental_feature_2nd,
                                            stage1_weight)
        # current_feature = self.value_feature(obs=current_obs_2nd, action=action_2nd, training=False)
        current_feature = add_const_col(
            self.value_feature(obs=current_obs_2nd,
                               action=action_2nd,
                               training=False))
        # predicted_feature = add_const_col(current_feature) - self.discount * add_const_col(predicted_feature)
        predicted_feature = current_feature - d_tm1_2nd * self.discount * predicted_feature
        # self.value_func._weight.assign(
        #     fit_linear(tf.expand_dims(reward_2nd, -1), predicted_feature, self.stage2_reg))

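        # Same d_tm1 re-weighting as in update_value, here applied to the
        # closed-form stage-2 fit.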
        weight = d_tm1_2nd + (1.0 - d_tm1_2nd) * tf.convert_to_tensor(
            self.d_tm1_weight, dtype=tf.float32)
        stage2_weight = fit_linear(weight * tf.expand_dims(reward_2nd, -1),
                                   weight * predicted_feature, self.stage2_reg)
        self.value_func.weight.assign(stage2_weight)

        return stage1_weight, stage2_weight
Code example #6
File: learner2.py  Project: liyuan9988/IVOPEwithACME
    def update_instrumental(self, current_obs, action, reward, discount,
                            next_obs):
        next_action = self.policy(next_obs)
        discount = tf.expand_dims(discount, axis=1)
        target = discount * add_const_col(
            self.value_feature(next_obs, next_action, training=False))
        target = add_const_col(self.value_feature(
            current_obs, action, training=False)) - self.discount * target
        l2 = snt.regularizers.L2(self.instrumental_reg)
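        # Only the instrumental feature network is trained here; the value
        # features that define the target are held fixed.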
        with tf.GradientTape() as tape:
            feature = self.instrumental_feature(obs=current_obs,
                                                action=action,
                                                training=True)
            loss = linear_reg_loss(target, feature, self.stage1_reg)
            loss = loss + l2(self.instrumental_feature.trainable_variables)
            loss /= action.shape[0]

        gradient = tape.gradient(loss,
                                 self.instrumental_feature.trainable_variables)
        self._instrumental_func_optimizer.apply(
            gradient, self.instrumental_feature.trainable_variables)

        return loss
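The three update methods appear designed to alternate: fit the instrumental features against fixed value features, fit the value features through the re-fit stage-1 regression, then recompute both linear weights in closed form. The outer loop below is a hypothetical sketch; learner, the batch iterators, num_steps, and the inner iteration counts are illustrative assumptions, not names from the project.

for step in range(num_steps):
    # Each batch is (obs, action, reward, discount, next_obs, ...).
    stage1_batch = next(stage1_iterator)
    stage2_batch = next(stage2_iterator)

    # Train the instrumental feature net while the value features stay fixed.
    for _ in range(instrumental_iters):
        learner.update_instrumental(*stage1_batch[:5])

    # Train the value feature net through the re-fit stage-1 regression.
    for _ in range(value_iters):
        learner.update_value(stage1_batch, stage2_batch)

# Final closed-form fit of both stage weights.
stage1_w, stage2_w = learner.update_final_weight(stage1_batch, stage2_batch)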
Code example #7
    def __call__(self, obs, action, training=False):
        feature = self._feature(obs, action, training)
        return tf.matmul(add_const_col(feature), self._weight)
Code example #8
    def __call__(self, obs, action, training=False):
        feature = self._net(obs, action)
        feature = add_const_col(feature)
        return feature
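Examples #7 and #8 appear to be the two __call__ paths on the prediction side: the first is a linear value head that applies the learned stage-2 weight to the bias-augmented features (matching the value_func.weight assigned above), while the second simply exposes the bias-augmented feature map for the closed-form fits.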