コード例 #1
0
ファイル: __init__.py プロジェクト: cheth-rowe/ihmexp
    def addPriors(self, prior_list):
        """
        Attach priors to the model and rebuild the solver with them.

        ``prior_list`` is passed to ``utils.constructPrior``; everything it
        returns is stored on the object, the uniform prior is adjusted so the
        gamma lower bounds are at least 1e-7, and ``self.lt`` is replaced by
        a new ``LimeTr`` instance that receives the prior information.
        """
        self.prior_list = prior_list

        prior_parts = utils.constructPrior(prior_list, self)
        (self.C, self.JC, self.c, self.H, self.JH, self.h, self.uprior,
         self.gprior, self.lprior, self.C_list, self.JC_list, self.c_list,
         self.H_list, self.JH_list, self.h_list, self.id_C_list,
         self.id_C_var_list, self.id_H_list, self.id_H_var_list,
         self.num_constraints_list,
         self.num_regularizers_list) = prior_parts

        # keep the gamma bounds positive and ordered
        gamma_floor = 1e-7
        if self.uprior is not None:
            beta_bounds = self.uprior[:, self.id_beta]
            gamma_bounds = self.uprior[:, self.id_gamma]
            # clip the lower bound from below, then keep upper >= lower
            gamma_bounds[0] = np.maximum(gamma_floor, gamma_bounds[0])
            gamma_bounds[1] = np.maximum(gamma_bounds[0], gamma_bounds[1])
            self.uprior = np.hstack((beta_bounds, gamma_bounds))
        else:
            # no uniform prior given: beta unbounded, gamma >= gamma_floor
            lower = [-np.inf] * self.k_beta + [gamma_floor] * self.k_gamma
            upper = [np.inf] * self.k
            self.uprior = np.array([lower, upper])

        # copy the prior information onto the current solver object
        solver = self.lt
        solver.C, solver.JC, solver.c = self.C, self.JC, self.c
        solver.H, solver.JH, solver.h = self.H, self.JH, self.h
        solver.uprior, solver.gprior, solver.lprior = (self.uprior,
                                                       self.gprior,
                                                       self.lprior)

        # build a fresh solver that is constructed with the priors
        prior_kwargs = {
            'C': self.C,
            'JC': self.JC,
            'c': self.c,
            'H': self.H,
            'JH': self.JH,
            'h': self.h,
            'uprior': self.uprior,
            'gprior': self.gprior,
            'lprior': self.lprior,
        }
        self.lt = LimeTr(self.study_sizes,
                         self.k_beta,
                         self.k_gamma,
                         self.obs_mean,
                         self.F,
                         self.JF,
                         self.Z,
                         self.obs_std,
                         **prior_kwargs,
                         inlier_percentage=self.inlier_percentage)
コード例 #2
0
ファイル: ap_model.py プロジェクト: cheth-rowe/ihmexp
 def sample_params(self, n_samples=500):
     """Draw parameter samples from ``self.mr`` and store them.

     When ``self.mr`` exposes an ``mr_list`` attribute, each sub-model is
     sampled with the size returned by ``self.mr.compute_sample_sizes`` and
     the results are stored as lists; otherwise ``self.mr.lt`` is sampled
     once. The resulting dict is saved as ``self.given_samples``.
     """
     if 'mr_list' not in dir(self.mr):
         # single model: one pair of beta/gamma samples
         draws = LimeTr.sampleSoln(self.mr.lt, sample_size=n_samples)
         beta_samples, gamma_samples = draws
         self.given_samples = {
             'given_beta_samples': beta_samples,
             'given_gamma_samples': gamma_samples
         }
         return

     # one draw per sub-model, each with its own sample size
     sizes = self.mr.compute_sample_sizes(n_samples)
     draws = []
     for sub_mr, size in zip(self.mr.mr_list, sizes):
         draws.append(LimeTr.sampleSoln(sub_mr.lt, sample_size=size))

     beta_draws = [draw[0] for draw in draws]
     gamma_draws = [draw[1] for draw in draws]
     self.given_samples = {
         'given_beta_samples_list': beta_draws,
         'given_gamma_samples_list': gamma_draws
     }
コード例 #3
0
ファイル: model.py プロジェクト: cheth-rowe/ihmexp
def get_parameter_samples(mr, n_samples=1000):
    """Draw parameter samples for every sub-model in ``mr.mr_list``.

    The per-sub-model sample sizes come from ``mr.compute_sample_sizes``.
    Returns a dict with the keys ``'given_beta_samples_list'`` and
    ``'given_gamma_samples_list'``, each holding one entry per sub-model.
    """
    sizes = mr.compute_sample_sizes(n_samples)

    draws = []
    for sub_mr, size in zip(mr.mr_list, sizes):
        draws.append(LimeTr.sampleSoln(sub_mr.lt, sample_size=size))

    # each draw is indexed as (beta samples, gamma samples)
    beta_draws = [draw[0] for draw in draws]
    gamma_draws = [draw[1] for draw in draws]

    return {
        'given_beta_samples_list': beta_draws,
        'given_gamma_samples_list': gamma_draws
    }
コード例 #4
0
ファイル: __init__.py プロジェクト: cheth-rowe/ihmexp
    def __init__(self,
                 obs_mean,
                 obs_std,
                 study_sizes,
                 x_cov_list,
                 z_cov_list,
                 spline_list=None,
                 inlier_percentage=1.0,
                 rr_random_slope=False):
        """
        Initialize the object and pass in the data, require
        - obs_mean: observations
        - obs_std: standard deviations for the observations
        - study_sizes: all study sizes in a list
        - x_cov_list: all x cov in a list
        - z_cov_list: all z cov in a list
        - spline_list: optional, all spline in a list (a new empty list is
          used when omitted)
        - inlier_percentage: optional, used for trimming
        - rr_random_slope: optional, if True exactly one x cov of type
          'log_ratio_spline' or 'log_ratio_spline_integral' is required and
          the rows of Z are scaled with a combination of rows of its 'mat'

        Raises Exception when rr_random_slope is True and there is no, or
        more than one, suitable x cov.
        """
        # a fresh list per instance; a shared ``[]`` default would be stored
        # on every object created without an explicit spline_list
        if spline_list is None:
            spline_list = []

        # pass in data
        self.obs_mean = obs_mean
        self.obs_std = obs_std
        self.study_sizes = study_sizes
        self.num_studies = len(study_sizes)
        self.num_obs = sum(study_sizes)

        # construct x and z "covariates"
        self.spline_list = spline_list
        self.x_cov_list = x_cov_list
        self.z_cov_list = z_cov_list

        (self.F, self.JF, self.F_list, self.JF_list, self.id_beta_list,
         self.id_spline_beta_list,
         self.k_beta_list) = utils.constructXCov(x_cov_list,
                                                 spline_list=spline_list)
        (self.Z, self.Z_list, self.id_gamma_list, self.id_spline_gamma_list,
         self.k_gamma_list) = utils.constructZCov(z_cov_list,
                                                  spline_list=spline_list)

        self.k_beta = int(sum(self.k_beta_list))
        self.k_gamma = int(sum(self.k_gamma_list))
        self.k = self.k_beta + self.k_gamma

        # beta occupies the first k_beta entries, gamma the remaining ones
        self.id_beta = slice(0, self.k_beta)
        self.id_gamma = slice(self.k_beta, self.k)

        # if use the random slope model or not
        self.rr_random_slope = rr_random_slope
        if rr_random_slope:
            valid_x_cov_id = [
                i for i, x_cov in enumerate(x_cov_list)
                if x_cov['cov_type'] in
                ['log_ratio_spline', 'log_ratio_spline_integral']
            ]
            if len(valid_x_cov_id) == 0:
                raise Exception(
                    "Error: no suitable x cov for random slope model.")
            if len(valid_x_cov_id) >= 2:
                raise Exception(
                    "Error: multiple x cov for random slope model.")

            x_cov = x_cov_list[valid_x_cov_id[0]]
            mat = x_cov['mat']
            if x_cov['cov_type'] == 'log_ratio_spline':
                scaling = mat[0] - mat[1]
            else:
                scaling = 0.5 * (mat[0] + mat[1] - mat[2] - mat[3])
            # one scaling factor per row of Z, applied in place
            self.Z *= scaling.reshape(scaling.size, 1)

        # create limetr object
        self.inlier_percentage = inlier_percentage
        self.lt = LimeTr(self.study_sizes,
                         self.k_beta,
                         self.k_gamma,
                         self.obs_mean,
                         self.F,
                         self.JF,
                         self.Z,
                         self.obs_std,
                         inlier_percentage=inlier_percentage)
コード例 #5
0
ファイル: __init__.py プロジェクト: cheth-rowe/ihmexp
class MR_BRT:
    """Meta-regression model built around a ``LimeTr`` solver object.

    The constructor assembles the x/z covariate structures through ``utils``
    and creates the solver, ``addPriors`` rebuilds the solver with prior
    information, ``fitModel`` runs a two-stage fit and ``predictData``
    produces sampled predictions.
    """
    def __init__(self,
                 obs_mean,
                 obs_std,
                 study_sizes,
                 x_cov_list,
                 z_cov_list,
                 spline_list=None,
                 inlier_percentage=1.0,
                 rr_random_slope=False):
        """
        Initialize the object and pass in the data, require
        - obs_mean: observations
        - obs_std: standard deviations for the observations
        - study_sizes: all study sizes in a list
        - x_cov_list: all x cov in a list
        - z_cov_list: all z cov in a list
        - spline_list: optional, all spline in a list (a new empty list is
          used when omitted)
        - inlier_percentage: optional, used for trimming
        - rr_random_slope: optional, if True exactly one x cov of type
          'log_ratio_spline' or 'log_ratio_spline_integral' is required and
          the rows of Z are scaled with a combination of rows of its 'mat'

        Raises Exception when rr_random_slope is True and there is no, or
        more than one, suitable x cov.
        """
        # a fresh list per instance; a shared ``[]`` default would be stored
        # on every object created without an explicit spline_list
        if spline_list is None:
            spline_list = []

        # pass in data
        self.obs_mean = obs_mean
        self.obs_std = obs_std
        self.study_sizes = study_sizes
        self.num_studies = len(study_sizes)
        self.num_obs = sum(study_sizes)

        # construct x and z "covariates"
        self.spline_list = spline_list
        self.x_cov_list = x_cov_list
        self.z_cov_list = z_cov_list

        (self.F, self.JF, self.F_list, self.JF_list, self.id_beta_list,
         self.id_spline_beta_list,
         self.k_beta_list) = utils.constructXCov(x_cov_list,
                                                 spline_list=spline_list)
        (self.Z, self.Z_list, self.id_gamma_list, self.id_spline_gamma_list,
         self.k_gamma_list) = utils.constructZCov(z_cov_list,
                                                  spline_list=spline_list)

        self.k_beta = int(sum(self.k_beta_list))
        self.k_gamma = int(sum(self.k_gamma_list))
        self.k = self.k_beta + self.k_gamma

        # beta occupies the first k_beta entries, gamma the remaining ones
        self.id_beta = slice(0, self.k_beta)
        self.id_gamma = slice(self.k_beta, self.k)

        # if use the random slope model or not
        self.rr_random_slope = rr_random_slope
        if rr_random_slope:
            valid_x_cov_id = [
                i for i, x_cov in enumerate(x_cov_list)
                if x_cov['cov_type'] in
                ['log_ratio_spline', 'log_ratio_spline_integral']
            ]
            if len(valid_x_cov_id) == 0:
                raise Exception(
                    "Error: no suitable x cov for random slope model.")
            if len(valid_x_cov_id) >= 2:
                raise Exception(
                    "Error: multiple x cov for random slope model.")

            x_cov = x_cov_list[valid_x_cov_id[0]]
            mat = x_cov['mat']
            if x_cov['cov_type'] == 'log_ratio_spline':
                scaling = mat[0] - mat[1]
            else:
                scaling = 0.5 * (mat[0] + mat[1] - mat[2] - mat[3])
            # one scaling factor per row of Z, applied in place
            self.Z *= scaling.reshape(scaling.size, 1)

        # create limetr object
        self.inlier_percentage = inlier_percentage
        self.lt = LimeTr(self.study_sizes,
                         self.k_beta,
                         self.k_gamma,
                         self.obs_mean,
                         self.F,
                         self.JF,
                         self.Z,
                         self.obs_std,
                         inlier_percentage=inlier_percentage)

    def addPriors(self, prior_list):
        """
        Add priors to the object, require prior_list contains priors

        Everything returned by ``utils.constructPrior`` is stored on the
        object, the gamma part of the uniform prior is forced to have a
        lower bound of at least 1e-7, and ``self.lt`` is replaced by a new
        ``LimeTr`` instance constructed with the prior information.
        """
        self.prior_list = prior_list

        (self.C, self.JC, self.c, self.H, self.JH, self.h, self.uprior,
         self.gprior, self.lprior, self.C_list, self.JC_list, self.c_list,
         self.H_list, self.JH_list, self.h_list, self.id_C_list,
         self.id_C_var_list, self.id_H_list, self.id_H_var_list,
         self.num_constraints_list,
         self.num_regularizers_list) = utils.constructPrior(prior_list, self)

        # renew uprior for gamma
        if self.uprior is None:
            # beta unbounded, gamma restricted to [1e-7, inf)
            self.uprior = np.array([[-np.inf] * self.k_beta +
                                    [1e-7] * self.k_gamma, [np.inf] * self.k])
        else:
            uprior_beta = self.uprior[:, self.id_beta]
            uprior_gamma = self.uprior[:, self.id_gamma]
            # clip the lower bound from below, then keep upper >= lower
            uprior_gamma[0] = np.maximum(1e-7, uprior_gamma[0])
            uprior_gamma[1] = np.maximum(uprior_gamma[0], uprior_gamma[1])
            self.uprior = np.hstack((uprior_beta, uprior_gamma))

        # NOTE(review): these stores target the LimeTr instance that is
        # replaced right below, so they look redundant - confirm before
        # removing.
        self.lt.C, self.lt.JC, self.lt.c = self.C, self.JC, self.c
        self.lt.H, self.lt.JH, self.lt.h = self.H, self.JH, self.h
        (self.lt.uprior, self.lt.gprior,
         self.lt.lprior) = (self.uprior, self.gprior, self.lprior)

        self.lt = LimeTr(self.study_sizes,
                         self.k_beta,
                         self.k_gamma,
                         self.obs_mean,
                         self.F,
                         self.JF,
                         self.Z,
                         self.obs_std,
                         C=self.C,
                         JC=self.JC,
                         c=self.c,
                         H=self.H,
                         JH=self.JH,
                         h=self.h,
                         uprior=self.uprior,
                         gprior=self.gprior,
                         lprior=self.lprior,
                         inlier_percentage=self.inlier_percentage)

    def fitModel(self,
                 x0=None,
                 outer_verbose=False,
                 outer_max_iter=100,
                 outer_step_size=1.0,
                 outer_tol=1e-6,
                 inner_print_level=0,
                 inner_max_iter=20):
        """
        Fit the model in two stages and store the solution.

        Stage one calls ``self.lt.fitModel`` with the gamma bounds collapsed
        to 1e-6, every observation treated as its own study and the columns
        of Z normalized. Stage two restores the gamma bounds and the study
        sizes and calls ``self.lt.optimize`` starting from the stage-one
        result. Afterwards the Z scaling is undone and gamma is rescaled.

        - x0: optional, initial point (beta followed by gamma); expanded
          automatically when a Laplace prior is in use
        - outer_*, inner_*: forwarded to ``self.lt.fitModel``;
          inner_print_level is also used as print level of stage two

        Sets ``self.w_soln``, ``self.beta_soln`` and ``self.gamma_soln``.
        Reads ``self.lprior``, which is set by ``addPriors``.
        """
        # stage one setup: pin gamma to 1e-6 through its bounds
        gamma_uprior = self.lt.uprior[:, self.lt.idx_gamma].copy()
        self.lt.uprior[:, self.lt.idx_gamma] = 1e-6
        self.lt.n = np.array([1] * self.num_obs)
        # normalize the columns of Z in place; undone after the fit
        norm_z_col = np.linalg.norm(self.lt.Z, axis=0)
        self.lt.Z /= norm_z_col

        if x0 is None:
            if self.lprior is None:
                x0 = np.array([1.0] * self.k_beta + [1e-6] * self.k_gamma)
            else:
                # with a Laplace prior the variable vector is twice as long
                x0 = np.array([1.0] * self.k_beta * 2 +
                              [1e-6] * self.k_gamma * 2)
        else:
            if self.lprior is not None:
                beta0 = x0[:self.k_beta]
                gamma0 = x0[self.k_beta:self.k_beta + self.k_gamma]
                x0 = np.hstack((beta0, np.abs(beta0), gamma0, gamma0))

        (beta_0, gamma_0,
         self.w_soln) = self.lt.fitModel(x0=x0,
                                         outer_verbose=outer_verbose,
                                         outer_max_iter=outer_max_iter,
                                         outer_step_size=outer_step_size,
                                         outer_tol=outer_tol,
                                         inner_print_level=inner_print_level,
                                         inner_max_iter=inner_max_iter)

        # fit the model from the initial point
        self.lt.uprior[:, self.lt.idx_gamma] = gamma_uprior
        self.lt.n = self.study_sizes

        if self.lprior is not None:
            x0 = np.hstack((beta_0, np.abs(beta_0), gamma_0, gamma_0))
        else:
            x0 = np.hstack((beta_0, gamma_0))

        self.lt.optimize(x0=x0, print_level=inner_print_level, max_iter=100)

        # undo the column normalization of Z and rescale gamma accordingly
        self.lt.Z *= norm_z_col
        self.lt.gamma /= norm_z_col**2

        self.beta_soln = self.lt.beta
        self.gamma_soln = self.lt.gamma

    def predictData(self,
                    pred_x_cov_list,
                    pred_z_cov_list,
                    sample_size,
                    pred_study_sizes=None,
                    given_beta_samples=None,
                    given_gamma_samples=None,
                    ref_point=None,
                    include_random_effect=True):
        """
        Create sampled predictions for new covariates.

        - pred_x_cov_list: x cov for the prediction, in a list
        - pred_z_cov_list: z cov for the prediction, in a list
        - sample_size: number of samples
        - pred_study_sizes: optional, study sizes of the prediction points;
          every point is its own study when omitted
        - given_beta_samples, given_gamma_samples: optional, used instead of
          sampling from the solver when both are provided
        - ref_point: optional, the samples are divided by the spline value
          at this point
        - include_random_effect: optional, add random effect draws

        Returns (y_samples, beta_samples, gamma_samples, pred_F, pred_Z).
        Raises Exception when the required spline x cov is missing or not
        unique.
        """
        # sample solutions
        if given_beta_samples is None or given_gamma_samples is None:
            beta_samples, gamma_samples = LimeTr.sampleSoln(
                self.lt, sample_size=sample_size)
        else:
            beta_samples = given_beta_samples
            gamma_samples = given_gamma_samples

        # create x cov
        (pred_F, pred_JF, pred_F_list, pred_JF_list,
         pred_id_beta_list) = utils.constructPredXCov(pred_x_cov_list, self)

        # create z cov
        (pred_Z, pred_Z_list,
         pred_id_gamma_list) = utils.constructPredZCov(pred_z_cov_list, self)

        # num of prediction points
        pred_num_obs = pred_Z.shape[0]

        # create observation samples, one row per beta sample
        y_samples = np.vstack([pred_F(beta) for beta in beta_samples])

        if ref_point is not None:
            x_cov_spline_id = [
                x_cov['spline_id'] for x_cov in pred_x_cov_list
                if 'spline' in x_cov['cov_type']
            ]
            if len(x_cov_spline_id) == 0:
                raise Exception("Error: no spline x cov")
            if len(x_cov_spline_id) >= 2:
                raise Exception("Error: multiple spline x covs")

            spline = self.spline_list[x_cov_spline_id[0]]
            # spline value at the reference point for every beta sample
            ref_risk = spline.designMat(np.array([ref_point])).dot(
                beta_samples[:,
                             self.id_spline_beta_list[x_cov_spline_id[0]]].T)

            y_samples /= ref_risk.reshape(sample_size, 1)

        pred_gamma = np.hstack(
            [self.gamma_soln[id_gamma] for id_gamma in pred_id_gamma_list])

        if include_random_effect:
            if self.rr_random_slope:
                u = np.random.randn(sample_size, self.k_gamma)*\
                    np.sqrt(self.gamma_soln)
                # only the first random effect is used as the slope
                zu = u[:, 0]

                valid_x_cov_id = [
                    i for i, x_cov in enumerate(pred_x_cov_list)
                    if x_cov['cov_type'] == 'spline'
                ]

                if len(valid_x_cov_id) == 0:
                    raise Exception(
                        "Error: no suitable x cov for random slope model.")
                if len(valid_x_cov_id) >= 2:
                    raise Exception(
                        "Error: multiple x cov for random slope model.")

                mat = pred_x_cov_list[valid_x_cov_id[0]]['mat']
                if ref_point is None:
                    y_samples *= np.exp(np.outer(zu, mat - mat[0]))
                else:
                    y_samples *= np.exp(np.outer(zu, mat - ref_point))
            else:
                if pred_study_sizes is None:
                    pred_study_sizes = np.array([1] * pred_num_obs)
                else:
                    assert sum(pred_study_sizes) == pred_num_obs

                pred_num_studies = len(pred_study_sizes)

                # one block of Z rows per prediction study
                pred_Z_sub = np.split(pred_Z, np.cumsum(pred_study_sizes)[:-1])
                u = [
                    np.random.multivariate_normal(
                        np.zeros(pred_study_sizes[i]),
                        (pred_Z_sub[i] * pred_gamma).dot(pred_Z_sub[i].T),
                        sample_size) for i in range(pred_num_studies)
                ]
                U = np.hstack(u)

                # multiplicative effect for log-ratio models, additive
                # otherwise
                is_log_ratio = [
                    'log_ratio' in x_cov['cov_type']
                    for x_cov in self.x_cov_list
                ]
                if any(is_log_ratio):
                    y_samples *= np.exp(U)
                else:
                    y_samples += U

        return y_samples, beta_samples, gamma_samples, pred_F, pred_Z
コード例 #6
0
ファイル: __init__.py プロジェクト: cheth-rowe/ihmexp
    def predictData(self,
                    pred_x_cov_list,
                    pred_z_cov_list,
                    sample_size,
                    pred_study_sizes=None,
                    given_beta_samples=None,
                    given_gamma_samples=None,
                    ref_point=None,
                    include_random_effect=True):
        """
        Create sampled predictions for new covariates.

        Beta/gamma samples are drawn from the solver unless both
        ``given_beta_samples`` and ``given_gamma_samples`` are supplied.
        When ``ref_point`` is given the samples are divided by the spline
        value at that point; when ``include_random_effect`` is true, random
        effect draws are applied as well.

        Returns (y_samples, beta_samples, gamma_samples, pred_F, pred_Z).
        Raises Exception when the required spline x cov is missing or not
        unique.
        """
        # use the supplied samples only when both of them are present
        have_given = (given_beta_samples is not None
                      and given_gamma_samples is not None)
        if have_given:
            beta_samples = given_beta_samples
            gamma_samples = given_gamma_samples
        else:
            beta_samples, gamma_samples = LimeTr.sampleSoln(
                self.lt, sample_size=sample_size)

        # prediction covariates; only pred_F, pred_Z and the gamma ids are
        # used below
        x_parts = utils.constructPredXCov(pred_x_cov_list, self)
        (pred_F, _pred_JF, _pred_F_list, _pred_JF_list,
         _pred_id_beta_list) = x_parts
        z_parts = utils.constructPredZCov(pred_z_cov_list, self)
        pred_Z, _pred_Z_list, pred_id_gamma_list = z_parts

        pred_num_obs = pred_Z.shape[0]

        # one row of predictions per beta sample
        y_samples = np.vstack([pred_F(beta) for beta in beta_samples])

        if ref_point is not None:
            spline_ids = []
            for x_cov in pred_x_cov_list:
                if 'spline' in x_cov['cov_type']:
                    spline_ids.append(x_cov['spline_id'])
            if not spline_ids:
                raise Exception("Error: no spline x cov")
            if len(spline_ids) > 1:
                raise Exception("Error: multiple spline x covs")

            spline_id = spline_ids[0]
            spline = self.spline_list[spline_id]
            # spline value at the reference point for every beta sample
            ref_design = spline.designMat(np.array([ref_point]))
            spline_betas = beta_samples[:,
                                        self.id_spline_beta_list[spline_id]]
            ref_risk = ref_design.dot(spline_betas.T)

            y_samples /= ref_risk.reshape(sample_size, 1)

        pred_gamma = np.hstack(
            [self.gamma_soln[id_gamma] for id_gamma in pred_id_gamma_list])

        if not include_random_effect:
            return y_samples, beta_samples, gamma_samples, pred_F, pred_Z

        if self.rr_random_slope:
            noise = np.random.randn(sample_size, self.k_gamma)
            u = noise * np.sqrt(self.gamma_soln)
            # only the first random effect is used as the slope
            zu = u[:, 0]

            spline_cov_ids = [
                pos for pos, x_cov in enumerate(pred_x_cov_list)
                if x_cov['cov_type'] == 'spline'
            ]
            if not spline_cov_ids:
                raise Exception(
                    "Error: no suitable x cov for random slope model.")
            if len(spline_cov_ids) > 1:
                raise Exception(
                    "Error: multiple x cov for random slope model.")

            mat = pred_x_cov_list[spline_cov_ids[0]]['mat']
            # measure exposure relative to the first entry unless a
            # reference point was requested
            if ref_point is None:
                offset = mat - mat[0]
            else:
                offset = mat - ref_point
            y_samples *= np.exp(np.outer(zu, offset))
        else:
            if pred_study_sizes is None:
                pred_study_sizes = np.array([1] * pred_num_obs)
            else:
                assert sum(pred_study_sizes) == pred_num_obs

            pred_num_studies = len(pred_study_sizes)

            # one block of Z rows per prediction study
            split_points = np.cumsum(pred_study_sizes)[:-1]
            pred_Z_sub = np.split(pred_Z, split_points)

            u = []
            for i in range(pred_num_studies):
                z_block = pred_Z_sub[i]
                draw = np.random.multivariate_normal(
                    np.zeros(pred_study_sizes[i]),
                    (z_block * pred_gamma).dot(z_block.T),
                    sample_size)
                u.append(draw)
            U = np.hstack(u)

            # multiplicative effect for log-ratio models, additive otherwise
            is_log_ratio = [
                'log_ratio' in x_cov['cov_type'] for x_cov in self.x_cov_list
            ]
            if any(is_log_ratio):
                y_samples *= np.exp(U)
            else:
                y_samples += U

        return y_samples, beta_samples, gamma_samples, pred_F, pred_Z
コード例 #7
0
ファイル: model.py プロジェクト: vishalbelsare/MRTool
class MRBRT:
    """MR-BRT Object
    """
    def __init__(self,
                 data: MRData,
                 cov_models: List[CovModel],
                 inlier_pct: float = 1.0):
        """Set up the model from data and covariate models.

        Args:
            data (MRData): Data for meta-regression.
            cov_models (List[CovModel]): A list of covariates models.
            inlier_pct (float, optional):
                A float number between 0 and 1 indicate the percentage of inliers.
        """
        self.data = data
        self.cov_models = cov_models
        self.inlier_pct = inlier_pct
        self.check_input()

        models = self.cov_models

        # names of the models and of all covariates they use
        self.cov_model_names = [model.name for model in models]
        self.num_cov_models = len(models)
        self.cov_names = []
        for model in models:
            self.cov_names.extend(model.covs)
        self.num_covs = len(self.cov_names)

        # every covariate model gets access to the data
        for model in models:
            model.attach_data(self.data)

        # sizes and index ranges of the fixed effects
        self.x_vars_sizes = [model.num_x_vars for model in models]
        self.x_vars_indices = utils.sizes_to_indices(self.x_vars_sizes)
        self.num_x_vars = sum(self.x_vars_sizes)

        # sizes and index ranges of the random effects
        self.z_vars_sizes = [model.num_z_vars for model in models]
        self.z_vars_indices = utils.sizes_to_indices(self.z_vars_sizes)
        self.num_z_vars = sum(self.z_vars_sizes)

        self.num_vars = self.num_x_vars + self.num_z_vars

        # totals over all covariate models
        self.num_constraints = sum(
            model.num_constraints for model in models)
        self.num_regularizations = sum(
            model.num_regularizations for model in models)

        # filled in by fit_model
        self.lt = None
        self.beta_soln = None
        self.gamma_soln = None
        self.u_soln = None
        self.w_soln = None
        self.re_soln = None

    def check_input(self):
        """Assert that the attributes have the expected types and range.

        ``data`` must be an ``MRData``, ``cov_models`` a list of ``CovModel``
        instances and ``inlier_pct`` a value in [0, 1].
        """
        assert isinstance(self.data, MRData)
        assert isinstance(self.cov_models, list)
        for cov_model in self.cov_models:
            assert isinstance(cov_model, CovModel)
        assert 0.0 <= self.inlier_pct <= 1.0

    def get_cov_model(self, name: str) -> CovModel:
        """Return the covariate model whose name is ``name``.

        The position is looked up with ``get_cov_model_index``.
        """
        return self.cov_models[self.get_cov_model_index(name)]

    def get_cov_model_index(self, name: str) -> int:
        """Return the position of the covariate model named ``name``.

        Asserts that exactly one entry of ``self.cov_model_names`` matches.
        """
        matching_index = []
        for index, cov_model_name in enumerate(self.cov_model_names):
            if cov_model_name == name:
                matching_index.append(index)

        num_matching_index = len(matching_index)
        assert num_matching_index == 1, f"Number of matching index is {num_matching_index}."
        return matching_index[0]

    def create_x_fun(self, data=None):
        """Build the fixed effects function and its Jacobian function.

        Each covariate model contributes a (function, Jacobian function)
        pair via ``cov_model.create_x_fun(data)``; ``self.data`` is used when
        ``data`` is None. The returned ``x_fun`` sums the per-model outputs
        and ``x_jac_fun`` stacks the per-model Jacobians horizontally, each
        model receiving its own slice of ``beta``.
        """
        if data is None:
            data = self.data

        # split the (fun, jac_fun) pairs into two parallel sequences
        pairs = [cov_model.create_x_fun(data) for cov_model in self.cov_models]
        funs, jac_funs = list(zip(*pairs))

        def x_fun(beta, funs=funs):
            pieces = [
                fun(beta[self.x_vars_indices[pos]])
                for pos, fun in enumerate(funs)
            ]
            return sum(pieces)

        def x_jac_fun(beta, jac_funs=jac_funs):
            blocks = []
            for pos, jac_fun in enumerate(jac_funs):
                blocks.append(jac_fun(beta[self.x_vars_indices[pos]]))
            return np.hstack(blocks)

        return x_fun, x_jac_fun

    def create_z_mat(self, data=None):
        """Build the random effects matrix.

        The matrices returned by ``cov_model.create_z_mat(data)`` are stacked
        horizontally; ``self.data`` is used when ``data`` is None.
        """
        if data is None:
            data = self.data

        blocks = []
        for cov_model in self.cov_models:
            blocks.append(cov_model.create_z_mat(data))

        return np.hstack(blocks)

    def create_c_mat(self):
        """Assemble the constraint matrix and the constraint vector.

        For every covariate model with constraints, its matrix is placed in
        the columns given by ``self.x_vars_indices`` of a zero block with
        ``self.num_vars`` columns. Blocks are stacked vertically, the
        accompanying vectors horizontally. Without any constraints the
        results have shapes ``(0, num_vars)`` and ``(2, 0)``.
        """
        mat_blocks = [np.zeros((0, self.num_vars))]
        vec_blocks = [np.zeros((2, 0))]

        for pos, cov_model in enumerate(self.cov_models):
            if cov_model.num_constraints == 0:
                continue
            block = np.zeros((cov_model.num_constraints, self.num_vars))
            sub_mat, sub_vec = cov_model.create_constraint_mat()
            # only the columns of this model's fixed effects are filled
            block[:, self.x_vars_indices[pos]] = sub_mat
            mat_blocks.append(block)
            vec_blocks.append(sub_vec)

        return np.vstack(mat_blocks), np.hstack(vec_blocks)

    def create_h_mat(self):
        """Assemble the regularizer matrix and the regularizer vector.

        For every covariate model with regularizations, its matrix is placed
        in the columns given by ``self.x_vars_indices`` of a zero block with
        ``self.num_vars`` columns. Blocks are stacked vertically, the
        accompanying vectors horizontally. Without any regularizations the
        results have shapes ``(0, num_vars)`` and ``(2, 0)``.
        """
        mat_blocks = [np.zeros((0, self.num_vars))]
        vec_blocks = [np.zeros((2, 0))]

        for pos, cov_model in enumerate(self.cov_models):
            if cov_model.num_regularizations == 0:
                continue
            block = np.zeros((cov_model.num_regularizations, self.num_vars))
            sub_mat, sub_vec = cov_model.create_regularization_mat()
            # only the columns of this model's fixed effects are filled
            block[:, self.x_vars_indices[pos]] = sub_mat
            mat_blocks.append(block)
            vec_blocks.append(sub_vec)

        return np.vstack(mat_blocks), np.hstack(vec_blocks)

    def create_uprior(self):
        """Assemble the uniform prior for all variables.

        Returns an array with two rows and ``self.num_vars`` columns that
        starts as (-inf, inf) and is filled with each covariate model's
        ``prior_beta_uniform`` and ``prior_gamma_uniform``; the gamma columns
        come after the ``self.num_x_vars`` beta columns.
        """
        lower = np.full(self.num_vars, -np.inf)
        upper = np.full(self.num_vars, np.inf)
        uprior = np.vstack((lower, upper))

        for pos, cov_model in enumerate(self.cov_models):
            beta_cols = self.x_vars_indices[pos]
            gamma_cols = self.z_vars_indices[pos] + self.num_x_vars
            uprior[:, beta_cols] = cov_model.prior_beta_uniform
            uprior[:, gamma_cols] = cov_model.prior_gamma_uniform

        return uprior

    def create_gprior(self):
        """Assemble the Gaussian prior for all variables.

        Returns an array with two rows and ``self.num_vars`` columns that
        starts as (0, inf) and is filled with each covariate model's
        ``prior_beta_gaussian`` and ``prior_gamma_gaussian``; the gamma
        columns come after the ``self.num_x_vars`` beta columns.
        """
        first_row = np.zeros(self.num_vars)
        second_row = np.full(self.num_vars, np.inf)
        gprior = np.vstack((first_row, second_row))

        for pos, cov_model in enumerate(self.cov_models):
            beta_cols = self.x_vars_indices[pos]
            gamma_cols = self.z_vars_indices[pos] + self.num_x_vars
            gprior[:, beta_cols] = cov_model.prior_beta_gaussian
            gprior[:, gamma_cols] = cov_model.prior_gamma_gaussian

        return gprior

    def create_lprior(self):
        """Assemble the Laplace prior for all variables.

        Returns an array with two rows and ``self.num_vars`` columns that
        starts as (0, inf) and is filled with each covariate model's
        ``prior_beta_laplace`` and ``prior_gamma_laplace``; the gamma columns
        come after the ``self.num_x_vars`` beta columns.
        """
        first_row = np.zeros(self.num_vars)
        second_row = np.full(self.num_vars, np.inf)
        lprior = np.vstack((first_row, second_row))

        for pos, cov_model in enumerate(self.cov_models):
            beta_cols = self.x_vars_indices[pos]
            gamma_cols = self.z_vars_indices[pos] + self.num_x_vars
            lprior[:, beta_cols] = cov_model.prior_beta_laplace
            lprior[:, gamma_cols] = cov_model.prior_gamma_laplace

        return lprior

    def fit_model(self, **fit_options):
        """Fit the model through limetr and store the solution.

        The random effects matrix is scaled column-wise by its largest
        absolute entry before the fit, the gamma columns of the priors are
        scaled accordingly, and both are scaled back afterwards. All keyword
        arguments are forwarded to ``LimeTr.fitModel``.

        Args:
            x0 (np.ndarray): Initial guess for the optimization problem.
            inner_print_level (int): If non-zero printing iteration information of the inner problem.
            inner_max_iter (int): Maximum inner number of iterations.
            inner_tol (float): Tolerance of the inner problem.
            outer_verbose (bool): If `True` print out iteration information.
            outer_max_iter (int): Maximum outer number of iterations.
            outer_step_size (float): Step size of the outer problem.
            outer_tol (float): Tolerance of the outer problem.
            normalize_trimming_grad (bool): If `True`, normalize the gradient of the outer trimming problem.
        """
        # dimensions and data
        study_sizes = self.data.study_sizes
        num_beta = self.num_x_vars
        num_gamma = self.num_z_vars
        obs = self.data.obs
        obs_se = self.data.obs_se

        # fixed effects functions and scaled random effects matrix
        x_fun, x_fun_jac = self.create_x_fun()
        z_mat = self.create_z_mat()
        z_scale = np.abs(z_mat).max(axis=0)
        z_mat /= z_scale
        z_scale_sq = z_scale**2

        # constraints and regularizers as functions
        c_mat, c_vec = self.create_c_mat()
        h_mat, h_vec = self.create_h_mat()
        c_fun, c_fun_jac = utils.mat_to_fun(c_mat)
        h_fun, h_fun_jac = utils.mat_to_fun(h_mat)

        # direct priors; the gamma columns follow the scaling of z_mat
        gamma_cols = slice(self.num_x_vars, self.num_vars)
        uprior = self.create_uprior()
        uprior[:, gamma_cols] *= z_scale_sq
        gprior = self.create_gprior()
        gprior[:, gamma_cols] *= z_scale_sq
        lprior = self.create_lprior()
        lprior[:, gamma_cols] *= z_scale_sq

        # priors that carry no information are not passed to limetr
        unbounded = (np.isneginf(uprior[0]).all()
                     and np.isposinf(uprior[1]).all())
        if unbounded:
            uprior = None
        if np.isposinf(gprior[1]).all():
            gprior = None
        if np.isposinf(lprior[1]).all():
            lprior = None

        # create limetr object
        self.lt = LimeTr(study_sizes,
                         num_beta,
                         num_gamma,
                         obs,
                         x_fun,
                         x_fun_jac,
                         z_mat,
                         S=obs_se,
                         C=c_fun,
                         JC=c_fun_jac,
                         c=c_vec,
                         H=h_fun,
                         JH=h_fun_jac,
                         h=h_vec,
                         uprior=uprior,
                         gprior=gprior,
                         lprior=lprior,
                         inlier_percentage=self.inlier_pct)

        self.lt.fitModel(**fit_options)

        # undo the scaling on the solver object
        self.lt.Z *= z_scale
        for prior_name in ('gprior', 'uprior', 'lprior'):
            if hasattr(self.lt, prior_name):
                prior = getattr(self.lt, prior_name)
                prior[:, self.lt.idx_gamma] /= z_scale_sq
        self.lt.gamma /= z_scale_sq

        # keep copies of the solution on the model
        self.beta_soln = self.lt.beta.copy()
        self.gamma_soln = self.lt.gamma.copy()
        self.w_soln = self.lt.w.copy()
        self.u_soln = self.lt.estimateRE()

        # random effects keyed by study
        re_soln = {}
        for i, study in enumerate(self.data.studies):
            re_soln[study] = self.u_soln[i]
        self.re_soln = re_soln

    def extract_re(self, study_id: np.ndarray) -> np.ndarray:
        """Extract the random effects for the given study ids.

        Args:
            study_id (np.ndarray): Study id of every row to look up.

        Returns:
            np.ndarray: Array with one row per entry of `study_id` and
                `self.num_z_vars` columns. Studies that are not keys of
                `self.re_soln` get a row of zeros. An empty `study_id`
                gives an array with zero rows.
        """
        no_effect = np.zeros(self.num_z_vars)
        rows = [self.re_soln.get(study, no_effect) if False else
                (self.re_soln[study] if study in self.re_soln else no_effect)
                for study in study_id]
        if not rows:
            # np.vstack refuses an empty sequence; return a well-shaped
            # empty result so that downstream products still broadcast.
            return np.zeros((0, self.num_z_vars))
        return np.vstack(rows)

    def predict(self,
                data: MRData,
                predict_for_study: bool = False,
                sort_by_data_id: bool = False) -> np.ndarray:
        """Predict the outcome for new data with the stored solution.

        Args:
            data (MRData): MRData object that holds the prediction data.
            predict_for_study (bool, optional):
                If `True`, add the random effect of each row's study to the
                fixed effect prediction. The random effects come from
                `self.extract_re`, which uses zeros for studies that were
                not part of the fit.
            sort_by_data_id (bool, optional):
                If `True`, reorder the prediction with the argsort of
                `data.data_id`. Default to False.

        Returns:
            np.ndarray: Predicted outcome array.
        """
        has_all_covs = data.has_covs(self.cov_names)
        assert has_all_covs, \
            "Prediction data do not have covariates used for fitting."

        # fixed effect part, evaluated at the fitted beta
        fixed_effect_fun, _ = self.create_x_fun(data=data)
        prediction = fixed_effect_fun(self.beta_soln)

        if predict_for_study:
            # row-wise inner product of the design rows and random effects
            design = self.create_z_mat(data=data)
            random_effects = self.extract_re(data.study_id)
            prediction += np.sum(design * random_effects, axis=1)

        if not sort_by_data_id:
            return prediction
        return prediction[np.argsort(data.data_id)]

    def sample_soln(self,
                    sample_size: int = 1,
                    sim_prior: bool = True,
                    sim_re: bool = True,
                    print_level: int = 0) -> Tuple[np.ndarray, np.ndarray]:
        """Draw samples of the solution from the fitted limetr object.

        Args:
            sample_size (int, optional): Number of samples.
            sim_prior (bool, optional): If `True`, simulate priors.
            sim_re (bool, optional): If `True`, simulate random effects.
            print_level (int, optional):
                Amount of optimization output printed while sampling.
                If 0, no information will be printed out.

        Raises:
            ValueError: If `self.lt` is `None`, i.e. no fit is available.

        Return:
            Tuple[np.ndarray, np.ndarray]:
                Return beta samples and gamma samples.
        """
        fitted = self.lt
        if fitted is None:
            raise ValueError('Please fit the model first.')

        sampling_options = {
            'sample_size': sample_size,
            'sim_prior': sim_prior,
            'sim_re': sim_re,
            'print_level': print_level,
        }
        # sampleSoln takes the fitted object explicitly as first argument
        beta_draws, gamma_draws = fitted.sampleSoln(fitted,
                                                    **sampling_options)
        return beta_draws, gamma_draws

    def create_draws(self,
                     data: MRData,
                     beta_samples: np.ndarray,
                     gamma_samples: np.ndarray,
                     random_study: bool = True,
                     sort_by_study_id: bool = False) -> np.ndarray:
        """Create outcome draws for the given data set.

        Args:
            data (MRData): MRData object that holds the prediction data.
            beta_samples (np.ndarray):
                Samples of beta, one row per sample and `self.num_x_vars`
                columns.
            gamma_samples (np.ndarray):
                Samples of gamma, one row per sample and `self.num_z_vars`
                columns.
            random_study (bool, optional):
                If `True`, random effects are drawn from a normal
                distribution with variance `gamma_samples`, so the draws
                include the uncertainty from study heterogeneity. If
                `False`, the fitted random effects from `self.extract_re`
                are added instead.
            sort_by_study_id (bool, optional):
                If `True`, reorder the observations of the draws with the
                argsort of `data.data_id`. Default to False.

        Returns:
            np.ndarray:
                Outcome sample matrix with one column per sample.
        """
        num_samples = beta_samples.shape[0]
        assert beta_samples.shape == (num_samples, self.num_x_vars)
        assert gamma_samples.shape == (num_samples, self.num_z_vars)

        # only the function is needed here, the jacobian is discarded
        fixed_effect_fun, _ = self.create_x_fun(data=data)
        design = self.create_z_mat(data=data)

        # one row of fixed effect predictions per beta sample
        draws = np.vstack([fixed_effect_fun(beta) for beta in beta_samples])

        if not random_study:
            study_effects = self.extract_re(data.study_id)
            draws += np.sum(design * study_effects, axis=1)
        else:
            noise = np.random.randn(num_samples, self.num_z_vars)
            effects = noise * np.sqrt(gamma_samples)
            draws += effects.dot(design.T)

        if sort_by_study_id:
            draws = draws[:, np.argsort(data.data_id)]

        return draws.T
コード例 #8
0
ファイル: model.py プロジェクト: vishalbelsare/MRTool
    def fit_model(self, **fit_options):
        """Fit the model through limetr and store the solution on `self`.

        The random effect design matrix is rescaled column by column with
        its largest absolute value before the fit; the gamma columns of the
        uniform, Gaussian and Laplace priors are rescaled with the squared
        scale to match. Once the fit is done the scaling is reverted on the
        limetr object (`Z`, priors and `gamma`). A column that is entirely
        zero keeps a scale of one.

        The keyword arguments are forwarded unchanged to `LimeTr.fitModel`.

        Args:
            x0 (np.ndarray): Initial guess for the optimization problem.
            inner_print_level (int): If non-zero printing iteration information of the inner problem.
            inner_max_iter (int): Maximum inner number of iterations.
            inner_tol (float): Tolerance of the inner problem.
            outer_verbose (bool): If `True` print out iteration information.
            outer_max_iter (int): Maximum outer number of iterations.
            outer_step_size (float): Step size of the outer problem.
            outer_tol (float): Tolerance of the outer problem.
            normalize_trimming_grad (bool): If `True`, normalize the gradient of the outer trimming problem.
        """
        # dimensions
        n = self.data.study_sizes
        k_beta = self.num_x_vars
        k_gamma = self.num_z_vars

        # data
        y = self.data.obs
        s = self.data.obs_se

        # create x fun and z mat
        x_fun, x_fun_jac = self.create_x_fun()
        z_mat = self.create_z_mat()
        # scale z_mat
        z_scale = np.max(np.abs(z_mat), axis=0)
        # Guard against all-zero columns: a zero scale would make the
        # division produce NaN and the prior rescaling produce inf * 0.
        z_scale = np.where(z_scale == 0.0, 1.0, z_scale)
        z_mat /= z_scale

        # priors
        c_mat, c_vec = self.create_c_mat()
        h_mat, h_vec = self.create_h_mat()
        c_fun, c_fun_jac = utils.mat_to_fun(c_mat)
        h_fun, h_fun_jac = utils.mat_to_fun(h_mat)

        # gamma columns follow the squared scale of the z columns
        uprior = self.create_uprior()
        uprior[:, self.num_x_vars:self.num_vars] *= z_scale**2
        gprior = self.create_gprior()
        gprior[:, self.num_x_vars:self.num_vars] *= z_scale**2
        lprior = self.create_lprior()
        lprior[:, self.num_x_vars:self.num_vars] *= z_scale**2

        # priors without any finite bound / width are not passed on
        if np.isneginf(uprior[0]).all() and np.isposinf(uprior[1]).all():
            uprior = None
        if np.isposinf(gprior[1]).all():
            gprior = None
        if np.isposinf(lprior[1]).all():
            lprior = None

        # create limetr object
        self.lt = LimeTr(n,
                         k_beta,
                         k_gamma,
                         y,
                         x_fun,
                         x_fun_jac,
                         z_mat,
                         S=s,
                         C=c_fun,
                         JC=c_fun_jac,
                         c=c_vec,
                         H=h_fun,
                         JH=h_fun_jac,
                         h=h_vec,
                         uprior=uprior,
                         gprior=gprior,
                         lprior=lprior,
                         inlier_percentage=self.inlier_pct)

        self.lt.fitModel(**fit_options)

        # revert the scaling on the fitted object
        self.lt.Z *= z_scale
        if hasattr(self.lt, 'gprior'):
            self.lt.gprior[:, self.lt.idx_gamma] /= z_scale**2
        if hasattr(self.lt, 'uprior'):
            self.lt.uprior[:, self.lt.idx_gamma] /= z_scale**2
        if hasattr(self.lt, 'lprior'):
            self.lt.lprior[:, self.lt.idx_gamma] /= z_scale**2
        self.lt.gamma /= z_scale**2

        # copies of the solution kept on the model
        self.beta_soln = self.lt.beta.copy()
        self.gamma_soln = self.lt.gamma.copy()
        self.w_soln = self.lt.w.copy()
        self.u_soln = self.lt.estimateRE()
        # random effects keyed by study, in the order of `self.data.studies`
        self.re_soln = {
            study: self.u_soln[i]
            for i, study in enumerate(self.data.studies)
        }
コード例 #9
0
 def sampleGlobalWithLimeTr(self, sample_size=100, max_iter=300):
     """Sample the solution of `self.model` through `LimeTr.sampleSoln`.

     Args:
         sample_size (int, optional): Forwarded as `sample_size`.
         max_iter (int, optional): Forwarded as `max_iter`.

     Returns:
         tuple: The two values returned by `LimeTr.sampleSoln`, i.e. the
             beta samples followed by the gamma samples.
     """
     sampling_options = {'sample_size': sample_size, 'max_iter': max_iter}
     beta_draws, gamma_draws = LimeTr.sampleSoln(self.model,
                                                 **sampling_options)
     return beta_draws, gamma_draws
コード例 #10
0
    def optimize(self,
                 var=None,
                 S=None,
                 trim_percentage=0.0,
                 share_obs_std=True,
                 fit_fixed=True,
                 inner_print_level=5,
                 inner_max_iter=100,
                 inner_tol=1e-5,
                 inner_verbose=True,
                 inner_acceptable_tol=1e-4,
                 inner_nlp_scaling_min_value=1e-8,
                 outer_verbose=False,
                 outer_max_iter=1,
                 outer_step_size=1,
                 outer_tol=1e-6,
                 normalize_Z=False,
                 build_X=True,
                 random_seed=0):
        """
        Run optimization routine via LimeTr.

        The method returns nothing; the results are stored on the object
        (`beta_soln`, `gamma_soln`, `delta_soln`, `w_soln`, `u_soln`,
        `yfit_no_random`, `yfit`, `info`, `model`, ...). When the object has
        no random effects (`self.k_gamma == 0` on entry) the method returns
        right after the fixed effects fit and only a subset of these
        attributes is set.

        Args:
            var (numpy.ndarray | None, optional):
                One-dimensional array that gives initialization for variables.
                If None, the program will first run without random effects
                to obtain a starting point.
            S (numpy.ndarray | None, optional):
                One-dimensional numpy array that gives standard deviation for
                each measurement. The size of S should be the same as that of
                measurements vector. If None standard deviation of measurement
                will be treated as variables and optimized.
            trim_percentage (float | 0.0, optional):
                A float that gives percentage of datapoints to trim.
                Default is 0, i.e. no trimming.
            share_obs_std (boolean | True, optional):
                A boolean that indicates whether the model should assume data
                across studies share the same measurement standard deviation.
            fit_fixed (boolean | True, optional):
                A boolean that indicates whether to run a fit without random
                effects first in order to obtain a good starting point.
            inner_print_level (int | 5, optional):
                ``print_level`` for Ipopt.
            inner_max_iter (int | 100, optional):
                Maximum number of iterations for inner optimization.
            inner_tol (float | 1e-5, optional):
                Tolerance level for inner optimization.
            inner_verbose (boolean | True, optional):
                If True, print the solver status message at the end when the
                final solve status is non-zero.
            inner_acceptable_tol (float | 1e-4, optional):
                Acceptable tolerance level for inner optimization.
            inner_nlp_scaling_min_value (float | 1e-8, optional):
                Min scaling for objective function.
            outer_verbose (boolean | False, optional):
                Verbose option for outer optimization.
            outer_max_iter (int | 1, optional):
                Maximum number of iterations for outer optimization. When there
                is no trimming, outer optimization is not needed, so the default
                is set to be 1.
            outer_step_size (float | 1.0, optional):
                Step size for outer optimization. Used in trimming.
            outer_tol (float | 1e-6, optional):
                Tolerance level for outer optimization.
            normalize_Z (bool | False, optional):
                Whether to normalize Z matrix before optimization.
            build_X (bool | True, optional):
                Whether to explicitly build and store X matrix.
            random_seed (int | 0, optional):
                random seed for choosing an initial point for optimization. If equals 0
                the initial point is chosen to be a vector of 0.01.
        """
        self.S = S
        self.share_obs_std = share_obs_std
        # Z_norm is only used at the end to rescale gamma when normalize_Z
        # is set.
        Z_norm = self.buildZ(normalize_Z)
        # k is the total number of optimization variables: beta and gamma,
        # plus the measurement standard deviations when S is not supplied
        # (one shared value, or one per entry of self.grouping).
        k = self.k_beta + self.k_gamma
        if S is None:
            if share_obs_std:
                k += 1
            else:
                k += len(self.grouping)
        print('n_groups', self.n_groups)
        print('k_beta', self.k_beta)
        print('k_gamma', self.k_gamma)
        print('total number of fixed effects variables', k)

        # Without random effects a single placeholder gamma is added.
        # NOTE: self.k_gamma is overwritten here, so a second call on the
        # same object will see k_gamma == 1 and set add_re to True.
        if self.k_gamma == 0:
            self.add_re = False
            self.k_gamma = 1
            k += 1
        else:
            self.add_re = True

        # Linear constraints: for each entry of self.ran_list, rows with +1
        # and -1 on neighbouring variables, starting right after the betas.
        # NOTE(review): together with the all-zero bounds `c` below this
        # presumably ties neighbouring gammas to be equal - confirm against
        # the LimeTr constraint convention.
        C = []
        start = self.k_beta
        for ran in self.ran_list:
            _, dims = ran
            m = np.prod(dims[self.n_grouping_dims:])
            c = np.zeros((m - 1, k))
            for i in range(m - 1):
                c[i, start + i] = 1
                c[i, start + i + 1] = -1
            C.append(c)
            start += m
        if len(C) > 0:
            self.constraints = np.vstack(C)
            assert self.constraints.shape[1] == k
        else:
            self.constraints = []

        # From here on the names C and c are reused: C / JC become the
        # constraint function and its jacobian, c the constraint bounds.
        # All three stay None when there are no constraints.
        C = None
        if len(self.constraints) > 0:

            def C(var):
                return self.constraints.dot(var)

        JC = None
        if len(self.constraints) > 0:

            def JC(var):
                return self.constraints

        c = None
        if len(self.constraints) > 0:
            c = np.zeros((2, self.constraints.shape[0]))

        # Two-row bound array over all k variables: betas unbounded, all
        # remaining variables bounded below by 1e-7.
        # NOTE(review): row 0 is presumably the lower and row 1 the upper
        # bound - confirm against LimeTr.
        self.uprior = np.array(
            [[-np.inf] * self.k_beta + [1e-7] * self.k_gamma + [1e-7] *
             (k - self.k_beta - self.k_gamma), [np.inf] * k])

        # User supplied bounds for the global covariates; the columns are
        # shifted by one when a global intercept occupies column 0.
        if self.global_cov_bounds is not None:
            if self.global_intercept:
                self.uprior[:, 1:len(self.global_ids) +
                            1] = self.global_cov_bounds
            else:
                self.uprior[:, :len(self.global_ids)] = self.global_cov_bounds

        # Optional Gaussian prior: first row all zeros, second row infinite
        # except for the gamma entries, which take self.ran_eff_gamma_sd.
        self.gprior = None
        if self.use_gprior:
            assert len(self.ran_eff_gamma_sd) == self.k_gamma
            self.gprior = np.array(
                [[0] * k, [np.inf] * self.k_beta + self.ran_eff_gamma_sd +
                 [np.inf] * (k - self.k_beta - self.k_gamma)])

        # Initial point: constant 0.01, or a seeded random draw, or the
        # user supplied `var` (padded with the placeholder gamma when there
        # are no random effects).
        x0 = np.ones(k) * .01
        if random_seed != 0:
            np.random.seed(random_seed)
            x0 = np.random.randn(k) * .01
        if var is not None:
            if self.add_re is True:
                assert len(var) == k
                x0 = var
            else:
                assert len(var) == self.k_beta
                x0 = np.append(var, [1e-8])
                assert len(x0) == k

        if build_X:
            self._buildX()
        # Fixed effects fit: either requested as a warm start, or the only
        # fit there is when the object has no random effects.
        if fit_fixed or self.add_re is False:
            # both bound rows of the gamma columns are set to 1e-8, which
            # pins gamma at that value for this fit
            uprior_fixed = copy.deepcopy(self.uprior)
            uprior_fixed[:, self.k_beta:self.k_beta + self.k_gamma] = 1e-8
            # LimeTr is used when the standard deviations are unknown or
            # trimming is requested; otherwise self._solveBeta(S) is used.
            if S is None or trim_percentage >= 0.01:
                model_fixed = LimeTr(self.grouping,
                                     int(self.k_beta),
                                     int(self.k_gamma),
                                     self.Y,
                                     self.X,
                                     self.JX,
                                     self.Z,
                                     S=S,
                                     C=C,
                                     JC=JC,
                                     c=c,
                                     inlier_percentage=1. - trim_percentage,
                                     share_obs_std=share_obs_std,
                                     uprior=uprior_fixed)
                model_fixed.optimize(
                    x0=x0,
                    print_level=inner_print_level,
                    max_iter=inner_max_iter,
                    tol=inner_tol,
                    acceptable_tol=inner_acceptable_tol,
                    nlp_scaling_min_value=inner_nlp_scaling_min_value)

                # the fixed effects solution is the starting point of the
                # full fit below
                x0 = model_fixed.soln
                self.beta_fixed = model_fixed.beta
                if self.add_re is False:
                    # no random effects: the fixed fit is the final answer
                    self.beta_soln = self.beta_fixed
                    self.delta_soln = model_fixed.delta
                    self.gamma_soln = model_fixed.gamma
                    self.w_soln = model_fixed.w
                    self.info = model_fixed.info['status_msg']
                    self.yfit_no_random = model_fixed.F(model_fixed.beta)
                    return
            else:
                self.beta_fixed = self._solveBeta(S)
                x0 = np.append(self.beta_fixed, [1e-8] * self.k_gamma)
                if self.add_re is False:
                    # only beta_soln and yfit_no_random are set on this path
                    self.beta_soln = self.beta_fixed
                    self.yfit_no_random = self.Xm.dot(self.beta_fixed)
                    return

        # Full mixed effects fit, started from x0.
        model = LimeTr(self.grouping,
                       int(self.k_beta),
                       int(self.k_gamma),
                       self.Y,
                       self.X,
                       self.JX,
                       self.Z,
                       S=S,
                       C=C,
                       JC=JC,
                       c=c,
                       inlier_percentage=1 - trim_percentage,
                       share_obs_std=share_obs_std,
                       uprior=self.uprior,
                       gprior=self.gprior)
        model.fitModel(x0=x0,
                       inner_print_level=inner_print_level,
                       inner_max_iter=inner_max_iter,
                       inner_acceptable_tol=inner_acceptable_tol,
                       inner_nlp_scaling_min_value=inner_nlp_scaling_min_value,
                       inner_tol=inner_tol,
                       outer_verbose=outer_verbose,
                       outer_max_iter=outer_max_iter,
                       outer_step_size=outer_step_size,
                       outer_tol=outer_tol)
        self.beta_soln = model.beta
        self.gamma_soln = model.gamma
        # gamma_soln is the same object as model.gamma here, so the in-place
        # division below also rescales model.gamma
        if normalize_Z:
            self.gamma_soln /= Z_norm**2
        self.delta_soln = model.delta
        self.info = model.info
        self.w_soln = model.w
        self.u_soln = model.estimateRE()
        self.solve_status = model.info['status']
        self.solve_status_msg = model.info['status_msg']

        self.yfit_no_random = model.F(model.beta)

        # Fitted values including random effects, assembled group by group.
        # np.split with an integer cuts into n_groups equal parts, so this
        # requires the number of rows to be divisible by n_groups.
        self.yfit = []
        Z_split = np.split(self.Z, self.n_groups)
        yfit_no_random_split = np.split(self.yfit_no_random, self.n_groups)

        for i in range(self.n_groups):
            self.yfit.append(yfit_no_random_split[i] +
                             Z_split[i].dot(self.u_soln[i]))
        self.yfit = np.concatenate(self.yfit)
        self.model = model

        # report a non-zero solver status
        if inner_verbose == True and self.solve_status != 0:
            print(self.solve_status_msg)
コード例 #11
0
ファイル: model.py プロジェクト: vishalbelsare/CrossWalk
    def fit(self,
            max_iter=100,
            inlier_pct=1.0,
            outer_max_iter=100,
            outer_step_size=1.0):
        """Optimize the model parameters.

        This is an interface to limetr. The solution is stored in
        `self.beta`, `self.gamma`, `self.w`, `self.fixed_vars`,
        `self.random_vars` and `self.beta_sd`; the fitted limetr object is
        kept in `self.lt`.

        Args:
            max_iter (int, optional):
                Maximum number of iterations.
            inlier_pct (float, optional):
                How much percentage of the data do you trust.
            outer_max_iter (int, optional):
                Outer maximum number of iterations.
            outer_step_size (float, optional):
                Step size of the trimming problem, the larger the step size the faster it will converge,
                and the less quality of trimming it will guarantee.
        """
        cwdata = self.cwdata

        # group sizes for limetr; without studies every observation forms
        # its own group
        group_sizes = cwdata.study_sizes
        if group_sizes.size == 0:
            group_sizes = np.full(cwdata.num_obs, 1)

        design = self.design_mat
        # a single random intercept
        random_design = np.ones((cwdata.num_obs, 1))

        # priors of beta followed by the prior of gamma as last column
        uprior = np.hstack(
            (self.prior_beta_uniform, self.prior_gamma_uniform[:, None]))
        gprior = np.hstack(
            (self.prior_beta_gaussian, self.prior_gamma_gaussian[:, None]))

        if self.constraint_mat is not None:
            num_constraints = self.constraint_mat.shape[0]
            # zero column appended so gamma takes no part in the constraints
            padded_constraints = np.hstack(
                (self.constraint_mat, np.zeros((num_constraints, 1))))
            cvec = np.array([[-np.inf] * num_constraints,
                             [0.0] * num_constraints])

            def cfun(var):
                return padded_constraints.dot(var)

            def jcfun(var):
                return padded_constraints
        else:
            cfun = None
            jcfun = None
            cvec = None

        def linear_fun(var):
            return design.dot(var)

        def linear_jac(beta):
            return design

        self.lt = LimeTr(group_sizes,
                         self.num_vars,
                         1,
                         cwdata.obs,
                         linear_fun,
                         linear_jac,
                         random_design,
                         S=cwdata.obs_se,
                         gprior=gprior,
                         uprior=uprior,
                         C=cfun,
                         JC=jcfun,
                         c=cvec,
                         inlier_percentage=inlier_pct)
        solution = self.lt.fitModel(inner_print_level=5,
                                    inner_max_iter=max_iter,
                                    outer_max_iter=outer_max_iter,
                                    outer_step_size=outer_step_size)
        self.beta, self.gamma, self.w = solution

        # fixed effects per variable name
        fixed_vars = {}
        for name in self.vars:
            fixed_vars[name] = self.beta[self.var_idx[name]]
        self.fixed_vars = fixed_vars

        # random intercept per study, empty when not in use
        if not self.use_random_intercept:
            self.random_vars = {}
        else:
            random_effects = self.lt.estimateRE()
            self.random_vars = {
                study: random_effects[pos]
                for pos, study in enumerate(self.cwdata.unique_study_id)
            }

        # compute the posterior distribution of beta; the entries of the
        # gold standard keep a standard deviation of zero
        hessian = self.get_beta_hessian()
        beta_positions = np.arange(self.lt.k_beta)
        free_positions = [
            beta_positions[self.var_idx[dorm]]
            for dorm in self.cwdata.unique_dorms if dorm != self.gold_dorm
        ]
        unconstrained_id = np.hstack(free_positions)
        self.beta_sd = np.zeros(self.lt.k_beta)
        covariance = np.linalg.inv(hessian)
        self.beta_sd[unconstrained_id] = np.sqrt(np.diag(covariance))
コード例 #12
0
ファイル: model.py プロジェクト: vishalbelsare/CrossWalk
class CWModel:
    """Cross Walk model.
    """
    def __init__(self,
                 cwdata,
                 obs_type='diff_log',
                 cov_models=None,
                 gold_dorm=None,
                 order_prior=None,
                 use_random_intercept=True,
                 prior_gamma_uniform=None,
                 prior_gamma_gaussian=None):
        """Constructor of CWModel.

        Args:
            cwdata (data.CWData):
                Data for cross walk.
            obs_type (str, optional):
                Type of observation, either `'diff_log'` or `'diff_logit'`.
            cov_models (list{crosswalk.CovModel}):
                A list of covariate models for the definitions/methods.
                Defaults to a single intercept model.
            gold_dorm (str | None, optional):
                Gold standard definition/method. Defaults to
                `cwdata.max_ref_dorm`.
            order_prior (list{list{str}} | None, optional):
                Order priors between different definitions.
            use_random_intercept (bool, optional):
                If ``True``, use random intercept. It is reset to ``False``
                with a warning when the data has no studies.
            prior_gamma_uniform (Tuple[float, float], optional):
                If not ``None``, use it as the bound of gamma.
            prior_gamma_gaussian (Tuple[float, float], optional):
                If not ``None``, use it as the gaussian prior of gamma.
        """
        self.cwdata = cwdata
        self.obs_type = obs_type
        default_cov_models = [CovModel('intercept')]
        self.cov_models = utils.default_input(cov_models, default_cov_models)
        self.gold_dorm = utils.default_input(gold_dorm, cwdata.max_ref_dorm)
        self.order_prior = order_prior
        self.use_random_intercept = use_random_intercept
        if self.cwdata.num_studies == 0 and self.use_random_intercept:
            warnings.warn("Must have study_id to use random intercept."
                          " Reset use_random_intercept to False.")
            self.use_random_intercept = False

        # check input
        self.check()

        # transformation between the observation space and the linear
        # space, and its inverse
        if self.obs_type != 'diff_log':

            def obs_fun(x):
                return np.log(x / (1.0 - x))

            def obs_inv_fun(y):
                return 1.0 / (1.0 + np.exp(-y))
        else:

            def obs_fun(x):
                return np.log(x)

            def obs_inv_fun(y):
                return np.exp(y)

        self.obs_fun = obs_fun
        self.obs_inv_fun = obs_inv_fun

        # variable names, one per definition/method
        self.vars = list(self.cwdata.unique_dorms)

        # dimensions
        self.num_vars_per_dorm = sum(
            model.num_vars for model in self.cov_models)
        self.num_vars = self.num_vars_per_dorm * self.cwdata.num_dorms

        # positions of the variables that belong to each definition/method
        var_sizes = np.array([self.num_vars_per_dorm] * self.cwdata.num_dorms)
        positions = utils.sizes_to_indices(var_sizes)
        self.var_idx = {
            name: positions[pos] for pos, name in enumerate(self.vars)
        }

        # design matrix and the checks that go with each building step
        self.relation_mat = self.create_relation_mat()
        self._check_relation_mat()
        self.cov_mat = self.create_cov_mat()
        self._assert_covs_independent()
        self.design_mat = self.create_design_mat()
        self._assert_rank_efficient()
        self.constraint_mat = self.create_constraint_mat()

        # gamma bounds: user input if given, fixed at zero without random
        # intercept, and never a negative lower bound
        if prior_gamma_uniform is None:
            self.prior_gamma_uniform = np.array([0.0, np.inf])
        else:
            self.prior_gamma_uniform = np.array(prior_gamma_uniform)
        if not self.use_random_intercept:
            self.prior_gamma_uniform = np.zeros(2)
        if self.prior_gamma_uniform[0] < 0.0:
            warnings.warn(
                "Lower bound of gamma has to be non-negative, reset it to zero."
            )
            self.prior_gamma_uniform[0] = 0.0

        # gamma Gaussian prior, ignored without random intercept
        if prior_gamma_gaussian is None:
            self.prior_gamma_gaussian = np.array([0.0, np.inf])
        else:
            self.prior_gamma_gaussian = np.array(prior_gamma_gaussian)
        if not self.use_random_intercept:
            self.prior_gamma_gaussian = np.array([0.0, np.inf])

        # beta bounds: unbounded by default, overwritten by the covariate
        # models, and the gold standard pinned at zero
        unbounded = np.array([[-np.inf], [np.inf]])
        beta_bounds = np.repeat(unbounded, self.num_vars, axis=1)
        for pos, cov_model in enumerate(self.cov_models):
            for dorm, prior in cov_model.prior_beta_uniform.items():
                beta_bounds[:, self.var_idx[dorm][pos]] = prior
        beta_bounds[:, self.var_idx[self.gold_dorm]] = 0.0
        self.prior_beta_uniform = beta_bounds

        # beta Gaussian prior: flat by default, overwritten by the covariate
        # models, and always flat for the gold standard
        flat = np.array([[0.0], [np.inf]])
        beta_gaussian = np.repeat(flat, self.num_vars, axis=1)
        for pos, cov_model in enumerate(self.cov_models):
            for dorm, prior in cov_model.prior_beta_gaussian.items():
                beta_gaussian[:, self.var_idx[dorm][pos]] = prior
        beta_gaussian[:, self.var_idx[self.gold_dorm]] = np.array(
            [[0.0], [np.inf]])
        self.prior_beta_gaussian = beta_gaussian

        # place holder for the solutions
        self.beta = None
        self.beta_sd = None
        self.gamma = None
        self.fixed_vars = None
        self.random_vars = None

    def check(self):
        """Validate type and value of the inputs stored on the model.

        Raises:
            AssertionError: If the data is not a `data.CWData`, the
                observation type is unknown, the covariate models are not a
                list of `CovModel`, the gold standard is not among the
                definitions/methods of the data, or the order prior is
                neither `None` nor a list.
        """
        assert isinstance(self.cwdata, data.CWData)

        supported_obs_types = ['diff_log', 'diff_logit']
        assert self.obs_type in supported_obs_types, \
            "Unsupport observation type"

        assert isinstance(self.cov_models, list)
        models_are_valid = all(
            isinstance(model, CovModel) for model in self.cov_models)
        assert models_are_valid

        assert self.gold_dorm in self.cwdata.unique_dorms

        order_prior = self.order_prior
        assert order_prior is None or isinstance(order_prior, list)

    def _assert_covs_independent(self):
        """Check if the covariates are independent.
        """
        rank = np.linalg.matrix_rank(self.cov_mat)
        if rank < self.cov_mat.shape[1]:
            raise ValueError(
                "Covariates are collinear, that is, some covariate column is a linear combination of "
                "some of the other columns. Please check them carefully.")

    def _assert_rank_efficient(self):
        """Check the rank of the design matrix.
        """
        rank = np.linalg.matrix_rank(self.design_mat)
        num_unknowns = self.num_vars_per_dorm * (self.cwdata.num_dorms - 1)
        if rank < num_unknowns:
            raise ValueError(
                f"Not enough information in the data to recover parameters."
                f"Number of effective data points is {rank} and number of unknowns is {num_unknowns}."
                f"Please include more effective data or reduce the number of covariates."
            )

    def create_relation_mat(self, cwdata=None):
        """Create the relation matrix between observations and definitions.

        Args:
            cwdata (data.CWData | None, optional):
                Optional data set, if None, use `self.cwdata`.

        Returns:
            numpy.ndarray:
                Matrix with one row per observation and one column per
                definition/method. Every alternative definition of an
                observation adds 1 to its column and every reference
                definition subtracts 1.
        """
        cwdata = utils.default_input(cwdata, default=self.cwdata)
        assert isinstance(cwdata, data.CWData)

        relation_mat = np.zeros((cwdata.num_obs, cwdata.num_dorms))
        signed_groups = ((1.0, cwdata.alt_dorms), (-1.0, cwdata.ref_dorms))
        for sign, dorms_per_obs in signed_groups:
            for row, dorms in enumerate(dorms_per_obs):
                for dorm in dorms:
                    relation_mat[row, cwdata.dorm_idx[dorm]] += sign

        return relation_mat

    def _check_relation_mat(self):
        """Validate the relation matrix by rejecting dorms that are never used.

        A dorm is reported as unused when every entry of its column in
        `self.relation_mat` is zero.

        Raises:
            ValueError: If at least one dorm has an all-zero column.
        """
        col_scales = np.abs(self.relation_mat).max(axis=0)
        unused_cols = np.flatnonzero(col_scales == 0.0).tolist()
        unused_dorms = [self.cwdata.unique_dorms[col] for col in unused_cols]
        if unused_dorms:
            raise ValueError(
                f"{unused_dorms} appears to be unused, most likely it is (they are) "
                f"in both alt_dorms and ref_dorms at the same time for all its (their) "
                f"appearance. Please remove {unused_dorms} from alt_dorms and ref_dorms."
            )

    def create_cov_mat(self, cwdata=None):
        """Assemble the covariates matrix for the definitions/methods model.

        Args:
            cwdata (data.CWData | None, optional):
                Optional data set, if None, use `self.cwdata`.

        Returns:
            numpy.ndarray:
                The design matrices of all models in `self.cov_models`,
                stacked side by side in the order of that list.
        """
        cwdata = utils.default_input(cwdata, default=self.cwdata)
        assert isinstance(cwdata, data.CWData)

        # one block of columns per covariate model
        cov_blocks = [
            cov_model.create_design_mat(cwdata)
            for cov_model in self.cov_models
        ]
        return np.hstack(cov_blocks)

    def create_design_mat(self, cwdata=None, relation_mat=None, cov_mat=None):
        """Combine the relation and covariates matrices into the design matrix.

        Each row of the covariates matrix is repeated once per dorm and
        scaled by the matching entry of the flattened relation matrix; the
        result is reshaped to one row per observation.

        Args:
            cwdata (data.CWData | None, optional):
                Optional data set, if None, use `self.cwdata`.
            relation_mat (numpy.ndarray | None, optional):
                Optional relation matrix, if None, use `self.relation_mat`
            cov_mat (numpy.ndarray | None, optional):
                Optional covariates matrix, if None, use `self.cov_mat`

        Returns:
            numpy.ndarray:
                Linear design matrix of shape
                ``(cwdata.num_obs, self.num_vars)``.
        """
        cwdata = utils.default_input(cwdata, default=self.cwdata)
        relation_mat = utils.default_input(relation_mat,
                                           default=self.relation_mat)
        cov_mat = utils.default_input(cov_mat, default=self.cov_mat)

        # column vector with one entry per (observation, dorm) pair
        dorm_signs = relation_mat.ravel()[:, None]
        # covariate rows duplicated so they line up with `dorm_signs`
        repeated_covs = np.repeat(cov_mat, cwdata.num_dorms, axis=0)
        scaled_covs = dorm_signs * repeated_covs

        return scaled_covs.reshape(cwdata.num_obs, self.num_vars)

    def create_constraint_mat(self):
        """Create the linear constraint matrix implied by `self.order_prior`.

        The column-wise minimum and maximum of `self.cov_mat` are used as the
        covariate rows (a single row when they coincide). For every pair
        ``p`` in `self.order_prior`, those rows are written with a positive
        sign into the columns ``self.var_idx[p[0]]`` and with a negative sign
        into the columns ``self.var_idx[p[1]]``; the blocks of all pairs are
        stacked vertically.

        Returns:
            numpy.ndarray | None:
                Constraint matrix with `self.num_vars` columns, or None when
                there is nothing to constrain (`self.order_prior` is None or
                empty, or the resulting matrix has no entries).
        """
        # An empty prior list means "no constraints"; without this guard
        # np.vstack below would fail on an empty sequence.
        if self.order_prior is None or len(self.order_prior) == 0:
            return None

        cov_mat = self.cov_mat
        min_cov_mat = np.min(cov_mat, axis=0)
        max_cov_mat = np.max(cov_mat, axis=0)

        # constant covariates only need a single constraint row per pair
        if np.allclose(min_cov_mat, max_cov_mat):
            design_mat = min_cov_mat[None, :]
        else:
            design_mat = np.vstack((min_cov_mat, max_cov_mat))

        sub_mats = []
        for p in self.order_prior:
            sub_mat = np.zeros((design_mat.shape[0], self.num_vars))
            sub_mat[:, self.var_idx[p[0]]] = design_mat
            sub_mat[:, self.var_idx[p[1]]] = -design_mat
            sub_mats.append(sub_mat)
        mat = np.vstack(sub_mats)

        return None if mat.size == 0 else mat

    def fit(self,
            max_iter=100,
            inlier_pct=1.0,
            outer_max_iter=100,
            outer_step_size=1.0):
        """Optimize the model parameters through limetr.

        A `LimeTr` problem is assembled from the data, the design matrix, the
        uniform and Gaussian priors and the optional constraint matrix. After
        the solve, the results are stored on the instance: `lt`, `beta`,
        `gamma`, `w`, `fixed_vars`, `random_vars` and `beta_sd`.

        Args:
            max_iter (int, optional):
                Maximum number of iterations.
            inlier_pct (float, optional):
                How much percentage of the data do you trust.
            outer_max_iter (int, optional):
                Outer maximum number of iterations.
            outer_step_size (float, optional):
                Step size of the trimming problem, the larger the step size
                the faster it will converge, and the less quality of trimming
                it will guarantee.
        """
        # group sizes: fall back to a size of one per observation when the
        # data carries no study sizes
        study_sizes = self.cwdata.study_sizes
        if study_sizes.size == 0:
            study_sizes = np.full(self.cwdata.num_obs, 1)

        num_beta = self.num_vars
        num_gamma = 1
        obs = self.cwdata.obs
        obs_se = self.cwdata.obs_se
        design_mat = self.design_mat
        re_mat = np.ones((self.cwdata.num_obs, 1))

        # priors: the beta columns followed by the single gamma column
        uprior = np.hstack(
            (self.prior_beta_uniform, self.prior_gamma_uniform[:, None]))
        gprior = np.hstack(
            (self.prior_beta_gaussian, self.prior_gamma_gaussian[:, None]))

        if self.constraint_mat is not None:
            num_constraints = self.constraint_mat.shape[0]
            # extra zero column so the constraints ignore the gamma entry
            cmat = np.hstack(
                (self.constraint_mat, np.zeros((num_constraints, 1))))

            # bounds of the form -inf <= cmat.dot(var) <= 0
            cvec = np.array([[-np.inf] * num_constraints,
                             [0.0] * num_constraints])

            def cfun(var):
                return cmat.dot(var)

            def jcfun(var):
                return cmat
        else:
            cfun = None
            jcfun = None
            cvec = None

        def fun(var):
            return design_mat.dot(var)

        def jfun(beta):
            return design_mat

        self.lt = LimeTr(study_sizes,
                         num_beta,
                         num_gamma,
                         obs,
                         fun,
                         jfun,
                         re_mat,
                         S=obs_se,
                         gprior=gprior,
                         uprior=uprior,
                         C=cfun,
                         JC=jcfun,
                         c=cvec,
                         inlier_percentage=inlier_pct)
        self.beta, self.gamma, self.w = self.lt.fitModel(
            inner_print_level=5,
            inner_max_iter=max_iter,
            outer_max_iter=outer_max_iter,
            outer_step_size=outer_step_size)

        # fixed effects keyed by variable name
        self.fixed_vars = {
            name: self.beta[self.var_idx[name]]
            for name in self.vars
        }

        # random effects keyed by study id, empty without random intercept
        if not self.use_random_intercept:
            self.random_vars = dict()
        else:
            u = self.lt.estimateRE()
            self.random_vars = {
                sid: u[i]
                for i, sid in enumerate(self.cwdata.unique_study_id)
            }

        # posterior standard deviation of beta; entries that belong to the
        # gold dorm stay at zero
        hessian = self.get_beta_hessian()
        all_beta_idx = [
            np.arange(self.lt.k_beta)[self.var_idx[dorm]]
            for dorm in self.cwdata.unique_dorms if dorm != self.gold_dorm
        ]
        unconstrained_id = np.hstack(all_beta_idx)
        self.beta_sd = np.zeros(self.lt.k_beta)
        beta_var = np.diag(np.linalg.inv(hessian))
        self.beta_sd[unconstrained_id] = np.sqrt(beta_var)

    def get_beta_hessian(self) -> np.ndarray:
        """Compute the Hessian with respect to beta, without the gold dorm.

        Works on the fitted `self.lt` object: its Jacobian and `Z` matrix are
        scaled row-wise by the square root of `self.lt.w`, the data term is
        formed through `limetr.utils.VarMat`, and the inverse squared
        standard deviations of the Gaussian beta prior are added on the
        diagonal (zero when `self.lt` has no `gprior` attribute).

        Returns:
            numpy.ndarray:
                Hessian with the rows and columns indexed by
                ``self.var_idx[self.gold_dorm]`` removed.
        """
        lt = self.lt
        sqrt_w = np.sqrt(lt.w)[:, None]
        x = lt.JF(lt.beta) * sqrt_w
        z = lt.Z * sqrt_w
        v = limetr.utils.VarMat(lt.V**lt.w, z, lt.gamma, lt.n)

        if not hasattr(lt, 'gprior'):
            # infinite sd: the prior adds nothing to the Hessian
            beta_gprior_sd = np.repeat(np.inf, lt.k_beta)
        else:
            # second row of the beta columns of the Gaussian prior
            beta_gprior_sd = lt.gprior[:, lt.idx_beta][1]

        hessian = x.T.dot(v.invDot(x)) + np.diag(1.0 / beta_gprior_sd**2)

        # drop the gold dorm variables from both rows and columns
        gold_idx = self.var_idx[self.gold_dorm]
        for axis in (0, 1):
            hessian = np.delete(hessian, gold_idx, axis=axis)

        return hessian

    def get_cov_names(self) -> List[str]:
        """Return one column name per variable of the covariate models.

        Returns:
            List[str]:
                For a model without spline its `cov_name`; for a model with
                spline the names ``{cov_name}_spline_{i}`` for ``i`` in
                ``range(model.num_vars)``. The order follows
                `self.cov_models`.
        """
        names = []
        for cov_model in self.cov_models:
            if model_has_spline := (cov_model.spline is not None):
                names += [
                    f'{cov_model.cov_name}_spline_{i}'
                    for i in range(cov_model.num_vars)
                ]
            if not model_has_spline:
                names.append(cov_model.cov_name)
        return names

    def create_result_df(self) -> pd.DataFrame:
        """Collect the fitted results in a data frame.

        Returns:
            pd.DataFrame:
                One row per variable with the columns ``dorms``,
                ``cov_names``, ``beta`` and ``beta_sd``. With random
                intercept, a ``gamma`` column and one column per study id are
                appended; they hold their value in the first row(s) and are
                padded with NaN up to `self.num_vars` rows.
        """
        # every dorm owns `num_vars_per_dorm` consecutive rows
        dorm_col = np.repeat(self.cwdata.unique_dorms, self.num_vars_per_dorm)
        cov_name_col = self.get_cov_names() * self.cwdata.num_dorms

        df = pd.DataFrame({
            'dorms': dorm_col,
            'cov_names': cov_name_col,
            'beta': self.beta,
            'beta_sd': self.beta_sd,
        })
        if not self.use_random_intercept:
            return df

        # pad gamma and the random effects with NaN to the frame length
        num_pad = self.num_vars - 1
        gamma = np.hstack((self.lt.gamma, np.full(num_pad, np.nan)))
        nan_block = np.full((self.cwdata.num_studies, num_pad), np.nan)
        re = np.hstack((self.lt.u, nan_block))

        df['gamma'] = gamma
        for row, study_id in enumerate(self.cwdata.unique_study_id):
            df[study_id] = re[row]

        return df

    def save_result_df(self, folder: str, filename: str = 'result.csv'):
        """Write the result data frame to a csv file.

        Args:
            folder (str): Path to the result folder.
            filename (str): Name of the result, ``'.csv'`` is appended when
                the name does not already end with it. Default to
                `'result.csv'`.
        """
        csv_name = filename if filename.endswith('.csv') else filename + '.csv'
        result_df = self.create_result_df()
        # the row index is not written to the file
        result_df.to_csv('/'.join((folder, csv_name)), index=False)

    def adjust_orig_vals(self,
                         df,
                         orig_dorms,
                         orig_vals_mean,
                         orig_vals_se,
                         study_id=None,
                         data_id=None,
                         ref_dorms=None):
        """Adjust alternative values to the reference definition or method.

        The original values are moved to the transformed space selected by
        `self.obs_type`, the predicted difference (and, optionally, the
        random effect of the study) is subtracted, and the result is mapped
        back to linear space.

        Args:
            df (pd.DataFrame):
                Data frame of the alternative values that need to be adjusted.
            orig_dorms (str):
                Name of the column in `df` that contains the alternative
                definitions or methods.
            orig_vals_mean (str):
                Name of the column in `df` that contains the alternative values.
            orig_vals_se (str):
                Name of the column in `df` that contains the standard error of
                alternative values.
            study_id (str | None, optional):
                If not `None`, predict with the random effects.
            data_id (str | None, optional):
                If `None` create data_id by the integer sequence.
            ref_dorms (str, optional):
                Name of the column with reference dorms, if is ``None``, use the
                gold_dorm as the reference dorm. Default to ``None``.

        Returns:
            pandas.DataFrame:
                The adjusted values and standard deviations, with the columns
                ``ref_vals_mean``, ``ref_vals_sd``, ``pred_diff_mean``,
                ``pred_diff_sd`` and ``data_id``.
        """
        # work on a copy so that helper columns never leak into `df`
        df_copy = df.copy()
        if ref_dorms is None:
            ref_dorms = 'ref_dorms'
            df_copy[ref_dorms] = np.array([self.gold_dorm] * df_copy.shape[0])
        if 'intercept' not in df_copy.columns:
            df_copy['intercept'] = np.ones(df_copy.shape[0])
        new_cwdata = data.CWData(df_copy,
                                 alt_dorms=orig_dorms,
                                 ref_dorms=ref_dorms,
                                 dorm_separator=self.cwdata.dorm_separator,
                                 covs=list(self.cwdata.covs.columns),
                                 data_id=data_id,
                                 add_intercept=False)

        # the new data has to share the dorm structure of the fitted data
        new_cwdata.copy_dorm_structure(self.cwdata)

        # design matrix of the new data
        new_design_mat = self.create_design_mat(
            cwdata=new_cwdata,
            relation_mat=self.create_relation_mat(cwdata=new_cwdata),
            cov_mat=self.create_cov_mat(cwdata=new_cwdata))

        # random effects: zero without study id, for unknown studies and for
        # rows that are already measured with the gold dorm
        if study_id is None:
            random_effects = np.zeros(df.shape[0])
        else:
            random_effects = np.array([
                0.0 if sid not in self.random_vars
                else self.random_vars[sid][0]
                for sid in df[study_id]
            ])
        is_gold_row = df[orig_dorms].values == self.gold_dorm
        random_effects[is_gold_row] = 0.0

        # pick the transformation pair once, based on the observation type
        if self.obs_type == 'diff_log':
            to_transformed = utils.linear_to_log
            to_linear = utils.log_to_linear
        else:
            to_transformed = utils.linear_to_logit
            to_linear = utils.logit_to_linear

        transformed_orig_vals_mean, transformed_orig_vals_se = to_transformed(
            df[orig_vals_mean].values,
            df[orig_vals_se].values
        )

        # predicted difference; gold dorm rows get no prediction uncertainty
        pred_diff_mean = new_design_mat.dot(self.beta)
        beta_var = self.beta_sd**2
        pred_diff_var = np.array([
            0.0 if dorm == self.gold_dorm
            else (new_design_mat[i]**2).dot(beta_var)
            for i, dorm in enumerate(df[orig_dorms])
        ])
        pred_diff_sd = np.sqrt(pred_diff_var)

        transformed_ref_vals_mean = (transformed_orig_vals_mean -
                                     pred_diff_mean - random_effects)
        transformed_ref_vals_sd = np.sqrt(transformed_orig_vals_se**2 +
                                          pred_diff_sd**2 + self.gamma[0]**2)

        ref_vals_mean, ref_vals_sd = to_linear(transformed_ref_vals_mean,
                                               transformed_ref_vals_sd)

        return pd.DataFrame({
            'ref_vals_mean': ref_vals_mean,
            'ref_vals_sd': ref_vals_sd,
            'pred_diff_mean': pred_diff_mean,
            'pred_diff_sd': pred_diff_sd,
            'data_id': new_cwdata.data_id
        })