Code Example #1
File: rlassoEffects.py Project: maxhuppertz/hdmpy
def simul_ci(k=1, Omega=None, var=None, seed=0, fix_seed=True, verbose=False):
    if Omega is None:
        Omega = np.identity(k)
    else:
        k = Omega.shape[0]

    if var is None:
        var = np.diag(Omega)

    try:
        if fix_seed:
            # This is a key difference between the R and Python
            # implementations. For some data sets, especially when k > n,
            # scipy.stats.multivariate_normal() will raise an error, claiming
            # that Omega is singular. R's
            # MASS::mvrnorm(), on the other hand, will happily use Omega and
            # calculate draws from it. I had to add allow_singular to get both
            # implementations to work similarly.
            beta = multivariate_normal(
                cov=Omega, allow_singular=True).rvs(random_state=seed)
        else:
            beta = multivariate_normal(cov=Omega, allow_singular=True).rvs()

        sim = np.amax(np.abs(cvec(beta) / cvec(np.sqrt(var))))
    except Exception as e:
        if verbose:
            print('Error encountered in simul_ci():')
            print(e)
            print()

        sim = np.nan

    return sim
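
A minimal usage sketch (assuming numpy as np and the module-level imports of
cvec() and scipy.stats.multivariate_normal used by the function): each call
returns one simulated draw of the maximum absolute normalized statistic.

# Hypothetical example: five reproducible draws for a 3-dimensional
# standard-normal Omega (the default identity covariance)
draws = [simul_ci(k=3, seed=s, fix_seed=True) for s in range(5)]
# Each element is max_j |beta_j / sqrt(var_j)| for one multivariate draw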
Code Example #2
File: rlassoEffects.py Project: maxhuppertz/hdmpy
def get_cov(X, e, add_intercept=True, homoskedastic=False):
    """ Calculates OLS variance estimator based on X and residuals

    Inputs
    X: n by k matrix, RHS variables
    e: n by 1 vector or vector-like, residuals from an OLS regression
    add_intercept: Boolean, if True, adds an intercept as the first column of X
                   (and increases k by one)
    homoskedastic: Boolean, if True, returns the classical homoskedastic
                   variance estimator; otherwise returns the HC0 sandwich
                   estimator

    Outputs
    V_hat: k by k NumPy array, estimated covariance matrix
    """
    # Get the number of observations n and parameters k
    n, k = X.shape

    # Check whether an intercept needs to be added
    if add_intercept:
        # If so, add the intercept
        X = np.concatenate([np.ones(shape=(n, 1)), X], axis=1)

        # Don't forget to increase k
        k = k + 1

    # Make sure the residuals are a proper column vector
    e = cvec(e)

    # Calculate X'X
    XX = X.T @ X

    # Calculate its inverse
    XXinv = linalg.inv(XX)

    # Check whether to use homoskedastic errors
    if homoskedastic:
        # If so, calculate the homoskedastic variance estimator
        V_hat = (1 / (n - k)) * XXinv * (e.T @ e)
    else:
        # Otherwise, calculate an intermediate object
        S = (e @ np.ones(shape=(1, k))) * X

        # Then, get the HC0 sandwich estimator
        V_hat = (n / (n - k)) * XXinv @ (S.transpose() @ S) @ XXinv

    # Return the result
    return V_hat
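
A short sketch of calling get_cov() on simulated data (the heteroskedastic
residuals here are made up for illustration; assumes the module's cvec() and
linalg imports are in scope):

import numpy as np

rng = np.random.default_rng(0)
X = rng.normal(size=(200, 2))
e = rng.normal(size=(200, 1)) * (1 + np.abs(X[:, [0]]))  # heteroskedastic
V_hc0 = get_cov(X, e)                      # HC0 sandwich estimator (default)
V_hom = get_cov(X, e, homoskedastic=True)  # classical estimator
se = np.sqrt(np.diag(V_hc0))  # standard errors; first entry is the intercept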
Code Example #3
File: rlassoEffects.py Project: maxhuppertz/hdmpy
def rlassoEffect_wrapper(i,
                         x,
                         y,
                         d,
                         method='double selection',
                         I3=None,
                         post=True,
                         colnames_d=None,
                         colnames_x=None,
                         intercept=True,
                         model=True,
                         homoskedastic=False,
                         X_dependent_lambda=False,
                         lambda_start=None,
                         c=1.1,
                         gamma=None,
                         numSim=5000,
                         numIter=15,
                         tol=10**(-5),
                         threshold=-np.inf,
                         par=True,
                         corecap=np.inf,
                         fix_seed=True,
                         verbose=False):
    """ Wrapper for rlassoEffect()

    Inputs
    i: Integer, index of the current variable of interest

    See the rlassoEffect() documentation for other inputs

    Output
    res: Dictionary, contains a collection of results from rlassoEffect(), or
         a collection of NaNs and empty lists if an error is encountered while
         running rlassoEffect()
    """
    if np.amin(x.shape) == 1:
        x = cvec(x)

    y = cvec(y)

    d = cvec(d)

    try:
        col = rlassoEffect(x,
                           y,
                           d,
                           method=method,
                           I3=I3,
                           post=post,
                           colnames_d=colnames_d,
                           colnames_x=colnames_x,
                           intercept=intercept,
                           model=model,
                           homoskedastic=homoskedastic,
                           X_dependent_lambda=X_dependent_lambda,
                           lambda_start=lambda_start,
                           c=c,
                           gamma=gamma,
                           numSim=numSim,
                           numIter=numIter,
                           tol=tol,
                           threshold=threshold,
                           par=par,
                           corecap=corecap,
                           fix_seed=fix_seed)

        smat = np.zeros(shape=(x.shape[1] + 1, 1)) * np.nan
        smat[np.arange(smat.shape[0]) != i] = col['selection_index']

        res = {
            'coefficients': [i, col['alpha']],
            'se': [i, col['se'][0]],
            't': [i, col['t'][0]],
            'pval': [i, col['pval'][0]],
            'lasso_regs': {
                i: col
            },
            'reside': [i, col['residuals']['epsilon']],
            'residv': [i, col['residuals']['v']],
            'coef_mat': {
                i: col['coefficients_reg']
            },
            'selection_matrix': [i, smat]
        }
    except Exception as e:
        # Mimic the results in the original code, where any errors result in a
        # variable being skipped, and the preallocated results arrays containing
        # either NANs or empty lists
        res = {
            'coefficients': [i, np.nan],
            'se': [i, np.nan],
            't': [i, np.nan],
            'lasso_regs': {
                i: e
            },
            'pval': [i, np.nan],
            'reside': [i, np.zeros(shape=(x.shape[0], 1)) * np.nan],
            'residv': [i, np.zeros(shape=(x.shape[0], 1)) * np.nan],
            'coef_mat': {
                i: []
            },
            'selection_matrix':
            [i, np.zeros(shape=(x.shape[1] + 1, 1)) * np.nan]
        }

        if verbose:
            print('Error encountered in rlassoEffect_wrapper()')
            print(e)
            print()

    return res
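
For context, a hedged sketch of a single sequential call (X and y are
hypothetical placeholders; in the package this wrapper is invoked through
joblib, as in Code Example #5 below):

# Treat column 0 of X as the variable of interest d, the rest as controls
res0 = rlassoEffect_wrapper(0, x=np.delete(X, 0, axis=1), y=y, d=X[:, 0])
res0['coefficients']  # [0, alpha_hat] on success, [0, nan] on failure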
Code Example #4
File: rlassoEffects.py Project: maxhuppertz/hdmpy
    def confint(self,
                parm=None,
                B=500,
                level=.95,
                joint=False,
                par=None,
                corecap=None,
                fix_seed=None,
                verbose=None):
        self.B = B

        if par is None:
            par = self.par_any

        if corecap is None:
            corecap = self.corecap

        if fix_seed is None:
            fix_seed = self.fix_seed

        if verbose is None:
            verbose = self.verbose

        n = self.res['samplesize']

        k = p1 = len(self.res['coefficients'])

        cf = self.res['coefficients']

        pnames = cf.index.values

        self.parm = parm
        self.level = level
        self.joint = joint

        if self.parm is None:
            self.parm = pnames
        elif np.issubdtype(np.asarray(self.parm).dtype, np.number):
            self.parm = pnames[parm]

        if not self.joint:
            a = (1 - self.level) / 2

            a = cvec([a, 1 - a])

            fac = norm.ppf(a)

            pct = [str(np.round(x * 100, 3)) + ' %' for x in a[:, 0]]

            ses = self.res['se'].loc[self.parm, :]

            self.ci = (cf.loc[self.parm, :] @ np.ones(shape=(1, 2))
                       + ses @ fac.T)

            self.ci.columns = pct

            return self.ci
        else:
            if verbose:
                print('\nCaution: Joint confidence intervals for hdmpy are',
                      'currently different from those of the original R',
                      'package hdm. This is a known bug.')
            e = self.res['residuals']['e'].values
            v = self.res['residuals']['v'].values

            ev = e * v

            Ev2 = np.mean(v**2, axis=0)

            Omegahat = np.zeros(shape=(self.k, self.k)) * np.nan

            for j in np.arange(self.k):
                for l in np.arange(start=j, stop=self.k):
                    Omegahat[j, l] = Omegahat[l, j] = (
                        (1 / (Ev2[j] * Ev2[l]))
                        * np.mean(ev[:, j] * ev[:, l]))

            var = np.diag(Omegahat)

            # Check whether to use parallel processing
            if par:
                # If so, get the number of cores to use
                cores = int(np.amin([mp.cpu_count(), corecap]))
            else:
                # Otherwise, use only one core (i.e. run sequentially)
                cores = 1

            sim = jbl.Parallel(n_jobs=cores)(
                jbl.delayed(simul_ci)(Omega=Omegahat / self.n,
                                      var=var,
                                      seed=i * 20,
                                      fix_seed=fix_seed,
                                      verbose=verbose)
                for i in np.arange(self.B))

            sim = cvec(sim)

            a = 1 - self.level

            ab = cvec([a / 2, 1 - a / 2])

            pct = [str(np.round(x * 100, 3)) + ' %' for x in ab[:, 0]]

            var = pd.DataFrame(var, index=self.parm)

            hatc = np.quantile(sim, q=1 - a)

            ci1 = cf.loc[self.parm, :] - hatc * np.sqrt(var.loc[self.parm, :])

            ci2 = cf.loc[self.parm, :] + hatc * np.sqrt(var.loc[self.parm, :])

            self.ci = pd.concat([ci1.iloc[:, 0], ci2.iloc[:, 0]], axis=1)

            self.ci.columns = pct

            return self.ci
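
A usage sketch, assuming model is an already-fitted instance of the class
whose __init__ appears in Code Example #5 (left as comments because the
enclosing class name is not shown in these excerpts):

# ci_pointwise = model.confint(level=.95)                 # per-coefficient CIs
# ci_joint = model.confint(level=.95, joint=True, B=500)  # simulated joint CIs
# Both return a pandas DataFrame with '2.5 %' / '97.5 %' style columns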
Code Example #5
File: rlassoEffects.py Project: maxhuppertz/hdmpy
    def __init__(self,
                 x,
                 y,
                 index=None,
                 method='partialling out',
                 I3=None,
                 post=True,
                 colnames=None,
                 intercept=True,
                 model=True,
                 homoskedastic=False,
                 X_dependent_lambda=False,
                 lambda_start=None,
                 c=1.1,
                 gamma=None,
                 numSim=5000,
                 numIter=15,
                 tol=10**(-5),
                 threshold=-np.inf,
                 par_outer=True,
                 par_inner=False,
                 par_any=True,
                 corecap=np.inf,
                 fix_seed=True,
                 verbose=False):
        # Initialize internal variables
        if isinstance(x, pd.DataFrame) and colnames is None:
            colnames = x.columns

        self.x = np.array(x).astype(np.float32)
        self.y = cvec(y).astype(np.float32)

        if index is None:
            self.index = cvec(np.arange(self.x.shape[1]))
        else:
            self.index = cvec(index)

        self.method = method

        self.I3 = I3

        self.post = post

        self.colnames = colnames

        if self.index.dtype == bool:
            self.k = self.p1 = self.index.sum()
        else:
            self.k = self.p1 = len(self.index)

        self.n = x.shape[0]

        self.intercept = intercept
        self.model = model

        self.homoskedastic = homoskedastic
        self.X_dependent_lambda = X_dependent_lambda
        self.lambda_start = lambda_start

        self.c = c
        self.gamma = gamma

        self.numSim = numSim
        self.numIter = numIter
        self.tol = tol
        self.threshold = threshold

        self.par_outer = par_outer
        self.par_inner = par_inner
        self.par_any = par_any
        self.corecap = corecap
        self.fix_seed = fix_seed

        if not self.par_any:
            self.par_outer = self.par_inner = False
        elif self.par_outer and self.par_inner:
            self.par_outer = False

        self.verbose = verbose

        # Initialize internal variables used in other functions
        self.B = None
        self.parm = None
        self.level = None
        self.joint = None

        # preprocessing index numerical vector
        if np.issubdtype(self.index.dtype, np.number):
            self.index = self.index.astype(int)

            if not (np.all(self.index[:, 0] < self.x.shape[1]) and
                    (len(self.index) <= self.x.shape[1])):
                raise ValueError('Numeric index includes elements which are ' +
                                 'outside of the column range of x, or the ' +
                                 'indexing vector is too long')

        elif self.index.dtype == bool:
            if not (len(self.index) <= self.x.shape[1]):
                raise ValueError('Boolean index vector is too long')

            self.index = cvec([i for i, b in enumerate(self.index[:, 0]) if b])

        elif np.issubdtype(self.index.dtype, np.str_):
            # self.x has already been converted to a NumPy array above, so
            # check against the stored column names rather than x.columns
            colnames_list = list(self.colnames)

            if not np.all([s in colnames_list for s in self.index[:, 0]]):
                raise ValueError('String index specifies column names which ' +
                                 'are not in the column names of x')

            # Map each provided name to its column position in x
            self.index = cvec(
                [colnames_list.index(s) for s in self.index[:, 0]])

        else:
            raise ValueError('Argument index has an invalid type')

        if (self.method == 'double selection') and (self.I3 is not None):
            I3ind = [i for i, b in enumerate(self.I3) if b]

            # Check the length of the plain list instead of comparing a NumPy
            # array to [], which is ambiguous
            if len(I3ind) > 0:
                if len([x for x in I3ind if x in self.index[:, 0]]) > 0:
                    raise ValueError('I3 and index must not overlap!')

        if self.colnames is None:
            self.colnames = ['V' + str(i + 1) for i in range(self.x.shape[1])]

        # Check whether to use parallel processing
        if self.par_outer:
            # If so, get the number of cores to use
            cores = int(np.amin([mp.cpu_count(), self.corecap]))
        else:
            # Otherwise, use only one core (i.e. run sequentially)
            cores = 1

        if (self.I3 is not None):
            res = jbl.Parallel(n_jobs=cores)(jbl.delayed(rlassoEffect_wrapper)(
                i,
                x=np.delete(self.x, i, axis=1),
                y=self.y,
                d=self.x[:, i],
                method=self.method,
                I3=np.delete(self.I3, i, axis=0),
                post=self.post,
                colnames_d=self.colnames[i],
                colnames_x=[c for j, c in enumerate(self.colnames) if j != i],
                intercept=self.intercept,
                model=self.model,
                homoskedastic=self.homoskedastic,
                X_dependent_lambda=self.X_dependent_lambda,
                lambda_start=self.lambda_start,
                c=self.c,
                gamma=self.gamma,
                numSim=self.numSim,
                numIter=self.numIter,
                tol=self.tol,
                threshold=self.threshold,
                par=self.par_inner,
                corecap=self.corecap,
                fix_seed=self.fix_seed,
                verbose=self.verbose) for i in self.index[:, 0])
        else:
            res = jbl.Parallel(n_jobs=cores)(jbl.delayed(rlassoEffect_wrapper)(
                i,
                x=np.delete(self.x, i, axis=1),
                y=self.y,
                d=self.x[:, i],
                method=self.method,
                I3=self.I3,
                post=self.post,
                colnames_d=self.colnames[i],
                colnames_x=[c for j, c in enumerate(self.colnames) if j != i],
                intercept=self.intercept,
                model=self.model,
                homoskedastic=self.homoskedastic,
                X_dependent_lambda=self.X_dependent_lambda,
                lambda_start=self.lambda_start,
                c=self.c,
                gamma=self.gamma,
                numSim=self.numSim,
                numIter=self.numIter,
                tol=self.tol,
                threshold=self.threshold,
                par=self.par_inner,
                corecap=self.corecap,
                fix_seed=self.fix_seed,
                verbose=self.verbose) for i in self.index[:, 0])

        # Convert collection of parallel results into usable results sorted by
        # their index
        coefficients = np.array([r['coefficients'] for r in res])
        coefficients = cvec(coefficients[coefficients[:, 0].argsort(), 1])

        se = np.array([r['se'] for r in res])
        se = cvec(se[se[:, 0].argsort(), 1])

        t = np.array([r['t'] for r in res])
        t = cvec(t[t[:, 0].argsort(), 1])

        pval = np.array([r['pval'] for r in res])
        pval = cvec(pval[pval[:, 0].argsort(), 1])

        lasso_regs = {}
        for r in res:
            lasso_regs.update(r['lasso_regs'])

        reside = (np.array([
            np.concatenate([cvec(r['reside'][0]), r['reside'][1]],
                           axis=0)[:, 0] for r in res
        ]))
        reside = reside[reside[:, 0].argsort(), 1:].T

        residv = (np.array([
            np.concatenate([cvec(r['residv'][0]), r['residv'][1]],
                           axis=0)[:, 0] for r in res
        ]))
        residv = residv[residv[:, 0].argsort(), 1:].T

        coef_mat = {}
        for r in res:
            coef_mat.update(r['coef_mat'])

        # Replaced this with the following two steps, to ensure this always
        # results in a two dimensional array
        #selection_matrix = (
        #    np.array([np.concatenate([cvec(r['selection_matrix'][0]),
        #                              r['selection_matrix'][1]],
        #                             axis=0)[:,0]
        #              for r in res])
        #)
        selection_matrix = [
            np.concatenate(
                [cvec(r['selection_matrix'][0]), r['selection_matrix'][1]],
                axis=0).T for r in res
        ]
        selection_matrix = (np.concatenate(selection_matrix, axis=0))
        selection_matrix = (selection_matrix[selection_matrix[:, 0].argsort(),
                                             1:])

        # Added this, to be able to add names to results objects
        idx = [self.colnames[i] for i in self.index[:, 0]]

        residuals = {
            'e': pd.DataFrame(reside, columns=idx),
            'v': pd.DataFrame(residv, columns=idx)
        }

        self.res = {
            'coefficients':
            pd.DataFrame(coefficients, index=idx),
            'se':
            pd.DataFrame(se, index=idx),
            't':
            pd.DataFrame(t, index=idx),
            'pval':
            pd.DataFrame(pval, index=idx),
            'lasso_regs':
            lasso_regs,
            'index':
            pd.DataFrame(self.index, index=idx),
            #call = match.call(),
            'samplesize':
            self.n,
            'residuals':
            residuals,
            'coef_mat':
            coef_mat,
            'selection_matrix':
            pd.DataFrame(selection_matrix,
                         index=idx,
                         columns=list(self.colnames))
        }
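
A hedged construction sketch (the enclosing class name is not shown in this
excerpt; it corresponds to the rlassoEffects interface of the R package hdm):

# X: n by p NumPy array or DataFrame, y: n-vector
# model = rlassoEffects(X, y, index=[0, 1, 2])  # assumed class name
# model.res['coefficients']  # effect estimates, indexed by column name
# model.res['se'], model.res['pval']  # accompanying inference results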
Code Example #6
File: rlassoEffects.py Project: maxhuppertz/hdmpy
def rlassoEffect(x,
                 y,
                 d,
                 method='double selection',
                 I3=None,
                 post=True,
                 colnames_d=None,
                 colnames_x=None,
                 intercept=True,
                 model=True,
                 homoskedastic=False,
                 X_dependent_lambda=False,
                 lambda_start=None,
                 c=1.1,
                 gamma=None,
                 numSim=5000,
                 numIter=15,
                 tol=10**(-5),
                 threshold=-np.inf,
                 par=True,
                 corecap=np.inf,
                 fix_seed=True):
    d = cvec(d)

    y = cvec(y)

    n, kx = x.shape

    if colnames_d is None:
        colnames_d = ['d1']

    if (colnames_x is None) and (x is not None):
        colnames_x = ['x' + str(i) for i in np.arange(kx)]

    if method == 'double selection':
        I1 = rlasso(x,
                    d,
                    post=post,
                    colnames=colnames_x,
                    intercept=intercept,
                    model=model,
                    homoskedastic=homoskedastic,
                    X_dependent_lambda=X_dependent_lambda,
                    lambda_start=lambda_start,
                    c=c,
                    gamma=gamma,
                    numSim=numSim,
                    numIter=numIter,
                    tol=tol,
                    threshold=threshold,
                    par=par,
                    corecap=corecap,
                    fix_seed=fix_seed).est['index']
        I2 = rlasso(x,
                    y,
                    post=post,
                    colnames=colnames_x,
                    intercept=intercept,
                    model=model,
                    homoskedastic=homoskedastic,
                    X_dependent_lambda=X_dependent_lambda,
                    lambda_start=lambda_start,
                    c=c,
                    gamma=gamma,
                    numSim=numSim,
                    numIter=numIter,
                    tol=tol,
                    threshold=threshold,
                    par=par,
                    corecap=corecap,
                    fix_seed=fix_seed).est['index']

        # Original code checks if type(I3) is bool, but I believe they only do
        # that to see whether it has been defined by the user
        if I3 is not None:
            I3 = cvec(I3)

            I = cvec(I1.astype(bool) | I2.astype(bool) | I3.astype(bool))
        else:
            I = cvec(I1.astype(bool) | I2.astype(bool))
            # missing here: names(I) <- union(names(I1),names(I2))

        if I.sum() == 0:
            I = None

        # If no variables were selected, I is None and cannot be used as a
        # column index; in that case regress y on d alone
        if I is None:
            x = d
        else:
            x = np.concatenate([d, x[:, I[:, 0]]], axis=1)

        reg1 = lm(fit_intercept=True).fit(x, y)

        alpha = reg1.coef_[0, 0]

        names_alpha = colnames_d

        resid = y - cvec(reg1.predict(x))

        if I is None:
            xi = (resid) * np.sqrt(n / (n - 1))
        else:
            xi = (resid) * np.sqrt(n / (n - I.sum() - 1))

        if I is None:
            # Fit an intercept-only model
            reg2 = lm(fit_intercept=False).fit(np.ones_like(d), d)

            v = d - cvec(reg2.predict(np.ones_like(d)))
        else:
            reg2 = lm(fit_intercept=True).fit(x[:, 1:], d)

            v = d - cvec(reg2.predict(x[:, 1:]))

        var = ((1 / n) * (1 / np.mean(v**2, axis=0)) * np.mean(
            (v**2) * (xi**2), axis=0) * (1 / np.mean(v**2, axis=0)))

        se = np.sqrt(var)

        tval = alpha / np.sqrt(var)

        pval = 2 * norm.cdf(-np.abs(tval))

        if I is None:
            no_selected = 1
        else:
            no_selected = 0

        res = {'epsilon': xi, 'v': v}

        if np.issubdtype(type(colnames_d), np.str_):
            colnames_d = [colnames_d]

        results = {
            'alpha': alpha,
            #'se': pd.DataFrame(se, index=colnames_d),
            'se': se,
            't': tval,
            'pval': pval,
            'no_selected': no_selected,
            'coefficients': alpha,
            'coefficient': alpha,
            'coefficients_reg': reg1.coef_,
            'selection_index': I,
            'residuals': res,
            #call = match.call(),
            'samplesize': n
        }
    elif method == 'partialling out':
        reg1 = rlasso(x,
                      y,
                      post=post,
                      colnames=colnames_x,
                      intercept=intercept,
                      model=model,
                      homoskedastic=homoskedastic,
                      X_dependent_lambda=X_dependent_lambda,
                      lambda_start=lambda_start,
                      c=c,
                      gamma=gamma,
                      numSim=numSim,
                      numIter=numIter,
                      tol=tol,
                      threshold=threshold,
                      par=par,
                      corecap=corecap,
                      fix_seed=fix_seed)

        yr = reg1.est['residuals']

        reg2 = rlasso(x,
                      d,
                      post=post,
                      colnames=colnames_x,
                      intercept=intercept,
                      model=model,
                      homoskedastic=homoskedastic,
                      X_dependent_lambda=X_dependent_lambda,
                      lambda_start=lambda_start,
                      c=c,
                      gamma=gamma,
                      numSim=numSim,
                      numIter=numIter,
                      tol=tol,
                      threshold=threshold,
                      par=par,
                      corecap=corecap,
                      fix_seed=fix_seed)

        dr = reg2.est['residuals']

        reg3 = lm(fit_intercept=True).fit(dr, yr)

        alpha = reg3.coef_[0, 0]

        resid = yr - cvec(reg3.predict(dr))

        # This is a difference from the original code. The original code uses
        # var <- vcov(reg3)[2, 2], which is the homoskedastic covariance
        # estimator for OLS. I wrote get_cov() to calculate that, because the
        # linear regression implementation in sklearn does not include standard
        # error calculations. (I could have switched to statsmodels instead, but
        # sklearn seems more likely to be maintained in the future.) I then
        # added the option to get_cov() to calculate heteroskedastic standard
        # errors. I believe that if the penalty term is adjusted for
        # heteroskedasticity, heteroskedastic standard errors should also be
        # used here, to be internally consistent.
        var = np.array([get_cov(dr, resid, homoskedastic=homoskedastic)[1, 1]])

        se = np.sqrt(var)

        tval = alpha / np.sqrt(var)

        pval = 2 * norm.cdf(-np.abs(tval))

        res = {'epsilon': resid, 'v': dr}

        I1 = reg1.est['index']

        I2 = reg2.est['index']

        I = cvec(I1.astype(bool) | I2.astype(bool))

        #names(I) <- union(names(I1),names(I2))

        results = {
            'alpha': alpha,
            'se': se,
            't': tval,
            'pval': pval,
            'coefficients': alpha,
            'coefficient': alpha,
            'coefficients_reg': reg1.est['coefficients'],
            'selection_index': I,
            'residuals': res,
            #call = match.call(),
            'samplesize': n
        }

    return results
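
A sketch of calling rlassoEffect() directly, assuming the module's
dependencies (rlasso, the linear-model alias lm, cvec) are importable and X, y
are placeholder data:

# Column 0 of X is the treatment d; the remaining columns are controls
out = rlassoEffect(np.delete(X, 0, axis=1), y, X[:, 0],
                   method='partialling out')
out['alpha'], out['se'], out['pval']  # effect estimate and inference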
Code Example #7
def LassoShooting_fit(x,
                      y,
                      lmbda,
                      maxIter=1000,
                      optTol=10**(-5),
                      zeroThreshold=10**(-6),
                      XX=None,
                      Xy=None,
                      beta_start=None):
    """ Shooting LASSO algorithm with variable dependent penalty weights

    Inputs
    x: n by p NumPy array, RHS variables
    y: n by 1 NumPy array, outcome variable
    lmbda: p by 1 NumPy array, variable dependent penalty terms. The j-th
           element is the penalty term for the j-th RHS variable.
    maxIter: integer, maximum number of shooting LASSO updates
    optTol: scalar, algorithm terminated once the sum of absolute differences
            between the updated and current weights is below optTol
    zeroThreshold: scalar, if any final weights are below zeroThreshold, they
                   will be set to zero instead
    XX: p by p NumPy array, pre-calculated version of x'x
    Xy: p by 1 NumPy array, pre-calculated version of x'y
    beta_start: p by 1 NumPy array, initial weights

    Outputs
    w: p by 1 NumPy array, final weights
    wp: p by m + 1 NumPy array, where m is the number of iterations the
        algorithm took. History of weight updates, starting with the initial
        weights.
    m: integer, number of iterations the algorithm took
    """
    # Make sure that y and lmbda are proper column vectors
    y = cvec(y)
    lmbda = cvec(lmbda)

    # Get number of observations n and number of variables p
    n, p = x.shape

    # Check whether XX and Xy were provided, calculate them if not
    if XX is None:
        XX = x.T @ x
    if Xy is None:
        Xy = x.T @ y

    # Check whether initial weights were provided
    if beta_start is None:
        # If not, use init_values from help_functions, which will return
        # regression estimates for the five variables in x which are most
        # correlated with y, and initialize all other coefficients as zero
        beta = init_values(x, y, intercept=False)['coefficients']
    else:
        # Otherwise, use the provided initial weights
        beta = beta_start

    # Set up a history of weights over time, starting with the initial ones
    wp = beta

    # Keep track of the number of iterations
    m = 1

    # Create versions of XX and Xy which are just those matrices times two
    XX2 = XX * 2
    Xy2 = Xy * 2

    # Go through all iterations
    while m < maxIter:
        # Save the last set of weights (the .copy() is important, otherwise
        # beta_old will be updated every time beta is changed during the
        # following loop)
        beta_old = beta.copy()

        # Go through all parameters
        for j in np.arange(p):
            # Calculate the shoot
            S0 = XX2[j, :] @ beta - XX2[j, j] * beta[j, 0] - Xy2[j, 0]

            # Update the weights
            if np.isnan(S0).sum() >= 1:
                beta[j] = 0
            elif S0 > lmbda[j]:
                beta[j] = (lmbda[j] - S0) / XX2[j, j]
            elif S0 < -lmbda[j]:
                beta[j] = (-lmbda[j] - S0) / XX2[j, j]
            elif np.abs(S0) <= lmbda[j]:
                beta[j] = 0

        # Add the updated weights to the history of weights
        wp = np.concatenate([wp, beta], axis=1)

        # Check whether the weights are within tolerance
        if np.abs(beta - beta_old).sum() < optTol:
            # If so, break the while loop
            break

        # Increase the iteration counter
        m = m + 1

    # Set the final weights to the last updated weights
    w = beta

    # Set weights which are within zeroThreshold to zero
    w[np.abs(w) < zeroThreshold] = 0

    # Return the weights, history of weights, and iteration counter
    return {'coefficients': w, 'coef.list': wp, 'num.it': m}
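
A self-contained sketch (assumes the module's cvec() helper; beta_start is
passed explicitly so the init_values() helper is not needed):

import numpy as np

rng = np.random.default_rng(0)
x = rng.normal(size=(100, 10))
beta_true = np.zeros(shape=(10, 1))
beta_true[:2] = 1
y = x @ beta_true + rng.normal(size=(100, 1))

# Uniform penalty of 20 on every coefficient, starting from all zeros
fit = LassoShooting_fit(x, y, lmbda=np.full(shape=(10, 1), fill_value=20.0),
                        beta_start=np.zeros(shape=(10, 1)))
fit['coefficients'][:3]  # the first two weights should be clearly nonzero
fit['num.it']            # number of shooting iterations used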
Code Example #8
def lambdaCalculation(homoskedastic=False,
                      X_dependent_lambda=False,
                      lambda_start=None,
                      c=1.1,
                      gamma=0.1,
                      numSim=5000,
                      y=None,
                      x=None,
                      par=True,
                      corecap=np.inf,
                      fix_seed=True):
    # Get number of observations n and number of variables p
    n, p = x.shape

    # Get number of simulations to use (if simulations are necessary)
    R = numSim

    # Go through all possible combinations of homoskedasticity/
    # heteroskedasticity and X-dependent or independent error terms. The first
    # two cases are special cases: handling the case where homoskedastic was
    # set to None, and where lambda_start was provided.
    #
    # 1) If homoskedastic was set to None (special case)
    if homoskedastic is None:
        # Initialize lambda
        lmbda0 = lambda_start

        Ups0 = (1 / np.sqrt(n)) * np.sqrt((y**2).T @ (x**2)).T

        # Calculate the final vector of penalty terms
        lmbda = lmbda0 * Ups0

    # 2) If lambda_start was provided (special case)
    elif lambda_start is not None:
        # Check whether a homogeneous penalty term was provided (a scalar)
        if np.amax(cvec(lambda_start).shape) == 1:
            # If so, repeat that p times as the penalty term
            lmbda = np.ones(shape=(p, 1)) * lambda_start
        else:
            # Otherwise, use the provided vector of penalty terms as is
            lmbda = lambda_start

    # 3) Homoskedastic and X-independent
    elif (homoskedastic == True) and (X_dependent_lambda == False):
        # Initialize lambda
        lmbda0 = 2 * c * np.sqrt(n) * norm.ppf(1 - gamma / (2 * p))

        # Use ddof=1 to be consistent with R's var() function
        Ups0 = np.sqrt(np.var(y, axis=0, ddof=1))

        # Calculate the final vector of penalty terms
        lmbda = np.zeros(shape=(p, 1)) + lmbda0 * Ups0

    # 4) Homoskedastic and X-dependent
    elif (homoskedastic == True) and (X_dependent_lambda == True):
        psi = cvec((x**2).mean(axis=0))

        tXtpsi = (x.T / np.sqrt(psi)).T

        # Check whether to use parallel processing
        if par == True:
            # If so, get the number of cores to use
            cores = int(np.amin([mp.cpu_count(), corecap]))
        else:
            # Otherwise, use only one core (i.e. run sequentially)
            cores = 1

        # Get simulated distribution
        sim = jbl.Parallel(n_jobs=cores)(jbl.delayed(simul_pen)(
            n, p, tXtpsi, seed=l * 20, fix_seed=fix_seed)
                                         for l in np.arange(R))

        # Convert it to a proper column vector
        sim = cvec(sim)

        # Initialize lambda based on the simulated quantiles
        lmbda0 = c * np.quantile(sim, q=1 - gamma, axis=0)

        Ups0 = np.sqrt(np.var(y, axis=0, ddof=1))

        # Calculate the final vector of penalty terms
        lmbda = np.zeros(shape=(p, 1)) + lmbda0 * Ups0

    # 5) Heteroskedastic and X-independent
    elif (homoskedastic == False) and (X_dependent_lambda == False):
        # The original includes the comment, "1=num endogenous variables"
        lmbda0 = 2 * c * np.sqrt(n) * norm.ppf(1 - gamma / (2 * p * 1))

        Ups0 = (1 / np.sqrt(n)) * np.sqrt((y**2).T @ (x**2)).T

        lmbda = lmbda0 * Ups0

    # 6) Heteroskedastic and X-dependent
    elif (homoskedastic == False) and (X_dependent_lambda == True):
        eh = y

        ehat = eh @ np.ones(shape=(1, p))

        xehat = x * ehat

        psi = cvec((xehat**2).mean(axis=0)).T

        tXehattpsi = (xehat / (np.ones(shape=(n, 1)) @ np.sqrt(psi)))

        # Check whether to use parallel processing
        if par == True:
            # If so, get the number of cores to use
            cores = int(np.amin([mp.cpu_count(), corecap]))
        else:
            # Otherwise, use only one core (i.e. run sequentially)
            cores = 1

        # Get simulated distribution
        sim = jbl.Parallel(n_jobs=cores)(jbl.delayed(simul_pen)(
            n, p, tXehattpsi, seed=l * 20, fix_seed=fix_seed)
                                         for l in np.arange(R))

        # Convert it to a proper column vector
        sim = cvec(sim)

        # Initialize lambda based on the simulated quantiles
        lmbda0 = c * np.quantile(sim, q=1 - gamma, axis=0)

        Ups0 = (1 / np.sqrt(n)) * np.sqrt((y**2).T @ (x**2)).T

        # Calculate the final vector of penalty terms
        lmbda = lmbda0 * Ups0

    # Return results
    return {'lambda0': lmbda0, 'lambda': lmbda, 'Ups0': Ups0}
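
A sketch exercising case 5 (heteroskedastic, X-independent), which requires no
simulation; assumes scipy.stats.norm is in scope as in the module, and uses
made-up initial residuals:

import numpy as np

rng = np.random.default_rng(0)
X = rng.normal(size=(100, 5))
e0 = rng.normal(size=(100, 1))  # stand-in for initial residuals
pen = lambdaCalculation(homoskedastic=False, X_dependent_lambda=False,
                        y=e0, x=X, gamma=.1 / np.log(100))
pen['lambda'].shape  # (5, 1): one data-driven penalty level per regressor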
Code Example #9
    def __init__(self,
                 x,
                 y,
                 colnames=None,
                 post=True,
                 intercept=True,
                 model=True,
                 homoskedastic=False,
                 X_dependent_lambda=False,
                 lambda_start=None,
                 c=1.1,
                 gamma=None,
                 numSim=5000,
                 numIter=15,
                 tol=10**(-5),
                 threshold=-np.inf,
                 par=True,
                 corecap=np.inf,
                 fix_seed=True):
        # Initialize internal variables
        if isinstance(x, pd.DataFrame) and colnames is None:
            colnames = x.columns

        self.x = np.array(x).astype(np.float32)
        self.y = cvec(y).astype(np.float32)

        self.n, self.p = self.x.shape

        if colnames is None:
            self.colnames = ['V' + str(i + 1) for i in np.arange(self.p)]
        else:
            self.colnames = colnames

        # Unused line in the original code
        # ind_names = np.arange(self.p) + 1

        self.post = post
        self.intercept = intercept
        self.model = model
        self.homoskedastic = homoskedastic
        self.X_dependent_lambda = X_dependent_lambda
        self.lambda_start = lambda_start
        self.c = c

        if gamma is None:
            self.gamma = .1 / np.log(self.n)
        else:
            self.gamma = gamma

        self.numSim = numSim
        self.numIter = numIter
        self.tol = tol
        self.threshold = threshold

        self.par = par
        self.corecap = corecap
        self.fix_seed = fix_seed

        if (self.post == False) and (self.c is None):
            self.c = .5

        if ((self.post == False) and (self.homoskedastic == False)
                and (self.X_dependent_lambda == False)
                and (self.lambda_start is None) and (self.c == 1.1)
                and (self.gamma == .1 / np.log(self.n))):
            self.c = .5

        # For now, instantiate estimate as None
        self.est = None

        # Calculate robust LASSO coefficients
        if self.intercept == True:
            meanx = cvec(self.x.mean(axis=0))

            self.x = self.x - np.ones(shape=(self.n, 1)) @ meanx.T

            mu = self.y.mean()

            self.y = self.y - mu
        else:
            meanx = np.zeros(shape=(self.p, 1))

            mu = 0

        # Column-wise standard deviations (unused below)
        normx = np.sqrt(np.var(self.x, axis=0, ddof=1))

        Psi = cvec(np.mean(self.x**2, axis=0))

        ind = np.zeros(shape=(self.p, 1)).astype(bool)

        XX = self.x.T @ self.x

        Xy = self.x.T @ self.y

        startingval = init_values(self.x, self.y)['residuals']

        pen = lambdaCalculation(homoskedastic=self.homoskedastic,
                                X_dependent_lambda=self.X_dependent_lambda,
                                lambda_start=self.lambda_start,
                                c=self.c,
                                gamma=self.gamma,
                                numSim=self.numSim,
                                y=startingval,
                                x=self.x,
                                par=self.par,
                                corecap=self.corecap,
                                fix_seed=self.fix_seed)

        lmbda = pen['lambda']
        Ups0 = Ups1 = pen['Ups0']
        lmbda0 = pen['lambda0']

        mm = 1
        s0 = np.sqrt(np.var(y, axis=0, ddof=1))

        while mm <= self.numIter:
            if (mm == 1) and self.post:
                coefTemp = (LassoShooting_fit(self.x,
                                              self.y,
                                              lmbda / 2,
                                              XX=XX,
                                              Xy=Xy)['coefficients'])
            else:
                coefTemp = (LassoShooting_fit(self.x,
                                              self.y,
                                              lmbda,
                                              XX=XX,
                                              Xy=Xy)['coefficients'])

            coefTemp[np.isnan(coefTemp)] = 0

            ind1 = (np.abs(coefTemp) > 0)

            x1 = self.x[:, ind1[:, 0]]

            if x1.shape[1] == 0:
                if self.intercept:
                    intercept_value = np.mean(self.y + mu)

                    coef = np.zeros(shape=(self.p + 1, 1))

                    coef = (pd.DataFrame(coef,
                                         index=['(Intercept)'] +
                                         list(self.colnames)))
                else:
                    intercept_value = np.mean(self.y)

                    coef = np.zeros(shape=(self.p, 1))

                    coef = pd.DataFrame(coef, index=self.colnames)

                self.est = {
                    'coefficients':
                    coef,
                    'beta':
                    np.zeros(shape=(self.p, 1)),
                    'intercept':
                    intercept_value,
                    'index':
                    pd.DataFrame(np.zeros(shape=(self.p, 1)).astype(bool),
                                 index=self.colnames),
                    'lambda':
                    lmbda,
                    'lambda0':
                    lmbda0,
                    'loadings':
                    Ups0,
                    'residuals':
                    self.y - np.mean(self.y),
                    'sigma':
                    np.var(self.y, axis=0, ddof=1),
                    'iter':
                    mm,
                    #'call': Not a Python option
                    'options': {
                        'post': self.post,
                        'intercept': self.intercept,
                        'ind.scale': ind,
                        'mu': mu,
                        'meanx': meanx
                    }
                }

                if self.model:
                    self.est['model'] = self.x
                else:
                    self.est['model'] = None

                self.est['tss'] = self.est['rss'] = (((
                    self.y - np.mean(self.y))**2).sum())

                self.est['dev'] = self.y - np.mean(self.y)
                # In R, return() breaks while loops
                return

            # Refinement variance estimation
            if self.post:
                reg = lm(fit_intercept=False).fit(x1, self.y)

                coefT = reg.coef_.T

                coefT[np.isnan(coefT)] = 0

                e1 = self.y - x1 @ coefT

                coefTemp[ind1[:, 0]] = coefT
            else:
                e1 = self.y - x1 @ coefTemp[ind1[:, 0]]

            s1 = np.sqrt(np.var(e1, ddof=1))

            # Homoskedastic and X-independent
            if ((self.homoskedastic == True)
                    and (self.X_dependent_lambda == False)):
                Ups1 = s1 * Psi

                lmbda = pen['lambda0'] * Ups1

            # Homoskedastic and X-dependent
            elif ((self.homoskedastic == True)
                  and (self.X_dependent_lambda == True)):
                Ups1 = s1 * Psi

                lmbda = pen['lambda0'] * Ups1

            # Heteroskedastic and X-independent
            elif ((self.homoskedastic == False)
                  and (self.X_dependent_lambda == False)):
                Ups1 = ((1 / np.sqrt(self.n)) * np.sqrt(
                    (e1**2).T @ self.x**2).T)

                lmbda = pen['lambda0'] * Ups1

            # Heteroskedastic and X-dependent
            elif ((self.homoskedastic == False)
                  and (self.X_dependent_lambda == True)):
                lc = lambdaCalculation(
                    homoskedastic=self.homoskedastic,
                    X_dependent_lambda=self.X_dependent_lambda,
                    lambda_start=self.lambda_start,
                    c=self.c,
                    gamma=self.gamma,
                    numSim=self.numSim,
                    y=e1,
                    x=self.x,
                    par=self.par,
                    corecap=self.corecap,
                    fix_seed=self.fix_seed)

                Ups1 = lc['Ups0']

                lmbda = lc['lambda']

            # If homoskedastic is set to None
            elif self.homoskedastic is None:
                Ups1 = ((1 / np.sqrt(self.n)) * np.sqrt(
                    (e1**2).T @ self.x**2).T)

                lmbda = pen['lambda0'] * Ups1

            mm = mm + 1

            if np.abs(s0 - s1) < self.tol:
                break

            s0 = s1

        if x1.shape[1] == 0:
            #coefTemp = None
            ind1 = np.zeros(shape=(self.p, 1))

        coefTemp = cvec(coefTemp)

        coefTemp[np.abs(coefTemp) < self.threshold] = 0

        coefTemp = pd.DataFrame(coefTemp, index=self.colnames)

        ind1 = cvec(ind1)

        ind1 = pd.DataFrame(ind1, index=self.colnames)

        if self.intercept:
            if mu is None:
                mu = 0
            if meanx is None:
                meanx = np.zeros(shape=(coefTemp.shape[0], 1))

            # Both branches of the original ind.sum() check computed the same
            # expression, so the check is dropped here
            intercept_value = mu - (meanx * coefTemp).sum()
        else:
            intercept_value = np.nan

        if self.intercept:
            beta = (np.concatenate([cvec(intercept_value), coefTemp.values],
                                   axis=0))

            beta = pd.DataFrame(beta,
                                index=['(Intercept)'] + list(self.colnames))
        else:
            beta = coefTemp

        s1 = np.sqrt(np.var(e1, ddof=1))

        self.est = {
            'coefficients': beta,
            'beta': pd.DataFrame(coefTemp, index=self.colnames),
            'intercept': intercept_value,
            'index': ind1,
            'lambda': pd.DataFrame(lmbda, index=self.colnames),
            'lambda0': lmbda0,
            'loadings': Ups1,
            'residuals': cvec(e1),
            'sigma': s1,
            'iter': mm,
            #'call': Not a Python option
            'options': {
                'post': self.post,
                'intercept': self.intercept,
                'ind.scale': ind,
                'mu': mu,
                'meanx': meanx
            },
            'model': model
        }

        if model:
            self.x = self.x + np.ones(shape=(self.n, 1)) @ meanx.T

            self.est['model'] = self.x
        else:
            self.est['model'] = None

        self.est['tss'] = ((self.y - np.mean(self.y))**2).sum()
        self.est['rss'] = (self.est['residuals']**2).sum()
        self.est['dev'] = self.y - np.mean(self.y)
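
Finally, a hedged usage sketch for this estimator, the rlasso() referenced in
Code Example #6 (X and y are placeholder data):

# fit = rlasso(X, y, post=True)        # post-lasso OLS refit of selected set
# fit.est['coefficients']              # includes an '(Intercept)' row
# fit.est['index']                     # boolean: which columns were selected
# fit.est['lambda'], fit.est['sigma']  # final penalties and residual s.d.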