def simul_ci(k=1, Omega=None, var=None, seed=0, fix_seed=True, verbose=False):
    """ Simulates one draw of the statistic used for joint confidence
    intervals: the maximum absolute element of beta / sqrt(var), where
    beta ~ N(0, Omega)
    """
    # Set up default covariance matrix and variances if they were not provided
    if Omega is None:
        Omega = np.identity(k)
    else:
        k = Omega.shape[0]
    if var is None:
        var = np.diag(Omega)

    try:
        if fix_seed:
            # This is a key difference between the R and Python
            # implementations. For some data sets, especially when k > n,
            # scipy.stats.multivariate_normal() will raise an error, claiming
            # that Omega is singular. R's MASS::mvrnorm(), on the other hand,
            # will happily use Omega and calculate draws from it. I had to add
            # allow_singular to get both implementations to work similarly.
            beta = multivariate_normal(
                cov=Omega, allow_singular=True).rvs(random_state=seed)
        else:
            beta = multivariate_normal(cov=Omega, allow_singular=True).rvs()

        # Maximum absolute standardized element of the draw
        sim = np.amax(np.abs(cvec(beta) / cvec(np.sqrt(var))))
    except Exception as e:
        if verbose:
            print('Error encountered in simul_ci():')
            print(e)
            print()
        sim = np.nan

    return sim
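# Illustrative usage sketch (not part of the library), assuming numpy has been
# imported as np: simul_ci() draws beta ~ N(0, Omega) once and returns
# max_j |beta_j| / sqrt(var_j); confint() below uses simulated quantiles of
# this statistic as critical values for joint confidence intervals.
#
#     Omega = np.array([[1, .5], [.5, 1]])
#     draws = [simul_ci(Omega=Omega, seed=i * 20) for i in range(1000)]
#     crit = np.quantile(draws, .95)  # simulated 95% critical value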
def get_cov(X, e, add_intercept=True, homoskedastic=False):
    """ Calculates OLS variance estimator based on X and residuals

    Inputs
    X: n by k matrix, RHS variables
    e: n by 1 vector or vector-like, residuals from an OLS regression
    add_intercept: Boolean, if True, adds an intercept as the first column of
                   X (and increases k by one)
    homoskedastic: Boolean, if True, returns the homoskedastic variance
                   estimator; otherwise, returns a heteroskedasticity-robust
                   sandwich estimator

    Outputs
    V_hat: k by k NumPy array, estimated covariance matrix
    """
    # Get the number of observations n and parameters k
    n, k = X.shape

    # Check whether an intercept needs to be added
    if add_intercept:
        # If so, add the intercept
        X = np.concatenate([np.ones(shape=(n, 1)), X], axis=1)

        # Don't forget to increase k
        k = k + 1

    # Make sure the residuals are a proper column vector
    e = cvec(e)

    # Calculate X'X
    XX = X.T @ X

    # Calculate its inverse
    XXinv = linalg.inv(XX)

    # Check whether to use homoskedastic errors
    if homoskedastic:
        # If so, calculate the homoskedastic variance estimator
        V_hat = (1 / (n - k)) * XXinv * (e.T @ e)
    else:
        # Otherwise, calculate an intermediate object
        S = (e @ np.ones(shape=(1, k))) * X

        # Then, get the sandwich estimator (HC0 scaled by n / (n - k), i.e.
        # with the HC1 degrees of freedom adjustment)
        V_hat = (n / (n - k)) * XXinv @ (S.T @ S) @ XXinv

    # Return the result
    return V_hat
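# Illustrative usage sketch (not part of the library), assuming numpy has been
# imported as np:
#
#     X = np.random.normal(size=(100, 2))
#     y = X @ np.array([[1], [2]]) + np.random.normal(size=(100, 1))
#     e = y - X @ np.array([[1], [2]])  # residuals (true betas, for brevity)
#     V_hat = get_cov(X, e)             # 3 by 3, intercept added as column 1
#     se = np.sqrt(np.diag(V_hat))      # standard errors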
def rlassoEffect_wrapper(i, x, y, d, method='double selection', I3=None,
                         post=True, colnames_d=None, colnames_x=None,
                         intercept=True, model=True, homoskedastic=False,
                         X_dependent_lambda=False, lambda_start=None, c=1.1,
                         gamma=None, numSim=5000, numIter=15, tol=10**(-5),
                         threshold=-np.inf, par=True, corecap=np.inf,
                         fix_seed=True, verbose=False):
    """ Wrapper for rlassoEffect()

    Inputs
    i: Integer, index of the current variable of interest
    See the rlassoEffect() documentation for the other inputs

    Output
    res: Dictionary, contains a collection of results from rlassoEffect(), or
         a collection of NANs and empty lists if an error is encountered while
         running rlassoEffect()
    """
    # Make sure the data are proper column vectors where needed
    if np.amin(x.shape) == 1:
        x = cvec(x)
    y = cvec(y)
    d = cvec(d)

    try:
        col = rlassoEffect(x, y, d, method=method, I3=I3, post=post,
                           colnames_d=colnames_d, colnames_x=colnames_x,
                           intercept=intercept, model=model,
                           homoskedastic=homoskedastic,
                           X_dependent_lambda=X_dependent_lambda,
                           lambda_start=lambda_start, c=c, gamma=gamma,
                           numSim=numSim, numIter=numIter, tol=tol,
                           threshold=threshold, par=par, corecap=corecap,
                           fix_seed=fix_seed)

        # Build the selection matrix column for variable i; the i-th element
        # (the variable of interest itself) remains NAN
        smat = np.zeros(shape=(x.shape[1] + 1, 1)) * np.nan
        smat[np.arange(smat.shape[0]) != i] = col['selection_index']

        res = {
            'coefficients': [i, col['alpha']],
            'se': [i, col['se'][0]],
            't': [i, col['t'][0]],
            'pval': [i, col['pval'][0]],
            'lasso_regs': {i: col},
            'reside': [i, col['residuals']['epsilon']],
            'residv': [i, col['residuals']['v']],
            'coef_mat': {i: col['coefficients_reg']},
            'selection_matrix': [i, smat]
        }
    except Exception as e:
        # Mimic the results in the original code, where any errors result in a
        # variable being skipped, and the preallocated results arrays contain
        # either NANs or empty lists
        res = {
            'coefficients': [i, np.nan],
            'se': [i, np.nan],
            't': [i, np.nan],
            'pval': [i, np.nan],
            'lasso_regs': {i: e},
            'reside': [i, np.zeros(shape=(x.shape[0], 1)) * np.nan],
            'residv': [i, np.zeros(shape=(x.shape[0], 1)) * np.nan],
            'coef_mat': {i: []},
            'selection_matrix': [i,
                                 np.zeros(shape=(x.shape[1] + 1, 1)) * np.nan]
        }

        if verbose:
            print('Error encountered in rlassoEffect_wrapper():')
            print(e)
            print()

    return res
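# Illustrative usage sketch (not part of the library): this wrapper exists so
# the estimator class below can farm out one rlassoEffect() fit per variable
# of interest via joblib, with errors for individual variables turning into
# NAN entries instead of aborting the whole run. A minimal sequential version
# of that pattern, assuming numpy imported as np:
#
#     res = [rlassoEffect_wrapper(i, x=np.delete(X, i, axis=1), y=y,
#                                 d=X[:, i])
#            for i in range(X.shape[1])]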
def confint(self, parm=None, B=500, level=.95, joint=False, par=None,
            corecap=None, fix_seed=None, verbose=None):
    """ Calculates confidence intervals for the estimated coefficients

    Inputs
    parm: Indices or names of the coefficients of interest; if None, uses all
    B: Integer, number of simulations to use for joint intervals
    level: Scalar in (0, 1), confidence level
    joint: Boolean, if True, calculates simulation-based joint intervals
    par, corecap, fix_seed, verbose: Overrides for the corresponding
                                     attributes set in __init__(); if None,
                                     the attribute is used

    Output
    self.ci: pandas DataFrame, lower and upper confidence bounds
    """
    self.B = B

    # Use attributes set in __init__() for any unspecified options
    if par is None:
        par = self.par_any
    if corecap is None:
        corecap = self.corecap
    if fix_seed is None:
        fix_seed = self.fix_seed
    if verbose is None:
        verbose = self.verbose

    # Kept for parity with the original code; the attributes self.n and
    # self.k are what is actually used below
    n = self.res['samplesize']
    k = p1 = len(self.res['coefficients'])

    cf = self.res['coefficients']
    pnames = cf.index.values

    self.parm = parm
    self.level = level
    self.joint = joint

    if self.parm is None:
        self.parm = pnames
    elif np.issubdtype(self.parm, np.number):
        self.parm = pnames[parm]

    if not self.joint:
        # Marginal (pointwise) intervals based on the normal approximation
        a = (1 - self.level) / 2
        a = cvec([a, 1 - a])
        fac = norm.ppf(a)
        pct = [str(np.round(x * 100, 3)) + ' %' for x in a[:, 0]]
        ses = self.res['se'].loc[self.parm, :]
        self.ci = cf.loc[self.parm, :] @ np.ones(shape=(1, 2)) + ses @ fac.T
        self.ci.columns = pct
        return self.ci
    else:
        if verbose:
            print('\nCaution: Joint confidence intervals for hdmpy are',
                  'currently different from those of the original R',
                  'package hdm. This is a known bug.')

        # Estimate the covariance matrix of the moment conditions
        e = self.res['residuals']['e'].values
        v = self.res['residuals']['v'].values
        ev = e * v
        Ev2 = np.mean(v**2, axis=0)
        Omegahat = np.zeros(shape=(self.k, self.k)) * np.nan
        for j in np.arange(self.k):
            for l in np.arange(start=j, stop=self.k):
                Omegahat[j, l] = Omegahat[l, j] = (
                    1 / (Ev2[j] * Ev2[l]) * np.mean(ev[:, j] * ev[:, l]))
        var = np.diag(Omegahat)

        # Check whether to use parallel processing
        if par:
            # If so, get the number of cores to use
            cores = int(np.amin([mp.cpu_count(), corecap]))
        else:
            # Otherwise, use only one core (i.e. run sequentially)
            cores = 1

        # Simulate the distribution of the maximum absolute statistic
        sim = jbl.Parallel(n_jobs=cores)(
            jbl.delayed(simul_ci)(Omega=Omegahat / self.n, var=var,
                                  seed=i * 20, fix_seed=fix_seed,
                                  verbose=verbose)
            for i in np.arange(self.B))
        sim = cvec(sim)

        a = 1 - self.level
        ab = cvec([a / 2, 1 - a / 2])
        pct = [str(np.round(x * 100, 3)) + ' %' for x in ab[:, 0]]
        var = pd.DataFrame(var, index=self.parm)

        # Use the simulated quantile as the joint critical value
        hatc = np.quantile(sim, q=1 - a)
        ci1 = cf.loc[self.parm, :] - hatc * np.sqrt(var.loc[self.parm, :])
        ci2 = cf.loc[self.parm, :] + hatc * np.sqrt(var.loc[self.parm, :])
        self.ci = pd.concat([ci1.iloc[:, 0], ci2.iloc[:, 0]], axis=1)
        self.ci.columns = pct
        return self.ci
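# Illustrative usage sketch (not part of the library), assuming m is a fitted
# instance of the estimator class these methods belong to:
#
#     ci_marginal = m.confint(level=.95)           # pointwise intervals
#     ci_joint = m.confint(level=.95, joint=True)  # simulation-based joint
#                                                  # intervals based on B draws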
def __init__(self, x, y, index=None, method='partialling out', I3=None,
             post=True, colnames=None, intercept=True, model=True,
             homoskedastic=False, X_dependent_lambda=False,
             lambda_start=None, c=1.1, gamma=None, numSim=5000, numIter=15,
             tol=10**(-5), threshold=-np.inf, par_outer=True, par_inner=False,
             par_any=True, corecap=np.inf, fix_seed=True, verbose=False):
    # Initialize internal variables
    if isinstance(x, pd.DataFrame) and colnames is None:
        colnames = x.columns

    self.x = np.array(x).astype(np.float32)
    self.y = cvec(y).astype(np.float32)

    if index is None:
        self.index = cvec(np.arange(self.x.shape[1]))
    else:
        self.index = cvec(index)

    self.method = method
    self.I3 = I3
    self.post = post
    self.colnames = colnames

    if self.index.dtype == bool:
        self.k = self.p1 = self.index.sum()
    else:
        self.k = self.p1 = len(self.index)

    # Number of observations (rows of x)
    self.n = x.shape[0]

    self.intercept = intercept
    self.model = model
    self.homoskedastic = homoskedastic
    self.X_dependent_lambda = X_dependent_lambda
    self.lambda_start = lambda_start
    self.c = c
    self.gamma = gamma
    self.numSim = numSim
    self.numIter = numIter
    self.tol = tol
    self.threshold = threshold
    self.par_outer = par_outer
    self.par_inner = par_inner
    self.par_any = par_any
    self.corecap = corecap
    self.fix_seed = fix_seed

    if not self.par_any:
        self.par_outer = self.par_inner = False
    elif self.par_outer and self.par_inner:
        # Avoid nested parallelism: if both levels were requested, only
        # parallelize the inner loop
        self.par_outer = False

    self.verbose = verbose

    # Initialize internal variables used in other functions
    self.B = None
    self.parm = None
    self.level = None
    self.joint = None

    # Preprocess the index vector
    if np.issubdtype(self.index.dtype, np.number):
        # Numeric index: make sure it is integer-valued and within range
        self.index = self.index.astype(int)
        if not (np.all(self.index[:, 0] < self.x.shape[1])
                and (len(self.index) <= self.x.shape[1])):
            raise ValueError('Numeric index includes elements which are '
                             'outside of the column range of x, or the '
                             'indexing vector is too long')
    elif self.index.dtype == bool:
        # Boolean index: convert to a numeric index
        if not (len(self.index) <= self.x.shape[1]):
            raise ValueError('Boolean index vector is too long')
        self.index = cvec([i for i, b in enumerate(self.index[:, 0]) if b])
    elif np.issubdtype(self.index.dtype, np.str_):
        # String index: map column names to positions; this requires x to
        # have come with column names (e.g. as a pandas DataFrame), which
        # were captured in colnames above
        if (self.colnames is None
                or not np.all([s in list(self.colnames)
                               for s in self.index[:, 0]])):
            raise ValueError('String index specifies column names which '
                             'are not in the column names of x')
        self.index = cvec([list(self.colnames).index(s)
                           for s in self.index[:, 0]])
    else:
        raise ValueError('Argument index has an invalid type')

    if (self.method == 'double selection') and (self.I3 is not None):
        I3ind = cvec([i for i, b in enumerate(self.I3) if b])
        if I3ind.size > 0:
            if len([x for x in I3ind[:, 0] if x in self.index[:, 0]]) > 0:
                raise ValueError('I3 and index must not overlap!')

    if self.colnames is None:
        self.colnames = ['V' + str(i + 1) for i in range(self.x.shape[1])]

    # Check whether to use parallel processing
    if self.par_outer:
        # If so, get the number of cores to use
        cores = int(np.amin([mp.cpu_count(), self.corecap]))
    else:
        # Otherwise, use only one core (i.e. run sequentially)
        cores = 1

    # Run one rlassoEffect() per variable of interest; when I3 was provided,
    # drop the element belonging to the current variable, just as the
    # corresponding column is dropped from x
    res = jbl.Parallel(n_jobs=cores)(
        jbl.delayed(rlassoEffect_wrapper)(
            i, x=np.delete(self.x, i, axis=1), y=self.y, d=self.x[:, i],
            method=self.method,
            I3=(np.delete(self.I3, i, axis=0)
                if self.I3 is not None else None),
            post=self.post, colnames_d=self.colnames[i],
            colnames_x=[c for j, c in enumerate(self.colnames) if j != i],
            intercept=self.intercept, model=self.model,
            homoskedastic=self.homoskedastic,
            X_dependent_lambda=self.X_dependent_lambda,
            lambda_start=self.lambda_start, c=self.c, gamma=self.gamma,
            numSim=self.numSim, numIter=self.numIter, tol=self.tol,
            threshold=self.threshold, par=self.par_inner,
            corecap=self.corecap, fix_seed=self.fix_seed,
            verbose=self.verbose)
        for i in self.index[:, 0])

    # Convert the collection of parallel results into usable results sorted
    # by their index
    coefficients = np.array([r['coefficients'] for r in res])
    coefficients = cvec(coefficients[coefficients[:, 0].argsort(), 1])

    se = np.array([r['se'] for r in res])
    se = cvec(se[se[:, 0].argsort(), 1])

    t = np.array([r['t'] for r in res])
    t = cvec(t[t[:, 0].argsort(), 1])

    pval = np.array([r['pval'] for r in res])
    pval = cvec(pval[pval[:, 0].argsort(), 1])

    lasso_regs = {}
    for r in res:
        lasso_regs.update(r['lasso_regs'])

    reside = np.array([
        np.concatenate([cvec(r['reside'][0]), r['reside'][1]], axis=0)[:, 0]
        for r in res])
    reside = reside[reside[:, 0].argsort(), 1:].T

    residv = np.array([
        np.concatenate([cvec(r['residv'][0]), r['residv'][1]], axis=0)[:, 0]
        for r in res])
    residv = residv[residv[:, 0].argsort(), 1:].T

    coef_mat = {}
    for r in res:
        coef_mat.update(r['coef_mat'])

    # Concatenating the rows one by one (instead of building a single
    # np.array()) ensures this always results in a two dimensional array
    selection_matrix = [
        np.concatenate([cvec(r['selection_matrix'][0]),
                        r['selection_matrix'][1]], axis=0).T
        for r in res]
    selection_matrix = np.concatenate(selection_matrix, axis=0)
    selection_matrix = selection_matrix[selection_matrix[:, 0].argsort(), 1:]

    # Names for the results objects
    idx = [self.colnames[i] for i in self.index[:, 0]]

    residuals = {
        'e': pd.DataFrame(reside, columns=idx),
        'v': pd.DataFrame(residv, columns=idx)
    }

    self.res = {
        'coefficients': pd.DataFrame(coefficients, index=idx),
        'se': pd.DataFrame(se, index=idx),
        't': pd.DataFrame(t, index=idx),
        'pval': pd.DataFrame(pval, index=idx),
        'lasso_regs': lasso_regs,
        'index': pd.DataFrame(self.index, index=idx),
        # The R original also stores call = match.call(), which has no
        # direct Python equivalent
        'samplesize': self.n,
        'residuals': residuals,
        'coef_mat': coef_mat,
        'selection_matrix': pd.DataFrame(selection_matrix, index=idx,
                                         columns=list(self.colnames))
    }
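# Illustrative sketch of the results layout assembled above (not part of the
# library; assumes the enclosing class is rlassoEffects, whose __init__ this
# is):
#
#     m = rlassoEffects(X, y, index=[0, 2])
#     m.res['coefficients']       # point estimates, one row per target
#     m.res['se'], m.res['pval']  # standard errors and p-values
#     m.res['selection_matrix']   # controls selected in each variable's lasso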
def rlassoEffect(x, y, d, method='double selection', I3=None, post=True,
                 colnames_d=None, colnames_x=None, intercept=True, model=True,
                 homoskedastic=False, X_dependent_lambda=False,
                 lambda_start=None, c=1.1, gamma=None, numSim=5000,
                 numIter=15, tol=10**(-5), threshold=-np.inf, par=True,
                 corecap=np.inf, fix_seed=True):
    d = cvec(d)
    y = cvec(y)

    n, kx = x.shape

    if colnames_d is None:
        colnames_d = ['d1']
    if (colnames_x is None) and (x is not None):
        colnames_x = ['x' + str(i) for i in np.arange(kx)]

    if method == 'double selection':
        # Lasso of d on x and of y on x, keeping the selection indices
        I1 = rlasso(x, d, post=post, colnames=colnames_x,
                    intercept=intercept, model=model,
                    homoskedastic=homoskedastic,
                    X_dependent_lambda=X_dependent_lambda,
                    lambda_start=lambda_start, c=c, gamma=gamma,
                    numSim=numSim, numIter=numIter, tol=tol,
                    threshold=threshold, par=par, corecap=corecap,
                    fix_seed=fix_seed).est['index']
        I2 = rlasso(x, y, post=post, colnames=colnames_x,
                    intercept=intercept, model=model,
                    homoskedastic=homoskedastic,
                    X_dependent_lambda=X_dependent_lambda,
                    lambda_start=lambda_start, c=c, gamma=gamma,
                    numSim=numSim, numIter=numIter, tol=tol,
                    threshold=threshold, par=par, corecap=corecap,
                    fix_seed=fix_seed).est['index']

        # The original code checks whether type(I3) is bool, but I believe
        # they only do that to see whether it has been defined by the user
        if I3 is not None:
            I3 = cvec(I3)
            I = cvec(I1.astype(bool) | I2.astype(bool) | I3.astype(bool))
        else:
            I = cvec(I1.astype(bool) | I2.astype(bool))
        # Missing here: names(I) <- union(names(I1), names(I2))

        if I.sum() == 0:
            I = None

        # Post-lasso OLS of y on d and the union of selected controls (only
        # d if nothing was selected)
        if I is not None:
            x = np.concatenate([d, x[:, I[:, 0]]], axis=1)
        else:
            x = d
        reg1 = lm(fit_intercept=True).fit(x, y)
        alpha = reg1.coef_[0, 0]
        names_alpha = colnames_d

        resid = y - cvec(reg1.predict(x))
        if I is None:
            xi = resid * np.sqrt(n / (n - 1))
        else:
            xi = resid * np.sqrt(n / (n - I.sum() - 1))

        if I is None:
            # Fit an intercept-only model
            reg2 = lm(fit_intercept=False).fit(np.ones_like(d), d)
            v = d - cvec(reg2.predict(np.ones_like(d)))
        else:
            reg2 = lm(fit_intercept=True).fit(x[:, 1:], d)
            v = d - cvec(reg2.predict(x[:, 1:]))

        # Heteroskedasticity-robust variance of alpha
        var = ((1 / n) * (1 / np.mean(v**2, axis=0))
               * np.mean((v**2) * (xi**2), axis=0)
               * (1 / np.mean(v**2, axis=0)))
        se = np.sqrt(var)
        tval = alpha / np.sqrt(var)
        pval = 2 * norm.cdf(-np.abs(tval))

        if I is None:
            no_selected = 1
        else:
            no_selected = 0

        res = {'epsilon': xi, 'v': v}

        if np.issubdtype(type(colnames_d), np.str_):
            colnames_d = [colnames_d]

        results = {
            'alpha': alpha,
            'se': se,
            't': tval,
            'pval': pval,
            'no_selected': no_selected,
            'coefficients': alpha,
            'coefficient': alpha,
            'coefficients_reg': reg1.coef_,
            'selection_index': I,
            'residuals': res,
            # The R original also stores call = match.call()
            'samplesize': n
        }
    elif method == 'partialling out':
        # Lasso of y on x, keeping the residuals
        reg1 = rlasso(x, y, post=post, colnames=colnames_x,
                      intercept=intercept, model=model,
                      homoskedastic=homoskedastic,
                      X_dependent_lambda=X_dependent_lambda,
                      lambda_start=lambda_start, c=c, gamma=gamma,
                      numSim=numSim, numIter=numIter, tol=tol,
                      threshold=threshold, par=par, corecap=corecap,
                      fix_seed=fix_seed)
        yr = reg1.est['residuals']

        # Lasso of d on x, keeping the residuals
        reg2 = rlasso(x, d, post=post, colnames=colnames_x,
                      intercept=intercept, model=model,
                      homoskedastic=homoskedastic,
                      X_dependent_lambda=X_dependent_lambda,
                      lambda_start=lambda_start, c=c, gamma=gamma,
                      numSim=numSim, numIter=numIter, tol=tol,
                      threshold=threshold, par=par, corecap=corecap,
                      fix_seed=fix_seed)
        dr = reg2.est['residuals']

        # OLS of the y residuals on the d residuals
        reg3 = lm(fit_intercept=True).fit(dr, yr)
        alpha = reg3.coef_[0, 0]
        resid = yr - cvec(reg3.predict(dr))

        # This is a difference to the original code. The original code uses
        # var <- vcov(reg3)[2, 2], which is the homoskedastic covariance
        # estimator for OLS. I wrote get_cov() to calculate that, because the
        # linear regression implementation in sklearn does not include
        # standard error calculations. (I could have switched to statsmodels
        # instead, but sklearn seems more likely to be maintained in the
        # future.) I then added the option to get_cov() to calculate
        # heteroskedastic standard errors. I believe that if the penalty term
        # is adjusted for heteroskedasticity, heteroskedastic standard errors
        # should also be used here, to be internally consistent.
        var = np.array([get_cov(dr, resid,
                                homoskedastic=homoskedastic)[1, 1]])
        se = np.sqrt(var)
        tval = alpha / np.sqrt(var)
        pval = 2 * norm.cdf(-np.abs(tval))

        res = {'epsilon': resid, 'v': dr}

        I1 = reg1.est['index']
        I2 = reg2.est['index']
        I = cvec(I1.astype(bool) | I2.astype(bool))
        # Missing here: names(I) <- union(names(I1), names(I2))

        results = {
            'alpha': alpha,
            'se': se,
            't': tval,
            'pval': pval,
            'coefficients': alpha,
            'coefficient': alpha,
            'coefficients_reg': reg1.est['coefficients'],
            'selection_index': I,
            'residuals': res,
            # The R original also stores call = match.call()
            'samplesize': n
        }
    else:
        raise ValueError("Argument method must be 'double selection' or "
                         "'partialling out'")

    return results
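# Illustrative usage sketch (not part of the library): estimate the effect of
# a single treatment d on an outcome y with high-dimensional controls x:
#
#     fit = rlassoEffect(x, y, d, method='partialling out')
#     fit['alpha'], fit['se'], fit['pval']  # effect, standard error, p-value
#     fit['selection_index']                # controls selected by the lassos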
def LassoShooting_fit(x, y, lmbda, maxIter=1000, optTol=10**(-5),
                      zeroThreshold=10**(-6), XX=None, Xy=None,
                      beta_start=None):
    """ Shooting LASSO algorithm with variable dependent penalty weights

    Inputs
    x: n by p NumPy array, RHS variables
    y: n by 1 NumPy array, outcome variable
    lmbda: p by 1 NumPy array, variable dependent penalty terms. The j-th
           element is the penalty term for the j-th RHS variable.
    maxIter: Integer, maximum number of shooting LASSO updates
    optTol: Scalar, the algorithm terminates once the sum of absolute
            differences between the updated and current weights is below
            optTol
    zeroThreshold: Scalar, any final weights below zeroThreshold will be set
                   to zero instead
    XX: p by p NumPy array, pre-calculated version of x'x
    Xy: p by 1 NumPy array, pre-calculated version of x'y
    beta_start: p by 1 NumPy array, initial weights

    Outputs
    w: p by 1 NumPy array, final weights
    wp: p by m + 1 NumPy array, where m is the number of iterations the
        algorithm took. History of weight updates, starting with the initial
        weights.
    m: Integer, number of iterations the algorithm took
    """
    # Make sure that y and lmbda are proper column vectors
    y = cvec(y)
    lmbda = cvec(lmbda)

    # Get number of observations n and number of variables p
    n, p = x.shape

    # Check whether XX and Xy were provided, calculate them if not
    if XX is None:
        XX = x.T @ x
    if Xy is None:
        Xy = x.T @ y

    # Check whether an initial value for the intercept was provided
    if beta_start is None:
        # If not, use init_values from help_functions, which will return
        # regression estimates for the five variables in x which are most
        # correlated with y, and initialize all other coefficients as zero
        beta = init_values(x, y, intercept=False)['coefficients']
    else:
        # Otherwise, use the provided initial weights
        beta = beta_start

    # Set up a history of weights over time, starting with the initial ones
    wp = beta

    # Keep track of the number of iterations
    m = 1

    # Create versions of XX and Xy which are just those matrices times two
    XX2 = XX * 2
    Xy2 = Xy * 2

    # Go through all iterations
    while m < maxIter:
        # Save the last set of weights (the .copy() is important, otherwise
        # beta_old will be updated every time beta is changed during the
        # following loop)
        beta_old = beta.copy()

        # Go through all parameters
        for j in np.arange(p):
            # Calculate the shoot
            S0 = XX2[j, :] @ beta - XX2[j, j] * beta[j, 0] - Xy2[j, 0]

            # Update the weights
            if np.isnan(S0).sum() >= 1:
                beta[j] = 0
            elif S0 > lmbda[j]:
                beta[j] = (lmbda[j] - S0) / XX2[j, j]
            elif S0 < -lmbda[j]:
                beta[j] = (-lmbda[j] - S0) / XX2[j, j]
            elif np.abs(S0) <= lmbda[j]:
                beta[j] = 0

        # Add the updated weights to the history of weights
        wp = np.concatenate([wp, beta], axis=1)

        # Check whether the weights are within tolerance
        if np.abs(beta - beta_old).sum() < optTol:
            # If so, break the while loop
            break

        # Increase the iteration counter
        m = m + 1

    # Set the final weights to the last updated weights
    w = beta

    # Set weights which are within zeroThreshold to zero
    w[np.abs(w) < zeroThreshold] = 0

    # Return the weights, history of weights, and iteration counter
    return {'coefficients': w, 'coef.list': wp, 'num.it': m}
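# Illustrative usage sketch (not part of the library), assuming numpy has been
# imported as np:
#
#     x = np.random.normal(size=(50, 5))
#     y = x[:, [0]] + np.random.normal(size=(50, 1))
#     lmbda = np.full(shape=(5, 1), fill_value=10.0)  # one penalty per column
#     out = LassoShooting_fit(x, y, lmbda)
#     out['coefficients']                             # final weights, 5 by 1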
def lambdaCalculation(homoskedastic=False, X_dependent_lambda=False,
                      lambda_start=None, c=1.1, gamma=0.1, numSim=5000,
                      y=None, x=None, par=True, corecap=np.inf,
                      fix_seed=True):
    """ Calculates the data-driven penalty level and penalty loadings

    Inputs
    homoskedastic: Boolean or None, if True, assumes homoskedastic errors;
                   the None case requires lambda_start to be provided
    X_dependent_lambda: Boolean, if True, simulates the penalty level instead
                        of using the Gaussian approximation
    lambda_start: Scalar or p by 1 vector, user-provided penalty
    c, gamma: Scalars, constants of the penalty level
    numSim: Integer, number of simulations for the X-dependent cases
    y: n by 1 NumPy array, outcome or (preliminary) residuals
    x: n by p NumPy array, RHS variables
    par, corecap, fix_seed: Parallel processing options

    Outputs
    Dictionary containing the penalty level lambda0, the p by 1 vector of
    penalty terms lambda, and the penalty loadings Ups0
    """
    # Get number of observations n and number of variables p
    n, p = x.shape

    # Get number of simulations to use (if simulations are necessary)
    R = numSim

    # lambda0 and Ups0 remain None if a fixed, user-provided lambda_start is
    # used without adjustment (case 2 below)
    lmbda0 = None
    Ups0 = None

    # Go through all possible combinations of homoskedastic/heteroskedastic
    # and X-dependent or independent error terms. The first two cases are
    # special cases: handling the case where homoskedastic was set to None,
    # and where lambda_start was provided.
    #
    # 1) If homoskedastic was set to None (special case)
    if homoskedastic is None:
        # Initialize lambda using the provided lambda_start
        lmbda0 = lambda_start

        Ups0 = (1 / np.sqrt(n)) * np.sqrt((y**2).T @ (x**2)).T

        # Calculate the final vector of penalty terms
        lmbda = lmbda0 * Ups0

    # 2) If lambda_start was provided (special case)
    elif lambda_start is not None:
        # Check whether a homogeneous penalty term was provided (a scalar)
        if np.amax(cvec(lambda_start).shape) == 1:
            # If so, repeat that p times as the penalty term
            lmbda = np.ones(shape=(p, 1)) * lambda_start
        else:
            # Otherwise, use the provided vector of penalty terms as is
            lmbda = lambda_start

    # 3) Homoskedastic and X-independent
    elif (homoskedastic == True) and (X_dependent_lambda == False):
        # Initialize lambda
        lmbda0 = 2 * c * np.sqrt(n) * norm.ppf(1 - gamma / (2 * p))

        # Use ddof=1 to be consistent with R's var() function
        Ups0 = np.sqrt(np.var(y, axis=0, ddof=1))

        # Calculate the final vector of penalty terms
        lmbda = np.zeros(shape=(p, 1)) + lmbda0 * Ups0

    # 4) Homoskedastic and X-dependent
    elif (homoskedastic == True) and (X_dependent_lambda == True):
        psi = cvec((x**2).mean(axis=0))
        tXtpsi = (x.T / np.sqrt(psi)).T

        # Check whether to use parallel processing
        if par:
            # If so, get the number of cores to use
            cores = int(np.amin([mp.cpu_count(), corecap]))
        else:
            # Otherwise, use only one core (i.e. run sequentially)
            cores = 1

        # Get simulated distribution
        sim = jbl.Parallel(n_jobs=cores)(
            jbl.delayed(simul_pen)(n, p, tXtpsi, seed=l * 20,
                                   fix_seed=fix_seed)
            for l in np.arange(R))

        # Convert it to a proper column vector
        sim = cvec(sim)

        # Initialize lambda based on the simulated quantiles
        lmbda0 = c * np.quantile(sim, q=1 - gamma, axis=0)

        Ups0 = np.sqrt(np.var(y, axis=0, ddof=1))

        # Calculate the final vector of penalty terms
        lmbda = np.zeros(shape=(p, 1)) + lmbda0 * Ups0

    # 5) Heteroskedastic and X-independent
    elif (homoskedastic == False) and (X_dependent_lambda == False):
        # The original includes the comment, "1=num endogenous variables"
        lmbda0 = 2 * c * np.sqrt(n) * norm.ppf(1 - gamma / (2 * p * 1))

        Ups0 = (1 / np.sqrt(n)) * np.sqrt((y**2).T @ (x**2)).T

        lmbda = lmbda0 * Ups0

    # 6) Heteroskedastic and X-dependent
    elif (homoskedastic == False) and (X_dependent_lambda == True):
        eh = y
        ehat = eh @ np.ones(shape=(1, p))

        xehat = x * ehat
        psi = cvec((xehat**2).mean(axis=0)).T
        tXehattpsi = xehat / (np.ones(shape=(n, 1)) @ np.sqrt(psi))

        # Check whether to use parallel processing
        if par:
            # If so, get the number of cores to use
            cores = int(np.amin([mp.cpu_count(), corecap]))
        else:
            # Otherwise, use only one core (i.e. run sequentially)
            cores = 1

        # Get simulated distribution
        sim = jbl.Parallel(n_jobs=cores)(
            jbl.delayed(simul_pen)(n, p, tXehattpsi, seed=l * 20,
                                   fix_seed=fix_seed)
            for l in np.arange(R))

        # Convert it to a proper column vector
        sim = cvec(sim)

        # Initialize lambda based on the simulated quantiles
        lmbda0 = c * np.quantile(sim, q=1 - gamma, axis=0)

        Ups0 = (1 / np.sqrt(n)) * np.sqrt((y**2).T @ (x**2)).T

        # Calculate the final vector of penalty terms
        lmbda = lmbda0 * Ups0

    # Return results
    return {'lambda0': lmbda0, 'lambda': lmbda, 'Ups0': Ups0}
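# Illustrative usage sketch (not part of the library): get the
# heteroskedasticity-robust penalty for regressors x given preliminary
# residuals e (both NumPy arrays):
#
#     pen = lambdaCalculation(homoskedastic=False, y=e, x=x)
#     pen['lambda0']  # scalar penalty level
#     pen['lambda']   # p by 1 penalty vector, lambda0 times the loadings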
def __init__(self, x, y, colnames=None, post=True, intercept=True, model=True,
             homoskedastic=False, X_dependent_lambda=False, lambda_start=None,
             c=1.1, gamma=None, numSim=5000, numIter=15, tol=10**(-5),
             threshold=-np.inf, par=True, corecap=np.inf, fix_seed=True):
    # Initialize internal variables
    if isinstance(x, pd.DataFrame) and colnames is None:
        colnames = x.columns

    self.x = np.array(x).astype(np.float32)
    self.y = cvec(y).astype(np.float32)

    self.n, self.p = self.x.shape

    if colnames is None:
        self.colnames = ['V' + str(i + 1) for i in np.arange(self.p)]
    else:
        self.colnames = colnames

    # Unused line in the original code
    # ind_names = np.arange(self.p) + 1

    self.post = post
    self.intercept = intercept
    self.model = model
    self.homoskedastic = homoskedastic
    self.X_dependent_lambda = X_dependent_lambda
    self.lambda_start = lambda_start
    self.c = c
    if gamma is None:
        self.gamma = .1 / np.log(self.n)
    else:
        self.gamma = gamma
    self.numSim = numSim
    self.numIter = numIter
    self.tol = tol
    self.threshold = threshold
    self.par = par
    self.corecap = corecap
    self.fix_seed = fix_seed

    # Use a smaller penalty constant for the non-post lasso when all other
    # options are at their defaults
    if (self.post == False) and (self.c is None):
        self.c = .5
    if ((self.post == False) and (self.homoskedastic == False)
            and (self.X_dependent_lambda == False)
            and (self.lambda_start is None) and (self.c == 1.1)
            and (self.gamma == .1 / np.log(self.n))):
        self.c = .5

    # For now, instantiate estimate as None
    self.est = None

    # Calculate robust LASSO coefficients
    if self.intercept:
        # Demean both x and y
        meanx = cvec(self.x.mean(axis=0))
        self.x = self.x - np.ones(shape=(self.n, 1)) @ meanx.T
        mu = self.y.mean()
        self.y = self.y - mu
    else:
        meanx = np.zeros(shape=(self.p, 1))
        mu = 0

    # Column-wise standard deviations (kept for parity with the R original;
    # not used below)
    normx = np.sqrt(np.var(self.x, axis=0, ddof=1))
    Psi = cvec(np.mean(self.x**2, axis=0))
    ind = np.zeros(shape=(self.p, 1)).astype(bool)

    XX = self.x.T @ self.x
    Xy = self.x.T @ self.y

    startingval = init_values(self.x, self.y)['residuals']
    pen = lambdaCalculation(homoskedastic=self.homoskedastic,
                            X_dependent_lambda=self.X_dependent_lambda,
                            lambda_start=self.lambda_start, c=self.c,
                            gamma=self.gamma, numSim=self.numSim,
                            y=startingval, x=self.x, par=self.par,
                            corecap=self.corecap, fix_seed=self.fix_seed)
    lmbda = pen['lambda']
    Ups0 = Ups1 = pen['Ups0']
    lmbda0 = pen['lambda0']

    mm = 1
    s0 = np.sqrt(np.var(y, axis=0, ddof=1))

    while mm <= self.numIter:
        # The first iteration of the post-lasso uses a halved penalty
        if (mm == 1) and self.post:
            coefTemp = LassoShooting_fit(self.x, self.y, lmbda / 2, XX=XX,
                                         Xy=Xy)['coefficients']
        else:
            coefTemp = LassoShooting_fit(self.x, self.y, lmbda, XX=XX,
                                         Xy=Xy)['coefficients']

        coefTemp[np.isnan(coefTemp)] = 0

        # Get the selected variables
        ind1 = (np.abs(coefTemp) > 0)
        x1 = self.x[:, ind1[:, 0]]

        # If no variables were selected, return a null model
        if x1.shape[1] == 0:
            if self.intercept:
                intercept_value = np.mean(self.y + mu)
                coef = np.zeros(shape=(self.p + 1, 1))
                coef = pd.DataFrame(coef, index=['(Intercept)']
                                    + list(self.colnames))
            else:
                intercept_value = np.mean(self.y)
                coef = np.zeros(shape=(self.p, 1))
                coef = pd.DataFrame(coef, index=self.colnames)

            self.est = {
                'coefficients': coef,
                'beta': np.zeros(shape=(self.p, 1)),
                'intercept': intercept_value,
                'index': pd.DataFrame(
                    np.zeros(shape=(self.p, 1)).astype(bool),
                    index=self.colnames),
                'lambda': lmbda,
                'lambda0': lmbda0,
                'loadings': Ups0,
                'residuals': self.y - np.mean(self.y),
                'sigma': np.var(self.y, axis=0, ddof=1),
                'iter': mm,
                # The R original also stores call = match.call()
                'options': {'post': self.post, 'intercept': self.intercept,
                            'ind.scale': ind, 'mu': mu, 'meanx': meanx}
            }
            if self.model:
                self.est['model'] = self.x
            else:
                self.est['model'] = None
            self.est['tss'] = self.est['rss'] = (
                ((self.y - np.mean(self.y))**2).sum())
            self.est['dev'] = self.y - np.mean(self.y)

            # In R, return() breaks while loops
            return

        # Refinement variance estimation
        if self.post:
            # Post-lasso: OLS of y on the selected variables
            reg = lm(fit_intercept=False).fit(x1, self.y)
            coefT = reg.coef_.T
            coefT[np.isnan(coefT)] = 0
            e1 = self.y - x1 @ coefT
            coefTemp[ind1[:, 0]] = coefT
        else:
            e1 = self.y - x1 @ coefTemp[ind1[:, 0]]
        s1 = np.sqrt(np.var(e1, ddof=1))

        # Homoskedastic and X-independent
        if ((self.homoskedastic == True)
                and (self.X_dependent_lambda == False)):
            Ups1 = s1 * Psi
            lmbda = pen['lambda0'] * Ups1
        # Homoskedastic and X-dependent
        elif ((self.homoskedastic == True)
                and (self.X_dependent_lambda == True)):
            Ups1 = s1 * Psi
            lmbda = pen['lambda0'] * Ups1
        # Heteroskedastic and X-independent
        elif ((self.homoskedastic == False)
                and (self.X_dependent_lambda == False)):
            Ups1 = (1 / np.sqrt(self.n)) * np.sqrt((e1**2).T @ self.x**2).T
            lmbda = pen['lambda0'] * Ups1
        # Heteroskedastic and X-dependent
        elif ((self.homoskedastic == False)
                and (self.X_dependent_lambda == True)):
            lc = lambdaCalculation(
                homoskedastic=self.homoskedastic,
                X_dependent_lambda=self.X_dependent_lambda,
                lambda_start=self.lambda_start, c=self.c, gamma=self.gamma,
                numSim=self.numSim, y=e1, x=self.x, par=self.par,
                corecap=self.corecap, fix_seed=self.fix_seed)
            Ups1 = lc['Ups0']
            lmbda = lc['lambda']
        # If homoskedastic is set to None
        elif self.homoskedastic is None:
            Ups1 = (1 / np.sqrt(self.n)) * np.sqrt((e1**2).T @ self.x**2).T
            lmbda = pen['lambda0'] * Ups1

        mm = mm + 1
        if np.abs(s0 - s1) < self.tol:
            break
        s0 = s1

    if x1.shape[1] == 0:
        # coefTemp = None
        ind1 = np.zeros(shape=(self.p, 1))

    coefTemp = cvec(coefTemp)
    coefTemp[np.abs(coefTemp) < self.threshold] = 0
    coefTemp = pd.DataFrame(coefTemp, index=self.colnames)

    ind1 = cvec(ind1)
    ind1 = pd.DataFrame(ind1, index=self.colnames)

    if self.intercept:
        if mu is None:
            mu = 0
        if meanx is None:
            meanx = np.zeros(shape=(coefTemp.shape[0], 1))
        # The R original distinguishes the cases ind.sum() == 0 and
        # ind.sum() > 0, but both reduce to the same expression here
        intercept_value = mu - (meanx * coefTemp).sum()
    else:
        intercept_value = np.nan

    if self.intercept:
        beta = np.concatenate([cvec(intercept_value), coefTemp.values],
                              axis=0)
        beta = pd.DataFrame(beta,
                            index=['(Intercept)'] + list(self.colnames))
    else:
        beta = coefTemp

    s1 = np.sqrt(np.var(e1, ddof=1))

    self.est = {
        'coefficients': beta,
        'beta': pd.DataFrame(coefTemp, index=self.colnames),
        'intercept': intercept_value,
        'index': ind1,
        'lambda': pd.DataFrame(lmbda, index=self.colnames),
        'lambda0': lmbda0,
        'loadings': Ups1,
        'residuals': cvec(e1),
        'sigma': s1,
        'iter': mm,
        # The R original also stores call = match.call()
        'options': {'post': self.post, 'intercept': self.intercept,
                    'ind.scale': ind, 'mu': mu, 'meanx': meanx},
        'model': self.model
    }

    if self.model:
        # Add the means back to the demeaned data
        self.x = self.x + np.ones(shape=(self.n, 1)) @ meanx.T
        self.est['model'] = self.x
    else:
        self.est['model'] = None

    self.est['tss'] = ((self.y - np.mean(self.y))**2).sum()
    self.est['rss'] = (self.est['residuals']**2).sum()
    self.est['dev'] = self.y - np.mean(self.y)
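# Illustrative usage sketch (not part of the library; assumes this __init__
# belongs to the rlasso class): fit a lasso with the data-driven penalty and
# inspect the selected model:
#
#     m = rlasso(X, y, post=True)
#     m.est['coefficients']  # post-lasso coefficients incl. '(Intercept)'
#     m.est['index']         # Boolean selection index, one row per column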