def run_linear_regression(genotype_df, phenotype_df, covariate_df, add_intercept=True):
    """Drive ``lr._linear_regression_inner`` directly from in-memory pandas frames.

    Args:
        genotype_df: Genotype frame; its row count is used as the number of
            rows for the intercept / zero-covariate fallback, and each of its
            columns becomes one row of the values frame handed to the inner
            routine.
        phenotype_df: Phenotype frame; NaN entries are treated as missing.
        covariate_df: Covariate frame (may be empty).
        add_intercept: When true, ``gwas_fx._add_intercept`` is applied to the
            covariate matrix (presumably appending a constant column -- confirm
            against the helper).

    Returns:
        Whatever ``lr._linear_regression_inner`` returns for the prepared inputs.
    """
    names = phenotype_df.columns.astype('str').to_series()
    n_rows = genotype_df.shape[0]

    covariates = covariate_df.to_numpy('float64', copy=True)
    if add_intercept:
        covariates = gwas_fx._add_intercept(covariates, n_rows)
    if covariates.size == 0:
        # No covariates at all: fall back to a single all-zero column so the
        # QR decomposition below still has a matrix to work on.
        covariates = np.zeros((n_rows, 1))

    # Zero-fill missing phenotypes, then centre each column (the zero-filled
    # entries take part in the column mean).
    phenotypes = phenotype_df.to_numpy('float64', copy=True)
    missing = np.isnan(phenotypes)
    observed = ~missing
    phenotypes[missing] = 0
    phenotypes -= phenotypes.mean(axis=0)

    # Residualize against the Q factor of the covariates, then re-apply the
    # mask so missing entries are zero again.
    q_factor, _ = np.linalg.qr(covariates)
    residuals = gwas_fx._residualize_in_place(phenotypes, q_factor) * observed

    unit_scale = np.ones(residuals.shape[1])
    y_state = lr._create_YState(
        residuals, phenotype_df, pd.DataFrame({}), observed, np.float64, None)
    dof = covariates.shape[0] - covariates.shape[1] - 1

    # One row per genotype column, each holding that column's values.
    genotype_columns = list(genotype_df.to_numpy('float64').T)
    values_pdf = pd.DataFrame({lr._VALUES_COLUMN_NAME: genotype_columns})

    return lr._linear_regression_inner(
        values_pdf, y_state, observed.astype('float64'), unit_scale, q_factor,
        dof, names, None, None)
def run_score_test(genotype_df, phenotype_df, covariate_df, correction=lr.correction_none, add_intercept=True):
    """Drive ``lr._logistic_regression_inner`` directly from in-memory pandas frames.

    Args:
        genotype_df: Genotype frame; each column becomes one row of the values
            frame handed to the inner routine.
        phenotype_df: Phenotype frame; NaN entries are treated as missing and
            zero-filled in the matrix passed to the inner routine.
        covariate_df: Covariate frame.
        correction: Forwarded to ``lr._prepare_one_phenotype`` when the
            per-phenotype state is built.
        add_intercept: When true, ``gwas_fx._add_intercept`` is applied to the
            covariate matrix; the flag is also forwarded to
            ``lr._prepare_one_phenotype``.

    Returns:
        Whatever ``lr._logistic_regression_inner`` returns for the prepared inputs.
    """
    covariates = covariate_df.to_numpy(copy=True)
    if add_intercept:
        covariates = gwas_fx._add_intercept(covariates, phenotype_df.shape[0])

    phenotypes = phenotype_df.to_numpy(copy=True)
    missing = np.isnan(phenotypes)
    observed = ~missing
    phenotypes[missing] = 0

    # Build one state row per phenotype column, from the original
    # (not zero-filled) phenotype values.
    state_rows = []
    for label in phenotype_df:
        phenotype_row = pd.Series({'label': label, 'values': phenotype_df[label]})
        state_rows.append(
            lr._prepare_one_phenotype(covariates, phenotype_row, correction, add_intercept))

    state_names = phenotype_df.columns.to_series().astype('str')
    state = lr._pdf_to_log_reg_state(
        pd.DataFrame(state_rows), state_names, covariates.shape[1])

    # One row per genotype column, each holding that column's values.
    genotype_columns = list(genotype_df.to_numpy().T)
    values_df = pd.DataFrame({gwas_fx._VALUES_COLUMN_NAME: genotype_columns})

    # NOTE(review): the inner call always receives lr.correction_none, whatever
    # `correction` was used to prepare the state -- presumably deliberate for a
    # score-test-only helper; confirm.
    output_names = phenotype_df.columns.to_series().astype('str')
    return lr._logistic_regression_inner(
        values_df, state, covariates, phenotypes, observed, None,
        lr.correction_none, 0.05, output_names)
def statsmodels_baseline(genotype_df, phenotype_df, covariate_df, offset_dfs=None, add_intercept=True):
    """Fit one statsmodels OLS per (phenotype column, genotype column) pair.

    For every pair the phenotype column is zero-filled where missing,
    mean-centred, residualized against the covariates, re-masked and scaled;
    the genotype column is residualized against the same covariates. The
    model's ``df_resid`` is overridden with ``rows(C) - cols(C) - 1`` before
    fitting, and effect / standard error are multiplied back by the phenotype
    scale.

    Args:
        genotype_df: Genotype frame, one genotype per column.
        phenotype_df: Phenotype frame, one phenotype per column; NaN marks
            missing values. NOTE: its ``columns`` are replaced in place by
            their string form.
        covariate_df: Covariate frame (may be empty).
        offset_dfs: Optional sequence indexed by genotype position; column
            ``phenotype_idx`` of each entry is subtracted from the scaled
            phenotype.
        add_intercept: When true, ``gwas_fx._add_intercept`` is applied to the
            covariate matrix.

    Returns:
        DataFrame with columns ``effect``, ``standard_error``, ``tvalue``,
        ``pvalue`` and ``phenotype``, ordered phenotype-major.
    """
    covariates = covariate_df.to_numpy('float64')
    if covariates.size:
        n_rows = covariates.shape[0]
    else:
        n_rows = genotype_df.shape[0]
    if add_intercept:
        covariates = gwas_fx._add_intercept(covariates, n_rows)

    phenotypes = phenotype_df.to_numpy('float64')
    genotypes = genotype_df.to_numpy('float64')
    # Side effect kept from the original contract: the caller's frame is relabelled.
    phenotype_df.columns = phenotype_df.columns.astype('str')

    n_covariates = covariates.shape[1]
    dof = covariates.shape[0] - n_covariates - 1
    stats = {'effect': [], 'standard_error': [], 'tvalue': [], 'pvalue': []}

    for phenotype_idx in range(phenotypes.shape[1]):
        raw = phenotypes[:, phenotype_idx]
        observed = ~np.isnan(raw)
        n_observed = observed.sum()

        for genotype_idx in range(genotypes.shape[1]):
            # Fresh working copy per pair: missing -> 0, centre, project out
            # covariates, then zero the missing slots again.
            y = np.where(observed, raw, 0.0)
            y = y - y.mean()
            y = residualize(y, covariates) * observed

            scale = np.sqrt((y**2).sum() / (n_observed - n_covariates))
            y = y / scale

            if offset_dfs:
                offset = offset_dfs[genotype_idx].iloc[:, phenotype_idx].to_numpy('float64')
                y = y - offset
            # Mark missing samples as NaN so OLS drops them (missing='drop').
            y[~observed] = np.nan

            x = pd.Series(residualize(genotypes[:, genotype_idx], covariates), name='genotype')

            model = sm.OLS(y, x, missing='drop')
            model.df_resid = dof
            fit = model.fit()

            # Undo the phenotype scaling on effect and standard error.
            stats['effect'].append(fit.params['genotype'] * scale)
            stats['standard_error'].append(fit.bse['genotype'] * scale)
            stats['tvalue'].append(fit.tvalues['genotype'])
            stats['pvalue'].append(fit.pvalues['genotype'])

    phenotype_labels = phenotype_df.columns.to_series().repeat(genotype_df.shape[1])
    return pd.DataFrame({**stats, 'phenotype': phenotype_labels})