def map_nominal(genotype_df, variant_df, phenotype_df, phenotype_pos_df, covariates_df, prefix, interaction_s=None, maf_threshold_interaction=0.05, group_s=None, window=1000000, run_eigenmt=False, output_dir='.', write_top=True, write_stats=True, logger=None, verbose=True): """ cis-QTL mapping: nominal associations for all variant-phenotype pairs Association results for each chromosome are written to parquet files in the format <output_dir>/<prefix>.cis_qtl_pairs.<chr>.parquet If interaction_s is provided, the top association per phenotype is written to <output_dir>/<prefix>.cis_qtl_top_assoc.txt.gz unless write_top is set to False, in which case it is returned as a DataFrame """ assert np.all(phenotype_df.columns == covariates_df.index) device = torch.device("cuda" if torch.cuda.is_available() else "cpu") if logger is None: logger = SimpleLogger() if group_s is not None: group_dict = group_s.to_dict() logger.write( 'cis-QTL mapping: nominal associations for all variant-phenotype pairs' ) logger.write(' * {} samples'.format(phenotype_df.shape[1])) logger.write(' * {} phenotypes'.format(phenotype_df.shape[0])) logger.write(' * {} covariates'.format(covariates_df.shape[1])) logger.write(' * {} variants'.format(variant_df.shape[0])) if interaction_s is not None: assert np.all(interaction_s.index == covariates_df.index) logger.write(' * including interaction term') if maf_threshold_interaction > 0: logger.write(' * using {:.2f} MAF threshold'.format( maf_threshold_interaction)) covariates_t = torch.tensor(covariates_df.values, dtype=torch.float32).to(device) residualizer = Residualizer(covariates_t) del covariates_t genotype_ix = np.array( [genotype_df.columns.tolist().index(i) for i in phenotype_df.columns]) genotype_ix_t = torch.from_numpy(genotype_ix).to(device) if interaction_s is None: dof = phenotype_df.shape[1] - 2 - covariates_df.shape[1] else: dof = phenotype_df.shape[1] - 4 - covariates_df.shape[1] interaction_t = torch.tensor(interaction_s.values.reshape(1, -1), dtype=torch.float32).to(device) if maf_threshold_interaction > 0: interaction_mask_t = torch.BoolTensor( interaction_s >= interaction_s.median()).to(device) else: interaction_mask_t = None igc = genotypeio.InputGeneratorCis(genotype_df, variant_df, phenotype_df, phenotype_pos_df, group_s=group_s, window=window) # iterate over chromosomes best_assoc = [] start_time = time.time() k = 0 logger.write(' * Computing associations') for chrom in igc.chrs: logger.write(' Mapping chromosome {}'.format(chrom)) # allocate arrays n = 0 if group_s is None: for i in igc.phenotype_pos_df[igc.phenotype_pos_df['chr'] == chrom].index: j = igc.cis_ranges[i] n += j[1] - j[0] + 1 else: for i in igc.group_s[igc.phenotype_pos_df['chr'] == chrom].drop_duplicates().index: j = igc.cis_ranges[i] n += j[1] - j[0] + 1 chr_res = OrderedDict() chr_res['phenotype_id'] = [] chr_res['variant_id'] = [] chr_res['tss_distance'] = np.empty(n, dtype=np.int32) chr_res['maf'] = np.empty(n, dtype=np.float32) chr_res['ma_samples'] = np.empty(n, dtype=np.int32) chr_res['ma_count'] = np.empty(n, dtype=np.int32) if interaction_s is None: chr_res['pval_nominal'] = np.empty(n, dtype=np.float64) chr_res['slope'] = np.empty(n, dtype=np.float32) chr_res['slope_se'] = np.empty(n, dtype=np.float32) else: chr_res['pval_g'] = np.empty(n, dtype=np.float64) chr_res['b_g'] = np.empty(n, dtype=np.float32) chr_res['b_g_se'] = np.empty(n, dtype=np.float32) chr_res['pval_i'] = np.empty(n, dtype=np.float64) chr_res['b_i'] = np.empty(n, dtype=np.float32) chr_res['b_i_se'] = np.empty(n, dtype=np.float32) chr_res['pval_gi'] = np.empty(n, dtype=np.float64) chr_res['b_gi'] = np.empty(n, dtype=np.float32) chr_res['b_gi_se'] = np.empty(n, dtype=np.float32) start = 0 if group_s is None: for k, (phenotype, genotypes, genotype_range, phenotype_id) in enumerate( igc.generate_data(chrom=chrom, verbose=verbose), k + 1): # copy genotypes to GPU phenotype_t = torch.tensor(phenotype, dtype=torch.float).to(device) genotypes_t = torch.tensor(genotypes, dtype=torch.float).to(device) genotypes_t = genotypes_t[:, genotype_ix_t] impute_mean(genotypes_t) variant_ids = variant_df.index[ genotype_range[0]:genotype_range[-1] + 1] tss_distance = np.int32( variant_df['pos']. values[genotype_range[0]:genotype_range[-1] + 1] - igc.phenotype_tss[phenotype_id]) if interaction_s is None: res = calculate_cis_nominal(genotypes_t, phenotype_t, residualizer) tstat, slope, slope_se, maf, ma_samples, ma_count = [ i.cpu().numpy() for i in res ] n = len(variant_ids) else: genotypes_t, mask_t = filter_maf_interaction( genotypes_t, interaction_mask_t=interaction_mask_t, maf_threshold_interaction=maf_threshold_interaction) if genotypes_t.shape[0] > 0: res = calculate_interaction_nominal( genotypes_t, phenotype_t.unsqueeze(0), interaction_t, residualizer, return_sparse=False) tstat, b, b_se, maf, ma_samples, ma_count = [ i.cpu().numpy() for i in res ] mask = mask_t.cpu().numpy() variant_ids = variant_ids[mask] tss_distance = tss_distance[mask] n = len(variant_ids) # top association ix = np.nanargmax(np.abs(tstat[:, 2])) top_s = pd.Series([ phenotype_id, variant_ids[ix], tss_distance[ix], maf[ix], ma_samples[ix], ma_count[ix], tstat[ix, 0], b[ix, 0], b_se[ix, 0], tstat[ix, 1], b[ix, 1], b_se[ix, 1], tstat[ix, 2], b[ix, 2], b_se[ix, 2] ], index=chr_res.keys()) if run_eigenmt: # compute eigenMT correction top_s['tests_emt'] = eigenmt.compute_tests( genotypes_t, var_thresh=0.99, variant_window=200) best_assoc.append(top_s) else: # all genotypes in window were filtered out n = 0 if n > 0: chr_res['phenotype_id'].extend([phenotype_id] * n) chr_res['variant_id'].extend(variant_ids) chr_res['tss_distance'][start:start + n] = tss_distance chr_res['maf'][start:start + n] = maf chr_res['ma_samples'][start:start + n] = ma_samples chr_res['ma_count'][start:start + n] = ma_count if interaction_s is None: chr_res['pval_nominal'][start:start + n] = tstat chr_res['slope'][start:start + n] = slope chr_res['slope_se'][start:start + n] = slope_se else: chr_res['pval_g'][start:start + n] = tstat[:, 0] chr_res['b_g'][start:start + n] = b[:, 0] chr_res['b_g_se'][start:start + n] = b_se[:, 0] chr_res['pval_i'][start:start + n] = tstat[:, 1] chr_res['b_i'][start:start + n] = b[:, 1] chr_res['b_i_se'][start:start + n] = b_se[:, 1] chr_res['pval_gi'][start:start + n] = tstat[:, 2] chr_res['b_gi'][start:start + n] = b[:, 2] chr_res['b_gi_se'][start:start + n] = b_se[:, 2] start += n # update pointer else: # groups for k, (phenotypes, genotypes, genotype_range, phenotype_ids, group_id) in enumerate( igc.generate_data(chrom=chrom, verbose=verbose), k + 1): # copy genotypes to GPU genotypes_t = torch.tensor(genotypes, dtype=torch.float).to(device) genotypes_t = genotypes_t[:, genotype_ix_t] impute_mean(genotypes_t) variant_ids = variant_df.index[ genotype_range[0]:genotype_range[-1] + 1] # assuming that the TSS for all grouped phenotypes is the same tss_distance = np.int32( variant_df['pos']. values[genotype_range[0]:genotype_range[-1] + 1] - igc.phenotype_tss[phenotype_ids[0]]) if interaction_s is not None: genotypes_t, mask_t = filter_maf_interaction( genotypes_t, interaction_mask_t=interaction_mask_t, maf_threshold_interaction=maf_threshold_interaction) mask = mask_t.cpu().numpy() variant_ids = variant_ids[mask] tss_distance = tss_distance[mask] n = len(variant_ids) if genotypes_t.shape[0] > 0: # process first phenotype in group phenotype_id = phenotype_ids[0] phenotype_t = torch.tensor(phenotypes[0], dtype=torch.float).to(device) if interaction_s is None: res = calculate_cis_nominal(genotypes_t, phenotype_t, residualizer) tstat, slope, slope_se, maf, ma_samples, ma_count = [ i.cpu().numpy() for i in res ] else: res = calculate_interaction_nominal( genotypes_t, phenotype_t.unsqueeze(0), interaction_t, residualizer, return_sparse=False) tstat, b, b_se, maf, ma_samples, ma_count = [ i.cpu().numpy() for i in res ] px = [phenotype_id] * n # iterate over remaining phenotypes in group for phenotype, phenotype_id in zip(phenotypes[1:], phenotype_ids[1:]): phenotype_t = torch.tensor( phenotype, dtype=torch.float).to(device) if interaction_s is None: res = calculate_cis_nominal( genotypes_t, phenotype_t, residualizer) tstat0, slope0, slope_se0, maf, ma_samples, ma_count = [ i.cpu().numpy() for i in res ] else: res = calculate_interaction_nominal( genotypes_t, phenotype_t.unsqueeze(0), interaction_t, residualizer, return_sparse=False) tstat0, b0, b_se0, maf, ma_samples, ma_count = [ i.cpu().numpy() for i in res ] # find associations that are stronger for current phenotype if interaction_s is None: ix = np.where(np.abs(tstat0) > np.abs(tstat))[0] else: ix = np.where( np.abs(tstat0[:, 2]) > np.abs(tstat[:, 2]))[0] # update relevant positions for j in ix: px[j] = phenotype_id if interaction_s is None: tstat[ix] = tstat0[ix] slope[ix] = slope0[ix] slope_se[ix] = slope_se0[ix] else: tstat[ix] = tstat0[ix] b[ix] = b0[ix] b_se[ix] = b_se0[ix] chr_res['phenotype_id'].extend(px) chr_res['variant_id'].extend(variant_ids) chr_res['tss_distance'][start:start + n] = tss_distance chr_res['maf'][start:start + n] = maf chr_res['ma_samples'][start:start + n] = ma_samples chr_res['ma_count'][start:start + n] = ma_count if interaction_s is None: chr_res['pval_nominal'][start:start + n] = tstat chr_res['slope'][start:start + n] = slope chr_res['slope_se'][start:start + n] = slope_se else: chr_res['pval_g'][start:start + n] = tstat[:, 0] chr_res['b_g'][start:start + n] = b[:, 0] chr_res['b_g_se'][start:start + n] = b_se[:, 0] chr_res['pval_i'][start:start + n] = tstat[:, 1] chr_res['b_i'][start:start + n] = b[:, 1] chr_res['b_i_se'][start:start + n] = b_se[:, 1] chr_res['pval_gi'][start:start + n] = tstat[:, 2] chr_res['b_gi'][start:start + n] = b[:, 2] chr_res['b_gi_se'][start:start + n] = b_se[:, 2] # top association for the group if interaction_s is not None: ix = np.nanargmax(np.abs(tstat[:, 2])) top_s = pd.Series([ chr_res['phenotype_id'][start:start + n][ix], variant_ids[ix], tss_distance[ix], maf[ix], ma_samples[ix], ma_count[ix], tstat[ix, 0], b[ix, 0], b_se[ix, 0], tstat[ix, 1], b[ix, 1], b_se[ix, 1], tstat[ix, 2], b[ix, 2], b_se[ix, 2] ], index=chr_res.keys()) top_s['num_phenotypes'] = len(phenotype_ids) if run_eigenmt: # compute eigenMT correction top_s['tests_emt'] = eigenmt.compute_tests( genotypes_t, var_thresh=0.99, variant_window=200) best_assoc.append(top_s) start += n # update pointer logger.write(' time elapsed: {:.2f} min'.format( (time.time() - start_time) / 60)) # convert to dataframe, compute p-values and write current chromosome if start < len(chr_res['maf']): for x in chr_res: chr_res[x] = chr_res[x][:start] if write_stats: chr_res_df = pd.DataFrame(chr_res) if interaction_s is None: m = chr_res_df['pval_nominal'].notnull() chr_res_df.loc[m, 'pval_nominal'] = 2 * stats.t.cdf( -chr_res_df.loc[m, 'pval_nominal'].abs(), dof) else: m = chr_res_df['pval_gi'].notnull() chr_res_df.loc[m, 'pval_g'] = 2 * stats.t.cdf( -chr_res_df.loc[m, 'pval_g'].abs(), dof) chr_res_df.loc[m, 'pval_i'] = 2 * stats.t.cdf( -chr_res_df.loc[m, 'pval_i'].abs(), dof) chr_res_df.loc[m, 'pval_gi'] = 2 * stats.t.cdf( -chr_res_df.loc[m, 'pval_gi'].abs(), dof) print(' * writing output') chr_res_df.to_parquet( os.path.join( output_dir, '{}.cis_qtl_pairs.{}.parquet'.format(prefix, chrom))) if interaction_s is not None: best_assoc = pd.concat( best_assoc, axis=1, sort=False).T.set_index('phenotype_id').infer_objects() m = best_assoc['pval_g'].notnull() best_assoc.loc[m, 'pval_g'] = 2 * stats.t.cdf( -best_assoc.loc[m, 'pval_g'].abs(), dof) best_assoc.loc[m, 'pval_i'] = 2 * stats.t.cdf( -best_assoc.loc[m, 'pval_i'].abs(), dof) best_assoc.loc[m, 'pval_gi'] = 2 * stats.t.cdf( -best_assoc.loc[m, 'pval_gi'].abs(), dof) if run_eigenmt: if group_s is None: best_assoc['pval_emt'] = np.minimum( best_assoc['tests_emt'] * best_assoc['pval_gi'], 1) else: best_assoc['pval_emt'] = np.minimum( best_assoc['num_phenotypes'] * best_assoc['tests_emt'] * best_assoc['pval_gi'], 1) best_assoc['pval_adj_bh'] = eigenmt.padjust_bh( best_assoc['pval_emt']) if write_top: best_assoc.to_csv(os.path.join( output_dir, '{}.cis_qtl_top_assoc.txt.gz'.format(prefix)), sep='\t', float_format='%.6g') else: return best_assoc logger.write('done.')
def map_cis(genotype_df, variant_df, phenotype_df, phenotype_pos_df, covariates_df, group_s=None, beta_approx=True, nperm=10000, window=1000000, logger=None, seed=None, verbose=True): """Run cis-QTL mapping""" device = torch.device("cuda" if torch.cuda.is_available() else "cpu") assert np.all(phenotype_df.columns == covariates_df.index) if logger is None: logger = SimpleLogger() logger.write('cis-QTL mapping: empirical p-values for phenotypes') logger.write(' * {} samples'.format(phenotype_df.shape[1])) logger.write(' * {} phenotypes'.format(phenotype_df.shape[0])) if group_s is not None: logger.write(' * {} phenotype groups'.format(len(group_s.unique()))) group_dict = group_s.to_dict() logger.write(' * {} covariates'.format(covariates_df.shape[1])) logger.write(' * {} variants'.format(genotype_df.shape[0])) covariates_t = torch.tensor(covariates_df.values, dtype=torch.float32).to(device) residualizer = Residualizer(covariates_t) del covariates_t genotype_ix = np.array( [genotype_df.columns.tolist().index(i) for i in phenotype_df.columns]) genotype_ix_t = torch.from_numpy(genotype_ix).to(device) dof = phenotype_df.shape[1] - 2 - covariates_df.shape[1] # permutation indices n_samples = phenotype_df.shape[1] ix = np.arange(n_samples) if seed is not None: logger.write(' * using seed {}'.format(seed)) np.random.seed(seed) permutation_ix_t = torch.LongTensor( np.array([np.random.permutation(ix) for i in range(nperm)])).to(device) res_df = [] igc = genotypeio.InputGeneratorCis(genotype_df, variant_df, phenotype_df, phenotype_pos_df, group_s=group_s, window=window) start_time = time.time() logger.write(' * computing permutations') if group_s is None: for k, (phenotype, genotypes, genotype_range, phenotype_id) in enumerate(igc.generate_data(verbose=verbose), 1): # copy genotypes to GPU genotypes_t = torch.tensor(genotypes, dtype=torch.float).to(device) genotypes_t = genotypes_t[:, genotype_ix_t] impute_mean(genotypes_t) phenotype_t = torch.tensor(phenotype, dtype=torch.float).to(device) res = calculate_cis_permutations(genotypes_t, phenotype_t, residualizer, permutation_ix_t) r_nominal, std_ratio, var_ix, r2_perm, g = [ i.cpu().numpy() for i in res ] var_ix = genotype_range[var_ix] variant_id = variant_df.index[var_ix] tss_distance = variant_df['pos'].values[ var_ix] - igc.phenotype_tss[phenotype_id] res_s = prepare_cis_output(r_nominal, r2_perm, std_ratio, g, genotypes.shape[0], dof, variant_id, tss_distance, phenotype_id, nperm=nperm) if beta_approx: res_s[[ 'pval_beta', 'beta_shape1', 'beta_shape2', 'true_df', 'pval_true_df' ]] = calculate_beta_approx_pval(r2_perm, r_nominal * r_nominal, dof) res_df.append(res_s) else: for k, (phenotypes, genotypes, genotype_range, phenotype_ids, group_id) in enumerate(igc.generate_data(verbose=verbose), 1): # copy genotypes to GPU genotypes_t = torch.tensor(genotypes, dtype=torch.float).to(device) genotypes_t = genotypes_t[:, genotype_ix_t] impute_mean(genotypes_t) # iterate over phenotypes buf = [] for phenotype, phenotype_id in zip(phenotypes, phenotype_ids): phenotype_t = torch.tensor(phenotype, dtype=torch.float).to(device) res = calculate_cis_permutations(genotypes_t, phenotype_t, residualizer, permutation_ix_t) res = [i.cpu().numpy() for i in res ] # r_nominal, std_ratio, var_ix, r2_perm, g res[2] = genotype_range[res[2]] buf.append(res + [genotypes.shape[0], phenotype_id]) res_s = _process_group_permutations( buf, variant_df, igc.phenotype_tss[phenotype_ids[0]], dof, group_id, nperm=nperm) res_df.append(res_s) res_df = pd.concat(res_df, axis=1, sort=False).T res_df.index.name = 'phenotype_id' logger.write(' Time elapsed: {:.2f} min'.format( (time.time() - start_time) / 60)) logger.write('done.') return res_df.astype(output_dtype_dict).infer_objects()
def map_independent(genotype_df, variant_df, cis_df, phenotype_df, phenotype_pos_df, covariates_df, group_s=None, fdr=0.05, fdr_col='qval', nperm=10000, window=1000000, logger=None, seed=None, verbose=True): """ Run independent cis-QTL mapping (forward-backward regression) cis_df: output from map_cis, annotated with q-values (calculate_qvalues) """ device = torch.device("cuda" if torch.cuda.is_available() else "cpu") assert np.all(phenotype_df.index == phenotype_pos_df.index) assert np.all(covariates_df.index == phenotype_df.columns) if logger is None: logger = SimpleLogger() signif_df = cis_df[cis_df[fdr_col] <= fdr].copy() cols = [ 'num_var', 'beta_shape1', 'beta_shape2', 'true_df', 'pval_true_df', 'variant_id', 'tss_distance', 'ma_samples', 'ma_count', 'maf', 'ref_factor', 'pval_nominal', 'slope', 'slope_se', 'pval_perm', 'pval_beta', ] if group_s is not None: cols += ['group_id', 'group_size'] signif_df = signif_df[cols] signif_threshold = signif_df['pval_beta'].max() # subset significant phenotypes if group_s is None: ix = phenotype_df.index[phenotype_df.index.isin(signif_df.index)] else: ix = group_s[phenotype_df.index].loc[group_s[phenotype_df.index].isin( signif_df['group_id'])].index logger.write('cis-QTL mapping: conditionally independent variants') logger.write(' * {} samples'.format(phenotype_df.shape[1])) if group_s is None: logger.write(' * {}/{} significant phenotypes'.format( signif_df.shape[0], cis_df.shape[0])) else: logger.write(' * {}/{} significant groups'.format( signif_df.shape[0], cis_df.shape[0])) logger.write(' {}/{} phenotypes'.format(len(ix), phenotype_df.shape[0])) group_dict = group_s.to_dict() logger.write(' * {} covariates'.format(covariates_df.shape[1])) logger.write(' * {} variants'.format(genotype_df.shape[0])) # print('Significance threshold: {}'.format(signif_threshold)) phenotype_df = phenotype_df.loc[ix] phenotype_pos_df = phenotype_pos_df.loc[ix] genotype_ix = np.array( [genotype_df.columns.tolist().index(i) for i in phenotype_df.columns]) genotype_ix_t = torch.from_numpy(genotype_ix).to(device) dof = phenotype_df.shape[1] - 2 - covariates_df.shape[1] ix_dict = {i: k for k, i in enumerate(genotype_df.index)} # permutation indices n_samples = phenotype_df.shape[1] ix = np.arange(n_samples) if seed is not None: logger.write(' * using seed {}'.format(seed)) np.random.seed(seed) permutation_ix_t = torch.LongTensor( np.array([np.random.permutation(ix) for i in range(nperm)])).to(device) res_df = [] igc = genotypeio.InputGeneratorCis(genotype_df, variant_df, phenotype_df, phenotype_pos_df, group_s=group_s, window=window) logger.write(' * computing independent QTLs') start_time = time.time() if group_s is None: for k, (phenotype, genotypes, genotype_range, phenotype_id) in enumerate(igc.generate_data(verbose=verbose), 1): # copy genotypes to GPU phenotype_t = torch.tensor(phenotype, dtype=torch.float).to(device) genotypes_t = torch.tensor(genotypes, dtype=torch.float).to(device) genotypes_t = genotypes_t[:, genotype_ix_t] impute_mean(genotypes_t) # 1) forward pass forward_df = [signif_df.loc[phenotype_id] ] # initialize results with top variant covariates = covariates_df.values.copy() # initialize covariates dosage_dict = {} while True: # add variant to covariates variant_id = forward_df[-1]['variant_id'] ig = genotype_df.values[ix_dict[variant_id], genotype_ix] dosage_dict[variant_id] = ig covariates = np.hstack([covariates, ig.reshape(-1, 1)]).astype(np.float32) dof = phenotype_df.shape[1] - 2 - covariates.shape[1] covariates_t = torch.tensor(covariates, dtype=torch.float32).to(device) residualizer = Residualizer(covariates_t) del covariates_t res = calculate_cis_permutations(genotypes_t, phenotype_t, residualizer, permutation_ix_t) r_nominal, std_ratio, var_ix, r2_perm, g = [ i.cpu().numpy() for i in res ] x = calculate_beta_approx_pval(r2_perm, r_nominal * r_nominal, dof) # add to list if empirical p-value passes significance threshold if x[0] <= signif_threshold: var_ix = genotype_range[var_ix] variant_id = variant_df.index[var_ix] tss_distance = variant_df['pos'].values[ var_ix] - igc.phenotype_tss[phenotype_id] res_s = prepare_cis_output(r_nominal, r2_perm, std_ratio, g, genotypes.shape[0], dof, variant_id, tss_distance, phenotype_id, nperm=nperm) res_s[[ 'pval_beta', 'beta_shape1', 'beta_shape2', 'true_df', 'pval_true_df' ]] = x forward_df.append(res_s) else: break forward_df = pd.concat(forward_df, axis=1, sort=False).T dosage_df = pd.DataFrame(dosage_dict) # 2) backward pass if forward_df.shape[0] > 1: back_df = [] variant_set = set() for k, i in enumerate(forward_df['variant_id'], 1): covariates = np.hstack([ covariates_df.values, dosage_df[np.setdiff1d(forward_df['variant_id'], i)].values, ]) dof = phenotype_df.shape[1] - 2 - covariates.shape[1] covariates_t = torch.tensor(covariates, dtype=torch.float32).to(device) residualizer = Residualizer(covariates_t) del covariates_t res = calculate_cis_permutations(genotypes_t, phenotype_t, residualizer, permutation_ix_t) r_nominal, std_ratio, var_ix, r2_perm, g = [ i.cpu().numpy() for i in res ] var_ix = genotype_range[var_ix] variant_id = variant_df.index[var_ix] x = calculate_beta_approx_pval(r2_perm, r_nominal * r_nominal, dof) if x[0] <= signif_threshold and variant_id not in variant_set: tss_distance = variant_df['pos'].values[ var_ix] - igc.phenotype_tss[phenotype_id] res_s = prepare_cis_output(r_nominal, r2_perm, std_ratio, g, genotypes.shape[0], dof, variant_id, tss_distance, phenotype_id, nperm=nperm) res_s[[ 'pval_beta', 'beta_shape1', 'beta_shape2', 'true_df', 'pval_true_df' ]] = x res_s['rank'] = k back_df.append(res_s) variant_set.add(variant_id) if len(back_df) > 0: res_df.append(pd.concat(back_df, axis=1, sort=False).T) else: # single independent variant forward_df['rank'] = 1 res_df.append(forward_df) else: # grouped phenotypes for k, (phenotypes, genotypes, genotype_range, phenotype_ids, group_id) in enumerate(igc.generate_data(verbose=verbose), 1): # copy genotypes to GPU genotypes_t = torch.tensor(genotypes, dtype=torch.float).to(device) genotypes_t = genotypes_t[:, genotype_ix_t] impute_mean(genotypes_t) # 1) forward pass forward_df = [ signif_df[signif_df['group_id'] == group_id].iloc[0] ] # initialize results with top variant covariates = covariates_df.values.copy() # initialize covariates dosage_dict = {} while True: # add variant to covariates variant_id = forward_df[-1]['variant_id'] ig = genotype_df.values[ix_dict[variant_id], genotype_ix] dosage_dict[variant_id] = ig covariates = np.hstack([covariates, ig.reshape(-1, 1)]).astype(np.float32) dof = phenotype_df.shape[1] - 2 - covariates.shape[1] covariates_t = torch.tensor(covariates, dtype=torch.float32).to(device) residualizer = Residualizer(covariates_t) del covariates_t # iterate over phenotypes buf = [] for phenotype, phenotype_id in zip(phenotypes, phenotype_ids): phenotype_t = torch.tensor(phenotype, dtype=torch.float).to(device) res = calculate_cis_permutations(genotypes_t, phenotype_t, residualizer, permutation_ix_t) res = [i.cpu().numpy() for i in res ] # r_nominal, std_ratio, var_ix, r2_perm, g res[2] = genotype_range[res[2]] buf.append(res + [genotypes.shape[0], phenotype_id]) res_s = _process_group_permutations( buf, variant_df, igc.phenotype_tss[phenotype_ids[0]], dof, group_id, nperm=nperm) # add to list if significant if res_s['pval_beta'] <= signif_threshold: forward_df.append(res_s) else: break forward_df = pd.concat(forward_df, axis=1, sort=False).T dosage_df = pd.DataFrame(dosage_dict) # 2) backward pass if forward_df.shape[0] > 1: back_df = [] variant_set = set() for k, variant_id in enumerate(forward_df['variant_id'], 1): covariates = np.hstack([ covariates_df.values, dosage_df[np.setdiff1d(forward_df['variant_id'], variant_id)].values, ]) dof = phenotype_df.shape[1] - 2 - covariates.shape[1] covariates_t = torch.tensor(covariates, dtype=torch.float32).to(device) residualizer = Residualizer(covariates_t) del covariates_t # iterate over phenotypes buf = [] for phenotype, phenotype_id in zip(phenotypes, phenotype_ids): phenotype_t = torch.tensor( phenotype, dtype=torch.float).to(device) res = calculate_cis_permutations( genotypes_t, phenotype_t, residualizer, permutation_ix_t) res = [i.cpu().numpy() for i in res ] # r_nominal, std_ratio, var_ix, r2_perm, g res[2] = genotype_range[res[2]] buf.append(res + [genotypes.shape[0], phenotype_id]) res_s = _process_group_permutations( buf, variant_df, igc.phenotype_tss[phenotype_ids[0]], dof, group_id, nperm=nperm) if res_s[ 'pval_beta'] <= signif_threshold and variant_id not in variant_set: res_s['rank'] = k back_df.append(res_s) variant_set.add(variant_id) if len(back_df) > 0: res_df.append(pd.concat(back_df, axis=1, sort=False).T) else: # single independent variant forward_df['rank'] = 1 res_df.append(forward_df) res_df = pd.concat(res_df, axis=0, sort=False) res_df.index.name = 'phenotype_id' logger.write(' Time elapsed: {:.2f} min'.format( (time.time() - start_time) / 60)) logger.write('done.') return res_df.reset_index().astype(output_dtype_dict)
def map_nominal(genotype_df, variant_df, phenotype_df, phenotype_pos_df, covariates_df, prefix, interaction_s=None, maf_threshold_interaction=0.05, group_s=None, window=1000000, run_eigenmt=False, output_dir='.', logger=None, verbose=True): """ cis-QTL mapping: nominal associations for all variant-phenotype pairs Association results for each chromosome are written to parquet files in the format <output_dir>/<prefix>.cis_qtl_pairs.<chr>.parquet """ assert np.all(phenotype_df.columns == covariates_df.index) device = torch.device("cuda" if torch.cuda.is_available() else "cpu") if logger is None: logger = SimpleLogger() if group_s is not None: group_dict = group_s.to_dict() logger.write( 'cis-QTL mapping: nominal associations for all variant-phenotype pairs' ) logger.write(' * {} samples'.format(phenotype_df.shape[1])) logger.write(' * {} phenotypes'.format(phenotype_df.shape[0])) logger.write(' * {} covariates'.format(covariates_df.shape[1])) logger.write(' * {} variants'.format(variant_df.shape[0])) if interaction_s is not None: assert np.all(interaction_s.index == covariates_df.index) logger.write(' * including interaction term') if maf_threshold_interaction > 0: logger.write(' * using {:.2f} MAF threshold'.format( maf_threshold_interaction)) covariates_t = torch.tensor(covariates_df.values, dtype=torch.float32).to(device) residualizer = Residualizer(covariates_t) del covariates_t genotype_ix = np.array( [genotype_df.columns.tolist().index(i) for i in phenotype_df.columns]) genotype_ix_t = torch.from_numpy(genotype_ix).to(device) if interaction_s is None: dof = phenotype_df.shape[1] - 2 - covariates_df.shape[1] else: dof = phenotype_df.shape[1] - 4 - covariates_df.shape[1] interaction_t = torch.tensor(interaction_s.values.reshape(1, -1), dtype=torch.float32).to(device) if maf_threshold_interaction > 0: interaction_mask_t = torch.BoolTensor( interaction_s >= interaction_s.median()).to(device) else: interaction_mask_t = None igc = genotypeio.InputGeneratorCis(genotype_df, variant_df, phenotype_df, phenotype_pos_df, window=window) # iterate over chromosomes best_assoc = [] start_time = time.time() prev_phenotype_id = None k = 0 logger.write(' * Computing associations') for chrom in igc.chrs: logger.write(' Mapping chromosome {}'.format(chrom)) chr_res_df = [] for k, (phenotype, genotypes, genotype_range, phenotype_id) in enumerate( igc.generate_data(chrom=chrom, verbose=verbose), k + 1): # copy genotypes to GPU phenotype_t = torch.tensor(phenotype, dtype=torch.float).to(device) genotypes_t = torch.tensor(genotypes, dtype=torch.float).to(device) genotypes_t = genotypes_t[:, genotype_ix_t] impute_mean(genotypes_t) if interaction_s is None: res = calculate_cis_nominal(genotypes_t, phenotype_t, residualizer) tstat, slope, slope_se, maf, ma_samples, ma_count = [ i.cpu().numpy() for i in res ] variant_ids = variant_df.index[ genotype_range[0]:genotype_range[-1] + 1] tss_distance = np.int32( variant_df['pos']. values[genotype_range[0]:genotype_range[-1] + 1] - igc.phenotype_tss[phenotype_id]) res_df = pd.DataFrame( OrderedDict([ ('phenotype_id', [phenotype_id] * len(variant_ids)), ('variant_id', variant_ids), ('tss_distance', tss_distance), ('maf', maf), ('ma_samples', ma_samples), ('ma_count', ma_count), ('pval_nominal', tstat ), #### replace with pval (currently on CPU, below) ('slope', slope), ('slope_se', slope_se), ])) else: genotypes_t, mask_t = filter_maf_interaction( genotypes_t, interaction_mask_t=interaction_mask_t, maf_threshold_interaction=maf_threshold_interaction) if genotypes_t.shape[0] > 0: res = calculate_interaction_nominal( genotypes_t, phenotype_t.unsqueeze(0), interaction_t, residualizer, return_sparse=False) if run_eigenmt: # compute eigenMT correction m_eff = eigenmt.compute_tests(genotypes_t, var_thresh=0.99, variant_window=200) tstat, b, b_se, maf, ma_samples, ma_count = [ i.cpu().numpy() for i in res ] mask = mask_t.cpu().numpy() r = igc.cis_ranges[phenotype_id] variant_ids = variant_df.index[r[0]:r[-1] + 1] tss_distance = np.int32( variant_df['pos'].values[r[0]:r[-1] + 1] - igc.phenotype_tss[phenotype_id]) variant_ids = variant_ids[mask] tss_distance = tss_distance[mask] nv = len(variant_ids) res_df = pd.DataFrame( OrderedDict([ ('phenotype_id', [phenotype_id] * nv), ('variant_id', variant_ids), ('tss_distance', tss_distance), ('maf', maf), ('ma_samples', ma_samples), ('ma_count', ma_count), ('pval_g', tstat[:, 0]), ('b_g', b[:, 0]), ('b_g_se', b_se[:, 0]), ('pval_i', tstat[:, 1]), ('b_i', b[:, 1]), ('b_i_se', b_se[:, 1]), ('pval_gi', tstat[:, 2]), ('b_gi', b[:, 2]), ('b_gi_se', b_se[:, 2]), ])) top_s = res_df.loc[res_df['pval_gi'].abs().idxmax()].copy() if run_eigenmt: top_s['tests_emt'] = m_eff best_assoc.append( top_s ) # top variant only (pval_gi is t-statistic here, hence max) else: # all genotypes in window were filtered out res_df = None if group_s is not None and group_dict[ phenotype_id] == group_dict.get(prev_phenotype_id): # store the strongest association within each group if interaction_s is None: ix = res_df['pval_nominal'] > chr_res_df[-1][ 'pval_nominal'] # compare t-statistics else: ix = res_df['pval_gi'] > chr_res_df[-1]['pval_gi'] chr_res_df[-1].loc[ix] = res_df.loc[ix] else: chr_res_df.append(res_df) prev_phenotype_id = phenotype_id logger.write(' time elapsed: {:.2f} min'.format( (time.time() - start_time) / 60)) # compute p-values and write current chromosome chr_res_df = pd.concat(chr_res_df, copy=False) if interaction_s is None: m = chr_res_df['pval_nominal'].notnull() chr_res_df.loc[m, 'pval_nominal'] = 2 * stats.t.cdf( -chr_res_df.loc[m, 'pval_nominal'].abs(), dof) else: m = chr_res_df['pval_gi'].notnull() chr_res_df.loc[m, 'pval_g'] = 2 * stats.t.cdf( -chr_res_df.loc[m, 'pval_g'].abs(), dof) chr_res_df.loc[m, 'pval_i'] = 2 * stats.t.cdf( -chr_res_df.loc[m, 'pval_i'].abs(), dof) chr_res_df.loc[m, 'pval_gi'] = 2 * stats.t.cdf( -chr_res_df.loc[m, 'pval_gi'].abs(), dof) print(' * writing output') chr_res_df.to_parquet( os.path.join(output_dir, '{}.cis_qtl_pairs.{}.parquet'.format(prefix, chrom))) if interaction_s is not None: best_assoc = pd.concat( best_assoc, axis=1, sort=False).T.set_index('phenotype_id').infer_objects() m = best_assoc['pval_g'].notnull() best_assoc.loc[m, 'pval_g'] = 2 * stats.t.cdf( -best_assoc.loc[m, 'pval_g'].abs(), dof) best_assoc.loc[m, 'pval_i'] = 2 * stats.t.cdf( -best_assoc.loc[m, 'pval_i'].abs(), dof) best_assoc.loc[m, 'pval_gi'] = 2 * stats.t.cdf( -best_assoc.loc[m, 'pval_gi'].abs(), dof) if run_eigenmt: best_assoc['pval_emt'] = np.minimum( best_assoc['tests_emt'] * best_assoc['pval_gi'], 1) best_assoc['pval_adj_bh'] = eigenmt.padjust_bh( best_assoc['pval_emt']) best_assoc.to_csv(os.path.join( output_dir, '{}.cis_qtl_top_assoc.txt.gz'.format(prefix)), sep='\t', float_format='%.6g') logger.write('done.')
def run_eigenmt(genotype_df, variant_df, phenotype_df, phenotype_pos_df, interaction_s=None, maf_threshold=0, var_thresh=0.99, variant_window=200, window=1000000, verbose=True, logger=None): """Standalone function for computing eigenMT correction. Returns the number of tests for each gene """ device = torch.device("cuda" if torch.cuda.is_available() else "cpu") if logger is None: logger = SimpleLogger() logger.write( 'eigenMT: estimating number of independent variants tested for each phenotype' ) logger.write('cis-QTL mapping: empirical p-values for phenotypes') logger.write(f' * {phenotype_df.shape[1]} samples') logger.write(f' * {phenotype_df.shape[0]} phenotypes') logger.write(f' * {genotype_df.shape[0]} variants') if interaction_s is not None and maf_threshold > 0: interaction_mask_t = torch.BoolTensor( interaction_s >= interaction_s.median()).to(device) else: interaction_mask_t = None genotype_ix = np.array( [genotype_df.columns.tolist().index(i) for i in phenotype_df.columns]) genotype_ix_t = torch.from_numpy(genotype_ix).to(device) igc = genotypeio.InputGeneratorCis(genotype_df, variant_df, phenotype_df, phenotype_pos_df, window=window) start_time = time.time() m_eff = OrderedDict() for k, (phenotype, genotypes, genotype_range, phenotype_id) in enumerate(igc.generate_data(verbose=verbose), 1): # copy genotypes to GPU genotypes_t = torch.tensor(genotypes, dtype=torch.float).to(device) genotypes_t = genotypes_t[:, genotype_ix_t] impute_mean(genotypes_t) if interaction_s is None: mask_t = calculate_maf(genotypes_t) >= maf_threshold genotypes_t = genotypes_t[mask_t] else: genotypes_t, mask_t = filter_maf_interaction( genotypes_t, interaction_mask_t=interaction_mask_t, maf_threshold_interaction=maf_threshold) m_eff[phenotype_id] = compute_tests(genotypes_t, var_thresh=var_thresh, variant_window=variant_window) logger.write(f' time elapsed: {(time.time()-start_time)/60:.2f} min') return pd.Series(m_eff)
def run_pairs(genotype_df, variant_df, phenotype1_df, phenotype2_df, phenotype_pos_df, covariates1_df=None, covariates2_df=None, p1=1e-4, p2=1e-4, p12=1e-5, mode='beta', maf_threshold=0, window=1000000, batch_size=10000, logger=None, verbose=True): """Compute COLOC for all phenotype pairs""" assert np.all(phenotype1_df.index == phenotype2_df.index) device = torch.device("cuda" if torch.cuda.is_available() else "cpu") if logger is None: logger = SimpleLogger() logger.write('Computing COLOC for all pairs of phenotypes') logger.write(f' * {phenotype1_df.shape[0]} phenotypes') logger.write(f' * phenotype group 1: {phenotype1_df.shape[1]} samples') logger.write(f' * phenotype group 2: {phenotype2_df.shape[1]} samples') if covariates1_df is not None: assert np.all(phenotype1_df.columns == covariates1_df.index) logger.write( f' * phenotype group 1: {covariates1_df.shape[1]} covariates') residualizer1 = Residualizer( torch.tensor(covariates1_df.values, dtype=torch.float32).to(device)) else: residualizer1 = None if covariates2_df is not None: assert np.all(phenotype2_df.columns == covariates2_df.index) logger.write( f' * phenotype group 2: {covariates2_df.shape[1]} covariates') residualizer2 = Residualizer( torch.tensor(covariates2_df.values, dtype=torch.float32).to(device)) else: residualizer2 = None if maf_threshold > 0: logger.write( f' * applying in-sample {maf_threshold} MAF filter (in at least one cohort)' ) genotype1_ix = np.array( [genotype_df.columns.tolist().index(i) for i in phenotype1_df.columns]) genotype1_ix_t = torch.from_numpy(genotype1_ix).to(device) genotype2_ix = np.array( [genotype_df.columns.tolist().index(i) for i in phenotype2_df.columns]) genotype2_ix_t = torch.from_numpy(genotype2_ix).to(device) igc = genotypeio.InputGeneratorCis(genotype_df, variant_df, phenotype1_df, phenotype_pos_df, window=window) coloc_df = [] start_time = time.time() logger.write(' * Computing pairwise colocalization') for phenotype1, genotypes, genotype_range, phenotype_id in igc.generate_data( verbose=verbose): phenotype2 = phenotype2_df.loc[phenotype_id] # copy to GPU phenotype1_t = torch.tensor(phenotype1, dtype=torch.float).to(device) phenotype2_t = torch.tensor(phenotype2, dtype=torch.float).to(device) genotypes_t = torch.tensor(genotypes, dtype=torch.float).to(device) genotypes1_t = genotypes_t[:, genotype1_ix_t] genotypes2_t = genotypes_t[:, genotype2_ix_t] del genotypes_t # filter monomorphic sites m = ((genotypes1_t == 0).all(1) | (genotypes1_t == 1).all(1) | (genotypes1_t == 2).all(1) | (genotypes2_t == 0).all(1) | (genotypes2_t == 1).all(1) | (genotypes2_t == 2).all(1)) genotypes1_t = genotypes1_t[~m] genotypes2_t = genotypes2_t[~m] impute_mean(genotypes1_t) impute_mean(genotypes2_t) if maf_threshold > 0: maf1_t = calculate_maf(genotypes1_t) maf2_t = calculate_maf(genotypes2_t) mask_t = (maf1_t >= maf_threshold) | (maf2_t >= maf_threshold) genotypes1_t = genotypes1_t[mask_t] genotypes2_t = genotypes2_t[mask_t] coloc_t = coloc(genotypes1_t, genotypes2_t, phenotype1_t, phenotype2_t, residualizer1=residualizer1, residualizer2=residualizer2, p1=p1, p2=p2, p12=p12, mode=mode) coloc_df.append(coloc_t.cpu().numpy()) logger.write(' time elapsed: {:.2f} min'.format( (time.time() - start_time) / 60)) coloc_df = pd.DataFrame(coloc_df, columns=[f'pp_h{i}_abf' for i in range(5)], index=phenotype1_df.index) logger.write('done.') return coloc_df
def map_cis(genotype_df, variant_df, phenotype_df, phenotype_pos_df, covariates_df, mapper, prefix, window=1000000, output_dir='.', logger=None, verbose=True, interaction=False, kwargs={}, kwargs_interaction={}, num_of_permutation=None, permutation_chunk_size=10): ''' Wrapper for cis-QTL mapping. The QTL caller is `mapper` which should have * mapper.init(phenotype, covariate) * mapper.map(genotype) If interaction: * mapper.map_one_multi_x(X) with X being generated from kwargs_interaction['transform_fun'](kwargs['design_matrix'] @ genotype, **kwargs_interaction['transform_fun_args']) implemented. mapper.map_one should return 'bhat', 'pval' in a dictionary. ''' assert np.all(phenotype_df.columns==covariates_df.index) device = torch.device("cuda" if torch.cuda.is_available() else "cpu") if logger is None: logger = SimpleLogger() logger.write('cis-QTL mapping') logger.write(' * {} samples'.format(phenotype_df.shape[1])) logger.write(' * {} phenotypes'.format(phenotype_df.shape[0])) logger.write(' * {} covariates'.format(covariates_df.shape[1])) logger.write(' * {} variants'.format(variant_df.shape[0])) covariates_t = torch.tensor(covariates_df.values, dtype=torch.float32).to(device) phenotypes_t = torch.tensor(phenotype_df.values, dtype=torch.float32).to(device) # FIXME: this is not ideal since we may initialize for some phenotypes that does not have cis genotype. # So, for now, as it is not taken care of inside the caller, # we need to make sure that these phenotypes are not part of the input. ## mapper call mapper.init(phenotypes_t.T, covariates_t, **kwargs) phenotype_names = phenotype_df.index.to_list() igc = genotypeio.InputGeneratorCis(genotype_df, variant_df, phenotype_df, phenotype_pos_df, group_s=None, window=window) # iterate over chromosomes best_assoc = [] start_time = time.time() k = 0 logger.write(' * Computing associations') for chrom in igc.chrs: logger.write(' Mapping chromosome {}'.format(chrom)) # allocate arrays n = 0 for i in igc.phenotype_pos_df[igc.phenotype_pos_df['chr']==chrom].index: j = igc.cis_ranges[i] n += j[1] - j[0] + 1 chr_res = OrderedDict() chr_res['phenotype_id'] = [] chr_res['variant_id'] = [] chr_res['tss_distance'] = np.empty(n, dtype=np.int32) chr_res['pval'] = np.empty(n, dtype=np.float64) chr_res['b'] = np.empty(n, dtype=np.float32) if num_of_permutation is not None: chr_res['pval_permutation'] = np.empty(n, dtype=np.float64) start = 0 for k, (phenotype, genotypes, genotype_range, phenotype_id) in enumerate(igc.generate_data(chrom=chrom, verbose=verbose), k+1): # copy genotypes to GPU genotypes_t = torch.tensor(genotypes, dtype=torch.float).to(device) impute_mean(genotypes_t) variant_ids = variant_df.index[genotype_range[0]:genotype_range[-1]+1] n = len(variant_ids) if n <= 0: continue tss_distance = np.int32(variant_df['pos'].values[genotype_range[0]:genotype_range[-1]+1] - igc.phenotype_tss[phenotype_id]) phenotype_idx = name_to_index(phenotype_names, phenotype_id) ## mapper call if interaction is False: res_i = mapper.map_one(genotypes_t.T, phenotype_idx) elif interaction is True: X = kwargs_interaction['transform_fun'](torch.Tensor(kwargs['design_matrix']) @ genotypes_t.T, **kwargs_interaction['transform_fun_args']) res_i = mapper.map_one_multi_x(X, phenotype_idx) ## take care of permutation if num_of_permutation is not None: list_pval_perm = [] permutor = Permutor(num_of_permutation, chunk_size=permutation_chunk_size) if interaction is False: for x, nchunk in permutor.gen_permuted_columns(genotypes_t.T): _, pval_perm = mapper.map_one(x, phenotype_idx) list_pval_perm.append(permutor.rearrange(torch.Tensor(pval_perm), nchunk)) elif interaction is True: traveler = permutor.gen_permuted_columns_interaction( torch.Tensor(kwargs['design_matrix']) @ genotypes_t.T, kwargs_interaction['permutation']['transform_fun'], kwargs_interaction['permutation']['transform_fun_args'] ) for x, nchunk in traveler: _, pval_perm = mapper.map_one_multi_x(x, phenotype_idx) list_pval_perm.append(permutor.rearrange(torch.Tensor(pval_perm), nchunk)) else: raise ValueError(f'The args interaction can only be True or False. Wrong interaction = {interaction}.') pval_from_permutation = permutor.add_permutation_pval(torch.Tensor(res_i[1]), torch.cat(list_pval_perm, axis=0)) chr_res['phenotype_id'].extend([phenotype_id]*n) chr_res['variant_id'].extend(variant_ids) chr_res['tss_distance'][start:start+n] = tss_distance chr_res['pval'][start:start+n] = res_i[1] chr_res['b'][start:start+n] = res_i[0] if num_of_permutation is not None: chr_res['pval_permutation'][start:start+n] = pval_from_permutation start += n # update pointer logger.write(' time elapsed: {:.2f} min'.format((time.time()-start_time)/60)) # prepare output if start < len(chr_res['tss_distance']): for x in chr_res: chr_res[x] = chr_res[x][:start] chr_res_df = pd.DataFrame(chr_res) print(' * writing output') chr_res_df.to_parquet(os.path.join(output_dir, '{}.cis_qtl_pairs.{}.parquet'.format(prefix, chrom))) logger.write('done.')
def map(genotype_df, variant_df, phenotype_df, phenotype_pos_df, covariates_df, L=10, scaled_prior_variance=0.2, estimate_residual_variance=True, estimate_prior_variance=True, tol=1e-3, coverage=0.95, min_abs_corr=0.5, summary_only=True, maf_threshold=0, max_iter=100, window=1000000, logger=None, verbose=True, warn_monomorphic=False): """ SuSiE fine-mapping: computes SuSiE model for all phenotypes """ assert np.all(phenotype_df.columns == covariates_df.index) device = torch.device("cuda" if torch.cuda.is_available() else "cpu") if logger is None: logger = SimpleLogger() logger.write('SuSiE fine-mapping') logger.write(f' * {phenotype_df.shape[1]} samples') logger.write(f' * {phenotype_df.shape[0]} phenotypes') logger.write(f' * {covariates_df.shape[1]} covariates') logger.write(f' * {variant_df.shape[0]} variants') if maf_threshold > 0: logger.write(f' * applying in-sample MAF >= {maf_threshold} filter') residualizer = Residualizer( torch.tensor(covariates_df.values, dtype=torch.float32).to(device)) genotype_ix = np.array( [genotype_df.columns.tolist().index(i) for i in phenotype_df.columns]) genotype_ix_t = torch.from_numpy(genotype_ix).to(device) igc = genotypeio.InputGeneratorCis(genotype_df, variant_df, phenotype_df, phenotype_pos_df, window=window) if igc.n_phenotypes == 0: raise ValueError('No valid phenotypes found.') start_time = time.time() logger.write(' * fine-mapping') copy_keys = ['pip', 'sets', 'converged', 'niter'] if not summary_only: susie_res = {} else: susie_res = [] for k, (phenotype, genotypes, genotype_range, phenotype_id) in enumerate(igc.generate_data(verbose=verbose), 1): # copy genotypes to GPU genotypes_t = torch.tensor(genotypes, dtype=torch.float).to(device) genotypes_t = genotypes_t[:, genotype_ix_t] impute_mean(genotypes_t) variant_ids = variant_df.index[genotype_range[0]:genotype_range[-1] + 1] # filter monomorphic variants mask_t = ~(genotypes_t == genotypes_t[:, [0]]).all(1) if warn_monomorphic: logger.write( f' * WARNING: excluding {~mask_t.sum()} monomorphic variants' ) if maf_threshold > 0: maf_t = calculate_maf(genotypes_t) mask_t &= maf_t >= maf_threshold if mask_t.any(): genotypes_t = genotypes_t[mask_t] mask = mask_t.cpu().numpy().astype(bool) variant_ids = variant_ids[mask] genotype_range = genotype_range[mask] if genotypes_t.shape[0] == 0: logger.write( f'WARNING: skipping {phenotype_id} (no valid variants)') continue phenotype_t = torch.tensor(phenotype, dtype=torch.float).to(device) genotypes_res_t = residualizer.transform( genotypes_t) # variants x samples phenotype_res_t = residualizer.transform(phenotype_t.reshape( 1, -1)) # phenotypes x samples res = susie(genotypes_res_t.T, phenotype_res_t.T, L=L, scaled_prior_variance=scaled_prior_variance, coverage=coverage, min_abs_corr=min_abs_corr, estimate_residual_variance=estimate_residual_variance, estimate_prior_variance=estimate_prior_variance, tol=tol, max_iter=max_iter) af_t = genotypes_t.sum(1) / (2 * genotypes_t.shape[1]) res['pip'] = pd.DataFrame({ 'pip': res['pip'], 'af': af_t.cpu().numpy() }, index=variant_ids) if not summary_only: susie_res[phenotype_id] = {k: res[k] for k in copy_keys} else: if res['sets']['cs'] is not None: # assert res['converged'] == True if res['converged'] == True: for c in sorted(res['sets']['cs'], key=lambda x: int(x.replace('L', ''))): cs = res['sets']['cs'][c] # indexes p = res['pip'].iloc[cs].copy().reset_index() p['cs_id'] = c.replace('L', '') p.insert(0, 'phenotype_id', phenotype_id) susie_res.append(p) else: print(f' * phenotype ID: {phenotype_id}') logger.write(f' Time elapsed: {(time.time()-start_time)/60:.2f} min') logger.write('done.') if summary_only and susie_res: susie_res = pd.concat(susie_res, axis=0).rename(columns={ 'snp': 'variant_id' }).reset_index(drop=True) return susie_res
def map(genotype_df, variant_df, phenotype_df, phenotype_pos_df, covariates_df, L=10, scaled_prior_variance=0.2, estimate_residual_variance=True, estimate_prior_variance=True, tol=1e-3, coverage=0.95, min_abs_corr=0.5, max_iter=100, window=1000000, logger=None, verbose=True): """ SuSiE fine-mapping: computes SuSiE model for all phenotypes """ assert np.all(phenotype_df.columns == covariates_df.index) device = torch.device("cuda" if torch.cuda.is_available() else "cpu") if logger is None: logger = SimpleLogger() logger.write('SuSiE fine-mapping') logger.write(f' * {phenotype_df.shape[1]} samples') logger.write(f' * {phenotype_df.shape[0]} phenotypes') logger.write(f' * {covariates_df.shape[1]} covariates') logger.write(f' * {variant_df.shape[0]} variants') residualizer = Residualizer( torch.tensor(covariates_df.values, dtype=torch.float32).to(device)) genotype_ix = np.array( [genotype_df.columns.tolist().index(i) for i in phenotype_df.columns]) genotype_ix_t = torch.from_numpy(genotype_ix).to(device) igc = genotypeio.InputGeneratorCis(genotype_df, variant_df, phenotype_df, phenotype_pos_df, window=window) if igc.n_phenotypes == 0: raise ValueError('No valid phenotypes found.') start_time = time.time() logger.write(' * fine-mapping') copy_keys = ['pip', 'sets', 'converged', 'niter'] susie_res = {} for k, (phenotype, genotypes, genotype_range, phenotype_id) in enumerate(igc.generate_data(verbose=verbose), 1): # copy genotypes to GPU genotypes_t = torch.tensor(genotypes, dtype=torch.float).to(device) genotypes_t = genotypes_t[:, genotype_ix_t] # filter monomorphic variants # genotypes_t = genotypes_t[~(genotypes_t == genotypes_t[:, [0]]).all(1)] # if genotypes_t.shape[0] == 0: # logger.write(f'WARNING: skipping {phenotype_id} (no valid variants)') # continue impute_mean(genotypes_t) phenotype_t = torch.tensor(phenotype, dtype=torch.float).to(device) genotypes_res_t = residualizer.transform( genotypes_t) # variants x samples phenotype_res_t = residualizer.transform(phenotype_t.reshape( 1, -1)) # phenotypes x samples res = susie(genotypes_res_t.T, phenotype_res_t.T, L=L, scaled_prior_variance=scaled_prior_variance, coverage=coverage, min_abs_corr=min_abs_corr, estimate_residual_variance=estimate_residual_variance, estimate_prior_variance=estimate_prior_variance, tol=tol, max_iter=max_iter) af_t = genotypes_t.sum(1) / (2 * genotypes_t.shape[1]) variant_ids = variant_df.index[genotype_range[0]:genotype_range[-1] + 1] res['pip'] = pd.DataFrame({ 'pip': res['pip'], 'af': af_t.cpu().numpy() }, index=variant_ids) susie_res[phenotype_id] = {k: res[k] for k in copy_keys} logger.write(f' Time elapsed: {(time.time()-start_time)/60:.2f} min') logger.write('done.') return susie_res
def map_cis(genotype_df, variant_df, phenotype_df, phenotype_pos_df, covariates_df=None, group_s=None, maf_threshold=0, beta_approx=True, nperm=10000, window=1000000, random_tiebreak=False, logger=None, seed=None, verbose=True, warn_monomorphic=True): """Run cis-QTL mapping""" device = torch.device("cuda" if torch.cuda.is_available() else "cpu") if logger is None: logger = SimpleLogger() logger.write('cis-QTL mapping: empirical p-values for phenotypes') logger.write(f' * {phenotype_df.shape[1]} samples') logger.write(f' * {phenotype_df.shape[0]} phenotypes') if group_s is not None: logger.write(f' * {len(group_s.unique())} phenotype groups') group_dict = group_s.to_dict() if covariates_df is not None: assert np.all(phenotype_df.columns==covariates_df.index) logger.write(f' * {covariates_df.shape[1]} covariates') residualizer = Residualizer(torch.tensor(covariates_df.values, dtype=torch.float32).to(device)) dof = phenotype_df.shape[1] - 2 - covariates_df.shape[1] else: residualizer = None dof = phenotype_df.shape[1] - 2 logger.write(f' * {genotype_df.shape[0]} variants') if maf_threshold > 0: logger.write(f' * applying in-sample {maf_threshold} MAF filter') if random_tiebreak: logger.write(f' * randomly selecting top variant in case of ties') genotype_ix = np.array([genotype_df.columns.tolist().index(i) for i in phenotype_df.columns]) genotype_ix_t = torch.from_numpy(genotype_ix).to(device) # permutation indices n_samples = phenotype_df.shape[1] ix = np.arange(n_samples) if seed is not None: logger.write(f' * using seed {seed}') np.random.seed(seed) permutation_ix_t = torch.LongTensor(np.array([np.random.permutation(ix) for i in range(nperm)])).to(device) res_df = [] igc = genotypeio.InputGeneratorCis(genotype_df, variant_df, phenotype_df, phenotype_pos_df, group_s=group_s, window=window) if igc.n_phenotypes == 0: raise ValueError('No valid phenotypes found.') start_time = time.time() logger.write(' * computing permutations') if group_s is None: for k, (phenotype, genotypes, genotype_range, phenotype_id) in enumerate(igc.generate_data(verbose=verbose), 1): # copy genotypes to GPU genotypes_t = torch.tensor(genotypes, dtype=torch.float).to(device) genotypes_t = genotypes_t[:,genotype_ix_t] impute_mean(genotypes_t) if maf_threshold > 0: maf_t = calculate_maf(genotypes_t) mask_t = maf_t >= maf_threshold genotypes_t = genotypes_t[mask_t] mask = mask_t.cpu().numpy().astype(bool) genotype_range = genotype_range[mask] # filter monomorphic variants mono_t = (genotypes_t == genotypes_t[:, [0]]).all(1) if mono_t.any(): genotypes_t = genotypes_t[~mono_t] genotype_range = genotype_range[~mono_t.cpu()] if warn_monomorphic: logger.write(f' * WARNING: excluding {mono_t.sum()} monomorphic variants') if genotypes_t.shape[0] == 0: logger.write(f'WARNING: skipping {phenotype_id} (no valid variants)') continue phenotype_t = torch.tensor(phenotype, dtype=torch.float).to(device) res = calculate_cis_permutations(genotypes_t, phenotype_t, permutation_ix_t, residualizer=residualizer, random_tiebreak=random_tiebreak) r_nominal, std_ratio, var_ix, r2_perm, g = [i.cpu().numpy() for i in res] var_ix = genotype_range[var_ix] variant_id = variant_df.index[var_ix] tss_distance = variant_df['pos'].values[var_ix] - igc.phenotype_tss[phenotype_id] res_s = prepare_cis_output(r_nominal, r2_perm, std_ratio, g, genotypes_t.shape[0], dof, variant_id, tss_distance, phenotype_id, nperm=nperm) if beta_approx: res_s[['pval_beta', 'beta_shape1', 'beta_shape2', 'true_df', 'pval_true_df']] = calculate_beta_approx_pval(r2_perm, r_nominal*r_nominal, dof) res_df.append(res_s) else: # grouped mode for k, (phenotypes, genotypes, genotype_range, phenotype_ids, group_id) in enumerate(igc.generate_data(verbose=verbose), 1): # copy genotypes to GPU genotypes_t = torch.tensor(genotypes, dtype=torch.float).to(device) genotypes_t = genotypes_t[:,genotype_ix_t] impute_mean(genotypes_t) if maf_threshold > 0: maf_t = calculate_maf(genotypes_t) mask_t = maf_t >= maf_threshold genotypes_t = genotypes_t[mask_t] mask = mask_t.cpu().numpy().astype(bool) genotype_range = genotype_range[mask] # filter monomorphic variants mono_t = (genotypes_t == genotypes_t[:, [0]]).all(1) if mono_t.any(): genotypes_t = genotypes_t[~mono_t] genotype_range = genotype_range[~mono_t.cpu()] if warn_monomorphic: logger.write(f' * WARNING: excluding {mono_t.sum()} monomorphic variants') if genotypes_t.shape[0] == 0: logger.write(f'WARNING: skipping {phenotype_id} (no valid variants)') continue # iterate over phenotypes buf = [] for phenotype, phenotype_id in zip(phenotypes, phenotype_ids): phenotype_t = torch.tensor(phenotype, dtype=torch.float).to(device) res = calculate_cis_permutations(genotypes_t, phenotype_t, permutation_ix_t, residualizer=residualizer, random_tiebreak=random_tiebreak) res = [i.cpu().numpy() for i in res] # r_nominal, std_ratio, var_ix, r2_perm, g res[2] = genotype_range[res[2]] buf.append(res + [genotypes_t.shape[0], phenotype_id]) res_s = _process_group_permutations(buf, variant_df, igc.phenotype_tss[phenotype_ids[0]], dof, group_id, nperm=nperm, beta_approx=beta_approx) res_df.append(res_s) res_df = pd.concat(res_df, axis=1, sort=False).T res_df.index.name = 'phenotype_id' logger.write(f' Time elapsed: {(time.time()-start_time)/60:.2f} min') logger.write('done.') return res_df.astype(output_dtype_dict).infer_objects()