def calculate_afc(gene_ids, variant_ids, counts_df, genotype_df, covariates_df=None,
                  select_covariates=True, imputation='offset', count_threshold=0):
    """
    Compute the allelic fold-change (aFC) of variant-gene pairs.

    The k-th entry of `variant_ids` is tested against the k-th entry of
    `gene_ids`. aFC [1] is estimated with the total read count (trc) model
    from mixQTL [2].

      gene_ids:       gene IDs, looked up as rows of counts_df
      variant_ids:    variant IDs, looked up as rows of genotype_df
                      (must have the same length as gene_ids)
      counts_df:      read counts scaled with DESeq size factors. Zeros are
                      imputed using log(counts + 1) (imputation='offset';
                      default) or with half-minimum (imputation='half_min').
      genotype_df:    genotype dosages; its columns are reordered to follow
                      the columns of counts_df
      covariates_df:  covariates (genotype PCs, PEER factors, etc.), optional

    select_covariates, imputation and count_threshold are forwarded unchanged
    to mixqtl.trc.

    Returns a DataFrame with columns 'gene_id', 'variant_id', 'afc', 'afc_se',
    where the mixqtl.trc estimate and standard error have been multiplied by
    log2(e).

      [1] Mohammadi et al., 2017 (genome.cshlp.org/content/27/11/1872)
      [2] Liang et al., 2021 (10.1038/s41467-021-21592-8)
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    assert len(variant_ids) == len(gene_ids)

    # one row of dosages per requested pair (pre-filter, then order by variant_ids)
    requested = genotype_df[genotype_df.index.isin(variant_ids)]
    genotypes_t = torch.tensor(requested.loc[variant_ids].values, dtype=torch.float32).to(device)

    # bring the genotype samples into the column order of counts_df
    genotype_samples = genotype_df.columns.tolist()
    sample_ix = np.array([genotype_samples.index(s) for s in counts_df.columns])
    genotypes_t = genotypes_t[:, torch.from_numpy(sample_ix).to(device)]
    # return value unused; impute_mean presumably fills missing dosages in place
    impute_mean(genotypes_t)

    counts_t = torch.tensor(counts_df.loc[gene_ids].values, dtype=torch.float32).to(device)

    covariates_t = None
    if covariates_df is not None:
        covariates_t = torch.tensor(covariates_df.values, dtype=torch.float32).to(device)

    # (estimate, standard error) for every pair, in input order
    estimates = []
    for k in range(len(variant_ids)):
        # progress is refreshed every 10 pairs and on the last pair
        if (k+1) % 10 == 0 or k+1 == len(variant_ids):
            print(f"\rCalculating aFC for variant-gene pair {k+1}/{len(variant_ids)}", end='', flush=True)
        _, b, b_se = mixqtl.trc(
            genotypes_t[[k]],
            counts_t[k],
            covariates_t=covariates_t,
            select_covariates=select_covariates,
            count_threshold=count_threshold,
            imputation=imputation,
            return_af=False,
        )
        estimates.append((float(b.cpu()), float(b_se.cpu())))
    print()

    log2_e = np.log2(np.e)
    afc = np.array([est for est, _ in estimates]) * log2_e
    afc_se = np.array([se for _, se in estimates]) * log2_e

    return pd.DataFrame({
        'gene_id': gene_ids,
        'variant_id': variant_ids,
        'afc': afc,
        'afc_se': afc_se,
    })
def calculate_replication(res_df, genotype_df, phenotype_df, covariates_df,
                          interaction_s=None, lambda_qvalue=None):
    """
    Re-test variant-phenotype pairs on the given data and estimate pi1.

      res_df:        DataFrame with 'variant_id' column and phenotype IDs as index;
                     each row is one pair to test
      genotype_df:   genotype dosages, indexed by variant ID; its columns are
                     reordered to follow the columns of phenotype_df
      phenotype_df:  phenotypes, indexed by phenotype ID
      covariates_df: covariates; passed to Residualizer
      interaction_s: optional interaction term with a `.values` array holding
                     one value per sample (assumed to be ordered like the
                     columns of phenotype_df -- verify against caller)
      lambda_qvalue: forwarded to rfunc.pi0est

    Without `interaction_s`, each pair is tested with a correlation-based
    t-statistic after residualizing both genotype and phenotype. With
    `interaction_s`, the model phenotype ~ genotype + interaction +
    genotype*interaction is fit per pair.

    Returns (pi1, rep_df):
      pi1:    1 - pi0 as estimated by rfunc.pi0est from the nominal p-values
              (from the genotype x interaction p-values in interaction mode);
              NaN if the estimation raises.
      rep_df: per-pair results. Columns are 'phenotype_id', 'variant_id',
              'ma_samples', 'ma_count', 'maf', followed by 'pval_nominal',
              'slope', 'slope_se', or in interaction mode by
              'pval_g', 'b_g', 'b_g_se', 'pval_i', 'b_i', 'b_i_se',
              'pval_gi', 'b_gi', 'b_gi_se'.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    genotypes_t = torch.tensor(genotype_df.loc[res_df['variant_id']].values, dtype=torch.float).to(device)
    # build the column list once instead of once per sample
    genotype_samples = genotype_df.columns.tolist()
    genotype_ix = np.array([genotype_samples.index(i) for i in phenotype_df.columns])
    genotype_ix_t = torch.from_numpy(genotype_ix).to(device)
    genotypes_t = genotypes_t[:, genotype_ix_t]
    # return value unused; impute_mean presumably fills missing dosages in place
    impute_mean(genotypes_t)

    phenotypes_t = torch.tensor(phenotype_df.loc[res_df.index].values, dtype=torch.float32).to(device)
    residualizer = Residualizer(torch.tensor(covariates_df.values, dtype=torch.float32).to(device))

    # calculate MAF
    n2 = 2 * genotypes_t.shape[1]
    af_t = genotypes_t.sum(1) / n2
    ix_t = af_t <= 0.5
    maf_t = torch.where(ix_t, af_t, 1 - af_t)
    # calculate MA samples and counts
    m = genotypes_t > 0.5
    a = m.sum(1).int()
    b = (genotypes_t < 1.5).sum(1).int()
    ma_samples_t = torch.where(ix_t, a, b)
    a = (genotypes_t * m.float()).sum(1).int()
    ma_count_t = torch.where(ix_t, a, n2 - a)

    if interaction_s is None:
        genotype_res_t = residualizer.transform(genotypes_t)  # variants x samples
        phenotype_res_t = residualizer.transform(phenotypes_t)  # phenotypes x samples

        # despite the names these are variances; only their ratio is used
        gstd = genotype_res_t.var(1)
        pstd = phenotype_res_t.var(1)
        std_ratio_t = torch.sqrt(pstd / gstd)

        # center and normalize
        genotype_res_t = center_normalize(genotype_res_t, dim=1)
        phenotype_res_t = center_normalize(phenotype_res_t, dim=1)

        # row-wise product: row k of genotypes is paired with row k of phenotypes
        r_nominal_t = (genotype_res_t * phenotype_res_t).sum(1)
        r2_nominal_t = r_nominal_t.double().pow(2)

        dof = residualizer.dof
        tstat_t = torch.sqrt((dof * r2_nominal_t) / (1 - r2_nominal_t))
        slope_t = r_nominal_t * std_ratio_t
        slope_se_t = (slope_t.abs().double() / tstat_t).float()
        pval = 2 * stats.t.cdf(-np.abs(tstat_t.cpu()), dof)

        rep_df = pd.DataFrame(
            np.c_[res_df.index, res_df['variant_id'], ma_samples_t.cpu(), ma_count_t.cpu(),
                  maf_t.cpu(), pval, slope_t.cpu(), slope_se_t.cpu()],
            columns=['phenotype_id', 'variant_id', 'ma_samples', 'ma_count', 'maf',
                     'pval_nominal', 'slope', 'slope_se']
        ).infer_objects()

    else:
        interaction_t = torch.tensor(interaction_s.values.reshape(1, -1), dtype=torch.float32).to(device)
        ng, ns = genotypes_t.shape
        nps = phenotypes_t.shape[0]

        # centered inputs
        g0_t = genotypes_t - genotypes_t.mean(1, keepdim=True)
        gi_t = genotypes_t * interaction_t
        gi0_t = gi_t - gi_t.mean(1, keepdim=True)
        i0_t = interaction_t - interaction_t.mean()
        p0_t = phenotypes_t - phenotypes_t.mean(1, keepdim=True)

        # residualize rows
        g0_t = residualizer.transform(g0_t, center=False)
        gi0_t = residualizer.transform(gi0_t, center=False)
        p0_t = residualizer.transform(p0_t, center=False)  # np x ns
        i0_t = residualizer.transform(i0_t, center=False)
        i0_t = i0_t.repeat(ng, 1)

        # regression (in float; loss of precision may occur in edge cases)
        X_t = torch.stack([g0_t, i0_t, gi0_t], 2)  # ng x ns x 3
        Xinv = torch.matmul(torch.transpose(X_t, 1, 2), X_t).inverse()  # ng x 3 x 3
        b_t = (torch.matmul(Xinv, torch.transpose(X_t, 1, 2)) * p0_t.unsqueeze(1)).sum(2)  # ng x 3
        r_t = (X_t * b_t.unsqueeze(1)).sum(2) - p0_t
        # two extra regressors (interaction and genotype x interaction)
        dof = residualizer.dof - 2
        rss_t = (r_t * r_t).sum(1)  # ng x np
        # the boolean identity mask selects the diagonal of each 3 x 3 inverse
        b_se_t = torch.sqrt(Xinv[:, torch.eye(3, dtype=torch.uint8).bool()] * rss_t.unsqueeze(-1) / dof)
        tstat_t = (b_t.double() / b_se_t.double()).float()
        pval = 2 * stats.t.cdf(-np.abs(tstat_t.cpu()), dof)
        b = b_t.cpu()
        b_se = b_se_t.cpu()

        rep_df = pd.DataFrame(
            np.c_[res_df.index, res_df['variant_id'], ma_samples_t.cpu(), ma_count_t.cpu(), maf_t.cpu(),
                  pval[:, 0], b[:, 0], b_se[:, 0],
                  pval[:, 1], b[:, 1], b_se[:, 1],
                  pval[:, 2], b[:, 2], b_se[:, 2]],
            columns=['phenotype_id', 'variant_id', 'ma_samples', 'ma_count', 'maf',
                     'pval_g', 'b_g', 'b_g_se', 'pval_i', 'b_i', 'b_i_se',
                     'pval_gi', 'b_gi', 'b_gi_se']
        ).infer_objects()
        # pi1 is estimated from the genotype x interaction term
        pval = pval[:, 2]

    # pi1 is best-effort: a failing pi0 estimate yields NaN rather than
    # aborting, but interpreter-level interrupts are no longer swallowed.
    try:
        pi1 = 1 - rfunc.pi0est(pval, lambda_qvalue=lambda_qvalue)[0]
    except Exception:
        pi1 = np.nan  # np.NaN was removed in NumPy 2.0
    return pi1, rep_df
def map_trans(genotype_df, phenotype_df, covariates_df, mapper, pval_threshold=1e-5,
              maf_threshold=0.05, batch_size=20000, logger=None, verbose=True, kwargs=None):
    '''
    Wrapper for trans-QTL mapping.

    The QTL caller is `mapper`, which should implement
        * mapper.init(phenotype, covariate, **kwargs)
        * mapper.map(genotype)
    mapper.init is called once with the (samples x phenotypes) phenotype
    tensor and the covariate tensor; mapper.map is called once per genotype
    batch with a (samples x variants) tensor. The result of mapper.map is
    indexed positionally: element 0 is stored as 'bhat' and element 1 as
    'pval'. Both are presumably flattened variant-major (all phenotypes of the
    first variant, then the second, ...) so that they line up with the ID
    columns built here -- confirm against the mapper implementation.

    Parameters
        genotype_df:    genotype dosages, variants x samples. Columns are NOT
                        reordered here; they are assumed to already follow
                        the sample order of phenotype_df.columns.
        phenotype_df:   phenotypes x samples
        covariates_df:  samples x covariates; its index must equal
                        phenotype_df.columns
        pval_threshold: currently unused; every tested pair is returned
        maf_threshold:  passed to filter_maf for each batch
        batch_size:     number of variants per batch
        logger:         object with a write() method; a SimpleLogger is
                        created if None
        verbose:        forwarded to SimpleLogger and to the genotype generator
        kwargs:         extra keyword arguments for mapper.init (None means none)

    Returns a DataFrame with columns
    'variant_id', 'phenotype_id', 'bhat', 'pval', 'maf'.
    '''
    columns = ['variant_id', 'phenotype_id', 'bhat', 'pval', 'maf']
    kwargs = {} if kwargs is None else kwargs

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    if logger is None:
        logger = SimpleLogger(verbose=verbose)
    assert np.all(phenotype_df.columns==covariates_df.index)

    n_phenotypes = phenotype_df.index.shape[0]

    logger.write('trans-QTL mapping')
    logger.write(' * {} samples'.format(phenotype_df.shape[1]))
    logger.write(' * {} phenotypes'.format(phenotype_df.shape[0]))
    logger.write(' * {} covariates'.format(covariates_df.shape[1]))
    logger.write(' * {} variants'.format(len(genotype_df.index)))

    phenotypes_t = torch.tensor(phenotype_df.values, dtype=torch.float32).to(device)
    covariates_t = torch.tensor(covariates_df.values, dtype=torch.float32).to(device)

    ## mapper call
    mapper.init(phenotypes_t.T, covariates_t, **kwargs)

    ggt = genotypeio.GenotypeGeneratorTrans(genotype_df, batch_size=batch_size)
    start_time = time.time()
    res = []
    n_variants = 0  # number of variants that pass the MAF filter
    for genotypes, variant_ids in ggt.generate_data(verbose=verbose):
        # copy genotypes to GPU
        genotypes_t = torch.tensor(genotypes, dtype=torch.float).to(device)

        # filter by MAF
        impute_mean(genotypes_t)
        genotypes_t, variant_ids, maf_t = filter_maf(genotypes_t, variant_ids, maf_threshold)
        n_variants += genotypes_t.shape[0]

        ## mapper call
        res_i = mapper.map(genotypes_t.T)

        del genotypes_t

        # one row per (variant, phenotype) pair, variant-major
        res_i = np.c_[
            np.repeat(variant_ids, n_phenotypes),
            np.tile(phenotype_df.index, variant_ids.shape[0]),
            res_i[0],
            res_i[1],
            np.repeat(maf_t.cpu(), n_phenotypes)
        ]
        res.append(res_i)
    logger.write(' elapsed time: {:.2f} min'.format((time.time()-start_time)/60))
    del phenotypes_t

    # post-processing: concatenate batches
    if res:
        pval_df = pd.DataFrame(np.concatenate(res), columns=columns)
    else:
        # no batch was produced: np.concatenate([]) would raise
        pval_df = pd.DataFrame(columns=columns)

    if maf_threshold > 0:
        logger.write(' * {} variants passed MAF >= {:.2f} filtering'.format(n_variants, maf_threshold))
    logger.write('done.')
    return pval_df
def map_cis(genotype_df, variant_df, phenotype_df, phenotype_pos_df, covariates_df, mapper, prefix,
            window=1000000, output_dir='.', logger=None, verbose=True, interaction=False,
            kwargs=None, kwargs_interaction=None, num_of_permutation=None,
            permutation_chunk_size=10):
    '''
    Wrapper for cis-QTL mapping.

    The QTL caller is `mapper`, which should implement
        * mapper.init(phenotype, covariate, **kwargs)
        * mapper.map_one(genotype, phenotype_idx)
    and, if interaction is True,
        * mapper.map_one_multi_x(X, phenotype_idx)
    with X being generated from
        kwargs_interaction['transform_fun'](
            kwargs['design_matrix'] @ genotype,
            **kwargs_interaction['transform_fun_args'])
    Both map functions are expected to return a 2-element sequence: element 0
    is stored as 'b' and element 1 as 'pval'.

    If num_of_permutation is not None, a Permutor generates permuted inputs
    for every phenotype (for interaction mode using
    kwargs_interaction['permutation']['transform_fun'] and
    kwargs_interaction['permutation']['transform_fun_args']), and a
    permutation-based p-value is stored as 'pval_permutation'.

    Parameters
        genotype_df, variant_df, phenotype_df, phenotype_pos_df:
                        passed to genotypeio.InputGeneratorCis; variant_df
                        must have a 'pos' column and be indexed by variant ID
        covariates_df:  samples x covariates; its index must equal
                        phenotype_df.columns
        prefix, output_dir:
                        results of each chromosome are written to
                        <output_dir>/<prefix>.cis_qtl_pairs.<chrom>.parquet
        window:         cis-window size passed to InputGeneratorCis
        logger:         object with a write() method; a SimpleLogger is
                        created if None
        verbose:        forwarded to the input generator
        interaction:    must be exactly True or False
        kwargs:         extra keyword arguments for mapper.init; must contain
                        'design_matrix' when interaction is True
        kwargs_interaction:
                        interaction settings, see above
        num_of_permutation, permutation_chunk_size:
                        passed to Permutor

    Raises ValueError if `interaction` is not a bool.
    Returns None; output is written to parquet files.
    '''
    assert np.all(phenotype_df.columns==covariates_df.index)
    # Validate up front: previously an invalid value was only detected inside
    # the permutation branch and otherwise led to an undefined result.
    if not isinstance(interaction, bool):
        raise ValueError(f'The args interaction can only be True or False. Wrong interaction = {interaction}.')
    kwargs = {} if kwargs is None else kwargs
    kwargs_interaction = {} if kwargs_interaction is None else kwargs_interaction

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    if logger is None:
        logger = SimpleLogger()

    logger.write('cis-QTL mapping')
    logger.write(' * {} samples'.format(phenotype_df.shape[1]))
    logger.write(' * {} phenotypes'.format(phenotype_df.shape[0]))
    logger.write(' * {} covariates'.format(covariates_df.shape[1]))
    logger.write(' * {} variants'.format(variant_df.shape[0]))

    covariates_t = torch.tensor(covariates_df.values, dtype=torch.float32).to(device)
    phenotypes_t = torch.tensor(phenotype_df.values, dtype=torch.float32).to(device)

    # FIXME: this is not ideal since we may initialize for some phenotypes that do not have cis genotypes.
    # So, for now, as it is not taken care of inside the caller,
    # we need to make sure that these phenotypes are not part of the input.
    ## mapper call
    mapper.init(phenotypes_t.T, covariates_t, **kwargs)
    phenotype_names = phenotype_df.index.to_list()

    if interaction:
        # Convert once and place on the same device as the genotypes;
        # a CPU design matrix cannot be multiplied with CUDA genotypes.
        design_t = torch.as_tensor(kwargs['design_matrix'], dtype=torch.float32, device=device)

    igc = genotypeio.InputGeneratorCis(genotype_df, variant_df, phenotype_df, phenotype_pos_df,
                                       group_s=None, window=window)
    # iterate over chromosomes
    start_time = time.time()
    logger.write(' * Computing associations')
    for chrom in igc.chrs:
        logger.write(' Mapping chromosome {}'.format(chrom))
        # allocate arrays: upper bound on the number of pairs on this chromosome
        n_pairs = 0
        for i in igc.phenotype_pos_df[igc.phenotype_pos_df['chr']==chrom].index:
            j = igc.cis_ranges[i]
            n_pairs += j[1] - j[0] + 1

        chr_res = OrderedDict()
        chr_res['phenotype_id'] = []
        chr_res['variant_id'] = []
        chr_res['tss_distance'] = np.empty(n_pairs, dtype=np.int32)
        chr_res['pval'] = np.empty(n_pairs, dtype=np.float64)
        chr_res['b'] = np.empty(n_pairs, dtype=np.float32)
        if num_of_permutation is not None:
            chr_res['pval_permutation'] = np.empty(n_pairs, dtype=np.float64)

        start = 0  # write pointer into the pre-allocated arrays
        for _phenotype, genotypes, genotype_range, phenotype_id in igc.generate_data(chrom=chrom, verbose=verbose):
            # copy genotypes to GPU
            genotypes_t = torch.tensor(genotypes, dtype=torch.float).to(device)
            impute_mean(genotypes_t)

            variant_ids = variant_df.index[genotype_range[0]:genotype_range[-1]+1]
            n = len(variant_ids)
            if n <= 0:
                continue
            tss_distance = np.int32(variant_df['pos'].values[genotype_range[0]:genotype_range[-1]+1] - igc.phenotype_tss[phenotype_id])

            phenotype_idx = name_to_index(phenotype_names, phenotype_id)

            ## mapper call
            if interaction:
                X = kwargs_interaction['transform_fun'](design_t @ genotypes_t.T, **kwargs_interaction['transform_fun_args'])
                res_i = mapper.map_one_multi_x(X, phenotype_idx)
            else:
                res_i = mapper.map_one(genotypes_t.T, phenotype_idx)

            ## take care of permutation
            if num_of_permutation is not None:
                list_pval_perm = []
                permutor = Permutor(num_of_permutation, chunk_size=permutation_chunk_size)
                if interaction:
                    traveler = permutor.gen_permuted_columns_interaction(
                        design_t @ genotypes_t.T,
                        kwargs_interaction['permutation']['transform_fun'],
                        kwargs_interaction['permutation']['transform_fun_args']
                    )
                    for x, nchunk in traveler:
                        _, pval_perm = mapper.map_one_multi_x(x, phenotype_idx)
                        list_pval_perm.append(permutor.rearrange(torch.Tensor(pval_perm), nchunk))
                else:
                    for x, nchunk in permutor.gen_permuted_columns(genotypes_t.T):
                        _, pval_perm = mapper.map_one(x, phenotype_idx)
                        list_pval_perm.append(permutor.rearrange(torch.Tensor(pval_perm), nchunk))
                pval_from_permutation = permutor.add_permutation_pval(torch.Tensor(res_i[1]), torch.cat(list_pval_perm, axis=0))

            chr_res['phenotype_id'].extend([phenotype_id]*n)
            chr_res['variant_id'].extend(variant_ids)
            chr_res['tss_distance'][start:start+n] = tss_distance
            chr_res['pval'][start:start+n] = res_i[1]
            chr_res['b'][start:start+n] = res_i[0]
            if num_of_permutation is not None:
                chr_res['pval_permutation'][start:start+n] = pval_from_permutation
            start += n  # update pointer

        logger.write(' time elapsed: {:.2f} min'.format((time.time()-start_time)/60))

        # prepare output: drop the unused tail of the pre-allocated arrays
        if start < len(chr_res['tss_distance']):
            for x in chr_res:
                chr_res[x] = chr_res[x][:start]
        chr_res_df = pd.DataFrame(chr_res)
        print(' * writing output')
        chr_res_df.to_parquet(os.path.join(output_dir, '{}.cis_qtl_pairs.{}.parquet'.format(prefix, chrom)))

    logger.write('done.')