コード例 #1
0
ファイル: post.py プロジェクト: broadinstitute/tensorqtl
def calculate_afc(gene_ids, variant_ids, counts_df, genotype_df, covariates_df=None,
                  select_covariates=True, imputation='offset', count_threshold=0):
    """
    Calculate allelic fold-change (aFC) for variant-gene pairs

    Pair k is (gene_ids[k], variant_ids[k]); both sequences must have the
    same length.

      gene_ids: row labels of counts_df, one per pair
      variant_ids: row labels of genotype_df, one per pair
      genotype_df: genotype dosages; its columns are re-ordered to match
                   counts_df.columns before use
      counts_df: read counts scaled with DESeq size factors. Zeros are imputed using
                 log(counts + 1) (imputation='offset'; default) or with half-minimum
                 (imputation='half_min').
      covariates_df: covariates (genotype PCs, PEER factors, etc.)
      select_covariates, imputation, count_threshold: forwarded unchanged
                 to mixqtl.trc

    Returns a DataFrame with columns 'gene_id', 'variant_id', 'afc' and
    'afc_se'; the last two are the second and third outputs of mixqtl.trc
    multiplied by log2(e).

    Raises ValueError if a column of counts_df is not a column of genotype_df.

    aFC [1] is computed using the total read count (trc) model from mixQTL [2].

      [1] Mohammadi et al., 2017 (genome.cshlp.org/content/27/11/1872)
      [2] Liang et al., 2021 (10.1038/s41467-021-21592-8)
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_pairs = len(variant_ids)
    assert n_pairs == len(gene_ids)

    genotypes_t = torch.tensor(genotype_df[genotype_df.index.isin(variant_ids)].loc[variant_ids].values,
                               dtype=torch.float32).to(device)

    # Build the sample -> column position map once instead of re-creating and
    # scanning the column list for every sample. setdefault keeps the first
    # occurrence of a duplicated label, matching list.index().
    sample_pos = {}
    for pos, sample_id in enumerate(genotype_df.columns.tolist()):
        sample_pos.setdefault(sample_id, pos)
    try:
        genotype_ix = np.array([sample_pos[i] for i in counts_df.columns])
    except KeyError as err:
        raise ValueError(f"sample {err.args[0]!r} is not a column of genotype_df") from None
    genotype_ix_t = torch.from_numpy(genotype_ix).to(device)
    genotypes_t = genotypes_t[:, genotype_ix_t]
    impute_mean(genotypes_t)

    counts_t = torch.tensor(counts_df.loc[gene_ids].values, dtype=torch.float32).to(device)

    if covariates_df is not None:
        covariates_t = torch.tensor(covariates_df.values, dtype=torch.float32).to(device)
    else:
        covariates_t = None

    afc = []
    afc_se = []
    for k in range(n_pairs):
        # progress line, refreshed every 10 pairs and on the last pair
        if (k+1) % 10 == 0 or k+1 == n_pairs:
            print(f"\rCalculating aFC for variant-gene pair {k+1}/{n_pairs}", end='', flush=True)
        # genotypes_t[[k]] keeps a leading dimension of size 1 for the variant
        _, b, b_se = mixqtl.trc(genotypes_t[[k]], counts_t[k], covariates_t=covariates_t,
                                select_covariates=select_covariates, count_threshold=count_threshold,
                                imputation=imputation, return_af=False)
        afc.append(float(b.cpu()))
        afc_se.append(float(b_se.cpu()))
    print()

    # rescale the estimates by log2(e)
    log2_e = np.log2(np.e)
    afc = np.array(afc) * log2_e
    afc_se = np.array(afc_se) * log2_e
    afc_df = pd.DataFrame({'gene_id':gene_ids, 'variant_id':variant_ids,
                           'afc':afc, 'afc_se':afc_se})
    return afc_df
コード例 #2
0
def calculate_replication(res_df,
                          genotype_df,
                          phenotype_df,
                          covariates_df,
                          interaction_s=None,
                          lambda_qvalue=None):
    """Compute association statistics for given variant-phenotype pairs and a pi1 estimate.

    res_df: DataFrame with 'variant_id' column and phenotype IDs as index;
        row k defines the pair (res_df.index[k], res_df['variant_id'][k])
    genotype_df: genotypes; rows are looked up by res_df['variant_id'] and
        columns are re-ordered to match phenotype_df.columns
    phenotype_df: phenotypes; rows are looked up by res_df.index
    covariates_df: covariates, passed to Residualizer
    interaction_s: optional interaction term. If None, a single-variant
        association is computed per pair; otherwise a model with genotype,
        interaction and genotype x interaction terms is fitted per pair.
    lambda_qvalue: forwarded to rfunc.pi0est

    Returns (pi1, rep_df). pi1 is 1 - pi0 estimated from the nominal p-values
    (or from the genotype x interaction p-values when interaction_s is given),
    or NaN if the estimation raises. rep_df holds the per-pair statistics.

    Raises ValueError if a column of phenotype_df is not a column of genotype_df.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    genotypes_t = torch.tensor(genotype_df.loc[res_df['variant_id']].values,
                               dtype=torch.float).to(device)

    # Build the sample -> column position map once instead of re-creating and
    # scanning the column list for every sample. setdefault keeps the first
    # occurrence of a duplicated label, matching list.index().
    sample_pos = {}
    for pos, sample_id in enumerate(genotype_df.columns.tolist()):
        sample_pos.setdefault(sample_id, pos)
    try:
        genotype_ix = np.array([sample_pos[i] for i in phenotype_df.columns])
    except KeyError as err:
        raise ValueError(f"sample {err.args[0]!r} is not a column of genotype_df") from None
    genotype_ix_t = torch.from_numpy(genotype_ix).to(device)
    genotypes_t = genotypes_t[:, genotype_ix_t]
    impute_mean(genotypes_t)

    phenotypes_t = torch.tensor(phenotype_df.loc[res_df.index].values,
                                dtype=torch.float32).to(device)

    residualizer = Residualizer(
        torch.tensor(covariates_df.values, dtype=torch.float32).to(device))

    # calculate MAF
    n2 = 2 * genotypes_t.shape[1]
    af_t = genotypes_t.sum(1) / n2
    ix_t = af_t <= 0.5
    maf_t = torch.where(ix_t, af_t, 1 - af_t)
    # calculate MA samples and counts
    m = genotypes_t > 0.5
    a = m.sum(1).int()
    b = (genotypes_t < 1.5).sum(1).int()
    ma_samples_t = torch.where(ix_t, a, b)
    a = (genotypes_t * m.float()).sum(1).int()
    ma_count_t = torch.where(ix_t, a, n2 - a)

    if interaction_s is None:
        genotype_res_t = residualizer.transform(
            genotypes_t)  # variants x samples
        phenotype_res_t = residualizer.transform(
            phenotypes_t)  # phenotypes x samples

        # note: these hold variances; the ratio of standard deviations is
        # obtained by taking the square root of their quotient
        gstd = genotype_res_t.var(1)
        pstd = phenotype_res_t.var(1)
        std_ratio_t = torch.sqrt(pstd / gstd)

        # center and normalize
        genotype_res_t = center_normalize(genotype_res_t, dim=1)
        phenotype_res_t = center_normalize(phenotype_res_t, dim=1)

        # row-wise product: row k pairs variant k with phenotype k
        r_nominal_t = (genotype_res_t * phenotype_res_t).sum(1)
        r2_nominal_t = r_nominal_t.double().pow(2)

        dof = residualizer.dof
        tstat_t = torch.sqrt((dof * r2_nominal_t) / (1 - r2_nominal_t))
        slope_t = r_nominal_t * std_ratio_t
        slope_se_t = (slope_t.abs().double() / tstat_t).float()
        pval = 2 * stats.t.cdf(-np.abs(tstat_t.cpu()), dof)

        rep_df = pd.DataFrame(np.c_[res_df.index, res_df['variant_id'],
                                    ma_samples_t.cpu(),
                                    ma_count_t.cpu(),
                                    maf_t.cpu(), pval,
                                    slope_t.cpu(),
                                    slope_se_t.cpu()],
                              columns=[
                                  'phenotype_id', 'variant_id', 'ma_samples',
                                  'ma_count', 'maf', 'pval_nominal', 'slope',
                                  'slope_se'
                              ]).infer_objects()

    else:
        interaction_t = torch.tensor(interaction_s.values.reshape(1, -1),
                                     dtype=torch.float32).to(device)
        ng = genotypes_t.shape[0]

        # centered inputs
        g0_t = genotypes_t - genotypes_t.mean(1, keepdim=True)
        gi_t = genotypes_t * interaction_t
        gi0_t = gi_t - gi_t.mean(1, keepdim=True)
        i0_t = interaction_t - interaction_t.mean()
        p0_t = phenotypes_t - phenotypes_t.mean(1, keepdim=True)

        # residualize rows
        g0_t = residualizer.transform(g0_t, center=False)
        gi0_t = residualizer.transform(gi0_t, center=False)
        p0_t = residualizer.transform(p0_t, center=False)  # np x ns
        i0_t = residualizer.transform(i0_t, center=False)
        i0_t = i0_t.repeat(ng, 1)

        # regression (in float; loss of precision may occur in edge cases)
        X_t = torch.stack([g0_t, i0_t, gi0_t], 2)  # ng x ns x 3
        Xinv = torch.matmul(torch.transpose(X_t, 1, 2),
                            X_t).inverse()  # ng x 3 x 3
        b_t = (torch.matmul(Xinv, torch.transpose(X_t, 1, 2)) *
               p0_t.unsqueeze(1)).sum(2)  # ng x 3
        r_t = (X_t * b_t.unsqueeze(1)).sum(2) - p0_t
        dof = residualizer.dof - 2
        rss_t = (r_t * r_t).sum(1)  # ng
        # the boolean identity mask selects the diagonal of each 3 x 3 matrix
        b_se_t = torch.sqrt(Xinv[:, torch.eye(3, dtype=torch.uint8).bool()] *
                            rss_t.unsqueeze(-1) / dof)
        tstat_t = (b_t.double() / b_se_t.double()).float()
        pval = 2 * stats.t.cdf(-np.abs(tstat_t.cpu()), dof)
        b = b_t.cpu()
        b_se = b_se_t.cpu()

        rep_df = pd.DataFrame(np.c_[res_df.index, res_df['variant_id'],
                                    ma_samples_t.cpu(),
                                    ma_count_t.cpu(),
                                    maf_t.cpu(), pval[:, 0], b[:, 0],
                                    b_se[:, 0], pval[:, 1], b[:, 1],
                                    b_se[:, 1], pval[:, 2], b[:, 2], b_se[:,
                                                                          2]],
                              columns=[
                                  'phenotype_id', 'variant_id', 'ma_samples',
                                  'ma_count', 'maf', 'pval_g', 'b_g', 'b_g_se',
                                  'pval_i', 'b_i', 'b_i_se', 'pval_gi', 'b_gi',
                                  'b_gi_se'
                              ]).infer_objects()
        # pi1 is estimated from the genotype x interaction p-values
        pval = pval[:, 2]

    # Best-effort pi1 estimate: fall back to NaN if pi0est fails, but let
    # KeyboardInterrupt / SystemExit propagate (a bare except would hide them).
    try:
        pi1 = 1 - rfunc.pi0est(pval, lambda_qvalue=lambda_qvalue)[0]
    except Exception:
        pi1 = np.nan  # np.NaN was removed in NumPy 2.0
    return pi1, rep_df
コード例 #3
0
ファイル: wrapper.py プロジェクト: liangyy/tensorqtl
def map_trans(genotype_df, phenotype_df, covariates_df, mapper, pval_threshold=1e-5, 
              maf_threshold=0.05, batch_size=20000,
              logger=None, verbose=True, kwargs={}):
    '''
    Wrapper for trans-QTL mapping.
    The QTL caller is `mapper` which should have 
    * mapper.init(phenotype, covariate, **kwargs)
    * mapper.map(genotype) 
    implemented.
    The value returned by mapper.map is indexed positionally: element 0 is
    stored in the 'bhat' column and element 1 in the 'pval' column.
    NOTE(review): both are presumably flat arrays with one entry per
    (variant, phenotype) pair, variant-major, to line up with the ID
    columns built here -- confirm against the mapper implementation.

    phenotype_df columns must equal covariates_df index (asserted).
    Genotype columns are used as-is (no re-ordering to phenotype_df.columns).
    pval_threshold is accepted but not used in this function.
    kwargs is only unpacked into mapper.init, never modified.

    Returns a DataFrame with columns 'variant_id', 'phenotype_id', 'bhat',
    'pval', 'maf'; it is empty if no genotype batch was produced.
    '''

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    if logger is None:
        logger = SimpleLogger(verbose=verbose)
    assert np.all(phenotype_df.columns==covariates_df.index)

    n_variants = len(genotype_df.index)
    n_samples = phenotype_df.shape[1]

    logger.write('trans-QTL mapping')
    logger.write('  * {} samples'.format(n_samples))
    logger.write('  * {} phenotypes'.format(phenotype_df.shape[0]))
    logger.write('  * {} covariates'.format(covariates_df.shape[1]))
    logger.write('  * {} variants'.format(n_variants))

    phenotypes_t = torch.tensor(phenotype_df.values, dtype=torch.float32).to(device)
    covariates_t = torch.tensor(covariates_df.values, dtype=torch.float32).to(device)

    ## mapper call
    mapper.init(phenotypes_t.T, covariates_t, **kwargs)

    n_phenotypes = phenotype_df.index.shape[0]
    columns = ['variant_id', 'phenotype_id', 'bhat', 'pval', 'maf']

    ggt = genotypeio.GenotypeGeneratorTrans(genotype_df, batch_size=batch_size)
    start_time = time.time()
    res = []
    n_variants = 0  # from here on: number of variants that pass the MAF filter
    for genotypes, variant_ids in ggt.generate_data(verbose=verbose):
        # copy genotypes to GPU
        genotypes_t = torch.tensor(genotypes, dtype=torch.float).to(device)

        # filter by MAF
        impute_mean(genotypes_t)
        genotypes_t, variant_ids, maf_t = filter_maf(genotypes_t, variant_ids, maf_threshold)
        n_variants += genotypes_t.shape[0]

        ## mapper call
        res_i = mapper.map(genotypes_t.T)

        del genotypes_t

        # one row per (variant, phenotype) pair: each variant ID (and its MAF)
        # is repeated for every phenotype, the phenotype IDs are tiled
        res_i = np.c_[ 
            np.repeat(variant_ids, n_phenotypes),
            np.tile(phenotype_df.index, variant_ids.shape[0]),
            res_i[0],
            res_i[1],
            np.repeat(maf_t.cpu(), n_phenotypes)
        ]
        res.append(res_i)

    logger.write('    elapsed time: {:.2f} min'.format((time.time()-start_time)/60))
    del phenotypes_t

    # post-processing: concatenate batches
    if res:
        pval_df = pd.DataFrame(np.concatenate(res), columns=columns)
    else:
        # np.concatenate raises on an empty list; return an empty table instead
        pval_df = pd.DataFrame(columns=columns)

    if maf_threshold > 0:
        logger.write('  * {} variants passed MAF >= {:.2f} filtering'.format(n_variants, maf_threshold))
    logger.write('done.')
    return pval_df
コード例 #4
0
ファイル: wrapper.py プロジェクト: liangyy/tensorqtl
def map_cis(genotype_df, variant_df, phenotype_df, phenotype_pos_df,
            covariates_df, mapper, prefix, 
            window=1000000, output_dir='.', 
            logger=None, verbose=True, interaction=False, kwargs={}, kwargs_interaction={},
            num_of_permutation=None, permutation_chunk_size=10):
    '''
    Wrapper for cis-QTL mapping.
    The QTL caller is `mapper` which should have 
    * mapper.init(phenotype, covariate, **kwargs)
    * mapper.map_one(genotype, phenotype_idx)   (used when interaction is False)
    * mapper.map_one_multi_x(X, phenotype_idx)  (used when interaction is True)
    implemented. In the interaction case X is
    kwargs_interaction['transform_fun'](kwargs['design_matrix'] @ genotype.T, **kwargs_interaction['transform_fun_args']).
    The value returned by the mapper is indexed positionally: element 0 is
    stored in the 'b' column and element 1 in the 'pval' column.

    If num_of_permutation is not None, the mapper is re-run on permuted
    inputs produced by Permutor (for the interaction case using
    kwargs_interaction['permutation']['transform_fun'] and
    ['transform_fun_args']), and a 'pval_permutation' column is added.

    phenotype_df columns must equal covariates_df index (asserted).
    interaction must be exactly True or False, otherwise ValueError is raised.
    kwargs and kwargs_interaction are only read here, never modified.

    For each chromosome the results are written to
    <output_dir>/<prefix>.cis_qtl_pairs.<chrom>.parquet; nothing is returned.
    '''

    assert np.all(phenotype_df.columns==covariates_df.index)
    # Validate up front. Previously this was only checked inside the
    # permutation branch, so without permutations an invalid value left
    # res_i unbound (or stale from the previous phenotype).
    if interaction is not True and interaction is not False:
        raise ValueError(f'The args interaction can only be True or False. Wrong interaction = {interaction}.')
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    if logger is None:
        logger = SimpleLogger()

    logger.write('cis-QTL mapping')
    logger.write('  * {} samples'.format(phenotype_df.shape[1]))
    logger.write('  * {} phenotypes'.format(phenotype_df.shape[0]))
    logger.write('  * {} covariates'.format(covariates_df.shape[1]))
    logger.write('  * {} variants'.format(variant_df.shape[0]))

    covariates_t = torch.tensor(covariates_df.values, dtype=torch.float32).to(device)
    phenotypes_t = torch.tensor(phenotype_df.values, dtype=torch.float32).to(device)

    # FIXME: this is not ideal since we may initialize for some phenotypes that does not have cis genotype.
    # So, for now, as it is not taken care of inside the caller, 
    # we need to make sure that these phenotypes are not part of the input.
    ## mapper call
    mapper.init(phenotypes_t.T, covariates_t, **kwargs)
    phenotype_names = phenotype_df.index.to_list()

    # Build the design-matrix tensor once and place it on the same device as
    # the genotypes; a CPU tensor cannot be multiplied with a CUDA tensor.
    if interaction:
        design_t = torch.Tensor(kwargs['design_matrix']).to(device)
    else:
        design_t = None

    igc = genotypeio.InputGeneratorCis(genotype_df, variant_df, phenotype_df, phenotype_pos_df, group_s=None, window=window)
    # iterate over chromosomes
    start_time = time.time()
    logger.write('  * Computing associations')
    for chrom in igc.chrs:
        logger.write('    Mapping chromosome {}'.format(chrom))
        # allocate arrays: total number of variants over all cis windows
        n_alloc = 0
        for i in igc.phenotype_pos_df[igc.phenotype_pos_df['chr']==chrom].index:
            j = igc.cis_ranges[i]
            n_alloc += j[1] - j[0] + 1

        chr_res = OrderedDict()
        chr_res['phenotype_id'] = []
        chr_res['variant_id'] = []
        chr_res['tss_distance'] = np.empty(n_alloc, dtype=np.int32)
        chr_res['pval'] = np.empty(n_alloc, dtype=np.float64)
        chr_res['b'] =        np.empty(n_alloc, dtype=np.float32)

        if num_of_permutation is not None:
            chr_res['pval_permutation'] = np.empty(n_alloc, dtype=np.float64)

        start = 0  # write pointer into the pre-allocated arrays

        for phenotype, genotypes, genotype_range, phenotype_id in igc.generate_data(chrom=chrom, verbose=verbose):
            # copy genotypes to GPU
            genotypes_t = torch.tensor(genotypes, dtype=torch.float).to(device)
            impute_mean(genotypes_t)

            variant_ids = variant_df.index[genotype_range[0]:genotype_range[-1]+1]

            n = len(variant_ids)
            if n <= 0:
                continue

            tss_distance = np.int32(variant_df['pos'].values[genotype_range[0]:genotype_range[-1]+1] - igc.phenotype_tss[phenotype_id])

            phenotype_idx = name_to_index(phenotype_names, phenotype_id)

            ## mapper call
            if interaction:
                X = kwargs_interaction['transform_fun'](design_t @ genotypes_t.T, **kwargs_interaction['transform_fun_args'])
                res_i = mapper.map_one_multi_x(X, phenotype_idx)
            else:
                res_i = mapper.map_one(genotypes_t.T, phenotype_idx)

            ## take care of permutation
            if num_of_permutation is not None:
                list_pval_perm = []
                permutor = Permutor(num_of_permutation, chunk_size=permutation_chunk_size)
                if interaction:
                    # the product is recomputed rather than shared with the
                    # mapping call above, in case transform_fun works in place
                    traveler = permutor.gen_permuted_columns_interaction(
                        design_t @ genotypes_t.T,
                        kwargs_interaction['permutation']['transform_fun'],
                        kwargs_interaction['permutation']['transform_fun_args']
                    )
                    for x, nchunk in traveler:
                        _, pval_perm = mapper.map_one_multi_x(x, phenotype_idx)
                        list_pval_perm.append(permutor.rearrange(torch.Tensor(pval_perm), nchunk))
                else:
                    for x, nchunk in permutor.gen_permuted_columns(genotypes_t.T):
                        _, pval_perm = mapper.map_one(x, phenotype_idx)
                        list_pval_perm.append(permutor.rearrange(torch.Tensor(pval_perm), nchunk))
                pval_from_permutation = permutor.add_permutation_pval(torch.Tensor(res_i[1]), torch.cat(list_pval_perm, axis=0))

            chr_res['phenotype_id'].extend([phenotype_id]*n)
            chr_res['variant_id'].extend(variant_ids)
            chr_res['tss_distance'][start:start+n] = tss_distance
            chr_res['pval'][start:start+n] = res_i[1]
            chr_res['b'][start:start+n] = res_i[0]
            if num_of_permutation is not None:
                chr_res['pval_permutation'][start:start+n] = pval_from_permutation
            start += n  # update pointer

        logger.write('    time elapsed: {:.2f} min'.format((time.time()-start_time)/60))

        # prepare output: drop the unused tail of the pre-allocated arrays
        if start < len(chr_res['tss_distance']):
            for x in chr_res:
                chr_res[x] = chr_res[x][:start]

        chr_res_df = pd.DataFrame(chr_res)

        print('    * writing output')
        chr_res_df.to_parquet(os.path.join(output_dir, '{}.cis_qtl_pairs.{}.parquet'.format(prefix, chrom)))

    logger.write('done.')