コード例 #1
0
def map_permutations(genotype_df,
                     covariates_df,
                     permutations=None,
                     chr_s=None,
                     nperms=10000,
                     maf_threshold=0.05,
                     batch_size=20000,
                     logger=None,
                     seed=None,
                     verbose=True):
    """
    Estimate the null distribution of the strongest trans association.

    For every permutation, the maximum squared correlation (r2) across all
    variants passing the MAF filter is recorded; a beta distribution is then
    fitted to these maxima with `fit_beta_parameters`.

    Parameters
    ----------
    genotype_df : variants (index) x samples (columns)
    covariates_df : samples (index) x covariates (columns); every sample
        must be present in `genotype_df.columns`
    permutations : optional array of shape (nperms, n_samples). When None,
        `nperms` independently shuffled copies of the normal quantiles
        ppf(i / (n_samples + 1)), i = 1..n_samples, are generated
    chr_s : forwarded to `genotypeio.GenotypeGeneratorTrans`. When given,
        maxima are computed per chromosome and combined leave-one-out
        (presumably a variant -> chromosome mapping; confirm against
        GenotypeGeneratorTrans)
    nperms : number of permutations to generate; overridden by
        `permutations.shape[0]` when `permutations` is supplied
    maf_threshold : threshold passed to `filter_maf`
    batch_size : number of variants per batch
    logger : object with a `write` method; defaults to `SimpleLogger()`
    seed : if not None, seeds NumPy's global RNG before shuffling
    verbose : forwarded to the genotype generator

    Returns
    -------
    With `chr_s`: DataFrame indexed by chromosome with columns
    'beta_shape1', 'beta_shape2', 'true_df', 'minp_true_df' and
    'minp_empirical'.
    Without `chr_s`: Series with entries 'num_samples', 'df', 'beta_shape1',
    'beta_shape2', 'true_df', 'minp_true_df' and 'minp_empirical'.

    Warning: this function assumes that all phenotypes are normally distributed,
             e.g., inverse normal transformed
    """

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    if logger is None:
        logger = SimpleLogger()
    assert covariates_df.index.isin(genotype_df.columns).all()
    sample_ids = covariates_df.index.values

    # index of VCF samples corresponding to phenotypes; the lookup table is
    # built once (first occurrence wins, matching list.index) instead of
    # scanning the column list for every sample
    column_pos = {}
    for pos, col in enumerate(genotype_df.columns):
        column_pos.setdefault(col, pos)
    genotype_ix = np.array([column_pos[i] for i in sample_ids])
    genotype_ix_t = torch.from_numpy(genotype_ix).to(device)

    n_variants = genotype_df.shape[0]
    n_samples = len(sample_ids)
    dof = n_samples - 2 - covariates_df.shape[1]

    logger.write('trans-QTL mapping (permutations)')
    logger.write('  * {} samples'.format(n_samples))
    logger.write('  * {} covariates'.format(covariates_df.shape[1]))
    logger.write('  * {} variants'.format(n_variants))

    if permutations is None:  # generate permutations assuming normal distribution
        q = stats.norm.ppf(np.arange(1, n_samples + 1) / (n_samples + 1))
        permutations = np.tile(q, [nperms, 1])
        if seed is not None:
            np.random.seed(seed)
        for i in np.arange(nperms):
            # in-place shuffle of one row (a view into `permutations`)
            np.random.shuffle(permutations[i, :])
    else:
        assert permutations.shape[1] == n_samples
        nperms = permutations.shape[0]
        logger.write('  * {} permutations'.format(nperms))

    permutations_t = torch.tensor(permutations, dtype=torch.float32).to(device)
    covariates_t = torch.tensor(covariates_df.values,
                                dtype=torch.float32).to(device)
    residualizer = Residualizer(covariates_t)
    del covariates_t

    if chr_s is not None:
        start_time = time.time()
        n_variants = 0
        ggt = genotypeio.GenotypeGeneratorTrans(genotype_df,
                                                batch_size=batch_size,
                                                chr_s=chr_s)

        chr_max_r2 = OrderedDict()
        # running batch counter, carried across chromosomes so that the
        # generator's progress numbering continues where the last one ended
        k = 0
        for chrom in ggt.chroms:
            max_r2_t = torch.zeros(nperms, dtype=torch.float32, device=device)
            for k, (genotypes, variant_ids) in enumerate(
                    ggt.generate_data(chrom=chrom,
                                      verbose=verbose,
                                      enum_start=k + 1), k + 1):
                genotypes_t = torch.tensor(genotypes,
                                           dtype=torch.float).to(device)
                genotypes_t, variant_ids, maf_t = filter_maf(
                    genotypes_t[:, genotype_ix_t], variant_ids, maf_threshold)
                n_variants += genotypes_t.shape[0]
                r2_t = calculate_corr(genotypes_t, permutations_t,
                                      residualizer).pow(2)
                del genotypes_t
                # per-permutation maximum over the variants of this batch
                m, _ = r2_t.max(0)
                max_r2_t = torch.max(m, max_r2_t)
            chr_max_r2[chrom] = max_r2_t.cpu()
        logger.write('    time elapsed: {:.2f} min'.format(
            (time.time() - start_time) / 60))
        if maf_threshold > 0:
            logger.write(
                '  * {} variants passed MAF >= {:.2f} filtering'.format(
                    n_variants, maf_threshold))
        chr_max_r2 = pd.DataFrame(chr_max_r2)

        # leave-one-out max: for chromosome c, the maximum over all other
        # chromosomes
        max_r2 = OrderedDict()
        for c in chr_max_r2:
            max_r2[c] = chr_max_r2[np.setdiff1d(chr_max_r2.columns, c)].max(1)
        max_r2 = pd.DataFrame(max_r2)  # nperms x chrs

        # empirical p-values
        tstat = np.sqrt(dof * max_r2 / (1 - max_r2))
        minp_empirical = pd.DataFrame(2 * stats.t.cdf(-np.abs(tstat), dof),
                                      columns=tstat.columns)  # nperms x chrs

        beta_shape1 = OrderedDict()
        beta_shape2 = OrderedDict()
        true_dof = OrderedDict()
        minp_vec = OrderedDict()
        for c in max_r2:
            beta_shape1[c], beta_shape2[c], true_dof[c], minp_vec[
                c] = fit_beta_parameters(max_r2[c], dof, return_minp=True)

        beta_df = pd.DataFrame(
            OrderedDict([
                ('beta_shape1', beta_shape1),
                ('beta_shape2', beta_shape2),
                ('true_df', true_dof),
                ('minp_true_df', minp_vec),
                ('minp_empirical',
                 {c: minp_empirical[c].values
                  for c in minp_empirical}),
            ]))
        return beta_df

    else:  # not split_chr
        ggt = genotypeio.GenotypeGeneratorTrans(genotype_df,
                                                batch_size=batch_size)
        start_time = time.time()
        max_r2_t = torch.zeros(nperms, dtype=torch.float32, device=device)
        n_variants = 0
        for genotypes, variant_ids in ggt.generate_data(verbose=verbose):
            genotypes_t = torch.tensor(genotypes, dtype=torch.float).to(device)
            genotypes_t, variant_ids, maf_t = filter_maf(
                genotypes_t[:, genotype_ix_t], variant_ids, maf_threshold)
            n_variants += genotypes_t.shape[0]
            r2_t = calculate_corr(genotypes_t, permutations_t,
                                  residualizer).pow(2)
            del genotypes_t
            # per-permutation maximum over the variants of this batch
            m, _ = r2_t.max(0)
            max_r2_t = torch.max(m, max_r2_t)
        logger.write('    time elapsed: {:.2f} min'.format(
            (time.time() - start_time) / 60))
        if maf_threshold > 0:
            logger.write(
                '  * {} variants passed MAF >= {:.2f} filtering'.format(
                    n_variants, maf_threshold))
        max_r2 = max_r2_t.cpu().numpy().astype(np.float64)
        tstat = np.sqrt(dof * max_r2 / (1 - max_r2))
        minp_empirical = 2 * stats.t.cdf(-np.abs(tstat), dof)
        beta_shape1, beta_shape2, true_dof, minp_vec = fit_beta_parameters(
            max_r2, dof, tol=1e-4, return_minp=True)

        beta_s = pd.Series([
            n_samples, dof, beta_shape1, beta_shape2, true_dof, minp_vec,
            minp_empirical
        ],
                           index=[
                               'num_samples', 'df', 'beta_shape1',
                               'beta_shape2', 'true_df', 'minp_true_df',
                               'minp_empirical'
                           ])
        return beta_s
コード例 #2
0
def map_trans(genotype_df,
              phenotype_df,
              covariates_df,
              interaction_s=None,
              return_sparse=True,
              pval_threshold=1e-5,
              maf_threshold=0.05,
              alleles=2,
              return_r2=False,
              batch_size=20000,
              logger=None,
              verbose=True):
    """Run trans-QTL mapping.

    Parameters
    ----------
    genotype_df : variants (index) x samples (columns); must contain every
        sample of `phenotype_df`
    phenotype_df : phenotypes (index) x samples (columns)
    covariates_df : samples (index) x covariates (columns); its index must
        equal `phenotype_df.columns`
    interaction_s : optional Series with one value per sample. When given,
        a model with genotype, interaction and genotype x interaction terms
        is fitted instead of the plain correlation model
    return_sparse : if True, only report variant-phenotype pairs passing
        `pval_threshold`; otherwise report all pairs
    pval_threshold : nominal p-value threshold for sparse output
    maf_threshold : threshold passed to the MAF filter
    alleles : not used by this function
    return_r2 : keep the 'r2' column (sparse, non-interaction output only)
    batch_size : number of variants per batch
    logger : object with a `write` method; defaults to `SimpleLogger`
    verbose : forwarded to the logger and the genotype generator

    Returns
    -------
    * no interaction, sparse: DataFrame with columns 'variant_id',
      'phenotype_id', 'pval', 'b', 'b_se', ['r2',] 'maf', 't_stat'
    * no interaction, dense: DataFrame of p-values, variants x phenotypes
    * interaction, sparse: DataFrame with columns 'variant_id',
      'phenotype_id', 'pval_g', 'pval_i', 'pval_gi', 'maf' (empty if no pair
      passes the threshold)
    * interaction, dense: tuple (pval_g_df, pval_i_df, pval_gi_df, maf_s,
      ma_samples_s, ma_count_s)
    """

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    if logger is None:
        logger = SimpleLogger(verbose=verbose)
    assert np.all(phenotype_df.columns == covariates_df.index)

    n_variants = genotype_df.shape[0]
    n_samples = phenotype_df.shape[1]

    logger.write('trans-QTL mapping')
    logger.write('  * {} samples'.format(n_samples))
    logger.write('  * {} phenotypes'.format(phenotype_df.shape[0]))
    logger.write('  * {} covariates'.format(covariates_df.shape[1]))
    logger.write('  * {} variants'.format(n_variants))
    if interaction_s is not None:
        logger.write('  * including interaction term')

    phenotypes_t = torch.tensor(phenotype_df.values,
                                dtype=torch.float32).to(device)
    covariates_t = torch.tensor(covariates_df.values,
                                dtype=torch.float32).to(device)
    residualizer = Residualizer(covariates_t)
    del covariates_t

    # position of each phenotype sample among the genotype columns; the
    # lookup table is built once (first occurrence wins, matching list.index)
    # instead of scanning the column list for every sample
    column_pos = {}
    for pos, col in enumerate(genotype_df.columns):
        column_pos.setdefault(col, pos)
    genotype_ix = np.array([column_pos[i] for i in phenotype_df.columns])
    genotype_ix_t = torch.from_numpy(genotype_ix).to(device)

    # calculate correlation threshold for sparse output
    dof = n_samples - 2 - covariates_df.shape[1]
    if return_sparse:
        tstat_threshold = -stats.t.ppf(pval_threshold / 2, dof)
        r_threshold = tstat_threshold / np.sqrt(dof + tstat_threshold**2)
    else:
        tstat_threshold = None
        r_threshold = None

    if interaction_s is None:
        ggt = genotypeio.GenotypeGeneratorTrans(genotype_df,
                                                batch_size=batch_size)
        start_time = time.time()
        res = []
        n_variants = 0
        for genotypes, variant_ids in ggt.generate_data(verbose=verbose):
            # copy genotypes to GPU
            genotypes_t = torch.tensor(genotypes, dtype=torch.float).to(device)

            # filter by MAF
            genotypes_t = genotypes_t[:, genotype_ix_t]
            impute_mean(genotypes_t)
            genotypes_t, variant_ids, maf_t = filter_maf(
                genotypes_t, variant_ids, maf_threshold)
            n_variants += genotypes_t.shape[0]

            r_t, genotype_var_t, phenotype_var_t = calculate_corr(
                genotypes_t, phenotypes_t, residualizer, return_var=True)
            del genotypes_t

            if return_sparse:
                m = r_t.abs() >= r_threshold
                ix_t = m.nonzero()  # sparse index: [variant, phenotype]
                ix = ix_t.cpu().numpy()

                r_t = r_t.masked_select(m).type(torch.float64)
                r2_t = r_t.pow(2)
                tstat_t = r_t * torch.sqrt(dof / (1 - r2_t))
                std_ratio_t = torch.sqrt(phenotype_var_t[ix_t[:, 1]] /
                                         genotype_var_t[ix_t[:, 0]])
                b_t = r_t * std_ratio_t
                b_se_t = (b_t / tstat_t).type(torch.float32)

                # the t-statistic is stored twice: column 2 is converted to
                # a p-value during post-processing, the last column is kept
                res.append(np.c_[variant_ids[ix[:, 0]],
                                 phenotype_df.index[ix[:, 1]],
                                 tstat_t.cpu(),
                                 b_t.cpu(),
                                 b_se_t.cpu(),
                                 r2_t.float().cpu(), maf_t[ix_t[:, 0]].cpu(),
                                 tstat_t.cpu()])
            else:
                r_t = r_t.type(torch.float64)
                tstat_t = r_t * torch.sqrt(dof / (1 - r_t.pow(2)))
                res.append(np.c_[variant_ids, tstat_t.cpu()])
        logger.write('    elapsed time: {:.2f} min'.format(
            (time.time() - start_time) / 60))
        del phenotypes_t
        del residualizer

        # post-processing: concatenate batches
        res = np.concatenate(res)
        if return_sparse:
            res[:, 2] = 2 * \
                stats.t.cdf(-np.abs(res[:, 2].astype(np.float64)), dof)
            pval_df = pd.DataFrame(res,
                                   columns=[
                                       'variant_id', 'phenotype_id', 'pval',
                                       'b', 'b_se', 'r2', 'maf', 't_stat'
                                   ])
            pval_df['pval'] = pval_df['pval'].astype(np.float64)
            pval_df['b'] = pval_df['b'].astype(np.float32)
            pval_df['b_se'] = pval_df['b_se'].astype(np.float32)
            pval_df['r2'] = pval_df['r2'].astype(np.float32)
            pval_df['maf'] = pval_df['maf'].astype(np.float32)
            pval_df['t_stat'] = pval_df['t_stat'].astype(np.float32)
            if not return_r2:
                pval_df.drop('r2', axis=1, inplace=True)
        else:
            pval = 2 * stats.t.cdf(-np.abs(res[:, 1:].astype(np.float64)), dof)
            pval_df = pd.DataFrame(pval,
                                   index=res[:, 0],
                                   columns=phenotype_df.index)
            pval_df.index.name = 'variant_id'

        if maf_threshold > 0:
            logger.write(
                '  * {} variants passed MAF >= {:.2f} filtering'.format(
                    n_variants, maf_threshold))
        logger.write('done.')
        return pval_df

    else:  # interaction model
        dof = n_samples - 4 - covariates_df.shape[1]
        interaction_t = torch.tensor(interaction_s.values.reshape(1, -1),
                                     dtype=torch.float32).to(device)
        # samples at or above the median of the interaction variable
        interaction_mask_t = torch.BoolTensor(
            interaction_s >= interaction_s.median()).to(device)

        ggt = genotypeio.GenotypeGeneratorTrans(genotype_df,
                                                batch_size=batch_size)
        start_time = time.time()
        if return_sparse:
            # Degrees of freedom used for the standard errors and p-values
            # of this branch. The sparse filter is derived from the same
            # value so that the reported pval_gi never exceeds
            # pval_threshold (the threshold computed above uses the dof of
            # the model without interaction terms).
            dof = residualizer.dof - 2
            tstat_threshold = -stats.t.ppf(pval_threshold / 2, dof)

            nps = phenotypes_t.shape[0]
            i0_t = interaction_t - interaction_t.mean()
            p0_t = phenotypes_t - phenotypes_t.mean(1, keepdim=True)
            p0_t = residualizer.transform(p0_t, center=False)
            i0_t = residualizer.transform(i0_t, center=False)

            tstat_g_list = []
            tstat_i_list = []
            tstat_gi_list = []
            maf_list = []
            ix0 = []
            ix1 = []
            for genotypes, variant_ids in ggt.generate_data(verbose=verbose):
                genotypes_t = torch.tensor(genotypes,
                                           dtype=torch.float).to(device)
                genotypes_t, mask_t = filter_maf_interaction(
                    genotypes_t[:, genotype_ix_t],
                    interaction_mask_t=interaction_mask_t,
                    maf_threshold_interaction=maf_threshold)
                if genotypes_t.shape[0] > 0:
                    ng, ns = genotypes_t.shape

                    # calculate MAF
                    af_t = genotypes_t.sum(1) / (2 * ns)
                    maf_t = torch.where(af_t <= 0.5, af_t, 1 - af_t)

                    # centered inputs
                    g0_t = genotypes_t - genotypes_t.mean(1, keepdim=True)
                    gi_t = genotypes_t * interaction_t
                    gi0_t = gi_t - gi_t.mean(1, keepdim=True)
                    # residualize rows
                    g0_t = residualizer.transform(g0_t, center=False)
                    gi0_t = residualizer.transform(gi0_t, center=False)

                    # regression
                    X_t = torch.stack([g0_t, i0_t.repeat(ng, 1), gi0_t],
                                      2)  # ng x ns x 3
                    Xinv = torch.matmul(torch.transpose(X_t, 1, 2),
                                        X_t).inverse()  # ng x 3 x 3
                    b_t = torch.matmul(
                        torch.matmul(Xinv, torch.transpose(X_t, 1, 2)),
                        p0_t.t())  # ng x 3 x np

                    rss_t = (torch.matmul(X_t, b_t) - p0_t.t()).pow(2).sum(
                        1)  # ng x np
                    # standard errors from the diagonal of (X'X)^-1
                    b_se_t = torch.sqrt(
                        Xinv[:, torch.eye(3, dtype=torch.uint8).bool(
                        )].unsqueeze(-1).repeat([1, 1, nps]) *
                        rss_t.unsqueeze(1).repeat([1, 3, 1]) / dof)
                    tstat_t = (b_t.double() /
                               b_se_t.double()).float()  # (ng x 3 x np)
                    tstat_g_t = tstat_t[:, 0, :]  # genotypes x phenotypes
                    tstat_i_t = tstat_t[:, 1, :]
                    tstat_gi_t = tstat_t[:, 2, :]
                    # keep pairs whose interaction term passes the threshold
                    m = tstat_gi_t.abs() >= tstat_threshold
                    tstat_g_t = tstat_g_t[m]
                    tstat_i_t = tstat_i_t[m]
                    tstat_gi_t = tstat_gi_t[m]
                    ix = m.nonzero()  # indexes: [genotype, phenotype]
                    maf_t = maf_t[ix[:, 0]]

                    batch_res = [tstat_g_t, tstat_i_t, tstat_gi_t, maf_t, ix]
                    tstat_g, tstat_i, tstat_gi, maf, ix = [
                        i.cpu().numpy() for i in batch_res
                    ]
                    mask = mask_t.cpu().numpy()
                    # convert sparse indexes
                    if len(ix) > 0:
                        variant_ids = variant_ids[mask.astype(bool)]
                        tstat_g_list.append(tstat_g)
                        tstat_i_list.append(tstat_i)
                        tstat_gi_list.append(tstat_gi)
                        maf_list.append(maf)
                        ix0.extend(variant_ids[ix[:, 0]].tolist())
                        ix1.extend(phenotype_df.index[ix[:, 1]].tolist())

            logger.write('    time elapsed: {:.2f} min'.format(
                (time.time() - start_time) / 60))

            columns = [
                'variant_id', 'phenotype_id', 'pval_g', 'pval_i', 'pval_gi',
                'maf'
            ]
            dtypes = {
                'pval_g': np.float64,
                'pval_i': np.float64,
                'pval_gi': np.float64,
                'maf': np.float32
            }

            # nothing passed the threshold: np.concatenate would raise on an
            # empty list, so return an empty frame with the same layout
            if not tstat_gi_list:
                return pd.DataFrame(columns=columns).astype(dtypes)

            # concatenate
            pval_g = 2 * stats.t.cdf(-np.abs(np.concatenate(tstat_g_list)),
                                     dof)
            pval_i = 2 * stats.t.cdf(-np.abs(np.concatenate(tstat_i_list)),
                                     dof)
            pval_gi = 2 * \
                stats.t.cdf(-np.abs(np.concatenate(tstat_gi_list)), dof)
            maf = np.concatenate(maf_list)

            pval_df = pd.DataFrame(np.c_[ix0, ix1, pval_g, pval_i, pval_gi,
                                         maf],
                                   columns=columns).astype(dtypes)
            return pval_df
        else:  # dense output
            output_list = []
            for genotypes, variant_ids in ggt.generate_data(verbose=verbose):
                genotypes_t = torch.tensor(genotypes,
                                           dtype=torch.float).to(device)
                genotypes_t, mask_t = filter_maf_interaction(
                    genotypes_t[:, genotype_ix_t],
                    interaction_mask_t=interaction_mask_t,
                    maf_threshold_interaction=maf_threshold)
                res = calculate_interaction_nominal(
                    genotypes_t,
                    phenotypes_t,
                    interaction_t,
                    residualizer,
                    return_sparse=return_sparse)
                # res: tstat, b, b_se, maf, ma_samples, ma_count
                res = [i.cpu().numpy() for i in res]
                mask = mask_t.cpu().numpy()
                variant_ids = variant_ids[mask.astype(bool)]
                output_list.append(res + [variant_ids])
            logger.write('    time elapsed: {:.2f} min'.format(
                (time.time() - start_time) / 60))

            # concatenate outputs (b and b_se, entries 1 and 2, are not
            # part of the return value)
            tstat = np.concatenate([i[0] for i in output_list])
            pval = 2 * stats.t.cdf(-np.abs(tstat), dof)
            maf = np.concatenate([i[3] for i in output_list])
            ma_samples = np.concatenate([i[4] for i in output_list])
            ma_count = np.concatenate([i[5] for i in output_list])
            variant_ids = np.concatenate([i[6] for i in output_list])

            pval_g_df = pd.DataFrame(pval[:, 0, :],
                                     index=variant_ids,
                                     columns=phenotype_df.index)
            pval_i_df = pd.DataFrame(pval[:, 1, :],
                                     index=variant_ids,
                                     columns=phenotype_df.index)
            pval_gi_df = pd.DataFrame(pval[:, 2, :],
                                      index=variant_ids,
                                      columns=phenotype_df.index)
            maf_s = pd.Series(maf, index=variant_ids,
                              name='maf').astype(np.float32)
            # each Series carries its own name so the three can be combined
            # into one frame without column clashes
            ma_samples_s = pd.Series(ma_samples, index=variant_ids,
                                     name='ma_samples').astype(np.int32)
            ma_count_s = pd.Series(ma_count, index=variant_ids,
                                   name='ma_count').astype(np.int32)
            return pval_g_df, pval_i_df, pval_gi_df, maf_s, ma_samples_s, ma_count_s
コード例 #3
0
ファイル: wrapper.py プロジェクト: liangyy/tensorqtl
def map_trans(genotype_df, phenotype_df, covariates_df, mapper, pval_threshold=1e-5, 
              maf_threshold=0.05, batch_size=20000,
              logger=None, verbose=True, kwargs=None):
    '''
    Wrapper for trans-QTL mapping.
    The QTL caller is `mapper` which should have 
    * mapper.init(phenotype, covariate, **kwargs)
    * mapper.map(genotype) 
    implemented.
    mapper.map should return 'bhat' and 'pval', either in a dictionary with
    these keys or as the first two elements of a sequence. Both must line up
    with the variant-major (variant, phenotype) pairs of one batch.

    genotype_df: variants (index) x samples (columns)
    phenotype_df: phenotypes (index) x samples (columns)
    covariates_df: samples (index) x covariates (columns); index must equal
        phenotype_df.columns
    pval_threshold: not used by this wrapper
    maf_threshold: threshold passed to filter_maf
    batch_size: number of variants per batch
    logger: object with a `write` method; defaults to SimpleLogger
    kwargs: optional dict of extra keyword arguments for mapper.init

    Returns a DataFrame with columns 'variant_id', 'phenotype_id', 'bhat',
    'pval', 'maf' holding one row per (variant, phenotype) pair.

    NOTE(review): genotype columns are not reordered to the phenotype sample
    order here (the re-indexing is disabled), so genotype_df.columns is
    presumably expected to already match phenotype_df.columns; confirm
    against callers.
    '''

    # a fresh dict per call instead of a shared mutable default
    if kwargs is None:
        kwargs = {}

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    if logger is None:
        logger = SimpleLogger(verbose=verbose)
    assert np.all(phenotype_df.columns==covariates_df.index)

    n_variants = genotype_df.shape[0]
    n_samples = phenotype_df.shape[1]
    n_phenotypes = phenotype_df.index.shape[0]

    logger.write('trans-QTL mapping')
    logger.write('  * {} samples'.format(n_samples))
    logger.write('  * {} phenotypes'.format(phenotype_df.shape[0]))
    logger.write('  * {} covariates'.format(covariates_df.shape[1]))
    logger.write('  * {} variants'.format(n_variants))

    phenotypes_t = torch.tensor(phenotype_df.values, dtype=torch.float32).to(device)
    covariates_t = torch.tensor(covariates_df.values, dtype=torch.float32).to(device)

    ## mapper call (phenotypes are passed as samples x phenotypes)
    mapper.init(phenotypes_t.T, covariates_t, **kwargs)

    ggt = genotypeio.GenotypeGeneratorTrans(genotype_df, batch_size=batch_size)
    start_time = time.time()
    res = []
    n_variants = 0
    for genotypes, variant_ids in ggt.generate_data(verbose=verbose):
        # copy genotypes to GPU
        genotypes_t = torch.tensor(genotypes, dtype=torch.float).to(device)

        # filter by MAF
        impute_mean(genotypes_t)
        genotypes_t, variant_ids, maf_t = filter_maf(genotypes_t, variant_ids, maf_threshold)
        n_variants += genotypes_t.shape[0]

        ## mapper call (genotypes are passed as samples x variants)
        out = mapper.map(genotypes_t.T)

        del genotypes_t

        # accept the documented dictionary form as well as a plain sequence
        if isinstance(out, dict) and 'bhat' in out and 'pval' in out:
            bhat, pval = out['bhat'], out['pval']
        else:
            bhat, pval = out[0], out[1]

        # long format: each variant is repeated for every phenotype
        res_i = np.c_[
            np.repeat(variant_ids, n_phenotypes),
            np.tile(phenotype_df.index, variant_ids.shape[0]),
            bhat,
            pval,
            np.repeat(maf_t.cpu(), n_phenotypes)
        ]
        res.append(res_i)

    logger.write('    elapsed time: {:.2f} min'.format((time.time()-start_time)/60))
    del phenotypes_t

    # post-processing: concatenate batches
    columns = ['variant_id', 'phenotype_id', 'bhat', 'pval', 'maf']
    if res:
        pval_df = pd.DataFrame(np.concatenate(res), columns=columns)
    else:
        # no batch was produced; np.concatenate would raise on an empty list
        pval_df = pd.DataFrame(columns=columns)

    if maf_threshold > 0:
        logger.write('  * {} variants passed MAF >= {:.2f} filtering'.format(n_variants, maf_threshold))
    logger.write('done.')
    return pval_df