Beispiel #1
0
def map_tissue_eqtls(
        tissue,
        genes,
        genotype_df,
        variant_df,
        eqtls,
        covariates_dir,
        expression_dir,
        pval_threshold,
        maf_threshold,
        eqtl_project
):
    """Run trans-eQTL mapping for one tissue and collect the hits in ``eqtls``.

    Phenotypes and covariates are obtained from ``fetch_phenotypes`` and then
    passed, together with ``genotype_df``, to ``trans.map_trans``.  The
    resulting table is tagged with the tissue name, rows lacking a
    ``variant_id`` or a ``phenotype_id`` are discarded, and what remains is
    appended to ``eqtls``.

    Args:
        tissue: Tissue name; forwarded to ``fetch_phenotypes`` and written to
            the ``tissue`` column of the result.
        genes: Forwarded unchanged to ``fetch_phenotypes``.
        genotype_df: Genotype table handed to ``trans.map_trans``.
        variant_df: Accepted for interface compatibility; not used here.
        eqtls: List that is mutated in place (one DataFrame is appended).
        covariates_dir: Forwarded unchanged to ``fetch_phenotypes``.
        expression_dir: Forwarded unchanged to ``fetch_phenotypes``.
        pval_threshold: P-value cut-off; converted with ``float()``.
        maf_threshold: Minor-allele-frequency cut-off; converted with
            ``float()``.
        eqtl_project: Forwarded unchanged to ``fetch_phenotypes``.

    Returns:
        None.  The outcome is delivered through ``eqtls``.
    """
    phenotype_df, covariates_df = fetch_phenotypes(
        tissue, genes, covariates_dir, expression_dir, eqtl_project)

    hits = trans.map_trans(
        genotype_df,
        phenotype_df,
        covariates_df,
        return_sparse=True,
        pval_threshold=float(pval_threshold),
        maf_threshold=float(maf_threshold),
        batch_size=20000,
        verbose=False,
    )
    hits['tissue'] = tissue

    # Keep only rows where both identifiers are present.
    has_variant = hits['variant_id'].notnull()
    has_phenotype = hits['phenotype_id'].notnull()
    eqtls.append(hits[has_variant & has_phenotype])
# Input locations for chromosome `chr`, all relative to `path`: the
# per-chromosome phenotype BED, the shared covariate table and the PLINK
# genotype fileset prefix.
phenotype_bed_file = "".join((
    path,
    "phenotypes/INTERVAL_RNAseq_phase1-2_filteredSamplesGenes_"
    "TMMNormalised_FPKM_Counts_foranalysis_chr",
    chr,
    ".bed.gz",
))
covariates_file = "".join((
    path,
    "covariates/INTERVAL_RNAseq_phase1-2_fullcovariates_foranalysis.txt",
))
plink_prefix_path = "".join((
    path,
    "genotypes/INTERVAL_RNAseq_Phase1-2_imputed_b38_biallelic_MAF0.005_chr",
    chr,
))

# Phenotypes (expression values plus their genomic positions).
phenotype_df, phenotype_pos_df = tensorqtl.read_phenotype_bed(
    phenotype_bed_file
)

# Covariates: load the table, keep only the columns that also appear as
# columns of the phenotype table, then transpose.
covariates_df = pd.read_csv(
    covariates_file, sep='\t', index_col=0
)[phenotype_df.columns].T

# Genotypes: open the PLINK fileset and materialise it as data frames keyed
# by SNP id (rows) and individual id (columns).
pr = genotypeio.PlinkReader(plink_prefix_path)
genotype_df = pd.DataFrame(
    data=pr.load_genotypes(),
    index=pr.bim['snp'],
    columns=pr.fam['iid'],
)
variant_df = pr.bim.set_index('snp')[['chrom', 'pos']]

# Trans-eQTL scan, written out as one CSV per chromosome.
trans_df = trans.map_trans(
    genotype_df,
    phenotype_df,
    covariates_df,
    return_sparse=True,
    maf_threshold=0.005,
)
trans_df.to_csv(
    "".join((outpath, "tensorqtl_trans_MAF0.005_chr", chr, ".csv")),
    index=False,
)
# Chromosome 9 genotypes: read the CSV, discard its second column and use the
# 'ID' column as the index.
chr9_geno_df = pd.read_csv('../../data/tensorqtldata/chr9.csv')
chr9_geno_df = chr9_geno_df.drop(
    columns=chr9_geno_df.columns[[1]]
).set_index('ID')

# Phenotypes and (transposed) covariates.
phenotype_df, phenotype_pos_df = tensorqtl.read_phenotype_bed(expression_bed)
covariates_df = pd.read_csv(covariates_file, sep='\t', index_col=0).T

import timeit
import time

# Wall-clock timing of a single trans scan.
start_time = timeit.default_timer()
trans_df = trans.map_trans(
    chr9_geno_df,
    phenotype_df,
    covariates_df,
    batch_size=10000,
    return_sparse=True,
    pval_threshold=1e-5,
    maf_threshold=0.05,
)
timetaken = timeit.default_timer() - start_time

msg = "{func} took {time} seconds to complete."
print(msg.format(time=timetaken, func=trans.map_trans.__name__))

# trans_df.to_csv('tensorqtl-scan-covar.csv')
# map_trans took 4.8016955852508545 seconds to complete. # use GPU Nvidia V100

# subset data set, getting a smaller end result for the purpose of comparing TensorQTL vs LiteQTL.
Beispiel #4
0
# Genome-wide trans scan over the autosomes: one PLINK fileset in, one CSV
# out per chromosome.
MAF_filter = 0.005
for i in range(1, 23):
    print(i)
    plink_prefix_path = "".join((
        rootpath,
        'genotypes/INTERVAL_RNAseq_Phase1-2_imputed_b38_biallelic_MAF0.005_chr',
        str(i),
    ))
    print(plink_prefix_path)

    gw_pr = genotypeio.PlinkReader(plink_prefix_path)
    gw_genotype_df = pd.DataFrame(
        data=gw_pr.load_genotypes(),
        index=gw_pr.bim['snp'],
        columns=gw_pr.fam['iid'],
    )
    gw_variant_df = gw_pr.bim.set_index('snp')[['chrom', 'pos']]

    # The batch size equals the number of variants on this chromosome.
    gw_trans_df = trans.map_trans(
        gw_genotype_df,
        phenotype_df,
        covariates_df,
        return_sparse=True,
        return_r2=True,
        maf_threshold=MAF_filter,
        batch_size=gw_variant_df.shape[0],
    )
    gw_trans_df.to_csv("".join((
        outdir,
        "/tensorqtl_trans_MAF",
        str(MAF_filter),
        "_all_age_sex_rin_batch_readDepth_PC10_PEER20_COVID19_CHR",
        str(i),
        ".csv",
    )))

# Chromosome X: load the dedicated PLINK fileset into genotype / variant
# data frames keyed by SNP id.
plink_prefix_path_x = (
    "/rds/project/jmmh2/rds-jmmh2-projects/interval_rna_seq/covid19/genotypes/"
    "INTERVAL_chrX_merged_cleaned_RNAseq_phase1-2_b38_rsids_deduplicated_MAF0.005"
)
x_pr = genotypeio.PlinkReader(plink_prefix_path_x)
x_genotype_df = pd.DataFrame(
    data=x_pr.load_genotypes(),
    index=x_pr.bim['snp'],
    columns=x_pr.fam['iid'],
)
x_variant_df = x_pr.bim.set_index('snp')[['chrom', 'pos']]
Beispiel #5
0
# Cis mapping (one row per phenotype), followed by q-value calculation on the
# returned table, then export with the index written as "Phenotype".
cis_df = cis.map_cis(
    gmpr_genotype_df,
    gmpr_variant_df,
    phenotype_df,
    phenotype_pos_df,
    covariates_df,
)
tensorqtl.calculate_qvalues(cis_df, qvalue_lambda=0.85)
cis_df.to_csv(
    "".join((outpath, "tensorqtl_cis_cisPerGene_chr", chr, ".csv")),
    index=True,
    index_label="Phenotype",
)

# Cis nominal mapping
cisnom_df = cis.map_nominal(
    gmpr_genotype_df,
    gmpr_variant_df,
    phenotype_df,
    phenotype_pos_df,
    covariates_df,
    prefix="".join((outpath, "tensorqtl_cis_cisNominal_chr", chr)),
)

# Convert the nominal-pairs parquet output to CSV.
# NOTE(review): both file names are hard-coded to chromosome 6 and do not
# follow `chr` -- confirm this is intended.
cisnom_df2 = pd.read_parquet(
    "".join((
        outpath,
        "tensorqtl_cis_cisNominal_chr6.cis_qtl_pairs.6.parquet",
    ))
)
cisnom_df2.to_csv(
    "".join((outpath, "tensorqtl_cis_cisNominal_chr6.cis_qtl_pairs.6.csv")),
    index=False,
)

# Call trans-eQTLs
trans_min_df = trans.map_trans(
    gmpr_genotype_df,
    gw_phenotype_df,
    covariates_df,
    return_sparse=True,
)
trans_min_df.to_csv(
    "".join((outpath, "tensorqtl_trans.csv")),
    index=False,
)

# Conditional cis-analysis (may time out!)
#indep_df = cis.map_independent(gmpr_genotype_df, gmpr_variant_df, cis_df, phenotype_df, phenotype_pos_df, covariates_df, nperm=10000)
#indep_df.to_csv(outpath + "tensorqtl_cis_cisIndependent_chr" + chr + ".csv", index=True, index_label = "Phenotype")
        f'time_stamp,device,data_transfer_time,compute_time,result_reorg_time,pval_time,elapsed_total\n'
    )
# device = torch.device("cpu")
# trans_df, cpucalctime = trans.map_trans(small_geno_df, phenotype_df, batch_size=20000,
#                                 return_sparse=True, pval_threshold=1e-5, maf_threshold=0.05, device=device, timing_file=timing_file)

# Benchmark: repeat the same trans scan ten times, once on the CPU and once on
# the GPU per repetition, keeping the timing value returned by each call.
# NOTE(review): map_trans is unpacked as a (result, time) pair and is given
# `device` / `timing_file` arguments here -- presumably a project-modified
# tensorqtl; confirm against the imported `trans` module.
for i in range(0, 10):
    ##################################### Full Matrix Case ########################################
    # Thread counts to benchmark on the CPU (currently a single setting).
    for numthreads in [20]:

        # CPU run with the requested number of torch threads.
        torch.set_num_threads(numthreads)
        device = torch.device("cpu")
        trans_df, cpucalctime = trans.map_trans(small_geno_df,
                                                phenotype_df,
                                                batch_size=20000,
                                                return_sparse=False,
                                                pval_threshold=1,
                                                maf_threshold=0.00,
                                                device=device,
                                                timing_file=timing_file)

        # GPU run with identical arguments; `trans_df` is overwritten, only
        # the timing value is kept separately.
        device = torch.device("cuda")
        (trans_df, gpucalctime) = trans.map_trans(small_geno_df,
                                                  phenotype_df,
                                                  batch_size=20000,
                                                  return_sparse=False,
                                                  pval_threshold=1,
                                                  maf_threshold=0.00,
                                                  device=device,
                                                  timing_file=timing_file)

        # Number of columns of the genotype table (presumably samples --
        # TODO confirm orientation of small_geno_df).
        n = small_geno_df.shape[1]
Beispiel #7
0
                group_s=None,
                run_eigenmt=True,
                output_dir=covdir)

# Convert the cis GxE pair tables of the listed chromosomes from parquet to
# CSV, next to the originals in `covdir`.
for i in (8, 9, 21):
    df = pd.read_parquet("".join((
        covdir,
        "tensorqtl_cis_MAF0.005_cisGxE_covid19.cis_qtl_pairs.",
        str(i),
        ".parquet",
    )))
    df.to_csv(
        "".join((
            covdir,
            "tensorqtl_cis_MAF0.005_cisGxE_covid19.cis_qtl_pairs.",
            str(i),
            ".csv",
        )),
        index=False,
    )

# Trans scan using the PEER covariate table and the phenotype table named
# `pheno_df_noACE2`; results are written to a single CSV in `outdir`.
trans_peer_df = trans.map_trans(
    genotype_df,
    pheno_df_noACE2,
    covariates_peer_df,
    return_sparse=True,
    maf_threshold=0.005,
)
trans_peer_df.to_csv("".join((
    outdir,
    "tensorqtl_trans_MAF0.005_all_age_sex_rin_batch_readDepth_PC10_PEER20_COVID19.csv",
)))

#################################################################
# chrX
import pandas as pd
import tensorqtl
from tensorqtl import genotypeio, cis, trans


# Function to re-add RSids as these are no longer in the vcf file