def read_genotype(plink_prefix):
    """Load the genotypes of a PLINK fileset into a DataFrame.

    Args:
        plink_prefix: prefix handed to ``genotypeio.PlinkReader``.

    Returns:
        A ``pd.DataFrame`` built from ``PlinkReader.load_genotypes()`` whose
        columns are taken from the ``iid`` column of the reader's ``fam``
        table. No row index is set explicitly here.
    """
    reader = genotypeio.PlinkReader(plink_prefix)
    genotypes = reader.load_genotypes()
    sample_ids = reader.fam['iid']
    # The row index is deliberately not set to reader.bim['snp'].
    return pd.DataFrame(genotypes, columns=sample_ids)
Exemple #2
0
def fetch_genotypes(snps, geno, plink_prefix, C):
    """Extract the requested SNPs into a new PLINK fileset and load it.

    The variant IDs in ``{geno}.bim`` are intersected with ``snps``; the
    matching IDs (sorted, de-duplicated) are passed to PLINK via ``--snps``
    with ``--make-bed``, writing a fileset at ``plink_prefix`` that is then
    read back with ``genotypeio.PlinkReader``.

    Args:
        snps: iterable of variant IDs to keep.
        geno: prefix of the source PLINK fileset (``--bfile``).
        plink_prefix: output prefix (``--out``), also the prefix loaded afterwards.
        C: configuration object; only ``C.plink`` (the PLINK command) is read.

    Returns:
        ``(genotype_df, variant_df)``: the result of
        ``PlinkReader.load_genotypes()`` and the reader's ``bim`` table
        indexed by ``snp`` with columns ``chrom`` and ``pos``.

    Raises:
        subprocess.CalledProcessError: if the PLINK command exits non-zero
            (``check=True``).
    """
    print('  * Loading genotypes')
    # Only the second .bim column (the variant ID) is needed.
    snp_ref = pd.read_csv(f'{geno}.bim', sep='\t', engine='c', memory_map=True, compression=None,
                          usecols=[1], names=['snp'], header=None)
    snp_list = (snp_ref[snp_ref['snp'].isin(set(snps))]
                .sort_values(by=['snp'])
    )['snp'].drop_duplicates().tolist()
    # NOTE(review): the command is interpolated into a shell string; paths or
    # IDs containing shell metacharacters would be interpreted by the shell.
    cmd = f'''{C.plink} \
    --bfile {geno} \
    --snps {', '.join(snp_list)} \
    --out {plink_prefix} \
    --make-bed \
    --silent
    '''
    # check=True already raises on a non-zero exit status, so no separate
    # returncode test is needed afterwards.
    subprocess.run(cmd, shell=True, check=True)
    pr = genotypeio.PlinkReader(plink_prefix, verbose=False)
    genotype_df = pr.load_genotypes()
    variant_df = pr.bim.set_index('snp')[['chrom', 'pos']]

    return genotype_df, variant_df
# Per-chromosome input locations. `path` and `chr` must already be defined as strings.
phenotype_bed_file = (
    f"{path}phenotypes/"
    f"INTERVAL_RNAseq_phase1-2_filteredSamplesGenes_TMMNormalised_FPKM_Counts_foranalysis_chr{chr}.bed.gz"
)
covariates_file = f"{path}covariates/INTERVAL_RNAseq_phase1-2_fullcovariates_foranalysis.txt"
plink_prefix_path = (
    f"{path}genotypes/"
    f"INTERVAL_RNAseq_Phase1-2_imputed_b38_biallelic_MAF0.005_chr{chr}"
)

# Phenotype matrix plus the accompanying position table.
phenotype_df, phenotype_pos_df = tensorqtl.read_phenotype_bed(phenotype_bed_file)

# Covariates: keep only the columns named after the phenotype columns, then transpose.
covariates_df = pd.read_csv(covariates_file, sep='\t', index_col=0)[phenotype_df.columns].T

# Genotypes: label rows with the bim `snp` column and columns with the fam `iid` column.
pr = genotypeio.PlinkReader(plink_prefix_path)
genotype_df = pd.DataFrame(pr.load_genotypes(), index=pr.bim['snp'], columns=pr.fam['iid'])
variant_df = pr.bim.set_index('snp')[['chrom', 'pos']]

# Run trans mapping and write the result table without its index.
trans_df = trans.map_trans(
    genotype_df, phenotype_df, covariates_df,
    return_sparse=True, maf_threshold=0.005,
)
trans_df.to_csv(f"{outpath}tensorqtl_trans_MAF0.005_chr{chr}.csv", index=False)
Exemple #4
0
# Load phenotypes and covariates; both must list the same samples in the same order.
phenotype_df, phenotype_pos_df = tensorqtl.read_phenotype_bed(pheno_file)
covariates_df = pd.read_csv(cov_file, sep='\t', index_col=0).T  # samples x covariates
assert np.all(phenotype_df.columns==covariates_df.index)

# Load interaction data.
# The `squeeze=True` keyword of read_csv was deprecated in pandas 1.4 and
# removed in 2.0; `.squeeze('columns')` is the documented replacement and
# yields a Series when the file has a single data column.
interaction_s = pd.read_csv(interaction_file, sep='\t', index_col=0, header=None).squeeze('columns')
# Keep only individuals that are present in the interaction dataset.
phenotype_df = phenotype_df.iloc[:, phenotype_df.columns.isin(interaction_s.index)]
covariates_df = covariates_df[covariates_df.index.isin(interaction_s.index)]
assert np.all(phenotype_df.columns==covariates_df.index)
assert covariates_df.index.isin(interaction_s.index).all()
# Reorder the interaction values to follow the covariate rows.
interaction_s = interaction_s.loc[covariates_df.index].astype(np.float32)

# Load genotypes (for VCFs with hard GT calls only, specify type as np.int8 to save memory)
pr = genotypeio.PlinkReader(geno_path, select_samples=phenotype_df.columns, dtype=np.int8)

# The variant table does not depend on the chromosome, so build it once.
variant_df = pr.bim.set_index('snp')[['chrom', 'pos']]

# Load genotypes for each chromosome separately and collect the per-chromosome results.
top_df = []
for chrom in pr.chrs:
    g, pos_s = pr.get_region(chrom)
    genotype_df = pd.DataFrame(g, index=pos_s.index, columns=pr.fam['iid'])[phenotype_df.columns]
    # Mask selecting the phenotypes located on this chromosome.
    on_chrom = phenotype_pos_df['chr'] == chrom
    # Map cis_nominal with interaction term and eigenMT correction
    chr_df = cis.map_nominal(
        genotype_df,
        variant_df[variant_df['chrom'] == chrom],
        phenotype_df[on_chrom],
        phenotype_pos_df[on_chrom],
        covariates_df,
        prefix,
        interaction_s=interaction_s,
        maf_threshold_interaction=0.1,
        window=1000000,
        output_dir=output_dir,
        write_top=False,
        run_eigenmt=True,
    )
    top_df.append(chr_df)

# Combine all chromosomes and write a single gzip-compressed, tab-separated table.
top_df = pd.concat(top_df)
top_df.to_csv(os.path.join(output_dir, '{}.cis_qtl_top_assoc.txt.gz'.format(prefix)), sep='\t', float_format='%.6g')

# if __name__ == '__main__':
Exemple #5
0
    excluded_chr_list = None
else:
    all_chrs_list.remove(chr_id)
    excluded_chr_list = all_chrs_list

# Log a timestamped banner; the QTL kind is the part of args.mode before the first underscore.
logger.write(
    '[{}] Running TensorQTL: {}-QTL mapping'.format(
        datetime.now().strftime("%b %d %H:%M:%S"), args.mode.split('_')[0]
    )
)

# logger = SimpleLogger()

# Phenotypes come from the expression BED file; covariates are transposed after reading.
phenotype_df, phenotype_pos_df = tensorqtl.read_phenotype_bed(expression_bed)
covariates_df = pd.read_csv(covariates_file, sep='\t', index_col=0).transpose()

# Genotypes and variant positions, with `excluded_chr_list` forwarded as `exclude_chrs`.
pr = genotypeio.PlinkReader(
    plink_prefix_path,
    exclude_chrs=excluded_chr_list,
)
genotype_df = pr.load_genotypes()
variant_df = pr.bim.set_index('snp')[['chrom', 'pos']]

if mode == 'cis':
    # cis-QTL: empirical p-values for phenotypes
    if excluded_chr_list:
        cis_df = cis.map_cis(
            genotype_df,
            variant_df,
            phenotype_df.loc[phenotype_pos_df['chr'] == chr_id],
            phenotype_pos_df.loc[phenotype_pos_df['chr'] == chr_id],
            covariates_df=covariates_df,
            seed=args.seed)
    else:
        cis_df = cis.map_cis(genotype_df,
# Phenotype matrix and position table from the expression BED file.
phenotype_df, phenotype_pos_df = tensorqtl.read_phenotype_bed(expression_bed)
#phenotype_gene_df, phenotype_gene_pos_df = tensorqtl.read_phenotype_bed(expression_gene_bed)

# Covariates: read, transpose, and cast every column to float64.
covariates_df = (
    pd.read_csv(covariates_file, sep='\t', index_col=0)
    .T
    .astype('float64')
)
#interaction_s = pd.Series(data=covariates_df['IRSnoAge'], index=covariates_df.index.values)

# Subset of rows whose SDC_AGE_CALC is below 46 (stored under the `old` names).
old = covariates_df['SDC_AGE_CALC'] < 46
covariates_df_old = covariates_df.loc[old]

# Interaction term (IRSnoAge) for all rows and for the subset.
interaction_s = pd.Series(covariates_df['IRSnoAge'], covariates_df.index.values)
interaction_s_old = pd.Series(covariates_df_old['IRSnoAge'], covariates_df_old.index.values)

# PLINK reader restricted to the subset's samples.
pr = genotypeio.PlinkReader(plink_prefix_path, select_samples=covariates_df_old.index)
genotype_df = pr.load_genotypes()
variant_df = pr.bim.set_index('snp')[['chrom', 'pos']]

# Store chromosome labels as strings.
phenotype_pos_df["chr"] = phenotype_pos_df["chr"].astype(str)

##########################
# Nominal eqtls - canonical
##########################

##cis.map_nominal(genotype_df, variant_df,
#                phenotype_df.loc[phenotype_pos_df['chr'].isin(['1','2','3','4','5','6','7','8','9','10','11','12','13','14','15','16','17','18','19','20','21','22'])],
#                phenotype_pos_df.loc[phenotype_pos_df['chr'].isin(['1','2','3','4','5','6','7','8','9','10','11','12','13','14','15','16','17','18','19','20','21','22'])],
#                prefix, covariates_df=covariates_df)

##########################
outdir = covdir

# Phenotypes and PEER covariates.
phenotype_df, phenotype_pos_df = tensorqtl.read_phenotype_bed(
    phenotype_bed_file)
covariates_peer_df = pd.read_csv(covariates_peer, sep='\t',
                                 index_col=0).T  # samples x covariates

# Interaction values.
# The `squeeze=True` keyword of read_csv was deprecated in pandas 1.4 and
# removed in 2.0; `.squeeze("columns")` is the documented replacement.
interaction_s = pd.read_csv(
    "/rds/project/jmmh2/rds-jmmh2-projects/interval_rna_seq/covid19/INTERVAL_RNAseq_COVID19_neutPCT_GxE.txt",
    sep="\t",
    index_col=0).squeeze("columns").T
interaction_s = interaction_s.squeeze()

# Genotypes, labelled by the bim `snp` column (rows) and fam `iid` column (columns).
plink_prefix_path = "/rds/user/jm2294/rds-jmmh2-projects/interval_rna_seq/analysis/03_tensorqtl/genotypes/INTERVAL_b38_autosomes_RNAseqPhase1_biallelic_all_MAF0.005"
pr = genotypeio.PlinkReader(plink_prefix_path)
# NOTE(review): confirm the installed tensorqtl still exposes get_all_genotypes().
genotype_df = pd.DataFrame(pr.get_all_genotypes(),
                           index=pr.bim['snp'],
                           columns=pr.fam['iid'])
variant_df = pr.bim.set_index('snp')[['chrom', 'pos']]

# cis
# Cis gene-level mapping
# Remove two phenotype rows from both the phenotype and position tables.
# NOTE(review): the variable names mention only ACE2, but two IDs are dropped.
genes_to_drop = ["ENSG00000130234", "ENSG00000184012"]
pheno_df_noACE2 = phenotype_df.drop(genes_to_drop)
phenopos_df_noACE2 = phenotype_pos_df.drop(genes_to_drop)

cis_df = cis.map_cis(genotype_df, variant_df, pheno_df_noACE2,
                     phenopos_df_noACE2, covariates_peer_df)