コード例 #1
0
ファイル: test_plots.py プロジェクト: Nealelab/preimp_qc
def man_qq_plts(mt):
    """Render Manhattan and QQ plots side by side as an inline HTML image.

    A per-variant linear regression of ``mt.is_case`` on the alternate
    allele count is run with an intercept as the only covariate. The
    resulting p-values are plotted with ``qqman`` and the figure is
    returned as an ``<img>`` tag whose source is the base64-encoded PNG.
    """
    regression = hl.linear_regression_rows(
        y=mt.is_case,
        x=mt.GT.n_alt_alleles(),
        covariates=[1.0],
    )

    # Only the p-value is selected; the locus columns read below are expected
    # to be present in the resulting pandas frame.
    frame = regression.select(regression.p_value).to_pandas()

    plot_df = frame[['locus.contig', 'locus.position', 'p_value']]
    plot_df.columns = ['CHR', 'BP', 'P']
    plot_df = plot_df.dropna()
    # Map the non-numeric contig labels onto integer codes.
    plot_df = plot_df.replace(to_replace=["X", "Y", "MT"],
                              value=[23, 24, 25])

    png = io.BytesIO()
    figure, (manhattan_ax, qq_ax) = plt.subplots(nrows=1,
                                                 ncols=2,
                                                 figsize=(25, 10))
    qqman.manhattan(plot_df,
                    ax=manhattan_ax,
                    xrotation=90.0,
                    title="Manhattan plot")
    qqman.qqplot(plot_df, ax=qq_ax, title="QQ plot")
    figure.tight_layout()

    # Serialise the current figure, then release it.
    plt.savefig(png, format='PNG')
    plt.clf()
    plt.close()

    encoded = base64.b64encode(png.getvalue()).decode('ascii')
    return '<img src="data:image/png;base64,{}">'.format(encoded)
コード例 #2
0
ファイル: gwas_hail.py プロジェクト: hail-is/batch-demo
def run_gwas(vcf_file, phenotypes_file, output_file):
    """Run a GWAS of ``CaffeineConsumption`` and export the results.

    The VCF is written to ``tmp.mt`` and read back, phenotypes are joined
    on the ``Sample`` column, and the first three PCA scores (computed on a
    1% sample of rows) are used as covariates next to the intercept.

    Outputs:
        ``<output_file>.assoc``: one p-value per variant, keyed by ``SNP``.
        PLINK files under the ``output_file`` prefix.
    """
    phenotypes = hl.import_table(phenotypes_file, impute=True)
    phenotypes = phenotypes.key_by('Sample')

    hl.import_vcf(vcf_file).write('tmp.mt')
    mt = hl.read_matrix_table('tmp.mt')
    mt = mt.annotate_cols(pheno=phenotypes[mt.s])

    # PCA runs on a fixed-seed 1% sample of the rows.
    pca_input = mt.sample_rows(0.01, seed=11223344)
    _eigenvalues, scores_ht, _loadings = hl.hwe_normalized_pca(pca_input.GT)
    mt = mt.annotate_cols(scores=scores_ht[mt.s].scores)

    covariates = [1.0] + [mt.scores[k] for k in range(3)]
    results = hl.linear_regression_rows(
        y=mt.pheno.CaffeineConsumption,
        x=mt.GT.n_alt_alleles(),
        covariates=covariates,
    )

    # Reduce the result to a SNP-keyed table holding only the p-value.
    results = results.select(
        SNP=hl.variant_str(results.locus, results.alleles),
        P=results.p_value,
    )
    results = results.key_by(results.SNP)
    results = results.select(results.P)
    results.export(f'{output_file}.assoc', header=True)

    hl.export_plink(mt, output_file, fam_id=mt.s, ind_id=mt.s)
コード例 #3
0
def gwas(mt,
         x,
         y,
         cov_list=[],
         with_intercept=True,
         pass_through=[],
         path_to_save=None,
         normalize_x=False,
         is_std_cov_list=False):
    '''Run a linear-regression GWAS in Hail and align it to a sumstats template.

    Args:
        mt: matrix table to annotate and regress on.
        x: entry expression used as the predictor.
        y: column expression used as the response.
        cov_list: covariates; strings are looked up as fields of ``mt``.
        with_intercept: prepend ``1`` to the covariates when true.
        pass_through: extra row fields to carry through (``rsid`` is
            always included).
        path_to_save: if not ``None``, the result is exported to this path.
        normalize_x: standardise ``x`` per row (mean/stdev) when true.
        is_std_cov_list: replace ``cov_list`` with the standard set
            (sex/age terms plus ``PC1``..``PC20``).

    Returns:
        The template table annotated with ``Z`` and ``N``.
    '''
    mt = mt._annotate_all(col_exprs={'__y': y}, entry_exprs={'__x': x})

    if normalize_x:
        mt = mt.annotate_rows(__gt_stats=hl.agg.stats(mt.__x))
        centered = mt.__x - mt.__gt_stats.mean
        mt = mt.annotate_entries(__x=centered / mt.__gt_stats.stdev)
        mt = mt.drop('__gt_stats')

    if is_std_cov_list:
        principal_components = ['PC{:}'.format(i) for i in range(1, 21)]
        cov_list = [
            'isFemale', 'age', 'age_squared', 'age_isFemale',
            'age_squared_isFemale'
        ] + principal_components

    # Covariates given by name are resolved against the matrix table.
    if any(type(cov) is str for cov in cov_list):
        cov_list = [mt[cov] if type(cov) is str else cov for cov in cov_list]

    intercept = [1] if with_intercept else []
    cov_list = intercept + cov_list

    pass_through = list(set(['rsid'] + pass_through))
    print(f'variables to pass through: {pass_through}')

    results = hl.linear_regression_rows(y=mt.__y,
                                        x=mt.__x,
                                        covariates=cov_list,
                                        pass_through=pass_through)
    results = results.annotate_globals(with_intercept=with_intercept)
    results = results.rename({'rsid': 'SNP'}).key_by('SNP')
    results = results.select(Z=results.t_stat, N=results.n)

    # The sumstats template is stored as a Hail table.
    template = hl.read_table('gs://nbaya/rg_sex/hm3.sumstats_template.ht')
    template = template.key_by('SNP')

    matched = results[template.SNP]
    ss = template.annotate(Z=matched.Z, N=matched.N)

    if path_to_save is not None:
        ss.export(path_to_save)

    return ss
コード例 #4
0
def run_grouped_regressions(mt, ss_output, pheno, pheno_name):
    """Regress every phenotype in ``pheno`` on dosage and write the table.

    Each phenotype is passed as its own single-element group. All fields
    of ``mt['covariates']`` plus an intercept are used as covariates. The
    result is written to ``ss_output + pheno_name + '.ht'``; the overwrite
    flag is read from the module-level ``args``.
    """
    response_groups = [[mt['phenotypes'][name]] for name in pheno]

    covariate_struct = mt['covariates']
    covariate_exprs = [
        covariate_struct[key] for key in list(covariate_struct.keys())
    ]

    ht = hl.linear_regression_rows(y=response_groups,
                                   x=mt.dosage,
                                   covariates=[1] + covariate_exprs,
                                   pass_through=['varid', 'rsid'])

    # NOTE(original author): "check this".
    ht = ht.annotate_globals(phenotypes=pheno)

    destination = ss_output + pheno_name + '.ht'
    ht.write(destination, overwrite=args.overwrite)
コード例 #5
0
def gwas(mt,
         x,
         y,
         cov_list=[],
         with_intercept=True,
         pass_through=[],
         path_to_save=None,
         normalize_x=True,
         is_std_cov_list=False):
    '''Run a linear-regression GWAS and return the SNP-keyed Z/N table.

    Args:
        mt: matrix table to annotate and regress on.
        x: entry expression used as the predictor.
        y: column expression used as the response.
        cov_list: covariates; strings are looked up as fields of ``mt``.
        with_intercept: prepend ``1`` to the covariates when true.
        pass_through: extra row fields to carry through after ``rsid``.
        path_to_save: if not ``None``, template-aligned sumstats are
            exported to this path.
        normalize_x: run ``normalize_genotypes`` on ``x`` when true.
        is_std_cov_list: replace ``cov_list`` with the standard set
            (sex/age terms plus ``PC1``..``PC20``).

    Returns:
        The regression table keyed by ``SNP`` with fields ``Z`` and ``N``
        (not the template-aligned sumstats, which are only exported).
    '''
    mt = mt._annotate_all(col_exprs={'y': y}, entry_exprs={'x': x})

    if normalize_x:
        # The normalised value is read back from the `__norm_gt` entry field.
        mt = normalize_genotypes(mt, mt.x)
        mt = mt.annotate_entries(x=mt.__norm_gt)
        mt = mt.drop('__norm_gt')

    if is_std_cov_list:
        principal_components = ['PC{:}'.format(i) for i in range(1, 21)]
        cov_list = [
            'isFemale', 'age', 'age_squared', 'age_isFemale',
            'age_squared_isFemale'
        ] + principal_components

    # Covariates given by name are resolved against the matrix table.
    if any(type(cov) is str for cov in cov_list):
        cov_list = [mt[cov] if type(cov) is str else cov for cov in cov_list]

    intercept = [1] if with_intercept else []
    cov_list = intercept + cov_list

    print(pass_through)

    gwas_ht = hl.linear_regression_rows(y=mt.y,
                                        x=mt.x,
                                        covariates=cov_list,
                                        pass_through=['rsid'] + pass_through)
    gwas_ht = gwas_ht.annotate_globals(with_intercept=with_intercept)
    gwas_ht = gwas_ht.rename({'rsid': 'SNP'}).key_by('SNP')
    gwas_ht = gwas_ht.select(Z=gwas_ht.t_stat, N=gwas_ht.n)

    template = hl.import_table('gs://nbaya/rg_sex/50_snps_alleles_N.tsv.gz',
                               types={'N': hl.tint64})
    template = template.key_by('SNP')

    matched = gwas_ht[template.SNP]
    sumstats = template.annotate(Z=matched.Z, N=matched.N)

    if path_to_save is not None:
        sumstats.export(path_to_save)

    return gwas_ht
コード例 #6
0
def linear_regression_rows(mt_path):
    """Exercise ``hl.linear_regression_rows`` with random column fields.

    Reads the matrix table at ``mt_path``, adds 100 uniform-random
    phenotype columns and 20 uniform-random covariate columns, regresses
    all phenotypes on ``mt.x`` and forces evaluation of the result.
    """
    mt = hl.read_matrix_table(mt_path)

    n_phenos, n_covs = 100, 20
    random_phenos = {
        f"pheno_{idx}": hl.rand_unif(0, 1) for idx in range(n_phenos)
    }
    random_covs = {f"cov_{idx}": hl.rand_unif(0, 1) for idx in range(n_covs)}

    mt = mt.annotate_cols(**random_phenos)
    mt = mt.annotate_cols(**random_covs)

    responses = [mt[name] for name in random_phenos]
    covariates = [mt[name] for name in random_covs]
    result = hl.linear_regression_rows(y=responses,
                                       x=mt.x,
                                       covariates=covariates)
    result._force_count()
コード例 #7
0
def run_gwas(mt,
             phen: str,
             sim_name: str,
             subset_idx: int,
             param_suffix: str,
             wd: str,
             is_logreg=True):
    """Run (or reload) a GWAS of ``phen`` on one sample subset.

    Args:
        mt: matrix table with a ``GT`` or ``dosage`` entry field and a
            ``subset_idx`` column field.
        phen: name of the phenotype column field.
        sim_name: simulation name, used in the output file name.
        subset_idx: columns with this ``subset_idx`` value are kept.
        param_suffix: suffix used in the output file name.
        wd: directory (prefix) for the exported results.
        is_logreg: use Wald logistic regression when true, otherwise
            linear regression.

    Returns:
        The regression table. When the results file already exists it is
        re-imported and keyed by ``locus`` and ``alleles`` instead of
        being recomputed.

    Raises:
        AssertionError: if ``mt`` has neither a ``GT`` nor a ``dosage``
            entry field.
    """
    # Test the intersection's truthiness: comparing a set against the dict
    # literal `{}` is always unequal, so the old check could never fail.
    assert {'GT', 'dosage'}.intersection(
        mt.entry
    ), "mt does not have an entry field named 'dosage' or 'GT' corresponding to genotype data"

    mt = mt.filter_cols(mt.subset_idx == subset_idx)
    mt = mt.filter_cols(hl.is_defined(mt[phen]))
    print(
        f'\n\ngwas sample count (subset {subset_idx}): {mt.count_cols()}\n\n')

    # Effect allele frequency: mean alternate-allele dosage divided by two.
    if 'dosage' in mt.entry:
        mt = mt.annotate_rows(EAF=hl.agg.mean(mt.dosage) / 2)
    elif 'GT' in mt.entry:
        mt = mt.annotate_rows(EAF=hl.agg.mean(mt.GT.n_alt_alleles()) / 2)

    gwas_path = f'{wd}/gwas.{"logreg" if is_logreg else "linreg"}.{sim_name}.subset_{subset_idx}.{param_suffix}.tsv.gz'

    if not hl.hadoop_is_file(gwas_path):
        # `dosage` takes precedence over `GT` when both are present.
        gt_field = mt.dosage if 'dosage' in mt.entry else mt.GT.n_alt_alleles()

        if is_logreg:
            gwas_ht = hl.logistic_regression_rows(test='wald',
                                                  y=mt[phen],
                                                  x=gt_field,
                                                  covariates=[1],
                                                  pass_through=['EAF'])
        else:
            gwas_ht = hl.linear_regression_rows(y=mt[phen],
                                                x=gt_field,
                                                covariates=[1],
                                                pass_through=['EAF'])
        gwas_ht.select('EAF', 'beta', 'standard_error',
                       'p_value').export(gwas_path)

    else:
        print(f'GWAS already run! ({gwas_path})')
        gwas_ht = hl.import_table(gwas_path, impute=True, force=True)
        # The alleles column is stored as text like ["A","T"]; strip the
        # brackets/quotes and split it back into an array. Raw strings keep
        # the same patterns without relying on invalid escape sequences.
        gwas_ht = gwas_ht.annotate(locus=hl.parse_locus(gwas_ht.locus),
                                   alleles=gwas_ht.alleles.replace(
                                       r'\["', '').replace(r'"\]',
                                                           '').split('","'))
        gwas_ht = gwas_ht.key_by('locus', 'alleles')

    return gwas_ht
コード例 #8
0
ファイル: run_gwas.py プロジェクト: saponas/hail
def run_gwas(vcf_file, phenotypes_file, output_file):
    """Run a QC'd GWAS of ``CaffeineConsumption`` and export the results.

    Steps: import the VCF via ``tmp.mt``, join phenotypes on ``Sample``,
    filter samples on depth and call rate, filter genotypes on allele
    balance, filter variants on allele frequency, compute PCA scores and
    regress the phenotype on the alternate allele count.

    Outputs:
        ``<output_file>.assoc``: one p-value per variant, keyed by ``SNP``.
        PLINK files under the ``output_file`` prefix.
    """
    phenotypes = hl.import_table(phenotypes_file, impute=True)
    phenotypes = phenotypes.key_by('Sample')

    hl.import_vcf(vcf_file).write('tmp.mt')
    mt = hl.read_matrix_table('tmp.mt')
    mt = mt.annotate_cols(pheno=phenotypes[mt.s])

    # Sample-level QC.
    mt = hl.sample_qc(mt)
    depth_ok = mt.sample_qc.dp_stats.mean >= 4
    call_rate_ok = mt.sample_qc.call_rate >= 0.97
    mt = mt.filter_cols(depth_ok & call_rate_ok)

    # Genotype-level QC on allele balance.
    ab = mt.AD[1] / hl.sum(mt.AD)
    hom_ref_ok = mt.GT.is_hom_ref() & (ab <= 0.1)
    het_ok = mt.GT.is_het() & (ab >= 0.25) & (ab <= 0.75)
    hom_var_ok = mt.GT.is_hom_var() & (ab >= 0.9)
    mt = mt.filter_entries(hom_ref_ok | het_ok | hom_var_ok)

    # Variant-level QC.
    mt = hl.variant_qc(mt)
    mt = mt.filter_rows(mt.variant_qc.AF[1] > 0.01)

    _eigenvalues, scores_ht, _loadings = hl.hwe_normalized_pca(mt.GT)
    mt = mt.annotate_cols(scores=scores_ht[mt.s].scores)

    covariates = [1.0, mt.pheno.isFemale] + [mt.scores[k] for k in range(3)]
    results = hl.linear_regression_rows(y=mt.pheno.CaffeineConsumption,
                                        x=mt.GT.n_alt_alleles(),
                                        covariates=covariates)

    # Reduce the result to a SNP-keyed table holding only the p-value.
    results = results.select(
        SNP=hl.variant_str(results.locus, results.alleles),
        P=results.p_value,
    )
    results = results.key_by(results.SNP)
    results = results.select(results.P)
    results.export(f'{output_file}.assoc', header=True)

    hl.export_plink(mt, output_file, fam_id=mt.s, ind_id=mt.s)
コード例 #9
0
        if n_chunks == 2:  #traditional split of population into even halves (no meta-analysis)
            mt_A = mt.filter_cols(mt.group_id == 0)
            mt_B = mt.filter_cols(mt.group_id == 1)

            cov_list_A = [
                mt_A['isFemale'], mt_A['age'], mt_A['age_squared'],
                mt_A['age_isFemale'], mt_A['age_squared_isFemale']
            ] + [mt_A['PC{:}'.format(i)] for i in range(1, 21)]

            cov_list_B = [
                mt_B['isFemale'], mt_B['age'], mt_B['age_squared'],
                mt_B['age_isFemale'], mt_B['age_squared_isFemale']
            ] + [mt_B['PC{:}'.format(i)] for i in range(1, 21)]

            ht_A = hl.linear_regression_rows(y=mt_A.y,
                                             x=mt_A.x,
                                             covariates=[1] + cov_list_A,
                                             pass_through=['rsid'])

            ht_B = hl.linear_regression_rows(y=mt_B.y,
                                             x=mt_B.x,
                                             covariates=[1] + cov_list_B,
                                             pass_through=['rsid'])

            ht_A = ht_A.rename({'rsid': 'SNP'}).key_by('SNP')
            ht_B = ht_B.rename({'rsid': 'SNP'}).key_by('SNP')

            ht_A = ht_A.select(Z=ht_A.beta / ht_A.standard_error)
            ht_B = ht_B.select(Z=ht_B.beta / ht_B.standard_error)

            sumstats_template = hl.import_table(
                'gs://nbaya/rg_sex/50_snps_alleles_N.tsv.gz',
コード例 #10
0
    test="firth",  #controls false positives
    y=hl.float(mt.AffectionBool),
    x=mt.GT.n_alt_alleles(),
    covariates=[
        1,
        hl.float(mt.PackYear), mt.scores[0], mt.scores[1], mt.scores[2],
        mt.scores[3], mt.scores[4], mt.scores[5], mt.scores[6], mt.scores[7],
        mt.scores[8], mt.scores[9]
    ])

######## 6.2 Linear regression ~ FEV1% predicted (lung function variable)
# NOTE: this call is plain `hl.linear_regression_rows`; no `test=` argument
# (such as "firth") is passed here.
# Covariates: intercept, pack-years, and the scores at indices 0 through 9.
gwas = hl.linear_regression_rows(y=hl.float(mt.FEV1),
                                 x=mt.GT.n_alt_alleles(),
                                 covariates=[
                                     1,
                                     hl.float(mt.PackYear), mt.scores[0],
                                     mt.scores[1], mt.scores[2], mt.scores[3],
                                     mt.scores[4], mt.scores[5], mt.scores[6],
                                     mt.scores[7], mt.scores[8], mt.scores[9]
                                 ])

######## 6.3 Q-Q plot
qqplot = hl.plot.qq(gwas.p_value)
show(qqplot)

######## 6.4 Manhattan-like plots
# GWAS significance level = 5.0 10e-8, suggestive: 5.0 10e-8 < P < 5.0 * 10e-6.
# The two cut-offs computed below are derived from the row count of `mt`
# instead: 0.05 / n_rows and 1 / n_rows, both on the -log10 scale.

# Calculate Bonferroni based cut off lines
Bonferroni_line = -np.log10(0.05 / mt.count_rows())
Suggestive_line = -np.log10(1 / mt.count_rows())
コード例 #11
0
args = parser.parse_args()


# Convert the VCF to a matrix table under the work directory and read it back.
hl.import_vcf(args.vcf_path).write(args.workdir + 'hail.mt', overwrite=True)
mt = hl.read_matrix_table(args.workdir + 'hail.mt')
# Sample annotations, keyed by the `Sample` column.
table = (hl.import_table(args.annotation_path, impute=True)
     .key_by('Sample'))

# Start benchmarking after I/O
t0 = time.time()
mt = mt.annotate_cols(pheno = table[mt.s])
# Sample QC: keep samples with mean depth >= 4 and call rate >= 0.97.
mt = hl.sample_qc(mt)
mt = mt.filter_cols((mt.sample_qc.dp_stats.mean >= 4) & (mt.sample_qc.call_rate >= 0.97))
# Allele balance: second AD element divided by the sum of AD.
ab = mt.AD[1] / hl.sum(mt.AD)

# Keep a genotype only if its allele balance is consistent with the call.
filter_condition_ab = ((mt.GT.is_hom_ref() & (ab <= 0.1)) |
                    (mt.GT.is_het() & (ab >= 0.25) & (ab <= 0.75)) |
                    (mt.GT.is_hom_var() & (ab >= 0.9)))

# Fraction of entries failing the allele-balance condition (not used again in
# the visible part of the script).
fraction_filtered = mt.aggregate_entries(hl.agg.fraction(~filter_condition_ab))
mt = mt.filter_entries(filter_condition_ab)
# Variant QC: allele-frequency and HWE p-value filters.
mt = hl.variant_qc(mt)
mt = mt.filter_rows(mt.variant_qc.AF[1] > 0.01)
mt = mt.filter_rows(mt.variant_qc.p_value_hwe > 1e-6)
# Intercept-only linear regression of caffeine consumption on alt-allele count.
gwas = hl.linear_regression_rows(y=mt.pheno.CaffeineConsumption,
                             x=mt.GT.n_alt_alleles(),
                             covariates=[1.0])
p = hl.plot.manhattan(gwas.p_value)

# Elapsed wall-clock time since `t0`.
print('Time Elapsed: {}'.format(time.time()- t0))
コード例 #12
0
def gwas(y, x, cov):
    """Run ``hl.linear_regression_rows`` with ``rsid`` passed through.

    Args:
        y: response expression(s), forwarded unchanged.
        x: predictor expression, forwarded unchanged.
        cov: covariate list, forwarded unchanged.

    Returns:
        The table returned by ``hl.linear_regression_rows``.
    """
    return hl.linear_regression_rows(
        y=y,
        x=x,
        covariates=cov,
        pass_through=['rsid'],
    )
コード例 #13
0
# COMMAND ----------

# Report the dimensions of the filtered matrix table.
print('Samples: %d  Variants: %d' %
      (common_mt.count_cols(), common_mt.count_rows()))

# COMMAND ----------

# MAGIC %md These filters removed about 15% of sites (we started with a bit over 10,000). This is _NOT_ representative of most sequencing datasets! We have already downsampled the full thousand genomes dataset to include more common variants than we'd expect by chance.
# MAGIC
# MAGIC In Hail, the association tests accept column fields for the sample phenotype and covariates. Since we've already got our phenotype of interest (caffeine consumption) in the dataset, we are good to go:

# COMMAND ----------

# Intercept-only linear regression of caffeine consumption on the number of
# alternate alleles; `describe()` prints the schema of the result rows.
gwas = hl.linear_regression_rows(y=common_mt.CaffeineConsumption,
                                 x=common_mt.GT.n_alt_alleles(),
                                 covariates=[1.0])
gwas.row.describe()

# COMMAND ----------

# MAGIC %md Looking at the bottom of the above printout, you can see the linear regression adds new row fields for the beta, standard error, t-statistic, and p-value.
# MAGIC
# MAGIC Hail makes it easy to make a [Q-Q (quantile-quantile) plot](https://en.wikipedia.org/wiki/Q–Q_plot).

# COMMAND ----------

# Q-Q plot of the regression p-values, rendered through `displayBokeh`.
p = hl.plot.qq(gwas.p_value)
displayBokeh(p)

# COMMAND ----------
コード例 #14
0
def gwas(mt,
         x,
         y,
         cov_list=[],
         with_intercept=True,
         pass_through=[],
         path_to_save=None,
         normalize_x=False,
         is_std_cov_list=False):
    r'''Run a linear-regression GWAS and build a full sumstats table.

    Args:
        mt: matrix table to annotate and regress on. It must have a
            ``dosage`` entry field and an ``rsid`` row field; the allele
            frequency is always computed from ``dosage``, not from ``x``.
        x: entry expression used as the predictor.
        y: column expression used as the response.
        cov_list: covariates; strings are looked up as fields of ``mt``.
        with_intercept: prepend ``1`` to the covariates when true.
        pass_through: extra row fields to carry through after ``rsid``.
        path_to_save: if not ``None``, the result is exported to this path.
        normalize_x: standardise ``x`` per row (mean/stdev) when true.
        is_std_cov_list: replace ``cov_list`` with the standard set
            (sex/age terms plus ``PC1``..``PC20``).

    Returns:
        The sumstats template annotated with ``chr``, ``bpos``, ``freq``,
        ``beta``, ``z``, ``pval`` and ``n``, with ``SNP``/``A1``/``A2``
        renamed to ``snpid``/``a1``/``a2`` and the template ``N`` dropped.
    '''

    mt = mt._annotate_all(col_exprs={'__y': y}, entry_exprs={'__x': x})

    print('\n... Calculating allele frequency ...')
    # Frequency of the alternate allele: mean dosage divided by two.
    mt_freq_rows = mt.annotate_rows(freq=hl.agg.mean(mt.dosage) / 2).rows()
    mt_freq_rows = mt_freq_rows.key_by('rsid')

    if normalize_x:
        mt = mt.annotate_rows(__gt_stats=hl.agg.stats(mt.__x))
        mt = mt.annotate_entries(__x=(mt.__x - mt.__gt_stats.mean) /
                                 mt.__gt_stats.stdev)
        mt = mt.drop('__gt_stats')

    if is_std_cov_list:
        cov_list = [
            'isFemale', 'age', 'age_squared', 'age_isFemale',
            'age_squared_isFemale'
        ] + ['PC{:}'.format(i) for i in range(1, 21)]

    # Covariates given by name are resolved against the matrix table.
    cov_list = [mt[cov] if isinstance(cov, str) else cov for cov in cov_list]

    cov_list = ([1] if with_intercept else []) + cov_list

    print(f'pass through: {pass_through}')

    gwas_ht = hl.linear_regression_rows(y=mt.__y,
                                        x=mt.__x,
                                        covariates=cov_list,
                                        pass_through=['rsid'] + pass_through)

    gwas_ht = gwas_ht.annotate_globals(with_intercept=with_intercept)

    gwas_ht = gwas_ht.key_by('rsid')

    # The sumstats template is stored as a Hail table.
    ss_template = hl.read_table('gs://nbaya/rg_sex/hm3.sumstats_template.ht')
    ss_template = ss_template.key_by('SNP')

    matched = gwas_ht[ss_template.SNP]
    ss = ss_template.annotate(chr=matched.locus.contig,
                              bpos=matched.locus.position,
                              freq=mt_freq_rows[ss_template.SNP].freq,
                              beta=matched.beta,
                              z=matched.t_stat,
                              pval=matched.p_value,
                              n=matched.n)
    ss = ss.drop('N')
    ss = ss.rename({'SNP': 'snpid', 'A1': 'a1', 'A2': 'a2'})

    # `describe` already prints the schema; wrapping it in `print` only added
    # a trailing "None" line.
    ss.describe()

    if path_to_save is not None:
        ss.export(path_to_save)

    return ss
コード例 #15
0
 # Regress each normalised full-blood-count phenotype listed below on the
 # alternate-allele count, with an intercept as the only covariate and `rsid`
 # carried through to the result.
 gwas = hl.linear_regression_rows(
     y=[
         mt.sample_qc_and_phenotype.fbc.neut_p_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.eo_p_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.mono_p_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.lymph_p_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.baso_p_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.ret_p_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.hlr_p_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.hct_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.pct_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.hgb_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.rbc_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.wbc_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.mpv_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.plt_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.rdw_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.pdw_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.mcv_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.mch_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.mchc_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.ret_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.hlr_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.neut_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.mono_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.baso_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.eo_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.lymph_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.irf_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.myeloid_wbc_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.gran_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.eo_baso_sum_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.neut_eo_sum_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.baso_neut_sum_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.gran_p_myeloid_wbc_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.eo_p_gran_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.neut_p_gran_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.baso_p_gran_gwas_normalised
         # Leftover, disabled entries (both fields already appear above):
         #                   mt.sample_qc_and_phenotype.fbc.wbc_gwas_normalised,
         #                   mt.sample_qc_and_phenotype.fbc.mpv_gwas_normalised],
     ],
     x=mt.GT.n_alt_alleles(),
     covariates=[1.0],
     pass_through=[mt.rsid])
コード例 #16
0
u = 8

rgs = [-0.3, -0.1, 0, 0.1, 0.3]

# Import matrix and annotate cols with phenotypes
mt = hl.read_matrix_table(input_matrix)

mt = mt.annotate_cols(U=mt.y[u])
mt = mt.annotate_cols(X=mt.y[x])

for i in range(5):
    mt = mt.annotate_cols(Y=mt.y[i + 2])

    # GWAS of X, Y in all
    result_ht = hl.linear_regression_rows(y=[mt.X, mt.Y],
                                          x=mt.GT.n_alt_alleles(),
                                          covariates=[1],
                                          pass_through=['rsid'])

    result_ht = result_ht.annotate(A1=result_ht.alleles[0],
                                   A2=result_ht.alleles[1]).key_by()

    file_names = [
        output_bucket + 'gwas/gwas_X_' + str(rgs[i]) + '.tsv',
        output_bucket + 'gwas/gwas_Y_' + str(rgs[i]) + '.tsv'
    ]

    for j, file_name in enumerate(file_names):
        result_ht.select(result_ht.locus,
                         result_ht.A1,
                         result_ht.A2,
                         result_ht.rsid,
コード例 #17
0
# Load imputed dosages for the selected contig, restricted to `ht_variants`.
mt = hl.import_bgen(
    path=
    f'gs://fc-7d5088b4-7673-45b5-95c2-17ae00a04183/imputed/ukb_imp_{contig_expr}_v3.bgen',
    sample_file=f'gs://ukb31063/ukb31063.{contig}.sample',
    entry_fields=['dosage'],
    variants=ht_variants)

# Attach per-sample phenotype and covariate structs, joined on sample ID.
mt = mt.annotate_cols(phenotypes=ht_phenotypes[mt.s],
                      covariates=ht_covariates[mt.s])

phenotypes = list(mt['phenotypes'].keys())

# Each phenotype is wrapped in its own single-element list, i.e. one group per
# phenotype. Covariates are an intercept plus every field of `mt.covariates`.
ht = hl.linear_regression_rows(
    y=[[mt['phenotypes'][y]] for y in phenotypes],
    x=mt.dosage,
    covariates=[
        1, *[mt['covariates'][x] for x in list(mt['covariates'].keys())]
    ],
    pass_through=['varid', 'rsid'])

# Record the phenotype names as a global on the result table.
ht = ht.annotate_globals(phenotypes=phenotypes)

# The two branches differ only in the `.dilution_factor` part of the path.
if dilution:
    ht.write(
        f'gs://ukb31063-mega-gwas/biomarkers/results/ukb31063.biomarker_gwas_results.{sex}.{contig}.pipeline_{pipeline}.dilution_factor.ht',
        overwrite=True)
else:
    ht.write(
        f'gs://ukb31063-mega-gwas/biomarkers/results/ukb31063.biomarker_gwas_results.{sex}.{contig}.pipeline_{pipeline}.ht',
        overwrite=True)
コード例 #18
0
def main(args):
    """Simulate spike-and-slab phenotypes on chr22 and run GWAS on them.

    Three stages are selected independently by flags on ``args``:

    * ``compute_true_phenotypes``: sum ``beta * dosage`` per sample for each
      beta column and write the column table for samples present in the
      covariates file.
    * ``run_gwas``: for 10 random subsets of 10000 samples, regress the
      simulated phenotypes on dosage with ``PC1``..``PC20`` as covariates,
      export the subset and write the result table.
    * ``write_gwas``: flatten each result table and export it as TSV.
    """
    covariates_tsv = 'gs://phenotype_31063/ukb31063.gwas_covariates.both_sexes.tsv'
    dosage_mt_path = 'gs://phenotype_31063/hail/imputed/ukb31063.dosage.autosomes.mt'
    true_prs_path = 'gs://armartin/mama/spike_slab/BBJ_UKB_hm3.chr22.cm.beta.true_PRS.ht'
    inds_prefix = 'gs://armartin/mama/spike_slab/UKB_hm3.chr22.cm.beta.true_PRS.gwas_inds_'
    sumstat_prefix = 'gs://armartin/mama/spike_slab/UKB_hm3.chr22.cm.beta.true_PRS.gwas_sumstat_'
    n_replicates = 10

    betas = ['beta_01', 'beta_1', 'beta_10', 'beta_100']
    spike_slab = hl.import_table(
        'gs://armartin/mama/spike_slab/BBJ_UKB_hm3.chr22.cm.beta.txt',
        impute=True)
    spike_slab = spike_slab.key_by(**hl.parse_variant(spike_slab.v))

    def causal_chr22_sites(matrix):
        # Restrict to chr22, then keep only rows with a causal effect entry.
        matrix = hl.filter_intervals(matrix, [hl.parse_locus_interval('22')])
        matrix = matrix.annotate_rows(ss=spike_slab[matrix.row_key])
        return matrix.filter_rows(hl.is_defined(matrix.ss))

    if args.compute_true_phenotypes:
        # Samples listed in the covariates file (the white British subset).
        eur = hl.import_table(covariates_tsv).key_by('s')

        mt = causal_chr22_sites(hl.read_matrix_table(dosage_mt_path))

        # True PRS (i.e. the phenotypes), one per beta column.
        true_prs = {
            name: hl.agg.sum(mt.ss[name] * mt.dosage) for name in betas
        }

        mt = mt.annotate_cols(**true_prs)
        mt = mt.filter_cols(hl.is_defined(eur[mt.s]))
        mt.cols().write(true_prs_path, stage_locally=True, overwrite=True)

    if args.run_gwas:
        # Read the PRS back in; they now act as the true phenotypes.
        phenos = hl.read_table(true_prs_path).key_by('s')
        phenos.show()
        covariates = hl.import_table(covariates_tsv,
                                     impute=True,
                                     types={
                                         's': hl.tstr
                                     }).key_by('s')
        full_mt = hl.read_matrix_table(dosage_mt_path)
        full_mt = full_mt.annotate_cols(**covariates[full_mt.s])
        full_mt = causal_chr22_sites(full_mt)

        pc_names = ['PC' + str(k) for k in range(1, 21)]
        for rep in range(n_replicates):
            # Random subset: shuffle by a uniform draw and keep the first 10000.
            shuffled = phenos.annotate(r=hl.rand_unif(0, 1))
            shuffled = shuffled.order_by(
                shuffled.r).add_index('global_idx').key_by('s')
            subset_pheno = shuffled.filter(shuffled.global_idx < 10000)

            mt = full_mt.annotate_cols(**subset_pheno[full_mt.s])
            mt = mt.annotate_rows(maf=hl.agg.mean(mt.dosage) / 2)
            result_ht = hl.linear_regression_rows(
                y=[mt[name] for name in betas],
                x=mt.dosage,
                covariates=[1] + [mt[pc] for pc in pc_names],
                pass_through=['rsid', 'maf'])

            subset_pheno.export(inds_prefix + str(rep) + '.tsv.gz')
            result_ht.write(sumstat_prefix + str(rep) + '.ht', overwrite=True)

    if args.write_gwas:
        stat_fields = ['beta', 'standard_error', 'p_value']
        for rep in range(n_replicates):
            result_ht = hl.read_table(sumstat_prefix + str(rep) + '.ht')
            result_ht = result_ht.key_by()
            # One flat column per (statistic, beta) pair.
            per_beta_stats = {
                field + '_' + name: result_ht[field][idx]
                for idx, name in enumerate(betas)
                for field in stat_fields
            }
            flat = result_ht.select(chr=result_ht.locus.contig,
                                    pos=result_ht.locus.position,
                                    rsid=result_ht.rsid,
                                    ref=result_ht.alleles[0],
                                    alt=result_ht.alleles[1],
                                    maf=result_ht.maf,
                                    n=result_ht.n,
                                    **per_beta_stats)
            flat.export(sumstat_prefix + str(rep) + '.tsv.gz')
コード例 #19
0
    # Filter MatrixTable and get sample
    samples_to_keep = set(df.loc[(df['sel'] == 1), 's'])
    set_to_keep = hl.literal(samples_to_keep)
    mt_sampled = mt.filter_cols(set_to_keep.contains(mt['s']), keep=True)

    i = '_' + str(OR_x) + '_' + str(or_sex)

    # Export phenotypes
    mt_sampled.cols().select(
        's', 'sex',
        'y0').key_by().export(out_bucket + 'phenotypes/pheno' + i + '.tsv')

    # Unadjusted GWASs
    result_ht = hl.linear_regression_rows(y=[mt_sampled.sex, mt_sampled.y0],
                                          x=mt_sampled.GT.n_alt_alleles(),
                                          covariates=[1],
                                          pass_through=['rsid'])

    result_ht = result_ht.annotate(A1=result_ht.alleles[0],
                                   A2=result_ht.alleles[1]).key_by()

    file_names = [
        out_bucket + 'gwas/gwas_sex' + i + '.tsv',
        out_bucket + 'gwas/gwas_y0' + i + '.tsv'
    ]

    for j, file_name in enumerate(file_names):
        result_ht.select(result_ht.locus,
                         result_ht.A1,
                         result_ht.A2,
                         result_ht.rsid,
コード例 #20
0
ファイル: gwas_on_subset.py プロジェクト: liangyy/ptrs-ukb
tend = time.time()
logging.info('--> Annotate with {} FINISHED! {} seconds elapsed'.format('covariates', tend - tstart))

# prepare phenotypes and covariates into list of lists and list
# Each key of `subset_ht_dic` names a struct field on `mt`; every key yields
# one group holding all of that struct's sub-fields, and the parallel list of
# names labels each entry as `<key>_x_<subfield>`.
logging.info('Start preparing `y` and `covariates` for `linear_regression_rows`')
pheno_list_of_lists = [ [ mt[i][j] for j in mt[i] ] for i in list(subset_ht_dic.keys()) ]
pheno_list_of_names = [ [ f'{i}_x_{j}' for j in mt[i] ] for i in list(subset_ht_dic.keys()) ]
covar_list = [ mt.covariates[i] for i in list(mt.covariates.keys()) ]
logging.info('Prepare `y` and `covariates` for `linear_regression_rows` FINISHED!')

# run GWAS
logging.info('Start running GWAS')
tstart = time.time()
gwas_out = hl.linear_regression_rows(
    y = pheno_list_of_lists,
    x = mt.dosage,
    covariates = [1] + covar_list,
    pass_through = ['varid', 'rsid']
)
# Keep the phenotype labels alongside the results as a global.
gwas_out = gwas_out.annotate_globals(phenotypes = pheno_list_of_names)
tend = time.time()
logging.info('Running GWAS FINISHED! {} seconds elapsed'.format(tend - tstart))

# write GWAS results onto disk
logging.info('Start writing GWAS result to disk')
tstart = time.time()
## if target folder does not exist, create it
target_folder = os.path.dirname(args.output_filename)
# Test the string's value, not its identity: `is not ''` depends on string
# interning and raises a SyntaxWarning on current Python versions. An empty
# dirname means the output goes to the current directory.
if target_folder and not os.path.exists(target_folder):
    os.makedirs(target_folder)
## check if extension of output file is .ht, if not add it
filename, file_extension = os.path.splitext(args.output_filename)
コード例 #21
0
ファイル: annotations.py プロジェクト: atgu/GWASpy
    def filter(self, mt):
        """Run an intercept-only case/control GWAS and count significant hits.

        Regresses ``mt.is_case`` on the alternate-allele count.

        Returns:
            A ``(gwas, n_sig_variants)`` pair: the regression table and the
            number of its rows with ``p_value < 5E-8``.
        """
        association = hl.linear_regression_rows(
            y=mt.is_case,
            x=mt.GT.n_alt_alleles(),
            covariates=[1.0],
        )
        significant = association.filter(association.p_value < 5E-8)
        return association, significant.count()
コード例 #22
0
ファイル: ldscsim_v2.0-test.py プロジェクト: nikbaya/ldscsim
# Summary statistics of the noise-free phenotype; the result is not assigned,
# so it is only visible when run interactively.
sim_mt.aggregate_cols(hl.agg.stats(sim_mt.y_no_noise))

# NOTE(review): `y` is not used again in the visible part of this script.
y = sim_mt.select_cols(sim_mt.y_no_noise).make_table()

# Covariates table keyed by sample ID (`s` is forced to string).
cov = hl.import_table('/Users/nbaya/Documents/lab/ldscsim/ukb31063.gwas_covariates.both_sexes.tsv',impute=True, types={'s': hl.tstr}).key_by('s')

mt0 = sim_mt.annotate_cols(**cov[sim_mt.s])

# Expose the normalised genotype field under the name `x` for the regression.
mt0 = mt0.rename({'__norm_gt__': 'x'})

mt = mt0

# Sex/age terms plus PC1..PC20.
cov_list = [ mt['isFemale'], mt['age'], mt['age_squared'], mt['age_isFemale'],
                            mt['age_squared_isFemale'] ]+ [mt['PC{:}'.format(i)] for i in range(1, 21)] 

ht = hl.linear_regression_rows(
                y=mt.y,
                x=mt.x,
                covariates=[1]+cov_list,
                pass_through = ['rsid'])

ht = ht.rename({'rsid':'SNP'}).key_by('SNP')

# Z-score computed as beta divided by its standard error.
ht = ht.select(Z = ht.beta/ht.standard_error)

# Template table keyed by SNP; its `N` column is overwritten with `n_samples`.
sumstats_template = hl.import_table('gs://nbaya/rg_sex/50_snps_alleles_N.tsv.gz',types={'N': hl.tint64})
sumstats_template = sumstats_template.key_by('SNP')
sumstats_template = sumstats_template.annotate(N = n_samples)
#            sumstats_template.show()

# Join the Z-scores onto the template by SNP.
sumstats = sumstats_template.annotate(Z = ht[sumstats_template.SNP]['Z'])