Esempio n. 1
0
def man_qq_plts(mt):
    """Build Manhattan and QQ plots for a GWAS and return them as an HTML image.

    A per-variant linear regression of ``mt.is_case`` on the alternate-allele
    count is run with an intercept as the only covariate. The p-values are
    plotted side by side with ``qqman`` and the figure is returned as an
    ``<img>`` tag holding the base64-encoded PNG.
    """
    regression = hl.linear_regression_rows(
        y=mt.is_case,
        x=mt.GT.n_alt_alleles(),
        covariates=[1.0],
    )
    frame = regression.select(regression.p_value).to_pandas()

    # Keep only contig / position / p-value and rename them to CHR / BP / P
    # before handing the frame to qqman (presumably the names it expects).
    plot_df = frame[['locus.contig', 'locus.position', 'p_value']]
    plot_df.columns = ['CHR', 'BP', 'P']
    # Drop incomplete rows, then map the non-numeric contig labels to numbers.
    plot_df = plot_df.dropna().replace(to_replace=["X", "Y", "MT"],
                                       value=[23, 24, 25])

    fig, (manhattan_ax, qq_ax) = plt.subplots(nrows=1,
                                              ncols=2,
                                              figsize=(25, 10))
    qqman.manhattan(plot_df,
                    ax=manhattan_ax,
                    xrotation=90.0,
                    title="Manhattan plot")
    qqman.qqplot(plot_df, ax=qq_ax, title="QQ plot")
    fig.tight_layout()

    # Render into memory, then release the pyplot figure state.
    png = io.BytesIO()
    plt.savefig(png, format='PNG')
    plt.clf()
    plt.close()

    encoded = base64.b64encode(png.getvalue()).decode('ascii')
    return f'<img src="data:image/png;base64,{encoded}">'
Esempio n. 2
0
def run_gwas(vcf_file, phenotypes_file, output_file):
    """Run a caffeine-consumption GWAS on a VCF and export the results.

    Args:
        vcf_file: Path of the VCF to import.
        phenotypes_file: Delimited phenotype table; it is keyed by its
            ``Sample`` column and must provide ``CaffeineConsumption``.
        output_file: Output prefix. Association p-values are written to
            ``{output_file}.assoc`` and the genotypes are exported in PLINK
            format under the same prefix.

    Side effects:
        Writes (and overwrites) a scratch MatrixTable at ``tmp.mt`` in the
        current working location.
    """
    table = hl.import_table(phenotypes_file, impute=True).key_by('Sample')

    # Round-trip through Hail's native format. overwrite=True so that a
    # leftover tmp.mt from a previous run does not abort this one.
    hl.import_vcf(vcf_file).write('tmp.mt', overwrite=True)
    mt = hl.read_matrix_table('tmp.mt')

    mt = mt.annotate_cols(pheno=table[mt.s])

    # PCA is computed on a fixed-seed 1% sample of the rows.
    downsampled = mt.sample_rows(0.01, seed=11223344)
    # Only the per-sample scores are used; eigenvalues/loadings are ignored.
    _, pcs, _ = hl.hwe_normalized_pca(downsampled.GT)

    mt = mt.annotate_cols(scores=pcs[mt.s].scores)

    # Intercept plus the first three principal components as covariates.
    gwas = hl.linear_regression_rows(
        y=mt.pheno.CaffeineConsumption,
        x=mt.GT.n_alt_alleles(),
        covariates=[1.0, mt.scores[0], mt.scores[1], mt.scores[2]])

    # Reduce the result to a two-column (SNP, P) table keyed by SNP.
    gwas = gwas.select(SNP=hl.variant_str(gwas.locus, gwas.alleles),
                       P=gwas.p_value)
    gwas = gwas.key_by(gwas.SNP)
    gwas = gwas.select(gwas.P)
    gwas.export(f'{output_file}.assoc', header=True)

    hl.export_plink(mt, output_file, fam_id=mt.s, ind_id=mt.s)
Esempio n. 3
0
def gwas(mt,
         x,
         y,
         cov_list=None,
         with_intercept=True,
         pass_through=None,
         path_to_save=None,
         normalize_x=False,
         is_std_cov_list=False):
    '''Runs GWAS in Hail and returns template-aligned summary statistics.

    Args:
        mt: MatrixTable to analyse.
        x: Entry expression used as the regressor (annotated as ``__x``).
        y: Column expression used as the response (annotated as ``__y``).
        cov_list: Covariates, given as expressions and/or names of fields of
            ``mt``. Defaults to no covariates.
        with_intercept: If true, prepend the constant ``1`` to the covariates.
        pass_through: Extra row fields to pass through the regression;
            ``'rsid'`` is always included.
        path_to_save: If not ``None``, the resulting table is exported here.
        normalize_x: If true, standardise ``x`` per row (subtract the row
            mean, divide by the row standard deviation) before regressing.
        is_std_cov_list: If true, ignore ``cov_list`` and use the fixed set
            of sex/age terms plus ``PC1``..``PC20``.

    Returns:
        The sumstats template table annotated with ``Z`` (t-statistic) and
        ``N`` (sample count) looked up by ``SNP``.
    '''
    # Fresh lists per call instead of shared mutable default arguments.
    cov_list = [] if cov_list is None else list(cov_list)
    pass_through = [] if pass_through is None else list(pass_through)

    mt = mt._annotate_all(col_exprs={'__y': y}, entry_exprs={'__x': x})
    # NOTE: the double-underscore attribute accesses below work only because
    # this is a plain function; inside a class body they would be mangled.
    if normalize_x:
        mt = mt.annotate_rows(__gt_stats=hl.agg.stats(mt.__x))
        mt = mt.annotate_entries(__x=(mt.__x - mt.__gt_stats.mean) /
                                 mt.__gt_stats.stdev)
        mt = mt.drop('__gt_stats')

    if is_std_cov_list:
        cov_list = [
            'isFemale', 'age', 'age_squared', 'age_isFemale',
            'age_squared_isFemale'
        ] + [f'PC{i}' for i in range(1, 21)]

    # Resolve covariates given by name to the corresponding fields of mt.
    cov_list = [mt[cov] if isinstance(cov, str) else cov for cov in cov_list]
    cov_list = ([1] if with_intercept else []) + cov_list

    # De-duplicate while keeping a deterministic order ('rsid' first).
    pass_through = list(dict.fromkeys(['rsid', *pass_through]))
    print(f'variables to pass through: {pass_through}')

    gwas_ht = hl.linear_regression_rows(y=mt.__y,
                                        x=mt.__x,
                                        covariates=cov_list,
                                        pass_through=pass_through)

    gwas_ht = gwas_ht.annotate_globals(with_intercept=with_intercept)

    gwas_ht = gwas_ht.rename({'rsid': 'SNP'}).key_by('SNP')

    gwas_ht = gwas_ht.select(Z=gwas_ht.t_stat, N=gwas_ht.n)

    # Sumstats template stored as a Hail table (hard-coded location).
    ss_template = hl.read_table('gs://nbaya/rg_sex/hm3.sumstats_template.ht')
    ss_template = ss_template.key_by('SNP')

    # Look the regression row up once and reuse it for both annotations.
    matched = gwas_ht[ss_template.SNP]
    ss = ss_template.annotate(Z=matched.Z, N=matched.N)

    if path_to_save is not None:
        ss.export(path_to_save)

    return ss
Esempio n. 4
0
def run_grouped_regressions(mt, ss_output, pheno, pheno_name):
    """Regress each phenotype in ``pheno`` on dosage and write the result table.

    Every name in ``pheno`` is looked up in ``mt['phenotypes']`` and passed as
    its own single-element group. All fields of ``mt['covariates']`` plus an
    intercept are used as covariates. The table is written to
    ``ss_output + pheno_name + '.ht'``; whether an existing table is replaced
    is controlled by the module-level ``args.overwrite``.
    """
    response_groups = [[mt['phenotypes'][name]] for name in pheno]

    covariate_struct = mt['covariates']
    covariates = [1] + [
        covariate_struct[key] for key in list(covariate_struct.keys())
    ]

    results = hl.linear_regression_rows(y=response_groups,
                                        x=mt.dosage,
                                        covariates=covariates,
                                        pass_through=['varid', 'rsid'])

    # NOTE: the original author flagged this annotation with "check this".
    results = results.annotate_globals(phenotypes=pheno)

    destination = ss_output + pheno_name + '.ht'
    results.write(destination, overwrite=args.overwrite)
Esempio n. 5
0
def gwas(mt,
         x,
         y,
         cov_list=[],
         with_intercept=True,
         pass_through=[],
         path_to_save=None,
         normalize_x=True,
         is_std_cov_list=False):
    '''Runs GWAS and returns the per-SNP Z / N table.

    ``x`` (entry expression) and ``y`` (column expression) are annotated onto
    ``mt``; ``x`` is optionally normalised through ``normalize_genotypes``.
    Covariates may be expressions or names of fields of ``mt``; with
    ``is_std_cov_list`` the fixed sex/age/PC1..PC20 set is used instead.

    The regression result is re-keyed by ``SNP`` and reduced to ``Z`` and
    ``N``. A template-aligned copy is exported when ``path_to_save`` is given,
    but the value returned is the regression table itself, not that copy.
    '''
    mt = mt._annotate_all(col_exprs={'y': y}, entry_exprs={'x': x})
    if normalize_x:
        mt = normalize_genotypes(mt, mt.x)
        # Replace x by the normalised genotype, then discard the helper field.
        with_normalised_x = mt.annotate_entries(x=mt.__norm_gt)
        mt = with_normalised_x.drop('__norm_gt')

    if is_std_cov_list:
        demographic_terms = [
            'isFemale', 'age', 'age_squared', 'age_isFemale',
            'age_squared_isFemale'
        ]
        principal_components = ['PC{:}'.format(i) for i in range(1, 21)]
        cov_list = demographic_terms + principal_components

    # Covariates supplied by name are swapped for the matching fields of mt.
    if any(type(cov) is str for cov in cov_list):
        cov_list = [mt[cov] if type(cov) is str else cov for cov in cov_list]

    intercept = [1] if with_intercept else []
    covariates = intercept + cov_list

    print(pass_through)

    regression = hl.linear_regression_rows(y=mt.y,
                                           x=mt.x,
                                           covariates=covariates,
                                           pass_through=['rsid'] + pass_through)
    regression = regression.annotate_globals(with_intercept=with_intercept)
    regression = regression.rename({'rsid': 'SNP'}).key_by('SNP')
    gwas_ht = regression.select(Z=regression.t_stat, N=regression.n)

    template = hl.import_table('gs://nbaya/rg_sex/50_snps_alleles_N.tsv.gz',
                               types={'N': hl.tint64})
    template = template.key_by('SNP')

    sumstats = template.annotate(Z=gwas_ht[template.SNP].Z,
                                 N=gwas_ht[template.SNP].N)

    if path_to_save is not None:
        sumstats.export(path_to_save)

    return gwas_ht
Esempio n. 6
0
def linear_regression_rows(mt_path):
    """Exercise ``hl.linear_regression_rows`` with random phenotypes and covariates.

    Reads the MatrixTable at ``mt_path``, adds 100 uniformly random phenotype
    columns and 20 uniformly random covariate columns, regresses all
    phenotypes on ``mt.x`` and forces evaluation of the result.
    """
    n_phenotypes, n_covariates = 100, 20

    mt = hl.read_matrix_table(mt_path)

    phenotype_exprs = {
        f"pheno_{idx}": hl.rand_unif(0, 1)
        for idx in range(n_phenotypes)
    }
    covariate_exprs = {
        f"cov_{idx}": hl.rand_unif(0, 1)
        for idx in range(n_covariates)
    }
    mt = mt.annotate_cols(**phenotype_exprs)
    mt = mt.annotate_cols(**covariate_exprs)

    responses = [mt[name] for name in phenotype_exprs]
    covariates = [mt[name] for name in covariate_exprs]
    result = hl.linear_regression_rows(y=responses,
                                       x=mt.x,
                                       covariates=covariates)
    # Nothing is returned; the count only forces the computation to run.
    result._force_count()
Esempio n. 7
0
def run_gwas(mt,
             phen: str,
             sim_name: str,
             subset_idx: int,
             param_suffix: str,
             wd: str,
             is_logreg=True):
    """Run (or reload) a GWAS of ``phen`` on one sample subset.

    Args:
        mt: MatrixTable with a ``dosage`` or ``GT`` entry field, a
            ``subset_idx`` column field and a column field named ``phen``.
        phen: Name of the phenotype column field.
        sim_name: Simulation name, used only in the output file name.
        subset_idx: Only columns whose ``subset_idx`` equals this are kept.
        param_suffix: Free-form suffix, used only in the output file name.
        wd: Directory under which the results file is stored.
        is_logreg: If true run a Wald logistic regression, otherwise a linear
            regression.

    Returns:
        The GWAS results table. When the results file already exists it is
        re-imported and keyed by ``locus``/``alleles`` instead of recomputed.

    Raises:
        ValueError: If ``mt`` has neither a ``dosage`` nor a ``GT`` entry field.
    """
    # The previous check compared a set against `{}` (an empty dict), which is
    # always unequal, so it could never fail. Test for emptiness instead.
    if not {'GT', 'dosage'}.intersection(mt.entry):
        raise ValueError(
            "mt does not have an entry field named 'dosage' or 'GT' corresponding to genotype data"
        )

    mt = mt.filter_cols(mt.subset_idx == subset_idx)
    mt = mt.filter_cols(hl.is_defined(mt[phen]))
    print(
        f'\n\ngwas sample count (subset {subset_idx}): {mt.count_cols()}\n\n')

    # Effect-allele frequency: half the mean alt-allele dosage/count per row.
    # `dosage` takes precedence over `GT` when both are present.
    if 'dosage' in mt.entry:
        mt = mt.annotate_rows(EAF=hl.agg.mean(mt.dosage) / 2)
    elif 'GT' in mt.entry:
        mt = mt.annotate_rows(EAF=hl.agg.mean(mt.GT.n_alt_alleles()) / 2)

    gwas_path = f'{wd}/gwas.{"logreg" if is_logreg else "linreg"}.{sim_name}.subset_{subset_idx}.{param_suffix}.tsv.gz'

    if not hl.hadoop_is_file(gwas_path):
        gt_field = mt.dosage if 'dosage' in mt.entry else mt.GT.n_alt_alleles()

        if is_logreg:
            gwas_ht = hl.logistic_regression_rows(test='wald',
                                                  y=mt[phen],
                                                  x=gt_field,
                                                  covariates=[1],
                                                  pass_through=['EAF'])
        else:
            gwas_ht = hl.linear_regression_rows(y=mt[phen],
                                                x=gt_field,
                                                covariates=[1],
                                                pass_through=['EAF'])
        # Only the selected fields are exported; the full table is returned.
        gwas_ht.select('EAF', 'beta', 'standard_error',
                       'p_value').export(gwas_path)

    else:
        print(f'GWAS already run! ({gwas_path})')
        gwas_ht = hl.import_table(gwas_path, impute=True, force=True)
        # Rebuild typed keys from their exported string form: strip the
        # leading `["` and trailing `"]`, then split on `","`. The patterns
        # are the same characters as before, written with valid escapes.
        gwas_ht = gwas_ht.annotate(locus=hl.parse_locus(gwas_ht.locus),
                                   alleles=gwas_ht.alleles.replace(
                                       '\\["', '').replace('"\\]',
                                                           '').split('","'))
        gwas_ht = gwas_ht.key_by('locus', 'alleles')

    return gwas_ht
Esempio n. 8
0
def run_gwas(vcf_file, phenotypes_file, output_file):
    """Run a quality-controlled caffeine-consumption GWAS and export the results.

    Args:
        vcf_file: Path of the VCF to import.
        phenotypes_file: Delimited phenotype table; it is keyed by its
            ``Sample`` column and must provide ``CaffeineConsumption`` and
            ``isFemale``.
        output_file: Output prefix. Association p-values are written to
            ``{output_file}.assoc`` and the filtered genotypes are exported in
            PLINK format under the same prefix.

    Side effects:
        Writes (and overwrites) a scratch MatrixTable at ``tmp.mt`` in the
        current working location.
    """
    table = hl.import_table(phenotypes_file, impute=True).key_by('Sample')

    # Round-trip through Hail's native format. overwrite=True so that a
    # leftover tmp.mt from a previous run does not abort this one.
    hl.import_vcf(vcf_file).write('tmp.mt', overwrite=True)
    mt = hl.read_matrix_table('tmp.mt')

    mt = mt.annotate_cols(pheno=table[mt.s])

    # Sample QC: keep samples with mean depth >= 4 and call rate >= 0.97.
    mt = hl.sample_qc(mt)
    mt = mt.filter_cols((mt.sample_qc.dp_stats.mean >= 4)
                        & (mt.sample_qc.call_rate >= 0.97))

    # Genotype QC: keep calls whose alt-read fraction fits the called genotype.
    ab = mt.AD[1] / hl.sum(mt.AD)
    filter_condition_ab = ((mt.GT.is_hom_ref() & (ab <= 0.1))
                           | (mt.GT.is_het() & (ab >= 0.25) & (ab <= 0.75))
                           | (mt.GT.is_hom_var() & (ab >= 0.9)))
    mt = mt.filter_entries(filter_condition_ab)

    # Variant QC: keep variants whose AF[1] exceeds 1%.
    mt = hl.variant_qc(mt)
    mt = mt.filter_rows(mt.variant_qc.AF[1] > 0.01)

    # Only the per-sample scores are used; eigenvalues/loadings are ignored.
    _, pcs, _ = hl.hwe_normalized_pca(mt.GT)

    mt = mt.annotate_cols(scores=pcs[mt.s].scores)

    # Intercept, sex and the first three principal components as covariates.
    gwas = hl.linear_regression_rows(y=mt.pheno.CaffeineConsumption,
                                     x=mt.GT.n_alt_alleles(),
                                     covariates=[
                                         1.0, mt.pheno.isFemale, mt.scores[0],
                                         mt.scores[1], mt.scores[2]
                                     ])

    # Reduce the result to a two-column (SNP, P) table keyed by SNP.
    gwas = gwas.select(SNP=hl.variant_str(gwas.locus, gwas.alleles),
                       P=gwas.p_value)
    gwas = gwas.key_by(gwas.SNP)
    gwas = gwas.select(gwas.P)
    gwas.export(f'{output_file}.assoc', header=True)

    hl.export_plink(mt, output_file, fam_id=mt.s, ind_id=mt.s)
Esempio n. 9
0
        if n_chunks == 2:  #traditional split of population into even halves (no meta-analysis)
            mt_A = mt.filter_cols(mt.group_id == 0)
            mt_B = mt.filter_cols(mt.group_id == 1)

            cov_list_A = [
                mt_A['isFemale'], mt_A['age'], mt_A['age_squared'],
                mt_A['age_isFemale'], mt_A['age_squared_isFemale']
            ] + [mt_A['PC{:}'.format(i)] for i in range(1, 21)]

            cov_list_B = [
                mt_B['isFemale'], mt_B['age'], mt_B['age_squared'],
                mt_B['age_isFemale'], mt_B['age_squared_isFemale']
            ] + [mt_B['PC{:}'.format(i)] for i in range(1, 21)]

            ht_A = hl.linear_regression_rows(y=mt_A.y,
                                             x=mt_A.x,
                                             covariates=[1] + cov_list_A,
                                             pass_through=['rsid'])

            ht_B = hl.linear_regression_rows(y=mt_B.y,
                                             x=mt_B.x,
                                             covariates=[1] + cov_list_B,
                                             pass_through=['rsid'])

            ht_A = ht_A.rename({'rsid': 'SNP'}).key_by('SNP')
            ht_B = ht_B.rename({'rsid': 'SNP'}).key_by('SNP')

            ht_A = ht_A.select(Z=ht_A.beta / ht_A.standard_error)
            ht_B = ht_B.select(Z=ht_B.beta / ht_B.standard_error)

            sumstats_template = hl.import_table(
                'gs://nbaya/rg_sex/50_snps_alleles_N.tsv.gz',
Esempio n. 10
0
    test="firth",  #controls false positives
    y=hl.float(mt.AffectionBool),
    x=mt.GT.n_alt_alleles(),
    covariates=[
        1,
        hl.float(mt.PackYear), mt.scores[0], mt.scores[1], mt.scores[2],
        mt.scores[3], mt.scores[4], mt.scores[5], mt.scores[6], mt.scores[7],
        mt.scores[8], mt.scores[9]
    ])

######## 6.2 Linear regression ~ FEV1% predicted (lung function variable)
# NOTE: this call is hl.linear_regression_rows and passes no `test` argument,
# so no Firth test is involved here.
# Covariates: intercept, pack-years and the first ten PCA scores.
gwas = hl.linear_regression_rows(y=hl.float(mt.FEV1),
                                 x=mt.GT.n_alt_alleles(),
                                 covariates=[
                                     1,
                                     hl.float(mt.PackYear), mt.scores[0],
                                     mt.scores[1], mt.scores[2], mt.scores[3],
                                     mt.scores[4], mt.scores[5], mt.scores[6],
                                     mt.scores[7], mt.scores[8], mt.scores[9]
                                 ])

######## 6.3 Q-Q plot
qqplot = hl.plot.qq(gwas.p_value)
show(qqplot)

######## 6.4 Manhattan-like plots
# GWAS significance level: P < 5e-8; suggestive: 5e-8 < P < 5e-6.

# Calculate Bonferroni-based cut-off lines on the -log10(p) scale:
# 0.05 (respectively 1) divided by the number of rows (variants) in mt.
# Each line calls mt.count_rows() separately.
Bonferroni_line = -np.log10(0.05 / mt.count_rows())
Suggestive_line = -np.log10(1 / mt.count_rows())
Esempio n. 11
0
args = parser.parse_args()


# Convert the VCF to Hail's native format under the work directory and read
# it back; args.workdir is concatenated directly with the file name.
hl.import_vcf(args.vcf_path).write(args.workdir + 'hail.mt', overwrite=True)
mt = hl.read_matrix_table(args.workdir + 'hail.mt')
# Sample annotations, keyed by the 'Sample' column.
table = (hl.import_table(args.annotation_path, impute=True)
     .key_by('Sample'))

# Start benchmarking after I/O
t0 = time.time()
mt = mt.annotate_cols(pheno = table[mt.s])
# Sample QC: keep samples with mean depth >= 4 and call rate >= 0.97.
mt = hl.sample_qc(mt)
mt = mt.filter_cols((mt.sample_qc.dp_stats.mean >= 4) & (mt.sample_qc.call_rate >= 0.97))
# Fraction of reads supporting the first alternate allele.
ab = mt.AD[1] / hl.sum(mt.AD)

# A call is kept only when its alt-read fraction fits the called genotype.
filter_condition_ab = ((mt.GT.is_hom_ref() & (ab <= 0.1)) |
                    (mt.GT.is_het() & (ab >= 0.25) & (ab <= 0.75)) |
                    (mt.GT.is_hom_var() & (ab >= 0.9)))

# Share of entries failing the filter; not used again in the visible code.
fraction_filtered = mt.aggregate_entries(hl.agg.fraction(~filter_condition_ab))
mt = mt.filter_entries(filter_condition_ab)
# Variant QC: keep variants with AF[1] > 1% and HWE p-value > 1e-6.
mt = hl.variant_qc(mt)
mt = mt.filter_rows(mt.variant_qc.AF[1] > 0.01)
mt = mt.filter_rows(mt.variant_qc.p_value_hwe > 1e-6)
# Intercept-only linear regression of caffeine consumption on alt-allele count.
gwas = hl.linear_regression_rows(y=mt.pheno.CaffeineConsumption,
                             x=mt.GT.n_alt_alleles(),
                             covariates=[1.0])
# The plot object is built but not displayed or saved in the visible code.
p = hl.plot.manhattan(gwas.p_value)

print('Time Elapsed: {}'.format(time.time()- t0))
def gwas(y, x, cov):
    """Regress ``y`` on ``x`` per row with covariates ``cov``.

    Thin wrapper around ``hl.linear_regression_rows`` that always passes the
    ``rsid`` row field through; the resulting table is returned unchanged.
    """
    return hl.linear_regression_rows(
        y=y,
        x=x,
        covariates=cov,
        pass_through=['rsid'],
    )
Esempio n. 13
0
# COMMAND ----------

# Report how many columns (samples) and rows (variants) common_mt contains.
print('Samples: %d  Variants: %d' %
      (common_mt.count_cols(), common_mt.count_rows()))

# COMMAND ----------

# MAGIC %md These filters removed about 15% of sites (we started with a bit over 10,000). This is _NOT_ representative of most sequencing datasets! We have already downsampled the full thousand genomes dataset to include more common variants than we'd expect by chance.
# MAGIC
# MAGIC In Hail, the association tests accept column fields for the sample phenotype and covariates. Since we've already got our phenotype of interest (caffeine consumption) in the dataset, we are good to go:

# COMMAND ----------

# Intercept-only linear regression of caffeine consumption on the number of
# alternate alleles, one regression per variant.
gwas = hl.linear_regression_rows(y=common_mt.CaffeineConsumption,
                                 x=common_mt.GT.n_alt_alleles(),
                                 covariates=[1.0])
# Print the row schema of the result table.
gwas.row.describe()

# COMMAND ----------

# MAGIC %md Looking at the bottom of the above printout, you can see the linear regression adds new row fields for the beta, standard error, t-statistic, and p-value.
# MAGIC
# MAGIC Hail makes it easy to make a [Q-Q (quantile-quantile) plot](https://en.wikipedia.org/wiki/Q–Q_plot).

# COMMAND ----------

# Build the Q-Q plot of the GWAS p-values and render it in the notebook.
p = hl.plot.qq(gwas.p_value)
displayBokeh(p)

# COMMAND ----------
Esempio n. 14
0
def gwas(mt,
         x,
         y,
         cov_list=None,
         with_intercept=True,
         pass_through=None,
         path_to_save=None,
         normalize_x=False,
         is_std_cov_list=False):
    r'''Runs GWAS and returns template-aligned summary statistics.

    Args:
        mt: MatrixTable to analyse; must have a ``dosage`` entry field and an
            ``rsid`` row field.
        x: Entry expression used as the regressor (annotated as ``__x``).
        y: Column expression used as the response (annotated as ``__y``).
        cov_list: Covariates, given as expressions and/or names of fields of
            ``mt``. Defaults to no covariates.
        with_intercept: If true, prepend the constant ``1`` to the covariates.
        pass_through: Extra row fields to pass through the regression, in
            addition to ``'rsid'``.
        path_to_save: If not ``None``, the resulting table is exported here.
        normalize_x: If true, standardise ``x`` per row before regressing.
        is_std_cov_list: If true, ignore ``cov_list`` and use the fixed set
            of sex/age terms plus ``PC1``..``PC20``.

    Returns:
        The sumstats template annotated with ``chr``, ``bpos``, ``freq``,
        ``beta``, ``z``, ``pval`` and ``n``, with ``N`` dropped and
        ``SNP``/``A1``/``A2`` renamed to ``snpid``/``a1``/``a2``.
    '''
    # Fresh lists per call instead of shared mutable default arguments.
    cov_list = [] if cov_list is None else list(cov_list)
    pass_through = [] if pass_through is None else list(pass_through)

    mt = mt._annotate_all(col_exprs={'__y': y}, entry_exprs={'__x': x})

    print('\n... Calculating allele frequency ...')
    # Frequency of the alternate allele, always taken from mt.dosage (not x).
    mt_freq_rows = mt.annotate_rows(freq=hl.agg.mean(mt.dosage) / 2).rows()
    mt_freq_rows = mt_freq_rows.key_by('rsid')

    # NOTE: the double-underscore attribute accesses below work only because
    # this is a plain function; inside a class body they would be mangled.
    if normalize_x:
        mt = mt.annotate_rows(__gt_stats=hl.agg.stats(mt.__x))
        mt = mt.annotate_entries(__x=(mt.__x - mt.__gt_stats.mean) /
                                 mt.__gt_stats.stdev)
        mt = mt.drop('__gt_stats')

    if is_std_cov_list:
        cov_list = [
            'isFemale', 'age', 'age_squared', 'age_isFemale',
            'age_squared_isFemale'
        ] + [f'PC{i}' for i in range(1, 21)]

    # Resolve covariates given by name to the corresponding fields of mt.
    cov_list = [mt[cov] if isinstance(cov, str) else cov for cov in cov_list]
    cov_list = ([1] if with_intercept else []) + cov_list

    print(f'pass through: {pass_through}')

    gwas_ht = hl.linear_regression_rows(y=mt.__y,
                                        x=mt.__x,
                                        covariates=cov_list,
                                        pass_through=['rsid'] + pass_through)

    gwas_ht = gwas_ht.annotate_globals(with_intercept=with_intercept)

    gwas_ht = gwas_ht.key_by('rsid')

    # Sumstats template stored as a Hail table (hard-coded location).
    ss_template = hl.read_table('gs://nbaya/rg_sex/hm3.sumstats_template.ht')
    ss_template = ss_template.key_by('SNP')

    # Look the regression row up once and reuse it for every annotation.
    hit = gwas_ht[ss_template.SNP]
    ss = ss_template.annotate(chr=hit.locus.contig,
                              bpos=hit.locus.position,
                              freq=mt_freq_rows[ss_template.SNP].freq,
                              beta=hit.beta,
                              z=hit.t_stat,
                              pval=hit.p_value,
                              n=hit.n)
    ss = ss.drop('N')
    ss = ss.rename({'SNP': 'snpid', 'A1': 'a1', 'A2': 'a2'})

    # describe() prints the schema itself; wrapping it in print() only
    # appended a stray "None" line.
    ss.describe()

    if path_to_save is not None:
        ss.export(path_to_save)

    return ss
Esempio n. 15
0
 # Regress every listed `*_gwas_normalised` field of
 # mt.sample_qc_and_phenotype.fbc on the alternate-allele count, with an
 # intercept as the only covariate; the rsid row field is passed through.
 # NOTE(review): `y` is one flat list, so Hail presumably treats all
 # phenotypes as a single group for missingness - confirm against the docs.
 gwas = hl.linear_regression_rows(
     y=[
         mt.sample_qc_and_phenotype.fbc.neut_p_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.eo_p_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.mono_p_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.lymph_p_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.baso_p_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.ret_p_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.hlr_p_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.hct_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.pct_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.hgb_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.rbc_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.wbc_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.mpv_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.plt_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.rdw_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.pdw_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.mcv_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.mch_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.mchc_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.ret_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.hlr_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.neut_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.mono_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.baso_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.eo_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.lymph_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.irf_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.myeloid_wbc_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.gran_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.eo_baso_sum_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.neut_eo_sum_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.baso_neut_sum_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.gran_p_myeloid_wbc_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.eo_p_gran_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.neut_p_gran_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.baso_p_gran_gwas_normalised
         #                   mt.sample_qc_and_phenotype.fbc.wbc_gwas_normalised,
         #                   mt.sample_qc_and_phenotype.fbc.mpv_gwas_normalised],
     ],
     x=mt.GT.n_alt_alleles(),
     covariates=[1.0],
     pass_through=[mt.rsid])
Esempio n. 16
0
# Index into mt.y of the phenotype stored as column field U.
u = 8

# One value per iteration of the loop below; used only in output file names.
rgs = [-0.3, -0.1, 0, 0.1, 0.3]

# Import matrix and annotate cols with phenotypes
mt = hl.read_matrix_table(input_matrix)

# `x` and `input_matrix` are defined outside the visible part of the script.
mt = mt.annotate_cols(U=mt.y[u])
mt = mt.annotate_cols(X=mt.y[x])

for i in range(5):
    mt = mt.annotate_cols(Y=mt.y[i + 2])

    # GWAS of X, Y in all
    result_ht = hl.linear_regression_rows(y=[mt.X, mt.Y],
                                          x=mt.GT.n_alt_alleles(),
                                          covariates=[1],
                                          pass_through=['rsid'])

    result_ht = result_ht.annotate(A1=result_ht.alleles[0],
                                   A2=result_ht.alleles[1]).key_by()

    file_names = [
        output_bucket + 'gwas/gwas_X_' + str(rgs[i]) + '.tsv',
        output_bucket + 'gwas/gwas_Y_' + str(rgs[i]) + '.tsv'
    ]

    for j, file_name in enumerate(file_names):
        result_ht.select(result_ht.locus,
                         result_ht.A1,
                         result_ht.A2,
                         result_ht.rsid,
Esempio n. 17
0
# Load the dosage entries of the imputed BGEN for one contig, restricted to
# the variants in ht_variants.
mt = hl.import_bgen(
    path=
    f'gs://fc-7d5088b4-7673-45b5-95c2-17ae00a04183/imputed/ukb_imp_{contig_expr}_v3.bgen',
    sample_file=f'gs://ukb31063/ukb31063.{contig}.sample',
    entry_fields=['dosage'],
    variants=ht_variants)

# Attach each sample's phenotype and covariate rows as struct column fields.
mt = mt.annotate_cols(phenotypes=ht_phenotypes[mt.s],
                      covariates=ht_covariates[mt.s])

phenotypes = list(mt['phenotypes'].keys())

# Every phenotype is passed as its own single-element group; covariates are
# an intercept plus every field of mt['covariates'].
ht = hl.linear_regression_rows(
    y=[[mt['phenotypes'][y]] for y in phenotypes],
    x=mt.dosage,
    covariates=[
        1, *[mt['covariates'][x] for x in list(mt['covariates'].keys())]
    ],
    pass_through=['varid', 'rsid'])

# Record the phenotype names as a global of the result table.
ht = ht.annotate_globals(phenotypes=phenotypes)

# The two branches differ only in the `.dilution_factor` part of the path.
if dilution:
    ht.write(
        f'gs://ukb31063-mega-gwas/biomarkers/results/ukb31063.biomarker_gwas_results.{sex}.{contig}.pipeline_{pipeline}.dilution_factor.ht',
        overwrite=True)
else:
    ht.write(
        f'gs://ukb31063-mega-gwas/biomarkers/results/ukb31063.biomarker_gwas_results.{sex}.{contig}.pipeline_{pipeline}.ht',
        overwrite=True)
Esempio n. 18
0
def main(args):
    """Simulate phenotypes from spike-and-slab effects, run GWAS and export them.

    Three independent stages, each enabled by a flag on ``args``:

    * ``args.compute_true_phenotypes``: sum effect size times dosage over
      chr22 causal sites for each beta column and write the per-sample
      results as a table.
    * ``args.run_gwas``: for 10 random subsets of samples, regress all
      simulated phenotypes on dosage with 20 PCs as covariates and write
      the subset membership and the result table.
    * ``args.write_gwas``: flatten each of the 10 result tables to a TSV.

    All input and output locations are hard-coded bucket paths.
    """
    # Names of the effect-size columns; also used as phenotype field names.
    betas = ['beta_01', 'beta_1', 'beta_10', 'beta_100']
    spike_slab = hl.import_table(
        'gs://armartin/mama/spike_slab/BBJ_UKB_hm3.chr22.cm.beta.txt',
        impute=True)
    # Key by the locus/alleles parsed from the variant string column `v`.
    spike_slab = spike_slab.key_by(**hl.parse_variant(spike_slab.v))
    if args.compute_true_phenotypes:
        # get the white british subset
        eur = hl.import_table(
            'gs://phenotype_31063/ukb31063.gwas_covariates.both_sexes.tsv'
        ).key_by('s')

        # read in imputed data, subset to chr22
        mt = hl.read_matrix_table(
            'gs://phenotype_31063/hail/imputed/ukb31063.dosage.autosomes.mt')
        mt = hl.filter_intervals(mt, [hl.parse_locus_interval('22')])

        # annotate and filter imputed data to all sites with causal effects
        mt = mt.annotate_rows(ss=spike_slab[mt.row_key])
        mt = mt.filter_rows(hl.is_defined(mt.ss))

        # compute true PRS (i.e. phenotypes)
        annot_expr = {i: hl.agg.sum(mt.ss[i] * mt.dosage) for i in betas}

        # write out phenos for white British unrelated subset
        mt = mt.annotate_cols(**annot_expr)
        mt = mt.filter_cols(hl.is_defined(eur[mt.s]))
        mt.cols().write(
            'gs://armartin/mama/spike_slab/BBJ_UKB_hm3.chr22.cm.beta.true_PRS.ht',
            stage_locally=True,
            overwrite=True)

    if args.run_gwas:
        # read back in PRS (now true phenotypes)
        phenos = hl.read_table(
            'gs://armartin/mama/spike_slab/BBJ_UKB_hm3.chr22.cm.beta.true_PRS.ht'
        ).key_by('s')
        phenos.show()
        covariates = hl.import_table(
            'gs://phenotype_31063/ukb31063.gwas_covariates.both_sexes.tsv',
            impute=True,
            types={
                's': hl.tstr
            }).key_by('s')
        full_mt = hl.read_matrix_table(
            'gs://phenotype_31063/hail/imputed/ukb31063.dosage.autosomes.mt')
        full_mt = full_mt.annotate_cols(**covariates[full_mt.s])
        full_mt = hl.filter_intervals(full_mt, [hl.parse_locus_interval('22')])

        # annotate and filter imputed data to all sites with causal effects
        full_mt = full_mt.annotate_rows(ss=spike_slab[full_mt.row_key])
        full_mt = full_mt.filter_rows(hl.is_defined(full_mt.ss))

        # subset to white British subset, get 10 sets of 10k and run a gwas for each of these w/ PCs as covs
        for i in range(10):
            # Random subset: order by a uniform draw and keep the first 10000.
            # NOTE(review): `r` is drawn without a seed and `subset_pheno` is
            # used twice below (regression and export); confirm that both
            # uses see the same random subset.
            subset_pheno = phenos.annotate(r=hl.rand_unif(0, 1))
            subset_pheno = subset_pheno.order_by(
                subset_pheno.r).add_index('global_idx').key_by('s')
            subset_pheno = subset_pheno.filter(subset_pheno.global_idx < 10000)
            mt = full_mt.annotate_cols(**subset_pheno[full_mt.s])
            mt = mt.annotate_rows(maf=hl.agg.mean(mt.dosage) / 2)
            # The comprehension variables named `i` below are local to their
            # comprehensions and do not change the loop index.
            result_ht = hl.linear_regression_rows(
                y=[mt[i] for i in betas],
                x=mt.dosage,
                covariates=[1] + [mt['PC' + str(i)] for i in range(1, 21)],
                pass_through=['rsid', 'maf'])

            subset_pheno.export(
                'gs://armartin/mama/spike_slab/UKB_hm3.chr22.cm.beta.true_PRS.gwas_inds_'
                + str(i) + '.tsv.gz')
            result_ht.write(
                'gs://armartin/mama/spike_slab/UKB_hm3.chr22.cm.beta.true_PRS.gwas_sumstat_'
                + str(i) + '.ht',
                overwrite=True)

    if args.write_gwas:
        for i in range(10):
            result_ht = hl.read_table(
                'gs://armartin/mama/spike_slab/UKB_hm3.chr22.cm.beta.true_PRS.gwas_sumstat_'
                + str(i) + '.ht')
            result_ht = result_ht.key_by()
            # One flat column per (statistic, beta) pair, e.g. `beta_beta_01`.
            # Inside the comprehension `i` is the index of the beta within
            # the result arrays, not the outer loop index.
            get_expr = {
                field + '_' + x: result_ht[field][i]
                for i, x in enumerate(betas)
                for field in ['beta', 'standard_error', 'p_value']
            }
            result_ht.select(chr=result_ht.locus.contig, pos=result_ht.locus.position, rsid=result_ht.rsid, ref=result_ht.alleles[0],
                             alt=result_ht.alleles[1], maf=result_ht.maf, n=result_ht.n, **get_expr)\
                .export('gs://armartin/mama/spike_slab/UKB_hm3.chr22.cm.beta.true_PRS.gwas_sumstat_' + str(i) + '.tsv.gz')
Esempio n. 19
0
    # Filter MatrixTable and get sample
    samples_to_keep = set(df.loc[(df['sel'] == 1), 's'])
    set_to_keep = hl.literal(samples_to_keep)
    mt_sampled = mt.filter_cols(set_to_keep.contains(mt['s']), keep=True)

    i = '_' + str(OR_x) + '_' + str(or_sex)

    # Export phenotypes
    mt_sampled.cols().select(
        's', 'sex',
        'y0').key_by().export(out_bucket + 'phenotypes/pheno' + i + '.tsv')

    # Unadjusted GWASs
    result_ht = hl.linear_regression_rows(y=[mt_sampled.sex, mt_sampled.y0],
                                          x=mt_sampled.GT.n_alt_alleles(),
                                          covariates=[1],
                                          pass_through=['rsid'])

    result_ht = result_ht.annotate(A1=result_ht.alleles[0],
                                   A2=result_ht.alleles[1]).key_by()

    file_names = [
        out_bucket + 'gwas/gwas_sex' + i + '.tsv',
        out_bucket + 'gwas/gwas_y0' + i + '.tsv'
    ]

    for j, file_name in enumerate(file_names):
        result_ht.select(result_ht.locus,
                         result_ht.A1,
                         result_ht.A2,
                         result_ht.rsid,
Esempio n. 20
0
tend = time.time()
logging.info('--> Annotate with {} FINISHED! {} seconds elapsed'.format('covariates', tend - tstart))

# prepare phenotypes and covariates into list of lists and list
logging.info('Start preparing `y` and `covariates` for `linear_regression_rows`')
# One inner list per key of subset_ht_dic, holding every field of the struct
# column mt[key]; the names list mirrors it as '<key>_x_<field>'.
pheno_list_of_lists = [ [ mt[i][j] for j in mt[i] ] for i in list(subset_ht_dic.keys()) ]
pheno_list_of_names = [ [ f'{i}_x_{j}' for j in mt[i] ] for i in list(subset_ht_dic.keys()) ]
covar_list = [ mt.covariates[i] for i in list(mt.covariates.keys()) ]
logging.info('Prepare `y` and `covariates` for `linear_regression_rows` FINISHED!')

# run GWAS
logging.info('Start running GWAS')
tstart = time.time()
# Covariates are an intercept plus every field of mt.covariates.
gwas_out = hl.linear_regression_rows(
    y = pheno_list_of_lists,
    x = mt.dosage,
    covariates = [1] + covar_list,
    pass_through = ['varid', 'rsid']
)
# Store the phenotype names, in the same nesting as `y`, as a table global.
gwas_out = gwas_out.annotate_globals(phenotypes = pheno_list_of_names)
tend = time.time()
# NOTE(review): if Hail evaluates lazily, this interval may cover only
# building the query rather than running it - confirm.
logging.info('Running GWAS FINISHED! {} seconds elapsed'.format(tend - tstart))

# write GWAS results onto disk
logging.info('Start writing GWAS result to disk')
tstart = time.time()
## if target folder does not exist, create it
target_folder = os.path.dirname(args.output_filename)
# An empty dirname means "current directory": nothing to create. The string
# is tested by value (truthiness), not by identity with a literal, and
# exist_ok tolerates the folder appearing between the check and the call.
if target_folder and not os.path.exists(target_folder):
    os.makedirs(target_folder, exist_ok=True)
## check if extension of output file is .ht, if not add it
filename, file_extension = os.path.splitext(args.output_filename)
Esempio n. 21
0
    def filter(self, mt):
        """Run an intercept-only case/control GWAS and count the significant hits.

        Returns a tuple of the regression result table and the number of its
        rows whose p-value is below 5e-8.
        """
        association = hl.linear_regression_rows(
            y=mt.is_case,
            x=mt.GT.n_alt_alleles(),
            covariates=[1.0],
        )
        significant = association.filter(association.p_value < 5E-8)
        return association, significant.count()
Esempio n. 22
0
# Summary statistics of the noiseless phenotype; the result is not stored.
sim_mt.aggregate_cols(hl.agg.stats(sim_mt.y_no_noise))

# Not used again in the visible part of the script.
y = sim_mt.select_cols(sim_mt.y_no_noise).make_table()

# Covariates come from a local file, keyed by sample id `s` read as a string.
cov = hl.import_table('/Users/nbaya/Documents/lab/ldscsim/ukb31063.gwas_covariates.both_sexes.tsv',impute=True, types={'s': hl.tstr}).key_by('s')

mt0 = sim_mt.annotate_cols(**cov[sim_mt.s])

# The normalised genotype entry field becomes the regressor `x`.
mt0 = mt0.rename({'__norm_gt__': 'x'})

mt = mt0

# Sex/age terms plus PC1..PC20.
cov_list = [ mt['isFemale'], mt['age'], mt['age_squared'], mt['age_isFemale'],
                            mt['age_squared_isFemale'] ]+ [mt['PC{:}'.format(i)] for i in range(1, 21)] 

ht = hl.linear_regression_rows(
                y=mt.y,
                x=mt.x,
                covariates=[1]+cov_list,
                pass_through = ['rsid'])

ht = ht.rename({'rsid':'SNP'}).key_by('SNP')

# Z-score computed as effect size over its standard error.
ht = ht.select(Z = ht.beta/ht.standard_error)

sumstats_template = hl.import_table('gs://nbaya/rg_sex/50_snps_alleles_N.tsv.gz',types={'N': hl.tint64})
sumstats_template = sumstats_template.key_by('SNP')
# Overwrite the template's N with n_samples (defined outside this view).
sumstats_template = sumstats_template.annotate(N = n_samples)
#            sumstats_template.show()

# Attach each template SNP's Z-score from the regression table.
sumstats = sumstats_template.annotate(Z = ht[sumstats_template.SNP]['Z'])