Python linear_regression_rows Examples, hail.linear_regression_rows Python Examples

Example #1

0

Show file

File: test_plots.py Project: Nealelab/preimp_qc

def man_qq_plts(mt):
    """Render a Manhattan plot and a QQ plot for a case/control GWAS.

    Regresses ``mt.is_case`` on the alternate-allele count (intercept-only
    covariates) with ``hl.linear_regression_rows``, converts the p-values to
    a pandas frame and draws both plots side by side with ``qqman``.

    Returns:
        An HTML ``<img>`` tag embedding the figure as a base64-encoded PNG.
    """
    association = hl.linear_regression_rows(y=mt.is_case,
                                            x=mt.GT.n_alt_alleles(),
                                            covariates=[1.0])

    frame = association.select(association.p_value).to_pandas()

    # Keep contig / position / p-value only, under the CHR / BP / P names
    # that are handed to qqman below.
    plot_df = frame[['locus.contig', 'locus.position', 'p_value']]
    plot_df.columns = ['CHR', 'BP', 'P']
    plot_df = plot_df.dropna()
    # Non-numeric contigs are mapped to integer codes.
    plot_df = plot_df.replace(to_replace=["X", "Y", "MT"],
                              value=[23, 24, 25])

    fig, (manhattan_ax, qq_ax) = plt.subplots(nrows=1,
                                              ncols=2,
                                              figsize=(25, 10))
    qqman.manhattan(plot_df,
                    ax=manhattan_ax,
                    xrotation=90.0,
                    title="Manhattan plot")
    qqman.qqplot(plot_df, ax=qq_ax, title="QQ plot")
    fig.tight_layout()

    # Serialise the current figure into memory, then release it.
    png = io.BytesIO()
    plt.savefig(png, format='PNG')
    plt.clf()
    plt.close()

    encoded = base64.b64encode(png.getvalue()).decode('ascii')
    return '<img src="data:image/png;base64,{}">'.format(encoded)

Example #2

0

Show file

File: gwas_hail.py Project: hail-is/batch-demo

def run_gwas(vcf_file, phenotypes_file, output_file):
    """Run a caffeine-consumption GWAS on a VCF and export the results.

    The phenotype table is keyed by ``Sample`` and joined onto the columns.
    PCA scores are computed on a 1% row sample (fixed seed) and the first
    three are used as covariates next to the intercept.

    Side effects:
        Writes ``tmp.mt``, ``{output_file}.assoc`` (SNP / P table) and a
        PLINK export at ``output_file``.
    """
    phenotypes = hl.import_table(phenotypes_file, impute=True).key_by('Sample')

    # Round-trip the VCF through a native MatrixTable before using it.
    hl.import_vcf(vcf_file).write('tmp.mt')
    mt = hl.read_matrix_table('tmp.mt')
    mt = mt.annotate_cols(pheno=phenotypes[mt.s])

    subsample = mt.sample_rows(0.01, seed=11223344)
    _, pc_scores, _ = hl.hwe_normalized_pca(subsample.GT)
    mt = mt.annotate_cols(scores=pc_scores[mt.s].scores)

    covariates = [1.0] + [mt.scores[k] for k in range(3)]
    results = hl.linear_regression_rows(y=mt.pheno.CaffeineConsumption,
                                        x=mt.GT.n_alt_alleles(),
                                        covariates=covariates)

    # Reduce to a table keyed by the variant string with a single P column.
    results = results.select(SNP=hl.variant_str(results.locus,
                                                results.alleles),
                             P=results.p_value)
    keyed = results.key_by(results.SNP)
    keyed = keyed.select(keyed.P)
    keyed.export(f'{output_file}.assoc', header=True)

    hl.export_plink(mt, output_file, fam_id=mt.s, ind_id=mt.s)

Example #3

0

Show file

def gwas(mt,
         x,
         y,
         cov_list=[],
         with_intercept=True,
         pass_through=[],
         path_to_save=None,
         normalize_x=False,
         is_std_cov_list=False):
    '''Regress ``y`` on ``x`` per row and join Z / N onto a sumstats template.

    Args:
        mt: MatrixTable the expressions belong to.
        x: entry expression used as the regressor (stored as ``__x``).
        y: column expression used as the response (stored as ``__y``).
        cov_list: covariates; string items are looked up as fields of ``mt``,
            anything else is passed through unchanged. Never mutated here.
        with_intercept: prepend the constant ``1`` to the covariates.
        pass_through: extra row fields to carry through; ``'rsid'`` is always
            included. Never mutated here.
        path_to_save: when not None, the resulting table is exported there.
        normalize_x: standardise ``__x`` per row (subtract mean, divide by
            stdev) before regressing.
        is_std_cov_list: replace ``cov_list`` with the fixed demographic
            fields plus ``PC1``..``PC20``.

    Returns:
        The sumstats template table annotated with ``Z`` and ``N``.
    '''
    mt = mt._annotate_all(col_exprs={'__y': y}, entry_exprs={'__x': x})

    if normalize_x:
        mt = mt.annotate_rows(__gt_stats=hl.agg.stats(mt.__x))
        centered = mt.__x - mt.__gt_stats.mean
        mt = mt.annotate_entries(__x=centered / mt.__gt_stats.stdev)
        mt = mt.drop('__gt_stats')

    if is_std_cov_list:
        demographic = [
            'isFemale', 'age', 'age_squared', 'age_isFemale',
            'age_squared_isFemale'
        ]
        cov_list = demographic + ['PC{:}'.format(k) for k in range(1, 21)]

    # Only rebuild the list when at least one covariate is given by name.
    if any(type(cov) is str for cov in cov_list):
        cov_list = [mt[cov] if type(cov) is str else cov for cov in cov_list]

    intercept = [1] if with_intercept else []
    cov_list = intercept + cov_list

    pass_through = list(set(['rsid'] + pass_through))
    print(f'variables to pass through: {pass_through}')

    results = hl.linear_regression_rows(y=mt.__y,
                                        x=mt.__x,
                                        covariates=cov_list,
                                        pass_through=pass_through)
    results = results.annotate_globals(with_intercept=with_intercept)
    results = results.rename({'rsid': 'SNP'}).key_by('SNP')
    results = results.select(Z=results.t_stat, N=results.n)

    # sumstats template as a hail table
    template = hl.read_table('gs://nbaya/rg_sex/hm3.sumstats_template.ht')
    template = template.key_by('SNP')

    ss = template.annotate(Z=results[template.SNP].Z,
                           N=results[template.SNP].N)

    if path_to_save is not None:
        ss.export(path_to_save)

    return ss

Example #4

0

Show file

def run_grouped_regressions(mt, ss_output, pheno, pheno_name):
    """Regress each listed phenotype on dosage and write the result table.

    Every name in ``pheno`` is looked up in ``mt['phenotypes']`` and wrapped
    in its own one-element list for ``y``; every field of ``mt['covariates']``
    is used as a covariate after the intercept. The table is written to
    ``ss_output + pheno_name + '.ht'``; the overwrite flag is read from the
    module-level ``args``.
    """
    responses = [[mt['phenotypes'][name]] for name in pheno]

    covariate_fields = list(mt['covariates'].keys())
    covariates = [1] + [mt['covariates'][field] for field in covariate_fields]

    ht = hl.linear_regression_rows(y=responses,
                                   x=mt.dosage,
                                   covariates=covariates,
                                   pass_through=['varid', 'rsid'])

    # TODO (from the original author): check this
    ht = ht.annotate_globals(phenotypes=pheno)

    destination = ss_output + pheno_name + '.ht'
    ht.write(destination, overwrite=args.overwrite)

Example #5

0

Show file

def gwas(mt,
         x,
         y,
         cov_list=[],
         with_intercept=True,
         pass_through=[],
         path_to_save=None,
         normalize_x=True,
         is_std_cov_list=False):
    '''Regress ``y`` on ``x`` per row; optionally export template sumstats.

    Args:
        mt: MatrixTable the expressions belong to.
        x: entry expression used as the regressor (stored as ``x``).
        y: column expression used as the response (stored as ``y``).
        cov_list: covariates; string items are looked up as fields of ``mt``.
        with_intercept: prepend the constant ``1`` to the covariates.
        pass_through: extra row fields carried through after ``'rsid'``.
        path_to_save: when not None, the template-joined sumstats are
            exported there.
        normalize_x: run ``normalize_genotypes`` (defined elsewhere) and use
            the ``__norm_gt`` entry field it leaves behind as ``x``.
        is_std_cov_list: replace ``cov_list`` with the fixed demographic
            fields plus ``PC1``..``PC20``.

    Returns:
        The regression table keyed by ``SNP`` with fields ``Z`` and ``N``.
        Note this is NOT the template-joined table that gets exported.
    '''
    mt = mt._annotate_all(col_exprs={'y': y}, entry_exprs={'x': x})

    if normalize_x:
        mt = normalize_genotypes(mt, mt.x)
        mt = mt.annotate_entries(x=mt.__norm_gt)
        mt = mt.drop('__norm_gt')

    if is_std_cov_list:
        demographic = [
            'isFemale', 'age', 'age_squared', 'age_isFemale',
            'age_squared_isFemale'
        ]
        cov_list = demographic + ['PC{:}'.format(k) for k in range(1, 21)]

    # Only rebuild the list when at least one covariate is given by name.
    if any(type(cov) is str for cov in cov_list):
        cov_list = [mt[cov] if type(cov) is str else cov for cov in cov_list]

    intercept = [1] if with_intercept else []
    cov_list = intercept + cov_list

    print(pass_through)

    gwas_ht = hl.linear_regression_rows(y=mt.y,
                                        x=mt.x,
                                        covariates=cov_list,
                                        pass_through=['rsid'] + pass_through)
    gwas_ht = gwas_ht.annotate_globals(with_intercept=with_intercept)
    gwas_ht = gwas_ht.rename({'rsid': 'SNP'}).key_by('SNP')
    gwas_ht = gwas_ht.select(Z=gwas_ht.t_stat, N=gwas_ht.n)

    template = hl.import_table('gs://nbaya/rg_sex/50_snps_alleles_N.tsv.gz',
                               types={'N': hl.tint64})
    template = template.key_by('SNP')

    merged = template.annotate(Z=gwas_ht[template.SNP].Z,
                               N=gwas_ht[template.SNP].N)

    if path_to_save is not None:
        merged.export(path_to_save)

    return gwas_ht

Example #6

0

Show file

def linear_regression_rows(mt_path):
    """Run a 100-phenotype, 20-covariate regression on random column data.

    Reads the MatrixTable at ``mt_path``, attaches uniformly random
    phenotype and covariate column fields, regresses all phenotypes on the
    ``x`` entry field and forces the result to be computed.
    """
    mt = hl.read_matrix_table(mt_path)

    num_phenos = 100
    num_covs = 20
    phenotypes = {
        f"pheno_{idx}": hl.rand_unif(0, 1)
        for idx in range(num_phenos)
    }
    covariates = {f"cov_{idx}": hl.rand_unif(0, 1) for idx in range(num_covs)}

    mt = mt.annotate_cols(**phenotypes)
    mt = mt.annotate_cols(**covariates)

    response_exprs = [mt[name] for name in phenotypes]
    covariate_exprs = [mt[name] for name in covariates]
    res = hl.linear_regression_rows(y=response_exprs,
                                    x=mt.x,
                                    covariates=covariate_exprs)
    res._force_count()

Example #7

0

Show file

def run_gwas(mt,
             phen: str,
             sim_name: str,
             subset_idx: int,
             param_suffix: str,
             wd: str,
             is_logreg=True):
    """Run (or reload) a per-subset GWAS of ``mt[phen]``.

    Args:
        mt: MatrixTable with an entry field ``dosage`` or ``GT`` and column
            fields ``subset_idx`` and ``phen``.
        phen: name of the column field used as the response.
        sim_name: simulation label, used in the output file name.
        subset_idx: only columns with ``mt.subset_idx == subset_idx`` and a
            defined phenotype are kept.
        param_suffix: extra label, used in the output file name.
        wd: directory (prefix) under which the result file lives.
        is_logreg: use Wald logistic regression when True, linear otherwise.

    Returns:
        The regression table. When the result file already exists it is
        re-imported instead and keyed by ``locus`` / ``alleles``.

    Raises:
        AssertionError: if ``mt`` has neither a ``dosage`` nor a ``GT``
            entry field.
    """
    # A set never compares equal to ``{}`` (an empty *dict*), so the previous
    # ``!= {}`` check could never fail. Test the intersection's truthiness.
    assert {'GT', 'dosage'}.intersection(
        mt.entry
    ), "mt does not have an entry field named 'dosage' or 'GT' corresponding to genotype data"

    mt = mt.filter_cols(mt.subset_idx == subset_idx)
    mt = mt.filter_cols(hl.is_defined(mt[phen]))
    print(
        f'\n\ngwas sample count (subset {subset_idx}): {mt.count_cols()}\n\n')

    # Effect allele frequency = mean alternate-allele dosage / 2.
    if 'dosage' in mt.entry:
        mt = mt.annotate_rows(EAF=hl.agg.mean(mt.dosage) / 2)
    elif 'GT' in mt.entry:
        mt = mt.annotate_rows(EAF=hl.agg.mean(mt.GT.n_alt_alleles()) / 2)

    gwas_path = f'{wd}/gwas.{"logreg" if is_logreg else "linreg"}.{sim_name}.subset_{subset_idx}.{param_suffix}.tsv.gz'

    if not hl.hadoop_is_file(gwas_path):
        # Built from the annotated ``mt`` so it shares its source with it.
        gt_field = mt.dosage if 'dosage' in mt.entry else mt.GT.n_alt_alleles()

        if is_logreg:
            gwas_ht = hl.logistic_regression_rows(test='wald',
                                                  y=mt[phen],
                                                  x=gt_field,
                                                  covariates=[1],
                                                  pass_through=['EAF'])
        else:
            gwas_ht = hl.linear_regression_rows(y=mt[phen],
                                                x=gt_field,
                                                covariates=[1],
                                                pass_through=['EAF'])
        gwas_ht.select('EAF', 'beta', 'standard_error',
                       'p_value').export(gwas_path)

    else:
        print(f'GWAS already run! ({gwas_path})')
        gwas_ht = hl.import_table(gwas_path, impute=True, force=True)
        # The exported alleles look like ["A","T"]: strip the brackets and
        # split on the separator. Raw strings keep the regex backslashes
        # without relying on invalid escape sequences; the pattern values are
        # unchanged.
        gwas_ht = gwas_ht.annotate(locus=hl.parse_locus(gwas_ht.locus),
                                   alleles=gwas_ht.alleles.replace(
                                       r'\["', '').replace(r'"\]',
                                                           '').split('","'))
        gwas_ht = gwas_ht.key_by('locus', 'alleles')

    return gwas_ht

Example #8

0

Show file

File: run_gwas.py Project: saponas/hail

def run_gwas(vcf_file, phenotypes_file, output_file):
    """QC a VCF, run a caffeine-consumption GWAS and export the results.

    Steps, in order: join the phenotype table (keyed by ``Sample``) onto the
    columns; filter samples on mean depth and call rate; filter genotypes on
    allele balance; keep variants whose ``variant_qc.AF[1]`` exceeds 0.01;
    compute PCA scores; regress ``CaffeineConsumption`` on the
    alternate-allele count with an intercept, ``isFemale`` and the first
    three PC scores.

    Side effects:
        Writes ``tmp.mt``, ``{output_file}.assoc`` (SNP / P table) and a
        PLINK export at ``output_file``.
    """
    phenotypes = hl.import_table(phenotypes_file, impute=True).key_by('Sample')

    # Round-trip the VCF through a native MatrixTable before using it.
    hl.import_vcf(vcf_file).write('tmp.mt')
    mt = hl.read_matrix_table('tmp.mt')
    mt = mt.annotate_cols(pheno=phenotypes[mt.s])

    # Sample-level QC.
    mt = hl.sample_qc(mt)
    deep_enough = mt.sample_qc.dp_stats.mean >= 4
    well_called = mt.sample_qc.call_rate >= 0.97
    mt = mt.filter_cols(deep_enough & well_called)

    # Genotype-level QC: the allele balance must agree with the called GT.
    ab = mt.AD[1] / hl.sum(mt.AD)
    hom_ref_ok = mt.GT.is_hom_ref() & (ab <= 0.1)
    het_ok = mt.GT.is_het() & (ab >= 0.25) & (ab <= 0.75)
    hom_var_ok = mt.GT.is_hom_var() & (ab >= 0.9)
    mt = mt.filter_entries(hom_ref_ok | het_ok | hom_var_ok)

    # Variant-level QC.
    mt = hl.variant_qc(mt)
    mt = mt.filter_rows(mt.variant_qc.AF[1] > 0.01)

    _, pc_scores, _ = hl.hwe_normalized_pca(mt.GT)
    mt = mt.annotate_cols(scores=pc_scores[mt.s].scores)

    covariates = [1.0, mt.pheno.isFemale] + [mt.scores[k] for k in range(3)]
    results = hl.linear_regression_rows(y=mt.pheno.CaffeineConsumption,
                                        x=mt.GT.n_alt_alleles(),
                                        covariates=covariates)

    # Reduce to a table keyed by the variant string with a single P column.
    results = results.select(SNP=hl.variant_str(results.locus,
                                                results.alleles),
                             P=results.p_value)
    keyed = results.key_by(results.SNP)
    keyed = keyed.select(keyed.P)
    keyed.export(f'{output_file}.assoc', header=True)

    hl.export_plink(mt, output_file, fam_id=mt.s, ind_id=mt.s)

Example #9

0

Show file

File: sex_strat_iter_split_gwas.py Project: nikbaya/split

        # Two-group case: run one regression per group (group_id 0 and 1)
        # with identical covariates, then reduce each result to a Z score
        # keyed by SNP.
        if n_chunks == 2:  #traditional split of population into even halves (no meta-analysis)
            mt_A = mt.filter_cols(mt.group_id == 0)
            mt_B = mt.filter_cols(mt.group_id == 1)

            # Covariates are rebuilt per group so that each expression is
            # taken from the MatrixTable it is regressed on.
            cov_list_A = [
                mt_A['isFemale'], mt_A['age'], mt_A['age_squared'],
                mt_A['age_isFemale'], mt_A['age_squared_isFemale']
            ] + [mt_A['PC{:}'.format(i)] for i in range(1, 21)]

            cov_list_B = [
                mt_B['isFemale'], mt_B['age'], mt_B['age_squared'],
                mt_B['age_isFemale'], mt_B['age_squared_isFemale']
            ] + [mt_B['PC{:}'.format(i)] for i in range(1, 21)]

            # The leading 1 is the intercept term.
            ht_A = hl.linear_regression_rows(y=mt_A.y,
                                             x=mt_A.x,
                                             covariates=[1] + cov_list_A,
                                             pass_through=['rsid'])

            ht_B = hl.linear_regression_rows(y=mt_B.y,
                                             x=mt_B.x,
                                             covariates=[1] + cov_list_B,
                                             pass_through=['rsid'])

            # Re-key both result tables by the passed-through rsid, as SNP.
            ht_A = ht_A.rename({'rsid': 'SNP'}).key_by('SNP')
            ht_B = ht_B.rename({'rsid': 'SNP'}).key_by('SNP')

            # Z score = effect size / standard error.
            ht_A = ht_A.select(Z=ht_A.beta / ht_A.standard_error)
            ht_B = ht_B.select(Z=ht_B.beta / ht_B.standard_error)

            sumstats_template = hl.import_table(
                'gs://nbaya/rg_sex/50_snps_alleles_N.tsv.gz',

Example #10

0

Show file

    test="firth",  #controls false positives
    y=hl.float(mt.AffectionBool),
    x=mt.GT.n_alt_alleles(),
    covariates=[
        1,
        hl.float(mt.PackYear), mt.scores[0], mt.scores[1], mt.scores[2],
        mt.scores[3], mt.scores[4], mt.scores[5], mt.scores[6], mt.scores[7],
        mt.scores[8], mt.scores[9]
    ])

######## 6.2 Linear regression ~ FEV1% predicted (lung function variable)
# This call is a plain linear regression; no ``test=`` argument (and so no
# Firth test) is passed here.
gwas = hl.linear_regression_rows(y=hl.float(mt.FEV1),
                                 x=mt.GT.n_alt_alleles(),
                                 covariates=[
                                     1,
                                     hl.float(mt.PackYear), mt.scores[0],
                                     mt.scores[1], mt.scores[2], mt.scores[3],
                                     mt.scores[4], mt.scores[5], mt.scores[6],
                                     mt.scores[7], mt.scores[8], mt.scores[9]
                                 ])

######## 6.3 Q-Q plot
qqplot = hl.plot.qq(gwas.p_value)
show(qqplot)

######## 6.4 Manhattan-like plots
# GWAS significance level = 5.0e-8, suggestive: 5.0e-8 < P < 5.0e-6.
# NOTE(review): the original comment wrote these as "10e-8" / "10e-6";
# confirm the intended exponents.

# Calculate Bonferroni based cut off lines
# Both thresholds are on the -log10 scale and use the number of rows in mt:
# 0.05 / n for the Bonferroni line and 1 / n for the suggestive line.
Bonferroni_line = -np.log10(0.05 / mt.count_rows())
Suggestive_line = -np.log10(1 / mt.count_rows())

Example #11

0

Show file

# Benchmark script: import a VCF, apply sample / genotype / variant QC, run an
# intercept-only linear regression and build a Manhattan plot, timing
# everything after the initial I/O.
args = parser.parse_args()


# Convert the VCF to a native MatrixTable under the work dir and read it back.
hl.import_vcf(args.vcf_path).write(args.workdir + 'hail.mt', overwrite=True)
mt = hl.read_matrix_table(args.workdir + 'hail.mt')
# Annotation table keyed by sample id, joined onto the columns below.
table = (hl.import_table(args.annotation_path, impute=True)
     .key_by('Sample'))

# Start benchmarking after I/O
t0 = time.time()
mt = mt.annotate_cols(pheno = table[mt.s])
# Sample QC: keep samples with mean depth >= 4 and call rate >= 0.97.
mt = hl.sample_qc(mt)
mt = mt.filter_cols((mt.sample_qc.dp_stats.mean >= 4) & (mt.sample_qc.call_rate >= 0.97))
# Allele balance: second AD element over the sum of AD.
ab = mt.AD[1] / hl.sum(mt.AD)

# A genotype is kept only when its allele balance agrees with the called GT.
filter_condition_ab = ((mt.GT.is_hom_ref() & (ab <= 0.1)) |
                    (mt.GT.is_het() & (ab >= 0.25) & (ab <= 0.75)) |
                    (mt.GT.is_hom_var() & (ab >= 0.9)))

# Fraction of entries failing the filter; not used again in this snippet.
fraction_filtered = mt.aggregate_entries(hl.agg.fraction(~filter_condition_ab))
mt = mt.filter_entries(filter_condition_ab)
# Variant QC: AF[1] above 1% and HWE p-value above 1e-6.
mt = hl.variant_qc(mt)
mt = mt.filter_rows(mt.variant_qc.AF[1] > 0.01)
mt = mt.filter_rows(mt.variant_qc.p_value_hwe > 1e-6)
# Intercept-only regression of the phenotype on alternate-allele count.
gwas = hl.linear_regression_rows(y=mt.pheno.CaffeineConsumption,
                             x=mt.GT.n_alt_alleles(),
                             covariates=[1.0])
p = hl.plot.manhattan(gwas.p_value)

print('Time Elapsed: {}'.format(time.time()- t0))

Example #12

0

Show file

File: sex_differential_simulations_sample_size.py Project: hegu2692/genobias

def gwas(y, x, cov):
    """Run ``hl.linear_regression_rows`` of ``y`` on ``x``.

    ``cov`` is passed as the covariates and the ``rsid`` row field is
    carried through to the returned table.
    """
    return hl.linear_regression_rows(y=y,
                                     x=x,
                                     covariates=cov,
                                     pass_through=['rsid'])

Example #13

0

Show file

# COMMAND ----------

print('Samples: %d  Variants: %d' %
      (common_mt.count_cols(), common_mt.count_rows()))

# COMMAND ----------

# MAGIC %md These filters removed about 15% of sites (we started with a bit over 10,000). This is _NOT_ representative of most sequencing datasets! We have already downsampled the full thousand genomes dataset to include more common variants than we'd expect by chance.
# MAGIC
# MAGIC In Hail, the association tests accept column fields for the sample phenotype and covariates. Since we've already got our phenotype of interest (caffeine consumption) in the dataset, we are good to go:

# COMMAND ----------

# Intercept-only regression of caffeine consumption on alternate-allele
# count; describe() then shows the row schema of the result.
gwas = hl.linear_regression_rows(y=common_mt.CaffeineConsumption,
                                 x=common_mt.GT.n_alt_alleles(),
                                 covariates=[1.0])
gwas.row.describe()

# COMMAND ----------

# MAGIC %md Looking at the bottom of the above printout, you can see the linear regression adds new row fields for the beta, standard error, t-statistic, and p-value.
# MAGIC
# MAGIC Hail makes it easy to make a [Q-Q (quantile-quantile) plot](https://en.wikipedia.org/wiki/Q–Q_plot).

# COMMAND ----------

p = hl.plot.qq(gwas.p_value)
displayBokeh(p)

# COMMAND ----------

Example #14

0

Show file

def gwas(mt,
         x,
         y,
         cov_list=[],
         with_intercept=True,
         pass_through=[],
         path_to_save=None,
         normalize_x=False,
         is_std_cov_list=False):
    r'''Run a GWAS and join its statistics onto the HM3 sumstats template.

    Args:
        mt: MatrixTable the expressions belong to. It must also carry a
            ``dosage`` entry field and an ``rsid`` row field, which are used
            for the allele frequency and for keying.
        x: entry expression used as the regressor (stored as ``__x``).
        y: column expression used as the response (stored as ``__y``).
        cov_list: covariates; string items are looked up as fields of ``mt``,
            anything else is passed through unchanged. Never mutated here.
        with_intercept: prepend the constant ``1`` to the covariates.
        pass_through: extra row fields carried through after ``'rsid'``.
            Never mutated here.
        path_to_save: when not None, the resulting table is exported there.
        normalize_x: standardise ``__x`` per row (subtract mean, divide by
            stdev) before regressing.
        is_std_cov_list: replace ``cov_list`` with the fixed demographic
            fields plus ``PC1``..``PC20``.

    Returns:
        The template table with fields ``chr``, ``bpos``, ``freq``, ``beta``,
        ``z``, ``pval`` and ``n`` added, ``N`` dropped, and ``SNP`` / ``A1`` /
        ``A2`` renamed to ``snpid`` / ``a1`` / ``a2``.
    '''

    mt = mt._annotate_all(col_exprs={'__y': y}, entry_exprs={'__x': x})

    print('\n... Calculating allele frequency ...')
    # frequency of alternate allele
    mt_freq_rows = mt.annotate_rows(freq=hl.agg.mean(mt.dosage) / 2).rows()
    mt_freq_rows = mt_freq_rows.key_by('rsid')

    if normalize_x:
        mt = mt.annotate_rows(__gt_stats=hl.agg.stats(mt.__x))
        mt = mt.annotate_entries(__x=(mt.__x - mt.__gt_stats.mean) /
                                 mt.__gt_stats.stdev)
        mt = mt.drop('__gt_stats')

    if is_std_cov_list:
        cov_list = [
            'isFemale', 'age', 'age_squared', 'age_isFemale',
            'age_squared_isFemale'
        ] + ['PC{:}'.format(i) for i in range(1, 21)]

    # Covariates given by name are resolved against ``mt``. The guard keeps
    # the caller's list object untouched when there is nothing to resolve.
    if any(isinstance(cov, str) for cov in cov_list):
        cov_list = [
            mt[cov] if isinstance(cov, str) else cov for cov in cov_list
        ]

    cov_list = ([1] if with_intercept else []) + cov_list

    print(f'pass through: {pass_through}')

    gwas_ht = hl.linear_regression_rows(y=mt.__y,
                                        x=mt.__x,
                                        covariates=cov_list,
                                        pass_through=['rsid'] + pass_through)

    gwas_ht = gwas_ht.annotate_globals(with_intercept=with_intercept)

    gwas_ht = gwas_ht.key_by('rsid')

    # sumstats template as a hail table
    ss_template = hl.read_table('gs://nbaya/rg_sex/hm3.sumstats_template.ht')
    ss_template = ss_template.key_by('SNP')

    # Build the lookup into the regression results once and reuse it for
    # every annotated field instead of constructing the same join per field.
    hit = gwas_ht[ss_template.SNP]
    ss = ss_template.annotate(chr=hit.locus.contig,
                              bpos=hit.locus.position,
                              freq=mt_freq_rows[ss_template.SNP].freq,
                              beta=hit.beta,
                              z=hit.t_stat,
                              pval=hit.p_value,
                              n=hit.n)
    ss = ss.drop('N')
    ss = ss.rename({'SNP': 'snpid', 'A1': 'a1', 'A2': 'a2'})

    # describe() prints the schema itself and returns None; wrapping it in
    # print() only added a stray "None" line.
    ss.describe()

    if path_to_save is not None:
        ss.export(path_to_save)

    return ss

Example #15

0

Show file

 # One regression call over all normalised full-blood-count phenotypes under
 # mt.sample_qc_and_phenotype.fbc, with an intercept-only covariate list and
 # the rsid row field carried through.
 gwas = hl.linear_regression_rows(
     y=[
         mt.sample_qc_and_phenotype.fbc.neut_p_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.eo_p_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.mono_p_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.lymph_p_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.baso_p_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.ret_p_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.hlr_p_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.hct_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.pct_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.hgb_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.rbc_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.wbc_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.mpv_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.plt_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.rdw_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.pdw_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.mcv_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.mch_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.mchc_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.ret_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.hlr_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.neut_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.mono_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.baso_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.eo_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.lymph_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.irf_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.myeloid_wbc_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.gran_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.eo_baso_sum_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.neut_eo_sum_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.baso_neut_sum_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.gran_p_myeloid_wbc_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.eo_p_gran_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.neut_p_gran_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.baso_p_gran_gwas_normalised
         # NOTE: the two disabled entries below (wbc, mpv) already appear in
         # the active list above.
         #                   mt.sample_qc_and_phenotype.fbc.wbc_gwas_normalised,
         #                   mt.sample_qc_and_phenotype.fbc.mpv_gwas_normalised],
     ],
     x=mt.GT.n_alt_alleles(),
     covariates=[1.0],
     pass_through=[mt.rsid])

Example #16

0

Show file

File: simulations_heckman.py Project: hegu2692/genobias

# Index into mt.y of the value stored as column field U.
u = 8

# One label per loop iteration below; rgs[i] is embedded in the output file
# names. (Presumably the simulated genetic correlations - confirm.)
rgs = [-0.3, -0.1, 0, 0.1, 0.3]

# Import matrix and annotate cols with phenotypes
mt = hl.read_matrix_table(input_matrix)

mt = mt.annotate_cols(U=mt.y[u])
# NOTE(review): x is not defined in this snippet; it must come from earlier
# code.
mt = mt.annotate_cols(X=mt.y[x])

for i in range(5):
    # Y for iteration i is element i + 2 of mt.y.
    mt = mt.annotate_cols(Y=mt.y[i + 2])

    # GWAS of X, Y in all
    result_ht = hl.linear_regression_rows(y=[mt.X, mt.Y],
                                          x=mt.GT.n_alt_alleles(),
                                          covariates=[1],
                                          pass_through=['rsid'])

    # Split the alleles into A1 / A2 and drop the key.
    result_ht = result_ht.annotate(A1=result_ht.alleles[0],
                                   A2=result_ht.alleles[1]).key_by()

    # One output file per response, in the same order as y above (X, then Y).
    file_names = [
        output_bucket + 'gwas/gwas_X_' + str(rgs[i]) + '.tsv',
        output_bucket + 'gwas/gwas_Y_' + str(rgs[i]) + '.tsv'
    ]

    for j, file_name in enumerate(file_names):
        result_ht.select(result_ht.locus,
                         result_ht.A1,
                         result_ht.A2,
                         result_ht.rsid,

Example #17

0

Show file

# Import dosages for one contig from BGEN, restricted to the variants in
# ht_variants.
mt = hl.import_bgen(
    path=
    f'gs://fc-7d5088b4-7673-45b5-95c2-17ae00a04183/imputed/ukb_imp_{contig_expr}_v3.bgen',
    sample_file=f'gs://ukb31063/ukb31063.{contig}.sample',
    entry_fields=['dosage'],
    variants=ht_variants)

# Attach the per-sample phenotype and covariate structs, joined on sample id.
mt = mt.annotate_cols(phenotypes=ht_phenotypes[mt.s],
                      covariates=ht_covariates[mt.s])

phenotypes = list(mt['phenotypes'].keys())

# Each phenotype is wrapped in its own one-element list (y is a list of
# lists); every covariate field follows the intercept term 1.
# NOTE(review): presumably the list-of-lists form makes Hail treat each
# phenotype as its own group - confirm against the Hail documentation.
ht = hl.linear_regression_rows(
    y=[[mt['phenotypes'][y]] for y in phenotypes],
    x=mt.dosage,
    covariates=[
        1, *[mt['covariates'][x] for x in list(mt['covariates'].keys())]
    ],
    pass_through=['varid', 'rsid'])

# Record the phenotype names, in the same order as y, as a global field.
ht = ht.annotate_globals(phenotypes=phenotypes)

# The two output paths differ only by the ".dilution_factor" suffix.
if dilution:
    ht.write(
        f'gs://ukb31063-mega-gwas/biomarkers/results/ukb31063.biomarker_gwas_results.{sex}.{contig}.pipeline_{pipeline}.dilution_factor.ht',
        overwrite=True)
else:
    ht.write(
        f'gs://ukb31063-mega-gwas/biomarkers/results/ukb31063.biomarker_gwas_results.{sex}.{contig}.pipeline_{pipeline}.ht',
        overwrite=True)

Example #18

0

Show file

File: spike_slab.py Project: armartin/mama_benchmarks

def main(args):
    """Drive the chr22 spike-and-slab simulation, GWAS and export stages.

    Three independent stages are gated by flags on ``args``:

    * ``compute_true_phenotypes``: sum ``beta * dosage`` over causal sites
      for each beta column and write the per-sample values.
    * ``run_gwas``: for ten random subsets of 10000 samples, regress all
      beta-derived phenotypes on dosage with an intercept and PC1..PC20,
      writing the sample list and the result table for each subset.
    * ``write_gwas``: flatten each stored result table to a TSV.
    """
    betas = ['beta_01', 'beta_1', 'beta_10', 'beta_100']

    covariate_path = 'gs://phenotype_31063/ukb31063.gwas_covariates.both_sexes.tsv'
    dosage_mt_path = 'gs://phenotype_31063/hail/imputed/ukb31063.dosage.autosomes.mt'
    true_prs_path = 'gs://armartin/mama/spike_slab/BBJ_UKB_hm3.chr22.cm.beta.true_PRS.ht'
    inds_prefix = 'gs://armartin/mama/spike_slab/UKB_hm3.chr22.cm.beta.true_PRS.gwas_inds_'
    sumstat_prefix = 'gs://armartin/mama/spike_slab/UKB_hm3.chr22.cm.beta.true_PRS.gwas_sumstat_'

    spike_slab = hl.import_table(
        'gs://armartin/mama/spike_slab/BBJ_UKB_hm3.chr22.cm.beta.txt',
        impute=True)
    spike_slab = spike_slab.key_by(**hl.parse_variant(spike_slab.v))

    if args.compute_true_phenotypes:
        # get the white british subset
        eur = hl.import_table(covariate_path).key_by('s')

        # read in imputed data, subset to chr22
        mt = hl.read_matrix_table(dosage_mt_path)
        mt = hl.filter_intervals(mt, [hl.parse_locus_interval('22')])

        # annotate and filter imputed data to all sites with causal effects
        mt = mt.annotate_rows(ss=spike_slab[mt.row_key])
        mt = mt.filter_rows(hl.is_defined(mt.ss))

        # compute true PRS (i.e. phenotypes)
        true_prs = {
            beta: hl.agg.sum(mt.ss[beta] * mt.dosage)
            for beta in betas
        }

        # write out phenos for white British unrelated subset
        mt = mt.annotate_cols(**true_prs)
        mt = mt.filter_cols(hl.is_defined(eur[mt.s]))
        mt.cols().write(true_prs_path, stage_locally=True, overwrite=True)

    if args.run_gwas:
        # read back in PRS (now true phenotypes)
        phenos = hl.read_table(true_prs_path).key_by('s')
        phenos.show()

        covariates = hl.import_table(covariate_path,
                                     impute=True,
                                     types={'s': hl.tstr})
        covariates = covariates.key_by('s')

        full_mt = hl.read_matrix_table(dosage_mt_path)
        full_mt = full_mt.annotate_cols(**covariates[full_mt.s])
        full_mt = hl.filter_intervals(full_mt, [hl.parse_locus_interval('22')])

        # annotate and filter imputed data to all sites with causal effects
        full_mt = full_mt.annotate_rows(ss=spike_slab[full_mt.row_key])
        full_mt = full_mt.filter_rows(hl.is_defined(full_mt.ss))

        # subset to white British subset, get 10 sets of 10k and run a gwas
        # for each of these w/ PCs as covs
        for rep in range(10):
            # Shuffle by a random key and keep the first 10000 samples.
            subset_pheno = phenos.annotate(r=hl.rand_unif(0, 1))
            subset_pheno = subset_pheno.order_by(subset_pheno.r)
            subset_pheno = subset_pheno.add_index('global_idx').key_by('s')
            subset_pheno = subset_pheno.filter(subset_pheno.global_idx < 10000)

            mt = full_mt.annotate_cols(**subset_pheno[full_mt.s])
            mt = mt.annotate_rows(maf=hl.agg.mean(mt.dosage) / 2)

            pcs = [mt['PC' + str(pc)] for pc in range(1, 21)]
            result_ht = hl.linear_regression_rows(
                y=[mt[beta] for beta in betas],
                x=mt.dosage,
                covariates=[1] + pcs,
                pass_through=['rsid', 'maf'])

            subset_pheno.export(inds_prefix + str(rep) + '.tsv.gz')
            result_ht.write(sumstat_prefix + str(rep) + '.ht', overwrite=True)

    if args.write_gwas:
        for rep in range(10):
            result_ht = hl.read_table(sumstat_prefix + str(rep) + '.ht')
            result_ht = result_ht.key_by()

            # One flat column per (statistic, beta) pair, e.g. 'p_value_beta_1'.
            per_beta_stats = {
                stat + '_' + beta: result_ht[stat][pos]
                for pos, beta in enumerate(betas)
                for stat in ['beta', 'standard_error', 'p_value']
            }

            flat = result_ht.select(chr=result_ht.locus.contig,
                                    pos=result_ht.locus.position,
                                    rsid=result_ht.rsid,
                                    ref=result_ht.alleles[0],
                                    alt=result_ht.alleles[1],
                                    maf=result_ht.maf,
                                    n=result_ht.n,
                                    **per_beta_stats)
            flat.export(sumstat_prefix + str(rep) + '.tsv.gz')

Example #19

0

Show file

File: review_simulation_C.py Project: hegu2692/genobias

    # Filter MatrixTable and get sample
    # Samples with sel == 1 in the pandas frame are kept.
    samples_to_keep = set(df.loc[(df['sel'] == 1), 's'])
    set_to_keep = hl.literal(samples_to_keep)
    mt_sampled = mt.filter_cols(set_to_keep.contains(mt['s']), keep=True)

    # Suffix that identifies this parameter combination in output file names.
    i = '_' + str(OR_x) + '_' + str(or_sex)

    # Export phenotypes
    mt_sampled.cols().select(
        's', 'sex',
        'y0').key_by().export(out_bucket + 'phenotypes/pheno' + i + '.tsv')

    # Unadjusted GWASs
    # Two responses (sex, y0) with only the intercept as covariate.
    result_ht = hl.linear_regression_rows(y=[mt_sampled.sex, mt_sampled.y0],
                                          x=mt_sampled.GT.n_alt_alleles(),
                                          covariates=[1],
                                          pass_through=['rsid'])

    # Split the alleles into A1 / A2 and drop the key.
    result_ht = result_ht.annotate(A1=result_ht.alleles[0],
                                   A2=result_ht.alleles[1]).key_by()

    # One output file per response, in the same order as y above.
    file_names = [
        out_bucket + 'gwas/gwas_sex' + i + '.tsv',
        out_bucket + 'gwas/gwas_y0' + i + '.tsv'
    ]

    for j, file_name in enumerate(file_names):
        result_ht.select(result_ht.locus,
                         result_ht.A1,
                         result_ht.A2,
                         result_ht.rsid,

Example #20

0

Show file

File: gwas_on_subset.py Project: liangyy/ptrs-ukb

# Close the timer of the preceding "annotate with covariates" step.
tend = time.time()
logging.info('--> Annotate with {} FINISHED! {} seconds elapsed'.format('covariates', tend - tstart))

# prepare phenotypes and covariates into list of lists and list
# One inner list per key of subset_ht_dic, holding every field of mt[key];
# pheno_list_of_names mirrors that structure with '<key>_x_<field>' labels.
logging.info('Start preparing `y` and `covariates` for `linear_regression_rows`')
pheno_list_of_lists = [ [ mt[i][j] for j in mt[i] ] for i in list(subset_ht_dic.keys()) ]
pheno_list_of_names = [ [ f'{i}_x_{j}' for j in mt[i] ] for i in list(subset_ht_dic.keys()) ]
covar_list = [ mt.covariates[i] for i in list(mt.covariates.keys()) ]
logging.info('Prepare `y` and `covariates` for `linear_regression_rows` FINISHED!')

# run GWAS
logging.info('Start running GWAS')
tstart = time.time()
gwas_out = hl.linear_regression_rows(
    y = pheno_list_of_lists,
    x = mt.dosage,
    covariates = [1] + covar_list,
    pass_through = ['varid', 'rsid']
)
# Keep the phenotype labels next to the results as a global field.
gwas_out = gwas_out.annotate_globals(phenotypes = pheno_list_of_names)
tend = time.time()
logging.info('Running GWAS FINISHED! {} seconds elapsed'.format(tend - tstart))

# write GWAS results onto disk
logging.info('Start writing GWAS result to disk')
tstart = time.time()
## if target folder does not exist, create it
target_folder = os.path.dirname(args.output_filename)
# An empty dirname means the output goes to the current directory, so there
# is nothing to create. The previous `is not ''` compared object identity
# with a literal, which is implementation-dependent; use truthiness instead.
# exist_ok=True tolerates the folder appearing between the check and the call.
if target_folder and not os.path.exists(target_folder):
    os.makedirs(target_folder, exist_ok=True)
## check if extension of output file is .ht, if not add it
filename, file_extension = os.path.splitext(args.output_filename)

Example #21

0

Show file

File: annotations.py Project: atgu/GWASpy

    def filter(self, mt):
        """Run a case/control GWAS and count genome-wide significant hits.

        Regresses ``mt.is_case`` on the alternate-allele count with an
        intercept-only covariate list.

        Returns:
            A ``(gwas_table, n_sig_variants)`` tuple, where the count is the
            number of rows with ``p_value < 5E-8``.
        """
        association = hl.linear_regression_rows(y=mt.is_case,
                                                x=mt.GT.n_alt_alleles(),
                                                covariates=[1.0])
        significant = association.filter(association.p_value < 5E-8)

        return association, significant.count()

Example #22

0

Show file

File: ldscsim_v2.0-test.py Project: nikbaya/ldscsim

# Summary statistics of the noiseless phenotype; the result is not stored.
sim_mt.aggregate_cols(hl.agg.stats(sim_mt.y_no_noise))

# Noiseless phenotype as a table; not used again in this snippet.
y = sim_mt.select_cols(sim_mt.y_no_noise).make_table()

# Covariate table keyed by sample id, with s forced to a string.
cov = hl.import_table('/Users/nbaya/Documents/lab/ldscsim/ukb31063.gwas_covariates.both_sexes.tsv',impute=True, types={'s': hl.tstr}).key_by('s')

mt0 = sim_mt.annotate_cols(**cov[sim_mt.s])

# Expose the normalised genotype field under the name x used below.
mt0 = mt0.rename({'__norm_gt__': 'x'})

mt = mt0

# Demographic covariates plus PC1..PC20.
cov_list = [ mt['isFemale'], mt['age'], mt['age_squared'], mt['age_isFemale'],
                            mt['age_squared_isFemale'] ]+ [mt['PC{:}'.format(i)] for i in range(1, 21)] 

# The leading 1 is the intercept term.
ht = hl.linear_regression_rows(
                y=mt.y,
                x=mt.x,
                covariates=[1]+cov_list,
                pass_through = ['rsid'])

# Re-key the results by the passed-through rsid, as SNP.
ht = ht.rename({'rsid':'SNP'}).key_by('SNP')

# Z score = effect size / standard error.
ht = ht.select(Z = ht.beta/ht.standard_error)

sumstats_template = hl.import_table('gs://nbaya/rg_sex/50_snps_alleles_N.tsv.gz',types={'N': hl.tint64})
sumstats_template = sumstats_template.key_by('SNP')
# Overwrite the template's N with n_samples (defined outside this snippet).
sumstats_template = sumstats_template.annotate(N = n_samples)
#            sumstats_template.show()

# Join the Z scores onto the template by SNP.
sumstats = sumstats_template.annotate(Z = ht[sumstats_template.SNP]['Z'])