Python linear_regression_rowsの例、hail.linear_regression_rows Pythonの例

コード例 #1

0

ファイルを表示

ファイル: test_plots.py プロジェクト: Nealelab/preimp_qc

def man_qq_plts(mt):
    """Build side-by-side Manhattan and QQ plots as an inline HTML image.

    Runs an intercept-only linear regression of ``mt.is_case`` on the
    alternate-allele count, converts the p-values to a pandas frame with
    ``CHR``/``BP``/``P`` columns, draws both plots with ``qqman`` and returns
    an ``<img>`` tag whose source is the base64-encoded PNG.
    """
    regression = hl.linear_regression_rows(y=mt.is_case,
                                           x=mt.GT.n_alt_alleles(),
                                           covariates=[1.0])
    frame = regression.select(regression.p_value).to_pandas()

    # Keep only the columns the plots need, under the names used below.
    plot_df = frame[['locus.contig', 'locus.position', 'p_value']]
    plot_df.columns = ['CHR', 'BP', 'P']
    plot_df = plot_df.dropna()

    # Swap the non-numeric values "X", "Y", "MT" for 23, 24, 25.
    plot_df = plot_df.replace(to_replace=["X", "Y", "MT"],
                              value=[23, 24, 25])

    figure, (manhattan_ax, qq_ax) = plt.subplots(nrows=1,
                                                 ncols=2,
                                                 figsize=(25, 10))
    qqman.manhattan(plot_df,
                    ax=manhattan_ax,
                    xrotation=90.0,
                    title="Manhattan plot")
    qqman.qqplot(plot_df, ax=qq_ax, title="QQ plot")
    figure.tight_layout()

    # Render into memory rather than onto disk, then release the figure.
    png = io.BytesIO()
    plt.savefig(png, format='PNG')
    plt.clf()
    plt.close()

    encoded = base64.b64encode(png.getvalue()).decode('ascii')
    return f'<img src="data:image/png;base64,{encoded}">'

コード例 #2

0

ファイルを表示

ファイル: gwas_hail.py プロジェクト: hail-is/batch-demo

def run_gwas(vcf_file, phenotypes_file, output_file):
    """Run a PCA-adjusted linear-regression GWAS and export the results.

    Writes ``<output_file>.assoc`` (SNP identifier and p-value) and exports
    the annotated matrix table in PLINK format under ``output_file``.
    """
    phenotypes = hl.import_table(phenotypes_file, impute=True)
    phenotypes = phenotypes.key_by('Sample')

    # Convert the VCF to a matrix table on disk once and read it back.
    hl.import_vcf(vcf_file).write('tmp.mt')
    mt = hl.read_matrix_table('tmp.mt')
    mt = mt.annotate_cols(pheno=phenotypes[mt.s])

    # PCA is computed on a 1% sample of rows drawn with a fixed seed.
    pca_input = mt.sample_rows(0.01, seed=11223344)
    _eigenvalues, pca_scores, _loadings = hl.hwe_normalized_pca(pca_input.GT)
    mt = mt.annotate_cols(scores=pca_scores[mt.s].scores)

    # Intercept plus the first three principal-component scores.
    covariates = [1.0, mt.scores[0], mt.scores[1], mt.scores[2]]
    results = hl.linear_regression_rows(y=mt.pheno.CaffeineConsumption,
                                        x=mt.GT.n_alt_alleles(),
                                        covariates=covariates)

    results = results.select(
        SNP=hl.variant_str(results.locus, results.alleles),
        P=results.p_value)
    results = results.key_by(results.SNP)
    results = results.select(results.P)
    results.export(f'{output_file}.assoc', header=True)

    hl.export_plink(mt, output_file, fam_id=mt.s, ind_id=mt.s)

コード例 #3

0

ファイルを表示

def gwas(mt,
         x,
         y,
         cov_list=[],
         with_intercept=True,
         pass_through=[],
         path_to_save=None,
         normalize_x=False,
         is_std_cov_list=False):
    """Run a linear-regression GWAS and return a sumstats table.

    ``x`` is annotated as the entry field ``__x`` and ``y`` as the column
    field ``__y``. String entries in ``cov_list`` are looked up as fields of
    ``mt``; other entries are used as given. The regression Z (t-statistic)
    and N are joined onto the sumstats template by SNP, exported to
    ``path_to_save`` when one is given, and returned.

    The list defaults are never mutated here; they are only read and rebound.
    """
    mt = mt._annotate_all(col_exprs={'__y': y}, entry_exprs={'__x': x})

    if normalize_x:
        # Standardise each row of x: subtract the row mean, divide by stdev.
        mt = mt.annotate_rows(__gt_stats=hl.agg.stats(mt.__x))
        centered = mt.__x - mt.__gt_stats.mean
        mt = mt.annotate_entries(__x=centered / mt.__gt_stats.stdev)
        mt = mt.drop('__gt_stats')

    if is_std_cov_list:
        # The standard set replaces whatever cov_list was passed in.
        demographic = [
            'isFemale', 'age', 'age_squared', 'age_isFemale',
            'age_squared_isFemale'
        ]
        principal_components = ['PC{:}'.format(i) for i in range(1, 21)]
        cov_list = demographic + principal_components

    # Resolve covariates given by field name into expressions on mt.
    if any(type(cov) is str for cov in cov_list):
        cov_list = [mt[cov] if type(cov) is str else cov for cov in cov_list]

    intercept = [1] if with_intercept else []
    cov_list = intercept + cov_list

    # 'rsid' is always passed through; duplicates are collapsed.
    pass_through = list(set(['rsid'] + pass_through))
    print(f'variables to pass through: {pass_through}')

    gwas_ht = hl.linear_regression_rows(y=mt.__y,
                                        x=mt.__x,
                                        covariates=cov_list,
                                        pass_through=pass_through)
    gwas_ht = gwas_ht.annotate_globals(with_intercept=with_intercept)
    gwas_ht = gwas_ht.rename({'rsid': 'SNP'})
    gwas_ht = gwas_ht.key_by('SNP')
    gwas_ht = gwas_ht.select(Z=gwas_ht.t_stat, N=gwas_ht.n)

    # sumstats template as a hail table
    template = hl.read_table('gs://nbaya/rg_sex/hm3.sumstats_template.ht')
    template = template.key_by('SNP')

    ss = template.annotate(Z=gwas_ht[template.SNP].Z,
                           N=gwas_ht[template.SNP].N)

    if path_to_save is not None:
        ss.export(path_to_save)

    return ss

コード例 #4

0

ファイルを表示

def run_grouped_regressions(mt, ss_output, pheno, pheno_name):
    """Regress every phenotype in ``pheno`` on dosage and write the table.

    Each phenotype is passed to the regression as its own one-element group.
    Covariates are an intercept plus every field of ``mt['covariates']``.
    The result is written to ``ss_output + pheno_name + '.ht'``; the
    overwrite flag is read from the module-level ``args``, which is defined
    outside this function.
    """
    response_groups = [[mt['phenotypes'][name]] for name in pheno]
    covariate_fields = [
        mt['covariates'][field] for field in mt['covariates'].keys()
    ]

    ht = hl.linear_regression_rows(y=response_groups,
                                   x=mt.dosage,
                                   covariates=[1, *covariate_fields],
                                   pass_through=['varid', 'rsid'])

    # NOTE(review): the original author flagged this annotation with
    # "check this".
    ht = ht.annotate_globals(phenotypes=pheno)

    output_path = ss_output + pheno_name + '.ht'
    ht.write(output_path, overwrite=args.overwrite)

コード例 #5

0

ファイルを表示

def gwas(mt,
         x,
         y,
         cov_list=[],
         with_intercept=True,
         pass_through=[],
         path_to_save=None,
         normalize_x=True,
         is_std_cov_list=False):
    """Run a linear-regression GWAS and return the per-SNP Z/N table.

    ``x`` is annotated as the entry field ``x`` and ``y`` as the column field
    ``y``. String entries in ``cov_list`` are looked up as fields of ``mt``.
    A sumstats table is built from the template and exported when
    ``path_to_save`` is given, but the value returned is the regression table
    keyed by SNP, not the sumstats table.

    The list defaults are never mutated here; they are only read and rebound.
    """
    mt = mt._annotate_all(col_exprs={'y': y}, entry_exprs={'x': x})

    if normalize_x:
        # normalize_genotypes is defined outside this block; presumably it
        # adds the `__norm_gt` entry field read on the next line.
        mt = normalize_genotypes(mt, mt.x)
        mt = mt.annotate_entries(x=mt.__norm_gt)
        mt = mt.drop('__norm_gt')

    if is_std_cov_list:
        # The standard set replaces whatever cov_list was passed in.
        demographic = [
            'isFemale', 'age', 'age_squared', 'age_isFemale',
            'age_squared_isFemale'
        ]
        principal_components = ['PC{:}'.format(i) for i in range(1, 21)]
        cov_list = demographic + principal_components

    # Resolve covariates given by field name into expressions on mt.
    if any(type(cov) is str for cov in cov_list):
        cov_list = [mt[cov] if type(cov) is str else cov for cov in cov_list]

    intercept = [1] if with_intercept else []
    cov_list = intercept + cov_list

    print(pass_through)

    passed_fields = ['rsid'] + pass_through
    gwas_ht = hl.linear_regression_rows(y=mt.y,
                                        x=mt.x,
                                        covariates=cov_list,
                                        pass_through=passed_fields)
    gwas_ht = gwas_ht.annotate_globals(with_intercept=with_intercept)
    gwas_ht = gwas_ht.rename({'rsid': 'SNP'})
    gwas_ht = gwas_ht.key_by('SNP')
    gwas_ht = gwas_ht.select(Z=gwas_ht.t_stat, N=gwas_ht.n)

    template = hl.import_table('gs://nbaya/rg_sex/50_snps_alleles_N.tsv.gz',
                               types={'N': hl.tint64})
    template = template.key_by('SNP')

    sumstats = template.annotate(Z=gwas_ht[template.SNP].Z,
                                 N=gwas_ht[template.SNP].N)

    if path_to_save is not None:
        sumstats.export(path_to_save)

    return gwas_ht

コード例 #6

0

ファイルを表示

def linear_regression_rows(mt_path):
    """Force a multi-phenotype linear regression over random column data.

    Reads the matrix table at ``mt_path``, adds 100 uniform-random phenotype
    columns and 20 uniform-random covariate columns, regresses every
    phenotype on ``mt.x`` with those covariates (no intercept is listed) and
    forces evaluation of the result. Nothing is returned.
    """
    mt = hl.read_matrix_table(mt_path)
    n_phenotypes, n_covariates = 100, 20

    phenotype_fields = {
        f"pheno_{i}": hl.rand_unif(0, 1) for i in range(n_phenotypes)
    }
    covariate_fields = {
        f"cov_{i}": hl.rand_unif(0, 1) for i in range(n_covariates)
    }
    mt = mt.annotate_cols(**phenotype_fields)
    mt = mt.annotate_cols(**covariate_fields)

    responses = [mt[name] for name in phenotype_fields]
    covariates = [mt[name] for name in covariate_fields]
    result = hl.linear_regression_rows(y=responses,
                                       x=mt.x,
                                       covariates=covariates)
    result._force_count()

コード例 #7

0

ファイルを表示

def run_gwas(mt,
             phen: str,
             sim_name: str,
             subset_idx: int,
             param_suffix: str,
             wd: str,
             is_logreg=True):
    """Run, or reload from disk, a single-phenotype GWAS on one sample subset.

    Args:
        mt: Matrix table with a ``dosage`` or ``GT`` entry field, a
            ``subset_idx`` column field and a column field named ``phen``.
        phen: Name of the phenotype column field.
        sim_name: Simulation name, used in the output file name.
        subset_idx: Only columns whose ``subset_idx`` equals this are kept.
        param_suffix: Extra suffix used in the output file name.
        wd: Directory prefix under which the results file lives.
        is_logreg: Use Wald logistic regression when true, linear otherwise.

    Returns:
        When the results file does not exist yet, the full regression table
        (after exporting its EAF/beta/standard_error/p_value columns).
        Otherwise, the table imported from the existing file, keyed by
        ``locus`` and ``alleles``.

    Raises:
        AssertionError: If ``mt`` has neither a ``GT`` nor a ``dosage``
            entry field.
    """
    # An empty intersection is falsy, so this fails when neither field exists.
    # (Comparing the set against `{}` could never fail: `{}` is an empty
    # dict, and a set never compares equal to a dict.)
    assert {'GT', 'dosage'}.intersection(mt.entry), (
        "mt does not have an entry field named 'dosage' or 'GT' "
        "corresponding to genotype data")

    mt = mt.filter_cols(mt.subset_idx == subset_idx)
    mt = mt.filter_cols(hl.is_defined(mt[phen]))
    print(
        f'\n\ngwas sample count (subset {subset_idx}): {mt.count_cols()}\n\n')

    # EAF is half the mean of the per-sample dosage / alt-allele count;
    # `dosage` takes precedence when both fields exist.
    if 'dosage' in mt.entry:
        mt = mt.annotate_rows(EAF=hl.agg.mean(mt.dosage) / 2)
    elif 'GT' in mt.entry:
        mt = mt.annotate_rows(EAF=hl.agg.mean(mt.GT.n_alt_alleles()) / 2)

    gwas_path = f'{wd}/gwas.{"logreg" if is_logreg else "linreg"}.{sim_name}.subset_{subset_idx}.{param_suffix}.tsv.gz'

    if not hl.hadoop_is_file(gwas_path):
        gt_field = mt.dosage if 'dosage' in mt.entry else mt.GT.n_alt_alleles()

        if is_logreg:
            gwas_ht = hl.logistic_regression_rows(test='wald',
                                                  y=mt[phen],
                                                  x=gt_field,
                                                  covariates=[1],
                                                  pass_through=['EAF'])
        else:
            gwas_ht = hl.linear_regression_rows(y=mt[phen],
                                                x=gt_field,
                                                covariates=[1],
                                                pass_through=['EAF'])
        gwas_ht.select('EAF', 'beta', 'standard_error',
                       'p_value').export(gwas_path)

    else:
        print(f'GWAS already run! ({gwas_path})')
        gwas_ht = hl.import_table(gwas_path, impute=True, force=True)
        # Rebuild typed keys from the text columns. The patterns strip a
        # leading `["` and a trailing `"]` and split on `","` (presumably the
        # exported form of the alleles array). Raw strings keep the
        # backslashes without relying on invalid escape sequences.
        gwas_ht = gwas_ht.annotate(locus=hl.parse_locus(gwas_ht.locus),
                                   alleles=gwas_ht.alleles.replace(
                                       r'\["', '').replace(r'"\]',
                                                           '').split('","'))
        gwas_ht = gwas_ht.key_by('locus', 'alleles')

    return gwas_ht

コード例 #8

0

ファイルを表示

ファイル: run_gwas.py プロジェクト: saponas/hail

def run_gwas(vcf_file, phenotypes_file, output_file):
    """Run a QC-filtered, PCA-adjusted linear-regression GWAS and export it.

    Applies sample, genotype (allele balance) and variant (allele frequency)
    filters, computes principal components, regresses caffeine consumption
    on alternate-allele count, then writes ``<output_file>.assoc`` and a
    PLINK export of the filtered matrix table.
    """
    phenotypes = hl.import_table(phenotypes_file, impute=True)
    phenotypes = phenotypes.key_by('Sample')

    # Convert the VCF to a matrix table on disk once and read it back.
    hl.import_vcf(vcf_file).write('tmp.mt')
    mt = hl.read_matrix_table('tmp.mt')
    mt = mt.annotate_cols(pheno=phenotypes[mt.s])

    # Sample QC: keep samples with mean depth >= 4 and call rate >= 0.97.
    mt = hl.sample_qc(mt)
    depth_ok = mt.sample_qc.dp_stats.mean >= 4
    call_rate_ok = mt.sample_qc.call_rate >= 0.97
    mt = mt.filter_cols(depth_ok & call_rate_ok)

    # Genotype QC: the allele balance must be consistent with the call.
    allele_balance = mt.AD[1] / hl.sum(mt.AD)
    hom_ref_ok = mt.GT.is_hom_ref() & (allele_balance <= 0.1)
    het_ok = (mt.GT.is_het() & (allele_balance >= 0.25)
              & (allele_balance <= 0.75))
    hom_var_ok = mt.GT.is_hom_var() & (allele_balance >= 0.9)
    mt = mt.filter_entries(hom_ref_ok | het_ok | hom_var_ok)

    # Variant QC: keep rows whose AF[1] exceeds 1%.
    mt = hl.variant_qc(mt)
    mt = mt.filter_rows(mt.variant_qc.AF[1] > 0.01)

    _eigenvalues, pca_scores, _loadings = hl.hwe_normalized_pca(mt.GT)
    mt = mt.annotate_cols(scores=pca_scores[mt.s].scores)

    # Intercept, sex and the first three principal-component scores.
    covariates = [
        1.0, mt.pheno.isFemale, mt.scores[0], mt.scores[1], mt.scores[2]
    ]
    results = hl.linear_regression_rows(y=mt.pheno.CaffeineConsumption,
                                        x=mt.GT.n_alt_alleles(),
                                        covariates=covariates)

    results = results.select(
        SNP=hl.variant_str(results.locus, results.alleles),
        P=results.p_value)
    results = results.key_by(results.SNP)
    results = results.select(results.P)
    results.export(f'{output_file}.assoc', header=True)

    hl.export_plink(mt, output_file, fam_id=mt.s, ind_id=mt.s)

コード例 #9

0

ファイルを表示

ファイル: sex_strat_iter_split_gwas.py プロジェクト: nikbaya/split

        if n_chunks == 2:  #traditional split of population into even halves (no meta-analysis)
            mt_A = mt.filter_cols(mt.group_id == 0)
            mt_B = mt.filter_cols(mt.group_id == 1)

            cov_list_A = [
                mt_A['isFemale'], mt_A['age'], mt_A['age_squared'],
                mt_A['age_isFemale'], mt_A['age_squared_isFemale']
            ] + [mt_A['PC{:}'.format(i)] for i in range(1, 21)]

            cov_list_B = [
                mt_B['isFemale'], mt_B['age'], mt_B['age_squared'],
                mt_B['age_isFemale'], mt_B['age_squared_isFemale']
            ] + [mt_B['PC{:}'.format(i)] for i in range(1, 21)]

            ht_A = hl.linear_regression_rows(y=mt_A.y,
                                             x=mt_A.x,
                                             covariates=[1] + cov_list_A,
                                             pass_through=['rsid'])

            ht_B = hl.linear_regression_rows(y=mt_B.y,
                                             x=mt_B.x,
                                             covariates=[1] + cov_list_B,
                                             pass_through=['rsid'])

            ht_A = ht_A.rename({'rsid': 'SNP'}).key_by('SNP')
            ht_B = ht_B.rename({'rsid': 'SNP'}).key_by('SNP')

            ht_A = ht_A.select(Z=ht_A.beta / ht_A.standard_error)
            ht_B = ht_B.select(Z=ht_B.beta / ht_B.standard_error)

            sumstats_template = hl.import_table(
                'gs://nbaya/rg_sex/50_snps_alleles_N.tsv.gz',

コード例 #10

0

ファイルを表示

    test="firth",  #controls false positives
    y=hl.float(mt.AffectionBool),
    x=mt.GT.n_alt_alleles(),
    covariates=[
        1,
        hl.float(mt.PackYear), mt.scores[0], mt.scores[1], mt.scores[2],
        mt.scores[3], mt.scores[4], mt.scores[5], mt.scores[6], mt.scores[7],
        mt.scores[8], mt.scores[9]
    ])

######## 6.2 Linear regression ~ FEV1% predicted (lung function variable)
# NOTE(review): the heading used to say "Firth", but the call below is a plain
# linear regression with no `test` argument.
# Covariates: intercept, pack-years and the first ten PCA scores.
gwas = hl.linear_regression_rows(y=hl.float(mt.FEV1),
                                 x=mt.GT.n_alt_alleles(),
                                 covariates=[
                                     1,
                                     hl.float(mt.PackYear), mt.scores[0],
                                     mt.scores[1], mt.scores[2], mt.scores[3],
                                     mt.scores[4], mt.scores[5], mt.scores[6],
                                     mt.scores[7], mt.scores[8], mt.scores[9]
                                 ])

######## 6.3 Q-Q plot
# `show` is defined outside this excerpt.
qqplot = hl.plot.qq(gwas.p_value)
show(qqplot)

######## 6.4 Manhattan-like plots
# GWAS significance level = 5.0 * 10e-8; suggestive: 5.0 * 10e-8 < P < 5.0 * 10e-6
# (thresholds as written by the original author -- TODO confirm the exponents).

# Calculate Bonferroni-based cut-off lines on the -log10 scale:
# 0.05 divided by the number of rows, and 1 divided by the number of rows.
Bonferroni_line = -np.log10(0.05 / mt.count_rows())
Suggestive_line = -np.log10(1 / mt.count_rows())

コード例 #11

0

ファイルを表示

# Benchmark script: import a VCF, apply QC filters, run an intercept-only
# linear regression and build a Manhattan plot, timing everything after I/O.
args = parser.parse_args()


# Convert the VCF to a matrix table in the working directory and read it back.
hl.import_vcf(args.vcf_path).write(args.workdir + 'hail.mt', overwrite=True)
mt = hl.read_matrix_table(args.workdir + 'hail.mt')
table = (hl.import_table(args.annotation_path, impute=True)
     .key_by('Sample'))

# Start benchmarking after I/O
t0 = time.time()
mt = mt.annotate_cols(pheno = table[mt.s])
# Sample QC: keep samples with mean depth >= 4 and call rate >= 0.97.
mt = hl.sample_qc(mt)
mt = mt.filter_cols((mt.sample_qc.dp_stats.mean >= 4) & (mt.sample_qc.call_rate >= 0.97))
# Allele balance: AD[1] as a fraction of the summed AD values.
ab = mt.AD[1] / hl.sum(mt.AD)

# A genotype is kept only when its allele balance is consistent with the call.
filter_condition_ab = ((mt.GT.is_hom_ref() & (ab <= 0.1)) |
                    (mt.GT.is_het() & (ab >= 0.25) & (ab <= 0.75)) |
                    (mt.GT.is_hom_var() & (ab >= 0.9)))

# Fraction of entries failing the filter; not used again in this excerpt.
fraction_filtered = mt.aggregate_entries(hl.agg.fraction(~filter_condition_ab))
mt = mt.filter_entries(filter_condition_ab)
# Variant QC: keep rows with AF[1] > 1% and HWE p-value > 1e-6.
mt = hl.variant_qc(mt)
mt = mt.filter_rows(mt.variant_qc.AF[1] > 0.01)
mt = mt.filter_rows(mt.variant_qc.p_value_hwe > 1e-6)
# Intercept-only regression of caffeine consumption on alt-allele count.
gwas = hl.linear_regression_rows(y=mt.pheno.CaffeineConsumption,
                             x=mt.GT.n_alt_alleles(),
                             covariates=[1.0])
# The plot object is built but not displayed or saved in this excerpt.
p = hl.plot.manhattan(gwas.p_value)

print('Time Elapsed: {}'.format(time.time()- t0))

コード例 #12

0

ファイルを表示

ファイル: sex_differential_simulations_sample_size.py プロジェクト: hegu2692/genobias

def gwas(y, x, cov):
    """Return the linear regression of ``y`` on ``x`` with covariates ``cov``.

    The ``rsid`` row field is passed through to the result table.
    """
    return hl.linear_regression_rows(y=y,
                                     x=x,
                                     covariates=cov,
                                     pass_through=['rsid'])

コード例 #13

0

ファイルを表示

# COMMAND ----------

# Report the dimensions of the filtered dataset (columns are labelled as
# samples and rows as variants in the message).
print('Samples: %d  Variants: %d' %
      (common_mt.count_cols(), common_mt.count_rows()))

# COMMAND ----------

# MAGIC %md These filters removed about 15% of sites (we started with a bit over 10,000). This is _NOT_ representative of most sequencing datasets! We have already downsampled the full thousand genomes dataset to include more common variants than we'd expect by chance.
# MAGIC
# MAGIC In Hail, the association tests accept column fields for the sample phenotype and covariates. Since we've already got our phenotype of interest (caffeine consumption) in the dataset, we are good to go:

# COMMAND ----------

# Intercept-only linear regression of caffeine consumption on the
# alternate-allele count, followed by a description of the result's row schema.
gwas = hl.linear_regression_rows(y=common_mt.CaffeineConsumption,
                                 x=common_mt.GT.n_alt_alleles(),
                                 covariates=[1.0])
gwas.row.describe()

# COMMAND ----------

# MAGIC %md Looking at the bottom of the above printout, you can see the linear regression adds new row fields for the beta, standard error, t-statistic, and p-value.
# MAGIC
# MAGIC Hail makes it easy to make a [Q-Q (quantile-quantile) plot](https://en.wikipedia.org/wiki/Q–Q_plot).

# COMMAND ----------

# Build the Q-Q plot from the regression p-values; `displayBokeh` is defined
# outside this excerpt.
p = hl.plot.qq(gwas.p_value)
displayBokeh(p)

# COMMAND ----------

コード例 #14

0

ファイルを表示

def gwas(mt,
         x,
         y,
         cov_list=[],
         with_intercept=True,
         pass_through=[],
         path_to_save=None,
         normalize_x=False,
         is_std_cov_list=False):
    """Run a linear-regression GWAS and return an extended sumstats table.

    ``x`` is annotated as the entry field ``__x`` and ``y`` as the column
    field ``__y``. The alternate-allele frequency is taken as half the row
    mean of ``mt.dosage``. Regression results and frequencies are joined onto
    the sumstats template by rsid; the template's ``N`` column is dropped and
    ``SNP``/``A1``/``A2`` are renamed to ``snpid``/``a1``/``a2``. The table
    is exported to ``path_to_save`` when one is given, and returned.

    The list defaults are never mutated here; they are only read and rebound.
    """
    mt = mt._annotate_all(col_exprs={'__y': y}, entry_exprs={'__x': x})

    print('\n... Calculating allele frequency ...')
    # frequency of alternate allele
    alt_freq = hl.agg.mean(mt.dosage) / 2
    mt_freq_rows = mt.annotate_rows(freq=alt_freq).rows()
    mt_freq_rows = mt_freq_rows.key_by('rsid')

    if normalize_x:
        # Standardise each row of x: subtract the row mean, divide by stdev.
        mt = mt.annotate_rows(__gt_stats=hl.agg.stats(mt.__x))
        centered = mt.__x - mt.__gt_stats.mean
        mt = mt.annotate_entries(__x=centered / mt.__gt_stats.stdev)
        mt = mt.drop('__gt_stats')

    if is_std_cov_list:
        # The standard set replaces whatever cov_list was passed in.
        demographic = [
            'isFemale', 'age', 'age_squared', 'age_isFemale',
            'age_squared_isFemale'
        ]
        principal_components = ['PC{:}'.format(i) for i in range(1, 21)]
        cov_list = demographic + principal_components

    # Resolve covariates given by field name into expressions on mt.
    if any(type(cov) is str for cov in cov_list):
        cov_list = [mt[cov] if type(cov) is str else cov for cov in cov_list]

    intercept = [1] if with_intercept else []
    cov_list = intercept + cov_list

    print(f'pass through: {pass_through}')

    passed_fields = ['rsid'] + pass_through
    gwas_ht = hl.linear_regression_rows(y=mt.__y,
                                        x=mt.__x,
                                        covariates=cov_list,
                                        pass_through=passed_fields)
    gwas_ht = gwas_ht.annotate_globals(with_intercept=with_intercept)
    gwas_ht = gwas_ht.key_by('rsid')

    # sumstats template as a hail table
    template = hl.read_table('gs://nbaya/rg_sex/hm3.sumstats_template.ht')
    template = template.key_by('SNP')

    ss = template.annotate(chr=gwas_ht[template.SNP].locus.contig,
                           bpos=gwas_ht[template.SNP].locus.position,
                           freq=mt_freq_rows[template.SNP].freq,
                           beta=gwas_ht[template.SNP].beta,
                           z=gwas_ht[template.SNP].t_stat,
                           pval=gwas_ht[template.SNP].p_value,
                           n=gwas_ht[template.SNP].n)
    ss = ss.drop('N')
    ss = ss.rename({'SNP': 'snpid', 'A1': 'a1', 'A2': 'a2'})

    print(ss.describe())

    if path_to_save is not None:
        ss.export(path_to_save)

    return ss

コード例 #15

0

ファイルを表示

 # Regress every normalised `fbc` phenotype listed below on the
 # alternate-allele count in a single call, with an intercept as the only
 # covariate and the row field `rsid` passed through to the result.
 gwas = hl.linear_regression_rows(
     y=[
         mt.sample_qc_and_phenotype.fbc.neut_p_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.eo_p_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.mono_p_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.lymph_p_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.baso_p_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.ret_p_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.hlr_p_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.hct_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.pct_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.hgb_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.rbc_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.wbc_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.mpv_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.plt_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.rdw_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.pdw_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.mcv_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.mch_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.mchc_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.ret_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.hlr_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.neut_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.mono_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.baso_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.eo_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.lymph_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.irf_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.myeloid_wbc_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.gran_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.eo_baso_sum_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.neut_eo_sum_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.baso_neut_sum_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.gran_p_myeloid_wbc_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.eo_p_gran_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.neut_p_gran_gwas_normalised,
         mt.sample_qc_and_phenotype.fbc.baso_p_gran_gwas_normalised
         # NOTE(review): wbc_gwas_normalised and mpv_gwas_normalised were once
         # listed a second time here (commented out); both already appear
         # earlier in this list.
     ],
     x=mt.GT.n_alt_alleles(),
     covariates=[1.0],
     pass_through=[mt.rsid])

コード例 #16

0

ファイルを表示

ファイル: simulations_heckman.py プロジェクト: hegu2692/genobias

u = 8

rgs = [-0.3, -0.1, 0, 0.1, 0.3]

# Import matrix and annotate cols with phenotypes
mt = hl.read_matrix_table(input_matrix)

mt = mt.annotate_cols(U=mt.y[u])
mt = mt.annotate_cols(X=mt.y[x])

for i in range(5):
    mt = mt.annotate_cols(Y=mt.y[i + 2])

    # GWAS of X, Y in all
    result_ht = hl.linear_regression_rows(y=[mt.X, mt.Y],
                                          x=mt.GT.n_alt_alleles(),
                                          covariates=[1],
                                          pass_through=['rsid'])

    result_ht = result_ht.annotate(A1=result_ht.alleles[0],
                                   A2=result_ht.alleles[1]).key_by()

    file_names = [
        output_bucket + 'gwas/gwas_X_' + str(rgs[i]) + '.tsv',
        output_bucket + 'gwas/gwas_Y_' + str(rgs[i]) + '.tsv'
    ]

    for j, file_name in enumerate(file_names):
        result_ht.select(result_ht.locus,
                         result_ht.A1,
                         result_ht.A2,
                         result_ht.rsid,

コード例 #17

0

ファイルを表示

# Import dosages for one contig from BGEN, restricted to `ht_variants`.
# `contig_expr`, `contig`, `ht_variants`, `ht_phenotypes`, `ht_covariates`,
# `dilution`, `sex` and `pipeline` are defined outside this excerpt.
mt = hl.import_bgen(
    path=
    f'gs://fc-7d5088b4-7673-45b5-95c2-17ae00a04183/imputed/ukb_imp_{contig_expr}_v3.bgen',
    sample_file=f'gs://ukb31063/ukb31063.{contig}.sample',
    entry_fields=['dosage'],
    variants=ht_variants)

# Attach the phenotype and covariate rows for each sample, joined by sample ID.
mt = mt.annotate_cols(phenotypes=ht_phenotypes[mt.s],
                      covariates=ht_covariates[mt.s])

phenotypes = list(mt['phenotypes'].keys())

# Each phenotype is passed as its own one-element group; covariates are an
# intercept plus every field of the `covariates` struct.
ht = hl.linear_regression_rows(
    y=[[mt['phenotypes'][y]] for y in phenotypes],
    x=mt.dosage,
    covariates=[
        1, *[mt['covariates'][x] for x in list(mt['covariates'].keys())]
    ],
    pass_through=['varid', 'rsid'])

# Record the phenotype names as a global on the result table.
ht = ht.annotate_globals(phenotypes=phenotypes)

# The two output paths differ only by the `.dilution_factor` suffix.
if dilution:
    ht.write(
        f'gs://ukb31063-mega-gwas/biomarkers/results/ukb31063.biomarker_gwas_results.{sex}.{contig}.pipeline_{pipeline}.dilution_factor.ht',
        overwrite=True)
else:
    ht.write(
        f'gs://ukb31063-mega-gwas/biomarkers/results/ukb31063.biomarker_gwas_results.{sex}.{contig}.pipeline_{pipeline}.ht',
        overwrite=True)

コード例 #18

0

ファイルを表示

ファイル: spike_slab.py プロジェクト: armartin/mama_benchmarks

def main(args):
    """Run the spike-and-slab simulation stages selected on ``args``.

    Three independent stages, each enabled by a flag:

    * ``args.compute_true_phenotypes`` -- sum ``beta * dosage`` over the
      chr22 variants present in the spike-and-slab table, once per beta
      column, and write the per-sample results.
    * ``args.run_gwas`` -- ten times, draw 10,000 samples at random, run a
      linear regression of all four phenotypes on dosage with an intercept
      and PC1..PC20 as covariates, and write the sampled individuals and
      the summary statistics.
    * ``args.write_gwas`` -- flatten each of the ten summary-statistics
      tables into one column per (statistic, beta) pair and export as TSV.

    All input and output locations are hard-coded ``gs://`` paths.
    """
    betas = ['beta_01', 'beta_1', 'beta_10', 'beta_100']
    spike_slab = hl.import_table(
        'gs://armartin/mama/spike_slab/BBJ_UKB_hm3.chr22.cm.beta.txt',
        impute=True)
    # Key by the locus/alleles parsed from the variant string column `v`.
    spike_slab = spike_slab.key_by(**hl.parse_variant(spike_slab.v))
    if args.compute_true_phenotypes:
        # get the white british subset
        eur = hl.import_table(
            'gs://phenotype_31063/ukb31063.gwas_covariates.both_sexes.tsv'
        ).key_by('s')

        # read in imputed data, subset to chr22
        mt = hl.read_matrix_table(
            'gs://phenotype_31063/hail/imputed/ukb31063.dosage.autosomes.mt')
        mt = hl.filter_intervals(mt, [hl.parse_locus_interval('22')])

        # annotate and filter imputed data to all sites with causal effects
        mt = mt.annotate_rows(ss=spike_slab[mt.row_key])
        mt = mt.filter_rows(hl.is_defined(mt.ss))

        # compute true PRS (i.e. phenotypes)
        annot_expr = {i: hl.agg.sum(mt.ss[i] * mt.dosage) for i in betas}

        # write out phenos for white British unrelated subset
        mt = mt.annotate_cols(**annot_expr)
        mt = mt.filter_cols(hl.is_defined(eur[mt.s]))
        mt.cols().write(
            'gs://armartin/mama/spike_slab/BBJ_UKB_hm3.chr22.cm.beta.true_PRS.ht',
            stage_locally=True,
            overwrite=True)

    if args.run_gwas:
        # read back in PRS (now true phenotypes)
        phenos = hl.read_table(
            'gs://armartin/mama/spike_slab/BBJ_UKB_hm3.chr22.cm.beta.true_PRS.ht'
        ).key_by('s')
        phenos.show()
        covariates = hl.import_table(
            'gs://phenotype_31063/ukb31063.gwas_covariates.both_sexes.tsv',
            impute=True,
            types={
                's': hl.tstr
            }).key_by('s')
        full_mt = hl.read_matrix_table(
            'gs://phenotype_31063/hail/imputed/ukb31063.dosage.autosomes.mt')
        full_mt = full_mt.annotate_cols(**covariates[full_mt.s])
        full_mt = hl.filter_intervals(full_mt, [hl.parse_locus_interval('22')])

        # annotate and filter imputed data to all sites with causal effects
        full_mt = full_mt.annotate_rows(ss=spike_slab[full_mt.row_key])
        full_mt = full_mt.filter_rows(hl.is_defined(full_mt.ss))

        # subset to white British subset, get 10 sets of 10k and run a gwas for each of these w/ PCs as covs
        for i in range(10):
            # Random subset: order by a uniform draw and keep the first 10,000.
            subset_pheno = phenos.annotate(r=hl.rand_unif(0, 1))
            subset_pheno = subset_pheno.order_by(
                subset_pheno.r).add_index('global_idx').key_by('s')
            subset_pheno = subset_pheno.filter(subset_pheno.global_idx < 10000)
            mt = full_mt.annotate_cols(**subset_pheno[full_mt.s])
            mt = mt.annotate_rows(maf=hl.agg.mean(mt.dosage) / 2)
            # The comprehensions below reuse the name `i`; in Python 3 they
            # have their own scope, so the loop index used in the output
            # paths further down is unaffected.
            result_ht = hl.linear_regression_rows(
                y=[mt[i] for i in betas],
                x=mt.dosage,
                covariates=[1] + [mt['PC' + str(i)] for i in range(1, 21)],
                pass_through=['rsid', 'maf'])

            subset_pheno.export(
                'gs://armartin/mama/spike_slab/UKB_hm3.chr22.cm.beta.true_PRS.gwas_inds_'
                + str(i) + '.tsv.gz')
            result_ht.write(
                'gs://armartin/mama/spike_slab/UKB_hm3.chr22.cm.beta.true_PRS.gwas_sumstat_'
                + str(i) + '.ht',
                overwrite=True)

    if args.write_gwas:
        for i in range(10):
            result_ht = hl.read_table(
                'gs://armartin/mama/spike_slab/UKB_hm3.chr22.cm.beta.true_PRS.gwas_sumstat_'
                + str(i) + '.ht')
            result_ht = result_ht.key_by()
            # One output column per (statistic, beta) pair, e.g.
            # `beta_beta_01`, `p_value_beta_100`. The inner `i` indexes the
            # per-phenotype result arrays and is scoped to the comprehension.
            get_expr = {
                field + '_' + x: result_ht[field][i]
                for i, x in enumerate(betas)
                for field in ['beta', 'standard_error', 'p_value']
            }
            result_ht.select(chr=result_ht.locus.contig, pos=result_ht.locus.position, rsid=result_ht.rsid, ref=result_ht.alleles[0],
                             alt=result_ht.alleles[1], maf=result_ht.maf, n=result_ht.n, **get_expr)\
                .export('gs://armartin/mama/spike_slab/UKB_hm3.chr22.cm.beta.true_PRS.gwas_sumstat_' + str(i) + '.tsv.gz')

コード例 #19

0

ファイルを表示

ファイル: review_simulation_C.py プロジェクト: hegu2692/genobias

    # Filter MatrixTable and get sample
    samples_to_keep = set(df.loc[(df['sel'] == 1), 's'])
    set_to_keep = hl.literal(samples_to_keep)
    mt_sampled = mt.filter_cols(set_to_keep.contains(mt['s']), keep=True)

    i = '_' + str(OR_x) + '_' + str(or_sex)

    # Export phenotypes
    mt_sampled.cols().select(
        's', 'sex',
        'y0').key_by().export(out_bucket + 'phenotypes/pheno' + i + '.tsv')

    # Unadjusted GWASs
    result_ht = hl.linear_regression_rows(y=[mt_sampled.sex, mt_sampled.y0],
                                          x=mt_sampled.GT.n_alt_alleles(),
                                          covariates=[1],
                                          pass_through=['rsid'])

    result_ht = result_ht.annotate(A1=result_ht.alleles[0],
                                   A2=result_ht.alleles[1]).key_by()

    file_names = [
        out_bucket + 'gwas/gwas_sex' + i + '.tsv',
        out_bucket + 'gwas/gwas_y0' + i + '.tsv'
    ]

    for j, file_name in enumerate(file_names):
        result_ht.select(result_ht.locus,
                         result_ht.A1,
                         result_ht.A2,
                         result_ht.rsid,

コード例 #20

0

ファイルを表示

ファイル: gwas_on_subset.py プロジェクト: liangyy/ptrs-ukb

# `tstart`, `mt`, `subset_ht_dic` and `args` are defined earlier in the script,
# outside this excerpt.
tend = time.time()
logging.info('--> Annotate with {} FINISHED! {} seconds elapsed'.format('covariates', tend - tstart))

# prepare phenotypes and covariates into list of lists and list
# One inner list per key of `subset_ht_dic`, holding every field of the
# matching struct on `mt`; the names list mirrors that layout as `<key>_x_<field>`.
logging.info('Start preparing `y` and `covariates` for `linear_regression_rows`')
pheno_list_of_lists = [ [ mt[i][j] for j in mt[i] ] for i in list(subset_ht_dic.keys()) ]
pheno_list_of_names = [ [ f'{i}_x_{j}' for j in mt[i] ] for i in list(subset_ht_dic.keys()) ]
covar_list = [ mt.covariates[i] for i in list(mt.covariates.keys()) ]
logging.info('Prepare `y` and `covariates` for `linear_regression_rows` FINISHED!')

# run GWAS
logging.info('Start running GWAS')
tstart = time.time()
gwas_out = hl.linear_regression_rows(
    y = pheno_list_of_lists,
    x = mt.dosage,
    covariates = [1] + covar_list,
    pass_through = ['varid', 'rsid']
)
gwas_out = gwas_out.annotate_globals(phenotypes = pheno_list_of_names)
tend = time.time()
logging.info('Running GWAS FINISHED! {} seconds elapsed'.format(tend - tstart))

# write GWAS results onto disk
logging.info('Start writing GWAS result to disk')
tstart = time.time()
## if target folder does not exist, create it
## (an empty dirname means the current directory, so nothing to create)
target_folder = os.path.dirname(args.output_filename)
# Compare by value: identity against a string literal (`is not ''`) depends on
# interpreter string interning and triggers a SyntaxWarning.
if target_folder != '' and not os.path.exists(target_folder):
    os.makedirs(target_folder)
## check if extension of output file is .ht, if not add it
filename, file_extension = os.path.splitext(args.output_filename)

コード例 #21

0

ファイルを表示

ファイル: annotations.py プロジェクト: atgu/GWASpy

    def filter(self, mt):
        """Run an intercept-only GWAS and count significant variants.

        Regresses ``mt.is_case`` on the alternate-allele count and returns
        the regression table together with the number of rows whose p-value
        is below 5e-8.
        """
        association = hl.linear_regression_rows(y=mt.is_case,
                                                x=mt.GT.n_alt_alleles(),
                                                covariates=[1.0])
        significant = association.filter(association.p_value < 5E-8)
        return association, significant.count()

コード例 #22

0

ファイルを表示

ファイル: ldscsim_v2.0-test.py プロジェクト: nikbaya/ldscsim

# `sim_mt` and `n_samples` are defined outside this excerpt.
# Summary statistics of the noiseless simulated phenotype; the result is not
# stored.
sim_mt.aggregate_cols(hl.agg.stats(sim_mt.y_no_noise))

# NOTE(review): `y` is not used again in this excerpt.
y = sim_mt.select_cols(sim_mt.y_no_noise).make_table()

cov = hl.import_table('/Users/nbaya/Documents/lab/ldscsim/ukb31063.gwas_covariates.both_sexes.tsv',impute=True, types={'s': hl.tstr}).key_by('s')

# Attach every covariate column to the samples, joined by sample ID.
mt0 = sim_mt.annotate_cols(**cov[sim_mt.s])

mt0 = mt0.rename({'__norm_gt__': 'x'})

mt = mt0

# Covariates: sex, age terms and PC1..PC20 (the intercept is added in the call).
cov_list = [ mt['isFemale'], mt['age'], mt['age_squared'], mt['age_isFemale'],
                            mt['age_squared_isFemale'] ]+ [mt['PC{:}'.format(i)] for i in range(1, 21)] 

ht = hl.linear_regression_rows(
                y=mt.y,
                x=mt.x,
                covariates=[1]+cov_list,
                pass_through = ['rsid'])

ht = ht.rename({'rsid':'SNP'}).key_by('SNP')

# Z-score computed as beta over its standard error.
ht = ht.select(Z = ht.beta/ht.standard_error)

# Template table keyed by SNP; its N column is overwritten with `n_samples`.
sumstats_template = hl.import_table('gs://nbaya/rg_sex/50_snps_alleles_N.tsv.gz',types={'N': hl.tint64})
sumstats_template = sumstats_template.key_by('SNP')
sumstats_template = sumstats_template.annotate(N = n_samples)
#            sumstats_template.show()

# Join the Z-scores onto the template by SNP.
sumstats = sumstats_template.annotate(Z = ht[sumstats_template.SNP]['Z'])