Example #1
def export_ma_format(batch_size=256):
    r'''
    Export columns for .ma format (A1, A2, freq, beta, se, N) for select phenotypes
    '''
    meta_mt0 = hl.read_matrix_table(get_meta_analysis_results_path())
    
    highprev = hl.import_table(f'{ldprune_dir}/joined_ukbb_lancet_age_high_prev.tsv', impute=True)
    highprev = highprev.annotate(pheno = highprev.code.replace('_irnt',''))
    pheno_list = highprev.pheno.collect()
    pheno_list = [p for p in pheno_list if p is not None]
    meta_mt0 = meta_mt0.filter_cols(hl.literal(pheno_list).contains(meta_mt0.pheno))

    meta_mt0 = meta_mt0.annotate_cols(pheno_id = (meta_mt0.trait_type+'-'+
                                      meta_mt0.phenocode+'-'+
                                      meta_mt0.pheno_sex+
                                      hl.if_else(hl.len(meta_mt0.coding)>0, '-'+meta_mt0.coding, '')+
                                      hl.if_else(hl.len(meta_mt0.modifier)>0, '-'+meta_mt0.modifier, '')
                                      ).replace(' ','_').replace('/','_'))
    
    meta_mt0 = meta_mt0.annotate_rows(SNP = meta_mt0.locus.contig+':'+hl.str(meta_mt0.locus.position)+':'+meta_mt0.alleles[0]+':'+meta_mt0.alleles[1],
                                      A1 = meta_mt0.alleles[1], # .ma format requires A1 = effect allele, which in this case is A2 for UKB GWAS
                                      A2 = meta_mt0.alleles[0])

    meta_field_rename_dict = {'BETA':'b',
                              'SE':'se',
                              'Pvalue':'p',
                              'AF_Allele2':'freq',
                              'N':'N'}

    for pop in ['AFR','EUR']: #['AFR','AMR','CSA','EAS','EUR','MID']:
        print(f'not_{pop}')

        req_pop_list = [p for p in POPS if p != pop]

        loo_pop = meta_mt0.annotate_cols(idx = meta_mt0.meta_analysis_data.pop.index(hl.literal(req_pop_list))) # get index of the leave-one-out meta-analysis for the current pop
        loo_pop = loo_pop.filter_cols(hl.is_defined(loo_pop.idx))
        
        annotate_dict = {meta_field_rename_dict[field]: loo_pop.meta_analysis[field][loo_pop.idx] for field in ['AF_Allele2','BETA','SE','Pvalue','N']}
        loo_pop = loo_pop.annotate_entries(**annotate_dict) # flatten the leave-one-out meta-analysis fields into entry fields (freq, b, se, p, N)
        loo_pop = loo_pop.key_cols_by('pheno_id') # single string col key, required by use_string_key_as_file_name=True below

        batch_idx = 1
        export_out = f'{ldprune_dir}/loo/not_{pop}/batch{batch_idx}'
        while hl.hadoop_is_dir(export_out): # take the first batch index whose output directory does not exist yet
            batch_idx += 1
            export_out = f'{ldprune_dir}/loo/not_{pop}/batch{batch_idx}'
        checkpoint_path = f'gs://ukbb-diverse-temp-30day/loo/not_{pop}/batch{batch_idx}.mt'
#        print(f'\nCheckpointing to: {checkpoint_path}\n')
        loo_pop = loo_pop.checkpoint(checkpoint_path,
                                     _read_if_exists=True,
                                     overwrite=True)
        loo_pop = loo_pop.filter_entries(hl.is_defined(loo_pop.b))
        print(f'\nExporting to: {export_out}\n')
        hl.experimental.export_entries_by_col(mt = loo_pop,
                                              path = export_out,
                                              bgzip = True,
                                              batch_size = batch_size,
                                              use_string_key_as_file_name = True,
                                              header_json_in_file = False)
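
The export functions in these examples all locate their output directory the same way: probe candidate paths with hl.hadoop_is_dir and take the first batch index that does not already exist. Below is a minimal standalone sketch of that pattern; the helper name next_free_batch_dir and the bucket path are illustrative placeholders, and only hl.hadoop_is_dir is taken from the code above.

import hail as hl

def next_free_batch_dir(base_path: str) -> str:
    '''Return the first batch{i} directory under base_path that does not exist yet.'''
    batch_idx = 1
    # hl.hadoop_is_dir returns True only if the path exists and is a directory
    while hl.hadoop_is_dir(f'{base_path}/batch{batch_idx}'):
        batch_idx += 1
    return f'{base_path}/batch{batch_idx}'

# hypothetical usage:
# export_out = next_free_batch_dir('gs://my-bucket/loo/not_EUR')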
Example #2
def export_loo(batch_size=256):
    r'''
    For exporting p-values of meta-analysis of leave-one-out population sets
    '''
    meta_mt0 = hl.read_matrix_table(get_meta_analysis_results_path())

    meta_mt0 = meta_mt0.filter_cols(hl.len(meta_mt0.pheno_data.pop) == 6)

    meta_mt0 = meta_mt0.annotate_cols(pheno_id=(
        meta_mt0.trait_type + '-' + meta_mt0.phenocode + '-' +
        meta_mt0.pheno_sex +
        hl.if_else(hl.len(meta_mt0.coding) > 0, '-' + meta_mt0.coding, '') +
        hl.if_else(hl.len(meta_mt0.modifier) > 0, '-' +
                   meta_mt0.modifier, '')).replace(' ', '_').replace('/', '_'))

    meta_mt0 = meta_mt0.annotate_rows(
        SNP=(meta_mt0.locus.contig + ':' + hl.str(meta_mt0.locus.position) +
             ':' + meta_mt0.alleles[0] + ':' + meta_mt0.alleles[1]))

    all_pops = sorted(['AFR', 'AMR', 'CSA', 'EAS', 'EUR', 'MID'])

    annotate_dict = {}
    # pop_idx corresponds to the alphabetical ordering of the pops (the entry with idx=0 is the 6-pop meta-analysis)
    for pop_idx, pop in enumerate(all_pops, 1):
        annotate_dict.update(
            {f'pval_not_{pop}': meta_mt0.meta_analysis.Pvalue[pop_idx]})
    meta_mt1 = meta_mt0.annotate_entries(**annotate_dict)

    meta_mt1 = meta_mt1.key_cols_by('pheno_id')
    meta_mt1 = meta_mt1.key_rows_by().drop('locus', 'alleles', 'gene',
                                           'annotation', 'meta_analysis')

    meta_mt1.describe()

    batch_idx = 1
    get_export_path = lambda batch_idx: f'{ldprune_dir}/loo/sumstats/batch{batch_idx}'
    while hl.hadoop_is_dir(get_export_path(batch_idx)):
        batch_idx += 1
    print(f'\nExporting to: {get_export_path(batch_idx)}\n')
    hl.experimental.export_entries_by_col(mt=meta_mt1,
                                          path=get_export_path(batch_idx),
                                          bgzip=True,
                                          batch_size=batch_size,
                                          use_string_key_as_file_name=True,
                                          header_json_in_file=False)
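
The pop_idx convention used above, where index 0 is the full six-population meta-analysis and indices 1-6 follow the alphabetical order of the left-out population, can be spelled out with a small mapping. This is only an illustration of the indexing assumed by export_loo, not part of the export code itself.

all_pops = sorted(['AFR', 'AMR', 'CSA', 'EAS', 'EUR', 'MID'])

# index 0 -> meta-analysis across all six populations,
# index i -> meta-analysis leaving out all_pops[i-1]
loo_index = {pop: idx for idx, pop in enumerate(all_pops, 1)}
assert loo_index['AFR'] == 1 and loo_index['MID'] == 6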
Example #3
def export_binary_eur(cluster_idx, num_clusters=10, batch_size = 256):
    r'''
    Export summary statistics for binary traits defined only for EUR. 
    Given the large number of such traits (4184), it makes sense to batch this 
    across `num_clusters` clusters for reduced wall time and robustness to mid-export errors.
    NOTE: `cluster_idx` is 1-indexed.
    '''
    mt0 = get_final_sumstats_mt_for_export()
    meta_mt0 = hl.read_matrix_table(get_meta_analysis_results_path())
    
    mt0 = mt0.annotate_cols(pheno_id = get_pheno_id(tb=mt0))
    mt0 = mt0.annotate_rows(chr = mt0.locus.contig,
                            pos = mt0.locus.position,
                            ref = mt0.alleles[0],
                            alt = mt0.alleles[1])
    
    trait_types_to_run = ['categorical','phecode', 'icd10', 'prescriptions'] # list of which trait_type to run
        
    # fields specific to each category of trait    
    meta_fields = ['AF_Cases','AF_Controls']
    fields = ['AF.Cases','AF.Controls']
    
    # dictionaries for renaming fields
    meta_field_rename_dict = {'BETA':'beta_meta',
                              'SE':'se_meta',
                              'Pvalue':'pval_meta',
                              'AF_Cases':'af_cases_meta',
                              'AF_Controls':'af_controls_meta',
                              'Pvalue_het':'pval_heterogeneity'}
    field_rename_dict = {'AF.Cases':'af_cases',
                         'AF.Controls':'af_controls',
                         'BETA':'beta',
                         'SE':'se',
                         'Pvalue':'pval',
                         'low_confidence':'low_confidence'} # identity mapping kept so the renaming loop below handles all fields uniformly
    
    all_binary_trait_types = {'categorical','phecode', 'icd10', 'prescriptions'}
    
    meta_fields += ['BETA','SE','Pvalue','Pvalue_het']
    fields += ['BETA','SE','Pvalue','low_confidence']
    
    trait_category = 'binary'        
    trait_types = all_binary_trait_types.intersection(trait_types_to_run) # get list of binary trait types to run
    pop_set = {'EUR'}
    start = time()
    
    mt1 = mt0.filter_cols((hl.literal(trait_types).contains(mt0.trait_type))&
                          (hl.set(mt0.pheno_data.pop)==hl.literal(pop_set)))
    
    pheno_id_list = mt1.pheno_id.collect()
    
    num_traits = len(pheno_id_list) # total number of traits to run
    
    traits_per_cluster = ceil(num_traits/num_clusters) # maximum traits to run per cluster
    
    cluster_pheno_id_list = pheno_id_list[(cluster_idx-1)*traits_per_cluster:cluster_idx*traits_per_cluster] # list of traits to run in current cluster
    
    print(len(cluster_pheno_id_list))
    
    mt1 = mt1.filter_cols(hl.literal(cluster_pheno_id_list).contains(mt1.pheno_id))
    
    pop_list = sorted(pop_set)
    
    annotate_dict = {}
    
    keyed_mt = meta_mt0[mt1.row_key,mt1.col_key]
    if len(pop_set)>1:
        for field in meta_fields: # NOTE: Meta-analysis columns go before per-population columns
            field_expr = keyed_mt.meta_analysis[field][0]
            annotate_dict.update({f'{meta_field_rename_dict[field]}': hl.if_else(hl.is_nan(field_expr),
                                                                      hl.str(field_expr),
                                                                      hl.format('%.3e', field_expr))})

    for field in fields:
        for pop_idx, pop in enumerate(pop_list):
            field_expr = mt1.summary_stats[field][pop_idx]
            annotate_dict.update({f'{field_rename_dict[field]}_{pop}': hl.if_else(hl.is_nan(field_expr),
                                                                       hl.str(field_expr),
                                                                       hl.str(field_expr) if field=='low_confidence' else hl.format('%.3e', field_expr))})
    
    mt2 = mt1.annotate_entries(**annotate_dict)
    
    mt2 = mt2.filter_cols(mt2.coding != 'zekavat_20200409')
    mt2 = mt2.key_cols_by('pheno_id')
    mt2 = mt2.key_rows_by().drop('locus','alleles','summary_stats') # row fields that are no longer included: 'gene','annotation'
    mt2.describe()
    
    batch_idx = 1
    get_export_path = lambda batch_idx: f'{ldprune_dir}/release/{trait_category}/{"-".join(pop_list)}_batch{batch_idx}/subbatch{cluster_idx}'

    while hl.hadoop_is_dir(get_export_path(batch_idx)):
        batch_idx += 1
    print(f'\nExporting {len(cluster_pheno_id_list)} phenos to: {get_export_path(batch_idx)}\n')
    hl.experimental.export_entries_by_col(mt = mt2,
                                          path = get_export_path(batch_idx),
                                          bgzip = True,
                                          batch_size = batch_size,
                                          use_string_key_as_file_name = True,
                                          header_json_in_file = False)
    end = time()
    print(f'\nExport complete for:\n{trait_types}\n{pop_list}\ntime: {round((end-start)/3600,2)} hrs')
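
For intuition on the 1-indexed cluster slicing in export_binary_eur: with the 4184 traits and num_clusters=10 mentioned in the docstring, each cluster is assigned at most ceil(4184/10) = 419 traits, and cluster_idx=1 takes elements 0..418 of pheno_id_list. A small self-contained check of that arithmetic (the trait count is taken from the docstring, not recomputed):

from math import ceil

num_traits, num_clusters = 4184, 10
traits_per_cluster = ceil(num_traits / num_clusters)  # 419

def cluster_slice(cluster_idx):
    # cluster_idx is 1-indexed, as in export_binary_eur
    return (cluster_idx - 1) * traits_per_cluster, cluster_idx * traits_per_cluster

assert cluster_slice(1) == (0, 419)
assert cluster_slice(10) == (3771, 4190)  # the final cluster receives the remaining 413 traits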
Example #4
    def test_hadoop_is_dir(self):
        self.assertTrue(hl.hadoop_is_dir(resource('ls_test/subdir')))
        self.assertFalse(hl.hadoop_is_dir(resource('ls_test/f_50')))
        self.assertFalse(hl.hadoop_is_dir(resource('ls_test/invalid-path')))
Example #5
"""
@author: nbaya
"""

import subprocess
from itertools import combinations
import hail as hl

hl.init()

for num_pops in range(1, 6)[::-1]:  # iterate num_pops = 5, 4, 3, 2, 1

    all_pops = ['AFR', 'AMR', 'CSA', 'EAS', 'EUR', 'MID']
    # list of exact sets of pops for which a phenotype is defined
    pop_sets = [set(c) for c in combinations(all_pops, num_pops)]

    incorrect_pops = all_pops[:num_pops]  # incorrect column suffixes

    for trait_category in ['quant', 'binary']:

        for pop_set in pop_sets:

            pop_list = sorted(pop_set)  # correct column suffixes

            bucket = f'gs://ukb-diverse-pops/ld_prune/release/{trait_category}/{"-".join(pop_list)}_batch1'
            print(bucket)

            if hl.hadoop_is_dir(bucket):
                subprocess.call(['gsutil', '-m', 'cp', f'{bucket}/*bgz', './'])
                assert False
#                break
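
To make the pop_sets enumeration above concrete: for num_pops = 5, combinations() yields exactly the six leave-one-out subsets of the full population list. A quick standard-library check, independent of Hail:

from itertools import combinations

all_pops = ['AFR', 'AMR', 'CSA', 'EAS', 'EUR', 'MID']
pop_sets = [set(c) for c in combinations(all_pops, 5)]
assert len(pop_sets) == 6
assert {'AFR', 'AMR', 'CSA', 'EAS', 'EUR'} in pop_sets  # the set leaving out MID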
Example #6
import json
import os

import hail as hl

hl.init(log="/dev/null")

#%%
os.chdir("/Users/weisburd/code/methods/gcnv_viewer")
print(os.getcwd())

#%%

#google_storage_dir = "gs://fc-secure-e2c5f2a5-2e76-4c01-a264-419262b2c7c8/dcr_tabs"
#google_storage_dir = "gs://seqr-datasets-gcnv/GRCh38/RDG_WES_Broad_Internal/v1/beds"
google_storage_dir = "gs://seqr-datasets-gcnv/GRCh38/RDG_WES_Broad_Internal/v3/beds"

assert hl.hadoop_is_dir(google_storage_dir)

#%%

batch_name_to_path_and_samples = {}

for result in hl.hadoop_ls(google_storage_dir):

    if not result['path'].endswith('.bed.gz') and not result['path'].endswith('.bed'):
        continue

    if result['size_bytes'] < 1000:
        print(f"ERROR: file size of {result['path']} is too small: {result['size_bytes']}")

    with hl.hadoop_open(result['path'], 'r') as f:
        line = f.readline()