Example #1
0
def get_final_sumstats_mt_for_export():
    """Load the final summary-statistics MatrixTable for export.

    All loader-side filtering/annotation options are disabled, and every
    row field is dropped (only the row key is kept) so the table is lean
    for downstream export.
    """
    loader_opts = dict(filter_sumstats=False,
                       filter_variants=False,
                       separate_columns_by_pop=False,
                       annotate_with_nearest_gene=False)
    mt = load_final_sumstats_mt(**loader_opts)
    # Keep only the row key; all other row fields are unnecessary for export.
    return mt.select_rows()
Example #2
0
def make_pheno_manifest(export=True):
    """Build a per-phenotype manifest from the final sumstats MatrixTable,
    joining in Dropbox file metadata (links, wget commands, sizes, md5 sums)
    for both each phenotype's .tsv.bgz file and its tabix (.tbi) index.

    NOTE(review): the `export` parameter is never used, and the function
    neither exports nor returns `ht` — it only calls describe()/show().
    Confirm whether an export/return step was dropped.
    """
    mt0 = load_final_sumstats_mt(filter_sumstats=False,
                                 filter_variants=False,
                                 separate_columns_by_pop=False,
                                 annotate_with_nearest_gene=False)

    # Work on the column (phenotype) table only: one row per phenotype.
    ht = mt0.cols()
    annotate_dict = {}

    # Comma-delimited population list plus the number of populations.
    annotate_dict.update({
        'pops': hl.delimit(ht.pheno_data.pop),
        'num_pops': hl.len(ht.pheno_data.pop)
    })

    # Flatten the per-population arrays in pheno_data into one scalar
    # column per (field, population), e.g. n_cases_EUR.
    for field in ['n_cases', 'n_controls', 'heritability', 'lambda_gc']:
        for pop in ['AFR', 'AMR', 'CSA', 'EAS', 'EUR', 'MID']:
            new_field = field if field != 'heritability' else 'saige_heritability'  # new field name (only applicable to saige heritability)
            idx = ht.pheno_data.pop.index(pop)
            field_expr = ht.pheno_data[field]
            # NOTE(review): Hail's `.index()` yields a *missing* value (not
            # NaN) when `pop` is absent, so `hl.is_nan(idx)` may not guard
            # the lookup as intended — confirm against Hail semantics
            # (`hl.is_missing` would be the usual check).
            annotate_dict.update({
                f'{new_field}_{pop}':
                hl.if_else(hl.is_nan(idx), hl.null(field_expr[0].dtype),
                           field_expr[idx])
            })
    annotate_dict.update({'filename': get_pheno_id(tb=ht) + '.tsv.bgz'})
    ht = ht.annotate(**annotate_dict)

    # Dropbox manifest keyed by file name; drop rows flagged as old files.
    dropbox_manifest = hl.import_table(
        f'{ldprune_dir}/UKBB_Pan_Populations-Manifest_20200615-manifest_info.tsv',
        impute=True,
        key='File')
    dropbox_manifest = dropbox_manifest.filter(
        dropbox_manifest['is_old_file'] != '1')
    # Split into data files (no '.tbi') and tabix indices, each keyed by the
    # data file's name so both can be joined against ht.filename.
    bgz = dropbox_manifest.filter(~dropbox_manifest.File.contains('.tbi'))
    bgz = bgz.rename({'File': 'filename'})
    tbi = dropbox_manifest.filter(dropbox_manifest.File.contains('.tbi'))
    tbi = tbi.annotate(
        filename=tbi.File.replace('.tbi', '')).key_by('filename')

    dropbox_annotate_dict = {}

    # Raw manifest column name -> manifest-friendly field name.
    rename_dict = {
        'dbox link': 'dropbox_link',
        'size (bytes)': 'size_in_bytes'
    }

    dropbox_annotate_dict.update({'filename_tabix': tbi[ht.filename].File})
    # Pull each metadata field twice: once for the data file (no suffix)
    # and once for its tabix index ('_tabix' suffix). Unrenamed fields get
    # spaces replaced with underscores (e.g. 'md5 hex' -> 'md5_hex').
    for field in ['dbox link', 'wget', 'size (bytes)', 'md5 hex']:
        for tb, suffix in [(bgz, ''), (tbi, '_tabix')]:
            dropbox_annotate_dict.update({
                (rename_dict[field] if field in rename_dict else field.replace(
                     ' ', '_')) + suffix:
                tb[ht.filename][field]
            })
    ht = ht.annotate(**dropbox_annotate_dict)
    # The per-population struct array has been flattened; drop the original.
    ht = ht.drop('pheno_data')
    ht.describe()
    ht.show()
Example #3
0
def make_pheno_manifest():
    """Build and export the phenotype manifest for the sumstats release.

    Flattens per-population metadata into scalar columns, derives each
    phenotype's release filename, attaches AWS S3 download links (data file
    and tabix index), joins in precomputed md5/size fields, and exports the
    result to `{bucket}/combined_results/phenotype_manifest.tsv.bgz`.
    """
    mt0 = load_final_sumstats_mt(filter_sumstats=False,
                                 filter_variants=False,
                                 separate_columns_by_pop=False,
                                 annotate_with_nearest_gene=False)
    # One row per phenotype.
    ht = mt0.cols()
    annotate_dict = {}

    # Comma-delimited population list plus the number of populations.
    annotate_dict.update({
        'pops': hl.delimit(ht.pheno_data.pop),
        'num_pops': hl.len(ht.pheno_data.pop)
    })

    # Flatten the per-population arrays into one scalar column per
    # (field, population), e.g. lambda_gc_EUR.
    for field in ['n_cases', 'n_controls', 'heritability', 'lambda_gc']:
        for pop in ['AFR', 'AMR', 'CSA', 'EAS', 'EUR', 'MID']:
            new_field = field if field != 'heritability' else 'saige_heritability'  # new field name (only applicable to saige heritability)
            idx = ht.pheno_data.pop.index(pop)
            field_expr = ht.pheno_data[field]
            # NOTE(review): `.index()` yields a *missing* value (not NaN)
            # when `pop` is absent, so `hl.is_nan(idx)` may not guard the
            # lookup as intended — confirm (`hl.is_missing` is the usual
            # check in Hail).
            annotate_dict.update({
                f'{new_field}_{pop}':
                hl.if_else(hl.is_nan(idx), hl.null(field_expr[0].dtype),
                           field_expr[idx])
            })
    # Release filename: trait_type-phenocode-pheno_sex[-coding][-modifier],
    # with spaces and slashes sanitized to underscores.
    annotate_dict.update({
        'filename':
        (ht.trait_type + '-' + ht.phenocode + '-' + ht.pheno_sex +
         hl.if_else(hl.len(ht.coding) > 0, '-' + ht.coding, '') +
         hl.if_else(hl.len(ht.modifier) > 0, '-' + ht.modifier, '')).replace(
             ' ', '_').replace('/', '_') + '.tsv.bgz'
    })
    ht = ht.annotate(**annotate_dict)
    # Public S3 download links for the data file and its tabix index
    # (index lives under a parallel '..._tabix' prefix).
    aws_bucket = 'https://pan-ukb-us-east-1.s3.amazonaws.com/sumstats_release'
    ht = ht.annotate(aws_link=aws_bucket + '/' + ht.filename,
                     aws_link_tabix=aws_bucket + '_tabix/' + ht.filename +
                     '.tbi')

    # Join in precomputed file sizes and md5 checksums, keyed by the
    # phenotype key fields.
    other_fields_ht = hl.import_table(
        f'{ldprune_dir}/release/md5_hex_and_file_size.tsv.bgz',
        force_bgz=True,
        key=PHENO_KEY_FIELDS)
    other_fields = [
        'size_in_bytes', 'size_in_bytes_tabix', 'md5_hex', 'md5_hex_tabix'
    ]

    # Ready-to-paste wget commands plus the joined size/md5 fields.
    ht = ht.annotate(wget='wget ' + ht.aws_link,
                     wget_tabix='wget ' + ht.aws_link_tabix,
                     **{f: other_fields_ht[ht.key][f]
                        for f in other_fields})

    # Per-population structs have been flattened; drop the originals.
    ht = ht.drop('pheno_data', 'pheno_indices')
    ht.export(f'{bucket}/combined_results/phenotype_manifest.tsv.bgz')
Example #4
0
def export_binary_eur(cluster_idx, num_clusters=10, batch_size=256):
    r'''
    Export summary statistics for binary traits defined only for EUR.

    Given the large number of such traits (4184), it makes sense to batch this
    across `num_clusters` clusters for reduced wall time and robustness to
    mid-export errors.

    :param cluster_idx: 1-indexed index of the current cluster; selects which
        slice of the trait list this invocation exports.
    :param num_clusters: total number of clusters the trait list is split over.
    :param batch_size: batch size passed to `export_entries_by_col`.
    NOTE: `cluster_idx` is 1-indexed.
    '''

    hl.init(default_reference='GRCh38', log='/tmp/export_entries_by_col.log')
    mt0 = load_final_sumstats_mt(filter_sumstats=False,
                                 filter_variants=False,
                                 separate_columns_by_pop=False,
                                 annotate_with_nearest_gene=False)
    # Keep only the row key; remaining row fields are rebuilt below.
    mt0 = mt0.select_rows()
    meta_mt0 = hl.read_matrix_table(get_meta_analysis_results_path())

    # Human-readable phenotype id used as the per-column output filename:
    # trait_type-phenocode-pheno_sex[-coding][-modifier], sanitized.
    mt0 = mt0.annotate_cols(
        pheno_id=(mt0.trait_type + '-' + mt0.phenocode + '-' + mt0.pheno_sex +
                  hl.if_else(hl.len(mt0.coding) > 0, '-' + mt0.coding, '') +
                  hl.if_else(hl.len(mt0.modifier) > 0, '-' + mt0.modifier, '')
                  ).replace(' ', '_').replace('/', '_'))
    # Flat variant fields for the exported TSV header.
    mt0 = mt0.annotate_rows(chr=mt0.locus.contig,
                            pos=mt0.locus.position,
                            ref=mt0.alleles[0],
                            alt=mt0.alleles[1])

    trait_types_to_run = ['categorical', 'phecode', 'icd10',
                          'prescriptions']  # list of which trait_type to run

    # fields specific to each category of trait
    meta_fields = ['AF_Cases', 'AF_Controls']
    fields = ['AF.Cases', 'AF.Controls']

    # dictionaries for renaming fields
    meta_field_rename_dict = {
        'BETA': 'beta_meta',
        'SE': 'se_meta',
        'Pvalue': 'pval_meta',
        'AF_Cases': 'af_cases_meta',
        'AF_Controls': 'af_controls_meta',
        'Pvalue_het': 'pval_heterogeneity'
    }
    field_rename_dict = {
        'AF.Cases': 'af_cases',
        'AF.Controls': 'af_controls',
        'BETA': 'beta',
        'SE': 'se',
        'Pvalue': 'pval',
        'low_confidence': 'low_confidence'
    }  # decided on this implementation to make later code cleaner

    all_binary_trait_types = {
        'categorical', 'phecode', 'icd10', 'prescriptions'
    }

    meta_fields += ['BETA', 'SE', 'Pvalue', 'Pvalue_het']
    fields += ['BETA', 'SE', 'Pvalue', 'low_confidence']

    trait_category = 'binary'
    trait_types = all_binary_trait_types.intersection(
        trait_types_to_run)  # get list of binary trait types to run
    pop_set = {'EUR'}
    start = time()

    # Keep binary traits whose population set is exactly {EUR}.
    mt1 = mt0.filter_cols((hl.literal(trait_types).contains(mt0.trait_type)) &
                          (hl.set(mt0.pheno_data.pop) == hl.literal(pop_set)))

    pheno_id_list = mt1.pheno_id.collect()

    num_traits = len(pheno_id_list)  # total number of traits to run

    traits_per_cluster = ceil(
        num_traits / num_clusters)  # maximum traits to run per cluster

    # Slice of the trait list handled by this cluster (cluster_idx is
    # 1-indexed, hence the -1).
    cluster_pheno_id_list = pheno_id_list[
        (cluster_idx - 1) * traits_per_cluster:cluster_idx *
        traits_per_cluster]  # list of traits to run in current cluster

    print(len(cluster_pheno_id_list))

    mt1 = mt1.filter_cols(
        hl.literal(cluster_pheno_id_list).contains(mt1.pheno_id))

    pop_list = sorted(pop_set)

    annotate_dict = {}

    # Join meta-analysis results on (variant, phenotype) key.
    keyed_mt = meta_mt0[mt1.row_key, mt1.col_key]
    if len(pop_set) > 1:  # never true here (pop_set == {'EUR'}); kept for parity with export_results
        for field in meta_fields:  # NOTE: Meta-analysis columns go before per-population columns
            #        annotate_dict.update({f'{meta_field_rename_dict[field]}': hl.float64(hl.format('%.3e', keyed_mt.meta_analysis[field][0]))})
            field_expr = keyed_mt.meta_analysis[field][0]
            annotate_dict.update({
                f'{meta_field_rename_dict[field]}':
                hl.if_else(hl.is_nan(field_expr), hl.str(field_expr),
                           hl.format('%.3e', field_expr))
            })

    # Per-population columns: format floats in scientific notation; keep
    # low_confidence as its string form.
    for field in fields:
        for pop_idx, pop in enumerate(pop_list):
            #            annotate_dict.update({f'{field_rename_dict[field]}_{pops[pop_idx]}': hl.format('%.3e', mt1.summary_stats[field][pop_idx])})
            field_expr = mt1.summary_stats[field][pop_idx]
            annotate_dict.update({
                f'{field_rename_dict[field]}_{pop}':
                hl.if_else(
                    hl.is_nan(field_expr), hl.str(field_expr),
                    hl.str(field_expr) if field == 'low_confidence' else
                    hl.format('%.3e', field_expr))
            })

    mt2 = mt1.annotate_entries(**annotate_dict)

    # Drop a known-bad phenotype coding.
    mt2 = mt2.filter_cols(mt2.coding != 'zekavat_20200409')
    mt2 = mt2.key_cols_by('pheno_id')
    mt2 = mt2.key_rows_by().drop(
        'locus', 'alleles', 'summary_stats'
    )  # row fields that are no longer included: 'gene','annotation'
    # describe() prints its output itself and returns None; wrapping it in
    # print() (as before) emitted a stray "None" line.
    mt2.describe()

    batch_idx = 1
    get_export_path = lambda batch_idx: f'{ldprune_dir}/release/{trait_category}/{"-".join(pop_list)}_batch{batch_idx}/subbatch{cluster_idx}'

    # Find the first batch index whose output directory does not exist yet,
    # so reruns never clobber a previous export.
    while hl.hadoop_is_dir(get_export_path(batch_idx)):
        batch_idx += 1
    print(
        f'\nExporting {len(cluster_pheno_id_list)} phenos to: {get_export_path(batch_idx)}\n'
    )
    hl.experimental.export_entries_by_col(mt=mt2,
                                          path=get_export_path(batch_idx),
                                          bgzip=True,
                                          batch_size=batch_size,
                                          use_string_key_as_file_name=True,
                                          header_json_in_file=False)
    end = time()
    print(
        f'\nExport complete for:\n{trait_types}\n{pop_list}\ntime: {round((end-start)/3600,2)} hrs'
    )
Example #5
0
def export_results(num_pops,
                   trait_types='all',
                   batch_size=256,
                   phenocode=None):
    r'''
    Export per-variant summary statistics, one file per phenotype, for every
    exact population combination of size `num_pops`.

    `num_pops`: exact number of populations for which phenotype is defined
    `trait_types`: trait category (options: all, binary, quant)
    `batch_size`: batch size argument for export entries by col
    `phenocode`: if given, restrict the export to traits with this phenocode
    '''
    assert trait_types in {
        'all', 'quant', 'binary'
    }, "trait_types must be one of the following: {'all','quant','binary'}"
    hl.init(default_reference='GRCh38', log='/tmp/export_entries_by_col.log')
    print(f'\n\nExporting {trait_types} trait types for {num_pops} pops\n\n')
    mt0 = load_final_sumstats_mt(filter_sumstats=False,
                                 filter_variants=False,
                                 separate_columns_by_pop=False,
                                 annotate_with_nearest_gene=False)
    mt0 = mt0.select_rows()

    # Fixed: `!= None` -> `is not None`, and the message was missing its
    # f-prefix so it printed the literal text '{phenocode}'.
    if phenocode is not None:
        print(f'\nFiltering to traits with phenocode: {phenocode}\n')
        mt0 = mt0.filter_cols(mt0.phenocode == phenocode)

    meta_mt0 = hl.read_matrix_table(get_meta_analysis_results_path())

    # Human-readable phenotype id used as the per-column output filename:
    # trait_type-phenocode-pheno_sex[-coding][-modifier], sanitized.
    mt0 = mt0.annotate_cols(
        pheno_id=(mt0.trait_type + '-' + mt0.phenocode + '-' + mt0.pheno_sex +
                  hl.if_else(hl.len(mt0.coding) > 0, '-' + mt0.coding, '') +
                  hl.if_else(hl.len(mt0.modifier) > 0, '-' + mt0.modifier, '')
                  ).replace(' ', '_').replace('/', '_'))
    # Flat variant fields for the exported TSV header.
    mt0 = mt0.annotate_rows(chr=mt0.locus.contig,
                            pos=mt0.locus.position,
                            ref=mt0.alleles[0],
                            alt=mt0.alleles[1])

    all_pops = ['AFR', 'AMR', 'CSA', 'EAS', 'EUR', 'MID']

    if trait_types == 'all':
        trait_types_to_run = [
            'continuous', 'biomarkers', 'categorical', 'phecode', 'icd10',
            'prescriptions'
        ]  # list of which trait_type to run
    elif trait_types == 'quant':
        trait_types_to_run = ['continuous', 'biomarkers']
    elif trait_types == 'binary':
        trait_types_to_run = [
            'categorical', 'phecode', 'icd10', 'prescriptions'
        ]

    pop_sets = [set(i) for i in list(combinations(all_pops, num_pops))
                ]  # list of exact set of pops for which phenotype is defined

    # fields specific to each category of trait
    quant_meta_fields = ['AF_Allele2']
    quant_fields = ['AF_Allele2']

    binary_meta_fields = ['AF_Cases', 'AF_Controls']
    binary_fields = ['AF.Cases', 'AF.Controls']

    # dictionaries for renaming fields
    quant_meta_field_rename_dict = {
        'AF_Allele2': 'af_meta',
        'BETA': 'beta_meta',
        'SE': 'se_meta',
        'Pvalue': 'pval_meta',
        'Pvalue_het': 'pval_heterogeneity'
    }
    quant_field_rename_dict = {
        'AF_Allele2': 'af',
        'BETA': 'beta',
        'SE': 'se',
        'Pvalue': 'pval',
        'low_confidence': 'low_confidence'
    }  # decided on this implementation to make later code cleaner

    binary_meta_field_rename_dict = {
        'BETA': 'beta_meta',
        'SE': 'se_meta',
        'Pvalue': 'pval_meta',
        'AF_Cases': 'af_cases_meta',
        'AF_Controls': 'af_controls_meta',
        'Pvalue_het': 'pval_heterogeneity'
    }
    binary_field_rename_dict = {
        'AF.Cases': 'af_cases',
        'AF.Controls': 'af_controls',
        'BETA': 'beta',
        'SE': 'se',
        'Pvalue': 'pval',
        'low_confidence': 'low_confidence'
    }  # decided on this implementation to make later code cleaner

    all_quant_trait_types = {'continuous', 'biomarkers'}
    all_binary_trait_types = {
        'categorical', 'phecode', 'icd10', 'prescriptions'
    }

    quant_trait_types = all_quant_trait_types.intersection(
        trait_types_to_run)  # get list of quant trait types to run
    binary_trait_types = all_binary_trait_types.intersection(
        trait_types_to_run)  # get list of binary trait types to run
    error_trait_types = set(trait_types_to_run).difference(
        quant_trait_types.union(binary_trait_types))
    assert len(
        error_trait_types
    ) == 0, f'ERROR: The following trait_types are invalid: {error_trait_types}'

    # NOTE: the loop variable deliberately shadows the `trait_types`
    # parameter (it has already been consumed above).
    for trait_category, trait_types in [('binary', binary_trait_types),
                                        ('quant', quant_trait_types)]:
        if len(trait_types) == 0:  #if no traits in trait_types list
            continue

        print(f'{trait_category} trait types to run: {trait_types}')

        if trait_category == 'quant':
            meta_fields = quant_meta_fields
            fields = quant_fields
            meta_field_rename_dict = quant_meta_field_rename_dict
            field_rename_dict = quant_field_rename_dict
        elif trait_category == 'binary':
            meta_fields = binary_meta_fields
            fields = binary_fields
            meta_field_rename_dict = binary_meta_field_rename_dict
            field_rename_dict = binary_field_rename_dict

        meta_fields += ['BETA', 'SE', 'Pvalue', 'Pvalue_het']
        fields += ['BETA', 'SE', 'Pvalue', 'low_confidence']

        for pop_set in pop_sets:
            start = time()

            if pop_set == {
                    'EUR'
            } and trait_category == 'binary':  # run EUR-only binary traits separately
                print('\nSkipping EUR-only binary traits\n')
                continue

            # Keep traits of this category whose population set matches
            # pop_set exactly.
            mt1 = mt0.filter_cols(
                (hl.literal(trait_types).contains(mt0.trait_type))
                & (hl.set(mt0.pheno_data.pop) == hl.literal(pop_set)))

            col_ct = mt1.count_cols()
            if col_ct == 0:
                print(
                    f'\nSkipping {trait_types},{sorted(pop_set)}, no phenotypes found\n'
                )
                continue

            pop_list = sorted(pop_set)

            annotate_dict = {}
            # TODO: Filter variants with NA in field
            # Join meta-analysis results on (variant, phenotype) key.
            keyed_mt = meta_mt0[mt1.row_key, mt1.col_key]
            if len(pop_set) > 1:
                for field in meta_fields:  # NOTE: Meta-analysis columns go before per-population columns
                    #        annotate_dict.update({f'{meta_field_rename_dict[field]}': hl.float64(hl.format('%.3e', keyed_mt.meta_analysis[field][0]))})
                    field_expr = keyed_mt.meta_analysis[field][0]
                    annotate_dict.update({
                        f'{meta_field_rename_dict[field]}':
                        hl.if_else(hl.is_nan(field_expr), hl.str(field_expr),
                                   hl.format('%.3e', field_expr))
                    })

            # Per-population columns: floats in scientific notation;
            # low_confidence kept as its string form.
            for field in fields:
                for pop_idx, pop in enumerate(pop_list):
                    #            annotate_dict.update({f'{field_rename_dict[field]}_{pops[pop_idx]}': hl.format('%.3e', mt1.summary_stats[field][pop_idx])})
                    field_expr = mt1.summary_stats[field][pop_idx]
                    annotate_dict.update({
                        f'{field_rename_dict[field]}_{pop}':
                        hl.if_else(
                            hl.is_nan(field_expr), hl.str(field_expr),
                            hl.str(field_expr) if field == 'low_confidence'
                            else hl.format('%.3e', field_expr))
                    })

            mt2 = mt1.annotate_entries(**annotate_dict)

            # Drop a known-bad phenotype coding.
            mt2 = mt2.filter_cols(mt2.coding != 'zekavat_20200409')
            mt2 = mt2.key_cols_by('pheno_id')
            mt2 = mt2.key_rows_by().drop(
                'locus', 'alleles', 'summary_stats'
            )  # row fields that are no longer included: 'gene','annotation'

            batch_idx = 1
            get_export_path = lambda batch_idx: f'{ldprune_dir}/release/{trait_category}/{"" if phenocode is None else f"{phenocode}/"}{"-".join(pop_list)}_batch{batch_idx}'
            #            if hl.hadoop_is_dir(get_export_path(batch_idx)):
            #                print(f'Skipping because path exists: {get_export_path(batch_idx)}')
            #                continue
            # describe() prints its output itself and returns None; wrapping
            # it in print() (as before) emitted a stray "None" line.
            mt2.describe()
            # Find the first batch index whose output directory does not
            # exist yet, so reruns never clobber a previous export.
            while hl.hadoop_is_dir(get_export_path(batch_idx)):
                batch_idx += 1
            print(
                f'\nExporting {col_ct} phenos to: {get_export_path(batch_idx)}\n'
            )
            hl.experimental.export_entries_by_col(
                mt=mt2,
                path=get_export_path(batch_idx),
                bgzip=True,
                batch_size=batch_size,
                use_string_key_as_file_name=True,
                header_json_in_file=False)
            end = time()
            print(
                f'\nExport complete for:\n{trait_types}\n{pop_list}\ntime: {round((end-start)/3600,2)} hrs'
            )