def test_tdt(self):
    pedigree = hl.Pedigree.read(resource('tdt.fam'))
    tdt_tab = (hl.transmission_disequilibrium_test(
        hl.split_multi_hts(hl.import_vcf(resource('tdt.vcf'), min_partitions=4)),
        pedigree))

    truth = hl.import_table(
        resource('tdt_results.tsv'),
        types={'POSITION': hl.tint32, 'T': hl.tint32, 'U': hl.tint32,
               'Chi2': hl.tfloat64, 'Pval': hl.tfloat64})
    truth = (truth
             .transmute(locus=hl.locus(truth.CHROM, truth.POSITION),
                        alleles=[truth.REF, truth.ALT])
             .key_by('locus', 'alleles'))

    if tdt_tab.count() != truth.count():
        self.fail('Result has {} rows but should have {} rows'.format(
            tdt_tab.count(), truth.count()))

    bad = (tdt_tab.filter(hl.is_nan(tdt_tab.p_value), keep=False)
           .join(truth.filter(hl.is_nan(truth.Pval), keep=False), how='outer'))
    bad.describe()

    bad = bad.filter(~(
        (bad.t == bad.T) &
        (bad.u == bad.U) &
        (hl.abs(bad.chi_sq - bad.Chi2) < 0.001) &
        (hl.abs(bad.p_value - bad.Pval) < 0.001)))

    if bad.count() != 0:
        # Sort by the table key (locus, alleles) so the offending rows are easy to inspect
        bad.order_by(hl.asc(bad.locus)).show()
        self.fail('Found rows in violation of the predicate (see show output)')
def nullify_nan(value):
    return hl.cond(hl.is_nan(value), hl.null(value.dtype), value)
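# A minimal usage sketch for `nullify_nan` (the table path and the `pval` field below are
# hypothetical, chosen only for illustration): NaN values break comparisons and skew
# aggregations, so converting them to missing lets Hail's aggregators ignore them.
def _example_nullify_nan_usage():
    ht = hl.read_table('gs://my-bucket/assoc_results.ht')  # hypothetical table with a float64 `pval` field
    ht = ht.annotate(pval=nullify_nan(ht.pval))  # NaN p-values become missing, other values pass through
    return ht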
def compute_coverage_stats(
    mt: hl.MatrixTable,
    reference_ht: hl.Table,
    coverage_over_x_bins: List[int] = [1, 5, 10, 15, 20, 25, 30, 50, 100],
) -> hl.Table:
    """
    Computes the following coverage statistics for every base of the `reference_ht` provided:
        - mean
        - median
        - total DP
        - fraction of samples with coverage above X, for each x in `coverage_over_x_bins`

    The `reference_ht` is a table that contains a row for each locus for which coverage should
    be computed. It needs to be keyed by the same keys as `mt`, typically either `locus` or
    `locus, alleles`. The `reference_ht` can e.g. be created using `get_reference_ht`.

    :param mt: Input sparse MT
    :param reference_ht: Input reference HT
    :param coverage_over_x_bins: List of boundaries for computing samples over X
    :return: Table with per-base coverage stats
    """
    n_samples = mt.count_cols()
    print(f"Computing coverage stats on {n_samples} samples.")

    # Create an outer join with the reference Table
    mt = mt.select_entries("END", "DP").select_cols().select_rows()
    col_key_fields = list(mt.col_key)
    t = mt._localize_entries("__entries", "__cols")
    t = t.join(reference_ht.key_by(*mt.row_key).select(_in_ref=True), how="outer")
    t = t.annotate(
        __entries=hl.or_else(
            t.__entries,
            hl.range(n_samples).map(lambda x: hl.null(t.__entries.dtype.element_type)),
        )
    )
    mt = t._unlocalize_entries("__entries", "__cols", col_key_fields)

    # Densify
    mt = hl.experimental.densify(mt)

    # Filter rows where the reference is missing
    mt = mt.filter_rows(mt._in_ref)

    # Unfilter entries so that entries with no ref block overlap aren't null
    mt = mt.unfilter_entries()

    # Compute coverage stats
    coverage_over_x_bins = sorted(coverage_over_x_bins)
    max_coverage_bin = coverage_over_x_bins[-1]
    hl_coverage_over_x_bins = hl.array(coverage_over_x_bins)

    # This expression creates a counter DP -> number of samples for DP between 0 and max_coverage_bin
    coverage_counter_expr = hl.agg.counter(
        hl.min(max_coverage_bin, hl.or_else(mt.DP, 0))
    )

    # This expression aggregates the DP counter in reverse order of the coverage_over_x_bins
    # and computes the cumulative sum over them.
    # It needs to be in reverse order because we want the sum over samples covered by > X.
    count_array_expr = hl.cumulative_sum(
        hl.array(
            # The coverage was already floored to the max_coverage_bin, so no more
            # aggregation is needed for the max bin
            [hl.int32(coverage_counter_expr.get(max_coverage_bin, 0))]
        ).extend(
            # For each of the other bins, coverage needs to be summed between the boundaries
            hl.range(hl.len(hl_coverage_over_x_bins) - 1, 0, step=-1).map(
                lambda i: hl.sum(
                    hl.range(
                        hl_coverage_over_x_bins[i - 1], hl_coverage_over_x_bins[i]
                    ).map(lambda j: hl.int32(coverage_counter_expr.get(j, 0)))
                )
            )
        )
    )
    mean_expr = hl.agg.mean(hl.or_else(mt.DP, 0))

    # Annotate rows now
    return mt.select_rows(
        mean=hl.cond(hl.is_nan(mean_expr), 0, mean_expr),
        median_approx=hl.or_else(hl.agg.approx_median(hl.or_else(mt.DP, 0)), 0),
        total_DP=hl.agg.sum(mt.DP),
        **{
            f"over_{x}": count_array_expr[i] / n_samples
            for i, x in zip(
                # Reverse the bin index as count_array_expr has the reverse order
                range(len(coverage_over_x_bins) - 1, -1, -1),
                coverage_over_x_bins,
            )
        },
    ).rows()
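# A minimal usage sketch for `compute_coverage_stats`, assuming hypothetical input paths:
# the MatrixTable must be a sparse MT with `END` and `DP` entry fields, and the reference
# table must share the MT's row key (see the docstring above). Neither path below exists
# in this module; they stand in for the caller's own data.
def _example_compute_coverage_stats_usage():
    mt = hl.read_matrix_table("gs://my-bucket/sparse_callset.mt")
    reference_ht = hl.read_table("gs://my-bucket/reference_positions.ht")
    coverage_ht = compute_coverage_stats(mt, reference_ht, coverage_over_x_bins=[1, 10, 20, 30])
    # Resulting row fields: mean, median_approx, total_DP, over_1, over_10, over_20, over_30
    coverage_ht.write("gs://my-bucket/coverage.ht", overwrite=True)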
def export_results(num_pops, trait_types='all', batch_size=256, mt=None,
                   export_path_str=None, skip_binary_eur=True):
    r'''
    `num_pops`: exact number of populations for which the phenotype is defined
    `trait_types`: trait category (options: all, binary, quant)
    `batch_size`: batch size argument for export_entries_by_col
    '''
    assert trait_types in {'all', 'quant', 'binary'}, \
        "trait_types must be one of the following: {'all','quant','binary'}"
    print(f'\n\nExporting {trait_types} trait types for {num_pops} pops\n\n')
    if mt is None:
        mt0 = get_final_sumstats_mt_for_export()
    else:
        mt0 = mt

    meta_mt0 = hl.read_matrix_table(get_meta_analysis_results_path())

    mt0 = mt0.annotate_cols(pheno_id=get_pheno_id(tb=mt0))
    mt0 = mt0.annotate_rows(chr=mt0.locus.contig,
                            pos=mt0.locus.position,
                            ref=mt0.alleles[0],
                            alt=mt0.alleles[1])

    all_pops = ['AFR', 'AMR', 'CSA', 'EAS', 'EUR', 'MID']

    if trait_types == 'all':
        trait_types_to_run = ['continuous', 'biomarkers', 'categorical', 'phecode',
                              'icd10', 'prescriptions']  # list of which trait_type to run
    elif trait_types == 'quant':
        trait_types_to_run = ['continuous', 'biomarkers']
    elif trait_types == 'binary':
        trait_types_to_run = ['categorical', 'phecode', 'icd10', 'prescriptions']

    pop_sets = [set(i) for i in list(combinations(all_pops, num_pops))]  # list of exact sets of pops for which the phenotype is defined

    # fields specific to each category of trait
    quant_meta_fields = ['AF_Allele2']
    quant_fields = ['AF_Allele2']

    binary_meta_fields = ['AF_Cases', 'AF_Controls']
    binary_fields = ['AF.Cases', 'AF.Controls']

    # dictionaries for renaming fields
    quant_meta_field_rename_dict = {'AF_Allele2': 'af_meta',
                                    'BETA': 'beta_meta',
                                    'SE': 'se_meta',
                                    'Pvalue': 'pval_meta',
                                    'Pvalue_het': 'pval_heterogeneity'}
    quant_field_rename_dict = {'AF_Allele2': 'af',
                               'BETA': 'beta',
                               'SE': 'se',
                               'Pvalue': 'pval',
                               'low_confidence': 'low_confidence'}  # decided on this implementation to make later code cleaner

    binary_meta_field_rename_dict = {'BETA': 'beta_meta',
                                     'SE': 'se_meta',
                                     'Pvalue': 'pval_meta',
                                     'AF_Cases': 'af_cases_meta',
                                     'AF_Controls': 'af_controls_meta',
                                     'Pvalue_het': 'pval_heterogeneity'}
    binary_field_rename_dict = {'AF.Cases': 'af_cases',
                                'AF.Controls': 'af_controls',
                                'BETA': 'beta',
                                'SE': 'se',
                                'Pvalue': 'pval',
                                'low_confidence': 'low_confidence'}  # decided on this implementation to make later code cleaner

    all_quant_trait_types = {'continuous', 'biomarkers'}
    all_binary_trait_types = {'categorical', 'phecode', 'icd10', 'prescriptions'}

    quant_trait_types = all_quant_trait_types.intersection(trait_types_to_run)  # get list of quant trait types to run
    binary_trait_types = all_binary_trait_types.intersection(trait_types_to_run)  # get list of binary trait types to run
    error_trait_types = set(trait_types_to_run).difference(quant_trait_types.union(binary_trait_types))
    assert len(error_trait_types) == 0, \
        f'ERROR: The following trait_types are invalid: {error_trait_types}'

    for trait_category, trait_types in [('binary', binary_trait_types),
                                        ('quant', quant_trait_types)]:
        if len(trait_types) == 0:  # if no traits in trait_types list
            continue

        print(f'{trait_category} trait types to run: {trait_types}')

        if trait_category == 'quant':
            meta_fields = quant_meta_fields
            fields = quant_fields
            meta_field_rename_dict = quant_meta_field_rename_dict
            field_rename_dict = quant_field_rename_dict
        elif trait_category == 'binary':
            meta_fields = binary_meta_fields
            fields = binary_fields
            meta_field_rename_dict = binary_meta_field_rename_dict
            field_rename_dict = binary_field_rename_dict

        meta_fields += ['BETA', 'SE', 'Pvalue', 'Pvalue_het']
        fields += ['BETA', 'SE', 'Pvalue', 'low_confidence']

        for pop_set in pop_sets:
            start = time()

            if (pop_set == {'EUR'} and trait_category == 'binary') and skip_binary_eur:
                # run EUR-only binary traits separately
                print('\nSkipping EUR-only binary traits\n')
                continue

            mt1 = mt0.filter_cols(
                (hl.literal(trait_types).contains(mt0.trait_type)) &
                (hl.set(mt0.pheno_data.pop) == hl.literal(pop_set)))

            col_ct = mt1.count_cols()
            if col_ct == 0:
                print(f'\nSkipping {trait_types},{sorted(pop_set)}, no phenotypes found\n')
                continue

            pop_list = sorted(pop_set)

            annotate_dict = {}
            keyed_mt = meta_mt0[mt1.row_key, mt1.col_key]
            if len(pop_set) > 1:
                # NOTE: Meta-analysis columns go before per-population columns
                for field in meta_fields:
                    field_expr = keyed_mt.meta_analysis[field][0]
                    annotate_dict.update({
                        f'{meta_field_rename_dict[field]}':
                        hl.if_else(hl.is_nan(field_expr),
                                   hl.str(field_expr),
                                   hl.format('%.3e', field_expr))
                    })

            for field in fields:
                for pop_idx, pop in enumerate(pop_list):
                    field_expr = mt1.summary_stats[field][pop_idx]
                    annotate_dict.update({
                        f'{field_rename_dict[field]}_{pop}':
                        hl.if_else(hl.is_nan(field_expr),
                                   hl.str(field_expr),
                                   hl.str(field_expr) if field == 'low_confidence'
                                   else hl.format('%.3e', field_expr))
                    })

            mt2 = mt1.annotate_entries(**annotate_dict)

            mt2 = mt2.filter_cols(mt2.coding != 'zekavat_20200409')
            mt2 = mt2.key_cols_by('pheno_id')
            mt2 = mt2.key_rows_by().drop('locus', 'alleles', 'summary_stats')  # row fields that are no longer included: 'gene','annotation'

            batch_idx = 1
            get_export_path = lambda batch_idx: f'{ldprune_dir}/export_results/{"" if export_path_str is None else f"{export_path_str}/"}{trait_category}/{"-".join(pop_list)}_batch{batch_idx}'
            print(mt2.describe())
            while hl.hadoop_is_dir(get_export_path(batch_idx)):
                batch_idx += 1
            print(f'\nExporting {col_ct} phenos to: {get_export_path(batch_idx)}\n')
            hl.experimental.export_entries_by_col(mt=mt2,
                                                  path=get_export_path(batch_idx),
                                                  bgzip=True,
                                                  batch_size=batch_size,
                                                  use_string_key_as_file_name=True,
                                                  header_json_in_file=False)
            end = time()
            print(f'\nExport complete for:\n{trait_types}\n{pop_list}\ntime: {round((end-start)/3600,2)} hrs')
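# A minimal, hypothetical driver for `export_results`. It assumes the helper functions
# referenced above (`get_final_sumstats_mt_for_export`, `get_meta_analysis_results_path`,
# `get_pheno_id`) and the `ldprune_dir` constant are available in this environment.
# Because each call exports only phenotypes defined for exactly `num_pops` populations,
# a full export sweeps over all possible population counts.
def _example_export_results_usage():
    for num_pops in range(1, 7):  # all_pops has 6 populations, so num_pops ranges from 1 to 6
        export_results(num_pops=num_pops, trait_types='all', batch_size=256)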
def export_binary_eur(cluster_idx, num_clusters=10, batch_size=256):
    r'''
    Export summary statistics for binary traits defined only for EUR.
    Given the large number of such traits (4184), it makes sense to batch this
    across `num_clusters` clusters for reduced wall time and robustness to
    mid-export errors.
    NOTE: `cluster_idx` is 1-indexed.
    '''
    mt0 = get_final_sumstats_mt_for_export()
    meta_mt0 = hl.read_matrix_table(get_meta_analysis_results_path())

    mt0 = mt0.annotate_cols(pheno_id=get_pheno_id(tb=mt0))
    mt0 = mt0.annotate_rows(chr=mt0.locus.contig,
                            pos=mt0.locus.position,
                            ref=mt0.alleles[0],
                            alt=mt0.alleles[1])

    trait_types_to_run = ['categorical', 'phecode', 'icd10', 'prescriptions']  # list of which trait_type to run

    # fields specific to each category of trait
    meta_fields = ['AF_Cases', 'AF_Controls']
    fields = ['AF.Cases', 'AF.Controls']

    # dictionaries for renaming fields
    meta_field_rename_dict = {'BETA': 'beta_meta',
                              'SE': 'se_meta',
                              'Pvalue': 'pval_meta',
                              'AF_Cases': 'af_cases_meta',
                              'AF_Controls': 'af_controls_meta',
                              'Pvalue_het': 'pval_heterogeneity'}
    field_rename_dict = {'AF.Cases': 'af_cases',
                         'AF.Controls': 'af_controls',
                         'BETA': 'beta',
                         'SE': 'se',
                         'Pvalue': 'pval',
                         'low_confidence': 'low_confidence'}  # decided on this implementation to make later code cleaner

    all_binary_trait_types = {'categorical', 'phecode', 'icd10', 'prescriptions'}

    meta_fields += ['BETA', 'SE', 'Pvalue', 'Pvalue_het']
    fields += ['BETA', 'SE', 'Pvalue', 'low_confidence']

    trait_category = 'binary'
    trait_types = all_binary_trait_types.intersection(trait_types_to_run)  # get list of binary trait types to run
    pop_set = {'EUR'}
    start = time()

    mt1 = mt0.filter_cols((hl.literal(trait_types).contains(mt0.trait_type)) &
                          (hl.set(mt0.pheno_data.pop) == hl.literal(pop_set)))

    pheno_id_list = mt1.pheno_id.collect()

    num_traits = len(pheno_id_list)  # total number of traits to run

    traits_per_cluster = ceil(num_traits / num_clusters)  # maximum traits to run per cluster

    cluster_pheno_id_list = pheno_id_list[(cluster_idx - 1) * traits_per_cluster:cluster_idx * traits_per_cluster]  # list of traits to run in current cluster

    print(len(cluster_pheno_id_list))

    mt1 = mt1.filter_cols(hl.literal(cluster_pheno_id_list).contains(mt1.pheno_id))

    pop_list = sorted(pop_set)

    annotate_dict = {}
    keyed_mt = meta_mt0[mt1.row_key, mt1.col_key]
    if len(pop_set) > 1:
        # NOTE: Meta-analysis columns go before per-population columns
        for field in meta_fields:
            field_expr = keyed_mt.meta_analysis[field][0]
            annotate_dict.update({
                f'{meta_field_rename_dict[field]}':
                hl.if_else(hl.is_nan(field_expr),
                           hl.str(field_expr),
                           hl.format('%.3e', field_expr))
            })

    for field in fields:
        for pop_idx, pop in enumerate(pop_list):
            field_expr = mt1.summary_stats[field][pop_idx]
            annotate_dict.update({
                f'{field_rename_dict[field]}_{pop}':
                hl.if_else(hl.is_nan(field_expr),
                           hl.str(field_expr),
                           hl.str(field_expr) if field == 'low_confidence'
                           else hl.format('%.3e', field_expr))
            })

    mt2 = mt1.annotate_entries(**annotate_dict)

    mt2 = mt2.filter_cols(mt2.coding != 'zekavat_20200409')
    mt2 = mt2.key_cols_by('pheno_id')
    mt2 = mt2.key_rows_by().drop('locus', 'alleles', 'summary_stats')  # row fields that are no longer included: 'gene','annotation'
    print(mt2.describe())

    batch_idx = 1
    get_export_path = lambda batch_idx: f'{ldprune_dir}/release/{trait_category}/{"-".join(pop_list)}_batch{batch_idx}/subbatch{cluster_idx}'

    while hl.hadoop_is_dir(get_export_path(batch_idx)):
        batch_idx += 1
    print(f'\nExporting {len(cluster_pheno_id_list)} phenos to: {get_export_path(batch_idx)}\n')
    hl.experimental.export_entries_by_col(mt=mt2,
                                          path=get_export_path(batch_idx),
                                          bgzip=True,
                                          batch_size=batch_size,
                                          use_string_key_as_file_name=True,
                                          header_json_in_file=False)
    end = time()
    print(f'\nExport complete for:\n{trait_types}\n{pop_list}\ntime: {round((end-start)/3600,2)} hrs')
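# A minimal, hypothetical driver for `export_binary_eur`. Per the docstring, `cluster_idx`
# is 1-indexed and each cluster exports its own slice of the EUR-only binary traits, so
# each of the `num_clusters` clusters would run a single call like this one.
def _example_export_binary_eur_usage(cluster_idx: int):
    export_binary_eur(cluster_idx=cluster_idx, num_clusters=10, batch_size=256)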
def prepare_pext_data(base_level_pext_path):
    tmp_dir = os.path.expanduser("~")

    #
    # Step 1: rename fields, extract chrom/pos from locus, convert missing values to 0, export to TSV
    #
    ds = hl.read_table(base_level_pext_path)

    ds = ds.select(
        gene_id=ds.ensg,
        chrom=ds.locus.contig,
        pos=ds.locus.position,
        # Replace NaNs and missing values with 0s
        mean=hl.cond(
            hl.is_missing(ds.mean_proportion) | hl.is_nan(ds.mean_proportion),
            hl.float(0),
            ds.mean_proportion),
        **{
            renamed: hl.cond(
                hl.is_missing(ds[original]) | hl.is_nan(ds[original]),
                hl.float(0),
                ds[original])
            for original, renamed in TISSUE_NAME_MAP.items()
        })

    ds = ds.order_by(ds.gene_id, hl.asc(ds.pos)).drop("locus")

    ds.export("file://" + os.path.join(tmp_dir, "bases.tsv"))

    #
    # Step 2: Collect base-level data into regions
    #
    with open(os.path.join(tmp_dir, "regions.tsv"), "w") as output_file:
        writer = csv.writer(output_file, delimiter="\t")
        writer.writerow(["gene_id", "chrom", "start", "stop", "mean"] + TISSUE_FIELDS)

        def output_region(region):
            writer.writerow([
                region.gene, region.chrom, region.start, region.stop,
                region.tissues["mean"]
            ] + [region.tissues[t] for t in TISSUE_FIELDS])

        rows = read_bases_tsv(os.path.join(tmp_dir, "bases.tsv"))
        first_row = next(rows)
        current_region = Region(gene=first_row.gene,
                                chrom=first_row.chrom,
                                start=first_row.pos,
                                stop=None,
                                tissues=first_row.tissues)
        last_pos = first_row.pos

        for row in rows:
            if (row.gene != current_region.gene
                    or row.chrom != current_region.chrom
                    or row.pos > (last_pos + 1)
                    or any(row.tissues[t] != current_region.tissues[t]
                           for t in row.tissues)):
                output_region(current_region._replace(stop=last_pos))
                current_region = Region(gene=row.gene,
                                        chrom=row.chrom,
                                        start=row.pos,
                                        stop=None,
                                        tissues=row.tissues)

            last_pos = row.pos

        output_region(current_region._replace(stop=last_pos))

    # Copy regions file to HDFS
    subprocess.run(
        [
            "hdfs", "dfs", "-cp",
            "file://" + os.path.join(tmp_dir, "regions.tsv"),
            os.path.join(tmp_dir, "regions.tsv")
        ],
        check=True,
    )

    #
    # Step 3: Convert regions to a Hail table.
    #
    types = {t: hl.tfloat for t in TISSUE_FIELDS}
    types["gene_id"] = hl.tstr
    types["chrom"] = hl.tstr
    types["start"] = hl.tint
    types["stop"] = hl.tint
    types["mean"] = hl.tfloat

    ds = hl.import_table(os.path.join(tmp_dir, "regions.tsv"),
                         min_partitions=100,
                         missing="",
                         types=types)

    ds = ds.select("gene_id", "chrom", "start", "stop", "mean",
                   tissues=hl.struct(**{t: ds[t] for t in TISSUE_FIELDS}))

    ds = ds.group_by("gene_id").aggregate(
        regions=hl.agg.collect(ds.row_value.drop("gene_id")))

    return ds
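# A minimal usage sketch for `prepare_pext_data`, with hypothetical paths: the input is a
# base-level pext Hail table containing `ensg`, `locus`, `mean_proportion`, and the
# per-tissue fields named in TISSUE_NAME_MAP; the result is a per-gene table of collapsed
# regions that can be written out or joined onto a gene table.
def _example_prepare_pext_data_usage():
    ds = prepare_pext_data("gs://my-bucket/pext/base_level_pext.ht")
    ds.write("gs://my-bucket/pext/pext_regions.ht", overwrite=True)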