def sor_from_sb(
    sb: Union[hl.expr.ArrayNumericExpression, hl.expr.ArrayExpression]
) -> hl.expr.Float64Expression:
    """
    Computes `SOR` (Symmetric Odds Ratio test) annotation from the `SB` (strand balance table) field.

    .. note::

        This function can either take
        - an array of length four containing the forward and reverse strands' counts of ref and alt alleles: [ref fwd, ref rev, alt fwd, alt rev]
        - a two-dimensional array with arrays of length two, containing the counts: [[ref fwd, ref rev], [alt fwd, alt rev]]

    GATK code here: https://github.com/broadinstitute/gatk/blob/master/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/StrandOddsRatio.java

    :param sb: Count of ref/alt reads on each strand
    :return: SOR value
    """
    if not isinstance(sb, hl.expr.ArrayNumericExpression):
        sb = hl.bind(lambda x: hl.flatten(x), sb)

    sb = sb.map(lambda x: hl.float64(x) + 1)

    ref_fw = sb[0]
    ref_rv = sb[1]
    alt_fw = sb[2]
    alt_rv = sb[3]
    symmetrical_ratio = ((ref_fw * alt_rv) / (alt_fw * ref_rv)) + (
        (alt_fw * ref_rv) / (ref_fw * alt_rv)
    )
    ref_ratio = hl.min(ref_rv, ref_fw) / hl.max(ref_rv, ref_fw)
    alt_ratio = hl.min(alt_fw, alt_rv) / hl.max(alt_fw, alt_rv)
    sor = hl.log(symmetrical_ratio) + hl.log(ref_ratio) - hl.log(alt_ratio)

    return sor
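# A minimal usage sketch of the function above (assumptions: `hail` is imported as `hl`
# and the strand counts are made-up example values). Both accepted shapes of `SB`
# produce the same SOR value.
import hail as hl

flat_sb = hl.literal([10, 12, 3, 4])        # [ref fwd, ref rev, alt fwd, alt rev]
nested_sb = hl.literal([[10, 12], [3, 4]])  # [[ref fwd, ref rev], [alt fwd, alt rev]]

print(hl.eval(sor_from_sb(flat_sb)))
print(hl.eval(sor_from_sb(nested_sb)))  # flattened internally, same result as above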
def filter(self, mt):
    mt = mt.annotate_rows(variant_qc=variant_qc_aggregator(mt))
    row_filter = mt[self._row_filter].filters if self._row_filter else mt.exclude_row
    col_filter = mt[self._col_filter].filters if self._col_filter else mt.exclude_col
    pre_filter = row_filter | col_filter
    mt = mt.annotate_cols(**{
        'fstat': hl.struct(
            # Note: the equality must be parenthesized; `&` binds more tightly than `==`,
            # so `pre_filter == False & ...` would not apply the autosome restriction.
            filters=hl.agg.filter(
                (pre_filter == False) & mt.locus.in_autosome(),
                (hl.agg.inbreeding(mt.GT, hl.min(mt.variant_qc.AF)).f_stat < -self._fhet_th) |
                (hl.agg.inbreeding(mt.GT, hl.min(mt.variant_qc.AF)).f_stat > self._fhet_th)))})
    return mt
def _get_alt_count(locus, gt, is_female):
    """Calculate alt allele count with sex info if present."""
    if is_female is None:
        return hl.or_missing(locus.in_autosome(), gt.n_alt_alleles())
    return (hl.case()
            .when(locus.in_autosome_or_par(), gt.n_alt_alleles())
            .when(~is_female & (locus.in_x_nonpar() | locus.in_y_nonpar()),
                  hl.min(1, gt.n_alt_alleles()))
            .when(is_female & locus.in_y_nonpar(), 0)
            .default(0))
def filter(self, mt):
    col_filter = mt[self._col_filter].filters if self._col_filter else mt.exclude_col
    pre_filter = col_filter
    mt = mt.annotate_rows(**{
        'monomorphic_var': hl.struct(
            filters=hl.agg.filter(pre_filter == False,
                                  hl.min(variant_qc_aggregator(mt).AC)) == 0)})
    return mt
def filter(self, mt):
    col_filter = mt[self._col_filter].filters if self._col_filter else mt.exclude_col
    pre_filter = col_filter
    mt = mt.annotate_rows(**{
        'maf': hl.struct(
            filters=hl.agg.filter(pre_filter == False,
                                  hl.min(variant_qc_aggregator(mt).AF)) < self._maf_thresh)})
    return mt
def with_local_a_index(local_a_index): new_pl = hl.or_missing( hl.is_defined(old_entry.LPL), hl.or_missing( hl.is_defined(local_a_index), hl.range(0, 3).map(lambda i: hl.min( hl.range(0, hl.triangle(hl.len(old_entry.LA))). filter(lambda j: hl.downcode( hl.unphased_diploid_gt_index_call(j), local_a_index ) == hl.unphased_diploid_gt_index_call(i)).map( lambda idx: old_entry.LPL[idx]))))) fields = set(old_entry.keys()) def with_pl(pl): new_exprs = {} dropped_fields = ['LA'] if 'LGT' in fields: new_exprs['GT'] = hl.downcode( old_entry.LGT, hl.or_else(local_a_index, hl.len(old_entry.LA))) dropped_fields.append('LGT') if 'LPGT' in fields: new_exprs['PGT'] = hl.downcode( old_entry.LPGT, hl.or_else(local_a_index, hl.len(old_entry.LA))) dropped_fields.append('LPGT') if 'LAD' in fields: new_exprs['AD'] = hl.or_missing( hl.is_defined(old_entry.LAD), [ old_entry.LAD[0], hl.or_else(old_entry.LAD[local_a_index], 0) ]) # second entry zeroed for lack of non-ref AD dropped_fields.append('LAD') if 'LPL' in fields: new_exprs['PL'] = pl if 'GQ' in fields: new_exprs['GQ'] = hl.or_else(hl.gq_from_pl(pl), old_entry.GQ) dropped_fields.append('LPL') return hl.cond( hl.len(ds.alleles) == 1, old_entry.annotate( **{ f[1:]: old_entry[f] for f in ['LGT', 'LPGT', 'LAD', 'LPL'] if f in fields }).drop(*dropped_fields), old_entry.annotate(**new_exprs).drop(*dropped_fields)) if 'LPL' in fields: return hl.bind(with_pl, new_pl) else: return with_pl(None)
def compute_last_ref_block_end(mt: hl.MatrixTable) -> hl.Table:
    """
    This function takes a sparse MT and computes for each row the genomic position of the
    most upstream reference block overlapping that row.

    Note that since reference blocks do not extend beyond contig boundaries, only the
    position is kept.

    This function returns a Table with that annotation (`last_END_position`).

    :param mt: Input MatrixTable
    :return: Output Table with `last_END_position` annotation
    """
    mt = mt.select_entries("END")

    # Localize entries, so that they can be viewed as an array and scanned over using hl.scan.array_agg
    ht = mt._localize_entries("__entries", "__cols")

    # Compute the position by using hl.scan._prev_nonnull.
    # This was inspired by hl.experimental.densify
    # _prev_nonnull is an aggregator that keeps the previous record in memory
    # and updates it with the given value at the row if it's not null (missing)
    # The following code computes the following annotation for each row:
    # 1. Keep a scan of the entries using _prev_nonnull, keeping the start (ht.locus) and end (entry.END) of each ref block (1.1)
    # 2. For the current row locus, record the start of the block that starts the furthest away,
    #    that is the minimum position in the current scan for any block that overlaps the current locus (2.1)
    ht = ht.select(
        last_END_position=hl.or_else(
            hl.min(  # 2. For the current row locus, record the start of the block that starts the furthest away
                hl.scan.array_agg(
                    lambda entry: hl.scan._prev_nonnull(  # 1. Keep a scan of the entries using _prev_nonnull
                        hl.or_missing(
                            hl.is_defined(entry.END),  # Update the scan whenever a new ref block is encountered
                            hl.tuple([  # 1.1 keep the start (ht.locus) and end (entry.END) of each ref block
                                ht.locus,
                                entry.END,
                            ]),
                        )
                    ),
                    ht.__entries,
                ).map(
                    lambda x: hl.or_missing(  # 2.1 get the start position of blocks that overlap the current locus
                        (x[1] >= ht.locus.position) & (x[0].contig == ht.locus.contig),
                        x[0].position,
                    )
                )
            ),
            ht.locus.position,
        )
    )
    return ht.select_globals()
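# A minimal usage sketch (assumptions: `sparse_mt` is a sparse, END-annotated MatrixTable,
# e.g. a gnomAD-style sparse callset; the name is a placeholder). The annotation can be
# joined back onto the MT to know how far upstream overlapping reference blocks may start.
last_end_ht = compute_last_ref_block_end(sparse_mt)
sparse_mt = sparse_mt.annotate_rows(
    last_END_position=last_end_ht[sparse_mt.row_key].last_END_position
)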
def filter_maf(mt: hl.MatrixTable,
               maf: float) -> Tuple[hl.MatrixTable, Dict[str, int]]:
    # step 9
    mt = compute_qc_metrics(mt)
    mt = mt.annotate_rows(maf=hl.min(mt.variant_qc.AF))
    maf_removed = mt.filter_rows(mt.maf < maf).rsid.collect()
    if len(maf_removed) > 0:
        mt = mt.filter_rows(hl.literal(maf_removed).contains(mt['rsid']), keep=False)
    results = {'maf_removed': len(maf_removed)}
    return mt, results
def filter_invariant_snps(
        mt: hl.MatrixTable) -> Tuple[hl.MatrixTable, Dict[str, int]]:
    # step 8
    mt = compute_qc_metrics(mt)
    mt = mt.annotate_rows(MAC=hl.min(mt.variant_qc.AC))
    monomorphic_snps = mt.filter_rows(mt.MAC == 0).rsid.collect()
    if len(monomorphic_snps) > 0:
        mt = mt.filter_rows(hl.literal(monomorphic_snps).contains(mt['rsid']), keep=False)
    results = {'monomorphic_snps': len(monomorphic_snps)}
    return mt, results
def gnomad_coverage_stats_optimized():
    mt = hl.read_matrix_table(resource('gnomad_dp_simulation.mt'))
    mt = mt.annotate_rows(mean=hl.agg.mean(mt.x),
                          count_array=hl.rbind(hl.agg.counter(hl.min(100, mt.x)),
                                               lambda c: hl.range(0, 100).map(lambda i: c.get(i, 0))))
    mt = mt.annotate_rows(median=hl.rbind(hl.sum(mt.count_array) / 2,
                                          lambda s: hl.find(lambda x: x > s,
                                                            hl.array_scan(lambda i, j: i + j, 0, mt.count_array))),
                          **{f'above_{x}': hl.sum(mt.count_array[x:])
                             for x in [1, 5, 10, 15, 20, 25, 30, 50, 100]})
    mt.rows()._force_count()
def format_regional_missense_constraint(ds):
    ds = ds.annotate(obs_mis=hl.int(ds.obs_mis))
    ds = ds.annotate(start=hl.min(ds.genomic_start, ds.genomic_end),
                     stop=hl.max(ds.genomic_start, ds.genomic_end))
    ds = ds.drop("amino_acids", "chr", "gene", "genomic_start", "genomic_end", "region_name")
    ds = ds.transmute(transcript_id=ds.transcript.split("\\.")[0])
    ds = ds.group_by("transcript_id").aggregate(regions=hl.agg.collect(ds.row_value))
    ds = ds.annotate(regions=hl.sorted(ds.regions, lambda region: region.start))
    return ds
def get_gt_counts(freq: str):
    return hl.array([
        hl.min(vp_freq_expr.v1[freq].AN, vp_freq_expr.v2[freq].AN),  # AABB
        vp_freq_expr.v2[freq].AC - (2 * vp_freq_expr.v2[freq].homozygote_count),  # AABb
        vp_freq_expr.v2[freq].homozygote_count,  # AAbb
        vp_freq_expr.v1[freq].AC - (2 * vp_freq_expr.v1[freq].homozygote_count),  # AaBB
        0,  # AaBb
        0,  # Aabb
        vp_freq_expr.v1[freq].homozygote_count,  # aaBB
        0,  # aaBb
        0   # aabb
    ])
def filter_snps(mt, maf):
    mt = hl.variant_qc(mt)
    mt = mt.annotate_rows(maf=hl.min(mt.variant_qc.AF))
    # filter_rows returns a new MatrixTable; the result must be reassigned
    # for the MAF filter to take effect
    mt = mt.filter_rows(mt.maf > maf)

    # MHC chr6:25-35Mb
    # chr8.inversion chr8:7-13Mb
    intervals = ['chr6:25M-35M', 'chr8:7M-13M']
    mt = hl.filter_intervals(mt, [
        hl.parse_locus_interval(x, reference_genome='GRCh38') for x in intervals
    ], keep=False)

    return mt
def diagonal(nd):
    """Gets the diagonal of a 2 dimensional NDArray.

    Examples
    --------

    >>> hl.eval(hl.nd.diagonal(hl.nd.array([[1, 2], [3, 4]])))
    array([1, 4], dtype=int32)

    :param nd: A 2 dimensional NDArray, shape(M, N).
    :return: A 1 dimension NDArray of length min(M, N), containing the diagonal of `nd`.
    """
    assert nd.ndim == 2, "diagonal requires 2 dimensional ndarray"
    shape_min = hl.min(nd.shape[0], nd.shape[1])
    return hl.nd.array(hl.range(hl.int32(shape_min)).map(lambda i: nd[i, i]))
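# A small additional check of the function above (assumption: the values are made up).
# For a non-square input the diagonal has length min(M, N), here min(2, 3) == 2.
import hail as hl

print(hl.eval(diagonal(hl.nd.array([[1, 2, 3], [4, 5, 6]]))))  # array([1, 5], dtype=int32)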
def pca_filter_mt(in_mt: hl.MatrixTable,
                  maf: float = 0.05,
                  hwe: float = 1e-3,
                  call_rate: float = 0.98,
                  ld_cor: float = 0.2,
                  ld_window: int = 250000):

    print("\nInitial number of SNPs before filtering: {}".format(in_mt.count_rows()))
    mt = hl.variant_qc(in_mt)

    print(f'\nFiltering out variants with MAF < {maf}')
    mt_filt = mt.annotate_rows(maf=hl.min(mt.variant_qc.AF))
    mt_filt = mt_filt.filter_rows(mt_filt.maf > maf)

    print(f'\nFiltering out variants with HWE < {hwe:1e}')
    mt_filt = mt_filt.filter_rows(mt_filt.variant_qc.p_value_hwe > hwe)

    print(f'\nFiltering out variants with Call Rate < {call_rate}')
    mt_filt = mt_filt.filter_rows(mt_filt.variant_qc.call_rate >= call_rate)

    # no strand ambiguity
    print('\nFiltering out strand ambiguous variants')
    mt_filt = mt_filt.filter_rows(
        ~hl.is_strand_ambiguous(mt_filt.alleles[0], mt_filt.alleles[1]))

    # MHC chr6:25-35Mb
    # chr8.inversion chr8:7-13Mb
    print('\nFiltering out variants in MHC [chr6:25M-35M] and chromosome 8 inversions [chr8:7M-13M]')
    intervals = ['chr6:25M-35M', 'chr8:7M-13M']
    mt_filt = hl.filter_intervals(mt_filt, [
        hl.parse_locus_interval(x, reference_genome='GRCh38') for x in intervals
    ], keep=False)

    # This step is expensive (on local machine)
    print(f'\nLD pruning using correlation threshold of {ld_cor} and window size of {ld_window}')
    mt_ld_prune = hl.ld_prune(mt_filt.GT, r2=ld_cor, bp_window_size=ld_window)
    mt_ld_pruned = mt_filt.filter_rows(hl.is_defined(mt_ld_prune[mt_filt.row_key]))

    print("\nNumber of SNPs after filtering: {}".format(mt_ld_pruned.count_rows()))

    return mt_ld_pruned
def _genotype_fields(self):
    # Convert the mt genotype entries into num_alt, gq, ab, dp, and sample_id.
    is_called = hl.is_defined(self.mt.GT)
    return {
        'num_alt': hl.cond(is_called, self.mt.GT.n_alt_alleles(), -1),
        'gq': hl.cond(is_called, self.mt.GQ, hl.null(hl.tint)),
        'ab': hl.bind(
            lambda total: hl.cond((is_called) & (total != 0) & (hl.len(self.mt.AD) > 1),
                                  hl.float(self.mt.AD[1] / total),
                                  hl.null(hl.tfloat)),
            hl.sum(self.mt.AD)),
        'dp': hl.cond(is_called, hl.int(hl.min(self.mt.DP, 32000)), hl.null(hl.tfloat)),
        'sample_id': self.mt.s
    }
def run_logistic_bool(mt, variable):
    ht = hl.logistic_regression_rows(test='firth',
                                     y=mt[variable],
                                     x=mt.GT.n_alt_alleles(),
                                     covariates=[
                                         1, mt.imputesex.impute_sex.is_female,
                                         mt.pca.PC1, mt.pca.PC2, mt.pca.PC3,
                                         mt.pca.PC4, mt.pca.PC5, mt.pca.PC6,
                                         mt.pca.PC7, mt.pca.PC8, mt.pca.PC9,
                                         mt.pca.PC10
                                     ])
    mt = mt.filter_cols(hl.is_defined(mt[variable]))
    mt = mt.annotate_rows(MAC=hl.min(
        hl.agg.sum(mt.GT.n_alt_alleles()),
        hl.agg.sum(hl.int64(mt.GT.is_het_ref()) + 2 * hl.int64(mt.GT.is_hom_ref()))))
    ht = ht.annotate(MAC=mt.rows()[ht.key].MAC)
    return ht
def add_stats(
    i: hl.expr.StructExpression, j: hl.expr.StructExpression
) -> hl.expr.StructExpression:
    """
    This merges two stats counters together. It assumes that all stats counter fields are present in the struct.

    :param i: accumulator: struct with mean, n and variance
    :param j: new element: stats_struct -- needs to contain mean, n and variance
    :return: Accumulation over all elements: struct with mean, n and variance
    """
    delta = j.mean - i.mean
    n_tot = i.n + j.n
    return hl.struct(
        min=hl.min(i.min, j.min),
        max=hl.max(i.max, j.max),
        mean=(i.mean * i.n + j.mean * j.n) / n_tot,
        variance=i.variance + j.variance + (delta * delta * i.n * j.n) / n_tot,
        n=n_tot,
        sum=i.sum + j.sum,
    )
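# A minimal sketch of merging two stats structs with the combiner above (assumptions:
# the field values are made up, and `variance` here holds the sum of squared deviations,
# which is what the pooled-update formula above combines).
import hail as hl

s1 = hl.struct(min=0.0, max=4.0, mean=2.0, variance=10.0, n=10, sum=20.0)
s2 = hl.struct(min=1.0, max=6.0, mean=3.0, variance=8.0, n=5, sum=15.0)
print(hl.eval(add_stats(s1, s2)))
# For an array of such structs, the same combiner can be applied with hl.fold, e.g.
# hl.fold(add_stats, stats_array[0], stats_array[1:]).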
def with_local_a_index(local_a_index): new_pl = hl.or_missing( hl.is_defined(old_entry.LPL), hl.or_missing( hl.is_defined(local_a_index), hl.range(0, 3).map(lambda i: hl.min( hl.range(0, hl.triangle(hl.len(old_entry.LA))) .filter(lambda j: hl.downcode(hl.unphased_diploid_gt_index_call(j), local_a_index) == hl.unphased_diploid_gt_index_call(i)) .map(lambda idx: old_entry.LPL[idx]))))) fields = set(old_entry.keys()) def with_pl(pl): new_exprs = {} dropped_fields = ['LA'] if 'LGT' in fields: new_exprs['GT'] = hl.downcode(old_entry.LGT, hl.or_else(local_a_index, hl.len(old_entry.LA))) dropped_fields.append('LGT') if 'LPGT' in fields: new_exprs['PGT'] = hl.downcode(old_entry.LPGT, hl.or_else(local_a_index, hl.len(old_entry.LA))) dropped_fields.append('LPGT') if 'LAD' in fields: new_exprs['AD'] = hl.or_missing( hl.is_defined(old_entry.LAD), [old_entry.LAD[0], hl.or_else(old_entry.LAD[local_a_index], 0)]) # second entry zeroed for lack of non-ref AD dropped_fields.append('LAD') if 'LPL' in fields: new_exprs['PL'] = pl if 'GQ' in fields: new_exprs['GQ'] = hl.or_else(hl.gq_from_pl(pl), old_entry.GQ) dropped_fields.append('LPL') return hl.cond(hl.len(ds.alleles) == 1, old_entry.annotate(**{f[1:]: old_entry[f] for f in ['LGT', 'LPGT', 'LAD', 'LPL'] if f in fields}).drop(*dropped_fields), old_entry.annotate(**new_exprs).drop(*dropped_fields)) if 'LPL' in fields: return hl.bind(with_pl, new_pl) else: return with_pl(None)
def fet_expr(het_count_exp: hl.expr.Int64Expression,
             hom_count_expr: hl.expr.Int64Expression):
    return hl.bind(
        lambda x: hl.struct(
            counts=x,
            dominant=hl.fisher_exact_test(x[0][0], x[0][1] + x[0][2],
                                          x[1][0], x[1][1] + x[1][2]),
            recessive=hl.fisher_exact_test(x[0][0] + x[0][1], x[0][2],
                                           x[1][0] + x[1][1], x[1][2])),
        hl.bind(
            lambda x: [
                [
                    hl.int32(hl.cond(x.contains(False), x[False].get(0, 0), 0)),
                    hl.int32(hl.cond(x.contains(False), x[False].get(1, 0), 0)),
                    hl.int32(hl.cond(x.contains(False), x[False].get(2, 0), 0))
                ],
                [
                    hl.int32(hl.cond(x.contains(True), x[True].get(0, 0), 0)),
                    hl.int32(hl.cond(x.contains(True), x[True].get(1, 0), 0)),
                    hl.int32(hl.cond(x.contains(True), x[True].get(2, 0), 0))
                ],
            ],
            hl.agg.group_by(
                mt.is_case,
                hl.agg.counter(hl.min(2, het_count_exp + 2 * hom_count_expr)))))
def prepare_exac_regional_missense_constraint(path):
    ds = hl.import_table(
        path,
        missing="",
        types={
            "transcript": hl.tstr,
            "gene": hl.tstr,
            "chr": hl.tstr,
            "amino_acids": hl.tstr,
            "genomic_start": hl.tint,
            "genomic_end": hl.tint,
            "obs_mis": hl.tfloat,
            "exp_mis": hl.tfloat,
            "obs_exp": hl.tfloat,
            "chisq_diff_null": hl.tfloat,
            "region_name": hl.tstr,
        },
    )

    ds = ds.annotate(obs_mis=hl.int(ds.obs_mis))
    ds = ds.annotate(start=hl.min(ds.genomic_start, ds.genomic_end),
                     stop=hl.max(ds.genomic_start, ds.genomic_end))
    ds = ds.drop("amino_acids", "chr", "gene", "genomic_start", "genomic_end", "region_name")

    ds = ds.transmute(transcript_id=ds.transcript.split("\\.")[0])

    ds = ds.group_by("transcript_id").aggregate(regions=hl.agg.collect(ds.row_value))
    ds = ds.annotate(regions=hl.sorted(ds.regions, lambda region: region.start))
    ds = ds.select(exac_regional_missense_constraint_regions=ds.regions)

    return ds
def compute_coverage_stats(
    mt: hl.MatrixTable,
    reference_ht: hl.Table,
    coverage_over_x_bins: List[int] = [1, 5, 10, 15, 20, 25, 30, 50, 100],
) -> hl.Table:
    """
    Computes the following coverage statistics for every base of the `reference_ht` provided:
        - mean
        - median
        - total DP
        - fraction of samples with coverage above X, for each x in `coverage_over_x_bins`

    The `reference_ht` is a table that contains a row for each locus coverage should be computed on.
    It needs to be keyed with the same keys as `mt`, typically either `locus` or `locus, alleles`.
    The `reference_ht` can e.g. be created using `get_reference_ht`

    :param mt: Input sparse MT
    :param reference_ht: Input reference HT
    :param coverage_over_x_bins: List of boundaries for computing samples over X
    :return: Table with per-base coverage stats
    """
    n_samples = mt.count_cols()
    print(f"Computing coverage stats on {n_samples} samples.")

    # Create an outer join with the reference Table
    mt = mt.select_entries("END", "DP").select_cols().select_rows()
    col_key_fields = list(mt.col_key)
    t = mt._localize_entries("__entries", "__cols")
    t = t.join(reference_ht.key_by(*mt.row_key).select(_in_ref=True), how="outer")
    t = t.annotate(
        __entries=hl.or_else(
            t.__entries,
            hl.range(n_samples).map(lambda x: hl.null(t.__entries.dtype.element_type)),
        )
    )
    mt = t._unlocalize_entries("__entries", "__cols", col_key_fields)

    # Densify
    mt = hl.experimental.densify(mt)

    # Filter rows where the reference is missing
    mt = mt.filter_rows(mt._in_ref)

    # Unfilter entries so that entries with no ref block overlap aren't null
    mt = mt.unfilter_entries()

    # Compute coverage stats
    coverage_over_x_bins = sorted(coverage_over_x_bins)
    max_coverage_bin = coverage_over_x_bins[-1]
    hl_coverage_over_x_bins = hl.array(coverage_over_x_bins)

    # This expression creates a counter DP -> number of samples for DP between 0 and max_coverage_bin
    coverage_counter_expr = hl.agg.counter(
        hl.min(max_coverage_bin, hl.or_else(mt.DP, 0))
    )

    # This expression aggregates the DP counter in reverse order of the coverage_over_x_bins
    # and computes the cumulative sum over them.
    # It needs to be in reverse order because we want the sum over samples covered by > X.
    count_array_expr = hl.cumulative_sum(
        hl.array(
            # The coverage was already floored to the max_coverage_bin, so no more aggregation is needed for the max bin
            [hl.int32(coverage_counter_expr.get(max_coverage_bin, 0))]
        ).extend(
            # For each of the other bins, coverage needs to be summed between the boundaries
            hl.range(hl.len(hl_coverage_over_x_bins) - 1, 0, step=-1).map(
                lambda i: hl.sum(
                    hl.range(
                        hl_coverage_over_x_bins[i - 1], hl_coverage_over_x_bins[i]
                    ).map(lambda j: hl.int32(coverage_counter_expr.get(j, 0)))
                )
            )
        )
    )
    mean_expr = hl.agg.mean(hl.or_else(mt.DP, 0))

    # Annotate rows now
    return mt.select_rows(
        mean=hl.cond(hl.is_nan(mean_expr), 0, mean_expr),
        median_approx=hl.or_else(hl.agg.approx_median(hl.or_else(mt.DP, 0)), 0),
        total_DP=hl.agg.sum(mt.DP),
        **{
            f"over_{x}": count_array_expr[i] / n_samples
            for i, x in zip(
                # Reverse the bin index as count_array_expr has the reverse order
                range(len(coverage_over_x_bins) - 1, -1, -1),
                coverage_over_x_bins,
            )
        },
    ).rows()
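# A minimal usage sketch of the function above (assumptions: `sparse_mt` is a sparse MT
# with `END`/`DP` entry fields, `ref_ht` is a locus-keyed reference table, and the output
# path is a placeholder).
coverage_ht = compute_coverage_stats(sparse_mt, ref_ht)
coverage_ht = coverage_ht.checkpoint("coverage.ht", overwrite=True)
coverage_ht.describe()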
def relatedness_check(in_mt: hl.MatrixTable = None, method: str = 'pc_relate', outdir: str = None, kin_estimate: float = 0.98): global mt, samples_to_remove in_mt = hl.variant_qc(in_mt) in_mt = hl.sample_qc(in_mt) # _localize=False means don't put this in Python, keep it as a Hail expr call_rate_dict = in_mt.aggregate_cols(hl.dict( hl.agg.collect((in_mt.s, in_mt.sample_qc.call_rate))), _localize=False) if method == 'pc_relate': print("\nUsing PC-Relate for relatedness checks") relatedness_ht = hl.pc_relate(in_mt.GT, 0.01, k=10, min_kinship=0.1, statistics='kin') samples_to_remove_ht = relatedness_ht.filter( relatedness_ht.kin > kin_estimate) # get call rates for both samples so we remove the one with lower call rate between the two samples_to_remove = samples_to_remove_ht.annotate( cr_s1=call_rate_dict[samples_to_remove_ht.i.s], cr_s2=call_rate_dict[samples_to_remove_ht.j.s]) samples_list = samples_to_remove.annotate(sample_to_remove=hl.cond( samples_to_remove.cr_s1 <= samples_to_remove.cr_s2, samples_to_remove.i, samples_to_remove.j)) elif method == 'ibd': print("\nUsing PLINK-style identity by descent for relatedness checks") in_mt = in_mt.annotate_rows(maf=hl.min(in_mt.variant_qc.AF)) relatedness_ht = hl.identity_by_descent( in_mt, maf=in_mt['maf'] ) # this returns a Hail Table with the sample pairs samples_to_remove_ht = relatedness_ht.filter( relatedness_ht.ibd.PI_HAT > kin_estimate) # get call rates for both samples so we remove the one with lower call rate between the two samples_to_remove = samples_to_remove_ht.annotate( cr_s1=call_rate_dict[samples_to_remove_ht.i], cr_s2=call_rate_dict[samples_to_remove_ht.j]) samples_list = samples_to_remove.annotate(sample_to_remove=hl.cond( samples_to_remove.cr_s1 <= samples_to_remove.cr_s2, samples_to_remove.i, samples_to_remove.j)) else: print("\nUsing KING for relatedness checks") if kin_estimate > 0.5: raise Exception( "\nThe maximum kinship coefficient is for KING 0.5") relatedness_mt = hl.king(in_mt.GT) filtered_relatedness_mt = relatedness_mt.filter_entries( (relatedness_mt.s_1 != relatedness_mt.s) & (relatedness_mt.phi >= kin_estimate), keep=True) samples_to_remove_ht = filtered_relatedness_mt.entries() samples_to_remove = samples_to_remove_ht.annotate( cr_s1=call_rate_dict[samples_to_remove_ht.s_1], cr_s2=call_rate_dict[samples_to_remove_ht.s]) samples_list = samples_to_remove.annotate(sample_to_remove=hl.cond( samples_to_remove.cr_s1 <= samples_to_remove.cr_s2, samples_to_remove.s_1, samples_to_remove.s)) samples = samples_list.sample_to_remove.collect() if len(samples) > 0: in_mt = in_mt.filter_cols(hl.literal(samples).contains(in_mt['s']), keep=False) print("\nNumber of samples that fail relatedness checks: {}".format( len(samples))) with open(outdir + 'relatedness_removed_samples.tsv', 'w') as f: for sample in samples: f.write(sample + "\n") else: print("\nNo samples failed the relatedness check") return in_mt
def test_annotate(self): schema = hl.tstruct(a=hl.tint32, b=hl.tint32, c=hl.tint32, d=hl.tint32, e=hl.tstr, f=hl.tarray(hl.tint32)) rows = [{'a': 4, 'b': 1, 'c': 3, 'd': 5, 'e': "hello", 'f': [1, 2, 3]}, {'a': 0, 'b': 5, 'c': 13, 'd': -1, 'e': "cat", 'f': []}, {'a': 4, 'b': 2, 'c': 20, 'd': 3, 'e': "dog", 'f': [5, 6, 7]}] kt = hl.Table.parallelize(rows, schema) self.assertTrue(kt.annotate()._same(kt)) result1 = convert_struct_to_dict(kt.annotate(foo=kt.a + 1, foo2=kt.a).take(1)[0]) self.assertDictEqual(result1, {'a': 4, 'b': 1, 'c': 3, 'd': 5, 'e': "hello", 'f': [1, 2, 3], 'foo': 5, 'foo2': 4}) result3 = convert_struct_to_dict(kt.annotate( x1=kt.f.map(lambda x: x * 2), x2=kt.f.map(lambda x: [x, x + 1]).flatmap(lambda x: x), x3=hl.min(kt.f), x4=hl.max(kt.f), x5=hl.sum(kt.f), x6=hl.product(kt.f), x7=kt.f.length(), x8=kt.f.filter(lambda x: x == 3), x9=kt.f[1:], x10=kt.f[:], x11=kt.f[1:2], x12=kt.f.map(lambda x: [x, x + 1]), x13=kt.f.map(lambda x: [[x, x + 1], [x + 2]]).flatmap(lambda x: x), x14=hl.cond(kt.a < kt.b, kt.c, hl.null(hl.tint32)), x15={1, 2, 3} ).take(1)[0]) self.assertDictEqual(result3, {'a': 4, 'b': 1, 'c': 3, 'd': 5, 'e': "hello", 'f': [1, 2, 3], 'x1': [2, 4, 6], 'x2': [1, 2, 2, 3, 3, 4], 'x3': 1, 'x4': 3, 'x5': 6, 'x6': 6, 'x7': 3, 'x8': [3], 'x9': [2, 3], 'x10': [1, 2, 3], 'x11': [2], 'x12': [[1, 2], [2, 3], [3, 4]], 'x13': [[1, 2], [3], [2, 3], [4], [3, 4], [5]], 'x14': None, 'x15': set([1, 2, 3])}) kt.annotate( x1=kt.a + 5, x2=5 + kt.a, x3=kt.a + kt.b, x4=kt.a - 5, x5=5 - kt.a, x6=kt.a - kt.b, x7=kt.a * 5, x8=5 * kt.a, x9=kt.a * kt.b, x10=kt.a / 5, x11=5 / kt.a, x12=kt.a / kt.b, x13=-kt.a, x14=+kt.a, x15=kt.a == kt.b, x16=kt.a == 5, x17=5 == kt.a, x18=kt.a != kt.b, x19=kt.a != 5, x20=5 != kt.a, x21=kt.a > kt.b, x22=kt.a > 5, x23=5 > kt.a, x24=kt.a >= kt.b, x25=kt.a >= 5, x26=5 >= kt.a, x27=kt.a < kt.b, x28=kt.a < 5, x29=5 < kt.a, x30=kt.a <= kt.b, x31=kt.a <= 5, x32=5 <= kt.a, x33=(kt.a == 0) & (kt.b == 5), x34=(kt.a == 0) | (kt.b == 5), x35=False, x36=True )
def ht_to_vcf_mt(
    info_ht: hl.Table,
    pipe_delimited_annotations: List[str] = INFO_VCF_AS_PIPE_DELIMITED_FIELDS,
) -> hl.MatrixTable:
    """
    Creates a MT ready for vcf export from a HT. In particular, the following conversions are done:
        - All int64 are coerced to int32
        - Fields specified by `pipe_delimited_annotations` will be converted from arrays to pipe-delimited strings

    .. note::

        The MT returned has no cols.

    :param info_ht: Input HT
    :param pipe_delimited_annotations: List of info fields (they must be fields of the ht.info Struct)
    :return: MatrixTable ready for VCF export
    """

    def get_pipe_expr(array_expr: hl.expr.ArrayExpression) -> hl.expr.StringExpression:
        return hl.delimit(array_expr.map(lambda x: hl.or_else(hl.str(x), "")), "|")

    # Make sure the HT is keyed by locus, alleles
    info_ht = info_ht.key_by("locus", "alleles")

    # Convert int64 fields to int32 (int64 isn't supported by VCF)
    for f, ft in info_ht.info.dtype.items():
        if ft == hl.dtype("int64"):
            logger.warning(
                f"Coercing field info.{f} from int64 to int32 for VCF output. Value will be capped at int32 max value."
            )
            info_ht = info_ht.annotate(info=info_ht.info.annotate(
                **{f: hl.int32(hl.min(2**31 - 1, info_ht.info[f]))}))
        elif ft == hl.dtype("array<int64>"):
            logger.warning(
                f"Coercing field info.{f} from array<int64> to array<int32> for VCF output. Array values will be capped at int32 max value."
            )
            info_ht = info_ht.annotate(info=info_ht.info.annotate(
                **{f: info_ht.info[f].map(lambda x: hl.int32(hl.min(2**31 - 1, x)))}))

    info_expr = {}

    # Make sure to pipe-delimit fields that need to.
    # Note: the expr needs to be prefixed by "|" because GATK expects one value for the ref (always empty)
    # Note2: this doesn't produce the correct annotation for AS_SB_TABLE, but it is overwritten below
    for f in pipe_delimited_annotations:
        if f in info_ht.info:
            info_expr[f] = "|" + get_pipe_expr(info_ht.info[f])

    # Flatten SB if it is an array of arrays
    if "SB" in info_ht.info and not isinstance(info_ht.info.SB, hl.expr.ArrayNumericExpression):
        info_expr["SB"] = info_ht.info.SB[0].extend(info_ht.info.SB[1])

    if "AS_SB_TABLE" in info_ht.info:
        info_expr["AS_SB_TABLE"] = get_pipe_expr(
            info_ht.info.AS_SB_TABLE.map(lambda x: hl.delimit(x, ",")))

    # Annotate with new expression and add 's' empty string field required to cast HT to MT
    info_ht = info_ht.annotate(info=info_ht.info.annotate(**info_expr),
                               s=hl.null(hl.tstr))

    # Create an MT with no cols so that we can export to VCF
    info_mt = info_ht.to_matrix_table_row_major(columns=["s"], entry_field_name="s")
    return info_mt.filter_cols(False)
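# A minimal usage sketch (assumptions: `info_ht` is an info HT keyed by locus/alleles
# with an `info` struct field; the output path is a placeholder).
info_mt = ht_to_vcf_mt(info_ht)
hl.export_vcf(info_mt, "info.vcf.bgz")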
import hail as hl

from gnomad_qc.v3.resources import get_full_mt

last_END_position_path = 'gs://gnomad/annotations/hail-0.2/ht/genomes_v3/gnomad_genomes_v3_last_END_positions.ht'

# END RESOURCES

mt = get_full_mt(False)
mt = mt.select_entries('END')
t = mt._localize_entries('__entries', '__cols')
t = t.select(
    last_END_position=hl.or_else(
        hl.min(
            hl.scan.array_agg(
                lambda entry: hl.scan._prev_nonnull(
                    hl.or_missing(hl.is_defined(entry.END),
                                  hl.tuple([t.locus, entry.END]))),
                t.__entries).map(
                    lambda x: hl.or_missing(
                        (x[1] >= t.locus.position) & (x[0].contig == t.locus.contig),
                        x[0].position))),
        t.locus.position))
t.write(last_END_position_path, overwrite=True)
def ld_score_regression(weight_expr, ld_score_expr, chi_sq_exprs, n_samples_exprs, n_blocks=200, two_step_threshold=30, n_reference_panel_variants=None) -> Table: r"""Estimate SNP-heritability and level of confounding biases from GWAS summary statistics. Given a set or multiple sets of genome-wide association study (GWAS) summary statistics, :func:`.ld_score_regression` estimates the heritability of a trait or set of traits and the level of confounding biases present in the underlying studies by regressing chi-squared statistics on LD scores, leveraging the model: .. math:: \mathrm{E}[\chi_j^2] = 1 + Na + \frac{Nh_g^2}{M}l_j * :math:`\mathrm{E}[\chi_j^2]` is the expected chi-squared statistic for variant :math:`j` resulting from a test of association between variant :math:`j` and a trait. * :math:`l_j = \sum_{k} r_{jk}^2` is the LD score of variant :math:`j`, calculated as the sum of squared correlation coefficients between variant :math:`j` and nearby variants. See :func:`ld_score` for further details. * :math:`a` captures the contribution of confounding biases, such as cryptic relatedness and uncontrolled population structure, to the association test statistic. * :math:`h_g^2` is the SNP-heritability, or the proportion of variation in the trait explained by the effects of variants included in the regression model above. * :math:`M` is the number of variants used to estimate :math:`h_g^2`. * :math:`N` is the number of samples in the underlying association study. For more details on the method implemented in this function, see: * `LD Score regression distinguishes confounding from polygenicity in genome-wide association studies (Bulik-Sullivan et al, 2015) <https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4495769/>`__ Examples -------- Run the method on a matrix table of summary statistics, where the rows are variants and the columns are different phenotypes: >>> mt_gwas = hl.read_matrix_table('data/ld_score_regression.sumstats.mt') >>> ht_results = hl.experimental.ld_score_regression( ... weight_expr=mt_gwas['ld_score'], ... ld_score_expr=mt_gwas['ld_score'], ... chi_sq_exprs=mt_gwas['chi_squared'], ... n_samples_exprs=mt_gwas['n']) Run the method on a table with summary statistics for a single phenotype: >>> ht_gwas = hl.read_table('data/ld_score_regression.sumstats.ht') >>> ht_results = hl.experimental.ld_score_regression( ... weight_expr=ht_gwas['ld_score'], ... ld_score_expr=ht_gwas['ld_score'], ... chi_sq_exprs=ht_gwas['chi_squared_50_irnt'], ... n_samples_exprs=ht_gwas['n_50_irnt']) Run the method on a table with summary statistics for multiple phenotypes: >>> ht_gwas = hl.read_table('data/ld_score_regression.sumstats.ht') >>> ht_results = hl.experimental.ld_score_regression( ... weight_expr=ht_gwas['ld_score'], ... ld_score_expr=ht_gwas['ld_score'], ... chi_sq_exprs=[ht_gwas['chi_squared_50_irnt'], ... ht_gwas['chi_squared_20160']], ... n_samples_exprs=[ht_gwas['n_50_irnt'], ... ht_gwas['n_20160']]) Notes ----- The ``exprs`` provided as arguments to :func:`.ld_score_regression` must all be from the same object, either a :class:`Table` or a :class:`MatrixTable`. **If the arguments originate from a table:** * The table must be keyed by fields ``locus`` of type :class:`.tlocus` and ``alleles``, a :py:data:`.tarray` of :py:data:`.tstr` elements. * ``weight_expr``, ``ld_score_expr``, ``chi_sq_exprs``, and ``n_samples_exprs`` are must be row-indexed fields. * The number of expressions passed to ``n_samples_exprs`` must be equal to one or the number of expressions passed to ``chi_sq_exprs``. 
If just one expression is passed to ``n_samples_exprs``, that sample size expression is assumed to apply to all sets of statistics passed to ``chi_sq_exprs``. Otherwise, the expressions passed to ``chi_sq_exprs`` and ``n_samples_exprs`` are matched by index. * The ``phenotype`` field that keys the table returned by :func:`.ld_score_regression` will have generic :obj:`int` values ``0``, ``1``, etc. corresponding to the ``0th``, ``1st``, etc. expressions passed to the ``chi_sq_exprs`` argument. **If the arguments originate from a matrix table:** * The dimensions of the matrix table must be variants (rows) by phenotypes (columns). * The rows of the matrix table must be keyed by fields ``locus`` of type :class:`.tlocus` and ``alleles``, a :py:data:`.tarray` of :py:data:`.tstr` elements. * The columns of the matrix table must be keyed by a field of type :py:data:`.tstr` that uniquely identifies phenotypes represented in the matrix table. The column key must be a single expression; compound keys are not accepted. * ``weight_expr`` and ``ld_score_expr`` must be row-indexed fields. * ``chi_sq_exprs`` must be a single entry-indexed field (not a list of fields). * ``n_samples_exprs`` must be a single entry-indexed field (not a list of fields). * The ``phenotype`` field that keys the table returned by :func:`.ld_score_regression` will have values corresponding to the column keys of the input matrix table. This function returns a :class:`Table` with one row per set of summary statistics passed to the ``chi_sq_exprs`` argument. The following row-indexed fields are included in the table: * **phenotype** (:py:data:`.tstr`) -- The name of the phenotype. The returned table is keyed by this field. See the notes below for details on the possible values of this field. * **mean_chi_sq** (:py:data:`.tfloat64`) -- The mean chi-squared test statistic for the given phenotype. * **intercept** (`Struct`) -- Contains fields: - **estimate** (:py:data:`.tfloat64`) -- A point estimate of the intercept :math:`1 + Na`. - **standard_error** (:py:data:`.tfloat64`) -- An estimate of the standard error of this point estimate. * **snp_heritability** (`Struct`) -- Contains fields: - **estimate** (:py:data:`.tfloat64`) -- A point estimate of the SNP-heritability :math:`h_g^2`. - **standard_error** (:py:data:`.tfloat64`) -- An estimate of the standard error of this point estimate. Warning ------- :func:`.ld_score_regression` considers only the rows for which both row fields ``weight_expr`` and ``ld_score_expr`` are defined. Rows with missing values in either field are removed prior to fitting the LD score regression model. Parameters ---------- weight_expr : :class:`.Float64Expression` Row-indexed expression for the LD scores used to derive variant weights in the model. ld_score_expr : :class:`.Float64Expression` Row-indexed expression for the LD scores used as covariates in the model. chi_sq_exprs : :class:`.Float64Expression` or :obj:`list` of :class:`.Float64Expression` One or more row-indexed (if table) or entry-indexed (if matrix table) expressions for chi-squared statistics resulting from genome-wide association studies. n_samples_exprs: :class:`.NumericExpression` or :obj:`list` of :class:`.NumericExpression` One or more row-indexed (if table) or entry-indexed (if matrix table) expressions indicating the number of samples used in the studies that generated the test statistics supplied to ``chi_sq_exprs``. n_blocks : :obj:`int` The number of blocks used in the jackknife approach to estimating standard errors. 
two_step_threshold : :obj:`int` Variants with chi-squared statistics greater than this value are excluded in the first step of the two-step procedure used to fit the model. n_reference_panel_variants : :obj:`int`, optional Number of variants used to estimate the SNP-heritability :math:`h_g^2`. Returns ------- :class:`.Table` Table keyed by ``phenotype`` with intercept and heritability estimates for each phenotype passed to the function.""" chi_sq_exprs = wrap_to_list(chi_sq_exprs) n_samples_exprs = wrap_to_list(n_samples_exprs) assert ((len(chi_sq_exprs) == len(n_samples_exprs)) or (len(n_samples_exprs) == 1)) __k = 2 # number of covariates, including intercept ds = chi_sq_exprs[0]._indices.source analyze('ld_score_regression/weight_expr', weight_expr, ds._row_indices) analyze('ld_score_regression/ld_score_expr', ld_score_expr, ds._row_indices) # format input dataset if isinstance(ds, MatrixTable): if len(chi_sq_exprs) != 1: raise ValueError("""Only one chi_sq_expr allowed if originating from a matrix table.""") if len(n_samples_exprs) != 1: raise ValueError("""Only one n_samples_expr allowed if originating from a matrix table.""") col_key = list(ds.col_key) if len(col_key) != 1: raise ValueError("""Matrix table must be keyed by a single phenotype field.""") analyze('ld_score_regression/chi_squared_expr', chi_sq_exprs[0], ds._entry_indices) analyze('ld_score_regression/n_samples_expr', n_samples_exprs[0], ds._entry_indices) ds = ds._select_all(row_exprs={'__locus': ds.locus, '__alleles': ds.alleles, '__w_initial': weight_expr, '__w_initial_floor': hl.max(weight_expr, 1.0), '__x': ld_score_expr, '__x_floor': hl.max(ld_score_expr, 1.0)}, row_key=['__locus', '__alleles'], col_exprs={'__y_name': ds[col_key[0]]}, col_key=['__y_name'], entry_exprs={'__y': chi_sq_exprs[0], '__n': n_samples_exprs[0]}) ds = ds.annotate_entries(**{'__w': ds.__w_initial}) ds = ds.filter_rows(hl.is_defined(ds.__locus) & hl.is_defined(ds.__alleles) & hl.is_defined(ds.__w_initial) & hl.is_defined(ds.__x)) else: assert isinstance(ds, Table) for y in chi_sq_exprs: analyze('ld_score_regression/chi_squared_expr', y, ds._row_indices) for n in n_samples_exprs: analyze('ld_score_regression/n_samples_expr', n, ds._row_indices) ys = ['__y{:}'.format(i) for i, _ in enumerate(chi_sq_exprs)] ws = ['__w{:}'.format(i) for i, _ in enumerate(chi_sq_exprs)] ns = ['__n{:}'.format(i) for i, _ in enumerate(n_samples_exprs)] ds = ds.select(**dict(**{'__locus': ds.locus, '__alleles': ds.alleles, '__w_initial': weight_expr, '__x': ld_score_expr}, **{y: chi_sq_exprs[i] for i, y in enumerate(ys)}, **{w: weight_expr for w in ws}, **{n: n_samples_exprs[i] for i, n in enumerate(ns)})) ds = ds.key_by(ds.__locus, ds.__alleles) table_tmp_file = new_temp_file() ds.write(table_tmp_file) ds = hl.read_table(table_tmp_file) hts = [ds.select(**{'__w_initial': ds.__w_initial, '__w_initial_floor': hl.max(ds.__w_initial, 1.0), '__x': ds.__x, '__x_floor': hl.max(ds.__x, 1.0), '__y_name': i, '__y': ds[ys[i]], '__w': ds[ws[i]], '__n': hl.int(ds[ns[i]])}) for i, y in enumerate(ys)] mts = [ht.to_matrix_table(row_key=['__locus', '__alleles'], col_key=['__y_name'], row_fields=['__w_initial', '__w_initial_floor', '__x', '__x_floor']) for ht in hts] ds = mts[0] for i in range(1, len(ys)): ds = ds.union_cols(mts[i]) ds = ds.filter_rows(hl.is_defined(ds.__locus) & hl.is_defined(ds.__alleles) & hl.is_defined(ds.__w_initial) & hl.is_defined(ds.__x)) mt_tmp_file1 = new_temp_file() ds.write(mt_tmp_file1) mt = hl.read_matrix_table(mt_tmp_file1) if not 
n_reference_panel_variants: M = mt.count_rows() else: M = n_reference_panel_variants # block variants for each phenotype n_phenotypes = mt.count_cols() mt = mt.annotate_entries(__in_step1=(hl.is_defined(mt.__y) & (mt.__y < two_step_threshold)), __in_step2=hl.is_defined(mt.__y)) mt = mt.annotate_cols(__col_idx=hl.int(hl.scan.count()), __m_step1=hl.agg.count_where(mt.__in_step1), __m_step2=hl.agg.count_where(mt.__in_step2)) col_keys = list(mt.col_key) ht = mt.localize_entries(entries_array_field_name='__entries', columns_array_field_name='__cols') ht = ht.annotate(__entries=hl.rbind( hl.scan.array_agg( lambda entry: hl.scan.count_where(entry.__in_step1), ht.__entries), lambda step1_indices: hl.map( lambda i: hl.rbind( hl.int(hl.or_else(step1_indices[i], 0)), ht.__cols[i].__m_step1, ht.__entries[i], lambda step1_idx, m_step1, entry: hl.rbind( hl.map( lambda j: hl.int(hl.floor(j * (m_step1 / n_blocks))), hl.range(0, n_blocks + 1)), lambda step1_separators: hl.rbind( hl.set(step1_separators).contains(step1_idx), hl.sum( hl.map( lambda s1: step1_idx >= s1, step1_separators)) - 1, lambda is_separator, step1_block: entry.annotate( __step1_block=step1_block, __step2_block=hl.cond(~entry.__in_step1 & is_separator, step1_block - 1, step1_block))))), hl.range(0, hl.len(ht.__entries))))) mt = ht._unlocalize_entries('__entries', '__cols', col_keys) mt_tmp_file2 = new_temp_file() mt.write(mt_tmp_file2) mt = hl.read_matrix_table(mt_tmp_file2) # initial coefficient estimates mt = mt.annotate_cols(__initial_betas=[ 1.0, (hl.agg.mean(mt.__y) - 1.0) / hl.agg.mean(mt.__x)]) mt = mt.annotate_cols(__step1_betas=mt.__initial_betas, __step2_betas=mt.__initial_betas) # step 1 iteratively reweighted least squares for i in range(3): mt = mt.annotate_entries(__w=hl.cond( mt.__in_step1, 1.0/(mt.__w_initial_floor * 2.0 * (mt.__step1_betas[0] + mt.__step1_betas[1] * mt.__x_floor)**2), 0.0)) mt = mt.annotate_cols(__step1_betas=hl.agg.filter( mt.__in_step1, hl.agg.linreg(y=mt.__y, x=[1.0, mt.__x], weight=mt.__w).beta)) mt = mt.annotate_cols(__step1_h2=hl.max(hl.min( mt.__step1_betas[1] * M / hl.agg.mean(mt.__n), 1.0), 0.0)) mt = mt.annotate_cols(__step1_betas=[ mt.__step1_betas[0], mt.__step1_h2 * hl.agg.mean(mt.__n) / M]) # step 1 block jackknife mt = mt.annotate_cols(__step1_block_betas=[ hl.agg.filter((mt.__step1_block != i) & mt.__in_step1, hl.agg.linreg(y=mt.__y, x=[1.0, mt.__x], weight=mt.__w).beta) for i in range(n_blocks)]) mt = mt.annotate_cols(__step1_block_betas_bias_corrected=hl.map( lambda x: n_blocks * mt.__step1_betas - (n_blocks - 1) * x, mt.__step1_block_betas)) mt = mt.annotate_cols( __step1_jackknife_mean=hl.map( lambda i: hl.mean( hl.map(lambda x: x[i], mt.__step1_block_betas_bias_corrected)), hl.range(0, __k)), __step1_jackknife_variance=hl.map( lambda i: (hl.sum( hl.map(lambda x: x[i]**2, mt.__step1_block_betas_bias_corrected)) - hl.sum( hl.map(lambda x: x[i], mt.__step1_block_betas_bias_corrected))**2 / n_blocks) / (n_blocks - 1) / n_blocks, hl.range(0, __k))) # step 2 iteratively reweighted least squares for i in range(3): mt = mt.annotate_entries(__w=hl.cond( mt.__in_step2, 1.0/(mt.__w_initial_floor * 2.0 * (mt.__step2_betas[0] + mt.__step2_betas[1] * mt.__x_floor)**2), 0.0)) mt = mt.annotate_cols(__step2_betas=[ mt.__step1_betas[0], hl.agg.filter(mt.__in_step2, hl.agg.linreg(y=mt.__y - mt.__step1_betas[0], x=[mt.__x], weight=mt.__w).beta[0])]) mt = mt.annotate_cols(__step2_h2=hl.max(hl.min( mt.__step2_betas[1] * M/hl.agg.mean(mt.__n), 1.0), 0.0)) mt = mt.annotate_cols(__step2_betas=[ 
mt.__step1_betas[0], mt.__step2_h2 * hl.agg.mean(mt.__n)/M]) # step 2 block jackknife mt = mt.annotate_cols(__step2_block_betas=[ hl.agg.filter((mt.__step2_block != i) & mt.__in_step2, hl.agg.linreg(y=mt.__y - mt.__step1_betas[0], x=[mt.__x], weight=mt.__w).beta[0]) for i in range(n_blocks)]) mt = mt.annotate_cols(__step2_block_betas_bias_corrected=hl.map( lambda x: n_blocks * mt.__step2_betas[1] - (n_blocks - 1) * x, mt.__step2_block_betas)) mt = mt.annotate_cols( __step2_jackknife_mean=hl.mean( mt.__step2_block_betas_bias_corrected), __step2_jackknife_variance=( hl.sum(mt.__step2_block_betas_bias_corrected**2) - hl.sum(mt.__step2_block_betas_bias_corrected)**2 / n_blocks) / (n_blocks - 1) / n_blocks) # combine step 1 and step 2 block jackknifes mt = mt.annotate_entries( __step2_initial_w=1.0/(mt.__w_initial_floor * 2.0 * (mt.__initial_betas[0] + mt.__initial_betas[1] * mt.__x_floor)**2)) mt = mt.annotate_cols( __final_betas=[ mt.__step1_betas[0], mt.__step2_betas[1]], __c=(hl.agg.sum(mt.__step2_initial_w * mt.__x) / hl.agg.sum(mt.__step2_initial_w * mt.__x**2))) mt = mt.annotate_cols(__final_block_betas=hl.map( lambda i: (mt.__step2_block_betas[i] - mt.__c * (mt.__step1_block_betas[i][0] - mt.__final_betas[0])), hl.range(0, n_blocks))) mt = mt.annotate_cols( __final_block_betas_bias_corrected=(n_blocks * mt.__final_betas[1] - (n_blocks - 1) * mt.__final_block_betas)) mt = mt.annotate_cols( __final_jackknife_mean=[ mt.__step1_jackknife_mean[0], hl.mean(mt.__final_block_betas_bias_corrected)], __final_jackknife_variance=[ mt.__step1_jackknife_variance[0], (hl.sum(mt.__final_block_betas_bias_corrected**2) - hl.sum(mt.__final_block_betas_bias_corrected)**2 / n_blocks) / (n_blocks - 1) / n_blocks]) # convert coefficient to heritability estimate mt = mt.annotate_cols( phenotype=mt.__y_name, mean_chi_sq=hl.agg.mean(mt.__y), intercept=hl.struct( estimate=mt.__final_betas[0], standard_error=hl.sqrt(mt.__final_jackknife_variance[0])), snp_heritability=hl.struct( estimate=(M/hl.agg.mean(mt.__n)) * mt.__final_betas[1], standard_error=hl.sqrt((M/hl.agg.mean(mt.__n))**2 * mt.__final_jackknife_variance[1]))) # format and return results ht = mt.cols() ht = ht.key_by(ht.phenotype) ht = ht.select(ht.mean_chi_sq, ht.intercept, ht.snp_heritability) ht_tmp_file = new_temp_file() ht.write(ht_tmp_file) ht = hl.read_table(ht_tmp_file) return ht
def main(): # # Args (local) # chrom = 11 # chain_file = '/Users/em21/Projects/ot_genetics/genetics-sumstats_data/extras/prepare_uk_biobank_gwas_catalog/sitelist/input_data/grch37_to_grch38.over.chain.gz' # in_bgen = 'example_data/ukb_imp_chr{chrom}_v3.example.bgen' # in_sample = 'output/ukb_10k_downsampled.sample' # to_keep_list = 'output/ukb_10k_downsampled.sample_list.tsv' # out_plink = 'output/ukb_v3_downsampled10k_plink/ukb_v3_chr{chrom}.downsampled10k' # cores = 1 # Use "*" for all # maf_threshold = 0.001 # Args (server) chrom = sys.argv[1] chain_file = '/nfs/users/nfs_e/em21/otcoregen/em21/uk_biobank_analysis/create_10k_subsample/input_data/grch37_to_grch38.over.chain.gz' in_bgen = '/nfs/users/nfs_e/em21/otcoregen/uk_biobank_data/data/genetics/imputation/ukb_imp_chr{chrom}_v3.bgen' in_sample = '/nfs/users/nfs_e/em21/otcoregen/em21/uk_biobank_analysis/create_10k_subsample/input_data/ukb_10k_downsampled.sample' to_keep_list = '/nfs/users/nfs_e/em21/otcoregen/em21/uk_biobank_analysis/create_10k_subsample/input_data/ukb_10k_downsampled.sample_list.tsv' out_plink = '/nfs/users/nfs_e/em21/otcoregen/em21/uk_biobank_analysis/create_10k_subsample/output/ukb_v3_downsampled10k_plink/ukb_v3_chr{chrom}.downsampled10k' cores = sys.argv[2] # Use "*" for all maf_threshold = 0.001 # Set the maximum number of cores hl.init(master="local[{}]".format(cores)) # Prepare liftover rg37 = hl.get_reference('GRCh37') rg38 = hl.get_reference('GRCh38') rg37.add_liftover(chain_file, rg38) # Create my own rg38 with altered names rg38_custom_contigs = [ contig.replace('chr', '') for contig in rg38.contigs ] rg38_custom_lens = {} for contig in rg38.lengths: rg38_custom_lens[contig.replace('chr', '')] = rg38.lengths[contig] rg38_custom = hl.ReferenceGenome('rg38_custom', rg38_custom_contigs, rg38_custom_lens) print('Processing chromosome {0}'.format(chrom)) # Index bgen if not existing if not hl.hadoop_exists(in_bgen.format(chrom=chrom) + '.idx2'): hl.index_bgen(in_bgen.format(chrom=chrom), contig_recoding={ "01": "1", "02": "2", "03": "3", "04": "4", "05": "5", "06": "6", "07": "7", "08": "8", "09": "9" }, reference_genome='GRCh37') # Load bgen mt = hl.import_bgen(in_bgen.format(chrom=chrom), entry_fields=['GT'], sample_file=in_sample) # Load list samples to keep samples_to_keep = hl.import_table(to_keep_list, no_header=True, impute=False, types={ 'f0': hl.tstr }).key_by('f0') # Downsample to required subset of samples mt = mt.filter_cols(hl.is_defined(samples_to_keep[mt.s])) # Re-call to remove phasing (required for plink output) # mt = mt.annotate_entries(GT=hl.call(mt.GT[0], mt.GT[1], phased=False)) # Filter on MAF mt = hl.variant_qc(mt) mt = mt.annotate_rows(variant_qc=mt.variant_qc.annotate( MAF=hl.min(mt.variant_qc.AF))) mt = mt.filter_rows(mt.variant_qc.MAF >= maf_threshold) # Liftover mt = mt.annotate_rows(locus_GRCh38=hl.liftover(mt.locus, 'GRCh38')) # Strip chr from contig name (causes problems with GCTA) mt = mt.annotate_rows( contig_GRCh38=mt.locus_GRCh38.contig.replace('chr', '')) # Swap GRCh37 locus for GRCh38 (but have to use rg38_custom) mt = mt.key_rows_by() mt = mt.annotate_rows(locus=hl.locus(mt.contig_GRCh38, mt.locus_GRCh38.position, reference_genome=rg38_custom)) mt = mt.key_rows_by(mt.locus, mt.alleles) # Remove rows with missing locus (after liftover) mt = mt.filter_rows(hl.is_defined(mt.locus)) # Write plink format hl.export_plink(dataset=mt, output=out_plink.format(chrom=chrom)) return 0
is_PSYCHOSIS=mt.phenotype.PSYCHOSIS) mt = mt.annotate_cols(is_BPPSY=hl.case().when( (mt.is_BP_including_BPSCZ) & (mt.is_PSYCHOSIS), True).when(~mt.is_BP_including_BPSCZ, False).default(hl.null(hl.tbool)), is_BP_no_PSY=hl.case().when( (mt.is_BP_including_BPSCZ) & (~mt.is_PSYCHOSIS), True).when(~mt.is_BP_including_BPSCZ, False).default(hl.null(hl.tbool))) mt.cols().select('is_BP1', 'is_BP2', 'is_BPNOS', 'is_BPSCZ', 'is_BP', 'is_BP_including_BPSCZ', 'is_SCZ', 'is_BPPSY', 'is_BP_no_PSY', 'is_PSYCHOSIS').write(PHENOTYPE_TABLE_BOOL, overwrite=True) mt = mt.annotate_rows(MAC=hl.min( hl.agg.sum(mt.GT.n_alt_alleles()), hl.agg.sum( hl.int64(mt.GT.is_het_ref()) + 2 * hl.int64(mt.GT.is_hom_ref())))) mt_MAC10 = mt.filter_rows(mt.MAC >= 10) def run_logistic_bool(mt, variable): ht = hl.logistic_regression_rows(test='firth', y=mt[variable], x=mt.GT.n_alt_alleles(), covariates=[ 1, mt.imputesex.impute_sex.is_female, mt.pca.PC1, mt.pca.PC2, mt.pca.PC3, mt.pca.PC4, mt.pca.PC5, mt.pca.PC6, mt.pca.PC7, mt.pca.PC8, mt.pca.PC9, mt.pca.PC10
def ld_score_regression(weight_expr, ld_score_expr, chi_sq_exprs, n_samples_exprs, n_blocks=200, two_step_threshold=30, n_reference_panel_variants=None) -> Table: r"""Estimate SNP-heritability and level of confounding biases from GWAS summary statistics. Given a set or multiple sets of genome-wide association study (GWAS) summary statistics, :func:`.ld_score_regression` estimates the heritability of a trait or set of traits and the level of confounding biases present in the underlying studies by regressing chi-squared statistics on LD scores, leveraging the model: .. math:: \mathrm{E}[\chi_j^2] = 1 + Na + \frac{Nh_g^2}{M}l_j * :math:`\mathrm{E}[\chi_j^2]` is the expected chi-squared statistic for variant :math:`j` resulting from a test of association between variant :math:`j` and a trait. * :math:`l_j = \sum_{k} r_{jk}^2` is the LD score of variant :math:`j`, calculated as the sum of squared correlation coefficients between variant :math:`j` and nearby variants. See :func:`ld_score` for further details. * :math:`a` captures the contribution of confounding biases, such as cryptic relatedness and uncontrolled population structure, to the association test statistic. * :math:`h_g^2` is the SNP-heritability, or the proportion of variation in the trait explained by the effects of variants included in the regression model above. * :math:`M` is the number of variants used to estimate :math:`h_g^2`. * :math:`N` is the number of samples in the underlying association study. For more details on the method implemented in this function, see: * `LD Score regression distinguishes confounding from polygenicity in genome-wide association studies (Bulik-Sullivan et al, 2015) <https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4495769/>`__ Examples -------- Run the method on a matrix table of summary statistics, where the rows are variants and the columns are different phenotypes: >>> mt_gwas = ld_score_all_phenos_sumstats >>> ht_results = hl.experimental.ld_score_regression( ... weight_expr=mt_gwas['ld_score'], ... ld_score_expr=mt_gwas['ld_score'], ... chi_sq_exprs=mt_gwas['chi_squared'], ... n_samples_exprs=mt_gwas['n']) Run the method on a table with summary statistics for a single phenotype: >>> ht_gwas = ld_score_one_pheno_sumstats >>> ht_results = hl.experimental.ld_score_regression( ... weight_expr=ht_gwas['ld_score'], ... ld_score_expr=ht_gwas['ld_score'], ... chi_sq_exprs=ht_gwas['chi_squared_50_irnt'], ... n_samples_exprs=ht_gwas['n_50_irnt']) Run the method on a table with summary statistics for multiple phenotypes: >>> ht_gwas = ld_score_one_pheno_sumstats >>> ht_results = hl.experimental.ld_score_regression( ... weight_expr=ht_gwas['ld_score'], ... ld_score_expr=ht_gwas['ld_score'], ... chi_sq_exprs=[ht_gwas['chi_squared_50_irnt'], ... ht_gwas['chi_squared_20160']], ... n_samples_exprs=[ht_gwas['n_50_irnt'], ... ht_gwas['n_20160']]) Notes ----- The ``exprs`` provided as arguments to :func:`.ld_score_regression` must all be from the same object, either a :class:`Table` or a :class:`MatrixTable`. **If the arguments originate from a table:** * The table must be keyed by fields ``locus`` of type :class:`.tlocus` and ``alleles``, a :py:data:`.tarray` of :py:data:`.tstr` elements. * ``weight_expr``, ``ld_score_expr``, ``chi_sq_exprs``, and ``n_samples_exprs`` are must be row-indexed fields. * The number of expressions passed to ``n_samples_exprs`` must be equal to one or the number of expressions passed to ``chi_sq_exprs``. 
If just one expression is passed to ``n_samples_exprs``, that sample size expression is assumed to apply to all sets of statistics passed to ``chi_sq_exprs``. Otherwise, the expressions passed to ``chi_sq_exprs`` and ``n_samples_exprs`` are matched by index. * The ``phenotype`` field that keys the table returned by :func:`.ld_score_regression` will have generic :obj:`int` values ``0``, ``1``, etc. corresponding to the ``0th``, ``1st``, etc. expressions passed to the ``chi_sq_exprs`` argument. **If the arguments originate from a matrix table:** * The dimensions of the matrix table must be variants (rows) by phenotypes (columns). * The rows of the matrix table must be keyed by fields ``locus`` of type :class:`.tlocus` and ``alleles``, a :py:data:`.tarray` of :py:data:`.tstr` elements. * The columns of the matrix table must be keyed by a field of type :py:data:`.tstr` that uniquely identifies phenotypes represented in the matrix table. The column key must be a single expression; compound keys are not accepted. * ``weight_expr`` and ``ld_score_expr`` must be row-indexed fields. * ``chi_sq_exprs`` must be a single entry-indexed field (not a list of fields). * ``n_samples_exprs`` must be a single entry-indexed field (not a list of fields). * The ``phenotype`` field that keys the table returned by :func:`.ld_score_regression` will have values corresponding to the column keys of the input matrix table. This function returns a :class:`Table` with one row per set of summary statistics passed to the ``chi_sq_exprs`` argument. The following row-indexed fields are included in the table: * **phenotype** (:py:data:`.tstr`) -- The name of the phenotype. The returned table is keyed by this field. See the notes below for details on the possible values of this field. * **mean_chi_sq** (:py:data:`.tfloat64`) -- The mean chi-squared test statistic for the given phenotype. * **intercept** (`Struct`) -- Contains fields: - **estimate** (:py:data:`.tfloat64`) -- A point estimate of the intercept :math:`1 + Na`. - **standard_error** (:py:data:`.tfloat64`) -- An estimate of the standard error of this point estimate. * **snp_heritability** (`Struct`) -- Contains fields: - **estimate** (:py:data:`.tfloat64`) -- A point estimate of the SNP-heritability :math:`h_g^2`. - **standard_error** (:py:data:`.tfloat64`) -- An estimate of the standard error of this point estimate. Warning ------- :func:`.ld_score_regression` considers only the rows for which both row fields ``weight_expr`` and ``ld_score_expr`` are defined. Rows with missing values in either field are removed prior to fitting the LD score regression model. Parameters ---------- weight_expr : :class:`.Float64Expression` Row-indexed expression for the LD scores used to derive variant weights in the model. ld_score_expr : :class:`.Float64Expression` Row-indexed expression for the LD scores used as covariates in the model. chi_sq_exprs : :class:`.Float64Expression` or :obj:`list` of :class:`.Float64Expression` One or more row-indexed (if table) or entry-indexed (if matrix table) expressions for chi-squared statistics resulting from genome-wide association studies. n_samples_exprs: :class:`.NumericExpression` or :obj:`list` of :class:`.NumericExpression` One or more row-indexed (if table) or entry-indexed (if matrix table) expressions indicating the number of samples used in the studies that generated the test statistics supplied to ``chi_sq_exprs``. n_blocks : :obj:`int` The number of blocks used in the jackknife approach to estimating standard errors. 
    two_step_threshold : :obj:`int`
        Variants with chi-squared statistics greater than this value are
        excluded in the first step of the two-step procedure used to fit
        the model.
    n_reference_panel_variants : :obj:`int`, optional
        Number of variants used to estimate the SNP-heritability
        :math:`h_g^2`.

    Returns
    -------
    :class:`.Table`
        Table keyed by ``phenotype`` with intercept and heritability
        estimates for each phenotype passed to the function."""

    chi_sq_exprs = wrap_to_list(chi_sq_exprs)
    n_samples_exprs = wrap_to_list(n_samples_exprs)

    assert ((len(chi_sq_exprs) == len(n_samples_exprs))
            or (len(n_samples_exprs) == 1))
    __k = 2  # number of covariates, including intercept

    ds = chi_sq_exprs[0]._indices.source

    analyze('ld_score_regression/weight_expr', weight_expr, ds._row_indices)
    analyze('ld_score_regression/ld_score_expr', ld_score_expr, ds._row_indices)

    # format input dataset
    if isinstance(ds, MatrixTable):
        if len(chi_sq_exprs) != 1:
            raise ValueError("""Only one chi_sq_expr allowed if originating
                from a matrix table.""")
        if len(n_samples_exprs) != 1:
            raise ValueError("""Only one n_samples_expr allowed if originating
                from a matrix table.""")

        col_key = list(ds.col_key)
        if len(col_key) != 1:
            raise ValueError("""Matrix table must be keyed by a single
                phenotype field.""")

        analyze('ld_score_regression/chi_squared_expr',
                chi_sq_exprs[0], ds._entry_indices)
        analyze('ld_score_regression/n_samples_expr',
                n_samples_exprs[0], ds._entry_indices)

        ds = ds._select_all(row_exprs={'__locus': ds.locus,
                                       '__alleles': ds.alleles,
                                       '__w_initial': weight_expr,
                                       '__w_initial_floor': hl.max(weight_expr, 1.0),
                                       '__x': ld_score_expr,
                                       '__x_floor': hl.max(ld_score_expr, 1.0)},
                            row_key=['__locus', '__alleles'],
                            col_exprs={'__y_name': ds[col_key[0]]},
                            col_key=['__y_name'],
                            entry_exprs={'__y': chi_sq_exprs[0],
                                         '__n': n_samples_exprs[0]})
        ds = ds.annotate_entries(**{'__w': ds.__w_initial})

        ds = ds.filter_rows(hl.is_defined(ds.__locus)
                            & hl.is_defined(ds.__alleles)
                            & hl.is_defined(ds.__w_initial)
                            & hl.is_defined(ds.__x))

    else:
        assert isinstance(ds, Table)
        for y in chi_sq_exprs:
            analyze('ld_score_regression/chi_squared_expr', y, ds._row_indices)
        for n in n_samples_exprs:
            analyze('ld_score_regression/n_samples_expr', n, ds._row_indices)

        ys = ['__y{:}'.format(i) for i, _ in enumerate(chi_sq_exprs)]
        ws = ['__w{:}'.format(i) for i, _ in enumerate(chi_sq_exprs)]
        ns = ['__n{:}'.format(i) for i, _ in enumerate(n_samples_exprs)]

        ds = ds.select(**dict(**{'__locus': ds.locus,
                                 '__alleles': ds.alleles,
                                 '__w_initial': weight_expr,
                                 '__x': ld_score_expr},
                              **{y: chi_sq_exprs[i]
                                 for i, y in enumerate(ys)},
                              **{w: weight_expr for w in ws},
                              **{n: n_samples_exprs[i]
                                 for i, n in enumerate(ns)}))
        ds = ds.key_by(ds.__locus, ds.__alleles)

        table_tmp_file = new_temp_file()
        ds.write(table_tmp_file)
        ds = hl.read_table(table_tmp_file)

        hts = [ds.select(**{'__w_initial': ds.__w_initial,
                            '__w_initial_floor': hl.max(ds.__w_initial, 1.0),
                            '__x': ds.__x,
                            '__x_floor': hl.max(ds.__x, 1.0),
                            '__y_name': i,
                            '__y': ds[ys[i]],
                            '__w': ds[ws[i]],
                            '__n': hl.int(ds[ns[i]])})
               for i, y in enumerate(ys)]

        mts = [ht.to_matrix_table(row_key=['__locus', '__alleles'],
                                  col_key=['__y_name'],
                                  row_fields=['__w_initial',
                                              '__w_initial_floor',
                                              '__x', '__x_floor'])
               for ht in hts]

        ds = mts[0]
        for i in range(1, len(ys)):
            ds = ds.union_cols(mts[i])

        ds = ds.filter_rows(hl.is_defined(ds.__locus)
                            & hl.is_defined(ds.__alleles)
                            & hl.is_defined(ds.__w_initial)
                            & hl.is_defined(ds.__x))

    mt_tmp_file1 = new_temp_file()
    ds.write(mt_tmp_file1)
    mt = hl.read_matrix_table(mt_tmp_file1)

    if not n_reference_panel_variants:
        M = mt.count_rows()
    else:
        M = n_reference_panel_variants

    mt = mt.annotate_entries(__in_step1=(hl.is_defined(mt.__y)
                                         & (mt.__y < two_step_threshold)),
                             __in_step2=hl.is_defined(mt.__y))
    mt = mt.annotate_cols(__col_idx=hl.int(hl.scan.count()),
                          __m_step1=hl.agg.count_where(mt.__in_step1),
                          __m_step2=hl.agg.count_where(mt.__in_step2))

    col_keys = list(mt.col_key)

    ht = mt.localize_entries(entries_array_field_name='__entries',
                             columns_array_field_name='__cols')

    # assign each variant to a jackknife block (per phenotype), based on its
    # rank among the variants included in step 1
    ht = ht.annotate(__entries=hl.rbind(
        hl.scan.array_agg(
            lambda entry: hl.scan.count_where(entry.__in_step1),
            ht.__entries),
        lambda step1_indices: hl.map(
            lambda i: hl.rbind(
                hl.int(hl.or_else(step1_indices[i], 0)),
                ht.__cols[i].__m_step1,
                ht.__entries[i],
                lambda step1_idx, m_step1, entry: hl.rbind(
                    hl.map(
                        lambda j: hl.int(hl.floor(j * (m_step1 / n_blocks))),
                        hl.range(0, n_blocks + 1)),
                    lambda step1_separators: hl.rbind(
                        hl.set(step1_separators).contains(step1_idx),
                        hl.sum(
                            hl.map(lambda s1: step1_idx >= s1,
                                   step1_separators)) - 1,
                        lambda is_separator, step1_block: entry.annotate(
                            __step1_block=step1_block,
                            __step2_block=hl.cond(
                                ~entry.__in_step1 & is_separator,
                                step1_block - 1,
                                step1_block))))),
            hl.range(0, hl.len(ht.__entries)))))

    mt = ht._unlocalize_entries('__entries', '__cols', col_keys)

    mt_tmp_file2 = new_temp_file()
    mt.write(mt_tmp_file2)
    mt = hl.read_matrix_table(mt_tmp_file2)

    # initial coefficient estimates
    mt = mt.annotate_cols(__initial_betas=[
        1.0, (hl.agg.mean(mt.__y) - 1.0) / hl.agg.mean(mt.__x)])
    mt = mt.annotate_cols(__step1_betas=mt.__initial_betas,
                          __step2_betas=mt.__initial_betas)

    # step 1 iteratively reweighted least squares
    for i in range(3):
        mt = mt.annotate_entries(__w=hl.cond(
            mt.__in_step1,
            1.0 / (mt.__w_initial_floor
                   * 2.0 * (mt.__step1_betas[0]
                            + mt.__step1_betas[1] * mt.__x_floor)**2),
            0.0))
        mt = mt.annotate_cols(__step1_betas=hl.agg.filter(
            mt.__in_step1,
            hl.agg.linreg(y=mt.__y, x=[1.0, mt.__x], weight=mt.__w).beta))
        mt = mt.annotate_cols(__step1_h2=hl.max(hl.min(
            mt.__step1_betas[1] * M / hl.agg.mean(mt.__n), 1.0), 0.0))
        mt = mt.annotate_cols(__step1_betas=[
            mt.__step1_betas[0],
            mt.__step1_h2 * hl.agg.mean(mt.__n) / M])

    # step 1 block jackknife
    mt = mt.annotate_cols(__step1_block_betas=hl.agg.array_agg(
        lambda i: hl.agg.filter(
            (mt.__step1_block != i) & mt.__in_step1,
            hl.agg.linreg(y=mt.__y, x=[1.0, mt.__x], weight=mt.__w).beta),
        hl.range(n_blocks)))

    mt = mt.annotate_cols(__step1_block_betas_bias_corrected=hl.map(
        lambda x: n_blocks * mt.__step1_betas - (n_blocks - 1) * x,
        mt.__step1_block_betas))

    mt = mt.annotate_cols(
        __step1_jackknife_mean=hl.map(
            lambda i: hl.mean(
                hl.map(lambda x: x[i],
                       mt.__step1_block_betas_bias_corrected)),
            hl.range(0, __k)),
        __step1_jackknife_variance=hl.map(
            lambda i: (hl.sum(
                hl.map(lambda x: x[i]**2,
                       mt.__step1_block_betas_bias_corrected))
                - hl.sum(
                    hl.map(lambda x: x[i],
                           mt.__step1_block_betas_bias_corrected))**2
                / n_blocks)
            / (n_blocks - 1) / n_blocks,
            hl.range(0, __k)))

    # step 2 iteratively reweighted least squares
    for i in range(3):
        mt = mt.annotate_entries(__w=hl.cond(
            mt.__in_step2,
            1.0 / (mt.__w_initial_floor
                   * 2.0 * (mt.__step2_betas[0]
                            + mt.__step2_betas[1] * mt.__x_floor)**2),
            0.0))
        mt = mt.annotate_cols(__step2_betas=[
            mt.__step1_betas[0],
            hl.agg.filter(mt.__in_step2,
                          hl.agg.linreg(y=mt.__y - mt.__step1_betas[0],
                                        x=[mt.__x],
                                        weight=mt.__w).beta[0])])
        mt = mt.annotate_cols(__step2_h2=hl.max(hl.min(
            mt.__step2_betas[1] * M / hl.agg.mean(mt.__n), 1.0), 0.0))
        mt = mt.annotate_cols(__step2_betas=[
            mt.__step1_betas[0],
            mt.__step2_h2 * hl.agg.mean(mt.__n) / M])

    # step 2 block jackknife
    mt = mt.annotate_cols(__step2_block_betas=hl.agg.array_agg(
        lambda i: hl.agg.filter(
            (mt.__step2_block != i) & mt.__in_step2,
            hl.agg.linreg(y=mt.__y - mt.__step1_betas[0],
                          x=[mt.__x],
                          weight=mt.__w).beta[0]),
        hl.range(n_blocks)))

    mt = mt.annotate_cols(__step2_block_betas_bias_corrected=hl.map(
        lambda x: n_blocks * mt.__step2_betas[1] - (n_blocks - 1) * x,
        mt.__step2_block_betas))

    mt = mt.annotate_cols(
        __step2_jackknife_mean=hl.mean(
            mt.__step2_block_betas_bias_corrected),
        __step2_jackknife_variance=(
            hl.sum(mt.__step2_block_betas_bias_corrected**2)
            - hl.sum(mt.__step2_block_betas_bias_corrected)**2
            / n_blocks) / (n_blocks - 1) / n_blocks)

    # combine step 1 and step 2 block jackknifes
    mt = mt.annotate_entries(
        __step2_initial_w=1.0 / (mt.__w_initial_floor
                                 * 2.0 * (mt.__initial_betas[0]
                                          + mt.__initial_betas[1]
                                          * mt.__x_floor)**2))

    mt = mt.annotate_cols(
        __final_betas=[mt.__step1_betas[0], mt.__step2_betas[1]],
        __c=(hl.agg.sum(mt.__step2_initial_w * mt.__x)
             / hl.agg.sum(mt.__step2_initial_w * mt.__x**2)))
    mt = mt.annotate_cols(__final_block_betas=hl.map(
        lambda i: (mt.__step2_block_betas[i]
                   - mt.__c * (mt.__step1_block_betas[i][0]
                               - mt.__final_betas[0])),
        hl.range(0, n_blocks)))
    mt = mt.annotate_cols(
        __final_block_betas_bias_corrected=(
            n_blocks * mt.__final_betas[1]
            - (n_blocks - 1) * mt.__final_block_betas))
    mt = mt.annotate_cols(
        __final_jackknife_mean=[
            mt.__step1_jackknife_mean[0],
            hl.mean(mt.__final_block_betas_bias_corrected)],
        __final_jackknife_variance=[
            mt.__step1_jackknife_variance[0],
            (hl.sum(mt.__final_block_betas_bias_corrected**2)
             - hl.sum(mt.__final_block_betas_bias_corrected)**2
             / n_blocks) / (n_blocks - 1) / n_blocks])

    # convert coefficient to heritability estimate
    mt = mt.annotate_cols(
        phenotype=mt.__y_name,
        mean_chi_sq=hl.agg.mean(mt.__y),
        intercept=hl.struct(
            estimate=mt.__final_betas[0],
            standard_error=hl.sqrt(mt.__final_jackknife_variance[0])),
        snp_heritability=hl.struct(
            estimate=(M / hl.agg.mean(mt.__n)) * mt.__final_betas[1],
            standard_error=hl.sqrt(
                (M / hl.agg.mean(mt.__n))**2
                * mt.__final_jackknife_variance[1])))

    # format and return results
    ht = mt.cols()
    ht = ht.key_by(ht.phenotype)
    ht = ht.select(ht.mean_chi_sq, ht.intercept, ht.snp_heritability)

    ht_tmp_file = new_temp_file()
    ht.write(ht_tmp_file)
    ht = hl.read_table(ht_tmp_file)

    return ht
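
# Usage sketch for ld_score_regression (illustrative only): the table path and
# row-field names below are assumptions, not taken from the code above. It
# assumes Hail is imported as `hl`, the function above is in scope, and `ht`
# is a Table keyed by ['locus', 'alleles'] with row fields `ld_score`,
# `chi_squared`, and `n_samples`.
ht = hl.read_table('gs://my-bucket/sumstats_with_ld_scores.ht')  # hypothetical path
results = ld_score_regression(
    weight_expr=ht.ld_score,       # LD scores used to derive variant weights
    ld_score_expr=ht.ld_score,     # LD scores used as the regression covariate
    chi_sq_exprs=ht.chi_squared,   # GWAS chi-squared statistics
    n_samples_exprs=ht.n_samples,  # per-variant GWAS sample sizes
    n_blocks=200,                  # jackknife blocks for standard errors
    two_step_threshold=30)         # chi-squared cutoff for step 1 of the fit
results.show()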
n_partitions = 500

mt = hl.read_matrix_table(
    f'{lustre_dir}/variant_qc/megaWES_final_after_RF.mt')

table_cohort = hl.import_table(
    f"{lustre_dir1}/sanger_cohorts_corrected_ukbb_july_2020.tsv",
    delimiter="\t").key_by('s')
mt = mt.annotate_cols(cohort=table_cohort[mt.s].cohort)

df = pd.read_csv(
    f"{lustre_dir1}/sanger_cohorts_corrected_ukbb_july_2020.tsv", sep="\t")
cohorts_array = df.cohort.unique()

mt = mt.annotate_rows(MAF_cohorts=hl.agg.group_by(
    mt.cohort,
    hl.min(hl.agg.call_stats(mt.GT, mt.alleles).AF)))

mt = mt.annotate_rows(AN_cohorts=hl.agg.group_by(
    mt.cohort,
    hl.min(hl.agg.call_stats(mt.GT, mt.alleles).AN)))

mt = mt.annotate_rows(AC_cohorts=hl.agg.group_by(
    mt.cohort,
    hl.min(hl.agg.call_stats(mt.GT, mt.alleles).AC)))

mt = mt.annotate_rows(missingness_cohorts=hl.agg.group_by(
    mt.cohort,
    hl.min((hl.agg.count_where(hl.is_missing(mt['GT'])))
           / mt.count_rows() * 2)))

mt = mt.annotate_rows(info=mt.info.annotate(
    cohort_names=mt.MAF_cohorts.keys()))
mt = mt.annotate_rows(info=mt.info.annotate(
    MAF_cohorts_values=mt.MAF_cohorts.values()))
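
# The snippet above stops after copying the per-cohort MAF values into `info`.
# A sketch of how the remaining per-cohort aggregates could be carried over in
# the same way; the `*_cohorts_values` field names below are assumptions that
# mirror the MAF naming rather than coming from the original pipeline.
mt = mt.annotate_rows(info=mt.info.annotate(
    AC_cohorts_values=mt.AC_cohorts.values()))
mt = mt.annotate_rows(info=mt.info.annotate(
    AN_cohorts_values=mt.AN_cohorts.values()))
mt = mt.annotate_rows(info=mt.info.annotate(
    missingness_cohorts_values=mt.missingness_cohorts.values()))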
def adjust_vcf_incompatible_types(
    ht: hl.Table,
    pipe_delimited_annotations: List[str] = INFO_VCF_AS_PIPE_DELIMITED_FIELDS,
) -> hl.Table:
    """
    Create a Table ready for VCF export.

    In particular, the following conversions are done:
        - All int64 are coerced to int32
        - Fields specified by `pipe_delimited_annotations` are converted from
          arrays to pipe-delimited strings

    :param ht: Input Table.
    :param pipe_delimited_annotations: List of info fields (they must be fields
        of the ht.info Struct).
    :return: Table ready for VCF export.
    """

    def get_pipe_expr(
            array_expr: hl.expr.ArrayExpression) -> hl.expr.StringExpression:
        return hl.delimit(
            array_expr.map(lambda x: hl.or_else(hl.str(x), "")), "|")

    # Make sure the HT is keyed by locus, alleles
    ht = ht.key_by("locus", "alleles")

    info_type_convert_expr = {}
    # Convert int64 fields to int32 (int64 isn't supported by VCF)
    for f, ft in ht.info.dtype.items():
        if ft == hl.dtype("int64"):
            logger.warning(
                "Coercing field info.%s from int64 to int32 for VCF output. "
                "Value will be capped at int32 max value.",
                f,
            )
            info_type_convert_expr.update(
                {f: hl.int32(hl.min(2**31 - 1, ht.info[f]))})
        elif ft == hl.dtype("array<int64>"):
            logger.warning(
                "Coercing field info.%s from array<int64> to array<int32> for "
                "VCF output. Array values will be capped at int32 max value.",
                f,
            )
            info_type_convert_expr.update(
                {f: ht.info[f].map(lambda x: hl.int32(hl.min(2**31 - 1, x)))})

    ht = ht.annotate(info=ht.info.annotate(**info_type_convert_expr))

    info_expr = {}

    # Make sure to pipe-delimit fields that need to be.
    # Note: the expr needs to be prefixed by "|" because GATK expects one value
    # for the ref allele (always empty).
    # Note2: this doesn't produce the correct annotation for AS_SB_TABLE; it is
    # handled below.
    for f in pipe_delimited_annotations:
        if f in ht.info and f != "AS_SB_TABLE":
            info_expr[f] = "|" + get_pipe_expr(ht.info[f])

    # Flatten SB if it is an array of arrays
    if "SB" in ht.info and not isinstance(ht.info.SB,
                                          hl.expr.ArrayNumericExpression):
        info_expr["SB"] = ht.info.SB[0].extend(ht.info.SB[1])

    if "AS_SB_TABLE" in ht.info:
        info_expr["AS_SB_TABLE"] = get_pipe_expr(
            ht.info.AS_SB_TABLE.map(lambda x: hl.delimit(x, ",")))

    # Annotate with the new expressions
    ht = ht.annotate(info=ht.info.annotate(**info_expr))

    return ht
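
# Usage sketch (the input and output paths are hypothetical): coerce the info
# fields of a sites Table into VCF-compatible types before exporting it with
# hl.export_vcf, which accepts a sites-only Table.
ht = hl.read_table('gs://my-bucket/release_sites.ht')
ht = adjust_vcf_incompatible_types(ht)
hl.export_vcf(ht, 'gs://my-bucket/release_sites.vcf.bgz')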