def annotate_with_genotype_num_alt(mt: hl.MatrixTable) -> hl.MatrixTable:
    if 'AD' in set(mt.entry):
        # GATK-consistent VCF
        mt = mt.annotate_rows(genotypes=hl.agg.collect(
            hl.struct(
                num_alt=hl.cond(mt.alleles[1] == '<CNV>', 0, mt.GT.n_alt_alleles()),
                ab=hl.cond(
                    mt.alleles[1] == '<CNV>',
                    0.0,
                    hl.float(hl.array(mt.AD)[1]) / hl.float(hl.fold(lambda i, j: i + j, 0, mt.AD))),
                gq=mt.GQ,
                sample_id=mt.s,
                dp=mt.DP)))
    elif 'AO' in set(mt.entry):
        mt = mt.annotate_rows(genotypes=hl.agg.collect(
            hl.struct(
                num_alt=hl.cond(mt.alleles[1] == '<CNV>', 0, mt.GT.n_alt_alleles()),
                ab=hl.cond((mt.alleles[1] == '<CNV>') | (mt.DP == 0),
                           0.0,
                           hl.float(mt.AO[0]) / hl.float(mt.DP)),
                dp=mt.DP,
                gq=mt.GQ,
                sample_id=mt.s)))
    else:
        raise ValueError("unrecognized vcf")
    return mt

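# A minimal usage sketch for annotate_with_genotype_num_alt (the path is hypothetical;
# it assumes a GATK-style VCF whose entries carry GT/AD/GQ/DP, so the first branch
# above is taken).
def example_annotate_with_genotype_num_alt():
    mt = hl.import_vcf('gs://my-bucket/example.vcf.bgz')  # hypothetical path
    mt = annotate_with_genotype_num_alt(mt)
    mt.rows().select('genotypes').show(3)
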
def prepare_exac_constraint(exac_constraint_path):
    ds = hl.import_table(exac_constraint_path, force=True)
    ds = ds.select(
        transcript_id=ds.transcript.split("\\.")[0],
        # Expected
        exp_syn=hl.float(ds.exp_syn),
        exp_mis=hl.float(ds.exp_mis),
        exp_lof=hl.float(ds.exp_lof),
        # Actual
        obs_syn=hl.int(ds.n_syn),
        obs_mis=hl.int(ds.n_mis),
        obs_lof=hl.int(ds.n_lof),
        # mu
        mu_syn=hl.float(ds.mu_syn),
        mu_mis=hl.float(ds.mu_mis),
        mu_lof=hl.float(ds.mu_lof),
        # Z
        syn_z=hl.float(ds.syn_z),
        mis_z=hl.float(ds.mis_z),
        lof_z=hl.float(ds.lof_z),
        # Other
        pLI=hl.float(ds.pLI),
    )
    ds = ds.key_by("transcript_id")
    return ds

def format_exac_constraint(ds):
    # Select relevant fields
    ds = ds.select(
        transcript_id=ds.transcript.split("\\.")[0],
        # Expected
        exp_syn=hl.float(ds.exp_syn),
        exp_mis=hl.float(ds.exp_mis),
        exp_lof=hl.float(ds.exp_lof),
        # Actual
        obs_syn=hl.int(ds.n_syn),
        obs_mis=hl.int(ds.n_mis),
        obs_lof=hl.int(ds.n_lof),
        # mu
        mu_syn=hl.float(ds.mu_syn),
        mu_mis=hl.float(ds.mu_mis),
        mu_lof=hl.float(ds.mu_lof),
        # Z
        syn_z=hl.float(ds.syn_z),
        mis_z=hl.float(ds.mis_z),
        lof_z=hl.float(ds.lof_z),
        # Other
        pLI=hl.float(ds.pLI),
    )
    ds = ds.key_by("transcript_id")
    return ds

def test_dndarray_sum():
    n_variants = 10
    n_samples = 10
    block_size = 3
    n_blocks = 16
    mt1 = hl.balding_nichols_model(n_populations=2,
                                   n_variants=n_variants,
                                   n_samples=n_samples)
    mt1 = mt1.select_entries(dosage=hl.float(mt1.GT.n_alt_alleles()))
    mt2 = hl.balding_nichols_model(n_populations=2,
                                   n_variants=n_variants,
                                   n_samples=n_samples)
    mt2 = mt2.select_entries(dosage=hl.float(mt2.GT.n_alt_alleles()))
    da1 = hl.experimental.dnd.array(mt1, 'dosage', block_size=block_size)
    da2 = hl.experimental.dnd.array(mt2, 'dosage', block_size=block_size)
    da_sum = (da1 + da2).checkpoint(new_temp_file())
    assert da_sum._force_count_blocks() == n_blocks
    da_result = da_sum.collect()
    a1 = np.array(mt1.dosage.collect()).reshape(n_variants, n_samples)
    a2 = np.array(mt2.dosage.collect()).reshape(n_variants, n_samples)
    a_result = a1 + a2
    assert np.array_equal(da_result, a_result)

def prepare_exac_constraint(path):
    ds = hl.import_table(path, force=True)

    ds = ds.repartition(32, shuffle=True)

    # Select relevant fields
    ds = ds.select(
        # Remove version number from transcript ID
        transcript_id=ds.transcript.split("\\.")[0],
        # Expected
        exp_syn=hl.float(ds.exp_syn),
        exp_mis=hl.float(ds.exp_mis),
        exp_lof=hl.float(ds.exp_lof),
        # Actual
        obs_syn=hl.int(ds.n_syn),
        obs_mis=hl.int(ds.n_mis),
        obs_lof=hl.int(ds.n_lof),
        # mu
        mu_syn=hl.float(ds.mu_syn),
        mu_mis=hl.float(ds.mu_mis),
        mu_lof=hl.float(ds.mu_lof),
        # Z
        syn_z=hl.float(ds.syn_z),
        mis_z=hl.float(ds.mis_z),
        lof_z=hl.float(ds.lof_z),
        # Other
        pli=hl.float(ds.pLI),
    )

    ds = ds.key_by("transcript_id")

    return ds

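# A minimal usage sketch for prepare_exac_constraint (paths are hypothetical): the
# returned Table is keyed by transcript_id, so it can be joined onto any
# transcript-keyed table.
def example_prepare_exac_constraint():
    constraint = prepare_exac_constraint('gs://my-bucket/exac_constraint.txt.bgz')  # hypothetical path
    genes = hl.read_table('gs://my-bucket/transcripts.ht')  # hypothetical table with a transcript_id field
    genes = genes.annotate(exac_constraint=constraint[genes.transcript_id])
    return genes
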
def _parse_odds_ratio(field_name):
    return hl.rbind(
        ds[field_name].split(" ", n=2),
        lambda parts: hl.rbind(
            parts[0],
            parts[1][1:-1].split("-", 2),
            lambda value, bounds: hl.struct(
                **{
                    field_name: hl.float(value),
                    field_name + " lower bound": hl.float(bounds[0]),
                    field_name + " upper bound": hl.float(bounds[1]),
                }),
        ),
    )

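# _parse_odds_ratio parses strings of the form "1.21 (1.05-1.40)": a point estimate
# followed by a parenthesized lower-upper range, which `parts[1][1:-1]` strips before
# splitting on "-". The helper reads a Table named `ds` from its enclosing scope, so
# this sketch redefines it as a local closure; the field name and values are invented.
def example_parse_odds_ratio():
    ds = hl.Table.parallelize(
        [{"Odds Ratio": "1.21 (1.05-1.40)"}],
        hl.tstruct(**{"Odds Ratio": hl.tstr}))

    def _parse(field_name):
        # same body as _parse_odds_ratio above, closing over the local `ds`
        return hl.rbind(
            ds[field_name].split(" ", n=2),
            lambda parts: hl.rbind(
                parts[0],
                parts[1][1:-1].split("-", 2),
                lambda value, bounds: hl.struct(**{
                    field_name: hl.float(value),
                    field_name + " lower bound": hl.float(bounds[0]),
                    field_name + " upper bound": hl.float(bounds[1]),
                })))

    # yields "Odds Ratio" = 1.21, lower bound = 1.05, upper bound = 1.40
    ds = ds.annotate(parsed=_parse("Odds Ratio"))
    ds.show()
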
def add_global_af(ht: hl.Table, temp: str) -> hl.Table:
    '''
    Adds gnomAD global AF annotation to Table

    :param Table ht: Input Table
    :param str temp: Path to temp bucket (to store intermediary files)
    :return: Table with gnomAD global AF annotation
    :rtype: Table
    '''
    # checkpoint table after completing both gnomAD exomes and gnomAD genomes join
    temp_path = f'{temp}/join.ht'
    ht = ht.checkpoint(temp_path)

    # set gnomAD ACs and ANs to 0 if they are missing after the join
    ht = ht.transmute(
        gnomad_exomes_AC=hl.if_else(hl.is_defined(ht.gnomad_exomes_AC), ht.gnomad_exomes_AC, 0),
        gnomad_genomes_AC=hl.if_else(hl.is_defined(ht.gnomad_genomes_AC), ht.gnomad_genomes_AC, 0),
        gnomad_exomes_AN=hl.if_else(hl.is_defined(ht.gnomad_exomes_AN), ht.gnomad_exomes_AN, 0),
        gnomad_genomes_AN=hl.if_else(hl.is_defined(ht.gnomad_genomes_AN), ht.gnomad_genomes_AN, 0),
    )

    ht = ht.annotate(gnomad_global_AF=(
        hl.if_else(((ht.gnomad_exomes_AN == 0) & (ht.gnomad_genomes_AN == 0)),
                   0.0,
                   hl.float((ht.gnomad_exomes_AC + ht.gnomad_genomes_AC) /
                            (ht.gnomad_exomes_AN + ht.gnomad_genomes_AN)))))
    ht.describe()
    return ht

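# A minimal usage sketch for add_global_af (paths and the upstream join are
# hypothetical): the input Table is assumed to already carry gnomad_exomes_AC/AN and
# gnomad_genomes_AC/AN fields from a prior join against gnomAD release tables.
def example_add_global_af():
    ht = hl.read_table('gs://my-bucket/variants_with_gnomad_join.ht')  # hypothetical path
    ht = add_global_af(ht, temp='gs://my-temp-bucket')  # hypothetical temp bucket
    ht.select('gnomad_global_AF').show(5)
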
def annotate_meta(mt, meta, key_by):
    '''
    Annotates a matrix table with the following meta data: collaborator participant id,
    site id, and autocall call rate. Assumes that meta and mt don't have the same key,
    keys meta by chip_well_barcode

    :param mt: hail matrix table
    :param meta: table of sample meta data containing sample IDs, reported sex information, and IDs specific to the site
    :param key_by: field to key meta by (chip_well_barcode)
    :return: annotated and filtered matrix table
    '''
    # Setting the key of metaDat as chip_well_barcode
    meta = meta.key_by(key_by)
    # Adding the reported sex column from the sample data table to the matrix table
    mt = mt.annotate_cols(reported_sex=meta[mt.s].reported_gender)
    # Annotating the matrix table with the projID from meta data
    mt = mt.annotate_cols(collab_PID=meta[mt.s].collaborator_participant_id)
    # Creating a new column in mt for siteID which is the 3 letters from the collab participant id
    mt = mt.annotate_cols(siteID=mt.col.collab_PID[0:3])
    # Switching lowercase letters to capital in collabPID and siteID
    mt = mt.annotate_cols(collab_PID=mt.collab_PID.upper())
    mt = mt.annotate_cols(siteID=mt.siteID.upper())
    # Annotating mt with autocall call rate
    mt = mt.annotate_cols(autocall_call_rate=meta[mt.s].autocall_call_rate)
    # Filtering out individuals with an autocall call rate below .95
    mt = mt.filter_cols(hl.float(mt.autocall_call_rate) < .95, keep=False)
    # Filtering NA12878 individual from the matrix table
    samples_to_remove = {'NA12878'}
    set_to_remove = hl.literal(samples_to_remove)
    return mt.filter_cols(~set_to_remove.contains(mt['collab_PID']))

def test_memory_issue_from_9009():
    mt = hl.utils.range_matrix_table(1000, 1, n_partitions=1)
    mt = mt.annotate_entries(x=hl.float(mt.row_idx * mt.col_idx))
    mt = mt.annotate_rows(big=hl.zeros(100_000_000))
    try:
        hl.linalg.BlockMatrix.write_from_entry_expr(mt.x, new_temp_file(), overwrite=True)
    except Exception:
        assert False

def test_lambda_gc_nans(self):
    N = 5000000
    ht = hl.utils.range_table(N).annotate(x=hl.scan.count() / N,
                                          is_even=hl.scan.count() % 2 == 0)
    lgc_nan = hl.lambda_gc(hl.case().when(ht.is_even, hl.float('nan')).default(ht.x))
    self.assertAlmostEqual(lgc_nan, 1, places=1)  # approximate, 1 place is safe

def test_king_homo_estimator():
    hl.set_global_seed(1)
    mt = hl.balding_nichols_model(2, 5, 5)
    mt = mt.select_entries(genotype_score=hl.float(mt.GT.n_alt_alleles()))
    da = hl.experimental.dnd.array(mt, 'genotype_score', block_size=3)

    def sqr(x):
        return x * x

    score_difference = da.T.inner_product(
        da,
        lambda l, r: sqr(l - r),
        lambda l, r: l + r,
        hl.float(0),
        hl.agg.sum).checkpoint(new_temp_file())
    assert np.array_equal(
        score_difference.collect(),
        np.array([[0., 6., 4., 2., 4.],
                  [6., 0., 6., 4., 6.],
                  [4., 6., 0., 6., 0.],
                  [2., 4., 6., 0., 6.],
                  [4., 6., 0., 6., 0.]]))

def import_vqsr(
    vqsr_path: str,
    vqsr_type: str = "alleleSpecificTrans",
    num_partitions: int = 5000,
    overwrite: bool = False,
    import_header_path: Optional[str] = None,
) -> None:
    """
    Imports vqsr site vcf into a HT

    :param vqsr_path: Path to input vqsr site vcf. This can be specified as Hadoop glob patterns
    :param vqsr_type: One of `classic`, `alleleSpecific` (allele specific) or `alleleSpecificTrans` (allele specific with transmitted singletons)
    :param num_partitions: Number of partitions to use for the VQSR HT
    :param overwrite: Whether to overwrite imported VQSR HT
    :param import_header_path: Optional path to a header file to use for import
    :return: None
    """
    logger.info(f"Importing VQSR annotations for {vqsr_type} VQSR...")
    mt = hl.import_vcf(
        vqsr_path,
        force_bgz=True,
        reference_genome="GRCh38",
        header_file=import_header_path,
    ).repartition(num_partitions)

    ht = mt.rows()
    ht = ht.annotate(info=ht.info.annotate(
        AS_VQSLOD=ht.info.AS_VQSLOD.map(lambda x: hl.float(x)),
        AS_QUALapprox=ht.info.AS_QUALapprox.split(r"\|")[1:].map(lambda x: hl.int(x)),
        AS_VarDP=ht.info.AS_VarDP.split(r"\|")[1:].map(lambda x: hl.int(x)),
        AS_SB_TABLE=ht.info.AS_SB_TABLE.split(r"\|").map(
            lambda x: x.split(",").map(lambda y: hl.int(y))),
    ))

    ht = ht.checkpoint(
        get_vqsr_filters(f"vqsr_{vqsr_type}", split=False, finalized=False).path,
        overwrite=overwrite,
    )
    unsplit_count = ht.count()

    ht = hl.split_multi_hts(ht)
    ht = ht.annotate(
        info=ht.info.annotate(**split_info_annotation(ht.info, ht.a_index)),
    )
    ht = ht.checkpoint(
        get_vqsr_filters(f"vqsr_{vqsr_type}", split=True, finalized=False).path,
        overwrite=overwrite,
    )
    split_count = ht.count()
    logger.info(
        f"Found {unsplit_count} unsplit and {split_count} split variants with VQSR annotations"
    )

def test_medium_collect():
    n_variants = 100
    n_samples = 100
    block_size = 32
    mt = hl.balding_nichols_model(n_populations=2,
                                  n_variants=n_variants,
                                  n_samples=n_samples)
    mt = mt.select_entries(dosage=hl.float(mt.GT.n_alt_alleles()))
    da = hl.experimental.dnd.array(mt, 'dosage', block_size=block_size)
    a = np.array(mt.dosage.collect()).reshape(n_variants, n_samples)
    assert np.array_equal(da.collect(), a)

def variant_qc_aggregator(mt) -> hl.MatrixTable:
    """:func:`.variant_qc` as an aggregator."""
    bound_exprs = {}
    gq_dp_exprs = {}

    def has_field_of_type(name, dtype):
        return name in mt.entry and mt[name].dtype == dtype

    if has_field_of_type('DP', hl.tint32):
        gq_dp_exprs['dp_stats'] = hl.agg.stats(mt.DP).select('mean', 'stdev', 'min', 'max')
    if has_field_of_type('GQ', hl.tint32):
        gq_dp_exprs['gq_stats'] = hl.agg.stats(mt.GQ).select('mean', 'stdev', 'min', 'max')
    if not has_field_of_type('GT', hl.tcall):
        raise ValueError("'variant_qc': expect an entry field 'GT' of type 'call'")

    bound_exprs['n_called'] = hl.agg.count_where(hl.is_defined(mt['GT']))
    bound_exprs['n_not_called'] = hl.agg.count_where(hl.is_missing(mt['GT']))
    n_cols = hl.agg.count()
    bound_exprs['n_filtered'] = hl.int64(n_cols) - hl.agg.count()
    bound_exprs['call_stats'] = hl.agg.call_stats(mt.GT, mt.alleles)

    return hl.rbind(
        hl.struct(**bound_exprs),
        lambda e1: hl.rbind(
            hl.case().when(
                hl.len(mt.alleles) == 2,
                hl.hardy_weinberg_test(
                    e1.call_stats.homozygote_count[0],
                    e1.call_stats.AC[1] - 2 * e1.call_stats.homozygote_count[1],
                    e1.call_stats.homozygote_count[1])).or_missing(),
            lambda hwe: hl.struct(**{
                **gq_dp_exprs,
                **e1.call_stats,
                'call_rate': hl.float(e1.n_called) / (e1.n_called + e1.n_not_called + e1.n_filtered),
                'n_called': e1.n_called,
                'n_not_called': e1.n_not_called,
                'n_filtered': e1.n_filtered,
                'n_het': e1.n_called - hl.sum(e1.call_stats.homozygote_count),
                'n_non_ref': e1.n_called - e1.call_stats.homozygote_count[0],
                'het_freq_hwe': hwe.het_freq_hwe,
                'p_value_hwe': hwe.p_value
            })))

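# A minimal usage sketch for variant_qc_aggregator: because it returns the QC struct as
# an aggregation expression instead of annotating in place, it can be dropped directly
# into a row-wise annotation. The dataset below is a stand-in, not from the original code.
def example_variant_qc_aggregator():
    mt = hl.balding_nichols_model(n_populations=2, n_variants=100, n_samples=10)
    mt = mt.annotate_rows(variant_qc=variant_qc_aggregator(mt))
    return mt.rows()
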
def cast_str(ht: hl.Table, field_names: list, output_type: str) -> hl.Table:
    """
    Casts string fields to numeric types, replacing values without any digit with 0.

    :param ht: Input Table
    :param field_names: List of string field names to cast
    :param output_type: Target type, either 'float' or 'int'
    :return: Table with the given fields cast to the requested type
    """
    if output_type == 'float':
        ht = (ht.transmute(
            **{
                k: hl.cond(~ht[k].matches(r'\p{Digit}'), hl.float(0), hl.float(ht[k]))
                for k in field_names
            }))
    if output_type == 'int':
        ht = (ht.transmute(
            **{
                k: hl.cond(~ht[k].matches(r'\p{Digit}'), hl.int(0), hl.int(ht[k]))
                for k in field_names
            }))
    return ht

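# A minimal usage sketch for cast_str (table and field names are hypothetical): string
# fields that contain no digit are replaced with 0 before casting.
def example_cast_str():
    ht = hl.Table.parallelize(
        [{'ac': '12', 'af': 'NA'}],
        hl.tstruct(ac=hl.tstr, af=hl.tstr))
    ht = cast_str(ht, ['ac'], 'int')    # 'ac' becomes 12
    ht = cast_str(ht, ['af'], 'float')  # 'NA' has no digit, so 'af' becomes 0.0
    ht.show()
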
def join_mitochondria_vcfs_into_mt(input_tsv: str, output_bucket: str, chunk_size: int = 100) -> hl.MatrixTable:
    """Reformats and joins individual mitochondrial vcfs into one MatrixTable

    :param str input_tsv: path to a tsv listing, per line, a sample name and the path to its vcf
    :param str output_bucket: path to bucket to which results should be written
    :param int chunk_size: number of MatrixTables to join per chunk
    :return: joined MatrixTable of the samples listed in input_tsv
    :rtype: hl.MatrixTable
    """
    mt_list = []
    with open(input_tsv, "r") as f:
        for line in f:
            line = line.rstrip()
            items = line.split("\t")
            sample, vcf_path = items[0:2]
            mt = hl.import_vcf(vcf_path, reference_genome="GRCh38")
            # because the vcfs are split, there is only one AF value, although misinterpreted as an array because Number=A in vcf header
            # second value of MMQ is the value for the alternate allele
            mt = mt.select_entries("DP", HL=mt.AF[0])
            mt = mt.annotate_entries(
                MQ=hl.float(mt.info["MMQ"][1]),
                TLOD=mt.info["TLOD"][0],
                FT=hl.if_else(hl.len(mt.filters) == 0, {"PASS"}, mt.filters))
            # use GRCh37 as reference as this is more compatible with mitochondria resources that may be added as annotations in downstream scripts
            mt = mt.key_rows_by(
                locus=hl.locus("MT", mt.locus.position, reference_genome="GRCh37"),
                alleles=mt.alleles,
            )
            mt = mt.key_cols_by(s=sample)
            mt = mt.select_rows()
            mt_list.append(mt)

    temp_out_dir = output_bucket + "/temp"
    combined_mt = multi_way_union_mts(mt_list, temp_out_dir, chunk_size)
    return combined_mt

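# The multi_way_union_mts helper called above is not defined in this file. Below is a
# minimal, hypothetical sketch of what a chunked column-wise union with checkpointed
# intermediates can look like; the real helper (e.g. in the gnomAD mitochondria
# pipeline) merges entries differently and more efficiently, so treat this only as an
# illustration of the chunking idea.
def multi_way_union_mts_sketch(mts, temp_dir, chunk_size):
    stage = list(mts)
    round_idx = 0
    while len(stage) > 1:
        next_stage = []
        for i in range(0, len(stage), chunk_size):
            chunk = stage[i:i + chunk_size]
            merged = chunk[0]
            for other in chunk[1:]:
                merged = merged.union_cols(other, row_join_type="outer")
            # checkpoint each merged chunk so later rounds read from disk
            merged = merged.checkpoint(f"{temp_dir}/round{round_idx}_chunk{i}.mt", overwrite=True)
            next_stage.append(merged)
        stage = next_stage
        round_idx += 1
    return stage[0]
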
def test_matmul_via_inner_product():
    n_variants = 10
    n_samples = 10
    block_size = 3
    n_blocks = 16
    mt = hl.utils.range_matrix_table(n_variants, n_samples)
    mt = mt.select_entries(x=mt.row_idx * mt.col_idx)
    da = hl.experimental.dnd.array(mt, 'x', block_size=block_size)
    prod = (da @ da.T).checkpoint(new_temp_file())
    assert prod._force_count_blocks() == n_blocks
    prod_result = prod.collect()
    ip_result = da.inner_product(da.T,
                                 lambda l, r: l * r,
                                 lambda l, r: l + r,
                                 hl.float(0.0),
                                 lambda prod: hl.agg.sum(prod)).collect()
    assert np.array_equal(prod_result, ip_result)

def test_medium_matmul():
    n_variants = 100
    n_samples = 100
    block_size = 32
    n_blocks = 16
    mt = hl.balding_nichols_model(n_populations=2,
                                  n_variants=n_variants,
                                  n_samples=n_samples)
    mt = mt.select_entries(dosage=hl.float(mt.GT.n_alt_alleles()))
    da = hl.experimental.dnd.array(mt, 'dosage', block_size=block_size)
    da = (da @ da.T).checkpoint(new_temp_file())
    assert da._force_count_blocks() == n_blocks
    da_result = da.collect().reshape(n_variants, n_variants)
    a = np.array(mt.dosage.collect()).reshape(n_variants, n_samples)
    a_result = a @ a.T
    assert np.array_equal(da_result, a_result)

def _genotype_fields(self):
    # Convert the mt genotype entries into num_alt, gq, ab, dp, and sample_id.
    is_called = hl.is_defined(self.mt.GT)
    return {
        'num_alt': hl.cond(is_called, self.mt.GT.n_alt_alleles(), -1),
        'gq': hl.cond(is_called, self.mt.GQ, hl.null(hl.tint)),
        'ab': hl.bind(
            lambda total: hl.cond(
                (is_called) & (total != 0) & (hl.len(self.mt.AD) > 1),
                hl.float(self.mt.AD[1] / total),
                hl.null(hl.tfloat)),
            hl.sum(self.mt.AD)),
        'dp': hl.cond(is_called, hl.int(hl.min(self.mt.DP, 32000)), hl.null(hl.tfloat)),
        'sample_id': self.mt.s
    }

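# Hypothetical usage sketch for _genotype_fields: it is written as a method on an
# exporter-style class that holds `self.mt`, and the returned dict is typically spread
# into a struct that is collected per row. The class and call pattern here are
# assumptions, not part of the original code.
def example_collect_genotypes(exporter):
    mt = exporter.mt
    mt = mt.annotate_rows(genotypes=hl.agg.collect(hl.struct(**exporter._genotype_fields())))
    return mt.rows()
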
def ld_score(entry_expr,
             locus_expr,
             radius,
             coord_expr=None,
             annotation_exprs=None,
             block_size=None) -> Table:
    """Calculate LD scores.

    Example
    -------

    >>> # Load genetic data into MatrixTable
    >>> mt = hl.import_plink(bed='data/ldsc.bed',
    ...                      bim='data/ldsc.bim',
    ...                      fam='data/ldsc.fam')

    >>> # Create locus-keyed Table with numeric variant annotations
    >>> ht = hl.import_table('data/ldsc.annot',
    ...                      types={'BP': hl.tint,
    ...                             'binary': hl.tfloat,
    ...                             'continuous': hl.tfloat})
    >>> ht = ht.annotate(locus=hl.locus(ht.CHR, ht.BP))
    >>> ht = ht.key_by('locus')

    >>> # Annotate MatrixTable with external annotations
    >>> mt = mt.annotate_rows(binary_annotation=ht[mt.locus].binary,
    ...                       continuous_annotation=ht[mt.locus].continuous)

    >>> # Calculate LD scores using centimorgan coordinates
    >>> ht_scores = hl.experimental.ld_score(entry_expr=mt.GT.n_alt_alleles(),
    ...                                      locus_expr=mt.locus,
    ...                                      radius=1.0,
    ...                                      coord_expr=mt.cm_position,
    ...                                      annotation_exprs=[mt.binary_annotation,
    ...                                                        mt.continuous_annotation])

    >>> # Show results
    >>> ht_scores.show(3)

    .. code-block:: text

        +---------------+-------------------+-----------------------+-------------+
        | locus         | binary_annotation | continuous_annotation |  univariate |
        +---------------+-------------------+-----------------------+-------------+
        | locus<GRCh37> |           float64 |               float64 |     float64 |
        +---------------+-------------------+-----------------------+-------------+
        | 20:82079      |       1.15183e+00 |           7.30145e+01 | 1.60117e+00 |
        | 20:103517     |       2.04604e+00 |           2.75392e+02 | 4.69239e+00 |
        | 20:108286     |       2.06585e+00 |           2.86453e+02 | 5.00124e+00 |
        +---------------+-------------------+-----------------------+-------------+

    Warning
    -------
    :func:`.ld_score` will fail if ``entry_expr`` results in any missing values.
    The special float value ``nan`` is not considered a missing value.

    **Further reading**

    For more in-depth discussion of LD scores, see:

    - `LD Score regression distinguishes confounding from polygenicity in genome-wide association studies (Bulik-Sullivan et al, 2015) <https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4495769/>`__
    - `Partitioning heritability by functional annotation using genome-wide association summary statistics (Finucane et al, 2015) <https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4626285/>`__

    Notes
    -----
    `entry_expr`, `locus_expr`, `coord_expr` (if specified), and
    `annotation_exprs` (if specified) must come from the same MatrixTable.

    Parameters
    ----------
    entry_expr : :class:`.NumericExpression`
        Expression for entries of genotype matrix
        (e.g. ``mt.GT.n_alt_alleles()``).
    locus_expr : :class:`.LocusExpression`
        Row-indexed locus expression.
    radius : :obj:`int` or :obj:`float`
        Radius of window for row values (in units of `coord_expr` if set,
        otherwise in units of basepairs).
    coord_expr: :class:`.Float64Expression`, optional
        Row-indexed numeric expression for the row value used to window
        variants. By default, the row value is given by the locus position.
    annotation_exprs : :class:`.NumericExpression` or :obj:`list` of :class:`.NumericExpression`, optional
        Annotation expression(s) to partition LD scores. Univariate
        annotation will always be included and does not need to be
        specified.
    block_size : :obj:`int`, optional
        Block size. Default given by :meth:`.BlockMatrix.default_block_size`.

    Returns
    -------
    :class:`.Table`
        Table keyed by `locus_expr` with LD scores for each variant and
        `annotation_expr`. The function will always return LD scores for
        the univariate (all SNPs) annotation."""

    mt = entry_expr._indices.source
    mt_locus_expr = locus_expr._indices.source

    if coord_expr is None:
        mt_coord_expr = mt_locus_expr
    else:
        mt_coord_expr = coord_expr._indices.source

    if not annotation_exprs:
        check_mts = all([mt == mt_locus_expr,
                         mt == mt_coord_expr])
    else:
        check_mts = all([mt == mt_locus_expr,
                         mt == mt_coord_expr] +
                        [mt == x._indices.source
                         for x in wrap_to_list(annotation_exprs)])

    if not check_mts:
        raise ValueError("""ld_score: entry_expr, locus_expr, coord_expr
                            (if specified), and annotation_exprs (if
                            specified) must come from same MatrixTable.""")

    n = mt.count_cols()
    r2 = hl.row_correlation(entry_expr, block_size) ** 2
    r2_adj = ((n - 1.0) / (n - 2.0)) * r2 - (1.0 / (n - 2.0))

    starts, stops = hl.linalg.utils.locus_windows(locus_expr, radius, coord_expr)
    r2_adj_sparse = r2_adj.sparsify_row_intervals(starts, stops)

    r2_adj_sparse_tmp = new_temp_file()
    r2_adj_sparse.write(r2_adj_sparse_tmp)
    r2_adj_sparse = BlockMatrix.read(r2_adj_sparse_tmp)

    if not annotation_exprs:
        cols = ['univariate']
        col_idxs = {0: 'univariate'}
        l2 = r2_adj_sparse.sum(axis=1)
    else:
        ht = mt.select_rows(*wrap_to_list(annotation_exprs)).rows()
        ht = ht.annotate(univariate=hl.literal(1.0))
        names = [name for name in ht.row if name not in ht.key]

        ht_union = hl.Table.union(
            *[(ht.annotate(name=hl.str(x), value=hl.float(ht[x]))
                 .select('name', 'value')) for x in names])
        mt_annotations = ht_union.to_matrix_table(
            row_key=list(ht_union.key),
            col_key=['name'])

        cols = mt_annotations.key_cols_by()['name'].collect()
        col_idxs = {i: cols[i] for i in range(len(cols))}

        a_tmp = new_temp_file()
        BlockMatrix.write_from_entry_expr(mt_annotations.value, a_tmp)

        a = BlockMatrix.read(a_tmp)
        l2 = r2_adj_sparse @ a

    l2_bm_tmp = new_temp_file()
    l2_tsv_tmp = new_temp_file()
    l2.write(l2_bm_tmp, force_row_major=True)
    BlockMatrix.export(l2_bm_tmp, l2_tsv_tmp)

    ht_scores = hl.import_table(l2_tsv_tmp, no_header=True, impute=True)
    ht_scores = ht_scores.add_index()
    ht_scores = ht_scores.key_by('idx')
    ht_scores = ht_scores.rename({'f{:}'.format(i): col_idxs[i]
                                  for i in range(len(cols))})

    ht = mt.select_rows(__locus=locus_expr).rows()
    ht = ht.add_index()
    ht = ht.annotate(**ht_scores[ht.idx])
    ht = ht.key_by('__locus')
    ht = ht.select(*[x for x in ht_scores.row if x not in ht_scores.key])
    ht = ht.rename({'__locus': 'locus'})

    return ht

def main(args):
    ########################################################################
    ### initialize
    print('Getting started: ' + datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
    # 1. Read in summary stats data
    # 2. Annotate matrix table with effect sizes for each phenotype
    # 3. Compute PRS for each
    start = time.time()
    pheno_gwas = hl.import_table(f'gs://apcdr/pheno_code_ukb_code.txt')
    pheno_ss = dict([(x.pheno_code, x.ukb_code) for x in pheno_gwas.collect()])
    #pheno_ss = dict([(x.ss_code, x.pheno_code) for x in pheno_gwas.collect()])
    # mt = hl.read_matrix_table('gs://apcdr/prs_sumstats_clumps/ukb_holdout/ukb31063.gwas_holdout_sumstats_pheno37_subset.mt')
    mt = hl.read_matrix_table('gs://apcdr/dosage_bgen/apcdr.mt')
    ss_keys = dict(
        zip(['CHR', 'POS', 'REF', 'ALT', 'P', 'BETA'],
            args.chr_pos_ref_alt_p_beta.split(',')))

    for pheno in list(pheno_ss.keys()):
    #for pheno in ['WHR']:
        print('Pheno: ' + pheno + ', Time: ' + datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
        suffix_replace = args.ss_suffix.split('.')
        suffix_replace[-2] = 'clumped'
        suffix_replace = '.'.join(suffix_replace)
        if hl.hadoop_exists(args.ss_clump_prefix + pheno + suffix_replace):
            ss_path = args.ss_clump_prefix + pheno + args.ss_suffix
            clump_path = args.ss_clump_prefix + pheno + suffix_replace
        elif hl.hadoop_exists(args.ss_clump_prefix + pheno_ss[pheno] + suffix_replace):
            ss_path = args.ss_clump_prefix + pheno_ss[pheno] + args.ss_suffix
            clump_path = args.ss_clump_prefix + pheno_ss[pheno] + suffix_replace
        else:
            continue

        ss = hl.import_table(ss_path, impute=True, delimiter='\s+', min_partitions=1000)
        ss = ss.annotate(locus=hl.locus(hl.str(ss[ss_keys['CHR']]), ss[ss_keys['POS']]),
                         alleles=[ss[ss_keys['REF']], ss[ss_keys['ALT']]])
        ss = ss.key_by(ss.locus, ss.alleles)

        ## Read in summary statistics and true phenotypes
        mt_annot = mt.annotate_rows(ss=ss[mt.locus, mt.alleles])  # come back to this

        # ht_samples = hl.import_table('gs://apcdr/ukb_holdout/ukb31063.gwas_samples.gwas_vs_holdout.txt',
        #                              types={'s': hl.tstr}, key='s')
        # ht_samples = hl.import_table('gs://apcdr/ukb_holdout/ukb31063.gwas_samples.holdout_and_target.txt',
        #                              types={'s': hl.tstr}, key='s')
        #
        # mt_annot = mt_annot.filter_cols(hl.or_else(ht_samples[mt_annot.s].in_gwas != 'TRUE', True))
        # mt_annot = mt_annot.filter_cols(hl.is_defined(ht_samples[mt_annot.s]))
        #
        # print(mt.count())  # 13364303, 136265)

        print('Starting ' + pheno + ': ' + datetime.now().strftime("%Y-%m-%d %H:%M:%S"))

        p_max = {
            's1': 5e-8,
            's2': 1e-6,
            's3': 1e-4,
            's4': 1e-3,
            's5': 1e-2,
            's6': .05,
            's7': .1,
            's8': .2,
            's9': .5,
            's10': 1
        }

        pheno_clump = specific_clumps(clump_path)

        mt_annot = mt_annot.filter_rows(pheno_clump.get(mt_annot.locus, False))
        # print(mt.count())

        annot_expr = {
            k: hl.agg.sum(
                hl.float(mt_annot.ss[ss_keys['BETA']]) * mt_annot.dosage *
                hl.int(mt_annot.ss[ss_keys['P']] < v))
            for k, v in p_max.items()
        }

        mt_annot = mt_annot.annotate_cols(**annot_expr)

        ht_out = mt_annot.cols()
        #ht_out.describe()
        #covs = hl.read_table('gs://apcdr/ukb_holdout/uk_round2_allSamples_phenos_phesant.ht').select('age', 'sex')  # added
        # need to add in PCs
        #ht_out = ht_out.annotate(**covs[ht_out.key])
        ht_comb = ht_out.select(*p_max.keys(),
                                age=ht_out.phenotypes.age,
                                sex=ht_out.phenotypes.sex,
                                pheno=ht_out.phenotypes[pheno])

        output_location = args.ss_clump_prefix + pheno + '_apcdr_PRS'
        #ht_comb.describe()
        #ht_comb.write(output_location + '.ht', overwrite=args.overwrite)
        #ht_comb = hl.read_table(output_location + '.ht')
        ht_comb.export(output_location + '.txt.bgz')

    end = time.time()
    print("Success! Job was completed in %s" % time.strftime("%H:%M:%S", time.gmtime(end - start)))

def create_binned_data_initial(ht: hl.Table, data: str, data_type: str, n_bins: int) -> hl.Table:
    # Count variants for ranking
    count_expr = {
        x: hl.agg.filter(
            hl.is_defined(ht[x]),
            hl.agg.counter(
                hl.cond(hl.is_snp(ht.alleles[0], ht.alleles[1]), 'snv', 'indel')))
        for x in ht.row if x.endswith('rank')
    }
    rank_variant_counts = ht.aggregate(hl.Struct(**count_expr))
    logger.info(
        f"Found the following variant counts:\n {pformat(rank_variant_counts)}")
    ht_truth_data = hl.read_table(
        f"{temp_dir}/ddd-elgh-ukbb/variant_qc/truthset_table.ht")
    ht = ht.annotate_globals(rank_variant_counts=rank_variant_counts)

    ht = ht.annotate(
        **ht_truth_data[ht.key],
        # **fam_ht[ht.key],
        # **gnomad_ht[ht.key],
        # **denovo_ht[ht.key],
        # clinvar=hl.is_defined(clinvar_ht[ht.key]),
        indel_length=hl.abs(ht.alleles[0].length() - ht.alleles[1].length()),
        rank_bins=hl.array(
            [hl.Struct(
                rank_id=rank_name,
                bin=hl.int(hl.ceil(hl.float(ht[rank_name] + 1) / hl.floor(
                    ht.globals.rank_variant_counts[rank_name][hl.cond(
                        hl.is_snp(ht.alleles[0], ht.alleles[1]), 'snv', 'indel')] / n_bins)))
            ) for rank_name in rank_variant_counts]
        ),
        # lcr=hl.is_defined(lcr_intervals[ht.locus])
    )
    ht = ht.explode(ht.rank_bins)
    ht = ht.transmute(
        rank_id=ht.rank_bins.rank_id,
        bin=ht.rank_bins.bin
    )
    ht = ht.filter(hl.is_defined(ht.bin))
    ht = ht.checkpoint(
        f'{tmp_dir}/gnomad_score_binning_tmp.ht', overwrite=True)

    # Create binned data
    return (
        ht
        .group_by(
            rank_id=ht.rank_id,
            contig=ht.locus.contig,
            snv=hl.is_snp(ht.alleles[0], ht.alleles[1]),
            bi_allelic=hl.is_defined(ht.biallelic_rank),
            singleton=ht.transmitted_singleton,
            trans_singletons=hl.is_defined(ht.singleton_rank),
            de_novo_high_quality=ht.de_novo_high_quality_rank,
            de_novo_medium_quality=hl.is_defined(ht.de_novo_medium_quality_rank),
            de_novo_synonymous=hl.is_defined(ht.de_novo_synonymous_rank),
            # release_adj=ht.ac > 0,
            bin=ht.bin
        )._set_buffer_size(20000)
        .aggregate(
            min_score=hl.agg.min(ht.score),
            max_score=hl.agg.max(ht.score),
            n=hl.agg.count(),
            n_ins=hl.agg.count_where(hl.is_insertion(ht.alleles[0], ht.alleles[1])),
            n_del=hl.agg.count_where(hl.is_deletion(ht.alleles[0], ht.alleles[1])),
            n_ti=hl.agg.count_where(hl.is_transition(ht.alleles[0], ht.alleles[1])),
            n_tv=hl.agg.count_where(hl.is_transversion(ht.alleles[0], ht.alleles[1])),
            n_1bp_indel=hl.agg.count_where(ht.indel_length == 1),
            n_mod3bp_indel=hl.agg.count_where((ht.indel_length % 3) == 0),
            # n_clinvar=hl.agg.count_where(ht.clinvar),
            n_singleton=hl.agg.count_where(ht.transmitted_singleton),
            n_high_quality_de_novos=hl.agg.count_where(
                ht.de_novo_data.p_de_novo[0] > 0.99),
            n_validated_DDD_denovos=hl.agg.count_where(
                ht.inheritance.contains("De novo")),
            n_medium_quality_de_novos=hl.agg.count_where(
                ht.de_novo_data.p_de_novo[0] > 0.5),
            n_high_confidence_de_novos=hl.agg.count_where(
                ht.de_novo_data.confidence[0] == 'HIGH'),
            n_de_novo=hl.agg.filter(
                ht.family_stats.unrelated_qc_callstats.AC[0][1] == 0,
                hl.agg.sum(ht.family_stats.mendel[0].errors)),
            n_high_quality_de_novos_synonymous=hl.agg.count_where(
                (ht.de_novo_data.p_de_novo[0] > 0.99) & (ht.consequence == "synonymous_variant")),
            # n_de_novo_no_lcr=hl.agg.filter(~ht.lcr & (
            #     ht.family_stats.unrelated_qc_callstats.AC[1] == 0), hl.agg.sum(ht.family_stats.mendel.errors)),
            n_de_novo_sites=hl.agg.filter(
                ht.family_stats.unrelated_qc_callstats.AC[0][1] == 0,
                hl.agg.count_where(ht.family_stats.mendel[0].errors > 0)),
            # n_de_novo_sites_no_lcr=hl.agg.filter(~ht.lcr & (
            #     ht.family_stats.unrelated_qc_callstats.AC[1] == 0), hl.agg.count_where(ht.family_stats.mendel.errors > 0)),
            n_trans_singletons=hl.agg.filter(
                (ht.ac_raw < 3) & (ht.family_stats.unrelated_qc_callstats.AC[0][1] == 1),
                hl.agg.sum(ht.family_stats.tdt[0].t)),
            n_trans_singletons_synonymous=hl.agg.filter(
                (ht.ac_raw < 3) & (ht.consequence == "synonymous_variant") &
                (ht.family_stats.unrelated_qc_callstats.AC[0][1] == 1),
                hl.agg.sum(ht.family_stats.tdt[0].t)),
            n_untrans_singletons=hl.agg.filter(
                (ht.ac_raw < 3) & (ht.family_stats.unrelated_qc_callstats.AC[0][1] == 1),
                hl.agg.sum(ht.family_stats.tdt[0].u)),
            n_untrans_singletons_synonymous=hl.agg.filter(
                (ht.ac_raw < 3) & (ht.consequence == "synonymous_variant") &
                (ht.family_stats.unrelated_qc_callstats.AC[0][1] == 1),
                hl.agg.sum(ht.family_stats.tdt[0].u)),
            n_train_trans_singletons=hl.agg.count_where(
                (ht.family_stats.unrelated_qc_callstats.AC[0][1] == 1) &
                (ht.family_stats.tdt[0].t == 1)),
            n_omni=hl.agg.count_where(ht.omni),
            n_mills=hl.agg.count_where(ht.mills),
            n_hapmap=hl.agg.count_where(ht.hapmap),
            n_kgp_high_conf_snvs=hl.agg.count_where(ht.kgp_phase1_hc),
            fail_hard_filters=hl.agg.count_where(ht.fail_hard_filters),
            # n_vqsr_pos_train=hl.agg.count_where(ht.vqsr_positive_train_site),
            # n_vqsr_neg_train=hl.agg.count_where(ht.vqsr_negative_train_site)
        )
    )

def variant_qc(mt, name='variant_qc') -> MatrixTable:
    """Compute common variant statistics (quality control metrics).

    .. include:: ../_templates/req_tvariant.rst

    Examples
    --------

    >>> dataset_result = hl.variant_qc(dataset)

    Notes
    -----
    This method computes variant statistics from the genotype data, returning
    a new struct field `name` with the following metrics based on the fields
    present in the entry schema.

    If `mt` contains an entry field `DP` of type :py:data:`.tint32`, then the
    field `dp_stats` is computed. If `mt` contains an entry field `GQ` of type
    :py:data:`.tint32`, then the field `gq_stats` is computed. Both `dp_stats`
    and `gq_stats` are structs with four fields:

    - `mean` (``float64``) -- Mean value.
    - `stdev` (``float64``) -- Standard deviation (zero degrees of freedom).
    - `min` (``int32``) -- Minimum value.
    - `max` (``int32``) -- Maximum value.

    If the dataset does not contain an entry field `GT` of type
    :py:data:`.tcall`, then an error is raised. The following fields are
    always computed from `GT`:

    - `AF` (``array<float64>``) -- Calculated allele frequency, one element
      per allele, including the reference. Sums to one. Equivalent to
      `AC` / `AN`.
    - `AC` (``array<int32>``) -- Calculated allele count, one element per
      allele, including the reference. Sums to `AN`.
    - `AN` (``int32``) -- Total number of called alleles.
    - `homozygote_count` (``array<int32>``) -- Number of homozygotes per
      allele. One element per allele, including the reference.
    - `call_rate` (``float64``) -- Fraction of calls neither missing nor
      filtered. Equivalent to `n_called` / :meth:`.count_cols`.
    - `n_called` (``int64``) -- Number of samples with a defined `GT`.
    - `n_not_called` (``int64``) -- Number of samples with a missing `GT`.
    - `n_filtered` (``int64``) -- Number of filtered entries.
    - `n_het` (``int64``) -- Number of heterozygous samples.
    - `n_non_ref` (``int64``) -- Number of samples with at least one called
      non-reference allele.
    - `het_freq_hwe` (``float64``) -- Expected frequency of heterozygous
      samples under Hardy-Weinberg equilibrium. See
      :func:`.functions.hardy_weinberg_test` for details.
    - `p_value_hwe` (``float64``) -- p-value from test of Hardy-Weinberg
      equilibrium. See :func:`.functions.hardy_weinberg_test` for details.

    Warning
    -------
    `het_freq_hwe` and `p_value_hwe` are calculated as in
    :func:`.functions.hardy_weinberg_test`, with non-diploid calls
    (``ploidy != 2``) ignored in the counts. As this test is only
    statistically rigorous in the biallelic setting, :func:`.variant_qc`
    sets both fields to missing for multiallelic variants. Consider using
    :func:`~hail.methods.split_multi` to split multi-allelic variants
    beforehand.

    Parameters
    ----------
    mt : :class:`.MatrixTable`
        Dataset.
    name : :obj:`str`
        Name for resulting field.

    Returns
    -------
    :class:`.MatrixTable`
    """
    require_row_key_variant(mt, 'variant_qc')

    bound_exprs = {}
    gq_dp_exprs = {}

    def has_field_of_type(name, dtype):
        return name in mt.entry and mt[name].dtype == dtype

    if has_field_of_type('DP', hl.tint32):
        gq_dp_exprs['dp_stats'] = hl.agg.stats(mt.DP).select('mean', 'stdev', 'min', 'max')

    if has_field_of_type('GQ', hl.tint32):
        gq_dp_exprs['gq_stats'] = hl.agg.stats(mt.GQ).select('mean', 'stdev', 'min', 'max')

    if not has_field_of_type('GT', hl.tcall):
        raise ValueError("'variant_qc': expect an entry field 'GT' of type 'call'")

    bound_exprs['n_called'] = hl.agg.count_where(hl.is_defined(mt['GT']))
    bound_exprs['n_not_called'] = hl.agg.count_where(hl.is_missing(mt['GT']))
    bound_exprs['n_filtered'] = mt.count_cols(_localize=False) - hl.agg.count()
    bound_exprs['call_stats'] = hl.agg.call_stats(mt.GT, mt.alleles)

    result = hl.rbind(
        hl.struct(**bound_exprs),
        lambda e1: hl.rbind(
            hl.case().when(
                hl.len(mt.alleles) == 2,
                hl.hardy_weinberg_test(
                    e1.call_stats.homozygote_count[0],
                    e1.call_stats.AC[1] - 2 * e1.call_stats.homozygote_count[1],
                    e1.call_stats.homozygote_count[1])).or_missing(),
            lambda hwe: hl.struct(**{
                **gq_dp_exprs,
                **e1.call_stats,
                'call_rate': hl.float(e1.n_called) / (e1.n_called + e1.n_not_called + e1.n_filtered),
                'n_called': e1.n_called,
                'n_not_called': e1.n_not_called,
                'n_filtered': e1.n_filtered,
                'n_het': e1.n_called - hl.sum(e1.call_stats.homozygote_count),
                'n_non_ref': e1.n_called - e1.call_stats.homozygote_count[0],
                'het_freq_hwe': hwe.het_freq_hwe,
                'p_value_hwe': hwe.p_value})))

    return mt.annotate_rows(**{name: result})

def main(args):
    ########################################################################
    ### initialize
    phenos = ['height', 'bmi', 'sbp', 'dbp', 'wbc', 'monocyte', 'neutrophil',
              'eosinophil', 'basophil', 'lymphocyte', 'rbc', 'mch', 'mcv',
              'mchc', 'hb', 'ht', 'plt']
    phenos.sort()
    phenotype = 'ALL17'

    if args.clump_basename is None:
        clumps = args.dirname + args.basename + '_ALL17.clumped'
        prs_loci_table_location = args.dirname + 'keytables/ukb-' + phenotype + '_' + args.basename + '-pt-sumstats-locus-allele-keyed.kt'
        contig_row_dict_location = args.dirname + 'contig_row_dict-' + phenotype + '_' + args.basename
    else:
        clumps = args.dirname + args.clump_basename + '_ALL17.clumped'
        prs_loci_table_location = args.dirname + 'keytables/ukb-' + phenotype + '_' + args.basename_out + '-pt-sumstats-locus-allele-keyed.kt'
        contig_row_dict_location = args.dirname + 'contig_row_dict-' + phenotype + '_' + args.basename_out

    # clumps = args.dirname + end_dir + 'UKB_hm3.chr22.cm.beta.true_PRS.gwas_sumstat_' + args.iter + '_beta' + args.which_beta + '.clumped'
    # ss_filename = args.dirname + end_dir + 'UKB_hm3.chr22.cm.beta.true_PRS.gwas_sumstat_' + args.iter + '.tsv.gz'
    # out_base = args.dirname + end_dir + 'UKB_hm3.chr22.cm.beta.true_PRS.gwas_sumstat_' + args.iter + '_beta' + args.which_beta + '_gwas_PRS'

    clump_table_location = clumps.replace('.clumped', '.kt')

    contigs = {'0{}'.format(x): str(x) for x in range(1, 10)}

    bgen_files = 'gs://fc-7d5088b4-7673-45b5-95c2-17ae00a04183/imputed/ukb_imp_chr{1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22}_v3.bgen'

    start = time.time()
    # large block size because we read very little data (due to filtering & ignoring genotypes)
    hl.init(branching_factor=10, min_block_size=2000)  # set min_block_size only in import_bgen

    ################################################################################
    ### set up the sumstats table (chr, bp for union SNPs)
    # if (args.generate_prs_loci_table):
    #     t = hl.import_table(sumstats_text_file,
    #                         delimiter='\s+',
    #                         impute=True)
    #     t = t.select(locus = hl.locus(hl.str(t.CHR), t.BP))
    #     t = t.key_by('locus')
    #     t.write(prs_loci_table_location, overwrite=True)
    #
    # ss = hl.read_table(prs_loci_table_location)

    if args.read_clumps:
        clump_file = hl.import_table(clumps, delimiter='\s+', impute=True)
        clump_file = clump_file.select(
            locus=hl.locus(hl.str(clump_file.CHR), clump_file.BP))
        clump_file = clump_file.key_by('locus')
        clump_file.write(clump_table_location, overwrite=True)

    clump_file = hl.read_table(clump_table_location)

    # ################################################################################
    # ### determine the indices of the prs variants in bgen
    # if (args.generate_contig_row_dict):
    #     mt = hl.methods.import_bgen(bgen_files,
    #                                 [],
    #                                 contig_recoding=contigs,
    #                                 _row_fields=['file_row_idx'])
    #     prs_rows = mt.filter_rows(hl.is_defined(ss[mt.locus])).rows()
    #     print('about to collect')
    #     # remove all unnecessary data, dropping keys and other irrelevant fields
    #     prs_rows = prs_rows.key_by()
    #     prs_rows = prs_rows.select(contig=prs_rows.locus.contig,
    #                                file_row_idx=prs_rows.file_row_idx)
    #     contig_row_list = prs_rows.collect()
    #     print('finished collecting')
    #     contig_reformed = [(x['contig'], x['file_row_idx']) for x in contig_row_list]
    #     print('reformed')
    #     from collections import defaultdict
    #     contig_row_dict = defaultdict(list)
    #     for k, v in contig_reformed:
    #         contig_row_dict[k].append(v)
    #     print('dictionary created')
    #
    #     with hl.hadoop_open(contig_row_dict_location, 'wb') as f:
    #         pickle.dump(contig_row_dict, f)
    # else:
    #     with hl.hadoop_open(contig_row_dict_location, 'rb') as f:
    #         contig_row_dict = pickle.load(f)

    ################################################################################
    ### Get true phenotypes from UKBB
    if args.pheno_table:
        # phenotypes = hl.import_table('gs://phenotype_31063/ukb31063.phesant_phenotypes.both_sexes.tsv.bgz',
        #                              key='userId', quote='"', impute=True, types={'userId': hl.tstr}, missing='')
        phenotypes = hl.import_table(
            'gs://armartin/disparities/ukbb/UKB_phenos_ALL17.txt.bgz',
            key='eid', impute=True, types={'eid': hl.tstr})

        covariates = hl.import_table(
            'gs://phenotype_31063/ukb31063.gwas_covariates.both_sexes.tsv',
            key='s', impute=True, types={'s': hl.tstr})

        samples = covariates.annotate(**phenotypes[covariates.s])

        # Write pheno/covar/sample info table
        for pheno in phenos:
            #sampleids = hl.import_table('gs://ukb31063-mega-gwas/hail-0.1/qc/ukb31063.gwas_samples.txt', delimiter='\s+').key_by('s')
            gwas_holdout = hl.import_table(
                'gs://armartin/mama/ukb31063.gwas_samples.gwas_vs_holdout.txt',
                delimiter='\s+').key_by('s')
            samples = samples.annotate(**{
                pheno + '_holdout': gwas_holdout[samples.s].in_gwas == 'FALSE'
            })

        samples.write(args.dirname + args.basename + '_holdout_gwas_phenos.ht', True)

    if args.ss_tables:
        # Write ss info
        for pheno in phenos:
            print(pheno)
            # change sumstats to bgz
            #ss = hl.import_table('gs://armartin/disparities/pheno_31063_holdout_gwas_' + pheno + '.txt.gz',
            ss = hl.import_table(args.dirname + pheno + '_' + args.basename + '.*.bgz',
                                 delimiter='\s+',
                                 impute=True,
                                 types={'MAMA_BETA': hl.tfloat,
                                        'MAMA_PVAL': hl.tfloat,
                                        'BP': hl.tint})  #, 'N': hl.tint})
            ss = ss.key_by(
                locus=hl.locus(hl.str(ss.CHR), hl.int(ss.BP))).repartition(200)
            ss.write(args.dirname + pheno + '_' + args.basename + '.ht', True)

    ################################################################################
    ### Run the PRS using phenotype-specific clump variants
    if args.write_bgen:
        mt_all = hl.import_bgen(
            bgen_files,
            ['dosage'],
            sample_file='gs://phenotype_31063/ukb31063.autosomes.sample',
            variants=clump_file.locus)
        # contig_row_dict2 = {'gs://fc-7d5088b4-7673-45b5-95c2-17ae00a04183/imputed/ukb_imp_chr{contig}_v3.bgen'.format(contig=k): v for k, v in contig_row_dict.items()}
        # mt_all = hl.methods.import_bgen(bgen_files,
        #                                 ['dosage'],
        #                                 sample_file='gs://phenotype_31063/ukb31063.autosomes.sample',
        #                                 contig_recoding=contigs,
        #                                 _variants_per_file=contig_row_dict2,
        #                                 _row_fields=[])

        #samples.write(args.dirname + args.basename + '_holdout_gwas_phenos.ht', True)
        samples = hl.read_table(args.dirname + args.basename + '_holdout_gwas_phenos.ht')
        mt_all = mt_all.annotate_cols(**samples[mt_all.s])  # ok that phenos keyed on userId not s?

        if args.clump_basename is None:
            mt_all.repartition(5000, shuffle=False).write(
                args.dirname + args.basename + '_ALL17.mt', True)
        else:
            mt_all.repartition(5000, shuffle=False).write(
                args.dirname + args.basename_out + '_ALL17.mt', True)

    mt_all = hl.read_matrix_table(args.dirname + args.basename + '_ALL17.mt')

    for pheno in phenos:  #[6:len(phenos)]:
        print(pheno)
        ss = hl.read_table(args.dirname + pheno + '_' + args.basename + '.ht')
        """
        To add:
        - Filter only to samples in holdout GWAS
        - Filter to rows in phenotype-specific clump file
        - Build PRS for 10 p-value thresholds
        - Also fix nt1/nt2 to A1 and A2 (check) from sumstats.
        """
        # filter to only samples held out from GWAS
        mt = mt_all.filter_cols(mt_all[pheno + '_holdout'])

        mt = mt.annotate_rows(ss=ss[mt.locus])
        mt = annotate_beta(mt, mt.ss)

        p_max = {
            's1': 5e-8,
            's2': 1e-6,
            's3': 1e-4,
            's4': 1e-3,
            's5': 1e-2,
            's6': .05,
            's7': .1,
            's8': .2,
            's9': .5,
            's10': 1
        }

        if args.clump_basename is None:
            pheno_clump = specific_clumps(args.dirname + pheno + '_' + args.basename + '.clumped')
        else:
            pheno_clump = specific_clumps(args.dirname + pheno + '_' + args.clump_basename + '.clumped')

        mt = mt.filter_rows(pheno_clump.get(mt.locus, False))
        print(mt.count())

        # divide by sd's of frequencies to get standardized betas back to allelic scale for MAMA betas (only, not METAL)
        # sqrt(2pq)
        if args.betas_are_standardized:
            annot_expr = {
                k: hl.agg.sum(mt.beta / hl.sqrt(2 * hl.float(mt.ss.FRQ) * (1 - hl.float(mt.ss.FRQ))) *
                              mt.dosage * hl.int(mt.ss.MAMA_PVAL < v))
                for k, v in p_max.items()
            }
        else:
            annot_expr = {
                k: hl.agg.sum(mt.beta * mt.dosage * hl.int(mt.ss.MAMA_PVAL < v))
                for k, v in p_max.items()
            }

        mt = mt.annotate_cols(**annot_expr)

        if args.clump_basename is None:
            mt.cols().write(args.dirname + 'UKB_' + pheno + '_' + args.basename + '_PRS.ht',
                            stage_locally=True, overwrite=True)
            ht = hl.read_table(args.dirname + 'UKB_' + pheno + '_' + args.basename + '_PRS.ht')
        else:
            mt.cols().write(args.dirname + 'UKB_' + pheno + '_' + args.basename_out + '_PRS.ht',
                            stage_locally=True, overwrite=True)
            ht = hl.read_table(args.dirname + 'UKB_' + pheno + '_' + args.basename_out + '_PRS.ht')

        ht_out = ht.drop(*[x for x in list(ht.row) if 'holdout' in x],
                         *[x for x in phenos if pheno not in x])

        if args.clump_basename is None:
            output_location = args.dirname + 'UKB_' + pheno + '_' + args.basename + '_PRS.txt.bgz'
        else:
            output_location = args.dirname + 'UKB_' + pheno + '_' + args.basename_out + '_PRS.txt.bgz'
        ht_out.export(output_location)

    end = time.time()
    print("Success! Job was completed in %s" % time.strftime("%H:%M:%S", time.gmtime(end - start)))

def create_binned_data(ht: hl.Table, data: str, data_type: str, n_bins: int) -> hl.Table:
    """
    Creates binned data from a rank Table grouped by rank_id (rank, biallelic, etc.),
    contig, snv, bi_allelic and singleton containing the information needed for
    evaluation plots.

    :param Table ht: Input rank table
    :param str data: Which data/run hash is being created
    :param str data_type: one of 'exomes' or 'genomes'
    :param int n_bins: Number of bins.
    :return: Binned Table
    :rtype: Table
    """
    # Count variants for ranking
    count_expr = {
        x: hl.agg.filter(
            hl.is_defined(ht[x]),
            hl.agg.counter(
                hl.cond(hl.is_snp(ht.alleles[0], ht.alleles[1]), 'snv', 'indel')))
        for x in ht.row if x.endswith('rank')
    }
    rank_variant_counts = ht.aggregate(hl.Struct(**count_expr))
    logger.info(
        f"Found the following variant counts:\n {pformat(rank_variant_counts)}"
    )
    ht = ht.annotate_globals(rank_variant_counts=rank_variant_counts)

    # Load external evaluation data
    clinvar_ht = hl.read_table(clinvar_ht_path)
    denovo_ht = get_validated_denovos_ht()
    if data_type == 'exomes':
        denovo_ht = denovo_ht.filter(denovo_ht.gnomad_exomes.high_quality)
    else:
        denovo_ht = denovo_ht.filter(denovo_ht.gnomad_genomes.high_quality)
    denovo_ht = denovo_ht.select(
        validated_denovo=denovo_ht.validated,
        high_confidence_denovo=denovo_ht.Confidence == 'HIGH')
    ht_truth_data = hl.read_table(annotations_ht_path(data_type, 'truth_data'))
    fam_ht = hl.read_table(annotations_ht_path(data_type, 'family_stats'))
    fam_ht = fam_ht.select(family_stats=fam_ht.family_stats[0])
    gnomad_ht = get_gnomad_data(data_type).rows()
    gnomad_ht = gnomad_ht.select(
        vqsr_negative_train_site=gnomad_ht.info.NEGATIVE_TRAIN_SITE,
        vqsr_positive_train_site=gnomad_ht.info.POSITIVE_TRAIN_SITE,
        fail_hard_filters=(gnomad_ht.info.QD < 2) | (gnomad_ht.info.FS > 60) | (gnomad_ht.info.MQ < 30))
    lcr_intervals = hl.import_locus_intervals(lcr_intervals_path)

    ht = ht.annotate(
        **ht_truth_data[ht.key],
        **fam_ht[ht.key],
        **gnomad_ht[ht.key],
        **denovo_ht[ht.key],
        clinvar=hl.is_defined(clinvar_ht[ht.key]),
        indel_length=hl.abs(ht.alleles[0].length() - ht.alleles[1].length()),
        rank_bins=hl.array([
            hl.Struct(
                rank_id=rank_name,
                bin=hl.int(
                    hl.ceil(
                        hl.float(ht[rank_name] + 1) / hl.floor(
                            ht.globals.rank_variant_counts[rank_name][hl.cond(
                                hl.is_snp(ht.alleles[0], ht.alleles[1]),
                                'snv', 'indel')] / n_bins))))
            for rank_name in rank_variant_counts
        ]),
        lcr=hl.is_defined(lcr_intervals[ht.locus]))

    ht = ht.explode(ht.rank_bins)
    ht = ht.transmute(rank_id=ht.rank_bins.rank_id, bin=ht.rank_bins.bin)
    ht = ht.filter(hl.is_defined(ht.bin))

    ht = ht.checkpoint(
        f'gs://gnomad-tmp/gnomad_score_binning_{data_type}_tmp_{data}.ht',
        overwrite=True)

    # Create binned data
    return (ht.group_by(
        rank_id=ht.rank_id,
        contig=ht.locus.contig,
        snv=hl.is_snp(ht.alleles[0], ht.alleles[1]),
        bi_allelic=hl.is_defined(ht.biallelic_rank),
        singleton=ht.singleton,
        release_adj=ht.ac > 0,
        bin=ht.bin)._set_buffer_size(20000).aggregate(
            min_score=hl.agg.min(ht.score),
            max_score=hl.agg.max(ht.score),
            n=hl.agg.count(),
            n_ins=hl.agg.count_where(hl.is_insertion(ht.alleles[0], ht.alleles[1])),
            n_del=hl.agg.count_where(hl.is_deletion(ht.alleles[0], ht.alleles[1])),
            n_ti=hl.agg.count_where(hl.is_transition(ht.alleles[0], ht.alleles[1])),
            n_tv=hl.agg.count_where(hl.is_transversion(ht.alleles[0], ht.alleles[1])),
            n_1bp_indel=hl.agg.count_where(ht.indel_length == 1),
            n_mod3bp_indel=hl.agg.count_where((ht.indel_length % 3) == 0),
            n_clinvar=hl.agg.count_where(ht.clinvar),
            n_singleton=hl.agg.count_where(ht.singleton),
            n_validated_de_novos=hl.agg.count_where(ht.validated_denovo),
            n_high_confidence_de_novos=hl.agg.count_where(ht.high_confidence_denovo),
            n_de_novo=hl.agg.filter(
                ht.family_stats.unrelated_qc_callstats.AC[1] == 0,
                hl.agg.sum(ht.family_stats.mendel.errors)),
            n_de_novo_no_lcr=hl.agg.filter(
                ~ht.lcr & (ht.family_stats.unrelated_qc_callstats.AC[1] == 0),
                hl.agg.sum(ht.family_stats.mendel.errors)),
            n_de_novo_sites=hl.agg.filter(
                ht.family_stats.unrelated_qc_callstats.AC[1] == 0,
                hl.agg.count_where(ht.family_stats.mendel.errors > 0)),
            n_de_novo_sites_no_lcr=hl.agg.filter(
                ~ht.lcr & (ht.family_stats.unrelated_qc_callstats.AC[1] == 0),
                hl.agg.count_where(ht.family_stats.mendel.errors > 0)),
            n_trans_singletons=hl.agg.filter(
                (ht.info_ac < 3) & (ht.family_stats.unrelated_qc_callstats.AC[1] == 1),
                hl.agg.sum(ht.family_stats.tdt.t)),
            n_untrans_singletons=hl.agg.filter(
                (ht.info_ac < 3) & (ht.family_stats.unrelated_qc_callstats.AC[1] == 1),
                hl.agg.sum(ht.family_stats.tdt.u)),
            n_train_trans_singletons=hl.agg.count_where(
                (ht.family_stats.unrelated_qc_callstats.AC[1] == 1) &
                (ht.family_stats.tdt.t == 1)),
            n_omni=hl.agg.count_where(ht.truth_data.omni),
            n_mills=hl.agg.count_where(ht.truth_data.mills),
            n_hapmap=hl.agg.count_where(ht.truth_data.hapmap),
            n_kgp_high_conf_snvs=hl.agg.count_where(ht.truth_data.kgp_high_conf_snvs),
            fail_hard_filters=hl.agg.count_where(ht.fail_hard_filters),
            n_vqsr_pos_train=hl.agg.count_where(ht.vqsr_positive_train_site),
            n_vqsr_neg_train=hl.agg.count_where(ht.vqsr_negative_train_site)))

def prepare_base_level_pext(base_level_pext_path): tmp_dir = os.path.expanduser("~") # # Step 1: rename fields, extract chrom/pos from locus, convert missing values to 0, export to TSV # ds = hl.read_table(base_level_pext_path) ds = ds.select( gene_id=ds.ensg, chrom=ds.locus.contig, pos=ds.locus.position, # Replace NaNs and missing values with 0s mean=hl.if_else( hl.is_missing(ds.mean_proportion) | hl.is_nan(ds.mean_proportion), hl.float(0), ds.mean_proportion), **{ renamed: hl.if_else( hl.is_missing(ds[original]) | hl.is_nan(ds[original]), hl.float(0), ds[original]) for original, renamed in TISSUE_NAME_MAP.items() }) ds = ds.order_by(ds.gene_id, hl.asc(ds.pos)).drop("locus") ds.export("file://" + os.path.join(tmp_dir, "bases.tsv")) # # Step 2: Collect base-level data into regions # with open(os.path.join(tmp_dir, "regions.tsv"), "w") as output_file: writer = csv.writer(output_file, delimiter="\t") writer.writerow(["gene_id", "chrom", "start", "stop", "mean"] + TISSUE_FIELDS) def output_region(region): writer.writerow([ region.gene, region.chrom, region.start, region.stop, region.tissues["mean"] ] + [region.tissues[t] for t in TISSUE_FIELDS]) rows = read_bases_tsv(os.path.join(tmp_dir, "bases.tsv")) first_row = next(rows) current_region = Region(gene=first_row.gene, chrom=first_row.chrom, start=first_row.pos, stop=None, tissues=first_row.tissues) last_pos = first_row.pos for row in tqdm(rows): if (row.gene != current_region.gene or row.chrom != current_region.chrom or row.pos > (last_pos + 1) or any(row.tissues[t] != current_region.tissues[t] for t in row.tissues)): output_region(current_region._replace(stop=last_pos)) current_region = Region(gene=row.gene, chrom=row.chrom, start=row.pos, stop=None, tissues=row.tissues) last_pos = row.pos output_region(current_region._replace(stop=last_pos)) # Copy regions file to HDFS subprocess.run( [ "hdfs", "dfs", "-cp", "file://" + os.path.join(tmp_dir, "regions.tsv"), os.path.join("/tmp/regions.tsv") ], check=True, ) # # Step 3: Convert regions to a Hail table. # types = {t: hl.tfloat for t in TISSUE_FIELDS} types["gene_id"] = hl.tstr types["chrom"] = hl.tstr types["start"] = hl.tint types["stop"] = hl.tint types["mean"] = hl.tfloat ds = hl.import_table("/tmp/regions.tsv", min_partitions=100, missing="", types=types) ds = ds.select("gene_id", "chrom", "start", "stop", "mean", tissues=hl.struct(**{t: ds[t] for t in TISSUE_FIELDS})) ds = ds.group_by("gene_id").aggregate( regions=hl.agg.collect(ds.row_value.drop("gene_id"))) return ds
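# A self-contained sketch of the Step 2 run-length collapsing above: consecutive bases on
# the same gene and chromosome with identical values merge into a single region. ToyBase,
# ToyRegion, and the values below are illustrative only; the real pipeline uses its own
# Region namedtuple and the full set of tissue fields.
from collections import namedtuple

ToyBase = namedtuple("ToyBase", ["gene", "chrom", "pos", "value"])
ToyRegion = namedtuple("ToyRegion", ["gene", "chrom", "start", "stop", "value"])

def collapse(bases):
    regions = []
    current = None
    last_pos = None
    for base in bases:
        # Start a new region when the gene/chrom changes, the positions are not
        # contiguous, or the value differs from the current region's value.
        if (current is None or base.gene != current.gene or base.chrom != current.chrom
                or base.pos > last_pos + 1 or base.value != current.value):
            if current is not None:
                regions.append(current._replace(stop=last_pos))
            current = ToyRegion(base.gene, base.chrom, base.pos, None, base.value)
        last_pos = base.pos
    if current is not None:
        regions.append(current._replace(stop=last_pos))
    return regions

# Positions 100-102 share a value and collapse to one region; 103 starts a new one.
print(collapse([
    ToyBase("ENSG00000000001", "1", 100, 0.5),
    ToyBase("ENSG00000000001", "1", 101, 0.5),
    ToyBase("ENSG00000000001", "1", 102, 0.5),
    ToyBase("ENSG00000000001", "1", 103, 0.9),
]))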
def main(args): input_tsv = args.input_tsv output_ht = args.output_ht chunk_size = args.chunk_size overwrite = args.overwrite mt_list = [] logger.info( "Reading in individual coverage files as matrix tables and adding to a list of matrix tables..." ) with open(input_tsv, "r") as f: for line in f: line = line.rstrip() items = line.split("\t") sample, base_level_coverage_metrics = items[0:2] mt = hl.import_matrix_table( base_level_coverage_metrics, delimiter="\t", row_fields={ "chrom": hl.tstr, "pos": hl.tint, "target": hl.tstr }, row_key=["chrom", "pos"], ).drop("target") mt = mt.rename({"x": "coverage"}) mt = mt.key_cols_by(s=sample) mt_list.append(mt) logger.info("Joining individual coverage mts...") out_dir = dirname(output_ht) temp_out_dir = out_dir + "/temp" cov_mt = multi_way_union_mts(mt_list, temp_out_dir, chunk_size) n_samples = cov_mt.count_cols() logger.info("Adding coverage annotations...") cov_mt = cov_mt.annotate_rows( locus=hl.locus(cov_mt.chrom, cov_mt.pos, reference_genome="GRCh38"), mean=hl.float(hl.agg.mean(cov_mt.coverage)), median=hl.median(hl.agg.collect(cov_mt.coverage)), over_100=hl.float( (hl.agg.count_where(cov_mt.coverage > 100) / n_samples)), over_1000=hl.float( (hl.agg.count_where(cov_mt.coverage > 1000) / n_samples)), ) cov_mt.show() cov_mt = cov_mt.key_rows_by("locus").drop("chrom", "pos") output_mt = re.sub(r"\.ht$", ".mt", output_ht) output_tsv = re.sub(r"\.ht$", ".tsv", output_ht) output_samples = re.sub(r"\.ht$", "_sample_level.txt", output_ht) logger.info("Writing sample level coverage...") sample_mt = cov_mt.key_rows_by(pos=cov_mt.locus.position) sample_mt.coverage.export(output_samples) logger.info("Writing coverage mt and ht...") cov_mt.write(output_mt, overwrite=overwrite) cov_ht = cov_mt.rows() cov_ht = cov_ht.checkpoint(output_ht, overwrite=overwrite) cov_ht.export(output_tsv)
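# Illustrative note (inferred from the import_matrix_table call above, not taken from the
# source): each per-sample base-level coverage file is expected to be a tab-separated
# matrix with a header naming the three row fields followed by a single entry column,
# which is renamed to "coverage" after import. The example values below are hypothetical.
#
#   chrom    pos    target    <sample column>
#   chrM     1      target1   56
#   chrM     2      target1   61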
def prepare_mitochondrial_variants(path, mnvs_path=None): ds = hl.read_table(path) haplogroups = hl.eval(ds.globals.hap_order) ds = ds.annotate(hl_hist=ds.hl_hist.annotate( bin_edges=ds.hl_hist.bin_edges.map( lambda n: hl.float(hl.format("%.2f", n))))) filter_names = hl.dict({ "artifact_prone_site": "Artifact-prone site", "indel_stack": "Indel stack", "npg": "No passing genotype" }) ds = ds.select( # ID variant_id=variant_id(ds.locus, ds.alleles), reference_genome=ds.locus.dtype.reference_genome.name, chrom=normalized_contig(ds.locus.contig), pos=ds.locus.position, ref=ds.alleles[0], alt=ds.alleles[1], rsid=ds.rsid, # Quality filters=ds.filters.map(lambda f: filter_names.get(f, f)), qual=ds.qual, genotype_quality_metrics=[ hl.struct(name="Depth", alt=ds.dp_hist_alt, all=ds.dp_hist_all) ], genotype_quality_filters=[ hl.struct( name="Base Quality", filtered=hl.struct(bin_edges=ds.hl_hist.bin_edges, bin_freq=ds.base_qual_hist), ), hl.struct( name="Contamination", filtered=hl.struct(bin_edges=ds.hl_hist.bin_edges, bin_freq=ds.contamination_hist), ), hl.struct( name="Heteroplasmy below 10%", filtered=hl.struct( bin_edges=ds.hl_hist.bin_edges, bin_freq=ds.heteroplasmy_below_10_percent_hist), ), hl.struct(name="Position", filtered=hl.struct(bin_edges=ds.hl_hist.bin_edges, bin_freq=ds.position_hist)), hl.struct( name="Strand Bias", filtered=hl.struct(bin_edges=ds.hl_hist.bin_edges, bin_freq=ds.strand_bias_hist), ), hl.struct( name="Weak Evidence", filtered=hl.struct(bin_edges=ds.hl_hist.bin_edges, bin_freq=ds.weak_evidence_hist), ), ], site_quality_metrics=[ hl.struct(name="Mean Depth", value=nullify_nan(ds.dp_mean)), hl.struct(name="Mean MQ", value=nullify_nan(ds.mq_mean)), hl.struct(name="Mean TLOD", value=nullify_nan(ds.tlod_mean)), ], # Frequency an=ds.AN, ac_hom=ds.AC_hom, ac_het=ds.AC_het, excluded_ac=ds.excluded_AC, # Heteroplasmy common_low_heteroplasmy=ds.common_low_heteroplasmy, heteroplasmy_distribution=ds.hl_hist, max_heteroplasmy=ds.max_hl, # Populations populations=hl.sorted( hl.range(hl.len( ds.globals.pop_order)).map(lambda pop_index: hl.struct( id=ds.globals.pop_order[pop_index], an=ds.pop_AN[pop_index], ac_het=ds.pop_AC_het[pop_index], ac_hom=ds.pop_AC_hom[pop_index], heteroplasmy_distribution=hl.struct( bin_edges=ds.hl_hist.bin_edges, bin_freq=ds.pop_hl_hist[pop_index], n_smaller=0, n_larger=0, ), )), key=lambda pop: pop.id, ), # Haplogroups hapmax_af_hom=ds.hapmax_AF_hom, hapmax_af_het=ds.hapmax_AF_het, faf_hapmax_hom=ds.faf_hapmax_hom, haplogroup_defining=ds.hap_defining_variant, haplogroups=[ hl.struct( id=haplogroup, an=ds.hap_AN[i], ac_het=ds.hap_AC_het[i], ac_hom=ds.hap_AC_hom[i], faf_hom=ds.hap_faf_hom[i], heteroplasmy_distribution=ds.hap_hl_hist[i], ) for i, haplogroup in enumerate(haplogroups) ], # Other age_distribution=hl.struct(het=ds.age_hist_het, hom=ds.age_hist_hom), flags=hl.set([ hl.or_missing(ds.common_low_heteroplasmy, "common_low_heteroplasmy") ]).filter(hl.is_defined), mitotip_score=ds.mitotip_score, mitotip_trna_prediction=ds.mitotip_trna_prediction, pon_ml_probability_of_pathogenicity=ds. 
pon_ml_probability_of_pathogenicity, pon_mt_trna_prediction=ds.pon_mt_trna_prediction, variant_collapsed=ds.variant_collapsed, vep=ds.vep, ) if mnvs_path: mnvs = hl.import_table(mnvs_path, types={ "pos": hl.tint, "ref": hl.tstr, "alt": hl.tstr, "AC_hom_MNV": hl.tint }) mnvs = mnvs.key_by( locus=hl.locus("chrM", mnvs.pos, reference_genome=ds.locus.dtype.reference_genome), alleles=[mnvs.ref, mnvs.alt], ) ds = ds.annotate(ac_hom_mnv=hl.or_else(mnvs[ds.key].AC_hom_MNV, 0)) ds = ds.annotate( flags=hl.if_else(ds.ac_hom_mnv > 0, ds.flags.add("mnv"), ds.flags)) return ds
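# A minimal sketch of the flag-building pattern used above: hl.or_missing yields the flag
# string only when its condition holds, and filter(hl.is_defined) drops the missing
# placeholder, so the resulting set contains only the flags that apply. The literal
# True/False conditions here are illustrative stand-ins for ds.common_low_heteroplasmy.
import hail as hl

print(hl.eval(hl.set([hl.or_missing(True, "common_low_heteroplasmy")]).filter(hl.is_defined)))   # one flag
print(hl.eval(hl.set([hl.or_missing(False, "common_low_heteroplasmy")]).filter(hl.is_defined)))  # empty set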
import hail as hl mt = hl.balding_nichols_model(3, 100, 100) gts_as_rows = mt.annotate_rows( mean=hl.agg.mean(hl.float(mt.GT.n_alt_alleles())), genotypes=hl.agg.collect(hl.float(mt.GT.n_alt_alleles()))).rows() groups = gts_as_rows.group_by( ld_block=gts_as_rows.locus.position // 10).aggregate( genotypes=hl.agg.collect(gts_as_rows.genotypes), ys=hl.agg.collect(gts_as_rows.mean)) df = groups.to_spark() from pyspark.sql.functions import udf from pyspark.sql.types import DoubleType def get_intercept(X, y): from sklearn import linear_model clf = linear_model.Lasso(alpha=0.1) clf.fit(X, y) return float(clf.intercept_) get_intercept_udf = udf(get_intercept, DoubleType()) df.select(get_intercept_udf("genotypes", "ys").alias("intercept")).show()
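# Design note on the block above: collecting the per-variant genotype arrays into one row
# per 10-position "ld_block" turns each block into a single Spark DataFrame row, so the
# scikit-learn Lasso fit inside get_intercept runs once per block and in parallel across
# executors rather than on the driver. Declaring DoubleType as the UDF return type keeps
# the "intercept" column numeric instead of Spark's default string.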
import hail as hl root = 'gs://hail-datasets-raw-data/LDSC/baselineLD_v2.2' mt = hl.import_matrix_table(f'{root}/ld_scores.GRCh37.tsv.bgz', row_fields={'CHR': hl.tstr, 'SNP': hl.tstr, 'BP': hl.tint}, entry_type=hl.tstr) mt = mt.annotate_entries(x=hl.float(mt['x'])) mt = mt.annotate_rows( locus=hl.locus(mt['CHR'], mt['BP'], 'GRCh37')) mt = mt.key_rows_by('locus') mt = mt.select_rows('SNP') M = hl.import_table( f'{root}/M.GRCh37.tsv.bgz', key='annotation') M_5_50 = hl.import_table( f'{root}/M_5_50.GRCh37.tsv.bgz', key='annotation') mt = mt.rename({'col_id': 'annotation'}) mt = mt.annotate_cols( M_5_50=hl.int(hl.float(M_5_50[mt.annotation].M_5_50)), M=hl.int(hl.float(M[mt.annotation].M))) n_rows, n_cols = mt.count() n_partitions = mt.n_partitions() mt = mt.annotate_globals( metadata=hl.struct( name='LDSC_baselineLD_v2.2_ld_scores', reference_genome='GRCh37',