def test_window_by_locus(self):
    mt = hl.utils.range_matrix_table(100, 2, n_partitions=10)
    mt = mt.annotate_rows(locus=hl.locus('1', mt.row_idx + 1))
    mt = mt.key_rows_by('locus')
    mt = mt.annotate_entries(e_row_idx=mt.row_idx, e_col_idx=mt.col_idx)
    mt = hl.window_by_locus(mt, 5).cache()

    self.assertEqual(mt.count_rows(), 100)

    rows = mt.rows()
    self.assertTrue(
        rows.all((rows.row_idx < 5) | (rows.prev_rows.length() == 5)))
    self.assertTrue(
        rows.all(hl.all(lambda x: (rows.row_idx - 1 - x[0]) == x[1].row_idx,
                        hl.zip_with_index(rows.prev_rows))))

    entries = mt.entries()
    self.assertTrue(
        entries.all(hl.all(lambda x: x.e_col_idx == entries.col_idx,
                           entries.prev_entries)))
    self.assertTrue(
        entries.all(hl.all(lambda x: entries.row_idx - 1 - x[0] == x[1].e_row_idx,
                           hl.zip_with_index(entries.prev_entries))))
def add_variant_type(alt_alleles: hl.expr.ArrayExpression) -> hl.expr.StructExpression:
    """Get Struct of variant_type and n_alt_alleles from ArrayExpression of Strings (all alleles)"""
    ref = alt_alleles[0]
    alts = alt_alleles[1:]
    non_star_alleles = hl.filter(lambda a: a != '*', alts)
    return hl.struct(
        variant_type=hl.cond(
            hl.all(lambda a: hl.is_snp(ref, a), non_star_alleles),
            hl.cond(hl.len(non_star_alleles) > 1, "multi-snv", "snv"),
            hl.cond(
                hl.all(lambda a: hl.is_indel(ref, a), non_star_alleles),
                hl.cond(hl.len(non_star_alleles) > 1, "multi-indel", "indel"),
                "mixed")),
        n_alt_alleles=hl.len(non_star_alleles))
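# Usage sketch (an assumption, not part of the original source): despite its name, the
# argument is the full alleles array with the reference allele first (e.g. `mt.alleles`
# on an imported VCF), so the resulting struct can be spread onto the row fields of a
# hypothetical MatrixTable `mt`:
#
#     mt = mt.annotate_rows(**add_variant_type(mt.alleles))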
def load_activity_monitor_data(first_exposure_and_activity_monitor_data_path):
    ht = hl.import_table(first_exposure_and_activity_monitor_data_path, delimiter=',', quote='"',
                         missing='', impute=True, key='eid')  # , min_partitions=500)
    quality_fields = ['90015-0.0', '90016-0.0', '90017-0.0']
    qual_ht = ht.select(hq=hl.is_missing(ht['90002-0.0']) &
                        hl.all(lambda x: x == 1, [ht[x] for x in quality_fields]))
    mt = filter_and_annotate_ukb_data(ht, lambda x, v: x.startswith('90') and x.endswith('-0.0') and
                                      v.dtype in {hl.tint32, hl.tfloat64})
    mt = mt.filter_cols(mt.ValueType == 'Continuous')
    mt = mt.annotate_rows(**qual_ht[mt.row_key])
    mt = mt.annotate_entries(value=hl.or_missing(hl.is_defined(mt.hq), mt.value))
    mt = mt.key_cols_by(trait_type='continuous', phenocode=mt.phenocode, pheno_sex='both_sexes',
                        coding=NULL_STR_KEY, modifier=NULL_STR_KEY)
    return mt
def main(args):
    hl.init(master=f'local[{args.n_threads}]',
            log=hl.utils.timestamp_path(os.path.join(tempfile.gettempdir(), 'export_pheno'), suffix='.log'),
            default_reference='GRCh38')
    sys.path.append('/')
    load_module = importlib.import_module(args.load_module)
    add_args = []
    if args.additional_args is not None:
        add_args = args.additional_args.split(',')
    mt = getattr(load_module, args.load_mt_function)(*add_args)

    mt = mt.filter_cols(
        hl.all(lambda x: x, [mt[k] == getattr(args, k, False)
                             for k in PHENO_KEY_FIELDS if k != 'pheno_sex']))
    pheno_sex_mt = mt.filter_cols(mt.pheno_sex == args.pheno_sex)
    if pheno_sex_mt.count_cols() == 1:
        mt = pheno_sex_mt
    else:
        mt = mt.filter_cols(mt.pheno_sex == 'both_sexes')
    mt = mt.select_entries(value=mt[args.pheno_sex])
    if args.binary_trait:
        mt = mt.select_entries(value=hl.int(mt.value))
    if args.proportion_single_sex > 0:
        prop_female = mt.n_cases_females / (mt.n_cases_males + mt.n_cases_females)
        prop_female = prop_female.collect()[0]
        print(f'Female proportion: {prop_female}')
        if prop_female <= args.proportion_single_sex:
            print(f'{prop_female} less than {args.proportion_single_sex}. Filtering to males...')
            mt = mt.filter_rows(mt.sex == 1)
        elif prop_female >= 1 - args.proportion_single_sex:
            print(f'{prop_female} greater than {1 - args.proportion_single_sex}. Filtering to females...')
            mt = mt.filter_rows(mt.sex == 0)
    ht = mt.key_cols_by().select_cols().entries()
    ht.export(args.output_file)
def test_sampleqc_old_new_equivalence():
    vds = hl.vds.read_vds(os.path.join(resource('vds'), '1kg_chr22_5_samples.vds'))
    sqc = hl.vds.sample_qc(vds)

    dense = hl.vds.to_dense_mt(vds)
    dense = dense.transmute_entries(GT=hl.vds.lgt_to_gt(dense.LGT, dense.LA))
    res = hl.sample_qc(dense)

    res = res.annotate_cols(sample_qc_new=sqc[res.s])

    fields_to_test = [
        'n_het', 'n_hom_var', 'n_non_ref', 'n_singleton',
        'n_snp', 'n_insertion', 'n_deletion', 'n_transition',
        'n_transversion', 'n_star', 'r_ti_tv', 'r_het_hom_var',
        'r_insertion_deletion'
    ]

    assert res.aggregate_cols(
        hl.all(*(hl.agg.all(res.sample_qc[field] == res.sample_qc_new[field])
                 for field in fields_to_test)))
def concatenate(nds, axis=0):
    """Join a sequence of arrays along an existing axis.

    Examples
    --------

    >>> x = hl.nd.array([[1., 2.], [3., 4.]])
    >>> y = hl.nd.array([[5.], [6.]])
    >>> hl.eval(hl.nd.concatenate([x, y], axis=1))
    array([[1., 2., 5.],
           [3., 4., 6.]])

    >>> x = hl.nd.array([1., 2.])
    >>> y = hl.nd.array([3., 4.])
    >>> hl.eval(hl.nd.concatenate((x, y), axis=0))
    array([1., 2., 3., 4.])

    Parameters
    ----------
    nds : sequence of array_like
        The arrays must have the same shape, except in the dimension corresponding to
        `axis` (the first, by default).
        Note: unlike Numpy, the numerical element type of each array_like must match.
    axis : int, optional
        The axis along which the arrays will be joined. Default is 0.
        Note: unlike Numpy, if provided, axis cannot be None.

    Returns
    -------
    res : ndarray
        The concatenated array
    """
    head_nd = nds[0]
    head_ndim = head_nd.ndim
    hl.case().when(hl.all(lambda a: a.ndim == head_ndim, nds), True).or_error("Mismatched ndim")

    makearr = aarray(nds)
    concat_ir = NDArrayConcat(makearr._ir, axis)

    return construct_expr(concat_ir, tndarray(head_nd._type.element_type, head_ndim))
def annotate_variants(mt):
    '''
    Takes a matrix table and annotates variants with gene, LOF, and missense annotations
    by parsing VEP annotations.

    :param mt: matrix table to annotate
    :return: returns matrix table with new row annotations gene, LOF, and missense.
    '''
    try:
        test = hl.is_defined(mt.row.was_split)
    except Exception as e:
        print('Split multi-allelics before running!')
        print(e)
        return

    # If there is no canonical and protein-coding transcript consequence for that variant,
    # give the gene corresponding to the most severe consequence.
    # If there is a canonical and protein-coding transcript consequence for that variant,
    # give the gene symbol associated with that transcript consequence.
    canon_pc = mt.row.vep.transcript_consequences.filter(
        lambda x: (x.canonical == 1) & (x.biotype == 'protein_coding'))
    most_severe = mt.row.vep.transcript_consequences.filter(
        lambda x: x.consequence_terms.contains(mt.row.vep.most_severe_consequence))

    mt = mt.annotate_rows(gene=hl.if_else(
        hl.any(lambda x: (x.canonical == 1) & (x.biotype == 'protein_coding'),
               mt.row.vep.transcript_consequences),
        canon_pc.map(lambda x: x.gene_symbol),
        most_severe.map(lambda x: x.gene_symbol)))

    # The above returns gene symbols for all canonical and protein-coding transcripts, not just the
    # one related to the most severe consequence. So we keep the above, but also annotate the gene
    # corresponding to the most severe consequence (useful for synonymous, missense, and LOF
    # annotations).
    canon_pc = mt.row.vep.transcript_consequences.filter(
        lambda x: (x.canonical == 1) & (x.biotype == 'protein_coding') &
        x.consequence_terms.contains(mt.vep.most_severe_consequence))
    most_severe = mt.vep.transcript_consequences.filter(
        lambda x: x.consequence_terms.contains(mt.row.vep.most_severe_consequence))

    mt = mt.annotate_rows(gene_most_severe_conseq=hl.if_else(
        hl.any(lambda x: (x.canonical == 1) & (x.biotype == 'protein_coding'),
               mt.vep.transcript_consequences),
        canon_pc.map(lambda x: x.gene_symbol),
        most_severe.map(lambda x: x.gene_symbol)))

    # True if either:
    # there is a canonical and protein-coding transcript consequence for that variant whose lof
    # annotation is defined and equal to "HC", with lof flags missing or blank,
    # or there is no canonical and protein-coding transcript consequence for that variant, and the
    # transcript consequence whose consequence terms contain the most severe consequence term has
    # lof defined and equal to "HC", with lof flags missing or blank.
    # Otherwise False.
    canon_pc = mt.row.vep.transcript_consequences\
        .filter(lambda x: (x.canonical == 1) & (x.biotype == 'protein_coding'))
    most_severe = mt.row.vep.transcript_consequences\
        .filter(lambda x: x.consequence_terms.contains(mt.row.vep.most_severe_consequence))

    canon_bool = (
        hl.any(lambda x: (x.canonical == 1) & (x.biotype == 'protein_coding'),
               mt.row.vep.transcript_consequences) &
        hl.any(lambda x: hl.is_defined(x.lof), canon_pc) &
        (canon_pc.map(lambda x: x.lof) == ["HC"]) &
        (hl.all(lambda x: hl.is_missing(x.lof_flags) | (x.lof_flags == ""), canon_pc)))
    non_canon_bool = (
        ~(hl.any(lambda x: (x.canonical == 1) & (x.biotype == 'protein_coding'),
                 mt.row.vep.transcript_consequences)) &
        hl.any(lambda x: hl.is_defined(x.lof), most_severe) &
        (most_severe.map(lambda x: x.lof) == ["HC"]) &
        (hl.all(lambda x: hl.is_missing(x.lof_flags) | (x.lof_flags == ""), most_severe)))

    mt = mt.annotate_rows(LOF=hl.if_else(canon_bool | non_canon_bool, True, False))

    # True if either:
    # there is a canonical and protein-coding transcript consequence for that variant whose
    # consequence terms contain "missense_variant", "inframe_deletion", or "inframe_insertion",
    # or there is no canonical and protein-coding transcript consequence for that variant, but the
    # variant's most severe consequence is one of those terms.
    # Otherwise False.
    canon_pc = mt.row.vep.transcript_consequences\
        .filter(lambda x: (x.canonical == 1) & (x.biotype == 'protein_coding'))
    canon_missense_bool = canon_pc.map(lambda x: x.consequence_terms).contains(["missense_variant"])
    noncanon_missense_bool = (
        ~(hl.any(lambda x: (x.canonical == 1) & (x.biotype == 'protein_coding'),
                 mt.row.vep.transcript_consequences)) &
        (mt.row.vep.most_severe_consequence == "missense_variant"))
    canon_inframe_bool = canon_pc.map(lambda x: x.consequence_terms).contains(["inframe_deletion"])
    noncanon_inframe_bool = (
        ~(hl.any(lambda x: (x.canonical == 1) & (x.biotype == 'protein_coding'),
                 mt.row.vep.transcript_consequences)) &
        (mt.row.vep.most_severe_consequence == "inframe_deletion"))
    canon_inframe_ins_bool = canon_pc.map(lambda x: x.consequence_terms).contains(["inframe_insertion"])
    noncanon_inframe_ins_bool = (
        ~(hl.any(lambda x: (x.canonical == 1) & (x.biotype == 'protein_coding'),
                 mt.row.vep.transcript_consequences)) &
        (mt.row.vep.most_severe_consequence == "inframe_insertion"))

    mt = mt.annotate_rows(missense=hl.if_else(
        (canon_missense_bool | noncanon_missense_bool | canon_inframe_bool |
         noncanon_inframe_bool | canon_inframe_ins_bool | noncanon_inframe_ins_bool),
        True, False))

    # If the most severe consequence is "synonymous_variant", True, else False.
    mt = mt.annotate_rows(synonymous=hl.if_else(
        mt.row.vep.most_severe_consequence == "synonymous_variant", True, False))

    # When there is a transcript consequence for that variant that is canonical, protein-coding,
    # and lof == "HC", report its lof flags.
    # When there is no canonical and protein-coding transcript consequence for that variant, but
    # there is a transcript consequence whose consequence terms contain the most severe consequence
    # and whose lof == "HC", report its lof flags.
    # Otherwise blank.
    canon_bool = hl.any(
        lambda x: (x.canonical == 1) & (x.biotype == 'protein_coding'),
        mt.row.vep.transcript_consequences)
    canon_hc_bool = hl.any(
        lambda x: (x.canonical == 1) & (x.biotype == 'protein_coding') & (x.lof == 'HC'),
        mt.row.vep.transcript_consequences)
    canon_pc_hc = mt.row.vep.transcript_consequences.filter(
        lambda x: (x.canonical == 1) & (x.biotype == 'protein_coding') & (x.lof == "HC"))
    most_severe_bool = hl.any(
        lambda x: (x.consequence_terms.contains(mt.row.vep.most_severe_consequence)) & (x.lof == 'HC'),
        mt.row.vep.transcript_consequences)
    most_severe_hc = mt.row.vep.transcript_consequences.filter(
        lambda x: (x.consequence_terms.contains(mt.row.vep.most_severe_consequence)) & (x.lof == "HC"))

    mt = mt.annotate_rows(LOF_flag=hl.case()
                          .when(canon_hc_bool, canon_pc_hc.map(lambda x: x.lof_flags))
                          .when(~canon_bool & most_severe_bool, most_severe_hc.map(lambda x: x.lof_flags))
                          .default([""]))

    return mt
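# Usage sketch (an assumption, not part of the original source): `annotate_variants`
# expects a MatrixTable that has already been split into biallelics and VEP-annotated;
# the config path below is a hypothetical placeholder.
#
#     mt = hl.split_multi_hts(mt)
#     mt = hl.vep(mt, 'vep_config.json')
#     mt = annotate_variants(mt)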
def impute_sex_chromosome_ploidy(vds: VariantDataset, calling_intervals, normalization_contig: str) -> hl.Table:
    """Impute sex chromosome ploidy from depth of reference data within calling intervals.

    Returns a :class:`.Table` with sample ID keys, with the following fields:

    - ``autosomal_mean_dp`` (*float64*): Mean depth on calling intervals on normalization contig.
    - ``x_mean_dp`` (*float64*): Mean depth on calling intervals on X chromosome.
    - ``x_ploidy`` (*float64*): Estimated ploidy on X chromosome. Equal to ``2 * x_mean_dp / autosomal_mean_dp``.
    - ``y_mean_dp`` (*float64*): Mean depth on calling intervals on Y chromosome.
    - ``y_ploidy`` (*float64*): Estimated ploidy on Y chromosome. Equal to ``2 * y_mean_dp / autosomal_mean_dp``.

    Parameters
    ----------
    vds : :class:`.VariantDataset`
        Dataset.
    calling_intervals : :class:`.Table` or :class:`.ArrayExpression`
        Calling intervals with consistent read coverage (for exomes, trim the capture intervals).
    normalization_contig : str
        Autosomal contig for depth comparison.

    Returns
    -------
    :class:`.Table`
    """
    if not isinstance(calling_intervals, Table):
        calling_intervals = hl.Table.parallelize(
            hl.map(lambda i: hl.struct(interval=i), calling_intervals),
            schema=hl.tstruct(interval=calling_intervals.dtype.element_type),
            key='interval')
    else:
        key_dtype = calling_intervals.key.dtype
        if len(key_dtype) != 1 \
                or not isinstance(calling_intervals.key[0].dtype, hl.tinterval) \
                or calling_intervals.key[0].dtype.point_type != vds.reference_data.locus.dtype:
            raise ValueError(
                f"'impute_sex_chromosome_ploidy': expect calling_intervals to be list of intervals or"
                f" table with single key of type interval<locus>, found table with key: {key_dtype}")

    rg = vds.reference_data.locus.dtype.reference_genome

    par_boundaries = []
    for par_interval in rg.par:
        par_boundaries.append(par_interval.start)
        par_boundaries.append(par_interval.end)

    # segment on PAR interval boundaries
    calling_intervals = hl.segment_intervals(calling_intervals, par_boundaries)

    # remove intervals overlapping PAR
    calling_intervals = calling_intervals.filter(
        hl.all(lambda x: ~x.overlaps(calling_intervals.interval), hl.literal(rg.par)))

    # checkpoint for efficient multiple downstream usages
    info("'impute_sex_chromosome_ploidy': checkpointing calling intervals")
    calling_intervals = calling_intervals.checkpoint(new_temp_file(extension='ht'))

    interval = calling_intervals.key[0]
    (any_bad_intervals, chrs_represented) = calling_intervals.aggregate(
        (hl.agg.any(interval.start.contig != interval.end.contig),
         hl.agg.collect_as_set(interval.start.contig)))
    if any_bad_intervals:
        raise ValueError(
            "'impute_sex_chromosome_ploidy' does not support calling intervals that span chromosome boundaries")

    if len(rg.x_contigs) != 1:
        raise NotImplementedError(
            f"reference genome {rg.name!r} has multiple X contigs, this is not supported in 'impute_sex_chromosome_ploidy'")
    chr_x = rg.x_contigs[0]
    if len(rg.y_contigs) != 1:
        raise NotImplementedError(
            f"reference genome {rg.name!r} has multiple Y contigs, this is not supported in 'impute_sex_chromosome_ploidy'")
    chr_y = rg.y_contigs[0]

    kept_contig_filter = hl.array(chrs_represented).map(
        lambda x: hl.parse_locus_interval(x, reference_genome=rg))
    vds = VariantDataset(
        hl.filter_intervals(vds.reference_data, kept_contig_filter),
        hl.filter_intervals(vds.variant_data, kept_contig_filter))

    coverage = interval_coverage(vds, calling_intervals, gq_thresholds=()).drop('gq_thresholds')

    coverage = coverage.annotate_rows(contig=coverage.interval.start.contig)
    coverage = coverage.annotate_cols(__mean_dp=hl.agg.group_by(
        coverage.contig, hl.agg.sum(coverage.sum_dp) / hl.agg.sum(coverage.interval_size)))

    mean_dp_dict = coverage.__mean_dp
    auto_dp = mean_dp_dict.get(normalization_contig)
    x_dp = mean_dp_dict.get(chr_x)
    y_dp = mean_dp_dict.get(chr_y)
    per_sample = coverage.transmute_cols(autosomal_mean_dp=auto_dp,
                                         x_mean_dp=x_dp,
                                         x_ploidy=2 * x_dp / auto_dp,
                                         y_mean_dp=y_dp,
                                         y_ploidy=2 * y_dp / auto_dp)

    info("'impute_sex_chromosome_ploidy': computing and checkpointing coverage and karyotype metrics")

    return per_sample.cols().checkpoint(new_temp_file('impute_sex_karyotype', extension='ht'))
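# Usage sketch (an assumption, not part of the original source): the VDS path, interval
# file, and choice of 'chr20' as the normalization contig below are hypothetical.
#
#     vds = hl.vds.read_vds('gs://my-bucket/my_dataset.vds')
#     intervals = hl.import_locus_intervals('calling_intervals.interval_list',
#                                           reference_genome='GRCh38')
#     ploidy_ht = impute_sex_chromosome_ploidy(vds, intervals, normalization_contig='chr20')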