def test_haploid(self):
    """Import ``haploid.vcf`` and check GT, AD and GQ for every (locus, sample) entry.

    The expected rows mix diploid and haploid ``GT`` calls on contig ``X``.
    """
    # (position on X, sample id, GT, AD, GQ)
    expected_rows = [
        (16050036, "C1046::HG02024", hl.call(0, 0), [10, 0], 44),
        (16050036, "C1046::HG02025", hl.call(1), [0, 6], 70),
        (16061250, "C1046::HG02024", hl.call(2, 2), [0, 0, 11], 33),
        (16061250, "C1046::HG02025", hl.call(2), [0, 0, 9], 24),
    ]
    expected = hl.Table.parallelize(
        [
            hl.struct(locus=hl.locus("X", position), s=sample, GT=gt, AD=ad, GQ=gq)
            for position, sample, gt, ad, gq in expected_rows
        ],
        key=['locus', 's'],
    )

    observed = (
        hl.import_vcf(resource('haploid.vcf'))
        .entries()
        .key_by('locus', 's')
        .select('GT', 'AD', 'GQ')
    )
    self.assertTrue(observed._same(expected))
def phase_haploid_proband_x_nonpar(
        proband_call: hl.expr.CallExpression,
        father_call: hl.expr.CallExpression,
        mother_call: hl.expr.CallExpression) -> hl.expr.ArrayExpression:
    """
    Phase a trio for a haploid proband in the non-PAR region of X.

    The proband's single allele is looked up among the mother's two alleles;
    when no maternal allele matches, the whole result is missing.

    :param CallExpression proband_call: Input proband genotype call
    :param CallExpression father_call: Input father genotype call
    :param CallExpression mother_call: Input mother genotype call
    :return: Array containing: phased proband call, phased father call, phased mother call
    :rtype: ArrayExpression
    """
    proband_allele = proband_call[0]

    # (index, allele) pairs for the mother; `find` yields the first pair whose
    # allele equals the proband's allele.
    maternal_alleles = hl.zip_with_index(hl.array([mother_call[0], mother_call[1]]))
    transmitted_allele = maternal_alleles.find(lambda pair: pair[1] == proband_allele)

    phased_proband = hl.call(proband_allele, phased=True)
    # The father's call is only reported when he is haploid.
    phased_father = hl.or_missing(father_call.is_haploid(), hl.call(father_call[0], phased=True))
    phased_mother = phase_parent_call(mother_call, transmitted_allele[0])

    return hl.or_missing(
        hl.is_defined(transmitted_allele),
        hl.array([phased_proband, phased_father, phased_mother]),
    )
def unphase_mt(mt: hl.MatrixTable) -> hl.MatrixTable:
    """
    Return a copy of the MatrixTable whose ``GT`` entry field is unphased.

    Assumes the call lives in ``mt.GT`` and is diploid or haploid only; any
    other call becomes missing.
    """
    gt = mt.GT
    unphased_gt = (
        hl.case()
        .when(gt.is_diploid(), hl.call(gt[0], gt[1], phased=False))
        .when(gt.is_haploid(), hl.call(gt[0], phased=False))
        .default(hl.null(hl.tcall))
    )
    return mt.annotate_entries(GT=unphased_gt)
def test_call_fields(self):
    """Import ``generic.vcf`` with three call fields and check each is parsed as a call."""
    no_call = hl.null(hl.tcall)
    # (position on X, sample id, GT, GTA, GTZ)
    expected_rows = [
        (16050036, "C1046::HG02024", hl.call(0, 0), no_call, hl.call(0, 1)),
        (16050036, "C1046::HG02025", hl.call(1), no_call, hl.call(0)),
        (16061250, "C1046::HG02024", hl.call(2, 2), hl.call(2, 1), hl.call(1, 1)),
        (16061250, "C1046::HG02025", hl.call(2), no_call, hl.call(1)),
    ]
    expected = hl.Table.parallelize(
        [
            hl.struct(locus=hl.locus("X", position), s=sample, GT=gt, GTA=gta, GTZ=gtz)
            for position, sample, gt, gta, gtz in expected_rows
        ],
        key=['locus', 's'],
    )

    observed = (
        hl.import_vcf(resource('generic.vcf'), call_fields=['GT', 'GTA', 'GTZ'])
        .entries()
        .key_by('locus', 's')
        .select('GT', 'GTA', 'GTZ')
    )
    self.assertTrue(observed._same(expected))
def test_lgt_to_gt():
    """Check ``hl.vds.lgt_to_gt`` on unphased, phased and haploid local calls."""
    la = [0, 3, 5]
    # (local call, expected global call after mapping through `la`)
    cases = [
        (hl.call(0, 0, phased=False), hl.Call([0, 0], phased=False)),
        (hl.call(0, 0, phased=True), hl.Call([0, 0], phased=True)),
        (hl.call(0, 1, phased=False), hl.Call([0, 3], phased=False)),
        (hl.call(2, 0, phased=True), hl.Call([5, 0], phased=True)),
        (hl.call(1, phased=False), hl.Call([3], phased=False)),
    ]

    observed = hl.eval(tuple(hl.vds.lgt_to_gt(local_call, la) for local_call, _ in cases))
    assert observed == tuple(global_call for _, global_call in cases)
def unphase_call_expr(call_expr: hl.expr.CallExpression) -> hl.expr.CallExpression:
    """
    Build the unphased equivalent of a call expression.

    The input may be phased or unphased. Calls that are neither diploid nor
    haploid are turned into a missing call.

    :param call_expr: Input call expression
    :return: unphased call expression
    """
    diploid_unphased = hl.call(call_expr[0], call_expr[1], phased=False)
    haploid_unphased = hl.call(call_expr[0], phased=False)

    builder = hl.case()
    builder = builder.when(call_expr.is_diploid(), diploid_unphased)
    builder = builder.when(call_expr.is_haploid(), haploid_unphased)
    return builder.default(hl.null(hl.tcall))
def main(args):
    """
    Extract genotypes for an interval (or gene) from a MatrixTable and export them as BGEN or VCF.

    The MatrixTable is produced by calling ``args.load_mt_function`` from the
    dynamically imported module ``args.load_module``, passing the
    comma-separated ``args.additional_args`` as positional strings.

    When ``args.gene_map_ht_path`` is given, a group file is also exported to
    ``args.group_output_file`` with one line per gene/annotation group.

    NOTE(review): this function's block nesting was reconstructed from a
    whitespace-collapsed source; confirm the extent of each ``if``/``else``
    body against the canonical file.

    :param args: Parsed command-line arguments (attributes read here:
        n_threads, reference, additional_args, load_module, load_mt_function,
        gene_map_ht_path, interval, gene, groups, group_output_file, no_adj,
        input_bgen, callrate_filter, export_bgen, mean_impute_missing,
        output_file)
    """
    hl.init(master=f'local[{args.n_threads}]',
            log=hl.utils.timestamp_path(os.path.join(tempfile.gettempdir(), 'extract_vcf'), suffix='.log'),
            default_reference=args.reference)

    # Make modules located at the filesystem root importable for the dynamic import below.
    sys.path.append('/')
    add_args = []
    if args.additional_args is not None:
        add_args = args.additional_args.split(',')
    load_module = importlib.import_module(args.load_module)
    mt = getattr(load_module, args.load_mt_function)(*add_args)

    if args.gene_map_ht_path is None:
        interval = [hl.parse_locus_interval(args.interval)]
    else:
        gene_ht = hl.read_table(args.gene_map_ht_path)

        if args.gene is not None:
            # Restrict to the requested gene and use its interval (kept as a Hail expression).
            gene_ht = gene_ht.filter(gene_ht.gene_symbol == args.gene)
            interval = gene_ht.aggregate(hl.agg.take(gene_ht.interval, 1), _localize=False)
        else:
            interval = [hl.parse_locus_interval(args.interval)]
            gene_ht = hl.filter_intervals(gene_ht, interval)

        # Keep only the requested annotation groups and write the group file
        # (group name and tab-delimited variants, no header).
        gene_ht = gene_ht.filter(hl.set(args.groups.split(',')).contains(gene_ht.annotation))
        gene_ht.select(group=gene_ht.gene_id + '_' + gene_ht.gene_symbol + '_' + gene_ht.annotation,
                       variant=hl.delimit(gene_ht.variants, '\t')
                       ).key_by().drop('start').export(args.group_output_file, header=False)
        # TODO: possible minor optimization: filter output VCF to only variants in `gene_ht.variants`

    if not args.no_adj:
        mt = mt.filter_entries(mt.adj)

    mt = hl.filter_intervals(mt, interval)

    if not args.input_bgen:
        # Keep only GT and drop rows without any non-reference genotype.
        mt = mt.select_entries('GT')
        mt = mt.filter_rows(hl.agg.count_where(mt.GT.is_non_ref()) > 0)
    # Synthesize a variant ID of the form contig:position_ref/alt.
    mt = mt.annotate_rows(rsid=mt.locus.contig + ':' + hl.str(mt.locus.position) + '_' + mt.alleles[0] + '/' + mt.alleles[1])

    if args.callrate_filter:
        mt = mt.filter_rows(hl.agg.fraction(hl.is_defined(mt.GT)) >= args.callrate_filter)

    if args.export_bgen:
        if not args.input_bgen:
            # Haploid calls are duplicated into homozygous diploid calls before converting GT to GP.
            mt = mt.annotate_entries(GT=hl.if_else(mt.GT.is_haploid(), hl.call(mt.GT[0], mt.GT[0]), mt.GT))
            mt = gt_to_gp(mt)
            mt = impute_missing_gp(mt, mean_impute=args.mean_impute_missing)
        hl.export_bgen(mt, args.output_file, gp=mt.GP, varid=mt.rsid)
    else:
        # Missing genotypes are exported as homozygous reference.
        mt = mt.annotate_entries(GT=hl.or_else(mt.GT, 
hl.call(0, 0)))
        # Note: no mean-imputation for VCF
        hl.export_vcf(mt, args.output_file)
def hom_alt_depletion_fix(
    mt: hl.MatrixTable,
    het_non_ref_expr: hl.expr.BooleanExpression,
    af_expr: hl.expr.Float64Expression,
    af_cutoff: float = 0.01,
    ab_cutoff: float = 0.9,
) -> hl.MatrixTable:
    """
    Apply the temporary fix for the depletion of homozygous alternate genotypes.

    A het genotype is rewritten to hom alt (``1/1``) when the variant's allele
    frequency exceeds ``af_cutoff`` and the genotype's allele balance
    (``AD[1] / DP``) exceeds ``ab_cutoff``.

    More details about the problem can be found on the gnomAD blog:
    https://gnomad.broadinstitute.org/blog/2020-10-gnomad-v3-1-new-content-methods-annotations-and-data-availability/#tweaks-and-updates

    :param mt: Input MT that needs hom alt genotype fix
    :param het_non_ref_expr: Expression indicating whether the original genotype (pre split multi) is het non ref
    :param af_expr: Allele frequency expression to determine which variants need the hom alt fix
    :param af_cutoff: Allele frequency cutoff for variants that need the hom alt fix. Default is 0.01
    :param ab_cutoff: Allele balance cutoff to determine which genotypes need the hom alt fix. Default is 0.9
    :return: MatrixTable with genotypes adjusted for the hom alt depletion fix
    """
    needs_hom_alt_fix = (
        mt.GT.is_het()
        # Skip adjusting genotypes if sample originally had a het nonref genotype
        & ~het_non_ref_expr
        & (af_expr > af_cutoff)
        & (mt.AD[1] / mt.DP > ab_cutoff)
    )
    fixed_gt = hl.if_else(needs_hom_alt_fix, hl.call(1, 1), mt.GT)
    return mt.annotate_entries(GT=fixed_gt)
def test_pcrelate_issue_5263():
    """``pc_relate`` on a renamed call field must match the result on the original ``GT``.

    The MatrixTable also gets an unrelated random ``GT`` field, so the check
    only holds if ``pc_relate`` uses the field it was handed (``GT2``).
    """
    bn_mt = hl.balding_nichols_model(3, 50, 100)
    expected = hl.pc_relate(bn_mt.GT, 0.10, k=2, statistics='all')

    decoy_mt = bn_mt.select_entries(
        GT2=bn_mt.GT,
        GT=hl.call(hl.rand_bool(0.5), hl.rand_bool(0.5)),
    )
    actual = hl.pc_relate(decoy_mt.GT2, 0.10, k=2, statistics='all')

    assert expected._same(actual, tolerance=1e-4)
def adjusted_sex_ploidy_expr(
    locus_expr: hl.expr.LocusExpression,
    gt_expr: hl.expr.CallExpression,
    karyotype_expr: hl.expr.StringExpression,
    xy_karyotype_str: str = "XY",
    xx_karyotype_str: str = "XX",
) -> hl.expr.CallExpression:
    """
    Build an entry expression that adjusts genotypes for sex ploidy.

    XY samples become haploid on non-PAR X/Y (het calls there become missing)
    and XX samples become missing on Y. Any other karyotype leaves the
    genotype unchanged.

    :param locus_expr: Locus
    :param gt_expr: Genotype
    :param karyotype_expr: Karyotype
    :param xy_karyotype_str: Male sex karyotype representation
    :param xx_karyotype_str: Female sex karyotype representation
    :return: Genotype adjusted for sex ploidy
    """
    is_xy = karyotype_expr == xy_karyotype_str
    is_xx = karyotype_expr == xx_karyotype_str

    in_x_nonpar = locus_expr.in_x_nonpar()
    in_y_par = locus_expr.in_y_par()
    in_y_nonpar = locus_expr.in_y_nonpar()

    xx_on_y = is_xx & (in_y_par | in_y_nonpar)
    xy_hemizygous = is_xy & (in_x_nonpar | in_y_nonpar)
    missing_call = hl.null(hl.tcall)

    # missing_false=True: a missing condition is treated as False rather than
    # making the whole result missing.
    builder = hl.case(missing_false=True)
    builder = builder.when(xx_on_y, missing_call)
    builder = builder.when(xy_hemizygous & gt_expr.is_het(), missing_call)
    builder = builder.when(xy_hemizygous, hl.call(gt_expr[0], phased=False))
    return builder.default(gt_expr)
def adjust_sex_ploidy(mt: hl.MatrixTable, sex_expr: hl.expr.StringExpression,
                      male_str: str = 'male', female_str: str = 'female') -> hl.MatrixTable:
    """
    Adjust ``mt.GT`` for sex ploidy.

    Males become haploid on non-PAR X/Y (het calls there become missing) and
    females become missing on Y.

    :param MatrixTable mt: Input MatrixTable
    :param StringExpression sex_expr: Expression pointing to sex in MT (if not male_str or female_str, no change)
    :param str male_str: String for males (default 'male')
    :param str female_str: String for females (default 'female')
    :return: MatrixTable with fixed ploidy for sex chromosomes
    :rtype: MatrixTable
    """
    is_male = sex_expr == male_str
    is_female = sex_expr == female_str

    in_x_nonpar = mt.locus.in_x_nonpar()
    in_y_par = mt.locus.in_y_par()
    in_y_nonpar = mt.locus.in_y_nonpar()

    female_on_y = is_female & (in_y_par | in_y_nonpar)
    male_hemizygous = is_male & (in_x_nonpar | in_y_nonpar)
    missing_call = hl.null(hl.tcall)

    # missing_false=True: a missing condition is treated as False.
    builder = hl.case(missing_false=True)
    builder = builder.when(female_on_y, missing_call)
    builder = builder.when(male_hemizygous & mt.GT.is_het(), missing_call)
    builder = builder.when(male_hemizygous, hl.call(mt.GT[0], phased=False))
    return mt.annotate_entries(GT=builder.default(mt.GT))
def phase_diploid_proband(
        locus: hl.expr.LocusExpression,
        alleles: hl.expr.ArrayExpression,
        proband_call: hl.expr.CallExpression,
        father_call: hl.expr.CallExpression,
        mother_call: hl.expr.CallExpression
) -> hl.expr.ArrayExpression:
    """
    Phase a trio for a diploid proband (autosomes, PAR regions of sex chromosomes
    or non-PAR regions of a female proband).

    Every (father allele, mother allele) pair whose one-hot vectors sum to the
    proband's one-hot vector is collected; phase is returned only when exactly
    one such pair exists.

    :param LocusExpression locus: Locus in the trio MatrixTable
    :param ArrayExpression alleles: Alleles in the trio MatrixTable
    :param CallExpression proband_call: Input proband genotype call
    :param CallExpression father_call: Input father genotype call
    :param CallExpression mother_call: Input mother genotype call
    :return: Array containing: phased proband call, phased father call, phased mother call
    :rtype: ArrayExpression
    """
    proband_v = proband_call.one_hot_alleles(alleles)

    # In non-PAR X/Y the father only contributes when he is haploid.
    in_sex_nonpar = locus.in_x_nonpar() | locus.in_y_nonpar()
    haploid_father_v = hl.or_missing(
        father_call.is_haploid(),
        hl.array([father_call.one_hot_alleles(alleles)]),
    )
    father_v = hl.cond(
        in_sex_nonpar,
        haploid_father_v,
        call_to_one_hot_alleles_array(father_call, alleles),
    )
    mother_v = call_to_one_hot_alleles_array(mother_call, alleles)

    def compatible_pairs(father_entry):
        # father_entry / mother_entry are (allele index, one-hot vector) pairs.
        return (
            hl.zip_with_index(mother_v)
            .filter(lambda mother_entry: mother_entry[1] + father_entry[1] == proband_v)
            .map(lambda mother_entry: hl.struct(m=mother_entry[0], f=father_entry[0]))
        )

    combinations = hl.flatmap(compatible_pairs, hl.zip_with_index(father_v))
    unique_pair = combinations[0]

    phased_proband = hl.call(father_call[unique_pair.f], mother_call[unique_pair.m], phased=True)
    phased_father = hl.cond(
        father_call.is_haploid(),
        hl.call(father_call[0], phased=True),
        phase_parent_call(father_call, unique_pair.f),
    )
    phased_mother = phase_parent_call(mother_call, unique_pair.m)

    return hl.or_missing(
        hl.is_defined(combinations) & (hl.len(combinations) == 1),
        hl.array([phased_proband, phased_father, phased_mother]),
    )
def test_haploid(self):
    """Import ``haploid.vcf`` and check GT, AD and GQ for every (locus, sample) entry.

    The expected rows mix diploid and haploid ``GT`` calls on contig ``X``.
    """
    # (position on X, sample id, GT, AD, GQ)
    expected_rows = [
        (16050036, "C1046::HG02024", hl.call(0, 0), [10, 0], 44),
        (16050036, "C1046::HG02025", hl.call(1), [0, 6], 70),
        (16061250, "C1046::HG02024", hl.call(2, 2), [0, 0, 11], 33),
        (16061250, "C1046::HG02025", hl.call(2), [0, 0, 9], 24),
    ]
    expected = hl.Table.parallelize(
        [
            hl.struct(locus=hl.locus("X", position), s=sample, GT=gt, AD=ad, GQ=gq)
            for position, sample, gt, ad, gq in expected_rows
        ],
        key=['locus', 's'],
    )

    observed = (
        hl.import_vcf(resource('haploid.vcf'))
        .entries()
        .key_by('locus', 's')
        .select('GT', 'AD', 'GQ')
    )
    self.assertTrue(observed._same(expected))
def phase_diploid_proband(
        locus: hl.expr.LocusExpression,
        alleles: hl.expr.ArrayExpression,
        proband_call: hl.expr.CallExpression,
        father_call: hl.expr.CallExpression,
        mother_call: hl.expr.CallExpression
) -> hl.expr.ArrayExpression:
    """
    Phase a trio for a diploid proband (autosomes, PAR regions of sex chromosomes
    or non-PAR regions of a female proband).

    Every (father allele, mother allele) pair whose one-hot vectors sum to the
    proband's one-hot vector is collected; phase is returned only when exactly
    one such pair exists.

    :param LocusExpression locus: Locus in the trio MatrixTable
    :param ArrayExpression alleles: Alleles in the trio MatrixTable
    :param CallExpression proband_call: Input proband genotype call
    :param CallExpression father_call: Input father genotype call
    :param CallExpression mother_call: Input mother genotype call
    :return: Array containing: phased proband call, phased father call, phased mother call
    :rtype: ArrayExpression
    """
    proband_v = proband_call.one_hot_alleles(alleles)

    # In non-PAR X/Y the father only contributes when he is haploid.
    in_sex_nonpar = locus.in_x_nonpar() | locus.in_y_nonpar()
    haploid_father_v = hl.or_missing(
        father_call.is_haploid(),
        hl.array([father_call.one_hot_alleles(alleles)]),
    )
    father_v = hl.cond(
        in_sex_nonpar,
        haploid_father_v,
        call_to_one_hot_alleles_array(father_call, alleles),
    )
    mother_v = call_to_one_hot_alleles_array(mother_call, alleles)

    def compatible_pairs(father_entry):
        # father_entry / mother_entry are (allele index, one-hot vector) pairs.
        return (
            hl.zip_with_index(mother_v)
            .filter(lambda mother_entry: mother_entry[1] + father_entry[1] == proband_v)
            .map(lambda mother_entry: hl.struct(m=mother_entry[0], f=father_entry[0]))
        )

    combinations = hl.flatmap(compatible_pairs, hl.zip_with_index(father_v))
    unique_pair = combinations[0]

    phased_proband = hl.call(father_call[unique_pair.f], mother_call[unique_pair.m], phased=True)
    phased_father = hl.cond(
        father_call.is_haploid(),
        hl.call(father_call[0], phased=True),
        phase_parent_call(father_call, unique_pair.f),
    )
    phased_mother = phase_parent_call(mother_call, unique_pair.m)

    return hl.or_missing(
        hl.is_defined(combinations) & (hl.len(combinations) == 1),
        hl.array([phased_proband, phased_father, phased_mother]),
    )
def rewrite_ref(r):
    """Reshape a reference-block struct ``r`` to the fields of ``merged_schema``.

    ``LA`` becomes the literal ``[0]``, ``LGT``/``GT`` become ``0/0``, and every
    other field is copied from ``r`` when present or set to missing otherwise.
    """
    def ref_block_value(name, dtype):
        if name == 'LA':
            return hl.literal([0])
        if name in ('LGT', 'GT'):
            return hl.call(0, 0)
        return r[name] if name in r else hl.missing(dtype)

    return r.select(**{name: ref_block_value(name, dtype) for name, dtype in merged_schema.items()})
def call_to_one_hot_alleles_array(call: hl.expr.CallExpression, alleles: hl.expr.ArrayExpression) -> hl.expr.ArrayExpression:
    """
    Return the distinct one-hot-encoded allele vectors of a genotype call.

    The result is ordered: the first vector encodes the call's first allele and
    the second vector (present only for het calls) encodes its second allele.

    :param CallExpression call: genotype
    :param ArrayExpression alleles: Alleles at the site
    :return: Array of one-hot-encoded alleles
    :rtype: ArrayExpression
    """
    def one_hot(allele):
        # Wrap the single allele in a haploid call to reuse one_hot_alleles.
        return hl.call(allele).one_hot_alleles(alleles)

    both_alleles = hl.array([
        one_hot(call[0]),
        one_hot(call[1]),
    ])
    first_allele_only = hl.array([one_hot(call[0])])
    return hl.cond(call.is_het(), both_alleles, first_allele_only)
def call_to_one_hot_alleles_array(call: hl.expr.CallExpression, alleles: hl.expr.ArrayExpression) -> hl.expr.ArrayExpression:
    """
    Return the distinct one-hot-encoded allele vectors of a genotype call.

    The result is ordered: the first vector encodes the call's first allele and
    the second vector (present only for het calls) encodes its second allele.

    :param CallExpression call: genotype
    :param ArrayExpression alleles: Alleles at the site
    :return: Array of one-hot-encoded alleles
    :rtype: ArrayExpression
    """
    def one_hot(allele):
        # Wrap the single allele in a haploid call to reuse one_hot_alleles.
        return hl.call(allele).one_hot_alleles(alleles)

    both_alleles = hl.array([
        one_hot(call[0]),
        one_hot(call[1]),
    ])
    first_allele_only = hl.array([one_hot(call[0])])
    return hl.cond(call.is_het(), both_alleles, first_allele_only)
def transform_one(mt, info_to_keep=None) -> Table:
    """transforms a gvcf into a form suitable for combining

    The input to this should be some result of either :func:`.import_vcf` or
    :func:`.import_vcfs` with `array_elements_required=False`.

    There is a strong assumption that this function will be called on a matrix
    table with one column.

    :param mt: Input matrix table (localized to a table of rows with an
        ``__entries`` array before transformation).
    :param info_to_keep: Names of INFO fields to carry into ``gvcf_info``.
        When empty or ``None``, every INFO field except ``END`` and ``DP`` is
        kept.
    :return: Table with one row per input row, with ``<NON_REF>`` trimmed from
        the alleles and entries rewritten to local-allele fields
        (``LA``, ``LAD``, ``LGT``, ``LPGT``, ``LPL``, ``RGQ``, ...).
    """
    # `None` replaces the former mutable `[]` default; both mean "use the default set".
    if not info_to_keep:
        info_to_keep = [name for name in mt.info if name not in ['END', 'DP']]
    mt = localize(mt)

    # NOTE(review): the cache below is keyed on the row dtype only, but the
    # cached function closes over `info_to_keep`; a later call with the same
    # dtype and a different `info_to_keep` reuses the first function.
    if mt.row.dtype not in _transform_rows_function_map:
        f = hl.experimental.define_function(
            lambda row: hl.rbind(
                hl.len(row.alleles), '<NON_REF>' == row.alleles[-1],
                lambda alleles_len, has_non_ref: hl.struct(
                    locus=row.locus,
                    # Drop the trailing <NON_REF> allele when present.
                    alleles=hl.cond(has_non_ref, row.alleles[:-1], row.alleles),
                    rsid=row.rsid,
                    __entries=row.__entries.map(
                        lambda e: hl.struct(
                            DP=e.DP,
                            END=row.info.END,
                            GQ=e.GQ,
                            LA=hl.range(0, alleles_len - hl.cond(has_non_ref, 1, 0)),
                            LAD=hl.cond(has_non_ref, e.AD[:-1], e.AD),
                            LGT=e.GT,
                            LPGT=e.PGT,
                            # Trim the PL entries that involve <NON_REF>; missing
                            # when no alternate allele remains.
                            LPL=hl.cond(has_non_ref,
                                        hl.cond(alleles_len > 2,
                                                e.PL[:-alleles_len],
                                                hl.null(e.PL.dtype)),
                                        hl.cond(alleles_len > 1,
                                                e.PL,
                                                hl.null(e.PL.dtype))),
                            MIN_DP=e.MIN_DP,
                            PID=e.PID,
                            # PL of the (ref, <NON_REF>) genotype.
                            RGQ=hl.cond(
                                has_non_ref,
                                e.PL[hl.call(0, alleles_len - 1).unphased_diploid_gt_index()],
                                hl.null(e.PL.dtype.element_type)),
                            SB=e.SB,
                            # INFO is kept only for rows without END.
                            gvcf_info=hl.case()
                            .when(hl.is_missing(row.info.END),
                                  hl.struct(**(row.info.select(*info_to_keep))))
                            .or_missing()
                        ))),
            ),
            mt.row.dtype)
        _transform_rows_function_map[mt.row.dtype] = f
    transform_row = _transform_rows_function_map[mt.row.dtype]
    return Table(TableMapRows(mt._tir, Apply(transform_row._name, transform_row._ret_type, TopLevelReference('row'))))
def phase_y_nonpar(
        proband_call: hl.expr.CallExpression,
        father_call: hl.expr.CallExpression,
) -> hl.expr.ArrayExpression:
    """
    Phase proband and father in the non-PAR region of Y.

    Phase is returned only when both the proband and the father are haploid
    and carry the same allele; the mother's slot is always a missing call.

    :param CallExpression proband_call: Input proband genotype call
    :param CallExpression father_call: Input father genotype call
    :return: Array containing: phased proband call, phased father call, phased mother call
    :rtype: ArrayExpression
    """
    both_haploid = proband_call.is_haploid() & father_call.is_haploid()
    can_phase = both_haploid & (father_call[0] == proband_call[0])

    phased_trio = hl.array([
        hl.call(proband_call[0], phased=True),
        hl.call(father_call[0], phased=True),
        hl.null(hl.tcall),
    ])
    return hl.or_missing(can_phase, phased_trio)
def phase_y_nonpar(
        proband_call: hl.expr.CallExpression,
        father_call: hl.expr.CallExpression,
) -> hl.expr.ArrayExpression:
    """
    Phase proband and father in the non-PAR region of Y.

    Phase is returned only when both the proband and the father are haploid
    and carry the same allele; the mother's slot is always a missing call.

    :param CallExpression proband_call: Input proband genotype call
    :param CallExpression father_call: Input father genotype call
    :return: Array containing: phased proband call, phased father call, phased mother call
    :rtype: ArrayExpression
    """
    both_haploid = proband_call.is_haploid() & father_call.is_haploid()
    can_phase = both_haploid & (father_call[0] == proband_call[0])

    phased_trio = hl.array([
        hl.call(proband_call[0], phased=True),
        hl.call(father_call[0], phased=True),
        hl.null(hl.tcall),
    ])
    return hl.or_missing(can_phase, phased_trio)
def test_call_fields(self):
    """Import ``generic.vcf`` with three call fields and check each is parsed as a call."""
    no_call = hl.null(hl.tcall)
    # (position on X, sample id, GT, GTA, GTZ)
    expected_rows = [
        (16050036, "C1046::HG02024", hl.call(0, 0), no_call, hl.call(0, 1)),
        (16050036, "C1046::HG02025", hl.call(1), no_call, hl.call(0)),
        (16061250, "C1046::HG02024", hl.call(2, 2), hl.call(2, 1), hl.call(1, 1)),
        (16061250, "C1046::HG02025", hl.call(2), no_call, hl.call(1)),
    ]
    expected = hl.Table.parallelize(
        [
            hl.struct(locus=hl.locus("X", position), s=sample, GT=gt, GTA=gta, GTZ=gtz)
            for position, sample, gt, gta, gtz in expected_rows
        ],
        key=['locus', 's'],
    )

    observed = (
        hl.import_vcf(resource('generic.vcf'), call_fields=['GT', 'GTA', 'GTZ'])
        .entries()
        .key_by('locus', 's')
        .select('GT', 'GTA', 'GTZ')
    )
    self.assertTrue(observed._same(expected))
def create_all_values():
    """Return a struct expression with one defined and one missing value for a range of Hail types."""
    # Insertion order is the struct's field order, so entries stay in this order.
    fields = {
        'f32': hl.float32(3.14),
        'i64': hl.int64(-9),
        'm': hl.null(hl.tfloat64),
        'astruct': hl.struct(a=hl.null(hl.tint32), b=5.5),
        'mstruct': hl.null(hl.tstruct(x=hl.tint32, y=hl.tstr)),
        'aset': hl.set(['foo', 'bar', 'baz']),
        'mset': hl.null(hl.tset(hl.tfloat64)),
        'd': hl.dict({
            hl.array(['a', 'b']): 0.5,
            hl.array(['x', hl.null(hl.tstr), 'z']): 0.3,
        }),
        'md': hl.null(hl.tdict(hl.tint32, hl.tstr)),
        'h38': hl.locus('chr22', 33878978, 'GRCh38'),
        'ml': hl.null(hl.tlocus('GRCh37')),
        'i': hl.interval(hl.locus('1', 999), hl.locus('1', 1001)),
        'c': hl.call(0, 1),
        'mc': hl.null(hl.tcall),
        't': hl.tuple([hl.call(1, 2, phased=True), 'foo', hl.null(hl.tstr)]),
        'mt': hl.null(hl.ttuple(hl.tlocus('GRCh37'), hl.tbool)),
    }
    return hl.struct(**fields)
def create_all_values():
    """Return a struct expression with one defined and one missing value for a range of Hail types."""
    # Insertion order is the struct's field order, so entries stay in this order.
    fields = {
        'f32': hl.float32(3.14),
        'i64': hl.int64(-9),
        'm': hl.null(hl.tfloat64),
        'astruct': hl.struct(a=hl.null(hl.tint32), b=5.5),
        'mstruct': hl.null(hl.tstruct(x=hl.tint32, y=hl.tstr)),
        'aset': hl.set(['foo', 'bar', 'baz']),
        'mset': hl.null(hl.tset(hl.tfloat64)),
        'd': hl.dict({
            hl.array(['a', 'b']): 0.5,
            hl.array(['x', hl.null(hl.tstr), 'z']): 0.3,
        }),
        'md': hl.null(hl.tdict(hl.tint32, hl.tstr)),
        'h38': hl.locus('chr22', 33878978, 'GRCh38'),
        'ml': hl.null(hl.tlocus('GRCh37')),
        'i': hl.interval(hl.locus('1', 999), hl.locus('1', 1001)),
        'c': hl.call(0, 1),
        'mc': hl.null(hl.tcall),
        't': hl.tuple([hl.call(1, 2, phased=True), 'foo', hl.null(hl.tstr)]),
        'mt': hl.null(hl.ttuple(hl.tlocus('GRCh37'), hl.tbool)),
    }
    return hl.struct(**fields)
def coalesce_join(ref, var):
    """Merge a reference entry and a variant entry into one struct.

    The call field (``GT`` if ``var`` has it, else ``LGT``) falls back to
    ``0/0`` when missing in ``var``. Fields present in both structs take the
    ``var`` value and fall back to the ``ref`` value. Fields only in ``var``
    are appended unchanged.
    """
    call_field = 'GT' if 'GT' in var else 'LGT'
    assert call_field in var, var.dtype

    merged_fields = {call_field: hl.coalesce(var[call_field], hl.call(0, 0))}
    merged_fields.update({
        field: hl.coalesce(var[field], ref[field])
        for field in ref.dtype
        if field in var
    })

    var_only_fields = {f: var[f] for f in var if f not in merged_fields}
    return hl.struct(**merged_fields).annotate(**var_only_fields)
def phase_parent_call(call: hl.expr.CallExpression, transmitted_allele_index: int):
    """
    Return the phased parent genotype, with the transmitted allele listed first.

    :param CallExpression call: Parent genotype
    :param int transmitted_allele_index: index of transmitted allele (0 or 1)
    :return: Phased parent genotype
    :rtype: CallExpression
    """
    transmitted = call[transmitted_allele_index]
    # hl.int(index == 0) is 1 when the index is 0 and 0 otherwise, i.e. the
    # index of the allele that was not transmitted.
    untransmitted = call[hl.int(transmitted_allele_index == 0)]
    return hl.call(transmitted, untransmitted, phased=True)
def transform_one(mt, info_to_keep=None) -> Table:
    """Transform a localized GVCF matrix table into the row format used for combining.

    :param mt: Input matrix table (localized to a table of rows with an
        ``__entries`` array before transformation).
    :param info_to_keep: Names of INFO fields to carry into ``gvcf_info``.
        When empty or ``None``, every INFO field except ``END`` and ``DP`` is
        kept.
    :return: Table with one row per input row, with ``<NON_REF>`` trimmed from
        the alleles and entries rewritten to local-allele fields
        (``LA``, ``LAD``, ``LGT``, ``LPGT``, ``LPL``, ``RGQ``, ...).
    """
    # `None` replaces the former mutable `[]` default; both mean "use the default set".
    if not info_to_keep:
        info_to_keep = [name for name in mt.info if name not in ['END', 'DP']]
    mt = localize(mt)

    # NOTE(review): the cache below is keyed on the row dtype only, but the
    # cached function closes over `info_to_keep`; a later call with the same
    # dtype and a different `info_to_keep` reuses the first function.
    if mt.row.dtype not in _transform_rows_function_map:
        f = hl.experimental.define_function(
            lambda row: hl.rbind(
                hl.len(row.alleles), '<NON_REF>' == row.alleles[-1],
                lambda alleles_len, has_non_ref: hl.struct(
                    locus=row.locus,
                    # Drop the trailing <NON_REF> allele when present.
                    alleles=hl.cond(has_non_ref, row.alleles[:-1], row.alleles),
                    rsid=row.rsid,
                    __entries=row.__entries.map(
                        lambda e: hl.struct(
                            DP=e.DP,
                            END=row.info.END,
                            GQ=e.GQ,
                            LA=hl.range(0, alleles_len - hl.cond(has_non_ref, 1, 0)),
                            LAD=hl.cond(has_non_ref, e.AD[:-1], e.AD),
                            LGT=e.GT,
                            LPGT=e.PGT,
                            # Trim the PL entries that involve <NON_REF>; missing
                            # when no alternate allele remains.
                            LPL=hl.cond(has_non_ref,
                                        hl.cond(alleles_len > 2,
                                                e.PL[:-alleles_len],
                                                hl.null(e.PL.dtype)),
                                        hl.cond(alleles_len > 1,
                                                e.PL,
                                                hl.null(e.PL.dtype))),
                            MIN_DP=e.MIN_DP,
                            PID=e.PID,
                            # PL of the (ref, <NON_REF>) genotype.
                            RGQ=hl.cond(
                                has_non_ref,
                                e.PL[hl.call(0, alleles_len - 1).unphased_diploid_gt_index()],
                                hl.null(e.PL.dtype.element_type)),
                            SB=e.SB,
                            # INFO is kept (after parse_as_fields) only for rows without END.
                            gvcf_info=hl.case()
                            .when(hl.is_missing(row.info.END),
                                  hl.struct(**(
                                      parse_as_fields(
                                          row.info.select(*info_to_keep),
                                          has_non_ref)
                                  )))
                            .or_missing()
                        ))),
            ),
            mt.row.dtype)
        _transform_rows_function_map[mt.row.dtype] = f
    transform_row = _transform_rows_function_map[mt.row.dtype]
    return Table(TableMapRows(mt._tir, Apply(transform_row._name, transform_row._ret_type, TopLevelReference('row'))))
def create_all_values_datasets():
    """Build a cached Table and a cached MatrixTable annotated with values of many Hail types.

    The same struct of values is attached as globals and row fields of the
    table, and as globals, row, column and entry fields of the matrix table
    (each under a distinguishing prefix).

    :return: Tuple ``(all_values_table, all_values_matrix_table)``
    """
    # Insertion order is the struct's field order, so entries stay in this order.
    value_fields = {
        'f32': hl.float32(3.14),
        'i64': hl.int64(-9),
        'm': hl.null(hl.tfloat64),
        'astruct': hl.struct(a=hl.null(hl.tint32), b=5.5),
        'mstruct': hl.null(hl.tstruct(x=hl.tint32, y=hl.tstr)),
        'aset': hl.set(['foo', 'bar', 'baz']),
        'mset': hl.null(hl.tset(hl.tfloat64)),
        'd': hl.dict({
            hl.array(['a', 'b']): 0.5,
            hl.array(['x', hl.null(hl.tstr), 'z']): 0.3,
        }),
        'md': hl.null(hl.tdict(hl.tint32, hl.tstr)),
        'h38': hl.locus('chr22', 33878978, 'GRCh38'),
        'ml': hl.null(hl.tlocus('GRCh37')),
        'i': hl.interval(hl.locus('1', 999), hl.locus('1', 1001)),
        'c': hl.call(0, 1),
        'mc': hl.null(hl.tcall),
        't': hl.tuple([hl.call(1, 2, phased=True), 'foo', hl.null(hl.tstr)]),
        'mt': hl.null(hl.ttuple(hl.tlocus('GRCh37'), hl.tbool)),
    }
    all_values = hl.struct(**value_fields)

    def prefix(s, p):
        # Same values, with every field name prefixed by `p`.
        return hl.struct(**{p + k: s[k] for k in s})

    table = hl.utils.range_table(5, n_partitions=3)
    table = table.annotate_globals(**prefix(all_values, 'global_'))
    table = table.annotate(**all_values)
    all_values_table = table.cache()

    matrix_table = hl.utils.range_matrix_table(3, 2, n_partitions=2)
    matrix_table = matrix_table.annotate_globals(**prefix(all_values, 'global_'))
    matrix_table = matrix_table.annotate_rows(**prefix(all_values, 'row_'))
    matrix_table = matrix_table.annotate_cols(**prefix(all_values, 'col_'))
    matrix_table = matrix_table.annotate_entries(**prefix(all_values, 'entry_'))
    all_values_matrix_table = matrix_table.cache()

    return all_values_table, all_values_matrix_table
def create_all_values_datasets():
    """Build a cached Table and a cached MatrixTable annotated with values of many Hail types.

    The same struct of values is attached as globals and row fields of the
    table, and as globals, row, column and entry fields of the matrix table
    (each under a distinguishing prefix).

    :return: Tuple ``(all_values_table, all_values_matrix_table)``
    """
    # Insertion order is the struct's field order, so entries stay in this order.
    value_fields = {
        'f32': hl.float32(3.14),
        'i64': hl.int64(-9),
        'm': hl.null(hl.tfloat64),
        'astruct': hl.struct(a=hl.null(hl.tint32), b=5.5),
        'mstruct': hl.null(hl.tstruct(x=hl.tint32, y=hl.tstr)),
        'aset': hl.set(['foo', 'bar', 'baz']),
        'mset': hl.null(hl.tset(hl.tfloat64)),
        'd': hl.dict({
            hl.array(['a', 'b']): 0.5,
            hl.array(['x', hl.null(hl.tstr), 'z']): 0.3,
        }),
        'md': hl.null(hl.tdict(hl.tint32, hl.tstr)),
        'h38': hl.locus('chr22', 33878978, 'GRCh38'),
        'ml': hl.null(hl.tlocus('GRCh37')),
        'i': hl.interval(hl.locus('1', 999), hl.locus('1', 1001)),
        'c': hl.call(0, 1),
        'mc': hl.null(hl.tcall),
        't': hl.tuple([hl.call(1, 2, phased=True), 'foo', hl.null(hl.tstr)]),
        'mt': hl.null(hl.ttuple(hl.tlocus('GRCh37'), hl.tbool)),
    }
    all_values = hl.struct(**value_fields)

    def prefix(s, p):
        # Same values, with every field name prefixed by `p`.
        return hl.struct(**{p + k: s[k] for k in s})

    table = hl.utils.range_table(5, n_partitions=3)
    table = table.annotate_globals(**prefix(all_values, 'global_'))
    table = table.annotate(**all_values)
    all_values_table = table.cache()

    matrix_table = hl.utils.range_matrix_table(3, 2, n_partitions=2)
    matrix_table = matrix_table.annotate_globals(**prefix(all_values, 'global_'))
    matrix_table = matrix_table.annotate_rows(**prefix(all_values, 'row_'))
    matrix_table = matrix_table.annotate_cols(**prefix(all_values, 'col_'))
    matrix_table = matrix_table.annotate_entries(**prefix(all_values, 'entry_'))
    all_values_matrix_table = matrix_table.cache()

    return all_values_table, all_values_matrix_table
def phase_parent_call(call: hl.expr.CallExpression, transmitted_allele_index: int):
    """
    Return the phased parent genotype, with the transmitted allele listed first.

    :param CallExpression call: Parent genotype
    :param int transmitted_allele_index: index of transmitted allele (0 or 1)
    :return: Phased parent genotype
    :rtype: CallExpression
    """
    transmitted = call[transmitted_allele_index]
    # hl.int(index == 0) is 1 when the index is 0 and 0 otherwise, i.e. the
    # index of the allele that was not transmitted.
    untransmitted = call[hl.int(transmitted_allele_index == 0)]
    return hl.call(transmitted, untransmitted, phased=True)
def phase_haploid_proband_x_nonpar(
        proband_call: hl.expr.CallExpression,
        father_call: hl.expr.CallExpression,
        mother_call: hl.expr.CallExpression
) -> hl.expr.ArrayExpression:
    """
    Phase a trio for a haploid proband in the non-PAR region of X.

    The proband's single allele is looked up among the mother's two alleles;
    when no maternal allele matches, the whole result is missing.

    :param CallExpression proband_call: Input proband genotype call
    :param CallExpression father_call: Input father genotype call
    :param CallExpression mother_call: Input mother genotype call
    :return: Array containing: phased proband call, phased father call, phased mother call
    :rtype: ArrayExpression
    """
    proband_allele = proband_call[0]

    # (index, allele) pairs for the mother; `find` yields the first pair whose
    # allele equals the proband's allele.
    maternal_alleles = hl.zip_with_index(hl.array([mother_call[0], mother_call[1]]))
    transmitted_allele = maternal_alleles.find(lambda pair: pair[1] == proband_allele)

    phased_proband = hl.call(proband_allele, phased=True)
    # The father's call is only reported when he is haploid.
    phased_father = hl.or_missing(father_call.is_haploid(), hl.call(father_call[0], phased=True))
    phased_mother = phase_parent_call(mother_call, transmitted_allele[0])

    return hl.or_missing(
        hl.is_defined(transmitted_allele),
        hl.array([phased_proband, phased_father, phased_mother]),
    )
def make_entry_struct(e, alleles_len, has_non_ref, row):
    """Build the combiner entry struct for one GVCF entry.

    Standard FORMAT fields are rewritten to their local-allele counterparts
    (``AD`` -> ``LAD``, ``GT`` -> ``LGT``, ``PGT`` -> ``LPGT``, ``PL`` ->
    ``LPL`` plus ``RGQ``), ``END`` and ``gvcf_info`` are taken from the row's
    INFO, and every other entry field is passed through unchanged.

    :param e: Entry struct expression
    :param alleles_len: Number of alleles in the row (including ``<NON_REF>`` if present)
    :param has_non_ref: Whether the row's last allele is ``<NON_REF>``
    :param row: Row struct expression
    :raises hl.utils.FatalError: if INFO has no ``END`` field or FORMAT has no ``GT`` field
    """
    if 'END' not in row.info:
        raise hl.utils.FatalError(
            "the Hail GVCF combiner expects GVCFs to have an 'END' field in INFO."
        )
    if 'GT' not in e:
        raise hl.utils.FatalError(
            "the Hail GVCF combiner expects GVCFs to have a 'GT' field in FORMAT."
        )

    # Names that are produced or consumed here and must not be passed through as-is.
    handled_names = {
        'LA', 'gvcf_info', 'END', 'LAD', 'AD', 'LGT', 'GT', 'LPL', 'PL', 'LPGT', 'PGT'
    }

    n_local_alleles = alleles_len - hl.if_else(has_non_ref, 1, 0)
    handled_fields = {
        'LA': hl.range(0, n_local_alleles),
        'LGT': get_lgt(e, alleles_len, has_non_ref, row),
    }

    if 'AD' in e:
        handled_fields['LAD'] = hl.if_else(has_non_ref, e.AD[:-1], e.AD)
    if 'PGT' in e:
        handled_fields['LPGT'] = e.PGT
    if 'PL' in e:
        missing_pl = hl.missing(e.PL.dtype)
        # Trim the PL entries involving <NON_REF>; missing when no alternate allele remains.
        handled_fields['LPL'] = hl.if_else(
            has_non_ref,
            hl.if_else(alleles_len > 2, e.PL[:-alleles_len], missing_pl),
            hl.if_else(alleles_len > 1, e.PL, missing_pl),
        )
        # PL of the (ref, <NON_REF>) genotype.
        ref_non_ref_index = hl.call(0, alleles_len - 1).unphased_diploid_gt_index()
        handled_fields['RGQ'] = hl.if_else(
            has_non_ref,
            e.PL[ref_non_ref_index],
            hl.missing(e.PL.dtype.element_type),
        )

    handled_fields['END'] = row.info.END
    # INFO is kept only for rows without END.
    kept_info = parse_as_fields(row.info.select(*info_to_keep), has_non_ref)
    handled_fields['gvcf_info'] = (
        hl.case()
        .when(hl.is_missing(row.info.END), hl.struct(**(kept_info)))
        .or_missing()
    )

    pass_through_fields = {
        name: value for name, value in e.items() if name not in handled_names
    }
    return hl.struct(**handled_fields, **pass_through_fields)
def coalesce_join(ref, var):
    """Pick the variant entry when defined, otherwise a reference entry of the same shape.

    When ``var`` is missing, the result is built from ``ref`` with the call
    field (``GT`` if ``var`` has it, else ``LGT``) set to ``0/0`` and every
    variant-only field set to missing.
    """
    call_field = 'GT' if 'GT' in var else 'LGT'
    assert call_field in var, var.dtype

    shared_fields = [call_field]
    shared_fields.extend(f for f in ref.dtype if f in var.dtype)
    shared_field_set = set(shared_fields)
    var_fields = [f for f in var.dtype if f not in shared_field_set]

    from_var = var.select(*shared_fields, *var_fields)

    missing_var_fields = {f: hl.null(var[f].dtype) for f in var_fields}
    hom_ref = {call_field: hl.call(0, 0)}
    from_ref = ref.annotate(**hom_ref).select(*shared_fields, **missing_var_fields)

    return hl.if_else(hl.is_defined(var), from_var, from_ref)
def main(args):
    """
    Write gnomAD hardcall and non-ref-only MatrixTables, as selected by flags on ``args``.

    Each step is independent and runs only when its flag is set:
    ``write_hardcalls``, ``split_hardcalls``, ``write_nonrefs``,
    ``split_nonrefs``. Output paths come from ``get_gnomad_data_path`` and
    ``args.overwrite`` is passed to every write.

    :param args: Parsed command-line arguments (attributes read here: genomes,
        exomes, write_hardcalls, split_hardcalls, write_nonrefs,
        split_nonrefs, overwrite)
    """
    hl.init()

    data_type = 'genomes' if args.genomes else 'exomes'

    if args.write_hardcalls:
        mt = get_gnomad_data(data_type, split=False, raw=True, meta_root=None)
        ht = hl.read_table(qc_ht_path(data_type, 'hard_filters'))
        # Attach sex from the hard-filters table, then compute adj.
        mt = annotate_adj(
            mt.select_cols(sex=ht[hl.literal(data_type), mt.s].sex))
        # Use PGT as the genotype when its alleles, taken as an unphased call,
        # equal GT; otherwise (including when the comparison is missing) keep GT.
        mt = mt.select_entries(GT=hl.case(missing_false=True).when(
            hl.call(mt.PGT[0], mt.PGT[1]) == mt.GT, mt.PGT).default(mt.GT),
                               PID=mt.PID,
                               adj=mt.adj)
        mt = adjust_sex_ploidy(mt, mt.sex)
        mt = mt.select_cols().naive_coalesce(10000)
        mt.write(get_gnomad_data_path(data_type, hardcalls=True, split=False),
                 args.overwrite)

    if args.split_hardcalls:
        mt = get_gnomad_data(data_type, split=False, meta_root=None)
        mt = hl.split_multi_hts(mt)
        mt.write(get_gnomad_data_path(data_type, hardcalls=True, split=True),
                 args.overwrite)

    if args.write_nonrefs:  # CPU-hours: 600 (E)
        mt = get_gnomad_data(data_type, split=False, raw=True,
                             meta_root=None).select_cols()
        # Record missingness explicitly so missing entries survive the filter below.
        mt = mt.annotate_entries(is_missing=hl.is_missing(mt.GT))
        mt = mt.filter_entries(mt.is_missing | mt.GT.is_non_ref())
        mt = annotate_adj(mt)
        if args.exomes:
            mt = mt.naive_coalesce(10000)
        mt.write(
            get_gnomad_data_path(data_type, split=False, non_refs_only=True),
            args.overwrite)

    if args.split_nonrefs:  # CPU-hours: 300 (E)
        mt = get_gnomad_data(data_type, split=False, non_refs_only=True)
        mt = hl.split_multi_hts(mt)
        # Re-apply the filter after splitting multi-allelics.
        mt = mt.filter_entries(mt.is_missing | mt.GT.is_non_ref())
        mt.write(
            get_gnomad_data_path(data_type, split=True, non_refs_only=True),
            args.overwrite)
def lgt_to_gt(lgt, la):
    """Transform LGT into GT using local alleles array.

    Parameters
    ----------
    lgt : :class:`.CallExpression`
        LGT value.
    la : :class:`.ArrayExpression`
        Local alleles array.

    Returns
    -------
    :class:`.CallExpression`

    Notes
    -----
    Diploid and haploid calls are supported, and the phasing of `lgt` is
    carried over to the result. A call that is neither diploid nor haploid is
    returned unchanged, and a missing `lgt` yields a missing result.
    """
    def remap(call):
        # Each local allele index is translated through `la`. Branching on
        # ploidy avoids reading a second allele from a haploid call.
        return (
            hl.case()
            .when(call.is_diploid(), hl.call(la[call[0]], la[call[1]], phased=call.phased))
            .when(call.is_haploid(), hl.call(la[call[0]], phased=call.phased))
            .default(call)
        )

    # Bind once so the (possibly expensive) `lgt` expression is not duplicated.
    return hl.rbind(lgt, remap)
def lgt_to_gt(lgt, la):
    """Transform Local GT and Local Alleles into the true GT.

    Parameters
    ----------
    lgt : :class:`.CallExpression`
        The LGT value
    la : :class:`.ArrayExpression`
        The Local Alleles array

    Returns
    -------
    :class:`.CallExpression`

    Notes
    -----
    Diploid and haploid calls are supported, and the phasing of `lgt` is
    carried over to the result. A call that is neither diploid nor haploid is
    returned unchanged, and a missing `lgt` yields a missing result.
    """
    def remap(call):
        # Each local allele index is translated through `la`. Branching on
        # ploidy avoids reading a second allele from a haploid call.
        return (
            hl.case()
            .when(call.is_diploid(), hl.call(la[call[0]], la[call[1]], phased=call.phased))
            .when(call.is_haploid(), hl.call(la[call[0]], phased=call.phased))
            .default(call)
        )

    # Bind once so the (possibly expensive) `lgt` expression is not duplicated.
    return hl.rbind(lgt, remap)
def test_agg_call_stats(self):
    """Aggregate ``call_stats`` over diploid, phased, haploid and zero-ploidy calls."""
    t = hl.Table.parallelize([
        hl.struct(c=hl.call(0, 0)),
        hl.struct(c=hl.call(0, 1)),
        hl.struct(c=hl.call(0, 2, phased=True)),
        hl.struct(c=hl.call(1)),
        hl.struct(c=hl.call(0)),
        hl.struct(c=hl.call())
    ])
    actual = t.aggregate(hl.agg.call_stats(t.c, ['A', 'T', 'G']))
    expected = hl.struct(AC=[5, 2, 1],
                         AF=[5.0 / 8.0, 2.0 / 8.0, 1.0 / 8.0],
                         AN=8,
                         homozygote_count=[1, 0, 0])

    # assertTrue(a, b) would treat `b` as the failure message and pass for any
    # truthy `a`, so the two single-row tables are compared explicitly.
    self.assertTrue(
        hl.Table.parallelize([actual])._same(hl.Table.parallelize([expected])))
def test_agg_call_stats(self):
    """Aggregate ``call_stats`` over diploid, phased, haploid and zero-ploidy calls."""
    t = hl.Table.parallelize([
        hl.struct(c=hl.call(0, 0)),
        hl.struct(c=hl.call(0, 1)),
        hl.struct(c=hl.call(0, 2, phased=True)),
        hl.struct(c=hl.call(1)),
        hl.struct(c=hl.call(0)),
        hl.struct(c=hl.call())
    ])
    actual = t.aggregate(hl.agg.call_stats(t.c, ['A', 'T', 'G']))
    expected = hl.struct(AC=[5, 2, 1],
                         AF=[5.0 / 8.0, 2.0 / 8.0, 1.0 / 8.0],
                         AN=8,
                         homozygote_count=[1, 0, 0])

    # assertTrue(a, b) would treat `b` as the failure message and pass for any
    # truthy `a`, so the two single-row tables are compared explicitly.
    self.assertTrue(
        hl.Table.parallelize([actual])._same(hl.Table.parallelize([expected])))
def test_lgt_to_gt_invalid():
    """Map an unphased hom-alt local call through a local alleles array with a large allele index."""
    local_alleles = [0, 17495]
    unphased_hom_alt = hl.call(1, 1)
    # Constructed as in the original test but not asserted on.
    phased_hom_alt = hl.call(1, 1, phased=True)

    remapped = hl.eval(hl.vds.lgt_to_gt(unphased_hom_alt, local_alleles))
    assert remapped == hl.Call([17495, 17495])
def main(args):
    """Generate the frequency Table from the sparse gnomAD v3 MatrixTable.

    Steps, in order: load the sparse MT and sample metadata, optionally
    restrict to a chr20 test interval, split multi-allelics, keep release
    samples, compute sex-adjusted genotypes and ``adj``, densify, apply the
    hom-alt depletion hotfix, then annotate inbreeding coefficient,
    frequencies, FAF, popmax and quality histograms before writing the rows.

    :param args: Parsed command-line namespace; reads ``test`` and
        ``overwrite``.
    """
    hl.init(log='/frequency_data_generation.log', default_reference='GRCh38')

    logger.info("Reading sparse MT and metadata table...")
    mt = get_gnomad_v3_mt(key_by_locus_and_alleles=True)
    meta_fields = ('pop', 'sex', 'project_id', 'release', 'sample_filters')
    meta_ht = meta.ht().select(*meta_fields)

    if args.test:
        logger.info("Filtering to chr20:1-1000000")
        test_interval = hl.parse_locus_interval('chr20:1-1000000')
        mt = hl.filter_intervals(mt, [test_interval])

    mt = hl.experimental.sparse_split_multi(mt, filter_changed_loci=True)

    logger.info("Annotating sparse MT with metadata...")
    mt = mt.annotate_cols(meta=meta_ht[mt.s])
    mt = mt.filter_cols(mt.meta.release)
    n_samples = mt.count_cols()
    logger.info(f"Running frequency table prep and generation pipeline on {n_samples} samples")

    logger.info("Computing adj and sex adjusted genotypes.")
    # Both expressions are built from the pre-annotation GT.
    sex_adjusted_gt = adjusted_sex_ploidy_expr(mt.locus, mt.GT, mt.meta.sex)
    adj_flag = get_adj_expr(mt.GT, mt.GQ, mt.DP, mt.AD)
    mt = mt.annotate_entries(GT=sex_adjusted_gt, adj=adj_flag)

    logger.info("Densify-ing...")
    mt = hl.experimental.densify(mt)
    mt = mt.filter_rows(hl.len(mt.alleles) > 1)

    logger.info("Setting het genotypes at sites with >1% AF (using v3.0 frequencies) and > 0.9 AB to homalt...")
    # hotfix for depletion of homozygous alternate genotypes
    # Using v3.0 AF to avoid an extra frequency calculation
    # TODO: Using previous callset AF works for small incremental changes to a callset, but we need to revisit for large increments
    freq_ht = freq.versions["3"].ht()
    freq_ht = freq_ht.select(AF=freq_ht.freq[0].AF)

    # NOTE(review): hl.cond is called without missing_false, so a missing
    # condition (e.g. a variant absent from the v3.0 table) presumably yields
    # a missing GT rather than the original one -- confirm this is intended.
    is_common = freq_ht[mt.row_key].AF > 0.01
    high_allele_balance = mt.AD[1] / mt.DP > 0.9
    should_be_hom_alt = is_common & mt.GT.is_het() & high_allele_balance
    mt = mt.annotate_entries(GT=hl.cond(should_be_hom_alt, hl.call(1, 1), mt.GT))

    logger.info("Calculating InbreedingCoefficient...")
    # NOTE: This is not the ideal location to calculate this, but added here to avoid another densify
    inbreeding = bi_allelic_site_inbreeding_expr(mt.GT)
    mt = mt.annotate_rows(InbreedingCoeff=inbreeding)

    logger.info("Generating frequency data...")
    mt = annotate_freq(mt, sex_expr=mt.meta.sex, pop_expr=mt.meta.pop)

    # Select freq, FAF and popmax
    faf, faf_meta = faf_expr(mt.freq, mt.freq_meta, mt.locus, POPS_TO_REMOVE_FOR_POPMAX)
    popmax = pop_max_expr(mt.freq, mt.freq_meta, POPS_TO_REMOVE_FOR_POPMAX)
    mt = mt.select_rows('InbreedingCoeff', 'freq', faf=faf, popmax=popmax)
    mt = mt.annotate_globals(faf_meta=faf_meta)

    # Annotate quality metrics histograms, as these also require densifying
    qual_hists = qual_hist_expr(mt.GT, mt.GQ, mt.DP, mt.AD)
    mt = mt.annotate_rows(**qual_hists)

    logger.info("Writing out frequency data...")
    if not args.test:
        mt.rows().write(freq.path, overwrite=args.overwrite)
    else:
        # Test runs always overwrite the temporary chr20 output.
        mt.rows().write("gs://gnomad-tmp/gnomad_freq/chr20_1_1000000_freq.ht", overwrite=True)
def test_export_plink_exprs(self):
    """Exercise the expression arguments of ``hl.export_plink``.

    Covers default arguments, non-default FAM fields, a quantitative
    phenotype, non-default BIM fields, a custom call expression, and the
    errors raised when FAM ids or variant ids contain white space.
    """
    ds = get_dataset()

    fam_columns = ('fam_id', 'ind_id', 'pat_id', 'mat_id', 'is_female', 'pheno')
    bim_columns = ('contig', 'varid', 'cm_position', 'position', 'a1', 'a2')
    # import_table names headerless columns f0, f1, ...; map them to PLINK names.
    fam_mapping = {f'f{i}': name for i, name in enumerate(fam_columns)}
    bim_mapping = {f'f{i}': name for i, name in enumerate(bim_columns)}

    def load_fam(prefix):
        raw = hl.import_table(prefix + '.fam', no_header=True, impute=False, missing="")
        return raw.rename(fam_mapping)

    def load_bim(prefix):
        raw = hl.import_table(prefix + '.bim', no_header=True, impute=False)
        return raw.rename(bim_mapping)

    # Test default arguments
    out1 = new_temp_file()
    hl.export_plink(ds, out1)
    fam1 = load_fam(out1)
    bim1 = load_bim(out1)

    fam1_defaults = ((fam1.fam_id == "0")
                     & (fam1.pat_id == "0")
                     & (fam1.mat_id == "0")
                     & (fam1.is_female == "0")
                     & (fam1.pheno == "NA"))
    self.assertTrue(fam1.all(fam1_defaults))

    default_varid = bim1.contig + ":" + bim1.position + ":" + bim1.a2 + ":" + bim1.a1
    bim1_defaults = (bim1.varid == default_varid) & (bim1.cm_position == "0.0")
    self.assertTrue(bim1.all(bim1_defaults))

    # Test non-default FAM arguments
    out2 = new_temp_file()
    hl.export_plink(ds, out2, ind_id=ds.s, fam_id=ds.s, pat_id="nope",
                    mat_id="nada", is_female=True, pheno=False)
    fam2 = load_fam(out2)

    fam2_custom = ((fam2.fam_id == fam2.ind_id)
                   & (fam2.pat_id == "nope")
                   & (fam2.mat_id == "nada")
                   & (fam2.is_female == "2")
                   & (fam2.pheno == "1"))
    self.assertTrue(fam2.all(fam2_custom))

    # Test quantitative phenotype
    out3 = new_temp_file()
    hl.export_plink(ds, out3, ind_id=ds.s, pheno=hl.float64(hl.len(ds.s)))
    fam3 = load_fam(out3)

    fam3_quantitative = ((fam3.fam_id == "0")
                         & (fam3.pat_id == "0")
                         & (fam3.mat_id == "0")
                         & (fam3.is_female == "0")
                         & (fam3.pheno != "0")
                         & (fam3.pheno != "NA"))
    self.assertTrue(fam3.all(fam3_quantitative))

    # Test non-default BIM arguments
    out4 = new_temp_file()
    hl.export_plink(ds, out4, varid="hello", cm_position=100)
    bim4 = load_bim(out4)

    bim4_custom = (bim4.varid == "hello") & (bim4.cm_position == "100.0")
    self.assertTrue(bim4.all(bim4_custom))

    # Test call expr
    out5 = new_temp_file()
    ds_call = ds.annotate_entries(gt_fake=hl.call(0, 0))
    hl.export_plink(ds_call, out5, call=ds_call.gt_fake)
    ds_all_hom_ref = hl.import_plink(out5 + '.bed', out5 + '.bim', out5 + '.fam')
    not_hom_ref = ~ds_all_hom_ref.GT.is_hom_ref()
    nerrors = ds_all_hom_ref.aggregate_entries(hl.agg.count_where(not_hom_ref))
    self.assertTrue(nerrors == 0)

    # Test white-space in FAM id expr raises error
    with self.assertRaisesRegex(TypeError, "has spaces in the following values:"):
        hl.export_plink(ds, new_temp_file(), mat_id="hello world")

    # Test white-space in varid expr raises error
    with self.assertRaisesRegex(FatalError, "no white space allowed:"):
        hl.export_plink(ds, new_temp_file(), varid="hello world")
def generate_datasets(doctest_namespace):
    """Populate ``doctest_namespace`` with the datasets used by doctests.

    Imports the example VCF / TSV / BGEN files under ``data/``, annotates
    them, checkpoints some results under ``output/`` and stores the resulting
    objects (plus assorted literal expressions) in ``doctest_namespace`` under
    the names the doctests refer to.

    Statement order is kept as-is because several annotations use random
    functions and later entries reuse earlier tables.

    :param doctest_namespace: Mutable mapping that receives the fixtures.
    """
    doctest_namespace['hl'] = hl
    doctest_namespace['np'] = np

    ds = hl.import_vcf('data/sample.vcf.bgz')
    ds = ds.sample_rows(0.03)
    ds = ds.annotate_rows(use_as_marker=hl.rand_bool(0.5),
                          panel_maf=0.1,
                          anno1=5,
                          anno2=0,
                          consequence="LOF",
                          gene="A",
                          score=5.0)
    ds = ds.annotate_rows(a_index=1)
    ds = hl.sample_qc(hl.variant_qc(ds))
    ds = ds.annotate_cols(is_case=True,
                          pheno=hl.struct(is_case=hl.rand_bool(0.5),
                                          is_female=hl.rand_bool(0.5),
                                          age=hl.rand_norm(65, 10),
                                          height=hl.rand_norm(70, 10),
                                          blood_pressure=hl.rand_norm(120, 20),
                                          cohort_name="cohort1"),
                          cov=hl.struct(PC1=hl.rand_norm(0, 1)),
                          cov1=hl.rand_norm(0, 1),
                          cov2=hl.rand_norm(0, 1),
                          cohort="SIGMA")
    ds = ds.annotate_globals(
        global_field_1=5,
        global_field_2=10,
        pli={
            'SCN1A': 0.999,
            'SONIC': 0.014
        },
        populations=['AFR', 'EAS', 'EUR', 'SAS', 'AMR', 'HIS'])
    # This replaces the string-valued ``gene`` row field set above.
    ds = ds.annotate_rows(gene=['TTN'])
    ds = ds.annotate_cols(cohorts=['1kg'], pop='EAS')
    # Plain string: the original f-string had no placeholders.
    ds = ds.checkpoint('output/example.mt', overwrite=True)

    doctest_namespace['ds'] = ds
    doctest_namespace['dataset'] = ds
    doctest_namespace['dataset2'] = ds.annotate_globals(global_field=5)
    doctest_namespace['dataset_to_union_1'] = ds
    doctest_namespace['dataset_to_union_2'] = ds

    v_metadata = ds.rows().annotate_globals(global_field=5).annotate(
        consequence='SYN')
    doctest_namespace['v_metadata'] = v_metadata

    s_metadata = ds.cols().annotate(pop='AMR', is_case=False, sex='F')
    doctest_namespace['s_metadata'] = s_metadata
    doctest_namespace['cols_to_keep'] = s_metadata
    doctest_namespace['cols_to_remove'] = s_metadata
    doctest_namespace['rows_to_keep'] = v_metadata
    doctest_namespace['rows_to_remove'] = v_metadata

    small_mt = hl.balding_nichols_model(3, 4, 4)
    doctest_namespace['small_mt'] = small_mt.checkpoint('output/small.mt',
                                                        overwrite=True)

    # Table
    table1 = hl.import_table('data/kt_example1.tsv', impute=True, key='ID')
    table1 = table1.annotate_globals(global_field_1=5, global_field_2=10)
    doctest_namespace['table1'] = table1
    doctest_namespace['other_table'] = table1

    table2 = hl.import_table('data/kt_example2.tsv', impute=True, key='ID')
    doctest_namespace['table2'] = table2

    table4 = hl.import_table('data/kt_example4.tsv',
                             impute=True,
                             types={
                                 'B': hl.tstruct(B0=hl.tbool, B1=hl.tstr),
                                 'D': hl.tstruct(cat=hl.tint32, dog=hl.tint32),
                                 'E': hl.tstruct(A=hl.tint32, B=hl.tint32)
                             })
    doctest_namespace['table4'] = table4

    people_table = hl.import_table('data/explode_example.tsv',
                                   delimiter='\\s+',
                                   types={
                                       'Age': hl.tint32,
                                       'Children': hl.tarray(hl.tstr)
                                   },
                                   key='Name')
    doctest_namespace['people_table'] = people_table

    # TDT
    doctest_namespace['tdt_dataset'] = hl.import_vcf('data/tdt_tiny.vcf')

    ds2 = hl.variant_qc(ds)
    doctest_namespace['ds2'] = ds2.select_rows(AF=ds2.variant_qc.AF)

    # Expressions
    doctest_namespace['names'] = hl.literal(['Alice', 'Bob', 'Charlie'])
    doctest_namespace['a1'] = hl.literal([0, 1, 2, 3, 4, 5])
    doctest_namespace['a2'] = hl.literal([1, -1, 1, -1, 1, -1])
    doctest_namespace['t'] = hl.literal(True)
    doctest_namespace['f'] = hl.literal(False)
    doctest_namespace['na'] = hl.null(hl.tbool)
    doctest_namespace['call'] = hl.call(0, 1, phased=False)
    doctest_namespace['a'] = hl.literal([1, 2, 3, 4, 5])
    doctest_namespace['d'] = hl.literal({
        'Alice': 43,
        'Bob': 33,
        'Charles': 44
    })
    doctest_namespace['interval'] = hl.interval(3, 11)
    doctest_namespace['locus_interval'] = hl.parse_locus_interval(
        "1:53242-90543")
    doctest_namespace['locus'] = hl.locus('1', 1034245)
    doctest_namespace['x'] = hl.literal(3)
    doctest_namespace['y'] = hl.literal(4.5)
    doctest_namespace['s1'] = hl.literal({1, 2, 3})
    doctest_namespace['s2'] = hl.literal({1, 3, 5})
    doctest_namespace['s3'] = hl.literal({'Alice', 'Bob', 'Charlie'})
    doctest_namespace['struct'] = hl.struct(a=5, b='Foo')
    doctest_namespace['tup'] = hl.literal(("a", 1, [1, 2, 3]))
    doctest_namespace['s'] = hl.literal('The quick brown fox')
    doctest_namespace['interval2'] = hl.Interval(3, 6)
    doctest_namespace['nd'] = hl._nd.array([[1, 2], [3, 4]])

    # Overview
    doctest_namespace['ht'] = hl.import_table("data/kt_example1.tsv",
                                              impute=True)
    doctest_namespace['mt'] = ds

    gnomad_data = ds.rows()
    doctest_namespace['gnomad_data'] = gnomad_data.select(gnomad_data.info.AF)

    # BGEN
    bgen = hl.import_bgen('data/example.8bits.bgen',
                          entry_fields=['GT', 'GP', 'dosage'])
    doctest_namespace['variants_table'] = bgen.rows()

    burden_ds = hl.import_vcf('data/example_burden.vcf')
    burden_kt = hl.import_table('data/example_burden.tsv',
                                key='Sample',
                                impute=True)
    burden_ds = burden_ds.annotate_cols(burden=burden_kt[burden_ds.s])
    burden_ds = burden_ds.annotate_rows(
        weight=hl.float64(burden_ds.locus.position))
    burden_ds = hl.variant_qc(burden_ds)
    genekt = hl.import_locus_intervals('data/gene.interval_list')
    burden_ds = burden_ds.annotate_rows(gene=genekt[burden_ds.locus])
    # Plain string: the original f-string had no placeholders.
    burden_ds = burden_ds.checkpoint('output/example_burden.vds',
                                     overwrite=True)
    doctest_namespace['burden_ds'] = burden_ds

    ld_score_one_pheno_sumstats = hl.import_table(
        'data/ld_score_regression.one_pheno.sumstats.tsv',
        types={
            'locus': hl.tlocus('GRCh37'),
            'alleles': hl.tarray(hl.tstr),
            'chi_squared': hl.tfloat64,
            'n': hl.tint32,
            'ld_score': hl.tfloat64,
            'phenotype': hl.tstr,
            'chi_squared_50_irnt': hl.tfloat64,
            'n_50_irnt': hl.tint32,
            'chi_squared_20160': hl.tfloat64,
            'n_20160': hl.tint32
        },
        key=['locus', 'alleles'])
    doctest_namespace[
        'ld_score_one_pheno_sumstats'] = ld_score_one_pheno_sumstats

    mt = hl.import_matrix_table(
        'data/ld_score_regression.all_phenos.sumstats.tsv',
        row_fields={
            'locus': hl.tstr,
            'alleles': hl.tstr,
            'ld_score': hl.tfloat64
        },
        entry_type=hl.tstr)
    mt = mt.key_cols_by(phenotype=mt.col_id)
    mt = mt.key_rows_by(locus=hl.parse_locus(mt.locus),
                        alleles=mt.alleles.split(','))
    mt = mt.drop('row_id', 'col_id')
    # Each entry is a comma-separated string; the first part becomes
    # chi_squared and the second becomes n.
    mt = mt.annotate_entries(x=mt.x.split(","))
    mt = mt.transmute_entries(chi_squared=hl.float64(mt.x[0]),
                              n=hl.int32(mt.x[1]))
    mt = mt.annotate_rows(ld_score=hl.float64(mt.ld_score))
    doctest_namespace['ld_score_all_phenos_sumstats'] = mt

    print("finished setting up doctest...")
def init(doctest_namespace):
    """Set up the doctest namespace from inside ``docs/``, then restore the cwd.

    This is a generator: everything before ``yield`` is setup, and the working
    directory is restored afterwards.  The restore sits in a ``finally`` so a
    failure during setup cannot leave the process stuck in ``docs/``.

    :param doctest_namespace: Mutable mapping that receives the fixtures.
    """
    # This gets run once per process -- must avoid race conditions
    print("setting up doctest...")

    olddir = os.getcwd()
    os.chdir("docs/")

    try:
        doctest_namespace['hl'] = hl
        doctest_namespace['agg'] = agg

        if not os.path.isdir("output/"):
            try:
                os.mkdir("output/")
            except OSError:
                # Best effort: another process may have created it meanwhile.
                pass

        # Remove leftover outputs from earlier runs.
        files = ["sample.vds", "sample.qc.vds", "sample.filtered.vds"]
        for f in files:
            if os.path.isdir(f):
                shutil.rmtree(f)

        ds = hl.read_matrix_table('data/example.vds')
        doctest_namespace['ds'] = ds
        doctest_namespace['dataset'] = ds
        doctest_namespace['dataset2'] = ds.annotate_globals(global_field=5)
        doctest_namespace['dataset_to_union_1'] = ds
        doctest_namespace['dataset_to_union_2'] = ds

        v_metadata = ds.rows().annotate_globals(global_field=5).annotate(consequence='SYN')
        doctest_namespace['v_metadata'] = v_metadata

        s_metadata = ds.cols().annotate(pop='AMR', is_case=False, sex='F')
        doctest_namespace['s_metadata'] = s_metadata

        # Table
        table1 = hl.import_table('data/kt_example1.tsv', impute=True, key='ID')
        table1 = table1.annotate_globals(global_field_1=5, global_field_2=10)
        doctest_namespace['table1'] = table1
        doctest_namespace['other_table'] = table1

        table2 = hl.import_table('data/kt_example2.tsv', impute=True, key='ID')
        doctest_namespace['table2'] = table2

        table4 = hl.import_table('data/kt_example4.tsv', impute=True,
                                 types={'B': hl.tstruct(B0=hl.tbool, B1=hl.tstr),
                                        'D': hl.tstruct(cat=hl.tint32, dog=hl.tint32),
                                        'E': hl.tstruct(A=hl.tint32, B=hl.tint32)})
        doctest_namespace['table4'] = table4

        people_table = hl.import_table('data/explode_example.tsv', delimiter='\\s+',
                                       types={'Age': hl.tint32, 'Children': hl.tarray(hl.tstr)},
                                       key='Name')
        doctest_namespace['people_table'] = people_table

        # TDT
        doctest_namespace['tdt_dataset'] = hl.import_vcf('data/tdt_tiny.vcf')

        ds2 = hl.variant_qc(ds)
        doctest_namespace['ds2'] = ds2.select_rows(AF=ds2.variant_qc.AF)

        # Expressions
        doctest_namespace['names'] = hl.literal(['Alice', 'Bob', 'Charlie'])
        doctest_namespace['a1'] = hl.literal([0, 1, 2, 3, 4, 5])
        doctest_namespace['a2'] = hl.literal([1, -1, 1, -1, 1, -1])
        doctest_namespace['t'] = hl.literal(True)
        doctest_namespace['f'] = hl.literal(False)
        doctest_namespace['na'] = hl.null(hl.tbool)
        doctest_namespace['call'] = hl.call(0, 1, phased=False)
        doctest_namespace['a'] = hl.literal([1, 2, 3, 4, 5])
        doctest_namespace['d'] = hl.literal({'Alice': 43, 'Bob': 33, 'Charles': 44})
        doctest_namespace['interval'] = hl.interval(3, 11)
        doctest_namespace['locus_interval'] = hl.parse_locus_interval("1:53242-90543")
        doctest_namespace['locus'] = hl.locus('1', 1034245)
        doctest_namespace['x'] = hl.literal(3)
        doctest_namespace['y'] = hl.literal(4.5)
        doctest_namespace['s1'] = hl.literal({1, 2, 3})
        doctest_namespace['s2'] = hl.literal({1, 3, 5})
        doctest_namespace['s3'] = hl.literal({'Alice', 'Bob', 'Charlie'})
        doctest_namespace['struct'] = hl.struct(a=5, b='Foo')
        doctest_namespace['tup'] = hl.literal(("a", 1, [1, 2, 3]))
        doctest_namespace['s'] = hl.literal('The quick brown fox')
        doctest_namespace['interval2'] = hl.Interval(3, 6)

        # Overview
        doctest_namespace['ht'] = hl.import_table("data/kt_example1.tsv", impute=True)
        doctest_namespace['mt'] = ds

        gnomad_data = ds.rows()
        doctest_namespace['gnomad_data'] = gnomad_data.select(gnomad_data.info.AF)

        # BGEN
        bgen = hl.import_bgen('data/example.8bits.bgen',
                              entry_fields=['GT', 'GP', 'dosage'])
        doctest_namespace['variants_table'] = bgen.rows()

        print("finished setting up doctest...")
        yield
    finally:
        # Always return to the original directory, even if setup failed.
        os.chdir(olddir)
def lgt_to_gt(lgt, la):
    """Translate a local GT into the true GT via the local alleles array.

    Each of the two allele indices of ``lgt`` is looked up in ``la`` and the
    results are combined into a new call.
    """
    translated = [la[lgt[position]] for position in (0, 1)]
    return hl.call(translated[0], translated[1])
def transform_one(mt, vardp_outlier=100_000) -> Table:
    """transforms a gvcf into a form suitable for combining

    The input to this should be some result of either :func:`.import_vcf` or
    :func:`.import_vcfs` with `array_elements_required=False`.

    There is a strong assumption that this function will be called on a matrix
    table with one column.

    The row-transforming function is compiled once and cached.  The cache key
    includes `vardp_outlier` as well as the row type, because the threshold is
    baked into the compiled function; keying on the row type alone would reuse
    a stale threshold when the same schema is transformed with a different
    `vardp_outlier`.

    :param mt: Input matrix table; it is passed through ``localize`` first.
    :param vardp_outlier: When ``info.VarDP`` exceeds this value, ``info.DP``
        is stored as ``gvcf_info.VarDP`` instead.
    :return: Table whose rows are the transformed rows.
    """
    mt = localize(mt)
    cache_key = (mt.row.dtype, vardp_outlier)
    if cache_key not in _transform_rows_function_map:
        f = hl.experimental.define_function(
            lambda row: hl.rbind(
                hl.len(row.alleles), '<NON_REF>' == row.alleles[-1],
                lambda alleles_len, has_non_ref: hl.struct(
                    locus=row.locus,
                    # Drop the trailing <NON_REF> allele when present.
                    alleles=hl.cond(has_non_ref, row.alleles[:-1], row.alleles),
                    rsid=row.rsid,
                    __entries=row.__entries.map(
                        lambda e:
                        hl.struct(
                            DP=e.DP,
                            END=row.info.END,
                            GQ=e.GQ,
                            LA=hl.range(0, alleles_len - hl.cond(has_non_ref, 1, 0)),
                            LAD=hl.cond(has_non_ref, e.AD[:-1], e.AD),
                            LGT=e.GT,
                            LPGT=e.PGT,
                            LPL=hl.cond(has_non_ref,
                                        hl.cond(alleles_len > 2,
                                                e.PL[:-alleles_len],
                                                hl.null(e.PL.dtype)),
                                        hl.cond(alleles_len > 1,
                                                e.PL,
                                                hl.null(e.PL.dtype))),
                            MIN_DP=e.MIN_DP,
                            PID=e.PID,
                            # PL entry for the genotype (0, last allele).
                            RGQ=hl.cond(
                                has_non_ref,
                                e.PL[hl.call(0, alleles_len - 1).unphased_diploid_gt_index()],
                                hl.null(e.PL.dtype.element_type)),
                            SB=e.SB,
                            # Info fields are kept only where info.END is missing.
                            gvcf_info=hl.case()
                            .when(hl.is_missing(row.info.END),
                                  hl.struct(
                                      ClippingRankSum=row.info.ClippingRankSum,
                                      BaseQRankSum=row.info.BaseQRankSum,
                                      MQ=row.info.MQ,
                                      MQRankSum=row.info.MQRankSum,
                                      MQ_DP=row.info.MQ_DP,
                                      QUALapprox=row.info.QUALapprox,
                                      RAW_MQ=row.info.RAW_MQ,
                                      ReadPosRankSum=row.info.ReadPosRankSum,
                                      VarDP=hl.cond(row.info.VarDP > vardp_outlier,
                                                    row.info.DP, row.info.VarDP)))
                            .or_missing()
                        ))),
            ),
            mt.row.dtype)
        _transform_rows_function_map[cache_key] = f
    transform_row = _transform_rows_function_map[cache_key]
    return Table(TableMapRows(mt._tir, Apply(transform_row._name, TopLevelReference('row'))))