def load_cmg(cmg_csv: str) -> hl.Table:
    cmg_ht = hl.import_table(cmg_csv, impute=True, delimiter=",", quote='"')

    cmg_ht = cmg_ht.transmute(
        locus1_b38=hl.locus("chr" + hl.str(cmg_ht.chrom_1), cmg_ht.pos_1, reference_genome='GRCh38'),
        alleles1_b38=[cmg_ht.ref_1, cmg_ht.alt_1],
        locus2_b38=hl.locus("chr" + hl.str(cmg_ht.chrom_2), cmg_ht.pos_2, reference_genome='GRCh38'),
        alleles2_b38=[cmg_ht.ref_2, cmg_ht.alt_2]
    )

    liftover_references = get_liftover_genome(cmg_ht.rename({'locus1_b38': 'locus'}))

    lifted_over_variants = hl.sorted(
        hl.array([
            liftover_expr(cmg_ht.locus1_b38, cmg_ht.alleles1_b38, liftover_references[1]),
            liftover_expr(cmg_ht.locus2_b38, cmg_ht.alleles2_b38, liftover_references[1])
        ]),
        lambda x: x.locus
    )

    cmg_ht = cmg_ht.key_by(
        locus1=lifted_over_variants[0].locus,
        alleles1=lifted_over_variants[0].alleles,
        locus2=lifted_over_variants[1].locus,
        alleles2=lifted_over_variants[1].alleles
    )

    return cmg_ht.annotate(
        bad_liftover=(
            hl.is_missing(cmg_ht.locus1) |
            hl.is_missing(cmg_ht.locus2) |
            (cmg_ht.locus1.sequence_context() != cmg_ht.alleles1[0][0]) |
            (cmg_ht.locus2.sequence_context() != cmg_ht.alleles2[0][0])
        )
    )
def create_gene_map_ht(ht, check_gene_contigs=False):
    from gnomad.utils.vep import process_consequences

    ht = process_consequences(ht)
    ht = ht.explode(ht.vep.worst_csq_by_gene_canonical)
    ht = ht.annotate(
        variant_id=ht.locus.contig + ':' + hl.str(ht.locus.position) + '_' + ht.alleles[0] + '/' + ht.alleles[1],
        annotation=annotation_case_builder(ht.vep.worst_csq_by_gene_canonical))

    if check_gene_contigs:
        gene_contigs = ht.group_by(
            gene_id=ht.vep.worst_csq_by_gene_canonical.gene_id,
            gene_symbol=ht.vep.worst_csq_by_gene_canonical.gene_symbol,
        ).aggregate(contigs=hl.agg.collect_as_set(ht.locus.contig))
        assert gene_contigs.all(hl.len(gene_contigs.contigs) == 1)

    gene_map_ht = ht.group_by(
        gene_id=ht.vep.worst_csq_by_gene_canonical.gene_id,
        gene_symbol=ht.vep.worst_csq_by_gene_canonical.gene_symbol,
    ).partition_hint(100).aggregate(
        interval=hl.interval(
            start=hl.locus(hl.agg.take(ht.locus.contig, 1)[0], hl.agg.min(ht.locus.position)),
            end=hl.locus(hl.agg.take(ht.locus.contig, 1)[0], hl.agg.max(ht.locus.position))),
        variants=hl.agg.group_by(ht.annotation, hl.agg.collect(ht.variant_id)),
    )
    return gene_map_ht
def intersect_target_ref(ref_mt_filt, snp_list, grch37_or_grch38, intersect_out, overwrite: bool = False):
    mt = hl.read_matrix_table(ref_mt_filt)
    if grch37_or_grch38.lower() == 'grch38':
        snp_list = snp_list.key_by(locus=hl.locus(hl.str(snp_list.chr), hl.int(snp_list.pos), reference_genome='GRCh38'),
                                   alleles=[snp_list.ref, snp_list.alt])
        mt = mt.filter_rows(hl.is_defined(snp_list[mt.row_key]))

    elif grch37_or_grch38.lower() == 'grch37':
        snp_list = snp_list.key_by(locus=hl.locus(hl.str(snp_list.chr), hl.int(snp_list.pos), reference_genome='GRCh37'),
                                   alleles=[snp_list.ref, snp_list.alt])
        # liftover snp list to GRCh38, filter to SNPs in mt
        rg37, rg38 = load_liftover()

        snp_liftover = snp_list.annotate(new_locus=hl.liftover(snp_list.locus, 'GRCh38'))
        snp_liftover = snp_liftover.filter(hl.is_defined(snp_liftover.new_locus))
        snp_liftover = snp_liftover.key_by(locus=snp_liftover.new_locus, alleles=snp_liftover.alleles)
        mt = mt.filter_rows(hl.is_defined(snp_liftover[mt.row_key]))

    mt = mt.repartition(5000)
    mt = mt.checkpoint(intersect_out, overwrite=overwrite, _read_if_exists=not overwrite)
def import_key(ss_filename, ss_keys, clump_name):
    keys = ss_keys.split(',')
    ss = hl.import_table(ss_filename, impute=True, delimiter=r'\s+',
                         types={keys[1]: hl.tfloat, keys[0]: hl.tstr},
                         min_partitions=100)
    clump = hl.import_table(clump_name, delimiter=r'\s+', min_partitions=10,
                            types={'P': hl.tfloat, 'CHR': hl.tstr, 'BP': hl.tint})
    clump = clump.key_by(locus=hl.locus(clump.CHR, clump.BP))
    clump = clump.filter(clump.P < 5e-8)
    ss = ss.annotate(**{keys[1]: hl.int(ss[keys[1]])})
    chroms = set(map(str, range(1, 23)))
    ss = ss.filter(hl.literal(chroms).contains(ss[keys[0]]))
    ss = ss.annotate(locus=hl.locus(hl.str(ss[keys[0]]), ss[keys[1]]),
                     alleles=[ss[keys[2]], ss[keys[3]]])
    ss = ss.key_by(ss.locus)
    ss = ss.annotate(clump=hl.is_defined(clump[ss.key]))
    ss = ss.key_by(ss.locus, ss.alleles)
    p = keys[-1]
    return ss, p
def test_haploid(self):
    expected = hl.Table.parallelize(
        [hl.struct(locus=hl.locus("X", 16050036), s="C1046::HG02024", GT=hl.call(0, 0), AD=[10, 0], GQ=44),
         hl.struct(locus=hl.locus("X", 16050036), s="C1046::HG02025", GT=hl.call(1), AD=[0, 6], GQ=70),
         hl.struct(locus=hl.locus("X", 16061250), s="C1046::HG02024", GT=hl.call(2, 2), AD=[0, 0, 11], GQ=33),
         hl.struct(locus=hl.locus("X", 16061250), s="C1046::HG02025", GT=hl.call(2), AD=[0, 0, 9], GQ=24)],
        key=['locus', 's'])
    mt = hl.import_vcf(resource('haploid.vcf'))
    entries = mt.entries()
    entries = entries.key_by('locus', 's')
    entries = entries.select('GT', 'AD', 'GQ')
    self.assertTrue(entries._same(expected))
def test_liftover_strand(self):
    grch37 = hl.get_reference('GRCh37')
    grch37.add_liftover(resource('grch37_to_grch38_chr20.over.chain.gz'), 'GRCh38')

    self.assertEqual(
        hl.eval(hl.liftover(hl.locus('20', 60001, 'GRCh37'), 'GRCh38', include_strand=True)),
        hl.eval(hl.struct(result=hl.locus('chr20', 79360, 'GRCh38'), is_negative_strand=False)))

    self.assertEqual(
        hl.eval(hl.liftover(hl.locus_interval('20', 37007582, 37007586, True, True, 'GRCh37'),
                            'GRCh38', include_strand=True)),
        hl.eval(hl.struct(result=hl.locus_interval('chr12', 32563117, 32563121, True, True, 'GRCh38'),
                          is_negative_strand=True)))

    with self.assertRaises(FatalError):
        hl.eval(hl.liftover(hl.parse_locus_interval('1:10000-10000', reference_genome='GRCh37'), 'GRCh38'))

    grch37.remove_liftover("GRCh38")
def test_call_fields(self):
    expected = hl.Table.parallelize(
        [hl.struct(locus=hl.locus("X", 16050036), s="C1046::HG02024", GT=hl.call(0, 0), GTA=hl.null(hl.tcall), GTZ=hl.call(0, 1)),
         hl.struct(locus=hl.locus("X", 16050036), s="C1046::HG02025", GT=hl.call(1), GTA=hl.null(hl.tcall), GTZ=hl.call(0)),
         hl.struct(locus=hl.locus("X", 16061250), s="C1046::HG02024", GT=hl.call(2, 2), GTA=hl.call(2, 1), GTZ=hl.call(1, 1)),
         hl.struct(locus=hl.locus("X", 16061250), s="C1046::HG02025", GT=hl.call(2), GTA=hl.null(hl.tcall), GTZ=hl.call(1))],
        key=['locus', 's'])
    mt = hl.import_vcf(resource('generic.vcf'), call_fields=['GT', 'GTA', 'GTZ'])
    entries = mt.entries()
    entries = entries.key_by('locus', 's')
    entries = entries.select('GT', 'GTA', 'GTZ')
    self.assertTrue(entries._same(expected))
def test_uniqueness(self):
    db = hl.experimental.DB(config=AnnotationDBTests.db_json)
    t = hl.utils.range_table(10)
    t = t.annotate(locus=hl.locus('1', t.idx + 1))
    t = db.annotate_rows_db(t, 'unique_dataset', 'nonunique_dataset')
    assert t.unique_dataset.dtype == hl.tstruct(annotation=hl.tstr)
    assert t.nonunique_dataset.dtype == hl.tstruct(annotation=hl.tarray(hl.tstr))
def main(args):
    add_args = {}
    if args.n_threads is not None:
        add_args['master'] = f'local[{args.n_threads}]'
    hl.init(default_reference='GRCh38', log='/load_finngen.log', **add_args)

    if args.load_single:
        ht = hl.import_table(args.input_file, impute=True, force_bgz=True,
                             min_partitions=100).rename({'#chrom': 'chrom'})
        ht = ht.transmute(locus=hl.locus('chr' + ht.chrom, ht.pos),
                          alleles=[ht.ref, ht.alt]).key_by('locus', 'alleles')
        ht = ht.transmute(Pvalue=ht.pval).annotate_globals(**json.loads(args.additional_dict))
        ht = ht.annotate(**get_vep_formatted_data(args.vep_path)[ht.key])
        ht = ht.checkpoint(args.output_ht, overwrite=args.overwrite,
                           _read_if_exists=not args.overwrite)
        ht = ht.select_globals().annotate(**json.loads(args.additional_dict))
        mt = ht.to_matrix_table(['locus', 'alleles'], ['phenocode'],
                                ['rsids', 'nearest_genes', 'gene', 'annotation'],
                                ['category', 'name', 'n_cases', 'n_controls'])
        mt.checkpoint(args.output_mt, overwrite=args.overwrite,
                      _read_if_exists=not args.overwrite)

    if args.combine_all:
        # all_hts = list(filter(lambda y: y.endswith('.ht'), map(lambda x: x['path'], hl.hadoop_ls(args.input_directory))))
        # print(f'Got {len(all_hts)} HTs...')
        # mt = mwzj_hts_by_tree(all_hts, temp_bucket + '/finngen', ['phenocode'], debug=True)
        # mt.checkpoint(temp_mt_path, overwrite=args.overwrite, _read_if_exists=not args.overwrite)
        mt = hl.read_matrix_table(temp_mt_path)
        mt.naive_coalesce(5000).write(args.output_mt, args.overwrite)
def main(args):
    full_vcf = hl.read_matrix_table(args.allreads_prefix + '.mt')

    # liftover chains
    rg37 = hl.get_reference('GRCh37')
    rg38 = hl.get_reference('GRCh38')
    rg37.add_liftover('gs://hail-common/references/grch37_to_grch38.over.chain.gz', rg38)

    chips = hl.hadoop_open(args.chip_loci)
    chip_dict = {}
    for chip in chips:
        chip = chip.strip().split()
        chip_pos = hl.import_table(chip[1], filter=r'\[Controls\]', skip_blank_lines=True)
        chip_pos = chip_pos.filter(
            hl.array(list(map(str, range(1, 23))) + ['X', 'Y']).contains(chip_pos.chr))
        chip_pos = chip_pos.key_by(locus=hl.locus(chip_pos.chr, hl.int(chip_pos.pos)))

        # liftover chip position info
        chip_pos = chip_pos.annotate(new_locus=hl.liftover(chip_pos.locus, 'GRCh38'))
        chip_pos = chip_pos.filter(hl.is_defined(chip_pos.new_locus))
        chip_pos = chip_pos.key_by(locus=chip_pos.new_locus)

        # filter full vcf to sites in genotype data
        geno_vcf = full_vcf.filter_rows(hl.is_defined(chip_pos[full_vcf.locus]))
        hl.export_vcf(geno_vcf, 'gs://neurogap/high_coverage/NeuroGap_30x_' + chip[0] + '.vcf.bgz')
def import_cadd_table(path: str, genome_version: str, partitions) -> hl.Table:
    if genome_version not in ("37", "38"):
        raise ValueError(f"Invalid genome version: {genome_version}")

    column_names = {'f0': 'chrom', 'f1': 'pos', 'f2': 'ref', 'f3': 'alt', 'f4': 'RawScore', 'f5': 'PHRED'}
    types = {'f0': hl.tstr, 'f1': hl.tint, 'f4': hl.tfloat32, 'f5': hl.tfloat32}

    cadd_ht = hl.import_table(path, force_bgz=True, comment="#", no_header=True,
                              types=types, min_partitions=partitions)
    cadd_ht = cadd_ht.rename(column_names)

    chrom = hl.format("chr%s", cadd_ht.chrom) if genome_version == "38" else cadd_ht.chrom
    locus = hl.locus(chrom, cadd_ht.pos, reference_genome=hl.get_reference(f"GRCh{genome_version}"))
    alleles = hl.array([cadd_ht.ref, cadd_ht.alt])
    cadd_ht = cadd_ht.transmute(locus=locus, alleles=alleles)

    cadd_union_ht = cadd_ht.head(0)
    for contigs in (range(1, 10), list(range(10, 23)) + ["X", "Y", "MT"]):
        contigs = ["chr%s" % contig for contig in contigs] if genome_version == "38" else contigs
        cadd_ht_subset = cadd_ht.filter(hl.array(list(map(str, contigs))).contains(cadd_ht.locus.contig))
        cadd_union_ht = cadd_union_ht.union(cadd_ht_subset)

    cadd_union_ht = cadd_union_ht.key_by("locus", "alleles")

    cadd_union_ht.describe()
    return cadd_union_ht
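# Usage sketch for import_cadd_table (the bucket path and partition count are
# hypothetical; any real run needs hl.init() and a bgzipped CADD TSV laid out
# as chrom/pos/ref/alt/RawScore/PHRED, as assumed by the column map above):
cadd_ht = import_cadd_table("gs://my-bucket/cadd/whole_genome_SNVs.tsv.bgz", "38", partitions=5000)
cadd_ht.write("gs://my-bucket/cadd/cadd.GRCh38.ht", overwrite=True)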
def setupAnnotationDBTests(cls):
    startTestHailContext()
    t = hl.utils.range_table(10)
    t = t.annotate(locus=hl.locus('1', t.idx + 1))
    t = t.annotate(annotation=hl.str(t.idx))
    d = tempfile.TemporaryDirectory()
    fname = d.name + '/f.mt'
    t.write(fname)
    cls.temp_dir = d
    cls.db_json = {
        'unique_dataset': {
            'description': 'now with unique rows!',
            'url': 'https://example.com',
            'key_properties': ['unique'],
            'versions': [{'url': fname, 'version': 'v1-GRCh37'}]
        },
        'nonunique_dataset': {
            'description': 'non-unique rows :(',
            'url': 'https://example.net',
            'key_properties': [],
            'versions': [{'url': fname, 'version': 'v1-GRCh37'}]
        }
    }
def specific_clumps(filename):
    clump = hl.import_table(filename, delimiter=r'\s+', min_partitions=10, types={'P': hl.tfloat})
    clump = clump.key_by(locus=hl.locus(hl.str(clump.CHR), hl.int(clump.BP)))
    return clump
def annotate_variants_with_mnvs(variants_path, mnvs_path):
    ds = hl.read_table(mnvs_path)

    ds = ds.select(
        "changes_amino_acids_for_snvs",
        "constituent_snvs",
        "constituent_snv_ids",
        "n_individuals",
    )

    ds = ds.explode(ds.constituent_snvs, "snv")
    ds = ds.annotate(
        locus=hl.locus(ds.snv.chrom, ds.snv.pos, reference_genome="GRCh37"),
        alleles=[ds.snv.ref, ds.snv.alt],
    )
    ds = ds.group_by(ds.locus, ds.alleles).aggregate(
        multi_nucleotide_variants=hl.agg.collect(ds.row.drop("snv")))

    variants = hl.read_table(variants_path)
    variants = variants.annotate(
        multi_nucleotide_variants=ds[variants.key].multi_nucleotide_variants)
    variants = variants.annotate(
        flags=hl.if_else(
            hl.len(variants.multi_nucleotide_variants) > 0,
            variants.flags.add("mnv"),
            variants.flags,
            missing_false=True,
        ),
        multi_nucleotide_variants=variants.multi_nucleotide_variants.map(
            lambda mnv: mnv.select(
                combined_variant_id=mnv.variant_id,
                changes_amino_acids=mnv.changes_amino_acids_for_snvs.contains(variants.variant_id),
                n_individuals=mnv.n_individuals,
                other_constituent_snvs=mnv.constituent_snv_ids.filter(
                    lambda snv_id: snv_id != variants.variant_id),
            )
        ),
    )

    return variants
def get_contig_size(contig: str) -> int:
    logger.info(f"Working on {contig}")
    contig_ht = hl.utils.range_table(
        ref.contig_length(contig),
        n_partitions=int(ref.contig_length(contig) / 500_000),
    )
    contig_ht = contig_ht.annotate(
        locus=hl.locus(contig=contig, pos=contig_ht.idx + 1, reference_genome=ref)
    )
    contig_ht = contig_ht.filter(contig_ht.locus.sequence_context().lower() != "n")

    if contig in ref.x_contigs:
        contig_ht = contig_ht.filter(contig_ht.locus.in_x_nonpar())
    if contig in ref.y_contigs:
        contig_ht = contig_ht.filter(contig_ht.locus.in_y_nonpar())

    contig_ht = contig_ht.key_by("locus")
    if included_calling_intervals is not None:
        contig_ht = contig_ht.filter(
            hl.is_defined(included_calling_intervals[contig_ht.key])
        )
    if excluded_calling_intervals is not None:
        contig_ht = contig_ht.filter(
            hl.is_missing(excluded_calling_intervals[contig_ht.key])
        )

    contig_size = contig_ht.count()
    logger.info(f"Contig {contig} has {contig_size} bases for coverage.")
    return contig_size
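# Usage sketch for get_contig_size (assumes the module-level `ref`, `logger`,
# and the optional calling-interval tables the function closes over are
# already defined; GRCh38-style contig names are an assumption here):
total_autosome_bases = sum(get_contig_size(f"chr{i}") for i in range(1, 23))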
def rekey_new_reference(
    t: Union[hl.Table, hl.MatrixTable], reference: hl.ReferenceGenome
) -> Union[hl.Table, hl.MatrixTable]:
    """
    Re-key Table or MatrixTable with a new reference genome.

    :param t: Input Table/MatrixTable.
    :param reference: Reference genome to re-key with.
    :return: Re-keyed Table/MatrixTable
    """
    t = t.rename({"locus": "locus_original"})
    locus_expr = hl.locus(
        t.locus_original.contig,
        t.locus_original.position,
        reference_genome=reference,
    )

    if isinstance(t, hl.MatrixTable):
        t = t.annotate_rows(locus=locus_expr)
        t = t.key_rows_by("locus", "alleles").drop("locus_original")
    else:
        t = t.annotate(locus=locus_expr)
        t = t.key_by("locus", "alleles").drop("locus_original")

    return t
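# Usage sketch: rekey_new_reference reinterprets existing coordinates under a
# different ReferenceGenome without any liftover, so it is only safe when both
# builds share contig names and lengths for the loci present. `ht` is a
# hypothetical locus/alleles-keyed table.
ht = rekey_new_reference(ht, hl.get_reference("GRCh38"))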
def compute_prs_mt(genotype_mt_path, prs_mt_path):
    scratch_dir = 'gs://ukbb-diverse-temp-30day/nb-scratch'

    clumped = hl.read_table('gs://ukb-diverse-pops/ld_prune/results_high_quality/not_AMR/phecode-250.2-both_sexes/clump_results.ht/')
    sumstats = hl.import_table('gs://ukb-diverse-pops/sumstats_flat_files/phecode-250.2-both_sexes.tsv.bgz',
                               impute=True)
    sumstats = sumstats.annotate(locus=hl.locus(sumstats.chr, sumstats.pos),
                                 alleles=hl.array([sumstats.ref, sumstats.alt]))
    sumstats = sumstats.key_by('locus', 'alleles')
    sumstats.describe()

    # mt = hl.read_matrix_table(genotype_mt_path)  # read genotype mt subset
    # get full genotype mt
    meta_mt = hl.read_matrix_table(get_meta_analysis_results_path())
    mt = get_filtered_mt_with_x()
    mt = mt.filter_rows(hl.is_defined(meta_mt.rows()[mt.row_key]))
    mt = mt.select_entries('dosage')
    mt = mt.select_rows()
    mt = mt.select_cols()

    mt = mt.annotate_rows(beta=hl.if_else(hl.is_defined(clumped[mt.row_key]),
                                          sumstats[mt.row_key].beta_meta, 0))
    mt = mt.annotate_cols(score=hl.agg.sum(mt.beta * mt.dosage))
    mt_cols = mt.cols()
    mt_cols = mt_cols.repartition(1000)
    mt_cols.write(f'{scratch_dir}/prs_all_samples.ht')
def test_reference_genome_sequence(self):
    gr3 = ReferenceGenome.read(resource("fake_ref_genome.json"))
    self.assertEqual(gr3.name, "my_reference_genome")
    self.assertFalse(gr3.has_sequence())

    gr4 = ReferenceGenome.from_fasta_file("test_rg", resource("fake_reference.fasta"),
                                          resource("fake_reference.fasta.fai"),
                                          mt_contigs=["b", "c"], x_contigs=["a"])
    self.assertTrue(gr4.has_sequence())
    self.assertTrue(gr4.x_contigs == ["a"])

    t = hl.import_table(resource("fake_reference.tsv"), impute=True)
    self.assertTrue(hl.eval(t.all(hl.get_sequence(t.contig, t.pos, reference_genome=gr4) == t.base)))

    l = hl.locus("a", 7, gr4)
    self.assertTrue(hl.eval(l.sequence_context(before=3, after=3) == "TTTCGAA"))

    gr4.remove_sequence()
    assert not gr4.has_sequence()

    gr4.add_sequence(resource("fake_reference.fasta"), resource("fake_reference.fasta.fai"))
    assert gr4.has_sequence()
def test_de_novo(self):
    mt = hl.import_vcf(resource('denovo.vcf'))
    mt = mt.filter_rows(mt.locus.in_y_par(), keep=False)  # de_novo_finder doesn't know about y PAR
    ped = hl.Pedigree.read(resource('denovo.fam'))
    r = hl.de_novo(mt, ped, mt.info.ESP)
    r = r.select(
        prior=r.prior,
        kid_id=r.proband.s,
        dad_id=r.father.s,
        mom_id=r.mother.s,
        p_de_novo=r.p_de_novo,
        confidence=r.confidence).key_by('locus', 'alleles', 'kid_id', 'dad_id', 'mom_id')

    truth = hl.import_table(resource('denovo.out'), impute=True, comment='#')
    truth = truth.select(
        locus=hl.locus(truth['Chr'], truth['Pos']),
        alleles=[truth['Ref'], truth['Alt']],
        kid_id=truth['Child_ID'],
        dad_id=truth['Dad_ID'],
        mom_id=truth['Mom_ID'],
        p_de_novo=truth['Prob_dn'],
        confidence=truth['Validation_Likelihood'].split('_')[0]).key_by('locus', 'alleles', 'kid_id', 'dad_id', 'mom_id')

    j = r.join(truth, how='outer')
    self.assertTrue(j.all((j.confidence == j.confidence_1) & (hl.abs(j.p_de_novo - j.p_de_novo_1) < 1e-4)))
def test_window_by_locus(self):
    mt = hl.utils.range_matrix_table(100, 2, n_partitions=10)
    mt = mt.annotate_rows(locus=hl.locus('1', mt.row_idx + 1))
    mt = mt.key_rows_by('locus')
    mt = mt.annotate_entries(e_row_idx=mt.row_idx, e_col_idx=mt.col_idx)
    mt = hl.window_by_locus(mt, 5).cache()

    self.assertEqual(mt.count_rows(), 100)

    rows = mt.rows()
    self.assertTrue(rows.all((rows.row_idx < 5) | (rows.prev_rows.length() == 5)))
    self.assertTrue(rows.all(hl.all(lambda x: (rows.row_idx - 1 - x[0]) == x[1].row_idx,
                                    hl.zip_with_index(rows.prev_rows))))

    entries = mt.entries()
    self.assertTrue(entries.all(hl.all(lambda x: x.e_col_idx == entries.col_idx,
                                       entries.prev_entries)))
    self.assertTrue(entries.all(hl.all(lambda x: entries.row_idx - 1 - x[0] == x[1].e_row_idx,
                                       hl.zip_with_index(entries.prev_entries))))
def test_import_keyby_count_ldsc_lowered_shuffle(self):
    # integration test pulled out of test_ld_score_regression to isolate issues
    # with lowered shuffles and RDD serialization, 2021-07-06
    # if this comment no longer reflects the backend system, that's a really good thing
    ht_scores = hl.import_table(
        doctest_resource('ld_score_regression.univariate_ld_scores.tsv'),
        key='SNP',
        types={'L2': hl.tfloat, 'BP': hl.tint})

    ht_20160 = hl.import_table(
        doctest_resource('ld_score_regression.20160.sumstats.tsv'),
        key='SNP',
        types={'N': hl.tint, 'Z': hl.tfloat})

    j1 = ht_scores[ht_20160['SNP']]
    ht_20160 = ht_20160.annotate(
        ld_score=j1['L2'],
        locus=hl.locus(j1['CHR'], j1['BP']),
        alleles=hl.array([ht_20160['A2'], ht_20160['A1']]))
    ht_20160 = ht_20160.key_by(ht_20160['locus'], ht_20160['alleles'])
    assert ht_20160._force_count() == 151
def test_tdt(self):
    pedigree = hl.Pedigree.read(resource('tdt.fam'))
    tdt_tab = (hl.transmission_disequilibrium_test(
        hl.split_multi_hts(hl.import_vcf(resource('tdt.vcf'), min_partitions=4)),
        pedigree))

    truth = hl.import_table(
        resource('tdt_results.tsv'),
        types={'POSITION': hl.tint32, 'T': hl.tint32, 'U': hl.tint32,
               'Chi2': hl.tfloat64, 'Pval': hl.tfloat64})
    truth = (truth
             .transmute(locus=hl.locus(truth.CHROM, truth.POSITION),
                        alleles=[truth.REF, truth.ALT])
             .key_by('locus', 'alleles'))

    if tdt_tab.count() != truth.count():
        self.fail('Result has {} rows but should have {} rows'.format(tdt_tab.count(), truth.count()))

    bad = (tdt_tab.filter(hl.is_nan(tdt_tab.p_value), keep=False)
           .join(truth.filter(hl.is_nan(truth.Pval), keep=False), how='outer'))
    bad.describe()

    bad = bad.filter(~(
        (bad.t == bad.T) &
        (bad.u == bad.U) &
        (hl.abs(bad.chi_sq - bad.Chi2) < 0.001) &
        (hl.abs(bad.p_value - bad.Pval) < 0.001)))

    if bad.count() != 0:
        bad.order_by(hl.asc(bad.v)).show()
        self.fail('Found rows in violation of the predicate (see show output)')
def specific_clumps(filename):
    clump = hl.import_table(filename, delimiter=r'\s+', min_partitions=10, types={'P': hl.tfloat})
    clump_dict = clump.aggregate(
        hl.dict(hl.agg.collect((hl.locus(hl.str(clump.CHR), hl.int(clump.BP)), True))),
        _localize=False)
    return clump_dict
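# Usage sketch: because this variant of specific_clumps returns an unlocalized
# (broadcast) dict expression rather than a Table, clump membership can be
# tested without a join. `ss` is a hypothetical summary-stats table with a
# `locus` row field, and the filename is hypothetical too.
clump_dict = specific_clumps('plink.clumped')
ss = ss.annotate(clump=clump_dict.get(ss.locus, False))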
def create_rf_2_0_2_rank(data_type: str, beta: bool) -> None:
    """
    Creates a rank file for 2.0.2 RF and writes it to its correct location.

    :param str data_type: One of 'exomes' or 'genomes'
    :param bool beta: If set, then creates the table for the "beta" 2.0.2 RF with QD / max(p(AB))
    :return: Nothing
    :rtype: None
    """
    logger.info(f"Creating rank file for {data_type} RF 2.0.2{'beta' if beta else ''}")

    if not hl.hadoop_exists(f'gs://gnomad-tmp/gnomad_rf_2_0_2_{data_type}_{str(beta)}_tmp.ht'):
        ht = hl.import_table(get_2_0_2_rf_path(data_type, beta),
                             types={'chrom': hl.tstr},
                             impute=True,
                             min_partitions=1000)
        if 'chrom' in ht.row:
            ht = ht.transmute(locus=hl.locus(ht.chrom, ht.pos),
                              alleles=[ht.ref, ht.alt])
        else:
            ht = ht.transmute(
                v=hl.parse_variant(ht.v),
                rfprob=ht.rf_rpob_tp  # Yes, this is awful
            )
            ht = ht.transmute(locus=ht.v.locus, alleles=ht.v.alleles)
        ht = ht.key_by('locus', 'alleles')

        gnomad_ht = get_gnomad_annotations(data_type)
        ht = ht.annotate(**gnomad_ht[ht.key], score=ht.rfprob)
        ht.write(f'gs://gnomad-tmp/gnomad_rf_2_0_2_{data_type}_{str(beta)}_tmp.ht')

    ht = hl.read_table(f'gs://gnomad-tmp/gnomad_rf_2_0_2_{data_type}_{str(beta)}_tmp.ht')
    ht = add_rank(ht,
                  score_expr=1 - ht.score,
                  subrank_expr={
                      'singleton_rank': ht.singleton,
                      'biallelic_rank': ~ht.was_split,
                      'biallelic_singleton_rank': ~ht.was_split & ht.singleton,
                      'adj_rank': ht.ac > 0,
                      'adj_biallelic_rank': ~ht.was_split & (ht.ac > 0),
                      'adj_singleton_rank': ht.singleton & (ht.ac > 0),
                      'adj_biallelic_singleton_rank': ~ht.was_split & ht.singleton & (ht.ac > 0)
                  })
    ht.write(score_ranking_path(data_type, 'rf_2.0.2{}'.format('_beta' if beta else '')),
             overwrite=True)
def import_key(ss_filename, ss_keys):
    ss = hl.import_table(ss_filename, impute=True, delimiter=r'\s+')
    keys = ss_keys.split(',')
    p = keys[-1]
    ss = ss.annotate(locus=hl.locus(hl.str(ss[keys[0]]), ss[keys[1]]),
                     alleles=[ss[keys[2]], ss[keys[3]]])
    ss = ss.key_by(ss.locus, ss.alleles)
    return ss, p
def get_annot_ht():
    t = hl.import_table(f'{wd_data}/gencode.v31lift37.annotation.gff3.gz',
                        no_header=True, impute=True, comment=('#'), force=True)
    # t = hl.import_table('/Users/nbaya/Downloads/gencode.v31lift37.annotation.gtf', no_header=True, impute=True, comment=('#'))
    t2 = t.annotate(GFF_Columns=t.f8.split(";").map(lambda x: x.split("=")))
    t2 = t2.filter(t2.f2 == "CDS")  # only want coding sequences, not entire genes
    t2 = t2.filter(hl.is_valid_locus(t2.f0[3:], t2.f3, 'GRCh37'))
    t2 = t2.filter(hl.is_valid_locus(t2.f0[3:], t2.f4, 'GRCh37'))
    t2 = t2.annotate(interval=hl.interval(hl.locus(t2.f0[3:], t2.f3, 'GRCh37'),
                                          hl.locus(t2.f0[3:], t2.f4, 'GRCh37')))
    t2 = t2.annotate(GFF_Columns=hl.dict(t2.GFF_Columns.map(lambda x: (x[0], x[1]))))
    t2 = t2.annotate(ID=t2.GFF_Columns["ID"],
                     gene_id=t2.GFF_Columns["gene_id"],
                     gene_name=t2.GFF_Columns["gene_name"],
                     gene_type=t2.GFF_Columns["gene_type"],
                     level=t2.GFF_Columns["level"])
    t2 = t2.annotate(type=t2.f2, gene_score=t2.f5, gene_strand=t2.f6, gene_phase=t2.f7)
    t2 = t2.drop(t2.GFF_Columns, t2.f8, t2.f0, t2.f1, t2.f2, t2.f3, t2.f4, t2.f5, t2.f6, t2.f7)
    t2 = t2.key_by(t2.interval)
    return t2
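# Usage sketch: because get_annot_ht() returns an interval-keyed table, a
# locus-keyed table can be annotated by interval containment directly.
# `variants_ht` is a hypothetical locus-keyed table.
annot = get_annot_ht()
variants_ht = variants_ht.annotate(cds_annotation=annot[variants_ht.locus])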
def setupAnnotationDBTests(cls):
    startTestHailContext()
    t = hl.utils.range_table(10)
    t = t.key_by(locus=hl.locus('1', t.idx + 1))
    t = t.annotate(annotation=hl.str(t.idx))
    cls.tempdir_manager = hl.TemporaryDirectory()
    d = cls.tempdir_manager.__enter__()
    fname = d + '/f.mt'
    t.write(fname)
    cls.db_json = {
        'unique_dataset': {
            'description': 'now with unique rows!',
            'url': 'https://example.com',
            'annotation_db': {'key_properties': ['unique']},
            'versions': [{
                'url': {"aws": {"eu": fname, "us": fname},
                        "gcp": {"eu": fname, "us": fname}},
                'version': 'v1',
                'reference_genome': 'GRCh37'
            }]
        },
        'nonunique_dataset': {
            'description': 'non-unique rows :(',
            'url': 'https://example.net',
            'annotation_db': {'key_properties': []},
            'versions': [{
                'url': {"aws": {"eu": fname, "us": fname},
                        "gcp": {"eu": fname, "us": fname}},
                'version': 'v1',
                'reference_genome': 'GRCh37'
            }]
        }
    }
def test_constructors(self):
    rg = hl.ReferenceGenome("foo", ["1"], {"1": 100})

    schema = hl.tstruct(a=hl.tfloat64, b=hl.tfloat64, c=hl.tint32, d=hl.tint32)
    rows = [{'a': 2.0, 'b': 4.0, 'c': 1, 'd': 5}]
    kt = hl.Table.parallelize(rows, schema)
    kt = kt.annotate(d=hl.int64(kt.d))

    kt = kt.annotate(l1=hl.parse_locus("1:51"),
                     l2=hl.locus("1", 51, reference_genome=rg),
                     i1=hl.parse_locus_interval("1:51-56", reference_genome=rg),
                     i2=hl.interval(hl.locus("1", 51, reference_genome=rg),
                                    hl.locus("1", 56, reference_genome=rg)))

    expected_schema = {'a': hl.tfloat64, 'b': hl.tfloat64, 'c': hl.tint32, 'd': hl.tint64,
                       'l1': hl.tlocus(), 'l2': hl.tlocus(rg),
                       'i1': hl.tinterval(hl.tlocus(rg)), 'i2': hl.tinterval(hl.tlocus(rg))}

    self.assertTrue(all([expected_schema[f] == t for f, t in kt.row.dtype.items()]))
def test_uniqueness(self):
    db = hl.experimental.DB(region='us', cloud='gcp', config=AnnotationDBTests.db_json)
    t = hl.utils.range_table(10)
    t = t.key_by(locus=hl.locus('1', t.idx + 1))
    t = db.annotate_rows_db(t, 'unique_dataset', 'nonunique_dataset')
    assert t.unique_dataset.dtype == hl.dtype('struct{idx: int32, annotation: str}')
    assert t.nonunique_dataset.dtype == hl.dtype('array<struct{idx: int32, annotation: str}>')
def create_all_values():
    return hl.struct(
        f32=hl.float32(3.14),
        i64=hl.int64(-9),
        m=hl.null(hl.tfloat64),
        astruct=hl.struct(a=hl.null(hl.tint32), b=5.5),
        mstruct=hl.null(hl.tstruct(x=hl.tint32, y=hl.tstr)),
        aset=hl.set(['foo', 'bar', 'baz']),
        mset=hl.null(hl.tset(hl.tfloat64)),
        d=hl.dict({hl.array(['a', 'b']): 0.5, hl.array(['x', hl.null(hl.tstr), 'z']): 0.3}),
        md=hl.null(hl.tdict(hl.tint32, hl.tstr)),
        h38=hl.locus('chr22', 33878978, 'GRCh38'),
        ml=hl.null(hl.tlocus('GRCh37')),
        i=hl.interval(hl.locus('1', 999), hl.locus('1', 1001)),
        c=hl.call(0, 1),
        mc=hl.null(hl.tcall),
        t=hl.tuple([hl.call(1, 2, phased=True), 'foo', hl.null(hl.tstr)]),
        mt=hl.null(hl.ttuple(hl.tlocus('GRCh37'), hl.tbool))
    )
def create_all_values_datasets():
    all_values = hl.struct(
        f32=hl.float32(3.14),
        i64=hl.int64(-9),
        m=hl.null(hl.tfloat64),
        astruct=hl.struct(a=hl.null(hl.tint32), b=5.5),
        mstruct=hl.null(hl.tstruct(x=hl.tint32, y=hl.tstr)),
        aset=hl.set(['foo', 'bar', 'baz']),
        mset=hl.null(hl.tset(hl.tfloat64)),
        d=hl.dict({hl.array(['a', 'b']): 0.5, hl.array(['x', hl.null(hl.tstr), 'z']): 0.3}),
        md=hl.null(hl.tdict(hl.tint32, hl.tstr)),
        h38=hl.locus('chr22', 33878978, 'GRCh38'),
        ml=hl.null(hl.tlocus('GRCh37')),
        i=hl.interval(hl.locus('1', 999), hl.locus('1', 1001)),
        c=hl.call(0, 1),
        mc=hl.null(hl.tcall),
        t=hl.tuple([hl.call(1, 2, phased=True), 'foo', hl.null(hl.tstr)]),
        mt=hl.null(hl.ttuple(hl.tlocus('GRCh37'), hl.tbool))
    )

    def prefix(s, p):
        return hl.struct(**{p + k: s[k] for k in s})

    all_values_table = (hl.utils.range_table(5, n_partitions=3)
                        .annotate_globals(**prefix(all_values, 'global_'))
                        .annotate(**all_values)
                        .cache())

    all_values_matrix_table = (hl.utils.range_matrix_table(3, 2, n_partitions=2)
                               .annotate_globals(**prefix(all_values, 'global_'))
                               .annotate_rows(**prefix(all_values, 'row_'))
                               .annotate_cols(**prefix(all_values, 'col_'))
                               .annotate_entries(**prefix(all_values, 'entry_'))
                               .cache())

    return all_values_table, all_values_matrix_table
def generate_random_gen():
    mt = hl.utils.range_matrix_table(30, 10)
    mt = (mt.annotate_rows(locus=hl.locus('20', mt.row_idx + 1), alleles=['A', 'G'])
          .key_rows_by('locus', 'alleles'))
    mt = (mt.annotate_cols(s=hl.str(mt.col_idx))
          .key_cols_by('s'))
    # using totally random values leads to rounding differences where
    # identical GEN values get rounded differently, leading to
    # differences in the GT call between import_{gen, bgen}
    mt = mt.annotate_entries(a=hl.int32(hl.rand_unif(0.0, 255.0)))
    mt = mt.annotate_entries(b=hl.int32(hl.rand_unif(0.0, 255.0 - mt.a)))
    mt = mt.transmute_entries(GP=hl.array([mt.a, mt.b, 255.0 - mt.a - mt.b]) / 255.0)
    # 20% missing
    mt = mt.filter_entries(hl.rand_bool(0.8))
    hl.export_gen(mt, 'random', precision=4)
def test_hardy_weinberg_test(self):
    mt = hl.import_vcf(resource('HWE_test.vcf'))
    mt = mt.select_rows(**hl.agg.hardy_weinberg_test(mt.GT))
    rt = mt.rows()
    expected = hl.Table.parallelize([
        hl.struct(
            locus=hl.locus('20', pos),
            alleles=alleles,
            het_freq_hwe=r,
            p_value=p)
        for (pos, alleles, r, p) in [
            (1, ['A', 'G'], 0.0, 0.5),
            (2, ['A', 'G'], 0.25, 0.5),
            (3, ['T', 'C'], 0.5357142857142857, 0.21428571428571427),
            (4, ['T', 'A'], 0.5714285714285714, 0.6571428571428573),
            (5, ['G', 'A'], 0.3333333333333333, 0.5)]],
        key=['locus', 'alleles'])
    self.assertTrue(rt.filter(rt.locus.position != 6)._same(expected))

    rt6 = rt.filter(rt.locus.position == 6).collect()[0]
    self.assertEqual(rt6['p_value'], 0.5)
    self.assertTrue(math.isnan(rt6['het_freq_hwe']))
def init(doctest_namespace):
    # This gets run once per process -- must avoid race conditions
    print("setting up doctest...")

    olddir = os.getcwd()
    os.chdir("docs/")

    doctest_namespace['hl'] = hl
    doctest_namespace['agg'] = agg

    if not os.path.isdir("output/"):
        try:
            os.mkdir("output/")
        except OSError:
            pass

    files = ["sample.vds", "sample.qc.vds", "sample.filtered.vds"]
    for f in files:
        if os.path.isdir(f):
            shutil.rmtree(f)

    ds = hl.read_matrix_table('data/example.vds')
    doctest_namespace['ds'] = ds
    doctest_namespace['dataset'] = ds
    doctest_namespace['dataset2'] = ds.annotate_globals(global_field=5)
    doctest_namespace['dataset_to_union_1'] = ds
    doctest_namespace['dataset_to_union_2'] = ds

    v_metadata = ds.rows().annotate_globals(global_field=5).annotate(consequence='SYN')
    doctest_namespace['v_metadata'] = v_metadata

    s_metadata = ds.cols().annotate(pop='AMR', is_case=False, sex='F')
    doctest_namespace['s_metadata'] = s_metadata

    # Table
    table1 = hl.import_table('data/kt_example1.tsv', impute=True, key='ID')
    table1 = table1.annotate_globals(global_field_1=5, global_field_2=10)
    doctest_namespace['table1'] = table1
    doctest_namespace['other_table'] = table1

    table2 = hl.import_table('data/kt_example2.tsv', impute=True, key='ID')
    doctest_namespace['table2'] = table2

    table4 = hl.import_table('data/kt_example4.tsv', impute=True,
                             types={'B': hl.tstruct(B0=hl.tbool, B1=hl.tstr),
                                    'D': hl.tstruct(cat=hl.tint32, dog=hl.tint32),
                                    'E': hl.tstruct(A=hl.tint32, B=hl.tint32)})
    doctest_namespace['table4'] = table4

    people_table = hl.import_table('data/explode_example.tsv', delimiter='\\s+',
                                   types={'Age': hl.tint32, 'Children': hl.tarray(hl.tstr)},
                                   key='Name')
    doctest_namespace['people_table'] = people_table

    # TDT
    doctest_namespace['tdt_dataset'] = hl.import_vcf('data/tdt_tiny.vcf')

    ds2 = hl.variant_qc(ds)
    doctest_namespace['ds2'] = ds2.select_rows(AF=ds2.variant_qc.AF)

    # Expressions
    doctest_namespace['names'] = hl.literal(['Alice', 'Bob', 'Charlie'])
    doctest_namespace['a1'] = hl.literal([0, 1, 2, 3, 4, 5])
    doctest_namespace['a2'] = hl.literal([1, -1, 1, -1, 1, -1])
    doctest_namespace['t'] = hl.literal(True)
    doctest_namespace['f'] = hl.literal(False)
    doctest_namespace['na'] = hl.null(hl.tbool)
    doctest_namespace['call'] = hl.call(0, 1, phased=False)
    doctest_namespace['a'] = hl.literal([1, 2, 3, 4, 5])
    doctest_namespace['d'] = hl.literal({'Alice': 43, 'Bob': 33, 'Charles': 44})
    doctest_namespace['interval'] = hl.interval(3, 11)
    doctest_namespace['locus_interval'] = hl.parse_locus_interval("1:53242-90543")
    doctest_namespace['locus'] = hl.locus('1', 1034245)
    doctest_namespace['x'] = hl.literal(3)
    doctest_namespace['y'] = hl.literal(4.5)
    doctest_namespace['s1'] = hl.literal({1, 2, 3})
    doctest_namespace['s2'] = hl.literal({1, 3, 5})
    doctest_namespace['s3'] = hl.literal({'Alice', 'Bob', 'Charlie'})
    doctest_namespace['struct'] = hl.struct(a=5, b='Foo')
    doctest_namespace['tup'] = hl.literal(("a", 1, [1, 2, 3]))
    doctest_namespace['s'] = hl.literal('The quick brown fox')
    doctest_namespace['interval2'] = hl.Interval(3, 6)

    # Overview
    doctest_namespace['ht'] = hl.import_table("data/kt_example1.tsv", impute=True)
    doctest_namespace['mt'] = ds

    gnomad_data = ds.rows()
    doctest_namespace['gnomad_data'] = gnomad_data.select(gnomad_data.info.AF)

    # BGEN
    bgen = hl.import_bgen('data/example.8bits.bgen', entry_fields=['GT', 'GP', 'dosage'])
    doctest_namespace['variants_table'] = bgen.rows()

    print("finished setting up doctest...")

    yield

    os.chdir(olddir)
def test_reference_genome_liftover(self):
    grch37 = hl.get_reference('GRCh37')
    grch38 = hl.get_reference('GRCh38')

    self.assertTrue(not grch37.has_liftover('GRCh38') and not grch38.has_liftover('GRCh37'))
    grch37.add_liftover(resource('grch37_to_grch38_chr20.over.chain.gz'), 'GRCh38')
    grch38.add_liftover(resource('grch38_to_grch37_chr20.over.chain.gz'), 'GRCh37')
    self.assertTrue(grch37.has_liftover('GRCh38') and grch38.has_liftover('GRCh37'))

    ds = hl.import_vcf(resource('sample.vcf'))
    t = ds.annotate_rows(liftover=hl.liftover(hl.liftover(ds.locus, 'GRCh38'), 'GRCh37')).rows()
    self.assertTrue(t.all(t.locus == t.liftover))

    null_locus = hl.null(hl.tlocus('GRCh38'))

    rows = [
        {'l37': hl.locus('20', 1, 'GRCh37'), 'l38': null_locus},
        {'l37': hl.locus('20', 60000, 'GRCh37'), 'l38': null_locus},
        {'l37': hl.locus('20', 60001, 'GRCh37'), 'l38': hl.locus('chr20', 79360, 'GRCh38')},
        {'l37': hl.locus('20', 278686, 'GRCh37'), 'l38': hl.locus('chr20', 298045, 'GRCh38')},
        {'l37': hl.locus('20', 278687, 'GRCh37'), 'l38': hl.locus('chr20', 298046, 'GRCh38')},
        {'l37': hl.locus('20', 278688, 'GRCh37'), 'l38': null_locus},
        {'l37': hl.locus('20', 278689, 'GRCh37'), 'l38': null_locus},
        {'l37': hl.locus('20', 278690, 'GRCh37'), 'l38': null_locus},
        {'l37': hl.locus('20', 278691, 'GRCh37'), 'l38': hl.locus('chr20', 298047, 'GRCh38')},
        {'l37': hl.locus('20', 37007586, 'GRCh37'), 'l38': hl.locus('chr12', 32563117, 'GRCh38')},
        {'l37': hl.locus('20', 62965520, 'GRCh37'), 'l38': hl.locus('chr20', 64334167, 'GRCh38')},
        {'l37': hl.locus('20', 62965521, 'GRCh37'), 'l38': null_locus}
    ]
    schema = hl.tstruct(l37=hl.tlocus(grch37), l38=hl.tlocus(grch38))
    t = hl.Table.parallelize(rows, schema)
    self.assertTrue(t.all(hl.cond(hl.is_defined(t.l38),
                                  hl.liftover(t.l37, 'GRCh38') == t.l38,
                                  hl.is_missing(hl.liftover(t.l37, 'GRCh38')))))

    t = t.filter(hl.is_defined(t.l38))
    self.assertTrue(t.count() == 6)
    t = t.key_by('l38')
    t.count()
    self.assertTrue(list(t.key) == ['l38'])

    null_locus_interval = hl.null(hl.tinterval(hl.tlocus('GRCh38')))
    rows = [
        {'i37': hl.locus_interval('20', 1, 60000, True, False, 'GRCh37'),
         'i38': null_locus_interval},
        {'i37': hl.locus_interval('20', 60001, 82456, True, True, 'GRCh37'),
         'i38': hl.locus_interval('chr20', 79360, 101815, True, True, 'GRCh38')}
    ]
    schema = hl.tstruct(i37=hl.tinterval(hl.tlocus(grch37)), i38=hl.tinterval(hl.tlocus(grch38)))
    t = hl.Table.parallelize(rows, schema)
    self.assertTrue(t.all(hl.liftover(t.i37, 'GRCh38') == t.i38))

    grch37.remove_liftover("GRCh38")
    grch38.remove_liftover("GRCh37")
def test_ld_score(self):
    ht = hl.import_table(doctest_resource('ldsc.annot'),
                         types={'BP': hl.tint, 'CM': hl.tfloat,
                                'binary': hl.tint, 'continuous': hl.tfloat})
    ht = ht.annotate(locus=hl.locus(ht.CHR, ht.BP))
    ht = ht.key_by('locus')

    mt = hl.import_plink(bed=doctest_resource('ldsc.bed'),
                         bim=doctest_resource('ldsc.bim'),
                         fam=doctest_resource('ldsc.fam'))
    mt = mt.annotate_rows(binary=ht[mt.locus].binary,
                          continuous=ht[mt.locus].continuous)

    ht_univariate = hl.experimental.ld_score(
        entry_expr=mt.GT.n_alt_alleles(),
        locus_expr=mt.locus,
        radius=1.0,
        coord_expr=mt.cm_position)

    ht_annotated = hl.experimental.ld_score(
        entry_expr=mt.GT.n_alt_alleles(),
        locus_expr=mt.locus,
        radius=1.0,
        coord_expr=mt.cm_position,
        annotation_exprs=[mt.binary, mt.continuous])

    univariate = ht_univariate.aggregate(hl.struct(
        chr20=hl.agg.filter(
            (ht_univariate.locus.contig == '20') &
            (ht_univariate.locus.position == 82079),
            hl.agg.collect(ht_univariate.univariate))[0],
        chr22=hl.agg.filter(
            (ht_univariate.locus.contig == '22') &
            (ht_univariate.locus.position == 16894090),
            hl.agg.collect(ht_univariate.univariate))[0],
        mean=hl.agg.mean(ht_univariate.univariate)))

    self.assertAlmostEqual(univariate.chr20, 1.601, places=3)
    self.assertAlmostEqual(univariate.chr22, 1.140, places=3)
    self.assertAlmostEqual(univariate.mean, 3.507, places=3)

    annotated = ht_annotated.aggregate(hl.struct(
        chr20=hl.struct(
            binary=hl.agg.filter(
                (ht_annotated.locus.contig == '20') &
                (ht_annotated.locus.position == 82079),
                hl.agg.collect(ht_annotated.binary))[0],
            continuous=hl.agg.filter(
                (ht_annotated.locus.contig == '20') &
                (ht_annotated.locus.position == 82079),
                hl.agg.collect(ht_annotated.continuous))[0]),
        chr22=hl.struct(
            binary=hl.agg.filter(
                (ht_annotated.locus.contig == '22') &
                (ht_annotated.locus.position == 16894090),
                hl.agg.collect(ht_annotated.binary))[0],
            continuous=hl.agg.filter(
                (ht_annotated.locus.contig == '22') &
                (ht_annotated.locus.position == 16894090),
                hl.agg.collect(ht_annotated.continuous))[0]),
        mean_stats=hl.struct(binary=hl.agg.mean(ht_annotated.binary),
                             continuous=hl.agg.mean(ht_annotated.continuous))))

    self.assertAlmostEqual(annotated.chr20.binary, 1.152, places=3)
    self.assertAlmostEqual(annotated.chr20.continuous, 73.014, places=3)
    self.assertAlmostEqual(annotated.chr22.binary, 1.107, places=3)
    self.assertAlmostEqual(annotated.chr22.continuous, 102.174, places=3)
    self.assertAlmostEqual(annotated.mean_stats.binary, 0.965, places=3)
    self.assertAlmostEqual(annotated.mean_stats.continuous, 176.528, places=3)
def test_ld_score_regression(self):
    ht_scores = hl.import_table(
        doctest_resource('ld_score_regression.univariate_ld_scores.tsv'),
        key='SNP', types={'L2': hl.tfloat, 'BP': hl.tint})

    ht_50_irnt = hl.import_table(
        doctest_resource('ld_score_regression.50_irnt.sumstats.tsv'),
        key='SNP', types={'N': hl.tint, 'Z': hl.tfloat})

    ht_50_irnt = ht_50_irnt.annotate(
        chi_squared=ht_50_irnt['Z']**2,
        n=ht_50_irnt['N'],
        ld_score=ht_scores[ht_50_irnt['SNP']]['L2'],
        locus=hl.locus(ht_scores[ht_50_irnt['SNP']]['CHR'],
                       ht_scores[ht_50_irnt['SNP']]['BP']),
        alleles=hl.array([ht_50_irnt['A2'], ht_50_irnt['A1']]),
        phenotype='50_irnt')

    ht_50_irnt = ht_50_irnt.key_by(ht_50_irnt['locus'], ht_50_irnt['alleles'])
    ht_50_irnt = ht_50_irnt.select(ht_50_irnt['chi_squared'], ht_50_irnt['n'],
                                   ht_50_irnt['ld_score'], ht_50_irnt['phenotype'])

    ht_20160 = hl.import_table(
        doctest_resource('ld_score_regression.20160.sumstats.tsv'),
        key='SNP', types={'N': hl.tint, 'Z': hl.tfloat})

    ht_20160 = ht_20160.annotate(
        chi_squared=ht_20160['Z']**2,
        n=ht_20160['N'],
        ld_score=ht_scores[ht_20160['SNP']]['L2'],
        locus=hl.locus(ht_scores[ht_20160['SNP']]['CHR'],
                       ht_scores[ht_20160['SNP']]['BP']),
        alleles=hl.array([ht_20160['A2'], ht_20160['A1']]),
        phenotype='20160')

    ht_20160 = ht_20160.key_by(ht_20160['locus'], ht_20160['alleles'])
    ht_20160 = ht_20160.select(ht_20160['chi_squared'], ht_20160['n'],
                               ht_20160['ld_score'], ht_20160['phenotype'])

    ht = ht_50_irnt.union(ht_20160)
    mt = ht.to_matrix_table(row_key=['locus', 'alleles'],
                            col_key=['phenotype'],
                            row_fields=['ld_score'],
                            col_fields=[])

    mt_tmp = new_temp_file()
    mt.write(mt_tmp, overwrite=True)
    mt = hl.read_matrix_table(mt_tmp)

    ht_results = hl.experimental.ld_score_regression(
        weight_expr=mt['ld_score'],
        ld_score_expr=mt['ld_score'],
        chi_sq_exprs=mt['chi_squared'],
        n_samples_exprs=mt['n'],
        n_blocks=20,
        two_step_threshold=5,
        n_reference_panel_variants=1173569)

    results = {
        x['phenotype']: {
            'mean_chi_sq': x['mean_chi_sq'],
            'intercept_estimate': x['intercept']['estimate'],
            'intercept_standard_error': x['intercept']['standard_error'],
            'snp_heritability_estimate': x['snp_heritability']['estimate'],
            'snp_heritability_standard_error': x['snp_heritability']['standard_error']}
        for x in ht_results.collect()}

    self.assertAlmostEqual(results['50_irnt']['mean_chi_sq'], 3.4386, places=4)
    self.assertAlmostEqual(results['50_irnt']['intercept_estimate'], 0.7727, places=4)
    self.assertAlmostEqual(results['50_irnt']['intercept_standard_error'], 0.2461, places=4)
    self.assertAlmostEqual(results['50_irnt']['snp_heritability_estimate'], 0.3845, places=4)
    self.assertAlmostEqual(results['50_irnt']['snp_heritability_standard_error'], 0.1067, places=4)

    self.assertAlmostEqual(results['20160']['mean_chi_sq'], 1.5209, places=4)
    self.assertAlmostEqual(results['20160']['intercept_estimate'], 1.2109, places=4)
    self.assertAlmostEqual(results['20160']['intercept_standard_error'], 0.2238, places=4)
    self.assertAlmostEqual(results['20160']['snp_heritability_estimate'], 0.0486, places=4)
    self.assertAlmostEqual(results['20160']['snp_heritability_standard_error'], 0.0416, places=4)

    ht = ht_50_irnt.annotate(
        chi_squared_50_irnt=ht_50_irnt['chi_squared'],
        n_50_irnt=ht_50_irnt['n'],
        chi_squared_20160=ht_20160[ht_50_irnt.key]['chi_squared'],
        n_20160=ht_20160[ht_50_irnt.key]['n'])

    ht_results = hl.experimental.ld_score_regression(
        weight_expr=ht['ld_score'],
        ld_score_expr=ht['ld_score'],
        chi_sq_exprs=[ht['chi_squared_50_irnt'], ht['chi_squared_20160']],
        n_samples_exprs=[ht['n_50_irnt'], ht['n_20160']],
        n_blocks=20,
        two_step_threshold=5,
        n_reference_panel_variants=1173569)

    results = {
        x['phenotype']: {
            'mean_chi_sq': x['mean_chi_sq'],
            'intercept_estimate': x['intercept']['estimate'],
            'intercept_standard_error': x['intercept']['standard_error'],
            'snp_heritability_estimate': x['snp_heritability']['estimate'],
            'snp_heritability_standard_error': x['snp_heritability']['standard_error']}
        for x in ht_results.collect()}

    self.assertAlmostEqual(results[0]['mean_chi_sq'], 3.4386, places=4)
    self.assertAlmostEqual(results[0]['intercept_estimate'], 0.7727, places=4)
    self.assertAlmostEqual(results[0]['intercept_standard_error'], 0.2461, places=4)
    self.assertAlmostEqual(results[0]['snp_heritability_estimate'], 0.3845, places=4)
    self.assertAlmostEqual(results[0]['snp_heritability_standard_error'], 0.1067, places=4)

    self.assertAlmostEqual(results[1]['mean_chi_sq'], 1.5209, places=4)
    self.assertAlmostEqual(results[1]['intercept_estimate'], 1.2109, places=4)
    self.assertAlmostEqual(results[1]['intercept_standard_error'], 0.2238, places=4)
    self.assertAlmostEqual(results[1]['snp_heritability_estimate'], 0.0486, places=4)
    self.assertAlmostEqual(results[1]['snp_heritability_standard_error'], 0.0416, places=4)
def manhattan(pvals, locus=None, title=None, size=4, hover_fields=None,
              collect_all=False, n_divisions=500, significance_line=5e-8):
    """Create a Manhattan plot. (https://en.wikipedia.org/wiki/Manhattan_plot)

    Parameters
    ----------
    pvals : :class:`.Float64Expression`
        P-values to be plotted.
    locus : :class:`.LocusExpression`
        Locus values to be plotted.
    title : str
        Title of the plot.
    size : int
        Size of markers in screen space units.
    hover_fields : Dict[str, :class:`.Expression`]
        Dictionary of field names and values to be shown in the HoverTool of the plot.
    collect_all : bool
        Whether to collect all values or downsample before plotting.
    n_divisions : int
        Factor by which to downsample (default value = 500).
        A lower input results in fewer output datapoints.
    significance_line : float, optional
        p-value at which to add a horizontal, dotted red line indicating
        genome-wide significance. If ``None``, no line is added.

    Returns
    -------
    :class:`bokeh.plotting.figure.Figure`
    """
    if locus is None:
        locus = pvals._indices.source.locus

    ref = locus.dtype.reference_genome

    if hover_fields is None:
        hover_fields = {}

    hover_fields['locus'] = hail.str(locus)

    pvals = -hail.log10(pvals)

    source_pd = _collect_scatter_plot_data(
        ('_global_locus', locus.global_position()),
        ('_pval', pvals),
        fields=hover_fields,
        n_divisions=None if collect_all else n_divisions
    )
    source_pd['p_value'] = [10 ** (-p) for p in source_pd['_pval']]
    source_pd['_contig'] = [locus.split(":")[0] for locus in source_pd['locus']]

    observed_contigs = set(source_pd['_contig'])
    observed_contigs = [contig for contig in ref.contigs.copy() if contig in observed_contigs]
    contig_ticks = hail.eval([hail.locus(contig, int(ref.lengths[contig] / 2)).global_position()
                              for contig in observed_contigs])
    color_mapper = CategoricalColorMapper(factors=ref.contigs,
                                          palette=palette[:2] * int((len(ref.contigs) + 1) / 2))

    p = figure(title=title, x_axis_label='Chromosome',
               y_axis_label='P-value (-log10 scale)', width=1000)
    p, _, legend, _, _, _ = _get_scatter_plot_elements(
        p, source_pd, x_col='_global_locus', y_col='_pval',
        label_cols=['_contig'], colors={'_contig': color_mapper},
        size=size
    )
    legend.visible = False
    p.xaxis.ticker = contig_ticks
    p.xaxis.major_label_overrides = dict(zip(contig_ticks, observed_contigs))
    p.select_one(HoverTool).tooltips = [t for t in p.select_one(HoverTool).tooltips
                                        if not t[0].startswith('_')]

    if significance_line is not None:
        p.renderers.append(Span(location=-log10(significance_line),
                                dimension='width', line_color='red',
                                line_dash='dashed', line_width=1.5))

    return p
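# Usage sketch for manhattan (`gwas` is a hypothetical locus-keyed table with
# a float64 `p_value` row field; rendering goes through bokeh's show):
from bokeh.io import show

p = manhattan(gwas.p_value, title='GWAS of a hypothetical trait', significance_line=5e-8)
show(p)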
def test_locus_windows(self):
    def assert_eq(a, b):
        self.assertTrue(np.array_equal(a, np.array(b)))

    centimorgans = hl.literal([0.1, 1.0, 1.0, 1.5, 1.9])

    mt = hl.balding_nichols_model(1, 5, 5).add_row_index()
    mt = mt.annotate_rows(cm=centimorgans[hl.int32(mt.row_idx)]).cache()

    starts, stops = hl.linalg.utils.locus_windows(mt.locus, 2)
    assert_eq(starts, [0, 0, 0, 1, 2])
    assert_eq(stops, [3, 4, 5, 5, 5])

    starts, stops = hl.linalg.utils.locus_windows(mt.locus, 0.5, coord_expr=mt.cm)
    assert_eq(starts, [0, 1, 1, 1, 3])
    assert_eq(stops, [1, 4, 4, 5, 5])

    starts, stops = hl.linalg.utils.locus_windows(
        mt.locus, 1.0, coord_expr=2 * centimorgans[hl.int32(mt.row_idx)])
    assert_eq(starts, [0, 1, 1, 1, 3])
    assert_eq(stops, [1, 4, 4, 5, 5])

    rows = [{'locus': hl.Locus('1', 1), 'cm': 1.0},
            {'locus': hl.Locus('1', 2), 'cm': 3.0},
            {'locus': hl.Locus('1', 4), 'cm': 4.0},
            {'locus': hl.Locus('2', 1), 'cm': 2.0},
            {'locus': hl.Locus('2', 1), 'cm': 2.0},
            {'locus': hl.Locus('3', 3), 'cm': 5.0}]

    ht = hl.Table.parallelize(rows,
                              hl.tstruct(locus=hl.tlocus('GRCh37'), cm=hl.tfloat64),
                              key=['locus'])

    starts, stops = hl.linalg.utils.locus_windows(ht.locus, 1)
    assert_eq(starts, [0, 0, 2, 3, 3, 5])
    assert_eq(stops, [2, 2, 3, 5, 5, 6])

    starts, stops = hl.linalg.utils.locus_windows(ht.locus, 1.0, coord_expr=ht.cm)
    assert_eq(starts, [0, 1, 1, 3, 3, 5])
    assert_eq(stops, [1, 3, 3, 5, 5, 6])

    with self.assertRaises(ValueError) as cm:
        hl.linalg.utils.locus_windows(ht.order_by(ht.cm).locus, 1.0)
    self.assertTrue('ascending order' in str(cm.exception))

    with self.assertRaises(ExpressionException) as cm:
        hl.linalg.utils.locus_windows(ht.locus, 1.0, coord_expr=hl.utils.range_table(1).idx)
    self.assertTrue('different source' in str(cm.exception))

    with self.assertRaises(ExpressionException) as cm:
        hl.linalg.utils.locus_windows(hl.locus('1', 1), 1.0)
    self.assertTrue("no source" in str(cm.exception))

    with self.assertRaises(ExpressionException) as cm:
        hl.linalg.utils.locus_windows(ht.locus, 1.0, coord_expr=0.0)
    self.assertTrue("no source" in str(cm.exception))

    ht = ht.annotate_globals(x=hl.locus('1', 1), y=1.0)

    with self.assertRaises(ExpressionException) as cm:
        hl.linalg.utils.locus_windows(ht.x, 1.0)
    self.assertTrue("row-indexed" in str(cm.exception))

    with self.assertRaises(ExpressionException) as cm:
        hl.linalg.utils.locus_windows(ht.locus, 1.0, ht.y)
    self.assertTrue("row-indexed" in str(cm.exception))

    ht = hl.Table.parallelize([{'locus': hl.null(hl.tlocus()), 'cm': 1.0}],
                              hl.tstruct(locus=hl.tlocus('GRCh37'), cm=hl.tfloat64),
                              key=['locus'])
    with self.assertRaises(ValueError) as cm:
        hl.linalg.utils.locus_windows(ht.locus, 1.0)
    self.assertTrue("missing value for 'locus_expr'" in str(cm.exception))

    with self.assertRaises(ValueError) as cm:
        hl.linalg.utils.locus_windows(ht.locus, 1.0, coord_expr=ht.cm)
    self.assertTrue("missing value for 'locus_expr'" in str(cm.exception))

    ht = hl.Table.parallelize([{'locus': hl.Locus('1', 1), 'cm': hl.null(hl.tfloat64)}],
                              hl.tstruct(locus=hl.tlocus('GRCh37'), cm=hl.tfloat64),
                              key=['locus'])
    with self.assertRaises(ValueError) as cm:
        hl.linalg.utils.locus_windows(ht.locus, 1.0, coord_expr=ht.cm)
    self.assertTrue("missing value for 'coord_expr'" in str(cm.exception))
def locus_windows(locus_expr, radius, coord_expr=None, _localize=True):
    """Returns start and stop indices for window around each locus.

    Examples
    --------

    Windows with 2bp radius for one contig with positions 1, 2, 3, 4, 5:

    >>> starts, stops = hl.linalg.utils.locus_windows(
    ...     hl.balding_nichols_model(1, 5, 5).locus,
    ...     radius=2)
    >>> starts, stops
    (array([0, 0, 0, 1, 2]), array([3, 4, 5, 5, 5]))

    The following examples involve three contigs.

    >>> loci = [{'locus': hl.Locus('1', 1), 'cm': 1.0},
    ...         {'locus': hl.Locus('1', 2), 'cm': 3.0},
    ...         {'locus': hl.Locus('1', 4), 'cm': 4.0},
    ...         {'locus': hl.Locus('2', 1), 'cm': 2.0},
    ...         {'locus': hl.Locus('2', 1), 'cm': 2.0},
    ...         {'locus': hl.Locus('3', 3), 'cm': 5.0}]
    >>> ht = hl.Table.parallelize(
    ...     loci,
    ...     hl.tstruct(locus=hl.tlocus('GRCh37'), cm=hl.tfloat64),
    ...     key=['locus'])

    Windows with 1bp radius:

    >>> hl.linalg.utils.locus_windows(ht.locus, 1)
    (array([0, 0, 2, 3, 3, 5]), array([2, 2, 3, 5, 5, 6]))

    Windows with 1cm radius:

    >>> hl.linalg.utils.locus_windows(ht.locus, 1.0, coord_expr=ht.cm)
    (array([0, 1, 1, 3, 3, 5]), array([1, 3, 3, 5, 5, 6]))

    Notes
    -----
    This function returns two 1-dimensional ndarrays of integers,
    ``starts`` and ``stops``, each of size equal to the number of rows.

    By default, for all indices ``i``, ``[starts[i], stops[i])`` is the maximal
    range of row indices ``j`` such that ``contig[i] == contig[j]`` and
    ``position[i] - radius <= position[j] <= position[i] + radius``.

    If the :meth:`.global_position` on `locus_expr` is not in ascending order,
    this method will fail. Ascending order should hold for a matrix table keyed
    by locus or variant (and the associated row table), or for a table that has
    been ordered by `locus_expr`.

    Set `coord_expr` to use a value other than position to define the windows.
    This row-indexed numeric expression must be non-missing, non-``nan``, on the
    same source as `locus_expr`, and ascending with respect to locus
    position for each contig; otherwise the function will fail.

    The last example above uses centimorgan coordinates, so
    ``[starts[i], stops[i])`` is the maximal range of row indices ``j`` such
    that ``contig[i] == contig[j]`` and
    ``cm[i] - radius <= cm[j] <= cm[i] + radius``.

    Index ranges are start-inclusive and stop-exclusive. This function is
    especially useful in conjunction with
    :meth:`.BlockMatrix.sparsify_row_intervals`.

    Parameters
    ----------
    locus_expr : :class:`.LocusExpression`
        Row-indexed locus expression on a table or matrix table.
    radius: :obj:`int`
        Radius of window for row values.
    coord_expr: :class:`.Float64Expression`, optional
        Row-indexed numeric expression for the row value.
        Must be on the same table or matrix table as `locus_expr`.
        By default, the row value is given by the locus position.

    Returns
    -------
    (:class:`ndarray` of :obj:`int64`, :class:`ndarray` of :obj:`int64`)
        Tuple of start indices array and stop indices array.
    """
    if radius < 0:
        raise ValueError(f"locus_windows: 'radius' must be non-negative, found {radius}")
    check_row_indexed('locus_windows', locus_expr)
    if coord_expr is not None:
        check_row_indexed('locus_windows', coord_expr)

    src = locus_expr._indices.source
    if locus_expr not in src._fields_inverse:
        locus = Env.get_uid()
        annotate_fields = {locus: locus_expr}

        if coord_expr is not None:
            if coord_expr not in src._fields_inverse:
                coords = Env.get_uid()
                annotate_fields[coords] = coord_expr
            else:
                coords = src._fields_inverse[coord_expr]

        if isinstance(src, hl.MatrixTable):
            new_src = src.annotate_rows(**annotate_fields)
        else:
            new_src = src.annotate(**annotate_fields)

        locus_expr = new_src[locus]
        if coord_expr is not None:
            coord_expr = new_src[coords]

    if coord_expr is None:
        coord_expr = locus_expr.position

    rg = locus_expr.dtype.reference_genome
    contig_group_expr = hl.agg.group_by(hl.locus(locus_expr.contig, 1, reference_genome=rg),
                                        hl.agg.collect(coord_expr))

    # check loci are in sorted order
    last_pos = hl.fold(lambda a, elt: (hl.case()
                                       .when(a <= elt, elt)
                                       .or_error("locus_windows: 'locus_expr' global position must be in ascending order.")),
                       -1,
                       hl.agg.collect(hl.case()
                                      .when(hl.is_defined(locus_expr), locus_expr.global_position())
                                      .or_error("locus_windows: missing value for 'locus_expr'.")))
    checked_contig_groups = (hl.case()
                             .when(last_pos >= 0, contig_group_expr)
                             .or_error("locus_windows: 'locus_expr' has length 0"))

    contig_groups = locus_expr._aggregation_method()(checked_contig_groups, _localize=False)

    coords = hl.sorted(hl.array(contig_groups)).map(lambda t: t[1])
    starts_and_stops = hl._locus_windows_per_contig(coords, radius)

    if not _localize:
        return starts_and_stops

    starts, stops = hl.eval(starts_and_stops)
    return np.array(starts), np.array(stops)
def test_expand_types(self):
    t1 = hl.utils.range_table(10)
    t1 = t1.key_by(x=hl.locus('1', t1.idx + 1)).expand_types()

    t2 = hl.utils.range_table(10).key_by()
    t2 = t2.annotate(x=hl.struct(contig='1', position=t2.idx + 1))

    self.assertTrue(t1._same(t2))