def test_contig_recoding():
    path1 = os.path.join(resource('gvcfs'), 'recoding', 'HG00187.hg38.g.vcf.gz')
    path2 = os.path.join(resource('gvcfs'), 'recoding', 'HG00187.hg38.recoded.g.vcf.gz')

    out_file_1 = new_temp_file(extension='mt')
    out_file_2 = new_temp_file(extension='mt')

    vc.run_combiner([path1, path1], out_file_1, Env.hc()._tmpdir,
                    reference_genome='GRCh38', use_exome_default_intervals=True)
    vc.run_combiner([path2, path2], out_file_2, Env.hc()._tmpdir,
                    reference_genome='GRCh38', contig_recoding={'22': 'chr22'},
                    use_exome_default_intervals=True)

    mt1 = hl.read_matrix_table(out_file_1)
    mt2 = hl.read_matrix_table(out_file_2)

    assert mt1.count() == mt2.count()
    assert mt1._same(mt2)
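# NOTE: the tests in this file use module-level helpers ('resource',
# 'new_temp_file', 'all_samples', 'vc', 'Env', 'new_combiner', 'load_combiner')
# without local imports. A plausible preamble is sketched below; the exact
# import paths are assumptions, not taken from this file:
#
#     import os
#     import hail as hl
#     from hail.utils.java import Env
#     from hail.utils import new_temp_file
#     from hail.vds.combiner import new_combiner, load_combiner
#     from hail.experimental.vcf_combiner import vcf_combiner as vc  # assumed path
#     from .helpers import resource  # test-suite helper, assumed
#
# 'all_samples' is a list of sample IDs defined elsewhere in the test module.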
def test_write_stage_locally(self):
    mt = self.get_vds()
    f = new_temp_file(suffix='mt')
    mt.write(f, stage_locally=True)

    mt2 = hl.read_matrix_table(f)
    self.assertTrue(mt._same(mt2))
def test_read_stored_globals(self):
    ds = self.get_vds()
    ds = ds.annotate_globals(x=5, baz='foo')
    f = new_temp_file(suffix='vds')
    ds.write(f)
    t = hl.read_table(f + '/globals')
    self.assertTrue(ds.globals_table()._same(t))
def test_1kg_chr22():
    out_file = new_temp_file(suffix='mt')
    sample_names = all_samples[:5]
    paths = [os.path.join(resource('gvcfs'), '1kg_chr22', f'{s}.hg38.g.vcf.gz')
             for s in sample_names]
    vc.run_combiner(paths, out_file=out_file, tmp_path=Env.hc()._tmpdir,
                    branch_factor=2, batch_size=2, reference_genome='GRCh38')

    # compute per-sample record and non-ref call counts directly from each gvcf
    sample_data = dict()
    for sample, path in zip(sample_names, paths):
        ht = hl.import_vcf(path, force_bgz=True, reference_genome='GRCh38').localize_entries('entries')
        n, n_variant = ht.aggregate((hl.agg.count(),
                                     hl.agg.count_where(ht.entries[0].GT.is_non_ref())))
        sample_data[sample] = (n, n_variant)

    mt = hl.read_matrix_table(out_file)
    # annotate the number of non-missing records and non-ref calls per sample
    mt = mt.annotate_cols(n=hl.agg.count(),
                          n_variant=hl.agg.count_where(mt.LGT.is_non_ref()))
    combined_results = hl.tuple([mt.s, mt.n, mt.n_variant]).collect()
    assert len(combined_results) == len(sample_names)

    for sample, n, n_variant in combined_results:
        true_n, true_n_variant = sample_data[sample]
        assert n == true_n, sample
        assert n_variant == true_n_variant, sample
def test_combiner_plan_round_trip_serialization():
    sample_names = all_samples[:5]
    paths = [os.path.join(resource('gvcfs'), '1kg_chr22', f'{s}.hg38.g.vcf.gz')
             for s in sample_names]
    plan_path = new_temp_file(extension='json')
    out_file = new_temp_file(extension='vds')
    plan = new_combiner(gvcf_paths=paths,
                        output_path=out_file,
                        temp_path=Env.hc()._tmpdir,
                        save_path=plan_path,
                        reference_genome='GRCh38',
                        use_exome_default_intervals=True,
                        branch_factor=2,
                        batch_size=2)
    plan.save()
    plan_loaded = load_combiner(plan_path)
    assert plan == plan_loaded
def test_combiner_run():
    tmpdir = new_temp_file()
    samples = all_samples[:5]

    input_paths = [resource(os.path.join('gvcfs', '1kg_chr22', f'{s}.hg38.g.vcf.gz'))
                   for s in samples]
    final_paths_individual = [os.path.join(tmpdir, f'sample_{s}') for s in samples]
    final_path_1 = os.path.join(tmpdir, 'final1.vds')
    final_path_2 = os.path.join(tmpdir, 'final2.vds')

    parts = hl.eval([hl.parse_locus_interval('chr22:start-end', reference_genome='GRCh38')])

    # combine the first two samples into single-sample VDSes
    for input_gvcf, path in zip(input_paths[:2], final_paths_individual[:2]):
        combiner = hl.vds.new_combiner(output_path=path, intervals=parts, temp_path=tmpdir,
                                       gvcf_paths=[input_gvcf], reference_genome='GRCh38')
        combiner.run()

    # combine the remaining gvcfs together with the two single-sample VDSes
    combiner = hl.vds.new_combiner(output_path=final_path_1, intervals=parts, temp_path=tmpdir,
                                   gvcf_paths=input_paths[2:],
                                   vds_paths=final_paths_individual[:2],
                                   reference_genome='GRCh38', branch_factor=2, batch_size=2)
    combiner.run()

    # combine all gvcfs in a single pass; the two results should be identical
    combiner2 = hl.vds.new_combiner(output_path=final_path_2, intervals=parts, temp_path=tmpdir,
                                    gvcf_paths=input_paths,
                                    reference_genome='GRCh38', branch_factor=2, batch_size=2)
    combiner2.run()

    assert hl.vds.read_vds(final_path_1)._same(hl.vds.read_vds(final_path_2))
def test_codecs_table(self):
    from hail.utils.java import Env, scala_object
    codecs = scala_object(Env.hail().io, 'CodecSpec').codecSpecs()
    rt = self.get_vds().rows()
    temp = new_temp_file(suffix='ht')
    for codec in codecs:
        rt.write(temp, overwrite=True, _codec_spec=codec.toString())
        rt2 = hl.read_table(temp)
        self.assertTrue(rt._same(rt2))
def test_codecs_matrix(self):
    from hail.utils.java import Env, scala_object
    codecs = scala_object(Env.hail().io, 'CodecSpec').codecSpecs()
    ds = self.get_vds()
    temp = new_temp_file(suffix='hmt')
    for codec in codecs:
        ds.write(temp, overwrite=True, _codec_spec=codec.toString())
        ds2 = hl.read_matrix_table(temp)
        self.assertTrue(ds._same(ds2))
def test_multi_write(self):
    mt = self.get_vds()
    f = new_temp_file()
    hl.experimental.write_matrix_tables([mt, mt], f)
    path1 = f + '0.mt'
    path2 = f + '1.mt'
    mt1 = hl.read_matrix_table(path1)
    mt2 = hl.read_matrix_table(path2)
    self.assertTrue(mt._same(mt1))
    self.assertTrue(mt._same(mt2))
    self.assertTrue(mt1._same(mt2))
def test_key_by_locus_alleles():
    out_file = new_temp_file(extension='mt')
    sample_names = all_samples[:5]
    paths = [os.path.join(resource('gvcfs'), '1kg_chr22', f'{s}.hg38.g.vcf.gz')
             for s in sample_names]
    vc.run_combiner(paths, out_file=out_file, tmp_path=Env.hc()._tmpdir,
                    reference_genome='GRCh38', key_by_locus_and_alleles=True)

    mt = hl.read_matrix_table(out_file)
    assert list(mt.row_key) == ['locus', 'alleles']
    mt._force_count_rows()
def test_non_ref_alleles_set_to_missing():
    path = os.path.join(resource('gvcfs'), 'non_ref_call.g.vcf.gz')
    out_file = new_temp_file(extension='mt')
    vc.run_combiner([path, path], out_file=out_file, tmp_path=Env.hc()._tmpdir,
                    branch_factor=2, batch_size=2, reference_genome='GRCh38')

    mt = hl.read_matrix_table(out_file)
    n_alleles = hl.len(mt.alleles)
    gt_idx = hl.experimental.lgt_to_gt(mt.LGT, mt.LA).unphased_diploid_gt_index()
    # every genotype index must be a valid unphased diploid index for n_alleles alleles
    assert mt.aggregate_entries(
        hl.agg.all(gt_idx < (n_alleles * (n_alleles + 1)) / 2))
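# Sanity check for the bound asserted above: n alleles admit n * (n + 1) / 2
# distinct unphased diploid genotypes (unordered allele pairs), so every valid
# genotype index is strictly less than that count. A quick Hail-free
# verification of the identity:

from itertools import combinations_with_replacement

def n_unphased_diploid_genotypes(n_alleles):
    # unordered pairs (i, j) with i <= j drawn from n_alleles alleles
    return len(list(combinations_with_replacement(range(n_alleles), 2)))

for n in range(1, 6):
    assert n_unphased_diploid_genotypes(n) == n * (n + 1) // 2  # e.g. 3 alleles -> 6 genotypes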
def test_head(self):
    # no empty partitions
    mt1 = hl.utils.range_matrix_table(10, 10)
    # empty partitions at front
    mt2 = hl.utils.range_matrix_table(20, 10, 20)
    mt2 = mt2.filter_rows(mt2.row_idx > 9)

    mts = [mt1, mt2]
    for mt in mts:
        tmp_file = new_temp_file(suffix='mt')
        mt.write(tmp_file)
        mt_readback = hl.read_matrix_table(tmp_file)
        for mt_ in [mt, mt_readback]:
            assert mt_.head(1).count_rows() == 1
            assert mt_.head(1)._force_count_rows() == 1
            assert mt_.head(100).count_rows() == 10
            assert mt_.head(100)._force_count_rows() == 10
def test_combiner_manual_filtration():
    sample_names = all_samples[:2]
    paths = [os.path.join(resource('gvcfs'), '1kg_chr22', f'{s}.hg38.g.vcf.gz')
             for s in sample_names]
    out_file = new_temp_file(extension='vds')
    plan = new_combiner(gvcf_paths=paths,
                        output_path=out_file,
                        temp_path=Env.hc()._tmpdir,
                        reference_genome='GRCh38',
                        use_exome_default_intervals=True,
                        gvcf_reference_entry_fields_to_keep=['GQ'],
                        gvcf_info_to_keep=['ExcessHet'],
                        force=True)

    assert plan.gvcf_info_to_keep == {'ExcessHet'}

    plan.run()
    vds = hl.vds.read_vds(out_file)
    assert list(vds.variant_data.gvcf_info) == ['ExcessHet']
    assert list(vds.reference_data.entry) == ['END', 'GQ']
def test_sample_override():
    out_file = new_temp_file(extension='mt')
    sample_names = all_samples[:5]
    new_names = [f'S{i}' for i, _ in enumerate(sample_names)]
    paths = [os.path.join(resource('gvcfs'), '1kg_chr22', f'{s}.hg38.g.vcf.gz')
             for s in sample_names]
    header_path = paths[0]
    vc.run_combiner(paths,
                    out_file=out_file,
                    tmp_path=Env.hc()._tmpdir,
                    reference_genome='GRCh38',
                    header=header_path,
                    sample_names=new_names,
                    key_by_locus_and_alleles=True,
                    use_exome_default_intervals=True)

    mt_cols = hl.read_matrix_table(out_file).key_cols_by().cols()
    mt_names = mt_cols.aggregate(hl.agg.collect(mt_cols.s))
    assert new_names == mt_names
def impute_sex_chromosome_ploidy(vds: VariantDataset, calling_intervals, normalization_contig: str) -> hl.Table:
    """Impute sex chromosome ploidy from depth of reference data within calling intervals.

    Returns a :class:`.Table` keyed by sample ID, with the following fields:

    -  ``autosomal_mean_dp`` (*float64*): Mean depth on calling intervals on the normalization contig.
    -  ``x_mean_dp`` (*float64*): Mean depth on calling intervals on the X chromosome.
    -  ``x_ploidy`` (*float64*): Estimated ploidy on the X chromosome. Equal to
       ``2 * x_mean_dp / autosomal_mean_dp``.
    -  ``y_mean_dp`` (*float64*): Mean depth on calling intervals on the Y chromosome.
    -  ``y_ploidy`` (*float64*): Estimated ploidy on the Y chromosome. Equal to
       ``2 * y_mean_dp / autosomal_mean_dp``.

    Parameters
    ----------
    vds : :class:`.VariantDataset`
        Dataset.
    calling_intervals : :class:`.Table` or :class:`.ArrayExpression`
        Calling intervals with consistent read coverage (for exomes, trim the capture intervals).
    normalization_contig : str
        Autosomal contig for depth comparison.

    Returns
    -------
    :class:`.Table`
    """
    if not isinstance(calling_intervals, Table):
        calling_intervals = hl.Table.parallelize(
            hl.map(lambda i: hl.struct(interval=i), calling_intervals),
            schema=hl.tstruct(interval=calling_intervals.dtype.element_type),
            key='interval')
    else:
        key_dtype = calling_intervals.key.dtype
        if len(key_dtype) != 1 \
                or not isinstance(calling_intervals.key[0].dtype, hl.tinterval) \
                or calling_intervals.key[0].dtype.point_type != vds.reference_data.locus.dtype:
            raise ValueError(
                f"'impute_sex_chromosome_ploidy': expect calling_intervals to be a list of intervals or"
                f" a table with a single key of type interval<locus>, found table with key: {key_dtype}")

    rg = vds.reference_data.locus.dtype.reference_genome

    par_boundaries = []
    for par_interval in rg.par:
        par_boundaries.append(par_interval.start)
        par_boundaries.append(par_interval.end)

    # segment on PAR interval boundaries
    calling_intervals = hl.segment_intervals(calling_intervals, par_boundaries)

    # remove intervals overlapping PAR
    calling_intervals = calling_intervals.filter(
        hl.all(lambda x: ~x.overlaps(calling_intervals.interval), hl.literal(rg.par)))

    # checkpoint for efficient multiple downstream usages
    info("'impute_sex_chromosome_ploidy': checkpointing calling intervals")
    calling_intervals = calling_intervals.checkpoint(new_temp_file(extension='ht'))

    interval = calling_intervals.key[0]
    (any_bad_intervals, chrs_represented) = calling_intervals.aggregate(
        (hl.agg.any(interval.start.contig != interval.end.contig),
         hl.agg.collect_as_set(interval.start.contig)))
    if any_bad_intervals:
        raise ValueError(
            "'impute_sex_chromosome_ploidy' does not support calling intervals that span chromosome boundaries")

    if len(rg.x_contigs) != 1:
        raise NotImplementedError(
            f"reference genome {rg.name!r} has multiple X contigs,"
            f" this is not supported in 'impute_sex_chromosome_ploidy'")
    chr_x = rg.x_contigs[0]
    if len(rg.y_contigs) != 1:
        raise NotImplementedError(
            f"reference genome {rg.name!r} has multiple Y contigs,"
            f" this is not supported in 'impute_sex_chromosome_ploidy'")
    chr_y = rg.y_contigs[0]

    # restrict the VDS to the contigs represented in the calling intervals
    kept_contig_filter = hl.array(chrs_represented).map(
        lambda x: hl.parse_locus_interval(x, reference_genome=rg))
    vds = VariantDataset(
        hl.filter_intervals(vds.reference_data, kept_contig_filter),
        hl.filter_intervals(vds.variant_data, kept_contig_filter))

    coverage = interval_coverage(vds, calling_intervals, gq_thresholds=()).drop('gq_thresholds')

    # per-sample, per-contig mean depth: total depth over total interval size
    coverage = coverage.annotate_rows(contig=coverage.interval.start.contig)
    coverage = coverage.annotate_cols(__mean_dp=hl.agg.group_by(
        coverage.contig, hl.agg.sum(coverage.sum_dp) / hl.agg.sum(coverage.interval_size)))

    mean_dp_dict = coverage.__mean_dp
    auto_dp = mean_dp_dict.get(normalization_contig)
    x_dp = mean_dp_dict.get(chr_x)
    y_dp = mean_dp_dict.get(chr_y)
    per_sample = coverage.transmute_cols(autosomal_mean_dp=auto_dp,
                                         x_mean_dp=x_dp,
                                         x_ploidy=2 * x_dp / auto_dp,
                                         y_mean_dp=y_dp,
                                         y_ploidy=2 * y_dp / auto_dp)
    info("'impute_sex_chromosome_ploidy': computing and checkpointing coverage and karyotype metrics")
    return per_sample.cols().checkpoint(new_temp_file('impute_sex_karyotype', extension='ht'))
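# A minimal usage sketch for the function above. The VDS path is hypothetical,
# and whole-contig intervals are used only for illustration; real exome or
# genome calling intervals should be supplied in practice.
#
#     vds = hl.vds.read_vds('gs://my-bucket/dataset.vds')  # hypothetical path
#     calling_intervals = hl.array([hl.parse_locus_interval(c, reference_genome='GRCh38')
#                                   for c in ['chr20', 'chrX', 'chrY']])
#     ploidy_ht = impute_sex_chromosome_ploidy(vds, calling_intervals,
#                                              normalization_contig='chr20')
#     ploidy_ht.show()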