def test_combiner_works():
    _paths = ['gvcfs/HG00096.g.vcf.gz', 'gvcfs/HG00268.g.vcf.gz']
    paths = [resource(p) for p in _paths]
    parts = [
        hl.Interval(start=hl.Struct(locus=hl.Locus('chr20', 17821257, reference_genome='GRCh38')),
                    end=hl.Struct(locus=hl.Locus('chr20', 18708366, reference_genome='GRCh38')),
                    includes_end=True),
        hl.Interval(start=hl.Struct(locus=hl.Locus('chr20', 18708367, reference_genome='GRCh38')),
                    end=hl.Struct(locus=hl.Locus('chr20', 19776611, reference_genome='GRCh38')),
                    includes_end=True),
        hl.Interval(start=hl.Struct(locus=hl.Locus('chr20', 19776612, reference_genome='GRCh38')),
                    end=hl.Struct(locus=hl.Locus('chr20', 21144633, reference_genome='GRCh38')),
                    includes_end=True)
    ]
    vcfs = hl.import_gvcfs(paths, parts, reference_genome='GRCh38', array_elements_required=False)
    entry_to_keep = defined_entry_fields(
        vcfs[0].filter_rows(hl.is_defined(vcfs[0].info.END)), 100_000) - {'GT', 'PGT', 'PL'}
    vcfs = [transform_gvcf(mt.annotate_rows(info=mt.info.annotate(MQ_DP=hl.missing(hl.tint32),
                                                                  VarDP=hl.missing(hl.tint32),
                                                                  QUALapprox=hl.missing(hl.tint32))),
                           reference_entry_fields_to_keep=entry_to_keep)
            for mt in vcfs]
    comb = combine_variant_datasets(vcfs)
    assert len(parts) == comb.variant_data.n_partitions()
    comb.variant_data._force_count_rows()
    comb.reference_data._force_count_rows()
def test_vcf_vds_combiner_equivalence():
    import hail.experimental.vcf_combiner.vcf_combiner as vcf
    import hail.vds.combiner as vds
    _paths = ['gvcfs/HG00096.g.vcf.gz', 'gvcfs/HG00268.g.vcf.gz']
    paths = [resource(p) for p in _paths]
    parts = [
        hl.Interval(start=hl.Struct(locus=hl.Locus('chr20', 17821257, reference_genome='GRCh38')),
                    end=hl.Struct(locus=hl.Locus('chr20', 18708366, reference_genome='GRCh38')),
                    includes_end=True),
        hl.Interval(start=hl.Struct(locus=hl.Locus('chr20', 18708367, reference_genome='GRCh38')),
                    end=hl.Struct(locus=hl.Locus('chr20', 19776611, reference_genome='GRCh38')),
                    includes_end=True),
        hl.Interval(start=hl.Struct(locus=hl.Locus('chr20', 19776612, reference_genome='GRCh38')),
                    end=hl.Struct(locus=hl.Locus('chr20', 21144633, reference_genome='GRCh38')),
                    includes_end=True)
    ]
    vcfs = [mt.annotate_rows(info=mt.info.annotate(MQ_DP=hl.missing(hl.tint32),
                                                   VarDP=hl.missing(hl.tint32),
                                                   QUALapprox=hl.missing(hl.tint32)))
            for mt in hl.import_gvcfs(paths, parts, reference_genome='GRCh38',
                                      array_elements_required=False)]
    entry_to_keep = defined_entry_fields(
        vcfs[0].filter_rows(hl.is_defined(vcfs[0].info.END)), 100_000) - {'GT', 'PGT', 'PL'}
    vds = vds.combine_variant_datasets(
        [vds.transform_gvcf(mt, reference_entry_fields_to_keep=entry_to_keep) for mt in vcfs])
    smt = vcf.combine_gvcfs([vcf.transform_gvcf(mt) for mt in vcfs])
    smt_from_vds = hl.vds.to_merged_sparse_mt(vds).drop('RGQ')
    smt = smt.select_entries(*smt_from_vds.entry)  # harmonize fields and order
    smt = smt.key_rows_by('locus', 'alleles')
    assert smt._same(smt_from_vds)
def __init__(self, schema, paths, key, intervals):
    assert (key is None) == (intervals is None)
    self.schema = schema
    self.paths = paths
    self.key = key
    if intervals is not None:
        t = hl.expr.impute_type(intervals)
        # 'or', not 'and': reject anything that is not an array of intervals
        if not isinstance(t, hl.tarray) or not isinstance(t.element_type, hl.tinterval):
            raise TypeError("'intervals' must be an array of tintervals")
        pt = t.element_type.point_type
        if isinstance(pt, hl.tstruct):
            self._interval_type = t
        else:
            self._interval_type = hl.tarray(hl.tinterval(hl.tstruct(__point=pt)))
    if intervals is not None and t != self._interval_type:
        self.intervals = [hl.Interval(hl.Struct(__point=i.start),
                                      hl.Struct(__point=i.end),
                                      i.includes_start,
                                      i.includes_end)
                          for i in intervals]
    else:
        self.intervals = intervals
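# Illustrative sketch (not part of the original source): how the coercion above
# rewraps plain-point intervals into single-field structs so they match the
# imputed self._interval_type. Names below are assumptions for demonstration.
def _demo_interval_wrapping():
    raw = [hl.Interval(0, 10), hl.Interval(10, 20)]
    wrapped = [hl.Interval(hl.Struct(__point=i.start),
                           hl.Struct(__point=i.end),
                           i.includes_start,
                           i.includes_end)
               for i in raw]
    # Each endpoint is now Struct(__point=...); the element type becomes
    # interval<struct{__point: int32}> instead of interval<int32>.
    return wrapped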
def generate_5_sample_vds():
    paths = [
        os.path.join(resource('gvcfs'), '1kg_chr22', path)
        for path in ['HG00187.hg38.g.vcf.gz',
                     'HG00190.hg38.g.vcf.gz',
                     'HG00308.hg38.g.vcf.gz',
                     'HG00313.hg38.g.vcf.gz',
                     'HG00320.hg38.g.vcf.gz']
    ]
    parts = [
        hl.Interval(start=hl.Struct(locus=hl.Locus('chr22', 1, reference_genome='GRCh38')),
                    end=hl.Struct(locus=hl.Locus(
                        'chr22',
                        hl.get_reference('GRCh38').contig_length('chr22') - 1,
                        reference_genome='GRCh38')),
                    includes_end=True)
    ]
    vcfs = hl.import_gvcfs(paths, parts, reference_genome='GRCh38', array_elements_required=False)
    to_keep = defined_entry_fields(vcfs[0].filter_rows(hl.is_defined(vcfs[0].info.END)), 100_000)
    vds = hl.vds.combiner.combine_variant_datasets(
        [hl.vds.combiner.transform_gvcf(mt, to_keep) for mt in vcfs])
    vds.variant_data = vds.variant_data._key_rows_by_assert_sorted('locus', 'alleles')
    vds.write(os.path.join(resource('vds'), '1kg_chr22_5_samples.vds'), overwrite=True)
def read_with_index_p1000():
    rows = 10_000_000
    bins = 1_000
    width = rows // bins
    intervals = [hl.Interval(start=i, end=i + width) for i in range(0, rows, width)]
    ht = hl.read_table(resource('table_10M_par_10.ht'), _intervals=intervals)
    ht._force_count()
def read_with_index_p1000(path):
    rows = 10_000_000
    bins = 1_000
    width = rows // bins
    intervals = [hl.Interval(start=i, end=i + width) for i in range(0, rows, width)]
    ht = hl.read_table(path, _intervals=intervals)
    ht._force_count()
def default_exome_intervals(reference_genome) -> List[hl.utils.Interval]:
    """Create a list of locus intervals suitable for importing and merging
    exome gvcfs. Because exomes are small, one partition per chromosome
    works well here.

    Parameters
    ----------
    reference_genome : :obj:`str` or :class:`.ReferenceGenome`, optional
        Reference genome to use. NOTE: only GRCh37 and GRCh38 references
        are supported.

    Returns
    -------
    :obj:`List[Interval]`
    """
    if reference_genome.name == 'GRCh37':
        contigs = [f'{i}' for i in range(1, 23)] + ['X', 'Y', 'MT']
    elif reference_genome.name == 'GRCh38':
        contigs = [f'chr{i}' for i in range(1, 23)] + ['chrX', 'chrY', 'chrM']
    else:
        raise ValueError(
            f"Invalid reference genome '{reference_genome.name}', "
            "only 'GRCh37' and 'GRCh38' are supported")
    return [hl.Interval(start=hl.Locus(contig=contig, position=1,
                                       reference_genome=reference_genome),
                        end=hl.Locus.parse(f'{contig}:END',
                                           reference_genome=reference_genome),
                        includes_end=True)
            for contig in contigs]
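# Usage sketch (an assumption, not from the original source): feed the
# per-contig intervals to hl.import_gvcfs. `gvcf_paths` is a hypothetical
# list of exome gvcf paths.
def _demo_default_exome_intervals(gvcf_paths):
    rg = hl.get_reference('GRCh38')
    intervals = default_exome_intervals(rg)  # 25 intervals: chr1-22, X, Y, M
    # one resulting MatrixTable per input gvcf, each with 25 partitions
    return hl.import_gvcfs(gvcf_paths, intervals, reference_genome='GRCh38')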
def test_order_by_intervals(self):
    intervals = {0: hl.Interval(0, 3, includes_start=True, includes_end=False),
                 1: hl.Interval(0, 4, includes_start=True, includes_end=True),
                 2: hl.Interval(1, 4, includes_start=True, includes_end=False),
                 3: hl.Interval(0, 4, includes_start=False, includes_end=False),
                 4: hl.Interval(0, 4, includes_start=True, includes_end=False)}
    ht = hl.utils.range_table(5)
    ht = ht.annotate_globals(ilist=intervals)
    ht = ht.annotate(interval=ht['ilist'][ht['idx']])
    ht = ht.order_by(ht['interval'])
    ordered = ht['interval'].collect()
    expected = [intervals[i] for i in [0, 4, 1, 3, 2]]
    self.assertEqual(ordered, expected)
def __init__(self, name, contigs, lengths, x_contigs=[], y_contigs=[],
             mt_contigs=[], par=[], _builtin=False):
    super(ReferenceGenome, self).__init__()
    contigs = wrap_to_list(contigs)
    x_contigs = wrap_to_list(x_contigs)
    y_contigs = wrap_to_list(y_contigs)
    mt_contigs = wrap_to_list(mt_contigs)

    self._config = {
        'name': name,
        'contigs': [{'name': c, 'length': l} for c, l in lengths.items()],
        'xContigs': x_contigs,
        'yContigs': y_contigs,
        'mtContigs': mt_contigs,
        'par': [{'start': {'contig': c, 'position': s},
                 'end': {'contig': c, 'position': e}}
                for (c, s, e) in par]
    }

    self._contigs = contigs
    self._lengths = lengths
    self._par_tuple = par
    self._par = [hl.Interval(hl.Locus(c, s, self), hl.Locus(c, e, self))
                 for (c, s, e) in par]
    self._global_positions = None

    ReferenceGenome._references[name] = self

    if not _builtin:
        Env.backend().add_reference(self._config)
        hl.ir.register_reference_genome_functions(name)

    self._has_sequence = False
    self._liftovers = set()
def test_filter_intervals_compound_partition_key(self):
    ds = hl.import_vcf(resource('sample.vcf'), min_partitions=20)
    ds = (ds.annotate_rows(variant=hl.struct(locus=ds.locus, alleles=ds.alleles))
          .key_rows_by('locus', 'alleles'))

    intervals = [hl.Interval(hl.Struct(locus=hl.Locus('20', 10639222), alleles=['A', 'T']),
                             hl.Struct(locus=hl.Locus('20', 10644700), alleles=['A', 'T']))]
    self.assertEqual(hl.filter_intervals(ds, intervals).count_rows(), 3)
def locus_interval(start, end):
    return hl.Interval(start=hl.Locus(contig=contig, position=start,
                                      reference_genome=reference_genome),
                       end=hl.Locus(contig=contig, position=end,
                                    reference_genome=reference_genome),
                       includes_end=True)
def read(fname: str) -> 'DNDArray':
    # read without good partitioning, just to get the globals
    a = DNDArray(hl.read_table(fname))
    t = hl.read_table(
        fname,
        _intervals=[hl.Interval(hl.Struct(r=i, c=j), hl.Struct(r=i, c=j + 1))
                    for i in range(a.n_block_rows)
                    for j in range(a.n_block_cols)])
    return DNDArray(t)
def assert_rg_loaded_correctly(name):
    rg = hl.get_reference(name)
    self.assertEqual(rg.contigs, ["1", "X", "Y", "MT"])
    self.assertEqual(rg.lengths, {"1": 5, "X": 4, "Y": 3, "MT": 2})
    self.assertEqual(rg.x_contigs, ["X"])
    self.assertEqual(rg.y_contigs, ["Y"])
    self.assertEqual(rg.mt_contigs, ["MT"])
    self.assertEqual(rg.par, [hl.Interval(start=hl.Locus("X", 2, name),
                                          end=hl.Locus("X", 4, name))])
def get_n_even_intervals(n):
    ref = hl.default_reference()
    genome_size = sum(ref.lengths.values())
    partition_size = int(genome_size / n) + 1
    return [
        hl.Interval(
            hl.eval(hl.locus_from_global_position(i * partition_size)),
            hl.eval(hl.locus_from_global_position(
                min(i * partition_size + partition_size, genome_size - 1))))
        for i in range(n)
    ]
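# Usage sketch (an assumption, not from the original source): evenly sized
# genome-wide intervals can seed a partitioned read. `path` is hypothetical.
def _demo_even_intervals(path):
    intervals = get_n_even_intervals(200)
    return hl.read_matrix_table(path, _intervals=intervals)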
def calculate_new_intervals(ht, n, reference_genome):
    """Takes a table keyed by ['locus'] and produces a list of intervals
    suitable for repartitioning a combiner matrix table.

    Parameters
    ----------
    ht : :class:`.Table`
        Table / rows table to compute new intervals for.
    n : :obj:`int`
        Number of rows each partition should have (the last partition may
        be smaller).
    reference_genome : :obj:`str` or :class:`.ReferenceGenome`, optional
        Reference genome to use.

    Returns
    -------
    :obj:`List[Interval]`
    """
    assert list(ht.key) == ['locus']
    assert ht.locus.dtype == hl.tlocus(reference_genome=reference_genome)
    end = hl.Locus(reference_genome.contigs[-1],
                   reference_genome.lengths[reference_genome.contigs[-1]],
                   reference_genome=reference_genome)

    n_rows = ht.count()
    if n_rows == 0:
        raise ValueError('empty table!')

    ht = ht.select()
    ht = ht.annotate(x=hl.scan.count())
    ht = ht.annotate(y=ht.x + 1)
    ht = ht.filter((ht.x // n != ht.y // n) | (ht.x == (n_rows - 1)))
    ht = ht.select()
    ht = ht.annotate(start=hl.or_else(
        hl.scan._prev_nonnull(
            hl.locus_from_global_position(ht.locus.global_position() + 1,
                                          reference_genome=reference_genome)),
        hl.locus_from_global_position(0, reference_genome=reference_genome)))
    ht = ht.key_by()
    ht = ht.select(interval=hl.interval(start=ht.start, end=ht.locus, includes_end=True))

    intervals = ht.aggregate(hl.agg.collect(ht.interval))

    last_st = hl.eval(
        hl.locus_from_global_position(
            hl.literal(intervals[-1].end).global_position() + 1,
            reference_genome=reference_genome))
    interval = hl.Interval(start=last_st, end=end, includes_end=True)
    intervals.append(interval)
    return intervals
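# Usage sketch (an assumption, not from the original source): recompute
# intervals so each partition holds roughly 10k rows, then reread with that
# partitioning. `path` is a hypothetical matrix table path.
def _demo_repartition(path):
    mt = hl.read_matrix_table(path)
    rows = mt.rows().key_by('locus').select()  # satisfy the ['locus'] key contract
    intervals = calculate_new_intervals(rows, 10_000, hl.get_reference('GRCh38'))
    return hl.read_matrix_table(path, _intervals=intervals)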
def test_gvcfs(spark, tmp_path):
    # GVCF MatrixTables are not keyed by locus and alleles, just by locus
    input_vcf = 'test-data/tabix-test-vcf/combined.chr20_18210071_18210093.g.vcf.gz'
    partitions = [
        hl.Interval(hl.Locus("chr20", 1, reference_genome='GRCh38'),
                    hl.Locus("chr20", 20000000, reference_genome='GRCh38'),
                    includes_end=True)
    ]
    hail_df = functions.from_matrix_table(
        hl.import_gvcfs([input_vcf], partitions, force_bgz=True,
                        reference_genome='GRCh38')[0])
    _assert_lossless_adapter(spark, tmp_path, hail_df, input_vcf, 'vcf', 'bigvcf')
def values(self):
    values = [(hl.tbool, True),
              (hl.tint32, 0),
              (hl.tint64, 0),
              (hl.tfloat32, 0.5),
              (hl.tfloat64, 0.5),
              (hl.tstr, "foo"),
              (hl.tstruct(x=hl.tint32), hl.Struct(x=0)),
              (hl.tarray(hl.tint32), [0, 1, 4]),
              (hl.tset(hl.tint32), {0, 1, 4}),
              (hl.tdict(hl.tstr, hl.tint32), {"a": 0, "b": 1, "c": 4}),
              (hl.tinterval(hl.tint32), hl.Interval(0, 1, True, False)),
              (hl.tlocus(hl.default_reference()), hl.Locus("1", 1)),
              (hl.tcall, hl.Call([0, 1]))]
    return values
def __init__(self, path, intervals, filter_intervals):
    if intervals is not None:
        t = hl.expr.impute_type(intervals)
        # 'or', not 'and': reject anything that is not an array of intervals
        if not isinstance(t, hl.tarray) or not isinstance(t.element_type, hl.tinterval):
            raise TypeError("'intervals' must be an array of tintervals")
        pt = t.element_type.point_type
        if isinstance(pt, hl.tstruct):
            self._interval_type = t
        else:
            self._interval_type = hl.tarray(hl.tinterval(hl.tstruct(__point=pt)))
    self.path = path
    self.filter_intervals = filter_intervals
    if intervals is not None and t != self._interval_type:
        self.intervals = [hl.Interval(hl.Struct(__point=i.start),
                                      hl.Struct(__point=i.end),
                                      i.includes_start,
                                      i.includes_end)
                          for i in intervals]
    else:
        self.intervals = intervals
def union_intervals(intervals: List[hl.Interval], is_sorted: bool = False):
    """
    Generate a list with the union of all intervals in the input list by
    merging overlapping intervals.

    :param intervals: Intervals to merge
    :param is_sorted: If set, assumes intervals are already sorted, otherwise will sort.
    :return: List of merged intervals
    """
    sorted_intervals = intervals if is_sorted else sort_intervals(intervals)
    merged_intervals = sorted_intervals[:1]
    for interval in sorted_intervals[1:]:
        if merged_intervals[-1].start.contig == interval.start.contig:
            if merged_intervals[-1].end.position < interval.end.position:
                if interval.start.position <= merged_intervals[-1].end.position:
                    # overlapping: extend the previous interval
                    merged_intervals[-1] = hl.Interval(merged_intervals[-1].start, interval.end)
                else:
                    merged_intervals.append(interval)
            # else: fully contained in the previous interval, nothing to do
        else:
            merged_intervals.append(interval)
    return merged_intervals
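# Usage sketch (an assumption, not from the original source): two overlapping
# locus intervals on the same contig merge into one.
def _demo_union_intervals():
    i1 = hl.eval(hl.parse_locus_interval('1:100-200'))
    i2 = hl.eval(hl.parse_locus_interval('1:150-300'))
    merged = union_intervals([i1, i2], is_sorted=True)
    # merged is a single interval spanning 1:100 through 1:300
    return merged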
def read_with_index_p50k():
    intervals = [hl.Interval(start=i, end=i + 200) for i in range(0, 10_000_000, 200)]
    ht = hl.read_table(resource('table_10M_par_10.ht'), _intervals=intervals)
    ht._force_count()
def from_matrix_table(
        mt: MatrixTable,
        entry_field: str,
        *,
        n_partitions: Optional[int] = None,
        block_size: Optional[int] = None
) -> 'DNDArray':
    if n_partitions is None:
        n_partitions = mt.n_partitions()
    if block_size is None:
        block_size = DNDArray.default_block_size
    if n_partitions == 0:
        assert mt.count_cols() == 0
        assert mt.count_rows() == 0
        t = range_table(0, 0)
        t = t.annotate(r=0, c=0, block=nd.array([]).reshape((0, 0)))
        t = t.select_globals(
            r_field='r',
            c_field='c',
            n_rows=0,
            n_cols=0,
            n_block_rows=0,
            n_block_cols=0,
            block_size=0)
        return DNDArray(t)

    assert 'r' not in mt.row
    assert 'c' not in mt.row
    assert 'block' not in mt.row

    n_rows, n_cols = mt.count()
    n_block_rows = (n_rows + block_size - 1) // block_size
    n_block_cols = (n_cols + block_size - 1) // block_size
    entries, cols, row_index, col_blocks = (Env.get_uid() for _ in range(4))
    mt = (mt
          .select_globals()
          .select_rows()
          .select_cols()
          .add_row_index(row_index)
          .localize_entries(entries, cols))
    # FIXME: remove when ndarray support structs
    mt = mt.annotate(**{entries: mt[entries][entry_field]})
    mt = mt.annotate(
        **{col_blocks: hl.range(n_block_cols).map(
            lambda c: hl.struct(
                c=c,
                entries=mt[entries][(c * block_size):((c + 1) * block_size)]))})
    mt = mt.explode(col_blocks)
    mt = mt.select(row_index, **mt[col_blocks])
    mt = mt.annotate(r=hl.int(mt[row_index] // block_size))
    mt = mt.key_by(mt.r, mt.c)
    mt = mt.group_by(mt.r, mt.c).aggregate(
        entries=hl.sorted(
            hl.agg.collect(hl.struct(row_index=mt[row_index], entries=mt.entries)),
            key=lambda x: x.row_index
        ).map(lambda x: x.entries))
    mt = mt.select(block=hl.nd.array(mt.entries))
    mt = mt.select_globals(
        r_field='r',
        c_field='c',
        n_rows=n_rows,
        n_cols=n_cols,
        n_block_rows=n_block_rows,
        n_block_cols=n_block_cols,
        block_size=block_size)
    fname = new_temp_file()
    mt = mt.key_by(mt.r, mt.c)
    mt.write(fname, _codec_spec=DNDArray.fast_codec_spec)
    t = hl.read_table(
        fname,
        _intervals=[hl.Interval(hl.Struct(r=i, c=j), hl.Struct(r=i, c=j + 1))
                    for i in range(n_block_rows)
                    for j in range(n_block_cols)])
    return DNDArray(t)
def from_matrix_table(
        mt: MatrixTable,
        entry_field: str,
        *,
        n_partitions: Optional[int] = None,
        block_size: Optional[int] = None,
        sort_columns: bool = False
) -> 'DNDArray':
    if n_partitions is None:
        n_partitions = mt.n_partitions()
    if block_size is None:
        block_size = DNDArray.default_block_size
    if n_partitions == 0:
        assert mt.count_cols() == 0
        assert mt.count_rows() == 0
        t = range_table(0, 0)
        t = t.annotate(r=0, c=0, block=nd.array([]).reshape((0, 0)))
        t = t.select_globals(
            n_rows=0,
            n_cols=0,
            n_block_rows=0,
            n_block_cols=0,
            block_size=0)
        return DNDArray(t)

    assert 'r' not in mt.row
    assert 'c' not in mt.row
    assert 'block' not in mt.row

    n_rows, n_cols = mt.count()
    n_block_rows = (n_rows + block_size - 1) // block_size
    n_block_cols = (n_cols + block_size - 1) // block_size
    entries, cols, row_index, col_blocks = (Env.get_uid() for _ in range(4))

    if sort_columns:
        col_index = Env.get_uid()
        col_order = mt.add_col_index(col_index)
        col_order = col_order.key_cols_by().cols()
        col_order = col_order.select(key=col_order.row.select(*mt.col_key),
                                     index=col_order[col_index])
        col_order = col_order.collect(_localize=False)
        col_order = hl.sorted(col_order, key=lambda x: x.key)
        col_order = col_order['index'].collect()[0]
        mt = mt.choose_cols(col_order)
    else:
        col_keys = mt.col_key.collect(_localize=False)
        out_of_order = hl.range(hl.len(col_keys) - 1).map(
            lambda i: col_keys[i] > col_keys[i + 1])
        out_of_order = out_of_order.collect()[0]
        if any(out_of_order):
            raise ValueError(
                'from_matrix_table: columns are not in sorted order. You may request a '
                'sort with sort_columns=True.')

    mt = (mt
          .select_globals()
          .select_rows()
          .select_cols()
          .add_row_index(row_index)
          .localize_entries(entries, cols))
    # FIXME: remove when ndarray support structs
    mt = mt.annotate(**{entries: mt[entries][entry_field]})
    mt = mt.annotate(
        **{col_blocks: hl.range(n_block_cols).map(
            lambda c: hl.struct(
                c=c,
                entries=mt[entries][(c * block_size):((c + 1) * block_size)]))})
    mt = mt.explode(col_blocks)
    mt = mt.select(row_index, **mt[col_blocks])
    mt = mt.annotate(r=hl.int(mt[row_index] // block_size))
    mt = mt.key_by(mt.r, mt.c)
    mt = mt.group_by(mt.r, mt.c).aggregate(
        entries=hl.sorted(
            hl.agg.collect(hl.struct(row_index=mt[row_index], entries=mt.entries)),
            key=lambda x: x.row_index
        ).map(lambda x: x.entries))
    mt = mt.select(block=hl.nd.array(mt.entries))
    mt = mt.select_globals(
        n_rows=n_rows,
        n_cols=n_cols,
        n_block_rows=n_block_rows,
        n_block_cols=n_block_cols,
        block_size=block_size)
    fname = new_temp_file()
    mt = mt.key_by(mt.r, mt.c)
    mt.write(fname, _codec_spec=DNDArray.fast_codec_spec)
    t = hl.read_table(
        fname,
        _intervals=[hl.Interval(hl.Struct(r=i, c=j), hl.Struct(r=i, c=j + 1))
                    for i in range(n_block_rows)
                    for j in range(n_block_cols)])
    return DNDArray(t)
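# Usage sketch (an assumption, not from the original source): pack a numeric
# entry field into blocks. The dataset and block size here are illustrative;
# balding_nichols_model columns are keyed by sample_idx and already sorted,
# so the sorted-columns check above passes without sort_columns=True.
def _demo_from_matrix_table():
    bn = hl.balding_nichols_model(3, 64, 128)
    bn = bn.annotate_entries(x=hl.float64(bn.GT.n_alt_alleles()))
    return from_matrix_table(bn, 'x', block_size=32)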
def generate_datasets(doctest_namespace, output_dir):
    doctest_namespace['hl'] = hl

    files = ["sample.vds", "sample.qc.vds", "sample.filtered.vds"]
    for f in files:
        if os.path.isdir(f):
            shutil.rmtree(f)

    ds = hl.import_vcf('data/sample.vcf.bgz')
    ds = ds.sample_rows(0.03)
    ds = ds.annotate_rows(use_as_marker=hl.rand_bool(0.5),
                          panel_maf=0.1,
                          anno1=5,
                          anno2=0,
                          consequence="LOF",
                          gene="A",
                          score=5.0)
    ds = ds.annotate_rows(a_index=1)
    ds = hl.sample_qc(hl.variant_qc(ds))
    ds = ds.annotate_cols(is_case=True,
                          pheno=hl.struct(is_case=hl.rand_bool(0.5),
                                          is_female=hl.rand_bool(0.5),
                                          age=hl.rand_norm(65, 10),
                                          height=hl.rand_norm(70, 10),
                                          blood_pressure=hl.rand_norm(120, 20),
                                          cohort_name="cohort1"),
                          cov=hl.struct(PC1=hl.rand_norm(0, 1)),
                          cov1=hl.rand_norm(0, 1),
                          cov2=hl.rand_norm(0, 1),
                          cohort="SIGMA")
    ds = ds.annotate_globals(global_field_1=5,
                             global_field_2=10,
                             pli={'SCN1A': 0.999, 'SONIC': 0.014},
                             populations=['AFR', 'EAS', 'EUR', 'SAS', 'AMR', 'HIS'])
    ds = ds.annotate_rows(gene=['TTN'])
    ds = ds.annotate_cols(cohorts=['1kg'], pop='EAS')
    ds = ds.checkpoint(f'{output_dir.name}/example.vds', overwrite=True)
    doctest_namespace['ds'] = ds
    doctest_namespace['dataset'] = ds
    doctest_namespace['dataset2'] = ds.annotate_globals(global_field=5)
    doctest_namespace['dataset_to_union_1'] = ds
    doctest_namespace['dataset_to_union_2'] = ds

    v_metadata = ds.rows().annotate_globals(global_field=5).annotate(consequence='SYN')
    doctest_namespace['v_metadata'] = v_metadata

    s_metadata = ds.cols().annotate(pop='AMR', is_case=False, sex='F')
    doctest_namespace['s_metadata'] = s_metadata
    doctest_namespace['cols_to_keep'] = s_metadata
    doctest_namespace['cols_to_remove'] = s_metadata
    doctest_namespace['rows_to_keep'] = v_metadata
    doctest_namespace['rows_to_remove'] = v_metadata

    # Table
    table1 = hl.import_table('data/kt_example1.tsv', impute=True, key='ID')
    table1 = table1.annotate_globals(global_field_1=5, global_field_2=10)
    doctest_namespace['table1'] = table1
    doctest_namespace['other_table'] = table1

    table2 = hl.import_table('data/kt_example2.tsv', impute=True, key='ID')
    doctest_namespace['table2'] = table2

    table4 = hl.import_table('data/kt_example4.tsv', impute=True,
                             types={'B': hl.tstruct(B0=hl.tbool, B1=hl.tstr),
                                    'D': hl.tstruct(cat=hl.tint32, dog=hl.tint32),
                                    'E': hl.tstruct(A=hl.tint32, B=hl.tint32)})
    doctest_namespace['table4'] = table4

    people_table = hl.import_table('data/explode_example.tsv', delimiter='\\s+',
                                   types={'Age': hl.tint32,
                                          'Children': hl.tarray(hl.tstr)},
                                   key='Name')
    doctest_namespace['people_table'] = people_table

    # TDT
    doctest_namespace['tdt_dataset'] = hl.import_vcf('data/tdt_tiny.vcf')

    ds2 = hl.variant_qc(ds)
    doctest_namespace['ds2'] = ds2.select_rows(AF=ds2.variant_qc.AF)

    # Expressions
    doctest_namespace['names'] = hl.literal(['Alice', 'Bob', 'Charlie'])
    doctest_namespace['a1'] = hl.literal([0, 1, 2, 3, 4, 5])
    doctest_namespace['a2'] = hl.literal([1, -1, 1, -1, 1, -1])
    doctest_namespace['t'] = hl.literal(True)
    doctest_namespace['f'] = hl.literal(False)
    doctest_namespace['na'] = hl.null(hl.tbool)
    doctest_namespace['call'] = hl.call(0, 1, phased=False)
    doctest_namespace['a'] = hl.literal([1, 2, 3, 4, 5])
    doctest_namespace['d'] = hl.literal({'Alice': 43, 'Bob': 33, 'Charles': 44})
    doctest_namespace['interval'] = hl.interval(3, 11)
    doctest_namespace['locus_interval'] = hl.parse_locus_interval("1:53242-90543")
    doctest_namespace['locus'] = hl.locus('1', 1034245)
    doctest_namespace['x'] = hl.literal(3)
    doctest_namespace['y'] = hl.literal(4.5)
    doctest_namespace['s1'] = hl.literal({1, 2, 3})
    doctest_namespace['s2'] = hl.literal({1, 3, 5})
    doctest_namespace['s3'] = hl.literal({'Alice', 'Bob', 'Charlie'})
    doctest_namespace['struct'] = hl.struct(a=5, b='Foo')
    doctest_namespace['tup'] = hl.literal(("a", 1, [1, 2, 3]))
    doctest_namespace['s'] = hl.literal('The quick brown fox')
    doctest_namespace['interval2'] = hl.Interval(3, 6)
    doctest_namespace['nd'] = hl._ndarray([[1, 2], [3, 4]])

    # Overview
    doctest_namespace['ht'] = hl.import_table("data/kt_example1.tsv", impute=True)
    doctest_namespace['mt'] = ds

    gnomad_data = ds.rows()
    doctest_namespace['gnomad_data'] = gnomad_data.select(gnomad_data.info.AF)

    # BGEN
    bgen = hl.import_bgen('data/example.8bits.bgen',
                          entry_fields=['GT', 'GP', 'dosage'])
    doctest_namespace['variants_table'] = bgen.rows()

    burden_ds = hl.import_vcf('data/example_burden.vcf')
    burden_kt = hl.import_table('data/example_burden.tsv', key='Sample', impute=True)
    burden_ds = burden_ds.annotate_cols(burden=burden_kt[burden_ds.s])
    burden_ds = burden_ds.annotate_rows(weight=hl.float64(burden_ds.locus.position))
    burden_ds = hl.variant_qc(burden_ds)
    genekt = hl.import_locus_intervals('data/gene.interval_list')
    burden_ds = burden_ds.annotate_rows(gene=genekt[burden_ds.locus])
    burden_ds = burden_ds.checkpoint(f'{output_dir.name}/example_burden.vds',
                                     overwrite=True)
    doctest_namespace['burden_ds'] = burden_ds

    print("finished setting up doctest...")
def generate_datasets(doctest_namespace):
    doctest_namespace['hl'] = hl
    doctest_namespace['np'] = np

    ds = hl.import_vcf('data/sample.vcf.bgz')
    ds = ds.sample_rows(0.03)
    ds = ds.annotate_rows(use_as_marker=hl.rand_bool(0.5),
                          panel_maf=0.1,
                          anno1=5,
                          anno2=0,
                          consequence="LOF",
                          gene="A",
                          score=5.0)
    ds = ds.annotate_rows(a_index=1)
    ds = hl.sample_qc(hl.variant_qc(ds))
    ds = ds.annotate_cols(is_case=True,
                          pheno=hl.struct(is_case=hl.rand_bool(0.5),
                                          is_female=hl.rand_bool(0.5),
                                          age=hl.rand_norm(65, 10),
                                          height=hl.rand_norm(70, 10),
                                          blood_pressure=hl.rand_norm(120, 20),
                                          cohort_name="cohort1"),
                          cov=hl.struct(PC1=hl.rand_norm(0, 1)),
                          cov1=hl.rand_norm(0, 1),
                          cov2=hl.rand_norm(0, 1),
                          cohort="SIGMA")
    ds = ds.annotate_globals(global_field_1=5,
                             global_field_2=10,
                             pli={'SCN1A': 0.999, 'SONIC': 0.014},
                             populations=['AFR', 'EAS', 'EUR', 'SAS', 'AMR', 'HIS'])
    ds = ds.annotate_rows(gene=['TTN'])
    ds = ds.annotate_cols(cohorts=['1kg'], pop='EAS')
    ds = ds.checkpoint('output/example.mt', overwrite=True)
    doctest_namespace['ds'] = ds
    doctest_namespace['dataset'] = ds
    doctest_namespace['dataset2'] = ds.annotate_globals(global_field=5)
    doctest_namespace['dataset_to_union_1'] = ds
    doctest_namespace['dataset_to_union_2'] = ds

    v_metadata = ds.rows().annotate_globals(global_field=5).annotate(consequence='SYN')
    doctest_namespace['v_metadata'] = v_metadata

    s_metadata = ds.cols().annotate(pop='AMR', is_case=False, sex='F')
    doctest_namespace['s_metadata'] = s_metadata
    doctest_namespace['cols_to_keep'] = s_metadata
    doctest_namespace['cols_to_remove'] = s_metadata
    doctest_namespace['rows_to_keep'] = v_metadata
    doctest_namespace['rows_to_remove'] = v_metadata

    small_mt = hl.balding_nichols_model(3, 4, 4)
    doctest_namespace['small_mt'] = small_mt.checkpoint('output/small.mt', overwrite=True)

    # Table
    table1 = hl.import_table('data/kt_example1.tsv', impute=True, key='ID')
    table1 = table1.annotate_globals(global_field_1=5, global_field_2=10)
    doctest_namespace['table1'] = table1
    doctest_namespace['other_table'] = table1

    table2 = hl.import_table('data/kt_example2.tsv', impute=True, key='ID')
    doctest_namespace['table2'] = table2

    table4 = hl.import_table('data/kt_example4.tsv', impute=True,
                             types={'B': hl.tstruct(B0=hl.tbool, B1=hl.tstr),
                                    'D': hl.tstruct(cat=hl.tint32, dog=hl.tint32),
                                    'E': hl.tstruct(A=hl.tint32, B=hl.tint32)})
    doctest_namespace['table4'] = table4

    people_table = hl.import_table('data/explode_example.tsv', delimiter='\\s+',
                                   types={'Age': hl.tint32,
                                          'Children': hl.tarray(hl.tstr)},
                                   key='Name')
    doctest_namespace['people_table'] = people_table

    # TDT
    doctest_namespace['tdt_dataset'] = hl.import_vcf('data/tdt_tiny.vcf')

    ds2 = hl.variant_qc(ds)
    doctest_namespace['ds2'] = ds2.select_rows(AF=ds2.variant_qc.AF)

    # Expressions
    doctest_namespace['names'] = hl.literal(['Alice', 'Bob', 'Charlie'])
    doctest_namespace['a1'] = hl.literal([0, 1, 2, 3, 4, 5])
    doctest_namespace['a2'] = hl.literal([1, -1, 1, -1, 1, -1])
    doctest_namespace['t'] = hl.literal(True)
    doctest_namespace['f'] = hl.literal(False)
    doctest_namespace['na'] = hl.null(hl.tbool)
    doctest_namespace['call'] = hl.call(0, 1, phased=False)
    doctest_namespace['a'] = hl.literal([1, 2, 3, 4, 5])
    doctest_namespace['d'] = hl.literal({'Alice': 43, 'Bob': 33, 'Charles': 44})
    doctest_namespace['interval'] = hl.interval(3, 11)
    doctest_namespace['locus_interval'] = hl.parse_locus_interval("1:53242-90543")
    doctest_namespace['locus'] = hl.locus('1', 1034245)
    doctest_namespace['x'] = hl.literal(3)
    doctest_namespace['y'] = hl.literal(4.5)
    doctest_namespace['s1'] = hl.literal({1, 2, 3})
    doctest_namespace['s2'] = hl.literal({1, 3, 5})
    doctest_namespace['s3'] = hl.literal({'Alice', 'Bob', 'Charlie'})
    doctest_namespace['struct'] = hl.struct(a=5, b='Foo')
    doctest_namespace['tup'] = hl.literal(("a", 1, [1, 2, 3]))
    doctest_namespace['s'] = hl.literal('The quick brown fox')
    doctest_namespace['interval2'] = hl.Interval(3, 6)
    doctest_namespace['nd'] = hl._nd.array([[1, 2], [3, 4]])

    # Overview
    doctest_namespace['ht'] = hl.import_table("data/kt_example1.tsv", impute=True)
    doctest_namespace['mt'] = ds

    gnomad_data = ds.rows()
    doctest_namespace['gnomad_data'] = gnomad_data.select(gnomad_data.info.AF)

    # BGEN
    bgen = hl.import_bgen('data/example.8bits.bgen',
                          entry_fields=['GT', 'GP', 'dosage'])
    doctest_namespace['variants_table'] = bgen.rows()

    burden_ds = hl.import_vcf('data/example_burden.vcf')
    burden_kt = hl.import_table('data/example_burden.tsv', key='Sample', impute=True)
    burden_ds = burden_ds.annotate_cols(burden=burden_kt[burden_ds.s])
    burden_ds = burden_ds.annotate_rows(weight=hl.float64(burden_ds.locus.position))
    burden_ds = hl.variant_qc(burden_ds)
    genekt = hl.import_locus_intervals('data/gene.interval_list')
    burden_ds = burden_ds.annotate_rows(gene=genekt[burden_ds.locus])
    burden_ds = burden_ds.checkpoint('output/example_burden.vds', overwrite=True)
    doctest_namespace['burden_ds'] = burden_ds

    ld_score_one_pheno_sumstats = hl.import_table(
        'data/ld_score_regression.one_pheno.sumstats.tsv',
        types={'locus': hl.tlocus('GRCh37'),
               'alleles': hl.tarray(hl.tstr),
               'chi_squared': hl.tfloat64,
               'n': hl.tint32,
               'ld_score': hl.tfloat64,
               'phenotype': hl.tstr,
               'chi_squared_50_irnt': hl.tfloat64,
               'n_50_irnt': hl.tint32,
               'chi_squared_20160': hl.tfloat64,
               'n_20160': hl.tint32},
        key=['locus', 'alleles'])
    doctest_namespace['ld_score_one_pheno_sumstats'] = ld_score_one_pheno_sumstats

    mt = hl.import_matrix_table(
        'data/ld_score_regression.all_phenos.sumstats.tsv',
        row_fields={'locus': hl.tstr, 'alleles': hl.tstr, 'ld_score': hl.tfloat64},
        entry_type=hl.tstr)
    mt = mt.key_cols_by(phenotype=mt.col_id)
    mt = mt.key_rows_by(locus=hl.parse_locus(mt.locus), alleles=mt.alleles.split(','))
    mt = mt.drop('row_id', 'col_id')
    mt = mt.annotate_entries(x=mt.x.split(","))
    mt = mt.transmute_entries(chi_squared=hl.float64(mt.x[0]), n=hl.int32(mt.x[1]))
    mt = mt.annotate_rows(ld_score=hl.float64(mt.ld_score))
    doctest_namespace['ld_score_all_phenos_sumstats'] = mt

    print("finished setting up doctest...")
def generate_datasets(doctest_namespace):
    doctest_namespace['hl'] = hl

    if not os.path.isdir("output/"):
        try:
            os.mkdir("output/")
        except OSError:
            pass

    files = ["sample.vds", "sample.qc.vds", "sample.filtered.vds"]
    for f in files:
        if os.path.isdir(f):
            shutil.rmtree(f)

    ds = hl.read_matrix_table('data/example.vds')
    doctest_namespace['ds'] = ds
    doctest_namespace['dataset'] = ds
    doctest_namespace['dataset2'] = ds.annotate_globals(global_field=5)
    doctest_namespace['dataset_to_union_1'] = ds
    doctest_namespace['dataset_to_union_2'] = ds

    v_metadata = ds.rows().annotate_globals(global_field=5).annotate(consequence='SYN')
    doctest_namespace['v_metadata'] = v_metadata

    s_metadata = ds.cols().annotate(pop='AMR', is_case=False, sex='F')
    doctest_namespace['s_metadata'] = s_metadata
    doctest_namespace['cols_to_keep'] = s_metadata
    doctest_namespace['cols_to_remove'] = s_metadata
    doctest_namespace['rows_to_keep'] = v_metadata
    doctest_namespace['rows_to_remove'] = v_metadata

    # Table
    table1 = hl.import_table('data/kt_example1.tsv', impute=True, key='ID')
    table1 = table1.annotate_globals(global_field_1=5, global_field_2=10)
    doctest_namespace['table1'] = table1
    doctest_namespace['other_table'] = table1

    table2 = hl.import_table('data/kt_example2.tsv', impute=True, key='ID')
    doctest_namespace['table2'] = table2

    table4 = hl.import_table('data/kt_example4.tsv', impute=True,
                             types={'B': hl.tstruct(B0=hl.tbool, B1=hl.tstr),
                                    'D': hl.tstruct(cat=hl.tint32, dog=hl.tint32),
                                    'E': hl.tstruct(A=hl.tint32, B=hl.tint32)})
    doctest_namespace['table4'] = table4

    people_table = hl.import_table('data/explode_example.tsv', delimiter='\\s+',
                                   types={'Age': hl.tint32,
                                          'Children': hl.tarray(hl.tstr)},
                                   key='Name')
    doctest_namespace['people_table'] = people_table

    # TDT
    doctest_namespace['tdt_dataset'] = hl.import_vcf('data/tdt_tiny.vcf')

    ds2 = hl.variant_qc(ds)
    doctest_namespace['ds2'] = ds2.select_rows(AF=ds2.variant_qc.AF)

    # Expressions
    doctest_namespace['names'] = hl.literal(['Alice', 'Bob', 'Charlie'])
    doctest_namespace['a1'] = hl.literal([0, 1, 2, 3, 4, 5])
    doctest_namespace['a2'] = hl.literal([1, -1, 1, -1, 1, -1])
    doctest_namespace['t'] = hl.literal(True)
    doctest_namespace['f'] = hl.literal(False)
    doctest_namespace['na'] = hl.null(hl.tbool)
    doctest_namespace['call'] = hl.call(0, 1, phased=False)
    doctest_namespace['a'] = hl.literal([1, 2, 3, 4, 5])
    doctest_namespace['d'] = hl.literal({'Alice': 43, 'Bob': 33, 'Charles': 44})
    doctest_namespace['interval'] = hl.interval(3, 11)
    doctest_namespace['locus_interval'] = hl.parse_locus_interval("1:53242-90543")
    doctest_namespace['locus'] = hl.locus('1', 1034245)
    doctest_namespace['x'] = hl.literal(3)
    doctest_namespace['y'] = hl.literal(4.5)
    doctest_namespace['s1'] = hl.literal({1, 2, 3})
    doctest_namespace['s2'] = hl.literal({1, 3, 5})
    doctest_namespace['s3'] = hl.literal({'Alice', 'Bob', 'Charlie'})
    doctest_namespace['struct'] = hl.struct(a=5, b='Foo')
    doctest_namespace['tup'] = hl.literal(("a", 1, [1, 2, 3]))
    doctest_namespace['s'] = hl.literal('The quick brown fox')
    doctest_namespace['interval2'] = hl.Interval(3, 6)
    doctest_namespace['nd'] = hl._ndarray([[1, 2], [3, 4]])

    # Overview
    doctest_namespace['ht'] = hl.import_table("data/kt_example1.tsv", impute=True)
    doctest_namespace['mt'] = ds

    gnomad_data = ds.rows()
    doctest_namespace['gnomad_data'] = gnomad_data.select(gnomad_data.info.AF)

    # BGEN
    bgen = hl.import_bgen('data/example.8bits.bgen',
                          entry_fields=['GT', 'GP', 'dosage'])
    doctest_namespace['variants_table'] = bgen.rows()

    print("finished setting up doctest...")