def filter_by_coordinates(self, gp_range, nlp_range):
    assert len(gp_range) == 2 and len(nlp_range) == 2
    return self.ht.filter(
        hl.interval(hl.int64(gp_range[0]), hl.int64(gp_range[1]),
                    includes_start=True, includes_end=True).contains(self.ht.global_position)
        & hl.interval(hl.float64(nlp_range[0]), hl.float64(nlp_range[1]),
                      includes_start=True, includes_end=True).contains(self.ht.neg_log_pval))
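# Hedged usage sketch for filter_by_coordinates. The `plot` instance is an
# assumption about the enclosing class (its `ht` table must carry
# `global_position` and `neg_log_pval` row fields, as the method body requires).
# Both ranges are treated as closed intervals, so boundary points are kept.
filtered = plot.filter_by_coordinates(gp_range=(1_000_000, 2_000_000),
                                      nlp_range=(7.3, 50.0))
filtered.count()  # rows inside both ranges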
def test_row_joins_into_table(self):
    rt = hl.utils.range_matrix_table(9, 13, 3)
    mt1 = rt.key_rows_by(idx=rt.row_idx)
    mt1 = mt1.select_rows(v=mt1.idx + 2)
    mt2 = rt.key_rows_by(idx=rt.row_idx, idx2=rt.row_idx + 1)
    mt2 = mt2.select_rows(v=mt2.idx + 2)

    t1 = hl.utils.range_table(10, 3)
    t2 = t1.key_by(t1.idx, idx2=t1.idx + 1)
    t1 = t1.select(v=t1.idx + 2)
    t2 = t2.select(v=t2.idx + 2)

    tinterval1 = t1.key_by(k=hl.interval(t1.idx, t1.idx, True, True))
    tinterval1 = tinterval1.select(v=tinterval1.idx + 2)
    tinterval2 = t2.key_by(k=hl.interval(t2.key, t2.key, True, True))
    tinterval2 = tinterval2.select(v=tinterval2.idx + 2)

    values = [hl.Struct(v=i + 2) for i in range(9)]

    # join on the mt row key
    self.assertEqual(t1.index(mt1.row_key).collect(), values)
    self.assertEqual(t2.index(mt2.row_key).collect(), values)
    self.assertEqual(t1.index(mt1.idx).collect(), values)
    self.assertEqual(t2.index(mt2.idx, mt2.idx2).collect(), values)
    self.assertEqual(t1.index(mt2.idx).collect(), values)
    with self.assertRaises(hl.expr.ExpressionException):
        t2.index(mt2.idx).collect()
    with self.assertRaises(hl.expr.ExpressionException):
        t2.index(mt1.row_key).collect()

    # join on fields that are not the mt row key
    self.assertEqual(t1.index(mt1.v).collect(),
                     [hl.Struct(v=i + 2) for i in range(2, 10)] + [None])
    self.assertEqual(t2.index(mt2.idx2, mt2.v).collect(),
                     [hl.Struct(v=i + 2) for i in range(1, 10)])
    with self.assertRaises(hl.expr.ExpressionException):
        t2.index(mt2.v).collect()

    # join on an interval over the first field of the mt row key
    self.assertEqual(tinterval1.index(mt1.idx).collect(), values)
    self.assertEqual(tinterval1.index(mt1.row_key).collect(), values)
    self.assertEqual(tinterval1.index(mt2.idx).collect(), values)
    with self.assertRaises(hl.expr.ExpressionException):
        tinterval1.index(mt2.row_key).collect()
    with self.assertRaises(hl.expr.ExpressionException):
        tinterval2.index(mt2.idx).collect()
    with self.assertRaises(hl.expr.ExpressionException):
        tinterval2.index(mt2.row_key).collect()
    with self.assertRaises(hl.expr.ExpressionException):
        tinterval2.index(mt2.idx, mt2.idx2).collect()
def create_gene_map_ht(ht, check_gene_contigs=False):
    from gnomad.utils.vep import process_consequences

    ht = process_consequences(ht)
    ht = ht.explode(ht.vep.worst_csq_by_gene_canonical)
    ht = ht.annotate(
        variant_id=ht.locus.contig + ':' + hl.str(ht.locus.position)
                   + '_' + ht.alleles[0] + '/' + ht.alleles[1],
        annotation=annotation_case_builder(ht.vep.worst_csq_by_gene_canonical))

    if check_gene_contigs:
        gene_contigs = ht.group_by(
            gene_id=ht.vep.worst_csq_by_gene_canonical.gene_id,
            gene_symbol=ht.vep.worst_csq_by_gene_canonical.gene_symbol,
        ).aggregate(contigs=hl.agg.collect_as_set(ht.locus.contig))
        assert gene_contigs.all(hl.len(gene_contigs.contigs) == 1)

    gene_map_ht = ht.group_by(
        gene_id=ht.vep.worst_csq_by_gene_canonical.gene_id,
        gene_symbol=ht.vep.worst_csq_by_gene_canonical.gene_symbol,
    ).partition_hint(100).aggregate(
        interval=hl.interval(
            start=hl.locus(hl.agg.take(ht.locus.contig, 1)[0],
                           hl.agg.min(ht.locus.position)),
            end=hl.locus(hl.agg.take(ht.locus.contig, 1)[0],
                         hl.agg.max(ht.locus.position))),
        variants=hl.agg.group_by(ht.annotation, hl.agg.collect(ht.variant_id)),
    )
    return gene_map_ht
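# Hedged usage sketch for create_gene_map_ht. The input path is an assumption;
# the table must carry `locus`, `alleles`, and a VEP annotation struct, and
# `annotation_case_builder` must be in scope, as the function body requires.
sites_ht = hl.read_table('gs://my-bucket/vep_annotated_sites.ht')  # assumed path
gene_map = create_gene_map_ht(sites_ht, check_gene_contigs=True)
gene_map.describe()  # one row per (gene_id, gene_symbol): gene interval + annotation -> variant IDs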
def _dumps_partitions(partitions, row_key_type):
    parts_type = partitions.dtype
    if not (isinstance(parts_type, hl.tarray)
            and isinstance(parts_type.element_type, hl.tinterval)):
        raise ValueError(f'partitions type invalid: {parts_type} must be array of intervals')

    point_type = parts_type.element_type.point_type
    f1, t1 = next(iter(row_key_type.items()))
    if point_type == t1:
        # Bare-point intervals are wrapped into single-field structs matching
        # the first row key field.
        partitions = hl.map(
            lambda x: hl.interval(start=hl.struct(**{f1: x.start}),
                                  end=hl.struct(**{f1: x.end}),
                                  includes_start=x.includes_start,
                                  includes_end=x.includes_end),
            partitions)
    else:
        if not isinstance(point_type, hl.tstruct):
            raise ValueError(
                f'partitions has wrong type: {point_type} must be struct or type of first row key field')
        if not point_type._is_prefix_of(row_key_type):
            raise ValueError(
                f'partitions type invalid: {point_type} must be prefix of {row_key_type}')

    s = json.dumps(partitions.dtype._convert_to_json(hl.eval(partitions)))
    return s, partitions.dtype
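# Hedged sketch of an input this helper accepts. The single int64 row-key field
# named 'idx' is an assumption for illustration; bare-point intervals like
# these hit the wrapping branch above and become struct-endpoint intervals.
row_key_type = hl.tstruct(idx=hl.tint64)
parts = hl.literal([hl.Interval(0, 10), hl.Interval(10, 20)],
                   dtype=hl.tarray(hl.tinterval(hl.tint64)))
json_str, parts_dtype = _dumps_partitions(parts, row_key_type)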
def test_interval_join(self):
    left = hl.utils.range_table(50, n_partitions=10)
    intervals = hl.utils.range_table(4)
    intervals = intervals.key_by(
        interval=hl.interval(intervals.idx * 10, intervals.idx * 10 + 5))
    left = left.annotate(interval_matches=intervals.index(left.key))
    self.assertTrue(left.all(
        hl.case()
        .when(left.idx % 10 < 5, left.interval_matches.idx == left.idx // 10)
        .default(hl.is_missing(left.interval_matches))))
def calculate_new_intervals(ht, n, reference_genome):
    """Takes a table keyed by ['locus', ...] and produces a list of intervals
    suitable for repartitioning a combiner matrix table.

    Parameters
    ----------
    ht : :class:`.Table`
        Table / rows table to compute new intervals for.
    n : :obj:`int`
        Number of rows each partition should have (the last partition may be
        smaller).
    reference_genome : :obj:`str` or :class:`.ReferenceGenome`, optional
        Reference genome to use.

    Returns
    -------
    :obj:`List[Interval]`
    """
    assert list(ht.key) == ['locus']
    assert ht.locus.dtype == hl.tlocus(reference_genome=reference_genome)
    end = hl.Locus(reference_genome.contigs[-1],
                   reference_genome.lengths[reference_genome.contigs[-1]],
                   reference_genome=reference_genome)

    n_rows = ht.count()
    if n_rows == 0:
        raise ValueError('empty table!')

    ht = ht.select()
    ht = ht.annotate(x=hl.scan.count())
    ht = ht.annotate(y=ht.x + 1)
    ht = ht.filter((ht.x // n != ht.y // n) | (ht.x == (n_rows - 1)))
    ht = ht.select()
    ht = ht.annotate(start=hl.or_else(
        hl.scan._prev_nonnull(
            hl.locus_from_global_position(ht.locus.global_position() + 1,
                                          reference_genome=reference_genome)),
        hl.locus_from_global_position(0, reference_genome=reference_genome)))
    ht = ht.key_by()
    ht = ht.select(interval=hl.interval(start=ht.start, end=ht.locus, includes_end=True))

    intervals = ht.aggregate(hl.agg.collect(ht.interval))

    last_st = hl.eval(
        hl.locus_from_global_position(
            hl.literal(intervals[-1].end).global_position() + 1,
            reference_genome=reference_genome))
    interval = hl.Interval(start=last_st, end=end, includes_end=True)
    intervals.append(interval)
    return intervals
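# Hedged usage sketch for calculate_new_intervals. The path and the
# 100,000-rows-per-partition target are assumptions; the assert above requires
# the key to be exactly ['locus'], hence the key_by before select.
rg = hl.get_reference('GRCh38')
rows = hl.read_matrix_table('gs://my-bucket/combined.mt').rows().key_by('locus').select()
new_intervals = calculate_new_intervals(rows, n=100_000, reference_genome=rg)
# The intervals can then drive a repartitioned read via the private parameter:
mt = hl.read_matrix_table('gs://my-bucket/combined.mt', _intervals=new_intervals)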
def test_segment_intervals(self):
    intervals = hl.Table.parallelize(
        [hl.struct(interval=hl.interval(0, 10)),
         hl.struct(interval=hl.interval(20, 50)),
         hl.struct(interval=hl.interval(52, 52))],
        schema=hl.tstruct(interval=hl.tinterval(hl.tint32)),
        key='interval')

    points1 = [-1, 5, 30, 40, 52, 53]

    segmented1 = hl.segment_intervals(intervals, points1)

    assert segmented1.aggregate(
        hl.agg.collect(segmented1.interval) == [
            hl.interval(0, 5), hl.interval(5, 10),
            hl.interval(20, 30), hl.interval(30, 40), hl.interval(40, 50),
            hl.interval(52, 52)])
def get_annot_ht():
    t = hl.import_table(f'{wd_data}/gencode.v31lift37.annotation.gff3.gz',
                        no_header=True, impute=True, comment='#', force=True)
    # t = hl.import_table('/Users/nbaya/Downloads/gencode.v31lift37.annotation.gtf',
    #                     no_header=True, impute=True, comment='#')
    t2 = t.annotate(GFF_Columns=t.f8.split(";").map(lambda x: x.split("=")))
    t2 = t2.filter(t2.f2 == "CDS")  # only want coding sequences, not entire genes
    t2 = t2.filter(hl.is_valid_locus(t2.f0[3:], t2.f3, 'GRCh37'))
    t2 = t2.filter(hl.is_valid_locus(t2.f0[3:], t2.f4, 'GRCh37'))
    t2 = t2.annotate(interval=hl.interval(hl.locus(t2.f0[3:], t2.f3, 'GRCh37'),
                                          hl.locus(t2.f0[3:], t2.f4, 'GRCh37')))
    t2 = t2.annotate(GFF_Columns=hl.dict(t2.GFF_Columns.map(lambda x: (x[0], x[1]))))
    t2 = t2.annotate(ID=t2.GFF_Columns["ID"], gene_id=t2.GFF_Columns["gene_id"],
                     gene_name=t2.GFF_Columns["gene_name"],
                     gene_type=t2.GFF_Columns["gene_type"],
                     level=t2.GFF_Columns["level"])
    t2 = t2.annotate(type=t2.f2, gene_score=t2.f5, gene_strand=t2.f6, gene_phase=t2.f7)
    t2 = t2.drop(t2.GFF_Columns, t2.f8, t2.f0, t2.f1, t2.f2, t2.f3, t2.f4,
                 t2.f5, t2.f6, t2.f7)
    t2 = t2.key_by(t2.interval)
    return t2
def test_constructors(self):
    rg = hl.ReferenceGenome("foo", ["1"], {"1": 100})

    schema = hl.tstruct(a=hl.tfloat64, b=hl.tfloat64, c=hl.tint32, d=hl.tint32)
    rows = [{'a': 2.0, 'b': 4.0, 'c': 1, 'd': 5}]
    kt = hl.Table.parallelize(rows, schema)
    kt = kt.annotate(d=hl.int64(kt.d))

    kt = kt.annotate(l1=hl.parse_locus("1:51"),
                     l2=hl.locus("1", 51, reference_genome=rg),
                     i1=hl.parse_locus_interval("1:51-56", reference_genome=rg),
                     i2=hl.interval(hl.locus("1", 51, reference_genome=rg),
                                    hl.locus("1", 56, reference_genome=rg)))

    expected_schema = {'a': hl.tfloat64, 'b': hl.tfloat64, 'c': hl.tint32,
                       'd': hl.tint64, 'l1': hl.tlocus(), 'l2': hl.tlocus(rg),
                       'i1': hl.tinterval(hl.tlocus(rg)),
                       'i2': hl.tinterval(hl.tlocus(rg))}

    self.assertTrue(all([expected_schema[f] == t for f, t in kt.row.dtype.items()]))
def create_all_values():
    return hl.struct(
        f32=hl.float32(3.14),
        i64=hl.int64(-9),
        m=hl.null(hl.tfloat64),
        astruct=hl.struct(a=hl.null(hl.tint32), b=5.5),
        mstruct=hl.null(hl.tstruct(x=hl.tint32, y=hl.tstr)),
        aset=hl.set(['foo', 'bar', 'baz']),
        mset=hl.null(hl.tset(hl.tfloat64)),
        d=hl.dict({hl.array(['a', 'b']): 0.5,
                   hl.array(['x', hl.null(hl.tstr), 'z']): 0.3}),
        md=hl.null(hl.tdict(hl.tint32, hl.tstr)),
        h38=hl.locus('chr22', 33878978, 'GRCh38'),
        ml=hl.null(hl.tlocus('GRCh37')),
        i=hl.interval(hl.locus('1', 999), hl.locus('1', 1001)),
        c=hl.call(0, 1),
        mc=hl.null(hl.tcall),
        t=hl.tuple([hl.call(1, 2, phased=True), 'foo', hl.null(hl.tstr)]),
        mt=hl.null(hl.ttuple(hl.tlocus('GRCh37'), hl.tbool))
    )
def create_all_values_datasets():
    all_values = hl.struct(
        f32=hl.float32(3.14),
        i64=hl.int64(-9),
        m=hl.null(hl.tfloat64),
        astruct=hl.struct(a=hl.null(hl.tint32), b=5.5),
        mstruct=hl.null(hl.tstruct(x=hl.tint32, y=hl.tstr)),
        aset=hl.set(['foo', 'bar', 'baz']),
        mset=hl.null(hl.tset(hl.tfloat64)),
        d=hl.dict({hl.array(['a', 'b']): 0.5,
                   hl.array(['x', hl.null(hl.tstr), 'z']): 0.3}),
        md=hl.null(hl.tdict(hl.tint32, hl.tstr)),
        h38=hl.locus('chr22', 33878978, 'GRCh38'),
        ml=hl.null(hl.tlocus('GRCh37')),
        i=hl.interval(hl.locus('1', 999), hl.locus('1', 1001)),
        c=hl.call(0, 1),
        mc=hl.null(hl.tcall),
        t=hl.tuple([hl.call(1, 2, phased=True), 'foo', hl.null(hl.tstr)]),
        mt=hl.null(hl.ttuple(hl.tlocus('GRCh37'), hl.tbool))
    )

    def prefix(s, p):
        return hl.struct(**{p + k: s[k] for k in s})

    all_values_table = (hl.utils.range_table(5, n_partitions=3)
                        .annotate_globals(**prefix(all_values, 'global_'))
                        .annotate(**all_values)
                        .cache())

    all_values_matrix_table = (hl.utils.range_matrix_table(3, 2, n_partitions=2)
                               .annotate_globals(**prefix(all_values, 'global_'))
                               .annotate_rows(**prefix(all_values, 'row_'))
                               .annotate_cols(**prefix(all_values, 'col_'))
                               .annotate_entries(**prefix(all_values, 'entry_'))
                               .cache())

    return all_values_table, all_values_matrix_table
def init(doctest_namespace):
    # This gets run once per process -- must avoid race conditions
    print("setting up doctest...")

    olddir = os.getcwd()
    os.chdir("docs/")

    doctest_namespace['hl'] = hl
    doctest_namespace['agg'] = agg

    if not os.path.isdir("output/"):
        try:
            os.mkdir("output/")
        except OSError:
            pass

    files = ["sample.vds", "sample.qc.vds", "sample.filtered.vds"]
    for f in files:
        if os.path.isdir(f):
            shutil.rmtree(f)

    ds = hl.read_matrix_table('data/example.vds')
    doctest_namespace['ds'] = ds
    doctest_namespace['dataset'] = ds
    doctest_namespace['dataset2'] = ds.annotate_globals(global_field=5)
    doctest_namespace['dataset_to_union_1'] = ds
    doctest_namespace['dataset_to_union_2'] = ds

    v_metadata = ds.rows().annotate_globals(global_field=5).annotate(consequence='SYN')
    doctest_namespace['v_metadata'] = v_metadata

    s_metadata = ds.cols().annotate(pop='AMR', is_case=False, sex='F')
    doctest_namespace['s_metadata'] = s_metadata

    # Table
    table1 = hl.import_table('data/kt_example1.tsv', impute=True, key='ID')
    table1 = table1.annotate_globals(global_field_1=5, global_field_2=10)
    doctest_namespace['table1'] = table1
    doctest_namespace['other_table'] = table1

    table2 = hl.import_table('data/kt_example2.tsv', impute=True, key='ID')
    doctest_namespace['table2'] = table2

    table4 = hl.import_table('data/kt_example4.tsv', impute=True,
                             types={'B': hl.tstruct(B0=hl.tbool, B1=hl.tstr),
                                    'D': hl.tstruct(cat=hl.tint32, dog=hl.tint32),
                                    'E': hl.tstruct(A=hl.tint32, B=hl.tint32)})
    doctest_namespace['table4'] = table4

    people_table = hl.import_table('data/explode_example.tsv', delimiter='\\s+',
                                   types={'Age': hl.tint32,
                                          'Children': hl.tarray(hl.tstr)},
                                   key='Name')
    doctest_namespace['people_table'] = people_table

    # TDT
    doctest_namespace['tdt_dataset'] = hl.import_vcf('data/tdt_tiny.vcf')

    ds2 = hl.variant_qc(ds)
    doctest_namespace['ds2'] = ds2.select_rows(AF=ds2.variant_qc.AF)

    # Expressions
    doctest_namespace['names'] = hl.literal(['Alice', 'Bob', 'Charlie'])
    doctest_namespace['a1'] = hl.literal([0, 1, 2, 3, 4, 5])
    doctest_namespace['a2'] = hl.literal([1, -1, 1, -1, 1, -1])
    doctest_namespace['t'] = hl.literal(True)
    doctest_namespace['f'] = hl.literal(False)
    doctest_namespace['na'] = hl.null(hl.tbool)
    doctest_namespace['call'] = hl.call(0, 1, phased=False)
    doctest_namespace['a'] = hl.literal([1, 2, 3, 4, 5])
    doctest_namespace['d'] = hl.literal({'Alice': 43, 'Bob': 33, 'Charles': 44})
    doctest_namespace['interval'] = hl.interval(3, 11)
    doctest_namespace['locus_interval'] = hl.parse_locus_interval("1:53242-90543")
    doctest_namespace['locus'] = hl.locus('1', 1034245)
    doctest_namespace['x'] = hl.literal(3)
    doctest_namespace['y'] = hl.literal(4.5)
    doctest_namespace['s1'] = hl.literal({1, 2, 3})
    doctest_namespace['s2'] = hl.literal({1, 3, 5})
    doctest_namespace['s3'] = hl.literal({'Alice', 'Bob', 'Charlie'})
    doctest_namespace['struct'] = hl.struct(a=5, b='Foo')
    doctest_namespace['tup'] = hl.literal(("a", 1, [1, 2, 3]))
    doctest_namespace['s'] = hl.literal('The quick brown fox')
    doctest_namespace['interval2'] = hl.Interval(3, 6)

    # Overview
    doctest_namespace['ht'] = hl.import_table("data/kt_example1.tsv", impute=True)
    doctest_namespace['mt'] = ds

    gnomad_data = ds.rows()
    doctest_namespace['gnomad_data'] = gnomad_data.select(gnomad_data.info.AF)

    # BGEN
    bgen = hl.import_bgen('data/example.8bits.bgen',
                          entry_fields=['GT', 'GP', 'dosage'])
    doctest_namespace['variants_table'] = bgen.rows()

    print("finished setting up doctest...")
    yield
    os.chdir(olddir)
def generate_datasets(doctest_namespace):
    doctest_namespace['hl'] = hl
    doctest_namespace['np'] = np

    ds = hl.import_vcf('data/sample.vcf.bgz')
    ds = ds.sample_rows(0.03)
    ds = ds.annotate_rows(use_as_marker=hl.rand_bool(0.5),
                          panel_maf=0.1,
                          anno1=5,
                          anno2=0,
                          consequence="LOF",
                          gene="A",
                          score=5.0)
    ds = ds.annotate_rows(a_index=1)
    ds = hl.sample_qc(hl.variant_qc(ds))
    ds = ds.annotate_cols(is_case=True,
                          pheno=hl.struct(is_case=hl.rand_bool(0.5),
                                          is_female=hl.rand_bool(0.5),
                                          age=hl.rand_norm(65, 10),
                                          height=hl.rand_norm(70, 10),
                                          blood_pressure=hl.rand_norm(120, 20),
                                          cohort_name="cohort1"),
                          cov=hl.struct(PC1=hl.rand_norm(0, 1)),
                          cov1=hl.rand_norm(0, 1),
                          cov2=hl.rand_norm(0, 1),
                          cohort="SIGMA")
    ds = ds.annotate_globals(global_field_1=5,
                             global_field_2=10,
                             pli={'SCN1A': 0.999, 'SONIC': 0.014},
                             populations=['AFR', 'EAS', 'EUR', 'SAS', 'AMR', 'HIS'])
    ds = ds.annotate_rows(gene=['TTN'])
    ds = ds.annotate_cols(cohorts=['1kg'], pop='EAS')
    ds = ds.checkpoint('output/example.mt', overwrite=True)
    doctest_namespace['ds'] = ds
    doctest_namespace['dataset'] = ds
    doctest_namespace['dataset2'] = ds.annotate_globals(global_field=5)
    doctest_namespace['dataset_to_union_1'] = ds
    doctest_namespace['dataset_to_union_2'] = ds

    v_metadata = ds.rows().annotate_globals(global_field=5).annotate(consequence='SYN')
    doctest_namespace['v_metadata'] = v_metadata

    s_metadata = ds.cols().annotate(pop='AMR', is_case=False, sex='F')
    doctest_namespace['s_metadata'] = s_metadata
    doctest_namespace['cols_to_keep'] = s_metadata
    doctest_namespace['cols_to_remove'] = s_metadata
    doctest_namespace['rows_to_keep'] = v_metadata
    doctest_namespace['rows_to_remove'] = v_metadata

    small_mt = hl.balding_nichols_model(3, 4, 4)
    doctest_namespace['small_mt'] = small_mt.checkpoint('output/small.mt', overwrite=True)

    # Table
    table1 = hl.import_table('data/kt_example1.tsv', impute=True, key='ID')
    table1 = table1.annotate_globals(global_field_1=5, global_field_2=10)
    doctest_namespace['table1'] = table1
    doctest_namespace['other_table'] = table1

    table2 = hl.import_table('data/kt_example2.tsv', impute=True, key='ID')
    doctest_namespace['table2'] = table2

    table4 = hl.import_table('data/kt_example4.tsv', impute=True,
                             types={'B': hl.tstruct(B0=hl.tbool, B1=hl.tstr),
                                    'D': hl.tstruct(cat=hl.tint32, dog=hl.tint32),
                                    'E': hl.tstruct(A=hl.tint32, B=hl.tint32)})
    doctest_namespace['table4'] = table4

    people_table = hl.import_table('data/explode_example.tsv', delimiter='\\s+',
                                   types={'Age': hl.tint32,
                                          'Children': hl.tarray(hl.tstr)},
                                   key='Name')
    doctest_namespace['people_table'] = people_table

    # TDT
    doctest_namespace['tdt_dataset'] = hl.import_vcf('data/tdt_tiny.vcf')

    ds2 = hl.variant_qc(ds)
    doctest_namespace['ds2'] = ds2.select_rows(AF=ds2.variant_qc.AF)

    # Expressions
    doctest_namespace['names'] = hl.literal(['Alice', 'Bob', 'Charlie'])
    doctest_namespace['a1'] = hl.literal([0, 1, 2, 3, 4, 5])
    doctest_namespace['a2'] = hl.literal([1, -1, 1, -1, 1, -1])
    doctest_namespace['t'] = hl.literal(True)
    doctest_namespace['f'] = hl.literal(False)
    doctest_namespace['na'] = hl.null(hl.tbool)
    doctest_namespace['call'] = hl.call(0, 1, phased=False)
    doctest_namespace['a'] = hl.literal([1, 2, 3, 4, 5])
    doctest_namespace['d'] = hl.literal({'Alice': 43, 'Bob': 33, 'Charles': 44})
    doctest_namespace['interval'] = hl.interval(3, 11)
    doctest_namespace['locus_interval'] = hl.parse_locus_interval("1:53242-90543")
    doctest_namespace['locus'] = hl.locus('1', 1034245)
    doctest_namespace['x'] = hl.literal(3)
    doctest_namespace['y'] = hl.literal(4.5)
    doctest_namespace['s1'] = hl.literal({1, 2, 3})
    doctest_namespace['s2'] = hl.literal({1, 3, 5})
    doctest_namespace['s3'] = hl.literal({'Alice', 'Bob', 'Charlie'})
    doctest_namespace['struct'] = hl.struct(a=5, b='Foo')
    doctest_namespace['tup'] = hl.literal(("a", 1, [1, 2, 3]))
    doctest_namespace['s'] = hl.literal('The quick brown fox')
    doctest_namespace['interval2'] = hl.Interval(3, 6)
    doctest_namespace['nd'] = hl._nd.array([[1, 2], [3, 4]])

    # Overview
    doctest_namespace['ht'] = hl.import_table("data/kt_example1.tsv", impute=True)
    doctest_namespace['mt'] = ds

    gnomad_data = ds.rows()
    doctest_namespace['gnomad_data'] = gnomad_data.select(gnomad_data.info.AF)

    # BGEN
    bgen = hl.import_bgen('data/example.8bits.bgen',
                          entry_fields=['GT', 'GP', 'dosage'])
    doctest_namespace['variants_table'] = bgen.rows()

    burden_ds = hl.import_vcf('data/example_burden.vcf')
    burden_kt = hl.import_table('data/example_burden.tsv', key='Sample', impute=True)
    burden_ds = burden_ds.annotate_cols(burden=burden_kt[burden_ds.s])
    burden_ds = burden_ds.annotate_rows(weight=hl.float64(burden_ds.locus.position))
    burden_ds = hl.variant_qc(burden_ds)
    genekt = hl.import_locus_intervals('data/gene.interval_list')
    burden_ds = burden_ds.annotate_rows(gene=genekt[burden_ds.locus])
    burden_ds = burden_ds.checkpoint('output/example_burden.vds', overwrite=True)
    doctest_namespace['burden_ds'] = burden_ds

    ld_score_one_pheno_sumstats = hl.import_table(
        'data/ld_score_regression.one_pheno.sumstats.tsv',
        types={'locus': hl.tlocus('GRCh37'),
               'alleles': hl.tarray(hl.tstr),
               'chi_squared': hl.tfloat64,
               'n': hl.tint32,
               'ld_score': hl.tfloat64,
               'phenotype': hl.tstr,
               'chi_squared_50_irnt': hl.tfloat64,
               'n_50_irnt': hl.tint32,
               'chi_squared_20160': hl.tfloat64,
               'n_20160': hl.tint32},
        key=['locus', 'alleles'])
    doctest_namespace['ld_score_one_pheno_sumstats'] = ld_score_one_pheno_sumstats

    mt = hl.import_matrix_table(
        'data/ld_score_regression.all_phenos.sumstats.tsv',
        row_fields={'locus': hl.tstr, 'alleles': hl.tstr, 'ld_score': hl.tfloat64},
        entry_type=hl.tstr)
    mt = mt.key_cols_by(phenotype=mt.col_id)
    mt = mt.key_rows_by(locus=hl.parse_locus(mt.locus), alleles=mt.alleles.split(','))
    mt = mt.drop('row_id', 'col_id')
    mt = mt.annotate_entries(x=mt.x.split(","))
    mt = mt.transmute_entries(chi_squared=hl.float64(mt.x[0]), n=hl.int32(mt.x[1]))
    mt = mt.annotate_rows(ld_score=hl.float64(mt.ld_score))
    doctest_namespace['ld_score_all_phenos_sumstats'] = mt

    print("finished setting up doctest...")
    ht = ht.annotate(locus=hl.locus('chr' + ht['chromosome'].replace('MT', 'M'),
                                    ht['position'], 'hg19'))
    if args.b == 'GRCh37':
        hg19.add_liftover('gs://hail-datasets-extracted-data/assemblies/hg19tob37.chain.gz',
                          'GRCh37')
        ht = ht.annotate(locus=hl.liftover(ht['locus'], 'GRCh37'))
    if args.b == 'GRCh38':
        hg19.add_liftover('gs://hail-datasets-extracted-data/assemblies/hg19ToHg38.over.chain.gz',
                          'GRCh38')
        ht = ht.annotate(locus=hl.liftover(ht['locus'], 'GRCh38'))
    ht = ht.filter(hl.is_defined(ht['locus']))
    ht = ht.select('locus', 'N', 'S')
    ht = ht.key_by('locus')

if args.d == 'elements':
    name = 'GERP_elements'
    ht = hl.import_table('gs://hail-datasets-extracted-data/GERP++/GERP++_elements.hg19.tsv.bgz',
                         types={'start': hl.tint, 'end': hl.tint,
                                'S': hl.tfloat, 'p_value': hl.tfloat})
    ht = ht.annotate(interval=hl.interval(hl.locus(ht['chromosome'], ht['start'], 'hg19'),
                                          hl.locus(ht['chromosome'], ht['end'], 'hg19')))
    if args.b == 'GRCh37':
        hg19.add_liftover('gs://hail-datasets-extracted-data/assemblies/hg19tob37.chain.gz',
                          'GRCh37')
        ht = ht.annotate(interval=hl.liftover(ht['interval'], 'GRCh37'))
    if args.b == 'GRCh38':
        hg19.add_liftover('gs://hail-datasets-extracted-data/assemblies/hg19ToHg38.over.chain.gz',
                          'GRCh38')
        ht = ht.annotate(interval=hl.liftover(ht['interval'], 'GRCh38'))
    ht = ht.filter(hl.is_defined(ht['interval']))
    ht = ht.select('interval', 'S', 'p_value')
    ht = ht.key_by('interval')

n_rows = ht.count()
n_partitions = ht.n_partitions()

ht = ht.annotate_globals(metadata=hl.struct(name=name,
                                            version='GERP++',
                                            reference_genome=args.b,
def create_genome_intervals_file() -> hl.Table:
    # Load GTF file
    tmp_path = f'/tmp_{uuid.uuid4()}.ht'
    ht = _load_gencode_gtf()
    ht.filter((ht.feature == 'gene') & (ht.gene_type == 'protein_coding')).write(tmp_path, True)

    # Scan to get bounds, create intervals
    tmp_path2 = f'/tmp/tmp_{uuid.uuid4()}.ht'
    ht = hl.read_table(tmp_path)
    ht = ht.filter((ht.feature == 'gene') & (ht.gene_type == 'protein_coding'))
    ht = ht.select('gene_id', 'gene_name')

    last_locus = hl.scan.take(ht.row, 1, ordering=-ht.interval.start.global_position())
    intergenic_region = hl.or_missing(
        (hl.len(last_locus) > 0)
        & (last_locus[0].interval.end.contig == ht.interval.start.contig),
        hl.interval(last_locus[0].interval.end, ht.interval.start))
    ht = ht.annotate(last_locus=last_locus, intergenic_region=intergenic_region)

    intergenic_length = ht.intergenic_region.end.position - ht.intergenic_region.start.position
    intergenic_region = hl.or_missing(intergenic_length > 0, ht.intergenic_region)
    intergenic_dist = hl.int(
        (intergenic_region.end.position - intergenic_region.start.position) / 2)

    chrom = ht.interval.start.contig

    def interval(pos1, pos2):
        return hl.interval(hl.locus(chrom, pos1), hl.locus(chrom, pos2))

    ht = ht.transmute(
        intergenic_region1=hl.or_missing(
            hl.is_defined(intergenic_region),
            interval(intergenic_region.start.position + 1,  # gene interval is closed
                     intergenic_region.start.position + intergenic_dist)),
        intergenic_region2=hl.or_missing(
            hl.is_defined(intergenic_region),
            interval(intergenic_region.start.position + intergenic_dist,
                     intergenic_region.end.position))).key_by()

    regions = hl.array([
        hl.struct(interval=ht.interval,
                  gene_id=ht.gene_id,
                  gene_name=ht.gene_name,
                  within_gene=True)
    ])
    regions = hl.if_else(
        hl.is_defined(ht.intergenic_region1),
        regions.extend([
            hl.struct(interval=ht.intergenic_region1,
                      gene_id=ht.last_locus[0].gene_id,
                      gene_name=ht.last_locus[0].gene_name,
                      within_gene=False),
            hl.struct(interval=ht.intergenic_region2,
                      gene_id=ht.gene_id,
                      gene_name=ht.gene_name,
                      within_gene=False)
        ]),
        regions)
    ht = ht.annotate(regions=regions).explode('regions')
    ht = ht.select(**ht.regions)
    return ht.key_by('interval')
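# Hedged usage sketch: annotating a variant table by genomic region. The input
# path is an assumption; an interval-keyed table can be indexed directly with a
# locus, so each variant picks up the gene or intergenic region containing it.
regions = create_genome_intervals_file()
variants = hl.read_table('gs://my-bucket/variants.ht')  # assumed: keyed by locus
variants = variants.annotate(region=regions[variants.locus])
# region.within_gene distinguishes genic intervals from the split intergenic halves.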
def interval(pos1, pos2):
    return hl.interval(hl.locus(chrom, pos1), hl.locus(chrom, pos2))
def generate_datasets(doctest_namespace):
    doctest_namespace['hl'] = hl

    if not os.path.isdir("output/"):
        try:
            os.mkdir("output/")
        except OSError:
            pass

    files = ["sample.vds", "sample.qc.vds", "sample.filtered.vds"]
    for f in files:
        if os.path.isdir(f):
            shutil.rmtree(f)

    ds = hl.read_matrix_table('data/example.vds')
    doctest_namespace['ds'] = ds
    doctest_namespace['dataset'] = ds
    doctest_namespace['dataset2'] = ds.annotate_globals(global_field=5)
    doctest_namespace['dataset_to_union_1'] = ds
    doctest_namespace['dataset_to_union_2'] = ds

    v_metadata = ds.rows().annotate_globals(global_field=5).annotate(consequence='SYN')
    doctest_namespace['v_metadata'] = v_metadata

    s_metadata = ds.cols().annotate(pop='AMR', is_case=False, sex='F')
    doctest_namespace['s_metadata'] = s_metadata
    doctest_namespace['cols_to_keep'] = s_metadata
    doctest_namespace['cols_to_remove'] = s_metadata
    doctest_namespace['rows_to_keep'] = v_metadata
    doctest_namespace['rows_to_remove'] = v_metadata

    # Table
    table1 = hl.import_table('data/kt_example1.tsv', impute=True, key='ID')
    table1 = table1.annotate_globals(global_field_1=5, global_field_2=10)
    doctest_namespace['table1'] = table1
    doctest_namespace['other_table'] = table1

    table2 = hl.import_table('data/kt_example2.tsv', impute=True, key='ID')
    doctest_namespace['table2'] = table2

    table4 = hl.import_table('data/kt_example4.tsv', impute=True,
                             types={'B': hl.tstruct(B0=hl.tbool, B1=hl.tstr),
                                    'D': hl.tstruct(cat=hl.tint32, dog=hl.tint32),
                                    'E': hl.tstruct(A=hl.tint32, B=hl.tint32)})
    doctest_namespace['table4'] = table4

    people_table = hl.import_table('data/explode_example.tsv', delimiter='\\s+',
                                   types={'Age': hl.tint32,
                                          'Children': hl.tarray(hl.tstr)},
                                   key='Name')
    doctest_namespace['people_table'] = people_table

    # TDT
    doctest_namespace['tdt_dataset'] = hl.import_vcf('data/tdt_tiny.vcf')

    ds2 = hl.variant_qc(ds)
    doctest_namespace['ds2'] = ds2.select_rows(AF=ds2.variant_qc.AF)

    # Expressions
    doctest_namespace['names'] = hl.literal(['Alice', 'Bob', 'Charlie'])
    doctest_namespace['a1'] = hl.literal([0, 1, 2, 3, 4, 5])
    doctest_namespace['a2'] = hl.literal([1, -1, 1, -1, 1, -1])
    doctest_namespace['t'] = hl.literal(True)
    doctest_namespace['f'] = hl.literal(False)
    doctest_namespace['na'] = hl.null(hl.tbool)
    doctest_namespace['call'] = hl.call(0, 1, phased=False)
    doctest_namespace['a'] = hl.literal([1, 2, 3, 4, 5])
    doctest_namespace['d'] = hl.literal({'Alice': 43, 'Bob': 33, 'Charles': 44})
    doctest_namespace['interval'] = hl.interval(3, 11)
    doctest_namespace['locus_interval'] = hl.parse_locus_interval("1:53242-90543")
    doctest_namespace['locus'] = hl.locus('1', 1034245)
    doctest_namespace['x'] = hl.literal(3)
    doctest_namespace['y'] = hl.literal(4.5)
    doctest_namespace['s1'] = hl.literal({1, 2, 3})
    doctest_namespace['s2'] = hl.literal({1, 3, 5})
    doctest_namespace['s3'] = hl.literal({'Alice', 'Bob', 'Charlie'})
    doctest_namespace['struct'] = hl.struct(a=5, b='Foo')
    doctest_namespace['tup'] = hl.literal(("a", 1, [1, 2, 3]))
    doctest_namespace['s'] = hl.literal('The quick brown fox')
    doctest_namespace['interval2'] = hl.Interval(3, 6)
    doctest_namespace['nd'] = hl._ndarray([[1, 2], [3, 4]])

    # Overview
    doctest_namespace['ht'] = hl.import_table("data/kt_example1.tsv", impute=True)
    doctest_namespace['mt'] = ds

    gnomad_data = ds.rows()
    doctest_namespace['gnomad_data'] = gnomad_data.select(gnomad_data.info.AF)

    # BGEN
    bgen = hl.import_bgen('data/example.8bits.bgen',
                          entry_fields=['GT', 'GP', 'dosage'])
    doctest_namespace['variants_table'] = bgen.rows()

    print("finished setting up doctest...")
ht_transcripts = hl.import_table(
    'gs://hail-datasets/raw-data/gtex/v7/reference/gencode.v19.transcripts.patched_contigs.gtf',
    comment='#', no_header=True, types={'f3': hl.tint, 'f4': hl.tint},
    missing='.', min_partitions=12)

ht_transcripts = ht_transcripts.rename({'f0': 'contig',
                                        'f1': 'annotation_source',
                                        'f2': 'feature_type',
                                        'f3': 'start',
                                        'f4': 'end',
                                        'f5': 'score',
                                        'f6': 'strand',
                                        'f7': 'phase',
                                        'f8': 'attributes'})

ht_transcripts = ht_transcripts.filter(ht_transcripts.feature_type == 'transcript')
ht_transcripts = ht_transcripts.annotate(
    interval=hl.interval(hl.locus(ht_transcripts.contig, ht_transcripts.start, 'GRCh37'),
                         hl.locus(ht_transcripts.contig, ht_transcripts.end + 1, 'GRCh37')))
ht_transcripts = ht_transcripts.annotate(
    attributes=hl.dict(hl.map(
        lambda x: (x.split(' ')[0],
                   x.split(' ')[1].replace('"', '').replace(';$', '')),
        ht_transcripts.attributes.split('; '))))
attribute_cols = list(ht_transcripts.aggregate(
    hl.set(hl.flatten(hl.agg.collect(ht_transcripts.attributes.keys())))))
ht_transcripts = ht_transcripts.annotate(
    **{x: hl.or_missing(ht_transcripts.attributes.contains(x),
                        ht_transcripts.attributes[x])
       for x in attribute_cols})
ht_transcripts = ht_transcripts.select(*['transcript_id', 'transcript_name',
                                         'transcript_type', 'strand',
                                         'transcript_status', 'havana_transcript',
                                         'ccdsid', 'ont', 'gene_name', 'interval',
                                         'gene_type', 'annotation_source',
                                         'havana_gene', 'gene_status', 'tag'])
ht_transcripts = ht_transcripts.rename({'havana_transcript': 'havana_transcript_id',
                                        'havana_gene': 'havana_gene_id'})
ht_transcripts = ht_transcripts.key_by(ht_transcripts.transcript_id)

mt = hl.import_matrix_table(
    'gs://hail-datasets/raw-data/gtex/v7/rna-seq/processed/GTEx_Analysis_2016-01-15_v7_RSEMv1.2.22_transcript_expected_count.tsv.bgz',
    row_fields={'transcript_id': hl.tstr, 'gene_id': hl.tstr},
    row_key='transcript_id', missing='', entry_type=hl.tfloat)
mt = mt.annotate_cols(sample_id=mt.col_id)
mt = mt.key_cols_by(mt.sample_id)
mt = mt.annotate_entries(read_count=hl.int(mt.x))
mt = mt.drop(mt.col_id, mt.x)
def import_gtf(path, reference_genome=None, skip_invalid_contigs=False,
               min_partitions=None) -> hl.Table:
    """Import a GTF file.

    The GTF file format is identical to the GFF version 2 file format,
    and so this function can be used to import GFF version 2 files as
    well.

    See https://www.ensembl.org/info/website/upload/gff.html for more
    details on the GTF/GFF2 file format.

    The :class:`.Table` returned by this function will be keyed by the
    ``interval`` row field and will include the following row fields:

    .. code-block:: text

        'source': str
        'feature': str
        'score': float64
        'strand': str
        'frame': int32
        'interval': interval<>

    There will also be corresponding fields for every tag found in the
    attribute field of the GTF file.

    Note
    ----

    This function will return an ``interval`` field of type :class:`.tinterval`
    constructed from the ``seqname``, ``start``, and ``end`` fields in the
    GTF file. This interval is inclusive of both the start and end positions
    in the GTF file.

    If the ``reference_genome`` parameter is specified, the start and end
    points of the ``interval`` field will be of type :class:`.tlocus`.
    Otherwise, the start and end points of the ``interval`` field will be of
    type :class:`.tstruct` with fields ``seqname`` (type :class:`str`) and
    ``position`` (type :class:`.tint32`).

    Furthermore, if the ``reference_genome`` parameter is specified and
    ``skip_invalid_contigs`` is ``True``, this import function will skip
    lines in the GTF where ``seqname`` is not consistent with the reference
    genome specified.

    Example
    -------

    >>> ht = hl.experimental.import_gtf('data/test.gtf',
    ...                                 reference_genome='GRCh37',
    ...                                 skip_invalid_contigs=True)

    >>> ht.describe()  # doctest: +NOTEST
    ----------------------------------------
    Global fields:
    None
    ----------------------------------------
    Row fields:
        'source': str
        'feature': str
        'score': float64
        'strand': str
        'frame': int32
        'gene_type': str
        'exon_id': str
        'havana_transcript': str
        'level': str
        'transcript_name': str
        'gene_status': str
        'gene_id': str
        'transcript_type': str
        'tag': str
        'transcript_status': str
        'gene_name': str
        'transcript_id': str
        'exon_number': str
        'havana_gene': str
        'interval': interval<locus<GRCh37>>
    ----------------------------------------
    Key: ['interval']
    ----------------------------------------

    Parameters
    ----------
    path : :obj:`str`
        File to import.
    reference_genome : :obj:`str` or :class:`.ReferenceGenome`, optional
        Reference genome to use.
    skip_invalid_contigs : :obj:`bool`
        If ``True`` and `reference_genome` is not ``None``, skip lines where
        ``seqname`` is not consistent with the reference genome.
    min_partitions : :obj:`int` or :obj:`None`
        Minimum number of partitions (passed to import_table).

    Returns
    -------
    :class:`.Table`
    """

    ht = hl.import_table(path,
                         min_partitions=min_partitions,
                         comment='#',
                         no_header=True,
                         types={'f3': hl.tint, 'f4': hl.tint,
                                'f5': hl.tfloat, 'f7': hl.tint},
                         missing='.',
                         delimiter='\t')

    ht = ht.rename({'f0': 'seqname', 'f1': 'source', 'f2': 'feature',
                    'f3': 'start', 'f4': 'end', 'f5': 'score',
                    'f6': 'strand', 'f7': 'frame', 'f8': 'attribute'})

    ht = ht.annotate(attribute=hl.dict(
        hl.map(lambda x: (x.split(' ')[0],
                          x.split(' ')[1].replace('"', '').replace(';$', '')),
               ht['attribute'].split('; '))))

    attributes = ht.aggregate(hl.agg.explode(lambda x: hl.agg.collect_as_set(x),
                                             ht['attribute'].keys()))

    ht = ht.transmute(**{x: hl.or_missing(ht['attribute'].contains(x),
                                          ht['attribute'][x])
                         for x in attributes if x})

    if reference_genome:
        if reference_genome == 'GRCh37':
            ht = ht.annotate(seqname=ht['seqname'].replace('^chr', ''))
        else:
            ht = ht.annotate(seqname=hl.case()
                             .when(ht['seqname'].startswith('HLA'), ht['seqname'])
                             .when(ht['seqname'].startswith('chrHLA'),
                                   ht['seqname'].replace('^chr', ''))
                             .when(ht['seqname'].startswith('chr'), ht['seqname'])
                             .default('chr' + ht['seqname']))
        if skip_invalid_contigs:
            valid_contigs = hl.literal(set(hl.get_reference(reference_genome).contigs))
            ht = ht.filter(valid_contigs.contains(ht['seqname']))
        ht = ht.transmute(interval=hl.locus_interval(ht['seqname'],
                                                     ht['start'],
                                                     ht['end'],
                                                     includes_start=True,
                                                     includes_end=True,
                                                     reference_genome=reference_genome))
    else:
        ht = ht.transmute(interval=hl.interval(
            hl.struct(seqname=ht['seqname'], position=ht['start']),
            hl.struct(seqname=ht['seqname'], position=ht['end']),
            includes_start=True,
            includes_end=True))

    ht = ht.key_by('interval')

    return ht
def generate_datasets(doctest_namespace, output_dir):
    doctest_namespace['hl'] = hl

    files = ["sample.vds", "sample.qc.vds", "sample.filtered.vds"]
    for f in files:
        if os.path.isdir(f):
            shutil.rmtree(f)

    ds = hl.import_vcf('data/sample.vcf.bgz')
    ds = ds.sample_rows(0.03)
    ds = ds.annotate_rows(use_as_marker=hl.rand_bool(0.5),
                          panel_maf=0.1,
                          anno1=5,
                          anno2=0,
                          consequence="LOF",
                          gene="A",
                          score=5.0)
    ds = ds.annotate_rows(a_index=1)
    ds = hl.sample_qc(hl.variant_qc(ds))
    ds = ds.annotate_cols(is_case=True,
                          pheno=hl.struct(is_case=hl.rand_bool(0.5),
                                          is_female=hl.rand_bool(0.5),
                                          age=hl.rand_norm(65, 10),
                                          height=hl.rand_norm(70, 10),
                                          blood_pressure=hl.rand_norm(120, 20),
                                          cohort_name="cohort1"),
                          cov=hl.struct(PC1=hl.rand_norm(0, 1)),
                          cov1=hl.rand_norm(0, 1),
                          cov2=hl.rand_norm(0, 1),
                          cohort="SIGMA")
    ds = ds.annotate_globals(global_field_1=5,
                             global_field_2=10,
                             pli={'SCN1A': 0.999, 'SONIC': 0.014},
                             populations=['AFR', 'EAS', 'EUR', 'SAS', 'AMR', 'HIS'])
    ds = ds.annotate_rows(gene=['TTN'])
    ds = ds.annotate_cols(cohorts=['1kg'], pop='EAS')
    ds = ds.checkpoint(f'{output_dir.name}/example.vds', overwrite=True)
    doctest_namespace['ds'] = ds
    doctest_namespace['dataset'] = ds
    doctest_namespace['dataset2'] = ds.annotate_globals(global_field=5)
    doctest_namespace['dataset_to_union_1'] = ds
    doctest_namespace['dataset_to_union_2'] = ds

    v_metadata = ds.rows().annotate_globals(global_field=5).annotate(consequence='SYN')
    doctest_namespace['v_metadata'] = v_metadata

    s_metadata = ds.cols().annotate(pop='AMR', is_case=False, sex='F')
    doctest_namespace['s_metadata'] = s_metadata
    doctest_namespace['cols_to_keep'] = s_metadata
    doctest_namespace['cols_to_remove'] = s_metadata
    doctest_namespace['rows_to_keep'] = v_metadata
    doctest_namespace['rows_to_remove'] = v_metadata

    # Table
    table1 = hl.import_table('data/kt_example1.tsv', impute=True, key='ID')
    table1 = table1.annotate_globals(global_field_1=5, global_field_2=10)
    doctest_namespace['table1'] = table1
    doctest_namespace['other_table'] = table1

    table2 = hl.import_table('data/kt_example2.tsv', impute=True, key='ID')
    doctest_namespace['table2'] = table2

    table4 = hl.import_table('data/kt_example4.tsv', impute=True,
                             types={'B': hl.tstruct(B0=hl.tbool, B1=hl.tstr),
                                    'D': hl.tstruct(cat=hl.tint32, dog=hl.tint32),
                                    'E': hl.tstruct(A=hl.tint32, B=hl.tint32)})
    doctest_namespace['table4'] = table4

    people_table = hl.import_table('data/explode_example.tsv', delimiter='\\s+',
                                   types={'Age': hl.tint32,
                                          'Children': hl.tarray(hl.tstr)},
                                   key='Name')
    doctest_namespace['people_table'] = people_table

    # TDT
    doctest_namespace['tdt_dataset'] = hl.import_vcf('data/tdt_tiny.vcf')

    ds2 = hl.variant_qc(ds)
    doctest_namespace['ds2'] = ds2.select_rows(AF=ds2.variant_qc.AF)

    # Expressions
    doctest_namespace['names'] = hl.literal(['Alice', 'Bob', 'Charlie'])
    doctest_namespace['a1'] = hl.literal([0, 1, 2, 3, 4, 5])
    doctest_namespace['a2'] = hl.literal([1, -1, 1, -1, 1, -1])
    doctest_namespace['t'] = hl.literal(True)
    doctest_namespace['f'] = hl.literal(False)
    doctest_namespace['na'] = hl.null(hl.tbool)
    doctest_namespace['call'] = hl.call(0, 1, phased=False)
    doctest_namespace['a'] = hl.literal([1, 2, 3, 4, 5])
    doctest_namespace['d'] = hl.literal({'Alice': 43, 'Bob': 33, 'Charles': 44})
    doctest_namespace['interval'] = hl.interval(3, 11)
    doctest_namespace['locus_interval'] = hl.parse_locus_interval("1:53242-90543")
    doctest_namespace['locus'] = hl.locus('1', 1034245)
    doctest_namespace['x'] = hl.literal(3)
    doctest_namespace['y'] = hl.literal(4.5)
    doctest_namespace['s1'] = hl.literal({1, 2, 3})
    doctest_namespace['s2'] = hl.literal({1, 3, 5})
    doctest_namespace['s3'] = hl.literal({'Alice', 'Bob', 'Charlie'})
    doctest_namespace['struct'] = hl.struct(a=5, b='Foo')
    doctest_namespace['tup'] = hl.literal(("a", 1, [1, 2, 3]))
    doctest_namespace['s'] = hl.literal('The quick brown fox')
    doctest_namespace['interval2'] = hl.Interval(3, 6)
    doctest_namespace['nd'] = hl._ndarray([[1, 2], [3, 4]])

    # Overview
    doctest_namespace['ht'] = hl.import_table("data/kt_example1.tsv", impute=True)
    doctest_namespace['mt'] = ds

    gnomad_data = ds.rows()
    doctest_namespace['gnomad_data'] = gnomad_data.select(gnomad_data.info.AF)

    # BGEN
    bgen = hl.import_bgen('data/example.8bits.bgen',
                          entry_fields=['GT', 'GP', 'dosage'])
    doctest_namespace['variants_table'] = bgen.rows()

    burden_ds = hl.import_vcf('data/example_burden.vcf')
    burden_kt = hl.import_table('data/example_burden.tsv', key='Sample', impute=True)
    burden_ds = burden_ds.annotate_cols(burden=burden_kt[burden_ds.s])
    burden_ds = burden_ds.annotate_rows(weight=hl.float64(burden_ds.locus.position))
    burden_ds = hl.variant_qc(burden_ds)
    genekt = hl.import_locus_intervals('data/gene.interval_list')
    burden_ds = burden_ds.annotate_rows(gene=genekt[burden_ds.locus])
    burden_ds = burden_ds.checkpoint(f'{output_dir.name}/example_burden.vds',
                                     overwrite=True)
    doctest_namespace['burden_ds'] = burden_ds

    print("finished setting up doctest...")
def import_gtf(path, reference_genome=None, skip_invalid_contigs=False,
               min_partitions=None) -> hl.Table:
    """Import a GTF file.

    The GTF file format is identical to the GFF version 2 file format,
    and so this function can be used to import GFF version 2 files as
    well.

    See https://www.ensembl.org/info/website/upload/gff.html for more
    details on the GTF/GFF2 file format.

    The :class:`.Table` returned by this function will be keyed by the
    ``interval`` row field and will include the following row fields:

    .. code-block:: text

        'source': str
        'feature': str
        'score': float64
        'strand': str
        'frame': int32
        'interval': interval<>

    There will also be corresponding fields for every tag found in the
    attribute field of the GTF file.

    Note
    ----

    This function will return an ``interval`` field of type :class:`.tinterval`
    constructed from the ``seqname``, ``start``, and ``end`` fields in the
    GTF file. This interval is inclusive of both the start and end positions
    in the GTF file.

    If the ``reference_genome`` parameter is specified, the start and end
    points of the ``interval`` field will be of type :class:`.tlocus`.
    Otherwise, the start and end points of the ``interval`` field will be of
    type :class:`.tstruct` with fields ``seqname`` (type :class:`str`) and
    ``position`` (type :class:`.tint32`).

    Furthermore, if the ``reference_genome`` parameter is specified and
    ``skip_invalid_contigs`` is ``True``, this import function will skip
    lines in the GTF where ``seqname`` is not consistent with the reference
    genome specified.

    Example
    -------

    >>> ht = hl.experimental.import_gtf('data/test.gtf',
    ...                                 reference_genome='GRCh37',
    ...                                 skip_invalid_contigs=True)

    >>> ht.describe()  # doctest: +SKIP_OUTPUT_CHECK
    ----------------------------------------
    Global fields:
    None
    ----------------------------------------
    Row fields:
        'source': str
        'feature': str
        'score': float64
        'strand': str
        'frame': int32
        'gene_type': str
        'exon_id': str
        'havana_transcript': str
        'level': str
        'transcript_name': str
        'gene_status': str
        'gene_id': str
        'transcript_type': str
        'tag': str
        'transcript_status': str
        'gene_name': str
        'transcript_id': str
        'exon_number': str
        'havana_gene': str
        'interval': interval<locus<GRCh37>>
    ----------------------------------------
    Key: ['interval']
    ----------------------------------------

    Parameters
    ----------
    path : :obj:`str`
        File to import.
    reference_genome : :obj:`str` or :class:`.ReferenceGenome`, optional
        Reference genome to use.
    skip_invalid_contigs : :obj:`bool`
        If ``True`` and `reference_genome` is not ``None``, skip lines where
        ``seqname`` is not consistent with the reference genome.
    min_partitions : :obj:`int` or :obj:`None`
        Minimum number of partitions (passed to import_table).

    Returns
    -------
    :class:`.Table`
    """

    ht = hl.import_table(path,
                         min_partitions=min_partitions,
                         comment='#',
                         no_header=True,
                         types={'f3': hl.tint, 'f4': hl.tint,
                                'f5': hl.tfloat, 'f7': hl.tint},
                         missing='.',
                         delimiter='\t')

    ht = ht.rename({'f0': 'seqname', 'f1': 'source', 'f2': 'feature',
                    'f3': 'start', 'f4': 'end', 'f5': 'score',
                    'f6': 'strand', 'f7': 'frame', 'f8': 'attribute'})

    ht = ht.annotate(attribute=hl.dict(
        hl.map(lambda x: (x.split(' ')[0],
                          x.split(' ')[1].replace('"', '').replace(';$', '')),
               ht['attribute'].split('; '))))

    attributes = ht.aggregate(hl.agg.explode(lambda x: hl.agg.collect_as_set(x),
                                             ht['attribute'].keys()))

    ht = ht.transmute(**{x: hl.or_missing(ht['attribute'].contains(x),
                                          ht['attribute'][x])
                         for x in attributes if x})

    if reference_genome:
        if reference_genome == 'GRCh37':
            ht = ht.annotate(seqname=ht['seqname'].replace('^chr', ''))
        else:
            ht = ht.annotate(seqname=hl.case()
                             .when(ht['seqname'].startswith('HLA'), ht['seqname'])
                             .when(ht['seqname'].startswith('chrHLA'),
                                   ht['seqname'].replace('^chr', ''))
                             .when(ht['seqname'].startswith('chr'), ht['seqname'])
                             .default('chr' + ht['seqname']))
        if skip_invalid_contigs:
            valid_contigs = hl.literal(set(hl.get_reference(reference_genome).contigs))
            ht = ht.filter(valid_contigs.contains(ht['seqname']))
        ht = ht.transmute(interval=hl.locus_interval(ht['seqname'],
                                                     ht['start'],
                                                     ht['end'],
                                                     includes_start=True,
                                                     includes_end=True,
                                                     reference_genome=reference_genome))
    else:
        ht = ht.transmute(interval=hl.interval(
            hl.struct(seqname=ht['seqname'], position=ht['start']),
            hl.struct(seqname=ht['seqname'], position=ht['end']),
            includes_start=True,
            includes_end=True))

    ht = ht.key_by('interval')

    return ht
import hail as hl

t = hl.import_table('/Users/bfranco/downloads/gencode.v30.annotation.gff3',
                    no_header=True, impute=True, comment='#')

t2 = t.annotate(GFF_Columns=t.f8.split(";").map(lambda x: x.split("=")))

t2 = t2.filter(t2.f2 == "gene")

# f3 is the start coordinate and f4 the end coordinate; using f3 for both
# endpoints (as the original did) produces an empty interval.
t2 = t2.annotate(interval=hl.interval(hl.locus(t2.f0, t2.f3, 'GRCh38'),
                                      hl.locus(t2.f0, t2.f4, 'GRCh38')))

t2 = t2.annotate(GFF_Columns=hl.dict(t2.GFF_Columns.map(lambda x: (x[0], x[1]))))

t2 = t2.annotate(ID=t2.GFF_Columns["ID"], gene_id=t2.GFF_Columns["gene_id"],
                 gene_name=t2.GFF_Columns["gene_name"],
                 gene_type=t2.GFF_Columns["gene_type"],
                 level=t2.GFF_Columns["level"])

t2 = t2.annotate(type=t2.f2, gene_score=t2.f5, gene_strand=t2.f6, gene_phase=t2.f7)

t2 = t2.drop(t2.GFF_Columns, t2.f8, t2.f0, t2.f1, t2.f2, t2.f3, t2.f4,
             t2.f5, t2.f6, t2.f7)

t2.write('gs://hail-datasets-hail-data/gencode_v30_annotation.mt', overwrite=True)
def segment_intervals(ht, points):
    """Segment the interval keys of `ht` at a given set of points.

    Parameters
    ----------
    ht : :class:`.Table`
        Table with interval keys.
    points : :class:`.Table` or :class:`.ArrayExpression`
        Points at which to segment the intervals, a table or an array.

    Returns
    -------
    :class:`.Table`
    """
    if len(ht.key) != 1 or not isinstance(ht.key[0].dtype, hl.tinterval):
        raise ValueError("'segment_intervals' expects a table with interval keys")
    point_type = ht.key[0].dtype.point_type
    if isinstance(points, Table):
        if len(points.key) != 1 or points.key[0].dtype != point_type:
            raise ValueError(
                "'segment_intervals' expects points to be a table with a single"
                " key of the same type as the intervals in 'ht', or an array of those points:"
                f"\n  expect {point_type}, found {list(points.key.dtype.values())}")
        points = hl.array(hl.set(points.collect(_localize=False)))
    if points.dtype.element_type != point_type:
        raise ValueError(
            f"'segment_intervals' expects points to be a table with a single"
            f" key of the same type as the intervals in 'ht', or an array of those points:"
            f"\n  expect {point_type}, found {points.dtype.element_type}")

    points = hl._sort_by(points, lambda l, r: hl._compare(l, r) < 0)

    ht = ht.annotate_globals(__points=points)

    interval = ht.key[0]
    points = ht.__points
    lower = hl.expr.functions._lower_bound(points, interval.start)
    higher = hl.expr.functions._lower_bound(points, interval.end)
    n_points = hl.len(points)
    lower = hl.if_else((lower < n_points) & (points[lower] == interval.start),
                       lower + 1, lower)
    higher = hl.if_else((higher < n_points) & (points[higher] == interval.end),
                        higher - 1, higher)
    interval_results = hl.rbind(
        lower, higher,
        lambda lower, higher: hl.if_else(
            lower >= higher,
            [interval],
            hl.flatten([
                [hl.interval(interval.start, points[lower],
                             includes_start=interval.includes_start,
                             includes_end=False)],
                hl.range(lower, higher - 1).map(
                    lambda x: hl.interval(points[x], points[x + 1],
                                          includes_start=True,
                                          includes_end=False)),
                [hl.interval(points[higher - 1], interval.end,
                             includes_start=True,
                             includes_end=interval.includes_end)],
            ])))
    ht = ht.annotate(__new_intervals=interval_results,
                     lower=lower, higher=higher).explode('__new_intervals')
    return ht.key_by(**{list(ht.key)[0]: ht.__new_intervals}).drop('__new_intervals')
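# Hedged usage sketch for segment_intervals, mirroring the test earlier in this
# section; the bounds are illustrative. Splitting [0, 10) at points 3 and 7
# yields [0, 3), [3, 7), [7, 10), preserving the original endpoint inclusivity.
intervals = hl.Table.parallelize(
    [hl.struct(interval=hl.interval(0, 10))],
    schema=hl.tstruct(interval=hl.tinterval(hl.tint32)),
    key='interval')
segmented = segment_intervals(intervals, hl.literal([3, 7]))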
def _coerce(self, x):
    assert isinstance(x, hl.expr.IntervalExpression)
    return hl.interval(self.point_type.coerce(x.start),
                       self.point_type.coerce(x.end),
                       includes_start=x.includes_start,
                       includes_end=x.includes_end)
def import_gtf(path, reference_genome=None, skip_invalid_contigs=False,
               min_partitions=None, force_bgz=False, force=False) -> hl.Table:
    """Import a GTF file.

    The GTF file format is identical to the GFF version 2 file format,
    and so this function can be used to import GFF version 2 files as
    well.

    See https://www.ensembl.org/info/website/upload/gff.html for more
    details on the GTF/GFF2 file format.

    The :class:`.Table` returned by this function will be keyed by the
    ``interval`` row field and will include the following row fields:

    .. code-block:: text

        'source': str
        'feature': str
        'score': float64
        'strand': str
        'frame': int32
        'interval': interval<>

    There will also be corresponding fields for every tag found in the
    attribute field of the GTF file.

    Note
    ----

    This function will return an ``interval`` field of type :class:`.tinterval`
    constructed from the ``seqname``, ``start``, and ``end`` fields in the
    GTF file. This interval is inclusive of both the start and end positions
    in the GTF file.

    If the ``reference_genome`` parameter is specified, the start and end
    points of the ``interval`` field will be of type :class:`.tlocus`.
    Otherwise, the start and end points of the ``interval`` field will be of
    type :class:`.tstruct` with fields ``seqname`` (type :class:`str`) and
    ``position`` (type :obj:`.tint32`).

    Furthermore, if the ``reference_genome`` parameter is specified and
    ``skip_invalid_contigs`` is ``True``, this import function will skip
    lines in the GTF where ``seqname`` is not consistent with the reference
    genome specified.

    Example
    -------

    >>> ht = hl.experimental.import_gtf('data/test.gtf',
    ...                                 reference_genome='GRCh37',
    ...                                 skip_invalid_contigs=True)

    >>> ht.describe()  # doctest: +SKIP_OUTPUT_CHECK
    ----------------------------------------
    Global fields:
    None
    ----------------------------------------
    Row fields:
        'source': str
        'feature': str
        'score': float64
        'strand': str
        'frame': int32
        'gene_type': str
        'exon_id': str
        'havana_transcript': str
        'level': str
        'transcript_name': str
        'gene_status': str
        'gene_id': str
        'transcript_type': str
        'tag': str
        'transcript_status': str
        'gene_name': str
        'transcript_id': str
        'exon_number': str
        'havana_gene': str
        'interval': interval<locus<GRCh37>>
    ----------------------------------------
    Key: ['interval']
    ----------------------------------------

    Parameters
    ----------
    path : :class:`str`
        File to import.
    reference_genome : :class:`str` or :class:`.ReferenceGenome`, optional
        Reference genome to use.
    skip_invalid_contigs : :obj:`bool`
        If ``True`` and `reference_genome` is not ``None``, skip lines where
        ``seqname`` is not consistent with the reference genome.
    min_partitions : :obj:`int` or :obj:`None`
        Minimum number of partitions (passed to import_table).
    force_bgz : :obj:`bool`
        If ``True``, load files as blocked gzip files, assuming that they were
        actually compressed using the BGZ codec. This option is useful when the
        file extension is not ``'.bgz'``, but the file is blocked gzip, so that
        the file can be read in parallel and not on a single node.
    force : :obj:`bool`
        If ``True``, load gzipped files serially on one core. This should be
        used only when absolutely necessary, as processing time will be
        increased due to lack of parallelism.

    Returns
    -------
    :class:`.Table`
    """

    ht = hl.import_table(path,
                         min_partitions=min_partitions,
                         comment='#',
                         no_header=True,
                         types={'f3': hl.tint, 'f4': hl.tint,
                                'f5': hl.tfloat, 'f7': hl.tint},
                         missing='.',
                         delimiter='\t',
                         force_bgz=force_bgz,
                         force=force)

    ht = ht.rename({'f0': 'seqname', 'f1': 'source', 'f2': 'feature',
                    'f3': 'start', 'f4': 'end', 'f5': 'score',
                    'f6': 'strand', 'f7': 'frame', 'f8': 'attribute'})

    def parse_attributes(unparsed_attributes):
        def parse_attribute(attribute):
            key_and_value = attribute.split(' ')
            key = key_and_value[0]
            value = key_and_value[1]
            return (key, value.replace('"|;\\$', ''))
        return hl.dict(unparsed_attributes.split('; ').map(parse_attribute))

    ht = ht.annotate(attribute=parse_attributes(ht['attribute']))

    ht = ht.checkpoint(new_temp_file())

    attributes = ht.aggregate(hl.agg.explode(lambda x: hl.agg.collect_as_set(x),
                                             ht['attribute'].keys()))

    ht = ht.transmute(**{x: hl.or_missing(ht['attribute'].contains(x),
                                          ht['attribute'][x])
                         for x in attributes if x})

    if reference_genome:
        if reference_genome.name == 'GRCh37':
            ht = ht.annotate(seqname=hl.case()
                             .when((ht['seqname'] == 'M') | (ht['seqname'] == 'chrM'), 'MT')
                             .when(ht['seqname'].startswith('chr'),
                                   ht['seqname'].replace('^chr', ''))
                             .default(ht['seqname']))
        else:
            ht = ht.annotate(seqname=hl.case()
                             .when(ht['seqname'].startswith('HLA'), ht['seqname'])
                             .when(ht['seqname'].startswith('chrHLA'),
                                   ht['seqname'].replace('^chr', ''))
                             .when(ht['seqname'].startswith('chr'), ht['seqname'])
                             .default('chr' + ht['seqname']))
        if skip_invalid_contigs:
            valid_contigs = hl.literal(set(reference_genome.contigs))
            ht = ht.filter(valid_contigs.contains(ht['seqname']))
        ht = ht.transmute(interval=hl.locus_interval(ht['seqname'],
                                                     ht['start'],
                                                     ht['end'],
                                                     includes_start=True,
                                                     includes_end=True,
                                                     reference_genome=reference_genome))
    else:
        ht = ht.transmute(interval=hl.interval(
            hl.struct(seqname=ht['seqname'], position=ht['start']),
            hl.struct(seqname=ht['seqname'], position=ht['end']),
            includes_start=True,
            includes_end=True))

    ht = ht.key_by('interval')

    return ht
             'SMRRNANM', 'SMVQCFL', 'SMTRSCPT', 'SMMPPDPR', 'SMCGLGTH', 'SMUNPDRD',
             'SMMPPDUN', 'SME2ANTI', 'SMALTALG', 'SME2SNSE', 'SMMFLGTH', 'SMSPLTRD',
             'SME1ANTI', 'SME1SNSE', 'SMNUM5CD']

ht_samples = ht_samples.annotate(**{x: hl.float(ht_samples[x]) for x in float_cols})
ht_samples = ht_samples.annotate(**{x: hl.int(ht_samples[x].replace('.0$', ''))
                                    for x in int_cols})

ht = ht.filter(ht.feature_type == 'gene')
ht = ht.annotate(interval=hl.interval(hl.locus(ht['contig'], ht['start'], 'GRCh37'),
                                      hl.locus(ht['contig'], ht['end'] + 1, 'GRCh37')))
ht = ht.annotate(attributes=hl.dict(
    hl.map(lambda x: (x.split(' ')[0],
                      x.split(' ')[1].replace('"', '').replace(';$', '')),
           ht['attributes'].split('; '))))
attribute_cols = list(ht.aggregate(hl.set(hl.flatten(hl.agg.collect(ht.attributes.keys())))))
# The original referenced an undefined `ht_genes` here; the annotations must
# come from `ht` itself.
ht = ht.annotate(**{x: hl.or_missing(ht.attributes.contains(x), ht.attributes[x])
                    for x in attribute_cols})
ht = ht.select(*['gene_id', 'interval', 'gene_type', 'strand', 'annotation_source',
                 'havana_gene', 'gene_status', 'tag'])
ht = ht.rename({'havana_gene': 'havana_gene_id'})
ht = ht.key_by(ht.gene_id)
    ht = ht.key_by('locus')

if args.d == 'lcr':
    name = 'Ensembl_low_complexity_regions'
    ht = hl.import_table(
        'gs://hail-datasets-extracted-data/Ensembl/{n}.{v}.{rg}.tsv.bgz'.format(
            n=name, v=version, rg=reference_genome),
        types={'start': hl.tint, 'end': hl.tint})
    if reference_genome == 'GRCh38':
        ht = ht.annotate(chromosome='chr' + ht['chromosome'].replace('MT', 'M'))
    ht = ht.annotate(interval=hl.interval(
        hl.locus(ht['chromosome'], ht['start'], reference_genome),
        hl.locus(ht['chromosome'], ht['end'], reference_genome)))
    ht = ht.select('interval')
    ht = ht.key_by('interval')

if args.d in set(['cdna', 'cds', 'ncrna']):
    if args.d == 'cdna':
        name = 'Ensembl_cDNA_regions'
    elif args.d == 'cds':
        name = 'Ensembl_CDS_regions'
    else:
        name = 'Ensembl_ncRNA_regions'
    ht = hl.import_table(
        'gs://hail-datasets-extracted-data/Ensembl/Ensembl_{0}_regions.{1}.{2}.tsv.bgz'.format(
            args.d, version, reference_genome),
        types={