def test_reference_genome(self):
    rg = hl.get_reference('GRCh37')
    self.assertEqual(rg.name, "GRCh37")
    self.assertEqual(rg.contigs[0], "1")
    self.assertListEqual(rg.x_contigs, ["X"])
    self.assertListEqual(rg.y_contigs, ["Y"])
    self.assertListEqual(rg.mt_contigs, ["MT"])
    self.assertEqual(rg.par[0], hl.eval(hl.parse_locus_interval("X:60001-2699521")))
    self.assertEqual(rg.contig_length("1"), 249250621)

    name = "test"
    contigs = ["1", "X", "Y", "MT"]
    lengths = {"1": 10000, "X": 2000, "Y": 4000, "MT": 1000}
    x_contigs = ["X"]
    y_contigs = ["Y"]
    mt_contigs = ["MT"]
    par = [("X", 5, 1000)]

    gr2 = ReferenceGenome(name, contigs, lengths, x_contigs, y_contigs, mt_contigs, par)
    self.assertEqual(gr2.name, name)
    self.assertListEqual(gr2.contigs, contigs)
    self.assertListEqual(gr2.x_contigs, x_contigs)
    self.assertListEqual(gr2.y_contigs, y_contigs)
    self.assertListEqual(gr2.mt_contigs, mt_contigs)
    self.assertEqual(gr2.par, [hl.eval(hl.parse_locus_interval("X:5-1000", gr2))])
    self.assertEqual(gr2.contig_length("1"), 10000)
    self.assertDictEqual(gr2.lengths, lengths)
    gr2.write("/tmp/my_gr.json")
def test_multi_way_zip_join_globals(self):
    t1 = hl.utils.range_table(1).annotate_globals(x=hl.null(hl.tint32))
    t2 = hl.utils.range_table(1).annotate_globals(x=5)
    t3 = hl.utils.range_table(1).annotate_globals(x=0)
    expected = hl.struct(__globals=hl.array([
        hl.struct(x=hl.null(hl.tint32)),
        hl.struct(x=5),
        hl.struct(x=0)]))
    joined = hl.Table._multi_way_zip_join([t1, t2, t3], '__data', '__globals')
    self.assertEqual(hl.eval(joined.globals), hl.eval(expected))
def test_liftover_strand(self):
    grch37 = hl.get_reference('GRCh37')
    grch37.add_liftover(resource('grch37_to_grch38_chr20.over.chain.gz'), 'GRCh38')

    self.assertEqual(hl.eval(hl.liftover(hl.locus('20', 60001, 'GRCh37'),
                                         'GRCh38', include_strand=True)),
                     hl.eval(hl.struct(result=hl.locus('chr20', 79360, 'GRCh38'),
                                       is_negative_strand=False)))

    self.assertEqual(hl.eval(hl.liftover(hl.locus_interval('20', 37007582, 37007586, True, True, 'GRCh37'),
                                         'GRCh38', include_strand=True)),
                     hl.eval(hl.struct(result=hl.locus_interval('chr12', 32563117, 32563121, True, True, 'GRCh38'),
                                       is_negative_strand=True)))

    grch37.remove_liftover("GRCh38")
def test_joins(self):
    kt = hl.utils.range_table(1).key_by().drop('idx')
    kt = kt.annotate(a='foo')
    kt1 = hl.utils.range_table(1).key_by().drop('idx')
    kt1 = kt1.annotate(a='foo', b='bar').key_by('a')
    kt2 = hl.utils.range_table(1).key_by().drop('idx')
    kt2 = kt2.annotate(b='bar', c='baz').key_by('b')
    kt3 = hl.utils.range_table(1).key_by().drop('idx')
    kt3 = kt3.annotate(c='baz', d='qux').key_by('c')
    kt4 = hl.utils.range_table(1).key_by().drop('idx')
    kt4 = kt4.annotate(d='qux', e='quam').key_by('d')

    ktr = kt.annotate(e=kt4[kt3[kt2[kt1[kt.a].b].c].d].e)
    self.assertTrue(ktr.aggregate(agg.collect(ktr.e)) == ['quam'])

    ktr = kt.select(e=kt4[kt3[kt2[kt1[kt.a].b].c].d].e)
    self.assertTrue(ktr.aggregate(agg.collect(ktr.e)) == ['quam'])

    self.assertEqual(kt.filter(kt4[kt3[kt2[kt1[kt.a].b].c].d].e == 'quam').count(), 1)

    m = hl.import_vcf(resource('sample.vcf'))
    vkt = m.rows()
    vkt = vkt.select(vkt.qual)
    vkt = vkt.annotate(qual2=m.index_rows(vkt.key).qual)
    self.assertTrue(vkt.filter(vkt.qual != vkt.qual2).count() == 0)

    m2 = m.annotate_rows(qual2=vkt.index(m.row_key).qual)
    self.assertTrue(m2.filter_rows(m2.qual != m2.qual2).count_rows() == 0)

    m3 = m.annotate_rows(qual2=m.index_rows(m.row_key).qual)
    self.assertTrue(m3.filter_rows(m3.qual != m3.qual2).count_rows() == 0)

    kt5 = hl.utils.range_table(1).annotate(key='C1589').key_by('key')
    m4 = m.annotate_cols(foo=m.s[:5])
    m4 = m4.annotate_cols(idx=kt5[m4.foo].idx)
    n_C1589 = m.filter_cols(m.s[:5] == 'C1589').count_cols()
    self.assertTrue(n_C1589 > 1)
    self.assertEqual(m4.filter_cols(hl.is_defined(m4.idx)).count_cols(), n_C1589)

    kt = hl.utils.range_table(1)
    kt = kt.annotate_globals(foo=5)
    self.assertEqual(hl.eval(kt.foo), 5)

    kt2 = hl.utils.range_table(1)
    kt2 = kt2.annotate_globals(kt_foo=kt.index_globals().foo)
    self.assertEqual(hl.eval(kt2.globals.kt_foo), 5)
def collect(self, _localize=True):
    """Collect all records of an expression into a local list.

    Examples
    --------

    Collect all the values from `C1`:

    >>> table1.C1.collect()
    [2, 2, 10, 11]

    Warning
    -------
    Extremely experimental.

    Warning
    -------
    The list of records may be very large.

    Returns
    -------
    :obj:`list`
    """
    uid = Env.get_uid()
    name, t = self._to_table(uid)
    e = t.collect(_localize=False).map(lambda r: r[name])
    if _localize:
        return hl.eval(e)
    return e
def take(self, n, _localize=True):
    """Collect the first `n` records of an expression.

    Examples
    --------

    Take the first three rows:

    >>> table1.X.take(3)
    [5, 6, 7]

    Warning
    -------
    Extremely experimental.

    Parameters
    ----------
    n : int
        Number of records to take.

    Returns
    -------
    :obj:`list`
    """
    uid = Env.get_uid()
    name, t = self._to_table(uid)
    e = t.take(n, _localize=False).map(lambda r: r[name])
    if _localize:
        return hl.eval(e)
    return e
def impute_sex_aggregator(call,
                          aaf,
                          aaf_threshold=0.0,
                          include_par=False,
                          female_threshold=0.4,
                          male_threshold=0.8) -> hl.Table:
    """:func:`.impute_sex` as an aggregator."""
    mt = call._indices.source
    rg = mt.locus.dtype.reference_genome
    x_contigs = hl.literal(
        hl.eval(
            hl.map(lambda x_contig: hl.parse_locus_interval(x_contig, rg), rg.x_contigs)))
    inbreeding = hl.agg.inbreeding(call, aaf)
    is_female = hl.if_else(
        inbreeding.f_stat < female_threshold,
        True,
        hl.if_else(inbreeding.f_stat > male_threshold,
                   False,
                   # hl.is_missing tests for missingness; a missing bool value
                   # is what is wanted when f_stat is between the thresholds
                   hl.null('tbool')))
    expression = hl.struct(is_female=is_female, **inbreeding)
    if not include_par:
        interval_type = hl.tarray(hl.tinterval(hl.tlocus(rg)))
        par_intervals = hl.literal(rg.par, interval_type)
        expression = hl.agg.filter(
            ~par_intervals.any(lambda par_interval: par_interval.contains(mt.locus)),
            expression)
    expression = hl.agg.filter(
        (aaf > aaf_threshold) & (aaf < (1 - aaf_threshold)), expression)
    expression = hl.agg.filter(
        x_contigs.any(lambda contig: contig.contains(mt.locus)), expression)

    return expression
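# Usage sketch for impute_sex_aggregator (hypothetical path and field names):
# because the return value is an aggregation expression, it is applied inside
# annotate_cols; `aaf` here is an assumed row-indexed alternate allele frequency.
# mt = hl.read_matrix_table('data/genotypes.mt')  # hypothetical path
# mt = mt.annotate_rows(aaf=hl.agg.call_stats(mt.GT, mt.alleles).AF[1])
# sex_ht = mt.annotate_cols(imputed_sex=impute_sex_aggregator(mt.GT, mt.aaf)).cols()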
def _spectral_moments(A, num_moments, p=None, moment_samples=500, block_size=128):
    if not isinstance(A, TallSkinnyMatrix):
        check_entry_indexed('_spectral_moments/entry_expr', A)
        A = _make_tsm_from_call(A, block_size)

    n = A.ncols

    if p is None:
        p = min(num_moments // 2, 10)

    # TODO: When moment_samples > n, we should just do a TSQR on A, and compute
    # the spectrum of R.
    assert moment_samples < n, '_spectral_moments: moment_samples must be smaller than num cols of A'
    G = hl.nd.zeros((n, moment_samples)).map(lambda x: hl.if_else(hl.rand_bool(0.5), -1, 1))
    Q1, R1 = hl.nd.qr(G)._persist()
    fact = _krylov_factorization(A, Q1, p, compute_U=False)
    moments_and_stdevs = hl.eval(fact.spectral_moments(num_moments, R1))
    moments = moments_and_stdevs.moments
    stdevs = moments_and_stdevs.stdevs
    return moments, stdevs
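# Note on the construction above: G is a Rademacher probe matrix (i.i.d. +/-1
# entries), a standard choice for stochastic trace estimation since each column
# g satisfies E[g g^T] = I. Its QR factorization supplies an orthonormal basis
# Q1 for the Krylov factorization, while R1 carries the scaling needed to turn
# the factorization's output into unbiased spectral-moment estimates.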
def main(args):
    # Read mt
    mt = hl.read_matrix_table(args.matrixtable)

    # pca_scores_pop
    pca_scores_pop = hl.read_table(args.pca_scores_population)

    # annotate mt with pop and superpop
    mt = mt.annotate_cols(assigned_pop=pca_scores_pop[mt.s].pop)

    # run sample_qc, then calculate and annotate the heterozygosity rate metric
    mt_with_sampleqc = hl.sample_qc(mt, name='sample_qc')
    mt_with_sampleqc = mt_with_sampleqc.annotate_cols(sample_qc=mt_with_sampleqc.sample_qc.annotate(
        heterozygosity_rate=mt_with_sampleqc.sample_qc.n_het / mt_with_sampleqc.sample_qc.n_called))

    # save sample_qc and heterozygosity table as ht table
    mt_with_sampleqc.write(
        f"{args.output_dir}/ddd-elgh-ukbb/mt_pops_superpops_sampleqc.mt", overwrite=True)
    mt_with_sampleqc.cols().write(
        f"{args.output_dir}/ddd-elgh-ukbb/mt_pops_superpops_sampleqc.ht", overwrite=True)
    pop_ht = hl.read_table(
        f"{args.output_dir}/ddd-elgh-ukbb/mt_pops_superpops_sampleqc.ht")

    # run function on metrics including heterozygosity first for pops:
    qc_metrics = ['heterozygosity_rate', 'n_snp', 'r_ti_tv', 'r_insertion_deletion',
                  'n_insertion', 'n_deletion', 'r_het_hom_var']
    pop_filter_ht = compute_stratified_metrics_filter(
        pop_ht, qc_metrics, ['assigned_pop'])
    # annotate_globals takes keyword arguments, so unpack the evaluated Struct
    pop_ht = pop_ht.annotate_globals(**hl.eval(pop_filter_ht.globals))
    pop_ht = pop_ht.annotate(**pop_filter_ht[pop_ht.key]).persist()

    checkpoint = pop_ht.aggregate(hl.agg.count_where(
        hl.len(pop_ht.qc_metrics_filters) == 0))
    logger.info(f'{checkpoint} exome samples found passing pop filtering')
    pop_ht.write(f"{args.output_dir}/ddd-elgh-ukbb/mt_pops_QC_filters.ht")
def test_ndarray_transpose():
    np_v = np.array([1, 2, 3])
    np_m = np.array([[1, 2, 3], [4, 5, 6]])
    np_cube = np.array([[[1, 2], [3, 4]],
                        [[5, 6], [7, 8]]])
    v = hl.nd.array(np_v)
    m = hl.nd.array(np_m)
    cube = hl.nd.array(np_cube)

    assert_ndarrays_eq(
        (v.T, np_v.T),
        (v.T, np_v),
        (m.T, np_m.T),
        (cube.transpose((0, 2, 1)), np_cube.transpose((0, 2, 1))),
        (cube.T, np_cube.T))

    assert hl.eval(hl.null(hl.tndarray(hl.tfloat, 1)).T) is None

    with pytest.raises(ValueError) as exc:
        v.transpose((1,))
    assert "Invalid axis: 1" in str(exc.value)

    with pytest.raises(ValueError) as exc:
        cube.transpose((1, 1))
    assert "Expected 3 axes, got 2" in str(exc.value)

    with pytest.raises(ValueError) as exc:
        cube.transpose((1, 1, 1))
    assert "Axes cannot contain duplicates" in str(exc.value)
def test_annotate_globals(self):
    mt = hl.utils.range_matrix_table(1, 1)
    ht = hl.utils.range_table(1, 1)
    data = [
        (5, hl.tint, operator.eq),
        (float('nan'), hl.tfloat32, lambda x, y: str(x) == str(y)),
        (float('inf'), hl.tfloat64, lambda x, y: str(x) == str(y)),
        (float('-inf'), hl.tfloat64, lambda x, y: str(x) == str(y)),
        (1.111, hl.tfloat64, operator.eq),
        ([hl.Struct(**{'a': None, 'b': 5}),
          hl.Struct(**{'a': 'hello', 'b': 10})],
         hl.tarray(hl.tstruct(a=hl.tstr, b=hl.tint)), operator.eq)
    ]

    for x, t, f in data:
        self.assertTrue(f(hl.eval(mt.annotate_globals(foo=hl.literal(x, t)).foo), x), f"{x}, {t}")
        self.assertTrue(f(hl.eval(ht.annotate_globals(foo=hl.literal(x, t)).foo), x), f"{x}, {t}")
def setup(path):
    interval = [
        hl.eval(
            hl.parse_locus_interval('chr1:START-END', reference_genome='GRCh38'))
    ]
    return hl.import_vcfs([path], interval, reference_genome='GRCh38')[0]
def _dumps_partitions(partitions, row_key_type):
    parts_type = partitions.dtype
    if not (isinstance(parts_type, hl.tarray)
            and isinstance(parts_type.element_type, hl.tinterval)):
        # fixed NameError: the variable is parts_type, not part_type
        raise ValueError(
            f'partitions type invalid: {parts_type} must be array of intervals')

    point_type = parts_type.element_type.point_type

    f1, t1 = next(iter(row_key_type.items()))
    if point_type == t1:
        partitions = hl.map(
            lambda x: hl.interval(
                start=hl.struct(**{f1: x.start}),
                end=hl.struct(**{f1: x.end}),
                includes_start=True,
                includes_end=False),
            partitions)
    else:
        if not isinstance(point_type, hl.tstruct):
            raise ValueError(
                f'partitions has wrong type: {point_type} must be struct or type of first row key field')
        if not point_type._is_prefix_of(row_key_type):
            raise ValueError(
                f'partitions type invalid: {point_type} must be prefix of {row_key_type}')

    s = json.dumps(partitions.dtype._convert_to_json(hl.eval(partitions)))
    return s, partitions.dtype
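# Illustrative call (hypothetical values): serialize two integer partition bounds
# against a row key with a single int32 field named 'idx'. The bare-int intervals
# take the first branch above and get wrapped into single-field structs.
# parts = hl.literal([hl.Interval(0, 10), hl.Interval(10, 20)],
#                    hl.tarray(hl.tinterval(hl.tint32)))
# json_str, dtype = _dumps_partitions(parts, hl.tstruct(idx=hl.tint32))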
def test_reference_genome_sequence(self):
    gr3 = ReferenceGenome.read(resource("fake_ref_genome.json"))
    self.assertEqual(gr3.name, "my_reference_genome")
    self.assertFalse(gr3.has_sequence())

    gr4 = ReferenceGenome.from_fasta_file("test_rg",
                                          resource("fake_reference.fasta"),
                                          resource("fake_reference.fasta.fai"),
                                          mt_contigs=["b", "c"],
                                          x_contigs=["a"])
    self.assertTrue(gr4.has_sequence())
    self.assertTrue(gr4.x_contigs == ["a"])

    t = hl.import_table(resource("fake_reference.tsv"), impute=True)
    self.assertTrue(hl.eval(t.all(hl.get_sequence(t.contig, t.pos, reference_genome=gr4) == t.base)))

    l = hl.locus("a", 7, gr4)
    self.assertTrue(hl.eval(l.sequence_context(before=3, after=3) == "TTTCGAA"))
def assert_raw_equivalence(hl_ndarray, np_ndarray):
    ndarray_h, ndarray_tau = hl.eval(hl.nd.qr(hl_ndarray, mode="raw"))
    np_ndarray_h, np_ndarray_tau = np.linalg.qr(np_ndarray, mode="raw")

    rank = np.linalg.matrix_rank(np_ndarray)

    assert np.allclose(ndarray_h[:, :rank], np_ndarray_h[:, :rank])
    assert np.allclose(ndarray_tau[:rank], np_ndarray_tau[:rank])
def test_explode_on_set(self):
    t = hl.utils.range_table(1)
    t = t.annotate(a=hl.set(['a', 'b', 'c']))
    t = t.explode('a')
    self.assertEqual(set(t.collect()),
                     hl.eval(hl.set([hl.struct(idx=0, a='a'),
                                     hl.struct(idx=0, a='b'),
                                     hl.struct(idx=0, a='c')])))
def test_value_same_after_parsing(self):
    for t, v in self.values():
        row_v = ir.Literal(t, v)
        map_globals_ir = ir.TableMapGlobals(
            ir.TableRange(1, 1),
            ir.InsertFields(ir.Ref("global"), [("foo", row_v)], None))
        new_globals = hl.eval(hl.Table(map_globals_ir).index_globals())
        self.assertEqual(new_globals, hl.Struct(foo=v))
def test_loop_with_struct_of_strings(self):
    def loop_func(recur_f, my_struct):
        return hl.if_else(hl.len(my_struct.s1) > hl.len(my_struct.s2),
                          my_struct,
                          recur_f(hl.struct(s1=my_struct.s1 + my_struct.s2[-1],
                                            s2=my_struct.s2[:-1])))

    initial_struct = hl.struct(s1="a", s2="gfedcb")
    assert hl.eval(hl.experimental.loop(loop_func,
                                        hl.tstruct(s1=hl.tstr, s2=hl.tstr),
                                        initial_struct)) == hl.Struct(s1="abcd", s2="gfe")
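# Trace of the loop above: each iteration moves the last character of s2 onto
# s1 until s1 is strictly longer than s2.
#   ("a", "gfedcb") -> ("ab", "gfedc") -> ("abc", "gfed") -> ("abcd", "gfe")
# The loop stops at ("abcd", "gfe") because len("abcd") = 4 > 3 = len("gfe"),
# matching the asserted result.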
def assert_ndarrays(asserter, exprs_and_expecteds):
    exprs, expecteds = zip(*exprs_and_expecteds)

    expr_tuple = hl.tuple(exprs)
    evaled_exprs = hl.eval(expr_tuple)

    for (evaled, expected) in zip(evaled_exprs, expecteds):
        assert asserter(evaled, expected)
def export_table_to_elasticsearch(
    table,
    host,
    index_name,
    block_size=5000,
    id_field=None,
    mapping=None,
    num_shards=10,
    port=9200,
    verbose=True,
    es_config=None,
):
    es_client = elasticsearch.Elasticsearch(host, port=port)

    elasticsearch_config = {"es.write.operation": "index"}
    if es_config:
        elasticsearch_config = {**elasticsearch_config, **es_config}

    if id_field is not None:
        elasticsearch_config["es.mapping.id"] = id_field

    if not mapping:
        mapping = elasticsearch_mapping_for_table(table)

    # Delete the index before creating it
    if es_client.indices.exists(index=index_name):
        es_client.indices.delete(index=index_name)

    # TODO This is disabled by default in ES 6+
    mapping["_all"] = {"enabled": "false"}

    mapping["_meta"] = struct_to_dict(hl.eval(table.globals))

    # Hard code type name for all indices
    # Mapping types are removed in ES 7
    type_name = "documents"

    # https://www.elastic.co/guide/en/elasticsearch/reference/current/index-modules.html#index-modules-settings
    request_body = {
        # TODO Mapping types are removed in ES 7
        "mappings": {type_name: mapping},
        "settings": {
            "index.codec": "best_compression",
            "index.mapping.total_fields.limit": 10000,
            "index.number_of_replicas": 0,
            "index.number_of_shards": num_shards,
            "index.refresh_interval": -1,
        },
    }

    es_client.indices.create(index=index_name, body=request_body)

    hl.export_elasticsearch(table, host, port, index_name, type_name, block_size,
                            elasticsearch_config, verbose)

    es_client.indices.forcemerge(index=index_name)
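# Minimal usage sketch (hypothetical host, path, and field names): export a Hail
# table to a local Elasticsearch node, routing document ids through 'variant_id'.
# ht = hl.read_table('gs://bucket/variants.ht')  # hypothetical path
# export_table_to_elasticsearch(ht, host='localhost', index_name='variants',
#                               id_field='variant_id', num_shards=2)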
def test_ndarray_eval():
    data_list = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
    nd_expr = hl._ndarray(data_list)
    evaled = hl.eval(nd_expr)
    np_equiv = np.array(data_list, dtype=np.int32)
    assert np.array_equal(evaled, np_equiv)
    assert evaled.strides == np_equiv.strides

    assert hl.eval(hl._ndarray([[], []])).strides == (8, 8)

    assert np.array_equal(hl.eval(hl._ndarray([])), np.array([]))

    zero_array = np.zeros((10, 10), dtype=np.int64)
    evaled_zero_array = hl.eval(hl.literal(zero_array))

    assert np.array_equal(evaled_zero_array, zero_array)
    assert zero_array.dtype == evaled_zero_array.dtype

    # Testing from hail arrays
    assert np.array_equal(hl.eval(hl._ndarray(hl.range(6))), np.arange(6))
    assert np.array_equal(hl.eval(hl._ndarray(hl.int64(4))), np.array(4))

    # Testing missing data
    assert hl.eval(hl._ndarray(hl.null(hl.tarray(hl.tint32)))) is None

    with pytest.raises(ValueError) as exc:
        hl._ndarray([[4], [1, 2, 3], 5])
    assert "inner dimensions do not match" in str(exc.value)
def test_ndarray():
    a1 = hl.eval(hl.nd.array((1, 2, 3)))
    a2 = hl.eval(hl.nd.array([1, 2, 3]))
    an1 = np.array((1, 2, 3))
    an2 = np.array([1, 2, 3])
    assert np.array_equal(a1, a2) and np.array_equal(a2, an2)

    a1 = hl.eval(hl.nd.array(((1), (2), (3))))
    a2 = hl.eval(hl.nd.array(([1], [2], [3])))
    a3 = hl.eval(hl.nd.array([[1], [2], [3]]))

    an1 = np.array(((1), (2), (3)))
    an2 = np.array(([1], [2], [3]))
    an3 = np.array([[1], [2], [3]])

    assert np.array_equal(a1, an1) and np.array_equal(a2, an2) and np.array_equal(a3, an3)

    a1 = hl.eval(hl.nd.array(((1, 2), (2, 5), (3, 8))))
    a2 = hl.eval(hl.nd.array([[1, 2], [2, 5], [3, 8]]))

    an1 = np.array(((1, 2), (2, 5), (3, 8)))
    an2 = np.array([[1, 2], [2, 5], [3, 8]])

    assert np.array_equal(a1, an1) and np.array_equal(a2, an2)
def assert_complete_equivalence(hl_ndarray, np_ndarray):
    q, r = hl.eval(hl.nd.qr(hl_ndarray, mode="complete"))
    nq, nr = np.linalg.qr(np_ndarray, mode="complete")

    rank = np.linalg.matrix_rank(np_ndarray)

    assert np.allclose(q[:, :rank], nq[:, :rank])
    assert np.allclose(r, nr)
    assert np.allclose(q @ r, np_ndarray)
def assert_ndarrays(asserter, exprs_and_expecteds):
    exprs, expecteds = zip(*exprs_and_expecteds)

    expr_tuple = hl.tuple(exprs)
    evaled_exprs = hl.eval(expr_tuple)

    evaled_and_expected = zip(evaled_exprs, expecteds)
    for (idx, (evaled, expected)) in enumerate(evaled_and_expected):
        assert asserter(evaled, expected), f"NDArray comparison {idx} failed"
def main(args):
    # init hail
    hl.init(default_reference=args.default_ref_genome)

    # input MT
    mt = hl.read_matrix_table(args.mt_input_path)

    # filter high-quality genotype
    # mt = filter_genotypes_ab(mt)

    # import capture interval table (intersect)
    intervals = hl.read_table(args.ht_intervals)

    # generate an interval x sample MT by computing per intervals callrate
    mt_callrate = compute_callrate_mt(mt=mt, intervals_ht=intervals)

    # run pca
    eigenvalues, ht_pca, _ = run_platform_pca(
        callrate_mt=mt_callrate, binarization_threshold=args.binarization_threshold)

    # normalize eigenvalues (0-100)
    eigenvalues_norm = [x / sum(eigenvalues) * 100 for x in eigenvalues]

    # compute eigenvalues cumulative sum
    ev_cumsum = hl.array_scan(lambda i, j: i + j, 0, hl.array(eigenvalues_norm))

    # get optimal number of PCs (those which explain 99% of the variance)
    n_optimal_pcs = hl.eval(hl.len(ev_cumsum.filter(lambda x: x < 99.0)))

    logger.info(
        f"Keep only principal components which explain up to 99% of the variance. "
        f"Number of optimal PCs found: {n_optimal_pcs}")

    # filter out uninformative PCs
    ht_pca = ht_pca.annotate(scores=ht_pca.scores[:n_optimal_pcs])

    # apply unsupervised clustering on PCs to infer samples platform
    ht_platform = assign_platform_from_pcs(
        platform_pca_scores_ht=ht_pca,
        pc_scores_ann='scores',
        hdbscan_min_cluster_size=args.hdbscan_min_cluster_size,
        hdbscan_min_samples=args.hdbscan_min_cluster_size)

    ht_platform.show()

    # write HT
    ht_platform.write(output=args.ht_output_path, overwrite=args.overwrite)

    # export to file if true
    if args.write_to_file:
        ht_platform.export(f'{args.ht_output_path}.tsv.bgz')

    hl.stop()
def test_value_same_after_parsing(self):
    for t, v in self.values():
        row_v = ir.Literal(t, v)
        map_globals_ir = ir.TableMapGlobals(
            ir.TableRange(1, 1),
            ir.InsertFields(
                ir.Ref("global"),
                [("foo", row_v)]))
        new_globals = hl.eval(hl.Table(map_globals_ir).globals)
        self.assertEqual(new_globals, hl.Struct(foo=v))
def test_ndarray_map():
    a = hl.nd.array([[2, 3, 4], [5, 6, 7]])
    b = hl.map(lambda x: -x, a)
    c = hl.map(lambda x: True, a)

    assert_ndarrays_eq(
        (b, [[-2, -3, -4], [-5, -6, -7]]),
        (c, [[True, True, True], [True, True, True]]))

    assert hl.eval(hl.null(hl.tndarray(hl.tfloat, 1)).map(lambda x: x * 2)) is None
def test_ndarray_matmul():
    np_v = np.array([1, 2])
    np_m = np.array([[1, 2], [3, 4]])
    np_r = np.array([[1, 2, 3], [4, 5, 6]])
    np_cube = np.arange(8).reshape((2, 2, 2))
    np_rect_prism = np.arange(12).reshape((3, 2, 2))
    np_broadcasted_mat = np.arange(4).reshape((1, 2, 2))
    np_six_dim_tensor = np.arange(3 * 7 * 1 * 9 * 4 * 5).reshape((3, 7, 1, 9, 4, 5))
    np_five_dim_tensor = np.arange(7 * 5 * 1 * 5 * 3).reshape((7, 5, 1, 5, 3))

    v = hl._nd.array(np_v)
    m = hl._nd.array(np_m)
    r = hl._nd.array(np_r)
    cube = hl._nd.array(np_cube)
    rect_prism = hl._nd.array(np_rect_prism)
    broadcasted_mat = hl._nd.array(np_broadcasted_mat)
    six_dim_tensor = hl._nd.array(np_six_dim_tensor)
    five_dim_tensor = hl._nd.array(np_five_dim_tensor)

    assert_ndarrays_eq(
        (v @ v, np_v @ np_v),
        (m @ m, np_m @ np_m),
        (m @ m.T, np_m @ np_m.T),
        (r @ r.T, np_r @ np_r.T),
        (v @ m, np_v @ np_m),
        (m @ v, np_m @ np_v),
        (cube @ cube, np_cube @ np_cube),
        (cube @ v, np_cube @ np_v),
        (v @ cube, np_v @ np_cube),
        (cube @ m, np_cube @ np_m),
        (m @ cube, np_m @ np_cube),
        (rect_prism @ m, np_rect_prism @ np_m),
        (m @ rect_prism, np_m @ np_rect_prism),
        (m @ rect_prism.T, np_m @ np_rect_prism.T),
        (broadcasted_mat @ rect_prism, np_broadcasted_mat @ np_rect_prism),
        (six_dim_tensor @ five_dim_tensor, np_six_dim_tensor @ np_five_dim_tensor)
    )

    assert hl.eval(hl.null(hl.tndarray(hl.tfloat64, 2)) @ hl.null(hl.tndarray(hl.tfloat64, 2))) is None
    assert hl.eval(hl.null(hl.tndarray(hl.tint64, 2)) @ hl._nd.array(np.arange(10).reshape(5, 2))) is None
    assert hl.eval(hl._nd.array(np.arange(10).reshape(5, 2)) @ hl.null(hl.tndarray(hl.tint64, 2))) is None

    with pytest.raises(ValueError):
        m @ 5

    with pytest.raises(ValueError):
        m @ hl._nd.array(5)

    with pytest.raises(ValueError):
        cube @ hl._nd.array(5)

    with pytest.raises(FatalError) as exc:
        hl.eval(r @ r)
    assert "Matrix dimensions incompatible: 3 2" in str(exc)

    with pytest.raises(FatalError) as exc:
        hl.eval(hl._nd.array([1, 2]) @ hl._nd.array([1, 2, 3]))
    assert "Matrix dimensions incompatible" in str(exc)
def getVariantStats(ipvDict, studyPerVariant, centersPerHomoVus, inList, outList):
    allVariants = ipvDict.keys()
    variantsDict = dict()
    for v in allVariants:
        vClass = ipvDict[v]['class']
        vPopFreq = '%.4f' % (ipvDict[v]['maxFreq'])
        vCohortFreq = '%.4f' % (ipvDict[v]['cohortFreq'])
        aa = str(ipvDict[v]['aa'])
        Aa = str(ipvDict[v]['Aa'])
        AA = str(ipvDict[v]['AA'])
        F = str(ipvDict[v]['F'])
        Z = str(ipvDict[v]['Z'])
        p = (2 * int(AA) + int(Aa)) / (2 * (int(AA) + int(Aa) + int(aa)))
        q = 1 - p
        exonic = str(ipvDict[v]['exonic'])
        chisquare = str(ipvDict[v]['chisquare'])
        if len(ipvDict[v]['homozygous individuals']) == 0:
            homoSample = "None"
        else:
            homoSample = ipvDict[v]['homozygous individuals'][0]
        if len(ipvDict[v]['heterozygous individuals']) == 0:
            heteroSample = "None"
        else:
            heteroSample = ipvDict[v]['heterozygous individuals'][0]
        v = v.replace(' ', '')
        v = v.replace("'", "")
        study = studyPerVariant[v]
        if v in inList:
            vIn = 'True'
        elif v in outList:
            vIn = 'False'
        else:
            vIn = 'NA'
        variantsDict[v] = dict()
        variantsDict[v]['class'] = vClass
        variantsDict[v]['popFreq'] = vPopFreq
        variantsDict[v]['cohortFreq'] = vCohortFreq
        variantsDict[v]['homozygousSample'] = homoSample
        variantsDict[v]['heterozygousSample'] = heteroSample
        variantsDict[v]['inGnomad'] = vIn
        variantsDict[v]['aa'] = aa
        variantsDict[v]['Aa'] = Aa
        variantsDict[v]['AA'] = AA
        variantsDict[v]['hail_hweafp'] = hl.eval(hl.hardy_weinberg_test(int(AA), int(Aa), int(aa))).p_value
        variantsDict[v]['F'] = F
        variantsDict[v]['Z'] = Z
        variantsDict[v]['p'] = p
        variantsDict[v]['q'] = q
        variantsDict[v]['chisquare'] = chisquare
        variantsDict[v]['sequenceCenter'] = str(centersPerHomoVus[v]).replace(" ", "")
        variantsDict[v]['exonic'] = exonic
        variantsDict[v]['study'] = study
    return variantsDict
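# Sanity check for the p/q computation above (AA = hom-ref, Aa = het,
# aa = hom-alt): with AA=60, Aa=30, aa=10 there are 2*100 alleles, of which
# 2*60 + 30 = 150 are reference, so p = 150/200 = 0.75 and q = 0.25. The same
# three genotype counts feed hl.hardy_weinberg_test(n_hom_ref, n_het, n_hom_var).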
def export_table_to_elasticsearch(table, host, index_name, block_size=5000, id_field=None,
                                  mapping=None, num_shards=10, port=9200, verbose=True):
    es_client = elasticsearch.Elasticsearch(host, port=port)

    if not mapping:
        mapping = elasticsearch_mapping_for_table(table)

    # Delete the index before creating it
    if es_client.indices.exists(index=index_name):
        es_client.indices.delete(index=index_name)

    mapping["_meta"] = dict(hl.eval(table.globals))

    # https://www.elastic.co/guide/en/elasticsearch/reference/current/index-modules.html#index-modules-settings
    request_body = {
        "mappings": mapping,
        "settings": {
            "index.codec": "best_compression",
            "index.mapping.total_fields.limit": 10000,
            "index.number_of_replicas": 0,
            "index.number_of_shards": num_shards,
            "index.refresh_interval": -1,
        },
    }

    es_client.indices.create(index=index_name, body=request_body)

    temp_file = "table-tmp.json.txt"
    table = table.key_by()
    table.select(json=hl.json(table.row_value)).export(temp_file, header=False)

    buffer = []
    with open(temp_file) as f:
        for line in f:
            data = json.loads(line)
            buffer.append(data)
            if len(buffer) >= block_size:
                helpers.bulk(es_client, build_bulk_request(buffer, index_name, id_field))
                buffer = []

    if buffer:
        helpers.bulk(es_client, build_bulk_request(buffer, index_name, id_field))
        buffer = []

    es_client.indices.forcemerge(index=index_name)
def make_index_dict(ht):
    '''
    Create a look-up Dictionary for entries contained in the frequency annotation array

    :param Table ht: Table containing freq_meta global annotation to be indexed
    :return: Dictionary keyed by grouping combinations in the frequency array, with values
        describing the corresponding index of each grouping entry in the frequency array
    :rtype: Dict of str: int
    '''
    freq_meta = hl.eval(ht.globals.freq_meta)
    index_dict = make_freq_meta_index_dict(freq_meta)
    return index_dict
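# Illustrative input/output shapes (hypothetical values): 'freq_meta' is typically
# a list of grouping dicts such as
#   [{'group': 'adj'}, {'group': 'raw'}, {'group': 'adj', 'pop': 'nfe'}]
# and the resulting index dict maps each grouping label to its position, e.g.
#   {'adj': 0, 'raw': 1, 'nfe_adj': 2}
# so that ht.freq[index_dict['nfe_adj']] retrieves the matching frequency entry.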
def test_annotation(self, mock_load_gencode):
    mock_load_gencode.return_value = GENE_ID_MAPPING
    rows = annotate_fields(self.mt, TEST_GENCODE_RELEASE, TEST_GENCODE_PATH)
    mock_load_gencode.assert_called_with(TEST_GENCODE_RELEASE, download_path=TEST_GENCODE_PATH)
    row_dict = {row['variantId']: row for row in rows.take(11)}
    self.assertListEqual(
        [row_dict[row] for row in ['CPX_chr1_1', 'DUP_chr1_1', 'INS_chr1_10']],
        hl.eval([VARIANT_CPX, VARIANT_DUP, VARIANT_INS]))
def test_loop_memory(self):
    def foo(recur, arr, idx):
        return hl.if_else(idx > 10, arr, recur(arr.append(hl.str(idx)), idx + 1))

    assert hl.eval(
        hl.experimental.loop(foo, hl.tarray(hl.tstr), hl.literal(['foo']), 1)) == [
            'foo', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10']
def test_concatenate():
    x = np.array([[1., 2.], [3., 4.]])
    y = np.array([[5.], [6.]])
    np_res = np.concatenate([x, y], axis=1)

    res = hl.eval(hl.nd.concatenate([x, y], axis=1))
    assert np.array_equal(np_res, res)

    x = np.array([[1], [3]])
    y = np.array([[5], [6]])
    seq = [x, y]
    np_res = np.concatenate(seq)
    res = hl.eval(hl.nd.concatenate(seq))
    assert np.array_equal(np_res, res)

    seq = (x, y)
    np_res = np.concatenate(seq)
    res = hl.eval(hl.nd.concatenate(seq))
    assert np.array_equal(np_res, res)
def test_hstack():
    ht = hl.utils.range_table(10)

    def assert_table(a, b):
        ht2 = ht.annotate(x=hl.nd.array(a), y=hl.nd.array(b))
        ht2 = ht2.annotate(stacked=hl.nd.hstack([ht2.x, ht2.y]))
        assert np.array_equal(ht2.collect()[0].stacked, np.hstack([a, b]))

    a = np.array([1, 2, 3])
    b = np.array([2, 3, 4])

    assert np.array_equal(hl.eval(hl.nd.hstack((a, b))), np.hstack((a, b)))
    assert np.array_equal(hl.eval(hl.nd.hstack(hl.array([a, b]))), np.hstack((a, b)))
    assert_table(a, b)

    a = np.array([[1], [2], [3]])
    b = np.array([[2], [3], [4]])
    assert np.array_equal(hl.eval(hl.nd.hstack((a, b))), np.hstack((a, b)))
    assert np.array_equal(hl.eval(hl.nd.hstack(hl.array([a, b]))), np.hstack((a, b)))
    assert_table(a, b)
def test_lgt_to_gt():
    call_0_0_f = hl.call(0, 0, phased=False)
    call_0_0_t = hl.call(0, 0, phased=True)

    call_0_1_f = hl.call(0, 1, phased=False)
    call_2_0_t = hl.call(2, 0, phased=True)

    call_1 = hl.call(1, phased=False)

    la = [0, 3, 5]

    assert hl.eval(tuple(hl.vds.lgt_to_gt(c, la)
                         for c in [call_0_0_f, call_0_0_t, call_0_1_f, call_2_0_t, call_1])) == \
        tuple([hl.Call([0, 0], phased=False),
               hl.Call([0, 0], phased=True),
               hl.Call([0, 3], phased=False),
               hl.Call([5, 0], phased=True),
               hl.Call([3], phased=False)])
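# The translation above is index lookup into the local-alleles array la = [0, 3, 5]:
# local allele 1 maps to global allele la[1] = 3 and local allele 2 to la[2] = 5,
# while reference allele 0, ploidy, and phasing are preserved unchanged. So the
# phased local call 2|0 becomes the phased global call 5|0.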
def test_matrix_filter_intervals(self):
    ds = hl.import_vcf(resource('sample.vcf'), min_partitions=20)

    self.assertEqual(
        hl.filter_intervals(ds, [hl.parse_locus_interval('20:10639222-10644705')]).count_rows(), 3)

    intervals = [hl.parse_locus_interval('20:10639222-10644700'),
                 hl.parse_locus_interval('20:10644700-10644705')]
    self.assertEqual(hl.filter_intervals(ds, intervals).count_rows(), 3)

    intervals = hl.array([hl.parse_locus_interval('20:10639222-10644700'),
                          hl.parse_locus_interval('20:10644700-10644705')])
    self.assertEqual(hl.filter_intervals(ds, intervals).count_rows(), 3)

    intervals = hl.array([hl.eval(hl.parse_locus_interval('20:10639222-10644700')),
                          hl.parse_locus_interval('20:10644700-10644705')])
    self.assertEqual(hl.filter_intervals(ds, intervals).count_rows(), 3)

    intervals = [hl.eval(hl.parse_locus_interval('[20:10019093-10026348]')),
                 hl.eval(hl.parse_locus_interval('[20:17705793-17716416]'))]
    self.assertEqual(hl.filter_intervals(ds, intervals).count_rows(), 4)
def overlaps(self, interval):
    """True if the supplied interval contains any value in common with this one.

    Parameters
    ----------
    interval : :class:`.Interval`
        Interval object with the same point type.

    Returns
    -------
    :obj:`bool`
    """
    return hl.eval(hl.literal(self, hl.tinterval(self._point_type)).overlaps(interval))
def contains(self, value):
    """True if `value` is contained within the interval.

    Examples
    --------

    >>> interval2.contains(5)
    True

    >>> interval2.contains(6)
    False

    Parameters
    ----------
    value :
        Object with type :meth:`.point_type`.

    Returns
    -------
    :obj:`bool`
    """
    return hl.eval(hl.literal(self, hl.tinterval(self._point_type)).contains(value))
def manhattan(pvals, locus=None, title=None, size=4, hover_fields=None, collect_all=False,
              n_divisions=500, significance_line=5e-8):
    """Create a Manhattan plot. (https://en.wikipedia.org/wiki/Manhattan_plot)

    Parameters
    ----------
    pvals : :class:`.Float64Expression`
        P-values to be plotted.
    locus : :class:`.LocusExpression`
        Locus values to be plotted.
    title : str
        Title of the plot.
    size : int
        Size of markers in screen space units.
    hover_fields : Dict[str, :class:`.Expression`]
        Dictionary of field names and values to be shown in the HoverTool of the plot.
    collect_all : bool
        Whether to collect all values or downsample before plotting.
    n_divisions : int
        Factor by which to downsample (default value = 500). A lower input results
        in fewer output datapoints.
    significance_line : float, optional
        p-value at which to add a horizontal, dotted red line indicating genome-wide
        significance. If ``None``, no line is added.

    Returns
    -------
    :class:`bokeh.plotting.figure.Figure`
    """
    if locus is None:
        locus = pvals._indices.source.locus

    ref = locus.dtype.reference_genome

    if hover_fields is None:
        hover_fields = {}

    hover_fields['locus'] = hail.str(locus)

    pvals = -hail.log10(pvals)

    source_pd = _collect_scatter_plot_data(
        ('_global_locus', locus.global_position()),
        ('_pval', pvals),
        fields=hover_fields,
        n_divisions=None if collect_all else n_divisions
    )
    source_pd['p_value'] = [10 ** (-p) for p in source_pd['_pval']]
    source_pd['_contig'] = [locus.split(":")[0] for locus in source_pd['locus']]

    observed_contigs = set(source_pd['_contig'])
    observed_contigs = [contig for contig in ref.contigs.copy() if contig in observed_contigs]
    contig_ticks = hail.eval([hail.locus(contig, int(ref.lengths[contig] / 2)).global_position()
                              for contig in observed_contigs])
    color_mapper = CategoricalColorMapper(factors=ref.contigs,
                                          palette=palette[:2] * int((len(ref.contigs) + 1) / 2))

    p = figure(title=title, x_axis_label='Chromosome',
               y_axis_label='P-value (-log10 scale)', width=1000)
    p, _, legend, _, _, _ = _get_scatter_plot_elements(
        p, source_pd, x_col='_global_locus', y_col='_pval',
        label_cols=['_contig'], colors={'_contig': color_mapper},
        size=size
    )
    legend.visible = False
    p.xaxis.ticker = contig_ticks
    p.xaxis.major_label_overrides = dict(zip(contig_ticks, observed_contigs))
    p.select_one(HoverTool).tooltips = [t for t in p.select_one(HoverTool).tooltips
                                        if not t[0].startswith('_')]

    if significance_line is not None:
        p.renderers.append(Span(location=-log10(significance_line),
                                dimension='width',
                                line_color='red',
                                line_dash='dashed',
                                line_width=1.5))

    return p
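# Usage sketch for manhattan (hypothetical path and field names; assumes a GWAS
# results table keyed by locus with a row-indexed p-value field):
# gwas = hl.read_table('gwas_results.ht')  # hypothetical path
# plot = manhattan(gwas.p_value, locus=gwas.locus, title='GWAS')
# from bokeh.io import show
# show(plot)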
def test_define_function(self):
    f = hl.experimental.define_function(
        lambda a, b: (a + 7) * b, hl.tint32, hl.tint32)
    self.assertEqual(hl.eval(f(1, 3)), 24)
def filter_intervals(ds, intervals, keep=True) -> Union[Table, MatrixTable]:
    """Filter rows with a list of intervals.

    Examples
    --------

    Filter to loci falling within one interval:

    >>> ds_result = hl.filter_intervals(dataset, [hl.parse_locus_interval('17:38449840-38530994')])

    Remove all loci within list of intervals:

    >>> intervals = [hl.parse_locus_interval(x) for x in ['1:50M-75M', '2:START-400000', '3-22']]
    >>> ds_result = hl.filter_intervals(dataset, intervals, keep=False)

    Notes
    -----
    Based on the ``keep`` argument, this method will either restrict to points
    in the supplied interval ranges, or remove all rows in those ranges.

    When ``keep=True``, partitions that don't overlap any supplied interval
    will not be loaded at all.  This enables :func:`.filter_intervals` to be
    used for reasonably low-latency queries of small ranges of the dataset,
    even on large datasets.

    Parameters
    ----------
    ds : :class:`.MatrixTable` or :class:`.Table`
        Dataset to filter.
    intervals : :class:`.ArrayExpression` of type :py:data:`.tinterval`
        Intervals to filter on.  The point type of the interval must
        be a prefix of the key or equal to the first field of the key.
    keep : :obj:`bool`
        If ``True``, keep only rows that fall within any interval in `intervals`.
        If ``False``, keep only rows that fall outside all intervals in
        `intervals`.

    Returns
    -------
    :class:`.MatrixTable` or :class:`.Table`
    """
    if isinstance(ds, MatrixTable):
        k_type = ds.row_key.dtype
    else:
        assert isinstance(ds, Table)
        k_type = ds.key.dtype

    point_type = intervals.dtype.element_type.point_type

    def is_struct_prefix(partial, full):
        if list(partial) != list(full)[:len(partial)]:
            return False
        for k, v in partial.items():
            if full[k] != v:
                return False
        return True

    if point_type == k_type[0]:
        needs_wrapper = True
        point_type = hl.tstruct(foo=point_type)
    elif isinstance(point_type, tstruct) and is_struct_prefix(point_type, k_type):
        needs_wrapper = False
    else:
        raise TypeError(
            "The point type is incompatible with key type of the dataset ('{}', '{}')".format(
                repr(point_type), repr(k_type)))

    def wrap_input(interval):
        if interval is None:
            raise TypeError("'filter_intervals' does not allow missing values in 'intervals'.")
        elif needs_wrapper:
            return Interval(Struct(foo=interval.start),
                            Struct(foo=interval.end),
                            interval.includes_start,
                            interval.includes_end)
        else:
            return interval

    intervals_type = intervals.dtype
    intervals = hl.eval(intervals)
    intervals = hl.tarray(hl.tinterval(point_type))._convert_to_json(
        [wrap_input(i) for i in intervals])

    if isinstance(ds, MatrixTable):
        config = {
            'name': 'MatrixFilterIntervals',
            'keyType': point_type._parsable_string(),
            'intervals': intervals,
            'keep': keep
        }
        return MatrixTable(MatrixToMatrixApply(ds._mir, config))
    else:
        config = {
            'name': 'TableFilterIntervals',
            'keyType': point_type._parsable_string(),
            'intervals': intervals,
            'keep': keep
        }
        return Table(TableToTableApply(ds._tir, config))
def locus_windows(locus_expr, radius, coord_expr=None, _localize=True):
    """Returns start and stop indices for window around each locus.

    Examples
    --------

    Windows with 2bp radius for one contig with positions 1, 2, 3, 4, 5:

    >>> starts, stops = hl.linalg.utils.locus_windows(
    ...     hl.balding_nichols_model(1, 5, 5).locus,
    ...     radius=2)
    >>> starts, stops
    (array([0, 0, 0, 1, 2]), array([3, 4, 5, 5, 5]))

    The following examples involve three contigs.

    >>> loci = [{'locus': hl.Locus('1', 1), 'cm': 1.0},
    ...         {'locus': hl.Locus('1', 2), 'cm': 3.0},
    ...         {'locus': hl.Locus('1', 4), 'cm': 4.0},
    ...         {'locus': hl.Locus('2', 1), 'cm': 2.0},
    ...         {'locus': hl.Locus('2', 1), 'cm': 2.0},
    ...         {'locus': hl.Locus('3', 3), 'cm': 5.0}]
    >>> ht = hl.Table.parallelize(
    ...     loci,
    ...     hl.tstruct(locus=hl.tlocus('GRCh37'), cm=hl.tfloat64),
    ...     key=['locus'])

    Windows with 1bp radius:

    >>> hl.linalg.utils.locus_windows(ht.locus, 1)
    (array([0, 0, 2, 3, 3, 5]), array([2, 2, 3, 5, 5, 6]))

    Windows with 1cm radius:

    >>> hl.linalg.utils.locus_windows(ht.locus, 1.0, coord_expr=ht.cm)
    (array([0, 1, 1, 3, 3, 5]), array([1, 3, 3, 5, 5, 6]))

    Notes
    -----
    This function returns two 1-dimensional ndarrays of integers,
    ``starts`` and ``stops``, each of size equal to the number of rows.

    By default, for all indices ``i``, ``[starts[i], stops[i])`` is the maximal
    range of row indices ``j`` such that ``contig[i] == contig[j]`` and
    ``position[i] - radius <= position[j] <= position[i] + radius``.

    If the :meth:`.global_position` on `locus_expr` is not in ascending order,
    this method will fail.  Ascending order should hold for a matrix table
    keyed by locus or variant (and the associated row table), or for a table
    that has been ordered by `locus_expr`.

    Set `coord_expr` to use a value other than position to define the windows.
    This row-indexed numeric expression must be non-missing, non-``nan``, on
    the same source as `locus_expr`, and ascending with respect to locus
    position for each contig; otherwise the function will fail.

    The last example above uses centimorgan coordinates, so
    ``[starts[i], stops[i])`` is the maximal range of row indices ``j`` such
    that ``contig[i] == contig[j]`` and
    ``cm[i] - radius <= cm[j] <= cm[i] + radius``.

    Index ranges are start-inclusive and stop-exclusive.  This function is
    especially useful in conjunction with
    :meth:`.BlockMatrix.sparsify_row_intervals`.

    Parameters
    ----------
    locus_expr : :class:`.LocusExpression`
        Row-indexed locus expression on a table or matrix table.
    radius: :obj:`int`
        Radius of window for row values.
    coord_expr: :class:`.Float64Expression`, optional
        Row-indexed numeric expression for the row value.
        Must be on the same table or matrix table as `locus_expr`.
        By default, the row value is given by the locus position.

    Returns
    -------
    (:class:`ndarray` of :obj:`int64`, :class:`ndarray` of :obj:`int64`)
        Tuple of start indices array and stop indices array.
    """
    if radius < 0:
        raise ValueError(f"locus_windows: 'radius' must be non-negative, found {radius}")
    check_row_indexed('locus_windows', locus_expr)
    if coord_expr is not None:
        check_row_indexed('locus_windows', coord_expr)

    src = locus_expr._indices.source
    if locus_expr not in src._fields_inverse:
        locus = Env.get_uid()
        annotate_fields = {locus: locus_expr}

        if coord_expr is not None:
            if coord_expr not in src._fields_inverse:
                coords = Env.get_uid()
                annotate_fields[coords] = coord_expr
            else:
                coords = src._fields_inverse[coord_expr]

        if isinstance(src, hl.MatrixTable):
            new_src = src.annotate_rows(**annotate_fields)
        else:
            new_src = src.annotate(**annotate_fields)

        locus_expr = new_src[locus]
        if coord_expr is not None:
            coord_expr = new_src[coords]

    if coord_expr is None:
        coord_expr = locus_expr.position

    rg = locus_expr.dtype.reference_genome
    contig_group_expr = hl.agg.group_by(hl.locus(locus_expr.contig, 1, reference_genome=rg),
                                        hl.agg.collect(coord_expr))

    # check loci are in sorted order
    last_pos = hl.fold(
        lambda a, elt: (hl.case()
                        .when(a <= elt, elt)
                        .or_error("locus_windows: 'locus_expr' global position must be in ascending order.")),
        -1,
        hl.agg.collect(hl.case()
                       .when(hl.is_defined(locus_expr), locus_expr.global_position())
                       .or_error("locus_windows: missing value for 'locus_expr'.")))
    checked_contig_groups = (hl.case()
                             .when(last_pos >= 0, contig_group_expr)
                             .or_error("locus_windows: 'locus_expr' has length 0"))

    contig_groups = locus_expr._aggregation_method()(checked_contig_groups, _localize=False)

    coords = hl.sorted(hl.array(contig_groups)).map(lambda t: t[1])
    starts_and_stops = hl._locus_windows_per_contig(coords, radius)

    if not _localize:
        return starts_and_stops

    starts, stops = hl.eval(starts_and_stops)
    return np.array(starts), np.array(stops)