Example #1
    def test_reference_genome(self):
        rg = hl.get_reference('GRCh37')
        self.assertEqual(rg.name, "GRCh37")
        self.assertEqual(rg.contigs[0], "1")
        self.assertListEqual(rg.x_contigs, ["X"])
        self.assertListEqual(rg.y_contigs, ["Y"])
        self.assertListEqual(rg.mt_contigs, ["MT"])
        self.assertEqual(rg.par[0], hl.eval(hl.parse_locus_interval("X:60001-2699521")))
        self.assertEqual(rg.contig_length("1"), 249250621)

        name = "test"
        contigs = ["1", "X", "Y", "MT"]
        lengths = {"1": 10000, "X": 2000, "Y": 4000, "MT": 1000}
        x_contigs = ["X"]
        y_contigs = ["Y"]
        mt_contigs = ["MT"]
        par = [("X", 5, 1000)]

        gr2 = ReferenceGenome(name, contigs, lengths, x_contigs, y_contigs, mt_contigs, par)
        self.assertEqual(gr2.name, name)
        self.assertListEqual(gr2.contigs, contigs)
        self.assertListEqual(gr2.x_contigs, x_contigs)
        self.assertListEqual(gr2.y_contigs, y_contigs)
        self.assertListEqual(gr2.mt_contigs, mt_contigs)
        self.assertEqual(gr2.par, [hl.eval(hl.parse_locus_interval("X:5-1000", gr2))])
        self.assertEqual(gr2.contig_length("1"), 10000)
        self.assertDictEqual(gr2.lengths, lengths)
        gr2.write("/tmp/my_gr.json")
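A follow-up sketch: in a fresh Hail session, the serialized genome can presumably be read back with ReferenceGenome.read, the same constructor used in Example #15 below (reading it in the same session would clash with the already-registered "test" genome):

gr_loaded = ReferenceGenome.read("/tmp/my_gr.json")
assert gr_loaded.name == "test"
assert gr_loaded.contig_length("1") == 10000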
Example #2
    def test_multi_way_zip_join_globals(self):
        t1 = hl.utils.range_table(1).annotate_globals(x=hl.null(hl.tint32))
        t2 = hl.utils.range_table(1).annotate_globals(x=5)
        t3 = hl.utils.range_table(1).annotate_globals(x=0)
        expected = hl.struct(__globals=hl.array([
            hl.struct(x=hl.null(hl.tint32)),
            hl.struct(x=5),
            hl.struct(x=0)]))
        joined = hl.Table._multi_way_zip_join([t1, t2, t3], '__data', '__globals')
        self.assertEqual(hl.eval(joined.globals), hl.eval(expected))
Example #3
    def test_liftover_strand(self):
        grch37 = hl.get_reference('GRCh37')
        grch37.add_liftover(resource('grch37_to_grch38_chr20.over.chain.gz'), 'GRCh38')

        self.assertEqual(hl.eval(hl.liftover(hl.locus('20', 60001, 'GRCh37'), 'GRCh38', include_strand=True)),
                         hl.eval(hl.struct(result=hl.locus('chr20', 79360, 'GRCh38'), is_negative_strand=False)))

        self.assertEqual(hl.eval(hl.liftover(hl.locus_interval('20', 37007582, 37007586, True, True, 'GRCh37'),
                                             'GRCh38', include_strand=True)),
                         hl.eval(hl.struct(result=hl.locus_interval('chr12', 32563117, 32563121, True, True, 'GRCh38'),
                                           is_negative_strand=True)))

        grch37.remove_liftover("GRCh38")
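Outside of tests, a hypothetical guard avoids registering the chain file twice (the path here is a placeholder):

rg = hl.get_reference('GRCh37')
if not rg.has_liftover('GRCh38'):
    rg.add_liftover('grch37_to_grch38.over.chain.gz', 'GRCh38')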
Example #4
    def test_joins(self):
        kt = hl.utils.range_table(1).key_by().drop('idx')
        kt = kt.annotate(a='foo')

        kt1 = hl.utils.range_table(1).key_by().drop('idx')
        kt1 = kt1.annotate(a='foo', b='bar').key_by('a')

        kt2 = hl.utils.range_table(1).key_by().drop('idx')
        kt2 = kt2.annotate(b='bar', c='baz').key_by('b')

        kt3 = hl.utils.range_table(1).key_by().drop('idx')
        kt3 = kt3.annotate(c='baz', d='qux').key_by('c')

        kt4 = hl.utils.range_table(1).key_by().drop('idx')
        kt4 = kt4.annotate(d='qux', e='quam').key_by('d')

        ktr = kt.annotate(e=kt4[kt3[kt2[kt1[kt.a].b].c].d].e)
        self.assertTrue(ktr.aggregate(agg.collect(ktr.e)) == ['quam'])

        ktr = kt.select(e=kt4[kt3[kt2[kt1[kt.a].b].c].d].e)
        self.assertTrue(ktr.aggregate(agg.collect(ktr.e)) == ['quam'])

        self.assertEqual(kt.filter(kt4[kt3[kt2[kt1[kt.a].b].c].d].e == 'quam').count(), 1)

        m = hl.import_vcf(resource('sample.vcf'))
        vkt = m.rows()
        vkt = vkt.select(vkt.qual)
        vkt = vkt.annotate(qual2=m.index_rows(vkt.key).qual)
        self.assertTrue(vkt.filter(vkt.qual != vkt.qual2).count() == 0)

        m2 = m.annotate_rows(qual2=vkt.index(m.row_key).qual)
        self.assertTrue(m2.filter_rows(m2.qual != m2.qual2).count_rows() == 0)

        m3 = m.annotate_rows(qual2=m.index_rows(m.row_key).qual)
        self.assertTrue(m3.filter_rows(m3.qual != m3.qual2).count_rows() == 0)

        kt5 = hl.utils.range_table(1).annotate(key='C1589').key_by('key')
        m4 = m.annotate_cols(foo=m.s[:5])
        m4 = m4.annotate_cols(idx=kt5[m4.foo].idx)
        n_C1589 = m.filter_cols(m.s[:5] == 'C1589').count_cols()
        self.assertTrue(n_C1589 > 1)
        self.assertEqual(m4.filter_cols(hl.is_defined(m4.idx)).count_cols(), n_C1589)

        kt = hl.utils.range_table(1)
        kt = kt.annotate_globals(foo=5)
        self.assertEqual(hl.eval(kt.foo), 5)

        kt2 = hl.utils.range_table(1)

        kt2 = kt2.annotate_globals(kt_foo=kt.index_globals().foo)
        self.assertEqual(hl.eval(kt2.globals.kt_foo), 5)
Example #5
    def collect(self, _localize=True):
        """Collect all records of an expression into a local list.

        Examples
        --------

        Collect all the values from `C1`:

        >>> table1.C1.collect()
        [2, 2, 10, 11]

        Warning
        -------
        Extremely experimental.

        Warning
        -------
        The list of records may be very large.

        Returns
        -------
        :obj:`list`
        """
        uid = Env.get_uid()
        name, t = self._to_table(uid)
        e = t.collect(_localize=False).map(lambda r: r[name])
        if _localize:
            return hl.eval(e)
        return e
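A small usage sketch for the _localize flag, assuming the table1 doctest fixture above: with _localize=False the result stays an expression, so it can be composed further before a single hl.eval.

e = table1.C1.collect(_localize=False)  # ArrayExpression; nothing runs yet
hl.eval(e.map(lambda x: x + 1))         # [3, 3, 11, 12]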
Example #6
    def take(self, n, _localize=True):
        """Collect the first `n` records of an expression.

        Examples
        --------

        Take the first three rows:

        >>> table1.X.take(3)
        [5, 6, 7]

        Warning
        -------
        Extremely experimental.

        Parameters
        ----------
        n : int
            Number of records to take.

        Returns
        -------
        :obj:`list`
        """
        uid = Env.get_uid()
        name, t = self._to_table(uid)
        e = t.take(n, _localize=False).map(lambda r: r[name])
        if _localize:
            return hl.eval(e)
        return e
Example #7
def impute_sex_aggregator(call,
                          aaf,
                          aaf_threshold=0.0,
                          include_par=False,
                          female_threshold=0.4,
                          male_threshold=0.8) -> hl.expr.StructExpression:
    """:func:`.impute_sex` as an aggregator."""
    mt = call._indices.source
    rg = mt.locus.dtype.reference_genome
    x_contigs = hl.literal(
        hl.eval(
            hl.map(lambda x_contig: hl.parse_locus_interval(x_contig, rg),
                   rg.x_contigs)))
    inbreeding = hl.agg.inbreeding(call, aaf)
    is_female = hl.if_else(
        inbreeding.f_stat < female_threshold, True,
        hl.if_else(inbreeding.f_stat > male_threshold, False,
                   hl.null(hl.tbool)))
    expression = hl.struct(is_female=is_female, **inbreeding)
    if not include_par:
        interval_type = hl.tarray(hl.tinterval(hl.tlocus(rg)))
        par_intervals = hl.literal(rg.par, interval_type)
        expression = hl.agg.filter(
            ~par_intervals.any(
                lambda par_interval: par_interval.contains(mt.locus)),
            expression)
    expression = hl.agg.filter(
        (aaf > aaf_threshold) & (aaf < (1 - aaf_threshold)), expression)
    expression = hl.agg.filter(
        x_contigs.any(lambda contig: contig.contains(mt.locus)), expression)

    return expression
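A hypothetical call site for the aggregator (mt is assumed to be a locus-keyed MatrixTable with a GT entry field; deriving aaf from call_stats is one common choice, not something the function mandates):

mt = mt.annotate_rows(aaf=hl.agg.call_stats(mt.GT, mt.alleles).AF[1])
sex_ht = mt.annotate_cols(imputed=impute_sex_aggregator(mt.GT, mt.aaf)).cols()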
Example #8
def _spectral_moments(A,
                      num_moments,
                      p=None,
                      moment_samples=500,
                      block_size=128):
    if not isinstance(A, TallSkinnyMatrix):
        check_entry_indexed('_spectral_moments/entry_expr', A)
        A = _make_tsm_from_call(A, block_size)

    n = A.ncols

    if p is None:
        p = min(num_moments // 2, 10)

    # TODO: When moment_samples > n, we should just do a TSQR on A, and compute
    # the spectrum of R.
    assert moment_samples < n, '_spectral_moments: moment_samples must be smaller than num cols of A'
    # Rademacher sketch matrix; the lambda argument is renamed so it does not shadow n
    G = hl.nd.zeros(
        (n, moment_samples)).map(lambda _: hl.if_else(hl.rand_bool(0.5), -1, 1))
    Q1, R1 = hl.nd.qr(G)._persist()
    fact = _krylov_factorization(A, Q1, p, compute_U=False)
    moments_and_stdevs = hl.eval(fact.spectral_moments(num_moments, R1))
    moments = moments_and_stdevs.moments
    stdevs = moments_and_stdevs.stdevs
    return moments, stdevs
Example #9
def main(args):

    # Read mt
    mt = hl.read_matrix_table(args.matrixtable)
    # pca_scores_pop
    pca_scores_pop = hl.read_table(args.pca_scores_population)

    # annotate mt with pop and superpop
    mt = mt.annotate_cols(assigned_pop=pca_scores_pop[mt.s].pop)

    # do sample_qc
    # calculate and annotate with metric heterozygosity
    mt_with_sampleqc = hl.sample_qc(mt, name='sample_qc')

    mt_with_sampleqc = mt_with_sampleqc.annotate_cols(sample_qc=mt_with_sampleqc.sample_qc.annotate(
        heterozygosity_rate=mt_with_sampleqc.sample_qc.n_het/mt_with_sampleqc.sample_qc.n_called))
    # save sample_qc and heterozygosity table as ht table
    mt_with_sampleqc.write(
        f"{args.output_dir}/ddd-elgh-ukbb/mt_pops_superpops_sampleqc.mt", overwrite=True)
    mt_with_sampleqc.cols().write(
        f"{args.output_dir}/ddd-elgh-ukbb/mt_pops_superpops_sampleqc.ht",  overwrite=True)
    pop_ht = hl.read_table(
        f"{args.output_dir}/ddd-elgh-ukbb/mt_pops_superpops_sampleqc.ht")
    # run function on metrics including heterozygosity first for pops:
    qc_metrics = ['heterozygosity_rate', 'n_snp', 'r_ti_tv',
                  'r_insertion_deletion', 'n_insertion', 'n_deletion', 'r_het_hom_var']
    pop_filter_ht = compute_stratified_metrics_filter(
        pop_ht, qc_metrics, ['assigned_pop'])
    pop_ht = pop_ht.annotate_globals(**hl.eval(pop_filter_ht.globals))
    pop_ht = pop_ht.annotate(**pop_filter_ht[pop_ht.key]).persist()

    checkpoint = pop_ht.aggregate(hl.agg.count_where(
        hl.len(pop_ht.qc_metrics_filters) == 0))
    logger.info(f'{checkpoint} exome samples found passing pop filtering')
    pop_ht.write(f"{args.output_dir}/ddd-elgh-ukbb/mt_pops_QC_filters.ht")
Example #10
def test_ndarray_transpose():
    np_v = np.array([1, 2, 3])
    np_m = np.array([[1, 2, 3], [4, 5, 6]])
    np_cube = np.array([[[1, 2],
                         [3, 4]],
                        [[5, 6],
                         [7, 8]]])
    v = hl.nd.array(np_v)
    m = hl.nd.array(np_m)
    cube = hl.nd.array(np_cube)

    assert_ndarrays_eq(
        (v.T, np_v.T),
        (v.T, np_v),
        (m.T, np_m.T),
        (cube.transpose((0, 2, 1)), np_cube.transpose((0, 2, 1))),
        (cube.T, np_cube.T))

    assert hl.eval(hl.null(hl.tndarray(hl.tfloat, 1)).T) is None

    with pytest.raises(ValueError) as exc:
        v.transpose((1,))
    assert "Invalid axis: 1" in str(exc.value)

    with pytest.raises(ValueError) as exc:
        cube.transpose((1, 1))
    assert "Expected 3 axes, got 2" in str(exc.value)

    with pytest.raises(ValueError) as exc:
        cube.transpose((1, 1, 1))
    assert "Axes cannot contain duplicates" in str(exc.value)
Example #11
    def test_annotate_globals(self):
        mt = hl.utils.range_matrix_table(1, 1)
        ht = hl.utils.range_table(1, 1)
        data = [
            (5, hl.tint, operator.eq),
            (float('nan'), hl.tfloat32, lambda x, y: str(x) == str(y)),
            (float('inf'), hl.tfloat64, lambda x, y: str(x) == str(y)),
            (float('-inf'), hl.tfloat64, lambda x, y: str(x) == str(y)),
            (1.111, hl.tfloat64, operator.eq),
            ([hl.Struct(**{'a': None, 'b': 5}),
              hl.Struct(**{'a': 'hello', 'b': 10})], hl.tarray(hl.tstruct(a=hl.tstr, b=hl.tint)), operator.eq)
        ]

        for x, t, f in data:
            self.assertTrue(f(hl.eval(mt.annotate_globals(foo=hl.literal(x, t)).foo), x), f"{x}, {t}")
            self.assertTrue(f(hl.eval(ht.annotate_globals(foo=hl.literal(x, t)).foo), x), f"{x}, {t}")
Example #12
def setup(path):
    interval = [
        hl.eval(
            hl.parse_locus_interval('chr1:START-END',
                                    reference_genome='GRCh38'))
    ]
    return hl.import_vcfs([path], interval, reference_genome='GRCh38')[0]
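For context, the START and END tokens in the interval string expand to the first and last positions of the contig, so the interval below should cover all of chr1 (treat the exact end coordinate as an assumption):

iv = hl.eval(hl.parse_locus_interval('chr1:START-END', reference_genome='GRCh38'))
iv.start  # Locus(contig=chr1, position=1, reference_genome=GRCh38)
iv.end    # presumably chr1:248956422, the GRCh38 chr1 length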
Example #13
def _dumps_partitions(partitions, row_key_type):
    parts_type = partitions.dtype
    if not (isinstance(parts_type, hl.tarray)
            and isinstance(parts_type.element_type, hl.tinterval)):
        raise ValueError(
            f'partitions type invalid: {parts_type} must be array of intervals')

    point_type = parts_type.element_type.point_type

    f1, t1 = next(iter(row_key_type.items()))
    if point_type == t1:
        partitions = hl.map(
            lambda x: hl.interval(start=hl.struct(**{f1: x.start}),
                                  end=hl.struct(**{f1: x.end}),
                                  includes_start=True,
                                  includes_end=False), partitions)
    else:
        if not isinstance(point_type, hl.tstruct):
            raise ValueError(
                f'partitions has wrong type: {point_type} must be struct or type of first row key field'
            )
        if not point_type._is_prefix_of(row_key_type):
            raise ValueError(
                f'partitions type invalid: {point_type} must be prefix of {row_key_type}'
            )

    s = json.dumps(partitions.dtype._convert_to_json(hl.eval(partitions)))
    return s, partitions.dtype
Example #15
    def test_reference_genome_sequence(self):
        gr3 = ReferenceGenome.read(resource("fake_ref_genome.json"))
        self.assertEqual(gr3.name, "my_reference_genome")
        self.assertFalse(gr3.has_sequence())

        gr4 = ReferenceGenome.from_fasta_file("test_rg", resource("fake_reference.fasta"),
                                              resource("fake_reference.fasta.fai"),
                                              mt_contigs=["b", "c"], x_contigs=["a"])
        self.assertTrue(gr4.has_sequence())
        self.assertTrue(gr4.x_contigs == ["a"])

        t = hl.import_table(resource("fake_reference.tsv"), impute=True)
        self.assertTrue(hl.eval(t.all(hl.get_sequence(t.contig, t.pos, reference_genome=gr4) == t.base)))

        l = hl.locus("a", 7, gr4)
        self.assertTrue(hl.eval(l.sequence_context(before=3, after=3) == "TTTCGAA"))
Example #16
    def assert_raw_equivalence(hl_ndarray, np_ndarray):
        ndarray_h, ndarray_tau = hl.eval(hl.nd.qr(hl_ndarray, mode="raw"))
        np_ndarray_h, np_ndarray_tau = np.linalg.qr(np_ndarray, mode="raw")

        rank = np.linalg.matrix_rank(np_ndarray)

        assert np.allclose(ndarray_h[:, :rank], np_ndarray_h[:, :rank])
        assert np.allclose(ndarray_tau[:rank], np_ndarray_tau[:rank])
Example #17
    def test_explode_on_set(self):
        t = hl.utils.range_table(1)
        t = t.annotate(a=hl.set(['a', 'b', 'c']))
        t = t.explode('a')
        self.assertEqual(set(t.collect()),
                         hl.eval(hl.set([hl.struct(idx=0, a='a'),
                                         hl.struct(idx=0, a='b'),
                                         hl.struct(idx=0, a='c')])))
Example #18
    def test_value_same_after_parsing(self):
        for t, v in self.values():
            row_v = ir.Literal(t, v)
            map_globals_ir = ir.TableMapGlobals(
                ir.TableRange(1, 1),
                ir.InsertFields(ir.Ref("global"), [("foo", row_v)], None))
            new_globals = hl.eval(hl.Table(map_globals_ir).index_globals())
            self.assertEqual(new_globals, hl.Struct(foo=v))
Example #20
    def test_loop_with_struct_of_strings(self):
        def loop_func(recur_f, my_struct):
            return hl.if_else(hl.len(my_struct.s1) > hl.len(my_struct.s2),
                              my_struct,
                              recur_f(hl.struct(s1=my_struct.s1 + my_struct.s2[-1], s2=my_struct.s2[:-1])))

        initial_struct = hl.struct(s1="a", s2="gfedcb")
        assert hl.eval(hl.experimental.loop(loop_func, hl.tstruct(s1=hl.tstr, s2=hl.tstr), initial_struct)) == hl.Struct(s1="abcd", s2="gfe")
Example #21
def assert_ndarrays(asserter, exprs_and_expecteds):
    exprs, expecteds = zip(*exprs_and_expecteds)

    expr_tuple = hl.tuple(exprs)
    evaled_exprs = hl.eval(expr_tuple)

    for (evaled, expected) in zip(evaled_exprs, expecteds):
        assert asserter(evaled, expected)
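A tiny usage sketch for this helper; the choice of np.array_equal as the asserter is arbitrary:

assert_ndarrays(np.array_equal, [
    (hl.nd.array([1, 2]) + 1, np.array([2, 3])),
    (hl.nd.array([[1], [2]]).T, np.array([[1, 2]])),
])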
Example #22
def export_table_to_elasticsearch(
    table,
    host,
    index_name,
    block_size=5000,
    id_field=None,
    mapping=None,
    num_shards=10,
    port=9200,
    verbose=True,
    es_config=None,
):
    es_client = elasticsearch.Elasticsearch(host, port=port)

    elasticsearch_config = {"es.write.operation": "index"}

    if es_config:
        elasticsearch_config = {**elasticsearch_config, **es_config}

    if id_field is not None:
        elasticsearch_config["es.mapping.id"] = id_field

    if not mapping:
        mapping = elasticsearch_mapping_for_table(table)

    # Delete the index before creating it
    if es_client.indices.exists(index=index_name):
        es_client.indices.delete(index=index_name)

    # TODO This is disabled by default in ES 6+
    mapping["_all"] = {"enabled": "false"}
    mapping["_meta"] = struct_to_dict(hl.eval(table.globals))

    # Hard code type name for all indices
    # Mapping types are removed in ES 7
    type_name = "documents"

    # https://www.elastic.co/guide/en/elasticsearch/reference/current/index-modules.html#index-modules-settings
    request_body = {
        # TODO Mapping types are removed in ES 7
        "mappings": {
            type_name: mapping
        },
        "settings": {
            "index.codec": "best_compression",
            "index.mapping.total_fields.limit": 10000,
            "index.number_of_replicas": 0,
            "index.number_of_shards": num_shards,
            "index.refresh_interval": -1,
        },
    }

    es_client.indices.create(index=index_name, body=request_body)

    hl.export_elasticsearch(table, host, port, index_name, type_name,
                            block_size, elasticsearch_config, verbose)

    es_client.indices.forcemerge(index=index_name)
Example #23
def test_ndarray_eval():
    data_list = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
    nd_expr = hl._ndarray(data_list)
    evaled = hl.eval(nd_expr)
    np_equiv = np.array(data_list, dtype=np.int32)
    assert(np.array_equal(evaled, np_equiv))
    assert(evaled.strides == np_equiv.strides)

    assert hl.eval(hl._ndarray([[], []])).strides == (8, 8)
    assert np.array_equal(hl.eval(hl._ndarray([])), np.array([]))

    zero_array = np.zeros((10, 10), dtype=np.int64)
    evaled_zero_array = hl.eval(hl.literal(zero_array))

    assert np.array_equal(evaled_zero_array, zero_array)
    assert zero_array.dtype == evaled_zero_array.dtype

    # Testing from hail arrays
    assert np.array_equal(hl.eval(hl._ndarray(hl.range(6))), np.arange(6))
    assert np.array_equal(hl.eval(hl._ndarray(hl.int64(4))), np.array(4))

    # Testing missing data
    assert hl.eval(hl._ndarray(hl.null(hl.tarray(hl.tint32)))) is None

    with pytest.raises(ValueError) as exc:
        hl._ndarray([[4], [1, 2, 3], 5])
    assert "inner dimensions do not match" in str(exc.value)
Example #24
def test_ndarray():
    a1 = hl.eval(hl.nd.array((1, 2, 3)))
    a2 = hl.eval(hl.nd.array([1, 2, 3]))
    an1 = np.array((1, 2, 3))
    an2 = np.array([1, 2, 3])

    assert (np.array_equal(a1, a2) and np.array_equal(a2, an2))

    a1 = hl.eval(hl.nd.array(((1), (2), (3))))
    a2 = hl.eval(hl.nd.array(([1], [2], [3])))
    a3 = hl.eval(hl.nd.array([[1], [2], [3]]))

    an1 = np.array(((1), (2), (3)))
    an2 = np.array(([1], [2], [3]))
    an3 = np.array([[1], [2], [3]])

    assert (np.array_equal(a1, an1) and np.array_equal(a2, an2)
            and np.array_equal(a3, an3))

    a1 = hl.eval(hl.nd.array(((1, 2), (2, 5), (3, 8))))
    a2 = hl.eval(hl.nd.array([[1, 2], [2, 5], [3, 8]]))

    an1 = np.array(((1, 2), (2, 5), (3, 8)))
    an2 = np.array([[1, 2], [2, 5], [3, 8]])

    assert (np.array_equal(a1, an1) and np.array_equal(a2, an2))
Example #25
    def assert_complete_equivalence(hl_ndarray, np_ndarray):
        q, r = hl.eval(hl.nd.qr(hl_ndarray, mode="complete"))
        nq, nr = np.linalg.qr(np_ndarray, mode="complete")

        rank = np.linalg.matrix_rank(np_ndarray)

        assert np.allclose(q[:, :rank], nq[:, :rank])
        assert np.allclose(r, nr)
        assert np.allclose(q @ r, np_ndarray)
Example #26
def assert_ndarrays(asserter, exprs_and_expecteds):
    exprs, expecteds = zip(*exprs_and_expecteds)

    expr_tuple = hl.tuple(exprs)
    evaled_exprs = hl.eval(expr_tuple)

    evaled_and_expected = zip(evaled_exprs, expecteds)
    for (idx, (evaled, expected)) in enumerate(evaled_and_expected):
        assert asserter(evaled, expected), f"NDArray comparison {idx} failed"
Example #27
def main(args):

    # init hail
    hl.init(default_reference=args.default_ref_genome)

    # input MT
    mt = hl.read_matrix_table(args.mt_input_path)

    # filter high-quality genotype
    # mt = filter_genotypes_ab(mt)

    # import capture interval table (intersect)
    intervals = hl.read_table(args.ht_intervals)

    # generate an interval x sample MT by computing per intervals callrate
    mt_callrate = compute_callrate_mt(mt=mt, intervals_ht=intervals)

    # run pca
    eigenvalues, ht_pca, _ = run_platform_pca(
        callrate_mt=mt_callrate,
        binarization_threshold=args.binarization_threshold)

    # normalize eigenvalues (0-100)
    eigenvalues_norm = [x / sum(eigenvalues) * 100 for x in eigenvalues]

    # compute eigenvalues cumulative sum
    ev_cumsum = hl.array_scan(lambda i, j: i + j, 0,
                              hl.array(eigenvalues_norm))

    # getting optimal number of PCs (those which explain 99% of the variance)
    n_optimal_pcs = hl.eval(hl.len(ev_cumsum.filter(lambda x: x < 99.0)))

    logger.info(
        f"Keep only principal components which explain up to 99% of the variance. Number of optimal PCs found: {n_optimal_pcs}"
    )

    # filter out uninformative PCs
    ht_pca = ht_pca.annotate(scores=ht_pca.scores[:n_optimal_pcs])

    # apply unsupervised clustering on PCs to infer samples platform
    ht_platform = assign_platform_from_pcs(
        platform_pca_scores_ht=ht_pca,
        pc_scores_ann='scores',
        hdbscan_min_cluster_size=args.hdbscan_min_cluster_size,
        hdbscan_min_samples=args.hdbscan_min_cluster_size)

    ht_platform.show()

    # write HT
    ht_platform.write(output=args.ht_output_path, overwrite=args.overwrite)

    # export to file if true
    if args.write_to_file:
        (ht_platform.export(f'{args.ht_output_path}.tsv.bgz'))

    hl.stop()
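For intuition on the cumulative-sum step above, a minimal hl.array_scan sketch. Note that the scan keeps its zero element, so the output is one element longer than the input; the x < 99.0 filter above counts that leading element as well, which is worth keeping in mind when interpreting n_optimal_pcs.

hl.eval(hl.array_scan(lambda i, j: i + j, 0.0, hl.array([40.0, 35.0, 15.0, 10.0])))
# [0.0, 40.0, 75.0, 90.0, 100.0]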
Example #28
    def test_value_same_after_parsing(self):
        for t, v in self.values():
            row_v = ir.Literal(t, v)
            map_globals_ir = ir.TableMapGlobals(
                ir.TableRange(1, 1),
                ir.InsertFields(
                    ir.Ref("global"),
                    [("foo", row_v)]))
            new_globals = hl.eval(hl.Table(map_globals_ir).globals)
            self.assertEqual(new_globals, hl.Struct(foo=v))
Example #29
def test_ndarray_map():
    a = hl.nd.array([[2, 3, 4], [5, 6, 7]])
    b = hl.map(lambda x: -x, a)
    c = hl.map(lambda x: True, a)

    assert_ndarrays_eq((b, [[-2, -3, -4], [-5, -6, -7]]),
                       (c, [[True, True, True], [True, True, True]]))

    assert hl.eval(hl.null(hl.tndarray(hl.tfloat,
                                       1)).map(lambda x: x * 2)) is None
Example #30
def test_ndarray_matmul():
    np_v = np.array([1, 2])
    np_m = np.array([[1, 2], [3, 4]])
    np_r = np.array([[1, 2, 3], [4, 5, 6]])
    np_cube = np.arange(8).reshape((2, 2, 2))
    np_rect_prism = np.arange(12).reshape((3, 2, 2))
    np_broadcasted_mat = np.arange(4).reshape((1, 2, 2))
    np_six_dim_tensor = np.arange(3 * 7 * 1 * 9 * 4 * 5).reshape((3, 7, 1, 9, 4, 5))
    np_five_dim_tensor = np.arange(7 * 5 * 1 * 5 * 3).reshape((7, 5, 1, 5, 3))

    v = hl._nd.array(np_v)
    m = hl._nd.array(np_m)
    r = hl._nd.array(np_r)
    cube = hl._nd.array(np_cube)
    rect_prism = hl._nd.array(np_rect_prism)
    broadcasted_mat = hl._nd.array(np_broadcasted_mat)
    six_dim_tensor = hl._nd.array(np_six_dim_tensor)
    five_dim_tensor = hl._nd.array(np_five_dim_tensor)

    assert_ndarrays_eq(
        (v @ v, np_v @ np_v),
        (m @ m, np_m @ np_m),
        (m @ m.T, np_m @ np_m.T),
        (r @ r.T, np_r @ np_r.T),
        (v @ m, np_v @ np_m),
        (m @ v, np_m @ np_v),
        (cube @ cube, np_cube @ np_cube),
        (cube @ v, np_cube @ np_v),
        (v @ cube, np_v @ np_cube),
        (cube @ m, np_cube @ np_m),
        (m @ cube, np_m @ np_cube),
        (rect_prism @ m, np_rect_prism @ np_m),
        (m @ rect_prism, np_m @ np_rect_prism),
        (m @ rect_prism.T, np_m @ np_rect_prism.T),
        (broadcasted_mat @ rect_prism, np_broadcasted_mat @ np_rect_prism),
        (six_dim_tensor @ five_dim_tensor, np_six_dim_tensor @ np_five_dim_tensor)
    )

    assert hl.eval(hl.null(hl.tndarray(hl.tfloat64, 2)) @ hl.null(hl.tndarray(hl.tfloat64, 2))) is None
    assert hl.eval(hl.null(hl.tndarray(hl.tint64, 2)) @ hl._nd.array(np.arange(10).reshape(5, 2))) is None
    assert hl.eval(hl._nd.array(np.arange(10).reshape(5, 2)) @ hl.null(hl.tndarray(hl.tint64, 2))) is None

    with pytest.raises(ValueError):
        m @ 5

    with pytest.raises(ValueError):
        m @ hl._nd.array(5)

    with pytest.raises(ValueError):
        cube @ hl._nd.array(5)

    with pytest.raises(FatalError) as exc:
        hl.eval(r @ r)
    assert "Matrix dimensions incompatible: 3 2" in str(exc)

    with pytest.raises(FatalError) as exc:
        hl.eval(hl._nd.array([1, 2]) @ hl._nd.array([1, 2, 3]))
    assert "Matrix dimensions incompatible" in str(exc)
Example #31
def getVariantStats(ipvDict, studyPerVariant, centersPerHomoVus, inList, outList):
	allVariants = ipvDict.keys()
	variantsDict = dict()
	for v in allVariants:
		vClass = ipvDict[v]['class']
		vPopFreq = '%.4f'%(ipvDict[v]['maxFreq'])
		vCohortFreq = '%.4f'%(ipvDict[v]['cohortFreq'])
		aa = str(ipvDict[v]['aa'])
		Aa = str(ipvDict[v]['Aa'])
		AA = str(ipvDict[v]['AA'])
		F = str(ipvDict[v]['F'])
		Z = str(ipvDict[v]['Z'])
		p = (2 * int(AA) + int(Aa)) / (2 * (int(AA) + int(Aa) + int(aa)))
		q = 1 - p
		exonic = str(ipvDict[v]['exonic'])
		chisquare = str(ipvDict[v]['chisquare'])
		if len(ipvDict[v]['homozygous individuals']) == 0:
			homoSample = "None"
		else:
			homoSample = ipvDict[v]['homozygous individuals'][0]
		if len(ipvDict[v]['heterozygous individuals']) == 0:
			heteroSample = "None"
		else:
			heteroSample = ipvDict[v]['heterozygous individuals'][0]
		v = v.replace(' ', '')
		v = v.replace("'", "")
		study = studyPerVariant[v]
		if v in inList:
			vIn = 'True'
		elif v in outList:
			vIn = 'False'
		else:
			vIn = 'NA'
		variantsDict[v] = dict()
		variantsDict[v]['class'] = vClass
		variantsDict[v]['popFreq'] = vPopFreq
		variantsDict[v]['cohortFreq'] = vCohortFreq

		variantsDict[v]['homozygousSample'] = homoSample
		variantsDict[v]['heterozygousSample'] = heteroSample
		variantsDict[v]['inGnomad'] = vIn
		variantsDict[v]['aa'] = aa
		variantsDict[v]['Aa'] = Aa
		variantsDict[v]['AA'] = AA
		variantsDict[v]['hail_hweafp'] = hl.eval(hl.hardy_weinberg_test(int(AA),int(Aa),int(aa))).p_value
		variantsDict[v]['F'] = F
		variantsDict[v]['Z'] = Z
		variantsDict[v]['p'] = p
		variantsDict[v]['q'] = q
		variantsDict[v]['chisquare'] = chisquare
		variantsDict[v]['sequenceCenter'] = str(centersPerHomoVus[v]).replace(" ", "")
		variantsDict[v]['exonic'] = exonic
		variantsDict[v]['study'] = study

	return variantsDict
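For reference, a standalone sketch of the hl.hardy_weinberg_test call used above (the genotype counts are made up): it takes counts (n_hom_ref, n_het, n_hom_var) and returns a struct with het_freq_hwe and p_value.

result = hl.eval(hl.hardy_weinberg_test(995, 10, 5))
print(result.het_freq_hwe, result.p_value)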
Example #32
def export_table_to_elasticsearch(table,
                                  host,
                                  index_name,
                                  block_size=5000,
                                  id_field=None,
                                  mapping=None,
                                  num_shards=10,
                                  port=9200,
                                  verbose=True):
    es_client = elasticsearch.Elasticsearch(host, port=port)

    if not mapping:
        mapping = elasticsearch_mapping_for_table(table)

    # Delete the index before creating it
    if es_client.indices.exists(index=index_name):
        es_client.indices.delete(index=index_name)

    mapping["_meta"] = dict(hl.eval(table.globals))

    # https://www.elastic.co/guide/en/elasticsearch/reference/current/index-modules.html#index-modules-settings
    request_body = {
        "mappings": mapping,
        "settings": {
            "index.codec": "best_compression",
            "index.mapping.total_fields.limit": 10000,
            "index.number_of_replicas": 0,
            "index.number_of_shards": num_shards,
            "index.refresh_interval": -1,
        },
    }

    es_client.indices.create(index=index_name, body=request_body)

    temp_file = "table-tmp.json.txt"
    table = table.key_by()
    table.select(json=hl.json(table.row_value)).export(temp_file, header=False)

    buffer = []
    with open(temp_file) as f:
        for line in f:
            data = json.loads(line)
            buffer.append(data)

            if len(buffer) >= block_size:
                helpers.bulk(es_client,
                             build_bulk_request(buffer, index_name, id_field))
                buffer = []

    if buffer:
        helpers.bulk(es_client, build_bulk_request(buffer, index_name,
                                                   id_field))
        buffer = []

    es_client.indices.forcemerge(index=index_name)
Example #33
def make_index_dict(ht):
    '''
    Create a look-up dictionary for entries contained in the frequency annotation array
    :param Table ht: Table containing freq_meta global annotation to be indexed
    :return: Dictionary keyed by grouping combinations in the frequency array, with values describing the corresponding index
        of each grouping entry in the frequency array
    :rtype: Dict of str: int
    '''
    freq_meta = hl.eval(ht.globals.freq_meta)
    index_dict = make_freq_meta_index_dict(freq_meta)
    return index_dict
Example #34
    def test_annotation(self, mock_load_gencode):
        mock_load_gencode.return_value = GENE_ID_MAPPING
        rows = annotate_fields(self.mt, TEST_GENCODE_RELEASE,
                               TEST_GENCODE_PATH)
        mock_load_gencode.assert_called_with(TEST_GENCODE_RELEASE,
                                             download_path=TEST_GENCODE_PATH)
        row_dict = {row['variantId']: row for row in rows.take(11)}
        self.assertListEqual([
            row_dict[row]
            for row in ['CPX_chr1_1', 'DUP_chr1_1', 'INS_chr1_10']
        ], hl.eval([VARIANT_CPX, VARIANT_DUP, VARIANT_INS]))
Example #35
    def test_loop_memory(self):
        def foo(recur, arr, idx):
            return hl.if_else(idx > 10, arr,
                              recur(arr.append(hl.str(idx)), idx + 1))

        assert hl.eval(
            hl.experimental.loop(foo, hl.tarray(hl.tstr), hl.literal(['foo']),
                                 1)) == [
                                     'foo', '1', '2', '3', '4', '5', '6', '7',
                                     '8', '9', '10'
                                 ]
Example #36
def test_concatenate():
    x = np.array([[1., 2.], [3., 4.]])
    y = np.array([[5.], [6.]])
    np_res = np.concatenate([x, y], axis=1)

    res = hl.eval(hl.nd.concatenate([x, y], axis=1))
    assert np.array_equal(np_res, res)

    x = np.array([[1], [3]])
    y = np.array([[5], [6]])

    seq = [x, y]
    np_res = np.concatenate(seq)
    res = hl.eval(hl.nd.concatenate(seq))
    assert np.array_equal(np_res, res)

    seq = (x, y)
    np_res = np.concatenate(seq)
    res = hl.eval(hl.nd.concatenate(seq))
    assert np.array_equal(np_res, res)
Example #37
def test_hstack():
    ht = hl.utils.range_table(10)

    def assert_table(a, b):
        ht2 = ht.annotate(x=hl.nd.array(a), y=hl.nd.array(b))
        ht2 = ht2.annotate(stacked=hl.nd.hstack([ht2.x, ht2.y]))
        assert np.array_equal(ht2.collect()[0].stacked, np.hstack([a, b]))

    a = np.array([1, 2, 3])
    b = np.array([2, 3, 4])
    assert (np.array_equal(hl.eval(hl.nd.hstack((a, b))), np.hstack((a, b))))
    assert (np.array_equal(hl.eval(hl.nd.hstack(hl.array([a, b]))),
                           np.hstack((a, b))))
    assert_table(a, b)

    a = np.array([[1], [2], [3]])
    b = np.array([[2], [3], [4]])
    assert (np.array_equal(hl.eval(hl.nd.hstack((a, b))), np.hstack((a, b))))
    assert (np.array_equal(hl.eval(hl.nd.hstack(hl.array([a, b]))),
                           np.hstack((a, b))))
    assert_table(a, b)
Example #38
def test_lgt_to_gt():
    call_0_0_f = hl.call(0, 0, phased=False)
    call_0_0_t = hl.call(0, 0, phased=True)
    call_0_1_f = hl.call(0, 1, phased=False)
    call_2_0_t = hl.call(2, 0, phased=True)

    call_1 = hl.call(1, phased=False)

    la = [0, 3, 5]

    assert hl.eval(tuple(hl.vds.lgt_to_gt(c, la) for c in [call_0_0_f, call_0_0_t, call_0_1_f, call_2_0_t, call_1])) == \
           tuple([hl.Call([0, 0], phased=False), hl.Call([0, 0], phased=True), hl.Call([0, 3], phased=False), hl.Call([5, 0], phased=True), hl.Call([3], phased=False)])
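In other words, lgt_to_gt rewrites each local allele index i as la[i]. A one-off sketch with the same local-allele list:

c = hl.vds.lgt_to_gt(hl.call(1, 2, phased=False), [0, 3, 5])
hl.eval(c)  # expected: Call([3, 5], phased=False)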
Example #39
    def test_matrix_filter_intervals(self):
        ds = hl.import_vcf(resource('sample.vcf'), min_partitions=20)

        self.assertEqual(
            hl.filter_intervals(ds, [hl.parse_locus_interval('20:10639222-10644705')]).count_rows(), 3)

        intervals = [hl.parse_locus_interval('20:10639222-10644700'),
                     hl.parse_locus_interval('20:10644700-10644705')]
        self.assertEqual(hl.filter_intervals(ds, intervals).count_rows(), 3)

        intervals = hl.array([hl.parse_locus_interval('20:10639222-10644700'),
                              hl.parse_locus_interval('20:10644700-10644705')])
        self.assertEqual(hl.filter_intervals(ds, intervals).count_rows(), 3)

        intervals = hl.array([hl.eval(hl.parse_locus_interval('20:10639222-10644700')),
                              hl.parse_locus_interval('20:10644700-10644705')])
        self.assertEqual(hl.filter_intervals(ds, intervals).count_rows(), 3)

        intervals = [hl.eval(hl.parse_locus_interval('[20:10019093-10026348]')),
                     hl.eval(hl.parse_locus_interval('[20:17705793-17716416]'))]
        self.assertEqual(hl.filter_intervals(ds, intervals).count_rows(), 4)
Example #40
    def overlaps(self, interval):
        """True if the the supplied interval contains any value in common with this one.

        Parameters
        ----------
        interval : :class:`.Interval`
            Interval object with the same point type.

        Returns
        -------
        :obj:`bool`
        """

        return hl.eval(hl.literal(self, hl.tinterval(self._point_type)).overlaps(interval))
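The docstring has no example, so a hedged usage sketch with plain integer intervals (hl.interval is half-open by default, hence the second result):

iv1 = hl.eval(hl.interval(1, 10))
iv2 = hl.eval(hl.interval(5, 15))
iv1.overlaps(iv2)                           # True
iv1.overlaps(hl.eval(hl.interval(10, 20)))  # False: end 10 is exclusive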
Example #41
    def contains(self, value):
        """True if `value` is contained within the interval.

        Examples
        --------

        >>> interval2.contains(5)
        True

        >>> interval2.contains(6)
        False

        Parameters
        ----------
        value :
            Object with type :meth:`.point_type`.

        Returns
        -------
        :obj:`bool`
        """

        return hl.eval(hl.literal(self, hl.tinterval(self._point_type)).contains(value))
Example #42
File: plots.py Project: jigold/hail
def manhattan(pvals, locus=None, title=None, size=4, hover_fields=None, collect_all=False, n_divisions=500, significance_line=5e-8):
    """Create a Manhattan plot. (https://en.wikipedia.org/wiki/Manhattan_plot)

    Parameters
    ----------
    pvals : :class:`.Float64Expression`
        P-values to be plotted.
    locus : :class:`.LocusExpression`
        Locus values to be plotted.
    title : str
        Title of the plot.
    size : int
        Size of markers in screen space units.
    hover_fields : Dict[str, :class:`.Expression`]
        Dictionary of field names and values to be shown in the HoverTool of the plot.
    collect_all : bool
        Whether to collect all values or downsample before plotting.
    n_divisions : int
        Factor by which to downsample (default value = 500). A lower input results in fewer output datapoints.
    significance_line : float, optional
        p-value at which to add a horizontal, dotted red line indicating
        genome-wide significance.  If ``None``, no line is added.

    Returns
    -------
    :class:`bokeh.plotting.figure.Figure`
    """
    if locus is None:
        locus = pvals._indices.source.locus

    ref = locus.dtype.reference_genome

    if hover_fields is None:
        hover_fields = {}

    hover_fields['locus'] = hail.str(locus)

    pvals = -hail.log10(pvals)

    source_pd = _collect_scatter_plot_data(
        ('_global_locus', locus.global_position()),
        ('_pval', pvals),
        fields=hover_fields,
        n_divisions=None if collect_all else n_divisions
    )
    source_pd['p_value'] = [10 ** (-p) for p in source_pd['_pval']]
    source_pd['_contig'] = [locus.split(":")[0] for locus in source_pd['locus']]

    observed_contigs = set(source_pd['_contig'])
    observed_contigs = [contig for contig in ref.contigs.copy() if contig in observed_contigs]
    contig_ticks = hail.eval([hail.locus(contig, int(ref.lengths[contig]/2)).global_position() for contig in observed_contigs])
    color_mapper = CategoricalColorMapper(factors=ref.contigs, palette=palette[:2] * int((len(ref.contigs) + 1) / 2))

    p = figure(title=title, x_axis_label='Chromosome', y_axis_label='P-value (-log10 scale)', width=1000)
    p, _, legend, _, _, _ = _get_scatter_plot_elements(
        p, source_pd, x_col='_global_locus', y_col='_pval',
        label_cols=['_contig'], colors={'_contig': color_mapper},
        size=size
    )
    legend.visible = False
    p.xaxis.ticker = contig_ticks
    p.xaxis.major_label_overrides = dict(zip(contig_ticks, observed_contigs))
    p.select_one(HoverTool).tooltips = [t for t in p.select_one(HoverTool).tooltips if not t[0].startswith('_')]

    if significance_line is not None:
        p.renderers.append(Span(location=-log10(significance_line),
                                dimension='width',
                                line_color='red',
                                line_dash='dashed',
                                line_width=1.5))

    return p
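A hypothetical call site (the gwas table and its p_value field are assumptions); the returned Bokeh figure can be saved with the standard Bokeh helpers:

from bokeh.plotting import output_file, save
p = manhattan(gwas.p_value, significance_line=5e-8)
output_file('manhattan.html')
save(p)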
Example #43
    def test_define_function(self):
        f = hl.experimental.define_function(
            lambda a, b: (a + 7) * b, hl.tint32, hl.tint32)
        self.assertEqual(hl.eval(f(1, 3)), 24)
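A follow-up sketch: the defined function is reusable across expressions, so it can also be applied inside table transformations (the range_table here is purely illustrative):

f = hl.experimental.define_function(lambda a, b: (a + 7) * b, hl.tint32, hl.tint32)
ht = hl.utils.range_table(3)
ht = ht.annotate(y=f(ht.idx, ht.idx))
ht.y.collect()  # [0, 8, 18]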
Example #44
def filter_intervals(ds, intervals, keep=True) -> Union[Table, MatrixTable]:
    """Filter rows with a list of intervals.

    Examples
    --------

    Filter to loci falling within one interval:

    >>> ds_result = hl.filter_intervals(dataset, [hl.parse_locus_interval('17:38449840-38530994')])

    Remove all loci within list of intervals:

    >>> intervals = [hl.parse_locus_interval(x) for x in ['1:50M-75M', '2:START-400000', '3-22']]
    >>> ds_result = hl.filter_intervals(dataset, intervals, keep=False)

    Notes
    -----
    Based on the ``keep`` argument, this method will either restrict to points
    in the supplied interval ranges, or remove all rows in those ranges.

    When ``keep=True``, partitions that don't overlap any supplied interval
    will not be loaded at all.  This enables :func:`.filter_intervals` to be
    used for reasonably low-latency queries of small ranges of the dataset, even
    on large datasets.

    Parameters
    ----------
    ds : :class:`.MatrixTable` or :class:`.Table`
        Dataset to filter.
    intervals : :class:`.ArrayExpression` of type :py:data:`.tinterval`
        Intervals to filter on.  The point type of the interval must
        be a prefix of the key or equal to the first field of the key.
    keep : :obj:`bool`
        If ``True``, keep only rows that fall within any interval in `intervals`.
        If ``False``, keep only rows that fall outside all intervals in
        `intervals`.

    Returns
    -------
    :class:`.MatrixTable` or :class:`.Table`

    """

    if isinstance(ds, MatrixTable):
        k_type = ds.row_key.dtype
    else:
        assert isinstance(ds, Table)
        k_type = ds.key.dtype

    point_type = intervals.dtype.element_type.point_type

    def is_struct_prefix(partial, full):
        if list(partial) != list(full)[:len(partial)]:
            return False
        for k, v in partial.items():
            if full[k] != v:
                return False
        return True

    if point_type == k_type[0]:
        needs_wrapper = True
        point_type = hl.tstruct(foo=point_type)
    elif isinstance(point_type, tstruct) and is_struct_prefix(point_type, k_type):
        needs_wrapper = False
    else:
        raise TypeError("The point type is incompatible with key type of the dataset ('{}', '{}')".format(repr(point_type), repr(k_type)))

    def wrap_input(interval):
        if interval is None:
            raise TypeError("'filter_intervals' does not allow missing values in 'intervals'.")
        elif needs_wrapper:
            return Interval(Struct(foo=interval.start),
                            Struct(foo=interval.end),
                            interval.includes_start,
                            interval.includes_end)
        else:
            return interval

    intervals_type = intervals.dtype
    intervals = hl.eval(intervals)
    intervals = hl.tarray(hl.tinterval(point_type))._convert_to_json([wrap_input(i) for i in intervals])

    if isinstance(ds, MatrixTable):
        config = {
            'name': 'MatrixFilterIntervals',
            'keyType': point_type._parsable_string(),
            'intervals': intervals,
            'keep': keep
        }
        return MatrixTable(MatrixToMatrixApply(ds._mir, config))
    else:
        config = {
            'name': 'TableFilterIntervals',
            'keyType': point_type._parsable_string(),
            'intervals': intervals,
            'keep': keep
        }
        return Table(TableToTableApply(ds._tir, config))
Example #45
File: misc.py Project: jigold/hail
def locus_windows(locus_expr, radius, coord_expr=None, _localize=True):
    """Returns start and stop indices for window around each locus.

    Examples
    --------

    Windows with 2bp radius for one contig with positions 1, 2, 3, 4, 5:

    >>> starts, stops = hl.linalg.utils.locus_windows(
    ...     hl.balding_nichols_model(1, 5, 5).locus,
    ...     radius=2)
    >>> starts, stops
    (array([0, 0, 0, 1, 2]), array([3, 4, 5, 5, 5]))

    The following examples involve three contigs.

    >>> loci = [{'locus': hl.Locus('1', 1), 'cm': 1.0},
    ...         {'locus': hl.Locus('1', 2), 'cm': 3.0},
    ...         {'locus': hl.Locus('1', 4), 'cm': 4.0},
    ...         {'locus': hl.Locus('2', 1), 'cm': 2.0},
    ...         {'locus': hl.Locus('2', 1), 'cm': 2.0},
    ...         {'locus': hl.Locus('3', 3), 'cm': 5.0}]

    >>> ht = hl.Table.parallelize(
    ...         loci,
    ...         hl.tstruct(locus=hl.tlocus('GRCh37'), cm=hl.tfloat64),
    ...         key=['locus'])

    Windows with 1bp radius:

    >>> hl.linalg.utils.locus_windows(ht.locus, 1)
    (array([0, 0, 2, 3, 3, 5]), array([2, 2, 3, 5, 5, 6]))

    Windows with 1cm radius:

    >>> hl.linalg.utils.locus_windows(ht.locus, 1.0, coord_expr=ht.cm)
    (array([0, 1, 1, 3, 3, 5]), array([1, 3, 3, 5, 5, 6]))

    Notes
    -----
    This function returns two 1-dimensional ndarrays of integers,
    ``starts`` and ``stops``, each of size equal to the number of rows.

    By default, for all indices ``i``, ``[starts[i], stops[i])`` is the maximal
    range of row indices ``j`` such that ``contig[i] == contig[j]`` and
    ``position[i] - radius <= position[j] <= position[i] + radius``.

    If the :meth:`.global_position` on `locus_expr` is not in ascending order,
    this method will fail. Ascending order should hold for a matrix table keyed
    by locus or variant (and the associated row table), or for a table that has
    been ordered by `locus_expr`.

    Set `coord_expr` to use a value other than position to define the windows.
    This row-indexed numeric expression must be non-missing, non-``nan``, on the
    same source as `locus_expr`, and ascending with respect to locus
    position for each contig; otherwise the function will fail.

    The last example above uses centimorgan coordinates, so
    ``[starts[i], stops[i])`` is the maximal range of row indices ``j`` such
    that ``contig[i] == contig[j]`` and
    ``cm[i] - radius <= cm[j] <= cm[i] + radius``.

    Index ranges are start-inclusive and stop-exclusive. This function is
    especially useful in conjunction with
    :meth:`.BlockMatrix.sparsify_row_intervals`.

    Parameters
    ----------
    locus_expr : :class:`.LocusExpression`
        Row-indexed locus expression on a table or matrix table.
    radius: :obj:`int`
        Radius of window for row values.
    coord_expr: :class:`.Float64Expression`, optional
        Row-indexed numeric expression for the row value.
        Must be on the same table or matrix table as `locus_expr`.
        By default, the row value is given by the locus position.

    Returns
    -------
    (:class:`ndarray` of :obj:`int64`, :class:`ndarray` of :obj:`int64`)
        Tuple of start indices array and stop indices array.
    """
    if radius < 0:
        raise ValueError(f"locus_windows: 'radius' must be non-negative, found {radius}")
    check_row_indexed('locus_windows', locus_expr)
    if coord_expr is not None:
        check_row_indexed('locus_windows', coord_expr)

    src = locus_expr._indices.source
    if locus_expr not in src._fields_inverse:
        locus = Env.get_uid()
        annotate_fields = {locus: locus_expr}

        if coord_expr is not None:
            if coord_expr not in src._fields_inverse:
                coords = Env.get_uid()
                annotate_fields[coords] = coord_expr
            else:
                coords = src._fields_inverse[coord_expr]

        if isinstance(src, hl.MatrixTable):
            new_src = src.annotate_rows(**annotate_fields)
        else:
            new_src = src.annotate(**annotate_fields)

        locus_expr = new_src[locus]
        if coord_expr is not None:
            coord_expr = new_src[coords]

    if coord_expr is None:
        coord_expr = locus_expr.position

    rg = locus_expr.dtype.reference_genome
    contig_group_expr = hl.agg.group_by(hl.locus(locus_expr.contig, 1, reference_genome=rg), hl.agg.collect(coord_expr))

    # check loci are in sorted order
    last_pos = hl.fold(lambda a, elt: (hl.case()
                                         .when(a <= elt, elt)
                                         .or_error("locus_windows: 'locus_expr' global position must be in ascending order.")),
                       -1,
                       hl.agg.collect(hl.case()
                                        .when(hl.is_defined(locus_expr), locus_expr.global_position())
                                        .or_error("locus_windows: missing value for 'locus_expr'.")))
    checked_contig_groups = (hl.case()
                               .when(last_pos >= 0, contig_group_expr)
                               .or_error("locus_windows: 'locus_expr' has length 0"))

    contig_groups = locus_expr._aggregation_method()(checked_contig_groups, _localize=False)

    coords = hl.sorted(hl.array(contig_groups)).map(lambda t: t[1])
    starts_and_stops = hl._locus_windows_per_contig(coords, radius)

    if not _localize:
        return starts_and_stops

    starts, stops = hl.eval(starts_and_stops)
    return np.array(starts), np.array(stops)