Example #1
def gather(ht, key, value, *fields) -> Table:
    """Collapse fields into key-value pairs.

    :func:`.gather` mimics the functionality of the `gather()` function found in R's
    ``tidyr`` package. This is a way to turn "wide" format data into "long"
    format data.

    Parameters
    ----------
    ht : :class:`.Table`
        A Hail table.
    key : :obj:`str`
        The name of the key field in the gathered table.
    value : :obj:`str`
        The name of the value field in the gathered table.
    fields : variable-length args of :obj:`str`
        Names of fields to gather in ``ht``.

    Returns
    -------
    :class:`.Table`
        Table with original ``fields`` gathered into ``key`` and ``value`` fields."""

    ht = ht.annotate(
        _col_val=hl.array([hl.array([field, ht[field]]) for field in fields]))
    ht = ht.drop(*fields)
    ht = ht.explode(ht['_col_val'])
    ht = ht.annotate(**{key: ht['_col_val'][0], value: ht['_col_val'][1]})
    ht = ht.drop('_col_val')

    ht_tmp = new_temp_file()
    ht.write(ht_tmp)

    return hl.read_table(ht_tmp)
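A minimal usage sketch (my own illustration, not from the source): the helper packs each field name together with its value into a single array, so the gathered fields must unify with str; string-typed fields are the safe case. `wide` and the field names here are hypothetical.

import hail as hl

# Hypothetical wide table: one row per sample, one string column per site.
wide = hl.Table.parallelize(
    [{'sample': 'S1', 'site_a': '0/1', 'site_b': '1/1'}],
    hl.tstruct(sample=hl.tstr, site_a=hl.tstr, site_b=hl.tstr))
long_ht = gather(wide, 'site', 'gt', 'site_a', 'site_b')  # one row per (sample, site) pair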
Example #2
def test_concatenate():
    x = np.array([[1., 2.], [3., 4.]])
    y = np.array([[5.], [6.]])
    np_res = np.concatenate([x, y], axis=1)

    res = hl.eval(hl.nd.concatenate([x, y], axis=1))
    assert np.array_equal(np_res, res)

    res = hl.eval(hl.nd.concatenate(hl.array([x, y]), axis=1))
    assert np.array_equal(np_res, res)

    x = np.array([[1], [3]])
    y = np.array([[5], [6]])

    seq = [x, y]
    seq2 = hl.array(seq)
    np_res = np.concatenate(seq)
    res = hl.eval(hl.nd.concatenate(seq))
    assert np.array_equal(np_res, res)

    res = hl.eval(hl.nd.concatenate(seq2))
    assert np.array_equal(np_res, res)

    seq = (x, y)
    seq2 = hl.array([x, y])
    np_res = np.concatenate(seq)
    res = hl.eval(hl.nd.concatenate(seq))
    assert np.array_equal(np_res, res)

    res = hl.eval(hl.nd.concatenate(seq2))
    assert np.array_equal(np_res, res)
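For completeness, a quick sketch (not part of the test above) showing that the axis=0 case mirrors np.concatenate the same way:

import hail as hl
import numpy as np

a = np.array([[1.0, 2.0]])
b = np.array([[3.0, 4.0]])
# Row-wise stacking: the Hail result matches numpy elementwise.
assert np.array_equal(hl.eval(hl.nd.concatenate([a, b], axis=0)),
                      np.concatenate([a, b], axis=0))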
Example #3
def merge_alleles(alleles):
    from hail.expr.functions import _num_allele_type, _allele_ints
    return hl.rbind(
        alleles.map(lambda a: hl.or_else(a[0], ''))
               .fold(lambda s, t: hl.cond(hl.len(s) > hl.len(t), s, t), ''),
        lambda ref:
        hl.rbind(
            alleles.map(
                lambda al: hl.rbind(
                    al[0],
                    lambda r:
                    hl.array([ref]).extend(
                        al[1:].map(
                            lambda a:
                            hl.rbind(
                                _num_allele_type(r, a),
                                lambda at:
                                hl.cond(
                                    (_allele_ints['SNP'] == at) |
                                    (_allele_ints['Insertion'] == at) |
                                    (_allele_ints['Deletion'] == at) |
                                    (_allele_ints['MNP'] == at) |
                                    (_allele_ints['Complex'] == at),
                                    a + ref[hl.len(r):],
                                    a)))))),
            lambda lal:
            hl.struct(
                globl=hl.array([ref]).extend(hl.array(hl.set(hl.flatten(lal)).remove(ref))),
                local=lal)))
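A sketch of what this computes on two hypothetical per-dataset allele arrays (ref allele first): the longest ref wins, and SNP/indel/MNP/complex alts are right-padded with the ref's tail so they describe the same interval. Note the alt ordering in `globl` passes through a set and is therefore not guaranteed.

import hail as hl

alleles = hl.literal([['A', 'C'], ['AT', 'A']])
merged = hl.eval(merge_alleles(alleles))
# merged.globl holds 'AT' (the longest ref) plus the padded alts 'CT' and 'A';
# merged.local keeps one rewritten allele array per input dataset.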
Example #4
    def test_to_table_on_various_fields(self):
        mt = hl.utils.range_matrix_table(3, 4)

        sample_ids = ['Bob', 'Alice', 'David', 'Carol']
        entries = [1, 0, 3, 2]
        rows = ['1:3:A:G', '1:2:A:G', '1:0:A:G']

        mt = mt.annotate_cols(s=hl.array(sample_ids)[mt.col_idx]).key_cols_by('s')
        mt = mt.annotate_entries(e=hl.array(entries)[mt.col_idx])
        mt = mt.annotate_rows(r=hl.array(rows)[mt.row_idx]).key_rows_by('r')

        self.assertEqual(mt.s.collect(), sample_ids)
        self.assertEqual(mt.s.take(1), [sample_ids[0]])
        self.assertEqual(mt.e.collect(), entries * 3)
        self.assertEqual(mt.e.take(1), [entries[0]])
        self.assertEqual(mt.row_idx.collect(), [2, 1, 0])
        self.assertEqual(mt.r.collect(), sorted(rows))
        self.assertEqual(mt.r.take(1), [sorted(rows)[0]])

        self.assertEqual(mt.cols().s.collect(), sorted(sample_ids))
        self.assertEqual(mt.cols().s.take(1), [sorted(sample_ids)[0]])
        self.assertEqual(mt.entries().e.collect(), sorted(entries) * 3)
        self.assertEqual(mt.entries().e.take(1), [sorted(entries)[0]])
        self.assertEqual(mt.rows().row_idx.collect(), [2, 1, 0])
        self.assertEqual(mt.rows().r.collect(), sorted(rows))
        self.assertEqual(mt.rows().r.take(1), [sorted(rows)[0]])
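The annotations above lean on one idiom worth isolating: hl.array over a Python list builds a literal array expression that can be indexed by a row or column field. A self-contained sketch:

import hail as hl

ht = hl.utils.range_table(3)
# Broadcast a Python list into an expression and index it per row.
ht = ht.annotate(name=hl.array(['a', 'b', 'c'])[ht.idx])
assert ht.name.collect() == ['a', 'b', 'c']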
Example #5
def combine(ts):
    # pylint: disable=protected-access
    tmp = ts.annotate(
        alleles=merge_alleles(ts.data.map(lambda d: d.alleles)),
        rsid=hl.find(hl.is_defined, ts.data.map(lambda d: d.rsid)),
        filters=hl.set(hl.flatten(ts.data.map(lambda d: hl.array(d.filters)))),
        info=hl.struct(
            DP=hl.sum(ts.data.map(lambda d: d.info.DP)),
            MQ_DP=hl.sum(ts.data.map(lambda d: d.info.MQ_DP)),
            QUALapprox=hl.sum(ts.data.map(lambda d: d.info.QUALapprox)),
            RAW_MQ=hl.sum(ts.data.map(lambda d: d.info.RAW_MQ)),
            VarDP=hl.sum(ts.data.map(lambda d: d.info.VarDP)),
            SB=hl.array([
                hl.sum(ts.data.map(lambda d: d.info.SB[0])),
                hl.sum(ts.data.map(lambda d: d.info.SB[1])),
                hl.sum(ts.data.map(lambda d: d.info.SB[2])),
                hl.sum(ts.data.map(lambda d: d.info.SB[3]))
            ])))
    tmp = tmp.annotate(
        __entries=hl.bind(
            lambda combined_allele_index:
            hl.range(0, hl.len(tmp.data)).flatmap(
                lambda i:
                hl.cond(hl.is_missing(tmp.data[i].__entries),
                        hl.range(0, hl.len(tmp.g[i].__cols))
                          .map(lambda _: hl.null(tmp.data[i].__entries.dtype.element_type)),
                        hl.bind(
                            lambda old_to_new: tmp.data[i].__entries.map(lambda e: renumber_entry(e, old_to_new)),
                            hl.range(0, hl.len(tmp.data[i].alleles)).map(
                                lambda j: combined_allele_index[tmp.data[i].alleles[j]])))),
            hl.dict(hl.range(0, hl.len(tmp.alleles)).map(
                lambda j: hl.tuple([tmp.alleles[j], j])))))
    tmp = tmp.annotate_globals(__cols=hl.flatten(tmp.g.map(lambda g: g.__cols)))

    return tmp.drop('data', 'g')
Example #6
    def phase_haploid_proband_x_nonpar(
            proband_call: hl.expr.CallExpression,
            father_call: hl.expr.CallExpression,
            mother_call: hl.expr.CallExpression) -> hl.expr.ArrayExpression:
        """
        Returns phased genotype calls in the case of a haploid proband in the non-PAR region of X

        :param CallExpression proband_call: Input proband genotype call
        :param CallExpression father_call: Input father genotype call
        :param CallExpression mother_call: Input mother genotype call
        :return: Array containing: phased proband call, phased father call, phased mother call
        :rtype: ArrayExpression
        """

        transmitted_allele = hl.zip_with_index(
            hl.array([mother_call[0],
                      mother_call[1]])).find(lambda m: m[1] == proband_call[0])
        return hl.or_missing(
            hl.is_defined(transmitted_allele),
            hl.array([
                hl.call(proband_call[0], phased=True),
                hl.or_missing(father_call.is_haploid(),
                              hl.call(father_call[0], phased=True)),
                phase_parent_call(mother_call, transmitted_allele[0])
            ]))
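The core lookup is hl.zip_with_index plus .find; reduced to a minimal sketch with my own toy values:

import hail as hl

# zip_with_index pairs (index, element); find returns the first matching pair,
# or missing if nothing matches, which hl.or_missing then propagates.
pairs = hl.zip_with_index(hl.array(['x', 'y']))
assert hl.eval(pairs.find(lambda m: m[1] == 'y')) == (1, 'y')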
Example #7
    def test_table_filter_intervals(self):
        ds = hl.import_vcf(resource('sample.vcf'), min_partitions=20).rows()

        self.assertEqual(
            hl.filter_intervals(
                ds, [hl.parse_locus_interval('20:10639222-10644705')]).count(),
            3)

        intervals = [
            hl.parse_locus_interval('20:10639222-10644700'),
            hl.parse_locus_interval('20:10644700-10644705')
        ]
        self.assertEqual(hl.filter_intervals(ds, intervals).count(), 3)

        intervals = hl.array([
            hl.parse_locus_interval('20:10639222-10644700'),
            hl.parse_locus_interval('20:10644700-10644705')
        ])
        self.assertEqual(hl.filter_intervals(ds, intervals).count(), 3)

        intervals = hl.array([
            hl.eval(hl.parse_locus_interval('20:10639222-10644700')),
            hl.parse_locus_interval('20:10644700-10644705')
        ])
        self.assertEqual(hl.filter_intervals(ds, intervals).count(), 3)

        intervals = [
            hl.eval(hl.parse_locus_interval('[20:10019093-10026348]')),
            hl.eval(hl.parse_locus_interval('[20:17705793-17716416]'))
        ]
        self.assertEqual(hl.filter_intervals(ds, intervals).count(), 4)
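As the test exercises, hl.filter_intervals accepts its intervals in several forms; a compact restatement with a hypothetical interval string:

import hail as hl

# All three are equivalent inputs to hl.filter_intervals:
as_exprs = [hl.parse_locus_interval('20:1-100')]             # Python list of expressions
as_array = hl.array([hl.parse_locus_interval('20:1-100')])   # Hail array expression
as_values = [hl.eval(hl.parse_locus_interval('20:1-100'))]   # list of evaluated values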
Example #9
def test_agg_cols_group_by(self):
    t = hl.utils.range_matrix_table(1, 10)
    tests = [
        (agg.group_by(
            t.col_idx % 2,
            hl.array(agg.collect_as_set(t.col_idx + 1)).append(0)), {
                0: [1, 3, 5, 7, 9, 0],
                1: [2, 4, 6, 8, 10, 0]
            }),
        (agg.group_by(
            t.col_idx % 3,
            agg.filter(
                t.col_idx > 7,
                hl.array(agg.collect_as_set(t.col_idx + 1)).append(0))), {
                    0: [10, 0],
                    1: [0],
                    2: [9, 0]
                }),
        (agg.group_by(
            t.col_idx % 3,
            agg.explode(
                lambda elt: agg.collect(elt + 1).append(0),
                hl.cond(t.col_idx > 7, [t.col_idx, t.col_idx + 1],
                        hl.empty_array(hl.tint32)))), {
                            0: [10, 11, 0],
                            1: [0],
                            2: [9, 10, 0]
                        }),
    ]
    for aggregation, expected in tests:
        self.assertEqual(
            t.select_rows(result=aggregation).result.collect()[0],
            expected)
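The hl.array(agg.collect_as_set(...)) pattern appears because collect_as_set yields a set, which has no .append; converting to an array lets the sentinel 0 be attached (element order coming out of the set is not guaranteed). A standalone sketch on a table:

import hail as hl

ht = hl.utils.range_table(10)
grouped = ht.aggregate(
    hl.agg.group_by(ht.idx % 2,
                    hl.array(hl.agg.collect_as_set(ht.idx)).append(-1)))
# grouped is a dict: {0: [even idxs..., -1], 1: [odd idxs..., -1]}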
Example #10
def import_cadd_table(path: str, genome_version: str, partitions) -> hl.Table:
    if genome_version not in ("37", "38"):
        raise ValueError(f"Invalid genome version: {genome_version}")

    column_names = {'f0': 'chrom', 'f1': 'pos', 'f2': 'ref', 'f3': 'alt', 'f4': 'RawScore', 'f5': 'PHRED'}
    types = {'f0': hl.tstr, 'f1': hl.tint, 'f4': hl.tfloat32, 'f5': hl.tfloat32}

    cadd_ht = import_table(path, force_bgz=True, comment="#", no_header=True, types=types, min_partitions=partitions)
    cadd_ht = cadd_ht.rename(column_names)

    chrom = hl.format("chr%s", cadd_ht.chrom) if genome_version == "38" else cadd_ht.chrom
    locus = hl.locus(chrom, cadd_ht.pos, reference_genome=hl.get_reference(f"GRCh{genome_version}"))
    alleles = hl.array([cadd_ht.ref, cadd_ht.alt])
    cadd_ht = cadd_ht.transmute(locus=locus, alleles=alleles)

    cadd_union_ht = cadd_ht.head(0)
    for contigs in (range(1, 10), list(range(10, 23)) + ["X", "Y", "MT"]):
        contigs = ["chr%s" % contig for contig in contigs] if genome_version == "38" else contigs
        cadd_ht_subset = cadd_ht.filter(hl.array(list(map(str, contigs))).contains(cadd_ht.locus.contig))
        cadd_union_ht = cadd_union_ht.union(cadd_ht_subset)

    cadd_union_ht = cadd_union_ht.key_by("locus", "alleles")

    cadd_union_ht.describe()

    return cadd_union_ht
Example #12
def combine(ts):
    # pylint: disable=protected-access
    tmp = ts.annotate(
        alleles=merge_alleles(ts.data.map(lambda d: d.alleles)),
        rsid=hl.find(hl.is_defined, ts.data.map(lambda d: d.rsid)),
        info=hl.struct(
            MQ_DP=hl.sum(ts.data.map(lambda d: d.info.MQ_DP)),
            QUALapprox=hl.sum(ts.data.map(lambda d: d.info.QUALapprox)),
            RAW_MQ=hl.sum(ts.data.map(lambda d: d.info.RAW_MQ)),
            VarDP=hl.sum(ts.data.map(lambda d: d.info.VarDP)),
            SB_TABLE=hl.array([
                hl.sum(ts.data.map(lambda d: d.info.SB_TABLE[0])),
                hl.sum(ts.data.map(lambda d: d.info.SB_TABLE[1])),
                hl.sum(ts.data.map(lambda d: d.info.SB_TABLE[2])),
                hl.sum(ts.data.map(lambda d: d.info.SB_TABLE[3]))
            ])))
    tmp = tmp.annotate(
        __entries=hl.bind(
            lambda combined_allele_index:
            hl.range(0, hl.len(tmp.data)).flatmap(
                lambda i:
                hl.cond(hl.is_missing(tmp.data[i].__entries),
                        hl.range(0, hl.len(tmp.g[i].__cols))
                          .map(lambda _: hl.null(tmp.data[i].__entries.dtype.element_type)),
                        hl.bind(
                            lambda old_to_new: tmp.data[i].__entries.map(lambda e: renumber_entry(e, old_to_new)),
                            hl.array([0]).extend(
                                hl.range(0, hl.len(tmp.data[i].alleles)).map(
                                    lambda j: combined_allele_index[tmp.data[i].alleles[j]]))))),
            hl.dict(hl.range(1, hl.len(tmp.alleles) + 1).map(
                lambda j: hl.tuple([tmp.alleles[j - 1], j])))))
    tmp = tmp.annotate_globals(__cols=hl.flatten(tmp.g.map(lambda g: g.__cols)))

    return tmp.drop('data', 'g')
Example #13
def test_ndarray_eval():
    data_list = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
    misshapen_data_list1 = [[4], [1, 2, 3]]
    misshapen_data_list2 = [[[1], [2, 3]]]
    misshapen_data_list3 = [[4], [1, 2, 3], 5]

    nd_expr = hl.nd.array(data_list)
    evaled = hl.eval(nd_expr)
    np_equiv = np.array(data_list, dtype=np.int32)
    np_equiv_fortran_style = np.asfortranarray(np_equiv)
    np_equiv_extra_dimension = np_equiv.reshape((3, 1, 3))
    assert (np.array_equal(evaled, np_equiv))
    assert (evaled.strides == np_equiv.strides)

    assert hl.eval(hl.nd.array([[], []])).strides == (8, 8)
    assert np.array_equal(hl.eval(hl.nd.array([])), np.array([]))

    zero_array = np.zeros((10, 10), dtype=np.int64)
    evaled_zero_array = hl.eval(hl.literal(zero_array))

    assert np.array_equal(evaled_zero_array, zero_array)
    assert zero_array.dtype == evaled_zero_array.dtype

    # Testing correct interpretation of numpy strides
    assert np.array_equal(hl.eval(hl.literal(np_equiv_fortran_style)),
                          np_equiv_fortran_style)
    assert np.array_equal(hl.eval(hl.literal(np_equiv_extra_dimension)),
                          np_equiv_extra_dimension)

    # Testing from hail arrays
    assert np.array_equal(hl.eval(hl.nd.array(hl.range(6))), np.arange(6))
    assert np.array_equal(hl.eval(hl.nd.array(hl.int64(4))), np.array(4))

    # Testing from nested hail arrays
    assert np.array_equal(
        hl.eval(hl.nd.array(hl.array([hl.array(x) for x in data_list]))),
        np.arange(9).reshape((3, 3)) + 1)

    # Testing missing data
    assert hl.eval(hl.nd.array(hl.null(hl.tarray(hl.tint32)))) is None

    with pytest.raises(ValueError) as exc:
        hl.nd.array(misshapen_data_list1)
    assert "inner dimensions do not match" in str(exc.value)

    with pytest.raises(FatalError) as exc:
        hl.eval(hl.nd.array(hl.array(misshapen_data_list1)))
    assert "inner dimensions do not match" in str(exc.value)

    with pytest.raises(FatalError) as exc:
        hl.eval(hl.nd.array(hl.array(misshapen_data_list2)))
    assert "inner dimensions do not match" in str(exc.value)

    with pytest.raises(ValueError) as exc:
        hl.nd.array(misshapen_data_list3)
    assert "inner dimensions do not match" in str(exc.value)
Example #14
    def explode(self, f, array_agg_expr):
        if len(array_agg_expr._ir.search(lambda n: isinstance(n, BaseApplyAggOp))) != 0:
            raise ExpressionException("'{}.explode' does not support an already-aggregated expression as the argument to 'collection'".format(self.correct_prefix()))
        _check_agg_bindings(array_agg_expr, self._agg_bindings)

        if isinstance(array_agg_expr.dtype, tset):
            array_agg_expr = hl.array(array_agg_expr)
        elt = array_agg_expr.dtype.element_type
        var = Env.get_uid()
        ref = construct_expr(Ref(var), elt, array_agg_expr._indices)
        self._agg_bindings.add(var)
        aggregated = f(ref)
        _check_agg_bindings(aggregated, self._agg_bindings)
        self._agg_bindings.remove(var)

        if len(aggregated._ir.search(lambda n: isinstance(n, BaseApplyAggOp))) == 0:
            raise ExpressionException("'{}.explode' must take mapping that contains aggregation expression.".format(self.correct_prefix()))

        indices, _ = unify_all(array_agg_expr, aggregated)
        aggregations = hl.utils.LinkedList(Aggregation)
        if not self._as_scan:
            aggregations = aggregations.push(Aggregation(array_agg_expr, aggregated))
        return construct_expr(AggExplode(array_agg_expr._ir, var, aggregated._ir),
                              aggregated.dtype,
                              aggregated._indices,
                              aggregations)
Example #15
def main(args):
    full_vcf = hl.read_matrix_table(args.allreads_prefix + '.mt')

    # liftover chains
    rg37 = hl.get_reference('GRCh37')
    rg38 = hl.get_reference('GRCh38')
    rg37.add_liftover(
        'gs://hail-common/references/grch37_to_grch38.over.chain.gz', rg38)

    chips = hl.hadoop_open(args.chip_loci)
    chip_dict = {}
    for chip in chips:
        chip = chip.strip().split()
        chip_pos = hl.import_table(chip[1],
                                   filter=r'\[Controls\]',
                                   skip_blank_lines=True)
        chip_pos = chip_pos.filter(
            hl.array(list(map(str, range(1, 23))) + ['X', 'Y']).contains(
                chip_pos.chr))
        chip_pos = chip_pos.key_by(
            locus=hl.locus(chip_pos.chr, hl.int(chip_pos.pos)))

        #  liftover chip position info
        chip_pos = chip_pos.annotate(
            new_locus=hl.liftover(chip_pos.locus, 'GRCh38'))
        chip_pos = chip_pos.filter(hl.is_defined(chip_pos.new_locus))
        chip_pos = chip_pos.key_by(locus=chip_pos.new_locus)

        # filter full vcf to sites in genotype data
        geno_vcf = full_vcf.filter_rows(hl.is_defined(
            chip_pos[full_vcf.locus]))
        hl.export_vcf(
            geno_vcf,
            'gs://neurogap/high_coverage/NeuroGap_30x_' + chip[0] + '.vcf.bgz')
Example #16
def load_cmg(cmg_csv: str) -> hl.Table:
    cmg_ht = hl.import_table(cmg_csv, impute=True, delimiter=",", quote='"')

    cmg_ht = cmg_ht.transmute(
        locus1_b38=hl.locus("chr" + hl.str(cmg_ht.chrom_1), cmg_ht.pos_1, reference_genome='GRCh38'),
        alleles1_b38=[cmg_ht.ref_1, cmg_ht.alt_1],
        locus2_b38=hl.locus("chr" + hl.str(cmg_ht.chrom_2), cmg_ht.pos_2, reference_genome='GRCh38'),
        alleles2_b38=[cmg_ht.ref_2, cmg_ht.alt_2]
    )

    liftover_references = get_liftover_genome(cmg_ht.rename({'locus1_b38': 'locus'}))
    lifted_over_variants = hl.sorted(
        hl.array([
            liftover_expr(cmg_ht.locus1_b38, cmg_ht.alleles1_b38, liftover_references[1]),
            liftover_expr(cmg_ht.locus2_b38, cmg_ht.alleles2_b38, liftover_references[1])
        ]),
        lambda x: x.locus
    )

    cmg_ht = cmg_ht.key_by(
        locus1=lifted_over_variants[0].locus,
        alleles1=lifted_over_variants[0].alleles,
        locus2=lifted_over_variants[1].locus,
        alleles2=lifted_over_variants[1].alleles
    )

    return cmg_ht.annotate(
        bad_liftover=(
                hl.is_missing(cmg_ht.locus1) |
                hl.is_missing(cmg_ht.locus2) |
                (cmg_ht.locus1.sequence_context() != cmg_ht.alleles1[0][0]) |
                (cmg_ht.locus2.sequence_context() != cmg_ht.alleles2[0][0])
        )
    )
Example #17
def transform_one(mt: MatrixTable) -> MatrixTable:
    """transforms a gvcf into a form suitable for combining"""
    mt = mt.annotate_entries(
        # local (alt) allele index into global (alt) alleles
        LA=hl.range(0, hl.len(mt.alleles)),
        END=mt.info.END,
        BaseQRankSum=mt.info['BaseQRankSum'],
        ClippingRankSum=mt.info['ClippingRankSum'],
        MQ=mt.info['MQ'],
        MQRankSum=mt.info['MQRankSum'],
        ReadPosRankSum=mt.info['ReadPosRankSum'],
    )
    mt = mt.annotate_rows(
        info=mt.info.annotate(
            SB_TABLE=hl.array([
                hl.agg.sum(mt.entry.SB[0]),
                hl.agg.sum(mt.entry.SB[1]),
                hl.agg.sum(mt.entry.SB[2]),
                hl.agg.sum(mt.entry.SB[3]),
            ])
        ).select(
            "MQ_DP",
            "QUALapprox",
            "RAW_MQ",
            "VarDP",
            "SB_TABLE",
        ))
    mt = mt.transmute_entries(
        LGT=mt.GT,
        LAD=mt.AD[0:],  # requiredness issues :'(
        LPL=mt.PL[0:],
        LPGT=mt.PGT)
    mt = mt.drop('SB', 'qual', 'filters')

    return mt
Example #18
def annotate_with_genotype_num_alt(mt: hl.MatrixTable) -> hl.MatrixTable:
    if 'AD' in set(mt.entry):
        # GATK-consistent VCF
        mt = mt.annotate_rows(genotypes=(hl.agg.collect(
            hl.struct(num_alt=hl.cond(mt.alleles[1] == '<CNV>', 0,
                                      mt.GT.n_alt_alleles()),
                      ab=hl.cond(
                          mt.alleles[1] == '<CNV>', 0.0,
                          hl.float(hl.array(mt.AD)[1]) /
                          hl.float(hl.fold(lambda i, j: i + j, 0, mt.AD))),
                      gq=mt.GQ,
                      sample_id=mt.s,
                      dp=mt.DP))))
    elif 'AO' in set(mt.entry):
        mt = mt.annotate_rows(
            genotypes=hl.agg.collect(
                hl.struct(num_alt=hl.cond(mt.alleles[1] == '<CNV>', 0,
                                          mt.GT.n_alt_alleles()),
                          ab=hl.cond((mt.alleles[1] == '<CNV>') | (mt.DP == 0),
                                     0.0,
                                     hl.float(mt.AO[0]) / hl.float(mt.DP)),
                          dp=mt.DP,
                          gq=mt.GQ,
                          sample_id=mt.s))
        )  #hl.cond(mt.GT=="0/0",0,hl.cond(mt.GT=="1/0",1,hl.cond(mt.GT=="0/1",1,hl.cond((mt.GT=="1/1",2,hl.cond(mt.GT=="1/2",2,hl.cond(mt.GT=="2/1",2,hl.cond(mt.GT=="2/2",2,-1))))))))
    else:
        raise ValueError("unrecognized vcf")
    return mt
Example #19
def remove_FT_values(
    mt: hl.MatrixTable,
    filters_to_remove: list = [
        'possible_numt', 'mt_many_low_hets', 'FAIL', 'blacklisted_site'
    ]
) -> hl.MatrixTable:
    """Removes the FT filters specified in filters_to_remove
    
    By default, this function removes the 'possible_numt', 'mt_many_low_hets', and 'FAIL' filters (because these filters were found to have low performance), 
    and the 'blacklisted_site' filter because this filter did not always behave as expected in early GATK versions (can be replaced with apply_mito_artifact_filter function)

    :param hl.MatrixTable mt:  MatrixTable
    :param list filters_to_remove: list of FT filters that should be removed from the entries
    
    :return: MatrixTable with certain FT filters removed
    :rtype: MatrixTable
    """

    filters_to_remove = hl.set(filters_to_remove)
    mt = mt.annotate_entries(
        FT=hl.array((mt.FT).difference(filters_to_remove)))

    # if no filters exist after removing those specified above, set the FT field to PASS
    mt = mt.annotate_entries(
        FT=hl.if_else(hl.len(mt.FT) == 0, ["PASS"], mt.FT))

    return mt
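The chain here is: set difference to drop the unwanted filters, hl.array to fix an ordering for the entry field, then a PASS fallback when nothing remains. The difference step in isolation, on hypothetical literals:

import hail as hl

ft = hl.set(['FAIL', 'base_qual'])
kept = hl.array(ft.difference(hl.set(['FAIL', 'possible_numt'])))
assert hl.eval(kept) == ['base_qual']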
Example #20
def downsample(x, y, label=None, n_divisions=500) -> ArrayExpression:
    """Downsample (x, y) coordinate datapoints.

    Parameters
    ----------
    x : :class:`.NumericExpression`
        X-values to be downsampled.
    y : :class:`.NumericExpression`
        Y-values to be downsampled.
    label : :class:`.StringExpression` or :class:`.ArrayExpression`
        Additional data for each (x, y) coordinate. Can pass in multiple fields in an :class:`.ArrayExpression`.
    n_divisions : :obj:`int`
        Factor by which to downsample (default value = 500). A lower input results in fewer output datapoints.

    Returns
    -------
    :class:`.ArrayExpression`
        Expression for downsampled coordinate points (x, y). The element type of the array is
        :py:data:`.ttuple` of :py:data:`.tfloat64`, :py:data:`.tfloat64`, and :py:data:`.tarray` of :py:data:`.tstring`
    """
    if label is None:
        label = hl.null(hl.tarray(hl.tstr))
    elif isinstance(label, StringExpression):
        label = hl.array([label])
    return _agg_func('downsample', [x, y, label], tarray(ttuple(tfloat64, tfloat64, tarray(tstr))),
                     constructor_args=[n_divisions])
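A usage sketch through the public aggregator API (hl.agg.downsample), which this function backs; the quadratic toy data is my own:

import hail as hl

ht = hl.utils.range_table(1000)
pts = ht.aggregate(hl.agg.downsample(hl.float64(ht.idx),
                                     hl.float64(ht.idx) ** 2,
                                     n_divisions=16))
# pts is a list of (x, y, labels) tuples, far shorter than the 1000 inputs.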
Example #21
    def explode(self, f, array_agg_expr):
        if len(
                array_agg_expr._ir.search(
                    lambda n: isinstance(n, BaseApplyAggOp))) != 0:
            raise ExpressionException(
                "'{}.explode' does not support an already-aggregated expression as the argument to 'collection'"
                .format(self.correct_prefix()))

        if isinstance(array_agg_expr.dtype, tset):
            array_agg_expr = hl.array(array_agg_expr)
        elt = array_agg_expr.dtype.element_type
        var = Env.get_uid()
        ref = construct_expr(Ref(var), elt, array_agg_expr._indices)
        self._agg_bindings.add(var)
        aggregated = f(ref)
        self._agg_bindings.remove(var)

        if len(aggregated._ir.search(
                lambda n: isinstance(n, BaseApplyAggOp))) == 0:
            raise ExpressionException(
                "'{}.explode' must take mapping that contains aggregation expression."
                .format(self.correct_prefix()))

        indices, _ = unify_all(array_agg_expr, aggregated)
        aggregations = hl.utils.LinkedList(Aggregation)
        if not self._as_scan:
            aggregations = aggregations.push(
                Aggregation(array_agg_expr, aggregated))
        return construct_expr(
            AggExplode(array_agg_expr._ir, var, aggregated._ir),
            aggregated.dtype, aggregated._indices, aggregations)
Example #22
def concordance_frequency(full_vcf, concordance_table, output):
    full_variant_qc = full_vcf.rows()
    concordance_qc = full_variant_qc.annotate(
        concordance=concordance_table[full_variant_qc.key])
    freqs = list(np.linspace(0.5, 0,
                             num=91))  ## note, this will need to be updated
    concordance_stats = concordance_qc.group_by(
        freq=hl.array(freqs).find(
            lambda x: concordance_qc.variant_qc.AF[1] >= x),
        snp=hl.is_snp(
            concordance_qc.alleles[0], concordance_qc.alleles[1])).aggregate(
                n_variants=hl.agg.count(),
                unique_variants=hl.agg.array_agg(
                    lambda row: hl.agg.array_agg(
                        lambda element: hl.agg.count_where(element > 0), row),
                    concordance_qc.concordance.concordance),
                geno_concordance=hl.agg.array_agg(
                    lambda row: hl.agg.array_agg(
                        lambda element: hl.agg.sum(element), row),
                    concordance_qc.concordance.concordance))

    concordance_stats = concordance_stats.annotate(
        total_concordant=concordance_stats.geno_concordance[3][3] +
        concordance_stats.geno_concordance[4][4],
        total_discordant=concordance_stats.geno_concordance[2][3] +
        concordance_stats.geno_concordance[2][4] +
        concordance_stats.geno_concordance[3][2] +
        concordance_stats.geno_concordance[3][4] +
        concordance_stats.geno_concordance[4][2] +
        concordance_stats.geno_concordance[4][3])
    concordance_stats = concordance_stats.annotate(
        non_ref_concordance=concordance_stats.total_concordant /
        (concordance_stats.total_concordant +
         concordance_stats.total_discordant))
    concordance_stats.export(output + 'variants.tsv')
Example #23
    def test_import_keyby_count_ldsc_lowered_shuffle(self):
        # integration test pulled out of test_ld_score_regression to isolate issues with lowered shuffles
        # and RDD serialization, 2021-07-06
        # if this comment no longer reflects the backend system, that's a really good thing
        ht_scores = hl.import_table(
            doctest_resource('ld_score_regression.univariate_ld_scores.tsv'),
            key='SNP',
            types={
                'L2': hl.tfloat,
                'BP': hl.tint
            })

        ht_20160 = hl.import_table(
            doctest_resource('ld_score_regression.20160.sumstats.tsv'),
            key='SNP',
            types={
                'N': hl.tint,
                'Z': hl.tfloat
            })

        j1 = ht_scores[ht_20160['SNP']]
        ht_20160 = ht_20160.annotate(ld_score=j1['L2'],
                                     locus=hl.locus(j1['CHR'], j1['BP']),
                                     alleles=hl.array(
                                         [ht_20160['A2'], ht_20160['A1']]))

        ht_20160 = ht_20160.key_by(ht_20160['locus'], ht_20160['alleles'])
        assert ht_20160._force_count() == 151
Example #24
def downsample(x, y, label=None, n_divisions=500) -> ArrayExpression:
    """Downsample (x, y) coordinate datapoints.

    Parameters
    ----------
    x : :class:`.NumericExpression`
        X-values to be downsampled.
    y : :class:`.NumericExpression`
        Y-values to be downsampled.
    label : :class:`.StringExpression` or :class:`.ArrayExpression`
        Additional data for each (x, y) coordinate. Can pass in multiple fields in an :class:`.ArrayExpression`.
    n_divisions : :obj:`int`
        Factor by which to downsample (default value = 500). A lower input results in fewer output datapoints.

    Returns
    -------
    :class:`.ArrayExpression`
        Expression for downsampled coordinate points (x, y). The element type of the array is
        :py:data:`.ttuple` of :py:data:`.tfloat64`, :py:data:`.tfloat64`, and :py:data:`.tarray` of :py:data:`.tstring`
    """
    if label is None:
        label = hl.null(hl.tarray(hl.tstr))
    elif isinstance(label, StringExpression):
        label = hl.array([label])
    return _agg_func('downsample',
                     _to_agg(x),
                     tarray(ttuple(tfloat64, tfloat64, tarray(tstr))),
                     constructor_args=[n_divisions],
                     seq_op_args=[lambda x: x, y, label])
Example #25
def explode_duplicate_samples_ht(dups_ht: hl.Table) -> hl.Table:
    """
    Explodes the result of `get_duplicated_samples_ht`, so that each line contains a single sample.
    An additional annotation, `dup_filtered`, is added, indicating which of the duplicated samples was kept.
    Requires a field `filtered` whose type should be the same as the input duplicated samples Table key.

    :param dups_ht: Input HT
    :return: Flattened HT
    """
    def get_dups_to_keep_expr():
        if dups_ht.filtered.dtype.element_type == dups_ht.key.dtype:
            return (dups_ht.key, False)
        elif (len(dups_ht.key) == 1) & (dups_ht.filtered.dtype.element_type
                                        == dups_ht.key[0].dtype):
            return (dups_ht.key[0], False)
        else:
            raise TypeError(
                f"Cannot explode table as types of the filtered field ({dups_ht.filtered.dtype}) and the key ({dups_ht.key.dtype}) are incompatible."
            )

    dups_ht = dups_ht.annotate(dups=hl.array([get_dups_to_keep_expr()]).extend(
        dups_ht.filtered.map(lambda x: (x, True))))
    dups_ht = dups_ht.explode("dups")
    dups_ht = dups_ht.key_by()
    return dups_ht.select(s=dups_ht.dups[0],
                          dup_filtered=dups_ht.dups[1]).key_by("s")
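Mechanically, each row's `dups` array holds (sample, was_filtered) tuples with the kept sample first, and explode emits one row per tuple. That step on its own, with hypothetical sample names:

import hail as hl

ht = hl.utils.range_table(1)
ht = ht.annotate(dups=hl.literal([('keep', False), ('drop1', True), ('drop2', True)]))
ht = ht.explode('dups')  # one row per (sample, dup_filtered) tuple
ht = ht.select(s=ht.dups[0], dup_filtered=ht.dups[1])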
Example #26
def compute_prs_mt(genotype_mt_path, prs_mt_path):
    scratch_dir = 'gs://ukbb-diverse-temp-30day/nb-scratch'

    clumped = hl.read_table(
        'gs://ukb-diverse-pops/ld_prune/results_high_quality/not_AMR/phecode-250.2-both_sexes/clump_results.ht/'
    )
    sumstats = hl.import_table(
        'gs://ukb-diverse-pops/sumstats_flat_files/phecode-250.2-both_sexes.tsv.bgz',
        impute=True)
    sumstats = sumstats.annotate(locus=hl.locus(sumstats.chr, sumstats.pos),
                                 alleles=hl.array([sumstats.ref,
                                                   sumstats.alt]))
    sumstats = sumstats.key_by('locus', 'alleles')
    sumstats.describe()
    #    mt = hl.read_matrix_table(genotype_mt_path) # read genotype mt subset

    # get full genotype mt
    meta_mt = hl.read_matrix_table(get_meta_analysis_results_path())
    mt = get_filtered_mt_with_x()
    mt = mt.filter_rows(hl.is_defined(meta_mt.rows()[mt.row_key]))
    mt = mt.select_entries('dosage')
    mt = mt.select_rows()
    mt = mt.select_cols()

    mt = mt.annotate_rows(beta=hl.if_else(hl.is_defined(clumped[mt.row_key]),
                                          sumstats[mt.row_key].beta_meta, 0))
    mt = mt.annotate_cols(score=hl.agg.sum(mt.beta * mt.dosage))
    mt_cols = mt.cols()
    mt_cols = mt_cols.repartition(1000)
    mt_cols.write(f'{scratch_dir}/prs_all_samples.ht')
Example #27
def _linreg(y, x, nested_dim):
    k = len(x)
    k0 = nested_dim
    if k0 < 0 or k0 > k:
        raise ValueError(
            "linreg: `nested_dim` must be between 0 and the number "
            f"of covariates ({k}), inclusive")

    t = hl.tstruct(beta=hl.tarray(hl.tfloat64),
                   standard_error=hl.tarray(hl.tfloat64),
                   t_stat=hl.tarray(hl.tfloat64),
                   p_value=hl.tarray(hl.tfloat64),
                   multiple_standard_error=hl.tfloat64,
                   multiple_r_squared=hl.tfloat64,
                   adjusted_r_squared=hl.tfloat64,
                   f_stat=hl.tfloat64,
                   multiple_p_value=hl.tfloat64,
                   n=hl.tint64)

    x = hl.array(x)
    k = hl.int32(k)
    k0 = hl.int32(k0)

    return _agg_func('LinearRegression',
                     _to_agg(y),
                     t, [k, k0],
                     seq_op_args=[lambda y: y, x])
Example #28
def _coerce(self, x: Expression):
    assert isinstance(x, hl.expr.DictExpression)
    if not self.kc._requires_conversion(x.dtype.key_type):
        # fast path
        return x.map_values(self.vc.coerce)
    else:
        return hl.dict(hl.map(lambda e: (self.kc.coerce(e[0]), self.vc.coerce(e[1])),
                              hl.array(x)))
Example #29
    def phase_diploid_proband(
            locus: hl.expr.LocusExpression,
            alleles: hl.expr.ArrayExpression,
            proband_call: hl.expr.CallExpression,
            father_call: hl.expr.CallExpression,
            mother_call: hl.expr.CallExpression
    ) -> hl.expr.ArrayExpression:
        """
        Returns phased genotype calls in the case of a diploid proband
        (autosomes, PAR regions of sex chromosomes or non-PAR regions of a female proband)

        :param LocusExpression locus: Locus in the trio MatrixTable
        :param ArrayExpression alleles: Alleles in the trio MatrixTable
        :param CallExpression proband_call: Input proband genotype call
        :param CallExpression father_call: Input father genotype call
        :param CallExpression mother_call: Input mother genotype call
        :return: Array containing: phased proband call, phased father call, phased mother call
        :rtype: ArrayExpression
        """

        proband_v = proband_call.one_hot_alleles(alleles)
        father_v = hl.cond(
            locus.in_x_nonpar() | locus.in_y_nonpar(),
            hl.or_missing(father_call.is_haploid(), hl.array([father_call.one_hot_alleles(alleles)])),
            call_to_one_hot_alleles_array(father_call, alleles)
        )
        mother_v = call_to_one_hot_alleles_array(mother_call, alleles)

        combinations = hl.flatmap(
            lambda f:
            hl.zip_with_index(mother_v)
                .filter(lambda m: m[1] + f[1] == proband_v)
                .map(lambda m: hl.struct(m=m[0], f=f[0])),
            hl.zip_with_index(father_v)
        )

        return (
            hl.or_missing(
                hl.is_defined(combinations) & (hl.len(combinations) == 1),
                hl.array([
                    hl.call(father_call[combinations[0].f], mother_call[combinations[0].m], phased=True),
                    hl.cond(father_call.is_haploid(), hl.call(father_call[0], phased=True), phase_parent_call(father_call, combinations[0].f)),
                    phase_parent_call(mother_call, combinations[0].m)
                ])
            )
        )
Example #32
def fix_alleles(alleles):
    ref = alleles.map(lambda d: d.ref).fold(
        lambda s, t: hl.cond(hl.len(s) > hl.len(t), s, t), '')
    alts = alleles.map(lambda a: hl.switch(hl.allele_type(
        a.ref, a.alt)).when('SNP', a.alt + ref[hl.len(a.alt):]).when(
            'Insertion', a.alt + ref[hl.len(a.ref):]).when(
                'Deletion', a.alt + ref[hl.len(a.ref):]).default(a.alt))
    return hl.array([ref]).extend(alts)
Example #33
def test_complex_round_trips():
    assert_round_trip(hl.struct())
    assert_round_trip(hl.empty_array(hl.tint32))
    assert_round_trip(hl.empty_set(hl.tint32))
    assert_round_trip(hl.empty_dict(hl.tint32, hl.tint32))
    assert_round_trip(hl.locus('1', 100))
    assert_round_trip(hl.struct(x=3))
    assert_round_trip(hl.set([3, 4, 5, 3]))
    assert_round_trip(hl.array([3, 4, 5]))
    assert_round_trip(hl.dict({3: 'a', 4: 'b', 5: 'c'}))
    assert_round_trip(
        hl.struct(x=hl.dict({
            3: 'a',
            4: 'b',
            5: 'c'
        }),
                  y=hl.array([3, 4, 5]),
                  z=hl.set([3, 4, 5, 3])))
Example #34
def explode_phase_info(ht: hl.Table, remove_all_ref: bool = True) -> hl.Table:
    ht = ht.transmute(phase_info=hl.array(ht.phase_info))
    ht = ht.explode('phase_info')
    ht = ht.transmute(pop=ht.phase_info[0], phase_info=ht.phase_info[1])

    if remove_all_ref:
        ht = ht.filter(hl.sum(ht.phase_info.gt_counts.raw[1:]) > 0)

    return ht
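The first transmute works because hl.array on a dict expression yields an array of (key, value) tuples, which is what makes the [0]/[1] indexing possible after the explode. In isolation:

import hail as hl

d = hl.dict({'AFR': 1, 'EUR': 2})
pairs = hl.eval(hl.array(d))  # key-value tuples, in no guaranteed order
assert dict(pairs) == {'AFR': 1, 'EUR': 2}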
Example #35
def test_agg_cols_group_by(self):
    t = hl.utils.range_matrix_table(1, 10)
    tests = [(agg.group_by(t.col_idx % 2,
                           hl.array(agg.collect_as_set(t.col_idx + 1)).append(0)),
              {0: [1, 3, 5, 7, 9, 0], 1: [2, 4, 6, 8, 10, 0]}),
             (agg.group_by(t.col_idx % 3,
                           agg.filter(t.col_idx > 7,
                                      hl.array(agg.collect_as_set(t.col_idx + 1)).append(0))),
              {0: [10, 0], 1: [0], 2: [9, 0]}),
             (agg.group_by(t.col_idx % 3,
                           agg.explode(lambda elt: agg.collect(elt + 1).append(0),
                                       hl.cond(t.col_idx > 7,
                                               [t.col_idx, t.col_idx + 1],
                                               hl.empty_array(hl.tint32)))),
              {0: [10, 11, 0], 1: [0], 2: [9, 10, 0]}),
             ]
    for aggregation, expected in tests:
        self.assertEqual(t.select_rows(result=aggregation).result.collect()[0], expected)
Example #36
def main(args):

    # init hail
    hl.init(default_reference=args.default_ref_genome)

    # input MT
    mt = hl.read_matrix_table(args.mt_input_path)

    # filter high-quality genotype
    # mt = filter_genotypes_ab(mt)

    # import capture interval table (intersect)
    intervals = hl.read_table(args.ht_intervals)

    # generate an interval x sample MT by computing per intervals callrate
    mt_callrate = compute_callrate_mt(mt=mt, intervals_ht=intervals)

    # run pca
    eigenvalues, ht_pca, _ = run_platform_pca(
        callrate_mt=mt_callrate,
        binarization_threshold=args.binarization_threshold)

    # normalize eigenvalues (0-100)
    eigenvalues_norm = [x / sum(eigenvalues) * 100 for x in eigenvalues]

    # compute eigenvalues cumulative sum
    ev_cumsum = hl.array_scan(lambda i, j: i + j, 0,
                              hl.array(eigenvalues_norm))

    # getting optimal number of PCs (those which explain 99% of the variance)
    n_optimal_pcs = hl.eval(hl.len(ev_cumsum.filter(lambda x: x < 99.0)))

    logger.info(
        f"Keep only principal components which explain up to 99% of the variance. Number of optimal PCs found: {n_optimal_pcs}"
    )

    # filter out uninformative PCs
    ht_pca = ht_pca.annotate(scores=ht_pca.scores[:n_optimal_pcs])

    # apply unsupervised clustering on PCs to infer samples platform
    ht_platform = assign_platform_from_pcs(
        platform_pca_scores_ht=ht_pca,
        pc_scores_ann='scores',
        hdbscan_min_cluster_size=args.hdbscan_min_cluster_size,
        hdbscan_min_samples=args.hdbscan_min_cluster_size)

    ht_platform.show()

    # write HT
    ht_platform.write(output=args.ht_output_path, overwrite=args.overwrite)

    # export to file if true
    if args.write_to_file:
        (ht_platform.export(f'{args.ht_output_path}.tsv.bgz'))

    hl.stop()
Example #37
def project_max_expr(
    project_expr: hl.expr.StringExpression,
    gt_expr: hl.expr.CallExpression,
    alleles_expr: hl.expr.ArrayExpression,
    n_projects: int = 5,
) -> hl.expr.ArrayExpression:
    """
    Create an expression that computes allele frequency information by project for the `n_projects` with the largest AF at this row.

    Will return an array with one element per non-reference allele.

    Each of these elements is itself an array of structs with the following fields:

        - AC: int32
        - AF: float64
        - AN: int32
        - homozygote_count: int32
        - project: str

    .. note::

        Only projects with AF > 0 are returned.
        In case of ties, the project ordering is not guaranteed, and at most `n_projects` are returned.

    :param project_expr: column expression containing the project
    :param gt_expr: entry expression containing the genotype
    :param alleles_expr: row expression containing the alleles
    :param n_projects: Maximum number of projects to return for each row
    :return: projectmax expression
    """
    n_alleles = hl.len(alleles_expr)

    # compute call stats by project
    project_cs = hl.array(
        hl.agg.group_by(project_expr, hl.agg.call_stats(gt_expr,
                                                        alleles_expr)))

    return hl.or_missing(
        n_alleles > 1,  # Exclude monomorphic sites
        hl.range(1, n_alleles).map(lambda ai: hl.sorted(
            project_cs.filter(
                # filter to projects with AF > 0
                lambda x: x[1].AF[ai] > 0),
            # order the callstats computed by AF in decreasing order
            lambda x: -x[1].AF[ai]
            # take the n_projects projects with largest AF
        )[:n_projects].map(
            # add the project in the callstats struct
            lambda x: x[1].annotate(
                AC=x[1].AC[ai],
                AF=x[1].AF[ai],
                AN=x[1].AN,
                homozygote_count=x[1].homozygote_count[ai],
                project=x[0],
            ))),
    )
Example #38
    def call_to_one_hot_alleles_array(call: hl.expr.CallExpression, alleles: hl.expr.ArrayExpression) -> hl.expr.ArrayExpression:
        """
        Get the set of all different one-hot-encoded allele-vectors in a genotype call.
        It is returned as an ordered array where the first vector corresponds to the first allele,
        and the second vector (only present if het) the second allele.

        :param CallExpression call: genotype
        :param ArrayExpression alleles: Alleles at the site
        :return: Array of one-hot-encoded alleles
        :rtype: ArrayExpression
        """
        return hl.cond(
            call.is_het(),
            hl.array([
                hl.call(call[0]).one_hot_alleles(alleles),
                hl.call(call[1]).one_hot_alleles(alleles),
            ]),
            hl.array([hl.call(call[0]).one_hot_alleles(alleles)])
        )
Example #39
def test_multi_way_zip_join_globals(self):
    t1 = hl.utils.range_table(1).annotate_globals(x=hl.null(hl.tint32))
    t2 = hl.utils.range_table(1).annotate_globals(x=5)
    t3 = hl.utils.range_table(1).annotate_globals(x=0)
    expected = hl.struct(__globals=hl.array([
        hl.struct(x=hl.null(hl.tint32)),
        hl.struct(x=5),
        hl.struct(x=0)]))
    joined = hl.Table._multi_way_zip_join([t1, t2, t3], '__data', '__globals')
    self.assertEqual(hl.eval(joined.globals), hl.eval(expected))
Example #41
    def test_matrix_filter_intervals(self):
        ds = hl.import_vcf(resource('sample.vcf'), min_partitions=20)

        self.assertEqual(
            hl.filter_intervals(ds, [hl.parse_locus_interval('20:10639222-10644705')]).count_rows(), 3)

        intervals = [hl.parse_locus_interval('20:10639222-10644700'),
                     hl.parse_locus_interval('20:10644700-10644705')]
        self.assertEqual(hl.filter_intervals(ds, intervals).count_rows(), 3)

        intervals = hl.array([hl.parse_locus_interval('20:10639222-10644700'),
                              hl.parse_locus_interval('20:10644700-10644705')])
        self.assertEqual(hl.filter_intervals(ds, intervals).count_rows(), 3)

        intervals = hl.array([hl.eval(hl.parse_locus_interval('20:10639222-10644700')),
                              hl.parse_locus_interval('20:10644700-10644705')])
        self.assertEqual(hl.filter_intervals(ds, intervals).count_rows(), 3)

        intervals = [hl.eval(hl.parse_locus_interval('[20:10019093-10026348]')),
                     hl.eval(hl.parse_locus_interval('[20:17705793-17716416]'))]
        self.assertEqual(hl.filter_intervals(ds, intervals).count_rows(), 4)
Example #42
def create_all_values():
    return hl.struct(
        f32=hl.float32(3.14),
        i64=hl.int64(-9),
        m=hl.null(hl.tfloat64),
        astruct=hl.struct(a=hl.null(hl.tint32), b=5.5),
        mstruct=hl.null(hl.tstruct(x=hl.tint32, y=hl.tstr)),
        aset=hl.set(['foo', 'bar', 'baz']),
        mset=hl.null(hl.tset(hl.tfloat64)),
        d=hl.dict({hl.array(['a', 'b']): 0.5, hl.array(['x', hl.null(hl.tstr), 'z']): 0.3}),
        md=hl.null(hl.tdict(hl.tint32, hl.tstr)),
        h38=hl.locus('chr22', 33878978, 'GRCh38'),
        ml=hl.null(hl.tlocus('GRCh37')),
        i=hl.interval(
            hl.locus('1', 999),
            hl.locus('1', 1001)),
        c=hl.call(0, 1),
        mc=hl.null(hl.tcall),
        t=hl.tuple([hl.call(1, 2, phased=True), 'foo', hl.null(hl.tstr)]),
        mt=hl.null(hl.ttuple(hl.tlocus('GRCh37'), hl.tbool))
    )
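assert_round_trip is defined elsewhere in the test module; a plausible shape for it, sketched purely as an assumption, is eval-literal-eval:

import hail as hl

def assert_round_trip(expr):
    # Assumed contract: evaluating, re-wrapping as a typed literal, and
    # evaluating again reproduces the same Python value.
    value = hl.eval(expr)
    assert hl.eval(hl.literal(value, dtype=expr.dtype)) == value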
Example #43
def create_all_values_datasets():
    all_values = hl.struct(
        f32=hl.float32(3.14),
        i64=hl.int64(-9),
        m=hl.null(hl.tfloat64),
        astruct=hl.struct(a=hl.null(hl.tint32), b=5.5),
        mstruct=hl.null(hl.tstruct(x=hl.tint32, y=hl.tstr)),
        aset=hl.set(['foo', 'bar', 'baz']),
        mset=hl.null(hl.tset(hl.tfloat64)),
        d=hl.dict({hl.array(['a', 'b']): 0.5, hl.array(['x', hl.null(hl.tstr), 'z']): 0.3}),
        md=hl.null(hl.tdict(hl.tint32, hl.tstr)),
        h38=hl.locus('chr22', 33878978, 'GRCh38'),
        ml=hl.null(hl.tlocus('GRCh37')),
        i=hl.interval(
            hl.locus('1', 999),
            hl.locus('1', 1001)),
        c=hl.call(0, 1),
        mc=hl.null(hl.tcall),
        t=hl.tuple([hl.call(1, 2, phased=True), 'foo', hl.null(hl.tstr)]),
        mt=hl.null(hl.ttuple(hl.tlocus('GRCh37'), hl.tbool))
    )

    def prefix(s, p):
        return hl.struct(**{p + k: s[k] for k in s})

    all_values_table = (hl.utils.range_table(5, n_partitions=3)
                        .annotate_globals(**prefix(all_values, 'global_'))
                        .annotate(**all_values)
                        .cache())

    all_values_matrix_table = (hl.utils.range_matrix_table(3, 2, n_partitions=2)
                               .annotate_globals(**prefix(all_values, 'global_'))
                               .annotate_rows(**prefix(all_values, 'row_'))
                               .annotate_cols(**prefix(all_values, 'col_'))
                               .annotate_entries(**prefix(all_values, 'entry_'))
                               .cache())

    return all_values_table, all_values_matrix_table
Example #44
def quick_summary(mt):
    """compute aggregate INFO fields that do not require densify"""
    return mt.annotate_rows(
        info=hl.struct(
            MQ_DP=hl.agg.sum(mt.entry.gvcf_info.MQ_DP),
            QUALapprox=hl.agg.sum(mt.entry.gvcf_info.QUALapprox),
            RAW_MQ=hl.agg.sum(mt.entry.gvcf_info.RAW_MQ),
            VarDP=hl.agg.sum(mt.entry.gvcf_info.VarDP),
            SB_TABLE=hl.array([
                hl.agg.sum(mt.entry.SB[0]),
                hl.agg.sum(mt.entry.SB[1]),
                hl.agg.sum(mt.entry.SB[2]),
                hl.agg.sum(mt.entry.SB[3]),
            ])))
Example #46
def test_agg_cols_filter(self):
    t = hl.utils.range_matrix_table(1, 10)
    tests = [(agg.filter(t.col_idx > 7,
                         agg.collect(t.col_idx + 1).append(0)),
              [9, 10, 0]),
             (agg.filter(t.col_idx > 7,
                         agg.explode(lambda elt: agg.collect(elt + 1).append(0),
                                     [t.col_idx, t.col_idx + 1])),
              [9, 10, 10, 11, 0]),
             (agg.filter(t.col_idx > 7,
                         agg.group_by(t.col_idx % 3,
                                      hl.array(agg.collect_as_set(t.col_idx + 1)).append(0))),
              {0: [10, 0], 2: [9, 0]})
             ]
    for aggregation, expected in tests:
        self.assertEqual(t.select_rows(result=aggregation).result.collect()[0], expected)
Exemple #47
0
def generate_random_gen():
    mt = hl.utils.range_matrix_table(30, 10)
    mt = (mt.annotate_rows(locus = hl.locus('20', mt.row_idx + 1),
                           alleles = ['A', 'G'])
          .key_rows_by('locus', 'alleles'))
    mt = (mt.annotate_cols(s = hl.str(mt.col_idx))
          .key_cols_by('s'))
    # using totally random values leads rounding differences where
    # identical GEN values get rounded differently, leading to
    # differences in the GT call between import_{gen, bgen}
    mt = mt.annotate_entries(a = hl.int32(hl.rand_unif(0.0, 255.0)))
    mt = mt.annotate_entries(b = hl.int32(hl.rand_unif(0.0, 255.0 - mt.a)))
    mt = mt.transmute_entries(GP = hl.array([mt.a, mt.b, 255.0 - mt.a - mt.b]) / 255.0)
    # 20% missing
    mt = mt.filter_entries(hl.rand_bool(0.8))
    hl.export_gen(mt, 'random', precision=4)
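A hedged round-trip sketch: with the default suffixes, `export_gen` writes `random.gen` and `random.sample`, which `hl.import_gen` can read back for the `import_gen`/`import_bgen` comparison the comment alludes to:

generate_random_gen()
mt = hl.import_gen('random.gen', sample_file='random.sample')
mt.GP.show(5)  # probabilities reflect the 4-digit precision used at export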
Exemple #48
0
def phase_y_nonpar(
        proband_call: hl.expr.CallExpression,
        father_call: hl.expr.CallExpression,
) -> hl.expr.ArrayExpression:
    """
    Returns phased genotype calls in the non-PAR region of Y (requires both father and proband to be haploid to return phase).

    :param CallExpression proband_call: Input proband genotype call
    :param CallExpression father_call: Input father genotype call
    :return: Array containing: phased proband call, phased father call, missing mother call (there is no mother call on Y)
    :rtype: ArrayExpression
    """
    return hl.or_missing(
        proband_call.is_haploid() & father_call.is_haploid() & (father_call[0] == proband_call[0]),
        hl.array([
            hl.call(proband_call[0], phased=True),
            hl.call(father_call[0], phased=True),
            hl.null(hl.tcall)
        ])
    )
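Because this helper depends on nothing beyond its two arguments, it can be exercised directly with `hl.eval`; a small sketch:

# Haploid proband and father with the same Y allele: phase is returned.
print(hl.eval(phase_y_nonpar(hl.call(0), hl.call(0))))
# -> phased proband call, phased father call, missing third element

# A mismatched allele (or a diploid call) yields a missing result.
assert hl.eval(phase_y_nonpar(hl.call(0), hl.call(1))) is None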
Exemple #49
0
def _linreg(y, x, nested_dim):
    k = len(x)
    k0 = nested_dim
    if k0 < 0 or k0 > k:
        raise ValueError("linreg: `nested_dim` must be between 0 and the number "
                         f"of covariates ({k}), inclusive")

    t = hl.tstruct(beta=hl.tarray(hl.tfloat64),
                   standard_error=hl.tarray(hl.tfloat64),
                   t_stat=hl.tarray(hl.tfloat64),
                   p_value=hl.tarray(hl.tfloat64),
                   multiple_standard_error=hl.tfloat64,
                   multiple_r_squared=hl.tfloat64,
                   adjusted_r_squared=hl.tfloat64,
                   f_stat=hl.tfloat64,
                   multiple_p_value=hl.tfloat64,
                   n=hl.tint64)

    x = hl.array(x)
    k = hl.int32(k)
    k0 = hl.int32(k0)

    return _agg_func('LinearRegression', [y, x], t, [k, k0])
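`_linreg` is the private builder behind the public aggregator; a hedged sketch of the usual entry point, `hl.agg.linreg`, where the intercept is passed explicitly as the covariate `1.0`:

ht = hl.utils.range_table(100)
ht = ht.annotate(x=hl.rand_norm(), y=hl.rand_norm())
res = ht.aggregate(hl.agg.linreg(ht.y, [1.0, ht.x]))  # a struct with the fields typed above
print(res.beta, res.p_value, res.n)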
Exemple #50
0
def full_outer_join_mt(left: hl.MatrixTable, right: hl.MatrixTable) -> hl.MatrixTable:
    """Performs a full outer join on `left` and `right`.

    Replaces row, column, and entry fields with the following:

     - `left_row` / `right_row`: structs of row fields from left and right.
     - `left_col` / `right_col`: structs of column fields from left and right.
     - `left_entry` / `right_entry`: structs of entry fields from left and right.

    Parameters
    ----------
    left : :class:`.MatrixTable`
    right : :class:`.MatrixTable`

    Returns
    -------
    :class:`.MatrixTable`
    """

    if [x.dtype for x in left.row_key.values()] != [x.dtype for x in right.row_key.values()]:
        raise ValueError(f"row key types do not match:\n"
                         f"  left:  {list(left.row_key.values())}\n"
                         f"  right: {list(right.row_key.values())}")

    if [x.dtype for x in left.col_key.values()] != [x.dtype for x in right.col_key.values()]: 
        raise ValueError(f"column key types do not match:\n"
                         f"  left:  {list(left.col_key.values())}\n"
                         f"  right: {list(right.col_key.values())}")

    left = left.select_rows(left_row=left.row)
    left_t = left.localize_entries('left_entries', 'left_cols')
    right = right.select_rows(right_row=right.row)
    right_t = right.localize_entries('right_entries', 'right_cols')

    ht = left_t.join(right_t, how='outer')
    ht = ht.annotate_globals(
        left_keys=hl.group_by(
            lambda t: t[0],
            hl.zip_with_index(
                ht.left_cols.map(lambda x: hl.tuple([x[f] for f in left.col_key])), index_first=False)).map_values(
            lambda elts: elts.map(lambda t: t[1])),
        right_keys=hl.group_by(
            lambda t: t[0],
            hl.zip_with_index(
                ht.right_cols.map(lambda x: hl.tuple([x[f] for f in right.col_key])), index_first=False)).map_values(
            lambda elts: elts.map(lambda t: t[1])))
    ht = ht.annotate_globals(
        key_indices=hl.array(ht.left_keys.key_set().union(ht.right_keys.key_set()))
            .map(lambda k: hl.struct(k=k, left_indices=ht.left_keys.get(k), right_indices=ht.right_keys.get(k)))
            .flatmap(lambda s: hl.case()
                     .when(hl.is_defined(s.left_indices) & hl.is_defined(s.right_indices),
                           hl.range(0, s.left_indices.length()).flatmap(
                               lambda i: hl.range(0, s.right_indices.length()).map(
                                   lambda j: hl.struct(k=s.k, left_index=s.left_indices[i],
                                                       right_index=s.right_indices[j]))))
                     .when(hl.is_defined(s.left_indices),
                           s.left_indices.map(
                               lambda elt: hl.struct(k=s.k, left_index=elt, right_index=hl.null('int32'))))
                     .when(hl.is_defined(s.right_indices),
                           s.right_indices.map(
                               lambda elt: hl.struct(k=s.k, left_index=hl.null('int32'), right_index=elt)))
                     .or_error('assertion error')))
    ht = ht.annotate(__entries=ht.key_indices.map(lambda s: hl.struct(left_entry=ht.left_entries[s.left_index],
                                                                      right_entry=ht.right_entries[s.right_index])))
    ht = ht.annotate_globals(__cols=ht.key_indices.map(
        lambda s: hl.struct(**{f: s.k[i] for i, f in enumerate(left.col_key)},
                            left_col=ht.left_cols[s.left_index],
                            right_col=ht.right_cols[s.right_index])))
    ht = ht.drop('left_entries', 'left_cols', 'left_keys', 'right_entries', 'right_cols', 'right_keys', 'key_indices')
    return ht._unlocalize_entries('__entries', '__cols', list(left.col_key))
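A small sketch on two range matrix tables, whose `row_idx`/`col_idx` key types match by construction:

mt1 = hl.utils.range_matrix_table(3, 2)
mt2 = hl.utils.range_matrix_table(5, 4)
joined = full_outer_join_mt(mt1, mt2)
joined.describe()  # row/col/entry fields are now left_*/right_* structs
joined.count()     # (5, 4): the union of row keys and of column keys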
Exemple #51
0
def manhattan(pvals, locus=None, title=None, size=4, hover_fields=None, collect_all=False, n_divisions=500):
    """Create a Manhattan plot. (https://en.wikipedia.org/wiki/Manhattan_plot)

    Parameters
    ----------
    pvals : :class:`.Float64Expression`
        P-values to be plotted.
    locus : :class:`.LocusExpression`
        Locus values to be plotted.
    title : str
        Title of the plot.
    size : int
        Size of markers in screen space units.
    hover_fields : Dict[str, :class:`.Expression`]
        Dictionary of field names and values to be shown in the HoverTool of the plot.
    collect_all : bool
        Whether to collect all values or downsample before plotting.
    n_divisions : int
        Factor by which to downsample (default value = 500). A lower input results in fewer output datapoints.

    Returns
    -------
    :class:`bokeh.plotting.figure.Figure`
    """
    def get_contig_index(x, starts):
        left = 0
        right = len(starts) - 1
        while left <= right:
            mid = (left + right) // 2
            if x < starts[mid]:
                if x >= starts[mid - 1]:
                    return mid - 1
                right = mid
            elif x >= starts[mid+1]:
                left = mid + 1
            else:
                return mid

    if locus is None:
        locus = pvals._indices.source.locus

    if hover_fields is None:
        hover_fields = {}

    hover_fields['locus'] = hail.str(locus)

    pvals = -hail.log10(pvals)

    if collect_all:
        res = hail.tuple([locus.global_position(), pvals, hail.struct(**hover_fields)]).collect()
        hf_struct = [point[2] for point in res]
        for key in hover_fields:
            hover_fields[key] = [item[key] for item in hf_struct]
    else:
        agg_f = pvals._aggregation_method()
        res = agg_f(aggregators.downsample(locus.global_position(), pvals,
                                           label=hail.array([hail.str(x) for x in hover_fields.values()]),
                                           n_divisions=n_divisions))
        fields = [point[2] for point in res]
        for idx, key in enumerate(list(hover_fields.keys())):
            hover_fields[key] = [field[idx] for field in fields]

    x = [point[0] for point in res]
    y = [point[1] for point in res]
    y_linear = [10 ** (-p) for p in y]
    hover_fields['p_value'] = y_linear

    ref = locus.dtype.reference_genome

    total_pos = 0
    start_points = []
    for i in range(0, len(ref.contigs)):
        start_points.append(total_pos)
        total_pos += ref.lengths.get(ref.contigs[i])
    start_points.append(total_pos)  # end point of all contigs

    observed_contigs = set()
    label = []
    for element in x:
        contig_index = get_contig_index(element, start_points)
        label.append(str(contig_index % 2))
        observed_contigs.add(ref.contigs[contig_index])

    labels = ref.contigs.copy()
    num_deleted = 0
    mid_points = []
    for i in range(0, len(ref.contigs)):
        if ref.contigs[i] in observed_contigs:
            length = ref.lengths.get(ref.contigs[i])
            mid = start_points[i] + length / 2
            if mid % 1 == 0:
                mid += 0.5
            mid_points.append(mid)
        else:
            del labels[i - num_deleted]
            num_deleted += 1

    p = scatter(x, y, label=label, title=title, xlabel='Chromosome', ylabel='P-value (-log10 scale)',
                size=size, legend=False, source_fields=hover_fields)

    p.xaxis.ticker = mid_points
    p.xaxis.major_label_overrides = dict(zip(mid_points, labels))
    p.width = 1000

    tooltips = [(key, "@{}".format(key)) for key in hover_fields]
    p.add_tools(HoverTool(
        tooltips=tooltips
    ))

    return p
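A hedged usage sketch; `dataset`, its `phenotype` column field, and its `GT` entries are illustrative names, not part of this function:

from bokeh.io import show

gwas = hl.linear_regression_rows(y=dataset.phenotype,
                                 x=dataset.GT.n_alt_alleles(),
                                 covariates=[1.0])
p = manhattan(gwas.p_value,
              hover_fields={'alleles': hl.delimit(gwas.alleles, ',')})
show(p)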
Exemple #52
0
def locus_windows(locus_expr, radius, coord_expr=None, _localize=True):
    """Returns start and stop indices for window around each locus.

    Examples
    --------

    Windows with 2bp radius for one contig with positions 1, 2, 3, 4, 5:

    >>> starts, stops = hl.linalg.utils.locus_windows(
    ...     hl.balding_nichols_model(1, 5, 5).locus,
    ...     radius=2)
    >>> starts, stops
    (array([0, 0, 0, 1, 2]), array([3, 4, 5, 5, 5]))

    The following examples involve three contigs.

    >>> loci = [{'locus': hl.Locus('1', 1), 'cm': 1.0},
    ...         {'locus': hl.Locus('1', 2), 'cm': 3.0},
    ...         {'locus': hl.Locus('1', 4), 'cm': 4.0},
    ...         {'locus': hl.Locus('2', 1), 'cm': 2.0},
    ...         {'locus': hl.Locus('2', 1), 'cm': 2.0},
    ...         {'locus': hl.Locus('3', 3), 'cm': 5.0}]

    >>> ht = hl.Table.parallelize(
    ...         loci,
    ...         hl.tstruct(locus=hl.tlocus('GRCh37'), cm=hl.tfloat64),
    ...         key=['locus'])

    Windows with 1bp radius:

    >>> hl.linalg.utils.locus_windows(ht.locus, 1)
    (array([0, 0, 2, 3, 3, 5]), array([2, 2, 3, 5, 5, 6]))

    Windows with 1cm radius:

    >>> hl.linalg.utils.locus_windows(ht.locus, 1.0, coord_expr=ht.cm)
    (array([0, 1, 1, 3, 3, 5]), array([1, 3, 3, 5, 5, 6]))

    Notes
    -----
    This function returns two 1-dimensional ndarrays of integers,
    ``starts`` and ``stops``, each of size equal to the number of rows.

    By default, for all indices ``i``, ``[starts[i], stops[i])`` is the maximal
    range of row indices ``j`` such that ``contig[i] == contig[j]`` and
    ``position[i] - radius <= position[j] <= position[i] + radius``.

    If the :meth:`.global_position` on `locus_expr` is not in ascending order,
    this method will fail. Ascending order should hold for a matrix table keyed
    by locus or variant (and the associated row table), or for a table that has
    been ordered by `locus_expr`.

    Set `coord_expr` to use a value other than position to define the windows.
    This row-indexed numeric expression must be non-missing, non-``nan``, on the
    same source as `locus_expr`, and ascending with respect to locus
    position for each contig; otherwise the function will fail.

    The last example above uses centimorgan coordinates, so
    ``[starts[i], stops[i])`` is the maximal range of row indices ``j`` such
    that ``contig[i] == contig[j]`` and
    ``cm[i] - radius <= cm[j] <= cm[i] + radius``.

    Index ranges are start-inclusive and stop-exclusive. This function is
    especially useful in conjunction with
    :meth:`.BlockMatrix.sparsify_row_intervals`.

    Parameters
    ----------
    locus_expr : :class:`.LocusExpression`
        Row-indexed locus expression on a table or matrix table.
    radius : :obj:`int`
        Radius of window for row values.
    coord_expr : :class:`.Float64Expression`, optional
        Row-indexed numeric expression for the row value.
        Must be on the same table or matrix table as `locus_expr`.
        By default, the row value is given by the locus position.

    Returns
    -------
    (:class:`ndarray` of :obj:`int64`, :class:`ndarray` of :obj:`int64`)
        Tuple of start indices array and stop indices array.
    """
    if radius < 0:
        raise ValueError(f"locus_windows: 'radius' must be non-negative, found {radius}")
    check_row_indexed('locus_windows', locus_expr)
    if coord_expr is not None:
        check_row_indexed('locus_windows', coord_expr)

    src = locus_expr._indices.source
    if locus_expr not in src._fields_inverse:
        locus = Env.get_uid()
        annotate_fields = {locus: locus_expr}

        if coord_expr is not None:
            if coord_expr not in src._fields_inverse:
                coords = Env.get_uid()
                annotate_fields[coords] = coord_expr
            else:
                coords = src._fields_inverse[coord_expr]

        if isinstance(src, hl.MatrixTable):
            new_src = src.annotate_rows(**annotate_fields)
        else:
            new_src = src.annotate(**annotate_fields)

        locus_expr = new_src[locus]
        if coord_expr is not None:
            coord_expr = new_src[coords]

    if coord_expr is None:
        coord_expr = locus_expr.position

    rg = locus_expr.dtype.reference_genome
    contig_group_expr = hl.agg.group_by(hl.locus(locus_expr.contig, 1, reference_genome=rg), hl.agg.collect(coord_expr))

    # check loci are in sorted order
    last_pos = hl.fold(lambda a, elt: (hl.case()
                                         .when(a <= elt, elt)
                                         .or_error("locus_windows: 'locus_expr' global position must be in ascending order.")),
                       -1,
                       hl.agg.collect(hl.case()
                                        .when(hl.is_defined(locus_expr), locus_expr.global_position())
                                        .or_error("locus_windows: missing value for 'locus_expr'.")))
    checked_contig_groups = (hl.case()
                               .when(last_pos >= 0, contig_group_expr)
                               .or_error("locus_windows: 'locus_expr' has length 0"))

    contig_groups = locus_expr._aggregation_method()(checked_contig_groups, _localize=False)

    coords = hl.sorted(hl.array(contig_groups)).map(lambda t: t[1])
    starts_and_stops = hl._locus_windows_per_contig(coords, radius)

    if not _localize:
        return starts_and_stops

    starts, stops = hl.eval(starts_and_stops)
    return np.array(starts), np.array(stops)
Exemple #53
0
def explode_trio_matrix(tm: hl.MatrixTable, col_keys: List[str] = ['s']) -> hl.MatrixTable:
    """Splits a trio MatrixTable back into a sample MatrixTable.

    Example
    -------
    >>> # Create a trio matrix from a sample matrix
    >>> pedigree = hl.Pedigree.read('data/case_control_study.fam')
    >>> trio_dataset = hl.trio_matrix(dataset, pedigree, complete_trios=True)

    >>> # Explode trio matrix back into a sample matrix
    >>> exploded_trio_dataset = explode_trio_matrix(trio_dataset)

    Notes
    -----
    This assumes that the input MatrixTable is a trio MatrixTable (similar to the result of :meth:`.methods.trio_matrix`).
    In particular, it should have the following entry schema:
    - proband_entry
    - father_entry
    - mother_entry
    And the following column schema:
    - proband
    - father
    - mother

    Note
    ----
    The only entry fields kept are `proband_entry`, `father_entry` and `mother_entry`; all other entry fields are dropped.
    The only column fields kept are `proband`, `father` and `mother`; all other column fields are dropped.

    Parameters
    ----------
    tm : :class:`.MatrixTable`
        Trio MatrixTable (entries have to be a Struct with `proband_entry`, `mother_entry` and `father_entry` present)
    col_keys : :obj:`list` of str
        Column key(s) for the resulting sample MatrixTable

    Returns
    -------
    :class:`.MatrixTable`
        Sample MatrixTable"""

    tm = tm.select_entries(
        __trio_entries=hl.array([tm.proband_entry, tm.father_entry, tm.mother_entry])
    )

    tm = tm.select_cols(
        __trio_members=hl.zip_with_index(hl.array([tm.proband, tm.father, tm.mother]))
    )
    mt = tm.explode_cols(tm.__trio_members)

    mt = mt.select_entries(
        **mt.__trio_entries[mt.__trio_members[0]]
    )

    mt = mt.key_cols_by()
    mt = mt.select_cols(**mt.__trio_members[1])

    if col_keys:
        mt = mt.key_cols_by(*col_keys)

    return mt
Exemple #54
0
def field_to_array(ds, field):
    return hl.cond(ds[field] != 0, hl.array([field]), hl.empty_array(hl.tstr))
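A quick sketch of both branches on a literal struct (`hits` and `misses` are illustrative field names):

s = hl.struct(hits=2, misses=0)
print(hl.eval(field_to_array(s, 'hits')))    # ['hits']
print(hl.eval(field_to_array(s, 'misses')))  # []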
Exemple #55
0
def merge_alleles(alleles) -> ArrayExpression:
    return hl.array(hl.set(hl.flatten(alleles)))
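A hedged sketch; because duplicates are removed via `hl.set`, the order of the merged array is not guaranteed:

alleles = hl.literal([['A', 'T'], ['A', 'G']])
print(hl.eval(merge_alleles(alleles)))  # e.g. ['A', 'G', 'T'] -- order unspecified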
Exemple #56
0
    def test_ld_score_regression(self):

        ht_scores = hl.import_table(
            doctest_resource('ld_score_regression.univariate_ld_scores.tsv'),
            key='SNP', types={'L2': hl.tfloat, 'BP': hl.tint})

        ht_50_irnt = hl.import_table(
            doctest_resource('ld_score_regression.50_irnt.sumstats.tsv'),
            key='SNP', types={'N': hl.tint, 'Z': hl.tfloat})

        ht_50_irnt = ht_50_irnt.annotate(
            chi_squared=ht_50_irnt['Z']**2,
            n=ht_50_irnt['N'],
            ld_score=ht_scores[ht_50_irnt['SNP']]['L2'],
            locus=hl.locus(ht_scores[ht_50_irnt['SNP']]['CHR'],
                           ht_scores[ht_50_irnt['SNP']]['BP']),
            alleles=hl.array([ht_50_irnt['A2'], ht_50_irnt['A1']]),
            phenotype='50_irnt')

        ht_50_irnt = ht_50_irnt.key_by(ht_50_irnt['locus'],
                                       ht_50_irnt['alleles'])

        ht_50_irnt = ht_50_irnt.select(ht_50_irnt['chi_squared'],
                                       ht_50_irnt['n'],
                                       ht_50_irnt['ld_score'],
                                       ht_50_irnt['phenotype'])

        ht_20160 = hl.import_table(
            doctest_resource('ld_score_regression.20160.sumstats.tsv'),
            key='SNP', types={'N': hl.tint, 'Z': hl.tfloat})

        ht_20160 = ht_20160.annotate(
            chi_squared=ht_20160['Z']**2,
            n=ht_20160['N'],
            ld_score=ht_scores[ht_20160['SNP']]['L2'],
            locus=hl.locus(ht_scores[ht_20160['SNP']]['CHR'],
                           ht_scores[ht_20160['SNP']]['BP']),
            alleles=hl.array([ht_20160['A2'], ht_20160['A1']]),
            phenotype='20160')

        ht_20160 = ht_20160.key_by(ht_20160['locus'],
                                   ht_20160['alleles'])

        ht_20160 = ht_20160.select(ht_20160['chi_squared'],
                                   ht_20160['n'],
                                   ht_20160['ld_score'],
                                   ht_20160['phenotype'])

        ht = ht_50_irnt.union(ht_20160)
        mt = ht.to_matrix_table(row_key=['locus', 'alleles'],
                                col_key=['phenotype'],
                                row_fields=['ld_score'],
                                col_fields=[])

        mt_tmp = new_temp_file()
        mt.write(mt_tmp, overwrite=True)
        mt = hl.read_matrix_table(mt_tmp)

        ht_results = hl.experimental.ld_score_regression(
            weight_expr=mt['ld_score'],
            ld_score_expr=mt['ld_score'],
            chi_sq_exprs=mt['chi_squared'],
            n_samples_exprs=mt['n'],
            n_blocks=20,
            two_step_threshold=5,
            n_reference_panel_variants=1173569)

        results = {
            x['phenotype']: {
                'mean_chi_sq': x['mean_chi_sq'],
                'intercept_estimate': x['intercept']['estimate'],
                'intercept_standard_error': x['intercept']['standard_error'],
                'snp_heritability_estimate': x['snp_heritability']['estimate'],
                'snp_heritability_standard_error':
                    x['snp_heritability']['standard_error']}
            for x in ht_results.collect()}

        self.assertAlmostEqual(
            results['50_irnt']['mean_chi_sq'],
            3.4386, places=4)
        self.assertAlmostEqual(
            results['50_irnt']['intercept_estimate'],
            0.7727, places=4)
        self.assertAlmostEqual(
            results['50_irnt']['intercept_standard_error'],
            0.2461, places=4)
        self.assertAlmostEqual(
            results['50_irnt']['snp_heritability_estimate'],
            0.3845, places=4)
        self.assertAlmostEqual(
            results['50_irnt']['snp_heritability_standard_error'],
            0.1067, places=4)

        self.assertAlmostEqual(
            results['20160']['mean_chi_sq'],
            1.5209, places=4)
        self.assertAlmostEqual(
            results['20160']['intercept_estimate'],
            1.2109, places=4)
        self.assertAlmostEqual(
            results['20160']['intercept_standard_error'],
            0.2238, places=4)
        self.assertAlmostEqual(
            results['20160']['snp_heritability_estimate'],
            0.0486, places=4)
        self.assertAlmostEqual(
            results['20160']['snp_heritability_standard_error'],
            0.0416, places=4)

        ht = ht_50_irnt.annotate(
            chi_squared_50_irnt=ht_50_irnt['chi_squared'],
            n_50_irnt=ht_50_irnt['n'],
            chi_squared_20160=ht_20160[ht_50_irnt.key]['chi_squared'],
            n_20160=ht_20160[ht_50_irnt.key]['n'])

        ht_results = hl.experimental.ld_score_regression(
            weight_expr=ht['ld_score'],
            ld_score_expr=ht['ld_score'],
            chi_sq_exprs=[ht['chi_squared_50_irnt'],
                          ht['chi_squared_20160']],
            n_samples_exprs=[ht['n_50_irnt'],
                             ht['n_20160']],
            n_blocks=20,
            two_step_threshold=5,
            n_reference_panel_variants=1173569)

        results = {
            x['phenotype']: {
                'mean_chi_sq': x['mean_chi_sq'],
                'intercept_estimate': x['intercept']['estimate'],
                'intercept_standard_error': x['intercept']['standard_error'],
                'snp_heritability_estimate': x['snp_heritability']['estimate'],
                'snp_heritability_standard_error':
                    x['snp_heritability']['standard_error']}
            for x in ht_results.collect()}

        self.assertAlmostEqual(
            results[0]['mean_chi_sq'],
            3.4386, places=4)
        self.assertAlmostEqual(
            results[0]['intercept_estimate'],
            0.7727, places=4)
        self.assertAlmostEqual(
            results[0]['intercept_standard_error'],
            0.2461, places=4)
        self.assertAlmostEqual(
            results[0]['snp_heritability_estimate'],
            0.3845, places=4)
        self.assertAlmostEqual(
            results[0]['snp_heritability_standard_error'],
            0.1067, places=4)

        self.assertAlmostEqual(
            results[1]['mean_chi_sq'],
            1.5209, places=4)
        self.assertAlmostEqual(
            results[1]['intercept_estimate'],
            1.2109, places=4)
        self.assertAlmostEqual(
            results[1]['intercept_standard_error'],
            0.2238, places=4)
        self.assertAlmostEqual(
            results[1]['snp_heritability_estimate'],
            0.0486, places=4)
        self.assertAlmostEqual(
            results[1]['snp_heritability_standard_error'],
            0.0416, places=4)
Exemple #57
0
def explode_trio_matrix(tm: hl.MatrixTable, col_keys: List[str] = ['s'], keep_trio_cols: bool = True, keep_trio_entries: bool = False) -> hl.MatrixTable:
    """Splits a trio MatrixTable back into a sample MatrixTable.

    Example
    -------
    >>> # Create a trio matrix from a sample matrix
    >>> pedigree = hl.Pedigree.read('data/case_control_study.fam')
    >>> trio_dataset = hl.trio_matrix(dataset, pedigree, complete_trios=True)

    >>> # Explode trio matrix back into a sample matrix
    >>> exploded_trio_dataset = explode_trio_matrix(trio_dataset)

    Notes
    -----
    The resulting MatrixTable column schema is the same as the proband/father/mother schema,
    and the resulting entry schema is the same as the proband_entry/father_entry/mother_entry schema.
    If the `keep_trio_cols` option is set, then an additional `source_trio` column is added with the trio column data.
    If the `keep_trio_entries` option is set, then an additional `source_trio_entry` entry field is added with the trio entry data.

    Note
    ----
    This assumes that the input MatrixTable is a trio MatrixTable (similar to the result of :meth:`.methods.trio_matrix`).
    Its entry schema has to contain `proband_entry`, `father_entry` and `mother_entry`, all with the same type.
    Its column schema has to contain `proband`, `father` and `mother`, all with the same type.

    Parameters
    ----------
    tm : :class:`.MatrixTable`
        Trio MatrixTable (entries have to be a Struct with `proband_entry`, `mother_entry` and `father_entry` present)
    col_keys : :obj:`list` of str
        Column key(s) for the resulting sample MatrixTable
    keep_trio_cols : bool
        Whether to add a `source_trio` column with the trio column data (default `True`)
    keep_trio_entries : bool
        Whether to add a `source_trio_entry` entry field with the trio entry data (default `False`)

    Returns
    -------
    :class:`.MatrixTable`
        Sample MatrixTable"""

    select_entries_expr = {'__trio_entries': hl.array([tm.proband_entry, tm.father_entry, tm.mother_entry])}
    if keep_trio_entries:
        select_entries_expr['source_trio_entry'] = hl.struct(**tm.entry)
    tm = tm.select_entries(**select_entries_expr)

    tm = tm.key_cols_by()
    select_cols_expr = {'__trio_members': hl.zip_with_index(hl.array([tm.proband, tm.father, tm.mother]))}
    if keep_trio_cols:
        select_cols_expr['source_trio'] = hl.struct(**tm.col)
    tm = tm.select_cols(**select_cols_expr)

    mt = tm.explode_cols(tm.__trio_members)

    mt = mt.transmute_entries(
        **mt.__trio_entries[mt.__trio_members[0]]
    )

    mt = mt.key_cols_by()
    mt = mt.transmute_cols(**mt.__trio_members[1])

    if col_keys:
        mt = mt.key_cols_by(*col_keys)

    return mt
Exemple #58
0
def fields_to_array(ds, fields):
    return hl.flatten(hl.array([field_to_array(ds, f) for f in fields]))
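Paired with `field_to_array` from Exemple #54, a quick sketch; the field names are illustrative:

s = hl.struct(a=1, b=0, c=3)
print(hl.eval(fields_to_array(s, ['a', 'b', 'c'])))  # ['a', 'c'] -- zero-valued fields drop out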