Example 1
    def test_from_pandas_works(self):
        d = {'a': [1, 2], 'b': ['foo', 'bar']}
        df = pd.DataFrame(data=d)
        t = hl.Table.from_pandas(df, key='a')

        d2 = [hl.struct(a=hl.int64(1), b='foo'), hl.struct(a=hl.int64(2), b='bar')]
        t2 = hl.Table.parallelize(d2, key='a')

        self.assertTrue(t._same(t2))
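The explicit hl.int64 casts in the parallelized structs are needed because hl.Table.from_pandas imports pandas integer columns as tint64, while a bare Python literal in hl.struct would default to tint32 and the _same comparison would fail on dtype. A minimal sketch of that behavior, assuming an initialized Hail context:

import hail as hl
import pandas as pd

df = pd.DataFrame({'a': [1, 2], 'b': ['foo', 'bar']})
t = hl.Table.from_pandas(df, key='a')
assert t.a.dtype == hl.tint64  # pandas int columns arrive as 64-bit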
Example 3
    def filter_by_coordinates(self, gp_range, nlp_range):
        assert len(gp_range) == 2 and len(nlp_range) == 2
        return self.ht.filter(
            hl.interval(hl.int64(gp_range[0]),
                        hl.int64(gp_range[1]),
                        includes_start=True,
                        includes_end=True).contains(self.ht.global_position)
            & hl.interval(hl.float64(nlp_range[0]),
                          hl.float64(nlp_range[1]),
                          includes_start=True,
                          includes_end=True).contains(self.ht.neg_log_pval))
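The same closed-interval membership pattern applies to any ordered field. A minimal self-contained sketch, where the field name pos is hypothetical and a Hail context is assumed:

import hail as hl

ht = hl.utils.range_table(100)
ht = ht.annotate(pos=hl.int64(ht.idx))
window = hl.interval(hl.int64(10), hl.int64(20),
                     includes_start=True, includes_end=True)
kept = ht.filter(window.contains(ht.pos))
assert kept.count() == 11  # positions 10 through 20, inclusive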
Example 5
    def test_maximal_independent_set(self):
        # prefer to remove nodes with higher index
        t = hl.utils.range_table(10)
        graph = t.select(i=hl.int64(t.idx), j=hl.int64(t.idx + 10), bad_type=hl.float32(t.idx))

        mis_table = hl.maximal_independent_set(graph.i, graph.j, True, lambda l, r: l - r)
        mis = [row['node'] for row in mis_table.collect()]
        self.assertEqual(sorted(mis), list(range(0, 10)))
        self.assertEqual(mis_table.row.dtype, hl.tstruct(node=hl.tint64))
        self.assertEqual(mis_table.key.dtype, hl.tstruct(node=hl.tint64))

        self.assertRaises(ValueError, lambda: hl.maximal_independent_set(graph.i, graph.bad_type, True))
        self.assertRaises(ValueError, lambda: hl.maximal_independent_set(graph.i, hl.utils.range_table(10).idx, True))
        self.assertRaises(ValueError, lambda: hl.maximal_independent_set(hl.literal(1), hl.literal(2), True))
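The lambda l, r: l - r tie-breaker works here because the node expressions are tint64, so the difference is the signed ordering that maximal_independent_set expects. A minimal sketch of the same call outside a test class, assuming a Hail context:

import hail as hl

t = hl.utils.range_table(5)
mis = hl.maximal_independent_set(hl.int64(t.idx), hl.int64(t.idx + 5),
                                 keep=True, tie_breaker=lambda l, r: l - r)
print(mis.collect())  # nodes 0-4 survive; the tie-breaker removes larger nodes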
Example 6
def get_expr_for_xpos(
        locus: hl.expr.LocusExpression) -> hl.expr.Int64Expression:
    """Genomic position represented as a single number = contig_number * 10**9 + position.
    This represents chrom:pos more compactly and allows for easier sorting.
    """
    contig_number = get_expr_for_contig_number(locus)
    return hl.int64(contig_number) * 1_000_000_000 + locus.position
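A worked instance of the encoding: contig 2 at position 12,345 maps to 2 * 10**9 + 12,345 = 2,000,012,345, which sorts correctly because every contig is shorter than 10**9 bases.

assert 2 * 1_000_000_000 + 12_345 == 2_000_012_345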
Example 7
    def test_entries_table(self):
        n_rows, n_cols = 5, 3
        rows = [{'i': i, 'j': j, 'entry': float(i + j)} for i in range(n_rows) for j in range(n_cols)]
        schema = hl.tstruct(i=hl.tint32, j=hl.tint32, entry=hl.tfloat64)
        table = hl.Table.parallelize([hl.struct(i=row['i'], j=row['j'], entry=row['entry']) for row in rows], schema)
        table = table.annotate(i=hl.int64(table.i),
                               j=hl.int64(table.j))

        ndarray = np.reshape(list(map(lambda row: row['entry'], rows)), (n_rows, n_cols))

        for block_size in [1, 2, 1024]:
            block_matrix = BlockMatrix.from_numpy(ndarray, block_size)
            entries_table = block_matrix.entries()
            self.assertEqual(entries_table.count(), n_cols * n_rows)
            self.assertEqual(len(entries_table.row), 3)
            self.assertTrue(table._same(entries_table))
Example 8
    def test_to_matrix_table(self):
        n_partitions = 2
        rows, cols = 2, 5
        bm = BlockMatrix._create(rows, cols, [float(i) for i in range(10)])
        actual = bm.to_matrix_table_row_major(n_partitions)

        expected = hl.utils.range_matrix_table(rows, cols)
        expected = expected.annotate_entries(element=hl.float64(expected.row_idx * cols + expected.col_idx))
        expected = expected.key_cols_by(col_idx=hl.int64(expected.col_idx))
        expected = expected.key_rows_by(row_idx=hl.int64(expected.row_idx))
        assert expected._same(actual)

        bm = BlockMatrix.random(50, 100, block_size=25, seed=0)
        mt = bm.to_matrix_table_row_major(n_partitions)
        mt_round_trip = BlockMatrix.from_entry_expr(mt.element).to_matrix_table_row_major()
        assert mt._same(mt_round_trip)
Example 9
def test_ndarray_eval():
    data_list = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
    nd_expr = hl._ndarray(data_list)
    evaled = hl.eval(nd_expr)
    np_equiv = np.array(data_list, dtype=np.int32)
    assert(np.array_equal(evaled, np_equiv))
    assert(evaled.strides == np_equiv.strides)

    assert hl.eval(hl._ndarray([[], []])).strides == (8, 8)
    assert np.array_equal(hl.eval(hl._ndarray([])), np.array([]))

    zero_array = np.zeros((10, 10), dtype=np.int64)
    evaled_zero_array = hl.eval(hl.literal(zero_array))

    assert np.array_equal(evaled_zero_array, zero_array)
    assert zero_array.dtype == evaled_zero_array.dtype

    # Testing from hail arrays
    assert np.array_equal(hl.eval(hl._ndarray(hl.range(6))), np.arange(6))
    assert np.array_equal(hl.eval(hl._ndarray(hl.int64(4))), np.array(4))

    # Testing missing data
    assert hl.eval(hl._ndarray(hl.null(hl.tarray(hl.tint32)))) is None

    with pytest.raises(ValueError) as exc:
        hl._ndarray([[4], [1, 2, 3], 5])
    assert "inner dimensions do not match" in str(exc.value)
Example 10
    def test_block_matrix_entries(self):
        n_rows, n_cols = 5, 3
        rows = [{'i': i, 'j': j, 'entry': float(i + j)} for i in range(n_rows) for j in range(n_cols)]
        schema = hl.tstruct(i=hl.tint32, j=hl.tint32, entry=hl.tfloat64)
        table = hl.Table.parallelize([hl.struct(i=row['i'], j=row['j'], entry=row['entry']) for row in rows], schema)
        table = table.annotate(i=hl.int64(table.i),
                               j=hl.int64(table.j)).key_by('i', 'j')

        ndarray = np.reshape(list(map(lambda row: row['entry'], rows)), (n_rows, n_cols))

        for block_size in [1, 2, 1024]:
            block_matrix = BlockMatrix.from_numpy(ndarray, block_size)
            entries_table = block_matrix.entries()
            self.assertEqual(entries_table.count(), n_cols * n_rows)
            self.assertEqual(len(entries_table.row), 3)
            self.assertTrue(table._same(entries_table))
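Both entries tests cast i and j to tint64 because BlockMatrix.entries() reports its row and column indices as 64-bit integers, so without the cast _same would fail on dtype alone. A minimal sketch of that schema, assuming a Hail context:

import numpy as np
import hail as hl
from hail.linalg import BlockMatrix

bm = BlockMatrix.from_numpy(np.arange(6, dtype=np.float64).reshape(2, 3))
assert bm.entries().row.dtype == hl.tstruct(i=hl.tint64, j=hl.tint64,
                                            entry=hl.tfloat64)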
Example 11
def get_expr_for_xpos_end(
        locus: hl.expr.LocusExpression) -> hl.expr.Int64Expression:
    """Genomic position represented as a single number = contig_number * 10**9 + position.
    This represents chrom:pos more compactly and allows for easier sorting.
    """
    contig_number = get_expr_for_contig_number(locus)
    length = hl.contig_length(locus.contig, reference_genome='GRCh37')
    return (hl.int64(contig_number) * 1_000_000_000 + locus.position
            + hl.int64(length))
Example 12
def get_expr_for_xpos(contig, position):
    return hl.bind(
        lambda contig_number: hl.int64(contig_number) * 1_000_000_000 + position,
        hl.case()
        .when(contig == "X", 23)
        .when(contig == "Y", 24)
        .when(contig[0] == "M", 25)
        .default(hl.int(contig)),
    )
Example 13
def xpos(chrom, position):
    contig_number = (
        hl.case()
        .when(chrom == "X", 23)
        .when(chrom == "Y", 24)
        .when(chrom[0] == "M", 25)
        .default(hl.int(chrom))
    )
    return hl.int64(contig_number) * 1_000_000_000 + position
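Because the contig number occupies the high digits, the encoding inverts cleanly. A sketch of the inverse mapping; the helper name decode_xpos is hypothetical:

def decode_xpos(xpos):
    contig_number, position = divmod(xpos, 1_000_000_000)
    return contig_number, position

assert decode_xpos(23_000_000_100) == (23, 100)  # chromosome X, position 100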
Example 14
def test_ndarray_eval():
    data_list = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
    mishapen_data_list1 = [[4], [1, 2, 3]]
    mishapen_data_list2 = [[[1], [2, 3]]]
    mishapen_data_list3 = [[4], [1, 2, 3], 5]

    nd_expr = hl.nd.array(data_list)
    evaled = hl.eval(nd_expr)
    np_equiv = np.array(data_list, dtype=np.int32)
    np_equiv_fortran_style = np.asfortranarray(np_equiv)
    np_equiv_extra_dimension = np_equiv.reshape((3, 1, 3))
    assert (np.array_equal(evaled, np_equiv))
    assert (evaled.strides == np_equiv.strides)

    assert hl.eval(hl.nd.array([[], []])).strides == (8, 8)
    assert np.array_equal(hl.eval(hl.nd.array([])), np.array([]))

    zero_array = np.zeros((10, 10), dtype=np.int64)
    evaled_zero_array = hl.eval(hl.literal(zero_array))

    assert np.array_equal(evaled_zero_array, zero_array)
    assert zero_array.dtype == evaled_zero_array.dtype

    # Testing correct interpretation of numpy strides
    assert np.array_equal(hl.eval(hl.literal(np_equiv_fortran_style)),
                          np_equiv_fortran_style)
    assert np.array_equal(hl.eval(hl.literal(np_equiv_extra_dimension)),
                          np_equiv_extra_dimension)

    # Testing from hail arrays
    assert np.array_equal(hl.eval(hl.nd.array(hl.range(6))), np.arange(6))
    assert np.array_equal(hl.eval(hl.nd.array(hl.int64(4))), np.array(4))

    # Testing from nested hail arrays
    assert np.array_equal(
        hl.eval(hl.nd.array(hl.array([hl.array(x) for x in data_list]))),
        np.arange(9).reshape((3, 3)) + 1)

    # Testing missing data
    assert hl.eval(hl.nd.array(hl.null(hl.tarray(hl.tint32)))) is None

    with pytest.raises(ValueError) as exc:
        hl.nd.array(mishapen_data_list1)
    assert "inner dimensions do not match" in str(exc.value)

    with pytest.raises(FatalError) as exc:
        hl.eval(hl.nd.array(hl.array(mishapen_data_list1)))
    assert "inner dimensions do not match" in str(exc.value)

    with pytest.raises(FatalError) as exc:
        hl.eval(hl.nd.array(hl.array(mishapen_data_list2)))
    assert "inner dimensions do not match" in str(exc.value)

    with pytest.raises(ValueError) as exc:
        hl.nd.array(mishapen_data_list3)
    assert "inner dimensions do not match" in str(exc.value)
Example 15
    def _promote_scalar(self, typ):
        if typ == tint32:
            return hail.int32(self)
        elif typ == tint64:
            return hail.int64(self)
        elif typ == tfloat32:
            return hail.float32(self)
        else:
            assert typ == tfloat64
            return hail.float64(self)
Example 16
def run_logistic_bool(mt, variable):
    ht = hl.logistic_regression_rows(test='firth',
                                     y=mt[variable],
                                     x=mt.GT.n_alt_alleles(),
                                     covariates=[
                                         1, mt.imputesex.impute_sex.is_female,
                                         mt.pca.PC1, mt.pca.PC2, mt.pca.PC3,
                                         mt.pca.PC4, mt.pca.PC5, mt.pca.PC6,
                                         mt.pca.PC7, mt.pca.PC8, mt.pca.PC9,
                                         mt.pca.PC10
                                     ])

    mt = mt.filter_cols(hl.is_defined(mt[variable]))
    mt = mt.annotate_rows(MAC=hl.min(
        hl.agg.sum(mt.GT.n_alt_alleles()),
        hl.agg.sum(
            hl.int64(mt.GT.is_het_ref()) + 2 * hl.int64(mt.GT.is_hom_ref()))))
    ht = ht.annotate(MAC=mt.rows()[ht.key].MAC)
    return ht
Example 17
def variant_qc_aggregator(mt) -> hl.MatrixTable:
    """:func:`.variant_qc` as an aggregator."""
    bound_exprs = {}
    gq_dp_exprs = {}

    def has_field_of_type(name, dtype):
        return name in mt.entry and mt[name].dtype == dtype

    if has_field_of_type('DP', hl.tint32):
        gq_dp_exprs['dp_stats'] = hl.agg.stats(mt.DP).select(
            'mean', 'stdev', 'min', 'max')
    if has_field_of_type('GQ', hl.tint32):
        gq_dp_exprs['gq_stats'] = hl.agg.stats(mt.GQ).select(
            'mean', 'stdev', 'min', 'max')
    if not has_field_of_type('GT', hl.tcall):
        raise ValueError(
            "'variant_qc': expect an entry field 'GT' of type 'call'")
    bound_exprs['n_called'] = hl.agg.count_where(hl.is_defined(mt['GT']))
    bound_exprs['n_not_called'] = hl.agg.count_where(hl.is_missing(mt['GT']))
    n_cols = hl.agg.count()
    bound_exprs['n_filtered'] = hl.int64(n_cols) - hl.agg.count()
    bound_exprs['call_stats'] = hl.agg.call_stats(mt.GT, mt.alleles)
    return hl.rbind(
        hl.struct(**bound_exprs),
        lambda e1: hl.rbind(
            hl.case()
            .when(hl.len(mt.alleles) == 2,
                  hl.hardy_weinberg_test(
                      e1.call_stats.homozygote_count[0],
                      e1.call_stats.AC[1] - 2 * e1.call_stats.homozygote_count[1],
                      e1.call_stats.homozygote_count[1]))
            .or_missing(),
            lambda hwe: hl.struct(
                **{
                    **gq_dp_exprs,
                    **e1.call_stats,
                    'call_rate': hl.float(e1.n_called)
                                 / (e1.n_called + e1.n_not_called + e1.n_filtered),
                    'n_called': e1.n_called,
                    'n_not_called': e1.n_not_called,
                    'n_filtered': e1.n_filtered,
                    'n_het': e1.n_called - hl.sum(e1.call_stats.homozygote_count),
                    'n_non_ref': e1.n_called - e1.call_stats.homozygote_count[0],
                    'het_freq_hwe': hwe.het_freq_hwe,
                    'p_value_hwe': hwe.p_value,
                })))
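A worked instance of the call_rate formula above: a site with 90 called, 8 missing, and 2 filtered entries has call_rate 90 / (90 + 8 + 2) = 0.9.

n_called, n_not_called, n_filtered = 90, 8, 2
assert n_called / (n_called + n_not_called + n_filtered) == 0.9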
Example 18
def combine_pheno_files(pheno_file_dict: dict):
    full_mt: hl.MatrixTable = None
    for data_type, mt in pheno_file_dict.items():
        if 'pheno' in list(mt.col_key):
            mt = mt.key_cols_by(pheno=hl.str(mt.pheno), coding=mt.coding)
            criteria = mt.value if data_type == 'categorical' else hl.is_defined(
                mt.value)
            mt = mt.annotate_cols(n_cases=hl.agg.count_where(criteria))
            mt = mt.select_entries(value=hl.float64(mt.value))
        elif 'icd_code' in list(mt.col_key):
            mt = mt.key_cols_by(pheno=mt.icd_code, coding=mt.icd_version)
            mt = mt.filter_cols(mt.truncated)
            mt = mt.annotate_cols(n_cases=hl.agg.count_where(mt.any_codes))
            mt = mt.select_entries(value=hl.float64(mt.any_codes))
        elif 'phecode' in list(mt.col_key):
            mt = mt.key_cols_by(pheno=mt.phecode, coding=mt.phecode_sex)
            mt = mt.annotate_cols(n_cases=hl.agg.count_where(mt.case_control))
            mt = mt.select_entries(value=hl.float64(mt.case_control))
        elif 'Generic_Name' in list(mt.col_key):
            mt = mt.select_entries(
                value=hl.float64(hl.or_else(hl.len(mt.values) > 0, False)))
            mt2 = mt.group_cols_by(
                pheno=mt.Drug_Category_and_Indication,
                coding=mt.Drug_Category_and_Indication).aggregate(
                    value=hl.float64(hl.agg.any(mt.value > 0)))
            mt = mt.key_cols_by(
                pheno=mt.Generic_Name,
                coding=mt.Drug_Category_and_Indication).select_cols()
            mt = mt.union_cols(mt2)
            mt = mt.annotate_cols(n_cases=hl.int64(hl.agg.sum(mt.value)))
        else:
            raise ValueError(
                'pheno or icd_code not in column key. New data type?')
        mt = mt.select_cols('n_cases',
                            data_type=data_type,
                            n_defined=hl.agg.count_where(
                                hl.is_defined(mt.value)))
        if full_mt is None:
            full_mt = mt
        else:
            full_mt = full_mt.union_cols(mt,
                                         row_join_type='outer' if data_type
                                         == 'prescriptions' else 'inner')
    full_mt = full_mt.unfilter_entries()
    return full_mt.select_entries(value=hl.cond(
        full_mt.data_type == 'prescriptions',
        hl.or_else(full_mt.value, hl.float64(0.0)), full_mt.value))
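The hl.cond in the final select_entries is the older spelling of Hail's conditional; later releases deprecate it in favor of hl.if_else with the same argument order. A minimal runnable sketch of the replacement, assuming a Hail context:

import hail as hl

x = hl.if_else(hl.bool(True), hl.float64(0.0), hl.float64(1.0))
assert hl.eval(x) == 0.0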
Example 19
    def test_aggregate2(self):
        schema = hl.tstruct(status=hl.tint32, GT=hl.tcall, qPheno=hl.tint32)

        rows = [{'status': 0, 'GT': hl.Call([0, 0]), 'qPheno': 3},
                {'status': 0, 'GT': hl.Call([0, 1]), 'qPheno': 13}]

        kt = hl.Table.parallelize(rows, schema)

        result = convert_struct_to_dict(
            kt.group_by(status=kt.status)
                .aggregate(
                x1=agg.collect(kt.qPheno * 2),
                x2=agg.explode(lambda elt: agg.collect(elt), [kt.qPheno, kt.qPheno + 1]),
                x3=agg.min(kt.qPheno),
                x4=agg.max(kt.qPheno),
                x5=agg.sum(kt.qPheno),
                x6=agg.product(hl.int64(kt.qPheno)),
                x7=agg.count(),
                x8=agg.count_where(kt.qPheno == 3),
                x9=agg.fraction(kt.qPheno == 1),
                x10=agg.stats(hl.float64(kt.qPheno)),
                x11=agg.hardy_weinberg_test(kt.GT),
                x13=agg.inbreeding(kt.GT, 0.1),
                x14=agg.call_stats(kt.GT, ["A", "T"]),
                x15=agg.collect(hl.Struct(a=5, b="foo", c=hl.Struct(banana='apple')))[0],
                x16=agg.collect(hl.Struct(a=5, b="foo", c=hl.Struct(banana='apple')).c.banana)[0],
                x17=agg.explode(lambda elt: agg.collect(elt), hl.null(hl.tarray(hl.tint32))),
                x18=agg.explode(lambda elt: agg.collect(elt), hl.null(hl.tset(hl.tint32))),
                x19=agg.take(kt.GT, 1, ordering=-kt.qPheno)
            ).take(1)[0])

        expected = {u'status': 0,
                    u'x13': {u'n_called': 2, u'expected_homs': 1.64, u'f_stat': -1.777777777777777,
                             u'observed_homs': 1},
                    u'x14': {u'AC': [3, 1], u'AF': [0.75, 0.25], u'AN': 4, u'homozygote_count': [1, 0]},
                    u'x15': {u'a': 5, u'c': {u'banana': u'apple'}, u'b': u'foo'},
                    u'x10': {u'min': 3.0, u'max': 13.0, u'sum': 16.0, u'stdev': 5.0, u'n': 2, u'mean': 8.0},
                    u'x8': 1, u'x9': 0.0, u'x16': u'apple',
                    u'x11': {u'het_freq_hwe': 0.5, u'p_value': 0.5},
                    u'x2': [3, 4, 13, 14], u'x3': 3, u'x1': [6, 26], u'x6': 39, u'x7': 2, u'x4': 13, u'x5': 16,
                    u'x17': [],
                    u'x18': [],
                    u'x19': [hl.Call([0, 1])]}

        self.maxDiff = None

        self.assertDictEqual(result, expected)
Example 21
    def test_constructors(self):
        rg = hl.ReferenceGenome("foo", ["1"], {"1": 100})

        schema = hl.tstruct(a=hl.tfloat64, b=hl.tfloat64, c=hl.tint32, d=hl.tint32)
        rows = [{'a': 2.0, 'b': 4.0, 'c': 1, 'd': 5}]
        kt = hl.Table.parallelize(rows, schema)
        kt = kt.annotate(d=hl.int64(kt.d))

        kt = kt.annotate(l1=hl.parse_locus("1:51"),
                         l2=hl.locus("1", 51, reference_genome=rg),
                         i1=hl.parse_locus_interval("1:51-56", reference_genome=rg),
                         i2=hl.interval(hl.locus("1", 51, reference_genome=rg),
                                        hl.locus("1", 56, reference_genome=rg)))

        expected_schema = {'a': hl.tfloat64, 'b': hl.tfloat64, 'c': hl.tint32, 'd': hl.tint64,
                           'l1': hl.tlocus(), 'l2': hl.tlocus(rg),
                           'i1': hl.tinterval(hl.tlocus(rg)), 'i2': hl.tinterval(hl.tlocus(rg))}

        self.assertTrue(all([expected_schema[f] == t for f, t in kt.row.dtype.items()]))
Example 22
def create_all_values():
    return hl.struct(
        f32=hl.float32(3.14),
        i64=hl.int64(-9),
        m=hl.null(hl.tfloat64),
        astruct=hl.struct(a=hl.null(hl.tint32), b=5.5),
        mstruct=hl.null(hl.tstruct(x=hl.tint32, y=hl.tstr)),
        aset=hl.set(['foo', 'bar', 'baz']),
        mset=hl.null(hl.tset(hl.tfloat64)),
        d=hl.dict({hl.array(['a', 'b']): 0.5, hl.array(['x', hl.null(hl.tstr), 'z']): 0.3}),
        md=hl.null(hl.tdict(hl.tint32, hl.tstr)),
        h38=hl.locus('chr22', 33878978, 'GRCh38'),
        ml=hl.null(hl.tlocus('GRCh37')),
        i=hl.interval(
            hl.locus('1', 999),
            hl.locus('1', 1001)),
        c=hl.call(0, 1),
        mc=hl.null(hl.tcall),
        t=hl.tuple([hl.call(1, 2, phased=True), 'foo', hl.null(hl.tstr)]),
        mt=hl.null(hl.ttuple(hl.tlocus('GRCh37'), hl.tbool))
    )
Example 25
def create_all_values_datasets():
    all_values = hl.struct(
        f32=hl.float32(3.14),
        i64=hl.int64(-9),
        m=hl.null(hl.tfloat64),
        astruct=hl.struct(a=hl.null(hl.tint32), b=5.5),
        mstruct=hl.null(hl.tstruct(x=hl.tint32, y=hl.tstr)),
        aset=hl.set(['foo', 'bar', 'baz']),
        mset=hl.null(hl.tset(hl.tfloat64)),
        d=hl.dict({hl.array(['a', 'b']): 0.5, hl.array(['x', hl.null(hl.tstr), 'z']): 0.3}),
        md=hl.null(hl.tdict(hl.tint32, hl.tstr)),
        h38=hl.locus('chr22', 33878978, 'GRCh38'),
        ml=hl.null(hl.tlocus('GRCh37')),
        i=hl.interval(
            hl.locus('1', 999),
            hl.locus('1', 1001)),
        c=hl.call(0, 1),
        mc=hl.null(hl.tcall),
        t=hl.tuple([hl.call(1, 2, phased=True), 'foo', hl.null(hl.tstr)]),
        mt=hl.null(hl.ttuple(hl.tlocus('GRCh37'), hl.tbool))
    )

    def prefix(s, p):
        return hl.struct(**{p + k: s[k] for k in s})

    all_values_table = (hl.utils.range_table(5, n_partitions=3)
                        .annotate_globals(**prefix(all_values, 'global_'))
                        .annotate(**all_values)
                        .cache())

    all_values_matrix_table = (hl.utils.range_matrix_table(3, 2, n_partitions=2)
                               .annotate_globals(**prefix(all_values, 'global_'))
                               .annotate_rows(**prefix(all_values, 'row_'))
                               .annotate_cols(**prefix(all_values, 'col_'))
                               .annotate_entries(**prefix(all_values, 'entry_'))
                               .cache())

    return all_values_table, all_values_matrix_table
Example 26
def count_where(condition) -> Int64Expression:
    """Count the number of records where a predicate is ``True``.

    Examples
    --------

    Count the number of individuals with `HT` greater than 68:

    >>> table1.aggregate(agg.count_where(table1.HT > 68))
    2

    Parameters
    ----------
    condition : :class:`.BooleanExpression`
        Criteria for inclusion.

    Returns
    -------
    :class:`.Expression` of type :py:data:`.tint64`
        Total number of records where `condition` is ``True``.
    """

    return _agg_func('Sum', [hl.int64(condition)], tint64)
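The one-line implementation makes the semantics concrete: counting is summing the 0/1 integer coercion of the predicate, so count_where and an explicit sum agree. A minimal sketch, assuming a Hail context:

import hail as hl

t = hl.utils.range_table(10)
a = t.aggregate(hl.agg.count_where(t.idx > 6))
b = t.aggregate(hl.agg.sum(hl.int64(t.idx > 6)))
assert a == b == 3  # idx 7, 8, 9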
Example 28
    def test_numeric_conversion(self):
        schema = hl.tstruct(a=hl.tfloat64, b=hl.tfloat64, c=hl.tint32, d=hl.tint32)
        rows = [{'a': 2.0, 'b': 4.0, 'c': 1, 'd': 5}]
        kt = hl.Table.parallelize(rows, schema)
        kt = kt.annotate(d=hl.int64(kt.d))

        kt = kt.annotate(x1=[1.0, kt.a, 1],
                         x2=[1, 1.0],
                         x3=[kt.a, kt.c],
                         x4=[kt.c, kt.d],
                         x5=[1, kt.c])

        expected_schema = {'a': hl.tfloat64,
                           'b': hl.tfloat64,
                           'c': hl.tint32,
                           'd': hl.tint64,
                           'x1': hl.tarray(hl.tfloat64),
                           'x2': hl.tarray(hl.tfloat64),
                           'x3': hl.tarray(hl.tfloat64),
                           'x4': hl.tarray(hl.tint64),
                           'x5': hl.tarray(hl.tint32)}

        for f, t in kt.row.dtype.items():
            self.assertEqual(expected_schema[f], t)
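The expected schema follows Hail's numeric promotion rules: mixing an integer with a float yields tfloat64 (x1-x3), mixing int32 with int64 yields tint64 (x4), and same-typed elements are untouched (x5). A minimal sketch of the rules in isolation:

import hail as hl

assert (hl.int32(1) + hl.int64(2)).dtype == hl.tint64
assert (hl.int64(1) + hl.float64(0.5)).dtype == hl.tfloat64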
Example 29
def mendel_errors(call, pedigree) -> Tuple[Table, Table, Table, Table]:
    r"""Find Mendel errors; count per variant, individual and nuclear family.

    .. include:: ../_templates/req_tstring.rst

    .. include:: ../_templates/req_tvariant.rst

    .. include:: ../_templates/req_biallelic.rst

    Examples
    --------

    Find all violations of Mendelian inheritance in each (dad, mom, kid) trio in
    a pedigree and return four tables (all errors, errors by family, errors by
    individual, errors by variant):

    >>> ped = hl.Pedigree.read('data/trios.fam')
    >>> all_errors, per_fam, per_sample, per_variant = hl.mendel_errors(dataset['GT'], ped)

    Export all mendel errors to a text file:

    >>> all_errors.export('output/all_mendel_errors.tsv')

    Annotate columns with the number of Mendel errors:

    >>> annotated_samples = dataset.annotate_cols(mendel=per_sample[dataset.s])

    Annotate rows with the number of Mendel errors:

    >>> annotated_variants = dataset.annotate_rows(mendel=per_variant[dataset.locus, dataset.alleles])

    Notes
    -----

    The example above returns four tables, which contain Mendelian violations
    grouped in various ways. These tables are modeled after the `PLINK mendel
    formats <https://www.cog-genomics.org/plink2/formats#mendel>`_, resembling
    the ``.mendel``, ``.fmendel``, ``.imendel``, and ``.lmendel`` formats,
    respectively.

    **First table:** all Mendel errors. This table contains one row per Mendel
    error, keyed by the variant and proband id.

        - `locus` (:class:`.tlocus`) -- Variant locus, key field.
        - `alleles` (:class:`.tarray` of :py:data:`.tstr`) -- Variant alleles, key field.
        - (column key of `dataset`) (:py:data:`.tstr`) -- Proband ID, key field.
        - `fam_id` (:py:data:`.tstr`) -- Family ID.
        - `mendel_code` (:py:data:`.tint32`) -- Mendel error code, see below.

    **Second table:** errors per nuclear family. This table contains one row
    per nuclear family, keyed by the parents.

        - `pat_id` (:py:data:`.tstr`) -- Paternal ID. (key field)
        - `mat_id` (:py:data:`.tstr`) -- Maternal ID. (key field)
        - `fam_id` (:py:data:`.tstr`) -- Family ID.
        - `children` (:py:data:`.tint32`) -- Number of children in this nuclear family.
        - `errors` (:py:data:`.tint64`) -- Number of Mendel errors in this nuclear family.
        - `snp_errors` (:py:data:`.tint64`) -- Number of Mendel errors at SNPs in this
          nuclear family.

    **Third table:** errors per individual. This table contains one row per
    individual. Each error is counted toward the proband, father, and mother
    according to the `Implicated` in the table below.

        - (column key of `dataset`) (:py:data:`.tstr`) -- Sample ID (key field).
        - `fam_id` (:py:data:`.tstr`) -- Family ID.
        - `errors` (:py:data:`.tint64`) -- Number of Mendel errors involving this
          individual.
        - `snp_errors` (:py:data:`.tint64`) -- Number of Mendel errors involving this
          individual at SNPs.

    **Fourth table:** errors per variant.

        - `locus` (:class:`.tlocus`) -- Variant locus, key field.
        - `alleles` (:class:`.tarray` of :py:data:`.tstr`) -- Variant alleles, key field.
        - `errors` (:py:data:`.tint64`) -- Number of Mendel errors in this variant.

    This method only considers complete trios (two parents and proband with
    defined sex). The code of each Mendel error is determined by the table
    below, extending the
    `Plink classification <https://www.cog-genomics.org/plink2/basic_stats#mendel>`__.

    In the table, the copy state of a locus with respect to a trio is defined
    as follows, where PAR is the `pseudoautosomal region
    <https://en.wikipedia.org/wiki/Pseudoautosomal_region>`__ (PAR) of X and Y
    defined by the reference genome and the autosome is defined by
    :meth:`~hail.genetics.Locus.in_autosome`.

    - Auto -- in autosome or in PAR or female child
    - HemiX -- in non-PAR of X and male child
    - HemiY -- in non-PAR of Y and male child

    `Any` refers to the set \{ HomRef, Het, HomVar, NoCall \} and `~`
    denotes complement in this set.

    +------+---------+---------+--------+------------+---------------+
    | Code | Dad     | Mom     | Kid    | Copy State | Implicated    |
    +======+=========+=========+========+============+===============+
    |    1 | HomVar  | HomVar  | Het    | Auto       | Dad, Mom, Kid |
    +------+---------+---------+--------+------------+---------------+
    |    2 | HomRef  | HomRef  | Het    | Auto       | Dad, Mom, Kid |
    +------+---------+---------+--------+------------+---------------+
    |    3 | HomRef  | ~HomRef | HomVar | Auto       | Dad, Kid      |
    +------+---------+---------+--------+------------+---------------+
    |    4 | ~HomRef | HomRef  | HomVar | Auto       | Mom, Kid      |
    +------+---------+---------+--------+------------+---------------+
    |    5 | HomRef  | HomRef  | HomVar | Auto       | Kid           |
    +------+---------+---------+--------+------------+---------------+
    |    6 | HomVar  | ~HomVar | HomRef | Auto       | Dad, Kid      |
    +------+---------+---------+--------+------------+---------------+
    |    7 | ~HomVar | HomVar  | HomRef | Auto       | Mom, Kid      |
    +------+---------+---------+--------+------------+---------------+
    |    8 | HomVar  | HomVar  | HomRef | Auto       | Kid           |
    +------+---------+---------+--------+------------+---------------+
    |    9 | Any     | HomVar  | HomRef | HemiX      | Mom, Kid      |
    +------+---------+---------+--------+------------+---------------+
    |   10 | Any     | HomRef  | HomVar | HemiX      | Mom, Kid      |
    +------+---------+---------+--------+------------+---------------+
    |   11 | HomVar  | Any     | HomRef | HemiY      | Dad, Kid      |
    +------+---------+---------+--------+------------+---------------+
    |   12 | HomRef  | Any     | HomVar | HemiY      | Dad, Kid      |
    +------+---------+---------+--------+------------+---------------+

    See Also
    --------
    :func:`.mendel_error_code`

    Parameters
    ----------
    dataset : :class:`.MatrixTable`
    pedigree : :class:`.Pedigree`

    Returns
    -------
    (:class:`.Table`, :class:`.Table`, :class:`.Table`, :class:`.Table`)
    """
    source = call._indices.source
    if not isinstance(source, MatrixTable):
        raise ValueError("'mendel_errors': expected 'call' to be an expression of 'MatrixTable', found {}".format(
            "expression of '{}'".format(source.__class__) if source is not None else 'scalar expression'))

    source = source.select_entries(__GT=call)
    dataset = require_biallelic(source, 'mendel_errors')
    tm = trio_matrix(dataset, pedigree, complete_trios=True)
    tm = tm.select_entries(mendel_code=hl.mendel_error_code(
        tm.locus,
        tm.is_female,
        tm.father_entry['__GT'],
        tm.mother_entry['__GT'],
        tm.proband_entry['__GT']
    ))
    ck_name = next(iter(source.col_key))
    tm = tm.filter_entries(hl.is_defined(tm.mendel_code))
    tm = tm.rename({'id' : ck_name})

    entries = tm.entries()

    table1 = entries.select('fam_id', 'mendel_code')

    fam_counts = (
        entries
            .group_by(pat_id=entries.father[ck_name], mat_id=entries.mother[ck_name])
            .partition_hint(min(entries.n_partitions(), 8))
            .aggregate(children=hl.len(hl.agg.collect_as_set(entries[ck_name])),
                       errors=hl.agg.count_where(hl.is_defined(entries.mendel_code)),
                       snp_errors=hl.agg.count_where(hl.is_snp(entries.alleles[0], entries.alleles[1]) &
                                                     hl.is_defined(entries.mendel_code)))
    )
    table2 = tm.key_cols_by().cols()
    table2 = table2.select(pat_id=table2.father[ck_name],
                           mat_id=table2.mother[ck_name],
                           fam_id=table2.fam_id,
                           **fam_counts[table2.father[ck_name], table2.mother[ck_name]])
    table2 = table2.key_by('pat_id', 'mat_id').distinct()
    table2 = table2.annotate(errors=hl.or_else(table2.errors, hl.int64(0)),
                             snp_errors=hl.or_else(table2.snp_errors, hl.int64(0)))

    # in implicated, idx 0 is dad, idx 1 is mom, idx 2 is child
    implicated = hl.literal([
        [0, 0, 0],  # dummy
        [1, 1, 1],
        [1, 1, 1],
        [1, 0, 1],
        [0, 1, 1],
        [0, 0, 1],
        [1, 0, 1],
        [0, 1, 1],
        [0, 0, 1],
        [0, 1, 1],
        [0, 1, 1],
        [1, 0, 1],
        [1, 0, 1],
    ], dtype=hl.tarray(hl.tarray(hl.tint64)))

    table3 = tm.annotate_cols(all_errors=hl.or_else(hl.agg.array_sum(implicated[tm.mendel_code]), [0, 0, 0]),
                              snp_errors=hl.or_else(
                                  hl.agg.filter(hl.is_snp(tm.alleles[0], tm.alleles[1]),
                                                hl.agg.array_sum(implicated[tm.mendel_code])),
                                  [0, 0, 0])).key_cols_by().cols()

    table3 = table3.select(xs=[
        hl.struct(**{ck_name: table3.father[ck_name],
                     'fam_id': table3.fam_id,
                     'errors': table3.all_errors[0],
                     'snp_errors': table3.snp_errors[0]}),
        hl.struct(**{ck_name: table3.mother[ck_name],
                     'fam_id': table3.fam_id,
                     'errors': table3.all_errors[1],
                     'snp_errors': table3.snp_errors[1]}),
        hl.struct(**{ck_name: table3.proband[ck_name],
                     'fam_id': table3.fam_id,
                     'errors': table3.all_errors[2],
                     'snp_errors': table3.snp_errors[2]}),
    ])
    table3 = table3.explode('xs')
    table3 = table3.select(**table3.xs)
    table3 = (table3.group_by(ck_name, 'fam_id')
              .aggregate(errors=hl.agg.sum(table3.errors),
                         snp_errors=hl.agg.sum(table3.snp_errors))
              .key_by(ck_name))

    table4 = tm.select_rows(errors=hl.agg.count_where(hl.is_defined(tm.mendel_code))).rows()

    return table1, table2, table3, table4
Example 30
def _to_expr(e, dtype):
    if e is None:
        return None
    elif isinstance(e, Expression):
        if e.dtype != dtype:
            assert is_numeric(dtype), 'expected {}, got {}'.format(
                dtype, e.dtype)
            if dtype == tfloat64:
                return hl.float64(e)
            elif dtype == tfloat32:
                return hl.float32(e)
            elif dtype == tint64:
                return hl.int64(e)
            else:
                assert dtype == tint32
                return hl.int32(e)
        return e
    elif not is_compound(dtype):
        # these are not container types and cannot contain expressions if we got here
        return e
    elif isinstance(dtype, tstruct):
        new_fields = []
        found_expr = False
        for f, t in dtype.items():
            value = _to_expr(e[f], t)
            found_expr = found_expr or isinstance(value, Expression)
            new_fields.append(value)

        if not found_expr:
            return e
        else:
            exprs = [
                new_fields[i] if isinstance(new_fields[i], Expression) else
                hl.literal(new_fields[i], dtype[i])
                for i in range(len(new_fields))
            ]
            fields = {name: expr for name, expr in zip(dtype.keys(), exprs)}
            from .typed_expressions import StructExpression
            return StructExpression._from_fields(fields)

    elif isinstance(dtype, tarray):
        elements = []
        found_expr = False
        for element in e:
            value = _to_expr(element, dtype.element_type)
            found_expr = found_expr or isinstance(value, Expression)
            elements.append(value)
        if not found_expr:
            return e
        else:
            assert len(elements) > 0
            exprs = [
                element if isinstance(element, Expression) else hl.literal(
                    element, dtype.element_type) for element in elements
            ]
            indices, aggregations = unify_all(*exprs)
            x = ir.MakeArray([e._ir for e in exprs], None)
            return expressions.construct_expr(x, dtype, indices, aggregations)
    elif isinstance(dtype, tset):
        elements = []
        found_expr = False
        for element in e:
            value = _to_expr(element, dtype.element_type)
            found_expr = found_expr or isinstance(value, Expression)
            elements.append(value)
        if not found_expr:
            return e
        else:
            assert len(elements) > 0
            exprs = [
                element if isinstance(element, Expression) else hl.literal(
                    element, dtype.element_type) for element in elements
            ]
            indices, aggregations = unify_all(*exprs)
            x = ir.ToSet(
                ir.ToStream(ir.MakeArray([e._ir for e in exprs], None)))
            return expressions.construct_expr(x, dtype, indices, aggregations)
    elif isinstance(dtype, ttuple):
        elements = []
        found_expr = False
        assert len(e) == len(dtype.types)
        for i in range(len(e)):
            value = _to_expr(e[i], dtype.types[i])
            found_expr = found_expr or isinstance(value, Expression)
            elements.append(value)
        if not found_expr:
            return e
        else:
            exprs = [
                elements[i] if isinstance(elements[i], Expression) else
                hl.literal(elements[i], dtype.types[i])
                for i in range(len(elements))
            ]
            indices, aggregations = unify_all(*exprs)
            x = ir.MakeTuple([expr._ir for expr in exprs])
            return expressions.construct_expr(x, dtype, indices, aggregations)
    elif isinstance(dtype, tdict):
        keys = []
        values = []
        found_expr = False
        for k, v in e.items():
            k_ = _to_expr(k, dtype.key_type)
            v_ = _to_expr(v, dtype.value_type)
            found_expr = found_expr or isinstance(k_, Expression)
            found_expr = found_expr or isinstance(v_, Expression)
            keys.append(k_)
            values.append(v_)
        if not found_expr:
            return e
        else:
            assert len(keys) > 0
            # Here I use `to_expr` to call `lit` on the keys and values separately.
            # I anticipate a common mode is statically-known keys and Expression
            # values.
            key_array = to_expr(keys, tarray(dtype.key_type))
            value_array = to_expr(values, tarray(dtype.value_type))
            return hl.dict(hl.zip(key_array, value_array))
    elif isinstance(dtype, hl.tndarray):
        return hl.nd.array(e)
    else:
        raise NotImplementedError(dtype)
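The tint64 branch near the top is what lets a plain Python integer satisfy a 64-bit annotation. A minimal sketch of the observable behavior through the public hl.literal, assuming a Hail context:

import hail as hl

e = hl.literal(5, dtype=hl.tint64)
assert e.dtype == hl.tint64
assert hl.eval(e) == 5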
Example 31
def maximal_independent_set(i, j, keep=True, tie_breaker=None) -> Table:
    """Return a table containing the vertices in a near
    `maximal independent set <https://en.wikipedia.org/wiki/Maximal_independent_set>`_
    of an undirected graph whose edges are given by a two-column table.

    Examples
    --------

    Prune individuals from a dataset until no close relationships remain with
    respect to a PC-Relate measure of kinship.

    >>> pc_rel = hl.pc_relate(dataset.GT, 0.001, k=2, statistics='kin')
    >>> pairs = pc_rel.filter(pc_rel['kin'] > 0.125)
    >>> pairs = pairs.key_by(i=pairs.i.s, j=pairs.j.s).select()
    >>> related_samples_to_remove = hl.maximal_independent_set(pairs.i, pairs.j, False)
    >>> result = dataset.filter_cols(hl.is_defined(related_samples_to_remove[dataset.s]), keep=False)

    Prune individuals from a dataset, preferring to keep cases over controls.

    >>> pc_rel = hl.pc_relate(dataset.GT, 0.001, k=2, statistics='kin')
    >>> pairs = pc_rel.filter(pc_rel['kin'] > 0.125)
    >>> pairs = pairs.key_by(i=pairs.i.s, j=pairs.j.s).select()
    >>> samples = dataset.cols()
    >>> pairs_with_case = pairs.key_by(
    ...     i=hl.struct(id=pairs.i, is_case=samples[pairs.i].is_case),
    ...     j=hl.struct(id=pairs.j, is_case=samples[pairs.j].is_case))
    >>> def tie_breaker(l, r):
    ...     return hl.cond(l.is_case & ~r.is_case, -1,
    ...                    hl.cond(~l.is_case & r.is_case, 1, 0))
    >>> related_samples_to_remove = hl.maximal_independent_set(
    ...    pairs_with_case.i, pairs_with_case.j, False, tie_breaker)
    >>> result = dataset.filter_cols(hl.is_defined(
    ...     related_samples_to_remove.select(
    ...        s = related_samples_to_remove.node.id).key_by('s')[dataset.s]), keep=False)

    Notes
    -----

    The vertex set of the graph is implicitly all the values realized by `i`
    and `j` on the rows of this table. Each row of the table corresponds to an
    undirected edge between the vertices given by evaluating `i` and `j` on
    that row. An undirected edge may appear multiple times in the table and
    will not affect the output. Vertices with self-edges are removed as they
    are not independent of themselves.

    The expressions for `i` and `j` must have the same type.

    The value of `keep` determines whether the vertices returned are those
    in the maximal independent set, or those in the complement of this set.
    This is useful if you need to filter a table without removing vertices that
    don't appear in the graph at all.

    This method implements a greedy algorithm which iteratively removes a
    vertex of highest degree until the graph contains no edges. The greedy
    algorithm always returns an independent set, but the set may not always
    be perfectly maximal.

    `tie_breaker` is a Python function taking two arguments---say `l` and
    `r`---each of which is an :class:`Expression` of the same type as `i` and
    `j`. `tie_breaker` returns a :class:`NumericExpression`, which defines an
    ordering on nodes. A pair of nodes can be ordered in one of three ways, and
    `tie_breaker` must encode the relationship as follows:

     - if ``l < r`` then ``tie_breaker`` evaluates to some negative integer
     - if ``l == r`` then ``tie_breaker`` evaluates to 0
     - if ``l > r`` then ``tie_breaker`` evaluates to some positive integer

    For example, the usual ordering on the integers is defined by: ``l - r``.

    The `tie_breaker` function must satisfy the following property:
    ``tie_breaker(l, r) == -tie_breaker(r, l)``.

    When multiple nodes have the same degree, this algorithm will order the
    nodes according to ``tie_breaker`` and remove the *largest* node.

    Parameters
    ----------
    i : :class:`.Expression`
        Expression to compute one endpoint of an edge.
    j : :class:`.Expression`
        Expression to compute another endpoint of an edge.
    keep : :obj:`bool`
        If ``True``, return vertices in set. If ``False``, return vertices removed.
    tie_breaker : function
        Function used to order nodes with equal degree.

    Returns
    -------
    :class:`.Table`
        Table with the set of independent vertices. The table schema is one row
        field `node` which has the same type as input expressions `i` and `j`.
    """

    if i.dtype != j.dtype:
        raise ValueError(
            "'maximal_independent_set' expects arguments `i` and `j` to have same type. "
            "Found {} and {}.".format(i.dtype, j.dtype))

    source = i._indices.source
    if not isinstance(source, Table):
        raise ValueError(
            "'maximal_independent_set' expects an expression of 'Table'. Found {}"
            .format("expression of '{}'".format(source.__class__)
                    if source is not None else 'scalar expression'))

    if i._indices.source != j._indices.source:
        raise ValueError(
            "'maximal_independent_set' expects arguments `i` and `j` to be expressions of the same Table. "
            "Found\n{}\n{}".format(i, j))

    node_t = i.dtype

    if tie_breaker:
        wrapped_node_t = ttuple(node_t)
        l = construct_expr(VariableReference('l'), wrapped_node_t)
        r = construct_expr(VariableReference('r'), wrapped_node_t)
        tie_breaker_expr = hl.int64(tie_breaker(l[0], r[0]))
        t, _ = source._process_joins(i, j, tie_breaker_expr)
        tie_breaker_hql = tie_breaker_expr._ast.to_hql()
    else:
        t, _ = source._process_joins(i, j)
        tie_breaker_hql = None

    nodes = (t.select(node=[i, j]).explode('node').key_by('node').select())

    edges = t.key_by(None).select('i', 'j')
    nodes_in_set = Env.hail().utils.Graph.maximalIndependentSet(
        edges._jt.collect(), node_t._jtype, joption(tie_breaker_hql))

    nt = Table(
        nodes._jt.annotateGlobal(nodes_in_set,
                                 hl.tset(node_t)._jtype, 'nodes_in_set'))
    nt = (nt.filter(nt.nodes_in_set.contains(nt.node),
                    keep).drop('nodes_in_set'))

    return nt
Esempio n. 32
0
def mendel_errors(call, pedigree) -> Tuple[Table, Table, Table, Table]:
    r"""Find Mendel errors; count per variant, individual and nuclear family.

    .. include:: ../_templates/req_tstring.rst

    .. include:: ../_templates/req_tvariant.rst

    .. include:: ../_templates/req_biallelic.rst

    Examples
    --------

    Find all violations of Mendelian inheritance in each (dad, mom, kid) trio in
    a pedigree and return four tables (all errors, errors by family, errors by
    individual, errors by variant):

    >>> ped = hl.Pedigree.read('data/trios.fam')
    >>> all_errors, per_fam, per_sample, per_variant = hl.mendel_errors(dataset['GT'], ped)

    Export all mendel errors to a text file:

    >>> all_errors.export('output/all_mendel_errors.tsv')

    Annotate columns with the number of Mendel errors:

    >>> annotated_samples = dataset.annotate_cols(mendel=per_sample[dataset.s])

    Annotate rows with the number of Mendel errors:

    >>> annotated_variants = dataset.annotate_rows(mendel=per_variant[dataset.locus, dataset.alleles])

    Notes
    -----

    The example above returns four tables, which contain Mendelian violations
    grouped in various ways. These tables are modeled after the `PLINK mendel
    formats <https://www.cog-genomics.org/plink2/formats#mendel>`_, resembling
    the ``.mendel``, ``.fmendel``, ``.imendel``, and ``.lmendel`` formats,
    respectively.

    **First table:** all Mendel errors. This table contains one row per Mendel
    error, keyed by the variant and proband id.

        - `locus` (:class:`.tlocus`) -- Variant locus, key field.
        - `alleles` (:class:`.tarray` of :py:data:`.tstr`) -- Variant alleles, key field.
        - (column key of `dataset`) (:py:data:`.tstr`) -- Proband ID, key field.
        - `fam_id` (:py:data:`.tstr`) -- Family ID.
        - `mendel_code` (:py:data:`.tint32`) -- Mendel error code, see below.

    **Second table:** errors per nuclear family. This table contains one row
    per nuclear family, keyed by the parents.

        - `pat_id` (:py:data:`.tstr`) -- Paternal ID. (key field)
        - `mat_id` (:py:data:`.tstr`) -- Maternal ID. (key field)
        - `fam_id` (:py:data:`.tstr`) -- Family ID.
        - `children` (:py:data:`.tint32`) -- Number of children in this nuclear family.
        - `errors` (:py:data:`.tint64`) -- Number of Mendel errors in this nuclear family.
        - `snp_errors` (:py:data:`.tint64`) -- Number of Mendel errors at SNPs in this
          nuclear family.

    **Third table:** errors per individual. This table contains one row per
    individual. Each error is counted toward the proband, father, and mother
    according to the `Implicated` in the table below.

        - (column key of `dataset`) (:py:data:`.tstr`) -- Sample ID (key field).
        - `fam_id` (:py:data:`.tstr`) -- Family ID.
        - `errors` (:py:data:`.tint64`) -- Number of Mendel errors involving this
          individual.
        - `snp_errors` (:py:data:`.tint64`) -- Number of Mendel errors involving this
          individual at SNPs.

    **Fourth table:** errors per variant.

        - `locus` (:class:`.tlocus`) -- Variant locus, key field.
        - `alleles` (:class:`.tarray` of :py:data:`.tstr`) -- Variant alleles, key field.
        - `errors` (:py:data:`.tint64`) -- Number of Mendel errors in this variant.

    This method only considers complete trios (two parents and proband with
    defined sex). The code of each Mendel error is determined by the table
    below, extending the
    `Plink classification <https://www.cog-genomics.org/plink2/basic_stats#mendel>`__.

    In the table, the copy state of a locus with respect to a trio is defined
    as follows, where PAR is the `pseudoautosomal region
    <https://en.wikipedia.org/wiki/Pseudoautosomal_region>`__ (PAR) of X and Y
    defined by the reference genome and the autosome is defined by
    :meth:`~hail.genetics.Locus.in_autosome`.

    - Auto -- in autosome or in PAR or female child
    - HemiX -- in non-PAR of X and male child
    - HemiY -- in non-PAR of Y and male child

    `Any` refers to the set \{ HomRef, Het, HomVar, NoCall \} and `~`
    denotes complement in this set.

    +------+---------+---------+--------+----------------------------+
    | Code | Dad     | Mom     | Kid    | Copy State | Implicated    |
    +======+=========+=========+========+============+===============+
    |    1 | HomVar  | HomVar  | Het    | Auto       | Dad, Mom, Kid |
    +------+---------+---------+--------+------------+---------------+
    |    2 | HomRef  | HomRef  | Het    | Auto       | Dad, Mom, Kid |
    +------+---------+---------+--------+------------+---------------+
    |    3 | HomRef  | ~HomRef | HomVar | Auto       | Dad, Kid      |
    +------+---------+---------+--------+------------+---------------+
    |    4 | ~HomRef | HomRef  | HomVar | Auto       | Mom, Kid      |
    +------+---------+---------+--------+------------+---------------+
    |    5 | HomRef  | HomRef  | HomVar | Auto       | Kid           |
    +------+---------+---------+--------+------------+---------------+
    |    6 | HomVar  | ~HomVar | HomRef | Auto       | Dad, Kid      |
    +------+---------+---------+--------+------------+---------------+
    |    7 | ~HomVar | HomVar  | HomRef | Auto       | Mom, Kid      |
    +------+---------+---------+--------+------------+---------------+
    |    8 | HomVar  | HomVar  | HomRef | Auto       | Kid           |
    +------+---------+---------+--------+------------+---------------+
    |    9 | Any     | HomVar  | HomRef | HemiX      | Mom, Kid      |
    +------+---------+---------+--------+------------+---------------+
    |   10 | Any     | HomRef  | HomVar | HemiX      | Mom, Kid      |
    +------+---------+---------+--------+------------+---------------+
    |   11 | HomVar  | Any     | HomRef | HemiY      | Dad, Kid      |
    +------+---------+---------+--------+------------+---------------+
    |   12 | HomRef  | Any     | HomVar | HemiY      | Dad, Kid      |
    +------+---------+---------+--------+------------+---------------+

    See Also
    --------
    :func:`.mendel_error_code`

    Parameters
    ----------
    dataset : :class:`.MatrixTable`
    pedigree : :class:`.Pedigree`

    Returns
    -------
    (:class:`.Table`, :class:`.Table`, :class:`.Table`, :class:`.Table`)
    """
    source = call._indices.source
    if not isinstance(source, MatrixTable):
        raise ValueError(
            "'mendel_errors': expected 'call' to be an expression of 'MatrixTable', found {}"
            .format("expression of '{}'".format(source.__class__)
                    if source is not None else 'scalar expression'))

    source = source.select_entries(__GT=call)
    dataset = require_biallelic(source, 'mendel_errors')
    tm = trio_matrix(dataset, pedigree, complete_trios=True)
    tm = tm.select_entries(mendel_code=hl.mendel_error_code(
        tm.locus, tm.is_female, tm.father_entry['__GT'],
        tm.mother_entry['__GT'], tm.proband_entry['__GT']))
    ck_name = next(iter(source.col_key))
    tm = tm.filter_entries(hl.is_defined(tm.mendel_code))
    tm = tm.rename({'id': ck_name})

    entries = tm.entries()

    table1 = entries.select('fam_id', 'mendel_code')

    fam_counts = (
        entries
        .group_by(pat_id=entries.father[ck_name],
                  mat_id=entries.mother[ck_name])
        .partition_hint(min(entries.n_partitions(), 8))
        .aggregate(
            children=hl.len(hl.agg.collect_as_set(entries[ck_name])),
            errors=hl.agg.count_where(hl.is_defined(entries.mendel_code)),
            snp_errors=hl.agg.count_where(
                hl.is_snp(entries.alleles[0], entries.alleles[1])
                & hl.is_defined(entries.mendel_code))))
    table2 = tm.key_cols_by().cols()
    table2 = table2.select(pat_id=table2.father[ck_name],
                           mat_id=table2.mother[ck_name],
                           fam_id=table2.fam_id,
                           **fam_counts[table2.father[ck_name],
                                        table2.mother[ck_name]])
    table2 = table2.key_by('pat_id', 'mat_id').distinct()
    table2 = table2.annotate(errors=hl.or_else(table2.errors, hl.int64(0)),
                             snp_errors=hl.or_else(table2.snp_errors,
                                                   hl.int64(0)))

    # in implicated, idx 0 is dad, idx 1 is mom, idx 2 is child
    implicated = hl.literal(
        [
            [0, 0, 0],  # dummy
            [1, 1, 1],
            [1, 1, 1],
            [1, 0, 1],
            [0, 1, 1],
            [0, 0, 1],
            [1, 0, 1],
            [0, 1, 1],
            [0, 0, 1],
            [0, 1, 1],
            [0, 1, 1],
            [1, 0, 1],
            [1, 0, 1],
        ],
        dtype=hl.tarray(hl.tarray(hl.tint64)))

    table3 = tm.annotate_cols(
        all_errors=hl.or_else(hl.agg.array_sum(implicated[tm.mendel_code]),
                              [0, 0, 0]),
        snp_errors=hl.or_else(
            hl.agg.filter(hl.is_snp(tm.alleles[0], tm.alleles[1]),
                          hl.agg.array_sum(implicated[tm.mendel_code])),
            [0, 0, 0])).key_cols_by().cols()

    table3 = table3.select(xs=[
        hl.struct(
            **{
                ck_name: table3.father[ck_name],
                'fam_id': table3.fam_id,
                'errors': table3.all_errors[0],
                'snp_errors': table3.snp_errors[0]
            }),
        hl.struct(
            **{
                ck_name: table3.mother[ck_name],
                'fam_id': table3.fam_id,
                'errors': table3.all_errors[1],
                'snp_errors': table3.snp_errors[1]
            }),
        hl.struct(
            **{
                ck_name: table3.proband[ck_name],
                'fam_id': table3.fam_id,
                'errors': table3.all_errors[2],
                'snp_errors': table3.snp_errors[2]
            }),
    ])
    table3 = table3.explode('xs')
    table3 = table3.select(**table3.xs)
    table3 = (table3.group_by(ck_name, 'fam_id').aggregate(
        errors=hl.agg.sum(table3.errors),
        snp_errors=hl.agg.sum(table3.snp_errors)).key_by(ck_name))

    table4 = tm.select_rows(
        errors=hl.agg.count_where(hl.is_defined(tm.mendel_code))).rows()

    return table1, table2, table3, table4
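
# A minimal usage sketch for the function above (assumptions: a biallelic
# MatrixTable `dataset` with a 'GT' entry field and a pedigree file
# 'data/trios.fam', mirroring the Hail documentation examples):
import hail as hl

ped = hl.Pedigree.read('data/trios.fam')
all_errors, per_fam, per_sample, per_variant = hl.mendel_errors(dataset['GT'], ped)

# Tally errors by Mendel error code (codes 1-12 in the table above).
all_errors.group_by(all_errors.mendel_code).aggregate(n=hl.agg.count()).show()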
Example n. 33
0
def maximal_independent_set(i, j, keep=True, tie_breaker=None, keyed=True) -> Table:
    """Return a table containing the vertices in a near
    `maximal independent set <https://en.wikipedia.org/wiki/Maximal_independent_set>`_
    of an undirected graph whose edges are given by a two-column table.

    Examples
    --------
    Run PC-relate and compute pairs of closely related individuals:

    >>> pc_rel = hl.pc_relate(dataset.GT, 0.001, k=2, statistics='kin')
    >>> pairs = pc_rel.filter(pc_rel['kin'] > 0.125)

    Starting from the above pairs, prune individuals from a dataset until no
    close relationships remain:

    >>> related_samples_to_remove = hl.maximal_independent_set(pairs.i, pairs.j, False)
    >>> result = dataset.filter_cols(
    ...     hl.is_defined(related_samples_to_remove[dataset.col_key]), keep=False)

    Starting from the above pairs, prune individuals from a dataset until no
    close relationships remain, preferring to keep cases over controls:

    >>> samples = dataset.cols()
    >>> pairs_with_case = pairs.key_by(
    ...     i=hl.struct(id=pairs.i, is_case=samples[pairs.i].is_case),
    ...     j=hl.struct(id=pairs.j, is_case=samples[pairs.j].is_case))
    >>> def tie_breaker(l, r):
    ...     return hl.cond(l.is_case & ~r.is_case, -1,
    ...                    hl.cond(~l.is_case & r.is_case, 1, 0))
    >>> related_samples_to_remove = hl.maximal_independent_set(
    ...    pairs_with_case.i, pairs_with_case.j, False, tie_breaker)
    >>> result = dataset.filter_cols(hl.is_defined(
    ...     related_samples_to_remove.key_by(
    ...        s = related_samples_to_remove.node.id.s)[dataset.col_key]), keep=False)

    Notes
    -----

    The vertex set of the graph is implicitly all the values realized by `i`
    and `j` on the rows of this table. Each row of the table corresponds to an
    undirected edge between the vertices given by evaluating `i` and `j` on
    that row. An undirected edge may appear multiple times in the table and
    will not affect the output. Vertices with self-edges are removed as they
    are not independent of themselves.

    The expressions for `i` and `j` must have the same type.

    The value of `keep` determines whether the vertices returned are those
    in the maximal independent set, or those in the complement of this set.
    This is useful if you need to filter a table without removing vertices that
    don't appear in the graph at all.

    This method implements a greedy algorithm which iteratively removes a
    vertex of highest degree until the graph contains no edges. The greedy
    algorithm always returns an independent set, but the set may not always
    be perfectly maximal.
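
    For intuition, here is a pure-Python sketch of that greedy heuristic
    (illustrative only; Hail's actual implementation runs in the compiled
    backend):

    .. code-block:: python

        from collections import defaultdict

        def greedy_mis(edges):
            adj = defaultdict(set)
            removed = set()
            for i, j in edges:
                if i == j:
                    removed.add(i)  # a vertex with a self-edge is never independent
                else:
                    adj[i].add(j)
                    adj[j].add(i)
            while True:
                live = [v for v in adj if v not in removed
                        and any(u not in removed for u in adj[v])]
                if not live:
                    break
                # remove a vertex of highest remaining degree
                removed.add(max(live, key=lambda v: sum(u not in removed
                                                        for u in adj[v])))
            return {v for v in adj if v not in removed}

        greedy_mis([(0, 1), (1, 2)])  # -> {0, 2}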

    `tie_breaker` is a Python function taking two arguments---say `l` and
    `r`---each of which is an :class:`Expression` of the same type as `i` and
    `j`. `tie_breaker` returns a :class:`NumericExpression`, which defines an
    ordering on nodes. A pair of nodes can be ordered in one of three ways, and
    `tie_breaker` must encode the relationship as follows:

     - if ``l < r`` then ``tie_breaker`` evaluates to some negative integer
     - if ``l == r`` then ``tie_breaker`` evaluates to 0
     - if ``l > r`` then ``tie_breaker`` evaluates to some positive integer

    For example, the usual ordering on the integers is defined by: ``l - r``.

    The `tie_breaker` function must satisfy the following property:
    ``tie_breaker(l, r) == -tie_breaker(r, l)``.

    When multiple nodes have the same degree, this algorithm will order the
    nodes according to ``tie_breaker`` and remove the *largest* node.

    Parameters
    ----------
    i : :class:`.Expression`
        Expression to compute one endpoint of an edge.
    j : :class:`.Expression`
        Expression to compute another endpoint of an edge.
    keep : :obj:`bool`
        If ``True``, return vertices in set. If ``False``, return vertices removed.
    tie_breaker : function
        Function used to order nodes with equal degree.
    keyed : :obj:`bool`
        If ``True``, key the resulting table by the `node` field; this requires
        a sort.

    Returns
    -------
    :class:`.Table`
        Table with the set of independent vertices. The table schema is one row
        field `node`, which has the same type as the input expressions `i` and `j`.
    """

    if i.dtype != j.dtype:
        raise ValueError("'maximal_independent_set' expects arguments `i` and `j` to have same type. "
                         "Found {} and {}.".format(i.dtype, j.dtype))

    source = i._indices.source
    if not isinstance(source, Table):
        raise ValueError("'maximal_independent_set' expects an expression of 'Table'. Found {}".format(
            "expression of '{}'".format(
                source.__class__) if source is not None else 'scalar expression'))

    if i._indices.source != j._indices.source:
        raise ValueError(
            "'maximal_independent_set' expects arguments `i` and `j` to be expressions of the same Table. "
            "Found\n{}\n{}".format(i, j))

    node_t = i.dtype

    if tie_breaker:
        wrapped_node_t = ttuple(node_t)
        l = construct_variable('l', wrapped_node_t)
        r = construct_variable('r', wrapped_node_t)
        tie_breaker_expr = hl.int64(tie_breaker(l[0], r[0]))
        t, _ = source._process_joins(i, j, tie_breaker_expr)
        tie_breaker_str = str(tie_breaker_expr._ir)
    else:
        t, _ = source._process_joins(i, j)
        tie_breaker_str = None

    edges = t.select(__i=i, __j=j).key_by().select('__i', '__j')
    edges_path = new_temp_file()
    edges.write(edges_path)
    edges = hl.read_table(edges_path)

    mis_nodes = construct_expr(JavaIR(Env.hail().utils.Graph.pyMaximalIndependentSet(
        Env.spark_backend('maximal_independent_set')._to_java_ir(edges.collect(_localize=False)._ir),
        node_t._parsable_string(),
        joption(tie_breaker_str))),
                               hl.tset(node_t))

    nodes = edges.select(node=[edges.__i, edges.__j])
    nodes = nodes.explode(nodes.node)
    nodes = nodes.annotate_globals(mis_nodes=mis_nodes)
    nodes = nodes.filter(nodes.mis_nodes.contains(nodes.node), keep)
    nodes = nodes.select_globals()
    if keyed:
        return nodes.key_by('node')
    return nodes
Example n. 34
0
            .format(chrom))
        gnomad_e = gnomad_e.rows()
        gnomad_e = gnomad_e.annotate(info=gnomad_e.info.annotate(
            bravo=hl.if_else(
                hl.is_missing(gnomad_e.info.bravo[0]), 0.0,
                hl.float64(gnomad_e.info.bravo[0]))))
        gnomad_e = gnomad_e.annotate(info=gnomad_e.info.annotate(
            CADD_phred=hl.float64(gnomad_e.info.CADD_phred[0])))
        gnomad_e = gnomad_e.annotate(info=gnomad_e.info.annotate(
            CADD16snv_PHRED=hl.float64(gnomad_e.info.CADD16snv_PHRED[0])))
        gnomad_e = gnomad_e.annotate(info=gnomad_e.info.annotate(
            CADD13_PHRED=hl.float64(gnomad_e.info.CADD13_PHRED[0])))
        gnomad_e = gnomad_e.annotate(info=gnomad_e.info.annotate(
            MetaSVM_pred=gnomad_e.info.MetaSVM_pred[0]))
        gnomad_e = gnomad_e.annotate(info=gnomad_e.info.annotate(
            non_topmed_AC=hl.int64(gnomad_e.info.non_topmed_AC[0])))
        gnomad_e = gnomad_e.annotate(info=gnomad_e.info.annotate(
            non_topmed_AN=gnomad_e.info.non_topmed_AN))
        gnomad_e = gnomad_e.annotate(info=gnomad_e.info.annotate(
            non_topmed_nhomalt=hl.int64(gnomad_e.info.non_topmed_nhomalt[0])))
        gnomad_e = gnomad_e.annotate(info=gnomad_e.info.annotate(
            Exonic_refGene=gnomad_e.info["ExonicFunc.refGene"][0]))
        gnomad_e = gnomad_e.annotate(info=gnomad_e.info.annotate(
            Func_refGene=gnomad_e.info["Func.refGene"][0]))

        # Filter out variants that did not pass filters ("RF" or "AC0").
        gnomad_e = gnomad_e.filter((gnomad_e.filters.contains('AC0')) |
                                   (gnomad_e.filters.contains('RF')),
                                   keep=False)

        # Import genomes gnomAD table and modify for easier manipulation
Example n. 35
0
def de_novo(mt: MatrixTable,
            pedigree: Pedigree,
            pop_frequency_prior,
            *,
            min_gq: int = 20,
            min_p: float = 0.05,
            max_parent_ab: float = 0.05,
            min_child_ab: float = 0.20,
            min_dp_ratio: float = 0.10,
            ignore_in_sample_allele_frequency: bool = False) -> Table:
    r"""Call putative *de novo* events from trio data.

    .. include:: ../_templates/req_tstring.rst

    .. include:: ../_templates/req_tvariant.rst

    .. include:: ../_templates/req_biallelic.rst

    Examples
    --------

    Call de novo events:

    >>> pedigree = hl.Pedigree.read('data/trios.fam')
    >>> priors = hl.import_table('data/gnomadFreq.tsv', impute=True)
    >>> priors = priors.transmute(**hl.parse_variant(priors.Variant)).key_by('locus', 'alleles')
    >>> de_novo_results = hl.de_novo(dataset, pedigree, pop_frequency_prior=priors[dataset.row_key].AF)

    Notes
    -----
    This method assumes the GATK high-throughput sequencing fields exist:
    `GT`, `AD`, `DP`, `GQ`, `PL`.

    This method replicates the functionality of `Kaitlin Samocha's de novo
    caller <https://github.com/ksamocha/de_novo_scripts>`__. The version
    corresponding to git commit ``bde3e40`` is implemented in Hail with her
    permission and assistance.

    This method produces a :class:`.Table` with the following fields:

     - `locus` (``locus``) -- Variant locus.
     - `alleles` (``array<str>``) -- Variant alleles.
     - `id` (``str``) -- Proband sample ID.
     - `prior` (``float64``) -- Site frequency prior. It is the maximum of:
       the computed dataset alternate allele frequency, the
       `pop_frequency_prior` parameter, and the global prior
       ``1 / 3e7``. If the `ignore_in_sample_allele_frequency` parameter is ``True``,
       then the computed allele frequency is not included in the calculation, and the
       prior is the maximum of the `pop_frequency_prior` and ``1 / 3e7``.
     - `proband` (``struct``) -- Proband column fields from `mt`.
     - `father` (``struct``) -- Father column fields from `mt`.
     - `mother` (``struct``) -- Mother column fields from `mt`.
     - `proband_entry` (``struct``) -- Proband entry fields from `mt`.
     - `father_entry` (``struct``) -- Father entry fields from `mt`.
     - `mother_entry` (``struct``) -- Mother entry fields from `mt`.
     - `is_female` (``bool``) -- ``True`` if proband is female.
     - `p_de_novo` (``float64``) -- Unfiltered posterior probability
       that the event is *de novo* rather than a missed heterozygous
       event in a parent.
     - `confidence` (``str``) -- Validation confidence. One of: ``'HIGH'``,
       ``'MEDIUM'``, ``'LOW'``.

    The key of the table is ``['locus', 'alleles', 'id']``.

    The model looks for de novo events in which both parents are homozygous
    reference and the proband is heterozygous. The model makes the simplifying
    assumption that when this configuration ``x = (AA, AA, AB)`` of calls
    occurs, exactly one of the following is true:

     - ``d``: a de novo mutation occurred in the proband and all calls are
       accurate.
     - ``m``: at least one parental allele is actually heterozygous and
       the proband call is accurate.

    We can then estimate the posterior probability of a de novo mutation as:

    .. math::

        \mathrm{P_{\text{de novo}}} = \frac{\mathrm{P}(d \mid x)}{\mathrm{P}(d \mid x) + \mathrm{P}(m \mid x)}

    Applying Bayes rule to the numerator and denominator yields

    .. math::

        \frac{\mathrm{P}(x \mid d)\,\mathrm{P}(d)}{\mathrm{P}(x \mid d)\,\mathrm{P}(d) +
        \mathrm{P}(x \mid m)\,\mathrm{P}(m)}

    The prior on de novo mutation is estimated from the rate in the literature:

    .. math::

        \mathrm{P}(d) = \frac{1 \, \text{mutation}}{30{,}000{,}000 \, \text{bases}}

    The prior used for at least one alternate allele between the parents
    depends on the alternate allele frequency:

    .. math::

        \mathrm{P}(m) = 1 - (1 - AF)^4

    The likelihoods :math:`\mathrm{P}(x \mid d)` and :math:`\mathrm{P}(x \mid m)`
    are computed from the PL (genotype likelihood) fields using these
    factorizations:

    .. math::
        \mathrm{P}(x = (AA, AA, AB) \mid d) = \left(
        \begin{aligned}
                &\mathrm{P}(x_{\mathrm{father}} = AA \mid \mathrm{father} = AA) \\
                {} \cdot {} &\mathrm{P}(x_{\mathrm{mother}} = AA \mid \mathrm{mother} = AA) \\
                {} \cdot {} &\mathrm{P}(x_{\mathrm{proband}} = AB \mid \mathrm{proband} = AB)
        \end{aligned}
        \right)

    .. math::
        \begin{aligned}
        \mathrm{P}(x = (AA, AA, AB) \mid m) = &\left(
            \begin{aligned}
                &\mathrm{P}(x_{\mathrm{father}} = AA \mid \mathrm{father} = AB)
                    \cdot \mathrm{P}(x_{\mathrm{mother}} = AA \mid \mathrm{mother} = AA) \\
                {} + {} &\mathrm{P}(x_{\mathrm{father}} = AA \mid \mathrm{father} = AA)
                    \cdot \mathrm{P}(x_{\mathrm{mother}} = AA \mid \mathrm{mother} = AB)
            \end{aligned}
        \right) \\
        &{} \cdot \mathrm{P}(x_{\mathrm{proband}} = AB \mid \mathrm{proband} = AB)
        \end{aligned}

    (Technically, the second factorization assumes there is exactly (rather
    than at least) one alternate allele among the parents, which may be
    justified on the grounds that it is typically the most likely case by far.)
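
    As a concrete illustration (a pure-Python sketch of the model above, not
    the Hail implementation), the autosomal posterior can be computed directly
    from phred-scaled PLs ordered as [hom-ref, het, hom-var]:

    .. code-block:: python

        def p_de_novo(kid_pl, dad_pl, mom_pl, af, de_novo_prior=1 / 30000000):
            def normed(pl):  # phred-scaled PLs -> normalized linear likelihoods
                lin = [10 ** (-x / 10) for x in pl]
                return [x / sum(lin) for x in lin]

            kid, dad, mom = normed(kid_pl), normed(dad_pl), normed(mom_pl)
            p_dn = dad[0] * mom[0] * kid[1] * de_novo_prior
            p_het_in_parent = 1 - (1 - af) ** 4
            p_missed = (dad[1] * mom[0] + dad[0] * mom[1]) * kid[1] * p_het_in_parent
            return p_dn / (p_dn + p_missed)

        # Confident calls at a rare site give a posterior near 1:
        p_de_novo([40, 0, 40], [0, 60, 80], [0, 60, 80], af=1e-5)  # ~0.998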

    While this posterior probability is a good metric for grouping putative de
    novo mutations by validation likelihood, there exist error modes in
    high-throughput sequencing data that are not appropriately accounted for by
    the phred-scaled genotype likelihoods. To this end, a number of hard filters
    are applied in order to assign validation likelihood.

    These filters are different for SNPs and insertions/deletions. In the below
    rules, the following variables are used:

     - ``DR`` refers to the ratio of the read depth in the proband to the
       combined read depth in the parents.
     - ``DP`` refers to the read depth (DP field) of the proband.
     - ``AB`` refers to the read allele balance of the proband (number of
       alternate reads divided by total reads).
     - ``AC`` refers to the count of alternate alleles across all individuals
       in the dataset at the site.
     - ``p`` refers to :math:`\mathrm{P_{\text{de novo}}}`.
     - ``min_p`` refers to the `min_p` function parameter.

    HIGH-quality SNV:

    .. code-block:: text

        (p > 0.99) AND (AB > 0.3) AND (AC == 1)
            OR
        (p > 0.99) AND (AB > 0.3) AND (DR > 0.2)
            OR
        (p > 0.5) AND (AB > 0.3) AND (AC < 10) AND (DP > 10)

    MEDIUM-quality SNV:

    .. code-block:: text

        (p > 0.5) AND (AB > 0.3)
            OR
        (AC == 1)

    LOW-quality SNV:

    .. code-block:: text

       (AB > 0.2)

    HIGH-quality indel:

    .. code-block:: text

        (p > 0.99) AND (AB > 0.3) AND (AC == 1)

    MEDIUM-quality indel:

    .. code-block:: text

        (p > 0.5) AND (AB > 0.3) AND (AC < 10)

    LOW-quality indel:

    .. code-block:: text

       (AB > 0.2)

    Additionally, de novo candidates are not considered if the proband GQ is
    smaller than the `min_gq` parameter, if the proband allele balance is
    lower than the `min_child_ab` parameter, if the depth ratio between the
    proband and parents is smaller than the `min_dp_ratio` parameter, if
    the allele balance in a parent is above the `max_parent_ab` parameter, or
    if the posterior probability `p` is smaller than the `min_p` parameter.

    Parameters
    ----------
    mt : :class:`.MatrixTable`
        High-throughput sequencing dataset.
    pedigree : :class:`.Pedigree`
        Sample pedigree.
    pop_frequency_prior : :class:`.Float64Expression`
        Expression for population alternate allele frequency prior.
    min_gq
        Minimum proband GQ to be considered for *de novo* calling.
    min_p
        Minimum posterior probability to be considered for *de novo* calling.
    max_parent_ab
        Maximum parent allele balance.
    min_child_ab
        Minimum proband allele balance.
    min_dp_ratio
        Minimum ratio between proband read depth and parental read depth.
    ignore_in_sample_allele_frequency
        Ignore in-sample allele frequency in computing site prior. Experimental.

    Returns
    -------
    :class:`.Table`
    """
    DE_NOVO_PRIOR = 1 / 30000000
    MIN_POP_PRIOR = 100 / 30000000

    required_entry_fields = {'GT', 'AD', 'DP', 'GQ', 'PL'}
    missing_fields = required_entry_fields - set(mt.entry)
    if missing_fields:
        raise ValueError(
            f"'de_novo': expected 'MatrixTable' to have at least {required_entry_fields}, "
            f"missing {missing_fields}")

    pop_frequency_prior = hl.case() \
        .when((pop_frequency_prior >= 0) & (pop_frequency_prior <= 1), pop_frequency_prior) \
        .or_error(hl.str("de_novo: expect 0 <= pop_frequency_prior <= 1, found " + hl.str(pop_frequency_prior)))

    if ignore_in_sample_allele_frequency:
        # this mode is used when families larger than a single trio are observed, in which
        # an allele might be de novo in a parent and transmitted to a child in the dataset.
        # The original model does not handle this case correctly, and so this experimental
        # mode can be used to treat each trio as if it were the only one in the dataset.
        mt = mt.annotate_rows(__prior=pop_frequency_prior,
                              __alt_alleles=hl.int64(1),
                              __site_freq=hl.max(pop_frequency_prior,
                                                 MIN_POP_PRIOR))
    else:
        n_alt_alleles = hl.agg.sum(mt.GT.n_alt_alleles())
        total_alleles = 2 * hl.agg.sum(hl.is_defined(mt.GT))
        # subtract 1 from __alt_alleles to correct for the observed genotype
        mt = mt.annotate_rows(__prior=pop_frequency_prior,
                              __alt_alleles=n_alt_alleles,
                              __site_freq=hl.max(
                                  (n_alt_alleles - 1) / total_alleles,
                                  pop_frequency_prior, MIN_POP_PRIOR))

    mt = require_biallelic(mt, 'de_novo')

    tm = trio_matrix(mt, pedigree, complete_trios=True)

    autosomal = tm.locus.in_autosome_or_par() | (tm.locus.in_x_nonpar()
                                                 & tm.is_female)
    hemi_x = tm.locus.in_x_nonpar() & ~tm.is_female
    hemi_y = tm.locus.in_y_nonpar() & ~tm.is_female
    hemi_mt = tm.locus.in_mito() & tm.is_female

    is_snp = hl.is_snp(tm.alleles[0], tm.alleles[1])
    n_alt_alleles = tm.__alt_alleles
    prior = tm.__site_freq
    het_hom_hom = (tm.proband_entry.GT.is_het()
                   & tm.father_entry.GT.is_hom_ref()
                   & tm.mother_entry.GT.is_hom_ref())
    kid_ad_fail = (tm.proband_entry.AD[1] / hl.sum(tm.proband_entry.AD)
                   < min_child_ab)

    failure = hl.missing(hl.tstruct(p_de_novo=hl.tfloat64, confidence=hl.tstr))

    kid = tm.proband_entry
    dad = tm.father_entry
    mom = tm.mother_entry

    kid_linear_pl = 10**(-kid.PL / 10)
    kid_pp = hl.bind(lambda x: x / hl.sum(x), kid_linear_pl)

    dad_linear_pl = 10**(-dad.PL / 10)
    dad_pp = hl.bind(lambda x: x / hl.sum(x), dad_linear_pl)

    mom_linear_pl = 10**(-mom.PL / 10)
    mom_pp = hl.bind(lambda x: x / hl.sum(x), mom_linear_pl)

    kid_ad_ratio = kid.AD[1] / hl.sum(kid.AD)
    dp_ratio = kid.DP / (dad.DP + mom.DP)

    def call_auto(kid_pp, dad_pp, mom_pp, kid_ad_ratio):
        p_data_given_dn = dad_pp[0] * mom_pp[0] * kid_pp[1] * DE_NOVO_PRIOR
        p_het_in_parent = 1 - (1 - prior)**4
        p_data_given_missed_het = (dad_pp[1] * mom_pp[0] + dad_pp[0] *
                                   mom_pp[1]) * kid_pp[1] * p_het_in_parent
        p_de_novo = p_data_given_dn / (p_data_given_dn +
                                       p_data_given_missed_het)

        def solve(p_de_novo):
            return (
                hl.case()
                .when(kid.GQ < min_gq, failure)
                .when((kid.DP / (dad.DP + mom.DP) < min_dp_ratio)
                      | ~(kid_ad_ratio >= min_child_ab), failure)
                .when((hl.sum(mom.AD) == 0) | (hl.sum(dad.AD) == 0), failure)
                .when((mom.AD[1] / hl.sum(mom.AD) > max_parent_ab)
                      | (dad.AD[1] / hl.sum(dad.AD) > max_parent_ab), failure)
                .when(p_de_novo < min_p, failure)
                .when(~is_snp,
                      hl.case()
                      .when((p_de_novo > 0.99) & (kid_ad_ratio > 0.3)
                            & (n_alt_alleles == 1),
                            hl.struct(p_de_novo=p_de_novo, confidence='HIGH'))
                      .when((p_de_novo > 0.5) & (kid_ad_ratio > 0.3)
                            & (n_alt_alleles <= 5),
                            hl.struct(p_de_novo=p_de_novo, confidence='MEDIUM'))
                      .when(kid_ad_ratio > 0.2,
                            hl.struct(p_de_novo=p_de_novo, confidence='LOW'))
                      .or_missing())
                .default(
                    hl.case()
                    .when(((p_de_novo > 0.99) & (kid_ad_ratio > 0.3)
                           & (dp_ratio > 0.2))
                          | ((p_de_novo > 0.99) & (kid_ad_ratio > 0.3)
                             & (n_alt_alleles == 1))
                          | ((p_de_novo > 0.5) & (kid_ad_ratio > 0.3)
                             & (n_alt_alleles < 10) & (kid.DP > 10)),
                          hl.struct(p_de_novo=p_de_novo, confidence='HIGH'))
                    .when((p_de_novo > 0.5)
                          & ((kid_ad_ratio > 0.3) | (n_alt_alleles == 1)),
                          hl.struct(p_de_novo=p_de_novo, confidence='MEDIUM'))
                    .when(kid_ad_ratio > 0.2,
                          hl.struct(p_de_novo=p_de_novo, confidence='LOW'))
                    .or_missing()))

        return hl.bind(solve, p_de_novo)

    def call_hemi(kid_pp, parent, parent_pp, kid_ad_ratio):
        p_data_given_dn = parent_pp[0] * kid_pp[1] * DE_NOVO_PRIOR
        p_het_in_parent = 1 - (1 - prior)**4
        p_data_given_missed_het = (parent_pp[1] +
                                   parent_pp[2]) * kid_pp[2] * p_het_in_parent
        p_de_novo = p_data_given_dn / (p_data_given_dn +
                                       p_data_given_missed_het)

        def solve(p_de_novo):
            return (
                hl.case()
                .when(kid.GQ < min_gq, failure)
                .when((kid.DP / parent.DP < min_dp_ratio)
                      | (kid_ad_ratio < min_child_ab), failure)
                .when(hl.sum(parent.AD) == 0, failure)
                .when(parent.AD[1] / hl.sum(parent.AD) > max_parent_ab, failure)
                .when(p_de_novo < min_p, failure)
                .when(~is_snp,
                      hl.case()
                      .when((p_de_novo > 0.99) & (kid_ad_ratio > 0.3)
                            & (n_alt_alleles == 1),
                            hl.struct(p_de_novo=p_de_novo, confidence='HIGH'))
                      .when((p_de_novo > 0.5) & (kid_ad_ratio > 0.3)
                            & (n_alt_alleles <= 5),
                            hl.struct(p_de_novo=p_de_novo, confidence='MEDIUM'))
                      .when(kid_ad_ratio > 0.3,
                            hl.struct(p_de_novo=p_de_novo, confidence='LOW'))
                      .or_missing())
                .default(
                    hl.case()
                    .when(((p_de_novo > 0.99) & (kid_ad_ratio > 0.3)
                           & (dp_ratio > 0.2))
                          | ((p_de_novo > 0.99) & (kid_ad_ratio > 0.3)
                             & (n_alt_alleles == 1))
                          | ((p_de_novo > 0.5) & (kid_ad_ratio > 0.3)
                             & (n_alt_alleles < 10) & (kid.DP > 10)),
                          hl.struct(p_de_novo=p_de_novo, confidence='HIGH'))
                    .when((p_de_novo > 0.5)
                          & ((kid_ad_ratio > 0.3) | (n_alt_alleles == 1)),
                          hl.struct(p_de_novo=p_de_novo, confidence='MEDIUM'))
                    .when(kid_ad_ratio > 0.2,
                          hl.struct(p_de_novo=p_de_novo, confidence='LOW'))
                    .or_missing()))

        return hl.bind(solve, p_de_novo)

    de_novo_call = (
        hl.case()
        .when(~het_hom_hom | kid_ad_fail, failure)
        .when(autosomal, hl.bind(call_auto, kid_pp, dad_pp, mom_pp, kid_ad_ratio))
        .when(hemi_x | hemi_mt, hl.bind(call_hemi, kid_pp, mom, mom_pp, kid_ad_ratio))
        .when(hemi_y, hl.bind(call_hemi, kid_pp, dad, dad_pp, kid_ad_ratio))
        .or_missing())

    tm = tm.annotate_entries(__call=de_novo_call)
    tm = tm.filter_entries(hl.is_defined(tm.__call))
    entries = tm.entries()
    return (entries.select('__site_freq', 'proband', 'father', 'mother',
                           'proband_entry', 'father_entry', 'mother_entry',
                           'is_female',
                           **entries.__call).rename({'__site_freq': 'prior'}))
Example n. 36
0
def sample_qc(mt, name='sample_qc') -> MatrixTable:
    """Compute per-sample metrics useful for quality control.

    .. include:: ../_templates/req_tvariant.rst

    Examples
    --------

    Compute sample QC metrics and remove low-quality samples:

    >>> dataset = hl.sample_qc(dataset, name='sample_qc')
    >>> filtered_dataset = dataset.filter_cols((dataset.sample_qc.dp_stats.mean > 20) & (dataset.sample_qc.r_ti_tv > 1.5))

    Notes
    -----

    This method computes summary statistics per sample from a genetic matrix and stores
    the results as a new column-indexed struct field in the matrix, named based on the
    `name` parameter.

    If `mt` contains an entry field `DP` of type :py:data:`.tint32`, then the
    field `dp_stats` is computed. If `mt` contains an entry field `GQ` of type
    :py:data:`.tint32`, then the field `gq_stats` is computed. Both `dp_stats`
    and `gq_stats` are structs with four fields:

    - `mean` (``float64``) -- Mean value.
    - `stdev` (``float64``) -- Standard deviation (zero degrees of freedom).
    - `min` (``int32``) -- Minimum value.
    - `max` (``int32``) -- Maximum value.

    If the dataset does not contain an entry field `GT` of type
    :py:data:`.tcall`, then an error is raised. The following fields are always
    computed from `GT`:

    - `call_rate` (``float64``) -- Fraction of calls non-missing.
    - `n_called` (``int64``) -- Number of non-missing calls.
    - `n_not_called` (``int64``) -- Number of missing calls.
    - `n_hom_ref` (``int64``) -- Number of homozygous reference calls.
    - `n_het` (``int64``) -- Number of heterozygous calls.
    - `n_hom_var` (``int64``) -- Number of homozygous alternate calls.
    - `n_non_ref` (``int64``) -- Sum of ``n_het`` and ``n_hom_var``.
    - `n_snp` (``int64``) -- Number of SNP alternate alleles.
    - `n_insertion` (``int64``) -- Number of insertion alternate alleles.
    - `n_deletion` (``int64``) -- Number of deletion alternate alleles.
    - `n_singleton` (``int64``) -- Number of private alleles.
    - `n_transition` (``int64``) -- Number of transition (A-G, C-T) alternate alleles.
    - `n_transversion` (``int64``) -- Number of transversion alternate alleles.
    - `n_star` (``int64``) -- Number of star (upstream deletion) alleles.
    - `r_ti_tv` (``float64``) -- Transition/Transversion ratio.
    - `r_het_hom_var` (``float64``) -- Het/HomVar call ratio.
    - `r_insertion_deletion` (``float64``) -- Insertion/Deletion allele ratio.

    Missing values ``NA`` may result from division by zero.
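
    For example (a short sketch, assuming `dataset` as in the example above),
    the per-sample metrics can be pulled out as a column Table:

    >>> qc_ht = hl.sample_qc(dataset).cols()
    >>> qc_ht.select(call_rate=qc_ht.sample_qc.call_rate,
    ...              r_ti_tv=qc_ht.sample_qc.r_ti_tv).show()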

    Parameters
    ----------
    mt : :class:`.MatrixTable`
        Dataset.
    name : :obj:`str`
        Name for resulting field.

    Returns
    -------
    :class:`.MatrixTable`
        Dataset with a new column-indexed field `name`.
    """

    require_row_key_variant(mt, 'sample_qc')

    from hail.expr.functions import _num_allele_type, _allele_types

    allele_types = _allele_types[:]
    allele_types.extend(['Transition', 'Transversion'])
    allele_enum = {i: v for i, v in enumerate(allele_types)}
    allele_ints = {v: k for k, v in allele_enum.items()}

    def allele_type(ref, alt):
        return hl.bind(lambda at: hl.cond(at == allele_ints['SNP'],
                                          hl.cond(hl.is_transition(ref, alt),
                                                  allele_ints['Transition'],
                                                  allele_ints['Transversion']),
                                          at),
                       _num_allele_type(ref, alt))

    variant_ac = Env.get_uid()
    variant_atypes = Env.get_uid()
    mt = mt.annotate_rows(**{variant_ac: hl.agg.call_stats(mt.GT, mt.alleles).AC,
                             variant_atypes: mt.alleles[1:].map(lambda alt: allele_type(mt.alleles[0], alt))})

    exprs = {}

    def has_field_of_type(name, dtype):
        return name in mt.entry and mt[name].dtype == dtype

    if has_field_of_type('DP', hl.tint32):
        exprs['dp_stats'] = hl.agg.stats(mt.DP).select('mean', 'stdev', 'min', 'max')

    if has_field_of_type('GQ', hl.tint32):
        exprs['gq_stats'] = hl.agg.stats(mt.GQ).select('mean', 'stdev', 'min', 'max')

    if not has_field_of_type('GT', hl.tcall):
        raise ValueError("'sample_qc': expect an entry field 'GT' of type 'call'")

    exprs['n_called'] = hl.agg.count_where(hl.is_defined(mt['GT']))
    exprs['n_not_called'] = hl.agg.count_where(hl.is_missing(mt['GT']))
    exprs['n_hom_ref'] = hl.agg.count_where(mt['GT'].is_hom_ref())
    exprs['n_het'] = hl.agg.count_where(mt['GT'].is_het())
    exprs['n_singleton'] = hl.agg.sum(hl.sum(hl.range(0, mt['GT'].ploidy).map(lambda i: mt[variant_ac][mt['GT'][i]] == 1)))

    def get_allele_type(allele_idx):
        return hl.cond(allele_idx > 0, mt[variant_atypes][allele_idx - 1], hl.null(hl.tint32))

    exprs['allele_type_counts'] = hl.agg.explode(
        lambda elt: hl.agg.counter(elt),
        hl.range(0, mt['GT'].ploidy).map(lambda i: get_allele_type(mt['GT'][i])))

    mt = mt.annotate_cols(**{name: hl.struct(**exprs)})

    zero = hl.int64(0)

    select_exprs = {}
    if 'dp_stats' in exprs:
        select_exprs['dp_stats'] = mt[name].dp_stats
    if 'gq_stats' in exprs:
        select_exprs['gq_stats'] = mt[name].gq_stats

    select_exprs = {
        **select_exprs,
        'call_rate': hl.float64(mt[name].n_called) / (mt[name].n_called + mt[name].n_not_called),
        'n_called': mt[name].n_called,
        'n_not_called': mt[name].n_not_called,
        'n_hom_ref': mt[name].n_hom_ref,
        'n_het': mt[name].n_het,
        'n_hom_var': mt[name].n_called - mt[name].n_hom_ref - mt[name].n_het,
        'n_non_ref': mt[name].n_called - mt[name].n_hom_ref,
        'n_singleton': mt[name].n_singleton,
        'n_snp': mt[name].allele_type_counts.get(allele_ints["Transition"], zero)
                 + mt[name].allele_type_counts.get(allele_ints["Transversion"], zero),
        'n_insertion': mt[name].allele_type_counts.get(allele_ints["Insertion"], zero),
        'n_deletion': mt[name].allele_type_counts.get(allele_ints["Deletion"], zero),
        'n_transition': mt[name].allele_type_counts.get(allele_ints["Transition"], zero),
        'n_transversion': mt[name].allele_type_counts.get(allele_ints["Transversion"], zero),
        'n_star': mt[name].allele_type_counts.get(allele_ints["Star"], zero)
    }

    mt = mt.annotate_cols(**{name: mt[name].select(**select_exprs)})

    mt = mt.annotate_cols(**{name: mt[name].annotate(
        r_ti_tv=divide_null(hl.float64(mt[name].n_transition), mt[name].n_transversion),
        r_het_hom_var=divide_null(hl.float64(mt[name].n_het), mt[name].n_hom_var),
        r_insertion_deletion=divide_null(hl.float64(mt[name].n_insertion), mt[name].n_deletion)
    )})

    mt = mt.drop(variant_ac, variant_atypes)

    return mt
Example n. 37
0
def variant_qc(mt, name='variant_qc') -> MatrixTable:
    """Compute common variant statistics (quality control metrics).

    .. include:: ../_templates/req_tvariant.rst

    Examples
    --------

    >>> dataset_result = hl.variant_qc(dataset)

    Notes
    -----
    This method computes variant statistics from the genotype data, returning
    a new struct field `name` with the following metrics based on the fields
    present in the entry schema.

    If `mt` contains an entry field `DP` of type :py:data:`.tint32`, then the
    field `dp_stats` is computed. If `mt` contains an entry field `GQ` of type
    :py:data:`.tint32`, then the field `gq_stats` is computed. Both `dp_stats`
    and `gq_stats` are structs with four fields:

    - `mean` (``float64``) -- Mean value.
    - `stdev` (``float64``) -- Standard deviation (zero degrees of freedom).
    - `min` (``int32``) -- Minimum value.
    - `max` (``int32``) -- Maximum value.

    If the dataset does not contain an entry field `GT` of type
    :py:data:`.tcall`, then an error is raised. The following fields are always
    computed from `GT`:

    - `AF` (``array<float64>``) -- Calculated allele frequency, one element
      per allele, including the reference. Sums to one. Equivalent to
      `AC` / `AN`.
    - `AC` (``array<int32>``) -- Calculated allele count, one element per
      allele, including the reference. Sums to `AN`.
    - `AN` (``int32``) -- Total number of called alleles.
    - `homozygote_count` (``array<int32>``) -- Number of homozygotes per
      allele. One element per allele, including the reference.
    - `call_rate` (``float64``) -- Fraction of calls neither missing nor filtered.
      Equivalent to `n_called` / :meth:`.count_cols`.
    - `n_called` (``int64``) -- Number of samples with a defined `GT`.
    - `n_not_called` (``int64``) -- Number of samples with a missing `GT`.
    - `n_filtered` (``int64``) -- Number of filtered entries.
    - `n_het` (``int64``) -- Number of heterozygous samples.
    - `n_non_ref` (``int64``) -- Number of samples with at least one called
      non-reference allele.
    - `het_freq_hwe` (``float64``) -- Expected frequency of heterozygous
      samples under Hardy-Weinberg equilibrium. See
      :func:`.functions.hardy_weinberg_test` for details.
    - `p_value_hwe` (``float64``) -- p-value from test of Hardy-Weinberg equilibrium.
      See :func:`.functions.hardy_weinberg_test` for details.

    Warning
    -------
    `het_freq_hwe` and `p_value_hwe` are calculated as in
    :func:`.functions.hardy_weinberg_test`, with non-diploid calls
    (``ploidy != 2``) ignored in the counts. As this test is only
    statistically rigorous in the biallelic setting, :func:`.variant_qc`
    sets both fields to missing for multiallelic variants. Consider using
    :func:`~hail.methods.split_multi` to split multi-allelic variants beforehand.
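
    For example (a short sketch, assuming `dataset` as in the example above),
    variants can be filtered on the computed metrics:

    >>> ds = hl.variant_qc(dataset)
    >>> ds = ds.filter_rows((ds.variant_qc.call_rate > 0.95)
    ...                     & (ds.variant_qc.p_value_hwe > 1e-6))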

    Parameters
    ----------
    mt : :class:`.MatrixTable`
        Dataset.
    name : :obj:`str`
        Name for resulting field.

    Returns
    -------
    :class:`.MatrixTable`
    """
    require_row_key_variant(mt, 'variant_qc')

    bound_exprs = {}
    gq_dp_exprs = {}

    def has_field_of_type(name, dtype):
        return name in mt.entry and mt[name].dtype == dtype

    if has_field_of_type('DP', hl.tint32):
        gq_dp_exprs['dp_stats'] = hl.agg.stats(mt.DP).select(
            'mean', 'stdev', 'min', 'max')

    if has_field_of_type('GQ', hl.tint32):
        gq_dp_exprs['gq_stats'] = hl.agg.stats(mt.GQ).select(
            'mean', 'stdev', 'min', 'max')

    if not has_field_of_type('GT', hl.tcall):
        raise ValueError(
            f"'variant_qc': expect an entry field 'GT' of type 'call'")

    bound_exprs['n_called'] = hl.agg.count_where(hl.is_defined(mt['GT']))
    bound_exprs['n_not_called'] = hl.agg.count_where(hl.is_missing(mt['GT']))
    n_cols_ref = hl.expr.construct_expr(
        hl.ir.Ref('n_cols'), hl.tint32, mt._row_indices,
        hl.utils.LinkedList(hl.expr.Aggregation))
    bound_exprs['n_filtered'] = hl.int64(n_cols_ref) - hl.agg.count()
    bound_exprs['call_stats'] = hl.agg.call_stats(mt.GT, mt.alleles)

    result = hl.rbind(
        hl.struct(**bound_exprs),
        lambda e1: hl.rbind(
            hl.case()
            .when(hl.len(mt.alleles) == 2,
                  hl.hardy_weinberg_test(
                      e1.call_stats.homozygote_count[0],
                      e1.call_stats.AC[1] - 2 * e1.call_stats.homozygote_count[1],
                      e1.call_stats.homozygote_count[1]))
            .or_missing(),
            lambda hwe: hl.struct(**{
                **gq_dp_exprs,
                **e1.call_stats,
                'call_rate': hl.float(e1.n_called) / (e1.n_called + e1.n_not_called + e1.n_filtered),
                'n_called': e1.n_called,
                'n_not_called': e1.n_not_called,
                'n_filtered': e1.n_filtered,
                'n_het': e1.n_called - hl.sum(e1.call_stats.homozygote_count),
                'n_non_ref': e1.n_called - e1.call_stats.homozygote_count[0],
                'het_freq_hwe': hwe.het_freq_hwe,
                'p_value_hwe': hwe.p_value
            })))

    return mt.annotate_rows(**{name: result})
Example n. 38
0
mt = mt.annotate_cols(
    is_BPPSY=hl.case()
    .when(mt.is_BP_including_BPSCZ & mt.is_PSYCHOSIS, True)
    .when(~mt.is_BP_including_BPSCZ, False)
    .default(hl.null(hl.tbool)),
    is_BP_no_PSY=hl.case()
    .when(mt.is_BP_including_BPSCZ & ~mt.is_PSYCHOSIS, True)
    .when(~mt.is_BP_including_BPSCZ, False)
    .default(hl.null(hl.tbool)))

mt.cols().select('is_BP1', 'is_BP2', 'is_BPNOS', 'is_BPSCZ', 'is_BP',
                 'is_BP_including_BPSCZ', 'is_SCZ', 'is_BPPSY', 'is_BP_no_PSY',
                 'is_PSYCHOSIS').write(PHENOTYPE_TABLE_BOOL, overwrite=True)

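# Minor allele count (MAC): the smaller of the alternate-allele count and the
# reference-allele count across all genotypes (assumes biallelic variants).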
mt = mt.annotate_rows(MAC=hl.min(
    hl.agg.sum(mt.GT.n_alt_alleles()),
    hl.agg.sum(
        hl.int64(mt.GT.is_het_ref()) + 2 * hl.int64(mt.GT.is_hom_ref()))))
mt_MAC10 = mt.filter_rows(mt.MAC >= 10)


def run_logistic_bool(mt, variable):
    ht = hl.logistic_regression_rows(test='firth',
                                     y=mt[variable],
                                     x=mt.GT.n_alt_alleles(),
                                     covariates=[
                                         1, mt.imputesex.impute_sex.is_female,
                                         mt.pca.PC1, mt.pca.PC2, mt.pca.PC3,
                                         mt.pca.PC4, mt.pca.PC5, mt.pca.PC6,
                                         mt.pca.PC7, mt.pca.PC8, mt.pca.PC9,
                                         mt.pca.PC10
                                     ])
    return ht
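
# Illustrative call (hypothetical, using the phenotype fields defined above):
# ht_BPPSY = run_logistic_bool(mt_MAC10, 'is_BPPSY')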
Example n. 39
0
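# Encodes chrom:pos as a single sortable integer: contig_number * 10**9 + position.
# (contig_number is assumed to be a helper that maps contig names to integers.)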
def x_position(locus: hl.expr.LocusExpression) -> hl.expr.Int64Expression:
    return hl.int64(contig_number(locus.contig)) * 1_000_000_000 + locus.position