def values(self):
    """Return a list of ``(hail type, Python value)`` sample pairs."""
    return [
        # Primitive types.
        (hl.tbool, True),
        (hl.tint32, 0),
        (hl.tint64, 0),
        (hl.tfloat32, 0.5),
        (hl.tfloat64, 0.5),
        (hl.tstr, "foo"),
        # Struct and collection types.
        (hl.tstruct(x=hl.tint32), hl.Struct(x=0)),
        (hl.tarray(hl.tint32), [0, 1, 4]),
        (hl.tset(hl.tint32), {0, 1, 4}),
        (hl.tdict(hl.tstr, hl.tint32), {"a": 0, "b": 1, "c": 4}),
        (hl.tinterval(hl.tint32), hl.Interval(0, 1, True, False)),
        # Locus (on the default reference) and call.
        (hl.tlocus(hl.default_reference()), hl.Locus("1", 1)),
        (hl.tcall, hl.Call([0, 1])),
    ]
def test_aggregate2(self):
    """Aggregate a two-row table grouped by ``status`` and check every aggregator result."""
    row_type = hl.tstruct(status=hl.tint32, GT=hl.tcall, qPheno=hl.tint32)
    data = [
        {'status': 0, 'GT': hl.Call([0, 0]), 'qPheno': 3},
        {'status': 0, 'GT': hl.Call([0, 1]), 'qPheno': 13},
    ]
    kt = hl.Table.parallelize(data, row_type)

    # Both rows share status == 0, so the grouped table has a single row.
    grouped = kt.group_by(status=kt.status)
    aggregated = grouped.aggregate(
        x1=agg.collect(kt.qPheno * 2),
        x2=agg.explode(lambda elt: agg.collect(elt), [kt.qPheno, kt.qPheno + 1]),
        x3=agg.min(kt.qPheno),
        x4=agg.max(kt.qPheno),
        x5=agg.sum(kt.qPheno),
        x6=agg.product(hl.int64(kt.qPheno)),
        x7=agg.count(),
        x8=agg.count_where(kt.qPheno == 3),
        x9=agg.fraction(kt.qPheno == 1),
        x10=agg.stats(hl.float64(kt.qPheno)),
        x11=agg.hardy_weinberg_test(kt.GT),
        x13=agg.inbreeding(kt.GT, 0.1),
        x14=agg.call_stats(kt.GT, ["A", "T"]),
        x15=agg.collect(hl.Struct(a=5, b="foo", c=hl.Struct(banana='apple')))[0],
        x16=agg.collect(hl.Struct(a=5, b="foo", c=hl.Struct(banana='apple')).c.banana)[0],
        x17=agg.explode(lambda elt: agg.collect(elt), hl.null(hl.tarray(hl.tint32))),
        x18=agg.explode(lambda elt: agg.collect(elt), hl.null(hl.tset(hl.tint32))),
        x19=agg.take(kt.GT, 1, ordering=-kt.qPheno),
    )
    first_row = aggregated.take(1)[0]
    result = convert_struct_to_dict(first_row)

    expected = {
        'status': 0,
        'x1': [6, 26],
        'x2': [3, 4, 13, 14],
        'x3': 3,
        'x4': 13,
        'x5': 16,
        'x6': 39,
        'x7': 2,
        'x8': 1,
        'x9': 0.0,
        'x10': {'min': 3.0, 'max': 13.0, 'sum': 16.0, 'stdev': 5.0, 'n': 2, 'mean': 8.0},
        'x11': {'het_freq_hwe': 0.5, 'p_value': 0.5},
        'x13': {'n_called': 2, 'expected_homs': 1.64, 'f_stat': -1.777777777777777, 'observed_homs': 1},
        'x14': {'AC': [3, 1], 'AF': [0.75, 0.25], 'AN': 4, 'homozygote_count': [1, 0]},
        'x15': {'a': 5, 'c': {'banana': 'apple'}, 'b': 'foo'},
        'x16': 'apple',
        'x17': [],
        'x18': [],
        'x19': [hl.Call([0, 1])],
    }

    # Show the full dict diff on failure.
    self.maxDiff = None
    self.assertDictEqual(result, expected)
def test_aggregate2(self):
    """Aggregate a two-row table grouped by ``status`` and check every aggregator result."""
    row_type = hl.tstruct(status=hl.tint32, GT=hl.tcall, qPheno=hl.tint32)
    data = [
        {'status': 0, 'GT': hl.Call([0, 0]), 'qPheno': 3},
        {'status': 0, 'GT': hl.Call([0, 1]), 'qPheno': 13},
    ]
    kt = hl.Table.parallelize(data, row_type)

    # Both rows share status == 0, so the grouped table has a single row.
    grouped = kt.group_by(status=kt.status)
    aggregated = grouped.aggregate(
        x1=agg.collect(kt.qPheno * 2),
        x2=agg.explode(lambda elt: agg.collect(elt), [kt.qPheno, kt.qPheno + 1]),
        x3=agg.min(kt.qPheno),
        x4=agg.max(kt.qPheno),
        x5=agg.sum(kt.qPheno),
        x6=agg.product(hl.int64(kt.qPheno)),
        x7=agg.count(),
        x8=agg.count_where(kt.qPheno == 3),
        x9=agg.fraction(kt.qPheno == 1),
        x10=agg.stats(hl.float64(kt.qPheno)),
        x11=agg.hardy_weinberg_test(kt.GT),
        x13=agg.inbreeding(kt.GT, 0.1),
        x14=agg.call_stats(kt.GT, ["A", "T"]),
        x15=agg.collect(hl.Struct(a=5, b="foo", c=hl.Struct(banana='apple')))[0],
        x16=agg.collect(hl.Struct(a=5, b="foo", c=hl.Struct(banana='apple')).c.banana)[0],
        x17=agg.explode(lambda elt: agg.collect(elt), hl.null(hl.tarray(hl.tint32))),
        x18=agg.explode(lambda elt: agg.collect(elt), hl.null(hl.tset(hl.tint32))),
        x19=agg.take(kt.GT, 1, ordering=-kt.qPheno),
    )
    first_row = aggregated.take(1)[0]
    result = convert_struct_to_dict(first_row)

    expected = {
        'status': 0,
        'x1': [6, 26],
        'x2': [3, 4, 13, 14],
        'x3': 3,
        'x4': 13,
        'x5': 16,
        'x6': 39,
        'x7': 2,
        'x8': 1,
        'x9': 0.0,
        'x10': {'min': 3.0, 'max': 13.0, 'sum': 16.0, 'stdev': 5.0, 'n': 2, 'mean': 8.0},
        'x11': {'het_freq_hwe': 0.5, 'p_value': 0.5},
        'x13': {'n_called': 2, 'expected_homs': 1.64, 'f_stat': -1.777777777777777, 'observed_homs': 1},
        'x14': {'AC': [3, 1], 'AF': [0.75, 0.25], 'AN': 4, 'homozygote_count': [1, 0]},
        'x15': {'a': 5, 'c': {'banana': 'apple'}, 'b': 'foo'},
        'x16': 'apple',
        'x17': [],
        'x18': [],
        'x19': [hl.Call([0, 1])],
    }

    # Show the full dict diff on failure.
    self.maxDiff = None
    self.assertDictEqual(result, expected)
def values(self):
    """Return a list of ``(hail type, Python value)`` sample pairs."""
    return [
        # Primitive types.
        (hl.tbool, True),
        (hl.tint32, 0),
        (hl.tint64, 0),
        (hl.tfloat32, 0.5),
        (hl.tfloat64, 0.5),
        (hl.tstr, "foo"),
        # Struct and collection types.
        (hl.tstruct(x=hl.tint32), hl.Struct(x=0)),
        (hl.tarray(hl.tint32), [0, 1, 4]),
        (hl.tset(hl.tint32), {0, 1, 4}),
        (hl.tdict(hl.tstr, hl.tint32), {"a": 0, "b": 1, "c": 4}),
        (hl.tinterval(hl.tint32), hl.Interval(0, 1, True, False)),
        # Locus (on the default reference) and call.
        (hl.tlocus(hl.default_reference()), hl.Locus("1", 1)),
        (hl.tcall, hl.Call([0, 1])),
    ]
def create_all_values():
    """Build a struct expression whose fields cover many Hail types.

    Several types appear twice: once with a defined value and once as a
    missing value created with ``hl.null`` (the ``m``-prefixed fields).
    """
    fields = dict(
        f32=hl.float32(3.14),
        i64=hl.int64(-9),
        m=hl.null(hl.tfloat64),
        astruct=hl.struct(a=hl.null(hl.tint32), b=5.5),
        mstruct=hl.null(hl.tstruct(x=hl.tint32, y=hl.tstr)),
        aset=hl.set(['foo', 'bar', 'baz']),
        mset=hl.null(hl.tset(hl.tfloat64)),
        d=hl.dict({
            hl.array(['a', 'b']): 0.5,
            hl.array(['x', hl.null(hl.tstr), 'z']): 0.3,
        }),
        md=hl.null(hl.tdict(hl.tint32, hl.tstr)),
        h38=hl.locus('chr22', 33878978, 'GRCh38'),
        ml=hl.null(hl.tlocus('GRCh37')),
        i=hl.interval(hl.locus('1', 999), hl.locus('1', 1001)),
        c=hl.call(0, 1),
        mc=hl.null(hl.tcall),
        t=hl.tuple([hl.call(1, 2, phased=True), 'foo', hl.null(hl.tstr)]),
        mt=hl.null(hl.ttuple(hl.tlocus('GRCh37'), hl.tbool)),
    )
    return hl.struct(**fields)
def create_all_values():
    """Build a struct expression whose fields cover many Hail types.

    Several types appear twice: once with a defined value and once as a
    missing value created with ``hl.null`` (the ``m``-prefixed fields).
    """
    fields = dict(
        f32=hl.float32(3.14),
        i64=hl.int64(-9),
        m=hl.null(hl.tfloat64),
        astruct=hl.struct(a=hl.null(hl.tint32), b=5.5),
        mstruct=hl.null(hl.tstruct(x=hl.tint32, y=hl.tstr)),
        aset=hl.set(['foo', 'bar', 'baz']),
        mset=hl.null(hl.tset(hl.tfloat64)),
        d=hl.dict({
            hl.array(['a', 'b']): 0.5,
            hl.array(['x', hl.null(hl.tstr), 'z']): 0.3,
        }),
        md=hl.null(hl.tdict(hl.tint32, hl.tstr)),
        h38=hl.locus('chr22', 33878978, 'GRCh38'),
        ml=hl.null(hl.tlocus('GRCh37')),
        i=hl.interval(hl.locus('1', 999), hl.locus('1', 1001)),
        c=hl.call(0, 1),
        mc=hl.null(hl.tcall),
        t=hl.tuple([hl.call(1, 2, phased=True), 'foo', hl.null(hl.tstr)]),
        mt=hl.null(hl.ttuple(hl.tlocus('GRCh37'), hl.tbool)),
    )
    return hl.struct(**fields)
def create_all_values_datasets():
    """Create a cached Table and MatrixTable annotated with an all-types struct.

    Returns
    -------
    tuple
        ``(all_values_table, all_values_matrix_table)``. The table carries the
        fields as row fields and, prefixed with ``global_``, as globals. The
        matrix table carries them as globals, row, column and entry fields
        under the ``global_``, ``row_``, ``col_`` and ``entry_`` prefixes.
    """
    all_values = hl.struct(
        f32=hl.float32(3.14),
        i64=hl.int64(-9),
        m=hl.null(hl.tfloat64),
        astruct=hl.struct(a=hl.null(hl.tint32), b=5.5),
        mstruct=hl.null(hl.tstruct(x=hl.tint32, y=hl.tstr)),
        aset=hl.set(['foo', 'bar', 'baz']),
        mset=hl.null(hl.tset(hl.tfloat64)),
        d=hl.dict({
            hl.array(['a', 'b']): 0.5,
            hl.array(['x', hl.null(hl.tstr), 'z']): 0.3,
        }),
        md=hl.null(hl.tdict(hl.tint32, hl.tstr)),
        h38=hl.locus('chr22', 33878978, 'GRCh38'),
        ml=hl.null(hl.tlocus('GRCh37')),
        i=hl.interval(hl.locus('1', 999), hl.locus('1', 1001)),
        c=hl.call(0, 1),
        mc=hl.null(hl.tcall),
        t=hl.tuple([hl.call(1, 2, phased=True), 'foo', hl.null(hl.tstr)]),
        mt=hl.null(hl.ttuple(hl.tlocus('GRCh37'), hl.tbool)),
    )

    def prefix(s, p):
        # Copy struct `s`, prepending `p` to every field name.
        renamed = {}
        for field_name in s:
            renamed[p + field_name] = s[field_name]
        return hl.struct(**renamed)

    table = hl.utils.range_table(5, n_partitions=3)
    table = table.annotate_globals(**prefix(all_values, 'global_'))
    table = table.annotate(**all_values)
    all_values_table = table.cache()

    matrix = hl.utils.range_matrix_table(3, 2, n_partitions=2)
    matrix = matrix.annotate_globals(**prefix(all_values, 'global_'))
    matrix = matrix.annotate_rows(**prefix(all_values, 'row_'))
    matrix = matrix.annotate_cols(**prefix(all_values, 'col_'))
    matrix = matrix.annotate_entries(**prefix(all_values, 'entry_'))
    all_values_matrix_table = matrix.cache()

    return all_values_table, all_values_matrix_table
def create_all_values_datasets():
    """Create a cached Table and MatrixTable annotated with an all-types struct.

    Returns
    -------
    tuple
        ``(all_values_table, all_values_matrix_table)``. The table carries the
        fields as row fields and, prefixed with ``global_``, as globals. The
        matrix table carries them as globals, row, column and entry fields
        under the ``global_``, ``row_``, ``col_`` and ``entry_`` prefixes.
    """
    all_values = hl.struct(
        f32=hl.float32(3.14),
        i64=hl.int64(-9),
        m=hl.null(hl.tfloat64),
        astruct=hl.struct(a=hl.null(hl.tint32), b=5.5),
        mstruct=hl.null(hl.tstruct(x=hl.tint32, y=hl.tstr)),
        aset=hl.set(['foo', 'bar', 'baz']),
        mset=hl.null(hl.tset(hl.tfloat64)),
        d=hl.dict({
            hl.array(['a', 'b']): 0.5,
            hl.array(['x', hl.null(hl.tstr), 'z']): 0.3,
        }),
        md=hl.null(hl.tdict(hl.tint32, hl.tstr)),
        h38=hl.locus('chr22', 33878978, 'GRCh38'),
        ml=hl.null(hl.tlocus('GRCh37')),
        i=hl.interval(hl.locus('1', 999), hl.locus('1', 1001)),
        c=hl.call(0, 1),
        mc=hl.null(hl.tcall),
        t=hl.tuple([hl.call(1, 2, phased=True), 'foo', hl.null(hl.tstr)]),
        mt=hl.null(hl.ttuple(hl.tlocus('GRCh37'), hl.tbool)),
    )

    def prefix(s, p):
        # Copy struct `s`, prepending `p` to every field name.
        renamed = {}
        for field_name in s:
            renamed[p + field_name] = s[field_name]
        return hl.struct(**renamed)

    table = hl.utils.range_table(5, n_partitions=3)
    table = table.annotate_globals(**prefix(all_values, 'global_'))
    table = table.annotate(**all_values)
    all_values_table = table.cache()

    matrix = hl.utils.range_matrix_table(3, 2, n_partitions=2)
    matrix = matrix.annotate_globals(**prefix(all_values, 'global_'))
    matrix = matrix.annotate_rows(**prefix(all_values, 'row_'))
    matrix = matrix.annotate_cols(**prefix(all_values, 'col_'))
    matrix = matrix.annotate_entries(**prefix(all_values, 'entry_'))
    all_values_matrix_table = matrix.cache()

    return all_values_table, all_values_matrix_table
def visit_set(self, node, visited_children):
    """Build a set type from the visited children of a set node.

    ``visited_children`` must unpack into exactly five items; only the
    fourth (the element type) is used.
    """
    # The other four positions are ignored; unpacking still enforces the count.
    _, _, _, element_type, _ = visited_children
    return hl.tset(element_type)
def _impute_type(x, partial_type):
    """Infer the Hail type of the Python value or expression `x`.

    Parameters
    ----------
    x
        Value to inspect: a Hail expression, a Python scalar, a Hail value
        object (``Locus``, ``Interval``, ``Call``, ``Struct``), a container
        (tuple, list, set-like, mapping), a numpy scalar/array, or a missing
        value.
    partial_type
        A possibly incomplete type hint, or ``None``. It is returned as-is
        for missing values and empty containers, and its component types are
        passed down when recursing into container elements.

    Returns
    -------
    The imputed Hail type. This may be `partial_type` itself (possibly
    ``None``) when `x` carries no type information.

    Raises
    ------
    ValueError
        If `x` is an integer outside the 64-bit range.
    ExpressionException
        If `partial_type` is incompatible with `x`, if a container has
        elements whose types cannot be unified, if `x` is an unfinished
        case/switch builder, or if the type of `x` cannot be imputed.
    """
    from hail.genetics import Locus, Call
    from hail.utils import Interval, Struct

    def refine(t, refined):
        # Use `refined` when no hint was given; otherwise require the hint to
        # be of the same kind of type as `refined`.
        if t is None:
            return refined
        if not isinstance(t, type(refined)):
            raise ExpressionException(
                "Incompatible partial_type, {}, for value {}".format(partial_type, x))
        return t

    if isinstance(x, Expression):
        return x.dtype
    elif isinstance(x, bool):
        # Must precede the int check: bool is a subclass of int.
        return tbool
    elif isinstance(x, int):
        if hl.tint32.min_value <= x <= hl.tint32.max_value:
            return tint32
        elif hl.tint64.min_value <= x <= hl.tint64.max_value:
            return tint64
        else:
            raise ValueError(
                "Hail has no integer data type large enough to store {}".format(x))
    elif isinstance(x, float):
        return tfloat64
    elif isinstance(x, str):
        return tstr
    elif isinstance(x, Locus):
        return tlocus(x.reference_genome)
    elif isinstance(x, Interval):
        return tinterval(x.point_type)
    elif isinstance(x, Call):
        return tcall
    # A plain dict is treated as a struct only when the hint says so.
    elif isinstance(x, Struct) or isinstance(x, dict) and isinstance(partial_type, tstruct):
        partial_type = refine(partial_type, hl.tstruct())
        t = tstruct(**{k: _impute_type(x[k], partial_type.get(k)) for k in x})
        return t
    elif isinstance(x, tuple):
        partial_type = refine(partial_type, hl.ttuple())
        # Positions beyond the end of the hint get no hint at all.
        return ttuple(*[
            _impute_type(element,
                         partial_type[index] if index < len(partial_type) else None)
            for index, element in enumerate(x)
        ])
    elif isinstance(x, list):
        partial_type = refine(partial_type, hl.tarray(None))
        if len(x) == 0:
            return partial_type
        ts = {_impute_type(element, partial_type.element_type) for element in x}
        unified_type = super_unify_types(*ts)
        if unified_type is None:
            raise ExpressionException(
                "Hail does not support heterogeneous arrays: "
                "found list with elements of types {} ".format(list(ts)))
        return tarray(unified_type)
    elif is_setlike(x):
        partial_type = refine(partial_type, hl.tset(None))
        if len(x) == 0:
            return partial_type
        ts = {_impute_type(element, partial_type.element_type) for element in x}
        unified_type = super_unify_types(*ts)
        if not unified_type:
            raise ExpressionException(
                "Hail does not support heterogeneous sets: "
                "found set with elements of types {} ".format(list(ts)))
        return tset(unified_type)
    elif isinstance(x, Mapping):
        # Remember whether the caller supplied a hint before refine() replaces it.
        user_partial_type = partial_type
        partial_type = refine(partial_type, hl.tdict(None, None))
        if len(x) == 0:
            return partial_type
        kts = {_impute_type(element, partial_type.key_type) for element in x.keys()}
        vts = {_impute_type(element, partial_type.value_type) for element in x.values()}
        unified_key_type = super_unify_types(*kts)
        unified_value_type = super_unify_types(*vts)
        if not unified_key_type:
            raise ExpressionException(
                "Hail does not support heterogeneous dicts: "
                "found dict with keys {} of types {} ".format(list(x.keys()), list(kts)))
        if not unified_value_type:
            # With string keys and no caller hint, a mapping with mixed value
            # types is imputed as a struct instead of failing.
            if unified_key_type == hl.tstr and user_partial_type is None:
                return tstruct(**{k: _impute_type(x[k], None) for k in x})
            raise ExpressionException(
                "Hail does not support heterogeneous dicts: "
                "found dict with values of types {} ".format(list(vts)))
        return tdict(unified_key_type, unified_value_type)
    elif isinstance(x, np.generic):
        return from_numpy(x.dtype)
    elif isinstance(x, np.ndarray):
        element_type = from_numpy(x.dtype)
        return tndarray(element_type, x.ndim)
    elif x is None or pd.isna(x):
        return partial_type
    elif isinstance(x, (hl.expr.builders.CaseBuilder, hl.expr.builders.SwitchBuilder)):
        # Fixed: the two literals used to join as "either'default'" with no space.
        raise ExpressionException(
            "'switch' and 'case' expressions must end with a call to either "
            "'default' or 'or_missing'")
    else:
        raise ExpressionException(
            "Hail cannot automatically impute type of {}: {}".format(type(x), x))
def maximal_independent_set(i, j, keep=True, tie_breaker=None) -> Table:
    """Return a table containing the vertices in a near
    `maximal independent set <https://en.wikipedia.org/wiki/Maximal_independent_set>`_
    of an undirected graph whose edges are given by a two-column table.

    Examples
    --------

    Prune individuals from a dataset until no close relationships remain with
    respect to a PC-Relate measure of kinship.

    >>> pc_rel = hl.pc_relate(dataset.GT, 0.001, k=2, statistics='kin')
    >>> pairs = pc_rel.filter(pc_rel['kin'] > 0.125)
    >>> pairs = pairs.key_by(i=pairs.i.s, j=pairs.j.s).select()
    >>> related_samples_to_remove = hl.maximal_independent_set(pairs.i, pairs.j, False)
    >>> result = dataset.filter_cols(hl.is_defined(related_samples_to_remove[dataset.s]), keep=False)

    Prune individuals from a dataset, preferring to keep cases over controls.

    >>> pc_rel = hl.pc_relate(dataset.GT, 0.001, k=2, statistics='kin')
    >>> pairs = pc_rel.filter(pc_rel['kin'] > 0.125)
    >>> pairs = pairs.key_by(i=pairs.i.s, j=pairs.j.s).select()
    >>> samples = dataset.cols()
    >>> pairs_with_case = pairs.key_by(
    ...     i=hl.struct(id=pairs.i, is_case=samples[pairs.i].is_case),
    ...     j=hl.struct(id=pairs.j, is_case=samples[pairs.j].is_case))
    >>> def tie_breaker(l, r):
    ...     return hl.cond(l.is_case & ~r.is_case, -1,
    ...                    hl.cond(~l.is_case & r.is_case, 1, 0))
    >>> related_samples_to_remove = hl.maximal_independent_set(
    ...    pairs_with_case.i, pairs_with_case.j, False, tie_breaker)
    >>> result = dataset.filter_cols(hl.is_defined(
    ...     related_samples_to_remove.select(
    ...        s = related_samples_to_remove.node.id).key_by('s')[dataset.s]), keep=False)

    Notes
    -----

    The vertex set of the graph is implicitly all the values realized by `i`
    and `j` on the rows of this table. Each row of the table corresponds to an
    undirected edge between the vertices given by evaluating `i` and `j` on
    that row. An undirected edge may appear multiple times in the table and
    will not affect the output. Vertices with self-edges are removed as they
    are not independent of themselves.

    The expressions for `i` and `j` must have the same type.

    The value of `keep` determines whether the vertices returned are those
    in the maximal independent set, or those in the complement of this set.
    This is useful if you need to filter a table without removing vertices that
    don't appear in the graph at all.

    This method implements a greedy algorithm which iteratively removes a
    vertex of highest degree until the graph contains no edges. The greedy
    algorithm always returns an independent set, but the set may not always
    be perfectly maximal.

    `tie_breaker` is a Python function taking two arguments---say `l` and
    `r`---each of which is an :class:`Expression` of the same type as `i` and
    `j`. `tie_breaker` returns a :class:`NumericExpression`, which defines an
    ordering on nodes. A pair of nodes can be ordered in one of three ways, and
    `tie_breaker` must encode the relationship as follows:

     - if ``l < r`` then ``tie_breaker`` evaluates to some negative integer
     - if ``l == r`` then ``tie_breaker`` evaluates to 0
     - if ``l > r`` then ``tie_breaker`` evaluates to some positive integer

    For example, the usual ordering on the integers is defined by: ``l - r``.

    The `tie_breaker` function must satisfy the following property:
    ``tie_breaker(l, r) == -tie_breaker(r, l)``.

    When multiple nodes have the same degree, this algorithm will order the
    nodes according to ``tie_breaker`` and remove the *largest* node.

    Parameters
    ----------
    i : :class:`.Expression`
        Expression to compute one endpoint of an edge.
    j : :class:`.Expression`
        Expression to compute another endpoint of an edge.
    keep : :obj:`bool`
        If ``True``, return vertices in set. If ``False``, return vertices removed.
    tie_breaker : function
        Function used to order nodes with equal degree.

    Returns
    -------
    :class:`.Table`
        Table with the set of independent vertices. The table schema is one row
        field `node` which has the same type as input expressions `i` and `j`.

    Raises
    ------
    ValueError
        If `i` and `j` have different types, are not expressions of a
        :class:`.Table`, or come from different tables.
    """
    # Validate inputs: same dtype, both derived from the same Table.
    if i.dtype != j.dtype:
        raise ValueError(
            "'maximal_independent_set' expects arguments `i` and `j` to have same type. "
            "Found {} and {}.".format(i.dtype, j.dtype))
    source = i._indices.source
    if not isinstance(source, Table):
        raise ValueError(
            "'maximal_independent_set' expects an expression of 'Table'. Found {}"
            .format("expression of '{}'".format(source.__class__)
                    if source is not None else 'scalar expression'))
    if i._indices.source != j._indices.source:
        raise ValueError(
            "'maximal_independent_set' expects arguments `i` and `j` to be expressions of the same Table. "
            "Found\n{}\n{}".format(i, j))

    node_t = i.dtype
    if tie_breaker:
        # Trace the user's function once with symbolic arguments `l` and `r`.
        # Each is a 1-tuple wrapping the node type and is unwrapped with [0].
        wrapped_node_t = ttuple(node_t)
        l = construct_expr(VariableReference('l'), wrapped_node_t)
        r = construct_expr(VariableReference('r'), wrapped_node_t)
        tie_breaker_expr = hl.int64(tie_breaker(l[0], r[0]))
        t, _ = source._process_joins(i, j, tie_breaker_expr)
        # The traced comparison is handed to the JVM as a string.
        tie_breaker_hql = tie_breaker_expr._ast.to_hql()
    else:
        t, _ = source._process_joins(i, j)
        tie_breaker_hql = None

    # One row per distinct endpoint value, keyed by `node`.
    nodes = (t.select(node=[i, j]).explode('node').key_by('node').select())
    # NOTE(review): this selects fields literally named 'i' and 'j' rather than
    # the expressions `i` and `j`; presumably the table is expected to have
    # fields with those names, as in the docstring examples -- confirm.
    edges = t.key_by(None).select('i', 'j')
    # The edges are collected and passed to the JVM-side graph routine.
    nodes_in_set = Env.hail().utils.Graph.maximalIndependentSet(
        edges._jt.collect(), node_t._jtype, joption(tie_breaker_hql))
    # Attach the returned set as a temporary global, filter on it, then drop it.
    nt = Table(
        nodes._jt.annotateGlobal(nodes_in_set, hl.tset(node_t)._jtype, 'nodes_in_set'))
    nt = (nt.filter(nt.nodes_in_set.contains(nt.node), keep).drop('nodes_in_set'))
    return nt
def concordance(
        left, right, *, _localize_global_statistics=True
) -> Tuple[List[List[int]], Table, Table]:
    """Calculate call concordance with another dataset.

    .. include:: ../_templates/req_tstring.rst

    .. include:: ../_templates/req_tvariant.rst

    .. include:: ../_templates/req_biallelic.rst

    .. include:: ../_templates/req_unphased_diploid_gt.rst

    Examples
    --------

    Compute concordance between two datasets and output the global concordance
    statistics and two tables with concordance computed per column key and per
    row key:

    >>> global_conc, cols_conc, rows_conc = hl.concordance(dataset, dataset2)

    Notes
    -----

    This method computes the genotype call concordance (from the entry
    field **GT**) between two biallelic variant datasets.  It requires
    unique sample IDs and performs an inner join on samples (only
    samples in both datasets will be considered). In addition, all genotype
    calls must be **diploid** and **unphased**.

    It performs an ordered zip join of the variants.  That means the
    variants of each dataset are sorted, with duplicate variants
    appearing in some random relative order, and then zipped together.
    When a variant appears a different number of times between the two
    datasets, the dataset with the fewer number of instances is padded
    with "no data".  For example, if a variant is only in one dataset,
    then each genotype is treated as "no data" in the other.

    This method returns a tuple of three objects: a nested list of
    list of int with global concordance summary statistics, a table
    with concordance statistics per column key, and a table with
    concordance statistics per row key.

    **Using the global summary result**

    The global summary is a list of list of int (conceptually a 5 by 5 matrix),
    where the indices have special meaning:

    0. No Data (missing variant)
    1. No Call (missing genotype call)
    2. Hom Ref
    3. Heterozygous
    4. Hom Var

    The first index is the state in the left dataset and the second index is
    the state in the right dataset. Typical uses of the summary list are shown
    below.

    >>> summary, samples, variants = hl.concordance(dataset, dataset2)
    >>> left_homref_right_homvar = summary[2][4]
    >>> left_het_right_missing = summary[3][1]
    >>> left_het_right_something_else = sum(summary[3][:]) - summary[3][3]
    >>> total_concordant = summary[2][2] + summary[3][3] + summary[4][4]
    >>> total_discordant = sum([sum(s[2:]) for s in summary[2:]]) - total_concordant

    **Using the table results**

    Table 1: Concordance statistics by column

    This table contains the column key field of `left`, and the following fields:

        - `n_discordant` (:py:data:`.tint64`) -- Count of discordant calls (see below for
          full definition).
        - `concordance` (:class:`.tarray` of :class:`.tarray` of :py:data:`.tint64`) --
          Array of concordance per state on left and right, matching the structure of
          the global summary defined above.

    Table 2: Concordance statistics by row

    This table contains the row key fields of `left`, and the following fields:

        - `n_discordant` (:py:data:`.tint64`) -- Count of discordant calls (see below for
          full definition).
        - `concordance` (:class:`.tarray` of :class:`.tarray` of :py:data:`.tint64`) --
          Array of concordance per state on left and right, matching the structure of the
          global summary defined above.

    In these tables, the column **n_discordant** is provided as a convenience,
    because this is often one of the most useful concordance statistics. This
    value is the number of genotypes which were called (homozygous reference,
    heterozygous, or homozygous variant) in both datasets, but where the call
    did not match between the two.

    The column `concordance` matches the structure of the global summary,
    which is detailed above. Once again, the first index into this array is the
    state on the left, and the second index is the state on the right. For
    example, ``concordance[1][4]`` is the number of "no call" genotypes on the
    left that were called homozygous variant on the right.

    Parameters
    ----------
    left : :class:`.MatrixTable`
        First dataset to compare.
    right : :class:`.MatrixTable`
        Second dataset to compare.

    Returns
    -------
    (list of list of int, :class:`.Table`, :class:`.Table`)
        The global concordance statistics, a table with concordance statistics
        per column key, and a table with concordance statistics per row key.

    Raises
    ------
    ValueError
        If either dataset contains duplicate sample IDs.
    """
    require_col_key_str(left, 'concordance, left')
    require_col_key_str(right, 'concordance, right')

    # Count occurrences of every sample ID so duplicates can be reported.
    left_sample_counter = left.aggregate_cols(hl.agg.counter(left.col_key[0]))
    right_sample_counter = right.aggregate_cols(hl.agg.counter(right.col_key[0]))

    left_bad = [f'{k!r}: {v}' for k, v in left_sample_counter.items() if v > 1]
    right_bad = [f'{k!r}: {v}' for k, v in right_sample_counter.items() if v > 1]
    if left_bad or right_bad:
        raise ValueError(f"Found duplicate sample IDs:\n"
                         f" left: {', '.join(left_bad)}\n"
                         f" right: {', '.join(right_bad)}")

    # Only samples present in both datasets take part (inner join on samples).
    included = set(left_sample_counter.keys()).intersection(
        set(right_sample_counter.keys()))

    info(f"concordance: including {len(included)} shared samples "
         f"({len(left_sample_counter)} total on left, {len(right_sample_counter)} total on right)")

    left = require_biallelic(left, 'concordance, left')
    right = require_biallelic(right, 'concordance, right')

    lit = hl.literal(included, dtype=hl.tset(hl.tstr))
    left = left.filter_cols(lit.contains(left.col_key[0]))
    right = right.filter_cols(lit.contains(right.col_key[0]))

    # Keep only the keys and the GT entry field on both sides.
    left = left.select_entries('GT').select_rows().select_cols()
    right = right.select_entries('GT').select_rows().select_cols()

    joined = hl.experimental.full_outer_join_mt(left, right)

    def get_idx(struct):
        # State index: 0 = no data (entry missing), 1 = no call (GT missing),
        # 2/3/4 = 2 + number of alternate alleles.
        return hl.cond(hl.is_missing(struct), 0,
                       hl.coalesce(2 + struct.GT.n_alt_alleles(), 1))

    # Each (left state, right state) pair is encoded as left + 5 * right.
    aggr = hl.agg.counter(
        get_idx(joined.left_entry) + 5 * get_idx(joined.right_entry))

    def concordance_array(counter):
        # Decode the counter into a 5 x 5 array indexed [left state][right state].
        return hl.range(0, 5).map(
            lambda i: hl.range(0, 5).map(lambda j: counter.get(i + 5 * j, 0)))

    # Keys of discordant cells: both sides called (states 2-4) and different.
    # The previous filter excluded perfect squares, which are not the diagonal
    # of this encoding, and it also counted no-data and no-call cells.
    discordant_keys = {
        left_state + 5 * right_state
        for left_state in range(2, 5)
        for right_state in range(2, 5)
        if left_state != right_state
    }

    def n_discordant(counter):
        return hl.sum(
            hl.array(counter)
            .filter(lambda tup: hl.literal(discordant_keys).contains(tup[0]))
            .map(lambda tup: tup[1]))

    glob = joined.aggregate_entries(concordance_array(aggr),
                                    _localize=_localize_global_statistics)
    if _localize_global_statistics:
        # Drop the "no data" row and column before summarizing.
        total_conc = [x[1:] for x in glob[1:]]
        on_diag = sum(total_conc[i][i] for i in range(len(total_conc)))
        total_obs = sum(sum(x) for x in total_conc)
        if total_obs > 0:
            info(f"concordance: total concordance {on_diag/total_obs * 100:.2f}%")
        else:
            # Nothing overlaps; avoid a ZeroDivisionError in the log line.
            info("concordance: total concordance is undefined "
                 "(no entries with data in both datasets)")

    per_variant = joined.annotate_rows(concordance=aggr)
    per_variant = per_variant.annotate_rows(
        concordance=concordance_array(per_variant.concordance),
        n_discordant=n_discordant(per_variant.concordance))

    per_sample = joined.annotate_cols(concordance=aggr)
    per_sample = per_sample.annotate_cols(
        concordance=concordance_array(per_sample.concordance),
        n_discordant=n_discordant(per_sample.concordance))

    return glob, per_sample.cols(), per_variant.rows()
def value_irs(self):
    """Return a list of sample value-IR nodes built from ``ir`` constructors.

    Nodes are built from a handful of shared leaves (literals and ``ir.Ref``
    references to free variables) plus a table range, a matrix-table read and
    a block-matrix read.
    """
    # Shared leaf nodes reused in the list below.
    b = ir.TrueIR()
    c = ir.Ref('c')
    i = ir.I32(5)
    j = ir.I32(7)
    a = ir.Ref('a')
    st = ir.Ref('st')
    # NOTE(review): `aa` is not referenced anywhere below.
    aa = ir.Ref('aa')
    sta = ir.Ref('sta')
    da = ir.Ref('da')
    nd = ir.Ref('nd')
    v = ir.Ref('v')
    s = ir.Ref('s')
    t = ir.Ref('t')
    call = ir.Ref('call')

    # Relational inputs for the table / matrix / block-matrix nodes.
    table = ir.TableRange(5, 3)
    matrix_read = ir.MatrixRead(
        ir.MatrixNativeReader(
            resource('backward_compatability/1.0.0/matrix_table/0.hmt'), None,
            False), False, False)
    block_matrix_read = ir.BlockMatrixRead(
        ir.BlockMatrixNativeReader('fake_file_path'))

    value_irs = [
        # Literals, references and scalar operations.
        i,
        ir.I64(5),
        ir.F32(3.14),
        ir.F64(3.14),
        s,
        ir.TrueIR(),
        ir.FalseIR(),
        ir.Void(),
        ir.Cast(i, hl.tfloat64),
        ir.NA(hl.tint32),
        ir.IsNA(i),
        ir.If(b, i, j),
        ir.Coalesce(i, j),
        ir.Let('v', i, v),
        ir.Ref('x'),
        ir.ApplyBinaryPrimOp('+', i, j),
        ir.ApplyUnaryPrimOp('-', i),
        ir.ApplyComparisonOp('EQ', i, j),
        # Arrays, sets, dicts and ndarrays.
        ir.MakeArray([i, ir.NA(hl.tint32), ir.I32(-3)], hl.tarray(hl.tint32)),
        ir.ArrayRef(a, i, ir.Str('foo')),
        ir.ArrayLen(a),
        ir.ArraySort(ir.ToStream(a), 'l', 'r',
                     ir.ApplyComparisonOp("LT", ir.Ref('l'), ir.Ref('r'))),
        ir.ToSet(a),
        ir.ToDict(da),
        ir.ToArray(a),
        ir.CastToArray(ir.NA(hl.tset(hl.tint32))),
        ir.MakeNDArray(
            ir.MakeArray([ir.F64(-1.0), ir.F64(1.0)], hl.tarray(hl.tfloat64)),
            ir.MakeTuple([ir.I64(1), ir.I64(2)]), ir.TrueIR()),
        ir.NDArrayShape(nd),
        ir.NDArrayReshape(nd, ir.MakeTuple([ir.I64(5)])),
        ir.NDArrayRef(nd, [ir.I64(1), ir.I64(2)]),
        ir.NDArrayMap(nd, 'v', v),
        ir.NDArrayMatMul(nd, nd),
        ir.LowerBoundOnOrderedCollection(a, i, True),
        ir.GroupByKey(da),
        # Stream operations.
        ir.StreamMap(st, 'v', v),
        ir.StreamZip([st, st], ['a', 'b'], ir.TrueIR(), 'ExtendNA'),
        ir.StreamFilter(st, 'v', v),
        ir.StreamFlatMap(sta, 'v', ir.ToStream(v)),
        ir.StreamFold(st, ir.I32(0), 'x', 'v', v),
        ir.StreamScan(st, ir.I32(0), 'x', 'v', v),
        ir.StreamLeftJoinDistinct(st, st, 'l', 'r', ir.I32(0), ir.I32(1)),
        ir.StreamFor(st, 'v', ir.Void()),
        # Aggregations and scans.
        ir.AggFilter(ir.TrueIR(), ir.I32(0), False),
        ir.AggExplode(ir.StreamRange(ir.I32(0), ir.I32(2), ir.I32(1)), 'x',
                      ir.I32(0), False),
        ir.AggGroupBy(ir.TrueIR(), ir.I32(0), False),
        ir.AggArrayPerElement(
            ir.ToArray(ir.StreamRange(ir.I32(0), ir.I32(2), ir.I32(1))), 'x',
            'y', ir.I32(0), False),
        ir.ApplyAggOp('Collect', [], [ir.I32(0)]),
        ir.ApplyScanOp('Collect', [], [ir.I32(0)]),
        ir.ApplyAggOp('CallStats', [ir.I32(2)], [call]),
        ir.ApplyAggOp('TakeBy', [ir.I32(10)], [ir.F64(-2.11), ir.F64(-2.11)]),
        # Structs, tuples, function application and literals.
        ir.Begin([ir.Void()]),
        ir.MakeStruct([('x', i)]),
        ir.SelectFields(s, ['x', 'z']),
        ir.InsertFields(s, [('x', i)], None),
        ir.GetField(s, 'x'),
        ir.MakeTuple([i, b]),
        ir.GetTupleElement(t, 1),
        ir.Die(ir.Str('mumblefoo'), hl.tfloat64),
        ir.Apply('land', hl.tbool, b, c),
        ir.Apply('toFloat64', hl.tfloat64, i),
        ir.Literal(hl.tarray(hl.tint32), [1, 2, None]),
        # Value nodes that consume a table or matrix table.
        ir.TableCount(table),
        ir.TableGetGlobals(table),
        ir.TableCollect(ir.TableKeyBy(table, [], False)),
        ir.TableToValueApply(table, {'name': 'ForceCountTable'}),
        ir.MatrixToValueApply(matrix_read, {'name': 'ForceCountMatrixTable'}),
        ir.TableAggregate(
            table,
            ir.MakeStruct([('foo', ir.ApplyAggOp('Collect', [],
                                                 [ir.I32(0)]))])),
        # Writers; every path comes from new_temp_file().
        ir.TableWrite(
            table,
            ir.TableNativeWriter(new_temp_file(), False, True,
                                 "fake_codec_spec$$")),
        ir.TableWrite(
            table,
            ir.TableTextWriter(new_temp_file(), None, True, "concatenated",
                               ",")),
        ir.MatrixAggregate(
            matrix_read,
            ir.MakeStruct([('foo', ir.ApplyAggOp('Collect', [],
                                                 [ir.I32(0)]))])),
        ir.MatrixWrite(
            matrix_read,
            ir.MatrixNativeWriter(new_temp_file(), False, False, "", None,
                                  None)),
        ir.MatrixWrite(
            matrix_read,
            ir.MatrixNativeWriter(
                new_temp_file(), False, False, "",
                '[{"start":{"row_idx":0},"end":{"row_idx": 10},"includeStart":true,"includeEnd":false}]',
                hl.dtype('array<interval<struct{row_idx:int32}>>'))),
        ir.MatrixWrite(
            matrix_read,
            ir.MatrixVCFWriter(new_temp_file(), None,
                               ir.ExportType.CONCATENATED, None)),
        ir.MatrixWrite(matrix_read, ir.MatrixGENWriter(new_temp_file(), 4)),
        ir.MatrixWrite(matrix_read, ir.MatrixPLINKWriter(new_temp_file())),
        ir.MatrixMultiWrite([matrix_read, matrix_read],
                            ir.MatrixNativeMultiWriter(
                                new_temp_file(), False, False)),
        # Block-matrix nodes.
        ir.BlockMatrixWrite(
            block_matrix_read,
            ir.BlockMatrixNativeWriter('fake_file_path', False, False, False)),
        ir.LiftMeOut(ir.I32(1)),
        ir.BlockMatrixWrite(
            block_matrix_read,
            ir.BlockMatrixPersistWriter('x', 'MEMORY_ONLY')),
        ir.UnpersistBlockMatrix(block_matrix_read),
    ]
    return value_irs
def maximal_independent_set(i, j, keep=True, tie_breaker=None, keyed=True) -> Table:
    """Return a table containing the vertices in a near
    `maximal independent set <https://en.wikipedia.org/wiki/Maximal_independent_set>`_
    of an undirected graph whose edges are given by a two-column table.

    Examples
    --------
    Run PC-relate and compute pairs of closely related individuals:

    >>> pc_rel = hl.pc_relate(dataset.GT, 0.001, k=2, statistics='kin')
    >>> pairs = pc_rel.filter(pc_rel['kin'] > 0.125)

    Starting from the above pairs, prune individuals from a dataset until no
    close relationships remain:

    >>> related_samples_to_remove = hl.maximal_independent_set(pairs.i, pairs.j, False)
    >>> result = dataset.filter_cols(
    ...     hl.is_defined(related_samples_to_remove[dataset.col_key]), keep=False)

    Starting from the above pairs, prune individuals from a dataset until no
    close relationships remain, preferring to keep cases over controls:

    >>> samples = dataset.cols()
    >>> pairs_with_case = pairs.key_by(
    ...     i=hl.struct(id=pairs.i, is_case=samples[pairs.i].is_case),
    ...     j=hl.struct(id=pairs.j, is_case=samples[pairs.j].is_case))
    >>> def tie_breaker(l, r):
    ...     return hl.cond(l.is_case & ~r.is_case, -1,
    ...                    hl.cond(~l.is_case & r.is_case, 1, 0))
    >>> related_samples_to_remove = hl.maximal_independent_set(
    ...    pairs_with_case.i, pairs_with_case.j, False, tie_breaker)
    >>> result = dataset.filter_cols(hl.is_defined(
    ...     related_samples_to_remove.key_by(
    ...        s = related_samples_to_remove.node.id.s)[dataset.col_key]), keep=False)

    Notes
    -----

    The vertex set of the graph is implicitly all the values realized by `i`
    and `j` on the rows of this table. Each row of the table corresponds to an
    undirected edge between the vertices given by evaluating `i` and `j` on
    that row. An undirected edge may appear multiple times in the table and
    will not affect the output. Vertices with self-edges are removed as they
    are not independent of themselves.

    The expressions for `i` and `j` must have the same type.

    The value of `keep` determines whether the vertices returned are those
    in the maximal independent set, or those in the complement of this set.
    This is useful if you need to filter a table without removing vertices that
    don't appear in the graph at all.

    This method implements a greedy algorithm which iteratively removes a
    vertex of highest degree until the graph contains no edges. The greedy
    algorithm always returns an independent set, but the set may not always
    be perfectly maximal.

    `tie_breaker` is a Python function taking two arguments---say `l` and
    `r`---each of which is an :class:`Expression` of the same type as `i` and
    `j`. `tie_breaker` returns a :class:`NumericExpression`, which defines an
    ordering on nodes. A pair of nodes can be ordered in one of three ways, and
    `tie_breaker` must encode the relationship as follows:

     - if ``l < r`` then ``tie_breaker`` evaluates to some negative integer
     - if ``l == r`` then ``tie_breaker`` evaluates to 0
     - if ``l > r`` then ``tie_breaker`` evaluates to some positive integer

    For example, the usual ordering on the integers is defined by: ``l - r``.

    The `tie_breaker` function must satisfy the following property:
    ``tie_breaker(l, r) == -tie_breaker(r, l)``.

    When multiple nodes have the same degree, this algorithm will order the
    nodes according to ``tie_breaker`` and remove the *largest* node.

    If `keyed` is ``False``, then a node may appear more than once in the
    resulting table: the result has one row per edge endpoint and is only
    de-duplicated when `keyed` is ``True``.

    Parameters
    ----------
    i : :class:`.Expression`
        Expression to compute one endpoint of an edge.
    j : :class:`.Expression`
        Expression to compute another endpoint of an edge.
    keep : :obj:`bool`
        If ``True``, return vertices in set. If ``False``, return vertices removed.
    tie_breaker : function
        Function used to order nodes with equal degree.
    keyed : :obj:`bool`
        If ``True``, key the resulting table by the `node` field, this requires
        a sort.

    Returns
    -------
    :class:`.Table`
        Table with the set of independent vertices. The table schema is one row
        field `node` which has the same type as input expressions `i` and `j`.

    Raises
    ------
    ValueError
        If `i` and `j` have different types, are not expressions of a
        :class:`.Table`, or come from different tables.
    """
    # Validate inputs: same dtype, both derived from the same Table.
    if i.dtype != j.dtype:
        raise ValueError("'maximal_independent_set' expects arguments `i` and `j` to have same type. "
                         "Found {} and {}.".format(i.dtype, j.dtype))

    source = i._indices.source
    if not isinstance(source, Table):
        raise ValueError("'maximal_independent_set' expects an expression of 'Table'. Found {}".format(
            "expression of '{}'".format(
                source.__class__) if source is not None else 'scalar expression'))

    if i._indices.source != j._indices.source:
        raise ValueError(
            "'maximal_independent_set' expects arguments `i` and `j` to be expressions of the same Table. "
            "Found\n{}\n{}".format(i, j))

    node_t = i.dtype

    if tie_breaker:
        # Trace the user's function once with symbolic arguments `l` and `r`.
        # Each is a 1-tuple wrapping the node type and is unwrapped with [0].
        wrapped_node_t = ttuple(node_t)
        left = construct_variable('l', wrapped_node_t)
        right = construct_variable('r', wrapped_node_t)
        tie_breaker_expr = hl.float64(tie_breaker(left[0], right[0]))
        t, _ = source._process_joins(i, j, tie_breaker_expr)
        # The traced comparison is handed to the backend as an IR string.
        tie_breaker_str = str(tie_breaker_expr._ir)
    else:
        t, _ = source._process_joins(i, j)
        tie_breaker_str = None

    # Write the unkeyed two-column edge table to a temp file and read it back;
    # the same `edges` is used for both the graph routine and the node table.
    edges = t.select(__i=i, __j=j).key_by().select('__i', '__j')
    edges_path = new_temp_file()
    edges.write(edges_path)
    edges = hl.read_table(edges_path)

    # The graph routine receives the collected edges as IR and yields a
    # set<node_t> expression.
    mis_nodes = construct_expr(
        ir.JavaIR(Env.hail().utils.Graph.pyMaximalIndependentSet(
            Env.spark_backend('maximal_independent_set')._to_java_value_ir(edges.collect(_localize=False)._ir),
            node_t._parsable_string(),
            tie_breaker_str)),
        hl.tset(node_t))

    # One row per edge endpoint, filtered by membership in the set.
    nodes = edges.select(node=[edges.__i, edges.__j])
    nodes = nodes.explode(nodes.node)
    nodes = nodes.annotate_globals(mis_nodes=mis_nodes)
    nodes = nodes.filter(nodes.mis_nodes.contains(nodes.node), keep)
    # Remove the helper global again.
    nodes = nodes.select_globals()
    if keyed:
        # Keying allows de-duplicating nodes that occur in several edges.
        return nodes.key_by('node').distinct()
    return nodes
def maximal_independent_set(i, j, keep=True, tie_breaker=None, keyed=True) -> Table:
    """Return a table containing the vertices in a near
    `maximal independent set <https://en.wikipedia.org/wiki/Maximal_independent_set>`_
    of an undirected graph whose edges are given by a two-column table.

    Examples
    --------
    Run PC-relate and compute pairs of closely related individuals:

    >>> pc_rel = hl.pc_relate(dataset.GT, 0.001, k=2, statistics='kin')
    >>> pairs = pc_rel.filter(pc_rel['kin'] > 0.125)

    Starting from the above pairs, prune individuals from a dataset until no
    close relationships remain:

    >>> related_samples_to_remove = hl.maximal_independent_set(pairs.i, pairs.j, False)
    >>> result = dataset.filter_cols(
    ...     hl.is_defined(related_samples_to_remove[dataset.col_key]), keep=False)

    Starting from the above pairs, prune individuals from a dataset until no
    close relationships remain, preferring to keep cases over controls:

    >>> samples = dataset.cols()
    >>> pairs_with_case = pairs.key_by(
    ...     i=hl.struct(id=pairs.i, is_case=samples[pairs.i].is_case),
    ...     j=hl.struct(id=pairs.j, is_case=samples[pairs.j].is_case))

    >>> def tie_breaker(l, r):
    ...     return hl.cond(l.is_case & ~r.is_case, -1,
    ...                    hl.cond(~l.is_case & r.is_case, 1, 0))

    >>> related_samples_to_remove = hl.maximal_independent_set(
    ...    pairs_with_case.i, pairs_with_case.j, False, tie_breaker)
    >>> result = dataset.filter_cols(hl.is_defined(
    ...     related_samples_to_remove.key_by(
    ...        s = related_samples_to_remove.node.id.s)[dataset.col_key]), keep=False)

    Notes
    -----

    The vertex set of the graph is implicitly all the values realized by `i`
    and `j` on the rows of this table. Each row of the table corresponds to an
    undirected edge between the vertices given by evaluating `i` and `j` on
    that row. An undirected edge may appear multiple times in the table and
    will not affect the output. Vertices with self-edges are removed as they
    are not independent of themselves.

    The expressions for `i` and `j` must have the same type.

    The value of `keep` determines whether the vertices returned are those
    in the maximal independent set, or those in the complement of this set.
    This is useful if you need to filter a table without removing vertices that
    don't appear in the graph at all.

    This method implements a greedy algorithm which iteratively removes a
    vertex of highest degree until the graph contains no edges. The greedy
    algorithm always returns an independent set, but the set may not always
    be perfectly maximal.

    `tie_breaker` is a Python function taking two arguments---say `l` and
    `r`---each of which is an :class:`Expression` of the same type as `i` and
    `j`. `tie_breaker` returns a :class:`NumericExpression`, which defines an
    ordering on nodes. A pair of nodes can be ordered in one of three ways, and
    `tie_breaker` must encode the relationship as follows:

     - if ``l < r`` then ``tie_breaker`` evaluates to some negative integer
     - if ``l == r`` then ``tie_breaker`` evaluates to 0
     - if ``l > r`` then ``tie_breaker`` evaluates to some positive integer

    For example, the usual ordering on the integers is defined by: ``l - r``.

    The `tie_breaker` function must satisfy the following property:
    ``tie_breaker(l, r) == -tie_breaker(r, l)``.

    When multiple nodes have the same degree, this algorithm will order the
    nodes according to ``tie_breaker`` and remove the *largest* node.

    If `keyed` is ``False``, the result is not de-duplicated, so a node may
    appear more than once in the resulting table (once per edge it is an
    endpoint of).

    Parameters
    ----------
    i : :class:`.Expression`
        Expression to compute one endpoint of an edge.
    j : :class:`.Expression`
        Expression to compute another endpoint of an edge.
    keep : :obj:`bool`
        If ``True``, return vertices in set. If ``False``, return vertices removed.
    tie_breaker : function
        Function used to order nodes with equal degree.
    keyed : :obj:`bool`
        If ``True``, key the resulting table by the `node` field and keep one
        row per distinct node; this requires a sort.

    Returns
    -------
    :class:`.Table`
        Table with the set of independent vertices. The table schema is one row
        field `node` which has the same type as input expressions `i` and `j`.

    Raises
    ------
    ValueError
        If `i` and `j` have different types, are not expressions of a
        :class:`.Table`, or come from different tables.
    """
    if i.dtype != j.dtype:
        raise ValueError("'maximal_independent_set' expects arguments `i` and `j` to have same type. "
                         "Found {} and {}.".format(i.dtype, j.dtype))

    source = i._indices.source
    if not isinstance(source, Table):
        raise ValueError("'maximal_independent_set' expects an expression of 'Table'. Found {}".format(
            "expression of '{}'".format(
                source.__class__) if source is not None else 'scalar expression'))

    if i._indices.source != j._indices.source:
        raise ValueError(
            "'maximal_independent_set' expects arguments `i` and `j` to be expressions of the same Table. "
            "Found\n{}\n{}".format(i, j))

    node_t = i.dtype

    if tie_breaker:
        # The comparator is built over 1-tuples wrapping the node type and
        # handed to the JVM as an IR string. The variable names 'l' and 'r'
        # are embedded in that string, so they are kept verbatim
        # (presumably the JVM side binds them -- confirm before renaming).
        wrapped_node_t = ttuple(node_t)
        left = construct_variable('l', wrapped_node_t)
        right = construct_variable('r', wrapped_node_t)
        tie_breaker_expr = hl.int64(tie_breaker(left[0], right[0]))
        t, _ = source._process_joins(i, j, tie_breaker_expr)
        tie_breaker_str = str(tie_breaker_expr._ir)
    else:
        t, _ = source._process_joins(i, j)
        tie_breaker_str = None

    # Write the unkeyed two-column edge table to a temp file and read it back;
    # the edges are consumed twice below (presumably this avoids recomputing
    # the upstream pipeline each time).
    edges = t.select(__i=i, __j=j).key_by().select('__i', '__j')
    edges_path = new_temp_file()
    edges.write(edges_path)
    edges = hl.read_table(edges_path)

    mis_nodes = construct_expr(JavaIR(Env.hail().utils.Graph.pyMaximalIndependentSet(
        Env.spark_backend('maximal_independent_set')._to_java_ir(edges.collect(_localize=False)._ir),
        node_t._parsable_string(),
        joption(tie_breaker_str))), hl.tset(node_t))

    # One row per edge endpoint: a vertex of degree d shows up d times here.
    nodes = edges.select(node=[edges.__i, edges.__j])
    nodes = nodes.explode(nodes.node)
    nodes = nodes.annotate_globals(mis_nodes=mis_nodes)
    nodes = nodes.filter(nodes.mis_nodes.contains(nodes.node), keep)
    nodes = nodes.select_globals()
    if keyed:
        # De-duplicate so the keyed result really is a set of vertices rather
        # than one row per incident edge.
        return nodes.key_by('node').distinct()
    return nodes
def visit_set(self, node, visited_children):
    """Build a Hail set type from the visited children of a set-type node.

    ``visited_children`` must contain exactly five items; only the fourth
    (the already-visited element type) is used. The others are presumably
    the type keyword, a separator and the surrounding angle brackets --
    confirm against the grammar.
    """
    # Strict five-way unpack: a different child count raises ValueError.
    _keyword, _gap, _opening, element_type, _closing = visited_children
    return hl.tset(element_type)