def range_table(n, n_partitions=None) -> 'hail.Table':
    """Build a table whose only field is the row index.

    Examples
    --------
    .. doctest::

        >>> df = hl.utils.range_table(100)
        >>> df.count()
        100

    Notes
    -----
    The table produced here has exactly one field:

     - `idx` (:py:data:`.tint32`) - Row index (key).

    Intended for testing and learning; it is not optimized for production
    performance.

    Parameters
    ----------
    n : int
        Number of rows.
    n_partitions : int, optional
        Number of partitions (uses Spark default parallelism if None).

    Returns
    -------
    :class:`.Table`
    """
    # The range table is built on the JVM side and then wrapped for Python.
    jvm_table_companion = Env.hail().table.Table
    jvm_table = jvm_table_companion.range(
        Env.hc()._jhc, n, joption(n_partitions))
    return hail.Table(jvm_table)
def test_value_same_after_parsing(self):
    """Evaluate each literal as a table global and compare with its source value.

    For every ``(type, value)`` pair from ``self.values()``, the literal is
    inserted as global field ``foo`` on a one-row range table; evaluating
    the globals must give back ``Struct(foo=value)``.
    """
    for typ, value in self.values():
        literal = ir.Literal(typ, value)
        single_row_range = ir.TableRange(1, 1)
        globals_with_foo = ir.InsertFields(
            ir.Ref("global"), [("foo", literal)], None)
        table_ir = ir.TableMapGlobals(single_row_range, globals_with_foo)

        evaluated = hl.eval(hl.Table(table_ir).index_globals())
        self.assertEqual(evaluated, hl.Struct(foo=value))
def check(ht):
    """Validate that ``ht`` is keyed by ``locus`` first and return it keyed by ``locus`` only.

    Parameters
    ----------
    ht
        Table-like object exposing ``key`` (iterable of key field names)
        and ``_tir`` (its table IR).

    Returns
    -------
    ``ht`` itself when its key is exactly ``['locus']``; otherwise a new
    table re-keyed to ``['locus']`` via ``TableKeyBy(..., is_sorted=True)``.

    Raises
    ------
    TypeError
        If the table has no key fields, or its first key field is not
        ``"locus"``.
    """
    keys = list(ht.key)
    # An unkeyed table has no first key; report it with the same TypeError
    # instead of letting ``keys[0]`` fail with an unrelated IndexError.
    if not keys or keys[0] != 'locus':
        raise TypeError(
            f'table inputs must have first key "locus", found {keys}')
    if keys == ['locus']:
        return ht
    # Extra trailing key fields: narrow the key to the 'locus' prefix.
    return hl.Table(TableKeyBy(ht._tir, ['locus'], is_sorted=True))
def test_value_same_after_parsing(self):
    """Batch-evaluate literals placed in table globals and compare with their source values.

    Every ``(type, value)`` pair from ``self.values()`` is inserted as
    global field ``foo`` on a one-row range table. All resulting
    expressions are evaluated in a single ``hl._eval_many`` call and each
    result must equal ``Struct(foo=value)``.
    """
    def globals_expr(typ, value):
        # Expression yielding the globals struct with `foo` set to the literal.
        literal = ir.Literal(typ, value)
        table_ir = ir.TableMapGlobals(
            ir.TableRange(1, 1),
            ir.InsertFields(ir.Ref("global"), [("foo", literal)], None))
        return hl.Table(table_ir).index_globals()

    cases = [(globals_expr(typ, value), hl.Struct(foo=value))
             for typ, value in self.values()]

    actuals = hl._eval_many(*(expr for expr, _ in cases))
    for (expr, expected), actual in zip(cases, actuals):
        assert actual == expected, str(expr)
def range_table(n, n_partitions=None) -> 'hail.Table':
    """Build a table whose only field is the row index.

    Examples
    --------

    >>> df = hl.utils.range_table(100)
    >>> df.count()
    100

    Notes
    -----
    The table produced here has exactly one field:

     - `idx` (:py:data:`.tint32`) - Row index (key).

    Intended for testing and learning; it is not optimized for production
    performance.

    Parameters
    ----------
    n : int
        Number of rows.
    n_partitions : int, optional
        Number of partitions (uses Spark default parallelism if None).

    Returns
    -------
    :class:`.Table`
    """
    caller = 'range_table'
    check_nonnegative_and_in_range(caller, 'n', n)
    # The partition count is only validated when the caller supplied one.
    if n_partitions is not None:
        check_positive_and_in_range(caller, 'n_partitions', n_partitions)

    range_ir = hail.ir.TableRange(n, n_partitions)
    return hail.Table(range_ir)
def sparse_split_multi(sparse_mt, *, filter_changed_loci=False):
    """Splits multiallelic variants on a sparse MatrixTable.

    Takes a dataset formatted like the output of :func:`.vcf_combiner`. The
    splitting will add `was_split` and `a_index` fields, as :func:`.split_multi`
    does. This function drops the `LA` (local alleles) field, as it re-computes
    entry fields based on the new, split global alleles.

    Variants are split thus:

    - A row with only one (reference) or two (reference and alternate) alleles
      is unchanged, as local and global alleles are the same.

    - A row with multiple alternate alleles will be split, with one row for
      each alternate allele, and each row will contain two alleles: ref and alt.
      The reference and alternate allele will be minrepped using
      :func:`.min_rep`.

    The split multi logic handles the following entry fields:

        .. code-block:: text

          struct {
            LGT: call
            LAD: array<int32>
            DP: int32
            GQ: int32
            LPL: array<int32>
            RGQ: int32
            LPGT: call
            LA: array<int32>
            END: int32
          }

    All fields except for `LA` are optional, and only handled if they exist.

    - `LA` is used to find the corresponding local allele index for the desired
      global `a_index`, and then dropped from the resulting dataset. If `LA`
      does not contain the global `a_index`, calls will be downcoded to hom ref
      and `PL` will be set to missing.

    - `LGT` and `LPGT` are downcoded using the corresponding local `a_index`.
      They are renamed to `GT` and `PGT` respectively, as the resulting call is
      no longer local.

    - `LAD` is used to create an `AD` field consisting of the allele depths
      corresponding to the reference and global `a_index` alleles.

    - `DP` is preserved unchanged.

    - `GQ` is recalculated from the updated `PL`, if it exists, but otherwise
      preserved unchanged.

    - `PL` array elements are calculated from the minimum `LPL` value for all
      allele pairs that downcode to the desired one. (This logic is identical
      to the `PL` logic in :func:`.split_multi_hts`.) If a row has an alternate
      allele but it is not present in `LA`, the `PL` field is set to missing.
      The `PL` for `ref/<NON_REF>` in that case can be drawn from `RGQ`.

    - `RGQ` (the reference genotype quality) is preserved unchanged.

    - `END` is untouched.

    Notes
    -----
    This version of split-multi does not handle duplicate loci (in which case
    the explode could possibly result in out-of-order rows, although the actual
    split_multi function also doesn't handle that case).

    It also checks that min-repping will not change the locus and will error if
    it does. (I believe the VCF combiner checks that this holds true,
    currently.)

    Parameters
    ----------
    sparse_mt : :class:`.MatrixTable`
        Sparse MatrixTable to split.
    filter_changed_loci : :obj:`.bool`
        Rather than erroring if any REF/ALT pair changes locus under
        :func:`.min_rep`, filter that variant instead.

    Returns
    -------
    :class:`.MatrixTable`
        The split MatrixTable in sparse format.
    """

    hl.methods.misc.require_row_key_variant(sparse_mt, "sparse_split_multi")

    # Fresh unique field names for the localized entries/cols and for the
    # temporary per-row array of split descriptors.
    entries = hl.utils.java.Env.get_uid()
    cols = hl.utils.java.Env.get_uid()
    ds = sparse_mt.localize_entries(entries, cols)
    new_id = hl.utils.java.Env.get_uid()

    def struct_from_min_rep(i):
        # Descriptor for alternate allele `i`: min-rep (ref, alt_i) and keep it
        # only if the locus is unchanged. Otherwise yield a missing struct when
        # `filter_changed_loci` is set, or raise the error below.
        return hl.bind(
            lambda mr: (hl.case().
                        when(
                            ds.locus == mr.locus,
                            hl.struct(locus=ds.locus, alleles=[mr.alleles[0], mr.alleles[1]], a_index=i, was_split=True)).when(
                            filter_changed_loci,
                            hl.null(
                                hl.tstruct(locus=ds.locus.dtype, alleles=hl.tarray(hl.tstr), a_index=hl.tint, was_split=hl.tbool))).
                        or_error("Found non-left-aligned variant in sparse_split_multi\n" +
                                 "old locus: " + hl.str(ds.locus) + "\n" +
                                 "old ref : " + ds.alleles[0] + "\n" +
                                 "old alt : " + ds.alleles[i] + "\n" +
                                 "mr locus : " + hl.str(mr.locus) + "\n" +
                                 "mr ref : " + mr.alleles[0] + "\n" +
                                 "mr alt : " + mr.alleles[1])),
            hl.min_rep(ds.locus, [ds.alleles[0], ds.alleles[i]]))

    # Rows with fewer than three alleles are passed through as a single
    # descriptor (a_index=1, was_split=False). Otherwise one descriptor is
    # built per alternate allele, sorted by alleles; missing descriptors are
    # removed only when `filter_changed_loci` is set.
    explode_structs = hl.cond(
        hl.len(ds.alleles) < 3,
        [
            hl.struct(
                locus=ds.locus, alleles=ds.alleles, a_index=1, was_split=False)
        ],
        hl._sort_by(
            hl.cond(
                filter_changed_loci,
                hl.range(1, hl.len(ds.alleles)).map(struct_from_min_rep).filter(
                    hl.is_defined),
                hl.range(1, hl.len(ds.alleles)).map(struct_from_min_rep)),
            lambda l, r: hl._compare(l.alleles, r.alleles) < 0))

    # One output row per descriptor.
    ds = ds.annotate(**{new_id: explode_structs}).explode(new_id)

    def transform_entries(old_entry):
        # Rewrites a single entry struct for the split row identified by
        # ds[new_id].
        def with_local_a_index(local_a_index):
            # `local_a_index` is the position in LA of the row's a_index, or
            # missing when LA does not contain it (see `lai` below).
            fields = set(old_entry.keys())

            def with_pl(pl):
                # `pl` is the downcoded PL expression, or Python None when the
                # entry has no LPL field.
                new_exprs = {}
                dropped_fields = ['LA']
                if 'LGT' in fields:
                    # When the allele is absent from LA, downcode against
                    # len(LA), an index past the last local allele.
                    new_exprs['GT'] = hl.downcode(
                        old_entry.LGT,
                        hl.or_else(local_a_index, hl.len(old_entry.LA)))
                    dropped_fields.append('LGT')
                if 'LPGT' in fields:
                    new_exprs['PGT'] = hl.downcode(
                        old_entry.LPGT,
                        hl.or_else(local_a_index, hl.len(old_entry.LA)))
                    dropped_fields.append('LPGT')
                if 'LAD' in fields:
                    non_ref_ad = hl.or_else(old_entry.LAD[local_a_index], 0)  # zeroed if not in LAD
                    # First element is the total depth minus the target
                    # allele's depth; second is the target allele's depth.
                    new_exprs['AD'] = hl.or_missing(
                        hl.is_defined(old_entry.LAD),
                        [hl.sum(old_entry.LAD) - non_ref_ad, non_ref_ad])
                    dropped_fields.append('LAD')
                if 'LPL' in fields:
                    new_exprs['PL'] = pl
                    if 'GQ' in fields:
                        # Recompute GQ from the new PL, falling back to the
                        # existing GQ when that result is missing.
                        new_exprs['GQ'] = hl.or_else(hl.gq_from_pl(pl), old_entry.GQ)

                    dropped_fields.append('LPL')

                # Three cases:
                #  1. single-allele row: copy each present local field to its
                #     name without the leading 'L', unchanged;
                #  2. hom-ref LGT: GT/PGT take the original local values, the
                #     other new fields use the recomputed expressions;
                #  3. otherwise: use all recomputed expressions.
                # NOTE(review): case 2 reads `old_entry.LGT` even when 'LGT' is
                # not in `fields`, although the docstring describes LGT as
                # optional -- confirm that inputs always carry LGT.
                return (hl.case().when(
                    hl.len(ds.alleles) == 1,
                    old_entry.annotate(
                        **{
                            f[1:]: old_entry[f]
                            for f in ['LGT', 'LPGT', 'LAD', 'LPL']
                            if f in fields
                        }).drop(*dropped_fields)).when(
                    hl.or_else(old_entry.LGT.is_hom_ref(), False),
                    old_entry.annotate(
                        **{
                            f: old_entry[f'L{f}'] if f in ['GT', 'PGT'] else e
                            for f, e in new_exprs.items()
                        }).drop(*dropped_fields)).default(
                    old_entry.annotate(**new_exprs).drop(
                        *dropped_fields)))

            if 'LPL' in fields:
                # For each of the three biallelic genotype indices i, take the
                # minimum LPL over every local genotype index j whose call
                # downcodes (w.r.t. local_a_index) to genotype i. Missing when
                # LPL or local_a_index is missing.
                new_pl = hl.or_missing(
                    hl.is_defined(old_entry.LPL),
                    hl.or_missing(
                        hl.is_defined(local_a_index),
                        hl.range(0, 3).map(lambda i: hl.min(
                            hl.range(0, hl.triangle(hl.len(old_entry.LA))).
                            filter(lambda j: hl.downcode(
                                hl.unphased_diploid_gt_index_call(j),
                                local_a_index) == hl.
                                unphased_diploid_gt_index_call(i)).map(
                                lambda idx: old_entry.LPL[idx])))))
                return hl.bind(with_pl, new_pl)
            else:
                return with_pl(None)

        # Scan LA for the position whose value equals this row's a_index; the
        # accumulator starts missing and stays missing if no position matches.
        lai = hl.fold(
            lambda accum, elt: hl.cond(old_entry.LA[elt] == ds[new_id].a_index, elt, accum),
            hl.null(hl.tint32),
            hl.range(0, hl.len(old_entry.LA)))
        return hl.bind(with_local_a_index, lai)

    # Replace locus/alleles with the split values, add a_index/was_split,
    # rewrite every entry, and drop the temporary descriptor field.
    new_row = ds.row.annotate(
        **{
            'locus': ds[new_id].locus,
            'alleles': ds[new_id].alleles,
            'a_index': ds[new_id].a_index,
            'was_split': ds[new_id].was_split,
            entries: ds[entries].map(transform_entries)
        }).drop(new_id)

    # Key by locus only while the rows are rewritten, then re-key by
    # (locus, alleles) with is_sorted=True.
    ds = hl.Table(
        hl.ir.TableKeyBy(hl.ir.TableMapRows(
            hl.ir.TableKeyBy(ds._tir, ['locus']), new_row._ir),
            ['locus', 'alleles'],
            is_sorted=True))
    return ds._unlocalize_entries(entries, cols, list(sparse_mt.col_key.keys()))