def get_n_even_intervals(n):
    """Split the default reference genome into ``n`` equally sized intervals.

    The genome size is the sum of all contig lengths of
    ``hl.default_reference()``. Each interval spans
    ``int(genome_size / n) + 1`` global positions; the end of an interval is
    clamped to ``genome_size - 1`` so it never runs past the last position.

    Parameters
    ----------
    n : :obj:`int`
        Number of intervals to produce.

    Returns
    -------
    :obj:`List[Interval]`
        ``n`` intervals built with :class:`.Interval`'s default endpoint
        inclusivity.
    """
    default_rg = hl.default_reference()
    total_bases = sum(default_rg.lengths.values())
    step = int(total_bases / n) + 1
    last_position = total_bases - 1

    def locus_at(global_position):
        # Turn a 0-based global position into a concrete Locus value.
        return hl.eval(hl.locus_from_global_position(global_position))

    intervals = []
    for index in range(n):
        lower = index * step
        upper = min(lower + step, last_position)
        intervals.append(hl.Interval(locus_at(lower), locus_at(upper)))
    return intervals
def calculate_new_intervals(ht, n, reference_genome):
    """Compute intervals suitable for repartitioning a combiner matrix table.

    Takes a table keyed by exactly ``['locus']`` and produces a list of
    intervals. Rows are counted in key order; an interval boundary is placed
    at every row whose 1-based index is a multiple of ``n`` and at the last
    row. The first interval starts at global position 0, each later interval
    starts one global position after the previous boundary, and a final
    interval runs from just after the last row to the end of the last contig.

    Parameters
    ----------
    ht : :class:`.Table`
        Table / Rows Table to compute new intervals for
    n : :obj:`int`
        Number of rows each partition should have, (last partition may be
        smaller)
    reference_genome : :obj:`str` or :class:`.ReferenceGenome`
        Reference genome to use. A string is resolved to the reference
        genome of that name.

    Returns
    -------
    :obj:`List[Interval]`

    Raises
    ------
    AssertionError
        If ``ht`` is not keyed by exactly ``['locus']`` or its locus type
        does not use ``reference_genome``.
    ValueError
        If ``ht`` has no rows.
    """
    # The contig lookups below need a ReferenceGenome object, so resolve a
    # name up front instead of failing later with an AttributeError.
    if isinstance(reference_genome, str):
        reference_genome = hl.get_reference(reference_genome)

    assert list(ht.key) == ['locus']
    assert ht.locus.dtype == hl.tlocus(reference_genome=reference_genome)

    # Last position of the last contig: closing point of the final interval.
    last_contig = reference_genome.contigs[-1]
    end = hl.Locus(last_contig,
                   reference_genome.lengths[last_contig],
                   reference_genome=reference_genome)

    n_rows = ht.count()
    if n_rows == 0:
        raise ValueError('empty table!')

    # x is the 0-based row index (count of preceding rows), y the 1-based one.
    ht = ht.select()
    ht = ht.annotate(x=hl.scan.count())
    ht = ht.annotate(y=ht.x + 1)
    # Keep only boundary rows: those that complete a group of n rows, plus
    # the very last row so the tail of the table is always covered.
    ht = ht.filter((ht.x // n != ht.y // n) | (ht.x == (n_rows - 1)))
    ht = ht.select()

    # Each interval starts one global position after the previous boundary
    # row; the first one has no predecessor and starts at global position 0.
    # NOTE(review): if a boundary row sits on the final position of the
    # genome, ``global_position() + 1`` points past the end -- confirm how
    # locus_from_global_position behaves in that case.
    ht = ht.annotate(start=hl.or_else(
        hl.scan._prev_nonnull(
            hl.locus_from_global_position(
                ht.locus.global_position() + 1,
                reference_genome=reference_genome)),
        hl.locus_from_global_position(0, reference_genome=reference_genome)))
    ht = ht.key_by()
    ht = ht.select(
        interval=hl.interval(start=ht.start, end=ht.locus, includes_end=True))

    intervals = ht.aggregate(hl.agg.collect(ht.interval))

    # Add a closing interval from just after the last row to the genome end.
    last_st = hl.eval(
        hl.locus_from_global_position(
            hl.literal(intervals[-1].end).global_position() + 1,
            reference_genome=reference_genome))
    interval = hl.Interval(start=last_st, end=end, includes_end=True)
    intervals.append(interval)
    return intervals