Ejemplo n.º 1
0
def get_n_even_intervals(n):
    ref = hl.default_reference()
    genome_size = sum(ref.lengths.values())
    partition_size = int(genome_size / n) + 1
    return list(
        map(
            lambda x: hl.Interval(
                hl.eval(hl.locus_from_global_position(x * partition_size)),
                hl.eval(
                    hl.locus_from_global_position(
                        min(x * partition_size + partition_size, genome_size -
                            1)))), range(n)))
Ejemplo n.º 2
0
def calculate_new_intervals(ht, n, reference_genome):
    """takes a table, keyed by ['locus', ...] and produces a list of intervals suitable
    for repartitioning a combiner matrix table

    Parameters
    ----------
    ht : :class:`.Table`
        Table / Rows Table to compute new intervals for
    n : :obj:`int`
        Number of rows each partition should have, (last partition may be smaller)
    reference_genome: :obj:`str` or :class:`.ReferenceGenome`, optional
        Reference genome to use.

    Returns
    -------
    :obj:`List[Interval]`
    """
    assert list(ht.key) == ['locus']
    assert ht.locus.dtype == hl.tlocus(reference_genome=reference_genome)
    end = hl.Locus(reference_genome.contigs[-1],
                   reference_genome.lengths[reference_genome.contigs[-1]],
                   reference_genome=reference_genome)

    n_rows = ht.count()

    if n_rows == 0:
        raise ValueError('empty table!')

    ht = ht.select()
    ht = ht.annotate(x=hl.scan.count())
    ht = ht.annotate(y=ht.x + 1)
    ht = ht.filter((ht.x // n != ht.y // n) | (ht.x == (n_rows - 1)))
    ht = ht.select()
    ht = ht.annotate(start=hl.or_else(
        hl.scan._prev_nonnull(
            hl.locus_from_global_position(ht.locus.global_position() + 1,
                                          reference_genome=reference_genome)),
        hl.locus_from_global_position(0, reference_genome=reference_genome)))
    ht = ht.key_by()
    ht = ht.select(
        interval=hl.interval(start=ht.start, end=ht.locus, includes_end=True))

    intervals = ht.aggregate(hl.agg.collect(ht.interval))

    last_st = hl.eval(
        hl.locus_from_global_position(
            hl.literal(intervals[-1].end).global_position() + 1,
            reference_genome=reference_genome))
    interval = hl.Interval(start=last_st, end=end, includes_end=True)
    intervals.append(interval)
    return intervals