Exemple #1
0
def combine_gvcfs(mts):
    """Merge several gVCF matrix tables using a multi-way zip join.

    Every input is re-keyed by ``locus`` only, its ``alleles`` row field is
    replaced by an array of ``(ref, alt)`` structs (one per alternate
    allele), and its entries are localized.  The localized tables are
    zip-joined and passed to ``combine``.  Finally ``alleles`` is flattened
    back to an array holding the first pair's ``ref`` followed by every
    pair's ``alt``, and the result is returned as a matrix table keyed by
    ``['locus', 'alleles']`` with ``'s'`` as column key.

    Both re-keying steps pass ``is_sorted=True``; the required ordering of
    the inputs is not verified here.
    """

    # pylint: disable=protected-access
    def rekey_by_locus(mt):
        locus_keyed_ir = MatrixKeyRowsBy(mt._mir, ['locus'], is_sorted=True)
        return hl.MatrixTable(locus_keyed_ir)

    def pair_ref_with_alts(mt):
        def to_pairs(ref):
            return mt.alleles[1:].map(
                lambda alt: hl.struct(ref=ref, alt=alt))

        return mt.annotate_rows(alleles=hl.bind(to_pairs, mt.alleles[0]))

    def localize(mt):
        return mt._localize_entries('__entries', '__cols')

    by_locus = [rekey_by_locus(mt) for mt in mts]
    paired = [pair_ref_with_alts(mt) for mt in by_locus]
    localized = [localize(mt) for mt in paired]

    joined = hl.Table._multi_way_zip_join(localized, 'data', 'g')
    combined = combine(joined)

    # Rebuild a flat alleles array: [ref of first pair, alt of each pair...].
    ref_part = combined.alleles[:1].map(lambda pair: pair.ref)
    alt_part = combined.alleles.map(lambda pair: pair.alt)
    combined = combined.annotate(alleles=ref_part.extend(alt_part))

    unlocalized = combined._unlocalize_entries('__entries', '__cols', ['s'])
    return hl.MatrixTable(
        MatrixKeyRowsBy(unlocalized._mir,
                        ['locus', 'alleles'],
                        is_sorted=True))
Exemple #2
0
def combine_gvcfs(mts):
    """Merge several gVCF matrix tables using a multi-way zip join.

    Every input is re-keyed by ``locus`` only and its ``alleles`` row field
    is replaced by an array of ``(ref, alt)`` structs, one per alternate
    allele, each passed through ``min_rep`` below.  The inputs are then
    localized, zip-joined and passed to ``combine``.  The combined
    ``alleles`` pairs are turned back into a flat allele array by
    ``fix_alleles`` and the result is returned as a matrix table keyed by
    ``['locus', 'alleles']`` with ``'s'`` as column key.

    Both re-keying steps pass ``is_sorted=True``; the required ordering of
    the inputs is not verified here.
    """

    # pylint: disable=protected-access
    def localize(mt):
        return mt._localize_entries('__entries', '__cols')

    def fix_alleles(alleles):
        # Longest ref among the pairs; on equal length the later one wins
        # because the accumulator is kept only when strictly longer.
        longest_ref = alleles.map(lambda pair: pair.ref).fold(
            lambda best, cand: hl.cond(
                hl.len(best) > hl.len(cand), best, cand),
            '')

        def pad_alt(pair):
            # Append the part of the longest ref that this pair's own
            # alleles do not cover; other allele types pass through.
            by_type = hl.switch(hl.allele_type(pair.ref, pair.alt))
            by_type = by_type.when(
                'SNP', pair.alt + longest_ref[hl.len(pair.alt):])
            by_type = by_type.when(
                'Insertion', pair.alt + longest_ref[hl.len(pair.ref):])
            by_type = by_type.when(
                'Deletion', pair.alt + longest_ref[hl.len(pair.ref):])
            return by_type.default(pair.alt)

        alts = alleles.map(pad_alt)
        return hl.array([longest_ref]).extend(alts)

    def min_rep(locus, ref, alt):
        reduced = hl.min_rep(locus, [ref, alt])
        chooser = hl.case()
        # '<NON_REF>' is not reduced: keep it and the first ref base only.
        chooser = chooser.when(
            alt == '<NON_REF>', hl.struct(ref=ref[0:1], alt=alt))
        chooser = chooser.when(
            locus == reduced.locus,
            hl.struct(ref=reduced.alleles[0], alt=reduced.alleles[1]))
        return chooser.or_error("locus before and after minrep differ")

    def rekey_by_locus(mt):
        locus_keyed_ir = MatrixKeyRowsBy(mt._mir, ['locus'], is_sorted=True)
        return hl.MatrixTable(locus_keyed_ir)

    def minrep_pairs(mt):
        def to_pairs(ref, locus):
            return mt.alleles[1:].map(lambda alt: min_rep(locus, ref, alt))

        # alleles becomes an array of min-rep'ed (ref, alt) pairs
        return mt.annotate_rows(
            alleles=hl.bind(to_pairs, mt.alleles[0], mt.locus))

    by_locus = [rekey_by_locus(mt) for mt in mts]
    paired = [minrep_pairs(mt) for mt in by_locus]
    localized = [localize(mt) for mt in paired]

    joined = hl.Table._multi_way_zip_join(localized, 'data', 'g')
    combined = combine(joined)
    combined = combined.annotate(alleles=fix_alleles(combined.alleles))

    unlocalized = combined._unlocalize_entries('__entries', '__cols', ['s'])
    return hl.MatrixTable(
        MatrixKeyRowsBy(unlocalized._mir,
                        ['locus', 'alleles'],
                        is_sorted=True))
Exemple #3
0
def range_matrix_table(n_rows,
                       n_cols,
                       n_partitions=None) -> 'hail.MatrixTable':
    """Build a matrix table that has only row and column indices.

    Examples
    --------

    >>> range_ds = hl.utils.range_matrix_table(n_rows=100, n_cols=10)

    >>> range_ds.count_rows()
    100

    >>> range_ds.count_cols()
    10

    Notes
    -----
    The returned matrix table has these fields:

     - `row_idx` (:py:data:`.tint32`) - Row index (row key).
     - `col_idx` (:py:data:`.tint32`) - Column index (column key).

    There are no entry fields.

    Intended for tests and learning; it is not tuned for production
    performance.

    Parameters
    ----------
    n_rows : :obj:`int`
        Number of rows.
    n_cols : :obj:`int`
        Number of columns.
    n_partitions : int, optional
        Number of partitions (uses Spark default parallelism if None).

    Returns
    -------
    :class:`.MatrixTable`
    """
    caller = 'range_matrix_table'

    # Dimensions are validated first, in declaration order.
    for arg_name, value in (('n_rows', n_rows), ('n_cols', n_cols)):
        check_nonnegative_and_in_range(caller, arg_name, value)

    # The partition count is only validated when explicitly given.
    if n_partitions is not None:
        check_positive_and_in_range(caller, 'n_partitions', n_partitions)

    reader = hail.ir.MatrixRangeReader(n_rows, n_cols, n_partitions)
    return hail.MatrixTable(hail.ir.MatrixRead(reader))
Exemple #4
0
def get_full_mt(
        split: bool = True,
        key_by_locus_and_alleles: bool = False,
        remove_hard_filtered_samples: bool = True,
        release_only: bool = False
) -> hl.MatrixTable:
    """Read the full MatrixTable and apply the requested keying/filtering.

    :param split: Forwarded to ``get_full_mt_path`` to select which
        MatrixTable path is read.
    :param key_by_locus_and_alleles: If True, re-key rows by
        ``['locus', 'alleles']``. ``is_sorted=True`` is passed, so no sort
        is requested for the re-keying.
    :param remove_hard_filtered_samples: If True, keep only columns whose
        key has no match in the table at ``hard_filtered_samples_ht_path``.
    :param release_only: Accepted for interface compatibility but NOT
        implemented: no release filtering is applied. Passing True emits a
        warning and otherwise has no effect.
    :return: The (optionally re-keyed and column-filtered) MatrixTable.
    """
    if release_only:
        # The flag used to be ignored silently, which could let callers
        # believe they were working on release samples only. Surface it
        # without changing the data that is returned.
        import warnings

        warnings.warn(
            "get_full_mt: release_only=True is not implemented and has no "
            "effect; the returned MatrixTable is NOT restricted to release "
            "samples.",
            stacklevel=2,
        )

    mt = hl.read_matrix_table(get_full_mt_path(split))
    if key_by_locus_and_alleles:
        mt = hl.MatrixTable(hl.ir.MatrixKeyRowsBy(mt._mir, ['locus', 'alleles'], is_sorted=True))

    if remove_hard_filtered_samples:
        hard_filtered_ht = hl.read_table(hard_filtered_samples_ht_path)
        # A missing lookup means the sample is not in the hard-filtered table.
        mt = mt.filter_cols(hl.is_missing(hard_filtered_ht[mt.col_key]))

    return mt
Exemple #5
0
def get_gnomad_v3_mt(
    split=False,
    key_by_locus_and_alleles: bool = False,
    remove_hard_filtered_samples: bool = True,
    release_only: bool = False,
    samples_meta: bool = False,
) -> hl.MatrixTable:
    """
    Load the gnomAD v3 genotype MT with optional filtering, metadata and splitting.

    :param split: If set, split multi-allelic sites here (the MT is split on the fly, not read pre-split)
    :param key_by_locus_and_alleles: If True, re-key the rows by locus and alleles (only needed for v3)
    :param remove_hard_filtered_samples: If True, drop samples present in the hard-filtered samples table
    :param release_only: If True, keep only samples flagged for release in the sample metadata
    :param samples_meta: If True, add the sample metadata to the MT as the 'meta' column field
    :return: gnomAD v3 MT with the requested annotations and filters applied
    """
    mt = gnomad_v3_genotypes.mt()

    if key_by_locus_and_alleles:
        # is_sorted=True avoids a sort on the genotype MT, which is already
        # sorted by a unique locus.
        rekeyed_ir = hl.ir.MatrixKeyRowsBy(
            mt._mir, ["locus", "alleles"], is_sorted=True
        )
        mt = hl.MatrixTable(rekeyed_ir)

    if remove_hard_filtered_samples:
        hard_filtered_row = hard_filtered_samples.ht()[mt.col_key]
        mt = mt.filter_cols(hl.is_missing(hard_filtered_row))

    if samples_meta:
        mt = mt.annotate_cols(meta=meta.ht()[mt.col_key])

    if release_only:
        # Use the annotation when it was just added, else join on the fly.
        if samples_meta:
            is_release = mt.meta.release
        else:
            is_release = meta.ht()[mt.col_key].release
        mt = mt.filter_cols(is_release)

    if split:
        # Record pre-split site properties before the alleles are rewritten.
        has_indel = hl.any(
            lambda a: hl.is_indel(mt.alleles[0], a), mt.alleles[1:]
        )
        has_snp = hl.any(
            lambda a: hl.is_snp(mt.alleles[0], a), mt.alleles[1:]
        )
        mt = mt.annotate_rows(
            n_unsplit_alleles=hl.len(mt.alleles),
            mixed_site=(hl.len(mt.alleles) > 2) & has_indel & has_snp,
        )
        mt = hl.experimental.sparse_split_multi(mt, filter_changed_loci=True)

    return mt
Exemple #6
0
def range_matrix_table(n_rows,
                       n_cols,
                       n_partitions=None) -> 'hail.MatrixTable':
    """Build a matrix table that has only row and column indices.

    Examples
    --------
    .. doctest::

        >>> range_ds = hl.utils.range_matrix_table(n_rows=100, n_cols=10)

        >>> range_ds.count_rows()
        100

        >>> range_ds.count_cols()
        10

    Notes
    -----
    The returned matrix table has these fields:

     - `row_idx` (:py:data:`.tint32`) - Row index (row key).
     - `col_idx` (:py:data:`.tint32`) - Column index (column key).

    There are no entry fields.

    Intended for tests and learning; it is not tuned for production
    performance.

    Parameters
    ----------
    n_rows : :obj:`int`
        Number of rows.
    n_cols : :obj:`int`
        Number of columns.
    n_partitions : int, optional
        Number of partitions (uses Spark default parallelism if None).

    Returns
    -------
    :class:`.MatrixTable`
    """
    # Delegate to the backend's MatrixTable.range; the optional partition
    # count is wrapped with joption before being handed over.
    backend_range = Env.hail().variant.MatrixTable.range
    jhc = Env.hc()._jhc
    jmt = backend_range(jhc, n_rows, n_cols, joption(n_partitions))
    return hail.MatrixTable(jmt)
Exemple #7
0
def get_gnomad_v3_mt(key_by_locus_and_alleles: bool = False,
                     remove_hard_filtered_samples: bool = True,
                     release_only: bool = False,
                     samples_meta: bool = False) -> hl.MatrixTable:
    """
    Load the gnomAD v3 genotype MT with optional keying, filtering and metadata.

    :param key_by_locus_and_alleles: If True, re-key the rows by locus and alleles (is_sorted=True is passed)
    :param remove_hard_filtered_samples: If True, drop samples present in the hard-filtered samples table
    :param release_only: If True, keep only samples whose metadata 'release' field is true
    :param samples_meta: If True, add the sample metadata to the MT as the 'meta' column field
    :return: gnomAD v3 MT with the requested annotations and filters applied
    """
    mt = gnomad_v3_genotypes.mt()

    if key_by_locus_and_alleles:
        rekeyed_ir = hl.ir.MatrixKeyRowsBy(mt._mir, ['locus', 'alleles'],
                                           is_sorted=True)
        mt = hl.MatrixTable(rekeyed_ir)

    if remove_hard_filtered_samples:
        hard_filtered_row = hard_filtered_samples.ht()[mt.col_key]
        mt = mt.filter_cols(hl.is_missing(hard_filtered_row))

    if samples_meta:
        mt = mt.annotate_cols(meta=meta.ht()[mt.col_key])

    if release_only:
        # Use the annotation when it was just added, else join on the fly.
        if samples_meta:
            is_release = mt.meta.release
        else:
            is_release = meta.ht()[mt.col_key].release
        mt = mt.filter_cols(is_release)

    return mt
Exemple #8
0
def main(args):
    """Select, per variant, up to ``args.num_samples`` samples for each of
    the het, hom-var and hemi genotype classes and write them to a table.

    Attributes read from ``args``:

    - ``sample_metadata_ht``: path of the sample metadata Table.
    - ``gnomad_mt``: path handed to ``MatrixTableResource``.
    - ``test``: if truthy, restrict to a small chrX interval.
    - ``num_samples``: maximum number of samples taken per class and site.
    - ``output_ht_path`` / ``overwrite``: destination of the result Table.
    """

    hl.init(log="/select_samples", default_reference="GRCh38")

    # Keep only release samples that have a cram path; everything joined
    # from this table below therefore already satisfies both conditions.
    meta_ht = hl.read_table(args.sample_metadata_ht)
    meta_ht = meta_ht.filter(meta_ht.release
                             & hl.is_defined(meta_ht.project_meta.cram_path))
    meta_ht = meta_ht.select(
        cram_path=meta_ht.project_meta.cram_path,
        crai_path=meta_ht.project_meta.cram_path.replace(
            ".cram", ".cram.crai"),
        sex=meta_ht.project_meta.sex,
    )

    mt = MatrixTableResource(args.gnomad_mt).mt()
    mt = hl.MatrixTable(
        hl.ir.MatrixKeyRowsBy(mt._mir, ['locus', 'alleles'], is_sorted=True))

    if args.test:
        logger.info("Filtering to chrX PAR1 boundary: chrX:2781477-2781900")
        mt = hl.filter_intervals(
            mt, [hl.parse_locus_interval("chrX:2781477-2781900")])

    meta_join = meta_ht[mt.s]
    mt = mt.annotate_cols(meta=hl.struct(
        sex=meta_join.sex,
        cram=meta_join.cram_path,
        crai=meta_join.crai_path,
    ))
    logger.info("Filtering to releasable samples with a defined cram path")
    # The `meta` struct built above has no `release` field, so referencing
    # `mt.meta.release` fails. Because meta_ht was already restricted to
    # release samples with a cram path, a defined `cram` after the join is
    # exactly the intended condition.
    mt = mt.filter_cols(hl.is_defined(mt.meta.cram))
    mt = hl.experimental.sparse_split_multi(mt, filter_changed_loci=True)

    logger.info("Adjusting samples' sex ploidy")
    mt = mt.annotate_entries(GT=adjusted_sex_ploidy_expr(
        mt.locus,
        mt.GT,
        mt.meta.sex,
        xy_karyotype_str="male",
        xx_karyotype_str="female",
    ))
    mt = mt.select_entries("GT", "GQ", "DP", "AD")

    logger.info(
        "Filtering to entries meeting GQ, DP and other 'adj' thresholds")
    mt = filter_to_adj(mt)
    mt = mt.filter_rows(hl.agg.any(mt.GT.is_non_ref()))
    mt = mt.filter_rows(hl.len(mt.alleles) > 1)

    logger.info(
        f"Taking up to {args.num_samples} samples per site where samples are het, hom_var, or hemi"
    )

    def sample_ordering_expr(mt):
        """Return the ordering used to pick samples: highest GQ first, then
        a random number as tie-breaker.

        It can be problematic for downstream steps when a few samples have
        many times more variants selected than others. When many samples
        share an identically high GQ (as often happens for common variants),
        the random secondary key keeps the same few samples from being
        selected repeatedly for all common variants.
        """

        return -mt.GQ, hl.rand_unif(0, 1, seed=1)

    def take_samples(genotype_filter_expr):
        """Take up to ``args.num_samples`` entries matching the filter."""
        return hl.agg.filter(
            genotype_filter_expr,
            hl.agg.take(het_hom_hemi_take_expr(mt),
                        args.num_samples,
                        ordering=sample_ordering_expr(mt)),
        )

    mt = mt.annotate_rows(
        samples_w_het_var=take_samples(het_expr(mt)),
        samples_w_hom_var=take_samples(hom_expr(mt)),
        samples_w_hemi_var=take_samples(hemi_expr(mt)),
    )

    ht = mt.rows()
    ht = ht.select(ht.samples_w_het_var, ht.samples_w_hom_var,
                   ht.samples_w_hemi_var)
    ht.write(args.output_ht_path, overwrite=args.overwrite)