Beispiel #1
0
import hail.expr.aggregators as agg
from hail.expr import (expr_float64, expr_call, expr_array, analyze,
                       matrix_table_source)
from hail.expr.types import tarray
from hail import ir
from hail.linalg import BlockMatrix
from hail.table import Table
from hail.typecheck import typecheck, nullable, numeric, enumeration

from ..pca import hwe_normalized_pca


@typecheck(call_expr=expr_call,
           min_individual_maf=numeric,
           k=nullable(int),
           scores_expr=nullable(expr_array(expr_float64)),
           min_kinship=nullable(numeric),
           statistics=enumeration('kin', 'kin2', 'kin20', 'all'),
           block_size=nullable(int),
           include_self_kinship=bool)
def pc_relate(call_expr,
              min_individual_maf,
              *,
              k=None,
              scores_expr=None,
              min_kinship=None,
              statistics="all",
              block_size=None,
              include_self_kinship=False) -> Table:
    r"""Compute relatedness estimates between individuals using a variant of the
    PC-Relate method.
Beispiel #2
0
        if s_ != s:
            mapping.append((s, s_))
        uniques.add(s_)
        new_ids.append(s_)

    if mapping:
        info(f'Renamed {len(mapping)} duplicate {plural("sample ID", len(mapping))}. Mangled IDs as follows:'
             + ''.join(f'\n  "{pre}" => "{post}"' for pre, post in mapping))
    else:
        info('No duplicate sample IDs found.')
    return dataset.annotate_cols(**{name: hl.literal(new_ids)[hl.int(hl.scan.count())]})


@typecheck(ds=oneof(Table, MatrixTable),
           intervals=expr_array(expr_interval(expr_any)),
           keep=bool)
def filter_intervals(ds, intervals, keep=True) -> Union[Table, MatrixTable]:
    """Filter rows with a list of intervals.

    Examples
    --------

    Filter to loci falling within one interval:

    >>> ds_result = hl.filter_intervals(dataset, [hl.parse_locus_interval('17:38449840-38530994')])

    Remove all loci within list of intervals:

    >>> intervals = [hl.parse_locus_interval(x) for x in ['1:50M-75M', '2:START-400000', '3-22']]
    >>> ds_result = hl.filter_intervals(dataset, intervals, keep=False)
Beispiel #3
0
    ids = dataset.col_key[0].collect()

    mapping, new_ids = deduplicate(ids)

    if mapping:
        info(
            f'Renamed {len(mapping)} duplicate {plural("sample ID", len(mapping))}. Mangled IDs as follows:'
            + ''.join(f'\n  "{pre}" => "{post}"' for pre, post in mapping))
    else:
        info('No duplicate sample IDs found.')
    return dataset.annotate_cols(
        **{name: hl.literal(new_ids)[hl.int(hl.scan.count())]})


@typecheck(ds=oneof(Table, MatrixTable),
           intervals=expr_array(expr_interval(expr_any)),
           keep=bool)
def filter_intervals(ds, intervals, keep=True) -> Union[Table, MatrixTable]:
    """Filter rows with a list of intervals.

    Examples
    --------

    Filter to loci falling within one interval:

    >>> ds_result = hl.filter_intervals(dataset, [hl.parse_locus_interval('17:38449840-38530994')])

    Remove all loci within list of intervals:

    >>> intervals = [hl.parse_locus_interval(x) for x in ['1:50M-75M', '2:START-400000', '3-22']]
    >>> ds_result = hl.filter_intervals(dataset, intervals, keep=False)
Beispiel #4
0
        Variants to filter on.
    keep: :obj:`bool`
        Whether to keep (default), or filter out the variants from `variants_table`.

    Returns
    -------
    :class:`.VariantDataset`.
    """
    if keep:
        variant_data = vds.variant_data.semi_join_rows(variants_table)
    else:
        variant_data = vds.variant_data.anti_join_rows(variants_table)
    return VariantDataset(vds.reference_data, variant_data)


@typecheck(vds=VariantDataset, intervals=expr_array(expr_interval(expr_any)), keep=bool)
def filter_intervals(vds: 'VariantDataset', intervals: 'ArrayExpression', *, keep: bool = False) -> 'VariantDataset':
    """Filter intervals in a :class:`.VariantDataset` (only on variant data for
    now).

    Parameters
    ----------
    vds : :class:`.VariantDataset`
        Dataset in VariantDataset representation.
    intervals : :class:`.ArrayExpression` of type :class:`.tinterval`
        Intervals to filter on.
    keep : :obj:`bool`
        Whether to keep, or filter out (default) rows that fall within any
        interval in `intervals`.

    Returns
Beispiel #5
0
        vd = vd.annotate_entries(LA=vd.LA.map(lambda la: new_la_index(la)))
        vd = vd.key_rows_by('locus')
        vd = vd.annotate_rows(
            alleles=vd.__kept_indices.keys().map(lambda i: vd.alleles[i]))
        vd = vd._key_rows_by_assert_sorted('locus', 'alleles')
        vd = vd.drop('__allele_counts', '__kept_indices', '__old_to_new_LA')
        return VariantDataset(reference_data, vd)

    variant_data = variant_data.filter_rows(hl.agg.count() > 0)
    return VariantDataset(reference_data, variant_data)


@typecheck(vds=VariantDataset,
           calling_intervals=oneof(Table,
                                   expr_array(expr_interval(expr_locus()))),
           normalization_contig=str)
def impute_sex_chromosome_ploidy(vds: VariantDataset, calling_intervals,
                                 normalization_contig: str) -> hl.Table:
    """Impute sex chromosome ploidy from depth of reference data within calling intervals.

    Returns a :class:`.Table` with sample ID keys, with the following fields:

     -  ``autosomal_mean_dp`` (*float64*): Mean depth on calling intervals on normalization contig.
     -  ``x_mean_dp`` (*float64*): Mean depth on calling intervals on X chromosome.
     -  ``x_ploidy`` (*float64*): Estimated ploidy on X chromosome. Equal to ``2 * x_mean_dp / autosomal_mean_dp``.
     -  ``y_mean_dp`` (*float64*): Mean depth on calling intervals on  chromosome.
     -  ``y_ploidy`` (*float64*): Estimated ploidy on Y chromosome. Equal to ``2 * y_mean_db / autosomal_mean_dp``.

    Parameters
    ----------