def vcf_to_mt(path, genome_version):
    """
    Load a 1kg VCF as a matrix table with multi-allelic sites split.

    The 1kg dataset has multi-allelic variants and duplicates. Rows with more
    than two alleles are split on their own, rows with exactly two alleles are
    annotated by hand to match the split schema, and the two sets are unioned.

    :param path: vcf path
    :param genome_version: genome version, forwarded to ``import_vcf``
    :return: the unioned matrix table, keyed by locus and alleles, with rows
        whose first alternate allele is of type 'Symbolic' removed
    """
    # Multi-allelic sites are deliberately left unsplit at import time.
    raw_mt = import_vcf(
        path,
        genome_version=genome_version,
        min_partitions=1000,
        split_multi_alleles=False,
    )
    allele_count = hl.len(raw_mt.alleles)

    split_mt = hl.split_multi_hts(raw_mt.filter_rows(allele_count > 2))

    # Calling split_multi_hts on the biallelic rows causes problems, so the
    # row fields it would add are set manually to conform to split_mt.
    simple_mt = raw_mt.filter_rows(allele_count == 2).annotate_rows(
        a_index=1, was_split=False)

    combined = simple_mt.union_rows(split_mt)
    combined = combined.key_rows_by(combined.locus, combined.alleles)

    # 37 is known to have some unneeded symbolic alleles, so we filter out.
    is_symbolic = hl.allele_type(combined.alleles[0],
                                 combined.alleles[1]) == 'Symbolic'
    return combined.filter_rows(is_symbolic, keep=False)
Beispiel #2
0
 def fix_alleles(alleles):
     """Rewrite every alt against one shared reference allele.

     ``alleles`` is a collection expression whose elements expose ``ref`` and
     ``alt``. The shared reference is the longest ``ref`` found (on equal
     lengths the later element wins). Returns an array expression holding that
     reference followed by one adjusted alt per input element.
     """
     def keep_longer(s, t):
         # Strict comparison: on a tie the newer value `t` is kept.
         return hl.cond(hl.len(s) > hl.len(t), s, t)

     ref = alleles.map(lambda d: d.ref).fold(keep_longer, '')

     def extend_alt(a):
         # Suffix of the shared reference that lies past this element's ref.
         ref_tail = ref[hl.len(a.ref):]
         kind = hl.allele_type(a.ref, a.alt)
         return (hl.switch(kind)
                 .when('SNP', a.alt + ref[hl.len(a.alt):])
                 .when('Insertion', a.alt + ref_tail)
                 .when('Deletion', a.alt + ref_tail)
                 .default(a.alt))

     alts = alleles.map(extend_alt)
     return hl.array([ref]).extend(alts)
Beispiel #3
0
def summarize_variants(mt: MatrixTable, show=True):
    """Summarize the variants present in a dataset and print the results.

    Examples
    --------
    >>> hl.summarize_variants(dataset)  # doctest: +SKIP
    ==============================
    Number of variants: 346
    ==============================
    Alleles per variant
    -------------------
      2 alleles: 346 variants
    ==============================
    Variants per contig
    -------------------
      20: 346 variants
    ==============================
    Allele type distribution
    ------------------------
            SNP: 301 alternate alleles
       Deletion: 27 alternate alleles
      Insertion: 18 alternate alleles
    ==============================

    Parameters
    ----------
    mt : :class:`.MatrixTable`
        Matrix table with a variant (locus / alleles) row key.
    show : :obj:`bool`
        If ``True``, print results instead of returning them.

    Notes
    -----
    The result returned if `show` is ``False`` is a  :class:`.Struct` with
    four fields:

    - `n_variants` (:obj:`int`): Number of variants present in the matrix table.
    - `allele_types` (:obj:`Dict[str, int]`): Number of alternate alleles in
      each allele category.
    - `contigs` (:obj:`Dict[str, int]`): Number of variants on each contig.
    - `allele_counts` (:obj:`Dict[int, int]`): Number of variants broken down
      by number of alleles (biallelic is 2, for example).

    When `show` is ``True`` and the matrix table has no rows, the section
    headers are still printed with empty bodies.

    Returns
    -------
    :obj:`None` or :class:`.Struct`
        Returns ``None`` if `show` is ``True``, or returns results as a struct.
    """
    require_row_key_variant(mt, 'summarize_variants')

    # One allele type per alternate allele (index 0 is the reference).
    alleles_per_variant = hl.range(1, hl.len(mt.alleles)).map(
        lambda i: hl.allele_type(mt.alleles[0], mt.alleles[i]))
    allele_types, contigs, allele_counts, n_variants = mt.aggregate_rows(
        (hl.agg.explode(lambda elt: hl.agg.counter(elt), alleles_per_variant),
         hl.agg.counter(mt.locus.contig),
         hl.agg.counter(hl.len(mt.alleles)),
         hl.agg.count()))

    if not show:
        return hl.Struct(allele_types=allele_types,
                         contigs=contigs,
                         allele_counts=allele_counts,
                         n_variants=n_variants)

    # Contigs are listed in reference-genome order, not alphabetically.
    rg = mt.locus.dtype.reference_genome
    contig_idx = {contig: i for i, contig in enumerate(rg.contigs)}

    # `default=0` keeps an empty dataset from raising ValueError in max();
    # the formatters are never applied in that case because the loops below
    # have nothing to iterate over.
    max_contig_len = max((len(contig) for contig in contigs), default=0)
    contig_formatter = f'%{max_contig_len}s'

    max_allele_count_len = max((len(str(x)) for x in allele_counts), default=0)
    allele_count_formatter = f'%{max_allele_count_len}s'

    max_allele_type_len = max((len(x) for x in allele_types), default=0)
    allele_type_formatter = f'%{max_allele_type_len}s'

    line_break = '=============================='

    print(line_break)
    print(f'Number of variants: {n_variants}')
    print(line_break)
    print('Alleles per variant')
    print('-------------------')
    for n_alleles, count in sorted(allele_counts.items(), key=lambda x: x[0]):
        print(f'  {allele_count_formatter % n_alleles} alleles: {count} variants')
    print(line_break)
    print('Variants per contig')
    print('-------------------')
    for contig, count in sorted(contigs.items(), key=lambda x: contig_idx[x[0]]):
        print(f'  {contig_formatter % contig}: {count} variants')
    print(line_break)
    print('Allele type distribution')
    print('------------------------')
    # Most frequent allele type first.
    for allele_type, count in Counter(allele_types).most_common():
        print(f'  {allele_type_formatter % allele_type}: {count} alternate alleles')
    print(line_break)
    return None
Beispiel #4
0
 def explode_result(alleles):
     """Build the per-allele aggregators for a (ref, alt) pair.

     Returns a 3-tuple: a counter over the allele type, the number of
     transitions, and the number of transversions.
     """
     ref, alt = alleles
     type_counter = hl.agg.counter(hl.allele_type(ref, alt))
     n_transitions = hl.agg.count_where(hl.is_transition(ref, alt))
     n_transversions = hl.agg.count_where(hl.is_transversion(ref, alt))
     return type_counter, n_transitions, n_transversions
Beispiel #5
0
def summarize_variants(mt: Union[MatrixTable, 'Table'],  # forward ref: Table may not be imported here
                       show=True,
                       *,
                       handler=None):
    """Summarize the variants present in a dataset and print the results.

    Examples
    --------
    >>> hl.summarize_variants(dataset)  # doctest: +SKIP
    ==============================
    Number of variants: 346
    ==============================
    Alleles per variant
    -------------------
      2 alleles: 346 variants
    ==============================
    Variants per contig
    -------------------
      20: 346 variants
    ==============================
    Allele type distribution
    ------------------------
            SNP: 301 alleles
       Deletion: 27 alleles
      Insertion: 18 alleles
    ==============================

    Parameters
    ----------
    mt : :class:`.MatrixTable` or :class:`.Table`
        Matrix table with a variant (locus / alleles) row key, or a table
        with the same key. For a matrix table only its rows are summarized.
    show : :obj:`bool`
        If ``True``, pass the summary to `handler` instead of returning
        results.
    handler
        Callable invoked with the summary object when `show` is ``True``.
        If ``None``, the result of ``hl.utils.default_handler()`` is used.
        Ignored when `show` is ``False``.

    Notes
    -----
    The result returned if `show` is ``False`` is a  :class:`.Struct` with
    four fields:

    - `n_variants` (:obj:`int`): Number of variants present in the dataset.
    - `allele_types` (:obj:`Dict[str, int]`): Number of alternate alleles in
      each allele category.
    - `contigs` (:obj:`Dict[str, int]`): Number of variants on each contig.
    - `allele_counts` (:obj:`Dict[int, int]`): Number of variants broken down
      by number of alleles (biallelic is 2, for example).

    Returns
    -------
    :obj:`None` or :class:`.Struct`
        Returns ``None`` if `show` is ``True``, or returns results as a struct.
    """
    require_row_key_variant(mt, 'summarize_variants')

    # Anything that is not a MatrixTable is aggregated over directly.
    ht = mt.rows() if isinstance(mt, MatrixTable) else mt

    # One allele type per alternate allele (index 0 is the reference).
    alleles_per_variant = hl.range(1, hl.len(ht.alleles)).map(
        lambda i: hl.allele_type(ht.alleles[0], ht.alleles[i]))
    allele_types, contigs, allele_counts, n_variants = ht.aggregate(
        (hl.agg.explode(lambda elt: hl.agg.counter(elt), alleles_per_variant),
         hl.agg.counter(ht.locus.contig),
         hl.agg.counter(hl.len(ht.alleles)),
         hl.agg.count()))

    if not show:
        return hl.Struct(allele_types=allele_types,
                         contigs=contigs,
                         allele_counts=allele_counts,
                         n_variants=n_variants)

    rg = ht.locus.dtype.reference_genome
    summary = _VariantSummary(rg, n_variants, allele_counts, contigs,
                              allele_types)
    if handler is None:
        handler = hl.utils.default_handler()
    handler(summary)
    return None