def vcf_to_mt(path, genome_version):
    """
    Convert a 1kg VCF into a MatrixTable.

    The 1kg dataset has multi-allelic variants and duplicates. Multi-allelic
    rows are filtered out and split on their own, and the result is then
    unioned with the bi-allelic rows.

    :param path: vcf path
    :param genome_version: genome version
    :return: matrix table re-keyed by locus and alleles, with rows whose
        first alternate allele is typed 'Symbolic' removed
    """
    # Multi-allelic sites are deliberately left unsplit at import time.
    unsplit_mt = import_vcf(
        path,
        genome_version=genome_version,
        min_partitions=1000,
        split_multi_alleles=False,
    )
    n_alleles = hl.len(unsplit_mt.alleles)

    split_mt = hl.split_multi_hts(unsplit_mt.filter_rows(n_alleles > 2))

    # The bi-allelic rows get the split-related fields by hand so that their
    # schema conforms to the split table; running split_multi_hts on the
    # bi-allelic rows to obtain those fields causes problems.
    two_allele_mt = unsplit_mt.filter_rows(n_alleles == 2).annotate_rows(
        a_index=1, was_split=False)

    merged_mt = two_allele_mt.union_rows(split_mt)
    merged_mt = merged_mt.key_rows_by(merged_mt.locus, merged_mt.alleles)

    # 37 is known to have some unneeded symbolic alleles, so we filter out.
    is_symbolic = hl.allele_type(
        merged_mt.alleles[0], merged_mt.alleles[1]) == 'Symbolic'
    return merged_mt.filter_rows(is_symbolic, keep=False)
def fix_alleles(alleles):
    """Re-express every alternate allele against one shared reference allele.

    The shared reference is obtained by folding over the ``ref`` field of
    each element of ``alleles`` and keeping the longer string at each step.
    Each ``alt`` is then extended with a suffix of that shared reference:

    - 'SNP': the suffix starts at the length of the alt.
    - 'Insertion' / 'Deletion': the suffix starts at the length of the
      element's own ref.
    - any other allele type: the alt is used unchanged.

    :param alleles: array expression whose elements have ``ref`` and ``alt``
        fields
    :return: array expression holding the shared reference followed by the
        rewritten alternates
    """
    def _longer(s, t):
        # Strict comparison: `s` is kept only when it is strictly longer.
        return hl.cond(hl.len(s) > hl.len(t), s, t)

    ref = alleles.map(lambda d: d.ref).fold(_longer, '')

    def _extend_alt(a):
        tail_after_own_ref = ref[hl.len(a.ref):]
        return (hl.switch(hl.allele_type(a.ref, a.alt))
                .when('SNP', a.alt + ref[hl.len(a.alt):])
                .when('Insertion', a.alt + tail_after_own_ref)
                .when('Deletion', a.alt + tail_after_own_ref)
                .default(a.alt))

    alts = alleles.map(_extend_alt)
    return hl.array([ref]).extend(alts)
def summarize_variants(mt: MatrixTable, show=True):
    """Summarize the variants present in a dataset and print the results.

    Examples
    --------
    >>> hl.summarize_variants(dataset)  # doctest: +SKIP
    ==============================
    Number of variants: 346
    ==============================
    Alleles per variant
    -------------------
     2 alleles: 346 variants
    ==============================
    Variants per contig
    -------------------
     20: 346 variants
    ==============================
    Allele type distribution
    ------------------------
           SNP: 301 alternate alleles
      Deletion: 27 alternate alleles
     Insertion: 18 alternate alleles
    ==============================

    Parameters
    ----------
    mt : :class:`.MatrixTable`
        Matrix table with a variant (locus / alleles) row key.
    show : :obj:`bool`
        If ``True``, print results instead of returning them.

    Notes
    -----
    The result returned if `show` is ``False`` is a :class:`.Struct` with
    four fields:

    - `n_variants` (:obj:`int`): Number of variants present in the matrix
      table.
    - `allele_types` (:obj:`Dict[str, int]`): Number of alternate alleles in
      each allele category.
    - `contigs` (:obj:`Dict[str, int]`): Number of variants on each contig.
    - `allele_counts` (:obj:`Dict[int, int]`): Number of variants broken down
      by number of alleles (biallelic is 2, for example).

    A dataset with no rows is summarized as zero variants with empty
    sections.

    Returns
    -------
    :obj:`None` or :class:`.Struct`
        Returns ``None`` if `show` is ``True``, or returns results as a
        struct.
    """
    require_row_key_variant(mt, 'summarize_variants')

    # One allele-type string per alternate allele (index 0 is the reference).
    alleles_per_variant = hl.range(1, hl.len(mt.alleles)).map(
        lambda i: hl.allele_type(mt.alleles[0], mt.alleles[i]))

    allele_types, contigs, allele_counts, n_variants = mt.aggregate_rows(
        (hl.agg.explode(lambda elt: hl.agg.counter(elt), alleles_per_variant),
         hl.agg.counter(mt.locus.contig),
         hl.agg.counter(hl.len(mt.alleles)),
         hl.agg.count()))

    if not show:
        return hl.Struct(allele_types=allele_types,
                         contigs=contigs,
                         allele_counts=allele_counts,
                         n_variants=n_variants)

    # Contigs are listed in reference-genome order, not alphabetically.
    rg = mt.locus.dtype.reference_genome
    contig_idx = {contig: i for i, contig in enumerate(rg.contigs)}

    # Column widths for right-aligned labels. `default=0` keeps an empty
    # dataset (empty dicts) from raising ValueError in max().
    max_contig_len = max((len(contig) for contig in contigs), default=0)
    contig_formatter = f'%{max_contig_len}s'
    max_allele_count_len = max((len(str(x)) for x in allele_counts), default=0)
    allele_count_formatter = f'%{max_allele_count_len}s'
    max_allele_type_len = max((len(x) for x in allele_types), default=0)
    allele_type_formatter = f'%{max_allele_type_len}s'

    line_break = '=============================='

    print(line_break)
    print(f'Number of variants: {n_variants}')
    print(line_break)
    print('Alleles per variant')
    print('-------------------')
    for n_alleles, count in sorted(allele_counts.items(), key=lambda x: x[0]):
        print(f' {allele_count_formatter % n_alleles} alleles: {count} variants')
    print(line_break)
    print('Variants per contig')
    print('-------------------')
    for contig, count in sorted(contigs.items(), key=lambda x: contig_idx[x[0]]):
        print(f' {contig_formatter % contig}: {count} variants')
    print(line_break)
    print('Allele type distribution')
    print('------------------------')
    # Most frequent allele type first.
    for allele_type, count in Counter(allele_types).most_common():
        print(f' {allele_type_formatter % allele_type}: {count} alternate alleles')
    print(line_break)
def explode_result(alleles):
    """Build the aggregators for one (ref, alt) allele pair.

    :param alleles: a two-element value that unpacks into the reference and
        the alternate allele
    :return: a 3-tuple of aggregations: a counter keyed by allele type, the
        count of pairs for which ``hl.is_transition`` holds, and the count of
        pairs for which ``hl.is_transversion`` holds
    """
    ref, alt = alleles

    type_counter = hl.agg.counter(hl.allele_type(ref, alt))
    n_transition = hl.agg.count_where(hl.is_transition(ref, alt))
    n_transversion = hl.agg.count_where(hl.is_transversion(ref, alt))

    return type_counter, n_transition, n_transversion
def summarize_variants(mt: Union[MatrixTable, MatrixTable], show=True, *, handler=None):
    """Summarize the variants present in a dataset and print the results.

    Examples
    --------
    >>> hl.summarize_variants(dataset)  # doctest: +SKIP
    ==============================
    Number of variants: 346
    ==============================
    Alleles per variant
    -------------------
      2 alleles: 346 variants
    ==============================
    Variants per contig
    -------------------
      20: 346 variants
    ==============================
    Allele type distribution
    ------------------------
            SNP: 301 alleles
       Deletion: 27 alleles
      Insertion: 18 alleles
    ==============================

    Parameters
    ----------
    mt : :class:`.MatrixTable` or :class:`.Table`
        Matrix table with a variant (locus / alleles) row key.
    show : :obj:`bool`
        If ``True``, hand a summary object to `handler` instead of returning
        the results.
    handler
        Callable invoked with the summary object when `show` is ``True``.
        When ``None``, ``hl.utils.default_handler()`` is used.

    Notes
    -----
    The result returned if `show` is ``False`` is a :class:`.Struct` with
    four fields:

    - `n_variants` (:obj:`int`): Number of variants present in the matrix
      table.
    - `allele_types` (:obj:`Dict[str, int]`): Number of alternate alleles in
      each allele category.
    - `contigs` (:obj:`Dict[str, int]`): Number of variants on each contig.
    - `allele_counts` (:obj:`Dict[int, int]`): Number of variants broken down
      by number of alleles (biallelic is 2, for example).

    Returns
    -------
    :obj:`None` or :class:`.Struct`
        Returns ``None`` if `show` is ``True``, or returns results as a
        struct.
    """
    require_row_key_variant(mt, 'summarize_variants')

    # NOTE(review): the annotation lists MatrixTable twice; the second member
    # was presumably meant to be Table, since any non-MatrixTable input is
    # used directly as the row table below. Confirm and fix the signature.
    ht = mt.rows() if isinstance(mt, MatrixTable) else mt

    # One allele-type string per alternate allele (index 0 is the reference).
    alt_indices = hl.range(1, hl.len(ht.alleles))
    alleles_per_variant = alt_indices.map(
        lambda i: hl.allele_type(ht.alleles[0], ht.alleles[i]))

    aggregations = (
        hl.agg.explode(lambda elt: hl.agg.counter(elt), alleles_per_variant),
        hl.agg.counter(ht.locus.contig),
        hl.agg.counter(hl.len(ht.alleles)),
        hl.agg.count(),
    )
    allele_types, contigs, allele_counts, n_variants = ht.aggregate(aggregations)

    rg = ht.locus.dtype.reference_genome

    if not show:
        return hl.Struct(allele_types=allele_types,
                         contigs=contigs,
                         allele_counts=allele_counts,
                         n_variants=n_variants)

    summary = _VariantSummary(rg, n_variants, allele_counts, contigs, allele_types)
    if handler is None:
        handler = hl.utils.default_handler()
    handler(summary)