def generate_allele_data(mt: hl.MatrixTable) -> hl.Table:
    """
    Returns bi-allelic sites HT with the following annotations:
     - allele_data (nonsplit_alleles, has_star, variant_type, n_alt_alleles,
       allele_type and was_mixed)

    Sites that carry no alternate allele are dropped, since an allele type
    cannot be derived for them.

    :param MatrixTable mt: Full unsplit MT
    :return: Table with allele data annotations
    :rtype: Table
    """
    # Only the row key is needed; drop every other row field.
    ht = mt.rows().select()

    # Capture site-level information before splitting, while the full
    # (possibly multi-allelic) allele list is still available.
    allele_data = hl.struct(
        nonsplit_alleles=ht.alleles,
        has_star=hl.any(lambda a: a == '*', ht.alleles),
    )
    ht = ht.annotate(
        allele_data=allele_data.annotate(**add_variant_type(ht.alleles))
    )

    ht = hl.split_multi_hts(ht)

    # Rows with a single allele have no alleles[1]; remove them so the
    # allele-type lookup below cannot index past the end of the array.
    ht = ht.filter(hl.len(ht.alleles) > 1)

    ref = ht.alleles[0]
    alt = ht.alleles[1]
    allele_type = (
        hl.case()
        .when(hl.is_snp(ref, alt), 'snv')
        .when(hl.is_insertion(ref, alt), 'ins')
        .when(hl.is_deletion(ref, alt), 'del')
        .default('complex')
    )

    # was_mixed reflects the variant type of the original, unsplit site.
    ht = ht.annotate(
        allele_data=ht.allele_data.annotate(
            allele_type=allele_type,
            was_mixed=ht.allele_data.variant_type == 'mixed',
        )
    )
    return ht
def generate_allele_data(ht: hl.Table) -> hl.Table:
    """
    Build a bi-allelic sites HT carrying a single ``allele_data`` struct annotation.

    ``allele_data`` contains:
     - nonsplit_alleles: the allele list of the site before splitting
     - has_star: whether any allele of the unsplit site is ``*``
     - the fields produced by ``add_variant_type`` (variant_type and n_alt_alleles)
     - allele_type: one of ``snv``, ``ins``, ``del`` or ``complex`` for the split allele
     - was_mixed: whether the unsplit site's variant_type was ``mixed``

    :param Table ht: Full unsplit HT
    :return: Table with allele data annotations
    :rtype: Table
    """
    # Keep only the key fields of the input table.
    sites = ht.select()

    # Site-level information, computed on the unsplit allele list.
    unsplit_info = hl.struct(
        nonsplit_alleles=sites.alleles,
        has_star=hl.any(lambda allele: allele == "*", sites.alleles),
    )
    unsplit_info = unsplit_info.annotate(**add_variant_type(sites.alleles))
    sites = sites.annotate(allele_data=unsplit_info)

    sites = hl.split_multi_hts(sites)
    # Rows without an alternate allele cannot be classified below.
    sites = sites.filter(hl.len(sites.alleles) > 1)

    ref = sites.alleles[0]
    alt = sites.alleles[1]

    # First matching predicate wins; anything unmatched is "complex".
    classifier = hl.case()
    for predicate, label in (
        (hl.is_snp, "snv"),
        (hl.is_insertion, "ins"),
        (hl.is_deletion, "del"),
    ):
        classifier = classifier.when(predicate(ref, alt), label)
    allele_type = classifier.default("complex")

    split_info = sites.allele_data.annotate(
        allele_type=allele_type,
        was_mixed=sites.allele_data.variant_type == "mixed",
    )
    return sites.annotate(allele_data=split_info)
def create_cnn_rank_file() -> None:
    """
    Build the rank Table for the CNN scores and write it to the genomes CNN
    score-ranking path.

    The raw TSV of scores is imported into a temporary Hail Table the first
    time only (detected through the Table's ``_SUCCESS`` marker). Rows are then
    exploded to one per alternate allele, min-repped, keyed by locus/alleles,
    joined with the gnomAD genomes annotations and ranked.

    :return: Nothing
    :rtype: None
    """
    scores_tsv = 'gs://gnomad/variant_qc/temp/friedman_cnn_scores.tsv.bgz'
    scores_ht = 'gs://gnomad/variant_qc/temp/friedman_cnn_scores.ht'
    scores_ht_done = 'gs://gnomad/variant_qc/temp/friedman_cnn_scores.ht/_SUCCESS'

    logger.info("Creating CNN rank file.")

    # Import the TSV only when no completed Table is present yet.
    if not hl.utils.hadoop_exists(scores_ht_done):
        logger.info("Importing CNN scores")
        imported = hl.import_table(scores_tsv, min_partitions=1000, impute=True)
        imported.write(scores_ht, overwrite=True)

    logger.info('Formatting CNN scores...')
    ht = hl.read_table(scores_ht)

    # Turn the comma-separated Alt string into a list and flag multi-allelic
    # rows, then produce one row per alternate allele.
    ht = ht.annotate(
        alt_alleles=ht.Alt.split(','),
        was_split=ht.Alt.contains(','),
    )
    ht = ht.explode('alt_alleles')
    ht = ht.annotate(locus=hl.locus(hl.str(ht.Contig), ht.Pos))

    # Minrep
    min_repped = hl.min_rep(ht.locus, [ht.Ref, ht.alt_alleles])
    ht = ht.annotate(**min_repped)
    ht = ht.annotate(vartype=add_variant_type(ht.alleles))
    ht = ht.key_by('locus', 'alleles')

    # Select relevant annotations and add singleton / n_nonref, was_split
    # NOTE(review): singleton, n_nonref, ac and was_split are not selected
    # explicitly here, so they presumably come from the gnomAD annotations
    # row - confirm against get_gnomad_annotations.
    gnomad_ht = get_gnomad_annotations('genomes')
    ht = ht.select(
        **gnomad_ht[ht.key],
        variant_type=ht.vartype.variant_type,
        n_alt_alleles=ht.vartype.n_alt_alleles,
        score=ht.CNN_1D_Score,
    )
    ht = ht.filter(ht.n_nonref > 0)

    # Boolean building blocks shared by the sub-rankings.
    is_singleton = ht.singleton
    is_biallelic = ~ht.was_split
    is_adj = ht.ac > 0
    subranks = {
        'singleton_rank': is_singleton,
        'biallelic_rank': is_biallelic,
        'biallelic_singleton_rank': is_biallelic & is_singleton,
        'adj_rank': is_adj,
        'adj_biallelic_rank': is_biallelic & is_adj,
        'adj_singleton_rank': is_singleton & is_adj,
        'adj_biallelic_singleton_rank': is_biallelic & is_singleton & is_adj,
    }

    # The score is negated before being handed to add_rank.
    ranked = add_rank(ht, score_expr=-1 * ht.score, subrank_expr=subranks)
    ranked.write(score_ranking_path('genomes', 'cnn'), overwrite=True)
# Read matrix table
mt = hl.read_matrix_table("recalibrated.mt")

# Show first few samples
mt.s.show()

# Check matrix table fields
mt.describe()

# Mixture of non-empty with empty PL fields causes problems with sample QC
# for some reason; setting field to all empty
mt = mt.annotate_entries(PL=hl.missing(mt.PL.dtype))

# Add variant-level annotations necessary for variant QC later
## Annotate variants in one of the categories: SNV, multi-SNV, indel, multi-indel, mixed
mt = mt.annotate_rows(**add_variant_type(mt.alleles))

# The remaining row annotations are independent of each other, so they are
# added in a single call. mixed_site reads variant_type, which is why the
# annotation above has to be applied first.
mt = mt.annotate_rows(
    ## Number of alleles at the site
    n_alleles=hl.len(mt.alleles),
    ## Mixed sites (SNVs and indels present at the site)
    # The comparison already yields a boolean expression; no if_else needed.
    mixed_site=mt.variant_type == "mixed",
    ## Spanning deletions
    spanning_deletion=hl.any(lambda a: a == "*", mt.alleles),
)

# Number of Rows, Columns
mt.count()

# Number of Columns
mt.count_cols()