def generate_allele_data(mt: hl.MatrixTable) -> hl.Table:
    """
    Returns a bi-allelic sites HT with the following annotation:
     - allele_data (nonsplit_alleles, has_star, variant_type, n_alt_alleles,
       allele_type and was_mixed)

    `nonsplit_alleles`, `has_star` and the fields produced by `add_variant_type`
    are computed on the unsplit alleles; `allele_type` and `was_mixed` are added
    after multi-allelic sites have been split.

    :param MatrixTable mt: Full unsplit MT
    :return: Table with allele data annotations
    :rtype: Table
    """
    # select() with no arguments keeps only the row key fields.
    ht = mt.rows().select()
    allele_data = hl.struct(nonsplit_alleles=ht.alleles,
                            has_star=hl.any(lambda a: a == '*', ht.alleles))
    ht = ht.annotate(allele_data=allele_data.annotate(
        **add_variant_type(ht.alleles)))

    ht = hl.split_multi_hts(ht)
    # Guard the alleles[1] lookups below: a row without any alt allele would
    # otherwise trigger an out-of-bounds array access.
    ht = ht.filter(hl.len(ht.alleles) > 1)

    ref, alt = ht.alleles[0], ht.alleles[1]
    allele_type = (hl.case()
                   .when(hl.is_snp(ref, alt), 'snv')
                   .when(hl.is_insertion(ref, alt), 'ins')
                   .when(hl.is_deletion(ref, alt), 'del')
                   .default('complex'))
    ht = ht.annotate(allele_data=ht.allele_data.annotate(
        allele_type=allele_type,
        was_mixed=ht.allele_data.variant_type == 'mixed'))
    return ht
# Example #2
0
def generate_allele_data(ht: hl.Table) -> hl.Table:
    """
    Builds a bi-allelic sites HT whose only non-key annotation is `allele_data`.

    `allele_data` holds nonsplit_alleles, has_star and the fields returned by
    `add_variant_type` (all computed on the unsplit alleles), plus allele_type
    and was_mixed (computed after splitting multi-allelic sites).

    :param Table ht: Full unsplit HT
    :return: Table with allele data annotations
    :rtype: Table
    """
    # select() with no arguments keeps only the key fields.
    sites = ht.select()

    # Annotations describing the site before it is split.
    star_present = hl.any(lambda a: a == "*", sites.alleles)
    unsplit_info = hl.struct(nonsplit_alleles=sites.alleles,
                             has_star=star_present)
    unsplit_info = unsplit_info.annotate(**add_variant_type(sites.alleles))
    sites = sites.annotate(allele_data=unsplit_info)

    sites = hl.split_multi_hts(sites)
    # Keep only rows that have an alt allele, since alleles[1] is read below.
    sites = sites.filter(hl.len(sites.alleles) > 1)

    ref = sites.alleles[0]
    alt = sites.alleles[1]
    allele_type = (
        hl.case()
        .when(hl.is_snp(ref, alt), "snv")
        .when(hl.is_insertion(ref, alt), "ins")
        .when(hl.is_deletion(ref, alt), "del")
        .default("complex")
    )
    mixed_before_split = sites.allele_data.variant_type == "mixed"

    split_info = sites.allele_data.annotate(allele_type=allele_type,
                                            was_mixed=mixed_before_split)
    return sites.annotate(allele_data=split_info)
def create_cnn_rank_file() -> None:
    """
    Builds the rank table for the CNN scores and writes it to the genomes CNN
    score ranking path.

    The raw scores TSV is first imported into a temporary HT, unless that HT's
    `_SUCCESS` marker already exists.

    :return: Nothing
    :rtype: None
    """
    scores_tsv = 'gs://gnomad/variant_qc/temp/friedman_cnn_scores.tsv.bgz'
    scores_ht = 'gs://gnomad/variant_qc/temp/friedman_cnn_scores.ht'
    scores_ht_success = 'gs://gnomad/variant_qc/temp/friedman_cnn_scores.ht/_SUCCESS'

    logger.info("Creating CNN rank file.")

    already_imported = hl.utils.hadoop_exists(scores_ht_success)
    if not already_imported:
        logger.info("Importing CNN scores")
        raw_scores = hl.import_table(scores_tsv,
                                     min_partitions=1000,
                                     impute=True)
        raw_scores.write(scores_ht, overwrite=True)

    logger.info('Formatting CNN scores...')
    ht = hl.read_table(scores_ht)

    # Alt is a comma-separated string: split it into an array, remember whether
    # it held more than one allele, then explode to one row per alt allele.
    alt_field = ht.Alt
    ht = ht.annotate(alt_alleles=alt_field.split(','),
                     was_split=alt_field.contains(','))
    ht = ht.explode('alt_alleles')
    ht = ht.annotate(locus=hl.locus(hl.str(ht.Contig), ht.Pos))

    # Minrep
    minrepped = hl.min_rep(ht.locus, [ht.Ref, ht.alt_alleles])
    ht = ht.annotate(**minrepped)
    ht = ht.annotate(vartype=add_variant_type(ht.alleles))
    ht = ht.key_by('locus', 'alleles')

    # Join the gnomAD annotations and keep only the fields needed for ranking.
    # NOTE(review): this select() drops the was_split computed above, so the
    # was_split / singleton / ac / n_nonref fields read below presumably come
    # from get_gnomad_annotations -- confirm.
    gnomad_ht = get_gnomad_annotations('genomes')
    gnomad_fields = gnomad_ht[ht.key]
    ht = ht.select(
        **gnomad_fields,
        variant_type=ht.vartype.variant_type,
        n_alt_alleles=ht.vartype.n_alt_alleles,
        score=ht.CNN_1D_Score,
    )
    ht = ht.filter(ht.n_nonref > 0)

    # Predicates shared by the sub-rankings.
    is_singleton = ht.singleton
    is_biallelic = ~ht.was_split
    is_adj = ht.ac > 0
    subranks = {
        'singleton_rank': is_singleton,
        'biallelic_rank': is_biallelic,
        'biallelic_singleton_rank': is_biallelic & is_singleton,
        'adj_rank': is_adj,
        'adj_biallelic_rank': is_biallelic & is_adj,
        'adj_singleton_rank': is_singleton & is_adj,
        'adj_biallelic_singleton_rank': is_biallelic & is_singleton & is_adj,
    }

    # The score is negated before being handed to add_rank.
    ranked = add_rank(ht, score_expr=-1 * ht.score, subrank_expr=subranks)
    ranked.write(score_ranking_path('genomes', 'cnn'), overwrite=True)
# Load the matrix table
mt = hl.read_matrix_table("recalibrated.mt")

# Print the first few sample IDs
mt.s.show()

# Print the matrix table schema
mt.describe()

# Mixture of non-empty with empty PL fields causes problems with sample QC for some reason; setting field to all empty
mt = mt.annotate_entries(PL=hl.missing(mt.PL.dtype))

# Add variant-level annotations necessary for variant QC later
## Fields returned by add_variant_type (variant_type is read below)
mt = mt.annotate_rows(**add_variant_type(mt.alleles))

## n_alleles: number of alleles at the site
## mixed_site: True where variant_type is "mixed"
## spanning_deletion: True where any allele is "*"
mt = mt.annotate_rows(
    n_alleles=hl.len(mt.alleles),
    mixed_site=mt.variant_type == "mixed",
    spanning_deletion=hl.any(lambda a: a == "*", mt.alleles),
)

# Number of rows and columns
mt.count()

# Number of columns
mt.count_cols()