def split_mt_to_indels(mt: hl.MatrixTable) -> hl.MatrixTable:
    '''
    Filter the alternate alleles of a MatrixTable, keeping only indel alleles.

    :param mt: hail matrixtable of all samples with both indels and SNVs
    :return: hail matrixtable with only the indels
    '''
    mt_indels = hl.filter_alleles(
        mt, lambda allele, _: hl.is_indel(mt.alleles[0], allele))
    return mt_indels
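
# A minimal usage sketch for split_mt_to_indels; the path below is a
# hypothetical placeholder, not a real dataset location.
import hail as hl

mt = hl.read_matrix_table('gs://my-bucket/all_variants.mt')  # hypothetical path
mt_indels = split_mt_to_indels(mt)
mt_indels.rows().show(5)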
Example #2
def get_lowqual_expr(
    alleles: hl.expr.ArrayExpression,
    qual_approx_expr: Union[hl.expr.ArrayNumericExpression, hl.expr.NumericExpression],
    snv_phred_threshold: int = 30,
    snv_phred_het_prior: int = 30,  # 1/1000
    indel_phred_threshold: int = 30,
    indel_phred_het_prior: int = 39,  # 1/8,000
) -> Union[hl.expr.BooleanExpression, hl.expr.ArrayExpression]:
    """
    Computes a lowqual threshold expression for either split or unsplit alleles based on QUALapprox or AS_QUALapprox.

    .. note::

        When running this lowqual annotation using QUALapprox, it differs from the GATK LowQual filter.
        This is because GATK computes this annotation at the site level, which uses the least stringent prior for mixed sites.
        When run using AS_QUALapprox, this implementation can thus be more stringent for certain alleles at mixed sites.

    :param alleles: Array of alleles
    :param qual_approx_expr: QUALapprox or AS_QUALapprox
    :param snv_phred_threshold: Phred-scaled SNV "emission" threshold (similar to GATK emission threshold)
    :param snv_phred_het_prior: Phred-scaled SNV heterozygosity prior (30 = 1/1,000 bases, GATK default)
    :param indel_phred_threshold: Phred-scaled indel "emission" threshold (similar to GATK emission threshold)
    :param indel_phred_het_prior: Phred-scaled indel heterozygosity prior (39 = 1/8,000 bases)
    :return: lowqual expression (BooleanExpression if `qual_approx_expr` is Numeric, Array[BooleanExpression] if `qual_approx_expr` is ArrayNumeric)
    """

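    # An allele is flagged as lowqual when its QUAL approximation falls below
    # the emission threshold plus the heterozygosity prior for its class
    # (SNV vs. indel); unsplit mixed sites use the stricter of the two cutoffs.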
    min_snv_qual = snv_phred_threshold + snv_phred_het_prior
    min_indel_qual = indel_phred_threshold + indel_phred_het_prior
    min_mixed_qual = max(min_snv_qual, min_indel_qual)

    if isinstance(qual_approx_expr, hl.expr.ArrayNumericExpression):
        return hl.range(1, hl.len(alleles)).map(
            lambda ai: hl.cond(
                hl.is_snp(alleles[0], alleles[ai]),
                qual_approx_expr[ai - 1] < min_snv_qual,
                qual_approx_expr[ai - 1] < min_indel_qual,
            )
        )
    else:
        return (
            hl.case()
            .when(
                hl.range(1, hl.len(alleles)).all(
                    lambda ai: hl.is_snp(alleles[0], alleles[ai])
                ),
                qual_approx_expr < min_snv_qual,
            )
            .when(
                hl.range(1, hl.len(alleles)).all(
                    lambda ai: hl.is_indel(alleles[0], alleles[ai])
                ),
                qual_approx_expr < min_indel_qual,
            )
            .default(qual_approx_expr < min_mixed_qual)
        )
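
# Hedged usage sketch: flag low-quality sites on a sites Table; the table
# path and the `info.QUALapprox` field name are assumptions for illustration.
import hail as hl

ht = hl.read_table('gs://my-bucket/sites.ht')  # hypothetical path
ht = ht.annotate(lowqual=get_lowqual_expr(ht.alleles, ht.info.QUALapprox))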
Example #3
def main(args):
    mt = get_gnomad_v3_mt(key_by_locus_and_alleles=True)
    mt = mt.annotate_entries(
        gvcf_info=mt.gvcf_info.drop('ClippingRankSum', 'ReadPosRankSum'))
    mt = mt.annotate_rows(
        n_unsplit_alleles=hl.len(mt.alleles),
        mixed_site=(hl.len(mt.alleles) > 2)
        & hl.any(lambda a: hl.is_indel(mt.alleles[0], a), mt.alleles[1:])
        & hl.any(lambda a: hl.is_snp(mt.alleles[0], a), mt.alleles[1:]))
    mt = hl.experimental.sparse_split_multi(mt, filter_changed_loci=True)
    mt.write(args.split_mt_location, overwrite=args.overwrite)
Example #4
def genetics_pipeline():
    mt = get_mt()
    mt = hl.split_multi_hts(mt)
    mt = hl.variant_qc(mt)
    mt = hl.sample_qc(mt)
    mt = mt.filter_cols(mt.sample_qc.call_rate > 0.95)
    mt = mt.filter_rows(mt.variant_qc.AC[1] > 5)
    mt = mt.filter_entries(hl.case().when(
        hl.is_indel(mt.alleles[0], mt.alleles[1]),
        mt.GQ > 20).default(mt.GQ > 10))
    mt.write('/tmp/genetics_pipeline.mt', overwrite=True)
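
# Minimal invocation sketch; get_mt() is assumed to return the cohort
# MatrixTable, and the output path comes from the function above.
import hail as hl

genetics_pipeline()
mt = hl.read_matrix_table('/tmp/genetics_pipeline.mt')
mt.describe()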
Example #5
def add_variant_type(alt_alleles: hl.expr.ArrayExpression) -> hl.expr.StructExpression:
    """
    Get Struct of variant_type and n_alt_alleles from ArrayExpression of Strings (all alleles)
    """
    ref = alt_alleles[0]
    alts = alt_alleles[1:]
    non_star_alleles = hl.filter(lambda a: a != '*', alts)
    return hl.struct(variant_type=hl.cond(
        hl.all(lambda a: hl.is_snp(ref, a), non_star_alleles),
        hl.cond(hl.len(non_star_alleles) > 1, "multi-snv", "snv"),
        hl.cond(
            hl.all(lambda a: hl.is_indel(ref, a), non_star_alleles),
            hl.cond(hl.len(non_star_alleles) > 1, "multi-indel", "indel"),
            "mixed")
    ), n_alt_alleles=hl.len(non_star_alleles))
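
# Hedged usage sketch: annotate rows with the variant-type struct and count
# rows per category; assumes `mt` is an unsplit MatrixTable already loaded.
import hail as hl

mt = mt.annotate_rows(variant_info=add_variant_type(mt.alleles))
print(mt.aggregate_rows(hl.agg.counter(mt.variant_info.variant_type)))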
Example #6
def get_gnomad_v3_mt(
    split=False,
    key_by_locus_and_alleles: bool = False,
    remove_hard_filtered_samples: bool = True,
    release_only: bool = False,
    samples_meta: bool = False,
) -> hl.MatrixTable:
    """
    Wrapper function to get gnomAD data with desired filtering and metadata annotations

    :param split: Perform split on MT - Note: this will perform a split on the MT rather than grab an already split MT
    :param key_by_locus_and_alleles: Whether to key the MatrixTable by locus and alleles (only needed for v3)
    :param remove_hard_filtered_samples: Whether to remove samples that failed hard filters (only relevant after sample QC)
    :param release_only: Whether to filter the MT to only samples available for release (can only be used if metadata is present)
    :param samples_meta: Whether to add metadata to MT in 'meta' column
    :return: gnomAD v3 dataset with chosen annotations and filters
    """
    mt = gnomad_v3_genotypes.mt()
    if key_by_locus_and_alleles:
        mt = hl.MatrixTable(
            hl.ir.MatrixKeyRowsBy(
                mt._mir, ["locus", "alleles"], is_sorted=True
            )  # Prevents hail from running sort on genotype MT which is already sorted by a unique locus
        )

    if remove_hard_filtered_samples:
        mt = mt.filter_cols(
            hl.is_missing(hard_filtered_samples.ht()[mt.col_key]))

    if samples_meta:
        mt = mt.annotate_cols(meta=meta.ht()[mt.col_key])

        if release_only:
            mt = mt.filter_cols(mt.meta.release)

    elif release_only:
        mt = mt.filter_cols(meta.ht()[mt.col_key].release)

    if split:
        mt = mt.annotate_rows(
            n_unsplit_alleles=hl.len(mt.alleles),
            mixed_site=(hl.len(mt.alleles) > 2)
            & hl.any(lambda a: hl.is_indel(mt.alleles[0], a), mt.alleles[1:])
            & hl.any(lambda a: hl.is_snp(mt.alleles[0], a), mt.alleles[1:]),
        )
        mt = hl.experimental.sparse_split_multi(mt, filter_changed_loci=True)

    return mt
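
# Hedged usage sketch: load the split, release-only gnomAD v3 MT with sample
# metadata attached; all argument names come from the wrapper above.
mt = get_gnomad_v3_mt(split=True, release_only=True, samples_meta=True)
print(f"{mt.count_cols()} release samples")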
Example #7
ANNOTATION_TABLE = 'gs://dalio_bipolar_w1_w2_hail_02/data/annotations/gene.ht'

URV_NOT_IN_GNOMAD_FILE = 'gs://dalio_bipolar_w1_w2_hail_02/data/samples/20_URVs_not_in_gnomAD.tsv'
NOT_IN_GNOMAD_FILE = 'gs://dalio_bipolar_w1_w2_hail_02/data/samples/20_not_in_gnomAD.tsv'

mt = hl.read_matrix_table(MT)

sample_annotations = hl.read_table(PHENOTYPES_TABLE)
constraint_annotations = hl.read_table(ANNOTATION_TABLE)

mt = mt.annotate_cols(phenotype = sample_annotations[mt.s])
mt = mt.annotate_rows(constraint = constraint_annotations[mt.row_key])
mt = mt.filter_rows(~mt.constraint.inGnomAD_nonpsych)

mt = mt.annotate_cols(n_SNP = hl.agg.count_where(mt.GT.is_non_ref() & hl.is_snp(mt.alleles[0], mt.alleles[1])),
                      n_indel = hl.agg.count_where(mt.GT.is_non_ref() & hl.is_indel(mt.alleles[0], mt.alleles[1])),
                      n_coding_SNP = hl.agg.count_where(mt.GT.is_non_ref() & hl.is_snp(mt.alleles[0], mt.alleles[1]) & (mt.constraint.consequence_category != "non_coding")),
                      n_coding_indel = hl.agg.count_where(mt.GT.is_non_ref() & hl.is_indel(mt.alleles[0], mt.alleles[1]) & (mt.constraint.consequence_category != "non_coding")),
                      n_PTV = hl.agg.count_where(mt.GT.is_non_ref() & (mt.constraint.consequence_category == "ptv")),
                      n_damaging_missense = hl.agg.count_where(mt.GT.is_non_ref() & (mt.constraint.consequence_category == "damaging_missense")),
                      n_other_missense = hl.agg.count_where(mt.GT.is_non_ref() & (mt.constraint.consequence_category == "other_missense")),
                      n_synonymous = hl.agg.count_where(mt.GT.is_non_ref() & (mt.constraint.consequence_category == "synonymous")),
                      n_non_coding = hl.agg.count_where(mt.GT.is_non_ref() & (mt.constraint.consequence_category == "non_coding")))

mt.cols().flatten().export(NOT_IN_GNOMAD_FILE)

mt = mt.annotate_rows(is_singleton = hl.agg.sum(mt.GT.n_alt_alleles()) == 1)
mt = mt.filter_rows(mt.is_singleton)

mt = mt.annotate_cols(n_URV_SNP = hl.agg.count_where(mt.GT.is_non_ref() & hl.is_snp(mt.alleles[0], mt.alleles[1])),
                      n_URV_indel = hl.agg.count_where(mt.GT.is_non_ref() & hl.is_indel(mt.alleles[0], mt.alleles[1])),
Example #8
def create_binned_concordance(data_type: str, truth_sample: str, metric: str,
                              nbins: int, overwrite: bool) -> None:
    """
    Creates and writes a concordance table binned by rank (both absolute and relative) for a given data type, truth sample and metric.

    :param str data_type: One of 'exomes' or 'genomes'
    :param str truth_sample: Which truth sample concordance to load
    :param str metric: One of the evaluation metrics (or a RF hash)
    :param int nbins: Number of bins for the rank
    :param bool overwrite: Whether to overwrite existing table
    :return: Nothing -- just writes the table
    :rtype: None
    """

    if hl.hadoop_exists(
            binned_concordance_path(data_type, truth_sample, metric) +
            '/_SUCCESS') and not overwrite:
        logger.warning(
            f"Skipping binned concordance creation as {binned_concordance_path(data_type, truth_sample, metric)} exists and overwrite=False"
        )
    else:
        ht = hl.read_table(
            annotations_ht_path(data_type, f'{truth_sample}_concordance'))
        # Remove 1bp indels for syndip as they cannot be trusted
        if truth_sample == 'syndip':
            ht = ht.filter(
                hl.is_indel(ht.alleles[0], ht.alleles[1]) &
                (hl.abs(hl.len(ht.alleles[0]) - hl.len(ht.alleles[1])) == 1),
                keep=False)
            high_conf_intervals = hl.import_locus_intervals(
                syndip_high_conf_regions_bed_path)
        else:
            high_conf_intervals = hl.import_locus_intervals(
                NA12878_high_conf_regions_bed_path)

        lcr = hl.import_locus_intervals(lcr_intervals_path)
        segdup = hl.import_locus_intervals(segdup_intervals_path)
        ht = ht.filter(
            hl.is_defined(high_conf_intervals[ht.locus])
            & hl.is_missing(lcr[ht.locus]) & hl.is_missing(segdup[ht.locus]))

        if metric in ['vqsr', 'rf_2.0.2', 'rf_2.0.2_beta', 'cnn']:
            metric_ht = hl.read_table(score_ranking_path(data_type, metric))
        else:
            metric_ht = hl.read_table(
                rf_path(data_type, 'rf_result', run_hash=metric))

        metric_snvs, metric_indels = metric_ht.aggregate([
            hl.agg.count_where(
                hl.is_snp(metric_ht.alleles[0], metric_ht.alleles[1])),
            hl.agg.count_where(
                ~hl.is_snp(metric_ht.alleles[0], metric_ht.alleles[1]))
        ])

        snvs, indels = ht.aggregate([
            hl.agg.count_where(hl.is_snp(ht.alleles[0], ht.alleles[1])),
            hl.agg.count_where(~hl.is_snp(ht.alleles[0], ht.alleles[1]))
        ])

        ht = ht.annotate_globals(global_counts=hl.struct(
            snvs=metric_snvs, indels=metric_indels),
                                 counts=hl.struct(snvs=snvs, indels=indels))

        ht = ht.annotate(
            snv=hl.is_snp(ht.alleles[0], ht.alleles[1]),
            score=metric_ht[ht.key].score,
            global_rank=metric_ht[ht.key].rank,
            # TP => allele is found in both data sets
            n_tp=ht.concordance[3][3] + ht.concordance[3][4] +
            ht.concordance[4][3] + ht.concordance[4][4],
            # FP => allele is found only in test data set
            n_fp=hl.sum(ht.concordance[3][:2]) + hl.sum(ht.concordance[4][:2]),
            # FN => allele is found only in truth data set
            n_fn=hl.sum(ht.concordance[:2].map(lambda x: x[3] + x[4])))

        ht = add_rank(ht, -1.0 * ht.score)

        ht = ht.annotate(rank=[
            hl.tuple([
                'global_rank', (ht.global_rank + 1) /
                hl.cond(ht.snv, ht.globals.global_counts.snvs,
                        ht.globals.global_counts.indels)
            ]),
            hl.tuple([
                'truth_sample_rank', (ht.rank + 1) / hl.cond(
                    ht.snv, ht.globals.counts.snvs, ht.globals.counts.indels)
            ])
        ])

        ht = ht.explode(ht.rank)
        ht = ht.annotate(rank_name=ht.rank[0], bin=hl.int(ht.rank[1] * nbins))

        ht = ht.group_by('rank_name', 'snv', 'bin').aggregate(
            # Look at site-level metrics -> tp > fp > fn -- only important for multi-sample comparisons
            tp=hl.agg.count_where(ht.n_tp > 0),
            fp=hl.agg.count_where((ht.n_tp == 0) & (ht.n_fp > 0)),
            fn=hl.agg.count_where((ht.n_tp == 0) & (ht.n_fp == 0)
                                  & (ht.n_fn > 0)),
            min_score=hl.agg.min(ht.score),
            max_score=hl.agg.max(ht.score),
            n_alleles=hl.agg.count()).repartition(5)

        ht.write(binned_concordance_path(data_type, truth_sample, metric),
                 overwrite=overwrite)
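
# Hypothetical invocation; 'vqsr' is one of the metric names handled above,
# and nbins=100 yields percentile-style bins.
create_binned_concordance(data_type='genomes', truth_sample='syndip',
                          metric='vqsr', nbins=100, overwrite=False)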
Example #9
def get_summary_counts_dict(
    locus_expr: hl.expr.LocusExpression,
    allele_expr: hl.expr.ArrayExpression,
    lof_expr: hl.expr.StringExpression,
    no_lof_flags_expr: hl.expr.BooleanExpression,
    most_severe_csq_expr: hl.expr.StringExpression,
    prefix_str: str = "",
) -> Dict[str, hl.expr.Int64Expression]:
    """
    Return dictionary containing counts of multiple variant categories.

    Categories are:
        - Number of variants
        - Number of indels
        - Number of SNVs
        - Number of LoF variants
        - Number of LoF variants that pass LOFTEE
        - Number of LoF variants that pass LOFTEE without any flags
        - Number of LoF variants annotated as 'other splice' (OS) by LOFTEE
        - Number of LoF variants that fail LOFTEE
        - Number of missense variants
        - Number of synonymous variants
        - Number of autosomal variants
        - Number of allosomal variants

    .. warning::
        Assumes `allele_expr` contains only two variants (multi-allelics have been split).

    :param locus_expr: LocusExpression.
    :param allele_expr: ArrayExpression containing alleles.
    :param lof_expr: StringExpression containing LOFTEE annotation.
    :param no_lof_flags_expr: BooleanExpression indicating whether LoF variant has any flags.
    :param most_severe_csq_expr: StringExpression containing most severe consequence annotation.
    :param prefix_str: Desired prefix string for category names. Default is empty str.
    :return: Dict of categories and counts per category.
    """
    logger.warning(
        "This function expects that multi-allelic variants have been split!")
    return {
        f"{prefix_str}num_variants":
        hl.agg.count(),
        f"{prefix_str}indels":
        hl.agg.count_where(hl.is_indel(allele_expr[0], allele_expr[1])),
        f"{prefix_str}snps":
        hl.agg.count_where(hl.is_snp(allele_expr[0], allele_expr[1])),
        f"{prefix_str}LOF":
        hl.agg.count_where(hl.is_defined(lof_expr)),
        f"{prefix_str}pass_loftee":
        hl.agg.count_where(lof_expr == "HC"),
        f"{prefix_str}pass_loftee_no_flag":
        hl.agg.count_where((lof_expr == "HC") & (no_lof_flags_expr)),
        f"{prefix_str}loftee_os":
        hl.agg.count_where(lof_expr == "OS"),
        f"{prefix_str}fail_loftee":
        hl.agg.count_where(lof_expr == "LC"),
        f"{prefix_str}num_missense":
        hl.agg.count_where(most_severe_csq_expr == "missense_variant"),
        f"{prefix_str}num_synonymous":
        hl.agg.count_where(most_severe_csq_expr == "synonymous_variant"),
        f"{prefix_str}num_autosomal_variants":
        hl.agg.filter(locus_expr.in_autosome_or_par(), hl.agg.count()),
        f"{prefix_str}num_allosomal_variants":
        hl.agg.filter(locus_expr.in_x_nonpar() | locus_expr.in_y_nonpar(),
                      hl.agg.count()),
    }
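
# Hedged usage sketch: aggregate the category counts over a split sites
# Table; the path and field names (lof, no_lof_flags, most_severe_csq) are
# assumptions for illustration.
import hail as hl

ht = hl.read_table('gs://my-bucket/release_sites.ht')  # hypothetical path
summary = ht.aggregate(hl.struct(**get_summary_counts_dict(
    ht.locus, ht.alleles, ht.lof, ht.no_lof_flags, ht.most_severe_csq)))
print(summary)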
Example #10
def default_generate_gene_lof_matrix(
    mt: hl.MatrixTable,
    tx_ht: Optional[hl.Table],
    high_expression_cutoff: float = 0.9,
    low_expression_cutoff: float = 0.1,
    filter_field: str = "filters",
    freq_field: str = "freq",
    freq_index: int = 0,
    additional_csq_set: Set[str] = {"missense_variant", "synonymous_variant"},
    all_transcripts: bool = False,
    filter_an: bool = False,
    filter_to_rare: bool = False,
    pre_loftee: bool = False,
    lof_csq_set: Set[str] = LOF_CSQ_SET,
    remove_ultra_common: bool = False,
) -> hl.MatrixTable:
    """
    Generate loss-of-function gene matrix.

    Used to generate summary metrics on LoF variants.

    :param mt: Input MatrixTable.
    :param tx_ht: Optional Table containing expression levels per transcript.
    :param high_expression_cutoff: Minimum mean proportion expressed cutoff for a transcript to be considered highly expressed. Default is 0.9.
    :param low_expression_cutoff: Maximum mean proportion expressed cutoff for a transcript to be considered lowly expressed. Default is 0.1.
    :param filter_field: Name of field in MT that contains variant filters. Default is 'filters'.
    :param freq_field: Name of field in MT that contains frequency information. Default is 'freq'.
    :param freq_index: Which index of frequency struct to use. Default is 0.
    :param additional_csq_set: Set of additional consequences to keep. Default is {'missense_variant', 'synonymous_variant'}.
    :param all_transcripts: Whether to use all transcripts instead of just the transcript with most severe consequence. Default is False.
    :param filter_an: Whether to filter using allele number as proxy for call rate. Default is False.
    :param filter_to_rare: Whether to filter to rare (AF < 5%) variants. Default is False.
    :param pre_loftee: Whether LoF consequences have been annotated with LOFTEE. Default is False.
    :param lof_csq_set: Set of LoF consequence strings. Default is {"splice_acceptor_variant", "splice_donor_variant", "stop_gained", "frameshift_variant"}.
    :param remove_ultra_common: Whether to remove ultra common (AF > 95%) variants. Default is False.
    """
    logger.info("Filtering to PASS variants...")
    filt_criteria = hl.len(mt[filter_field]) == 0
    if filter_an:
        logger.info(
            "Using AN (as a call rate proxy) to filter to variants that meet a minimum call rate..."
        )
        mt = mt.filter_rows(get_an_criteria(mt))
    if remove_ultra_common:
        logger.info("Removing ultra common (AF > 95%) variants...")
        filt_criteria &= mt[freq_field][freq_index].AF < 0.95
    if filter_to_rare:
        logger.info("Filtering to rare (AF < 5%) variants...")
        filt_criteria &= mt[freq_field][freq_index].AF < 0.05
    mt = mt.filter_rows(filt_criteria)

    if all_transcripts:
        logger.info("Exploding transcript_consequences field...")
        explode_field = "transcript_consequences"
    else:
        logger.info(
            "Adding most severe (worst) consequence and expoding worst_csq_by_gene field..."
        )
        mt = process_consequences(mt)
        explode_field = "worst_csq_by_gene"

    if additional_csq_set:
        logger.info("Including these consequences: %s", additional_csq_set)
        additional_cats = hl.literal(additional_csq_set)

    if pre_loftee:
        logger.info("Filtering to LoF consequences: %s", lof_csq_set)
        lof_cats = hl.literal(lof_csq_set)
        criteria = lambda x: lof_cats.contains(
            add_most_severe_consequence_to_consequence(x).
            most_severe_consequence)
        if additional_csq_set:
            criteria = lambda x: lof_cats.contains(
                add_most_severe_consequence_to_consequence(x).
                most_severe_consequence) | additional_cats.contains(
                    add_most_severe_consequence_to_consequence(x).
                    most_severe_consequence)

    else:
        logger.info(
            "Filtering to LoF variants that pass LOFTEE with no LoF flags...")
        criteria = lambda x: (x.lof == "HC") & hl.is_missing(x.lof_flags)
        if additional_csq_set:
            criteria = lambda x: (x.lof == "HC") & hl.is_missing(
                x.lof_flags) | additional_cats.contains(
                    add_most_severe_consequence_to_consequence(x).
                    most_severe_consequence)

    csqs = mt.vep[explode_field].filter(criteria)
    mt = mt.select_rows(mt[freq_field], csqs=csqs)
    mt = mt.explode_rows(mt.csqs)
    annotation_expr = {
        "gene_id": mt.csqs.gene_id,
        "gene": mt.csqs.gene_symbol,
        "indel": hl.is_indel(mt.alleles[0], mt.alleles[1]),
        "most_severe_consequence": mt.csqs.most_severe_consequence,
    }

    if tx_ht:
        logger.info("Adding transcript expression annotation...")
        tx_annotation = get_tx_expression_expr(
            mt.row_key,
            tx_ht,
            mt.csqs,
        ).mean_proportion
        annotation_expr["expressed"] = (hl.case().when(
            tx_annotation >= high_expression_cutoff,
            "high").when(tx_annotation > low_expression_cutoff,
                         "medium").when(hl.is_defined(tx_annotation),
                                        "low").default("missing"))
    else:
        annotation_expr["transcript_id"] = mt.csqs.transcript_id
        annotation_expr["canonical"] = hl.is_defined(mt.csqs.canonical)

    mt = mt.annotate_rows(**annotation_expr)
    return (mt.group_rows_by(*list(annotation_expr.keys())).aggregate_rows(
        n_sites=hl.agg.count(),
        n_sites_array=hl.agg.array_sum(
            mt.freq.map(lambda x: hl.int(x.AC > 0))),
        classic_caf=hl.agg.sum(mt[freq_field][freq_index].AF),
        max_af=hl.agg.max(mt[freq_field][freq_index].AF),
        classic_caf_array=hl.agg.array_sum(mt[freq_field].map(lambda x: x.AF)),
    ).aggregate_entries(
        num_homs=hl.agg.count_where(mt.GT.is_hom_var()),
        num_hets=hl.agg.count_where(mt.GT.is_het()),
        defined_sites=hl.agg.count_where(hl.is_defined(mt.GT)),
    ).result())
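
# Hypothetical call: build the gene LoF matrix without transcript expression
# data; assumes `mt` carries 'vep', 'freq', and 'filters' annotations.
lof_mt = default_generate_gene_lof_matrix(mt, tx_ht=None)
lof_mt.describe()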
Example #11
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
print("importing vds...")
vds = hl.read_matrix_table(vds_splitmulti_file)

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# III. Remove rare variants
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
print("removing rare variants...")
vds = vds.filter_rows((vds.info.AF[0] > 0.01) & (vds.info.AF[0] < 0.99))

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# IV. Remove indels
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
print("removing indels...")

vds = vds.filter_rows(~hl.is_indel(vds.alleles[0], vds.alleles[1]))

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# V. Write output
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
print("writing out...")
hl.export_plink(vds, plink_files_out)

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Print runtime
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

stop = timeit.default_timer()

print("runtime: " + str(stop - start) + " seconds")
Example #12
    mt = mt.annotate_cols(gVCF=cohorts_pop[mt.s].gVCF_ID)
    # The above was done in the pca_RF Jupyter notebook
    # mt = hl.read_matrix_table(
    #    f"{temp_dir}/ddd-elgh-ukbb/Sanger_cohorts_chr1-20-XY_new_cohorts.mt")
    # mt = hl.split_multi_hts(mt, keep_star=False, left_aligned=False)
    mt.write(
        f"{tmp_dir}/ddd-elgh-ukbb/Sanger_chr1-20-XY_new_cohorts_split_multi_pops.mt",
        overwrite=True)
    # filter matrixtable
    logger.info("wrote mt ")
    # filter mt
    # mt = hl.read_matrix_table(
    #    f"{temp_dir}/ddd-elgh-ukbb/Sanger_cohorts_chr1-20-XY_new_cohorts_split_multi.mt")
    mt = mt.filter_rows(hl.is_snp(mt.alleles[0], mt.alleles[1]))
    mt = mt.filter_rows(~hl.is_mnp(mt.alleles[0], mt.alleles[1]))
    mt = mt.filter_rows(~hl.is_indel(mt.alleles[0], mt.alleles[1]))
    mt = mt.filter_rows(~hl.is_complex(mt.alleles[0], mt.alleles[1]))
    mt_vqc = hl.variant_qc(mt, name='variant_QC_Hail')
    # HWE filter (p_value_hwe >= 10 ** -6) intentionally not used, per hcm.
    mt_vqc_filtered = mt_vqc.filter_rows(
        (mt_vqc.variant_QC_Hail.call_rate >= 0.99)
        & (mt_vqc.variant_QC_Hail.AF[1] >= 0.05)
        & (mt_vqc.variant_QC_Hail.AF[1] <= 0.95))
    mt_vqc_filtered = mt_vqc_filtered.filter_rows(hl.is_defined(
        bed_to_exclude_pca[mt_vqc_filtered.locus]),
                                                  keep=False)
    # overlap AKT dataset:

    mt_1kg_chr1_chr20 = hl.read_matrix_table(
        f"{temp_dir}/ddd-elgh-ukbb/relatedness_ancestry/ancestry_work/1000g_chr1_20_AKT_overlap.mt"
    )
Example #13
# Drop some fields that are not needed.
mt = mt.drop('a_index', 'qual', 'rsid', 'info', 'filters', 'was_split')

sample_annotations = hl.read_table(PHENOTYPES_TABLE)
constraint_annotations = hl.read_table(ANNOTATION_TABLE)

mt = mt.annotate_cols(phenotype=sample_annotations[mt.s])
mt = mt.annotate_rows(constraint=constraint_annotations[mt.row_key])
mt = mt.annotate_rows(is_singleton=hl.agg.sum(mt.GT.n_alt_alleles()) == 1)
mt = mt.filter_rows((mt.is_singleton) & (~mt.constraint.inGnomAD_nonpsych))

mt = mt.annotate_cols(
    n_URV_SNP=hl.agg.count_where(mt.GT.is_non_ref()
                                 & hl.is_snp(mt.alleles[0], mt.alleles[1])),
    n_URV_indel=hl.agg.count_where(
        mt.GT.is_non_ref() & hl.is_indel(mt.alleles[0], mt.alleles[1])),
    n_coding_URV_SNP=hl.agg.count_where(
        mt.GT.is_non_ref() & hl.is_snp(mt.alleles[0], mt.alleles[1])
        & (mt.constraint.consequence_category != "non_coding")),
    n_coding_URV_indel=hl.agg.count_where(
        mt.GT.is_non_ref() & hl.is_indel(mt.alleles[0], mt.alleles[1])
        & (mt.constraint.consequence_category != "non_coding")),
    n_URV_PTV=hl.agg.count_where(
        mt.GT.is_non_ref() & (mt.constraint.consequence_category == "ptv")),
    n_URV_damaging_missense=hl.agg.count_where(mt.GT.is_non_ref() & (
        mt.constraint.consequence_category == "damaging_missense")),
    n_URV_other_missense=hl.agg.count_where(mt.GT.is_non_ref() & (
        mt.constraint.consequence_category == "other_missense")),
    n_URV_synonymous=hl.agg.count_where(mt.GT.is_non_ref() & (
        mt.constraint.consequence_category == "synonymous")),
    n_URV_non_coding=hl.agg.count_where(mt.GT.is_non_ref() & (
Example #14
def burden_annotations(mt, root_ann='burden', annotate=True):

    mt = mt.annotate_cols(**{root_ann: hl.struct()})
    ann_data = mt[root_ann].annotate(
        # URVs (Singletons)
        n_URV=hl.agg.count_where(mt.GT.is_non_ref()),
        n_URV_SNP=hl.agg.count_where(
            (mt.GT.is_non_ref()) & (hl.is_snp(mt.alleles[0], mt.alleles[1]))),
        n_URV_indel=hl.agg.count_where((mt.GT.is_non_ref()) & (
            hl.is_indel(mt.alleles[0], mt.alleles[1]))),
        # Coding Singletons
        n_coding_URV=hl.agg.count_where((mt.GT.is_non_ref()) & (
            mt.annotation.consequence_category != "non_coding")),
        n_coding_URV_SNP=hl.agg.count_where(
            (mt.GT.is_non_ref()) & (hl.is_snp(mt.alleles[0], mt.alleles[1]))
            & (mt.annotation.consequence_category != "non_coding")),
        n_coding_URV_indel=hl.agg.count_where(
            (mt.GT.is_non_ref()) & (hl.is_indel(mt.alleles[0], mt.alleles[1]))
            & (mt.annotation.consequence_category != "non_coding")),
        # PTVs
        n_URV_PTV=hl.agg.count_where((mt.GT.is_non_ref()) & (
            mt.annotation.consequence_category == "ptv")),
        n_URV_PTV_SNP=hl.agg.count_where(
            (mt.GT.is_non_ref())
            & (mt.annotation.consequence_category == "ptv")
            & (hl.is_snp(mt.alleles[0], mt.alleles[1]))),
        n_URV_PTV_indel=hl.agg.count_where(
            (mt.GT.is_non_ref())
            & (mt.annotation.consequence_category == "ptv")
            & (hl.is_indel(mt.alleles[0], mt.alleles[1]))),
        # Damaging missense
        n_URV_damaging_missense=hl.agg.count_where((mt.GT.is_non_ref()) & (
            mt.annotation.consequence_category == "damaging_missense")),
        n_URV_damaging_missense_SNP=hl.agg.count_where(
            (mt.GT.is_non_ref())
            & (mt.annotation.consequence_category == "damaging_missense")
            & (hl.is_snp(mt.alleles[0], mt.alleles[1]))),
        n_URV_damaging_missense_indel=hl.agg.count_where(
            (mt.GT.is_non_ref())
            & (mt.annotation.consequence_category == "damaging_missense")
            & (hl.is_indel(mt.alleles[0], mt.alleles[1]))),
        # Other missense
        n_URV_other_missense=hl.agg.count_where((mt.GT.is_non_ref()) & (
            mt.annotation.consequence_category == "other_missense")),
        n_URV_other_missense_SNP=hl.agg.count_where(
            (mt.GT.is_non_ref())
            & (mt.annotation.consequence_category == "other_missense")
            & (hl.is_snp(mt.alleles[0], mt.alleles[1]))),
        n_URV_other_missense_indel=hl.agg.count_where(
            (mt.GT.is_non_ref())
            & (mt.annotation.consequence_category == "other_missense")
            & (hl.is_indel(mt.alleles[0], mt.alleles[1]))),
        # Synonymous
        n_URV_synonymous=hl.agg.count_where((mt.GT.is_non_ref()) & (
            mt.annotation.consequence_category == "synonymous")),
        n_URV_synonymous_SNP=hl.agg.count_where(
            (mt.GT.is_non_ref())
            & (mt.annotation.consequence_category == "synonymous")
            & (hl.is_snp(mt.alleles[0], mt.alleles[1]))),
        n_URV_synonymous_indel=hl.agg.count_where(
            (mt.GT.is_non_ref())
            & (mt.annotation.consequence_category == "synonymous")
            & (hl.is_indel(mt.alleles[0], mt.alleles[1]))),
        # Non-coding
        n_URV_non_coding=hl.agg.count_where((mt.GT.is_non_ref()) & (
            mt.annotation.consequence_category == "non_coding")),
        n_URV_non_coding_SNP=hl.agg.count_where(
            (mt.GT.is_non_ref())
            & (mt.annotation.consequence_category == "non_coding")
            & (hl.is_snp(mt.alleles[0], mt.alleles[1]))),
        n_URV_non_coding_indel=hl.agg.count_where(
            (mt.GT.is_non_ref())
            & (mt.annotation.consequence_category == "non_coding")
            & (hl.is_indel(mt.alleles[0], mt.alleles[1]))),

        # Next, determine counts with MPC >= 2
        n_URV_MPC_2_damaging_missense=hl.agg.count_where(
            (mt.GT.is_non_ref())
            & (mt.annotation.consequence_category == "damaging_missense")
            & (mt.annotation.mpc.MPC >= 2)),
        n_URV_MPC_2_damaging_missense_SNP=hl.agg.count_where(
            (mt.GT.is_non_ref())
            & (mt.annotation.consequence_category == "damaging_missense")
            & (mt.annotation.mpc.MPC >= 2)
            & (hl.is_snp(mt.alleles[0], mt.alleles[1]))),
        n_URV_MPC_2_damaging_missense_indel=hl.agg.count_where(
            (mt.GT.is_non_ref())
            & (mt.annotation.consequence_category == "damaging_missense")
            & (mt.annotation.mpc.MPC >= 2)
            & (hl.is_indel(mt.alleles[0], mt.alleles[1]))),
        n_URV_MPC_2_missense=hl.agg.count_where(
            (mt.GT.is_non_ref())
            & ((mt.annotation.consequence_category == "damaging_missense")
               | (mt.annotation.consequence_category == "other_missense"))
            & (mt.annotation.mpc.MPC >= 2)),
        n_URV_MPC_2_missense_SNP=hl.agg.count_where(
            (mt.GT.is_non_ref())
            & ((mt.annotation.consequence_category == "damaging_missense")
               | (mt.annotation.consequence_category == "other_missense"))
            & (mt.annotation.mpc.MPC >= 2)
            & (hl.is_snp(mt.alleles[0], mt.alleles[1]))),
        n_URV_MPC_2_missense_indel=hl.agg.count_where(
            (mt.GT.is_non_ref())
            & ((mt.annotation.consequence_category == "damaging_missense")
               | (mt.annotation.consequence_category == "other_missense"))
            & (mt.annotation.mpc.MPC >= 2)
            & (hl.is_indel(mt.alleles[0], mt.alleles[1]))))

    if annotate:
        return mt.annotate_cols(**{root_ann: ann_data})
    else:
        return ann_data
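
# Hedged usage sketch, mirroring the singleton filter used earlier in this
# file: restrict to URVs, attach burden counts, export; the output path is
# an assumption.
import hail as hl

mt = mt.annotate_rows(is_singleton=hl.agg.sum(mt.GT.n_alt_alleles()) == 1)
mt = mt.filter_rows(mt.is_singleton)
mt = burden_annotations(mt, root_ann='burden')
mt.cols().flatten().export('gs://my-bucket/burden_counts.tsv')
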
def main(args):

    bed_to_exclude_pca = hl.import_bed(locations_exclude_from_pca,
                                       reference_genome='GRCh38')
    cohorts_pop = hl.import_table(cohorts_populations,
                                  delimiter="\t").key_by('s')

    # overlap AKT dataset
    overlap_1kg_AKT = hl.import_matrix_table(AKT_overlap)
    # drop cohorts
    # annotate with cohorts and populations from s3 table.
    # save matrixtable
    mt = hl.read_matrix_table(args.matrixtable)
    mt = mt.annotate_cols(cohort=cohorts_pop[mt.s].cohort)
    mt = mt.annotate_cols(original_pop=cohorts_pop[mt.s].known_population)
    mt = mt.annotate_cols(known_pop=cohorts_pop[mt.s].known_population_updated)
    # mt = mt.annotate_cols(superpopulation=cohorts_pop[mt.s].superpopulation)
    mt = mt.annotate_cols(gVCF=cohorts_pop[mt.s].gVCF_ID)
    mt.write(
        f"{args.output_dir}/ddd-elgh-ukbb/Sanger_chr1-20-XY_new_cohorts_split_multi_pops.mt",
        overwrite=True)
    # filter matrixtable
    logger.info("wrote mt ")
    # filter mt
    mt = mt.filter_rows(hl.is_snp(mt.alleles[0], mt.alleles[1]))
    mt = mt.filter_rows(~hl.is_mnp(mt.alleles[0], mt.alleles[1]))
    mt = mt.filter_rows(~hl.is_indel(mt.alleles[0], mt.alleles[1]))
    mt = mt.filter_rows(~hl.is_complex(mt.alleles[0], mt.alleles[1]))
    mt_vqc = hl.variant_qc(mt, name='variant_QC_Hail')
    # HWE filter (p_value_hwe >= 10 ** -6) intentionally not used, per hcm.
    mt_vqc_filtered = mt_vqc.filter_rows(
        (mt_vqc.variant_QC_Hail.call_rate >= 0.99)
        & (mt_vqc.variant_QC_Hail.AF[1] >= 0.05)
        & (mt_vqc.variant_QC_Hail.AF[1] <= 0.95))
    mt_vqc_filtered = mt_vqc_filtered.filter_rows(hl.is_defined(
        bed_to_exclude_pca[mt_vqc_filtered.locus]),
                                                  keep=False)
    # overlap AKT dataset:
    # overlap_1kg_AKT
    # mt_1kg_chr1_chr20 = hl.read_matrix_table(
    #    f"{temp_dir}/ddd-elgh-ukbb/relatedness_ancestry/ancestry_work/1000g_chr1_20_AKT_overlap.mt")
    overlap_1kg_AKT = overlap_1kg_AKT.key_rows_by("locus")
    mt_vqc_filtered = mt_vqc_filtered.filter_rows(
        hl.is_defined(overlap_1kg_AKT.rows()[mt_vqc_filtered.locus]))
    logger.info("done filtering writing mt")
    # ld pruning
    pruned_ht = hl.ld_prune(mt_vqc_filtered.GT, r2=0.2, bp_window_size=500000)
    # pruned_ht = hl.ld_prune(mt.GT, r2=0.1)
    pruned_mt = mt_vqc_filtered.filter_rows(
        hl.is_defined(pruned_ht[mt_vqc_filtered.row_key]))
    # remove pruned areas that need to be removed

    # autosomes only:
    pruned_mt = pruned_mt.filter_rows(pruned_mt.locus.in_autosome())

    pruned_mt.write(
        f"{args.output_dir}/ddd-elgh-ukbb/chr1_chr20_ldpruned_updated.mt",
        overwrite=True)
    # pruned_mt = hl.read_matrix_table(
    #    f"{temp_dir}/ddd-elgh-ukbb/relatedness_ancestry/ddd-elgh-ukbb/chr1_chr20_XY_ldpruned.mt")

    # related_samples_to_drop = hl.read_table(
    #    f"{temp_dir}/ddd-elgh-ukbb/relatedness_ancestry/ddd-elgh-ukbb/chr1_chr20_XY_related_samples_to_remove.ht")

    logger.info("run_pca_with_relateds")
    # pca_evals, pca_scores, pca_loadings = run_pca_with_relateds(
    #    pruned_mt, related_samples_to_drop, autosomes_only=True)
    pca_evals, pca_scores, pca_loadings = hl.hwe_normalized_pca(
        pruned_mt.GT, k=10, compute_loadings=True)
    pca_scores = pca_scores.annotate(
        known_pop=pruned_mt.cols()[pca_scores.s].known_pop)
    # mt = mt.annotate_cols(
    #    loadings=pca_loadings[mt_vqc_filtered.col_key].loadings)
    # mt = mt.annotate_cols(known_pop="unk")
    # pca_scores = pca_scores.annotate(known_pop="unk")
    pca_scores.write(
        f"{args.output_dir}/ddd-elgh-ukbb/pca_scores_after_pruning.ht",
        overwrite=True)
    pca_loadings.write(
        f"{args.output_dir}/ddd-elgh-ukbb/pca_loadings_after_pruning.ht",
        overwrite=True)
    with open(f"{args.output_dir}/ddd-elgh-ukbb/pca_evals_after_pruning.txt",
              'w') as f:
        for val in pca_evals:
            f.write(str(val) + "\n")

    logger.info("assign population pcs")

    pop_ht, pop_clf = assign_population_pcs(pca_scores,
                                            pca_scores.scores,
                                            known_col="known_pop",
                                            n_estimators=100,
                                            prop_train=0.8,
                                            min_prob=0.5)
    pop_ht.write(f"{args.output_dir}/ddd-elgh-ukbb/pop_assignments_updated.ht",
                 overwrite=True)
    pop_ht.export(
        f"{args.output_dir}/ddd-elgh-ukbb/pop_assignments_updated.tsv.gz")