def main(args):
    group = "raw"

    mt = hl.read_matrix_table(args.matrixtable)

    # Truthset
    truthset_ht = get_truth_ht(args.omni, args.mills, args.thousand_genomes,
                               args.hapmap)
    truthset_ht.write(f'{args.output_dir}/ddd-elgh-ukbb/truthset.ht',
                      overwrite=True)
    # Trio data
    # trio annotation:
    mt_adj = annotate_adj(mt)
    fam = args.trio_fam
    pedigree = hl.Pedigree.read(fam)
    trio_dataset = hl.trio_matrix(mt_adj, pedigree, complete_trios=True)
    trio_dataset.checkpoint(f'{args.output_dir}/ddd-elgh-ukbb/mt_trios_adj.mt',
                            overwrite=True)
    trio_stats_ht = generate_trio_stats(trio_dataset,
                                        autosomes_only=True,
                                        bi_allelic_only=True)
    trio_stats_ht.write(
        f'{args.output_dir}/ddd-elgh-ukbb/Sanger_cohorts_trios_stats.ht',
        overwrite=True)

    # inbreeding ht
    mt_inbreeding = mt.annotate_rows(
        InbreedingCoeff=bi_allelic_site_inbreeding_expr(mt.GT))

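    # Drop duplicated rows: key by locus only, keep one row per locus, then restore the full key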
    mt = mt.key_rows_by('locus').distinct_by_row().key_rows_by(
        'locus', 'alleles')

    ht_inbreeding = mt_inbreeding.rows()

    # allele data and qc_ac ht
    allele_data_ht = generate_allele_data(mt)

    qc_ac_ht = generate_ac(mt, fam)

    ht_inbreeding.write(
        f'{args.output_dir}/ddd-elgh-ukbb/Sanger_cohorts_inbreeding_new.ht',
        overwrite=True)
    qc_ac_ht.write(
        f'{args.output_dir}/ddd-elgh-ukbb/Sanger_cohorts_qc_ac_new.ht',
        overwrite=True)
    allele_data_ht.write(
        f'{args.output_dir}/ddd-elgh-ukbb/Sanger_cohorts_allele_data_new.ht',
        overwrite=True)
Example #2
def main(args):
    hl.init(log='/frequency_data_generation.log', default_reference='GRCh38')

    logger.info("Reading sparse MT and metadata table...")
    mt = get_gnomad_v3_mt(key_by_locus_and_alleles=True)
    meta_ht = meta.ht().select('pop', 'sex', 'project_id', 'release', 'sample_filters')

    if args.test:
        logger.info("Filtering to chr20:1-1000000")
        mt = hl.filter_intervals(mt, [hl.parse_locus_interval('chr20:1-1000000')])

    mt = hl.experimental.sparse_split_multi(mt, filter_changed_loci=True)

    logger.info("Annotating sparse MT with metadata...")
    mt = mt.annotate_cols(meta=meta_ht[mt.s])
    mt = mt.filter_cols(mt.meta.release)
    samples = mt.count_cols()
    logger.info(f"Running frequency table prep and generation pipeline on {samples} samples")

    logger.info("Computing adj and sex adjusted genotypes.")
    mt = mt.annotate_entries(
        GT=adjusted_sex_ploidy_expr(mt.locus, mt.GT, mt.meta.sex),
        adj=get_adj_expr(mt.GT, mt.GQ, mt.DP, mt.AD)
    )

    logger.info("Densify-ing...")
    mt = hl.experimental.densify(mt)
    mt = mt.filter_rows(hl.len(mt.alleles) > 1)

    logger.info("Setting het genotypes at sites with >1% AF (using v3.0 frequencies) and > 0.9 AB to homalt...")
    # hotfix for depletion of homozygous alternate genotypes
    # Using v3.0 AF to avoid an extra frequency calculation
    # TODO: Using previous callset AF works for small incremental changes to a callset, but we need to revisit for large increments
    freq_ht = freq.versions["3"].ht()
    freq_ht = freq_ht.select(AF=freq_ht.freq[0].AF)

    mt = mt.annotate_entries(
        GT=hl.if_else(
            (freq_ht[mt.row_key].AF > 0.01)
            & mt.GT.is_het()
            & (mt.AD[1] / mt.DP > 0.9),
            hl.call(1, 1),
            mt.GT,
        )
    )

    logger.info("Calculating InbreedingCoefficient...")
    # NOTE: This is not the ideal location to calculate this, but added here to avoid another densify
    mt = mt.annotate_rows(InbreedingCoeff=bi_allelic_site_inbreeding_expr(mt.GT))

    logger.info("Generating frequency data...")
    mt = annotate_freq(
        mt,
        sex_expr=mt.meta.sex,
        pop_expr=mt.meta.pop
    )

    # Select freq, FAF and popmax
    faf, faf_meta = faf_expr(mt.freq, mt.freq_meta, mt.locus, POPS_TO_REMOVE_FOR_POPMAX)
    mt = mt.select_rows(
        'InbreedingCoeff',
        'freq',
        faf=faf,
        popmax=pop_max_expr(mt.freq, mt.freq_meta, POPS_TO_REMOVE_FOR_POPMAX)
    )
    mt = mt.annotate_globals(faf_meta=faf_meta)

    # Annotate quality metrics histograms, as these also require densifying
    mt = mt.annotate_rows(
        **qual_hist_expr(mt.GT, mt.GQ, mt.DP, mt.AD)
    )

    logger.info("Writing out frequency data...")
    if args.test:
        mt.rows().write("gs://gnomad-tmp/gnomad_freq/chr20_1_1000000_freq.ht", overwrite=True)
    else:
        mt.rows().write(freq.path, overwrite=args.overwrite)
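A quick sanity check on the written output (a sketch; assumes the non-test branch ran, and relies on freq[0] being the adj entry as used above):

freq_ht = hl.read_table(freq.path)
freq_ht.describe()
# freq is an array of call stats; index 0 is adj and index 1 is raw (see freq_meta)
freq_ht.select(AF_adj=freq_ht.freq[0].AF).show(5)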
Example #3
def main(args):
    subsets = args.subsets
    hl.init(
        log=f"/generate_frequency_data{'.' + '_'.join(subsets) if subsets else ''}.log",
        default_reference="GRCh38",
    )

    invalid_subsets = []
    n_subsets_use_subpops = 0
    for s in subsets:
        if s not in SUBSETS:
            invalid_subsets.append(s)
        if s in COHORTS_WITH_POP_STORED_AS_SUBPOP:
            n_subsets_use_subpops += 1

    if invalid_subsets:
        raise ValueError(
            f"{', '.join(invalid_subsets)} subset(s) are not one of the following official subsets: {SUBSETS}"
        )
    if n_subsets_use_subpops & (n_subsets_use_subpops != len(subsets)):
        raise ValueError(
            f"All or none of the supplied subset(s) should be in the list of cohorts that need to use subpops instead "
            f"of pops in frequency calculations: {COHORTS_WITH_POP_STORED_AS_SUBPOP}"
        )

    try:
        logger.info("Reading full sparse MT and metadata table...")
        mt = get_gnomad_v3_mt(
            key_by_locus_and_alleles=True,
            release_only=not args.include_non_release,
            samples_meta=True,
        )

        if args.test:
            logger.info("Filtering to two partitions on chr20")
            mt = hl.filter_intervals(
                mt, [hl.parse_locus_interval("chr20:1-1000000")])
            mt = mt._filter_partitions(range(2))

        mt = hl.experimental.sparse_split_multi(mt, filter_changed_loci=True)

        if args.include_non_release:
            logger.info("Filtering MT columns to high quality samples")
            total_sample_count = mt.count_cols()
            mt = mt.filter_cols(mt.meta.high_quality)
            high_quality_sample_count = mt.count_cols()
            logger.info(
                f"Filtered {total_sample_count - high_quality_sample_count} low quality samples from the full set of "
                f"{total_sample_count} samples...")

        if subsets:
            mt = mt.filter_cols(hl.any([mt.meta.subsets[s] for s in subsets]))
            logger.info(
                f"Running frequency generation pipeline on {mt.count_cols()} samples in {', '.join(subsets)} subset(s)..."
            )
        else:
            logger.info(
                f"Running frequency generation pipeline on {mt.count_cols()} samples..."
            )

        logger.info("Computing adj and sex adjusted genotypes...")
        mt = mt.annotate_entries(
            GT=adjusted_sex_ploidy_expr(mt.locus, mt.GT,
                                        mt.meta.sex_imputation.sex_karyotype),
            adj=get_adj_expr(mt.GT, mt.GQ, mt.DP, mt.AD),
        )

        logger.info("Densify-ing...")
        mt = hl.experimental.densify(mt)
        mt = mt.filter_rows(hl.len(mt.alleles) > 1)

        # Temporary hotfix for depletion of homozygous alternate genotypes
        logger.info(
            "Setting het genotypes at sites with >1% AF (using v3.0 frequencies) and > 0.9 AB to homalt..."
        )
        # Load v3.0 allele frequencies to avoid an extra frequency calculation
        # NOTE: Using previous callset AF works for small incremental changes to a callset, but we will need to revisit for large increments
        freq_ht = get_freq(version="3").ht()
        freq_ht = freq_ht.select(AF=freq_ht.freq[0].AF)

        mt = mt.annotate_entries(GT=hl.if_else(
            (freq_ht[mt.row_key].AF > 0.01)
            & mt.GT.is_het()
            & (mt.AD[1] / mt.DP > 0.9),
            hl.call(1, 1),
            mt.GT,
        ))

        logger.info("Generating frequency data...")
        if subsets:
            mt = annotate_freq(
                mt,
                sex_expr=mt.meta.sex_imputation.sex_karyotype,
                pop_expr=mt.meta.population_inference.pop
                if not n_subsets_use_subpops else
                mt.meta.project_meta.project_subpop,
                # NOTE: TGP and HGDP labeled populations are highly specific and are stored in the project_subpop meta field
            )

            # NOTE: no FAFs or popmax needed for subsets
            mt = mt.select_rows("freq")

            logger.info(
                f"Writing out frequency data for {', '.join(subsets)} subset(s)..."
            )
            if args.test:
                mt.rows().write(
                    get_checkpoint_path(
                        f"chr20_test_freq.{'_'.join(subsets)}"),
                    overwrite=True,
                )
            else:
                mt.rows().write(get_freq(subset="_".join(subsets)).path,
                                overwrite=args.overwrite)

        else:
            logger.info("Computing age histograms for each variant...")
            mt = mt.annotate_cols(age=hl.if_else(
                hl.is_defined(mt.meta.project_meta.age),
                mt.meta.project_meta.age,
                mt.meta.project_meta.age_alt,
                # NOTE: most age data is stored as integers in 'age' annotation, but for a select number of samples, age is stored as a bin range and 'age_alt' corresponds to an integer in the middle of the bin
            ))
            mt = mt.annotate_rows(**age_hists_expr(mt.adj, mt.GT, mt.age))

            # Compute callset-wide age histogram global
            mt = mt.annotate_globals(age_distribution=mt.aggregate_cols(
                hl.agg.hist(mt.age, 30, 80, 10)))

            mt = annotate_freq(
                mt,
                sex_expr=mt.meta.sex_imputation.sex_karyotype,
                pop_expr=mt.meta.population_inference.pop,
                downsamplings=DOWNSAMPLINGS,
            )
            # Remove all loci with raw AC=0
            mt = mt.filter_rows(mt.freq[1].AC > 0)

            logger.info("Calculating InbreedingCoeff...")
            # NOTE: This is not the ideal location to calculate this, but added here to avoid another densify
            mt = mt.annotate_rows(
                InbreedingCoeff=bi_allelic_site_inbreeding_expr(mt.GT))

            logger.info("Computing filtering allele frequencies and popmax...")
            faf, faf_meta = faf_expr(mt.freq, mt.freq_meta, mt.locus,
                                     POPS_TO_REMOVE_FOR_POPMAX)
            mt = mt.select_rows(
                "InbreedingCoeff",
                "freq",
                faf=faf,
                popmax=pop_max_expr(mt.freq, mt.freq_meta,
                                    POPS_TO_REMOVE_FOR_POPMAX),
            )
            mt = mt.annotate_globals(
                faf_meta=faf_meta,
                faf_index_dict=make_faf_index_dict(faf_meta))
            mt = mt.annotate_rows(popmax=mt.popmax.annotate(
                faf95=mt.faf[mt.faf_meta.index(
                    lambda x: x.values() == ["adj", mt.popmax.pop])].faf95))

            logger.info("Annotating quality metrics histograms...")
            # NOTE: these are performed here as the quality metrics histograms also require densifying
            mt = mt.annotate_rows(
                qual_hists=qual_hist_expr(mt.GT, mt.GQ, mt.DP, mt.AD, mt.adj))
            ht = mt.rows()
            ht = ht.annotate(
                qual_hists=hl.Struct(
                    **{
                        i.replace("_adj", ""): ht.qual_hists[i]
                        for i in ht.qual_hists if "_adj" in i
                    }),
                raw_qual_hists=hl.Struct(**{
                    i: ht.qual_hists[i]
                    for i in ht.qual_hists if "_adj" not in i
                }),
            )

            logger.info("Writing out frequency data...")
            if args.test:
                ht.write(get_checkpoint_path("chr20_test_freq"),
                         overwrite=True)
            else:
                ht.write(get_freq().path, overwrite=args.overwrite)

    finally:
        logger.info("Copying hail log to logging bucket...")
        hl.copy_log(f"{qc_temp_prefix()}logs/")
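The age_distribution global written in the non-subset branch uses Hail's standard histogram struct; a sketch for inspecting it after a non-test run (resource paths as used above):

ht = get_freq().ht()
age_dist = ht.index_globals().age_distribution
# hl.agg.hist returns a struct with bin_edges, bin_freq, n_smaller and n_larger;
# here the histogram has 10 bins over ages 30-80
print(hl.eval(age_dist.bin_freq))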
Example #4
    # Trio data
    # trio annotation:
    mt_adj = annotate_adj(mt)
    fam = f"{project_dir}/data/annotation/samples/sample.complete_trios.wes50k.02022021.noheader.fam"
    pedigree = hl.Pedigree.read(fam)
    trio_dataset = hl.trio_matrix(mt_adj, pedigree, complete_trios=True)
    trio_dataset.checkpoint(f'{hdfs_dir}/chd_ukbb.trios.adj.mt',
                            overwrite=True)
    trio_stats_ht = generate_trio_stats(trio_dataset,
                                        autosomes_only=True,
                                        bi_allelic_only=True)
    trio_stats_ht.write(f'{hdfs_dir}/chd_ukbb.trios.stats.ht', overwrite=True)

    # inbreeding ht
    mt_inbreeding = mt.annotate_rows(
        InbreedingCoeff=bi_allelic_site_inbreeding_expr(mt.GT))

    mt = mt.key_rows_by('locus').distinct_by_row().key_rows_by(
        'locus', 'alleles')

    ht_inbreeding = mt_inbreeding.rows()

    # allele data and qc_ac ht
    allele_data_ht = generate_allele_data(mt)

    qc_ac_ht = generate_ac(mt, fam)

    ht_inbreeding.write(f'{hdfs_dir}/chd_ukbb.inbreeding.ht', overwrite=True)
    qc_ac_ht.write(f'{hdfs_dir}/chd_ukbb.qc_ac.ht', overwrite=True)
    allele_data_ht.write(f'{hdfs_dir}/chd_ukbb.allele_data.ht', overwrite=True)
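hl.Pedigree.read in these snippets expects a standard PLINK .fam file: six whitespace-delimited columns (family ID, individual ID, father ID, mother ID, sex coded 1=male/2=female, phenotype), with 0 for unknown parents. A minimal illustrative pedigree with hypothetical sample IDs:

FAM1  child01   father01  mother01  2  -9
FAM1  father01  0         0         1  -9
FAM1  mother01  0         0         2  -9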
Example #5
        def _initial_filter(data_type):
            """
            Get Table of CCDG variants passing desired filters.

            Possible filters are:
                - Autosomes only
                - SNVs only
                - gnomAD v3.1.2 AC filter
                - CCDG high quality exome intervals
                - UK Biobank high quality exome intervals

            After densification of the VDS, rows are annotated with:
                - ccdg_{data_type}_was_split
                - ccdg_{data_type}_AC
                - ccdg_{data_type}_AN

            The filtered and annotated rows are returned as a Table and are also checkpointed.

            :param data_type: Whether data is from genomes or exomes
            :return: Table of CCDG filtered variants
            """
            logger.info(
                "Loading CCDG %s VDS and splitting multi-allelics for initial filtering steps...",
                data_type,
            )
            vds = get_ccdg_vds(data_type, filter_washu=filter_washu)
            logger.info(
                f"{vds.variant_data.count_cols()} CCDG {data_type} samples loaded..."
            )
            vds = hl.vds.split_multi(vds)

            if autosomes_only:
                logger.info("Filtering CCDG %s VDS to autosomes...", data_type)
                vds = hl.vds.filter_chromosomes(vds, keep_autosomes=True)

            ht = vds.variant_data.rows()
            variant_filter_expr = True
            if snv_only:
                logger.info("Filtering CCDG %s VDS to SNVs...", data_type)
                variant_filter_expr &= hl.is_snp(ht.alleles[0], ht.alleles[1])

            if min_gnomad_v3_ac:
                logger.info(
                    "Filtering CCDG %s VDS to gnomAD v3.1.2 variants with adj-filtered AC > %d...",
                    data_type,
                    min_gnomad_v3_ac,
                )
                variant_filter_expr &= gnomad_ht[ht.key].gnomad_AC > min_gnomad_v3_ac

            vds = hl.vds.filter_variants(vds, ht.filter(variant_filter_expr), keep=True)

            if high_qual_ccdg_exome_interval_only:
                logger.info(
                    f"Filtering CCDG %s VDS to high quality (>80%% of samples with %dX coverage) CCDG exome intervals...",
                    data_type,
                    INTERVAL_DP,
                )
                interval_qc_ht = hl.read_table(
                    get_ccdg_results_path(
                        data_type="exomes", result=f"intervals_{INTERVAL_DP}x"
                    )
                )
                interval_qc_ht = interval_qc_ht.filter(interval_qc_ht.to_keep)
                vds = hl.vds.filter_intervals(
                    vds, intervals=interval_qc_ht.interval.collect(), keep=True
                )

            if high_qual_ukbb_exome_interval_only:
                if not autosomes_only:
                    raise ValueError(
                        "UK Biobank interval QC filtering is only available for autosomes!"
                    )

                logger.info(
                    "Filtering CCDG %s VDS to high quality (>80%% of samples with 20X coverage) UK Biobank exome intervals...",
                    data_type,
                )
                interval_qc_ht = hl.read_table(
                    ukbb_interval_qc_path("broad", 7, "autosomes")
                )  # Note: freeze 7 is all included in gnomAD v4
                interval_qc_ht = interval_qc_ht.filter(
                    interval_qc_ht["pct_samples_20x"] > pct_samples_ukbb_exome_interval
                )
                vds = hl.vds.filter_intervals(
                    vds, intervals=interval_qc_ht.interval.collect(), keep=True
                )

            logger.info("Densifying filtered CCDG %s VDS...", data_type)
            mt = hl.vds.to_dense_mt(vds)
            if adj_only:
                mt = filter_to_adj(mt)

            annotation_expr = {
                f"ccdg_{data_type}_was_split": mt.was_split,
                f"ccdg_{data_type}_AC": hl.agg.sum(mt.GT.n_alt_alleles()),
                f"ccdg_{data_type}_AN": hl.agg.count_where(hl.is_defined(mt.GT)) * 2,
            }

            if min_inbreeding_coeff_threshold is not None:
                annotation_expr[
                    f"ccdg_{data_type}_site_inbreeding_coeff"
                ] = bi_allelic_site_inbreeding_expr(mt.GT)
            if min_hardy_weinberg_threshold is not None:
                annotation_expr[f"ccdg_{data_type}_hwe"] = hl.agg.hardy_weinberg_test(
                    mt.GT
                )

            mt = mt.annotate_rows(**annotation_expr)
            ht = mt.rows().checkpoint(
                get_ccdg_results_path(
                    data_type=data_type,
                    mt=False,
                    result=f"pre_filtered_variants_interval{INTERVAL_DP}x{flag}",
                ),
                overwrite=(not read_per_dataset_checkpoint_if_exists),
                _read_if_exists=read_per_dataset_checkpoint_if_exists,
            )

            return ht
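A sketch of how the enclosing function presumably invokes this helper; the per-data-type calls follow from the signature, while the outer join on the variant key is an assumption based on the ccdg_{data_type}_* annotations it produces:

ccdg_exomes_ht = _initial_filter("exomes")
ccdg_genomes_ht = _initial_filter("genomes")
# Combine exome and genome rows so each variant carries both annotation sets
joined_ht = ccdg_exomes_ht.join(ccdg_genomes_ht, how="outer")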
Example #6
def filter_rows_for_qc(
    mt: hl.MatrixTable,
    min_af: Optional[float] = 0.001,
    min_callrate: Optional[float] = 0.99,
    min_inbreeding_coeff_threshold: Optional[float] = -0.8,
    min_hardy_weinberg_threshold: Optional[float] = 1e-8,
    apply_hard_filters: bool = True,
    bi_allelic_only: bool = True,
    snv_only: bool = True,
) -> hl.MatrixTable:
    """
    Annotates rows with `site_callrate`, `site_inbreeding_coeff` and `af`, then applies thresholds.
    AF and callrate thresholds are taken from gnomAD QC; inbreeding coeff, MQ, FS and QD filters are taken from GATK best practices.

    .. note::

        This function expects the typical ``info`` annotation of type struct with fields ``MQ``, ``FS`` and ``QD``
        if applying hard filters.

    :param mt: Input MT
    :param min_af: Minimum site AF to keep. Not applied if set to ``None``.
    :param min_callrate: Minimum site call rate to keep. Not applied if set to ``None``.
    :param min_inbreeding_coeff_threshold: Minimum site inbreeding coefficient to keep. Not applied if set to ``None``.
    :param min_hardy_weinberg_threshold: Minimum site HW test p-value to keep. Not applied if set to ``None``.
    :param apply_hard_filters: Whether to apply standard GATK default site hard filters: QD >= 2, FS <= 60 and MQ >= 30
    :param bi_allelic_only: Whether to only keep bi-allelic sites or include multi-allelic sites too
    :param snv_only: Whether to only keep SNVs or include other variant types
    :return: annotated and filtered table
    """
    annotation_expr = {}

    if min_af is not None:
        annotation_expr["af"] = hl.agg.mean(mt.GT.n_alt_alleles()) / 2
    if min_callrate is not None:
        annotation_expr["site_callrate"] = hl.agg.fraction(hl.is_defined(
            mt.GT))
    if min_inbreeding_coeff_threshold is not None:
        annotation_expr[
            "site_inbreeding_coeff"] = bi_allelic_site_inbreeding_expr(mt.GT)
    if min_hardy_weinberg_threshold is not None:
        annotation_expr["hwe"] = hl.agg.hardy_weinberg_test(mt.GT)

    if annotation_expr:
        mt = mt.annotate_rows(**annotation_expr)

    filter_expr = []
    if min_af is not None:
        filter_expr.append((mt.af > min_af))
    if min_callrate is not None:
        filter_expr.append((mt.site_callrate > min_callrate))
    if min_inbreeding_coeff_threshold is not None:
        filter_expr.append(
            (mt.site_inbreeding_coeff > min_inbreeding_coeff_threshold))
    if min_hardy_weinberg_threshold is not None:
        filter_expr.append((mt.hwe.p_value > min_hardy_weinberg_threshold))
    if snv_only:
        filter_expr.append(hl.is_snp(mt.alleles[0], mt.alleles[1]))
    if bi_allelic_only:
        filter_expr.append(bi_allelic_expr(mt))

    if apply_hard_filters:
        if "info" in mt.row_value:
            if "QD" in mt.info:
                filter_expr.append((mt.info.QD >= 2))
            else:
                logger.warning(
                    "Could not apply QD hard filter, as `info.QD` not found in schema."
                )
            if "FS" in mt.info:
                filter_expr.append((mt.info.FS <= 60))
            else:
                logger.warning(
                    "Could not apply FS hard filter, as `info.FS` not found in schema."
                )
            if "MQ" in mt.info:
                filter_expr.append((mt.info.MQ >= 30))
            else:
                logger.warning(
                    "Could not apply MQ hard filter, as `info.MQ` not found in schema."
                )
        else:
            logger.warning(
                "Could not apply hard filters as `info` not found in schema.")

    return mt.filter_rows(functools.reduce(operator.iand, filter_expr))
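A usage sketch for filter_rows_for_qc (assumes functools, operator and hail are imported, and that `mt` is a split MatrixTable with GT entries plus an `info` row struct carrying QD, FS and MQ; the thresholds shown are the defaults):

qc_mt = filter_rows_for_qc(
    mt,
    min_af=0.001,
    min_callrate=0.99,
    min_inbreeding_coeff_threshold=-0.8,
    min_hardy_weinberg_threshold=1e-8,
    apply_hard_filters=True,
)
print(qc_mt.count_rows())

Example #7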
import hail as hl

from gnomad.utils.annotations import bi_allelic_site_inbreeding_expr
from gnomad.variant_qc.pipeline import train_rf_model
from hail_init import DEFAULT_REF

# Variant Quality hard filters
INBR_COEFF = -0.3
AB_LOWER_LIM = 0.2
AB_UPPER_LIM = 1 - AB_LOWER_LIM

# Read MatrixTable with sample QC-passing dataset
mt = hl.read_matrix_table("sampleqc_pass.mt")

# Calculate variant statistics
mt = hl.variant_qc(mt)

# Calculate inbreeding coefficient
mt = mt.annotate_rows(inbr_coeff=bi_allelic_site_inbreeding_expr(mt.GT))

# Determine the maximum p-value for sampling the observed allele balance under a binomial model
mt = mt.annotate_rows(
    pab_max=hl.agg.max(hl.binom_test(mt.AD[1], mt.DP, 0.5, "two-sided")))

# Removing variants with an excess of heterozygotes
mt = mt.filter_rows(mt.inbr_coeff > INBR_COEFF)

# Removing variants for which no sample had high quality genotypes
mt = mt.filter_rows(hl.agg.any(mt.GQ >= 20))
mt = mt.filter_rows(hl.agg.any(mt.DP >= 10))

mt = mt.annotate_entries(AB=(mt.AD[1] / hl.sum(mt.AD)))

# Keep variants where at least one heterozygous call shows a balanced allele
# ratio (assumed completion: the source line was cut off, and these are the only
# uses of the AB_LOWER_LIM / AB_UPPER_LIM constants defined above)
mt = mt.filter_rows(
    hl.agg.any(mt.GT.is_het() & (mt.AB >= AB_LOWER_LIM) & (mt.AB <= AB_UPPER_LIM)))
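Example #8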
def main(args):
    group = "raw"

    mt = hl.read_matrix_table(args.matrixtable)

    mt = hl.variant_qc(mt)

    # Truthset
    truthset_ht = get_truth_ht(args.omni, args.mills, args.thousand_genomes,
                               args.hapmap)
    truthset_ht.write(f'{args.output_dir}/variant_qc/truthset_table.ht',
                      overwrite=True)
    # Trio data
    # trio annotation:
    logger.info("Trio annotation and writing trios_adj.mt")
    mt_adj = annotate_adj(mt)
    fam = args.trio_fam
    pedigree = hl.Pedigree.read(fam)
    trio_dataset = hl.trio_matrix(mt_adj, pedigree, complete_trios=True)
    trio_dataset.checkpoint(
        f'{args.output_dir}/variant_qc/MegaWES_trios_adj.mt', overwrite=True)
    logger.info("Trio stats and writing MegaWes_stats.ht")
    trio_stats_ht = generate_trio_stats(trio_dataset,
                                        autosomes_only=True,
                                        bi_allelic_only=True)
    trio_stats_ht.write(f'{args.output_dir}/variant_qc/MegaWES_stats.ht',
                        overwrite=True)

    # inbreeding ht
    mt_inbreeding = mt.annotate_rows(
        InbreedingCoeff=bi_allelic_site_inbreeding_expr(mt.GT))

    mt = mt.key_rows_by('locus').distinct_by_row().key_rows_by(
        'locus', 'alleles')

    ht_inbreeding = mt_inbreeding.rows()

    # allele data and qc_ac ht
    allele_data_ht = generate_allele_data(mt)

    qc_ac_ht = generate_ac(mt, fam)

    logger.info("Writing tables for inbreeding, allele counts")
    ht_inbreeding.write(
        f'{args.output_dir}/variant_qc/MegaWES_inbreeding_new.ht',
        overwrite=True)
    qc_ac_ht.write(f'{args.output_dir}/variant_qc/MegaWES_qc_ac_new.ht',
                   overwrite=True)
    allele_data_ht.write(
        f'{args.output_dir}/variant_qc/MegaWES_allele_data_new.ht',
        overwrite=True)

    # Trio matrix table
    logger.info("Split multi allelic variants and write mt")
    mt = hl.split_multi_hts(mt,
                            keep_star=False,
                            left_aligned=False,
                            permit_shuffle=True)
    mt = mt.checkpoint(
        f'{args.output_dir}/variant_qc/MegaWESSanger_cohorts_sampleQC_filtered_split.mt',
        overwrite=True)
    fam = args.trio_fam
    pedigree = hl.Pedigree.read(fam)
    logger.info("Trio matrixtable generation:")
    trio_dataset = hl.trio_matrix(mt, pedigree, complete_trios=True)
    trio_dataset.write(f'{args.output_dir}/variant_qc/MegaWES_trio_table.mt',
                       overwrite=True)

    # Family stats
    logger.info("Family stats")
    (ht1, famstats_ht) = generate_family_stats(mt, fam)
    logger.info("Writing mt and family stats ht")
    ht1.write(f'{args.output_dir}/variant_qc/MegaWES_family_stats.ht',
              overwrite=True)

    mt = mt.annotate_rows(family_stats=ht1[mt.row_key].family_stats)
    mt = mt.checkpoint(f'{args.output_dir}/variant_qc/MegaWES_family_stats.mt',
                       overwrite=True)

    # Family stats with allele frequencies from gnomAD
    logger.info("Family stats with gnomAD AF")
    priors = hl.read_table(args.priors)
    mt = mt.annotate_rows(gnomad_maf=priors[mt.row_key].maf)
    mt = mt.checkpoint(
        f'{args.output_dir}/variant_qc/MegaWES_family_stats_gnomad_AF.mt',
        overwrite=True)

    logger.info("De novo table cration")
    #De novo table
    de_novo_table = hl.de_novo(mt, pedigree, mt.gnomad_maf)

    de_novo_table = de_novo_table.key_by(
        'locus', 'alleles').collect_by_key('de_novo_data')
    de_novo_table.write(
        f'{args.output_dir}/variant_qc/MegaWES_denovo_table.ht',
        overwrite=True)
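A quick way to inspect the final de novo output after the run (a sketch; the path mirrors the write above):

dn_ht = hl.read_table(f'{args.output_dir}/variant_qc/MegaWES_denovo_table.ht')
dn_ht.describe()
# one row per variant; 'de_novo_data' holds the collected per-trio de novo calls
print(dn_ht.count())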