def generate_ac(mt: hl.MatrixTable) -> hl.Table:
    """
    Build a Table of alternate allele counts per variant for several sample subsets.

    Columns are first restricted to samples flagged `meta.high_quality`, rows to
    sites with more than one allele, and the result is passed through `annotate_adj`
    so that the `adj` entry field can be used as a filter.

    The returned Table carries the following row annotations:
        - `ac_qc_samples_raw`: Allele count of high quality samples
        - `ac_qc_samples_unrelated_raw`: Allele count of high quality unrelated samples
        - `ac_release_samples_raw`: Allele count of release samples
        - `ac_qc_samples_adj`: Allele count of high quality samples after adj filtering
        - `ac_qc_samples_unrelated_adj`: Allele count of high quality unrelated samples after adj filtering
        - `ac_release_samples_adj`: Allele count of release samples after adj filtering

    :param mt: Input MatrixTable
    :return: Table containing allele counts
    """
    mt = mt.filter_cols(mt.meta.high_quality)
    mt = mt.filter_rows(hl.len(mt.alleles) > 1)
    mt = annotate_adj(mt)

    unrelated = ~mt.meta.sample_filters.all_samples_related
    release = mt.meta.release

    def _alt_allele_count(keep=None):
        # Sum of alt alleles over the row's entries, optionally limited to
        # the entries for which `keep` is true.
        total = hl.agg.sum(mt.GT.n_alt_alleles())
        if keep is None:
            return total
        return hl.agg.filter(keep, total)

    # Insertion order is kept so the output schema lists raw counts before adj counts.
    allele_counts = {
        "ac_qc_samples_raw": _alt_allele_count(),
        "ac_qc_samples_unrelated_raw": _alt_allele_count(unrelated),
        "ac_release_samples_raw": _alt_allele_count(release),
        "ac_qc_samples_adj": _alt_allele_count(mt.adj),
        "ac_qc_samples_unrelated_adj": _alt_allele_count(unrelated & mt.adj),
        "ac_release_samples_adj": _alt_allele_count(release & mt.adj),
    }
    return mt.annotate_rows(**allele_counts).rows()
Exemple #2
0
def generate_ac(mt: hl.MatrixTable, fam_file: str) -> hl.Table:
    """
    Build a Table of raw and adj alternate allele counts per variant.

    Counts are produced for three sample subsets: all QC (high quality) samples,
    QC samples not flagged `meta.all_samples_related`, and release samples.

    :param mt: Input MatrixTable
    :param fam_file: Path to a tab-delimited fam file, loaded with `hl.import_fam`
    :return: Table containing allele counts
    """
    mt = mt.filter_cols(mt.meta.high_quality)

    # NOTE(review): `unrelated_sample` is annotated on the columns but none of the
    # aggregations below read it (they use `meta.all_samples_related`) -- confirm
    # whether it was meant to drive the "unrelated" counts.
    pedigree_ht = hl.import_fam(fam_file, delimiter="\t")
    mt = mt.annotate_cols(unrelated_sample=hl.is_missing(pedigree_ht[mt.s]))

    mt = mt.filter_rows(hl.len(mt.alleles) > 1)
    mt = annotate_adj(mt)

    unrelated = ~mt.meta.all_samples_related
    release = mt.meta.release

    def _alt_allele_count(keep=None):
        # Sum of alt alleles over the row's entries, optionally limited to
        # the entries for which `keep` is true.
        total = hl.agg.sum(mt.GT.n_alt_alleles())
        if keep is None:
            return total
        return hl.agg.filter(keep, total)

    allele_counts = {
        "ac_qc_samples_raw": _alt_allele_count(),
        "ac_qc_samples_unrelated_raw": _alt_allele_count(unrelated),
        "ac_release_samples_raw": _alt_allele_count(release),
        "ac_qc_samples_adj": _alt_allele_count(mt.adj),
        "ac_qc_samples_unrelated_adj": _alt_allele_count(unrelated & mt.adj),
        "ac_release_samples_adj": _alt_allele_count(release & mt.adj),
    }
    return mt.annotate_rows(**allele_counts).rows()
Exemple #3
0
def generate_sib_stats(
    mt: hl.MatrixTable,
    relatedness_ht: hl.Table,
    i_col: str = "i",
    j_col: str = "j",
    relationship_col: str = "relationship",
    autosomes_only: bool = True,
    bi_allelic_only: bool = True,
) -> hl.Table:
    """
    Default wrapper around `generate_sib_stats_expr`.

    Returns a hail Table with counts of variants shared by the pairs of siblings
    found in `relatedness_ht`.

    `relatedness_ht` has one row per pair of samples i,j (unrelated pairs may be
    present as well). Only rows whose `relationship_col` equals `SIBLINGS` are
    used, and `mt` is restricted to the samples appearing in those rows. If `mt`
    has no `adj` entry field it is added with `annotate_adj`.

    .. note::

        By default this pipeline function will filter `mt` to only autosomes and bi-allelic sites.

    :param mt: Input Matrix table
    :param relatedness_ht: Input relationship table
    :param i_col: Column containing the 1st sample of the pair in the relationship table
    :param j_col: Column containing the 2nd sample of the pair in the relationship table
    :param relationship_col: Column containing the relationship for the sample pair as defined in this module constants.
    :param autosomes_only: If set, only autosomal intervals are used.
    :param bi_allelic_only: If set, only bi-allelic sites are used for the computation
    :return: A Table with the sibling shared variant counts
    """
    if autosomes_only:
        mt = filter_to_autosomes(mt)
    if bi_allelic_only:
        mt = mt.filter_rows(bi_allelic_expr(mt))

    is_sibling_pair = relatedness_ht[relationship_col] == SIBLINGS
    sib_ht = relatedness_ht.filter(is_sibling_pair)

    # Collect every sample id seen on either side of a sibling pair; the set is
    # kept as a Hail expression (`_localize=False`) rather than a Python value.
    pair_members = [sib_ht[i_col].s, sib_ht[j_col].s]
    s_to_keep = sib_ht.aggregate(
        hl.agg.explode(
            lambda sample: hl.agg.collect_as_set(sample),
            pair_members,
        ),
        _localize=False,
    )
    mt = mt.filter_cols(s_to_keep.contains(mt.s))

    if "adj" not in mt.entry:
        mt = annotate_adj(mt)

    strata = {"raw": True, "adj": mt.adj}
    sib_stats_expr = generate_sib_stats_expr(
        mt,
        sib_ht,
        i_col=i_col,
        j_col=j_col,
        strata=strata,
    )
    return mt.select_rows(**sib_stats_expr).rows()
Exemple #4
0
def filter_to_adj(mt: hl.MatrixTable) -> hl.MatrixTable:
    """
    Keep only the entries that pass the adj criteria.

    If `mt` has no `adj` entry field it is first added with `annotate_adj`.
    The `adj` field is dropped from the returned MatrixTable.

    :param mt: Input MatrixTable
    :return: MatrixTable filtered to adj entries, without the `adj` field
    """
    has_adj = "adj" in list(mt.entry)
    if not has_adj:
        mt = annotate_adj(mt)

    adj_only_mt = mt.filter_entries(mt.adj)
    return adj_only_mt.drop(adj_only_mt.adj)
Exemple #5
0
def main(args):
    """
    Write the hardcalls and non-ref-only gnomAD MatrixTables selected by `args`.

    Each of the four steps (`write_hardcalls`, `split_hardcalls`, `write_nonrefs`,
    `split_nonrefs`) runs only when the matching flag on `args` is set. The data
    type is 'genomes' when `args.genomes` is truthy and 'exomes' otherwise.

    :param args: Parsed command-line arguments
    """
    hl.init()

    data_type = 'exomes' if not args.genomes else 'genomes'

    if args.write_hardcalls:
        mt = get_gnomad_data(data_type, split=False, raw=True, meta_root=None)
        hard_filters_ht = hl.read_table(qc_ht_path(data_type, 'hard_filters'))

        # Keep only the sex column annotation, then add adj.
        sex_expr = hard_filters_ht[hl.literal(data_type), mt.s].sex
        mt = annotate_adj(mt.select_cols(sex=sex_expr))

        # Use PGT as the genotype when its unphased form equals GT; otherwise
        # (including when the comparison is missing) keep GT.
        pgt_matches_gt = hl.call(mt.PGT[0], mt.PGT[1]) == mt.GT
        hardcall_gt = (
            hl.case(missing_false=True)
            .when(pgt_matches_gt, mt.PGT)
            .default(mt.GT)
        )
        mt = mt.select_entries(GT=hardcall_gt, PID=mt.PID, adj=mt.adj)

        mt = adjust_sex_ploidy(mt, mt.sex)
        mt = mt.select_cols().naive_coalesce(10000)
        hardcalls_path = get_gnomad_data_path(
            data_type, hardcalls=True, split=False)
        mt.write(hardcalls_path, args.overwrite)

    if args.split_hardcalls:
        mt = get_gnomad_data(data_type, split=False, meta_root=None)
        mt = hl.split_multi_hts(mt)
        split_hardcalls_path = get_gnomad_data_path(
            data_type, hardcalls=True, split=True)
        mt.write(split_hardcalls_path, args.overwrite)

    if args.write_nonrefs:  # CPU-hours: 600 (E)
        mt = get_gnomad_data(
            data_type, split=False, raw=True, meta_root=None)
        mt = mt.select_cols()
        mt = mt.annotate_entries(is_missing=hl.is_missing(mt.GT))
        # Keep entries whose GT is missing or non-reference.
        mt = mt.filter_entries(mt.is_missing | mt.GT.is_non_ref())
        mt = annotate_adj(mt)
        # NOTE(review): this reads `args.exomes`, whereas `data_type` above is
        # derived from `args.genomes` -- confirm both flags exist on `args`.
        if args.exomes:
            mt = mt.naive_coalesce(10000)
        nonrefs_path = get_gnomad_data_path(
            data_type, split=False, non_refs_only=True)
        mt.write(nonrefs_path, args.overwrite)

    if args.split_nonrefs:  # CPU-hours: 300 (E)
        mt = get_gnomad_data(data_type, split=False, non_refs_only=True)
        mt = hl.split_multi_hts(mt)
        # Re-apply the missing-or-non-ref entry filter after splitting.
        mt = mt.filter_entries(mt.is_missing | mt.GT.is_non_ref())
        split_nonrefs_path = get_gnomad_data_path(
            data_type, split=True, non_refs_only=True)
        mt.write(split_nonrefs_path, args.overwrite)
def generate_fam_stats(mt: hl.MatrixTable, fam_file: str) -> hl.Table:
    """
    Compute transmission and de novo mutation statistics from the trios in the dataset.

    The MatrixTable is restricted to samples listed in the pedigree, filtered to
    autosomes, annotated with adj, densified and restricted to rows with exactly
    two alleles before the trio matrix is built. Stats are produced for a `raw`
    stratum and for an `adj` stratum (all three trio members pass adj). Rows for
    which the raw de novo, transmitted and untransmitted counts sum to zero are
    removed from the result.

    :param mt: Input MatrixTable
    :param fam_file: path to text file containing trio pedigree
    :return: Table containing trio stats
    """
    # Load Pedigree data and derive the set of samples present in any of the trios
    ped = hl.Pedigree.read(fam_file, delimiter="\t")
    fam_ht = hl.import_fam(fam_file, delimiter="\t")
    trio_members = [fam_ht.id, fam_ht.pat_id, fam_ht.mat_id]
    trio_samples_ht = (
        fam_ht.annotate(fam_members=trio_members)
        .explode('fam_members', name='s')
        .key_by('s')
        .select()
        .distinct()
    )

    mt = mt.filter_cols(hl.is_defined(trio_samples_ht[mt.col_key]))
    logger.info(f"Generating family stats using {mt.count_cols()} samples from {len(ped.trios)} trios.")

    mt = filter_to_autosomes(mt)
    mt = annotate_adj(mt)
    mt = mt.select_entries('GT', 'GQ', 'AD', 'END', 'adj')
    mt = hl.experimental.densify(mt)
    mt = mt.filter_rows(hl.len(mt.alleles) == 2)
    mt = hl.trio_matrix(mt, pedigree=ped, complete_trios=True)

    # adj stratum: proband, father and mother entries must all pass adj.
    trio_adj = (
        mt.proband_entry.adj
        & mt.father_entry.adj
        & mt.mother_entry.adj
    )
    trio_stats_expr = generate_trio_stats_expr(
        mt,
        transmitted_strata={'raw': True, 'adj': trio_adj},
        de_novo_strata={'raw': True, 'adj': trio_adj},
        proband_is_female_expr=mt.is_female,
    )
    ht = mt.select_rows(**trio_stats_expr).rows()

    n_raw_events = (
        ht.n_de_novos_raw + ht.n_transmitted_raw + ht.n_untransmitted_raw
    )
    return ht.filter(n_raw_events > 0)
def main(args):
    """
    Generate the variant-QC annotation tables for the MatrixTable at `args.matrixtable`.

    Writes, under `{args.output_dir}/ddd-elgh-ukbb/`: the truth-set Table, the
    adj-annotated trio MatrixTable, the trio stats Table, the inbreeding Table,
    the QC allele-count Table and the allele-data Table.

    :param args: Parsed arguments; this function reads `matrixtable`, `onmi`,
        `mills`, `thousand_genomes`, `hapmap`, `trio_fam` and `output_dir`.
    """
    mt = hl.read_matrix_table(args.matrixtable)

    # Truthset
    # NOTE(review): `args.onmi` looks like a typo for `omni`; it is kept unchanged
    # because the argument parser is not visible from here -- confirm.
    truthset_ht = get_truth_ht(args.onmi, args.mills, args.thousand_genomes,
                               args.hapmap)
    truthset_ht.write(f'{args.output_dir}/ddd-elgh-ukbb/truthset.ht',
                      overwrite=True)

    # Trio data
    # trio annotation:
    mt_adj = annotate_adj(mt)
    fam = args.trio_fam
    pedigree = hl.Pedigree.read(fam)
    trio_dataset = hl.trio_matrix(mt_adj, pedigree, complete_trios=True)
    # `checkpoint` returns the dataset read back from disk; keep that handle so
    # the trio stats below reuse the written data instead of recomputing the
    # whole trio matrix from `mt`.
    trio_dataset = trio_dataset.checkpoint(
        f'{args.output_dir}/ddd-elgh-ukbb/mt_trios_adj.mt',
        overwrite=True)
    trio_stats_ht = generate_trio_stats(trio_dataset,
                                        autosomes_only=True,
                                        bi_allelic_only=True)
    trio_stats_ht.write(
        f'{args.output_dir}/ddd-elgh-ukbb/Sanger_cohorts_trios_stats.ht',
        overwrite=True)

    # inbreeding ht (computed on `mt` before the per-locus de-duplication below)
    mt_inbreeding = mt.annotate_rows(
        InbreedingCoeff=bi_allelic_site_inbreeding_expr(mt.GT))

    # Keep a single row per locus, then restore the (locus, alleles) row key.
    mt = mt.key_rows_by('locus').distinct_by_row().key_rows_by(
        'locus', 'alleles')

    ht_inbreeding = mt_inbreeding.rows()

    # allele data and qc_ac ht
    allele_data_ht = generate_allele_data(mt)

    qc_ac_ht = generate_ac(mt, fam)

    ht_inbreeding.write(
        f'{args.output_dir}/ddd-elgh-ukbb/Sanger_cohorts_inbreeding_new.ht',
        overwrite=True)
    qc_ac_ht.write(
        f'{args.output_dir}/ddd-elgh-ukbb/Sanger_cohorts_qc_ac_new.ht',
        overwrite=True)
    allele_data_ht.write(
        f'{args.output_dir}/ddd-elgh-ukbb/Sanger_cohorts_allele_data_new.ht',
        overwrite=True)
Exemple #8
0
def generate_fam_stats(mt: hl.MatrixTable, fam_file: str) -> hl.Table:
    """
    Compute transmission and de novo mutation statistics from the trios in the dataset.

    Besides the `raw` and `adj` strata, de novo stats get an additional `hq`
    stratum: the trio passes adj, both parents have GQ >= 30, both parents have
    AD[0] + AD[1] > 20, and both parents have AD[1] == 0. Rows for which the raw
    de novo, transmitted and untransmitted counts sum to zero are removed.

    :param mt: Input MatrixTable
    :param fam_file: Path to a tab-delimited text file containing the trio pedigree
    :return: Table containing trio stats
    """
    # Load Pedigree data and derive the set of samples present in any of the trios
    ped = hl.Pedigree.read(fam_file, delimiter="\t")
    fam_ht = hl.import_fam(fam_file, delimiter="\t")
    trio_members = [fam_ht.id, fam_ht.pat_id, fam_ht.mat_id]
    trio_samples_ht = (
        fam_ht.annotate(fam_members=trio_members)
        .explode('fam_members', name='s')
        .key_by('s')
        .select()
        .distinct()
    )

    mt = mt.filter_cols(hl.is_defined(trio_samples_ht[mt.col_key]))
    logger.info(
        f"Generating family stats using {mt.count_cols()} samples from {len(ped.trios)} trios."
    )

    mt = filter_to_autosomes(mt)
    mt = annotate_adj(mt)
    mt = mt.select_entries('GT', 'GQ', 'AD', 'END', 'adj')
    mt = hl.experimental.densify(mt)
    mt = mt.filter_rows(hl.len(mt.alleles) == 2)
    mt = hl.trio_matrix(mt, pedigree=ped, complete_trios=True)

    mother = mt.mother_entry
    father = mt.father_entry

    trio_adj = mt.proband_entry.adj & father.adj & mother.adj
    parents_no_alt = (mother.AD[1] == 0) & (father.AD[1] == 0)
    parents_high_depth = (
        (mother.AD[0] + mother.AD[1] > 20)
        & (father.AD[0] + father.AD[1] > 20)
    )
    parents_high_gq = (mother.GQ >= 30) & (father.GQ >= 30)

    transmitted_strata = {'raw': None, 'adj': trio_adj}
    de_novo_strata = {
        'raw': None,
        'adj': trio_adj,
        'hq': trio_adj & parents_high_gq & parents_high_depth & parents_no_alt,
    }
    trio_stats_expr = generate_trio_stats_expr(
        mt,
        transmitted_strata=transmitted_strata,
        de_novo_strata=de_novo_strata,
        proband_is_female_expr=mt.is_female,
    )
    ht = mt.select_rows(**trio_stats_expr).rows()

    n_raw_events = (
        ht.n_de_novos_raw + ht.n_transmitted_raw + ht.n_untransmitted_raw
    )
    return ht.filter(n_raw_events > 0)
Exemple #9
0
def filter_mt_to_trios(mt: hl.MatrixTable, fam_ht: hl.Table) -> hl.MatrixTable:
    """
    Restrict a MatrixTable to the samples of the trios in `fam_ht` and make sure it has `adj`.

    Every id, paternal id and maternal id listed in `fam_ht` is kept. The `adj`
    entry annotation is added with `annotate_adj` only when it is not already
    present.

    :param mt: A Matrix Table to filter to only trios
    :param fam_ht: A Table of trios to filter to, loaded using `hl.import_fam`
    :return: A MT filtered to trios and adj annotated
    """
    # One row per distinct sample id appearing anywhere in a trio.
    trio_members = [fam_ht.id, fam_ht.pat_id, fam_ht.mat_id]
    trio_samples_ht = (
        fam_ht.annotate(fam_members=trio_members)
        .explode("fam_members", name="s")
        .key_by("s")
        .select()
        .distinct()
    )

    in_a_trio = hl.is_defined(trio_samples_ht[mt.col_key])
    trio_mt = mt.filter_cols(in_a_trio)

    if "adj" in trio_mt.entry:
        return trio_mt
    return annotate_adj(trio_mt)
Exemple #10
0
def query(output):
    """
    Query script entry point.

    Reads the MatrixTable at `GNOMAD_HGDP_1KG_MT`, keeps adj entries, restricts
    rows to bi-allelic variants with alt allele frequency above 0.05, LD-prunes
    them and writes the result to `{output}/filtered_mt.mt`.

    :param output: Output directory prefix
    """
    hl.init(default_reference='GRCh38')

    mt = hl.read_matrix_table(GNOMAD_HGDP_1KG_MT)

    # reproduce gnomAD genotype filtering
    mt = annotate_adj(mt)
    mt = mt.filter_entries(mt.adj)
    mt = hl.variant_qc(mt)

    # Filter to common and biallelic variants
    is_biallelic = hl.len(mt.alleles) == 2
    is_common = mt.variant_qc.AF[1] > 0.05
    mt = mt.filter_rows(is_biallelic & is_common)

    pruned_variant_table = hl.ld_prune(mt.GT, r2=0.2, bp_window_size=500000)
    survives_pruning = hl.is_defined(pruned_variant_table[mt.row_key])
    filtered_mt = mt.filter_rows(survives_pruning)

    # save filtered mt table
    mt_path = f'{output}/filtered_mt.mt'
    filtered_mt.write(mt_path, overwrite=True)
Exemple #11
0
    # NOTE(review): the enclosing function header is not visible here. Input paths
    # below are built from `temp_dir` while the outputs use `tmp_dir`; both names
    # must come from the surrounding scope -- confirm they are intentionally two
    # different directories and not a typo.

    # TRAINING SETS: 1000 Genomes high-confidence SNPs and HapMap tables.
    thousand_genomes = f'{temp_dir}/ddd-elgh-ukbb/training_sets/1000G_phase1.snps.high_confidence.hg38.ht'
    thousand_genomes_ht = hl.read_table(thousand_genomes)
    hapmap = f'{temp_dir}/ddd-elgh-ukbb/training_sets/hapmap_3.3.hg38.ht'
    hapmap_ht = hl.read_table(hapmap)
    # ANNOTATION TABLES:
    truth_data_ht = hl.read_table(
        f'{temp_dir}/ddd-elgh-ukbb/variant_qc/truthset_table.ht')
    trio_stats_table = hl.read_table(
        f'{temp_dir}/ddd-elgh-ukbb/variant_qc/Sanger_cohorts_trios_stats.ht')
    #inbreeding_ht = hl.read_table(f'{temp_dir}/ddd-elgh-ukbb/variant_qc/Sanger_cohorts_inbreeding.ht')
    allele_data_ht = hl.read_table(
        f'{temp_dir}/ddd-elgh-ukbb/variant_qc/Sanger_cohorts_allele_data.ht')
    allele_counts_ht = hl.read_table(
        f'{temp_dir}/ddd-elgh-ukbb/variant_qc/Sanger_cohorts_qc_ac.ht')

    # Load the sample-QC-filtered MatrixTable, annotate adj, then add the
    # annotations produced by `annotate_freq`.
    mt = hl.read_matrix_table(
        f'{temp_dir}/ddd-elgh-ukbb/filtering/Sanger_cohorts_chr1-20-XY_sampleQC_FILTERED.mt'
    )
    mt = annotate_adj(mt)
    mt_freq = annotate_freq(mt)
    # NOTE(review): the message below is still printed although the repartition
    # call itself is commented out.
    print("repartitioning:")
    #mt_freq = mt_freq.repartition(1000, shuffle=False)
    # Checkpoint the annotated MatrixTable, then write its row Table separately.
    mt_freq = mt_freq.checkpoint(
        f'{tmp_dir}/Sanger_cohorts_chr1-20-XY_sampleQC_FILTERED_FREQ_adj.mt',
        overwrite=True)
    ht_freq = mt_freq.rows()
    ht_freq.describe()
    ht_freq.write(
        f'{tmp_dir}/Sanger_cohorts_chr1-20-XY_sampleQC_FILTERED_FREQ_adj.ht',
        overwrite=True)
Exemple #12
0
    # S3 credentials are required for the user to access the datasets in the farm flexible compute S3 environment.
    # You may use your own here, taken from the .s3cfg file in your home directory.

    group = "raw"

    mt = hl.read_matrix_table(
        f'{nfs_dir}/hail_data/mts/chd_ukbb_split_v2_09092020.mt')

    # Truthset
    truthset_ht = get_truth_ht()
    truthset_ht.write(f'{nfs_dir}/hail_data/hts/truthset.ht', overwrite=True)
    truthset_ht = hl.read_table(f'{nfs_dir}/hail_data/hts/truthset.ht')

    # Trio data
    # trio annotation:
    mt_adj = annotate_adj(mt)
    fam = f"{project_dir}/data/annotation/samples/sample.complete_trios.wes50k.02022021.noheader.fam"
    pedigree = hl.Pedigree.read(fam)
    trio_dataset = hl.trio_matrix(mt_adj, pedigree, complete_trios=True)
    trio_dataset.checkpoint(f'{hdfs_dir}/chd_ukbb.trios.adj.mt',
                            overwrite=True)
    trio_stats_ht = generate_trio_stats(trio_dataset,
                                        autosomes_only=True,
                                        bi_allelic_only=True)
    trio_stats_ht.write(f'{hdfs_dir}/chd_ukbb.trios.stats.ht', overwrite=True)

    # inbreeding ht
    mt_inbreeding = mt.annotate_rows(
        InbreedingCoeff=bi_allelic_site_inbreeding_expr(mt.GT))

    mt = mt.key_rows_by('locus').distinct_by_row().key_rows_by(
def main(args):
    """
    Build the final sample-QC Table and release the adj-filtered unphased MatrixTable.

    Sample annotations are gathered from the hard-filter, population-QC,
    relatedness and stratified-metrics results, and combined into a single
    `pass_filters` flag. The Table is checkpointed (and optionally exported as
    TSV); the MatrixTable is then unphased, filtered to adj entries and written.

    :param args: Parsed arguments; this function reads `default_ref_genome`,
        `exome_cohort`, `overwrite` and `write_to_file`.
    """
    # Start Hail
    hl.init(default_reference=args.default_ref_genome)

    # Import raw split MT
    mt = get_mt_data(dataset=args.exome_cohort, part='raw',
                     split=True).select_cols()

    ht = mt.cols().key_by('s')

    # Annotate samples filters
    sample_qc_filters = {}

    # 1. Add sample hard filters annotation expr
    sample_qc_hard_filters_ht = hl.read_table(
        get_sample_qc_ht_path(dataset=args.exome_cohort, part='hard_filters'))
    sample_qc_filters['hard_filters'] = (
        sample_qc_hard_filters_ht[ht.s]['hard_filters'])

    # 2. Add population qc filters annotation expr
    sample_qc_pop_ht = hl.read_table(
        get_sample_qc_ht_path(dataset=args.exome_cohort, part='population_qc'))
    sample_qc_filters['predicted_pop'] = (
        sample_qc_pop_ht[ht.s]['predicted_pop'])

    # 3. Add relatedness filters annotation expr
    related_samples_to_drop = get_related_samples_to_drop()
    related_samples = hl.set(
        related_samples_to_drop.aggregate(
            hl.agg.collect_as_set(related_samples_to_drop.node.id)))
    sample_qc_filters['is_related'] = related_samples.contains(ht.s)

    # 4. Add stratified sample qc (population/platform) annotation expr
    sample_qc_pop_platform_filters_ht = hl.read_table(
        get_sample_qc_ht_path(dataset=args.exome_cohort,
                              part='stratified_metrics_filter'))
    sample_qc_filters['pop_platform_filters'] = (
        sample_qc_pop_platform_filters_ht[ht.s]['pop_platform_filters'])

    ht = ht.annotate(**sample_qc_filters)

    # Final sample qc filter joint expression.
    # The conjunction is already a boolean expression, so it is used directly
    # rather than being wrapped in the deprecated `hl.cond(..., True, False)`;
    # values and missingness are unchanged.
    pass_filters_expr = (
        (hl.len(ht.hard_filters) == 0)
        & (hl.len(ht.pop_platform_filters) == 0)
        & (ht.predicted_pop == 'EUR')
        & ~ht.is_related
    )
    ht = ht.annotate(pass_filters=pass_filters_expr)

    logger.info('Writing final sample qc HT to disk...')
    output_path_ht = get_sample_qc_ht_path(dataset=args.exome_cohort,
                                           part='final_qc')

    ht = ht.checkpoint(output_path_ht, overwrite=args.overwrite)

    # Export final sample QC annotations to file
    if args.write_to_file:
        ht.export(f'{output_path_ht}.tsv.bgz')

    # Release final unphased MT with adjusted genotypes filtered
    mt = unphase_mt(mt)
    mt = annotate_adj(mt)
    mt = mt.filter_entries(mt.adj).select_entries('GT', 'DP', 'GQ', 'adj')

    logger.info('Writing unphase MT with adjusted genotypes to disk...')
    # write MT
    mt.write(get_qc_mt_path(dataset=args.exome_cohort,
                            part='unphase_adj_genotypes',
                            split=True),
             overwrite=args.overwrite)

    # Stop Hail
    hl.stop()

    print("Finished!")
def main(args):
    """
    Run the MegaWES variant-QC annotation pipeline on `args.matrixtable`.

    In order, this writes: the truth-set Table, the adj-annotated trio
    MatrixTable and its trio stats, the inbreeding / allele-count / allele-data
    Tables, the split multi-allelic MatrixTable and its trio matrix, the family
    stats Table and MatrixTable, the MatrixTable annotated with the `maf` field of
    the `args.priors` Table, and finally the de novo Table.

    :param args: Parsed arguments; this function reads `matrixtable`, `omni`,
        `mills`, `thousand_genomes`, `hapmap`, `trio_fam`, `output_dir` and
        `priors`.
    """
    mt = hl.read_matrix_table(args.matrixtable)

    # Truthset
    mt = hl.variant_qc(mt)

    truthset_ht = get_truth_ht(args.omni, args.mills, args.thousand_genomes,
                               args.hapmap)
    truthset_ht.write(f'{args.output_dir}/variant_qc/truthset_table.ht',
                      overwrite=True)

    # Trio data
    # trio annotation:
    logger.info("Trio annotation and writing trios_adj.mt")
    mt_adj = annotate_adj(mt)
    fam = args.trio_fam
    pedigree = hl.Pedigree.read(fam)
    trio_dataset = hl.trio_matrix(mt_adj, pedigree, complete_trios=True)
    # `checkpoint` returns the dataset read back from disk; keep that handle so
    # the trio stats below reuse the written data instead of recomputing the
    # whole trio matrix from `mt`.
    trio_dataset = trio_dataset.checkpoint(
        f'{args.output_dir}/variant_qc/MegaWES_trios_adj.mt', overwrite=True)
    logger.info("Trio stats and writing MegaWes_stats.ht")
    trio_stats_ht = generate_trio_stats(trio_dataset,
                                        autosomes_only=True,
                                        bi_allelic_only=True)
    trio_stats_ht.write(f'{args.output_dir}/variant_qc/MegaWES_stats.ht',
                        overwrite=True)

    # inbreeding ht (computed on `mt` before the per-locus de-duplication below)
    mt_inbreeding = mt.annotate_rows(
        InbreedingCoeff=bi_allelic_site_inbreeding_expr(mt.GT))

    # Keep a single row per locus, then restore the (locus, alleles) row key.
    mt = mt.key_rows_by('locus').distinct_by_row().key_rows_by(
        'locus', 'alleles')

    ht_inbreeding = mt_inbreeding.rows()

    # allele data and qc_ac ht
    allele_data_ht = generate_allele_data(mt)

    qc_ac_ht = generate_ac(mt, fam)

    logger.info("Writing tables for inbreeding, allele counts")
    ht_inbreeding.write(
        f'{args.output_dir}/variant_qc/MegaWES_inbreeding_new.ht',
        overwrite=True)
    qc_ac_ht.write(f'{args.output_dir}/variant_qc/MegaWES_qc_ac_new.ht',
                   overwrite=True)
    allele_data_ht.write(
        f'{args.output_dir}/variant_qc/MegaWES_allele_data_new.ht',
        overwrite=True)

    # Trio matrix table
    logger.info("Split multi allelic variants and write mt")
    mt = hl.split_multi_hts(mt,
                            keep_star=False,
                            left_aligned=False,
                            permit_shuffle=True)
    mt = mt.checkpoint(
        f'{args.output_dir}/variant_qc/MegaWESSanger_cohorts_sampleQC_filtered_split.mt',
        overwrite=True)
    # `fam` and `pedigree` loaded above are reused; the fam file has not changed.
    logger.info("Trio matrixtable generation:")
    trio_dataset = hl.trio_matrix(mt, pedigree, complete_trios=True)
    trio_dataset.write(f'{args.output_dir}/variant_qc/MegaWES_trio_table.mt',
                       overwrite=True)

    # Family stats
    logger.info("Family stats")
    # Only the first element of the returned pair is used here.
    ht1, _ = generate_family_stats(mt, fam)
    print("Writing mt and family stats_ht")
    ht1.write(f'{args.output_dir}/variant_qc/MegaWES_family_stats.ht',
              overwrite=True)

    mt = mt.annotate_rows(family_stats=ht1[mt.row_key].family_stats)
    mt = mt.checkpoint(f'{args.output_dir}/variant_qc/MegaWES_family_stats.mt',
                       overwrite=True)

    # Family stats with Allele Frequencies from gnomad
    logger.info("Family stats with gnomad AF")
    priors = hl.read_table(args.priors)
    mt = mt.annotate_rows(gnomad_maf=priors[mt.row_key].maf)
    # NOTE(review): this is the only output written under `lustre_dir` rather than
    # `args.output_dir`; `lustre_dir` is not defined in this function and must
    # exist at module level -- confirm this is intentional.
    mt = mt.checkpoint(
        f'{lustre_dir}/variant_qc/MegaWES_family_stats_gnomad_AF.mt',
        overwrite=True)

    logger.info("De novo table cration")
    # De novo table
    de_novo_table = hl.de_novo(mt, pedigree, mt.gnomad_maf)

    # Group the de novo calls per variant under a `de_novo_data` array field.
    de_novo_table = de_novo_table.key_by(
        'locus', 'alleles').collect_by_key('de_novo_data')
    de_novo_table.write(
        f'{args.output_dir}/variant_qc/MegaWES_denovo_table.ht',
        overwrite=True)