Example #1
    def test_trio_matrix_null_keys(self):
        ped = hl.Pedigree.read(resource('triomatrix.fam'))
        ht = hl.import_fam(resource('triomatrix.fam'))

        mt = hl.import_vcf(resource('triomatrix.vcf'))
        mt = mt.annotate_cols(fam=ht[mt.s].fam_id)

        # Make keys all null
        mt = mt.key_cols_by(s=hl.null(hl.tstr))

        tt = hl.trio_matrix(mt, ped, complete_trios=True)
        self.assertEqual(tt.count_cols(), 0)
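
For orientation, a minimal sketch of the call pattern this test exercises; trios.vcf and trios.fam are placeholders for any VCF and PLINK-style pedigree file with matching sample IDs:

import hail as hl

# Placeholder inputs: any VCF plus a .fam file whose sample IDs match the VCF.
ped = hl.Pedigree.read('trios.fam')
mt = hl.import_vcf('trios.vcf')

# One output column per complete trio; the original entry fields are grouped
# into proband_entry, father_entry and mother_entry structs.
tm = hl.trio_matrix(mt, ped, complete_trios=True)
tm.describe()           # proband/father/mother column structs and *_entry fields
print(tm.count_cols())  # number of complete trios found in the dataset
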
Example #2
def main(args):
    ################################

    truthset_table = hl.read_table(args.truthset_table)
    #################################
    group = "raw"

    mt = hl.read_matrix_table(args.matrixtable)
    mt = hl.split_multi_hts(mt,
                            keep_star=False,
                            left_aligned=False,
                            permit_shuffle=True)
    mt = mt.checkpoint(
        f'{args.output_dir}/variant_qc/MegaWESSanger_cohorts_sampleQC_filtered_autosomes_split.mt',
        overwrite=True)
    fam = args.trio_fam
    pedigree = hl.Pedigree.read(fam)
    trio_dataset = hl.trio_matrix(mt, pedigree, complete_trios=True)
    trio_dataset.write(f'{args.output_dir}/MegaWES_trio_table.mt',
                       overwrite=True)

    (ht1, famstats_ht) = generate_family_stats(mt, fam)
    print("Writing mt and family stats_ht")
    ht1.write(f'{args.output_dir}/MegaWES_family_stats.ht', overwrite=True)
    #famstats_ht.write(
    #    f'{args.output_dir}/MegaWES_family_stats.ht', overwrite=True)
    mt = mt.annotate_rows(family_stats=ht1[mt.row_key].family_stats)
    mt = mt.checkpoint(f'{args.output_dir}/MegaWES_family_stats.mt',
                       overwrite=True)
    #(mt1, famstats_ht) = generate_family_stats(mt, fam)
    #print("Writing mt and family stats_ht")
    #mt1.write(f'{tmp_dir}/Sanger_cohorts_family_stats.mt', overwrite=True)
    # famstats_ht.write(
    #    f'{tmp_dir}/Sanger_cohorts_family_stats.ht', overwrite=True)
    #mt = mt.annotate_rows(family_stats=ht[mt.row_key].family_stats)
    #mt.write(f'{tmp_dir}/Sanger_cohorts_family_stats.mt', overwrite=True)

    priors = hl.read_table(args.priors)
    mt = mt.annotate_rows(gnomad_maf=priors[mt.row_key].maf)
    mt = mt.checkpoint(
        f'{args.output_dir}/variant_qc/MegaWES_family_stats_gnomad_AF.mt',
        overwrite=True)
    #mt = hl.split_multi_hts(mt, keep_star=False, left_aligned=False, permit_shuffle=True)

    de_novo_table = hl.de_novo(mt, pedigree, mt.gnomad_maf)

    de_novo_table = de_novo_table.key_by(
        'locus', 'alleles').collect_by_key('de_novo_data')
    de_novo_table.write(
        f'{args.output_dir}/variant_qc/MegaWES_denovo_table.ht',
        overwrite=True)
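
An illustrative follow-on sketch (not part of the pipeline above): it assumes mt and pedigree are the split MatrixTable and hl.Pedigree built in main(), and filters the hl.de_novo() output to confident calls before collapsing per variant; p_de_novo and confidence are fields of the hl.de_novo() result, and 0.9 is an arbitrary cutoff.

import hail as hl

# Sketch only: keep high-confidence de novo calls, then collapse to one row
# per variant with collect_by_key, as in main() above.
dn = hl.de_novo(mt, pedigree, mt.gnomad_maf)
dn = dn.filter((dn.confidence == 'HIGH') & (dn.p_de_novo > 0.9))
dn = dn.key_by('locus', 'alleles').collect_by_key('de_novo_data')
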
def generate_fam_stats(
        mt: hl.MatrixTable,
        fam_file: str
) -> hl.Table:
    """
    Calculate transmission and de novo mutation statistics using trios in the dataset.

    :param mt: Input MatrixTable
    :param fam_file: path to text file containing trio pedigree
    :return: Table containing trio stats
    """
    # Load Pedigree data and filter MT to samples present in any of the trios
    ped = hl.Pedigree.read(fam_file, delimiter="\t")
    fam_ht = hl.import_fam(fam_file, delimiter="\t")
    fam_ht = fam_ht.annotate(
        fam_members=[fam_ht.id, fam_ht.pat_id, fam_ht.mat_id]
    )
    fam_ht = fam_ht.explode('fam_members', name='s')
    fam_ht = fam_ht.key_by('s').select().distinct()

    mt = mt.filter_cols(hl.is_defined(fam_ht[mt.col_key]))
    logger.info(f"Generating family stats using {mt.count_cols()} samples from {len(ped.trios)} trios.")

    mt = filter_to_autosomes(mt)
    mt = annotate_adj(mt)
    mt = mt.select_entries('GT', 'GQ', 'AD', 'END', 'adj')
    mt = hl.experimental.densify(mt)
    mt = mt.filter_rows(hl.len(mt.alleles) == 2)
    mt = hl.trio_matrix(mt, pedigree=ped, complete_trios=True)
    trio_adj = (mt.proband_entry.adj & mt.father_entry.adj & mt.mother_entry.adj)

    ht = mt.select_rows(
        **generate_trio_stats_expr(
            mt,
            transmitted_strata={
                'raw': True,
                'adj': trio_adj
            },
            de_novo_strata={
                'raw': True,
                'adj': trio_adj,
            },
            proband_is_female_expr=mt.is_female
        )
    ).rows()

    return ht.filter(
        ht.n_de_novos_raw + ht.n_transmitted_raw + ht.n_untransmitted_raw > 0
    )
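
A hedged usage sketch for the helper above; the paths are placeholders, and the filter_to_autosomes, annotate_adj and generate_trio_stats_expr helpers it calls (gnomad utility functions in the original code) are assumed to be importable alongside it.

import hail as hl

# Placeholder paths: cohort.mt is any (sparse) MatrixTable keyed by sample 's',
# trios.fam the matching tab-delimited pedigree file.
mt = hl.read_matrix_table('cohort.mt')
fam_stats_ht = generate_fam_stats(mt, 'trios.fam')
fam_stats_ht.write('trio_stats.ht', overwrite=True)
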
def main(args):
    group = "raw"

    mt = hl.read_matrix_table(args.matrixtable)

    # Truthset
    truthset_ht = get_truth_ht(args.omni, args.mills, args.thousand_genomes,
                               args.hapmap)
    truthset_ht.write(f'{args.output_dir}/ddd-elgh-ukbb/truthset.ht',
                      overwrite=True)
    # Trio data
    # trio annotation:
    mt_adj = annotate_adj(mt)
    fam = args.trio_fam
    pedigree = hl.Pedigree.read(fam)
    trio_dataset = hl.trio_matrix(mt_adj, pedigree, complete_trios=True)
    trio_dataset.checkpoint(f'{args.output_dir}/ddd-elgh-ukbb/mt_trios_adj.mt',
                            overwrite=True)
    trio_stats_ht = generate_trio_stats(trio_dataset,
                                        autosomes_only=True,
                                        bi_allelic_only=True)
    trio_stats_ht.write(
        f'{args.output_dir}/ddd-elgh-ukbb/Sanger_cohorts_trios_stats.ht',
        overwrite=True)

    # inbreeding ht
    mt_inbreeding = mt.annotate_rows(
        InbreedingCoeff=bi_allelic_site_inbreeding_expr(mt.GT))

    mt = mt.key_rows_by('locus').distinct_by_row().key_rows_by(
        'locus', 'alleles')

    ht_inbreeding = mt_inbreeding.rows()

    # allele data and qc_ac ht
    allele_data_ht = generate_allele_data(mt)

    qc_ac_ht = generate_ac(mt, fam)

    ht_inbreeding.write(
        f'{args.output_dir}/ddd-elgh-ukbb/Sanger_cohorts_inbreeding_new.ht',
        overwrite=True)
    qc_ac_ht.write(
        f'{args.output_dir}/ddd-elgh-ukbb/Sanger_cohorts_qc_ac_new.ht',
        overwrite=True)
    allele_data_ht.write(
        f'{args.output_dir}/ddd-elgh-ukbb/Sanger_cohorts_allele_data_new.ht',
        overwrite=True)
def main(args):
    ################################

    truthset_table = hl.read_table(args.truthset_table)
    #################################
    group = "raw"

    mt = hl.read_matrix_table(args.matrixtable)

    fam = args.trio_fam
    pedigree = hl.Pedigree.read(fam)
    trio_dataset = hl.trio_matrix(mt, pedigree, complete_trios=True)
    trio_dataset.write(f'{args.output_dir}/Sanger_cohort_trio_table.mt',
                       overwrite=True)

    (mt1, famstats_ht) = generate_family_stats(mt, fam)
    print("Writing mt and family stats_ht")
    mt1.write(f'{args.output_dir}/Sanger_cohorts_family_stats.mt',
              overwrite=True)
    famstats_ht.write(f'{args.output_dir}/Sanger_cohorts_family_stats.ht',
                      overwrite=True)
    mt = mt.annotate_rows(family_stats=famstats_ht[mt.row_key].family_stats)
    mt.write(f'{args.output_dir}/Sanger_cohorts_family_stats.mt',
             overwrite=True)
    mt = hl.read_matrix_table(
        f'{args.output_dir}/Sanger_cohorts_family_stats.mt')
    priors = hl.read_table(args.priors)
    mt = mt.annotate_rows(gnomad_maf=priors[mt.row_key].maf)
    # mt = mt.checkpoint(
    #    f'{tmp_dir}/Sanger_cohorts_family_stats_gnomad_AF.mt', overwrite=True)
    de_novo_table = hl.de_novo(mt, pedigree, mt.gnomad_maf)

    de_novo_table = de_novo_table.key_by(
        'locus', 'alleles').collect_by_key('de_novo_data')
    de_novo_table.write(f'{args.output_dir}/Sanger_cohort_denovo_table.ht',
                        overwrite=True)
Example #6
    def test_trio_matrix(self):
        """
        This test depends on certain properties of the trio matrix VCF and
        pedigree structure. This test is NOT a valid test if the pedigree
        includes quads: the trio_matrix method will duplicate the parents
        appropriately, but the genotypes_table and samples_table orthogonal
        paths would require another duplication/explode that we haven't written.
        """
        ped = hl.Pedigree.read(resource('triomatrix.fam'))
        ht = hl.import_fam(resource('triomatrix.fam'))

        mt = hl.import_vcf(resource('triomatrix.vcf'))
        mt = mt.annotate_cols(fam=ht[mt.s])

        dads = ht.filter(hl.is_defined(ht.pat_id))
        dads = dads.select(dads.pat_id, is_dad=True).key_by('pat_id')

        moms = ht.filter(hl.is_defined(ht.mat_id))
        moms = moms.select(moms.mat_id, is_mom=True).key_by('mat_id')

        et = (mt.entries()
            .key_by('s')
            .join(dads, how='left')
            .join(moms, how='left'))
        et = et.annotate(is_dad=hl.is_defined(et.is_dad),
                         is_mom=hl.is_defined(et.is_mom))

        et = (et
            .group_by(et.locus, et.alleles, fam=et.fam.fam_id)
            .aggregate(data=hl.agg.collect(hl.struct(
            role=hl.case().when(et.is_dad, 1).when(et.is_mom, 2).default(0),
            g=hl.struct(GT=et.GT, AD=et.AD, DP=et.DP, GQ=et.GQ, PL=et.PL)))))

        et = et.filter(hl.len(et.data) == 3)
        et = et.select('locus', 'alleles', 'fam', 'data').explode('data')

        tt = hl.trio_matrix(mt, ped, complete_trios=True).entries()
        tt = tt.annotate(fam=tt.proband.fam.fam_id,
                         data=[hl.struct(role=0, g=tt.proband_entry.select('GT', 'AD', 'DP', 'GQ', 'PL')),
                               hl.struct(role=1, g=tt.father_entry.select('GT', 'AD', 'DP', 'GQ', 'PL')),
                               hl.struct(role=2, g=tt.mother_entry.select('GT', 'AD', 'DP', 'GQ', 'PL'))])
        tt = tt.select('locus', 'alleles', 'fam', 'data').explode('data')
        tt = tt.filter(hl.is_defined(tt.data.g)).key_by('locus', 'alleles', 'fam')

        self.assertTrue(et._same(tt))

        # test annotations
        e_cols = (mt.cols()
            .join(dads, how='left')
            .join(moms, how='left'))
        e_cols = e_cols.annotate(is_dad=hl.is_defined(e_cols.is_dad),
                                 is_mom=hl.is_defined(e_cols.is_mom))
        e_cols = (e_cols.group_by(fam=e_cols.fam.fam_id)
            .aggregate(data=hl.agg.collect(hl.struct(role=hl.case()
                                                     .when(e_cols.is_dad, 1).when(e_cols.is_mom, 2).default(0),
                                                     sa=e_cols.row.select(*mt.col)))))
        e_cols = e_cols.filter(hl.len(e_cols.data) == 3).select('fam', 'data').explode('data')

        t_cols = hl.trio_matrix(mt, ped, complete_trios=True).cols()
        t_cols = t_cols.annotate(fam=t_cols.proband.fam.fam_id,
                                 data=[
                                     hl.struct(role=0, sa=t_cols.proband),
                                     hl.struct(role=1, sa=t_cols.father),
                                     hl.struct(role=2, sa=t_cols.mother)]).select('fam', 'data').explode('data')
        t_cols = t_cols.filter(hl.is_defined(t_cols.data.sa)).key_by('fam')

        self.assertTrue(e_cols._same(t_cols))
data = data.annotate_rows(AC=data.info.AC[data.a_index - 1],
                          iAF=data.info.AF[data.a_index - 1])
data = hl.variant_qc(data)

logger.info("Applying de novo filter...")
de_novo_scores = hl.de_novo(data,
                            pedigree,
                            pop_frequency_prior=data.variant_qc.AF[-1])
de_novo_mt = de_novo_scores.to_matrix_table(row_key=['locus', 'alleles'],
                                            col_key=['id'])
de_novo_data = data.annotate_entries(p_de_novo=de_novo_mt[(data.locus,
                                                           data.alleles),
                                                          data.s].p_de_novo)

logger.info("Annotating trio data...")
trio_mt = hl.trio_matrix(de_novo_data, pedigree, complete_trios=True)
de_novo_data = de_novo_data.annotate_entries(
    mother=trio_mt[(de_novo_data.locus, de_novo_data.alleles),
                   de_novo_data.s].mother_entry,
    father=trio_mt[(de_novo_data.locus, de_novo_data.alleles),
                   de_novo_data.s].father_entry,
)
de_novo_data = de_novo_data.filter_entries(
    hl.is_defined(de_novo_data.GT)
    & hl.is_defined(de_novo_data.PL)
    & de_novo_data.GT.is_non_ref()
    & (de_novo_data.p_de_novo > args.min_p_de_novo))

r_de_novo_mt = de_novo_data.select_cols()
r_de_novo_mt = r_de_novo_mt.select_rows('AC', 'iAF')
r_de_novo_mt = r_de_novo_mt.select_entries(
Example #8
    def test_trio_matrix_incomplete_trios(self):
        ped = hl.Pedigree.read(resource('triomatrix.fam'))
        mt = hl.import_vcf(resource('triomatrix.vcf'))
        hl.trio_matrix(mt, ped, complete_trios=False)
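
A short hedged sketch of what the flag changes, using placeholder paths in place of the test resources: with complete_trios=True only trios whose proband and both parents are present become columns, while complete_trios=False also keeps partial trios, leaving the missing parent's fields undefined.

import hail as hl

ped = hl.Pedigree.read('triomatrix.fam')  # placeholder path
mt = hl.import_vcf('triomatrix.vcf')      # placeholder path

# Partial trios are only retained when complete_trios=False, so the strict
# trio matrix can never have more columns than the permissive one.
n_complete = hl.trio_matrix(mt, ped, complete_trios=True).count_cols()
n_all = hl.trio_matrix(mt, ped, complete_trios=False).count_cols()
assert n_complete <= n_all
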
Example #9
    def test_trio_matrix(self):
        """
        This test depends on certain properties of the trio matrix VCF and
        pedigree structure. This test is NOT a valid test if the pedigree
        includes quads: the trio_matrix method will duplicate the parents
        appropriately, but the genotypes_table and samples_table orthogonal
        paths would require another duplication/explode that we haven't written.
        """
        ped = hl.Pedigree.read(resource('triomatrix.fam'))
        ht = hl.import_fam(resource('triomatrix.fam'))

        mt = hl.import_vcf(resource('triomatrix.vcf'))
        mt = mt.annotate_cols(fam=ht[mt.s].fam_id)

        dads = ht.filter(hl.is_defined(ht.pat_id))
        dads = dads.select(dads.pat_id, is_dad=True).key_by('pat_id')

        moms = ht.filter(hl.is_defined(ht.mat_id))
        moms = moms.select(moms.mat_id, is_mom=True).key_by('mat_id')

        et = (mt.entries()
              .key_by('s')
              .join(dads, how='left')
              .join(moms, how='left'))
        et = et.annotate(is_dad=hl.is_defined(et.is_dad),
                         is_mom=hl.is_defined(et.is_mom))

        et = (et
            .group_by(et.locus, et.alleles, fam=et.fam)
            .aggregate(data=hl.agg.collect(hl.struct(
            role=hl.case().when(et.is_dad, 1).when(et.is_mom, 2).default(0),
            g=hl.struct(GT=et.GT, AD=et.AD, DP=et.DP, GQ=et.GQ, PL=et.PL)))))

        et = et.filter(hl.len(et.data) == 3)
        et = et.select('data').explode('data')

        tt = hl.trio_matrix(mt, ped, complete_trios=True).entries().key_by('locus', 'alleles')
        tt = tt.annotate(fam=tt.proband.fam,
                         data=[hl.struct(role=0, g=tt.proband_entry.select('GT', 'AD', 'DP', 'GQ', 'PL')),
                               hl.struct(role=1, g=tt.father_entry.select('GT', 'AD', 'DP', 'GQ', 'PL')),
                               hl.struct(role=2, g=tt.mother_entry.select('GT', 'AD', 'DP', 'GQ', 'PL'))])
        tt = tt.select('fam', 'data').explode('data')
        tt = tt.filter(hl.is_defined(tt.data.g)).key_by('locus', 'alleles', 'fam')

        self.assertEqual(et.key.dtype, tt.key.dtype)
        self.assertEqual(et.row.dtype, tt.row.dtype)
        self.assertTrue(et._same(tt))

        # test annotations
        e_cols = (mt.cols()
                  .join(dads, how='left')
                  .join(moms, how='left'))
        e_cols = e_cols.annotate(is_dad=hl.is_defined(e_cols.is_dad),
                                 is_mom=hl.is_defined(e_cols.is_mom))
        e_cols = (e_cols.group_by(fam=e_cols.fam)
                  .aggregate(data=hl.agg.collect(hl.struct(role=hl.case()
                                                           .when(e_cols.is_dad, 1).when(e_cols.is_mom, 2).default(0),
                                                           sa=hl.struct(**e_cols.row.select(*mt.col))))))
        e_cols = e_cols.filter(hl.len(e_cols.data) == 3).select('data').explode('data')

        t_cols = hl.trio_matrix(mt, ped, complete_trios=True).cols()
        t_cols = t_cols.annotate(fam=t_cols.proband.fam,
                                 data=[
                                     hl.struct(role=0, sa=t_cols.proband),
                                     hl.struct(role=1, sa=t_cols.father),
                                     hl.struct(role=2, sa=t_cols.mother)]).key_by('fam').select('data').explode('data')
        t_cols = t_cols.filter(hl.is_defined(t_cols.data.sa))

        self.assertEqual(e_cols.key.dtype, t_cols.key.dtype)
        self.assertEqual(e_cols.row.dtype, t_cols.row.dtype)
        self.assertTrue(e_cols._same(t_cols))
Example #10
    mills = f'{temp_dir}/ddd-elgh-ukbb/training_sets/Mills_and_1000G_gold_standard.indels.hg38.ht'
    mills_ht = hl.read_table(mills)
    thousand_genomes = f'{temp_dir}/ddd-elgh-ukbb/training_sets/1000G_phase1.snps.high_confidence.hg38.ht'
    thousand_genomes_ht = hl.read_table(thousand_genomes)
    hapmap = f'{temp_dir}/ddd-elgh-ukbb/training_sets/hapmap_3.3.hg38.ht'
    hapmap_ht = hl.read_table(hapmap)
    truthset_table = hl.read_table(
        f'{temp_dir}/ddd-elgh-ukbb/training_sets/truthset_table.ht')
    #################################

    # trio_stats_table = hl.read_table(
    #    f'{temp_dir}/ddd-elgh-ukbb/variant_qc/Sanger_cohorts_trios_stats.ht')
    group = "raw"

    mt = hl.read_matrix_table(
        f'{temp_dir}/ddd-elgh-ukbb/Sanger_cohorts_chr1to6-20.mt')

    fam = f"{temp_dir}/ddd-elgh-ukbb/variant_qc/DDD_trios.fam"
    pedigree = hl.Pedigree.read(fam)
    trio_dataset = hl.trio_matrix(mt, pedigree, complete_trios=True)
    trio_dataset.write(
        f'{tmp_dir}/Sanger_cohort_trio_table.mt', overwrite=True)
    # DONE THIS BEFORE:
    #(mt1, famstats_ht) = generate_family_stats(mt, fam)
    #print("Writing mt and family stats_ht")
    #mt1.write(f'{tmp_dir}/Sanger_cohorts_family_stats.mt', overwrite=True)
    # famstats_ht.write(
    #    f'{tmp_dir}/Sanger_cohorts_family_stats.ht', overwrite=True)
    #mt = mt.annotate_rows(family_stats=ht[mt.row_key].family_stats)
    #mt.write(f'{tmp_dir}/Sanger_cohorts_family_stats.mt', overwrite=True)
def main(args):
    data_type = 'exomes' if args.exomes else 'genomes'

    if args.pbt_tm:
        mt = get_gnomad_data(data_type, split=False)
        meta = mt.cols()
        hq_samples = meta.aggregate(
            hl.agg.filter(meta.meta.high_quality, hl.agg.collect(meta.s)))
        ped = hl.Pedigree.read(fam_path(data_type),
                               delimiter='\\t').filter_to(hq_samples)
        ped_samples = hl.literal(
            set([
                s for trio in ped.complete_trios()
                for s in [trio.s, trio.pat_id, trio.mat_id]
            ]))

        mt = mt.filter_cols(ped_samples.contains(mt.s))
        mt = mt.select_cols().select_rows()
        mt = mt.filter_rows(hl.agg.any(mt.GT.is_non_ref()))

        tm = hl.trio_matrix(mt, ped, complete_trios=True)
        tm = hl.experimental.phase_trio_matrix_by_transmission(tm)
        tm.write(pbt_phased_trios_mt_path(data_type,
                                          split=False,
                                          trio_matrix=True),
                 overwrite=args.overwrite)

    if args.pbt_explode:
        tm = hl.read_matrix_table(
            pbt_phased_trios_mt_path(data_type, split=False, trio_matrix=True))

        tm = tm.annotate_entries(trio_adj=tm.proband_entry.adj
                                 & tm.father_entry.adj & tm.mother_entry.adj)
        pmt = explode_trio_matrix(tm, keep_trio_entries=True)
        pmt = pmt.transmute_entries(trio_adj=pmt.source_trio_entry.trio_adj)
        pmt.write(pbt_phased_trios_mt_path(data_type, split=False),
                  overwrite=args.overwrite)

        pmt = hl.read_matrix_table(
            pbt_phased_trios_mt_path(data_type, split=False))
        pmt = pmt.rename({'PBT_GT':
                          'PGT'})  # ugly but supported by hl.split_multi_hts
        pmt = hl.split_multi_hts(pmt)
        pmt = pmt.rename({'PGT': 'PBT_GT'})
        pmt.write(pbt_phased_trios_mt_path(data_type),
                  overwrite=args.overwrite)

    if args.phase_multi_families:
        pbt = hl.read_matrix_table(pbt_phased_trios_mt_path(data_type))
        # Keep samples that:
        # 1. have more than one entry in the matrix (i.e. are part of multiple trios), and
        # 2. have the same parents in all their entries (there are only two exceptions to this, so best to ignore those and focus on parents/multi-offspring families)
        nt_samples = pbt.cols()
        nt_samples = nt_samples.group_by('s').aggregate(
            trios=hl.agg.collect(nt_samples.source_trio))
        nt_samples = nt_samples.filter(
            (hl.len(nt_samples.trios) > 1) &
            nt_samples.trios[1:].any(lambda x: (x.mother.s != nt_samples.trios[
                0].mother.s) | (x.father.s != nt_samples.trios[0].father.s)),
            keep=False)
        pbt = pbt.filter_cols(hl.is_defined(nt_samples[pbt.col_key]))

        # Group cols for these samples, keeping all GTs in an array
        # Compute the consensus GT (incl. phase) + QC metrics based on (a) phased genotypes have priority, (b) genotypes with most votes
        pbt = pbt.group_cols_by('s').aggregate(PBT_GTs=hl.agg.filter(
            hl.is_defined(pbt.GT), hl.agg.collect(pbt.GT)))
        gt_counter = hl.sorted(hl.array(
            pbt.PBT_GTs.group_by(lambda x: x).map_values(lambda x: hl.len(x))),
                               key=lambda x: x[0].phased * 100 + x[1],
                               reverse=True)
        phased_gt_counts = gt_counter.filter(lambda x: x[0].phased).map(
            lambda x: x[1])
        pbt = pbt.annotate_entries(
            consensus_gt=gt_counter.map(lambda x: x[0]).find(lambda x: True),
            phase_concordance=phased_gt_counts.find(lambda x: True) /
            hl.sum(phased_gt_counts),
            discordant_gts=hl.len(
                hl.set(
                    pbt.PBT_GTs.map(lambda x: hl.cond(
                        x.phased, hl.call(x[0], x[1]), x)))) > 1)
        pbt.write('gs://gnomad/projects/compound_hets/pbt_multi_families.mt')
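
For the pedigree handling at the top of this example, a hedged sketch of the hl.Pedigree calls it relies on; the path and sample list are placeholders.

import hail as hl

ped = hl.Pedigree.read('trios.fam', delimiter='\\t')  # placeholder path
high_quality = ['PROBAND_1', 'FATHER_1', 'MOTHER_1']  # placeholder sample IDs

# filter_to() drops trios whose proband is not in the list and nulls out
# absent parents; complete_trios() then returns only trios that still have
# both parents, each exposing s, pat_id and mat_id.
ped = ped.filter_to(high_quality)
trio_samples = {s for trio in ped.complete_trios()
                for s in (trio.s, trio.pat_id, trio.mat_id)}
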
def main(args):
    group = "raw"

    mt = hl.read_matrix_table(args.matrixtable)

    # Truthset
    mt = hl.variant_qc(mt)

    truthset_ht = get_truth_ht(args.omni, args.mills, args.thousand_genomes,
                               args.hapmap)
    truthset_ht.write(f'{args.output_dir}/variant_qc/truthset_table.ht',
                      overwrite=True)
    # Trio data
    # trio annotation:
    logger.info("Trio annotation and writing trios_adj.mt")
    mt_adj = annotate_adj(mt)
    fam = args.trio_fam
    pedigree = hl.Pedigree.read(fam)
    trio_dataset = hl.trio_matrix(mt_adj, pedigree, complete_trios=True)
    trio_dataset.checkpoint(
        f'{args.output_dir}/variant_qc/MegaWES_trios_adj.mt', overwrite=True)
    logger.info("Trio stats and writing MegaWes_stats.ht")
    trio_stats_ht = generate_trio_stats(trio_dataset,
                                        autosomes_only=True,
                                        bi_allelic_only=True)
    trio_stats_ht.write(f'{args.output_dir}/variant_qc/MegaWES_stats.ht',
                        overwrite=True)

    # inbreeding ht
    mt_inbreeding = mt.annotate_rows(
        InbreedingCoeff=bi_allelic_site_inbreeding_expr(mt.GT))

    mt = mt.key_rows_by('locus').distinct_by_row().key_rows_by(
        'locus', 'alleles')

    ht_inbreeding = mt_inbreeding.rows()

    # allele data and qc_ac ht
    allele_data_ht = generate_allele_data(mt)

    qc_ac_ht = generate_ac(mt, fam)

    logger.info("Writing tables for inbreeding, allele counts")
    ht_inbreeding.write(
        f'{args.output_dir}/variant_qc/MegaWES_inbreeding_new.ht',
        overwrite=True)
    qc_ac_ht.write(f'{args.output_dir}/variant_qc/MegaWES_qc_ac_new.ht',
                   overwrite=True)
    allele_data_ht.write(
        f'{args.output_dir}/variant_qc/MegaWES_allele_data_new.ht',
        overwrite=True)

    # Trio matrix table
    logger.info("Split multi allelic variants and write mt")
    mt = hl.split_multi_hts(mt,
                            keep_star=False,
                            left_aligned=False,
                            permit_shuffle=True)
    mt = mt.checkpoint(
        f'{args.output_dir}/variant_qc/MegaWESSanger_cohorts_sampleQC_filtered_split.mt',
        overwrite=True)
    fam = args.trio_fam
    pedigree = hl.Pedigree.read(fam)
    logger.info("Trio matrixtable generation:")
    trio_dataset = hl.trio_matrix(mt, pedigree, complete_trios=True)
    trio_dataset.write(f'{args.output_dir}/variant_qc/MegaWES_trio_table.mt',
                       overwrite=True)

    # Family stats
    logger.info("Family stats")
    (ht1, famstats_ht) = generate_family_stats(mt, fam)
    print("Writing mt and family stats_ht")
    ht1.write(f'{args.output_dir}/variant_qc/MegaWES_family_stats.ht',
              overwrite=True)

    mt = mt.annotate_rows(family_stats=ht1[mt.row_key].family_stats)
    mt = mt.checkpoint(f'{args.output_dir}/variant_qc/MegaWES_family_stats.mt',
                       overwrite=True)

    # Family stats with allele frequencies from gnomAD
    logger.info("Family stats with gnomad AF")
    priors = hl.read_table(args.priors)
    mt = mt.annotate_rows(gnomad_maf=priors[mt.row_key].maf)
    mt = mt.checkpoint(
        f'{args.output_dir}/variant_qc/MegaWES_family_stats_gnomad_AF.mt',
        overwrite=True)

    logger.info("De novo table cration")
    #De novo table
    de_novo_table = hl.de_novo(mt, pedigree, mt.gnomad_maf)

    de_novo_table = de_novo_table.key_by(
        'locus', 'alleles').collect_by_key('de_novo_data')
    de_novo_table.write(
        f'{args.output_dir}/variant_qc/MegaWES_denovo_table.ht',
        overwrite=True)
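
Finally, a hedged sketch of reading the collected de novo table back and flattening it to one row per (variant, trio) call; the path is a placeholder for the {args.output_dir}/variant_qc/ location written above, and de_novo_data is the array field created by collect_by_key.

import hail as hl

dn = hl.read_table('MegaWES_denovo_table.ht')  # placeholder path

# collect_by_key('de_novo_data') stored one array element per trio call at
# each variant; explode() undoes that, and each element carries the
# hl.de_novo() fields such as id, p_de_novo and confidence.
dn = dn.explode('de_novo_data')
dn = dn.select(p_de_novo=dn.de_novo_data.p_de_novo,
               confidence=dn.de_novo_data.confidence)
dn.show()
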