def test_trio_matrix_null_keys(self):
    """A trio matrix built from columns whose keys are all missing has zero columns."""
    pedigree = hl.Pedigree.read(resource('triomatrix.fam'))
    fam_table = hl.import_fam(resource('triomatrix.fam'))

    dataset = hl.import_vcf(resource('triomatrix.vcf'))
    dataset = dataset.annotate_cols(fam=fam_table[dataset.s].fam_id)

    # Re-key the columns by a missing string so that no sample id can match
    # an entry of the pedigree.
    missing_id = hl.null(hl.tstr)
    dataset = dataset.key_cols_by(s=missing_id)

    trios = hl.trio_matrix(dataset, pedigree, complete_trios=True)
    n_trio_cols = trios.count_cols()
    self.assertEqual(n_trio_cols, 0)
def main(args):
    """Split multi-allelic sites, then write trio, family-stats and de novo outputs.

    Reads the MatrixTable at ``args.matrixtable``, the pedigree at
    ``args.trio_fam`` and the priors table at ``args.priors``. All outputs are
    written under ``args.output_dir`` except the gnomAD-annotated checkpoint,
    which uses the ``lustre_dir`` name.
    """
    # Neither of these two names is referenced again inside this function.
    truthset_table = hl.read_table(args.truthset_table)
    group = "raw"

    # Split multi-allelic sites and persist the result before fanning out.
    unsplit_mt = hl.read_matrix_table(args.matrixtable)
    split_mt = hl.split_multi_hts(
        unsplit_mt,
        keep_star=False,
        left_aligned=False,
        permit_shuffle=True,
    )
    split_mt_path = f'{args.output_dir}/variant_qc/MegaWESSanger_cohorts_sampleQC_filtered_autosomes_split.mt'
    split_mt = split_mt.checkpoint(split_mt_path, overwrite=True)

    # Trio matrix restricted to complete trios.
    fam = args.trio_fam
    pedigree = hl.Pedigree.read(fam)
    trio_mt = hl.trio_matrix(split_mt, pedigree, complete_trios=True)
    trio_mt_path = f'{args.output_dir}/MegaWES_trio_table.mt'
    trio_mt.write(trio_mt_path, overwrite=True)

    # Family statistics: only the first element of the returned pair is used.
    stats_ht, famstats_ht = generate_family_stats(split_mt, fam)
    print("Writing mt and family stats_ht")
    stats_ht_path = f'{args.output_dir}/MegaWES_family_stats.ht'
    stats_ht.write(stats_ht_path, overwrite=True)

    family_stats = stats_ht[split_mt.row_key].family_stats
    stats_mt = split_mt.annotate_rows(family_stats=family_stats)
    stats_mt_path = f'{args.output_dir}/MegaWES_family_stats.mt'
    stats_mt = stats_mt.checkpoint(stats_mt_path, overwrite=True)

    # Attach the prior allele frequency used by the de novo caller.
    priors = hl.read_table(args.priors)
    prior_maf = priors[stats_mt.row_key].maf
    prior_mt = stats_mt.annotate_rows(gnomad_maf=prior_maf)
    # NOTE(review): `lustre_dir` is not defined in this function and every other
    # output goes under args.output_dir -- confirm this location is intended.
    prior_mt_path = f'{lustre_dir}/variant_qc/MegaWES_family_stats_gnomad_AF.mt'
    prior_mt = prior_mt.checkpoint(prior_mt_path, overwrite=True)

    # De novo calls, collected into one row per (locus, alleles).
    de_novo_ht = hl.de_novo(prior_mt, pedigree, prior_mt.gnomad_maf)
    de_novo_ht = de_novo_ht.key_by('locus', 'alleles')
    de_novo_ht = de_novo_ht.collect_by_key('de_novo_data')
    de_novo_ht_path = f'{args.output_dir}/variant_qc/MegaWES_denovo_table.ht'
    de_novo_ht.write(de_novo_ht_path, overwrite=True)
def generate_fam_stats(
        mt: hl.MatrixTable,
        fam_file: str
) -> hl.Table:
    """
    Compute per-variant transmission and de novo statistics from the trios in a dataset.

    The input is restricted to samples listed in the pedigree file, passed
    through `filter_to_autosomes`, `annotate_adj` and densification, and then
    limited to rows with exactly two alleles before the trio matrix is built.

    :param mt: Input MatrixTable
    :param fam_file: Path to the tab-delimited text file holding the trio pedigree
    :return: Table of trio stats, keeping only rows where the raw de novo,
        transmitted and untransmitted counts sum to more than zero
    """
    pedigree = hl.Pedigree.read(fam_file, delimiter="\t")

    # One row per distinct sample id that appears as a proband, father or mother.
    members_ht = hl.import_fam(fam_file, delimiter="\t")
    members_ht = members_ht.annotate(
        fam_members=[members_ht.id, members_ht.pat_id, members_ht.mat_id]
    )
    members_ht = members_ht.explode('fam_members', name='s')
    members_ht = members_ht.key_by('s')
    members_ht = members_ht.select()
    members_ht = members_ht.distinct()

    in_pedigree = hl.is_defined(members_ht[mt.col_key])
    mt = mt.filter_cols(in_pedigree)
    logger.info(f"Generating family stats using {mt.count_cols()} samples from {len(pedigree.trios)} trios.")

    mt = annotate_adj(filter_to_autosomes(mt))
    mt = mt.select_entries('GT', 'GQ', 'AD', 'END', 'adj')
    mt = hl.experimental.densify(mt)
    is_biallelic = hl.len(mt.alleles) == 2
    mt = mt.filter_rows(is_biallelic)

    trio_mt = hl.trio_matrix(mt, pedigree=pedigree, complete_trios=True)

    # A trio counts as "adj" only when all three of its members are adj.
    all_adj = (
        trio_mt.proband_entry.adj
        & trio_mt.father_entry.adj
        & trio_mt.mother_entry.adj
    )
    stats_expr = generate_trio_stats_expr(
        trio_mt,
        transmitted_strata={'raw': True, 'adj': all_adj},
        de_novo_strata={'raw': True, 'adj': all_adj},
        proband_is_female_expr=trio_mt.is_female,
    )
    stats_ht = trio_mt.select_rows(**stats_expr).rows()

    n_raw_events = (
        stats_ht.n_de_novos_raw
        + stats_ht.n_transmitted_raw
        + stats_ht.n_untransmitted_raw
    )
    return stats_ht.filter(n_raw_events > 0)
def main(args):
    """Write the truth set, trio stats, inbreeding, allele-count and allele-data tables.

    Reads the MatrixTable at ``args.matrixtable`` and the pedigree at
    ``args.trio_fam``; every output is written under
    ``{args.output_dir}/ddd-elgh-ukbb/``.

    :param args: Parsed command-line arguments. The attributes read here are
        ``matrixtable``, ``onmi``, ``mills``, ``thousand_genomes``, ``hapmap``,
        ``output_dir`` and ``trio_fam``.
    """
    mt = hl.read_matrix_table(args.matrixtable)

    # Truthset
    # NOTE(review): `args.onmi` looks like a transposition of "omni" -- confirm
    # against the argument parser before renaming; left unchanged here.
    truthset_ht = get_truth_ht(args.onmi, args.mills,
                               args.thousand_genomes, args.hapmap)
    truthset_ht.write(
        f'{args.output_dir}/ddd-elgh-ukbb/truthset.ht', overwrite=True)

    # Trio data: annotate adj, then build the matrix of complete trios.
    mt_adj = annotate_adj(mt)
    fam = args.trio_fam
    pedigree = hl.Pedigree.read(fam)
    trio_dataset = hl.trio_matrix(mt_adj, pedigree, complete_trios=True)
    # Keep the handle returned by checkpoint() so that the trio stats below
    # read the materialised copy instead of re-running the trio-matrix
    # pipeline from the original input.
    trio_dataset = trio_dataset.checkpoint(
        f'{args.output_dir}/ddd-elgh-ukbb/mt_trios_adj.mt', overwrite=True)
    trio_stats_ht = generate_trio_stats(
        trio_dataset, autosomes_only=True, bi_allelic_only=True)
    trio_stats_ht.write(
        f'{args.output_dir}/ddd-elgh-ukbb/Sanger_cohorts_trios_stats.ht',
        overwrite=True)

    # Inbreeding coefficient, annotated before rows are de-duplicated by locus.
    mt_inbreeding = mt.annotate_rows(
        InbreedingCoeff=bi_allelic_site_inbreeding_expr(mt.GT))
    # Keep one row per locus, then restore the (locus, alleles) row key.
    mt = mt.key_rows_by('locus').distinct_by_row().key_rows_by(
        'locus', 'alleles')
    ht_inbreeding = mt_inbreeding.rows()

    # Allele data and qc_ac tables are derived from the de-duplicated mt.
    allele_data_ht = generate_allele_data(mt)
    qc_ac_ht = generate_ac(mt, fam)

    ht_inbreeding.write(
        f'{args.output_dir}/ddd-elgh-ukbb/Sanger_cohorts_inbreeding_new.ht',
        overwrite=True)
    qc_ac_ht.write(
        f'{args.output_dir}/ddd-elgh-ukbb/Sanger_cohorts_qc_ac_new.ht',
        overwrite=True)
    allele_data_ht.write(
        f'{args.output_dir}/ddd-elgh-ukbb/Sanger_cohorts_allele_data_new.ht',
        overwrite=True)
def main(args):
    """Write the trio matrix, family stats and de novo table for the Sanger cohort.

    Reads the MatrixTable at ``args.matrixtable``, the pedigree at
    ``args.trio_fam`` and the priors table at ``args.priors``. Outputs are
    written under ``args.output_dir``.
    """
    # Neither of these two names is referenced again inside this function.
    truthset_table = hl.read_table(args.truthset_table)
    group = "raw"

    cohort_mt = hl.read_matrix_table(args.matrixtable)

    # Trio matrix restricted to complete trios.
    fam = args.trio_fam
    pedigree = hl.Pedigree.read(fam)
    trio_mt = hl.trio_matrix(cohort_mt, pedigree, complete_trios=True)
    trio_mt_path = f'{args.output_dir}/Sanger_cohort_trio_table.mt'
    trio_mt.write(trio_mt_path, overwrite=True)

    # Family statistics, returned as a (MatrixTable, Table) pair.
    stats_mt, famstats_ht = generate_family_stats(cohort_mt, fam)
    print("Writing mt and family stats_ht")
    stats_mt_path = f'{args.output_dir}/Sanger_cohorts_family_stats.mt'
    stats_mt.write(stats_mt_path, overwrite=True)
    stats_ht_path = f'{args.output_dir}/Sanger_cohorts_family_stats.ht'
    famstats_ht.write(stats_ht_path, overwrite=True)

    # NOTE(review): this overwrites the MatrixTable written a few lines above
    # at the same path -- confirm the first write is still wanted.
    family_stats = famstats_ht[cohort_mt.row_key].family_stats
    annotated_mt = cohort_mt.annotate_rows(family_stats=family_stats)
    annotated_path = f'{args.output_dir}/Sanger_cohorts_family_stats.mt'
    annotated_mt.write(annotated_path, overwrite=True)

    # NOTE(review): the table is re-read from a `temp_dir` location rather than
    # from the args.output_dir path just written; `temp_dir` is not defined in
    # this function -- confirm both refer to the same data.
    reread_path = f'{temp_dir}/ddd-elgh-ukbb/variant_qc/Sanger_cohorts_family_stats.mt'
    annotated_mt = hl.read_matrix_table(reread_path)

    # Attach the prior allele frequency used by the de novo caller.
    priors = hl.read_table(args.priors)
    prior_maf = priors[annotated_mt.row_key].maf
    annotated_mt = annotated_mt.annotate_rows(gnomad_maf=prior_maf)

    # De novo calls, collected into one row per (locus, alleles).
    de_novo_ht = hl.de_novo(annotated_mt, pedigree, annotated_mt.gnomad_maf)
    de_novo_ht = de_novo_ht.key_by('locus', 'alleles')
    de_novo_ht = de_novo_ht.collect_by_key('de_novo_data')
    de_novo_path = f'{args.output_dir}/Sanger_cohort_denovo_table.ht'
    de_novo_ht.write(de_novo_path, overwrite=True)
def test_trio_matrix(self):
    """
    Check ``hl.trio_matrix`` against trios assembled by hand from the same inputs.

    The comparison relies on properties of the bundled trio matrix VCF and
    pedigree. It is NOT valid for a pedigree that contains quads:
    ``trio_matrix`` duplicates the shared parents as needed, whereas the
    hand-built genotype and sample tables would need an additional
    duplication/explode step that has not been written.
    """
    entry_fields = ('GT', 'AD', 'DP', 'GQ', 'PL')

    pedigree = hl.Pedigree.read(resource('triomatrix.fam'))
    fam_ht = hl.import_fam(resource('triomatrix.fam'))
    dataset = hl.import_vcf(resource('triomatrix.vcf'))
    dataset = dataset.annotate_cols(fam=fam_ht[dataset.s])

    # Lookup tables keyed by parent id, used to flag each sample's role.
    fathers = fam_ht.filter(hl.is_defined(fam_ht.pat_id))
    fathers = fathers.select(fathers.pat_id, is_dad=True)
    fathers = fathers.key_by('pat_id')

    mothers = fam_ht.filter(hl.is_defined(fam_ht.mat_id))
    mothers = mothers.select(mothers.mat_id, is_mom=True)
    mothers = mothers.key_by('mat_id')

    # --- entries: expected side, grouped by variant and family ---
    expected = dataset.entries().key_by('s')
    expected = expected.join(fathers, how='left')
    expected = expected.join(mothers, how='left')
    expected = expected.annotate(
        is_dad=hl.is_defined(expected.is_dad),
        is_mom=hl.is_defined(expected.is_mom),
    )

    # Roles: 0 = proband, 1 = father, 2 = mother.
    entry_role = (hl.case()
                  .when(expected.is_dad, 1)
                  .when(expected.is_mom, 2)
                  .default(0))
    genotype = hl.struct(**{name: expected[name] for name in entry_fields})
    expected = (
        expected
        .group_by(expected.locus, expected.alleles, fam=expected.fam.fam_id)
        .aggregate(data=hl.agg.collect(hl.struct(role=entry_role, g=genotype)))
    )
    # Only families with exactly three collected members are compared.
    expected = expected.filter(hl.len(expected.data) == 3)
    expected = expected.select('locus', 'alleles', 'fam', 'data')
    expected = expected.explode('data')

    # --- entries: actual side, from trio_matrix ---
    actual = hl.trio_matrix(dataset, pedigree, complete_trios=True).entries()
    trio_entries = (actual.proband_entry, actual.father_entry, actual.mother_entry)
    actual = actual.annotate(
        fam=actual.proband.fam.fam_id,
        data=[hl.struct(role=role, g=entry.select(*entry_fields))
              for role, entry in enumerate(trio_entries)],
    )
    actual = actual.select('locus', 'alleles', 'fam', 'data')
    actual = actual.explode('data')
    actual = actual.filter(hl.is_defined(actual.data.g))
    actual = actual.key_by('locus', 'alleles', 'fam')

    self.assertTrue(expected._same(actual))

    # --- column annotations: expected side ---
    expected_cols = dataset.cols().join(fathers, how='left')
    expected_cols = expected_cols.join(mothers, how='left')
    expected_cols = expected_cols.annotate(
        is_dad=hl.is_defined(expected_cols.is_dad),
        is_mom=hl.is_defined(expected_cols.is_mom),
    )
    col_role = (hl.case()
                .when(expected_cols.is_dad, 1)
                .when(expected_cols.is_mom, 2)
                .default(0))
    sample_annotations = expected_cols.row.select(*dataset.col)
    expected_cols = (
        expected_cols
        .group_by(fam=expected_cols.fam.fam_id)
        .aggregate(data=hl.agg.collect(hl.struct(role=col_role, sa=sample_annotations)))
    )
    expected_cols = expected_cols.filter(hl.len(expected_cols.data) == 3)
    expected_cols = expected_cols.select('fam', 'data')
    expected_cols = expected_cols.explode('data')

    # --- column annotations: actual side ---
    actual_cols = hl.trio_matrix(dataset, pedigree, complete_trios=True).cols()
    trio_members = (actual_cols.proband, actual_cols.father, actual_cols.mother)
    actual_cols = actual_cols.annotate(
        fam=actual_cols.proband.fam.fam_id,
        data=[hl.struct(role=role, sa=member)
              for role, member in enumerate(trio_members)],
    )
    actual_cols = actual_cols.select('fam', 'data')
    actual_cols = actual_cols.explode('data')
    actual_cols = actual_cols.filter(hl.is_defined(actual_cols.data.sa))
    actual_cols = actual_cols.key_by('fam')

    self.assertTrue(expected_cols._same(actual_cols))
data = data.annotate_rows(AC=data.info.AC[data.a_index - 1], iAF=data.info.AF[data.a_index - 1]) data = hl.variant_qc(data) logger.info("Applying de novo filter...") de_novo_scores = hl.de_novo(data, pedigree, pop_frequency_prior=data.variant_qc.AF[-1]) de_novo_mt = de_novo_scores.to_matrix_table(row_key=['locus', 'alleles'], col_key=['id']) de_novo_data = data.annotate_entries(p_de_novo=de_novo_mt[(data.locus, data.alleles), data.s].p_de_novo) logger.info("Annotating trio data...") trio_mt = hl.trio_matrix(de_novo_data, pedigree, complete_trios=True) de_novo_data = de_novo_data.annotate_entries( mother=trio_mt[(de_novo_data.locus, de_novo_data.alleles), de_novo_data.s].mother_entry, father=trio_mt[(de_novo_data.locus, de_novo_data.alleles), de_novo_data.s].father_entry, ) de_novo_data = de_novo_data.filter_entries( hl.is_defined(de_novo_data.GT) & hl.is_defined(de_novo_data.PL) & de_novo_data.GT.is_non_ref() & (de_novo_data.p_de_novo > args.min_p_de_novo)) r_de_novo_mt = de_novo_data.select_cols() r_de_novo_mt = r_de_novo_mt.select_rows('AC', 'iAF') r_de_novo_mt = r_de_novo_mt.select_entries(
def test_trio_matrix_incomplete_trios(self):
    """Build a trio matrix with ``complete_trios=False``; the result is not inspected."""
    pedigree = hl.Pedigree.read(resource('triomatrix.fam'))
    dataset = hl.import_vcf(resource('triomatrix.vcf'))

    # No assertion follows: the test only requires the call to return.
    hl.trio_matrix(dataset, pedigree, complete_trios=False)
def test_trio_matrix(self):
    """
    Check ``hl.trio_matrix`` against trios assembled by hand from the same inputs.

    The comparison relies on properties of the bundled trio matrix VCF and
    pedigree. It is NOT valid for a pedigree that contains quads:
    ``trio_matrix`` duplicates the shared parents as needed, whereas the
    hand-built genotype and sample tables would need an additional
    duplication/explode step that has not been written.
    """
    entry_fields = ('GT', 'AD', 'DP', 'GQ', 'PL')

    pedigree = hl.Pedigree.read(resource('triomatrix.fam'))
    fam_ht = hl.import_fam(resource('triomatrix.fam'))
    dataset = hl.import_vcf(resource('triomatrix.vcf'))
    dataset = dataset.annotate_cols(fam=fam_ht[dataset.s].fam_id)

    # Lookup tables keyed by parent id, used to flag each sample's role.
    fathers = fam_ht.filter(hl.is_defined(fam_ht.pat_id))
    fathers = fathers.select(fathers.pat_id, is_dad=True)
    fathers = fathers.key_by('pat_id')

    mothers = fam_ht.filter(hl.is_defined(fam_ht.mat_id))
    mothers = mothers.select(mothers.mat_id, is_mom=True)
    mothers = mothers.key_by('mat_id')

    # --- entries: expected side, grouped by variant and family ---
    expected = dataset.entries().key_by('s')
    expected = expected.join(fathers, how='left')
    expected = expected.join(mothers, how='left')
    expected = expected.annotate(
        is_dad=hl.is_defined(expected.is_dad),
        is_mom=hl.is_defined(expected.is_mom),
    )

    # Roles: 0 = proband, 1 = father, 2 = mother.
    entry_role = (hl.case()
                  .when(expected.is_dad, 1)
                  .when(expected.is_mom, 2)
                  .default(0))
    genotype = hl.struct(**{name: expected[name] for name in entry_fields})
    expected = (
        expected
        .group_by(expected.locus, expected.alleles, fam=expected.fam)
        .aggregate(data=hl.agg.collect(hl.struct(role=entry_role, g=genotype)))
    )
    # Only families with exactly three collected members are compared.
    expected = expected.filter(hl.len(expected.data) == 3)
    expected = expected.select('data')
    expected = expected.explode('data')

    # --- entries: actual side, from trio_matrix ---
    actual = hl.trio_matrix(dataset, pedigree, complete_trios=True).entries()
    actual = actual.key_by('locus', 'alleles')
    trio_entries = (actual.proband_entry, actual.father_entry, actual.mother_entry)
    actual = actual.annotate(
        fam=actual.proband.fam,
        data=[hl.struct(role=role, g=entry.select(*entry_fields))
              for role, entry in enumerate(trio_entries)],
    )
    actual = actual.select('fam', 'data')
    actual = actual.explode('data')
    actual = actual.filter(hl.is_defined(actual.data.g))
    actual = actual.key_by('locus', 'alleles', 'fam')

    # Schemas are compared first, then the contents.
    self.assertEqual(expected.key.dtype, actual.key.dtype)
    self.assertEqual(expected.row.dtype, actual.row.dtype)
    self.assertTrue(expected._same(actual))

    # --- column annotations: expected side ---
    expected_cols = dataset.cols().join(fathers, how='left')
    expected_cols = expected_cols.join(mothers, how='left')
    expected_cols = expected_cols.annotate(
        is_dad=hl.is_defined(expected_cols.is_dad),
        is_mom=hl.is_defined(expected_cols.is_mom),
    )
    col_role = (hl.case()
                .when(expected_cols.is_dad, 1)
                .when(expected_cols.is_mom, 2)
                .default(0))
    sample_annotations = hl.struct(**expected_cols.row.select(*dataset.col))
    expected_cols = (
        expected_cols
        .group_by(fam=expected_cols.fam)
        .aggregate(data=hl.agg.collect(hl.struct(role=col_role, sa=sample_annotations)))
    )
    expected_cols = expected_cols.filter(hl.len(expected_cols.data) == 3)
    expected_cols = expected_cols.select('data')
    expected_cols = expected_cols.explode('data')

    # --- column annotations: actual side ---
    actual_cols = hl.trio_matrix(dataset, pedigree, complete_trios=True).cols()
    trio_members = (actual_cols.proband, actual_cols.father, actual_cols.mother)
    actual_cols = actual_cols.annotate(
        fam=actual_cols.proband.fam,
        data=[hl.struct(role=role, sa=member)
              for role, member in enumerate(trio_members)],
    )
    actual_cols = actual_cols.key_by('fam')
    actual_cols = actual_cols.select('data')
    actual_cols = actual_cols.explode('data')
    actual_cols = actual_cols.filter(hl.is_defined(actual_cols.data.sa))

    self.assertEqual(expected_cols.key.dtype, actual_cols.key.dtype)
    self.assertEqual(expected_cols.row.dtype, actual_cols.row.dtype)
    self.assertTrue(expected_cols._same(actual_cols))
# Load the training-set tables from under {temp_dir}/ddd-elgh-ukbb/training_sets.
# NOTE(review): mills_ht, thousand_genomes_ht, hapmap_ht, truthset_table and
# `group` are not referenced again in the visible part of this code -- confirm
# whether later code (outside this view) uses them.
mills = f'{temp_dir}/ddd-elgh-ukbb/training_sets/Mills_and_1000G_gold_standard.indels.hg38.ht'
mills_ht = hl.read_table(mills)
thousand_genomes = f'{temp_dir}/ddd-elgh-ukbb/training_sets/1000G_phase1.snps.high_confidence.hg38.ht'
thousand_genomes_ht = hl.read_table(thousand_genomes)
hapmap = f'{temp_dir}/ddd-elgh-ukbb/training_sets/hapmap_3.3.hg38.ht'
hapmap_ht = hl.read_table(hapmap)
truthset_table = hl.read_table(
    f'{temp_dir}/ddd-elgh-ukbb/training_sets/truthset_table.ht')
#################################
# Disabled: loading of the previously written trio stats table.
# trio_stats_table = hl.read_table(
#    f'{temp_dir}/ddd-elgh-ukbb/variant_qc/Sanger_cohorts_trios_stats.ht')
group = "raw"
mt = hl.read_matrix_table(
    f'{temp_dir}/ddd-elgh-ukbb/Sanger_cohorts_chr1to6-20.mt')

# Build the matrix of complete trios from the pedigree file and write it out.
# Inputs are read from `temp_dir`; the output is written under `tmp_dir`
# (two different names, neither defined in the visible code).
fam = f"{temp_dir}/ddd-elgh-ukbb/variant_qc/DDD_trios.fam"
pedigree = hl.Pedigree.read(fam)
trio_dataset = hl.trio_matrix(mt, pedigree, complete_trios=True)
trio_dataset.write(
    f'{tmp_dir}/Sanger_cohort_trio_table.mt', overwrite=True)

# DONE THIS BEFORE (kept disabled by the original author):
#(mt1, famstats_ht) = generate_family_stats(mt, fam)
#print("Writing mt and family stats_ht")
#mt1.write(f'{tmp_dir}/Sanger_cohorts_family_stats.mt', overwrite=True)
# famstats_ht.write(
#    f'{tmp_dir}/Sanger_cohorts_family_stats.ht', overwrite=True)
#mt = mt.annotate_rows(family_stats=ht[mt.row_key].family_stats)
#mt.write(f'{tmp_dir}/Sanger_cohorts_family_stats.mt', overwrite=True)
def main(args):
    """Run the phase-by-transmission (PBT) steps selected on the command line.

    Three independent steps are gated by flags on ``args``:

    * ``pbt_tm`` -- build a trio matrix from high-quality samples in complete
      trios, phase it by transmission and write it.
    * ``pbt_explode`` -- explode the phased trio matrix back to one column per
      sample, write it, then split multi-allelic sites and write again.
    * ``phase_multi_families`` -- for samples present in several trios, derive
      a consensus genotype and concordance metrics, and write the result.

    ``args.exomes`` selects the data type ('exomes', otherwise 'genomes') and
    ``args.overwrite`` is forwarded to most of the writes.
    """
    data_type = 'exomes' if args.exomes else 'genomes'

    if args.pbt_tm:
        mt = get_gnomad_data(data_type, split=False)
        meta = mt.cols()
        # Ids of the samples whose meta.high_quality flag is set.
        hq_samples = meta.aggregate(
            hl.agg.filter(meta.meta.high_quality, hl.agg.collect(meta.s)))
        # Restrict the pedigree to those high-quality samples.
        ped = hl.Pedigree.read(fam_path(data_type),
                               delimiter='\\t').filter_to(hq_samples)
        # Every sample id (proband, father, mother) belonging to a complete trio.
        ped_samples = hl.literal(
            set([
                s for trio in ped.complete_trios()
                for s in [trio.s, trio.pat_id, trio.mat_id]
            ]))

        mt = mt.filter_cols(ped_samples.contains(mt.s))
        # Drop all non-key column and row fields.
        mt = mt.select_cols().select_rows()
        # Keep only rows with at least one non-reference genotype among the
        # remaining samples.
        mt = mt.filter_rows(hl.agg.any(mt.GT.is_non_ref()))

        tm = hl.trio_matrix(mt, ped, complete_trios=True)
        tm = hl.experimental.phase_trio_matrix_by_transmission(tm)
        tm.write(pbt_phased_trios_mt_path(data_type,
                                          split=False,
                                          trio_matrix=True),
                 overwrite=args.overwrite)

    if args.pbt_explode:
        tm = hl.read_matrix_table(
            pbt_phased_trios_mt_path(data_type, split=False, trio_matrix=True))

        # A trio entry is adj only when all three members are adj.
        tm = tm.annotate_entries(trio_adj=tm.proband_entry.adj
                                 & tm.father_entry.adj & tm.mother_entry.adj)
        pmt = explode_trio_matrix(tm, keep_trio_entries=True)
        # Carry trio_adj over from the source trio entry; transmute also drops
        # source_trio_entry from the entry schema.
        pmt = pmt.transmute_entries(trio_adj=pmt.source_trio_entry.trio_adj)
        pmt.write(pbt_phased_trios_mt_path(data_type, split=False),
                  overwrite=args.overwrite)

        # Re-read what was just written, then split multi-allelic sites.
        pmt = hl.read_matrix_table(
            pbt_phased_trios_mt_path(data_type, split=False))
        # Temporarily rename PBT_GT to PGT around the split and rename it back
        # afterwards (original author's note: ugly but supported by
        # hl.split_multi_hts).
        pmt = pmt.rename({'PBT_GT': 'PGT'})
        pmt = hl.split_multi_hts(pmt)
        pmt = pmt.rename({'PGT': 'PBT_GT'})
        pmt.write(pbt_phased_trios_mt_path(data_type),
                  overwrite=args.overwrite)

    if args.phase_multi_families:
        pbt = hl.read_matrix_table(pbt_phased_trios_mt_path(data_type))

        # Keep samples that:
        # 1. There are more than one entry in the Matrix (i.e. they are part of multiple trios)
        # 2. In all their entries, the parents are the same (there are only two exceptions
        #    to this, so best to ignore these and focus on parents/multi-offspring families)
        # NOTE(review): with keep=False the filter below only DROPS samples
        # that have more than one trio AND at least one trio whose mother or
        # father differs from the first; samples with a single trio are
        # retained -- confirm that matches point 1 above.
        nt_samples = pbt.cols()
        nt_samples = nt_samples.group_by('s').aggregate(
            trios=hl.agg.collect(nt_samples.source_trio))
        nt_samples = nt_samples.filter(
            (hl.len(nt_samples.trios) > 1) &
            nt_samples.trios[1:].any(lambda x: (x.mother.s != nt_samples.trios[
                0].mother.s) | (x.father.s != nt_samples.trios[0].father.s)),
            keep=False)
        pbt = pbt.filter_cols(hl.is_defined(nt_samples[pbt.col_key]))

        # Group cols for these samples, keeping all GTs in an array
        # Compute the consensus GT (incl. phase) + QC metrics based on
        # (a) phased genotypes have priority, (b) genotypes with most votes
        pbt = pbt.group_cols_by('s').aggregate(PBT_GTs=hl.agg.filter(
            hl.is_defined(pbt.GT), hl.agg.collect(pbt.GT)))
        # (genotype, count) pairs sorted in descending order of
        # `phased * 100 + count`, so a phased genotype gets a 100-point head
        # start over an unphased one.
        gt_counter = hl.sorted(hl.array(
            pbt.PBT_GTs.group_by(lambda x: x).map_values(lambda x: hl.len(x))),
                               key=lambda x: x[0].phased * 100 + x[1],
                               reverse=True)
        # Counts of the phased genotypes only, in the same order.
        phased_gt_counts = gt_counter.filter(lambda x: x[0].phased).map(
            lambda x: x[1])
        # `.find(lambda x: True)` picks the first element of each array.
        # consensus_gt: top-ranked genotype.
        # phase_concordance: share of phased votes held by the top phased genotype.
        # discordant_gts: True when more than one distinct genotype remains
        #   after phased calls are rebuilt with hl.call(x[0], x[1]).
        # NOTE(review): hl.cond is, to my knowledge, deprecated in favour of
        # hl.if_else in newer Hail releases -- confirm against the pinned version.
        pbt = pbt.annotate_entries(
            consensus_gt=gt_counter.map(lambda x: x[0]).find(lambda x: True),
            phase_concordance=phased_gt_counts.find(lambda x: True) /
            hl.sum(phased_gt_counts),
            discordant_gts=hl.len(
                hl.set(
                    pbt.PBT_GTs.map(lambda x: hl.cond(
                        x.phased, hl.call(x[0], x[1]), x)))) > 1)

        # Unlike the writes above, no `overwrite` argument is passed here.
        pbt.write('gs://gnomad/projects/compound_hets/pbt_multi_families.mt')
def main(args):
    """Run the MegaWES variant-QC preparation pipeline end to end.

    Starting from the MatrixTable at ``args.matrixtable``, this writes, under
    ``{args.output_dir}/variant_qc/`` unless noted otherwise:

    * the truth-set table,
    * the adj-annotated trio matrix and the trio stats table,
    * the inbreeding, qc_ac and allele-data tables,
    * the multi-allelic-split MatrixTable and its trio matrix,
    * the family-stats table and MatrixTable,
    * the gnomAD-prior-annotated MatrixTable (under ``lustre_dir``),
    * the de novo table.

    :param args: Parsed command-line arguments. The attributes read here are
        ``matrixtable``, ``omni``, ``mills``, ``thousand_genomes``, ``hapmap``,
        ``output_dir``, ``trio_fam`` and ``priors``.
    """
    mt = hl.read_matrix_table(args.matrixtable)

    # Truthset
    mt = hl.variant_qc(mt)
    truthset_ht = get_truth_ht(args.omni, args.mills,
                               args.thousand_genomes, args.hapmap)
    truthset_ht.write(
        f'{args.output_dir}/variant_qc/truthset_table.ht', overwrite=True)

    # Trio data: annotate adj, then build the matrix of complete trios.
    logger.info("Trio annotation and writing trios_adj.mt")
    mt_adj = annotate_adj(mt)
    fam = args.trio_fam
    pedigree = hl.Pedigree.read(fam)
    trio_dataset = hl.trio_matrix(mt_adj, pedigree, complete_trios=True)
    # Keep the handle returned by checkpoint() so that the trio stats below
    # read the materialised copy instead of re-running the trio-matrix
    # pipeline from the original input.
    trio_dataset = trio_dataset.checkpoint(
        f'{args.output_dir}/variant_qc/MegaWES_trios_adj.mt', overwrite=True)

    logger.info("Trio stats and writing MegaWes_stats.ht")
    trio_stats_ht = generate_trio_stats(
        trio_dataset, autosomes_only=True, bi_allelic_only=True)
    trio_stats_ht.write(
        f'{args.output_dir}/variant_qc/MegaWES_stats.ht', overwrite=True)

    # Inbreeding coefficient, annotated before rows are de-duplicated by locus.
    mt_inbreeding = mt.annotate_rows(
        InbreedingCoeff=bi_allelic_site_inbreeding_expr(mt.GT))
    # Keep one row per locus, then restore the (locus, alleles) row key.
    mt = mt.key_rows_by('locus').distinct_by_row().key_rows_by(
        'locus', 'alleles')
    ht_inbreeding = mt_inbreeding.rows()

    # Allele data and qc_ac tables are derived from the de-duplicated mt.
    allele_data_ht = generate_allele_data(mt)
    qc_ac_ht = generate_ac(mt, fam)

    logger.info("Writing tables for inbreeding, allele counts")
    ht_inbreeding.write(
        f'{args.output_dir}/variant_qc/MegaWES_inbreeding_new.ht',
        overwrite=True)
    qc_ac_ht.write(
        f'{args.output_dir}/variant_qc/MegaWES_qc_ac_new.ht', overwrite=True)
    allele_data_ht.write(
        f'{args.output_dir}/variant_qc/MegaWES_allele_data_new.ht',
        overwrite=True)

    # Trio matrix table on the multi-allelic-split data.
    logger.info("Split multi allelic variants and write mt")
    mt = hl.split_multi_hts(mt, keep_star=False,
                            left_aligned=False, permit_shuffle=True)
    mt = mt.checkpoint(
        f'{args.output_dir}/variant_qc/MegaWESSanger_cohorts_sampleQC_filtered_split.mt',
        overwrite=True)

    # `fam` and `pedigree` from above are reused; the input path has not changed.
    logger.info("Trio matrixtable generation:")
    trio_dataset = hl.trio_matrix(mt, pedigree, complete_trios=True)
    trio_dataset.write(
        f'{args.output_dir}/variant_qc/MegaWES_trio_table.mt', overwrite=True)

    # Family stats: only the first element of the returned pair is used.
    logger.info("Family stats")
    ht1, _ = generate_family_stats(mt, fam)
    print("Writing mt and family stats_ht")
    ht1.write(
        f'{args.output_dir}/variant_qc/MegaWES_family_stats.ht',
        overwrite=True)
    mt = mt.annotate_rows(family_stats=ht1[mt.row_key].family_stats)
    mt = mt.checkpoint(
        f'{args.output_dir}/variant_qc/MegaWES_family_stats.mt',
        overwrite=True)

    # Family stats with allele frequencies from gnomAD.
    logger.info("Family stats with gnomad AF")
    priors = hl.read_table(args.priors)
    mt = mt.annotate_rows(gnomad_maf=priors[mt.row_key].maf)
    # NOTE(review): `lustre_dir` is not defined in this function and every other
    # output goes under args.output_dir -- confirm this location is intended.
    mt = mt.checkpoint(
        f'{lustre_dir}/variant_qc/MegaWES_family_stats_gnomad_AF.mt',
        overwrite=True)

    # De novo table, collected into one row per (locus, alleles).
    logger.info("De novo table cration")
    de_novo_table = hl.de_novo(mt, pedigree, mt.gnomad_maf)
    de_novo_table = de_novo_table.key_by(
        'locus', 'alleles').collect_by_key('de_novo_data')
    de_novo_table.write(
        f'{args.output_dir}/variant_qc/MegaWES_denovo_table.ht',
        overwrite=True)