def test_de_novo(self):
    mt = hl.import_vcf(resource('denovo.vcf'))
    mt = mt.filter_rows(mt.locus.in_y_par(), keep=False)  # de_novo_finder doesn't know about Y PAR
    ped = hl.Pedigree.read(resource('denovo.fam'))
    r = hl.de_novo(mt, ped, mt.info.ESP)
    r = r.select(
        prior=r.prior,
        kid_id=r.proband.s,
        dad_id=r.father.s,
        mom_id=r.mother.s,
        p_de_novo=r.p_de_novo,
        confidence=r.confidence).key_by('locus', 'alleles', 'kid_id', 'dad_id', 'mom_id')

    truth = hl.import_table(resource('denovo.out'), impute=True, comment='#')
    truth = truth.select(
        locus=hl.locus(truth['Chr'], truth['Pos']),
        alleles=[truth['Ref'], truth['Alt']],
        kid_id=truth['Child_ID'],
        dad_id=truth['Dad_ID'],
        mom_id=truth['Mom_ID'],
        p_de_novo=truth['Prob_dn'],
        confidence=truth['Validation_Likelihood'].split('_')[0]).key_by(
            'locus', 'alleles', 'kid_id', 'dad_id', 'mom_id')

    j = r.join(truth, how='outer')
    self.assertTrue(j.all((j.confidence == j.confidence_1) &
                          (hl.abs(j.p_de_novo - j.p_de_novo_1) < 1e-4)))
def main(args):
    ################################
    truthset_table = hl.read_table(args.truthset_table)
    #################################
    group = "raw"

    fam = args.trio_fam
    pedigree = hl.Pedigree.read(fam)

    # mt = mt.annotate_rows(family_stats=famstats_ht[mt.row_key].family_stats)
    # mt = mt.checkpoint(f'{args.output_dir}/Sanger_cohorts_family_stats.mt', overwrite=True)
    mt = hl.read_matrix_table(f'{args.output_dir}/MegaWESSanger_cohorts_sampleQC_filtered.mt')
    mt = hl.split_multi_hts(mt, keep_star=False, left_aligned=False, permit_shuffle=True)
    mt = mt.checkpoint(
        f'{args.output_dir}/MegaWESSanger_cohorts_sampleQC_filtered_split_multi.mt',
        overwrite=True)

    priors = hl.read_table(args.priors)
    mt = mt.annotate_rows(gnomad_maf=priors[mt.row_key].maf)
    mt = mt.checkpoint(
        f'{lustre_dir}/Sanger_cohorts_family_stats_gnomad_AF.mt', overwrite=True)

    de_novo_table = hl.de_novo(mt, pedigree, mt.gnomad_maf)
    de_novo_table = de_novo_table.key_by(
        'locus', 'alleles').collect_by_key('de_novo_data')
    de_novo_table.write(
        f'{args.output_dir}/Sanger_cohort_denovo_table.ht', overwrite=True)
def compute_samocha_denovos(mt, pedigree):
    gnomad_ht = hl.read_table(
        "gs://gnomad-public/release/2.1.1/liftover_grch38/ht/exomes/gnomad.exomes.r2.1.1.sites.liftover_grch38.ht")
    gnomad_ht = hl.split_multi_hts(gnomad_ht)
    de_novo_priors_ht = gnomad_ht.select(
        AF=gnomad_ht.freq[gnomad_ht.freq_index_dict["gnomad"]].AF)

    de_novos_ht = hl.de_novo(mt, pedigree, de_novo_priors_ht[mt.row_key].AF)

    de_novos_ht = de_novos_ht.transmute(
        proband=de_novos_ht.proband.s,
        father=de_novos_ht.father.s,
        mother=de_novos_ht.mother.s)

    de_novos_ht = de_novos_ht.annotate(
        proband_AB=de_novos_ht.proband_entry.AD[1] /
            (de_novos_ht.proband_entry.AD[0] + de_novos_ht.proband_entry.AD[1]),
        proband_DP=de_novos_ht.proband_entry.DP,
        proband_GQ=de_novos_ht.proband_entry.GQ,
        proband_GT=de_novos_ht.proband_entry.GT,
        father_AB=de_novos_ht.father_entry.AD[1] /
            (de_novos_ht.father_entry.AD[0] + de_novos_ht.father_entry.AD[1]),
        father_DP=de_novos_ht.father_entry.DP,
        father_GQ=de_novos_ht.father_entry.GQ,
        father_GT=de_novos_ht.father_entry.GT,
        mother_AB=de_novos_ht.mother_entry.AD[1] /
            (de_novos_ht.mother_entry.AD[0] + de_novos_ht.mother_entry.AD[1]),
        mother_DP=de_novos_ht.mother_entry.DP,
        mother_GQ=de_novos_ht.mother_entry.GQ,
        mother_GT=de_novos_ht.mother_entry.GT)

    de_novos_ht = de_novos_ht.drop(
        de_novos_ht.proband_entry, de_novos_ht.father_entry, de_novos_ht.mother_entry)

    return de_novos_ht
def main(args):
    ################################
    truthset_table = hl.read_table(args.truthset_table)
    #################################
    group = "raw"

    mt = hl.read_matrix_table(args.matrixtable)
    mt = hl.split_multi_hts(mt, keep_star=False, left_aligned=False, permit_shuffle=True)
    mt = mt.checkpoint(
        f'{args.output_dir}/variant_qc/MegaWESSanger_cohorts_sampleQC_filtered_autosomes_split.mt',
        overwrite=True)

    fam = args.trio_fam
    pedigree = hl.Pedigree.read(fam)
    trio_dataset = hl.trio_matrix(mt, pedigree, complete_trios=True)
    trio_dataset.write(f'{args.output_dir}/MegaWES_trio_table.mt', overwrite=True)

    (ht1, famstats_ht) = generate_family_stats(mt, fam)
    print("Writing mt and family stats_ht")
    ht1.write(f'{args.output_dir}/MegaWES_family_stats.ht', overwrite=True)
    # famstats_ht.write(
    #     f'{args.output_dir}/MegaWES_family_stats.ht', overwrite=True)

    mt = mt.annotate_rows(family_stats=ht1[mt.row_key].family_stats)
    mt = mt.checkpoint(f'{args.output_dir}/MegaWES_family_stats.mt', overwrite=True)

    # (mt1, famstats_ht) = generate_family_stats(mt, fam)
    # print("Writing mt and family stats_ht")
    # mt1.write(f'{tmp_dir}/Sanger_cohorts_family_stats.mt', overwrite=True)
    # famstats_ht.write(
    #     f'{tmp_dir}/Sanger_cohorts_family_stats.ht', overwrite=True)
    # mt = mt.annotate_rows(family_stats=ht[mt.row_key].family_stats)
    # mt.write(f'{tmp_dir}/Sanger_cohorts_family_stats.mt', overwrite=True)

    priors = hl.read_table(args.priors)
    mt = mt.annotate_rows(gnomad_maf=priors[mt.row_key].maf)
    mt = mt.checkpoint(
        f'{lustre_dir}/variant_qc/MegaWES_family_stats_gnomad_AF.mt', overwrite=True)

    # mt = hl.split_multi_hts(mt, keep_star=False, left_aligned=False, permit_shuffle=True)
    de_novo_table = hl.de_novo(mt, pedigree, mt.gnomad_maf)
    de_novo_table = de_novo_table.key_by(
        'locus', 'alleles').collect_by_key('de_novo_data')
    de_novo_table.write(
        f'{args.output_dir}/variant_qc/MegaWES_denovo_table.ht', overwrite=True)
def generate_de_novos(mt: hl.MatrixTable, fam_file: str, freq_data: hl.Table) -> hl.Table:
    mt = mt.select_cols()
    fam_ht = read_fam(fam_file).key_by()
    fam_ht = fam_ht.select(
        s=[fam_ht.s, fam_ht.pat_id, fam_ht.mat_id]).explode('s').key_by('s')
    mt = mt.filter_cols(hl.is_defined(fam_ht[mt.s]))
    mt = mt.select_rows()
    mt = hl.split_multi_hts(mt)
    mt = mt.annotate_rows(family_stats=freq_data[mt.row_key].family_stats)

    ped = hl.Pedigree.read(fam_file, delimiter='\\t')
    de_novo_table = hl.de_novo(
        mt, ped, mt.family_stats[0].unrelated_qc_callstats.AF[1])
    de_novo_table = de_novo_table.key_by(
        'locus', 'alleles').collect_by_key('de_novo_data')

    return de_novo_table
def main(args):
    ################################
    truthset_table = hl.read_table(args.truthset_table)
    #################################
    group = "raw"

    mt = hl.read_matrix_table(args.matrixtable)
    fam = args.trio_fam
    pedigree = hl.Pedigree.read(fam)
    trio_dataset = hl.trio_matrix(mt, pedigree, complete_trios=True)
    trio_dataset.write(f'{args.output_dir}/Sanger_cohort_trio_table.mt', overwrite=True)

    (mt1, famstats_ht) = generate_family_stats(mt, fam)
    print("Writing mt and family stats_ht")
    mt1.write(f'{args.output_dir}/Sanger_cohorts_family_stats.mt', overwrite=True)
    famstats_ht.write(f'{args.output_dir}/Sanger_cohorts_family_stats.ht', overwrite=True)

    mt = mt.annotate_rows(family_stats=famstats_ht[mt.row_key].family_stats)
    mt.write(f'{args.output_dir}/Sanger_cohorts_family_stats.mt', overwrite=True)
    mt = hl.read_matrix_table(
        f'{temp_dir}/ddd-elgh-ukbb/variant_qc/Sanger_cohorts_family_stats.mt')

    priors = hl.read_table(args.priors)
    mt = mt.annotate_rows(gnomad_maf=priors[mt.row_key].maf)
    # mt = mt.checkpoint(
    #     f'{tmp_dir}/Sanger_cohorts_family_stats_gnomad_AF.mt', overwrite=True)

    de_novo_table = hl.de_novo(mt, pedigree, mt.gnomad_maf)
    de_novo_table = de_novo_table.key_by(
        'locus', 'alleles').collect_by_key('de_novo_data')
    de_novo_table.write(f'{args.output_dir}/Sanger_cohort_denovo_table.ht', overwrite=True)
logger.info(f"Reading pedigree file {args.fam}")
pedigree = hl.Pedigree.read(args.fam)

logger.info(f"Importing vcf file {args.vcf}")
data = hl.import_vcf(args.vcf, call_fields=['GT'], skip_invalid_loci=True, force_bgz=True)
data = hl.split_multi_hts(data)
data = data.annotate_rows(AC=data.info.AC[data.a_index - 1],
                          iAF=data.info.AF[data.a_index - 1])
data = hl.variant_qc(data)

logger.info("Applying de novo filter...")
de_novo_scores = hl.de_novo(data, pedigree, pop_frequency_prior=data.variant_qc.AF[-1])
de_novo_mt = de_novo_scores.to_matrix_table(row_key=['locus', 'alleles'], col_key=['id'])
de_novo_data = data.annotate_entries(
    p_de_novo=de_novo_mt[(data.locus, data.alleles), data.s].p_de_novo)

logger.info("Annotating trio data...")
trio_mt = hl.trio_matrix(de_novo_data, pedigree, complete_trios=True)
de_novo_data = de_novo_data.annotate_entries(
    mother=trio_mt[(de_novo_data.locus, de_novo_data.alleles), de_novo_data.s].mother_entry,
    father=trio_mt[(de_novo_data.locus, de_novo_data.alleles), de_novo_data.s].father_entry,
)
def run_pipeline(args):
    hl.init(log='./hail_annotation_pipeline.log')

    '''
    mt = hl.read_matrix_table('/gpfs/ycga/project/gruber/nsd35/hail_DeNovo/PCGC11.mt')
    pprint.pprint(mt.count_cols())  # output: 1100
    pprint.pprint(mt.count_rows())  # output: 1493771
    mt = generate_split_alleles(mt)
    pprint.pprint(mt.count_cols())
    pprint.pprint(mt.count_rows())

    table = (hl.import_table('/gpfs/ycga/project/gruber/nsd35/hail_DeNovo/IDs_keep.txt',
                             impute=True).key_by('Sample'))
    mt = mt.annotate_cols(should_retain=table[mt.s].should_retain)
    mt = mt.filter_cols(mt.should_retain == 'yes', keep=True)
    pprint.pprint(mt.count_cols())
    pprint.pprint(mt.count_rows())

    mt = mt.annotate_rows(gt_stats=hl.agg.call_stats(mt.GT, mt.alleles))
    mt = mt.filter_rows(mt.row.gt_stats.AC[1] == 0, keep=False)
    pprint.pprint(mt.count_cols())
    pprint.pprint(mt.count_rows())
    mt.write('pcgc11_subset.mt', overwrite=True)
    '''

    '''
    mt = hl.read_matrix_table('pcgc11_subset.mt')
    mt = hl.vep(mt, 'vep85-loftee-ruddle.json')
    mt.write('pcgc11_subset_vep.mt', overwrite=True)
    '''

    # mt = hl.read_matrix_table('pcgc11_subset.mt')
    # pprint.pprint(mt.count_cols())
    # pprint.pprint(mt.count_rows())
    # pprint.pprint(mt.describe())
    # pprint.pprint(mt.show(include_row_fields=True))
    # pprint.pprint(mt.row_key)

    mt = hl.read_matrix_table('pcgc11_subset_vep.mt')
    # pprint.pprint(mt_vep.count_cols())
    # pprint.pprint(mt_vep.count_rows())
    # pprint.pprint(mt_vep.describe())

    mt = mt.annotate_rows(
        sortedTranscriptConsequences=get_expr_for_vep_sorted_transcript_consequences_array(
            vep_root=mt.vep))
    pprint.pprint(mt.describe())

    mt = mt.annotate_rows(
        gene_symbol=hl.cond(mt.sortedTranscriptConsequences.size() > 0,
                            mt.sortedTranscriptConsequences[0].gene_symbol,
                            hl.null(hl.tstr)),
        major_consequence=hl.cond(mt.sortedTranscriptConsequences.size() > 0,
                                  mt.sortedTranscriptConsequences[0].major_consequence,
                                  hl.null(hl.tstr)),
        hgvs=hl.cond(mt.sortedTranscriptConsequences.size() > 0,
                     mt.sortedTranscriptConsequences[0].hgvs,
                     hl.null(hl.tstr)),
        category=hl.cond(mt.sortedTranscriptConsequences.size() > 0,
                         mt.sortedTranscriptConsequences[0].category,
                         hl.null(hl.tstr)),
        canonical=hl.cond(mt.sortedTranscriptConsequences.size() > 0,
                          mt.sortedTranscriptConsequences[0].canonical,
                          -1))
    # pprint.pprint(mt.describe())
    # pprint.pprint(mt.show(include_row_fields=True))

    gnomad_exomes_ht = hl.read_table(
        '/gpfs/ycga/project/lek/shared/data/gnomad/gnomad.exomes.r2.1.1.sites.ht')
    # pprint.pprint(gnomad_exomes_ht.describe())
    # pprint.pprint(gnomad_exomes_ht.show())
    # global_meta = hl.eval(gnomad_exomes_ht.globals.freq_index_dict)
    # pprint.pprint(global_meta)

    # mt = mt.annotate_rows(gnomad_af = gnomad_exomes_ht[mt.row_key].freq[0].AF)
    mt = mt.annotate_rows(
        gnomad_af=hl.cond(hl.is_defined(gnomad_exomes_ht[mt.row_key]),
                          gnomad_exomes_ht[mt.row_key].freq[0].AF,
                          0.0))
    # pprint.pprint(mt.describe())
    # pprint.pprint(mt.show(include_row_fields=True))
    # pprint.pprint(mt.count_cols())
    # pprint.pprint(mt.count_rows())
    # pprint.pprint(mt.major_consequence.show())

    # mt = mt.filter_rows(mt.gnomad_af > 0.0004, keep=False)
    # pprint.pprint(mt.count_cols())
    # pprint.pprint(mt.count_rows())  # 552,067  114,297
    # pprint.pprint(mt.show(include_row_fields=True))
    # pprint.pprint(mt.major_consequence.show())

    pedigree = hl.Pedigree.read(
        '/gpfs/ycga/project/gruber/nsd35/hail_DeNovo/PCGC11.fam')
    de_novo_results = (hl.de_novo(mt, pedigree, pop_frequency_prior=mt.gnomad_af)).key_by(
        'locus', 'alleles')
    # pprint.pprint(de_novo_results.describe())
    # de_novo_results = hl.de_novo(mt, pedigree, pop_frequency_prior=gnomad_exomes_ht[mt.row_key].freq[0].AF)
    # de_novo_results = de_novo_results.filter(de_novo_results.confidence == "HIGH")
    # pprint.pprint(de_novo_results.key)
    # ht = mt.rows()
    # pprint.pprint(ht.describe())
    # pprint.pprint(ht.show())

    '''
    de_novo_results = de_novo_results.annotate(
        gene=ht[de_novo_results.key].gene_symbol,
        major_consequence=ht[de_novo_results.key].major_consequence,
        hgvs=ht[de_novo_results.key].hgvs,
        category=ht[de_novo_results.key].category,
        canonical=ht[de_novo_results.key].canonical,
    )
    '''
    # pprint.pprint(de_novo_results.show())

    de_novo_results.export('denovo_results.tsv')
    '''
def main(args):
    group = "raw"
    mt = hl.read_matrix_table(args.matrixtable)

    # Truthset
    mt = hl.variant_qc(mt)
    truthset_ht = get_truth_ht(args.omni, args.mills, args.thousand_genomes, args.hapmap)
    truthset_ht.write(f'{args.output_dir}/variant_qc/truthset_table.ht', overwrite=True)

    # Trio data
    # trio annotation:
    logger.info("Trio annotation and writing trios_adj.mt")
    mt_adj = annotate_adj(mt)
    fam = args.trio_fam
    pedigree = hl.Pedigree.read(fam)
    trio_dataset = hl.trio_matrix(mt_adj, pedigree, complete_trios=True)
    trio_dataset.checkpoint(
        f'{args.output_dir}/variant_qc/MegaWES_trios_adj.mt', overwrite=True)

    logger.info("Trio stats and writing MegaWes_stats.ht")
    trio_stats_ht = generate_trio_stats(trio_dataset, autosomes_only=True, bi_allelic_only=True)
    trio_stats_ht.write(f'{args.output_dir}/variant_qc/MegaWES_stats.ht', overwrite=True)

    # inbreeding ht
    mt_inbreeding = mt.annotate_rows(
        InbreedingCoeff=bi_allelic_site_inbreeding_expr(mt.GT))
    mt = mt.key_rows_by('locus').distinct_by_row().key_rows_by('locus', 'alleles')
    ht_inbreeding = mt_inbreeding.rows()

    # allele data and qc_ac ht
    allele_data_ht = generate_allele_data(mt)
    qc_ac_ht = generate_ac(mt, fam)

    logger.info("Writing tables for inbreeding, allele counts")
    ht_inbreeding.write(
        f'{args.output_dir}/variant_qc/MegaWES_inbreeding_new.ht', overwrite=True)
    qc_ac_ht.write(f'{args.output_dir}/variant_qc/MegaWES_qc_ac_new.ht', overwrite=True)
    allele_data_ht.write(
        f'{args.output_dir}/variant_qc/MegaWES_allele_data_new.ht', overwrite=True)

    # Trio matrix table
    logger.info("Split multi allelic variants and write mt")
    mt = hl.split_multi_hts(mt, keep_star=False, left_aligned=False, permit_shuffle=True)
    mt = mt.checkpoint(
        f'{args.output_dir}/variant_qc/MegaWESSanger_cohorts_sampleQC_filtered_split.mt',
        overwrite=True)

    fam = args.trio_fam
    pedigree = hl.Pedigree.read(fam)
    logger.info("Trio matrixtable generation:")
    trio_dataset = hl.trio_matrix(mt, pedigree, complete_trios=True)
    trio_dataset.write(f'{args.output_dir}/variant_qc/MegaWES_trio_table.mt', overwrite=True)

    # Family stats
    logger.info("Family stats")
    (ht1, famstats_ht) = generate_family_stats(mt, fam)
    print("Writing mt and family stats_ht")
    ht1.write(f'{args.output_dir}/variant_qc/MegaWES_family_stats.ht', overwrite=True)
    mt = mt.annotate_rows(family_stats=ht1[mt.row_key].family_stats)
    mt = mt.checkpoint(f'{args.output_dir}/variant_qc/MegaWES_family_stats.mt', overwrite=True)

    # Family stats with allele frequencies from gnomAD
    logger.info("Family stats with gnomad AF")
    priors = hl.read_table(args.priors)
    mt = mt.annotate_rows(gnomad_maf=priors[mt.row_key].maf)
    mt = mt.checkpoint(
        f'{lustre_dir}/variant_qc/MegaWES_family_stats_gnomad_AF.mt', overwrite=True)

    # De novo table
    logger.info("De novo table creation")
    de_novo_table = hl.de_novo(mt, pedigree, mt.gnomad_maf)
    de_novo_table = de_novo_table.key_by(
        'locus', 'alleles').collect_by_key('de_novo_data')
    de_novo_table.write(
        f'{args.output_dir}/variant_qc/MegaWES_denovo_table.ht', overwrite=True)