Esempio n. 1
0
    def test_de_novo(self):
        """Compare ``hl.de_novo`` calls on ``denovo.vcf`` with the rows of ``denovo.out``.

        Both tables are keyed by variant plus kid/dad/mom IDs and outer-joined.
        Every joined row must carry the same confidence label on both sides and
        ``p_de_novo`` values that differ by less than 1e-4.
        """
        join_key = ('locus', 'alleles', 'kid_id', 'dad_id', 'mom_id')

        dataset = hl.import_vcf(resource('denovo.vcf'))
        # de_novo_finder doesn't know about y PAR
        dataset = dataset.filter_rows(dataset.locus.in_y_par(), keep=False)
        pedigree = hl.Pedigree.read(resource('denovo.fam'))

        called = hl.de_novo(dataset, pedigree, dataset.info.ESP)
        called = called.select(
            prior=called.prior,
            kid_id=called.proband.s,
            dad_id=called.father.s,
            mom_id=called.mother.s,
            p_de_novo=called.p_de_novo,
            confidence=called.confidence,
        )
        called = called.key_by(*join_key)

        expected = hl.import_table(resource('denovo.out'), impute=True, comment='#')
        expected = expected.select(
            locus=hl.locus(expected['Chr'], expected['Pos']),
            alleles=[expected['Ref'], expected['Alt']],
            kid_id=expected['Child_ID'],
            dad_id=expected['Dad_ID'],
            mom_id=expected['Mom_ID'],
            p_de_novo=expected['Prob_dn'],
            # Only the part before the first underscore is compared.
            confidence=expected['Validation_Likelihood'].split('_')[0],
        )
        expected = expected.key_by(*join_key)

        joined = called.join(expected, how='outer')
        labels_match = joined.confidence == joined.confidence_1
        probabilities_close = hl.abs(joined.p_de_novo - joined.p_de_novo_1) < 1e-4
        self.assertTrue(joined.all(labels_match & probabilities_close))
def main(args):
    """Split multi-allelics, attach frequency priors and write a de novo table.

    Steps, in order: read the sample-QC-filtered MatrixTable from
    ``args.output_dir``; split multi-allelic sites and checkpoint; annotate
    rows with ``gnomad_maf`` from the ``args.priors`` table and checkpoint;
    run ``hl.de_novo`` with that prior; group the calls per variant under
    ``de_novo_data`` and write the table.

    Args:
        args: namespace providing ``truthset_table``, ``trio_fam``,
            ``output_dir`` and ``priors``.
    """
    # Loaded here but not referenced again within this function.
    truthset_table = hl.read_table(args.truthset_table)
    group = "raw"  # likewise not referenced again within this function

    trio_pedigree = hl.Pedigree.read(args.trio_fam)

    filtered_mt = hl.read_matrix_table(
        f'{args.output_dir}/MegaWESSanger_cohorts_sampleQC_filtered.mt')
    split_mt = hl.split_multi_hts(
        filtered_mt,
        keep_star=False,
        left_aligned=False,
        permit_shuffle=True,
    )
    split_mt = split_mt.checkpoint(
        f'{args.output_dir}/MegaWESSanger_cohorts_sampleQC_filtered_split_multi.mt',
        overwrite=True,
    )

    prior_ht = hl.read_table(args.priors)
    prior_maf = prior_ht[split_mt.row_key].maf
    annotated_mt = split_mt.annotate_rows(gnomad_maf=prior_maf)
    # NOTE(review): `lustre_dir` is not defined in this function; presumably a
    # module-level global. Every other output here goes under args.output_dir.
    annotated_mt = annotated_mt.checkpoint(
        f'{lustre_dir}/Sanger_cohorts_family_stats_gnomad_AF.mt',
        overwrite=True,
    )

    calls = hl.de_novo(annotated_mt, trio_pedigree, annotated_mt.gnomad_maf)

    # One row per variant, with all of its calls collected into `de_novo_data`.
    calls_by_variant = calls.key_by('locus', 'alleles')
    calls_by_variant = calls_by_variant.collect_by_key('de_novo_data')
    calls_by_variant.write(
        f'{args.output_dir}/Sanger_cohort_denovo_table.ht', overwrite=True)
Esempio n. 3
0
    def test_de_novo(self):
        """Compare ``hl.de_novo`` calls on ``denovo.vcf`` with the rows of ``denovo.out``.

        Both tables are keyed by variant plus kid/dad/mom IDs and outer-joined.
        Every joined row must carry the same confidence label on both sides and
        ``p_de_novo`` values that differ by less than 1e-4.
        """
        join_key = ('locus', 'alleles', 'kid_id', 'dad_id', 'mom_id')

        dataset = hl.import_vcf(resource('denovo.vcf'))
        # de_novo_finder doesn't know about y PAR
        dataset = dataset.filter_rows(dataset.locus.in_y_par(), keep=False)
        pedigree = hl.Pedigree.read(resource('denovo.fam'))

        called = hl.de_novo(dataset, pedigree, dataset.info.ESP)
        called = called.select(
            prior=called.prior,
            kid_id=called.proband.s,
            dad_id=called.father.s,
            mom_id=called.mother.s,
            p_de_novo=called.p_de_novo,
            confidence=called.confidence,
        )
        called = called.key_by(*join_key)

        expected = hl.import_table(resource('denovo.out'), impute=True, comment='#')
        expected = expected.select(
            locus=hl.locus(expected['Chr'], expected['Pos']),
            alleles=[expected['Ref'], expected['Alt']],
            kid_id=expected['Child_ID'],
            dad_id=expected['Dad_ID'],
            mom_id=expected['Mom_ID'],
            p_de_novo=expected['Prob_dn'],
            # Only the part before the first underscore is compared.
            confidence=expected['Validation_Likelihood'].split('_')[0],
        )
        expected = expected.key_by(*join_key)

        joined = called.join(expected, how='outer')
        labels_match = joined.confidence == joined.confidence_1
        probabilities_close = hl.abs(joined.p_de_novo - joined.p_de_novo_1) < 1e-4
        self.assertTrue(joined.all(labels_match & probabilities_close))
Esempio n. 4
0
def compute_samocha_denovos(mt, pedigree):
    """Run ``hl.de_novo`` with gnomAD exome AF priors and flatten the result.

    Args:
        mt: MatrixTable handed to ``hl.de_novo``.
        pedigree: pedigree handed to ``hl.de_novo``.

    Returns:
        The ``hl.de_novo`` table in which ``proband``/``father``/``mother``
        hold the sample ID (``.s``) and the three ``*_entry`` structs are
        replaced by ``<role>_AB``, ``<role>_DP``, ``<role>_GQ`` and
        ``<role>_GT`` fields.
    """
    gnomad_sites = hl.read_table("gs://gnomad-public/release/2.1.1/liftover_grch38/ht/exomes/gnomad.exomes.r2.1.1.sites.liftover_grch38.ht")
    gnomad_sites = hl.split_multi_hts(gnomad_sites)

    # Prior = AF of the frequency entry indexed under "gnomad".
    gnomad_freq = gnomad_sites.freq[gnomad_sites.freq_index_dict["gnomad"]]
    prior_ht = gnomad_sites.select(AF=gnomad_freq.AF)

    calls = hl.de_novo(mt, pedigree, prior_ht[mt.row_key].AF)

    # Replace each sample struct by its sample ID.
    calls = calls.transmute(
        proband=calls.proband.s,
        father=calls.father.s,
        mother=calls.mother.s,
    )

    # Build the flat per-role fields in the order AB, DP, GQ, GT for
    # proband, father, mother, then add them in one annotate call.
    flat_fields = {}
    for role in ("proband", "father", "mother"):
        entry = calls[f"{role}_entry"]
        # Allele balance: second AD value over the sum of the first two.
        flat_fields[f"{role}_AB"] = entry.AD[1] / (entry.AD[0] + entry.AD[1])
        flat_fields[f"{role}_DP"] = entry.DP
        flat_fields[f"{role}_GQ"] = entry.GQ
        flat_fields[f"{role}_GT"] = entry.GT
    calls = calls.annotate(**flat_fields)

    return calls.drop(calls.proband_entry, calls.father_entry, calls.mother_entry)
Esempio n. 5
0
def main(args):
    """Write the trio matrix, family stats and de novo table for the input MatrixTable.

    Steps, in order: split multi-allelic sites and checkpoint; write the
    complete-trio matrix; write family stats and annotate them onto the rows;
    annotate ``gnomad_maf`` from the ``args.priors`` table; run ``hl.de_novo``
    and write the calls grouped per variant under ``de_novo_data``.

    Args:
        args: namespace providing ``truthset_table``, ``matrixtable``,
            ``output_dir``, ``trio_fam`` and ``priors``.
    """
    # Loaded here but not referenced again within this function.
    truthset_table = hl.read_table(args.truthset_table)
    group = "raw"  # likewise not referenced again within this function

    split_mt = hl.split_multi_hts(
        hl.read_matrix_table(args.matrixtable),
        keep_star=False,
        left_aligned=False,
        permit_shuffle=True,
    )
    split_mt = split_mt.checkpoint(
        f'{args.output_dir}/variant_qc/MegaWESSanger_cohorts_sampleQC_filtered_autosomes_split.mt',
        overwrite=True,
    )

    fam_path = args.trio_fam
    trio_pedigree = hl.Pedigree.read(fam_path)
    trio_mt = hl.trio_matrix(split_mt, trio_pedigree, complete_trios=True)
    trio_mt.write(f'{args.output_dir}/MegaWES_trio_table.mt', overwrite=True)

    # Only the first element of the returned pair is used below.
    family_stats_ht, famstats_ht = generate_family_stats(split_mt, fam_path)
    print("Writing mt and family stats_ht")
    family_stats_ht.write(
        f'{args.output_dir}/MegaWES_family_stats.ht', overwrite=True)
    family_stats = family_stats_ht[split_mt.row_key].family_stats
    stats_mt = split_mt.annotate_rows(family_stats=family_stats)
    stats_mt = stats_mt.checkpoint(
        f'{args.output_dir}/MegaWES_family_stats.mt', overwrite=True)

    prior_ht = hl.read_table(args.priors)
    stats_mt = stats_mt.annotate_rows(gnomad_maf=prior_ht[stats_mt.row_key].maf)
    # NOTE(review): `lustre_dir` is not defined in this function; presumably a
    # module-level global. Every other output here goes under args.output_dir.
    stats_mt = stats_mt.checkpoint(
        f'{lustre_dir}/variant_qc/MegaWES_family_stats_gnomad_AF.mt',
        overwrite=True,
    )

    calls = hl.de_novo(stats_mt, trio_pedigree, stats_mt.gnomad_maf)

    # One row per variant, with all of its calls collected into `de_novo_data`.
    calls_by_variant = calls.key_by('locus', 'alleles')
    calls_by_variant = calls_by_variant.collect_by_key('de_novo_data')
    calls_by_variant.write(
        f'{args.output_dir}/variant_qc/MegaWES_denovo_table.ht',
        overwrite=True,
    )
Esempio n. 6
0
def generate_de_novos(mt: hl.MatrixTable, fam_file: str, freq_data: hl.Table) -> hl.Table:
    """Call de novo variants for the samples listed in a fam file.

    Args:
        mt: input MatrixTable; its column and row fields are reduced with
            empty ``select_cols()`` / ``select_rows()`` calls before calling.
        fam_file: path read both by ``read_fam`` and by ``hl.Pedigree.read``.
        freq_data: table indexed by the row key; its ``family_stats`` field
            supplies the frequency prior.

    Returns:
        The ``hl.de_novo`` result keyed by ``locus``/``alleles``, with the
        calls of each variant collected under ``de_novo_data``.
    """
    # One row per sample ID found in the s, pat_id or mat_id columns.
    fam_rows = read_fam(fam_file).key_by()
    family_members = fam_rows.select(s=[fam_rows.s, fam_rows.pat_id, fam_rows.mat_id])
    family_members = family_members.explode('s').key_by('s')

    family_mt = mt.select_cols()
    family_mt = family_mt.filter_cols(hl.is_defined(family_members[family_mt.s]))
    family_mt = hl.split_multi_hts(family_mt.select_rows())
    family_mt = family_mt.annotate_rows(
        family_stats=freq_data[family_mt.row_key].family_stats)

    pedigree = hl.Pedigree.read(fam_file, delimiter='\\t')

    # Prior: second AF value of the first family_stats element's unrelated_qc_callstats.
    prior = family_mt.family_stats[0].unrelated_qc_callstats.AF[1]
    calls = hl.de_novo(family_mt, pedigree, prior)

    return calls.key_by('locus', 'alleles').collect_by_key('de_novo_data')
def main(args):
    """Write the trio matrix, family stats and de novo table for the input MatrixTable.

    Steps, in order: write the complete-trio matrix; write the two outputs of
    ``generate_family_stats``; annotate family stats onto the rows and write;
    re-read a family-stats MatrixTable from ``temp_dir``; annotate
    ``gnomad_maf`` from the ``args.priors`` table; run ``hl.de_novo`` and
    write the calls grouped per variant under ``de_novo_data``.

    Args:
        args: namespace providing ``truthset_table``, ``matrixtable``,
            ``trio_fam``, ``output_dir`` and ``priors``.
    """
    # Loaded here but not referenced again within this function.
    truthset_table = hl.read_table(args.truthset_table)
    group = "raw"  # likewise not referenced again within this function

    cohort_mt = hl.read_matrix_table(args.matrixtable)

    fam_path = args.trio_fam
    trio_pedigree = hl.Pedigree.read(fam_path)
    trio_mt = hl.trio_matrix(cohort_mt, trio_pedigree, complete_trios=True)
    trio_mt.write(
        f'{args.output_dir}/Sanger_cohort_trio_table.mt', overwrite=True)

    stats_mt, stats_ht = generate_family_stats(cohort_mt, fam_path)
    print("Writing mt and family stats_ht")
    # NOTE(review): this path is written again a few lines below, replacing
    # this output - confirm that is intended.
    stats_mt.write(
        f'{args.output_dir}/Sanger_cohorts_family_stats.mt', overwrite=True)
    stats_ht.write(
        f'{args.output_dir}/Sanger_cohorts_family_stats.ht', overwrite=True)

    family_stats = stats_ht[cohort_mt.row_key].family_stats
    cohort_mt = cohort_mt.annotate_rows(family_stats=family_stats)
    cohort_mt.write(
        f'{args.output_dir}/Sanger_cohorts_family_stats.mt', overwrite=True)

    # NOTE(review): `temp_dir` is not defined in this function; presumably a
    # module-level global. This reads a different location from the one just
    # written - confirm.
    cohort_mt = hl.read_matrix_table(
        f'{temp_dir}/ddd-elgh-ukbb/variant_qc/Sanger_cohorts_family_stats.mt')

    prior_ht = hl.read_table(args.priors)
    cohort_mt = cohort_mt.annotate_rows(gnomad_maf=prior_ht[cohort_mt.row_key].maf)

    calls = hl.de_novo(cohort_mt, trio_pedigree, cohort_mt.gnomad_maf)

    # One row per variant, with all of its calls collected into `de_novo_data`.
    calls_by_variant = calls.key_by('locus', 'alleles')
    calls_by_variant = calls_by_variant.collect_by_key('de_novo_data')
    calls_by_variant.write(
        f'{args.output_dir}/Sanger_cohort_denovo_table.ht', overwrite=True)
Esempio n. 8
0
# Load the pedigree and the VCF, split multi-allelic sites, then add the
# per-allele AC/AF (selected with a_index) and the variant QC metrics.
logger.info(f"Reading pedigree file {args.fam}")
pedigree = hl.Pedigree.read(args.fam)

logger.info(f"Importing vcf file {args.vcf}")
data = hl.split_multi_hts(
    hl.import_vcf(
        args.vcf,
        call_fields=['GT'],
        skip_invalid_loci=True,
        force_bgz=True,
    )
)
_allele_idx = data.a_index - 1
data = data.annotate_rows(
    AC=data.info.AC[_allele_idx],
    iAF=data.info.AF[_allele_idx],
)
data = hl.variant_qc(data)

# Score de novo events with the last variant_qc AF value as the prior, then
# copy p_de_novo back onto the matching entries of the full dataset.
logger.info("Applying de novo filter...")
de_novo_scores = hl.de_novo(
    data, pedigree, pop_frequency_prior=data.variant_qc.AF[-1])
de_novo_mt = de_novo_scores.to_matrix_table(
    row_key=['locus', 'alleles'], col_key=['id'])
_score_entry = de_novo_mt[(data.locus, data.alleles), data.s]
de_novo_data = data.annotate_entries(p_de_novo=_score_entry.p_de_novo)

# Attach the mother's and father's entries from the complete-trio matrix.
logger.info("Annotating trio data...")
trio_mt = hl.trio_matrix(de_novo_data, pedigree, complete_trios=True)
_trio_entry = trio_mt[(de_novo_data.locus, de_novo_data.alleles),
                      de_novo_data.s]
de_novo_data = de_novo_data.annotate_entries(
    mother=_trio_entry.mother_entry,
    father=_trio_entry.father_entry,
)
Esempio n. 9
0
def run_pipeline(args):
    """Annotate ``pcgc11_subset_vep.mt`` and export de novo calls to a TSV.

    Steps, in order: initialise Hail; read the VEP-annotated MatrixTable; add
    the sorted transcript consequences and summary fields taken from the
    first of them; add ``gnomad_af`` from the gnomAD exomes sites table
    (0.0 where the lookup is missing); run ``hl.de_novo`` with that prior;
    export the calls, keyed by ``locus``/``alleles``, to
    ``denovo_results.tsv``.

    Args:
        args: not used by this function.
    """
    hl.init(log='./hail_annotation_pipeline.log')

    # The input was produced by one-off steps that used to sit here as
    # commented-out code: allele splitting, sample subsetting, removal of
    # rows with AC[1] == 0, then VEP.
    mt = hl.read_matrix_table('pcgc11_subset_vep.mt')

    mt = mt.annotate_rows(
        sortedTranscriptConsequences=
        get_expr_for_vep_sorted_transcript_consequences_array(vep_root=mt.vep))
    # describe() prints the schema itself and returns None, so it is called
    # directly; wrapping it in pprint printed an extra "None" line.
    mt.describe()

    # Summary fields come from the first sorted consequence; rows without
    # any consequence get a missing string (or -1 for `canonical`).
    has_consequence = mt.sortedTranscriptConsequences.size() > 0
    top_consequence = mt.sortedTranscriptConsequences[0]
    missing_str = hl.null(hl.tstr)
    mt = mt.annotate_rows(
        gene_symbol=hl.cond(has_consequence,
                            top_consequence.gene_symbol,
                            missing_str),
        major_consequence=hl.cond(has_consequence,
                                  top_consequence.major_consequence,
                                  missing_str),
        hgvs=hl.cond(has_consequence, top_consequence.hgvs, missing_str),
        category=hl.cond(has_consequence,
                         top_consequence.category,
                         missing_str),
        canonical=hl.cond(has_consequence, top_consequence.canonical, -1))

    gnomad_exomes_ht = hl.read_table(
        '/gpfs/ycga/project/lek/shared/data/gnomad/gnomad.exomes.r2.1.1.sites.ht'
    )
    # Look the variant up once and reuse the result for both the
    # definedness check and the AF value.
    gnomad_row = gnomad_exomes_ht[mt.row_key]
    mt = mt.annotate_rows(
        gnomad_af=hl.cond(hl.is_defined(gnomad_row),
                          gnomad_row.freq[0].AF, 0.0))

    pedigree = hl.Pedigree.read(
        '/gpfs/ycga/project/gruber/nsd35/hail_DeNovo/PCGC11.fam')
    de_novo_results = hl.de_novo(mt,
                                 pedigree,
                                 pop_frequency_prior=mt.gnomad_af)
    de_novo_results = de_novo_results.key_by('locus', 'alleles')

    de_novo_results.export('denovo_results.tsv')
    '''
def main(args):
    """Generate the variant-QC input tables for the MatrixTable at ``args.matrixtable``.

    Outputs, written in this order: truth-set table; adj-annotated trio matrix
    and trio stats; inbreeding, allele-count and allele-data tables; the
    split-multi MatrixTable and its trio matrix; family stats (table and
    annotated MatrixTable); the MatrixTable annotated with ``gnomad_maf``;
    and the de novo table grouped per variant under ``de_novo_data``.

    Args:
        args: namespace providing ``matrixtable``, ``omni``, ``mills``,
            ``thousand_genomes``, ``hapmap``, ``output_dir``, ``trio_fam``
            and ``priors``.
    """
    mt = hl.read_matrix_table(args.matrixtable)

    # Truthset
    mt = hl.variant_qc(mt)

    truthset_ht = get_truth_ht(args.omni, args.mills, args.thousand_genomes,
                               args.hapmap)
    truthset_ht.write(f'{args.output_dir}/variant_qc/truthset_table.ht',
                      overwrite=True)

    # Trio data
    # trio annotation:
    logger.info("Trio annotation and writing trios_adj.mt")
    mt_adj = annotate_adj(mt)
    fam = args.trio_fam
    pedigree = hl.Pedigree.read(fam)
    trio_dataset = hl.trio_matrix(mt_adj, pedigree, complete_trios=True)
    # checkpoint() returns the dataset read back from the written file; keep
    # that handle so the trio stats below start from the checkpoint instead
    # of recomputing the trio matrix.
    trio_dataset = trio_dataset.checkpoint(
        f'{args.output_dir}/variant_qc/MegaWES_trios_adj.mt', overwrite=True)
    logger.info("Trio stats and writing MegaWes_stats.ht")
    trio_stats_ht = generate_trio_stats(trio_dataset,
                                        autosomes_only=True,
                                        bi_allelic_only=True)
    trio_stats_ht.write(f'{args.output_dir}/variant_qc/MegaWES_stats.ht',
                        overwrite=True)

    # inbreeding ht
    # Built from `mt` before the per-locus de-duplication just below.
    mt_inbreeding = mt.annotate_rows(
        InbreedingCoeff=bi_allelic_site_inbreeding_expr(mt.GT))

    # Keep a single row per locus, then restore the (locus, alleles) key.
    mt = mt.key_rows_by('locus').distinct_by_row().key_rows_by(
        'locus', 'alleles')

    ht_inbreeding = mt_inbreeding.rows()

    # allele data and qc_ac ht
    allele_data_ht = generate_allele_data(mt)

    qc_ac_ht = generate_ac(mt, fam)

    logger.info("Writing tables for inbreeding, allele counts")
    ht_inbreeding.write(
        f'{args.output_dir}/variant_qc/MegaWES_inbreeding_new.ht',
        overwrite=True)
    qc_ac_ht.write(f'{args.output_dir}/variant_qc/MegaWES_qc_ac_new.ht',
                   overwrite=True)
    allele_data_ht.write(
        f'{args.output_dir}/variant_qc/MegaWES_allele_data_new.ht',
        overwrite=True)

    # Trio matrix table
    logger.info("Split multi allelic variants and write mt")
    mt = hl.split_multi_hts(mt,
                            keep_star=False,
                            left_aligned=False,
                            permit_shuffle=True)
    mt = mt.checkpoint(
        f'{args.output_dir}/variant_qc/MegaWESSanger_cohorts_sampleQC_filtered_split.mt',
        overwrite=True)
    # `fam` and `pedigree` were already read from args.trio_fam earlier in
    # this function and are reused for the split dataset.
    logger.info("Trio matrixtable generation:")
    trio_dataset = hl.trio_matrix(mt, pedigree, complete_trios=True)
    trio_dataset.write(f'{args.output_dir}/variant_qc/MegaWES_trio_table.mt',
                       overwrite=True)

    # Family stats
    logger.info("Family stats")
    # Only the first element of the returned pair is used.
    ht1, _ = generate_family_stats(mt, fam)
    print("Writing mt and family stats_ht")
    ht1.write(f'{args.output_dir}/variant_qc/MegaWES_family_stats.ht',
              overwrite=True)

    mt = mt.annotate_rows(family_stats=ht1[mt.row_key].family_stats)
    mt = mt.checkpoint(f'{args.output_dir}/variant_qc/MegaWES_family_stats.mt',
                       overwrite=True)

    # Family stats with allele frequencies from the priors table
    logger.info("Family stats with gnomad AF")
    priors = hl.read_table(args.priors)
    mt = mt.annotate_rows(gnomad_maf=priors[mt.row_key].maf)
    # NOTE(review): `lustre_dir` is not defined in this function; presumably a
    # module-level global. Every other output here goes under args.output_dir.
    mt = mt.checkpoint(
        f'{lustre_dir}/variant_qc/MegaWES_family_stats_gnomad_AF.mt',
        overwrite=True)

    logger.info("De novo table cration")
    # De novo table: one row per variant, calls collected into `de_novo_data`.
    de_novo_table = hl.de_novo(mt, pedigree, mt.gnomad_maf)

    de_novo_table = de_novo_table.key_by(
        'locus', 'alleles').collect_by_key('de_novo_data')
    de_novo_table.write(
        f'{args.output_dir}/variant_qc/MegaWES_denovo_table.ht',
        overwrite=True)