def main(args):
    """
    Compute and/or export the gnomAD v3.0 genomes coverage table.

    With ``args.compute_coverage_ht`` set, a reference context HT is built via
    ``get_reference_ht`` (passing the ``telomeres_and_centromeres`` intervals
    as excluded intervals) and checkpointed, coverage stats are computed with
    ``compute_coverage_stats`` over the columns flagged ``release`` in the
    metadata, and the result is checkpointed, coalesced to 5000 partitions and
    written to the '3.0' genomes coverage resource path.

    With ``args.export_coverage`` set, the stored '3.0' coverage HT is read
    back and exported to the coverage TSV path, dropping ``count_array`` when
    that field is present.

    :param args: Parsed command-line arguments; this function reads
        ``compute_coverage_ht``, ``export_coverage`` and ``overwrite``.
    """
    hl.init(default_reference='GRCh38')
    logger = logging.getLogger("gnomad_qc.v3.load_data.compute_coverage")

    if args.compute_coverage_ht:
        print("Building reference context HT")
        grch38 = hl.get_reference('GRCh38')
        excluded = telomeres_and_centromeres.ht().interval.collect()
        ref_ht = get_reference_ht(grch38, excluded_intervals=excluded)
        # Checkpoint so the reference context is materialised once before use.
        ref_ht = ref_ht.checkpoint("gs://gnomad-tmp/ref_context.ht",
                                   overwrite=True)
        logger.info("Done building reference context HT")

        # Keep only the columns whose metadata row has `release` set.
        mt = get_gnomad_v3_mt()
        is_release = meta.ht()[mt.col_key].release
        mt = mt.filter_cols(is_release)

        summary_ht = compute_coverage_stats(mt, ref_ht).checkpoint(
            'gs://gnomad-tmp/gnomad.genomes_v3.coverage.summary.ht',
            overwrite=True,
        )
        summary_ht.naive_coalesce(5000).write(
            coverage('genomes').versions['3.0'].path,
            overwrite=args.overwrite,
        )

    if args.export_coverage:
        export_ht = coverage('genomes').versions['3.0'].ht()
        # Note that count_array isn't computed any more, so this is
        # v3.0-specific.
        if 'count_array' in export_ht.row_value:
            export_ht = export_ht.drop('count_array')
        export_ht.export(coverage_tsv_path('genomes', '3.0'))
Beispiel #2
0
def run_mendel_errors() -> hl.Table:
    """
    Run ``hl.mendel_errors`` on chr20 for the raw pedigree plus fake trios.

    The raw pedigree is merged with 100 fake trios created by
    ``create_fake_pedigree`` from a random draw of samples (each kept with
    probability 0.01) that have no ``qc_metrics_filters`` and no
    ``hard_filters``. The sparse MT is restricted to the pedigree samples and
    to chr20, split, densified and reduced to rows with exactly two alleles
    before calling ``hl.mendel_errors``.

    :return: The first of the four tables returned by ``hl.mendel_errors``.
    """
    meta_ht = meta.ht()
    ped = pedigree.versions["raw"].pedigree()
    logger.info(f"Running Mendel errors for {len(ped.trios)} trios.")

    # Random draw combined with the "no filters" predicate; a missing
    # hard_filters length is treated as not passing (or_else -> False).
    random_keep = hl.rand_bool(0.01)
    no_qc_filters = hl.len(meta_ht.qc_metrics_filters) == 0
    no_hard_filters = hl.or_else(hl.len(meta_ht.hard_filters) == 0, False)
    candidate_samples = meta_ht.aggregate(
        hl.agg.filter(
            random_keep & (no_qc_filters & no_hard_filters),
            hl.agg.collect_as_set(meta_ht.s),
        )
    )

    fake_ped = create_fake_pedigree(
        n=100,
        sample_list=list(candidate_samples),
        real_pedigree=ped,
    )
    merged_ped = hl.Pedigree(trios=ped.trios + fake_ped.trios)

    # Every sample ID (child, father, mother) appearing in any trio.
    trio_members = {
        sample_id
        for trio in merged_ped.trios
        for sample_id in (trio.s, trio.pat_id, trio.mat_id)
    }
    ped_samples = hl.literal(trio_members)

    mt = get_gnomad_v3_mt(key_by_locus_and_alleles=True)
    mt = mt.filter_cols(ped_samples.contains(mt.s))
    chr20 = hl.parse_locus_interval("chr20", reference_genome='GRCh38')
    mt = hl.filter_intervals(mt, [chr20])
    mt = hl.experimental.sparse_split_multi(mt, filter_changed_loci=True)
    # Only GT and END are carried through the densify step.
    mt = mt.select_entries("GT", "END")
    mt = hl.experimental.densify(mt)
    mt = mt.filter_rows(hl.len(mt.alleles) == 2)

    return hl.mendel_errors(mt["GT"], merged_ped)[0]
Beispiel #3
0
def main(args):
    """
    Generate the frequency table from the sparse gnomAD v3 MT.

    Steps, in order: read the sparse MT and a subset of the metadata fields,
    optionally restrict to chr20:1-1000000 (``args.test``), split
    multi-allelics, annotate and filter columns to ``release`` samples,
    annotate sex-adjusted ``GT`` and ``adj`` entries, densify, keep rows with
    more than one allele, annotate frequencies, select ``freq``/``faf``/
    ``popmax`` rows, add quality histograms, and write the rows table.

    In test mode the output goes to a fixed temporary path and is always
    overwritten; otherwise it is written to ``freq.path`` honouring
    ``args.overwrite``.

    :param args: Parsed command-line arguments; this function reads ``test``
        and ``overwrite``.
    """
    hl.init(log='/frequency_data_generation.log', default_reference='GRCh38')

    logger.info("Reading sparse MT and metadata table...")
    mt = get_gnomad_v3_mt(key_by_locus_and_alleles=True)
    meta_ht = meta.ht().select('pop', 'sex', 'project_id', 'release', 'sample_filters')

    if args.test:
        logger.info("Filtering to chr20:1-1000000")
        test_interval = hl.parse_locus_interval('chr20:1-1000000')
        mt = hl.filter_intervals(mt, [test_interval])

    mt = hl.experimental.sparse_split_multi(mt, filter_changed_loci=True)

    logger.info("Annotating sparse MT with metadata...")
    mt = mt.annotate_cols(meta=meta_ht[mt.s])
    mt = mt.filter_cols(mt.meta.release)
    samples = mt.count_cols()
    logger.info(f"Running frequency table prep and generation pipeline on {samples} samples")

    logger.info("Computing adj and sex adjusted genotypes.")
    # Both expressions are built from the pre-annotation entries, so `adj`
    # is derived from the original GT rather than the sex-adjusted one.
    sex_adjusted_gt = adjusted_sex_ploidy_expr(mt.locus, mt.GT, mt.meta.sex)
    adj_flag = get_adj_expr(mt.GT, mt.GQ, mt.DP, mt.AD)
    mt = mt.annotate_entries(GT=sex_adjusted_gt, adj=adj_flag)

    logger.info("Densify-ing...")
    mt = hl.experimental.densify(mt)
    mt = mt.filter_rows(hl.len(mt.alleles) > 1)

    logger.info("Generating frequency data...")
    mt = annotate_freq(mt, sex_expr=mt.meta.sex, pop_expr=mt.meta.pop)

    # Select freq, FAF and popmax
    faf, faf_meta = faf_expr(mt.freq, mt.freq_meta, mt.locus, POPS_TO_REMOVE_FOR_POPMAX)
    popmax = pop_max_expr(mt.freq, mt.freq_meta, POPS_TO_REMOVE_FOR_POPMAX)
    mt = mt.select_rows('freq', faf=faf, popmax=popmax)
    mt = mt.annotate_globals(faf_meta=faf_meta)

    # Annotate quality metrics histograms, as these also require densifying
    quality_hists = qual_hist_expr(mt.GT, mt.GQ, mt.DP, mt.AD)
    mt = mt.annotate_rows(**quality_hists)

    logger.info("Writing out frequency data...")
    freq_ht = mt.rows()
    if args.test:
        freq_ht.write("gs://gnomad-tmp/gnomad_freq/chr20_1_1000000_freq.ht", overwrite=True)
    else:
        freq_ht.write(freq.path, overwrite=args.overwrite)
Beispiel #4
0
def run_infer_families() -> hl.Pedigree:
    """
    Infer families and drop every trio that contains a QC-filtered sample.

    A sample counts as filtered when it has at least one entry in
    ``qc_metrics_filters`` or in ``hard_filters`` (a missing ``hard_filters``
    length is treated as not filtered).

    :return: Pedigree holding only the trios whose child, father and mother
        are all absent from the filtered-sample set.
    """
    logger.info("Inferring families")
    relatedness_ht = get_v3_relatedness_annotated_ht()
    sex_ht = v3_sex.ht()
    duplicates_ht = duplicates.ht()
    ped = infer_families(relatedness_ht, sex_ht, duplicates_ht)

    # Collect the IDs of all samples failing QC-metric or hard filters.
    meta_ht = meta.ht()
    fails_qc_metrics = hl.len(meta_ht.qc_metrics_filters) > 0
    fails_hard_filters = hl.or_else(hl.len(meta_ht.hard_filters) > 0, False)
    filtered_samples = meta_ht.aggregate(
        hl.agg.filter(
            fails_qc_metrics | fails_hard_filters,
            hl.agg.collect_as_set(meta_ht.s),
        )
    )

    def _all_members_pass(trio) -> bool:
        """Return True when no member of the trio is in the filtered set."""
        members = (trio.s, trio.pat_id, trio.mat_id)
        return not any(member in filtered_samples for member in members)

    return hl.Pedigree(
        trios=[trio for trio in ped.trios if _all_members_pass(trio)]
    )
Beispiel #5
0
def main(args):
    """
    Build the joint v2-exomes/v3-genomes QC MT and/or run PC-Relate on it.

    With ``args.join_qc_mt`` set, the lifted-over v2 exomes QC MT and the v3
    QC MT (restricted to ``release`` columns, with only key fields kept on
    rows and columns) are each re-keyed by ``(s, data_type)``, unioned by
    columns and written to a temporary path.

    With ``args.run_pc_relate`` set, that MT is read back, its rows are
    randomly downsampled with ``sample_rows(0.1)``, 10 HWE-normalized PCA
    scores are computed and checkpointed, and ``hl.pc_relate`` is run with
    those scores; the resulting table is written to the relatedness path.

    :param args: Parsed command-line arguments; this function reads
        ``join_qc_mt``, ``run_pc_relate`` and ``overwrite``.
    """
    # Written by the join step and read back by the PC-Relate step.
    joint_qc_mt_path = "gs://gnomad-tmp/v2_exomes_v3_joint_qc.mt"

    if args.join_qc_mt:
        exomes_mt = get_liftover_v2_qc_mt('exomes', ld_pruned=True, release_only=True)
        # data_type is added to the column key to tell the two sources apart.
        exomes_mt = exomes_mt.key_cols_by(s=exomes_mt.s, data_type="v2_exomes")

        genomes_mt = v3_qc.mt()
        genomes_mt = genomes_mt.filter_cols(meta.ht()[genomes_mt.col_key].release)
        genomes_mt = genomes_mt.select_rows().select_cols()
        genomes_mt = genomes_mt.key_cols_by(s=genomes_mt.s, data_type="v3_genomes")

        exomes_mt.union_cols(genomes_mt).write(joint_qc_mt_path, overwrite=args.overwrite)

    if args.run_pc_relate:
        logger.info('Running PC-Relate')
        logger.warning("PC-relate requires SSDs and doesn't work with preemptible workers!")
        joint_qc_mt = hl.read_matrix_table(joint_qc_mt_path).sample_rows(0.1)

        # Only the scores are needed; eigenvalues and loadings are discarded.
        _, scores, _ = hl.hwe_normalized_pca(joint_qc_mt.GT, k=10, compute_loadings=False)
        scores = scores.checkpoint(
            v2_v3_pc_relate_pca_scores.path,
            overwrite=args.overwrite,
            _read_if_exists=not args.overwrite,
        )

        relatedness_ht = hl.pc_relate(
            joint_qc_mt.GT,
            min_individual_maf=0.01,
            scores_expr=scores[joint_qc_mt.col_key].scores,
            block_size=4096,
            min_kinship=0.1,
            statistics='all',
        )
        relatedness_ht.write(v2_v3_relatedness.path, overwrite=args.overwrite)
Beispiel #6
0
def main(args):
    """
    Compute and/or export a versioned gnomAD v3 genomes coverage table.

    The coverage version is ``args.coverage_version`` when truthy, otherwise
    ``CURRENT_GENOME_COVERAGE_RELEASE``.

    With ``args.compute_coverage_ht`` set, a reference context HT is built for
    chr1-chr22, chrX and chrY (passing the ``telomeres_and_centromeres``
    intervals as excluded intervals) and checkpointed, coverage stats are
    computed over the columns flagged ``release`` in the metadata, and the
    result is checkpointed, coalesced to 5000 partitions and written to the
    coverage resource path for the chosen version.

    With ``args.export_coverage`` set, the stored coverage HT for that version
    is exported to the coverage TSV path, dropping ``count_array`` when that
    field is present.

    :param args: Parsed command-line arguments; this function reads
        ``coverage_version``, ``compute_coverage_ht``, ``export_coverage`` and
        ``overwrite``.
    """
    hl.init(default_reference='GRCh38')
    coverage_version = args.coverage_version or CURRENT_GENOME_COVERAGE_RELEASE
    logger = logging.getLogger("gnomad_qc.v3.load_data.compute_coverage")

    logger.warning(
        "Last time this was run (July 2020), this script required high-mem machines."
    )

    if args.compute_coverage_ht:
        print("Building reference context HT")
        grch38 = hl.get_reference('GRCh38')
        main_contigs = [f'chr{x}' for x in range(1, 23)] + ['chrX', 'chrY']
        excluded = telomeres_and_centromeres.ht().interval.collect()
        ref_ht = get_reference_ht(
            grch38,
            contigs=main_contigs,
            excluded_intervals=excluded,
        )
        ref_ht = ref_ht.checkpoint("gs://gnomad-tmp/ref_context.ht",
                                   overwrite=True)
        logger.info("Done building reference context HT")

        # Keep only the columns whose metadata row has `release` set.
        mt = get_gnomad_v3_mt()
        is_release = meta.ht()[mt.col_key].release
        mt = mt.filter_cols(is_release)

        summary_ht = compute_coverage_stats(mt, ref_ht).checkpoint(
            'gs://gnomad-tmp/gnomad.genomes_v3.coverage.summary.ht',
            overwrite=True,
        )
        summary_ht.naive_coalesce(5000).write(
            coverage('genomes').versions[coverage_version].path,
            overwrite=args.overwrite,
        )

    if args.export_coverage:
        export_ht = coverage('genomes').versions[coverage_version].ht()
        # Note that count_array isn't computed any more, so this is
        # v3.0-specific.
        if 'count_array' in export_ht.row_value:
            export_ht = export_ht.drop('count_array')
        export_ht.export(coverage_tsv_path('genomes', coverage_version))