def main(args):
    hl.init(default_reference='GRCh38')
    logger = logging.getLogger("gnomad_qc.v3.load_data.compute_coverage")

    if args.compute_coverage_ht:

        print("Building reference context HT")
        ref_ht = get_reference_ht(
            hl.get_reference('GRCh38'),
            excluded_intervals=telomeres_and_centromeres.ht().interval.collect())
        ref_ht = ref_ht.checkpoint("gs://gnomad-tmp/ref_context.ht",
                                   overwrite=True)
        logger.info("Done building reference context HT")

        mt = get_gnomad_v3_mt()
        mt = mt.filter_cols(meta.ht()[mt.col_key].release)

        coverage_ht = compute_coverage_stats(mt, ref_ht)

        coverage_ht = coverage_ht.checkpoint(
            'gs://gnomad-tmp/gnomad.genomes_v3.coverage.summary.ht',
            overwrite=True)
        coverage_ht = coverage_ht.naive_coalesce(5000)

        coverage_ht.write(coverage('genomes').versions['3.0'].path,
                          overwrite=args.overwrite)

    if args.export_coverage:
        ht = coverage('genomes').versions['3.0'].ht()
        if 'count_array' in ht.row_value:  # Note that count_array isn't computed any more, so this is v3.0-specific
            ht = ht.drop('count_array')
        ht.export(coverage_tsv_path('genomes', '3.0'))
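
The argument parser is not shown in these examples; a minimal sketch of the argparse setup this main(args) assumes (flag names taken from the attributes used above, help text purely illustrative) could look like:

if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--compute_coverage_ht', action='store_true',
                        help='Compute the coverage Hail Table.')
    parser.add_argument('--export_coverage', action='store_true',
                        help='Export the coverage Table as a TSV.')
    parser.add_argument('--overwrite', action='store_true',
                        help='Overwrite existing output files.')
    main(parser.parse_args())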
Example #2
def run_vep() -> hl.Table:
    def get_mt_partitions(mt_path: str) -> List[hl.Interval]:
        """
        This function loads the partitioning from a given MT.
        Note that because it relies on hardcoded paths within the MT that are still in flux,
        it isn't guaranteed to work on future versions of the MT format.

        :param str mt_path: MT path
        :return: MT partitions
        :rtype: List of Interval
        """
        logger.info(f'Reading partitions for {mt_path}')
        import json
        from os import path
        mt = hl.read_matrix_table(mt_path)
        with hl.hadoop_open(
                path.join(mt_path, 'rows', 'rows', 'metadata.json.gz')) as f:
            intervals_json = json.load(f)['jRangeBounds']
            return hl.tarray(hl.tinterval(hl.tstruct(
                locus=mt.locus.dtype)))._convert_from_json(intervals_json)

    ht = get_gnomad_v3_mt(key_by_locus_and_alleles=True).rows()
    ht = ht.filter(hl.len(ht.alleles) > 1)

    return vep_or_lookup_vep(ht, reference='GRCh38')
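
Note that get_mt_partitions is defined but never called in the body shown above. If it were lifted to module level, one plausible use (everything here is an assumption: the path is a placeholder and _intervals is an underscore-prefixed, unstable Hail argument) is reading a matrix table with its on-disk partitioning:

# Hypothetical usage of the helper above; `some_mt_path` is a placeholder.
some_mt_path = 'gs://my-bucket/some_dataset.mt'
partitions = get_mt_partitions(some_mt_path)
logger.info(f'Found {len(partitions)} partitions')
# `_intervals` is experimental and may change between Hail versions; the
# intervals must be over a prefix of the table's row key.
mt = hl.read_matrix_table(some_mt_path, _intervals=partitions)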
Example #3
def run_mendel_errors() -> hl.Table:
    meta_ht = meta.ht()
    ped = pedigree.versions["raw"].pedigree()
    logger.info(f"Running Mendel errors for {len(ped.trios)} trios.")

    fake_ped = create_fake_pedigree(
        n=100,
        sample_list=list(
            meta_ht.aggregate(
                hl.agg.filter(
                    hl.rand_bool(0.01)
                    & ((hl.len(meta_ht.qc_metrics_filters) == 0)
                       & hl.or_else(hl.len(meta_ht.hard_filters) == 0, False)),
                    hl.agg.collect_as_set(meta_ht.s),
                ))),
        real_pedigree=ped,
    )
    merged_ped = hl.Pedigree(trios=ped.trios + fake_ped.trios)

    ped_samples = hl.literal(
        set([
            s for trio in merged_ped.trios
            for s in [trio.s, trio.pat_id, trio.mat_id]
        ]))
    mt = get_gnomad_v3_mt(key_by_locus_and_alleles=True)
    mt = mt.filter_cols(ped_samples.contains(mt.s))
    mt = hl.filter_intervals(
        mt, [hl.parse_locus_interval("chr20", reference_genome='GRCh38')])
    mt = hl.experimental.sparse_split_multi(mt, filter_changed_loci=True)
    mt = mt.select_entries("GT", "END")
    mt = hl.experimental.densify(mt)
    mt = mt.filter_rows(hl.len(mt.alleles) == 2)
    mendel_errors, _, _, _ = hl.mendel_errors(mt["GT"], merged_ped)
    return mendel_errors
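
Only the first (all-errors) table is kept above; per the Hail documentation, hl.mendel_errors also returns per-family, per-sample, and per-variant summary tables. An illustrative variation, not part of the original function, that keeps and inspects the per-family summary:

# Illustrative only: keep the summary tables as well.
all_errors, per_fam, per_sample, per_variant = hl.mendel_errors(mt['GT'], merged_ped)
# Show the nuclear families with the most Mendel errors first.
per_fam = per_fam.order_by(hl.desc(per_fam.errors))
per_fam.show()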
Example #4
def compute_info() -> hl.Table:
    """
    Computes an HT with the typical GATK AS and site-level info fields,
    as well as ACs and lowqual fields.
    Note that this table doesn't split multi-allelic sites.

    :return: Table with info fields
    :rtype: Table
    """
    mt = get_gnomad_v3_mt(key_by_locus_and_alleles=True,
                          remove_hard_filtered_samples=False)
    mt = mt.filter_rows((hl.len(mt.alleles) > 1))
    mt = mt.transmute_entries(**mt.gvcf_info)

    # Compute AS and site level info expr
    # Note that production defaults have changed:
    # For new releases, the `RAW_MQandDP` field replaces the `RAW_MQ` and `MQ_DP` fields
    info_expr = get_site_info_expr(
        mt,
        sum_agg_fields=INFO_SUM_AGG_FIELDS + ['RAW_MQ'],
        int32_sum_agg_fields=INFO_INT32_SUM_AGG_FIELDS + ['MQ_DP'],
        array_sum_agg_fields=['SB'])
    info_expr = info_expr.annotate(
        **get_as_info_expr(
            mt,
            sum_agg_fields=INFO_SUM_AGG_FIELDS + ['RAW_MQ'],
            int32_sum_agg_fields=INFO_INT32_SUM_AGG_FIELDS + ['MQ_DP'],
            array_sum_agg_fields=['SB']))

    # Add AC and AC_raw:
    # First compute ACs for each non-ref allele, grouped by adj
    grp_ac_expr = hl.agg.array_agg(
        lambda ai: hl.agg.filter(
            mt.LA.contains(ai),
            hl.agg.group_by(
                get_adj_expr(mt.LGT, mt.GQ, mt.DP, mt.LAD),
                hl.agg.sum(
                    mt.LGT.one_hot_alleles(mt.LA.map(lambda x: hl.str(x)))[
                        mt.LA.index(ai)]))),
        hl.range(1, hl.len(mt.alleles)))

    # Then, for each non-ref allele, compute
    # AC as the adj group
    # AC_raw as the sum of adj and non-adj groups
    info_expr = info_expr.annotate(
        AC_raw=grp_ac_expr.map(
            lambda i: hl.int32(i.get(True, 0) + i.get(False, 0))),
        AC=grp_ac_expr.map(lambda i: hl.int32(i.get(True, 0))))

    info_ht = mt.select_rows(info=info_expr).rows()

    # Add lowqual flag
    info_ht = info_ht.annotate(lowqual=get_lowqual_expr(
        info_ht.alleles,
        info_ht.info.QUALapprox,
        # The indel het prior used for gnomad v3 was 1/10k bases (phred=40).
        # This value is usually 1/8k bases (phred=39).
        indel_phred_het_prior=40))

    return info_ht.naive_coalesce(5000)
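
Because the table is left unsplit, downstream use typically splits it first. A rough sketch, assuming hl.split_multi and its 1-based a_index semantics (this is not the production split_info() used in Example #9):

# Rough sketch only; operates on the unsplit info HT produced by compute_info().
info_ht = compute_info()
split_ht = hl.split_multi(info_ht)
# a_index is 1-based over the original alt alleles, so index per-allele arrays with a_index - 1.
split_ht = split_ht.annotate(
    info=split_ht.info.annotate(
        AC=split_ht.info.AC[split_ht.a_index - 1],
        AC_raw=split_ht.info.AC_raw[split_ht.a_index - 1]))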
Example #5
def main(args):
    hl.init(log='/frequency_data_generation.log', default_reference='GRCh38')

    logger.info("Reading sparse MT and metadata table...")
    mt = get_gnomad_v3_mt(key_by_locus_and_alleles=True)
    meta_ht = meta.ht().select('pop', 'sex', 'project_id', 'release', 'sample_filters')

    if args.test:
        logger.info("Filtering to chr20:1-1000000")
        mt = hl.filter_intervals(mt, [hl.parse_locus_interval('chr20:1-1000000')])

    mt = hl.experimental.sparse_split_multi(mt, filter_changed_loci=True)

    logger.info("Annotating sparse MT with metadata...")
    mt = mt.annotate_cols(meta=meta_ht[mt.s])
    mt = mt.filter_cols(mt.meta.release)
    samples = mt.count_cols()
    logger.info(f"Running frequency table prep and generation pipeline on {samples} samples")

    logger.info("Computing adj and sex adjusted genotypes.")
    mt = mt.annotate_entries(
        GT=adjusted_sex_ploidy_expr(mt.locus, mt.GT, mt.meta.sex),
        adj=get_adj_expr(mt.GT, mt.GQ, mt.DP, mt.AD)
    )

    logger.info("Densify-ing...")
    mt = hl.experimental.densify(mt)
    mt = mt.filter_rows(hl.len(mt.alleles) > 1)

    logger.info("Generating frequency data...")
    mt = annotate_freq(
        mt,
        sex_expr=mt.meta.sex,
        pop_expr=mt.meta.pop
    )

    # Select freq, FAF and popmax
    faf, faf_meta = faf_expr(mt.freq, mt.freq_meta, mt.locus, POPS_TO_REMOVE_FOR_POPMAX)
    mt = mt.select_rows(
        'freq',
        faf=faf,
        popmax=pop_max_expr(mt.freq, mt.freq_meta, POPS_TO_REMOVE_FOR_POPMAX)
    )
    mt = mt.annotate_globals(faf_meta=faf_meta)

    # Annotate quality metrics histograms, as these also require densifying
    mt = mt.annotate_rows(
        **qual_hist_expr(mt.GT, mt.GQ, mt.DP, mt.AD)
    )

    logger.info("Writing out frequency data...")
    if args.test:
        mt.rows().write("gs://gnomad-tmp/gnomad_freq/chr20_1_1000000_freq.ht", overwrite=True)
    else:
        mt.rows().write(freq.path, overwrite=args.overwrite)
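
A small illustrative read-back of the test output written above (assuming, as in gnomad_methods, that annotate_freq stores freq_meta as a table global describing how each freq entry is stratified):

# Illustrative only; uses the --test output path written above.
freq_ht = hl.read_table("gs://gnomad-tmp/gnomad_freq/chr20_1_1000000_freq.ht")
freq_ht.describe()
# Each dict in freq_meta describes the grouping of the corresponding freq entry.
print(hl.eval(freq_ht.freq_meta)[:5])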
Example #6
def main(args):
    mt = get_gnomad_v3_mt(key_by_locus_and_alleles=True)
    mt = mt.annotate_entries(
        gvcf_info=mt.gvcf_info.drop('ClippingRankSum', 'ReadPosRankSum'))
    mt = mt.annotate_rows(
        n_unsplit_alleles=hl.len(mt.alleles),
        mixed_site=(hl.len(mt.alleles) > 2)
        & hl.any(lambda a: hl.is_indel(mt.alleles[0], a), mt.alleles[1:])
        & hl.any(lambda a: hl.is_snp(mt.alleles[0], a), mt.alleles[1:]))
    mt = hl.experimental.sparse_split_multi(mt, filter_changed_loci=True)
    mt.write(args.split_mt_location, overwrite=args.overwrite)
Example #7
def compute_stats(stats_path: str):
    mt = get_gnomad_v3_mt()
    mt = mt.filter_entries(hl.is_defined(mt.END))
    ref_block_stats = mt.aggregate_entries(
        hl.struct(
            ref_block_stats=hl.struct(
                stats=hl.agg.stats(mt.END - mt.locus.position),
                hist=hl.agg.hist(mt.END - mt.locus.position, 0, 9999, 10000),
                hist_log=hl.agg.hist(
                    hl.log10(1 + mt.END - mt.locus.position), 0, 5, 100)),
            adj_ref_block_stats=hl.agg.filter(
                get_adj_expr(mt.LGT, mt.GQ, mt.DP, mt.LAD),
                hl.struct(
                    stats=hl.agg.stats(mt.END - mt.locus.position),
                    hist=hl.agg.hist(mt.END - mt.locus.position, 0, 9999, 10000),
                    hist_log=hl.agg.hist(
                        hl.log10(1 + mt.END - mt.locus.position), 0, 5, 100)))))

    with hl.hadoop_open(stats_path, 'wb') as f:
        pickle.dump(ref_block_stats, f)
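
A companion sketch, not part of the original script, for reading the pickled stats back from the same stats_path:

import pickle

import hail as hl

def read_stats(stats_path: str):
    # Load the struct pickled by compute_stats above.
    with hl.hadoop_open(stats_path, 'rb') as f:
        ref_block_stats = pickle.load(f)
    print(ref_block_stats.ref_block_stats.stats)
    print(ref_block_stats.adj_ref_block_stats.stats)
    return ref_block_stats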
Example #8
def main(args):
    hl.init(default_reference='GRCh38')
    coverage_version = args.coverage_version if args.coverage_version else CURRENT_GENOME_COVERAGE_RELEASE
    logger = logging.getLogger("gnomad_qc.v3.load_data.compute_coverage")

    logger.warning(
        "Last time this was run (July 2020), this script required high-mem machines."
    )

    if args.compute_coverage_ht:

        print("Building reference context HT")
        ref_ht = get_reference_ht(
            hl.get_reference('GRCh38'),
            contigs=[f'chr{x}' for x in range(1, 23)] + ['chrX', 'chrY'],
            excluded_intervals=telomeres_and_centromeres.ht().interval.collect())
        ref_ht = ref_ht.checkpoint("gs://gnomad-tmp/ref_context.ht",
                                   overwrite=True)
        logger.info("Done building reference context HT")

        mt = get_gnomad_v3_mt()
        mt = mt.filter_cols(meta.ht()[mt.col_key].release)

        coverage_ht = compute_coverage_stats(mt, ref_ht)

        coverage_ht = coverage_ht.checkpoint(
            'gs://gnomad-tmp/gnomad.genomes_v3.coverage.summary.ht',
            overwrite=True)
        coverage_ht = coverage_ht.naive_coalesce(5000)

        coverage_ht.write(coverage('genomes').versions[coverage_version].path,
                          overwrite=args.overwrite)

    if args.export_coverage:
        ht = coverage('genomes').versions[coverage_version].ht()
        if 'count_array' in ht.row_value:  # Note that count_array isn't computed any more, so this is v3.0-specific
            ht = ht.drop('count_array')
        ht.export(coverage_tsv_path('genomes', coverage_version))
Example #9
def main(args):
    hl.init(default_reference='GRCh38', log='/qc_annotations.log')

    if args.compute_info:
        compute_info().write(get_info(split=False).path,
                             overwrite=args.overwrite)

    if args.split_info:
        split_info().write(get_info(split=True).path, overwrite=args.overwrite)

    if args.export_info_vcf:
        info_ht = get_info(split=False).ht()
        hl.export_vcf(ht_to_vcf_mt(info_ht), info_vcf_path)

    # if args.generate_ac: # TODO: compute AC and qc_AC as part of compute_info
    # mt = get_gnomad_v3_mt(key_by_locus_and_alleles=True, samples_meta=True)
    # mt = hl.experimental.sparse_split_multi(mt, filter_changed_loci=True)
    #
    # ht = generate_ac(mt, ).checkpoint('gs://gnomad-tmp/v3_ac_tmp.ht', overwrite=args.overwrite, _read_if_exists=not args.overwrite)
    # ht.repartition(10000, shuffle=False).write(ac_ht_path, overwrite=args.overwrite)

    if args.generate_fam_stats:
        mt = get_gnomad_v3_mt(key_by_locus_and_alleles=True, samples_meta=True)
        mt = hl.experimental.sparse_split_multi(mt, filter_changed_loci=True)
        fam_stats_ht = generate_fam_stats(mt, trios.path)
        fam_stats_ht = fam_stats_ht.checkpoint(
            'gs://gnomad-tmp/v3_fam_stats_tmp.ht',
            overwrite=args.overwrite,
            _read_if_exists=not args.overwrite)
        fam_stats_ht = fam_stats_ht.repartition(10000, shuffle=False)
        fam_stats_ht.write(fam_stats.path, overwrite=args.overwrite)

    if args.export_transmitted_singletons_vcf:
        export_transmitted_singletons_vcf()

    if args.vep:
        run_vep().write(vep.path, overwrite=args.overwrite)
Example #10
def main(args):
    hl.init(log="/variant_qc_evaluation.log")

    if args.create_bin_ht:
        create_bin_ht(
            args.model_id,
            args.n_bins,
        ).write(
            get_score_bins(args.model_id, aggregated=False).path,
            overwrite=args.overwrite,
        )

    if args.run_sanity_checks:
        ht = get_score_bins(args.model_id, aggregated=False).ht()
        logger.info("Running sanity checks...")
        print(
            ht.aggregate(
                hl.struct(
                    was_biallelic=hl.agg.counter(~ht.was_split),
                    has_biallelic_rank=hl.agg.counter(
                        hl.is_defined(ht.biallelic_bin)),
                    was_singleton=hl.agg.counter(ht.singleton),
                    has_singleton_rank=hl.agg.counter(
                        hl.is_defined(ht.singleton_bin)),
                    was_biallelic_singleton=hl.agg.counter(
                        ht.singleton & ~ht.was_split),
                    has_biallelic_singleton_rank=hl.agg.counter(
                        hl.is_defined(ht.biallelic_singleton_bin)),
                )))

    if args.create_aggregated_bin_ht:
        logger.warning(
            "Use only non-preemptible workers; this step typically crashes with preemptibles.")
        create_aggregated_bin_ht(args.model_id).write(
            get_score_bins(args.model_id, aggregated=True).path,
            overwrite=args.overwrite,
        )

    if args.extract_truth_samples:
        logger.info(f"Extracting truth samples from MT...")
        mt = get_gnomad_v3_mt(key_by_locus_and_alleles=True,
                              remove_hard_filtered_samples=False)

        mt = mt.filter_cols(
            hl.literal([v["s"]
                        for k, v in TRUTH_SAMPLES.items()]).contains(mt.s))
        mt = hl.experimental.sparse_split_multi(mt, filter_changed_loci=True)

        # Checkpoint to prevent needing to go through the large table a second time
        mt = mt.checkpoint(
            get_checkpoint_path("truth_samples", mt=True),
            overwrite=args.overwrite,
        )

        for truth_sample in TRUTH_SAMPLES:
            truth_sample_mt = mt.filter_cols(
                mt.s == TRUTH_SAMPLES[truth_sample]["s"])
            # Filter to variants in truth data
            truth_sample_mt = truth_sample_mt.filter_rows(
                hl.agg.any(truth_sample_mt.GT.is_non_ref()))
            truth_sample_mt.naive_coalesce(args.n_partitions).write(
                get_callset_truth_data(truth_sample).path,
                overwrite=args.overwrite,
            )

    if args.merge_with_truth_data:
        for truth_sample in TRUTH_SAMPLES:
            logger.info(
                f"Creating a merged table with callset truth sample and truth data for {truth_sample}..."
            )

            # Load truth data
            mt = get_callset_truth_data(truth_sample).mt()
            truth_hc_intervals = TRUTH_SAMPLES[truth_sample][
                "hc_intervals"].ht()
            truth_mt = TRUTH_SAMPLES[truth_sample]["truth_mt"].mt()
            truth_mt = truth_mt.key_cols_by(
                s=hl.str(TRUTH_SAMPLES[truth_sample]["s"]))

            # Remove low quality sites
            info_ht = get_info(split=True).ht()
            mt = mt.filter_rows(~info_ht[mt.row_key].AS_lowqual)

            ht = create_truth_sample_ht(mt, truth_mt, truth_hc_intervals)
            ht.write(
                get_callset_truth_data(truth_sample, mt=False).path,
                overwrite=args.overwrite,
            )

    if args.bin_truth_sample_concordance:
        for truth_sample in TRUTH_SAMPLES:
            logger.info(
                f"Creating binned concordance table for {truth_sample} for model {args.model_id}"
            )
            ht = get_callset_truth_data(truth_sample, mt=False).ht()

            info_ht = get_info(split=True).ht()
            ht = ht.filter(
                ~info_ht[ht.key].AS_lowqual
                & ~hl.is_defined(telomeres_and_centromeres.ht()[ht.locus]))

            logger.info("Filtering out low confidence regions and segdups...")
            ht = filter_low_conf_regions(
                ht,
                filter_lcr=True,
                # TODO: Uncomment when we have decoy path
                filter_decoy=False,  # True,
                filter_segdup=True,
            )

            logger.info(
                "Loading HT containing RF or VQSR scores annotated with a bin based on the rank of score..."
            )
            metric_ht = get_score_bins(args.model_id, aggregated=False).ht()
            ht = ht.filter(hl.is_defined(metric_ht[ht.key]))

            ht = ht.annotate(score=metric_ht[ht.key].score)

            ht = compute_binned_truth_sample_concordance(
                ht, metric_ht, args.n_bins)
            ht.write(
                get_binned_concordance(args.model_id, truth_sample).path,
                overwrite=args.overwrite,
            )
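
The TRUTH_SAMPLES mapping is imported from the resources module and not shown here; judging from the keys accessed above ("s", "truth_mt", "hc_intervals"), its shape is roughly the following (all names and values purely illustrative):

# Purely illustrative shape; the real mapping lives in the gnomad_qc v3 resources.
TRUTH_SAMPLES = {
    "truth_sample_1": {                # hypothetical key
        "s": "SAMPLE_ID_IN_CALLSET",   # sample ID as it appears in the gnomAD MT
        "truth_mt": None,              # in reality, a resource exposing .mt()
        "hc_intervals": None,          # in reality, a resource exposing .ht()
    },
}
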
import hail as hl
from gnomad_qc.v3.resources import get_gnomad_v3_mt, last_END_position

# END RESOURCES

mt = get_gnomad_v3_mt()
mt = mt.select_entries('END')
t = mt._localize_entries('__entries', '__cols')
t = t.select(
    last_END_position=hl.or_else(
        hl.min(
            hl.scan.array_agg(
                lambda entry: hl.scan._prev_nonnull(
                    hl.or_missing(
                        hl.is_defined(entry.END),
                        hl.tuple([t.locus, entry.END]))),
                t.__entries,
            ).map(
                lambda x: hl.or_missing(
                    (x[1] >= t.locus.position)
                    & (x[0].contig == t.locus.contig),
                    x[0].position))),
        t.locus.position))
t.write(last_END_position.path, overwrite=True)
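
An illustrative read-back of the table written above (interval chosen arbitrarily): for each locus, last_END_position holds the smallest start position of any reference block overlapping that locus, i.e. how far back a densify has to look.

# Illustrative only; the interval is arbitrary.
last_end = hl.read_table(last_END_position.path)
interval = hl.parse_locus_interval('chr20:1000000-1001000', reference_genome='GRCh38')
last_end.filter(interval.contains(last_end.locus)).show()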