Example #1
import hail as hl


def read_vcf(dirname: str, basename: str) -> hl.MatrixTable:
    # Work around a Hail whole-stage codegen issue (see also Example #15)
    hl._set_flags(no_whole_stage_codegen='1')
    # dirname is expected to end with a trailing slash
    vcf_file = '{}{}.vcf.gz'.format(dirname, basename)
    hl.import_vcf(vcf_file, force_bgz=True,
                  block_size=16).write('{}GWASpy.preimpQC.mt'.format(dirname),
                                       overwrite=True)
    in_mt = hl.read_matrix_table('{}GWASpy.preimpQC.mt'.format(dirname))

    # Unlike array data, a VCF may contain multi-allelic sites,
    # so split multi-allelic sites into bi-allelic rows
    print("Checking for multi-allelic sites")
    pre_filt_multi_n = in_mt.count_rows()
    bi = in_mt.filter_rows(hl.len(in_mt.alleles) == 2)
    bi = bi.annotate_rows(
        a_index=hl.missing(hl.tint)
    )  # hl.missing replaces the deprecated hl.null in newer Hail versions
    bi = bi.annotate_rows(was_split=False)

    multi = in_mt.filter_rows(hl.len(in_mt.alleles) > 2)
    split = hl.split_multi_hts(multi)

    in_mt = split.union_rows(bi)
    post_filt_multi_n = in_mt.count_rows()
    print("Number of additional bi-allelic rows from splitting multi-allelics: {}".format(
        post_filt_multi_n - pre_filt_multi_n))

    return in_mt
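A minimal usage sketch for read_vcf above; the bucket path and basename are
hypothetical, and dirname must end with a slash because the paths are built by
plain string concatenation:

hl.init()
mt = read_vcf(dirname='gs://my-bucket/data/', basename='mydata')  # hypothetical path
mt.describe()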
Example #2
    def wrapper(func, *args, **kwargs):
        old_flags = hl._get_flags('cpp')
        try:
            hl._set_flags(cpp='t')
            func(*args, **kwargs)
        finally:
            hl._set_flags(**old_flags)
Example #3
    def wrapper(func, *args, **kwargs):
        prev_flags = {k: v for k, v in hl._get_flags().items() if k in flags}

        hl._set_flags(**{k: '1' for k in flags})

        try:
            return func(*args, **kwargs)
        finally:
            hl._set_flags(**prev_flags)
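Wrappers like this are typically the inner function of a decorator. Below is a
self-contained sketch of how the pattern in Example #3 might be packaged as a
decorator factory; with_flags and run_pipeline are hypothetical names, and the
only Hail calls assumed are the hl._get_flags / hl._set_flags calls already
shown in these examples:

import functools

import hail as hl


def with_flags(*flags):
    """Enable the given Hail flags ('1') around a call, then restore prior values."""
    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            # None means the flag was previously unset; _set_flags(flag=None) unsets it again
            prev_flags = {k: hl._get_flags().get(k) for k in flags}
            hl._set_flags(**{k: '1' for k in flags})
            try:
                return func(*args, **kwargs)
            finally:
                hl._set_flags(**prev_flags)
        return wrapper
    return decorator


@with_flags('no_whole_stage_codegen')
def run_pipeline():
    ...  # Hail queries that need the flag while they run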
Example #4
    def wrapper(func, *args, **kwargs):
        flags = hl._get_flags()
        prev_lower = flags.get('lower')
        prev_lower_only = flags.get('lower_only')

        hl._set_flags(lower='1', lower_only='1')

        try:
            return func(*args, **kwargs)
        finally:
            hl._set_flags(lower=prev_lower, lower_only=prev_lower_only)
Example #5
    def run(self):
        # Hack that fixes something in Hail. TODO: Remove when Hail fix comes.
        hl._set_flags(newaggs=None)
        variants_mt = hl.read_matrix_table(self.input()[0].path)
        genotypes_mt = hl.read_matrix_table(self.input()[1].path)
        row_ht = genotypes_mt.rows().join(variants_mt.rows())

        row_ht = SeqrVariantsAndGenotypesSchema.elasticsearch_row(row_ht)
        self.export_table_to_elasticsearch(row_ht,
                                           self._mt_num_shards(genotypes_mt))

        self.cleanup()
Example #6
    def run(self):
        # Hack that fixes something in Hail. TODO: Remove when Hail fix comes.
        hl._set_flags(newaggs=None)
        mt = hl.read_matrix_table(self.input()[0].path)

        if self.remap_path:
            mt = self.remap_sample_ids(mt, self.remap_path)
        if self.subset_path:
            mt = self.subset_samples_and_variants(mt, self.subset_path)

        mt = SeqrGenotypesSchema(mt).annotate_all(
            overwrite=True).select_annotated_mt()

        mt.describe()
        mt.write(self.output().path, stage_locally=True, overwrite=True)
Example #7
def compile_2k_merge(path):
    flagname = 'no_ir_logging'
    prev_flag_value = hl._get_flags(flagname).get(flagname)
    try:
        hl._set_flags(**{flagname: '1'})
        vcf = setup(path)
        vcfs = [vc_all.transform_gvcf(vcf)] * COMBINE_GVCF_MAX
        combined = [vc_all.combine_gvcfs(vcfs)] * 20
        with TemporaryDirectory() as tmpdir:
            hl.experimental.write_matrix_tables(combined,
                                                os.path.join(
                                                    tmpdir,
                                                    'combiner-multi-write'),
                                                overwrite=True)
    finally:
        hl._set_flags(**{flagname: prev_flag_value})
Example #8
    def run(self):
        flagname = 'no_ir_logging'
        prev_flag_value = hl._get_flags(flagname).get(flagname)
        hl._set_flags(**{flagname: '1'})

        try:
            vds_samples = sum(vds.n_samples for vdses in self.vdses.values()
                              for vds in vdses)
            info(
                'Running VDS combiner:\n'
                f'    VDS arguments: {self._num_vdses} datasets with {vds_samples} samples\n'
                f'    GVCF arguments: {len(self.gvcfs)} inputs/samples\n'
                f'    Branch factor: {self.branch_factor}\n'
                f'    GVCF merge batch size: {self.gvcf_batch_size}')
            while not self.finished:
                self.save()
                self.step()
            self.save()
            info('Finished VDS combiner!')
        finally:
            # Restore the previous flag value even if a step fails,
            # mirroring the try/finally in compile_2k_merge above
            hl._set_flags(**{flagname: prev_flag_value})
Example #9
def main():
    # Note: `args` below is assumed to be an argparse namespace defined at module scope
    hl.init(
        log="/ccdg_contamination_estimator.log",
        tmp_dir="gs://ccdg-4day-temp/contamination/",
    )
    hl._set_flags(distributed_scan_comb_op="1")

    DATA_TYPE = ["genomes", "exomes"]

    for data_type in DATA_TYPE:
        autosomes_only = not args.not_autosomes_only
        flag = "autosomes_only" if autosomes_only else "all_chromosomes"
        print(f"data: ccdg_{data_type}")
        print(flag)
        mt = get_contamination_metric_stats(
            data_type=data_type, autosomes_only=autosomes_only
        ).persist()
        mt.cols().write(
            get_ccdg_results_path(
                data_type=data_type, mt=False, result=f"contamination_metric_{flag}"
            ),
            overwrite=args.overwrite,
        )
Example #10
def main(args):
    hl.init(
        log="/variant_filter.log",
        tmp_dir="gs://ccdg-30day-temp/",
        default_reference="GRCh38",
    )
    # TODO: This flag can be removed if this error is no longer relevant: log4j:ERROR Failed to flush writer,
    #  java.io.IOException: No space left on device when trying to write a densified MT from VDS
    hl._set_flags(distributed_scan_comb_op="1")

    if args.update_ccdg_exome_interval_table:
        ccdg_interval_qc_ht(args.pct_samples_defined, overwrite=True)

    determine_pca_variants(
        autosomes_only=not args.not_autosomes_only,
        bi_allelic_only=not args.not_bi_allelic_only,
        adj_only=not args.not_adj_only,
        snv_only=not args.not_snv_only,
        min_gnomad_v3_ac=args.gnomad_v3_ac_filter,
        high_qual_ccdg_exome_interval_only=not args.not_high_qual_ccdg_interval_only,
        high_qual_ukbb_exome_interval_only=not args.not_high_qual_ukbb_interval_only,
        filter_lcr=not args.not_filter_lcr,
        filter_segdup=not args.not_filter_segdup,
        min_joint_af=args.min_af,
        min_joint_callrate=args.min_callrate,
        min_ccdg_exome_callrate=args.ccdg_exome_callrate_cutoff,
        min_ukbb_exome_callrate=args.ukbb_exome_callrate_cutoff,
        ld_pruning=not args.not_ld_pruning,
        ld_pruning_dataset=args.ld_pruning_dataset,
        ld_r2=args.ld_r2,
        read_per_dataset_checkpoint_if_exists=args.read_per_dataset_checkpoint_if_exists,
        read_pre_ld_prune_ht_checkpoint_if_exists=args.read_pre_ld_prune_ht_checkpoint_if_exists,
        read_pre_ld_prune_mt_checkpoint_if_exists=args.read_pre_ld_prune_mt_checkpoint_if_exists,
        overwrite=args.overwrite,
        filter_washu=args.filter_washu,
    )
Example #11
    def wrapper(func, *args, **kwargs):
        old_flags = hl._get_flags('cpp')
        hl._set_flags(cpp='t')
        func(*args, **kwargs)
        # Note: if func raises, the flags are never restored; see Example #2
        # for an exception-safe try/finally variant of this wrapper
        hl._set_flags(**old_flags)
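The same save/set/restore pattern can also be written once as a context
manager. A minimal sketch, assuming only the private hl._get_flags /
hl._set_flags calls used throughout these examples; temporary_flags is a
hypothetical name:

from contextlib import contextmanager

import hail as hl


@contextmanager
def temporary_flags(**flags):
    # Record prior values; None means a flag was previously unset
    prev = {k: hl._get_flags().get(k) for k in flags}
    hl._set_flags(**flags)
    try:
        yield
    finally:
        # Passing None back unsets a flag again, as Example #4 does
        hl._set_flags(**prev)


with temporary_flags(cpp='t'):
    ...  # code that needs the flag enabled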
Example #12
def determine_pca_variants(
    autosomes_only: bool = True,
    snv_only: bool = True,
    bi_allelic_only: bool = False,
    adj_only: bool = True,
    min_gnomad_v3_ac: Optional[int] = None,
    high_qual_ccdg_exome_interval_only: bool = False,
    high_qual_ukbb_exome_interval_only: bool = False,
    pct_samples_ukbb_exome_interval: float = 0.8,
    min_joint_af: float = 0.0001,  # TODO: Konrad mentioned that he might want to lower this
    min_joint_callrate: float = 0.95,
    min_inbreeding_coeff_threshold: Optional[float] = -0.8,
    min_hardy_weinberg_threshold: Optional[float] = 1e-8,
    min_ccdg_exome_callrate: float = 0.99,  # TODO: What parameter should this start with?
    min_ukbb_exome_callrate: float = 0.99,  # TODO: What parameter should this start with?
    filter_lcr: bool = True,
    filter_segdup: bool = True,
    ld_pruning: bool = True,
    ld_pruning_dataset: str = "ccdg_genomes",
    ld_r2: float = 0.1,
    read_per_dataset_checkpoint_if_exists: bool = False,
    read_pre_ld_prune_ht_checkpoint_if_exists: bool = False,
    read_pre_ld_prune_mt_checkpoint_if_exists: bool = False,
    overwrite: bool = True,
    filter_washu: bool = False,
) -> None:
    """
    Determine a diverse set of variants for relatedness/ancestry PCA using CCDG, gnomAD v3, and UK Biobank.

    :param autosomes_only: Whether to filter to variants in autosomes
    :param snv_only: Whether to filter to SNVs
    :param bi_allelic_only: Whether to filter to variants that are bi-allelic in either CCDG or gnomAD v3
    :param adj_only: If set, only ADJ genotypes (QD >= 2, FS <= 60 and MQ >= 30) are kept. This filter is applied before the call rate and AF calculation
    :param min_gnomad_v3_ac: Optional lower bound of AC for variants in gnomAD v3 genomes
    :param high_qual_ccdg_exome_interval_only: Whether to filter to high quality intervals in CCDG exomes
    :param high_qual_ukbb_exome_interval_only: Whether to filter to high quality intervals in UKBB 455K exomes
    :param pct_samples_ukbb_exome_interval: Proportion of samples that must have coverage over 20x on an interval for the interval to be kept
    :param min_joint_af: Lower bound for combined MAF computed from CCDG and gnomAD v3 genomes
    :param min_joint_callrate: Lower bound for combined callrate computed from CCDG and gnomAD v3 genomes
    :param min_inbreeding_coeff_threshold: Minimum site inbreeding coefficient to keep. Not applied if set to `None`
    :param min_hardy_weinberg_threshold: Minimum site HW test p-value to keep. Not applied if set to `None`
    :param min_ccdg_exome_callrate: Lower bound for CCDG exomes callrate
    :param min_ukbb_exome_callrate: Lower bound for UKBB exomes callrate
    :param filter_lcr: Whether to filter LCR regions
    :param filter_segdup: Whether to filter Segdup regions
    :param ld_pruning: Whether to conduct LD pruning
    :param ld_pruning_dataset: Which dataset is used for LD pruning, 'ccdg_genomes' or 'gnomad_genomes'
    :param ld_r2: LD pruning cutoff
    :param read_per_dataset_checkpoint_if_exists: Whether to read the CCDG exome/genome pre-filtered HT if it exists.
        Each dataset is possibly filtered to: autosomes only, SNVs only, gnomAD v3.1.2 AC filter, CCDG high quality exome
        intervals, and UK Biobank high quality exome intervals
    :param read_pre_ld_prune_ht_checkpoint_if_exists: Whether to read in the PCA variant HT with no LD-pruning if it exists
    :param read_pre_ld_prune_mt_checkpoint_if_exists: Whether to read in the checkpointed MT filtered to variants in the
        PCA variant HT with no LD-pruning if it exists
    :param overwrite: Whether to overwrite the final variant HT
    :param filter_washu: Whether to filter out washU samples
    :return: None; the final Table of variants for PCA is written out rather than returned
    """
    if not read_pre_ld_prune_ht_checkpoint_if_exists:
        logger.info(
            "Loading gnomAD v3.1.2 release HT and UK Biobank 455K release HT ..."
        )
        flag = "_without_washu" if filter_washu else ""
        gnomad_ht = gnomad_public_release("genomes").ht()
        gnomad_ht = gnomad_ht.select(
            gnomad_was_split=gnomad_ht.was_split,
            gnomad_AC=gnomad_ht.freq[0].AC,
            gnomad_AN=gnomad_ht.freq[0].AN,
            gnomad_genomes_site_inbreeding_coeff=gnomad_ht.info.InbreedingCoeff,
            gnomad_genomes_homozygote_count=gnomad_ht.freq[0].homozygote_count,
        )
        if min_hardy_weinberg_threshold is not None:
            gnomad_ht = gnomad_ht.annotate(
                gnomad_genomes_hwe=hl.hardy_weinberg_test(
                    hl.int32(
                        (gnomad_ht.gnomad_AN / 2)
                        - gnomad_ht.gnomad_genomes_homozygote_count
                        - (
                            gnomad_ht.gnomad_AC
                            - (gnomad_ht.gnomad_genomes_homozygote_count * 2)
                        )
                    ),  # Num hom ref genotypes
                    hl.int32(
                        (
                            gnomad_ht.gnomad_AC
                            - (gnomad_ht.gnomad_genomes_homozygote_count * 2)
                        )
                    ),  # Num het genotypes
                    gnomad_ht.gnomad_genomes_homozygote_count,  # Num hom alt genotypes
                ),
            )
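            # Worked check with hypothetical numbers: AN=100, AC=10, hom_alt=2
            # gives het = 10 - 2*2 = 6 and hom_ref = 100/2 - 2 - 6 = 42,
            # and 42 + 6 + 2 = 50 = AN/2 genotypes, as expected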

        ukbb_ht = hl.read_table(ukbb_release_ht_path("broad", 7))
        ukbb_ht = ukbb_ht.select(
            ukbb_AC=ukbb_ht.freq[0].AC,
            ukbb_AN=ukbb_ht.freq[0].AN,
        )
        ukbb_meta_ht = hl.read_table(ukbb_meta_ht_path("broad", 7))

        # Only count samples used in the UK Biobank exome frequency calculations
        ukbb_exome_count = ukbb_meta_ht.filter(
            ukbb_meta_ht.sample_filters.high_quality
            & hl.is_defined(ukbb_meta_ht.ukbb_meta.batch)
            & ~ukbb_meta_ht.sample_filters.related
        ).count()

        logger.info("Getting CCDG genome and exome sample counts...")
        ccdg_genome_count = get_ccdg_vds(
            "genomes", filter_washu=filter_washu
        ).variant_data.count_cols()
        logger.info(f"Number of CCDG genome samples: {ccdg_genome_count}...")
        ccdg_exome_count = get_ccdg_vds("exomes").variant_data.count_cols()
        logger.info(f"Number of CCDG exome samples: {ccdg_exome_count} ...")

        def _initial_filter(data_type):
            """
            Get Table of CCDG variants passing desired filters.

            Possible filters are:
                - Autosomes only
                - SNVs only
                - gnomAD v3.1.2 AC filter
                - CCDG high quality exome intervals
                - UK Biobank high quality exome intervals

            After densification of the VDS, rows are annotated with:
                - ccdg_{data_type}_was_split
                - ccdg_{data_type}_AC
                - ccdg_{data_type}_AN

            The filtered and annotated rows are returned as a Table and are also checkpointed.

            :param data_type: Whether data is from genomes or exomes

            :return: Table of CCDG filtered variants
            """
            logger.info(
                "Loading CCDG %s VDS and splitting multi-allelics for initial filtering steps...",
                data_type,
            )
            vds = get_ccdg_vds(data_type, filter_washu=filter_washu)
            logger.info(
                f"{vds.variant_data.count_cols()} CCDG {data_type} samples loaded..."
            )
            vds = hl.vds.split_multi(vds)

            if autosomes_only:
                logger.info("Filtering CCDG %s VDS to autosomes...", data_type)
                vds = hl.vds.filter_chromosomes(vds, keep_autosomes=True)

            ht = vds.variant_data.rows()
            variant_filter_expr = True
            if snv_only:
                logger.info("Filtering CCDG %s VDS to SNVs...", data_type)
                variant_filter_expr &= hl.is_snp(ht.alleles[0], ht.alleles[1])

            if min_gnomad_v3_ac:
                logger.info(
                    "Filtering CCDG %s VDS to gnomAD v3.1.2 variants with adj-filtered AC > %d...",
                    data_type,
                    min_gnomad_v3_ac,
                )
                variant_filter_expr &= gnomad_ht[ht.key].gnomad_AC > min_gnomad_v3_ac

            vds = hl.vds.filter_variants(vds, ht.filter(variant_filter_expr), keep=True)

            if high_qual_ccdg_exome_interval_only:
                logger.info(
                    f"Filtering CCDG %s VDS to high quality (>80%% of samples with %dX coverage) CCDG exome intervals...",
                    data_type,
                    INTERVAL_DP,
                )
                interval_qc_ht = hl.read_table(
                    get_ccdg_results_path(
                        data_type="exomes", result=f"intervals_{INTERVAL_DP}x"
                    )
                )
                interval_qc_ht = interval_qc_ht.filter(interval_qc_ht.to_keep)
                vds = hl.vds.filter_intervals(
                    vds, intervals=interval_qc_ht.interval.collect(), keep=True
                )

            if high_qual_ukbb_exome_interval_only:
                if not autosomes_only:
                    raise ValueError(
                        "UK Biobank interval QC filtering is only available for autosomes!"
                    )

                logger.info(
                    "Filtering CCDG %s VDS to high quality (>80%% of samples with 20X coverage) UK Biobank exome intervals...",
                    data_type,
                )
                interval_qc_ht = hl.read_table(
                    ukbb_interval_qc_path("broad", 7, "autosomes")
                )  # Note: freeze 7 is all included in gnomAD v4
                interval_qc_ht = interval_qc_ht.filter(
                    interval_qc_ht["pct_samples_20x"] > pct_samples_ukbb_exome_interval
                )
                vds = hl.vds.filter_intervals(
                    vds, intervals=interval_qc_ht.interval.collect(), keep=True
                )

            logger.info("Densifying filtered CCDG %s VDS...", data_type)
            mt = hl.vds.to_dense_mt(vds)
            if adj_only:
                mt = filter_to_adj(mt)

            annotation_expr = {
                f"ccdg_{data_type}_was_split": mt.was_split,
                f"ccdg_{data_type}_AC": hl.agg.sum(mt.GT.n_alt_alleles()),
                f"ccdg_{data_type}_AN": hl.agg.count_where(hl.is_defined(mt.GT)) * 2,
            }

            if min_inbreeding_coeff_threshold is not None:
                annotation_expr[
                    f"ccdg_{data_type}_site_inbreeding_coeff"
                ] = bi_allelic_site_inbreeding_expr(mt.GT)
            if min_hardy_weinberg_threshold is not None:
                annotation_expr[f"ccdg_{data_type}_hwe"] = hl.agg.hardy_weinberg_test(
                    mt.GT
                )

            mt = mt.annotate_rows(**annotation_expr)
            ht = mt.rows().checkpoint(
                get_ccdg_results_path(
                    data_type=data_type,
                    mt=False,
                    result=f"pre_filtered_variants_interval{INTERVAL_DP}x{flag}",
                ),
                overwrite=(not read_per_dataset_checkpoint_if_exists),
                _read_if_exists=read_per_dataset_checkpoint_if_exists,
            )

            return ht

        logger.info(
            "Creating Table with joint gnomAD v3.1.2 and CCDG genome allele frequencies and callrate...",
        )
        ccdg_genomes_ht = _initial_filter("genomes")
        ccdg_exomes_ht = _initial_filter("exomes")
        ht = ccdg_exomes_ht.join(ccdg_genomes_ht, how="inner")
        ht = ht.annotate(**gnomad_ht[ht.key], **ukbb_ht[ht.key])
        ht = ht.annotate(
            joint_biallelic=(~ht.ccdg_genomes_was_split) | (~ht.gnomad_was_split),
            joint_AC=ht.ccdg_genomes_AC + ht.gnomad_AC,
            joint_AN=ht.ccdg_genomes_AN + ht.gnomad_AN,
        )
        total_genome_an = hl.eval(
            (gnomad_ht.freq_sample_count[0] + ccdg_genome_count) * 2
        )
        ht = ht.annotate(
            joint_AF=ht.joint_AC / ht.joint_AN,
            joint_callrate=ht.joint_AN / total_genome_an,
        )
        ht = ht.checkpoint(
            f"{get_joint_pca_variants_ht_path(filter_washu=filter_washu)}",
            overwrite=(not read_pre_ld_prune_ht_checkpoint_if_exists),
            _read_if_exists=read_pre_ld_prune_ht_checkpoint_if_exists,
        )

        logger.info(
            "Filtering variants to combined gnomAD v3.1.2 and CCDG genome AF of %.3f and callrate of %.2f, CCDG exome callrate "
            "of %.2f, and UK Biobank exome callrate of %.2f....",
            min_joint_af,
            min_joint_callrate,
            min_ccdg_exome_callrate,
            min_ukbb_exome_callrate,
        )

        variant_filter_expr = True
        if bi_allelic_only:
            variant_filter_expr &= ht.joint_biallelic
        if min_inbreeding_coeff_threshold is not None:
            variant_filter_expr &= (
                ht.ccdg_genomes_site_inbreeding_coeff > min_inbreeding_coeff_threshold
            ) & (
                ht.gnomad_genomes_site_inbreeding_coeff > min_inbreeding_coeff_threshold
            )
        if min_hardy_weinberg_threshold is not None:
            variant_filter_expr &= (
                ht.ccdg_genomes_hwe.p_value > min_hardy_weinberg_threshold
            ) & (ht.gnomad_genomes_hwe.p_value > min_hardy_weinberg_threshold)

        variant_filter_expr &= (
            (ht.joint_AF > min_joint_af)
            & (ht.joint_callrate > min_joint_callrate)
            & (ht.ccdg_exomes_AN / (ccdg_exome_count * 2) > min_ccdg_exome_callrate)
            & (ht.ukbb_AN / (ukbb_exome_count * 2) > min_ukbb_exome_callrate)
        )

        ht = ht.filter(variant_filter_expr)

        ht = ht.annotate_globals(
            autosomes_only=autosomes_only,
            snv_only=snv_only,
            adj_only=adj_only,
            bi_allelic_only=bi_allelic_only,
            min_gnomad_v3_ac=min_gnomad_v3_ac,
            high_qual_ccdg_exome_interval_only=high_qual_ccdg_exome_interval_only,
            high_qual_ukbb_exome_interval_only=high_qual_ukbb_exome_interval_only,
            filter_lcr=filter_lcr,
            filter_segdup=filter_segdup,
            min_af=min_joint_af,
            min_callrate=min_joint_callrate,
            min_ccdg_exome_callrate=min_ccdg_exome_callrate,
            min_ukbb_exome_callrate=min_ukbb_exome_callrate,
            min_inbreeding_coeff_threshold=min_inbreeding_coeff_threshold,
            min_hardy_weinberg_threshold=min_hardy_weinberg_threshold,
        )

        ht = filter_low_conf_regions(
            ht,
            filter_lcr=filter_lcr,
            filter_decoy=False,  # No decoy for GRCh38
            filter_segdup=filter_segdup,
        )

        ht = ht.checkpoint(
            get_pca_variants_path(ld_pruned=False, filter_washu=filter_washu),
            overwrite=True,
        )
    else:
        ht = hl.read_table(
            get_pca_variants_path(
                ld_pruned=False, data=ld_pruning_dataset, filter_washu=filter_washu
            )
        )

    if ld_pruning:
        # TODO: Check whether this is still required
        logger.warning(
            "The LD-prune step of this function requires non-preemptible workers only!"
        )
        logger.info("Creating Table after LD pruning of %s...", ld_pruning_dataset)
        if ld_pruning_dataset == "ccdg_genomes":
            vds = get_ccdg_vds("genomes")
            vds = hl.vds.split_multi(vds, filter_changed_loci=True)
            vds = hl.vds.filter_variants(vds, ht, keep=True)
            mt = hl.vds.to_dense_mt(vds)
        elif ld_pruning_dataset == "gnomad_genomes":
            mt = get_gnomad_v3_mt(key_by_locus_and_alleles=True)
            logger.info("Converting gnomAD v3.1 MatrixTable to VDS...")
            mt = mt.select_entries(
                "END", "LA", "LGT", adj=get_adj_expr(mt.LGT, mt.GQ, mt.DP, mt.LAD)
            )
            vds = hl.vds.VariantDataset.from_merged_representation(mt)

            logger.info("Performing split-multi and filtering variants...")
            vds = hl.vds.split_multi(vds, filter_changed_loci=True)
            vds = hl.vds.filter_variants(vds, ht)

            logger.info("Densifying data...")
            mt = hl.vds.to_dense_mt(vds)
        else:
            raise ValueError(
                "Only options for LD pruning are `ccdg_genomes` and `gnomad_genomes`"
            )

        hl._set_flags(no_whole_stage_codegen="1")
        mt = mt.checkpoint(
            get_pca_variants_path(ld_pruned=False, data=ld_pruning_dataset, mt=True),
            overwrite=(not read_pre_ld_prune_mt_checkpoint_if_exists),
            _read_if_exists=read_pre_ld_prune_mt_checkpoint_if_exists,
        )
        hl._set_flags(no_whole_stage_codegen=None)
        ht = hl.ld_prune(mt.GT, r2=ld_r2)
        ht = ht.annotate_globals(ld_r2=ld_r2, ld_pruning_dataset=ld_pruning_dataset)
        ht = ht.checkpoint(
            get_pca_variants_path(ld_pruned=True, data=ld_pruning_dataset),
            overwrite=overwrite,
            _read_if_exists=(not overwrite),
        )
        mt = mt.filter_rows(hl.is_defined(ht[mt.row_key]))
        mt.naive_coalesce(1000).write(
            get_pca_variants_path(ld_pruned=True, data=ld_pruning_dataset, mt=True),
            overwrite=overwrite,
        )
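A minimal invocation sketch for determine_pca_variants; the keyword values
below are hypothetical choices, and everything omitted falls back to the
documented defaults:

determine_pca_variants(
    autosomes_only=True,
    bi_allelic_only=False,
    min_joint_af=0.0001,
    ld_pruning=True,
    ld_pruning_dataset="ccdg_genomes",
    ld_r2=0.1,
    overwrite=True,
)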
Example #13
    def run(self):
        # Hack that fixes something in Hail. TODO: Remove when Hail fix comes.
        hl._set_flags(newaggs=None)
        # We only want to use the Variant Schema.
        self.read_vcf_write_mt(schema_cls=SeqrVariantSchema)
Example #14
def main(args):
    hl.init(log="/seqr_sample_qc.log")
    hl._set_flags(no_whole_stage_codegen="1") #Flag needed for hail 0.2.93, may be able to remove in future release.
    logger.info("Beginning seqr sample QC pipeline...")

    data_type = args.data_type
    build = args.build
    data_source = args.data_source
    version = args.callset_version
    is_test = args.is_test
    overwrite = args.overwrite

    logger.info("Importing callset...")
    if not args.skip_write_mt:
        logger.info("Converting vcf to MatrixTable...")
        mt = hl.import_vcf(
            args.vcf_path,
            force_bgz=True,
            reference_genome=f"GRCh{build}",
            min_partitions=4,
        ).write(
            mt_path(build, data_type, data_source, version, is_test), overwrite=True
        )
    mt = hl.read_matrix_table(mt_path(build, data_type, data_source, version, is_test))
    mt = mt.annotate_entries(
        GT=hl.case()
        .when(mt.GT.is_diploid(), hl.call(mt.GT[0], mt.GT[1], phased=False))
        .when(mt.GT.is_haploid(), hl.call(mt.GT[0], phased=False))
        .default(hl.missing(hl.tcall))
    )
    if not args.skip_validate_mt:
        logger.info("Validating data type...")
        validate_mt(mt, build, data_type)

    if is_test:
        logger.info("Creating test mt...")
        mt = hl.filter_intervals(
            mt,
            [
                hl.parse_locus_interval(
                    hl.if_else(build == "37", "20", "chr20"),
                    reference_genome=f"GRCh{build}",
                )
            ],
        ).persist()

    logger.info("Annotating with sequencing metrics and filtered callrate...")
    meta_ht = get_all_sample_metadata(mt, build, data_type, data_source, version)
    mt = mt.annotate_cols(**meta_ht[mt.col_key], data_type=data_type)

    logger.info("Annotating with sample metric filter flags...")
    metric_thresholds = {
        "callrate_thres": args.callrate_low_threshold,
        "contam_thres": args.contam_up_threshold,
        "chimera_thres": args.chimera_up_threshold,
        "wes_cov_thres": args.wes_coverage_low_threshold,
        "wgs_cov_thres": args.wgs_coverage_low_threshold,
    }
    mt = mt.annotate_cols(
        filter_flags=apply_filter_flags_expr(mt, data_type, metric_thresholds)
    )

    logger.info("Assign platform or product")
    if data_type == "WES" and data_source == "External":
        logger.info("Running platform imputation...")
        plat_ht = run_platform_imputation(
            mt,
            args.plat_min_cluster_size,
            args.plat_min_sample_size,
            args.plat_assignment_pcs,
        )
        mt = mt.annotate_cols(**plat_ht[mt.col_key])
    elif data_source == "Internal":
        logger.info("Assigning platform from product in metadata...")
        mt = mt.annotate_cols(
            qc_platform=hl.if_else(hl.is_defined(mt.PRODUCT), mt.PRODUCT, "Unknown")
        )

        missing_metrics = mt.filter_cols(hl.is_defined(mt.PRODUCT), keep=False)
        missing_metrics.cols().select().export(
            missing_metrics_path(build, data_type, data_source, version)
        )  # TODO: Add a logging step that prints unexpectedly missing samples
    else:
        mt = mt.annotate_cols(qc_platform="Unknown")

    logger.info("Projecting gnomAD population PCs...")
    pop_ht = run_population_pca(mt, build)
    mt = mt.annotate_cols(**pop_ht[mt.col_key])

    logger.info("Running Hail's sample qc...")
    hail_metric_ht = run_hail_sample_qc(mt, data_type)
    mt = mt.annotate_cols(**hail_metric_ht[mt.col_key])

    logger.info("Exporting sample QC tables...")
    ht = mt.cols()
    ht = ht.checkpoint(
        sample_qc_ht_path(build, data_type, data_source, version, is_test), overwrite
    )
    ht.flatten().export(sample_qc_tsv_path(build, data_type, data_source, version))
Example #15
import logging
from typing import Dict, List, Tuple

import hail as hl

from gnomad.utils.reference_genome import get_reference_genome
from gnomad_qc.v2.resources.sample_qc import qc_mt_path
from gnomad_qc.v3.resources.sample_qc import qc

logging.basicConfig(
    format="%(asctime)s (%(name)s %(lineno)s): %(message)s",
    datefmt="%m/%d/%Y %I:%M:%S %p",
)
logger = logging.getLogger("check pedigree")
logger.setLevel(logging.INFO)

logger.info("Setting hail flag to avoid array index out of bounds error...")
# Setting this flag isn't generally recommended, but since at least Hail 0.2.75 it
# is needed to avoid an array index out of bounds error until the underlying issue
# is fixed in a later Hail release
# TODO: Reassess whether this flag is still needed in future versions of Hail
hl._set_flags(no_whole_stage_codegen="1")


def subset_samples(
    input_mt: hl.MatrixTable,
    pedigree: hl.Table,
    sex_ht: hl.Table,
    output_dir: str,
    output_name: str,
) -> Tuple[hl.MatrixTable, hl.Table, list, list]:
    """
    Filter the MatrixTable and sex Table to only samples in the pedigree.

    :param input_mt: MatrixTable
    :param pedigree: Pedigree file from seqr loaded as a Hail Table
    :param sex_ht: Table of inferred sexes for each sample