def compute_sex() -> hl.Table:
    """
    Impute sample sex for the gnomAD v3 callset.

    Sex chromosome ploidy estimates from `impute_sex_ploidy` are combined with the
    chrX annotations produced by `hl.impute_sex`, and the result is annotated with
    the expressions built by `get_sex_expr` from the cutoffs that
    `get_ploidy_cutoffs` returns (called with an f-stat cutoff of 0.5).

    :return: Table with ploidy, `hl.impute_sex` and `get_sex_expr` annotations.
    """
    # Step 1: sex chromosome ploidy. Hard-filtered samples are kept, and the
    # telomere/centromere table is passed as the excluded calling intervals.
    full_mt = get_gnomad_v3_mt(remove_hard_filtered_samples=False)
    ploidy_ht = impute_sex_ploidy(
        full_mt,
        excluded_calling_intervals=telomeres_and_centromeres.ht(),
    )
    ploidy_ht = ploidy_ht.checkpoint('gs://gnomad-tmp/sex_depth.ht', overwrite=True)

    # Step 2: F-stat on chrX sites that have exactly two alleles.
    x_mt = get_gnomad_v3_mt(
        key_by_locus_and_alleles=True,
        remove_hard_filtered_samples=False,
    )
    n_samples = x_mt.count_cols()
    x_mt = hl.filter_intervals(x_mt, [hl.parse_locus_interval('chrX')])
    x_mt = x_mt.filter_rows(hl.len(x_mt.alleles) == 2)

    # AF is approximated as AC / (2 * n_samples). Missing genotypes are not
    # accounted for, but this avoids densifying the data and should be fine
    # for this purpose.
    info_ht = hl.filter_intervals(
        get_info(split=False).ht(),
        [hl.parse_locus_interval('chrX')],
    )
    site_ac = info_ht[x_mt.row_key].info.AC[0]
    x_mt = x_mt.annotate_rows(aaf=site_ac / (2 * n_samples))
    f_stat_ht = hl.impute_sex(x_mt.LGT, aaf_threshold=0.001, aaf='aaf')

    # Step 3: merge both sources and add the `get_sex_expr` annotations.
    sex_ht = ploidy_ht.annotate(**f_stat_ht[ploidy_ht.key])
    x_cutoffs, y_cutoffs = get_ploidy_cutoffs(sex_ht, f_stat_cutoff=0.5)
    sex_annotations = get_sex_expr(
        sex_ht.chrX_ploidy,
        sex_ht.chrY_ploidy,
        x_cutoffs,
        y_cutoffs,
    )
    return sex_ht.annotate(**sex_annotations)
def main(args):
    """
    Run the sample QC steps selected by the flags set on `args`.

    Each step is gated by a boolean attribute of `args` and writes its result to
    the path of the corresponding resource. Steps run in the order they appear
    below; later steps read the resources written by earlier ones.

    :param args: Parsed options (presumably an `argparse.Namespace` - confirm
        against the caller). Attributes read here: the step flags `sample_qc`,
        `compute_qc_mt`, `impute_sex`, `reannotate_sex`, `compute_hard_filters`,
        `run_pc_relate`, `run_pca`, `assign_pops`, `apply_stratified_filters`,
        `apply_regressed_filters`, `compute_related_samples_to_drop`,
        `generate_metadata`, and the options `overwrite`, `min_cov`,
        `kin_threshold`, `include_unreleasable_samples`, `n_pcs`, `min_pop_prob`
        and `filtering_qc_metrics` (a comma-separated string).
    :return: None
    """
    hl.init(log='/hail.log', default_reference='GRCh38')

    if args.sample_qc:
        compute_sample_qc().write(get_sample_qc().path, overwrite=args.overwrite)

    if args.compute_qc_mt:
        compute_qc_mt().write(v3_qc.path, overwrite=args.overwrite)

    if args.impute_sex:
        compute_sex().write(v3_sex.path, overwrite=args.overwrite)
    elif args.reannotate_sex:
        # Copy HT to temp location to overwrite annotation
        sex_ht = v3_sex.ht().checkpoint(
            'gs://gnomad-tmp/sex_ht_checkpoint.ht', overwrite=True)
        x_ploidy_cutoff, y_ploidy_cutoff = get_ploidy_cutoffs(
            sex_ht, f_stat_cutoff=0.5)
        sex_ht = sex_ht.annotate(
            **get_sex_expr(sex_ht.chrX_ploidy, sex_ht.chrY_ploidy,
                           x_ploidy_cutoff, y_ploidy_cutoff))
        sex_ht.write(v3_sex.path, overwrite=args.overwrite)

    if args.compute_hard_filters:
        compute_hard_filters(args.min_cov).write(
            hard_filtered_samples.path, overwrite=args.overwrite)

    if args.run_pc_relate:
        logger.info('Running PC-Relate')
        # `Logger.warn` is a deprecated alias; `warning` is the supported name.
        logger.warning(
            "PC-relate requires SSDs and doesn't work with preemptible workers!"
        )
        qc_mt = v3_qc.mt()
        # Only the scores are used; eigenvalues and loadings are discarded.
        _, scores, _ = hl.hwe_normalized_pca(qc_mt.GT, k=10,
                                             compute_loadings=False)
        scores = scores.checkpoint(v3_pc_relate_pca_scores.path,
                                   overwrite=args.overwrite,
                                   _read_if_exists=not args.overwrite)
        relatedness_ht = hl.pc_relate(
            qc_mt.GT,
            min_individual_maf=0.01,
            scores_expr=scores[qc_mt.col_key].scores,
            block_size=4096,
            min_kinship=0.05,
            statistics='all')
        relatedness_ht.write(v3_relatedness.path, overwrite=args.overwrite)

    if args.run_pca:
        # QC metrics filters do not exist at this point
        rank_ht = compute_sample_rankings(use_qc_metrics_filters=False)
        rank_ht = rank_ht.checkpoint(pca_samples_rankings.path,
                                     overwrite=args.overwrite,
                                     _read_if_exists=not args.overwrite)
        # TODO: don't localize once hail bug is fixed
        filtered_samples = hl.literal(
            rank_ht.aggregate(
                hl.agg.filter(rank_ht.filtered,
                              hl.agg.collect_as_set(rank_ht.s))))
        samples_to_drop = compute_related_samples_to_drop(
            v3_relatedness.ht(),
            rank_ht,
            args.kin_threshold,
            filtered_samples=filtered_samples)
        # Keep the checkpointed table: `checkpoint` returns a new Table, and
        # discarding it would make `run_pca` recompute the whole pipeline.
        samples_to_drop = samples_to_drop.checkpoint(
            pca_related_samples_to_drop.path,
            overwrite=args.overwrite,
            _read_if_exists=not args.overwrite)
        pop_pca_eigenvalues, pop_pca_scores_ht, pop_pca_loadings_ht = run_pca(
            args.include_unreleasable_samples, args.n_pcs, samples_to_drop)
        pop_pca_scores_ht.write(
            get_ancestry_pca_scores(args.include_unreleasable_samples).path,
            overwrite=args.overwrite)
        pop_pca_loadings_ht.write(
            get_ancestry_pca_loadings(args.include_unreleasable_samples).path,
            overwrite=args.overwrite)
        with hl.utils.hadoop_open(
                get_ancestry_pca_eigenvalues_path(
                    args.include_unreleasable_samples),
                mode='w') as f:
            f.write(",".join(str(x) for x in pop_pca_eigenvalues))

    if args.assign_pops:
        pop_ht, pops_rf_model = assign_pops(args.min_pop_prob,
                                            args.include_unreleasable_samples)
        pop_ht = pop_ht.checkpoint(pop.path,
                                   overwrite=args.overwrite,
                                   _read_if_exists=not args.overwrite)
        # Flatten the first 10 PCA scores into PC1..PC10 columns for the TSV.
        pop_ht.transmute(
            **{f'PC{i + 1}': pop_ht.pca_scores[i]
               for i in range(0, 10)}).export(pop_tsv_path)
        with hl.hadoop_open(pop_rf_path, 'wb') as out:
            pickle.dump(pops_rf_model, out)

    if args.apply_stratified_filters:
        apply_stratified_filters(args.filtering_qc_metrics.split(",")).write(
            stratified_metrics.path, overwrite=args.overwrite)

    if args.apply_regressed_filters:
        apply_regressed_filters(
            args.filtering_qc_metrics.split(","),
            args.include_unreleasable_samples).write(
                regressed_metrics.path, overwrite=args.overwrite)

    if args.compute_related_samples_to_drop:
        rank_ht = compute_sample_rankings(use_qc_metrics_filters=True)
        rank_ht = rank_ht.checkpoint(release_samples_rankings.path,
                                     overwrite=args.overwrite,
                                     _read_if_exists=not args.overwrite)
        # TODO: don't localize once hail bug is fixed
        filtered_samples = hl.literal(
            rank_ht.aggregate(
                hl.agg.filter(rank_ht.filtered,
                              hl.agg.collect_as_set(rank_ht.s))))
        print(filtered_samples)
        samples_to_drop = compute_related_samples_to_drop(
            v3_relatedness.ht(),
            rank_ht,
            args.kin_threshold,
            filtered_samples=filtered_samples)
        samples_to_drop.write(release_related_samples_to_drop.path,
                              overwrite=args.overwrite)

    if args.generate_metadata:
        meta_ht = generate_metadata()
        # Keep the checkpointed table so the aggregation and export below read
        # the written table instead of recomputing the metadata.
        meta_ht = meta_ht.checkpoint(meta.path,
                                     overwrite=args.overwrite,
                                     _read_if_exists=not args.overwrite)
        # Use the shortest scores array so every PC column exists for all rows.
        n_pcs = meta_ht.aggregate(hl.agg.min(hl.len(meta_ht.pca_scores)))
        meta_ht = meta_ht.transmute(
            **{f'PC{i + 1}': meta_ht.pca_scores[i] for i in range(n_pcs)},
            hard_filters=hl.or_missing(
                hl.len(meta_ht.hard_filters) > 0,
                hl.delimit(meta_ht.hard_filters)),
            qc_metrics_filters=hl.or_missing(
                hl.len(meta_ht.qc_metrics_filters) > 0,
                hl.delimit(meta_ht.qc_metrics_filters)))
        meta_ht.flatten().export(meta_tsv_path)
def annotate_sex(
    mt: hl.MatrixTable,
    is_sparse: bool = True,
    excluded_intervals: Optional[hl.Table] = None,
    included_intervals: Optional[hl.Table] = None,
    normalization_contig: str = "chr20",
    sites_ht: Optional[hl.Table] = None,
    aaf_expr: Optional[str] = None,
    gt_expr: str = "GT",
    f_stat_cutoff: float = 0.5,
    aaf_threshold: float = 0.001,
) -> hl.Table:
    """
    Imputes sample sex based on X-chromosome heterozygosity and sex chromosome ploidy.

    Returns Table with the following fields:
        - s (str): Sample
        - chr20_mean_dp (float32): Sample's mean coverage over chromosome 20.
        - chrX_mean_dp (float32): Sample's mean coverage over chromosome X.
        - chrY_mean_dp (float32): Sample's mean coverage over chromosome Y.
        - chrX_ploidy (float32): Sample's imputed ploidy over chromosome X.
        - chrY_ploidy (float32): Sample's imputed ploidy over chromosome Y.
        - f_stat (float64): Sample f-stat. Calculated using hl.impute_sex.
        - n_called (int64): Number of variants with a genotype call. Calculated using hl.impute_sex.
        - expected_homs (float64): Expected number of homozygotes. Calculated using hl.impute_sex.
        - observed_homs (int64): Observed number of homozygotes. Calculated using hl.impute_sex.
        - X_karyotype (str): Sample's chromosome X karyotype.
        - Y_karyotype (str): Sample's chromosome Y karyotype.
        - sex_karyotype (str): Sample's sex karyotype.

    The ploidy cutoffs and the f-stat cutoff used are stored as globals on the returned Table.

    :param mt: Input MatrixTable
    :param bool is_sparse: Whether input MatrixTable is in sparse data format
    :param excluded_intervals: Optional table of intervals to exclude from the computation.
    :param included_intervals: Optional table of intervals to use in the computation. REQUIRED for exomes.
    :param normalization_contig: Which chromosome to use to normalize sex chromosome coverage. Used in determining sex chromosome ploidies.
    :param sites_ht: Optional Table to use. If present, filters input MatrixTable to sites in this Table prior to imputing sex,
        and pulls alternate allele frequency from this Table.
    :param aaf_expr: Optional. Name of field in input MatrixTable with alternate allele frequency.
        Defaults to 'AF' (with a warning) when `sites_ht` is given and this is None.
    :param gt_expr: Name of entry field storing the genotype. Default: 'GT'
    :param f_stat_cutoff: f-stat to roughly divide 'XX' from 'XY' samples. Assumes XX samples are below cutoff and XY are above cutoff.
    :param float aaf_threshold: Minimum alternate allele frequency to be used in f-stat calculations.
    :return: Table of samples and their imputed sex karyotypes.
    :raises NotImplementedError: If `is_sparse` is False.
    """
    logger.info("Imputing sex chromosome ploidies...")
    if is_sparse:
        ploidy_ht = impute_sex_ploidy(mt, excluded_intervals,
                                      included_intervals, normalization_contig)
    else:
        raise NotImplementedError(
            "Imputing sex ploidy does not exist yet for dense data.")

    x_contigs = get_reference_genome(mt.locus).x_contigs
    logger.info("Filtering mt to biallelic SNPs in X contigs: %s", x_contigs)
    # When a `was_split` row field is present, use it to drop rows flagged as
    # split; otherwise fall back to requiring exactly two alleles.
    if "was_split" in list(mt.row):
        mt = mt.filter_rows((~mt.was_split)
                            & hl.is_snp(mt.alleles[0], mt.alleles[1]))
    else:
        mt = mt.filter_rows((hl.len(mt.alleles) == 2)
                            & hl.is_snp(mt.alleles[0], mt.alleles[1]))

    mt = hl.filter_intervals(
        mt, [hl.parse_locus_interval(contig) for contig in x_contigs])

    if sites_ht is not None:
        if aaf_expr is None:
            logger.warning(
                "sites_ht was provided, but aaf_expr is missing. Assuming name of field with alternate allele frequency is 'AF'."
            )
            aaf_expr = "AF"
        logger.info("Filtering to provided sites")
        # Pull every field of `sites_ht` onto the rows, then keep only rows
        # where the allele frequency field is defined.
        mt = mt.annotate_rows(**sites_ht[mt.row_key])
        mt = mt.filter_rows(hl.is_defined(mt[aaf_expr]))

    logger.info("Calculating inbreeding coefficient on chrX")
    # The same cutoff is used for both thresholds of hl.impute_sex.
    sex_ht = hl.impute_sex(
        mt[gt_expr],
        aaf_threshold=aaf_threshold,
        male_threshold=f_stat_cutoff,
        female_threshold=f_stat_cutoff,
        aaf=aaf_expr,
    )

    logger.info("Annotating sex ht with sex chromosome ploidies")
    sex_ht = sex_ht.annotate(**ploidy_ht[sex_ht.key])

    logger.info("Inferring sex karyotypes")
    x_ploidy_cutoffs, y_ploidy_cutoffs = get_ploidy_cutoffs(
        sex_ht, f_stat_cutoff)
    # Record the cutoffs used so the karyotype calls can be traced back.
    sex_ht = sex_ht.annotate_globals(
        x_ploidy_cutoffs=hl.struct(
            upper_cutoff_X=x_ploidy_cutoffs[0],
            lower_cutoff_XX=x_ploidy_cutoffs[1][0],
            upper_cutoff_XX=x_ploidy_cutoffs[1][1],
            lower_cutoff_XXX=x_ploidy_cutoffs[2],
        ),
        y_ploidy_cutoffs=hl.struct(
            lower_cutoff_Y=y_ploidy_cutoffs[0][0],
            upper_cutoff_Y=y_ploidy_cutoffs[0][1],
            lower_cutoff_YY=y_ploidy_cutoffs[1],
        ),
        f_stat_cutoff=f_stat_cutoff,
    )
    return sex_ht.annotate(
        **get_sex_expr(sex_ht.chrX_ploidy, sex_ht.chrY_ploidy,
                       x_ploidy_cutoffs, y_ploidy_cutoffs))
def annotate_sex(
    mtds: Union[hl.MatrixTable, hl.vds.VariantDataset],
    is_sparse: bool = True,
    excluded_intervals: Optional[hl.Table] = None,
    included_intervals: Optional[hl.Table] = None,
    normalization_contig: str = "chr20",
    sites_ht: Optional[hl.Table] = None,
    aaf_expr: Optional[str] = None,
    gt_expr: str = "GT",
    f_stat_cutoff: float = 0.5,
    aaf_threshold: float = 0.001,
    variants_only_x_ploidy: bool = False,
    variants_only_y_ploidy: bool = False,
) -> hl.Table:
    """
    Impute sample sex based on X-chromosome heterozygosity and sex chromosome ploidy.

    Return Table with the following fields:
        - s (str): Sample
        - `normalization_contig`_mean_dp (float32): Sample's mean coverage over the specified `normalization_contig`.
        - chrX_mean_dp (float32): Sample's mean coverage over chromosome X.
        - chrY_mean_dp (float32): Sample's mean coverage over chromosome Y.
        - chrX_ploidy (float32): Sample's imputed ploidy over chromosome X.
        - chrY_ploidy (float32): Sample's imputed ploidy over chromosome Y.
        - f_stat (float64): Sample f-stat. Calculated using hl.impute_sex.
        - n_called (int64): Number of variants with a genotype call. Calculated using hl.impute_sex.
        - expected_homs (float64): Expected number of homozygotes. Calculated using hl.impute_sex.
        - observed_homs (int64): Observed number of homozygotes. Calculated using hl.impute_sex.
        - X_karyotype (str): Sample's chromosome X karyotype.
        - Y_karyotype (str): Sample's chromosome Y karyotype.
        - sex_karyotype (str): Sample's sex karyotype.

    When a ploidy estimate is computed from variant data only, the matching normalization contig depth is stored
    as var_data_`normalization_contig`_mean_dp. The ploidy cutoffs, the f-stat cutoff and both `variants_only_*`
    flags are stored as globals on the returned Table.

    :param mtds: Input MatrixTable or VariantDataset
    :param bool is_sparse: Whether input MatrixTable is in sparse data format
    :param excluded_intervals: Optional table of intervals to exclude from the computation. Not supported for a VDS.
    :param included_intervals: Optional table of intervals to use in the computation. REQUIRED for exomes.
    :param normalization_contig: Which chromosome to use to normalize sex chromosome coverage. Used in determining sex chromosome ploidies.
    :param sites_ht: Optional Table to use. If present, filters input MatrixTable to sites in this Table prior to imputing sex,
        and pulls alternate allele frequency from this Table.
    :param aaf_expr: Optional. Name of field in input MatrixTable with alternate allele frequency.
        Defaults to 'AF' (with a warning) when `sites_ht` is given and this is None.
    :param gt_expr: Name of entry field storing the genotype. Default: 'GT'
    :param f_stat_cutoff: f-stat to roughly divide 'XX' from 'XY' samples. Assumes XX samples are below cutoff and XY are above cutoff.
    :param float aaf_threshold: Minimum alternate allele frequency to be used in f-stat calculations.
    :param variants_only_x_ploidy: Whether to use depth of only variant data for the x ploidy estimation.
    :param variants_only_y_ploidy: Whether to use depth of only variant data for the y ploidy estimation.
    :return: Table of samples and their imputed sex karyotypes.
    :raises NotImplementedError: If `excluded_intervals` is given with a VDS, or if a MatrixTable is passed with
        `is_sparse` set to False.
    """
    logger.info("Imputing sex chromosome ploidies...")

    is_vds = isinstance(mtds, hl.vds.VariantDataset)
    if is_vds:
        if excluded_intervals is not None:
            raise NotImplementedError(
                "The use of the parameter 'excluded_intervals' is currently not implemented for imputing sex chromosome ploidy on a VDS!"
            )
        # Begin by creating a ploidy estimate HT using the method defined by 'variants_only_x_ploidy'
        ploidy_ht = hl.vds.impute_sex_chromosome_ploidy(
            mtds,
            calling_intervals=included_intervals,
            normalization_contig=normalization_contig,
            use_variant_dataset=variants_only_x_ploidy,
        )
        ploidy_ht = ploidy_ht.rename({
            "x_ploidy": "chrX_ploidy",
            "y_ploidy": "chrY_ploidy",
            "x_mean_dp": "chrX_mean_dp",
            "y_mean_dp": "chrY_mean_dp",
            "autosomal_mean_dp": f"var_data_{normalization_contig}_mean_dp"
            if variants_only_x_ploidy
            else f"{normalization_contig}_mean_dp",
        })
        # If 'variants_only_y_ploidy' is different from 'variants_only_x_ploidy' then re-run the ploidy estimation
        # using the method defined by 'variants_only_y_ploidy' and re-annotate with the modified ploidy estimates.
        if variants_only_y_ploidy != variants_only_x_ploidy:
            y_ploidy_ht = hl.vds.impute_sex_chromosome_ploidy(
                mtds,
                calling_intervals=included_intervals,
                normalization_contig=normalization_contig,
                use_variant_dataset=variants_only_y_ploidy,
            )
            y_ploidy_idx = y_ploidy_ht[ploidy_ht.key]
            ploidy_ht = ploidy_ht.annotate(
                chrY_ploidy=y_ploidy_idx.y_ploidy,
                chrY_mean_dp=y_ploidy_idx.y_mean_dp,
            )
            # If 'variants_only_y_ploidy' is True, store the Y run's normalization contig mean DP under the
            # variant-data-only name (this will have already been added if 'variants_only_x_ploidy' was also True).
            if variants_only_y_ploidy:
                ploidy_ht = ploidy_ht.annotate(
                    **{
                        f"var_data_{normalization_contig}_mean_dp":
                        y_ploidy_idx.autosomal_mean_dp
                    })
        mt = mtds.variant_data
    else:
        mt = mtds
        if is_sparse:
            ploidy_ht = impute_sex_ploidy(
                mt,
                excluded_intervals,
                included_intervals,
                normalization_contig,
                use_only_variants=variants_only_x_ploidy,
            )
            # NOTE(review): this rename expects an `autosomal_mean_dp` field, while the Y-ploidy select below
            # expects `{normalization_contig}_mean_dp` from the same helper - confirm which name
            # `impute_sex_ploidy` actually emits.
            ploidy_ht = ploidy_ht.rename({
                "autosomal_mean_dp": f"var_data_{normalization_contig}_mean_dp"
                if variants_only_x_ploidy
                else f"{normalization_contig}_mean_dp",
            })
            # If 'variants_only_y_ploidy' is different from 'variants_only_x_ploidy' then re-run the ploidy
            # estimation using the method defined by 'variants_only_y_ploidy' and re-annotate with the modified
            # ploidy estimates.
            if variants_only_y_ploidy != variants_only_x_ploidy:
                y_ploidy_ht = impute_sex_ploidy(
                    mt,
                    excluded_intervals,
                    included_intervals,
                    normalization_contig,
                    use_only_variants=variants_only_y_ploidy,
                )
                # Hail Tables are immutable, so the result of `select` must be kept. Without it, the annotate
                # below would also overwrite chrX_ploidy / chrX_mean_dp with the Y-method values.
                y_ploidy_ht = y_ploidy_ht.select(
                    "chrY_ploidy",
                    "chrY_mean_dp",
                    f"{normalization_contig}_mean_dp",
                )
                # If 'variants_only_y_ploidy' is True, the Y run's normalization contig mean DP is the
                # variant-data-only value, so it is the Y table's field that takes the 'var_data_' name
                # (mirrors the VDS branch above).
                if variants_only_y_ploidy:
                    y_ploidy_ht = y_ploidy_ht.rename({
                        f"{normalization_contig}_mean_dp":
                        f"var_data_{normalization_contig}_mean_dp"
                    })
                # Re-annotate the ploidy HT with modified Y ploidy annotations
                ploidy_ht = ploidy_ht.annotate(**y_ploidy_ht[ploidy_ht.key])
        else:
            raise NotImplementedError(
                "Imputing sex ploidy does not exist yet for dense data.")

    x_contigs = get_reference_genome(mt.locus).x_contigs
    logger.info("Filtering mt to biallelic SNPs in X contigs: %s", x_contigs)
    # When a `was_split` row field is present, use it to drop rows flagged as
    # split; otherwise fall back to requiring exactly two alleles.
    if "was_split" in list(mt.row):
        mt = mt.filter_rows((~mt.was_split)
                            & hl.is_snp(mt.alleles[0], mt.alleles[1]))
    else:
        mt = mt.filter_rows((hl.len(mt.alleles) == 2)
                            & hl.is_snp(mt.alleles[0], mt.alleles[1]))

    # Parse the X contig intervals against the reference genome of the data itself.
    build = get_reference_genome(mt.locus).name
    mt = hl.filter_intervals(
        mt,
        [
            hl.parse_locus_interval(contig, reference_genome=build)
            for contig in x_contigs
        ],
        keep=True,
    )

    if sites_ht is not None:
        if aaf_expr is None:
            logger.warning(
                "sites_ht was provided, but aaf_expr is missing. Assuming name of field with alternate allele frequency is 'AF'."
            )
            aaf_expr = "AF"
        logger.info("Filtering to provided sites")
        # Pull every field of `sites_ht` onto the rows, then keep only rows
        # where the allele frequency field is defined.
        mt = mt.annotate_rows(**sites_ht[mt.row_key])
        mt = mt.filter_rows(hl.is_defined(mt[aaf_expr]))

    logger.info("Calculating inbreeding coefficient on chrX")
    # The same cutoff is used for both thresholds of hl.impute_sex.
    sex_ht = hl.impute_sex(
        mt[gt_expr],
        aaf_threshold=aaf_threshold,
        male_threshold=f_stat_cutoff,
        female_threshold=f_stat_cutoff,
        aaf=aaf_expr,
    )

    logger.info("Annotating sex ht with sex chromosome ploidies")
    sex_ht = sex_ht.annotate(**ploidy_ht[sex_ht.key])

    logger.info("Inferring sex karyotypes")
    x_ploidy_cutoffs, y_ploidy_cutoffs = get_ploidy_cutoffs(
        sex_ht, f_stat_cutoff)
    # Record the cutoffs and estimation methods used so the calls can be traced back.
    sex_ht = sex_ht.annotate_globals(
        x_ploidy_cutoffs=hl.struct(
            upper_cutoff_X=x_ploidy_cutoffs[0],
            lower_cutoff_XX=x_ploidy_cutoffs[1][0],
            upper_cutoff_XX=x_ploidy_cutoffs[1][1],
            lower_cutoff_XXX=x_ploidy_cutoffs[2],
        ),
        y_ploidy_cutoffs=hl.struct(
            lower_cutoff_Y=y_ploidy_cutoffs[0][0],
            upper_cutoff_Y=y_ploidy_cutoffs[0][1],
            lower_cutoff_YY=y_ploidy_cutoffs[1],
        ),
        f_stat_cutoff=f_stat_cutoff,
        variants_only_x_ploidy=variants_only_x_ploidy,
        variants_only_y_ploidy=variants_only_y_ploidy,
    )
    return sex_ht.annotate(
        **get_sex_expr(sex_ht.chrX_ploidy, sex_ht.chrY_ploidy,
                       x_ploidy_cutoffs, y_ploidy_cutoffs))