def test_table_filter_intervals(self):
    ds = hl.import_vcf(resource('sample.vcf'), min_partitions=20).rows()

    self.assertEqual(
        hl.filter_intervals(
            ds,
            [hl.parse_locus_interval('20:10639222-10644705')]).count(),
        3)

    intervals = [
        hl.parse_locus_interval('20:10639222-10644700'),
        hl.parse_locus_interval('20:10644700-10644705')
    ]
    self.assertEqual(hl.filter_intervals(ds, intervals).count(), 3)

    intervals = hl.array([
        hl.parse_locus_interval('20:10639222-10644700'),
        hl.parse_locus_interval('20:10644700-10644705')
    ])
    self.assertEqual(hl.filter_intervals(ds, intervals).count(), 3)

    intervals = hl.array([
        hl.eval(hl.parse_locus_interval('20:10639222-10644700')),
        hl.parse_locus_interval('20:10644700-10644705')
    ])
    self.assertEqual(hl.filter_intervals(ds, intervals).count(), 3)

    intervals = [
        hl.eval(hl.parse_locus_interval('[20:10019093-10026348]')),
        hl.eval(hl.parse_locus_interval('[20:17705793-17716416]'))
    ]
    self.assertEqual(hl.filter_intervals(ds, intervals).count(), 4)
def _parameterized_filter_intervals(vds: 'VariantDataset', intervals,
                                    keep: bool, mode: str) -> 'VariantDataset':
    intervals_table = None
    if isinstance(intervals, Table):
        expected = hl.tinterval(hl.tlocus(vds.reference_genome))
        if len(intervals.key) != 1 or intervals.key[0].dtype != hl.tinterval(
                hl.tlocus(vds.reference_genome)):
            raise ValueError(
                f"'filter_intervals': expect a table with a single key of type {expected}; "
                f"found {list(intervals.key.dtype.values())}")
        intervals_table = intervals
        intervals = intervals.aggregate(hl.agg.collect(intervals.key[0]))

    if mode == 'variants_only':
        variant_data = hl.filter_intervals(vds.variant_data, intervals, keep)
        return VariantDataset(vds.reference_data, variant_data)
    if mode == 'split_at_boundaries':
        if not keep:
            raise ValueError(
                "filter_intervals mode 'split_at_boundaries' not implemented for keep=False"
            )
        par_intervals = intervals_table or hl.Table.parallelize(
            intervals.map(lambda x: hl.struct(interval=x)),
            schema=hl.tstruct(interval=intervals.dtype.element_type),
            key='interval')
        ref = segment_reference_blocks(vds.reference_data, par_intervals).drop(
            'interval_end', list(par_intervals.key)[0])
        return VariantDataset(
            ref, hl.filter_intervals(vds.variant_data, intervals, keep))

    return VariantDataset(
        hl.filter_intervals(vds.reference_data, intervals, keep),
        hl.filter_intervals(vds.variant_data, intervals, keep))
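# A minimal sketch of exercising the mode argument of the helper above; the VDS
# path is hypothetical and the interval string is borrowed from the tests later
# in this section. The helper is normally reached through a public wrapper, but
# calling it directly illustrates the three behaviors.
import hail as hl

vds = hl.vds.read_vds('gs://my-bucket/cohort.vds')  # hypothetical input
intervals = hl.array([hl.parse_locus_interval('chr22:10514784-10517000',
                                              reference_genome='GRCh38')])

# 'variants_only' filters variant data and leaves reference data untouched;
# 'split_at_boundaries' additionally segments reference blocks at interval
# boundaries (keep=True only); any other mode filters both components.
filtered = _parameterized_filter_intervals(vds, intervals, keep=True,
                                           mode='variants_only')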
def main(args):
    hl.init(master=f'local[{args.n_threads}]',
            log=hl.utils.timestamp_path(os.path.join(tempfile.gettempdir(), 'extract_vcf'),
                                        suffix='.log'),
            default_reference=args.reference)
    sys.path.append('/')
    add_args = []
    if args.additional_args is not None:
        add_args = args.additional_args.split(',')
    load_module = importlib.import_module(args.load_module)
    mt = getattr(load_module, args.load_mt_function)(*add_args)

    if args.gene_map_ht_path is None:
        interval = [hl.parse_locus_interval(args.interval)]
    else:
        gene_ht = hl.read_table(args.gene_map_ht_path)
        if args.gene is not None:
            gene_ht = gene_ht.filter(gene_ht.gene_symbol == args.gene)
            interval = gene_ht.aggregate(hl.agg.take(gene_ht.interval, 1), _localize=False)
        else:
            interval = [hl.parse_locus_interval(args.interval)]
            gene_ht = hl.filter_intervals(gene_ht, interval)

        gene_ht = gene_ht.filter(hl.set(args.groups.split(',')).contains(gene_ht.annotation))
        gene_ht.select(group=gene_ht.gene_id + '_' + gene_ht.gene_symbol + '_' + gene_ht.annotation,
                       variant=hl.delimit(gene_ht.variants, '\t')
                       ).key_by().drop('start').export(args.group_output_file, header=False)
        # TODO: possible minor optimization: filter output VCF to only variants in `gene_ht.variants`

    if not args.no_adj:
        mt = mt.filter_entries(mt.adj)

    mt = hl.filter_intervals(mt, interval)

    if not args.input_bgen:
        mt = mt.select_entries('GT')
        mt = mt.filter_rows(hl.agg.count_where(mt.GT.is_non_ref()) > 0)
    mt = mt.annotate_rows(rsid=mt.locus.contig + ':' + hl.str(mt.locus.position) + '_' +
                          mt.alleles[0] + '/' + mt.alleles[1])

    if args.callrate_filter:
        mt = mt.filter_rows(hl.agg.fraction(hl.is_defined(mt.GT)) >= args.callrate_filter)

    if args.export_bgen:
        if not args.input_bgen:
            mt = mt.annotate_entries(GT=hl.if_else(mt.GT.is_haploid(),
                                                   hl.call(mt.GT[0], mt.GT[0]), mt.GT))
            mt = gt_to_gp(mt)
            mt = impute_missing_gp(mt, mean_impute=args.mean_impute_missing)
        hl.export_bgen(mt, args.output_file, gp=mt.GP, varid=mt.rsid)
    else:
        mt = mt.annotate_entries(GT=hl.or_else(mt.GT, hl.call(0, 0)))  # Note: no mean-imputation for VCF
        hl.export_vcf(mt, args.output_file)
def run_mendel_errors() -> hl.Table:
    meta_ht = meta.ht()
    ped = pedigree.versions[f"{CURRENT_RELEASE}_raw"].pedigree()
    logger.info(f"Running Mendel errors for {len(ped.trios)} trios.")

    fake_ped = create_fake_pedigree(
        n=100,
        sample_list=list(
            meta_ht.aggregate(
                hl.agg.filter(
                    hl.rand_bool(0.01)
                    & ((hl.len(meta_ht.qc_metrics_filters) == 0)
                       & hl.or_else(hl.len(meta_ht.hard_filters) == 0, False)),
                    hl.agg.collect_as_set(meta_ht.s),
                ))),
        real_pedigree=ped,
    )
    merged_ped = hl.Pedigree(trios=ped.trios + fake_ped.trios)

    ped_samples = hl.literal(
        set([
            s for trio in merged_ped.trios
            for s in [trio.s, trio.pat_id, trio.mat_id]
        ]))
    mt = get_gnomad_v3_mt(key_by_locus_and_alleles=True)
    mt = mt.filter_cols(ped_samples.contains(mt.s))
    mt = hl.filter_intervals(
        mt, [hl.parse_locus_interval("chr20", reference_genome='GRCh38')])
    mt = hl.experimental.sparse_split_multi(mt, filter_changed_loci=True)
    mt = mt.select_entries("GT", "END")
    mt = hl.experimental.densify(mt)
    mt = mt.filter_rows(hl.len(mt.alleles) == 2)

    mendel_errors, _, _, _ = hl.mendel_errors(mt["GT"], merged_ped)
    return mendel_errors
def annotate_sex(mt: hl.MatrixTable, out_internal_mt_prefix: str,
                 male_threshold: float = 0.8,
                 female_threshold: float = 0.5) -> hl.MatrixTable:
    """
    Imputes sex, exports data, and annotates mt with this data.

    NOTE: Evaluated in R (plots) and decided on cutoff of F<0.5 for females and F>0.8 for males (default) for genomes

    :param MatrixTable mt: MT containing samples to be ascertained for sex
    :param str out_internal_mt_prefix: file path prefix for tsv containing samples and sex imputation annotations
    :param float male_threshold: F-stat above which a sample is called male
    :param float female_threshold: F-stat below which a sample is called female
    :return: MatrixTable with imputed sex stashed in column annotations 'f_stat' and 'is_female'
    :rtype: MatrixTable
    """
    mt1 = hl.filter_intervals(mt, [hl.parse_locus_interval('chrX')])
    #mt = mt.filter_rows(mt.locus.in_x_nonpar())
    mtx_unphased = mt1.select_entries(
        GT=hl.unphased_diploid_gt_index_call(mt1.GT.n_alt_alleles()))
    #imputed_sex = hl.impute_sex(mtx_unphased.GT)
    sex_ht = hl.impute_sex(mtx_unphased.GT,
                           aaf_threshold=0.05,
                           female_threshold=female_threshold,
                           male_threshold=male_threshold)
    sex_ht.export(out_internal_mt_prefix + '.sex_check.txt.bgz')
    sex_colnames = ['f_stat', 'is_female']
    sex_ht = sex_ht.select(*sex_colnames)
    mt = mt.annotate_cols(**sex_ht[mt.col_key])
    return mt
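# Hedged usage sketch for annotate_sex above; the MatrixTable path and output
# prefix are hypothetical, and a GT entry field on chrX is assumed.
import hail as hl

mt = hl.read_matrix_table('gs://my-bucket/genomes.mt')  # hypothetical input
mt = annotate_sex(mt, 'gs://my-bucket/sex_imputation')
mt.cols().select('f_stat', 'is_female').show()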
def main(args):
    hl.init(log='/liftover.log')

    if args.gnomad:
        gnomad = True
        path = None

        if args.exomes:
            data_type = 'exomes'
        if args.genomes:
            data_type = 'genomes'

        logger.info('Working on gnomAD {} release ht'.format(data_type))
        logger.info('Reading in release ht')
        t = public_release(data_type).ht()
        logger.info('Variants in release ht: {}'.format(t.count()))

    else:
        data_type = None
        gnomad = False

        if args.ht:
            path = args.ht
            t = hl.read_table(args.ht)
        if args.mt:
            path = args.mt
            t = hl.read_matrix_table(args.mt)

        logger.info('Checking if input data has been split')
        if 'was_split' not in t.row:
            t = hl.split_multi(t) if isinstance(t, hl.Table) else hl.split_multi_hts(t)

    logger.info('Preparing reference genomes for liftover')
    source, target = get_liftover_genome(t)

    if args.test:
        logger.info('Filtering to chr21 for testing')
        if source.name == 'GRCh38':
            contig = 'chr21'
        else:
            contig = '21'
        t = hl.filter_intervals(
            t, [hl.parse_locus_interval(contig, reference_genome=source.name)])

    logger.info(f'Lifting data to {target.name}')
    t = lift_data(t, gnomad, data_type, path, target, args.overwrite)

    logger.info('Checking SNPs for reference mismatches')
    t = annotate_snp_mismatch(t, data_type, target)
    mismatch = check_mismatch(t) if isinstance(t, hl.Table) else check_mismatch(t.rows())
    logger.info('{} total SNPs'.format(mismatch['total_variants']))
    logger.info('{} SNPs on minus strand'.format(mismatch['negative_strand']))
    logger.info('{} reference mismatches in SNPs'.format(mismatch['total_mismatch']))
    logger.info('{} mismatches on minus strand'.format(mismatch['negative_strand_mismatch']))
def genotype_correlation(chr_list):
    """
    Get classic LD correlation matrix from genotypes of white British, using
    variant_set variants.
    """
    starttime = datetime.datetime.now()
    gt1 = hl.read_matrix_table('gs://nbaya/split/ukb31063.' + variant_set +
                               '_variants.gwas_samples_repart.mt')
    print(gt1.count())
    print(gt1.describe())
    for ch in chr_list:
        mt_chr = hl.filter_intervals(gt1, [hl.parse_locus_interval(str(ch))])
        print(mt_chr.count_rows())
        print(mt_chr.describe())
        ld = hl.ld_matrix(mt_chr.dosage, mt_chr.locus, radius=3e7)
        ld_sparse = ld.sparsify_band(lower=1, upper=1000)
        ld_sparse.write('gs://nbaya/sumstats_corr/hm3_gt_correlation_chr' +
                        str(ch) + '.bm', overwrite=True)
    endtime = datetime.datetime.now()
    elapsed = endtime - starttime
    print('\n####################')
    print('Iteration time: ' + str(round(elapsed.seconds / 60, 2)) + ' minutes')
    print('####################')
def sumstats_correlation(chr_list):
    """
    Get correlation matrix from GWAS summary-statistic betas of white British,
    using variant_set variants.
    """
    starttime = datetime.datetime.now()
    mt0 = hl.read_matrix_table(
        'gs://phenotype_31063/hail/gwas.imputed_v3.both_sexes.annotated.mt')
    mt1 = mt0.filter_rows(hl.is_defined(
        variants[mt0.locus, mt0.alleles]))  # filter to variant_set variants
    mt1.describe()
    for ch in chr_list:
        mt_chr = hl.filter_intervals(mt1, [hl.parse_locus_interval(str(ch))])
        print(mt_chr.count_rows())
        ld = hl.ld_matrix(mt_chr.beta, mt_chr.locus, radius=3e7)
        ld_sparse = ld.sparsify_band(lower=1, upper=1000)
        ld_sparse.write('gs://nbaya/sumstats_corr/hm3_ss_correlation_chr' +
                        str(ch) + '.bm', overwrite=True)
    endtime = datetime.datetime.now()
    elapsed = endtime - starttime
    print('\n####################')
    print('Iteration time: ' + str(round(elapsed.seconds / 60, 2)) + ' minutes')
    print('####################')
def pre_process_subset_freq(subset: str, global_ht: hl.Table,
                            test: bool = False) -> hl.Table:
    """
    Prepare subset frequency Table by filling in missing frequency fields for loci present only in the global cohort.

    .. note::

        The resulting final `freq` array will be as long as the subset `freq_meta` global (i.e., one `freq` entry for each `freq_meta` entry)

    :param subset: subset ID
    :param global_ht: Hail Table containing all variants discovered in the overall release cohort
    :param test: If True, filter to small region on chr20
    :return: Table containing subset frequencies with missing freq structs filled in
    """
    # Read in subset HTs
    subset_ht_path = get_freq(subset=subset).path
    subset_chr20_ht_path = qc_temp_prefix() + f"chr20_test_freq.{subset}.ht"

    if test:
        if file_exists(subset_chr20_ht_path):
            logger.info(
                "Loading chr20 %s subset frequency data for testing: %s",
                subset,
                subset_chr20_ht_path,
            )
            subset_ht = hl.read_table(subset_chr20_ht_path)
        elif file_exists(subset_ht_path):
            logger.info(
                "Loading %s subset frequency data for testing: %s",
                subset,
                subset_ht_path,
            )
            subset_ht = hl.read_table(subset_ht_path)
            subset_ht = hl.filter_intervals(
                subset_ht, [hl.parse_locus_interval("chr20:1-1000000")])
    elif file_exists(subset_ht_path):
        logger.info("Loading %s subset frequency data: %s", subset, subset_ht_path)
        subset_ht = hl.read_table(subset_ht_path)
    else:
        raise DataException(
            f"Hail Table containing {subset} subset frequencies not found. You may need to run the script generate_freq_data.py to generate frequency annotations first."
        )

    # Fill in missing freq structs
    ht = subset_ht.join(global_ht.select().select_globals(), how="right")
    ht = ht.annotate(freq=hl.if_else(
        hl.is_missing(ht.freq),
        hl.map(lambda x: missing_callstats_expr(),
               hl.range(hl.len(ht.freq_meta))),
        ht.freq,
    ))

    return ht
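# A minimal sketch of driving pre_process_subset_freq above; the subset name
# and the global-release Table path are hypothetical.
import hail as hl

global_ht = hl.read_table('gs://my-bucket/release_sites.ht')  # hypothetical input
subset_freq_ht = pre_process_subset_freq('my_subset', global_ht, test=True)
subset_freq_ht.describe()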
def test_filter_intervals_compound_partition_key(self):
    ds = hl.import_vcf(resource('sample.vcf'), min_partitions=20)
    ds = (ds.annotate_rows(variant=hl.struct(locus=ds.locus, alleles=ds.alleles))
          .key_rows_by('locus', 'alleles'))

    intervals = [hl.Interval(hl.Struct(locus=hl.Locus('20', 10639222), alleles=['A', 'T']),
                             hl.Struct(locus=hl.Locus('20', 10644700), alleles=['A', 'T']))]
    self.assertEqual(hl.filter_intervals(ds, intervals).count_rows(), 3)
def test_filter_intervals_compound_key(self):
    ds = hl.import_vcf(resource('sample.vcf'), min_partitions=20)
    ds = (ds.annotate_rows(variant=hl.struct(locus=ds.locus, alleles=ds.alleles))
          .key_rows_by('locus', 'alleles'))

    intervals = [hl.Interval(hl.Struct(locus=hl.Locus('20', 10639222), alleles=['A', 'T']),
                             hl.Struct(locus=hl.Locus('20', 10644700), alleles=['A', 'T']))]
    self.assertEqual(hl.filter_intervals(ds, intervals).count_rows(), 3)
def main(args):
    hl.init(log='/frequency_data_generation.log', default_reference='GRCh38')

    logger.info("Reading sparse MT and metadata table...")
    mt = get_gnomad_v3_mt(key_by_locus_and_alleles=True)
    meta_ht = meta.ht().select('pop', 'sex', 'project_id', 'release', 'sample_filters')

    if args.test:
        logger.info("Filtering to chr20:1-1000000")
        mt = hl.filter_intervals(mt, [hl.parse_locus_interval('chr20:1-1000000')])

    mt = hl.experimental.sparse_split_multi(mt, filter_changed_loci=True)

    logger.info("Annotating sparse MT with metadata...")
    mt = mt.annotate_cols(meta=meta_ht[mt.s])
    mt = mt.filter_cols(mt.meta.release)
    samples = mt.count_cols()
    logger.info(f"Running frequency table prep and generation pipeline on {samples} samples")

    logger.info("Computing adj and sex adjusted genotypes.")
    mt = mt.annotate_entries(
        GT=adjusted_sex_ploidy_expr(mt.locus, mt.GT, mt.meta.sex),
        adj=get_adj_expr(mt.GT, mt.GQ, mt.DP, mt.AD)
    )

    logger.info("Densify-ing...")
    mt = hl.experimental.densify(mt)
    mt = mt.filter_rows(hl.len(mt.alleles) > 1)

    logger.info("Generating frequency data...")
    mt = annotate_freq(
        mt,
        sex_expr=mt.meta.sex,
        pop_expr=mt.meta.pop
    )

    # Select freq, FAF and popmax
    faf, faf_meta = faf_expr(mt.freq, mt.freq_meta, mt.locus, POPS_TO_REMOVE_FOR_POPMAX)
    mt = mt.select_rows(
        'freq',
        faf=faf,
        popmax=pop_max_expr(mt.freq, mt.freq_meta, POPS_TO_REMOVE_FOR_POPMAX)
    )
    mt = mt.annotate_globals(faf_meta=faf_meta)

    # Annotate quality metrics histograms, as these also require densifying
    mt = mt.annotate_rows(
        **qual_hist_expr(mt.GT, mt.GQ, mt.DP, mt.AD)
    )

    logger.info("Writing out frequency data...")
    if args.test:
        mt.rows().write("gs://gnomad-tmp/gnomad_freq/chr20_1_1000000_freq.ht", overwrite=True)
    else:
        mt.rows().write(freq.path, overwrite=args.overwrite)
def filter_to_autosomes(t):
    """
    Filters the Table or MatrixTable to autosomes only.
    This assumes that the input contains a field named `locus` of type Locus

    :param MatrixTable or Table t: Input MT/HT
    :return: MT/HT autosomes
    :rtype: MatrixTable or Table
    """
    reference = get_reference_genome(t.locus)
    autosomes = hl.parse_locus_interval(
        f'{reference.contigs[0]}-{reference.contigs[21]}',
        reference_genome=reference)
    return hl.filter_intervals(t, [autosomes])
def get_chr_dp_ann(chrom: str) -> hl.Table:
    """
    Compute the mean depth of the specified chromosome.

    The total depth will be determined using the sum DP of either reference and
    variant data or only variant data depending on the value of
    `use_only_variants` in the outer function.

    If `use_only_variants` is set to False then this value is computed using the
    median block coverage (summed over the block size). If `use_only_variants`
    is set to True, this value is computed using the sum of DP for all variants
    divided by the total number of variants.

    The depth calculations will be determined using only non-PAR regions if the
    contig is an X or Y reference contig, and using the intervals specified by
    `included_calling_intervals` and excluding intervals specified by
    `excluded_calling_intervals` if either is defined in the outer function
    (when `use_only_variants` is not set this only applies to the contig size
    estimate and is not used when computing chromosome depth).

    :param chrom: Chromosome to compute the mean depth of
    :return: Table of a per sample mean depth of `chrom`
    """
    contig_size = get_contig_size(chrom)
    chr_mt = hl.filter_intervals(mt, [hl.parse_locus_interval(chrom)])

    if chrom in ref.x_contigs:
        chr_mt = chr_mt.filter_rows(chr_mt.locus.in_x_nonpar())
    if chrom in ref.y_contigs:
        chr_mt = chr_mt.filter_rows(chr_mt.locus.in_y_nonpar())

    if use_only_variants:
        if included_calling_intervals is not None:
            chr_mt = chr_mt.filter_rows(
                hl.is_defined(included_calling_intervals[chr_mt.row_key]))
        if excluded_calling_intervals is not None:
            chr_mt = chr_mt.filter_rows(
                hl.is_missing(excluded_calling_intervals[chr_mt.row_key]))
        return chr_mt.select_cols(
            **{
                f"{chrom}_mean_dp":
                hl.agg.filter(
                    chr_mt.LGT.is_non_ref(),
                    hl.agg.sum(chr_mt.DP),
                ) / hl.agg.filter(chr_mt.LGT.is_non_ref(), hl.agg.count())
            }).cols()
    else:
        return chr_mt.select_cols(
            **{
                f"{chrom}_mean_dp":
                hl.agg.sum(
                    hl.if_else(
                        chr_mt.LGT.is_hom_ref(),
                        chr_mt.DP * (1 + chr_mt.END - chr_mt.locus.position),
                        chr_mt.DP,
                    )) / contig_size
            }).cols()
def densify_sites(
    mt: hl.MatrixTable,
    sites_ht: hl.Table,
    last_END_positions_ht: hl.Table,
    semi_join_rows: bool = True,
) -> hl.MatrixTable:
    """
    Creates a dense version of the input sparse MT at the sites in `sites_ht`, reading the minimal amount of data required.

    Note that only rows that appear both in `mt` and `sites_ht` are returned.

    :param mt: Input sparse MT
    :param sites_ht: Desired sites to densify
    :param last_END_positions_ht: Table storing positions of the furthest ref block (END tag)
    :param semi_join_rows: Whether to filter the MT rows based on semi-join (default, better if sites_ht is large) or based on filter_intervals (better if sites_ht only contains a few sites)
    :return: Dense MT filtered to the sites in `sites_ht`
    """
    logger.info("Computing intervals to densify from sites Table.")
    sites_ht = sites_ht.key_by("locus")
    sites_ht = sites_ht.annotate(
        interval=hl.locus_interval(
            sites_ht.locus.contig,
            last_END_positions_ht[sites_ht.key].last_END_position,
            end=sites_ht.locus.position,
            includes_end=True,
            reference_genome=sites_ht.locus.dtype.reference_genome,
        )
    )
    sites_ht = sites_ht.filter(hl.is_defined(sites_ht.interval))

    if semi_join_rows:
        mt = mt.filter_rows(hl.is_defined(sites_ht.key_by("interval")[mt.locus]))
    else:
        logger.info("Collecting intervals to densify.")
        intervals = sites_ht.interval.collect()
        print(
            "Found {0} intervals, totalling {1} bp in the dense Matrix.".format(
                len(intervals),
                sum(
                    [
                        interval_length(interval)
                        for interval in union_intervals(intervals)
                    ]
                ),
            )
        )
        mt = hl.filter_intervals(mt, intervals)

    mt = hl.experimental.densify(mt)

    return mt.filter_rows(hl.is_defined(sites_ht[mt.locus]))
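# Usage sketch for densify_sites above; all paths are hypothetical, and the
# last-END Table is assumed to be keyed by locus with a `last_END_position`
# field, as the function requires.
import hail as hl

mt = hl.read_matrix_table('gs://my-bucket/sparse.mt')      # hypothetical input
sites_ht = hl.read_table('gs://my-bucket/sites.ht')        # hypothetical input
last_end_ht = hl.read_table('gs://my-bucket/last_end.ht')  # hypothetical input

# With only a handful of sites, interval filtering beats the semi-join.
dense_mt = densify_sites(mt, sites_ht, last_end_ht, semi_join_rows=False)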
def test_matrix_filter_intervals(self):
    ds = hl.import_vcf(resource('sample.vcf'), min_partitions=20)

    self.assertEqual(
        hl.filter_intervals(ds, [hl.parse_locus_interval('20:10639222-10644705')]).count_rows(), 3)

    intervals = [hl.parse_locus_interval('20:10639222-10644700'),
                 hl.parse_locus_interval('20:10644700-10644705')]
    self.assertEqual(hl.filter_intervals(ds, intervals).count_rows(), 3)

    intervals = hl.array([hl.parse_locus_interval('20:10639222-10644700'),
                          hl.parse_locus_interval('20:10644700-10644705')])
    self.assertEqual(hl.filter_intervals(ds, intervals).count_rows(), 3)

    intervals = hl.array([hl.eval(hl.parse_locus_interval('20:10639222-10644700')),
                          hl.parse_locus_interval('20:10644700-10644705')])
    self.assertEqual(hl.filter_intervals(ds, intervals).count_rows(), 3)

    intervals = [hl.eval(hl.parse_locus_interval('[20:10019093-10026348]')),
                 hl.eval(hl.parse_locus_interval('[20:17705793-17716416]'))]
    self.assertEqual(hl.filter_intervals(ds, intervals).count_rows(), 4)
def filter_to_autosomes(
        t: Union[hl.MatrixTable, hl.Table]) -> Union[hl.MatrixTable, hl.Table]:
    """
    Filters the Table or MatrixTable to autosomes only.
    This assumes that the input contains a field named `locus` of type Locus

    :param t: Input MT/HT
    :return: MT/HT autosomes
    """
    reference = get_reference_genome(t.locus)
    autosomes = hl.parse_locus_interval(
        f"{reference.contigs[0]}-{reference.contigs[21]}",
        reference_genome=reference)
    return hl.filter_intervals(t, [autosomes])
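# Minimal usage sketch for filter_to_autosomes above (hypothetical path). The
# parsed interval spans the first through the 22nd contig of the reference,
# i.e. chr1-chr22 on GRCh38.
import hail as hl

mt = hl.read_matrix_table('gs://my-bucket/dataset.mt')  # hypothetical input
mt = filter_to_autosomes(mt)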
def vcf_to_mt(splice_ai_snvs_path, splice_ai_indels_path, genome_version):
    """
    Loads the SNV and indel source VCFs into a matrix table and returns the table.

    :param splice_ai_snvs_path: source location
    :param splice_ai_indels_path: source location
    :return: matrix table
    """
    logger.info("==> reading in splice_ai vcfs: %s, %s" %
                (splice_ai_snvs_path, splice_ai_indels_path))

    # For GRCh37 the MT contig is included; for GRCh38 it is not.
    interval = "1-MT" if genome_version == "37" else "chr1-chrY"
    contig_dict = None
    if genome_version == "38":
        contig_dict = NO_CHR_TO_CHR_CONTIG_RECODING

    mt = hl.import_vcf(
        [splice_ai_snvs_path, splice_ai_indels_path],
        reference_genome=f"GRCh{genome_version}",
        contig_recoding=contig_dict,
        force_bgz=True,
        min_partitions=10000,
        skip_invalid_loci=True,
    )
    interval = [
        hl.parse_locus_interval(interval, reference_genome=f"GRCh{genome_version}")
    ]
    mt = hl.filter_intervals(mt, interval)

    # Split the SpliceAI field by the | delimiter. Capture the delta score
    # entries and map them to floats.
    delta_scores = mt.info.SpliceAI[0].split(delim="\\|")[2:6]
    splice_split = mt.info.annotate(
        SpliceAI=hl.map(lambda x: hl.float32(x), delta_scores))
    mt = mt.annotate_rows(info=splice_split)

    # Annotate info.max_DS with the max of DS_AG, DS_AL, DS_DG, DS_DL in info.
    # The delta_score array is |DS_AG|DS_AL|DS_DG|DS_DL.
    consequences = hl.literal(
        ["Acceptor gain", "Acceptor loss", "Donor gain", "Donor loss"])
    mt = mt.annotate_rows(info=mt.info.annotate(
        max_DS=hl.max(mt.info.SpliceAI)))
    mt = mt.annotate_rows(info=mt.info.annotate(splice_consequence=hl.if_else(
        mt.info.max_DS > 0,
        consequences[mt.info.SpliceAI.index(mt.info.max_DS)],
        "No consequence",
    )))
    return mt
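# Hypothetical invocation of vcf_to_mt above; the SpliceAI VCF paths are
# placeholders.
mt = vcf_to_mt(
    'gs://my-bucket/spliceai_snvs.vcf.bgz',    # hypothetical input
    'gs://my-bucket/spliceai_indels.vcf.bgz',  # hypothetical input
    genome_version='38',
)
ht = mt.rows()
ht.select(max_DS=ht.info.max_DS, splice_consequence=ht.info.splice_consequence).show()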
def filter_snps(mt, maf):
    mt = hl.variant_qc(mt)
    mt = mt.annotate_rows(maf=hl.min(mt.variant_qc.AF))
    mt = mt.filter_rows(mt.maf > maf)

    # MHC chr6:25-35Mb
    # chr8.inversion chr8:7-13Mb
    intervals = ['chr6:25M-35M', 'chr8:7M-13M']
    mt = hl.filter_intervals(mt, [
        hl.parse_locus_interval(x, reference_genome='GRCh38')
        for x in intervals
    ], keep=False)
    return mt
def test_filter_intervals_default():
    vds = hl.vds.read_vds(
        os.path.join(resource('vds'), '1kg_2samples_starts.vds'))
    intervals = [
        hl.parse_locus_interval('chr22:10514784-10517000',
                                reference_genome='GRCh38')
    ]
    filt = hl.vds.filter_intervals(vds, intervals)

    assert hl.vds.to_dense_mt(filt)._same(
        hl.filter_intervals(hl.vds.to_dense_mt(vds), intervals))

    var = filt.variant_data
    assert var.aggregate_rows(hl.agg.all(intervals[0].contains(var.locus)))
def pca_filter_mt(in_mt: hl.MatrixTable,
                  maf: float = 0.05,
                  hwe: float = 1e-3,
                  call_rate: float = 0.98,
                  ld_cor: float = 0.2,
                  ld_window: int = 250000):
    print("\nInitial number of SNPs before filtering: {}".format(
        in_mt.count_rows()))
    mt = hl.variant_qc(in_mt)

    print(f'\nFiltering out variants with MAF < {maf}')
    mt_filt = mt.annotate_rows(maf=hl.min(mt.variant_qc.AF))
    mt_filt = mt_filt.filter_rows(mt_filt.maf > maf)

    print(f'\nFiltering out variants with HWE < {hwe:1e}')
    mt_filt = mt_filt.filter_rows(mt_filt.variant_qc.p_value_hwe > hwe)

    print(f'\nFiltering out variants with Call Rate < {call_rate}')
    mt_filt = mt_filt.filter_rows(mt_filt.variant_qc.call_rate >= call_rate)

    # no strand ambiguity
    print('\nFiltering out strand ambiguous variants')
    mt_filt = mt_filt.filter_rows(
        ~hl.is_strand_ambiguous(mt_filt.alleles[0], mt_filt.alleles[1]))

    # MHC chr6:25-35Mb
    # chr8.inversion chr8:7-13Mb
    print(
        '\nFiltering out variants in MHC [chr6:25M-35M] and chromosome 8 inversions [chr8:7M-13M]'
    )
    intervals = ['chr6:25M-35M', 'chr8:7M-13M']
    mt_filt = hl.filter_intervals(mt_filt, [
        hl.parse_locus_interval(x, reference_genome='GRCh38')
        for x in intervals
    ], keep=False)

    # This step is expensive (on local machine)
    print(
        f'\nLD pruning using correlation threshold of {ld_cor} and window size of {ld_window}'
    )
    mt_ld_prune = hl.ld_prune(mt_filt.GT, r2=ld_cor, bp_window_size=ld_window)
    mt_ld_pruned = mt_filt.filter_rows(
        hl.is_defined(mt_ld_prune[mt_filt.row_key]))
    print("\nNumber of SNPs after filtering: {}".format(
        mt_ld_pruned.count_rows()))

    return mt_ld_pruned
def test_filter_intervals_default_table():
    vds = hl.vds.read_vds(
        os.path.join(resource('vds'), '1kg_2samples_starts.vds'))
    intervals = [
        hl.parse_locus_interval('chr22:10514784-10517000',
                                reference_genome='GRCh38')
    ]
    intervals_table = hl.Table.parallelize(
        hl.array(intervals).map(lambda x: hl.struct(interval=x)),
        key='interval')
    filt = hl.vds.filter_intervals(vds, intervals_table)

    assert hl.vds.to_dense_mt(filt)._same(
        hl.filter_intervals(hl.vds.to_dense_mt(vds), intervals))

    var = filt.variant_data
    assert var.aggregate_rows(hl.agg.all(intervals[0].contains(var.locus)))
def get_chr_dp_ann(chrom: str) -> hl.Table:
    contig_size = get_contig_size(chrom)
    chr_mt = hl.filter_intervals(mt, [hl.parse_locus_interval(chrom)])

    if chrom in ref.x_contigs:
        chr_mt = chr_mt.filter_rows(chr_mt.locus.in_x_nonpar())
    if chrom in ref.y_contigs:
        chr_mt = chr_mt.filter_rows(chr_mt.locus.in_y_nonpar())

    return chr_mt.select_cols(
        **{
            f"{chrom}_mean_dp":
            hl.agg.sum(
                hl.cond(
                    chr_mt.LGT.is_hom_ref(),
                    chr_mt.DP * (1 + chr_mt.END - chr_mt.locus.position),
                    chr_mt.DP,
                )) / contig_size
        }).cols()
def vcf_to_mt(splice_ai_snvs_path, splice_ai_indels_path, genome_version):
    '''
    Loads the SNV and indel source VCFs into a matrix table and returns the table.

    :param splice_ai_snvs_path: source location
    :param splice_ai_indels_path: source location
    :return: matrix table
    '''
    logger.info('==> reading in splice_ai vcfs: %s, %s' %
                (splice_ai_snvs_path, splice_ai_indels_path))

    # For GRCh37 the MT contig is included; for GRCh38 it is not.
    interval = '1-MT' if genome_version == '37' else 'chr1-chrY'
    contig_dict = None
    if genome_version == '38':
        rg = hl.get_reference('GRCh37')
        grch37_contigs = [
            x for x in rg.contigs
            if not x.startswith('GL') and not x.startswith('M')
        ]
        contig_dict = dict(
            zip(grch37_contigs, ['chr' + x for x in grch37_contigs]))

    mt = hl.import_vcf([splice_ai_snvs_path, splice_ai_indels_path],
                       reference_genome=f"GRCh{genome_version}",
                       contig_recoding=contig_dict,
                       force_bgz=True,
                       min_partitions=10000,
                       skip_invalid_loci=True)
    interval = [
        hl.parse_locus_interval(interval, reference_genome=f"GRCh{genome_version}")
    ]
    mt = hl.filter_intervals(mt, interval)

    # Annotate info.max_DS with the max of DS_AG, DS_AL, DS_DG, DS_DL in info.
    info = mt.info.annotate(max_DS=hl.max(
        [mt.info.DS_AG, mt.info.DS_AL, mt.info.DS_DG, mt.info.DS_DL]))
    mt = mt.annotate_rows(info=info)

    return mt
def filter_intervals(vds: 'VariantDataset', intervals: 'ArrayExpression', *,
                     keep: bool = False) -> 'VariantDataset':
    """Filter intervals in a :class:`.VariantDataset` (only on variant data for now).

    Parameters
    ----------
    vds : :class:`.VariantDataset`
        Dataset in VariantDataset representation.
    intervals : :class:`.ArrayExpression` of type :class:`.tinterval`
        Intervals to filter on.
    keep : :obj:`bool`
        Whether to keep, or filter out (default), rows that fall within any
        interval in `intervals`.

    Returns
    -------
    :class:`.VariantDataset`
    """
    # For now, don't touch reference data.
    # Should remove large regions and scan forward ref blocks to the start of
    # the next kept region.
    variant_data = hl.filter_intervals(vds.variant_data, intervals, keep)
    return VariantDataset(vds.reference_data, variant_data)
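# A short sketch of the keep semantics of the VDS filter_intervals above; note
# that keep defaults to False here, unlike hl.filter_intervals. The VDS path is
# hypothetical and the interval comes from the tests earlier in this section.
import hail as hl

vds = hl.vds.read_vds('gs://my-bucket/cohort.vds')  # hypothetical input
intervals = hl.array([hl.parse_locus_interval('chr22:10514784-10517000',
                                              reference_genome='GRCh38')])

kept = filter_intervals(vds, intervals, keep=True)  # variants inside the interval
dropped = filter_intervals(vds, intervals)          # variants outside the interval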
def get_chr_coverage_ann(chrom: str) -> hl.Table:
    logger.info(f"Working on {chrom}...")
    chr_mt = hl.filter_intervals(mt, [hl.parse_locus_interval(chrom)])

    # filtering if exclude/include intervals are defined
    if included_calling_intervals is not None:
        logger.info(f"Filtering variants defined in calling interval {args.interval_to_include}...")
        total_variants = chr_mt.count_rows()
        chr_mt = chr_mt.filter_rows(
            hl.is_defined(included_calling_intervals[chr_mt.locus])
        )
        variant_in_interval = chr_mt.count_rows()
        logger.info(f"Including {variant_in_interval} out of {total_variants} defined in intervals...")

    if excluded_calling_intervals is not None:
        logger.info(f"Excluding variants defined in interval {args.interval_to_exclude}...")
        total_variants = chr_mt.count_rows()
        chr_mt = chr_mt.filter_rows(
            hl.is_missing(excluded_calling_intervals[chr_mt.locus])
        )
        excluded_variants = total_variants - chr_mt.count_rows()
        logger.info(f"Excluding {excluded_variants} out of {total_variants} defined in intervals...")

    # exclude sex chromosome pseudo-autosomal region (PAR) from the computation
    if chrom in ref.x_contigs:
        chr_mt = chr_mt.filter_rows(chr_mt.locus.in_x_nonpar())
    if chrom in ref.y_contigs:
        chr_mt = chr_mt.filter_rows(chr_mt.locus.in_y_nonpar())

    return chr_mt.select_cols(
        **{
            f"{chrom}_mean_coverage": hl.agg.mean(chr_mt.DP),
            f"{chrom}_callrate": hl.agg.fraction(hl.is_defined(chr_mt.GT))
        }
    ).cols()
def get_r_within_gene(
    bm: BlockMatrix,
    ld_index: hl.Table,
    gene: str,
    vep_ht: hl.Table = None,
    reference_genome: str = None,
):
    """
    Get LD information (`r`) for all pairs of variants within `gene`.

    Warning: this returns a table quadratic in number of variants. Exercise caution with large genes.

    :param bm: Input Block Matrix
    :param ld_index: Corresponding index table
    :param gene: Gene symbol as string
    :param vep_ht: Table with VEP annotations (if None, gets from get_gnomad_public_data())
    :param reference_genome: Reference genome to pass to get_gene_intervals for fast filtering to gene
    :return: Table with pairs of variants
    """
    if vep_ht is None:
        vep_ht = public_release("exomes").ht()
    if reference_genome is None:
        reference_genome = hl.default_reference().name
    intervals = hl.experimental.get_gene_intervals(
        gene_symbols=[gene], reference_genome=reference_genome)
    ld_index = hl.filter_intervals(ld_index, intervals)
    ld_index = ld_index.annotate(vep=vep_ht[ld_index.key].vep)
    ld_index = ld_index.filter(
        hl.any(lambda tc: tc.gene_symbol == gene,
               ld_index.vep.transcript_consequences))

    indices_to_keep = ld_index.idx.collect()
    filt_bm = bm.filter(indices_to_keep, indices_to_keep)
    ht = filt_bm.entries()
    ld_index = ld_index.add_index("new_idx").key_by("new_idx")
    return ht.transmute(r=ht.entry,
                        i_variant=ld_index[ht.i],
                        j_variant=ld_index[ht.j])
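# Hypothetical call to get_r_within_gene above; the BlockMatrix and index Table
# paths are placeholders, and BRCA1 is just an example gene symbol.
import hail as hl
from hail.linalg import BlockMatrix

bm = BlockMatrix.read('gs://my-bucket/ld.bm')         # hypothetical input
ld_index = hl.read_table('gs://my-bucket/ld.idx.ht')  # hypothetical input
r_ht = get_r_within_gene(bm, ld_index, 'BRCA1', reference_genome='GRCh37')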
def filter_to_autosomes(
    mtds: Union[hl.Table, hl.MatrixTable, hl.vds.VariantDataset]
) -> Union[hl.Table, hl.MatrixTable, hl.vds.VariantDataset]:
    """
    Filter Table, MatrixTable or VariantDataset to autosome contigs only.

    This assumes that the input MT or VDS variant_data MT contains a field named `locus` of type Locus

    :param mtds: Input MatrixTable/Table/VariantDataset
    :return: MatrixTable/Table/VariantDataset subset to autosomes
    """
    if isinstance(mtds, hl.vds.VariantDataset):
        reference = get_reference_genome(mtds.variant_data.locus)
    else:
        reference = get_reference_genome(mtds.locus)

    autosomes = hl.parse_locus_interval(
        f"{reference.contigs[0]}-{reference.contigs[21]}",
        reference_genome=reference
    )

    if isinstance(mtds, hl.vds.VariantDataset):
        return hl.vds.filter_intervals(mtds, [autosomes], keep=True)
    else:
        return hl.filter_intervals(mtds, [autosomes], keep=True)
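# This overload of filter_to_autosomes also accepts a VariantDataset, in which
# case it dispatches to hl.vds.filter_intervals; a minimal sketch with a
# hypothetical path:
import hail as hl

vds = hl.vds.read_vds('gs://my-bucket/cohort.vds')  # hypothetical input
vds = filter_to_autosomes(vds)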
def read_gnomad_subset(genome_version):
    logger.info("==> Read gnomAD subset")
    filtered_contig = '1' if genome_version == '37' else 'chr1'

    # select a subset of gnomAD genomes variants with allele frequency > 0.90
    ht = hl.read_table(
        'gs://gnomad-public/release/2.1.1/ht/genomes/gnomad.genomes.r2.1.1.sites.ht'
    )
    ht = hl.filter_intervals(ht, [
        hl.parse_locus_interval(filtered_contig,
                                reference_genome='GRCh%s' % genome_version)
    ])
    ht = ht.filter(ht.freq[0].AF > 0.90)
    ht = ht.annotate(sorted_transaction_consequences=(
        get_expr_for_vep_sorted_transcript_consequences_array(
            ht.vep, omit_consequences=[])))
    ht = ht.annotate(main_transcript=(
        get_expr_for_worst_transcript_consequence_annotations_struct(
            ht.sorted_transaction_consequences)))
    ht.describe()

    return ht
def annotate_sex(
    mtds: Union[hl.MatrixTable, hl.vds.VariantDataset],
    is_sparse: bool = True,
    excluded_intervals: Optional[hl.Table] = None,
    included_intervals: Optional[hl.Table] = None,
    normalization_contig: str = "chr20",
    reference_genome: str = "GRCh38",
    sites_ht: Optional[hl.Table] = None,
    aaf_expr: Optional[str] = None,
    gt_expr: str = "GT",
    f_stat_cutoff: float = 0.5,
    aaf_threshold: float = 0.001,
) -> hl.Table:
    """
    Impute sample sex based on X-chromosome heterozygosity and sex chromosome ploidy.

    Return Table with the following fields:
        - s (str): Sample
        - chr20_mean_dp (float32): Sample's mean coverage over chromosome 20.
        - chrX_mean_dp (float32): Sample's mean coverage over chromosome X.
        - chrY_mean_dp (float32): Sample's mean coverage over chromosome Y.
        - chrX_ploidy (float32): Sample's imputed ploidy over chromosome X.
        - chrY_ploidy (float32): Sample's imputed ploidy over chromosome Y.
        - f_stat (float64): Sample f-stat. Calculated using hl.impute_sex.
        - n_called (int64): Number of variants with a genotype call. Calculated using hl.impute_sex.
        - expected_homs (float64): Expected number of homozygotes. Calculated using hl.impute_sex.
        - observed_homs (int64): Observed number of homozygotes. Calculated using hl.impute_sex.
        - X_karyotype (str): Sample's chromosome X karyotype.
        - Y_karyotype (str): Sample's chromosome Y karyotype.
        - sex_karyotype (str): Sample's sex karyotype.

    :param mtds: Input MatrixTable or VariantDataset
    :param bool is_sparse: Whether input MatrixTable is in sparse data format
    :param excluded_intervals: Optional table of intervals to exclude from the computation.
    :param included_intervals: Optional table of intervals to use in the computation. REQUIRED for exomes.
    :param normalization_contig: Which chromosome to use to normalize sex chromosome coverage. Used in determining sex chromosome ploidies.
    :param reference_genome: Reference genome used for constructing interval list. Default: 'GRCh38'
    :param sites_ht: Optional Table to use. If present, filters input MatrixTable to sites in this Table prior to imputing sex, and pulls alternate allele frequency from this Table.
    :param aaf_expr: Optional. Name of field in input MatrixTable with alternate allele frequency.
    :param gt_expr: Name of entry field storing the genotype. Default: 'GT'
    :param f_stat_cutoff: f-stat to roughly divide 'XX' from 'XY' samples. Assumes XX samples are below cutoff and XY are above cutoff.
    :param float aaf_threshold: Minimum alternate allele frequency to be used in f-stat calculations.
    :return: Table of samples and their imputed sex karyotypes.
    """
    logger.info("Imputing sex chromosome ploidies...")

    is_vds = isinstance(mtds, hl.vds.VariantDataset)
    if is_vds:
        if excluded_intervals is not None:
            raise NotImplementedError(
                "excluded_intervals is not used when imputing sex chromosome ploidy for VDS"
            )
        ploidy_ht = hl.vds.impute_sex_chromosome_ploidy(
            mtds,
            calling_intervals=included_intervals,
            normalization_contig=normalization_contig,
        )
        ploidy_ht = ploidy_ht.rename(
            {"x_ploidy": "chrX_ploidy", "y_ploidy": "chrY_ploidy"}
        )
        mt = mtds.variant_data
    else:
        mt = mtds
        if is_sparse:
            ploidy_ht = impute_sex_ploidy(
                mt, excluded_intervals, included_intervals, normalization_contig
            )
        else:
            raise NotImplementedError(
                "Imputing sex ploidy does not exist yet for dense data."
            )

    x_contigs = get_reference_genome(mt.locus).x_contigs
    logger.info("Filtering mt to biallelic SNPs in X contigs: %s", x_contigs)
    if "was_split" in list(mt.row):
        mt = mt.filter_rows((~mt.was_split) & hl.is_snp(mt.alleles[0], mt.alleles[1]))
    else:
        mt = mt.filter_rows(
            (hl.len(mt.alleles) == 2) & hl.is_snp(mt.alleles[0], mt.alleles[1])
        )

    mt = hl.filter_intervals(
        mt,
        [
            hl.parse_locus_interval(contig, reference_genome=reference_genome)
            for contig in x_contigs
        ],
        keep=True,
    )

    if sites_ht is not None:
        if aaf_expr is None:
            logger.warning(
                "sites_ht was provided, but aaf_expr is missing. Assuming name of field with alternate allele frequency is 'AF'."
            )
            aaf_expr = "AF"
        logger.info("Filtering to provided sites")
        mt = mt.annotate_rows(**sites_ht[mt.row_key])
        mt = mt.filter_rows(hl.is_defined(mt[aaf_expr]))

    logger.info("Calculating inbreeding coefficient on chrX")
    sex_ht = hl.impute_sex(
        mt[gt_expr],
        aaf_threshold=aaf_threshold,
        male_threshold=f_stat_cutoff,
        female_threshold=f_stat_cutoff,
        aaf=aaf_expr,
    )

    logger.info("Annotating sex ht with sex chromosome ploidies")
    sex_ht = sex_ht.annotate(**ploidy_ht[sex_ht.key])

    logger.info("Inferring sex karyotypes")
    x_ploidy_cutoffs, y_ploidy_cutoffs = get_ploidy_cutoffs(sex_ht, f_stat_cutoff)
    sex_ht = sex_ht.annotate_globals(
        x_ploidy_cutoffs=hl.struct(
            upper_cutoff_X=x_ploidy_cutoffs[0],
            lower_cutoff_XX=x_ploidy_cutoffs[1][0],
            upper_cutoff_XX=x_ploidy_cutoffs[1][1],
            lower_cutoff_XXX=x_ploidy_cutoffs[2],
        ),
        y_ploidy_cutoffs=hl.struct(
            lower_cutoff_Y=y_ploidy_cutoffs[0][0],
            upper_cutoff_Y=y_ploidy_cutoffs[0][1],
            lower_cutoff_YY=y_ploidy_cutoffs[1],
        ),
        f_stat_cutoff=f_stat_cutoff,
    )
    return sex_ht.annotate(
        **get_sex_expr(
            sex_ht.chrX_ploidy, sex_ht.chrY_ploidy, x_ploidy_cutoffs, y_ploidy_cutoffs
        )
    )
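# Hedged usage sketch for annotate_sex above with a VDS input; paths are
# hypothetical, and per the docstring included_intervals is required for exomes.
import hail as hl

vds = hl.vds.read_vds('gs://my-bucket/exomes.vds')                        # hypothetical input
calling_intervals = hl.read_table('gs://my-bucket/calling_intervals.ht')  # hypothetical input
sex_ht = annotate_sex(vds, included_intervals=calling_intervals)
sex_ht.select('sex_karyotype').show()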
def main(args):
    hl.init(log='/frequency_data_generation.log', default_reference='GRCh38')

    logger.info("Reading sparse MT and metadata table...")
    mt = get_gnomad_v3_mt(key_by_locus_and_alleles=True)
    meta_ht = meta.ht().select('pop', 'sex', 'project_id', 'release', 'sample_filters')

    if args.test:
        logger.info("Filtering to chr20:1-1000000")
        mt = hl.filter_intervals(mt, [hl.parse_locus_interval('chr20:1-1000000')])

    mt = hl.experimental.sparse_split_multi(mt, filter_changed_loci=True)

    logger.info("Annotating sparse MT with metadata...")
    mt = mt.annotate_cols(meta=meta_ht[mt.s])
    mt = mt.filter_cols(mt.meta.release)
    samples = mt.count_cols()
    logger.info(f"Running frequency table prep and generation pipeline on {samples} samples")

    logger.info("Computing adj and sex adjusted genotypes.")
    mt = mt.annotate_entries(
        GT=adjusted_sex_ploidy_expr(mt.locus, mt.GT, mt.meta.sex),
        adj=get_adj_expr(mt.GT, mt.GQ, mt.DP, mt.AD)
    )

    logger.info("Densify-ing...")
    mt = hl.experimental.densify(mt)
    mt = mt.filter_rows(hl.len(mt.alleles) > 1)

    logger.info("Setting het genotypes at sites with >1% AF (using v3.0 frequencies) and > 0.9 AB to homalt...")
    # hotfix for depletion of homozygous alternate genotypes
    # Using v3.0 AF to avoid an extra frequency calculation
    # TODO: Using previous callset AF works for small incremental changes to a callset, but we need to revisit for large increments
    freq_ht = freq.versions["3"].ht()
    freq_ht = freq_ht.select(AF=freq_ht.freq[0].AF)
    mt = mt.annotate_entries(
        GT=hl.cond(
            (freq_ht[mt.row_key].AF > 0.01)
            & mt.GT.is_het()
            & (mt.AD[1] / mt.DP > 0.9),
            hl.call(1, 1),
            mt.GT,
        )
    )

    logger.info("Calculating InbreedingCoefficient...")
    # NOTE: This is not the ideal location to calculate this, but added here to avoid another densify
    mt = mt.annotate_rows(InbreedingCoeff=bi_allelic_site_inbreeding_expr(mt.GT))

    logger.info("Generating frequency data...")
    mt = annotate_freq(
        mt,
        sex_expr=mt.meta.sex,
        pop_expr=mt.meta.pop
    )

    # Select freq, FAF and popmax
    faf, faf_meta = faf_expr(mt.freq, mt.freq_meta, mt.locus, POPS_TO_REMOVE_FOR_POPMAX)
    mt = mt.select_rows(
        'InbreedingCoeff',
        'freq',
        faf=faf,
        popmax=pop_max_expr(mt.freq, mt.freq_meta, POPS_TO_REMOVE_FOR_POPMAX)
    )
    mt = mt.annotate_globals(faf_meta=faf_meta)

    # Annotate quality metrics histograms, as these also require densifying
    mt = mt.annotate_rows(
        **qual_hist_expr(mt.GT, mt.GQ, mt.DP, mt.AD)
    )

    logger.info("Writing out frequency data...")
    if args.test:
        mt.rows().write("gs://gnomad-tmp/gnomad_freq/chr20_1_1000000_freq.ht", overwrite=True)
    else:
        mt.rows().write(freq.path, overwrite=args.overwrite)
print('n variants failing VQSR:')
pprint(fail_VQSR)
print('n variants in low complexity regions:')
pprint(in_LCR)
print('n variants not in padded target intervals:')
pprint(not_in_padded_target_intervals)

# Filter to variants in the autosomes, chrX and Y.
mt_rows = mt.rows()
mt = mt.filter_rows(mt.fail_VQSR | mt.in_LCR | mt.not_in_padded_target_intervals, keep=False)
intervals = [hl.parse_locus_interval(x, reference_genome='GRCh38')
             for x in ['chr1:START-chr22:END', 'chrX:START-chrX:END', 'chrY:START-chrY:END']]
n_after_filter = mt.count_rows()

mt = hl.filter_intervals(mt, intervals)
n_in_chr = mt.count_rows()

print('n variants in autosomes, chrX, chrY:')
pprint(n_in_chr)
print('n variants removed in contigs outside autosomes, X and Y:')
pprint(n_after_filter - n_in_chr)

# Count the variants remaining after the initial variant and genotype filters.
n_initial_variant_list = hl.import_table(INITIAL_VARIANT_LIST).count()
print('Invariant sites after initial variant and genotype filters:')
pprint(n_initial_variant_list)

# Create an initial markdown table
with hl.hadoop_open('gs://dalio_bipolar_w1_w2_hail_02/data/summary_variant_table.tsv', 'w') as f: