def test_agg_cols_explode(self):
    t = hl.utils.range_matrix_table(1, 10)
    tests = [
        (agg.explode(
            lambda elt: agg.collect(elt + 1).append(0),
            hl.cond(t.col_idx > 7, [t.col_idx, t.col_idx + 1], hl.empty_array(hl.tint32))),
         [9, 10, 10, 11, 0]),
        (agg.explode(
            lambda elt: agg.explode(
                lambda elt2: agg.collect(elt2 + 1).append(0),
                [elt, elt + 1]),
            hl.cond(t.col_idx > 7, [t.col_idx, t.col_idx + 1], hl.empty_array(hl.tint32))),
         [9, 10, 10, 11, 10, 11, 11, 12, 0]),
        (agg.explode(
            lambda elt: agg.filter(elt > 8, agg.collect(elt + 1).append(0)),
            hl.cond(t.col_idx > 7, [t.col_idx, t.col_idx + 1], hl.empty_array(hl.tint32))),
         [10, 10, 11, 0]),
        (agg.explode(
            lambda elt: agg.group_by(elt % 3, agg.collect(elt + 1).append(0)),
            hl.cond(t.col_idx > 7, [t.col_idx, t.col_idx + 1], hl.empty_array(hl.tint32))),
         {0: [10, 10, 0], 1: [11, 0], 2: [9, 0]}),
    ]
    for aggregation, expected in tests:
        self.assertEqual(t.select_rows(result=aggregation).result.collect()[0], expected)
def test_agg_cols_group_by(self):
    t = hl.utils.range_matrix_table(1, 10)
    tests = [
        (agg.group_by(t.col_idx % 2,
                      hl.array(agg.collect_as_set(t.col_idx + 1)).append(0)),
         {0: [1, 3, 5, 7, 9, 0], 1: [2, 4, 6, 8, 10, 0]}),
        (agg.group_by(t.col_idx % 3,
                      agg.filter(t.col_idx > 7,
                                 hl.array(agg.collect_as_set(t.col_idx + 1)).append(0))),
         {0: [10, 0], 1: [0], 2: [9, 0]}),
        (agg.group_by(t.col_idx % 3,
                      agg.explode(lambda elt: agg.collect(elt + 1).append(0),
                                  hl.cond(t.col_idx > 7,
                                          [t.col_idx, t.col_idx + 1],
                                          hl.empty_array(hl.tint32)))),
         {0: [10, 11, 0], 1: [0], 2: [9, 10, 0]}),
    ]
    for aggregation, expected in tests:
        self.assertEqual(t.select_rows(result=aggregation).result.collect()[0], expected)
def add_strand_flip_annotation(reference_ref, reference_alt, ds_a1, ds_a2):
    """
    Returns an array of structs describing the possible ways to reconcile the
    dataset alleles (ds_a1, ds_a2) with the reference alleles: `swap` indicates
    that a1/a2 must be swapped so that a1 matches the reference alt allele, and
    `flip` indicates that the dataset alleles are reported on the opposite
    strand. Strand-ambiguous SNPs yield both candidate interpretations; an
    empty array means the alleles cannot be reconciled with the reference.
    """
    is_strand_ambig = hl.is_strand_ambiguous(ds_a1, ds_a2)
    ds_a1_flipped = flip_strand(ds_a1)
    ds_a2_flipped = flip_strand(ds_a2)
    is_snp = hl.is_snp(ds_a1, ds_a2)
    null = hl.null(hl.tbool)

    return (hl.case()
            .when((ds_a1 == reference_alt) & (ds_a2 == reference_ref),
                  hl.cond(is_strand_ambig,
                          [hl.struct(swap=True, flip=True),
                           hl.struct(swap=False, flip=False)],
                          [hl.struct(swap=False, flip=False)]))
            .when((ds_a1 == reference_ref) & (ds_a2 == reference_alt),
                  hl.cond(is_strand_ambig,
                          [hl.struct(swap=True, flip=False),
                           hl.struct(swap=False, flip=True)],
                          [hl.struct(swap=True, flip=False)]))
            .when((ds_a1_flipped == reference_alt) & (ds_a2_flipped == reference_ref) & is_snp,
                  [hl.struct(swap=False, flip=True)])
            .when((ds_a1_flipped == reference_ref) & (ds_a2_flipped == reference_alt) & is_snp,
                  [hl.struct(swap=True, flip=True)])
            .default(hl.empty_array(hl.tstruct(swap=hl.tbool, flip=hl.tbool))))
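# Hypothetical usage (table and field names are illustrative, not from the
# original source): given a summary-stats table `ds` with allele fields a1/a2
# and a reference sites table `ref_sites` keyed the same way with ref/alt
# fields, the annotation could be applied like this:
#
#     ds = ds.annotate(
#         swap_flip=add_strand_flip_annotation(
#             ref_sites[ds.key].ref, ref_sites[ds.key].alt, ds.a1, ds.a2))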
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--base-level-pext",
        help="Path to Hail table with base-level data",
        default="gs://gnomad-public/papers/2019-tx-annotation/gnomad_browser/all.baselevel.021620.ht",
    )
    parser.add_argument(
        "--low-max-pext-genes",
        help="Path to table containing list of genes with low max pext",
        default="gs://gnomad-public/papers/2019-tx-annotation/data/GRCH37_hg19/max_pext_low_genes.021520.tsv",
    )
    parser.add_argument("output_path", help="Path to output Hail table with region-level data")
    args = parser.parse_args()

    ds = prepare_pext_data(args.base_level_pext)

    low_max_pext_genes = hl.import_table(args.low_max_pext_genes)
    low_max_pext_genes = low_max_pext_genes.aggregate(
        hl.agg.collect_as_set(low_max_pext_genes.ensg))
    ds = ds.annotate(flags=hl.cond(
        hl.set(low_max_pext_genes).contains(ds.gene_id),
        hl.literal(["low_max_pext"]),
        hl.empty_array(hl.tstr),
    ))

    ds.write(args.output_path)
def test_explode_cols(self):
    mt = hl.utils.range_matrix_table(4, 4)
    mt = mt.annotate_entries(e=mt.row_idx * 10 + mt.col_idx)

    self.assertTrue(mt.annotate_cols(x=[1]).explode_cols('x').drop('x')._same(mt))
    self.assertEqual(mt.annotate_cols(x=hl.empty_array('int')).explode_cols('x').count_cols(), 0)
    self.assertEqual(mt.annotate_cols(x=hl.null('array<int>')).explode_cols('x').count_cols(), 0)
    self.assertEqual(mt.annotate_cols(x=hl.range(0, mt.col_idx)).explode_cols('x').count_cols(), 6)
def test_agg_explode(self):
    t = hl.Table.parallelize([
        hl.struct(a=[1, 2]),
        hl.struct(a=hl.empty_array(hl.tint32)),
        hl.struct(a=hl.null(hl.tarray(hl.tint32))),
        hl.struct(a=[3]),
        hl.struct(a=[hl.null(hl.tint32)])
    ])
    self.assertCountEqual(t.aggregate(hl.agg.collect(hl.agg.explode(t.a))),
                          [1, 2, None, 3])
def test_agg_explode(self):
    t = hl.Table.parallelize([
        hl.struct(a=[1, 2]),
        hl.struct(a=hl.empty_array(hl.tint32)),
        hl.struct(a=hl.null(hl.tarray(hl.tint32))),
        hl.struct(a=[3]),
        hl.struct(a=[hl.null(hl.tint32)])
    ])
    self.assertCountEqual(t.aggregate(hl.agg.explode(lambda elt: hl.agg.collect(elt), t.a)),
                          [1, 2, None, 3])
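# For illustration (not part of the original tests): the lambda form runs the
# inner aggregation over each exploded element. With the same data as above,
# the non-missing elements are 1, 2 and 3, and hl.agg.sum skips the missing
# element, so the sum is 6.
t_demo = hl.Table.parallelize([
    hl.struct(a=[1, 2]),
    hl.struct(a=hl.empty_array(hl.tint32)),
    hl.struct(a=hl.null(hl.tarray(hl.tint32))),
    hl.struct(a=[3]),
    hl.struct(a=[hl.null(hl.tint32)])
])
assert t_demo.aggregate(hl.agg.explode(lambda elt: hl.agg.sum(elt), t_demo.a)) == 6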
def prepare_gene_models():
    genes_grch37 = prepare_gene_models_helper("GRCh37")
    genes_grch38 = prepare_gene_models_helper("GRCh38")

    genes_grch37 = genes_grch37.select(GRCh37=genes_grch37.row_value)
    genes_grch38 = genes_grch38.select(GRCh38=genes_grch38.row_value)

    genes = genes_grch37.join(genes_grch38, how="outer")

    # Annotate genes with information from HGNC
    hgnc_path = pipeline_config.get("reference_data", "hgnc_path")
    hgnc = load_hgnc(hgnc_path)
    genes = genes.annotate(**hgnc[genes.gene_id])
    genes = genes.annotate(
        symbol=hl.or_else(
            genes.symbol,
            hl.or_else(genes.GRCh38.gencode_gene_symbol, genes.GRCh37.gencode_gene_symbol)),
    )

    # Collect all fields that can be used to search by gene symbol
    genes = genes.annotate(
        search_terms=hl.set(
            hl.empty_array(hl.tstr)
            .append(genes.symbol)
            .extend(hl.or_else(genes.previous_symbols, hl.empty_array(hl.tstr)))
            .extend(hl.or_else(genes.alias_symbols, hl.empty_array(hl.tstr)))
            .append(genes.GRCh38.gencode_gene_symbol)
            .append(genes.GRCh37.gencode_gene_symbol)
            .filter(hl.is_defined)
            .map(lambda s: s.upper())
        ),
    )

    gnomad_constraint_path = pipeline_config.get("reference_data", "gnomad_constraint_path")
    gnomad_constraint = prepare_gnomad_constraint(gnomad_constraint_path)
    genes = genes.annotate(gnomad_constraint=gnomad_constraint[genes.GRCh37.canonical_transcript_id])

    exac_constraint_path = pipeline_config.get("reference_data", "exac_constraint_path")
    exac_constraint = prepare_exac_constraint(exac_constraint_path)
    genes = genes.annotate(exac_constraint=exac_constraint[genes.GRCh37.canonical_transcript_id])

    staging_path = pipeline_config.get("output", "staging_path")
    genes.write(f"{staging_path}/gene_models.ht", overwrite=True)
def prepare_pext_data(base_level_pext_path, low_max_pext_genes_path):
    ds = prepare_base_level_pext(base_level_pext_path)

    low_max_pext_genes = hl.import_table(low_max_pext_genes_path)
    low_max_pext_genes = low_max_pext_genes.aggregate(
        hl.agg.collect_as_set(low_max_pext_genes.ensg))
    ds = ds.annotate(flags=hl.if_else(
        hl.set(low_max_pext_genes).contains(ds.gene_id),
        hl.literal(["low_max_pext"]),
        hl.empty_array(hl.tstr),
    ))

    return ds
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("gencode")
    parser.add_argument("canonical_transcripts")
    parser.add_argument("hgnc")
    parser.add_argument("--min-partitions", type=int, default=8)
    parser.add_argument("--output", required=True)
    args = parser.parse_args()

    # Load genes from GTF file
    genes = load_gencode_gene_models(args.gencode, min_partitions=args.min_partitions)
    genes = genes.transmute(gencode_gene_symbol=genes.gene_symbol)

    # Annotate genes with canonical transcript
    canonical_transcripts = load_canonical_transcripts(args.canonical_transcripts, min_partitions=args.min_partitions)
    genes = genes.annotate(canonical_transcript_id=canonical_transcripts[genes.gene_id].transcript_id)

    # Drop transcripts except for canonical
    genes = genes.annotate(
        canonical_transcript=genes.transcripts.filter(
            lambda transcript: transcript.transcript_id == genes.canonical_transcript_id
        ).head()
    )
    genes = genes.drop("transcripts")

    # Annotate genes with information from HGNC
    hgnc = load_hgnc(args.hgnc)
    genes = genes.annotate(**hgnc[genes.gene_id])
    genes = genes.annotate(symbol_source=hl.cond(hl.is_defined(genes.symbol), "hgnc", hl.null(hl.tstr)))
    genes = genes.annotate(
        symbol=hl.or_else(genes.symbol, genes.gencode_gene_symbol),
        symbol_source=hl.or_else(genes.symbol_source, "gencode"),
    )

    # Collect all fields that can be used to search by gene symbol
    genes = genes.annotate(
        symbol_upper_case=genes.symbol.upper(),
        search_terms=hl.set(
            hl.empty_array(hl.tstr)
            .append(genes.symbol)
            .extend(genes.previous_symbols)
            .extend(genes.alias_symbols)
            .append(genes.gencode_gene_symbol)
            .map(lambda s: s.upper())
        ),
    )

    genes.describe()

    genes.write(args.output, overwrite=True)
def test_complex_round_trips():
    assert_round_trip(hl.struct())
    assert_round_trip(hl.empty_array(hl.tint32))
    assert_round_trip(hl.empty_set(hl.tint32))
    assert_round_trip(hl.empty_dict(hl.tint32, hl.tint32))
    assert_round_trip(hl.locus('1', 100))
    assert_round_trip(hl.struct(x=3))
    assert_round_trip(hl.set([3, 4, 5, 3]))
    assert_round_trip(hl.array([3, 4, 5]))
    assert_round_trip(hl.dict({3: 'a', 4: 'b', 5: 'c'}))
    assert_round_trip(hl.struct(
        x=hl.dict({3: 'a', 4: 'b', 5: 'c'}),
        y=hl.array([3, 4, 5]),
        z=hl.set([3, 4, 5, 3])))
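# `assert_round_trip` is not defined in this excerpt; a minimal sketch of what
# such a helper might look like (assuming `import hail as hl` as in the
# surrounding snippets, and assuming "round trip" means
# value -> hl.literal -> hl.eval preserves the value):
def assert_round_trip(expr):
    value = hl.eval(expr)
    assert hl.eval(hl.literal(value, dtype=expr.dtype)) == value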
mnvs = mnvs.annotate(related_mnvs=mnvs.related_mnvs.map(
    lambda related_mnv: related_mnv.select(
        "combined_variant_id",
        "n_individuals",
        "other_constituent_snvs",
        changes_amino_acids=hl.bind(
            lambda mnv_consequences, related_mnv_consequences: mnv_consequences.key_set()
            .union(related_mnv_consequences.key_set())
            .any(lambda gene_id: mnv_consequences.get(gene_id) != related_mnv_consequences.get(gene_id)),
            hl.dict(mnvs.consequences.map(lambda c: (c.gene_id, c.amino_acids.lower()))),
            hl.dict(related_mnv.consequences.map(lambda c: (c.gene_id, c.amino_acids.lower()))),
        ),
    )))

mnvs_3bp = mnvs_3bp.annotate(related_mnvs=hl.empty_array(mnvs.related_mnvs.dtype.element_type))

mnvs = mnvs.union(mnvs_3bp)

mnvs = mnvs.repartition(8, shuffle=True)
mnvs = mnvs.key_by()
mnvs.write(args.output_url)
def combine_datasets(dataset_ids):
    gene_models_path = f"{pipeline_config.get('output', 'staging_path')}/gene_models.ht"
    ds = hl.read_table(gene_models_path)

    ds = ds.annotate(gene_results=hl.struct(), variants=hl.struct())
    ds = ds.annotate_globals(meta=hl.struct(variant_fields=VARIANT_FIELDS, datasets=hl.struct()))

    for dataset_id in dataset_ids:
        dataset_path = os.path.join(pipeline_config.get("output", "staging_path"), dataset_id.lower())

        gene_results = hl.read_table(os.path.join(dataset_path, "gene_results.ht"))
        gene_group_result_field_names = gene_results.group_results.dtype.value_type.fields
        gene_group_result_field_types = [
            str(typ).rstrip("3264") for typ in gene_results.group_results.dtype.value_type.types
        ]
        gene_result_analysis_groups = list(
            gene_results.aggregate(
                hl.agg.explode(hl.agg.collect_as_set, gene_results.group_results.keys())))
        gene_results = gene_results.annotate(group_results=hl.array([
            hl.tuple([
                gene_results.group_results.get(group)[field]
                for field in gene_group_result_field_names
            ])
            for group in gene_result_analysis_groups
        ]))
        ds = ds.annotate(gene_results=ds.gene_results.annotate(
            **{dataset_id: gene_results[ds.gene_id]}))

        variant_results = hl.read_table(os.path.join(dataset_path, "variant_results.ht"))
        reference_genome = variant_results.locus.dtype.reference_genome.name
        variant_info_field_names = variant_results.info.dtype.fields
        variant_info_field_types = [
            str(typ).rstrip("3264") for typ in variant_results.info.dtype.types
        ]
        variant_group_result_field_names = variant_results.group_results.dtype.value_type.fields
        variant_group_result_field_types = [
            str(typ).rstrip("3264") for typ in variant_results.group_results.dtype.value_type.types
        ]
        variant_result_analysis_groups = list(
            variant_results.aggregate(
                hl.agg.explode(hl.agg.collect_as_set, variant_results.group_results.keys())))
        variant_results = variant_results.annotate(
            info=hl.tuple([variant_results.info[field] for field in variant_info_field_names]),
            group_results=hl.array([
                hl.rbind(
                    variant_results.group_results.get(group),
                    lambda group_result: hl.or_missing(
                        hl.is_defined(group_result),
                        hl.tuple([
                            group_result[field]
                            for field in variant_group_result_field_names
                        ]),
                    ),
                )
                for group in variant_result_analysis_groups
            ]),
        )
        variant_results = variant_results.annotate(
            variant_id=variant_results.locus.contig.replace("^chr", "")
            + "-" + hl.str(variant_results.locus.position)
            + "-" + variant_results.alleles[0]
            + "-" + variant_results.alleles[1],
            pos=variant_results.locus.position,
        )
        variant_results = variant_results.annotate(
            variant=hl.tuple([variant_results[field] for field in VARIANT_FIELDS]))
        variant_results = variant_results.group_by("gene_id").aggregate(
            variants=hl.agg.collect(variant_results.variant))
        ds = ds.annotate(variants=ds.variants.annotate(**{
            dataset_id: hl.or_else(
                variant_results[ds.gene_id].variants,
                hl.empty_array(variant_results.variants.dtype.element_type),
            )
        }))

        ds = ds.annotate_globals(meta=ds.globals.meta.annotate(
            datasets=ds.globals.meta.datasets.annotate(**{
                dataset_id: hl.struct(
                    reference_genome=reference_genome,
                    gene_result_analysis_groups=gene_result_analysis_groups or hl.empty_array(hl.tstr),
                    gene_group_result_field_names=gene_group_result_field_names or hl.empty_array(hl.tstr),
                    gene_group_result_field_types=gene_group_result_field_types or hl.empty_array(hl.tstr),
                    variant_info_field_names=variant_info_field_names or hl.empty_array(hl.tstr),
                    variant_info_field_types=variant_info_field_types or hl.empty_array(hl.tstr),
                    variant_result_analysis_groups=variant_result_analysis_groups or hl.empty_array(hl.tstr),
                    variant_group_result_field_names=variant_group_result_field_names or hl.empty_array(hl.tstr),
                    variant_group_result_field_types=variant_group_result_field_types or hl.empty_array(hl.tstr),
                ),
            })))

    return ds
def prepare_gnomad_v2_variants_helper(path, exome_or_genome):
    ds = hl.read_table(path)

    ###############
    # Frequencies #
    ###############

    g = hl.eval(ds.globals)

    subsets = ["gnomad", "controls", "non_neuro", "non_topmed"] + (["non_cancer"] if exome_or_genome == "exome" else [])

    ds = ds.select_globals()

    ds = ds.annotate(
        freq=hl.struct(**{
            subset: hl.struct(
                ac=ds.freq[g.freq_index_dict[subset]].AC,
                ac_raw=ds.freq[g.freq_index_dict[f"{subset}_raw"]].AC,
                an=ds.freq[g.freq_index_dict[subset]].AN,
                hemizygote_count=hl.if_else(ds.nonpar, ds.freq[g.freq_index_dict[f"{subset}_male"]].AC, 0),
                homozygote_count=ds.freq[g.freq_index_dict[subset]].homozygote_count,
                populations=population_frequencies_expression(ds, g.freq_index_dict, subset),
            )
            for subset in subsets
        })
    )

    # If a variant is not present in a subset, do not store population frequencies for that subset
    ds = ds.annotate(
        freq=ds.freq.annotate(**{
            subset: ds.freq[subset].annotate(
                populations=hl.if_else(
                    ds.freq[subset].ac_raw == 0,
                    hl.empty_array(ds.freq[subset].populations.dtype.element_type),
                    ds.freq[subset].populations,
                )
            )
            for subset in subsets
        })
    )

    ###########################################
    # Subsets in which the variant is present #
    ###########################################

    ds = ds.annotate(
        subsets=hl.set(
            hl.array([(subset, ds.freq[subset].ac_raw > 0) for subset in subsets])
            .filter(lambda t: t[1])
            .map(lambda t: t[0])
        )
    )

    if exome_or_genome == "genome":
        ds = ds.annotate(subsets=ds.subsets.add("non_cancer"))

    ##############################
    # Filtering allele frequency #
    ##############################

    ds = ds.annotate(
        freq=ds.freq.annotate(**{
            subset: ds.freq[subset].annotate(
                faf95=hl.rbind(
                    hl.sorted(
                        hl.array([
                            hl.struct(
                                faf=ds.faf[g.faf_index_dict[f"{subset}_{pop_id}"]].faf95,
                                population=pop_id,
                            )
                            for pop_id in (["afr", "amr", "eas", "nfe"] + (["sas"] if exome_or_genome == "exome" else []))
                        ]).filter(lambda f: f.faf > 0),
                        key=lambda f: (-f.faf, f.population),
                    ),
                    lambda fafs: hl.if_else(
                        hl.len(fafs) > 0,
                        hl.struct(popmax=fafs[0].faf, popmax_population=fafs[0].population),
                        hl.struct(popmax=hl.null(hl.tfloat), popmax_population=hl.null(hl.tstr)),
                    ),
                ),
                faf99=hl.rbind(
                    hl.sorted(
                        hl.array([
                            hl.struct(
                                faf=ds.faf[g.faf_index_dict[f"{subset}_{pop_id}"]].faf99,
                                population=pop_id,
                            )
                            for pop_id in (["afr", "amr", "eas", "nfe"] + (["sas"] if exome_or_genome == "exome" else []))
                        ]).filter(lambda f: f.faf > 0),
                        key=lambda f: (-f.faf, f.population),
                    ),
                    lambda fafs: hl.if_else(
                        hl.len(fafs) > 0,
                        hl.struct(popmax=fafs[0].faf, popmax_population=fafs[0].population),
                        hl.struct(popmax=hl.null(hl.tfloat), popmax_population=hl.null(hl.tstr)),
                    ),
                ),
            )
            for subset in (["gnomad", "controls", "non_neuro", "non_topmed"] + (["non_cancer"] if exome_or_genome == "exome" else []))
        }),
    )

    ds = ds.drop("faf")

    ####################
    # Age distribution #
    ####################

    # Format age distributions
    ds = ds.transmute(
        age_distribution=hl.struct(**{
            subset: hl.struct(het=ds.age_hist_het[index], hom=ds.age_hist_hom[index])
            for subset, index in g.age_index_dict.items()
        })
    )

    ###################
    # Quality metrics #
    ###################

    ds = ds.transmute(
        quality_metrics=hl.struct(
            allele_balance=hl.struct(
                alt_raw=ds.ab_hist_alt.annotate(
                    bin_edges=ds.ab_hist_alt.bin_edges.map(lambda n: hl.float(hl.format("%.3f", n)))
                )
            ),
            genotype_depth=hl.struct(all_raw=ds.dp_hist_all, alt_raw=ds.dp_hist_alt),
            genotype_quality=hl.struct(all_raw=ds.gq_hist_all, alt_raw=ds.gq_hist_alt),
            # Use the same fields as the VCFs
            # Based on https://github.com/macarthur-lab/gnomad_qc/blob/25a81bc2166fbe4ccbb2f7a87d36aba661150413/variant_qc/prepare_data_release.py#L128-L159
            site_quality_metrics=[
                hl.struct(metric="BaseQRankSum", value=ds.allele_info.BaseQRankSum),
                hl.struct(metric="ClippingRankSum", value=ds.allele_info.ClippingRankSum),
                hl.struct(metric="DP", value=hl.float(ds.allele_info.DP)),
                hl.struct(metric="FS", value=ds.info_FS),
                hl.struct(metric="InbreedingCoeff", value=ds.info_InbreedingCoeff),
                hl.struct(metric="MQ", value=ds.info_MQ),
                hl.struct(metric="MQRankSum", value=ds.info_MQRankSum),
                hl.struct(metric="pab_max", value=ds.pab_max),
                hl.struct(metric="QD", value=ds.info_QD),
                hl.struct(metric="ReadPosRankSum", value=ds.info_ReadPosRankSum),
                hl.struct(metric="RF", value=ds.rf_probability),
                hl.struct(metric="SiteQuality", value=ds.qual),
                hl.struct(metric="SOR", value=ds.info_SOR),
                hl.struct(metric="VQSLOD", value=ds.allele_info.VQSLOD),
                hl.struct(metric="VQSR_NEGATIVE_TRAIN_SITE", value=hl.float(ds.info_NEGATIVE_TRAIN_SITE)),
                hl.struct(metric="VQSR_POSITIVE_TRAIN_SITE", value=hl.float(ds.info_POSITIVE_TRAIN_SITE)),
            ],
        )
    )

    #################
    # Unused fields #
    #################

    ds = ds.drop(
        "adj_biallelic_rank", "adj_biallelic_singleton_rank", "adj_rank", "adj_singleton_rank",
        "allele_type", "biallelic_rank", "biallelic_singleton_rank", "has_star", "info_DP",
        "mills", "n_alt_alleles", "n_nonref", "omni", "popmax", "qd", "rank", "score",
        "singleton_rank", "singleton", "transmitted_singleton", "variant_type", "was_mixed",
        "was_split",
    )

    # These two fields appear only in the genomes table
    if "_score" in ds.row_value.dtype.fields:
        ds = ds.drop("_score", "_singleton")

    ds = ds.select(**{exome_or_genome: ds.row_value})

    return ds
@pytest.mark.parametrize(
    "input_regions,expected_output_regions",
    [
        (
            hl.literal([
                hl.utils.Struct(start=5, stop=10),
                hl.utils.Struct(start=7, stop=12),
                hl.utils.Struct(start=10, stop=11),
            ]),
            [hl.utils.Struct(start=5, stop=12)],
        ),
        (
            hl.literal([
                hl.utils.Struct(start=5, stop=10),
                hl.utils.Struct(start=11, stop=14),
                hl.utils.Struct(start=17, stop=22),
                hl.utils.Struct(start=22, stop=24),
            ]),
            [
                hl.utils.Struct(start=5, stop=14),
                hl.utils.Struct(start=17, stop=24),
            ],
        ),
        (hl.empty_array(hl.tstruct(start=hl.tint, stop=hl.tint)), []),
    ],
)
def test_merge_overlapping_regions(input_regions, expected_output_regions):
    assert hl.eval(merge_overlapping_regions(input_regions)) == expected_output_regions
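# `merge_overlapping_regions` is not shown in this excerpt; a minimal sketch
# consistent with the test's expectations (assuming `import hail as hl`): sort
# regions by start, then fold them into an accumulator array, extending the
# last merged region whenever the next one starts at or before its stop.
def merge_overlapping_regions(regions):
    return hl.fold(
        lambda acc, region: hl.if_else(
            hl.len(acc) == 0,
            acc.append(region),
            hl.if_else(
                region.start <= acc[-1].stop,
                # Overlapping (or touching at a point): extend the last region
                acc[:-1].append(acc[-1].annotate(stop=hl.max(acc[-1].stop, region.stop))),
                acc.append(region),
            ),
        ),
        hl.empty_array(regions.dtype.element_type),
        hl.sorted(regions, key=lambda region: region.start),
    )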
def get_reference_ht(
    ref: hl.ReferenceGenome,
    contigs: Optional[List[str]] = None,
    excluded_intervals: Optional[List[hl.Interval]] = None,
    add_all_substitutions: bool = False,
    filter_n: bool = True,
) -> hl.Table:
    """
    Creates a reference Table with locus and alleles (containing only the reference allele by default) from the given reference genome.

    .. note::

        If the `contigs` argument is not provided, all contigs (including obscure ones) will be added to the table.
        This can be slow as contigs are added one by one.

    :param ref: Input reference genome
    :param contigs: An optional list of contigs that the Table should include
    :param excluded_intervals: An optional list of intervals to exclude
    :param add_all_substitutions: If set, then all possible substitutions are added in the alleles array
    :param filter_n: If set, bases where the reference is unknown (n) are filtered.
    :return: Reference Table
    """
    if not ref.has_sequence():
        add_reference_sequence(ref)

    if not contigs:
        contigs = ref.contigs

    if add_all_substitutions:
        SUBSTITUTIONS_TABLE = hl.literal({
            "a": ["c", "g", "t"],
            "c": ["a", "g", "t"],
            "g": ["a", "c", "t"],
            "t": ["a", "c", "g"],
        })

    context = []
    for contig in contigs:
        n_partitions = max(1, int(ref.contig_length(contig) / 5000000))
        logger.info(f"Creating reference contig {contig} with {n_partitions} partitions.")
        _context = hl.utils.range_table(ref.contig_length(contig), n_partitions=n_partitions)

        locus_expr = hl.locus(contig=contig, pos=_context.idx + 1, reference_genome=ref)
        ref_allele_expr = locus_expr.sequence_context().lower()
        if add_all_substitutions:
            alleles_expr = hl.array([ref_allele_expr]).extend(
                SUBSTITUTIONS_TABLE.get(ref_allele_expr, hl.empty_array(hl.tstr)))
        else:
            alleles_expr = [ref_allele_expr]

        _context = (
            _context.select(locus=locus_expr, alleles=alleles_expr)
            .key_by("locus", "alleles")
            .drop("idx")
        )

        if excluded_intervals is not None:
            _context = hl.filter_intervals(_context, excluded_intervals, keep=False)

        if filter_n:
            _context = _context.filter(_context.alleles[0] == "n", keep=False)

        context.append(_context)

    return context.pop().union(*context)
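# Example invocation, as a sketch: build a reference table for a single contig
# with all possible substitutions added. Assumes the GRCh37 reference sequence
# (FASTA) is available to add_reference_sequence.
#
#     ref_ht = get_reference_ht(hl.get_reference("GRCh37"), contigs=["20"],
#                               add_all_substitutions=True)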
def field_to_array(ds, field):
    return hl.cond(ds[field] != 0, hl.array([field]), hl.empty_array(hl.tstr))
def prepare_gnomad_v2_mnvs(mnvs_path, three_bp_mnvs_path):
    mnvs = import_mnv_file(mnvs_path, quote="'")
    mnvs_3bp = import_three_bp_mnv_file(three_bp_mnvs_path, quote="'")

    snp12_components = mnvs_3bp.select(
        component_mnv=hl.bind(
            lambda snv1, snv2: hl.delimit(
                [snv1.chrom, hl.str(snv1.pos), snv1.ref + snv2.ref, snv1.alt + snv2.alt], "-"),
            mnvs_3bp.constituent_snvs[0],
            mnvs_3bp.constituent_snvs[1],
        ),
        related_mnv=hl.struct(
            combined_variant_id=mnvs_3bp.variant_id,
            n_individuals=mnvs_3bp.n_individuals,
            other_constituent_snvs=[mnvs_3bp.constituent_snvs[2].variant_id],
            consequences=mnvs_3bp.consequences,
        ),
    )
    snp23_components = mnvs_3bp.select(
        component_mnv=hl.bind(
            lambda snv2, snv3: hl.delimit(
                [snv2.chrom, hl.str(snv2.pos), snv2.ref + snv3.ref, snv2.alt + snv3.alt], "-"),
            mnvs_3bp.constituent_snvs[1],
            mnvs_3bp.constituent_snvs[2],
        ),
        related_mnv=hl.struct(
            combined_variant_id=mnvs_3bp.variant_id,
            n_individuals=mnvs_3bp.n_individuals,
            other_constituent_snvs=[mnvs_3bp.constituent_snvs[0].variant_id],
            consequences=mnvs_3bp.consequences,
        ),
    )
    snp13_components = mnvs_3bp.select(
        component_mnv=hl.bind(
            lambda snv1, snv2, snv3: hl.delimit(
                [snv1.chrom, hl.str(snv1.pos), snv1.ref + snv2.ref + snv3.ref, snv1.alt + snv2.ref + snv3.alt], "-"),
            mnvs_3bp.constituent_snvs[0],
            mnvs_3bp.constituent_snvs[1],
            mnvs_3bp.constituent_snvs[2],
        ),
        related_mnv=hl.struct(
            combined_variant_id=mnvs_3bp.variant_id,
            n_individuals=mnvs_3bp.n_individuals,
            other_constituent_snvs=[mnvs_3bp.constituent_snvs[1].variant_id],
            consequences=mnvs_3bp.consequences,
        ),
    )

    component_2bp_mnvs = snp12_components.union(snp13_components).union(snp23_components)
    component_2bp_mnvs = component_2bp_mnvs.group_by(component_2bp_mnvs.component_mnv).aggregate(
        related_mnvs=hl.agg.collect(component_2bp_mnvs.related_mnv))

    mnvs = mnvs.annotate(related_mnvs=component_2bp_mnvs[mnvs.variant_id].related_mnvs)
    mnvs = mnvs.annotate(
        related_mnvs=hl.or_else(mnvs.related_mnvs, hl.empty_array(mnvs.related_mnvs.dtype.element_type)))
    mnvs = mnvs.annotate(
        related_mnvs=mnvs.related_mnvs.map(
            lambda related_mnv: related_mnv.select(
                "combined_variant_id",
                "n_individuals",
                "other_constituent_snvs",
                changes_amino_acids=hl.bind(
                    lambda mnv_consequences, related_mnv_consequences: mnv_consequences.key_set()
                    .union(related_mnv_consequences.key_set())
                    .any(lambda gene_id: mnv_consequences.get(gene_id) != related_mnv_consequences.get(gene_id)),
                    hl.dict(mnvs.consequences.map(lambda c: (c.gene_id, c.amino_acids.lower()))),
                    hl.dict(related_mnv.consequences.map(lambda c: (c.gene_id, c.amino_acids.lower()))),
                ),
            )
        )
    )

    mnvs_3bp = mnvs_3bp.annotate(related_mnvs=hl.empty_array(mnvs.related_mnvs.dtype.element_type))

    mnvs = mnvs.union(mnvs_3bp)

    return mnvs
    '#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT SAMPLE-1 SAMPLE-2 SAMPLE-3 SAMPLE-4 SAMPLE-5',
    'chr1 10000 DUP_chr1_1 N <DUP> 999 LOW_CALL_RATE END=17000;SVTYPE=DUP;CHR2=chr1;SVLEN=7000;ALGORITHMS=depth;EVIDENCE=RD;PROTEIN_CODING__NEAREST_TSS=OR4F5;PROTEIN_CODING__INTERGENIC;NONCODING_SPAN=DNase;NONCODING_BREAKPOINT=DNase;AN=1428;AC=370;AF=0.259104;N_BI_GENOS=714;N_HOMREF=415;N_HET=228;N_HOMALT=71;FREQ_HOMREF=0.581232;FREQ_HET=0.319328;FREQ_HOMALT=0.0994398;MALE_AN=772;MALE_AC=214;MALE_AF=0.277202;MALE_N_BI_GENOS=386;MALE_N_HOMREF=216;MALE_N_HET=126;MALE_N_HOMALT=44;MALE_FREQ_HOMREF=0.559586;MALE_FREQ_HET=0.326425;MALE_FREQ_HOMALT=0.11399;FEMALE_AN=656;FEMALE_AC=156;FEMALE_AF=0.237805;FEMALE_N_BI_GENOS=328;FEMALE_N_HOMREF=199;FEMALE_N_HET=102;FEMALE_N_HOMALT=27;FEMALE_FREQ_HOMREF=0.606707;FEMALE_FREQ_HET=0.310976;FEMALE_FREQ_HOMALT=0.0823171 GT:GQ:RD_CN:RD_GQ:PE_GT:PE_GQ:SR_GT:SR_GQ:EV 0/1:999:3:999:.:.:.:.:RD 0/1:52:3:52:.:.:.:.:RD 0/1:19:3:19:.:.:.:.:RD 0/0:1:2:1:.:.:.:.:RD 0/0:31:2:31:.:.:.:.:RD',
    'chr1 10000 DUP_chr1_2 N <DUP> 999 LOW_CALL_RATE;UNRESOLVED END=53500;SVTYPE=DUP;CHR2=chr1;SVLEN=43500;ALGORITHMS=depth;EVIDENCE=BAF,RD;PROTEIN_CODING__NEAREST_TSS=OR4F5;PROTEIN_CODING__INTERGENIC;LINCRNA__COPY_GAIN=FAM138A,MIR1302-2HG;NONCODING_SPAN=DNase;NONCODING_BREAKPOINT=DNase;AN=1428;AC=70;AF=0.04902;N_BI_GENOS=714;N_HOMREF=649;N_HET=60;N_HOMALT=5;FREQ_HOMREF=0.908964;FREQ_HET=0.0840336;FREQ_HOMALT=0.0070028;MALE_AN=772;MALE_AC=46;MALE_AF=0.059585;MALE_N_BI_GENOS=386;MALE_N_HOMREF=344;MALE_N_HET=38;MALE_N_HOMALT=4;MALE_FREQ_HOMREF=0.891192;MALE_FREQ_HET=0.0984456;MALE_FREQ_HOMALT=0.0103627;FEMALE_AN=656;FEMALE_AC=24;FEMALE_AF=0.036585;FEMALE_N_BI_GENOS=328;FEMALE_N_HOMREF=305;FEMALE_N_HET=22;FEMALE_N_HOMALT=1;FEMALE_FREQ_HOMREF=0.929878;FEMALE_FREQ_HET=0.0670732;FEMALE_FREQ_HOMALT=0.00304878 GT:GQ:RD_CN:RD_GQ:PE_GT:PE_GQ:SR_GT:SR_GQ:EV 0/0:1:2:1:.:.:.:.:RD 0/1:119:3:119:.:.:.:.:RD 0/1:119:3:119:.:.:.:.:RD 0/0:999:2:999:.:.:.:.:RD 0/0:133:2:133:.:.:.:.:RD',
    'chr1 10602 BND_chr1_1 N <BND> 461 UNRESOLVED;UNSTABLE_AF_PCRMINUS END=10602;SVTYPE=BND;CHR2=chr12;STRANDS=+-;SVLEN=-1;ALGORITHMS=manta;EVIDENCE=SR;UNRESOLVED_TYPE=SINGLE_ENDER_+-;END2=10546;AN=1428;AC=88;AF=0.061625;N_BI_GENOS=714;N_HOMREF=626;N_HET=88;N_HOMALT=0;FREQ_HOMREF=0.876751;FREQ_HET=0.123249;FREQ_HOMALT=0;MALE_AN=772;MALE_AC=51;MALE_AF=0.066062;MALE_N_BI_GENOS=386;MALE_N_HOMREF=335;MALE_N_HET=51;MALE_N_HOMALT=0;MALE_FREQ_HOMREF=0.867876;MALE_FREQ_HET=0.132124;MALE_FREQ_HOMALT=0;FEMALE_AN=656;FEMALE_AC=37;FEMALE_AF=0.056402;FEMALE_N_BI_GENOS=328;FEMALE_N_HOMREF=291;FEMALE_N_HET=37;FEMALE_N_HOMALT=0;FEMALE_FREQ_HOMREF=0.887195;FEMALE_FREQ_HET=0.112805;FEMALE_FREQ_HOMALT=0;gnomAD_V2_SVID=gnomAD-SV_v2.1_BND_1_1;gnomAD_V2_AF=0.00678599998354912 GT:GQ:RD_CN:RD_GQ:PE_GT:PE_GQ:SR_GT:SR_GQ:EV .:999:.:.:0:23:0:999:PE,SR 0/0:999:.:.:0:23:0:999:PE,SR .:999:.:.:0:1:0:999:PE,SR 0/0:999:.:.:0:3:0:999:PE,SR .:999:.:.:0:23:0:999:PE,SR',
    'chr1 41950 DUP_chr1_3 N <DUP> 999 LOW_CALL_RATE END=52000;SVTYPE=DUP;CHR2=chr1;SVLEN=10050;ALGORITHMS=depth;EVIDENCE=BAF,RD;PROTEIN_CODING__NEAREST_TSS=OR4F5;PROTEIN_CODING__INTERGENIC;AN=1428;AC=28;AF=0.019608;N_BI_GENOS=714;N_HOMREF=687;N_HET=26;N_HOMALT=1;FREQ_HOMREF=0.962185;FREQ_HET=0.0364146;FREQ_HOMALT=0.00140056;MALE_AN=772;MALE_AC=15;MALE_AF=0.01943;MALE_N_BI_GENOS=386;MALE_N_HOMREF=371;MALE_N_HET=15;MALE_N_HOMALT=0;MALE_FREQ_HOMREF=0.96114;MALE_FREQ_HET=0.0388601;MALE_FREQ_HOMALT=0;FEMALE_AN=656;FEMALE_AC=13;FEMALE_AF=0.019817;FEMALE_N_BI_GENOS=328;FEMALE_N_HOMREF=316;FEMALE_N_HET=11;FEMALE_N_HOMALT=1;FEMALE_FREQ_HOMREF=0.963415;FEMALE_FREQ_HET=0.0335366;FEMALE_FREQ_HOMALT=0.00304878;gnomAD_V2_SVID=gnomAD-SV_v2.1_DUP_1_1;gnomAD_V2_AF=0.068962998688221 GT:GQ:RD_CN:RD_GQ:PE_GT:PE_GQ:SR_GT:SR_GQ:EV 0/0:31:2:31:.:.:.:.:RD 0/0:58:2:58:.:.:.:.:RD 0/0:1:2:1:.:.:.:.:RD 0/0:112:2:112:.:.:.:.:RD 0/0:999:2:999:.:.:.:.:RD',
    'chr1 44000 DUP_chr1_4 N <DUP> 999 UNSTABLE_AF_PCRMINUS;LOW_CALL_RATE END=66000;SVTYPE=DUP;CHR2=chr1;SVLEN=22000;ALGORITHMS=depth;EVIDENCE=RD;PROTEIN_CODING__DUP_PARTIAL=OR4F5;NONCODING_SPAN=DNase;AN=1428;AC=96;AF=0.067227;N_BI_GENOS=714;N_HOMREF=641;N_HET=50;N_HOMALT=23;FREQ_HOMREF=0.897759;FREQ_HET=0.070028;FREQ_HOMALT=0.0322129;MALE_AN=772;MALE_AC=54;MALE_AF=0.069948;MALE_N_BI_GENOS=386;MALE_N_HOMREF=345;MALE_N_HET=28;MALE_N_HOMALT=13;MALE_FREQ_HOMREF=0.893782;MALE_FREQ_HET=0.0725389;MALE_FREQ_HOMALT=0.0336788;FEMALE_AN=656;FEMALE_AC=42;FEMALE_AF=0.064024;FEMALE_N_BI_GENOS=328;FEMALE_N_HOMREF=296;FEMALE_N_HET=22;FEMALE_N_HOMALT=10;FEMALE_FREQ_HOMREF=0.902439;FEMALE_FREQ_HET=0.0670732;FEMALE_FREQ_HOMALT=0.0304878 GT:GQ:RD_CN:RD_GQ:PE_GT:PE_GQ:SR_GT:SR_GQ:EV 0/0:125:1:125:.:.:.:.:RD 0/0:72:2:72:.:.:.:.:RD 0/0:130:2:130:.:.:.:.:RD 0/0:1:2:1:.:.:.:.:RD 0/0:1:2:1:.:.:.:.:RD',
    'chr1 44250 DUP_chr1_5 N <DUP> 999 LOW_CALL_RATE END=116000;SVTYPE=DUP;CHR2=chr1;SVLEN=71750;ALGORITHMS=depth;EVIDENCE=BAF,RD;PROTEIN_CODING__COPY_GAIN=OR4F5;LINCRNA__COPY_GAIN=AL627309.3;LINCRNA__DUP_PARTIAL=AL627309.1;NONCODING_SPAN=DNase;AN=1428;AC=82;AF=0.057423;N_BI_GENOS=714;N_HOMREF=646;N_HET=54;N_HOMALT=14;FREQ_HOMREF=0.904762;FREQ_HET=0.0756303;FREQ_HOMALT=0.0196078;MALE_AN=772;MALE_AC=43;MALE_AF=0.055699;MALE_N_BI_GENOS=386;MALE_N_HOMREF=351;MALE_N_HET=27;MALE_N_HOMALT=8;MALE_FREQ_HOMREF=0.909326;MALE_FREQ_HET=0.0699482;MALE_FREQ_HOMALT=0.0207254;FEMALE_AN=656;FEMALE_AC=39;FEMALE_AF=0.059451;FEMALE_N_BI_GENOS=328;FEMALE_N_HOMREF=295;FEMALE_N_HET=27;FEMALE_N_HOMALT=6;FEMALE_FREQ_HOMREF=0.89939;FEMALE_FREQ_HET=0.0823171;FEMALE_FREQ_HOMALT=0.0182927 GT:GQ:RD_CN:RD_GQ:PE_GT:PE_GQ:SR_GT:SR_GQ:EV 0/0:1:1:1:.:.:.:.:RD 0/0:36:2:36:.:.:.:.:RD 0/0:94:2:94:.:.:.:.:RD 0/0:130:1:130:.:.:.:.:RD 0/0:999:1:999:.:.:.:.:RD',
    'chr1 51400 DEL_chr1_1 N <DEL> 999 UNSTABLE_AF_PCRMINUS END=64000;SVTYPE=DEL;CHR2=chr1;SVLEN=12600;ALGORITHMS=depth;EVIDENCE=RD;PROTEIN_CODING__NEAREST_TSS=OR4F5;PROTEIN_CODING__INTERGENIC;NONCODING_SPAN=DNase;AN=1428;AC=306;AF=0.214286;N_BI_GENOS=714;N_HOMREF=443;N_HET=236;N_HOMALT=35;FREQ_HOMREF=0.620448;FREQ_HET=0.330532;FREQ_HOMALT=0.0490196;MALE_AN=772;MALE_AC=156;MALE_AF=0.202073;MALE_N_BI_GENOS=386;MALE_N_HOMREF=246;MALE_N_HET=124;MALE_N_HOMALT=16;MALE_FREQ_HOMREF=0.637306;MALE_FREQ_HET=0.321244;MALE_FREQ_HOMALT=0.0414508;FEMALE_AN=656;FEMALE_AC=150;FEMALE_AF=0.228659;FEMALE_N_BI_GENOS=328;FEMALE_N_HOMREF=197;FEMALE_N_HET=112;FEMALE_N_HOMALT=19;FEMALE_FREQ_HOMREF=0.60061;FEMALE_FREQ_HET=0.341463;FEMALE_FREQ_HOMALT=0.0579268 GT:GQ:RD_CN:RD_GQ:PE_GT:PE_GQ:SR_GT:SR_GQ:EV 0/1:125:1:125:.:.:.:.:RD 0/0:72:2:72:.:.:.:.:RD 0/0:112:2:112:.:.:.:.:RD 0/0:1:2:1:.:.:.:.:RD 0/0:8:2:8:.:.:.:.:RD',
    'chr1 52600 CNV_chr1_1 N <CNV> 999 FAIL_minGQ END=58000;SVTYPE=CNV;CHR2=chr1;SVLEN=5400;ALGORITHMS=depth;EVIDENCE=RD;PROTEIN_CODING__NEAREST_TSS=OR4F5;PROTEIN_CODING__INTERGENIC;NONCODING_SPAN=DNase;AN=0;AC=0;AF=0;MALE_AN=0;MALE_AC=0;MALE_AF=0;FEMALE_AN=0;FEMALE_AC=0;FEMALE_AF=0 GT:GQ:RD_CN:RD_GQ:PE_GT:PE_GQ:SR_GT:SR_GQ:EV:CN:CNQ .:.:1:125:.:.:.:.:RD:1:125 .:.:2:130:.:.:.:.:RD:2:130 .:.:2:23:.:.:.:.:RD:2:23 .:.:2:1:.:.:.:.:RD:2:1 .:.:2:1:.:.:.:.:RD:2:1',
    'chr1 66234 BND_chr1_2 N <BND> 807 UNRESOLVED END=66234;SVTYPE=BND;CHR2=chr19;STRANDS=-+;SVLEN=-1;ALGORITHMS=manta;EVIDENCE=PE;UNRESOLVED_TYPE=SINGLE_ENDER_-+;END2=108051;AN=1428;AC=236;AF=0.165266;N_BI_GENOS=714;N_HOMREF=514;N_HET=164;N_HOMALT=36;FREQ_HOMREF=0.719888;FREQ_HET=0.229692;FREQ_HOMALT=0.0504202;MALE_AN=772;MALE_AC=131;MALE_AF=0.169689;MALE_N_BI_GENOS=386;MALE_N_HOMREF=275;MALE_N_HET=91;MALE_N_HOMALT=20;MALE_FREQ_HOMREF=0.712435;MALE_FREQ_HET=0.235751;MALE_FREQ_HOMALT=0.0518135;FEMALE_AN=656;FEMALE_AC=105;FEMALE_AF=0.160061;FEMALE_N_BI_GENOS=328;FEMALE_N_HOMREF=239;FEMALE_N_HET=73;FEMALE_N_HOMALT=16;FEMALE_FREQ_HOMREF=0.728659;FEMALE_FREQ_HET=0.222561;FEMALE_FREQ_HOMALT=0.0487805 GT:GQ:RD_CN:RD_GQ:PE_GT:PE_GQ:SR_GT:SR_GQ:EV 0/0:999:.:.:0:23:0:999:PE,SR 0/0:999:.:.:0:999:0:999:PE,SR 0/0:999:.:.:0:999:0:999:PE,SR 0/0:999:.:.:0:999:0:999:PE,SR 0/0:999:.:.:0:999:0:999:PE,SR',
    'chr1 1495464 CPX_chr1_1 N <CPX> 999 PASS END=1495554;SVTYPE=CPX;CHR2=chr1;SVLEN=184;ALGORITHMS=manta;EVIDENCE=PE;CPX_TYPE=dDUP;SOURCE=DUP_chr1:1533874-1534058;CPX_INTERVALS=DUP_chr1:1533874-1534058;PROTEIN_CODING__DUP_PARTIAL=ATAD3A;PROTEIN_CODING__INTRONIC=ATAD3A;AN=1428;AC=7;AF=0.004902;N_BI_GENOS=714;N_HOMREF=707;N_HET=7;N_HOMALT=0;FREQ_HOMREF=0.990196;FREQ_HET=0.00980392;FREQ_HOMALT=0;MALE_AN=772;MALE_AC=4;MALE_AF=0.005181;MALE_N_BI_GENOS=386;MALE_N_HOMREF=382;MALE_N_HET=4;MALE_N_HOMALT=0;MALE_FREQ_HOMREF=0.989637;MALE_FREQ_HET=0.0103627;MALE_FREQ_HOMALT=0;FEMALE_AN=656;FEMALE_AC=3;FEMALE_AF=0.004573;FEMALE_N_BI_GENOS=328;FEMALE_N_HOMREF=325;FEMALE_N_HET=3;FEMALE_N_HOMALT=0;FEMALE_FREQ_HOMREF=0.990854;FEMALE_FREQ_HET=0.00914634;FEMALE_FREQ_HOMALT=0 GT:GQ:RD_CN:RD_GQ:PE_GT:PE_GQ:SR_GT:SR_GQ:EV 0/0:999:.:.:0:999:0:999:PE,SR 0/0:999:.:.:0:999:0:999:PE,SR 0/0:999:.:.:0:999:0:999:PE,SR 0/0:999:.:.:0:999:0:999:PE,SR 0/1:782:.:.:1:782:1:1:PE,SR',
    'chr1 1643228 INS_chr1_10 N <INS:ME:SVA> 250 PASS END=1643309;SVTYPE=INS;CHR2=chr1;SVLEN=169;ALGORITHMS=melt;EVIDENCE=SR;PROTEIN_CODING__INTRONIC=CDK11B;AN=1428;AC=11;AF=0.007703;N_BI_GENOS=714;N_HOMREF=703;N_HET=11;N_HOMALT=0;FREQ_HOMREF=0.984594;FREQ_HET=0.0154062;FREQ_HOMALT=0;MALE_AN=772;MALE_AC=5;MALE_AF=0.006477;MALE_N_BI_GENOS=386;MALE_N_HOMREF=381;MALE_N_HET=5;MALE_N_HOMALT=0;MALE_FREQ_HOMREF=0.987047;MALE_FREQ_HET=0.0129534;MALE_FREQ_HOMALT=0;FEMALE_AN=656;FEMALE_AC=6;FEMALE_AF=0.009146;FEMALE_N_BI_GENOS=328;FEMALE_N_HOMREF=322;FEMALE_N_HET=6;FEMALE_N_HOMALT=0;FEMALE_FREQ_HOMREF=0.981707;FEMALE_FREQ_HET=0.0182927;FEMALE_FREQ_HOMALT=0;gnomAD_V2_SVID=gnomAD-SV_v2.1_INS_1_47;gnomAD_V2_AF=0.00130899995565414 GT:GQ:RD_CN:RD_GQ:PE_GT:PE_GQ:SR_GT:SR_GQ:EV 0/0:999:.:.:0:999:0:999:PE,SR 0/0:999:.:.:0:999:0:999:PE,SR 0/0:999:.:.:0:999:0:24:PE,SR 0/0:999:.:.:0:999:0:999:PE,SR 0/1:1:.:.:0:999:1:1:SR',
]

NULL_STR_ARRAY = hl.null(hl.dtype('array<str>'))
EMPTY_STR_ARRAY = hl.empty_array(hl.dtype('str'))
NULL_INTERVALS = hl.null(hl.dtype('array<struct{type: str, chrom: str, start: int32, end: int32}>'))

VARIANT_CPX = hl.struct(
    variantId='CPX_chr1_1',
    contig='1',
    sc=7,
    sf=0.004902,
    sn=1428,
    start=1495464,
    end=1495554,
    sv_callset_Het=7,
    sv_callset_Hom=0,
    gnomad_svs_ID=hl.null('str'),
    gnomad_svs_AF=hl.null('float'),
    pos=1495464,
    filters=NULL_STR_ARRAY,
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--gencode",
        action="append",
        default=[],
        metavar=("version", "gtf_path", "canonical_transcripts_path"),
        nargs=3,
        required=True,
    )
    parser.add_argument("--hgnc")
    parser.add_argument("--mane-select-transcripts")
    parser.add_argument("--min-partitions", type=int, default=32)
    parser.add_argument("--output", required=True)
    args = parser.parse_args()

    genes = None
    all_gencode_versions = [gencode_version for gencode_version, _, _ in args.gencode]

    for gencode_version, gtf_path, canonical_transcripts_path in args.gencode:
        gencode_genes = load_gencode_gene_models(gtf_path, min_partitions=args.min_partitions)

        # Canonical transcripts file is a TSV with two columns: gene ID and transcript ID and no header row
        canonical_transcripts = hl.import_table(
            canonical_transcripts_path, key="gene_id", min_partitions=args.min_partitions)
        gencode_genes = gencode_genes.annotate(
            canonical_transcript_id=canonical_transcripts[gencode_genes.gene_id].transcript_id)

        gencode_genes = gencode_genes.select(**{f"v{gencode_version}": gencode_genes.row_value})

        if not genes:
            genes = gencode_genes
        else:
            genes = genes.join(gencode_genes, "outer")

    genes = genes.select(gencode=genes.row_value)

    hgnc = hl.import_table(args.hgnc, missing="")
    hgnc = hgnc.select(
        hgnc_id=hgnc["HGNC ID"],
        symbol=hgnc["Approved symbol"],
        name=hgnc["Approved name"],
        previous_symbols=hgnc["Previous symbols"],
        alias_symbols=hgnc["Alias symbols"],
        omim_id=hgnc["OMIM ID(supplied by OMIM)"],
        gene_id=hl.or_else(hgnc["Ensembl gene ID"], hgnc["Ensembl ID(supplied by Ensembl)"]),
    )
    hgnc = hgnc.filter(hl.is_defined(hgnc.gene_id)).key_by("gene_id")
    hgnc = hgnc.annotate(
        previous_symbols=hl.cond(
            hgnc.previous_symbols == "",
            hl.empty_array(hl.tstr),
            hgnc.previous_symbols.split(",").map(lambda s: s.strip()),
        ),
        alias_symbols=hl.cond(
            hgnc.alias_symbols == "",
            hl.empty_array(hl.tstr),
            hgnc.alias_symbols.split(",").map(lambda s: s.strip()),
        ),
    )

    genes = genes.annotate(**hgnc[genes.gene_id])
    genes = genes.annotate(symbol_source=hl.cond(hl.is_defined(genes.symbol), "hgnc", hl.null(hl.tstr)))

    # If an HGNC gene symbol was not present, use the symbol from Gencode
    for gencode_version in all_gencode_versions:
        genes = genes.annotate(
            symbol=hl.or_else(genes.symbol, genes.gencode[f"v{gencode_version}"].gene_symbol),
            symbol_source=hl.cond(
                hl.is_missing(genes.symbol) & hl.is_defined(genes.gencode[f"v{gencode_version}"].gene_symbol),
                f"gencode (v{gencode_version})",
                genes.symbol_source,
            ),
        )

    # Collect all fields that can be used to search by gene name
    genes = genes.annotate(
        symbol_upper_case=genes.symbol.upper(),
        search_terms=hl.empty_array(hl.tstr)
        .append(genes.symbol)
        .extend(genes.previous_symbols)
        .extend(genes.alias_symbols),
    )
    for gencode_version in all_gencode_versions:
        genes = genes.annotate(search_terms=hl.rbind(
            genes.gencode[f"v{gencode_version}"].gene_symbol,
            lambda symbol_in_gencode: hl.cond(
                hl.is_defined(symbol_in_gencode),
                genes.search_terms.append(symbol_in_gencode),
                genes.search_terms),
        ))
    genes = genes.annotate(search_terms=hl.set(genes.search_terms.map(lambda s: s.upper())))

    if args.mane_select_transcripts:
        mane_select_transcripts = hl.import_table(args.mane_select_transcripts, force=True)
        mane_select_transcripts = mane_select_transcripts.select(
            gene_id=mane_select_transcripts.Ensembl_Gene.split("\\.")[0],
            matched_gene_version=mane_select_transcripts.Ensembl_Gene.split("\\.")[1],
            ensembl_id=mane_select_transcripts.Ensembl_nuc.split("\\.")[0],
            ensembl_version=mane_select_transcripts.Ensembl_nuc.split("\\.")[1],
            refseq_id=mane_select_transcripts.RefSeq_nuc.split("\\.")[0],
            refseq_version=mane_select_transcripts.RefSeq_nuc.split("\\.")[1],
        )
        mane_select_transcripts = mane_select_transcripts.key_by("gene_id")

        # For GRCh38 (Gencode >= 20) transcripts, use the MANE Select transcripts to annotate transcripts
        # with their matching RefSeq transcript.
        ensembl_to_refseq_map = {}
        for transcript in mane_select_transcripts.collect():
            ensembl_to_refseq_map[transcript.ensembl_id] = {
                transcript.ensembl_version: hl.Struct(
                    refseq_id=transcript.refseq_id, refseq_version=transcript.refseq_version)
            }
        ensembl_to_refseq_map = hl.literal(ensembl_to_refseq_map)

        for gencode_version in ["19", "29"]:
            if int(gencode_version) >= 20:
                transcript_annotation = lambda transcript: transcript.annotate(
                    **ensembl_to_refseq_map.get(
                        transcript.transcript_id,
                        hl.empty_dict(hl.tstr, hl.tstruct(refseq_id=hl.tstr, refseq_version=hl.tstr)),
                    ).get(
                        transcript.transcript_version,
                        hl.struct(refseq_id=hl.null(hl.tstr), refseq_version=hl.null(hl.tstr)),
                    ))
            else:
                transcript_annotation = lambda transcript: transcript.annotate(
                    refseq_id=hl.null(hl.tstr), refseq_version=hl.null(hl.tstr))

            genes = genes.annotate(gencode=genes.gencode.annotate(**{
                f"v{gencode_version}": genes.gencode[f"v{gencode_version}"].annotate(
                    transcripts=genes.gencode[f"v{gencode_version}"].transcripts.map(transcript_annotation))
            }))

        # Annotate genes with their MANE Select transcript
        genes = genes.annotate(mane_select_transcript=mane_select_transcripts[genes.gene_id])

    genes.describe()

    genes.write(args.output, overwrite=True)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--gencode",
        action="append",
        default=[],
        metavar=("version", "gtf_path", "canonical_transcripts_path"),
        nargs=3,
        required=True,
    )
    parser.add_argument("--hgnc")
    parser.add_argument("--min-partitions", type=int, default=32)
    parser.add_argument("--output", required=True)
    args = parser.parse_args()

    genes = None
    all_gencode_versions = [gencode_version for gencode_version, _, _ in args.gencode]

    for gencode_version, gtf_path, canonical_transcripts_path in args.gencode:
        gencode_genes = load_gencode_gene_models(gtf_path, min_partitions=args.min_partitions)

        # Canonical transcripts file is a TSV with two columns: gene ID and transcript ID and no header row
        canonical_transcripts = hl.import_table(
            canonical_transcripts_path, key="gene_id", min_partitions=args.min_partitions
        )
        gencode_genes = gencode_genes.annotate(
            canonical_transcript_id=canonical_transcripts[gencode_genes.gene_id].transcript_id
        )

        gencode_genes = gencode_genes.select(**{f"v{gencode_version}": gencode_genes.row_value})

        if not genes:
            genes = gencode_genes
        else:
            genes = genes.join(gencode_genes, "outer")

    genes = genes.select(gencode=genes.row_value)

    hgnc = hl.import_table(args.hgnc)

    # Fix for alternative HGNC column name
    try:
        hgnc = hgnc.rename({'Alias symbols': 'Synonyms'})
    except Exception:  # the column may already be named "Synonyms"
        pass

    hgnc = hgnc.select(
        hgnc_id=hgnc["HGNC ID"],
        symbol=hgnc["Approved symbol"],
        name=hgnc["Approved name"],
        previous_symbols=hgnc["Previous symbols"],
        synonyms=hgnc["Synonyms"],
        omim_id=hgnc["OMIM ID(supplied by OMIM)"],
        gene_id=hgnc["Ensembl ID(supplied by Ensembl)"],
    )
    hgnc = hgnc.key_by("gene_id")
    hgnc = hgnc.annotate(
        previous_symbols=hl.cond(
            hgnc.previous_symbols == "",
            hl.empty_array(hl.tstr),
            hgnc.previous_symbols.split(",").map(lambda s: s.strip()),
        ),
        synonyms=hl.cond(
            hgnc.synonyms == "",
            hl.empty_array(hl.tstr),
            hgnc.synonyms.split(",").map(lambda s: s.strip()),
        ),
    )

    genes = genes.annotate(**hgnc[genes.gene_id])
    genes = genes.annotate(symbol_source=hl.cond(hl.is_defined(genes.symbol), "hgnc", hl.null(hl.tstr)))

    # If an HGNC gene symbol was not present, use the symbol from Gencode
    for gencode_version in all_gencode_versions:
        genes = genes.annotate(
            symbol=hl.or_else(genes.symbol, genes.gencode[f"v{gencode_version}"].gene_symbol),
            symbol_source=hl.cond(
                hl.is_missing(genes.symbol) & hl.is_defined(genes.gencode[f"v{gencode_version}"].gene_symbol),
                f"gencode (v{gencode_version})",
                genes.symbol_source,
            ),
        )

    # Collect all fields that can be used to search by gene name
    genes = genes.annotate(
        symbol_upper_case=genes.symbol.upper(),
        search_terms=hl.empty_array(hl.tstr)
        .append(genes.symbol)
        .extend(genes.synonyms)
        .extend(genes.previous_symbols),
    )
    for gencode_version in all_gencode_versions:
        genes = genes.annotate(
            search_terms=hl.rbind(
                genes.gencode[f"v{gencode_version}"].gene_symbol,
                lambda symbol_in_gencode: hl.cond(
                    hl.is_defined(symbol_in_gencode),
                    genes.search_terms.append(symbol_in_gencode),
                    genes.search_terms,
                ),
            )
        )
    genes = genes.annotate(search_terms=hl.set(genes.search_terms.map(lambda s: s.upper())))

    genes.describe()

    genes.write(args.output, overwrite=True)
# Merging the female PAR region with the male PAR region
mt_x_par = femaleX_par.union_cols(maleX_par)

# Annotating the matrix tables with variant QC data
mt_x_list = [hl.variant_qc(mt, name='variant_qc') for mt in mt_x_list]

intervals = [
    hl.parse_locus_interval(x)
    for x in ['X:60001-2699520', 'X:154931044-155260560']
]
mt_x_list = [hl.filter_intervals(mt, intervals, keep=True) for mt in mt_x_list]

# Creating lists for variant call rate, hwe pval and maf in joined mt
mt_x_par = mt_x_par.annotate_rows(var_call_rate=hl.empty_array('float64'))
mt_x_par = mt_x_par.annotate_rows(hwe_pval=hl.empty_array('float64'))
mt_x_par = mt_x_par.annotate_rows(maf=hl.empty_array('float64'))

# Annotating var_call_rate list with variant call rates
for mt_next in mt_x_list:
    mt_x_par = mt_x_par.annotate_rows(
        var_call_rate=mt_x_par.var_call_rate.append(
            mt_next.index_rows(mt_x_par.row_key).variant_qc.call_rate))

# Annotating hwe_pval list with hwe pvals
for mt_next in mt_x_list:
    mt_x_par = mt_x_par.annotate_rows(
        hwe_pval=mt_x_par.hwe_pval.append(
            mt_next.index_rows(mt_x_par.row_key).variant_qc.p_value_hwe))

# Annotating maf list with mafs
        mt = mt_list[i].union_cols(mt_list[i + 1])
    else:
        mt0 = mt.union_cols(mt_list[i + 1])
        mt = mt0

# In[94]:

# Creating a list of site IDs to annotate globals - for the purpose of keeping
# track of the order of variant data in variant arrays
siteIDs = hl.array(['ID', 'ID', 'ID'])
mt = mt.annotate_globals(location=siteIDs)

# In[95]:

# Creating lists for variant call rate, hwe pval and maf in joined mt
mt = mt.annotate_rows(var_call_rate=hl.empty_array('float64'))
mt = mt.annotate_rows(hwe_pval=hl.empty_array('float64'))
mt = mt.annotate_rows(maf=hl.empty_array('float64'))

# In[96]:

# Annotating var_call_rate list with variant call rates for each location
for mt_next in mt_list:
    mt = mt.annotate_rows(var_call_rate=mt.var_call_rate.append(
        mt_next.index_rows(mt.row_key).variant_qc.call_rate))

# In[98]:

# Annotating hwe_pval list with hwe pvals for each location
for mt_next in mt_list:
    mt = mt.annotate_rows(hwe_pval=mt.hwe_pval.append(
        mt_next.index_rows(mt.row_key).variant_qc.p_value_hwe))
def infer_families(
    kin_ht: hl.Table,  # the kinship hail table
    sex: Dict[str, bool],  # the dictionary of sexes
    i_col: str = 'i',  # the rest of these are defaults that can be overridden if needed
    j_col: str = 'j',
    pi_hat_col: str = 'pi_hat',
    ibd2_col: str = 'ibd2',
    ibd1_col: str = 'ibd1',
    ibd0_col: str = 'ibd0',
    first_degree_threshold: Tuple[float, float] = (0.4, 0.75),
    second_degree_threshold: Tuple[float, float] = (0.195, 0.3),
    ibd1_second_degree_threshold: float = 0.40,
    ibd2_parent_offspring_threshold: float = 0.30,
    ibd1_parent_offspring_threshold: float = 0.70,
    ibd0_parent_offspring_threshold: float = 0.15,
) -> hl.Pedigree:
    """
    Infers familial relationships from the results of pc_relate and sex information.
    Note that both kinship and ibd2 are needed in the pc_relate output.

    This function returns a pedigree containing trios inferred from the data. Family ID
    can be the same for multiple trios if one or more members of the trios are related
    (e.g. sibs, multi-generational family). Trios are ordered by family ID.

    Note that this function only returns complete trios defined as:
    one child, one father and one mother (sex is required for both parents).

    :param Table kin_ht: pc_relate output table
    :param dict of str -> bool sex: A dict containing the sex for each sample. True = female, False = male, None = unknown
    :param str i_col: Column containing the 1st sample id in the ibd table
    :param str j_col: Column containing the 2nd sample id in the ibd table
    :param str pi_hat_col: Column containing the pi_hat in the ibd table
    :param str ibd2_col: Column containing ibd2 in the pc_relate table
    :param (float, float) first_degree_threshold: Lower/upper bounds for kin for 1st degree relatives
    :param (float, float) second_degree_threshold: Lower/upper bounds for kin for 2nd degree relatives
    :param float ibd2_parent_offspring_threshold: Upper bound on ibd2 for a parent/offspring pair
    :return: Pedigree containing all trios in the data
    :rtype: Pedigree
    """

    def get_fam_samples(
        sample: str,
        fam: Set[str],
        samples_rel: Dict[str, Set[str]],
    ) -> Set[str]:
        """
        Given a sample, its known family, and a dict that links samples with their
        relatives, returns the set of samples that constitute this sample's family.

        :param str sample: sample
        :param set of str fam: sample's known family
        :param dict of str -> set of str samples_rel: dict linking each sample to its relatives
        :return: Family including the sample
        :rtype: set of str
        """
        fam.add(sample)  # usually this starts out as a blank set except for the case two lines below
        for s2 in samples_rel[sample]:  # iterate through the sample's relatives
            if s2 not in fam:
                # recurse to pull in samples related to s2 that the original sample
                # may not be directly related to
                fam = get_fam_samples(s2, fam, samples_rel)
        return fam

    def get_indexed_ibd(pc_relate_rows: List[hl.Struct]) -> Dict[Tuple[str, str], float]:
        """
        Given rows from a pc_relate table, creates dicts with:
        keys: Pairs of individuals, lexically ordered
        values: ibd2, ibd1, ibd0

        :param list of hl.Struct pc_relate_rows: Rows from a pc_relate table
        :return: Dicts of lexically ordered pairs of individuals -> ibd2, ibd1 and ibd0
        :rtype: dict of (str, str) -> float
        """
        ibd2 = dict()
        ibd1 = dict()
        ibd0 = dict()
        for row in pc_relate_rows:
            pair = tuple(sorted((row[i_col], row[j_col])))
            ibd2[pair] = row[ibd2_col]  # the ibd2 value for every sample pair
            ibd1[pair] = row[ibd1_col]  # the ibd1 value for every sample pair
            ibd0[pair] = row[ibd0_col]  # the ibd0 value for every sample pair
        return ibd2, ibd1, ibd0

    def get_parents(
        possible_parents: List[str],
        relative_pairs: List[Tuple[str, str]],
        sex: Dict[str, bool],
    ) -> Union[Tuple[str, str], None]:
        """
        Given a list of possible parents for a sample (first degree relatives with low
        ibd2), looks for a single pair of samples that are unrelated to each other and
        of different sexes. If a single pair is found, returns the pair (father, mother).

        :param list of str possible_parents: Possible parents
        :param list of (str, str) relative_pairs: Pairs of relatives, used to check that parents aren't related to each other
        :param dict of str -> bool sex: Dict mapping samples to their sex (True = female, False = male, None or missing = unknown)
        :return: (father, mother) if found, `None` otherwise
        :rtype: (str, str) or None
        """
        parents = []
        logging.info(f"You have {len(possible_parents)} possible parent(s)")
        while len(possible_parents) > 1:  # go through the entire list of possible parents
            p1 = possible_parents.pop()  # start with the first possible parent
            for p2 in possible_parents:
                logging.info(str(tuple(sorted((p1, p2)))) + '\n')
                # to what degree is a "relative"? will this work for grandparent, mom, child?
                if tuple(sorted((p1, p2))) not in relative_pairs:
                    logging.info("your potential parents don't appear to be relatives\n")
                    logging.info("SEX p1: " + str(sex.get(p1)) + '\n')
                    logging.info("SEX p2: " + str(sex.get(p2)) + '\n')
                    if sex.get(p1) is False and sex.get(p2):
                        parents.append((p1, p2))
                        logging.info("found in order 1\n")
                    elif sex.get(p1) and sex.get(p2) is False:
                        parents.append((p2, p1))
                        logging.info("found in order 2\n")
                else:
                    logging.info("Your parents are related!!!\n\n")

        if len(parents) == 1:
            logging.info("Found your parents!\n")
            return parents[0]

        return None

    # Duplicated samples to remove (if not provided, this function won't work, as it
    # assumes that each child has exactly two parents)
    duplicated_samples = set()
    try:
        dups = hl.literal(duplicated_samples)
    except Exception:  # hl.literal cannot infer the type of an empty set
        dups = hl.empty_array(hl.tstr)

    first_degree_pairs = kin_ht.filter(
        (kin_ht[pi_hat_col] >= first_degree_threshold[0])
        & (kin_ht[pi_hat_col] <= first_degree_threshold[1])
        & ~dups.contains(kin_ht[i_col])
        & ~dups.contains(kin_ht[j_col])  # so not including any duplicated samples
    ).collect()
    first_degree_relatives = defaultdict(set)
    for row in first_degree_pairs:
        # build, for every sample, the set of samples related to it in the first degree
        first_degree_relatives[row[i_col]].add(row[j_col])
        first_degree_relatives[row[j_col]].add(row[i_col])

    # Add second degree relatives for those samples
    # This is needed to distinguish grandparent - child - parent from child - mother, father down the line
    first_degree_samples = hl.literal(set(first_degree_relatives.keys()))
    second_degree_samples = kin_ht.filter(
        ((kin_ht[pi_hat_col] >= first_degree_threshold[0])
         & (kin_ht[pi_hat_col] <= first_degree_threshold[1]))
        | ((kin_ht[pi_hat_col] >= second_degree_threshold[0])
           & (kin_ht[ibd1_col] >= ibd1_second_degree_threshold)
           & (kin_ht[pi_hat_col] < second_degree_threshold[1]))
    ).collect()

    # the ibd values for every sample pair
    ibd2, ibd1, ibd0 = get_indexed_ibd(second_degree_samples)

    fam_id = 1
    trios = []
    duos = []
    decisions = {}
    while len(first_degree_relatives) > 0:
        # feed in the entire dictionary; it gets keyed down to a single sample inside the function anyway
        s_fam = get_fam_samples(list(first_degree_relatives)[0], set(), first_degree_relatives)
        for s in s_fam:
            logging.info(f"Processing sample: {s}")
            # popping keeps the [0] index above pointing at an unprocessed sample
            s_rel = first_degree_relatives.pop(s)
            possible_parents = []
            for rel in s_rel:  # s_rel is the set of everyone s is related to in the first degree
                # keep pairs whose ibd values fall within the parent/offspring thresholds
                if (ibd2[tuple(sorted((s, rel)))] <= ibd2_parent_offspring_threshold) & \
                        (ibd1[tuple(sorted((s, rel)))] >= ibd1_parent_offspring_threshold) & \
                        (ibd0[tuple(sorted((s, rel)))] <= ibd0_parent_offspring_threshold):
                    possible_parents.append(rel)

            # these will be the proband-offspring only pairs
            if len(possible_parents) == 1:
                duos.append(sorted((s, possible_parents[0])))
                decisions[s] = possible_parents[0]
            else:
                parents = get_parents(possible_parents, list(ibd2.keys()), sex)
                decisions[s] = parents
                if parents is not None:
                    # just formatting the trio output here
                    trios.append(hl.Trio(
                        s=s,
                        fam_id=str(fam_id),
                        pat_id=parents[0],
                        mat_id=parents[1],
                        is_female=sex.get(s)))

        fam_id += 1

    return hl.Pedigree(trios), duos, decisions
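# Hypothetical call (inputs illustrative, not from the original source):
# `relatedness_ht` is a pc_relate output table with pi_hat/ibd0/ibd1/ibd2
# columns and `sample_sex` maps sample ID -> is_female:
#
#     pedigree, duos, decisions = infer_families(relatedness_ht, sample_sex)
#     pedigree.write("inferred_trios.fam")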
def field_to_array(ds, field):
    return hl.if_else(ds[field] != 0, hl.array([field]), hl.empty_array(hl.tstr))
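# For illustration (the field names here are made up): such a helper can be
# combined with hl.flatten to build a single flags array from several
# 0/1 indicator fields.
ht_demo = hl.Table.parallelize([hl.struct(lc_lof=1, lof_flag=0)])
ht_demo = ht_demo.annotate(flags=hl.flatten(hl.array(
    [field_to_array(ht_demo, field) for field in ["lc_lof", "lof_flag"]])))
# flags is ['lc_lof'] for this row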
def samples_qc(mt, mt_to_annotate, args):
    """
    Performs samples QC on a matrix table, flagging samples failing on chimera and
    contamination %, as well as samples more than args.sampleqc_sd_threshold
    standard deviations from the batch mean on TiTv, het/homvar, and
    insertion/deletion ratios and n_singletons, for a specific batch or cohort.
    The returned matrix table is annotated, not filtered.

    :param mt: matrix table, low-pass failing variants and genotypes filtered out
    :param mt_to_annotate: matrix table to annotate with failing samples
        information after calculating on filtered mt
    :param args: parsed command-line arguments with QC column names and thresholds
    :return: returns annotated, unfiltered matrix table
    """
    datestr = time.strftime("%Y.%m.%d")

    # Run variant QC to get up-to-date variant QC metrics for samples QC
    mt = hl.sample_qc(mt)

    # Pull data to cols and checkpoint
    mt_cols = mt.cols()
    mt_cols = mt_cols.checkpoint("samples_qc_cols_tmp.ht", overwrite=True)

    # Instantiate empty array for failing samples QC tags
    mt_cols = mt_cols.annotate(failing_samples_qc=hl.empty_array(hl.tstr))

    ############################################################
    # Find samples failing on chimeras or contamination values #
    ############################################################
    mt_cols = mt_cols.annotate(failing_samples_qc=hl.cond(
        (mt_cols[args.chimeras_col] > args.chimeras_max)
        & hl.is_defined(mt_cols[args.chimeras_col]),
        mt_cols.failing_samples_qc.append("failing_chimeras"),
        mt_cols.failing_samples_qc))

    mt_cols = mt_cols.annotate(failing_samples_qc=hl.cond(
        (mt_cols[args.contamination_col] > args.contamination_max)
        & hl.is_defined(mt_cols[args.contamination_col]),
        mt_cols.failing_samples_qc.append("failing_contamination"),
        mt_cols.failing_samples_qc))

    failing_chim = mt_cols.aggregate(
        hl.agg.count_where(
            mt_cols.failing_samples_qc.contains("failing_chimeras")))
    miss_chim = mt_cols.aggregate(
        hl.agg.count_where(~(hl.is_defined(mt_cols[args.chimeras_col]))))
    failing_contam = mt_cols.aggregate(
        hl.agg.count_where(
            mt_cols.failing_samples_qc.contains("failing_contamination")))
    miss_contam = mt_cols.aggregate(
        hl.agg.count_where(~(hl.is_defined(mt_cols[args.contamination_col]))))

    logging.info(
        f"Number of samples failing on chimeras % > {args.chimeras_max}: {failing_chim}")
    logging.info(f"Number of samples missing chimeras %: {miss_chim}")
    logging.info(
        f"Number of samples failing on contamination % > {args.contamination_max}: {failing_contam}")
    logging.info(f"Number of samples missing contamination %: {miss_contam}")

    chim_stats = mt_cols.aggregate(hl.agg.stats(mt_cols[args.chimeras_col]))
    cont_stats = mt_cols.aggregate(hl.agg.stats(mt_cols[args.contamination_col]))
    logging.info(f"Chimeras statistics: {chim_stats}")
    logging.info(f"Contamination statistics: {cont_stats}")

    ###############################################
    # Find samples failing on sex-aware call rate #
    ###############################################
    if args.sample_call_rate is not None:
        mt_cols = mt_cols.annotate(failing_samples_qc=hl.cond(
            (mt_cols.sexaware_sample_call_rate < args.sample_call_rate)
            & hl.is_defined(mt_cols.sexaware_sample_call_rate),
            mt_cols.failing_samples_qc.append("failing_sexaware_sample_call_rate"),
            mt_cols.failing_samples_qc))

        mt_cols = mt_cols.annotate(failing_samples_qc=hl.cond(
            ~(hl.is_defined(mt_cols.sexaware_sample_call_rate)),
            mt_cols.failing_samples_qc.append("missing_sexaware_sample_call_rate"),
            mt_cols.failing_samples_qc))

        failing_cr = mt_cols.aggregate(
            hl.agg.count_where(
                mt_cols.failing_samples_qc.contains("failing_sexaware_sample_call_rate")))
        missing_cr = mt_cols.aggregate(
            hl.agg.count_where(
                mt_cols.failing_samples_qc.contains("missing_sexaware_sample_call_rate")))

        logging.info(
            f"Number of samples failing on sex-aware call rate > {args.sample_call_rate}: {failing_cr}")
        logging.info(
            f"Number of samples missing sex-aware call rate: {missing_cr}")

        cr_stats = mt_cols.aggregate(
            hl.agg.stats(mt_cols.sexaware_sample_call_rate))
        logging.info(f"Sex-aware call rate statistics: {cr_stats}")

    ######################################################################################
    # Find samples failing per-cohort on titv, het_homvar ratio, indel, and n_singletons #
    ######################################################################################
    if args.batch_col_name is not None:
        batch_none = mt_cols.aggregate(
            hl.agg.count_where(~(hl.is_defined(mt_cols[args.batch_col_name]))))
        mt_cols = mt_cols.annotate(**{
            args.batch_col_name:
            hl.or_else(mt_cols[args.batch_col_name], "no_batch_info")})

        if batch_none > 0:
            logging.info(
                f"Warning: {batch_none} samples have batch undefined. These samples will be "
                f"grouped in one batch for sample QC (named no_batch_info).")
            # Show the samples assigned to the catch-all batch
            # (mt_cols is a Table, so filter rather than filter_cols)
            mt_cols.filter(
                mt_cols[args.batch_col_name] == "no_batch_info").s.show(batch_none + 1)

        batch_set = mt_cols.aggregate(
            hl.agg.collect_as_set(mt_cols[args.batch_col_name]))
    else:
        args.batch_col_name = "mock_batch_col"
        mt_cols = mt_cols.annotate(mock_batch_col="all")
        batch_set = ["all"]

    # Convert batch strings to numeric values, create label for plotting
    batch_set_numeric = list(range(len(batch_set)))
    batch_key = list(zip(batch_set, batch_set_numeric))

    mt_cols = mt_cols.annotate(plot_batch=0)
    for batch in batch_key:
        mt_cols = mt_cols.annotate(
            plot_batch=hl.cond(mt_cols[args.batch_col_name] == batch[0],
                               batch[1], mt_cols.plot_batch))
    mt_cols = mt_cols.annotate(
        plot_batch_jitter=mt_cols.plot_batch + hl.rand_unif(-0.3, 0.3))

    batch_thresholds = {}
    batch_statistics = {}
    for measure in ['r_ti_tv', 'r_het_hom_var', 'r_insertion_deletion', 'n_singleton']:
        logging.info(f"Performing sample QC for measure {measure}")

        # Instantiate/reset box plot label
        mt_cols = mt_cols.annotate(boxplot_label=mt_cols[args.batch_col_name])

        batch_thresholds[measure] = {}
        batch_statistics[measure] = {}

        mt_cols = mt_cols.annotate(failing_samples_qc=hl.cond(
            ~(hl.is_defined(mt_cols.sample_qc[measure])),
            mt_cols.failing_samples_qc.append(f"missing_{measure}"),
            mt_cols.failing_samples_qc))

        for batch in batch_set:
            # See if values exist at all for all values
            defined_values = mt_cols.aggregate(
                hl.agg.count_where(hl.is_defined(mt_cols.sample_qc[measure])))

            if defined_values > 0:
                # Get mean and standard deviation for each measure, for each batch's samples
                stats = mt_cols.aggregate(
                    hl.agg.filter(mt_cols[args.batch_col_name] == batch,
                                  hl.agg.stats(mt_cols.sample_qc[measure])))

                # Get cutoffs for each measure
                cutoff_upper = stats.mean + (args.sampleqc_sd_threshold * stats.stdev)
                cutoff_lower = stats.mean - (args.sampleqc_sd_threshold * stats.stdev)

                if measure == "n_singleton":
                    logging.info(
                        f"Max number of singletons for batch {batch}: {stats.max}")

                mt_cols = mt_cols.annotate(failing_samples_qc=hl.cond(
                    ((mt_cols.sample_qc[measure] > cutoff_upper)
                     | (mt_cols.sample_qc[measure] < cutoff_lower))
                    & hl.is_defined(mt_cols.sample_qc[measure])
                    & (mt_cols[args.batch_col_name] == batch),
                    mt_cols.failing_samples_qc.append(f"failing_{measure}"),
                    mt_cols.failing_samples_qc))

                mt_cols = mt_cols.annotate(boxplot_label=hl.cond(
                    ((mt_cols.sample_qc[measure] > cutoff_upper)
                     | (mt_cols.sample_qc[measure] < cutoff_lower))
                    & hl.is_defined(mt_cols.sample_qc[measure])
                    & (mt_cols[args.batch_col_name] == batch),
                    "outlier", mt_cols.boxplot_label))

                # Collect thresholds and statistics for each batch
                batch_thresholds[measure][batch] = {
                    'min_thresh': cutoff_lower, 'max_thresh': cutoff_upper}
                batch_statistics[measure][batch] = stats
            else:
                logging.error(
                    f"Error: no defined values for measure {measure}. NAs can be introduced "
                    f"by division by zero. Samples not filtered on {measure}!")

        # Create plot for measure for each batch
        output_file(f"{datestr}_samples_qc_plots_{measure}.html")
        p = hl.plot.scatter(mt_cols.plot_batch_jitter,
                            mt_cols.sample_qc[measure],
                            label=mt_cols.boxplot_label,
                            title=f"{measure} values split by batch.")
        save(p)

    ##########################
    # Report failing samples #
    ##########################
    for measure in ['r_ti_tv', 'r_het_hom_var', 'r_insertion_deletion', 'n_singleton']:
        failing_count = mt_cols.aggregate(
            hl.agg.count_where(
                mt_cols.failing_samples_qc.contains(f"failing_{measure}")))
        missing_count = mt_cols.aggregate(
            hl.agg.count_where(
                mt_cols.failing_samples_qc.contains(f"missing_{measure}")))
        logging.info(f"Number of samples failing on {measure}: {failing_count}")
        logging.info(f"Number of samples missing {measure}: {missing_count}")

    failing_any = mt_cols.aggregate(
        hl.agg.count_where(hl.len(mt_cols.failing_samples_qc) != 0))
    logging.info(
        f"Number of samples failing samples QC on any measure: {failing_any}")

    if args.pheno_col is not None:
        cases_failing = mt_cols.aggregate(
            hl.agg.filter(
                mt_cols[args.pheno_col] == True,
                hl.agg.count_where(hl.len(mt_cols.failing_samples_qc) != 0)))
        controls_failing = mt_cols.aggregate(
            hl.agg.filter(
                mt_cols[args.pheno_col] == False,
                hl.agg.count_where(hl.len(mt_cols.failing_samples_qc) != 0)))
        logging.info(f"Cases failing QC: {cases_failing}")
        logging.info(f"Controls failing QC: {controls_failing}")

    #######################################################################################################
    # Annotate original (unfiltered) matrix table with failing samples QC information + sample QC measure #
    #######################################################################################################
    mt_to_annotate = mt_to_annotate.annotate_cols(
        sample_qc=mt_cols[mt_to_annotate.s].sample_qc)
    mt_to_annotate = mt_to_annotate.annotate_cols(
        failing_samples_qc=mt_cols[mt_to_annotate.s].failing_samples_qc)

    mt_to_annotate = mt_to_annotate.annotate_globals(
        samples_qc_stats_batches=batch_statistics)
    mt_to_annotate = mt_to_annotate.annotate_globals(
        samples_qc_stats_chim_cont={
            'chimeras': chim_stats,
            'contamination': cont_stats})
    mt_to_annotate = mt_to_annotate.annotate_globals(
        samples_qc_thresholds={
            'chimeras_max': str(args.chimeras_max),
            'contamination_max': str(args.contamination_max),
            'deviation_multiplier_threshold': str(args.sampleqc_sd_threshold),
            'batches': str(batch_set),
            'batch_cohort_name': str(args.batch_col_name)})
    mt_to_annotate = mt_to_annotate.annotate_globals(
        samples_qc_batch_thresholds=batch_thresholds)

    return mt_to_annotate
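# Hedged usage sketch for samples_qc. The column names and thresholds below are
# illustrative assumptions; in this pipeline `args` is the parsed argparse
# namespace.
#
#     args = argparse.Namespace(
#         chimeras_col="pct_chimeras", chimeras_max=0.05,
#         contamination_col="freemix", contamination_max=0.05,
#         sample_call_rate=0.97, batch_col_name="batch",
#         sampleqc_sd_threshold=4, pheno_col=None,
#     )
#     mt_annotated = samples_qc(mt_filtered, mt_raw, args)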
ds = ds.annotate(filters=ds.filters.difference(hl.set(["MULTIALLELIC"])))

# Group gene lists for all consequences in a struct
ds = ds.annotate(
    consequences=hl.struct(
        **{
            csq.lower(): ds.info[f"PROTEIN_CODING__{csq}"]
            for csq in protein_coding_consequences
            if csq != "INTERGENIC" and csq != "NEAREST_TSS"
        }
    )
)
ds = ds.annotate(intergenic=ds.info.PROTEIN_CODING__INTERGENIC)

# Collect set of all genes for which a variant has a consequence
all_genes = hl.empty_array(hl.tstr)
for csq in ds.consequences.dtype.fields:
    all_genes = all_genes.extend(
        hl.or_else(ds.consequences[csq.lower()], hl.empty_array(hl.tstr))
    )
ds = ds.annotate(genes=hl.set(all_genes))

# Group per-population values in a struct for each field
def expr_for_per_population_field(row, field):
    return hl.struct(
        **dict(
            ((pop.lower(), row.info[f"{pop}_{field}"]) for pop in populations),
            total=row.info[field],
        )
    )
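# Hedged usage sketch for expr_for_per_population_field, assuming `populations`
# is the list of population codes defined elsewhere in this module and that the
# INFO fields follow the "<POP>_<FIELD>" naming used above (the field name "AN"
# is an assumption):
#
#     ds = ds.annotate(allele_number=expr_for_per_population_field(ds, "AN"))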
def import_structural_variants(vcf_path):
    ds = hl.import_vcf(vcf_path, force_bgz=True, min_partitions=32).rows()

    ds = ds.annotate(
        **{field.lower(): ds.info[field] for field in TOP_LEVEL_INFO_FIELDS})

    ds = ds.annotate(
        variant_id=ds.rsid.replace("^gnomAD-SV_v2.1_", ""),
        reference_genome="GRCh37",
        # Start
        chrom=ds.locus.contig,
        pos=ds.locus.position,
        xpos=x_position(ds.locus.contig, ds.locus.position),
        # End
        end=ds.info.END,
        xend=x_position(ds.locus.contig, ds.info.END),
        # Start 2
        chrom2=ds.info.CHR2,
        pos2=ds.info.POS2,
        xpos2=x_position(ds.info.CHR2, ds.info.POS2),
        # End 2
        end2=ds.info.END2,
        xend2=x_position(ds.info.CHR2, ds.info.END2),
        # Other
        length=ds.info.SVLEN,
        type=ds.info.SVTYPE,
        alts=ds.alleles[1:],
    )

    # MULTIALLELIC should not be used as a quality filter in the browser
    ds = ds.annotate(filters=ds.filters.difference(hl.set(["MULTIALLELIC"])))

    # Group gene lists for all consequences in one field
    ds = ds.annotate(consequences=hl.array([
        hl.struct(
            consequence=csq.lower(),
            genes=hl.or_else(ds.info[f"PROTEIN_CODING__{csq}"],
                             hl.empty_array(hl.tstr)),
        )
        for csq in RANKED_CONSEQUENCES
        if csq not in ("INTERGENIC", "NEAREST_TSS")
    ]).filter(lambda csq: hl.len(csq.genes) > 0))

    ds = ds.annotate(intergenic=ds.info.PROTEIN_CODING__INTERGENIC)

    ds = ds.annotate(major_consequence=hl.rbind(
        ds.consequences.find(lambda csq: hl.len(csq.genes) > 0),
        lambda csq: hl.or_else(csq.consequence,
                               hl.or_missing(ds.intergenic, "intergenic")),
    ))

    # Collect set of all genes for which a variant has a consequence
    ds = ds.annotate(genes=hl.set(ds.consequences.flatmap(lambda c: c.genes)))

    # Group per-population frequency values
    ds = ds.annotate(freq=hl.struct(
        **{field.lower(): ds.info[field] for field in FREQ_FIELDS},
        populations=[
            hl.struct(id=pop, **{
                field.lower(): ds.info[f"{pop}_{field}"]
                for field in FREQ_FIELDS
            })
            for pop in DIVISIONS
        ],
    ))

    # For MCNVs, store per-copy number allele counts
    ds = ds.annotate(freq=ds.freq.annotate(copy_numbers=hl.or_missing(
        ds.type == "MCNV",
        hl.zip_with_index(ds.alts).map(lambda pair: hl.rbind(
            pair[0],
            pair[1],
            lambda index, alt: hl.struct(
                # Extract copy number. Example: get 2 from "<CN=2>"
                copy_number=hl.int(alt[4:-1]),
                ac=ds.freq.ac[index],
            ),
        )),
    )))

    # For MCNVs, sum AC/AF for all alt alleles except CN=2
    ds = ds.annotate(freq=ds.freq.annotate(
        ac=hl.if_else(ds.type == "MCNV",
                      sum_mcnv_ac_or_af(ds.alts, ds.freq.ac),
                      ds.freq.ac[0]),
        af=hl.if_else(ds.type == "MCNV",
                      sum_mcnv_ac_or_af(ds.alts, ds.freq.af),
                      ds.freq.af[0]),
        populations=hl.if_else(
            ds.type == "MCNV",
            ds.freq.populations.map(lambda pop: pop.annotate(
                ac=sum_mcnv_ac_or_af(ds.alts, pop.ac),
                af=sum_mcnv_ac_or_af(ds.alts, pop.af),
            )),
            ds.freq.populations.map(
                lambda pop: pop.annotate(ac=pop.ac[0], af=pop.af[0])),
        ),
    ))

    # Add hemizygous frequencies
    ds = ds.annotate(hemizygote_count=hl.dict(
        [(
            pop_id,
            hl.if_else(((ds.chrom == "X") | (ds.chrom == "Y")) & ~ds.par,
                       ds.info[f"{pop_id}_MALE_N_HEMIALT"], 0),
        ) for pop_id in POPULATIONS]
        + [(f"{pop_id}_FEMALE", 0) for pop_id in POPULATIONS]
        + [(
            f"{pop_id}_MALE",
            hl.if_else(((ds.chrom == "X") | (ds.chrom == "Y")) & ~ds.par,
                       ds.info[f"{pop_id}_MALE_N_HEMIALT"], 0),
        ) for pop_id in POPULATIONS]
        + [("FEMALE", 0)]
        + [("MALE",
            hl.if_else(((ds.chrom == "X") | (ds.chrom == "Y")) & ~ds.par,
                       ds.info.MALE_N_HEMIALT, 0))]))

    ds = ds.annotate(freq=ds.freq.annotate(
        hemizygote_count=hl.or_missing(
            ds.type != "MCNV",
            hl.if_else(((ds.chrom == "X") | (ds.chrom == "Y")) & ~ds.par,
                       ds.info.MALE_N_HEMIALT, 0),
        ),
        populations=hl.if_else(
            ds.type != "MCNV",
            ds.freq.populations.map(lambda pop: pop.annotate(
                hemizygote_count=ds.hemizygote_count[pop.id])),
            ds.freq.populations.map(
                lambda pop: pop.annotate(hemizygote_count=hl.null(hl.tint))),
        ),
    ))
    ds = ds.drop("hemizygote_count")

    # Rename n_homalt
    ds = ds.annotate(freq=ds.freq.annotate(
        homozygote_count=ds.freq.n_homalt,
        populations=ds.freq.populations.map(lambda pop: pop.annotate(
            homozygote_count=pop.n_homalt).drop("n_homalt")),
    ).drop("n_homalt"))

    # Re-key
    ds = ds.key_by("variant_id")
    ds = ds.drop("locus", "alleles", "info", "rsid")

    return ds
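# sum_mcnv_ac_or_af (used above) is defined elsewhere; a minimal sketch, under
# the assumption that it sums per-allele AC/AF values while skipping the
# copy-number-neutral <CN=2> allele, might look like this (hypothetical
# implementation, not the project's own):
def _example_sum_mcnv_ac_or_af(alts, values):
    # Pair each alt allele with its index, drop the <CN=2> allele,
    # then sum the AC/AF values at the remaining indices
    return hl.sum(
        hl.zip_with_index(alts)
        .filter(lambda pair: pair[1] != "<CN=2>")
        .map(lambda pair: values[pair[0]])
    )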
def import_gnomad_v2_lof_curation_results(curation_result_paths, genes_path):
    all_flags = set()

    with hl.hadoop_open("/tmp/import_temp.tsv", "w") as temp_output_file:
        writer = csv.writer(temp_output_file, delimiter="\t", quotechar='"')
        writer.writerow(["chrom", "position", "ref", "alt", "genes",
                         "verdict", "flags", "project_index"])

        for project_index, path in enumerate(curation_result_paths):
            with hl.hadoop_open(path, "r") as input_file:
                reader = csv.DictReader(input_file)

                # Strip the "Flag " prefix from flag column names
                # (str.lstrip strips a character set, not a prefix, so it
                # would mangle flag names starting with F/l/a/g)
                raw_dataset_flags = [
                    f[len("Flag "):] for f in reader.fieldnames
                    if f.startswith("Flag ")
                ]

                dataset_flags = [FLAG_MAPPING.get(f, f) for f in raw_dataset_flags]
                all_flags = all_flags.union(set(dataset_flags))

                for row in reader:
                    [chrom, pos, ref, alt] = row["Variant ID"].split("-")

                    variant_flags = [
                        FLAG_MAPPING.get(f, f) for f in raw_dataset_flags
                        if row[f"Flag {f}"] == "TRUE"
                    ]

                    genes = [
                        gene_id for (gene_id, gene_symbol)
                        in (gene.split(":") for gene in row["Gene"].split(";"))
                    ]

                    verdict = row["Verdict"]
                    # Correct a known misspelling in the source data
                    if verdict == "inufficient_evidence":
                        verdict = "insufficient_evidence"

                    verdict = VERDICT_MAPPING[verdict]

                    output_row = [
                        chrom,
                        pos,
                        ref,
                        alt,
                        ",".join(genes),
                        verdict,
                        ",".join(variant_flags),
                        project_index,
                    ]

                    writer.writerow(output_row)

    ds = hl.import_table("/tmp/import_temp.tsv")

    ds = ds.transmute(
        locus=hl.locus(ds.chrom, hl.int(ds.position)),
        alleles=[ds.ref, ds.alt],
    )

    ds = ds.annotate(
        genes=ds.genes.split(","),
        flags=hl.set(hl.if_else(ds.flags == "",
                                hl.empty_array(hl.tstr),
                                ds.flags.split(","))),
    )

    ds = ds.explode(ds.genes, name="gene_id")

    genes = hl.read_table(genes_path)
    ds = ds.annotate(
        gene_symbol=genes[ds.gene_id].symbol,
        gene_version=genes[ds.gene_id].gene_version,
    )

    # Keep one curation per (locus, alleles, gene), preferring results from
    # projects listed earlier in curation_result_paths
    ds = ds.group_by(ds.locus, ds.alleles, ds.gene_id).aggregate(
        result=hl.agg.take(ds.row.drop("locus", "alleles", "gene_id"),
                           1, ordering=ds.project_index)
    )
    ds = ds.annotate(**ds.result[0]).drop("result", "project_index")

    ds = ds.group_by("locus", "alleles").aggregate(
        lof_curations=hl.agg.collect(ds.row.drop("locus", "alleles")))

    ds = ds.annotate(variant_id=variant_id(ds.locus, ds.alleles))

    for flag in sorted(list(all_flags)):
        print(flag)

    return ds
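# Hedged usage sketch (all paths are hypothetical):
#
#     lof_curations = import_gnomad_v2_lof_curation_results(
#         ["gs://example/lof_curation_project_1.csv",
#          "gs://example/lof_curation_project_2.csv"],
#         genes_path="gs://example/genes.ht",
#     )
#     lof_curations.write("gs://example/lof_curation_results.ht", overwrite=True)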
omim = omim.select(
    gene_id=omim["Ensembl Gene ID"],
    omim_accession=omim["MIM Gene Accession"],
    omim_description=omim["MIM Gene Description"],
)
omim = omim.key_by("gene_id")

genes = genes.annotate(**omim[genes.gene_id])

# Full names
dbnsfp = hl.import_table(args.dbnsfp_file, missing=".")
dbnsfp = dbnsfp.select(
    gene_id=dbnsfp["Ensembl_gene"],
    full_gene_name=dbnsfp["Gene_full_name"],
    other_names=hl.or_else(
        dbnsfp["Gene_old_names"].upper().split(";"),
        hl.empty_array(hl.tstr)).extend(
            hl.or_else(dbnsfp["Gene_other_names"].upper().split(";"),
                       hl.empty_array(hl.tstr))),
)
dbnsfp = dbnsfp.key_by("gene_id")

genes = genes.annotate(**dbnsfp[genes.gene_id])

genes.key_by().write(os.path.join(args.output_directory, "genes.ht"))

###############################################
# Transcripts                                 #
###############################################

transcripts = gencode.filter(gencode.feature == "transcript")
transcripts = transcripts.select(
    transcript_id=transcripts.transcript_id.split("\\.")[0],
def vep_struct_to_csq(
        vep_expr: hl.expr.StructExpression,
        csq_fields: str = VEP_CSQ_FIELDS) -> hl.expr.ArrayExpression:
    """
    Given a VEP Struct, returns an array of VEP VCF CSQ strings (one per
    consequence in the struct).

    The fields and their order will correspond to those passed in `csq_fields`,
    which corresponds to the VCF header that is required to interpret the VCF
    CSQ INFO field.

    Note that the order is flexible and that all fields in the default value
    are supported. These fields will be formatted in the same way as their VEP
    CSQ counterparts. Other fields can be added if their names match fields in
    the struct; their values will be the result of calling hl.str(), so they
    may differ from their usual VEP CSQ representation.

    :param vep_expr: The input VEP Struct
    :param csq_fields: The | delimited list of fields to include in the CSQ (in that order)
    :return: The corresponding CSQ strings
    """
    _csq_fields = [f.lower() for f in csq_fields.split("|")]

    def get_csq_from_struct(element: hl.expr.StructExpression,
                            feature_type: str) -> hl.expr.StringExpression:
        # Most fields are 1-1, just lowercase
        fields = dict(element)

        # Add general exceptions
        fields.update({
            "allele": element.variant_allele,
            "consequence": hl.delimit(element.consequence_terms, delimiter="&"),
            "feature_type": feature_type,
            "feature": (element.transcript_id
                        if "transcript_id" in element
                        else element.regulatory_feature_id
                        if "regulatory_feature_id" in element
                        else element.motif_feature_id
                        if "motif_feature_id" in element
                        else ""),
            "variant_class": vep_expr.variant_class,
        })

        # Add exceptions for transcripts
        if feature_type == "Transcript":
            fields.update({
                "canonical": hl.cond(element.canonical == 1, "YES", ""),
                "ensp": element.protein_id,
                "gene": element.gene_id,
                "symbol": element.gene_symbol,
                "symbol_source": element.gene_symbol_source,
                "cdna_position": hl.str(element.cdna_start) + hl.cond(
                    element.cdna_start == element.cdna_end,
                    "",
                    "-" + hl.str(element.cdna_end),
                ),
                "cds_position": hl.str(element.cds_start) + hl.cond(
                    element.cds_start == element.cds_end,
                    "",
                    "-" + hl.str(element.cds_end),
                ),
                "protein_position": hl.str(element.protein_start) + hl.cond(
                    element.protein_start == element.protein_end,
                    "",
                    "-" + hl.str(element.protein_end),
                ),
                "sift": element.sift_prediction + "(" + hl.format(
                    "%.3f", element.sift_score) + ")",
                "polyphen": element.polyphen_prediction + "(" + hl.format(
                    "%.3f", element.polyphen_score) + ")",
                "domains": hl.delimit(
                    element.domains.map(lambda d: d.db + ":" + d.name), "&"),
            })
        elif feature_type == "MotifFeature":
            fields["motif_score_change"] = hl.format(
                "%.3f", element.motif_score_change)

        return hl.delimit(
            [hl.or_else(hl.str(fields.get(f, "")), "") for f in _csq_fields],
            "|")

    csq = hl.empty_array(hl.tstr)
    for feature_field, feature_type in [
        ("transcript_consequences", "Transcript"),
        ("regulatory_feature_consequences", "RegulatoryFeature"),
        ("motif_feature_consequences", "MotifFeature"),
        ("intergenic_consequences", "Intergenic"),
    ]:
        csq = csq.extend(
            hl.or_else(
                vep_expr[feature_field].map(
                    lambda x: get_csq_from_struct(x, feature_type=feature_type)),
                hl.empty_array(hl.tstr),
            ))

    return hl.or_missing(hl.len(csq) > 0, csq)
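# Hedged usage sketch for vep_struct_to_csq: pack the CSQ strings into an INFO
# field for VCF export. The assumption that `ht` is a locus/alleles-keyed table
# carrying a `vep` struct, and the output path parameter, are hypothetical.
def example_export_csq(ht: hl.Table, output_path: str) -> None:
    # Build a minimal INFO struct whose CSQ field holds one string per consequence
    ht = ht.annotate(info=hl.struct(CSQ=vep_struct_to_csq(ht.vep)))
    hl.export_vcf(ht, output_path)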
def prepare_gnomad_v3_variants(path):
    ds = hl.read_table(path)

    g = hl.eval(ds.globals)

    subsets = set(m.get("subset", None) for m in g.freq_meta)

    def freq(ds, *args, **kwargs):
        return ds.freq[g.freq_index_dict[freq_index_key(*args, **kwargs)]]

    ############################
    # Derived top level fields #
    ############################

    ds = ds.annotate(variant_id=variant_id(ds.locus, ds.alleles))
    ds = ds.rename({"rsid": "rsids"})

    ######################
    # Colocated variants #
    ######################

    variants_by_locus = ds.select(
        ds.variant_id,
        ac_raw=hl.struct(**{
            subset or "all": freq(ds, subset=subset, raw=True).AC
            for subset in subsets
        }),
    )
    variants_by_locus = variants_by_locus.group_by("locus").aggregate(
        variants=hl.agg.collect(variants_by_locus.row_value))

    def subset_filter(subset):
        return lambda variant: variant.ac_raw[subset] > 0

    variants_by_locus = variants_by_locus.annotate(variant_ids=hl.struct(**{
        subset or "all": variants_by_locus.variants.filter(
            subset_filter(subset or "all")
        ).map(lambda variant: variant.variant_id)
        for subset in subsets
    }))

    ds = ds.annotate(colocated_variants=variants_by_locus[ds.locus].variant_ids)
    ds = ds.annotate(colocated_variants=hl.struct(**{
        subset: ds.colocated_variants[subset].filter(
            lambda variant_id: variant_id != ds.variant_id)
        for subset in ds.colocated_variants._fields
    }))

    ###############
    # Frequencies #
    ###############

    subset_populations = {}
    for subset in subsets:
        subset_populations[subset] = set(
            m.get("pop", None) for m in g.freq_meta
            if m.get("subset", None) == subset)
        subset_populations[subset].discard(None)

        # "global" population is used for downsamplings
        subset_populations[subset].discard("global")

    ds = ds.annotate(in_autosome_or_par=ds.locus.in_autosome_or_par())

    ds = ds.annotate(genome=hl.struct(freq=hl.struct(**{
        subset or "all": hl.struct(
            ac=freq(ds, subset=subset).AC,
            ac_raw=freq(ds, subset=subset, raw=True).AC,
            an=freq(ds, subset=subset).AN,
            hemizygote_count=hl.if_else(
                ds.in_autosome_or_par,
                0,
                hl.or_else(freq(ds, subset=subset, sex="XY").AC, 0)),
            homozygote_count=freq(ds, subset=subset).homozygote_count,
            populations=[
                hl.struct(
                    id="_".join(filter(bool, [pop, sex])),
                    ac=hl.or_else(
                        freq(ds, subset=subset, pop=pop, sex=sex).AC, 0),
                    an=hl.or_else(
                        freq(ds, subset=subset, pop=pop, sex=sex).AN, 0),
                    hemizygote_count=0 if sex == "XX" else hl.if_else(
                        ds.in_autosome_or_par,
                        0,
                        hl.or_else(
                            freq(ds, subset=subset, pop=pop, sex="XY").AC, 0),
                    ),
                    homozygote_count=hl.or_else(
                        freq(ds, subset=subset, pop=pop, sex=sex).homozygote_count,
                        0),
                )
                for pop, sex in list(itertools.product(
                    subset_populations[subset], [None, "XX", "XY"]))
                + [(None, "XX"), (None, "XY")]
            ],
        )
        for subset in subsets
    })))

    # If a variant is not present in a subset, do not store population frequencies for that subset
    ds = ds.annotate(genome=ds.genome.annotate(freq=ds.genome.freq.annotate(**{
        subset or "all": ds.genome.freq[subset or "all"].annotate(
            populations=hl.if_else(
                ds.genome.freq[subset or "all"].ac_raw == 0,
                hl.empty_array(
                    ds.genome.freq[subset or "all"].populations.dtype.element_type),
                ds.genome.freq[subset or "all"].populations,
            ))
        for subset in subsets
    })))

    ds = ds.drop("freq", "in_autosome_or_par")

    ###########################################
    # Subsets in which the variant is present #
    ###########################################

    ds = ds.annotate(subsets=hl.set(
        hl.array([
            (subset, ds.genome.freq[subset].ac_raw > 0)
            for subset in subsets if subset is not None
        ]).filter(lambda t: t[1]).map(lambda t: t[0])))

    ##############################
    # Filtering allele frequency #
    ##############################

    faf_populations = [
        pop for pop in subset_populations[None]
        if f"{pop}-adj" in g.faf_index_dict
    ]

    # Get popmax FAFs
    ds = ds.annotate(genome=ds.genome.annotate(
        faf95=hl.rbind(
            hl.sorted(
                hl.array([
                    hl.struct(
                        faf=ds.faf[g.faf_index_dict[f"{pop}-adj"]].faf95,
                        population=pop)
                    for pop in faf_populations
                ]),
                key=lambda f: (-f.faf, f.population),
            ),
            lambda fafs: hl.if_else(
                hl.len(fafs) > 0,
                hl.struct(popmax=fafs[0].faf,
                          popmax_population=fafs[0].population),
                hl.struct(popmax=hl.null(hl.tfloat),
                          popmax_population=hl.null(hl.tstr)),
            ),
        ),
        faf99=hl.rbind(
            hl.sorted(
                hl.array([
                    hl.struct(
                        faf=ds.faf[g.faf_index_dict[f"{pop}-adj"]].faf99,
                        population=pop)
                    for pop in faf_populations
                ]),
                key=lambda f: (-f.faf, f.population),
            ),
            lambda fafs: hl.if_else(
                hl.len(fafs) > 0,
                hl.struct(popmax=fafs[0].faf,
                          popmax_population=fafs[0].population),
                hl.struct(popmax=hl.null(hl.tfloat),
                          popmax_population=hl.null(hl.tstr)),
            ),
        ),
    ))

    ds = ds.drop("faf")

    ####################
    # Age distribution #
    ####################

    ds = ds.annotate(genome=ds.genome.annotate(
        age_distribution=hl.struct(het=ds.age_hist_het, hom=ds.age_hist_hom)))

    ds = ds.drop("age_hist_het", "age_hist_hom")

    ###################
    # Quality metrics #
    ###################

    ds = ds.annotate(genome=ds.genome.annotate(
        filters=ds.filters,
        quality_metrics=hl.struct(
            allele_balance=hl.struct(
                alt_adj=ds.qual_hists.ab_hist_alt.annotate(
                    bin_edges=ds.qual_hists.ab_hist_alt.bin_edges.map(
                        lambda n: hl.float(hl.format("%.3f", n)))),
                alt_raw=ds.raw_qual_hists.ab_hist_alt.annotate(
                    bin_edges=ds.raw_qual_hists.ab_hist_alt.bin_edges.map(
                        lambda n: hl.float(hl.format("%.3f", n)))),
            ),
            genotype_depth=hl.struct(
                all_adj=ds.qual_hists.dp_hist_all,
                all_raw=ds.raw_qual_hists.dp_hist_all,
                alt_adj=ds.qual_hists.dp_hist_alt,
                alt_raw=ds.raw_qual_hists.dp_hist_alt,
            ),
            genotype_quality=hl.struct(
                all_adj=ds.qual_hists.gq_hist_all,
                all_raw=ds.raw_qual_hists.gq_hist_all,
                alt_adj=ds.qual_hists.gq_hist_alt,
                alt_raw=ds.raw_qual_hists.gq_hist_alt,
            ),
            site_quality_metrics=[
                hl.struct(metric="SiteQuality",
                          value=hl.float(nullify_nan(ds.info.QUALapprox)))
            ] + [
                hl.struct(metric=metric,
                          value=hl.float(nullify_nan(ds.info[metric])))
                for metric in [
                    "InbreedingCoeff",
                    "AS_FS",
                    "AS_MQ",
                    "AS_MQRankSum",
                    "AS_pab_max",
                    "AS_QUALapprox",
                    "AS_QD",
                    "AS_ReadPosRankSum",
                    "AS_SOR",
                    "AS_VarDP",
                    "AS_VQSLOD",
                ]
            ],
        ),
    ))

    ds = ds.drop("filters", "qual_hists", "raw_qual_hists", "vqsr")

    #########
    # Flags #
    #########

    ds = ds.annotate(flags=hl.set([
        hl.or_missing(ds.region_flag.lcr, "lcr"),
        hl.or_missing(ds.region_flag.segdup, "segdup"),
        hl.or_missing(
            ((ds.locus.contig == "chrX") & ds.locus.in_x_par())
            | ((ds.locus.contig == "chrY") & ds.locus.in_y_par()),
            "par",
        ),
        hl.or_missing(ds.info.monoallelic, "monoallelic"),
    ]).filter(hl.is_defined))

    ds = ds.drop("region_flag")

    ########################
    # In silico predictors #
    ########################

    ds = ds.transmute(in_silico_predictors=hl.struct(
        cadd=ds.cadd,
        primate_ai=ds.primate_ai,
        revel=ds.revel,
        splice_ai=ds.splice_ai))

    ################
    # Other fields #
    ################

    # Drop unused fields
    ds = ds.drop("allele_info", "a_index", "info", "popmax", "was_split")

    return ds