def get_expr_for_vep_transcript_id_to_consequence_map(
        vep_transcript_consequences_root):
    # Manually build string because hl.json encodes a dictionary as
    # [{ key: ..., value: ... }, ...]
    return ("{" + hl.delimit(
        vep_transcript_consequences_root.map(
            lambda c: '"' + c.transcript_id + '": "' + c.major_consequence + '"'
        )) + "}")
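# A hedged, runnable illustration (the transcript ids and consequences below
# are made up, not from the original source): with two consequences, the
# expression evaluates to a JSON-style string keyed by transcript id.
import hail as hl

tcs = hl.literal([
    hl.Struct(transcript_id="ENST0001", major_consequence="missense_variant"),
    hl.Struct(transcript_id="ENST0002", major_consequence="synonymous_variant"),
])
hl.eval(get_expr_for_vep_transcript_id_to_consequence_map(tcs))
# -> '{"ENST0001": "missense_variant","ENST0002": "synonymous_variant"}'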
def get_sample_data(mt: hl.MatrixTable,
                    fields: List[hl.expr.StringExpression],
                    sep: str = '\t',
                    delim: str = '|'):
    """
    Hail devs hate this one simple py4j trick to speed up sample queries

    :param MatrixTable or Table mt: MT
    :param list of StringExpression fields: fields
    :param sep: Separator to use (tab usually fine)
    :param delim: Delimiter to use (pipe usually fine)
    :return: Sample data
    :rtype: list of list of str
    """
    field_expr = fields[0]
    for field in fields[1:]:
        # Use the `delim` parameter rather than a hard-coded '|' so the join
        # here always agrees with the split below
        field_expr = field_expr + delim + field
    if isinstance(mt, hl.MatrixTable):
        mt_agg = mt.aggregate_cols
    else:
        mt_agg = mt.aggregate
    return [x.split(delim) for x in
            mt_agg(hl.delimit(hl.agg.collect(field_expr), sep)).split(sep)
            if x != 'null']
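# A minimal, hedged usage sketch (the toy MatrixTable and field names are
# illustrative, not from the original source): collect two string column
# fields for every sample in a single py4j round trip.
import hail as hl

mt = hl.balding_nichols_model(n_populations=2, n_samples=4, n_variants=5)
mt = mt.annotate_cols(s=hl.str(mt.sample_idx), pop_label=hl.str(mt.pop))
sample_data = get_sample_data(mt, [mt.s, mt.pop_label])
# sample_data is a list of [sample_id, population] string pairs,
# e.g. [['0', '1'], ['1', '0'], ...]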
def annotate_variant_id(
        t: Union[hl.Table, hl.MatrixTable],
        field_name: str = 'vid') -> Union[hl.Table, hl.MatrixTable]:
    """
    Expects a dataset with bi-allelic variants and the fields `locus` and
    `alleles`. Annotates variant ids of the form 'chr:position:ref:alt'.

    :param field_name: variant id field name
    :param t: dataset
    :return: Table or MatrixTable
    """
    variant_id_ann_exp = {
        field_name: hl.delimit([
            hl.str(t.locus.contig),
            hl.str(t.locus.position),
            hl.str(t.alleles[0]),
            hl.str(t.alleles[1])
        ], delimiter=":")
    }
    if isinstance(t, hl.Table):
        return t.annotate(**variant_id_ann_exp)
    else:
        return t.annotate_rows(**variant_id_ann_exp)
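# Hedged usage sketch (the variant string is made up): parse a variant into
# `locus`/`alleles`, then annotate the id.
import hail as hl

ht = hl.Table.parallelize([hl.Struct(v='1:12345:A:T')])
ht = ht.annotate(**hl.parse_variant(ht.v, reference_genome='GRCh37')).drop('v')
ht = annotate_variant_id(ht)
ht.show()  # vid == '1:12345:A:T'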
def make_pheno_manifest(export=True):
    mt0 = load_final_sumstats_mt(filter_sumstats=False,
                                 filter_variants=False,
                                 separate_columns_by_pop=False,
                                 annotate_with_nearest_gene=False)
    ht = mt0.cols()
    annotate_dict = {}
    annotate_dict.update({
        'pops': hl.delimit(ht.pheno_data.pop),
        'num_pops': hl.len(ht.pheno_data.pop)
    })
    for field in ['n_cases', 'n_controls', 'heritability', 'lambda_gc']:
        for pop in ['AFR', 'AMR', 'CSA', 'EAS', 'EUR', 'MID']:
            # new field name (only applicable to saige heritability)
            new_field = field if field != 'heritability' else 'saige_heritability'
            idx = ht.pheno_data.pop.index(pop)
            field_expr = ht.pheno_data[field]
            annotate_dict.update({
                f'{new_field}_{pop}': hl.if_else(
                    hl.is_nan(idx),
                    hl.null(field_expr[0].dtype),
                    field_expr[idx])
            })
    annotate_dict.update({'filename': get_pheno_id(tb=ht) + '.tsv.bgz'})
    ht = ht.annotate(**annotate_dict)
    dropbox_manifest = hl.import_table(
        f'{ldprune_dir}/UKBB_Pan_Populations-Manifest_20200615-manifest_info.tsv',
        impute=True,
        key='File')
    dropbox_manifest = dropbox_manifest.filter(
        dropbox_manifest['is_old_file'] != '1')
    bgz = dropbox_manifest.filter(~dropbox_manifest.File.contains('.tbi'))
    bgz = bgz.rename({'File': 'filename'})
    tbi = dropbox_manifest.filter(dropbox_manifest.File.contains('.tbi'))
    tbi = tbi.annotate(
        filename=tbi.File.replace('.tbi', '')).key_by('filename')
    dropbox_annotate_dict = {}
    rename_dict = {
        'dbox link': 'dropbox_link',
        'size (bytes)': 'size_in_bytes'
    }
    dropbox_annotate_dict.update({'filename_tabix': tbi[ht.filename].File})
    for field in ['dbox link', 'wget', 'size (bytes)', 'md5 hex']:
        for tb, suffix in [(bgz, ''), (tbi, '_tabix')]:
            dropbox_annotate_dict.update({
                (rename_dict[field] if field in rename_dict
                 else field.replace(' ', '_')) + suffix: tb[ht.filename][field]
            })
    ht = ht.annotate(**dropbox_annotate_dict)
    ht = ht.drop('pheno_data')
    ht.describe()
    ht.show()
def get_expr_for_vep_gene_id_to_consequence_map(
        vep_sorted_transcript_consequences_root, gene_ids):
    # Manually build string because hl.json encodes a dictionary as
    # [{ key: ..., value: ... }, ...]
    return ("{" + hl.delimit(
        gene_ids.map(lambda gene_id: hl.bind(
            lambda worst_consequence_in_gene:
                '"' + gene_id + '":"' +
                worst_consequence_in_gene.major_consequence + '"',
            vep_sorted_transcript_consequences_root.find(
                lambda c: c.gene_id == gene_id)))) + "}")
def get_expr_for_worst_transcript_consequence_annotations_struct(
        vep_sorted_transcript_consequences_root,
        include_coding_annotations=True):
    """Retrieves the top-ranked transcript annotation based on the ranking
    computed by get_expr_for_vep_sorted_transcript_consequences_array(..)

    Args:
        vep_sorted_transcript_consequences_root (ArrayExpression):
        include_coding_annotations (bool):
    """
    transcript_consequences = {
        "biotype": hl.tstr,
        "canonical": hl.tint,
        "category": hl.tstr,
        "cdna_start": hl.tint,
        "cdna_end": hl.tint,
        "codons": hl.tstr,
        "gene_id": hl.tstr,
        "gene_symbol": hl.tstr,
        "hgvs": hl.tstr,
        "hgvsc": hl.tstr,
        "major_consequence": hl.tstr,
        "major_consequence_rank": hl.tint,
        "transcript_id": hl.tstr,
    }

    if include_coding_annotations:
        transcript_consequences.update({
            "amino_acids": hl.tstr,
            "domains": hl.tstr,
            "hgvsp": hl.tstr,
            "lof": hl.tstr,
            "lof_flags": hl.tstr,
            "lof_filter": hl.tstr,
            "lof_info": hl.tstr,
            "polyphen_prediction": hl.tstr,
            "protein_id": hl.tstr,
            "sift_prediction": hl.tstr,
        })

    return hl.cond(
        vep_sorted_transcript_consequences_root.size() == 0,
        hl.struct(**{
            field: hl.null(field_type)
            for field, field_type in transcript_consequences.items()
        }),
        hl.bind(
            lambda worst_transcript_consequence: (
                worst_transcript_consequence.annotate(
                    domains=hl.delimit(
                        hl.set(worst_transcript_consequence.domains))
                ).select(*transcript_consequences.keys())),
            vep_sorted_transcript_consequences_root[0],
        ),
    )
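# Hedged usage sketch: this mirrors the call pattern in the processing snippet
# near the end of this section, and assumes `mt.sortedTranscriptConsequences`
# was produced by get_expr_for_vep_sorted_transcript_consequences_array(..).
mt = mt.annotate_rows(
    main_transcript=get_expr_for_worst_transcript_consequence_annotations_struct(
        vep_sorted_transcript_consequences_root=mt.sortedTranscriptConsequences))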
def clinvar(self):
    return hl.struct(**{
        'allele_id': self._clinvar_data[self.mt.row_key].info.ALLELEID,
        'clinical_significance':
            hl.delimit(self._clinvar_data[self.mt.row_key].info.CLNSIG),
        'gold_stars': self._clinvar_data[self.mt.row_key].gold_stars
    })
def make_pheno_manifest():
    mt0 = load_final_sumstats_mt(filter_sumstats=False,
                                 filter_variants=False,
                                 separate_columns_by_pop=False,
                                 annotate_with_nearest_gene=False)
    ht = mt0.cols()
    annotate_dict = {}
    annotate_dict.update({
        'pops': hl.delimit(ht.pheno_data.pop),
        'num_pops': hl.len(ht.pheno_data.pop)
    })
    for field in ['n_cases', 'n_controls', 'heritability', 'lambda_gc']:
        for pop in ['AFR', 'AMR', 'CSA', 'EAS', 'EUR', 'MID']:
            # new field name (only applicable to saige heritability)
            new_field = field if field != 'heritability' else 'saige_heritability'
            idx = ht.pheno_data.pop.index(pop)
            field_expr = ht.pheno_data[field]
            annotate_dict.update({
                f'{new_field}_{pop}': hl.if_else(
                    hl.is_nan(idx),
                    hl.null(field_expr[0].dtype),
                    field_expr[idx])
            })
    annotate_dict.update({
        'filename': (ht.trait_type + '-' + ht.phenocode + '-' + ht.pheno_sex +
                     hl.if_else(hl.len(ht.coding) > 0, '-' + ht.coding, '') +
                     hl.if_else(hl.len(ht.modifier) > 0, '-' + ht.modifier, '')
                     ).replace(' ', '_').replace('/', '_') + '.tsv.bgz'
    })
    ht = ht.annotate(**annotate_dict)
    aws_bucket = 'https://pan-ukb-us-east-1.s3.amazonaws.com/sumstats_release'
    ht = ht.annotate(
        aws_link=aws_bucket + '/' + ht.filename,
        aws_link_tabix=aws_bucket + '_tabix/' + ht.filename + '.tbi')
    other_fields_ht = hl.import_table(
        f'{ldprune_dir}/release/md5_hex_and_file_size.tsv.bgz',
        force_bgz=True,
        key=PHENO_KEY_FIELDS)
    other_fields = [
        'size_in_bytes', 'size_in_bytes_tabix', 'md5_hex', 'md5_hex_tabix'
    ]
    ht = ht.annotate(wget='wget ' + ht.aws_link,
                     wget_tabix='wget ' + ht.aws_link_tabix,
                     **{f: other_fields_ht[ht.key][f] for f in other_fields})
    ht = ht.drop('pheno_data', 'pheno_indices')
    ht.export(f'{bucket}/combined_results/phenotype_manifest.tsv.bgz')
def annotate_nearest_gene(t,
                          add_contig: bool = False,
                          add_only_gene_symbols_as_str: bool = False,
                          loc: str = 'nearest_genes'):
    intervals_ht = hl.read_table(get_gene_intervals_path())
    if add_contig:
        intervals_ht = intervals_ht.annotate(
            contig=intervals_ht.interval.start.contig)
    annotation = intervals_ht.index(t.locus, all_matches=True)
    if add_only_gene_symbols_as_str:
        annotation = hl.delimit(annotation.gene_name)
    if loc:
        annotation = {loc: annotation}
    return t.annotate_rows(**annotation) if isinstance(
        t, hl.MatrixTable) else t.annotate(**annotation)
def hgvsp_from_consequence_amino_acids(csq):
    return hl.if_else(
        csq.hgvsp.contains("=") | csq.hgvsp.contains("%3D"),
        hl.bind(
            lambda protein_letters:
                "p." + protein_letters + hl.str(csq.protein_start) + protein_letters,
            hl.delimit(
                csq.amino_acids.split("")
                .filter(lambda l: l != "")
                .map(lambda l: PROTEIN_LETTERS_1TO3.get(l)),  # pylint: disable=unnecessary-lambda
                "",
            ),
        ),
        csq.hgvsp.split(":")[-1],
    )
def test_export_import_plink_same(self):
    mt = get_dataset()
    mt = mt.select_rows(rsid=hl.delimit([mt.locus.contig,
                                         hl.str(mt.locus.position),
                                         mt.alleles[0],
                                         mt.alleles[1]], ':'),
                        cm_position=15.0)
    mt = mt.select_cols(fam_id=hl.null(hl.tstr),
                        pat_id=hl.null(hl.tstr),
                        mat_id=hl.null(hl.tstr),
                        is_female=hl.null(hl.tbool),
                        is_case=hl.null(hl.tbool))
    mt = mt.select_entries('GT')

    bfile = '/tmp/test_import_export_plink'
    hl.export_plink(mt, bfile, ind_id=mt.s, cm_position=mt.cm_position)

    mt_imported = hl.import_plink(bfile + '.bed', bfile + '.bim',
                                  bfile + '.fam',
                                  a2_reference=True,
                                  reference_genome='GRCh37')
    self.assertTrue(mt._same(mt_imported))
    self.assertTrue(mt.aggregate_rows(hl.agg.all(mt.cm_position == 15.0)))
def annotate_variant_key(ds: Union[hl.MatrixTable, hl.Table]
                         ) -> Union[hl.MatrixTable, hl.Table]:
    # define key variant expression
    key_expr = hl.delimit([ds.locus.contig,
                           hl.str(ds.locus.position),
                           ds.alleles[0],
                           ds.alleles[1]], ':')
    if isinstance(ds, hl.MatrixTable):
        ds = ds.annotate_rows(variant_key=key_expr)
    if isinstance(ds, hl.Table):
        ds = ds.annotate(variant_key=key_expr)
    return ds
def get_expr_for_formatted_hgvs(csq):
    return hl.cond(
        hl.is_missing(csq.hgvsp)
        | HGVSC_CONSEQUENCES.contains(csq.major_consequence),
        csq.hgvsc.split(":")[-1],
        hl.cond(
            csq.hgvsp.contains("=") | csq.hgvsp.contains("%3D"),
            hl.bind(
                lambda protein_letters:
                    "p." + protein_letters + hl.str(csq.protein_start) + protein_letters,
                hl.delimit(
                    csq.amino_acids.split("").map(
                        lambda l: PROTEIN_LETTERS_1TO3.get(l)),
                    ""),
            ),
            csq.hgvsp.split(":")[-1],
        ),
    )
def _encode_allele(allele: hl.expr.StringExpression) -> hl.expr.StringExpression:
    return hl.delimit(
        _grouped(
            # Convert string to array
            allele.split("")[:-1]
            # Convert letters to numbers
            .map(lambda letter: hl.switch(letter)
                 .when("A", 0).when("C", 1).when("G", 2).when("T", 3)
                 .or_missing()),
            3,  # Group into sets of 3
        )
        # Ensure each group has 3 elements
        .map(lambda g: g.extend(hl.range(3 - hl.len(g)).map(lambda _: 0)))
        # Bit shift and add group elements
        .map(lambda g: g[0] * 16 + g[1] * 4 + g[2])
        # Convert to letters
        .map(lambda n: _ENCODED_ALLELE_CHARACTERS[n]),
        "",
    )
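# A hedged pure-Python mirror of the same packing, for illustration only.
# `_grouped` and `_ENCODED_ALLELE_CHARACTERS` are not shown in this excerpt;
# the 64-character alphabet below is an assumption, not the original's.
_CHARS = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz-_"

def encode_allele_py(allele: str) -> str:
    digits = ["ACGT".index(base) for base in allele]      # A=0, C=1, G=2, T=3
    groups = [digits[i:i + 3] for i in range(0, len(digits), 3)]
    groups = [g + [0] * (3 - len(g)) for g in groups]     # pad with A (0)
    # Each group of three 2-bit bases packs into one of 64 characters
    return "".join(_CHARS[g[0] * 16 + g[1] * 4 + g[2]] for g in groups)

assert encode_allele_py("ACGT") == _CHARS[6] + _CHARS[48]  # [0,1,2] -> 6, [3,0,0] -> 48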
def prepare_variant_results():
    results_path = pipeline_config.get("SCHEMA", "variant_results_path")
    annotations_path = pipeline_config.get("SCHEMA", "variant_annotations_path")

    results = hl.read_table(results_path)
    results = results.drop("v", "af_case", "af_ctrl")

    # Add n_denovos to AC_case
    results = results.annotate(ac_case=hl.or_else(results.ac_case, 0) +
                               hl.or_else(results.n_denovos, 0))
    results = results.annotate(
        source=hl.delimit(hl.sorted(hl.array(results.source)), ", "))

    results = results.group_by("locus", "alleles").aggregate(
        group_results=hl.agg.collect(results.row_value))
    results = results.annotate(group_results=hl.dict(
        results.group_results.map(lambda group_result: (
            group_result.analysis_group,
            group_result.drop("analysis_group")))))

    variants = hl.read_table(annotations_path)
    variants = variants.select(
        gene_id=variants.gene_id,
        consequence=hl.case()
        .when((variants.canonical_term == "missense_variant") &
              (variants.mpc >= 3), "missense_variant_mpc_>=3")
        .when((variants.canonical_term == "missense_variant") &
              (variants.mpc >= 2), "missense_variant_mpc_2-3")
        .when(variants.canonical_term == "missense_variant",
              "missense_variant_mpc_<2")
        .default(variants.canonical_term),
        hgvsc=variants.hgvsc_canonical.split(":")[-1],
        hgvsp=variants.hgvsp_canonical.split(":")[-1],
        info=hl.struct(cadd=variants.cadd,
                       mpc=variants.mpc,
                       polyphen=variants.polyphen),
    )

    variants = variants.annotate(**results[variants.key])
    variants = variants.filter(hl.is_defined(variants.group_results))

    return variants
def prepare_clinvar_variants(clinvar_path, reference_genome):
    ds = hl.read_table(clinvar_path)

    ds = ds.filter(hl.is_defined(ds[f"locus_{reference_genome}"])
                   & hl.is_defined(ds[f"alleles_{reference_genome}"]))

    ds = ds.select(locus=ds[f"locus_{reference_genome}"],
                   alleles=ds[f"alleles_{reference_genome}"],
                   **ds.variant)

    # Remove any variants with alleles other than ACGT
    ds = ds.filter(
        hl.len(
            hl.set(hl.delimit(ds.alleles, "").split("")).difference(
                hl.set(["A", "C", "G", "T", ""]))) == 0)

    ds = ds.annotate(
        variant_id=variant_id(ds.locus, ds.alleles),
        chrom=normalized_contig(ds.locus.contig),
        pos=ds.locus.position,
        ref=ds.alleles[0],
        alt=ds.alleles[1],
    )

    ds = ds.key_by("locus", "alleles")

    return ds
def ht_to_vcf_mt(
    info_ht: hl.Table,
    pipe_delimited_annotations: List[str] = INFO_VCF_AS_PIPE_DELIMITED_FIELDS,
) -> hl.MatrixTable:
    """
    Creates an MT ready for VCF export from an HT. In particular, the
    following conversions are done:
        - All int64 are coerced to int32
        - Fields specified by `pipe_delimited_annotations` will be converted
          from arrays to pipe-delimited strings

    .. note::

        The MT returned has no cols.

    :param info_ht: Input HT
    :param pipe_delimited_annotations: List of info fields (they must be fields of the ht.info Struct)
    :return: MatrixTable ready for VCF export
    """

    def get_pipe_expr(
            array_expr: hl.expr.ArrayExpression) -> hl.expr.StringExpression:
        return hl.delimit(array_expr.map(lambda x: hl.or_else(hl.str(x), "")),
                          "|")

    # Make sure the HT is keyed by locus, alleles
    info_ht = info_ht.key_by("locus", "alleles")

    # Convert int64 fields to int32 (int64 isn't supported by VCF)
    for f, ft in info_ht.info.dtype.items():
        if ft == hl.dtype("int64"):
            logger.warning(
                f"Coercing field info.{f} from int64 to int32 for VCF output. Value will be capped at int32 max value."
            )
            info_ht = info_ht.annotate(info=info_ht.info.annotate(
                **{f: hl.int32(hl.min(2**31 - 1, info_ht.info[f]))}))
        elif ft == hl.dtype("array<int64>"):
            logger.warning(
                f"Coercing field info.{f} from array<int64> to array<int32> for VCF output. Array values will be capped at int32 max value."
            )
            info_ht = info_ht.annotate(info=info_ht.info.annotate(
                **{
                    f: info_ht.info[f].map(
                        lambda x: hl.int32(hl.min(2**31 - 1, x)))
                }))

    info_expr = {}

    # Make sure to pipe-delimit fields that need it.
    # Note: the expr needs to be prefixed by "|" because GATK expects one value for the ref (always empty)
    # Note2: this doesn't produce the correct annotation for AS_SB_TABLE, but it is overwritten below
    for f in pipe_delimited_annotations:
        if f in info_ht.info:
            info_expr[f] = "|" + get_pipe_expr(info_ht.info[f])

    # Flatten SB if it is an array of arrays
    if "SB" in info_ht.info and not isinstance(
            info_ht.info.SB, hl.expr.ArrayNumericExpression):
        info_expr["SB"] = info_ht.info.SB[0].extend(info_ht.info.SB[1])

    if "AS_SB_TABLE" in info_ht.info:
        info_expr["AS_SB_TABLE"] = get_pipe_expr(
            info_ht.info.AS_SB_TABLE.map(lambda x: hl.delimit(x, ",")))

    # Annotate with the new expressions and add an 's' empty string field
    # required to cast the HT to an MT
    info_ht = info_ht.annotate(info=info_ht.info.annotate(**info_expr),
                               s=hl.null(hl.tstr))

    # Create an MT with no cols so that we can export to VCF
    info_mt = info_ht.to_matrix_table_row_major(columns=["s"],
                                                entry_field_name="s")
    return info_mt.filter_cols(False)
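# Hedged usage sketch: export a sites-only VCF from an annotated info HT.
# The paths are hypothetical placeholders, not from the original source.
import hail as hl

info_ht = hl.read_table("gs://my-bucket/info.ht")  # must carry an `info` struct
vcf_mt = ht_to_vcf_mt(info_ht)
hl.export_vcf(vcf_mt, "gs://my-bucket/sites.vcf.bgz")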
mnvs = import_mnv_file(replace_quote_char(args.mnv_url), quote="'")

if args.three_bp_mnv_url:
    mnvs_3bp = import_three_bp_mnv_file(
        replace_quote_char(args.three_bp_mnv_url), quote="'")

    snp12_components = mnvs_3bp.select(
        component_mnv=hl.bind(
            lambda snv1, snv2: hl.delimit(
                [
                    snv1.chrom,
                    hl.str(snv1.pos),
                    snv1.ref + snv2.ref,
                    snv1.alt + snv2.alt,
                ],
                "-",
            ),
            mnvs_3bp.constituent_snvs[0],
            mnvs_3bp.constituent_snvs[1],
        ),
        related_mnv=hl.struct(
            combined_variant_id=mnvs_3bp.variant_id,
            n_individuals=mnvs_3bp.n_individuals,
            other_constituent_snvs=[mnvs_3bp.constituent_snvs[2].variant_id],
            consequences=mnvs_3bp.consequences,
        ),
    )
    snp23_components = mnvs_3bp.select(
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--results", required=True)
    parser.add_argument("--annotations", required=True)
    parser.add_argument("--output", required=True)
    args = parser.parse_args()

    hl.init(log="/tmp/hail.log")

    variants = hl.read_table(args.annotations)
    variants = variants.annotate(
        variant_id=variant_id(variants.locus, variants.alleles),
        chrom=variants.locus.contig,
        pos=variants.locus.position,
        xpos=x_position(variants.locus),
        alt=variants.alleles[1],
        ref=variants.alleles[0],
    )
    variants = variants.transmute(
        transcript_id=hl.delimit(variants.transcript_id, ","),
        hgvsc=hl.delimit(
            variants.hgvsc.keys().map(lambda k: k + ":" + variants.hgvsc[k]),
            ","),
        hgvsp=hl.delimit(
            variants.hgvsp.keys().map(lambda k: k + ":" + variants.hgvsp[k]),
            ","),
    )
    variants = variants.annotate(csq_canonical=hl.case()
                                 .when((variants.csq_canonical == "mis") &
                                       (variants.mpc >= 3), "mis3")
                                 .when((variants.csq_canonical == "mis") &
                                       (variants.mpc >= 2), "mis2")
                                 .default(variants.csq_canonical))
    variants = variants.annotate(flags="PASS")
    variants = variants.drop("v")

    results = hl.read_table(args.results)
    results = results.annotate(
        analysis_group=results.analysis_group.lower()
        .replace("[^a-z0-9]+", "_").replace("_+$", ""))
    results = results.drop("v")

    # Add n_denovos to AC_case
    results = results.annotate(ac_case=hl.or_else(results.ac_case, 0) +
                               hl.or_else(results.n_denovos, 0))
    results = results.annotate(af_case=hl.cond(
        results.an_case == 0, 0, results.ac_case / results.an_case))

    variants = variants.filter(hl.is_defined(results[variants.key]))

    analysis_groups = results.aggregate(
        hl.agg.collect_as_set(results.analysis_group))
    variants = variants.annotate(groups=hl.struct())
    for group in analysis_groups:
        group_results = results.filter(
            results.analysis_group == group).drop("analysis_group")
        variants = variants.annotate(groups=variants.groups.annotate(
            **{group: group_results[variants.key]}))

    # The latest (2019/04/15) SCHEMA dataset moved the source and in_analysis
    # fields from variant level to group level.
    # in_analysis is the same for all groups within a variant, but source is not
    variants = variants.annotate(
        in_analysis=variants.groups.meta.in_analysis,
        source=variants.groups.meta.source)

    variants.write(args.output)
def infer_families(
    relationship_ht: hl.Table,
    sex: Union[hl.Table, Dict[str, bool]],
    duplicate_samples_ht: hl.Table,
    i_col: str = "i",
    j_col: str = "j",
    relationship_col: str = "relationship",
) -> hl.Pedigree:
    """
    This function takes a hail Table with a row for each pair of related
    individuals i, j in the data (it's OK to have unrelated samples too).
    The `relationship_col` should be a column specifying the relationship
    between each two samples as defined in this module's constants.

    This function returns a pedigree containing trios inferred from the data.
    Family ID can be the same for multiple trios if one or more members of the
    trios are related (e.g. sibs, multi-generational family). Trios are
    ordered by family ID.

    .. note::

        This function only returns complete trios defined as: one child, one
        father and one mother (sex is required for both parents).

    :param relationship_ht: Input relationship table
    :param sex: A Table or dict giving the sex for each sample (`TRUE`=female, `FALSE`=male). If a Table is given, it should have a field `is_female`.
    :param duplicate_samples_ht: All duplicated samples TO REMOVE (if not provided, this function won't work as it assumes that each child has exactly two parents)
    :param i_col: Column containing the 1st sample of the pair in the relationship table
    :param j_col: Column containing the 2nd sample of the pair in the relationship table
    :param relationship_col: Column containing the relationship for the sample pair as defined in this module's constants
    :return: Pedigree of complete trios
    """

    def group_parent_child_pairs_by_fam(
        parent_child_pairs: Iterable[Tuple[str, str]]
    ) -> List[List[Tuple[str, str]]]:
        """
        Takes all parent-child pairs and groups them by family.

        A family here is defined as a list of sample-pairs which all share at
        least one sample with at least one other sample-pair in the list.

        :param parent_child_pairs: All the parent-child pairs
        :return: A list of families, where each element of the list is a list of the parent-child pairs
        """
        fam_id = 1  # stores the current family id
        s_fam = dict()  # stores the family id for each sample
        fams = defaultdict(list)  # stores fam_id -> sample-pairs
        for pair in parent_child_pairs:
            if pair[0] in s_fam:
                if pair[1] in s_fam:
                    if s_fam[pair[0]] != s_fam[pair[1]]:
                        # If both samples are in different families, merge the families
                        new_fam_id = s_fam[pair[0]]
                        fam_id_to_merge = s_fam[pair[1]]
                        for s in s_fam:
                            if s_fam[s] == fam_id_to_merge:
                                s_fam[s] = new_fam_id
                        fams[new_fam_id].extend(fams.pop(fam_id_to_merge))
                else:
                    # If only the 1st sample in the pair is already in a family,
                    # assign the 2nd sample in the pair to the same family
                    s_fam[pair[1]] = s_fam[pair[0]]
                fams[s_fam[pair[0]]].append(pair)
            elif pair[1] in s_fam:
                # If only the 2nd sample in the pair is already in a family,
                # assign the 1st sample in the pair to the same family
                s_fam[pair[0]] = s_fam[pair[1]]
                fams[s_fam[pair[1]]].append(pair)
            else:
                # If neither sample in the pair is already in a family, create a new family
                s_fam[pair[0]] = fam_id
                s_fam[pair[1]] = fam_id
                fams[fam_id].append(pair)
                fam_id += 1

        return list(fams.values())

    def get_trios(
        fam_id: str,
        parent_child_pairs: List[Tuple[str, str]],
        related_pairs: Dict[Tuple[str, str], str],
    ) -> List[hl.Trio]:
        """
        Generates trios based on the list of parent-child pairs in the family
        and all related pairs in the data. Only complete parent/offspring
        trios are included in the results.

        The trios are assembled as follows:
            1. All pairs of unrelated samples with different sexes within the family are extracted as possible parent pairs
            2. For each possible parent pair, a list of all children is constructed (each child in the list has a parent-offspring pair with each parent)
            3. If there are multiple children for a given parent pair, all children should be siblings with each other
            4. Check that each child was only assigned a single pair of parents. If a child is found to have multiple parent pairs, they are ALL discarded.

        :param fam_id: The family ID
        :param parent_child_pairs: The parent-child pairs for this family
        :param related_pairs: All related sample pairs in the data
        :return: List of trios in the family
        """

        def get_possible_parents(samples: List[str]) -> List[Tuple[str, str]]:
            """
            1. All pairs of unrelated samples with different sexes within the
            family are extracted as possible parent pairs.

            :param samples: All samples in the family
            :return: Possible parent pairs
            """
            possible_parents = []
            for i in range(len(samples)):
                for j in range(i + 1, len(samples)):
                    if related_pairs.get(
                            tuple(sorted([samples[i], samples[j]]))) is None:
                        if sex.get(samples[i]) is False and sex.get(
                                samples[j]) is True:
                            possible_parents.append((samples[i], samples[j]))
                        elif (sex.get(samples[i]) is True
                              and sex.get(samples[j]) is False):
                            possible_parents.append((samples[j], samples[i]))
            return possible_parents

        def get_children(possible_parents: Tuple[str, str]) -> List[str]:
            """
            2. For a given possible parent pair, a list of all children is
            constructed (each child in the list has a parent-offspring pair
            with each parent).

            :param possible_parents: A pair of possible parents
            :return: The list of all children (if any) corresponding to the possible parents
            """
            # stores sample -> set of parents in possible_parents where
            # (sample, parent) is found in parent_child_pairs
            possible_offsprings = defaultdict(set)
            for pair in parent_child_pairs:
                if possible_parents[0] == pair[0]:
                    possible_offsprings[pair[1]].add(possible_parents[0])
                elif possible_parents[0] == pair[1]:
                    possible_offsprings[pair[0]].add(possible_parents[0])
                elif possible_parents[1] == pair[0]:
                    possible_offsprings[pair[1]].add(possible_parents[1])
                elif possible_parents[1] == pair[1]:
                    possible_offsprings[pair[0]].add(possible_parents[1])

            return [
                s for s, parents in possible_offsprings.items()
                if len(parents) == 2
            ]

        def check_sibs(children: List[str]) -> bool:
            """
            3. If there are multiple children for a given parent pair, all
            children should be siblings with each other.

            :param children: List of all children for a given parent pair
            :return: Whether all children in the list are siblings
            """
            for i in range(len(children)):
                for j in range(i + 1, len(children)):
                    if related_pairs[tuple(
                            sorted([children[i], children[j]]))] != SIBLINGS:
                        return False
            return True

        def discard_multi_parents_children(trios: List[hl.Trio]):
            """
            4. Check that each child was only assigned a single pair of
            parents. If a child is found to have multiple parent pairs, they
            are ALL discarded.

            :param trios: All trios formed for this family
            :return: The list of trios for which each child has a single parents pair
            """
            children_trios = defaultdict(list)
            for trio in trios:
                children_trios[trio.s].append(trio)

            for s, s_trios in children_trios.items():
                if len(s_trios) > 1:
                    logger.warning(
                        "Discarded duplicated child {0} found in multiple trios: {1}"
                        .format(s, ", ".join([str(trio) for trio in s_trios])))

            return [
                trios[0] for trios in children_trios.values()
                if len(trios) == 1
            ]

        # Get all possible pairs of parents in (father, mother) order
        all_possible_parents = get_possible_parents(
            list({s for pair in parent_child_pairs for s in pair}))

        trios = []
        for possible_parents in all_possible_parents:
            children = get_children(possible_parents)
            if check_sibs(children):
                trios.extend([
                    hl.Trio(
                        s=s,
                        fam_id=fam_id,
                        pat_id=possible_parents[0],
                        mat_id=possible_parents[1],
                        is_female=sex.get(s),
                    ) for s in children
                ])
            else:
                logger.warning(
                    "Discarded family with same parents, and multiple offspring that weren't siblings:"
                    "\nMother: {}\nFather:{}\nChildren:{}".format(
                        possible_parents[0], possible_parents[1],
                        ", ".join(children)))

        return discard_multi_parents_children(trios)

    # Get all the relations we care about:
    # => Remove unrelateds and duplicates
    dups = duplicate_samples_ht.aggregate(
        hl.agg.explode(lambda dup: hl.agg.collect_as_set(dup),
                       duplicate_samples_ht.filtered),
        _localize=False,
    )
    relationship_ht = relationship_ht.filter(
        ~dups.contains(relationship_ht[i_col])
        & ~dups.contains(relationship_ht[j_col])
        & (relationship_ht[relationship_col] != UNRELATED))

    # Check relatedness table format
    if not relationship_ht[i_col].dtype == relationship_ht[j_col].dtype:
        logger.error(
            "i_col and j_col of the relatedness table need to be of the same type."
        )

    # If i_col and j_col aren't str, then convert them
    if not isinstance(relationship_ht[i_col], hl.expr.StringExpression):
        logger.warning(
            f"Pedigrees can only be constructed from string IDs, but your relatedness_ht ID column is of type: {relationship_ht[i_col].dtype}. Expression will be converted to string in Pedigrees."
        )
        if isinstance(relationship_ht[i_col], hl.expr.StructExpression):
            logger.warning(
                f"Struct fields {list(relationship_ht[i_col])} will be joined by underscores to use as sample names in Pedigree."
            )
            relationship_ht = relationship_ht.key_by(
                **{
                    i_col: hl.delimit(
                        hl.array([
                            hl.str(relationship_ht[i_col][x])
                            for x in relationship_ht[i_col]
                        ]),
                        "_",
                    ),
                    j_col: hl.delimit(
                        hl.array([
                            hl.str(relationship_ht[j_col][x])
                            for x in relationship_ht[j_col]
                        ]),
                        "_",
                    ),
                })
        else:
            raise NotImplementedError(
                "The `i_col` and `j_col` columns of the `relationship_ht` argument passed to infer_families are not of type StringExpression or Struct."
            )

    # If sex is a Table, extract sex information as a Dict
    if isinstance(sex, hl.Table):
        sex = dict(hl.tuple([sex.s, sex.is_female]).collect())

    # Collect all related sample pairs and create a dictionary with pairs as
    # keys and relationships as values. Sample-pairs are tuples ordered by
    # sample name.
    related_pairs = {
        tuple(sorted([i, j])): rel
        for i, j, rel in hl.tuple([
            relationship_ht.i, relationship_ht.j,
            relationship_ht.relationship
        ]).collect()
    }

    parent_child_pairs_by_fam = group_parent_child_pairs_by_fam(
        [pair for pair, rel in related_pairs.items() if rel == PARENT_CHILD])

    return hl.Pedigree([
        trio for fam_index, parent_child_pairs in enumerate(
            parent_child_pairs_by_fam)
        for trio in get_trios(str(fam_index), parent_child_pairs,
                              related_pairs)
    ])
# initialize hail
logging.info('Initialize hail')
hl.init(log=args.hail_log)

# read hail Tables
logging.info('Read GWAS results saved in hail Table')
gwas_out = hl.read_table(args.gwas_ht)

# add variant column
logging.info('Adding `variant` column: chr:pos:ref:alt')
gwas_out = gwas_out.annotate(
    variant=hl.delimit(
        hl.array([
            gwas_out['locus'].contig,
            hl.str(gwas_out['locus'].position),
            gwas_out['alleles'][0],
            gwas_out['alleles'][1]
        ]),
        delimiter=':')
)

# change the key of Table to variant
logging.info('Changing the key of Table to `variant` column')
gwas_out = gwas_out.key_by('variant')
gwas_out = gwas_out.repartition(40)
gwas_out = gwas_out.cache()

# exporting TSV
logging.info('Looping over list of trait lists and output TSVs')
# note that this annotation `phenotypes` was added by gwas_on_subset_ht.py!
phenotypes = gwas_out['phenotypes'].collect()[0]
for i, subset in enumerate(phenotypes):
def adjust_vcf_incompatible_types(
    ht: hl.Table,
    pipe_delimited_annotations: List[str] = INFO_VCF_AS_PIPE_DELIMITED_FIELDS,
) -> hl.Table:
    """
    Create a Table ready for VCF export. In particular, the following
    conversions are done:
        - All int64 are coerced to int32
        - Fields specified by `pipe_delimited_annotations` are converted from
          arrays to pipe-delimited strings

    :param ht: Input Table.
    :param pipe_delimited_annotations: List of info fields (they must be fields of the ht.info Struct).
    :return: Table ready for VCF export.
    """

    def get_pipe_expr(
            array_expr: hl.expr.ArrayExpression) -> hl.expr.StringExpression:
        return hl.delimit(array_expr.map(lambda x: hl.or_else(hl.str(x), "")),
                          "|")

    # Make sure the HT is keyed by locus, alleles
    ht = ht.key_by("locus", "alleles")

    info_type_convert_expr = {}
    # Convert int64 fields to int32 (int64 isn't supported by VCF)
    for f, ft in ht.info.dtype.items():
        if ft == hl.dtype("int64"):
            logger.warning(
                "Coercing field info.%s from int64 to int32 for VCF output. Value will be capped at int32 max value.",
                f,
            )
            info_type_convert_expr.update(
                {f: hl.int32(hl.min(2**31 - 1, ht.info[f]))})
        elif ft == hl.dtype("array<int64>"):
            logger.warning(
                "Coercing field info.%s from array<int64> to array<int32> for VCF output. Array values will be capped "
                "at int32 max value.",
                f,
            )
            info_type_convert_expr.update(
                {f: ht.info[f].map(lambda x: hl.int32(hl.min(2**31 - 1, x)))})

    ht = ht.annotate(info=ht.info.annotate(**info_type_convert_expr))

    info_expr = {}

    # Make sure to pipe-delimit fields that need it.
    # Note: the expr needs to be prefixed by "|" because GATK expects one value for the ref (always empty)
    # Note2: this doesn't produce the correct annotation for AS_SB_TABLE; it is handled below
    for f in pipe_delimited_annotations:
        if f in ht.info and f != "AS_SB_TABLE":
            info_expr[f] = "|" + get_pipe_expr(ht.info[f])

    # Flatten SB if it is an array of arrays
    if "SB" in ht.info and not isinstance(ht.info.SB,
                                          hl.expr.ArrayNumericExpression):
        info_expr["SB"] = ht.info.SB[0].extend(ht.info.SB[1])

    if "AS_SB_TABLE" in ht.info:
        info_expr["AS_SB_TABLE"] = get_pipe_expr(
            ht.info.AS_SB_TABLE.map(lambda x: hl.delimit(x, ",")))

    # Annotate with the new expressions
    ht = ht.annotate(info=ht.info.annotate(**info_expr))

    return ht
def get_gold_stars(review_status):
    review_status_str = hl.delimit(
        hl.sorted(review_status, key=lambda s: s.replace("^_", "z")))
    return CLINVAR_GOLD_STARS[review_status_str]
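# Hedged illustration of the sort key (the statuses below are made up but
# follow ClinVar's conventions): entries beginning with "_" sort after the
# others because the regex "^_" is rewritten to "z", reproducing ClinVar's
# own ordering before the lookup in CLINVAR_GOLD_STARS (not shown here).
import hail as hl

statuses = hl.literal(["_multiple_submitters", "criteria_provided", "_no_conflicts"])
hl.eval(hl.delimit(hl.sorted(statuses, key=lambda s: s.replace("^_", "z"))))
# -> 'criteria_provided,_multiple_submitters,_no_conflicts'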
def main(args):
    hl.init(master=f'local[{args.n_threads}]',
            log=hl.utils.timestamp_path(os.path.join(tempfile.gettempdir(),
                                                     'extract_vcf'),
                                        suffix='.log'),
            default_reference=args.reference)

    sys.path.append('/')
    add_args = []
    if args.additional_args is not None:
        add_args = args.additional_args.split(',')
    load_module = importlib.import_module(args.load_module)
    mt = getattr(load_module, args.load_mt_function)(*add_args)

    if args.gene_map_ht_path is None:
        interval = [hl.parse_locus_interval(args.interval)]
    else:
        gene_ht = hl.read_table(args.gene_map_ht_path)
        if args.gene is not None:
            gene_ht = gene_ht.filter(gene_ht.gene_symbol == args.gene)
            interval = gene_ht.aggregate(hl.agg.take(gene_ht.interval, 1),
                                         _localize=False)
        else:
            interval = [hl.parse_locus_interval(args.gene_ht_interval)]
            gene_ht = hl.filter_intervals(gene_ht, interval)

        gene_ht = gene_ht.filter(
            hl.set(args.groups.split(',')).contains(gene_ht.annotation))
        gene_ht.select(group=gene_ht.gene_id + '_' + gene_ht.gene_symbol +
                       '_' + gene_ht.annotation,
                       variant=hl.delimit(gene_ht.variants, '\t')
                       ).key_by().drop('start').export(args.group_output_file,
                                                       header=False)
        # TODO: possible minor optimization: filter output VCF to only
        # variants in `gene_ht.variants`

    if not args.no_adj:
        mt = mt.filter_entries(mt.adj)

    mt = hl.filter_intervals(mt, interval)

    if not args.input_bgen:
        mt = mt.select_entries('GT')
        mt = mt.filter_rows(hl.agg.count_where(mt.GT.is_non_ref()) > 0)

    mt = mt.annotate_rows(rsid=mt.locus.contig + ':' +
                          hl.str(mt.locus.position) + '_' + mt.alleles[0] +
                          '/' + mt.alleles[1])

    if args.callrate_filter:
        mt = mt.filter_rows(
            hl.agg.fraction(hl.is_defined(mt.GT)) >= args.callrate_filter)

    if args.export_bgen:
        if not args.input_bgen:
            mt = mt.annotate_entries(GT=hl.if_else(
                mt.GT.is_haploid(), hl.call(mt.GT[0], mt.GT[0]), mt.GT))
            mt = gt_to_gp(mt)
            mt = impute_missing_gp(mt, mean_impute=args.mean_impute_missing)
        hl.export_bgen(mt, args.output_file, gp=mt.GP, varid=mt.rsid)
    else:
        mt = mt.annotate_entries(GT=hl.or_else(mt.GT, hl.call(0, 0)))
        # Note: no mean-imputation for VCF
        hl.export_vcf(mt, args.output_file)
def get_csq_from_struct(element: hl.expr.StructExpression,
                        feature_type: str) -> hl.expr.StringExpression:
    # Most fields are 1-1, just lowercase
    fields = dict(element)

    # Add general exceptions
    fields.update({
        "allele": element.variant_allele,
        "consequence": hl.delimit(element.consequence_terms, delimiter="&"),
        "feature_type": feature_type,
        "feature": (element.transcript_id if "transcript_id" in element else
                    element.regulatory_feature_id
                    if "regulatory_feature_id" in element else
                    element.motif_feature_id
                    if "motif_feature_id" in element else ""),
        "variant_class": vep_expr.variant_class,
    })

    # Add exceptions for transcripts
    if feature_type == "Transcript":
        fields.update({
            "canonical": hl.cond(element.canonical == 1, "YES", ""),
            "ensp": element.protein_id,
            "gene": element.gene_id,
            "symbol": element.gene_symbol,
            "symbol_source": element.gene_symbol_source,
            "cdna_position": hl.str(element.cdna_start) + hl.cond(
                element.cdna_start == element.cdna_end,
                "",
                "-" + hl.str(element.cdna_end),
            ),
            "cds_position": hl.str(element.cds_start) + hl.cond(
                element.cds_start == element.cds_end,
                "",
                "-" + hl.str(element.cds_end),
            ),
            "protein_position": hl.str(element.protein_start) + hl.cond(
                element.protein_start == element.protein_end,
                "",
                "-" + hl.str(element.protein_end),
            ),
            "sift": element.sift_prediction + "(" +
                    hl.format("%.3f", element.sift_score) + ")",
            "polyphen": element.polyphen_prediction + "(" +
                        hl.format("%.3f", element.polyphen_score) + ")",
            "domains": hl.delimit(
                element.domains.map(lambda d: d.db + ":" + d.name), "&"),
        })
    elif feature_type == "MotifFeature":
        fields["motif_score_change"] = hl.format("%.3f",
                                                 element.motif_score_change)

    return hl.delimit(
        [hl.or_else(hl.str(fields.get(f, "")), "") for f in _csq_fields],
        "|")
def get_pipe_expr(
        array_expr: hl.expr.ArrayExpression) -> hl.expr.StringExpression:
    return hl.delimit(array_expr.map(lambda x: hl.or_else(hl.str(x), "")),
                      "|")
def export(self, path, delimiter='\t', missing='NA', header=True):
    """Export a field to a text file.

    Examples
    --------

    >>> small_mt.GT.export('output/gt.tsv')
    >>> with open('output/gt.tsv', 'r') as f:
    ...     for line in f:
    ...         print(line, end='')
    locus   alleles     0    1    2    3
    1:1     ["A","C"]   0/1  0/1  0/0  0/0
    1:2     ["A","C"]   1/1  0/1  1/1  1/1
    1:3     ["A","C"]   1/1  0/1  0/1  0/0
    1:4     ["A","C"]   1/1  0/1  1/1  1/1
    <BLANKLINE>

    >>> small_mt.GT.export('output/gt-no-header.tsv', header=False)
    >>> with open('output/gt-no-header.tsv', 'r') as f:
    ...     for line in f:
    ...         print(line, end='')
    1:1     ["A","C"]   0/1  0/1  0/0  0/0
    1:2     ["A","C"]   1/1  0/1  1/1  1/1
    1:3     ["A","C"]   1/1  0/1  0/1  0/0
    1:4     ["A","C"]   1/1  0/1  1/1  1/1
    <BLANKLINE>

    >>> small_mt.pop.export('output/pops.tsv')
    >>> with open('output/pops.tsv', 'r') as f:
    ...     for line in f:
    ...         print(line, end='')
    sample_idx  pop
    0   2
    1   2
    2   0
    3   2
    <BLANKLINE>

    >>> small_mt.ancestral_af.export('output/ancestral_af.tsv')
    >>> with open('output/ancestral_af.tsv', 'r') as f:
    ...     for line in f:
    ...         print(line, end='')
    locus   alleles     ancestral_af
    1:1     ["A","C"]   5.3905e-01
    1:2     ["A","C"]   8.6768e-01
    1:3     ["A","C"]   4.3765e-01
    1:4     ["A","C"]   7.6300e-01
    <BLANKLINE>

    >>> mt = small_mt
    >>> small_mt.bn.export('output/bn.tsv')
    >>> with open('output/bn.tsv', 'r') as f:
    ...     for line in f:
    ...         print(line, end='')
    bn
    {"n_populations":3,"n_samples":4,"n_variants":4,"n_partitions":8,"pop_dist":[1,1,1],"fst":[0.1,0.1,0.1],"mixture":false}
    <BLANKLINE>

    Notes
    -----

    For entry-indexed expressions, if there is one column key field, the
    result of calling :func:`~hail.expr.functions.str` on that field is
    used as the column header. Otherwise, each compound column key is
    converted to JSON and used as a column header. For example:

    >>> small_mt = small_mt.key_cols_by(s=small_mt.sample_idx, family='fam1')
    >>> small_mt.GT.export('output/gt-no-header.tsv')
    >>> with open('output/gt-no-header.tsv', 'r') as f:
    ...     for line in f:
    ...         print(line, end='')
    locus   alleles     {"s":0,"family":"fam1"}  {"s":1,"family":"fam1"}  {"s":2,"family":"fam1"}  {"s":3,"family":"fam1"}
    1:1     ["A","C"]   0/1  0/1  0/0  0/0
    1:2     ["A","C"]   1/1  0/1  1/1  1/1
    1:3     ["A","C"]   1/1  0/1  0/1  0/0
    1:4     ["A","C"]   1/1  0/1  1/1  1/1
    <BLANKLINE>

    Parameters
    ----------
    path : :class:`str`
        The path to which to export.
    delimiter : :class:`str`
        The string for delimiting columns.
    missing : :class:`str`
        The string to output for missing values.
    header : :obj:`bool`
        When ``True`` include a header line.
    """
    uid = Env.get_uid()
    self_name, ds = self._to_relational_preserving_rows_and_cols(uid)
    if isinstance(ds, hl.Table):
        ds.export(output=path, delimiter=delimiter, header=header)
    else:
        assert len(self._indices.axes) == 2
        entries, cols = Env.get_uid(), Env.get_uid()
        t = ds.select_cols().localize_entries(entries, cols)
        t = t.order_by(*t.key)
        output_col_name = Env.get_uid()
        entry_array = t[entries]
        if self_name:
            entry_array = hl.map(lambda x: x[self_name], entry_array)
        entry_array = hl.map(
            lambda x: hl.if_else(hl.is_missing(x), missing, hl.str(x)),
            entry_array)
        file_contents = t.select(
            **{k: hl.str(t[k]) for k in ds.row_key},
            **{output_col_name: hl.delimit(entry_array, delimiter)})
        if header:
            col_key = t[cols]
            if len(ds.col_key) == 1:
                col_key = hl.map(lambda x: x[0], col_key)
            column_names = hl.map(hl.str, col_key).collect(_localize=False)[0]
            header_table = hl.utils.range_table(1).key_by().select(
                **{k: k for k in ds.row_key},
                **{output_col_name: hl.delimit(column_names, delimiter)})
            file_contents = header_table.union(file_contents)
        file_contents.export(path, delimiter=delimiter, header=False)
def main(args):
    ht_snp = hl.import_table(args.snp, impute=True)
    ht_snp = ht_snp.annotate(variant=hl.delimit([
        ht_snp.chromosome,
        hl.str(ht_snp.position), ht_snp.allele1, ht_snp.allele2
    ], delimiter=':'))
    ht_snp = ht_snp.annotate(
        **hl.parse_variant(ht_snp.variant, reference_genome='GRCh38'))
    ht_snp = ht_snp.key_by('locus', 'alleles')
    ht_snp = ht_snp.add_index('idx_snp')
    ht_snp = ht_snp.checkpoint(new_temp_file())

    # annotate vep
    gnomad = hl.read_table(
        'gs://gnomad-public-requester-pays/release/3.0/ht/genomes/gnomad.genomes.r3.0.sites.ht'
    )
    ht_snp = ht_snp.join(gnomad.select('vep'), how='left')
    ht_snp = process_consequences(ht_snp)

    # extract most severe
    ht_snp = ht_snp.annotate(
        vep=(hl.case().when(
            hl.is_defined(ht_snp.vep.worst_csq_for_variant_canonical),
            ht_snp.vep.worst_csq_for_variant_canonical).when(
                hl.is_defined(ht_snp.vep.worst_csq_for_variant),
                ht_snp.vep.worst_csq_for_variant).or_missing()),
        is_canonical_vep=hl.is_defined(
            ht_snp.vep.worst_csq_for_variant_canonical))
    ht_snp = ht_snp.annotate(
        most_severe=hl.if_else(hl.is_defined(ht_snp.vep),
                               ht_snp.vep.most_severe_consequence,
                               'intergenic_variant'),
        gene_most_severe=ht_snp.vep.gene_symbol)
    ht_snp = ht_snp.select_globals()
    ht_snp = ht_snp.drop('vep')
    ht_snp = ht_snp.annotate(
        **annotate_consequence_category(ht_snp.most_severe))
    ht_snp = ht_snp.checkpoint(new_temp_file())

    df = ht_snp.key_by().drop('locus', 'alleles', 'variant',
                              'idx_snp').to_pandas()

    # annotate LD
    for pop in POPS:
        ht = hl.read_table(
            f'gs://gnomad-public-requester-pays/release/2.1.1/ld/gnomad.genomes.r2.1.1.{pop}.common.adj.ld.variant_indices.ht'
        )
        ht = ht.annotate(locus_hg38=hl.liftover(ht.locus, 'GRCh38'))
        ht = ht.filter(hl.is_defined(ht.locus_hg38))
        ht = ht.key_by('locus_hg38', 'alleles').drop('locus')
        ht = ht_snp.join(ht, 'inner')
        ht = ht.checkpoint(new_temp_file())

        lead_idx = ht.order_by(hl.desc(ht.prob)).head(1).idx.collect()
        idx = ht.idx.collect()
        bm = BlockMatrix.read(
            f'gs://gnomad-public-requester-pays/release/2.1.1/ld/gnomad.genomes.r2.1.1.{pop}.common.ld.bm'
        )
        bm = bm.filter(idx, idx)
        # re-densify triangular matrix
        bm = bm + bm.T - get_diag_mat(bm.diagonal())
        bm = bm.filter_rows(
            np.where(np.array(idx) == lead_idx[0])[0].tolist())**2

        idx_snp = ht.idx_snp.collect()
        r2 = bm.to_numpy()[0]
        df[f'gnomad_lead_r2_{pop}'] = np.nan
        df[f'gnomad_lead_r2_{pop}'].iloc[idx_snp] = r2

    if args.out.startswith('gs://'):
        fopen = hl.hadoop_open
    else:
        fopen = open
    with fopen(args.out, 'w') as f:
        df.to_csv(f, sep='\t', na_rep='NA', index=False)
def format_variants_table(ds):

    ############################
    # Derived top level fields #
    ############################

    ds = ds.annotate(
        variant_id=variant_id(ds.locus, ds.alleles),
        chrom=normalized_contig(ds.locus),
        pos=ds.locus.position,
        xpos=x_position(ds.locus),
        ref=ds.alleles[0],
        alt=ds.alleles[1],
    )

    ###############
    # Frequencies #
    ###############

    g = hl.eval(ds.globals)

    freq_index_tree = get_freq_index_tree(g.freq_index_dict)
    subsets = list(freq_index_tree.keys())

    ds = ds.annotate(
        **{
            subset: hl.struct(
                # Adjusted frequencies
                AC_adj=freq_expression(ds, "AC", freq_index_tree[subset]),
                AN_adj=freq_expression(ds, "AN", freq_index_tree[subset]),
                AF_adj=freq_expression(ds, "AF", freq_index_tree[subset]),
                nhomalt_adj=freq_expression(ds, "homozygote_count", freq_index_tree[subset]),
                # Raw frequencies
                AC_raw=ds.freq[g.freq_index_dict[f"{subset}_raw"]].AC,
                AN_raw=ds.freq[g.freq_index_dict[f"{subset}_raw"]].AN,
                AF_raw=ds.freq[g.freq_index_dict[f"{subset}_raw"]].AF,
                nhomalt_raw=ds.freq[g.freq_index_dict[f"{subset}_raw"]].homozygote_count,
                # Popmax
                popmax=ds.popmax[g.popmax_index_dict[subset]].pop,
                AC_popmax=ds.popmax[g.popmax_index_dict[subset]].AC,
                AN_popmax=ds.popmax[g.popmax_index_dict[subset]].AN,
                AF_popmax=ds.popmax[g.popmax_index_dict[subset]].AF,
                nhomalt_popmax=ds.popmax[g.popmax_index_dict[subset]].homozygote_count,
            )
            for subset in subsets
        }
    )

    ##############################
    # Filtering allele frequency #
    ##############################

    faf_index_tree = collections.defaultdict(dict)
    for labels_combo, index in g.faf_index_dict.items():
        labels = labels_combo.split("_")
        # Subset labels contain an _, so rebuild those after splitting them
        if labels[0] == "non":
            labels = ["_".join(labels[0:2])] + labels[2:]

        if len(labels) == 2:
            [subset, pop] = labels
            faf_index_tree[subset][pop] = index
        else:
            assert len(labels) == 1
            subset = labels[0]
            faf_index_tree[subset]["total"] = index

    ds = ds.annotate(
        **{
            subset: ds[subset].annotate(
                faf95_adj=hl.struct(
                    **{pop: ds.faf[index].faf95 for pop, index in faf_index_tree[subset].items()}
                ),
                faf99_adj=hl.struct(
                    **{pop: ds.faf[index].faf99 for pop, index in faf_index_tree[subset].items()}
                ),
            )
            for subset in subsets
        }
    )

    ds = ds.drop("freq", "popmax", "faf")

    ##############
    # Histograms #
    ##############

    # Extract overall age distribution
    ds = ds.transmute(
        gnomad_age_hist_het=ds.age_hist_het[g.age_index_dict["gnomad"]],
        gnomad_age_hist_hom=ds.age_hist_hom[g.age_index_dict["gnomad"]],
    )

    # Convert lists of numbers in histograms into pipe delimited strings
    ds = ds.annotate(
        **{
            field: ds[field].annotate(
                bin_freq=hl.delimit(ds[field].bin_freq, "|"),
                bin_edges=hl.delimit(ds[field].bin_edges, "|"),
            )
            for field in [
                "ab_hist_alt",
                "dp_hist_all",
                "dp_hist_alt",
                "gq_hist_all",
                "gq_hist_alt",
                "gnomad_age_hist_het",
                "gnomad_age_hist_hom",
            ]
        }
    )

    ###########################
    # Quality metrics / flags #
    ###########################

    # Use the same fields as the VCFs
    # Based on https://github.com/macarthur-lab/gnomad_qc/blob/25a81bc2166fbe4ccbb2f7a87d36aba661150413/variant_qc/prepare_data_release.py#L128-L159
    ds = ds.transmute(
        BaseQRankSum=ds.allele_info.BaseQRankSum,
        ClippingRankSum=ds.allele_info.ClippingRankSum,
        DP=ds.allele_info.DP,
        FS=ds.info_FS,
        InbreedingCoeff=ds.info_InbreedingCoeff,
        MQ=ds.info_MQ,
        MQRankSum=ds.info_MQRankSum,
        QD=ds.info_QD,
        ReadPosRankSum=ds.info_ReadPosRankSum,
        rf_negative_label=ds.fail_hard_filters,
        rf_positive_label=ds.tp,
        rf_tp_probability=ds.rf_probability,
        SOR=ds.info_SOR,
        VQSLOD=ds.allele_info.VQSLOD,
        VQSR_culprit=ds.allele_info.culprit,
        VQSR_NEGATIVE_TRAIN_SITE=ds.info_NEGATIVE_TRAIN_SITE,
        VQSR_POSITIVE_TRAIN_SITE=ds.info_POSITIVE_TRAIN_SITE,
    )

    # These fields are left unaltered at the top level:
    #
    # allele_type
    # decoy
    # has_star
    # lcr
    # n_alt_alleles
    # nonpar
    # pab_max
    # rf_label
    # rf_train
    # segdup
    # transmitted_singleton
    # variant_type
    # was_mixed

    # TODO: Remove this, leave these at top level
    ds = ds.transmute(
        allele_info=hl.struct(
            BaseQRankSum=ds.BaseQRankSum,
            ClippingRankSum=ds.ClippingRankSum,
            DP=ds.DP,
            FS=ds.FS,
            InbreedingCoeff=ds.InbreedingCoeff,
            MQ=ds.MQ,
            MQRankSum=ds.MQRankSum,
            QD=ds.QD,
            ReadPosRankSum=ds.ReadPosRankSum,
            SOR=ds.SOR,
            VQSLOD=ds.VQSLOD,
            VQSR_culprit=ds.VQSR_culprit,
            VQSR_NEGATIVE_TRAIN_SITE=ds.VQSR_NEGATIVE_TRAIN_SITE,
            VQSR_POSITIVE_TRAIN_SITE=ds.VQSR_POSITIVE_TRAIN_SITE,
        )
    )

    ###################
    # VEP annotations #
    ###################

    ds = ds.annotate(sortedTranscriptConsequences=sorted_transcript_consequences_v2(ds.vep))
    ds = ds.drop("vep")

    #########
    # Flags #
    #########

    # TODO: Leave these at the top level
    ds = ds.transmute(flags=hl.struct(lcr=ds.lcr, segdup=ds.segdup))

    # TODO: Remove this, these flags are calculated on the fly
    ds = ds.annotate(
        flags=ds.flags.annotate(
            lc_lof=get_expr_for_variant_lc_lof_flag(ds.sortedTranscriptConsequences),
            lof_flag=get_expr_for_variant_loftee_flag_flag(ds.sortedTranscriptConsequences),
        ),
        sortedTranscriptConsequences=hl.bind(
            lambda genes_with_lc_lof_flag, genes_with_loftee_flag_flag: ds.sortedTranscriptConsequences.map(
                lambda csq: csq.annotate(
                    flags=hl.struct(
                        lc_lof=get_expr_for_consequence_lc_lof_flag(csq),
                        lc_lof_in_gene=genes_with_lc_lof_flag.contains(csq.gene_id),
                        lof_flag=get_expr_for_consequence_loftee_flag_flag(csq),
                        lof_flag_in_gene=genes_with_loftee_flag_flag.contains(csq.gene_id),
                        nc_transcript=(csq.category == "lof") & (csq.lof == ""),
                    )
                )
            ),
            get_expr_for_genes_with_lc_lof_flag(ds.sortedTranscriptConsequences),
            get_expr_for_genes_with_loftee_flag_flag(ds.sortedTranscriptConsequences),
        ),
    )

    #################
    # Unused fields #
    #################

    # These fields were not in the 2.1.1 browser Hail table
    ds = ds.drop(
        "adj_biallelic_rank",
        "adj_biallelic_singleton_rank",
        "adj_rank",
        "adj_singleton_rank",
        "biallelic_rank",
        "biallelic_singleton_rank",
        "info_DP",
        "mills",
        "n_nonref",
        "omni",
        "qd",
        "rank",
        "score",
        "singleton_rank",
        "singleton",
        "was_split",
    )

    # These two fields appear only in the genomes table
    if "_score" in ds.row_value.dtype.fields:
        ds = ds.drop("_score", "_singleton")

    ########
    # Keys #
    ########

    # Drop key fields
    ds = ds.key_by().drop("locus", "alleles")

    return ds
print("\n=== Processing ===") mt = mt.annotate_rows(sortedTranscriptConsequences= get_expr_for_vep_sorted_transcript_consequences_array( vep_root=mt.vep)) mt = mt.annotate_rows( main_transcript= get_expr_for_worst_transcript_consequence_annotations_struct( vep_sorted_transcript_consequences_root=mt.sortedTranscriptConsequences )) mt = mt.annotate_rows(gene_ids=get_expr_for_vep_gene_ids_set( vep_transcript_consequences_root=mt.sortedTranscriptConsequences), ) review_status_str = hl.delimit( hl.sorted(hl.array(hl.set(mt.info.CLNREVSTAT)), key=lambda s: s.replace("^_", "z"))) mt = mt.select_rows( allele_id=mt.info.ALLELEID, alt=get_expr_for_alt_allele(mt), chrom=get_expr_for_contig(mt.locus), clinical_significance=hl.delimit( hl.sorted(hl.array(hl.set(mt.info.CLNSIG)), key=lambda s: s.replace("^_", "z"))), domains=get_expr_for_vep_protein_domains_set( vep_transcript_consequences_root=mt.vep.transcript_consequences), gene_ids=mt.gene_ids, gene_id_to_consequence_json=get_expr_for_vep_gene_id_to_consequence_map( vep_sorted_transcript_consequences_root=mt. sortedTranscriptConsequences,