def test_explode_on_set(self):
    """Explode a set-valued field and compare the collected rows with the expected structs."""
    exploded = (
        hl.utils.range_table(1)
        .annotate(a=hl.set(['a', 'b', 'c']))
        .explode('a')
    )
    actual_rows = set(exploded.collect())
    expected_rows = hl.eval(
        hl.set([hl.struct(idx=0, a=letter) for letter in ('a', 'b', 'c')])
    )
    self.assertEqual(actual_rows, expected_rows)
def test_group_by_field_lifetimes(self):
    """Group each axis by a constant key and check the collected set of shifted indices."""
    mt = hl.utils.range_matrix_table(3, 3)

    # Rows: the aggregation refers to a field of the original, ungrouped table.
    rows_grouped = mt.group_rows_by(row_idx='100')
    mt2 = rows_grouped.aggregate(x=hl.agg.collect_as_set(mt.row_idx + 5))
    rows_ok = mt2.aggregate_entries(hl.agg.all(mt2.x == hl.set({5, 6, 7})))
    assert rows_ok

    # Columns: same check along the other axis.
    cols_grouped = mt.group_cols_by(col_idx='100')
    mt3 = cols_grouped.aggregate(x=hl.agg.collect_as_set(mt.col_idx + 5))
    cols_ok = mt3.aggregate_entries(hl.agg.all(mt3.x == hl.set({5, 6, 7})))
    assert cols_ok
def test_group_by_field_lifetimes(self):
    """Run the constant-key grouping check once per axis (rows, then columns)."""
    mt = hl.utils.range_matrix_table(3, 3)

    # Each case pairs the grouping method with the index field it replaces.
    cases = [(mt.group_rows_by, 'row_idx'), (mt.group_cols_by, 'col_idx')]
    for group_by, field in cases:
        grouped = group_by(**{field: '100'}).aggregate(
            x=hl.agg.collect_as_set(mt[field] + 5))
        assert grouped.aggregate_entries(
            hl.agg.all(grouped.x == hl.set({5, 6, 7})))
def test_multiple_files_variant_filtering(self):
    """Import three BGEN files restricted to six variants and compare with filtering the full import."""
    bgen_file = [
        resource('random-b.bgen'),
        resource('random-c.bgen'),
        resource('random-a.bgen'),
    ]
    hl.index_bgen(bgen_file)

    alleles = ['A', 'G']
    positions = (11, 13, 29, 28, 1, 12)
    desired_variants = [
        hl.Struct(locus=hl.Locus('20', position), alleles=alleles)
        for position in positions
    ]

    actual = hl.import_bgen(bgen_file, ['GT'], n_partitions=10, variants=desired_variants)
    self.assertEqual(actual.count_rows(), 6)

    everything = hl.import_bgen(bgen_file, ['GT'])
    self.assertEqual(everything.count(), (30, 10))

    is_desired = hl.set(desired_variants).contains(everything.row_key)
    expected = everything.filter_rows(is_desired)
    self.assertTrue(expected._same(actual))
def pop_max_expr(
    freq: hl.expr.ArrayExpression,
    freq_meta: hl.expr.ArrayExpression,
    pops_to_exclude: Optional[Set[str]] = None,
) -> hl.expr.StructExpression:
    """
    Create an expression containing popmax.

    Popmax is the frequency information about the population that has the
    highest AF from the populations provided in `freq_meta`, excluding those
    specified in `pops_to_exclude`. Only frequencies from adj populations are
    considered, and only entries with AC > 0.

    The resulting struct contains the following fields:

        - AC: int32
        - AF: float64
        - AN: int32
        - homozygote_count: int32
        - pop: str

    :param freq: ArrayExpression of Structs with fields ['AC', 'AF', 'AN', 'homozygote_count']
    :param freq_meta: ArrayExpression of meta dictionaries corresponding to freq (as returned by annotate_freq)
    :param pops_to_exclude: Set of populations to skip for popmax calculation.
        `None` (the default) or an empty set excludes nothing.
    :return: Popmax struct, missing when no population qualifies
    """
    # `hl.literal` cannot impute a type from `None` or from an empty set, so
    # use an explicitly typed empty set when there is nothing to exclude.
    _pops_to_exclude = (
        hl.literal(pops_to_exclude)
        if pops_to_exclude
        else hl.empty_set(hl.tstr)
    )

    # Indices of the adj, population-only frequency entries that are not excluded.
    popmax_freq_indices = hl.range(0, hl.len(freq_meta)).filter(
        lambda i: (hl.set(freq_meta[i].keys()) == {"group", "pop"})
        & (freq_meta[i]["group"] == "adj")
        & (~_pops_to_exclude.contains(freq_meta[i]["pop"]))
    )

    # Attach the population label and drop entries with no observed alleles.
    freq_filtered = popmax_freq_indices.map(
        lambda i: freq[i].annotate(pop=freq_meta[i]["pop"])
    ).filter(lambda f: f.AC > 0)

    sorted_freqs = hl.sorted(freq_filtered, key=lambda x: x.AF, reverse=True)
    return hl.or_missing(hl.len(sorted_freqs) > 0, sorted_freqs[0])
def merge_alleles(alleles):
    """Merge several per-input allele arrays onto one shared reference allele.

    The shared reference is the longest first element found across the
    input arrays (a missing first element counts as the empty string).
    Each input array is rewritten as ``[ref] + alts``; an alternate allele
    classified as SNP, Insertion, Deletion, MNP or Complex gets the part of
    the shared reference beyond its own reference (``ref[len(r):]``)
    appended, any other allele is kept unchanged.

    :param alleles: array expression whose elements are allele arrays with
        the reference allele at index 0
    :return: struct with ``globl`` (the shared reference followed by the
        distinct remaining alleles of all rewritten arrays) and ``local``
        (the rewritten per-input arrays)
    """
    from hail.expr.functions import _num_allele_type, _allele_ints
    return hl.rbind(
        # Longest reference allele over all inputs.
        alleles.map(lambda a: hl.or_else(a[0], ''))
               .fold(lambda s, t: hl.cond(hl.len(s) > hl.len(t), s, t), ''),
        lambda ref:
        hl.rbind(
            alleles.map(
                lambda al: hl.rbind(
                    al[0],
                    lambda r:
                    hl.array([ref]).extend(
                        al[1:].map(
                            lambda a:
                            hl.rbind(
                                _num_allele_type(r, a),
                                lambda at:
                                hl.cond(
                                    (_allele_ints['SNP'] == at) |
                                    (_allele_ints['Insertion'] == at) |
                                    (_allele_ints['Deletion'] == at) |
                                    (_allele_ints['MNP'] == at) |
                                    (_allele_ints['Complex'] == at),
                                    # Pad with the tail of the shared reference.
                                    a + ref[hl.len(r):],
                                    a)))))),
            lambda lal:
            hl.struct(
                globl=hl.array([ref]).extend(hl.array(hl.set(hl.flatten(lal)).remove(ref))),
                local=lal)))
def filter_to_clinvar_pathogenic(
    t: Union[hl.MatrixTable, hl.Table],
    clnrevstat_field: str = "CLNREVSTAT",
    clnsig_field: str = "CLNSIG",
    clnsigconf_field: str = "CLNSIGCONF",
    remove_no_assertion: bool = True,
    remove_conflicting: bool = True,
) -> Union[hl.MatrixTable, hl.Table]:
    """
    Filter ClinVar data to pathogenic and likely pathogenic variants.

    A variant is kept when at least one lower-cased value of
    `t.info[clnsig_field]` contains "pathogenic". The number of variants is
    logged before and after filtering.

    Example use:

    .. code-block:: python

        from gnomad.resources.grch38.reference_data import clinvar
        clinvar_ht = clinvar.ht()
        clinvar_ht = filter_to_clinvar_pathogenic(clinvar_ht)

    :param t: Input dataset that contains clinvar data, either a MatrixTable or a Table.
    :param clnrevstat_field: Name of the `info` field holding the review status of the
        clinical significance of clinvar variants.
    :param clnsig_field: Name of the `info` field holding the clinical significance of
        the clinvar variant.
    :param clnsigconf_field: Name of the `info` field holding the conflicting clinical
        significance values for the variant. For variants with no conflicting
        significance, this field should be undefined.
    :param remove_no_assertion: Remove entries whose review status has no assertions
        (zero stars).
    :param remove_conflicting: Remove entries with conflicting clinical interpretations.
    :return: Filtered MatrixTable or Table
    """
    is_matrix_table = isinstance(t, hl.MatrixTable)

    n_before = t.count_rows() if is_matrix_table else t.count()
    logger.info("Found %d variants before filtering", n_before)

    clnsig_lower = t.info[clnsig_field].map(lambda x: x.lower())
    path_expr = clnsig_lower.map(lambda x: x.contains("pathogenic")).any(lambda x: x)

    if remove_no_assertion:
        logger.info("Variants without assertions will be removed.")
        no_star_assertions = hl.literal({
            "no_assertion_provided",
            "no_assertion_criteria_provided",
            "no_interpretation_for_the_individual_variant",
        })
        review_statuses = hl.set(t.info[clnrevstat_field])
        has_no_zero_star = review_statuses.intersection(no_star_assertions).length() == 0
        path_expr = path_expr & (has_no_zero_star)

    if remove_conflicting:
        logger.info(
            "Variants with conflicting clinical interpretations will be removed."
        )
        path_expr = path_expr & hl.is_missing(t.info[clnsigconf_field])

    t = t.filter_rows(path_expr) if is_matrix_table else t.filter(path_expr)

    n_after = t.count_rows() if isinstance(t, hl.MatrixTable) else t.count()
    logger.info(
        "Found %d variants after filtering to clinvar pathogenic variants.",
        n_after,
    )
    return t
def main():
    """Flag genes with low max pext on the prepared pext data and write the table to `output_path`."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--base-level-pext",
        help="Path to Hail table with base-level data",
        default="gs://gnomad-public/papers/2019-tx-annotation/gnomad_browser/all.baselevel.021620.ht",
    )
    parser.add_argument(
        "--low-max-pext-genes",
        help="Path to table containing list of genes with low max pext",
        default="gs://gnomad-public/papers/2019-tx-annotation/data/GRCH37_hg19/max_pext_low_genes.021520.tsv",
    )
    parser.add_argument("output_path", help="Path to output Hail table with region-level data")
    args = parser.parse_args()

    ds = prepare_pext_data(args.base_level_pext)

    # Collect the `ensg` column of the gene list into a local Python set.
    genes_table = hl.import_table(args.low_max_pext_genes)
    low_max_pext_genes = genes_table.aggregate(hl.agg.collect_as_set(genes_table.ensg))

    is_low_max_pext = hl.set(low_max_pext_genes).contains(ds.gene_id)
    flags = hl.cond(
        is_low_max_pext,
        hl.literal(["low_max_pext"]),
        hl.empty_array(hl.tstr),
    )
    ds = ds.annotate(flags=flags)

    ds.write(args.output_path)
def get_mt_filtered_by_pops(pops: list,
                            chrom: str = 'all',
                            imputed: bool = True,
                            min_mac: int = 20,
                            entry_fields=('GP', ),
                            filter_mac_instead_of_ac: bool = False):
    r'''
    Wrap `get_filtered_mt()` from ukbb_pan_ancestry.resources.genotypes and
    keep only samples from the populations listed in `pops`.

    NOTE: If chrom='all', this loads all autosomes AND chrX.
    '''
    assert len(pops) > 0 and set(pops).issubset(POPS)

    several_pops = len(pops) > 1
    kwargs = dict(
        pop='all' if several_pops else pops[0],
        imputed=imputed,
        min_mac=min_mac,
        entry_fields=entry_fields,
        filter_mac_instead_of_ac=filter_mac_instead_of_ac,
    )

    # in this case chrom='all' gets autosomes
    mt = get_filtered_mt(chrom=chrom, **kwargs)
    if chrom == 'all':
        # chrX is fetched separately and appended.
        mt = mt.union_rows(get_filtered_mt(chrom='X', **kwargs))

    if several_pops:
        mt = mt.filter_cols(hl.set(pops).contains(mt.pop))
    return mt
def test_multiple_files_variant_filtering(self):
    """Check variant filtering across three BGEN files against a filter of the unrestricted import."""
    bgen_file = [
        resource('random-b.bgen'),
        resource('random-c.bgen'),
        resource('random-a.bgen'),
    ]
    hl.index_bgen(bgen_file)

    alleles = ['A', 'G']

    def variant_at(position):
        return hl.Struct(locus=hl.Locus('20', position), alleles=alleles)

    desired_variants = [
        variant_at(11),
        variant_at(13),
        variant_at(29),
        variant_at(28),
        variant_at(1),
        variant_at(12),
    ]

    actual = hl.import_bgen(bgen_file,
                            ['GT'],
                            n_partitions=10,
                            variants=desired_variants)
    self.assertEqual(actual.count_rows(), 6)

    everything = hl.import_bgen(bgen_file, ['GT'])
    self.assertEqual(everything.count(), (30, 10))

    wanted = hl.set(desired_variants)
    expected = everything.filter_rows(wanted.contains(everything.row_key))
    self.assertTrue(expected._same(actual))
def collect_gene_exons(gene_exons):
    """Return merged CDS, UTR and non-coding exon regions with `xstart`/`xstop` added."""
    # The exons collection holds three feature types: "CDS", "UTR", and "exon".
    # "exon" regions cover the "CDS" and "UTR" regions, and some (non-coding)
    # transcripts contain only "exon" regions. Only the "exon" regions of
    # non-coding transcripts are kept here.
    #
    # That keeps the UI for selecting visible regions simple: it can filter on
    # the "CDS" or "UTR" feature type without also having to drop the "exon"
    # regions that duplicate them.
    cds_exons = gene_exons.filter(lambda exon: exon.feature_type == "CDS")
    utr_exons = gene_exons.filter(lambda exon: exon.feature_type == "UTR")

    coding_transcript_ids = hl.set(
        gene_exons.filter(
            lambda exon: (exon.feature_type == "CDS") | (exon.feature_type == "UTR")
        ).map(lambda exon: exon.transcript_id)
    )
    non_coding_transcript_exons = hl.bind(
        lambda coding_transcripts: gene_exons.filter(
            lambda exon: ~coding_transcripts.contains(exon.transcript_id)),
        coding_transcript_ids,
    )

    merged_cds = merge_overlapping_regions(cds_exons)
    merged_utr = merge_overlapping_regions(utr_exons)
    merged_non_coding = merge_overlapping_regions(non_coding_transcript_exons)
    exons = merged_cds.extend(merged_utr).extend(merged_non_coding)

    return exons.map(
        lambda exon: exon.select(
            "feature_type",
            "start",
            "stop",
            xstart=xpos(exon.chrom, exon.start),
            xstop=xpos(exon.chrom, exon.stop),
        )
    )
def make_pop_filters_expr(ht: hl.Table, qc_metrics: List[str]) -> hl.expr.SetExpression:
    """Build the set of metric names whose `fail_<metric>` field on `ht` is true."""
    # Each element is the metric name when its fail flag is true, missing otherwise.
    name_if_failed = [
        hl.or_missing(ht[f'fail_{metric}'], metric) for metric in qc_metrics
    ]
    failed_names = hl.filter(lambda x: hl.is_defined(x), name_if_failed)
    return hl.set(failed_names)
def make_hard_filters_expr(ht: hl.Table, data_type: str) -> hl.expr.SetExpression:
    """
    Build the set of hard-filter names that apply to each row of `ht`.

    NOTE: additional metadata in Kristen's import file is hard-coded

    :param Table ht: input table
    :param str data_type: 'exomes' or 'genomes'
    :return: set of names of the hard filters whose condition is true
    :rtype: SetExpression
    """
    hard_filters = {
        'contamination': ht.freemix > 0.05,
        'callrate': ht.callrate < 0.85,
        'chimera': ht.pct_chimeras > 0.05,
        'ambiguous_sex': ht.ambiguous_sex,
    }

    # Data-type specific filters; anything other than 'exomes' takes the second branch.
    if data_type == 'exomes':
        hard_filters['coverage'] = ht.mean_chr20_coverage == 0
        hard_filters['sex_aneuploidy'] = ht.sex_aneuploidy
    else:
        hard_filters['coverage'] = ht.mean_dp < 15
        hard_filters['insert_size'] = ht.median_insert_size < 250

    name_if_failed = [
        hl.or_missing(filter_expr, name)
        for name, filter_expr in hard_filters.items()
    ]
    return hl.set(hl.filter(lambda x: hl.is_defined(x), name_if_failed))
def apply_mito_artifact_filter(
        mt: hl.MatrixTable,
        artifact_prone_sites_path: str,
) -> hl.MatrixTable:
    """Add back in artifact_prone_site filter

    The row field `filters` is set to {"artifact_prone_site"} when the
    reference allele of the variant overlaps any listed site, and to {"PASS"}
    otherwise.

    :param hl.MatrixTable mt: MatrixTable to use an input
    :param str artifact_prone_sites_path: path to BED file of artifact_prone_sites to flag in the filters column;
        the third whitespace-separated column of each line is read as the site position
    :return: MatrixTable with artifact_prone_sites filter
    :rtype: hl.MatrixTable

    """
    # apply "artifact_prone_site" filter to any SNP or deletion that spans a known problematic site
    mt = mt.annotate_rows(
        position_range=hl.range(mt.locus.position, mt.locus.position + hl.len(mt.alleles[0])))

    artifact_sites = set()
    with hl.hadoop_open(artifact_prone_sites_path) as f:
        for line in f:
            fields = line.split()
            if not fields:
                # Blank lines (e.g. a trailing newline) carry no site.
                continue
            artifact_sites.add(int(fields[2]))

    # An explicit type keeps this working when the file lists no sites:
    # `hl.literal` cannot impute the element type of an empty set.
    sites = hl.literal(artifact_sites, dtype=hl.tset(hl.tint32))

    mt = mt.annotate_rows(filters=hl.if_else(
        hl.len(hl.set(mt.position_range).intersection(sites)) > 0,
        {"artifact_prone_site"},
        {"PASS"},
    ))

    mt = mt.drop("position_range")

    return mt
def remove_FT_values(
        mt: hl.MatrixTable,
        filters_to_remove: list = None
) -> hl.MatrixTable:
    """Removes the FT filters specified in filters_to_remove

    By default, this function removes the 'possible_numt', 'mt_many_low_hets', and 'FAIL' filters
    (because these filters were found to have low performance), and the 'blacklisted_site' filter
    because this filter did not always behave as expected in early GATK versions
    (can be replaced with apply_mito_artifact_filter function)

    :param hl.MatrixTable mt: MatrixTable
    :param list filters_to_remove: list of FT filters that should be removed from the entries;
        ``None`` selects the default list described above
    :return: MatrixTable with certain FT filters removed
    :rtype: MatrixTable

    """
    if filters_to_remove is None:
        # Built on every call so no mutable default list is shared between calls.
        filters_to_remove = [
            'possible_numt', 'mt_many_low_hets', 'FAIL', 'blacklisted_site'
        ]

    filters_to_remove = hl.set(filters_to_remove)
    mt = mt.annotate_entries(
        FT=hl.array((mt.FT).difference(filters_to_remove)))

    # if no filters exists after removing those specified above, set the FT field to PASS
    mt = mt.annotate_entries(
        FT=hl.if_else(hl.len(mt.FT) == 0, ["PASS"], mt.FT))

    return mt
def combine(ts):
    """Combine the per-input records held in ``ts.data`` into single rows.

    Row annotations: ``alleles`` comes from ``merge_alleles`` over the
    per-input allele arrays, ``rsid`` is the first defined rsid, ``filters``
    is the set of all per-input filters, and the ``info`` fields are summed
    across inputs (``SB`` element-wise over its four positions).

    ``__entries`` is the concatenation of every input's entries: an input
    whose entries are missing contributes one missing entry per column of
    ``tmp.g[i].__cols``, otherwise each entry is passed to
    ``renumber_entry`` together with that input's old-to-new allele index
    mapping. The global ``__cols`` is the flattened per-input ``__cols``.

    :param ts: table with a ``data`` array field on rows and a ``g`` array
        field read for the globals annotation
    :return: the annotated table with ``data`` and ``g`` dropped
    """
    # pylint: disable=protected-access
    tmp = ts.annotate(
        alleles=merge_alleles(ts.data.map(lambda d: d.alleles)),
        rsid=hl.find(hl.is_defined, ts.data.map(lambda d: d.rsid)),
        filters=hl.set(hl.flatten(ts.data.map(lambda d: hl.array(d.filters)))),
        info=hl.struct(
            DP=hl.sum(ts.data.map(lambda d: d.info.DP)),
            MQ_DP=hl.sum(ts.data.map(lambda d: d.info.MQ_DP)),
            QUALapprox=hl.sum(ts.data.map(lambda d: d.info.QUALapprox)),
            RAW_MQ=hl.sum(ts.data.map(lambda d: d.info.RAW_MQ)),
            VarDP=hl.sum(ts.data.map(lambda d: d.info.VarDP)),
            SB=hl.array([
                hl.sum(ts.data.map(lambda d: d.info.SB[0])),
                hl.sum(ts.data.map(lambda d: d.info.SB[1])),
                hl.sum(ts.data.map(lambda d: d.info.SB[2])),
                hl.sum(ts.data.map(lambda d: d.info.SB[3]))
            ])))
    tmp = tmp.annotate(
        __entries=hl.bind(
            # combined_allele_index maps each merged allele to its index in tmp.alleles.
            lambda combined_allele_index:
            hl.range(0, hl.len(tmp.data)).flatmap(
                lambda i:
                hl.cond(hl.is_missing(tmp.data[i].__entries),
                        # Input i has no entries: emit one missing entry per column.
                        hl.range(0, hl.len(tmp.g[i].__cols))
                        .map(lambda _: hl.null(tmp.data[i].__entries.dtype.element_type)),
                        hl.bind(
                            # old_to_new[j] is the merged index of input i's j-th allele.
                            lambda old_to_new: tmp.data[i].__entries.map(lambda e: renumber_entry(e, old_to_new)),
                            hl.range(0, hl.len(tmp.data[i].alleles)).map(
                                lambda j: combined_allele_index[tmp.data[i].alleles[j]])))),
            hl.dict(hl.range(0, hl.len(tmp.alleles)).map(
                lambda j: hl.tuple([tmp.alleles[j], j])))))
    tmp = tmp.annotate_globals(__cols=hl.flatten(tmp.g.map(lambda g: g.__cols)))

    return tmp.drop('data', 'g')
def add_filters_expr(
    filters: Dict[str, hl.expr.BooleanExpression],
    current_filters: hl.expr.SetExpression = None,
) -> hl.expr.SetExpression:
    """
    Build a set expression of filter names, optionally extending existing filters.

    Every key of `filters` whose expression evaluates to `True` is added as a
    filter name. When `current_filters` is given, its members are kept.

    :param filters: The filters and their expressions
    :param current_filters: The set of current filters
    :return: An expression that can be used to annotate the filters
    """
    if current_filters is None:
        current_filters = hl.empty_set(hl.tstr)

    # One single-element set per filter that applies, an empty set otherwise.
    per_filter_sets = [
        hl.cond(filter_condition, hl.set([filter_name]), hl.empty_set(hl.tstr))
        for filter_name, filter_condition in filters.items()
    ]

    return hl.fold(lambda x, y: x.union(y), current_filters, per_filter_sets)
def make_perm_filters_expr(ht: hl.Table, data_type: str) -> hl.expr.SetExpression:
    """
    Build the set of permission-filter names that apply to each row of `ht`.

    NOTE: syndip will remain dropped wrt to permissions, but all possible QC measures will still be calculated

    :param Table ht: input table
    :param str data_type: 'exomes' or 'genomes'
    :return: set of names of the permission filters whose condition is true
    :rtype: SetExpression
    """
    if data_type == 'genomes':
        perm_filters = {'not_releasable': ~ht.releasable_2_1}
    else:
        perm_filters = dict(
            tcga_tumor=ht.tcga_tumor,
            tcga_barcode=ht.tcga_weird_barcode,
            tcga_below_30=ht.tcga_below_30,
            specific_exclusion=ht.specific_exclusion,
            esp=ht.esp,
            not_releasable=ht.non_releasable,
            syndip=ht.syndip,
        )

    name_if_flagged = [
        hl.or_missing(filter_expr, name)
        for name, filter_expr in perm_filters.items()
    ]
    return hl.set(hl.filter(lambda x: hl.is_defined(x), name_if_flagged))
def test_import_bgen_variant_filtering(self):
    """Import a BGEN file restricted to selected row indexes and compare with filtering the full import."""
    bgen_path = resource('example.8bits.bgen')
    desired_variant_indexes = [1, 2, 3, 5, 7, 9, 11, 13, 17, 198]

    actual = hl.import_bgen(
        bgen_path,
        ['GT'],
        contig_recoding={'01': '1'},
        reference_genome=None,
        n_partitions=10,
        _row_fields=['file_row_idx'],
        _variants_per_file={bgen_path: desired_variant_indexes},
    )

    # doing the expected import_bgen second catches the case where the
    # hadoop configuration is polluted with old data from the
    # _variants_per_file
    everything = hl.import_bgen(
        bgen_path,
        ['GT'],
        contig_recoding={'01': '1'},
        reference_genome=None,
        _row_fields=['file_row_idx'],
    )
    self.assertEqual(everything.count(), (199, 500))

    wanted_indexes = hl.set(desired_variant_indexes)
    expected = everything.filter_rows(
        wanted_indexes.contains(hl.int32(everything.file_row_idx)))
    self.assertTrue(expected._same(actual))

    contig_and_position = hl.str(actual.locus.contig) + ":" + hl.str(actual.locus.position)
    expected_loci = [
        '1:3000',
        '1:4000',
        '1:5000',
        '1:7000',
        '1:9000',
        '1:11000',
        '1:13000',
        '1:15000',
        '1:19000',
        '1:100001',
    ]
    self.assertEqual(contig_and_position.collect(), expected_loci)
def vep_genes_expr(vep_expr: hl.expr.StructExpression, least_consequence: str) -> hl.expr.SetExpression:
    """
    Collect gene IDs of protein-coding transcripts carrying a sufficiently ranked consequence.

    The accepted consequences are the entries of `CSQ_ORDER` up to and
    including `least_consequence`.

    :param vep_expr: VEP struct with a `transcript_consequences` array
    :param least_consequence: last entry of `CSQ_ORDER` to accept
    :return: set of gene IDs
    """
    cutoff = CSQ_ORDER.index(least_consequence) + 1
    vep_consequences = hl.literal(set(CSQ_ORDER[0:cutoff]))

    def _qualifies(tc):
        has_consequence = tc.consequence_terms.any(lambda c: vep_consequences.contains(c))
        return (tc.biotype == 'protein_coding') & (has_consequence)

    qualifying = vep_expr.transcript_consequences.filter(_qualifies)
    return hl.set(qualifying.map(lambda x: x.gene_id))
def test_complex_round_trips():
    """Pass a range of container and struct expressions to `assert_round_trip`."""
    nested = hl.struct(
        x=hl.dict({3: 'a', 4: 'b', 5: 'c'}),
        y=hl.array([3, 4, 5]),
        z=hl.set([3, 4, 5, 3]),
    )
    expressions = [
        hl.struct(),
        hl.empty_array(hl.tint32),
        hl.empty_set(hl.tint32),
        hl.empty_dict(hl.tint32, hl.tint32),
        hl.locus('1', 100),
        hl.struct(x=3),
        hl.set([3, 4, 5, 3]),
        hl.array([3, 4, 5]),
        hl.dict({3: 'a', 4: 'b', 5: 'c'}),
        nested,
    ]
    for expression in expressions:
        assert_round_trip(expression)
def get_expr_for_worst_transcript_consequence_annotations_struct(
        vep_sorted_transcript_consequences_root, include_coding_annotations=True):
    """Return the first element of the sorted transcript consequences, reduced to a fixed set of fields.

    The input is expected to be ranked as computed by
    get_expr_for_vep_sorted_transcript_consequences_array(..). When the array
    is empty, a struct with the same fields, all missing, is returned.

    Args:
        vep_sorted_transcript_consequences_root (ArrayExpression): sorted transcript consequences
        include_coding_annotations (bool): also include the coding-specific fields
    """
    transcript_consequences = {
        "biotype": hl.tstr,
        "canonical": hl.tint,
        "category": hl.tstr,
        "cdna_start": hl.tint,
        "cdna_end": hl.tint,
        "codons": hl.tstr,
        "gene_id": hl.tstr,
        "gene_symbol": hl.tstr,
        "hgvs": hl.tstr,
        "hgvsc": hl.tstr,
        "major_consequence": hl.tstr,
        "major_consequence_rank": hl.tint,
        "transcript_id": hl.tstr,
    }

    if include_coding_annotations:
        coding_fields = (
            "amino_acids",
            "domains",
            "hgvsp",
            "lof",
            "lof_flags",
            "lof_filter",
            "lof_info",
            "polyphen_prediction",
            "protein_id",
            "sift_prediction",
        )
        for coding_field in coding_fields:
            transcript_consequences[coding_field] = hl.tstr

    all_missing = hl.struct(**{
        field: hl.null(field_type)
        for field, field_type in transcript_consequences.items()
    })

    def _select_fields(worst_transcript_consequence):
        # `domains` is de-duplicated and joined into a single string before selecting.
        with_domains = worst_transcript_consequence.annotate(
            domains=hl.delimit(hl.set(worst_transcript_consequence.domains)))
        return with_domains.select(*transcript_consequences.keys())

    return hl.cond(
        vep_sorted_transcript_consequences_root.size() == 0,
        all_missing,
        hl.bind(_select_fields, vep_sorted_transcript_consequences_root[0]),
    )
def make_filters_expr(ht: hl.Table, qc_metrics: Iterable[str]) -> hl.expr.SetExpression:
    """Return the set of QC metric names whose `fail_<metric>` field on `ht` is true."""
    candidates = []
    for metric in qc_metrics:
        # Missing unless the metric's fail flag is true.
        candidates.append(hl.or_missing(ht[f"fail_{metric}"], metric))

    return hl.set(hl.filter(lambda x: hl.is_defined(x), candidates))
def vep_protein_domain_filter_expr(d: hl.expr.DictExpression) -> hl.expr.BooleanExpression:
    """
    Return True or False depending on whether any key of `d` is one of the
    pre-defined protein domain sources (`PROTEIN_DOMAIN_DB`).

    Expected as input dict<k,v> where keys (k) represent source/database and
    values (v) the annotated domain_id.

    :param d: hl.DictExpression
    :return: hl.BoolExpression
    """
    domain_dbs = hl.set(PROTEIN_DOMAIN_DB)
    shared_sources = d.key_set().intersection(domain_dbs)
    return shared_sources.length() >= 1
def finalize_annotated_table_for_seqr_variants(mt: hl.MatrixTable) -> hl.MatrixTable:
    """Given a messily-but-completely annotated Hail MatrixTable of variants,
    return a new MatrixTable with appropriate formatting to export to
    Elasticsearch and consume by Seqr.

    TO-EXTREMELY-DO: Create a app/common Python 3 module with code for
    SeqrAnnotatedVariant, with methods to im/export to/from Hail/Elasticsearch.

    :param mt: A VCF loaded into hail 0.2, VEP has been run, and
        reference/computed fields have been added.
    :type mt: hl.MatrixTable
    :return: A hail matrix table of variants and VEP annotations with proper
        formatting to be consumed by Seqr.
    :rtype: hl.MatrixTable
    """
    # Type of the missing value used for `mainTranscript` when there are no
    # sorted transcript consequences.
    main_transcript_type = "struct {biotype: str,canonical: int32,cdna_start: int32,cdna_end: int32,codons: str,gene_id: str,gene_symbol: str,hgvsc: str,hgvsp: str,transcript_id: str,amino_acids: str,lof: str,lof_filter: str,lof_flags: str,lof_info: str,polyphen_prediction: str,protein_id: str,protein_start: int32,sift_prediction: str,consequence_terms: array<str>,domains: array<str>,major_consequence: str,category: str,hgvs: str,major_consequence_rank: int32,transcript_rank: int32}"

    mt = mt.annotate_rows(
        sortedTranscriptConsequences=get_expr_for_vep_sorted_transcript_consequences_array(vep_root=mt.vep))

    sorted_consequences = mt.sortedTranscriptConsequences
    vep_consequences = mt.vep.transcript_consequences

    mt = mt.annotate_rows(
        mainTranscript=hl.cond(
            hl.len(sorted_consequences) > 0,
            sorted_consequences[0],
            hl.null(main_transcript_type)),
        #allele_id=clinvar_mt.index_rows(mt.row_key).vep.id,
        alt=get_expr_for_alt_allele(mt),
        chrom=get_expr_for_contig(mt.locus),
        #clinvar_clinical_significance=clinvar_mt.index_rows(mt.row_key).clinical_significance,
        domains=get_expr_for_vep_protein_domains_set(
            vep_transcript_consequences_root=vep_consequences),
        geneIds=hl.set(vep_consequences.map(lambda c: c.gene_id)),
        # gene_id_to_consequence_json=get_expr_for_vep_gene_id_to_consequence_map(
        #     vep_sorted_transcript_consequences_root=mt.sortedTranscriptConsequences,
        #     gene_ids=clinvar_mt.gene_ids
        # ),
        #gold_stars= clinvar_mt.index_entries(mt.row_key,mt.col_key).gold_stars,
        pos=get_expr_for_start_pos(mt),
        ref=get_expr_for_ref_allele(mt),
        #review_status=clinvar_mt.index_rows(mt.locus,mt.alleles).review_status,
        transcript_consequence_terms=get_expr_for_vep_consequence_terms_set(
            vep_transcript_consequences_root=sorted_consequences),
        transcript_ids=get_expr_for_vep_transcript_ids_set(
            vep_transcript_consequences_root=sorted_consequences),
        transcript_id_to_consequence_json=get_expr_for_vep_transcript_id_to_consequence_map(
            vep_transcript_consequences_root=sorted_consequences),
        variant_id=get_expr_for_variant_id(mt),
        xpos=get_expr_for_xpos(mt.locus))
    return mt
def get_exons(gencode):
    """Keep the "exon", "CDS" and "UTR" rows of `gencode` and select region fields from them."""
    wanted_features = hl.set(["exon", "CDS", "UTR"])
    exons = gencode.filter(wanted_features.contains(gencode.feature))

    region_start = exons.interval.start
    region_end = exons.interval.end

    return exons.select(
        feature_type=exons.feature,
        # Keep only the part of each ID before the first ".".
        transcript_id=exons.transcript_id.split("\\.")[0],
        gene_id=exons.gene_id.split("\\.")[0],
        # Drop the first three characters of the sequence name.
        chrom=region_start.seqname[3:],
        strand=exons.strand,
        start=region_start.position,
        stop=region_end.position,
    )
def prepare_pext_data(base_level_pext_path, low_max_pext_genes_path):
    """Prepare base-level pext data and flag rows whose gene is in the low-max-pext gene list.

    :param base_level_pext_path: path passed to `prepare_base_level_pext`
    :param low_max_pext_genes_path: path of a table with an `ensg` column
    :return: dataset with a `flags` array field
    """
    ds = prepare_base_level_pext(base_level_pext_path)

    # Collect the gene IDs into a local Python set.
    genes_table = hl.import_table(low_max_pext_genes_path)
    low_max_pext_genes = genes_table.aggregate(hl.agg.collect_as_set(genes_table.ensg))

    is_low_max_pext = hl.set(low_max_pext_genes).contains(ds.gene_id)
    flags = hl.if_else(
        is_low_max_pext,
        hl.literal(["low_max_pext"]),
        hl.empty_array(hl.tstr),
    )
    return ds.annotate(flags=flags)
def main(args):
    """Rewrite each population's `.bim` file with `chrom:pos:ref:alt` SNP identifiers.

    Variants are matched between `variants.txt` and the `.bim` file on
    chromosome, position and the unordered pair of alleles, so the match does
    not depend on which allele the `.bim` file lists first. The output is
    written next to the input with the suffix `.bim_new`, in the original row
    order.

    :param args: parsed arguments; `args.pop` selects a single population
        (case-insensitive), `None` processes all populations
    """
    variants = hl.import_table(f'{ldprune_dir}/variants.txt', delimiter=' ')
    variants = variants.annotate(alleles=hl.set([variants.ref, variants.alt]))
    variants = variants.key_by('chrom', 'pos', 'alleles')

    all_pops = ['AFR', 'AMR', 'CSA', 'EAS', 'EUR', 'MID']
    pops = [args.pop.upper()] if args.pop is not None else all_pops

    for pop in pops:
        bim0 = hl.import_table(f'{ldprune_dir}/subsets_5k/not_{pop}/not_{pop}.bim',
                               delimiter=' ',
                               no_header=True)
        bim0 = bim0.rename({'f0': 'chrom',
                            'f3': 'pos',
                            'f4': 'A1_bim',
                            'f5': 'A2_bim'})
        # The index records the original row order, restored before export.
        bim0 = bim0.add_index()

        bim = bim0.annotate(alleles=hl.set([bim0.A1_bim, bim0.A2_bim]))
        bim = bim.key_by('chrom', 'pos', 'alleles')
        bim = bim.annotate(ref=variants[bim.key].ref,
                           alt=variants[bim.key].alt)
        bim = bim.annotate(SNP=bim.chrom + ':' + bim.pos + ':' + bim.ref + ':' + bim.alt)

        bim = bim.key_by()
        bim = bim.order_by(bim.idx)
        bim = bim.select('chrom', 'SNP', 'f2', 'pos', 'A1_bim', 'A2_bim')

        bim.export(f'{ldprune_dir}/subsets_5k/not_{pop}/not_{pop}.bim_new',
                   header=False,
                   delimiter=' ')
def main(args):
    """Load a MatrixTable, restrict it to an interval and export it as BGEN or VCF.

    The MatrixTable comes from calling ``args.load_mt_function`` of the
    module named by ``args.load_module`` with the comma-separated
    ``args.additional_args``. When ``args.gene_map_ht_path`` is given, a
    group file (one line per gene/annotation group with its variants) is
    also exported to ``args.group_output_file``.

    NOTE(review): the indentation of this function was reconstructed from a
    whitespace-collapsed source; confirm the branch nesting against the
    original file.

    :param args: parsed command-line arguments
    """
    hl.init(master=f'local[{args.n_threads}]',
            log=hl.utils.timestamp_path(os.path.join(tempfile.gettempdir(), 'extract_vcf'), suffix='.log'),
            default_reference=args.reference)

    # Make modules located at the filesystem root importable.
    sys.path.append('/')
    add_args = []
    if args.additional_args is not None:
        add_args = args.additional_args.split(',')
    load_module = importlib.import_module(args.load_module)
    mt = getattr(load_module, args.load_mt_function)(*add_args)

    if args.gene_map_ht_path is None:
        interval = [hl.parse_locus_interval(args.interval)]
    else:
        gene_ht = hl.read_table(args.gene_map_ht_path)
        if args.gene is not None:
            # Use the interval recorded for the requested gene.
            gene_ht = gene_ht.filter(gene_ht.gene_symbol == args.gene)
            interval = gene_ht.aggregate(hl.agg.take(gene_ht.interval, 1), _localize=False)
        else:
            interval = [hl.parse_locus_interval(args.interval)]
            gene_ht = hl.filter_intervals(gene_ht, interval)

        gene_ht = gene_ht.filter(hl.set(args.groups.split(',')).contains(gene_ht.annotation))
        gene_ht.select(group=gene_ht.gene_id + '_' + gene_ht.gene_symbol + '_' + gene_ht.annotation,
                       variant=hl.delimit(gene_ht.variants, '\t')
                       ).key_by().drop('start').export(args.group_output_file, header=False)
        # TODO: possible minor optimization: filter output VCF to only variants in `gene_ht.variants`

    if not args.no_adj:
        mt = mt.filter_entries(mt.adj)

    mt = hl.filter_intervals(mt, interval)

    if not args.input_bgen:
        mt = mt.select_entries('GT')
        # Drop rows without any non-reference genotype.
        mt = mt.filter_rows(hl.agg.count_where(mt.GT.is_non_ref()) > 0)
    mt = mt.annotate_rows(rsid=mt.locus.contig + ':' + hl.str(mt.locus.position) + '_' + mt.alleles[0] + '/' + mt.alleles[1])

    if args.callrate_filter:
        mt = mt.filter_rows(hl.agg.fraction(hl.is_defined(mt.GT)) >= args.callrate_filter)

    if args.export_bgen:
        if not args.input_bgen:
            # Haploid calls are rewritten as diploid calls that repeat the single allele.
            mt = mt.annotate_entries(GT=hl.if_else(mt.GT.is_haploid(), hl.call(mt.GT[0], mt.GT[0]), mt.GT))
            mt = gt_to_gp(mt)
            mt = impute_missing_gp(mt, mean_impute=args.mean_impute_missing)
        hl.export_bgen(mt, args.output_file, gp=mt.GP, varid=mt.rsid)
    else:
        mt = mt.annotate_entries(GT=hl.or_else(mt.GT, hl.call(0, 0)))
        # Note: no mean-imputation for VCF
        hl.export_vcf(mt, args.output_file)
def main():
    """Build the genes table from GENCODE, canonical transcript and HGNC inputs and write it to `--output`."""
    parser = argparse.ArgumentParser()
    for positional in ("gencode", "canonical_transcripts", "hgnc"):
        parser.add_argument(positional)
    parser.add_argument("--min-partitions", type=int, default=8)
    parser.add_argument("--output", required=True)
    args = parser.parse_args()

    # Gene models come from the GTF file; the GENCODE symbol is kept under its own name.
    genes = load_gencode_gene_models(args.gencode, min_partitions=args.min_partitions)
    genes = genes.transmute(gencode_gene_symbol=genes.gene_symbol)

    # Look up each gene's canonical transcript ID.
    canonical_transcripts = load_canonical_transcripts(
        args.canonical_transcripts, min_partitions=args.min_partitions
    )
    canonical_for_gene = canonical_transcripts[genes.gene_id]
    genes = genes.annotate(canonical_transcript_id=canonical_for_gene.transcript_id)

    # Keep only the canonical transcript and drop the full transcript list.
    matching_transcripts = genes.transcripts.filter(
        lambda transcript: transcript.transcript_id == genes.canonical_transcript_id
    )
    genes = genes.annotate(canonical_transcript=matching_transcripts.head())
    genes = genes.drop("transcripts")

    # Add the HGNC fields, then fall back to the GENCODE symbol where HGNC has none.
    hgnc = load_hgnc(args.hgnc)
    genes = genes.annotate(**hgnc[genes.gene_id])
    genes = genes.annotate(
        symbol_source=hl.cond(hl.is_defined(genes.symbol), "hgnc", hl.null(hl.tstr))
    )
    genes = genes.annotate(
        symbol=hl.or_else(genes.symbol, genes.gencode_gene_symbol),
        symbol_source=hl.or_else(genes.symbol_source, "gencode"),
    )

    # Every symbol that can be used to search for the gene, upper-cased.
    searchable_symbols = (
        hl.empty_array(hl.tstr)
        .append(genes.symbol)
        .extend(genes.previous_symbols)
        .extend(genes.alias_symbols)
        .append(genes.gencode_gene_symbol)
    )
    genes = genes.annotate(
        symbol_upper_case=genes.symbol.upper(),
        search_terms=hl.set(searchable_symbols.map(lambda s: s.upper())),
    )

    genes.describe()
    genes.write(args.output, overwrite=True)
def prepare_clinvar_variants(clinvar_path, reference_genome):
    """Read ClinVar variants for one reference genome, keep ACGT-only alleles and key by locus and alleles.

    :param clinvar_path: path of the ClinVar Hail table
    :param reference_genome: suffix of the `locus_*` / `alleles_*` fields to use
    :return: table keyed by `locus` and `alleles`
    """
    locus_field = f"locus_{reference_genome}"
    alleles_field = f"alleles_{reference_genome}"

    ds = hl.read_table(clinvar_path)

    has_locus = hl.is_defined(ds[locus_field])
    has_alleles = hl.is_defined(ds[alleles_field])
    ds = ds.filter(has_locus & has_alleles)

    ds = ds.select(locus=ds[locus_field], alleles=ds[alleles_field], **ds.variant)

    # Remove any variants with alleles other than ACGT
    allele_characters = hl.set(hl.delimit(ds.alleles, "").split(""))
    allowed_characters = hl.set(["A", "C", "G", "T", ""])
    ds = ds.filter(hl.len(allele_characters.difference(allowed_characters)) == 0)

    ds = ds.annotate(
        variant_id=variant_id(ds.locus, ds.alleles),
        chrom=normalized_contig(ds.locus.contig),
        pos=ds.locus.position,
        ref=ds.alleles[0],
        alt=ds.alleles[1],
    )

    return ds.key_by("locus", "alleles")
def import_hgnc(path):
    """Import an HGNC export, keyed by Ensembl gene ID, with symbol lists turned into sets.

    :param path: path of the delimited HGNC file; empty cells are read as missing
    :return: table keyed by `gene_id`
    """
    ds = hl.import_table(path, missing="")

    # Use "Ensembl gene ID", falling back to the Ensembl-supplied ID.
    ensembl_id = hl.or_else(ds["Ensembl gene ID"], ds["Ensembl ID(supplied by Ensembl)"])
    ds = ds.select(
        hgnc_id=ds["HGNC ID"],
        symbol=ds["Approved symbol"],
        name=ds["Approved name"],
        previous_symbols=ds["Previous symbols"],
        alias_symbols=ds["Alias symbols"],
        omim_id=ds["OMIM ID(supplied by OMIM)"],
        gene_id=ensembl_id,
    )

    ds = ds.filter(hl.is_defined(ds.gene_id))
    ds = ds.key_by("gene_id")

    def _symbol_set(symbols):
        # Comma-separated string -> set of whitespace-stripped symbols.
        return hl.set(symbols.split(",").map(lambda s: s.strip()))

    return ds.annotate(
        previous_symbols=_symbol_set(ds.previous_symbols),
        alias_symbols=_symbol_set(ds.alias_symbols),
    )
def test_import_bgen_variant_filtering_from_literals(self):
    """Filter a BGEN import by literal variants, with one partition and with 199 partitions."""
    bgen_file = resource('example.8bits.bgen')
    hl.index_bgen(bgen_file, contig_recoding={'01': '1'})

    alleles = ['A', 'G']

    def variant_at(position):
        return hl.Struct(locus=hl.Locus('1', position), alleles=alleles)

    desired_variants = [variant_at(p) for p in (2000, 2001, 4000, 10000, 100001)]

    # Position 10000 is listed twice: duplicated variant.
    expected_result = [
        variant_at(p) for p in (2000, 2001, 4000, 10000, 10000, 100001)
    ]

    # forcing seek to be called
    part_1 = hl.import_bgen(bgen_file,
                            ['GT'],
                            n_partitions=1,
                            variants=desired_variants)
    part_1_keys = part_1.rows().key_by('locus', 'alleles').select().collect()
    self.assertTrue(part_1_keys == expected_result)

    # forcing each variant to be its own partition for testing duplicates work properly
    part_199 = hl.import_bgen(bgen_file,
                              ['GT'],
                              n_partitions=199,
                              variants=desired_variants)
    part_199_keys = part_199.rows().key_by('locus', 'alleles').select().collect()
    self.assertTrue(part_199_keys == expected_result)

    everything = hl.import_bgen(bgen_file, ['GT'])
    self.assertEqual(everything.count(), (199, 500))

    wanted = hl.set(desired_variants)
    expected = everything.filter_rows(wanted.contains(everything.row_key))
    self.assertTrue(expected._same(part_1))
def test_import_bgen_variant_filtering_from_literals(self):
    """Check literal-variant filtering of a BGEN import for two partition counts and against the full import."""
    bgen_file = resource('example.8bits.bgen')
    hl.index_bgen(bgen_file, contig_recoding={'01': '1'})

    alleles = ['A', 'G']
    desired_positions = [2000, 2001, 4000, 10000, 100001]
    # 10000 appears twice in the expected rows: duplicated variant.
    expected_positions = [2000, 2001, 4000, 10000, 10000, 100001]

    desired_variants = [
        hl.Struct(locus=hl.Locus('1', position), alleles=alleles)
        for position in desired_positions
    ]
    expected_result = [
        hl.Struct(locus=hl.Locus('1', position), alleles=alleles)
        for position in expected_positions
    ]

    def collected_keys(mt):
        return mt.rows().key_by('locus', 'alleles').select().collect()

    # n_partitions=1: forcing seek to be called
    part_1 = hl.import_bgen(
        bgen_file, ['GT'], n_partitions=1, variants=desired_variants)
    self.assertTrue(collected_keys(part_1) == expected_result)

    # n_partitions=199: forcing each variant to be its own partition for
    # testing duplicates work properly
    part_199 = hl.import_bgen(
        bgen_file, ['GT'], n_partitions=199, variants=desired_variants)
    self.assertTrue(collected_keys(part_199) == expected_result)

    everything = hl.import_bgen(bgen_file, ['GT'])
    self.assertEqual(everything.count(), (199, 500))

    is_desired = hl.set(desired_variants).contains(everything.row_key)
    expected = everything.filter_rows(is_desired)
    self.assertTrue(expected._same(part_1))
def create_all_values():
    """Return a Hail struct expression with fields of many different types.

    Several fields are paired with a missing counterpart built with
    ``hl.null`` (e.g. ``aset``/``mset``, ``d``/``md``, ``c``/``mc``).
    """
    # Insertion order of this dict determines the field order of the struct.
    fields = {
        'f32': hl.float32(3.14),
        'i64': hl.int64(-9),
        'm': hl.null(hl.tfloat64),
        'astruct': hl.struct(a=hl.null(hl.tint32), b=5.5),
        'mstruct': hl.null(hl.tstruct(x=hl.tint32, y=hl.tstr)),
        'aset': hl.set(['foo', 'bar', 'baz']),
        'mset': hl.null(hl.tset(hl.tfloat64)),
        'd': hl.dict({
            hl.array(['a', 'b']): 0.5,
            hl.array(['x', hl.null(hl.tstr), 'z']): 0.3,
        }),
        'md': hl.null(hl.tdict(hl.tint32, hl.tstr)),
        'h38': hl.locus('chr22', 33878978, 'GRCh38'),
        'ml': hl.null(hl.tlocus('GRCh37')),
        'i': hl.interval(hl.locus('1', 999), hl.locus('1', 1001)),
        'c': hl.call(0, 1),
        'mc': hl.null(hl.tcall),
        't': hl.tuple([hl.call(1, 2, phased=True), 'foo', hl.null(hl.tstr)]),
        'mt': hl.null(hl.ttuple(hl.tlocus('GRCh37'), hl.tbool)),
    }
    return hl.struct(**fields)
def create_all_values_datasets():
    """Build a cached Table and MatrixTable annotated with values of many types.

    The same struct of values is attached, under a scope-specific field-name
    prefix, to the globals of both datasets and to the rows, columns and
    entries of the matrix table; the table rows receive the fields unprefixed.

    :return: tuple ``(all_values_table, all_values_matrix_table)``
    """
    # Insertion order of this dict determines the field order of the struct.
    value_fields = {
        'f32': hl.float32(3.14),
        'i64': hl.int64(-9),
        'm': hl.null(hl.tfloat64),
        'astruct': hl.struct(a=hl.null(hl.tint32), b=5.5),
        'mstruct': hl.null(hl.tstruct(x=hl.tint32, y=hl.tstr)),
        'aset': hl.set(['foo', 'bar', 'baz']),
        'mset': hl.null(hl.tset(hl.tfloat64)),
        'd': hl.dict({
            hl.array(['a', 'b']): 0.5,
            hl.array(['x', hl.null(hl.tstr), 'z']): 0.3,
        }),
        'md': hl.null(hl.tdict(hl.tint32, hl.tstr)),
        'h38': hl.locus('chr22', 33878978, 'GRCh38'),
        'ml': hl.null(hl.tlocus('GRCh37')),
        'i': hl.interval(hl.locus('1', 999), hl.locus('1', 1001)),
        'c': hl.call(0, 1),
        'mc': hl.null(hl.tcall),
        't': hl.tuple([hl.call(1, 2, phased=True), 'foo', hl.null(hl.tstr)]),
        'mt': hl.null(hl.ttuple(hl.tlocus('GRCh37'), hl.tbool)),
    }
    all_values = hl.struct(**value_fields)

    def with_prefix(struct_expr, pre):
        # Copy every field of `struct_expr` into a new struct, prepending
        # `pre` to each field name.
        renamed = {}
        for field_name in struct_expr:
            renamed[pre + field_name] = struct_expr[field_name]
        return hl.struct(**renamed)

    all_values_table = hl.utils.range_table(5, n_partitions=3)
    all_values_table = all_values_table.annotate_globals(
        **with_prefix(all_values, 'global_'))
    all_values_table = all_values_table.annotate(**all_values)
    all_values_table = all_values_table.cache()

    all_values_matrix_table = hl.utils.range_matrix_table(3, 2, n_partitions=2)
    all_values_matrix_table = all_values_matrix_table.annotate_globals(
        **with_prefix(all_values, 'global_'))
    all_values_matrix_table = all_values_matrix_table.annotate_rows(
        **with_prefix(all_values, 'row_'))
    all_values_matrix_table = all_values_matrix_table.annotate_cols(
        **with_prefix(all_values, 'col_'))
    all_values_matrix_table = all_values_matrix_table.annotate_entries(
        **with_prefix(all_values, 'entry_'))
    all_values_matrix_table = all_values_matrix_table.cache()

    return all_values_table, all_values_matrix_table
def ld_score_regression(weight_expr,
                        ld_score_expr,
                        chi_sq_exprs,
                        n_samples_exprs,
                        n_blocks=200,
                        two_step_threshold=30,
                        n_reference_panel_variants=None) -> Table:
    r"""Estimate SNP-heritability and level of confounding biases from
    GWAS summary statistics.

    Given a set or multiple sets of genome-wide association study (GWAS)
    summary statistics, :func:`.ld_score_regression` estimates the heritability
    of a trait or set of traits and the level of confounding biases present in
    the underlying studies by regressing chi-squared statistics on LD scores,
    leveraging the model:

    .. math::

        \mathrm{E}[\chi_j^2] = 1 + Na + \frac{Nh_g^2}{M}l_j

    *  :math:`\mathrm{E}[\chi_j^2]` is the expected chi-squared statistic
       for variant :math:`j` resulting from a test of association between
       variant :math:`j` and a trait.
    *  :math:`l_j = \sum_{k} r_{jk}^2` is the LD score of variant
       :math:`j`, calculated as the sum of squared correlation coefficients
       between variant :math:`j` and nearby variants. See :func:`ld_score`
       for further details.
    *  :math:`a` captures the contribution of confounding biases, such as
       cryptic relatedness and uncontrolled population structure, to the
       association test statistic.
    *  :math:`h_g^2` is the SNP-heritability, or the proportion of variation
       in the trait explained by the effects of variants included in the
       regression model above.
    *  :math:`M` is the number of variants used to estimate :math:`h_g^2`.
    *  :math:`N` is the number of samples in the underlying association study.

    For more details on the method implemented in this function, see:

    * `LD Score regression distinguishes confounding from polygenicity in genome-wide association studies (Bulik-Sullivan et al, 2015) <https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4495769/>`__

    Examples
    --------

    Run the method on a matrix table of summary statistics, where the rows
    are variants and the columns are different phenotypes:

    >>> mt_gwas = hl.read_matrix_table('data/ld_score_regression.sumstats.mt')
    >>> ht_results = hl.experimental.ld_score_regression(
    ...     weight_expr=mt_gwas['ld_score'],
    ...     ld_score_expr=mt_gwas['ld_score'],
    ...     chi_sq_exprs=mt_gwas['chi_squared'],
    ...     n_samples_exprs=mt_gwas['n'])

    Run the method on a table with summary statistics for a single
    phenotype:

    >>> ht_gwas = hl.read_table('data/ld_score_regression.sumstats.ht')
    >>> ht_results = hl.experimental.ld_score_regression(
    ...     weight_expr=ht_gwas['ld_score'],
    ...     ld_score_expr=ht_gwas['ld_score'],
    ...     chi_sq_exprs=ht_gwas['chi_squared_50_irnt'],
    ...     n_samples_exprs=ht_gwas['n_50_irnt'])

    Run the method on a table with summary statistics for multiple
    phenotypes:

    >>> ht_gwas = hl.read_table('data/ld_score_regression.sumstats.ht')
    >>> ht_results = hl.experimental.ld_score_regression(
    ...     weight_expr=ht_gwas['ld_score'],
    ...     ld_score_expr=ht_gwas['ld_score'],
    ...     chi_sq_exprs=[ht_gwas['chi_squared_50_irnt'],
    ...                        ht_gwas['chi_squared_20160']],
    ...     n_samples_exprs=[ht_gwas['n_50_irnt'],
    ...                      ht_gwas['n_20160']])

    Notes
    -----
    The ``exprs`` provided as arguments to :func:`.ld_score_regression`
    must all be from the same object, either a :class:`Table` or a
    :class:`MatrixTable`.

    **If the arguments originate from a table:**

    *  The table must be keyed by fields ``locus`` of type
       :class:`.tlocus` and ``alleles``, a :py:data:`.tarray` of
       :py:data:`.tstr` elements.
    *  ``weight_expr``, ``ld_score_expr``, ``chi_sq_exprs``, and
       ``n_samples_exprs`` must be row-indexed fields.
    *  The number of expressions passed to ``n_samples_exprs`` must be
       equal to one or the number of expressions passed to
       ``chi_sq_exprs``. If just one expression is passed to
       ``n_samples_exprs``, that sample size expression is assumed to
       apply to all sets of statistics passed to ``chi_sq_exprs``.
       Otherwise, the expressions passed to ``chi_sq_exprs`` and
       ``n_samples_exprs`` are matched by index.
    *  The ``phenotype`` field that keys the table returned by
       :func:`.ld_score_regression` will have generic :obj:`int` values
       ``0``, ``1``, etc. corresponding to the ``0th``, ``1st``, etc.
       expressions passed to the ``chi_sq_exprs`` argument.

    **If the arguments originate from a matrix table:**

    *  The dimensions of the matrix table must be variants
       (rows) by phenotypes (columns).
    *  The rows of the matrix table must be keyed by fields
       ``locus`` of type :class:`.tlocus` and ``alleles``,
       a :py:data:`.tarray` of :py:data:`.tstr` elements.
    *  The columns of the matrix table must be keyed by a field
       of type :py:data:`.tstr` that uniquely identifies phenotypes
       represented in the matrix table. The column key must be a single
       expression; compound keys are not accepted.
    *  ``weight_expr`` and ``ld_score_expr`` must be row-indexed
       fields.
    *  ``chi_sq_exprs`` must be a single entry-indexed field
       (not a list of fields).
    *  ``n_samples_exprs`` must be a single entry-indexed field
       (not a list of fields).
    *  The ``phenotype`` field that keys the table returned by
       :func:`.ld_score_regression` will have values corresponding to the
       column keys of the input matrix table.

    This function returns a :class:`Table` with one row per set of summary
    statistics passed to the ``chi_sq_exprs`` argument. The following
    row-indexed fields are included in the table:

    *  **phenotype** (:py:data:`.tstr`) -- The name of the phenotype. The
       returned table is keyed by this field. See the notes above for
       details on the possible values of this field.
    *  **mean_chi_sq** (:py:data:`.tfloat64`) -- The mean chi-squared
       test statistic for the given phenotype.
    *  **intercept** (`Struct`) -- Contains fields:

       -  **estimate** (:py:data:`.tfloat64`) -- A point estimate of the
          intercept :math:`1 + Na`.
       -  **standard_error**  (:py:data:`.tfloat64`) -- An estimate of
          the standard error of this point estimate.

    *  **snp_heritability** (`Struct`) -- Contains fields:

       -  **estimate** (:py:data:`.tfloat64`) -- A point estimate of the
          SNP-heritability :math:`h_g^2`.
       -  **standard_error** (:py:data:`.tfloat64`) -- An estimate of
          the standard error of this point estimate.

    Warning
    -------
    :func:`.ld_score_regression` considers only the rows for which both row
    fields ``weight_expr`` and ``ld_score_expr`` are defined. Rows with missing
    values in either field are removed prior to fitting the LD score
    regression model.

    Parameters
    ----------
    weight_expr : :class:`.Float64Expression`
                  Row-indexed expression for the LD scores used to derive
                  variant weights in the model.
    ld_score_expr : :class:`.Float64Expression`
                    Row-indexed expression for the LD scores used as covariates
                    in the model.
    chi_sq_exprs : :class:`.Float64Expression` or :obj:`list` of
                        :class:`.Float64Expression`
                        One or more row-indexed (if table) or entry-indexed
                        (if matrix table) expressions for chi-squared
                        statistics resulting from genome-wide association
                        studies.
    n_samples_exprs: :class:`.NumericExpression` or :obj:`list` of
                     :class:`.NumericExpression`
                     One or more row-indexed (if table) or entry-indexed
                     (if matrix table) expressions indicating the number of
                     samples used in the studies that generated the test
                     statistics supplied to ``chi_sq_exprs``.
    n_blocks : :obj:`int`
               The number of blocks used in the jackknife approach to
               estimating standard errors.
    two_step_threshold : :obj:`int`
                         Variants with chi-squared statistics greater than this
                         value are excluded in the first step of the two-step
                         procedure used to fit the model.
    n_reference_panel_variants : :obj:`int`, optional
                                 Number of variants used to estimate the
                                 SNP-heritability :math:`h_g^2`.

    Returns
    -------
    :class:`.Table`
        Table keyed by ``phenotype`` with intercept and heritability estimates
        for each phenotype passed to the function.

    Raises
    ------
    ValueError
        If the expressions originate from a matrix table and more than one
        ``chi_sq_exprs`` or ``n_samples_exprs`` expression is supplied, or the
        matrix table's column key is not a single field.
    AssertionError
        If the number of ``n_samples_exprs`` is neither one nor equal to the
        number of ``chi_sq_exprs``."""

    chi_sq_exprs = wrap_to_list(chi_sq_exprs)
    n_samples_exprs = wrap_to_list(n_samples_exprs)

    assert ((len(chi_sq_exprs) == len(n_samples_exprs)) or
            (len(n_samples_exprs) == 1))
    __k = 2  # number of covariates, including intercept

    ds = chi_sq_exprs[0]._indices.source

    analyze('ld_score_regression/weight_expr',
            weight_expr,
            ds._row_indices)
    analyze('ld_score_regression/ld_score_expr',
            ld_score_expr,
            ds._row_indices)

    # format input dataset
    if isinstance(ds, MatrixTable):
        if len(chi_sq_exprs) != 1:
            raise ValueError("""Only one chi_sq_expr allowed if originating from a matrix table.""")
        if len(n_samples_exprs) != 1:
            raise ValueError("""Only one n_samples_expr allowed if originating from a matrix table.""")

        col_key = list(ds.col_key)
        if len(col_key) != 1:
            raise ValueError("""Matrix table must be keyed by a single phenotype field.""")

        analyze('ld_score_regression/chi_squared_expr',
                chi_sq_exprs[0],
                ds._entry_indices)
        analyze('ld_score_regression/n_samples_expr',
                n_samples_exprs[0],
                ds._entry_indices)

        ds = ds._select_all(row_exprs={'__locus': ds.locus,
                                       '__alleles': ds.alleles,
                                       '__w_initial': weight_expr,
                                       '__w_initial_floor': hl.max(weight_expr,
                                                                   1.0),
                                       '__x': ld_score_expr,
                                       '__x_floor': hl.max(ld_score_expr,
                                                           1.0)},
                            row_key=['__locus', '__alleles'],
                            col_exprs={'__y_name': ds[col_key[0]]},
                            col_key=['__y_name'],
                            entry_exprs={'__y': chi_sq_exprs[0],
                                         '__n': n_samples_exprs[0]})
        ds = ds.annotate_entries(**{'__w': ds.__w_initial})

        ds = ds.filter_rows(hl.is_defined(ds.__locus) &
                            hl.is_defined(ds.__alleles) &
                            hl.is_defined(ds.__w_initial) &
                            hl.is_defined(ds.__x))

    else:
        assert isinstance(ds, Table)
        for y in chi_sq_exprs:
            analyze('ld_score_regression/chi_squared_expr', y, ds._row_indices)
        for n in n_samples_exprs:
            analyze('ld_score_regression/n_samples_expr', n, ds._row_indices)

        # A single sample-size expression applies to every chi-squared
        # expression. Broadcast it so the per-phenotype lookup `ns[i]` below
        # is valid for every i; without this, one n_samples expression with
        # several chi_sq expressions raised IndexError.
        if len(n_samples_exprs) == 1:
            n_samples_exprs = [n_samples_exprs[0]] * len(chi_sq_exprs)

        # per-phenotype field names: __y0, __y1, ... / __w0, ... / __n0, ...
        ys = ['__y{:}'.format(i) for i, _ in enumerate(chi_sq_exprs)]
        ws = ['__w{:}'.format(i) for i, _ in enumerate(chi_sq_exprs)]
        ns = ['__n{:}'.format(i) for i, _ in enumerate(n_samples_exprs)]

        ds = ds.select(**dict(**{'__locus': ds.locus,
                                 '__alleles': ds.alleles,
                                 '__w_initial': weight_expr,
                                 '__x': ld_score_expr},
                              **{y: chi_sq_exprs[i]
                                 for i, y in enumerate(ys)},
                              **{w: weight_expr for w in ws},
                              **{n: n_samples_exprs[i]
                                 for i, n in enumerate(ns)}))
        ds = ds.key_by(ds.__locus, ds.__alleles)

        # write the table to a temporary file and read it back
        table_tmp_file = new_temp_file()
        ds.write(table_tmp_file)
        ds = hl.read_table(table_tmp_file)

        # one table per phenotype, with the phenotype's index as `__y_name`
        hts = [ds.select(**{'__w_initial': ds.__w_initial,
                            '__w_initial_floor': hl.max(ds.__w_initial,
                                                        1.0),
                            '__x': ds.__x,
                            '__x_floor': hl.max(ds.__x, 1.0),
                            '__y_name': i,
                            '__y': ds[ys[i]],
                            '__w': ds[ws[i]],
                            '__n': hl.int(ds[ns[i]])})
               for i, y in enumerate(ys)]

        mts = [ht.to_matrix_table(row_key=['__locus',
                                           '__alleles'],
                                  col_key=['__y_name'],
                                  row_fields=['__w_initial',
                                              '__w_initial_floor',
                                              '__x',
                                              '__x_floor'])
               for ht in hts]

        # combine the single-column matrix tables into one
        ds = mts[0]
        for i in range(1, len(ys)):
            ds = ds.union_cols(mts[i])

        ds = ds.filter_rows(hl.is_defined(ds.__locus) &
                            hl.is_defined(ds.__alleles) &
                            hl.is_defined(ds.__w_initial) &
                            hl.is_defined(ds.__x))

    mt_tmp_file1 = new_temp_file()
    ds.write(mt_tmp_file1)
    mt = hl.read_matrix_table(mt_tmp_file1)

    # NOTE: a falsy value (None, but also 0) falls back to the number of rows
    # remaining after the missingness filter above.
    if not n_reference_panel_variants:
        M = mt.count_rows()
    else:
        M = n_reference_panel_variants

    # block variants for each phenotype
    mt = mt.annotate_entries(__in_step1=(hl.is_defined(mt.__y) &
                                         (mt.__y < two_step_threshold)),
                             __in_step2=hl.is_defined(mt.__y))

    mt = mt.annotate_cols(__col_idx=hl.int(hl.scan.count()),
                          __m_step1=hl.agg.count_where(mt.__in_step1),
                          __m_step2=hl.agg.count_where(mt.__in_step2))

    col_keys = list(mt.col_key)

    ht = mt.localize_entries(entries_array_field_name='__entries',
                             columns_array_field_name='__cols')

    # For every entry: `step1_idx` is a running (scan) count of step-1
    # entries in that column, `step1_separators` are the n_blocks + 1 block
    # boundaries floor(j * m_step1 / n_blocks), and the block index is the
    # number of boundaries <= step1_idx, minus one.
    ht = ht.annotate(__entries=hl.rbind(
        hl.scan.array_agg(
            lambda entry: hl.scan.count_where(entry.__in_step1),
            ht.__entries),
        lambda step1_indices: hl.map(
            lambda i: hl.rbind(
                hl.int(hl.or_else(step1_indices[i], 0)),
                ht.__cols[i].__m_step1,
                ht.__entries[i],
                lambda step1_idx, m_step1, entry: hl.rbind(
                    hl.map(
                        lambda j: hl.int(hl.floor(j * (m_step1 / n_blocks))),
                        hl.range(0, n_blocks + 1)),
                    lambda step1_separators: hl.rbind(
                        hl.set(step1_separators).contains(step1_idx),
                        hl.sum(
                            hl.map(
                                lambda s1: step1_idx >= s1,
                                step1_separators)) - 1,
                        lambda is_separator, step1_block: entry.annotate(
                            __step1_block=step1_block,
                            __step2_block=hl.cond(~entry.__in_step1 & is_separator,
                                                  step1_block - 1,
                                                  step1_block))))),
            hl.range(0, hl.len(ht.__entries)))))

    mt = ht._unlocalize_entries('__entries', '__cols', col_keys)

    mt_tmp_file2 = new_temp_file()
    mt.write(mt_tmp_file2)
    mt = hl.read_matrix_table(mt_tmp_file2)

    # initial coefficient estimates
    mt = mt.annotate_cols(__initial_betas=[
        1.0, (hl.agg.mean(mt.__y) - 1.0) / hl.agg.mean(mt.__x)])
    mt = mt.annotate_cols(__step1_betas=mt.__initial_betas,
                          __step2_betas=mt.__initial_betas)

    # step 1 iteratively reweighted least squares
    for i in range(3):
        mt = mt.annotate_entries(__w=hl.cond(
            mt.__in_step1,
            1.0/(mt.__w_initial_floor * 2.0 *
                 (mt.__step1_betas[0] +
                  mt.__step1_betas[1] * mt.__x_floor)**2),
            0.0))
        mt = mt.annotate_cols(__step1_betas=hl.agg.filter(
            mt.__in_step1,
            hl.agg.linreg(y=mt.__y,
                          x=[1.0, mt.__x],
                          weight=mt.__w).beta))
        # clamp the implied heritability to [0, 1] and convert it back
        # into a slope
        mt = mt.annotate_cols(__step1_h2=hl.max(hl.min(
            mt.__step1_betas[1] * M / hl.agg.mean(mt.__n), 1.0), 0.0))
        mt = mt.annotate_cols(__step1_betas=[
            mt.__step1_betas[0],
            mt.__step1_h2 * hl.agg.mean(mt.__n) / M])

    # step 1 block jackknife
    mt = mt.annotate_cols(__step1_block_betas=[
        hl.agg.filter((mt.__step1_block != i) & mt.__in_step1,
                      hl.agg.linreg(y=mt.__y,
                                    x=[1.0, mt.__x],
                                    weight=mt.__w).beta)
        for i in range(n_blocks)])

    mt = mt.annotate_cols(__step1_block_betas_bias_corrected=hl.map(
        lambda x: n_blocks * mt.__step1_betas - (n_blocks - 1) * x,
        mt.__step1_block_betas))

    mt = mt.annotate_cols(
        __step1_jackknife_mean=hl.map(
            lambda i: hl.mean(
                hl.map(lambda x: x[i],
                       mt.__step1_block_betas_bias_corrected)),
            hl.range(0, __k)),
        __step1_jackknife_variance=hl.map(
            lambda i: (hl.sum(
                hl.map(lambda x: x[i]**2,
                       mt.__step1_block_betas_bias_corrected)) -
                       hl.sum(
                hl.map(lambda x: x[i],
                       mt.__step1_block_betas_bias_corrected))**2 /
                       n_blocks) /
            (n_blocks - 1) / n_blocks,
            hl.range(0, __k)))

    # step 2 iteratively reweighted least squares
    for i in range(3):
        mt = mt.annotate_entries(__w=hl.cond(
            mt.__in_step2,
            1.0/(mt.__w_initial_floor *
                 2.0 * (mt.__step2_betas[0] +
                        mt.__step2_betas[1] * mt.__x_floor)**2),
            0.0))
        # the intercept is held fixed at its step 1 value; only the slope
        # is re-estimated
        mt = mt.annotate_cols(__step2_betas=[
            mt.__step1_betas[0],
            hl.agg.filter(mt.__in_step2,
                          hl.agg.linreg(y=mt.__y - mt.__step1_betas[0],
                                        x=[mt.__x],
                                        weight=mt.__w).beta[0])])
        mt = mt.annotate_cols(__step2_h2=hl.max(hl.min(
            mt.__step2_betas[1] * M/hl.agg.mean(mt.__n), 1.0), 0.0))
        mt = mt.annotate_cols(__step2_betas=[
            mt.__step1_betas[0],
            mt.__step2_h2 * hl.agg.mean(mt.__n)/M])

    # step 2 block jackknife
    mt = mt.annotate_cols(__step2_block_betas=[
        hl.agg.filter((mt.__step2_block != i) & mt.__in_step2,
                      hl.agg.linreg(y=mt.__y - mt.__step1_betas[0],
                                    x=[mt.__x],
                                    weight=mt.__w).beta[0])
        for i in range(n_blocks)])

    mt = mt.annotate_cols(__step2_block_betas_bias_corrected=hl.map(
        lambda x: n_blocks * mt.__step2_betas[1] - (n_blocks - 1) * x,
        mt.__step2_block_betas))

    mt = mt.annotate_cols(
        __step2_jackknife_mean=hl.mean(
            mt.__step2_block_betas_bias_corrected),
        __step2_jackknife_variance=(
            hl.sum(mt.__step2_block_betas_bias_corrected**2) -
            hl.sum(mt.__step2_block_betas_bias_corrected)**2 /
            n_blocks) / (n_blocks - 1) / n_blocks)

    # combine step 1 and step 2 block jackknifes
    mt = mt.annotate_entries(
        __step2_initial_w=1.0/(mt.__w_initial_floor *
                               2.0 * (mt.__initial_betas[0] +
                                      mt.__initial_betas[1] *
                                      mt.__x_floor)**2))

    mt = mt.annotate_cols(
        __final_betas=[
            mt.__step1_betas[0],
            mt.__step2_betas[1]],
        __c=(hl.agg.sum(mt.__step2_initial_w * mt.__x) /
             hl.agg.sum(mt.__step2_initial_w * mt.__x**2)))

    mt = mt.annotate_cols(__final_block_betas=hl.map(
        lambda i: (mt.__step2_block_betas[i] - mt.__c *
                   (mt.__step1_block_betas[i][0] - mt.__final_betas[0])),
        hl.range(0, n_blocks)))

    mt = mt.annotate_cols(
        __final_block_betas_bias_corrected=(n_blocks * mt.__final_betas[1] -
                                            (n_blocks - 1) *
                                            mt.__final_block_betas))

    mt = mt.annotate_cols(
        __final_jackknife_mean=[
            mt.__step1_jackknife_mean[0],
            hl.mean(mt.__final_block_betas_bias_corrected)],
        __final_jackknife_variance=[
            mt.__step1_jackknife_variance[0],
            (hl.sum(mt.__final_block_betas_bias_corrected**2) -
             hl.sum(mt.__final_block_betas_bias_corrected)**2 /
             n_blocks) / (n_blocks - 1) / n_blocks])

    # convert coefficient to heritability estimate
    mt = mt.annotate_cols(
        phenotype=mt.__y_name,
        mean_chi_sq=hl.agg.mean(mt.__y),
        intercept=hl.struct(
            estimate=mt.__final_betas[0],
            standard_error=hl.sqrt(mt.__final_jackknife_variance[0])),
        snp_heritability=hl.struct(
            estimate=(M/hl.agg.mean(mt.__n)) * mt.__final_betas[1],
            standard_error=hl.sqrt((M/hl.agg.mean(mt.__n))**2 *
                                   mt.__final_jackknife_variance[1])))

    # format and return results
    ht = mt.cols()
    ht = ht.key_by(ht.phenotype)
    ht = ht.select(ht.mean_chi_sq,
                   ht.intercept,
                   ht.snp_heritability)

    ht_tmp_file = new_temp_file()
    ht.write(ht_tmp_file)
    ht = hl.read_table(ht_tmp_file)

    return ht
def test_mendel_errors(self):
    """Check schemas, row counts and selected values of ``hl.mendel_errors``.

    Runs ``hl.mendel_errors`` on ``mendel.vcf`` with ``mendel.fam`` and
    verifies the key/row types and contents of the four returned tables,
    then checks that a pedigree with missing sex gives the same per-sample
    results for ``Dtr1``.
    """
    mt = hl.import_vcf(resource('mendel.vcf'))
    ped = hl.Pedigree.read(resource('mendel.fam'))

    men, fam, ind, var = hl.mendel_errors(mt['GT'], ped)

    # --- schemas of the four result tables ---
    expected_men_key = hl.tstruct(locus=mt.locus.dtype,
                                  alleles=hl.tarray(hl.tstr),
                                  s=hl.tstr)
    self.assertEqual(men.key.dtype, expected_men_key)

    expected_men_row = hl.tstruct(locus=mt.locus.dtype,
                                  alleles=hl.tarray(hl.tstr),
                                  s=hl.tstr,
                                  fam_id=hl.tstr,
                                  mendel_code=hl.tint)
    self.assertEqual(men.row.dtype, expected_men_row)

    expected_fam_key = hl.tstruct(pat_id=hl.tstr, mat_id=hl.tstr)
    self.assertEqual(fam.key.dtype, expected_fam_key)

    expected_fam_row = hl.tstruct(pat_id=hl.tstr,
                                  mat_id=hl.tstr,
                                  fam_id=hl.tstr,
                                  children=hl.tint,
                                  errors=hl.tint64,
                                  snp_errors=hl.tint64)
    self.assertEqual(fam.row.dtype, expected_fam_row)

    expected_ind_key = hl.tstruct(s=hl.tstr)
    self.assertEqual(ind.key.dtype, expected_ind_key)

    expected_ind_row = hl.tstruct(s=hl.tstr,
                                  fam_id=hl.tstr,
                                  errors=hl.tint64,
                                  snp_errors=hl.tint64)
    self.assertEqual(ind.row.dtype, expected_ind_row)

    expected_var_key = hl.tstruct(locus=mt.locus.dtype,
                                  alleles=hl.tarray(hl.tstr))
    self.assertEqual(var.key.dtype, expected_var_key)

    expected_var_row = hl.tstruct(locus=mt.locus.dtype,
                                  alleles=hl.tarray(hl.tstr),
                                  errors=hl.tint64)
    self.assertEqual(var.row.dtype, expected_var_row)

    # --- row counts ---
    self.assertEqual(men.count(), 41)
    self.assertEqual(fam.count(), 2)
    self.assertEqual(ind.count(), 7)
    self.assertEqual(var.count(), mt.count_rows())

    # --- per-family and per-individual error tallies ---
    Struct = hl.utils.Struct

    fam_rows = fam.select('children', 'errors', 'snp_errors').collect()
    expected_fam_rows = {
        Struct(pat_id='Dad1', mat_id='Mom1', children=2, errors=41, snp_errors=39),
        Struct(pat_id='Dad2', mat_id='Mom2', children=1, errors=0, snp_errors=0)
    }
    self.assertEqual(set(fam_rows), expected_fam_rows)

    ind_rows = ind.select('errors', 'snp_errors').collect()
    expected_ind_rows = {
        Struct(s='Son1', errors=23, snp_errors=22),
        Struct(s='Dtr1', errors=18, snp_errors=17),
        Struct(s='Dad1', errors=19, snp_errors=18),
        Struct(s='Mom1', errors=22, snp_errors=21),
        Struct(s='Dad2', errors=0, snp_errors=0),
        Struct(s='Mom2', errors=0, snp_errors=0),
        Struct(s='Son2', errors=0, snp_errors=0)
    }
    self.assertEqual(set(ind_rows), expected_ind_rows)

    # --- per-variant error counts for a handful of variants ---
    to_keep = hl.set([
        (hl.Locus("1", 1), ['C', 'CT']),
        (hl.Locus("1", 2), ['C', 'T']),
        (hl.Locus("X", 1), ['C', 'T']),
        (hl.Locus("X", 3), ['C', 'T']),
        (hl.Locus("Y", 1), ['C', 'T']),
        (hl.Locus("Y", 3), ['C', 'T'])
    ])
    kept = var.filter(to_keep.contains((var.locus, var.alleles)))
    kept = kept.order_by('locus')
    kept_rows = kept.select('locus', 'alleles', 'errors').collect()
    expected_var_rows = [
        Struct(locus=hl.Locus("1", 1), alleles=['C', 'CT'], errors=2),
        Struct(locus=hl.Locus("1", 2), alleles=['C', 'T'], errors=1),
        Struct(locus=hl.Locus("X", 1), alleles=['C', 'T'], errors=2),
        Struct(locus=hl.Locus("X", 3), alleles=['C', 'T'], errors=1),
        Struct(locus=hl.Locus("Y", 1), alleles=['C', 'T'], errors=1),
        Struct(locus=hl.Locus("Y", 3), alleles=['C', 'T'], errors=1),
    ]
    self.assertEqual(kept_rows, expected_var_rows)

    # --- pedigree with missing sex: results for Dtr1 must be unchanged ---
    ped2 = hl.Pedigree.read(resource('mendelWithMissingSex.fam'))
    men2, _, _, _ = hl.mendel_errors(mt['GT'], ped2)

    dtr1_with_missing_sex = men2.filter(men2.s == 'Dtr1')
    dtr1_reference = men.filter(men.s == 'Dtr1')
    self.assertTrue(dtr1_with_missing_sex._same(dtr1_reference))
def merge_alleles(alleles) -> ArrayExpression:
    """Flatten ``alleles``, de-duplicate through a set, and return an array.

    :param alleles: nested collection expression accepted by ``hl.flatten``
    :return: array expression built from the set of flattened elements
    """
    flattened = hl.flatten(alleles)
    distinct = hl.set(flattened)
    return hl.array(distinct)