def reference_genome(self):
    """Reference genome.

    The value stored in ``self._rg`` is returned when it is set; otherwise
    ``hl.default_reference()`` is looked up once, stored in ``self._rg``
    and returned.

    Returns
    -------
    :class:`.ReferenceGenome`
        Reference genome.
    """
    cached = self._rg
    if cached is not None:
        return cached
    # Resolve lazily so the default reference is only queried when needed.
    self._rg = hl.default_reference()
    return self._rg
def values(self):
    """Return a list of ``(hail_type, python_value)`` pairs.

    Each tuple pairs a Hail type with one sample Python value written for it.
    """
    return [
        (hl.tbool, True),
        (hl.tint32, 0),
        (hl.tint64, 0),
        (hl.tfloat32, 0.5),
        (hl.tfloat64, 0.5),
        (hl.tstr, "foo"),
        (hl.tstruct(x=hl.tint32), hl.Struct(x=0)),
        (hl.tarray(hl.tint32), [0, 1, 4]),
        (hl.tset(hl.tint32), {0, 1, 4}),
        (hl.tdict(hl.tstr, hl.tint32), {"a": 0, "b": 1, "c": 4}),
        (hl.tinterval(hl.tint32), hl.Interval(0, 1, True, False)),
        (hl.tlocus(hl.default_reference()), hl.Locus("1", 1)),
        (hl.tcall, hl.Call([0, 1])),
    ]
def vep_or_lookup_vep(ht, reference_vep_ht=None, reference=None, vep_config=None):
    """
    Annotate a table with VEP, reusing precomputed annotations where possible.

    Rows of `ht` whose key is found in the reference database take their `vep`
    annotation from it; the remaining rows are passed to `hl.vep`. The two
    sets of rows are then unioned.

    :param ht: Input Table
    :param reference_vep_ht: A reference database with VEP annotations (must be in top-level `vep`)
    :param reference: Reference genome name used to pick a reference database and a default
        VEP config (if None, the name of hl.default_reference() is used)
    :param vep_config: vep_config to pass to hl.vep (if None, `vep_config_path(reference)` is used)
    :return: VEPped Table
    :raises ValueError: if `reference_vep_ht` is not given and `reference` is not GRCh37 or GRCh38
    """
    reference = hl.default_reference().name if reference is None else reference

    if reference_vep_ht is None:
        supported = ("GRCh37", "GRCh38")
        if reference not in supported:
            raise ValueError(
                f'vep_or_lookup_vep got {reference}. Expected one of {", ".join(supported)}'
            )
        reference_vep_ht = hl.read_table(vep_context_ht_path(reference))

    # Look up existing annotations by key, then split on whether one was found.
    annotated = ht.annotate(vep=reference_vep_ht[ht.key].vep)
    found_ht = annotated.filter(hl.is_defined(annotated.vep))
    missing_ht = annotated.filter(hl.is_missing(annotated.vep))

    config = vep_config_path(reference) if vep_config is None else vep_config
    # Only the rows without a looked-up annotation are run through VEP.
    return found_ht.union(hl.vep(missing_ht, config))
def get_r_within_gene(bm: BlockMatrix,
                      ld_index: hl.Table,
                      gene: str,
                      vep_ht: hl.Table = None,
                      reference_genome: str = None):
    """
    Gets LD information (`r`) for all pairs of variants within `gene`.

    Warning: this returns a table quadratic in number of variants. Exercise caution with large genes.

    :param bm: Input Block Matrix
    :param ld_index: Corresponding index table
    :param gene: Gene symbol as string
    :param vep_ht: Table with VEP annotations (if None, uses `public_release('exomes').ht()`)
    :param reference_genome: Reference genome to pass to get_gene_intervals for fast filtering to gene
        (if None, the name of hl.default_reference() is used)
    :return: Table with pairs of variants
    """
    if vep_ht is None:
        vep_ht = public_release('exomes').ht()
    if reference_genome is None:
        reference_genome = hl.default_reference().name

    # Coarse filter: restrict the index to the gene's intervals first.
    gene_intervals = hl.experimental.get_gene_intervals(
        gene_symbols=[gene], reference_genome=reference_genome)
    in_gene = hl.filter_intervals(ld_index, gene_intervals)

    # Fine filter: keep variants with a transcript consequence on this gene symbol.
    in_gene = in_gene.annotate(vep=vep_ht[in_gene.key].vep)
    in_gene = in_gene.filter(
        hl.any(lambda tc: tc.gene_symbol == gene,
               in_gene.vep.transcript_consequences))

    kept_indices = in_gene.idx.collect()
    pairs_ht = bm.filter(kept_indices, kept_indices).entries()

    # Re-key the filtered index by a fresh index so the entry coordinates
    # `i` / `j` of the filtered matrix can be joined against it.
    reindexed = in_gene.add_index('new_idx').key_by('new_idx')
    return pairs_ht.transmute(r=pairs_ht.entry,
                              i_variant=reindexed[pairs_ht.i],
                              j_variant=reindexed[pairs_ht.j])
def get_gene_intervals(gene_symbols=None, gene_ids=None, transcript_ids=None,
                       verbose=True, reference_genome=None, gtf_file=None):
    """Get intervals of genes or transcripts.

    Get the boundaries of genes or transcripts from a GTF file, for quick filtering of a Table or MatrixTable.

    On Google Cloud platform:
    Gencode v19 (GRCh37) GTF available at: gs://hail-common/references/gencode/gencode.v19.annotation.gtf.bgz
    Gencode v29 (GRCh38) GTF available at: gs://hail-common/references/gencode/gencode.v29.annotation.gtf.bgz

    Example
    -------
    >>> hl.filter_intervals(ht, get_gene_intervals(gene_symbols=['PCSK9'], reference_genome='GRCh37'))  # doctest: +SKIP

    Parameters
    ----------
    gene_symbols : :obj:`list` of :obj:`str`, optional
       Gene symbols (e.g. PCSK9).
    gene_ids : :obj:`list` of :obj:`str`, optional
       Gene IDs (e.g. ENSG00000223972).
    transcript_ids : :obj:`list` of :obj:`str`, optional
       Transcript IDs (e.g. ENST00000456328).
    verbose : :obj:`bool`
       If ``True``, print which genes and transcripts were matched in the GTF file.
    reference_genome : :obj:`str` or :class:`.ReferenceGenome`, optional
       Reference genome to use (passed along to import_gtf). Either a name
       or an object exposing ``.name`` is accepted; if ``None``, the default
       reference is used.
    gtf_file : :obj:`str`
       GTF file to load. If none is provided, but `reference_genome` is one of
       `GRCh37` or `GRCh38`, a default will be used (on Google Cloud Platform).

    Returns
    -------
    :obj:`list` of :class:`.Interval`

    Raises
    ------
    ValueError
        If no GTF file is given and none is known for the reference genome,
        or if none of `gene_symbols`, `gene_ids`, `transcript_ids` contains
        at least one value.
    """
    GTFS = {
        'GRCh37': 'gs://hail-common/references/gencode/gencode.v19.annotation.gtf.bgz',
        'GRCh38': 'gs://hail-common/references/gencode/gencode.v29.annotation.gtf.bgz',
    }
    # Normalise to the reference genome *name*: accept a plain string as
    # documented instead of unconditionally dereferencing `.name`.
    if reference_genome is None:
        reference_genome = hl.default_reference().name
    elif not isinstance(reference_genome, str):
        reference_genome = reference_genome.name

    if gtf_file is None:
        gtf_file = GTFS.get(reference_genome)
        if gtf_file is None:
            raise ValueError(
                'get_gene_intervals requires a GTF file, or the reference genome be one of GRCh37 or GRCh38 (when on Google Cloud Platform)'
            )
    # Empty selectors are rejected here as well: they would otherwise leave
    # `criteria` empty and make `functools.reduce` fail after the GTF import.
    if not (gene_symbols or gene_ids or transcript_ids):
        raise ValueError(
            'get_gene_intervals requires at least one of gene_symbols, gene_ids, or transcript_ids'
        )

    ht = hl.experimental.import_gtf(gtf_file,
                                    reference_genome=reference_genome,
                                    skip_invalid_contigs=True,
                                    min_partitions=12)
    # Keep only the part of each ID before the first '.' (e.g. a version suffix).
    ht = ht.annotate(gene_id=ht.gene_id.split('\\.')[0],
                     transcript_id=ht.transcript_id.split('\\.')[0])

    criteria = []
    if gene_symbols:
        criteria.append(
            hl.any(lambda y: (ht.feature == 'gene') & (ht.gene_name == y),
                   gene_symbols))
    if gene_ids:
        criteria.append(
            hl.any(lambda y: (ht.feature == 'gene') & (ht.gene_id == y.split('\\.')[0]),
                   gene_ids))
    if transcript_ids:
        criteria.append(
            hl.any(lambda y: (ht.feature == 'transcript') & (ht.transcript_id == y.split('\\.')[0]),
                   transcript_ids))

    # A row is kept when it satisfies any of the requested selectors.
    ht = ht.filter(functools.reduce(operator.ior, criteria))
    gene_info = ht.aggregate(
        hl.agg.collect((ht.feature, ht.gene_name, ht.gene_id, ht.transcript_id, ht.interval)))

    if verbose:
        matches = "\n".join(
            f'{x[0]}: {x[1]} ({x[2] if x[0] == "gene" else x[3]})'
            for x in gene_info)
        info(f'get_gene_intervals found {len(gene_info)} entries:\n' + matches)

    # The interval is the last element of each collected tuple.
    return [entry[-1] for entry in gene_info]
def test_constructor(self):
    """A parsed locus equals directly constructed ones and uses the default reference."""
    parsed = Locus.parse('1:100')
    # The contig is given once as a string and once as an integer.
    for contig in ('1', 1):
        self.assertEqual(parsed, Locus(contig, 100))
    self.assertEqual(parsed.reference_genome, hl.default_reference())
def vep_or_lookup_vep(ht, reference_vep_ht=None, reference=None, vep_config_path=None, vep_version=None):
    """
    VEP a table, or lookup variants in a reference database

    .. warning::
        If `reference_vep_ht` is supplied, no check is performed to confirm `reference_vep_ht` was
        generated with the same version of VEP / VEP configuration as the VEP referenced in `vep_config_path`.

    :param ht: Input Table
    :param reference_vep_ht: A reference database with VEP annotations (must be in top-level `vep`)
    :param reference: If reference_vep_ht is not specified, find a suitable one in reference (if None, grabs from hl.default_reference)
    :param vep_config_path: vep_config to pass to hl.vep (if None, `VEP_CONFIG_PATH` is used)
    :param vep_version: Version of VEPed context Table to use (if None, the default `vep_context` resource will be used)
    :return: VEPed Table
    :raises ValueError: if `reference_vep_ht` is not given and `reference` is not in `POSSIBLE_REFS`
    """
    reference = hl.default_reference().name if reference is None else reference
    vep_config_path = VEP_CONFIG_PATH if vep_config_path is None else vep_config_path

    vep_help = get_vep_help(vep_config_path)
    with hl.hadoop_open(vep_config_path) as config_handle:
        vep_config = config_handle.read()

    if reference_vep_ht is None:
        if reference not in POSSIBLE_REFS:
            raise ValueError(
                f'vep_or_lookup_vep got {reference}. Expected one of {", ".join(POSSIBLE_REFS)}'
            )

        vep_context = get_vep_context(reference)
        if vep_version is None:
            vep_version = vep_context.default_version

        if vep_version not in vep_context.versions:
            # No precomputed annotations to look up: VEP every row directly.
            logger.warning(
                f"No VEPed context Table available for genome build {reference} and VEP version {vep_version}, "
                f"all variants will be VEPed using the following VEP:\n{vep_help}"
            )
            return hl.vep(ht, vep_config_path)

        logger.info(
            f"Using VEPed context Table from genome build {reference} and VEP version {vep_version}"
        )
        reference_vep_ht = vep_context.versions[vep_version].ht()
        vep_context_help = hl.eval(reference_vep_ht.vep_help)
        vep_context_config = hl.eval(reference_vep_ht.vep_config)

        # NOTE(review): these consistency checks are `assert` statements and
        # are therefore skipped when Python runs with -O.
        assert vep_help == vep_context_help, (
            "The VEP context HT version does not match the version referenced in the VEP config file."
            f"\nVEP context:\n{vep_context_help}\n\n VEP config:\n{vep_help}")
        assert vep_config == vep_context_config, (
            f"The VEP context HT configuration does not match the configuration in {vep_config_path}."
            f"\nVEP context:\n{vep_context_config}\n\n Current config:\n{vep_config}"
        )

    # Look up existing annotations by key, then VEP only the rows left without one.
    annotated = ht.annotate(vep=reference_vep_ht[ht.key].vep)
    found_ht = annotated.filter(hl.is_defined(annotated.vep))
    missing_ht = annotated.filter(hl.is_missing(annotated.vep))
    return found_ht.union(hl.vep(missing_ht, vep_config_path))
def get_gene_intervals(gene_symbols=None, gene_ids=None, transcript_ids=None,
                       verbose=True, reference_genome=None, gtf_file=None):
    """Get intervals of genes or transcripts.

    Get the boundaries of genes or transcripts from a GTF file, for quick filtering of a Table or MatrixTable.

    On Google Cloud platform:
    Gencode v19 (GRCh37) GTF available at: gs://hail-common/references/gencode/gencode.v19.annotation.gtf.bgz
    Gencode v29 (GRCh38) GTF available at: gs://hail-common/references/gencode/gencode.v29.annotation.gtf.bgz

    Example
    -------
    >>> hl.filter_intervals(ht, get_gene_intervals(gene_symbols=['PCSK9'], reference_genome='GRCh37'))  # doctest: +SKIP

    Parameters
    ----------
    gene_symbols : :obj:`list` of :obj:`str`, optional
       Gene symbols (e.g. PCSK9).
    gene_ids : :obj:`list` of :obj:`str`, optional
       Gene IDs (e.g. ENSG00000223972).
    transcript_ids : :obj:`list` of :obj:`str`, optional
       Transcript IDs (e.g. ENST00000456328).
    verbose : :obj:`bool`
       If ``True``, print which genes and transcripts were matched in the GTF file.
    reference_genome : :obj:`str` or :class:`.ReferenceGenome`, optional
       Reference genome to use (passed along to import_gtf). If ``None``,
       the name of the default reference is used.
    gtf_file : :obj:`str`
       GTF file to load. If none is provided, but `reference_genome` is one of
       `GRCh37` or `GRCh38`, a default will be used (on Google Cloud Platform).

    Returns
    -------
    :obj:`list` of :class:`.Interval`

    Raises
    ------
    ValueError
        If no GTF file is given and none is known for the reference genome,
        or if none of `gene_symbols`, `gene_ids`, `transcript_ids` contains
        at least one value.
    """
    GTFS = {
        'GRCh37': 'gs://hail-common/references/gencode/gencode.v19.annotation.gtf.bgz',
        'GRCh38': 'gs://hail-common/references/gencode/gencode.v29.annotation.gtf.bgz',
    }
    if reference_genome is None:
        reference_genome = hl.default_reference().name

    if gtf_file is None:
        # GTFS is keyed by name, so look up by name even when a reference
        # genome object (anything exposing `.name`) was passed in.
        if isinstance(reference_genome, str):
            rg_name = reference_genome
        else:
            rg_name = reference_genome.name
        gtf_file = GTFS.get(rg_name)
        if gtf_file is None:
            raise ValueError('get_gene_intervals requires a GTF file, or the reference genome be one of GRCh37 or GRCh38 (when on Google Cloud Platform)')
    # Empty selectors are rejected here as well: they would otherwise leave
    # `criteria` empty and make `functools.reduce` fail after the GTF import.
    if not (gene_symbols or gene_ids or transcript_ids):
        raise ValueError('get_gene_intervals requires at least one of gene_symbols, gene_ids, or transcript_ids')

    # `reference_genome` is forwarded exactly as resolved above.
    ht = hl.experimental.import_gtf(gtf_file,
                                    reference_genome=reference_genome,
                                    skip_invalid_contigs=True,
                                    min_partitions=12)
    # Keep only the part of each ID before the first '.' (e.g. a version suffix).
    ht = ht.annotate(gene_id=ht.gene_id.split('\\.')[0],
                     transcript_id=ht.transcript_id.split('\\.')[0])

    criteria = []
    if gene_symbols:
        criteria.append(hl.any(lambda y: (ht.feature == 'gene') & (ht.gene_name == y), gene_symbols))
    if gene_ids:
        criteria.append(hl.any(lambda y: (ht.feature == 'gene') & (ht.gene_id == y.split('\\.')[0]), gene_ids))
    if transcript_ids:
        criteria.append(hl.any(lambda y: (ht.feature == 'transcript') & (ht.transcript_id == y.split('\\.')[0]), transcript_ids))

    # A row is kept when it satisfies any of the requested selectors.
    ht = ht.filter(functools.reduce(operator.ior, criteria))
    gene_info = ht.aggregate(hl.agg.collect((ht.feature, ht.gene_name, ht.gene_id, ht.transcript_id, ht.interval)))

    if verbose:
        matches = "\n".join(
            f'{x[0]}: {x[1]} ({x[2] if x[0] == "gene" else x[3]})'
            for x in gene_info)
        info(f'get_gene_intervals found {len(gene_info)} entries:\n' + matches)

    # The interval is the last element of each collected tuple.
    return [entry[-1] for entry in gene_info]
def test_classes(self):
    """Exercise Locus construction and the Call accessors/predicates for several ploidies."""
    parsed = Locus.parse('1:100')
    # The contig is given once as a string and once as an integer.
    for contig in ('1', 1):
        self.assertEqual(parsed, Locus(contig, 100))
    self.assertEqual(parsed.reference_genome, hl.default_reference())

    ploidy_predicates = ('is_haploid', 'is_diploid')
    genotype_predicates = ('is_hom_ref', 'is_het', 'is_hom_var',
                           'is_non_ref', 'is_het_non_ref', 'is_het_ref')

    def expect_flag(value, expected):
        # Preserve assertTrue / assertFalse semantics (truthiness, not equality).
        check = self.assertTrue if expected else self.assertFalse
        check(value)

    def check_call(call, alleles, ploidy, phased, n_alt, one_hot, true_predicates, gt_index=None):
        """Run the fixed sequence of Call assertions against the expected values.

        `true_predicates` names the predicate methods expected to return a
        truthy value; every other listed predicate is expected to be falsy.
        """
        self.assertEqual(call.alleles, alleles)
        self.assertEqual(call.ploidy, ploidy)
        expect_flag(call.phased, phased)
        for predicate in ploidy_predicates:
            expect_flag(getattr(call, predicate)(), predicate in true_predicates)
        self.assertEqual(call.n_alt_alleles(), n_alt)
        # The number of alleles requested is the length of the expected vector.
        self.assertTrue(call.one_hot_alleles(len(one_hot)) == one_hot)
        for predicate in genotype_predicates:
            expect_flag(getattr(call, predicate)(), predicate in true_predicates)
        if gt_index is not None:
            self.assertTrue(call.unphased_diploid_gt_index() == gt_index)

    # Homozygous reference, unphased.
    check_call(Call([0, 0]),
               alleles=[0, 0], ploidy=2, phased=False, n_alt=0, one_hot=[2, 0],
               true_predicates={'is_diploid', 'is_hom_ref'},
               gt_index=0)

    # Heterozygous, phased.
    check_call(Call([1, 0], phased=True),
               alleles=[1, 0], ploidy=2, phased=True, n_alt=1, one_hot=[1, 1],
               true_predicates={'is_diploid', 'is_het', 'is_non_ref', 'is_het_ref'})

    # Homozygous variant, unphased.
    check_call(Call([1, 1]),
               alleles=[1, 1], ploidy=2, phased=False, n_alt=2, one_hot=[0, 2],
               true_predicates={'is_diploid', 'is_hom_var', 'is_non_ref'},
               gt_index=2)

    # Haploid, phased.
    check_call(Call([2], phased=True),
               alleles=[2], ploidy=1, phased=True, n_alt=1, one_hot=[0, 0, 1],
               true_predicates={'is_haploid', 'is_hom_var', 'is_non_ref'})

    # Zero-ploid: every predicate is expected to be falsy.
    check_call(Call([]),
               alleles=[], ploidy=0, phased=False, n_alt=0, one_hot=[0, 0, 0],
               true_predicates=set())

    self.assertRaisesRegex(
        NotImplementedError,
        "Calls with greater than 2 alleles are not supported.",
        Call,
        [1, 1, 1, 1])
def test_classes(self):
    """Exercise Locus, Call and ReferenceGenome: construction, accessors and predicates."""
    parsed = Locus.parse('1:100')
    # The contig is given once as a string and once as an integer.
    for contig in ('1', 1):
        self.assertEqual(parsed, Locus(contig, 100))
    self.assertEqual(parsed.reference_genome, hl.default_reference())

    ploidy_predicates = ('is_haploid', 'is_diploid')
    genotype_predicates = ('is_hom_ref', 'is_het', 'is_hom_var',
                           'is_non_ref', 'is_het_non_ref', 'is_het_ref')

    def expect_flag(value, expected):
        # Preserve assertTrue / assertFalse semantics (truthiness, not equality).
        check = self.assertTrue if expected else self.assertFalse
        check(value)

    def check_call(call, alleles, ploidy, phased, n_alt, one_hot, true_predicates, gt_index=None):
        """Run the fixed sequence of Call assertions against the expected values.

        `true_predicates` names the predicate methods expected to return a
        truthy value; every other listed predicate is expected to be falsy.
        """
        self.assertEqual(call.alleles, alleles)
        self.assertEqual(call.ploidy, ploidy)
        expect_flag(call.phased, phased)
        for predicate in ploidy_predicates:
            expect_flag(getattr(call, predicate)(), predicate in true_predicates)
        self.assertEqual(call.n_alt_alleles(), n_alt)
        # The number of alleles requested is the length of the expected vector.
        self.assertTrue(call.one_hot_alleles(len(one_hot)) == one_hot)
        for predicate in genotype_predicates:
            expect_flag(getattr(call, predicate)(), predicate in true_predicates)
        if gt_index is not None:
            self.assertTrue(call.unphased_diploid_gt_index() == gt_index)

    # Homozygous reference, unphased.
    check_call(Call([0, 0]),
               alleles=[0, 0], ploidy=2, phased=False, n_alt=0, one_hot=[2, 0],
               true_predicates={'is_diploid', 'is_hom_ref'},
               gt_index=0)

    # Heterozygous, phased.
    check_call(Call([1, 0], phased=True),
               alleles=[1, 0], ploidy=2, phased=True, n_alt=1, one_hot=[1, 1],
               true_predicates={'is_diploid', 'is_het', 'is_non_ref', 'is_het_ref'})

    # Homozygous variant, unphased.
    check_call(Call([1, 1]),
               alleles=[1, 1], ploidy=2, phased=False, n_alt=2, one_hot=[0, 2],
               true_predicates={'is_diploid', 'is_hom_var', 'is_non_ref'},
               gt_index=2)

    # Haploid, phased.
    check_call(Call([2], phased=True),
               alleles=[2], ploidy=1, phased=True, n_alt=1, one_hot=[0, 0, 1],
               true_predicates={'is_haploid', 'is_hom_var', 'is_non_ref'})

    # Zero-ploid: every predicate is expected to be falsy.
    check_call(Call([]),
               alleles=[], ploidy=0, phased=False, n_alt=0, one_hot=[0, 0, 0],
               true_predicates=set())

    self.assertRaisesRegex(
        NotImplementedError,
        "Calls with greater than 2 alleles are not supported.",
        Call,
        [1, 1, 1, 1])

    # --- Built-in reference genome, looked up by name ---
    grch37 = hl.get_reference('GRCh37')
    self.assertEqual(grch37.name, "GRCh37")
    self.assertEqual(grch37.contigs[0], "1")
    self.assertListEqual(grch37.x_contigs, ["X"])
    self.assertListEqual(grch37.y_contigs, ["Y"])
    self.assertListEqual(grch37.mt_contigs, ["MT"])
    self.assertEqual(grch37.par[0],
                     hl.parse_locus_interval("X:60001-2699521").value)
    self.assertEqual(grch37.contig_length("1"), 249250621)

    # --- Reference genome built from explicit arguments ---
    custom_name = "test"
    custom_contigs = ["1", "X", "Y", "MT"]
    custom_lengths = {"1": 10000, "X": 2000, "Y": 4000, "MT": 1000}
    custom_x = ["X"]
    custom_y = ["Y"]
    custom_mt = ["MT"]
    custom_par = [("X", 5, 1000)]

    custom_rg = ReferenceGenome(custom_name, custom_contigs, custom_lengths,
                                custom_x, custom_y, custom_mt, custom_par)
    self.assertEqual(custom_rg.name, custom_name)
    self.assertListEqual(custom_rg.contigs, custom_contigs)
    self.assertListEqual(custom_rg.x_contigs, custom_x)
    self.assertListEqual(custom_rg.y_contigs, custom_y)
    self.assertListEqual(custom_rg.mt_contigs, custom_mt)
    self.assertEqual(custom_rg.par,
                     [hl.parse_locus_interval("X:5-1000", custom_rg).value])
    self.assertEqual(custom_rg.contig_length("1"), 10000)
    self.assertDictEqual(custom_rg.lengths, custom_lengths)
    custom_rg.write("/tmp/my_gr.json")

    # --- Reference genome read from a JSON resource ---
    loaded_rg = ReferenceGenome.read(resource("fake_ref_genome.json"))
    self.assertEqual(loaded_rg.name, "my_reference_genome")
    self.assertFalse(loaded_rg.has_sequence())

    # --- Reference genome built from a FASTA file and its index ---
    fasta_rg = ReferenceGenome.from_fasta_file(
        "test_rg",
        resource("fake_reference.fasta"),
        resource("fake_reference.fasta.fai"),
        mt_contigs=["b", "c"],
        x_contigs=["a"])
    self.assertTrue(fasta_rg.has_sequence())
    self.assertTrue(fasta_rg.x_contigs == ["a"])

    # Every (contig, pos) row of the TSV must match the base read from the sequence.
    bases_ht = hl.import_table(resource("fake_reference.tsv"), impute=True)
    self.assertTrue(
        bases_ht.all(hl.get_sequence(fasta_rg, bases_ht.contig, bases_ht.pos) == bases_ht.base))

    context_locus = hl.locus("a", 7, fasta_rg)
    self.assertTrue(
        context_locus.sequence_context(before=3, after=3).value == "TTTCGAA")