def test_liftover_strand(self):
    grch37 = hl.get_reference('GRCh37')
    grch37.add_liftover(resource('grch37_to_grch38_chr20.over.chain.gz'), 'GRCh38')

    self.assertEqual(
        hl.eval(hl.liftover(hl.locus('20', 60001, 'GRCh37'), 'GRCh38', include_strand=True)),
        hl.eval(hl.struct(result=hl.locus('chr20', 79360, 'GRCh38'), is_negative_strand=False)))

    self.assertEqual(
        hl.eval(hl.liftover(hl.locus_interval('20', 37007582, 37007586, True, True, 'GRCh37'),
                            'GRCh38', include_strand=True)),
        hl.eval(hl.struct(result=hl.locus_interval('chr12', 32563117, 32563121, True, True, 'GRCh38'),
                          is_negative_strand=True)))

    with self.assertRaises(FatalError):
        hl.eval(hl.liftover(hl.parse_locus_interval('1:10000-10000', reference_genome='GRCh37'), 'GRCh38'))

    grch37.remove_liftover("GRCh38")
def test_reference_genome_liftover(self):
    grch37 = hl.get_reference('GRCh37')
    grch38 = hl.get_reference('GRCh38')

    self.assertTrue(not grch37.has_liftover('GRCh38') and not grch38.has_liftover('GRCh37'))

    grch37.add_liftover(resource('grch37_to_grch38_chr20.over.chain.gz'), 'GRCh38')
    grch38.add_liftover(resource('grch38_to_grch37_chr20.over.chain.gz'), 'GRCh37')
    assert grch37.has_liftover('GRCh38')
    assert grch38.has_liftover('GRCh37')

    ds = hl.import_vcf(resource('sample.vcf'))
    t = ds.annotate_rows(liftover=hl.liftover(hl.liftover(ds.locus, 'GRCh38'), 'GRCh37')).rows()
    assert t.all(t.locus == t.liftover)

    null_locus = hl.null(hl.tlocus('GRCh38'))

    rows = [
        {'l37': hl.locus('20', 1, 'GRCh37'), 'l38': null_locus},
        {'l37': hl.locus('20', 60000, 'GRCh37'), 'l38': null_locus},
        {'l37': hl.locus('20', 60001, 'GRCh37'), 'l38': hl.locus('chr20', 79360, 'GRCh38')},
        {'l37': hl.locus('20', 278686, 'GRCh37'), 'l38': hl.locus('chr20', 298045, 'GRCh38')},
        {'l37': hl.locus('20', 278687, 'GRCh37'), 'l38': hl.locus('chr20', 298046, 'GRCh38')},
        {'l37': hl.locus('20', 278688, 'GRCh37'), 'l38': null_locus},
        {'l37': hl.locus('20', 278689, 'GRCh37'), 'l38': null_locus},
        {'l37': hl.locus('20', 278690, 'GRCh37'), 'l38': null_locus},
        {'l37': hl.locus('20', 278691, 'GRCh37'), 'l38': hl.locus('chr20', 298047, 'GRCh38')},
        {'l37': hl.locus('20', 37007586, 'GRCh37'), 'l38': hl.locus('chr12', 32563117, 'GRCh38')},
        {'l37': hl.locus('20', 62965520, 'GRCh37'), 'l38': hl.locus('chr20', 64334167, 'GRCh38')},
        {'l37': hl.locus('20', 62965521, 'GRCh37'), 'l38': null_locus}
    ]
    schema = hl.tstruct(l37=hl.tlocus(grch37), l38=hl.tlocus(grch38))
    t = hl.Table.parallelize(rows, schema)
    self.assertTrue(t.all(hl.cond(hl.is_defined(t.l38),
                                  hl.liftover(t.l37, 'GRCh38') == t.l38,
                                  hl.is_missing(hl.liftover(t.l37, 'GRCh38')))))

    t = t.filter(hl.is_defined(t.l38))
    self.assertTrue(t.count() == 6)
    t = t.key_by('l38')
    t.count()
    self.assertTrue(list(t.key) == ['l38'])

    null_locus_interval = hl.null(hl.tinterval(hl.tlocus('GRCh38')))
    rows = [
        {'i37': hl.locus_interval('20', 1, 60000, True, False, 'GRCh37'), 'i38': null_locus_interval},
        {'i37': hl.locus_interval('20', 60001, 82456, True, True, 'GRCh37'),
         'i38': hl.locus_interval('chr20', 79360, 101815, True, True, 'GRCh38')}
    ]
    schema = hl.tstruct(i37=hl.tinterval(hl.tlocus(grch37)), i38=hl.tinterval(hl.tlocus(grch38)))
    t = hl.Table.parallelize(rows, schema)
    self.assertTrue(t.all(hl.liftover(t.i37, 'GRCh38') == t.i38))

    grch37.remove_liftover("GRCh38")
    grch38.remove_liftover("GRCh37")
def densify_sites(
    mt: hl.MatrixTable,
    sites_ht: hl.Table,
    last_END_positions_ht: hl.Table,
    semi_join_rows: bool = True,
) -> hl.MatrixTable:
    """
    Creates a dense version of the input sparse MT at the sites in `sites_ht`, reading the minimal amount of data required.

    Note that only rows that appear in both `mt` and `sites_ht` are returned.

    :param mt: Input sparse MT
    :param sites_ht: Desired sites to densify
    :param last_END_positions_ht: Table storing positions of the furthest ref block (END tag)
    :param semi_join_rows: Whether to filter the MT rows based on semi-join (default, better if sites_ht is large) or
        based on filter_intervals (better if sites_ht only contains a few sites)
    :return: Dense MT filtered to the sites in `sites_ht`
    """
    logger.info("Computing intervals to densify from sites Table.")
    sites_ht = sites_ht.key_by("locus")
    sites_ht = sites_ht.annotate(
        interval=hl.locus_interval(
            sites_ht.locus.contig,
            last_END_positions_ht[sites_ht.key].last_END_position,
            end=sites_ht.locus.position,
            includes_end=True,
            reference_genome=sites_ht.locus.dtype.reference_genome,
        )
    )
    sites_ht = sites_ht.filter(hl.is_defined(sites_ht.interval))

    if semi_join_rows:
        mt = mt.filter_rows(hl.is_defined(sites_ht.key_by("interval")[mt.locus]))
    else:
        logger.info("Collecting intervals to densify.")
        intervals = sites_ht.interval.collect()

        print(
            "Found {0} intervals, totalling {1} bp in the dense Matrix.".format(
                len(intervals),
                sum([interval_length(interval) for interval in union_intervals(intervals)]),
            )
        )
        mt = hl.filter_intervals(mt, intervals)

    mt = hl.experimental.densify(mt)

    return mt.filter_rows(hl.is_defined(sites_ht[mt.locus]))
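# Hypothetical usage sketch for densify_sites (not part of the original source): assumes a
# sparse gnomAD-style MatrixTable, a sites Table keyed by locus, and a Table with a
# last_END_position row field per locus. All paths below are placeholders.
import hail as hl

sparse_mt = hl.read_matrix_table('gs://my-bucket/sparse.mt')                # placeholder path
sites_ht = hl.read_table('gs://my-bucket/sites.ht')                         # keyed by locus
last_end_ht = hl.read_table('gs://my-bucket/last_END_positions.ht')         # one row per locus with last_END_position

# Densify only at the requested sites; semi_join_rows=False can be preferable for a handful of sites.
dense_mt = densify_sites(sparse_mt, sites_ht, last_end_ht, semi_join_rows=True)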
def test_import_locus_intervals(self):
    interval_file = resource('annotinterall.interval_list')
    t = hl.import_locus_intervals(interval_file, reference_genome='GRCh37')
    nint = t.count()

    i = 0
    with open(interval_file) as f:
        for line in f:
            if len(line.strip()) != 0:
                i += 1

    self.assertEqual(nint, i)
    self.assertEqual(t.interval.dtype.point_type, hl.tlocus('GRCh37'))

    tmp_file = new_temp_file(prefix="test", suffix="interval_list")
    start = t.interval.start
    end = t.interval.end
    (t.key_by(interval=hl.locus_interval(start.contig, start.position, end.position, True, True))
     .select()
     .export(tmp_file, header=False))

    t2 = hl.import_locus_intervals(tmp_file)
    self.assertTrue(t.select()._same(t2))
def test_segment_intervals():
    vds = hl.vds.read_vds(os.path.join(resource('vds'), '1kg_chr22_5_samples.vds'))
    contig_len = vds.reference_data.locus.dtype.reference_genome.lengths['chr22']
    breakpoints = hl.literal([*range(1, contig_len, 5_000_000), contig_len])
    intervals = hl.range(hl.len(breakpoints) - 1) \
        .map(lambda i: hl.struct(
            interval=hl.locus_interval('chr22', breakpoints[i], breakpoints[i + 1],
                                       reference_genome='GRCh38')))
    intervals_ht = hl.Table.parallelize(intervals, key='interval')

    path = new_temp_file()
    r = hl.vds.segment_reference_blocks(vds.reference_data, intervals_ht)
    r.write(path)
    after = hl.read_matrix_table(path)

    es = after.entries()
    es = es.filter((es.END < es.locus.position) | (es.END >= es.interval.end.position))
    if es.count() > 0:
        es.show(width=1000)
        assert False, "found entries with END < position or END >= interval end"

    before = vds.reference_data

    sum_per_sample_before = before.select_cols(
        ref_block_bases=hl.agg.sum(before.END + 1 - before.locus.position)).cols()
    sum_per_sample_after = after.select_cols(
        ref_block_bases=hl.agg.sum(after.END + 1 - after.locus.position)).cols()

    before_coverage = sum_per_sample_before.collect()
    after_coverage = sum_per_sample_after.collect()
    assert before_coverage == after_coverage
def liftover_intervals(t: hl.Table, keep_missing_interval: bool = False) -> hl.Table:
    """
    Lift over the locus intervals in a Table from one coordinate system (GRCh37) to another (GRCh38).

    Example input table description:

    ----------------------------------------
    Global fields:
        None
    ----------------------------------------
    Row fields:
        'interval': interval<locus<GRCh37>>
    ----------------------------------------
    Key: ['interval']
    ----------------------------------------

    :param t: Table of intervals on GRCh37
    :param keep_missing_interval: If True, keep missing (non-lifted) intervals in the output Table.
    :return: Table keyed by the GRCh38 intervals, with the original GRCh37 intervals kept as `interval_hg37`.
    """
    rg37 = hl.get_reference("GRCh37")
    rg38 = hl.get_reference("GRCh38")

    if not rg37.has_liftover("GRCh38"):
        rg37.add_liftover(
            f'{nfs_dir}/resources/liftover/grch37_to_grch38.over.chain.gz', rg38)

    t = t.annotate(
        start=hl.liftover(t.interval.start, "GRCh38"),
        end=hl.liftover(t.interval.end, "GRCh38"),
    )
    t = t.filter((t.start.contig == "chr" + t.interval.start.contig)
                 & (t.end.contig == "chr" + t.interval.end.contig))

    t = t.key_by()
    t = t.select(interval=hl.locus_interval(t.start.contig,
                                            t.start.position,
                                            t.end.position,
                                            reference_genome=rg38,
                                            invalid_missing=True),
                 interval_hg37=t.interval)

    # Count intervals that could not be lifted over (use .get so the count is 0 when none are missing).
    missing = t.aggregate(hl.agg.counter(~hl.is_defined(t.interval)))
    n_missing = missing.get(True, 0)
    logger.info(f"Number of missing intervals: {n_missing} out of {t.count()}...")

    # Update global annotations.
    global_ann_expr = {
        'date': current_date(),
        'reference_genome': 'GRCh38',
        'was_lifted': True
    }
    t = t.annotate_globals(**global_ann_expr)

    if not keep_missing_interval:
        logger.info(f"Filtering out {n_missing} missing intervals...")
        t = t.filter(hl.is_defined(t.interval), keep=True)

    return t.key_by("interval")
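# Hypothetical usage sketch for liftover_intervals (not part of the original source): builds a
# one-row GRCh37 interval Table and lifts it to GRCh38. The chain-file location used inside
# liftover_intervals depends on `nfs_dir`, which is assumed to be defined elsewhere.
import hail as hl

rows = [{'interval': hl.parse_locus_interval('20:60001-82456', reference_genome='GRCh37')}]
schema = hl.tstruct(interval=hl.tinterval(hl.tlocus('GRCh37')))
intervals_ht = hl.Table.parallelize(rows, schema, key='interval')

lifted_ht = liftover_intervals(intervals_ht, keep_missing_interval=False)
lifted_ht.show()  # keyed by the GRCh38 interval, with the GRCh37 interval kept as interval_hg37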
def import_gtf(path, reference_genome=None, skip_invalid_contigs=False, min_partitions=None) -> hl.Table:
    """Import a GTF file.

    The GTF file format is identical to the GFF version 2 file format,
    and so this function can be used to import GFF version 2 files as
    well.

    See https://www.ensembl.org/info/website/upload/gff.html for more
    details on the GTF/GFF2 file format.

    The :class:`.Table` returned by this function will be keyed by the
    ``interval`` row field and will include the following row fields:

    .. code-block:: text

        'source': str
        'feature': str
        'score': float64
        'strand': str
        'frame': int32
        'interval': interval<>

    There will also be corresponding fields for every tag found in the
    attribute field of the GTF file.

    Note
    ----

    This function will return an ``interval`` field of type :class:`.tinterval`
    constructed from the ``seqname``, ``start``, and ``end`` fields in the
    GTF file. This interval is inclusive of both the start and end positions
    in the GTF file.

    If the ``reference_genome`` parameter is specified, the start and end
    points of the ``interval`` field will be of type :class:`.tlocus`.
    Otherwise, the start and end points of the ``interval`` field will be of
    type :class:`.tstruct` with fields ``seqname`` (type :class:`str`) and
    ``position`` (type :class:`.tint32`).

    Furthermore, if the ``reference_genome`` parameter is specified and
    ``skip_invalid_contigs`` is ``True``, this import function will skip
    lines in the GTF where ``seqname`` is not consistent with the reference
    genome specified.

    Example
    -------

    >>> ht = hl.experimental.import_gtf('data/test.gtf',
    ...                                 reference_genome='GRCh37',
    ...                                 skip_invalid_contigs=True)

    >>> ht.describe()  # doctest: +SKIP_OUTPUT_CHECK
    ----------------------------------------
    Global fields:
        None
    ----------------------------------------
    Row fields:
        'source': str
        'feature': str
        'score': float64
        'strand': str
        'frame': int32
        'gene_type': str
        'exon_id': str
        'havana_transcript': str
        'level': str
        'transcript_name': str
        'gene_status': str
        'gene_id': str
        'transcript_type': str
        'tag': str
        'transcript_status': str
        'gene_name': str
        'transcript_id': str
        'exon_number': str
        'havana_gene': str
        'interval': interval<locus<GRCh37>>
    ----------------------------------------
    Key: ['interval']
    ----------------------------------------

    Parameters
    ----------

    path : :obj:`str`
        File to import.
    reference_genome : :obj:`str` or :class:`.ReferenceGenome`, optional
        Reference genome to use.
    skip_invalid_contigs : :obj:`bool`
        If ``True`` and `reference_genome` is not ``None``, skip lines where
        ``seqname`` is not consistent with the reference genome.
    min_partitions : :obj:`int` or :obj:`None`
        Minimum number of partitions (passed to import_table).

    Returns
    -------
    :class:`.Table`
    """

    ht = hl.import_table(path,
                         min_partitions=min_partitions,
                         comment='#',
                         no_header=True,
                         types={'f3': hl.tint,
                                'f4': hl.tint,
                                'f5': hl.tfloat,
                                'f7': hl.tint},
                         missing='.',
                         delimiter='\t')

    ht = ht.rename({'f0': 'seqname',
                    'f1': 'source',
                    'f2': 'feature',
                    'f3': 'start',
                    'f4': 'end',
                    'f5': 'score',
                    'f6': 'strand',
                    'f7': 'frame',
                    'f8': 'attribute'})

    ht = ht.annotate(attribute=hl.dict(
        hl.map(lambda x: (x.split(' ')[0],
                          x.split(' ')[1].replace('"', '').replace(';$', '')),
               ht['attribute'].split('; '))))

    attributes = ht.aggregate(hl.agg.explode(lambda x: hl.agg.collect_as_set(x),
                                             ht['attribute'].keys()))

    ht = ht.transmute(**{x: hl.or_missing(ht['attribute'].contains(x),
                                          ht['attribute'][x])
                         for x in attributes if x})

    if reference_genome:
        if reference_genome == 'GRCh37':
            ht = ht.annotate(seqname=ht['seqname'].replace('^chr', ''))
        else:
            ht = ht.annotate(seqname=hl.case()
                             .when(ht['seqname'].startswith('HLA'), ht['seqname'])
                             .when(ht['seqname'].startswith('chrHLA'), ht['seqname'].replace('^chr', ''))
                             .when(ht['seqname'].startswith('chr'), ht['seqname'])
                             .default('chr' + ht['seqname']))
        if skip_invalid_contigs:
            valid_contigs = hl.literal(set(hl.get_reference(reference_genome).contigs))
            ht = ht.filter(valid_contigs.contains(ht['seqname']))
        ht = ht.transmute(interval=hl.locus_interval(ht['seqname'],
                                                     ht['start'],
                                                     ht['end'],
                                                     includes_start=True,
                                                     includes_end=True,
                                                     reference_genome=reference_genome))
    else:
        ht = ht.transmute(interval=hl.interval(hl.struct(seqname=ht['seqname'], position=ht['start']),
                                               hl.struct(seqname=ht['seqname'], position=ht['end']),
                                               includes_start=True,
                                               includes_end=True))

    ht = ht.key_by('interval')

    return ht
def import_gtf(path, reference_genome=None, skip_invalid_contigs=False, min_partitions=None,
               force_bgz=False, force=False) -> hl.Table:
    """Import a GTF file.

    The GTF file format is identical to the GFF version 2 file format,
    and so this function can be used to import GFF version 2 files as
    well.

    See https://www.ensembl.org/info/website/upload/gff.html for more
    details on the GTF/GFF2 file format.

    The :class:`.Table` returned by this function will be keyed by the
    ``interval`` row field and will include the following row fields:

    .. code-block:: text

        'source': str
        'feature': str
        'score': float64
        'strand': str
        'frame': int32
        'interval': interval<>

    There will also be corresponding fields for every tag found in the
    attribute field of the GTF file.

    Note
    ----

    This function will return an ``interval`` field of type :class:`.tinterval`
    constructed from the ``seqname``, ``start``, and ``end`` fields in the
    GTF file. This interval is inclusive of both the start and end positions
    in the GTF file.

    If the ``reference_genome`` parameter is specified, the start and end
    points of the ``interval`` field will be of type :class:`.tlocus`.
    Otherwise, the start and end points of the ``interval`` field will be of
    type :class:`.tstruct` with fields ``seqname`` (type :class:`str`) and
    ``position`` (type :obj:`.tint32`).

    Furthermore, if the ``reference_genome`` parameter is specified and
    ``skip_invalid_contigs`` is ``True``, this import function will skip
    lines in the GTF where ``seqname`` is not consistent with the reference
    genome specified.

    Example
    -------

    >>> ht = hl.experimental.import_gtf('data/test.gtf',
    ...                                 reference_genome='GRCh37',
    ...                                 skip_invalid_contigs=True)

    >>> ht.describe()  # doctest: +SKIP_OUTPUT_CHECK
    ----------------------------------------
    Global fields:
        None
    ----------------------------------------
    Row fields:
        'source': str
        'feature': str
        'score': float64
        'strand': str
        'frame': int32
        'gene_type': str
        'exon_id': str
        'havana_transcript': str
        'level': str
        'transcript_name': str
        'gene_status': str
        'gene_id': str
        'transcript_type': str
        'tag': str
        'transcript_status': str
        'gene_name': str
        'transcript_id': str
        'exon_number': str
        'havana_gene': str
        'interval': interval<locus<GRCh37>>
    ----------------------------------------
    Key: ['interval']
    ----------------------------------------

    Parameters
    ----------

    path : :class:`str`
        File to import.
    reference_genome : :class:`str` or :class:`.ReferenceGenome`, optional
        Reference genome to use.
    skip_invalid_contigs : :obj:`bool`
        If ``True`` and `reference_genome` is not ``None``, skip lines where
        ``seqname`` is not consistent with the reference genome.
    min_partitions : :obj:`int` or :obj:`None`
        Minimum number of partitions (passed to import_table).
    force_bgz : :obj:`bool`
        If ``True``, load files as blocked gzip files, assuming
        that they were actually compressed using the BGZ codec. This option is
        useful when the file extension is not ``'.bgz'``, but the file is
        blocked gzip, so that the file can be read in parallel and not on a
        single node.
    force : :obj:`bool`
        If ``True``, load gzipped files serially on one core. This should
        be used only when absolutely necessary, as processing time will be
        increased due to lack of parallelism.

    Returns
    -------
    :class:`.Table`
    """

    ht = hl.import_table(path,
                         min_partitions=min_partitions,
                         comment='#',
                         no_header=True,
                         types={'f3': hl.tint,
                                'f4': hl.tint,
                                'f5': hl.tfloat,
                                'f7': hl.tint},
                         missing='.',
                         delimiter='\t',
                         force_bgz=force_bgz,
                         force=force)

    ht = ht.rename({'f0': 'seqname',
                    'f1': 'source',
                    'f2': 'feature',
                    'f3': 'start',
                    'f4': 'end',
                    'f5': 'score',
                    'f6': 'strand',
                    'f7': 'frame',
                    'f8': 'attribute'})

    def parse_attributes(unparsed_attributes):
        def parse_attribute(attribute):
            key_and_value = attribute.split(' ')
            key = key_and_value[0]
            value = key_and_value[1]
            # StringExpression.replace uses regex: strip double quotes and a trailing semicolon.
            return (key, value.replace('"|;\\$', ''))

        return hl.dict(unparsed_attributes.split('; ').map(parse_attribute))

    ht = ht.annotate(attribute=parse_attributes(ht['attribute']))

    ht = ht.checkpoint(new_temp_file())

    attributes = ht.aggregate(hl.agg.explode(lambda x: hl.agg.collect_as_set(x),
                                             ht['attribute'].keys()))

    ht = ht.transmute(**{x: hl.or_missing(ht['attribute'].contains(x),
                                          ht['attribute'][x])
                         for x in attributes if x})

    if reference_genome:
        if reference_genome.name == 'GRCh37':
            ht = ht.annotate(seqname=hl.case()
                             .when((ht['seqname'] == 'M') | (ht['seqname'] == 'chrM'), 'MT')
                             .when(ht['seqname'].startswith('chr'), ht['seqname'].replace('^chr', ''))
                             .default(ht['seqname']))
        else:
            ht = ht.annotate(seqname=hl.case()
                             .when(ht['seqname'].startswith('HLA'), ht['seqname'])
                             .when(ht['seqname'].startswith('chrHLA'), ht['seqname'].replace('^chr', ''))
                             .when(ht['seqname'].startswith('chr'), ht['seqname'])
                             .default('chr' + ht['seqname']))
        if skip_invalid_contigs:
            valid_contigs = hl.literal(set(reference_genome.contigs))
            ht = ht.filter(valid_contigs.contains(ht['seqname']))
        ht = ht.transmute(interval=hl.locus_interval(ht['seqname'],
                                                     ht['start'],
                                                     ht['end'],
                                                     includes_start=True,
                                                     includes_end=True,
                                                     reference_genome=reference_genome))
    else:
        ht = ht.transmute(interval=hl.interval(hl.struct(seqname=ht['seqname'], position=ht['start']),
                                               hl.struct(seqname=ht['seqname'], position=ht['end']),
                                               includes_start=True,
                                               includes_end=True))

    ht = ht.key_by('interval')

    return ht
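# Hypothetical follow-up sketch (not part of the original source): import a GENCODE-style GTF,
# keep only gene features, and use the interval key to annotate a locus-keyed table by interval
# membership. 'data/test.gtf' and 'data/sample.vcf' are placeholders, and the gene_name field
# assumes the GTF carries a GENCODE-style gene_name attribute.
import hail as hl

gtf_ht = hl.experimental.import_gtf('data/test.gtf',
                                    reference_genome='GRCh37',
                                    skip_invalid_contigs=True)
genes_ht = gtf_ht.filter(gtf_ht.feature == 'gene')

# An interval-keyed table indexed by a locus matches any row whose interval contains that locus.
variants_ht = hl.import_vcf('data/sample.vcf').rows()
variants_ht = variants_ht.annotate(gene_name=genes_ht[variants_ht.locus].gene_name)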
raw_data_root = 'gs://hail-datasets-raw-data/Ensembl'
hail_data_root = 'gs://hail-datasets-hail-data'

parser = argparse.ArgumentParser()
parser.add_argument('-v', required=True, help='Dataset version.')
parser.add_argument('-b', required=True, choices=['GRCh37', 'GRCh38'], help='Ensembl reference genome build.')
args = parser.parse_args()

name = 'Ensembl_homo_sapiens_low_complexity_regions'
version = args.v
build = args.b

ht = hl.import_table(f'{raw_data_root}/Ensembl_homo_sapiens_low_complexity_regions_release{version}_{build}.tsv.bgz')

if build == 'GRCh37':
    ht = ht.annotate(interval=hl.locus_interval(ht['chromosome'],
                                                hl.int(ht['start']),
                                                hl.int(ht['end']),
                                                reference_genome='GRCh37'))
else:
    ht = ht.annotate(interval=hl.locus_interval('chr' + ht['chromosome'].replace('MT', 'M'),
                                                hl.int(ht['start']),
                                                hl.int(ht['end']),
                                                reference_genome='GRCh38'))

ht = ht.key_by('interval')
ht = ht.select()

n_rows = ht.count()
n_partitions = ht.n_partitions()

ht = ht.annotate_globals(metadata=hl.struct(name=name,
                                            version=f'release_{version}',
                                            reference_genome=build,
                                            n_rows=n_rows,
                                            n_partitions=n_partitions))
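# Hypothetical usage sketch (not part of the original source): flag and drop variants that fall
# in an Ensembl low-complexity region by joining a locus-keyed dataset against the interval-keyed
# table built above. 'data/sample.vcf' is a placeholder dataset on the same reference build.
lcr_ht = ht  # interval-keyed low-complexity-regions Table from above
mt = hl.import_vcf('data/sample.vcf', reference_genome=build)
mt = mt.annotate_rows(in_low_complexity_region=hl.is_defined(lcr_ht[mt.locus]))
mt = mt.filter_rows(~mt.in_low_complexity_region)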
ht_genes = ht_genes.rename({'interval': 'gene_interval'})
ht_genes = ht_genes.distinct()

mt = hl.import_matrix_table(EXTRACT_BUCKET + 'GTEx/v7/GTEx_junction_read_counts.v7.GRCh37.tsv.bgz',
                            row_fields={'junction_id': hl.tstr, 'Description': hl.tstr},
                            missing=' ',
                            entry_type=hl.tfloat)

mt = mt.transmute_rows(chr_start_end=mt['junction_id'].split('_'))
mt = mt.transmute_rows(junction_interval=hl.locus_interval(mt['chr_start_end'][0],
                                                           hl.int(mt['chr_start_end'][1]),
                                                           hl.int(mt['chr_start_end'][2]),
                                                           includes_start=True,
                                                           includes_end=True,
                                                           reference_genome='GRCh37'))
mt = mt.key_rows_by(mt['junction_interval'])
mt = mt.transmute_entries(read_count=hl.int(mt['x']))
mt = mt.rename({'Description': 'gene_id', 'col_id': 'sample_id'})
mt = mt.annotate_cols(**ht_sample_attributes[mt.sample_id])

if reference_genome == 'GRCh38':
    b37 = hl.get_reference('GRCh37')
    b37.add_liftover('gs://hail-common/references/grch37_to_grch38.over.chain.gz', 'GRCh38')
    mt = mt.key_rows_by()
    mt = mt.annotate_rows(junction_interval=hl.liftover(mt['junction_interval'], 'GRCh38'))
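# Hypothetical continuation sketch (not part of the original source): after lifting the junction
# intervals over, a typical next step is to drop junctions that failed to lift and re-key the
# rows by the GRCh38 interval.
mt = mt.filter_rows(hl.is_defined(mt['junction_interval']))
mt = mt.key_rows_by(mt['junction_interval'])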
def main(args):

    if args.create_gene_sample_mt:
        mt = hl.read_matrix_table('gs://gnomad/projects/compound_hets/myoseq/MacArthur_LGMD_Callset_Jan2019.mt')
        meta = hl.read_table('gs://gnomad/projects/compound_hets/myoseq/sample_qc/MacArthur_LGMD_Callset_Jan2019.full_meta.ht')
        pop_distance = hl.read_table('gs://gnomad-lfran/compound_hets/myoseq/sample_qc/myoseq_pop_distance_to_max_kde.ht')
        variant_annotations_ht = hl.read_table('gs://gnomad/projects/compound_hets/myoseq/MacArthur_LGMD_Callset_Jan2019.annotations.ht')
        variant_annotations_ht = variant_annotations_ht.drop('was_split', 'a_index')

        mt = mt.annotate_cols(
            **meta[mt.col_key],
            **pop_distance[mt.col_key],
        )
        mt = mt.annotate_rows(**variant_annotations_ht[mt.row_key])

        # Filter samples failing QC
        mt = mt.filter_cols(
            (hl.len(mt.sample_filters) == 0)
            & (mt.distance < args.pop_distance)  # NFE pop-distance away from densest point in KDE in pc-space (selects only NFEs)
        )
        counts = mt.aggregate_cols(hl.agg.counter(mt.is_case))
        print(f'Found {counts[True]} cases and {counts[False]} controls for gene aggregation.')

        # Filter sites failing QC, without any tx_annotation (i.e. without a protein-coding variant) or too common
        mt = mt.filter_rows(
            (hl.len(mt.filters) == 0)
            & hl.is_defined(mt.tx_annotation)
            & (hl.or_else(mt.gnomad_exomes_popmax.AF, hl.or_else(mt.gnomad_genomes_popmax.AF, 0.0)) < args.max_gnomad_af))

        # Keep non-ref entries only
        entries_filter_expr = mt.GT.is_non_ref()
        if not args.raw:
            entries_filter_expr = mt.GT.is_non_ref() & get_adj_expr(mt.GT, mt.GQ, mt.DP, mt.AD, haploid_adj_dp=5)
        mt = mt.filter_entries(entries_filter_expr)

        # Annotate genes
        mt = mt.annotate_rows(gene=hl.set(
            mt.tx_annotation.map(lambda x: hl.struct(gene_symbol=x.symbol, gene_id=x.ensg))))

        # Aggregate by gene
        mt = mt.explode_rows(mt.gene)
        mt = mt.annotate_rows(tx_annotation=mt.tx_annotation.filter(
            lambda x: (x.symbol == mt.gene.gene_symbol) & (x.ensg == mt.gene.gene_id)))
        # mt.write('gs://gnomad/projects/compound_hets/myoseq/MacArthur_LGMD_Callset_Jan2019_filtered_gene_exploded.mt', overwrite=True)

        # TODO: Add pext to missense counts
        # mt = hl.read_matrix_table('gs://gnomad/projects/compound_hets/myoseq/MacArthur_LGMD_Callset_Jan2019_filtered_gene_exploded.mt')
        mt.group_rows_by(**mt.gene).aggregate(
            locus_interval=hl.locus_interval(
                hl.agg.take(mt.locus, 1)[0].contig,
                hl.agg.min(mt.locus.position),
                hl.agg.max(mt.locus.position),
                includes_end=True),
            n_het_lof=hl.agg.count_where(mt.GT.is_het() & mt.tx_annotation.any(lambda x: x.lof == 'HC')),
            n_hom_lof=hl.agg.count_where(mt.GT.is_hom_var() & mt.tx_annotation.any(lambda x: x.lof == 'HC')),
            n_het_lof_pext=hl.agg.count_where(mt.GT.is_het() & mt.tx_annotation.any(
                lambda x: (x.lof == 'HC') & (x.Muscle_Skeletal >= args.pext_cutoff))),
            n_hom_lof_pext=hl.agg.count_where(mt.GT.is_hom_var() & mt.tx_annotation.any(
                lambda x: (x.lof == 'HC') & (x.Muscle_Skeletal >= args.pext_cutoff))),
            n_het_missense=hl.agg.count_where(mt.GT.is_het() & mt.tx_annotation.any(lambda x: x.csq == 'missense_variant')),
            n_hom_missense=hl.agg.count_where(mt.GT.is_hom_var() & mt.tx_annotation.any(lambda x: x.csq == 'missense_variant')),
            n_het_damaging_missense=hl.agg.count_where(mt.GT.is_het() & mt.tx_annotation.any(
                lambda x: (x.polyphen_prediction == 'probably damaging') | (x.sift_prediction == 'deleterious'))),
            n_hom_damaging_missense=hl.agg.count_where(mt.GT.is_hom_var() & mt.tx_annotation.any(
                lambda x: (x.polyphen_prediction == 'probably damaging') | (x.sift_prediction == 'deleterious'))),
            n_het_synonymous=hl.agg.count_where(mt.GT.is_het() & mt.tx_annotation.any(lambda x: x.csq == 'synonymous_variant')),
            n_hom_synonymous=hl.agg.count_where(mt.GT.is_hom_var() & mt.tx_annotation.any(lambda x: x.csq == 'synonymous_variant'))
        ).write('gs://gnomad/projects/compound_hets/myoseq/MacArthur_LGMD_Callset_Jan2019_gene_burden.mt',
                overwrite=args.overwrite)

    if args.run_burden_tests:
        mt = hl.read_matrix_table('gs://gnomad/projects/compound_hets/myoseq/MacArthur_LGMD_Callset_Jan2019_gene_burden.mt')

        def fet_expr(het_count_expr: hl.expr.Int64Expression, hom_count_expr: hl.expr.Int64Expression):
            return hl.bind(
                lambda x: hl.struct(
                    counts=x,
                    dominant=hl.fisher_exact_test(x[0][0], x[0][1] + x[0][2], x[1][0], x[1][1] + x[1][2]),
                    recessive=hl.fisher_exact_test(x[0][0] + x[0][1], x[0][2], x[1][0] + x[1][1], x[1][2])),
                hl.bind(
                    lambda x: [
                        [
                            hl.int32(hl.cond(x.contains(False), x[False].get(0, 0), 0)),
                            hl.int32(hl.cond(x.contains(False), x[False].get(1, 0), 0)),
                            hl.int32(hl.cond(x.contains(False), x[False].get(2, 0), 0))
                        ],
                        [
                            hl.int32(hl.cond(x.contains(True), x[True].get(0, 0), 0)),
                            hl.int32(hl.cond(x.contains(True), x[True].get(1, 0), 0)),
                            hl.int32(hl.cond(x.contains(True), x[True].get(2, 0), 0))
                        ],
                    ],
                    hl.agg.group_by(mt.is_case,
                                    hl.agg.counter(hl.min(2, het_count_expr + 2 * hom_count_expr)))))

        mt = mt.annotate_rows(
            **{
                'lof': fet_expr(mt.n_het_lof, mt.n_hom_lof),
                'lof_pext': fet_expr(mt.n_het_lof_pext, mt.n_hom_lof_pext),
                'lof_missense': fet_expr(mt.n_het_lof + mt.n_het_missense, mt.n_het_lof + mt.n_hom_missense),
                'lof_damaging_missense': fet_expr(mt.n_het_lof + mt.n_het_damaging_missense,
                                                  mt.n_het_lof + mt.n_hom_damaging_missense),
                'synonymous': fet_expr(mt.n_het_synonymous, mt.n_hom_synonymous)
            })

        mt.write('gs://gnomad/projects/compound_hets/myoseq/MacArthur_LGMD_Callset_Jan2019_gene_burden_tests.mt',
                 overwrite=args.overwrite)
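# Hypothetical follow-up sketch (not part of the original source): rank genes by the recessive
# Fisher's exact p-value computed by fet_expr above. hl.fisher_exact_test returns a struct with a
# p_value field, so the per-gene results can be read off the row fields of the written MatrixTable.
results_ht = hl.read_matrix_table(
    'gs://gnomad/projects/compound_hets/myoseq/MacArthur_LGMD_Callset_Jan2019_gene_burden_tests.mt').rows()
results_ht = results_ht.select(lof_recessive_p=results_ht.lof.recessive.p_value)
results_ht.order_by(results_ht.lof_recessive_p).show(20)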
ht_genes = ht_genes.annotate(attribute=hl.dict(
    hl.map(lambda x: (x.split(' ')[0],
                      x.split(' ')[1].replace('"', '').replace(';$', '')),
           ht_genes['attribute'].split('; '))))

attributes = ht_genes.aggregate(hl.agg.explode(lambda x: hl.agg.collect_as_set(x),
                                               ht_genes['attribute'].keys()))

ht_genes = ht_genes.transmute(**{x: hl.or_missing(ht_genes['attribute'].contains(x),
                                                  ht_genes['attribute'][x])
                                 for x in attributes if x})

ht_genes = ht_genes.annotate(gene_interval=hl.locus_interval(ht_genes['seqname'],
                                                             ht_genes['start'],
                                                             ht_genes['end'] + 1,
                                                             reference_genome='GRCh37'))

ht_genes = ht_genes.filter(ht_genes['feature'] == 'gene')
ht_genes = ht_genes.key_by('gene_id')
ht_genes = ht_genes.select('gene_interval', 'source', 'gene_name', 'havana_gene', 'gene_type',
                           'gene_status', 'level', 'score', 'strand', 'frame', 'tag')
ht_genes = ht_genes.rename({'gene_name': 'gene_symbol', 'havana_gene': 'havana_gene_id'})

ht_genes.write('hdfs:///tmp/genes.ht', overwrite=True)
ht_genes = hl.read_table('hdfs:///tmp/genes.ht')

# gene read counts
name = 'GTEx_RNA_seq_gene_read_counts'