def test_filter_alleles_hts(self):
    # 1 variant: A:T,G
    ds = hl.import_vcf(resource('filter_alleles/input.vcf'))

    self.assertTrue(
        hl.filter_alleles_hts(ds, lambda a, i: a == 'T', subset=True)
        .drop('old_alleles', 'old_locus', 'new_to_old', 'old_to_new')
        ._same(hl.import_vcf(resource('filter_alleles/keep_allele1_subset.vcf'))))

    self.assertTrue(
        hl.filter_alleles_hts(ds, lambda a, i: a == 'G', subset=True)
        .drop('old_alleles', 'old_locus', 'new_to_old', 'old_to_new')
        ._same(hl.import_vcf(resource('filter_alleles/keep_allele2_subset.vcf'))))

    self.assertTrue(
        hl.filter_alleles_hts(ds, lambda a, i: a != 'G', subset=False)
        .drop('old_alleles', 'old_locus', 'new_to_old', 'old_to_new')
        ._same(hl.import_vcf(resource('filter_alleles/keep_allele1_downcode.vcf'))))

    (hl.filter_alleles_hts(ds, lambda a, i: a == 'G', subset=False)).old_to_new.show()

    self.assertTrue(
        hl.filter_alleles_hts(ds, lambda a, i: a == 'G', subset=False)
        .drop('old_alleles', 'old_locus', 'new_to_old', 'old_to_new')
        ._same(hl.import_vcf(resource('filter_alleles/keep_allele2_downcode.vcf'))))
def test_import_vcf_skip_invalid_loci(self):
    mt = hl.import_vcf(resource('skip_invalid_loci.vcf'), reference_genome='GRCh37',
                       skip_invalid_loci=True)
    self.assertTrue(mt._force_count_rows() == 3)

    with self.assertRaisesRegex(FatalError, 'Invalid locus'):
        hl.import_vcf(resource('skip_invalid_loci.vcf')).count()
def test_union_cols_example(self):
    joined = hl.import_vcf(resource('joined.vcf'))
    left = hl.import_vcf(resource('joinleft.vcf'))
    right = hl.import_vcf(resource('joinright.vcf'))
    self.assertTrue(left.union_cols(right)._same(joined))
def test_not_identical_headers(self):
    t = new_temp_file('vcf')
    mt = hl.import_vcf(resource('sample.vcf'))
    hl.export_vcf(mt.filter_cols((mt.s != "C1048::HG02024") & (mt.s != "HG00255")), t)

    with self.assertRaisesRegex(FatalError, 'invalid sample IDs'):
        (hl.import_vcf([resource('sample.vcf'), t])
         ._force_count_rows())
def test_export_vcf(self):
    dataset = hl.import_vcf(resource('sample.vcf.bgz'))
    vcf_metadata = hl.get_vcf_metadata(resource('sample.vcf.bgz'))

    hl.export_vcf(dataset, '/tmp/sample.vcf', metadata=vcf_metadata)
    dataset_imported = hl.import_vcf('/tmp/sample.vcf')
    self.assertTrue(dataset._same(dataset_imported))

    no_sample_dataset = dataset.filter_cols(False).select_entries()
    hl.export_vcf(no_sample_dataset, '/tmp/no_sample.vcf', metadata=vcf_metadata)
    no_sample_dataset_imported = hl.import_vcf('/tmp/no_sample.vcf')
    self.assertTrue(no_sample_dataset._same(no_sample_dataset_imported))

    metadata_imported = hl.get_vcf_metadata('/tmp/sample.vcf')
    self.assertDictEqual(vcf_metadata, metadata_imported)
def test_tdt(self):
    pedigree = hl.Pedigree.read(resource('tdt.fam'))
    tdt_tab = (hl.transmission_disequilibrium_test(
        hl.split_multi_hts(hl.import_vcf(resource('tdt.vcf'), min_partitions=4)),
        pedigree))

    truth = hl.import_table(
        resource('tdt_results.tsv'),
        types={'POSITION': hl.tint32, 'T': hl.tint32, 'U': hl.tint32,
               'Chi2': hl.tfloat64, 'Pval': hl.tfloat64})
    truth = (truth
             .transmute(locus=hl.locus(truth.CHROM, truth.POSITION),
                        alleles=[truth.REF, truth.ALT])
             .key_by('locus', 'alleles'))

    if tdt_tab.count() != truth.count():
        self.fail('Result has {} rows but should have {} rows'.format(tdt_tab.count(), truth.count()))

    bad = (tdt_tab.filter(hl.is_nan(tdt_tab.p_value), keep=False)
           .join(truth.filter(hl.is_nan(truth.Pval), keep=False), how='outer'))
    bad.describe()

    bad = bad.filter(~(
        (bad.t == bad.T) &
        (bad.u == bad.U) &
        (hl.abs(bad.chi_sq - bad.Chi2) < 0.001) &
        (hl.abs(bad.p_value - bad.Pval) < 0.001)))

    if bad.count() != 0:
        # order by the locus key; the table has no 'v' field
        bad.order_by(hl.asc(bad.locus)).show()
        self.fail('Found rows in violation of the predicate (see show output)')
def test_de_novo(self):
    mt = hl.import_vcf(resource('denovo.vcf'))
    mt = mt.filter_rows(mt.locus.in_y_par(), keep=False)  # de_novo_finder doesn't know about Y PAR
    ped = hl.Pedigree.read(resource('denovo.fam'))
    r = hl.de_novo(mt, ped, mt.info.ESP)
    r = r.select(
        prior=r.prior,
        kid_id=r.proband.s,
        dad_id=r.father.s,
        mom_id=r.mother.s,
        p_de_novo=r.p_de_novo,
        confidence=r.confidence).key_by('locus', 'alleles', 'kid_id', 'dad_id', 'mom_id')

    truth = hl.import_table(resource('denovo.out'), impute=True, comment='#')
    truth = truth.select(
        locus=hl.locus(truth['Chr'], truth['Pos']),
        alleles=[truth['Ref'], truth['Alt']],
        kid_id=truth['Child_ID'],
        dad_id=truth['Dad_ID'],
        mom_id=truth['Mom_ID'],
        p_de_novo=truth['Prob_dn'],
        confidence=truth['Validation_Likelihood'].split('_')[0]).key_by('locus', 'alleles', 'kid_id', 'dad_id', 'mom_id')

    j = r.join(truth, how='outer')
    self.assertTrue(j.all((j.confidence == j.confidence_1) &
                          (hl.abs(j.p_de_novo - j.p_de_novo_1) < 1e-4)))
def test_import_vcf_flags_are_defined(self):
    # issue 3277
    t = hl.import_vcf(resource('sample.vcf')).rows()
    self.assertTrue(t.all(hl.is_defined(t.info.NEGATIVE_TRAIN_SITE) &
                          hl.is_defined(t.info.POSITIVE_TRAIN_SITE) &
                          hl.is_defined(t.info.DB) &
                          hl.is_defined(t.info.DS)))
def test_filter_intervals_compound_key(self):
    ds = hl.import_vcf(resource('sample.vcf'), min_partitions=20)
    ds = (ds.annotate_rows(variant=hl.struct(locus=ds.locus, alleles=ds.alleles))
          .key_rows_by('locus', 'alleles'))

    intervals = [hl.Interval(hl.Struct(locus=hl.Locus('20', 10639222), alleles=['A', 'T']),
                             hl.Struct(locus=hl.Locus('20', 10644700), alleles=['A', 'T']))]
    self.assertEqual(hl.filter_intervals(ds, intervals).count_rows(), 3)
def test_fix3307_read_mt_wrong(self):
    mt = hl.import_vcf(resource('sample2.vcf'))
    mt = hl.split_multi_hts(mt)
    mt.write('/tmp/foo.mt', overwrite=True)
    mt2 = hl.read_matrix_table('/tmp/foo.mt')
    t = hl.read_table('/tmp/foo.mt/rows')
    self.assertTrue(mt.rows()._same(t))
    self.assertTrue(mt2.rows()._same(t))
    self.assertTrue(mt._same(mt2))
def test_import_vcf(self):
    vcf = hl.split_multi_hts(
        hl.import_vcf(resource('sample2.vcf'),
                      reference_genome=hl.get_reference('GRCh38'),
                      contig_recoding={"22": "chr22"}))

    vcf_table = vcf.rows()
    self.assertTrue(vcf_table.all(vcf_table.locus.contig == "chr22"))
    # assertTrue with two arguments treats the second as a message and always
    # passes; the intended check is an equality against the import's genome
    self.assertEqual(vcf.locus.dtype, hl.tlocus('GRCh38'))
def get_1kg(output_dir, overwrite: bool = False):
    """Download subset of the `1000 Genomes <http://www.internationalgenome.org/>`__
    dataset and sample annotations.

    Notes
    -----
    The download is about 15M.

    Parameters
    ----------
    output_dir
        Directory in which to write data.
    overwrite
        If ``True``, overwrite any existing files/directories at `output_dir`.
    """
    jhc = Env.hc()._jhc

    _mkdir(jhc, output_dir)

    matrix_table_path = os.path.join(output_dir, '1kg.mt')
    annotations_path = os.path.join(output_dir, '1kg_annotations.txt')
    if (overwrite
            or not Env.jutils().dirExists(jhc, matrix_table_path)
            or not Env.jutils().fileExists(jhc, annotations_path)):
        init_temp_dir()
        tmp_vcf = os.path.join(tmp_dir, '1kg.vcf.bgz')
        source = resources['1kg_matrix_table']
        info(f'downloading 1KG VCF ...\n'
             f'  Source: {source}')
        urlretrieve(resources['1kg_matrix_table'], tmp_vcf)
        cluster_readable_vcf = Env.jutils().copyToTmp(jhc, local_path_uri(tmp_vcf), 'vcf')
        info('importing VCF and writing to matrix table...')
        hl.import_vcf(cluster_readable_vcf, min_partitions=16).write(matrix_table_path, overwrite=True)

        tmp_annot = os.path.join(tmp_dir, '1kg_annotations.txt')
        source = resources['1kg_annotations']
        info(f'downloading 1KG annotations ...\n'
             f'  Source: {source}')
        urlretrieve(source, tmp_annot)
        hl.hadoop_copy(local_path_uri(tmp_annot), annotations_path)
        info('Done!')
    else:
        info('1KG files found')
def test_filter_alleles(self):
    # poor man's Gen
    paths = [resource('sample.vcf'),
             resource('multipleChromosomes.vcf'),
             resource('sample2.vcf')]
    for path in paths:
        ds = hl.import_vcf(path)
        self.assertEqual(
            hl.filter_alleles(ds, lambda a, i: False).count_rows(), 0)
        self.assertEqual(hl.filter_alleles(ds, lambda a, i: True).count_rows(), ds.count_rows())
def test_undeclared_info(self):
    mt = hl.import_vcf(resource('undeclaredinfo.vcf'))

    rows = mt.rows()
    self.assertTrue(rows.all(hl.is_defined(rows.info)))

    info_type = mt.row.dtype['info']
    self.assertTrue('InbreedingCoeff' in info_type)
    self.assertFalse('undeclared' in info_type)
    self.assertFalse('undeclaredFlag' in info_type)
def test_reference_genome_liftover(self):
    grch37 = hl.get_reference('GRCh37')
    grch38 = hl.get_reference('GRCh38')

    self.assertTrue(not grch37.has_liftover('GRCh38') and not grch38.has_liftover('GRCh37'))
    grch37.add_liftover(resource('grch37_to_grch38_chr20.over.chain.gz'), 'GRCh38')
    grch38.add_liftover(resource('grch38_to_grch37_chr20.over.chain.gz'), 'GRCh37')
    self.assertTrue(grch37.has_liftover('GRCh38') and grch38.has_liftover('GRCh37'))

    ds = hl.import_vcf(resource('sample.vcf'))
    t = ds.annotate_rows(liftover=hl.liftover(hl.liftover(ds.locus, 'GRCh38'), 'GRCh37')).rows()
    self.assertTrue(t.all(t.locus == t.liftover))

    null_locus = hl.null(hl.tlocus('GRCh38'))

    rows = [
        {'l37': hl.locus('20', 1, 'GRCh37'), 'l38': null_locus},
        {'l37': hl.locus('20', 60000, 'GRCh37'), 'l38': null_locus},
        {'l37': hl.locus('20', 60001, 'GRCh37'), 'l38': hl.locus('chr20', 79360, 'GRCh38')},
        {'l37': hl.locus('20', 278686, 'GRCh37'), 'l38': hl.locus('chr20', 298045, 'GRCh38')},
        {'l37': hl.locus('20', 278687, 'GRCh37'), 'l38': hl.locus('chr20', 298046, 'GRCh38')},
        {'l37': hl.locus('20', 278688, 'GRCh37'), 'l38': null_locus},
        {'l37': hl.locus('20', 278689, 'GRCh37'), 'l38': null_locus},
        {'l37': hl.locus('20', 278690, 'GRCh37'), 'l38': null_locus},
        {'l37': hl.locus('20', 278691, 'GRCh37'), 'l38': hl.locus('chr20', 298047, 'GRCh38')},
        {'l37': hl.locus('20', 37007586, 'GRCh37'), 'l38': hl.locus('chr12', 32563117, 'GRCh38')},
        {'l37': hl.locus('20', 62965520, 'GRCh37'), 'l38': hl.locus('chr20', 64334167, 'GRCh38')},
        {'l37': hl.locus('20', 62965521, 'GRCh37'), 'l38': null_locus}
    ]
    schema = hl.tstruct(l37=hl.tlocus(grch37), l38=hl.tlocus(grch38))
    t = hl.Table.parallelize(rows, schema)
    self.assertTrue(t.all(hl.cond(hl.is_defined(t.l38),
                                  hl.liftover(t.l37, 'GRCh38') == t.l38,
                                  hl.is_missing(hl.liftover(t.l37, 'GRCh38')))))

    t = t.filter(hl.is_defined(t.l38))
    self.assertTrue(t.count() == 6)
    t = t.key_by('l38')
    t.count()
    self.assertTrue(list(t.key) == ['l38'])

    null_locus_interval = hl.null(hl.tinterval(hl.tlocus('GRCh38')))
    rows = [
        {'i37': hl.locus_interval('20', 1, 60000, True, False, 'GRCh37'), 'i38': null_locus_interval},
        {'i37': hl.locus_interval('20', 60001, 82456, True, True, 'GRCh37'),
         'i38': hl.locus_interval('chr20', 79360, 101815, True, True, 'GRCh38')}
    ]
    schema = hl.tstruct(i37=hl.tinterval(hl.tlocus(grch37)), i38=hl.tinterval(hl.tlocus(grch38)))
    t = hl.Table.parallelize(rows, schema)
    self.assertTrue(t.all(hl.liftover(t.i37, 'GRCh38') == t.i38))

    grch37.remove_liftover("GRCh38")
    grch38.remove_liftover("GRCh37")
def test_hw_func_and_agg_agree(self):
    mt = hl.import_vcf(resource('sample.vcf'))
    mt = mt.annotate_rows(
        stats=hl.agg.call_stats(mt.GT, mt.alleles),
        hw=hl.agg.hardy_weinberg_test(mt.GT))
    mt = mt.annotate_rows(
        hw2=hl.hardy_weinberg_test(mt.stats.homozygote_count[0],
                                   mt.stats.AC[1] - 2 * mt.stats.homozygote_count[1],
                                   mt.stats.homozygote_count[1]))
    rt = mt.rows()
    self.assertTrue(rt.all(rt.hw == rt.hw2))
def test_import_vcf_missing_info_field_elements(self):
    mt = hl.import_vcf(resource('missingInfoArray.vcf'), reference_genome='GRCh37',
                       array_elements_required=False)
    mt = mt.select_rows(FOO=mt.info.FOO, BAR=mt.info.BAR)
    expected = hl.Table.parallelize([{'locus': hl.Locus('X', 16050036), 'alleles': ['A', 'C'],
                                      'FOO': [1, None], 'BAR': [2, None, None]},
                                     {'locus': hl.Locus('X', 16061250), 'alleles': ['T', 'A', 'C'],
                                      'FOO': [None, 2, None], 'BAR': [None, 1.0, None]}],
                                    hl.tstruct(locus=hl.tlocus('GRCh37'),
                                               alleles=hl.tarray(hl.tstr),
                                               FOO=hl.tarray(hl.tint),
                                               BAR=hl.tarray(hl.tfloat64)),
                                    key=['locus', 'alleles'])
    self.assertTrue(mt.rows()._same(expected))
def download_data():
    global _initialized, _data_dir, _mt
    _data_dir = os.environ.get('HAIL_BENCHMARK_DIR', '/tmp/hail_benchmark_data')
    print(f'using benchmark data directory {_data_dir}')
    os.makedirs(_data_dir, exist_ok=True)

    files = map(lambda f: os.path.join(_data_dir, f), ['profile.vcf.bgz', 'profile.mt'])
    if not all(os.path.exists(file) for file in files):
        vcf = os.path.join(_data_dir, 'profile.vcf.bgz')
        print('files not found - downloading...', end='', flush=True)
        # 'vcf' is already an absolute path; joining it with _data_dir again
        # was a no-op, so pass it directly
        urlretrieve('https://storage.googleapis.com/hail-common/benchmark/profile.vcf.bgz',
                    vcf)
        print('done', flush=True)
        print('importing...', end='', flush=True)
        hl.import_vcf(vcf).write(os.path.join(_data_dir, 'profile.mt'))
        print('done', flush=True)
    else:
        print('all files found.', flush=True)

    _initialized = True
    _mt = hl.read_matrix_table(resource('profile.mt'))
def test_trio_matrix_null_keys(self):
    ped = hl.Pedigree.read(resource('triomatrix.fam'))
    ht = hl.import_fam(resource('triomatrix.fam'))

    mt = hl.import_vcf(resource('triomatrix.vcf'))
    mt = mt.annotate_cols(fam=ht[mt.s].fam_id)

    # Make keys all null
    mt = mt.key_cols_by(s=hl.null(hl.tstr))

    tt = hl.trio_matrix(mt, ped, complete_trios=True)
    self.assertEqual(tt.count_cols(), 0)
def test_joins(self):
    kt = hl.utils.range_table(1).key_by().drop('idx')
    kt = kt.annotate(a='foo')

    kt1 = hl.utils.range_table(1).key_by().drop('idx')
    kt1 = kt1.annotate(a='foo', b='bar').key_by('a')

    kt2 = hl.utils.range_table(1).key_by().drop('idx')
    kt2 = kt2.annotate(b='bar', c='baz').key_by('b')

    kt3 = hl.utils.range_table(1).key_by().drop('idx')
    kt3 = kt3.annotate(c='baz', d='qux').key_by('c')

    kt4 = hl.utils.range_table(1).key_by().drop('idx')
    kt4 = kt4.annotate(d='qux', e='quam').key_by('d')

    ktr = kt.annotate(e=kt4[kt3[kt2[kt1[kt.a].b].c].d].e)
    self.assertTrue(ktr.aggregate(agg.collect(ktr.e)) == ['quam'])

    ktr = kt.select(e=kt4[kt3[kt2[kt1[kt.a].b].c].d].e)
    self.assertTrue(ktr.aggregate(agg.collect(ktr.e)) == ['quam'])

    self.assertEqual(kt.filter(kt4[kt3[kt2[kt1[kt.a].b].c].d].e == 'quam').count(), 1)

    m = hl.import_vcf(resource('sample.vcf'))
    vkt = m.rows()
    vkt = vkt.select(vkt.qual)
    vkt = vkt.annotate(qual2=m.index_rows(vkt.key).qual)
    self.assertTrue(vkt.filter(vkt.qual != vkt.qual2).count() == 0)

    m2 = m.annotate_rows(qual2=vkt.index(m.row_key).qual)
    self.assertTrue(m2.filter_rows(m2.qual != m2.qual2).count_rows() == 0)

    m3 = m.annotate_rows(qual2=m.index_rows(m.row_key).qual)
    self.assertTrue(m3.filter_rows(m3.qual != m3.qual2).count_rows() == 0)

    kt5 = hl.utils.range_table(1).annotate(key='C1589').key_by('key')
    m4 = m.annotate_cols(foo=m.s[:5])
    m4 = m4.annotate_cols(idx=kt5[m4.foo].idx)
    n_C1589 = m.filter_cols(m.s[:5] == 'C1589').count_cols()
    self.assertTrue(n_C1589 > 1)
    self.assertEqual(m4.filter_cols(hl.is_defined(m4.idx)).count_cols(), n_C1589)

    kt = hl.utils.range_table(1)
    kt = kt.annotate_globals(foo=5)
    self.assertEqual(hl.eval(kt.foo), 5)

    kt2 = hl.utils.range_table(1)
    kt2 = kt2.annotate_globals(kt_foo=kt.index_globals().foo)
    self.assertEqual(hl.eval(kt2.globals.kt_foo), 5)
def test_call_fields(self):
    expected = hl.Table.parallelize(
        [hl.struct(locus=hl.locus("X", 16050036), s="C1046::HG02024",
                   GT=hl.call(0, 0), GTA=hl.null(hl.tcall), GTZ=hl.call(0, 1)),
         hl.struct(locus=hl.locus("X", 16050036), s="C1046::HG02025",
                   GT=hl.call(1), GTA=hl.null(hl.tcall), GTZ=hl.call(0)),
         hl.struct(locus=hl.locus("X", 16061250), s="C1046::HG02024",
                   GT=hl.call(2, 2), GTA=hl.call(2, 1), GTZ=hl.call(1, 1)),
         hl.struct(locus=hl.locus("X", 16061250), s="C1046::HG02025",
                   GT=hl.call(2), GTA=hl.null(hl.tcall), GTZ=hl.call(1))],
        key=['locus', 's'])

    mt = hl.import_vcf(resource('generic.vcf'), call_fields=['GT', 'GTA', 'GTZ'])
    entries = mt.entries()
    entries = entries.key_by('locus', 's')
    entries = entries.select('GT', 'GTA', 'GTZ')
    self.assertTrue(entries._same(expected))
def test_import_plink_contig_recoding_w_reference(self):
    vcf = hl.split_multi_hts(
        hl.import_vcf(resource('sample2.vcf'),
                      reference_genome=hl.get_reference('GRCh38'),
                      contig_recoding={"22": "chr22"}))

    hl.export_plink(vcf, '/tmp/sample_plink')

    bfile = '/tmp/sample_plink'
    plink = hl.import_plink(
        bfile + '.bed', bfile + '.bim', bfile + '.fam',
        a2_reference=True,
        contig_recoding={'chr22': '22'},
        reference_genome='GRCh37').rows()

    self.assertTrue(plink.all(plink.locus.contig == "22"))
    self.assertEqual(vcf.count_rows(), plink.count())
    # assertTrue with two arguments always passes; the intended check is equality
    self.assertEqual(plink.locus.dtype, hl.tlocus('GRCh37'))
def test_import_vcf_missing_format_field_elements(self):
    mt = hl.import_vcf(resource('missingFormatArray.vcf'), reference_genome='GRCh37',
                       array_elements_required=False)
    mt = mt.select_rows().select_entries('AD', 'PL')

    expected = hl.Table.parallelize([{'locus': hl.Locus('X', 16050036), 'alleles': ['A', 'C'],
                                      's': 'C1046::HG02024', 'AD': [None, None], 'PL': [0, None, 180]},
                                     {'locus': hl.Locus('X', 16050036), 'alleles': ['A', 'C'],
                                      's': 'C1046::HG02025', 'AD': [None, 6], 'PL': [70, None]},
                                     {'locus': hl.Locus('X', 16061250), 'alleles': ['T', 'A', 'C'],
                                      's': 'C1046::HG02024', 'AD': [0, 0, None],
                                      'PL': [396, None, None, 33, None, 0]},
                                     {'locus': hl.Locus('X', 16061250), 'alleles': ['T', 'A', 'C'],
                                      's': 'C1046::HG02025', 'AD': [0, 0, 9], 'PL': [None, None, None]}],
                                    hl.tstruct(locus=hl.tlocus('GRCh37'),
                                               alleles=hl.tarray(hl.tstr),
                                               s=hl.tstr,
                                               AD=hl.tarray(hl.tint),
                                               PL=hl.tarray(hl.tint)),
                                    key=['locus', 'alleles', 's'])

    self.assertTrue(mt.entries()._same(expected))
def test_unions(self):
    dataset = hl.import_vcf(resource('sample2.vcf'))

    # test union_rows
    ds1 = dataset.filter_rows(dataset.locus.position % 2 == 1)
    ds2 = dataset.filter_rows(dataset.locus.position % 2 == 0)
    datasets = [ds1, ds2]
    r1 = ds1.union_rows(ds2)
    r2 = hl.MatrixTable.union_rows(*datasets)

    self.assertTrue(r1._same(r2))

    # test union_cols
    ds = dataset.union_cols(dataset).union_cols(dataset)
    for s, count in ds.aggregate_cols(agg.counter(ds.s)).items():
        self.assertEqual(count, 3)
def test_haploid(self):
    expected = hl.Table.parallelize(
        [hl.struct(locus=hl.locus("X", 16050036), s="C1046::HG02024",
                   GT=hl.call(0, 0), AD=[10, 0], GQ=44),
         hl.struct(locus=hl.locus("X", 16050036), s="C1046::HG02025",
                   GT=hl.call(1), AD=[0, 6], GQ=70),
         hl.struct(locus=hl.locus("X", 16061250), s="C1046::HG02024",
                   GT=hl.call(2, 2), AD=[0, 0, 11], GQ=33),
         hl.struct(locus=hl.locus("X", 16061250), s="C1046::HG02025",
                   GT=hl.call(2), AD=[0, 0, 9], GQ=24)],
        key=['locus', 's'])

    mt = hl.import_vcf(resource('haploid.vcf'))
    entries = mt.entries()
    entries = entries.key_by('locus', 's')
    entries = entries.select('GT', 'AD', 'GQ')
    self.assertTrue(entries._same(expected))
def test_export_plink(self):
    vcf_file = resource('sample.vcf')
    mt = hl.split_multi_hts(hl.import_vcf(vcf_file, min_partitions=10))

    # permute columns so not in alphabetical order!
    import random
    indices = list(range(mt.count_cols()))
    random.shuffle(indices)
    mt = mt.choose_cols(indices)

    split_vcf_file = uri_path(new_temp_file())
    hl_output = uri_path(new_temp_file())
    plink_output = uri_path(new_temp_file())
    merge_output = uri_path(new_temp_file())

    hl.export_vcf(mt, split_vcf_file)
    hl.export_plink(mt, hl_output)

    run_command(["plink", "--vcf", split_vcf_file,
                 "--make-bed", "--out", plink_output,
                 "--const-fid", "--keep-allele-order"])

    data = []
    with open(uri_path(plink_output + ".bim")) as file:
        for line in file:
            row = line.strip().split()
            row[1] = ":".join([row[0], row[3], row[5], row[4]])
            data.append("\t".join(row) + "\n")

    with open(plink_output + ".bim", 'w') as f:
        f.writelines(data)

    run_command(["plink", "--bfile", plink_output,
                 "--bmerge", hl_output, "--merge-mode", "6",
                 "--out", merge_output])

    same = True
    with open(merge_output + ".diff") as f:
        for line in f:
            row = line.strip().split()
            if row != ["SNP", "FID", "IID", "NEW", "OLD"]:
                same = False
                break

    self.assertTrue(same)
def test_matrix_filter_intervals(self):
    ds = hl.import_vcf(resource('sample.vcf'), min_partitions=20)

    self.assertEqual(
        hl.filter_intervals(ds, [hl.parse_locus_interval('20:10639222-10644705')]).count_rows(), 3)

    intervals = [hl.parse_locus_interval('20:10639222-10644700'),
                 hl.parse_locus_interval('20:10644700-10644705')]
    self.assertEqual(hl.filter_intervals(ds, intervals).count_rows(), 3)

    intervals = hl.array([hl.parse_locus_interval('20:10639222-10644700'),
                          hl.parse_locus_interval('20:10644700-10644705')])
    self.assertEqual(hl.filter_intervals(ds, intervals).count_rows(), 3)

    intervals = hl.array([hl.eval(hl.parse_locus_interval('20:10639222-10644700')),
                          hl.parse_locus_interval('20:10644700-10644705')])
    self.assertEqual(hl.filter_intervals(ds, intervals).count_rows(), 3)

    intervals = [hl.eval(hl.parse_locus_interval('[20:10019093-10026348]')),
                 hl.eval(hl.parse_locus_interval('[20:17705793-17716416]'))]
    self.assertEqual(hl.filter_intervals(ds, intervals).count_rows(), 4)
def test_hardy_weinberg_test(self):
    mt = hl.import_vcf(resource('HWE_test.vcf'))
    mt = mt.select_rows(**hl.agg.hardy_weinberg_test(mt.GT))
    rt = mt.rows()
    expected = hl.Table.parallelize([
        hl.struct(
            locus=hl.locus('20', pos),
            alleles=alleles,
            het_freq_hwe=r,
            p_value=p)
        for (pos, alleles, r, p) in [
            (1, ['A', 'G'], 0.0, 0.5),
            (2, ['A', 'G'], 0.25, 0.5),
            (3, ['T', 'C'], 0.5357142857142857, 0.21428571428571427),
            (4, ['T', 'A'], 0.5714285714285714, 0.6571428571428573),
            (5, ['G', 'A'], 0.3333333333333333, 0.5)]],
        key=['locus', 'alleles'])
    self.assertTrue(rt.filter(rt.locus.position != 6)._same(expected))

    rt6 = rt.filter(rt.locus.position == 6).collect()[0]
    self.assertEqual(rt6['p_value'], 0.5)
    self.assertTrue(math.isnan(rt6['het_freq_hwe']))
def test_trio_matrix_incomplete_trios(self):
    ped = hl.Pedigree.read(resource('triomatrix.fam'))
    mt = hl.import_vcf(resource('triomatrix.vcf'))
    hl.trio_matrix(mt, ped, complete_trios=False)
"--input-url", help="URL of ExAC sites VCF", default="gs://exac/170122_exacv1_bundle/ExAC.r1.sites.vep.vcf.gz") p.add_argument("--output-url", help="URL to write Hail table to", required=True) p.add_argument("--subset", help="Filter variants to this chrom:start-end range") args = p.parse_args() hl.init(log="/tmp/hail.log") print("\n=== Importing VCF ===") mt = hl.import_vcf(args.input_url, force_bgz=True, min_partitions=2000, skip_invalid_loci=True) # Drop entry values mt = mt.drop("AD", "DP", "GQ", "GT", "MIN_DP", "PL", "SB") if args.subset: print(f"\n=== Filtering to interval {args.subset} ===") subset_interval = hl.parse_locus_interval(args.subset) mt = mt.filter_rows(subset_interval.contains(mt.locus)) print("\n=== Splitting multiallelic variants ===") mt = hl.split_multi(mt) # For multiallelic variants, these fields contain a value for each alt allele
def get_1kg(output_dir, overwrite: bool = False):
    """Download subset of the `1000 Genomes <http://www.internationalgenome.org/>`__
    dataset and sample annotations.

    Notes
    -----
    The download is about 15M.

    Parameters
    ----------
    output_dir
        Directory in which to write data.
    overwrite
        If ``True``, overwrite any existing files/directories at `output_dir`.
    """
    fs = Env.fs()
    if not _dir_exists(fs, output_dir):
        fs.mkdir(output_dir)

    matrix_table_path = os.path.join(output_dir, '1kg.mt')
    vcf_path = os.path.join(output_dir, '1kg.vcf.bgz')
    sample_annotations_path = os.path.join(output_dir, '1kg_annotations.txt')
    gene_annotations_path = os.path.join(output_dir, 'ensembl_gene_annotations.txt')

    if (overwrite
            or not _dir_exists(fs, matrix_table_path)
            or not _file_exists(fs, sample_annotations_path)
            or not _file_exists(fs, vcf_path)
            or not _file_exists(fs, gene_annotations_path)):
        init_temp_dir()
        tmp_vcf = os.path.join(tmp_dir, '1kg.vcf.bgz')
        source = resources['1kg_matrix_table']
        info(f'downloading 1KG VCF ...\n'
             f'  Source: {source}')
        sync_retry_transient_errors(urlretrieve, resources['1kg_matrix_table'], tmp_vcf)
        cluster_readable_vcf = _copy_to_tmp(fs, local_path_uri(tmp_vcf), extension='vcf.bgz')
        info('importing VCF and writing to matrix table...')
        hl.import_vcf(cluster_readable_vcf, min_partitions=16).write(matrix_table_path, overwrite=True)

        tmp_sample_annot = os.path.join(tmp_dir, '1kg_annotations.txt')
        source = resources['1kg_annotations']
        info(f'downloading 1KG annotations ...\n'
             f'  Source: {source}')
        sync_retry_transient_errors(urlretrieve, source, tmp_sample_annot)

        tmp_gene_annot = os.path.join(tmp_dir, 'ensembl_gene_annotations.txt')
        source = resources['1kg_ensembl_gene_annotations']
        info(f'downloading Ensembl gene annotations ...\n'
             f'  Source: {source}')
        sync_retry_transient_errors(urlretrieve, source, tmp_gene_annot)

        hl.hadoop_copy(local_path_uri(tmp_sample_annot), sample_annotations_path)
        hl.hadoop_copy(local_path_uri(tmp_gene_annot), gene_annotations_path)
        hl.hadoop_copy(local_path_uri(tmp_vcf), vcf_path)
        info('Done!')
    else:
        info('1KG files found')
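# Hedged usage sketch (not part of the original module): once get_1kg has
# populated output_dir, the files it wrote can be loaded back. The 'Sample'
# key column is an assumption borrowed from the Hail GWAS tutorial; adjust
# to match the actual annotation file header.
import os

import hail as hl

output_dir = 'data'  # hypothetical destination directory
get_1kg(output_dir)
mt = hl.read_matrix_table(os.path.join(output_dir, '1kg.mt'))
annotations = hl.import_table(os.path.join(output_dir, '1kg_annotations.txt'),
                              impute=True, key='Sample')
mt = mt.annotate_cols(pheno=annotations[mt.s])  # join sample annotations onto columns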
def get_dataset():
    if Tests._dataset is None:
        Tests._dataset = hl.split_multi_hts(
            hl.import_vcf(resource('sample.vcf')))
    return Tests._dataset
def test_mendel_errors(self):
    mt = hl.import_vcf(resource('mendel.vcf'))
    ped = hl.Pedigree.read(resource('mendel.fam'))

    men, fam, ind, var = hl.mendel_errors(mt['GT'], ped)

    self.assertEqual(men.key.dtype, hl.tstruct(locus=mt.locus.dtype,
                                               alleles=hl.tarray(hl.tstr),
                                               s=hl.tstr))
    self.assertEqual(men.row.dtype, hl.tstruct(locus=mt.locus.dtype,
                                               alleles=hl.tarray(hl.tstr),
                                               s=hl.tstr,
                                               fam_id=hl.tstr,
                                               mendel_code=hl.tint))
    self.assertEqual(fam.key.dtype, hl.tstruct(pat_id=hl.tstr,
                                               mat_id=hl.tstr))
    self.assertEqual(fam.row.dtype, hl.tstruct(pat_id=hl.tstr,
                                               mat_id=hl.tstr,
                                               fam_id=hl.tstr,
                                               children=hl.tint,
                                               errors=hl.tint64,
                                               snp_errors=hl.tint64))
    self.assertEqual(ind.key.dtype, hl.tstruct(s=hl.tstr))
    self.assertEqual(ind.row.dtype, hl.tstruct(s=hl.tstr,
                                               fam_id=hl.tstr,
                                               errors=hl.tint64,
                                               snp_errors=hl.tint64))
    self.assertEqual(var.key.dtype, hl.tstruct(locus=mt.locus.dtype,
                                               alleles=hl.tarray(hl.tstr)))
    self.assertEqual(var.row.dtype, hl.tstruct(locus=mt.locus.dtype,
                                               alleles=hl.tarray(hl.tstr),
                                               errors=hl.tint64))

    self.assertEqual(men.count(), 41)
    self.assertEqual(fam.count(), 2)
    self.assertEqual(ind.count(), 7)
    self.assertEqual(var.count(), mt.count_rows())

    self.assertEqual(set(fam.select('children', 'errors', 'snp_errors').collect()),
                     {
                         hl.utils.Struct(pat_id='Dad1', mat_id='Mom1', children=2,
                                         errors=41, snp_errors=39),
                         hl.utils.Struct(pat_id='Dad2', mat_id='Mom2', children=1,
                                         errors=0, snp_errors=0)
                     })

    self.assertEqual(set(ind.select('errors', 'snp_errors').collect()),
                     {
                         hl.utils.Struct(s='Son1', errors=23, snp_errors=22),
                         hl.utils.Struct(s='Dtr1', errors=18, snp_errors=17),
                         hl.utils.Struct(s='Dad1', errors=19, snp_errors=18),
                         hl.utils.Struct(s='Mom1', errors=22, snp_errors=21),
                         hl.utils.Struct(s='Dad2', errors=0, snp_errors=0),
                         hl.utils.Struct(s='Mom2', errors=0, snp_errors=0),
                         hl.utils.Struct(s='Son2', errors=0, snp_errors=0)
                     })

    to_keep = hl.set([
        (hl.Locus("1", 1), ['C', 'CT']),
        (hl.Locus("1", 2), ['C', 'T']),
        (hl.Locus("X", 1), ['C', 'T']),
        (hl.Locus("X", 3), ['C', 'T']),
        (hl.Locus("Y", 1), ['C', 'T']),
        (hl.Locus("Y", 3), ['C', 'T'])
    ])
    self.assertEqual(var.filter(to_keep.contains((var.locus, var.alleles)))
                     .order_by('locus')
                     .select('locus', 'alleles', 'errors').collect(),
                     [
                         hl.utils.Struct(locus=hl.Locus("1", 1), alleles=['C', 'CT'], errors=2),
                         hl.utils.Struct(locus=hl.Locus("1", 2), alleles=['C', 'T'], errors=1),
                         hl.utils.Struct(locus=hl.Locus("X", 1), alleles=['C', 'T'], errors=2),
                         hl.utils.Struct(locus=hl.Locus("X", 3), alleles=['C', 'T'], errors=1),
                         hl.utils.Struct(locus=hl.Locus("Y", 1), alleles=['C', 'T'], errors=1),
                         hl.utils.Struct(locus=hl.Locus("Y", 3), alleles=['C', 'T'], errors=1),
                     ])

    ped2 = hl.Pedigree.read(resource('mendelWithMissingSex.fam'))
    men2, _, _, _ = hl.mendel_errors(mt['GT'], ped2)

    self.assertTrue(men2.filter(men2.s == 'Dtr1')._same(men.filter(men.s == 'Dtr1')))
def get_dataset():
    global _dataset
    if _dataset is None:
        _dataset = hl.split_multi_hts(hl.import_vcf(resource('sample.vcf'))).cache()
    return _dataset
def test_info_char(self):
    self.assertEqual(
        hl.import_vcf(resource('infochar.vcf')).count_rows(), 1)
def test_import_vcf_no_reference_specified(self):
    vcf = hl.import_vcf(resource('sample2.vcf'),
                        reference_genome=None)
    self.assertTrue(
        vcf.locus.dtype == hl.tstruct(contig=hl.tstr, position=hl.tint32))
    self.assertEqual(vcf.count_rows(), 735)
def test_import_vcf_bad_reference_allele(self):
    vcf = hl.import_vcf(resource('invalid_base.vcf'))
    self.assertEqual(vcf.count_rows(), 1)
def test_import_vcf_can_import_float_array_format(self):
    mt = hl.import_vcf(resource('floating_point_array.vcf'))
    self.assertTrue(
        mt.aggregate_entries(hl.agg.all(mt.numeric_array == [1.5, 2.5])))
def test_glob(self):
    full = hl.import_vcf(resource('sample.vcf'))
    parts = hl.import_vcf(resource('samplepart*.vcf'))
    self.assertTrue(parts._same(full))
# chr21 0 ~ 1714
# chr22 0 ~ 1669
filelist = [
    'gs://rcstorage/genotype/gnarly_chr22.' + str(i) + '.variant_filtered.vcf.gz'
    for i in range(7700)
]

# define output files
vds_splitmulti_file = 'gs://rcstorage/matrixtable/' + chrom + '/splitmulti.vds'

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# II. Import VCF
#     Combine all VCF chunks for one chromosome and import as vds
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
print("importing vcf...")
vds = hl.import_vcf(filelist, force_bgz=True, reference_genome='GRCh38')

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# III. Remove variants without PASS in FILTER column
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
print("filtering variants without pass...")
vds = vds.filter_rows(hl.len(vds.filters) == 0)

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# IV. Split multi
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
print("splitting multi...")
vds = hl.split_multi_hts(
    vds.select_entries(vds.GT, vds.AD, vds.DP, vds.GQ, vds.PL))
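#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Hedged sketch of a presumable next step (not in the excerpt): persist the
# split dataset to the output path defined above.
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
print("writing split dataset...")
vds.write(vds_splitmulti_file, overwrite=True)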
def process_gnomad_data(datapath, chromosome, transcript_list, exomes=True, synonymous=True):
    """Uses hail to process the gnomAD dataset"""
    basedir = dirname(__file__)
    logdir = path_join(basedir, 'hail.log')
    hl_init(log=logdir, append=True, default_reference='GRCh38')

    # this try-except block makes sure the program won't spend time
    # writing the table to disk if it already exists from a previous loop
    # try:
    #     mt = hl.import_vcf(datapath)
    # except:
    #     # it already exists, so just read it.
    #     pass
    mt = import_vcf(datapath)

    # first filter down to the right set of transcripts
    transcripts, intervals = zip(*transcript_list)
    transcripts = hl_literal(list(transcripts))
    mt = filter_intervals(mt, [
        parse_locus_interval(x, reference_genome='GRCh38') for x in intervals
    ])
    mt = mt.filter_rows(mt.filters == hl_empty_set('str'))
    mt = mt.explode_rows(mt.info.vep)

    # get the right transcript; use a raw string so the regex splits on a literal pipe
    mt = mt.annotate_rows(vep=mt.info.vep.split(r'\|'))
    mt = mt.annotate_rows(gene=mt.vep[3])
    mt = mt.annotate_rows(enst=mt.vep[6])
    mt = mt.filter_rows(transcripts.contains(mt.enst))

    mt = mt.annotate_rows(vartype=mt.vep[1].split('&'))
    mt = mt.explode_rows(mt.vartype)
    vartype_list = hl_literal([
        'frameshift_variant', 'inframe_deletion', 'inframe_insertion',
        'missense_variant', 'start_lost', 'stop_gained'
    ])
    if synonymous:
        vartype_list = vartype_list.extend(['synonymous_variant'])
    mt = mt.filter_rows(vartype_list.contains(mt.vartype))

    mt = mt.annotate_rows(codon_num=mt.vep[14])
    mt = mt.annotate_rows(aa_change=mt.vep[15])
    # mt = mt.annotate_rows(orig_aa=mt.vep[15].split('/')[0])
    # mt = mt.annotate_rows(var_aa=mt.vep[15].split('/')[1])
    # mt.filter_rows(mt.vartype == "synonymous_variant").var_aa = None
    mt = mt.annotate_rows(transcript_consequence=mt.vep[10])
    mt = mt.annotate_rows(protein_consequence=mt.vep[11])
    mt = mt.annotate_rows(AC=mt.info.AC[0])

    # the following INFO fields are not present in every gnomAD release, so
    # fall back to missing values when a field is absent (accessing a missing
    # field raises when the expression is built, which the except catches)
    try:
        mt = mt.annotate_rows(non_neuro_AC=mt.info.non_neuro_AC[0])
        mt = mt.annotate_rows(non_neuro_AN=mt.info.non_neuro_AN[0])
    except Exception:
        mt = mt.annotate_rows(non_neuro_AC=hl_null('int'))
        mt = mt.annotate_rows(non_neuro_AN=hl_null('int'))
    try:
        mt = mt.annotate_rows(non_topmed_AC=mt.info.non_topmed_AC[0])
        mt = mt.annotate_rows(non_topmed_AN=mt.info.non_topmed_AN[0])
    except Exception:
        mt = mt.annotate_rows(non_topmed_AC=hl_null('int'))
        mt = mt.annotate_rows(non_topmed_AN=hl_null('int'))
    try:
        mt = mt.annotate_rows(non_cancer_AC=mt.info.non_cancer_AC[0])
        mt = mt.annotate_rows(non_cancer_AN=mt.info.non_cancer_AN[0])
    except Exception:
        mt = mt.annotate_rows(non_cancer_AC=hl_null('int'))
        mt = mt.annotate_rows(non_cancer_AN=hl_null('int'))
    try:
        mt = mt.annotate_rows(controls_AC=mt.info.controls_AC[0])
        mt = mt.annotate_rows(controls_AN=mt.info.controls_AN[0])
    except Exception:
        mt = mt.annotate_rows(controls_AC=hl_null('int'))
        mt = mt.annotate_rows(controls_AN=hl_null('int'))
    try:
        mt = mt.annotate_rows(pab_max=mt.info.pab_max[0])
    except Exception:
        mt = mt.annotate_rows(pab_max=hl_null('int'))
    try:
        mt = mt.annotate_rows(VQSLOD=mt.info.VQSLOD)
    except Exception:
        mt = mt.annotate_rows(VQSLOD=hl_null('int'))
    try:
        mt = mt.annotate_rows(DP=mt.info.DP)
    except Exception:
        mt = mt.annotate_rows(DP=hl_null('int'))
    try:
        mt = mt.annotate_rows(BaseQRankSum=mt.info.BaseQRankSum)
    except Exception:
        mt = mt.annotate_rows(BaseQRankSum=hl_null('int'))
    try:
        mt = mt.annotate_rows(ClippingRankSum=mt.info.ClippingRankSum)
    except Exception:
        mt = mt.annotate_rows(ClippingRankSum=hl_null('int'))
    try:
        mt = mt.annotate_rows(rf_tp_probability=mt.info.rf_tp_probability)
    except Exception:
        mt = mt.annotate_rows(rf_tp_probability=hl_null('int'))

    ht = mt.select_rows(mt.qual, mt.filters, mt.vartype,
                        mt.gene, mt.transcript_consequence, mt.protein_consequence,
                        mt.codon_num, mt.aa_change, mt.info.FS, mt.info.MQRankSum,
                        mt.info.InbreedingCoeff, mt.info.ReadPosRankSum, mt.VQSLOD,
                        mt.info.QD, mt.DP, mt.BaseQRankSum, mt.info.MQ,
                        mt.ClippingRankSum, mt.rf_tp_probability, mt.pab_max,
                        mt.AC, mt.info.AN, mt.non_neuro_AC, mt.non_neuro_AN,
                        mt.non_cancer_AC, mt.non_cancer_AN, mt.non_topmed_AC,
                        mt.non_topmed_AN, mt.controls_AC, mt.controls_AN).make_table()

    ht = ht.annotate(chromosome=ht.locus.contig, position=ht.locus.position)
    ht = ht.annotate(allele_ref=ht.alleles[0], allele_alt=ht.alleles[1])
    ht = ht.key_by(ht.chromosome, ht.position, ht.allele_ref, ht.allele_alt)
    ht = ht.drop(ht.alleles, ht.locus)

    df = ht.to_pandas()
    hl_stop()

    # move the chromosome/position/allele columns to the front
    cols = df.columns.tolist()
    cols = cols[-4:] + cols[:-4]
    df = df[cols]
    df['filters'] = 'PASS'
    df['ref_aa'], df['alt_aa'] = df['aa_change'].str.split('/', 1).str
    df.loc[df.vartype == 'synonymous_variant', 'protein_consequence'] = None
    df['Variant'] = df.apply(lambda row: Variant_name(row), axis=1)
    df = df.drop(['aa_change'], axis=1)
    cols = df.columns.tolist()
    cols = cols[:11] + cols[-2:] + cols[11:-2]
    df = df[cols]
    if exomes:
        ome = 'exomes'
    else:
        ome = 'genomes'
    df['source'] = ome
    # filename = 'gnomad_' + ome + '_chr' + chromosome + '_processed.tsv'
    # df.to_csv(filename, sep='\t', encoding='utf-8', index=False)
    # os.remove('temp_matrix_table_' + chromosome + '.mt')
    return df
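# Hedged usage sketch (path, transcript ID, and interval are illustrative, not
# taken from the original): process one gnomAD exomes chromosome for a single
# transcript. transcript_list pairs each Ensembl transcript ID with the locus
# interval to extract.
transcript_list = [('ENST00000357654', 'chr17:43044295-43125364')]  # hypothetical BRCA1 entry
df = process_gnomad_data('gnomad.exomes.sites.chr17.vcf.bgz', '17', transcript_list,
                         exomes=True, synonymous=True)
print(df.head())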
if (args.chr_prefix is True) and (args.reference_genome == "GRCh37"):
    recode = {f"chr{i}": f"{i}" for i in (list(range(1, 23)) + ['X', 'Y'])}
elif (args.chr_prefix is False) and (args.reference_genome == "GRCh38"):
    recode = {f"{i}": f"chr{i}" for i in (list(range(1, 23)) + ['X', 'Y'])}
else:
    recode = None

# If MT does not already exist, load in VCF and then write it to disk
stat_cmd = ['gsutil', '-q', 'stat', mt_name + "/metadata.json.gz"]
exists = subprocess.call(stat_cmd)

if exists == 1:  # stat returns 1 if file/folder does not exist, 0 if it exists
    logging.info(f'Detected mt of input vcf {vcf} does not exist, importing vcf.')
    if recode is None:
        hl.import_vcf(vcf_name, force_bgz=args.force_bgz, call_fields=args.call_fields,
                      reference_genome=args.reference_genome).write(mt_name, overwrite=True)
    else:
        hl.import_vcf(vcf_name, force_bgz=args.force_bgz, call_fields=args.call_fields,
                      reference_genome=args.reference_genome, contig_recoding=recode
                      ).write(mt_name, overwrite=True)
else:
    logging.info(f"Detected mt of input vcf {vcf} already exists, reading mt directly.")

# read the matrix table back in either case
mt = hl.read_matrix_table(mt_name)

if args.test:
    logging.info('Test flag given, filtering to chrom 22.')
    if args.reference_genome == "GRCh38":
        chrom_code = "chr22"
    else:
        chrom_code = "22"
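    # Hedged continuation sketch (the excerpt ends here): the test branch
    # presumably filters the matrix table to the chromosome code chosen above.
    mt = mt.filter_rows(mt.locus.contig == chrom_code)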