def test_import_bgen_variant_filtering(self):
    # Importing a hand-picked set of file row indexes must give the same
    # matrix table as importing everything and filtering on file_row_idx.
    bgen_path = resource('example.8bits.bgen')
    wanted_indexes = [1, 2, 3, 5, 7, 9, 11, 13, 17, 198]

    filtered = hl.import_bgen(
        bgen_path,
        ['GT'],
        contig_recoding={'01': '1'},
        reference_genome=None,
        n_partitions=10,
        _row_fields=['file_row_idx'],
        _variants_per_file={bgen_path: wanted_indexes})

    # The unfiltered import is deliberately done second: this catches the
    # case where the hadoop configuration is polluted with old data left
    # over from the _variants_per_file import above.
    full = hl.import_bgen(
        bgen_path,
        ['GT'],
        contig_recoding={'01': '1'},
        reference_genome=None,
        _row_fields=['file_row_idx'])
    self.assertEqual(full.count(), (199, 500))

    keep_row = hl.set(wanted_indexes).contains(hl.int32(full.file_row_idx))
    reference = full.filter_rows(keep_row)
    self.assertTrue(reference._same(filtered))

    # Rendered as "contig:position" for each surviving row.
    rendered_loci = (hl.str(filtered.locus.contig) + ":" + hl.str(filtered.locus.position))
    self.assertEqual(
        rendered_loci.collect(),
        [
            '1:3000',
            '1:4000',
            '1:5000',
            '1:7000',
            '1:9000',
            '1:11000',
            '1:13000',
            '1:15000',
            '1:19000',
            '1:100001',
        ])
def test_import_bgen_locus_filtering_from_literals(self):
    # Filtering by locus only (no alleles) must work both when the locus is
    # wrapped in a Struct and when a bare Locus object is supplied.
    bgen_file = resource('example.8bits.bgen')
    hl.index_bgen(bgen_file, contig_recoding={'01': '1'})

    # Test with Struct(Locus)
    desired_loci = [hl.Struct(locus=hl.Locus('1', 10000))]

    expected_result = [
        hl.Struct(locus=hl.Locus('1', 10000), alleles=['A', 'G']),
        hl.Struct(locus=hl.Locus('1', 10000), alleles=['A', 'G'])  # Duplicated variant
    ]

    locus_struct = hl.import_bgen(bgen_file,
                                  ['GT'],
                                  variants=desired_loci)
    # assertEqual (rather than assertTrue(a == b)) reports both lists when
    # they differ, which makes a failure diagnosable.
    self.assertEqual(
        locus_struct.rows().key_by('locus', 'alleles').select().collect(),
        expected_result)

    # Test with Locus object
    desired_loci = [hl.Locus('1', 10000)]

    locus_object = hl.import_bgen(bgen_file,
                                  ['GT'],
                                  variants=desired_loci)
    self.assertEqual(
        locus_object.rows().key_by('locus', 'alleles').select().collect(),
        expected_result)
def test_multiple_files_variant_filtering(self):
    # Variant filtering across several BGEN files must agree with filtering
    # an unrestricted import of the same files.
    bgen_paths = [
        resource(name)
        for name in ('random-b.bgen', 'random-c.bgen', 'random-a.bgen')
    ]
    hl.index_bgen(bgen_paths)

    ref_alt = ['A', 'G']
    wanted = [
        hl.Struct(locus=hl.Locus('20', position), alleles=ref_alt)
        for position in (11, 13, 29, 28, 1, 12)
    ]

    filtered = hl.import_bgen(bgen_paths,
                              ['GT'],
                              n_partitions=10,
                              variants=wanted)
    self.assertEqual(filtered.count_rows(), 6)

    full = hl.import_bgen(bgen_paths, ['GT'])
    self.assertEqual(full.count(), (30, 10))

    keep_row = hl.set(wanted).contains(full.row_key)
    reference = full.filter_rows(keep_row)
    self.assertTrue(reference._same(filtered))
def test_multiple_files_variant_filtering(self):
    # Import six chosen variants from three BGEN files at once and compare
    # against the full import filtered down to the same row keys.
    file_names = ('random-b.bgen', 'random-c.bgen', 'random-a.bgen')
    paths = [resource(file_name) for file_name in file_names]
    hl.index_bgen(paths)

    allele_pair = ['A', 'G']
    positions = (11, 13, 29, 28, 1, 12)
    requested = [
        hl.Struct(locus=hl.Locus('20', pos), alleles=allele_pair)
        for pos in positions
    ]

    subset = hl.import_bgen(paths, ['GT'], n_partitions=10, variants=requested)
    self.assertEqual(subset.count_rows(), 6)

    whole = hl.import_bgen(paths, ['GT'])
    self.assertEqual(whole.count(), (30, 10))

    by_filter = whole.filter_rows(hl.set(requested).contains(whole.row_key))
    self.assertTrue(by_filter._same(subset))
def test_import_bgen(self):
    # Exercise import_bgen on the v11, 8-bit and 10-bit example files and
    # check row counts and the entry/locus types that come back.
    def load(path, fields, **extra):
        # Every import in this test shares the same recoding and reference.
        return hl.import_bgen(path,
                              entry_fields=fields,
                              contig_recoding={'01': '1'},
                              reference_genome='GRCh37',
                              **extra)

    # v11 file: all rows land on contig '1' after recoding.
    hl.index_bgen(resource('example.v11.bgen'))
    v11_rows = load(resource('example.v11.bgen'),
                    ['GT', 'GP'],
                    sample_file=resource('example.sample')).rows()
    self.assertTrue(v11_rows.all(v11_rows.locus.contig == '1'))
    self.assertEqual(v11_rows.count(), 199)

    # 8-bit file: dosage only, then GT + GP with a sample file.
    hl.index_bgen(resource('example.8bits.bgen'))
    dosage_only = load(resource('example.8bits.bgen'), ['dosage'])
    self.assertEqual(dosage_only.entry.dtype, hl.tstruct(dosage=hl.tfloat64))

    gt_gp = load(resource('example.8bits.bgen'),
                 ['GT', 'GP'],
                 sample_file=resource('example.sample'))
    self.assertEqual(gt_gp.entry.dtype,
                     hl.tstruct(GT=hl.tcall, GP=hl.tarray(hl.tfloat64)))
    self.assertEqual(gt_gp.count_rows(), 199)

    # 10-bit file: all three entry fields, locus typed against GRCh37.
    hl.index_bgen(resource('example.10bits.bgen'))
    all_fields = load(resource('example.10bits.bgen'), ['GT', 'GP', 'dosage'])
    self.assertEqual(
        all_fields.entry.dtype,
        hl.tstruct(GT=hl.tcall, GP=hl.tarray(hl.tfloat64), dosage=hl.tfloat64))
    self.assertEqual(all_fields.locus.dtype, hl.tlocus('GRCh37'))
def test_multiple_references_throws_error(self):
    # Two files indexed with different reference genomes cannot be imported
    # together.
    samples = resource('random.sample')
    unreferenced_bgen = resource('random-b.bgen')
    grch37_bgen = resource('random-c.bgen')

    hl.index_bgen(unreferenced_bgen, reference_genome=None)
    hl.index_bgen(grch37_bgen, reference_genome='GRCh37')

    expected_message = 'Found multiple reference genomes were specified in the BGEN index files'
    with self.assertRaisesRegex(FatalError, expected_message):
        hl.import_bgen([unreferenced_bgen, grch37_bgen],
                       ['GT'],
                       sample_file=samples)
def test_multiple_references_throws_error(self):
    # Index one file without a reference genome and one with GRCh37, then
    # expect the combined import to be rejected.
    sample_path = resource('random.sample')
    first_bgen = resource('random-b.bgen')
    second_bgen = resource('random-c.bgen')

    hl.index_bgen(first_bgen, reference_genome=None)
    hl.index_bgen(second_bgen, reference_genome='GRCh37')

    mixed_inputs = [first_bgen, second_bgen]
    error_pattern = 'Found multiple reference genomes were specified in the BGEN index files'
    with self.assertRaisesRegex(FatalError, error_pattern):
        hl.import_bgen(mixed_inputs, ['GT'], sample_file=sample_path)
def test_import_bgen_variant_filtering_from_literals(self):
    # Filter by a literal list of (locus, alleles) structs, once with a
    # single partition and once with 199 partitions, and compare against
    # filtering a full import.
    bgen_file = resource('example.8bits.bgen')
    hl.index_bgen(bgen_file, contig_recoding={'01': '1'})

    alleles = ['A', 'G']

    desired_variants = [
        hl.Struct(locus=hl.Locus('1', 2000), alleles=alleles),
        hl.Struct(locus=hl.Locus('1', 2001), alleles=alleles),
        hl.Struct(locus=hl.Locus('1', 4000), alleles=alleles),
        hl.Struct(locus=hl.Locus('1', 10000), alleles=alleles),
        hl.Struct(locus=hl.Locus('1', 100001), alleles=alleles),
    ]

    expected_result = [
        hl.Struct(locus=hl.Locus('1', 2000), alleles=alleles),
        hl.Struct(locus=hl.Locus('1', 2001), alleles=alleles),
        hl.Struct(locus=hl.Locus('1', 4000), alleles=alleles),
        hl.Struct(locus=hl.Locus('1', 10000), alleles=alleles),
        hl.Struct(locus=hl.Locus('1', 10000), alleles=alleles),  # Duplicated variant
        hl.Struct(locus=hl.Locus('1', 100001), alleles=alleles),
    ]

    part_1 = hl.import_bgen(
        bgen_file,
        ['GT'],
        n_partitions=1,  # forcing seek to be called
        variants=desired_variants)
    # assertEqual (rather than assertTrue(a == b)) prints both lists on
    # failure instead of a bare "False is not true".
    self.assertEqual(
        part_1.rows().key_by('locus', 'alleles').select().collect(),
        expected_result)

    part_199 = hl.import_bgen(
        bgen_file,
        ['GT'],
        # forcing each variant to be its own partition for testing
        # duplicates work properly
        n_partitions=199,
        variants=desired_variants)
    self.assertEqual(
        part_199.rows().key_by('locus', 'alleles').select().collect(),
        expected_result)

    everything = hl.import_bgen(bgen_file, ['GT'])
    self.assertEqual(everything.count(), (199, 500))

    expected = everything.filter_rows(
        hl.set(desired_variants).contains(everything.row_key))

    self.assertTrue(expected._same(part_1))
def test_import_bgen_variant_filtering_from_table(self):
    # Using the rows table of the full import as the variant filter must
    # reproduce the full import.
    bgen_path = resource('example.8bits.bgen')
    hl.index_bgen(bgen_path, contig_recoding={'01': '1'})

    full = hl.import_bgen(bgen_path, ['GT'])
    self.assertEqual(full.count(), (199, 500))

    # filtering with everything
    all_variants = full.rows()
    refiltered = hl.import_bgen(bgen_path,
                                ['GT'],
                                n_partitions=10,
                                variants=all_variants)

    self.assertTrue(full._same(refiltered))
def test_import_bgen_variant_filtering_from_table(self):
    # A variants= table containing every row should act as a no-op filter.
    path = resource('example.8bits.bgen')
    hl.index_bgen(path, contig_recoding={'01': '1'})

    unfiltered = hl.import_bgen(path, ['GT'])
    self.assertEqual(unfiltered.count(), (199, 500))

    variant_table = unfiltered.rows()

    # filtering with everything
    round_trip = hl.import_bgen(path, ['GT'], n_partitions=10, variants=variant_table)

    self.assertTrue(unfiltered._same(round_trip))
def test_multiple_files_disjoint(self):
    # Importing the three "*-disjoint" files together is expected to fail
    # with the disjoint-region error.
    samples = resource('random.sample')
    bgen_paths = [
        resource(name)
        for name in ('random-b-disjoint.bgen',
                     'random-c-disjoint.bgen',
                     'random-a-disjoint.bgen')
    ]
    hl.index_bgen(bgen_paths)

    expected_message = 'Each BGEN file must contain a region of the genome disjoint from other files'
    with self.assertRaisesRegex(FatalError, expected_message):
        hl.import_bgen(bgen_paths, ['GT', 'GP'], samples, n_partitions=3)
def test_old_index_file_throws_error(self):
    # import_bgen must refuse to run without a .idx2 index, both when no
    # index exists at all and when only an old-style .idx file is present.
    sample_file = resource('random.sample')
    bgen_file = resource('random.bgen')

    # missing file
    if os.path.exists(bgen_file + '.idx2'):
        run_command(['rm', '-r', bgen_file + '.idx2'])
    with self.assertRaisesRegex(FatalError, 'have no .idx2 index file'):
        hl.import_bgen(bgen_file, ['GT', 'GP'], sample_file, n_partitions=3)

    # old index file
    run_command(['touch', bgen_file + '.idx'])
    try:
        with self.assertRaisesRegex(FatalError, 'have no .idx2 index file'):
            hl.import_bgen(bgen_file, ['GT', 'GP'], sample_file)
    finally:
        # Always remove the stub .idx file, even if the assertion above
        # fails, so it cannot leak into other tests using this resource.
        run_command(['rm', bgen_file + '.idx'])
def test_parallel_import(self):
    # Index and import parallelBgenExport.bgen and check its dimensions.
    bgen_path = resource('parallelBgenExport.bgen')
    hl.index_bgen(bgen_path)

    sample_path = resource('parallelBgenExport.sample')
    imported = hl.import_bgen(bgen_path, ['GT', 'GP'], sample_path)

    self.assertEqual(imported.count(), (16, 10))
def import_bgen_filter_count(bgen_path, sample_path):
    """Import a BGEN file, keep rows whose alleles are ['A', 'T'], and force a row count.

    The import reads the 'GT' and 'GP' entry fields using 8 partitions.
    Nothing is returned; the function exists for the work it triggers.
    """
    dataset = hl.import_bgen(bgen_path,
                             sample_file=sample_path,
                             entry_fields=['GT', 'GP'],
                             n_partitions=8)
    a_t_rows = dataset.filter_rows(dataset.alleles == ['A', 'T'])
    a_t_rows._force_count_rows()
def import_bgen_info_score(bgen_path, sample_path):
    """Import a BGEN file, annotate each row with an info score, and force a count.

    Only the 'GP' entry field is read, using 8 partitions. The info score is
    aggregated from GP per row, and the resulting rows table (reduced to the
    'info_score' field) is force-counted. Nothing is returned.
    """
    dataset = hl.import_bgen(bgen_path,
                             sample_file=sample_path,
                             entry_fields=['GP'],
                             n_partitions=8)
    scored = dataset.annotate_rows(info_score=hl.agg.info_score(dataset.GP))
    score_table = scored.rows().select('info_score')
    score_table._force_count()
def import_bgen_info_score():
    """Import sim_ukb.bgen, annotate rows with an info score, and force a count.

    Only the 'GP' entry field is read, using 8 partitions. Nothing is
    returned; the function exists for the work it triggers.
    """
    dataset = hl.import_bgen(resource('sim_ukb.bgen'),
                             sample_file=resource('sim_ukb.sample'),
                             entry_fields=['GP'],
                             n_partitions=8)
    scored = dataset.annotate_rows(info_score=hl.agg.info_score(dataset.GP))
    score_table = scored.rows().select('info_score')
    score_table._force_count()
def test_import_bgen_skip_invalid_loci(self):
    # With skip_invalid_loci=True the import succeeds and yields 3 rows;
    # without it, forcing the rows must raise an 'Invalid locus' error.
    hl.index_bgen(resource('skip_invalid_loci.bgen'))

    mt = hl.import_bgen(resource('skip_invalid_loci.bgen'),
                        entry_fields=[],
                        sample_file=resource('skip_invalid_loci.sample'),
                        reference_genome='GRCh37',
                        skip_invalid_loci=True)
    # assertEqual reports the actual count on failure, unlike
    # assertTrue(count == 3).
    self.assertEqual(mt._force_count_rows(), 3)

    with self.assertRaisesRegex(FatalError, 'Invalid locus'):
        mt = hl.import_bgen(resource('skip_invalid_loci.bgen'),
                            entry_fields=[],
                            sample_file=resource('skip_invalid_loci.sample'))
        mt._force_count_rows()
def test_import_bgen_no_reference_specified(self):
    # With reference_genome=None the locus is a plain
    # struct(contig: str, position: int32) rather than a locus type.
    bgen = hl.import_bgen(resource('example.10bits.bgen'),
                          entry_fields=['GT', 'GP', 'dosage'],
                          contig_recoding={'01': '1'},
                          reference_genome=None)
    # assertEqual shows both types when they differ, unlike
    # assertTrue(a == b).
    self.assertEqual(bgen.locus.dtype,
                     hl.tstruct(contig=hl.tstr, position=hl.tint32))
    self.assertEqual(bgen.count_rows(), 199)
def import_bgen_filter_count():
    """Import sim_ukb.bgen, keep rows whose alleles are ['A', 'T'], and force a row count.

    The import reads the 'GT' and 'GP' entry fields using 8 partitions.
    Nothing is returned; the function exists for the work it triggers.
    """
    dataset = hl.import_bgen(resource('sim_ukb.bgen'),
                             sample_file=resource('sim_ukb.sample'),
                             entry_fields=['GT', 'GP'],
                             n_partitions=8)
    a_t_rows = dataset.filter_rows(dataset.alleles == ['A', 'T'])
    a_t_rows._force_count_rows()
def test_import_bgen_skip_invalid_loci(self):
    # Indexing with skip_invalid_loci=True lets the import succeed with 3
    # rows; re-indexing with the defaults and importing again must raise an
    # 'Invalid locus' error.
    hl.index_bgen(resource('skip_invalid_loci.bgen'),
                  reference_genome='GRCh37',
                  skip_invalid_loci=True)

    mt = hl.import_bgen(resource('skip_invalid_loci.bgen'),
                        entry_fields=[],
                        sample_file=resource('skip_invalid_loci.sample'))
    # assertEqual reports the actual count on failure, unlike
    # assertTrue(count == 3).
    self.assertEqual(mt._force_count_rows(), 3)

    with self.assertRaisesRegex(FatalError, 'Invalid locus'):
        hl.index_bgen(resource('skip_invalid_loci.bgen'))

        mt = hl.import_bgen(resource('skip_invalid_loci.bgen'),
                            entry_fields=[],
                            sample_file=resource('skip_invalid_loci.sample'))
        mt._force_count_rows()
def test_n_partitions(self):
    # Requesting 5 partitions must produce exactly 5.
    hl.index_bgen(resource('example.8bits.bgen'))

    requested_partitions = 5
    imported = hl.import_bgen(resource('example.8bits.bgen'),
                              entry_fields=['dosage'],
                              contig_recoding={'01': '1'},
                              reference_genome='GRCh37',
                              n_partitions=requested_partitions)

    self.assertEqual(imported.n_partitions(), 5)
def test_import_bgen_dosage_entry(self):
    # Importing only 'dosage' yields a single float64 entry field.
    hl.index_bgen(resource('example.8bits.bgen'),
                  contig_recoding={'01': '1'},
                  reference_genome='GRCh37')

    imported = hl.import_bgen(resource('example.8bits.bgen'),
                              entry_fields=['dosage'])

    expected_entry_type = hl.tstruct(dosage=hl.tfloat64)
    self.assertEqual(imported.entry.dtype, expected_entry_type)
    self.assertEqual(imported.count_rows(), 199)
def test_import_bgen_random(self):
    # The BGEN import of random.bgen must match the GEN import of
    # random.gen within an absolute tolerance of 1/255.
    samples = resource('random.sample')
    from_gen = hl.import_gen(resource('random.gen'), samples)

    bgen_path = resource('random.bgen')
    hl.index_bgen(bgen_path)
    from_bgen = hl.import_bgen(bgen_path, ['GT', 'GP'], samples)

    matches = from_bgen._same(from_gen, tolerance=1.0 / 255, absolute=True)
    self.assertTrue(matches)
def test_import_bgen_GT_GP_entries(self):
    # Importing 'GT' and 'GP' yields a call field and an array<float64>
    # field, in that order.
    hl.index_bgen(resource('example.8bits.bgen'),
                  contig_recoding={'01': '1'},
                  reference_genome='GRCh37')

    imported = hl.import_bgen(resource('example.8bits.bgen'),
                              entry_fields=['GT', 'GP'],
                              sample_file=resource('example.sample'))

    expected_entry_type = hl.tstruct(GT=hl.tcall, GP=hl.tarray(hl.tfloat64))
    self.assertEqual(imported.entry.dtype, expected_entry_type)
def test_import_bgen_no_reference(self):
    # Indexing with reference_genome=None makes the imported locus a plain
    # struct(contig: str, position: int32).
    hl.index_bgen(resource('example.8bits.bgen'),
                  contig_recoding={'01': '1'},
                  reference_genome=None)

    imported = hl.import_bgen(resource('example.8bits.bgen'),
                              entry_fields=['GT', 'GP', 'dosage'])

    expected_locus_type = hl.tstruct(contig=hl.tstr, position=hl.tint32)
    self.assertEqual(imported.locus.dtype, expected_locus_type)
    self.assertEqual(imported.count_rows(), 199)
def test_multiple_files(self):
    # Importing the three random-{a,b,c} BGEN files together must match the
    # GEN import of random.gen within an absolute tolerance of 1/255.
    samples = resource('random.sample')
    from_gen = hl.import_gen(resource('random.gen'), samples)

    bgen_paths = [
        resource(name)
        for name in ('random-b.bgen', 'random-c.bgen', 'random-a.bgen')
    ]
    hl.index_bgen(bgen_paths)
    from_bgen = hl.import_bgen(bgen_paths, ['GT', 'GP'], samples, n_partitions=3)

    matches = from_bgen._same(from_gen, tolerance=1.0 / 255, absolute=True)
    self.assertTrue(matches)
def test_n_partitions(self):
    # Asking for more partitions than there are variants is capped at the
    # variant count.
    hl.index_bgen(resource('example.8bits.bgen'),
                  contig_recoding={'01': '1'},
                  reference_genome='GRCh37')

    imported = hl.import_bgen(resource('example.8bits.bgen'),
                              entry_fields=['dosage'],
                              n_partitions=210)

    # only 199 variants in the file
    partition_count = imported.n_partitions()
    self.assertEqual(partition_count, 199)
def test_import_bgen_locus_filtering_from_exprs(self):
    # Filtering by the full import's own locus expression -- wrapped in a
    # struct or passed bare -- must reproduce the full import.
    bgen_path = resource('example.8bits.bgen')
    hl.index_bgen(bgen_path, contig_recoding={'01': '1'})

    full = hl.import_bgen(bgen_path, ['GT'])
    self.assertEqual(full.count(), (199, 500))

    # Locus wrapped in a struct expression.
    struct_filter = hl.struct(locus=full.locus)
    via_struct = hl.import_bgen(bgen_path, ['GT'], variants=struct_filter)
    self.assertTrue(full._same(via_struct))

    # Bare locus expression.
    via_locus = hl.import_bgen(bgen_path, ['GT'], variants=full.locus)
    self.assertTrue(full._same(via_locus))
def test_import_bgen_random(self):
    # Compare the GEN and BGEN imports of the "random" dataset, allowing an
    # absolute difference of up to 1/255.
    sample_path = resource('random.sample')
    gen_dataset = hl.import_gen(resource('random.gen'), sample_path)

    bgen_path = resource('random.bgen')
    hl.index_bgen(bgen_path)
    bgen_dataset = hl.import_bgen(bgen_path, ['GT', 'GP'], sample_path)

    agrees = bgen_dataset._same(gen_dataset, tolerance=1.0 / 255, absolute=True)
    self.assertTrue(agrees)
def test_import_bgen_row_fields(self):
    # _row_fields controls which of rsid/varid appear in the row schema;
    # dropping fields from the default import must match the narrower ones.
    def load(**extra):
        # All four imports share the file, entry fields and reference.
        return hl.import_bgen(resource('example.8bits.bgen'),
                              entry_fields=['dosage'],
                              contig_recoding={'01': '1'},
                              reference_genome='GRCh37',
                              **extra)

    def row_type(**extra_fields):
        # locus and alleles are always present; extras follow in order.
        return hl.tstruct(locus=hl.tlocus('GRCh37'),
                          alleles=hl.tarray(hl.tstr),
                          **extra_fields)

    with_defaults = load()
    self.assertEqual(with_defaults.row.dtype,
                     row_type(rsid=hl.tstr, varid=hl.tstr))

    without_extras = load(_row_fields=[])
    self.assertEqual(without_extras.row.dtype, row_type())

    with_varid = load(_row_fields=['varid'])
    self.assertEqual(with_varid.row.dtype, row_type(varid=hl.tstr))

    with_rsid = load(_row_fields=['rsid'])
    self.assertEqual(with_rsid.row.dtype, row_type(rsid=hl.tstr))

    self.assertTrue(with_defaults.drop('varid')._same(with_rsid))
    self.assertTrue(with_defaults.drop('rsid')._same(with_varid))
    self.assertTrue(with_defaults.drop('varid', 'rsid')._same(without_extras))
def test_import_bgen_no_entries(self):
    # An empty entry_fields list yields an empty entry struct that still
    # typechecks.
    hl.index_bgen(resource('example.8bits.bgen'),
                  contig_recoding={'01': '1'},
                  reference_genome='GRCh37')

    imported = hl.import_bgen(resource('example.8bits.bgen'),
                              entry_fields=[],
                              sample_file=resource('example.sample'))

    empty_entry_type = hl.tstruct()
    self.assertEqual(imported.entry.dtype, empty_entry_type)
    imported._jvds.typecheck()
def test_import_bgen_variant_filtering_from_literals(self):
    # Filter by a literal list of (locus, alleles) structs, once with a
    # single partition and once with 199 partitions, and compare against
    # filtering a full import.
    bgen_file = resource('example.8bits.bgen')
    hl.index_bgen(bgen_file, contig_recoding={'01': '1'})

    alleles = ['A', 'G']

    desired_variants = [
        hl.Struct(locus=hl.Locus('1', 2000), alleles=alleles),
        hl.Struct(locus=hl.Locus('1', 2001), alleles=alleles),
        hl.Struct(locus=hl.Locus('1', 4000), alleles=alleles),
        hl.Struct(locus=hl.Locus('1', 10000), alleles=alleles),
        hl.Struct(locus=hl.Locus('1', 100001), alleles=alleles),
    ]

    expected_result = [
        hl.Struct(locus=hl.Locus('1', 2000), alleles=alleles),
        hl.Struct(locus=hl.Locus('1', 2001), alleles=alleles),
        hl.Struct(locus=hl.Locus('1', 4000), alleles=alleles),
        hl.Struct(locus=hl.Locus('1', 10000), alleles=alleles),
        hl.Struct(locus=hl.Locus('1', 10000), alleles=alleles),  # Duplicated variant
        hl.Struct(locus=hl.Locus('1', 100001), alleles=alleles),
    ]

    part_1 = hl.import_bgen(bgen_file,
                            ['GT'],
                            n_partitions=1,  # forcing seek to be called
                            variants=desired_variants)
    # assertEqual (rather than assertTrue(a == b)) prints both lists on
    # failure instead of a bare "False is not true".
    self.assertEqual(
        part_1.rows().key_by('locus', 'alleles').select().collect(),
        expected_result)

    part_199 = hl.import_bgen(bgen_file,
                              ['GT'],
                              # forcing each variant to be its own partition
                              # for testing duplicates work properly
                              n_partitions=199,
                              variants=desired_variants)
    self.assertEqual(
        part_199.rows().key_by('locus', 'alleles').select().collect(),
        expected_result)

    everything = hl.import_bgen(bgen_file, ['GT'])
    self.assertEqual(everything.count(), (199, 500))

    expected = everything.filter_rows(hl.set(desired_variants).contains(everything.row_key))

    self.assertTrue(expected._same(part_1))
def test_import_bgen_empty_variant_filter(self):
    # An empty variant filter -- given as an empty list or as expressions
    # from a row-less matrix table -- must produce zero rows.
    bgen_path = resource('example.8bits.bgen')
    hl.index_bgen(bgen_path, contig_recoding={'01': '1'})

    # Empty literal list.
    from_empty_list = hl.import_bgen(bgen_path,
                                     ['GT'],
                                     n_partitions=10,
                                     variants=[])
    self.assertEqual(from_empty_list.count_rows(), 0)

    # Expressions taken from a matrix table whose rows were all dropped.
    rowless = hl.import_bgen(bgen_path, ['GT']).drop_rows()
    self.assertEqual(rowless.count(), (0, 500))

    empty_filter = hl.struct(locus=rowless.locus, alleles=rowless.alleles)
    from_empty_exprs = hl.import_bgen(bgen_path,
                                      ['GT'],
                                      n_partitions=10,
                                      variants=empty_filter)
    self.assertEqual(from_empty_exprs.count_rows(), 0)
def get_ukb_imputed_data(chromosome: str = '1', variant_list: hl.Table = None, entry_fields=('GP', )):
    """Import BGEN data for one chromosome, or for chromosomes 1 through 22.

    :param chromosome: Value substituted into ``ukb_imputed_bgen_path`` and
        passed to ``get_sample_file``. The special value ``'all'`` is
        replaced by the brace pattern ``'{1,2,...,22}'``.
    :param variant_list: Optional table forwarded as ``variants=`` to
        ``hl.import_bgen``; when ``None`` the argument is omitted entirely.
    :param entry_fields: Entry fields to import.
    :return: Whatever ``hl.import_bgen`` returns for the resolved path.
    """
    if chromosome == 'all':
        numbered = ','.join(str(number) for number in range(1, 23))
        chromosome = '{' + numbered + '}'

    # Only pass variants= when a filter was actually supplied.
    optional_kwargs = {} if variant_list is None else {'variants': variant_list}

    bgen_path = ukb_imputed_bgen_path.format(chromosome)
    sample_path = get_sample_file(chromosome)
    return hl.import_bgen(bgen_path,
                          entry_fields=entry_fields,
                          sample_file=sample_path,
                          **optional_kwargs)
def test_import_bgen_empty_variant_filter(self):
    # An empty variant filter -- given as an empty list or as expressions
    # from a matrix table with every row filtered out -- must produce zero
    # rows.
    bgen_path = resource('example.8bits.bgen')
    hl.index_bgen(bgen_path, contig_recoding={'01': '1'})

    # Empty literal list.
    from_empty_list = hl.import_bgen(bgen_path,
                                     ['GT'],
                                     n_partitions=10,
                                     variants=[])
    self.assertEqual(from_empty_list.count_rows(), 0)

    # Expressions taken from a matrix table with all rows filtered away.
    rowless = hl.import_bgen(bgen_path, ['GT']).filter_rows(False)
    self.assertEqual(rowless.count(), (0, 500))

    empty_filter = hl.struct(locus=rowless.locus, alleles=rowless.alleles)
    from_empty_exprs = hl.import_bgen(bgen_path,
                                      ['GT'],
                                      n_partitions=10,
                                      variants=empty_filter)
    self.assertEqual(from_empty_exprs.count_rows(), 0)
def test_import_bgen_dosage_and_gp_dosage_function_agree(self):
    # The imported 'dosage' field must agree with hl.gp_dosage(GP) for every
    # entry: both missing, or within 1e-6 of each other.
    contig_map = {'0' + digit: digit for digit in map(str, range(1, 10))}

    sample_path = resource('example.sample')
    bgen_path = resource('example.8bits.bgen')
    hl.index_bgen(bgen_path, contig_recoding=contig_map)

    imported = hl.import_bgen(bgen_path, ['GP', 'dosage'], sample_path)
    entries = imported.entries()
    entries = entries.transmute(gp_dosage=hl.gp_dosage(entries.GP))

    both_missing = hl.is_missing(entries.dosage) & hl.is_missing(entries.gp_dosage)
    nearly_equal = hl.abs(entries.dosage - entries.gp_dosage) < 1e-6
    self.assertTrue(entries.all(both_missing | nearly_equal))
def test_specify_different_index_file(self):
    # An index written to a custom location via index_file_map can be used
    # for import; a custom location without the idx2 suffix is rejected.
    sample_path = resource('random.sample')
    bgen_path = resource('random.bgen')

    custom_index_map = {bgen_path: new_temp_file(suffix='idx2')}
    hl.index_bgen(bgen_path, index_file_map=custom_index_map)
    imported = hl.import_bgen(bgen_path,
                              ['GT', 'GP'],
                              sample_path,
                              index_file_map=custom_index_map)
    self.assertEqual(imported.count(), (30, 10))

    with self.assertRaisesRegex(FatalError, 'missing a .idx2 file extension'):
        # Temp file created without the idx2 suffix.
        unsuffixed_index_map = {bgen_path: new_temp_file()}
        hl.index_bgen(bgen_path, index_file_map=unsuffixed_index_map)
def test_unphased_bgen(spark, tmp_path):
    """Check that the 8-bit example BGEN passes the lossless-adapter assertion.

    The file is indexed without a reference genome, imported with only the
    'GP' entry field, converted with ``functions.from_matrix_table`` and
    handed to ``_assert_lossless_adapter``.
    """
    spark.conf.set('spark.sql.autoBroadcastJoinThreshold', '-1')

    bgen_path = 'test-data/bgen/example.8bits.bgen'
    hl.index_bgen(bgen_path, reference_genome=None)

    matrix_table = hl.import_bgen(bgen_path, entry_fields=['GP'])
    converted_df = functions.from_matrix_table(matrix_table)

    _assert_lossless_adapter(spark,
                             tmp_path,
                             converted_df,
                             bgen_path,
                             'bgen',
                             'bigbgen',
                             writer_options={'bitsPerProbability': '8'})
def test_import_bgen_gavin_example(self):
    # The BGEN import of example.8bits.bgen must match the GEN import of
    # example.gen within an absolute tolerance of 1/255, with both using the
    # same contig recoding and the GRCh37 reference.
    contig_map = {'0' + digit: digit for digit in map(str, range(1, 10))}

    sample_path = resource('example.sample')
    from_gen = hl.import_gen(resource('example.gen'),
                             sample_path,
                             contig_recoding=contig_map,
                             reference_genome="GRCh37")

    bgen_path = resource('example.8bits.bgen')
    hl.index_bgen(bgen_path,
                  contig_recoding=contig_map,
                  reference_genome="GRCh37")
    from_bgen = hl.import_bgen(bgen_path, ['GT', 'GP'], sample_path)

    matches = from_bgen._same(from_gen, tolerance=1.0 / 255, absolute=True)
    self.assertTrue(matches)
def test_import_bgen_dosage_and_gp_dosage_function_agree(self):
    # For every entry, the imported 'dosage' and hl.gp_dosage(GP) must
    # either both be missing or differ by less than 1e-6.
    recode_contigs = {'0' + str(digit): str(digit) for digit in range(1, 10)}

    samples = resource('example.sample')
    bgen_path = resource('example.8bits.bgen')
    hl.index_bgen(bgen_path, contig_recoding=recode_contigs)

    dataset = hl.import_bgen(bgen_path, ['GP', 'dosage'], samples)
    entry_table = dataset.entries()
    entry_table = entry_table.transmute(gp_dosage=hl.gp_dosage(entry_table.GP))

    neither_defined = (hl.is_missing(entry_table.dosage)
                       & hl.is_missing(entry_table.gp_dosage))
    within_tolerance = hl.abs(entry_table.dosage - entry_table.gp_dosage) < 1e-6
    self.assertTrue(entry_table.all(neither_defined | within_tolerance))
def test_import_bgen_row_fields(self):
    # _row_fields controls which of rsid/varid appear in the row schema;
    # dropping fields from the default import must match the narrower ones.
    hl.index_bgen(resource('example.8bits.bgen'),
                  contig_recoding={'01': '1'},
                  reference_genome='GRCh37')

    def load(**extra):
        # All four imports share the file and entry fields.
        return hl.import_bgen(resource('example.8bits.bgen'),
                              entry_fields=['dosage'],
                              **extra)

    def row_type(**extra_fields):
        # locus and alleles are always present; extras follow in order.
        return hl.tstruct(locus=hl.tlocus('GRCh37'),
                          alleles=hl.tarray(hl.tstr),
                          **extra_fields)

    with_defaults = load()
    self.assertEqual(with_defaults.row.dtype,
                     row_type(rsid=hl.tstr, varid=hl.tstr))

    without_extras = load(_row_fields=[])
    self.assertEqual(without_extras.row.dtype, row_type())

    with_varid = load(_row_fields=['varid'])
    self.assertEqual(with_varid.row.dtype, row_type(varid=hl.tstr))

    with_rsid = load(_row_fields=['rsid'])
    self.assertEqual(with_rsid.row.dtype, row_type(rsid=hl.tstr))

    self.assertTrue(with_defaults.drop('varid')._same(with_rsid))
    self.assertTrue(with_defaults.drop('rsid')._same(with_varid))
    self.assertTrue(with_defaults.drop('varid', 'rsid')._same(without_extras))
def test_drop(self):
    # Filtering away all rows keeps the 500 columns; filtering away all
    # columns keeps the 199 rows.
    hl.index_bgen(resource('example.8bits.bgen'),
                  contig_recoding={'01': '1'},
                  reference_genome='GRCh37')

    imported = hl.import_bgen(resource('example.8bits.bgen'),
                              entry_fields=['dosage'])

    no_rows = imported.filter_rows(False)
    self.assertEqual(no_rows._force_count_rows(), 0)
    self.assertEqual(no_rows._force_count_cols(), 500)

    no_cols = imported.filter_cols(False)
    self.assertEqual(no_cols._force_count_rows(), 199)
    self.assertEqual(no_cols._force_count_cols(), 0)
def test_drop(self):
    # drop_rows() keeps the 500 columns; drop_cols() keeps the 199 rows.
    hl.index_bgen(resource('example.8bits.bgen'),
                  contig_recoding={'01': '1'},
                  reference_genome='GRCh37')

    imported = hl.import_bgen(resource('example.8bits.bgen'),
                              entry_fields=['dosage'])

    no_rows = imported.drop_rows()
    self.assertEqual(no_rows._force_count_rows(), 0)
    self.assertEqual(no_rows._force_count_cols(), 500)

    no_cols = imported.drop_cols()
    self.assertEqual(no_cols._force_count_rows(), 199)
    self.assertEqual(no_cols._force_count_cols(), 0)
def test_import_bgen_gavin_example(self):
    # The BGEN import of example.8bits.bgen must match the GEN import of
    # example.gen within an absolute tolerance of 1/255, with the contig
    # recoding supplied at import time.
    contig_map = {'0' + digit: digit for digit in map(str, range(1, 10))}

    sample_path = resource('example.sample')
    from_gen = hl.import_gen(resource('example.gen'),
                             sample_path,
                             contig_recoding=contig_map)

    bgen_path = resource('example.8bits.bgen')
    hl.index_bgen(bgen_path)
    from_bgen = hl.import_bgen(bgen_path,
                               ['GT', 'GP'],
                               sample_path,
                               contig_recoding=contig_map)

    matches = from_bgen._same(from_gen, tolerance=1.0 / 255, absolute=True)
    self.assertTrue(matches)
def test_multiple_files(self):
    # Import random-b, random-c and random-a together and compare with the
    # GEN import of random.gen, allowing an absolute difference of 1/255.
    sample_path = resource('random.sample')
    gen_dataset = hl.import_gen(resource('random.gen'), sample_path)

    file_names = ('random-b.bgen', 'random-c.bgen', 'random-a.bgen')
    paths = [resource(file_name) for file_name in file_names]
    hl.index_bgen(paths)
    bgen_dataset = hl.import_bgen(paths, ['GT', 'GP'], sample_path, n_partitions=3)

    agrees = bgen_dataset._same(gen_dataset, tolerance=1.0 / 255, absolute=True)
    self.assertTrue(agrees)
def test_import_bgen_row_fields(self):
    # file_row_idx must reflect each variant's position in the file, which
    # can be cross-checked against the number embedded in its rsid.
    imported = hl.import_bgen(resource('example.8bits.bgen'),
                              entry_fields=['dosage'],
                              contig_recoding={'01': '1'},
                              reference_genome='GRCh37',
                              _row_fields=['rsid', 'file_row_idx'])

    first_ten = imported.file_row_idx.take(10)
    self.assertEqual(first_ten, [99, 0, 100, 1, 101, 2, 102, 3, 103, 4])

    # the rsids are numbered 2 to 200 and correspond to the order of the
    # variants in the file (the loci are out of order in this file)
    #
    # the rsids look like: "RSID_99"
    rsid_values = imported.rsid.collect()
    row_indexes = imported.file_row_idx.collect()
    # Strip the 5-character "RSID_" prefix and shift from 2-based to 0-based.
    indexes_from_rsids = [int(value[5:]) - 2 for value in rsid_values]
    self.assertEqual(row_indexes, indexes_from_rsids)
def test_import_bgen_locus_filtering_from_table(self):
    # Filtering by a table keyed only on locus must return every variant at
    # that locus, including duplicates.
    bgen_file = resource('example.8bits.bgen')
    hl.index_bgen(bgen_file, contig_recoding={'01': '1'})

    desired_loci = hl.Table.parallelize([{'locus': hl.Locus('1', 10000)}],
                                        schema=hl.tstruct(locus=hl.tlocus()),
                                        key='locus')

    expected_result = [
        hl.Struct(locus=hl.Locus('1', 10000), alleles=['A', 'G']),
        hl.Struct(locus=hl.Locus('1', 10000), alleles=['A', 'G'])  # Duplicated variant
    ]

    result = hl.import_bgen(bgen_file,
                            ['GT'],
                            variants=desired_loci)
    # assertEqual (rather than assertTrue(a == b)) reports both lists when
    # they differ, which makes a failure diagnosable.
    self.assertEqual(
        result.rows().key_by('locus', 'alleles').select().collect(),
        expected_result)
# Script that exercises three code paths in sequence: a VCF import, a BGEN
# import, and BlockMatrix-to-numpy conversion. Statement order is preserved
# as-is because each step forces evaluation.
import hail as hl
from hail.linalg import BlockMatrix

# Import a VCF, add a constant row field, and force a row count.
mt = hl.import_vcf('gs://hail-1kg/1kg_coreexome.vcf.bgz')
mt = mt.annotate_rows(x = 5)
mt._force_count_rows()

# Import a BGEN file with only the GT entry field and force a row count.
mt = hl.import_bgen('gs://hail-ci/example.8bits.bgen', entry_fields=['GT'])
mt._force_count_rows()

# Build a random 10 x 11 block matrix and convert it to numpy twice: once
# with _force_blocking=True and once with the default arguments.
bm = BlockMatrix.random(10, 11)
bm.to_numpy(_force_blocking=True)
bm.to_numpy()
def init(doctest_namespace):
    """Populate ``doctest_namespace`` with the objects the doctests refer to.

    This is a generator: everything before the ``yield`` is setup and the
    code after it is teardown. Setup changes the working directory to
    ``docs/``, makes sure ``output/`` exists, removes leftover ``sample*.vds``
    directories, and loads the example datasets, tables and expressions into
    ``doctest_namespace``. Teardown restores the original working directory.

    :param doctest_namespace: Mapping that receives the named objects.
    """
    # This gets run once per process -- must avoid race conditions
    print("setting up doctest...")

    olddir = os.getcwd()
    os.chdir("docs/")

    # Restore the working directory even if setup raises part-way through
    # or the generator is closed early; otherwise the process would be left
    # stranded inside docs/.
    try:
        doctest_namespace['hl'] = hl
        doctest_namespace['agg'] = agg

        # Another process may create the directory between the check and
        # the mkdir, so a failure here is deliberately ignored.
        if not os.path.isdir("output/"):
            try:
                os.mkdir("output/")
            except OSError:
                pass

        # Remove output directories left behind by earlier runs.
        files = ["sample.vds", "sample.qc.vds", "sample.filtered.vds"]
        for f in files:
            if os.path.isdir(f):
                shutil.rmtree(f)

        ds = hl.read_matrix_table('data/example.vds')
        doctest_namespace['ds'] = ds
        doctest_namespace['dataset'] = ds
        doctest_namespace['dataset2'] = ds.annotate_globals(global_field=5)
        doctest_namespace['dataset_to_union_1'] = ds
        doctest_namespace['dataset_to_union_2'] = ds

        v_metadata = ds.rows().annotate_globals(global_field=5).annotate(consequence='SYN')
        doctest_namespace['v_metadata'] = v_metadata

        s_metadata = ds.cols().annotate(pop='AMR', is_case=False, sex='F')
        doctest_namespace['s_metadata'] = s_metadata

        # Table
        table1 = hl.import_table('data/kt_example1.tsv', impute=True, key='ID')
        table1 = table1.annotate_globals(global_field_1=5, global_field_2=10)
        doctest_namespace['table1'] = table1
        doctest_namespace['other_table'] = table1

        table2 = hl.import_table('data/kt_example2.tsv', impute=True, key='ID')
        doctest_namespace['table2'] = table2

        table4 = hl.import_table('data/kt_example4.tsv', impute=True,
                                 types={'B': hl.tstruct(B0=hl.tbool, B1=hl.tstr),
                                        'D': hl.tstruct(cat=hl.tint32, dog=hl.tint32),
                                        'E': hl.tstruct(A=hl.tint32, B=hl.tint32)})
        doctest_namespace['table4'] = table4

        people_table = hl.import_table('data/explode_example.tsv', delimiter='\\s+',
                                       types={'Age': hl.tint32, 'Children': hl.tarray(hl.tstr)},
                                       key='Name')
        doctest_namespace['people_table'] = people_table

        # TDT
        doctest_namespace['tdt_dataset'] = hl.import_vcf('data/tdt_tiny.vcf')

        ds2 = hl.variant_qc(ds)
        doctest_namespace['ds2'] = ds2.select_rows(AF=ds2.variant_qc.AF)

        # Expressions
        doctest_namespace['names'] = hl.literal(['Alice', 'Bob', 'Charlie'])
        doctest_namespace['a1'] = hl.literal([0, 1, 2, 3, 4, 5])
        doctest_namespace['a2'] = hl.literal([1, -1, 1, -1, 1, -1])
        doctest_namespace['t'] = hl.literal(True)
        doctest_namespace['f'] = hl.literal(False)
        doctest_namespace['na'] = hl.null(hl.tbool)
        doctest_namespace['call'] = hl.call(0, 1, phased=False)
        doctest_namespace['a'] = hl.literal([1, 2, 3, 4, 5])
        doctest_namespace['d'] = hl.literal({'Alice': 43, 'Bob': 33, 'Charles': 44})
        doctest_namespace['interval'] = hl.interval(3, 11)
        doctest_namespace['locus_interval'] = hl.parse_locus_interval("1:53242-90543")
        doctest_namespace['locus'] = hl.locus('1', 1034245)
        doctest_namespace['x'] = hl.literal(3)
        doctest_namespace['y'] = hl.literal(4.5)
        doctest_namespace['s1'] = hl.literal({1, 2, 3})
        doctest_namespace['s2'] = hl.literal({1, 3, 5})
        doctest_namespace['s3'] = hl.literal({'Alice', 'Bob', 'Charlie'})
        doctest_namespace['struct'] = hl.struct(a=5, b='Foo')
        doctest_namespace['tup'] = hl.literal(("a", 1, [1, 2, 3]))
        doctest_namespace['s'] = hl.literal('The quick brown fox')
        doctest_namespace['interval2'] = hl.Interval(3, 6)

        # Overview
        doctest_namespace['ht'] = hl.import_table("data/kt_example1.tsv", impute=True)
        doctest_namespace['mt'] = ds

        gnomad_data = ds.rows()
        doctest_namespace['gnomad_data'] = gnomad_data.select(gnomad_data.info.AF)

        # BGEN
        bgen = hl.import_bgen('data/example.8bits.bgen',
                              entry_fields=['GT', 'GP', 'dosage'])
        doctest_namespace['variants_table'] = bgen.rows()

        print("finished setting up doctest...")
        yield
    finally:
        os.chdir(olddir)
def test_multiple_files_disjoint(self):
    # Index the three "*-disjoint" files and expect the combined import to
    # be rejected with the disjoint-region error.
    sample_path = resource('random.sample')
    file_names = ('random-b-disjoint.bgen',
                  'random-c-disjoint.bgen',
                  'random-a-disjoint.bgen')
    paths = [resource(file_name) for file_name in file_names]
    hl.index_bgen(paths)

    error_pattern = 'Each BGEN file must contain a region of the genome disjoint from other files'
    with self.assertRaisesRegex(FatalError, error_pattern):
        hl.import_bgen(paths, ['GT', 'GP'], sample_path, n_partitions=3)