Ejemplo n.º 1
0
    def test_import_bgen_variant_filtering(self):
        """Import only selected file row indexes and compare with a filtered full import.

        The restricted import runs first and the unrestricted one second, so
        that a Hadoop configuration polluted with stale ``_variants_per_file``
        data would be caught by the full import.
        """
        bgen_path = resource('example.8bits.bgen')
        wanted_indexes = [1, 2, 3, 5, 7, 9, 11, 13, 17, 198]

        def load(**extra):
            # Options shared by the restricted and the unrestricted import.
            return hl.import_bgen(bgen_path, ['GT'],
                                  contig_recoding={'01': '1'},
                                  reference_genome=None,
                                  _row_fields=['file_row_idx'],
                                  **extra)

        actual = load(n_partitions=10,
                      _variants_per_file={bgen_path: wanted_indexes})

        everything = load()
        self.assertEqual(everything.count(), (199, 500))

        row_is_wanted = hl.set(wanted_indexes).contains(
            hl.int32(everything.file_row_idx))
        expected = everything.filter_rows(row_is_wanted)
        self.assertTrue(expected._same(actual))

        contig_and_position = (hl.str(actual.locus.contig) + ":" +
                               hl.str(actual.locus.position))
        self.assertEqual(contig_and_position.collect(), [
            '1:3000', '1:4000', '1:5000', '1:7000', '1:9000', '1:11000',
            '1:13000', '1:15000', '1:19000', '1:100001'
        ])
Ejemplo n.º 2
0
    def test_import_bgen_locus_filtering_from_literals(self):
        """Filter a BGEN import by locus, given as a Struct and as a bare Locus.

        Both forms must yield the same row keys. The variant at 1:10000 is
        listed twice in the expected result because it is a duplicated variant.
        """
        bgen_file = resource('example.8bits.bgen')
        hl.index_bgen(bgen_file,
                      contig_recoding={'01': '1'})

        expected_result = [
            hl.Struct(locus=hl.Locus('1', 10000), alleles=['A', 'G']),
            hl.Struct(locus=hl.Locus('1', 10000), alleles=['A', 'G'])  # Duplicated variant
        ]

        # Test with Struct(Locus)
        desired_loci = [hl.Struct(locus=hl.Locus('1', 10000))]
        locus_struct = hl.import_bgen(bgen_file,
                                      ['GT'],
                                      variants=desired_loci)
        # assertEqual (instead of assertTrue(a == b)) reports both lists when
        # the comparison fails.
        self.assertEqual(
            locus_struct.rows().key_by('locus', 'alleles').select().collect(),
            expected_result)

        # Test with Locus object
        desired_loci = [hl.Locus('1', 10000)]
        locus_object = hl.import_bgen(bgen_file,
                                      ['GT'],
                                      variants=desired_loci)
        self.assertEqual(
            locus_object.rows().key_by('locus', 'alleles').select().collect(),
            expected_result)
Ejemplo n.º 3
0
    def test_multiple_files_variant_filtering(self):
        """Filter an import spanning three BGEN files by a list of variants.

        The filtered import must have six rows and must match the result of
        importing everything (30 rows, 10 columns) and filtering afterwards.
        """
        bgen_paths = [resource(name)
                      for name in ('random-b.bgen', 'random-c.bgen', 'random-a.bgen')]
        hl.index_bgen(bgen_paths)

        ref_alt = ['A', 'G']
        wanted = [hl.Struct(locus=hl.Locus('20', position), alleles=ref_alt)
                  for position in (11, 13, 29, 28, 1, 12)]

        filtered = hl.import_bgen(bgen_paths, ['GT'], n_partitions=10, variants=wanted)
        self.assertEqual(filtered.count_rows(), 6)

        unfiltered = hl.import_bgen(bgen_paths, ['GT'])
        self.assertEqual(unfiltered.count(), (30, 10))

        key_is_wanted = hl.set(wanted).contains(unfiltered.row_key)
        self.assertTrue(unfiltered.filter_rows(key_is_wanted)._same(filtered))
Ejemplo n.º 4
0
    def test_multiple_files_variant_filtering(self):
        """Check that variant filtering at import time works across several files.

        Imports three indexed BGEN files restricted to six variants and
        compares the result with a full import filtered on the same row keys.
        """
        file_names = ('random-b.bgen', 'random-c.bgen', 'random-a.bgen')
        paths = list(map(resource, file_names))
        hl.index_bgen(paths)

        shared_alleles = ['A', 'G']

        def variant_at(position):
            # All desired variants sit on contig '20' with the same alleles.
            return hl.Struct(locus=hl.Locus('20', position),
                             alleles=shared_alleles)

        selection = [variant_at(11), variant_at(13), variant_at(29),
                     variant_at(28), variant_at(1), variant_at(12)]

        restricted = hl.import_bgen(paths, ['GT'],
                                    n_partitions=10,
                                    variants=selection)
        self.assertEqual(restricted.count_rows(), 6)

        full = hl.import_bgen(paths, ['GT'])
        self.assertEqual(full.count(), (30, 10))

        selected_keys = hl.set(selection)
        reference = full.filter_rows(selected_keys.contains(full.row_key))
        self.assertTrue(reference._same(restricted))
Ejemplo n.º 5
0
    def test_import_bgen(self):
        """Import the v11, 8-bit and 10-bit example BGEN files against GRCh37.

        Checks the recoded contig, the row counts, the entry type for several
        ``entry_fields`` selections, and the locus type.
        """
        def load(path, **options):
            # Every import here recodes contig '01' to '1' and uses GRCh37.
            return hl.import_bgen(path,
                                  contig_recoding={'01': '1'},
                                  reference_genome='GRCh37',
                                  **options)

        v11_path = resource('example.v11.bgen')
        hl.index_bgen(v11_path)
        v11_rows = load(v11_path,
                        entry_fields=['GT', 'GP'],
                        sample_file=resource('example.sample')).rows()
        self.assertTrue(v11_rows.all(v11_rows.locus.contig == '1'))
        self.assertEqual(v11_rows.count(), 199)

        eight_bit_path = resource('example.8bits.bgen')
        hl.index_bgen(eight_bit_path)

        dosage_only = load(eight_bit_path, entry_fields=['dosage'])
        self.assertEqual(dosage_only.entry.dtype,
                         hl.tstruct(dosage=hl.tfloat64))

        calls_and_probs = load(eight_bit_path,
                               entry_fields=['GT', 'GP'],
                               sample_file=resource('example.sample'))
        self.assertEqual(calls_and_probs.entry.dtype,
                         hl.tstruct(GT=hl.tcall, GP=hl.tarray(hl.tfloat64)))
        self.assertEqual(calls_and_probs.count_rows(), 199)

        ten_bit_path = resource('example.10bits.bgen')
        hl.index_bgen(ten_bit_path)
        all_entries = load(ten_bit_path, entry_fields=['GT', 'GP', 'dosage'])
        self.assertEqual(
            all_entries.entry.dtype,
            hl.tstruct(GT=hl.tcall, GP=hl.tarray(hl.tfloat64), dosage=hl.tfloat64))
        self.assertEqual(all_entries.locus.dtype, hl.tlocus('GRCh37'))
Ejemplo n.º 6
0
    def test_multiple_references_throws_error(self):
        """Importing files indexed with different reference genomes must fail."""
        samples = resource('random.sample')
        no_reference_bgen = resource('random-b.bgen')
        grch37_bgen = resource('random-c.bgen')
        hl.index_bgen(no_reference_bgen, reference_genome=None)
        hl.index_bgen(grch37_bgen, reference_genome='GRCh37')

        expected_message = 'Found multiple reference genomes were specified in the BGEN index files'
        with self.assertRaisesRegex(FatalError, expected_message):
            hl.import_bgen([no_reference_bgen, grch37_bgen], ['GT'],
                           sample_file=samples)
Ejemplo n.º 7
0
    def test_multiple_references_throws_error(self):
        """Expect a FatalError when the index files disagree on the reference genome.

        One file is indexed with no reference genome and the other with
        GRCh37; importing both together must be rejected.
        """
        sample_path = resource('random.sample')
        # (BGEN path, reference genome used when indexing it)
        indexed_with = [
            (resource('random-b.bgen'), None),
            (resource('random-c.bgen'), 'GRCh37'),
        ]
        for path, genome in indexed_with:
            hl.index_bgen(path, reference_genome=genome)

        both_files = [path for path, _ in indexed_with]
        pattern = 'Found multiple reference genomes were specified in the BGEN index files'
        with self.assertRaisesRegex(FatalError, pattern):
            hl.import_bgen(both_files, ['GT'], sample_file=sample_path)
Ejemplo n.º 8
0
    def test_import_bgen_variant_filtering_from_literals(self):
        """Filter a BGEN import by a literal list of variants.

        The import is run with one partition and with 199 partitions. In both
        cases the collected row keys must equal ``expected_result``, which
        lists the variant at 1:10000 twice because it is duplicated. The
        single-partition result must also match a full import filtered on the
        same row keys.
        """
        bgen_file = resource('example.8bits.bgen')

        hl.index_bgen(bgen_file, contig_recoding={'01': '1'})

        alleles = ['A', 'G']

        desired_variants = [
            hl.Struct(locus=hl.Locus('1', 2000), alleles=alleles),
            hl.Struct(locus=hl.Locus('1', 2001), alleles=alleles),
            hl.Struct(locus=hl.Locus('1', 4000), alleles=alleles),
            hl.Struct(locus=hl.Locus('1', 10000), alleles=alleles),
            hl.Struct(locus=hl.Locus('1', 100001), alleles=alleles),
        ]

        expected_result = [
            hl.Struct(locus=hl.Locus('1', 2000), alleles=alleles),
            hl.Struct(locus=hl.Locus('1', 2001), alleles=alleles),
            hl.Struct(locus=hl.Locus('1', 4000), alleles=alleles),
            hl.Struct(locus=hl.Locus('1', 10000), alleles=alleles),
            hl.Struct(locus=hl.Locus('1', 10000),
                      alleles=alleles),  # Duplicated variant
            hl.Struct(locus=hl.Locus('1', 100001), alleles=alleles),
        ]

        part_1 = hl.import_bgen(
            bgen_file,
            ['GT'],
            n_partitions=1,  # forcing seek to be called
            variants=desired_variants)
        # assertEqual (instead of assertTrue(a == b)) reports both lists when
        # the comparison fails.
        self.assertEqual(
            part_1.rows().key_by('locus', 'alleles').select().collect(),
            expected_result)

        part_199 = hl.import_bgen(
            bgen_file,
            ['GT'],
            # forcing each variant to be its own partition for testing
            # duplicates work properly
            n_partitions=199,
            variants=desired_variants)
        self.assertEqual(
            part_199.rows().key_by('locus', 'alleles').select().collect(),
            expected_result)

        everything = hl.import_bgen(bgen_file, ['GT'])
        self.assertEqual(everything.count(), (199, 500))

        expected = everything.filter_rows(
            hl.set(desired_variants).contains(everything.row_key))

        self.assertTrue(expected._same(part_1))
Ejemplo n.º 9
0
    def test_import_bgen_variant_filtering_from_table(self):
        """Filtering by the table of all rows must reproduce the full import."""
        bgen_path = resource('example.8bits.bgen')
        hl.index_bgen(bgen_path, contig_recoding={'01': '1'})

        full_import = hl.import_bgen(bgen_path, ['GT'])
        self.assertEqual(full_import.count(), (199, 500))

        # Use every row of the full import as the variant filter.
        all_rows = full_import.rows()
        filtered_import = hl.import_bgen(bgen_path, ['GT'],
                                         n_partitions=10,
                                         variants=all_rows)

        self.assertTrue(full_import._same(filtered_import))
Ejemplo n.º 10
0
    def test_import_bgen_variant_filtering_from_table(self):
        """Pass a rows table as ``variants`` and expect an unchanged dataset.

        The rows table comes from an unfiltered import of the same file, so
        the filtered import has to be the same as the unfiltered one.
        """
        path = resource('example.8bits.bgen')
        hl.index_bgen(path, contig_recoding={'01': '1'})

        everything = hl.import_bgen(path, ['GT'])
        self.assertEqual(everything.count(), (199, 500))

        import_options = {
            'n_partitions': 10,
            'variants': everything.rows(),  # filtering with everything
        }
        actual = hl.import_bgen(path, ['GT'], **import_options)

        self.assertTrue(everything._same(actual))
Ejemplo n.º 11
0
 def test_multiple_files_disjoint(self):
     """Importing the three ``*-disjoint`` files together must raise a FatalError.

     The error message has to state that each BGEN file must cover a region
     disjoint from the other files.
     """
     samples = resource('random.sample')
     bgen_paths = [resource(name) for name in ('random-b-disjoint.bgen',
                                               'random-c-disjoint.bgen',
                                               'random-a-disjoint.bgen')]
     hl.index_bgen(bgen_paths)

     expected_message = 'Each BGEN file must contain a region of the genome disjoint from other files'
     with self.assertRaisesRegex(FatalError, expected_message):
         hl.import_bgen(bgen_paths, ['GT', 'GP'], samples, n_partitions=3)
Ejemplo n.º 12
0
    def test_old_index_file_throws_error(self):
        """Import must fail when no ``.idx2`` index exists for the BGEN file.

        Two situations are checked: no index at all, and only an old-style
        ``.idx`` file next to the BGEN file.
        """
        sample_file = resource('random.sample')
        bgen_file = resource('random.bgen')

        # missing file
        if os.path.exists(bgen_file + '.idx2'):
            run_command(['rm', '-r', bgen_file + '.idx2'])
        with self.assertRaisesRegex(FatalError, 'have no .idx2 index file'):
            hl.import_bgen(bgen_file, ['GT', 'GP'], sample_file, n_partitions=3)

        # old index file
        run_command(['touch', bgen_file + '.idx'])
        try:
            with self.assertRaisesRegex(FatalError, 'have no .idx2 index file'):
                hl.import_bgen(bgen_file, ['GT', 'GP'], sample_file)
        finally:
            # Remove the placeholder even when the assertion fails, so a
            # stray .idx file is not left behind in the resources.
            run_command(['rm', bgen_file + '.idx'])
Ejemplo n.º 13
0
 def test_parallel_import(self):
     """Index and import ``parallelBgenExport.bgen`` and check its dimensions."""
     bgen_path = resource('parallelBgenExport.bgen')
     hl.index_bgen(bgen_path)

     sample_path = resource('parallelBgenExport.sample')
     imported = hl.import_bgen(bgen_path, ['GT', 'GP'], sample_path)
     self.assertEqual(imported.count(), (16, 10))
Ejemplo n.º 14
0
def import_bgen_filter_count(bgen_path, sample_path):
    """Import GT and GP from a BGEN file, keep A/T rows, and force a row count.

    :param bgen_path: path of the BGEN file to import.
    :param sample_path: path of the accompanying sample file.
    """
    import_options = dict(sample_file=sample_path,
                          entry_fields=['GT', 'GP'],
                          n_partitions=8)
    dataset = hl.import_bgen(bgen_path, **import_options)
    a_to_t_only = dataset.filter_rows(dataset.alleles == ['A', 'T'])
    a_to_t_only._force_count_rows()
Ejemplo n.º 15
0
def import_bgen_info_score(bgen_path, sample_path):
    """Import GP from a BGEN file, compute per-row info scores, and force a count.

    :param bgen_path: path of the BGEN file to import.
    :param sample_path: path of the accompanying sample file.
    """
    import_options = dict(sample_file=sample_path,
                          entry_fields=['GP'],
                          n_partitions=8)
    dataset = hl.import_bgen(bgen_path, **import_options)
    scored = dataset.annotate_rows(info_score=hl.agg.info_score(dataset.GP))
    score_table = scored.rows().select('info_score')
    score_table._force_count()
Ejemplo n.º 16
0
def import_bgen_info_score():
    """Compute per-row info scores on the ``sim_ukb`` BGEN resource and force a count."""
    bgen_path = resource('sim_ukb.bgen')
    sample_path = resource('sim_ukb.sample')
    dataset = hl.import_bgen(bgen_path,
                             sample_file=sample_path,
                             entry_fields=['GP'],
                             n_partitions=8)
    score = hl.agg.info_score(dataset.GP)
    scored = dataset.annotate_rows(info_score=score)
    scored.rows().select('info_score')._force_count()
Ejemplo n.º 17
0
    def test_import_bgen_skip_invalid_loci(self):
        """Check ``skip_invalid_loci`` when it is passed to ``import_bgen``.

        With ``skip_invalid_loci=True`` and GRCh37 the import must yield three
        rows. Without those options, forcing a row count must raise a
        FatalError mentioning 'Invalid locus'.
        """
        hl.index_bgen(resource('skip_invalid_loci.bgen'))

        mt = hl.import_bgen(resource('skip_invalid_loci.bgen'),
                            entry_fields=[],
                            sample_file=resource('skip_invalid_loci.sample'),
                            reference_genome='GRCh37',
                            skip_invalid_loci=True)
        # assertEqual (instead of assertTrue(x == 3)) reports the actual
        # count when the check fails.
        self.assertEqual(mt._force_count_rows(), 3)

        with self.assertRaisesRegex(FatalError, 'Invalid locus'):
            mt = hl.import_bgen(
                resource('skip_invalid_loci.bgen'),
                entry_fields=[],
                sample_file=resource('skip_invalid_loci.sample'))
            mt._force_count_rows()
Ejemplo n.º 18
0
 def test_import_bgen_no_reference_specified(self):
     """Import with ``reference_genome=None`` and check the locus type.

     The locus must then be a struct of a string contig and an int32
     position, and the dataset must have 199 rows.
     """
     bgen = hl.import_bgen(resource('example.10bits.bgen'),
                           entry_fields=['GT', 'GP', 'dosage'],
                           contig_recoding={'01': '1'},
                           reference_genome=None)
     # assertEqual (instead of assertTrue(a == b)) reports both types when
     # the comparison fails.
     self.assertEqual(bgen.locus.dtype,
                      hl.tstruct(contig=hl.tstr, position=hl.tint32))
     self.assertEqual(bgen.count_rows(), 199)
Ejemplo n.º 19
0
def import_bgen_filter_count():
    """Import the ``sim_ukb`` BGEN resource, keep A/T rows, and force a row count."""
    bgen_path = resource('sim_ukb.bgen')
    sample_path = resource('sim_ukb.sample')
    dataset = hl.import_bgen(bgen_path,
                             sample_file=sample_path,
                             entry_fields=['GT', 'GP'],
                             n_partitions=8)
    has_a_t_alleles = dataset.alleles == ['A', 'T']
    dataset.filter_rows(has_a_t_alleles)._force_count_rows()
Ejemplo n.º 20
0
    def test_import_bgen_skip_invalid_loci(self):
        """Check ``skip_invalid_loci`` when it is passed to ``index_bgen``.

        Indexing with GRCh37 and ``skip_invalid_loci=True`` must give an
        import with three rows. Re-indexing without those options and then
        importing and counting must raise a FatalError mentioning
        'Invalid locus'.
        """
        hl.index_bgen(resource('skip_invalid_loci.bgen'),
                      reference_genome='GRCh37',
                      skip_invalid_loci=True)

        mt = hl.import_bgen(resource('skip_invalid_loci.bgen'),
                            entry_fields=[],
                            sample_file=resource('skip_invalid_loci.sample'))
        # assertEqual (instead of assertTrue(x == 3)) reports the actual
        # count when the check fails.
        self.assertEqual(mt._force_count_rows(), 3)

        with self.assertRaisesRegex(FatalError, 'Invalid locus'):
            hl.index_bgen(resource('skip_invalid_loci.bgen'))

            mt = hl.import_bgen(resource('skip_invalid_loci.bgen'),
                                entry_fields=[],
                                sample_file=resource('skip_invalid_loci.sample'))
            mt._force_count_rows()
Ejemplo n.º 21
0
    def test_old_index_file_throws_error(self):
        """Import must fail when no ``.idx2`` index exists for the BGEN file.

        Two situations are checked: no index at all, and only an old-style
        ``.idx`` file next to the BGEN file.
        """
        sample_file = resource('random.sample')
        bgen_file = resource('random.bgen')

        # missing file
        if os.path.exists(bgen_file + '.idx2'):
            run_command(['rm', '-r', bgen_file + '.idx2'])
        with self.assertRaisesRegex(FatalError, 'have no .idx2 index file'):
            hl.import_bgen(bgen_file, ['GT', 'GP'],
                           sample_file,
                           n_partitions=3)

        # old index file
        run_command(['touch', bgen_file + '.idx'])
        try:
            with self.assertRaisesRegex(FatalError, 'have no .idx2 index file'):
                hl.import_bgen(bgen_file, ['GT', 'GP'], sample_file)
        finally:
            # Remove the placeholder even when the assertion fails, so a
            # stray .idx file is not left behind in the resources.
            run_command(['rm', bgen_file + '.idx'])
Ejemplo n.º 22
0
    def test_n_partitions(self):
        """Requesting five partitions at import must yield five partitions."""
        bgen_path = resource('example.8bits.bgen')
        hl.index_bgen(bgen_path)

        requested_partitions = 5
        imported = hl.import_bgen(bgen_path,
                                  entry_fields=['dosage'],
                                  contig_recoding={'01': '1'},
                                  reference_genome='GRCh37',
                                  n_partitions=requested_partitions)
        self.assertEqual(imported.n_partitions(), 5)
Ejemplo n.º 23
0
    def test_import_bgen_dosage_entry(self):
        """Import only the ``dosage`` entry field and check its type and row count."""
        bgen_path = resource('example.8bits.bgen')
        hl.index_bgen(bgen_path,
                      contig_recoding={'01': '1'},
                      reference_genome='GRCh37')

        imported = hl.import_bgen(bgen_path, entry_fields=['dosage'])

        expected_entry_type = hl.tstruct(dosage=hl.tfloat64)
        self.assertEqual(imported.entry.dtype, expected_entry_type)
        self.assertEqual(imported.count_rows(), 199)
Ejemplo n.º 24
0
    def test_import_bgen_random(self):
        """Compare the BGEN import of ``random.bgen`` with the GEN import of ``random.gen``.

        The two datasets must be the same within an absolute tolerance of 1/255.
        """
        samples = resource('random.sample')
        from_gen = hl.import_gen(resource('random.gen'), samples)

        bgen_path = resource('random.bgen')
        hl.index_bgen(bgen_path)
        from_bgen = hl.import_bgen(bgen_path, ['GT', 'GP'], samples)

        matches = from_bgen._same(from_gen, tolerance=1.0 / 255, absolute=True)
        self.assertTrue(matches)
Ejemplo n.º 25
0
    def test_import_bgen_GT_GP_entries(self):
        """Import the ``GT`` and ``GP`` entry fields and check the entry type."""
        bgen_path = resource('example.8bits.bgen')
        hl.index_bgen(bgen_path,
                      contig_recoding={'01': '1'},
                      reference_genome='GRCh37')

        imported = hl.import_bgen(bgen_path,
                                  entry_fields=['GT', 'GP'],
                                  sample_file=resource('example.sample'))

        expected_entry_type = hl.tstruct(GT=hl.tcall, GP=hl.tarray(hl.tfloat64))
        self.assertEqual(imported.entry.dtype, expected_entry_type)
Ejemplo n.º 26
0
    def test_import_bgen_no_reference(self):
        """Index with ``reference_genome=None`` and check the imported locus type.

        The locus must be a struct of a string contig and an int32 position,
        and the dataset must have 199 rows.
        """
        bgen_path = resource('example.8bits.bgen')
        hl.index_bgen(bgen_path,
                      contig_recoding={'01': '1'},
                      reference_genome=None)

        imported = hl.import_bgen(bgen_path,
                                  entry_fields=['GT', 'GP', 'dosage'])

        struct_locus_type = hl.tstruct(contig=hl.tstr, position=hl.tint32)
        self.assertEqual(imported.locus.dtype, struct_locus_type)
        self.assertEqual(imported.count_rows(), 199)
Ejemplo n.º 27
0
    def test_import_bgen_dosage_entry(self):
        """Check that a dosage-only import has a float64 ``dosage`` entry and 199 rows."""
        path = resource('example.8bits.bgen')
        index_options = {
            'contig_recoding': {'01': '1'},
            'reference_genome': 'GRCh37',
        }
        hl.index_bgen(path, **index_options)

        dosage_mt = hl.import_bgen(path, entry_fields=['dosage'])
        self.assertEqual(dosage_mt.entry.dtype, hl.tstruct(dosage=hl.tfloat64))
        self.assertEqual(dosage_mt.count_rows(), 199)
Ejemplo n.º 28
0
    def test_multiple_files(self):
        """Import three BGEN files together and compare with the GEN import.

        The combined import must be the same as ``random.gen`` within an
        absolute tolerance of 1/255.
        """
        samples = resource('random.sample')
        from_gen = hl.import_gen(resource('random.gen'), samples)

        bgen_paths = [resource(name)
                      for name in ('random-b.bgen', 'random-c.bgen', 'random-a.bgen')]
        hl.index_bgen(bgen_paths)
        from_bgen = hl.import_bgen(bgen_paths, ['GT', 'GP'], samples, n_partitions=3)

        matches = from_bgen._same(from_gen, tolerance=1.0 / 255, absolute=True)
        self.assertTrue(matches)
Ejemplo n.º 29
0
    def test_n_partitions(self):
        """Requesting more partitions than variants must yield one partition per variant."""
        bgen_path = resource('example.8bits.bgen')
        hl.index_bgen(bgen_path,
                      contig_recoding={'01': '1'},
                      reference_genome='GRCh37')

        imported = hl.import_bgen(bgen_path,
                                  entry_fields=['dosage'],
                                  n_partitions=210)

        # only 199 variants in the file
        self.assertEqual(imported.n_partitions(), 199)
Ejemplo n.º 30
0
    def test_import_bgen_locus_filtering_from_exprs(self):
        """Filter by locus expressions taken from a full import of the same file.

        The filter is given once as a struct with a ``locus`` field and once
        as the bare locus expression. Both must reproduce the full import.
        """
        bgen_path = resource('example.8bits.bgen')
        hl.index_bgen(bgen_path, contig_recoding={'01': '1'})

        full_import = hl.import_bgen(bgen_path, ['GT'])
        self.assertEqual(full_import.count(), (199, 500))

        struct_filter = hl.struct(locus=full_import.locus)
        filtered_by_struct = hl.import_bgen(bgen_path, ['GT'],
                                            variants=struct_filter)
        self.assertTrue(full_import._same(filtered_by_struct))

        filtered_by_locus = hl.import_bgen(bgen_path, ['GT'],
                                           variants=full_import.locus)
        self.assertTrue(full_import._same(filtered_by_locus))
Ejemplo n.º 31
0
    def test_import_bgen_random(self):
        """Check that ``random.bgen`` imports to the same data as ``random.gen``.

        Values are compared with an absolute tolerance of 1/255.
        """
        sample_path = resource('random.sample')
        gen_dataset = hl.import_gen(resource('random.gen'), sample_path)

        bgen_path = resource('random.bgen')
        hl.index_bgen(bgen_path)
        bgen_dataset = hl.import_bgen(bgen_path, ['GT', 'GP'], sample_path)

        comparison_options = {'tolerance': 1.0 / 255, 'absolute': True}
        self.assertTrue(bgen_dataset._same(gen_dataset, **comparison_options))
Ejemplo n.º 32
0
    def test_import_bgen_row_fields(self):
        """Check the row type produced by different ``_row_fields`` selections.

        Covers the default, an empty list, ``varid`` only and ``rsid`` only,
        then checks that dropping fields from the default import gives the
        same data as importing with the smaller selection.
        """
        def load(**extra):
            # Options shared by every import in this test.
            return hl.import_bgen(resource('example.8bits.bgen'),
                                  entry_fields=['dosage'],
                                  contig_recoding={'01': '1'},
                                  reference_genome='GRCh37',
                                  **extra)

        def row_type(**extra_fields):
            # locus and alleles are present in all four row types.
            return hl.tstruct(locus=hl.tlocus('GRCh37'),
                              alleles=hl.tarray(hl.tstr),
                              **extra_fields)

        default_row_fields = load()
        self.assertEqual(default_row_fields.row.dtype,
                         row_type(rsid=hl.tstr, varid=hl.tstr))

        no_row_fields = load(_row_fields=[])
        self.assertEqual(no_row_fields.row.dtype, row_type())

        varid_only = load(_row_fields=['varid'])
        self.assertEqual(varid_only.row.dtype, row_type(varid=hl.tstr))

        rsid_only = load(_row_fields=['rsid'])
        self.assertEqual(rsid_only.row.dtype, row_type(rsid=hl.tstr))

        without_varid = default_row_fields.drop('varid')
        self.assertTrue(without_varid._same(rsid_only))

        without_rsid = default_row_fields.drop('rsid')
        self.assertTrue(without_rsid._same(varid_only))

        without_both = default_row_fields.drop('varid', 'rsid')
        self.assertTrue(without_both._same(no_row_fields))
Ejemplo n.º 33
0
    def test_import_bgen_GT_GP_entries(self):
        """Check the entry type of an import restricted to ``GT`` and ``GP``."""
        path = resource('example.8bits.bgen')
        index_options = {
            'contig_recoding': {'01': '1'},
            'reference_genome': 'GRCh37',
        }
        hl.index_bgen(path, **index_options)

        sample_path = resource('example.sample')
        gt_gp_mt = hl.import_bgen(path,
                                  entry_fields=['GT', 'GP'],
                                  sample_file=sample_path)
        self.assertEqual(gt_gp_mt.entry.dtype,
                         hl.tstruct(GT=hl.tcall, GP=hl.tarray(hl.tfloat64)))
Ejemplo n.º 34
0
    def test_n_partitions(self):
        """Ask for 210 partitions and expect 199, one per variant in the file."""
        path = resource('example.8bits.bgen')
        index_options = {
            'contig_recoding': {'01': '1'},
            'reference_genome': 'GRCh37',
        }
        hl.index_bgen(path, **index_options)

        dosage_mt = hl.import_bgen(path,
                                   entry_fields=['dosage'],
                                   n_partitions=210)
        actual_partitions = dosage_mt.n_partitions()
        # only 199 variants in the file
        self.assertEqual(actual_partitions, 199)
Ejemplo n.º 35
0
    def test_import_bgen_no_entries(self):
        """Import with no entry fields: the entry type must be an empty struct.

        The result's underlying ``_jvds`` object is also typechecked.
        """
        bgen_path = resource('example.8bits.bgen')
        hl.index_bgen(bgen_path,
                      contig_recoding={'01': '1'},
                      reference_genome='GRCh37')

        imported = hl.import_bgen(bgen_path,
                                  entry_fields=[],
                                  sample_file=resource('example.sample'))

        empty_entry_type = hl.tstruct()
        self.assertEqual(imported.entry.dtype, empty_entry_type)
        imported._jvds.typecheck()
Ejemplo n.º 36
0
    def test_import_bgen_no_entries(self):
        """Check an import with an empty ``entry_fields`` list.

        Expects an empty entry struct, then calls ``typecheck`` on the
        imported dataset's ``_jvds``.
        """
        path = resource('example.8bits.bgen')
        index_options = {
            'contig_recoding': {'01': '1'},
            'reference_genome': 'GRCh37',
        }
        hl.index_bgen(path, **index_options)

        sample_path = resource('example.sample')
        entryless_mt = hl.import_bgen(path,
                                      entry_fields=[],
                                      sample_file=sample_path)
        self.assertEqual(entryless_mt.entry.dtype, hl.tstruct())
        entryless_mt._jvds.typecheck()
Ejemplo n.º 37
0
    def test_import_bgen_variant_filtering_from_literals(self):
        """Filter a BGEN import by a literal list of variants.

        The import is run with one partition and with 199 partitions. In both
        cases the collected row keys must equal ``expected_result``, which
        lists the variant at 1:10000 twice because it is duplicated. The
        single-partition result must also match a full import filtered on the
        same row keys.
        """
        bgen_file = resource('example.8bits.bgen')
        hl.index_bgen(bgen_file,
                      contig_recoding={'01': '1'})

        alleles = ['A', 'G']

        desired_variants = [
            hl.Struct(locus=hl.Locus('1', 2000), alleles=alleles),
            hl.Struct(locus=hl.Locus('1', 2001), alleles=alleles),
            hl.Struct(locus=hl.Locus('1', 4000), alleles=alleles),
            hl.Struct(locus=hl.Locus('1', 10000), alleles=alleles),
            hl.Struct(locus=hl.Locus('1', 100001), alleles=alleles),
        ]

        expected_result = [
            hl.Struct(locus=hl.Locus('1', 2000), alleles=alleles),
            hl.Struct(locus=hl.Locus('1', 2001), alleles=alleles),
            hl.Struct(locus=hl.Locus('1', 4000), alleles=alleles),
            hl.Struct(locus=hl.Locus('1', 10000), alleles=alleles),
            hl.Struct(locus=hl.Locus('1', 10000), alleles=alleles),  # Duplicated variant
            hl.Struct(locus=hl.Locus('1', 100001), alleles=alleles),
        ]

        part_1 = hl.import_bgen(bgen_file,
                                ['GT'],
                                n_partitions=1,  # forcing seek to be called
                                variants=desired_variants)
        # assertEqual (instead of assertTrue(a == b)) reports both lists when
        # the comparison fails.
        self.assertEqual(
            part_1.rows().key_by('locus', 'alleles').select().collect(),
            expected_result)

        # forcing each variant to be its own partition for testing duplicates
        # work properly
        part_199 = hl.import_bgen(bgen_file,
                                  ['GT'],
                                  n_partitions=199,
                                  variants=desired_variants)
        self.assertEqual(
            part_199.rows().key_by('locus', 'alleles').select().collect(),
            expected_result)

        everything = hl.import_bgen(bgen_file, ['GT'])
        self.assertEqual(everything.count(), (199, 500))

        expected = everything.filter_rows(hl.set(desired_variants).contains(everything.row_key))

        self.assertTrue(expected._same(part_1))
Ejemplo n.º 38
0
    def test_import_bgen_empty_variant_filter(self):
        """An empty variant filter must produce an import with zero rows.

        Checked for an empty literal list and for a struct expression built
        from a dataset whose rows were all dropped.
        """
        bgen_path = resource('example.8bits.bgen')
        hl.index_bgen(bgen_path, contig_recoding={'01': '1'})

        def count_rows_filtered_by(variants):
            # Import with the given filter and return the resulting row count.
            filtered = hl.import_bgen(bgen_path, ['GT'],
                                      n_partitions=10,
                                      variants=variants)
            return filtered.count_rows()

        self.assertEqual(count_rows_filtered_by([]), 0)

        rowless = hl.import_bgen(bgen_path, ['GT']).drop_rows()
        self.assertEqual(rowless.count(), (0, 500))

        empty_filter = hl.struct(locus=rowless.locus, alleles=rowless.alleles)
        self.assertEqual(count_rows_filtered_by(empty_filter), 0)
Ejemplo n.º 39
0
def get_ukb_imputed_data(chromosome: str = '1',
                         variant_list: hl.Table = None,
                         entry_fields=('GP', )):
    """Import UK Biobank imputed BGEN data for a chromosome.

    :param chromosome: value substituted into ``ukb_imputed_bgen_path`` and
        passed to ``get_sample_file``. The special value ``'all'`` is replaced
        by a brace-enclosed, comma-separated list of 1 through 22.
    :param variant_list: optional table forwarded to ``import_bgen`` as
        ``variants``. It is omitted from the call when ``None``.
    :param entry_fields: entry fields to import.
    :return: the result of ``hl.import_bgen``.
    """
    if chromosome == 'all':
        autosomes = ','.join(str(number) for number in range(1, 23))
        chromosome = '{' + autosomes + '}'

    # Only pass `variants` when a filter table was actually supplied.
    optional_args = {} if variant_list is None else {'variants': variant_list}

    bgen_path = ukb_imputed_bgen_path.format(chromosome)
    sample_path = get_sample_file(chromosome)
    return hl.import_bgen(bgen_path,
                          entry_fields=entry_fields,
                          sample_file=sample_path,
                          **optional_args)
Ejemplo n.º 40
0
    def test_import_bgen_empty_variant_filter(self):
        """Check that filtering by no variants yields no rows.

        The empty filter is given first as ``[]`` and then as a struct of
        locus and alleles taken from a dataset filtered down to zero rows.
        """
        path = resource('example.8bits.bgen')
        hl.index_bgen(path, contig_recoding={'01': '1'})

        from_empty_list = hl.import_bgen(path, ['GT'],
                                         n_partitions=10,
                                         variants=[])
        self.assertEqual(from_empty_list.count_rows(), 0)

        full_import = hl.import_bgen(path, ['GT'])
        nothing = full_import.filter_rows(False)
        self.assertEqual(nothing.count(), (0, 500))

        empty_expr_filter = hl.struct(locus=nothing.locus,
                                      alleles=nothing.alleles)
        from_empty_expr = hl.import_bgen(path, ['GT'],
                                         n_partitions=10,
                                         variants=empty_expr_filter)
        self.assertEqual(from_empty_expr.count_rows(), 0)
Ejemplo n.º 41
0
    def test_import_bgen_dosage_and_gp_dosage_function_agree(self):
        """The imported ``dosage`` must agree with ``hl.gp_dosage`` applied to ``GP``.

        For every entry, either both values are missing or they differ by
        less than 1e-6.
        """
        # Map zero-padded contig names '01'..'09' to '1'..'9'.
        recoding = {'0' + str(digit): str(digit) for digit in range(1, 10)}

        samples = resource('example.sample')
        bgen_path = resource('example.8bits.bgen')
        hl.index_bgen(bgen_path, contig_recoding=recoding)

        imported = hl.import_bgen(bgen_path, ['GP', 'dosage'], samples)
        entries = imported.entries()
        entries = entries.transmute(gp_dosage=hl.gp_dosage(entries.GP))

        both_missing = (hl.is_missing(entries.dosage)
                        & hl.is_missing(entries.gp_dosage))
        close_enough = hl.abs(entries.dosage - entries.gp_dosage) < 1e-6
        self.assertTrue(entries.all(both_missing | close_enough))
Ejemplo n.º 42
0
    def test_specify_different_index_file(self):
        """Index and import through an explicit ``index_file_map``.

        A temp index path created with the ``idx2`` suffix must work and give
        a 30 by 10 dataset. A temp path created without a suffix must make
        ``index_bgen`` raise a FatalError about the missing ``.idx2``
        extension.
        """
        samples = resource('random.sample')
        bgen_path = resource('random.bgen')

        valid_map = {bgen_path: new_temp_file(suffix='idx2')}
        hl.index_bgen(bgen_path, index_file_map=valid_map)
        imported = hl.import_bgen(bgen_path, ['GT', 'GP'], samples,
                                  index_file_map=valid_map)
        self.assertEqual(imported.count(), (30, 10))

        with self.assertRaisesRegex(FatalError, 'missing a .idx2 file extension'):
            invalid_map = {bgen_path: new_temp_file()}
            hl.index_bgen(bgen_path, index_file_map=invalid_map)
Ejemplo n.º 43
0
def test_unphased_bgen(spark, tmp_path):
    """Run the lossless-adapter check on the 8-bit example BGEN file.

    The file is indexed and imported with Hail (``GP`` entries only, no
    reference genome for indexing), converted with
    ``functions.from_matrix_table`` and handed to
    ``_assert_lossless_adapter`` with an 8-bits-per-probability writer option.
    """
    bgen_path = 'test-data/bgen/example.8bits.bgen'
    spark.conf.set('spark.sql.autoBroadcastJoinThreshold', '-1')

    hl.index_bgen(bgen_path, reference_genome=None)
    matrix_table = hl.import_bgen(bgen_path, entry_fields=['GP'])
    hail_df = functions.from_matrix_table(matrix_table)

    writer_options = {'bitsPerProbability': '8'}
    _assert_lossless_adapter(spark, tmp_path, hail_df, bgen_path,
                             'bgen', 'bigbgen',
                             writer_options=writer_options)
Ejemplo n.º 44
0
    def test_import_bgen_gavin_example(self):
        """Compare the BGEN import of the example data with its GEN import.

        Both are loaded against GRCh37 with contigs '01'..'09' recoded to
        '1'..'9'; the two matrix tables must match within an absolute
        tolerance of 1/255.
        """
        contig_map = {'0' + str(n): str(n) for n in range(1, 10)}
        samples = resource('example.sample')

        from_gen = hl.import_gen(resource('example.gen'),
                                 samples,
                                 contig_recoding=contig_map,
                                 reference_genome="GRCh37")

        bgen_path = resource('example.8bits.bgen')
        hl.index_bgen(bgen_path,
                      contig_recoding=contig_map,
                      reference_genome="GRCh37")
        from_bgen = hl.import_bgen(bgen_path, ['GT', 'GP'], samples)

        matches = from_bgen._same(from_gen, tolerance=1.0 / 255, absolute=True)
        self.assertTrue(matches)
Ejemplo n.º 45
0
    def test_import_bgen_dosage_and_gp_dosage_function_agree(self):
        """Verify that ``dosage`` read from BGEN agrees with ``hl.gp_dosage``.

        An entry passes when both values are missing, or when their absolute
        difference is below 1e-6.
        """
        # '01' -> '1', ..., '09' -> '9'
        recode = dict(('0' + str(k), str(k)) for k in range(1, 10))

        sample_path = resource('example.sample')
        bgen_path = resource('example.8bits.bgen')
        hl.index_bgen(bgen_path, contig_recoding=recode)

        imported = hl.import_bgen(bgen_path, ['GP', 'dosage'], sample_path)
        table = imported.entries()
        table = table.transmute(gp_dosage=hl.gp_dosage(table.GP))

        neither_defined = hl.is_missing(table.dosage) & hl.is_missing(table.gp_dosage)
        nearly_equal = hl.abs(table.dosage - table.gp_dosage) < 1e-6
        agreement = table.all(neither_defined | nearly_equal)
        self.assertTrue(agreement)
Ejemplo n.º 46
0
    def test_import_bgen_row_fields(self):
        """Check how ``_row_fields`` shapes the row schema of an imported BGEN.

        The default import carries ``rsid`` and ``varid`` alongside ``locus``
        and ``alleles``; passing ``_row_fields`` restricts the extra fields to
        the ones listed. Dropping fields from the default import must give the
        same data as importing without them.
        """
        bgen_path = resource('example.8bits.bgen')
        hl.index_bgen(bgen_path,
                      contig_recoding={'01': '1'},
                      reference_genome='GRCh37')

        def load(**import_kwargs):
            # Every import reads the same file and the same entry field.
            return hl.import_bgen(bgen_path,
                                  entry_fields=['dosage'],
                                  **import_kwargs)

        def row_type(**extra_fields):
            # locus and alleles appear in every expected schema.
            return hl.tstruct(locus=hl.tlocus('GRCh37'),
                              alleles=hl.tarray(hl.tstr),
                              **extra_fields)

        default_row_fields = load()
        self.assertEqual(default_row_fields.row.dtype,
                         row_type(rsid=hl.tstr, varid=hl.tstr))

        no_row_fields = load(_row_fields=[])
        self.assertEqual(no_row_fields.row.dtype, row_type())

        varid_only = load(_row_fields=['varid'])
        self.assertEqual(varid_only.row.dtype, row_type(varid=hl.tstr))

        rsid_only = load(_row_fields=['rsid'])
        self.assertEqual(rsid_only.row.dtype, row_type(rsid=hl.tstr))

        without_varid = default_row_fields.drop('varid')
        self.assertTrue(without_varid._same(rsid_only))

        without_rsid = default_row_fields.drop('rsid')
        self.assertTrue(without_rsid._same(varid_only))

        without_either = default_row_fields.drop('varid', 'rsid')
        self.assertTrue(without_either._same(no_row_fields))
Ejemplo n.º 47
0
    def test_drop(self):
        """Filtering out all rows or all columns leaves the other axis intact.

        The example BGEN is expected to have 199 rows and 500 columns.
        """
        bgen_path = resource('example.8bits.bgen')
        hl.index_bgen(bgen_path,
                      contig_recoding={'01': '1'},
                      reference_genome='GRCh37')
        bgen = hl.import_bgen(bgen_path, entry_fields=['dosage'])

        # No rows kept: row count drops to zero, columns are untouched.
        rows_removed = bgen.filter_rows(False)
        self.assertEqual(rows_removed._force_count_rows(), 0)
        self.assertEqual(rows_removed._force_count_cols(), 500)

        # No columns kept: column count drops to zero, rows are untouched.
        cols_removed = bgen.filter_cols(False)
        self.assertEqual(cols_removed._force_count_rows(), 199)
        self.assertEqual(cols_removed._force_count_cols(), 0)
Ejemplo n.º 48
0
    def test_drop(self):
        """``drop_rows`` / ``drop_cols`` empty one axis and keep the other.

        The example BGEN is expected to have 199 rows and 500 columns.
        """
        bgen_path = resource('example.8bits.bgen')
        hl.index_bgen(bgen_path,
                      contig_recoding={'01': '1'},
                      reference_genome='GRCh37')
        bgen = hl.import_bgen(bgen_path, entry_fields=['dosage'])

        # After dropping rows only the 500 columns remain.
        rows_dropped = bgen.drop_rows()
        self.assertEqual(rows_dropped._force_count_rows(), 0)
        self.assertEqual(rows_dropped._force_count_cols(), 500)

        # After dropping columns only the 199 rows remain.
        cols_dropped = bgen.drop_cols()
        self.assertEqual(cols_dropped._force_count_rows(), 199)
        self.assertEqual(cols_dropped._force_count_cols(), 0)
Ejemplo n.º 49
0
    def test_import_bgen_gavin_example(self):
        """Compare the BGEN import of the example data with its GEN import.

        Contigs '01'..'09' are recoded to '1'..'9' at import time for both
        formats; the matrix tables must match within an absolute tolerance
        of 1/255.
        """
        contig_map = {'0' + str(n): str(n) for n in range(1, 10)}
        samples = resource('example.sample')

        from_gen = hl.import_gen(resource('example.gen'), samples,
                                 contig_recoding=contig_map)

        bgen_path = resource('example.8bits.bgen')
        hl.index_bgen(bgen_path)
        from_bgen = hl.import_bgen(bgen_path, ['GT', 'GP'], samples,
                                   contig_recoding=contig_map)

        matches = from_bgen._same(from_gen, tolerance=1.0 / 255, absolute=True)
        self.assertTrue(matches)
Ejemplo n.º 50
0
    def test_multiple_files(self):
        """Import three BGEN files together and compare with the GEN import.

        The files are passed in the order b, c, a; the combined matrix table
        must match the one read from ``random.gen`` within an absolute
        tolerance of 1/255.
        """
        samples = resource('random.sample')
        from_gen = hl.import_gen(resource('random.gen'), samples)

        bgen_paths = [resource('random-{}.bgen'.format(part))
                      for part in ('b', 'c', 'a')]
        hl.index_bgen(bgen_paths)
        from_bgen = hl.import_bgen(bgen_paths, ['GT', 'GP'], samples,
                                   n_partitions=3)

        matches = from_bgen._same(from_gen, tolerance=1.0 / 255, absolute=True)
        self.assertTrue(matches)
Ejemplo n.º 51
0
    def test_import_bgen_row_fields(self):
        """Check that ``file_row_idx`` matches each variant's position in the file."""
        mt = hl.import_bgen(resource('example.8bits.bgen'),
                            entry_fields=['dosage'],
                            contig_recoding={'01': '1'},
                            reference_genome='GRCh37',
                            _row_fields=['rsid', 'file_row_idx'])

        first_ten = mt.file_row_idx.take(10)
        self.assertEqual(first_ten, [99, 0, 100, 1, 101, 2, 102, 3, 103, 4])

        # The rsids are numbered 2 to 200 and correspond to the order of the
        # variants in the file (the loci are out of order in this file).
        #
        # The rsids look like "RSID_99", so stripping the five-character
        # prefix and subtracting 2 yields the zero-based file row index.
        prefix_length = len('RSID_')
        expected_indices = [int(rsid[prefix_length:]) - 2
                            for rsid in mt.rsid.collect()]
        self.assertEqual(mt.file_row_idx.collect(), expected_indices)
Ejemplo n.º 52
0
    def test_specify_different_index_file(self):
        """Index and import a BGEN file through an explicit ``index_file_map``.

        A temporary index path created with the ``idx2`` suffix must work for
        both ``index_bgen`` and ``import_bgen`` (``count()`` is ``(30, 10)``).
        A temporary path created without that suffix must make ``index_bgen``
        raise ``FatalError`` with the "missing a .idx2 file extension" message.
        """
        sample_file = resource('random.sample')
        bgen_file = resource('random.bgen')

        index_file_map = {bgen_file: new_temp_file(suffix='idx2')}
        hl.index_bgen(bgen_file, index_file_map=index_file_map)
        mt = hl.import_bgen(bgen_file, ['GT', 'GP'],
                            sample_file,
                            index_file_map=index_file_map)
        self.assertEqual(mt.count(), (30, 10))

        # Prepare the invalid mapping before entering the assertion context
        # so that only the index_bgen call itself is expected to raise.
        bad_index_file_map = {bgen_file: new_temp_file()}
        with self.assertRaisesRegex(FatalError,
                                    'missing a .idx2 file extension'):
            hl.index_bgen(bgen_file, index_file_map=bad_index_file_map)
Ejemplo n.º 53
0
    def test_import_bgen_locus_filtering_from_table(self):
        """Filter a BGEN import by a table of loci passed as ``variants``.

        The table is keyed by ``locus`` and holds the single locus 1:10000.
        The import is expected to return two rows, because that variant
        appears twice in the file.
        """
        bgen_file = resource('example.8bits.bgen')
        hl.index_bgen(bgen_file, contig_recoding={'01': '1'})

        desired_loci = hl.Table.parallelize([{'locus': hl.Locus('1', 10000)}],
                                            schema=hl.tstruct(locus=hl.tlocus()),
                                            key='locus')

        expected_result = [
            hl.Struct(locus=hl.Locus('1', 10000), alleles=['A', 'G']),
            hl.Struct(locus=hl.Locus('1', 10000), alleles=['A', 'G'])  # Duplicated variant
        ]

        result = hl.import_bgen(bgen_file,
                                ['GT'],
                                variants=desired_loci)

        # assertEqual (rather than assertTrue(a == b)) reports both values
        # when the comparison fails.
        actual_result = result.rows().key_by('locus', 'alleles').select().collect()
        self.assertEqual(actual_result, expected_result)
Ejemplo n.º 54
0
import hail as hl
from hail.linalg import BlockMatrix

# Import a VCF, add a constant row field and run a forced row count on it.
mt = hl.import_vcf('gs://hail-1kg/1kg_coreexome.vcf.bgz').annotate_rows(x=5)
mt._force_count_rows()

# Import a BGEN file (GT entries only) and run a forced row count on it.
mt = hl.import_bgen('gs://hail-ci/example.8bits.bgen', entry_fields=['GT'])
mt._force_count_rows()

# Convert a random 10 x 11 block matrix to numpy, first with the
# _force_blocking flag set and then with the default arguments.
bm = BlockMatrix.random(10, 11)
bm.to_numpy(_force_blocking=True)
bm.to_numpy()
Ejemplo n.º 55
0
def init(doctest_namespace):
    """Populate ``doctest_namespace`` with the objects used by the doc examples.

    This is a generator: it changes the working directory to ``docs/``,
    loads the example datasets and expression literals into
    ``doctest_namespace``, yields once, and then restores the previous
    working directory. The directory is restored in a ``finally`` clause so
    it is also reset when setup raises or the generator is closed early.

    NOTE(review): presumably registered as a pytest fixture; the decorator
    is not visible in this excerpt.

    Args:
        doctest_namespace: mapping that receives the example objects, keyed
            by the names the doc examples refer to.
    """
    # This gets run once per process -- must avoid race conditions
    print("setting up doctest...")

    olddir = os.getcwd()
    os.chdir("docs/")
    try:
        doctest_namespace['hl'] = hl
        doctest_namespace['agg'] = agg

        if not os.path.isdir("output/"):
            try:
                os.mkdir("output/")
            except OSError:
                # Best effort: another process may have created the
                # directory between the isdir check and the mkdir call.
                pass

        # Remove leftover output directories (paths are relative to docs/).
        files = ["sample.vds", "sample.qc.vds", "sample.filtered.vds"]
        for f in files:
            if os.path.isdir(f):
                shutil.rmtree(f)

        ds = hl.read_matrix_table('data/example.vds')
        doctest_namespace['ds'] = ds
        doctest_namespace['dataset'] = ds
        doctest_namespace['dataset2'] = ds.annotate_globals(global_field=5)
        doctest_namespace['dataset_to_union_1'] = ds
        doctest_namespace['dataset_to_union_2'] = ds

        v_metadata = ds.rows().annotate_globals(global_field=5).annotate(consequence='SYN')
        doctest_namespace['v_metadata'] = v_metadata

        s_metadata = ds.cols().annotate(pop='AMR', is_case=False, sex='F')
        doctest_namespace['s_metadata'] = s_metadata

        # Table
        table1 = hl.import_table('data/kt_example1.tsv', impute=True, key='ID')
        table1 = table1.annotate_globals(global_field_1=5, global_field_2=10)
        doctest_namespace['table1'] = table1
        doctest_namespace['other_table'] = table1

        table2 = hl.import_table('data/kt_example2.tsv', impute=True, key='ID')
        doctest_namespace['table2'] = table2

        table4 = hl.import_table('data/kt_example4.tsv', impute=True,
                                 types={'B': hl.tstruct(B0=hl.tbool, B1=hl.tstr),
                                        'D': hl.tstruct(cat=hl.tint32, dog=hl.tint32),
                                        'E': hl.tstruct(A=hl.tint32, B=hl.tint32)})
        doctest_namespace['table4'] = table4

        people_table = hl.import_table('data/explode_example.tsv', delimiter='\\s+',
                                       types={'Age': hl.tint32, 'Children': hl.tarray(hl.tstr)},
                                       key='Name')
        doctest_namespace['people_table'] = people_table

        # TDT
        doctest_namespace['tdt_dataset'] = hl.import_vcf('data/tdt_tiny.vcf')

        ds2 = hl.variant_qc(ds)
        doctest_namespace['ds2'] = ds2.select_rows(AF=ds2.variant_qc.AF)

        # Expressions
        doctest_namespace['names'] = hl.literal(['Alice', 'Bob', 'Charlie'])
        doctest_namespace['a1'] = hl.literal([0, 1, 2, 3, 4, 5])
        doctest_namespace['a2'] = hl.literal([1, -1, 1, -1, 1, -1])
        doctest_namespace['t'] = hl.literal(True)
        doctest_namespace['f'] = hl.literal(False)
        doctest_namespace['na'] = hl.null(hl.tbool)
        doctest_namespace['call'] = hl.call(0, 1, phased=False)
        doctest_namespace['a'] = hl.literal([1, 2, 3, 4, 5])
        doctest_namespace['d'] = hl.literal({'Alice': 43, 'Bob': 33, 'Charles': 44})
        doctest_namespace['interval'] = hl.interval(3, 11)
        doctest_namespace['locus_interval'] = hl.parse_locus_interval("1:53242-90543")
        doctest_namespace['locus'] = hl.locus('1', 1034245)
        doctest_namespace['x'] = hl.literal(3)
        doctest_namespace['y'] = hl.literal(4.5)
        doctest_namespace['s1'] = hl.literal({1, 2, 3})
        doctest_namespace['s2'] = hl.literal({1, 3, 5})
        doctest_namespace['s3'] = hl.literal({'Alice', 'Bob', 'Charlie'})
        doctest_namespace['struct'] = hl.struct(a=5, b='Foo')
        doctest_namespace['tup'] = hl.literal(("a", 1, [1, 2, 3]))
        doctest_namespace['s'] = hl.literal('The quick brown fox')
        doctest_namespace['interval2'] = hl.Interval(3, 6)

        # Overview
        doctest_namespace['ht'] = hl.import_table("data/kt_example1.tsv", impute=True)
        doctest_namespace['mt'] = ds

        gnomad_data = ds.rows()
        doctest_namespace['gnomad_data'] = gnomad_data.select(gnomad_data.info.AF)

        # BGEN
        bgen = hl.import_bgen('data/example.8bits.bgen',
                              entry_fields=['GT', 'GP', 'dosage'])
        doctest_namespace['variants_table'] = bgen.rows()

        print("finished setting up doctest...")
        yield
    finally:
        # Always return to the directory the process started in, even if
        # setup failed part-way through or the generator was closed early.
        os.chdir(olddir)
Ejemplo n.º 56
0
 def test_multiple_files_disjoint(self):
     """Importing the three ``*-disjoint`` BGEN files together must fail.

     ``import_bgen`` is expected to raise ``FatalError`` with the message
     that each file must cover a region disjoint from the other files.
     """
     samples = resource('random.sample')
     bgen_paths = [
         resource('random-b-disjoint.bgen'),
         resource('random-c-disjoint.bgen'),
         resource('random-a-disjoint.bgen'),
     ]
     hl.index_bgen(bgen_paths)

     expected_message = 'Each BGEN file must contain a region of the genome disjoint from other files'
     with self.assertRaisesRegex(FatalError, expected_message):
         hl.import_bgen(bgen_paths, ['GT', 'GP'], samples, n_partitions=3)