コード例 #1
0
ファイル: test_impex.py プロジェクト: lfrancioli/hail
 def test_parallel_import(self):
     """Index and import the parallel-export BGEN, then check its dimensions."""
     bgen_path = resource('parallelBgenExport.bgen')
     hl.index_bgen(bgen_path)

     sample_path = resource('parallelBgenExport.sample')
     entry_fields = ['GT', 'GP']
     imported = hl.import_bgen(bgen_path, entry_fields, sample_path)

     expected_shape = (16, 10)
     self.assertEqual(imported.count(), expected_shape)
コード例 #2
0
    def test_multiple_files_variant_filtering(self):
        """Import three BGEN files restricted to a list of variant literals.

        The restricted import must hold exactly the six requested variants
        and must match the unrestricted import filtered to the same keys.
        """
        file_names = ('random-b.bgen', 'random-c.bgen', 'random-a.bgen')
        bgen_paths = [resource(name) for name in file_names]
        hl.index_bgen(bgen_paths)

        alleles = ['A', 'G']
        # Positions are deliberately listed out of order.
        positions = (11, 13, 29, 28, 1, 12)
        desired_variants = [
            hl.Struct(locus=hl.Locus('20', position), alleles=alleles)
            for position in positions
        ]

        restricted = hl.import_bgen(bgen_paths, ['GT'],
                                    n_partitions=10,
                                    variants=desired_variants)
        self.assertEqual(restricted.count_rows(), 6)

        unrestricted = hl.import_bgen(bgen_paths, ['GT'])
        self.assertEqual(unrestricted.count(), (30, 10))

        wanted_keys = hl.set(desired_variants)
        reference = unrestricted.filter_rows(
            wanted_keys.contains(unrestricted.row_key))

        self.assertTrue(reference._same(restricted))
コード例 #3
0
ファイル: test_impex.py プロジェクト: lfrancioli/hail
    def test_import_bgen_locus_filtering_from_literals(self):
        """Check that ``variants`` accepts locus-only literals in both forms.

        The filter is given once as a list of ``Struct(locus=...)`` and once
        as a list of bare ``Locus`` objects. Either way the import is
        expected to yield both copies of the duplicated 1:10000 A/G variant.
        """
        bgen_file = resource('example.8bits.bgen')
        hl.index_bgen(bgen_file,
                      contig_recoding={'01': '1'})

        # Test with Struct(Locus)
        desired_loci = [hl.Struct(locus=hl.Locus('1', 10000))]

        expected_result = [
            hl.Struct(locus=hl.Locus('1', 10000), alleles=['A', 'G']),
            hl.Struct(locus=hl.Locus('1', 10000), alleles=['A', 'G'])  # Duplicated variant
        ]

        locus_struct = hl.import_bgen(bgen_file,
                                      ['GT'],
                                      variants=desired_loci)
        # assertEqual (rather than assertTrue(a == b)) reports both lists
        # when the comparison fails.
        self.assertEqual(
            locus_struct.rows().key_by('locus', 'alleles').select().collect(),
            expected_result)

        # Test with Locus object
        desired_loci = [hl.Locus('1', 10000)]

        locus_object = hl.import_bgen(bgen_file,
                                      ['GT'],
                                      variants=desired_loci)
        self.assertEqual(
            locus_object.rows().key_by('locus', 'alleles').select().collect(),
            expected_result)
コード例 #4
0
ファイル: test_impex.py プロジェクト: lfrancioli/hail
    def test_multiple_files_variant_filtering(self):
        """Variant-literal filtering across several BGEN files.

        Compares an import restricted through ``variants=`` with the full
        import filtered afterwards by the same set of row keys.
        """
        bgen_paths = [
            resource('random-b.bgen'),
            resource('random-c.bgen'),
            resource('random-a.bgen'),
        ]
        hl.index_bgen(bgen_paths)

        alleles = ['A', 'G']

        def variant_at(position):
            # Every requested variant lies on contig '20' and shares `alleles`.
            return hl.Struct(locus=hl.Locus('20', position), alleles=alleles)

        desired_variants = [variant_at(p) for p in (11, 13, 29, 28, 1, 12)]

        filtered_on_import = hl.import_bgen(
            bgen_paths, ['GT'], n_partitions=10, variants=desired_variants)
        self.assertEqual(filtered_on_import.count_rows(), 6)

        full = hl.import_bgen(bgen_paths, ['GT'])
        self.assertEqual(full.count(), (30, 10))

        is_desired = hl.set(desired_variants).contains(full.row_key)
        filtered_afterwards = full.filter_rows(is_desired)

        self.assertTrue(filtered_afterwards._same(filtered_on_import))
コード例 #5
0
    def test_matrix_ir_parses(self):
        """Render each matrix IR node to a string and feed it to the JVM parser.

        A parse failure is re-raised as ``ValueError`` carrying the rendered
        IR so the offending node can be identified.
        """
        hl.index_bgen(resource('example.8bits.bgen'),
                      reference_genome=hail.get_reference('GRCh37'),
                      contig_recoding={'01': '1'})

        agg_sig = ir.AggSignature('Collect', [], None, [hl.tint32])
        collect_struct = ir.MakeStruct([
            ('x', ir.ApplyAggOp([ir.I32(0)], [], None, agg_sig,
                                hl.tarray(hl.tint32))),
        ])

        mt_read = ir.MatrixRead(
            resource('backward_compatability/1.0.0/matrix_table/0.hmt'),
            False, False)
        ht_read = ir.TableRead(
            resource('backward_compatability/1.0.0/table/0.ht'), False, None)

        def x_struct():
            # Build a fresh node per use so no instance is shared between IRs.
            return ir.MakeStruct([('x', ir.I64(20))])

        matrix_irs = [
            ir.MatrixUnionRows(ir.MatrixRange(5, 5, 1), ir.MatrixRange(5, 5, 1)),
            ir.UnlocalizeEntries(ir.LocalizeEntries(mt_read, '__entries'),
                                 ir.MatrixColsTable(mt_read),
                                 '__entries'),
            ir.MatrixAggregateRowsByKey(mt_read, collect_struct, collect_struct),
            ir.MatrixAggregateColsByKey(mt_read, collect_struct, collect_struct),
            ir.MatrixRange(1, 1, 10),
            ir.MatrixImportVCF([resource('sample.vcf')], False, False, None, None, False, ['GT'],
                               hail.get_reference('GRCh37'), {}, True, False),
            ir.MatrixImportBGEN([resource('example.8bits.bgen')], ['GP'],
                                resource('example.sample'), {}, 10, 1, ['varid'], None),
            ir.MatrixFilterRows(mt_read, ir.FalseIR()),
            ir.MatrixFilterCols(mt_read, ir.FalseIR()),
            ir.MatrixFilterEntries(mt_read, ir.FalseIR()),
            ir.MatrixChooseCols(mt_read, [1, 0]),
            ir.MatrixMapCols(mt_read, x_struct(), ['x']),
            ir.MatrixKeyRowsBy(mt_read, ['row_i64'], False),
            ir.MatrixMapRows(ir.MatrixKeyRowsBy(mt_read, []), x_struct()),
            ir.MatrixMapEntries(mt_read, x_struct()),
            ir.MatrixMapGlobals(mt_read, x_struct()),
            ir.TableToMatrixTable(ht_read, ['f32', 'i64'], ['m', 'astruct'], ['aset'], ['mset'], 100),
            ir.MatrixCollectColsByKey(mt_read),
            ir.MatrixExplodeRows(mt_read, ['row_aset']),
            ir.MatrixExplodeCols(mt_read, ['col_aset']),
            ir.MatrixAnnotateRowsTable(mt_read, ht_read, '__foo', None),
            ir.MatrixAnnotateColsTable(mt_read, ht_read, '__foo'),
        ]

        for node in matrix_irs:
            rendered = str(node)
            try:
                Env.hail().expr.Parser.parse_matrix_ir(rendered)
            except Exception as err:
                raise ValueError(rendered) from err
コード例 #6
0
ファイル: test_impex.py プロジェクト: lfrancioli/hail
    def test_import_bgen_no_reference(self):
        """Index with ``reference_genome=None`` and check the locus is a plain struct."""
        index_options = {'contig_recoding': {'01': '1'}, 'reference_genome': None}
        hl.index_bgen(resource('example.8bits.bgen'), **index_options)

        mt = hl.import_bgen(resource('example.8bits.bgen'),
                            entry_fields=['GT', 'GP', 'dosage'])

        expected_locus_type = hl.tstruct(contig=hl.tstr, position=hl.tint32)
        self.assertEqual(mt.locus.dtype, expected_locus_type)
        self.assertEqual(mt.count_rows(), 199)
コード例 #7
0
ファイル: test_impex.py プロジェクト: lfrancioli/hail
    def test_import_bgen_GT_GP_entries(self):
        """Request the GT and GP entry fields and verify the entry schema."""
        hl.index_bgen(
            resource('example.8bits.bgen'),
            contig_recoding={'01': '1'},
            reference_genome='GRCh37',
        )

        mt = hl.import_bgen(
            resource('example.8bits.bgen'),
            entry_fields=['GT', 'GP'],
            sample_file=resource('example.sample'),
        )

        expected_entry_type = hl.tstruct(GT=hl.tcall, GP=hl.tarray(hl.tfloat64))
        self.assertEqual(mt.entry.dtype, expected_entry_type)
コード例 #8
0
ファイル: test_impex.py プロジェクト: lfrancioli/hail
    def test_import_bgen_dosage_entry(self):
        """Import only the dosage entry field; check the schema and row count."""
        index_kwargs = dict(contig_recoding={'01': '1'}, reference_genome='GRCh37')
        hl.index_bgen(resource('example.8bits.bgen'), **index_kwargs)

        mt = hl.import_bgen(resource('example.8bits.bgen'), entry_fields=['dosage'])

        dosage_only = hl.tstruct(dosage=hl.tfloat64)
        self.assertEqual(mt.entry.dtype, dosage_only)
        self.assertEqual(mt.count_rows(), 199)
コード例 #9
0
ファイル: test_impex.py プロジェクト: maccum/hail
    def test_n_partitions(self):
        """Ask for five partitions on import and confirm five are produced."""
        hl.index_bgen(resource('example.8bits.bgen'))

        import_options = dict(
            entry_fields=['dosage'],
            contig_recoding={'01': '1'},
            reference_genome='GRCh37',
            n_partitions=5,
        )
        mt = hl.import_bgen(resource('example.8bits.bgen'), **import_options)

        self.assertEqual(mt.n_partitions(), 5)
コード例 #10
0
ファイル: test_impex.py プロジェクト: lfrancioli/hail
    def test_import_bgen_random(self):
        """Compare the BGEN import of the 'random' dataset with its GEN import."""
        sample_path = resource('random.sample')
        from_gen = hl.import_gen(resource('random.gen'), sample_path)

        bgen_path = resource('random.bgen')
        hl.index_bgen(bgen_path)
        from_bgen = hl.import_bgen(bgen_path, ['GT', 'GP'], sample_path)

        # Absolute tolerance of 1/255 (presumably one 8-bit probability
        # step -- confirm against the BGEN encoding).
        matches = from_bgen._same(from_gen, tolerance=1.0 / 255, absolute=True)
        self.assertTrue(matches)
コード例 #11
0
 def _create(self, resource_dir):
     """Download the 'sim_ukb' BGEN and sample files into ``resource_dir``
     and build an index for the downloaded BGEN."""
     bgen = 'sim_ukb.bgen'
     sample = 'sim_ukb.sample'
     for filename in (bgen, sample):
         download(resource_dir, filename)

     logging.info(f'indexing {bgen}...')
     hl.index_bgen(os.path.join(resource_dir, bgen))
     logging.info(f'done indexing {bgen}.')
コード例 #12
0
    def test_import_bgen_dosage_entry(self):
        """A dosage-only import has a single float64 entry field and 199 rows."""
        hl.index_bgen(
            resource('example.8bits.bgen'),
            contig_recoding={'01': '1'},
            reference_genome='GRCh37',
        )

        dosage_mt = hl.import_bgen(
            resource('example.8bits.bgen'),
            entry_fields=['dosage'],
        )

        expected_entry_type = hl.tstruct(dosage=hl.tfloat64)
        self.assertEqual(dosage_mt.entry.dtype, expected_entry_type)

        n_rows = dosage_mt.count_rows()
        self.assertEqual(n_rows, 199)
コード例 #13
0
ファイル: test_impex.py プロジェクト: lfrancioli/hail
    def test_multiple_references_throws_error(self):
        """Importing BGENs indexed with different reference genomes must fail."""
        sample_path = resource('random.sample')
        unreferenced_bgen = resource('random-b.bgen')
        grch37_bgen = resource('random-c.bgen')

        # Index the two files with conflicting reference genome settings.
        index_plan = ((unreferenced_bgen, None), (grch37_bgen, 'GRCh37'))
        for path, genome in index_plan:
            hl.index_bgen(path, reference_genome=genome)

        expected_message = 'Found multiple reference genomes were specified in the BGEN index files'
        with self.assertRaisesRegex(FatalError, expected_message):
            hl.import_bgen([unreferenced_bgen, grch37_bgen], ['GT'], sample_file=sample_path)
コード例 #14
0
ファイル: test_impex.py プロジェクト: lfrancioli/hail
    def test_multiple_files(self):
        """Import three BGEN files together and compare with the single GEN import."""
        sample_path = resource('random.sample')
        from_gen = hl.import_gen(resource('random.gen'), sample_path)

        bgen_paths = [
            resource(name)
            for name in ('random-b.bgen', 'random-c.bgen', 'random-a.bgen')
        ]
        hl.index_bgen(bgen_paths)
        from_bgen = hl.import_bgen(bgen_paths, ['GT', 'GP'], sample_path, n_partitions=3)

        agrees = from_bgen._same(from_gen, tolerance=1.0 / 255, absolute=True)
        self.assertTrue(agrees)
コード例 #15
0
ファイル: test_impex.py プロジェクト: lfrancioli/hail
    def test_n_partitions(self):
        """Requesting more partitions than variants yields one per variant."""
        index_kwargs = dict(contig_recoding={'01': '1'}, reference_genome='GRCh37')
        hl.index_bgen(resource('example.8bits.bgen'), **index_kwargs)

        mt = hl.import_bgen(resource('example.8bits.bgen'),
                            entry_fields=['dosage'],
                            n_partitions=210)

        # only 199 variants in the file
        expected_partitions = 199
        self.assertEqual(mt.n_partitions(), expected_partitions)
コード例 #16
0
    def test_import_bgen_random(self):
        """The BGEN and GEN imports of the 'random' dataset should agree."""
        samples = resource('random.sample')
        gen_mt = hl.import_gen(resource('random.gen'), samples)

        bgen_path = resource('random.bgen')
        hl.index_bgen(bgen_path)
        bgen_mt = hl.import_bgen(bgen_path, ['GT', 'GP'], samples)

        comparison_options = dict(tolerance=1.0 / 255, absolute=True)
        self.assertTrue(bgen_mt._same(gen_mt, **comparison_options))
コード例 #17
0
ファイル: test_impex.py プロジェクト: lfrancioli/hail
    def test_import_bgen_no_entries(self):
        """Import with an empty entry field list; the entry struct must be empty."""
        index_kwargs = dict(contig_recoding={'01': '1'}, reference_genome='GRCh37')
        hl.index_bgen(resource('example.8bits.bgen'), **index_kwargs)

        mt = hl.import_bgen(resource('example.8bits.bgen'),
                            entry_fields=[],
                            sample_file=resource('example.sample'))

        empty_struct = hl.tstruct()
        self.assertEqual(mt.entry.dtype, empty_struct)
        # Also run the JVM-side typecheck on the underlying dataset.
        mt._jvds.typecheck()
コード例 #18
0
    def test_import_bgen_GT_GP_entries(self):
        """Importing GT and GP yields a call field and an array<float64> field."""
        index_kwargs = dict(contig_recoding={'01': '1'}, reference_genome='GRCh37')
        hl.index_bgen(resource('example.8bits.bgen'), **index_kwargs)

        gt_gp_mt = hl.import_bgen(resource('example.8bits.bgen'),
                                  entry_fields=['GT', 'GP'],
                                  sample_file=resource('example.sample'))

        actual_type = gt_gp_mt.entry.dtype
        self.assertEqual(
            actual_type,
            hl.tstruct(GT=hl.tcall, GP=hl.tarray(hl.tfloat64)))
コード例 #19
0
    def test_import_bgen_no_reference(self):
        """With no reference genome the locus is a (contig, position) struct."""
        hl.index_bgen(
            resource('example.8bits.bgen'),
            contig_recoding={'01': '1'},
            reference_genome=None,
        )

        all_fields = ['GT', 'GP', 'dosage']
        imported = hl.import_bgen(resource('example.8bits.bgen'),
                                  entry_fields=all_fields)

        locus_type = imported.locus.dtype
        self.assertEqual(locus_type,
                         hl.tstruct(contig=hl.tstr, position=hl.tint32))

        n_rows = imported.count_rows()
        self.assertEqual(n_rows, 199)
コード例 #20
0
    def test_n_partitions(self):
        """The partition count is capped by the number of variants."""
        hl.index_bgen(
            resource('example.8bits.bgen'),
            contig_recoding={'01': '1'},
            reference_genome='GRCh37',
        )

        requested_partitions = 210
        dosage_mt = hl.import_bgen(resource('example.8bits.bgen'),
                                   entry_fields=['dosage'],
                                   n_partitions=requested_partitions)

        # only 199 variants in the file
        actual_partitions = dosage_mt.n_partitions()
        self.assertEqual(actual_partitions, 199)
コード例 #21
0
    def test_import_bgen_no_entries(self):
        """An import that requests no entry fields has an empty entry struct."""
        hl.index_bgen(
            resource('example.8bits.bgen'),
            contig_recoding={'01': '1'},
            reference_genome='GRCh37',
        )

        no_fields = []
        entryless = hl.import_bgen(resource('example.8bits.bgen'),
                                   entry_fields=no_fields,
                                   sample_file=resource('example.sample'))

        entry_type = entryless.entry.dtype
        self.assertEqual(entry_type, hl.tstruct())
        # Run the JVM-side typecheck on the underlying dataset as well.
        entryless._jvds.typecheck()
コード例 #22
0
    def test_matrix_ir_parses(self):
        """Stringify each matrix IR node and confirm the JVM IR parser accepts it.

        Any parser failure is re-raised as ``ValueError`` whose message is the
        rendered IR, chained to the original exception.
        """
        hl.index_bgen(resource('example.8bits.bgen'),
                      reference_genome=hl.get_reference('GRCh37'),
                      contig_recoding={'01': '1'})

        collect_struct = ir.MakeStruct([
            ('x', ir.ApplyAggOp('Collect', [], None, [ir.I32(0)])),
        ])

        mt_read = ir.MatrixRead(
            ir.MatrixNativeReader(resource('backward_compatability/1.0.0/matrix_table/0.hmt')),
            False,
            False)
        ht_read = ir.TableRead(
            ir.TableNativeReader(resource('backward_compatability/1.0.0/table/0.ht')),
            False)

        mt_range = ir.MatrixRead(ir.MatrixRangeReader(1, 1, 10))

        def x_struct():
            # Build a fresh node per use so no instance is shared between IRs.
            return ir.MakeStruct([('x', ir.I64(20))])

        matrix_irs = [
            ir.MatrixRepartition(mt_range, 100, ir.RepartitionStrategy.SHUFFLE),
            ir.MatrixUnionRows(mt_range, mt_range),
            ir.MatrixDistinctByRow(mt_range),
            ir.MatrixRowsHead(mt_read, 5),
            ir.CastTableToMatrix(ir.CastMatrixToTable(mt_read, '__entries', '__cols'),
                                 '__entries', '__cols', []),
            ir.MatrixAggregateRowsByKey(mt_read, collect_struct, collect_struct),
            ir.MatrixAggregateColsByKey(mt_read, collect_struct, collect_struct),
            mt_read,
            mt_range,
            ir.MatrixRead(
                ir.MatrixVCFReader(resource('sample.vcf'), ['GT'], hl.tfloat64, None, None, None, None,
                                   False, True, False, True, None, None, None)),
            ir.MatrixRead(
                ir.MatrixBGENReader(resource('example.8bits.bgen'), None, {}, 10, 1, None)),
            ir.MatrixFilterRows(mt_read, ir.FalseIR()),
            ir.MatrixFilterCols(mt_read, ir.FalseIR()),
            ir.MatrixFilterEntries(mt_read, ir.FalseIR()),
            ir.MatrixChooseCols(mt_read, [1, 0]),
            ir.MatrixMapCols(mt_read, x_struct(), ['x']),
            ir.MatrixKeyRowsBy(mt_read, ['row_i64'], False),
            ir.MatrixMapRows(ir.MatrixKeyRowsBy(mt_read, []), x_struct()),
            ir.MatrixMapEntries(mt_read, x_struct()),
            ir.MatrixMapGlobals(mt_read, x_struct()),
            ir.TableToMatrixTable(ht_read, ['f32', 'i64'], ['m', 'astruct'], ['aset'], ['mset'], 100),
            ir.MatrixCollectColsByKey(mt_read),
            ir.MatrixExplodeRows(mt_read, ['row_aset']),
            ir.MatrixExplodeCols(mt_read, ['col_aset']),
            ir.MatrixAnnotateRowsTable(mt_read, ht_read, '__foo'),
            ir.MatrixAnnotateColsTable(mt_read, ht_read, '__foo'),
            ir.MatrixToMatrixApply(mt_read, {'name': 'MatrixFilterPartitions', 'parts': [0], 'keep': True}),
        ]

        for node in matrix_irs:
            rendered = str(node)
            try:
                Env.hail().expr.ir.IRParser.parse_matrix_ir(rendered)
            except Exception as err:
                raise ValueError(rendered) from err
コード例 #23
0
    def matrix_irs(self):
        """Index the example BGEN and return a list of sample matrix IR nodes.

        The nodes are built from a native matrix-table read, a native table
        read and a small range read, and are returned in a fixed order.
        """
        hl.index_bgen(resource('example.8bits.bgen'),
                      reference_genome=hl.get_reference('GRCh37'),
                      contig_recoding={'01': '1'})

        collect_struct = ir.MakeStruct([
            ('x', ir.ApplyAggOp('Collect', [], None, [ir.I32(0)])),
        ])

        mt_read = ir.MatrixRead(
            ir.MatrixNativeReader(resource('backward_compatability/1.0.0/matrix_table/0.hmt'), None, False),
            False,
            False)
        ht_read = ir.TableRead(
            ir.TableNativeReader(resource('backward_compatability/1.0.0/table/0.ht'), None, False),
            False)

        mt_range = ir.MatrixRead(ir.MatrixRangeReader(1, 1, 10))

        def x_struct():
            # Build a fresh node per use so no instance is shared between IRs.
            return ir.MakeStruct([('x', ir.I64(20))])

        return [
            ir.MatrixRepartition(mt_range, 100, ir.RepartitionStrategy.SHUFFLE),
            ir.MatrixUnionRows(mt_range, mt_range),
            ir.MatrixDistinctByRow(mt_range),
            ir.MatrixRowsHead(mt_read, 5),
            ir.MatrixColsHead(mt_read, 5),
            ir.CastTableToMatrix(ir.CastMatrixToTable(mt_read, '__entries', '__cols'),
                                 '__entries', '__cols', []),
            ir.MatrixAggregateRowsByKey(mt_read, collect_struct, collect_struct),
            ir.MatrixAggregateColsByKey(mt_read, collect_struct, collect_struct),
            mt_read,
            mt_range,
            ir.MatrixRead(
                ir.MatrixVCFReader(resource('sample.vcf'), ['GT'], hl.tfloat64, None, None, None, None,
                                   False, True, False, True, None, None, None)),
            ir.MatrixRead(
                ir.MatrixBGENReader(resource('example.8bits.bgen'), None, {}, 10, 1, None)),
            ir.MatrixFilterRows(mt_read, ir.FalseIR()),
            ir.MatrixFilterCols(mt_read, ir.FalseIR()),
            ir.MatrixFilterEntries(mt_read, ir.FalseIR()),
            ir.MatrixChooseCols(mt_read, [1, 0]),
            ir.MatrixMapCols(mt_read, x_struct(), ['x']),
            ir.MatrixKeyRowsBy(mt_read, ['row_i64'], False),
            ir.MatrixMapRows(ir.MatrixKeyRowsBy(mt_read, []), x_struct()),
            ir.MatrixMapEntries(mt_read, x_struct()),
            ir.MatrixMapGlobals(mt_read, x_struct()),
            ir.MatrixCollectColsByKey(mt_read),
            ir.MatrixExplodeRows(mt_read, ['row_aset']),
            ir.MatrixExplodeCols(mt_read, ['col_aset']),
            ir.MatrixAnnotateRowsTable(mt_read, ht_read, '__foo'),
            ir.MatrixAnnotateColsTable(mt_read, ht_read, '__foo'),
            ir.MatrixToMatrixApply(mt_read, {'name': 'MatrixFilterPartitions', 'parts': [0], 'keep': True}),
            ir.MatrixRename(mt_read,
                            {'global_f32': 'global_foo'},
                            {'col_f32': 'col_foo'},
                            {'row_aset': 'row_aset2'},
                            {'entry_f32': 'entry_foo'}),
            ir.MatrixFilterIntervals(
                mt_read,
                [hl.utils.Interval(hl.utils.Struct(row_idx=0), hl.utils.Struct(row_idx=10))],
                hl.tstruct(row_idx=hl.tint32),
                keep=False),
        ]
コード例 #24
0
ファイル: test_impex.py プロジェクト: lfrancioli/hail
    def test_specify_different_index_file(self):
        """Exercise ``index_file_map`` for both indexing and importing.

        First writes the index to a temp path with the 'idx2' suffix and
        imports through the same mapping. Then checks that indexing to a
        temp path created without that suffix raises ``FatalError``.
        """
        sample_file = resource('random.sample')
        bgen_file = resource('random.bgen')
        index_file = new_temp_file(suffix='idx2')
        index_file_map = {bgen_file: index_file}
        hl.index_bgen(bgen_file, index_file_map=index_file_map)
        mt = hl.import_bgen(bgen_file, ['GT', 'GP'], sample_file, index_file_map=index_file_map)
        self.assertEqual(mt.count(), (30, 10))

        # Prepare the invalid mapping outside the assertRaises context so that
        # only the index_bgen call itself can satisfy the expectation.
        bad_index_file_map = {bgen_file: new_temp_file()}
        with self.assertRaisesRegex(FatalError, 'missing a .idx2 file extension'):
            hl.index_bgen(bgen_file, index_file_map=bad_index_file_map)
コード例 #25
0
    def test_import_bgen_dosage_and_gp_dosage_function_agree(self):
        """The imported ``dosage`` field must match ``hl.gp_dosage`` applied to GP.

        Entries pass when both values are missing or differ by less than 1e-6.
        """
        # Maps '01'..'09' to '1'..'9'.
        recoding = {'0' + digit: digit for digit in map(str, range(1, 10))}

        sample_path = resource('example.sample')
        bgen_path = resource('example.8bits.bgen')
        hl.index_bgen(bgen_path, contig_recoding=recoding)

        entries = hl.import_bgen(bgen_path, ['GP', 'dosage'], sample_path).entries()
        entries = entries.transmute(gp_dosage=hl.gp_dosage(entries.GP))

        both_missing = hl.is_missing(entries.dosage) & hl.is_missing(entries.gp_dosage)
        close_enough = hl.abs(entries.dosage - entries.gp_dosage) < 1e-6
        self.assertTrue(entries.all(both_missing | close_enough))
コード例 #26
0
    def test_multiple_references_throws_error(self):
        """Mixing index files built with different reference genomes is an error."""
        samples = resource('random.sample')
        first_bgen = resource('random-b.bgen')
        second_bgen = resource('random-c.bgen')

        # One index has no reference genome, the other uses GRCh37.
        hl.index_bgen(first_bgen, reference_genome=None)
        hl.index_bgen(second_bgen, reference_genome='GRCh37')

        message = 'Found multiple reference genomes were specified in the BGEN index files'
        both_files = [first_bgen, second_bgen]
        with self.assertRaisesRegex(FatalError, message):
            hl.import_bgen(both_files, ['GT'], sample_file=samples)
コード例 #27
0
def test_unphased_bgen(spark, tmp_path):
    """Convert a Hail BGEN import (GP entries only) to a DataFrame and pass
    it to the lossless-adapter check with 8 bits per probability."""
    spark.conf.set('spark.sql.autoBroadcastJoinThreshold', '-1')

    input_bgen = 'test-data/bgen/example.8bits.bgen'
    hl.index_bgen(input_bgen, reference_genome=None)

    matrix_table = hl.import_bgen(input_bgen, entry_fields=['GP'])
    hail_df = functions.from_matrix_table(matrix_table)

    writer_options = {'bitsPerProbability': '8'}
    _assert_lossless_adapter(spark, tmp_path, hail_df, input_bgen,
                             'bgen', 'bigbgen',
                             writer_options=writer_options)
コード例 #28
0
ファイル: test_impex.py プロジェクト: lfrancioli/hail
    def test_import_bgen_gavin_example(self):
        """Compare the GEN and BGEN imports of the 'example' dataset on GRCh37."""
        # Maps '01'..'09' to '1'..'9'.
        recoding = {'0' + digit: digit for digit in map(str, range(1, 10))}
        shared_options = dict(contig_recoding=recoding, reference_genome="GRCh37")

        sample_path = resource('example.sample')
        from_gen = hl.import_gen(resource('example.gen'), sample_path, **shared_options)

        bgen_path = resource('example.8bits.bgen')
        hl.index_bgen(bgen_path, **shared_options)
        from_bgen = hl.import_bgen(bgen_path, ['GT', 'GP'], sample_path)

        agrees = from_bgen._same(from_gen, tolerance=1.0 / 255, absolute=True)
        self.assertTrue(agrees)
コード例 #29
0
ファイル: test_impex.py プロジェクト: lfrancioli/hail
    def test_import_bgen_dosage_and_gp_dosage_function_agree(self):
        """Check the ``dosage`` entry field against ``hl.gp_dosage(GP)``.

        For every entry, either both values are missing or their absolute
        difference is below 1e-6.
        """
        # Maps '01'..'09' to '1'..'9'.
        recoding = {}
        for i in range(1, 10):
            recoding['0{}'.format(i)] = str(i)

        samples = resource('example.sample')
        bgen_path = resource('example.8bits.bgen')
        hl.index_bgen(bgen_path, contig_recoding=recoding)

        imported = hl.import_bgen(bgen_path, ['GP', 'dosage'], samples)
        table = imported.entries()
        table = table.transmute(gp_dosage=hl.gp_dosage(table.GP))

        neither_defined = hl.is_missing(table.dosage) & hl.is_missing(table.gp_dosage)
        within_tolerance = hl.abs(table.dosage - table.gp_dosage) < 1e-6
        everything_agrees = table.all(neither_defined | within_tolerance)
        self.assertTrue(everything_agrees)
コード例 #30
0
ファイル: test_ir.py プロジェクト: danking/hail
    def test_matrix_ir_parses(self):
        """Check that the string form of each matrix IR node parses on the JVM.

        When the parser rejects a node, the failure is re-raised as
        ``ValueError`` with the rendered IR as its message.
        """
        hl.index_bgen(resource('example.8bits.bgen'),
                      reference_genome=hail.get_reference('GRCh37'),
                      contig_recoding={'01': '1'})

        agg_sig = ir.AggSignature('Collect', [], None, [hl.tint32])
        collect_struct = ir.MakeStruct([
            ('x', ir.ApplyAggOp([], None, [ir.I32(0)], agg_sig)),
        ])

        mt_read = ir.MatrixRead(
            resource('backward_compatability/1.0.0/matrix_table/0.hmt'),
            False,
            False)
        ht_read = ir.TableRead(
            resource('backward_compatability/1.0.0/table/0.ht'),
            False,
            None)

        def range_5x5():
            # A fresh 5x5 single-partition range node for each use.
            return ir.MatrixRange(5, 5, 1)

        def x_struct():
            # Build a fresh node per use so no instance is shared between IRs.
            return ir.MakeStruct([('x', ir.I64(20))])

        matrix_irs = [
            ir.MatrixRepartition(range_5x5(), 100, True),
            ir.MatrixUnionRows(range_5x5(), range_5x5()),
            ir.MatrixDistinctByRow(range_5x5()),
            ir.CastTableToMatrix(ir.CastMatrixToTable(mt_read, '__entries', '__cols'),
                                 '__entries', '__cols', []),
            ir.MatrixAggregateRowsByKey(mt_read, collect_struct, collect_struct),
            ir.MatrixAggregateColsByKey(mt_read, collect_struct, collect_struct),
            ir.MatrixRange(1, 1, 10),
            ir.MatrixImportVCF([resource('sample.vcf')], False, False, None, None, False, ['GT'],
                               hail.get_reference('GRCh37'), {}, True, False),
            ir.MatrixImportBGEN([resource('example.8bits.bgen')], ['GP'],
                                resource('example.sample'), {}, 10, 1, ['varid'], None),
            ir.MatrixFilterRows(mt_read, ir.FalseIR()),
            ir.MatrixFilterCols(mt_read, ir.FalseIR()),
            ir.MatrixFilterEntries(mt_read, ir.FalseIR()),
            ir.MatrixChooseCols(mt_read, [1, 0]),
            ir.MatrixMapCols(mt_read, x_struct(), ['x']),
            ir.MatrixKeyRowsBy(mt_read, ['row_i64'], False),
            ir.MatrixMapRows(ir.MatrixKeyRowsBy(mt_read, []), x_struct()),
            ir.MatrixMapEntries(mt_read, x_struct()),
            ir.MatrixMapGlobals(mt_read, x_struct()),
            ir.TableToMatrixTable(ht_read, ['f32', 'i64'], ['m', 'astruct'], ['aset'], ['mset'], 100),
            ir.MatrixCollectColsByKey(mt_read),
            ir.MatrixExplodeRows(mt_read, ['row_aset']),
            ir.MatrixExplodeCols(mt_read, ['col_aset']),
            ir.MatrixAnnotateRowsTable(mt_read, ht_read, '__foo', None),
            ir.MatrixAnnotateColsTable(mt_read, ht_read, '__foo'),
        ]

        for node in matrix_irs:
            rendered = str(node)
            try:
                Env.hail().expr.ir.IRParser.parse_matrix_ir(rendered)
            except Exception as err:
                raise ValueError(rendered) from err
コード例 #31
0
    def test_import_bgen_variant_filtering_from_literals(self):
        """Filter a BGEN import by a list of variant literals.

        The import is run with 1 partition and with 199 partitions. Both must
        return the requested variants in order, with the duplicated 1:10000
        variant appearing twice. The single-partition result must also match
        the full import filtered to the same keys.
        """
        bgen_file = resource('example.8bits.bgen')

        hl.index_bgen(bgen_file, contig_recoding={'01': '1'})

        alleles = ['A', 'G']

        desired_variants = [
            hl.Struct(locus=hl.Locus('1', 2000), alleles=alleles),
            hl.Struct(locus=hl.Locus('1', 2001), alleles=alleles),
            hl.Struct(locus=hl.Locus('1', 4000), alleles=alleles),
            hl.Struct(locus=hl.Locus('1', 10000), alleles=alleles),
            hl.Struct(locus=hl.Locus('1', 100001), alleles=alleles),
        ]

        expected_result = [
            hl.Struct(locus=hl.Locus('1', 2000), alleles=alleles),
            hl.Struct(locus=hl.Locus('1', 2001), alleles=alleles),
            hl.Struct(locus=hl.Locus('1', 4000), alleles=alleles),
            hl.Struct(locus=hl.Locus('1', 10000), alleles=alleles),
            hl.Struct(locus=hl.Locus('1', 10000),
                      alleles=alleles),  # Duplicated variant
            hl.Struct(locus=hl.Locus('1', 100001), alleles=alleles),
        ]

        part_1 = hl.import_bgen(
            bgen_file,
            ['GT'],
            n_partitions=1,  # forcing seek to be called
            variants=desired_variants)
        # assertEqual (rather than assertTrue(a == b)) reports both lists
        # when the comparison fails.
        self.assertEqual(
            part_1.rows().key_by('locus', 'alleles').select().collect(),
            expected_result)

        # forcing each variant to be its own partition for testing
        # duplicates work properly
        part_199 = hl.import_bgen(
            bgen_file,
            ['GT'],
            n_partitions=199,
            variants=desired_variants)
        self.assertEqual(
            part_199.rows().key_by('locus', 'alleles').select().collect(),
            expected_result)

        everything = hl.import_bgen(bgen_file, ['GT'])
        self.assertEqual(everything.count(), (199, 500))

        expected = everything.filter_rows(
            hl.set(desired_variants).contains(everything.row_key))

        self.assertTrue(expected._same(part_1))
コード例 #32
0
    def test_import_bgen_variant_filtering_from_table(self):
        """Pass the full rows table as the variant filter; nothing should be dropped."""
        bgen_path = resource('example.8bits.bgen')
        hl.index_bgen(bgen_path, contig_recoding={'01': '1'})

        full = hl.import_bgen(bgen_path, ['GT'])
        self.assertEqual(full.count(), (199, 500))

        # filtering with everything
        all_rows = full.rows()
        refiltered = hl.import_bgen(bgen_path, ['GT'],
                                    n_partitions=10,
                                    variants=all_rows)

        self.assertTrue(full._same(refiltered))
コード例 #33
0
ファイル: test_impex.py プロジェクト: lfrancioli/hail
    def test_import_bgen_variant_filtering_from_table(self):
        """Filtering by a table holding every variant must reproduce the full import."""
        bgen_path = resource('example.8bits.bgen')
        recoding = {'01': '1'}
        hl.index_bgen(bgen_path, contig_recoding=recoding)

        unfiltered = hl.import_bgen(bgen_path, ['GT'])
        expected_shape = (199, 500)
        self.assertEqual(unfiltered.count(), expected_shape)

        # filtering with everything
        import_options = dict(n_partitions=10, variants=unfiltered.rows())
        filtered = hl.import_bgen(bgen_path, ['GT'], **import_options)

        self.assertTrue(unfiltered._same(filtered))
コード例 #34
0
    def test_drop(self):
        """Dropping rows or columns empties that axis and leaves the other intact."""
        index_kwargs = dict(contig_recoding={'01': '1'}, reference_genome='GRCh37')
        hl.index_bgen(resource('example.8bits.bgen'), **index_kwargs)

        mt = hl.import_bgen(resource('example.8bits.bgen'), entry_fields=['dosage'])

        without_rows = mt.drop_rows()
        self.assertEqual(without_rows._force_count_rows(), 0)
        self.assertEqual(without_rows._force_count_cols(), 500)

        without_cols = mt.drop_cols()
        self.assertEqual(without_cols._force_count_rows(), 199)
        self.assertEqual(without_cols._force_count_cols(), 0)
コード例 #35
0
    def test_multiple_files(self):
        """Three BGEN files imported together should match the 'random' GEN import."""
        samples = resource('random.sample')
        gen_mt = hl.import_gen(resource('random.gen'), samples)

        file_names = ('random-b.bgen', 'random-c.bgen', 'random-a.bgen')
        bgen_paths = [resource(name) for name in file_names]
        hl.index_bgen(bgen_paths)

        bgen_mt = hl.import_bgen(bgen_paths, ['GT', 'GP'], samples, n_partitions=3)

        comparison_options = dict(tolerance=1.0 / 255, absolute=True)
        self.assertTrue(bgen_mt._same(gen_mt, **comparison_options))
コード例 #36
0
ファイル: test_impex.py プロジェクト: lfrancioli/hail
    def test_drop(self):
        """Filtering out every row or every column leaves the other dimension unchanged."""
        path = resource('example.8bits.bgen')
        hl.index_bgen(path, contig_recoding={'01': '1'}, reference_genome='GRCh37')

        mt = hl.import_bgen(path, entry_fields=['dosage'])

        # A constant-False row filter: no rows, all 500 columns.
        no_rows = mt.filter_rows(False)
        self.assertEqual(no_rows._force_count_rows(), 0)
        self.assertEqual(no_rows._force_count_cols(), 500)

        # A constant-False column filter: all 199 rows, no columns.
        no_cols = mt.filter_cols(False)
        self.assertEqual(no_cols._force_count_rows(), 199)
        self.assertEqual(no_cols._force_count_cols(), 0)
コード例 #37
0
 def test_multiple_files_disjoint(self):
     """Expect ``import_bgen`` on the three ``*-disjoint`` files to raise ``FatalError``."""
     samples = resource('random.sample')
     names = ('random-b-disjoint.bgen',
              'random-c-disjoint.bgen',
              'random-a-disjoint.bgen')
     paths = [resource(name) for name in names]
     hl.index_bgen(paths)

     expected_message = 'Each BGEN file must contain a region of the genome disjoint from other files'
     with self.assertRaisesRegex(FatalError, expected_message):
         hl.import_bgen(paths, ['GT', 'GP'], samples, n_partitions=3)
コード例 #38
0
ファイル: test_impex.py プロジェクト: maccum/hail
    def test_import_bgen_gavin_example(self):
        """BGEN and GEN imports of the example data must agree within 1/255 (absolute)."""
        # Maps the zero-padded contigs '01'..'09' to '1'..'9'.
        contig_map = {f'0{digit}': str(digit) for digit in range(1, 10)}

        samples = resource('example.sample')
        from_gen = hl.import_gen(resource('example.gen'), samples, contig_recoding=contig_map)

        path = resource('example.8bits.bgen')
        hl.index_bgen(path)
        from_bgen = hl.import_bgen(path, ['GT', 'GP'], samples, contig_recoding=contig_map)

        agrees = from_bgen._same(from_gen, tolerance=1.0 / 255, absolute=True)
        self.assertTrue(agrees)
コード例 #39
0
ファイル: test_ir.py プロジェクト: jigold/hail
    def matrix_irs(self):
        """Index the example BGEN, then return a list of matrix IR nodes.

        The list mixes two bare readers (the native matrix read and the
        range read) with IR nodes built on top of them.
        """
        hl.index_bgen(resource('example.8bits.bgen'),
                      reference_genome=hl.get_reference('GRCh37'),
                      contig_recoding={'01': '1'})

        # Aggregation struct reused by both aggregate-by-key nodes.
        collect_agg = ir.MakeStruct(
            [('x', ir.ApplyAggOp('Collect', [], None, [ir.I32(0)]))])

        saved_mt = ir.MatrixRead(
            ir.MatrixNativeReader(
                resource('backward_compatability/1.0.0/matrix_table/0.hmt')),
            False,
            False)
        saved_ht = ir.TableRead(
            ir.TableNativeReader(
                resource('backward_compatability/1.0.0/table/0.ht')),
            False)
        range_mt = ir.MatrixRead(ir.MatrixRangeReader(1, 1, 10))

        def x_struct():
            # A new struct per call, so no IR node is shared between list entries.
            return ir.MakeStruct([('x', ir.I64(20))])

        return [
            ir.MatrixRepartition(range_mt, 100, ir.RepartitionStrategy.SHUFFLE),
            ir.MatrixUnionRows(range_mt, range_mt),
            ir.MatrixDistinctByRow(range_mt),
            ir.MatrixRowsHead(saved_mt, 5),
            ir.CastTableToMatrix(
                ir.CastMatrixToTable(saved_mt, '__entries', '__cols'),
                '__entries',
                '__cols',
                []),
            ir.MatrixAggregateRowsByKey(saved_mt, collect_agg, collect_agg),
            ir.MatrixAggregateColsByKey(saved_mt, collect_agg, collect_agg),
            saved_mt,
            range_mt,
            ir.MatrixRead(
                ir.MatrixVCFReader(
                    resource('sample.vcf'), ['GT'], hl.tfloat64,
                    None, None, None, None,
                    False, True, False, True,
                    None, None, None)),
            ir.MatrixRead(
                ir.MatrixBGENReader(
                    resource('example.8bits.bgen'), None, {}, 10, 1, None)),
            ir.MatrixFilterRows(saved_mt, ir.FalseIR()),
            ir.MatrixFilterCols(saved_mt, ir.FalseIR()),
            ir.MatrixFilterEntries(saved_mt, ir.FalseIR()),
            ir.MatrixChooseCols(saved_mt, [1, 0]),
            ir.MatrixMapCols(saved_mt, x_struct(), ['x']),
            ir.MatrixKeyRowsBy(saved_mt, ['row_i64'], False),
            ir.MatrixMapRows(ir.MatrixKeyRowsBy(saved_mt, []), x_struct()),
            ir.MatrixMapEntries(saved_mt, x_struct()),
            ir.MatrixMapGlobals(saved_mt, x_struct()),
            ir.MatrixCollectColsByKey(saved_mt),
            ir.MatrixExplodeRows(saved_mt, ['row_aset']),
            ir.MatrixExplodeCols(saved_mt, ['col_aset']),
            ir.MatrixAnnotateRowsTable(saved_mt, saved_ht, '__foo'),
            ir.MatrixAnnotateColsTable(saved_mt, saved_ht, '__foo'),
            ir.MatrixToMatrixApply(
                saved_mt,
                {'name': 'MatrixFilterPartitions', 'parts': [0], 'keep': True}),
        ]
コード例 #40
0
ファイル: test_impex.py プロジェクト: maccum/hail
    def test_import_bgen_skip_invalid_loci(self):
        """Check ``import_bgen`` with and without ``skip_invalid_loci=True``.

        With ``reference_genome='GRCh37'`` and ``skip_invalid_loci=True`` the
        import must yield exactly 3 rows. Without those two arguments,
        importing and force-counting must raise ``FatalError`` matching
        'Invalid locus'.
        """
        bgen_path = resource('skip_invalid_loci.bgen')
        sample_path = resource('skip_invalid_loci.sample')
        hl.index_bgen(bgen_path)

        mt = hl.import_bgen(bgen_path,
                            entry_fields=[],
                            sample_file=sample_path,
                            reference_genome='GRCh37',
                            skip_invalid_loci=True)
        # assertEqual shows the actual count on failure; assertTrue(a == b)
        # would only report "False is not true".
        self.assertEqual(mt._force_count_rows(), 3)

        with self.assertRaisesRegex(FatalError, 'Invalid locus'):
            mt = hl.import_bgen(bgen_path,
                                entry_fields=[],
                                sample_file=sample_path)
            mt._force_count_rows()
コード例 #41
0
    def test_specify_different_index_file(self):
        """Check that ``index_file_map`` is honoured by ``index_bgen`` and ``import_bgen``.

        A temp path created with ``suffix='idx2'`` must work for indexing and
        importing (30 rows, 10 columns). A temp path created without a suffix
        must make ``index_bgen`` raise ``FatalError`` matching
        'missing a .idx2 file extension'.
        """
        sample_file = resource('random.sample')
        bgen_file = resource('random.bgen')

        index_file_map = {bgen_file: new_temp_file(suffix='idx2')}
        hl.index_bgen(bgen_file, index_file_map=index_file_map)
        mt = hl.import_bgen(bgen_file, ['GT', 'GP'],
                            sample_file,
                            index_file_map=index_file_map)
        self.assertEqual(mt.count(), (30, 10))

        # Build the bad map outside the assertRaises block so that only the
        # index_bgen call itself can satisfy the expected failure.
        bad_index_file_map = {bgen_file: new_temp_file()}
        with self.assertRaisesRegex(FatalError,
                                    'missing a .idx2 file extension'):
            hl.index_bgen(bgen_file, index_file_map=bad_index_file_map)
コード例 #42
0
ファイル: test_impex.py プロジェクト: lfrancioli/hail
    def test_import_bgen_skip_invalid_loci(self):
        """Check ``index_bgen`` with and without ``skip_invalid_loci=True``.

        After indexing with ``reference_genome='GRCh37'`` and
        ``skip_invalid_loci=True`` the import must yield exactly 3 rows.
        Re-indexing without those arguments and then importing and
        force-counting must raise ``FatalError`` matching 'Invalid locus'.
        """
        bgen_path = resource('skip_invalid_loci.bgen')
        sample_path = resource('skip_invalid_loci.sample')

        hl.index_bgen(bgen_path,
                      reference_genome='GRCh37',
                      skip_invalid_loci=True)

        mt = hl.import_bgen(bgen_path,
                            entry_fields=[],
                            sample_file=sample_path)
        # assertEqual shows the actual count on failure; assertTrue(a == b)
        # would only report "False is not true".
        self.assertEqual(mt._force_count_rows(), 3)

        # NOTE(review): it is not visible from here which of the three calls
        # raises, so all of them stay inside the assertRaises block.
        with self.assertRaisesRegex(FatalError, 'Invalid locus'):
            hl.index_bgen(bgen_path)

            mt = hl.import_bgen(bgen_path,
                                entry_fields=[],
                                sample_file=sample_path)
            mt._force_count_rows()
コード例 #43
0
ファイル: test_impex.py プロジェクト: lfrancioli/hail
    def test_import_bgen_locus_filtering_from_table(self):
        """Filter an import by a table keyed on ``locus`` only.

        The filter table holds the single locus 1:10000; the import is
        expected to return both rows at that locus (the variant is
        duplicated in the file).
        """
        bgen_file = resource('example.8bits.bgen')
        hl.index_bgen(bgen_file, contig_recoding={'01': '1'})

        desired_loci = hl.Table.parallelize([{'locus': hl.Locus('1', 10000)}],
                                            schema=hl.tstruct(locus=hl.tlocus()),
                                            key='locus')

        expected_result = [
            hl.Struct(locus=hl.Locus('1', 10000), alleles=['A', 'G']),
            hl.Struct(locus=hl.Locus('1', 10000), alleles=['A', 'G'])  # Duplicated variant
        ]

        result = hl.import_bgen(bgen_file,
                                ['GT'],
                                variants=desired_loci)

        # Compare only the row keys; assertEqual prints a diff on mismatch,
        # whereas assertTrue(a == b) would hide both lists.
        row_keys = result.rows().key_by('locus', 'alleles').select().collect()
        self.assertEqual(row_keys, expected_result)
コード例 #44
0
ファイル: test_impex.py プロジェクト: lfrancioli/hail
    def test_import_bgen_locus_filtering_from_exprs(self):
        """Filtering by every locus, as a struct or a bare locus expression, keeps everything."""
        path = resource('example.8bits.bgen')
        hl.index_bgen(path, contig_recoding={'01': '1'})

        unfiltered = hl.import_bgen(path, ['GT'])
        self.assertEqual(unfiltered.count(), (199, 500))

        # Filter expressed as a struct with a single ``locus`` field.
        locus_struct = hl.struct(locus=unfiltered.locus)
        by_struct = hl.import_bgen(path, ['GT'], variants=locus_struct)
        self.assertTrue(unfiltered._same(by_struct))

        # Filter expressed as the locus expression itself.
        by_locus = hl.import_bgen(path, ['GT'], variants=unfiltered.locus)
        self.assertTrue(unfiltered._same(by_locus))
コード例 #45
0
ファイル: test_impex.py プロジェクト: lfrancioli/hail
    def test_import_bgen_variant_filtering_from_literals(self):
        """Filter an import by a Python list of ``hl.Struct(locus, alleles)`` literals.

        The same filter is applied with 1 partition and with 199 partitions.
        Both must return the expected row keys, including the duplicated
        variant at 1:10000, and the 1-partition result must equal filtering
        the full import with ``filter_rows``.
        """
        bgen_file = resource('example.8bits.bgen')
        hl.index_bgen(bgen_file,
                      contig_recoding={'01': '1'})

        alleles = ['A', 'G']

        desired_variants = [
            hl.Struct(locus=hl.Locus('1', 2000), alleles=alleles),
            hl.Struct(locus=hl.Locus('1', 2001), alleles=alleles),
            hl.Struct(locus=hl.Locus('1', 4000), alleles=alleles),
            hl.Struct(locus=hl.Locus('1', 10000), alleles=alleles),
            hl.Struct(locus=hl.Locus('1', 100001), alleles=alleles),
        ]

        expected_result = [
            hl.Struct(locus=hl.Locus('1', 2000), alleles=alleles),
            hl.Struct(locus=hl.Locus('1', 2001), alleles=alleles),
            hl.Struct(locus=hl.Locus('1', 4000), alleles=alleles),
            hl.Struct(locus=hl.Locus('1', 10000), alleles=alleles),
            hl.Struct(locus=hl.Locus('1', 10000), alleles=alleles),  # Duplicated variant
            hl.Struct(locus=hl.Locus('1', 100001), alleles=alleles),
        ]

        part_1 = hl.import_bgen(bgen_file,
                                ['GT'],
                                n_partitions=1,  # forcing seek to be called
                                variants=desired_variants)
        # assertEqual prints a diff of the two lists on failure, which
        # assertTrue(a == b) would hide.
        part_1_keys = part_1.rows().key_by('locus', 'alleles').select().collect()
        self.assertEqual(part_1_keys, expected_result)

        part_199 = hl.import_bgen(bgen_file,
                                  ['GT'],
                                  # forcing each variant to be its own partition
                                  # for testing duplicates work properly
                                  n_partitions=199,
                                  variants=desired_variants)
        part_199_keys = part_199.rows().key_by('locus', 'alleles').select().collect()
        self.assertEqual(part_199_keys, expected_result)

        everything = hl.import_bgen(bgen_file, ['GT'])
        self.assertEqual(everything.count(), (199, 500))

        expected = everything.filter_rows(hl.set(desired_variants).contains(everything.row_key))

        self.assertTrue(expected._same(part_1))
コード例 #46
0
    def test_import_bgen_empty_variant_filter(self):
        """An empty variant filter, as a list or as expressions, must import zero rows."""
        path = resource('example.8bits.bgen')
        hl.index_bgen(path, contig_recoding={'01': '1'})

        # Case 1: the filter is an empty Python list.
        from_empty_list = hl.import_bgen(path, ['GT'], n_partitions=10, variants=[])
        self.assertEqual(from_empty_list.count_rows(), 0)

        # Case 2: the filter is built from a matrix table with its rows dropped.
        empty_mt = hl.import_bgen(path, ['GT']).drop_rows()
        self.assertEqual(empty_mt.count(), (0, 500))

        empty_keys = hl.struct(locus=empty_mt.locus, alleles=empty_mt.alleles)
        from_empty_exprs = hl.import_bgen(path, ['GT'], n_partitions=10, variants=empty_keys)
        self.assertEqual(from_empty_exprs.count_rows(), 0)
コード例 #47
0
ファイル: test_methods.py プロジェクト: shulik7/hail
    def test_import_bgen(self):
        """Import the v11, 8-bit and 10-bit example BGENs and check counts and dtypes."""
        # Keyword arguments shared by every import below.
        grch37_options = {'contig_recoding': {'01': '1'}, 'reference_genome': 'GRCh37'}

        v11_path = resource('example.v11.bgen')
        hl.index_bgen(v11_path)
        v11_rows = hl.import_bgen(v11_path,
                                  entry_fields=['GT', 'GP'],
                                  sample_file=resource('example.sample'),
                                  **grch37_options).rows()
        self.assertTrue(v11_rows.all(v11_rows.locus.contig == '1'))
        self.assertEqual(v11_rows.count(), 199)

        eight_bit_path = resource('example.8bits.bgen')
        hl.index_bgen(eight_bit_path)

        dosage_only = hl.import_bgen(eight_bit_path,
                                     entry_fields=['dosage'],
                                     **grch37_options)
        self.assertEqual(dosage_only.entry.dtype, hl.tstruct(dosage=hl.tfloat64))

        gt_and_gp = hl.import_bgen(eight_bit_path,
                                   entry_fields=['GT', 'GP'],
                                   sample_file=resource('example.sample'),
                                   **grch37_options)
        self.assertEqual(gt_and_gp.entry.dtype,
                         hl.tstruct(GT=hl.tcall, GP=hl.tarray(hl.tfloat64)))
        self.assertEqual(gt_and_gp.count_rows(), 199)

        ten_bit_path = resource('example.10bits.bgen')
        hl.index_bgen(ten_bit_path)
        all_fields = hl.import_bgen(ten_bit_path,
                                    entry_fields=['GT', 'GP', 'dosage'],
                                    **grch37_options)
        self.assertEqual(all_fields.entry.dtype,
                         hl.tstruct(GT=hl.tcall,
                                    GP=hl.tarray(hl.tfloat64),
                                    dosage=hl.tfloat64))
        self.assertEqual(all_fields.locus.dtype, hl.tlocus('GRCh37'))
コード例 #48
0
    def test_import_bgen_row_fields(self):
        """``_row_fields`` controls which of ``rsid``/``varid`` appear next to the row key."""
        path = resource('example.8bits.bgen')
        hl.index_bgen(path, contig_recoding={'01': '1'}, reference_genome='GRCh37')

        def row_type(**extra_fields):
            # Expected row struct: locus and alleles, then the requested extras.
            return hl.tstruct(locus=hl.tlocus('GRCh37'),
                              alleles=hl.tarray(hl.tstr),
                              **extra_fields)

        with_defaults = hl.import_bgen(path, entry_fields=['dosage'])
        self.assertEqual(with_defaults.row.dtype,
                         row_type(rsid=hl.tstr, varid=hl.tstr))

        key_only = hl.import_bgen(path, entry_fields=['dosage'], _row_fields=[])
        self.assertEqual(key_only.row.dtype, row_type())

        with_varid = hl.import_bgen(path, entry_fields=['dosage'], _row_fields=['varid'])
        self.assertEqual(with_varid.row.dtype, row_type(varid=hl.tstr))

        with_rsid = hl.import_bgen(path, entry_fields=['dosage'], _row_fields=['rsid'])
        self.assertEqual(with_rsid.row.dtype, row_type(rsid=hl.tstr))

        # Dropping fields from the default import must match the narrower imports.
        self.assertTrue(with_defaults.drop('varid')._same(with_rsid))
        self.assertTrue(with_defaults.drop('rsid')._same(with_varid))
        self.assertTrue(with_defaults.drop('varid', 'rsid')._same(key_only))
コード例 #49
0
ファイル: test_impex.py プロジェクト: lfrancioli/hail
    def test_import_bgen_empty_variant_filter(self):
        """Importing with an empty variant filter must produce a matrix table with no rows."""
        path = resource('example.8bits.bgen')
        hl.index_bgen(path, contig_recoding={'01': '1'})

        # Empty Python list as the filter.
        list_filtered = hl.import_bgen(path, ['GT'], n_partitions=10, variants=[])
        self.assertEqual(list_filtered.count_rows(), 0)

        # Row-less matrix table, used as the source of the filter expressions.
        rowless = hl.import_bgen(path, ['GT']).filter_rows(False)
        self.assertEqual(rowless.count(), (0, 500))

        key_exprs = hl.struct(locus=rowless.locus, alleles=rowless.alleles)
        expr_filtered = hl.import_bgen(path, ['GT'], n_partitions=10, variants=key_exprs)
        self.assertEqual(expr_filtered.count_rows(), 0)
コード例 #50
0
ファイル: test_impex.py プロジェクト: lfrancioli/hail
    def test_import_bgen_row_fields(self):
        """Check the row dtype for the default, empty, ``varid`` and ``rsid`` row-field choices."""
        bgen_path = resource('example.8bits.bgen')
        hl.index_bgen(bgen_path,
                      contig_recoding={'01': '1'},
                      reference_genome='GRCh37')

        def load(**options):
            # Every import in this test reads the same file with dosage entries.
            return hl.import_bgen(bgen_path, entry_fields=['dosage'], **options)

        everything = load()
        expected_everything = hl.tstruct(locus=hl.tlocus('GRCh37'),
                                         alleles=hl.tarray(hl.tstr),
                                         rsid=hl.tstr,
                                         varid=hl.tstr)
        self.assertEqual(everything.row.dtype, expected_everything)

        bare = load(_row_fields=[])
        expected_bare = hl.tstruct(locus=hl.tlocus('GRCh37'),
                                   alleles=hl.tarray(hl.tstr))
        self.assertEqual(bare.row.dtype, expected_bare)

        just_varid = load(_row_fields=['varid'])
        expected_varid = hl.tstruct(locus=hl.tlocus('GRCh37'),
                                    alleles=hl.tarray(hl.tstr),
                                    varid=hl.tstr)
        self.assertEqual(just_varid.row.dtype, expected_varid)

        just_rsid = load(_row_fields=['rsid'])
        expected_rsid = hl.tstruct(locus=hl.tlocus('GRCh37'),
                                   alleles=hl.tarray(hl.tstr),
                                   rsid=hl.tstr)
        self.assertEqual(just_rsid.row.dtype, expected_rsid)

        # The default import minus a field must equal the import without that field.
        self.assertTrue(everything.drop('varid')._same(just_rsid))
        self.assertTrue(everything.drop('rsid')._same(just_varid))
        self.assertTrue(everything.drop('varid', 'rsid')._same(bare))
コード例 #51
0
def download_data(data_dir):
    """Make sure every benchmark dataset exists in the benchmark data directory.

    The directory is ``data_dir`` when truthy, otherwise the
    ``HAIL_BENCHMARK_DIR`` environment variable, otherwise
    ``/tmp/hail_benchmark_data``. The chosen path is stored in the
    module-level ``_data_dir`` and created if it does not exist.

    If any expected file is missing, Hail is initialised and *all* datasets
    are downloaded or regenerated, overwriting existing outputs. Hail is
    stopped afterwards, even when one of the steps raises.

    :param data_dir: directory to use, or a falsy value to fall back to the
        environment variable or the default path.
    """
    global _data_dir, _mt
    _data_dir = data_dir or os.environ.get(
        'HAIL_BENCHMARK_DIR') or '/tmp/hail_benchmark_data'
    logging.info(f'using benchmark data directory {_data_dir}')
    os.makedirs(_data_dir, exist_ok=True)

    expected_files = [
        os.path.join(_data_dir, name) for name in (
            'profile.vcf.bgz', 'profile.mt', 'table_10M_par_1000.ht',
            'table_10M_par_100.ht', 'table_10M_par_10.ht',
            'gnomad_dp_simulation.mt', 'many_strings_table.ht',
            'many_ints_table.ht', 'sim_ukb.bgen')
    ]
    if all(os.path.exists(path) for path in expected_files):
        logging.info('all files found.')
        return

    hl.init()  # use all cores
    try:
        vcf = os.path.join(_data_dir, 'profile.vcf.bgz')
        logging.info('downloading profile.vcf.bgz...')
        urlretrieve(
            'https://storage.googleapis.com/hail-common/benchmark/profile.vcf.bgz',
            vcf)
        logging.info('done downloading profile.vcf.bgz.')
        logging.info('importing profile.vcf.bgz...')
        hl.import_vcf(vcf, min_partitions=16).write(
            os.path.join(_data_dir, 'profile.mt'), overwrite=True)
        logging.info('done importing profile.vcf.bgz.')

        logging.info('writing 10M row partitioned tables...')

        # One 10M-row table with five random columns, written at 1000
        # partitions and then coalesced to 100 and 10.
        ht = hl.utils.range_table(10_000_000, 1000).annotate(
            **{f'f_{i}': hl.rand_unif(0, 1) for i in range(5)})
        ht = ht.checkpoint(os.path.join(_data_dir, 'table_10M_par_1000.ht'),
                           overwrite=True)
        ht = ht.naive_coalesce(100).checkpoint(
            os.path.join(_data_dir, 'table_10M_par_100.ht'), overwrite=True)
        ht.naive_coalesce(10).write(
            os.path.join(_data_dir, 'table_10M_par_10.ht'), overwrite=True)
        logging.info('done writing 10M row partitioned tables.')

        logging.info('creating gnomad_dp_simulation matrix table...')
        mt = hl.utils.range_matrix_table(n_rows=250_000,
                                         n_cols=1_000,
                                         n_partitions=32)
        mt = mt.annotate_entries(x=hl.int(hl.rand_unif(0, 4.5)**3))
        mt.write(os.path.join(_data_dir, 'gnomad_dp_simulation.mt'),
                 overwrite=True)
        logging.info('done creating gnomad_dp_simulation matrix table.')

        logging.info('downloading many_strings_table.tsv.bgz...')
        mst_tsv = os.path.join(_data_dir, 'many_strings_table.tsv.bgz')
        mst_ht = os.path.join(_data_dir, 'many_strings_table.ht')
        urlretrieve(
            'https://storage.googleapis.com/hail-common/benchmark/many_strings_table.tsv.bgz',
            mst_tsv)
        logging.info('done downloading many_strings_table.tsv.bgz.')
        logging.info('importing many_strings_table.tsv.bgz...')
        hl.import_table(mst_tsv).write(mst_ht, overwrite=True)
        logging.info('done importing many_strings_table.tsv.bgz.')

        logging.info('downloading many_ints_table.tsv.bgz...')
        mit_tsv = os.path.join(_data_dir, 'many_ints_table.tsv.bgz')
        mit_ht = os.path.join(_data_dir, 'many_ints_table.ht')
        urlretrieve(
            'https://storage.googleapis.com/hail-common/benchmark/many_ints_table.tsv.bgz',
            mit_tsv)
        logging.info('done downloading many_ints_table.tsv.bgz.')
        logging.info('importing many_ints_table.tsv.bgz...')
        hl.import_table(mit_tsv,
                        types={
                            'idx': 'int',
                            **{f'i{i}': 'int' for i in range(5)},
                            **{f'array{i}': 'array<int>' for i in range(2)}
                        }).write(mit_ht, overwrite=True)
        logging.info('done importing many_ints_table.tsv.bgz.')

        bgen = 'sim_ukb.bgen'
        sample = 'sim_ukb.sample'
        logging.info(f'downloading {bgen}...')
        local_bgen = os.path.join(_data_dir, bgen)
        local_sample = os.path.join(_data_dir, sample)
        urlretrieve(
            f'https://storage.googleapis.com/hail-common/benchmark/{bgen}',
            local_bgen)
        urlretrieve(
            f'https://storage.googleapis.com/hail-common/benchmark/{sample}',
            local_sample)
        logging.info(f'done downloading {bgen}.')
        logging.info(f'indexing {bgen}...')
        hl.index_bgen(local_bgen)
        logging.info(f'done indexing {bgen}.')
    finally:
        # Always shut Hail down, including when a download or import fails.
        hl.stop()
コード例 #52
0
def main():
    """Subsample one chromosome's BGEN, lift it over to GRCh38 and export it as PLINK.

    Arguments are read positionally from ``sys.argv``: ``argv[1]`` is the
    chromosome substituted into the ``{chrom}`` path templates, and
    ``argv[2]`` is the core count for the local Spark master ("*" for all).

    :return: always ``0``.
    """

    # # Args (local)
    # chrom = 11
    # chain_file = '/Users/em21/Projects/ot_genetics/genetics-sumstats_data/extras/prepare_uk_biobank_gwas_catalog/sitelist/input_data/grch37_to_grch38.over.chain.gz'
    # in_bgen = 'example_data/ukb_imp_chr{chrom}_v3.example.bgen'
    # in_sample = 'output/ukb_10k_downsampled.sample'
    # to_keep_list = 'output/ukb_10k_downsampled.sample_list.tsv'
    # out_plink = 'output/ukb_v3_downsampled10k_plink/ukb_v3_chr{chrom}.downsampled10k'
    # cores = 1 # Use "*" for all
    # maf_threshold = 0.001

    # Args (server)
    chrom = sys.argv[1]
    chain_file = '/nfs/users/nfs_e/em21/otcoregen/em21/uk_biobank_analysis/create_10k_subsample/input_data/grch37_to_grch38.over.chain.gz'
    in_bgen = '/nfs/users/nfs_e/em21/otcoregen/uk_biobank_data/data/genetics/imputation/ukb_imp_chr{chrom}_v3.bgen'
    in_sample = '/nfs/users/nfs_e/em21/otcoregen/em21/uk_biobank_analysis/create_10k_subsample/input_data/ukb_10k_downsampled.sample'
    to_keep_list = '/nfs/users/nfs_e/em21/otcoregen/em21/uk_biobank_analysis/create_10k_subsample/input_data/ukb_10k_downsampled.sample_list.tsv'
    out_plink = '/nfs/users/nfs_e/em21/otcoregen/em21/uk_biobank_analysis/create_10k_subsample/output/ukb_v3_downsampled10k_plink/ukb_v3_chr{chrom}.downsampled10k'
    cores = sys.argv[2]  # Use "*" for all
    maf_threshold = 0.001

    # Resolve the per-chromosome input path once.
    bgen_path = in_bgen.format(chrom=chrom)

    # Cap Spark at the requested number of local cores.
    hl.init(master=f"local[{cores}]")

    # Register the GRCh37 -> GRCh38 chain file so liftover is available.
    grch37 = hl.get_reference('GRCh37')
    grch38 = hl.get_reference('GRCh38')
    grch37.add_liftover(chain_file, grch38)

    # Build a copy of GRCh38 whose contig names have 'chr' removed.
    def strip_chr(name):
        return name.replace('chr', '')

    stripped_contigs = [strip_chr(name) for name in grch38.contigs]
    lengths = grch38.lengths
    stripped_lengths = {strip_chr(name): lengths[name] for name in lengths}
    rg38_custom = hl.ReferenceGenome('rg38_custom', stripped_contigs,
                                     stripped_lengths)

    print(f'Processing chromosome {chrom}')

    # Index the BGEN only when its '.idx2' index is not already present.
    if not hl.hadoop_exists(bgen_path + '.idx2'):
        # '01'..'09' -> '1'..'9'
        zero_padded_recoding = {f'0{digit}': str(digit) for digit in range(1, 10)}
        hl.index_bgen(bgen_path,
                      contig_recoding=zero_padded_recoding,
                      reference_genome='GRCh37')

    # Load the genotypes.
    mt = hl.import_bgen(bgen_path, entry_fields=['GT'], sample_file=in_sample)

    # Load the list of sample IDs to keep (single untyped column 'f0').
    keep_table = hl.import_table(to_keep_list,
                                 no_header=True,
                                 impute=False,
                                 types={'f0': hl.tstr})
    samples_to_keep = keep_table.key_by('f0')

    # Keep only the columns whose sample ID appears in that list.
    is_kept = hl.is_defined(samples_to_keep[mt.s])
    mt = mt.filter_cols(is_kept)

    # Re-call to remove phasing (required for plink output)
    # mt = mt.annotate_entries(GT=hl.call(mt.GT[0], mt.GT[1], phased=False))

    # Compute variant QC, derive MAF as the smallest allele frequency, filter on it.
    mt = hl.variant_qc(mt)
    qc = mt.variant_qc
    mt = mt.annotate_rows(variant_qc=qc.annotate(MAF=hl.min(qc.AF)))
    mt = mt.filter_rows(mt.variant_qc.MAF >= maf_threshold)

    # Lift each locus over to GRCh38.
    lifted = hl.liftover(mt.locus, 'GRCh38')
    mt = mt.annotate_rows(locus_GRCh38=lifted)

    # Strip chr from contig name (causes problems with GCTA)
    stripped_contig = mt.locus_GRCh38.contig.replace('chr', '')
    mt = mt.annotate_rows(contig_GRCh38=stripped_contig)

    # Replace the GRCh37 locus with the GRCh38 one, expressed in rg38_custom.
    mt = mt.key_rows_by()
    new_locus = hl.locus(mt.contig_GRCh38,
                         mt.locus_GRCh38.position,
                         reference_genome=rg38_custom)
    mt = mt.annotate_rows(locus=new_locus)
    mt = mt.key_rows_by(mt.locus, mt.alleles)

    # Drop rows whose locus is missing after liftover.
    mt = mt.filter_rows(hl.is_defined(mt.locus))

    # Write PLINK output for this chromosome.
    plink_prefix = out_plink.format(chrom=chrom)
    hl.export_plink(dataset=mt, output=plink_prefix)

    return 0
コード例 #53
0
ファイル: test_impex.py プロジェクト: lfrancioli/hail
 def test_multiple_files_disjoint(self):
     """Expect ``import_bgen`` on the three ``*-disjoint`` files to raise ``FatalError``."""
     samples = resource('random.sample')
     paths = [
         resource('random-b-disjoint.bgen'),
         resource('random-c-disjoint.bgen'),
         resource('random-a-disjoint.bgen'),
     ]
     hl.index_bgen(paths)

     message = 'Each BGEN file must contain a region of the genome disjoint from other files'
     with self.assertRaisesRegex(FatalError, message):
         hl.import_bgen(paths, ['GT', 'GP'], samples, n_partitions=3)
コード例 #54
0
 def test_parallel_import(self):
     """Import ``parallelBgenExport.bgen`` with its sample file and check ``count()`` is (16, 10)."""
     path = resource('parallelBgenExport.bgen')
     hl.index_bgen(path)

     entry_fields = ['GT', 'GP']
     imported = hl.import_bgen(path, entry_fields, resource('parallelBgenExport.sample'))
     self.assertEqual(imported.count(), (16, 10))
コード例 #55
0
parser.add_argument('--index_file', help='''
    output index file path
''')
args = parser.parse_args()

# NOTE(review): these imports sit below argument parsing in the original and
# are kept here; presumably so that --help does not wait on hail -- confirm.
import hail as hl
import logging
import os
import time
import sys

logging.basicConfig(level=logging.INFO, stream=sys.stderr)
# Log the command, then run it so the Spark submit arguments are echoed.
logging.info('echo $PYSPARK_SUBMIT_ARGS')
os.system('echo $PYSPARK_SUBMIT_ARGS')

bgen_file = args.bgen
index_file = args.index_file
chrnum = args.chromosome_number

logging.info(f'Start indexing {bgen_file}')
tstart = time.time()

# The index file mapping is always passed; contig recoding is added only for
# single-character chromosome numbers, mapping e.g. '01' to '1'.
index_options = {'index_file_map': {bgen_file: index_file}}
if len(chrnum) == 1:
    index_options['contig_recoding'] = {'0' + chrnum: chrnum}
hl.index_bgen(bgen_file, **index_options)

elapsed = time.time() - tstart
logging.info(f'Finished! {elapsed} seconds elapsed')
logging.info(f'Index file saved as {index_file}')