def test_parallel_import(self):
    """A BGEN file exported in parallel pieces indexes and imports correctly."""
    path = resource('parallelBgenExport.bgen')
    hl.index_bgen(path)
    mt = hl.import_bgen(path, ['GT', 'GP'], resource('parallelBgenExport.sample'))
    self.assertEqual(mt.count(), (16, 10))
def test_multiple_files_variant_filtering(self):
    """Variant filtering works across several indexed BGEN files at once."""
    files = [resource('random-b.bgen'), resource('random-c.bgen'), resource('random-a.bgen')]
    hl.index_bgen(files)
    alleles = ['A', 'G']
    # Positions deliberately out of order to exercise key-based filtering.
    desired_variants = [hl.Struct(locus=hl.Locus('20', pos), alleles=alleles)
                        for pos in (11, 13, 29, 28, 1, 12)]
    actual = hl.import_bgen(files, ['GT'], n_partitions=10, variants=desired_variants)
    self.assertEqual(actual.count_rows(), 6)
    everything = hl.import_bgen(files, ['GT'])
    self.assertEqual(everything.count(), (30, 10))
    expected = everything.filter_rows(hl.set(desired_variants).contains(everything.row_key))
    self.assertTrue(expected._same(actual))
def test_import_bgen_locus_filtering_from_literals(self):
    """Importing with literal loci (Struct-wrapped or bare Locus) keeps only matching variants.

    Uses assertEqual instead of assertTrue(a == b) so a failure prints the
    differing collections rather than just "False is not true".
    """
    bgen_file = resource('example.8bits.bgen')
    hl.index_bgen(bgen_file, contig_recoding={'01': '1'})

    expected_result = [
        hl.Struct(locus=hl.Locus('1', 10000), alleles=['A', 'G']),
        hl.Struct(locus=hl.Locus('1', 10000), alleles=['A', 'G'])  # Duplicated variant
    ]

    # Test with Struct(locus=Locus) literals.
    desired_loci = [hl.Struct(locus=hl.Locus('1', 10000))]
    locus_struct = hl.import_bgen(bgen_file, ['GT'], variants=desired_loci)
    self.assertEqual(
        locus_struct.rows().key_by('locus', 'alleles').select().collect(),
        expected_result)

    # Test with bare Locus objects.
    desired_loci = [hl.Locus('1', 10000)]
    locus_object = hl.import_bgen(bgen_file, ['GT'], variants=desired_loci)
    self.assertEqual(
        locus_object.rows().key_by('locus', 'alleles').select().collect(),
        expected_result)
def test_multiple_files_variant_filtering(self):
    """Filtering to an explicit variant list spans multiple BGEN files."""
    bgen_paths = [
        resource('random-b.bgen'),
        resource('random-c.bgen'),
        resource('random-a.bgen'),
    ]
    hl.index_bgen(bgen_paths)
    alleles = ['A', 'G']
    desired_variants = [
        hl.Struct(locus=hl.Locus('20', 11), alleles=alleles),
        hl.Struct(locus=hl.Locus('20', 13), alleles=alleles),
        hl.Struct(locus=hl.Locus('20', 29), alleles=alleles),
        hl.Struct(locus=hl.Locus('20', 28), alleles=alleles),
        hl.Struct(locus=hl.Locus('20', 1), alleles=alleles),
        hl.Struct(locus=hl.Locus('20', 12), alleles=alleles),
    ]
    actual = hl.import_bgen(bgen_paths, ['GT'], n_partitions=10,
                            variants=desired_variants)
    self.assertEqual(actual.count_rows(), 6)

    everything = hl.import_bgen(bgen_paths, ['GT'])
    self.assertEqual(everything.count(), (30, 10))
    # Cross-check against filtering the full import by the same key set.
    expected = everything.filter_rows(
        hl.set(desired_variants).contains(everything.row_key))
    self.assertTrue(expected._same(actual))
def test_matrix_ir_parses(self):
    """Each constructed matrix IR must round-trip through the text parser."""
    hl.index_bgen(resource('example.8bits.bgen'),
                  reference_genome=hail.get_reference('GRCh37'),
                  contig_recoding={'01': '1'})
    collect_sig = ir.AggSignature('Collect', [], None, [hl.tint32])
    collect = ir.MakeStruct(
        [('x', ir.ApplyAggOp([ir.I32(0)], [], None, collect_sig,
                             hl.tarray(hl.tint32)))])
    matrix_read = ir.MatrixRead(
        resource('backward_compatability/1.0.0/matrix_table/0.hmt'),
        False, False)
    table_read = ir.TableRead(
        resource('backward_compatability/1.0.0/table/0.ht'), False, None)
    matrix_irs = [
        ir.MatrixUnionRows(ir.MatrixRange(5, 5, 1), ir.MatrixRange(5, 5, 1)),
        ir.UnlocalizeEntries(ir.LocalizeEntries(matrix_read, '__entries'),
                             ir.MatrixColsTable(matrix_read), '__entries'),
        ir.MatrixAggregateRowsByKey(matrix_read, collect, collect),
        ir.MatrixAggregateColsByKey(matrix_read, collect, collect),
        ir.MatrixRange(1, 1, 10),
        ir.MatrixImportVCF([resource('sample.vcf')], False, False, None, None,
                           False, ['GT'], hail.get_reference('GRCh37'), {},
                           True, False),
        ir.MatrixImportBGEN([resource('example.8bits.bgen')], ['GP'],
                            resource('example.sample'), {}, 10, 1,
                            ['varid'], None),
        ir.MatrixFilterRows(matrix_read, ir.FalseIR()),
        ir.MatrixFilterCols(matrix_read, ir.FalseIR()),
        ir.MatrixFilterEntries(matrix_read, ir.FalseIR()),
        ir.MatrixChooseCols(matrix_read, [1, 0]),
        ir.MatrixMapCols(matrix_read, ir.MakeStruct([('x', ir.I64(20))]), ['x']),
        ir.MatrixKeyRowsBy(matrix_read, ['row_i64'], False),
        ir.MatrixMapRows(ir.MatrixKeyRowsBy(matrix_read, []),
                         ir.MakeStruct([('x', ir.I64(20))])),
        ir.MatrixMapEntries(matrix_read, ir.MakeStruct([('x', ir.I64(20))])),
        ir.MatrixMapGlobals(matrix_read, ir.MakeStruct([('x', ir.I64(20))])),
        ir.TableToMatrixTable(table_read, ['f32', 'i64'], ['m', 'astruct'],
                              ['aset'], ['mset'], 100),
        ir.MatrixCollectColsByKey(matrix_read),
        ir.MatrixExplodeRows(matrix_read, ['row_aset']),
        ir.MatrixExplodeCols(matrix_read, ['col_aset']),
        ir.MatrixAnnotateRowsTable(matrix_read, table_read, '__foo', None),
        ir.MatrixAnnotateColsTable(matrix_read, table_read, '__foo'),
    ]
    for x in matrix_irs:
        try:
            Env.hail().expr.Parser.parse_matrix_ir(str(x))
        except Exception as e:
            # Attach the offending IR text so the failure is diagnosable.
            raise ValueError(str(x)) from e
def test_import_bgen_no_reference(self):
    """With reference_genome=None the locus is a plain contig/position struct."""
    hl.index_bgen(resource('example.8bits.bgen'),
                  contig_recoding={'01': '1'},
                  reference_genome=None)
    bgen = hl.import_bgen(resource('example.8bits.bgen'),
                          entry_fields=['GT', 'GP', 'dosage'])
    self.assertEqual(bgen.locus.dtype,
                     hl.tstruct(contig=hl.tstr, position=hl.tint32))
    self.assertEqual(bgen.count_rows(), 199)
def test_import_bgen_GT_GP_entries(self):
    """Requesting GT and GP yields call and float-array entry fields."""
    hl.index_bgen(resource('example.8bits.bgen'),
                  contig_recoding={'01': '1'},
                  reference_genome='GRCh37')
    bgen = hl.import_bgen(resource('example.8bits.bgen'),
                          entry_fields=['GT', 'GP'],
                          sample_file=resource('example.sample'))
    self.assertEqual(bgen.entry.dtype,
                     hl.tstruct(GT=hl.tcall, GP=hl.tarray(hl.tfloat64)))
def test_import_bgen_dosage_entry(self):
    """Requesting only dosage yields a single float64 entry field."""
    hl.index_bgen(resource('example.8bits.bgen'),
                  contig_recoding={'01': '1'},
                  reference_genome='GRCh37')
    bgen = hl.import_bgen(resource('example.8bits.bgen'),
                          entry_fields=['dosage'])
    self.assertEqual(bgen.entry.dtype, hl.tstruct(dosage=hl.tfloat64))
    self.assertEqual(bgen.count_rows(), 199)
def test_n_partitions(self):
    """An explicit n_partitions is honored by the importer."""
    hl.index_bgen(resource('example.8bits.bgen'))
    bgen = hl.import_bgen(resource('example.8bits.bgen'),
                          entry_fields=['dosage'],
                          contig_recoding={'01': '1'},
                          reference_genome='GRCh37',
                          n_partitions=5)
    self.assertEqual(bgen.n_partitions(), 5)
def test_import_bgen_random(self):
    """BGEN import agrees with the GEN import of the same random data."""
    sample_file = resource('random.sample')
    bgen_path = resource('random.bgen')
    gen_mt = hl.import_gen(resource('random.gen'), sample_file)
    hl.index_bgen(bgen_path)
    bgen_mt = hl.import_bgen(bgen_path, ['GT', 'GP'], sample_file)
    # 8-bit probabilities: allow one quantization step of absolute error.
    self.assertTrue(bgen_mt._same(gen_mt, tolerance=1.0 / 255, absolute=True))
def _create(self, resource_dir):
    """Download the simulated UKB BGEN/sample pair and index the BGEN file."""
    bgen = 'sim_ukb.bgen'
    sample = 'sim_ukb.sample'
    for name in (bgen, sample):
        download(resource_dir, name)
    local_bgen = os.path.join(resource_dir, bgen)
    logging.info(f'indexing {bgen}...')
    hl.index_bgen(local_bgen)
    logging.info(f'done indexing {bgen}.')
def test_import_bgen_dosage_entry(self):
    """Importing with entry_fields=['dosage'] produces a float64 dosage entry."""
    path = resource('example.8bits.bgen')
    hl.index_bgen(path, contig_recoding={'01': '1'}, reference_genome='GRCh37')
    mt = hl.import_bgen(path, entry_fields=['dosage'])
    self.assertEqual(mt.entry.dtype, hl.tstruct(dosage=hl.tfloat64))
    self.assertEqual(mt.count_rows(), 199)
def test_multiple_references_throws_error(self):
    """Mixing index files built against different references must fail."""
    sample_file = resource('random.sample')
    no_ref = resource('random-b.bgen')
    grch37 = resource('random-c.bgen')
    hl.index_bgen(no_ref, reference_genome=None)
    hl.index_bgen(grch37, reference_genome='GRCh37')
    with self.assertRaisesRegex(FatalError,
                                'Found multiple reference genomes were specified in the BGEN index files'):
        hl.import_bgen([no_ref, grch37], ['GT'], sample_file=sample_file)
def test_multiple_files(self):
    """Importing several BGEN files matches one GEN import of the same data."""
    sample_file = resource('random.sample')
    gen_mt = hl.import_gen(resource('random.gen'), sample_file)
    paths = [resource('random-b.bgen'),
             resource('random-c.bgen'),
             resource('random-a.bgen')]
    hl.index_bgen(paths)
    bgen_mt = hl.import_bgen(paths, ['GT', 'GP'], sample_file, n_partitions=3)
    self.assertTrue(bgen_mt._same(gen_mt, tolerance=1.0 / 255, absolute=True))
def test_n_partitions(self):
    """Partition count is capped by the number of variants in the file."""
    path = resource('example.8bits.bgen')
    hl.index_bgen(path, contig_recoding={'01': '1'}, reference_genome='GRCh37')
    mt = hl.import_bgen(path, entry_fields=['dosage'], n_partitions=210)
    self.assertEqual(mt.n_partitions(), 199)  # only 199 variants in the file
def test_import_bgen_random(self):
    """A BGEN import reproduces the equivalent GEN import within tolerance."""
    sample_file = resource('random.sample')
    expected = hl.import_gen(resource('random.gen'), sample_file)
    path = resource('random.bgen')
    hl.index_bgen(path)
    actual = hl.import_bgen(path, ['GT', 'GP'], sample_file)
    self.assertTrue(actual._same(expected, tolerance=1.0 / 255, absolute=True))
def test_import_bgen_no_entries(self):
    """An empty entry_fields list yields an empty entry struct that typechecks."""
    path = resource('example.8bits.bgen')
    hl.index_bgen(path, contig_recoding={'01': '1'}, reference_genome='GRCh37')
    mt = hl.import_bgen(path, entry_fields=[],
                        sample_file=resource('example.sample'))
    self.assertEqual(mt.entry.dtype, hl.tstruct())
    mt._jvds.typecheck()
def test_import_bgen_GT_GP_entries(self):
    """GT imports as a call and GP as an array of float64."""
    path = resource('example.8bits.bgen')
    hl.index_bgen(path, contig_recoding={'01': '1'}, reference_genome='GRCh37')
    mt = hl.import_bgen(path, entry_fields=['GT', 'GP'],
                        sample_file=resource('example.sample'))
    expected_dtype = hl.tstruct(GT=hl.tcall, GP=hl.tarray(hl.tfloat64))
    self.assertEqual(mt.entry.dtype, expected_dtype)
def test_import_bgen_no_reference(self):
    """Indexing without a reference genome makes the locus a bare struct."""
    path = resource('example.8bits.bgen')
    hl.index_bgen(path, contig_recoding={'01': '1'}, reference_genome=None)
    mt = hl.import_bgen(path, entry_fields=['GT', 'GP', 'dosage'])
    self.assertEqual(mt.locus.dtype,
                     hl.tstruct(contig=hl.tstr, position=hl.tint32))
    self.assertEqual(mt.count_rows(), 199)
def test_n_partitions(self):
    """Requesting more partitions than variants caps at the variant count."""
    hl.index_bgen(resource('example.8bits.bgen'),
                  contig_recoding={'01': '1'},
                  reference_genome='GRCh37')
    bgen = hl.import_bgen(resource('example.8bits.bgen'),
                          entry_fields=['dosage'],
                          n_partitions=210)
    # only 199 variants in the file
    self.assertEqual(bgen.n_partitions(), 199)
def test_import_bgen_no_entries(self):
    """With no entry fields requested the entry type is the empty struct."""
    hl.index_bgen(resource('example.8bits.bgen'),
                  contig_recoding={'01': '1'},
                  reference_genome='GRCh37')
    bgen = hl.import_bgen(resource('example.8bits.bgen'),
                          entry_fields=[],
                          sample_file=resource('example.sample'))
    self.assertEqual(bgen.entry.dtype, hl.tstruct())
    # Sanity-check the underlying JVM representation as well.
    bgen._jvds.typecheck()
def test_matrix_ir_parses(self):
    """Every matrix IR node built here must parse back from its string form."""
    hl.index_bgen(resource('example.8bits.bgen'),
                  reference_genome=hl.get_reference('GRCh37'),
                  contig_recoding={'01': '1'})
    collect = ir.MakeStruct(
        [('x', ir.ApplyAggOp('Collect', [], None, [ir.I32(0)]))])
    matrix_read = ir.MatrixRead(
        ir.MatrixNativeReader(
            resource('backward_compatability/1.0.0/matrix_table/0.hmt')),
        False, False)
    table_read = ir.TableRead(
        ir.TableNativeReader(
            resource('backward_compatability/1.0.0/table/0.ht')),
        False)
    matrix_range = ir.MatrixRead(ir.MatrixRangeReader(1, 1, 10))
    matrix_irs = [
        ir.MatrixRepartition(matrix_range, 100, ir.RepartitionStrategy.SHUFFLE),
        ir.MatrixUnionRows(matrix_range, matrix_range),
        ir.MatrixDistinctByRow(matrix_range),
        ir.MatrixRowsHead(matrix_read, 5),
        ir.CastTableToMatrix(
            ir.CastMatrixToTable(matrix_read, '__entries', '__cols'),
            '__entries', '__cols', []),
        ir.MatrixAggregateRowsByKey(matrix_read, collect, collect),
        ir.MatrixAggregateColsByKey(matrix_read, collect, collect),
        matrix_read,
        matrix_range,
        ir.MatrixRead(ir.MatrixVCFReader(resource('sample.vcf'), ['GT'],
                                         hl.tfloat64, None, None, None, None,
                                         False, True, False, True, None, None,
                                         None)),
        ir.MatrixRead(ir.MatrixBGENReader(resource('example.8bits.bgen'),
                                          None, {}, 10, 1, None)),
        ir.MatrixFilterRows(matrix_read, ir.FalseIR()),
        ir.MatrixFilterCols(matrix_read, ir.FalseIR()),
        ir.MatrixFilterEntries(matrix_read, ir.FalseIR()),
        ir.MatrixChooseCols(matrix_read, [1, 0]),
        ir.MatrixMapCols(matrix_read, ir.MakeStruct([('x', ir.I64(20))]), ['x']),
        ir.MatrixKeyRowsBy(matrix_read, ['row_i64'], False),
        ir.MatrixMapRows(ir.MatrixKeyRowsBy(matrix_read, []),
                         ir.MakeStruct([('x', ir.I64(20))])),
        ir.MatrixMapEntries(matrix_read, ir.MakeStruct([('x', ir.I64(20))])),
        ir.MatrixMapGlobals(matrix_read, ir.MakeStruct([('x', ir.I64(20))])),
        ir.TableToMatrixTable(table_read, ['f32', 'i64'], ['m', 'astruct'],
                              ['aset'], ['mset'], 100),
        ir.MatrixCollectColsByKey(matrix_read),
        ir.MatrixExplodeRows(matrix_read, ['row_aset']),
        ir.MatrixExplodeCols(matrix_read, ['col_aset']),
        ir.MatrixAnnotateRowsTable(matrix_read, table_read, '__foo'),
        ir.MatrixAnnotateColsTable(matrix_read, table_read, '__foo'),
        ir.MatrixToMatrixApply(matrix_read,
                               {'name': 'MatrixFilterPartitions',
                                'parts': [0],
                                'keep': True}),
    ]
    for x in matrix_irs:
        try:
            Env.hail().expr.ir.IRParser.parse_matrix_ir(str(x))
        except Exception as e:
            # Surface the IR text that failed to parse.
            raise ValueError(str(x)) from e
def matrix_irs(self):
    """Build and return a representative list of matrix IR nodes for testing."""
    hl.index_bgen(resource('example.8bits.bgen'),
                  reference_genome=hl.get_reference('GRCh37'),
                  contig_recoding={'01': '1'})
    collect = ir.MakeStruct(
        [('x', ir.ApplyAggOp('Collect', [], None, [ir.I32(0)]))])
    matrix_read = ir.MatrixRead(
        ir.MatrixNativeReader(
            resource('backward_compatability/1.0.0/matrix_table/0.hmt'),
            None, False),
        False, False)
    table_read = ir.TableRead(
        ir.TableNativeReader(
            resource('backward_compatability/1.0.0/table/0.ht'), None, False),
        False)
    matrix_range = ir.MatrixRead(ir.MatrixRangeReader(1, 1, 10))
    matrix_irs = [
        ir.MatrixRepartition(matrix_range, 100, ir.RepartitionStrategy.SHUFFLE),
        ir.MatrixUnionRows(matrix_range, matrix_range),
        ir.MatrixDistinctByRow(matrix_range),
        ir.MatrixRowsHead(matrix_read, 5),
        ir.MatrixColsHead(matrix_read, 5),
        ir.CastTableToMatrix(
            ir.CastMatrixToTable(matrix_read, '__entries', '__cols'),
            '__entries', '__cols', []),
        ir.MatrixAggregateRowsByKey(matrix_read, collect, collect),
        ir.MatrixAggregateColsByKey(matrix_read, collect, collect),
        matrix_read,
        matrix_range,
        ir.MatrixRead(ir.MatrixVCFReader(resource('sample.vcf'), ['GT'],
                                         hl.tfloat64, None, None, None, None,
                                         False, True, False, True, None, None,
                                         None)),
        ir.MatrixRead(ir.MatrixBGENReader(resource('example.8bits.bgen'),
                                          None, {}, 10, 1, None)),
        ir.MatrixFilterRows(matrix_read, ir.FalseIR()),
        ir.MatrixFilterCols(matrix_read, ir.FalseIR()),
        ir.MatrixFilterEntries(matrix_read, ir.FalseIR()),
        ir.MatrixChooseCols(matrix_read, [1, 0]),
        ir.MatrixMapCols(matrix_read, ir.MakeStruct([('x', ir.I64(20))]), ['x']),
        ir.MatrixKeyRowsBy(matrix_read, ['row_i64'], False),
        ir.MatrixMapRows(ir.MatrixKeyRowsBy(matrix_read, []),
                         ir.MakeStruct([('x', ir.I64(20))])),
        ir.MatrixMapEntries(matrix_read, ir.MakeStruct([('x', ir.I64(20))])),
        ir.MatrixMapGlobals(matrix_read, ir.MakeStruct([('x', ir.I64(20))])),
        ir.MatrixCollectColsByKey(matrix_read),
        ir.MatrixExplodeRows(matrix_read, ['row_aset']),
        ir.MatrixExplodeCols(matrix_read, ['col_aset']),
        ir.MatrixAnnotateRowsTable(matrix_read, table_read, '__foo'),
        ir.MatrixAnnotateColsTable(matrix_read, table_read, '__foo'),
        ir.MatrixToMatrixApply(matrix_read,
                               {'name': 'MatrixFilterPartitions',
                                'parts': [0],
                                'keep': True}),
        ir.MatrixRename(matrix_read,
                        {'global_f32': 'global_foo'},
                        {'col_f32': 'col_foo'},
                        {'row_aset': 'row_aset2'},
                        {'entry_f32': 'entry_foo'}),
        ir.MatrixFilterIntervals(
            matrix_read,
            [hl.utils.Interval(hl.utils.Struct(row_idx=0),
                               hl.utils.Struct(row_idx=10))],
            hl.tstruct(row_idx=hl.tint32),
            keep=False),
    ]
    return matrix_irs
def test_specify_different_index_file(self):
    """index_file_map redirects the index; a non-.idx2 target must fail."""
    sample_file = resource('random.sample')
    bgen_file = resource('random.bgen')

    # Happy path: index written to an explicitly-named .idx2 location.
    index_file = new_temp_file(suffix='idx2')
    index_file_map = {bgen_file: index_file}
    hl.index_bgen(bgen_file, index_file_map=index_file_map)
    mt = hl.import_bgen(bgen_file, ['GT', 'GP'], sample_file,
                        index_file_map=index_file_map)
    self.assertEqual(mt.count(), (30, 10))

    # Error path: the mapped index path must carry the .idx2 extension.
    with self.assertRaisesRegex(FatalError, 'missing a .idx2 file extension'):
        index_file = new_temp_file()
        index_file_map = {bgen_file: index_file}
        hl.index_bgen(bgen_file, index_file_map=index_file_map)
def test_import_bgen_dosage_and_gp_dosage_function_agree(self):
    """The imported dosage field matches hl.gp_dosage applied to GP."""
    recoding = {f'0{i}': str(i) for i in range(1, 10)}
    sample_file = resource('example.sample')
    bgen_file = resource('example.8bits.bgen')
    hl.index_bgen(bgen_file, contig_recoding=recoding)
    mt = hl.import_bgen(bgen_file, ['GP', 'dosage'], sample_file)
    et = mt.entries()
    et = et.transmute(gp_dosage=hl.gp_dosage(et.GP))
    both_missing = hl.is_missing(et.dosage) & hl.is_missing(et.gp_dosage)
    close_enough = hl.abs(et.dosage - et.gp_dosage) < 1e-6
    self.assertTrue(et.all(both_missing | close_enough))
def test_multiple_references_throws_error(self):
    """Importing index files built with different references raises FatalError."""
    sample_file = resource('random.sample')
    first = resource('random-b.bgen')
    second = resource('random-c.bgen')
    # Build the two indices against conflicting references.
    hl.index_bgen(first, reference_genome=None)
    hl.index_bgen(second, reference_genome='GRCh37')
    with self.assertRaisesRegex(
            FatalError,
            'Found multiple reference genomes were specified in the BGEN index files'):
        hl.import_bgen([first, second], ['GT'], sample_file=sample_file)
def test_unphased_bgen(spark, tmp_path):
    """An unphased BGEN round-trips losslessly through the Glow adapter."""
    # Disable broadcast joins so the comparison plan is deterministic.
    spark.conf.set('spark.sql.autoBroadcastJoinThreshold', '-1')
    input_bgen = 'test-data/bgen/example.8bits.bgen'
    hl.index_bgen(input_bgen, reference_genome=None)
    hail_df = functions.from_matrix_table(
        hl.import_bgen(input_bgen, entry_fields=['GP']))
    _assert_lossless_adapter(spark, tmp_path, hail_df, input_bgen,
                             'bgen', 'bigbgen',
                             writer_options={'bitsPerProbability': '8'})
def test_import_bgen_gavin_example(self):
    """BGEN and GEN imports of the example dataset agree within 8-bit tolerance."""
    recoding = {f'0{i}': str(i) for i in range(1, 10)}
    sample_file = resource('example.sample')
    gen_mt = hl.import_gen(resource('example.gen'), sample_file,
                           contig_recoding=recoding,
                           reference_genome="GRCh37")
    bgen_file = resource('example.8bits.bgen')
    hl.index_bgen(bgen_file, contig_recoding=recoding,
                  reference_genome="GRCh37")
    bgen_mt = hl.import_bgen(bgen_file, ['GT', 'GP'], sample_file)
    self.assertTrue(bgen_mt._same(gen_mt, tolerance=1.0 / 255, absolute=True))
def test_import_bgen_dosage_and_gp_dosage_function_agree(self):
    """dosage from the importer equals hl.gp_dosage(GP) (or both are missing)."""
    recoding = {'0{}'.format(i): str(i) for i in range(1, 10)}
    sample_file = resource('example.sample')
    bgen_file = resource('example.8bits.bgen')
    hl.index_bgen(bgen_file, contig_recoding=recoding)
    entries = hl.import_bgen(bgen_file, ['GP', 'dosage'], sample_file).entries()
    entries = entries.transmute(gp_dosage=hl.gp_dosage(entries.GP))
    self.assertTrue(entries.all(
        (hl.is_missing(entries.dosage) & hl.is_missing(entries.gp_dosage))
        | (hl.abs(entries.dosage - entries.gp_dosage) < 1e-6)))
def test_matrix_ir_parses(self):
    """All matrix IR constructors here must survive str() → parse."""
    hl.index_bgen(resource('example.8bits.bgen'),
                  reference_genome=hail.get_reference('GRCh37'),
                  contig_recoding={'01': '1'})
    collect_sig = ir.AggSignature('Collect', [], None, [hl.tint32])
    collect = ir.MakeStruct(
        [('x', ir.ApplyAggOp([], None, [ir.I32(0)], collect_sig))])
    matrix_read = ir.MatrixRead(
        resource('backward_compatability/1.0.0/matrix_table/0.hmt'),
        False, False)
    table_read = ir.TableRead(
        resource('backward_compatability/1.0.0/table/0.ht'), False, None)
    matrix_irs = [
        ir.MatrixRepartition(ir.MatrixRange(5, 5, 1), 100, True),
        ir.MatrixUnionRows(ir.MatrixRange(5, 5, 1), ir.MatrixRange(5, 5, 1)),
        ir.MatrixDistinctByRow(ir.MatrixRange(5, 5, 1)),
        ir.CastTableToMatrix(
            ir.CastMatrixToTable(matrix_read, '__entries', '__cols'),
            '__entries', '__cols', []),
        ir.MatrixAggregateRowsByKey(matrix_read, collect, collect),
        ir.MatrixAggregateColsByKey(matrix_read, collect, collect),
        ir.MatrixRange(1, 1, 10),
        ir.MatrixImportVCF([resource('sample.vcf')], False, False, None, None,
                           False, ['GT'], hail.get_reference('GRCh37'), {},
                           True, False),
        ir.MatrixImportBGEN([resource('example.8bits.bgen')], ['GP'],
                            resource('example.sample'), {}, 10, 1,
                            ['varid'], None),
        ir.MatrixFilterRows(matrix_read, ir.FalseIR()),
        ir.MatrixFilterCols(matrix_read, ir.FalseIR()),
        ir.MatrixFilterEntries(matrix_read, ir.FalseIR()),
        ir.MatrixChooseCols(matrix_read, [1, 0]),
        ir.MatrixMapCols(matrix_read, ir.MakeStruct([('x', ir.I64(20))]), ['x']),
        ir.MatrixKeyRowsBy(matrix_read, ['row_i64'], False),
        ir.MatrixMapRows(ir.MatrixKeyRowsBy(matrix_read, []),
                         ir.MakeStruct([('x', ir.I64(20))])),
        ir.MatrixMapEntries(matrix_read, ir.MakeStruct([('x', ir.I64(20))])),
        ir.MatrixMapGlobals(matrix_read, ir.MakeStruct([('x', ir.I64(20))])),
        ir.TableToMatrixTable(table_read, ['f32', 'i64'], ['m', 'astruct'],
                              ['aset'], ['mset'], 100),
        ir.MatrixCollectColsByKey(matrix_read),
        ir.MatrixExplodeRows(matrix_read, ['row_aset']),
        ir.MatrixExplodeCols(matrix_read, ['col_aset']),
        ir.MatrixAnnotateRowsTable(matrix_read, table_read, '__foo', None),
        ir.MatrixAnnotateColsTable(matrix_read, table_read, '__foo'),
    ]
    for x in matrix_irs:
        try:
            Env.hail().expr.ir.IRParser.parse_matrix_ir(str(x))
        except Exception as e:
            # Include the failing IR's text in the raised error.
            raise ValueError(str(x)) from e
def test_import_bgen_variant_filtering_from_literals(self):
    """Literal variant lists filter correctly at both partition extremes.

    Uses assertEqual instead of assertTrue(a == b) so failures show the
    differing row lists instead of an opaque "False is not true".
    """
    bgen_file = resource('example.8bits.bgen')
    hl.index_bgen(bgen_file, contig_recoding={'01': '1'})
    alleles = ['A', 'G']
    desired_variants = [
        hl.Struct(locus=hl.Locus('1', 2000), alleles=alleles),
        hl.Struct(locus=hl.Locus('1', 2001), alleles=alleles),
        hl.Struct(locus=hl.Locus('1', 4000), alleles=alleles),
        hl.Struct(locus=hl.Locus('1', 10000), alleles=alleles),
        hl.Struct(locus=hl.Locus('1', 100001), alleles=alleles),
    ]
    expected_result = [
        hl.Struct(locus=hl.Locus('1', 2000), alleles=alleles),
        hl.Struct(locus=hl.Locus('1', 2001), alleles=alleles),
        hl.Struct(locus=hl.Locus('1', 4000), alleles=alleles),
        hl.Struct(locus=hl.Locus('1', 10000), alleles=alleles),
        hl.Struct(locus=hl.Locus('1', 10000), alleles=alleles),  # Duplicated variant
        hl.Struct(locus=hl.Locus('1', 100001), alleles=alleles),
    ]

    # One partition forces seek to be called.
    part_1 = hl.import_bgen(bgen_file, ['GT'], n_partitions=1,
                            variants=desired_variants)
    self.assertEqual(
        part_1.rows().key_by('locus', 'alleles').select().collect(),
        expected_result)

    # One partition per variant exercises duplicate handling.
    part_199 = hl.import_bgen(bgen_file, ['GT'], n_partitions=199,
                              variants=desired_variants)
    self.assertEqual(
        part_199.rows().key_by('locus', 'alleles').select().collect(),
        expected_result)

    # Cross-check against filtering the full import.
    everything = hl.import_bgen(bgen_file, ['GT'])
    self.assertEqual(everything.count(), (199, 500))
    expected = everything.filter_rows(
        hl.set(desired_variants).contains(everything.row_key))
    self.assertTrue(expected._same(part_1))
def test_import_bgen_variant_filtering_from_table(self):
    """Filtering by a table containing every variant returns everything."""
    bgen_file = resource('example.8bits.bgen')
    hl.index_bgen(bgen_file, contig_recoding={'01': '1'})
    everything = hl.import_bgen(bgen_file, ['GT'])
    self.assertEqual(everything.count(), (199, 500))
    # Filtering with the complete row table should be a no-op.
    actual = hl.import_bgen(bgen_file, ['GT'], n_partitions=10,
                            variants=everything.rows())
    self.assertTrue(everything._same(actual))
def test_import_bgen_variant_filtering_from_table(self):
    """A variants table holding all rows yields the same matrix as no filter."""
    path = resource('example.8bits.bgen')
    hl.index_bgen(path, contig_recoding={'01': '1'})
    full = hl.import_bgen(path, ['GT'])
    self.assertEqual(full.count(), (199, 500))
    all_variants = full.rows()
    filtered = hl.import_bgen(path, ['GT'], n_partitions=10,
                              variants=all_variants)  # filtering with everything
    self.assertTrue(full._same(filtered))
def test_drop(self):
    """drop_rows and drop_cols empty exactly one dimension each."""
    hl.index_bgen(resource('example.8bits.bgen'),
                  contig_recoding={'01': '1'},
                  reference_genome='GRCh37')
    bgen = hl.import_bgen(resource('example.8bits.bgen'),
                          entry_fields=['dosage'])

    rowless = bgen.drop_rows()
    self.assertEqual(rowless._force_count_rows(), 0)
    self.assertEqual(rowless._force_count_cols(), 500)

    colless = bgen.drop_cols()
    self.assertEqual(colless._force_count_rows(), 199)
    self.assertEqual(colless._force_count_cols(), 0)
def test_multiple_files(self):
    """A multi-file BGEN import matches the single GEN import of the data."""
    sample_file = resource('random.sample')
    expected = hl.import_gen(resource('random.gen'), sample_file)
    parts = [
        resource('random-b.bgen'),
        resource('random-c.bgen'),
        resource('random-a.bgen'),
    ]
    hl.index_bgen(parts)
    actual = hl.import_bgen(parts, ['GT', 'GP'], sample_file, n_partitions=3)
    self.assertTrue(actual._same(expected, tolerance=1.0 / 255, absolute=True))
def test_drop(self):
    """filter_rows(False) / filter_cols(False) empty one dimension each."""
    path = resource('example.8bits.bgen')
    hl.index_bgen(path, contig_recoding={'01': '1'}, reference_genome='GRCh37')
    mt = hl.import_bgen(path, entry_fields=['dosage'])

    no_rows = mt.filter_rows(False)
    self.assertEqual(no_rows._force_count_rows(), 0)
    self.assertEqual(no_rows._force_count_cols(), 500)

    no_cols = mt.filter_cols(False)
    self.assertEqual(no_cols._force_count_rows(), 199)
    self.assertEqual(no_cols._force_count_cols(), 0)
def test_multiple_files_disjoint(self):
    """Importing BGEN files with overlapping genomic regions must fail."""
    sample_file = resource('random.sample')
    files = [
        resource('random-b-disjoint.bgen'),
        resource('random-c-disjoint.bgen'),
        resource('random-a-disjoint.bgen'),
    ]
    hl.index_bgen(files)
    with self.assertRaisesRegex(
            FatalError,
            'Each BGEN file must contain a region of the genome disjoint from other files'):
        hl.import_bgen(files, ['GT', 'GP'], sample_file, n_partitions=3)
def test_import_bgen_gavin_example(self):
    """BGEN import (with contig recoding at import time) matches the GEN import."""
    recoding = {f'0{i}': str(i) for i in range(1, 10)}
    sample_file = resource('example.sample')
    gen_mt = hl.import_gen(resource('example.gen'), sample_file,
                           contig_recoding=recoding)
    bgen_file = resource('example.8bits.bgen')
    hl.index_bgen(bgen_file)
    bgen_mt = hl.import_bgen(bgen_file, ['GT', 'GP'], sample_file,
                             contig_recoding=recoding)
    self.assertTrue(bgen_mt._same(gen_mt, tolerance=1.0 / 255, absolute=True))
def matrix_irs(self):
    """Construct and return a list of representative matrix IR nodes."""
    hl.index_bgen(resource('example.8bits.bgen'),
                  reference_genome=hl.get_reference('GRCh37'),
                  contig_recoding={'01': '1'})
    collect = ir.MakeStruct(
        [('x', ir.ApplyAggOp('Collect', [], None, [ir.I32(0)]))])
    matrix_read = ir.MatrixRead(
        ir.MatrixNativeReader(
            resource('backward_compatability/1.0.0/matrix_table/0.hmt')),
        False, False)
    table_read = ir.TableRead(
        ir.TableNativeReader(
            resource('backward_compatability/1.0.0/table/0.ht')),
        False)
    matrix_range = ir.MatrixRead(ir.MatrixRangeReader(1, 1, 10))
    matrix_irs = [
        ir.MatrixRepartition(matrix_range, 100, ir.RepartitionStrategy.SHUFFLE),
        ir.MatrixUnionRows(matrix_range, matrix_range),
        ir.MatrixDistinctByRow(matrix_range),
        ir.MatrixRowsHead(matrix_read, 5),
        ir.CastTableToMatrix(
            ir.CastMatrixToTable(matrix_read, '__entries', '__cols'),
            '__entries', '__cols', []),
        ir.MatrixAggregateRowsByKey(matrix_read, collect, collect),
        ir.MatrixAggregateColsByKey(matrix_read, collect, collect),
        matrix_read,
        matrix_range,
        ir.MatrixRead(ir.MatrixVCFReader(resource('sample.vcf'), ['GT'],
                                         hl.tfloat64, None, None, None, None,
                                         False, True, False, True, None, None,
                                         None)),
        ir.MatrixRead(ir.MatrixBGENReader(resource('example.8bits.bgen'),
                                          None, {}, 10, 1, None)),
        ir.MatrixFilterRows(matrix_read, ir.FalseIR()),
        ir.MatrixFilterCols(matrix_read, ir.FalseIR()),
        ir.MatrixFilterEntries(matrix_read, ir.FalseIR()),
        ir.MatrixChooseCols(matrix_read, [1, 0]),
        ir.MatrixMapCols(matrix_read, ir.MakeStruct([('x', ir.I64(20))]), ['x']),
        ir.MatrixKeyRowsBy(matrix_read, ['row_i64'], False),
        ir.MatrixMapRows(ir.MatrixKeyRowsBy(matrix_read, []),
                         ir.MakeStruct([('x', ir.I64(20))])),
        ir.MatrixMapEntries(matrix_read, ir.MakeStruct([('x', ir.I64(20))])),
        ir.MatrixMapGlobals(matrix_read, ir.MakeStruct([('x', ir.I64(20))])),
        ir.MatrixCollectColsByKey(matrix_read),
        ir.MatrixExplodeRows(matrix_read, ['row_aset']),
        ir.MatrixExplodeCols(matrix_read, ['col_aset']),
        ir.MatrixAnnotateRowsTable(matrix_read, table_read, '__foo'),
        ir.MatrixAnnotateColsTable(matrix_read, table_read, '__foo'),
        ir.MatrixToMatrixApply(matrix_read,
                               {'name': 'MatrixFilterPartitions',
                                'parts': [0],
                                'keep': True}),
    ]
    return matrix_irs
def test_import_bgen_skip_invalid_loci(self):
    """skip_invalid_loci drops bad loci at import; without it import fails.

    Uses assertEqual instead of assertTrue(x == 3) so a failure reports
    the actual row count.
    """
    hl.index_bgen(resource('skip_invalid_loci.bgen'))

    mt = hl.import_bgen(resource('skip_invalid_loci.bgen'),
                        entry_fields=[],
                        sample_file=resource('skip_invalid_loci.sample'),
                        reference_genome='GRCh37',
                        skip_invalid_loci=True)
    self.assertEqual(mt._force_count_rows(), 3)

    # Without skip_invalid_loci, the invalid locus is a fatal error.
    with self.assertRaisesRegex(FatalError, 'Invalid locus'):
        mt = hl.import_bgen(
            resource('skip_invalid_loci.bgen'),
            entry_fields=[],
            sample_file=resource('skip_invalid_loci.sample'))
        mt._force_count_rows()
def test_specify_different_index_file(self):
    """A mapped .idx2 index imports fine; a mapped non-.idx2 path errors."""
    sample_file = resource('random.sample')
    bgen_file = resource('random.bgen')

    good_index = new_temp_file(suffix='idx2')
    mapping = {bgen_file: good_index}
    hl.index_bgen(bgen_file, index_file_map=mapping)
    mt = hl.import_bgen(bgen_file, ['GT', 'GP'], sample_file,
                        index_file_map=mapping)
    self.assertEqual(mt.count(), (30, 10))

    with self.assertRaisesRegex(FatalError, 'missing a .idx2 file extension'):
        bad_index = new_temp_file()  # no .idx2 suffix
        hl.index_bgen(bgen_file, index_file_map={bgen_file: bad_index})
def test_import_bgen_skip_invalid_loci(self):
    """Indexing with skip_invalid_loci drops bad loci; re-indexing without it fails.

    Uses assertEqual instead of assertTrue(x == 3) so a mismatch reports
    the actual row count.
    """
    hl.index_bgen(resource('skip_invalid_loci.bgen'),
                  reference_genome='GRCh37',
                  skip_invalid_loci=True)
    mt = hl.import_bgen(resource('skip_invalid_loci.bgen'),
                        entry_fields=[],
                        sample_file=resource('skip_invalid_loci.sample'))
    self.assertEqual(mt._force_count_rows(), 3)

    # Re-indexing without skip_invalid_loci makes the invalid locus fatal.
    with self.assertRaisesRegex(FatalError, 'Invalid locus'):
        hl.index_bgen(resource('skip_invalid_loci.bgen'))
        mt = hl.import_bgen(resource('skip_invalid_loci.bgen'),
                            entry_fields=[],
                            sample_file=resource('skip_invalid_loci.sample'))
        mt._force_count_rows()
def test_import_bgen_locus_filtering_from_table(self):
    """A locus-keyed table filters the import to matching variants.

    Uses assertEqual instead of assertTrue(a == b) so failures print the
    differing collections.
    """
    bgen_file = resource('example.8bits.bgen')
    hl.index_bgen(bgen_file, contig_recoding={'01': '1'})
    desired_loci = hl.Table.parallelize([{'locus': hl.Locus('1', 10000)}],
                                        schema=hl.tstruct(locus=hl.tlocus()),
                                        key='locus')
    expected_result = [
        hl.Struct(locus=hl.Locus('1', 10000), alleles=['A', 'G']),
        hl.Struct(locus=hl.Locus('1', 10000), alleles=['A', 'G'])  # Duplicated variant
    ]

    result = hl.import_bgen(bgen_file, ['GT'], variants=desired_loci)
    self.assertEqual(
        result.rows().key_by('locus', 'alleles').select().collect(),
        expected_result)
def test_import_bgen_locus_filtering_from_exprs(self):
    """Filtering by a locus expression covering all rows is a no-op."""
    bgen_file = resource('example.8bits.bgen')
    hl.index_bgen(bgen_file, contig_recoding={'01': '1'})
    everything = hl.import_bgen(bgen_file, ['GT'])
    self.assertEqual(everything.count(), (199, 500))

    # As a struct expression wrapping the locus...
    via_struct = hl.import_bgen(bgen_file, ['GT'],
                                variants=hl.struct(locus=everything.locus))
    self.assertTrue(everything._same(via_struct))

    # ...and as the bare locus expression.
    via_locus = hl.import_bgen(bgen_file, ['GT'], variants=everything.locus)
    self.assertTrue(everything._same(via_locus))
def test_import_bgen_variant_filtering_from_literals(self):
    """Literal variant filtering is correct with 1 and 199 partitions.

    Uses assertEqual instead of assertTrue(a == b) so failures print the
    differing row lists.
    """
    bgen_file = resource('example.8bits.bgen')
    hl.index_bgen(bgen_file, contig_recoding={'01': '1'})
    alleles = ['A', 'G']
    desired_variants = [
        hl.Struct(locus=hl.Locus('1', 2000), alleles=alleles),
        hl.Struct(locus=hl.Locus('1', 2001), alleles=alleles),
        hl.Struct(locus=hl.Locus('1', 4000), alleles=alleles),
        hl.Struct(locus=hl.Locus('1', 10000), alleles=alleles),
        hl.Struct(locus=hl.Locus('1', 100001), alleles=alleles),
    ]
    expected_result = [
        hl.Struct(locus=hl.Locus('1', 2000), alleles=alleles),
        hl.Struct(locus=hl.Locus('1', 2001), alleles=alleles),
        hl.Struct(locus=hl.Locus('1', 4000), alleles=alleles),
        hl.Struct(locus=hl.Locus('1', 10000), alleles=alleles),
        hl.Struct(locus=hl.Locus('1', 10000), alleles=alleles),  # Duplicated variant
        hl.Struct(locus=hl.Locus('1', 100001), alleles=alleles),
    ]

    part_1 = hl.import_bgen(bgen_file, ['GT'],
                            n_partitions=1,  # forcing seek to be called
                            variants=desired_variants)
    self.assertEqual(
        part_1.rows().key_by('locus', 'alleles').select().collect(),
        expected_result)

    # forcing each variant to be its own partition for testing duplicates
    # work properly
    part_199 = hl.import_bgen(bgen_file, ['GT'], n_partitions=199,
                              variants=desired_variants)
    self.assertEqual(
        part_199.rows().key_by('locus', 'alleles').select().collect(),
        expected_result)

    everything = hl.import_bgen(bgen_file, ['GT'])
    self.assertEqual(everything.count(), (199, 500))
    expected = everything.filter_rows(
        hl.set(desired_variants).contains(everything.row_key))
    self.assertTrue(expected._same(part_1))
def test_import_bgen_empty_variant_filter(self):
    """An empty variant filter — list or empty table expression — keeps no rows."""
    bgen_file = resource('example.8bits.bgen')
    hl.index_bgen(bgen_file, contig_recoding={'01': '1'})

    # Empty literal list.
    filtered = hl.import_bgen(bgen_file, ['GT'], n_partitions=10, variants=[])
    self.assertEqual(filtered.count_rows(), 0)

    # Empty expression built from a rowless matrix table.
    nothing = hl.import_bgen(bgen_file, ['GT']).drop_rows()
    self.assertEqual(nothing.count(), (0, 500))
    empty_variants = hl.struct(locus=nothing.locus, alleles=nothing.alleles)
    filtered = hl.import_bgen(bgen_file, ['GT'], n_partitions=10,
                              variants=empty_variants)
    self.assertEqual(filtered.count_rows(), 0)
def test_import_bgen(self):
    """Smoke-test BGEN import: contig recoding, entry-field selection, and
    the resulting entry/locus dtypes for v1.1 and v1.2 (8- and 10-bit) files."""
    hl.index_bgen(resource('example.v11.bgen'))
    v11_rows = hl.import_bgen(resource('example.v11.bgen'),
                              entry_fields=['GT', 'GP'],
                              sample_file=resource('example.sample'),
                              contig_recoding={'01': '1'},
                              reference_genome='GRCh37').rows()
    # Contig recoding must have mapped every row onto contig '1'.
    self.assertTrue(v11_rows.all(v11_rows.locus.contig == '1'))
    self.assertEqual(v11_rows.count(), 199)

    hl.index_bgen(resource('example.8bits.bgen'))
    dosage_only = hl.import_bgen(resource('example.8bits.bgen'),
                                 entry_fields=['dosage'],
                                 contig_recoding={'01': '1'},
                                 reference_genome='GRCh37')
    self.assertEqual(dosage_only.entry.dtype, hl.tstruct(dosage=hl.tfloat64))

    gt_gp = hl.import_bgen(resource('example.8bits.bgen'),
                           entry_fields=['GT', 'GP'],
                           sample_file=resource('example.sample'),
                           contig_recoding={'01': '1'},
                           reference_genome='GRCh37')
    self.assertEqual(gt_gp.entry.dtype,
                     hl.tstruct(GT=hl.tcall, GP=hl.tarray(hl.tfloat64)))
    self.assertEqual(gt_gp.count_rows(), 199)

    hl.index_bgen(resource('example.10bits.bgen'))
    all_fields = hl.import_bgen(resource('example.10bits.bgen'),
                                entry_fields=['GT', 'GP', 'dosage'],
                                contig_recoding={'01': '1'},
                                reference_genome='GRCh37')
    self.assertEqual(all_fields.entry.dtype,
                     hl.tstruct(GT=hl.tcall, GP=hl.tarray(hl.tfloat64), dosage=hl.tfloat64))
    self.assertEqual(all_fields.locus.dtype, hl.tlocus('GRCh37'))
def test_import_bgen_row_fields(self):
    """_row_fields controls which of rsid/varid appear in the row schema;
    dropping fields from the default import must match importing without them."""
    hl.index_bgen(resource('example.8bits.bgen'),
                  contig_recoding={'01': '1'},
                  reference_genome='GRCh37')

    default_rows = hl.import_bgen(resource('example.8bits.bgen'),
                                  entry_fields=['dosage'])
    self.assertEqual(
        default_rows.row.dtype,
        hl.tstruct(locus=hl.tlocus('GRCh37'), alleles=hl.tarray(hl.tstr),
                   rsid=hl.tstr, varid=hl.tstr))

    bare_rows = hl.import_bgen(resource('example.8bits.bgen'),
                               entry_fields=['dosage'],
                               _row_fields=[])
    self.assertEqual(
        bare_rows.row.dtype,
        hl.tstruct(locus=hl.tlocus('GRCh37'), alleles=hl.tarray(hl.tstr)))

    varid_rows = hl.import_bgen(resource('example.8bits.bgen'),
                                entry_fields=['dosage'],
                                _row_fields=['varid'])
    self.assertEqual(
        varid_rows.row.dtype,
        hl.tstruct(locus=hl.tlocus('GRCh37'), alleles=hl.tarray(hl.tstr), varid=hl.tstr))

    rsid_rows = hl.import_bgen(resource('example.8bits.bgen'),
                               entry_fields=['dosage'],
                               _row_fields=['rsid'])
    self.assertEqual(
        rsid_rows.row.dtype,
        hl.tstruct(locus=hl.tlocus('GRCh37'), alleles=hl.tarray(hl.tstr), rsid=hl.tstr))

    # Each restricted import equals the default import minus the omitted fields.
    self.assertTrue(default_rows.drop('varid')._same(rsid_rows))
    self.assertTrue(default_rows.drop('rsid')._same(varid_rows))
    self.assertTrue(default_rows.drop('varid', 'rsid')._same(bare_rows))
def test_import_bgen_empty_variant_filter(self):
    """Filtering to no variants yields zero rows, both for an empty literal
    list and for an expression built on an emptied table."""
    bgen_file = resource('example.8bits.bgen')
    hl.index_bgen(bgen_file, contig_recoding={'01': '1'})

    no_rows = hl.import_bgen(bgen_file, ['GT'], n_partitions=10, variants=[])
    self.assertEqual(no_rows.count_rows(), 0)

    # Empty the table by filtering every row out, then filter by its key fields.
    emptied = hl.import_bgen(bgen_file, ['GT']).filter_rows(False)
    self.assertEqual(emptied.count(), (0, 500))
    key_expr = hl.struct(locus=emptied.locus, alleles=emptied.alleles)
    no_rows = hl.import_bgen(bgen_file, ['GT'], n_partitions=10, variants=key_expr)
    self.assertEqual(no_rows.count_rows(), 0)
def test_import_bgen_row_fields(self):
    """Row-field selection via _row_fields: the schema contains exactly the
    requested optional fields, and restricted imports match the default
    import with the extra fields dropped."""
    hl.index_bgen(resource('example.8bits.bgen'),
                  contig_recoding={'01': '1'},
                  reference_genome='GRCh37')

    with_all_fields = hl.import_bgen(resource('example.8bits.bgen'),
                                     entry_fields=['dosage'])
    self.assertEqual(with_all_fields.row.dtype,
                     hl.tstruct(locus=hl.tlocus('GRCh37'),
                                alleles=hl.tarray(hl.tstr),
                                rsid=hl.tstr,
                                varid=hl.tstr))

    with_no_fields = hl.import_bgen(resource('example.8bits.bgen'),
                                    entry_fields=['dosage'],
                                    _row_fields=[])
    self.assertEqual(with_no_fields.row.dtype,
                     hl.tstruct(locus=hl.tlocus('GRCh37'),
                                alleles=hl.tarray(hl.tstr)))

    only_varid = hl.import_bgen(resource('example.8bits.bgen'),
                                entry_fields=['dosage'],
                                _row_fields=['varid'])
    self.assertEqual(only_varid.row.dtype,
                     hl.tstruct(locus=hl.tlocus('GRCh37'),
                                alleles=hl.tarray(hl.tstr),
                                varid=hl.tstr))

    only_rsid = hl.import_bgen(resource('example.8bits.bgen'),
                               entry_fields=['dosage'],
                               _row_fields=['rsid'])
    self.assertEqual(only_rsid.row.dtype,
                     hl.tstruct(locus=hl.tlocus('GRCh37'),
                                alleles=hl.tarray(hl.tstr),
                                rsid=hl.tstr))

    self.assertTrue(with_all_fields.drop('varid')._same(only_rsid))
    self.assertTrue(with_all_fields.drop('rsid')._same(only_varid))
    self.assertTrue(with_all_fields.drop('varid', 'rsid')._same(with_no_fields))
def download_data(data_dir):
    """Ensure all benchmark input files exist under the benchmark data
    directory, downloading source files and materializing derived Hail
    tables/matrix tables on first use.

    The directory is taken from ``data_dir``, then the HAIL_BENCHMARK_DIR
    environment variable, then a /tmp default.
    """
    global _data_dir, _mt
    _data_dir = data_dir or os.environ.get('HAIL_BENCHMARK_DIR') or '/tmp/hail_benchmark_data'
    logging.info(f'using benchmark data directory {_data_dir}')
    os.makedirs(_data_dir, exist_ok=True)

    expected_files = [os.path.join(_data_dir, name) for name in [
        'profile.vcf.bgz',
        'profile.mt',
        'table_10M_par_1000.ht',
        'table_10M_par_100.ht',
        'table_10M_par_10.ht',
        'gnomad_dp_simulation.mt',
        'many_strings_table.ht',
        'many_ints_table.ht',
        'sim_ukb.bgen',
    ]]
    if all(os.path.exists(path) for path in expected_files):
        logging.info('all files found.')
        return

    hl.init()  # use all cores

    # profile VCF -> matrix table
    vcf = os.path.join(_data_dir, 'profile.vcf.bgz')
    logging.info('downloading profile.vcf.bgz...')
    urlretrieve('https://storage.googleapis.com/hail-common/benchmark/profile.vcf.bgz', vcf)
    logging.info('done downloading profile.vcf.bgz.')
    logging.info('importing profile.vcf.bgz...')
    hl.import_vcf(vcf, min_partitions=16).write(
        os.path.join(_data_dir, 'profile.mt'), overwrite=True)
    logging.info('done importing profile.vcf.bgz.')

    # 10M-row tables at three partition granularities
    logging.info('writing 10M row partitioned tables...')
    ht = hl.utils.range_table(10_000_000, 1000).annotate(
        **{f'f_{i}': hl.rand_unif(0, 1) for i in range(5)})
    ht = ht.checkpoint(os.path.join(_data_dir, 'table_10M_par_1000.ht'), overwrite=True)
    ht = ht.naive_coalesce(100).checkpoint(
        os.path.join(_data_dir, 'table_10M_par_100.ht'), overwrite=True)
    ht.naive_coalesce(10).write(
        os.path.join(_data_dir, 'table_10M_par_10.ht'), overwrite=True)
    logging.info('done writing 10M row partitioned tables.')

    # simulated depth-like integer entries
    logging.info('creating gnomad_dp_simulation matrix table...')
    mt = hl.utils.range_matrix_table(n_rows=250_000, n_cols=1_000, n_partitions=32)
    mt = mt.annotate_entries(x=hl.int(hl.rand_unif(0, 4.5) ** 3))
    mt.write(os.path.join(_data_dir, 'gnomad_dp_simulation.mt'), overwrite=True)
    logging.info('done creating gnomad_dp_simulation matrix table.')

    # string-heavy table
    logging.info('downloading many_strings_table.tsv.bgz...')
    mst_tsv = os.path.join(_data_dir, 'many_strings_table.tsv.bgz')
    mst_ht = os.path.join(_data_dir, 'many_strings_table.ht')
    urlretrieve('https://storage.googleapis.com/hail-common/benchmark/many_strings_table.tsv.bgz',
                mst_tsv)
    logging.info('done downloading many_strings_table.tsv.bgz.')
    logging.info('importing many_strings_table.tsv.bgz...')
    hl.import_table(mst_tsv).write(mst_ht, overwrite=True)
    logging.info('done importing many_strings_table.tsv.bgz.')

    # int-heavy table with explicit column types
    logging.info('downloading many_ints_table.tsv.bgz...')
    mit_tsv = os.path.join(_data_dir, 'many_ints_table.tsv.bgz')
    mit_ht = os.path.join(_data_dir, 'many_ints_table.ht')
    urlretrieve('https://storage.googleapis.com/hail-common/benchmark/many_ints_table.tsv.bgz',
                mit_tsv)
    logging.info('done downloading many_ints_table.tsv.bgz.')
    logging.info('importing many_ints_table.tsv.bgz...')
    hl.import_table(
        mit_tsv,
        types={'idx': 'int',
               **{f'i{i}': 'int' for i in range(5)},
               **{f'array{i}': 'array<int>' for i in range(2)}}
    ).write(mit_ht, overwrite=True)
    logging.info('done importing many_ints_table.tsv.bgz.')

    # UK Biobank simulated BGEN + sample file, indexed for import
    bgen = 'sim_ukb.bgen'
    sample = 'sim_ukb.sample'
    logging.info(f'downloading {bgen}...')
    local_bgen = os.path.join(_data_dir, bgen)
    local_sample = os.path.join(_data_dir, sample)
    urlretrieve(f'https://storage.googleapis.com/hail-common/benchmark/{bgen}', local_bgen)
    urlretrieve(f'https://storage.googleapis.com/hail-common/benchmark/{sample}', local_sample)
    logging.info(f'done downloading {bgen}...')
    logging.info(f'indexing {bgen}...')
    hl.index_bgen(local_bgen)
    logging.info(f'done indexing {bgen}.')
    hl.stop()
def main():
    """Subset UK Biobank imputed genotypes for one chromosome to a 10k-sample
    panel, filter by MAF, lift over GRCh37 -> GRCh38 (with 'chr'-less contig
    names), and export in PLINK format.

    Command line: argv[1] = chromosome, argv[2] = core count ("*" for all).
    Returns 0 on success.
    """
    # Args (server paths)
    chrom = sys.argv[1]
    chain_file = '/nfs/users/nfs_e/em21/otcoregen/em21/uk_biobank_analysis/create_10k_subsample/input_data/grch37_to_grch38.over.chain.gz'
    in_bgen = '/nfs/users/nfs_e/em21/otcoregen/uk_biobank_data/data/genetics/imputation/ukb_imp_chr{chrom}_v3.bgen'
    in_sample = '/nfs/users/nfs_e/em21/otcoregen/em21/uk_biobank_analysis/create_10k_subsample/input_data/ukb_10k_downsampled.sample'
    to_keep_list = '/nfs/users/nfs_e/em21/otcoregen/em21/uk_biobank_analysis/create_10k_subsample/input_data/ukb_10k_downsampled.sample_list.tsv'
    out_plink = '/nfs/users/nfs_e/em21/otcoregen/em21/uk_biobank_analysis/create_10k_subsample/output/ukb_v3_downsampled10k_plink/ukb_v3_chr{chrom}.downsampled10k'
    cores = sys.argv[2]  # Use "*" for all
    maf_threshold = 0.001

    # Cap Spark's local core usage
    hl.init(master="local[{}]".format(cores))

    # Prepare GRCh37 -> GRCh38 liftover
    rg37 = hl.get_reference('GRCh37')
    rg38 = hl.get_reference('GRCh38')
    rg37.add_liftover(chain_file, rg38)

    # Custom GRCh38 whose contigs drop the 'chr' prefix (needed downstream)
    rg38_custom_contigs = [contig.replace('chr', '') for contig in rg38.contigs]
    rg38_custom_lens = {contig.replace('chr', ''): length
                        for contig, length in rg38.lengths.items()}
    rg38_custom = hl.ReferenceGenome('rg38_custom', rg38_custom_contigs, rg38_custom_lens)

    print('Processing chromosome {0}'.format(chrom))

    # Index the BGEN once; zero-padded contigs ('01'..'09') are recoded.
    if not hl.hadoop_exists(in_bgen.format(chrom=chrom) + '.idx2'):
        hl.index_bgen(in_bgen.format(chrom=chrom),
                      contig_recoding={'0{}'.format(i): str(i) for i in range(1, 10)},
                      reference_genome='GRCh37')

    # Load genotypes
    mt = hl.import_bgen(in_bgen.format(chrom=chrom),
                        entry_fields=['GT'],
                        sample_file=in_sample)

    # Keep only the 10k downsampled panel
    samples_to_keep = hl.import_table(to_keep_list,
                                      no_header=True,
                                      impute=False,
                                      types={'f0': hl.tstr}).key_by('f0')
    mt = mt.filter_cols(hl.is_defined(samples_to_keep[mt.s]))

    # NOTE: re-calling to remove phasing was considered but is not applied:
    # mt = mt.annotate_entries(GT=hl.call(mt.GT[0], mt.GT[1], phased=False))

    # Filter on minor allele frequency
    mt = hl.variant_qc(mt)
    mt = mt.annotate_rows(variant_qc=mt.variant_qc.annotate(MAF=hl.min(mt.variant_qc.AF)))
    mt = mt.filter_rows(mt.variant_qc.MAF >= maf_threshold)

    # Liftover loci to GRCh38
    mt = mt.annotate_rows(locus_GRCh38=hl.liftover(mt.locus, 'GRCh38'))
    # Strip 'chr' from the contig name (causes problems with GCTA)
    mt = mt.annotate_rows(contig_GRCh38=mt.locus_GRCh38.contig.replace('chr', ''))

    # Swap the GRCh37 locus for the GRCh38 one (using rg38_custom)
    mt = mt.key_rows_by()
    mt = mt.annotate_rows(locus=hl.locus(mt.contig_GRCh38,
                                         mt.locus_GRCh38.position,
                                         reference_genome=rg38_custom))
    mt = mt.key_rows_by(mt.locus, mt.alleles)

    # Drop rows whose liftover failed
    mt = mt.filter_rows(hl.is_defined(mt.locus))

    # Export PLINK files
    hl.export_plink(dataset=mt, output=out_plink.format(chrom=chrom))

    return 0
def test_multiple_files_disjoint(self):
    """Importing multiple BGEN files whose genomic regions overlap must fail."""
    sample_file = resource('random.sample')
    bgen_files = [
        resource('random-b-disjoint.bgen'),
        resource('random-c-disjoint.bgen'),
        resource('random-a-disjoint.bgen'),
    ]
    hl.index_bgen(bgen_files)
    expected_message = 'Each BGEN file must contain a region of the genome disjoint from other files'
    with self.assertRaisesRegex(FatalError, expected_message):
        hl.import_bgen(bgen_files, ['GT', 'GP'], sample_file, n_partitions=3)
def test_parallel_import(self):
    """A BGEN exported in parallel imports with the expected dimensions."""
    path = resource('parallelBgenExport.bgen')
    hl.index_bgen(path)
    imported = hl.import_bgen(path, ['GT', 'GP'], resource('parallelBgenExport.sample'))
    self.assertEqual(imported.count(), (16, 10))
# Tail of an argparse-driven CLI that indexes a single BGEN file with Hail,
# recoding a zero-padded contig name when the chromosome is single-digit.
parser.add_argument('--index_file', help='''
output index file path
''')
args = parser.parse_args()

import hail as hl
import logging
import os
import sys
import time

logging.basicConfig(level=logging.INFO, stream=sys.stderr)
logging.info('echo $PYSPARK_SUBMIT_ARGS')
os.system('echo $PYSPARK_SUBMIT_ARGS')

bgen_file = args.bgen
index_file = args.index_file
chrnum = args.chromosome_number

logging.info('Start indexing {file}'.format(file=bgen_file))
tstart = time.time()
if len(chrnum) == 1:
    # Single-digit chromosomes may be zero-padded in the BGEN ('01' -> '1').
    hl.index_bgen(bgen_file,
                  index_file_map={bgen_file: index_file},
                  contig_recoding={'0' + chrnum: chrnum})
else:
    hl.index_bgen(bgen_file, index_file_map={bgen_file: index_file})
logging.info('Finished! {time} seconds elapsed'.format(time=time.time() - tstart))
logging.info('Index file saved as {file}'.format(file=index_file))