def test_index_opens(self):
    ''' _check_for_index is falsy for the 15 bit example and truthy for the
    16 bit example
    '''
    without_index = self.folder / 'example.15bits.bgen'
    bfile = BgenFile(without_index)
    self.assertFalse(bfile._check_for_index(str(without_index)))

    with_index = self.folder / 'example.16bits.bgen'
    bfile = BgenFile(with_index)
    self.assertTrue(bfile._check_for_index(str(with_index)))
def test_context_handler_closed_bgen_length(self):
    ''' len() of a BgenFile raises ValueError once its with-block has exited
    '''
    bgen_path = self.folder / 'example.16bits.zstd.bgen'
    with BgenFile(bgen_path) as bfile:
        # while the context is open the file reports a non-zero length
        self.assertGreater(len(bfile), 0)

    with self.assertRaises(ValueError):
        len(bfile)
def test_zstd_compressed(self):
    ''' variants read from the zstd compressed example match the reference
    genotype data, including their probabilities at 16 bit precision
    '''
    bgen_path = self.folder / 'example.16bits.zstd.bgen'
    bfile = BgenFile(str(bgen_path))
    for parsed, expected in zip(bfile, self.gen_data):
        self.assertEqual(expected, parsed)
        same_probs = arrays_equal(
            expected.probabilities, parsed.probabilities, 16)
        self.assertTrue(same_probs)
def test_fetch(self):
    ''' fetch() on the indexed 16 bit example returns no variants for
    chromosome '02'
    '''
    path = self.folder / 'example.16bits.bgen'
    bfile = BgenFile(path)

    # fetch() is exercised against a file whose index is detected
    self.assertTrue(bfile._check_for_index(str(path)))

    # assertEqual (rather than assertTrue on an == expression) reports the
    # unexpected variants if the list is not empty
    self.assertEqual(list(bfile.fetch('02')), [])
def test_context_handler_closed_bgen_slice(self):
    ''' indexing into a BgenFile raises ValueError once its with-block has
    exited
    '''
    bgen_path = self.folder / 'example.16bits.zstd.bgen'
    with BgenFile(bgen_path) as bfile:
        # while the context is open the file reports a non-zero length
        self.assertGreater(len(bfile), 0)

    with self.assertRaises(ValueError):
        _ = bfile[0]
def test_context_handler_closed_bgen_with_rsid(self):
    ''' with_rsid() raises ValueError once the BgenFile with-block has exited
    '''
    bgen_path = self.folder / 'example.16bits.zstd.bgen'
    with BgenFile(bgen_path) as bfile:
        # while the context is open the file reports a non-zero length
        self.assertGreater(len(bfile), 0)

    with self.assertRaises(ValueError):
        _ = bfile.with_rsid('rs111')
def test_context_handler_closed_bgen_at_position(self):
    ''' at_position() raises ValueError once the BgenFile with-block has
    exited
    '''
    bgen_path = self.folder / 'example.16bits.zstd.bgen'
    with BgenFile(bgen_path) as bfile:
        # while the context is open the file reports a non-zero length
        self.assertGreater(len(bfile), 0)

    with self.assertRaises(ValueError):
        _ = bfile.at_position(100)
def test_context_handler_closed_bgen_positions(self):
    ''' positions() raises ValueError once the BgenFile with-block has exited
    '''
    bgen_path = self.folder / 'example.16bits.zstd.bgen'
    with BgenFile(bgen_path) as bfile:
        # while the context is open at least one position is returned
        open_positions = bfile.positions()
        self.assertGreater(len(open_positions), 0)

    with self.assertRaises(ValueError):
        bfile.positions()
def __init__(self,
             bgen_file_path,
             phenotype_file_path,
             index_column_name,
             covariate_file_path=None,
             sample_file_path=None):
    """ Open the bgen file and load the phenotype and covariate tables.

    This software is meant to be called from the command line. Note, the
    code here is a bit verbose, which was done in an attempt to minimize the
    number of function calls given the need to perform millions of calls.
    This could likely be optimized in a better way.

    Args:
        bgen_file_path: path (str) to the bgen file. A warning is printed
            if no '<bgen_file_path>.bgi' index file sits next to it.
        phenotype_file_path: path to a tab-separated phenotype table.
        index_column_name: column used as the index of the phenotype and
            covariate tables; also stored on the instance.
        covariate_file_path: optional path to a tab-separated covariate
            table. When omitted, the phenotype table is reused for
            covariates.
        sample_file_path: optional path to the sample file. When omitted,
            the bgen path with its extension replaced by '.sample' is used.

    Raises:
        AssertionError: if the bgen file, or an explicitly provided sample
            file, does not exist.
        FileNotFoundError: if the derived sample file, the phenotype file or
            a provided covariate file does not exist.
    """
    self.index_column_name = index_column_name

    # NOTE: assert statements are skipped when python runs with -O, in which
    # case these existence checks do not happen.
    assert os.path.isfile(bgen_file_path), "bgen file does not exist"
    if not os.path.isfile(bgen_file_path + '.bgi'):
        print(
            "Warning: No bgen index (.bgi) file provided in same directory as bgen file. Initial reading of the bgen is MUCH faster with index file. "
        )

    if sample_file_path is not None:
        assert os.path.isfile(
            sample_file_path
        ), "sample file does not exist at provided location"
    else:
        # Swap the extension for '.sample'. str.strip('bgen') must not be
        # used for this: it strips any of the characters b/g/e/n from BOTH
        # ends, so e.g. 'example.bgen' would turn into 'xample.sample'.
        sample_file_path = os.path.splitext(bgen_file_path)[0] + '.sample'
        if not os.path.isfile(sample_file_path):
            raise FileNotFoundError(
                "No sample file at {0:s}. A sample file must be provided.".
                format(sample_file_path))

    print(
        'Reading bgen file from {0:s} using sample file {1:s}. If these seem like an error, kill program.'
        .format(bgen_file_path, sample_file_path))

    self.bgen_dataset = BgenFile(bgen_file_path,
                                 sample_path=sample_file_path)

    if not os.path.isfile(phenotype_file_path):
        raise FileNotFoundError("No phenotype file at provided location")
    self.phenotype_dataset = pd.read_csv(phenotype_file_path,
                                         sep='\t',
                                         index_col=index_column_name)

    if covariate_file_path is None:
        print(
            "No covariate file provided. Will use phenotype file for covariates.\n",
            flush=True)
        # same DataFrame object as the phenotypes, not a copy
        self.covariate_dataset = self.phenotype_dataset
    elif os.path.isfile(covariate_file_path):
        self.covariate_dataset = pd.read_csv(covariate_file_path,
                                             sep='\t',
                                             index_col=index_column_name)
    else:
        raise FileNotFoundError("No covariate file at provided location")
def test_load_haplotypes_bgen(self):
    ''' variants read from the haplotypes bgen match the reference haplotype
    data, including their probabilities at 16 bit precision
    '''
    bit_depth = 16
    bfile = BgenFile(str(self.folder / 'haplotypes.bgen'))
    for parsed, expected in zip(bfile, self.haps_data):
        self.assertEqual(expected, parsed)
        same_probs = arrays_equal(
            expected.probabilities, parsed.probabilities, bit_depth)
        self.assertTrue(same_probs)
def test_v11(self):
    ''' variants read from the v1.1 format example match the reference
    genotype data, including their probabilities at 16 bit precision
    '''
    bit_depth = 16
    bfile = BgenFile(str(self.folder / 'example.v11.bgen'))
    for parsed, expected in zip(bfile, self.gen_data):
        self.assertEqual(expected, parsed)
        same_probs = arrays_equal(
            expected.probabilities, parsed.probabilities, bit_depth)
        self.assertTrue(same_probs)
def test_load_complex_file(self):
    ''' variants read from complex.bgen match the reference vcf data in
    value, probabilities and ploidy
    '''
    bit_depth = 16
    bfile = BgenFile(self.folder / 'complex.bgen')
    for parsed, expected in zip(bfile, self.vcf_data):
        self.assertEqual(expected, parsed)
        same_probs = arrays_equal(
            expected.probabilities, parsed.probabilities, bit_depth)
        self.assertTrue(same_probs)
        # ploidy is compared element by element
        ploidy_pairs = zip(expected.ploidy, parsed.ploidy)
        self.assertTrue(all(a == b for a, b in ploidy_pairs))
def test_load_example_genotypes_bit_depths(self):
    ''' every example.*bits.bgen file parses to the reference genotype data
    at the bit depth named in its filename
    '''
    for bgen_path in self.folder.glob('example.*bits.bgen'):
        # second dot-separated field of the stem, e.g. '16bits' -> 16
        depth_field = bgen_path.stem.split('.')[1]
        bit_depth = int(depth_field.strip('bits'))

        bfile = BgenFile(str(bgen_path))
        for parsed, expected in zip(bfile, self.gen_data):
            self.assertEqual(expected, parsed)
            same_probs = arrays_equal(
                expected.probabilities, parsed.probabilities, bit_depth)
            self.assertTrue(same_probs)
def test_load_complex_files(self):
    ''' every complex.*.bgen file parses to the reference vcf data at the
    bit depth named in its filename
    '''
    for bgen_path in self.folder.glob('complex.*.bgen'):
        # second dot-separated field of the stem, with 'bits' chars stripped
        depth_field = bgen_path.stem.split('.')[1]
        bit_depth = int(depth_field.strip('bits'))

        bfile = BgenFile(bgen_path)
        for parsed, expected in zip(bfile, self.vcf_data):
            self.assertEqual(expected, parsed)
            same_probs = arrays_equal(
                expected.probabilities, parsed.probabilities, bit_depth)
            self.assertTrue(same_probs)
def test_fetch_whole_chrom(self):
    ''' fetching with only a chrom gives variants matching the reference
    genotype data (compared by rsid, chrom and pos after sorting)
    '''
    chrom = '01'
    bfile = BgenFile(self.folder / 'example.16bits.bgen')

    def by_position(variant):
        ''' sort key: order variants by chromosome, then position '''
        return (variant.chrom, variant.pos)

    # test fetching a whole chromosome
    fetched = sorted(bfile.fetch(chrom), key=by_position)
    expected = sorted(self.gen_data, key=by_position)
    for x, y in zip(fetched, expected):
        self.assertEqual(x.rsid, y.rsid)
        self.assertEqual(x.chrom, y.chrom)
        self.assertEqual(x.pos, y.pos)
def test_fetch_after_position(self):
    ''' fetching with chrom and start gives variants matching the reference
    genotype data at or after the start position
    '''
    chrom, start = '01', 5000
    bfile = BgenFile(self.folder / 'example.16bits.bgen')

    def by_position(variant):
        ''' sort key: order variants by chromosome, then position '''
        return (variant.chrom, variant.pos)

    # reference variants located at or beyond the start position
    gen_vars = [
        x for x in sorted(self.gen_data, key=by_position) if start <= x.pos
    ]
    fetched = sorted(bfile.fetch(chrom, start), key=by_position)
    for x, y in zip(fetched, gen_vars):
        self.assertEqual(x.rsid, y.rsid)
        self.assertEqual(x.chrom, y.chrom)
        self.assertEqual(x.pos, y.pos)
def test_pickling(self):
    ''' each variant survives a pickle round trip with its identifying
    attributes unchanged
    '''
    bgen_path = self.folder / 'example.16bits.zstd.bgen'
    with BgenFile(bgen_path) as bfile:
        for var in bfile:
            # round trip the variant through pickle
            restored = pickle.loads(pickle.dumps(var))

            # the original and the restored copy must agree on each field
            for field in ('varid', 'rsid', 'chrom', 'pos', 'alleles'):
                self.assertEqual(getattr(var, field),
                                 getattr(restored, field))
def test_minor_allele_dosage_v11(self):
    ''' minor_allele_dosage from the v1.1 example agrees with a dosage
    recomputed from the genotype probabilities
    '''
    with BgenFile(self.folder / 'example.v11.bgen') as bfile:
        for var in bfile:
            dose = var.minor_allele_dosage
            probs = var.probabilities

            # per-sample dosage of each allele: column 1 is shared by both,
            # columns 0 and 2 count double for the first and second allele
            first_allele = (probs[:, 0] * 2 + probs[:, 1])
            second_allele = (probs[:, 2] * 2 + probs[:, 1])

            # the allele with the smaller total dosage is the minor one
            # (ties go to the second allele)
            if np.nansum(first_allele) >= np.nansum(second_allele):
                recomputed = second_allele
            else:
                recomputed = first_allele

            # largest absolute deviation must stay below the tolerance
            delta = abs(dose - recomputed)
            self.assertTrue(np.nanmax(delta) < 7e-5)
def test_fetch_in_region(self):
    ''' fetching with chrom, start and stop gives variants matching the
    reference data inside the region, and nothing for a far away region
    '''
    chrom = '01'
    start = 5000
    stop = 50000
    bfile = BgenFile(self.folder / 'example.16bits.bgen')

    sortkey = lambda v: (v.chrom, v.pos)
    ordered_reference = sorted(self.gen_data, key=sortkey)
    gen_vars = [v for v in ordered_reference if start <= v.pos <= stop]

    fetched = sorted(bfile.fetch(chrom, start, stop), key=sortkey)
    for got, want in zip(fetched, gen_vars):
        self.assertEqual(got.rsid, want.rsid)
        self.assertEqual(got.chrom, want.chrom)
        self.assertEqual(got.pos, want.pos)

    # a region scaled 1000x away from the data should yield no variants
    far_region = bfile.fetch(chrom, start * 1000, stop * 1000)
    self.assertEqual(list(far_region), [])
def test_Path(self):
    ''' BgenFile can be constructed from a path built with the / operator
    rather than a str
    '''
    # no assertion needed: the test passes if construction does not raise
    bfile = BgenFile(self.folder / 'example.v11.bgen')
def test_load_missing_file(self):
    ''' opening a path that does not exist raises ValueError
    '''
    missing_path = '/zzz/jjj/qqq.bgen'
    with self.assertRaises(ValueError):
        BgenFile(missing_path)