def test_two_rsids_for_one_gene(self):
    """Run the prediction script for one gene modelled by two variants.

    The expected prediction is the weighted sum of both variants' dosages
    (dosage = genotype probabilities weighted by 0, 1, 2), where the rs1
    dosage is flipped (``2 - dosage``) and the rs2 dosage is used unchanged.
    """
    import shutil  # function-scope import: only needed for the cleanup below

    # Prepare
    tmpdir = tempfile.mkdtemp()
    try:
        output_file = os.path.join(tmpdir, 'output.hdf5')

        model_name = 'model00'
        model_path = _create_model(
            model_name,
            [
                ['rs1', 'gene00', 0.3712, 'A', 'G'],  # ambiguous
                ['rs2', 'gene00', 0.0807, 'G', 'C'],  # non-ambiguous
            ])

        options = [
            self.python_path,
            self.predixcan_path,
            '--bgens-dir', get_full_path('tests/data/set00/'),
            '--bgens-prefix', 'chr',
            '--bgens-sample-file', get_full_path('tests/data/set00/impv1.sample'),
            '--weights-file', model_path,
            '--output-file', output_file,
        ]

        # Run
        return_code = call(options)

        # Validate
        assert return_code == 0
        assert os.path.isfile(output_file)

        with h5py.File(output_file, 'r') as hdf5_file:
            assert len(hdf5_file.keys()) == 3

            assert 'genes' in hdf5_file.keys()
            assert hdf5_file['genes'].shape == (1, )
            genes = [x.decode() for x in hdf5_file['genes']]
            assert genes[0] == 'gene00'

            assert 'pred_expr' in hdf5_file.keys()
            preds = hdf5_file['pred_expr']
            assert preds.shape == (1, 300)
            assert preds.chunks == (1, 300)

            # first sample
            assert truncate(preds[0, 0]) == truncate(
                0.3712 * (2 - np.dot([0.74909, 0.01339, 0.23758], [0, 1, 2])) +
                0.0807 * (np.dot([0.75232, 0.11729, 0.13050], [0, 1, 2]))
            ), preds[0, 0]

            # last sample
            assert truncate(preds[0, 299]) == truncate(
                0.3712 * (2 - np.dot([0.05763, 0.77338, 0.16910], [0, 1, 2])) +
                0.0807 * (np.dot([0.00937, 0.13421, 0.85658], [0, 1, 2]))
            ), preds[0, 299]
    finally:
        # mkdtemp() leaves removal to the caller; do not leak the directory
        # (and the HDF5 output inside it) after the test finishes or fails.
        shutil.rmtree(tmpdir, ignore_errors=True)
def test_get_iterator(self):
    """Iterate over every variant of the set00 chromosome 2 BGEN file.

    Checks the number of yielded items and, for the items at positions
    0, 99 and 149 of the iteration, the variant metadata and a few
    per-sample dosages.
    """

    def from_probs(probabilities):
        # Truncated dosage implied by three genotype probabilities,
        # weighted by 0, 1 and 2 respectively.
        return truncate(np.dot(probabilities, [0, 1, 2]))

    # Prepare
    reader = BGENDosage(get_repository_path('set00/chr2impv1.bgen'))

    # Run
    variants = list(reader.items(n_rows_cached=10))

    # Validate
    assert len(variants) == 150

    # snp 1
    first = variants[0]
    assert first.chr == 2
    assert first.position == 100
    assert first.allele0 == 'A'
    assert first.allele1 == 'G'
    assert first.rsid == 'rs2000000'
    assert first.dosages.shape == (300, )
    assert truncate(first.dosages[0]) == from_probs(
        [0.94401, 0.02976, 0.02623]) == 0.0822
    assert truncate(first.dosages[2]) == from_probs(
        [0.00658, 0.92760, 0.06582]) == 1.0592

    # snp middle
    middle = variants[99]
    assert middle.chr == 2
    assert middle.position == 7516
    assert middle.allele0 == 'T'
    assert middle.allele1 == 'A'
    assert middle.rsid == 'rs2000099'
    assert middle.dosages.shape == (300, )
    # The listed probabilities [0.03148, 0.82993, 0.13854] truncate to
    # 1.1070, so the literal value is asserted instead.
    assert truncate(middle.dosages[0]) == 1.1071
    assert truncate(middle.dosages[5]) == from_probs(
        [0.04327, 0.89103, 0.06570]) == 1.0224

    # snp last
    last = variants[149]
    assert last.chr == 2
    assert last.position == 11226
    assert last.allele0 == 'G'
    assert last.allele1 == 'T'
    assert last.rsid == 'rs2000149'
    assert last.dosages.shape == (300, )
    # The listed probabilities [0.01371, 0.09532, 0.89091] truncate to
    # 1.8771, so the literal value is asserted instead.
    assert truncate(last.dosages[1]) == 1.8772
    # The listed probabilities [0.07391, 0.09597, 0.83011] truncate to
    # 1.7561, so the literal value is asserted instead.
    assert truncate(last.dosages[2]) == 1.7562
def test_alleles_in_bgen_order_gene(self):
    """Run the prediction script for a single-variant gene model.

    The expected prediction for the first sample is the model weight times
    the rs1 dosage (genotype probabilities weighted by 0, 1, 2), with no
    allele flip. The storage properties of the ``pred_expr`` dataset
    (dtype, scale-offset setting and chunk shape) are checked as well.
    """
    import shutil  # function-scope import: only needed for the cleanup below

    # Prepare
    tmpdir = tempfile.mkdtemp()
    try:
        output_file = os.path.join(tmpdir, 'output.hdf5')

        model_name = 'model00'
        model_path = _create_model(
            model_name, [['rs1', 'gene00', 0.3712, 'G', 'A']])

        options = [
            self.python_path,
            self.predixcan_path,
            '--bgens-dir', get_full_path('tests/data/set00/'),
            '--bgens-prefix', 'chr',
            '--bgens-sample-file', get_full_path('tests/data/set00/impv1.sample'),
            '--weights-file', model_path,
            '--output-file', output_file,
        ]

        # Run
        return_code = call(options)

        # Validate
        assert return_code == 0
        assert os.path.isfile(output_file)

        with h5py.File(output_file, 'r') as hdf5_file:
            assert len(hdf5_file.keys()) == 3

            assert 'pred_expr' in hdf5_file.keys()
            preds = hdf5_file['pred_expr']
            assert preds.shape == (1, 300)
            assert preds.dtype == np.dtype('float32')
            assert preds.scaleoffset == 4
            assert preds.chunks == (1, 300)

            assert truncate(preds[0, 0]) == truncate(
                0.3712 * (np.dot([0.74909, 0.01333, 0.23758], [0, 1, 2]))
            ), preds[0, 0]
    finally:
        # mkdtemp() leaves removal to the caller; do not leak the directory
        # (and the HDF5 output inside it) after the test finishes or fails.
        shutil.rmtree(tmpdir, ignore_errors=True)
def test_get_iterator_filter_by_rsid(self):
    """Iterate over the set00 chromosome 2 BGEN file restricted to three rsids.

    Only the requested variants must be yielded; each one is looked up by
    rsid (the iteration order is not assumed) and its metadata and a few
    per-sample dosages are checked.
    """

    def from_probs(probabilities):
        # Truncated dosage implied by three genotype probabilities,
        # weighted by 0, 1 and 2 respectively.
        return truncate(np.dot(probabilities, [0, 1, 2]))

    # Prepare
    reader = BGENDosage(get_repository_path('set00/chr2impv1.bgen'))

    # Run
    variants = list(
        reader.items(
            n_rows_cached=10,
            include_rsid=['rs2000000', 'rs2000002', 'rs2000149'],
        ))

    # Validate
    assert len(variants) == 3

    # Three items and three distinct rsids asserted present: the mapping
    # below is therefore one-to-one.
    by_rsid = {variant.rsid: variant for variant in variants}
    assert 'rs2000000' in by_rsid
    assert 'rs2000002' in by_rsid
    assert 'rs2000149' in by_rsid

    # snp 1
    rsid = 'rs2000000'
    first = by_rsid[rsid]
    assert first.chr == 2
    assert first.position == 100
    assert first.allele0 == 'A'
    assert first.allele1 == 'G'
    assert first.rsid == rsid
    assert first.dosages.shape == (300, )
    assert truncate(first.dosages[0]) == from_probs(
        [0.94401, 0.02976, 0.02623]) == 0.0822
    assert truncate(first.dosages[2]) == from_probs(
        [0.00658, 0.92760, 0.06582]) == 1.0592

    # snp middle
    rsid = 'rs2000002'
    middle = by_rsid[rsid]
    assert middle.chr == 2
    assert middle.position == 215
    assert middle.allele0 == 'A'
    assert middle.allele1 == 'T'
    assert middle.rsid == rsid
    assert middle.dosages.shape == (300, )
    assert truncate(middle.dosages[0]) == from_probs(
        [0.91804, 0.01235, 0.06961]), truncate(middle.dosages[0])
    assert truncate(middle.dosages[5]) == from_probs(
        [0.02761, 0.92139, 0.05101]), truncate(middle.dosages[5])

    # snp last
    rsid = 'rs2000149'
    last = by_rsid[rsid]
    assert last.chr == 2
    assert last.position == 11226
    assert last.allele0 == 'G'
    assert last.allele1 == 'T'
    assert last.rsid == rsid
    assert last.dosages.shape == (300, )
    assert truncate(last.dosages[1]) == from_probs(
        [0.01371, 0.09542, 0.89091]) == 1.8772, truncate(last.dosages[1])
    assert truncate(last.dosages[2]) == from_probs(
        [0.07391, 0.09607, 0.83011]) == 1.7562, truncate(last.dosages[2])
def test_get_iterator_with_n_cache_greater_than_n_variants(self):
    """Iterate with a cache size (15) larger than the number of variants (13).

    Uses the chromosome 2 file of the ``set06_repeated_positions`` data
    set, in which the first two variants share position 100 and rsid
    'rs2000000' and differ only in ``allele0``.
    """

    def from_probs(probabilities):
        # Truncated dosage implied by three genotype probabilities,
        # weighted by 0, 1 and 2 respectively.
        return truncate(np.dot(probabilities, [0, 1, 2]))

    # Prepare
    reader = BGENDosage(
        get_repository_path('set06_repeated_positions/chr2impv1.bgen'))

    # Run
    variants = list(reader.items(n_rows_cached=15))

    # Validate
    assert len(variants) == 13

    # Of the two variants at position 100, the C/G one is expected at
    # index 1 and the A/G one at index 0.
    cg_variant = variants[1]
    ag_variant = variants[0]

    # snp 1
    assert cg_variant.chr == 2
    assert cg_variant.position == 100
    assert cg_variant.allele0 == 'C'
    assert cg_variant.allele1 == 'G'
    assert cg_variant.rsid == 'rs2000000'
    assert cg_variant.dosages.shape == (20, )
    assert truncate(cg_variant.dosages[0]) == from_probs(
        [0.86648, 0.00133, 0.13219]) == 0.2657
    assert truncate(cg_variant.dosages[19]) == from_probs(
        [0.16810, 0.60427, 0.22762]) == 1.0595

    # snp 2
    assert ag_variant.chr == 2
    assert ag_variant.position == 100
    assert ag_variant.allele0 == 'A'
    assert ag_variant.allele1 == 'G'
    assert ag_variant.rsid == 'rs2000000'
    assert ag_variant.dosages.shape == (20, )
    assert truncate(ag_variant.dosages[0]) == from_probs(
        [0.02759, 0.17211, 0.80030]) == 1.7727
    assert truncate(ag_variant.dosages[1]) == from_probs(
        [0.63037, 0.32488, 0.04474]) == 0.4143
    assert truncate(ag_variant.dosages[19]) == from_probs(
        [0.16943, 0.05240, 0.77817]) == 1.6087

    # snp 3
    third = variants[2]
    assert third.chr == 2
    assert third.position == 168
    assert third.allele0 == 'G'
    assert third.allele1 == 'C'
    assert third.rsid == 'rs2000001'
    assert third.dosages.shape == (20, )
    assert truncate(third.dosages[0]) == from_probs(
        [0.05452, 0.10875, 0.83674]) == 1.7822
    assert truncate(third.dosages[1]) == from_probs(
        [0.83410, 0.14198, 0.02392]) == 0.1898
    assert truncate(third.dosages[19]) == from_probs(
        [0.76940, 0.16924, 0.06136]) == 0.2919

    # snp last
    last = variants[12]
    assert last.chr == 2
    assert last.position == 934
    assert last.allele0 == 'T'
    assert last.allele1 == 'A'
    assert last.rsid == 'rs2000011'
    assert last.dosages.shape == (20, )
    assert truncate(last.dosages[0]) == from_probs(
        [0.00457, 0.01764, 0.97778]) == 1.9732
    assert truncate(last.dosages[19]) == from_probs(
        [0.11567, 0.05896, 0.82537]) == 1.7097
def test_get_iterator_repeated_variant_positions(self):
    """Iterate over a BGEN file in which two variants share a position.

    Uses the chromosome 1 file of the ``set06_repeated_positions`` data
    set: the items at indices 4 and 5 both have position 418 and rsid
    'rs5' and differ in ``allele1``. Both must be yielded, with their own
    dosages.
    """

    def from_probs(probabilities):
        # Truncated dosage implied by three genotype probabilities,
        # weighted by 0, 1 and 2 respectively.
        return truncate(np.dot(probabilities, [0, 1, 2]))

    # Prepare
    reader = BGENDosage(
        get_repository_path('set06_repeated_positions/chr1impv1.bgen'))

    # Run
    variants = list(reader.items(n_rows_cached=5))

    # Validate
    assert len(variants) == 11, len(variants)

    # snp 1
    first = variants[0]
    assert first.chr == 1
    assert first.position == 100
    assert first.allele0 == 'T'
    assert first.allele1 == 'A'
    assert first.rsid == 'rs1'
    assert first.dosages.shape == (20, )
    assert truncate(first.dosages[0]) == from_probs(
        [0.06817, 0.27690, 0.65493]) == 1.5867
    assert truncate(first.dosages[19]) == from_probs(
        [0.00219, 0.08983, 0.90798]) == 1.9057

    # snp 5
    fifth = variants[4]
    assert fifth.chr == 1
    assert fifth.position == 418
    assert fifth.allele0 == 'T'
    assert fifth.allele1 == 'A'
    assert fifth.rsid == 'rs5'
    assert fifth.dosages.shape == (20, )
    assert truncate(fifth.dosages[0]) == from_probs(
        [0.09158, 0.16910, 0.73933]) == 1.6477
    assert truncate(fifth.dosages[1]) == from_probs(
        [0.09820, 0.09934, 0.80246]) == 1.7042
    assert truncate(fifth.dosages[19]) == from_probs(
        [0.02833, 0.93189, 0.03978]) == 1.0114

    # snp 6
    sixth = variants[5]
    assert sixth.chr == 1
    assert sixth.position == 418
    assert sixth.allele0 == 'T'
    assert sixth.allele1 == 'C'
    assert sixth.rsid == 'rs5'
    assert sixth.dosages.shape == (20, )
    assert truncate(sixth.dosages[0]) == from_probs(
        [0.00598, 0.02878, 0.96524]) == 1.9592
    assert truncate(sixth.dosages[1]) == from_probs(
        [0.01553, 0.14800, 0.83647]) == 1.8209
    assert truncate(sixth.dosages[19]) == from_probs(
        [0.08347, 0.02509, 0.89144]) == 1.8079

    # snp last
    last = variants[10]
    assert last.chr == 1
    assert last.position == 839
    assert last.allele0 == 'G'
    assert last.allele1 == 'A'
    assert last.rsid == 'rs10'
    assert last.dosages.shape == (20, )
    assert truncate(last.dosages[0]) == from_probs(
        [0.03161, 0.82957, 0.13882]) == 1.1072
    assert truncate(last.dosages[19]) == from_probs(
        [0.96104, 0.03167, 0.00729]) == 0.0462
def test_many_dosages_files(self):
    """Run the prediction script with a model spanning several genes.

    The model has 22 genes ('gene000'..'gene021') built from rsids
    'rs1'.. and 12 genes ('gene200'..'gene211') built from rsids
    'rs2000001'..; every gene uses two consecutive variants with the same
    weight. Genes listed in ``alleles``/``weights`` get explicit alleles
    and weights, all others default to alleles ('G', 'C') and weight 0.5.
    The samples, the gene order, the chunk shape and a selection of
    predicted values are checked.
    """
    import shutil  # function-scope import: only needed for the cleanup below

    # Prepare
    tmpdir = tempfile.mkdtemp()
    try:
        output_file = os.path.join(tmpdir, 'output.hdf5')

        model_name = 'model00'

        alleles = {
            0: ('A', 'G'),
            1: ('C', 'A'),
            9: ('C', 'T'),
            10: ('G', 'T'),
            11: ('C', 'G'),
            21: ('G', 'A'),
        }

        weights = {
            0: 0.1158,
            1: 0.5455,
            9: 0.9876,
            10: 0.1755,
            11: 0.2754,
            21: 0.6855,
        }

        def model_rows(gene_name_format, n_genes, rsid_offset):
            # Two model rows per gene j, for rsids number
            # rsid_offset + 10 * j + 1 and rsid_offset + 10 * j + 2.
            rows = []
            for j in range(n_genes):
                allele_a, allele_b = alleles.get(j, ('G', 'C'))
                first_variant = rsid_offset + j * 10
                for i in range(first_variant, first_variant + 2):
                    rows.append([
                        'rs{}'.format(i + 1),
                        gene_name_format.format(j),
                        weights.get(j, 0.5),
                        allele_a,
                        allele_b,
                    ])
            return rows

        model_path = _create_model(
            model_name,
            model_rows('gene{:0>3d}', 21 + 1, 0) +
            model_rows('gene2{:0>2d}', 11 + 1, 2000000),
        )

        options = [
            self.python_path,
            self.predixcan_path,
            '--bgens-dir', get_full_path('tests/data/set00/'),
            '--bgens-prefix', 'chr',
            '--bgens-sample-file', get_full_path('tests/data/set00/impv1.sample'),
            '--weights-file', model_path,
            '--output-file', output_file,
        ]

        # Run
        return_code = call(options)

        # Validate
        assert return_code == 0
        assert os.path.isfile(output_file)

        with h5py.File(output_file, 'r') as hdf5_file:
            assert len(hdf5_file.keys()) == 3

            assert 'samples' in hdf5_file.keys()
            samples = hdf5_file['samples']
            assert samples.shape == (300, )
            assert all(samples[:].astype(str) == np.array(
                [str(x) for x in range(1, 300 + 1)]))

            assert 'genes' in hdf5_file.keys()
            assert hdf5_file['genes'].shape == (22 + 12, )
            genes = [x.decode() for x in hdf5_file['genes']]
            assert genes[0] == 'gene000'
            assert genes[1] == 'gene001'
            assert genes[20] == 'gene020'
            assert genes[21] == 'gene021'
            assert genes[22] == 'gene200'
            assert genes[23] == 'gene201'
            assert genes[32] == 'gene210'
            assert genes[-1] == 'gene211'

            assert 'pred_expr' in hdf5_file.keys()
            preds = hdf5_file['pred_expr']
            assert preds.shape == (22 + 12, 300)
            assert preds.chunks == (10, 300)

            # In the expected values below, ``2 - dosage`` is used for the
            # variants whose dosage is flipped and the plain dosage for the
            # others.

            # genes from chr1
            # gene00
            assert truncate(preds[0, 0]) == truncate(
                0.1158 * (2 - np.dot([0.74909, 0.01333, 0.23740], [0, 1, 2])) +
                0.1158 * (np.dot([0.75232, 0.11725, 0.13030], [0, 1, 2]))
            ) == 0.2188, preds[0, 0]

            assert truncate(preds[0, 299]) == truncate(
                0.1158 * (2 - np.dot([0.05763, 0.77328, 0.16910], [0, 1, 2])) +
                0.1158 * (np.dot([0.00937, 0.13421, 0.85640], [0, 1, 2]))
            ) == 0.3167, preds[0, 299]

            # gene01
            assert truncate(preds[1, 0]) == truncate(
                0.5455 * (2 - np.dot([0.96807, 0.01962, 0.01231], [0, 1, 2])) +
                0.5455 * (np.dot([0.00190, 0.00429, 0.99381], [0, 1, 2]))
            ) == 2.1534, preds[1, 0]

            assert truncate(preds[1, 1]) == truncate(
                0.5455 * (2 - np.dot([0.91510, 0.06826, 0.01669], [0, 1, 2])) +
                0.5455 * (np.dot([0.70937, 0.04896, 0.24177], [0, 1, 2]))
            ) == 1.3260, preds[1, 1]

            # gene09
            assert truncate(preds[9, 0]) == truncate(
                0.9876 * (2 - np.dot([0.74754, 0.13307, 0.11935], [0, 1, 2])) +
                0.9876 * (2 - np.dot([0.03755, 0.78400, 0.17849], [0, 1, 2]))
            ) == 2.4564, preds[9, 0]

            assert truncate(preds[9, 298]) == truncate(
                0.9876 * (2 - np.dot([0.71102, 0.00968, 0.27929], [0, 1, 2])) +
                0.9876 * (2 - np.dot([0.08631, 0.77275, 0.14089], [0, 1, 2]))
            ) == 2.3477, preds[9, 298]

            # gene10
            assert truncate(preds[10, 0]) == truncate(
                0.1755 * (2 - np.dot([0.05931, 0.08242, 0.85827], [0, 1, 2])) +
                0.1755 * (np.dot([0.83525, 0.01184, 0.15291], [0, 1, 2]))
            ) == 0.091, preds[10, 0]

            assert truncate(preds[10, 2]) == truncate(
                0.1755 * (2 - np.dot([0.61247, 0.22145, 0.16605], [0, 1, 2])) +
                0.1755 * (np.dot([0.09727, 0.77103, 0.13170], [0, 1, 2]))
            ) == 0.4353, preds[10, 2]

            # gene11
            assert truncate(preds[11, 0]) == truncate(
                0.2754 * (np.dot([0.83211, 0.12816, 0.03970], [0, 1, 2])) +
                0.2754 * (np.dot([0.96441, 0.02577, 0.00990], [0, 1, 2]))
            ) == 0.0697, preds[11, 0]

            assert truncate(preds[11, 299]) == truncate(
                0.2754 * (np.dot([0.04018, 0.84357, 0.11625], [0, 1, 2])) +
                0.2754 * (np.dot([0.07541, 0.11284, 0.81175], [0, 1, 2]))
            ) == 0.7745, preds[11, 299]

            # gene21
            assert truncate(preds[21, 0]) == truncate(
                0.6855 * (np.dot([0.73030, 0.13711, 0.13255], [0, 1, 2])) +
                0.6855 * (2 - np.dot([0.11456, 0.04225, 0.84315], [0, 1, 2]))
            ) == 0.4617, preds[21, 0]

            assert truncate(preds[21, 299]) == truncate(
                0.6855 * (np.dot([0.13023, 0.18599, 0.68379], [0, 1, 2])) +
                0.6855 * (2 - np.dot([0.85909, 0.07115, 0.06976], [0, 1, 2]))
            ) == 2.2915, preds[21, 299]

            # genes from chr 2
            # gene200
            assert truncate(preds[22, 0]) == truncate(
                0.1158 * (np.dot([0.96459, 0.02124, 0.01418], [0, 1, 2])) +
                0.1158 * (2 - np.dot([0.91804, 0.01235, 0.06966], [0, 1, 2]))
            ) == 0.2197, preds[22, 0]

            assert truncate(preds[22, 299]) == truncate(
                0.1158 * (np.dot([0.15472, 0.80145, 0.04384], [0, 1, 2])) +
                0.1158 * (2 - np.dot([0.95387, 0.00694, 0.03919], [0, 1, 2]))
            ) == 0.3246, preds[22, 299]

            # gene211
            assert truncate(preds[33, 0]) == truncate(
                0.2754 * (np.dot([0.07602, 0.84035, 0.08360], [0, 1, 2])) +
                0.2754 * (np.dot([0.79911, 0.18546, 0.01550], [0, 1, 2]))
            ) == 0.337, preds[33, 0]

            assert truncate(preds[33, 299]) == truncate(
                0.2754 * (np.dot([0.18570, 0.81029, 0.00390], [0, 1, 2])) +
                0.2754 * (np.dot([0.21121, 0.01581, 0.77310], [0, 1, 2]))
            ) == 0.6554, preds[33, 299]
    finally:
        # mkdtemp() leaves removal to the caller; do not leak the directory
        # (and the HDF5 output inside it) after the test finishes or fails.
        shutil.rmtree(tmpdir, ignore_errors=True)
def test_two_rsids_for_one_gene_one_rsid_for_another_gene(self):
    """Run the prediction script for two genes with two and one variants.

    'gene00' is predicted from rs1 (dosage flipped, ``2 - dosage``) and rs2
    (dosage unchanged); 'gene01' is predicted from rs10 alone. The samples,
    the gene order, the chunk shape and predicted values for a few samples
    are checked.
    """
    import shutil  # function-scope import: only needed for the cleanup below

    # Prepare
    tmpdir = tempfile.mkdtemp()
    try:
        output_file = os.path.join(tmpdir, 'output.hdf5')

        model_name = 'model00'
        model_path = _create_model(
            model_name,
            [
                ['rs1', 'gene00', 0.3712, 'A', 'G'],  # ambiguous
                ['rs2', 'gene00', 0.0807, 'G', 'C'],  # non-ambiguous
                ['rs10', 'gene01', 0.6188, 'A', 'T'],  # non-ambiguous
            ])

        options = [
            self.python_path,
            self.predixcan_path,
            '--bgens-dir', get_full_path('tests/data/set00/'),
            '--bgens-prefix', 'chr',
            '--bgens-sample-file', get_full_path('tests/data/set00/impv1.sample'),
            '--weights-file', model_path,
            '--output-file', output_file,
        ]

        # Run
        return_code = call(options)

        # Validate
        assert return_code == 0
        assert os.path.isfile(output_file)

        with h5py.File(output_file, 'r') as hdf5_file:
            assert len(hdf5_file.keys()) == 3

            assert 'samples' in hdf5_file.keys()
            samples = hdf5_file['samples']
            assert samples.shape == (300, )
            assert all(samples[:].astype(str) == np.array(
                [str(x) for x in range(1, 300 + 1)]))

            assert 'genes' in hdf5_file.keys()
            assert hdf5_file['genes'].shape == (2, )
            genes = [x.decode() for x in hdf5_file['genes']]
            assert genes[0] == 'gene00'
            assert genes[1] == 'gene01'

            assert 'pred_expr' in hdf5_file.keys()
            preds = hdf5_file['pred_expr']
            assert preds.shape == (2, 300)
            assert preds.chunks == (2, 300)

            # gene00
            assert truncate(preds[0, 0]) == truncate(
                0.3712 * (2 - np.dot([0.74909, 0.01333, 0.23758], [0, 1, 2])) +
                0.0807 * (np.dot([0.75232, 0.11749, 0.13040], [0, 1, 2]))
            ) == 0.5916, preds[0, 0]

            assert truncate(preds[0, 299]) == truncate(
                0.3712 * (2 - np.dot([0.05763, 0.77328, 0.16910], [0, 1, 2])) +
                0.0807 * (np.dot([0.00937, 0.13421, 0.85650], [0, 1, 2]))
            ) == 0.4788, preds[0, 299]

            # gene01
            assert truncate(preds[1, 0]) == truncate(
                0.6188 * (np.dot([0.11764, 0.86431, 0.01805], [0, 1, 2]))
            ) == 0.5571, preds[1, 0]

            assert truncate(preds[1, 298]) == truncate(
                0.6188 * (np.dot([0.03509, 0.82789, 0.13705], [0, 1, 2]))
            ) == 0.6819, preds[1, 298]
    finally:
        # mkdtemp() leaves removal to the caller; do not leak the directory
        # (and the HDF5 output inside it) after the test finishes or fails.
        shutil.rmtree(tmpdir, ignore_errors=True)