def test_two_rsids_for_one_gene(self):
        """Check the prediction for one gene built from two variants.

        A model with two weights for ``gene00`` (``rs1`` and ``rs2``) is
        created, the prediction script is run on the ``tests/data/set00`` BGEN
        files, and the resulting HDF5 file is inspected: it must hold three
        top-level entries, a single gene and a ``(1, 300)`` ``pred_expr``
        dataset stored as one chunk. The first and last samples are compared
        against hand-computed values in which ``rs1`` contributes
        ``weight * (2 - dosage)`` and ``rs2`` contributes ``weight * dosage``.

        The output is written to a temporary directory that is deleted when
        the test ends, whether it passes or fails.
        """
        def dosage(probabilities):
            # Sum of the three values weighted by 0, 1 and 2 -- presumably the
            # expected allele count from genotype probabilities (confirm
            # against the BGEN fixtures).
            return np.dot(probabilities, [0, 1, 2])

        model_path = _create_model(
            'model00',
            [
                ['rs1', 'gene00', 0.3712, 'A', 'G'],  # ambiguous
                ['rs2', 'gene00', 0.0807, 'G', 'C'],  # non-ambiguous
            ])

        # TemporaryDirectory removes the directory and the HDF5 file inside it
        # on exit; a bare mkdtemp() left both behind after every run.
        with tempfile.TemporaryDirectory() as tmpdir:
            output_file = os.path.join(tmpdir, 'output.hdf5')

            options = [
                self.python_path,
                self.predixcan_path,
                '--bgens-dir',
                get_full_path('tests/data/set00/'),
                '--bgens-prefix',
                'chr',
                '--bgens-sample-file',
                get_full_path('tests/data/set00/impv1.sample'),
                '--weights-file',
                model_path,
                '--output-file',
                output_file,
            ]

            return_code = call(options)
            assert return_code == 0

            assert os.path.isfile(output_file)
            # The HDF5 file is closed by this inner block before the
            # temporary directory is cleaned up.
            with h5py.File(output_file, 'r') as hdf5_file:
                assert len(hdf5_file.keys()) == 3

                assert 'genes' in hdf5_file.keys()
                assert hdf5_file['genes'].shape == (1, )
                genes = [x.decode() for x in hdf5_file['genes']]
                assert genes[0] == 'gene00'

                assert 'pred_expr' in hdf5_file.keys()
                preds = hdf5_file['pred_expr']
                assert preds.shape == (1, 300)
                assert preds.chunks == (1, 300)

                # First sample.
                expected_first = (
                    0.3712 * (2 - dosage([0.74909, 0.01339, 0.23758])) +
                    0.0807 * (dosage([0.75232, 0.11729, 0.13050])))
                assert truncate(preds[0, 0]) == truncate(expected_first), \
                    preds[0, 0]

                # Last sample.
                expected_last = (
                    0.3712 * (2 - dosage([0.05763, 0.77338, 0.16910])) +
                    0.0807 * (dosage([0.00937, 0.13421, 0.85658])))
                assert truncate(preds[0, 299]) == truncate(expected_last), \
                    preds[0, 299]
    def test_alleles_in_bgen_order_gene(self):
        """Check a single-variant model whose expected value uses the dosage as is.

        The model holds one weight (``rs1`` for ``gene00`` with alleles
        ``('G', 'A')``). After running the prediction script on the
        ``tests/data/set00`` BGEN files, the ``pred_expr`` dataset must be a
        ``(1, 300)`` float32 dataset with ``scaleoffset == 4`` stored as one
        chunk, and the first sample must equal ``weight * dosage`` with no
        ``2 - dosage`` flip.

        The output is written to a temporary directory that is deleted when
        the test ends, whether it passes or fails.
        """
        model_path = _create_model('model00',
                                   [['rs1', 'gene00', 0.3712, 'G', 'A']])

        # TemporaryDirectory removes the directory and the HDF5 file inside it
        # on exit; a bare mkdtemp() left both behind after every run.
        with tempfile.TemporaryDirectory() as tmpdir:
            output_file = os.path.join(tmpdir, 'output.hdf5')

            options = [
                self.python_path,
                self.predixcan_path,
                '--bgens-dir',
                get_full_path('tests/data/set00/'),
                '--bgens-prefix',
                'chr',
                '--bgens-sample-file',
                get_full_path('tests/data/set00/impv1.sample'),
                '--weights-file',
                model_path,
                '--output-file',
                output_file,
            ]

            return_code = call(options)
            assert return_code == 0

            assert os.path.isfile(output_file)
            # The HDF5 file is closed by this inner block before the
            # temporary directory is cleaned up.
            with h5py.File(output_file, 'r') as hdf5_file:
                assert len(hdf5_file.keys()) == 3
                assert 'pred_expr' in hdf5_file.keys()

                preds = hdf5_file['pred_expr']
                assert preds.shape == (1, 300)
                assert preds.dtype == np.dtype('float32')
                assert preds.scaleoffset == 4
                assert preds.chunks == (1, 300)

                expected_first = 0.3712 * (
                    np.dot([0.74909, 0.01333, 0.23758], [0, 1, 2]))
                assert truncate(preds[0, 0]) == truncate(expected_first), \
                    preds[0, 0]
def _create_model(model_name, values):
    """Create a fresh SQLite weights database for a test and return its path.

    The database is written to ``tests/data/models/<model_name>.db`` (resolved
    through ``get_full_path``); a file already present at that path is deleted
    first. It contains a single ``weights`` table with the columns ``rsid``,
    ``gene``, ``weight``, ``ref_allele`` and ``eff_allele``.

    Args:
        model_name: Base name of the database file, without the ``.db``
            extension.
        values: Iterable of rows; items 0-4 of each row are stored as
            ``rsid``, ``gene``, ``weight``, ``ref_allele`` and ``eff_allele``.
            Each item must be a value ``sqlite3`` can bind as a parameter
            (``str``, ``int``, ``float``, ``bytes`` or ``None``).

    Returns:
        The path of the created database file.
    """
    model_path = get_full_path('tests/data/models/{}.db'.format(model_name))
    if os.path.exists(model_path):
        os.remove(model_path)

    # Index explicitly so a row shorter than five items still fails with
    # IndexError, as it did when the statement was built with str.format.
    rows = [(val[0], val[1], val[2], val[3], val[4]) for val in values]

    conn = sqlite3.connect(model_path)
    try:
        # ``with conn`` only commits (or rolls back) the transaction; it does
        # not close the connection, hence the explicit ``finally`` below.
        with conn:
            conn.execute("""
                CREATE TABLE weights
                ( "rsid" TEXT,
                    "gene" TEXT,
                    "weight" REAL,
                    "ref_allele" TEXT,
                    "eff_allele" TEXT
                );
            """)

            # Placeholders let sqlite3 do the quoting, so values containing a
            # quote character no longer break the statement.
            conn.executemany(
                """
                insert into weights (rsid, gene, weight, ref_allele, eff_allele)
                values (?, ?, ?, ?, ?);
                """,
                rows)
    finally:
        conn.close()

    return model_path
Beispiel #4
0
 def setUp(self):
     """Store the path of ``ukbrest/load_data.py``, then run the parent setUp."""
     script_relative_path = os.path.join('ukbrest', 'load_data.py')
     self.load_data_path = get_full_path(script_relative_path)

     # The attribute is set before delegating, in the same order as before,
     # so the parent setUp can still see it.
     super(LoadDataTest, self).setUp()
 def setUp(self):
     """Set the script and interpreter paths used to launch ``predict.py``.

     ``self.predixcan_path`` is the path of ``./predict.py`` resolved through
     ``get_full_path``. ``self.python_path`` is the interpreter of the
     ``predixcan_prediction`` conda environment when that file exists on this
     machine; otherwise it falls back to the interpreter running the tests.
     """
     # Local import: the file's import block is not visible from here.
     import sys

     self.predixcan_path = get_full_path(os.path.join('.', 'predict.py'))

     # This absolute path only exists on the original author's machine.
     # Without the fallback every test fails elsewhere because the
     # interpreter cannot be found.
     preferred_python = '/home/miltondp/software/miniconda3/envs/predixcan_prediction/bin/python'
     if os.path.isfile(preferred_python):
         self.python_path = preferred_python
     else:
         self.python_path = sys.executable
    def test_many_dosages_files(self):
        """Check predictions for 34 genes whose variants span two rsid ranges.

        The model has two variants per gene: 22 genes named ``gene000`` to
        ``gene021`` with rsids starting at ``rs1``, and 12 genes named
        ``gene200`` to ``gene211`` with rsids starting at ``rs2000001``.
        Genes 0, 1, 9, 10, 11 and 21 of each group get specific weights and
        alleles; every other gene uses weight 0.5 and alleles ``('G', 'C')``.

        After running the prediction script on the ``tests/data/set00`` BGEN
        files, the HDF5 output must hold three top-level entries: the 300
        sample ids ``'1'``..``'300'``, the 34 gene names in order, and a
        ``(34, 300)`` ``pred_expr`` dataset chunked as ``(10, 300)``. Selected
        cells are compared against hand-computed values and their truncated
        literals.

        The output is written to a temporary directory that is deleted when
        the test ends, whether it passes or fails.
        """
        alleles = {
            0: ('A', 'G'),
            1: ('C', 'A'),
            9: ('C', 'T'),
            10: ('G', 'T'),
            11: ('C', 'G'),
            21: ('G', 'A'),
        }

        weights = {
            0: 0.1158,
            1: 0.5455,
            9: 0.9876,
            10: 0.1755,
            11: 0.2754,
            21: 0.6855,
        }

        default_alleles = ('G', 'C')

        def model_rows(gene_format, gene_count, rsid_offset):
            # Two consecutive rsids per gene, starting at
            # rsid_offset + 10 * gene_number + 1.
            return [
                [
                    'rs{}'.format(i + 1),
                    gene_format.format(j),
                    weights.get(j, 0.5),
                    alleles.get(j, default_alleles)[0],
                    alleles.get(j, default_alleles)[1],
                ]
                for j in range(gene_count)
                for i in range(rsid_offset + j * 10, rsid_offset + j * 10 + 2)
            ]

        def dosage(probabilities):
            # Sum of the three values weighted by 0, 1 and 2 -- presumably the
            # expected allele count from genotype probabilities (confirm
            # against the BGEN fixtures).
            return np.dot(probabilities, [0, 1, 2])

        def flipped(probabilities):
            # Dosage counted for the other allele.
            return 2 - np.dot(probabilities, [0, 1, 2])

        model_path = _create_model(
            'model00',
            model_rows('gene{:0>3d}', 21 + 1, 0) +
            model_rows('gene2{:0>2d}', 11 + 1, 2000000),
        )

        # TemporaryDirectory removes the directory and the HDF5 file inside it
        # on exit; a bare mkdtemp() left both behind after every run.
        with tempfile.TemporaryDirectory() as tmpdir:
            output_file = os.path.join(tmpdir, 'output.hdf5')

            options = [
                self.python_path,
                self.predixcan_path,
                '--bgens-dir',
                get_full_path('tests/data/set00/'),
                '--bgens-prefix',
                'chr',
                '--bgens-sample-file',
                get_full_path('tests/data/set00/impv1.sample'),
                '--weights-file',
                model_path,
                '--output-file',
                output_file,
            ]

            return_code = call(options)
            assert return_code == 0

            assert os.path.isfile(output_file)
            # The HDF5 file is closed by this inner block before the
            # temporary directory is cleaned up.
            with h5py.File(output_file, 'r') as hdf5_file:
                assert len(hdf5_file.keys()) == 3

                assert 'samples' in hdf5_file.keys()
                samples = hdf5_file['samples']
                assert samples.shape == (300, )
                assert all(samples[:].astype(str) == np.array(
                    [str(x) for x in range(1, 300 + 1)]))

                assert 'genes' in hdf5_file.keys()
                assert hdf5_file['genes'].shape == (22 + 12, )
                genes = [x.decode() for x in hdf5_file['genes']]
                expected_gene_names = [
                    (0, 'gene000'),
                    (1, 'gene001'),
                    (20, 'gene020'),
                    (21, 'gene021'),
                    (22, 'gene200'),
                    (23, 'gene201'),
                    (32, 'gene210'),
                    (-1, 'gene211'),
                ]
                for position, gene_name in expected_gene_names:
                    assert genes[position] == gene_name

                assert 'pred_expr' in hdf5_file.keys()
                preds = hdf5_file['pred_expr']
                assert preds.shape == (22 + 12, 300)
                assert preds.chunks == (10, 300)

                # Each entry: (gene row, sample column, truncated literal,
                # untruncated hand-computed value).
                checks = [
                    # genes from chr1
                    # gene00
                    (0, 0, 0.2188,
                     0.1158 * flipped([0.74909, 0.01333, 0.23740]) +
                     0.1158 * dosage([0.75232, 0.11725, 0.13030])),
                    (0, 299, 0.3167,
                     0.1158 * flipped([0.05763, 0.77328, 0.16910]) +
                     0.1158 * dosage([0.00937, 0.13421, 0.85640])),
                    # gene01
                    (1, 0, 2.1534,
                     0.5455 * flipped([0.96807, 0.01962, 0.01231]) +
                     0.5455 * dosage([0.00190, 0.00429, 0.99381])),
                    (1, 1, 1.3260,
                     0.5455 * flipped([0.91510, 0.06826, 0.01669]) +
                     0.5455 * dosage([0.70937, 0.04896, 0.24177])),
                    # gene09
                    (9, 0, 2.4564,
                     0.9876 * flipped([0.74754, 0.13307, 0.11935]) +
                     0.9876 * flipped([0.03755, 0.78400, 0.17849])),
                    (9, 298, 2.3477,
                     0.9876 * flipped([0.71102, 0.00968, 0.27929]) +
                     0.9876 * flipped([0.08631, 0.77275, 0.14089])),
                    # gene10
                    (10, 0, 0.091,
                     0.1755 * flipped([0.05931, 0.08242, 0.85827]) +
                     0.1755 * dosage([0.83525, 0.01184, 0.15291])),
                    (10, 2, 0.4353,
                     0.1755 * flipped([0.61247, 0.22145, 0.16605]) +
                     0.1755 * dosage([0.09727, 0.77103, 0.13170])),
                    # gene11
                    (11, 0, 0.0697,
                     0.2754 * dosage([0.83211, 0.12816, 0.03970]) +
                     0.2754 * dosage([0.96441, 0.02577, 0.00990])),
                    (11, 299, 0.7745,
                     0.2754 * dosage([0.04018, 0.84357, 0.11625]) +
                     0.2754 * dosage([0.07541, 0.11284, 0.81175])),
                    # gene21
                    (21, 0, 0.4617,
                     0.6855 * dosage([0.73030, 0.13711, 0.13255]) +
                     0.6855 * flipped([0.11456, 0.04225, 0.84315])),
                    (21, 299, 2.2915,
                     0.6855 * dosage([0.13023, 0.18599, 0.68379]) +
                     0.6855 * flipped([0.85909, 0.07115, 0.06976])),
                    # genes from chr 2
                    # gene200
                    (22, 0, 0.2197,
                     0.1158 * dosage([0.96459, 0.02124, 0.01418]) +
                     0.1158 * flipped([0.91804, 0.01235, 0.06966])),
                    (22, 299, 0.3246,
                     0.1158 * dosage([0.15472, 0.80145, 0.04384]) +
                     0.1158 * flipped([0.95387, 0.00694, 0.03919])),
                    # gene211
                    (33, 0, 0.337,
                     0.2754 * dosage([0.07602, 0.84035, 0.08360]) +
                     0.2754 * dosage([0.79911, 0.18546, 0.01550])),
                    (33, 299, 0.6554,
                     0.2754 * dosage([0.18570, 0.81029, 0.00390]) +
                     0.2754 * dosage([0.21121, 0.01581, 0.77310])),
                ]

                for gene_idx, sample_idx, rounded, raw_expected in checks:
                    observed = preds[gene_idx, sample_idx]
                    # The indices are part of the message because a failure
                    # inside the loop no longer points at a distinct line.
                    assert (truncate(observed) == truncate(raw_expected)
                            == rounded), (gene_idx, sample_idx, observed)
    def test_two_rsids_for_one_gene_one_rsid_for_another_gene(self):
        """Check predictions for two genes with two and one variants.

        The model holds two weights for ``gene00`` (``rs1`` and ``rs2``) and
        one for ``gene01`` (``rs10``). After running the prediction script on
        the ``tests/data/set00`` BGEN files, the HDF5 output must hold three
        top-level entries: the 300 sample ids ``'1'``..``'300'``, the two gene
        names in order, and a ``(2, 300)`` ``pred_expr`` dataset stored as one
        chunk. Selected cells are compared against hand-computed values and
        their truncated literals.

        The output is written to a temporary directory that is deleted when
        the test ends, whether it passes or fails.
        """
        def dosage(probabilities):
            # Sum of the three values weighted by 0, 1 and 2 -- presumably the
            # expected allele count from genotype probabilities (confirm
            # against the BGEN fixtures).
            return np.dot(probabilities, [0, 1, 2])

        model_path = _create_model(
            'model00',
            [
                ['rs1', 'gene00', 0.3712, 'A', 'G'],  # ambiguous
                ['rs2', 'gene00', 0.0807, 'G', 'C'],  # non-ambiguous
                ['rs10', 'gene01', 0.6188, 'A', 'T'],  # non-ambiguous
            ])

        # TemporaryDirectory removes the directory and the HDF5 file inside it
        # on exit; a bare mkdtemp() left both behind after every run.
        with tempfile.TemporaryDirectory() as tmpdir:
            output_file = os.path.join(tmpdir, 'output.hdf5')

            options = [
                self.python_path,
                self.predixcan_path,
                '--bgens-dir',
                get_full_path('tests/data/set00/'),
                '--bgens-prefix',
                'chr',
                '--bgens-sample-file',
                get_full_path('tests/data/set00/impv1.sample'),
                '--weights-file',
                model_path,
                '--output-file',
                output_file,
            ]

            return_code = call(options)
            assert return_code == 0

            assert os.path.isfile(output_file)
            # The HDF5 file is closed by this inner block before the
            # temporary directory is cleaned up.
            with h5py.File(output_file, 'r') as hdf5_file:
                assert len(hdf5_file.keys()) == 3

                assert 'samples' in hdf5_file.keys()
                samples = hdf5_file['samples']
                assert samples.shape == (300, )
                assert all(samples[:].astype(str) == np.array(
                    [str(x) for x in range(1, 300 + 1)]))

                assert 'genes' in hdf5_file.keys()
                assert hdf5_file['genes'].shape == (2, )
                genes = [x.decode() for x in hdf5_file['genes']]
                assert genes[0] == 'gene00'
                assert genes[1] == 'gene01'

                assert 'pred_expr' in hdf5_file.keys()
                preds = hdf5_file['pred_expr']
                assert preds.shape == (2, 300)
                assert preds.chunks == (2, 300)

                # Each entry: (gene row, sample column, truncated literal,
                # untruncated hand-computed value).
                checks = [
                    # gene00
                    (0, 0, 0.5916,
                     0.3712 * (2 - dosage([0.74909, 0.01333, 0.23758])) +
                     0.0807 * (dosage([0.75232, 0.11749, 0.13040]))),
                    (0, 299, 0.4788,
                     0.3712 * (2 - dosage([0.05763, 0.77328, 0.16910])) +
                     0.0807 * (dosage([0.00937, 0.13421, 0.85650]))),
                    # gene01
                    (1, 0, 0.5571,
                     0.6188 * (dosage([0.11764, 0.86431, 0.01805]))),
                    (1, 298, 0.6819,
                     0.6188 * (dosage([0.03509, 0.82789, 0.13705]))),
                ]

                for gene_idx, sample_idx, rounded, raw_expected in checks:
                    observed = preds[gene_idx, sample_idx]
                    # The indices are part of the message because a failure
                    # inside the loop no longer points at a distinct line.
                    assert (truncate(observed) == truncate(raw_expected)
                            == rounded), (gene_idx, sample_idx, observed)