Esempio n. 1
0
def test_bgen_reader_with_wrong_metadata_file():
    """Expect ``RuntimeError`` when ``read_bgen`` is handed ``wrong.metadata``."""
    bgen_path = example_filepath("example.32bits.bgen")
    bgen_path.touch()
    # Touch the metafile after the bgen file so that it carries the later
    # timestamp (otherwise, it might be re-created).
    bad_metafile = example_filepath("wrong.metadata")
    bad_metafile.touch()
    with pytest.raises(RuntimeError):
        read_bgen(bgen_path, metafile_filepath=bad_metafile, verbose=False)
Esempio n. 2
0
def test_metafile_provided_not_supported_anymore():
    """Expect ``RuntimeError`` when ``read_bgen`` gets the ``.metadata.valid`` example."""
    with pytest.raises(RuntimeError):
        # The path lookups stay inside the ``raises`` block, as before.
        bgen_path = example_filepath("haplotypes.bgen")
        metafile = example_filepath("haplotypes.bgen.metadata.valid")
        read_bgen(bgen_path, metafile_filepath=metafile, verbose=False)
Esempio n. 3
0
def test_allele_expectation():
    """Check ``open_bgen.allele_expectation`` for one cell and for empty selections.

    Covers the default call and ``assume_constant_ploidy=False``, and
    expects a ``ValueError`` for ``haplotypes.bgen`` with no index.
    """
    path = example_filepath("example.32bits.bgen")
    with open_bgen(path, verbose=False) as reader:
        cell = np.s_[reader.samples == "sample_005", reader.rsids == "RSID_6"]
        expectation = reader.allele_expectation(cell)
        assert np.allclose(expectation, [[[1.01086423, 0.98913577]]])

    with pytest.raises(ValueError):
        path = example_filepath("haplotypes.bgen")
        with open_bgen(path, verbose=False) as reader:
            reader.allele_expectation()

    path = example_filepath("example.32bits.bgen")
    with open_bgen(path, verbose=False) as reader:
        no_variants = np.s_[:, []]
        expectation = reader.allele_expectation(no_variants)
        assert expectation.shape == (500, 0, 2)

    path = example_filepath("example.32bits.bgen")
    with open_bgen(path, verbose=False) as reader:
        cell = np.s_[reader.samples == "sample_005", reader.rsids == "RSID_6"]
        expectation = reader.allele_expectation(cell, assume_constant_ploidy=False)
        assert np.allclose(expectation, [[[1.01086423, 0.98913577]]])

    path = example_filepath("example.32bits.bgen")
    with open_bgen(path, verbose=False) as reader:
        no_variants = np.s_[:, []]
        expectation = reader.allele_expectation(no_variants, assume_constant_ploidy=False)
        assert expectation.shape == (500, 0, 2)
Esempio n. 4
0
def test_bgen_samples_outside_bgen_unreadable(tmp_path):
    """Expect ``PermissionError`` when the samples file is wrapped in ``noread_permission``."""
    bgen_path = example_filepath("complex.23bits.bgen")
    # Work on a copy under tmp_path rather than on the example file itself.
    samples_copy = tmp_path / "complex.sample"
    copyfile(example_filepath("complex.sample"), samples_copy)
    with noread_permission(samples_copy), pytest.raises(PermissionError):
        read_bgen(bgen_path, verbose=False, samples_filepath=samples_copy)
Esempio n. 5
0
def test_bgen_samples_specify_samples_file():
    """Check that ``read_bgen`` returns the ids from an explicitly given samples file."""
    bgen_path = example_filepath("complex.23bits.bgen")
    sample_path = example_filepath("complex.sample")
    data = read_bgen(bgen_path, samples_filepath=sample_path, verbose=False)
    expected = Series(
        ["sample_0", "sample_1", "sample_2", "sample_3"],
        dtype=str,
        name="id",
    )
    assert all(data["samples"] == expected)
Esempio n. 6
0
def test_allele_expectation_interface():
    """Check ``allele_expectation``: ``ValueError`` on haplotypes, values on complex."""
    haplotypes = read_bgen(example_filepath("haplotypes.bgen"), verbose=False)
    with pytest.raises(ValueError):
        allele_expectation(haplotypes, 1)

    complex_bgen = read_bgen(example_filepath("complex.23bits.bgen"), verbose=False)
    expected = [
        [1.0, 0.0, 0.0],
        [2.0, 0.0, 0.0],
        [1.0, 1.0, 0.0],
        [0.0, 2.0, 0.0],
    ]
    assert_allclose(allele_expectation(complex_bgen, 3), expected)
Esempio n. 7
0
def test_zero_width():
    """Check result shapes when the variant and/or sample selection is empty.

    Runs each case with ``assume_constant_ploidy`` set to False and True.
    """
    filepath = example_filepath("complex.bgen")
    with open_bgen(filepath, allow_complex=True, verbose=False) as bgen:
        for constant_ploidy in (False, True):
            # Empty variant selection.
            expectation = bgen.allele_expectation(
                [], assume_constant_ploidy=constant_ploidy)
            frequency = bgen.allele_frequency(expectation)
            assert expectation.shape == (bgen.nsamples, 0, bgen.nalleles[0])
            assert frequency.shape == (0, bgen.nalleles[0])

            # Empty sample selection over the unphased, two-allele variants.
            good_variants = logical_not(bgen.phased) * (bgen.nalleles == 2)
            expectation = bgen.allele_expectation(
                ([], good_variants), assume_constant_ploidy=constant_ploidy)
            frequency = bgen.allele_frequency(expectation)
            assert expectation.shape == (0, sum(good_variants), bgen.nalleles[0])
            # We define the freq of something with no samples as 0
            no_sample_freq = zeros((sum(good_variants), bgen.nalleles[0]))
            assert_equal(frequency, no_sample_freq)

            # Both selections empty.
            expectation = bgen.allele_expectation(
                ([], []), assume_constant_ploidy=constant_ploidy)
            frequency = bgen.allele_frequency(expectation)
            assert expectation.shape == (0, 0, bgen.nalleles[0])
            assert frequency.shape == (0, bgen.nalleles[0])
Esempio n. 8
0
def test_dosage_example_32bits():
    """Check allele expectations from ``open_bgen`` for variant indices 5 and 0."""
    with open_bgen(example_filepath("example.32bits.bgen"), verbose=False) as bgen:
        expectation = bgen.allele_expectation([5, 0])
        expected_sample_7 = [1.9556273911044997, 0.044372608895500334]
        assert_allclose(expectation[7, 0, :], expected_sample_7)
        assert all(isnan(expectation[0, 1, :]))
        assert_equal(expectation.shape, (500, 2, 2))
Esempio n. 9
0
    def test_coverage(self):
        """Exercise ``Bgen`` construction, a single-cell read, and ``Bgen.write``.

        The test opens the example file with a custom ``iid_function``, moves
        the reader's metadata file aside, re-opens the file with defaults,
        writes a 100x100 subset twice to the same path inside ``temp``, and
        finally writes data generated by ``DistGen``.
        """
        from pysnptools.distreader import DistGen

        with example_filepath("example.32bits.bgen") as filepath:
            # Open with an iid_function that places "X" first in every iid pair.
            bgen = Bgen(filepath,
                        fresh_properties=False,
                        iid_function=lambda sam: ("X", sam))
            assert bgen.iid[0, 0] == "X"
            metadata_filepath = bgen._open_bgen._metadata2_path
            metadata2_temp = metadata_filepath.parent / (
                metadata_filepath.name + ".temp")
            del bgen
            # Move the metadata file aside, replacing any leftover ".temp" copy.
            # NOTE(review): presumably this makes the next Bgen(filepath)
            # rebuild its metadata -- confirm.
            if metadata2_temp.exists():
                metadata2_temp.unlink()
            os.rename(metadata_filepath, metadata2_temp)
            # Re-open with default arguments; the first iid field is now "0".
            bgen = Bgen(filepath)
            assert bgen.iid[0, 0] == "0"
            bgen[0, 0].read(order='A')
            if not os.path.exists("temp"):
                os.mkdir("temp")
            # NOTE(review): the working directory is not restored if a write
            # below raises.
            os.chdir("temp")
            file1x = "coverage.bgen"
            # The same path is written twice, so the second call runs with the
            # output file already present.
            Bgen.write(file1x, bgen[:100, :100])
            Bgen.write(file1x, bgen[:100, :100])
            os.chdir("..")

        # Write generated data (10010 iids, 5 sids), keeping temp files.
        distgen0data = DistGen(seed=332, iid_count=10010, sid_count=5).read()
        file1 = "temp/roundtrip1-big.bgen"
        bed3 = Bgen.write(file1,
                          distgen0data,
                          bits=8,
                          compression="zlib",
                          cleanup_temp_files=False,
                          sample_function=lambda fam, ind: f'{fam},{ind}')
        # NOTE(review): the purpose of this assignment is not evident from the
        # visible code.
        bed3.iid[0, 0] = '0'
Esempio n. 10
0
def test_bgen_reader_phased_genotype():
    """Check variants, samples and first/last probabilities of ``haplotypes.bgen``."""
    bgen = read_bgen(example_filepath("haplotypes.bgen"), verbose=False)
    variants = bgen["variants"]
    samples = bgen["samples"]

    # (variant index, expected id, expected pos, expected rsid)
    expected_variants = ((0, "SNP1", 1, "RS1"), (2, "SNP3", 3, "RS3"))
    for index, snp_id, position, rsid in expected_variants:
        row = variants.loc[index].compute()
        assert_equal(row["chrom"].values[0], "1")
        assert_equal(row["id"].values[0], snp_id)
        assert_equal(row["nalleles"].values[0], 2)
        assert_equal(row["allele_ids"].values[0], "A,G")
        assert_equal(row["pos"].values[0], position)
        assert_equal(row["rsid"].values[0], rsid)

    assert_equal(samples.loc[0], "sample_0")
    assert_equal(samples.loc[2], "sample_2")

    last_sample = samples.shape[0] - 1
    assert_equal(samples.loc[last_sample], "sample_3")

    first_genotype = bgen["genotype"][0].compute()
    assert_allclose(first_genotype["probs"][0], [1.0, 0.0, 1.0, 0.0])
    variant_count = len(variants)
    sample_count = len(samples)
    last_genotype = bgen["genotype"][variant_count - 1].compute()
    assert_allclose(last_genotype["probs"][sample_count - 1], [1.0, 0.0, 0.0, 1.0])
Esempio n. 11
0
def test_bgen_reader_with_nonexistent_metadata_file():
    """Expect ``FileNotFoundError`` and a ``UserWarning`` for a missing metafile path."""
    filepath = example_filepath("example.32bits.bgen")
    missing_metafile = os.path.join(os.path.dirname(filepath), "nonexistent.metadata")

    with pytest.raises(FileNotFoundError), pytest.warns(UserWarning):
        read_bgen(filepath, metafile_filepath=missing_metafile, verbose=False)
Esempio n. 12
0
def test_bgen_samples_specify_samples_file():
    """Check that ``open_bgen`` reports the ids from an explicit samples file.

    The reader is opened as a context manager so that it is closed even when
    the assertion fails; previously the ``open_bgen`` object was never closed.
    """
    with open_bgen(
            example_filepath2("complex.23bits.bgen"),
            samples_filepath=example_filepath("complex.sample"),
            verbose=False,
    ) as data:
        samples = ["sample_0", "sample_1", "sample_2", "sample_3"]
        assert all(data.samples == samples)
Esempio n. 13
0
def test_metafile_not_provided_no_permission_to_create(tmp_path):
    """Expect a ``UserWarning`` from ``read_bgen`` under ``nowrite_permission``."""
    original = example_filepath("haplotypes.bgen")
    bgen_copy = tmp_path / "haplotypes.bgen"
    copyfile(original, bgen_copy)
    containing_dir = os.path.dirname(bgen_copy)
    with nowrite_permission(containing_dir), pytest.warns(UserWarning):
        read_bgen(bgen_copy, verbose=False)
Esempio n. 14
0
def test_read_bgem_interface():
    """Check the container types of the dictionary returned by ``read_bgen``."""
    bgen = read_bgen(example_filepath("haplotypes.bgen"), verbose=False)
    assert isinstance(bgen, dict)
    expected_types = (
        ("variants", dd.DataFrame),
        ("samples", Series),
        ("genotype", list),
    )
    for key, expected_type in expected_types:
        assert isinstance(bgen[key], expected_type)
    assert isinstance(bgen["genotype"][0], Delayed)
Esempio n. 15
0
def example_filepath2(filename):
    """Return the example file path after removing its existing metadata files.

    For both values of ``allow_complex``, the path given by
    ``open_bgen._metadata_path_from_filename`` is deleted if it exists.
    """
    filepath = example_filepath(filename)
    # Lazy generator: each path is computed just before it is checked.
    metadata_paths = (
        open_bgen._metadata_path_from_filename(
            filepath, samples_filepath=None, allow_complex=flag)
        for flag in (False, True)
    )
    for metadata_path in metadata_paths:
        if metadata_path.exists():
            metadata_path.unlink()
    return filepath
Esempio n. 16
0
def test_freq():
    """Check the allele frequencies computed for the variant with rsid ``RSID_6``."""
    with open_bgen(example_filepath("example.32bits.bgen"), verbose=False) as bgen:
        is_rsid_6 = bgen.rsids == "RSID_6"
        expectation = bgen.allele_expectation(is_rsid_6)
        frequency = bgen.allele_frequency(expectation)
        assert_allclose(frequency[0, 0], 229.23103218810434)
        assert_allclose(frequency[0, 1], 270.7689678118956)
Esempio n. 17
0
def test_threads():
    """Check the shape returned by ``read`` with 1 and 2 threads, full and empty."""
    with open_bgen(example_filepath("example.32bits.bgen"), verbose=False) as bgen2:
        for thread_count in (1, 2):
            # ``index`` rather than ``slice``, to avoid shadowing the builtin.
            for index in (np.s_[:, :], np.s_[:, []]):
                sample_index, variant_index = index
                val = bgen2.read(index=index, num_threads=thread_count)
                expected_rows = len(bgen2.samples[sample_index])
                expected_cols = len(bgen2.ids[variant_index])
                assert val.shape == (expected_rows, expected_cols, 3)
Esempio n. 18
0
def test_bgen_samples_specify_samples_file():
    """Check ``open_bgen`` sample ids from an explicit samples file (complex allowed)."""
    bgen_path = example_filepath2("complex.23bits.bgen")
    sample_path = example_filepath("complex.sample")
    with open_bgen(bgen_path,
                   samples_filepath=sample_path,
                   allow_complex=True,
                   verbose=False) as data:
        expected = ["sample_0", "sample_1", "sample_2", "sample_3"]
        assert all(data.samples == expected)
Esempio n. 19
0
def test_bgen_reader_without_metadata():
    """Check ``read_bgen`` output when no metafile path is passed."""
    bgen = read_bgen(example_filepath("example.32bits.bgen"), verbose=False)
    variants = bgen["variants"].compute()
    samples = bgen["samples"]
    assert "genotype" in bgen
    assert_equal(variants.loc[7, "allele_ids"], "A,G")
    last_sample = samples.shape[0] - 1
    assert_equal(samples.loc[last_sample], "sample_500")
Esempio n. 20
0
 def test_bgen_samples_inside_bgen(self):
     """Check the first four ``iid`` pairs that ``Bgen`` reports for the example file."""
     with example_filepath("example.32bits.bgen") as filepath:
         first_four = Bgen(filepath).iid[:4]
         expected = [
             ("0", "sample_001"),
             ("0", "sample_002"),
             ("0", "sample_003"),
             ("0", "sample_004"),
         ]
         matches = first_four == expected
         assert matches.all()
Esempio n. 21
0
def test_create_metadata_file(tmp_path):
    """Check that ``create_metafile`` produces a file at the requested path."""
    bgen_path = example_filepath("example.32bits.bgen")
    target = tmp_path.joinpath(bgen_path.name + ".metadata")

    try:
        create_metafile(bgen_path, target, verbose=False)
        assert os.path.exists(target)
    finally:
        # Remove the metafile whether or not the assertion held.
        if os.path.exists(target):
            os.remove(target)
Esempio n. 22
0
def test_bgen_reader_complex_sample_file():
    """Check variants, samples and genotype flags with an explicit samples file."""
    bgen = read_bgen(
        example_filepath("complex.23bits.bgen"),
        samples_filepath=example_filepath("complex.sample"),
        verbose=False,
    )
    variants = bgen["variants"].compute()
    samples = bgen["samples"]
    assert "genotype" in bgen

    last_row = variants.shape[0] - 1
    # (row label, nalleles, allele_ids, pos, rsid); chrom and id are the same
    # for every checked row.
    expected_rows = (
        (0, 2, "A,G", 1, "V1"),
        (7, 7, "A,G,GT,GTT,GTTT,GTTTT,GTTTTT", 8, "M8"),
        (last_row, 2, "A,G", 10, "M10"),
    )
    for row, nalleles, allele_ids, pos, rsid in expected_rows:
        assert_equal(variants.loc[row, "chrom"], "01")
        assert_equal(variants.loc[row, "id"], "")
        assert_equal(variants.loc[row, "nalleles"], nalleles)
        assert_equal(variants.loc[row, "allele_ids"], allele_ids)
        assert_equal(variants.loc[row, "pos"], pos)
        assert_equal(variants.loc[row, "rsid"], rsid)

    assert_equal(samples.loc[0], "sample_0")
    assert_equal(samples.loc[3], "sample_3")

    genotypes = bgen["genotype"]
    ploidy = genotypes[2].compute()["ploidy"]
    missing = genotypes[2].compute()["missing"]
    variant_count = len(variants)
    phased = [genotypes[i].compute()["phased"] for i in range(variant_count)]
    assert_allclose(ploidy, [1, 2, 2, 2])
    assert_allclose(missing, [0, 0, 0, 0])
    assert_allclose(phased, [0, 1, 1, 0, 1, 1, 1, 1, 0, 0])
0
def test_dosage_example_32bits():
    """Check ``allele_expectation`` on ``read_bgen`` output for variants 5 and 0."""
    bgen = read_bgen(example_filepath("example.32bits.bgen"), verbose=False)

    variant_5 = allele_expectation(bgen, 5)
    assert_allclose(variant_5[7], [1.9556273911044997, 0.044372608895500334])

    variant_0 = allele_expectation(bgen, 0)
    assert all(isnan(variant_0[0]))

    # Computed a second time for the shape check, as before.
    variant_0 = allele_expectation(bgen, 0)
    assert_equal(variant_0.shape, (500, 2))
Esempio n. 24
0
def test_bgen_reader_complex():
    """Check variants, samples, probabilities, ploidy and phasing of the complex file."""
    bgen = read_bgen(example_filepath("complex.23bits.bgen"), verbose=False)
    variants = bgen["variants"].compute()
    samples = bgen["samples"]
    assert "genotype" in bgen

    last_row = variants.shape[0] - 1
    # (row label, nalleles, allele_ids, pos, rsid); chrom and id are the same
    # for every checked row.
    expected_rows = (
        (0, 2, "A,G", 1, "V1"),
        (7, 7, "A,G,GT,GTT,GTTT,GTTTT,GTTTTT", 8, "M8"),
        (last_row, 2, "A,G", 10, "M10"),
    )
    for row, nalleles, allele_ids, pos, rsid in expected_rows:
        assert_equal(variants.loc[row, "chrom"], "01")
        assert_equal(variants.loc[row, "id"], "")
        assert_equal(variants.loc[row, "nalleles"], nalleles)
        assert_equal(variants.loc[row, "allele_ids"], allele_ids)
        assert_equal(variants.loc[row, "pos"], pos)
        assert_equal(variants.loc[row, "rsid"], rsid)

    assert_equal(samples.loc[0], "sample_0")
    assert_equal(samples.loc[3], "sample_3")

    genotypes = bgen["genotype"]

    probs = genotypes[0].compute()["probs"][0]
    assert_allclose(probs[:2], [1, 0])
    assert isnan(probs[2])

    probs = genotypes[0].compute()["probs"][1]
    assert_allclose(probs[:3], [1, 0, 0])

    probs = genotypes[-1].compute()["probs"][-1]
    assert_allclose(probs[:5], [0, 0, 0, 1, 0])

    first_ploidy = genotypes[0].compute()["ploidy"]
    assert_allclose(first_ploidy, [1, 2, 2, 2])
    last_ploidy = genotypes[-1].compute()["ploidy"]
    assert_allclose(last_ploidy, [4, 4, 4, 4])

    variant_count = len(variants)
    phased = array(
        [genotypes[i].compute()["phased"] for i in range(variant_count)])
    assert_equal(phased.dtype.name, "bool")
    ideal = array([False, True, True, False, True, True, True, True, False, False])
    assert array_equal(phased, ideal)
Esempio n. 25
0
def test_dosage1():
    """Check the dosage of the allele at position 1 for variant index 3."""
    with open_bgen(example_filepath("example.32bits.bgen"), verbose=False) as bgen:
        expectation = bgen.allele_expectation(3)
        # The dosage is the expectation of the allele treated as the
        # alternative one; here that is the allele in position 1.
        alt_allele_index = 1
        dosage = expectation[..., alt_allele_index]
        # Only the first two samples are compared.
        expected_first_two = [1.9618530841455453, 0.009826655967586362]
        assert_allclose(dosage[:2, 0], expected_first_two)
Esempio n. 26
0
def test_bgen_reader_variants_info():
    """Check variant metadata, sample ids and some probabilities of the example file."""
    bgen = read_bgen(example_filepath("example.32bits.bgen"), verbose=False)
    variants = bgen["variants"]
    samples = bgen["samples"]
    assert "genotype" in bgen

    variants = variants.compute()
    last_row = variants.shape[0] - 1
    # (row label, id, pos, rsid); chrom, nalleles and allele_ids are the same
    # for every checked row.
    expected_rows = (
        (0, "SNPID_2", 2000, "RSID_2"),
        (7, "SNPID_9", 9000, "RSID_9"),
        (last_row, "SNPID_200", 100001, "RSID_200"),
    )
    for row, snp_id, pos, rsid in expected_rows:
        assert_equal(variants.loc[row, "chrom"], "01")
        assert_equal(variants.loc[row, "id"], snp_id)
        assert_equal(variants.loc[row, "nalleles"], 2)
        assert_equal(variants.loc[row, "allele_ids"], "A,G")
        assert_equal(variants.loc[row, "pos"], pos)
        assert_equal(variants.loc[row, "rsid"], rsid)

    assert_equal(samples.loc[0], "sample_001")
    assert_equal(samples.loc[7], "sample_008")

    last_sample = samples.shape[0] - 1
    assert_equal(samples.loc[last_sample], "sample_500")

    genotypes = bgen["genotype"]

    probs = genotypes[0].compute()["probs"]
    assert all(isnan(probs[0, :]))

    probs = genotypes[0].compute()["probs"]
    expected_sample_1 = [0.027802362811705648, 0.00863673794284387, 0.9635608992454505]
    assert_allclose(probs[1, :], expected_sample_1)

    expected_sample_2 = [
        0.97970582847010945215516,
        0.01947019668749305418287,
        0.00082397484239749366197,
    ]
    probs = genotypes[1].compute()["probs"]
    assert_allclose(probs[2, :], expected_sample_2)
Esempio n. 27
0
def test_coverage3():
    """Expect ``ValueError`` for a wrong-size samples file and for ``complex.bgen``."""
    with pytest.raises(ValueError):
        # Wrong size sample file
        mismatched = open_bgen(
            example_filepath2("example.bgen"),
            samples_filepath=example_filepath("complex.sample"),
            verbose=False,
        )
        with mismatched:
            pass

    with pytest.raises(ValueError):
        # Opened without ``allow_complex``.
        complex_reader = open_bgen(example_filepath2("complex.bgen"), verbose=False)
        with complex_reader:
            pass
Esempio n. 28
0
def test_allele_frequency_interface():
    """Check ``allele_frequency`` on computed and literal expectations, plus errors."""
    filepath = example_filepath("complex.23bits.bgen")
    with pytest.raises(ValueError):
        # Reading stays inside the ``raises`` block, as before.
        rejected = read_bgen(filepath, verbose=False)
        allele_expectation(rejected, 1)

    expected_freq = [1.33333333333, 1.0, 0.0]

    bgen = read_bgen(filepath, verbose=False)
    freq = allele_frequency(allele_expectation(bgen, 3))
    assert_allclose(freq, expected_freq)

    literal_expectation = [
        [1.0, 0.0, 0.0],
        [2.0, 0.0, 0.0],
        [1.0, 1.0, 0.0],
        [0.0, 2.0, 0.0],
    ]
    freq = allele_frequency(literal_expectation)
    assert_allclose(freq, expected_freq)

    with pytest.raises(ValueError):
        allele_frequency([2, 3, 1])
Esempio n. 29
0
def test_error():
    """Check which ``allele_expectation`` calls on ``complex.bgen`` raise ``ValueError``."""
    with open_bgen(example_filepath("complex.bgen"), allow_complex=True,
                   verbose=False) as bgen:
        # some phased
        with pytest.raises(ValueError):
            bgen.allele_expectation()

        # different #'s of alleles
        with pytest.raises(ValueError):
            bgen.allele_expectation(logical_not(bgen.phased))

        # nonconstant ploidy
        with pytest.raises(ValueError):
            bgen.allele_expectation(
                logical_not(bgen.phased) * (bgen.nalleles == 2))

        # The same selection is accepted once constant ploidy is not assumed.
        unphased_biallelic = logical_not(bgen.phased) * (bgen.nalleles == 2)
        expectation = bgen.allele_expectation(
            unphased_biallelic, assume_constant_ploidy=False)
        frequency = bgen.allele_frequency(expectation)
        assert_allclose(expectation[-1, -1, :], [1.0, 3.0])
        assert_allclose(frequency[-1, :], [5.0, 3.0])
Esempio n. 30
0
def test_dosage2():
    """Build probability, expectation and dosage tables for variant index 3."""
    import numpy as np
    import pandas as pd

    with open_bgen(example_filepath("example.32bits.bgen"), verbose=False) as bgen:
        variant_index = [3]
        assert bgen.ids[variant_index] == "SNPID_5"
        assert bgen.rsids[variant_index] == "RSID_5"

        probs, missing, ploidy = bgen.read(
            variant_index, return_missings=True, return_ploidies=True)
        assert not np.any(missing)
        assert np.all(ploidy == 2)

        # One column per probability slot "0", "1", "2" of the single variant.
        prob_columns = {str(slot): probs[:, 0, slot] for slot in range(3)}
        df1 = pd.DataFrame({"sample": bgen.samples, **prob_columns})
        assert_allclose(df1.iloc[-1, -1], 0.015471935508649781)

        alleles_per_variant = [
            ids.split(",") for ids in bgen.allele_ids[variant_index]
        ]
        first_allele, second_allele = alleles_per_variant[0][0], alleles_per_variant[0][1]
        e = bgen.allele_expectation(variant_index)
        f = bgen.allele_frequency(e)
        df2 = pd.DataFrame({
            "sample": bgen.samples,
            first_allele: e[:, 0, 0],
            second_allele: e[:, 0, 1],
        })
        assert_allclose(df2.iloc[-1, -1], 1.0152583189809832)

        # The alternative allele is the one with the smallest frequency.
        alt_index = f[0, :].argmin()
        alt = alleles_per_variant[0][alt_index]
        df4 = pd.DataFrame({
            "sample": bgen.samples,
            f"alt={alt}": e[:, 0, alt_index],
        })
        assert_allclose(df4.iloc[-1, -1], 1.0152583189809832)