Esempio n. 1
0
def test_c_reader_bed(shared_datadir):
    """Compare reads of the reference .bed file with the stored reference values.

    For each ``force_python_only`` setting three reads are checked:

    * default dtype with ``count_A1=False`` (must come back as float32),
    * int8 with ``count_A1=False``,
    * float64 with the default allele counting.
    """
    bed_path = shared_datadir / "distributed_bed_test1_X.bed"
    for force_python_only in [False, True]:
        # The context manager closes the file even when an assertion fails.
        with open_bed(bed_path, count_A1=False) as bed:
            val = bed.read(order="F", force_python_only=force_python_only)
            assert val.dtype == np.float32
            ref_val = reference_val(shared_datadir)
            # Map x -> 2 - x so the reference matches the count_A1=False read.
            ref_val = ref_val * -1 + 2
            assert np.allclose(ref_val,
                               val,
                               rtol=1e-05,
                               atol=1e-05,
                               equal_nan=True)

            # NOTE(review): this read always uses force_python_only=False
            # rather than the loop variable -- confirm that is intended.
            val = bed.read(order="F", dtype='int8', force_python_only=False)
            assert val.dtype == np.int8
            # NaN != NaN, so this selects the missing entries; int8 reads
            # are compared against -127 in those positions.
            ref_val[ref_val != ref_val] = -127
            ref_val = ref_val.astype("int8")
            assert np.all(ref_val == val)

        with open_bed(bed_path) as bed:
            val = bed.read(order="F",
                           dtype="float64",
                           force_python_only=force_python_only)
            ref_val = reference_val(shared_datadir)
            assert np.allclose(ref_val,
                               val,
                               rtol=1e-05,
                               atol=1e-05,
                               equal_nan=True)
Esempio n. 2
0
def test_bed_int8(tmp_path, shared_datadir):
    """Read the reference file as int8 and round-trip it through a write.

    Each combination of ``force_python_only`` and memory order is checked
    for dtype, contiguity and content. The data is then written out and
    read back with both ``count_A1`` settings.
    """
    source_path = shared_datadir / "distributed_bed_test1_X.bed"
    with open_bed(source_path) as bed:
        for python_only in [False, True]:
            for layout in ["F", "C"]:
                data = bed.read(dtype="int8",
                                force_python_only=python_only,
                                order=layout)
                assert data.dtype == np.int8

                # The array must be contiguous in the layout that was requested.
                if layout == "C":
                    contiguity_flag = "C_CONTIGUOUS"
                else:
                    contiguity_flag = "F_CONTIGUOUS"
                assert data.flags[contiguity_flag]

                # Missing values (NaN in the reference) are -127 in int8 reads.
                expected = reference_val(shared_datadir)
                expected[expected != expected] = -127
                expected = expected.astype("int8")
                assert np.array_equal(expected, data)

                out_path = str(tmp_path / "int8.bed")
                for count_A1 in [False, True]:
                    open_bed.write(out_path,
                                   expected,
                                   count_A1=count_A1,
                                   force_python_only=python_only)
                    with open_bed(out_path, count_A1=count_A1) as reread:
                        round_trip = reread.read(
                            dtype="int8", force_python_only=python_only)
                        assert np.array_equal(round_trip, expected)
Esempio n. 3
0
def test_bad_dtype_or_order(shared_datadir):
    """Reading with an unsupported dtype or order must raise ValueError."""
    bed_path = shared_datadir / "distributed_bed_test1_X.bed"
    # Open through a context manager so the file is closed after the
    # rejected read instead of being left to the garbage collector.
    with open_bed(bed_path) as bed:
        with pytest.raises(ValueError):
            bed.read(dtype=np.int32)
    with open_bed(bed_path) as bed:
        with pytest.raises(ValueError):
            bed.read(order="X")
Esempio n. 4
0
def test_iid_sid_count(shared_datadir):
    """The shape must be the same whether or not the counts are supplied.

    The reference shape is taken from a plain open; the file is then
    reopened with ``iid_count`` only, ``sid_count`` only, and both.
    """
    bed_path = shared_datadir / "plink_sim_10s_100v_10pmiss.bed"
    # Every open goes through a context manager so no handle is left open.
    with open_bed(bed_path) as bed:
        iid_count_ref, sid_count_ref = bed.shape
    expected_shape = (iid_count_ref, sid_count_ref)

    count_overrides = (
        {"iid_count": iid_count_ref},
        {"sid_count": sid_count_ref},
        {"iid_count": iid_count_ref, "sid_count": sid_count_ref},
    )
    for counts in count_overrides:
        with open_bed(bed_path, **counts) as bed:
            assert expected_shape == bed.shape
Esempio n. 5
0
def test_write1_bed_f64cpp(tmp_path, shared_datadir):
    """Write the first 0, 1 and 5 rows as float64 and read them back.

    Each row count is exercised with both ``force_python_only`` settings
    for the initial read; the written file must reproduce the values.
    """
    with open_bed(shared_datadir / "distributed_bed_test1_X.bed") as bed:
        for iid_index in [0, 1, 5]:
            for force_python_only in [False, True]:
                val = bed.read(
                    np.s_[0:iid_index, :],
                    order="F",
                    dtype=np.float64,
                    force_python_only=force_python_only,
                )
                assert val.shape == (iid_index, 100)
                output = str(tmp_path / f"toydata.F64cpp.{iid_index}")
                open_bed.write(output, val, count_A1=False)
                # Close the reader before the next iteration rewrites the
                # same output path (the name does not include
                # force_python_only).
                with open_bed(output, count_A1=False) as bed2:
                    val2 = bed2.read(dtype="float64")
                assert np.allclose(val, val2, equal_nan=True)
Esempio n. 6
0
def test_write12(tmp_path):
    """Round-trip small random matrices, including empty ones, through write/read.

    Every combination of 0/5/2/1 rows and 4/2/1/0 columns is written twice:
    once with chromosome/bp_position/cm_position metadata and once without.
    Each file is read back whole and through a strided subset, and the
    values and metadata are compared with what was written.
    """
    logging.info("starting 'test_writes'")
    np.random.seed(0)
    output_template = str(tmp_path / "writes.{0}.bed")
    i = 0
    for row_count in [0, 5, 2, 1]:
        for col_count in [4, 2, 1, 0]:
            val = np.random.randint(0, 4, size=(row_count, col_count)) * 1.0
            # Draws of 3 become missing values. ``np.nan`` is used because
            # the ``np.NaN`` alias no longer exists in NumPy 2.x.
            val[val == 3] = np.nan
            row0 = ["0", "1", "2", "3", "4"][:row_count]
            row1 = ["0", "1", "2", "3", "4"][:row_count]
            col = ["s0", "s1", "s2", "s3", "s4"][:col_count]
            for with_col_props in [True, False]:
                metadata = {"fid": row0, "iid": row1, "sid": col}
                if with_col_props:
                    col_prop012 = list(range(5))[:col_count]
                    metadata["chromosome"] = col_prop012
                    metadata["bp_position"] = col_prop012
                    metadata["cm_position"] = col_prop012
                else:
                    col_prop012 = None

                filename = output_template.format(i)
                logging.info(filename)
                i += 1
                # TODO(cmk): is it weird to "open_bed.write"?
                open_bed.write(filename, val, metadata=metadata)
                for subsetter in [None, np.s_[::2, ::3]]:
                    with open_bed(filename) as bed:
                        # TODO(cmk): should float32 be the default so that
                        # NaN is better?
                        val2 = bed.read(index=subsetter,
                                        order="C",
                                        dtype="float32")
                        if subsetter is None:
                            expected = val
                        else:
                            expected = val[subsetter[0], :][:, subsetter[1]]
                        assert np.allclose(val2, expected, equal_nan=True)
                        assert np.array_equal(bed.fid,
                                              np.array(row0, dtype="str"))
                        assert np.array_equal(bed.iid,
                                              np.array(row1, dtype="str"))
                        assert np.array_equal(bed.sid,
                                              np.array(col, dtype="str"))
                        if col_prop012 is not None:
                            assert np.array_equal(
                                bed.chromosome,
                                np.array(col_prop012, dtype="str"))
                            assert np.array_equal(bed.bp_position,
                                                  np.array(col_prop012))
                            assert np.array_equal(bed.cm_position,
                                                  np.array(col_prop012))
                # Best-effort cleanup, done only after BOTH subset reads.
                # Removing the file inside the subsetter loop would delete
                # it before the second pass reopens it.
                try:
                    os.remove(filename)
                except OSError:
                    pass
    logging.info("done with 'test_writes'")
Esempio n. 7
0
def test_zero_files(tmp_path):
    """Write and re-read matrices whose row and/or column count may be zero.

    For each writer implementation, shape and dtype the matrix is written,
    read back and compared. Its metadata is then edited, the file is
    rewritten, and both values and metadata are checked again.
    """
    filename = str(tmp_path / "zero_files.bed")
    for python_only in [False, True]:
        for n_iid in [3, 0]:
            for n_sid in [5, 0]:
                for dtype in [np.int8, np.float32, np.float64]:
                    original = np.zeros((n_iid, n_sid), dtype=dtype)
                    if n_iid * n_sid > 0:
                        original[0, 0] = 2
                        # One missing value: -127 for int8, NaN for floats.
                        if np.dtype(dtype) == np.int8:
                            missing = -127
                        else:
                            missing = np.nan
                        original[0, 1] = missing

                    # First write / read cycle.
                    open_bed.write(filename,
                                   original,
                                   force_python_only=python_only)
                    with open_bed(filename) as first_pass:
                        reread = first_pass.read(dtype=dtype)
                        assert np.allclose(original, reread, equal_nan=True)
                        edited_metadata = first_pass.metadata
                        for prop in edited_metadata.values():
                            assert len(prop) in {n_iid, n_sid}

                    # Rename the first individual / variant when there is one,
                    # then write again with the edited metadata.
                    if n_iid > 0:
                        edited_metadata["iid"][0] = "iidx"
                    if n_sid > 0:
                        edited_metadata["sid"][0] = "sidx"
                    open_bed.write(
                        filename,
                        reread,
                        metadata=edited_metadata,
                        force_python_only=python_only,
                    )

                    # Second read: values and every metadata field must match.
                    with open_bed(filename) as second_pass:
                        final = second_pass.read(dtype=dtype)
                        assert np.allclose(original, final, equal_nan=True)
                        final_metadata = second_pass.metadata
                        for key, written_values in edited_metadata.items():
                            assert np.array_equal(written_values,
                                                  final_metadata[key])
Esempio n. 8
0
def test_write1_x_x_cpp(tmp_path, shared_datadir):
    """Round-trip float reads for every order/dtype/count_A1 combination.

    The last individual's first value is set to NaN before writing so the
    round trip also covers a missing value.
    """
    for count_A1 in [False, True]:
        with open_bed(shared_datadir / "distributed_bed_test1_X.bed",
                      count_A1=count_A1) as bed:
            for order in ["C", "F", "A"]:
                for dtype in [np.float32, np.float64]:
                    val = bed.read(order=order, dtype=dtype)
                    metadata = bed.metadata
                    val[-1, 0] = float("NAN")
                    bits = "32" if dtype == np.float32 else "64"
                    output = str(tmp_path / f"toydata.{order}{bits}.cpp")
                    open_bed.write(output,
                                   val,
                                   metadata=metadata,
                                   count_A1=count_A1)
                    # Close the reader explicitly rather than leaving the
                    # handle to the garbage collector.
                    with open_bed(output, count_A1=count_A1) as bed2:
                        val2 = bed2.read(dtype=dtype)
                    assert np.allclose(val, val2, equal_nan=True)
Esempio n. 9
0
def test_write1(tmp_path, shared_datadir):
    """Write a file with explicit metadata, read it back, and reject bad values.

    The first part round-trips values and metadata through
    ``open_bed.write``. The second part checks that a non-integer float
    (0.5) and an out-of-range int8 (-1) are rejected with ValueError by
    both writer implementations.
    """
    in_file = shared_datadir / "plink_sim_10s_100v_10pmiss.bed"
    out_file = tmp_path / "out.bed"
    with open_bed(in_file) as bed:
        val0 = bed.read()
        metadata0 = {
            "fid": bed.fid,
            "iid": bed.iid,
            "sid": bed.sid,
            "chromosome": bed.chromosome,
            "cm_position": bed.cm_position,
            "bp_position": bed.bp_position,
        }
        open_bed.write(out_file, val0, metadata=metadata0)
        with open_bed(out_file) as bed1:
            assert np.allclose(val0, bed1.read(), equal_nan=True)
            # Compare the re-read file (bed1) with what was written.
            # Checking ``bed`` here would compare the source with metadata
            # taken from itself and could never fail.
            assert np.array_equal(bed1.fid, metadata0["fid"])
            assert np.array_equal(bed1.iid, metadata0["iid"])
            assert np.array_equal(bed1.sid, metadata0["sid"])
            assert np.array_equal(bed1.chromosome, metadata0["chromosome"])
            assert np.allclose(bed1.cm_position, metadata0["cm_position"])
            assert np.allclose(bed1.bp_position, metadata0["bp_position"])

    val_float = val0.astype("float")
    val_float[0, 0] = 0.5

    for force_python_only in [False, True]:
        with pytest.raises(ValueError):
            open_bed.write(
                out_file,
                val_float,
                metadata=metadata0,
                force_python_only=force_python_only,
            )
    val_int8 = val0.astype("int8")
    val_int8[0, 0] = -1
    for force_python_only in [False, True]:
        with pytest.raises(ValueError):
            open_bed.write(
                out_file,
                val_int8,
                metadata=metadata0,
                force_python_only=force_python_only,
            )
Esempio n. 10
0
def test_coverage2(shared_datadir):
    """Exercise assorted error paths of ``open_bed`` and ``open_bed.write``.

    Checks that ``metadata={"iid": None}`` still yields individual ids,
    that inconsistent metadata lengths raise ValueError, and that writing
    a non-contiguous array or a string array raises ValueError.
    """
    with open_bed(shared_datadir / "plink_sim_10s_100v_10pmiss.bed",
                  metadata={"iid": None}) as bed:
        assert len(bed.iid) > 1
    with pytest.raises(ValueError):
        open_bed(
            shared_datadir / "plink_sim_10s_100v_10pmiss.bed",
            metadata={
                "iid": [1, 2, 3],
                "mother": [1, 2]
            },
        )
    # Taking every other row gives an array that is neither C- nor
    # F-contiguous.
    val = np.zeros((3, 5))[::2]
    assert not val.flags["C_CONTIGUOUS"] and not val.flags["F_CONTIGUOUS"]
    with pytest.raises(ValueError):
        open_bed.write("ignore", val)
    # ``np.str`` was removed in NumPy 1.24; ``np.str_`` is the same dtype.
    val = np.zeros((3, 5), dtype=np.str_)
    with pytest.raises(ValueError):
        open_bed.write("ignore", val)
Esempio n. 11
0
def test_threads(shared_datadir):
    """int8 reads must match the reference for each ``num_threads`` setting."""
    float_reference = reference_val(shared_datadir)
    # NaN is the only value not equal to itself, so this marks the missing
    # entries, which are -127 in the int8 reference.
    missing_mask = float_reference != float_reference
    int8_reference = float_reference.astype("int8")
    int8_reference[missing_mask] = -127

    bed_path = shared_datadir / "distributed_bed_test1_X.bed"
    for thread_count in [1, 4]:
        with open_bed(bed_path, num_threads=thread_count) as bed:
            observed = bed.read(dtype='int8')
            assert np.allclose(int8_reference, observed, equal_nan=True)
Esempio n. 12
0
def test_read1(shared_datadir):
    """Spot-check counts, ids, shape and values of the 10x100 sample file."""
    bed_path = shared_datadir / "plink_sim_10s_100v_10pmiss.bed"
    with open_bed(bed_path) as bed:
        assert bed.iid_count == 10
        last_fid = bed.fid[-1]
        assert last_fid == "0"
        last_iid = bed.iid[-1]
        assert last_iid == "9"
        assert bed.shape == (10, 100)

        genotypes = bed.read(dtype='int8')
        # A mean over data where -127 marks missing values is not
        # meaningful; it only serves as a cheap fingerprint of the content.
        fingerprint = genotypes.mean()
        assert fingerprint == -13.142

        last_chromosome = bed.chromosome[-1]
        assert last_chromosome == "1"
        last_bp_position = bed.bp_position[-1]
        assert last_bp_position == 100
Esempio n. 13
0
def test_index(shared_datadir):
    """Check every supported index form of ``read`` against NumPy indexing.

    Covers no index, integers, ``(iid, sid)`` tuples, integer lists,
    boolean lists and slices, alone and in combination.
    """
    reference = reference_val(shared_datadir)

    with open_bed(shared_datadir / "distributed_bed_test1_X.bed") as bed:

        def check(index, expected):
            # Read with the given index and compare with the reference
            # subset; NaN entries count as equal.
            assert np.allclose(expected, bed.read(index), equal_nan=True)

        # No index: the whole matrix.
        assert np.allclose(reference, bed.read(), equal_nan=True)

        # A single integer selects one variant (column).
        check(2, reference[:, [2]])
        # The original spelled this second read as (2), which is the plain
        # integer 2; the repeated read is kept.
        check(2, reference[:, [2]])
        check((None, 2), reference[:, [2]])
        check((1, 2), reference[[1], [2]])

        # Integer lists, including negative positions.
        check([2, -2], reference[:, [2, -2]])
        check(([1, -1], [2, -2]), reference[[1, -1], :][:, [2, -2]])

        # Boolean lists, one entry per individual / variant.
        iid_mask = ([False, False, True] * bed.iid_count)[:bed.iid_count]
        sid_mask = ([True, False, True] * bed.sid_count)[:bed.sid_count]
        check(sid_mask, reference[:, sid_mask])
        check((iid_mask, sid_mask), reference[iid_mask, :][:, sid_mask])
        check((1, sid_mask), reference[[1], :][:, sid_mask])

        # Slices.
        strides = np.s_[::2, ::3]
        check(strides[1], reference[:, strides[1]])
        check(strides, reference[strides])
        check((1, strides[1]), reference[[1], strides[1]])
Esempio n. 14
0
 def __init__(self, path, shape, dtype=np.int8, count_A1=True):
     """Wrap an ``open_bed`` reader and expose array-like attributes.

     ``shape`` is ``(variants, samples)``. The exposed ``shape`` attribute
     appends a trailing axis of length 2, and ``ndim`` is fixed at 3.
     """
     # sid = SNP (variant) id, iid = individual (sample) id
     variant_count, sample_count = shape
     # Both counts are handed to open_bed up front.
     # NOTE(review): presumably this keeps open_bed from loading the
     # bim/fam files into memory just to learn its shape -- confirm.
     reader_options = {
         "count_A1": count_A1,
         "iid_count": sample_count,
         "sid_count": variant_count,
     }
     self.open_bed = open_bed(path, **reader_options)
     self.ndim = 3
     self.dtype = dtype
     self.shape = (variant_count, sample_count, 2)
Esempio n. 15
0
def test_respect_read_inputs(shared_datadir):
    """``read`` must honour the requested order and dtype for every reader.

    All combinations of order (F, C, A), dtype (int8, float32, float64) and
    ``force_python_only`` are read and compared with the reference values.
    """
    float_reference = reference_val(shared_datadir)

    # int8 reference: same values, with NaN (missing) replaced by -127.
    int8_reference = float_reference.astype("int8")
    int8_reference[float_reference != float_reference] = -127

    with open_bed(shared_datadir / "distributed_bed_test1_X.bed") as bed:
        for order in ["F", "C", "A"]:
            for dtype in [np.int8, np.float32, np.float64]:
                for python_only in [True, False]:
                    data = bed.read(order=order,
                                    dtype=dtype,
                                    force_python_only=python_only)
                    assert data.dtype == dtype

                    # Order "A" accepts any layout; "C" and "F" must be
                    # contiguous in that layout.
                    if order == "C":
                        assert data.flags["C_CONTIGUOUS"]
                    elif order == "F":
                        assert data.flags["F_CONTIGUOUS"]

                    if dtype == np.int8:
                        expected = int8_reference
                    else:
                        expected = float_reference
                    assert np.allclose(expected, data, equal_nan=True)
Esempio n. 16
0
def test_overrides(shared_datadir):
    """Check metadata read from disk and metadata supplied by the caller.

    The first part compares every metadata property with values stored in
    ``distributed_bed_test1_X.metadata.npz``. The second part passes
    ``metadata=`` overrides and checks which are accepted, how they are
    converted, and which raise.
    """
    bed_path = shared_datadir / "distributed_bed_test1_X.bed"

    with open_bed(bed_path) as bed:
        observed = {
            "fid": bed.fid,
            "iid": bed.iid,
            "father": bed.father,
            "mother": bed.mother,
            "sex": bed.sex,
            "pheno": bed.pheno,
            "chromosome": bed.chromosome,
            "sid": bed.sid,
            "cm_position": bed.cm_position,
            "bp_position": bed.bp_position,
            "allele_1": bed.allele_1,
            "allele_2": bed.allele_2,
        }
    # To lock in new expected results, save the properties with:
    #   np.savez(shared_datadir / "distributed_bed_test1_X.metadata.npz", **observed)
    property_dict = np.load(shared_datadir /
                            "distributed_bed_test1_X.metadata.npz")
    for name, value in observed.items():
        assert np.array_equal(property_dict[name], value)

    sid_count = len(observed["sid"])
    sex_count = len(observed["sex"])

    # An unknown metadata key is rejected.
    with pytest.raises(KeyError):
        open_bed(bed_path, metadata={"unknown": [3, 4, 4]})

    # None means "read this property from the file".
    with open_bed(bed_path, metadata={"iid": None}) as bed1:
        assert np.array_equal(bed1.iid, property_dict["iid"])

    # An empty list is kept as an empty string array; asking for another
    # per-individual property then fails.
    with open_bed(bed_path, metadata={"iid": []}) as bed1:
        assert np.issubdtype(bed1.iid.dtype, np.str_)
        assert len(bed1.iid) == 0
        with pytest.raises(ValueError):
            bed1.father

    # Integer sids given as a list are converted to strings.
    with open_bed(
            bed_path,
            metadata={"sid": list(range(sid_count))},
    ) as bed1:
        assert np.issubdtype(bed1.sid.dtype, np.str_)
        assert bed1.sid[0] == "0"

    # Sex must be coded as a number
    with pytest.raises(ValueError):
        open_bed(
            bed_path,
            metadata={"sex": ["F"] * sex_count},
        )

    # Integer sids given as an array are converted to strings as well.
    with open_bed(
            bed_path,
            metadata={"sid": np.array(list(range(sid_count)))},
    ) as bed1:
        assert np.issubdtype(bed1.sid.dtype, np.str_)
        assert bed1.sid[0] == "0"

    # A two-dimensional sid array is rejected.
    with pytest.raises(ValueError):
        open_bed(
            bed_path,
            metadata={"sid": np.array([(i, i) for i in range(sid_count)])},
        )

    # With this three-element sid override, reading another per-variant
    # property raises.
    with open_bed(bed_path, metadata={"sid": [1, 2, 3]}) as bed1:
        with pytest.raises(ValueError):
            bed1.chromosome
Esempio n. 17
0
def test_str(shared_datadir):
    """The string form of an open file mentions ``open_bed(``."""
    bed_path = shared_datadir / "distributed_bed_test1_X.bed"
    with open_bed(bed_path) as bed:
        description = str(bed)
        assert "open_bed(" in description
Esempio n. 18
0
def test_bad_bed(shared_datadir):
    """A malformed file is rejected unless the format check is skipped."""
    bad_path = shared_datadir / "badfile.bed"
    with pytest.raises(ValueError):
        open_bed(bad_path)
    # With the format check disabled the same file opens without raising.
    open_bed(bad_path, skip_format_check=True)
Esempio n. 19
0
def test_properties(shared_datadir):
    """Open the file under many metadata/count settings and check the results.

    ``setting_generator`` yields combinations of the options in
    ``seq_dict``; the first ``test_count`` of them are exercised. Each
    combination optionally reads iid/sid/chromosome before and/or after
    reading the genotype values, and everything must match the plain open.
    """
    file = shared_datadir / "plink_sim_10s_100v_10pmiss.bed"
    with open_bed(file) as bed:
        iid_list = bed.iid.tolist()
        sid_list = bed.sid.tolist()
        chromosome_list = bed.chromosome.tolist()

    # Upper bound on how many generated combinations are run.
    test_count = 75

    seq_dict = {
        "iid": [None, iid_list, np.array(iid_list)],
        "iid_count": [None, len(iid_list)],
        "iid_before_read": [False, True],
        "iid_after_read": [False, True],
        "sid": [None, sid_list, np.array(sid_list)],
        "sid_count": [None, len(sid_list)],
        "sid_before_read": [False, True],
        "sid_after_read": [False, True],
        "chromosome": [
            None,
            chromosome_list,
            np.array(chromosome_list),
        ],
        "chromosome_before_read": [False, True],
        "chromosome_after_read": [False, True],
    }

    for test_index, settings in enumerate(setting_generator(seq_dict)):
        if test_index >= test_count:
            break
        with open_bed(
                file,
                iid_count=settings["iid_count"],
                sid_count=settings["sid_count"],
                metadata={
                    "iid": settings["iid"],
                    "sid": settings["sid"],
                    "chromosome": settings["chromosome"],
                },
        ) as bed:
            # Log the running index; logging test_count would print the
            # same constant for every combination.
            logging.info(f"Test {test_index}")
            if settings["iid_before_read"]:
                assert np.array_equal(bed.iid, iid_list)
            if settings["sid_before_read"]:
                assert np.array_equal(bed.sid, sid_list)
            if settings["chromosome_before_read"]:
                assert np.array_equal(
                    bed.chromosome,
                    chromosome_list,
                )
            val = bed.read()
            assert val.shape == (
                len(iid_list),
                len(sid_list),
            )
            if settings["iid_after_read"]:
                assert np.array_equal(bed.iid, iid_list)
            if settings["sid_after_read"]:
                assert np.array_equal(bed.sid, sid_list)
            if settings["chromosome_after_read"]:
                assert np.array_equal(
                    bed.chromosome,
                    chromosome_list,
                )
Esempio n. 20
0
def test_shape(shared_datadir):
    """The sample file reports 10 individuals by 100 variants."""
    bed_path = shared_datadir / "plink_sim_10s_100v_10pmiss.bed"
    with open_bed(bed_path) as bed:
        observed_shape = bed.shape
        assert observed_shape == (10, 100)
Esempio n. 21
0
                filepointer.write(
                    sep.join(str(seq[index]) for seq in fam_bim_list) + "\n")


# Manual developer scratch area: runs only when this file is executed as a
# script. The ``if True`` / ``if False`` switches below select which
# experiment runs; both use machine-specific paths.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    import os

    # Experiment 1 (enabled): read a large file in column batches.
    if True:
        import numpy as np
        from sgkit_plink._open_bed import open_bed

        # Can get file from https://www.dropbox.com/sh/xluk9opjiaobteg/AABgEggLk0ZoO0KQq0I4CaTJa?dl=0
        bigfile = r"M:\deldir\genbgen\2\merged_487400x220000.1.bed"
        # bigfile = '/mnt/m/deldir/genbgen/2/merged_487400x220000.1.bed'
        with open_bed(bigfile, num_threads=20) as bed:
            # Ten consecutive batches of 22,000 columns each, restricted
            # to the first 10,000 rows.
            sid_batch = 22 * 1000
            for sid_start in range(0, 10 * sid_batch, sid_batch):
                slicer = np.s_[:10000, sid_start:sid_start + sid_batch]
                print(slicer)
                val = bed.read(slicer)
            # Shape of the last batch read.
            print(val.shape)

    # Experiment 2 (disabled): print ids, shape and values of the small
    # sample file.
    if False:
        file = r"D:\OneDrive\programs\sgkit-plink\sgkit_plink\tests\data/plink_sim_10s_100v_10pmiss.bed"
        with open_bed(file) as bed:
            print(bed.iid)
            print(bed.shape)
            val = bed.read()
            print(val)