Example #1
0
    def test_respect_read_inputs(self):
        """Check that ``read`` honours ``order`` and ``dtype`` for a range of readers.

        Every reader in the list is read with each combination of ``order``
        ('F', 'C', 'A'), ``dtype`` (float32, float64), ``force_python_only``
        and ``view_ok``.  The returned array must have the requested dtype and
        memory layout, and when ``view_ok`` is False an in-memory reader must
        not hand back its own ``val`` array.

        The working directory is switched to this file's directory so the
        relative example paths resolve; it is restored in a ``finally`` so a
        failing assertion cannot leave later tests in the wrong directory.
        """
        from itertools import product

        from pysnptools.distreader import Bgen,DistGen,DistHdf5,DistMemMap,DistNpz
        from pysnptools.snpreader import Bed

        previous_wd = os.getcwd()
        os.chdir(os.path.dirname(os.path.realpath(__file__)))
        try:
            readers = [
                _DistMergeSIDs([Bgen('../examples/example.bgen')[:,:5].read(),Bgen('../examples/example.bgen')[:,5:].read()]),
                Bed('../examples/toydata.5chrom.bed',count_A1=True).as_dist(block_size=2000),
                Bed('../examples/toydata.5chrom.bed',count_A1=True).as_dist(),
                Bgen('../examples/example.bgen').read(),
                Bgen('../examples/bits1.bgen'),
                DistGen(seed=0,iid_count=500,sid_count=50),
                DistGen(seed=0,iid_count=500,sid_count=50)[::2,::2],
                DistHdf5('../examples/toydata.snpmajor.dist.hdf5'),
                DistMemMap('../examples/tiny.dist.memmap'),
                DistNpz('../examples/toydata10.dist.npz'),
            ]
            # product() iterates in the same order as the equivalent nested loops.
            combinations = list(product(['F','C','A'],
                                        [np.float32,np.float64],
                                        [True,False],
                                        [True,False]))
            for distreader in readers:
                logging.info(str(distreader))
                for order, dtype, force_python_only, view_ok in combinations:
                    val = distreader.read(order=order,dtype=dtype,force_python_only=force_python_only,view_ok=view_ok).val
                    has_right_order = (order=="A"
                                       or (order=="C" and val.flags["C_CONTIGUOUS"])
                                       or (order=="F" and val.flags["F_CONTIGUOUS"]))
                    in_memory = hasattr(distreader,'val')
                    if in_memory and not view_ok:
                        # A copy was requested, so the reader's own array must not be returned.
                        assert distreader.val is not val
                    if in_memory and view_ok and distreader.val is not val:
                        source = distreader.val
                        layout_matches = (order == 'A'
                                          or (order == 'F' and source.flags['F_CONTIGUOUS'])
                                          or (order == 'C' and source.flags['C_CONTIGUOUS']))
                        if layout_matches and source.dtype == dtype:
                            # Not an error: just note the missed opportunity to avoid a copy.
                            logging.info("{0} could have read a view, but didn't".format(distreader))
                    assert val.dtype == dtype and has_right_order
        finally:
            os.chdir(previous_wd)
Example #2
0
    def test_read_write_round_trip(self):
        """Write data to BGEN at every bit depth from 1 to 32 and check it reads back.

        For two inputs (generated data and one SNP of the example file) the
        data is written with ``Bgen.write``, read back both from the returned
        reader and from a fresh ``Bgen`` on the same file, and compared with
        the input using a tolerance of ``1 / 2**bits``.

        Requires the ``QCTOOLPATH`` environment variable.  The working
        directory is restored in a ``finally`` so a failure does not leak the
        changed directory into other tests.
        """
        from pysnptools.distreader import DistGen

        old_dir = os.getcwd()
        os.chdir(os.path.dirname(os.path.realpath(__file__)))
        try:
            assert (
                "QCTOOLPATH" in os.environ
            ), "To run test_read_write_round_trip, QCTOOLPATH environment variable must be set. (On Windows, install QcTools in 'Ubuntu on Windows' and set to 'ubuntu run <qctoolLinuxPath>')."

            exampledata = Bgen("../examples/example.bgen")[:, 10].read()
            distgen0data = DistGen(seed=332, iid_count=50, sid_count=5).read()

            for i, distdata0 in enumerate([distgen0data, exampledata]):
                for bits in range(1, 33):
                    logging.info("input#={0},bits={1}".format(i, bits))
                    file1 = "temp/roundtrip1-{0}-{1}.bgen".format(i, bits)
                    distdata1 = Bgen.write(
                        file1,
                        distdata0,
                        bits=bits,
                        compression="zlib",
                        cleanup_temp_files=False,
                    ).read()
                    assert distdata1.iid[0, 0] == "0"
                    distdata2 = Bgen(file1).read()
                    assert distdata1.allclose(distdata2, equal_nan=True)
                    atol = 1.0 / (2**bits)
                    if bits == 1:
                        atol *= 1.4  # because values must add up to 1, there is more rounding error
                    TestBgen.assert_approx_equal(distdata0, distdata1, atol=atol)
        finally:
            os.chdir(old_dir)
Example #3
0
    def test_coverage(self):
        """Exercise less-travelled ``Bgen`` code paths.

        Covers: a custom ``iid_function`` with ``fresh_properties=False``,
        re-opening a file after its metadata file has been moved away,
        writing the same output file twice from inside the output directory,
        and writing a larger (10010 iid) file with a custom
        ``sample_function``.
        """
        from pysnptools.distreader import DistGen

        with example_filepath("example.32bits.bgen") as filepath:
            bgen = Bgen(filepath,
                        fresh_properties=False,
                        iid_function=lambda sam: ("X", sam))
            assert bgen.iid[0, 0] == "X"
            metadata_filepath = bgen._open_bgen._metadata2_path
            metadata2_temp = metadata_filepath.parent / (
                metadata_filepath.name + ".temp")
            # Drop the reader before moving its metadata file
            # (presumably so the file is no longer held open -- confirm).
            del bgen
            if metadata2_temp.exists():
                metadata2_temp.unlink()
            os.rename(metadata_filepath, metadata2_temp)
            bgen = Bgen(filepath)
            assert bgen.iid[0, 0] == "0"
            bgen[0, 0].read(order='A')

            os.makedirs("temp", exist_ok=True)
            start_dir = os.getcwd()
            os.chdir("temp")
            try:
                # Written twice on purpose: the second call overwrites the first.
                file1x = "coverage.bgen"
                Bgen.write(file1x, bgen[:100, :100])
                Bgen.write(file1x, bgen[:100, :100])
            finally:
                # Return to the starting directory even if a write fails.
                os.chdir(start_dir)

        distgen0data = DistGen(seed=332, iid_count=10010, sid_count=5).read()
        file1 = "temp/roundtrip1-big.bgen"
        bed3 = Bgen.write(file1,
                          distgen0data,
                          bits=8,
                          compression="zlib",
                          cleanup_temp_files=False,
                          sample_function=lambda fam, ind: f'{fam},{ind}')
        # NOTE(review): this is an assignment, not an assertion -- it checks nothing.
        # It may have been meant as ``assert bed3.iid[0, 0] == '0'``; confirm before changing.
        bed3.iid[0, 0] = '0'
Example #4
0
 def test_bgen_reader_file_notfound(self):
     """Accessing ``iid`` on a reader for a non-existent file must raise."""
     reader = Bgen("/1/2/3/example.32bits.bgen")
     raised = True
     try:
         reader.iid  # expect error
     except Exception:
         pass
     else:
         # No exception: the property access unexpectedly succeeded.
         raised = False
     assert raised
Example #5
0
    def test_other(self):
        """Check that iids come from an explicitly supplied ``.sample`` file.

        The working directory is switched to this file's directory for the
        relative example paths and restored in a ``finally`` so a failed
        assertion cannot leave it changed.
        """
        old_dir = os.getcwd()
        os.chdir(os.path.dirname(os.path.realpath(__file__)))
        try:
            logging.info("in TestBgen test_other")
            bgen = Bgen("../examples/example.bgen",
                        sample="../examples/other.sample")
            assert np.all(bgen.iid[0] == ("0", "other_001"))
        finally:
            os.chdir(old_dir)
Example #6
0
    def test_zero(self):
        """Check the shape of reads that select zero sids, zero iids, or both.

        The working directory is restored in a ``finally`` so a failed
        assertion cannot leave it changed.
        """
        old_dir = os.getcwd()
        os.chdir(os.path.dirname(os.path.realpath(__file__)))
        try:
            bgen = Bgen("../examples/example.bgen")
            assert bgen[:, []].read().val.shape == (500, 0, 3)
            assert bgen[[], :].read().val.shape == (0, 199, 3)
            assert bgen[[], []].read().val.shape == (0, 0, 3)
        finally:
            os.chdir(old_dir)
Example #7
0
    def test_other(self):
        """Check that iids come from an explicitly supplied ``.sample`` file.

        The working directory is switched to this file's directory for the
        relative example paths and restored in a ``finally`` so a failed
        assertion cannot leave it changed.
        """
        old_dir = os.getcwd()
        os.chdir(os.path.dirname(os.path.realpath(__file__)))
        try:
            logging.info("in TestBgen test_other")
            bgen = Bgen('../examples/example.bgen',
                        sample='../examples/other.sample')
            assert np.all(bgen.iid[0] == ('0', 'other_001'))
        finally:
            os.chdir(old_dir)
Example #8
0
    def test1(self):
        """Smoke test: read the whole example file and then a sliced view of it.

        Nothing is asserted about the values; the test passes if both reads
        complete.  The working directory is restored in a ``finally``.
        """
        old_dir = os.getcwd()
        os.chdir(os.path.dirname(os.path.realpath(__file__)))
        try:
            logging.info("in TestBgen test1")
            bgen = Bgen('../examples/example.bgen')
            bgen.read()
            # First two iids, every third sid.
            bgen[:2, ::3].read()
        finally:
            os.chdir(old_dir)
Example #9
0
    def test2(self):
        """Smoke test: read ``bits1.bgen`` in full and then a sliced view of it.

        Nothing is asserted about the values; the test passes if both reads
        complete.  The working directory is restored in a ``finally``.
        """
        old_dir = os.getcwd()
        os.chdir(os.path.dirname(os.path.realpath(__file__)))
        try:
            logging.info("in TestBgen test2")
            bgen = Bgen("../examples/bits1.bgen")
            bgen.read()
            # First two iids, every third sid.
            bgen[:2, ::3].read()
        finally:
            os.chdir(old_dir)
Example #10
0
 def test_bgen_samples_inside_bgen(self):
     """The first four iids of the example file are ("0", "sample_001") .. ("0", "sample_004")."""
     expected = [("0", "sample_{0:03d}".format(number)) for number in range(1, 5)]
     with example_filepath("example.32bits.bgen") as filepath:
         reader = Bgen(filepath)
         first_four = reader.iid[:4]
         assert (first_four == expected).all()
 def gen_reference(self, load_path):
     """Open ``load_path`` with the reader that matches ``self.gen_type``.

     ``.bed`` files are opened with ``Bed`` (``count_A1=True``); ``.bgen``
     files are opened with ``Bgen`` when ``self._snp_tools`` is truthy and
     with ``BgenObject`` otherwise.

     :raise Exception: if ``self.gen_type`` is neither ``.bed`` nor ``.bgen``
     """
     kind = self.gen_type
     if kind == ".bed":
         return Bed(load_path, count_A1=True)
     if kind != ".bgen":
         raise Exception("Unknown load type set")
     reader_cls = Bgen if self._snp_tools else BgenObject
     return reader_cls(load_path)
Example #12
0
    def test2(self):
        """Write a ``Bgen`` reader to DistMemMap format and check the data matches.

        The working directory is switched to this file's directory for the
        relative paths and restored in a ``finally`` so a failed assertion
        cannot leave it changed.
        """
        from pysnptools.distreader import Bgen

        old_dir = os.getcwd()
        os.chdir(os.path.dirname(os.path.realpath(__file__)))
        try:
            bgen = Bgen('../examples/example.bgen')
            # NOTE(review): the extension is spelled "memamp", not "memmap"; left as-is.
            distmemmap = DistMemMap.write("tempdir/bgentomemmap.dist.memamp", bgen)
            assert DistData.allclose(bgen.read(),
                                     distmemmap.read(),
                                     equal_nan=True)
        finally:
            os.chdir(old_dir)
Example #13
0
def set_snp_ids(memory_location, snps_to_id, gen_path, write_dir, file_name):
    """
    Find the index, within a bgen file, of each snp named in a csv and write those indexes to a new csv.

    Snps from the csv that are not present in the genetic file are skipped.

    :param memory_location: Location of bgen memory file
    :type memory_location: Path | str

    :param snps_to_id: Location of the csv whose first column lists the snps to look up
    :type snps_to_id: Path | str

    :param gen_path: The path to the genetic file
    :type gen_path: Path | str

    :param write_dir: The directory to write the snp index csv file to
    :type write_dir: Path | str

    :param file_name: The name of the snp index file
    :type file_name: str

    :return: Nothing, write the indexes to a csv then stop
    :rtype: None
    """

    # Register the custom location for the bgen memory files, then open the genetic file
    custom_meta_path(validate_path(memory_location))
    gen = Bgen(str(validate_path(gen_path).absolute()))

    # Each sid is "first,second"; map the second part back to the first so the full sid can be rebuilt
    first_part_by_snp = {}
    for sid in gen.sid:
        fields = sid.split(",")
        first_part_by_snp[fields[1]] = fields[0]

    # First column of the csv holds the snp names to look up
    requested_snps = CsvObject(validate_path(snps_to_id), set_columns=True)[0]

    # Collect the index of every requested snp that can be found; a KeyError means it is absent
    snp_indexes = []
    for name in requested_snps:
        try:
            located = gen.sid_to_index([f"{first_part_by_snp[name]},{name}"]).tolist()
        except KeyError:
            continue
        snp_indexes.append(located)

    # Write the snp indexes out and report how many were found
    write_csv(write_dir, f"{file_name}", ["Snp"], snp_indexes)
    print(
        f"Constructed snp id list of length {len(snp_indexes)} for {gen_path} at {terminal_time()}"
    )
Example #14
0
    def test_read1(self):
        """Check ``iid_function`` / ``sid_function`` handling, with and without cached metadata.

        A fresh copy of the example file is made under ``temp/`` with any
        existing metadata removed.  The same sequence of opens is run twice
        (the second pass finds metadata left by the first), then the metadata
        file is deleted and the file is re-opened with different functions.

        The working directory is restored in a ``finally`` so a failed
        assertion cannot leave it changed.
        """

        # Use the bgen_sample_id for both parts of iid
        def iid_dup(bgen_sample_id):
            return (bgen_sample_id, bgen_sample_id)

        # Defined once up front: both are also needed after the loop below.
        iid_function = iid_dup
        sid_function = lambda id, rsid: "{0},{1}".format(id, rsid)

        old_dir = os.getcwd()
        os.chdir(os.path.dirname(os.path.realpath(__file__)))
        try:
            file_from = "../examples/example.bgen"
            file_to = "temp/example.bgen"
            pstutil.create_directory_if_necessary(file_to)
            if os.path.exists(file_to + ".metadata"):
                os.remove(file_to + ".metadata")
            meta = open_bgen._metadata_path_from_filename(file_to,
                                                          samples_filepath=None)
            if os.path.exists(meta):
                os.remove(meta)
            shutil.copy(file_from, file_to)

            for _ in range(2):
                bgen = Bgen(file_to)
                assert np.array_equal(bgen.iid[0], ["0", "sample_001"])
                assert bgen.sid[0] == "SNPID_2,RSID_2"

                bgen = Bgen(file_to, iid_function=iid_function, sid_function="id")
                assert np.array_equal(bgen.iid[0], ["sample_001", "sample_001"])
                assert bgen.sid[0] == "SNPID_2"

                bgen = Bgen(file_to,
                            iid_function=iid_function,
                            sid_function="rsid")
                assert np.array_equal(bgen.iid[0], ["sample_001", "sample_001"])
                assert bgen.sid[0] == "RSID_2"

                bgen = Bgen(file_to, iid_function, sid_function=sid_function)
                assert bgen.sid[0] == "SNPID_2,RSID_2"

            # Delete the metadata file (after dropping the reader) and re-open.
            metafile = bgen._open_bgen._metadata_path_from_filename(
                file_to, samples_filepath=None)
            del bgen
            os.remove(metafile)
            bgen = Bgen(file_to, iid_function, sid_function=sid_function)
            assert bgen.sid[0] == "SNPID_2,RSID_2"

            metafile = bgen._open_bgen._metadata_path_from_filename(
                file_to, samples_filepath=None)
            del bgen
            os.remove(metafile)
            bgen = Bgen(file_to, iid_function, sid_function="rsid")
            assert np.array_equal(bgen.iid[0], ["sample_001", "sample_001"])
            assert bgen.sid[0] == "RSID_2"
        finally:
            os.chdir(old_dir)
Example #15
0
    def test_bad_sum(self):
        """Check how ``Bgen.write`` treats probability triples that do not sum to 1.

        A triple containing a NaN, or an all-zero triple, must be written as
        all-NaN.  Triples that sum to more than 1, sum to less than 1, or
        contain a negative value must make the write fail.

        Requires the ``QCTOOLPATH`` environment variable.  The working
        directory is restored in a ``finally`` so a failed assertion cannot
        leave it changed.
        """
        old_dir = os.getcwd()
        os.chdir(os.path.dirname(os.path.realpath(__file__)))
        try:
            Path("temp").mkdir(parents=True, exist_ok=True)

            assert (
                "QCTOOLPATH" in os.environ
            ), "To run test_bad_sum, QCTOOLPATH environment variable must be set. (On Windows, install QcTools in 'Ubuntu on Windows' and set to 'ubuntu run <qctoolLinuxPath>')."

            distdata = Bgen("../examples/example.bgen")[:5, :5].read()

            # Just one NaN
            distdata.val[0, 0, :] = [np.nan, 0.5, 0.5]
            bgen = Bgen.write("temp/should_be_all_nan.bgen", distdata)
            assert np.isnan(bgen[0, 0].read().val).all()

            # All zeros
            distdata.val[0, 0, :] = [0, 0, 0]
            bgen = Bgen.write("temp/should_be_all_nan2.bgen", distdata)
            assert np.isnan(bgen[0, 0].read().val).all()

            # Each of these must be rejected by the writer:
            # sums to more than 1, sums to less than 1, contains a negative value.
            for bad_triple in ([1, 2, 3], [0.2, 0.2, 0.2], [-1, 1, 0]):
                distdata.val[0, 0, :] = bad_triple
                failed = False
                try:
                    Bgen.write("temp/should_fail.bgen", distdata)
                except Exception:
                    failed = True
                assert failed
        finally:
            os.chdir(old_dir)
Example #16
0
    def test_memmap(self):
        """Write one SNP of the example file to two BGEN files and check the iids of the results.

        Requires the ``QCTOOLPATH`` environment variable.  The working
        directory is restored in a ``finally`` so a failed assertion cannot
        leave it changed.
        """
        old_dir = os.getcwd()
        os.chdir(os.path.dirname(os.path.realpath(__file__)))
        try:
            assert (
                "QCTOOLPATH" in os.environ
            ), "To run test_memmap, QCTOOLPATH environment variable must be set. (On Windows, install QcTools in 'Ubuntu on Windows' and set to 'ubuntu run <qctoolLinuxPath>')."

            distgen0data = Bgen("../examples/example.bgen")[:, 10].read()
            assert distgen0data.iid[0, 0] == "0"

            file1y = "temp/roundtrip1-{0}-{1}y.bgen".format(0, 1)
            bgen = Bgen.write(file1y, distgen0data)
            assert bgen.iid is not None

            file1x = "temp/roundtrip1-{0}-{1}x.bgen".format(0, 1)
            assert Bgen.write(file1x, distgen0data).iid[0, 0] == "0"
        finally:
            os.chdir(old_dir)
Example #17
0
    def test_bad_sum(self):
        """Check how ``Bgen.write`` treats probability triples that do not sum to 1.

        A triple containing a NaN, or an all-zero triple, must be written as
        all-NaN.  Triples that sum to more than 1, sum to less than 1, or
        contain a negative value must make the write fail.

        Requires the ``QCTOOLPATH`` environment variable.  The working
        directory is restored in a ``finally`` so a failed assertion cannot
        leave it changed.
        """
        old_dir = os.getcwd()
        os.chdir(os.path.dirname(os.path.realpath(__file__)))
        try:
            assert 'QCTOOLPATH' in os.environ, "To run test_bad_sum, QCTOOLPATH environment variable must be set. (On Windows, install QcTools in 'Ubuntu on Windows' and set to 'ubuntu run <qctoolLinuxPath>')."

            distdata = Bgen('../examples/example.bgen')[:5, :5].read()

            # Just one NaN
            distdata.val[0, 0, :] = [np.nan, .5, .5]
            bgen = Bgen.write('temp/should_be_all_nan.bgen', distdata)
            assert np.isnan(bgen[0, 0].read().val).all()

            # All zeros
            distdata.val[0, 0, :] = [0, 0, 0]
            bgen = Bgen.write('temp/should_be_all_nan2.bgen', distdata)
            assert np.isnan(bgen[0, 0].read().val).all()

            # Each of these must be rejected by the writer:
            # sums to more than 1, sums to less than 1, contains a negative value.
            for bad_triple in ([1, 2, 3], [.2, .2, .2], [-1, 1, 0]):
                distdata.val[0, 0, :] = bad_triple
                failed = False
                try:
                    Bgen.write('temp/should_fail.bgen', distdata)
                except Exception:
                    # ``Exception`` rather than a bare ``except`` so that
                    # KeyboardInterrupt / SystemExit are not counted as a pass.
                    failed = True
                assert failed
        finally:
            os.chdir(old_dir)
Example #18
0
    def test_read1(self):
        """Check ``iid_function`` / ``sid_function`` handling, with and without cached metadata.

        A fresh copy of the example file is made under ``temp/`` with any
        existing metadata removed.  The same sequence of opens is run twice
        (the second pass finds metadata left by the first), then the metadata
        file is deleted and the file is re-opened with different functions.

        The working directory is restored in a ``finally`` so a failed
        assertion cannot leave it changed.
        """

        # Defined once up front: both are also needed after the loop below.
        # Use the bgen_sample_id for both parts of iid
        iid_function = lambda bgen_sample_id: (bgen_sample_id, bgen_sample_id)
        sid_function = lambda id, rsid: '{0},{1}'.format(id, rsid)

        old_dir = os.getcwd()
        os.chdir(os.path.dirname(os.path.realpath(__file__)))
        try:
            file_from = '../examples/example.bgen'
            file_to = 'temp/example.bgen'
            pstutil.create_directory_if_necessary(file_to)
            if os.path.exists(file_to + ".metadata"):
                os.remove(file_to + ".metadata")
            meta = open_bgen._metadatapath_from_filename(file_to)
            if os.path.exists(meta):
                os.remove(meta)
            shutil.copy(file_from, file_to)

            for _ in range(2):
                bgen = Bgen(file_to)
                assert np.array_equal(bgen.iid[0], ['0', 'sample_001'])
                assert bgen.sid[0] == 'SNPID_2,RSID_2'

                bgen = Bgen(file_to, iid_function=iid_function, sid_function='id')
                assert np.array_equal(bgen.iid[0], ['sample_001', 'sample_001'])
                assert bgen.sid[0] == 'SNPID_2'

                bgen = Bgen(file_to,
                            iid_function=iid_function,
                            sid_function='rsid')
                assert np.array_equal(bgen.iid[0], ['sample_001', 'sample_001'])
                assert bgen.sid[0] == 'RSID_2'

                bgen = Bgen(file_to, iid_function, sid_function=sid_function)
                assert bgen.sid[0] == 'SNPID_2,RSID_2'

            # Delete the metadata file and re-open with the same functions.
            os.remove(bgen._open_bgen._metadatapath_from_filename(file_to))
            bgen = Bgen(file_to, iid_function, sid_function=sid_function)
            assert bgen.sid[0] == 'SNPID_2,RSID_2'

            # Delete it again and re-open with a different sid_function.
            os.remove(bgen._open_bgen._metadatapath_from_filename(file_to))
            bgen = Bgen(file_to, iid_function, sid_function='rsid')
            assert np.array_equal(bgen.iid[0], ['sample_001', 'sample_001'])
            assert bgen.sid[0] == 'RSID_2'
        finally:
            os.chdir(old_dir)
Example #19
0
    def test_bgen_reader_variants_info(self):
        """Spot-check ``pos``, ``sid``, ``iid`` and a few probability values of the example file."""
        with example_filepath("example.32bits.bgen") as filepath:
            reader = Bgen(filepath, sid_function="id")

            # (sid index, expected pos column 0, expected sid, expected pos column 2)
            variant_expectations = [
                (0, 1, "SNPID_2", 2000),
                (7, 1, "SNPID_9", 9000),
                (-1, 1, "SNPID_200", 100001),
            ]
            for index, pos_col0, sid, pos_col2 in variant_expectations:
                assert reader.pos[index, 0] == pos_col0
                assert reader.sid[index] == sid
                assert reader.pos[index, 2] == pos_col2

            # (iid index, expected second iid field)
            sample_expectations = [(0, "sample_001"), (7, "sample_008"),
                                   (-1, "sample_500")]
            for index, sample in sample_expectations:
                assert (reader.iid[index] == ("0", sample)).all()

            # The first iid's value for the first sid is entirely missing.
            missing = reader[0, 0].read()
            assert np.isnan(missing.val).all()

            expected_1_0 = [[[
                0.027802362811705648, 0.00863673794284387, 0.9635608992454505
            ]]]
            np.testing.assert_array_almost_equal(reader[1, 0].read().val,
                                                 expected_1_0)

            expected_2_1 = [[[
                0.97970582847010945215516,
                0.01947019668749305418287,
                0.00082397484239749366197,
            ]]]
            np.testing.assert_array_almost_equal(reader[2, 1].read().val,
                                                 expected_2_1)
Example #20
0
if __name__ == '__main__':

    # Set the path to the example file you downloaded here, as well as the output directory. Do not remove the
    # leading 'r' prefix: it makes these raw strings, so the backslashes in the paths are read literally.
    path_to_gen_file = r"C:\Users\Samuel\Documents\Genetic_Examples\PolyTutOut\ByChromosome\EUR.ldpred_1.bgen"
    output_directory = r"I:\Work\Genetics\Residuals"

    # Load gen file
    # It will take longer the first time you use it as it has to create pysnptools metadata .mmm files, which act as a
    # faster .bim / .bgi equivalent. The finalised system will also utilise pyGenicParser, which is an extended version
    # of pyBgen written by myself that can use .bim in lieu of .mmm files when hard-drive / scratch space is at a
    # premium. It will also be able to use .bed/.bim/.fam files for plink interfacing. For reference, if required, see
    # the snippet of pyGenicPipeline (which I am the author of) at line 212, method isolate_raw_snps, of the following:
    # https://github.com/sbaker-dev/pyGenicPipeline/blob/main/pyGenicPipeline/core/Loaders/commonGenetic.py
    gen = Bgen(validate_path(path_to_gen_file))

    # Create some dummy values for the gender and district fixed effects. The iid count of the example dataset is 483,
    # so the number of districts is set lower than that so that there are several individuals within each district.
    gender = [randint(0, 1) for _ in range(gen.iid_count)]
    district = [randint(0, 50) for _ in range(gen.iid_count)]
    print(f"Total number of individuals {gen.iid_count}")

    # The output file takes the form of an M x N matrix in csv form, where the M rows represent the snps that have
    # residuals for N individuals. As a note, csv files are unlikely to be sufficient for storing this amount of data,
    # but we can use the plink / bgen IoStream byte code write/unpack logic if we genuinely need this amount of data.
    file = ResOut(validate_path(output_directory), "ResidualsOVERIDE")

    # Construct the IO stream to write out to and write the header of Snp + [IID1, IID2, ... IID(N)].
    # Bgen files store [variant id, rs_id]; we just want the rs_id, hence the [1]. See https://bit.ly/2J0C1kC
Example #21
0
 def test_bgen_samples_inside_bgen(self):
     """The haplotypes example file holds exactly the iids ("0", "sample_0") .. ("0", "sample_3")."""
     expected = [("0", "sample_{0}".format(number)) for number in range(4)]
     with example_filepath("haplotypes.bgen") as filepath:
         reader = Bgen(filepath)
         assert (reader.iid == expected).all()
Example #22
0
# Example: read genotype probabilities from a BGEN file with pysnptools.
from pysnptools.util import example_file
from pysnptools.distreader import Bgen

# Download a sample file and get its local path
bgen_file = example_file("pysnptools/examples/example.bgen")

# Create a reader for the file
bgen = Bgen(bgen_file)

# Read only the first SNP (all iids, sid index 0) and take its NumPy array
first_snp = bgen[:, 0].read()
probs0 = first_snp.val
print(probs0.shape)  # Shape of the NumPy array
assert probs0.shape == (500, 1, 3)

# Read every variant in the file
everything = bgen.read()
probs_all = everything.val
print(probs_all.shape)  # Shape of the NumPy array
assert probs_all.shape == (500, 199, 3)
Example #23
0
    def _setup_variables(self):
        """
        The order of IID in the genetic file may not equal that of the external variables; this restricts both to
        the IIDs they share and sorts them into the same order, then residualises the phenotype on the covariates.

        Side effect: "Constant" is appended to ``self.covariant``.

        :return: A tuple of (the genetic reader re-indexed to the shared, sorted IIDs; a DataFrame holding IID, the
            phenotype, the residualised phenotype and the covariates; a single-column "IID" DataFrame of the genetic
            IIDs that were kept)
        """

        # Load the variables as pandas dataframe and setup the reference genetic file for this chromosome
        df = pd.read_csv(validate_path(self.args["variables"]))
        gen = Bgen(self._select_file_on_chromosome())
        self.logger.write(f"...Loaded external variables {terminal_time()}")

        # Validate that the variables we have set in the formula exist in the DataFrame
        for cont in self.covariant:
            self._validate_variable(df, cont, "Continuous")
        assert self.args["phenotype"], "GWAS requires a phenotype"

        # Recast IID via the class's own normaliser so both sources use the same representation
        df["IID"] = [self._strip_iid(iid) for iid in df["IID"].tolist()]

        # Isolate the IID to match against the variables IID and create the reference
        genetic_iid = np.array([self._strip_iid(iid) for _, iid in gen.iid])
        genetic_position = gen.iid

        # Remove any IID that is in the external data array but not in the genetic array
        out = np.isin(df["IID"].to_numpy(), genetic_iid)
        df = df[out]

        # Remove any IID that is in the genetic array but not in the external data
        out = np.isin(genetic_iid, df["IID"].to_numpy())
        genetic_iid = genetic_iid[out]
        genetic_position = genetic_position[out]

        # Sort both arrays to be in the same order
        df = df.sort_values(by=['IID'], ascending=True)
        gen = gen[
            gen.iid_to_index(genetic_position[np.argsort(genetic_iid)]), :]

        # Load phenotypic and covariant variables as numeric.
        # self.covariant is a list, so it is concatenated (not nested) to build the list of column names;
        # nesting it would leave the covariate names unmatched and only the phenotype would be converted.
        numeric_columns = [self.phenotype] + self.covariant
        for v in df.columns:
            if v in numeric_columns:
                df[v] = df[v].apply(pd.to_numeric)

        # Create an IID array of the genetic iid
        genetic_iid = pd.DataFrame(genetic_iid)
        genetic_iid.columns = ["IID"]

        # Add a constant and the residualised phenotype to the databases
        df["Constant"] = 1
        self.covariant = self.covariant + ["Constant"]
        result = sm.OLS(df[self.phenotype], df[self.covariant],
                        missing='drop').fit()
        df = pd.concat(
            [df,
             pd.DataFrame(result.resid, columns=[f"{self.phenotype}RES"])],
            axis=1)

        # Remove non used data to save memory
        # NOTE(review): self.covariant already ends with "Constant" here, so "Constant" is selected twice and the
        # returned frame carries that column twice. Left unchanged because callers may rely on the layout -- confirm.
        return gen, df[["IID", self.phenotype, f"{self.phenotype}RES"] +
                       self.covariant + ["Constant"]], genetic_iid
Example #24
0
 def test_bgen_reader_without_metadata(self):
     """After a full read, the last iid of the example file is "sample_500"."""
     with example_filepath("example.32bits.bgen") as filepath:
         reader = Bgen(filepath)
         reader.read()
         last_sample_id = reader.iid[-1, 1]
         assert last_sample_id == "sample_500"
Example #25
0
        bgen.read(dtype='float32').val  #Read the data from disk

    if False:

        import tracemalloc
        import logging
        import time

        logging.basicConfig(level=logging.INFO)
        tracemalloc.start()

        start = time.time()

        filename = "M:/deldir/genbgen/good/merged_487400x220000.bgen"
        #filename = "M:/deldir/genbgen/good/merged_487400x1100000.bgen"
        bgen = Bgen(filename, fresh_properties=False)
        val = bgen[:, 1000000:1000031].read().val
        # val = bgen[200000:200031, 100000:100031].read().val
        print("{0},{1:,}".format(val.shape, val.shape[0] * val.shape[1]))

        current, peak = tracemalloc.get_traced_memory()
        print(
            f"Current memory usage is {current / 10**6}MB; Peak was {peak / 10**6}MB"
        )
        print("Time = {0} seconds".format(time.time() - start))
        tracemalloc.stop()

    if False:

        # filename = r"M:\deldir\fakeuk450000x1000.bgen"
        # filename = "M:/deldir/genbgen/good/merged_487400x220000.bgen"
Example #26
0
 def test_bgen_reader_no_sample(self):
     """The example file reports 199 sids."""
     with example_filepath("example.32bits.bgen") as filepath:
         variant_total = Bgen(filepath).sid_count
         assert variant_total == 199
Example #27
0
    def write(
        filename,
        distreader,
        bits=16,
        compression=None,
        sample_function=default_sample_function,
        id_rsid_function=default_id_rsid_function,
        iid_function=default_iid_function,
        sid_function=default_sid_function,
        block_size=None,
        qctool_path=None,
        cleanup_temp_files=True,
    ):
        r"""Writes a :class:`DistReader` to BGEN format and returns the :class:`.Bgen`. Requires access to the 3rd party QCTool.

        The data is first written as temporary \*.gen and \*.sample files next to *filename*; QCTool is then run
        from that directory to convert them to BGEN. The caller's working directory is always restored, even if
        the conversion fails.

        :param filename: the name of the file to create
        :type filename: string
        :param distreader: The data that should be written to disk. It can also be any distreader, for example, :class:`.DistNpz`, :class:`.DistData`, or
           another :class:`.Bgen`.
        :type distreader: :class:`DistReader`
        :param bits: Number of bits, between 1 and 32 used to represent each 0-to-1 probability value. Default is 16.
            An np.float32 needs 23 bits. A np.float64 would need 52 bits, which the BGEN format doesn't offer, so use 32.
            If None, no bit count is passed to QCTool.
        :type bits: int
        :param compression: How to compress the file. Can be None (default), 'zlib', or 'zstd'.
        :type compression: string
        :param sample_function: Function to turn a :attr:`DistReader.iid` into a BGEN sample.
           (Default: :meth:`bgen.default_sample_function`.)
        :type sample_function: function
        :param id_rsid_function: Function to turn a :attr:`DistReader.sid` into a BGEN (SNP) id and rsid.
           (Default: :meth:`bgen.default_id_rsid_function`.)
        :type id_rsid_function: function
        :param iid_function: Function to turn a BGEN sample into a :attr:`DistReader.iid`.
           (Default: :meth:`bgen.default_iid_function`.)
        :type iid_function: function
        :param sid_function: Function to turn a BGEN (SNP) id and rsid into a :attr:`DistReader.sid`.
           (Default: :meth:`bgen.default_sid_function`.)
        :type sid_function: function
        :param block_size: The number of SNPs to read in a batch from *distreader*. Defaults to a *block_size* such that *block_size* \* *iid_count* is about 100,000.
        :type block_size: number
        :param qctool_path: Tells the path to the 3rd party `QCTool <https://www.well.ox.ac.uk/~gav/qctool_v2/>`_. Defaults to reading
           path from environment variable QCTOOLPATH. (To use on Windows, install Ubuntu for Windows, install QCTool in Ubuntu,
           and then give the path as "ubuntu run <UBUNTU PATH TO QCTOOL>".)
        :type qctool_path: string
        :param cleanup_temp_files: Tells whether to delete the temporary \*.gen and \*.sample files. When True, a QCTool
           failure raises an exception; when False, the failure is only printed.
        :type cleanup_temp_files: bool
        :rtype: :class:`.Bgen`

        >>> from pysnptools.distreader import DistHdf5, Bgen
        >>> import pysnptools.util as pstutil
        >>> from pysnptools.util import example_file # Download and return local file name
        >>> hdf5_file = example_file("pysnptools/examples/toydata.snpmajor.dist.hdf5")
        >>> distreader = DistHdf5(hdf5_file)[:,:10] # A reader for the first 10 SNPs in Hdf5 format
        >>> pstutil.create_directory_if_necessary("tempdir/toydata10.bgen")
        >>> Bgen.write("tempdir/toydata10.bgen",distreader)        # Write data in BGEN format
        Bgen('tempdir/toydata10.bgen')
        """
        qctool_path = qctool_path or os.environ.get("QCTOOLPATH")
        assert (
            qctool_path is not None
        ), "Bgen.write() requires a path to an external qctool program either via the qctool_path input or by setting the QCTOOLPATH environment variable."

        # We need the +1 so that all three values will have enough precision to be very near 1
        # The max(3,..) is needed so that even 1 bit will have enough precision in the gen file
        genfilename = os.path.splitext(filename)[0] + ".gen"
        # bits=None is accepted by the command builder below (no -bgen-bits flag is passed), so it must not
        # crash here. 16 (this function's own default) is assumed for the text precision -- TODO confirm it
        # matches qctool's default bit count.
        precision_bits = 16 if bits is None else bits
        decimal_places = max(3, math.ceil(math.log(2**precision_bits, 10)) + 1)
        Bgen.genwrite(
            genfilename,
            distreader,
            decimal_places,
            id_rsid_function,
            sample_function,
            block_size,
        )

        # QCTool is run from the output directory with bare file names.
        work_dir, base_name = os.path.split(filename)
        if work_dir == "":
            work_dir = "."
        metadata_mmm = open_bgen._metadata_path_from_filename(
            base_name, samples_filepath=None)
        stem = os.path.splitext(base_name)[0]
        samplefile = stem + ".sample"
        genfile = stem + ".gen"
        olddir = os.getcwd()
        os.chdir(work_dir)
        try:
            # Remove any previous output and its metadata so stale files are never picked up.
            if os.path.exists(base_name):
                os.remove(base_name)
            if os.path.exists(metadata_mmm):
                os.remove(metadata_mmm)
            cmd = "{0} -g {1} -s {2} -og {3}{4}{5}".format(
                qctool_path,
                genfile,
                samplefile,
                base_name,
                " -bgen-bits {0}".format(bits) if bits is not None else "",
                " -bgen-compression {0}".format(compression)
                if compression is not None else "",
            )
            # NOTE(review): the command is a shell string because qctool_path may itself be a
            # multi-word command (e.g. "ubuntu run <path>"); file names are not quoted.
            try:
                subprocess.check_output(cmd,
                                        stderr=subprocess.STDOUT,
                                        shell=True,
                                        universal_newlines=True)
            except subprocess.CalledProcessError as exc:
                print("Status : FAIL", exc.returncode, exc.output
                      )  # LATER this doesn't seem to work when called from VS
                if cleanup_temp_files:
                    raise Exception("qctool command failed") from exc
                print("qctool command failed\n{0}".format(cmd))
            if cleanup_temp_files:
                os.remove(genfile)
                os.remove(samplefile)
        finally:
            # Always return to the caller's directory, including when the failure above raises.
            os.chdir(olddir)
        return Bgen(filename,
                    iid_function=iid_function,
                    sid_function=sid_function)
Example #28
0
        if hasattr(distreader,'val') and distreader.val.dtype==dtype and (order=="A" or (order=="C" and distreader.val.flags["C_CONTIGUOUS"]) or (order=="F" and distreader.val.flags["F_CONTIGUOUS"])):
            return distreader
        else:
            return distreader.read(order=order,dtype=dtype,view_ok=True)
    
    def copyinputs(self, copier):
        """Not supported by this reader: always raises ``NotImplementedError``."""
        raise NotImplementedError()

    def _assert_iid_sid_pos(self,check_val):
        """Assert that ``_row`` (iid) and ``_col`` (sid) -- and optionally ``_val`` -- are well formed.

        ``_row`` must be a 2-D str array with two columns and ``_col`` a 1-D
        str array.  When *check_val* is true, ``_val`` must additionally be
        3-D with shape (number of rows, number of cols, 3).
        """
        if check_val:
            val_shape = self._val.shape
            assert len(val_shape)==3 and val_shape[-1]==3, "val should have 3 dimensions and the last dimension should have size 3"
            expected_shape = (len(self._row),len(self._col),3)
            assert val_shape == expected_shape, "val shape should match that of iid_count x sid_count"

        iid, sid = self._row, self._col
        iid_ok = iid.dtype.type is np.str_ and len(iid.shape)==2 and iid.shape[1]==2
        assert iid_ok, "iid should be dtype str, have two dimensions, and the second dimension should be size 2"
        sid_ok = sid.dtype.type is np.str_ and len(sid.shape)==1
        assert sid_ok, "sid should be of dtype of str and one dimensional"




if __name__ == "__main__":
    # Running this module directly runs its doctests.
    logging.basicConfig(level=logging.INFO)

    # Disabled scratch code, kept for manual debugging; it never runs.
    if False:
        from pysnptools.distreader import Bgen
        dist_on_disk = Bgen('../examples/2500x100.bgen')
        print(dist_on_disk.pos[:4,].astype('int')) # print position information for the first four sids

    import doctest
    # ELLIPSIS lets '...' in expected output match any text; NORMALIZE_WHITESPACE ignores whitespace differences.
    doctest.testmod(optionflags=doctest.ELLIPSIS|doctest.NORMALIZE_WHITESPACE)
    # There is also a unit test case in 'pysnptools\test.py' that calls this doctest
    print("done")
Example #29
0
def getTestSuite():
    """
    Build and return a test suite containing every test in TestBgen
    """

    loader = unittest.TestLoader()
    suite = unittest.TestSuite()
    suite.addTests(loader.loadTestsFromTestCase(TestBgen))
    return suite


if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    if False:
        from pysnptools.distreader import Bgen
        bgen = Bgen(r'M:\deldir\2500x100.bgen')
        bgen.read()
        print(bgen.shape)
        print("")

    if False:
        from pysnptools.distreader import Bgen
        bgen = Bgen(r'M:\deldir\1x1000000.bgen', verbose=True)
        print(bgen.shape)
        print("")

    if False:
        from pysnptools.distreader import Bgen
        bgen2 = Bgen(r'M:\deldir\10x5000000.bgen', verbose=True)
        print(bgen2.shape)