def test_respect_read_inputs(self):
    """Check that ``read`` honours ``order``, ``dtype`` and ``view_ok`` on many readers.

    Each reader is read with every combination of memory order, dtype,
    ``force_python_only`` and ``view_ok``. The returned array must have the
    requested dtype and layout, and when ``view_ok`` is False it must not be
    the reader's own ``val`` array.

    The working directory is switched to this file's directory so the
    relative example paths resolve; it is restored even when a check fails.
    """
    from pysnptools.distreader import Bgen,DistGen,DistHdf5,DistMemMap,DistNpz
    from pysnptools.snpreader import Bed

    previous_wd = os.getcwd()
    os.chdir(os.path.dirname(os.path.realpath(__file__)))
    try:
        distreaders = [
            _DistMergeSIDs([Bgen('../examples/example.bgen')[:,:5].read(),
                            Bgen('../examples/example.bgen')[:,5:].read()]),
            Bed('../examples/toydata.5chrom.bed',count_A1=True).as_dist(block_size=2000),
            Bed('../examples/toydata.5chrom.bed',count_A1=True).as_dist(),
            Bgen('../examples/example.bgen').read(),
            Bgen('../examples/bits1.bgen'),
            DistGen(seed=0,iid_count=500,sid_count=50),
            DistGen(seed=0,iid_count=500,sid_count=50)[::2,::2],
            DistHdf5('../examples/toydata.snpmajor.dist.hdf5'),
            DistMemMap('../examples/tiny.dist.memmap'),
            DistNpz('../examples/toydata10.dist.npz'),
        ]
        for distreader in distreaders:
            logging.info(str(distreader))
            for order in ['F','C','A']:
                for dtype in [np.float32,np.float64]:
                    for force_python_only in [True,False]:
                        for view_ok in [True,False]:
                            val = distreader.read(order=order,dtype=dtype,
                                                  force_python_only=force_python_only,
                                                  view_ok=view_ok).val
                            has_right_order = (order=="A"
                                               or (order=="C" and val.flags["C_CONTIGUOUS"])
                                               or (order=="F" and val.flags["F_CONTIGUOUS"]))
                            # A copy was requested, so the in-memory array must not be handed back.
                            if hasattr(distreader,'val') and not view_ok:
                                assert distreader.val is not val
                            # Informational only: a view was allowed and would have satisfied
                            # the request, but the reader copied anyway.
                            could_have_viewed = (
                                hasattr(distreader,'val')
                                and view_ok
                                and distreader.val is not val
                                and (order == 'A'
                                     or (order == 'F' and distreader.val.flags['F_CONTIGUOUS'])
                                     or (order == 'C' and distreader.val.flags['C_CONTIGUOUS']))
                                and (dtype is None or distreader.val.dtype == dtype)
                            )
                            if could_have_viewed:
                                logging.info("{0} could have read a view, but didn't".format(distreader))
                            assert val.dtype == dtype and has_right_order
    finally:
        # Always undo the chdir so a failure here cannot break later tests.
        os.chdir(previous_wd)
def test_read_write_round_trip(self):
    """Write data to BGEN at every bit depth from 1 to 32 and read it back.

    For both a generated data set and a slice of the example file, the data
    returned by ``Bgen.write`` must match a fresh read of the written file,
    and must approximate the input to within the precision the bit depth
    allows. Requires the QCTOOLPATH environment variable.

    The working directory is restored even when a check fails.
    """
    from pysnptools.distreader import DistGen

    old_dir = os.getcwd()
    os.chdir(os.path.dirname(os.path.realpath(__file__)))
    try:
        assert (
            "QCTOOLPATH" in os.environ
        ), "To run test_read_write_round_trip, QCTOOLPATH environment variable must be set. (On Windows, install QcTools in 'Ubuntu on Windows' and set to 'ubuntu run <qctoolLinuxPath>')."

        exampledata = Bgen("../examples/example.bgen")[:, 10].read()
        distgen0data = DistGen(seed=332, iid_count=50, sid_count=5).read()

        for i, distdata0 in enumerate([distgen0data, exampledata]):
            for bits in range(1, 33):
                logging.info("input#={0},bits={1}".format(i, bits))
                file1 = "temp/roundtrip1-{0}-{1}.bgen".format(i, bits)
                distdata1 = Bgen.write(
                    file1,
                    distdata0,
                    bits=bits,
                    compression="zlib",
                    cleanup_temp_files=False,
                ).read()
                assert distdata1.iid[0, 0] == "0"
                distdata2 = Bgen(file1).read()
                assert distdata1.allclose(distdata2, equal_nan=True)

                # One quantisation step at this bit depth.
                atol = 1.0 / (2**bits)
                if bits == 1:
                    # because values must add up to 1, there is more rounding error
                    atol *= 1.4
                TestBgen.assert_approx_equal(distdata0, distdata1, atol=atol)
    finally:
        os.chdir(old_dir)
def test_coverage(self):
    """Exercise less-travelled ``Bgen`` paths: custom iid function, moved metadata, repeated writes.

    The working directory is restored even if one of the writes performed
    from inside ``temp`` fails.
    """
    from pysnptools.distreader import DistGen

    with example_filepath("example.32bits.bgen") as filepath:
        bgen = Bgen(filepath,
                    fresh_properties=False,
                    iid_function=lambda sam: ("X", sam))
        assert bgen.iid[0, 0] == "X"

        # Move the metadata file aside so the next reader cannot reuse it.
        metadata_filepath = bgen._open_bgen._metadata2_path
        metadata2_temp = metadata_filepath.parent / (
            metadata_filepath.name + ".temp")
        del bgen
        if metadata2_temp.exists():
            metadata2_temp.unlink()
        os.rename(metadata_filepath, metadata2_temp)

        bgen = Bgen(filepath)
        assert bgen.iid[0, 0] == "0"
        bgen[0, 0].read(order='A')

        # Write the same relative file name twice from inside "temp".
        if not os.path.exists("temp"):
            os.mkdir("temp")
        start_dir = os.getcwd()
        os.chdir("temp")
        try:
            file1x = "coverage.bgen"
            Bgen.write(file1x, bgen[:100, :100])
            Bgen.write(file1x, bgen[:100, :100])
        finally:
            os.chdir(start_dir)

        distgen0data = DistGen(seed=332, iid_count=10010, sid_count=5).read()
        file1 = "temp/roundtrip1-big.bgen"
        bed3 = Bgen.write(file1,
                          distgen0data,
                          bits=8,
                          compression="zlib",
                          cleanup_temp_files=False,
                          sample_function=lambda fam, ind: f'{fam},{ind}')
        # NOTE(review): this assigns rather than asserts; it may have been meant as
        # `assert bed3.iid[0, 0] == '0'`. Left unchanged pending confirmation.
        bed3.iid[0, 0] = '0'
def test_bgen_reader_file_notfound(self):
    """Accessing ``iid`` on a reader built for a nonexistent path must raise."""
    missing = Bgen("/1/2/3/example.32bits.bgen")
    got_error = True
    try:
        missing.iid  # expect error
    except Exception:
        pass
    else:
        # No exception means the missing file went unnoticed.
        got_error = False
    assert got_error
def test_other(self):
    """Reading with an explicit ``.sample`` file takes the iids from that file.

    The working directory is restored even when the check fails.
    """
    old_dir = os.getcwd()
    os.chdir(os.path.dirname(os.path.realpath(__file__)))
    try:
        logging.info("in TestBgen test_other")
        bgen = Bgen("../examples/example.bgen",
                    sample="../examples/other.sample")
        assert np.all(bgen.iid[0] == ("0", "other_001"))
    finally:
        os.chdir(old_dir)
def test_zero(self):
    """Empty iid and/or sid selections read as correctly shaped empty arrays.

    The working directory is restored even when a check fails.
    """
    old_dir = os.getcwd()
    os.chdir(os.path.dirname(os.path.realpath(__file__)))
    try:
        bgen = Bgen("../examples/example.bgen")
        assert bgen[:, []].read().val.shape == (500, 0, 3)
        assert bgen[[], :].read().val.shape == (0, 199, 3)
        assert bgen[[], []].read().val.shape == (0, 0, 3)
    finally:
        os.chdir(old_dir)
def test_other(self):
    """The first iid comes from the separately supplied ``other.sample`` file.

    The working directory is restored even when the check fails.
    """
    old_dir = os.getcwd()
    os.chdir(os.path.dirname(os.path.realpath(__file__)))
    try:
        logging.info("in TestBgen test_other")
        bgen = Bgen('../examples/example.bgen',
                    sample='../examples/other.sample')
        assert np.all(bgen.iid[0] == ('0', 'other_001'))
    finally:
        os.chdir(old_dir)
def test1(self):
    """Smoke test: read the whole example file and a strided subset without error.

    The results are not inspected, so they are not kept. The working
    directory is restored even when a read fails.
    """
    old_dir = os.getcwd()
    os.chdir(os.path.dirname(os.path.realpath(__file__)))
    try:
        logging.info("in TestBgen test1")
        bgen = Bgen('../examples/example.bgen')
        bgen.read()
        bgen2 = bgen[:2, ::3]
        bgen2.read()
    finally:
        os.chdir(old_dir)
def test2(self):
    """Smoke test: read the ``bits1.bgen`` example in full and as a strided subset.

    The working directory is restored even when a read fails.
    """
    old_dir = os.getcwd()
    os.chdir(os.path.dirname(os.path.realpath(__file__)))
    try:
        logging.info("in TestBgen test2")
        bgen = Bgen("../examples/bits1.bgen")
        bgen.read()
        bgen2 = bgen[:2, ::3]
        bgen2.read()
    finally:
        os.chdir(old_dir)
def test_bgen_samples_inside_bgen(self):
    """The first four iids of the example file are ("0", "sample_001") .. ("0", "sample_004")."""
    expected = [("0", "sample_{0:03d}".format(number)) for number in range(1, 5)]
    with example_filepath("example.32bits.bgen") as filepath:
        data = Bgen(filepath)
        first_four = data.iid[:4]
        assert (first_four == expected).all()
def gen_reference(self, load_path):
    """Return the reader for ``load_path`` that matches ``self.gen_type``.

    ``.bed`` files are opened with ``Bed`` (``count_A1=True``). ``.bgen``
    files are opened with ``Bgen`` when ``self._snp_tools`` is truthy and
    with ``BgenObject`` otherwise. Any other load type raises ``Exception``.
    """
    load_type = self.gen_type

    if load_type == ".bed":
        return Bed(load_path, count_A1=True)

    if load_type == ".bgen":
        reader = Bgen if self._snp_tools else BgenObject
        return reader(load_path)

    raise Exception("Unknown load type set")
def test2(self):
    """Write a ``Bgen`` reader out as a DistMemMap and check the round trip.

    The working directory is restored even when the write or the comparison
    fails.
    """
    from pysnptools.distreader import Bgen

    old_dir = os.getcwd()
    os.chdir(os.path.dirname(os.path.realpath(__file__)))
    try:
        bgen = Bgen('../examples/example.bgen')
        # NOTE(review): the ".memamp" suffix looks like a typo of ".memmap"; left
        # unchanged because it is only an output file name.
        distmemmap = DistMemMap.write("tempdir/bgentomemmap.dist.memamp", bgen)
        assert DistData.allclose(bgen.read(), distmemmap.read(), equal_nan=True)
    finally:
        os.chdir(old_dir)
def set_snp_ids(memory_location, snps_to_id, gen_path, write_dir, file_name):
    """
    Look up the genetic-file index of each named snp and write the indexes to a csv.

    The snp names are taken from index 0 of the CsvObject loaded from snps_to_id (presumably its first column).
    Names that raise a KeyError during lookup are skipped.

    :param memory_location: Location of bgen memory file
    :type memory_location: Path | str

    :param snps_to_id: Location of snps csv to id
    :type snps_to_id: Path | str

    :param gen_path: The path to the genetic file
    :type gen_path: Path | str

    :param write_dir: The directory to write the snp index csv file to
    :type write_dir: Path | str

    :param file_name: The name of the snp index file
    :type file_name: str

    :return: Nothing, write the indexes to a csv then stop
    :rtype: None
    """
    # Point the bgen metadata at the custom location, then open the genetic reference
    custom_meta_path(validate_path(memory_location))
    gen = Bgen(str(validate_path(gen_path).absolute()))

    # Each sid is split on ","; map its second field to its first (rsid -> variant_id)
    v_dict = {}
    for sid in gen.sid:
        fields = sid.split(",")
        v_dict[fields[1]] = fields[0]

    # The snp names we want indexes for
    snps_list = CsvObject(validate_path(snps_to_id), set_columns=True)[0]

    # Collect the index of every snp that can be looked up
    snp_indexes = []
    for snp in snps_list:
        try:
            full_sid = f"{v_dict[snp]},{snp}"
            index = gen.sid_to_index([full_sid]).tolist()
        except KeyError:
            continue
        snp_indexes.append(index)

    # Write the snp indexes out
    write_csv(write_dir, f"{file_name}", ["Snp"], snp_indexes)
    print(
        f"Constructed snp id list of length {len(snp_indexes)} for {gen_path} at {terminal_time()}"
    )
def test_read1(self):
    """Check ``iid_function`` / ``sid_function`` handling, with and without cached metadata.

    A private copy of the example file is made under ``temp`` and any stale
    metadata is removed first. The whole sequence runs twice; the second
    pass presumably finds metadata left by the first.

    The working directory is restored even when a check fails.
    """
    old_dir = os.getcwd()
    os.chdir(os.path.dirname(os.path.realpath(__file__)))
    try:
        file_from = "../examples/example.bgen"
        file_to = "temp/example.bgen"
        pstutil.create_directory_if_necessary(file_to)
        if os.path.exists(file_to + ".metadata"):
            os.remove(file_to + ".metadata")
        meta = open_bgen._metadata_path_from_filename(file_to,
                                                      samples_filepath=None)
        if os.path.exists(meta):
            os.remove(meta)
        shutil.copy(file_from, file_to)

        # Use the bgen_sample_id for both parts of iid
        def iid_function(bgen_sample_id):
            return (bgen_sample_id, bgen_sample_id)

        # Parameter names kept as ``id``/``rsid`` in case the reader passes them by keyword.
        def sid_function(id, rsid):
            return "{0},{1}".format(id, rsid)

        for _ in range(2):
            bgen = Bgen(file_to)
            assert np.array_equal(bgen.iid[0], ["0", "sample_001"])
            assert bgen.sid[0] == "SNPID_2,RSID_2"

            bgen = Bgen(file_to, iid_function=iid_function, sid_function="id")
            assert np.array_equal(bgen.iid[0], ["sample_001", "sample_001"])
            assert bgen.sid[0] == "SNPID_2"

            bgen = Bgen(file_to, iid_function=iid_function, sid_function="rsid")
            assert np.array_equal(bgen.iid[0], ["sample_001", "sample_001"])
            assert bgen.sid[0] == "RSID_2"

            # Twice: open with a callable sid_function, then delete the metadata
            # file so the next open has to rebuild it.
            for _attempt in range(2):
                bgen = Bgen(file_to, iid_function, sid_function=sid_function)
                assert bgen.sid[0] == "SNPID_2,RSID_2"
                metafile = bgen._open_bgen._metadata_path_from_filename(
                    file_to, samples_filepath=None)
                del bgen
                os.remove(metafile)

            bgen = Bgen(file_to, iid_function, sid_function="rsid")
            assert np.array_equal(bgen.iid[0], ["sample_001", "sample_001"])
            assert bgen.sid[0] == "RSID_2"
    finally:
        os.chdir(old_dir)
def test_bad_sum(self):
    """Check how ``Bgen.write`` treats probability triples that do not form a distribution.

    A triple containing a NaN, or one that is all zeros, must be written as
    all-NaN. A triple summing to more than 1, to less than 1, or containing
    a negative value must make the write fail. Requires QCTOOLPATH.

    The working directory is restored even when a check fails.
    """
    old_dir = os.getcwd()
    os.chdir(os.path.dirname(os.path.realpath(__file__)))
    try:
        Path("temp").mkdir(parents=True, exist_ok=True)
        assert (
            "QCTOOLPATH" in os.environ
        ), "To run test_bad_sum, QCTOOLPATH environment variable must be set. (On Windows, install QcTools in 'Ubuntu on Windows' and set to 'ubuntu run <qctoolLinuxPath>')."

        distdata = Bgen("../examples/example.bgen")[:5, :5].read()

        # Just one NaN
        distdata.val[0, 0, :] = [np.nan, 0.5, 0.5]
        bgen = Bgen.write("temp/should_be_all_nan.bgen", distdata)
        assert np.isnan(bgen[0, 0].read().val).all()

        # All zeros
        distdata.val[0, 0, :] = [0, 0, 0]
        bgen = Bgen.write("temp/should_be_all_nan2.bgen", distdata)
        assert np.isnan(bgen[0, 0].read().val).all()

        invalid_triples = [
            [1, 2, 3],  # sums to more than 1
            [0.2, 0.2, 0.2],  # sums to less than 1
            [-1, 1, 0],  # a negative value
        ]
        for triple in invalid_triples:
            distdata.val[0, 0, :] = triple
            failed = False
            try:
                Bgen.write("temp/should_fail.bgen", distdata)
            except Exception:
                failed = True
            assert failed
    finally:
        os.chdir(old_dir)
def test_memmap(self):
    """``Bgen.write`` returns a reader whose ``iid`` is populated. Requires QCTOOLPATH.

    The working directory is restored even when a check fails.
    """
    old_dir = os.getcwd()
    os.chdir(os.path.dirname(os.path.realpath(__file__)))
    try:
        assert (
            "QCTOOLPATH" in os.environ
        ), "To run test_memmap, QCTOOLPATH environment variable must be set. (On Windows, install QcTools in 'Ubuntu on Windows' and set to 'ubuntu run <qctoolLinuxPath>')."

        distgen0data = Bgen("../examples/example.bgen")[:, 10].read()
        assert distgen0data.iid[0, 0] == "0"

        file1y = "temp/roundtrip1-{0}-{1}y.bgen".format(0, 1)
        bgen = Bgen.write(file1y, distgen0data)
        assert bgen.iid is not None

        file1x = "temp/roundtrip1-{0}-{1}x.bgen".format(0, 1)
        assert Bgen.write(file1x, distgen0data).iid[0, 0] == "0"
    finally:
        os.chdir(old_dir)
def test_bad_sum(self):
    """Check how ``Bgen.write`` treats probability triples that do not form a distribution.

    A triple containing a NaN, or one that is all zeros, must be written as
    all-NaN. A triple summing to more than 1, to less than 1, or containing
    a negative value must make the write fail. Requires QCTOOLPATH.

    The working directory is restored even when a check fails.
    """
    old_dir = os.getcwd()
    os.chdir(os.path.dirname(os.path.realpath(__file__)))
    try:
        assert 'QCTOOLPATH' in os.environ, "To run test_bad_sum, QCTOOLPATH environment variable must be set. (On Windows, install QcTools in 'Ubuntu on Windows' and set to 'ubuntu run <qctoolLinuxPath>')."

        distdata = Bgen('../examples/example.bgen')[:5, :5].read()

        #Just one NaN
        distdata.val[0, 0, :] = [np.nan, .5, .5]
        bgen = Bgen.write('temp/should_be_all_nan.bgen', distdata)
        assert np.isnan(bgen[0, 0].read().val).all()

        #All zeros
        distdata.val[0, 0, :] = [0, 0, 0]
        bgen = Bgen.write('temp/should_be_all_nan2.bgen', distdata)
        assert np.isnan(bgen[0, 0].read().val).all()

        invalid_triples = [
            [1, 2, 3],  #sums to more than 1
            [.2, .2, .2],  #sums to less than 1
            [-1, 1, 0],  #a negative value
        ]
        for triple in invalid_triples:
            distdata.val[0, 0, :] = triple
            failed = False
            try:
                Bgen.write('temp/should_fail.bgen', distdata)
            except Exception:
                # Not a bare ``except``: KeyboardInterrupt/SystemExit must not count as "write failed".
                failed = True
            assert failed
    finally:
        os.chdir(old_dir)
def test_read1(self):
    """Check ``iid_function`` / ``sid_function`` handling, with and without cached metadata.

    A private copy of the example file is made under ``temp`` and any stale
    metadata is removed first. The whole sequence runs twice; the second
    pass presumably finds metadata left by the first.

    The working directory is restored even when a check fails.
    """
    old_dir = os.getcwd()
    os.chdir(os.path.dirname(os.path.realpath(__file__)))
    try:
        file_from = '../examples/example.bgen'
        file_to = 'temp/example.bgen'
        pstutil.create_directory_if_necessary(file_to)
        if os.path.exists(file_to + ".metadata"):
            os.remove(file_to + ".metadata")
        meta = open_bgen._metadatapath_from_filename(file_to)
        if os.path.exists(meta):
            os.remove(meta)
        shutil.copy(file_from, file_to)

        #Use the bgen_sample_id for both parts of iid
        iid_function = lambda bgen_sample_id: (bgen_sample_id, bgen_sample_id)
        sid_function = lambda id, rsid: '{0},{1}'.format(id, rsid)

        for _ in range(2):
            bgen = Bgen(file_to)
            assert np.array_equal(bgen.iid[0], ['0', 'sample_001'])
            assert bgen.sid[0] == 'SNPID_2,RSID_2'

            bgen = Bgen(file_to, iid_function=iid_function, sid_function='id')
            assert np.array_equal(bgen.iid[0], ['sample_001', 'sample_001'])
            assert bgen.sid[0] == 'SNPID_2'

            bgen = Bgen(file_to, iid_function=iid_function, sid_function='rsid')
            assert np.array_equal(bgen.iid[0], ['sample_001', 'sample_001'])
            assert bgen.sid[0] == 'RSID_2'

            # Twice: open with a callable sid_function, then delete the metadata
            # file so the next open has to rebuild it.
            for _attempt in range(2):
                bgen = Bgen(file_to, iid_function, sid_function=sid_function)
                assert bgen.sid[0] == 'SNPID_2,RSID_2'
                os.remove(bgen._open_bgen._metadatapath_from_filename(file_to))

            bgen = Bgen(file_to, iid_function, sid_function='rsid')
            assert np.array_equal(bgen.iid[0], ['sample_001', 'sample_001'])
            assert bgen.sid[0] == 'RSID_2'
    finally:
        os.chdir(old_dir)
def test_bgen_reader_variants_info(self):
    """Spot-check ``pos``, ``sid``, ``iid`` and a few probability triples of the example file."""
    with example_filepath("example.32bits.bgen") as filepath:
        bgen = Bgen(filepath, sid_function="id")

        # (variant index, expected sid, expected value in pos column 2); pos column 0 is always 1
        expected_variants = [
            (0, "SNPID_2", 2000),
            (7, "SNPID_9", 9000),
            (-1, "SNPID_200", 100001),
        ]
        for index, sid, third_pos in expected_variants:
            assert bgen.pos[index, 0] == 1
            assert bgen.sid[index] == sid
            assert bgen.pos[index, 2] == third_pos

        expected_samples = [(0, "sample_001"), (7, "sample_008"), (-1, "sample_500")]
        for index, sample in expected_samples:
            assert (bgen.iid[index] == ("0", sample)).all()

        # The very first value is missing in the example data
        g = bgen[0, 0].read()
        assert np.isnan(g.val).all()

        g = bgen[1, 0].read()
        a = [[[
            0.027802362811705648, 0.00863673794284387, 0.9635608992454505
        ]]]
        np.testing.assert_array_almost_equal(g.val, a)

        b = [[[
            0.97970582847010945215516,
            0.01947019668749305418287,
            0.00082397484239749366197,
        ]]]
        g = bgen[2, 1].read()
        np.testing.assert_array_almost_equal(g.val, b)
if __name__ == '__main__':
    # Set the path to the example file you downloaded here, as well as the output directory. Do not remove the leading
    # 'r', as it makes the string a raw literal so the backslashes are kept as-is.
    path_to_gen_file = r"C:\Users\Samuel\Documents\Genetic_Examples\PolyTutOut\ByChromosome\EUR.ldpred_1.bgen"
    output_directory = r"I:\Work\Genetics\Residuals"

    # Load gen file
    # It will take longer the first time you use it, as it has to create the pysnptools metadata .mmm files, which act
    # as a faster .bim / .bgi equivalent. The finalised system will also utilise pyGenicParser, an extended version of
    # pyBgen written by myself that can use .bim in lieu of .mmm files when hard-drive / scratch space is at a premium.
    # It will also be able to use .bed/.bim/.fam files for plink interfacing. For reference, see the snippet of the
    # pyGenicPipeline (which I am the author of) at line 212, method isolate_raw_snps, of the following if required.
    # https://github.com/sbaker-dev/pyGenicPipeline/blob/main/pyGenicPipeline/core/Loaders/commonGenetic.py
    gen = Bgen(validate_path(path_to_gen_file))

    # Create some dummy values for the gender and district fixed effects. The iid count of the example dataset is 483,
    # so the number of district FE is set lower than it would really be, so that there are within-district groups.
    individual_count = gen.iid_count
    gender = [randint(0, 1) for _ in range(individual_count)]
    district = [randint(0, 50) for _ in range(individual_count)]
    print(f"Total number of individuals {individual_count}")

    # The output file takes the form of an M x N matrix in csv form, where the M rows are the snps that have
    # residuals for the N individuals. As a note, csv files are unlikely to be sufficient for storing this amount of
    # data, but we can use the plink / bgen IoStream byte code write/unpack logic if we genuinely need this amount of
    # data
    file = ResOut(validate_path(output_directory), "ResidualsOVERIDE")

    # Construct IO stream to write out to and write the header of Snp + [IID1, IID2, ... IID(N)].
# Bgen files store [variant_id, rs_id]; we only want the rs_id, hence the [1]. See https://bit.ly/2J0C1kC
def test_bgen_samples_inside_bgen(self):
    """The iids of ``haplotypes.bgen`` are exactly ("0", "sample_0") .. ("0", "sample_3")."""
    expected = [("0", "sample_{0}".format(number)) for number in range(4)]
    with example_filepath("haplotypes.bgen") as filepath:
        data = Bgen(filepath)
        matches = data.iid == expected
        assert matches.all()
from pysnptools.util import example_file
from pysnptools.distreader import Bgen

# Download a sample file
bgen_file = example_file("pysnptools/examples/example.bgen")

# Read from the file: create a reader first, nothing is loaded yet
bgen = Bgen(bgen_file)

# Read only the 1st SNP and show the shape of the NumPy array
first_snp = bgen[:, 0].read()
probs0 = first_snp.val
print(probs0.shape)
assert probs0.shape == (500, 1, 3)

# Read all variants and show the shape of the NumPy array
everything = bgen.read()
probs_all = everything.val
print(probs_all.shape)
assert probs_all.shape == (500, 199, 3)
def _setup_variables(self):
    """
    The order of IID in the genetic file may not equal that of the submission; this filters both to their common IIDs
    and sorts them to be equivalent, then residualises the phenotype on the covariants plus a constant.

    Note: appends "Constant" to self.covariant as a side effect.

    :return: The genetic file for this chromosome re-ordered by IID, a pandas dataframe of the external variables
        (IID, phenotype, residualised phenotype, covariants and constant), and a single column dataframe of the
        genetic IIDs
    """
    # Load the variables as pandas dataframe and setup the reference genetic file for this chromosome
    df = pd.read_csv(validate_path(self.args["variables"]))
    gen = Bgen(self._select_file_on_chromosome())
    self.logger.write(f"...Loaded external variables {terminal_time()}")

    # Validate that the variables we have set in the formula exist in the DataFrame
    for cont in self.covariant:
        self._validate_variable(df, cont, "Continuous")
    assert self.args["phenotype"], "GWAS requires a phenotype"

    # Recast IID via _strip_iid so both sources use the same form
    df["IID"] = [self._strip_iid(iid) for iid in df["IID"].tolist()]

    # Isolate the IID to match against the variables IID and create the reference
    genetic_iid = np.array([self._strip_iid(iid) for _, iid in gen.iid])
    genetic_position = gen.iid

    # Remove any IID that is in the external data array but not in the genetic array
    out = np.isin(df["IID"].to_numpy(), genetic_iid)
    df = df[out]

    # Remove any IID that is in the genetic array but not in the external data
    out = np.isin(genetic_iid, df["IID"].to_numpy())
    genetic_iid = genetic_iid[out]
    genetic_position = genetic_position[out]

    # Sort both arrays to be in the same order
    df = df.sort_values(by=['IID'], ascending=True)
    gen = gen[
        gen.iid_to_index(genetic_position[np.argsort(genetic_iid)]), :]

    # Load phenotypic and covariant variables as numeric. self.covariant is already a list, so it is concatenated
    # directly; wrapping it in another list would hide every covariant name from the membership test below.
    numeric_columns = [self.phenotype] + self.covariant
    for v in df.columns:
        if v in numeric_columns:
            df[v] = df[v].apply(pd.to_numeric)

    # Create an IID array of the genetic iid
    # NOTE(review): genetic_iid is filtered but not sorted here, whereas gen and df are sorted - confirm intended
    genetic_iid = pd.DataFrame(genetic_iid)
    genetic_iid.columns = ["IID"]

    # Add a constant and the residualised phenotype to the databases
    df["Constant"] = [1 for _ in range(len(df))]
    self.covariant = self.covariant + ["Constant"]
    result = sm.OLS(df[self.phenotype], df[self.covariant],
                    missing='drop').fit()
    df = pd.concat(
        [df, pd.DataFrame(result.resid, columns=[f"{self.phenotype}RES"])],
        axis=1)

    # Remove non used data to save memory
    # NOTE(review): self.covariant already ends with "Constant" at this point, so the selection below names that
    # column twice; kept as-is so the returned frame is unchanged for existing callers - confirm intended
    return gen, df[["IID", self.phenotype, f"{self.phenotype}RES"] +
                   self.covariant + ["Constant"]], genetic_iid
def test_bgen_reader_without_metadata(self):
    """After a full read, the last iid of the example file is ``sample_500``."""
    with example_filepath("example.32bits.bgen") as filepath:
        reader = Bgen(filepath)
        reader.read()
        last_sample_id = reader.iid[-1, 1]
        assert last_sample_id == "sample_500"
bgen.read(dtype='float32').val #Read the data from disk if False: import tracemalloc import logging import time logging.basicConfig(level=logging.INFO) tracemalloc.start() start = time.time() filename = "M:/deldir/genbgen/good/merged_487400x220000.bgen" #filename = "M:/deldir/genbgen/good/merged_487400x1100000.bgen" bgen = Bgen(filename, fresh_properties=False) val = bgen[:, 1000000:1000031].read().val # val = bgen[200000:200031, 100000:100031].read().val print("{0},{1:,}".format(val.shape, val.shape[0] * val.shape[1])) current, peak = tracemalloc.get_traced_memory() print( f"Current memory usage is {current / 10**6}MB; Peak was {peak / 10**6}MB" ) print("Time = {0} seconds".format(time.time() - start)) tracemalloc.stop() if False: # filename = r"M:\deldir\fakeuk450000x1000.bgen" # filename = "M:/deldir/genbgen/good/merged_487400x220000.bgen"
def test_bgen_reader_no_sample(self):
    """The example file exposes 199 variants through ``sid_count``."""
    with example_filepath("example.32bits.bgen") as filepath:
        variant_count = Bgen(filepath).sid_count
        assert variant_count == 199
def write(
    filename,
    distreader,
    bits=16,
    compression=None,
    sample_function=default_sample_function,
    id_rsid_function=default_id_rsid_function,
    iid_function=default_iid_function,
    sid_function=default_sid_function,
    block_size=None,
    qctool_path=None,
    cleanup_temp_files=True,
):
    r"""Writes a :class:`DistReader` to BGEN format and returns the :class:`.Bgen`. Requires access to the 3rd party QCTool.

    The data is first written as temporary \*.gen and \*.sample files next to *filename*; QCTool is then run
    from that directory to convert them. The caller's working directory is restored afterwards, even if QCTool fails.

    :param filename: the name of the file to create
    :type filename: string
    :param distreader: The data that should be written to disk. It can also be any distreader, for example, :class:`.DistNpz`, :class:`.DistData`, or
       another :class:`.Bgen`.
    :type distreader: :class:`DistReader`
    :param bits: Number of bits, between 1 and 32 used to represent each 0-to-1 probability value. Default is 16.
        An np.float32 needs 23 bits. A np.float64 would need 52 bits, which the BGEN format doesn't offer, so use 32.
        If None, no bit count is passed to QCTool and the text precision is chosen as for 16 bits.
    :type bits: int
    :param compression: How to compress the file. Can be None (default), 'zlib', or 'zstd'.
    :type compression: string
    :param sample_function: Function to turn a :attr:`DistReader.iid` into a BGEN sample.
       (Default: :meth:`bgen.default_sample_function`.)
    :type sample_function: function
    :param id_rsid_function: Function to turn a :attr:`DistReader.sid` into a BGEN (SNP) id and rsid.
       (Default: :meth:`bgen.default_id_rsid_function`.)
    :type id_rsid_function: function
    :param iid_function: Function to turn a BGEN sample into a :attr:`DistReader.iid`.
       (Default: :meth:`bgen.default_iid_function`.)
    :type iid_function: function
    :param sid_function: Function to turn a BGEN (SNP) id and rsid into a :attr:`DistReader.sid`.
       (Default: :meth:`bgen.default_sid_function`.)
    :type sid_function: function
    :param block_size: The number of SNPs to read in a batch from *distreader*. Defaults to a *block_size* such that *block_size* \* *iid_count* is about 100,000.
    :type block_size: number
    :param qctool_path: Tells the path to the 3rd party `QCTool <https://www.well.ox.ac.uk/~gav/qctool_v2/>`_. Defaults to reading
       path from environment variable QCTOOLPATH. (To use on Windows, install Ubuntu for Windows, install QCTool in Ubuntu,
       and then give the path as "ubuntu run <UBUNTU PATH TO QCTOOL>".)
    :type qctool_path: string
    :param cleanup_temp_files: Whether to delete the temporary \*.gen and \*.sample files. When True, a QCTool failure
       raises; when False, the failure is only printed.
    :type cleanup_temp_files: bool
    :rtype: :class:`.Bgen`

    >>> from pysnptools.distreader import DistHdf5, Bgen
    >>> import pysnptools.util as pstutil
    >>> from pysnptools.util import example_file # Download and return local file name
    >>> hdf5_file = example_file("pysnptools/examples/toydata.snpmajor.dist.hdf5")
    >>> distreader = DistHdf5(hdf5_file)[:,:10] # A reader for the first 10 SNPs in Hdf5 format
    >>> pstutil.create_directory_if_necessary("tempdir/toydata10.bgen")
    >>> Bgen.write("tempdir/toydata10.bgen",distreader)        # Write data in BGEN format
    Bgen('tempdir/toydata10.bgen')
    """
    qctool_path = qctool_path or os.environ.get("QCTOOLPATH")
    assert (
        qctool_path is not None
    ), "Bgen.write() requires a path to an external qctool program either via the qctool_path input or by setting the QCTOOLPATH environment variable."

    genfilename = os.path.splitext(filename)[0] + ".gen"
    # The command below omits -bgen-bits when bits is None, so None is an accepted value; fall back to this
    # function's default of 16 for the precision computation instead of failing on 2**None.
    precision_bits = 16 if bits is None else bits
    # We need the +1 so that all three values will have enough precision to be very near 1
    # The max(3,..) is needed so that even 1 bit will have enough precision in the gen file
    decimal_places = max(3, math.ceil(math.log(2**precision_bits, 10)) + 1)
    Bgen.genwrite(
        genfilename,
        distreader,
        decimal_places,
        id_rsid_function,
        sample_function,
        block_size,
    )

    directory, basename = os.path.split(filename)
    if directory == "":
        directory = "."
    metadata_mmm = open_bgen._metadata_path_from_filename(
        basename, samples_filepath=None)
    stem = os.path.splitext(basename)[0]
    samplefile = stem + ".sample"
    genfile = stem + ".gen"

    # QCTool is run from the output directory, using bare file names.
    olddir = os.getcwd()
    os.chdir(directory)
    try:
        # Remove any previous output and its cached metadata so stale data cannot be picked up.
        if os.path.exists(basename):
            os.remove(basename)
        if os.path.exists(metadata_mmm):
            os.remove(metadata_mmm)

        # NOTE(review): the command is a single shell string because qctool_path may itself be a multi-word
        # command (e.g. "ubuntu run <path>"); file names containing spaces or shell metacharacters are not quoted.
        cmd = "{0} -g {1} -s {2} -og {3}{4}{5}".format(
            qctool_path,
            genfile,
            samplefile,
            basename,
            " -bgen-bits {0}".format(bits) if bits is not None else "",
            " -bgen-compression {0}".format(compression)
            if compression is not None else "",
        )
        try:
            subprocess.check_output(cmd,
                                    stderr=subprocess.STDOUT,
                                    shell=True,
                                    universal_newlines=True)
        except subprocess.CalledProcessError as exc:
            print("Status : FAIL", exc.returncode, exc.output
                  )  # LATER this doesn't seem to work when called from VS
            if cleanup_temp_files:
                raise Exception("qctool command failed") from exc
            else:
                print("qctool command failed\n{0}".format(cmd))

        if cleanup_temp_files:
            os.remove(genfile)
            os.remove(samplefile)
    finally:
        # Never leave the caller in the output directory, even when qctool fails.
        os.chdir(olddir)

    new_bgen = Bgen(filename,
                    iid_function=iid_function,
                    sid_function=sid_function)
    return new_bgen
if hasattr(distreader,'val') and distreader.val.dtype==dtype and (order=="A" or (order=="C" and distreader.val.flags["C_CONTIGUOUS"]) or (order=="F" and distreader.val.flags["F_CONTIGUOUS"])): return distreader else: return distreader.read(order=order,dtype=dtype,view_ok=True) def copyinputs(self, copier): raise NotImplementedError def _assert_iid_sid_pos(self,check_val): if check_val: assert len(self._val.shape)==3 and self._val.shape[-1]==3, "val should have 3 dimensions and the last dimension should have size 3" assert self._val.shape == (len(self._row),len(self._col),3), "val shape should match that of iid_count x sid_count" assert self._row.dtype.type is np.str_ and len(self._row.shape)==2 and self._row.shape[1]==2, "iid should be dtype str, have two dimensions, and the second dimension should be size 2" assert self._col.dtype.type is np.str_ and len(self._col.shape)==1, "sid should be of dtype of str and one dimensional" if __name__ == "__main__": logging.basicConfig(level=logging.INFO) if False: from pysnptools.distreader import Bgen dist_on_disk = Bgen('../examples/2500x100.bgen') print(dist_on_disk.pos[:4,].astype('int')) # print position information for the first three sids: #The '...' is for possible space char import doctest doctest.testmod(optionflags=doctest.ELLIPSIS|doctest.NORMALIZE_WHITESPACE) # There is also a unit test case in 'pysnptools\test.py' that calls this doc t print("done")
def getTestSuite():
    """
    Build the composite test suite: every test found on TestBgen.
    """
    loader = unittest.TestLoader()
    bgen_tests = loader.loadTestsFromTestCase(TestBgen)

    test_suite = unittest.TestSuite([])
    test_suite.addTests(bgen_tests)
    return test_suite


if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    # The `if False:` sections below are disabled ad-hoc experiments against local files.
    if False:
        from pysnptools.distreader import Bgen

        bgen = Bgen(r'M:\deldir\2500x100.bgen')
        bgen.read()
        print(bgen.shape)
        print("")

    if False:
        from pysnptools.distreader import Bgen

        bgen = Bgen(r'M:\deldir\1x1000000.bgen', verbose=True)
        print(bgen.shape)
        print("")

    if False:
        from pysnptools.distreader import Bgen

        bgen2 = Bgen(r'M:\deldir\10x5000000.bgen', verbose=True)
        print(bgen2.shape)