def test_intersection(self):
    """Check that intersect_apply on a SnpKernel subsets the *inner* snpreader
    (standardization happens AFTER intersecting) and that fancy indexing a
    SnpKernel yields another SnpKernel."""
    from pysnptools.standardizer import Unit
    from pysnptools.kernelreader import SnpKernel
    from pysnptools.snpreader import Pheno
    from pysnptools.kernelreader._subset import _KernelSubset
    from pysnptools.snpreader._subset import _SnpSubset
    from pysnptools.util import intersect_apply

    bed_reader = Bed(self.currentFolder + "/../examples/toydata.5chrom.bed", count_A1=False)
    kernel = SnpKernel(bed_reader, stdizer.Identity())

    # To test intersection we remove a iid from pheno
    pheno = Pheno(self.currentFolder + "/../examples/toydata.phe")
    pheno = pheno[1:, :]

    # SnpKernel is special because it standardizes AFTER intersecting.
    kernel_intersected, pheno = intersect_apply([kernel, pheno])
    assert isinstance(kernel_intersected.snpreader, _SnpSubset)
    assert not isinstance(kernel_intersected, _KernelSubset)

    # What happens with fancy selection?
    every_other = kernel[::2]
    assert isinstance(every_other, SnpKernel)

    logging.info("Done with test_intersection")
def _read(self, row_index_or_none, col_index_or_none, order, dtype, force_python_only, view_ok, num_threads):
    """Read all SNP values and convert them to a distribution array of shape
    (iid_count, sid_count, 3).

    Any row/col selection must already have been pushed down to the inner
    snpreader, so both index arguments are required to be None here. If
    ``self.block_size`` applies and the SNP count is large, the data is read
    and converted one column block at a time to bound peak memory; otherwise
    it is read all at once.

    Returns the resulting 3-d ndarray with the requested order and dtype.
    """
    # Real (not input-validation) assert: indexing should already be pushed to the inner snpreader.
    assert row_index_or_none is None and col_index_or_none is None
    dtype = np.dtype(dtype)

    # Do all-at-once (not in blocks) if 1. No block size is given or 2. The #ofSNPs < Min(block_size,iid_count)
    if self.block_size is None or (self.sid_count <= self.block_size or self.sid_count <= self.iid_count):
        snpdata, _ = SnpReader._as_snpdata(
            self.snpreader,
            dtype=dtype,
            order=order,
            force_python_only=force_python_only,
            standardizer=stdizer.Identity(),
            num_threads=num_threads)
        val = self._snpval_to_distval(snpdata.val, order, dtype)
        # BUGFIX: this previously read "has_right_order = order = 'A' or (...)",
        # a chained assignment that clobbered 'order' with the truthy string "A"
        # and made the assert below vacuously true. The intent is an equality
        # comparison on 'order'.
        has_right_order = (order == "A"
                           or (order == "C" and val.flags["C_CONTIGUOUS"])
                           or (order == "F" and val.flags["F_CONTIGUOUS"]))
        assert has_right_order
        return val
    else:  # Do in blocks
        t0 = time.time()
        if order == 'A':
            order = 'F'
        val = np.zeros([self.iid_count, self.sid_count, 3], dtype=dtype, order=order)  # LATER use empty or fillnan???
        logging.info(
            "reading {0} value data in blocks of {1} SNPs and finding distribution (for {2} individuals)"
            .format(self.sid_count, self.block_size, self.iid_count))
        ct = 0
        ts = time.time()
        for start in range(0, self.sid_count, self.block_size):
            ct += self.block_size
            # A view is always OK here, because we'll allocate memory in the next step.
            snpdata = self.snpreader[:, start:start + self.block_size].read(
                order=order,
                dtype=dtype,
                force_python_only=force_python_only,
                view_ok=True,
                num_threads=num_threads)
            val[:, start:start + self.block_size] = self._snpval_to_distval(snpdata.val, order, dtype)
            if ct % self.block_size == 0:
                diff = time.time() - ts
                if diff > 5:  # only log progress if it is taking a noticeable time
                    logging.info("read %s SNPs in %.2f seconds" % (ct, diff))
        t1 = time.time()
        logging.info("%.2f seconds elapsed" % (t1 - t0))
        return val
def _as_snpdata(snpreader, standardizer, force_python_only, dtype):
    '''
    Like 'read' except (1) won't read if already a snpdata and (2) returns the standardizer
    '''
    from pysnptools.snpreader import SnpData

    # If the data is already an in-memory SnpData of the right dtype and no
    # real standardization was requested, hand it back without copying.
    reusable = (isinstance(snpreader, SnpData)
                and snpreader.val.dtype == dtype
                and isinstance(standardizer, stdizer.Identity))
    if reusable:
        return snpreader, stdizer.Identity()

    snpdata = snpreader.read(order='A', dtype=dtype)
    return snpdata.standardize(standardizer, return_trained=True, force_python_only=force_python_only)
def _read_kernel(self, standardizer, block_size=None, order='A', dtype=np.float64, force_python_only=False, view_ok=False, return_trained=False):
    """Standardize this reader's SNP data and return the iid-by-iid kernel matrix.

    If no block size applies (or the SNP count is at most min(block_size,
    iid_count)), the whole SNP matrix is standardized and multiplied at once.
    Otherwise the SNPs are processed in column blocks and the per-block
    kernels are summed, bounding peak memory. When ``return_trained`` is True,
    also returns the trained standardizer (merged across blocks in the
    block-wise path).
    """
    # Do all-at-once (not in blocks) if 1. No block size is given or 2. The #ofSNPs < Min(block_size,iid_count)
    whole_at_once = block_size is None or (self.sid_count <= block_size or self.sid_count <= self.iid_count)
    if whole_at_once:
        snpdata, trained = SnpReader._as_snpdata(self, standardizer=standardizer, dtype=dtype, force_python_only=force_python_only)
        kernel = snpdata._read_kernel(stdizer.Identity(), order=order, dtype=dtype, force_python_only=force_python_only, view_ok=False)
        return (kernel, trained) if return_trained else kernel

    # Block-wise path.
    # Set the default order to 'C' because with kernels any order is fine and the Python .dot method likes 'C' best.
    if order == 'A':
        order = 'C'
    start_time = time.time()
    kernel_sum = np.zeros([self.iid_count, self.iid_count], dtype=dtype, order=order)
    trained_per_block = []
    logging.info("reading {0} SNPs in blocks of {1} and adding up kernels (for {2} individuals)".format(self.sid_count, block_size, self.iid_count))
    sid_done = 0
    progress_start = time.time()
    for block_start in range(0, self.sid_count, block_size):
        sid_done += block_size
        snpdata, trained = SnpReader._as_snpdata(self[:, block_start:block_start + block_size], standardizer=standardizer, dtype=dtype, force_python_only=force_python_only)
        trained_per_block.append(trained)
        kernel_sum += snpdata._read_kernel(stdizer.Identity(), block_size=None, order=order, dtype=dtype, force_python_only=force_python_only, view_ok=False)
        if sid_done % block_size == 0:
            elapsed = time.time() - progress_start
            if elapsed > 1:  # only log progress when it is taking noticeable time
                logging.info("read %s SNPs in %.2f seconds" % (sid_done, elapsed))
    logging.info("%.2f seconds elapsed" % (time.time() - start_time))
    if return_trained:
        # Turn the per-block trained standardizers into a single standardizer.
        return kernel_sum, standardizer._merge_trained(trained_per_block)
    return kernel_sum
def _as_snpdata(snpreader, standardizer, force_python_only, order, dtype):
    '''
    Like 'read' except (1) won't read if already a snpdata and (2) returns the standardizer

    The no-read fast path applies only when the reader already holds an
    in-memory `.val` of the requested dtype, only identity standardization is
    requested, and the existing array already satisfies the requested memory
    order ('A' accepts anything; 'C'/'F' require matching contiguity flags).
    '''
    # BUGFIX/cleanup: removed dead "from pysnptools.snpreader import SnpData" —
    # this variant checks hasattr(snpreader, 'val') instead of isinstance, so
    # the import was never used.
    dtype = np.dtype(dtype)
    if (hasattr(snpreader, 'val')
            and snpreader.val.dtype == dtype
            and isinstance(standardizer, stdizer.Identity)
            and (order == "A"
                 or (order == "C" and snpreader.val.flags["C_CONTIGUOUS"])
                 or (order == "F" and snpreader.val.flags["F_CONTIGUOUS"]))):
        return snpreader, stdizer.Identity()
    else:
        return snpreader.read(order=order, dtype=dtype).standardize(
            standardizer, return_trained=True, force_python_only=force_python_only)
def test_respect_inputs(self):
    """Kernels computed via read_kernel with block_size=1 must match a
    reference kernel computed by pure-Python standardization, across input
    dtypes/orders, standardizers, subsets, and requested output dtypes/orders."""
    np.random.seed(0)
    iid = [["0", "0"], ["1", "1"], ["2", "2"]]
    for dtype_in, decimal_in in [(np.float32, 5), (np.float64, 10)]:
        for order_in in ['F', 'C', 'A']:
            for sid_count in [20, 2]:
                raw_val = np.array(np.random.randint(3, size=[3, sid_count]), dtype=dtype_in, order=order_in)
                snpdataX = SnpData(iid=iid, sid=[str(i) for i in range(sid_count)], val=raw_val)
                for stdx in [stdizer.Beta(1, 25), stdizer.Identity(), stdizer.Unit()]:
                    for reader_full in [snpdataX, snpdataX[:, 1:]]:
                        reader_sub = reader_full[1:, :]
                        refdata0, trained_standardizer = reader_full.read().standardize(
                            stdx, return_trained=True, force_python_only=True)
                        expected = refdata0.val.dot(refdata0.val.T)
                        refdata1 = reader_sub.read().standardize(
                            trained_standardizer, force_python_only=True)  # LATER why aren't these used?
                        refval1 = refdata0.val.dot(refdata1.val.T)  # LATER why aren't these used?
                        for dtype_out, decimal_out in [(np.float32, 5), (np.float64, 10)]:
                            for order_out in ['F', 'C', 'A']:
                                kernel = reader_full.read_kernel(
                                    standardizer=stdx, block_size=1, order=order_out, dtype=dtype_out)
                                PstReader._array_properties_are_ok(kernel.val, order_out, dtype_out)
                                np.testing.assert_array_almost_equal(
                                    expected, kernel.val, decimal=min(decimal_in, decimal_out))
def _read_kernel(self, standardizer, block_size=None, order='A', dtype=np.float64, force_python_only=False, view_ok=False, return_trained=False, num_threads=None):
    '''
    Will respect the cupy environment variable.

    Standardizes this reader's SNP data and returns the iid-by-iid kernel
    matrix. When no block size applies (or the SNP count is at most
    min(block_size, iid_count)), the whole SNP matrix is standardized and
    multiplied at once; otherwise the SNPs are processed in column blocks and
    the per-block kernels are summed, bounding peak memory. When
    return_trained is True, also returns the trained standardizer (the
    per-block standardizers are merged into one in the block-wise path).
    NOTE(review): xp may be numpy or cupy via pstutil.array_module(), so the
    returned array type presumably depends on the environment — confirm with
    callers.
    '''
    dtype = np.dtype(dtype)

    #Do all-at-once (not in blocks) if 1. No block size is given or 2. The #ofSNPs < Min(block_size,iid_count)
    if block_size is None or (self.sid_count <= block_size or self.sid_count <= self.iid_count):
        # Standardize everything in memory, then compute the kernel with an
        # Identity standardizer (the data is already standardized).
        train_data, trained_standardizer = SnpReader._as_snpdata(
            self,
            standardizer=standardizer,
            dtype=dtype,
            order='A',
            force_python_only=force_python_only,
            num_threads=num_threads)
        kernel = train_data._read_kernel(
            stdizer.Identity(),
            order=order,
            dtype=dtype,
            force_python_only=force_python_only,
            view_ok=False,
            num_threads=num_threads)
        if return_trained:
            return kernel, trained_standardizer
        else:
            return kernel
    else:  #Do in blocks
        xp = pstutil.array_module()
        #Set the default order to 'C' because with kernels any order is fine and the Python .dot method likes 'C' best.
        if order == 'A':
            order = 'C'
        t0 = time.time()
        # Accumulator for the sum of per-block kernels.
        K = xp.zeros([self.iid_count, self.iid_count], dtype=dtype, order=order)
        trained_standardizer_list = []
        logging.info(
            f"reading {self.sid_count:,} SNPs in blocks of {block_size:,} and adding up kernels (for {self.iid_count:,} individuals) with {xp.__name__}."
        )
        ct = 0
        ts = time.time()
        diff_last = 0  # elapsed time at the last progress log, for throttling
        for start in range(0, self.sid_count, block_size):
            ct += block_size
            # Standardize this column block, remember its trained standardizer,
            # and add its kernel contribution to the running sum.
            train_data, trained_standardizer = SnpReader._as_snpdata(
                self[:, start:start + block_size],
                standardizer=standardizer,
                dtype=dtype,
                order='A',
                force_python_only=force_python_only,
                num_threads=num_threads)
            trained_standardizer_list.append(trained_standardizer)
            K += train_data._read_kernel(
                stdizer.Identity(),
                block_size=None,
                order=order,
                dtype=dtype,
                force_python_only=force_python_only,
                view_ok=False,
                num_threads=num_threads)
            if ct % block_size == 0:
                diff = time.time() - ts
                # Log progress only after 1s of work and at most every 5s.
                if diff > 1 and diff - diff_last > 5:
                    logging.info(f"read {ct:,} SNPs in {diff:.2f} seconds")
                    diff_last = diff
        t1 = time.time()
        logging.info("%.2f seconds elapsed" % (t1 - t0))
        if return_trained:
            return K, standardizer._merge_trained(
                trained_standardizer_list
            )  #turns this into a single standardizer
        else:
            return K