def _read(self, row_indexer, col_indexer, order, dtype, force_python_only, view_ok, num_threads):
    """Read values from the wrapped reader, composing this subset's stored
    indexers with the caller's indexers so only a single inner read happens.

    Returns the ndarray produced by the wrapped reader's ``_read``.
    """
    self._run_once()
    dtype = np.dtype(dtype)
    if hasattr(self._internal, '_read_accepts_slices'):
        assert self._internal._read_accepts_slices, "If an object has the _read_accepts_slices attribute, it must have value 'True'"
        # Fast path: the inner reader accepts slices directly, so compose
        # indexer-with-indexer without materializing index arrays first.
        row_composed = _PstSubset.compose_indexer_with_indexer(
            self._internal.row_count, self._row_indexer, self.row_count, row_indexer)
        col_composed = _PstSubset.compose_indexer_with_indexer(
            self._internal.col_count, self._col_indexer, self.col_count, col_indexer)
        return self._internal._read(row_composed, col_composed, order, dtype,
                                    force_python_only, view_ok, num_threads)
    # Slow path: normalize the caller's indexers to index arrays (or None)
    # before composing them with the stored subset indexers.
    row_index = PstReader._make_sparray_from_sparray_or_slice(self.row_count, row_indexer)
    row_composed = _PstSubset.compose_indexer_with_index_or_none(
        self._internal.row_count, self._row_indexer, self.row_count, row_index)
    col_index = PstReader._make_sparray_from_sparray_or_slice(self.col_count, col_indexer)
    col_composed = _PstSubset.compose_indexer_with_index_or_none(
        self._internal.col_count, self._col_indexer, self.col_count, col_index)
    return self._internal._read(row_composed, col_composed, order, dtype,
                                force_python_only, view_ok, num_threads)
def __getitem__(self, iid_indexer_and_snp_indexer):
    """Index a _SnpWholeTest kernel by (iid0_indexer, iid1_indexer).

    Supports three cases: train x test, test x test, and a pure
    re-ordering of the iids; anything else raises.
    """
    if isinstance(iid_indexer_and_snp_indexer, tuple):
        iid0_indexer, iid1_indexer = iid_indexer_and_snp_indexer
    else:
        # A single indexer applies to both axes.
        iid0_indexer = iid_indexer_and_snp_indexer
        iid1_indexer = iid0_indexer

    row_index_or_none = PstReader._make_sparray_from_sparray_or_slice(
        self.row_count, iid0_indexer)
    col_index_or_none = PstReader._make_sparray_from_sparray_or_slice(
        self.col_count, iid1_indexer)
    if row_index_or_none is None:
        row_index_or_none = list(range(self.row_count))
    assert not isinstance(row_index_or_none, str), "row_index_or_none should not be a string"
    iid = self.row[row_index_or_none]

    if col_index_or_none is None or np.array_equal(
            col_index_or_none, list(range(self.col_count))):
        test = self.test
    else:
        test = self.test[col_index_or_none]

    # case 1: asking for train x test
    try:
        train = self.train[self.train.iid_to_index(iid), :]
        is_ok = True
    # Fix: was a bare `except:`, which also swallowed KeyboardInterrupt
    # and SystemExit; catching Exception keeps the best-effort intent.
    except Exception:
        is_ok = False
    if is_ok:
        return _SnpTrainTest(train=train, test=test,
                             standardizer=self.standardizer,
                             block_size=self.block_size)

    # case 2: asking for test x test
    if np.array_equal(test.iid, iid):
        return SnpKernel(test, standardizer=self.standardizer,
                         block_size=self.block_size)

    # case 3: just re-ordering the iids
    if len(row_index_or_none) == self.row_count and (
            col_index_or_none is None
            or len(col_index_or_none) == self.col_count):
        result = _SnpWholeTest(train=self.train, test=test,
                               standardizer=self.standardizer,
                               block_size=self.block_size, iid0=iid)
        return result

    raise Exception(
        "When reading from a _SnpWholeTest, can only ask to reorder iids or to access from train x test or test x test"
    )
def _read(self, row_index_or_none, col_index_or_none, order, dtype, force_python_only, view_ok): if row_index_or_none is None and col_index_or_none is None and self._row0 is self._row1: #read all of a square ID return np.identity(self.row_count,dtype=dtype) else: #Non-square #!!! This is also less efficient than it could be because it create a big identity matrix and then slices it. #In about O(col_count + row_count) fill in zeros big = np.zeros([self.row_count,self.col_count],dtype=dtype) common = set([PstReader._makekey(x) for x in self.row]) & set([PstReader._makekey(x) for x in self.col]) big[self.row_to_index(common),self.col_to_index(common)] = 1.0 val, shares_memory = self._apply_sparray_or_slice_to_val(big, row_index_or_none, col_index_or_none, order, dtype, force_python_only) return val
def __init__(self, internal, row_indexer, col_indexer):
    """Wrap *internal* with a row indexer and a column indexer.

    An indexer can be: an integer i (same as [i]), a slice, a list of
    integers (including negatives), or a list of Booleans.
    """
    super(_PstSubset, self).__init__()
    self._internal = internal
    self._ran_once = False
    # Normalize both indexers once, up front.
    self._row_indexer = PstReader._make_sparray_or_slice(row_indexer)
    self._col_indexer = PstReader._make_sparray_or_slice(col_indexer)
def compose_indexer_with_indexer(countA, indexerA, countB, indexerB):
    """Return a single indexer equivalent to applying *indexerA* (over
    countA items) followed by *indexerB* (over the countB survivors).

    If either indexer selects everything, the other is returned as-is.
    """
    if _PstSubset._is_all_slice(indexerA):
        return indexerB
    if _PstSubset._is_all_slice(indexerB):
        return indexerA
    # Materialize both as index arrays, then compose by fancy indexing.
    first = PstReader._make_sparray_from_sparray_or_slice(countA, indexerA)
    second = PstReader._make_sparray_from_sparray_or_slice(countB, indexerB)
    return first[second]
def _read_kernel(train, standardizer, block_size=None, order='A', dtype=np.float64, force_python_only=False, view_ok=False, return_trained=False):
    '''
    Create a train-x-train kernel from in-memory SNP data.

    Handles two cases:

    * No standardization needed (Identity) and the data already has the
      requested dtype: compute ``train.val . train.val.T`` directly.
    * Otherwise: fall back to the general, blocked from-disk method on
      SnpReader.

    Returns K, or (K, standardizer) when return_trained is True.
    '''
    from pysnptools.pstreader import PstReader

    # Fast 'python dot' path: no standardization and the right dtype.
    if isinstance(standardizer, Identity) and train.val.dtype == dtype:
        # (Removed an unused `ts = time.time()` local that fed only
        # stale commented-out logging code.)
        if order == 'F':
            # numpy's 'dot' always returns 'C' order; K is symmetric, so
            # transposing yields the 'F'-ordered result for free.
            K = (train.val.dot(train.val.T)).T
        else:
            K = train.val.dot(train.val.T)
        assert PstReader._array_properties_are_ok(K, order, dtype), "internal error: K is not of the expected order or dtype"
        if return_trained:
            return K, standardizer
        else:
            return K
    else:
        # Do things the more general SnpReader way.
        return SnpReader._read_kernel(train, standardizer, block_size=block_size,
                                      order=order, dtype=dtype,
                                      force_python_only=force_python_only,
                                      view_ok=view_ok, return_trained=return_trained)
def test_respect_inputs(self):
    """Check read_kernel honors the requested order/dtype for every
    combination of input order/dtype, sid count, and standardizer."""
    np.random.seed(0)
    for dtype_start, decimal_start in [(np.float32, 5), (np.float64, 10)]:
        for order_start in ['F', 'C', 'A']:
            for sid_count in [20, 2]:
                snpdataX = SnpData(
                    iid=[["0", "0"], ["1", "1"], ["2", "2"]],
                    sid=[str(i) for i in range(sid_count)],
                    val=np.array(np.random.randint(3, size=[3, sid_count]),
                                 dtype=dtype_start, order=order_start))
                for stdx in [
                        stdizer.Beta(1, 25),
                        stdizer.Identity(),
                        stdizer.Unit()
                ]:
                    for snpreader0 in [snpdataX, snpdataX[:, 1:]]:
                        snpreader1 = snpreader0[1:, :]
                        refdata0, trained_standardizer = snpreader0.read(
                        ).standardize(stdx,
                                      return_trained=True,
                                      force_python_only=True)
                        refval0 = refdata0.val.dot(refdata0.val.T)
                        refdata1 = snpreader1.read().standardize(
                            trained_standardizer, force_python_only=True
                        )  #LATER why aren't these used?
                        refval1 = refdata0.val.dot(
                            refdata1.val.T)  #LATER why aren't these used?
                        for dtype_goal, decimal_goal in [(np.float32, 5),
                                                         (np.float64, 10)]:
                            for order_goal in ['F', 'C', 'A']:
                                k = snpreader0.read_kernel(
                                    standardizer=stdx,
                                    block_size=1,
                                    order=order_goal,
                                    dtype=dtype_goal)
                                # Fix: the result was discarded before,
                                # making this property check a no-op.
                                assert PstReader._array_properties_are_ok(
                                    k.val, order_goal, dtype_goal)
                                np.testing.assert_array_almost_equal(
                                    refval0,
                                    k.val,
                                    decimal=min(decimal_start, decimal_goal))
def test_respect_inputs(self):
    """Check read_kernel honors the requested order/dtype for every
    combination of input order/dtype, snp count, and standardizer."""
    np.random.seed(0)
    for dtype_start, decimal_start in [(np.float32, 5), (np.float64, 10)]:
        for order_start in ['F', 'C', 'A']:
            for snp_count in [20, 2]:
                snpdataX = SnpData(
                    iid=[["0", "0"], ["1", "1"], ["2", "2"]],
                    sid=[str(i) for i in range(snp_count)],
                    val=np.array(np.random.randint(3, size=[3, snp_count]),
                                 dtype=dtype_start, order=order_start))
                for stdx in [stdizer.Beta(1, 25), stdizer.Identity(), stdizer.Unit()]:
                    for snpreader0 in [snpdataX, snpdataX[:, 1:]]:
                        snpreader1 = snpreader0[1:, :]
                        refdata0, trained_standardizer = snpreader0.read().standardize(
                            stdx, return_trained=True, force_python_only=True)
                        refval0 = refdata0.val.dot(refdata0.val.T)
                        refdata1 = snpreader1.read().standardize(
                            trained_standardizer, force_python_only=True)
                        refval1 = refdata0.val.dot(refdata1.val.T)
                        for dtype_goal, decimal_goal in [(np.float32, 5), (np.float64, 10)]:
                            for order_goal in ['F', 'C', 'A']:
                                k = snpreader0.read_kernel(
                                    standardizer=stdx, block_size=1,
                                    order=order_goal, dtype=dtype_goal)
                                # Fix: the result was discarded before,
                                # making this property check a no-op.
                                assert PstReader._array_properties_are_ok(
                                    k.val, order_goal, dtype_goal)
                                np.testing.assert_array_almost_equal(
                                    refval0, k.val,
                                    decimal=min(decimal_start, decimal_goal))
def _read(self, row_index_or_none, col_index_or_none, order, dtype, force_python_only, view_ok, num_threads): dtype = np.dtype(dtype) if row_index_or_none is None and col_index_or_none is None and self._row0 is self._row1: #read all of a square ID val = np.identity(self.row_count, dtype=dtype) if (order == 'F' and not val.flags["F_CONTIGUOUS"]) or ( order == 'C' and not val.flags["C_CONTIGUOUS"]): val = val.T return val else: #Non-square #!!! This is also less efficient than it could be because it create a big identity matrix and then slices it. #In about O(col_count + row_count) fill in zeros big = np.zeros([self.row_count, self.col_count], dtype=dtype) common = set([PstReader._makekey(x) for x in self.row]) & set( [PstReader._makekey(x) for x in self.col]) big[self.row_to_index(common), self.col_to_index(common)] = 1.0 val, shares_memory = self._apply_sparray_or_slice_to_val( big, row_index_or_none, col_index_or_none, order, dtype, force_python_only, num_threads) return val
def __getitem__(self, iid_indexer_and_snp_indexer):
    """Index a _SnpWholeTest kernel by (iid0_indexer, iid1_indexer).

    Supports three cases: train x test, test x test, and a pure
    re-ordering of the iids; anything else raises.
    """
    if isinstance(iid_indexer_and_snp_indexer, tuple):
        iid0_indexer, iid1_indexer = iid_indexer_and_snp_indexer
    else:
        # A single indexer applies to both axes.
        iid0_indexer = iid_indexer_and_snp_indexer
        iid1_indexer = iid0_indexer

    row_index_or_none = PstReader._make_sparray_from_sparray_or_slice(self.row_count, iid0_indexer)
    col_index_or_none = PstReader._make_sparray_from_sparray_or_slice(self.col_count, iid1_indexer)
    if row_index_or_none is None:
        row_index_or_none = range(self.row_count)
    assert not isinstance(row_index_or_none, str), "row_index_or_none should not be a string"
    iid = self.row[row_index_or_none]

    if col_index_or_none is None or np.array_equal(col_index_or_none, range(self.col_count)):
        test = self.test
    else:
        test = self.test[col_index_or_none]

    # case 1: asking for train x test
    try:
        train = self.train[self.train.iid_to_index(iid), :]
        is_ok = True
    # Fix: was a bare `except:`, which also swallowed KeyboardInterrupt
    # and SystemExit; catching Exception keeps the best-effort intent.
    except Exception:
        is_ok = False
    if is_ok:
        return _SnpTrainTest(train=train, test=test, standardizer=self.standardizer, block_size=self.block_size)

    # case 2: asking for test x test
    if np.array_equal(test.iid, iid):
        return SnpKernel(test, standardizer=self.standardizer, block_size=self.block_size)

    # case 3: just re-ordering the iids
    if len(row_index_or_none) == self.row_count and (col_index_or_none is None or len(col_index_or_none) == self.col_count):
        result = _SnpWholeTest(train=self.train, test=test, standardizer=self.standardizer, block_size=self.block_size, iid0=iid)
        return result

    raise Exception("When reading from a _SnpWholeTest, can only ask to reorder iids or to access from train x test or test x test")
def __init__(self, filename):
    """Remember *filename*; actual reading is deferred until first use."""
    PstReader.__init__(self)
    self._filename = filename
    self._ran_once = False