Ejemplo n.º 1
0
 def _read(self, row_indexer, col_indexer, order, dtype, force_python_only,
           view_ok, num_threads):
     '''
     Read from the wrapped reader, composing this subset's own row/column
     indexers with the indexers requested by the caller.

     If the wrapped reader exposes ``_read_accepts_slices`` the requested
     indexers are composed as-is; otherwise each requested indexer is first
     expanded with ``PstReader._make_sparray_from_sparray_or_slice``.
     Returns whatever the wrapped reader's ``_read`` returns.
     '''
     self._run_once()
     dtype = np.dtype(dtype)
     inner = self._internal

     if hasattr(inner, '_read_accepts_slices'):
         assert inner._read_accepts_slices, "If an object has the _read_accepts_slices attribute, it must have value 'True'"
         rows = _PstSubset.compose_indexer_with_indexer(
             inner.row_count, self._row_indexer, self.row_count, row_indexer)
         cols = _PstSubset.compose_indexer_with_indexer(
             inner.col_count, self._col_indexer, self.col_count, col_indexer)
     else:
         to_index = PstReader._make_sparray_from_sparray_or_slice
         compose = _PstSubset.compose_indexer_with_index_or_none
         rows = compose(inner.row_count, self._row_indexer, self.row_count,
                        to_index(self.row_count, row_indexer))
         cols = compose(inner.col_count, self._col_indexer, self.col_count,
                        to_index(self.col_count, col_indexer))

     # Both branches finish with the same delegated read.
     return inner._read(rows, cols, order, dtype, force_python_only,
                        view_ok, num_threads)
Ejemplo n.º 2
0
    def __getitem__(self, iid_indexer_and_snp_indexer):
        '''
        Return a reader for the part of this kernel selected by the indexer(s).

        A tuple is unpacked as ``(iid0_indexer, iid1_indexer)``; any other
        value is used for both axes. Three cases are tried in order:

        1. every selected row iid can be looked up in ``self.train``
           -> ``_SnpTrainTest``
        2. the selected row iids equal the selected test iids
           -> ``SnpKernel``
        3. the selection keeps the full row count and column count
           -> ``_SnpWholeTest`` with the reordered ``iid0``

        Raises ``Exception`` when none of the cases applies.
        '''
        if isinstance(iid_indexer_and_snp_indexer, tuple):
            iid0_indexer, iid1_indexer = iid_indexer_and_snp_indexer
        else:
            iid0_indexer = iid1_indexer = iid_indexer_and_snp_indexer

        row_index_or_none = PstReader._make_sparray_from_sparray_or_slice(
            self.row_count, iid0_indexer)
        col_index_or_none = PstReader._make_sparray_from_sparray_or_slice(
            self.col_count, iid1_indexer)

        if row_index_or_none is None:
            row_index_or_none = list(range(self.row_count))

        assert not isinstance(row_index_or_none,
                              str), "row_index_or_none should not be a string"
        iid = self.row[row_index_or_none]

        if col_index_or_none is None or np.array_equal(
                col_index_or_none, list(range(self.col_count))):
            test = self.test
        else:
            test = self.test[col_index_or_none]

        # case 1: asking for train x test
        try:
            train = self.train[self.train.iid_to_index(iid), :]
        except Exception:
            # The lookup failed (presumably some iid is not in train), so fall
            # through to the other cases. Exception rather than a bare except,
            # so KeyboardInterrupt/SystemExit still propagate.
            pass
        else:
            return _SnpTrainTest(train=train,
                                 test=test,
                                 standardizer=self.standardizer,
                                 block_size=self.block_size)

        # case 2: asking for test x test
        if np.array_equal(test.iid, iid):
            return SnpKernel(test,
                             standardizer=self.standardizer,
                             block_size=self.block_size)

        # case 3: just reordering the iids
        if len(row_index_or_none) == self.row_count and (
                col_index_or_none is None
                or len(col_index_or_none) == self.col_count):
            return _SnpWholeTest(train=self.train,
                                 test=test,
                                 standardizer=self.standardizer,
                                 block_size=self.block_size,
                                 iid0=iid)

        raise Exception(
            "When reading from a _SnpWholeTest, can only ask to reorder iids or to access from train x test or test x test"
        )
Ejemplo n.º 3
0
    def _read(self, row_index_or_none, col_index_or_none, order, dtype, force_python_only, view_ok):
        if row_index_or_none is None and col_index_or_none is None and self._row0 is self._row1: #read all of a square ID
            return np.identity(self.row_count,dtype=dtype)
        else: #Non-square
            #!!! This is also less efficient than it could be because it create a big identity matrix and then slices it.

            #In about O(col_count + row_count) fill in zeros
            big = np.zeros([self.row_count,self.col_count],dtype=dtype)
            common = set([PstReader._makekey(x) for x in self.row]) & set([PstReader._makekey(x) for x in self.col])
            big[self.row_to_index(common),self.col_to_index(common)] = 1.0
            val, shares_memory = self._apply_sparray_or_slice_to_val(big, row_index_or_none, col_index_or_none, order, dtype, force_python_only)
            return val
Ejemplo n.º 4
0
    def _read(self, row_index_or_none, col_index_or_none, order, dtype, force_python_only, view_ok):
        if row_index_or_none is None and col_index_or_none is None and self._row0 is self._row1: #read all of a square ID
            return np.identity(self.row_count,dtype=dtype)
        else: #Non-square
            #!!! This is also less efficient than it could be because it create a big identity matrix and then slices it.

            #In about O(col_count + row_count) fill in zeros
            big = np.zeros([self.row_count,self.col_count],dtype=dtype)
            common = set([PstReader._makekey(x) for x in self.row]) & set([PstReader._makekey(x) for x in self.col])
            big[self.row_to_index(common),self.col_to_index(common)] = 1.0
            val, shares_memory = self._apply_sparray_or_slice_to_val(big, row_index_or_none, col_index_or_none, order, dtype, force_python_only)
            return val
Ejemplo n.º 5
0
    def __init__(self, internal, row_indexer, col_indexer):
        '''
        Remember the wrapped reader together with a row selection and a
        column selection.

        Each indexer may be any of:
             a single integer i (treated like [i])
             a slice
             a list of integers (negatives allowed)
             a list of Booleans
        '''
        super(_PstSubset, self).__init__()
        self._ran_once = False

        # Both indexers go through the same normalization helper.
        normalize = PstReader._make_sparray_or_slice
        self._internal = internal
        self._row_indexer = normalize(row_indexer)
        self._col_indexer = normalize(col_indexer)
Ejemplo n.º 6
0
    def compose_indexer_with_indexer(countA, indexerA, countB, indexerB):
        '''
        Combine indexerA (applied first) with indexerB (applied to A's result)
        into a single indexer.

        If either indexer is an "all" slice the other one is returned
        unchanged. Otherwise both are expanded with
        ``PstReader._make_sparray_from_sparray_or_slice`` and A's expansion is
        indexed by B's.
        '''
        # A first, then B -- the same order of checks as before.
        for candidate, other in ((indexerA, indexerB), (indexerB, indexerA)):
            if _PstSubset._is_all_slice(candidate):
                return other

        expand = PstReader._make_sparray_from_sparray_or_slice
        expandedA = expand(countA, indexerA)
        expandedB = expand(countB, indexerB)
        return expandedA[expandedB]
Ejemplo n.º 7
0
    def _read_kernel(train, standardizer, block_size=None, order='A', dtype=np.float64, force_python_only=False, view_ok=False, return_trained=False):
        '''
        Create a kernel (train.val times its transpose) for in-memory SNP data.

        * If ``standardizer`` is an ``Identity`` and ``train.val`` already has
          the requested ``dtype``, the product is computed directly with
          numpy's ``dot``.
        * Otherwise the work is delegated to ``SnpReader._read_kernel``.

        Returns ``K``, or ``(K, standardizer)`` when ``return_trained`` is true.
        '''
        from pysnptools.pstreader import PstReader

        if not (isinstance(standardizer,Identity) and train.val.dtype == dtype):
            #Do things the more general SnpReader way.
            return SnpReader._read_kernel(train, standardizer, block_size=block_size, order=order, dtype=dtype, force_python_only=force_python_only,view_ok=view_ok, return_trained=return_trained)

        #Just do a 'python' dot: no standardization is needed and everything is the right type
        val = train.val
        K = val.dot(val.T)
        if order == 'F':
            #numpy's 'dot' returns 'C' order; K is symmetric, so its transpose
            #holds the same values in 'F' order
            K = K.T
        assert PstReader._array_properties_are_ok(K,order,dtype), "internal error: K is not of the expected order or dtype"

        if return_trained:
            return K, standardizer
        return K
Ejemplo n.º 8
0
    def _read_kernel(train, standardizer, block_size=None, order='A', dtype=np.float64, force_python_only=False, view_ok=False, return_trained=False):
        '''
        Create a kernel (train.val times its transpose) for in-memory SNP data.

        * If ``standardizer`` is an ``Identity`` and ``train.val`` already has
          the requested ``dtype``, the product is computed directly with
          numpy's ``dot``.
        * Otherwise the work is delegated to ``SnpReader._read_kernel``.

        Returns ``K``, or ``(K, standardizer)`` when ``return_trained`` is true.
        '''
        from pysnptools.pstreader import PstReader

        can_dot_directly = isinstance(standardizer,Identity) and train.val.dtype == dtype
        if not can_dot_directly:
            #Do things the more general SnpReader way.
            return SnpReader._read_kernel(train, standardizer, block_size=block_size, order=order, dtype=dtype, force_python_only=force_python_only,view_ok=view_ok, return_trained=return_trained)

        #Just do a 'python' dot: no standardization is needed and everything is the right type
        val = train.val
        K = val.dot(val.T)
        if order == 'F':
            #numpy's 'dot' returns 'C' order; K is symmetric, so its transpose
            #holds the same values in 'F' order
            K = K.T
        assert PstReader._array_properties_are_ok(K,order,dtype), "internal error: K is not of the expected order or dtype"

        if return_trained:
            return K, standardizer
        return K
Ejemplo n.º 9
0
    def test_respect_inputs(self):
        '''
        Check that read_kernel respects the requested order and dtype, and
        that its values match a reference kernel computed from explicitly
        standardized data, across starting dtypes/orders, SNP counts,
        standardizers and a sliced reader.
        '''
        np.random.seed(0)
        for dtype_start, decimal_start in [(np.float32, 5), (np.float64, 10)]:
            for order_start in ['F', 'C', 'A']:
                for sid_count in [20, 2]:
                    snpdataX = SnpData(
                        iid=[["0", "0"], ["1", "1"], ["2", "2"]],
                        sid=[str(i) for i in range(sid_count)],
                        val=np.array(np.random.randint(3, size=[3, sid_count]),
                                     dtype=dtype_start,
                                     order=order_start))
                    for stdx in [
                            stdizer.Beta(1, 25),
                            stdizer.Identity(),
                            stdizer.Unit()
                    ]:
                        for snpreader0 in [snpdataX, snpdataX[:, 1:]]:
                            snpreader1 = snpreader0[1:, :]

                            refdata0, trained_standardizer = snpreader0.read(
                            ).standardize(stdx,
                                          return_trained=True,
                                          force_python_only=True)
                            refval0 = refdata0.val.dot(refdata0.val.T)
                            # LATER: refdata1/refval1 are computed but never
                            # compared against anything; they only exercise
                            # standardizing with the trained standardizer.
                            refdata1 = snpreader1.read().standardize(
                                trained_standardizer, force_python_only=True)
                            refval1 = refdata0.val.dot(refdata1.val.T)
                            for dtype_goal, decimal_goal in [(np.float32, 5),
                                                             (np.float64, 10)]:
                                for order_goal in ['F', 'C', 'A']:
                                    k = snpreader0.read_kernel(
                                        standardizer=stdx,
                                        block_size=1,
                                        order=order_goal,
                                        dtype=dtype_goal)
                                    # The check's result must be asserted;
                                    # calling it alone verifies nothing.
                                    assert PstReader._array_properties_are_ok(
                                        k.val, order_goal, dtype_goal
                                    ), "read_kernel did not respect the requested order or dtype"
                                    np.testing.assert_array_almost_equal(
                                        refval0,
                                        k.val,
                                        decimal=min(decimal_start,
                                                    decimal_goal))
Ejemplo n.º 10
0
    def test_respect_inputs(self):
        '''
        Check that read_kernel respects the requested order and dtype, and
        that its values match a reference kernel computed from explicitly
        standardized data, across starting dtypes/orders, SNP counts,
        standardizers and a sliced reader.
        '''
        np.random.seed(0)
        for dtype_start,decimal_start in [(np.float32,5),(np.float64,10)]:
            for order_start in ['F','C','A']:
                for snp_count in [20,2]:
                    snpdataX = SnpData(iid=[["0","0"],["1","1"],["2","2"]],sid=[str(i) for i in range(snp_count)],val=np.array(np.random.randint(3,size=[3,snp_count]),dtype=dtype_start,order=order_start))
                    for stdx in [stdizer.Beta(1,25),stdizer.Identity(),stdizer.Unit()]:
                        for snpreader0 in [snpdataX,snpdataX[:,1:]]:
                            snpreader1 = snpreader0[1:,:]

                            refdata0, trained_standardizer = snpreader0.read().standardize(stdx,return_trained=True,force_python_only=True)
                            refval0 = refdata0.val.dot(refdata0.val.T)
                            #LATER: refdata1/refval1 are computed but never compared; they only exercise standardizing with the trained standardizer.
                            refdata1 = snpreader1.read().standardize(trained_standardizer,force_python_only=True)
                            refval1 = refdata0.val.dot(refdata1.val.T)
                            for dtype_goal,decimal_goal in [(np.float32,5),(np.float64,10)]:
                                for order_goal in ['F','C','A']:
                                    k = snpreader0.read_kernel(standardizer=stdx,block_size=1,order=order_goal,dtype=dtype_goal)
                                    #The check's result must be asserted; calling it alone verifies nothing.
                                    assert PstReader._array_properties_are_ok(k.val,order_goal,dtype_goal), "read_kernel did not respect the requested order or dtype"
                                    np.testing.assert_array_almost_equal(refval0,k.val, decimal=min(decimal_start,decimal_goal))
Ejemplo n.º 11
0
    def _read(self, row_index_or_none, col_index_or_none, order, dtype,
              force_python_only, view_ok, num_threads):
        dtype = np.dtype(dtype)
        if row_index_or_none is None and col_index_or_none is None and self._row0 is self._row1:  #read all of a square ID
            val = np.identity(self.row_count, dtype=dtype)
            if (order == 'F' and not val.flags["F_CONTIGUOUS"]) or (
                    order == 'C' and not val.flags["C_CONTIGUOUS"]):
                val = val.T
            return val
        else:  #Non-square
            #!!! This is also less efficient than it could be because it create a big identity matrix and then slices it.

            #In about O(col_count + row_count) fill in zeros
            big = np.zeros([self.row_count, self.col_count], dtype=dtype)
            common = set([PstReader._makekey(x) for x in self.row]) & set(
                [PstReader._makekey(x) for x in self.col])
            big[self.row_to_index(common), self.col_to_index(common)] = 1.0
            val, shares_memory = self._apply_sparray_or_slice_to_val(
                big, row_index_or_none, col_index_or_none, order, dtype,
                force_python_only, num_threads)
            return val
Ejemplo n.º 12
0
    def __getitem__(self, iid_indexer_and_snp_indexer):
        '''
        Return a reader for the part of this kernel selected by the indexer(s).

        A tuple is unpacked as ``(iid0_indexer, iid1_indexer)``; any other
        value is used for both axes. Three cases are tried in order:

        1. every selected row iid can be looked up in ``self.train``
           -> ``_SnpTrainTest``
        2. the selected row iids equal the selected test iids
           -> ``SnpKernel``
        3. the selection keeps the full row count and column count
           -> ``_SnpWholeTest`` with the reordered ``iid0``

        Raises ``Exception`` when none of the cases applies.
        '''
        if isinstance(iid_indexer_and_snp_indexer,tuple):
            iid0_indexer, iid1_indexer = iid_indexer_and_snp_indexer
        else:
            iid0_indexer = iid1_indexer = iid_indexer_and_snp_indexer

        row_index_or_none = PstReader._make_sparray_from_sparray_or_slice(self.row_count, iid0_indexer)
        col_index_or_none = PstReader._make_sparray_from_sparray_or_slice(self.col_count, iid1_indexer)

        if row_index_or_none is None:
            #A real list, so indexing and len() below do not depend on how a lazy range object is treated
            row_index_or_none = list(range(self.row_count))

        assert not isinstance(row_index_or_none,str), "row_index_or_none should not be a string"
        iid = self.row[row_index_or_none]

        if col_index_or_none is None or np.array_equal(col_index_or_none,list(range(self.col_count))):
            test = self.test
        else:
            test = self.test[col_index_or_none]

        #case 1: asking for train x test
        try:
            train = self.train[self.train.iid_to_index(iid),:]
        except Exception:
            #The lookup failed (presumably some iid is not in train), so fall through to the other cases.
            #Exception rather than a bare except, so KeyboardInterrupt/SystemExit still propagate.
            pass
        else:
            return _SnpTrainTest(train=train,test=test,standardizer=self.standardizer,block_size=self.block_size)

        #case 2: asking for test x test
        if np.array_equal(test.iid,iid):
            return SnpKernel(test,standardizer=self.standardizer,block_size=self.block_size)

        #case 3: just reordering the iids
        if len(row_index_or_none) == self.row_count and (col_index_or_none is None or len(col_index_or_none) == self.col_count):
            return _SnpWholeTest(train=self.train,test=test,standardizer=self.standardizer,block_size=self.block_size,iid0=iid)

        raise Exception("When reading from a _SnpWholeTest, can only ask to reorder iids or to access from train x test or test x test")
Ejemplo n.º 13
0
 def __init__(self, filename):
     '''
     Initialize the PstReader base and remember *filename*.

     No file access happens here; ``_ran_once`` starts out False.
     '''
     PstReader.__init__(self)
     self._filename = filename
     self._ran_once = False