コード例 #1
0
    def test_intersection(self):
        '''
        Check how a SnpKernel behaves when intersected with a phenotype and when sliced.

        One iid is dropped from the phenotype so that ``intersect_apply`` has
        real work to do. The kernel that comes back must not be a
        ``_KernelSubset``; instead its inner ``snpreader`` must be a
        ``_SnpSubset``. Slicing the kernel directly must give back a
        ``SnpKernel``.
        '''
        from pysnptools.standardizer import Unit
        from pysnptools.kernelreader import SnpKernel
        from pysnptools.snpreader import Pheno
        from pysnptools.kernelreader._subset import _KernelSubset
        from pysnptools.snpreader._subset import _SnpSubset
        from pysnptools.util import intersect_apply

        all_snps = Bed(self.currentFolder + "/../examples/toydata.5chrom.bed",
                       count_A1=False)
        kernel = SnpKernel(all_snps, stdizer.Identity())

        full_pheno = Pheno(self.currentFolder + "/../examples/toydata.phe")
        # To test intersection we remove an iid from pheno
        shorter_pheno = full_pheno[1:, :]

        # SnpKernel is special because it standardizes AFTER intersecting,
        # so the subsetting has to land on the inner snpreader.
        common_kernel, _common_pheno = intersect_apply([kernel, shorter_pheno])
        assert isinstance(common_kernel.snpreader, _SnpSubset)
        assert not isinstance(common_kernel, _KernelSubset)

        # What happens with fancy selection?
        every_other = kernel[::2]
        assert isinstance(every_other, SnpKernel)

        logging.info("Done with test_intersection")
コード例 #2
0
    def _read(self, row_index_or_none, col_index_or_none, order, dtype,
              force_python_only, view_ok, num_threads):
        assert row_index_or_none is None and col_index_or_none is None  #real assert because indexing should already be pushed to the inner snpreader
        dtype = np.dtype(dtype)

        #Do all-at-once (not in blocks) if 1. No block size is given or 2. The #ofSNPs < Min(block_size,iid_count)
        if self.block_size is None or (self.sid_count <= self.block_size
                                       or self.sid_count <= self.iid_count):
            snpdata, _ = SnpReader._as_snpdata(
                self.snpreader,
                dtype=dtype,
                order=order,
                force_python_only=force_python_only,
                standardizer=stdizer.Identity(),
                num_threads=num_threads)
            val = self._snpval_to_distval(snpdata.val, order, dtype)

            has_right_order = order = "A" or (
                order == "C" and val.flags["C_CONTIGUOUS"]) or (
                    order == "F" and val.flags["F_CONTIGUOUS"])
            assert has_right_order
            return val
        else:  #Do in blocks
            t0 = time.time()
            if order == 'A':
                order = 'F'
            val = np.zeros([self.iid_count, self.sid_count, 3],
                           dtype=dtype,
                           order=order)  #LATER use empty or fillnan???

            logging.info(
                "reading {0} value data in blocks of {1} SNPs and finding distribution (for {2} individuals)"
                .format(self.sid_count, self.block_size, self.iid_count))
            ct = 0
            ts = time.time()

            for start in range(0, self.sid_count, self.block_size):
                ct += self.block_size
                snpdata = self.snpreader[:, start:start + self.block_size].read(
                    order=order,
                    dtype=dtype,
                    force_python_only=force_python_only,
                    view_ok=True,
                    num_threads=num_threads
                )  # a view is always OK, because we'll allocate memory in the next step
                val[:,
                    start:start + self.block_size] = self._snpval_to_distval(
                        snpdata.val, order, dtype)
                if ct % self.block_size == 0:
                    diff = time.time() - ts
                    if diff > 5:
                        logging.info("read %s SNPs in %.2f seconds" %
                                     (ct, diff))

            t1 = time.time()
            logging.info("%.2f seconds elapsed" % (t1 - t0))

            return val
コード例 #3
0
 def _as_snpdata(snpreader, standardizer, force_python_only, dtype):
     '''
     Like 'read' except (1) won't read if already a snpdata and (2) returns the standardizer.

     When ``snpreader`` is a SnpData whose ``val`` already has ``dtype`` and
     ``standardizer`` is an Identity, the input is handed back untouched,
     paired with a new Identity standardizer. In every other case the data is
     read with order 'A' and standardized, and whatever
     ``standardize(..., return_trained=True)`` returns is passed through
     (presumably a (data, trained standardizer) pair -- confirm against
     ``standardize``).
     '''
     from pysnptools.snpreader import SnpData

     is_ready_snpdata = isinstance(snpreader, SnpData) and snpreader.val.dtype == dtype
     if is_ready_snpdata and isinstance(standardizer, stdizer.Identity):
         # Nothing to read and nothing to standardize.
         return snpreader, stdizer.Identity()

     loaded = snpreader.read(order='A', dtype=dtype)
     return loaded.standardize(standardizer,
                               return_trained=True,
                               force_python_only=force_python_only)
コード例 #4
0
    def _read_kernel(self, standardizer, block_size=None, order='A', dtype=np.float64, force_python_only=False, view_ok=False, return_trained=False):
        '''
        Compute the kernel of this reader's SNPs, optionally block by block.

        The SNPs are processed in one go when ``block_size`` is None, or when
        the SNP count is no larger than ``block_size`` or the individual
        count. Otherwise an ``[iid_count, iid_count]`` array is allocated and
        the kernel of each block of ``block_size`` SNPs is added into it.

        ``view_ok`` is accepted but not consulted; the inner kernel reads are
        always asked for ``view_ok=False``.

        :return: the kernel, or ``(kernel, trained standardizer)`` when
            ``return_trained`` is true. In block mode the per-block trained
            standardizers are combined with ``standardizer._merge_trained``.
        '''
        #Do all-at-once (not in blocks) if 1. No block size is given or 2. The #ofSNPs < Min(block_size,iid_count)
        read_all_at_once = (block_size is None
                            or self.sid_count <= block_size
                            or self.sid_count <= self.iid_count)

        if read_all_at_once:
            snpdata, trained = SnpReader._as_snpdata(
                self,
                standardizer=standardizer,
                dtype=dtype,
                force_python_only=force_python_only)
            whole_kernel = snpdata._read_kernel(
                stdizer.Identity(),
                order=order,
                dtype=dtype,
                force_python_only=force_python_only,
                view_ok=False)
            return (whole_kernel, trained) if return_trained else whole_kernel

        #Do in blocks
        #Set the default order to 'C' because with kernels any order is fine and the Python .dot method likes 'C' best.
        if order == 'A':
            order = 'C'
        started = time.time()
        kernel_sum = np.zeros([self.iid_count, self.iid_count], dtype=dtype, order=order)
        trained_per_block = []

        logging.info("reading {0} SNPs in blocks of {1} and adding up kernels (for {2} individuals)".format(self.sid_count, block_size, self.iid_count))

        snps_counted = 0
        loop_started = time.time()

        for block_start in range(0, self.sid_count, block_size):
            snps_counted += block_size
            block_reader = self[:, block_start:block_start + block_size]
            snpdata, trained = SnpReader._as_snpdata(
                block_reader,
                standardizer=standardizer,
                dtype=dtype,
                force_python_only=force_python_only)
            trained_per_block.append(trained)
            # The block is already standardized, so its own kernel is taken with Identity.
            kernel_sum += snpdata._read_kernel(
                stdizer.Identity(),
                block_size=None,
                order=order,
                dtype=dtype,
                force_python_only=force_python_only,
                view_ok=False)
            if snps_counted % block_size == 0:
                elapsed = time.time() - loop_started
                if elapsed > 1:
                    logging.info("read %s SNPs in %.2f seconds" % (snps_counted, elapsed))

        finished = time.time()
        logging.info("%.2f seconds elapsed" % (finished - started))

        if not return_trained:
            return kernel_sum
        #turns this into a single standardizer
        return kernel_sum, standardizer._merge_trained(trained_per_block)
コード例 #5
0
    def _as_snpdata(snpreader, standardizer, force_python_only, order, dtype):
        '''
        Like 'read' except (1) won't read if already a snpdata and (2) returns the standardizer
        '''
        from pysnptools.snpreader import SnpData
        dtype = np.dtype(dtype)

        if (hasattr(snpreader, 'val') and snpreader.val.dtype == dtype
                and isinstance(standardizer, stdizer.Identity)
                and (order == "A" or
                     (order == "C" and snpreader.val.flags["C_CONTIGUOUS"]) or
                     (order == "F" and snpreader.val.flags["F_CONTIGUOUS"]))):
            return snpreader, stdizer.Identity()
        else:
            return snpreader.read(order=order, dtype=dtype).standardize(
                standardizer,
                return_trained=True,
                force_python_only=force_python_only)
コード例 #6
0
    def test_respect_inputs(self):
        '''
        Check that ``read_kernel`` with ``block_size=1`` matches a directly computed kernel.

        Small random SnpData objects are built for every combination of
        starting dtype, starting memory order and SNP count, then read as a
        kernel for every combination of standardizer, requested dtype and
        requested order. Each kernel is compared with ``val.dot(val.T)`` of
        the python-only standardized data, to the looser of the two
        dtype-specific decimal tolerances.
        '''
        np.random.seed(0)
        dtype_and_decimal = ((np.float32, 5), (np.float64, 10))
        orders = ('F', 'C', 'A')

        for dtype_start, decimal_start in dtype_and_decimal:
            for order_start in orders:
                for sid_count in (20, 2):
                    raw_counts = np.random.randint(3, size=[3, sid_count])
                    snpdataX = SnpData(
                        iid=[["0", "0"], ["1", "1"], ["2", "2"]],
                        sid=list(map(str, range(sid_count))),
                        val=np.array(raw_counts,
                                     dtype=dtype_start,
                                     order=order_start))
                    # Fresh standardizer objects for every generated dataset.
                    standardizers = (stdizer.Beta(1, 25),
                                     stdizer.Identity(),
                                     stdizer.Unit())
                    for stdx in standardizers:
                        for snpreader0 in (snpdataX, snpdataX[:, 1:]):
                            snpreader1 = snpreader0[1:, :]

                            refdata0, trained_standardizer = snpreader0.read(
                            ).standardize(stdx,
                                          return_trained=True,
                                          force_python_only=True)
                            refval0 = refdata0.val.dot(refdata0.val.T)

                            #LATER why aren't these used?
                            refdata1 = snpreader1.read().standardize(
                                trained_standardizer, force_python_only=True)
                            refval1 = refdata0.val.dot(refdata1.val.T)

                            for dtype_goal, decimal_goal in dtype_and_decimal:
                                tolerance = min(decimal_start, decimal_goal)
                                for order_goal in orders:
                                    k = snpreader0.read_kernel(
                                        standardizer=stdx,
                                        block_size=1,
                                        order=order_goal,
                                        dtype=dtype_goal)
                                    # NOTE(review): the result of this call is
                                    # discarded -- confirm whether it was meant
                                    # to be asserted.
                                    PstReader._array_properties_are_ok(
                                        k.val, order_goal, dtype_goal)
                                    np.testing.assert_array_almost_equal(
                                        refval0, k.val, decimal=tolerance)
コード例 #7
0
    def _read_kernel(self,
                     standardizer,
                     block_size=None,
                     order='A',
                     dtype=np.float64,
                     force_python_only=False,
                     view_ok=False,
                     return_trained=False,
                     num_threads=None):
        '''
        Compute the kernel of this reader's SNPs, optionally block by block.

        Will respect the cupy environment variable: the block-wise
        accumulator is allocated with the array module returned by
        ``pstutil.array_module()``.

        The SNPs are processed in one go when ``block_size`` is None, or when
        the SNP count is no larger than ``block_size`` or the individual
        count. Otherwise the kernel of each block of ``block_size`` SNPs is
        added into an ``[iid_count, iid_count]`` array.

        ``view_ok`` is accepted but not consulted; the inner kernel reads are
        always asked for ``view_ok=False``.

        :return: the kernel, or ``(kernel, trained standardizer)`` when
            ``return_trained`` is true. In block mode the per-block trained
            standardizers are combined with ``standardizer._merge_trained``.
        '''
        dtype = np.dtype(dtype)

        #Do all-at-once (not in blocks) if 1. No block size is given or 2. The #ofSNPs < Min(block_size,iid_count)
        read_all_at_once = (block_size is None
                            or self.sid_count <= block_size
                            or self.sid_count <= self.iid_count)

        if read_all_at_once:
            snpdata, trained = SnpReader._as_snpdata(
                self,
                standardizer=standardizer,
                dtype=dtype,
                order='A',
                force_python_only=force_python_only,
                num_threads=num_threads)
            whole_kernel = snpdata._read_kernel(
                stdizer.Identity(),
                order=order,
                dtype=dtype,
                force_python_only=force_python_only,
                view_ok=False,
                num_threads=num_threads)
            return (whole_kernel, trained) if return_trained else whole_kernel

        #Do in blocks
        xp = pstutil.array_module()
        #Set the default order to 'C' because with kernels any order is fine and the Python .dot method likes 'C' best.
        if order == 'A':
            order = 'C'
        started = time.time()
        kernel_sum = xp.zeros([self.iid_count, self.iid_count],
                              dtype=dtype,
                              order=order)
        trained_per_block = []
        logging.info(
            f"reading {self.sid_count:,} SNPs in blocks of {block_size:,} and adding up kernels (for {self.iid_count:,} individuals) with {xp.__name__}."
        )

        ct = 0
        loop_started = time.time()
        last_reported = 0

        for block_start in range(0, self.sid_count, block_size):
            ct += block_size
            block_reader = self[:, block_start:block_start + block_size]
            snpdata, trained = SnpReader._as_snpdata(
                block_reader,
                standardizer=standardizer,
                dtype=dtype,
                order='A',
                force_python_only=force_python_only,
                num_threads=num_threads)
            trained_per_block.append(trained)
            # The block is already standardized, so its own kernel is taken with Identity.
            kernel_sum += snpdata._read_kernel(
                stdizer.Identity(),
                block_size=None,
                order=order,
                dtype=dtype,
                force_python_only=force_python_only,
                view_ok=False,
                num_threads=num_threads)

            if ct % block_size != 0:
                continue
            diff = time.time() - loop_started
            # Report only after the first second, and then at most once
            # every five seconds.
            if diff > 1 and diff - last_reported > 5:
                logging.info(f"read {ct:,} SNPs in {diff:.2f} seconds")
                last_reported = diff

        finished = time.time()
        logging.info("%.2f seconds elapsed" % (finished - started))

        if not return_trained:
            return kernel_sum
        #turns this into a single standardizer
        return kernel_sum, standardizer._merge_trained(trained_per_block)