Example #1
    def par_tsne(self, param_list, store_res=True, nprocs=1):
        """
        Run t-SNE with multiple sets of parameters in parallel.

        Parameters
        ----------
        param_list: list of dict
            List of parameters being passed to t-SNE.
        store_res: bool
            Whether to store the t-SNE results in this object.
        nprocs: int
            Number of processes.

        Returns
        -------
        tsne_res_list: list of float arrays
            List of t-SNE results, one for each parameter set.

        Notes
        -----
        Results of parallel runs cannot be stored during the run, because
        race conditions may occur.
        """
        nprocs = min(int(nprocs), len(param_list))
        # run t-SNE once for a single parameter set

        def srun_tsne(param_dict):
            return self.tsne(store_res=False, **param_dict)

        resl = utils.parmap(srun_tsne, param_list, nprocs)
        if store_res:
            for i in range(len(param_list)):
                self.put_tsne(str(param_list[i]), resl[i])
        return resl
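utils.parmap itself is not shown in these snippets. Below is a minimal, self-contained stand-in for the same pattern (map a worker over a list of parameter dicts with a bounded process count) using the standard library's multiprocessing.Pool; the worker is a placeholder, not the real tsne call:

from multiprocessing import Pool

def srun(param_dict):
    # placeholder for self.tsne(store_res=False, **param_dict)
    return sum(param_dict.values())

param_list = [{"a": 1, "b": 2}, {"a": 3, "b": 4}, {"a": 5, "b": 6}]
# never spawn more workers than there are tasks
nprocs = min(2, len(param_list))

if __name__ == "__main__":
    with Pool(nprocs) as pool:
        res_list = pool.map(srun, param_list)
    print(res_list)  # [3, 7, 11]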
Example #2
    def detect_rare_samples(self, k, d_cutoff, n_iter, nprocs=1):
        """
        KNN rare sample detection with multiple parameter combinations.

        Assuming that at least k samples in this dataset look similar to
        each other, samples with fewer than k similar neighbors may be
        rare. Rare samples can either be genuinely distinct from the
        general population or caused by technical errors.

        This procedure iteratively detects samples according to their k-th
        nearest neighbors. Samples most distinct from their k-th nearest
        neighbors are detected first. The remaining samples are then
        detected with progressively less stringent distance cutoffs. The
        distance cutoff decreases linearly from the maximum distance to
        d_cutoff over n_iter iterations.

        Parameters
        ----------
        k: int list or scalar
            Number of nearest neighbors used to detect rare samples.
        d_cutoff: float list or scalar
            Samples with pairwise distances >= d_cutoff are distinct from
            each other. Minimum (>=) distance for a sample to be called
            rare.
        n_iter: int list or scalar
            Number of progressive KNN detection iterations to run on the
            dataset.
        nprocs: int
            Number of processes used to run all parameter tuples.

        Returns
        -------
        res_list
            Indices of non-rare samples for each corresponding parameter
            tuple.

        Notes
        -----
        If parameters are provided as lists of equal length n, the n
        corresponding parameter tuples will be executed in parallel.

        Example:

        `k = [10, 15, 20]`

        `d_cutoff = [1, 2, 3]`

        `n_iter = [10, 20, 30]`

        `(k, d_cutoff, n_iter)` tuples `(10, 1, 10), (15, 2, 20), (20, 3, 30)`
        will be run in parallel using nprocs processes.
        """
        # Convert scalar to list
        if np.isscalar(k):
            k_list = [k]
        else:
            k_list = list(k)

        if np.isscalar(d_cutoff):
            d_cutoff_list = [d_cutoff]
        else:
            d_cutoff_list = list(d_cutoff)

        if np.isscalar(n_iter):
            n_iter_list = [n_iter]
        else:
            n_iter_list = list(n_iter)
        # Check all param lists have the same length
        if not (len(k_list) == len(d_cutoff_list) == len(n_iter_list)):
            raise ValueError("Parameter should have the same length."
                             "k: {}, d_cutoff: {}, n_iter: {}.".format(
                                 k, d_cutoff, n_iter))
        n_param_tups = len(k_list)
        # type check all parameters
        for i in range(n_param_tups):
            if k_list[i] < 1 or k_list[i] > self._sdm._x.shape[0] - 1:
                raise ValueError("k should be >= 1 and <= n_samples-1. "
                                 "k: {}".format(k))
            else:
                k_list[i] = int(k_list[i])

            if d_cutoff_list[i] <= 0:
                raise ValueError("d_cutoff should be > 0. "
                                 "d_cutoff: {}".format(d_cutoff))
            else:
                d_cutoff_list[i] = float(d_cutoff_list[i])

            if n_iter_list[i] < 1:
                raise ValueError("n_iter should be >= 1. "
                                 "n_iter: {}".format(n_iter))
            else:
                n_iter_list[i] = int(n_iter_list[i])

        param_tups = [(k_list[i], d_cutoff_list[i], n_iter_list[i])
                      for i in range(n_param_tups)]
        nprocs = int(nprocs)
        nprocs = min(nprocs, n_param_tups)

        # returns (filtered_sdm, progress_list (list of kept indices))
        res_list = utils.parmap(
            lambda ptup: self._rare_sample_detection_runner(*ptup), param_tups,
            nprocs)

        for i in range(n_param_tups):
            if param_tups[i] not in self._res_lut:
                self._res_lut[param_tups[i]] = res_list[i]

        return [res[0] for res in res_list]
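The scalar-to-list promotion and equal-length check above recur across these methods. Here is a standalone sketch of the same idea; the helper name broadcast_params is hypothetical, not part of the library:

import numpy as np

def broadcast_params(*params):
    # promote scalars to single-element lists
    lists = [[p] if np.isscalar(p) else list(p) for p in params]
    # all per-parameter lists must have the same length
    if len(set(map(len, lists))) != 1:
        raise ValueError("Parameters should have the same length: "
                         "{}".format(params))
    # zip the per-parameter lists into per-run tuples
    return list(zip(*lists))

print(broadcast_params(10, 1.0, 10))
# [(10, 1.0, 10)]
print(broadcast_params([10, 15], [1, 2], [10, 20]))
# [(10, 1, 10), (15, 2, 20)]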
Example #3
    def knn_pickup_features(self,
                            k,
                            n_do,
                            min_present_val,
                            n_iter,
                            nprocs=1,
                            statistic_fun=np.median):
        """
        Runs KNN pick-up on multiple parameter sets in parallel.

        Each parameter set will be executed in one process.

        Parameters
        ----------
        k: int list or scalar
            Look at k nearest neighbors to decide whether to pick up or
            not.
        n_do: int list or scalar
            Minimum (`>=`) number of neighbors, among the KNN, with values
            above min_present_val for an entry to be called a dropout, so
            that pick-up will be performed.
        min_present_val: float list or scalar
            Minimum (`>=`) value of a feature to be called present.
        n_iter: int list or scalar
            The number of iterations to run.
        nprocs: int
            Number of processes used to run all parameter tuples.
        statistic_fun: callable
            The summary statistic used to correct gene dropouts. Default is
            median.

        Returns
        -------
        kpu_sdm_list: list of SampleDistanceMatrix
            List of SampleDistanceMatrix objects built from the picked-up
            data matrix of each corresponding parameter tuple. The full
            per-run results, `(pu_x, pu_idc_arr, stats)`, are cached in
            `self._res_lut`.

            pu_x: array of shape (n_samples, n_features)
                Data matrix after pick-up.
            pu_idc_arr: array of shape (n_samples, n_features)
                Indicator matrix recording the iteration at which each
                entry was picked up.
            stats: str
                Stats of the run.


        Notes
        -----
        If parameters are provided as lists of equal length n, the n
        corresponding parameter tuples will be executed in parallel.

        Example
        -------

        If `k = [10, 15]`, `n_do = [1, 2]`, `min_present_val = [5, 6]`, and
        `n_iter = [10, 20]`, the `(k, n_do, min_present_val, n_iter)` tuples
        `(10, 1, 5, 10)` and `(15, 2, 6, 20)` will be run in parallel using
        nprocs processes.
        """
        try:
            # make sure the function maps a list of numbers to a real scalar
            stat_res = statistic_fun([0, 1, 2])
            if not (np.isscalar(stat_res) and np.isreal(stat_res)):
                raise ValueError("statistic_fun should be a function of a "
                                 "list of numbers that returns a scalar.")
        except Exception:
            raise ValueError("statistic_fun should be a function of a "
                             "list of numbers that returns a scalar.")

        if np.isscalar(k):
            k_list = [k]
        else:
            k_list = list(k)

        if np.isscalar(n_do):
            n_do_list = [n_do]
        else:
            n_do_list = list(n_do)

        if np.isscalar(min_present_val):
            min_present_val_list = [min_present_val]
        else:
            min_present_val_list = list(min_present_val)

        if np.isscalar(n_iter):
            n_iter_list = [n_iter]
        else:
            n_iter_list = list(n_iter)

        # Check all param lists have the same length
        if not (len(k_list) == len(n_do_list) == len(min_present_val_list) ==
                len(n_iter_list)):
            raise ValueError("Parameter should have the same length."
                             "k: {}, n_do: {}, min_present_val: {}, "
                             "n_iter: {}.".format(k, n_do, min_present_val,
                                                  n_iter))
        n_param_tups = len(k_list)
        # type check all parameters
        for i in range(n_param_tups):
            if k_list[i] < 1 or k_list[i] >= self._sdm._x.shape[0]:
                raise ValueError("k should be >= 1 and < n_samples. "
                                 "k: {}".format(k))
            else:
                k_list[i] = int(k_list[i])

            if n_do_list[i] > k_list[i] or n_do_list[i] < 1:
                raise ValueError("n_do should be <= k and >= 1. "
                                 "n_do: {}".format(n_do))
            else:
                n_do_list[i] = int(n_do_list[i])

            min_present_val_list[i] = float(min_present_val_list[i])

            if n_iter_list[i] < 1:
                raise ValueError("n_iter should be >= 1. "
                                 "n_iter: {}".format(n_iter))
            else:
                n_iter_list[i] = int(n_iter_list[i])

        param_tups = [(k_list[i], n_do_list[i], min_present_val_list[i],
                       n_iter_list[i], statistic_fun)
                      for i in range(n_param_tups)]
        res_list = []
        # use cached results with the following procedure
        # 1. put cached results to res_list, with not cached ones as None
        # 2. run not cached ones
        # 3. after running, cache the results and fill res_list
        # same caching pattern as in filter
        # TODO: abstract the running pattern into a function

        # parameter tuples without cached results for running
        run_param_tups = []
        # indices of results to be filled after running
        res_list_run_inds = []
        for i, ptup in enumerate(param_tups):
            if ptup in self._res_lut:
                res_list.append(self._res_lut[ptup])
            else:
                run_param_tups.append(ptup)
                res_list.append(None)
                res_list_run_inds.append(i)
        # set up parameters for running
        # use gzipped pickle bytes to save space, because python
        # multiprocessing limits the amount of data shared through a pipe
        gz_pb_x = gzip.compress(pickle.dumps(self._sdm._x))
        run_param_setup_tups = []
        for ptup in run_param_tups:
            # assumes that the first element of the ptup is k
            run_param_setup_tups.append((gz_pb_x,
                                         self._sdm.s_knn_ind_lut(ptup[0])) +
                                        ptup)

        nprocs = int(nprocs)
        nprocs = min(nprocs, n_param_tups)
        run_res_list = utils.parmap(
            lambda ptup: self._knn_pickup_features_runner(*ptup),
            run_param_setup_tups, nprocs)

        for i, param_tup in enumerate(run_param_tups):
            # cache results
            if param_tup in self._res_lut:
                raise NotImplementedError("Unexpected scenario encountered")
            res_x = pickle.loads(gzip.decompress(run_res_list[i][0]))
            res_idc = pickle.loads(gzip.decompress(run_res_list[i][1]))
            res_tup = (res_x, res_idc, run_res_list[i][2])
            self._res_lut[param_tup] = res_tup
            # fill res_list
            if res_list[res_list_run_inds[i]] is not None:
                raise NotImplementedError("Unexpected scenario encountered")
            res_list[res_list_run_inds[i]] = res_tup

        kpu_sdm_list = []
        for res in res_list:
            kpu_x = res[0]
            kpu_sdm = eda.SampleDistanceMatrix(kpu_x,
                                               metric=self._sdm._metric,
                                               sids=self._sdm.sids,
                                               fids=self._sdm.fids,
                                               nprocs=self._sdm._nprocs)
            kpu_sdm_list.append(kpu_sdm)
        return kpu_sdm_list
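The caching step above ships the data matrix to workers as gzip-compressed pickle bytes. A minimal sketch of that round trip in isolation, since a pickled-then-gzipped array restores exactly:

import gzip
import pickle

import numpy as np

x = np.arange(12, dtype=float).reshape(4, 3)
# compress before sending through a size-limited multiprocessing pipe ...
gz_pb_x = gzip.compress(pickle.dumps(x))
# ... and restore on the receiving side
x_restored = pickle.loads(gzip.decompress(gz_pb_x))
assert np.array_equal(x, x_restored)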
Example #4
    def detect_rare_samples(self,
                            k,
                            d_cutoff,
                            n_iter,
                            nprocs=1,
                            metric=None,
                            use_pca=False,
                            use_hnsw=False,
                            index_params=None,
                            query_params=None):
        """
        KNN rare sample detection with multiple parameter combinations.

        Assuming that at least k samples in this dataset look similar to
        each other, samples with fewer than k similar neighbors may be
        rare. Rare samples can either be genuinely distinct from the
        general population or caused by technical errors.

        This procedure iteratively detects samples according to their k-th
        nearest neighbors. Samples most distinct from their k-th nearest
        neighbors are detected first. The remaining samples are then
        detected with progressively less stringent distance cutoffs. The
        distance cutoff decreases linearly from the maximum distance to
        d_cutoff over n_iter iterations.

        Parameters
        ----------
        k: int list or scalar
            Number of nearest neighbors used to detect rare samples.
        d_cutoff: float list or scalar
            Samples with pairwise distances >= d_cutoff are distinct from
            each other. Minimum (>=) distance for a sample to be called
            rare.
        n_iter: int list or scalar
            Number of progressive KNN detection iterations to run on the
            dataset.
        metric: {'cosine', 'euclidean', None}
            If None, `self._sdm._metric` is used.
        use_pca: bool
            Use PCA for nearest neighbors or not.
        use_hnsw: bool
            Use a Hierarchical Navigable Small World graph to compute
            approximate nearest neighbors.
        index_params: dict
            Parameters used by HNSW in indexing.

            efConstruction: int
                Default 100. Higher value improves the quality of a constructed
                graph and leads to higher accuracy of search. However this also
                leads to longer indexing times. The reasonable range of values
                is 100-2000.
            M: int
                Default 5. Higher value leads to better recall and shorter
                retrieval times, at the expense of longer indexing time. The
                reasonable range of values is 5-100.
            delaunay_type: {0, 1, 2, 3}
                Default 2. Pruning heuristic, which affects the trade-off
                between retrieval performance and indexing time. The default
                is usually quite good.
            post: {0, 1, 2}
                Default 0. The amount and type of postprocessing applied to the
                constructed graph. 0 means no processing. 2 means more
                processing.
            indexThreadQty: int
                Default self._nprocs. The number of threads used.

        query_params: dict
            Parameters used by HNSW in querying.

            efSearch: int
                Default 100. Higher value improves recall at the expense of
                longer retrieval time. The reasonable range of values is
                100-2000.
        nprocs: int
            Number of processes used to run all parameter tuples.

        Returns
        -------
        res_list
            Indices of non-rare samples for each corresponding parameter
            tuple.

        Notes
        -----
        If parameters are provided as lists of equal length n, the n
        corresponding parameter tuples will be executed in parallel.

        Example:

        `k = [10, 15, 20]`

        `d_cutoff = [1, 2, 3]`

        `n_iter = [10, 20, 30]`

        `(k, d_cutoff, n_iter)` tuples `(10, 1, 10), (15, 2, 20), (20, 3, 30)`
        will be run in parallel using nprocs processes.
        """
        # Convert scalar to list
        if np.isscalar(k):
            k_list = [k]
        else:
            k_list = list(k)

        if np.isscalar(d_cutoff):
            d_cutoff_list = [d_cutoff]
        else:
            d_cutoff_list = list(d_cutoff)

        if np.isscalar(n_iter):
            n_iter_list = [n_iter]
        else:
            n_iter_list = list(n_iter)
        # Check all param lists have the same length
        if not (len(k_list) == len(d_cutoff_list) == len(n_iter_list)):
            raise ValueError("Parameter should have the same length."
                             "k: {}, d_cutoff: {}, n_iter: {}.".format(
                                 k, d_cutoff, n_iter))
        n_param_tups = len(k_list)
        # type check all parameters
        for i in range(n_param_tups):
            if k_list[i] < 1 or k_list[i] > self._sdm._x.shape[0] - 1:
                raise ValueError("k should be >= 1 and <= n_samples-1. "
                                 "k: {}".format(k))
            else:
                k_list[i] = int(k_list[i])

            if d_cutoff_list[i] <= 0:
                raise ValueError("d_cutoff should be > 0. "
                                 "d_cutoff: {}".format(d_cutoff))
            else:
                d_cutoff_list[i] = float(d_cutoff_list[i])

            if n_iter_list[i] < 1:
                raise ValueError("n_iter should be >= 1. "
                                 "n_iter: {}".format(n_iter))
            else:
                n_iter_list[i] = int(n_iter_list[i])

        param_tups = [(k_list[i], d_cutoff_list[i], n_iter_list[i], metric,
                       use_pca, use_hnsw, index_params, query_params)
                      for i in range(n_param_tups)]
        nprocs = int(nprocs)
        nprocs = min(nprocs, n_param_tups)

        # returns (filtered_sdm, progress_list (list of kept indices))
        if self._sdm._use_pdist:
            res_list = utils.parmap(
                lambda ptup: self._pdist_rare_s_detect(*ptup), param_tups,
                nprocs)
        else:
            res_list = utils.parmap(
                lambda ptup: self._no_pdist_rare_s_detect(*ptup), param_tups,
                nprocs)

        for i in range(n_param_tups):
            # only use k, d_cutoff, and n_iter as the cache key
            param_key = param_tups[i][:3]
            if param_key not in self._res_lut:
                self._res_lut[param_key] = res_list[i]
        return [res[0] for res in res_list]
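A hedged sketch of how the HNSW knobs documented above might be supplied. The dict values restate the documented defaults; the commented call and the instance obj are hypothetical, and the nmslib backend these parameter names suggest is an assumption, not confirmed by the snippet:

# values restate the documented defaults
index_params = {
    "efConstruction": 100,  # graph build quality vs. indexing time
    "M": 5,                 # connectivity; recall vs. indexing time
    "delaunay_type": 2,     # pruning heuristic
    "post": 0,              # no graph postprocessing
}
query_params = {
    "efSearch": 100,        # recall vs. retrieval time
}
# hypothetical call on an instance `obj` that defines detect_rare_samples
# res = obj.detect_rare_samples(k=15, d_cutoff=2, n_iter=20, nprocs=2,
#                               use_hnsw=True, index_params=index_params,
#                               query_params=query_params)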
Example #5
def test_parmap_tup():
    pm_res = utils.parmap(lambda x: x**2, (1, 2, 3))
    assert isinstance(pm_res, list)
    assert pm_res == [1, 4, 9]
Example #6
def test_parmap_exception_mp():
    n = 1000
    with pytest.warns(UserWarning, match='division by zero'):
        pm_res = utils.parmap(lambda x: x / 0, range(n), nprocs=10)
    assert all(map(lambda x: isinstance(x, ZeroDivisionError), pm_res))
Example #7
def test_parmap_gen_mp():
    n = 1000
    pm_res = utils.parmap(lambda x: x**2, range(n), nprocs=10)
    assert isinstance(pm_res, list)
    assert pm_res == list(map(lambda x: x**2, range(n)))
Example #8
def test_parmap_invalid_nprocs():
    with pytest.raises(ValueError) as excinfo:
        pm_res = utils.parmap(lambda x: x**2,
                              np.array([[1, 2], [3, 4]]),
                              nprocs=0.5)
Example #9
def test_parmap_arr2d():
    pm_res = utils.parmap(lambda x: x**2, np.array([[1, 2], [3, 4]]))
    assert isinstance(pm_res, list)
    assert np.all(pm_res[0] == np.array([1, 4]))
    assert np.all(pm_res[1] == np.array([9, 16]))
Example #10
def test_parmap_arr2d_col():
    pm_res = utils.parmap(lambda x: x**2, np.array([1, 2, 3]).reshape(3, 1))
    assert isinstance(pm_res, list)
    assert pm_res == [1, 4, 9]
Example #11
def test_parmap_gen():
    pm_res = utils.parmap(lambda x: x**2, range(1, 4))
    assert isinstance(pm_res, list)
    assert pm_res == [1, 4, 9]
Example #12
def test_parmap_lst():
    pm_res = utils.parmap(lambda x: x**2, [1, 2, 3])
    assert isinstance(pm_res, list)
    assert pm_res == [1, 4, 9]
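Taken together, these tests pin down the utils.parmap contract: it accepts any iterable (including the rows of a 2D array), always returns a list in input order, validates nprocs, and converts worker exceptions into warnings while returning the exception objects as results. Below is a minimal sketch consistent with that contract, not the library's actual implementation; it assumes a fork-based multiprocessing start method so that lambdas can be dispatched to child processes:

import multiprocessing as mp
import warnings

def _worker(f, chunk, conn):
    # child process: apply f to each item; keep exceptions as values
    out = []
    for x in chunk:
        try:
            out.append(f(x))
        except Exception as e:
            out.append(e)
    conn.send(out)
    conn.close()

def parmap(f, iterable, nprocs=1):
    if not (isinstance(nprocs, int) and nprocs >= 1):
        raise ValueError("nprocs should be an int >= 1. "
                         "nprocs: {}".format(nprocs))
    items = list(iterable)  # a 2D ndarray yields its rows here
    nprocs = min(nprocs, max(len(items), 1))
    # round-robin chunks, one per worker process
    chunks = [items[i::nprocs] for i in range(nprocs)]
    pipes = [mp.Pipe() for _ in range(nprocs)]
    procs = [mp.Process(target=_worker, args=(f, c, child))
             for c, (_, child) in zip(chunks, pipes)]
    for p in procs:
        p.start()
    chunk_res = [parent.recv() for parent, _ in pipes]
    for p in procs:
        p.join()
    # restitch the round-robin chunks back into input order
    res = [None] * len(items)
    for i in range(nprocs):
        res[i::nprocs] = chunk_res[i]
    # surface worker exceptions as warnings; keep them as results
    for r in res:
        if isinstance(r, Exception):
            warnings.warn(str(r))
    return res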