Example #1
def from_numpy(x, labels_matrix=None, undirected=True):
    G = Graph()
    if issparse(x):
        cx = x.tocoo()
        for i, j, v in zip(cx.row, cx.col, cx.data):
            if i == j:
                continue
            G[str(i)][str(j)] = {}
            if undirected:
                G[str(j)][str(i)] = {}
    else:
        raise Exception("Dense matrices not yet supported.")
    if labels_matrix is not None:
        if issparse(labels_matrix):
            cx = labels_matrix.tocoo()
            for i, j, v in zip(cx.row, cx.col, cx.data):
                if random.random() > 0.5:
                    G.label[str(i)] = j
            # print(len(cx.row), len(cx.col))
            # exit()
        else:
            raise Exception("Dense matrices not yet supported.")

    # if undirected:
    #     G.make_undirected()

    # G.make_consistent()
    return G
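For context, a hypothetical call could look like the sketch below; `Graph` and `from_numpy` come from the surrounding codebase, and only the scipy adjacency matrix is constructed here.

from scipy.sparse import csr_matrix

# Hypothetical usage sketch: a 3-node undirected path graph 0-1-2.
adj = csr_matrix([[0, 1, 0],
                  [1, 0, 1],
                  [0, 1, 0]])
G = from_numpy(adj, undirected=True)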
Example #2
def check_sample_shape_fits(D: np.ndarray, idx: np.ndarray):
    """ Check that number of columns in ``D`` equals the size of ``idx``. """
    if issparse(D) or issparse(idx):
        raise TypeError("Sparse matrices are not supported for SampleMP.")
    check_is_nD_array(D, 2, "Distance/similarity")
    check_is_nD_array(idx, 1, "Index")
    if D.shape[1] > D.shape[0]:
        raise ValueError("Number of samples is higher than number of points. "
                         "Must be less than or equal. In the latter case, "
                         "consider not using samples at all for efficiency. "
                         "Shape of `D`: {}.".format(D.shape))
    if D.shape[1] != idx.size:
        raise TypeError("Number of samples in index array does not match "
                        "the number of samples in the data matrix. "
                        "Size of `idx`: {}, Columns in `D`: {}.".format(
                            idx.size, D.shape[1]))
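A hypothetical usage sketch: a partial distance matrix with three sampled columns must be paired with an index array of exactly three entries.

import numpy as np

D = np.random.rand(5, 3)       # 5 points, 3 sampled columns
idx = np.array([0, 2, 4])      # indices of the 3 sampled objects
check_sample_shape_fits(D, idx)        # passes silently
# check_sample_shape_fits(D, idx[:2])  # would raise TypeError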
Example #3
def to_netflux(flux):
    r"""Compute the netflux from the gross flux.   
    
    Parameters
    ----------
    flux : (M, M) ndarray
        Matrix of flux values between pairs of states.
    
    Returns
    -------
    netflux : (M, M) ndarray
        Matrix of netflux values between pairs of states.
        
    Notes
    -----
    The netflux or effective current is defined as
    
    .. math:: f_{ij}^{+}=\max \{ f_{ij}-f_{ji}, 0 \}
    
    where :math:`f_{ij}` is the gross flux from state :math:`i` to state
    :math:`j` for the :math:`A \rightarrow B` reaction.
    
    References
    ----------
    .. [1] P. Metzner, C. Schuette and E. Vanden-Eijnden.
        Transition Path Theory for Markov Jump Processes. 
        Multiscale Model Simul 7: 1192-1219 (2009)
    
    """
    if issparse(flux):
        return sparse.tpt.to_netflux(flux)
    elif isdense(flux):
        return dense.tpt.to_netflux(flux)
    else:
        raise _type_not_supported  
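The definition in the Notes amounts to an elementwise maximum of the antisymmetric part. A minimal dense sketch (the helper name `netflux_dense` is made up; it is not the library routine, which also handles sparse input):

import numpy as np

def netflux_dense(F):
    # f_ij^+ = max(f_ij - f_ji, 0), applied elementwise
    return np.maximum(F - F.T, 0.0)

F = np.array([[0.0, 0.3, 0.1],
              [0.1, 0.0, 0.2],
              [0.4, 0.0, 0.0]])
print(netflux_dense(F))  # antisymmetric part, clipped at zero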
Example #4
def _partial_hubness(k, d_self, log, sort_order,
                     rows, submatrix, idx, n, verbose):
    """Parallel hubness calculation: Get k nearest neighbors for all points 
    in 'rows'"""
    
    Dk = np.zeros((k, len(rows)), dtype=np.float32)
    
    for i, row in enumerate(submatrix):
        if verbose and ((rows[i]+1)%10000==0 or rows[i]+1==n):
            log.message("NN: {} of {}.".format(rows[i]+1, n), flush=True)
        if issparse(submatrix):
            d = row.toarray().ravel() # dense copy of one row
        else: # normal ndarray
            d = row
        d[rows[i]] = d_self
        d[~np.isfinite(d)] = d_self
        # randomize the distance matrix rows to avoid the problem case
        # if all numbers to sort are the same, which would yield high
        # hubness, even if there is none
        rp = np.random.permutation(n)
        d2 = d[rp]
        d2idx = np.argsort(d2, axis=0)[::sort_order]
        Dk[:, i] = rp[d2idx[0:k]]  
    
    return [rows, Dk]    
Example #5
def to_netflux(flux):
    r"""Compute the netflux from the gross flux.

    Parameters
    ----------
    flux : (M, M) ndarray
        Matrix of flux values between pairs of states.

    Returns
    -------
    netflux : (M, M) ndarray
        Matrix of netflux values between pairs of states.

    Notes
    -----
    The netflux or effective current is defined as

    .. math:: f_{ij}^{+}=\max \{ f_{ij}-f_{ji}, 0 \}

    where :math:`f_{ij}` is the gross flux from state :math:`i` to state
    :math:`j` for the :math:`A \rightarrow B` reaction.

    References
    ----------
    .. [1] P. Metzner, C. Schuette and E. Vanden-Eijnden.
        Transition Path Theory for Markov Jump Processes.
        Multiscale Model Simul 7: 1192-1219 (2009)

    """
    if issparse(flux):
        return sparse.tpt.to_netflux(flux)
    elif isdense(flux):
        return dense.tpt.to_netflux(flux)
    else:
        raise _type_not_supported
Example #6
def total_flux(F, A=None):
    r"""Compute the total flux, or turnover flux, that is produced by
        the flux sources and consumed by the flux sinks.

    Parameters
    ----------
    F : (M, M) ndarray
        Matrix of flux values between pairs of states.
    A : array_like (optional)
        List of integer state labels for set A (reactant)

    Returns
    -------
    F : float
        The total flux, or turnover flux, that is produced by the flux
        sources and consumed by the flux sinks

    References
    ----------
    .. [1] P. Metzner, C. Schuette and E. Vanden-Eijnden.
        Transition Path Theory for Markov Jump Processes.
        Multiscale Model Simul 7: 1192-1219 (2009)

    """
    if issparse(F):
        return sparse.tpt.total_flux(F, A=A)
    elif isdense(F):
        return dense.tpt.total_flux(F, A=A)
    else:
        raise _type_not_supported
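For intuition, a dense sketch assuming the usual TPT convention that the total flux is the flux leaving the source set A (the helper name is made up, and the convention is an assumption, not a quote of the library code):

import numpy as np

def total_flux_dense(F, A):
    # Total (turnover) flux: sum_{i in A, j not in A} f_ij
    A = np.asarray(A)
    notA = np.setdiff1d(np.arange(F.shape[0]), A)
    return F[np.ix_(A, notA)].sum()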
Example #7
def coarsegrain(F, sets):
    r"""Coarse-grains the flux to the given sets.

    Parameters
    ----------
    F : (n, n) ndarray or scipy.sparse matrix
        Matrix of flux values between pairs of states.
    sets : list of array-like of ints
        The sets of states onto which the flux is coarse-grained.

    Notes
    -----
    The coarse grained flux is defined as

    .. math:: fc_{I,J} = \sum_{i \in I,j \in J} f_{i,j}

    Note that if you coarse-grain a net flux, it does not necessarily
    have the net flux property anymore. If you want to make sure you get a
    net flux, use ``to_netflux(coarsegrain(F, sets))``.

    References
    ----------
    .. [1] F. Noe, Ch. Schuette, E. Vanden-Eijnden, L. Reich and
        T. Weikl: Constructing the Full Ensemble of Folding Pathways
        from Short Off-Equilibrium Simulations.
        Proc. Natl. Acad. Sci. USA, 106, 19011-19016 (2009)

    """
    if issparse(F):
        return sparse.tpt.coarsegrain(F, sets)
    elif isdense(F):
        return dense.tpt.coarsegrain(F, sets)
    else:
        raise _type_not_supported
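The coarse-graining formula from the Notes translates directly to NumPy; a minimal dense sketch (assuming `sets` is a list of integer index arrays; `coarsegrain_dense` is a hypothetical helper, not the library routine):

import numpy as np

def coarsegrain_dense(F, sets):
    # fc_{I,J} = sum_{i in I, j in J} f_{i,j}
    Fc = np.zeros((len(sets), len(sets)))
    for I, set_i in enumerate(sets):
        for J, set_j in enumerate(sets):
            Fc[I, J] = F[np.ix_(set_i, set_j)].sum()
    return Fc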
Example #8
 def __init__(self, D, classes, k, isSimilarityMatrix=False):
     """
     .. note:: Deprecated in hub-toolbox 2.3
               Class will be removed in hub-toolbox 3.0.
               Please use static functions instead.
     """
     print("DEPRECATED: Please use KnnClassification.score() instead.", 
           file=sys.stderr)
     if issparse(D):
         self.D = D
     else:
         self.D = np.copy(D)
     self.classes = np.copy(classes)
     if type(k) is np.ndarray:
         self.k = np.copy(k)
     else:
         self.k = np.array([k])
     self.isSimilarityMatrix = isSimilarityMatrix
     if self.isSimilarityMatrix:
         self.self_value = -np.inf
         self.sort_order = -1
     else:
         self.self_value = np.inf
         self.sort_order = 1
     assert D.shape[0] == len(classes)
Example #9
def _hubness_nearest_neighbors(i, n, m, d_self, metric, 
                               kth, sort_order, log, verbose, shuffle_equal):
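    # NOTE: `D` (distance matrix) and `D_k` (k-NN output buffer) are not
    # parameters; they are assumed to be provided by the enclosing module
    # scope, e.g. shared with the worker processes.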
    if verbose and ((i+1)%10000==0 or i+1==n):
        log.message("NN: {} of {}.".format(i+1, n), flush=True)
    if issparse(D):
        d = D[i, :].toarray().ravel() # dense copy of one row
    else: # normal ndarray
        d = D[i, :]
    if n == m:
        d[i] = d_self
    else: # this does not hold for general dissimilarities
        if metric == 'distance':
            d[d==0] = d_self
    d[~np.isfinite(d)] = d_self
    if shuffle_equal:
        # Randomize equal values in the distance matrix rows to avoid the
        # problem case if all numbers to sort are the same, which would yield
        # high hubness, even if there is none.
        rp = np.random.permutation(m)
        d2 = d[rp]
        d2idx = np.argpartition(d2, kth=kth)
        D_k[i, :] = rp[d2idx[kth]][::sort_order]
    else:
        d_idx = np.argpartition(d, kth=kth)
        D_k[i, :] = d_idx[kth][::sort_order]
    return
Example #10
def sparse_matrix_to_hdf(sparse_matrix, name_to_store, hdf_file_path):
    nonzero_indices = np.nonzero(sparse_matrix > 0)
    if len(nonzero_indices[0]) == 0:
        raise Exception("can't store empty sparse matrix!")

    if issparse(sparse_matrix):
        if sparse_matrix.__class__ is lil_matrix:
            nonzero_values = sparse_matrix.tocsr()[nonzero_indices].A1
        else:
            nonzero_values = lil_matrix(
                sparse_matrix).tocsr()[nonzero_indices].A1
    else:
        nonzero_values = np.array(sparse_matrix[nonzero_indices])


#     print(sparse_matrix.__class__,'=',name_to_store,sparse_matrix.shape,len(nonzero_values))

    matrix_dataframe = pd.DataFrame({
        "row_indexes": nonzero_indices[0],
        "col_indexes": nonzero_indices[1],
        "data": nonzero_values
    })
    matrix_shape_series = pd.Series(sparse_matrix.shape)

    matrix_dataframe.to_hdf(hdf_file_path, name_to_store)
    matrix_shape_series.to_hdf(hdf_file_path, "%s_shape" % name_to_store)

    del nonzero_indices, nonzero_values, matrix_dataframe, matrix_shape_series
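A hypothetical usage sketch (writing HDF5 through pandas requires the optional `tables` package; the key and file name are made up for illustration):

from scipy.sparse import lil_matrix

m = lil_matrix((3, 3))
m[0, 1] = 1.0
m[2, 0] = 2.5
sparse_matrix_to_hdf(m, "adjacency", "/tmp/matrices.h5")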
Example #11
def coarsegrain(F, sets):
    r"""Coarse-grains the flux to the given sets. 
    
    Parameters
    ----------
    F : (n, n) ndarray or scipy.sparse matrix
        Matrix of flux values between pairs of states.
    sets : list of array-like of ints
        The sets of states onto which the flux is coarse-grained.

    Notes
    -----
    The coarse grained flux is defined as

    .. math:: fc_{I,J} = \sum_{i \in I,j \in J} f_{i,j}
    
    Note that if you coarse-grain a net flux, it does not necessarily
    have the net flux property anymore. If you want to make sure you get a
    net flux, use ``to_netflux(coarsegrain(F, sets))``.
    
    References
    ----------
    .. [1] F. Noe, Ch. Schuette, E. Vanden-Eijnden, L. Reich and
        T. Weikl: Constructing the Full Ensemble of Folding Pathways
        from Short Off-Equilibrium Simulations.
        Proc. Natl. Acad. Sci. USA, 106, 19011-19016 (2009)
        
    """
    if issparse(F):
        return sparse.tpt.coarsegrain(F, sets)
    elif isdense(F):
        return dense.tpt.coarsegrain(F, sets)
    else:
        raise _type_not_supported  
Example #12
def total_flux(F, A = None):
    r"""Compute the total flux, or turnover flux, that is produced by
        the flux sources and consumed by the flux sinks.
        
    Parameters
    ----------
    F : (M, M) ndarray
        Matrix of flux values between pairs of states.
    A : array_like (optional)
        List of integer state labels for set A (reactant)
        
    Returns
    -------
    F : float
        The total flux, or turnover flux, that is produced by the flux
        sources and consumed by the flux sinks
        
    References
    ----------
    .. [1] P. Metzner, C. Schuette and E. Vanden-Eijnden.
        Transition Path Theory for Markov Jump Processes. 
        Multiscale Model Simul 7: 1192-1219 (2009)
        
    """
    if issparse(F):
        return sparse.tpt.total_flux(F, A = A)
    elif isdense(F):
        return dense.tpt.total_flux(F, A = A)
    else:
        raise _type_not_supported  
Example #13
    def _k_neighbors_precomputed_sparse(self, X, n_samples=None):
        ''' Find nearest neighbors in sparse distance matrix. 

        Parameters
        ----------
        X : sparse, shape = [n_test, n_indexed]
            Sparse distance matrix. Only non-zero elements
            may be considered neighbors.
        n_samples : int
            Number of sampled indexed objects, e.g.
            in approximate hubness reduction.
            If None, this is inferred from the first row of X.
    
        Returns
        -------
        k_neighbors : ndarray
            Flattened array of neighbor indices.
        '''
        assert issparse(X), f'Matrix is not sparse'
        X = X.tocsr()
        if n_samples is None:
            n_samples = X.indptr[1] - X.indptr[0]
        n_test, _ = X.shape
        # To allow different number of explicit entries per row,
        # we need to process the matrix row-by-row.
        if np.all(X.indptr[1:] - X.indptr[:-1] == n_samples)\
            and not self.shuffle_equal:
            min_ind = np.argpartition(X.data.reshape(n_test, n_samples),
                                      kth=np.arange(self.k),
                                      axis=1)[:, :self.k]
            k_neighbors = X.indices[
                min_ind.ravel() + np.repeat(X.indptr[:-1], repeats=self.k)]
        else:
            min_ind = np.empty((n_test,), dtype=object)
            k_neighbors = np.empty((n_test,), dtype=object)
            if self.shuffle_equal:
                for i in range(n_test):
                    if self.verbose > 1 \
                        or self.verbose and (i % 1000 == 0 or i+1 == n_test):
                        log.message(f"k neighbors (from sparse distances): "
                                    f"{i+1}/{n_test}.", flush=True)
                    x = X.getrow(i)
                    rp = self.random_state.permutation(x.nnz)
                    d2 = x.data[rp]
                    d2idx = np.argpartition(d2, kth=np.arange(self.k))
                    k_neighbors[i] = x.indices[rp[d2idx[:self.k]]]
            else:
                for i in range(n_test):
                    if self.verbose > 1 \
                        or self.verbose and (i % 1000 == 0 or i+1 == n_test):
                        log.message(f"k neighbors (from sparse distances): "
                                    f"{i+1}/{n_test}.", flush=True)
                    x = X.getrow(i)
                    min_ind = np.argpartition(
                        x.data, kth=np.arange(self.k))[:self.k]
                    k_neighbors[i] = x.indices[min_ind]
            k_neighbors = np.concatenate(k_neighbors)
        return k_neighbors
Example #14
def pathways(F, A, B, fraction=1.0, maxiter=1000):
    r"""Decompose flux network into dominant reaction paths.

    Parameters
    ----------
    F : (M, M) scipy.sparse matrix
        The flux network (matrix of netflux values)
    A : array_like
        The set of starting states
    B : array_like
        The set of end states
    fraction : float, optional
        Fraction of total flux to assemble in pathway decomposition
    maxiter : int, optional
        Maximum number of pathways for decomposition

    Returns
    -------
    paths : list
        List of dominant reaction pathways
    capacities: list
        List of capacities corresponding to each reaction pathway in `paths`

    Notes
    -----
    The default value for fraction is 1.0, i.e. all dominant reaction
    pathways for the flux network are computed. For large networks the
    number of possible reaction paths can increase rapidly so that it
    becomes prohibitively expensive to compute all possible reaction
    paths. To prevent this from happening, maxiter sets the maximum
    number of reaction pathways that will be computed.

    For large flux networks it might be necessary to decrease fraction
    or to increase maxiter. It is advisable to begin with a small
    value for fraction and monitor the number of pathways returned
    when increasing the value of fraction.

    References
    ----------
    .. [1] P. Metzner, C. Schuette and E. Vanden-Eijnden.
        Transition Path Theory for Markov Jump Processes.
        Multiscale Model Simul 7: 1192-1219 (2009)

    """
    if issparse(F):
        return sparse.pathways.pathways(F,
                                        A,
                                        B,
                                        fraction=fraction,
                                        maxiter=maxiter)
    elif isdense(F):
        return sparse.pathways.pathways(csr_matrix(F),
                                        A,
                                        B,
                                        fraction=fraction,
                                        maxiter=maxiter)
    else:
        raise _type_not_supported
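A hypothetical calling sketch (the toy net flux matrix routes all flux from state 0 through state 1 into state 2; in practice `F` would come from `flux_matrix(..., netflux=True)`):

import numpy as np
from scipy.sparse import csr_matrix

F_net = csr_matrix(np.array([[0.0, 0.3, 0.0],
                             [0.0, 0.0, 0.3],
                             [0.0, 0.0, 0.0]]))
paths, capacities = pathways(F_net, A=[0], B=[2], fraction=0.9, maxiter=100)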
Example #15
    def _k_neighbors_precomputed_sparse(self,
                                        X: csr_matrix,
                                        n_samples: int = None):
        """ Find nearest neighbors in sparse distance matrix.

        Parameters
        ----------
        X: sparse, shape = [n_test, n_indexed]
            Sparse distance matrix. Only non-zero elements
            may be considered neighbors.

        n_samples: int
            Number of sampled indexed objects, e.g.
            in approximate hubness reduction.
            If None, this is inferred from the first row of X.

        Returns
        -------
        k_neighbors : ndarray
            Flattened array of neighbor indices.
        """
        if not issparse(X):
            raise TypeError(f'Matrix X is not sparse')
        X = X.tocsr()
        if n_samples is None:
            n_samples = X.indptr[1] - X.indptr[0]
        n_test, _ = X.shape
        # To allow different number of explicit entries per row,
        # we need to process the matrix row-by-row.
        if np.all(X.indptr[1:] -
                  X.indptr[:-1] == n_samples) and not self.shuffle_equal:
            min_ind = np.argpartition(X.data.reshape(n_test, n_samples),
                                      kth=np.arange(self.k),
                                      axis=1)[:, :self.k]
            k_neighbors = X.indices[min_ind.ravel() +
                                    np.repeat(X.indptr[:-1], repeats=self.k)]
        else:
            k_neighbors = np.empty((n_test, ), dtype=object)
            if self.verbose:
                range_n_test = tqdm(range(n_test))
            else:
                range_n_test = range(n_test)
            if self.shuffle_equal:
                for i in range_n_test:
                    x = X.getrow(i)
                    rp = self._random_state.permutation(x.nnz)
                    d2 = x.data[rp]
                    d2idx = np.argpartition(d2, kth=np.arange(self.k))
                    k_neighbors[i] = x.indices[rp[d2idx[:self.k]]]
            else:
                for i in range_n_test:
                    x = X.getrow(i)
                    min_ind = np.argpartition(x.data,
                                              kth=np.arange(self.k))[:self.k]
                    k_neighbors[i] = x.indices[min_ind]
            k_neighbors = np.concatenate(k_neighbors)
        return k_neighbors
Example #16
 def __init__(self, D, k:int=7, scalingType='nicdm', isSimilarityMatrix=False):
     """
     .. note:: Deprecated in hub-toolbox 2.3
               Class will be removed in hub-toolbox 3.0.
               Please use static functions instead.
     """
     print("DEPRECATED: Please use LocalScaling.local_scaling() or "
           "LocalScaling.nicdm() instead.", file=sys.stderr)
     self.log = Logging.ConsoleLogging()
     self.D = np.copy(D)
     self.k = k
     self.scalingType = scalingType
     if isSimilarityMatrix:
         if scalingType=='nicdm':
             if issparse(D):
                 self.log.error("NICDM does not support sparse matrices.")
                 raise NotImplementedError(
                     "NICDM does not support sparse matrices.")
             else:
                 self.log.warning("NICDM does not support similarities. "
                     "Distances will be calculated as D=1-S/S.max and used "
                     "for NICDM scaling. Similarities are subsequently "
                     "obtained by the same procedure S=1-D/D.max")
         else:
             self.log.warning("Similarity-based LS support is experimental.")
     self.isSimilarityMatrix = isSimilarityMatrix
     if self.isSimilarityMatrix:
         self.sort_order = -1
         self.exclude = -np.inf 
     else:
         self.sort_order = 1
         self.exclude = np.inf
     if issparse(D):
         if isSimilarityMatrix:
             self.log.warning("Sparse matrix support for LS is experimental.")
         else:
             self.log.error("Sparse distance matrices are not supported.")
             raise NotImplementedError(
                            "Sparse distance matrices are not supported.")    
Example #17
def pathways(F, A, B, fraction=1.0, maxiter=1000):
    r"""Decompose flux network into dominant reaction paths.

    Parameters
    ----------
    F : (M, M) scipy.sparse matrix
        The flux network (matrix of netflux values)
    A : array_like
        The set of starting states
    B : array_like
        The set of end states
    fraction : float, optional
        Fraction of total flux to assemble in pathway decomposition
    maxiter : int, optional
        Maximum number of pathways for decomposition
        
    Returns
    -------
    paths : list
        List of dominant reaction pathways
    capacities: list
        List of capacities corresponding to each reaction pathway in `paths`

    Notes
    -----
    The default value for fraction is 1.0, i.e. all dominant reaction
    pathways for the flux network are computed. For large networks the
    number of possible reaction paths can increase rapidly so that it
    becomes prohibitively expensive to compute all possible reaction
    paths. To prevent this from happening, maxiter sets the maximum
    number of reaction pathways that will be computed.

    For large flux networks it might be necessary to decrease fraction
    or to increase maxiter. It is advisable to begin with a small
    value for fraction and monitor the number of pathways returned
    when increasing the value of fraction.

    References
    ----------
    .. [1] P. Metzner, C. Schuette and E. Vanden-Eijnden.
        Transition Path Theory for Markov Jump Processes. 
        Multiscale Model Simul 7: 1192-1219 (2009)    
        
    """
    if issparse(F):
        return sparse.pathways.pathways(F, A, B, fraction=fraction, maxiter=maxiter)
    elif isdense(F):
        return sparse.pathways.pathways(csr_matrix(F), A, B, fraction=fraction, maxiter=maxiter)
    else:
        raise _type_not_supported
Example #18
def pathways(F, A, B, qplus, fraction = 1.0, totalflux = None):
    r"""Pathway decomposition of the net flux.
    
    Parameters
    ----------
    F : (M, M) ndarray
        Matrix of flux values between pairs of states.
    A : array-like of ints
        A states (source, educt states)
    B : array-like of ints
        B states (sinks, product states)
    qplus : (M,) ndarray
        Forward committor
    fraction : float (optional)
        The fraction of the total flux for which pathways will be
        computed. When set larger than 1.0, 1.0 is used. When set <=
        0.0, no pathways will be computed and two empty lists will be
        returned. For example, when set to fraction = 0.9, the
        pathway decomposition will stop when 90% of the flux has been
        accumulated. This is very useful for large flux networks, which
        often contain a few major and many minor paths. In such
        networks, the algorithm would spend a very long time on the
        last few percent of pathways.
    
    Returns
    -------
    (paths,pathfluxes) : (list of int-arrays, double-array)
        paths in the order of decreasing flux. Each path is given as an 
        int-array of state indexes, ordered by increasing forward committor 
        values. The first index of each path will be a state in A,
        the last index a state in B. 
        The corresponding figure in the pathfluxes-array is the flux carried 
        by that path. The pathfluxes-array sums to the requested fraction of 
        the total A->B flux.
    
    References
    ----------
    .. [1] P. Metzner, C. Schuette and E. Vanden-Eijnden.
        Transition Path Theory for Markov Jump Processes. 
        Multiscale Model Simul 7: 1192-1219 (2009)
    
    """
    # initialize decomposition object
    Fdense = F
    if issparse(F):
        # Fall back to the dense implementation (assumes `import warnings`).
        warnings.warn('Sparse pathway decomposition is not implemented. '
                      'Using dense pathway implementation. Sorry, but this '
                      'might lead to poor performance or memory overflow.',
                      RuntimeWarning)
        Fdense = F.toarray()
    return dense.tpt.pathways(Fdense, A, B, qplus,
                              fraction=fraction, totalflux=totalflux)
Example #19
def _write_instance(feats, class_label, f):

    if issparse(feats):
        feats = feats.toarray().reshape(-1,)

    try:
        feat_list = [str(i).replace(" ", "_") for i in feats]
    except ValueError:
        print("Please enter correct feature values")
    else:
        try:
            class_label = str(class_label)
        except ValueError:
            print("Please enter correct class label")
        else:
            f.write(",".join(feat_list)+","+class_label+"\n")
Example #20
    def save(self, filename):
        # Store counts as a sparse matrix so they don't take up tonnes of space
        if issparse(self.counts):
            sparse_counts = self.counts
        else:
            sparse_counts = csr_matrix(self.counts)
        with open("%s-sparse-counts" % filename, "w") as f:
            pickle.dump(sparse_counts, f, -1)

        numpy.save("%s-init.npy" % filename, self.initial_counts)
        with open("%s.params" % filename, 'w') as params_file:
            pickle.dump(
                {
                    "laplace_smoothing": self.laplace_smoothing,
                    "backoff_threshold": self.backoff_threshold,
                }, params_file)
Example #21
def is_multilabel(y):
    """ Check if ``y`` is in a multilabel format.

    Parameters
    ----------
    y : numpy array of shape [n_samples]
        Target values.

    Returns
    -------
    out : bool
        Return ``True`` if ``y`` is in a multilabel format, else ``False``.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.utils.multiclass import is_multilabel
    >>> is_multilabel([0, 1, 0, 1])
    False
    >>> is_multilabel([[1], [0, 2], []])
    False
    >>> is_multilabel(np.array([[1, 0], [0, 0]]))
    True
    >>> is_multilabel(np.array([[1], [0], [0]]))
    False
    >>> is_multilabel(np.array([[1, 0, 0]]))
    True
    """
    if hasattr(y, '__array__') or isinstance(y, Sequence):
        y = np.asarray(y)
    if not (hasattr(y, "shape") and y.ndim == 2 and y.shape[1] > 1):
        return False

    if issparse(y):
        if isinstance(y, (dok_matrix, lil_matrix)):
            y = y.tocsr()
        return (len(y.data) == 0 or np.unique(y.data).size == 1 and
                (y.dtype.kind in 'biu' or  # bool, int, uint
                 _is_integral_float(np.unique(y.data))))
    else:
        labels = np.unique(y)

        return len(labels) < 3 and (y.dtype.kind in 'biu' or  # bool, int, uint
                                    _is_integral_float(labels))
Example #22
def cal_skewness(dist_mat, k, metric):
    """
    This code is based on 'hub-toolbox'
    args:
      - dist_mat (ndarray) : Distance (similarity) matrix (n_query, n_target)
      - k (int) : Neighborhood size for `k`-occurrence
      - metric ({'distance' or 'similarity'}) : whether dist_mat holds distances or similarities
    """

    if issparse(dist_mat):
        raise NotImplementedError()

    if metric == 'distance':
        self_val = np.inf
        sort_order = 1
    elif metric == 'similarity':
        self_val = -np.inf
        sort_order = -1
    else:
        raise ValueError('Invalid metric: {}'.format(metric))

    dist_mat = dist_mat.copy()
    n_query, n_target = dist_mat.shape
    kbest_idxs = np.zeros((k, n_query), dtype=np.float32)

    # np.fill_diagonal(dist_mat, self_val)
    dist_mat[~np.isfinite(dist_mat)] = self_val

    for i in range(n_query):
        dists = dist_mat[i, :]
        # dists[i] = self_val
        dists[~np.isfinite(dists)] = self_val

        # randomize equal values for avoiding high hubness (see original code)
        rand_idxs = np.random.permutation(n_target)
        dists2 = dists[rand_idxs]
        rank_dists2 = np.argsort(dists2, axis=0)[::sort_order]
        kbest_idxs[:, i] = rand_idxs[rank_dists2[0:k]]

    n_k = np.bincount(kbest_idxs.astype(int).ravel())
    skewness = stats.skew(n_k)

    return skewness
Example #23
def local_scaling(D:np.ndarray, k:int=7, metric:str='distance',
                  test_set_ind:np.ndarray=None):
    """Transform a distance matrix with Local Scaling.
    
    Transforms the given distance matrix into a new one using local scaling [1]_
    with the given `k`-th nearest neighbor. There are two types of local
    scaling methods implemented. The original one and NICDM, both reduce
    hubness in distance spaces, similarly to Mutual Proximity.
    
    Parameters
    ----------
    D : ndarray or csr_matrix
        The ``n x n`` symmetric distance (similarity) matrix.
    
    k : int, optional (default: 7)
        Neighborhood radius for local scaling.
    
    metric : {'distance', 'similarity'}, optional (default: 'distance')
        Define, whether matrix `D` is a distance or similarity matrix.
        
        NOTE: self similarities in sparse `D_ls` are set to ``np.inf``
        
    test_set_ind : ndarray, optional (default: None)
        Define data points to be held out as part of a test set. Can be:
        
        - None : Rescale all distances
        - ndarray : Hold out points indexed in this array as test set. 
        
    Returns
    -------
    D_ls : ndarray
        Secondary distance LocalScaling matrix.
    
    References
    ----------
    .. [1] Schnitzer, D., Flexer, A., Schedl, M., & Widmer, G. (2012). 
           Local and global scaling reduce hubs in space. The Journal of Machine 
           Learning Research, 13(1), 2871–2902.
    """
    log = Logging.ConsoleLogging()
    # Checking input
    IO._check_distance_matrix_shape(D)
    IO._check_valid_metric_parameter(metric)
    if metric == 'similarity':
        sort_order = -1
        exclude = -np.inf
        self_tmp_value = np.inf
        self_value = 1.
        log.warning("Similarity matrix support for LS is experimental.")
    else: # metric == 'distance':
        sort_order = 1
        exclude = np.inf
        self_value = 0
        self_tmp_value = self_value
        if issparse(D):
            log.error("Sparse distance matrices are not supported.")
            raise NotImplementedError(
                "Sparse distance matrices are not supported.") 
            
    D = np.copy(D)
    n = D.shape[0]
    if test_set_ind is None:
        train_set_ind = slice(0, n) #take all        
    else:
        train_set_ind = np.setdiff1d(np.arange(n), test_set_ind)
    
    r = np.zeros(n)
    for i in range(n):
        if issparse(D):
            di = D[i, train_set_ind].toarray()
        else:
            di = D[i, train_set_ind]
        di[i] = exclude
        nn = np.argsort(di)[::sort_order]
        r[i] = di[nn[k-1]] #largest similarities or smallest distances
    
    if issparse(D):
        D_ls = lil_matrix(D.shape)
    else:
        D_ls = np.zeros_like(D)
        
    for i in range(n):
        # vectorized inner loop: calc only triu part
        tmp = np.empty(n-i)
        tmp[0] = self_tmp_value
        if metric == 'similarity':
            tmp[1:] = np.exp(-1 * D[i, i+1:]**2 / (r[i] * r[i+1:]))
        else:
            tmp[1:] = 1 - np.exp(-1 * D[i, i+1:]**2 / (r[i] * r[i+1:]))
        D_ls[i, i:] = tmp
    # copy triu to tril -> symmetric matrix (diag=zeros)
    # NOTE: does not affect self values, since inf+inf=inf and 0+0=0
    D_ls += D_ls.T
    
    if issparse(D):
        return D_ls.tocsr()
    else:
        np.fill_diagonal(D_ls, self_value)
        return D_ls
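A hypothetical usage sketch on a random Euclidean distance matrix (assumes the hub-toolbox imports used above are available):

import numpy as np
from scipy.spatial.distance import pdist, squareform

X = np.random.rand(50, 10)                # 50 points in 10 dimensions
D = squareform(pdist(X, 'euclidean'))     # 50 x 50 symmetric distance matrix
D_ls = local_scaling(D, k=7, metric='distance')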
Example #24
def score(D:np.ndarray, target:np.ndarray, k=5,
          metric:str='distance', test_set_ind:np.ndarray=None, verbose:int=0):
    """Perform `k`-nearest neighbor classification.
    
    Use the ``n x n`` symmetric distance matrix `D` and target class 
    labels `target` to perform a `k`-NN experiment (leave-one-out 
    cross-validation or evaluation of test set; see parameter `test_set_ind`).
    Ties are broken by the nearest neighbor.
    
    Parameters
    ----------
    D : ndarray
        The ``n x n`` symmetric distance (similarity) matrix.
    
    target : ndarray (of dtype=int)
        The ``n x 1`` target class labels (ground truth).
    
    k : int or array_like (of dtype=int), optional (default: 5)
        Neighborhood size for `k`-NN classification.
        For each value in `k`, one `k`-NN experiment is performed.
        
        HINT: Providing more than one value for `k` is a cheap means to perform 
        multiple `k`-NN experiments at once. Try e.g. ``k=[1, 5, 20]``.
    
    metric : {'distance', 'similarity'}, optional (default: 'distance')
        Define, whether matrix `D` is a distance or similarity matrix
    
    test_set_ind : ndarray, optional (default: None)
        Define data points to be held out as part of a test set. Can be:
        
        - None : Perform a LOO-CV experiment
        - ndarray : Hold out points indexed in this array as test set. Fit 
          model to remaining data. Evaluate model on test set.
    
    verbose : int, optional (default: 0)
        Increasing level of output (progress report).
    
    Returns
    -------
    acc : ndarray (shape=(n_k x 1), dtype=float)
        Classification accuracy (`n_k`... number of items in parameter `k`)
        
        HINT: Referring to the above example...
        ... ``acc[0]`` gives the accuracy of the ``k=1`` experiment.
    corr : ndarray (shape=(n_k x n), dtype=int)
        Raw vectors of correctly classified items
        
        HINT: ... ``corr[1, :]`` gives these items for the ``k=5`` experiment.
    cmat : ndarray (shape=(n_k x n_t x n_t), dtype=int) 
        Confusion matrix (``n_t`` number of unique items in parameter target)
        
        HINT: ... ``cmat[2, :, :]`` gives the confusion matrix of 
        the ``k=20`` experiment.
    """
    
    # Check input sanity
    log = Logging.ConsoleLogging()
    IO._check_distance_matrix_shape(D)
    IO._check_distance_matrix_shape_fits_labels(D, target)
    IO._check_valid_metric_parameter(metric)
    if metric == 'distance':
        d_self = np.inf
        sort_order = 1
    if metric == 'similarity':
        d_self = -np.inf
        sort_order = -1
    
    # Copy, because data is changed
    D = D.copy()
    target = target.astype(int)
    
    if verbose:
        log.message("Start k-NN experiment.")
    # Handle LOO-CV vs. test set mode
    if test_set_ind is None:
        n = D.shape[0]
        test_set_ind = range(n)    # dummy 
        train_set_ind = n   # dummy
    else:  
        # number of points to be classified
        n = test_set_ind.size
        # Indices of training examples
        train_set_ind = np.setdiff1d(np.arange(n), test_set_ind)
    # Number of k-NN parameters
    try:
        k_length = k.size
    except AttributeError as e:
        if isinstance(k, int):
            k = np.array([k])
            k_length = k.size
        elif isinstance(k, list):
            k = np.array(k)
            k_length = k.size
        else:
            raise e
        
    acc = np.zeros((k_length, 1))
    corr = np.zeros((k_length, D.shape[0]))
        
    cl = np.sort(np.unique(target))
    cmat = np.zeros((k_length, len(cl), len(cl)))
    
    classes = target.copy()
    for idx, cur_class in enumerate(cl):
        # change labels to 0, 1, ..., len(cl)-1
        classes[target == cur_class] = idx
    
    cl = range(len(cl))
    
    # Classify each point in test set
    for i in test_set_ind:
        seed_class = classes[i]
        
        if issparse(D):
            row = D.getrow(i).toarray().ravel()
        else:
            row = D[i, :]
        row[i] = d_self
        
        # Sort points in training set according to distance
        # Randomize, in case there are several points of same distance
        # (this is especially relevant for SNN rescaling)
        rp = train_set_ind
        rp = np.random.permutation(rp)
        d2 = row[rp]
        d2idx = np.argsort(d2, axis=0)[::sort_order]
        idx = rp[d2idx]      
        
        # More than one k is useful for cheap multiple k-NN experiments at once
        for j in range(k_length):
            nn_class = classes[idx[0:k[j]]]
            cs = np.bincount(nn_class.astype(int))
            max_cs = np.where(cs == np.max(cs))[0]
            
            # "tie": use nearest neighbor
            if len(max_cs) > 1:
                if seed_class == nn_class[0]:
                    acc[j] += 1/n 
                    corr[j, i] = 1
                cmat[j, seed_class, nn_class[0]] += 1       
            # majority vote
            else:
                if cl[max_cs[0]] == seed_class:
                    acc[j] += 1/n
                    corr[j, i] = 1
                cmat[j, seed_class, cl[max_cs[0]]] += 1
                       
    if verbose:
        log.message("Finished k-NN experiment.")
        
    return acc, corr, cmat
Example #25
def flux_matrix(T, pi, qminus, qplus, netflux=True):
    r"""Compute the TPT flux network for the reaction A-->B.
    
    Parameters
    ----------
    T : (M, M) ndarray
        transition matrix
    pi : (M,) ndarray
        Stationary distribution corresponding to T
    qminus : (M,) ndarray
        Backward committor
    qplus : (M,) ndarray
        Forward committor
    netflux : boolean
        True: net flux matrix will be computed  
        False: gross flux matrix will be computed
    
    Returns
    -------
    flux : (M, M) ndarray
        Matrix of flux values between pairs of states.
    
    Notes
    -----
    Computation of the flux network relies on transition path theory
    (TPT) [1]. Here we use discrete transition path theory [2] in
    the transition matrix formulation [3]. 
    
    See also
    --------
    committor.forward_committor, committor.backward_committor
    
    Notes
    -----
    Computation of the flux network relies on transition path theory
    (TPT). The central objects used in transition path theory are the
    forward and backward committor functions.

    The TPT (gross) flux is defined as 
    
    .. math:: f_{ij}=\left \{ \begin{array}{rl}
                          \pi_i q_i^{(-)} p_{ij} q_j^{(+)} & i \neq j \\
                          0                                & i=j\
                          \end{array} \right .
    
    The TPT net flux is then defined as 
    
    .. math:: f_{ij}=\max\{f_{ij} - f_{ji}, 0\} \:\:\:\forall i,j.
        
    References
    ----------
    .. [1] W. E and E. Vanden-Eijnden.
        Towards a theory of transition paths. 
        J. Stat. Phys. 123: 503-523 (2006)
    .. [2] P. Metzner, C. Schuette and E. Vanden-Eijnden.
        Transition Path Theory for Markov Jump Processes. 
        Multiscale Model Simul 7: 1192-1219 (2009)
    .. [3] F. Noe, Ch. Schuette, E. Vanden-Eijnden, L. Reich and
        T. Weikl: Constructing the Full Ensemble of Folding Pathways
        from Short Off-Equilibrium Simulations.
        Proc. Natl. Acad. Sci. USA, 106, 19011-19016 (2009)
        
    """
    if issparse(T):
        return sparse.tpt.flux_matrix(T, pi, qminus, qplus, netflux=netflux)
    elif isdense(T):
        return dense.tpt.flux_matrix(T, pi, qminus, qplus, netflux=netflux)
    else:
        raise _type_not_supported  
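The gross-flux definition from the Notes fits in a few lines of NumPy; a minimal dense sketch (the helper name is made up, not the library routine):

import numpy as np

def gross_flux_dense(T, pi, qminus, qplus):
    # f_ij = pi_i * q_i^(-) * p_ij * q_j^(+) for i != j, and 0 on the diagonal
    F = pi[:, None] * qminus[:, None] * T * qplus[None, :]
    np.fill_diagonal(F, 0.0)
    return F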
Example #26
def _hubness_no_multiprocessing(D:np.ndarray, k:int=5, metric='distance',
                                verbose:int=0, random_state=None,
                                shuffle_equal:bool=True):
    """ Hubness calculations without multiprocessing overhead. """
    log = ConsoleLogging()
    io.check_is_nD_array(arr=D, n=2, arr_type='Distance')
    io.check_valid_metric_parameter(metric)
    n, m = D.shape
    if k >= m:
        k_old = k
        k = m - 1
        log.warning("Reducing k from {} to {}, so that it is less than "
                    "the total number of neighbors.".format(k_old, k))
    if metric == 'distance':
        d_self = np.inf
        sort_order = 1
        kth = np.arange(k)
    if metric == 'similarity':
        d_self = -np.inf
        sort_order = -1
        kth = np.arange(n - k, n)

    if verbose:
        log.message("Hubness calculation (skewness of {}-occurence)".format(k))
    D = D.copy()
    D_k = np.zeros((n, k), dtype=np.float64)
    rnd = np.random.RandomState(random_state)

    if issparse(D):
        pass # correct self-distance must be ensured upstream for sparse
    else:
        if n == m:
            # Set self dist to inf
            np.fill_diagonal(D, d_self)
        else:
            pass # a partial distances matrix should not contain self distances
        # make non-finite (NaN, Inf) appear at the end of the sorted list
        D[~np.isfinite(D)] = d_self

    for i in range(n):
        if verbose and ((i+1)%10000==0 or i+1==n):
            log.message("NN: {} of {}.".format(i+1, n), flush=True)
        if issparse(D):
            d = D[i, :].toarray().ravel() # dense copy of one row
        else: # normal ndarray
            d = D[i, :]
        if n == m:
            d[i] = d_self
        else: # this does not hold for general dissimilarities
            if metric == 'distance':
                d[d==0] = d_self
        d[~np.isfinite(d)] = d_self
        if shuffle_equal:
            # Randomize equal values in the distance matrix rows to avoid the
            # problem case if all numbers to sort are the same, which would
            # yield high hubness, even if there is none.
            rp = rnd.permutation(m)
            d2 = d[rp]
            d2idx = np.argpartition(d2, kth=kth)
            D_k[i, :] = rp[d2idx[kth]][::sort_order]
        else:
            d_idx = np.argpartition(d, kth=kth)
            D_k[i, :] = d_idx[kth][::sort_order]

    # N-occurence
    N_k = np.bincount(D_k.astype(int).ravel(), minlength=m)
    # Hubness
    S_k = stats.skew(N_k)

    # return k-hubness, k-nearest neighbors, k-occurence
    if verbose:
        log.message("Hubness calculation done.", flush=True)
    return S_k, D_k, N_k
Example #27
def hubness(D:np.ndarray, k:int=5, metric='distance', verbose:int=0):
    """Compute hubness of a distance matrix.
    
    Hubness [1]_ is the skewness of the `k`-occurrence histogram (reverse 
    nearest neighbor count, i.e. how often does a point occur in the 
    `k`-nearest neighbor lists of other points).
    
    Parameters
    ----------
    D : ndarray
        The ``n x n`` symmetric distance (similarity) matrix.
    
    k : int, optional (default: 5)
        Neighborhood size for `k`-occurrence.
    
    metric : {'distance', 'similarity'}, optional (default: 'distance')
        Define, whether matrix `D` is a distance or similarity matrix
    
    verbose : int, optional (default: 0)
        Increasing level of output (progress report).
        
    Returns
    -------
    S_k : float
        Hubness (skewness of `k`-occurrence distribution)
    D_k : ndarray
        `k`-nearest neighbor lists
    N_k : ndarray
        `k`-occurrence list
    
    References
    ----------
    .. [1] Radovanović, M., Nanopoulos, A., & Ivanović, M. (2010). 
           Hubs in Space : Popular Nearest Neighbors in High-Dimensional Data. 
           Journal of Machine Learning Research, 11, 2487–2531. Retrieved from 
           http://jmlr.csail.mit.edu/papers/volume11/radovanovic10a/
           radovanovic10a.pdf
    """
    log = Logging.ConsoleLogging()
    IO._check_distance_matrix_shape(D)
    IO._check_valid_metric_parameter(metric)
    if metric == 'distance':
        d_self = np.inf
        sort_order = 1
    if metric == 'similarity':
        d_self = -np.inf
        sort_order = -1
        
    if verbose:
        log.message("Hubness calculation (skewness of {}-occurence)".format(k))
    D = D.copy()           
    D_k = np.zeros((k, D.shape[1]), dtype=np.float32)
    n = D.shape[0]
    
    if issparse(D): 
        pass # correct self-distance must be ensured upstream for sparse
    else:
        # Set self dist to inf
        np.fill_diagonal(D, d_self)
        # make non-finite (NaN, Inf) appear at the end of the sorted list
        D[~np.isfinite(D)] = d_self
    
    for i in range(n):
        if verbose and ((i+1)%10000==0 or i+1==n):
            log.message("NN: {} of {}.".format(i+1, n), flush=True)
        if issparse(D):
            d = D[i, :].toarray().ravel() # dense copy of one row
        else: # normal ndarray
            d = D[i, :]
        d[i] = d_self
        d[~np.isfinite(d)] = d_self
        # Randomize equal values in the distance matrix rows to avoid the 
        # problem case if all numbers to sort are the same, which would yield 
        # high hubness, even if there is none.
        rp = np.random.permutation(n)
        d2 = d[rp]
        d2idx = np.argsort(d2, axis=0)[::sort_order]
        D_k[:, i] = rp[d2idx[0:k]]      
               
    # N-occurence
    N_k = np.bincount(D_k.astype(int).ravel(), minlength=n)    
    # Hubness
    S_k = stats.skew(N_k)
     
    # return k-hubness, k-nearest neighbors, k-occurence
    if verbose:
        log.message("Hubness calculation done.", flush=True)
    return S_k, D_k, N_k    
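A hypothetical usage sketch (random data; high-dimensional points typically give a clearly positive skewness):

import numpy as np
from scipy.spatial.distance import pdist, squareform

X = np.random.rand(200, 50)               # 200 points in 50 dimensions
D = squareform(pdist(X, 'euclidean'))     # 200 x 200 distance matrix
S_k, D_k, N_k = hubness(D, k=5)
print("Skewness of the 5-occurrence distribution:", S_k)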
Example #28
def local_scaling_sample(D:np.ndarray, k:int=7, metric:str='distance',
                         train_ind:np.ndarray=None, test_ind:np.ndarray=None):
    """Transform a distance matrix with Local Scaling.

    --- DRAFT version ---

    Transforms the given distance matrix into a new one using local scaling [1]_
    with the given `k`-th nearest neighbor. There are two types of local
    scaling methods implemented. The original one and NICDM, both reduce
    hubness in distance spaces, similarly to Mutual Proximity.

    Parameters
    ----------
    D : ndarray or csr_matrix
        The ``n x n`` symmetric distance (similarity) matrix.

    k : int, optional (default: 7)
        Neighborhood radius for local scaling.

    metric : {'distance', 'similarity'}, optional (default: 'distance')
        Define, whether matrix `D` is a distance or similarity matrix.

        NOTE: self similarities in sparse `D_ls` are set to ``np.inf``

    train_ind : ndarray, optional
        If given, use only these data points as neighbors for rescaling.

    test_ind : ndarray, optional (default: None)
        Define data points to be held out as part of a test set. Can be:

        - None : Rescale all distances
        - ndarray : Hold out points indexed in this array as test set.

    Returns
    -------
    D_ls : ndarray
        Secondary distance LocalScaling matrix.

    References
    ----------
    .. [1] Schnitzer, D., Flexer, A., Schedl, M., & Widmer, G. (2012).
           Local and global scaling reduce hubs in space. The Journal of Machine
           Learning Research, 13(1), 2871–2902.
    """
    log = ConsoleLogging()
    # Checking input
    io.check_sample_shape_fits(D, train_ind)
    io.check_valid_metric_parameter(metric)
    sparse = issparse(D)
    n = D.shape[0]
    if metric == 'similarity':
        if train_ind is not None:
            raise NotImplementedError
        kth = n - k
        exclude = -np.inf
        self_value = 1.
        log.warning("Similarity matrix support for LS is experimental.")
    else: # metric == 'distance':
        kth = k - 1
        exclude = np.inf
        self_value = 0
        if sparse:
            log.error("Sparse distance matrices are not supported.")
            raise NotImplementedError(
                "Sparse distance matrices are not supported.")

    D = np.copy(D)
    if test_ind is None:
        train_set_ind = slice(0, n) #take all
        n_ind = range(n)
    else:
        train_set_ind = np.setdiff1d(np.arange(n), test_ind)
        n_ind = test_ind
    # Exclude self distances
    for j, sample in enumerate(train_ind):
        D[sample, j] = exclude
    r = np.zeros(n)
    for i in range(n):
        if train_ind is None:
            if sparse:
                di = D[i, train_set_ind].toarray()
            else:
                di = D[i, train_set_ind]
        else:
            di = D[i, :] # all columns are training in this case
        r[i] = np.partition(di, kth=kth)[kth]

    if sparse:
        D_ls = lil_matrix(D.shape)
        # Number of nonzero cells per row
        nnz = D.getnnz(axis=1)
    else:
        D_ls = np.zeros_like(D)

    if metric == 'similarity':
        for i in n_ind:
            if sparse and nnz[i] <= k: # Don't rescale if there are too few 
                D_ls[i, :] = D[i, :]   # neighbors in the current row
            else:
                D_ls[i, :] = np.exp(-1 * D[i, :]**2 / (r[i] * r[train_ind]))
    else:
        for i in n_ind:
            D_ls[i, :] = 1 - np.exp(-1 * D[i, :]**2 / (r[i] * r[train_ind]))

    if test_ind is None:
        if sparse:
            return D_ls.tocsr()
        else:
            np.fill_diagonal(D_ls, self_value)
            return D_ls
    else:
        # Ensure correct self distances
        for j, sample in enumerate(train_ind):
            D_ls[sample, j] = self_value
        return D_ls[test_ind]
Example #29
def score(D:np.ndarray, target:np.ndarray, k=5,
          metric:str='distance', test_set_ind:np.ndarray=None, verbose:int=0,
          sample_idx=None, filter_self=True):
    """Perform `k`-nearest neighbor classification.

    Use the ``n x n`` symmetric distance matrix `D` and target class
    labels `target` to perform a `k`-NN experiment (leave-one-out
    cross-validation or evaluation of test set; see parameter `test_set_ind`).
    Ties are broken by the nearest neighbor.

    Parameters
    ----------
    D : ndarray
        The ``n x n`` symmetric distance (similarity) matrix.

    target : ndarray (of dtype=int)
        The ``n x 1`` target class labels (ground truth).

    k : int or array_like (of dtype=int), optional (default: 5)
        Neighborhood size for `k`-NN classification.
        For each value in `k`, one `k`-NN experiment is performed.

        HINT: Providing more than one value for `k` is a cheap means to perform
        multiple `k`-NN experiments at once. Try e.g. ``k=[1, 5, 20]``.

    metric : {'distance', 'similarity'}, optional (default: 'distance')
        Define, whether matrix `D` is a distance or similarity matrix

    test_set_ind : ndarray, optional (default: None)
        Define data points to be held out as part of a test set. Can be:

        - None : Perform a LOO-CV experiment
        - ndarray : Hold out points indexed in this array as test set. Fit
          model to remaining data. Evaluate model on test set.

    verbose : int, optional (default: 0)
        Increasing level of output (progress report).

    sample_idx : ...
        TODO add description

    filter_self : bool, optional, default: True
        Remove self similarities from sparse ``D``.
        This assumes that the highest similarity per row is the self
        similarity.
        
        NOTE: Quadratic dense matrices are always filtered for self
        distances/similarities, even if `filter_self` is set to `False`.
        
    Returns
    -------
    acc : ndarray (shape=(n_k x 1), dtype=float)
        Classification accuracy (`n_k`... number of items in parameter `k`)

        HINT: Referring to the above example...
        ... ``acc[0]`` gives the accuracy of the ``k=1`` experiment.
    corr : ndarray (shape=(n_k x n), dtype=int)
        Raw vectors of correctly classified items

        HINT: ... ``corr[1, :]`` gives these items for the ``k=5`` experiment.
    cmat : ndarray (shape=(n_k x n_t x n_t), dtype=int) 
        Confusion matrix (``n_t`` number of unique items in parameter target)

        HINT: ... ``cmat[2, :, :]`` gives the confusion matrix of
        the ``k=20`` experiment.
    """

    # Check input sanity
    log = ConsoleLogging()
    if sample_idx is None:
        io.check_distance_matrix_shape(D)
    else:
        io.check_sample_shape_fits(D, sample_idx)
    io.check_distance_matrix_shape_fits_labels(D, target)
    io.check_valid_metric_parameter(metric)
    if metric == 'distance':
        d_self = np.inf
        sort_order = 1
    if metric == 'similarity':
        d_self = -np.inf
        sort_order = -1

    # Copy, because data is changed
    D = D.copy()
    target = target.astype(int)
    D_is_sparse = issparse(D)

    if verbose:
        log.message("Start k-NN experiment.")
    # Handle LOO-CV vs. test set mode
    if test_set_ind is None:
        n = D.shape[0]
        test_set_ind = range(n)    # dummy 
        train_set_ind = n   # dummy
    else:
        # number of points to be classified
        n = test_set_ind.size
        # Indices of training examples
        train_set_ind = np.setdiff1d(np.arange(n), test_set_ind)
        if sample_idx is not None:
            raise NotImplementedError("Sample k-NN does not support train/"
                                      "test splits at the moment.")
    # Number of k-NN parameters
    try:
        k_length = k.size
    except AttributeError as e:
        if isinstance(k, int):
            k = np.array([k])
            k_length = k.size
        elif isinstance(k, list):
            k = np.array(k)
            k_length = k.size
        else:
            raise e

    acc = np.zeros((k_length, 1))
    corr = np.zeros((k_length, D.shape[0]))

    cl = np.sort(np.unique(target))
    if D_is_sparse:
        # Add a label for unknown class (object w/o nonzero sim to any others)
        cl = np.append(cl, cl.max()+1)
        n_classes = len(cl) + 1
    else:
        n_classes = len(cl)
    cmat = np.zeros((k_length, n_classes, n_classes))

    classes = target.copy()
    for idx, cur_class in enumerate(cl):
        # change labels to 0, 1, ..., len(cl)-1
        classes[target == cur_class] = idx
    if sample_idx is not None:
        sample_classes = classes[sample_idx]
        j = np.ones(n, int)
        j *= (n+1) # illegal indices will throw index out of bounds error
        j[sample_idx] = np.arange(len(sample_idx))
        for j, sample in enumerate(sample_idx):
            D[sample, j] = d_self
    cl = range(len(cl))

    rnd_classif = np.zeros(k_length)
    # Classify each point in test set
    for i in test_set_ind:
        if verbose and ((i+1)%1000==0 or i+1==n):
            log.message("Prediction: {} of {}.".format(i+1, n), flush=True)

        seed_class = classes[i]

        if D_is_sparse:
            row = D.getrow(i)
        else:
            row = D[i, :]
            if sample_idx is None:
                row[i] = d_self

        # Sort points in training set according to distance
        # Randomize, in case there are several points of same distance
        # (this is especially relevant for SNN rescaling)
        if sample_idx is None:
            rp = train_set_ind
        else:
            rp = np.arange(len(sample_idx))
        if D_is_sparse:
            nnz = row.nnz
            rp = np.random.permutation(nnz)
            d2 = row.data[rp]
            # Partition for each k value
            kth = nnz - k - 1
            # sort the two highest similarities to end
            kth = np.append(kth, [nnz-2, nnz-1])
            # Clip negative indices (nnz < k)
            np.clip(kth, a_min=0, a_max=nnz-1, out=kth)
            # Remove duplicate k values and sort
            kth = np.unique(kth)
            d2idx = np.argpartition(d2, kth=kth)
            d2idx = d2idx[~np.isnan(d2[d2idx])][::-1]
            idx = row.nonzero()[1][rp[d2idx]]
            idx = idx[1:] # rem self sim
        else:
            rp = np.random.permutation(rp)
            d2 = row[rp]
            d2idx = np.argsort(d2, axis=0)[::sort_order]
            d2idx = d2idx[~np.isnan(d2[d2idx])] # filter NaN values
            idx = rp[d2idx]

        # More than one k is useful for cheap multiple k-NN experiments at once
        for j in range(k_length):
            # Make sure no inf/-inf/nan values are used for classification
            if D_is_sparse:
                #print(row[0, idx[0:k[j]]].toarray())
                finite_val = np.isfinite(row[0, idx[0:k[j]]].toarray().ravel())
                #print(finite_val)
            else:
                finite_val = np.isfinite(row[idx[0:k[j]]])
            # However, if no values are finite, classify randomly
            if finite_val.sum() == 0:
                idx = np.random.permutation(idx)
                finite_val = np.ones_like(finite_val)
                rnd_classif[j] += 1
            if sample_idx is None:
                nn_class = classes[idx[0:k[j]]][finite_val]
            else:
                #finite_val = np.isfinite(sample_row[idx[0:k[j]]])
                nn_class = sample_classes[idx[0:k[j]]][finite_val]
            cs = np.bincount(nn_class.astype(int))
            if cs.size > 0:
                max_cs = np.where(cs == np.max(cs))[0]
            else:
                max_cs = np.array([len(cl) - 1]) # misclassification label

            # "tie": use nearest neighbor
            if len(max_cs) > 1:
                if seed_class == nn_class[0]:
                    acc[j] += 1/n 
                    corr[j, i] = 1
                cmat[j, seed_class, nn_class[0]] += 1
            # majority vote
            else:
                if cl[max_cs[0]] == seed_class:
                    acc[j] += 1/n
                    corr[j, i] = 1
                cmat[j, seed_class, cl[max_cs[0]]] += 1

    if np.any(rnd_classif):
        for x in rnd_classif:
            log.warning(("{} queries were classified randomly, because all "
                        "distances were non-finite numbers.").format(x))
    if verbose:
        log.message("Finished k-NN experiment.")

    return acc, corr, cmat
Beispiel #30
0
def flux_matrix(T, pi, qminus, qplus, netflux=True):
    r"""Compute the TPT flux network for the reaction A-->B.

    Parameters
    ----------
    T : (M, M) ndarray
        transition matrix
    pi : (M,) ndarray
        Stationary distribution corresponding to T
    qminus : (M,) ndarray
        Backward committor
    qplus : (M,) ndarray
        Forward committor
    netflux : boolean
        True: net flux matrix will be computed
        False: gross flux matrix will be computed

    Returns
    -------
    flux : (M, M) ndarray
        Matrix of flux values between pairs of states.

    Notes
    -----
    Computation of the flux network relies on transition path theory
    (TPT) [1]. Here we use discrete transition path theory [2] in
    the transition matrix formulation [3].

    See also
    --------
    committor.forward_committor, committor.backward_committor

    Notes
    -----
    Computation of the flux network relies on transition path theory
    (TPT). The central object used in transition path theory is the
    forward and backward committor functions.

    The TPT (gross) flux is defined as

    .. math:: f_{ij}=\left \{ \begin{array}{rl}
                          \pi_i q_i^{(-)} p_{ij} q_j^{(+)} & i \neq j \\
                          0                                & i=j
                          \end{array} \right.

    The TPT net flux is then defined as

    .. math:: f_{ij}=\max\{f_{ij} - f_{ji}, 0\} \:\:\:\forall i,j.

    References
    ----------
    .. [1] W. E and E. Vanden-Eijnden.
        Towards a theory of transition paths.
        J. Stat. Phys. 123: 503-523 (2006)
    .. [2] P. Metzner, C. Schuette and E. Vanden-Eijnden.
        Transition Path Theory for Markov Jump Processes.
        Multiscale Model Simul 7: 1192-1219 (2009)
    .. [3] F. Noe, Ch. Schuette, E. Vanden-Eijnden, L. Reich and
        T. Weikl: Constructing the Full Ensemble of Folding Pathways
        from Short Off-Equilibrium Simulations.
        Proc. Natl. Acad. Sci. USA, 106, 19011-19016 (2009)

    """
    if issparse(T):
        return sparse.tpt.flux_matrix(T, pi, qminus, qplus, netflux=netflux)
    elif isdense(T):
        return dense.tpt.flux_matrix(T, pi, qminus, qplus, netflux=netflux)
    else:
        raise _type_not_supported
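
# Illustration only (not part of the example above): a minimal dense NumPy
# sketch of the gross/net flux formula from the docstring. The name
# dense_flux_sketch is made up; it assumes T, pi, qminus, qplus are plain
# NumPy arrays of matching shape.
import numpy as np

def dense_flux_sketch(T, pi, qminus, qplus, netflux=True):
    # gross flux: f_ij = pi_i * q_i^(-) * T_ij * q_j^(+), zero on the diagonal
    flux = pi[:, np.newaxis] * qminus[:, np.newaxis] * T * qplus[np.newaxis, :]
    np.fill_diagonal(flux, 0.0)
    if netflux:
        # net flux: f_ij^+ = max(f_ij - f_ji, 0)
        flux = np.maximum(flux - flux.T, 0.0)
    return flux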
    def __init__(self, counts, mu=None, reversible=False, Tinit=None):
        """
        Sets the count matrix used for sampling. Assumes that the prior 
        (if desired) is included.
       
        Parameters
        ----------
        counts : ndarray (n, n)
            the posterior count matrix
        mu : ndarray (n)
           optional stationary distribution. If given, the sampled transition
           matrix will have this stationary distribution.
        reversible : boolean
           whether to sample a reversible transition matrix.
           
        Tinit : ndarray(n, n)
           optional start point for sampling algorithm.
           
        Example
        -------
        >>> C = np.array([[5, 2], [1,10]]) 
        >>> sampler = ITransitionMatrixSampler(C)
        >>> T = sampler.sample(10**6)
        >>> print(T)
        
        """
        if issparse(counts):
            counts = counts.toarray()
        # the interface in stallone takes counts as doubles
        counts = counts.astype(np.float64)

        try:
            C = ndarray_to_stallone_array(counts)
            jpackage = stallone.mc.sampling
            # convert types to java
            if Tinit is not None:
                Tinit = ndarray_to_stallone_array(Tinit)
            if mu is not None:
                mu = ndarray_to_stallone_array(mu)

            if reversible:
                if mu:  # fixed pi
                    if Tinit:
                        self.sampler = jpackage.TransitionMatrixSamplerRevFixPi(
                            C, Tinit, mu)
                    else:
                        self.sampler = jpackage.TransitionMatrixSamplerRevFixPi(
                            C, mu)
                else:  # sample reversible matrix, with arbitrary pi
                    if Tinit:
                        self.sampler = jpackage.TransitionMatrixSamplerRev(
                            C, Tinit)
                    else:
                        self.sampler = jpackage.TransitionMatrixSamplerRev(C)
            else:  # sample non rev
                if Tinit:
                    self.sampler = jpackage.TransitionMatrixSamplerNonrev(
                        C, Tinit)
                else:
                    self.sampler = jpackage.TransitionMatrixSamplerNonrev(C)

        except JavaException as je:
            log = getLogger()
            log.exception("Error during creation of tmatrix sampling wrapper:"
                          " stack\n%s" % je.stacktrace())
            raise
Beispiel #32
0
    def to_array(self, potential_sparse_array):
        if issparse(potential_sparse_array):
            return potential_sparse_array.toarray()
        else:
            return potential_sparse_array
Beispiel #33
0
    def score(self,
              X: np.ndarray = None,
              y=None,
              has_self_distances: bool = False):
        """ Estimate hubness in a data set.

        Hubness is estimated from the distances between all objects in X and all objects in Y.
        If Y is None, all-against-all distances between the objects in X are used.
        If self.metric == 'precomputed', X must be a distance matrix.

        Parameters
        ----------
        X: ndarray, shape (n_query, n_features) or (n_query, n_indexed)
            Array of query vectors, or distance, if self.metric == 'precomputed'

        y: ignored

        has_self_distances: bool, default = False
            Define, whether a precomputed distance matrix contains self distances,
            which need to be excluded.

        Returns
        -------
        hubness_measure: float or dict
            Return the hubness measure as indicated by `return_value`.
            Additional hubness indices are provided as attributes
            (e.g. :func:`robinhood_index_`).
            if return_value is 'all', a dict of all hubness measures is returned.
        """
        check_is_fitted(self, 'X_train_')
        if X is None:
            X_test = self.X_train_
        else:
            X_test = X
        X_test = check_array(X_test, accept_sparse=True)
        X_train = self.X_train_

        kth = np.arange(self.k)
        start = 0
        end = self.k
        if self.metric == 'precomputed':
            if X is not None:
                raise ValueError(
                    'X must be None when metric == "precomputed".')
            n_test, n_train = X_test.shape
            if has_self_distances:
                kth = np.arange(self.k + 1)
                start = 1
                end = self.k + 1
        else:
            if X is None:
                # Self distances do occur in this case
                kth = np.arange(self.k + 1)
                start = 1
                end = self.k + 1
            n_test, m_test = X_test.shape
            n_train, m_train = X_train.shape
            if m_test != m_train:
                raise ValueError(
                    f'Number of features do not match: X_train.shape={X_train.shape}, '
                    f'X_test.shape={X_test.shape}.')

        if self.metric == 'precomputed':
            if issparse(X_test):
                k_neighbors = self._k_neighbors_precomputed_sparse(X_test)
            else:
                k_neighbors = self._k_neighbors_precomputed(
                    X_test, kth, start, end)
        else:
            if X is None:
                k_neighbors = self._k_neighbors()
            else:
                k_neighbors = self._k_neighbors(X_test=X_test)
        if self.store_k_neighbors:
            self.k_neighbors = k_neighbors

        # Negative indices can occur, when ANN does not find enough neighbors,
        # and must be removed
        mask = k_neighbors < 0
        if np.any(mask):
            k_neighbors = k_neighbors[~mask]
            del mask

        k_occurrence = np.bincount(k_neighbors.astype(int).ravel(),
                                   minlength=n_train)
        if self.store_k_occurrence:
            self.k_occurrence = k_occurrence

        # traditional skewness measure
        self.k_skewness = stats.skew(k_occurrence)

        # new skewness measure (truncated normal distribution)
        self.k_skewness_truncnorm = self._calc_skewness_truncnorm(k_occurrence)

        # Gini index
        limiting = 'space' if k_occurrence.shape[0] > 10_000 else 'time'
        self.gini_index = self._calc_gini_index(k_occurrence, limiting)

        # Robin Hood index
        self.robinhood_index = self._calc_robinhood_index(k_occurrence)

        # Atkinson index
        self.atkinson_index = self._calc_atkinson_index(k_occurrence)

        # anti-hub occurrence
        self.antihubs, self.antihub_occurrence = \
            self._calc_antihub_occurrence(k_occurrence)

        # hub occurrence
        self.hubs, self.hub_occurrence = \
            self._calc_hub_occurrence(k=self.k, k_occurrence=k_occurrence,
                                      n_test=n_test, hub_size=self.hub_size)

        # Largest hub
        self.groupie_ratio = k_occurrence.max() / n_test / self.k

        # Dictionary of all hubness measures
        self.hubness_measures = {
            'k_skewness': self.k_skewness,
            'k_skewness_truncnorm': self.k_skewness_truncnorm,
            'atkinson': self.atkinson_index,
            'gini': self.gini_index,
            'robinhood': self.robinhood_index,
            'antihubs': self.antihubs,
            'antihub_occurrence': self.antihub_occurrence,
            'hubs': self.hubs,
            'hub_occurrence': self.hub_occurrence,
            'groupie_ratio': self.groupie_ratio,
        }
        if hasattr(self, 'k_neighbors'):
            self.hubness_measures['k_neighbors'] = self.k_neighbors
        if hasattr(self, 'k_occurrence'):
            self.hubness_measures['k_occurrence'] = self.k_occurrence

        if self.return_value == 'all':
            return self.hubness_measures
        else:
            return self.hubness_measures[self.return_value]
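
# Illustration only: a hedged sketch of a Robin Hood (Hoover) style index on a
# k-occurrence vector, similar in spirit to what _calc_robinhood_index computes
# above. The helper name is hypothetical; the exact normalization used by the
# class may differ.
import numpy as np

def robinhood_index_sketch(k_occurrence):
    # share of k-occurrences that would have to be redistributed so that
    # every object occurs equally often in k-NN lists (0 = perfectly even)
    k_occurrence = np.asarray(k_occurrence, dtype=float)
    total = k_occurrence.sum()
    if total == 0:
        return 0.0
    return 0.5 * np.abs(k_occurrence - k_occurrence.mean()).sum() / total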
def r_precision(S:np.ndarray, y:np.ndarray, metric:str='distance',
                average:str='weighted', return_y_pred:int=0,
                verbose:int=0, n_jobs:int=1) -> float:
    """ Calculate R-Precision (recall at R-th position).

    Parameters
    ----------
    S : ndarray or CSR matrix
        Distance (similarity) matrix

    y : ndarray
        Target (ground truth) labels

    metric : 'distance' or 'similarity', optional, default: 'distance'
        Define, whether `S` is a distance or similarity matrix.

    average : 'weighted', 'macro' or None, optional, default: 'weighted'
        Ignored. Weighted and macro precisions are returned.

    return_y_pred : int, optional, default: 0
        If > 0, return the labels of the `return_y_pred` nearest neighbors

    verbose : int, optional, default: 0
        Increasing level of output.

    n_jobs : int, optional, default: 1
        Number of parallel processes to use.

    Returns
    -------
    r_precision : dictionary with the following keys:
        macro : float
            Macro R-Precision.

        weighted : float
            Weighted R-Precision.

        per_item : ndarray
            R-Precision for each object.

        relevant_items : ndarray
            Relevant items per class.

        y_true : ndarray
            Target labels (req. for weighting).

        y_pred : ndarray
            Labels of some k-nearest neighbors
    """
    io.check_distance_matrix_shape(S)
    io.check_distance_matrix_shape_fits_labels(S, y)
    io.check_valid_metric_parameter(metric)
    log = ConsoleLogging()
    n, _ = S.shape
    S_is_sparse = issparse(S)
    if metric != 'similarity' or not S_is_sparse:
        raise NotImplementedError("Only sparse similarity matrices so far.")

    # Map labels to 0..n(labels)-1
    le = LabelEncoder()
    # Add int.min for misclassifications
    incorr_orig = np.array([np.nan]).astype(int)
    le.fit(np.append(y, incorr_orig))
    y = le.transform(y)
    incorrect = le.transform(incorr_orig)
    # Number of relevant items, i.e. number of each label
    relevant_items = np.bincount(y) - 1 # one less for self class
    # R-Precision for each item
    r_prec = np.zeros(n, dtype=float)
    
    # Classify each point in test set
    if verbose:
        log.message("Creating shared memory data.")
    n_random_pred = mp.Value(ctypes.c_int)
    n_random_pred.value = 0
    if verbose and log:
        log.message("Spawning processes for prediction.")
    y_pred = np.zeros((n, return_y_pred), dtype=float)
    kwargs = {'y_pred' : return_y_pred,
              'incorrect' : incorrect}
    with mp.Pool(processes=n_jobs, 
                 initializer=_load_shared_csr, 
                 initargs=(S, y, n_random_pred, relevant_items)) as pool:
        for i, r in enumerate(
            pool.imap(
                func=partial(_r_prec_worker, **kwargs),
                iterable=range(n), 
                chunksize=int(1e2))):
            if verbose and ((i+1)%int(1e7 / 10**verbose) == 0 or i == n-1):
                log.message("Classification: {} of {} on {}.".format(
                            i+1, n, mp.current_process().name), flush=True)
            try:
                r_prec[i] = r[0]
                y_pred[i, :] = r[1]
            except (TypeError, IndexError):
                r_prec[i] = r
            if i == n-1:
                pass
    pool.join()

    if verbose and log:
        log.message("Retrieving nearest neighbors.")
    # Work-around for new scikit-learn requirement of 1D arrays for LabelEncoder
    y_pred = np.asarray([le.inverse_transform(col) for col in y_pred.T.astype(int)]).T
    if verbose and log:
        log.message("Finishing.")
    if n_random_pred.value:
        log.warning(("{} queries were classified randomly, because all "
                     "distances were non-finite numbers or there were no other "
                     "objects in the same class.").format(n_random_pred.value))
    return_dict = {'macro' : r_prec.mean(),
                   'weighted' : np.average(r_prec, weights=relevant_items[y]),
                   'per_item' : r_prec,
                   'relevant_items' : relevant_items,
                   'y_true' : y,
                   'y_pred' : y_pred}
    return return_dict
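
# Illustration only: the function above handles sparse similarity matrices and
# multiprocessing; the dense, serial sketch below shows just the core idea of
# R-Precision (recall at the R-th position) for a distance matrix D and integer
# labels y. The helper name is made up.
import numpy as np

def r_precision_sketch(D, y):
    # R-precision of query i: fraction of its R nearest neighbors that share
    # its label, where R is the number of other objects with the same label.
    y = np.asarray(y)
    n = D.shape[0]
    r_prec = np.zeros(n)
    for i in range(n):
        R = int((y == y[i]).sum()) - 1      # relevant items, excluding the query
        if R < 1:
            continue
        order = np.argsort(D[i])            # ascending distances
        order = order[order != i][:R]       # drop self, keep the R nearest
        r_prec[i] = (y[order] == y[i]).sum() / R
    return r_prec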
def hubness(D:np.ndarray, k:int=5, metric='distance', 
            verbose:int=0, n_jobs:int=-1):
    """Compute hubness of a distance matrix.
    
    Hubness [1]_ is the skewness of the `k`-occurrence histogram (reverse 
    nearest neighbor count, i.e. how often does a point occur in the 
    `k`-nearest neighbor lists of other points).
    
    Parameters
    ----------
    D : ndarray
        The ``n x n`` symmetric distance (similarity) matrix.
    
    k : int, optional (default: 5)
        Neighborhood size for `k`-occurrence.
    
    metric : {'distance', 'similarity'}, optional (default: 'distance')
        Define, whether matrix `D` is a distance or similarity matrix
    
    verbose : int, optional (default: 0)
        Increasing level of output (progress report).
        
    n_jobs : int, optional (default: -1)
        Number of parallel processes spawned for hubness calculation.
        Default value (-1): number of available CPUs.
        
    Returns
    -------
    S_k : float
        Hubness (skewness of `k`-occurrence distribution)
    D_k : ndarray
        `k`-nearest neighbor lists
    N_k : ndarray
        `k`-occurrence list
    
    References
    ----------
    .. [1] Radovanović, M., Nanopoulos, A., & Ivanović, M. (2010). 
           Hubs in Space : Popular Nearest Neighbors in High-Dimensional Data. 
           Journal of Machine Learning Research, 11, 2487–2531. Retrieved from 
           http://jmlr.csail.mit.edu/papers/volume11/radovanovic10a/
           radovanovic10a.pdf
    """
    log = Logging.ConsoleLogging()
    IO._check_distance_matrix_shape(D)
    IO._check_valid_metric_parameter(metric)
    if metric == 'distance':
        d_self = np.inf
        sort_order = 1
    if metric == 'similarity':
        d_self = -np.inf
        sort_order = -1
    
    if verbose:
        log.message("Hubness calculation (skewness of {}-occurence)".format(k))
        
    # Initialization
    n = D.shape[0]
    D = D.copy()
    D_k = np.zeros((k, D.shape[1]), dtype=np.float32 )
    
    if issparse(D): 
        pass # correct self-distance must be ensured upstream for sparse
    else:
        # Set self dist to inf
        np.fill_diagonal(D, d_self)
        # make non-finite (NaN, Inf) appear at the end of the sorted list
        D[~np.isfinite(D)] = d_self
                        
    # Parallelization
    if n_jobs == -1: # take all cpus
        NUMBER_OF_PROCESSES = mp.cpu_count()  # @UndefinedVariable
    else:
        NUMBER_OF_PROCESSES = n_jobs
    tasks = []
    
    batches = []
    batch_size = n // NUMBER_OF_PROCESSES
    for i in range(NUMBER_OF_PROCESSES-1):
        batches.append( np.arange(i*batch_size, (i+1)*batch_size) )
    batches.append( np.arange((NUMBER_OF_PROCESSES-1)*batch_size, n) )
    
    for idx, batch in enumerate(batches):
        submatrix = D[batch[0]:batch[-1]+1]
        tasks.append((_partial_hubness, 
                     (k, d_self, log, sort_order, 
                      batch, submatrix, idx, n, verbose)))   
    
    task_queue = mp.Queue()  # @UndefinedVariable
    done_queue = mp.Queue()  # @UndefinedVariable
    
    for task in tasks:
        task_queue.put(task)
        
    for i in range(NUMBER_OF_PROCESSES):  # @UnusedVariable
        mp.Process(target=_worker, args=(task_queue, done_queue)).start()  # @UndefinedVariable
    
    for i in range(len(tasks)):  # @UnusedVariable
        rows, Dk_part = done_queue.get()
        D_k[:, rows[0]:rows[-1]+1] = Dk_part
        
    for i in range(NUMBER_OF_PROCESSES):  # @UnusedVariable
        task_queue.put('STOP')        
               
    # k-occurrence
    N_k = np.bincount(D_k.astype(int).ravel())    
    # Hubness
    S_k = stats.skew(N_k)
     
    if verbose:
        log.message("Hubness calculation done.", flush=True)
        
    # return hubness, k-nearest neighbors, k-occurrence
    return S_k, D_k, N_k
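
# Illustration only: a compact serial sketch of the same computation for a
# dense distance matrix, without the multiprocessing machinery. Names are
# illustrative, not part of the library.
import numpy as np
from scipy import stats

def hubness_sketch(D, k=5):
    # D: dense symmetric distance matrix
    D = D.copy().astype(float)
    np.fill_diagonal(D, np.inf)                 # exclude self distances
    D_k = np.argsort(D, axis=1)[:, :k]          # k nearest neighbors per row
    N_k = np.bincount(D_k.ravel(), minlength=D.shape[0])   # k-occurrence
    S_k = stats.skew(N_k)                       # hubness = skewness of N_k
    return S_k, D_k, N_k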
def predict(D:np.ndarray, target:np.ndarray, k=5,
            metric:str='distance', test_ind:np.ndarray=None, verbose:int=0,
            sample_idx=None, return_cmat=True):
    """Perform `k`-nearest neighbor classification.

    Use the ``n x n`` symmetric distance matrix `D` and target class
    labels `target` to perform a `k`-NN experiment (leave-one-out
    cross-validation or evaluation of test set; see parameter `test_ind`).
    Ties are broken by the nearest neighbor.

    Parameters
    ----------
    D : ndarray
        The ``n x n`` symmetric distance (similarity) matrix.

    target : ndarray (of dtype=int)
        The ``n x 1`` target class labels (ground truth) or
        ``n x c`` in case of ``c`` binarized multilabels

    k : int or array_like (of dtype=int), optional (default: 5)
        Neighborhood size for `k`-NN classification.
        For each value in `k`, one `k`-NN experiment is performed.

        HINT: Providing more than one value for `k` is a cheap means to perform
        multiple `k`-NN experiments at once. Try e.g. ``k=[1, 5, 20]``.

    metric : {'distance', 'similarity'}, optional (default: 'distance')
        Define, whether matrix `D` is a distance or similarity matrix

    test_ind : ndarray, optional (default: None)
        Define data points to be held out as part of a test set. Can be:

        - None : Perform a LOO-CV experiment
        - ndarray : Hold out points indexed in this array as test set. Fit
          model to remaining data. Evaluate model on test set.

    verbose : int, optional (default: 0)
        Increasing level of output (progress report).

    return_cmat : bool, optional, default: True
        If False, only return the predictions `y_pred`.
        Otherwise also return the confusion matrices.

    Returns
    -------
    y_pred : ndarray (shape=(n_k, n, c), dtype=int)
        Predicted class labels (`n_k`... number of items in parameter `k`)
        
        HINT: Referring to the above example... 
        ... ``y_pred[0]`` gives the predictions of the ``k=1`` experiment.

    cmat : ndarray (shape=(n_k x c x n_t x n_t), dtype=int) 
        Confusion matrix (``n_t`` number of unique items in parameter target)

        HINT: ... ``cmat[2, 0, :, :]`` gives the confusion matrix of
        the first class in the ``k=20`` experiment in the following order:
            TN    FP
            FN    TP
    """

    # Check input sanity
    log = ConsoleLogging()
    if sample_idx is None:
        io.check_distance_matrix_shape(D)
    else:
        io.check_sample_shape_fits(D, sample_idx)
    #io._check_distance_matrix_shape_fits_labels(D, target)
    io.check_valid_metric_parameter(metric)
    if metric == 'distance':
        d_self = np.inf
        sort_order = 1
    if metric == 'similarity':
        d_self = -np.inf
        sort_order = -1

    # Copy, because data is changed
    if not issparse(D):
        D = D.copy()
    target = target.astype(int)
    if target.ndim == 1:
        target = target[:, np.newaxis]
    if verbose:
        log.message("Start k-NN experiment.")
    # Handle LOO-CV vs. test set mode
    if test_ind is None:
        n = D.shape[0]
        test_set_ind = range(n)    # dummy
        train_set_ind = n   # dummy
    else:
        # number of points to be classified
        test_set_ind = test_ind
        n = test_set_ind.size
        # Indices of training examples
        train_set_ind = np.setdiff1d(np.arange(n), test_set_ind)
        if sample_idx is not None:
            raise NotImplementedError("Sample k-NN does not support train/"
                                      "test splits at the moment.")
    # Number of k-NN parameters
    try:
        k_length = k.size
    except AttributeError as e:
        if isinstance(k, int):
            k = np.array([k])
            k_length = k.size
        elif isinstance(k, list):
            k = np.array(k)
            k_length = k.size
        else:
            raise e

    cl = np.sort(np.unique(target))
    cmat = np.zeros((k_length, target.shape[1], len(cl), len(cl)), dtype=int)
    y_pred = np.zeros((k_length, *target.shape), dtype=int)

    classes = target.copy()
    for idx, cur_class in enumerate(np.array(cl).ravel()):
        # change labels to 0, 1, ..., len(cl)-1
        classes[target == cur_class] = idx
    if sample_idx is not None:
        sample_classes = classes[sample_idx]
        j = np.ones(n, int)
        j *= (n+1) # illegal indices will throw index out of bounds error
        j[sample_idx] = np.arange(len(sample_idx))
        for j, sample in enumerate(sample_idx):
            D[sample, j] = d_self
    cl = range(len(cl))

    # Classify each point in test set
    for i in test_set_ind:
        if verbose and ((i+1)%1000==0 or i+1==n):
            log.message("Prediction: {} of {}.".format(i+1, n), flush=True)

        if issparse(D):
            row = D.getrow(i)
            #row = D.data
            ind = row.nonzero()[1]
            row = row.toarray().ravel()
        else:
            row = D[i, :]
        if sample_idx is None:
            row[i] = d_self

        # Sort points in training set according to distance
        # Randomize, in case there are several points of same distance
        # (this is especially relevant for SNN rescaling)
        if sample_idx is None:
            rp = train_set_ind
        else:
            if issparse(D):
                rp = ind
            else:
                rp = np.arange(len(sample_idx))
        rp = np.random.permutation(rp)
        d2 = row[rp]
        d2idx = np.argsort(d2, axis=0)[::sort_order]
        d2idx = d2idx[~np.isnan(d2[d2idx])] # filter NaN values
        idx = rp[d2idx]

        # More than one k is useful for cheap multiple k-NN experiments at once
        for j in range(k_length):
            # Make sure no inf/-inf/nan values are used for classification
            finite_val = np.isfinite(row[idx[0:k[j]]])
            # However, if no values are finite, classify randomly
            if finite_val.sum() == 0:
                idx = np.random.permutation(idx)
                finite_val = np.ones_like(finite_val)
                log.warning("Query was classified randomly, because all "
                            "distances were non-finite numbers.")
            for l in range(target.shape[1]):
                l_classes = classes[:, l]
                if sample_idx is None:
                    nn_class = l_classes[idx[0:k[j]]][finite_val]
                else:
                    l_sample_classes = sample_classes[:, l]
                    nn_class = l_sample_classes[idx[0:k[j]]][finite_val]
                cs = np.bincount(nn_class.astype(int))
                max_cs = np.where(cs == np.max(cs))[0]
                seed_class = classes[i, l]
                # "tie": use nearest neighbor
                if len(max_cs) > 1:
                    y_pred[j, i, l] = nn_class[0]
                    cmat[j, l, seed_class, nn_class[0]] += 1
                # majority vote
                else:
                    y_pred[j, i, l] = cl[max_cs[0]]
                    cmat[j, l, seed_class, cl[max_cs[0]]] += 1

    if verbose:
        log.message("Finished k-NN experiment.")

    if return_cmat:
        return y_pred, cmat
    else:
        return y_pred
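
# Illustration only: the voting rule used in the loop above, reduced to a
# single query. Majority vote over the labels of the k nearest neighbors; a
# tie falls back to the single nearest neighbor. The helper name is made up.
import numpy as np

def vote_sketch(nn_labels):
    # nn_labels: non-negative integer labels of the k nearest neighbors,
    # ordered from nearest to farthest
    counts = np.bincount(nn_labels)
    winners = np.flatnonzero(counts == counts.max())
    if len(winners) > 1:        # tie: use the nearest neighbor's label
        return int(nn_labels[0])
    return int(winners[0])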
Beispiel #37
0
def hubness(D:np.ndarray, k:int=5, metric='distance',
            verbose:int=0, n_jobs:int=1,
            random_state=None, shuffle_equal=True):
    """Compute hubness of a distance matrix.

    Hubness [1]_ is the skewness of the `k`-occurrence histogram (reverse
    nearest neighbor count, i.e. how often does a point occur in the
    `k`-nearest neighbor lists of other points).

    Parameters
    ----------
    D : ndarray
        The ``n x n`` symmetric distance (similarity) matrix or
        an ``n x m`` partial distance matrix (e.g. for train/test splits,
        with test objects in rows, train objects in columns)
        
        NOTE: Partial distance matrices MUST NOT contain self distances.

    k : int, optional (default: 5)
        Neighborhood size for `k`-occurrence.

    metric : {'distance', 'similarity'}, optional (default: 'distance')
        Define, whether matrix `D` is a distance or similarity matrix

    verbose : int, optional (default: 0)
        Increasing level of output (progress report).

    n_jobs : int, optional (default: 1)
        Number of parallel processes spawned for hubness calculation.
        Value 1 (default): One process (not using multiprocessing)
        Value (-1): As many processes as number of available CPUs.

    random_state : int, optional
        Seed the RNG for reproducible results.
        
        NOTE: Currently only compatible with `n_jobs`=1

    shuffle_equal : bool, optional
        If true, shuffle neighbors with identical distances to avoid
        artifact hubness.
        NOTE: This is especially useful for secondary distance measures
        with a restricted number of possible values, e.g. SNN or MP empiric.

    Returns
    -------
    S_k : float
        Hubness (skewness of `k`-occurrence distribution)
    D_k : ndarray
        `k`-nearest neighbor lists
    N_k : ndarray
        `k`-occurrence list

    References
    ----------
    .. [1] Radovanović, M., Nanopoulos, A., & Ivanović, M. (2010).
           Hubs in Space : Popular Nearest Neighbors in High-Dimensional Data.
           Journal of Machine Learning Research, 11, 2487–2531. Retrieved from
           http://jmlr.csail.mit.edu/papers/volume11/radovanovic10a/
           radovanovic10a.pdf
    """
    # Don't use multiprocessing environment when using only one job
    if n_jobs == 1:
        return _hubness_no_multiprocessing(D=D,
                                           k=k,
                                           metric=metric,
                                           verbose=verbose,
                                           random_state=random_state,
                                           shuffle_equal=shuffle_equal)
    if random_state is not None:
        raise ValueError("Seeding the RNG is not compatible with using n_jobs > 1.")
    log = ConsoleLogging()
    io.check_is_nD_array(arr=D, n=2, arr_type='Distance')
    io.check_valid_metric_parameter(metric)
    n, m = D.shape
    if k >= m:
        k_old = k
        k = m - 1
        log.warning("Reducing k from {} to {}, so that it is less than "
                    "the total number of neighbors.".format(k_old, k))
    if metric == 'distance':
        d_self = np.inf
        sort_order = 1
        kth = np.arange(k)
    if metric == 'similarity':
        d_self = -np.inf
        sort_order = -1
        kth = np.arange(m - k, m)

    if verbose:
        log.message("Hubness calculation (skewness of {}-occurrence)".format(k))

    # Initialization
    D = D.copy()
    D_k = np.zeros((n, k), dtype=np.float64)

    if issparse(D):
        pass # correct self-distance must be ensured upstream for sparse
    else:
        if n == m:
            # Set self dist to inf
            np.fill_diagonal(D, d_self)
        else:
            pass # Partial distance matrices MUST NOT contain self distances
        # make non-finite (NaN, Inf) appear at the end of the sorted list
        D[~np.isfinite(D)] = d_self

    # Parallelization
    if n_jobs == -1: # take all cpus
        NUMBER_OF_PROCESSES = mp.cpu_count() # @UndefinedVariable
    else:
        NUMBER_OF_PROCESSES = n_jobs
    D_k_ctype = RawArray(ctypes.c_int32, n*k)
    D_k = np.frombuffer(D_k_ctype, dtype=np.int32).reshape((n, k))
    with Pool(processes=NUMBER_OF_PROCESSES,
              initializer=_hubness_load_shared_data,
              initargs=(D, D_k, )) as pool:
        for _ in pool.imap(
            func=partial(_hubness_nearest_neighbors, n=n, m=m, 
                         d_self=d_self, metric=metric, kth=kth, 
                         sort_order=sort_order, log=log, verbose=verbose,
                         shuffle_equal=shuffle_equal),
            #chunksize=int(1e2),
            iterable=range(n)):
            pass # results handled within func

    # k-occurrence
    N_k = np.bincount(D_k.astype(int).ravel(), minlength=m)
    # Hubness
    S_k = stats.skew(N_k)

    if verbose:
        log.message("Hubness calculation done.", flush=True)

    # return hubness, k-nearest neighbors, k-occurrence
    return S_k, D_k, N_k
Beispiel #38
0
def local_scaling(D:np.ndarray, k:int=7, metric:str='distance',
                  test_ind:np.ndarray=None, n_jobs:int=1):
    """Transform a distance matrix with Local Scaling.

    Transforms the given distance matrix into a new one using local scaling [1]_
    with the given `k`-th nearest neighbor. There are two types of local
    scaling methods implemented: the original one and NICDM. Both reduce
    hubness in distance spaces, similarly to Mutual Proximity.

    Parameters
    ----------
    D : ndarray or csr_matrix
        The ``n x n`` symmetric distance (similarity) matrix.

    k : int, optional (default: 7)
        Neighborhood radius for local scaling.

    metric : {'distance', 'similarity'}, optional (default: 'distance')
        Define, whether matrix `D` is a distance or similarity matrix.

        NOTE: self similarities in sparse `D_ls` are set to ``np.inf``

    test_ind : ndarray, optional (default: None)
        Define data points to be held out as part of a test set. Can be:

        - None : Rescale all distances
        - ndarray : Hold out points indexed in this array as test set.

    n_jobs : int, optional, default: 1
        Number of processes for parallel computations.

        - `1`: Don't use multiprocessing.
        - `-1`: Use all CPUs

    Returns
    -------
    D_ls : ndarray
        Secondary distance LocalScaling matrix.

    References
    ----------
    .. [1] Schnitzer, D., Flexer, A., Schedl, M., & Widmer, G. (2012).
           Local and global scaling reduce hubs in space. The Journal of Machine
           Learning Research, 13(1), 2871–2902.
    """
    log = ConsoleLogging()
    # Checking input
    io.check_distance_matrix_shape(D)
    io.check_valid_metric_parameter(metric)
    sparse = issparse(D)
    n = D.shape[0]
    if n_jobs == -1:
        n_jobs = cpu_count()
    if metric == 'similarity':
        kth = n - k
        exclude = -np.inf
        self_tmp_value = np.inf
        self_value = 1.
        log.warning("Similarity matrix support for LS is experimental.")
        if sparse and n_jobs != 1:
            log.warning("Parallel processing not implemented for sparse "
                        "matrices. Using single process instead.")
            n_jobs = 1
    else: # metric == 'distance':
        kth = k - 1
        exclude = np.inf
        self_value = 0
        self_tmp_value = self_value
        if sparse:
            log.error("Sparse distance matrices are not supported.")
            raise NotImplementedError(
                "Sparse distance matrices are not supported.")
    D = np.copy(D)

    if test_ind is None:
        train_ind = slice(0, n) #take all        
    else:
        train_ind = np.setdiff1d(np.arange(n), test_ind)
    if sparse:
        r = np.zeros(n)
        for i in range(n):
            di = D[i, train_ind].toarray()
            di[i] = exclude
            r[i] = np.partition(di, kth=kth)[kth]
        D_ls = lil_matrix(D.shape)
        # Number of nonzero cells per row
        nnz = D.getnnz(axis=1)
    else:
        np.fill_diagonal(D, exclude)
        if n_jobs > 1:
            r_ctype = RawArray(ctypes.c_double, n)
            r = np.frombuffer(r_ctype, dtype=np.float64)
            with Pool(processes=n_jobs,
                      initializer=_ls_load_shared_data,
                      initargs=(D, train_ind, r, r_ctype)) as pool:
                for _ in pool.imap(func=partial(_ls_calculate_r, kth=kth),
                                   iterable=range(n)):
                    pass # results handled within func
        else:
            r = np.partition(D[:, train_ind], kth=kth)[:, kth]

    if sparse or n_jobs == 1:
        D_ls = np.zeros_like(D)
        for i in range(n):
            # vectorized inner loop: calc only triu part
            tmp = np.empty(n-i)
            tmp[0] = self_tmp_value
            if metric == 'similarity':
                if sparse and nnz[i] <= k:  # Don't rescale if there are
                    tmp[1:] = np.nan        # too few neighbors in row
                else:
                    tmp[1:] = np.exp(-1 * D[i, i+1:]**2 / (r[i] * r[i+1:]))
            else:
                tmp[1:] = 1 - np.exp(-1 * D[i, i+1:]**2 / (r[i] * r[i+1:]))
            D_ls[i, i:] = tmp
        # copy triu to tril -> symmetric matrix (diag=zeros)
        # NOTE: does not affect self values, since inf+inf=inf and 0+0=0
        D_ls += D_ls.T
    else:
        D_ls_ctype = RawArray(ctypes.c_double, D.size)
        D_ls = np.frombuffer(D_ls_ctype, dtype=np.float64).reshape(D.shape)
        with Pool(processes=n_jobs,
                  initializer=_ls_load_shared_data,
                  initargs=(D, train_ind, r, r_ctype, D_ls, D_ls_ctype)) as pool:
            for _ in pool.imap(func=partial(_ls_calculate_sec_dist,
                                  n=n, metric=metric,
                                  self_tmp_value=self_tmp_value),
                               iterable=range(n)):
                pass # results handled within func
        # triu is copied to tril within func
    if sparse:
        for i, nz in enumerate(nnz):
            if nz <= k: # too few neighbors
                D_ls[i, :] = D[i, :]
        return D_ls.tocsr()
    else:
        np.fill_diagonal(D_ls, self_value)
        return D_ls
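
# Illustration only: the dense, single-process core of the distance branch
# above, written as one vectorized expression. r_i is the distance to the k-th
# nearest neighbor; the secondary distance is 1 - exp(-d_ij^2 / (r_i * r_j)).
# The helper name is made up.
import numpy as np

def local_scaling_sketch(D, k=7):
    # D: dense symmetric distance matrix
    D = D.copy().astype(float)
    np.fill_diagonal(D, np.inf)                       # exclude self distances
    r = np.partition(D, kth=k - 1, axis=1)[:, k - 1]  # k-th NN distance per row
    D_ls = 1. - np.exp(-D**2 / (r[:, np.newaxis] * r[np.newaxis, :]))
    np.fill_diagonal(D_ls, 0.)                        # restore self distances
    return D_ls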
Beispiel #39
0
    def fit_transform(self, X, Y=None, has_self_distances=False):
        # Let's assume there are no self distances in X
        kth = np.arange(self.k)
        start = 0
        end = self.k
        if self.metric == 'precomputed':
            if Y is not None:
                raise ValueError(
                    f"Y must be None when using precomputed distances.")
            n_test, n_train = X.shape
            if n_test == n_train and has_self_distances:
                kth = np.arange(self.k + 1)
                start = 1
                end = self.k + 1
        else:
            n_test, m_test = X.shape
            if Y is None:
                Y = X
                # Self distances do occur in this case
                kth = np.arange(self.k + 1)
                start = 1
                end = self.k + 1
            n_train, m_train = Y.shape
            assert m_test == m_train, 'Number of features does not match'

        if self.metric == 'precomputed':
            if issparse(X):
                k_neighbors = self._k_neighbors_precomputed_sparse(X)
            else:
                k_neighbors = self._k_neighbors_precomputed(X, kth, start, end)
        else:
            k_neighbors = self._k_neighbors(
                X, Y, kth=kth, n_test=n_test, start=start, end=end)
        if self.return_k_neighbors:
            self.k_neighbors_ = k_neighbors
        k_occurrence = np.bincount(
            k_neighbors.astype(int).ravel(), minlength=n_train)
        if self.return_k_occurrence:
            self.k_occurrence_ = k_occurrence
        # traditional skewness measure
        self.k_skewness_ = stats.skew(k_occurrence)
        # new skewness measure (truncated normal distribution)
        self.k_skewness_truncnorm_ = self._skewness_truncnorm(k_occurrence)
        # Gini index
        if k_occurrence.shape[0] > 10_000:
            limiting = 'space'
        else:
            limiting = 'time'
        self.gini_index_ = self._gini_index(k_occurrence, limiting)
        # Robin Hood index
        self.hood_index_ = self._hood_index(k_occurrence)
        # Atkinson index
        self.atkinson_index_ = self._atkinson_index(k_occurrence)
        # anti-hub occurrence
        self.antihubs_, self.antihub_occurrence_ = \
            self._antihub_occurrence(k_occurrence)
        # hub occurrence
        self.hubs_, self.hub_occurrence_ = \
            self._hub_occurrence(k=self.k, k_occurrence=k_occurrence,
                                 n_test=n_test, hub_size=self.hub_size)
        # Largest hub
        # TODO That should probably also be divided by k...
        self.groupie_ratio_ = k_occurrence.max() / n_test
        return self
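
# Illustration only: a generic O(n log n) Gini coefficient on a k-occurrence
# vector, roughly what _gini_index measures above. The helper name is
# hypothetical; the class switches between time- and space-efficient variants.
import numpy as np

def gini_index_sketch(k_occurrence):
    # Gini coefficient of the k-occurrence distribution
    # (0 = perfectly even, values near 1 = maximally skewed)
    x = np.sort(np.asarray(k_occurrence, dtype=float))
    n = x.size
    cum = np.cumsum(x)
    if cum[-1] == 0:
        return 0.0
    return (n + 1 - 2 * (cum / cum[-1]).sum()) / n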
Beispiel #40
0
def nicdm_sample(D:np.ndarray, k:int=7, metric:str='distance',
                 train_ind:np.ndarray=None, test_ind:np.ndarray=None):
    """Transform a distance matrix with local scaling variant NICDM.
    
    --- DRAFT version ---

    Transforms the given distance matrix into a new one using NICDM [1]_
    with the given neighborhood radius `k` (average). There are two types of
    local scaling methods implemented: the original one and the non-iterative
    contextual dissimilarity measure. Both reduce hubness in distance spaces,
    similarly to Mutual Proximity.

    Parameters
    ----------
    D : ndarray or csr_matrix
        The ``n x n`` symmetric distance (similarity) matrix.

    k : int, optional (default: 7)
        Neighborhood radius for local scaling.

    metric : {'distance', 'similarity'}, optional (default: 'distance')
        Define, whether matrix `D` is a distance or similarity matrix.

        NOTE: self similarities in sparse `D_ls` are set to ``np.inf``

    train_ind : ndarray, optional
        If given, use only these data points as neighbors for rescaling.

    test_ind : ndarray, optional (default: None)
        Define data points to be held out as part of a test set. Can be:

        - None : Rescale all distances
        - ndarray : Hold out points indexed in this array as test set.

    Returns
    -------
    D_nicdm : ndarray
        Secondary distance NICDM matrix.

    References
    ----------
    .. [1] Schnitzer, D., Flexer, A., Schedl, M., & Widmer, G. (2012).
           Local and global scaling reduce hubs in space. The Journal of Machine
           Learning Research, 13(1), 2871–2902.
    """
    # Checking input
    io.check_sample_shape_fits(D, train_ind)
    io.check_valid_metric_parameter(metric)
    if metric == 'similarity':
        raise NotImplementedError("NICDM does not support similarity matrices "
                                  "at the moment.")
    else: # metric == 'distance':
        D = np.copy(D)
        kth = np.arange(k)
        exclude = np.inf
        self_value = 0
        if issparse(D):
            raise NotImplementedError(
                "Sparse distance matrices are not supported.")

    n = D.shape[0]
    if test_ind is None:
        n_ind = range(n)
    else:
        n_ind = test_ind
    # Exclude self distances
    for j, sample in enumerate(train_ind):
        D[sample, j] = exclude

    # Statistics
    r = np.partition(D, kth=kth, axis=1)[:, :k].mean(axis=1)
    r_geom = _local_geomean(r) #knn.ravel())

    # Calculate secondary distances
    D_nicdm = np.zeros_like(D)
    for i in n_ind:
        # vectorized inner loop (using broadcasting)
        D_nicdm[i, :] = (r_geom * D[i, :]) / np.sqrt(r[i] * r[train_ind])
        #D_nicdm[i, :] = ((r_geom**2) * D[i, :]) / (r[i] * r[train_ind])

    # Ensure correct self distances and return sec. dist. matrix
    if test_ind is None:
        np.fill_diagonal(D_nicdm, self_value)
        return D_nicdm 
    else:
        for j, sample in enumerate(train_ind):
            D_nicdm[sample, j] = self_value
        return D_nicdm[test_ind]
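
# Illustration only: the full (non-sample) dense variant of the NICDM formula
# used above, with r_i the mean distance to the k nearest neighbors and r_geom
# their geometric mean over all points. The helper name is made up.
import numpy as np
from scipy.stats import gmean

def nicdm_sketch(D, k=7):
    # D: dense symmetric distance matrix
    D = D.copy().astype(float)
    np.fill_diagonal(D, np.inf)                       # exclude self distances
    r = np.partition(D, kth=k - 1, axis=1)[:, :k].mean(axis=1)  # mean k-NN distance
    r_geom = gmean(r)                                 # global geometric mean
    D_nicdm = (r_geom * D) / np.sqrt(r[:, np.newaxis] * r[np.newaxis, :])
    np.fill_diagonal(D_nicdm, 0.)
    return D_nicdm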
Beispiel #41
0
    def simcond(self, xo, method='approx', i_unknown=None):
        """
        Simulate values conditionally on observed known values

        Parameters
        ----------
        x : vector
            timeseries including missing data.
            (missing data must be NaN if i_unknown is not given)
            Assumption: The covariance of x is equal to self and has the
            same sample period.
        method : string
            defining method used in the conditional simulation. Options are:
            'approximate': Condition only on the closest points. Quite fast
            'exact' : Exact simulation. Slow for large data sets, may not
                return any result due to near singularity of the covariance
                matrix.
        i_unknown : integers
            indices to spurious or missing data in x

        Returns
        -------
        sample : ndarray
            a random sample of the missing values conditioned on the observed
            data.
        mu, sigma : ndarray
            mean and standard deviation, respectively, of the missing values
            conditioned on the observed data.

        Notes
        -----
        SIMCOND generates the missing values from x conditioned on the observed
        values assuming x comes from a multivariate Gaussian distribution
        with zero expectation and Auto Covariance function R.

        See also
        --------
        CovData1D.sim
        TimeSeries.reconstruct,
        rndnormnd

        Reference
        ---------
        Brodtkorb, P, Myrhaug, D, and Rue, H (2001)
        "Joint distribution of wave height and wave crest velocity from
        reconstructed data with application to ringing"
        Int. Journal of Offshore and Polar Engineering, Vol 11, No. 1,
        pp 23--32

        Brodtkorb, P, Myrhaug, D, and Rue, H (1999)
        "Joint distribution of wave height and wave crest velocity from
        reconstructed data"
        in Proceedings of 9th ISOPE Conference, Vol III, pp 66-73
        """
        x = atleast_1d(xo).ravel()
        acf = self._get_acf()

        num_x = len(x)
        num_acf = len(acf)

        if i_unknown is not None:
            x[i_unknown] = nan
        i_unknown = flatnonzero(isnan(x))
        num_unknown = len(i_unknown)

        mu1o = zeros((num_unknown, ))
        mu1o_std = zeros((num_unknown, ))
        sample = zeros((num_unknown, ))
        if num_unknown == 0:
            warnings.warn('No missing data, no point to continue.')
            return sample, mu1o, mu1o_std
        if num_unknown == num_x:
            warnings.warn('All data missing, returning sample from' +
                          ' the a priori distribution.')
            mu1o_std = ones(num_unknown) * sqrt(acf[0])
            return self.sim(ns=num_unknown, cases=1)[:, 1], mu1o, mu1o_std

        i_known = flatnonzero(1 - isnan(x))

        if method.startswith('exac'):
            # exact but slow. It also may not return any result
            if num_acf > 0.3 * num_x:
                Sigma = toeplitz(hstack((acf, zeros(num_x - num_acf))))
            else:
                acf[0] = acf[0] * 1.00001
                Sigma = sptoeplitz(hstack((acf, zeros(num_x - num_acf))))
            Soo, So1, S11 = self._split_cov(Sigma, i_known, i_unknown)

            if issparse(Sigma):
                So1 = So1.todense()
                S11 = S11.todense()
                S1o_Sooinv = spsolve(Soo + Soo.T, 2 * So1).T
            else:
                Sooinv_So1, _res, _rank, _s = lstsq(Soo + Soo.T,
                                                    2 * So1,
                                                    cond=1e-4)
                S1o_Sooinv = Sooinv_So1.T
            mu1o = S1o_Sooinv.dot(x[i_known])
            Sigma1o = S11 - S1o_Sooinv.dot(So1)
            if (diag(Sigma1o) < 0).any():
                raise ValueError('Failed to converge to a solution')

            mu1o_std = sqrt(diag(Sigma1o))
            sample[:] = rndnormnd(mu1o, Sigma1o, cases=1).ravel()

        elif method.startswith('appr'):
            # approximating by only condition on the closest points

            Nsig = min(2 * num_acf, num_x)

            Sigma = toeplitz(hstack((acf, zeros(Nsig - num_acf))))
            overlap = int(Nsig / 4)
            # indices to the points used
            idx = r_[0:Nsig] + max(0, min(i_unknown[0] - overlap,
                                          num_x - Nsig))
            mask_unknown = zeros(num_x, dtype=bool)
            # temporary storage of indices to missing points
            mask_unknown[i_unknown] = True
            t_unknown = where(mask_unknown[idx])[0]
            t_known = where(1 - mask_unknown[idx])[0]
            ns = len(t_unknown)  # number of missing data in the interval

            num_restored = 0  # number of previously simulated points
            x2 = x.copy()

            while ns > 0:
                Soo, So1, S11 = self._split_cov(Sigma, t_known, t_unknown)
                if issparse(Soo):
                    So1 = So1.todense()
                    S11 = S11.todense()
                    S1o_Sooinv = spsolve(Soo + Soo.T, 2 * So1).T
                else:
                    Sooinv_So1, _res, _rank, _s = lstsq(Soo + Soo.T,
                                                        2 * So1,
                                                        cond=1e-4)
                    S1o_Sooinv = Sooinv_So1.T
                Sigma1o = S11 - S1o_Sooinv.dot(So1)
                if (diag(Sigma1o) < 0).any():
                    raise ValueError('Failed to converge to a solution')

                ix = slice((num_restored), (num_restored + ns))
                # standard deviation of the expected surface
                mu1o_std[ix] = np.maximum(mu1o_std[ix], sqrt(diag(Sigma1o)))

                # expected surface conditioned on the closest known
                # observations from x
                mu1o[ix] = S1o_Sooinv.dot(x2[idx[t_known]])
                # sample conditioned on the known observations from x
                mu1os = S1o_Sooinv.dot(x[idx[t_known]])
                sample[ix] = rndnormnd(mu1os, Sigma1o, cases=1)
                if idx[-1] == num_x - 1:
                    ns = 0  # no more points to simulate
                else:
                    x2[idx[t_unknown]] = mu1o[ix]  # expected surface
                    x[idx[t_unknown]] = sample[ix]  # sampled surface
                    # removing indices to data which has been simulated
                    mask_unknown[idx[:-overlap]] = False
                    # data we want to simulate once more
                    nw = np.sum(mask_unknown[idx[-overlap:]])
                    num_restored += ns - nw  # update # points simulated so far

                    idx = self._update_window(idx, i_unknown, num_x, num_acf,
                                              overlap, nw, num_restored)

                    # find new interval with missing data
                    t_unknown = flatnonzero(mask_unknown[idx])
                    t_known = flatnonzero(1 - mask_unknown[idx])
                    ns = len(t_unknown)  # # missing data in the interval
        return sample, mu1o, mu1o_std
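
# Illustration only: the conditional-Gaussian step that both the 'exact' and
# 'approx' branches above rely on, written with a plain dense solve. Assumes a
# dense covariance Sigma and integer index arrays; the helper name is made up.
import numpy as np

def conditional_gaussian_sketch(Sigma, x, i_known, i_unknown):
    # Zero-mean Gaussian: condition the unknown entries of x on the observed ones.
    Soo = Sigma[np.ix_(i_known, i_known)]
    So1 = Sigma[np.ix_(i_known, i_unknown)]
    S11 = Sigma[np.ix_(i_unknown, i_unknown)]
    S1o_Sooinv = np.linalg.solve(Soo, So1).T          # S_1o @ S_oo^{-1}
    mu1o = S1o_Sooinv @ x[i_known]                    # conditional mean
    Sigma1o = S11 - S1o_Sooinv @ So1                  # conditional covariance
    return mu1o, Sigma1o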
Beispiel #42
0
    def simcond(self, xo, method='approx', i_unknown=None):
        """
        Simulate values conditionally on observed known values

        Parameters
        ----------
        x : vector
            timeseries including missing data.
            (missing data must be NaN if i_unknown is not given)
            Assumption: The covariance of x is equal to self and has the
            same sample period.
        method : string
            defining method used in the conditional simulation. Options are:
            'approximate': Condition only on the closest points. Quite fast
            'exact' : Exact simulation. Slow for large data sets, may not
                return any result due to near singularity of the covariance
                matrix.
        i_unknown : integers
            indices to spurious or missing data in x

        Returns
        -------
        sample : ndarray
            a random sample of the missing values conditioned on the observed
            data.
        mu, sigma : ndarray
            mean and standard deviation, respectively, of the missing values
            conditioned on the observed data.

        Notes
        -----
        SIMCOND generates the missing values from x conditioned on the observed
        values assuming x comes from a multivariate Gaussian distribution
        with zero expectation and Auto Covariance function R.

        See also
        --------
        CovData1D.sim
        TimeSeries.reconstruct,
        rndnormnd

        Reference
        ---------
        Brodtkorb, P, Myrhaug, D, and Rue, H (2001)
        "Joint distribution of wave height and wave crest velocity from
        reconstructed data with application to ringing"
        Int. Journal of Offshore and Polar Engineering, Vol 11, No. 1,
        pp 23--32

        Brodtkorb, P, Myrhaug, D, and Rue, H (1999)
        "Joint distribution of wave height and wave crest velocity from
        reconstructed data"
        in Proceedings of 9th ISOPE Conference, Vol III, pp 66-73
        """
        x = atleast_1d(xo).ravel()
        acf = self._get_acf()

        num_x = len(x)
        num_acf = len(acf)

        if i_unknown is not None:
            x[i_unknown] = nan
        i_unknown = flatnonzero(isnan(x))
        num_unknown = len(i_unknown)

        mu1o = zeros((num_unknown,))
        mu1o_std = zeros((num_unknown,))
        sample = zeros((num_unknown,))
        if num_unknown == 0:
            warnings.warn('No missing data, no point to continue.')
            return sample, mu1o, mu1o_std
        if num_unknown == num_x:
            warnings.warn('All data missing, returning sample from' +
                          ' the a priori distribution.')
            mu1o_std = ones(num_unknown) * sqrt(acf[0])
            return self.sim(ns=num_unknown, cases=1)[:, 1], mu1o, mu1o_std

        i_known = flatnonzero(1 - isnan(x))

        if method.startswith('exac'):
            # exact but slow. It also may not return any result
            if num_acf > 0.3 * num_x:
                Sigma = toeplitz(hstack((acf, zeros(num_x - num_acf))))
            else:
                acf[0] = acf[0] * 1.00001
                Sigma = sptoeplitz(hstack((acf, zeros(num_x - num_acf))))
            Soo, So1, S11 = self._split_cov(Sigma, i_known, i_unknown)

            if issparse(Sigma):
                So1 = So1.todense()
                S11 = S11.todense()
                S1o_Sooinv = spsolve(Soo + Soo.T, 2 * So1).T
            else:
                Sooinv_So1, _res, _rank, _s = lstsq(Soo + Soo.T, 2 * So1,
                                                    cond=1e-4)
                S1o_Sooinv = Sooinv_So1.T
            mu1o = S1o_Sooinv.dot(x[i_known])
            Sigma1o = S11 - S1o_Sooinv.dot(So1)
            if (diag(Sigma1o) < 0).any():
                raise ValueError('Failed to converge to a solution')

            mu1o_std = sqrt(diag(Sigma1o))
            sample[:] = rndnormnd(mu1o, Sigma1o, cases=1).ravel()

        elif method.startswith('appr'):
            # approximating by only condition on the closest points

            Nsig = min(2 * num_acf, num_x)

            Sigma = toeplitz(hstack((acf, zeros(Nsig - num_acf))))
            overlap = int(Nsig / 4)
            # indices to the points used
            idx = r_[0:Nsig] + max(0, min(i_unknown[0] - overlap,
                                          num_x - Nsig))
            mask_unknown = zeros(num_x, dtype=bool)
            # temporary storage of indices to missing points
            mask_unknown[i_unknown] = True
            t_unknown = where(mask_unknown[idx])[0]
            t_known = where(1 - mask_unknown[idx])[0]
            ns = len(t_unknown)  # number of missing data in the interval

            num_restored = 0  # number of previously simulated points
            x2 = x.copy()

            while ns > 0:
                Soo, So1, S11 = self._split_cov(Sigma, t_known, t_unknown)
                if issparse(Soo):
                    So1 = So1.todense()
                    S11 = S11.todense()
                    S1o_Sooinv = spsolve(Soo + Soo.T, 2 * So1).T
                else:
                    Sooinv_So1, _res, _rank, _s = lstsq(Soo + Soo.T, 2 * So1,
                                                        cond=1e-4)
                    S1o_Sooinv = Sooinv_So1.T
                Sigma1o = S11 - S1o_Sooinv.dot(So1)
                if (diag(Sigma1o) < 0).any():
                    raise ValueError('Failed to converge to a solution')

                ix = slice((num_restored), (num_restored + ns))
                # standard deviation of the expected surface
                mu1o_std[ix] = np.maximum(mu1o_std[ix], sqrt(diag(Sigma1o)))

                # expected surface conditioned on the closest known
                # observations from x
                mu1o[ix] = S1o_Sooinv.dot(x2[idx[t_known]])
                # sample conditioned on the known observations from x
                mu1os = S1o_Sooinv.dot(x[idx[t_known]])
                sample[ix] = rndnormnd(mu1os, Sigma1o, cases=1)
                if idx[-1] == num_x - 1:
                    ns = 0  # no more points to simulate
                else:
                    x2[idx[t_unknown]] = mu1o[ix]  # expected surface
                    x[idx[t_unknown]] = sample[ix]  # sampled surface
                    # removing indices to data which has been simulated
                    mask_unknown[idx[:-overlap]] = False
                    # data we want to simulate once more
                    nw = np.sum(mask_unknown[idx[-overlap:]])
                    num_restored += ns - nw  # update # points simulated so far

                    idx = self._update_window(idx, i_unknown, num_x, num_acf,
                                              overlap, nw,  num_restored)

                    # find new interval with missing data
                    t_unknown = flatnonzero(mask_unknown[idx])
                    t_known = flatnonzero(1 - mask_unknown[idx])
                    ns = len(t_unknown)  # # missing data in the interval
        return sample, mu1o, mu1o_std
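
Both branches above perform the same conditional Gaussian step. A minimal,
self-contained sketch of that step (plain numpy/scipy, dense covariance only,
function and variable names chosen here for illustration, assuming
len(acf) <= len(x)) might look like this:

import numpy as np
from scipy.linalg import toeplitz, lstsq

def conditional_gaussian_sample(acf, x, i_known, i_unknown, rng=None):
    # full covariance matrix of the record, built from the autocovariance
    n = len(x)
    sigma = toeplitz(np.hstack((acf, np.zeros(n - len(acf)))))
    Soo = sigma[np.ix_(i_known, i_known)]      # known-known covariance
    So1 = sigma[np.ix_(i_known, i_unknown)]    # known-unknown covariance
    S11 = sigma[np.ix_(i_unknown, i_unknown)]  # unknown-unknown covariance
    # mu_{1|o} = S_1o S_oo^{-1} x_o,  Sigma_{1|o} = S_11 - S_1o S_oo^{-1} S_o1
    Sooinv_So1, _res, _rank, _s = lstsq(Soo + Soo.T, 2 * So1, cond=1e-4)
    mu1o = Sooinv_So1.T.dot(x[i_known])
    Sigma1o = S11 - Sooinv_So1.T.dot(So1)
    rng = np.random.default_rng() if rng is None else rng
    return mu1o, rng.multivariate_normal(mu1o, Sigma1o)

For example, with acf = 0.9 ** np.arange(5), a record x of length 20 and the
NaN positions passed as i_unknown, the function returns the conditional mean
and one conditional sample of the missing values.
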
def __nearest_neighbors_search(pipe_to_exec, source_file_path, file_path):
    '''
        Runs the "pipe_to_exec" nearest neighbors search estimator, stores the
        retrieval results and evaluates them.

        parameters:

            * pipe_to_exec : pipeline whose first step records its fit time and
              whose transform returns (retrieved indices, distances, mean query time)
            * source_file_path : hdf file in which the input documents, queries and targets are stored
            * file_path : hdf filename where the nns results will be stored
    '''

    #     print(linei.describe)

    d = hdf_to_sparse_matrix('documents', source_file_path)
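    # the first pipeline step is assumed to record its fitting time in a
    # ``fit_time`` attribute (read below)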
    pipe_to_exec.fit(d, None)
    d_mean_time = pipe_to_exec.steps[0][1].fit_time

    print("fitted in %f s" % (d_mean_time))

    del d

    q = hdf_to_sparse_matrix('queries', source_file_path)
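    # ``transform`` is assumed to return, per query, the indices of the
    # retrieved documents, their distances and the mean retrieval time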
    d_indices, qd_distances, q_mean_time = pipe_to_exec.transform(q)

    #     print("mean retrieval time %f s"%(q_mean_time))

    time_dataframe = pd.DataFrame({
        'documents_mean_time': [d_mean_time],
        'queries_mean_time': [q_mean_time],
    })
    # store the nearest neighbors search results
    time_dataframe.to_hdf(file_path.replace('results.h5', 'time.h5'),
                          'time_dataframe')
    sparse_matrix_to_hdf(d_indices, 'retrieved_docs', file_path)
    sparse_matrix_to_hdf(lil_matrix(qd_distances), 'qd_distances', file_path)

    del q, d_mean_time, q_mean_time, qd_distances, time_dataframe
    # evaluate the results in terms of precision, recall and MAP

    t = hdf_to_sparse_matrix('targets', source_file_path)

    retrieved_relevants = []
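    # for each query, cumulatively count how many of the top-ranked retrieved
    # documents are relevant (one running count per rank position)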
    for q_index in range(d_indices.shape[0]):
        q_retrieved_relevants = np.cumsum(t[q_index, d_indices[q_index, :]].A,
                                          axis=1)
        retrieved_relevants.append(q_retrieved_relevants)

    retrieved_relevants = vstack(retrieved_relevants)
    # broadcasting: t.sum(axis=1) is an (n_queries, 1) column of relevant
    # counts per query, divided element-wise into every rank position
    approachi_recalls = np.divide(retrieved_relevants,
                                  np.matrix(t.sum(axis=1)))
    ranking_sum = np.multiply(
        np.ones(retrieved_relevants.shape),
        np.matrix(range(1, retrieved_relevants.shape[1] + 1)))
    approachi_precisions = np.divide(retrieved_relevants, ranking_sum)

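    # zero out precision values at ranks holding irrelevant documents and
    # average over all rank positions to get one score per query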
    average_precision = np.zeros((d_indices.shape[0], 1))
    for q_index in range(d_indices.shape[0]):
        relevants_precision = np.multiply(approachi_precisions[q_index, :],
                                          t[q_index, d_indices[q_index, :]].A)
        average_precision[q_index, 0] = relevants_precision.mean(axis=1)


#         print(q_index,'.MAP =',average_precision[q_index,0])

#     print(t.sum(axis=1))
#     print(retrieved_relevants)
    del d_indices, retrieved_relevants

    #     print("MAP=",average_precision.mean(),average_precision.std(),'precision.sum=',average_precision.sum())
    #     print("recalls.sum = ",approachi_recalls.sum(),'| mean = ',approachi_recalls.sum()/(approachi_recalls.shape[0]*approachi_recalls.shape[1]))

    for to_store, to_store_name in [(approachi_precisions, 'precisions'),
                                    (approachi_recalls, 'recalls'),
                                    (average_precision, 'average_precisions')]:
        if not issparse(to_store):
            to_store = csr_matrix(to_store)
        sparse_matrix_to_hdf(
            to_store, to_store_name,
            file_path.replace('results', 'results_evaluation'))

        del to_store
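
For reference, a tiny self-contained sketch (toy arrays invented here purely
for illustration) of the precision@k, recall@k and average-precision
quantities computed in the evaluation block above; note that it uses the
conventional AP normalisation by the number of relevant documents:

import numpy as np

# toy relevance targets (2 queries x 6 documents) and a retrieved ranking
targets = np.array([[1, 0, 1, 0, 0, 1],
                    [0, 1, 0, 0, 1, 0]])
retrieved = np.array([[2, 0, 4, 5, 1, 3],   # document ids, best first
                      [1, 4, 0, 2, 3, 5]])

rels = np.take_along_axis(targets, retrieved, axis=1)          # relevance per rank
cum_rel = np.cumsum(rels, axis=1)                              # retrieved relevants
recalls = cum_rel / targets.sum(axis=1, keepdims=True)         # recall@k
precisions = cum_rel / np.arange(1, rels.shape[1] + 1)         # precision@k
avg_prec = (precisions * rels).sum(axis=1) / targets.sum(axis=1)  # AP per query
print("MAP =", avg_prec.mean())
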