Esempio n. 1
0
    def _k_neighbors_precomputed_sparse(self,
                                        X: csr_matrix,
                                        n_samples: int = None):
        """ Find nearest neighbors in sparse distance matrix.

        For every row of ``X`` the ``self.k`` smallest explicitly stored
        distances are selected and the corresponding column indices are
        returned, ordered by increasing distance within each row.

        Parameters
        ----------
        X: sparse, shape = [n_test, n_indexed]
            Sparse distance matrix. Only explicitly stored elements
            may be considered neighbors.

        n_samples: int
            Number of sampled indexed objects, e.g.
            in approximate hubness reduction.
            If None, this is inferred from the first row of X.

        Returns
        -------
        k_neighbors : ndarray
            Flattened array of neighbor indices (``self.k`` entries per
            row of X, rows concatenated in order). Empty if X has no rows.

        Raises
        ------
        TypeError
            If X is not a scipy sparse matrix.

        Notes
        -----
        Every row is expected to store at least ``self.k`` entries;
        otherwise ``np.argpartition`` rejects the requested ``kth``.
        """
        if not issparse(X):
            raise TypeError('Matrix X is not sparse')
        X = X.tocsr()
        n_test, _ = X.shape
        if n_test == 0:
            # Without query rows there is nothing to select; indexing
            # indptr[1] or concatenating zero arrays would raise instead.
            return np.empty((0, ), dtype=X.indices.dtype)

        k = self.k
        kth = np.arange(k)
        indptr, indices, data = X.indptr, X.indices, X.data
        row_nnz = np.diff(indptr)
        if n_samples is None:
            n_samples = row_nnz[0]

        # Fast path: all rows store the same number of entries, so the data
        # can be viewed as a dense (n_test, n_samples) block and partitioned
        # in a single vectorized call.
        if np.all(row_nnz == n_samples) and not self.shuffle_equal:
            min_ind = np.argpartition(data.reshape(n_test, n_samples),
                                      kth=kth,
                                      axis=1)[:, :k]
            # Convert row-local positions into positions within X.indices.
            return indices[min_ind.ravel() +
                           np.repeat(indptr[:-1], repeats=k)]

        # Slow path: rows may differ in their number of stored entries
        # (or ties must be broken randomly), so process row by row.
        # Slicing via indptr avoids building a sparse matrix per row.
        range_n_test = tqdm(range(n_test)) if self.verbose else range(n_test)
        shuffle_equal = self.shuffle_equal
        per_row_neighbors = []
        for i in range_n_test:
            start, stop = indptr[i], indptr[i + 1]
            row_data = data[start:stop]
            row_indices = indices[start:stop]
            if shuffle_equal:
                # Randomly permute the row first, so that equal distances
                # are not always resolved in favor of the same columns.
                rp = self._random_state.permutation(int(stop - start))
                order = np.argpartition(row_data[rp], kth=kth)[:k]
                per_row_neighbors.append(row_indices[rp[order]])
            else:
                order = np.argpartition(row_data, kth=kth)[:k]
                per_row_neighbors.append(row_indices[order])
        return np.concatenate(per_row_neighbors)
Esempio n. 2
0
 def sample_by_random_uniform(data: sp.csr_matrix, num_items=99) -> sp.csr_matrix:
     """Uniformly sample up to ``num_items`` non-zero entries from every row.

     For each row of ``data`` the columns holding non-zero values are the
     candidates; ``num_items`` of them are drawn without replacement using
     the module-level ``random`` generator. Rows with fewer candidates than
     ``num_items`` contribute all of their candidates (rows without any
     candidates contribute nothing) instead of making ``random.sample`` fail.

     NOTE(review): the original local names call the candidates "negatives";
     presumably ``data`` marks the negative items per row - confirm with
     the caller.

     :param data: sparse matrix whose non-zero entries are the candidates.
     :param num_items: maximum number of entries to sample per row.
     :return: boolean CSR matrix of the same shape as ``data`` with ``True``
         at every sampled position.
     """
     rows = []
     cols = []
     for row in range(data.shape[0]):
         # Column indices of the non-zero entries in this row.
         candidate_cols = data.getrow(row).nonzero()[1]
         n_draw = min(num_items, len(candidate_cols))
         if n_draw == 0:
             continue
         picked = random.sample(range(len(candidate_cols)), n_draw)
         rows.extend([row] * n_draw)
         cols.extend(candidate_cols[picked])
     # Explicit dtypes keep the construction valid even when nothing was sampled.
     row_idx = np.asarray(rows, dtype=np.intp)
     col_idx = np.asarray(cols, dtype=np.intp)
     values = np.ones(len(rows), dtype=bool)
     negative_samples = sp.csr_matrix((values, (row_idx, col_idx)), dtype='bool',
                                      shape=(data.shape[0], data.shape[1]))
     return negative_samples
Esempio n. 3
0
def train_test_split(X: sp.csr_matrix):
    """Hold out one randomly chosen stored column per row as test data.

    Rows are taken from ``get_unique_nonzero_indices(X)`` (presumably the
    rows that contain at least one entry - verify against that helper).
    For each such row one of its stored column indices is drawn with
    ``np.random.choice``; that (row, column) pair goes to the test set and
    the row's remaining stored columns go to the training set.

    :param X: sparse matrix whose stored entries are split.
    :return: tuple ``(train_data, test_data)`` of lists of
        ``(row_index, col_index)`` pairs; ``test_data`` holds exactly one
        pair per processed row.
    """
    train_pairs, test_pairs = [], []

    for row in get_unique_nonzero_indices(X):
        stored_cols = X.getrow(row).indices

        # Draw the single held-out column for this row.
        held_out = np.random.choice(stored_cols, 1)[0]
        for col in stored_cols:
            if col != held_out:
                train_pairs.append((row, col))
        test_pairs.append((row, held_out))

    return train_pairs, test_pairs
Esempio n. 4
0
def filter_rows(X: sp.csr_matrix, min_row_values=10):
    """Drop rows that store fewer than ``min_row_values`` entries.

    Rows reported by ``get_unique_nonzero_indices(X)`` are inspected; every
    stored entry of a row with too few entries is overwritten with 0, and
    the result is then passed through ``remove_zero_rows``.

    Note that the zeroing step writes into ``X`` itself, so the caller's
    matrix is modified in place.

    :param X: sparse matrix to filter (modified in place).
    :param min_row_values: minimum number of stored entries a row needs
        in order to be kept.
    :return: whatever ``remove_zero_rows`` returns for the zeroed matrix.
    """
    for row in get_unique_nonzero_indices(X):
        stored_cols = X.getrow(row).indices
        if len(stored_cols) >= min_row_values:
            continue  # row is populated well enough, leave it untouched
        # Blank out the under-populated row entry by entry.
        for col in stored_cols:
            X[row, col] = 0

    # Rows that were blanked above are removed here.
    return remove_zero_rows(X)
def _mutual_proximity_empiric_sparse(S: csr_matrix, test_set_ind: np.ndarray = None, verbose: int = 0, log=None):
    """Empiric mutual proximity (MP) for a sparse similarity matrix.

    Please do not call this function directly; invoke it via
    mutual_proximity_empiric().

    For every non-zero entry (i, j) in the upper triangle of ``S`` the
    objects having a positive similarity to both i and j are considered.
    The MP value is the number of those objects whose similarity to i and
    to j are both at most ``S[j, i]``, divided by the number of considered
    objects minus one. The result is mirrored into the lower triangle and
    the diagonal is set to 1.0.

    ``test_set_ind`` is accepted but not used by this function.
    Progress is reported through ``log.message`` when both ``verbose`` and
    ``log`` are truthy.

    Returns the MP matrix in CSR format.
    """
    n = S.shape[0]
    S_mp = lil_matrix(S.shape)

    upper_rows, upper_cols = triu(S).nonzero()
    for i, j in zip(upper_rows, upper_cols):
        if verbose and log:
            if (i + 1) % 1000 == 0 or i == n - 2:
                log.message("MP_empiric: {} of {}.".format(i + 1, n - 1), flush=True)
        sim_ji = S[j, i]
        row_i = S.getrow(i).toarray()
        row_j = S.getrow(j).toarray()
        # Objects that are linked to both i and j.
        shared = (row_i > 0) & (row_j > 0)
        # Of those, the ones that are at most as similar to i and j
        # as i and j are to each other.
        not_closer = shared & (row_i <= sim_ji) & (row_j <= sim_ji)
        S_mp[i, j] = not_closer.sum() / (shared.sum() - 1)

    # Only the upper triangle was filled; mirror it.
    S_mp += S_mp.T
    # Self similarities are fixed to the maximum of a similarity matrix.
    self_value = 1.0
    for diag in range(n):
        S_mp[diag, diag] = self_value

    return S_mp.tocsr()
Esempio n. 6
0
def _mutual_proximity_empiric_sparse(S: csr_matrix,
                                     test_set_ind: np.ndarray = None,
                                     min_nnz=0,
                                     verbose: int = 0,
                                     log=None,
                                     n_jobs=None):
    """Empiric mutual proximity (MP) for sparse similarity matrices,
    computed with a pool of worker processes.

    Please do not call this function directly; invoke it via
    mutual_proximity_empiric().

    Parameters
    ----------
    S : csr_matrix
        Sparse similarity matrix.
    test_set_ind : ndarray, optional
        Accepted but not used by this function.
    min_nnz : int
        Rows of ``S`` storing at most this many entries keep their
        original values in the output. Also forwarded to the workers.
    verbose : int
        Progress is logged when this and ``log`` are both truthy.
    log : object, optional
        Logger exposing a ``message`` method.
    n_jobs : int, optional
        Number of worker processes; a falsy value means 1 and -1 means
        ``cpu_count()``.

    Returns
    -------
    csr_matrix
        The MP matrix. Per the original author's notes it is not symmetric
        if any row was restored because of ``min_nnz``.
    """
    def _report(text):
        # Log only if verbosity is requested and a logger is available.
        if verbose and log:
            log.message(text)

    _report("Starting MP empiric for sparse matrices.")
    n = S.shape[0]
    if not n_jobs:
        n_jobs = 1
    elif n_jobs == -1:
        n_jobs = cpu_count()

    # Shared buffer with one slot per stored entry of S; it later becomes
    # the data array of the MP matrix.
    shared_data = Array(ctypes.c_double, S.data.size)
    shared_data_np = np.ctypeslib.as_array(shared_data.get_obj())

    _report("Spawning processes and starting MP computation.")
    with Pool(processes=n_jobs,
              initializer=_mpes_init,
              initargs=(S, shared_data)) as pool:
        worker = partial(_mpes_sec_dist, args=(verbose, log, n, min_nnz))
        # Only pairs on or above the diagonal are handed to the workers.
        upper_pairs = (ij for ij in zip(*S.nonzero()) if ij[0] <= ij[1])
        for _ in pool.imap(func=worker,
                           iterable=upper_pairs,
                           chunksize=int(1e5)):
            # Return values are ignored; per the original note the workers
            # write their output into the shared array.
            pass
    pool.join()

    _report("Assemble upper-triangular MP matrix.")
    # Reuse the sparsity structure of S together with the shared buffer.
    S_mp = csr_matrix((shared_data_np, S.indices, S.indptr),
                      shape=S.shape,
                      copy=False).tolil()
    del shared_data, shared_data_np

    _report("Symmetrizing matrix.")
    S_mp += S_mp.T

    # Rows of objects with too few neighbors are restored from S, which
    # makes the returned matrix asymmetric.
    # NOTE(review): the original comment says the matching columns contain
    # NaN; nothing in this function sets NaN - presumably done by
    # _mpes_sec_dist, verify there.
    _report(("Retain original similarities for objects with too few "
             "neighbors. If there are any, the output matrix will "
             "not be symmetric anymore! (Rows corresponding to these "
             "objects will be in original space; corresponding "
             "columns will contain NaN)."))
    sparse_rows = np.argwhere(S.getnnz(axis=1) <= min_nnz)
    for row in sparse_rows.ravel():  # scalars instead of length-1 arrays
        S_mp[row, :] = S.getrow(row)

    _report("Setting self similarities.")
    self_value = 1.  # maximum of a similarity matrix
    for i in range(n):
        S_mp[i, i] = self_value

    _report("Converting to CSR matrix and returning.")
    return S_mp.tocsr()