import ctypes
import random
from functools import partial
from itertools import filterfalse
from multiprocessing import Array, Pool, cpu_count

import numpy as np
import scipy.sparse as sp
from scipy.sparse import csr_matrix, issparse, lil_matrix, triu
from tqdm import tqdm


def _k_neighbors_precomputed_sparse(self, X: csr_matrix, n_samples: int = None):
    """ Find nearest neighbors in sparse distance matrix.

    Parameters
    ----------
    X: sparse, shape = [n_test, n_indexed]
        Sparse distance matrix. Only non-zero elements
        may be considered neighbors.

    n_samples: int
        Number of sampled indexed objects, e.g.
        in approximate hubness reduction.
        If None, this is inferred from the first row of X.

    Returns
    -------
    k_neighbors : ndarray
        Flattened array of neighbor indices.
    """
    if not issparse(X):
        raise TypeError('Matrix X is not sparse')
    X = X.tocsr()
    if n_samples is None:
        n_samples = X.indptr[1] - X.indptr[0]
    n_test, _ = X.shape
    # To allow different numbers of explicit entries per row,
    # we need to process the matrix row-by-row.
    if np.all(X.indptr[1:] - X.indptr[:-1] == n_samples) and not self.shuffle_equal:
        # All rows have the same number of entries: reshape X.data and
        # find the k smallest distances per row in one vectorized pass.
        min_ind = np.argpartition(X.data.reshape(n_test, n_samples),
                                  kth=np.arange(self.k),
                                  axis=1)[:, :self.k]
        # Translate per-row positions back to offsets into X.indices.
        k_neighbors = X.indices[min_ind.ravel()
                                + np.repeat(X.indptr[:-1], repeats=self.k)]
    else:
        k_neighbors = np.empty((n_test,), dtype=object)
        if self.verbose:
            range_n_test = tqdm(range(n_test))
        else:
            range_n_test = range(n_test)
        if self.shuffle_equal:
            for i in range_n_test:
                # Randomly permute the entries, so that ties among equal
                # distances are broken at random by the argpartition below.
                x = X.getrow(i)
                rp = self._random_state.permutation(x.nnz)
                d2 = x.data[rp]
                d2idx = np.argpartition(d2, kth=np.arange(self.k))
                k_neighbors[i] = x.indices[rp[d2idx[:self.k]]]
        else:
            for i in range_n_test:
                x = X.getrow(i)
                min_ind = np.argpartition(x.data, kth=np.arange(self.k))[:self.k]
                k_neighbors[i] = x.indices[min_ind]
        k_neighbors = np.concatenate(k_neighbors)
    return k_neighbors
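
# Hedged usage sketch for the method above. The surrounding class is not part
# of this snippet, so `_IndexStub` below is an assumption: it supplies only the
# attributes the method actually reads (k, shuffle_equal, verbose,
# _random_state). The demo name and toy data are illustrative.
def _demo_k_neighbors_precomputed_sparse():
    class _IndexStub:
        k = 2
        shuffle_equal = False
        verbose = 0
        _random_state = np.random.RandomState(42)
        _k_neighbors = _k_neighbors_precomputed_sparse

    # 3 test objects vs. 4 indexed objects; zeros mean "not a candidate".
    D = csr_matrix(np.array([[0.9, 0.1, 0.5, 0.0],
                             [0.2, 0.0, 0.7, 0.4],
                             [0.0, 0.3, 0.6, 0.8]]))
    neigh = _IndexStub()._k_neighbors(D)
    # Row-wise indices of the two smallest distances: [[1, 2], [0, 3], [1, 2]]
    return neigh.reshape(3, 2)
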
def sample_by_random_uniform(data: sp.csr_matrix, num_items=99) -> sp.csr_matrix:
    rows = []
    cols = []
    for row in range(data.shape[0]):
        # Explicit entries in this row are the candidate negatives;
        # zip(*nonzero()) yields (row, col) pairs with row always 0.
        candidate_negatives = list(zip(*data.getrow(row).nonzero()))
        # Draw num_items candidates uniformly, without replacement.
        sampled_negatives = np.array(candidate_negatives)[
            random.sample(range(len(candidate_negatives)), num_items)]
        rows.extend(list(np.ones(len(sampled_negatives), dtype=int) * row))
        cols.extend(sampled_negatives[:, 1])
    # Assemble a boolean indicator matrix of the sampled negatives.
    negative_samples = sp.csr_matrix((np.ones_like(rows), (rows, cols)),
                                     dtype='bool',
                                     shape=(data.shape[0], data.shape[1]))
    return negative_samples
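
# Hedged usage sketch for sample_by_random_uniform() (demo name and toy data
# are illustrative). Every row must hold at least num_items explicit entries,
# since random.sample() draws without replacement.
def _demo_sample_by_random_uniform():
    dense = np.zeros((3, 10))
    dense[:, ::2] = 1.0  # 5 candidate columns per row
    data = sp.csr_matrix(dense)
    negatives = sample_by_random_uniform(data, num_items=3)
    assert negatives.shape == data.shape
    assert negatives.getnnz(axis=1).tolist() == [3, 3, 3]  # 3 sampled per row
    return negatives
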
def train_test_split(X: sp.csr_matrix):
    row_indices = get_unique_nonzero_indices(X)
    train_data = []
    test_data = []
    for row_index in row_indices:
        col_indices = X.getrow(row_index).indices
        # Leave-one-out split: hold out one random column per row for testing.
        test_index = np.random.choice(col_indices, 1)[0]
        train_data.extend([(row_index, col_index)
                           for col_index in col_indices
                           if col_index != test_index])
        test_data.append((row_index, test_index))
    return train_data, test_data
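
# `get_unique_nonzero_indices` is referenced above but not defined in this
# snippet. The stand-in below is an assumption used by the demos only: it
# returns the indices of rows that contain at least one explicit entry.
def get_unique_nonzero_indices(X: sp.csr_matrix) -> np.ndarray:
    return np.unique(X.nonzero()[0])


# Hedged usage sketch: exactly one (row, col) pair per row is held out.
def _demo_train_test_split():
    interactions = sp.csr_matrix(np.array([[1, 1, 0, 1],
                                           [0, 1, 1, 0],
                                           [1, 0, 0, 1]]))
    train, test = train_test_split(interactions)
    assert len(test) == 3                      # one held-out pair per row
    assert len(train) == interactions.nnz - 3  # the rest stays for training
    return train, test
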
def filter_rows(X: sp.csr_matrix, min_row_values=10):
    row_indices = get_unique_nonzero_indices(X)
    # Set elements of filtered rows to 0.
    for row_index in row_indices:
        col_indices = X.getrow(row_index).indices
        if len(col_indices) < min_row_values:
            for col_index in col_indices:
                X[row_index, col_index] = 0
    # Assigning 0 leaves explicit zeros in the CSR structure;
    # drop them so the zeroed rows are recognized as empty.
    X.eliminate_zeros()
    # Remove zero rows.
    X = remove_zero_rows(X)
    return X
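
# `remove_zero_rows` is likewise not part of this snippet; the stand-in below
# (an assumption, for the demo only) keeps rows that still have explicit entries.
def remove_zero_rows(X: sp.csr_matrix) -> sp.csr_matrix:
    return X[X.getnnz(axis=1) > 0]


# Hedged usage sketch: rows with fewer than min_row_values entries are dropped.
def _demo_filter_rows():
    X = sp.csr_matrix(np.array([[1, 1, 1],
                                [1, 0, 0],
                                [1, 1, 0]]))
    filtered = filter_rows(X.copy(), min_row_values=2)
    assert filtered.shape == (2, 3)  # the single-entry row was removed
    return filtered
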
def _mutual_proximity_empiric_sparse(S: csr_matrix,
                                     test_set_ind: np.ndarray = None,
                                     verbose: int = 0,
                                     log=None):
    """MP empiric for sparse similarity matrices.

    Please do not directly use this function, but invoke via
    mutual_proximity_empiric()
    """
    self_value = 1.0  # similarity matrix
    n = S.shape[0]
    S_mp = lil_matrix(S.shape)

    for i, j in zip(*triu(S).nonzero()):
        if verbose and log and ((i + 1) % 1000 == 0 or i == n - 2):
            log.message("MP_empiric: {} of {}.".format(i + 1, n - 1), flush=True)
        d = S[j, i]
        dI = S.getrow(i).toarray()
        dJ = S.getrow(j).toarray()
        nz = (dI > 0) & (dJ > 0)
        S_mp[i, j] = (nz & (dI <= d) & (dJ <= d)).sum() / (nz.sum() - 1)

    S_mp += S_mp.T
    for i in range(n):
        S_mp[i, i] = self_value  # need to set self values
    return S_mp.tocsr()
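
# The parallel variant below re-binds the same function name, so keep a
# reference to this serial implementation for the hedged usage sketch
# (both names below are illustrative, not part of the original code).
_mp_empiric_sparse_serial = _mutual_proximity_empiric_sparse


def _demo_mp_empiric_sparse_serial():
    S = csr_matrix(np.array([[1.0, 0.8, 0.3],
                             [0.8, 1.0, 0.5],
                             [0.3, 0.5, 1.0]]))
    S_mp = _mp_empiric_sparse_serial(S)
    assert S_mp.shape == S.shape
    assert np.allclose(S_mp.diagonal(), 1.0)  # self-similarity is reset to 1
    return S_mp
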
def _mutual_proximity_empiric_sparse(S: csr_matrix,
                                     test_set_ind: np.ndarray = None,
                                     min_nnz=0,
                                     verbose: int = 0,
                                     log=None,
                                     n_jobs=None):
    """MP empiric for sparse similarity matrices.

    Please do not directly use this function, but invoke via
    mutual_proximity_empiric()
    """
    if verbose and log:
        log.message("Starting MP empiric for sparse matrices.")
    self_value = 1.  # similarity matrix
    n = S.shape[0]
    if not n_jobs:
        n_jobs = 1
    elif n_jobs == -1:
        n_jobs = cpu_count()
    else:
        pass

    # This will become S_mp.data
    shared_data = Array(ctypes.c_double, S.data.size)
    shared_data_np = np.ctypeslib.as_array(shared_data.get_obj())

    if verbose and log:
        log.message("Spawning processes and starting MP computation.")
    with Pool(processes=n_jobs,
              initializer=_mpes_init,
              initargs=(S, shared_data)) as pool:
        S_nonzero = filterfalse(lambda ij: ij[0] > ij[1], zip(*S.nonzero()))
        for _ in pool.imap(func=partial(_mpes_sec_dist,
                                        args=(verbose, log, n, min_nnz)),
                           iterable=S_nonzero,
                           chunksize=int(1e5)):
            pass  # output stored by function in shared array
    pool.join()

    if verbose and log:
        log.message("Assemble upper-triangular MP matrix.")
    S_mp = csr_matrix((shared_data_np, S.indices, S.indptr),
                      shape=S.shape,
                      copy=False).tolil()
    del shared_data, shared_data_np
    if verbose and log:
        log.message("Symmetrizing matrix.")
    S_mp += S_mp.T
    # Retain original distances for objects with too few neighbors.
    # That is, keep distances FROM these objects to others (rows), but
    # set distances of other objects TO them to NaN (columns).
    # Returned matrix is thus NOT SYMMETRIC.
    if verbose and log:
        log.message(("Retain original similarities for objects with too few "
                     "neighbors. If there are any, the output matrix will "
                     "not be symmetric anymore! (Rows corresponding to these "
                     "objects will be in original space; corresponding "
                     "columns will contain NaN)."))
    for row in np.argwhere(S.getnnz(axis=1) <= min_nnz):
        row = row[0]  # use scalar for indexing instead of array
        S_mp[row, :] = S.getrow(row)
    if verbose and log:
        log.message("Setting self similarities.")
    for i in range(n):
        S_mp[i, i] = self_value  # need to set self values
    if verbose and log:
        log.message("Converting to CSR matrix and returning.")
    return S_mp.tocsr()
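
# Design note: the pool above shares S and the output buffer with every worker
# once (via the initializer), so the per-pair tasks only ship (i, j) tuples
# instead of pickling matrices. `_mpes_init` and `_mpes_sec_dist` are not part
# of this snippet; the sketches below are assumptions inferred from the call
# sites, with the pairwise math mirroring the serial variant above.
def _mpes_init(S, shared_data):
    # Assumed initializer: publish shared state as worker-process globals.
    global _S_shared, _data_shared
    _S_shared = S
    _data_shared = shared_data


def _csr_offset(S, i, j):
    # Hypothetical helper: position of entry (i, j) within S.data.
    # Assumes canonical CSR with sorted indices and an existing (i, j) entry.
    start, stop = S.indptr[i], S.indptr[i + 1]
    return start + np.searchsorted(S.indices[start:stop], j)


def _mpes_sec_dist(ij, args=None):
    # Assumed worker: empirical MP value for one upper-triangular pair (i, j).
    verbose, log, n, min_nnz = args
    i, j = ij
    S = _S_shared
    d = S[j, i]
    dI = S.getrow(i).toarray()
    dJ = S.getrow(j).toarray()
    nz = (dI > 0) & (dJ > 0)
    mp = (nz & (dI <= d) & (dJ <= d)).sum() / (nz.sum() - 1)
    # Only the upper-triangular offset is written; the zero lower triangle is
    # filled in afterwards by the `S_mp += S_mp.T` symmetrization step above.
    _data_shared[_csr_offset(S, i, j)] = mp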