def score(D: np.ndarray, target: np.ndarray, k=5,
          metric: str = 'distance', test_set_ind: np.ndarray = None,
          verbose: int = 0):
    """Perform `k`-nearest neighbor classification.

    Use the ``n x n`` symmetric distance matrix `D` and target class
    labels `target` to perform a `k`-NN experiment (leave-one-out
    cross-validation or evaluation of test set; see parameter
    `test_set_ind`).

    Ties are broken by the nearest neighbor.

    Parameters
    ----------
    D : ndarray
        The ``n x n`` symmetric distance (similarity) matrix.
        A copy is made; the caller's matrix is not modified.

    target : ndarray (of dtype=int)
        The ``n x 1`` target class labels (ground truth).

    k : int or array_like (of dtype=int), optional (default: 5)
        Neighborhood size for `k`-NN classification.
        For each value in `k`, one `k`-NN experiment is performed.

        HINT: Providing more than one value for `k` is a cheap means to
        perform multiple `k`-NN experiments at once. Try e.g.
        ``k=[1, 5, 20]``.

    metric : {'distance', 'similarity'}, optional (default: 'distance')
        Define, whether matrix `D` is a distance or similarity matrix.

    test_set_ind : ndarray, optional (default: None)
        Define data points to be hold out as part of a test set. Can be:

        - None : Perform a LOO-CV experiment
        - ndarray : Hold out points indexed in this array as test set.
          Fit model to remaining data. Evaluate model on test set.

    verbose : int, optional (default: 0)
        Increasing level of output (progress report).

    Returns
    -------
    acc : ndarray (shape=(n_k x 1), dtype=float)
        Classification accuracy (`n_k`... number of items in parameter `k`)

        HINT: Referring to the above example...
        ... ``acc[0]`` gives the accuracy of the ``k=1`` experiment.

    corr : ndarray (shape=(n_k x n), dtype=float)
        Raw vectors of correctly classified items (1 = correct, 0 = not
        correct or not part of the test set).

        HINT: ... ``corr[1, :]`` gives these items for the ``k=5``
        experiment.

    cmat : ndarray (shape=(n_k x n_t x n_t), dtype=float)
        Confusion matrix (``n_t`` number of unique items in parameter
        target), indexed as ``cmat[k_index, true_class, predicted_class]``
        where classes are numbered by their rank among the sorted unique
        labels.

        HINT: ... ``cmat[2, :, :]`` gives the confusion matrix of
        the ``k=20`` experiment.

    Raises
    ------
    AttributeError
        If `k` is neither an object with a ``size`` attribute, an int,
        nor a list.
    """
    # Check input sanity
    log = Logging.ConsoleLogging()
    IO._check_distance_matrix_shape(D)
    IO._check_distance_matrix_shape_fits_labels(D, target)
    IO._check_valid_metric_parameter(metric)
    # `d_self` masks the self distance/similarity so that a point can never
    # be its own nearest neighbor; `sort_order` selects ascending
    # (distances) or descending (similarities) neighbor ranking.
    if metric == 'distance':
        d_self = np.inf
        sort_order = 1
    if metric == 'similarity':
        d_self = -np.inf
        sort_order = -1

    # Copy, because data is changed
    D = D.copy()
    target = target.astype(int)
    D_is_sparse = issparse(D)

    if verbose:
        log.message("Start k-NN experiment.")

    # Handle LOO-CV vs. test set mode
    if test_set_ind is None:
        n = D.shape[0]
        test_set_ind = range(n)
        # An int makes np.random.permutation() below permute range(n),
        # i.e. every point is a neighbor candidate.
        train_set_ind = n
    else:
        # Number of points to be classified
        n = test_set_ind.size
        # Training examples are ALL points of `D` that are not held out.
        # The candidates must be enumerated over D.shape[0]; using the
        # test set size here would yield a wrong (possibly empty)
        # training set.
        train_set_ind = np.setdiff1d(np.arange(D.shape[0]), test_set_ind)

    # Number of k-NN parameters
    try:
        k_length = k.size
    except AttributeError:
        if isinstance(k, int):
            k = np.array([k])
        elif isinstance(k, list):
            k = np.array(k)
        else:
            raise
        k_length = k.size

    acc = np.zeros((k_length, 1))
    corr = np.zeros((k_length, D.shape[0]))

    cl = np.sort(np.unique(target))
    cmat = np.zeros((k_length, len(cl), len(cl)))

    # Change labels to 0, 1, ..., len(cl)-1
    classes = target.copy()
    for idx, cur_class in enumerate(cl):
        classes[target == cur_class] = idx
    cl = range(len(cl))

    # Classify each point in test set
    for i in test_set_ind:
        seed_class = classes[i]

        if D_is_sparse:
            row = D.getrow(i).toarray().ravel()
        else:
            row = D[i, :]
        row[i] = d_self

        # Sort points in training set according to distance
        # Randomize, in case there are several points of same distance
        # (this is especially relevant for SNN rescaling)
        rp = np.random.permutation(train_set_ind)
        d2 = row[rp]
        d2idx = np.argsort(d2, axis=0)[::sort_order]
        idx = rp[d2idx]

        # More than one k is useful for cheap multiple k-NN experiments
        # at once
        for j in range(k_length):
            nn_class = classes[idx[0:k[j]]]
            cs = np.bincount(nn_class.astype(int))
            max_cs = np.where(cs == np.max(cs))[0]

            if len(max_cs) > 1:
                # "tie": use nearest neighbor
                predicted = nn_class[0]
            else:
                # majority vote
                predicted = cl[max_cs[0]]

            if predicted == seed_class:
                acc[j] += 1/n
                corr[j, i] = 1
            cmat[j, seed_class, predicted] += 1

    if verbose:
        log.message("Finished k-NN experiment.")

    return acc, corr, cmat
def predict(D: np.ndarray, target: np.ndarray, k=5,
            metric: str = 'distance', test_ind: np.ndarray = None,
            verbose: int = 0, sample_idx=None, return_cmat=True):
    """Perform `k`-nearest neighbor classification.

    Use the ``n x n`` symmetric distance matrix `D` and target class
    labels `target` to perform a `k`-NN experiment (leave-one-out
    cross-validation or evaluation of test set; see parameter `test_ind`).

    Ties are broken by the nearest neighbor.

    Parameters
    ----------
    D : ndarray
        The ``n x n`` symmetric distance (similarity) matrix.
        Dense input is copied before it is modified. Sparse input is NOT
        copied, so the self-entry masking done for `sample_idx` is
        written into the caller's matrix.

    target : ndarray (of dtype=int)
        The ``n x 1`` target class labels (ground truth) or
        ``n x c`` in case of ``c`` binarized multilabels.

    k : int or array_like (of dtype=int), optional (default: 5)
        Neighborhood size for `k`-NN classification.
        For each value in `k`, one `k`-NN experiment is performed.

        HINT: Providing more than one value for `k` is a cheap means to
        perform multiple `k`-NN experiments at once. Try e.g.
        ``k=[1, 5, 20]``.

    metric : {'distance', 'similarity'}, optional (default: 'distance')
        Define, whether matrix `D` is a distance or similarity matrix.

    test_ind : ndarray, optional (default: None)
        Define data points to be hold out as part of a test set. Can be:

        - None : Perform a LOO-CV experiment
        - ndarray : Hold out points indexed in this array as test set.
          Fit model to remaining data. Evaluate model on test set.

    verbose : int, optional (default: 0)
        Increasing level of output (progress report).

    sample_idx : array_like of int, optional (default: None)
        If given, neighbors are looked up among the columns of `D` and
        column ``j`` is labeled with ``target[sample_idx[j]]``; the entry
        ``D[sample_idx[j], j]`` is masked as self distance/similarity.
        (Presumably `D` is then an ``n x s`` matrix of distances to ``s``
        sample points - confirm against the caller.)
        Cannot be combined with `test_ind`.

    return_cmat : bool, optional, default: True
        If False, only return the predictions `y_pred`.
        Otherwise also return the confusion matrices.

    Returns
    -------
    y_pred : ndarray (shape=(n_k, n, c), dtype=int)
        Predicted classes (`n_k`... number of items in parameter `k`),
        expressed as the rank of the class among the sorted unique values
        of `target` (identical to the labels if these are ``0, 1, ...``).

        HINT: Referring to the above example...
        ... ``y_pred[0]`` gives the predictions of the ``k=1`` experiment.

    cmat : ndarray (shape=(n_k x c x n_t x n_t), dtype=int)
        Confusion matrix (``n_t`` number of unique items in parameter
        target). Only returned if `return_cmat` is true.

        HINT: ... ``cmat[2, 0, :, :]`` gives the confusion matrix of
        the first class in the ``k=20`` experiment in the following order:
            TN    FP
            FN    TP

    Raises
    ------
    NotImplementedError
        If both `test_ind` and `sample_idx` are given.
    AttributeError
        If `k` is neither an object with a ``size`` attribute, an int,
        nor a list.
    """
    # Check input sanity
    # NOTE: no check is performed here that `target` fits the shape of `D`.
    log = ConsoleLogging()
    if sample_idx is None:
        io.check_distance_matrix_shape(D)
    else:
        io.check_sample_shape_fits(D, sample_idx)
    io.check_valid_metric_parameter(metric)
    # `d_self` masks self distances/similarities; `sort_order` selects
    # ascending (distances) or descending (similarities) neighbor ranking.
    if metric == 'distance':
        d_self = np.inf
        sort_order = 1
    if metric == 'similarity':
        d_self = -np.inf
        sort_order = -1

    # Copy, because data is changed (dense matrices only)
    D_is_sparse = issparse(D)
    if not D_is_sparse:
        D = D.copy()
    target = target.astype(int)
    if target.ndim == 1:
        target = target[:, np.newaxis]

    if verbose:
        log.message("Start k-NN experiment.")

    # Handle LOO-CV vs. test set mode
    if test_ind is None:
        n = D.shape[0]
        test_set_ind = range(n)
        # An int makes np.random.permutation() below permute range(n).
        train_set_ind = n
    else:
        # The parameter is called `test_ind`; bind the local name used in
        # the remainder of the function (it was previously left unbound).
        test_set_ind = test_ind
        # Number of points to be classified
        n = test_set_ind.size
        # Training examples are ALL points of `D` that are not held out;
        # enumerate over D.shape[0], not over the test set size.
        train_set_ind = np.setdiff1d(np.arange(D.shape[0]), test_set_ind)
        if sample_idx is not None:
            raise NotImplementedError("Sample k-NN does not support train/"
                                      "test splits at the moment.")

    # Number of k-NN parameters
    try:
        k_length = k.size
    except AttributeError:
        if isinstance(k, int):
            k = np.array([k])
        elif isinstance(k, list):
            k = np.array(k)
        else:
            raise
        k_length = k.size

    cl = np.sort(np.unique(target))
    cmat = np.zeros((k_length, target.shape[1], len(cl), len(cl)), dtype=int)
    y_pred = np.zeros((k_length, *target.shape), dtype=int)

    # Change labels to 0, 1, ..., len(cl)-1
    classes = target.copy()
    for idx, cur_class in enumerate(np.array(cl).ravel()):
        classes[target == cur_class] = idx

    if sample_idx is not None:
        sample_classes = classes[sample_idx]
        # The values of this array are never read; the indexed assignment
        # only serves as a bounds check that raises on illegal sample
        # indices before `D` is touched.
        bounds_probe = np.ones(n, int)
        bounds_probe *= (n+1)
        bounds_probe[sample_idx] = np.arange(len(sample_idx))
        # Mask the self distance/similarity of each sample point
        for col, sample in enumerate(sample_idx):
            D[sample, col] = d_self
    cl = range(len(cl))

    # Classify each point in test set
    for i in test_set_ind:
        if verbose and ((i+1)%1000==0 or i+1==n):
            log.message("Prediction: {} of {}.".format(i+1, n), flush=True)

        if D_is_sparse:
            row = D.getrow(i)
            ind = row.nonzero()[1]
            row = row.toarray().ravel()
        else:
            row = D[i, :]
        if sample_idx is None:
            row[i] = d_self

        # Sort points in training set according to distance
        # Randomize, in case there are several points of same distance
        # (this is especially relevant for SNN rescaling)
        if sample_idx is None:
            rp = train_set_ind
        elif D_is_sparse:
            # only columns with nonzero entries are neighbor candidates
            rp = ind
        else:
            rp = np.arange(len(sample_idx))
        rp = np.random.permutation(rp)
        d2 = row[rp]
        d2idx = np.argsort(d2, axis=0)[::sort_order]
        d2idx = d2idx[~np.isnan(d2[d2idx])]  # filter NaN values
        idx = rp[d2idx]

        # More than one k is useful for cheap multiple k-NN experiments
        # at once
        for j in range(k_length):
            # Make sure no inf/-inf/nan values are used for classification
            finite_val = np.isfinite(row[idx[0:k[j]]])
            # However, if no values are finite, classify randomly
            if finite_val.sum() == 0:
                idx = np.random.permutation(idx)
                finite_val = np.ones_like(finite_val)
                log.warning("Query was classified randomly, because all "
                            "distances were non-finite numbers.")
            for l in range(target.shape[1]):
                if sample_idx is None:
                    l_classes = classes[:, l]
                    nn_class = l_classes[idx[0:k[j]]][finite_val]
                else:
                    l_sample_classes = sample_classes[:, l]
                    nn_class = l_sample_classes[idx[0:k[j]]][finite_val]
                cs = np.bincount(nn_class.astype(int))
                max_cs = np.where(cs == np.max(cs))[0]
                seed_class = classes[i, l]

                if len(max_cs) > 1:
                    # "tie": use nearest neighbor
                    predicted = nn_class[0]
                else:
                    # majority vote
                    predicted = cl[max_cs[0]]
                y_pred[j, i, l] = predicted
                cmat[j, l, seed_class, predicted] += 1

    if verbose:
        log.message("Finished k-NN experiment.")

    if return_cmat:
        return y_pred, cmat
    else:
        return y_pred
def _mutual_proximity_gammai_sparse(S: np.ndarray, min_nnz: int = 30,
                                    test_set_ind: np.ndarray = None,
                                    verbose: int = 0, log=None):
    """MP gammai for sparse similarity matrices.

    Please do not directly use this function, but invoke via
    mutual_proximity_gammai()

    Parameters
    ----------
    S : sparse matrix
        Square similarity matrix whose diagonal (self similarities) must
        be exactly 1. Entries that are not stored are treated as missing.
    min_nnz : int, optional (default: 30)
        Rows with at most this many stored entries are not rescaled.
    test_set_ind : ndarray, optional (default: None)
        Rows to exclude when estimating the per-column parameters.
        With None, all rows are used.
    verbose : int, optional (default: 0)
        If truthy (and `log` is given), report progress every 1000 rows
        and at the last row.
    log : object, optional (default: None)
        Logger providing ``message(str, flush=...)``.

    Returns
    -------
    S_mp : csr matrix (dtype float32)
        Rescaled similarities. Rows of objects with too few stored
        entries hold the original similarities, hence the matrix is
        NOT SYMMETRIC in general.

    Raises
    ------
    ValueError
        If any self similarity differs from 1.
    """
    n_points = S.shape[0]
    half_self_sim = 1. / 2.

    if test_set_ind is not None:
        fit_rows = np.setdiff1d(np.arange(n_points), test_set_ind)
    else:
        fit_rows = slice(0, n_points)

    # Column statistics are computed WITHOUT zero values (missing values),
    # using ddof=1. (Statistics WITH zero values could be obtained from
    # sklearn's csr_mean_variance_axis0 instead.)
    diag = S.diagonal()
    if not (diag.max() == 1. and diag.min() == 1.):
        raise ValueError("Self similarities must be 1.")

    S_fit = S[fit_rows]
    # Stored entries per column; the -1 (here and in the sums below)
    # accounts for self similarities that must be excluded from the calc.
    n_stored = S_fit.getnnz(0) - 1
    col_mean = np.array((S_fit.sum(0) - 1) / n_stored).ravel()
    mean_squared = col_mean**2

    S_squared = S_fit.copy()
    S_squared.data **= 2
    second_moment = np.array((S_squared.sum(0) - 1) / n_stored).ravel()
    del S_squared  # release the squared copy early

    # n / (n - 1) factor for an unbiased sample variance
    col_var = n_stored / (n_stored - 1) * (second_moment - mean_squared)

    # Per-column parameters handed to _local_gamcdf (presumably the gamma
    # shape and scale from moment matching - confirm in _local_gamcdf).
    A = mean_squared / col_var
    B = col_var / col_mean
    A[A < 0] = np.nan
    B[B <= 0] = np.nan

    S_mp = lil_matrix(S.shape, dtype=np.float32)
    row_nnz = S.getnnz(axis=1)  # stored entries per row

    for cur in range(n_points):
        done = cur + 1
        if verbose and log and (done % 1000 == 0 or done == n_points):
            log.message("MP_gammai: {} of {}".format(done, n_points),
                        flush=True)
        later = slice(done, n_points)

        # Dense copy of the current row right of the diagonal
        sim_out = S[cur, later].toarray().ravel()
        upper = np.empty(n_points - cur)
        # Half of the self similarity; symmetrisation below doubles it.
        upper[0] = half_self_sim
        if row_nnz[cur] > min_nnz:
            p_out = _local_gamcdf(sim_out, A[cur], B[cur])
            # Column below the diagonal, evaluated with the parameters of
            # each of the later objects (vectorized).
            sim_in = S[later, cur].toarray().ravel()
            p_in = _local_gamcdf(sim_in, A[later], B[later])
            upper[1:] = (p_out * p_in).ravel()
        else:
            # Too few neighbors: do not rescale this object
            upper[1:] = np.nan
        S_mp[cur, cur:] = upper

    # Only the upper triangle was filled; mirror it.
    S_mp += S_mp.T

    # Retain original similarities for objects with too few neighbors:
    # their rows are overwritten with the rows of `S`, while NaN values
    # already mirrored into their columns stay in place.
    for (few_row,) in np.argwhere(row_nnz <= min_nnz):
        S_mp[few_row, :] = S.getrow(few_row)

    return S_mp.tocsr()
def score(D: np.ndarray, target: np.ndarray, k=5,
          metric: str = 'distance', test_set_ind: np.ndarray = None,
          verbose: int = 0, sample_idx=None, filter_self=True):
    """Perform `k`-nearest neighbor classification.

    Use the ``n x n`` symmetric distance matrix `D` and target class
    labels `target` to perform a `k`-NN experiment (leave-one-out
    cross-validation or evaluation of test set; see parameter
    `test_set_ind`).

    Ties are broken by the nearest neighbor.

    Parameters
    ----------
    D : ndarray
        The ``n x n`` symmetric distance (similarity) matrix.
        A copy is made; the caller's matrix is not modified.
        For sparse `D`, neighbors are always ranked by descending stored
        value, and the candidates are the stored entries of the row
        (they are not restricted to the training set).

    target : ndarray (of dtype=int)
        The ``n x 1`` target class labels (ground truth).

    k : int or array_like (of dtype=int), optional (default: 5)
        Neighborhood size for `k`-NN classification.
        For each value in `k`, one `k`-NN experiment is performed.

        HINT: Providing more than one value for `k` is a cheap means to
        perform multiple `k`-NN experiments at once. Try e.g.
        ``k=[1, 5, 20]``.

    metric : {'distance', 'similarity'}, optional (default: 'distance')
        Define, whether matrix `D` is a distance or similarity matrix.

    test_set_ind : ndarray, optional (default: None)
        Define data points to be hold out as part of a test set. Can be:

        - None : Perform a LOO-CV experiment
        - ndarray : Hold out points indexed in this array as test set.
          Fit model to remaining data. Evaluate model on test set.

    verbose : int, optional (default: 0)
        Increasing level of output (progress report).

    sample_idx : array_like of int, optional (default: None)
        If given, neighbors are looked up among the columns of `D` and
        column ``j`` is labeled with ``target[sample_idx[j]]``; the entry
        ``D[sample_idx[j], j]`` is masked as self distance/similarity.
        (Presumably `D` is then an ``n x s`` matrix of distances to ``s``
        sample points - confirm against the caller.)
        Cannot be combined with `test_set_ind`.

    filter_self : bool, optional, default: True
        Remove self similarities from sparse ``D``.
        This assumes that the highest similarity per row is the self
        similarity.

        NOTE: Quadratic dense matrices are always filtered for self
        distances/similarities, even if `filter_self` is set to `False`.

    Returns
    -------
    acc : ndarray (shape=(n_k x 1), dtype=float)
        Classification accuracy (`n_k`... number of items in parameter `k`)

        HINT: Referring to the above example...
        ... ``acc[0]`` gives the accuracy of the ``k=1`` experiment.

    corr : ndarray (shape=(n_k x n), dtype=float)
        Raw vectors of correctly classified items

        HINT: ... ``corr[1, :]`` gives these items for the ``k=5``
        experiment.

    cmat : ndarray (shape=(n_k x n_t x n_t), dtype=float)
        Confusion matrix (``n_t`` number of unique items in parameter
        target). For sparse `D` the matrix has additional rows/columns
        for the 'unknown' class.

        HINT: ... ``cmat[2, :, :]`` gives the confusion matrix of
        the ``k=20`` experiment.

    Raises
    ------
    NotImplementedError
        If both `test_set_ind` and `sample_idx` are given.
    AttributeError
        If `k` is neither an object with a ``size`` attribute, an int,
        nor a list.
    """
    # Check input sanity
    log = ConsoleLogging()
    if sample_idx is None:
        io.check_distance_matrix_shape(D)
    else:
        io.check_sample_shape_fits(D, sample_idx)
    io.check_distance_matrix_shape_fits_labels(D, target)
    io.check_valid_metric_parameter(metric)
    # `d_self` masks self distances/similarities; `sort_order` selects
    # ascending (distances) or descending (similarities) neighbor ranking.
    if metric == 'distance':
        d_self = np.inf
        sort_order = 1
    if metric == 'similarity':
        d_self = -np.inf
        sort_order = -1

    # Copy, because data is changed
    D = D.copy()
    target = target.astype(int)
    D_is_sparse = issparse(D)

    if verbose:
        log.message("Start k-NN experiment.")

    # Handle LOO-CV vs. test set mode
    if test_set_ind is None:
        n = D.shape[0]
        test_set_ind = range(n)
        # An int makes np.random.permutation() below permute range(n).
        train_set_ind = n
    else:
        # Number of points to be classified
        n = test_set_ind.size
        # Training examples are ALL points of `D` that are not held out;
        # enumerate over D.shape[0], not over the test set size.
        train_set_ind = np.setdiff1d(np.arange(D.shape[0]), test_set_ind)
        if sample_idx is not None:
            raise NotImplementedError("Sample k-NN does not support train/"
                                      "test splits at the moment.")

    # Number of k-NN parameters
    try:
        k_length = k.size
    except AttributeError:
        if isinstance(k, int):
            k = np.array([k])
        elif isinstance(k, list):
            k = np.array(k)
        else:
            raise
        k_length = k.size

    acc = np.zeros((k_length, 1))
    corr = np.zeros((k_length, D.shape[0]))

    cl = np.sort(np.unique(target))
    if D_is_sparse:
        # Add a label for unknown class (object w/o nonzero sim to any
        # others)
        cl = np.append(cl, cl.max()+1)
        # NOTE(review): `cl` already contains the unknown class at this
        # point, so the `+ 1` looks like it allocates one spare, never
        # used row/column. Kept as is, because callers may rely on the
        # returned shape - confirm before changing.
        n_classes = len(cl) + 1
    else:
        n_classes = len(cl)
    cmat = np.zeros((k_length, n_classes, n_classes))

    # Change labels to 0, 1, ..., len(cl)-1
    classes = target.copy()
    for idx, cur_class in enumerate(cl):
        classes[target == cur_class] = idx

    if sample_idx is not None:
        sample_classes = classes[sample_idx]
        # The values of this array are never read; the indexed assignment
        # only serves as a bounds check that raises on illegal sample
        # indices before `D` is touched.
        bounds_probe = np.ones(n, int)
        bounds_probe *= (n+1)
        bounds_probe[sample_idx] = np.arange(len(sample_idx))
        # Mask the self distance/similarity of each sample point
        for col, sample in enumerate(sample_idx):
            D[sample, col] = d_self
    cl = range(len(cl))

    # Number of randomly classified queries, per value of k
    rnd_classif = np.zeros(k_length)

    # Classify each point in test set
    for i in test_set_ind:
        if verbose and ((i+1)%1000==0 or i+1==n):
            log.message("Prediction: {} of {}.".format(i+1, n), flush=True)
        seed_class = classes[i]

        if D_is_sparse:
            row = D.getrow(i)
            nnz = row.nnz
            if nnz == 0:
                # No stored entry at all: there is no neighbor candidate,
                # so the query ends up in the 'unknown' class below
                # (np.argpartition would fail on an empty array).
                idx = np.array([], dtype=int)
            else:
                # Randomize, in case there are several points of same
                # similarity
                rp = np.random.permutation(nnz)
                d2 = row.data[rp]
                # Partition for each k value
                kth = nnz - k - 1
                # sort the two highest similarities to end
                kth = np.append(kth, [nnz-2, nnz-1])
                # Clip negative indices (nnz < k)
                np.clip(kth, a_min=0, a_max=nnz-1, out=kth)
                # Remove duplicate k values and sort
                kth = np.unique(kth)
                d2idx = np.argpartition(d2, kth=kth)
                d2idx = d2idx[~np.isnan(d2[d2idx])][::-1]
                idx = row.nonzero()[1][rp[d2idx]]
                if filter_self:
                    # The highest similarity is assumed to be the self
                    # similarity; honor `filter_self` (it was previously
                    # ignored and the entry dropped unconditionally).
                    idx = idx[1:]
        else:
            row = D[i, :]
            if sample_idx is None:
                row[i] = d_self
                rp = train_set_ind
            else:
                rp = np.arange(len(sample_idx))
            # Sort points in training set according to distance
            # Randomize, in case there are several points of same distance
            # (this is especially relevant for SNN rescaling)
            rp = np.random.permutation(rp)
            d2 = row[rp]
            d2idx = np.argsort(d2, axis=0)[::sort_order]
            d2idx = d2idx[~np.isnan(d2[d2idx])]  # filter NaN values
            idx = rp[d2idx]

        # More than one k is useful for cheap multiple k-NN experiments
        # at once
        for j in range(k_length):
            # Make sure no inf/-inf/nan values are used for classification
            if D_is_sparse:
                finite_val = np.isfinite(
                    row[0, idx[0:k[j]]].toarray().ravel())
            else:
                finite_val = np.isfinite(row[idx[0:k[j]]])
            # However, if no values are finite, classify randomly
            if finite_val.sum() == 0:
                idx = np.random.permutation(idx)
                finite_val = np.ones_like(finite_val)
                rnd_classif[j] += 1

            if sample_idx is None:
                nn_class = classes[idx[0:k[j]]][finite_val]
            else:
                nn_class = sample_classes[idx[0:k[j]]][finite_val]
            cs = np.bincount(nn_class.astype(int))
            if cs.size > 0:
                max_cs = np.where(cs == np.max(cs))[0]
            else:
                # No neighbor available: use the last class index as
                # misclassification label (the 'unknown' class for
                # sparse `D`).
                max_cs = np.array([len(cl) - 1])

            if len(max_cs) > 1:
                # "tie": use nearest neighbor
                predicted = nn_class[0]
            else:
                # majority vote
                predicted = cl[max_cs[0]]

            if predicted == seed_class:
                acc[j] += 1/n
                corr[j, i] = 1
            cmat[j, seed_class, predicted] += 1

    if np.any(rnd_classif):
        for x in rnd_classif:
            log.warning(("{} queries were classified randomly, because all "
                         "distances were non-finite numbers.").format(x))
    if verbose:
        log.message("Finished k-NN experiment.")

    return acc, corr, cmat
def _mutual_proximity_gumbel_sparse(S: np.ndarray, min_nnz: int = 30,
                                    test_set_ind: np.ndarray = None,
                                    verbose: int = 0, log=None):
    """MP Gumbel for sparse similarity matrices.

    Please do not directly use this function, but invoke via
    mutual_proximity_gumbel()

    Parameters
    ----------
    S : sparse matrix
        Square similarity matrix whose diagonal (self similarities) must
        be exactly 1. Entries that are not stored are treated as missing.
    min_nnz : int, optional (default: 30)
        Rows with at most this many stored entries are not rescaled.
    test_set_ind : ndarray, optional (default: None)
        Rows to exclude when estimating the per-column parameters.
        With None, all rows are used.
    verbose : int, optional (default: 0)
        If truthy (and `log` is given), report progress every 1000 rows
        and at the last row.
    log : object, optional (default: None)
        Logger providing ``message(str, flush=...)``.

    Returns
    -------
    S_mp : csr matrix (dtype float32)
        Rescaled similarities. Rows of objects with too few stored
        entries hold the original similarities, hence the matrix is
        NOT SYMMETRIC in general.

    Raises
    ------
    ValueError
        If any self similarity differs from 1.
    """
    n_points = S.shape[0]
    half_self_sim = 1. / 2.

    if test_set_ind is not None:
        fit_rows = np.setdiff1d(np.arange(n_points), test_set_ind)
    else:
        fit_rows = slice(0, n_points)

    # Column statistics are computed WITHOUT zero values (missing values),
    # using ddof=1.
    diag = S.diagonal()
    if not (diag.max() == 1. and diag.min() == 1.):
        raise ValueError("Self similarities must be 1.")

    S_fit = S[fit_rows]
    # Stored entries per column; the -1 (here and in the sums below)
    # accounts for self similarities that must be excluded from the calc.
    n_stored = S_fit.getnnz(0) - 1
    col_mean = np.array((S_fit.sum(0) - 1) / n_stored).ravel()
    mean_squared = col_mean**2

    S_squared = S_fit.copy()
    S_squared.data **= 2
    second_moment = np.array((S_squared.sum(0) - 1) / n_stored).ravel()
    del S_squared  # release the squared copy early

    # n / (n - 1) factor for an unbiased sample variance
    col_var = n_stored / (n_stored - 1) * (second_moment - mean_squared)
    col_std = np.sqrt(col_var)

    # Per-column parameters handed to _gumbelcdf, derived from the column
    # mean and standard deviation.
    # Euler-Mascheroni gamma=.57721566490153286 (https://oeis.org/A001620)
    beta_hat = col_std * np.sqrt(6) / np.pi
    mu_hat = col_mean - np.euler_gamma * beta_hat

    S_mp = lil_matrix(S.shape, dtype=np.float32)
    row_nnz = S.getnnz(axis=1)  # stored entries per row

    for cur in range(n_points):
        done = cur + 1
        if verbose and log and (done % 1000 == 0 or done == n_points):
            log.message("MP_gumbel: {} of {}".format(done, n_points),
                        flush=True)
        later = slice(done, n_points)

        # Dense copy of the current row right of the diagonal
        sim_out = S[cur, later].toarray().ravel()
        upper = np.empty(n_points - cur)
        # Half of the self similarity; symmetrisation below doubles it.
        upper[0] = half_self_sim
        if row_nnz[cur] > min_nnz:
            # Rescale iff there are enough neighbors for current point
            p_out = _gumbelcdf(sim_out, mu_hat[cur], beta_hat[cur])
            # Zero similarities stay zero in the rescaled matrix
            p_out[sim_out == 0] = 0.
            # Column below the diagonal, evaluated with the parameters of
            # each of the later objects (vectorized).
            sim_in = S[later, cur].toarray().ravel()
            p_in = _gumbelcdf(sim_in, mu_hat[later], beta_hat[later])
            p_in[sim_in == 0] = 0.
            upper[1:] = (p_out * p_in).ravel()
        else:
            upper[1:] = np.nan
        S_mp[cur, cur:] = upper

    # Only the upper triangle was filled; mirror it.
    S_mp += S_mp.T

    # Retain original similarities for objects with too few neighbors:
    # their rows are overwritten with the rows of `S`, while NaN values
    # already mirrored into their columns stay in place.
    for (few_row,) in np.argwhere(row_nnz <= min_nnz):
        S_mp[few_row, :] = S.getrow(few_row)

    return S_mp.tocsr()