Ejemplo n.º 1
0
    def __init__(self, D:np.ndarray, secondary_distance_type:str,
                 metric:str='distance', classes:np.ndarray=None,
                 vectors:np.ndarray=None):
        """Set up a hubness experiment on the matrix `D`.

        All arguments are validated before any instance state is written,
        so a failed check leaves the object without attributes.

        Parameters
        ----------
        D : ndarray
            Distance (or similarity) matrix; its first dimension is
            stored as ``self.n``.
        secondary_distance_type : str
            Must be one of the keys of ``SEC_DIST``.
        metric : str, optional (default: 'distance')
            Passed to ``IO._check_valid_metric_parameter`` for validation.
        classes : ndarray, optional (default: None)
            Class labels; when given, checked against `D` for a fitting
            shape.
        vectors : ndarray, optional (default: None)
            Vector data; when given, checked against `D` and its second
            dimension is stored as ``self.embedding_dim``.

        Raises
        ------
        ValueError
            If `secondary_distance_type` is not a key of ``SEC_DIST``.
        """
        # --- Validation -------------------------------------------------
        IO._check_distance_matrix_shape(D)
        IO._check_valid_metric_parameter(metric)
        if secondary_distance_type not in SEC_DIST.keys():
            raise ValueError("Requested secondary distance type unknown.")
        if classes is not None:
            IO._check_distance_matrix_shape_fits_labels(D, classes)

        # The embedding dimension is only known when vectors are supplied.
        embedding_dim = None
        if vectors is not None:
            IO._check_distance_matrix_shape_fits_vectors(D, vectors)
            embedding_dim = vectors.shape[1]

        # --- Inputs -----------------------------------------------------
        self.n = D.shape[0]
        self.metric = metric
        self.original_distance = D
        self.secondary_distance_type = secondary_distance_type
        self.classes = classes
        self.vectors = vectors
        self.embedding_dim = embedding_dim

        # --- Results, filled in later by other methods ------------------
        self.secondary_distance = None
        self.gk_index = None
        self.hubness = {}
        self.anti_hubs = {}
        self.max_hub_k_occurence = {}
        self.knn_accuracy = {}
Ejemplo n.º 2
0
 def test_check_dist_vs_classes(self):
     """A 5x5 matrix paired with 4 labels must be rejected with TypeError."""
     # Build the fixtures outside the assertRaises context: only the call
     # under test may satisfy the expectation. Otherwise a TypeError raised
     # during setup would make this test pass for the wrong reason.
     D = np.empty((5, 5))
     classes = np.empty(4)
     with self.assertRaises(TypeError):
         IO._check_distance_matrix_shape_fits_labels(D, classes)
Ejemplo n.º 3
0
def score(D:np.ndarray, target:np.ndarray, k=5, 
          metric:str='distance', test_set_ind:np.ndarray=None, verbose:int=0):
    """Perform `k`-nearest neighbor classification.
    
    Use the ``n x n`` symmetric distance matrix `D` and target class 
    labels `target` to perform a `k`-NN experiment (leave-one-out 
    cross-validation or evaluation of test set; see parameter `test_set_ind`).
    Ties are broken by the nearest neighbor.
    
    Parameters
    ----------
    D : ndarray
        The ``n x n`` symmetric distance (similarity) matrix. The matrix
        is not modified; each processed row is copied (as float).
    
    target : ndarray (of dtype=int)
        The ``n x 1`` target class labels (ground truth).
    
    k : int or array_like (of dtype=int), optional (default: 5)
        Neighborhood size for `k`-NN classification.
        For each value in `k`, one `k`-NN experiment is performed.
        
        HINT: Providing more than one value for `k` is a cheap means to perform 
        multiple `k`-NN experiments at once. Try e.g. ``k=[1, 5, 20]``.
    
    metric : {'distance', 'similarity'}, optional (default: 'distance')
        Define, whether matrix `D` is a distance or similarity matrix
    
    test_set_ind : ndarray, optional (default: None)
        Define data points to be held out as part of a test set. Can be:
        
        - None : Perform a LOO-CV experiment
        - ndarray : Hold out points indexed in this array as test set. Fit 
          model to remaining data. Evaluate model on test set.
    
    verbose : int, optional (default: 0)
        Increasing level of output (progress report).
    
    Returns
    -------
    acc : ndarray (shape=(n_k x 1), dtype=float)
        Classification accuracy (`n_k`... number of items in parameter `k`)
        
        HINT: Referring to the above example... 
        ... ``acc[0]`` gives the accuracy of the ``k=1`` experiment.
    corr : ndarray (shape=(n_k x n), dtype=float)
        Raw vectors of correctly classified items (1 if correct, else 0)
        
        HINT: ... ``corr[1, :]`` gives these items for the ``k=5`` experiment.
    cmat : ndarray (shape=(n_k x n_t x n_t), dtype=float) 
        Confusion matrix counts, indexed ``[experiment, true, predicted]``
        (``n_t`` number of unique items in parameter target)
        
        HINT: ... ``cmat[2, :, :]`` gives the confusion matrix of 
        the ``k=20`` experiment.

    Raises
    ------
    AttributeError
        If `k` is neither an int, a list, nor an object with a ``size``
        attribute.
    """
    
    # Check input sanity
    log = Logging.ConsoleLogging()
    IO._check_distance_matrix_shape(D)
    IO._check_distance_matrix_shape_fits_labels(D, target)
    IO._check_valid_metric_parameter(metric)
    if metric == 'distance':
        d_self = np.inf     # self distance: pushes a point to the end
        sort_order = 1      # ascending: smallest distance first
    elif metric == 'similarity':
        d_self = -np.inf
        sort_order = -1     # descending: largest similarity first
    
    target = target.astype(int)
    n_total = D.shape[0]
    sparse_input = issparse(D)
    
    if verbose:
        log.message("Start k-NN experiment.")
    # Handle LOO-CV vs. test set mode
    if test_set_ind is None:
        # Every point is classified against all others; the point itself 
        # is neutralized through `d_self` below.
        n = n_total
        test_set_ind = range(n_total)
        train_set_ind = np.arange(n_total)
    else:
        test_set_ind = np.asarray(test_set_ind)
        # number of points to be classified
        n = test_set_ind.size
        # Indices of training examples: all points NOT in the test set. 
        # The candidate pool must span the whole data set (n_total), 
        # not merely the first `n` (= number of test points) indices.
        train_set_ind = np.setdiff1d(np.arange(n_total), test_set_ind)
    
    # Number of k-NN parameters
    try:
        k.size
    except AttributeError:
        if isinstance(k, (int, list)):
            k = np.array(k)
        else:
            raise
    # Also turns 0-d arrays / NumPy scalars into indexable 1-d arrays
    k = np.atleast_1d(k)
    k_length = k.size
        
    acc = np.zeros((k_length, 1))
    corr = np.zeros((k_length, n_total))
        
    # np.unique returns the sorted distinct labels
    cl = np.unique(target)
    cmat = np.zeros((k_length, len(cl), len(cl)))
    
    # change labels to 0, 1, ..., len(cl)-1 (position in sorted `cl`)
    classes = np.searchsorted(cl, target)
    
    # Classify each point in test set
    for i in test_set_ind:
        seed_class = classes[i]
        
        # Work on a float copy of the row: the caller's matrix stays 
        # untouched and +/-inf can be stored even for integer input.
        if sparse_input:
            row = D.getrow(i).toarray().ravel().astype(float, copy=False)
        else:
            row = np.array(D[i, :], dtype=float)
        row[i] = d_self
        
        # Sort points in training set according to distance
        # Randomize, in case there are several points of same distance
        # (this is especially relevant for SNN rescaling)
        rp = np.random.permutation(train_set_ind)
        d2idx = np.argsort(row[rp], axis=0)[::sort_order]
        idx = rp[d2idx]
        
        # More than one k is useful for cheap multiple k-NN experiments at once
        for j in range(k_length):
            nn_class = classes[idx[0:k[j]]]
            cs = np.bincount(nn_class)
            max_cs = np.where(cs == np.max(cs))[0]
            
            if len(max_cs) > 1:
                # "tie": use nearest neighbor
                predicted = nn_class[0]
            else:
                # majority vote
                predicted = max_cs[0]
            
            if predicted == seed_class:
                acc[j] += 1/n
                corr[j, i] = 1
            cmat[j, seed_class, predicted] += 1
                       
    if verbose:
        log.message("Finished k-NN experiment.")
        
    return acc, corr, cmat