def __init__(self,
             n_neighbors=1,
             metric='euclidean',
             metric_kwargs=None,
             shared_nearest_neighbors=False,
             approx_nearest_neighbors=True,
             n_jobs=1,
             seed_rng=123):
    """
    Record the nearest-neighbor settings and create the (initially empty) state attributes.

    :param n_neighbors: int value specifying the number of nearest neighbors. Should be >= 1.
                        The value is stored as given; it is not validated here.
    :param metric: string or a callable that specifies the distance metric.
    :param metric_kwargs: optional dictionary of keyword arguments required by the distance metric.
    :param shared_nearest_neighbors: Set to True in order to use the shared nearest neighbor (SNN) distance.
                                     This is a secondary distance metric that is found to be better suited to
                                     high dimensional data.
    :param approx_nearest_neighbors: Set to True in order to use an approximate nearest neighbor algorithm to
                                     find the nearest neighbors. This is recommended when the number of points
                                     is large and/or when the dimension of the data is high.
    :param n_jobs: Number of parallel jobs or processes. Set to -1 to use all the available cpu cores.
                   The value is resolved through `get_num_jobs` before it is stored.
    :param seed_rng: int value specifying the seed for the random number generator.
    """
    # Neighbor search configuration, stored exactly as supplied
    self.n_neighbors = n_neighbors
    self.metric, self.metric_kwargs = metric, metric_kwargs
    self.shared_nearest_neighbors = shared_nearest_neighbors
    self.approx_nearest_neighbors = approx_nearest_neighbors

    # The requested job count is translated by `get_num_jobs`
    self.n_jobs = get_num_jobs(n_jobs)
    self.seed_rng = seed_rng

    # State attributes start out as `None`; they are populated outside this method
    # (presumably when the model is fit -- not visible from here).
    self.index_knn = self.y_train = self.n_classes = None
    self.labels_dtype = self.label_enc = self.label_dec = None
def __init__(self,
             neighborhood_constant=0.4,
             n_neighbors=None,
             metric='euclidean',
             metric_kwargs=None,
             approx_nearest_neighbors=True,
             n_jobs=1,
             seed_rng=123):
    """
    Record the neighborhood settings and create the (initially empty) state attributes.

    :param neighborhood_constant: float value in (0, 1), that specifies the number of nearest neighbors as a
                                  function of the number of samples (data size). If `N` is the number of
                                  samples, then the number of neighbors is set to `N^neighborhood_constant`.
                                  It is recommended to set this value in the range 0.4 to 0.5.
    :param n_neighbors: None or int value specifying the number of nearest neighbors. If this value is
                        specified, the `neighborhood_constant` is ignored. It is sufficient to specify either
                        `neighborhood_constant` or `n_neighbors`.
    :param metric: string or a callable that specifies the distance metric.
    :param metric_kwargs: optional dictionary of keyword arguments required by the distance metric.
    :param approx_nearest_neighbors: Set to True in order to use an approximate nearest neighbor algorithm to
                                     find the nearest neighbors. This is recommended when the number of points
                                     is large and/or when the dimension of the data is high.
    :param n_jobs: Number of parallel jobs or processes. Set to -1 to use all the available cpu cores.
                   The value is resolved through `get_num_jobs` before it is stored.
    :param seed_rng: int value specifying the seed for the random number generator.
    """
    # Neighborhood size can be given either way; both values are kept as supplied
    self.neighborhood_constant, self.n_neighbors = neighborhood_constant, n_neighbors
    self.metric, self.metric_kwargs = metric, metric_kwargs
    self.approx_nearest_neighbors = approx_nearest_neighbors

    # The requested job count is translated by `get_num_jobs`
    self.n_jobs = get_num_jobs(n_jobs)
    self.seed_rng = seed_rng

    # State attributes start out as `None`; they are populated outside this method
    self.num_samples = self.index_knn = self.lid_nominal = None
def __init__(self, data,
             neighborhood_constant=0.4,
             n_neighbors=None,
             metric='euclidean',
             metric_kwargs=None,
             shared_nearest_neighbors=False,
             approx_nearest_neighbors=True,
             n_jobs=1,
             seed_rng=123):
    """
    Store the data and the neighbor-search settings, derive the neighbor counts, and build the KNN index.

    :param data: numpy array with the data samples. Has shape `(N, d)`, where `N` is the number of samples and
                 `d` is the number of features.
    :param neighborhood_constant: float value in (0, 1), that specifies the number of nearest neighbors as a
                                  function of the number of samples (data size). If `N` is the number of
                                  samples, then the number of neighbors is set to `N^neighborhood_constant`.
                                  It is recommended to set this value in the range 0.4 to 0.5.
    :param n_neighbors: None or int value specifying the number of nearest neighbors. If this value is
                        specified, the `neighborhood_constant` is ignored. It is sufficient to specify either
                        `neighborhood_constant` or `n_neighbors`.
    :param metric: string or a callable that specifies the distance metric.
    :param metric_kwargs: optional dictionary of keyword arguments required by the distance metric.
    :param shared_nearest_neighbors: Set to True in order to use the shared nearest neighbor (SNN) distance.
                                     This is a secondary distance metric that is found to be better suited to
                                     high dimensional data.
    :param approx_nearest_neighbors: Set to True in order to use an approximate nearest neighbor algorithm to
                                     find the nearest neighbors. This is recommended when the number of points
                                     is large and/or when the dimension of the data is high.
    :param n_jobs: Number of parallel jobs or processes. Set to -1 to use all the available cpu cores.
                   The value is resolved through `get_num_jobs` before it is stored.
    :param seed_rng: int value specifying the seed for the random number generator.
    """
    self.data = data
    self.neighborhood_constant, self.n_neighbors = neighborhood_constant, n_neighbors
    self.metric, self.metric_kwargs = metric, metric_kwargs
    self.shared_nearest_neighbors = shared_nearest_neighbors
    self.approx_nearest_neighbors = approx_nearest_neighbors
    self.n_jobs = get_num_jobs(n_jobs)
    self.seed_rng = seed_rng

    # Unpacking into two names requires `data` to be 2-dimensional; only the sample count is used below
    num_samples, _ = data.shape
    if n_neighbors is None:
        # No explicit neighbor count: derive it as ceil(N ^ neighborhood_constant)
        self.n_neighbors = int(np.ceil(num_samples ** neighborhood_constant))

    # The shared nearest neighbor distance uses 20% more neighbors (truncated to an int),
    # but never more than `N - 1`
    padded_neighbors = int(1.2 * self.n_neighbors)
    self.n_neighbors_snn = min(padded_neighbors, num_samples - 1)

    # The index is built last because it happens after every setting above is in place
    self.index_knn = self.build_knn_index(data)
def knn_parameter_search(data, labels, k_range,
                         dim_proj_range=None,
                         method_proj=None,
                         num_cv_folds=5,
                         metric='euclidean', metric_kwargs=None,
                         shared_nearest_neighbors=False,
                         approx_nearest_neighbors=True,
                         skip_preprocessing=False,
                         pca_cutoff=1.0,
                         n_jobs=-1,
                         seed_rng=123):
    """
    Search for the best value of `k` (number of neighbors) of a KNN classifier using cross-validation. Error rate
    is the metric. Optionally, you can also search over a range of reduced data dimensions via the parameters
    `dim_proj_range` and `method_proj`. In this case, the dimensionality reduction method `method_proj` is applied
    to reduce the dimension of the data to the specified values, and the search is done over both `k` and the
    dimension.

    :param data: numpy array with the training data of shape `(N, d)`, where `N` is the number of samples
                 and `d` is the dimension.
    :param labels: numpy array with the training labels of shape `(N, )`.
    :param k_range: non-empty list or array with the k values to search over. Expected to be sorted in increasing
                    values, because the last element is used as the maximum `k` when fitting the classifier.
    :param dim_proj_range: None, an int, or a list/array with the dimension values to search over. Set to `None`
                           if there is no need to search over the data dimension. It is required when
                           `method_proj` is specified (and `skip_preprocessing` is False). It is ignored and
                           overwritten when `skip_preprocessing` is True or `method_proj` is None.
    :param method_proj: None or a method for performing dimension reduction. The method string has to be one
                        of the defined values `['LPP', 'OLPP', 'NPP', 'ONPP', 'PCA']`.
    :param num_cv_folds: int value > 1 that specifies the number of cross-validation folds.
    :param metric: same as the function `wrapper_knn`.
    :param metric_kwargs: same as the function `wrapper_knn`.
    :param shared_nearest_neighbors: same as the function `wrapper_knn`.
    :param approx_nearest_neighbors: same as the function `wrapper_knn`.
    :param skip_preprocessing: Set to True to skip the pre-processing step using PCA to remove noisy features
                               with low variance. In this case the data is used as is.
    :param pca_cutoff: cumulative variance cutoff value in (0, 1]. This value is used for PCA.
    :param n_jobs: None or int value that specifies the number of parallel jobs. If set to None, -1, or 0, this will
                   use all the available CPU cores. If set to negative values, this value will be subtracted from
                   the available number of CPU cores. For example, `n_jobs = -2` will use `cpu_count - 2`.
    :param seed_rng: same as the function `wrapper_knn`.

    :return: (k_best, dim_best, error_rate_min, data_proj), where
        - k_best: selected best value for `k` from the list `k_range`.
        - dim_best: selected best value of dimension. Can be ignored if no search is performed over the data dimension.
        - error_rate_min: minimum cross-validation error rate.
        - data_proj: projected (dimension reduced) data corresponding to the `dim_best`. Can be ignored if no search
                     is performed over the data dimension.

    :raises ValueError: if `k_range` is empty, if `method_proj` is not one of the values in `METHODS_LIST`, or if
                        `method_proj` is specified without a `dim_proj_range`.
    """
    # Fail early on an empty search grid instead of hitting an IndexError inside the cross-validation loop
    nk = len(k_range)
    if nk == 0:
        raise ValueError("The argument 'k_range' should contain at least one value.")

    # Number of parallel jobs
    n_jobs = get_num_jobs(n_jobs)
    # Unique labels
    labels_unique = np.unique(labels)

    if skip_preprocessing:
        # Use the data as is; a single "projection" with the original dimension
        data_proj_list = [data]
        dim_proj_range = [data.shape[1]]
    elif method_proj is None:
        # Applying PCA as pre-processing step to remove noisy features.
        # Only the projected data is needed; the mean and the transform are discarded.
        data_proj, _, _ = pca_wrapper(data, cutoff=pca_cutoff, seed_rng=seed_rng)
        data_proj_list = [data_proj]
        dim_proj_range = [data_proj.shape[1]]
    else:
        if method_proj not in METHODS_LIST:
            raise ValueError("Invalid value '{}' specified for the argument 'method_proj'".format(method_proj))

        # Without this check, the failure surfaces later as an opaque TypeError from `len(None)`
        if dim_proj_range is None:
            raise ValueError("The argument 'dim_proj_range' is required when 'method_proj' is specified.")

        logger.info("Using {} for dimension reduction.".format(method_proj))
        # A single dimension value (Python or numpy integer) is treated as a one-element range
        if isinstance(dim_proj_range, (int, np.integer)):
            dim_proj_range = [dim_proj_range]

        # Project the data to different reduced dimensions using the method `method_proj`
        data_proj_list = wrapper_data_projection(data, method_proj,
                                                 dim_proj=dim_proj_range,
                                                 metric=metric, metric_kwargs=metric_kwargs,
                                                 snn=shared_nearest_neighbors,
                                                 ann=approx_nearest_neighbors,
                                                 pca_cutoff=pca_cutoff,
                                                 n_jobs=n_jobs,
                                                 seed_rng=seed_rng)

    # Split the data into stratified folds for cross-validation
    skf = StratifiedKFold(n_splits=num_cv_folds, shuffle=True, random_state=seed_rng)
    nd = len(dim_proj_range)
    if nd > 1:
        logger.info("Performing cross-validation to search for the best combination of number of neighbors and "
                    "projected data dimension:")
    else:
        logger.info("Performing cross-validation to search for the best number of neighbors:")

    # Accumulated error rate for every (dimension, k) combination; rows index `dim_proj_range`,
    # columns index `k_range`
    error_rates_cv = np.zeros((nd, nk))
    for ind_tr, ind_te in skf.split(data, labels):
        # Each cv fold
        for i in range(nd):
            # Each projected dimension
            data_proj = data_proj_list[i]

            # KNN classifier model with the maximum k value in `k_range`
            knn_model = KNNClassifier(
                n_neighbors=k_range[-1],
                metric=metric, metric_kwargs=metric_kwargs,
                shared_nearest_neighbors=shared_nearest_neighbors,
                approx_nearest_neighbors=approx_nearest_neighbors,
                n_jobs=n_jobs,
                seed_rng=seed_rng
            )
            # Fit to the training data from this fold
            knn_model.fit(data_proj[ind_tr, :], labels[ind_tr], y_unique=labels_unique)

            # Get the label predictions for the different values of k in `k_range`.
            # `labels_test_pred` will be a numpy array of shape `(len(k_range), ind_te.shape[0])`
            labels_test_pred = knn_model.predict_multiple_k(data_proj[ind_te, :], k_range)

            # Error rate on the test data from this fold, one value per k
            err_rate_fold = np.count_nonzero(labels_test_pred != labels[ind_te], axis=1) / float(ind_te.shape[0])
            error_rates_cv[i, :] += err_rate_fold

    # Average cross-validated error rate
    error_rates_cv = error_rates_cv / num_cv_folds

    # Find the projected dimension and k value corresponding to the minimum error rate.
    # `argmin` works on the flattened (row-major) array and returns the first minimum, so ties are
    # resolved in favor of the smallest dimension index and then the smallest k index.
    ir, ic = np.unravel_index(np.argmin(error_rates_cv), error_rates_cv.shape)
    error_rate_min = error_rates_cv[ir, ic]
    k_best = k_range[ic]
    dim_best = dim_proj_range[ir]
    logger.info("Best value of k (number of neighbors) = {:d}. Data dimension = {:d}. "
                "Cross-validation error rate = {:.6f}".format(k_best, dim_best, error_rate_min))

    return k_best, dim_best, error_rate_min, data_proj_list[ir]
def __init__(self,
             dim_projection='auto',
             orthogonal=False,
             pca_cutoff=1.0,
             neighborhood_constant=0.4,
             n_neighbors=None,
             shared_nearest_neighbors=False,
             metric='euclidean',
             metric_kwargs=None,
             approx_nearest_neighbors=True,
             n_jobs=1,
             reg_eps=0.001,
             seed_rng=123):
    """
    Record the projection settings and create the (initially empty) model attributes.

    :param dim_projection: Dimension of data in the projected feature space; 'auto' or a positive integer.
                           If set to 'auto', a suitable reduced dimension will be chosen by estimating the
                           intrinsic dimension of the data. If an integer value is specified, it should be in
                           the range `[1, dim - 1]`, where `dim` is the observed dimension of the data.
    :param orthogonal: Set to True to enable the orthogonal NPP (ONPP) method.
    :param pca_cutoff: float value in (0, 1] specifying the proportion of cumulative data variance to preserve
                       in the projected dimension-reduced data. PCA is applied as a first-level dimension
                       reduction to handle potential data matrix singularity also. Set `pca_cutoff = 1.0` in
                       order to handle only the data matrix singularity.
    :param neighborhood_constant: float value in (0, 1), that specifies the number of nearest neighbors as a
                                  function of the number of samples (data size). If `N` is the number of
                                  samples, then the number of neighbors is set to `N^neighborhood_constant`.
                                  It is recommended to set this value in the range 0.4 to 0.5.
    :param n_neighbors: None or int value specifying the number of nearest neighbors. If this value is
                        specified, the `neighborhood_constant` is ignored. It is sufficient to specify either
                        `neighborhood_constant` or `n_neighbors`.
    :param shared_nearest_neighbors: Set to True in order to use the shared nearest neighbor (SNN) distance to
                                     find the K nearest neighbors. This is a secondary distance metric that is
                                     found to be better suited to high dimensional data.
    :param metric: string or a callable that specifies the distance metric to be used for the SNN similarity
                   calculation.
    :param metric_kwargs: optional dictionary of keyword arguments required by the distance metric.
    :param approx_nearest_neighbors: Set to True in order to use an approximate nearest neighbor algorithm to
                                     find the nearest neighbors. This is recommended when the number of points
                                     is large and/or when the dimension of the data is high.
    :param n_jobs: Number of parallel jobs or processes. Set to -1 to use all the available cpu cores.
                   The value is resolved through `get_num_jobs` before it is stored.
    :param reg_eps: small float value that multiplies the trace to regularize the Gram matrix, if it is
                    close to singular.
    :param seed_rng: int value specifying the seed for the random number generator.
    """
    # Projection settings
    self.dim_projection, self.orthogonal = dim_projection, orthogonal
    self.pca_cutoff = pca_cutoff

    # Neighborhood settings; `n_neighbors` takes precedence over `neighborhood_constant` when given
    self.neighborhood_constant, self.n_neighbors = neighborhood_constant, n_neighbors
    self.shared_nearest_neighbors = shared_nearest_neighbors
    self.metric, self.metric_kwargs = metric, metric_kwargs
    self.approx_nearest_neighbors = approx_nearest_neighbors

    # The requested job count is translated by `get_num_jobs`
    self.n_jobs = get_num_jobs(n_jobs)
    self.reg_eps, self.seed_rng = reg_eps, seed_rng

    # Model attributes start out as `None`; they are populated outside this method
    self.mean_data = self.index_knn = self.iterated_laplacian_matrix = None
    self.transform_pca = self.transform_npp = self.transform_comb = None
def __init__(self,
             dim_projection='auto',
             orthogonal=False,
             pca_cutoff=1.0,
             neighborhood_constant=0.4,
             n_neighbors=None,
             shared_nearest_neighbors=False,
             edge_weights='SNN',
             heat_kernel_param=None,
             metric='euclidean',
             metric_kwargs=None,
             approx_nearest_neighbors=True,
             n_jobs=1,
             seed_rng=123):
    """
    Record the projection settings, validate `edge_weights`, and create the (initially empty) model attributes.

    :param dim_projection: Dimension of data in the projected feature space; 'auto' or a positive integer.
                           If set to 'auto', a suitable reduced dimension will be chosen by estimating the
                           intrinsic dimension of the data. If an integer value is specified, it should be in
                           the range `[1, dim - 1]`, where `dim` is the observed dimension of the data.
    :param orthogonal: Set to True to select the orthogonal LPP (OLPP) method. It was shown to have better
                       performance than LPP in [3].
    :param pca_cutoff: float value in (0, 1] specifying the proportion of cumulative data variance to preserve
                       in the projected dimension-reduced data. PCA is applied as a first-level dimension
                       reduction to handle potential data matrix singularity also. Set `pca_cutoff = 1.0` in
                       order to handle only the data matrix singularity.
    :param neighborhood_constant: float value in (0, 1), that specifies the number of nearest neighbors as a
                                  function of the number of samples (data size). If `N` is the number of
                                  samples, then the number of neighbors is set to `N^neighborhood_constant`.
                                  It is recommended to set this value in the range 0.4 to 0.5.
    :param n_neighbors: None or int value specifying the number of nearest neighbors. If this value is
                        specified, the `neighborhood_constant` is ignored. It is sufficient to specify either
                        `neighborhood_constant` or `n_neighbors`.
    :param shared_nearest_neighbors: Set to True in order to use the shared nearest neighbor (SNN) distance to
                                     find the K nearest neighbors. This is a secondary distance metric that is
                                     found to be better suited to high dimensional data. This is forced to
                                     True if `edge_weights = 'SNN'`.
    :param edge_weights: Weighting method to use for the edge weights. The value is lower-cased before it is
                         checked and stored, so the choice is case-insensitive. Valid choices are
                         {'simple', 'SNN', 'heat_kernel'}:
                         - 'simple': the edge weight is set to one for every sample pair in the neighborhood.
                         - 'SNN': the shared nearest neighbors (SNN) similarity score between two samples is
                           used as the edge weight. This will be a value in [0, 1].
                         - 'heat_kernel': the heat (Gaussian) kernel with a suitable scale parameter defines
                           the edge weight.
    :param heat_kernel_param: Heat kernel scale parameter, used only if `edge_weights = 'heat_kernel'`. If set
                              to `None`, this parameter is set automatically based on the median of the
                              pairwise distances between samples. Else a positive real value can be specified.
    :param metric: string or a callable that specifies the distance metric to be used for the SNN similarity
                   calculation. This is used only if `edge_weights = 'SNN'`.
    :param metric_kwargs: optional dictionary of keyword arguments required by the distance metric. Again,
                          this is used only if `edge_weights = 'SNN'`.
    :param approx_nearest_neighbors: Set to True in order to use an approximate nearest neighbor algorithm to
                                     find the nearest neighbors. This is recommended when the number of points
                                     is large and/or when the dimension of the data is high.
    :param n_jobs: Number of parallel jobs or processes. Set to -1 to use all the available cpu cores.
                   The value is resolved through `get_num_jobs` before it is stored.
    :param seed_rng: int value specifying the seed for the random number generator.

    :raises ValueError: if the lower-cased `edge_weights` is not one of 'simple', 'snn', 'heat_kernel'.
    """
    # Projection settings
    self.dim_projection, self.orthogonal = dim_projection, orthogonal
    self.pca_cutoff = pca_cutoff

    # Neighborhood settings; `n_neighbors` takes precedence over `neighborhood_constant` when given
    self.neighborhood_constant, self.n_neighbors = neighborhood_constant, n_neighbors
    self.shared_nearest_neighbors = shared_nearest_neighbors

    # Edge weighting scheme is normalized to lower case so that 'SNN' and 'snn' are equivalent
    self.edge_weights = edge_weights.lower()
    self.heat_kernel_param = heat_kernel_param
    self.metric, self.metric_kwargs = metric, metric_kwargs
    self.approx_nearest_neighbors = approx_nearest_neighbors

    # The requested job count is translated by `get_num_jobs`
    self.n_jobs = get_num_jobs(n_jobs)
    self.seed_rng = seed_rng

    supported_edge_weights = ('simple', 'snn', 'heat_kernel')
    if self.edge_weights not in supported_edge_weights:
        raise ValueError("Invalid value '{}' for parameter 'edge_weights'".format(self.edge_weights))

    # SNN edge weights imply the SNN distance, regardless of what the caller passed
    if self.edge_weights == 'snn':
        self.shared_nearest_neighbors = True

    # Model attributes start out as `None`; they are populated outside this method
    self.mean_data = self.index_knn = None
    self.adjacency_matrix = self.incidence_matrix = self.laplacian_matrix = None
    self.transform_pca = self.transform_lpp = self.transform_comb = None