def __init__(self,
                 n_neighbors=1,
                 metric='euclidean', metric_kwargs=None,
                 shared_nearest_neighbors=False,
                 approx_nearest_neighbors=True,
                 n_jobs=1,
                 seed_rng=123):
        """
        Store the nearest-neighbor settings on the instance and initialize the remaining attributes to None.

        :param n_neighbors: number of nearest neighbors (int value, should be >= 1).
        :param metric: distance metric, given either as a string or as a callable.
        :param metric_kwargs: dictionary with the optional keyword arguments required by the distance metric.
        :param shared_nearest_neighbors: True selects the shared nearest neighbor (SNN) distance, a secondary
                                         distance metric that is found to be better suited to high dimensional data.
        :param approx_nearest_neighbors: True selects an approximate nearest neighbor algorithm for finding the
                                         nearest neighbors. Recommended when the number of points is large and/or
                                         when the dimension of the data is high.
        :param n_jobs: number of parallel jobs or processes; -1 uses all the available cpu cores. The value is
                       passed through `get_num_jobs` and the result is what gets stored in `self.n_jobs`.
        :param seed_rng: int value specifying the seed for the random number generator.
        """
        # Settings supplied by the caller, stored in the same order as the signature
        self.n_neighbors = n_neighbors
        self.metric, self.metric_kwargs = metric, metric_kwargs
        self.shared_nearest_neighbors = shared_nearest_neighbors
        self.approx_nearest_neighbors = approx_nearest_neighbors
        self.n_jobs = get_num_jobs(n_jobs)
        self.seed_rng = seed_rng

        # Nothing below is computed by the constructor; each attribute starts out as None
        self.index_knn = self.y_train = None
        self.n_classes = self.labels_dtype = None
        self.label_enc = self.label_dec = None
    def __init__(self,
                 neighborhood_constant=0.4, n_neighbors=None,
                 metric='euclidean', metric_kwargs=None,
                 approx_nearest_neighbors=True,
                 n_jobs=1,
                 seed_rng=123):
        """
        Store the nearest-neighbor settings on the instance and initialize the remaining attributes to None.

        :param neighborhood_constant: float value in (0, 1) that determines the number of nearest neighbors from
                                      the data size: with `N` samples, the number of neighbors is set to
                                      `N^neighborhood_constant`. Values in the range 0.4 to 0.5 are recommended.
        :param n_neighbors: None or int value specifying the number of nearest neighbors. When specified, the
                            `neighborhood_constant` is ignored; it is sufficient to specify one of the two.
        :param metric: distance metric, given either as a string or as a callable.
        :param metric_kwargs: dictionary with the optional keyword arguments required by the distance metric.
        :param approx_nearest_neighbors: True selects an approximate nearest neighbor algorithm for finding the
                                         nearest neighbors. Recommended when the number of points is large and/or
                                         when the dimension of the data is high.
        :param n_jobs: number of parallel jobs or processes; -1 uses all the available cpu cores. The value is
                       passed through `get_num_jobs` and the result is what gets stored in `self.n_jobs`.
        :param seed_rng: int value specifying the seed for the random number generator.
        """
        # Settings supplied by the caller, paired up the same way as in the signature
        self.neighborhood_constant, self.n_neighbors = neighborhood_constant, n_neighbors
        self.metric, self.metric_kwargs = metric, metric_kwargs
        self.approx_nearest_neighbors = approx_nearest_neighbors
        self.n_jobs = get_num_jobs(n_jobs)
        self.seed_rng = seed_rng

        # Nothing below is computed by the constructor; each attribute starts out as None
        self.num_samples = self.index_knn = self.lid_nominal = None
Ejemplo n.º 3
0
    def __init__(self, data,
                 neighborhood_constant=0.4, n_neighbors=None,
                 metric='euclidean', metric_kwargs=None,
                 shared_nearest_neighbors=False,
                 approx_nearest_neighbors=True,
                 n_jobs=1,
                 seed_rng=123):
        """
        Store the settings, resolve the neighborhood sizes from the data size, and build the KNN index on `data`.

        :param data: numpy array with the data samples, of shape `(N, d)`, where `N` is the number of samples and
                     `d` is the number of features.
        :param neighborhood_constant: float value in (0, 1) that determines the number of nearest neighbors from
                                      the data size: with `N` samples, the number of neighbors is set to
                                      `N^neighborhood_constant` (rounded up). Values in the range 0.4 to 0.5 are
                                      recommended.
        :param n_neighbors: None or int value specifying the number of nearest neighbors. When specified, the
                            `neighborhood_constant` is ignored; it is sufficient to specify one of the two.
        :param metric: distance metric, given either as a string or as a callable.
        :param metric_kwargs: dictionary with the optional keyword arguments required by the distance metric.
        :param shared_nearest_neighbors: True selects the shared nearest neighbor (SNN) distance, a secondary
                                         distance metric that is found to be better suited to high dimensional data.
        :param approx_nearest_neighbors: True selects an approximate nearest neighbor algorithm for finding the
                                         nearest neighbors. Recommended when the number of points is large and/or
                                         when the dimension of the data is high.
        :param n_jobs: number of parallel jobs or processes; -1 uses all the available cpu cores. The value is
                       passed through `get_num_jobs` and the result is what gets stored in `self.n_jobs`.
        :param seed_rng: int value specifying the seed for the random number generator.
        """
        # Settings supplied by the caller
        self.data = data
        self.neighborhood_constant, self.n_neighbors = neighborhood_constant, n_neighbors
        self.metric, self.metric_kwargs = metric, metric_kwargs
        self.shared_nearest_neighbors = shared_nearest_neighbors
        self.approx_nearest_neighbors = approx_nearest_neighbors
        self.n_jobs = get_num_jobs(n_jobs)
        self.seed_rng = seed_rng

        # The two-value unpacking means `data` has to be two-dimensional; only the sample count is used here
        num_samples, _ = data.shape
        if n_neighbors is None:
            # No explicit value given: derive the neighborhood size as ceil(N ^ neighborhood_constant)
            self.n_neighbors = int(np.ceil(num_samples ** neighborhood_constant))

        # The shared nearest neighbor distance is calculated from a 20% larger neighborhood, capped at `N - 1`
        enlarged_neighborhood = int(1.2 * self.n_neighbors)
        self.n_neighbors_snn = min(enlarged_neighborhood, num_samples - 1)

        self.index_knn = self.build_knn_index(data)
def knn_parameter_search(data, labels, k_range,
                         dim_proj_range=None, method_proj=None,
                         num_cv_folds=5,
                         metric='euclidean', metric_kwargs=None,
                         shared_nearest_neighbors=False,
                         approx_nearest_neighbors=True,
                         skip_preprocessing=False,
                         pca_cutoff=1.0,
                         n_jobs=-1,
                         seed_rng=123):
    """
    Search for the best value of `k` (number of neighbors) of a KNN classifier using cross-validation. Error rate
    is the metric. Optionally, you can also search over a range of reduced data dimensions via the parameters
    `dim_proj_range` and `method_proj`. In this case, the dimensionality reduction method `method_proj` is applied
    to reduce the dimension of the data to the specified values, and the search is done over both `k` and the
    dimension.

    The data used for the search is chosen as follows:
        - `skip_preprocessing = True`: the input data is used as it is (`method_proj` and `dim_proj_range` are
          not used).
        - `skip_preprocessing = False` and `method_proj = None`: the data is pre-processed once using PCA with
          the cutoff `pca_cutoff` (`dim_proj_range` is not used).
        - otherwise: the data is projected using `method_proj` to each dimension in `dim_proj_range`.

    :param data: numpy array with the training data of shape `(N, d)`, where `N` is the number of samples
                 and `d` is the dimension.
    :param labels: numpy array with the training labels of shape `(N, )`.
    :param k_range: list or array with the k values to search over. Expected to be sorted in increasing values,
                    because the KNN model is built using the last value as the maximum number of neighbors.
    :param dim_proj_range: None, an int, or a list/array with the dimension values to search over. Set to `None`
                           if there is no need to search over the data dimension. Required if `method_proj` is
                           specified (and `skip_preprocessing` is False).
    :param method_proj: None or a method for performing dimension reduction. The method string has to be one of the
                        defined values `['LPP', 'OLPP', 'NPP', 'ONPP', 'PCA']`.
    :param num_cv_folds: int value > 1 that specifies the number of cross-validation folds.
    :param metric: same as the function `wrapper_knn`.
    :param metric_kwargs: same as the function `wrapper_knn`.
    :param shared_nearest_neighbors: same as the function `wrapper_knn`.
    :param approx_nearest_neighbors: same as the function `wrapper_knn`.
    :param skip_preprocessing: Set to True to skip the pre-processing step using PCA to remove noisy features
                               with low variance.
    :param pca_cutoff: cumulative variance cutoff value in (0, 1]. This value is used for PCA.
    :param n_jobs: None or int value that specifies the number of parallel jobs. If set to None, -1, or 0, this will
                   use all the available CPU cores. If set to negative values, this value will be subtracted from
                   the available number of CPU cores. For example, `n_jobs = -2` will use `cpu_count - 2`.
    :param seed_rng: same as the function `wrapper_knn`.

    :return:
    (k_best, dim_best, error_rate_min, data_proj), where
        - k_best: selected best value for `k` from the list `k_range`.
        - dim_best: select best value of dimension. Can be ignored if no search is performed over the data dimension.
        - error_rate_min: minimum cross-validation error rate.
        - data_proj: projected (dimension reduced) data corresponding to the `dim_best`. Can be ignored if no search
                     is performed over the data dimension.

    :raises ValueError: if `method_proj` is not one of the defined methods, or if `method_proj` is specified
                        without `dim_proj_range`.
    """
    # Number of parallel jobs
    n_jobs = get_num_jobs(n_jobs)

    # Unique labels
    labels_unique = np.unique(labels)

    if skip_preprocessing:
        data_proj_list = [data]
        dim_proj_range = [data.shape[1]]
    elif method_proj is None:
        # Applying PCA as pre-processing step to remove noisy features. Only the projected data is needed here;
        # the data mean and the PCA transformation are discarded
        data_proj, _, _ = pca_wrapper(data, cutoff=pca_cutoff, seed_rng=seed_rng)
        data_proj_list = [data_proj]
        dim_proj_range = [data_proj.shape[1]]
    else:
        if method_proj not in METHODS_LIST:
            raise ValueError("Invalid value '{}' specified for the argument 'method_proj'".format(method_proj))

        if dim_proj_range is None:
            # Fail early with a clear message instead of a `TypeError` from `len(None)` after the projection
            raise ValueError("The argument 'dim_proj_range' has to be specified when 'method_proj' is specified")

        logger.info("Using {} for dimension reduction.".format(method_proj))
        # A single dimension may be given as a plain int or as a numpy integer scalar
        if isinstance(dim_proj_range, (int, np.integer)):
            dim_proj_range = [dim_proj_range]

        # Project the data to different reduced dimensions using the method `method_proj`
        data_proj_list = wrapper_data_projection(data, method_proj,
                                                 dim_proj=dim_proj_range,
                                                 metric=metric, metric_kwargs=metric_kwargs,
                                                 snn=shared_nearest_neighbors,
                                                 ann=approx_nearest_neighbors,
                                                 pca_cutoff=pca_cutoff,
                                                 n_jobs=n_jobs,
                                                 seed_rng=seed_rng)

    # Split the data into stratified folds for cross-validation
    skf = StratifiedKFold(n_splits=num_cv_folds, shuffle=True, random_state=seed_rng)
    nd = len(dim_proj_range)
    nk = len(k_range)
    if nd > 1:
        logger.info("Performing cross-validation to search for the best combination of number of neighbors and "
                    "projected data dimension:")
    else:
        logger.info("Performing cross-validation to search for the best number of neighbors:")

    # Entry `[i, j]` accumulates the error rate for the `i`-th dimension and the `j`-th value of `k` over the folds
    error_rates_cv = np.zeros((nd, nk))
    for ind_tr, ind_te in skf.split(data, labels):
        # Each cv fold. The labels and the test size of the fold are the same for every projected dimension
        labels_tr = labels[ind_tr]
        labels_te = labels[ind_te]
        n_test = float(ind_te.shape[0])
        for i in range(nd):
            # Each projected dimension
            data_proj = data_proj_list[i]

            # KNN classifier model with the maximum k value in `k_range`
            knn_model = KNNClassifier(
                n_neighbors=k_range[-1],
                metric=metric, metric_kwargs=metric_kwargs,
                shared_nearest_neighbors=shared_nearest_neighbors,
                approx_nearest_neighbors=approx_nearest_neighbors,
                n_jobs=n_jobs,
                seed_rng=seed_rng
            )
            # Fit to the training data from this fold
            knn_model.fit(data_proj[ind_tr, :], labels_tr, y_unique=labels_unique)

            # Get the label predictions for the different values of k in `k_range`.
            # `labels_test_pred` will be a numpy array of shape `(len(k_range), ind_te.shape[0])`
            labels_test_pred = knn_model.predict_multiple_k(data_proj[ind_te, :], k_range)

            # Error rate on the test data from this fold (one value per k)
            error_rates_cv[i, :] += np.count_nonzero(labels_test_pred != labels_te, axis=1) / n_test

    # Average cross-validated error rate
    error_rates_cv = error_rates_cv / num_cv_folds

    # Find the projected dimension (row) and k value (column) corresponding to the minimum error rate.
    # `argmin` returns the index into the flattened (row-major) array, which is converted back to a 2d index
    ir, ic = np.unravel_index(np.argmin(error_rates_cv), error_rates_cv.shape)
    error_rate_min = error_rates_cv[ir, ic]
    k_best = k_range[ic]
    dim_best = dim_proj_range[ir]
    logger.info("Best value of k (number of neighbors) = {:d}. Data dimension = {:d}. "
                "Cross-validation error rate = {:.6f}".format(k_best, dim_best, error_rate_min))

    return k_best, dim_best, error_rate_min, data_proj_list[ir]
Ejemplo n.º 5
0
    def __init__(self,
                 dim_projection='auto',                         # 'auto' or positive integer
                 orthogonal=False,                              # True to enable Orthogonal NPP (ONPP) method
                 pca_cutoff=1.0,
                 neighborhood_constant=0.4, n_neighbors=None,   # Specify one of them. If `n_neighbors` is specified,
                                                                # `neighborhood_constant` will be ignored.
                 shared_nearest_neighbors=False,
                 metric='euclidean', metric_kwargs=None,        # distance metric and its parameter dict (if any)
                 approx_nearest_neighbors=True,
                 n_jobs=1,
                 reg_eps=0.001,
                 seed_rng=123):
        """
        Store the projection settings on the instance and initialize the remaining attributes to None.

        :param dim_projection: dimension of the data in the projected feature space. With 'auto', a suitable
                               reduced dimension is chosen by estimating the intrinsic dimension of the data. An
                               integer value should be in the range `[1, dim - 1]`, where `dim` is the observed
                               dimension of the data.
        :param orthogonal: Set to True to enable the orthogonal variant of the method (ONPP).
        :param pca_cutoff: float value in (0, 1] specifying the proportion of cumulative data variance to preserve
                           in the projected dimension-reduced data. PCA is applied as a first-level dimension
                           reduction, which also handles potential data matrix singularity. Set
                           `pca_cutoff = 1.0` in order to handle only the data matrix singularity.
        :param neighborhood_constant: float value in (0, 1) that determines the number of nearest neighbors from
                                      the data size: with `N` samples, the number of neighbors is set to
                                      `N^neighborhood_constant`. Values in the range 0.4 to 0.5 are recommended.
        :param n_neighbors: None or int value specifying the number of nearest neighbors. When specified, the
                            `neighborhood_constant` is ignored; it is sufficient to specify one of the two.
        :param shared_nearest_neighbors: True selects the shared nearest neighbor (SNN) distance for finding the K
                                         nearest neighbors. This is a secondary distance metric that is found to be
                                         better suited to high dimensional data.
        :param metric: distance metric, given either as a string or as a callable, to be used for the SNN
                       similarity calculation.
        :param metric_kwargs: dictionary with the optional keyword arguments required by the distance metric.
        :param approx_nearest_neighbors: True selects an approximate nearest neighbor algorithm for finding the
                                         nearest neighbors. Recommended when the number of points is large and/or
                                         when the dimension of the data is high.
        :param n_jobs: number of parallel jobs or processes; -1 uses all the available cpu cores. The value is
                       passed through `get_num_jobs` and the result is what gets stored in `self.n_jobs`.
        :param reg_eps: small float value that multiplies the trace to regularize the Gram matrix, if it is
                        close to singular.
        :param seed_rng: int value specifying the seed for the random number generator.
        """
        # Settings supplied by the caller, paired up the same way as in the signature
        self.dim_projection = dim_projection
        self.orthogonal = orthogonal
        self.pca_cutoff = pca_cutoff
        self.neighborhood_constant, self.n_neighbors = neighborhood_constant, n_neighbors
        self.shared_nearest_neighbors = shared_nearest_neighbors
        self.metric, self.metric_kwargs = metric, metric_kwargs
        self.approx_nearest_neighbors = approx_nearest_neighbors
        self.n_jobs = get_num_jobs(n_jobs)
        self.reg_eps = reg_eps
        self.seed_rng = seed_rng

        # Nothing below is computed by the constructor; each attribute starts out as None
        self.mean_data = self.index_knn = None
        self.iterated_laplacian_matrix = None
        self.transform_pca = self.transform_npp = self.transform_comb = None
Ejemplo n.º 6
0
    def __init__(self,
                 dim_projection='auto',                         # 'auto' or positive integer
                 orthogonal=False,                              # True to enable Orthogonal LPP (OLPP)
                 pca_cutoff=1.0,
                 neighborhood_constant=0.4, n_neighbors=None,   # Specify one of them. If `n_neighbors` is specified,
                                                                # `neighborhood_constant` will be ignored.
                 shared_nearest_neighbors=False,
                 edge_weights='SNN',                            # Choices are {'simple', 'SNN', 'heat_kernel'}
                 heat_kernel_param=None,                        # Used only if `edge_weights = 'heat_kernel'`
                 metric='euclidean', metric_kwargs=None,        # distance metric and its parameter dict (if any)
                 approx_nearest_neighbors=True,
                 n_jobs=1,
                 seed_rng=123):
        """
        Store the projection settings on the instance, validate `edge_weights`, and initialize the remaining
        attributes to None.

        :param dim_projection: dimension of the data in the projected feature space. With 'auto', a suitable
                               reduced dimension is chosen by estimating the intrinsic dimension of the data. An
                               integer value should be in the range `[1, dim - 1]`, where `dim` is the observed
                               dimension of the data.
        :param orthogonal: Set to True to select the OLPP method. It was shown to have better performance than LPP
                           in [3].
        :param pca_cutoff: float value in (0, 1] specifying the proportion of cumulative data variance to preserve
                           in the projected dimension-reduced data. PCA is applied as a first-level dimension
                           reduction, which also handles potential data matrix singularity. Set
                           `pca_cutoff = 1.0` in order to handle only the data matrix singularity.
        :param neighborhood_constant: float value in (0, 1) that determines the number of nearest neighbors from
                                      the data size: with `N` samples, the number of neighbors is set to
                                      `N^neighborhood_constant`. Values in the range 0.4 to 0.5 are recommended.
        :param n_neighbors: None or int value specifying the number of nearest neighbors. When specified, the
                            `neighborhood_constant` is ignored; it is sufficient to specify one of the two.
        :param shared_nearest_neighbors: True selects the shared nearest neighbor (SNN) distance for finding the K
                                         nearest neighbors. This is a secondary distance metric that is found to be
                                         better suited to high dimensional data. It is forced to True when
                                         `edge_weights = 'SNN'`.
        :param edge_weights: weighting method to use for the edge weights. The value is compared case-insensitively
                             (it is stored in lower case). Valid choices are {'simple', 'SNN', 'heat_kernel'}:
                             - 'simple': the edge weight is set to one for every sample pair in the neighborhood.
                             - 'SNN': the shared nearest neighbors (SNN) similarity score between two samples is used
                             as the edge weight. This will be a value in [0, 1].
                             - 'heat_kernel': the heat (Gaussian) kernel with a suitable scale parameter defines the
                             edge weight.
        :param heat_kernel_param: heat kernel scale parameter. If set to `None`, this parameter is set automatically
                                  based on the median of the pairwise distances between samples. Else a positive
                                  real value can be specified.
        :param metric: distance metric, given either as a string or as a callable, to be used for the SNN
                       similarity calculation. This is used only if `edge_weights = 'SNN'`.
        :param metric_kwargs: dictionary with the optional keyword arguments required by the distance metric.
                              Again, this is used only if `edge_weights = 'SNN'`.
        :param approx_nearest_neighbors: True selects an approximate nearest neighbor algorithm for finding the
                                         nearest neighbors. Recommended when the number of points is large and/or
                                         when the dimension of the data is high.
        :param n_jobs: number of parallel jobs or processes; -1 uses all the available cpu cores. The value is
                       passed through `get_num_jobs` and the result is what gets stored in `self.n_jobs`.
        :param seed_rng: int value specifying the seed for the random number generator.

        :raises ValueError: if `edge_weights` is not one of the valid choices.
        """
        # Settings supplied by the caller, paired up the same way as in the signature
        self.dim_projection = dim_projection
        self.orthogonal = orthogonal
        self.pca_cutoff = pca_cutoff
        self.neighborhood_constant, self.n_neighbors = neighborhood_constant, n_neighbors
        self.shared_nearest_neighbors = shared_nearest_neighbors
        # Lower-cased so that the checks below do not depend on the case used by the caller
        self.edge_weights = edge_weights.lower()
        self.heat_kernel_param = heat_kernel_param
        self.metric, self.metric_kwargs = metric, metric_kwargs
        self.approx_nearest_neighbors = approx_nearest_neighbors
        self.n_jobs = get_num_jobs(n_jobs)
        self.seed_rng = seed_rng

        valid_edge_weights = ('simple', 'snn', 'heat_kernel')
        if self.edge_weights not in valid_edge_weights:
            raise ValueError("Invalid value '{}' for parameter 'edge_weights'".format(self.edge_weights))

        # SNN edge weights override whatever was passed for `shared_nearest_neighbors`
        if self.edge_weights == 'snn':
            self.shared_nearest_neighbors = True

        # Nothing below is computed by the constructor; each attribute starts out as None
        self.mean_data = self.index_knn = None
        self.adjacency_matrix = self.incidence_matrix = self.laplacian_matrix = None
        self.transform_pca = self.transform_lpp = self.transform_comb = None