Python get_squared_froebenius_norm_line_wiseの例、qkmeans.core.utils.get_squared_froebenius_norm_line_wise Pythonの例

コード例 #1

0

ファイルを表示

def special_rbf_kernel(X, Y, gamma, norm_X, norm_Y):
    """
    Rbf kernel expressed under the form f(x)f(u)f(xy^T)

    Can handle X and Y as Sparse Factors.

    :param X: n x d matrix
    :param Y: n x d matrix
    :return:
    """
    assert len(X.shape) == len(Y.shape) == 2

    if norm_X is None:
        norm_X = get_squared_froebenius_norm_line_wise(X)
    if norm_Y is None:
        norm_Y = get_squared_froebenius_norm_line_wise(Y)

    def f(norm_mat):
        return np.exp(-gamma * norm_mat)

    def g(scal):
        return np.exp(2 * gamma * scal)

    if isinstance(X, SparseFactors) and isinstance(Y, SparseFactors):
        # xyt = SparseFactors(X.get_list_of_factors() + Y.transpose().get_list_of_factors()).compute_product(return_array=True)
        S = SparseFactors(
            lst_factors=X.get_list_of_factors() + Y.get_list_of_factors_H(),
            lst_factors_H=X.get_list_of_factors_H() + Y.get_list_of_factors())
        xyt = S.compute_product(return_array=True)
    else:
        xyt = X @ Y.transpose()

    return f(norm_X).reshape(-1, 1) * g(xyt) * f(norm_Y).reshape(1, -1)

コード例 #2

0

ファイルを表示

ファイル: 5_6_qmeans_minibatch_new_expes.py プロジェクト: lucgiffon/qkmeans

def make_batch_assignation_evaluation(X, centroids):
    """
    Assign `size_batch` random samples of `X` to some of the centroids.
    All the samples are assigned at the same time using a matrix-vector multiplication.
    Time is recorded.

    :param X: The input data from which to take the samples.
    :param centroids: The centroids to which to assign the samples (must be of same dimension than `X`)
    :param size_batch: The number of data points to assign

    :return: None
    """
    size_batch = paraman["--batch-assignation-time"]
    if size_batch > X.shape[0]:
        logger.warning("Batch size for batch assignation evaluation is bigger than data size. {} > {}. Using "
                       "data size instead.".format(size_batch, X.shape[0]))
        size_batch = X.shape[0]
        paraman["--batch-assignation-time"] = size_batch

    precomputed_centroid_norms = get_squared_froebenius_norm_line_wise(centroids)
    # precomputed_centroid_norms = None
    indexes_batch = np.random.permutation(X.shape[0])[:size_batch]
    start_time = time.process_time()
    _ = get_distances(X[indexes_batch], centroids, precomputed_centroids_norm=precomputed_centroid_norms)
    stop_time = time.process_time()

    resprinter.add({
        "batch_assignation_mean_time": (stop_time-start_time) / size_batch,
    })

コード例 #3

0

ファイルを表示

def make_assignation_evaluation(X, centroids):
    nb_eval = paraman["--assignation-time"]
    if nb_eval > X.shape[0]:
        logger.warning(
            "Batch size for assignation evaluation is bigger than data size. {} > {}. Using "
            "data size instead.".format(nb_eval, X.shape[0]))
        nb_eval = X.shape[0]
        paraman["--assignation-time"] = nb_eval

    times = []
    precomputed_centroid_norms = get_squared_froebenius_norm_line_wise(
        centroids)
    for i in np.random.permutation(X.shape[0])[:nb_eval]:
        start_time = time.time()
        get_distances(X[i].reshape(1, -1),
                      centroids,
                      precomputed_centroids_norm=precomputed_centroid_norms)
        stop_time = time.time()
        times.append(stop_time - start_time)

    mean_time = np.mean(times)
    std_time = np.std(times)

    resprinter.add({
        "assignation_mean_time": mean_time,
        "assignation_std_time": std_time
    })

コード例 #4

0

ファイルを表示

ファイル: 1_2_qmeans_objective_function_new_interface_qmeans.py プロジェクト: lucgiffon/qkmeans

    def kmean_tree_evaluation():
        """
        Do the K-means partitioning version of nearest neighbor?=.

        :return:
        """
        # for each cluster, there is a sub nearest neighbor classifier for points in that cluster.
        lst_clf_by_cluster = [KNeighborsClassifier(n_neighbors=1, algorithm="brute").fit(x_train[indicator_vector == i], y_train[indicator_vector == i]) for i in range(U_centroids.shape[0])]
        log_memory_usage("Memory after definition of neighbors classifiers in kmean_tree_evaluation of make_1nn_evaluation")
        precomputed_centroid_norms = get_squared_froebenius_norm_line_wise(U_centroids)
        # precomputed_centroid_norms = None
        start_inference_time = time.process_time()
        distances = get_distances(x_test, U_centroids, precomputed_centroids_norm=precomputed_centroid_norms)
        stop_get_distances_time = time.process_time()
        get_distance_time = stop_get_distances_time - start_inference_time
        indicator_vector_test = np.argmin(distances, axis=1)
        predictions = np.empty_like(y_test)
        for obs_idx, obs_test in enumerate(x_test):
            # get the cluster to which belongs this data point and call the associated nearest neighbor classifier
            idx_cluster = indicator_vector_test[obs_idx]
            clf_cluster = lst_clf_by_cluster[idx_cluster]
            predictions[obs_idx] = clf_cluster.predict(obs_test.reshape(1, -1))[0]
        stop_inference_time = time.process_time()
        log_memory_usage("Memory after label assignation in kmean_tree_evaluation of make_1nn_evaluation")
        inference_time = (stop_inference_time - start_inference_time)

        accuracy = np.sum(predictions == y_test) / y_test.shape[0]

        results_1nn = {
            "1nn_kmean_inference_time": inference_time,
            "1nn_get_distance_time": get_distance_time / x_test.shape[0],
            "1nn_kmean_accuracy": accuracy
        }
        resprinter.add(results_1nn)
        return inference_time

コード例 #5

0

ファイルを表示

ファイル: 1_2_efficient_nystrom.py プロジェクト: lucgiffon/qkmeans

def make_nystrom_evaluation(x_train, y_train, x_test, y_test, gamma, landmarks):
    # verify sample size for evaluation
    n_sample = paraman["--nystrom"]
    if n_sample > x_train.shape[0]:
        logger.warning("Batch size for nystrom evaluation is bigger than data size. {} > {}. Using "
                       "data size instead.".format(n_sample, x_train.shape[0]))
        n_sample = x_train.shape[0]
        paraman["--nystrom"] = n_sample

    indexes_samples = np.random.permutation(x_train.shape[0])[:n_sample]
    sample = x_train[indexes_samples]
    samples_norm = None

    # Make nystrom approximation
    # nys_obj = Nystroem(gamma=gamma, n_components=landmarks.shape[0])
    # nys_obj.fit(landmarks)
    # nystrom_embedding = nys_obj.transform(sample)
    landmarks_norm = get_squared_froebenius_norm_line_wise(landmarks)[:, np.newaxis]
    metric = prepare_nystrom(landmarks, landmarks_norm, gamma=gamma)
    nystrom_embedding = nystrom_transformation(sample, landmarks, metric, landmarks_norm, samples_norm, gamma=gamma)
    nystrom_approx_kernel_value = nystrom_embedding @ nystrom_embedding.T

    # Create real kernel matrix
    real_kernel_special = special_rbf_kernel(sample, sample, gamma, norm_X=samples_norm, norm_Y=samples_norm)
    # real_kernel = rbf_kernel(sample, sample, gamma)
    real_kernel_norm = np.linalg.norm(real_kernel_special)

    # evaluation reconstruction error
    reconstruction_error_nystrom = np.linalg.norm(nystrom_approx_kernel_value - real_kernel_special) / real_kernel_norm

    # start svm + nystrom classification
    if x_test is not None:
        logger.info("Start classification")

        x_train_nystrom_embedding = nystrom_transformation(x_train, landmarks, metric, landmarks_norm, None, gamma=gamma)
        x_test_nystrom_embedding = nystrom_transformation(x_test, landmarks, metric, landmarks_norm, None, gamma=gamma)

        linear_svc_clf = LinearSVC(class_weight="balanced")
        linear_svc_clf.fit(x_train_nystrom_embedding, y_train)
        predictions = linear_svc_clf.predict(x_test_nystrom_embedding)

        if paraman["--kddcup04"]:
            # compute recall: nb_true_positive/real_nb_positive
            recall = np.sum(predictions[y_test == 1])/np.sum(y_test[y_test == 1])
            # compute precision: nb_true_positive/nb_positive
            precision = np.sum(predictions[y_test == 1])/np.sum(predictions[predictions==1])
            f1 = 2 * precision * recall / (precision + recall)
            accuracy_nystrom_svm = f1
        else:
            accuracy_nystrom_svm = np.sum(predictions == y_test) / y_test.shape[0]

    else:
        accuracy_nystrom_svm = None

    return reconstruction_error_nystrom, accuracy_nystrom_svm

コード例 #6

0

ファイルを表示

ファイル: 5_6_qmeans_minibatch_new_expes.py プロジェクト: lucgiffon/qkmeans

    def kmean_tree_evaluation():
        """
        Do the K-means partitioning version of nearest neighbor?=.

        :return:
        """
        # for each cluster, there is a sub nearest neighbor classifier for points in that cluster.
        lst_clf_by_cluster = []
        indices_no_train_obs_in_cluster = []
        for i in range(U_centroids.shape[0]):
            try:
                lst_clf_by_cluster.append(KNeighborsClassifier(n_neighbors=1, algorithm="brute").fit(x_train[indicator_vector == i], y_train[indicator_vector == i]))
            except ValueError:
                indices_no_train_obs_in_cluster.append(i)
                lst_clf_by_cluster.append(None)

        # lst_clf_by_cluster = [ for i in range(landmarks.shape[0])]
        log_memory_usage("Memory after definition of neighbors classifiers in kmean_tree_evaluation of make_1nn_evaluation")
        precomputed_centroid_norms = get_squared_froebenius_norm_line_wise(U_centroids)
        # precomputed_centroid_norms = None
        start_inference_time = time.process_time()
        distances = get_distances(x_test, U_centroids, precomputed_centroids_norm=precomputed_centroid_norms)
        stop_get_distances_time = time.process_time()
        get_distance_time = stop_get_distances_time - start_inference_time
        if len(indices_no_train_obs_in_cluster):
            distances[:, np.array(indices_no_train_obs_in_cluster)] = np.inf
        indicator_vector_test = np.argmin(distances, axis=1)
        predictions = np.empty_like(y_test)
        for obs_idx, obs_test in enumerate(x_test):
            # get the cluster to which belongs this data point and call the associated nearest neighbor classifier
            idx_cluster = indicator_vector_test[obs_idx]
            clf_cluster = lst_clf_by_cluster[idx_cluster]
            predictions[obs_idx] = clf_cluster.predict(obs_test.reshape(1, -1))[0]
        stop_inference_time = time.process_time()
        log_memory_usage("Memory after label assignation in kmean_tree_evaluation of make_1nn_evaluation")
        inference_time = (stop_inference_time - start_inference_time)

        if paraman["--kddcup04"]:
            # compute recall: nb_true_positive/real_nb_positive
            recall = np.sum(predictions[y_test == 1])/np.sum(y_test[y_test == 1])
            # compute precision: nb_true_positive/nb_positive
            precision = np.sum(predictions[y_test == 1])/np.sum(predictions[predictions==1])
            f1 = 2 * precision * recall / (precision + recall)
            accuracy = f1
        else:
            accuracy = np.sum(predictions == y_test) / y_test.shape[0]

        results_1nn = {
            "1nn_kmean_inference_time": inference_time,
            "1nn_get_distance_time": get_distance_time / x_test.shape[0],
            "1nn_kmean_accuracy": accuracy
        }
        resprinter.add(results_1nn)
        return inference_time

コード例 #7

0

ファイルを表示

ファイル: qmeans_objective_function_analysis.py プロジェクト: lucgiffon/qkmeans

def make_assignation_evaluation(X, centroids):
    nb_eval = 100
    times = []
    precomputed_centroid_norms = get_squared_froebenius_norm_line_wise(
        centroids)
    for i in np.random.permutation(X.shape[0])[:nb_eval]:
        start_time = time.time()
        get_distances(X[i].reshape(1, -1),
                      centroids,
                      precomputed_centroids_norm=precomputed_centroid_norms)
        stop_time = time.time()
        times.append(stop_time - start_time)

    mean_time = np.mean(times)
    std_time = np.std(times)

    resprinter.add({
        "assignation_mean_time": mean_time,
        "assignation_std_time": std_time
    })

コード例 #8

0

ファイルを表示

    def setUp(self):
        self.n_features = 2000
        self.n_data = 100
        sparsity = 2

        self.sparse_data = create_sparse_factors(
            shape=(self.n_data * 2, self.n_features),
            n_factors=int(
                np.ceil(np.log2(min(self.n_data * 2, self.n_features)))),
            sparsity_level=sparsity)
        self.data = self.sparse_data.compute_product(return_array=True)

        self.data_verylittle = np.random.rand(*self.data.shape) * 1e2

        self.data_norm = get_squared_froebenius_norm_line_wise(self.data)
        self.sparse_data_norm = get_squared_froebenius_norm_line_wise(
            self.sparse_data)
        self.data_norm_verylittle = get_squared_froebenius_norm_line_wise(
            self.data_verylittle)

        self.gamma = compute_euristic_gamma(self.data)
        self.random_data = np.random.rand(self.n_data, self.n_features)
        self.mnist = mnist_dataset()["x_train"].astype(np.float64)
        self.fashionmnist = fashion_mnist_dataset()["x_train"].astype(
            np.float64)
        # self.caltech = caltech_dataset(28)["x_train"].astype(np.float64)

        self.pairs_data = {
            "notSparse": self.data[:self.n_data],
            "mnist": self.mnist[:self.n_data],
            # "caltech": self.caltech[:self.n_data],
            "fashmnist": self.fashionmnist[:self.n_data],
        }
        self.example_data = {
            "notSparse": self.data[self.n_data:self.n_data * 2],
            "mnist": self.mnist[self.n_data:self.n_data * 2],
            # "caltech": self.caltech[self.n_data:self.n_data*2],
            "fashmnist": self.fashionmnist[self.n_data:self.n_data * 2]
        }

        self.norm_data = {
            "notSparse":
            row_norms(self.pairs_data["notSparse"], squared=True)[:,
                                                                  np.newaxis],
            "mnist":
            get_squared_froebenius_norm_line_wise(
                self.pairs_data["mnist"])[:, np.newaxis],
            # "caltech": row_norms(self.pairs_data["caltech"], squared=True)[:, np.newaxis],
            "fashmnist":
            row_norms(self.pairs_data["fashmnist"], squared=True)[:,
                                                                  np.newaxis],
        }

        self.norm_example_data = {
            "notSparse":
            row_norms(self.example_data["notSparse"],
                      squared=True)[:, np.newaxis],
            "mnist":
            get_squared_froebenius_norm_line_wise(self.example_data["mnist"]),
            # "caltech": row_norms(self.example_data["caltech"], squared=True)[:, np.newaxis],
            "fashmnist":
            row_norms(self.example_data["fashmnist"],
                      squared=True)[:, np.newaxis],
        }

        self.gamma_data = {
            "notSparse": compute_euristic_gamma(self.pairs_data["notSparse"]),
            "mnist": compute_euristic_gamma(self.pairs_data["mnist"]),
            # "caltech": compute_euristic_gamma(self.pairs_data["caltech"]),
            "fashmnist": compute_euristic_gamma(self.pairs_data["fashmnist"]),
        }

コード例 #9

0

ファイルを表示

ファイル: qmeans_fast.py プロジェクト: lucgiffon/qkmeans

def qmeans(X_data: np.ndarray,
           K_nb_cluster: int,
           nb_iter: int,
           nb_factors: int,
           params_palm4msa: dict,
           initialization: np.ndarray,
           hierarchical_inside=False,
           delta_objective_error_threshold=1e-6,
           hierarchical_init=False):
    """

    :param X_data: The data matrix of n examples in dimensions d in shape (n, d).
    :param K_nb_cluster: The number of clusters to look for.
    :param nb_iter: The maximum number of iteration.
    :param nb_factors: The number of factors for the decomposition.
    :param initialization: The initial matrix of centroids not yet factorized.
    :param params_palm4msa: The dictionnary of parameters for the palm4msa algorithm.
    :param hierarchical_inside: Tell the algorithm if the hierarchical version of palm4msa should be used.
    :param delta_objective_error_threshold:
    :param hierarchical_init: Tells if the algorithm should make the initialization of sparse factors with the hierarchical version of palm or not.
    :return:
    """
    assert K_nb_cluster == initialization.shape[0], "The number of cluster {} is not equal to the number of centroids in the initialization {}.".format(K_nb_cluster, initialization.shape[0])

    X_data_norms = get_squared_froebenius_norm_line_wise(X_data)

    nb_examples = X_data.shape[0]

    logger.info("Initializing Qmeans")

    init_lambda = params_palm4msa["init_lambda"]
    nb_iter_palm = params_palm4msa["nb_iter"]
    lst_proj_op_by_fac_step = params_palm4msa["lst_constraint_sets"]
    residual_on_right = params_palm4msa["residual_on_right"]
    delta_objective_error_threshold_inner_palm = params_palm4msa["delta_objective_error_threshold"]
    track_objective_palm = params_palm4msa["track_objective"]

    X_centroids_hat = copy.deepcopy(initialization)

    lst_factors = init_lst_factors(K_nb_cluster, X_centroids_hat.shape[1], nb_factors)

    eye_norm = np.sqrt(K_nb_cluster)

    if hierarchical_inside or hierarchical_init:
        _lambda_tmp, op_factors, U_centroids, objective_palm, array_objective_hierarchical= \
            hierarchical_palm4msa(
                arr_X_target=np.eye(K_nb_cluster) @ X_centroids_hat,
                lst_S_init=lst_factors,
                lst_dct_projection_function=lst_proj_op_by_fac_step,
                f_lambda_init=init_lambda * eye_norm,
                nb_iter=nb_iter_palm,
                update_right_to_left=True,
                residual_on_right=residual_on_right,
                track_objective_palm=track_objective_palm,
                delta_objective_error_threshold_palm=delta_objective_error_threshold_inner_palm,
                return_objective_function=track_objective_palm)
    else:
        _lambda_tmp, op_factors, U_centroids, objective_palm, nb_iter_palm = \
            palm4msa(
                arr_X_target=np.eye(K_nb_cluster) @ X_centroids_hat,
                lst_S_init=lst_factors,
                nb_factors=len(lst_factors),
                lst_projection_functions=lst_proj_op_by_fac_step[-1][
                    "finetune"],
                f_lambda_init=init_lambda * eye_norm,
                nb_iter=nb_iter_palm,
                update_right_to_left=True,
                track_objective=track_objective_palm,
                delta_objective_error_threshold=delta_objective_error_threshold_inner_palm)

    lst_factors = None  # safe assignment for debug

    _lambda = _lambda_tmp / eye_norm

    objective_function = np.ones(nb_iter) * -1
    lst_all_objective_functions_palm = []
    lst_all_objective_functions_palm.append(objective_palm)

    i_iter = 0
    delta_objective_error = np.inf
    while ((i_iter < nb_iter) and (delta_objective_error > delta_objective_error_threshold)):

        logger.info("Iteration Qmeans {}".format(i_iter))

        lst_factors_ = op_factors.get_list_of_factors()
        op_centroids = SparseFactors([lst_factors_[1] * _lambda] + lst_factors_[2:])

        ###########################
        # Cluster assignment step #
        ###########################

        indicator_vector, distances = assign_points_to_clusters(X_data, op_centroids, X_norms=X_data_norms)



        #######################
        # Cluster update step #
        #######################

        # get the number of observation in each cluster
        cluster_names, counts = np.unique(indicator_vector, return_counts=True)
        cluster_names_sorted = np.argsort(cluster_names)

        # Update centroid location using the newly (it happens in the assess_cluster_integrity function)
        # assigned data point classes
        # and check if all clusters still have points
        # and change the object X_centroids_hat in place if some cluster have lost points (biggest cluster)
        counts, cluster_names_sorted = update_clusters_with_integrity_check(X_data,
                                                                            X_data_norms,
                                                                            X_centroids_hat, # in place changes
                                                                            K_nb_cluster,
                                                                            counts,
                                                                            indicator_vector,
                                                                            distances,
                                                                            cluster_names,
                                                                            cluster_names_sorted)

        #################
        # PALM4MSA step #
        #################

        # create the diagonal of the sqrt of those counts
        diag_counts_sqrt_normalized = csr_matrix(
            (np.sqrt(counts[cluster_names_sorted] / nb_examples),
             (np.arange(K_nb_cluster), np.arange(K_nb_cluster))))
        diag_counts_sqrt = np.sqrt(counts[cluster_names_sorted])

        # set it as first factor
        op_factors.set_factor(0, diag_counts_sqrt_normalized)


        if hierarchical_inside:
            _lambda_tmp, op_factors, _, objective_palm, array_objective_hierarchical = \
                hierarchical_palm4msa(
                    arr_X_target=diag_counts_sqrt[:, None,] *  X_centroids_hat,
                    lst_S_init=op_factors.get_list_of_factors(),
                    lst_dct_projection_function=lst_proj_op_by_fac_step,
                    f_lambda_init=_lambda * np.sqrt(nb_examples),
                    nb_iter=nb_iter_palm,
                    update_right_to_left=True,
                    residual_on_right=residual_on_right,
                    return_objective_function=track_objective_palm,
                    track_objective_palm=track_objective_palm,
                    delta_objective_error_threshold_palm=delta_objective_error_threshold_inner_palm)

        else:
            _lambda_tmp, op_factors, _, objective_palm, nb_iter_palm = \
                palm4msa(arr_X_target=diag_counts_sqrt[:, None,] *  X_centroids_hat,
                         lst_S_init=op_factors.get_list_of_factors(),
                         nb_factors=op_factors.n_factors,
                         lst_projection_functions=lst_proj_op_by_fac_step[-1][
                             "finetune"],
                         f_lambda_init=_lambda * np.sqrt(nb_examples),
                         nb_iter=nb_iter_palm,
                         update_right_to_left=True,
                         track_objective=track_objective_palm,
                         delta_objective_error_threshold=delta_objective_error_threshold_inner_palm)

        lst_all_objective_functions_palm.append(objective_palm)

        _lambda = _lambda_tmp / np.sqrt(nb_examples)

        objective_function[i_iter] = compute_objective(X_data, op_centroids, indicator_vector)
        if i_iter >= 1:
            delta_objective_error = np.abs(objective_function[i_iter] - objective_function[i_iter-1]) / objective_function[i_iter-1]

        # todo vérifier que l'erreur absolue est plus petite que le threshold plusieurs fois d'affilee

        i_iter += 1

    lst_factors_ = op_factors.get_list_of_factors()
    op_centroids = SparseFactors([lst_factors_[1] * _lambda] + lst_factors_[2:])

    return objective_function[:i_iter], op_centroids, indicator_vector, lst_all_objective_functions_palm

コード例 #10

0

ファイルを表示

ファイル: kmeans_minibatch_gradient_descent.py プロジェクト: lucgiffon/qkmeans

def kmeans_minibatch(X_data, K_nb_cluster, nb_iter, initialization,
                     batch_size):
    """

    :param X_data: The data matrix of n examples in dimensions d in shape (n, d).
    :param K_nb_cluster: The number of clusters to look for.
    :param nb_iter: The maximum number of iteration.
    :param initialization: The (K, d) matrix of centroids at initialization.
    :param batch_size: The size of each batch.
    :return:
    """

    X_data_norms = get_squared_froebenius_norm_line_wise(X_data)

    # Initialize our centroids by picking random data points
    U_centroids_hat = copy.deepcopy(initialization)
    U_centroids = U_centroids_hat
    full_indicator_vector = np.zeros(X_data.shape[0], dtype=int)
    full_count_vector = np.zeros(K_nb_cluster, dtype=int)
    objective_function = np.empty((nb_iter, ))

    # Loop for the maximum number of iterations
    i_iter = 0
    delta_objective_error_threshold = 1e-6
    delta_objective_error = np.inf
    while True:
        for i_iter, example_batch_indexes in enumerate(
                DataGenerator(X_data,
                              batch_size=batch_size,
                              return_indexes=True)):
            if not (delta_objective_error > delta_objective_error_threshold):
                logger.info(
                    "not (delta_objective_error {}-{}={} > delta_objective_error_threshold {})"
                    .format(objective_function[i_iter],
                            objective_function[i_iter - 1],
                            delta_objective_error,
                            delta_objective_error_threshold))
                break

            example_batch = X_data[example_batch_indexes]

            logger.info("Iteration Kmeans {}".format(i_iter))

            indicator_vector, distances = assign_points_to_clusters(
                example_batch,
                U_centroids,
                X_norms=X_data_norms[example_batch_indexes])
            full_indicator_vector[example_batch_indexes] = indicator_vector

            cluster_names, counts = np.unique(indicator_vector,
                                              return_counts=True)
            # cluster_names_sorted = np.argsort(cluster_names)
            #
            count_vector = np.zeros(K_nb_cluster, dtype=int)
            count_vector[cluster_names] = counts

            full_count_vector += count_vector
            # previous_full_count_vector = full_count_vector - count_vector

            # Update centroid location using the newly
            # assigned data point classes
            # This way of updating the centroids (centroid index wise) is better than the one proposed in the paper "Web-Scale K-Means Clustering"
            # as the number of update with always be <= batch_size
            for c in range(K_nb_cluster):
                if full_count_vector[c] != 0 and count_vector[c] != 0:
                    U_centroids_hat[c] += (1 / full_count_vector[c]) * np.sum(
                        example_batch[indicator_vector == c] -
                        U_centroids_hat[c],
                        axis=0)
                    # this is exactly equivalent to an update of the mean:
                    # U_centroids_hat[c] = (previous_full_count_vector[c] / full_count_vector[c]) * U_centroids_hat[c] + (1 / full_count_vector[c]) * np.sum(example_batch[indicator_vector == c], axis=0)

            # for i_ex, ex in enumerate(example_batch):
            #     c = indicator_vector[i_ex]
            #     full_count_vector[c] += 1
            #     eta = 1./full_count_vector[c]
            #     U_centroids_hat[c] = (1-eta) * U_centroids_hat[c] + eta * ex

            # counts, cluster_names_sorted = assess_clusters_integrity(X_data,
            #                                                          X_data_norms,
            #                                                          U_centroids_hat,
            #                                                          K_nb_cluster,
            #                                                          counts,
            #                                                          indicator_vector,
            #                                                          distances,
            #                                                          cluster_names,
            #                                                          cluster_names_sorted)

            # check if all clusters still have points
            # for c in range(K_nb_cluster):
            #     biggest_cluster_index = np.argmax(counts)  # type: int
            #     biggest_cluster = cluster_names[biggest_cluster_index]
            #     biggest_cluster_data = X_data[indicator_vector == biggest_cluster]
            #
            #     cluster_data = X_data[indicator_vector == c]
            #     if len(cluster_data) == 0:
            #         logger.warning("cluster has lost data, add new cluster. cluster idx: {}".format(c))
            #         U_centroids_hat[c] = biggest_cluster_data[np.random.randint(len(biggest_cluster_data))].reshape(1, -1)
            #         counts = list(counts)
            #         counts[biggest_cluster_index] -= 1
            #         counts.append(1)
            #         counts = np.array(counts)
            #         cluster_names_sorted = list(cluster_names_sorted)
            #         cluster_names_sorted.append(c)
            #         cluster_names_sorted = np.array(cluster_names_sorted)
            #     else:
            #         U_centroids_hat[c] = np.mean(X_data[indicator_vector == c], 0)

            U_centroids = U_centroids_hat

            objective_function[i_iter, ] = compute_objective(
                X_data, U_centroids, full_indicator_vector)

            if i_iter >= 1:
                delta_objective_error = np.abs(
                    objective_function[i_iter] - objective_function[i_iter - 1]
                ) / objective_function[
                    i_iter -
                    1]  # todo vérifier que l'erreur absolue est plus petite que le threshold plusieurs fois d'affilée

            i_iter += 1
        else:
            continue
        break

    return objective_function[:i_iter], U_centroids, indicator_vector

コード例 #11

0

ファイルを表示

ファイル: 5_6_qmeans_minibatch_new_expes.py プロジェクト: lucgiffon/qkmeans

def make_nystrom_evaluation(x_train, y_train, x_test, y_test, U_centroids):
    """
    Evaluation Nystrom construction time and approximation precision.

    The approximation is based on a subsample of size n_sample of the input data set.

    :param x_train: Input dataset as ndarray.
    :param U_centroids: The matrix of centroids as ndarray or SparseFactor object
    :param n_sample: The number of sample to take into account in the reconstruction (can't be too large)

    :return:
    """

    n_sample = paraman["--nystrom"]
    if n_sample > x_train.shape[0]:
        logger.warning("Batch size for nystrom evaluation is bigger than data size. {} > {}. Using "
                       "data size instead.".format(n_sample, x_train.shape[0]))
        n_sample = x_train.shape[0]
        paraman["--nystrom"] = n_sample

    # Compute euristic gamma as the mean of euclidian distance between example
    gamma = compute_euristic_gamma(x_train)
    log_memory_usage("Memory after euristic gamma computation in make_nystrom_evaluation")
    # precompute the centroids norm for later use (optimization)
    centroids_norm = get_squared_froebenius_norm_line_wise(U_centroids)[:, np.newaxis]
    # centroids_norm = None

    indexes_samples = np.random.permutation(x_train.shape[0])[:n_sample]
    sample = x_train[indexes_samples]
    samples_norm = None
    log_memory_usage("Memory after sample selection in make_nystrom_evaluation")

    ########################
    # Nystrom on centroids #
    ########################
    logger.info("Build Nystrom on centroids")
    ## TIME: nystrom build time
    # nystrom build time is Nystrom preparation time for later use.
    ## START
    nystrom_build_start_time = time.process_time()
    metric = prepare_nystrom(U_centroids, centroids_norm, gamma=gamma)
    nystrom_build_stop_time = time.process_time()
    log_memory_usage("Memory after SVD computation in make_nystrom_evaluation")
    # STOP
    nystrom_build_time = nystrom_build_stop_time - nystrom_build_start_time

    ## TIME: nystrom inference time
    # Nystrom inference time is the time for Nystrom transformation for all the samples.
    ## START
    nystrom_inference_time_start = time.process_time()
    nystrom_embedding = nystrom_transformation(sample, U_centroids, metric, centroids_norm, samples_norm, gamma=gamma)
    nystrom_approx_kernel_value = nystrom_embedding @ nystrom_embedding.T
    nystrom_inference_time_stop = time.process_time()
    log_memory_usage("Memory after kernel matrix approximation in make_nystrom_evaluation")
    ## STOP
    nystrom_inference_time = (nystrom_inference_time_stop - nystrom_inference_time_start) / n_sample

    ################################################################

    ######################
    # Nystrom on uniform #
    ######################
    logger.info("Build Nystrom on uniform sampling")

    indexes_uniform_samples = np.random.permutation(x_train.shape[0])[:U_centroids.shape[0]]
    uniform_sample = x_train[indexes_uniform_samples]
    uniform_sample_norm = get_squared_froebenius_norm_line_wise(uniform_sample)[:, np.newaxis]
    log_memory_usage("Memory after uniform sample selection in make_nystrom_evaluation")

    metric_uniform = prepare_nystrom(uniform_sample, uniform_sample_norm, gamma=gamma)
    log_memory_usage("Memory after SVD computation in uniform part of make_nystrom_evaluation")

    nystrom_embedding_uniform = nystrom_transformation(sample, uniform_sample, metric_uniform, uniform_sample_norm, samples_norm, gamma=gamma)
    nystrom_approx_kernel_value_uniform = nystrom_embedding_uniform @ nystrom_embedding_uniform.T

    #################################################################

    ###############
    # Real Kernel #
    ###############
    logger.info("Compute real kernel matrix")

    real_kernel_special = special_rbf_kernel(sample, sample, gamma, norm_X=samples_norm, norm_Y=samples_norm)
    # real_kernel = rbf_kernel(sample, sample, gamma)
    real_kernel_norm = np.linalg.norm(real_kernel_special)
    log_memory_usage("Memory after real kernel computation in make_nystrom_evaluation")

    #################################
    # Sklearn based Nystrom uniform #
    #################################

    # sklearn_nystrom = Nystroem(gamma=gamma, n_components=uniform_sample.shape[0])
    # sklearn_nystrom = sklearn_nystrom.fit(uniform_sample)
    # sklearn_transfo = sklearn_nystrom.transform(sample)
    # kernel_sklearn_nys = sklearn_transfo  @ sklearn_transfo.T

    ################################################################

    ####################
    # Error evaluation #
    ####################

    sampled_froebenius_norm = np.linalg.norm(nystrom_approx_kernel_value - real_kernel_special) / real_kernel_norm
    sampled_froebenius_norm_uniform = np.linalg.norm(nystrom_approx_kernel_value_uniform - real_kernel_special) / real_kernel_norm

    # svm evaluation
    if x_test is not None:
        logger.info("Start classification")

        time_classification_start = time.process_time()
        x_train_nystrom_embedding = nystrom_transformation(x_train, U_centroids, metric, centroids_norm, None, gamma=gamma)
        x_test_nystrom_embedding = nystrom_transformation(x_test, U_centroids, metric, centroids_norm, None, gamma=gamma)

        linear_svc_clf = LinearSVC(class_weight="balanced")
        linear_svc_clf.fit(x_train_nystrom_embedding, y_train)
        predictions = linear_svc_clf.predict(x_test_nystrom_embedding)
        time_classification_stop = time.process_time()

        if paraman["--kddcup04"]:
            # compute recall: nb_true_positive/real_nb_positive
            recall = np.sum(predictions[y_test == 1])/np.sum(y_test[y_test == 1])
            # compute precision: nb_true_positive/nb_positive
            precision = np.sum(predictions[y_test == 1])/np.sum(predictions[predictions==1])
            f1 = 2 * precision * recall / (precision + recall)
            accuracy_nystrom_svm = f1
        else:
            accuracy_nystrom_svm = np.sum(predictions == y_test) / y_test.shape[0]

        delta_time_classification = time_classification_stop - time_classification_start
    else:
        accuracy_nystrom_svm = None
        delta_time_classification = None

    nystrom_results = {
        "nystrom_build_time": nystrom_build_time,
        "nystrom_inference_time": nystrom_inference_time,
        "nystrom_sampled_error_reconstruction": sampled_froebenius_norm,
        "nystrom_sampled_error_reconstruction_uniform": sampled_froebenius_norm_uniform,
        "nystrom_svm_accuracy": accuracy_nystrom_svm,
        "nystrom_svm_time": delta_time_classification
    }

    resprinter.add(nystrom_results)

コード例 #12

0

ファイルを表示

ファイル: 2_3_qmeans_minibatch.py プロジェクト: lucgiffon/qkmeans

def make_nystrom_evaluation(x_train, y_train, x_test, y_test, U_centroids):
    """
    Evaluation Nystrom construction time and approximation precision.

    The approximation is based on a subsample of size n_sample of the input data set.

    :param x_train: Input dataset as ndarray.
    :param U_centroids: The matrix of centroids as ndarray or SparseFactor object
    :param n_sample: The number of sample to take into account in the reconstruction (can't be too large)

    :return:
    """
    def prepare_nystrom(landmarks, landmarks_norm):
        basis_kernel_W = special_rbf_kernel(landmarks, landmarks, gamma,
                                            landmarks_norm, landmarks_norm)
        U, S, V = np.linalg.svd(basis_kernel_W)
        S = np.maximum(S, 1e-12)
        normalization_ = np.dot(U / np.sqrt(S), V)

        return normalization_

    def nystrom_transformation(x_input, landmarks, p_metric, landmarks_norm,
                               x_input_norm):
        nystrom_embedding = special_rbf_kernel(landmarks, x_input, gamma,
                                               landmarks_norm,
                                               x_input_norm).T @ p_metric
        return nystrom_embedding

    n_sample = paraman["--nystrom"]
    if n_sample > x_train.shape[0]:
        logger.warning(
            "Batch size for nystrom evaluation is bigger than data size. {} > {}. Using "
            "data size instead.".format(n_sample, x_train.shape[0]))
        n_sample = x_train.shape[0]
        paraman["--nystrom"] = n_sample

    # Compute euristic gamma as the mean of euclidian distance between example
    gamma = compute_euristic_gamma(x_train)
    log_memory_usage(
        "Memory after euristic gamma computation in make_nystrom_evaluation")
    # precompute the centroids norm for later use (optimization)
    centroids_norm = get_squared_froebenius_norm_line_wise(U_centroids)
    # centroids_norm = None

    indexes_samples = np.random.permutation(x_train.shape[0])[:n_sample]
    sample = x_train[indexes_samples]
    samples_norm = None
    log_memory_usage(
        "Memory after sample selection in make_nystrom_evaluation")

    ########################
    # Nystrom on centroids #
    ########################
    logger.info("Build Nystrom on centroids")
    ## TIME: nystrom build time
    # nystrom build time is Nystrom preparation time for later use.
    ## START
    nystrom_build_start_time = time.process_time()
    metric = prepare_nystrom(U_centroids, centroids_norm)
    nystrom_build_stop_time = time.process_time()
    log_memory_usage("Memory after SVD computation in make_nystrom_evaluation")
    # STOP
    nystrom_build_time = nystrom_build_stop_time - nystrom_build_start_time

    ## TIME: nystrom inference time
    # Nystrom inference time is the time for Nystrom transformation for all the samples.
    ## START
    nystrom_inference_time_start = time.process_time()
    nystrom_embedding = nystrom_transformation(sample, U_centroids, metric,
                                               centroids_norm, samples_norm)
    nystrom_approx_kernel_value = nystrom_embedding @ nystrom_embedding.T
    nystrom_inference_time_stop = time.process_time()
    log_memory_usage(
        "Memory after kernel matrix approximation in make_nystrom_evaluation")
    ## STOP
    nystrom_inference_time = (nystrom_inference_time_stop -
                              nystrom_inference_time_start) / n_sample

    ################################################################

    ######################
    # Nystrom on uniform #
    ######################
    logger.info("Build Nystrom on uniform sampling")

    indexes_uniform_samples = np.random.permutation(
        x_train.shape[0])[:U_centroids.shape[0]]
    uniform_sample = x_train[indexes_uniform_samples]
    uniform_sample_norm = None
    log_memory_usage(
        "Memory after uniform sample selection in make_nystrom_evaluation")

    metric_uniform = prepare_nystrom(uniform_sample, uniform_sample_norm)
    log_memory_usage(
        "Memory after SVD computation in uniform part of make_nystrom_evaluation"
    )

    nystrom_embedding_uniform = nystrom_transformation(sample, uniform_sample,
                                                       metric_uniform,
                                                       uniform_sample_norm,
                                                       samples_norm)
    nystrom_approx_kernel_value_uniform = nystrom_embedding_uniform @ nystrom_embedding_uniform.T

    #################################################################

    ###############
    # Real Kernel #
    ###############
    logger.info("Compute real kernel matrix")

    real_kernel = special_rbf_kernel(sample, sample, gamma, samples_norm,
                                     samples_norm)
    real_kernel_norm = np.linalg.norm(real_kernel)
    log_memory_usage(
        "Memory after real kernel computation in make_nystrom_evaluation")

    ################################################################

    ####################
    # Error evaluation #
    ####################

    sampled_froebenius_norm = np.linalg.norm(nystrom_approx_kernel_value -
                                             real_kernel) / real_kernel_norm
    sampled_froebenius_norm_uniform = np.linalg.norm(
        nystrom_approx_kernel_value_uniform - real_kernel) / real_kernel_norm

    # svm evaluation
    if x_test is not None:
        logger.info("Start classification")

        time_classification_start = time.process_time()
        x_train_nystrom_embedding = nystrom_transformation(
            x_train, U_centroids, metric, centroids_norm, None)
        x_test_nystrom_embedding = nystrom_transformation(
            x_test, U_centroids, metric, centroids_norm, None)

        linear_svc_clf = LinearSVC()
        linear_svc_clf.fit(x_train_nystrom_embedding, y_train)
        accuracy_nystrom_svm = linear_svc_clf.score(x_test_nystrom_embedding,
                                                    y_test)
        time_classification_stop = time.process_time()

        delta_time_classification = time_classification_stop - time_classification_start
    else:
        accuracy_nystrom_svm = None
        delta_time_classification = None

    nystrom_results = {
        "nystrom_build_time": nystrom_build_time,
        "nystrom_inference_time": nystrom_inference_time,
        "nystrom_sampled_error_reconstruction": sampled_froebenius_norm,
        "nystrom_sampled_error_reconstruction_uniform":
        sampled_froebenius_norm_uniform,
        "nystrom_svm_accuracy": accuracy_nystrom_svm,
        "nystrom_svm_time": delta_time_classification
    }

    resprinter.add(nystrom_results)

コード例 #13

0

ファイルを表示

def qmeans(X_data: np.ndarray,
           K_nb_cluster: int,
           nb_iter: int,
           nb_factors: int,
           params_palm4msa: dict,
           initialization: np.ndarray,
           hierarchical_inside=False,
           graphical_display=False):
    """
    :param X_data: The data matrix of n examples in dimensions d in shape (n, d).
    :param K_nb_cluster: The number of clusters to look for.
    :param nb_iter: The maximum number of iteration.
    :param nb_factors: The number of factors for the decomposition.
    :param initialization: The initial matrix of centroids not yet factorized.
    :param params_palm4msa: The dictionnary of parameters for the palm4msa algorithm.
    :param hierarchical_inside: Tell the algorithm if the hierarchical version of palm4msa should be used.
    :param graphical_display: Tell the algorithm to display the results.
    :return:
    """

    assert K_nb_cluster == initialization.shape[0]

    X_data_norms = get_squared_froebenius_norm_line_wise(X_data)

    init_lambda = params_palm4msa["init_lambda"]
    nb_iter_palm = params_palm4msa["nb_iter"]
    lst_proj_op_by_fac_step = params_palm4msa["lst_constraint_sets"]
    residual_on_right = params_palm4msa["residual_on_right"]

    X_centroids_hat = copy.deepcopy(initialization)
    min_K_d = min(X_centroids_hat.shape)

    lst_factors = [np.eye(min_K_d) for _ in range(nb_factors)]

    eye_norm = np.sqrt(K_nb_cluster)
    lst_factors[0] = np.eye(K_nb_cluster) / eye_norm
    lst_factors[1] = np.eye(K_nb_cluster, min_K_d)
    lst_factors[-1] = np.zeros((min_K_d, X_centroids_hat.shape[1]))

    if graphical_display:
        lst_factors_init = copy.deepcopy(lst_factors)

    _lambda_tmp, lst_factors, U_centroids, nb_iter_by_factor, objective_palm = hierarchical_palm4msa(
        arr_X_target=np.eye(K_nb_cluster) @ X_centroids_hat,
        lst_S_init=lst_factors,
        lst_dct_projection_function=lst_proj_op_by_fac_step,
        f_lambda_init=init_lambda * eye_norm,
        nb_iter=nb_iter_palm,
        update_right_to_left=True,
        residual_on_right=residual_on_right,
        graphical_display=False)

    _lambda = _lambda_tmp / eye_norm

    if graphical_display:
        if hierarchical_inside:
            plt.figure()
            plt.yscale("log")
            plt.scatter(np.arange(len(objective_palm) * 3, step=3),
                        objective_palm[:, 0],
                        marker="x",
                        label="before split")
            plt.scatter(np.arange(len(objective_palm) * 3, step=3) + 1,
                        objective_palm[:, 1],
                        marker="x",
                        label="between")
            plt.scatter(np.arange(len(objective_palm) * 3, step=3) + 2,
                        objective_palm[:, 2],
                        marker="x",
                        label="after finetune")
            plt.plot(np.arange(len(objective_palm) * 3),
                     objective_palm.flatten(),
                     color="k")
            plt.legend()
            plt.show()

        visual_evaluation_palm4msa(
            np.eye(K_nb_cluster) @ X_centroids_hat, lst_factors_init,
            lst_factors, _lambda * multi_dot(lst_factors))

    objective_function = np.empty((nb_iter, 2))

    # Loop for the maximum number of iterations
    i_iter = 0
    delta_objective_error_threshold = 1e-6
    delta_objective_error = np.inf
    while (i_iter <= 1) or (
        (i_iter < nb_iter) and
        (delta_objective_error > delta_objective_error_threshold)):

        logger.info("Iteration Qmeans {}".format(i_iter))

        U_centroids = _lambda * multi_dot(lst_factors[1:])

        if i_iter > 0:
            objective_function[i_iter,
                               0] = compute_objective(X_data, U_centroids,
                                                      indicator_vector)

        # Assign all points to the nearest centroid
        # first get distance from all points to all centroids
        distances = get_distances(X_data,
                                  U_centroids,
                                  precomputed_data_points_norm=X_data_norms)
        # then, Determine class membership of each point
        # by picking the closest centroid
        indicator_vector = np.argmin(distances, axis=1)

        objective_function[i_iter,
                           1] = compute_objective(X_data, U_centroids,
                                                  indicator_vector)

        # Update centroid location using the newly
        # assigned data point classes
        for c in range(K_nb_cluster):
            X_centroids_hat[c] = np.mean(X_data[indicator_vector == c], 0)

        # get the number of observation in each cluster
        cluster_names, counts = np.unique(indicator_vector, return_counts=True)
        cluster_names_sorted = np.argsort(cluster_names)

        if len(counts) < K_nb_cluster:
            raise ValueError(
                "Some clusters have no point. Aborting iteration {}".format(
                    i_iter))

        diag_counts_sqrt = np.diag(np.sqrt(
            counts[cluster_names_sorted]))  # todo use sparse matrix object
        diag_counts_sqrt_norm = np.linalg.norm(
            diag_counts_sqrt
        )  # todo analytic sqrt(n) instead of cumputing it with norm
        diag_counts_sqrt_normalized = diag_counts_sqrt / diag_counts_sqrt_norm
        # set it as first factor
        lst_factors[0] = diag_counts_sqrt_normalized

        if graphical_display:
            lst_factors_init = copy.deepcopy(lst_factors)

        if hierarchical_inside:
            _lambda_tmp, lst_factors, _, nb_iter_by_factor, objective_palm = hierarchical_palm4msa(
                arr_X_target=diag_counts_sqrt @ X_centroids_hat,
                lst_S_init=lst_factors,
                lst_dct_projection_function=lst_proj_op_by_fac_step,
                # f_lambda_init=_lambda,
                f_lambda_init=_lambda * diag_counts_sqrt_norm,
                nb_iter=nb_iter_palm,
                update_right_to_left=True,
                residual_on_right=residual_on_right,
                graphical_display=False)

            loss_palm_before = objective_palm[0, 0]
            loss_palm_after = objective_palm[-1, -1]

        else:
            _lambda_tmp, lst_factors, _, objective_palm, nb_iter_palm = palm4msa(
                arr_X_target=diag_counts_sqrt @ X_centroids_hat,
                lst_S_init=lst_factors,
                nb_factors=len(lst_factors),
                lst_projection_functions=lst_proj_op_by_fac_step[-1]
                ["finetune"],
                f_lambda_init=_lambda * diag_counts_sqrt_norm,
                nb_iter=nb_iter_palm,
                update_right_to_left=True,
                graphical_display=False)

            loss_palm_before = objective_palm[0, -1]
            loss_palm_after = objective_palm[-1, -1]

        logger.debug("Loss palm before: {}".format(loss_palm_before))
        logger.debug("Loss palm after: {}".format(loss_palm_after))

        if graphical_display:
            if hierarchical_inside:
                plt.figure()
                plt.yscale("log")
                plt.scatter(np.arange(len(objective_palm) * 3, step=3),
                            objective_palm[:, 0],
                            marker="x",
                            label="before split")
                plt.scatter(np.arange(len(objective_palm) * 3, step=3) + 1,
                            objective_palm[:, 1],
                            marker="x",
                            label="between")
                plt.scatter(np.arange(len(objective_palm) * 3, step=3) + 2,
                            objective_palm[:, 2],
                            marker="x",
                            label="after finetune")
                plt.plot(np.arange(len(objective_palm) * 3),
                         objective_palm.flatten(),
                         color="k")
                plt.legend()
                plt.show()

            visual_evaluation_palm4msa(diag_counts_sqrt @ X_centroids_hat,
                                       lst_factors_init, lst_factors,
                                       _lambda_tmp * multi_dot(lst_factors))

        _lambda = _lambda_tmp / diag_counts_sqrt_norm

        logger.debug("Returned loss (with diag) palm: {}".format(
            objective_palm[-1, 0]))

        if i_iter >= 2:
            delta_objective_error = np.abs(
                objective_function[i_iter, 0] -
                objective_function[i_iter - 1, 0]
            ) / objective_function[
                i_iter - 1,
                0]  # todo vérifier que l'erreur absolue est plus petite que le threshold plusieurs fois d'affilée

        i_iter += 1

    U_centroids = _lambda * multi_dot(lst_factors[1:])
    distances = get_distances(X_data,
                              U_centroids,
                              precomputed_data_points_norm=X_data_norms)
    indicator_vector = np.argmin(distances, axis=1)

    return objective_function[:i_iter], U_centroids, indicator_vector

コード例 #14

0

ファイルを表示

ファイル: qmeans_objective_function_analysis_fast_precompute_centroids_norm.py プロジェクト: lucgiffon/qkmeans

def make_nystrom_evaluation(x_train, U_centroids):
    """
    Evaluation Nystrom construction time and approximation precision.

    The approximation is based on a subsample of size n_sample of the input data set.

    :param x_train: Input dataset as ndarray.
    :param U_centroids: The matrix of centroids as ndarray or SparseFactor object
    :param n_sample: The number of sample to take into account in the reconstruction (can't be too large)

    :return:
    """
    n_sample = paraman["--nystrom"]
    if n_sample > x_train.shape[0]:
        logger.warning(
            "Batch size for nystrom evaluation is bigger than data size. {} > {}. Using "
            "data size instead.".format(n_sample, x_train.shape[0]))
        n_sample = x_train.shape[0]
        paraman["--nystrom"] = n_sample

    # Compute euristic gamma as the mean of euclidian distance between example
    gamma = compute_euristic_gamma(x_train)

    # precompute the centroids norm for later use (optimization)
    centroids_norm = get_squared_froebenius_norm_line_wise(U_centroids)

    ## TIME: nystrom build time
    # nystrom build time is Nystrom preparation time for later use.
    ## START
    nystrom_build_start_time = time.time()
    basis_kernel_W = special_rbf_kernel(U_centroids, U_centroids, gamma,
                                        centroids_norm, centroids_norm)
    U, S, V = np.linalg.svd(basis_kernel_W)
    S = np.maximum(S, 1e-12)
    normalization_ = np.dot(U / np.sqrt(S), V)
    nystrom_build_stop_time = time.time()
    # STOP

    nystrom_build_time = nystrom_build_stop_time - nystrom_build_start_time

    indexes_samples = np.random.permutation(x_train.shape[0])[:n_sample]
    sample = x_train[indexes_samples]

    samples_norm = np.linalg.norm(sample, axis=1)**2

    real_kernel = special_rbf_kernel(sample, sample, gamma, samples_norm,
                                     samples_norm)

    ## TIME: nystrom inference time
    # Nystrom inference time is the time for Nystrom transformation for all the samples.
    ## START
    nystrom_inference_time_start = time.time()
    nystrom_embedding = special_rbf_kernel(U_centroids, sample, gamma,
                                           centroids_norm,
                                           samples_norm).T @ normalization_
    nystrom_approx_kernel_value = nystrom_embedding @ nystrom_embedding.T
    nystrom_inference_time_stop = time.time()
    ## STOP

    nystrom_inference_time = (nystrom_inference_time_stop -
                              nystrom_inference_time_start) / n_sample

    sampled_froebenius_norm = np.linalg.norm(nystrom_approx_kernel_value -
                                             real_kernel)

    nystrom_results = {
        "nystrom_build_time": nystrom_build_time,
        "nystrom_inference_time": nystrom_inference_time,
        "nystrom_sampled_error_reconstruction": sampled_froebenius_norm
    }

    resprinter.add(nystrom_results)

コード例 #15

0

ファイルを表示

ファイル: kmeans.py プロジェクト: lucgiffon/qkmeans

def kmeans(X_data,
           K_nb_cluster,
           nb_iter,
           initialization,
           delta_objective_error_threshold=1e-6,
           proj_l1=False,
           _lambda=None,
           epsilon=None):
    """

    :param X_data: The data matrix of n examples in dimensions d in shape (n, d).
    :param K_nb_cluster: The number of clusters to look for.
    :param nb_iter: The maximum number of iteration.
    :param initialization: The (K, d) matrix of centroids at initialization.
    :param delta_objective_error_threshold: The normalized difference between the error criterion at 2 successive step must be greater or equal to that value.
    :return:
    """

    X_data_norms = get_squared_froebenius_norm_line_wise(X_data)

    # Initialize our centroids by picking random data points
    U_centroids_hat = copy.deepcopy(initialization)
    U_centroids = U_centroids_hat

    objective_function = np.empty((nb_iter, ))

    # Loop for the maximum number of iterations
    i_iter = 0
    delta_objective_error = np.inf
    while (i_iter == 0) or (
        (i_iter < nb_iter) and
        (delta_objective_error > delta_objective_error_threshold)):

        logger.info("Iteration Kmeans {}".format(i_iter))

        indicator_vector, distances = assign_points_to_clusters(
            X_data, U_centroids, X_norms=X_data_norms)

        cluster_names, counts = np.unique(indicator_vector, return_counts=True)
        cluster_names_sorted = np.argsort(cluster_names)

        # Update centroid location using the new indicator vector
        counts, cluster_names_sorted = update_clusters_with_integrity_check(
            X_data, X_data_norms, U_centroids_hat, K_nb_cluster, counts,
            indicator_vector, distances, cluster_names, cluster_names_sorted)

        U_centroids = U_centroids_hat

        if proj_l1:
            if _lambda is None or epsilon is None:
                raise ValueError(
                    "epsilon and lambda must be set if proj_l1 is True")
            for i_centroid, centroid in enumerate(U_centroids):
                U_centroids[i_centroid, :] = proj_onto_l1_ball(
                    _lambda=_lambda, epsilon_tol=epsilon, vec=centroid)

        objective_function[i_iter, ] = compute_objective(
            X_data, U_centroids, indicator_vector)

        if i_iter >= 1:
            delta_objective_error = np.abs(objective_function[i_iter] -
                                           objective_function[i_iter - 1]
                                           ) / objective_function[i_iter - 1]

        i_iter += 1

    return objective_function[:i_iter], U_centroids, indicator_vector