def get_objective_value(X_data, op_centroids, indicator_vector):
    logger.info("Compute objective")
    if paraman["--minibatch"]:
        final_objective_value = compute_objective_by_batch(
            X_data, op_centroids, indicator_vector, paraman["--minibatch"])
    else:
        final_objective_value = compute_objective(
            X_data, op_centroids, indicator_vector)

    resprinter.add({
        "final_objective_value": final_objective_value,
    })

    return final_objective_value
def qmeans(X_data: np.ndarray, K_nb_cluster: int, nb_iter: int,
           nb_factors: int, params_palm4msa: dict,
           initialization: np.ndarray, hierarchical_inside=False,
           delta_objective_error_threshold=1e-6, hierarchical_init=False):
    """
    :param X_data: The data matrix of n examples in dimension d, in shape (n, d).
    :param K_nb_cluster: The number of clusters to look for.
    :param nb_iter: The maximum number of iterations.
    :param nb_factors: The number of factors for the decomposition.
    :param initialization: The initial matrix of centroids, not yet factorized.
    :param params_palm4msa: The dictionary of parameters for the palm4msa algorithm.
    :param hierarchical_inside: Tells the algorithm whether the hierarchical version
        of palm4msa should be used at each iteration.
    :param delta_objective_error_threshold: The algorithm stops once the normalized
        difference between the objective values of two successive iterations falls
        below this threshold.
    :param hierarchical_init: Tells whether the initialization of the sparse factors
        should be done with the hierarchical version of palm or not.
    :return:
    """
    assert K_nb_cluster == initialization.shape[0], \
        "The number of clusters {} is not equal to the number of centroids " \
        "in the initialization {}.".format(K_nb_cluster,
                                           initialization.shape[0])

    X_data_norms = get_squared_froebenius_norm_line_wise(X_data)
    nb_examples = X_data.shape[0]

    logger.info("Initializing Qmeans")

    init_lambda = params_palm4msa["init_lambda"]
    nb_iter_palm = params_palm4msa["nb_iter"]
    lst_proj_op_by_fac_step = params_palm4msa["lst_constraint_sets"]
    residual_on_right = params_palm4msa["residual_on_right"]
    delta_objective_error_threshold_inner_palm = params_palm4msa[
        "delta_objective_error_threshold"]
    track_objective_palm = params_palm4msa["track_objective"]

    X_centroids_hat = copy.deepcopy(initialization)

    lst_factors = init_lst_factors(K_nb_cluster, X_centroids_hat.shape[1],
                                   nb_factors)

    eye_norm = np.sqrt(K_nb_cluster)

    if hierarchical_inside or hierarchical_init:
        _lambda_tmp, op_factors, U_centroids, objective_palm, array_objective_hierarchical = \
            hierarchical_palm4msa(
                arr_X_target=np.eye(K_nb_cluster) @ X_centroids_hat,
                lst_S_init=lst_factors,
                lst_dct_projection_function=lst_proj_op_by_fac_step,
                f_lambda_init=init_lambda * eye_norm,
                nb_iter=nb_iter_palm,
                update_right_to_left=True,
                residual_on_right=residual_on_right,
                track_objective_palm=track_objective_palm,
                delta_objective_error_threshold_palm=delta_objective_error_threshold_inner_palm,
                return_objective_function=track_objective_palm)
    else:
        _lambda_tmp, op_factors, U_centroids, objective_palm, nb_iter_palm = \
            palm4msa(
                arr_X_target=np.eye(K_nb_cluster) @ X_centroids_hat,
                lst_S_init=lst_factors,
                nb_factors=len(lst_factors),
                lst_projection_functions=lst_proj_op_by_fac_step[-1]["finetune"],
                f_lambda_init=init_lambda * eye_norm,
                nb_iter=nb_iter_palm,
                update_right_to_left=True,
                track_objective=track_objective_palm,
                delta_objective_error_threshold=delta_objective_error_threshold_inner_palm)

    lst_factors = None  # safe assignment for debug

    _lambda = _lambda_tmp / eye_norm

    objective_function = np.ones(nb_iter) * -1
    lst_all_objective_functions_palm = []
    lst_all_objective_functions_palm.append(objective_palm)

    i_iter = 0
    delta_objective_error = np.inf
    while (i_iter < nb_iter) and (delta_objective_error > delta_objective_error_threshold):

        logger.info("Iteration Qmeans {}".format(i_iter))

        lst_factors_ = op_factors.get_list_of_factors()
        op_centroids = SparseFactors([lst_factors_[1] * _lambda]
                                     + lst_factors_[2:])

        ###########################
        # Cluster assignment step #
        ###########################
        indicator_vector, distances = assign_points_to_clusters(
            X_data, op_centroids, X_norms=X_data_norms)

        #######################
        # Cluster update step #
        #######################
        # get the number of observations in each cluster
        cluster_names, counts = np.unique(indicator_vector, return_counts=True)
        cluster_names_sorted = np.argsort(cluster_names)

        # Update the centroid locations with the newly assigned data point classes
        # (this happens inside update_clusters_with_integrity_check), check that all
        # clusters still have points, and modify X_centroids_hat in place when a
        # cluster has lost all its points (it is re-seeded from the biggest cluster).
        counts, cluster_names_sorted = update_clusters_with_integrity_check(
            X_data,
            X_data_norms,
            X_centroids_hat,  # in-place changes
            K_nb_cluster,
            counts,
            indicator_vector,
            distances,
            cluster_names,
            cluster_names_sorted)

        #################
        # PALM4MSA step #
        #################
        # create the diagonal matrix of the square roots of the cluster counts
        diag_counts_sqrt_normalized = csr_matrix(
            (np.sqrt(counts[cluster_names_sorted] / nb_examples),
             (np.arange(K_nb_cluster), np.arange(K_nb_cluster))))
        diag_counts_sqrt = np.sqrt(counts[cluster_names_sorted])

        # set it as the first factor
        op_factors.set_factor(0, diag_counts_sqrt_normalized)

        if hierarchical_inside:
            _lambda_tmp, op_factors, _, objective_palm, array_objective_hierarchical = \
                hierarchical_palm4msa(
                    arr_X_target=diag_counts_sqrt[:, None] * X_centroids_hat,
                    lst_S_init=op_factors.get_list_of_factors(),
                    lst_dct_projection_function=lst_proj_op_by_fac_step,
                    f_lambda_init=_lambda * np.sqrt(nb_examples),
                    nb_iter=nb_iter_palm,
                    update_right_to_left=True,
                    residual_on_right=residual_on_right,
                    return_objective_function=track_objective_palm,
                    track_objective_palm=track_objective_palm,
                    delta_objective_error_threshold_palm=delta_objective_error_threshold_inner_palm)
        else:
            _lambda_tmp, op_factors, _, objective_palm, nb_iter_palm = \
                palm4msa(
                    arr_X_target=diag_counts_sqrt[:, None] * X_centroids_hat,
                    lst_S_init=op_factors.get_list_of_factors(),
                    nb_factors=op_factors.n_factors,
                    lst_projection_functions=lst_proj_op_by_fac_step[-1]["finetune"],
                    f_lambda_init=_lambda * np.sqrt(nb_examples),
                    nb_iter=nb_iter_palm,
                    update_right_to_left=True,
                    track_objective=track_objective_palm,
                    delta_objective_error_threshold=delta_objective_error_threshold_inner_palm)

        lst_all_objective_functions_palm.append(objective_palm)

        _lambda = _lambda_tmp / np.sqrt(nb_examples)

        objective_function[i_iter] = compute_objective(X_data, op_centroids,
                                                       indicator_vector)
        if i_iter >= 1:
            delta_objective_error = np.abs(
                objective_function[i_iter] - objective_function[i_iter - 1]) \
                / objective_function[i_iter - 1]
            # todo: check that the absolute error stays below the threshold for
            # several consecutive iterations before stopping

        i_iter += 1

    lst_factors_ = op_factors.get_list_of_factors()
    op_centroids = SparseFactors([lst_factors_[1] * _lambda] + lst_factors_[2:])

    return objective_function[:i_iter], op_centroids, indicator_vector, \
        lst_all_objective_functions_palm
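# Hypothetical usage sketch for qmeans, kept as a comment because the constraint sets
# must be built with the projection operators defined elsewhere in this package;
# "lst_constraint_sets" below is a placeholder and the data is arbitrary random data,
# shown only to illustrate the call signature and the params_palm4msa keys read above.
#
#   n, d, K, nb_factors = 1000, 32, 16, 4
#   X = np.random.randn(n, d)
#   init_centroids = X[np.random.choice(n, K, replace=False)]
#   params_palm4msa = {
#       "init_lambda": 1.,
#       "nb_iter": 300,
#       "lst_constraint_sets": lst_constraint_sets,  # placeholder: sparsity constraints per factor
#       "residual_on_right": True,
#       "delta_objective_error_threshold": 1e-6,
#       "track_objective": False,
#   }
#   objectives, op_centroids, labels, palm_objectives = qmeans(
#       X, K, nb_iter=20, nb_factors=nb_factors,
#       params_palm4msa=params_palm4msa, initialization=init_centroids)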
def kmeans_minibatch(X_data, K_nb_cluster, nb_iter, initialization, batch_size):
    """
    :param X_data: The data matrix of n examples in dimension d, in shape (n, d).
    :param K_nb_cluster: The number of clusters to look for.
    :param nb_iter: The maximum number of iterations.
    :param initialization: The (K, d) matrix of centroids at initialization.
    :param batch_size: The size of each batch.
    :return:
    """
    X_data_norms = get_squared_froebenius_norm_line_wise(X_data)

    # Initialize our centroids by picking random data points
    U_centroids_hat = copy.deepcopy(initialization)
    U_centroids = U_centroids_hat
    full_indicator_vector = np.zeros(X_data.shape[0], dtype=int)
    full_count_vector = np.zeros(K_nb_cluster, dtype=int)
    objective_function = np.empty((nb_iter,))

    # Loop for the maximum number of iterations
    i_iter = 0
    delta_objective_error_threshold = 1e-6
    delta_objective_error = np.inf
    while True:
        for i_iter, example_batch_indexes in enumerate(
                DataGenerator(X_data, batch_size=batch_size,
                              return_indexes=True)):
            if not (delta_objective_error > delta_objective_error_threshold):
                logger.info(
                    "not (delta_objective_error {}-{}={} > "
                    "delta_objective_error_threshold {})".format(
                        objective_function[i_iter],
                        objective_function[i_iter - 1],
                        delta_objective_error,
                        delta_objective_error_threshold))
                break

            example_batch = X_data[example_batch_indexes]

            logger.info("Iteration Kmeans {}".format(i_iter))

            indicator_vector, distances = assign_points_to_clusters(
                example_batch, U_centroids,
                X_norms=X_data_norms[example_batch_indexes])
            full_indicator_vector[example_batch_indexes] = indicator_vector

            cluster_names, counts = np.unique(indicator_vector,
                                              return_counts=True)
            # cluster_names_sorted = np.argsort(cluster_names)

            count_vector = np.zeros(K_nb_cluster, dtype=int)
            count_vector[cluster_names] = counts

            full_count_vector += count_vector
            # previous_full_count_vector = full_count_vector - count_vector

            # Update the centroid locations using the newly assigned data point
            # classes. This way of updating the centroids (centroid-index wise) is
            # better than the one proposed in the paper "Web-Scale K-Means
            # Clustering" because the number of updates is always <= batch_size.
            for c in range(K_nb_cluster):
                if full_count_vector[c] != 0 and count_vector[c] != 0:
                    U_centroids_hat[c] += (1 / full_count_vector[c]) * np.sum(
                        example_batch[indicator_vector == c] - U_centroids_hat[c],
                        axis=0)
                    # this is exactly equivalent to an update of the mean:
                    # U_centroids_hat[c] = (previous_full_count_vector[c] / full_count_vector[c]) * U_centroids_hat[c] \
                    #     + (1 / full_count_vector[c]) * np.sum(example_batch[indicator_vector == c], axis=0)

            # for i_ex, ex in enumerate(example_batch):
            #     c = indicator_vector[i_ex]
            #     full_count_vector[c] += 1
            #     eta = 1. / full_count_vector[c]
            #     U_centroids_hat[c] = (1 - eta) * U_centroids_hat[c] + eta * ex

            # counts, cluster_names_sorted = assess_clusters_integrity(
            #     X_data, X_data_norms, U_centroids_hat, K_nb_cluster, counts,
            #     indicator_vector, distances, cluster_names, cluster_names_sorted)

            # check if all clusters still have points
            # for c in range(K_nb_cluster):
            #     biggest_cluster_index = np.argmax(counts)  # type: int
            #     biggest_cluster = cluster_names[biggest_cluster_index]
            #     biggest_cluster_data = X_data[indicator_vector == biggest_cluster]
            #
            #     cluster_data = X_data[indicator_vector == c]
            #     if len(cluster_data) == 0:
            #         logger.warning("cluster has lost data, add new cluster. cluster idx: {}".format(c))
            #         U_centroids_hat[c] = biggest_cluster_data[np.random.randint(len(biggest_cluster_data))].reshape(1, -1)
            #         counts = list(counts)
            #         counts[biggest_cluster_index] -= 1
            #         counts.append(1)
            #         counts = np.array(counts)
            #         cluster_names_sorted = list(cluster_names_sorted)
            #         cluster_names_sorted.append(c)
            #         cluster_names_sorted = np.array(cluster_names_sorted)
            #     else:
            #         U_centroids_hat[c] = np.mean(X_data[indicator_vector == c], 0)

            U_centroids = U_centroids_hat

            objective_function[i_iter] = compute_objective(
                X_data, U_centroids, full_indicator_vector)

            if i_iter >= 1:
                delta_objective_error = np.abs(
                    objective_function[i_iter]
                    - objective_function[i_iter - 1]) \
                    / objective_function[i_iter - 1]
                # todo: check that the absolute error stays below the threshold
                # for several consecutive iterations before stopping

            i_iter += 1
        else:
            continue
        break

    # return the assignments of the whole dataset, not just those of the last batch
    return objective_function[:i_iter], U_centroids, full_indicator_vector
def qmeans(X_data: np.ndarray, K_nb_cluster: int, nb_iter: int,
           nb_factors: int, params_palm4msa: dict,
           initialization: np.ndarray, hierarchical_inside=False,
           graphical_display=False):
    """
    :param X_data: The data matrix of n examples in dimension d, in shape (n, d).
    :param K_nb_cluster: The number of clusters to look for.
    :param nb_iter: The maximum number of iterations.
    :param nb_factors: The number of factors for the decomposition.
    :param initialization: The initial matrix of centroids, not yet factorized.
    :param params_palm4msa: The dictionary of parameters for the palm4msa algorithm.
    :param hierarchical_inside: Tells the algorithm whether the hierarchical version
        of palm4msa should be used at each iteration.
    :param graphical_display: Tells the algorithm to display the results.
    :return:
    """
    assert K_nb_cluster == initialization.shape[0]

    X_data_norms = get_squared_froebenius_norm_line_wise(X_data)

    init_lambda = params_palm4msa["init_lambda"]
    nb_iter_palm = params_palm4msa["nb_iter"]
    lst_proj_op_by_fac_step = params_palm4msa["lst_constraint_sets"]
    residual_on_right = params_palm4msa["residual_on_right"]

    X_centroids_hat = copy.deepcopy(initialization)
    min_K_d = min(X_centroids_hat.shape)

    lst_factors = [np.eye(min_K_d) for _ in range(nb_factors)]
    eye_norm = np.sqrt(K_nb_cluster)
    lst_factors[0] = np.eye(K_nb_cluster) / eye_norm
    lst_factors[1] = np.eye(K_nb_cluster, min_K_d)
    lst_factors[-1] = np.zeros((min_K_d, X_centroids_hat.shape[1]))

    if graphical_display:
        lst_factors_init = copy.deepcopy(lst_factors)

    _lambda_tmp, lst_factors, U_centroids, nb_iter_by_factor, objective_palm = \
        hierarchical_palm4msa(
            arr_X_target=np.eye(K_nb_cluster) @ X_centroids_hat,
            lst_S_init=lst_factors,
            lst_dct_projection_function=lst_proj_op_by_fac_step,
            f_lambda_init=init_lambda * eye_norm,
            nb_iter=nb_iter_palm,
            update_right_to_left=True,
            residual_on_right=residual_on_right,
            graphical_display=False)

    _lambda = _lambda_tmp / eye_norm

    if graphical_display:
        if hierarchical_inside:
            plt.figure()
            plt.yscale("log")
            plt.scatter(np.arange(len(objective_palm) * 3, step=3),
                        objective_palm[:, 0], marker="x", label="before split")
            plt.scatter(np.arange(len(objective_palm) * 3, step=3) + 1,
                        objective_palm[:, 1], marker="x", label="between")
            plt.scatter(np.arange(len(objective_palm) * 3, step=3) + 2,
                        objective_palm[:, 2], marker="x", label="after finetune")
            plt.plot(np.arange(len(objective_palm) * 3),
                     objective_palm.flatten(), color="k")
            plt.legend()
            plt.show()

        visual_evaluation_palm4msa(
            np.eye(K_nb_cluster) @ X_centroids_hat,
            lst_factors_init,
            lst_factors,
            _lambda * multi_dot(lst_factors))

    objective_function = np.empty((nb_iter, 2))

    # Loop for the maximum number of iterations
    i_iter = 0
    delta_objective_error_threshold = 1e-6
    delta_objective_error = np.inf
    while (i_iter <= 1) or (
            (i_iter < nb_iter)
            and (delta_objective_error > delta_objective_error_threshold)):

        logger.info("Iteration Qmeans {}".format(i_iter))

        U_centroids = _lambda * multi_dot(lst_factors[1:])

        if i_iter > 0:
            objective_function[i_iter, 0] = compute_objective(
                X_data, U_centroids, indicator_vector)

        # Assign all points to the nearest centroid:
        # first, get the distances from all points to all centroids
        distances = get_distances(X_data, U_centroids,
                                  precomputed_data_points_norm=X_data_norms)
        # then, determine the class membership of each point
        # by picking the closest centroid
        indicator_vector = np.argmin(distances, axis=1)

        objective_function[i_iter, 1] = compute_objective(
            X_data, U_centroids, indicator_vector)

        # Update the centroid locations using the newly
        # assigned data point classes
        for c in range(K_nb_cluster):
            X_centroids_hat[c] = np.mean(X_data[indicator_vector == c], 0)

        # get the number of observations in each cluster
        cluster_names, counts = np.unique(indicator_vector, return_counts=True)
        cluster_names_sorted = np.argsort(cluster_names)

        if len(counts) < K_nb_cluster:
            raise ValueError(
                "Some clusters have no point. Aborting iteration {}".format(
                    i_iter))

        diag_counts_sqrt = np.diag(np.sqrt(
            counts[cluster_names_sorted]))  # todo use a sparse matrix object
        diag_counts_sqrt_norm = np.linalg.norm(
            diag_counts_sqrt)  # todo use the analytic sqrt(n) instead of computing it with a norm
        diag_counts_sqrt_normalized = diag_counts_sqrt / diag_counts_sqrt_norm
        # set it as the first factor
        lst_factors[0] = diag_counts_sqrt_normalized

        if graphical_display:
            lst_factors_init = copy.deepcopy(lst_factors)

        if hierarchical_inside:
            _lambda_tmp, lst_factors, _, nb_iter_by_factor, objective_palm = \
                hierarchical_palm4msa(
                    arr_X_target=diag_counts_sqrt @ X_centroids_hat,
                    lst_S_init=lst_factors,
                    lst_dct_projection_function=lst_proj_op_by_fac_step,
                    # f_lambda_init=_lambda,
                    f_lambda_init=_lambda * diag_counts_sqrt_norm,
                    nb_iter=nb_iter_palm,
                    update_right_to_left=True,
                    residual_on_right=residual_on_right,
                    graphical_display=False)

            loss_palm_before = objective_palm[0, 0]
            loss_palm_after = objective_palm[-1, -1]
        else:
            _lambda_tmp, lst_factors, _, objective_palm, nb_iter_palm = \
                palm4msa(
                    arr_X_target=diag_counts_sqrt @ X_centroids_hat,
                    lst_S_init=lst_factors,
                    nb_factors=len(lst_factors),
                    lst_projection_functions=lst_proj_op_by_fac_step[-1]["finetune"],
                    f_lambda_init=_lambda * diag_counts_sqrt_norm,
                    nb_iter=nb_iter_palm,
                    update_right_to_left=True,
                    graphical_display=False)

            loss_palm_before = objective_palm[0, -1]
            loss_palm_after = objective_palm[-1, -1]

        logger.debug("Loss palm before: {}".format(loss_palm_before))
        logger.debug("Loss palm after: {}".format(loss_palm_after))

        if graphical_display:
            if hierarchical_inside:
                plt.figure()
                plt.yscale("log")
                plt.scatter(np.arange(len(objective_palm) * 3, step=3),
                            objective_palm[:, 0], marker="x",
                            label="before split")
                plt.scatter(np.arange(len(objective_palm) * 3, step=3) + 1,
                            objective_palm[:, 1], marker="x", label="between")
                plt.scatter(np.arange(len(objective_palm) * 3, step=3) + 2,
                            objective_palm[:, 2], marker="x",
                            label="after finetune")
                plt.plot(np.arange(len(objective_palm) * 3),
                         objective_palm.flatten(), color="k")
                plt.legend()
                plt.show()

            visual_evaluation_palm4msa(diag_counts_sqrt @ X_centroids_hat,
                                       lst_factors_init, lst_factors,
                                       _lambda_tmp * multi_dot(lst_factors))

        _lambda = _lambda_tmp / diag_counts_sqrt_norm

        logger.debug("Returned loss (with diag) palm: {}".format(
            objective_palm[-1, 0]))

        if i_iter >= 2:
            delta_objective_error = np.abs(
                objective_function[i_iter, 0]
                - objective_function[i_iter - 1, 0]) \
                / objective_function[i_iter - 1, 0]
            # todo: check that the absolute error stays below the threshold
            # for several consecutive iterations before stopping

        i_iter += 1

    U_centroids = _lambda * multi_dot(lst_factors[1:])
    distances = get_distances(X_data, U_centroids,
                              precomputed_data_points_norm=X_data_norms)
    indicator_vector = np.argmin(distances, axis=1)

    return objective_function[:i_iter], U_centroids, indicator_vector
def kmeans(X_data, K_nb_cluster, nb_iter, initialization,
           delta_objective_error_threshold=1e-6, proj_l1=False, _lambda=None,
           epsilon=None):
    """
    :param X_data: The data matrix of n examples in dimension d, in shape (n, d).
    :param K_nb_cluster: The number of clusters to look for.
    :param nb_iter: The maximum number of iterations.
    :param initialization: The (K, d) matrix of centroids at initialization.
    :param delta_objective_error_threshold: The algorithm stops once the normalized
        difference between the error criterion of two successive iterations is no
        longer greater than this value.
    :param proj_l1: If True, project each centroid onto an l1 ball after the update
        (requires _lambda and epsilon).
    :param _lambda: Parameter of the l1-ball projection (required when proj_l1 is True).
    :param epsilon: Tolerance of the l1-ball projection (required when proj_l1 is True).
    :return:
    """
    X_data_norms = get_squared_froebenius_norm_line_wise(X_data)

    # Initialize our centroids by picking random data points
    U_centroids_hat = copy.deepcopy(initialization)
    U_centroids = U_centroids_hat

    objective_function = np.empty((nb_iter,))

    # Loop for the maximum number of iterations
    i_iter = 0
    delta_objective_error = np.inf
    while (i_iter == 0) or (
            (i_iter < nb_iter)
            and (delta_objective_error > delta_objective_error_threshold)):

        logger.info("Iteration Kmeans {}".format(i_iter))

        indicator_vector, distances = assign_points_to_clusters(
            X_data, U_centroids, X_norms=X_data_norms)

        cluster_names, counts = np.unique(indicator_vector, return_counts=True)
        cluster_names_sorted = np.argsort(cluster_names)

        # Update the centroid locations using the new indicator vector
        counts, cluster_names_sorted = update_clusters_with_integrity_check(
            X_data,
            X_data_norms,
            U_centroids_hat,
            K_nb_cluster,
            counts,
            indicator_vector,
            distances,
            cluster_names,
            cluster_names_sorted)

        U_centroids = U_centroids_hat

        if proj_l1:
            if _lambda is None or epsilon is None:
                raise ValueError(
                    "epsilon and lambda must be set if proj_l1 is True")
            for i_centroid, centroid in enumerate(U_centroids):
                U_centroids[i_centroid, :] = proj_onto_l1_ball(
                    _lambda=_lambda, epsilon_tol=epsilon, vec=centroid)

        objective_function[i_iter] = compute_objective(X_data, U_centroids,
                                                       indicator_vector)

        if i_iter >= 1:
            delta_objective_error = np.abs(
                objective_function[i_iter] - objective_function[i_iter - 1]) \
                / objective_function[i_iter - 1]

        i_iter += 1

    return objective_function[:i_iter], U_centroids, indicator_vector
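# Hypothetical usage sketch for kmeans, kept as a comment; random data and an
# arbitrary K, shown only to illustrate the expected input and output shapes.
#
#   n, d, K = 1000, 32, 16
#   X = np.random.randn(n, d)
#   init_centroids = X[np.random.choice(n, K, replace=False)]
#   objectives, centroids, labels = kmeans(
#       X, K, nb_iter=50, initialization=init_centroids)
#   # objectives holds the criterion value per iteration, labels has shape (n,)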