def update_clusters(X_data, X_centroids_hat, K_nb_cluster, counts_before,
                    new_counts, indicator_vector):
    """
    Update centroids and return the new counts of each centroid.

    All changes are made in place.

    :param X_data: The batch of data points, in shape (n, d).
    :param X_centroids_hat: The (K, d) matrix of centroids, updated in place.
    :param K_nb_cluster: The number of clusters.
    :param counts_before: The number of points assigned to each centroid before this batch.
    :param new_counts: The number of points assigned to each centroid in this batch.
    :param indicator_vector: The cluster index of each point in the batch.
    :return: The updated total count of points for each centroid.
    """
    total_count_vector = counts_before + new_counts
    for c in range(K_nb_cluster):
        if total_count_vector[c] != 0:
            # running mean: reweight the old centroid, then add this batch's points
            X_centroids_hat[c] = (
                (counts_before[c] / total_count_vector[c]) * X_centroids_hat[c]
            ) + (
                (1. / total_count_vector[c])
                * np.sum(X_data[indicator_vector == c, :], 0)
            )
        else:
            logger.debug("Cluster {} has zero point, continue".format(c))

    return total_count_vector
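# Minimal usage sketch (illustration only, not part of the original module):
# made-up toy data showing how the streaming running-mean update is meant to be
# called; relies on the module-level numpy import.
def _example_update_clusters():
    X_batch = np.array([[0., 0.], [1., 0.], [0., 1.], [5., 5.], [6., 5.]])
    centroids = np.array([[0.2, 0.2], [5.0, 5.0]])   # updated in place
    counts_before = np.array([10, 4])                # points seen in earlier batches
    indicator = np.array([0, 0, 0, 1, 1])            # assignments for this batch
    new_counts = np.bincount(indicator, minlength=2)

    total_counts = update_clusters(X_batch, centroids, 2, counts_before,
                                   new_counts, indicator)
    # centroids now hold the running mean over all points seen so far
    return centroids, total_counts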
def load_kddcup04bio_no_classif():
    data_url = "http://cs.joensuu.fi/sipu/datasets/KDDCUP04Bio.txt"
    with tempfile.TemporaryDirectory() as d_tmp:
        logger.debug(
            f"Downloading file from url {data_url} to temporary directory {d_tmp}"
        )
        matfile_path = download_data(data_url, d_tmp)
        data = pandas.read_csv(matfile_path, delim_whitespace=True)

    return data.values
def get_squared_froebenius_norm_line_wise_batch_by_batch(data_arr_memmap,
                                                         batch_size):
    data_norms = np.zeros(data_arr_memmap.shape[0])
    logger.debug(
        "Start computing norm of data array of shape {}, batch by batch".format(
            data_arr_memmap.shape))
    for i_batch, batch in enumerate(
            DataGenerator(data_arr_memmap,
                          batch_size=batch_size,
                          return_indexes=False)):
        logger.debug("Compute norm of batch {}/{}".format(
            i_batch, data_arr_memmap.shape[0] // batch_size))
        data_norms[i_batch * batch_size:(i_batch + 1) * batch_size] = \
            np.linalg.norm(batch, axis=1) ** 2
    return data_norms
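# Usage sketch (illustration only): the on-disk array below is hypothetical and
# is created here (mode="w+") just so the sketch is self-contained; any 2-D
# array accepted by DataGenerator works, a read-only np.memmap being the
# intended use case.
def _example_batch_norms():
    X = np.memmap("/tmp/points.dat", mode="w+", dtype="float32",
                  shape=(100_000, 256))
    X[:] = np.random.randn(100_000, 256)

    # squared row norms computed without loading the whole array into memory
    norms = get_squared_froebenius_norm_line_wise_batch_by_batch(
        X, batch_size=10_000)
    assert norms.shape == (100_000,)
    return norms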
def load_census1990():
    """
    Meek, Thiesson, and Heckerman (2001), "The Learning Curve Method Applied
    to Clustering", The Journal of Machine Learning Research.

    Number of clusters: 25, 50, 100

    :return:
    """
    data_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/census1990-mld/USCensus1990.data.txt"
    with tempfile.TemporaryDirectory() as d_tmp:
        logger.debug(
            f"Downloading file from url {data_url} to temporary directory {d_tmp}"
        )
        matfile_path = download_data(data_url, d_tmp)
        data = pandas.read_csv(matfile_path)

    return data.values[:, 1:], None  # remove the `caseId` attribute
def load_plants():
    """
    USDA, NRCS. 2008. The PLANTS Database ([Web Link], 31 December 2008).
    National Plant Data Center, Baton Rouge, LA 70874-4490 USA.

    :return:
    """
    data_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/plants/plants.data"

    with tempfile.TemporaryDirectory() as d_tmp:
        logger.debug(
            f"Downloading file from url {data_url} to temporary directory {d_tmp}"
        )
        file_path = download_data(data_url, d_tmp)

        with open(file_path, 'r', encoding="ISO-8859-15") as f:
            plants = f.readlines()

    # get all the features in a set
    set_plants_attributes = set()
    lst_plants = []
    for plant_line in plants:
        plant_line_no_name = [v.strip() for v in plant_line.split(',')[1:]]
        lst_plants.append(plant_line_no_name)
        set_plants_attributes.update(plant_line_no_name)

    # give a code to each feature in a 1-hot fashion
    arr_plants_attributes = np.array([v for v in set_plants_attributes])
    onehot_encoder = preprocessing.OneHotEncoder(sparse=False)
    onehot_encoder.fit(arr_plants_attributes.reshape(-1, 1))

    # transform each plant with their code
    for i, plant_line_no_name in enumerate(lst_plants):
        plant_line_oh = np.sum(onehot_encoder.transform(
            np.array(plant_line_no_name).reshape(-1, 1)), axis=0)
        lst_plants[i] = plant_line_oh

    arr_lst_plants = np.array(lst_plants)

    return arr_lst_plants
def palm4msa(arr_X_target: np.array,
             lst_S_init: list,
             nb_factors: int,
             lst_projection_functions: list,
             f_lambda_init: float,
             nb_iter: int,
             update_right_to_left=True,
             graphical_display=False):
    """
    lst_S_init contains the factors in decreasing indexes (i.e. the order along
    which they are multiplied in the product), e.g. S5 S4 S3 S2 S1, so that
    lst_S_init[-j] = Sj.

    :param arr_X_target: The target to approximate.
    :param lst_S_init: The initial list of sparse factors.
    :param nb_factors: The number of factors.
    :param lst_projection_functions: The projection function for each of the sparse factors.
    :param f_lambda_init: The initial scaling factor.
    :param nb_iter: The maximum number of iterations before stopping.
    :param update_right_to_left: Tells the algorithm to update factors from right to left (S1 first).
    :param graphical_display: Make a graphical representation of the results.

    :return:
    """

    def update_S(S_old, _left_side, _right_side, _c, _lambda,
                 projection_function):
        """
        Return the new factor value.

        - Compute the gradient
        - Do a gradient step
        - Project the result with the projection function
        - Normalize the result
        """
        # compute gradient of the distance metric (with 1/_c gradient step size)
        grad_step = 1. / _c * _lambda \
            * _left_side.T \
            @ ((_lambda * _left_side @ S_old @ _right_side) - arr_X_target) \
            @ _right_side.T

        # one gradient step
        S_tmp = S_old - grad_step

        # project, then normalize because all factors must have norm 1
        S_proj = projection_function(S_tmp)
        S_proj = S_proj / norm(S_proj, ord="fro")
        return S_proj

    def update_scaling_factor(X, X_est):
        return np.sum(X * X_est) / np.sum(X_est ** 2)

    logger.debug('Norm of arr_X_target: {}'.format(
        np.linalg.norm(arr_X_target, ord='fro')))

    assert len(lst_S_init) > 0
    assert get_side_prod(lst_S_init).shape == arr_X_target.shape
    assert len(lst_S_init) == nb_factors

    # initialization
    f_lambda = f_lambda_init
    lst_S = deepcopy(lst_S_init)  # todo may not be necessary; check this
    objective_function = np.empty((nb_iter, nb_factors + 1))

    if update_right_to_left:
        # range arguments: start, stop, step
        factor_number_generator = range(-1, -(nb_factors + 1), -1)
    else:
        factor_number_generator = range(0, nb_factors, 1)

    # main loop
    i_iter = 0
    delta_objective_error_threshold = 1e-6
    delta_objective_error = np.inf
    while i_iter == 0 or ((i_iter < nb_iter) and (
            delta_objective_error > delta_objective_error_threshold)):

        for j in factor_number_generator:
            if lst_projection_functions[j].__name__ == "constant_proj":
                continue

            left_side = get_side_prod(
                lst_S[:j],
                (arr_X_target.shape[0], arr_X_target.shape[0]))  # L

            index_value_for_right_factors_selection = \
                (nb_factors + j + 1) % (nb_factors + 1)
            right_side = get_side_prod(
                lst_S[index_value_for_right_factors_selection:],
                (arr_X_target.shape[1], arr_X_target.shape[1]))  # R

            # compute minimum c value (according to the paper)
            min_c_value = (f_lambda * norm(right_side, ord=2)
                           * norm(left_side, ord=2)) ** 2
            # add epsilon because the minimum is exclusive
            c = min_c_value * 1.001
            logger.debug("Lipschitz constant value: {}; c value: {}".format(
                min_c_value, c))

            # compute new factor value
            lst_S[j] = update_S(lst_S[j], left_side, right_side, c, f_lambda,
                                lst_projection_functions[j])

            objective_function[i_iter, j - 1] = compute_objective_function(
                arr_X_target, _f_lambda=f_lambda, _lst_S=lst_S)

        # re-compute the full factorisation
        if len(lst_S) == 1:
            arr_X_curr = lst_S[0]
        else:
            arr_X_curr = multi_dot(lst_S)

        # update lambda
        f_lambda = update_scaling_factor(arr_X_target, arr_X_curr)
        logger.debug("Lambda value: {}".format(f_lambda))

        objective_function[i_iter, -1] = compute_objective_function(
            arr_X_target, _f_lambda=f_lambda, _lst_S=lst_S)

        logger.debug("Iteration {}; Objective value: {}".format(
            i_iter, objective_function[i_iter, -1]))

        if i_iter >= 1:
            delta_objective_error = np.abs(
                objective_function[i_iter, -1]
                - objective_function[i_iter - 1, -1]) \
                / objective_function[i_iter - 1, -1]

        # TODO check that the relative error stays below the threshold for
        #  several successive iterations before stopping

        i_iter += 1

    objective_function = objective_function[:i_iter, :]

    if graphical_display:
        plt.figure()
        plt.title("n factors {}".format(nb_factors))
        for j in range(nb_factors + 1):
            plt.semilogy(objective_function[:, j], label=str(j))
        plt.legend()
        plt.show()

        plt.figure()
        plt.semilogy(objective_function.flat)
        plt.legend()
        plt.show()

    # todo maybe change arr_X_curr by lambda * arr_X_curr
    return f_lambda, lst_S, arr_X_curr, objective_function, i_iter
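# Minimal usage sketch (illustration only): approximate a random square matrix
# with a product of a few sparse factors. It assumes the projection operators
# exposed elsewhere in this repository (get_lambda_proxsplincol); the matrix
# size and sparsity levels below are made-up.
def _example_palm4msa():
    d = 32
    nb_factors = 3
    H = np.random.randn(d, d)

    # identity initialization of the factors, one projection operator per factor
    lst_factors = [np.eye(d) for _ in range(nb_factors)]
    lst_proj = [get_lambda_proxsplincol(2 * d) for _ in range(nb_factors)]

    f_lambda, lst_S, X_hat, objective, n_iter = palm4msa(
        arr_X_target=H,
        lst_S_init=lst_factors,
        nb_factors=nb_factors,
        lst_projection_functions=lst_proj,
        f_lambda_init=1.,
        nb_iter=300,
        update_right_to_left=True,
        graphical_display=False)

    # the approximation of H is f_lambda * (S_3 @ S_2 @ S_1)
    return np.linalg.norm(H - f_lambda * X_hat, ord="fro")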
def palm4msa_fast4(arr_X_target: np.array,
                   lst_S_init: list,
                   nb_factors: int,
                   lst_projection_functions: list,
                   f_lambda_init: float,
                   nb_iter: int,
                   update_right_to_left=True,
                   track_objective=False,
                   delta_objective_error_threshold=1e-6):
    """
    lst_S_init contains the factors in decreasing indexes (i.e. the order along
    which they are multiplied in the product), e.g. S5 S4 S3 S2 S1, so that
    lst_S_init[-j] = Sj.

    :param arr_X_target: The target to approximate.
    :param lst_S_init: The initial list of sparse factors.
    :param nb_factors: The number of factors.
    :param lst_projection_functions: The projection function for each of the sparse factors.
    :param f_lambda_init: The initial scaling factor.
    :param nb_iter: The maximum number of iterations before stopping.
    :param update_right_to_left: Tells the algorithm to update factors from right to left (S1 first).
    :param track_objective: If True, the objective function is computed for each factor and not only at the end of each iteration.
    :param delta_objective_error_threshold: The threshold on the normalized difference between the errors at two successive iterations below which the computation is stopped.

    :return: the sparse factorization; careful: the final X isn't multiplied by lambda
    """
    logger.debug('Norm of arr_X_target: {}'.format(
        np.linalg.norm(arr_X_target, ord='fro')))

    # initialization
    f_lambda = f_lambda_init
    S_factors_op = SparseFactors(lst_S_init)

    assert np.all(S_factors_op.shape == arr_X_target.shape)
    assert S_factors_op.n_factors > 0
    assert S_factors_op.n_factors == nb_factors

    if track_objective:
        # (nb_factors + 1) columns because of the lambda update
        objective_function = np.ones((nb_iter, nb_factors + 1)) * -1
    else:
        objective_function = np.ones((nb_iter, 1)) * -1

    if update_right_to_left:
        # range arguments: start, stop, step
        factor_number_generator = range(-1, -(nb_factors + 1), -1)
    else:
        factor_number_generator = range(0, nb_factors, 1)

    # main loop
    i_iter = 0
    delta_objective_error = np.inf
    init_vectors_norm_comp_L = [None] * nb_factors
    init_vectors_norm_comp_R = [None] * nb_factors
    while ((i_iter < nb_iter)
           and (delta_objective_error > delta_objective_error_threshold)):

        for machine_idx_fac, j in enumerate(factor_number_generator):
            if lst_projection_functions[j].__name__ == "constant_proj":
                if track_objective:
                    objective_function[i_iter, machine_idx_fac] = \
                        compute_objective_function(arr_X_target,
                                                   _f_lambda=f_lambda,
                                                   _lst_S=S_factors_op)
                    logger.debug(
                        "Iteration {}; Factor idx {}; Objective value {}".format(
                            i_iter, j,
                            objective_function[i_iter, machine_idx_fac]))
                continue

            L = S_factors_op.get_L(j)
            R = S_factors_op.get_R(-j - 1)

            # compute minimum c value (according to the paper)
            L_norm, init_vectors_norm_comp_L[j] = \
                L.compute_spectral_norm(
                    init_vector_eigs_v0=init_vectors_norm_comp_L[j]) \
                if L.n_factors > 0 else (1, init_vectors_norm_comp_L[j])
            R_norm, init_vectors_norm_comp_R[j] = \
                R.compute_spectral_norm(
                    init_vector_eigs_v0=init_vectors_norm_comp_R[j]) \
                if R.n_factors > 0 else (1, init_vectors_norm_comp_R[j])
            min_c_value = (f_lambda * L_norm * R_norm) ** 2  # Lipschitz constant
            # add epsilon because the minimum is exclusive
            c = min_c_value * 1.001
            logger.debug("Lipschitz constant value: {}; c value: {}".format(
                min_c_value, c))

            # compute new factor value
            # todo check if it is not redundant to recompute the S_factors_op
            res = f_lambda * S_factors_op.compute_product() - arr_X_target
            res_RH = S_factors_op.apply_RH(n_factors=-j - 1, X=res)
            LH_res_RH = S_factors_op.apply_LH(n_factors=j, X=res_RH)
            grad_step = 1. / c * f_lambda * LH_res_RH

            Sj = S_factors_op.get_factor(j)

            # project, then normalize because all factors must have norm 1
            S_proj = lst_projection_functions[j](Sj - grad_step)
            S_proj = csr_matrix(S_proj)
            S_proj /= np.sqrt(S_proj.power(2).sum())

            S_factors_op.set_factor(j, S_proj)

            if track_objective:
                objective_function[i_iter, machine_idx_fac] = \
                    compute_objective_function(arr_X_target,
                                               _f_lambda=f_lambda,
                                               _lst_S=S_factors_op)
                logger.debug(
                    "Iteration {}; Factor idx {}; Objective value {}".format(
                        i_iter, j,
                        objective_function[i_iter, machine_idx_fac]))

        # re-compute the full factorisation
        # todo check if it is not redundant to recompute the S_factors_op
        arr_X_curr = S_factors_op.compute_product()
        # update lambda
        f_lambda = update_scaling_factor(X=arr_X_target, X_est=arr_X_curr)
        logger.debug("Lambda value: {}".format(f_lambda))

        objective_function[i_iter, -1] = compute_objective_function(
            arr_X_target, _f_lambda=f_lambda, _lst_S=S_factors_op)

        logger.debug("Iteration {}; Objective value: {}".format(
            i_iter, objective_function[i_iter, -1]))

        if i_iter >= 1:
            delta_objective_error = np.abs(
                objective_function[i_iter, -1]
                - objective_function[i_iter - 1, -1]) \
                / objective_function[i_iter - 1, -1]
            logger.debug("Delta objective: {}".format(delta_objective_error))

        # TODO check that the relative error stays below the threshold for
        #  several successive iterations before stopping

        i_iter += 1

    return f_lambda, S_factors_op, arr_X_curr, objective_function, i_iter
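# Usage sketch for the operator-based variant (illustration only): same made-up
# setup as the palm4msa sketch above; the returned object is a SparseFactors
# operator and, as the docstring notes, the returned product is not scaled by
# lambda.
def _example_palm4msa_fast4():
    d = 32
    nb_factors = 3
    H = np.random.randn(d, d)

    lst_factors = [np.eye(d) for _ in range(nb_factors)]
    lst_proj = [get_lambda_proxsplincol(2 * d) for _ in range(nb_factors)]

    f_lambda, op_factors, X_hat, objective, n_iter = palm4msa_fast4(
        arr_X_target=H,
        lst_S_init=lst_factors,
        nb_factors=nb_factors,
        lst_projection_functions=lst_proj,
        f_lambda_init=1.,
        nb_iter=300,
        track_objective=False)

    return np.linalg.norm(H - f_lambda * op_factors.compute_product(),
                          ord="fro")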
def load_caltech(final_size):
    data_url = "http://www.vision.caltech.edu/Image_Datasets/Caltech256/256_ObjectCategories.tar"
    lst_images = []
    lst_classes_idx = []
    with tempfile.TemporaryDirectory() as d_tmp:
        logger.debug(
            f"Downloading file from url {data_url} to temporary directory {d_tmp}"
        )
        tarfile_path = Path(download_data(data_url, d_tmp))
        dir_path = Path(d_tmp)
        tf = tarfile.open(tarfile_path)
        tf.extractall(dir_path / "caltech256")
        tf.close()
        for root, dirs, files in os.walk(dir_path / "caltech256"):
            logger.debug("Processing directory: {}".format(root))
            label_class = root.split("/")[-1]
            splitted_label_class = label_class.split(".")
            if splitted_label_class[-1] == "clutter":
                continue
            if len(splitted_label_class) > 1:
                label_idx = int(splitted_label_class[0])
            else:
                continue

            for file in files:
                path_img_file = Path(root) / file
                try:
                    img = plt.imread(path_img_file)
                except Exception:
                    continue
                # rescale so that the smaller dimension matches final_size,
                # then crop the center
                aspect_ratio = max(final_size / img.shape[0],
                                   final_size / img.shape[1])
                new_img = cv2.resize(img,
                                     dsize=(0, 0),
                                     fx=aspect_ratio,
                                     fy=aspect_ratio)
                new_img = crop_center(new_img, (final_size, final_size, 3))
                if new_img.shape == (final_size, final_size):
                    # grayscale image: replicate the single channel to RGB
                    new_img = cv2.cvtColor(new_img, cv2.COLOR_GRAY2RGB)
                lst_images.append(new_img.flatten())
                lst_classes_idx.append(label_idx)

    X = np.vstack(lst_images)
    y = np.array(lst_classes_idx)
    logger.debug("Caltech256 data shape: {}; labels shape: {}".format(
        X.shape, y.shape))

    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=0.33,
                                                        random_state=42,
                                                        stratify=y)
    return (X_train, y_train), (X_test, y_test)
def qmeans(X_data: np.ndarray,
           K_nb_cluster: int,
           nb_iter: int,
           nb_factors: int,
           params_palm4msa: dict,
           initialization: np.ndarray,
           hierarchical_inside=False,
           graphical_display=False):
    """
    :param X_data: The data matrix of n examples in dimension d, in shape (n, d).
    :param K_nb_cluster: The number of clusters to look for.
    :param nb_iter: The maximum number of iterations.
    :param nb_factors: The number of factors for the decomposition.
    :param initialization: The initial matrix of centroids, not yet factorized.
    :param params_palm4msa: The dictionary of parameters for the palm4msa algorithm.
    :param hierarchical_inside: Tells the algorithm if the hierarchical version of palm4msa should be used.
    :param graphical_display: Tells the algorithm to display the results.

    :return:
    """
    assert K_nb_cluster == initialization.shape[0]

    X_data_norms = get_squared_froebenius_norm_line_wise(X_data)

    init_lambda = params_palm4msa["init_lambda"]
    nb_iter_palm = params_palm4msa["nb_iter"]
    lst_proj_op_by_fac_step = params_palm4msa["lst_constraint_sets"]
    residual_on_right = params_palm4msa["residual_on_right"]

    X_centroids_hat = copy.deepcopy(initialization)
    min_K_d = min(X_centroids_hat.shape)

    lst_factors = [np.eye(min_K_d) for _ in range(nb_factors)]
    eye_norm = np.sqrt(K_nb_cluster)
    lst_factors[0] = np.eye(K_nb_cluster) / eye_norm
    lst_factors[1] = np.eye(K_nb_cluster, min_K_d)
    lst_factors[-1] = np.zeros((min_K_d, X_centroids_hat.shape[1]))

    if graphical_display:
        lst_factors_init = copy.deepcopy(lst_factors)

    _lambda_tmp, lst_factors, U_centroids, nb_iter_by_factor, objective_palm = \
        hierarchical_palm4msa(
            arr_X_target=np.eye(K_nb_cluster) @ X_centroids_hat,
            lst_S_init=lst_factors,
            lst_dct_projection_function=lst_proj_op_by_fac_step,
            f_lambda_init=init_lambda * eye_norm,
            nb_iter=nb_iter_palm,
            update_right_to_left=True,
            residual_on_right=residual_on_right,
            graphical_display=False)

    _lambda = _lambda_tmp / eye_norm

    if graphical_display:
        if hierarchical_inside:
            plt.figure()
            plt.yscale("log")
            plt.scatter(np.arange(len(objective_palm) * 3, step=3),
                        objective_palm[:, 0], marker="x", label="before split")
            plt.scatter(np.arange(len(objective_palm) * 3, step=3) + 1,
                        objective_palm[:, 1], marker="x", label="between")
            plt.scatter(np.arange(len(objective_palm) * 3, step=3) + 2,
                        objective_palm[:, 2], marker="x",
                        label="after finetune")
            plt.plot(np.arange(len(objective_palm) * 3),
                     objective_palm.flatten(), color="k")
            plt.legend()
            plt.show()

        visual_evaluation_palm4msa(np.eye(K_nb_cluster) @ X_centroids_hat,
                                   lst_factors_init, lst_factors,
                                   _lambda * multi_dot(lst_factors))

    objective_function = np.empty((nb_iter, 2))

    # Loop for the maximum number of iterations
    i_iter = 0
    delta_objective_error_threshold = 1e-6
    delta_objective_error = np.inf
    while (i_iter <= 1) or ((i_iter < nb_iter) and (
            delta_objective_error > delta_objective_error_threshold)):

        logger.info("Iteration Qmeans {}".format(i_iter))

        U_centroids = _lambda * multi_dot(lst_factors[1:])

        if i_iter > 0:
            objective_function[i_iter, 0] = compute_objective(
                X_data, U_centroids, indicator_vector)

        # Assign all points to the nearest centroid:
        # first get the distances from all points to all centroids
        distances = get_distances(X_data, U_centroids,
                                  precomputed_data_points_norm=X_data_norms)
        # then determine the class membership of each point
        # by picking the closest centroid
        indicator_vector = np.argmin(distances, axis=1)

        objective_function[i_iter, 1] = compute_objective(
            X_data, U_centroids, indicator_vector)

        # Update centroid locations using the newly assigned data point classes
        for c in range(K_nb_cluster):
            X_centroids_hat[c] = np.mean(X_data[indicator_vector == c], 0)

        # get the number of observations in each cluster
        cluster_names, counts = np.unique(indicator_vector, return_counts=True)
        cluster_names_sorted = np.argsort(cluster_names)

        if len(counts) < K_nb_cluster:
            raise ValueError(
                "Some clusters have no point. Aborting iteration {}".format(
                    i_iter))

        diag_counts_sqrt = np.diag(np.sqrt(
            counts[cluster_names_sorted]))  # todo use sparse matrix object
        diag_counts_sqrt_norm = np.linalg.norm(
            diag_counts_sqrt)  # todo analytic sqrt(n) instead of computing it with norm
        diag_counts_sqrt_normalized = diag_counts_sqrt / diag_counts_sqrt_norm
        # set it as the first factor
        lst_factors[0] = diag_counts_sqrt_normalized

        if graphical_display:
            lst_factors_init = copy.deepcopy(lst_factors)

        if hierarchical_inside:
            _lambda_tmp, lst_factors, _, nb_iter_by_factor, objective_palm = \
                hierarchical_palm4msa(
                    arr_X_target=diag_counts_sqrt @ X_centroids_hat,
                    lst_S_init=lst_factors,
                    lst_dct_projection_function=lst_proj_op_by_fac_step,
                    f_lambda_init=_lambda * diag_counts_sqrt_norm,
                    nb_iter=nb_iter_palm,
                    update_right_to_left=True,
                    residual_on_right=residual_on_right,
                    graphical_display=False)

            loss_palm_before = objective_palm[0, 0]
            loss_palm_after = objective_palm[-1, -1]
        else:
            _lambda_tmp, lst_factors, _, objective_palm, nb_iter_palm = \
                palm4msa(
                    arr_X_target=diag_counts_sqrt @ X_centroids_hat,
                    lst_S_init=lst_factors,
                    nb_factors=len(lst_factors),
                    lst_projection_functions=lst_proj_op_by_fac_step[-1]["finetune"],
                    f_lambda_init=_lambda * diag_counts_sqrt_norm,
                    nb_iter=nb_iter_palm,
                    update_right_to_left=True,
                    graphical_display=False)

            loss_palm_before = objective_palm[0, -1]
            loss_palm_after = objective_palm[-1, -1]

        logger.debug("Loss palm before: {}".format(loss_palm_before))
        logger.debug("Loss palm after: {}".format(loss_palm_after))

        if graphical_display:
            if hierarchical_inside:
                plt.figure()
                plt.yscale("log")
                plt.scatter(np.arange(len(objective_palm) * 3, step=3),
                            objective_palm[:, 0], marker="x",
                            label="before split")
                plt.scatter(np.arange(len(objective_palm) * 3, step=3) + 1,
                            objective_palm[:, 1], marker="x", label="between")
                plt.scatter(np.arange(len(objective_palm) * 3, step=3) + 2,
                            objective_palm[:, 2], marker="x",
                            label="after finetune")
                plt.plot(np.arange(len(objective_palm) * 3),
                         objective_palm.flatten(), color="k")
                plt.legend()
                plt.show()

            visual_evaluation_palm4msa(diag_counts_sqrt @ X_centroids_hat,
                                       lst_factors_init, lst_factors,
                                       _lambda_tmp * multi_dot(lst_factors))

        _lambda = _lambda_tmp / diag_counts_sqrt_norm

        logger.debug("Returned loss (with diag) palm: {}".format(
            objective_palm[-1, 0]))

        if i_iter >= 2:
            delta_objective_error = np.abs(
                objective_function[i_iter, 0]
                - objective_function[i_iter - 1, 0]) \
                / objective_function[i_iter - 1, 0]
            # todo check that the relative error stays below the threshold for
            #  several successive iterations before stopping

        i_iter += 1

    U_centroids = _lambda * multi_dot(lst_factors[1:])
    distances = get_distances(X_data, U_centroids,
                              precomputed_data_points_norm=X_data_norms)
    indicator_vector = np.argmin(distances, axis=1)

    return objective_function[:i_iter], U_centroids, indicator_vector
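# Usage sketch (illustration only): the data, sizes and number of factors are
# made-up, and build_constraint_sets is a hypothetical placeholder for whatever
# helper the repository's scripts use to build the "split"/"finetune"
# constraint sets expected by hierarchical_palm4msa.
def _example_qmeans():
    n, d, K, nb_factors = 1000, 64, 16, 4
    X = np.random.randn(n, d)
    centroids_init = X[np.random.permutation(n)[:K]]

    params_palm4msa = {
        "init_lambda": 1.,
        "nb_iter": 300,
        "lst_constraint_sets": build_constraint_sets(K, d, nb_factors),  # hypothetical helper
        "residual_on_right": True,
    }

    objective, U_centroids, assignments = qmeans(
        X, K, nb_iter=20, nb_factors=nb_factors,
        params_palm4msa=params_palm4msa,
        initialization=centroids_init,
        hierarchical_inside=False)
    return objective, U_centroids, assignments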
"split": [ get_lambda_proxsplincol(nb_keep_values), get_lambda_proxsplincol(nb_values_residual) ], "finetune": [constant_proj] + [get_lambda_proxsplincol(nb_keep_values)] * (k) + [get_lambda_proxsplincol(nb_values_residual)] } lst_proj_op_by_fac_step.append(dct_step_lst_nb_keep_values) #final_lambda, final_factors, final_X = PALM4LED(H, lst_factors, [nb_keep_values for _ in range(nb_factors)], _lambda, nb_iter) final_lambda, final_factors, final_X, nb_iter_by_factor, _ = hierarchical_palm4msa( arr_X_target=H, lst_S_init=lst_factors, lst_dct_projection_function=lst_proj_op_by_fac_step, f_lambda_init=_lambda, nb_iter=nb_iter, update_right_to_left=True, residual_on_right=True, graphical_display=True) logger.debug("Number of iteration for each factor: {}; Total: {}".format( nb_iter_by_factor, sum(nb_iter_by_factor))) visual_evaluation_palm4msa(H, lst_factors, final_factors, final_X) vec = np.random.rand(d) h_vec = H @ vec r_vec = final_X @ vec logger.debug("Distance matrice to random vector (true vs fake):{}".format( norm(h_vec - r_vec)))
def kmeans_minibatch(X_data,
                     K_nb_cluster,
                     nb_iter,
                     initialization,
                     batch_size,
                     delta_objective_error_threshold=1e-6,
                     proj_l1=False,
                     _lambda=None,
                     epsilon=None):
    """
    :param X_data: The data matrix of n examples in dimension d, in shape (n, d).
    :param K_nb_cluster: The number of clusters to look for.
    :param nb_iter: The maximum number of iterations.
    :param initialization: The (K, d) matrix of centroids at initialization.
    :param batch_size: The size of each batch.
    :param delta_objective_error_threshold: The normalized difference between the error criterion at two successive iterations must be greater than or equal to this value for the iterations to continue.

    :return:
    """
    logger.debug("Compute squared Frobenius norm of data")
    X_data_norms = get_squared_froebenius_norm_line_wise_batch_by_batch(
        X_data, batch_size)

    # Initialize our centroids by picking random data points
    U_centroids = copy.deepcopy(initialization)

    objective_function = np.empty((nb_iter,))

    total_nb_of_minibatch = X_data.shape[0] // batch_size

    # Loop for the maximum number of iterations
    i_iter = 0
    delta_objective_error = np.inf
    while i_iter < nb_iter and (delta_objective_error >
                                delta_objective_error_threshold):
        logger.info("Iteration number {}/{}".format(i_iter, nb_iter))

        # Prepare next epoch
        full_count_vector = np.zeros(K_nb_cluster, dtype=int)
        full_indicator_vector = np.zeros(X_data.shape[0], dtype=int)
        U_centroids_before = np.copy(U_centroids)
        U_centroids = np.zeros_like(U_centroids_before)

        for i_minibatch, example_batch_indexes in enumerate(
                DataGenerator(X_data,
                              batch_size=batch_size,
                              return_indexes=True)):
            logger.info(
                "Minibatch number {}/{}; Iteration number {}/{}".format(
                    i_minibatch, total_nb_of_minibatch, i_iter, nb_iter))

            example_batch = X_data[example_batch_indexes]
            example_batch_norms = X_data_norms[example_batch_indexes]

            indicator_vector, distances = assign_points_to_clusters(
                example_batch, U_centroids_before,
                X_norms=example_batch_norms)
            full_indicator_vector[example_batch_indexes] = indicator_vector

            cluster_names, counts = np.unique(indicator_vector,
                                              return_counts=True)
            count_vector = np.zeros(K_nb_cluster)
            count_vector[cluster_names] = counts

            full_count_vector = update_clusters(example_batch,
                                                U_centroids,
                                                K_nb_cluster,
                                                full_count_vector,
                                                count_vector,
                                                indicator_vector)

        # Update centroid locations using the newly assigned data point classes
        if proj_l1:
            if _lambda is None or epsilon is None:
                raise ValueError(
                    "epsilon and lambda must be set if proj_l1 is True")
            for i_centroid, centroid in enumerate(U_centroids):
                U_centroids[i_centroid, :] = proj_onto_l1_ball(
                    _lambda=_lambda, epsilon_tol=epsilon, vec=centroid)

        objective_function[i_iter] = compute_objective_by_batch(
            X_data, U_centroids, full_indicator_vector, batch_size)

        if i_iter >= 1:
            delta_objective_error = np.abs(
                objective_function[i_iter] - objective_function[i_iter - 1]) \
                / objective_function[i_iter - 1]
            # todo check that the relative error stays below the threshold for
            #  several successive iterations before stopping

        i_iter += 1

    return objective_function[:i_iter], U_centroids, full_indicator_vector
if __name__ == "__main__":
    batch_size = 10000
    nb_clust = 1000
    nb_iter = 30

    X = np.memmap(
        "/home/luc/PycharmProjects/qalm_qmeans/data/external/blobs_1_billion.dat",
        mode="r",
        dtype="float32",
        shape=(int(1e6), 2000))

    logger.debug("Initializing clusters")
    centroids_init = X[np.random.permutation(X.shape[0])[:nb_clust]]

    start = time.time()
    logger.debug("Nb iteration: {}".format(nb_iter))
    obj, _, _ = kmeans_minibatch(X, nb_clust, nb_iter, centroids_init,
                                 batch_size)
    stop = time.time()

    plt.plot(obj)
    plt.show()

    print("It took {} s".format(stop - start))
def qkmeans_minibatch(X_data: np.ndarray,
                      K_nb_cluster: int,
                      nb_iter: int,
                      nb_factors: int,
                      params_palm4msa: dict,
                      initialization: np.ndarray,
                      batch_size: int,
                      hierarchical_inside=False,
                      delta_objective_error_threshold=1e-6,
                      hierarchical_init=False):
    """
    :param X_data: The data matrix of n examples in dimension d, in shape (n, d).
    :param K_nb_cluster: The number of clusters to look for.
    :param nb_iter: The maximum number of iterations.
    :param nb_factors: The number of factors for the decomposition.
    :param initialization: The initial matrix of centroids, not yet factorized.
    :param params_palm4msa: The dictionary of parameters for the palm4msa algorithm.
    :param hierarchical_inside: Tells the algorithm if the hierarchical version of palm4msa should be used.
    :param delta_objective_error_threshold: The threshold on the normalized difference between the objective values at two successive iterations below which the computation is stopped.
    :param hierarchical_init: Tells if the algorithm should initialize the sparse factors with the hierarchical version of palm or not.
    :param batch_size: The size of each batch.

    :return:
    """
    assert K_nb_cluster == initialization.shape[0]

    logger.debug("Compute squared Frobenius norm of data")
    X_data_norms = get_squared_froebenius_norm_line_wise_batch_by_batch(
        X_data, batch_size)
    nb_examples = X_data.shape[0]
    total_nb_of_minibatch = X_data.shape[0] // batch_size

    X_centroids_hat = copy.deepcopy(initialization)

    # ############################ INIT PALM4MSA #############################
    logger.info("Initializing QKmeans with PALM algorithm")

    lst_factors = init_lst_factors(K_nb_cluster, X_centroids_hat.shape[1],
                                   nb_factors)

    eye_norm = np.sqrt(K_nb_cluster)

    ##########################
    # GET PARAMS OF PALM4MSA #
    ##########################
    init_lambda = params_palm4msa["init_lambda"]
    nb_iter_palm = params_palm4msa["nb_iter"]
    lst_proj_op_by_fac_step = params_palm4msa["lst_constraint_sets"]
    residual_on_right = params_palm4msa["residual_on_right"]
    delta_objective_error_threshold_inner_palm = params_palm4msa[
        "delta_objective_error_threshold"]
    track_objective_palm = params_palm4msa["track_objective"]

    ####################
    # INIT RUN OF PALM #
    ####################
    if hierarchical_inside or hierarchical_init:
        _lambda_tmp, op_factors, _, objective_palm, array_objective_hierarchical = \
            hierarchical_palm4msa(
                arr_X_target=np.eye(K_nb_cluster) @ X_centroids_hat,
                lst_S_init=lst_factors,
                lst_dct_projection_function=lst_proj_op_by_fac_step,
                f_lambda_init=init_lambda * eye_norm,
                nb_iter=nb_iter_palm,
                update_right_to_left=True,
                residual_on_right=residual_on_right,
                track_objective_palm=track_objective_palm,
                delta_objective_error_threshold_palm=delta_objective_error_threshold_inner_palm,
                return_objective_function=track_objective_palm)
    else:
        _lambda_tmp, op_factors, _, objective_palm, nb_iter_palm = \
            palm4msa(
                arr_X_target=np.eye(K_nb_cluster) @ X_centroids_hat,
                lst_S_init=lst_factors,
                nb_factors=len(lst_factors),
                lst_projection_functions=lst_proj_op_by_fac_step[-1]["finetune"],
                f_lambda_init=init_lambda * eye_norm,
                nb_iter=nb_iter_palm,
                update_right_to_left=True,
                track_objective=track_objective_palm,
                delta_objective_error_threshold=delta_objective_error_threshold_inner_palm)
    # #########################################################################

    lst_factors = None  # safe assignment for debug

    _lambda = _lambda_tmp / eye_norm

    objective_function = np.ones(nb_iter) * -1

    lst_all_objective_functions_palm = []
    lst_all_objective_functions_palm.append(objective_palm)

    i_iter = 0
    delta_objective_error = np.inf
    while ((i_iter < nb_iter)
           and (delta_objective_error > delta_objective_error_threshold)):
        logger.info("Iteration number {}/{}".format(i_iter, nb_iter))

        # Re-init palm factors for the iteration
        lst_factors_ = op_factors.get_list_of_factors()
        op_centroids = SparseFactors([lst_factors_[1] * _lambda]
                                     + lst_factors_[2:])

        # Prepare next epoch
        full_count_vector = np.zeros(K_nb_cluster, dtype=int)
        full_indicator_vector = np.zeros(X_data.shape[0], dtype=int)
        X_centroids_hat = np.zeros_like(X_centroids_hat)

        for i_minibatch, example_batch_indexes in enumerate(
                DataGenerator(X_data,
                              batch_size=batch_size,
                              return_indexes=True)):
            logger.info(
                "Minibatch number {}/{}; Iteration number {}/{}".format(
                    i_minibatch, total_nb_of_minibatch, i_iter, nb_iter))
            example_batch = X_data[example_batch_indexes]
            example_batch_norms = X_data_norms[example_batch_indexes]

            ##########################
            # Update centroid oracle #
            ##########################
            indicator_vector, distances = assign_points_to_clusters(
                example_batch, op_centroids, X_norms=example_batch_norms)
            full_indicator_vector[example_batch_indexes] = indicator_vector

            cluster_names, counts = np.unique(indicator_vector,
                                              return_counts=True)
            count_vector = np.zeros(K_nb_cluster)
            count_vector[cluster_names] = counts

            full_count_vector = update_clusters(example_batch,
                                                X_centroids_hat,
                                                K_nb_cluster,
                                                full_count_vector,
                                                count_vector,
                                                indicator_vector)

        objective_function[i_iter] = compute_objective_by_batch(
            X_data, op_centroids, full_indicator_vector, batch_size)

        # in-place modification of X_centroids_hat, full_count_vector and
        # full_indicator_vector
        check_cluster_integrity(X_data, X_centroids_hat, K_nb_cluster,
                                full_count_vector, full_indicator_vector)

        #########################
        # Do palm for iteration #
        #########################
        # create the diagonal of the sqrt of those counts
        diag_counts_sqrt_normalized = csr_matrix(
            (np.sqrt(full_count_vector / nb_examples),
             (np.arange(K_nb_cluster), np.arange(K_nb_cluster))))
        diag_counts_sqrt = np.sqrt(full_count_vector)

        # set it as the first factor
        op_factors.set_factor(0, diag_counts_sqrt_normalized)

        if hierarchical_inside:
            _lambda_tmp, op_factors, _, objective_palm, array_objective_hierarchical = \
                hierarchical_palm4msa(
                    arr_X_target=diag_counts_sqrt[:, None] * X_centroids_hat,
                    lst_S_init=op_factors.get_list_of_factors(),
                    lst_dct_projection_function=lst_proj_op_by_fac_step,
                    f_lambda_init=_lambda * np.sqrt(nb_examples),
                    nb_iter=nb_iter_palm,
                    update_right_to_left=True,
                    residual_on_right=residual_on_right,
                    return_objective_function=track_objective_palm,
                    track_objective_palm=track_objective_palm,
                    delta_objective_error_threshold_palm=delta_objective_error_threshold_inner_palm)
        else:
            _lambda_tmp, op_factors, _, objective_palm, nb_iter_palm = \
                palm4msa(
                    arr_X_target=diag_counts_sqrt[:, None] * X_centroids_hat,
                    lst_S_init=op_factors.get_list_of_factors(),
                    nb_factors=op_factors.n_factors,
                    lst_projection_functions=lst_proj_op_by_fac_step[-1]["finetune"],
                    f_lambda_init=_lambda * np.sqrt(nb_examples),
                    nb_iter=nb_iter_palm,
                    update_right_to_left=True,
                    track_objective=track_objective_palm,
                    delta_objective_error_threshold=delta_objective_error_threshold_inner_palm)

        _lambda = _lambda_tmp / np.sqrt(nb_examples)

        ############################

        lst_all_objective_functions_palm.append(objective_palm)

        if i_iter >= 1:
            delta_objective_error = np.abs(
                objective_function[i_iter] - objective_function[i_iter - 1]) \
                / objective_function[i_iter - 1]
            # todo check that the relative error stays below the threshold for
            #  several successive iterations before stopping

        i_iter += 1

    op_centroids = SparseFactors([lst_factors_[1] * _lambda]
                                 + lst_factors_[2:])

    return objective_function[:i_iter], op_centroids, full_indicator_vector, \
        lst_all_objective_functions_palm
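# Usage sketch for the mini-batch variant (illustration only): hypothetical
# sizes and a hypothetical build_constraint_sets placeholder, as in the qmeans
# sketch. Note the extra "delta_objective_error_threshold" and
# "track_objective" keys this function reads from params_palm4msa.
def _example_qkmeans_minibatch():
    n, d, K, nb_factors = 100_000, 64, 256, 4
    X = np.random.randn(n, d).astype(np.float32)   # a read-only np.memmap works as well
    centroids_init = np.copy(X[np.random.permutation(n)[:K]])

    params_palm4msa = {
        "init_lambda": 1.,
        "nb_iter": 300,
        "lst_constraint_sets": build_constraint_sets(K, d, nb_factors),  # hypothetical helper
        "residual_on_right": True,
        "delta_objective_error_threshold": 1e-6,
        "track_objective": False,
    }

    return qkmeans_minibatch(
        X, K, nb_iter=10, nb_factors=nb_factors,
        params_palm4msa=params_palm4msa,
        initialization=centroids_init,
        batch_size=10_000,
        hierarchical_inside=False)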