def test_once_by_random_features():
    """Check that a GaussianMixture rebuilt from fitted attributes scores the same.

    An 8-component mixture is fitted on random 10-column data. Its fitted
    attributes are then copied onto a default-constructed ``GaussianMixture``
    and both models score a second batch of random samples.

    Returns:
        True if every per-sample score of the copy equals (exact ``==``) the
        corresponding score of the fitted model, otherwise False.
    """
    n_features = 10
    # Draw the training batch first, then the held-out batch.
    train_features = numpy.random.random_sample(5000).reshape(-1, n_features)
    held_out_features = numpy.random.random_sample(500).reshape(-1, n_features)

    fitted_model = GaussianMixture(n_components=8, random_state=1)
    cloned_model = GaussianMixture()
    fitted_model.fit(train_features)

    # Attributes transplanted from the fitted model onto the blank one.
    transplanted_attributes = (
        "weights_",
        "means_",
        "covariances_",
        "precisions_",
        "precisions_cholesky_",
        "converged_",
        "n_iter_",
        "lower_bound_",
    )
    for attribute in transplanted_attributes:
        setattr(cloned_model, attribute, getattr(fitted_model, attribute))

    fitted_scores = fitted_model.score_samples(held_out_features)
    cloned_scores = cloned_model.score_samples(held_out_features)
    scores_match = fitted_scores == cloned_scores
    return all(scores_match)
def _cluster_with_gmm(document_representations, class_representations,
                      num_classes, random_state):
    """Cluster documents with a tied-covariance GMM seeded from class similarity.

    Each document is first hard-assigned to the class whose representation has
    the highest cosine similarity; the resulting one-hot matrix is used as the
    initial responsibilities of the mixture before ``fit`` is called.

    Returns:
        Tuple ``(documents_to_class, centers, distance)`` where ``centers`` is
        ``gmm.means_`` and ``distance`` is ``1 - predict_proba``.
    """
    cosine_similarities = cosine_similarity_embeddings(
        document_representations, class_representations)
    document_class_assignment = np.argmax(cosine_similarities, axis=1)

    # One-hot responsibilities: row i has a single 1.0 in its assigned class.
    num_documents = document_representations.shape[0]
    document_class_assignment_matrix = np.zeros((num_documents, num_classes))
    document_class_assignment_matrix[
        np.arange(num_documents), document_class_assignment] = 1.0

    gmm = GaussianMixture(n_components=num_classes, covariance_type='tied',
                          random_state=random_state,
                          n_init=999, warm_start=True)
    # HACK: relies on scikit-learn internals. With warm_start=True, the mere
    # presence of `converged_` is expected to make fit() reuse the parameters
    # set by the private `_initialize` call instead of re-initialising.
    # NOTE(review): in that mode n_init presumably has no effect - confirm
    # against the installed scikit-learn version.
    gmm.converged_ = "HACK"
    gmm._initialize(document_representations, document_class_assignment_matrix)
    # np.inf rather than np.infty: the latter alias was removed in NumPy 2.0.
    gmm.lower_bound_ = -np.inf
    gmm.fit(document_representations)

    documents_to_class = gmm.predict(document_representations)
    distance = 1 - gmm.predict_proba(document_representations)
    return documents_to_class, gmm.means_, distance


def _cluster_with_kmeans(document_representations, class_representations,
                         num_classes, random_state):
    """Cluster documents with KMeans initialised at the class representations.

    Returns:
        Tuple ``(documents_to_class, centers, distance)`` where ``centers`` is
        ``kmeans.cluster_centers_`` and ``distance[i][j]`` is the Euclidean
        distance between document ``i`` and center ``j``.
    """
    kmeans = KMeans(n_clusters=num_classes, init=class_representations,
                    random_state=random_state)
    kmeans.fit(document_representations)
    documents_to_class = kmeans.predict(document_representations)
    centers = kmeans.cluster_centers_

    distance = np.zeros(
        (document_representations.shape[0], centers.shape[0]), dtype=float)
    # One vectorised norm per center instead of one Python call per
    # (document, center) pair.
    for j, center in enumerate(centers):
        distance[:, j] = np.linalg.norm(
            document_representations - center, axis=1)
    return documents_to_class, centers, distance


def main(dataset_name, pca, cluster_method, lm_type, document_repr_type, random_state):
    """Cluster document representations of a dataset and pickle the result.

    Reads ``dataset.pk`` (key ``"class_names"``) and
    ``document_repr_lm-{lm_type}-{document_repr_type}.pk`` (keys
    ``"document_representations"`` and ``"class_representations"``) from
    ``INTERMEDIATE_DATA_FOLDER_PATH/<dataset_name>``, optionally applies PCA,
    clusters the documents, and writes ``data.<suffix>.pk`` to the same folder.

    Args:
        dataset_name: Sub-folder of ``INTERMEDIATE_DATA_FOLDER_PATH`` to use.
        pca: Passed to ``PCA(n_components=...)``; ``0`` disables PCA.
        cluster_method: ``'gmm'`` or ``'kmeans'``.
        lm_type: Used to locate the document representation file.
        document_repr_type: Used to locate the document representation file.
        random_state: Seed forwarded to PCA and the clustering model.

    Raises:
        ValueError: If ``cluster_method`` is neither ``'gmm'`` nor ``'kmeans'``.
    """
    # pca = 0 means no pca
    do_pca = pca != 0
    save_dict_data = {
        "dataset_name": dataset_name,
        "pca": pca,
        "cluster_method": cluster_method,
        "lm_type": lm_type,
        "document_repr_type": document_repr_type,
        "random_state": random_state,
    }

    naming_suffix = f"pca{pca}.clus{cluster_method}.{lm_type}.{document_repr_type}.{random_state}"
    print(naming_suffix)

    data_dir = os.path.join(INTERMEDIATE_DATA_FOLDER_PATH, dataset_name)
    print(data_dir)

    # NOTE: pickle.load can execute arbitrary code; these files are expected
    # to be trusted intermediate outputs of this project.
    with open(os.path.join(data_dir, "dataset.pk"), "rb") as f:
        dictionary = pk.load(f)
        class_names = dictionary["class_names"]
        num_classes = len(class_names)
        print(class_names)

    repr_path = os.path.join(
        data_dir, f"document_repr_lm-{lm_type}-{document_repr_type}.pk")
    with open(repr_path, "rb") as f:
        dictionary = pk.load(f)
        document_representations = dictionary["document_representations"]
        class_representations = dictionary["class_representations"]

    # Baseline prediction: nearest class by cosine similarity, computed on the
    # representations before any PCA is applied.
    repr_prediction = np.argmax(
        cosine_similarity_embeddings(document_representations,
                                     class_representations),
        axis=1)
    save_dict_data["repr_prediction"] = repr_prediction

    if do_pca:
        _pca = PCA(n_components=pca, random_state=random_state)
        document_representations = _pca.fit_transform(document_representations)
        # Project the class representations with the PCA fitted on documents.
        class_representations = _pca.transform(class_representations)
        print(f"Explained variance: {sum(_pca.explained_variance_ratio_)}")

    if cluster_method == 'gmm':
        documents_to_class, centers, distance = _cluster_with_gmm(
            document_representations, class_representations,
            num_classes, random_state)
    elif cluster_method == 'kmeans':
        documents_to_class, centers, distance = _cluster_with_kmeans(
            document_representations, class_representations,
            num_classes, random_state)
    else:
        # Previously an unknown method fell through and failed later with an
        # UnboundLocalError when saving the results.
        raise ValueError(
            f"Unknown cluster_method {cluster_method!r}; "
            "expected 'gmm' or 'kmeans'")

    save_dict_data["centers"] = centers
    save_dict_data["documents_to_class"] = documents_to_class
    save_dict_data["distance"] = distance

    with open(os.path.join(data_dir, f"data.{naming_suffix}.pk"), "wb") as f:
        pk.dump(save_dict_data, f)