import pandas as pd
from scipy.stats import variation  # coefficient of variation

# Project-level names assumed in scope: get_dataset_num, df_selection,
# paths, Inputs, get_pca_model, Clustering, model_goal.


def get_cv_cpv(x: str, percent: float) -> float:
    global model_goal

    # Get dataset number
    dataset_num = get_dataset_num(x)

    # Get number of PCs for CPV > 0.8 or CPV > 0.99
    if percent == 0.99:
        pcs_cpv = df_selection.loc[dataset_num, "Cum. Perc. Var. (0.99)"]
    else:
        pcs_cpv = df_selection.loc[dataset_num, "Cum. Perc. Var. (0.8)"]

    # Get df_results
    df = pd.read_csv(x)
    idx = df.features_kept == pcs_cpv
    try:
        return df.loc[idx].cv.values[0]
    except IndexError:
        # No cached result for this number of PCs; recompute it.
        inputs = Inputs(paths)
        inputs.random_seed = 1969
        inputs.get_df_split(dataset_num)
        pca_model = get_pca_model(inputs)
        cluster_model = Clustering(inputs.num_cluster, 100, inputs.random_seed)
        cluster_model.fit(pca_model.pcs_train.loc[:, :pcs_cpv - 1])
        cluster_prediction = cluster_model.predict(
            pca_model.pcs_test.loc[:, :pcs_cpv - 1])
        cluster_performances = cluster_model.get_cluster_performances(
            inputs.df_test.copy(), cluster_prediction, pcs_cpv,
            inputs.num_cluster, model_goal=model_goal)
        return variation(cluster_performances)
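# Usage sketch (illustrative; not from the original source). Assumes the
# module-level objects above (df_selection, paths, model_goal) are already
# initialized; the results path below is hypothetical.
results_path = "results/dataset_3_results.csv"  # hypothetical path
cv_99 = get_cv_cpv(results_path, percent=0.99)
cv_80 = get_cv_cpv(results_path, percent=0.8)
print(cv_99, cv_80)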
def cluster(self, shapelets):
    """
    Uses a clustering algorithm to reduce the number of shapelets.

    :param shapelets: list of shapelet candidates
    :type shapelets: np.array, shape = (len(shapelets), len(s), len(dim(s)))
    :return: list of remaining shapelet candidates
    :rtype: np.array, shape = (|remaining candidates|, len(s), len(dim(s)))
    """
    # Cluster the candidates (parameterized by self.d_max) and return the
    # nearest-neighbor centers as the reduced candidate set.
    clustering = Clustering(self.d_max)
    clustering.fit(shapelets)
    return clustering.nn_centers()
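# Usage sketch (illustrative; not from the original source). cluster()
# only reads self.d_max, so any host object exposing that attribute works
# when the function is called unbound; d_max=0.5 and the random candidate
# array are toy assumptions shaped per the docstring.
import numpy as np
from types import SimpleNamespace

selector = SimpleNamespace(d_max=0.5)   # hypothetical host object
candidates = np.random.rand(10, 20, 3)  # 10 shapelets, length 20, 3 dims
remaining = cluster(selector, candidates)
print(remaining.shape)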
from collections import defaultdict
from typing import Dict, List, Set


def perform_clustering(
        term_ids_to_embs: Dict[int, List[float]]) -> Dict[int, Set[int]]:
    """Cluster the given terms into 5 clusters.

    Args:
        term_ids_to_embs: A dictionary mapping term-ids to their embeddings.
    Return:
        A dictionary mapping each cluster label to its cluster. Each
        cluster is a set of term-ids.
    """
    # Case: fewer than 5 terms to cluster; each term becomes its own cluster.
    num_terms = len(term_ids_to_embs)
    if num_terms < 5:
        return {i: {tid} for i, tid in enumerate(term_ids_to_embs)}

    # Case: 5 or more terms to cluster.
    c = Clustering()
    term_ids_embs_items = list(term_ids_to_embs.items())
    results = c.fit([it[1] for it in term_ids_embs_items])
    labels = results['labels']
    print(' Density:', results['density'])
    clusters = defaultdict(set)
    for (term_id, _), label in zip(term_ids_embs_items, labels):
        clusters[label].add(term_id)
    return clusters
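# Usage sketch (illustrative; not from the original source), with toy
# 2-d embeddings. Fewer than 5 terms short-circuits into singleton
# clusters, so no Clustering instance is needed here.
toy = {1: [0.1, 0.2], 2: [0.9, 0.8], 3: [0.15, 0.22]}
print(perform_clustering(toy))  # -> {0: {1}, 1: {2}, 2: {3}}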
import pandas as pd


def clustering(x, df, n_clusters=10, distance='angular', method='K-medians'):
    """
    Do the clustering, based on the 91 features.

    Args:
        x: array of features
        df: dataframe of features
        n_clusters: number of clusters
        distance: either 'angular' or 'euclidean'
        method: one of 'K-medians', 'K-means', 'Hierarchical'
    Output:
        new_df: the dataframe labeled according to the clustering algorithm
        relevant_features_cs: a list with the relevant features (angles of
            the consecutive limbs) of the centroids
        cs: dictionary with the centroid features
    """
    relevant_features_id = [
        0, 3, 5, 13, 15, 17, 25, 46, 47, 56, 64, 65, 76, 77, 83, 85, 90
    ]
    keys_dict = [
        '0-1', '0-4', '0-6', '1-2', '1-4', '1-6', '2-3', '4-5', '4-6',
        '5-7', '6-8', '6-9', '8-9', '8-10', '9-12', '10-11', '12-13'
    ]
    clustering_ = Clustering(k=n_clusters, distance=distance, method=method)
    cs, cls = clustering_.fit(x)
    assert len(list(cls.keys())) == n_clusters

    # Rebuild a labeled dataframe from the per-cluster members.
    d = pd.DataFrame()
    labels = []
    for i in range(n_clusters):
        df1 = pd.DataFrame(cls[i])
        d = pd.concat([d, df1], sort=False)
        labels += [i] * len(cls[i])
    d.columns = df.columns
    d.insert(91, 'label', labels)
    new_df = df.reset_index().merge(d).set_index('index')

    # Collect the relevant limb-angle features of each centroid; this step
    # is skipped for the 'Hierarchical' method.
    relevant_features_cs = []
    if method != 'Hierarchical':
        for i in range(len(cs)):
            d = {}
            cs_rf = cs[i][relevant_features_id]
            for k in range(len(keys_dict)):
                d[keys_dict[k]] = cs_rf[k]
            relevant_features_cs.append(d)
    return new_df, relevant_features_cs, cs
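# Usage sketch (illustrative; not from the original source). The 91
# pose-angle features and the column names are hypothetical toy data.
import numpy as np

x = np.random.rand(200, 91)  # 200 samples, 91 features
df_feats = pd.DataFrame(x, columns=[f'f{i}' for i in range(91)])
new_df, relevant_features_cs, cs = clustering(
    x, df_feats, n_clusters=10, distance='angular', method='K-medians')
print(new_df['label'].value_counts())
print(relevant_features_cs[0])  # limb-angle dict of the first centroid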
import pandas as pd


def clustering(x, df, n_clusters):
    """
    Do the clustering, based on the 91 features.

    We compute the reconstructed poses only with the following default
    parameters:
        method: 'K-medians'
        distance: 'angular'

    Args:
        x: array of features
        df: dataframe of features
        n_clusters: number of clusters
    Output:
        new_df: the dataframe labeled according to the clustering algorithm
        relevant_features_cs: a list with the relevant features (angles of
            the consecutive limbs) of the centroids
        cs: dictionary with the centroid features
    """
    # Assumed identical to the general clustering() variant above; these
    # names (and the df argument) are used below but were left undefined
    # in this snippet.
    relevant_features_id = [
        0, 3, 5, 13, 15, 17, 25, 46, 47, 56, 64, 65, 76, 77, 83, 85, 90
    ]
    keys_dict = [
        '0-1', '0-4', '0-6', '1-2', '1-4', '1-6', '2-3', '4-5', '4-6',
        '5-7', '6-8', '6-9', '8-9', '8-10', '9-12', '10-11', '12-13'
    ]
    clustering_ = Clustering(k=n_clusters)
    cs, cls = clustering_.fit(x)
    assert len(cs) == n_clusters

    # Rebuild a labeled dataframe from the per-cluster members.
    d = pd.DataFrame()
    labels = []
    for i in range(len(cs)):
        df1 = pd.DataFrame(cls[i])
        d = pd.concat([d, df1], sort=False)
        labels += [i] * len(cls[i])
    d.columns = df.columns
    d.insert(91, 'label', labels)
    new_df = df.reset_index().merge(d).set_index('index')

    # Collect the relevant limb-angle features of each centroid.
    relevant_features_cs = []
    for i in range(len(cs)):
        d = {}
        cs_rf = cs[i][relevant_features_id]
        for k in range(len(keys_dict)):
            d[keys_dict[k]] = cs_rf[k]
        relevant_features_cs.append(d)
    return new_df, relevant_features_cs, cs
from clustering import Clustering

# from embeddings import *

words = [
    'computer', 'algorithm', 'program', 'bear', 'cat', 'snake', 'fish',
    'tree', 'flower', 'grass', 'tea', 'water', 'milk'
]

# embedder = FastTextE()
# embedder.load_model()

# Load pretrained fastText vectors. The first line of a .vec file is a
# "<vocab_size> <dim>" header, so skip it; the remaining lines are
# "token v1 v2 ... v300", parsed here into float vectors.
emb_dict = {}
with open('fasttext-wiki-news-300d-1M.vec', 'r', encoding='utf8') as f:
    next(f)
    for line in f:
        sp_line = line.rstrip().split(' ')
        token, vector = sp_line[0], [float(v) for v in sp_line[1:]]
        emb_dict[token] = vector

word_embeddings = [emb_dict[word] for word in words]

clus = Clustering()
print(clus)
print(clus.clus_type)
print(clus.affinity)
print(clus.fit(word_embeddings))