def get_km(xs, n):
    km = KMedoids(n_cluster=n, max_iter=1000, tol=1e-5)
    km.fit(xs)
    kmidx = list(km.medoids)
    testidx = [i for i in range(len(xs)) if i not in kmidx]
    np.random.shuffle(testidx)
    return kmidx, testidx[:100]
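# Hypothetical usage sketch (not from the original source): it assumes `xs` is a
# NumPy array of feature vectors and that the KMedoids class used above is
# importable. The medoid indices act as a representative "kept" subset and the
# shuffled remainder (capped at 100) as held-out points.
xs = np.random.rand(500, 16)            # toy data, purely illustrative
keep_idx, test_idx = get_km(xs, n=20)
kept_points, held_out = xs[keep_idx], xs[test_idx]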
def __init__(self, corpus, document_titles, num_clusters=None, num_features=None):
    """
    Computes the interpreter matrix by calculating the TF-IDF value of each
    token for each concept (document) in the corpus.

    document_titles gives the name of each concept (document) in the corpus.
    num_features gives the number of features of the corpus.
    If num_clusters is None, all documents are used as concepts.
    """
    if not num_clusters:
        self.num_clusters = len(document_titles)
    else:
        self.num_clusters = num_clusters

    if num_features is None:
        logger.info("scanning corpus to determine the number of features")
        num_features = 1 + utils.get_max_id(corpus)
    self.num_features = num_features

    # reduce column count by k-medoid clustering, keeping the medoid of each cluster
    # TODO: skip clustering when num_clusters == None
    clusterer = KMedoids(corpus=corpus,
                         num_features=self.num_features,
                         num_clusters=self.num_clusters,
                         max_iterations=10)
    clusters = clusterer.cluster()

    # set the corpus to the medoids
    # the corpus is our interpreter matrix; it is not sparse
    # each column is a document and is treated as a concept
    self.corpus = clusterer.get_medoids().T

    # reduce document titles to the cluster representatives
    self.document_titles = DocumentTitles()
    for cluster_id in clusters.iterkeys():
        self.document_titles.append(document_titles[cluster_id] or "no title")

    # print clusters with their members
    for cluster_id, members in clusters.iteritems():
        cluster_title = document_titles[cluster_id]
        member_titles = ", ".join(document_titles[member_id] for member_id in members)
        logger.debug("%s: %s" % (cluster_title, member_titles))
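# Minimal preparation sketch (an assumption, not from the original project):
# the `corpus` argument above is a gensim bag-of-words corpus, so the inputs
# could be built like this, with `num_features` derived exactly as in __init__.
from gensim import utils
from gensim.corpora import Dictionary

texts = [["cat", "sat", "mat"], ["dog", "barked"], ["cat", "dog"]]
document_titles = ["Cats", "Dogs", "Pets"]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
num_features = 1 + utils.get_max_id(corpus)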
def fit(self, X, y, **kwargs):
    if self.verbose:
        print("fitting", self)
    clss = sorted(set(y))
    meds = []
    for c, k in enumerate(clss):
        # pick self.mk medoids per class, seeded with k-means++
        idxs_c = np.where(y == k)[0]
        kmpp_idxs = kmeanspp(X[idxs_c], self.mk, seed=self.seed)
        kmed = KMedoids(self.mk, init=kmpp_idxs)
        kmed.fit(X[idxs_c], dist=False)
        meds.append(idxs_c[kmed.medoids].tolist())
    self.idxs = np.concatenate(meds, axis=0)
    self.vecs = X[self.idxs]
    cc = Counter(y[self.idxs])
    # every class must contribute exactly self.mk medoids
    assert set(cc.values()) == set({self.mk})
    if self.verbose:
        print("fitted KMedoids seed={},".format(self.seed), cc)
    self.clf.fit(self.vecs, y[self.idxs])
    return self
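# Standalone sketch of the same per-class prototype-selection idea, written
# against scikit-learn-extra's KMedoids (an assumption: the snippet above uses a
# different KMedoids with a kmeans++-seeded `init` and a `medoids` attribute).
import numpy as np
from collections import Counter
from sklearn_extra.cluster import KMedoids
from sklearn.neighbors import KNeighborsClassifier

def select_prototypes(X, y, per_class=5, seed=0):
    # keep `per_class` medoids from every class, mirroring fit() above
    idxs = []
    for k in sorted(set(y)):
        idxs_c = np.where(y == k)[0]
        km = KMedoids(n_clusters=per_class, init="k-medoids++",
                      random_state=seed).fit(X[idxs_c])
        idxs.extend(idxs_c[km.medoid_indices_].tolist())
    return np.array(idxs)

X = np.random.rand(200, 8)
y = np.repeat([0, 1], 100)
proto_idx = select_prototypes(X, y)
assert Counter(y[proto_idx]) == Counter({0: 5, 1: 5})   # 5 prototypes per class
clf = KNeighborsClassifier(n_neighbors=1).fit(X[proto_idx], y[proto_idx])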
def test_write_on_file(self):
    K = 10

    # Output population
    pp = get_population()
    fout = open('pp.csv', 'w')
    fout.write('x,y,label\n')
    for x, y in pp:
        fout.write('%f,%f,k\n' % (x, y))
    fout.close()

    # Output MDS samples
    lin = Linear(self.dist_func, K=K)
    lin.update(pp)
    sampled = lin.get_sampled()
    fout = open('mcs.csv', 'w')
    fout.write('x,y,label\n')
    for x, y in sampled:
        fout.write('%f,%f,b\n' % (x, y))
    fout.close()

    # Output random samples
    random.seed(1)
    sampled = random.sample(pp, K)
    fout = open('random.csv', 'w')
    fout.write('x,y,label\n')
    for x, y in sampled:
        fout.write('%f,%f,g\n' % (x, y))
    fout.close()

    # Output k-medoids samples
    kmedoids = KMedoids(self.dist_func, K)
    sampled = kmedoids.sample(pp)
    fout = open('kmedoids.csv', 'w')
    fout.write('x,y,label\n')
    for x, y in sampled:
        fout.write('%f,%f,r\n' % (x, y))
    fout.close()
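# Optional follow-up (an assumption, not part of the test): the `label` column
# written above holds matplotlib colour codes ('k', 'b', 'g', 'r'), so the four
# CSVs can be overlaid in one scatter plot to compare the samplers visually.
import csv
import matplotlib.pyplot as plt

for fname in ('pp.csv', 'mcs.csv', 'random.csv', 'kmedoids.csv'):
    with open(fname) as f:
        rows = list(csv.DictReader(f))
    plt.scatter([float(r['x']) for r in rows],
                [float(r['y']) for r in rows],
                c=rows[0]['label'], label=fname, s=10)
plt.legend()
plt.savefig('sampler_comparison.png')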
def run_kmedoids(self, params):
    """
    Performs clustering using the k-medoids algorithm.

    :type params: dictionary
    :param params: {'k', 't_max', 'init', 'criterion'}
    """
    self.n_clusters_ = params['k']
    kmedoids = KMedoids(n_clusters=params['k'],
                        max_iter=params['t_max'],
                        init=params['init'],
                        criterion=params['criterion']).fit(self.dist_mat)
    self.centers_l = kmedoids.cluster_centers_
    self.labels_l = kmedoids.labels_
    self.form_kmedoids_results()
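# Hypothetical call site: the enclosing class, its `dist_mat` attribute, and the
# accepted values for 'init' and 'criterion' are not shown above, so every value
# here is a placeholder.
params = {'k': 8, 't_max': 300, 'init': 'random', 'criterion': 'distance'}
clusterer.run_kmedoids(params)      # `clusterer` is an instance of the class above
print(clusterer.labels_l)
print(clusterer.centers_l)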
    }
else:
    raise Exception(f"Unrecognized dataset: {args.dataset}")

if args.dis == "euclidean":
    make_pretrainer = lambda: KMeans(n_clusters=n_clusters)
    dis = DMAE.Dissimilarities.euclidean
    dis_loss = DMAE.Losses.euclidean_loss
    init_dmae = lambda pretrainer: {
        "centers": DMAE.Initializers.InitKMeans(pretrainer),
        "mixers": tf.keras.initializers.Constant(1.0)
    }
    cov = False
elif args.dis == "cosine":
    make_pretrainer = lambda: KMedoids(n_clusters=n_clusters, metric="cosine")
    dis = DMAE.Dissimilarities.cosine
    dis_loss = DMAE.Losses.cosine_loss
    init_dmae = lambda pretrainer: {
        "centers": DMAE.Initializers.InitKMeans(pretrainer),
        "mixers": tf.keras.initializers.Constant(1.0)
    }
    cov = False
elif args.dis == "manhattan":
    make_pretrainer = lambda: KMedoids(n_clusters=n_clusters, metric="manhattan")
    dis = DMAE.Dissimilarities.manhattan
    dis_loss = DMAE.Losses.manhattan_loss
    init_dmae = lambda pretrainer: {
        "centers": DMAE.Initializers.InitKMeans(pretrainer),
def experiments(PORCENTAJE_VECINOS, ALGORITHM, MODELO, normalizar=None):
    vecinos = algorithms[ALGORITHM]
    algoritmos = "coseno"
    if PORCENTAJE_VECINOS in ["boost", "maxsim", "dist"]:
        algoritmos = ALGORITHM + "-" + PORCENTAJE_VECINOS
    elif PORCENTAJE_VECINOS != 0:
        algoritmos = "%s-%.1f" % (ALGORITHM, PORCENTAJE_VECINOS)

    titulo = MODELO + "-" + algoritmos
    if normalizar is not None:
        titulo += "-" + normalizar

    fname = sys.argv[2] + "/" + titulo + ".out"
    if os.path.isfile(fname):
        return

    print(titulo)
    print("-" * 20)

    if PORCENTAJE_VECINOS == 0:
        X = coseno
        if MODELO == "dbscan":  # only valid for cosine!
            X = 1 - X
    else:
        neighbour_file_name = sys.argv[2] + "/" + ALGORITHM + ".npy"
        if os.path.isfile(neighbour_file_name):
            NEIGHBOURS = np.load(neighbour_file_name)
        else:
            print("Calculando vecinos")
            NEIGHBOURS = np.zeros((len(service_number), len(service_number)))
            for i in range(0, len(service_number)):
                for j in range(i, len(service_number)):
                    NEIGHBOURS[i][j] = vecinos(followers, users, i, j)
                    if i != j:
                        NEIGHBOURS[j][i] = NEIGHBOURS[i][j]
            np.save(neighbour_file_name, NEIGHBOURS)

        if normalizar is not None:
            print("Normalizando Vecinos")
            if normalizar == 'minmax':
                NEIGHBOURS = preprocessing.minmax_scale(NEIGHBOURS)
            elif normalizar == 'scale':
                NEIGHBOURS = preprocessing.scale(NEIGHBOURS)
            elif normalizar == 'robust':
                NEIGHBOURS = preprocessing.robust_scale(NEIGHBOURS)
            elif normalizar == 'softmax':
                NEIGHBOURS = np.exp(NEIGHBOURS) / np.sum(np.exp(NEIGHBOURS), axis=1, keepdims=True)
            elif normalizar == 'matrixminmax':
                NEIGHBOURS = (NEIGHBOURS - np.min(NEIGHBOURS)) / (np.max(NEIGHBOURS) - np.min(NEIGHBOURS))
            elif normalizar == 'matrixmax':
                NEIGHBOURS = NEIGHBOURS / np.max(NEIGHBOURS)

        if MODELO == "dbscan":  # distance case
            if normalizar is not None:
                NEIGHBOURS = 1 - NEIGHBOURS
            else:
                NEIGHBOURS = -NEIGHBOURS
            X = (1 - PORCENTAJE_VECINOS) * (1 - coseno) + PORCENTAJE_VECINOS * NEIGHBOURS
        else:  # affinity case
            if PORCENTAJE_VECINOS == "boost":
                X = np.multiply(coseno, NEIGHBOURS)
            elif PORCENTAJE_VECINOS == "maxsim":
                X = np.maximum(coseno, NEIGHBOURS)
            elif PORCENTAJE_VECINOS == "dist":
                NEIGHBOURS_SORTED = np.argsort(np.argsort(NEIGHBOURS))
                COSINE_SORTED = np.argsort(np.argsort(coseno))
                POS_BOOST = np.log(1 / (1 + np.abs(NEIGHBOURS_SORTED - COSINE_SORTED)))
                X = POS_BOOST
            else:
                X = (1 - PORCENTAJE_VECINOS) * coseno + PORCENTAJE_VECINOS * NEIGHBOURS

    print("Generando Modelo")
    if MODELO == 'kmedoids':
        model = KMedoids(n_clusters=1500).fit(X)
    elif MODELO == 'kmedoids470':
        model = KMedoids(n_clusters=470).fit(X)
    elif MODELO == 'ap':
        model = AffinityPropagation(affinity='precomputed').fit(X)
    elif MODELO == 'dbscan':
        model = DBSCAN(metric='precomputed').fit(X)

    labels = model.labels_
    clusters = defaultdict(list)
    for index, classif in enumerate(labels):
        clusters[classif].append(index)
    n_clusters_ = len(clusters)

    info = ""
    info += 'Clusters: %d\n' % n_clusters_
    # info += 'Cohesiveness: %0.3f\n' % cohesiveness(X, labels)
    info += 'Entropy: %0.3f\n' % entropy(labels_true, labels)
    info += "Homogeneity: %0.3f\n" % metrics.homogeneity_score(labels_true, labels)
    info += "Completeness: %0.3f\n" % metrics.completeness_score(labels_true, labels)
    info += "V-measure: %0.3f\n" % metrics.v_measure_score(labels_true, labels)
    info += 'Purity: %0.3f\n' % purity(labels_true, labels)
    info += "F-Measure: %0.3f\n" % fmeasure(labels_true, labels)
    info += "Adjusted Rand Index: %0.3f\n" % metrics.adjusted_rand_score(labels_true, labels)
    info += "Adjusted Mutual Information: %0.3f\n" % metrics.adjusted_mutual_info_score(labels_true, labels)

    clustersize = Counter(labels)
    salida = open(fname, 'w', encoding='UTF-8')
    print(info)
    salida.write(titulo + "\n")
    for cluster, services in clusters.items():
        countcat = Counter([labels_true[svc] for svc in services])
        max_key, num = countcat.most_common(1)[0]
        salida.write("%i (%s - %i/%i): %s \n" % (
            cluster, max_key, num, clustersize[cluster],
            ",".join([service_list[svc] for svc in services])))
    salida.write("-" * 20 + "\n")
    salida.write(info)
    salida.close()
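# Illustrative driver loop (an assumption; the globals `coseno`, `followers`,
# `users`, `algorithms`, etc. are defined elsewhere in the script, and "jaccard"
# stands in for whatever keys `algorithms` actually contains): sweep a few
# neighbour weights for the k-medoids model, writing one .out report each.
for weight in (0, 0.3, 0.5, "boost"):
    experiments(weight, "jaccard", "kmedoids", normalizar="minmax")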