def medoids(simulation_object, w_samples, b, B=200):
    inputs_set, psi_set, _, _, z = select_top_candidates(
        simulation_object, w_samples, B)
    D = pairwise_distances(psi_set, metric='euclidean')
    M, C = kmedoids.kMedoids(D, b)
    return inputs_set[M, :z], inputs_set[M, z:]
def main():
    '''do clustering'''
    args = get_args()
    data = []
    with open(args[2]) as json_data:
        for line in json_data:
            tweet = json.loads(line)
            data.append(tweet['text'])
    print('The scikit-learn version is {}.'.format(sklearn.__version__))
    print('distance')
    # distance matrix; 'jaccard' must be passed as the metric keyword,
    # not positionally (which would bind it to the Y argument)
    distance = pairwise_distances(data, metric='jaccard')
    print('splitting')
    # split into k clusters
    M, clusters = kmedoids.kMedoids(distance, int(args[1]))
    print('split')
    print('medoids:')
    for point_idx in M:
        print(data[point_idx])
    print('')
    print('clustering result:')
    for label in clusters:
        for point_idx in clusters[label]:
            print('label {0}: {1}'.format(label, data[point_idx]))
def run_kmedoids(self, distances):
    center_indices, labels_dict = kmedoids.kMedoids(
        distances, self.branching_factor)
    labels = np.empty(distances.shape[0]).astype(int)
    for key, value in labels_dict.items():
        labels[value] = key
    return center_indices, labels
def solver():
    parser = argparse.ArgumentParser()
    parser.add_argument("integer", type=int,
                        help="Please give arguments as 'Centroid','Min','Max'")
    args = parser.parse_args()
    clusters = args.integer
    reader = DataReader()
    data = reader.loadData()
    simMatrix, indexes = genSimilarityMatrix(data)
    M, C = kmedoids.kMedoids(simMatrix, clusters)
    fileWriter = open('data/Kmedoids_output_{}.txt'.format(clusters), 'w')
    print('medoids', file=fileWriter)
    i = 1
    for point in M:
        print('medoid of cluster ', i, ' ', indexes[point], file=fileWriter)
        i = i + 1
    print(' ', file=fileWriter)
    print('clustering result:', file=fileWriter)
    i = 1
    for label in C:
        for point_idx in C[label]:
            print('Cluster ', i, ': ', indexes[point_idx], file=fileWriter)
        i = i + 1
    fileWriter.close()
    print("Clustering done! No. of new clusters: {}".format(clusters))
    print("New clusters are stored in file data/Kmedoids_output_{}.txt".format(
        clusters))
def k_medoids(sample, num_clusters):
    # clusters the samples into the number of clusters (num_clusters)
    # according to the k-medoids clustering algorithm and returns the
    # medoids and the samples that belong to each cluster
    D = distance_matrix(sample, sample)
    M, C = kMedoids(D, num_clusters)
    return M, C
def consensus_matrix(distance_mx, ks):
    print("Building consensus matrix")
    n, m, _ = distance_mx.shape  # n distance matrices, each m x m
    cons_mx = np.zeros((ks - 1, m, m))
    for k in range(2, ks):
        count = 1
        for node in distance_mx:
            print("Clustering for k = " + str(k) + " node " + str(count))
            _, clusters = km.kMedoids(node, k)
            for value in clusters.values():
                pairs = list(combinations(value, 2))
                for ij in pairs:
                    i, j = ij
                    cons_mx[k - 2][i][j] += 1
                    cons_mx[k - 2][j][i] += 1
            count += 1
        cons_mx[k - 2] = cons_mx[k - 2] / float(n)
        cons_mx[k - 2] = cons_mx[k - 2] / float(k)
    cons_mx = np.sum(cons_mx, axis=0)
    print("...built!")
    return cons_mx
def clust_creation(x):
    # T0 = time()
    windscen = {}
    windscen_files = [WindScen_file_1, WindScen_file_2, WindScen_file_3,
                      WindScen_file_4, WindScen_file_5, WindScen_file_6,
                      WindScen_file_7, WindScen_file_8, WindScen_file_9,
                      WindScen_file_10, WindScen_file_11, WindScen_file_12,
                      WindScen_file_13, WindScen_file_14, WindScen_file_15]
    for s, f in enumerate(windscen_files, start=1):
        windscen[s] = pd.read_csv(f, index_col=0)
    windinfo = pd.read_csv(windfarms_file, index_col=0)
    windfarms = windinfo.index.tolist()
    scenprob_init = {s: 1.0 / NScen for s in range(1, NScen + 1)}
    timeseries = []
    for k in range(len(windfarms)):
        timeseries.append([])
        for i in range(1, NScen + 1):
            timeseries[k].append(windscen[k + 1]['{0}'.format(i)].values)
    time_series = []
    for i in range(1, NScen + 1):
        l = list()
        for k in range(len(windfarms)):
            l += timeseries[k][i - 1].tolist()
        time_series.append(l)
    # use k-medoids to generate the clusters and the centroids
    # (the significant scenario of every cluster)
    n_clusters = x
    D = pairwise_distances(np.array(time_series), metric='euclidean')
    # M is a list of medoids and C the clusters (repartition of the scenarios
    # into clusters, holding only scenario IDs, not data)
    M, C = kmedoids.kMedoids(D, n_clusters)
    cluster = []
    for i in range(n_clusters):
        cluster.append(list(C[i]))
    medoid_prob = {
        p: sum(scenprob_init[i + 1] for i in cluster[p])
        for p in range(len(cluster))
    }
    scenprob = {}
    clusters = []
    for i in range(len(cluster)):
        add_med = list(M)
        add_med.pop(i)
        clusters.append(cluster[i] + add_med)
        scenprob[i] = {}
        for j in clusters[i]:
            if j in cluster[i]:
                scenprob[i][j + 1] = scenprob_init[j + 1]
            else:
                scenprob[i][j + 1] = medoid_prob[list(M).index(j)]
    return (clusters, scenprob)
def boundary_medoids(simulation_object, w_samples, b, B=200):
    inputs_set, psi_set, _, _, z = select_top_candidates(simulation_object,
                                                         w_samples, B)
    hull = ConvexHull(psi_set)
    simplices = np.unique(hull.simplices)
    boundary_psi = psi_set[simplices]
    boundary_inputs = inputs_set[simplices]
    D = pairwise_distances(boundary_psi, metric='euclidean')
    M, C = kmedoids.kMedoids(D, b)
    return boundary_inputs[M, :z], boundary_inputs[M, z:]
def partitional_approach(frame_instances, percentage=10):
    '''
    Find prototypical frame instances using a partitional clustering approach
    '''
    condensed_matrix, instance_indexes = create_distance_matrix(
        frame_instances)
    # integer division: kMedoids expects an integer cluster count
    num_clusters = len(frame_instances) // percentage + 1
    medoids, clusters = kmedoids.kMedoids(squareform(condensed_matrix),
                                          num_clusters)
    medoid_instances = {}
    for point_index in medoids:
        frame_id = instance_indexes[point_index]
        medoid_instances[frame_id] = format_instance(frame_instances[frame_id])
    return medoid_instances
def main():
    '''do clustering'''
    args = get_args()
    data = get_data(args, 2)
    data, classes = extract(data, 0)
    # data = fit_encode(data)
    data = np.array(data)
    # distance matrix
    distance = pairwise_distances(data, metric='euclidean')
    # split into k clusters
    M, clusters = kmedoids.kMedoids(distance, int(args[1]))
    print('centers:')
    for point_idx in M:
        print(data[point_idx])
def kmedoid(attributes, ids, n=2):
    # dimensionality reduction
    data = np.array(attributes)
    reduced_data = PCA(n_components=2).fit_transform(data)
    D = pairwise_distances(reduced_data, metric='euclidean')
    # split into n clusters; M stores the points regarded as centers
    M, C = kmedoids.kMedoids(D, n)
    group_members = [[] for i in range(n)]
    for i in range(n):
        for j in C[i]:
            group_members[i].append(ids[j])
    show_kmedoid(M, C, reduced_data)
    # return the clustering result and the ids of patients for tracing
    return group_members, M, C, reduced_data
def __init__(self, n=20):
    (self.data_matrix, self.items, self.features,
     self.sim_items, self.sim_feats) = get_data()
    k = min(n, 200)
    for i in range(500):
        try:
            medoids, clusters = kMedoids(self.sim_items, k)
            break
        except Exception:
            continue
    else:
        # all 500 attempts failed: fall back to a single cluster
        print("medoids failed")
        medoids = [0]
        clusters = {0: list(range(len(self.items)))}
    super(GoodN, self).__init__(9, k, medoids)
def kmedoids_active_learning(xtrain, ytrain, xact_sort, yact_sort, cut, n):
    from sklearn.metrics.pairwise import pairwise_distances
    from sklearn import svm
    import kmedoids
    print('kmedoids')
    result = np.zeros(n)
    for i in range(1, n):
        xact_sort = xact_sort[:cut, :]
        yact_sort = yact_sort[:cut]
        D = pairwise_distances(xact_sort, metric='euclidean')
        M, C = kmedoids.kMedoids(D, i)
        xact_medoids = xact_sort[M, :]
        yact_medoids = yact_sort[M]
        xtrain_new = np.concatenate((xtrain, xact_medoids), axis=0)
        ytrain_new = np.concatenate((ytrain, yact_medoids), axis=0)
        act_learn = svm.SVC(kernel='linear', C=1)
        act_learn.fit(xtrain_new, ytrain_new)
        # xtest and ytest are assumed to be defined in the enclosing scope
        score_km = act_learn.score(xtest, ytest)
        result[i] = score_km
        print(score_km)
    return result
def k_medoids_clust(self, data, dist_matrix, num_iter):
    # def k_medoids_cluster(self, data, dist_matrix, num_iter, w, r, verbose=True):
    # Turn dist_matrix from upper-triangle to full
    for i in range(len(data)):
        for j in range(0, i):
            dist_matrix[i][j] = dist_matrix[j][i]
    import kmedoids
    M, C = kmedoids.kMedoids(dist_matrix, self.num_clust, num_iter)

    # Wrap up: get into the same format as kmeans
    self.medoids = []
    for c_ts_idx in M:
        self.medoids.append(data[c_ts_idx])

    self.ts_dists = defaultdict(dict)
    self.assignments = defaultdict(list)
    for c in C:  # cluster labels are just 0, 1, 2, ..., k-1
        c_ts_idx = M[c]
        for ts_idx in C[c]:
            self.assignments[c].append(data[ts_idx])
            self.ts_dists[c][ts_idx] = max(dist_matrix[c_ts_idx][ts_idx],
                                           dist_matrix[ts_idx][c_ts_idx])

    # Even though the whole point of medoids is to avoid Euclidean mean-based
    # centroids, it is still nice to show the 'mean' of the curves in each
    # cluster so that kmedoids produces smoother representations of each curve.
    self.centroids = []
    for c in C:
        cur_centroid = np.zeros(data.shape[1])
        for ts_idx in C[c]:
            cur_centroid += data[ts_idx]
        cur_centroid /= len(C[c])
        self.centroids.append(cur_centroid)
def clusterKMedoids(shooterMeans, max_clusters=10):
    medoids = list()
    clusterings = list()
    performances = list()
    n, _ = shooterMeans.shape
    D = pairwise_distances(shooterMeans)
    for k in range(1, max_clusters + 1):
        print(k)
        m, c = kmedoids.kMedoids(D, k, 10000)
        medoids.append(m)
        # Cluster output of kMedoids is a dictionary {cid: [x where c(x)==cid]}.
        # Transform it to a list of cluster ids for similarity with kMeans.
        labels = [-1] * n
        for label in c:
            for idx in c[label]:
                labels[idx] = label
        labels = np.array(labels)  # transform so that np.nonzero works
        # within-group sum of squares
        wgss = 0
        for label in range(k):
            indices = np.nonzero(labels == label)[0]
            for combo in combinations(indices, 2):
                v1 = shooterMeans.loc[combo[0]]
                v2 = shooterMeans.loc[combo[1]]
                d = [v1[name] - v2[name] for name in shooterMeans.columns]
                dist = np.linalg.norm(d) ** 2
                wgss += dist
        clusterings.append(labels)
        performances.append(wgss)
    return clusterings, performances, medoids
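# Side note on the snippet above: the within-group sum of squares is rebuilt
# from the raw feature vectors even though D already holds the pairwise
# Euclidean distances over the same rows. A hedged equivalent that reuses D
# directly; wgss_from_D is a hypothetical helper name introduced here.
import numpy as np
from itertools import combinations

def wgss_from_D(D, labels, k):
    # sum of squared pairwise distances within each cluster,
    # read straight off the precomputed distance matrix
    total = 0.0
    for label in range(k):
        indices = np.nonzero(labels == label)[0]
        for i, j in combinations(indices, 2):
            total += D[i, j] ** 2
    return total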
g = sent_tokenize(f)
summary = open("summary.txt", "w")
op = open("med.txt", "w")
text = open("sentcode.txt", "r")
data = []
size = int(np.shape(vectorop)[0] / 4)
for line in text:
    data.append(line.strip().split())
data = np.asarray(data)
index = []
# distance matrix
D = pairwise_distances(data, metric='euclidean')
# split into `size` clusters
M, C = kmedoids.kMedoids(D, size)
print('medoids:')
for point_idx in M:
    print(data[point_idx], file=op)
    with open("sentcode.txt") as myFile:
        for num, line in enumerate(myFile, 1):
            if data[point_idx][1] in line:
                index.append(num)
index.sort()
print(index)
for i in index:
    print(g[i - 1], file=summary)
# (tail of the manhattan(d1, d2) distance function used below)
        for e1 in r1:
            e1min = 100
            for e2 in r2:
                if area_dist[unique_area_idx[e1], unique_area_idx[e2]] < e1min:
                    e1min = area_dist[unique_area_idx[e1], unique_area_idx[e2]]
            dist += e1min
        return dist


researcher_dist = np.ones((len(aid), len(aid)))
for x1, d1 in enumerate(aid):
    for x2, d2 in enumerate(aid):
        researcher_dist[x1, x2] = manhattan(d1, d2)
# symmetrize the distance matrix
researcher_dist = np.maximum(researcher_dist, researcher_dist.T)
medoids, clusters = kmedoids.kMedoids(researcher_dist, n_clusters)
output = {}
for c in clusters.values():
    group = []
    for d in c:
        group += [rid[d]]
    for d in c:
        output[rid[d]] = list(set(group) - set([rid[d]]))
with open('data.json', 'w') as outfile:
    json.dump(output, outfile)
def compute_sankey(results_search, n_max_clusters, n_min_clusters,
                   n_repet_assess_cluster_number, List_actions, day_selected):
    liste_resfinal = results_search
    webpages = [x[1] for x in liste_resfinal]
    flattened_webpages = [item for sublist in webpages for item in sublist]
    flattened_webpages = list(set(flattened_webpages))
    features = pd.DataFrame(index=flattened_webpages)

    # pairwise distance matrix between page names
    distance_matrix = [[0 for i in range(len(flattened_webpages))]
                       for i in range(len(flattened_webpages))]
    for i in range(len(flattened_webpages)):
        for j in range(i + 1, len(flattened_webpages)):
            x_page_name = delete_first_tag(flattened_webpages[i])
            y_page_name = delete_first_tag(flattened_webpages[j])
            distance_matrix[i][j] = distance(x_page_name, y_page_name)
            distance_matrix[j][i] = distance_matrix[i][j]
    distance_matrix = np.array(distance_matrix)

    # pick the cluster count with the best average silhouette score
    n_max = n_max_clusters
    n_min = n_min_clusters
    range_n_clusters = [i for i in range(n_min, n_max)]
    silhouette_avg_scores = [0 for i in range(n_min, n_max)]
    for j in range(n_repet_assess_cluster_number):
        for n_clusters in range_n_clusters:
            medoids, clusterer = kmedoids.kMedoids(distance_matrix, n_clusters)
            cluster_labels = [0 for i in range(len(distance_matrix))]
            for label in clusterer:
                for point_idx in clusterer[label]:
                    cluster_labels[point_idx] = label
            silhouette_avg = silhouette_score(distance_matrix, cluster_labels,
                                              metric="precomputed")
            print("For n_clusters =", n_clusters,
                  "the average silhouette_score is:", silhouette_avg)
            silhouette_avg_scores[range_n_clusters.index(n_clusters)] += silhouette_avg
    silhouette_avg_scores = (np.array(silhouette_avg_scores) /
                             n_repet_assess_cluster_number).tolist()
    cluster_number = range_n_clusters[silhouette_avg_scores.index(
        max(silhouette_avg_scores))]

    # final clustering with the selected number of clusters
    medoids, clusters = kmedoids.kMedoids(distance_matrix, cluster_number)
    labels = [0 for i in range(len(distance_matrix))]
    for label in clusters:
        for point_idx in clusters[label]:
            labels[point_idx] = label
    features['labels'] = labels
    data_nodes = find_clusters_names(labels, features)
    label_to_process = 0
    for x in data_nodes:
        if "devis" in x:
            label_to_process = data_nodes.index(x)
            break

    ## Compute the clusterized Sankey diagram
    # Initial computation with every node and every flow
    colors = []
    sources = []
    targets = []
    values = []
    links = []
    for sublist in List_actions:
        for i in range(len(sublist) - 1):
            src_webpage = sublist[i]
            trg_webpage = sublist[i + 1]
            if (src_webpage in flattened_webpages) and (trg_webpage in flattened_webpages):
                src_label = features.loc[src_webpage, 'labels']
                trg_label = features.loc[trg_webpage, 'labels']
                if (src_label, trg_label) not in links:
                    links.append((src_label, trg_label))
                    values.append(1)
                    sources.append(src_label)
                    targets.append(trg_label)
                else:
                    values[links.index((src_label, trg_label))] += 1

    # Clean up the Sankey a bit: remove bidirectional edges between
    # immediately close nodes
    cleaned_values = []
    cleaned_sources = []
    cleaned_targets = []
    sum_in_links = [0 for i in range(max(labels) + 1)]
    sum_out_links = [0 for i in range(max(labels) + 1)]
    for (src_label, trg_label) in links:
        if values[links.index((src_label, trg_label))] > 100:
            if (trg_label, src_label) in links:
                if values[links.index((src_label, trg_label))] >= values[links.index((trg_label, src_label))]:
                    cleaned_values.append(values[links.index((src_label, trg_label))])
                    cleaned_sources.append(src_label)
                    cleaned_targets.append(trg_label)
                    sum_in_links[trg_label] += values[links.index((src_label, trg_label))]
                    sum_out_links[src_label] += values[links.index((src_label, trg_label))]
            else:
                cleaned_values.append(values[links.index((src_label, trg_label))])
                cleaned_sources.append(src_label)
                cleaned_targets.append(trg_label)
                sum_in_links[trg_label] += values[links.index((src_label, trg_label))]
                sum_out_links[src_label] += values[links.index((src_label, trg_label))]

    # keep only links carrying at least `rate` of the traffic at both ends
    cleaned_val_V2 = []
    cleaned_src_V2 = []
    cleaned_trg_V2 = []
    rate = 0.10
    for i in range(len(cleaned_values)):
        if (cleaned_values[i] > rate * sum_in_links[cleaned_targets[i]]
                and cleaned_values[i] > rate * sum_out_links[cleaned_sources[i]]
                and (cleaned_sources[i] != label_to_process
                     or cleaned_targets[i] == label_to_process)):
            cleaned_val_V2.append(cleaned_values[i])
            cleaned_src_V2.append(cleaned_sources[i])
            cleaned_trg_V2.append(cleaned_targets[i])

    # plot the final Sankey
    for i in range(len(labels)):
        color_array = list(np.random.choice(range(256), size=3))
        colors.append("rgba(" + str(color_array[0]) + ", " + str(color_array[1])
                      + ", " + str(color_array[2]) + ", 0.8 )")
    data_trace = dict(
        type='sankey',
        orientation="h",
        valueformat=".0f",
        valuesuffix=" logs",
        textfont=dict(size=12),
        node=dict(
            pad=22,
            thickness=15,
            line=dict(color="black", width=0.5),
            label=data_nodes),
        link=dict(
            source=cleaned_src_V2,
            target=cleaned_trg_V2,
            value=cleaned_val_V2,
            label=["" for x in cleaned_val_V2]))
    layouts = dict(
        title="Relevant traffic dynamics on the credit-agricole.fr site on "
              + str(day_selected) + " - functional clustering",
        font=dict(size=10),
        width=1750,
        height=800)
    res = dcc.Tab(id='Graph_function', children=[
        dcc.Graph(
            id='Sankey_function',
            figure={
                'data': [data_trace],
                'layout': layouts
            })
    ])
    return res
import numpy as np
from kmedoids import kMedoids
from sklearn.metrics.pairwise import pairwise_distances

a = np.load(open('feats_saved_10k.bn', 'rb'))
already_sel = np.load(open('selected10000.bn', 'rb'))
remaining = np.setdiff1d(np.array(range(50000)), already_sel)
D = pairwise_distances(a[remaining, :], metric='euclidean')
M, C = kMedoids(D, 5000)
# map medoid positions (indices into `remaining`) back to indices in the
# original feature array before appending them to the selection
nd = np.array(list(already_sel) + list(remaining[M]))
np.save(open('selected15000.bn', 'wb'), nd)
def _build_clusters(self, clust_num, method):
    timeseries = []
    for k in range(len(self.data.windfarms)):
        timeseries.append([])
        for i in range(1, NScen + 1):
            # print(k, i)
            # print(self.data.windscen[k+1]['{0}'.format(i)].values)
            timeseries[k].append(self.data.windscen[k + 1]['{0}'.format(i)].values)
    time_series = []
    for i in range(1, NScen + 1):
        l = list()
        for k in range(len(self.data.windfarms)):
            l += timeseries[k][i - 1].tolist()
        time_series.append(l)

    if method == 'k_shape':
        # k-shape from kshape.core
        # selection of the number of clusters that should be used
        cluster_num = clust_num
        # apply the clustering method
        cluster = kshape(zscore(time_series, axis=1), cluster_num)
        self.clusters = []
        for k in range(len(cluster)):
            self.clusters.append(cluster[k][1])
        # k-shape from tslearn (recommended by Paparrizos):
        # from tslearn.clustering import KShape
        # from tslearn.utils import to_time_series_dataset
        # formatted_dataset = to_time_series_dataset(time_series)
        # ks = KShape(n_clusters=cluster_num, verbose=False)
        # y_pred = ks.fit_predict(formatted_dataset)
        # self.clusters = []
        # for n in range(cluster_num):
        #     self.clusters.append([])
        # for k in range(NScen):
        #     self.clusters[y_pred[k]].append(k)

    if method == 'k_means':
        # k-means clustering
        n_clusters = clust_num
        kmeans = KMeans(n_clusters, random_state=0).fit(time_series)
        self.clusters = []
        for n in range(n_clusters):
            self.clusters.append([])
        for i in range(NScen):
            self.clusters[kmeans.labels_[i]].append(i)

    if method == 'hierar':
        # hierarchical clustering
        n_clusters = clust_num
        cluster = AgglomerativeClustering(n_clusters, affinity='euclidean',
                                          linkage='ward').fit_predict(time_series)
        self.clusters = []
        for n in range(n_clusters):
            self.clusters.append([])
        for i in range(NScen):
            self.clusters[cluster[i]].append(i)

    if method == 'k_medoids':
        # k-medoids clustering; M is a list of medoids and C the clusters
        # (repartition of the scenarios into clusters, holding only scenario
        # IDs, not data)
        n_clusters = clust_num
        D = pairwise_distances(np.array(time_series), metric='euclidean')
        M, C = kmedoids.kMedoids(D, n_clusters)
        self.clusters = []
        self.medoids = list(M)
        for i in range(n_clusters):
            self.clusters.append(list(C[i]))
# (tail of an if/elif chain selecting the scaler from sys.argv[4])
    scaler = RobustScaler()
elif (sys.argv[4] == '5'):
    scaler = Normalizer()
data = scaler.fit_transform(data)
print(data[0:5])

from sklearn.metrics.pairwise import pairwise_distances
D = pairwise_distances(data, metric='euclidean', n_jobs=1)
print("Pairwise shape : ")
print(D.shape)
# np.save('all_data.npy', D)
print("Done creating distance matrix, start the algorithm")
# split into k clusters (this fork of kMedoids also takes the raw data)
M, C = kmedoids.kMedoids(data, D, int(sys.argv[2]))
st = ''
print('medoids:')
for point_idx in M:
    print(data[point_idx])
    st = st + str(point_idx) + '\n'
f = open(sys.argv[3], 'w')
f.write(st)
st = ''
print('')
print('clustering result:')
for label in C:
    for point_idx in C[label]:
        print('label {0}: {1}'.format(label, point_idx))
from sklearn.metrics.pairwise import pairwise_distances
import numpy as np
import kmedoids

# 3 points in dataset
data = np.array([[1, 1], [2, 2], [10, 10]])
# distance matrix
D = pairwise_distances(data, metric='euclidean')
# split into 2 clusters
M, C = kmedoids.kMedoids(D, 2)

print('medoids:')
for point_idx in M:
    print(data[point_idx])
print('')
print('clustering result:')
for label in C:
    for point_idx in C[label]:
        print(label, data[point_idx])
k_min = 165   # minimum number of clusters
k_max = 198   # maximum number of clusters
radius = 500
print("Radius ", radius, "k_min ", k_min, "k_max ", k_max)
max_gen = 150

# Initialization: the initial population is set; each individual in the
# population is determined randomly
D = pairwise_distances(data, metric='euclidean')
# create an initial population consisting of a certain number of individuals
population = []
for i in range(population_size):
    no_of_cluster = np.random.randint(k_min, k_max + 1)
    M, C = kmedoids.kMedoids(D, no_of_cluster)
    medoid = []
    for item in M:
        medoid.append(data[item])
    if medoid not in population:
        population.append(medoid)

gen_no = 0
# the while loop runs until the maximum generation
while gen_no < max_gen:
    coverage = [
        calculate_coverage(population[i]) for i in range(population_size)
    ]
    tl = [tour_length(population[i]) for i in range(population_size)]
    non_dominated_sorted_population = fast_non_dominated_sort(
def k_medoids(sample, num_clusters):
    D = scipy.spatial.distance_matrix(sample, sample)
    M, C = kMedoids(D, num_clusters)
    return M, C
plt.title('MDS with the distance matrix between PIs')
plt.show()
print("time spent computing the distance matrix between diagrams: ",
      timeSpent)

nbclusters = 6
# k-medoids classification with persistence diagrams;
# launch k-medoids nInitialisation times and keep the best run
nInitialisation = 1000
errorFinal = 10 ** 25
for i in range(nInitialisation):
    errorTot = 0
    results = kMedoids(dist_mat, nbclusters)
    clusters = results[1]
    for c in range(nbclusters):
        cluster = pi.getIndivInCluster(clusters, c, label_color)
        error = pi.errorInCluster(cluster, nbclusters)
        errorTot = errorTot + error
        # print("error in cluster c", error)
        # print("individuals in cluster: ", c, cluster)
    if errorFinal > errorTot:
        errorFinal = errorTot
        clusterFinal = clusters
print("error rate: ", errorFinal / (nbclusters * nbIndivByClass) * 100)

homologyDegree = 1
sigma2 = 0.0001
b = 0.02
kmeans = KMeans(init='k-means++', n_clusters=args.clusters,
                n_init=args.clusters, max_iter=100)
labels = kmeans.fit_predict(sbg)
silscore = silhouette_score(sbg, labels)
cname = 'kmeans_' + str(i)
kmeansObject = utils.cmethod(cname, labels, silscore, 0.0, args.maxfract)
methods.append(kmeansObject)

print(passedTime(start, time.time()), "KMEDOIDS (probabilistic, YMMV)",
      file=sys.stderr)
# Same issue as Kmeans. Same approach.
for i in range(1, 6):
    medoids, clusterinfo, labels = kmedoids.kMedoids(s_distance, args.clusters)
    silscore = silhouette_score(s_distance, labels)
    cname = 'kmedoids_' + str(i)
    kmedObject = utils.cmethod(cname, labels, silscore, 0.0, args.maxfract)
    methods.append(kmedObject)

##### Create consistent sample groups ######################################
# This outputs a list of samples that always occur together in a cluster, no
# matter which method is used. It also adds a 'shared' column to the clusters
# output file. An attempt is made to give similar clusters similar labels, so
# that cluster B1 largely contains the same samples in each cluster method.
if args.print_groups:
    print(passedTime(start, time.time()),
          "Finding consistent groups in all methods used", file=sys.stderr)
    setlist = [i.dups for i in methods if i.ok]
    grouplist1, tally = utils.persistent_groups(copy.copy(setlist),
# coding: utf-8
from sklearn.metrics.pairwise import pairwise_distances
import numpy as np
import kmedoids

# 3 points in dataset
data = np.array([[1, 1], [2, 2], [10, 10]])
# distance matrix
D = pairwise_distances(data, metric='euclidean')
# split into 2 clusters
M, C = kmedoids.kMedoids(D, 2)

print('medoids:')
for point_idx in M:
    print(data[point_idx])
print('')
print('clustering result:')
for label in C:
    for point_idx in C[label]:
        print('label {0}: {1}'.format(label, data[point_idx]))
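# Every snippet in this collection assumes a module `kmedoids` exposing a
# kMedoids function that takes a precomputed distance matrix and a cluster
# count and returns (M, C): an array of medoid indices and a dict mapping
# each cluster label to the indices of its members. A minimal sketch of such
# a function, modeled on the widely circulated Voronoi-iteration
# (Bauckhage-style) implementation, follows; the forks used by individual
# snippets may differ in details, e.g. some take an iteration count or
# extra arguments.
import numpy as np

def kMedoids(D, k, tmax=100):
    m, n = D.shape
    if k > n:
        raise Exception('too many medoids')
    # randomly initialize k distinct medoid indices
    M = np.sort(np.random.choice(n, k, replace=False))
    Mnew = np.copy(M)
    C = {}
    for t in range(tmax):
        # assign every point to its nearest medoid
        J = np.argmin(D[:, M], axis=1)
        for kappa in range(k):
            C[kappa] = np.where(J == kappa)[0]
        # move each medoid to the in-cluster point with the smallest
        # average distance to the rest of its cluster
        for kappa in range(k):
            J = np.mean(D[np.ix_(C[kappa], C[kappa])], axis=1)
            Mnew[kappa] = C[kappa][np.argmin(J)]
        Mnew.sort()
        # stop when the medoids no longer change
        if np.array_equal(M, Mnew):
            break
        M = np.copy(Mnew)
    else:
        # tmax reached: recompute memberships for the final medoids
        J = np.argmin(D[:, M], axis=1)
        for kappa in range(k):
            C[kappa] = np.where(J == kappa)[0]
    return M, C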
# print(predict)
print("purity:", purity(predict['predict'], target))
# concatenate labels to df as a new column
r = pd.concat([data, predict], axis=1)
# plot the cluster assignments
plt.scatter(r['Life expectancy at birth, total (years)'],
            r['GNI (constant 2010 US$)'],
            c=r['predict'], cmap="plasma")
plt.show()
print()

# k-medoids model
distances = pairwise_distances(data, metric='euclidean')
M, C = kmedoids.kMedoids(distances, 4)
predict = np.zeros(len(data))
for label in C:
    for point_idx in C[label]:
        predict[point_idx] = label
predict = pd.DataFrame(predict)
predict.columns = ['predict']
print("purity:", purity(predict['predict'], target))
plt.scatter(data['Life expectancy at birth, total (years)'],
            data['GNI (constant 2010 US$)'],
            c=predict['predict'], cmap="plasma")
plt.show()
import numpy as np
from sklearn.metrics.pairwise import pairwise_distances
import kmedoids

W0 = np.load("W0_10d.npy")
# distance matrix
D = pairwise_distances(W0, metric='euclidean')
# split into 60 clusters
M, C = kmedoids.kMedoids(D, 60)
# 35390 = 17695 * 2 (number of genes from both networks)
C_label = np.zeros(35390)
for label in C:
    for point_idx in C[label]:
        C_label[point_idx] = label
np.save("kmedoids.npy", C_label.astype(int))
# tempar = np.array([inputar])
D = np.vstack([D, inputar])
# D = D.reshape(int(length/), int(length/2))
'''
if M is not None:
    M, C = kmedoids.kMedoids(D, numberofclusters)
if M is not None:
    M, C = kmedoids.kMedoids(D, numberofclusters)
if M is not None:
    raise Exception('too many medoids (after removing duplicate points)')
'''
# file = open("output.txt", "w")
M, C = kmedoids.kMedoids(D, numberofclusters)
# print(C)
print('medoids:')
for point_idx in M:
    outputar = np.concatenate((outputar, data[point_idx]), axis=0)
np.savetxt('medoid.txt', M, fmt="%s")
np.savetxt('output.txt', outputar, fmt="%s", delimiter=',')
print('')
print('clustering result:', M)
# np.savetxt('clusteringoutput.txt', C)
cluster = [None] * int(length / 2)
for label in C:
    for point_idx in C[label]:
        cluster[point_idx] = M[label]
print(cluster)
np.savetxt('clusteringoutput.txt', cluster, fmt="%s")
def pairwiseClustering(df1, df2):
    clean_lyrics = getCleanLyrics(df1, df2)
    vec = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=0,
                          stop_words='english', max_features=5000)
    tfidf_matrix = vec.fit_transform(clean_lyrics)
    feature_names = vec.get_feature_names()
    n1 = len(df1['lyrics'])
    tfidf_vectors = tfidf_matrix.toarray()
    n = len(clean_lyrics)

    # pairwise distance matrix between tf-idf vectors, also written to disk
    distances = [[0 for x in range(n)] for y in range(n)]
    d_file = open('distances_bigram.txt', 'a+')
    for i in range(n):
        for j in range(n):
            distances[i][j] = 10 * round(
                np.linalg.norm(tfidf_vectors[i] - tfidf_vectors[j]), 5)
            d_file.write(str(distances[i][j]))
            if j != n - 1:
                d_file.write(',')
            else:
                d_file.write('\n')
    d_file.close()

    maxx = 0
    minx = 10000
    count = 0
    sum = 0
    for i in range(n):
        for j in range(n):
            if distances[i][j] != 0:
                sum += distances[i][j]
                count += 1
            if distances[i][j] > maxx:
                maxx = distances[i][j]
            if distances[i][j] < minx:
                minx = distances[i][j]

    import kmedoids
    A = np.matrix(distances)
    n = len(A)

    def cost(d_mat, M, C):
        # total distance from every point to the medoid of its cluster
        k = len(M)
        costs = []
        for i in range(k):
            costs.append(0)
        for c_i in range(k):
            for i in C[c_i]:
                costs[c_i] += d_mat[M[c_i], i]
        return np.sum(costs)

    # restart the randomized solver and keep the lowest-cost clustering
    M, C = kmedoids.kMedoids(A, n, 2)
    for i in range(100):
        t_M, t_C = kmedoids.kMedoids(A, n, 2)
        if cost(A, t_M, t_C) < cost(A, M, C):
            M = t_M
            C = t_C

    print("Pair || " + str(df1['year'].iloc[0]) + ": " + str(len(df1['year']))
          + ", " + str(df2['year'].iloc[0]) + ": " + str(len(df2['year'])))
    print("====================================================")

    count1 = 0
    count2 = 0
    print("Cluster 1 : " + str(len(C[0])))
    for point in C[0]:
        if point < n1:
            count1 += 1
        else:
            count2 += 1
    # majority count for cluster 1
    c_1 = max(count1, count2)
    print(str(df1['year'].iloc[0]) + ": " + str(count1) + ", "
          + str(df2['year'].iloc[0]) + ": " + str(count2))

    count1 = 0
    count2 = 0
    print("Cluster 2 : " + str(len(C[1])))
    for point in C[1]:
        if point < n1:
            count1 += 1
        else:
            count2 += 1
    # majority count for cluster 2
    c_2 = max(count1, count2)
    print(str(df1['year'].iloc[0]) + ": " + str(count1) + ", "
          + str(df2['year'].iloc[0]) + ": " + str(count2))

    accuracy = (c_1 + c_2) * 1.0 / n
    print("\nAccuracy: " + str(accuracy)
          + "\n\n====================================================")
    print("\n")
    return accuracy
# split into 2 clusters
def cost(d_mat, M, C):
    # total distance from every point to the medoid of its cluster
    k = len(M)
    costs = []
    for i in range(k):
        costs.append(0)
    for c_i in range(k):
        for i in C[c_i]:
            costs[c_i] += d_mat[M[c_i], i]
    return np.sum(costs)


# restart the randomized solver and keep the lowest-cost clustering
M, C = kmedoids.kMedoids(D, n, 2)
for i in range(1000):
    t_M, t_C = kmedoids.kMedoids(D, n, 2)
    if cost(D, t_M, t_C) < cost(D, M, C):
        M = t_M
        C = t_C

print('medoids:')
for point_idx in M:
    # print(data[point_idx])
    print(point_idx)
# print('')
print('clustering result:')
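# The two snippets above restart the randomized solver many times and keep
# the lowest-cost run. A compact, hedged sketch of that pattern as a reusable
# helper: it assumes the two-argument kMedoids(D, k) signature used by most
# snippets in this collection (not the three-argument fork called above),
# and kmedoids_best_of is a hypothetical name introduced here.
import kmedoids

def kmedoids_best_of(D, k, n_restarts=100):
    """Run kMedoids n_restarts times and return the lowest-cost clustering."""
    def cost(M, C):
        # total distance from every point to the medoid of its cluster
        return sum(D[M[c], i] for c in C for i in C[c])
    best_M, best_C = kmedoids.kMedoids(D, k)
    best_cost = cost(best_M, best_C)
    for _ in range(n_restarts - 1):
        M, C = kmedoids.kMedoids(D, k)
        run_cost = cost(M, C)
        if run_cost < best_cost:
            best_M, best_C, best_cost = M, C, run_cost
    return best_M, best_C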