def kMedoids(args):
    """
    Do k-medoids clustering on a distance matrix.

    @param args: A tuple of the form (dist_matrix, k, n_passes)
    @return: The result tuple returned by Pycluster.kmedoids
    """
    dist_matrix, k, n_passes = args
    return kmedoids(dist_matrix, k, n_passes)
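Because kMedoids takes its inputs as a single tuple, it drops straight into multiprocessing.Pool.map. Below is a minimal usage sketch, not part of the original source: the toy symmetric matrices, the cluster count 5, and the 20 passes are all arbitrary placeholders.

# Hypothetical usage sketch for the kMedoids wrapper above.
# The toy symmetric matrices stand in for real precomputed distance matrices.
import numpy as np
from multiprocessing import Pool
from Pycluster import kmedoids  # also needed by the kMedoids wrapper itself

if __name__ == '__main__':
    rng = np.random.RandomState(0)
    dist_matrices = []
    for _ in range(3):
        m = rng.rand(30, 30)
        m = (m + m.T) / 2.0        # symmetrize
        np.fill_diagonal(m, 0.0)   # zero self-distances
        dist_matrices.append(m)

    jobs = [(d, 5, 20) for d in dist_matrices]  # (dist_matrix, k, n_passes)
    pool = Pool()
    results = pool.map(kMedoids, jobs)
    pool.close()
    pool.join()
    for clusterid, error, nfound in results:
        print(len(np.unique(clusterid)), error, nfound)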
# imports needed by this excerpt (kmedoids comes from Pycluster; getInit is defined elsewhere in the module)
import argparse
import pickle

from numpy import loadtxt, set_printoptions
from Pycluster import kmedoids


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('in_dissim_matrix', default='../dissim_matrix_hybrid.pickle',
                        help='the dissimilarity matrix to be used, as a pickled NumPy array')
    parser.add_argument('in_cores', default='../h_cores_as_indexes.pickle',
                        help='file path of cores file: "../h_cores_as_indexes.pickle"')
    parser.add_argument('in_values')
    parser.add_argument('random_init')
    args = parser.parse_args()

    # load the pickled dissimilarity matrix and the core indices
    f = open(args.in_dissim_matrix, 'rb')
    distances = pickle.load(f)
    f.close()
    cores = loadtxt(args.in_cores)

    if args.random_init != "random":
        init = getInit(distances, cores)  # getInit is defined elsewhere in this module

    set_printoptions(threshold='nan')
    set_printoptions(linewidth=900000000)

    result = kmedoids(distance=distances, npass=20, nclusters=len(cores))
    #result = kmedoids(distance=distances, initialid=init)
    print args.in_values + " " + str(len(cores)) + " " + str(result[1]) + " " + str(result[0])
# Excerpt: the slicing and appends below run once per pair of inputs while the
# RMSD matrix is being built; the clustering step follows.
inorm1 = inorm[DLi:DHi + 1]
inorm1 = np.swapaxes(inorm1, 0, 1)
inorm1 = inorm1[ELi:EHi + 1]
inorm1 = np.swapaxes(inorm1, 0, 1)

# percent RMSD between the two sliced, normalized data sets
difmat = inorm1[1] - i1norm1[1]
difmat2 = difmat ** 2
RMSD = (np.average(difmat2)) ** 0.5
pRMSD = RMSD * 100
rmsdmat1.append(pRMSD)
rmsdmat.append(rmsdmat1)

# assemble the matrix, invert it (1 - RMSD) and cluster it with k-medoids
rmsdmat = np.array(rmsdmat)
rmsdmat = 1 - rmsdmat
np.savetxt("DifferenceMatrix.csv", rmsdmat, delimiter=",")

clusterid, error, nfound = kmedoids(rmsdmat, nclusters=c, npass=10)
print "Error = ", error
print "Found this configuration ", nfound, " out of 10 times"

with open("Clusters.csv", "wb") as csvfile:
    writer = csv.writer(csvfile, delimiter=",")
    writer.writerow(["item #", "File Name", "Cluster #"])
    writer.writerow([""])
    for i in range(0, len(clusterid)):
        print i, files[i], " cluster = ", clusterid[i]
        writer.writerow([i, files[i], clusterid[i]])
    writer.writerow([""])
    writer.writerow(["Note : The cluster number is defined as the item number of the centroid of the cluster."])

print "\n" + "The cluster number is defined as the item number of the centroid of the cluster."
def get_surface_sources(surface, space=5, distance='euclidean', remains=None):
    """Get sources on a surface.

    Parameters
    ----------
    surface : Surface object
    space : float
        The distance between sources.
    distance : 'euclidean' | 'dijkstra' | 'continuous'
        The distance used to compute distances on the surface.
    remains : None | int
        The number of sources that we want to keep.

    Returns
    -------
    src : SourceSpaces object

    Author : Alexandre Fabre
    """
    if remains is None:
        remains, removes = get_number_sources(surface, space=space, surface=True)
    else:
        # avoid an incorrect number of sources
        remains = max(0, min(surface.pos_length, remains))
        removes = surface.pos_length - remains

    if remains == 0:
        raise ValueError('Error, 0 source created')

    if removes == 0:
        # all points are sources
        # logger.info('all points are remained')
        centroids_id = np.arange(remains)
        inuse = np.ones(surface.pos_length, dtype=int)
    else:
        # connectivity of neighboring points
        n_neighbors = min(50, surface.pos_length)

        # get the matrix that identifies neighboring points
        knn_graph = kneighbors_graph(surface.pos, n_neighbors, include_self=False)

        # the ward criterion is well suited to surface clustering
        model = AgglomerativeClustering(linkage='ward', connectivity=knn_graph,
                                        n_clusters=remains)

        # compute clusters
        model.fit(surface.pos)

        # get cluster labels
        cluster_id = model.labels_

        # get the distance between points on the surface with Dijkstra or continuous;
        # if distance is 'euclidean', it just computes euclidean distances between points
        distance = surf_m.get_surf_distance(surface.pos, surface.triangles,
                                            distance=distance)

        # the clusters given by AgglomerativeClustering are the initial clusters for k-medoids;
        # for k-medoids the centroid is a point of the cluster, and the method returns
        # clusters identified by the index of their centroid point
        cluster_id, _, _ = kmedoids(distance, nclusters=remains, npass=1,
                                    initialid=cluster_id)

        # get the indices of the centroids
        centroids_id = np.unique(cluster_id)

        inuse = np.zeros(surface.pos_length)
        inuse[centroids_id] = 1
        inuse = inuse.astype(int)  # needs to be int

    # must be converted to meters and transformed to a numpy array
    rr = surface.pos * 1e-3

    # change index for hemisphere
    if surface.hemi == 'lh':
        Id = 101
    elif surface.hemi == 'rh':
        Id = 102

    src = [{'rr': rr,
            'coord_frame': np.array((FIFF.FIFFV_COORD_MRI,), np.int32),
            'type': 'surf', 'id': Id, 'np': surface.pos_length,
            'nn': surface.normals, 'inuse': inuse, 'nuse': remains,
            'dist': None, 'ntri': surface.triangles_length, 'nearest': None,
            'use_tris': None, 'nuse_tris': 0, 'vertno': centroids_id,
            'patch_inds': None, 'tris': surface.triangles,
            'dist_limit': None, 'pinfo': None, 'nearest_dist': None,
            'removes': removes}]

    src = SourceSpaces(src)

    return src
def main():
    p = opt.ArgumentParser(description="""
        Constructs a dictionary for image representation based on histograms of codeblocks
        (Gabor wavelet local descriptors) over larger neighborhoods. The dictionary is built
        from a set of images given as a list in an input file.
        """)
    p.add_argument('img_path', action='store',
                   help='path to image files - all images in the folder will be used')
    p.add_argument('img_ext', action='store',
                   help='extension of the image files (e.g. "jpg" or "png") - NO DOT!')
    p.add_argument('l0_model', action='store', help='level-0 codebook model file')
    p.add_argument('out_file', action='store', help='resulting model file name')
    p.add_argument('codebook_size', action='store', help='codebook size', type=int)
    p.add_argument('-w', '--window', action='store', help='local window size (default: 512)',
                   type=int, default=512)
    args = p.parse_args()

    #---------
    # data
    data_path = args.img_path
    img_ext = args.img_ext
    wnd_size = args.window

    with ModelPersistence(args.l0_model, 'r', format='pickle') as mp:
        l0_model = mp

    img_files = glob.glob(data_path + '/*.' + img_ext)
    if len(img_files) == 0:
        return

    #---------
    # Gabor
    tmp = np.array([0.0, np.pi / 4.0, np.pi / 2.0, 3.0 * np.pi / 4.0], dtype=np.double)
    tmp2 = np.array([3.0 / 4.0, 3.0 / 8.0, 3.0 / 16.0], dtype=np.double)
    tmp3 = np.array([1.0, 2 * np.sqrt(2.0)], dtype=np.double)
    local_descriptor = GaborDescriptor(theta=tmp, freq=tmp2, sigma=tmp3)

    ## Process:
    sys.stdout = os.fdopen(sys.stdout.fileno(), 'w', 0)  # unbuffered output

    desc_vectors = []  # a list of local descriptor vectors

    print('Computing level 0 coding...')
    desc_vectors = Parallel(n_jobs=cpu_count())(
        delayed(worker)(img_name, local_descriptor, wnd_size, l0_model)
        for img_name in img_files)
    print('OK')

    print('Vector quantization:')
    print('-prepare...')
    X = np.vstack(desc_vectors)  # each row is a histogram
    np.save('X_bag_level1.dat', X)

    print('-compute pairwise distances...')
    n = X.shape[0]
    pdist = Parallel(n_jobs=cpu_count())(
        delayed(worker_chisq_M)(X[i, :], X[i + 1:n, :]) for i in np.arange(0, n - 1))
    # make the list flat:
    pdist = np.array(list(itertools.chain.from_iterable(pdist)))
    #for i in np.arange(0, X.shape[0]-1):
    #    for j in np.arange(i+1, X.shape[0]):
    #        pdist.append(dist.chisq(X[i,:], X[j,:]))
    np.save('X_pdist_level1.data.npy', pdist)

    print('-cluster (k-medoids)...')
    meds = kmedoids(pdist, nclusters=args.codebook_size, npass=20)
    labels = np.unique(meds[0])  # also the indexes of vectors from X that became cluster centers (medoids)
    vq = {}
    vq['cluster_centers_'] = X[labels, :]
    vq['labels_'] = labels
    vq['distance'] = 'chisq'
    print('OK')

    print('Saving model...', end='')
    # compute the average distance and std. dev. of the points in each cluster:
    avg_dist = np.zeros(args.codebook_size)
    sd_dist = np.zeros(args.codebook_size)
    for k in range(0, args.codebook_size):
        idx = np.where(meds[0] == labels[k])[0]
        d = []
        for i in idx:
            d.append(dist.chisq(X[i, :], vq['cluster_centers_'][k, :]))
        avg_dist[k] = np.array(d).mean()
        sd_dist[k] = np.array(d).std()

    print('K-medoids summary:')
    print('-avg. dist: ', avg_dist)
    print('-std. dev. dist: ', sd_dist)

    with ModelPersistence(args.out_file, 'c', format='pickle') as d:
        d['codebook'] = vq
        d['avg_dist_to_centroid'] = avg_dist
        d['stddev_dist_to_centroid'] = sd_dist
    print('OK')

    return
    data = squareform(data)
else:
    print "\n>>> Loading provided distance matrix..."
    data = np.loadtxt('%s' % input_dist)

if write_dist:
    print "\n>>> Writing distance_matrix.txt..."
    np.savetxt('distance_matrix.txt', data, fmt='%10.3f')

if clus_min != clus_max:
    # Compute silhouette score for k=kmin -> k=kmax:
    print "\n>>> Determining optimal number of clusters using silhouette score..."
    sil = numpy.zeros(clus_max - clus_min + 1)
    for i in range(clus_min, clus_max + 1):
        idx, error, nfound = kmedoids(data, nclusters=i, npass=passes)
        sil[i - clus_min] = silhouette_score(data, idx, metric='precomputed',
                                             sample_size=None, random_state=None)
    clus_num = numpy.argmax(sil) + clus_min
    print "\tOptimal number of clusters: %s" % clus_num
else:
    sil = numpy.zeros(clus_max - clus_min + 1)
    clus_num = clus_min
    print "\n>>> Requested %s clusters" % clus_num

# K-medoids clustering
idx, error, nfound = kmedoids(data, nclusters=clus_num, npass=passes)
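Pycluster's kmedoids identifies each cluster by the item number of its medoid rather than by 0..k-1 (as also noted in the Clusters.csv output earlier in this section). A small sketch, assuming idx is the clusterid array returned just above, of remapping those labels to consecutive cluster numbers:

# Sketch (assumes `idx` is the clusterid array returned by kmedoids above).
# np.unique with return_inverse maps the medoid-index labels to 0..k-1.
medoid_items, compact_labels = np.unique(idx, return_inverse=True)
# medoid_items[j]   -> item number of cluster j's medoid
# compact_labels[i] -> cluster number (0..k-1) assigned to item i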