def kMedoids(args):
    """
    Do k-medoids clustering on a distance matrix.

    @param args: A tuple of the form (dist_matrix, k, n_passes)
    @return: The result tuple returned by Pycluster.kmedoids
    """
    dist_matrix, k, n_passes = args
    return kmedoids(dist_matrix, k, n_passes)
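Because kMedoids takes its inputs as a single tuple, it drops straight into multiprocessing.Pool.map. Below is a minimal usage sketch, not part of the original source: the toy symmetric matrices, the cluster count 5, and the 20 passes are all arbitrary placeholders.

# Hypothetical usage sketch for the kMedoids wrapper above.
# The toy symmetric matrices stand in for real precomputed distance matrices.
import numpy as np
from multiprocessing import Pool
from Pycluster import kmedoids  # also needed by the kMedoids wrapper itself

if __name__ == '__main__':
    rng = np.random.RandomState(0)
    dist_matrices = []
    for _ in range(3):
        m = rng.rand(30, 30)
        m = (m + m.T) / 2.0        # symmetrize
        np.fill_diagonal(m, 0.0)   # zero self-distances
        dist_matrices.append(m)

    jobs = [(d, 5, 20) for d in dist_matrices]  # (dist_matrix, k, n_passes)
    pool = Pool()
    results = pool.map(kMedoids, jobs)
    pool.close()
    pool.join()
    for clusterid, error, nfound in results:
        print(len(np.unique(clusterid)), error, nfound)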
# imports needed by this excerpt (kmedoids comes from Pycluster; getInit is defined elsewhere in the module)
import argparse
import pickle

from numpy import loadtxt, set_printoptions
from Pycluster import kmedoids


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('in_dissim_matrix', default='../dissim_matrix_hybrid.pickle',
                        help='the dissimilarity matrix to be used, as a pickled NumPy array')
    parser.add_argument('in_cores', default='../h_cores_as_indexes.pickle',
                        help='file path of cores file: "../h_cores_as_indexes.pickle"')
    parser.add_argument('in_values')
    parser.add_argument('random_init')
    args = parser.parse_args()

    # load the pickled dissimilarity matrix and the core indices
    f = open(args.in_dissim_matrix, 'rb')
    distances = pickle.load(f)
    f.close()
    cores = loadtxt(args.in_cores)

    if args.random_init != "random":
        init = getInit(distances, cores)  # getInit is defined elsewhere in this module

    set_printoptions(threshold='nan')
    set_printoptions(linewidth=900000000)

    result = kmedoids(distance=distances, npass=20, nclusters=len(cores))
    #result = kmedoids(distance=distances, initialid=init)
    print args.in_values + " " + str(len(cores)) + " " + str(result[1]) + " " + str(result[0])
# Excerpt: the slicing and appends below run once per pair of inputs while the
# RMSD matrix is being built; the clustering step follows.
inorm1 = inorm[DLi:DHi + 1]
inorm1 = np.swapaxes(inorm1, 0, 1)
inorm1 = inorm1[ELi:EHi + 1]
inorm1 = np.swapaxes(inorm1, 0, 1)

# percent RMSD between the two sliced, normalized data sets
difmat = inorm1[1] - i1norm1[1]
difmat2 = difmat ** 2
RMSD = (np.average(difmat2)) ** 0.5
pRMSD = RMSD * 100
rmsdmat1.append(pRMSD)
rmsdmat.append(rmsdmat1)

# assemble the matrix, invert it (1 - RMSD) and cluster it with k-medoids
rmsdmat = np.array(rmsdmat)
rmsdmat = 1 - rmsdmat
np.savetxt("DifferenceMatrix.csv", rmsdmat, delimiter=",")

clusterid, error, nfound = kmedoids(rmsdmat, nclusters=c, npass=10)
print "Error = ", error
print "Found this configuration ", nfound, " out of 10 times"

with open("Clusters.csv", "wb") as csvfile:
    writer = csv.writer(csvfile, delimiter=",")
    writer.writerow(["item #", "File Name", "Cluster #"])
    writer.writerow([""])
    for i in range(0, len(clusterid)):
        print i, files[i], " cluster = ", clusterid[i]
        writer.writerow([i, files[i], clusterid[i]])
    writer.writerow([""])
    writer.writerow(["Note : The cluster number is defined as the item number of the centroid of the cluster."])

print "\n" + "The cluster number is defined as the item number of the centroid of the cluster."
def get_surface_sources(surface, space=5, distance='euclidean', remains=None):
    """Get sources on a surface.

    Parameters
    ----------
    surface : Surface object
    space : float
        The distance between sources.
    distance : 'euclidean' | 'dijkstra' | 'continuous'
        The distance used to compute distances on the surface.
    remains : None | int
        The number of sources that we want to keep.

    Returns
    -------
    src : SourceSpaces object

    Author : Alexandre Fabre
    """
    if remains is None:
        remains, removes = get_number_sources(surface, space=space, surface=True)
    else:
        # avoid an incorrect number of sources
        remains = max(0, min(surface.pos_length, remains))
        removes = surface.pos_length - remains

    if remains == 0:
        raise ValueError('Error, 0 source created')

    if removes == 0:
        # all points are sources
        # logger.info('all points are remained')
        centroids_id = np.arange(remains)
        inuse = np.ones(surface.pos_length, dtype=int)
    else:
        # connectivity of neighboring points
        n_neighbors = min(50, surface.pos_length)

        # get the matrix that identifies neighboring points
        knn_graph = kneighbors_graph(surface.pos, n_neighbors, include_self=False)

        # the ward criterion is well suited to surface clustering
        model = AgglomerativeClustering(linkage='ward', connectivity=knn_graph,
                                        n_clusters=remains)

        # compute clusters
        model.fit(surface.pos)

        # get cluster labels
        cluster_id = model.labels_

        # get the distance between points on the surface with Dijkstra or continuous;
        # if distance is 'euclidean', it just computes euclidean distances between points
        distance = surf_m.get_surf_distance(surface.pos, surface.triangles,
                                            distance=distance)

        # the clusters given by AgglomerativeClustering are the initial clusters for k-medoids;
        # for k-medoids the centroid is a point of the cluster, and the method returns
        # clusters identified by the index of their centroid point
        cluster_id, _, _ = kmedoids(distance, nclusters=remains, npass=1,
                                    initialid=cluster_id)

        # get the indices of the centroids
        centroids_id = np.unique(cluster_id)

        inuse = np.zeros(surface.pos_length)
        inuse[centroids_id] = 1
        inuse = inuse.astype(int)  # needs to be int

    # must be converted to meters and transformed to a numpy array
    rr = surface.pos * 1e-3

    # change index for hemisphere
    if surface.hemi == 'lh':
        Id = 101
    elif surface.hemi == 'rh':
        Id = 102

    src = [{'rr': rr,
            'coord_frame': np.array((FIFF.FIFFV_COORD_MRI,), np.int32),
            'type': 'surf', 'id': Id, 'np': surface.pos_length,
            'nn': surface.normals, 'inuse': inuse, 'nuse': remains,
            'dist': None, 'ntri': surface.triangles_length, 'nearest': None,
            'use_tris': None, 'nuse_tris': 0, 'vertno': centroids_id,
            'patch_inds': None, 'tris': surface.triangles,
            'dist_limit': None, 'pinfo': None, 'nearest_dist': None,
            'removes': removes}]

    src = SourceSpaces(src)

    return src
def main():
    p = opt.ArgumentParser(description="""
        Constructs a dictionary for image representation based on histograms of codeblocks
        (Gabor wavelet local descriptors) over larger neighborhoods. The dictionary is built
        from a set of images given as a list in an input file.
        """)
    p.add_argument('img_path', action='store',
                   help='path to image files - all images in the folder will be used')
    p.add_argument('img_ext', action='store',
                   help='extension of the image files (e.g. "jpg" or "png") - NO DOT!')
    p.add_argument('l0_model', action='store', help='level-0 codebook model file')
    p.add_argument('out_file', action='store', help='resulting model file name')
    p.add_argument('codebook_size', action='store', help='codebook size', type=int)
    p.add_argument('-w', '--window', action='store', help='local window size (default: 512)',
                   type=int, default=512)
    args = p.parse_args()

    #---------
    # data
    data_path = args.img_path
    img_ext = args.img_ext
    wnd_size = args.window

    with ModelPersistence(args.l0_model, 'r', format='pickle') as mp:
        l0_model = mp

    img_files = glob.glob(data_path + '/*.' + img_ext)
    if len(img_files) == 0:
        return

    #---------
    # Gabor
    tmp = np.array([0.0, np.pi / 4.0, np.pi / 2.0, 3.0 * np.pi / 4.0], dtype=np.double)
    tmp2 = np.array([3.0 / 4.0, 3.0 / 8.0, 3.0 / 16.0], dtype=np.double)
    tmp3 = np.array([1.0, 2 * np.sqrt(2.0)], dtype=np.double)
    local_descriptor = GaborDescriptor(theta=tmp, freq=tmp2, sigma=tmp3)

    ## Process:
    sys.stdout = os.fdopen(sys.stdout.fileno(), 'w', 0)  # unbuffered output

    desc_vectors = []  # a list of local descriptor vectors

    print('Computing level 0 coding...')
    desc_vectors = Parallel(n_jobs=cpu_count())(
        delayed(worker)(img_name, local_descriptor, wnd_size, l0_model)
        for img_name in img_files)
    print('OK')

    print('Vector quantization:')
    print('-prepare...')
    X = np.vstack(desc_vectors)  # each row is a histogram
    np.save('X_bag_level1.dat', X)

    print('-compute pairwise distances...')
    n = X.shape[0]
    pdist = Parallel(n_jobs=cpu_count())(
        delayed(worker_chisq_M)(X[i, :], X[i + 1:n, :]) for i in np.arange(0, n - 1))
    # make the list flat:
    pdist = np.array(list(itertools.chain.from_iterable(pdist)))
    #for i in np.arange(0, X.shape[0]-1):
    #    for j in np.arange(i+1, X.shape[0]):
    #        pdist.append(dist.chisq(X[i,:], X[j,:]))
    np.save('X_pdist_level1.data.npy', pdist)

    print('-cluster (k-medoids)...')
    meds = kmedoids(pdist, nclusters=args.codebook_size, npass=20)
    labels = np.unique(meds[0])  # also the indexes of vectors from X that became cluster centers (medoids)
    vq = {}
    vq['cluster_centers_'] = X[labels, :]
    vq['labels_'] = labels
    vq['distance'] = 'chisq'
    print('OK')

    print('Saving model...', end='')
    # compute the average distance and std. dev. of the points in each cluster:
    avg_dist = np.zeros(args.codebook_size)
    sd_dist = np.zeros(args.codebook_size)
    for k in range(0, args.codebook_size):
        idx = np.where(meds[0] == labels[k])[0]
        d = []
        for i in idx:
            d.append(dist.chisq(X[i, :], vq['cluster_centers_'][k, :]))
        avg_dist[k] = np.array(d).mean()
        sd_dist[k] = np.array(d).std()

    print('K-medoids summary:')
    print('-avg. dist: ', avg_dist)
    print('-std. dev. dist: ', sd_dist)

    with ModelPersistence(args.out_file, 'c', format='pickle') as d:
        d['codebook'] = vq
        d['avg_dist_to_centroid'] = avg_dist
        d['stddev_dist_to_centroid'] = sd_dist
    print('OK')

    return
    data = squareform(data)
else:
    print "\n>>> Loading provided distance matrix..."
    data = np.loadtxt('%s' % input_dist)

if write_dist:
    print "\n>>> Writing distance_matrix.txt..."
    np.savetxt('distance_matrix.txt', data, fmt='%10.3f')

if clus_min != clus_max:
    # Compute silhouette score for k=kmin -> k=kmax:
    print "\n>>> Determining optimal number of clusters using silhouette score..."
    sil = numpy.zeros(clus_max - clus_min + 1)
    for i in range(clus_min, clus_max + 1):
        idx, error, nfound = kmedoids(data, nclusters=i, npass=passes)
        sil[i - clus_min] = silhouette_score(data, idx, metric='precomputed',
                                             sample_size=None, random_state=None)
    clus_num = numpy.argmax(sil) + clus_min
    print "\tOptimal number of clusters: %s" % clus_num
else:
    sil = numpy.zeros(clus_max - clus_min + 1)
    clus_num = clus_min
    print "\n>>> Requested %s clusters" % clus_num

# K-medoids clustering
idx, error, nfound = kmedoids(data, nclusters=clus_num, npass=passes)
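Pycluster's kmedoids identifies each cluster by the item number of its medoid rather than by 0..k-1 (as also noted in the Clusters.csv output earlier in this section). A small sketch, assuming idx is the clusterid array returned just above, of remapping those labels to consecutive cluster numbers:

# Sketch (assumes `idx` is the clusterid array returned by kmedoids above).
# np.unique with return_inverse maps the medoid-index labels to 0..k-1.
medoid_items, compact_labels = np.unique(idx, return_inverse=True)
# medoid_items[j]   -> item number of cluster j's medoid
# compact_labels[i] -> cluster number (0..k-1) assigned to item i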