def do_compute(reference_txt, pre_clustering_txt, groundtruth_npy):
    """Build a pairwise "same reference cluster" matrix for hypothesis clusters.

    Loads a reference clustering and a hypothesis (pre-)clustering, decides
    for every hypothesis cluster whether it is *pure* (all its elements share
    one reference cluster), then saves an nPreClusters x nPreClusters integer
    matrix to `groundtruth_npy` via np.save.
    """
    # load reference clusters
    reference = Clustering.load(reference_txt)
    # load hypothesis clusters
    hypothesis = Clustering.load(pre_clustering_txt)
    # number of hypothesis clusters
    nPreClusters = len(hypothesis.clusters)
    # sorted hypothesis cluster labels
    preClusters = sorted(hypothesis.clusters)
    # groundtruth[i, j] contains
    # 1 if all elements in clusters i and j are in the same cluster
    # 0 if elements in clusters i and j are not in the same cluster
    # -1 if either cluster i or j is not pure
    # NOTE(review): allocated with np.empty and then indexed directly by
    # cluster *labels* (ci, cj) below — this assumes the labels are exactly
    # 0 .. nPreClusters-1; otherwise some cells stay uninitialized and
    # indexing may go out of bounds. TODO confirm against Clustering.load.
    groundtruth = np.empty((nPreClusters, nPreClusters), dtype=int)
    # clustersRef[c] contains reference cluster for pure hypothesis cluster c
    # in case c is not pure, clustersRef[c] is None
    clustersRef = {}
    for c in preClusters:
        # set of reference labels covering the elements of hypothesis cluster c
        r = set([reference[i] for i in hypothesis.clusters[c]])
        if len(r) == 1:
            # pure cluster: remember its unique reference label
            clustersRef[c] = r.pop()
        else:
            clustersRef[c] = None
    for k, ci in enumerate(preClusters):
        if clustersRef[ci] is None:
            # impure cluster: invalidate its whole row and column
            groundtruth[ci, :] = -1
            groundtruth[:, ci] = -1
            continue
        # only the upper triangle (cj >= ci) is computed; the symmetric
        # entry is mirrored immediately
        for cj in preClusters[k:]:
            if clustersRef[cj] is not None:
                # 1 when both pure clusters map to the same reference cluster
                groundtruth[ci, cj] = clustersRef[ci] == clustersRef[cj]
                groundtruth[cj, ci] = groundtruth[ci, cj]
    # save groundtruth matrix
    np.save(groundtruth_npy, groundtruth)
def do_it(image_txt, features_npy, clustering_txt, output_npy): # load image list with open(image_txt, 'r') as f: images = [int(line.strip()) for line in f.readlines()] image2index = {image: index for index, image in enumerate(images)} # load hypothesis clusters clustering = Clustering.load(clustering_txt) clusters = sorted(clustering.clusters) # load features features = np.load(features_npy) # L2 normalization (for later dot product) features = (features.T / np.sqrt(np.sum((features**2), axis=1))).T # find centroid image for every cluster centroid = {} for c, cluster in enumerate(clusters): # list of images in current cluster _images = clustering.clusters[cluster] # corresponding indices in features matrix _indices = [image2index[image] for image in _images] # compute distance matrix between # all images of current cluster _features = features[_indices, :] _distance = 1. - np.dot(_features, _features.T) # find centroid image i = np.argmin(np.sum(_distance, axis=0)) centroid[cluster] = _images[i] print 'image %s is centroid of cluster %s' % (centroid[cluster], cluster) # centroid indices in features matrix _indices = [image2index[centroid[cluster]] for cluster in clusters] # compute distance matrix between all centroids _features = features[_indices, :] _distance = 1. - np.dot(_features, _features.T) # save distance matrix with open(output_npy, 'wb') as f: np.save(f, _distance)
def do_it(image_txt, features_npy, clustering_txt, output_npy): # load image list with open(image_txt, 'r') as f: images = [int(line.strip()) for line in f.readlines()] image2index = {image: index for index, image in enumerate(images)} # load hypothesis clusters clustering = Clustering.load(clustering_txt) clusters = sorted(clustering.clusters) # load features features = np.load(features_npy) # L2 normalization (for later dot product) features = (features.T / np.sqrt(np.sum((features ** 2), axis=1))).T # find centroid image for every cluster centroid = {} for c, cluster in enumerate(clusters): # list of images in current cluster _images = clustering.clusters[cluster] # corresponding indices in features matrix _indices = [image2index[image] for image in _images] # compute distance matrix between # all images of current cluster _features = features[_indices, :] _distance = 1. - np.dot(_features, _features.T) # find centroid image i = np.argmin(np.sum(_distance, axis=0)) centroid[cluster] = _images[i] print 'image %s is centroid of cluster %s' % (centroid[cluster], cluster) # centroid indices in features matrix _indices = [image2index[centroid[cluster]] for cluster in clusters] # compute distance matrix between all centroids _features = features[_indices, :] _distance = 1. - np.dot(_features, _features.T) # save distance matrix with open(output_npy, 'wb') as f: np.save(f, _distance)
for k, v in cluster.iteritems(): for photo in v: file.write("%d\t%d\n" % (photo, k)) file.close() print "Loading json into memory..." dictionary = readjson( "/vol/corpora4/mediaeval/2014/SED_2014_Dev_Metadata.json") print "...Done !" clusterU = clusterUser(dictionary, fileID) clusterD = clusterDate(dictionary, fileID, clusterU) print_result_file(clusterD, fileOUT) reference = Clustering.load(fileREF) hypothesis = Clustering.load(fileOUT) images = [] for c in clusterD.values(): for i in range(0, len(c)): images.append(c[i]) h = homogeneity(reference, hypothesis, images) print h c = completeness(reference, hypothesis, images) print c
file = open(filename, "w") for k, v in cluster.iteritems(): for photo in v: file.write("%d\t%d\n" % (photo, k)) file.close() print "Loading json into memory..." dictionary = readjson("/vol/corpora4/mediaeval/2014/SED_2014_Dev_Metadata.json") print "...Done !" clusterU = clusterUser(dictionary, fileID) clusterD = clusterDate(dictionary, fileID, clusterU) print_result_file(clusterD, fileOUT) reference = Clustering.load(fileREF) hypothesis = Clustering.load(fileOUT) images = [] for c in clusterD.values(): for i in range (0, len(c)): images.append(c[i]) h = homogeneity(reference, hypothesis, images) print h c = completeness(reference, hypothesis, images) print c