def __init__(self, root_folder, extensions, prefetch=False,
             target_size=None, max_size=None, min_size=None,
             center_crop=None):
    """Initialize from a two-layer storage.

    Input:
        root_folder: the root that contains the data. Under root_folder
            there should be a list of folders, under each of which there
            should be a list of files.
        extensions: the list of extensions used to filter the files,
            e.g. ['png', 'jpg']. Matching is case-insensitive.
        prefetch: if True, the images are prefetched to avoid disk reads.
            If you have a large number of images, prefetching requires a
            lot of memory.
        target_size, max_size, min_size, center_crop: see manipulate()
            for details.
    """
    super(TwoLayerDataset, self).__init__()
    if mpi.agree(not os.path.exists(root_folder)):
        raise OSError("The specified folder does not exist.")
    logging.debug('Loading from %s' % (root_folder,))
    if isinstance(extensions, str):
        extensions = [extensions]
    # lowercase the extensions so that matching is truly case-insensitive
    extensions = set(ext.lower() for ext in extensions)
    if mpi.is_root():
        # get the files first
        files = glob.glob(os.path.join(root_folder, '*', '*'))
        # select those that fit the extensions
        files = [f for f in files
                 if any(f.lower().endswith(ext) for ext in extensions)]
        logging.debug("A total of %d images." % len(files))
        # get the raw labels from the folder names
        labels = [os.path.split(os.path.split(f)[0])[1] for f in files]
        classnames = list(set(labels))
        # sort so we get a deterministic class order
        classnames.sort()
        name2val = dict(zip(classnames, range(len(classnames))))
        labels = [name2val[label] for label in labels]
    else:
        files = None
        classnames = None
        labels = None
    mpi.barrier()
    self._rawdata = mpi.distribute_list(files)
    self._data = self._rawdata
    self._prefetch = prefetch
    self._target_size = target_size
    self._max_size = max_size
    self._min_size = min_size
    self._center_crop = center_crop
    if target_size is not None:
        self._dim = tuple(target_size) + (3,)
    else:
        self._dim = False
    self._channels = 3
    if prefetch:
        self._data = [self._read(idx) for idx in range(len(self._data))]
    self._label = mpi.distribute_list(labels)
    self._classnames = mpi.COMM.bcast(classnames)
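# A minimal usage sketch for TwoLayerDataset, assuming a hypothetical
# two-layer layout such as /data/flowers/rose/001.jpg and a launch like
# `mpirun -n 4 python demo.py`; the path and sizes below are examples,
# not from the source.
if __name__ == '__main__':
    dataset = TwoLayerDataset('/data/flowers', ['jpg', 'png'],
                              prefetch=False, target_size=(256, 256))
    # Each MPI node now holds its own shard of the file list; the labels
    # are integer indices into the globally sorted class names.
    logging.info('Node %d holds %d images.' % (mpi.RANK, len(dataset._data)))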
def testAgree(self):
    self.assertTrue(mpi.agree(True))
    self.assertFalse(mpi.agree(False))
    self.assertTrue(mpi.agree(mpi.RANK == 0))
    self.assertFalse(mpi.agree(mpi.RANK != 0))
    self.assertFalse(mpi.agree(mpi.RANK))
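# The assertions above pin down agree()'s semantics: every node ends up
# with the value computed on rank 0, so agree(mpi.RANK == 0) is True
# everywhere and agree(mpi.RANK) is 0, which is falsy. A minimal sketch
# of such an implementation, assuming mpi4py underneath; the actual
# module may differ in detail.
from mpi4py import MPI

_COMM = MPI.COMM_WORLD

def _agree_sketch(decision):
    # Broadcast the root's decision so every node returns the same value.
    return _COMM.bcast(decision, root=0)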
def kmeans(X, k, n_init=1, max_iter=300, tol=1e-4):
    """K-means clustering algorithm.

    Parameters
    ----------
    X: ndarray
        An M by N array of M observations in N dimensions. X on every
        MPI node holds the local data points that node is responsible
        for.
    k: int
        The number of clusters to form.
    n_init: int, optional, default: 1
        Number of times the k-means algorithm will be run with different
        centroid seeds. The final result will be the best output of the
        n_init consecutive runs in terms of inertia.
    max_iter: int, optional, default: 300
        Maximum number of iterations of the k-means algorithm to run.
    tol: float, optional
        The relative increment in the results before declaring
        convergence.

    Returns
    -------
    centroid: ndarray
        A k by N array of centroids found at the last iteration of
        k-means.
    label: ndarray
        label[i] is the code or index of the centroid the i'th
        observation is closest to.
    inertia: float
        The final value of the inertia criterion.
    """
    # do k-means training
    # vdata helps the stop criterion
    vdata = mpi.COMM.allreduce(np.mean(np.var(X, 0))) / mpi.SIZE
    best_inertia = np.inf
    if k <= 0:
        raise ValueError(
            "The number of centers (%d) should be positive." % k)
    if mpi.COMM.allreduce(X.shape[0], op=mpi.MPI.MIN) == 0:
        raise RuntimeError("Some nodes have zero data.")
    logging.debug("Kmeans: A total of %d data points." %
                  mpi.COMM.allreduce(X.shape[0]))
    # pre-compute the squared norms of the data points
    x_squared_norms = (X ** 2).sum(axis=1)
    for init_count in range(n_init):
        logging.debug("Kmeans trial %d" % init_count)
        # initialization: sample k local points, gather them at the root,
        # and broadcast a random subset of k as the starting centers
        centers = X[np.random.randint(X.shape[0], size=k)]
        centers_all = mpi.COMM.gather(centers)
        if mpi.is_root():
            centers_all = np.vstack(centers_all)
            centers[:] = centers_all[
                np.random.permutation(centers_all.shape[0])[:k]]
        mpi.COMM.Bcast(centers)
        # iterations
        for iter_id in range(max_iter):
            logging.debug("Kmeans iter %d" % iter_id)
            centers_old = centers.copy()
            labels, inertia = _e_step(X, centers,
                                      x_squared_norms=x_squared_norms)
            inertia = mpi.COMM.allreduce(inertia)
            logging.debug("Inertia %f" % inertia)
            centers = _m_step(X, labels, k)
            # test convergence
            converged = (np.sum((centers_old - centers) ** 2) < tol * vdata)
            if mpi.agree(converged):
                break
        if inertia < best_inertia:
            best_labels = labels.copy()
            best_centers = centers.copy()
            best_inertia = inertia
    return best_centers, best_labels, best_inertia
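# A minimal usage sketch for kmeans(); the synthetic data and launch line
# (e.g. `mpirun -n 4 python cluster.py`) are illustrative, not from the
# source. Each node supplies its own local shard of X.
if __name__ == '__main__':
    np.random.seed(mpi.RANK)
    # two well-separated Gaussian blobs in 2-D, per node
    X = np.vstack([np.random.randn(100, 2),
                   np.random.randn(100, 2) + 5.0])
    centers, labels, inertia = kmeans(X, k=2, n_init=3, max_iter=100)
    # centers should be identical on all nodes (they are broadcast every
    # iteration); labels indexes each node's local rows of X.
    logging.info('Final inertia: %f' % inertia)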