def cifar_demo(): """Performs a demo classification on cifar """ mpi.mkdir(FLAGS.output_dir) logging.info('Loading cifar data...') cifar = visiondata.CifarDataset(FLAGS.root, is_training=True) cifar_test = visiondata.CifarDataset(FLAGS.root, is_training=False) conv = pipeline.ConvLayer([ pipeline.PatchExtractor([6,6], 1), # extracts patches pipeline.MeanvarNormalizer({'reg': 10}), # normalizes the patches pipeline.LinearEncoder({}, trainer = pipeline.ZcaTrainer({'reg': 0.1})), # Does whitening pipeline.ThresholdEncoder({'alpha': 0.25, 'twoside': True}, trainer = pipeline.OMPTrainer( {'k': 800, 'max_iter':100})), # does encoding pipeline.SpatialPooler({'grid': (2,2), 'method': 'ave'}) # average pool ]) logging.info('Training the pipeline...') conv.train(cifar, 50000) logging.info('Dumping the pipeline...') if mpi.is_root(): with open(os.path.join(FLAGS.output_dir, FLAGS.model_file),'w') as fid: pickle.dump(conv, fid) fid.close() with open(os.path.join(FLAGS.output_dir, FLAGS.model_file),'r') as fid: conv = pickle.load(fid) logging.info('Extracting features...') Xtrain = conv.process_dataset(cifar, as_2d = True) mpi.dump_matrix_multi(Xtrain, os.path.join(FLAGS.output_dir, FLAGS.feature_file+'_train')) Ytrain = cifar.labels().astype(np.int) Xtest = conv.process_dataset(cifar_test, as_2d = True) mpi.dump_matrix_multi(Xtest, os.path.join(FLAGS.output_dir, FLAGS.feature_file+'_test')) Ytest = cifar_test.labels().astype(np.int) # normalization m, std = classifier.feature_meanstd(Xtrain) Xtrain -= m Xtrain /= std Xtest -= m Xtest /= std w, b = classifier.l2svm_onevsall(Xtrain, Ytrain, 0.01) if mpi.is_root(): with open(os.path.join(FLAGS.output_dir, FLAGS.svm_file), 'w') as fid: pickle.dump({'m': m, 'std': std, 'w': w, 'b': b}, fid) accu = np.sum(Ytrain == (np.dot(Xtrain,w)+b).argmax(axis=1)) \ / float(len(Ytrain)) accu_test = np.sum(Ytest == (np.dot(Xtest,w)+b).argmax(axis=1)) \ / float(len(Ytest)) logging.info('Training accuracy: %f' % accu) logging.info('Testing accuracy: %f' % accu_test)
def cifar_demo(): """Performs a demo classification on cifar """ mpi.mkdir(FLAGS.output_dir) logging.info("Loading cifar data...") cifar = visiondata.CifarDataset(FLAGS.root, is_training=True) cifar_test = visiondata.CifarDataset(FLAGS.root, is_training=False) # try: use sub images # cifar = datasets.SubImageSet(cifar, [28,28], 1) # cifar_test = datasets.CenterRegionSet(cifar_test, [28,28]) conv = pipeline.ConvLayer( [ pipeline.PatchExtractor([6, 6], 1), # extracts patches pipeline.MeanvarNormalizer({"reg": 10}), # normalizes the patches pipeline.LinearEncoder({}, trainer=pipeline.ZcaTrainer({"reg": 0.1})), # Does whitening pipeline.ThresholdEncoder( {"alpha": 0.25, "twoside": True}, trainer=pipeline.OMPTrainer({"k": 1600, "max_iter": 100}) ), # does encoding pipeline.SpatialPooler({"grid": (4, 4), "method": "max"}), # average pool ] ) logging.info("Training the pipeline...") conv.train(cifar, 400000) logging.info("Dumping the pipeline...") if mpi.is_root(): with open(os.path.join(FLAGS.output_dir, FLAGS.model_file), "w") as fid: pickle.dump(conv, fid) fid.close() logging.info("Extracting features...") Xtrain = conv.process_dataset(cifar, as_2d=True) mpi.dump_matrix_multi(Xtrain, os.path.join(FLAGS.output_dir, FLAGS.feature_file + "_train")) Ytrain = cifar.labels().astype(np.int) Xtest = conv.process_dataset(cifar_test, as_2d=True) mpi.dump_matrix_multi(Xtest, os.path.join(FLAGS.output_dir, FLAGS.feature_file + "_test")) Ytest = cifar_test.labels().astype(np.int) # normalization m, std = classifier.feature_meanstd(Xtrain) Xtrain -= m Xtrain /= std Xtest -= m Xtest /= std w, b = classifier.l2svm_onevsall(Xtrain, Ytrain, 0.005) if mpi.is_root(): with open(os.path.join(FLAGS.output_dir, FLAGS.svm_file), "w") as fid: pickle.dump({"m": m, "std": std, "w": w, "b": b}, fid) accu = np.sum(Ytrain == (np.dot(Xtrain, w) + b).argmax(axis=1)) / float(len(Ytrain)) accu_test = np.sum(Ytest == (np.dot(Xtest, w) + b).argmax(axis=1)) / float(len(Ytest)) logging.info("Training accuracy: %f" % accu) logging.info("Testing accuracy: %f" % accu_test)
def omp_n(X, k, num_active, max_iter=100, tol=1e-4):
    """OMP training with MPI

    Input:
        X: a num_data_local * dim numpy matrix containing the data, each row
            being a datum.
        k: the dictionary size.
        num_active: the number of active dictionary entries for each datum
        max_iter: (optional) the maximum number of iteration. Default 100.
        tol: (optional) the tolerance threshold to determine convergence.
            Default 1e-4.
    """
    # vdata is used for testing convergence
    Nlocal = X.shape[0]
    vdatalocal = np.sum(np.var(X, 0))
    N = mpi.COMM.allreduce(Nlocal)
    vdata = mpi.COMM.allreduce(vdatalocal)
    vdata /= N
    # random initialization
    centroids = np.random.randn(k, X.shape[1])
    centroids /= np.sqrt(np.sum(centroids ** 2, axis=1)).reshape(k, 1)
    centroids_all = mpi.COMM.gather(centroids)
    # make sure we are using the same centroids on all nodes
    if mpi.is_root():
        centroids_all = np.vstack(centroids_all)
        centroids[:] = centroids_all[
                np.random.permutation(centroids_all.shape[0])[:k]]
    mpi.COMM.Bcast(centroids, root=0)
    timer = util.Timer()
    for iter_id in range(max_iter):
        logging.debug("OMP-%d iter %d, last iteration %s, elapsed %s" %
                      (num_active, iter_id, timer.lap(), timer.total()))
        centroids_old = centroids.copy()
        labels, val = omp_n_predict(X, centroids, num_active)
        centroids = omp_n_maximize(X, labels, val, k)
        # check convergence on root
        if mpi.is_root():
            converged = np.sum((centroids_old - centroids) ** 2) < tol * vdata
        else:
            converged = None
        converged = mpi.COMM.bcast(converged)
        if converged:
            logging.debug("OMP has converged.")
            break
    else:
        logging.debug("OMP reached the maximum number of iterations.")
    return centroids

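# Usage sketch (an assumption for illustration, not part of the original
# module): train an OMP-N dictionary on the data block hosted by the local MPI
# node, then encode that same block with omp_n_predict, which omp_n itself
# relies on. The dictionary size and sparsity below are made-up values.
def _demo_omp_n(X_local):
    dictionary = omp_n(X_local, k=1600, num_active=10, max_iter=100)
    labels, vals = omp_n_predict(X_local, dictionary, 10)
    return dictionary, labels, vals
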
def compute_caltech_features():
    caltech = datasets.TwoLayerDataset(FLAGS.root, ['jpg'], max_size=300)
    conv = pipeline.ConvLayer([
            dsift.DsiftExtractor(FLAGS.sift_size, FLAGS.sift_stride),
            pipeline.LLCEncoder({'k': FLAGS.llc_k},
                    trainer=pipeline.KmeansTrainer({'k': FLAGS.dict_size})),
            pipeline.PyramidPooler({'level': 3, 'method': 'max'})
            ])
    conv.train(caltech, 400000)
    feat = conv.process_dataset(caltech, as_2d=True)
    mpi.mkdir(FLAGS.feature_dir)
    if mpi.is_root():
        with open(os.path.join(FLAGS.feature_dir, FLAGS.model_file), 'w') as fid:
            pickle.dump(conv, fid)
    mpi.dump_matrix_multi(feat,
                          os.path.join(FLAGS.feature_dir, FLAGS.feature_file))
    mpi.dump_matrix_multi(caltech.labels(),
                          os.path.join(FLAGS.feature_dir, FLAGS.label_file))

def __init__(self, rootfolder, is_training):
    super(MNISTDataset, self).__init__()
    if mpi.is_root():
        # root loads the data
        if is_training:
            self._data = self._read_byte_data(
                    os.path.join(rootfolder, 'train-images-idx3-ubyte'),
                    16,
                    (MNISTDataset.__num_train,) + MNISTDataset.__image_dim)
            self._label = self._read_byte_data(
                    os.path.join(rootfolder, 'train-labels-idx1-ubyte'),
                    8, [MNISTDataset.__num_train]).astype(np.int)
        else:
            self._data = self._read_byte_data(
                    os.path.join(rootfolder, 't10k-images-idx3-ubyte'),
                    16,
                    (MNISTDataset.__num_test,) + MNISTDataset.__image_dim)
            self._label = self._read_byte_data(
                    os.path.join(rootfolder, 't10k-labels-idx1-ubyte'),
                    8, [MNISTDataset.__num_test]).astype(np.int)
    else:
        self._data = None
        self._label = None
    self._data = mpi.distribute(self._data)
    self._label = mpi.distribute(self._label)
    self._dim = MNISTDataset.__image_dim
    self._channels = 1

def load_cifar10(self, rootfolder, is_training):
    """loads the cifar-10 dataset
    """
    if mpi.is_root():
        if is_training:
            self._data = np.empty((CifarDataset.__num_train,) + \
                                  CifarDataset.__image_dim)
            self._label = np.empty(CifarDataset.__num_train)
            # training batches
            for i in range(CifarDataset.__num_batches):
                with open(os.path.join(rootfolder,
                                       'data_batch_{0}'.format(i + 1)),
                          'r') as fid:
                    batch = pickle.load(fid)
                start_idx = CifarDataset.__batchsize * i
                end_idx = CifarDataset.__batchsize * (i + 1)
                self._data[start_idx:end_idx] = \
                        CifarDataset.get_images_from_matrix(batch['data'])
                self._label[start_idx:end_idx] = np.array(batch['labels'])
        else:
            with open(os.path.join(rootfolder, 'test_batch'), 'r') as fid:
                batch = pickle.load(fid)
            self._data = CifarDataset.get_images_from_matrix(batch['data'])
            self._label = np.array(batch['labels'])
    else:
        self._data = None
        self._label = None
    self._data = mpi.distribute(self._data)
    self._label = mpi.distribute(self._label)

def get_predictions_logreg_perclass(X, weights):
    pred = mathutil.dot(X, weights[0]) + weights[1]
    prob = 1.0 / (1.0 + np.exp(-pred))
    prob = mpi.COMM.gather(prob)
    if mpi.is_root():
        return np.vstack(prob)
    else:
        return np.zeros((0))

def __init__(self, root_folder, extensions, prefetch=False,
             target_size=None, max_size=None, min_size=None,
             center_crop=None):
    """ Initialize from a two-layer storage
    Input:
        root_folder: the root that contains the data. Under root_folder there
            should be a list of folders, under which there should be a list
            of files
        extensions: the list of extensions that should be used to filter the
            files. Should be like ['png', 'jpg']. It's case insensitive.
        prefetch: if True, the images are prefetched to avoid disk read. If
            you have a large number of images, prefetch would require a lot
            of memory.
        target_size, max_size, min_size, center_crop: see manipulate() for
            details.
    """
    super(TwoLayerDataset, self).__init__()
    if mpi.agree(not os.path.exists(root_folder)):
        raise OSError, "The specified folder does not exist."
    logging.debug('Loading from %s' % (root_folder,))
    if type(extensions) is str:
        extensions = [extensions]
    extensions = set(extensions)
    if mpi.is_root():
        # get files first
        files = glob.glob(os.path.join(root_folder, '*', '*'))
        # select those that fits the extension
        files = [f for f in files
                 if any([f.lower().endswith(ext) for ext in extensions])]
        logging.debug("A total of %d images." % (len(files)))
        # get raw labels
        labels = [os.path.split(os.path.split(f)[0])[1] for f in files]
        classnames = list(set(labels))
        # sort so we get a reasonable class order
        classnames.sort()
        name2val = dict(zip(classnames, range(len(classnames))))
        labels = [name2val[label] for label in labels]
    else:
        files = None
        classnames = None
        labels = None
    mpi.barrier()
    self._rawdata = mpi.distribute_list(files)
    self._data = self._rawdata
    self._prefetch = prefetch
    self._target_size = target_size
    self._max_size = max_size
    self._min_size = min_size
    self._center_crop = center_crop
    if target_size != None:
        self._dim = tuple(target_size) + (3,)
    else:
        self._dim = False
    self._channels = 3
    if prefetch:
        self._data = [self._read(idx) for idx in range(len(self._data))]
    self._label = mpi.distribute_list(labels)
    self._classnames = mpi.COMM.bcast(classnames)

def _add_default_fminargs(self):
    """
    This function adds some default args to fmin, if we have not explicitly
    specified them.
    """
    self._fminargs["maxfun"] = self._fminargs.get("maxfun", 1000)
    self._fminargs["disp"] = self._fminargs.get("disp", 1)
    # even when fmin displays outputs, we set non-root display to none
    if not mpi.is_root():
        self._fminargs["disp"] = 0

def testDumpLoad(self):
    local_size = 2
    mat_sources = [np.random.rand(local_size),
                   np.random.rand(local_size, 2),
                   np.random.rand(local_size, 2, 3)]
    for mat in mat_sources:
        mpi.dump_matrix(mat, _MPI_DUMP_TEST_FILE)
        if mpi.is_root():
            mat_dumped = np.load(_MPI_DUMP_TEST_FILE)
            self.assertEqual(mat_dumped.shape,
                             (local_size * mpi.SIZE,) + mat.shape[1:])
        mat_read = mpi.load_matrix(_MPI_DUMP_TEST_FILE)
        self.assertEqual(mat.shape, mat_read.shape)

def get_predictions_logreg(X, weights):
    pred = mathutil.dot(X, weights[0]) + weights[1]
    prob = pred - pred.max(axis=1)[:, np.newaxis]
    mathutil.exp(prob, out=prob)
    prob /= prob.sum(axis=1)[:, np.newaxis]
    prob = mpi.COMM.gather(prob)
    if mpi.is_root():
        return np.vstack(prob)
    else:
        return np.zeros((0))

def get_predictions_nn(self, X, special_bias=None):
    # use a list (not a generator) so np.hstack works on all numpy versions
    X_feats = np.ascontiguousarray(np.hstack(
            [X[self.feat_list[i]] for i in range(len(self.feat_list))]))
    X_feats -= self.m
    X_feats /= self.std
    if special_bias != None:
        X_feats = np.ascontiguousarray(np.hstack((X_feats, special_bias)))
    prob = get_predictions_nn(X_feats, self._weights_nn, self._arch)[0]
    prob = mpi.COMM.gather(prob)
    if mpi.is_root():
        return np.vstack(prob)
    else:
        return np.zeros((0))

def train(self, incoming_patches):
    m, covmat = mathutil.mpi_meancov(incoming_patches)
    if mpi.is_root():
        # only root carries out the computation
        eigval, eigvec = np.linalg.eigh(covmat)
        reg = self.specs.get('reg', np.finfo(np.float64).eps)
        W = eigvec * 1.0 / (np.sqrt(np.maximum(eigval, 0.0)) + reg)
    else:
        eigval, eigvec, W = None, None, None
    W = mpi.COMM.bcast(W)
    eigval = mpi.COMM.bcast(eigval)
    eigvec = mpi.COMM.bcast(eigvec)
    return (W, -m), (eigval, eigvec, covmat)

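# A hedged sketch of how the values returned by train() could be applied (an
# assumption based only on the return value above, not on the rest of the
# library): train() hands back W together with the negated mean, so whitening
# a batch of patches amounts to centering followed by a single matrix product.
def _apply_whitening(patches, W, negative_mean):
    # equivalent to np.dot(patches - mean, W)
    return np.dot(patches + negative_mean, W)
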
def get_predictions_nn(X, weights, arch):
    hid = mathutil.dot(X, weights[0]) + weights[1]
    hid = 1.0 / (1 + np.exp(-hid))  # sigmoid
    pred = mathutil.dot(hid, weights[2]) + weights[3]
    #prob = pred - pred.max(axis=1)[:,np.newaxis]
    #mathutil.exp(prob, out=prob)
    #prob /= prob.sum(axis=1)[:, np.newaxis]
    prob = 1.0 / (1.0 + np.exp(-pred))
    prob = mpi.COMM.gather(prob)
    hid = mpi.COMM.gather(hid)
    if mpi.is_root():
        return np.vstack(prob), np.vstack(hid)
    else:
        return np.zeros((0)), np.zeros((0))

def omp1(X, k, max_iter=100, tol=1e-4):
    '''omp1 training with MPI

    Note that the X matrix passed should be the local data each node is
    hosting.
    '''
    # vdata is used for testing convergence
    Nlocal = X.shape[0]
    vdatalocal = np.sum(np.var(X, 0))
    N = mpi.COMM.allreduce(Nlocal)
    vdata = mpi.COMM.allreduce(vdatalocal)
    vdata /= N
    # random initialization
    centroids = np.random.randn(k, X.shape[1])
    centroids /= np.sqrt(np.sum(centroids ** 2, axis=1)).reshape(k, 1)
    centroids_all = mpi.COMM.gather(centroids)
    # make sure we are using the same centroids on all nodes
    if mpi.is_root():
        centroids_all = np.vstack(centroids_all)
        centroids[:] = centroids_all[
                np.random.permutation(centroids_all.shape[0])[:k]]
    mpi.COMM.Bcast(centroids, root=0)
    for iter_id in range(max_iter):
        logging.debug("OMP iteration %d" % (iter_id,))
        centroids_old = centroids.copy()
        labels, val = omp1_predict(X, centroids)
        centroids = omp1_maximize(X, labels, val, k)
        # check convergence on root
        if mpi.is_root():
            converged = np.sum((centroids_old - centroids) ** 2) < tol * vdata
        else:
            converged = None
        converged = mpi.COMM.bcast(converged)
        if converged:
            logging.debug("OMP has converged.")
            break
    return centroids

def testFeatureMeanStd(self):
    mat = np.random.rand(100, 50)
    m_test, std_test = classifier.feature_meanstd(mat)
    # use the naive approach to compute the mean and std
    mats = mpi.COMM.gather(mat)
    if mpi.is_root():
        mats = np.vstack(mats)
        m = mats.mean(0)
        std = mats.std(0)
    else:
        m = None
        std = None
    m = mpi.COMM.bcast(m)
    std = mpi.COMM.bcast(std)
    np.testing.assert_array_almost_equal(m, m_test)
    np.testing.assert_array_almost_equal(std, std_test)

def get_data(filename):
    """This is a wrapper function that returns the images in the right
    axes order
    """
    if mpi.is_root():
        matdata = io.loadmat(filename)
        X = matdata['X'].reshape(
                (matdata['X'].shape[0],) + STL10Dataset._image_dim[::-1])
        # make it contiguous so we can do mpi distribute
        X = np.ascontiguousarray(np.transpose(X, axes=[0, 3, 2, 1]),
                                 dtype=X.dtype)
        Y = matdata['y'].astype(int).flatten()
    else:
        X = None
        Y = None
    return mpi.distribute(X), mpi.distribute(Y)

def average_precision(Y, pred):
    """Average Precision for binary classification
    """
    # since we need to compute the precision recall curve, we have to
    # compute this on the root node.
    Y = mpi.COMM.gather(Y)
    pred = mpi.COMM.gather(pred)
    if mpi.is_root():
        Y = np.hstack(Y)
        pred = np.hstack(pred)
        precision, recall, _ = metrics.precision_recall_curve(Y == 1, pred)
        ap = metrics.auc(recall, precision)
    else:
        ap = None
    mpi.barrier()
    return mpi.COMM.bcast(ap)

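# Usage sketch (illustrative only): each MPI node passes in its local labels
# and scores; the AP is computed on the root from the gathered arrays and then
# broadcast, so every node receives the same value.
def _demo_average_precision(Y_local, scores_local):
    ap = average_precision(Y_local, scores_local)
    logging.info("average precision: %f" % ap)
    return ap
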
def __init__(self, root, is_training, crop=False, prefetch=False,
             target_size=None):
    """Load the dataset.
    Input:
        root: the root folder of the CUB_200_2011 dataset.
        is_training: if true, load the training data. Otherwise, load the
            testing data.
        crop: if False, does not crop the bounding box. If a real value,
            crop is the ratio of the bounding box that gets cropped. e.g.,
            if crop = 1.5, the resulting image will be 1.5 * the bounding
            box area.
        prefetch: if True, the images are prefetched to avoid disk read. If
            you have a large number of images, prefetch would require a lot
            of memory.
        target_size: if provided, all images are resized to the size
            specified. Should be a list of two integers, like [640, 480].
    Note that we will use the python indexing (labels start from 0).
    """
    if is_training:
        mat_filename = 'train_list.mat'
    else:
        mat_filename = 'test_list.mat'
    if mpi.is_root():
        matfile = io.loadmat(os.path.join(root, mat_filename))
        labels = np.array(matfile['labels'].flatten() - 1, dtype=np.int)
        files = [f[0][0] for f in matfile['file_list']]
    else:
        labels = None
        files = None
    self._data = mpi.distribute_list(files)
    self._label = mpi.distribute(labels)
    self._root = root
    self._prefetch = prefetch
    self._crop = crop
    self._target_size = target_size
    if target_size is not None:
        self._dim = tuple(target_size) + (3,)
    else:
        self._dim = False
    self._channels = 3
    if self._prefetch:
        self._data = [self._read(i) for i in range(len(self._data))]

def obj(wb, solver):
    '''
    The objective function used by fmin
    '''
    # obtain w and b
    K = solver._K
    dim = solver._dim
    w = wb[:K * dim].reshape((dim, K))
    b = wb[K * dim:]
    # pred is a matrix of size [num_datalocal, K]
    mathutil.dot(solver._X, w, out=solver._pred)
    solver._pred += b
    # compute the loss function
    if solver.gpredcache:
        flocal, gpred = solver.loss(solver._Y, solver._pred, solver._weight,
                                    solver._gpred, solver._gpredcache,
                                    **solver._lossargs)
    else:
        flocal, gpred = solver.loss(solver._Y, solver._pred, solver._weight,
                                    **solver._lossargs)
    mathutil.dot(solver._X.T, gpred,
                 out=solver._glocal[:K * dim].reshape(dim, K))
    solver._glocal[K * dim:] = gpred.sum(axis=0)
    # we should normalize them with the number of data
    flocal /= solver._num_data
    solver._glocal /= solver._num_data
    # add regularization term, but keep in mind that we have multiple nodes
    # so we only carry it out on root to make sure we only added one
    # regularization term
    if mpi.is_root():
        freg, greg = solver.reg(w, **solver._regargs)
        flocal += solver._gamma * freg
        solver._glocal[:K * dim] += solver._gamma * greg.ravel()
    # do mpi reduction
    mpi.barrier()
    f = mpi.COMM.allreduce(flocal)
    mpi.COMM.Allreduce(solver._glocal, solver._g)
    ######### DEBUG PART ##############
    if np.isnan(f):
        # check all the components to see what went wrong.
        print 'rank %s: isnan X: %d' % (mpi.RANK, np.any(np.isnan(solver._X)))
        print 'rank %s: isnan Y: %d' % (mpi.RANK, np.any(np.isnan(solver._Y)))
        print 'rank %s: isnan flocal: %d' % (mpi.RANK, np.any(np.isnan(flocal)))
        print 'rank %s: isnan pred: %d' % (mpi.RANK, np.any(np.isnan(solver._pred)))
        print 'rank %s: isnan w: %d' % (mpi.RANK, np.any(np.isnan(w)))
        print 'rank %s: isnan b: %d' % (mpi.RANK, np.any(np.isnan(b)))
    return f, solver._g

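# A hedged sketch of how obj() can be consumed (an assumption, not the
# solver's actual code): because obj returns both the objective value and its
# gradient, it can be passed directly to an L-BFGS style routine such as
# scipy.optimize.fmin_l_bfgs_b, with the solver object forwarded via args and
# the solver's fmin arguments (e.g. maxfun, disp) passed through as keywords.
def _minimize_obj(solver, wb_init):
    from scipy.optimize import fmin_l_bfgs_b
    wb_opt, f_opt, info = fmin_l_bfgs_b(obj, wb_init, args=(solver,),
                                        **solver._fminargs)
    return wb_opt, f_opt
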
def get_predictions_nn_old(self, X, special_bias=None):
    # use a list (not a generator) so np.hstack works on all numpy versions
    X_feats = np.ascontiguousarray(np.hstack(
            [X[self.feat_list[i]] for i in range(len(self.feat_list))]))
    X_feats -= self.m
    X_feats /= self.std
    if special_bias != None:
        X_feats = np.ascontiguousarray(np.hstack((X_feats, special_bias)))
    DS = ClassificationDataSet(X_feats.shape[1], 1, nb_classes=2)
    #for i in range(X_feats.shape[0]):
    #    DS.addSample( X_feats[i,:], [0.0] )
    DS.setField('input', X_feats)
    DS.setField('target', np.zeros((X_feats.shape[0], 1)))
    DS._convertToOneOfMany()
    prob = self._nn.activateOnDataset(DS)
    prob = mpi.COMM.gather(prob)
    if mpi.is_root():
        return np.vstack(prob)
    else:
        return np.zeros((0))

def demo_kmeans():
    """A simple kmeans demo
    """
    print 'Running kmeans demo'
    data = np.vstack((np.random.randn(500, 2) + 1,
                      np.random.randn(500, 2) - 1))
    centers, labels, inertia = kmeans(data, 8, n_init=1, max_iter=5)
    print 'inertia =', inertia
    print 'centers = \n', centers
    try:
        from matplotlib import pyplot
        if mpi.is_root():
            pyplot.scatter(data[:, 0], data[:, 1], c=labels)
            pyplot.show()
        mpi.barrier()
    except Exception:
        print 'cannot show figure. will simply pass'
        pass

def load_cifar100(self, rootfolder, is_training):
    """loads the cifar-100 dataset
    """
    if mpi.is_root():
        if is_training:
            filename = 'train'
        else:
            filename = 'test'
        with open(rootfolder + os.sep + filename) as fid:
            batch = pickle.load(fid)
        self._data = CifarDataset.get_images_from_matrix(batch['data'])
        self._coarselabel = np.array(batch['coarse_labels'])
        self._label = np.array(batch['fine_labels'])
    else:
        self._data = None
        self._coarselabel = None
        self._label = None
    self._data = mpi.distribute(self._data)
    self._coarselabel = mpi.distribute(self._coarselabel)
    self._label = mpi.distribute(self._label)

def prune_conv(conv, dataset, num_patches, num_features):
    if not isinstance(conv[-1], pipeline.Pooler):
        raise TypeError, "The last layer should be a pooler."
    if not isinstance(conv[-2], pipeline.FeatureEncoder):
        raise TypeError, "The second last layer should be an encoder."
    logging.debug('Randomly sampling pooled features...')
    features = conv.sample(dataset, num_patches, True)
    if features.shape[1] != conv[-2].dictionary.shape[0]:
        raise ValueError, "Huh, I can't figure out the encoding method.\n" \
                "Feature shape: %d, dictionary size: %d" % \
                (features.shape[1], conv[-2].dictionary.shape[0])
    logging.debug('Perform feature selection...')
    covmat = mathutil.mpi_cov(features)
    if mpi.is_root():
        selected_idx = max_variance_feature_selection(covmat, num_features)
    else:
        selected_idx = None
    selected_idx = mpi.COMM.bcast(selected_idx)
    conv[-2].dictionary = conv[-2].dictionary[selected_idx]
    return covmat

def demo_read(root):
    from iceberk import visualize
    vis = visualize.PatchVisualizer()
    print 'Loading training data...'
    traindata = STL10Dataset(root, 'train')
    print 'My training data size:', traindata.size()
    print 'Loading testing data...'
    testdata = STL10Dataset(root, 'test')
    print 'My testing data size:', testdata.size()
    print 'Loading unlabeled data...'
    unlabeleddata = STL10Dataset(root, 'unlabeled')
    print 'My unlabeled data size:', unlabeleddata.size()
    if mpi.is_root():
        vis.pyplot.figure()
        vis.show_multiple(traindata.raw_data()[:25])
        vis.pyplot.title('Sample training images.')
        vis.pyplot.figure()
        vis.show_multiple(testdata.raw_data()[:25])
        vis.pyplot.title('Sample testing images.')
        vis.pyplot.figure()
        vis.show_multiple(unlabeleddata.raw_data()[:25])
        vis.pyplot.title('Sample unlabeled images.')
        vis.pyplot.show()
    mpi.barrier()

def __init__(self, list_file, feat_range, posting_file, perc_pos,
             keep_full_utt=False, posting_sampler=None, min_dur=0.2,
             min_count=0.0, max_count=10000000.0, reader_type='utterance',
             pickle_fname=None, list_file_sph=None, kw_feat=None,
             merge_score_files=None):
    '''TODO: Read pieces of utterance from the CSV file instead to save
    memory. It would be nice to index these by utt_id (by now I do a map).'''
    super(BabelDataset, self).__init__()
    if list_file.find('eval') >= 0:
        self.is_eval = True
        self.T = FLAGS.T_eval
    else:
        self.is_eval = False
        self.T = FLAGS.T_train
    self.beta = FLAGS.beta
    self.reader_type = reader_type
    if reader_type == 'lattice':
        self.is_lattice = True
        utt_reader = LatticeReader.LatticeReader(list_file)
        utt_reader.ReadAllLatices()
    elif reader_type == 'utterance':
        self.is_lattice = False
        utt_reader = UtteranceReader.UtteranceReader(
                list_file, pickle_fname=pickle_fname)
        utt_reader.ReadAllUtterances(feat_range)
    elif reader_type == 'snr':
        self.is_lattice = False
        utt_reader = SNRReader.SNRReader(list_file, pickle_fname=pickle_fname)
        utt_reader.ReadAllSNR()
    elif reader_type == 'srate':
        self.is_lattice = False
        utt_reader = SrateReader.SrateReader(list_file,
                                             pickle_fname=pickle_fname)
        utt_reader.ReadAllSrate()
    elif reader_type == 'score':
        self.is_lattice = False
        utt_reader = ScoreReader.ScoreReader(
                list_file, list_file_sph=list_file_sph,
                pickle_fname=pickle_fname,
                merge_score_files=merge_score_files)
    else:
        print 'Reader not implemented!'
        exit(0)
    if posting_sampler == None:
        testParser = PostingParser.PostingParser(posting_file)
        self.posting_sampler = Sampler.Sampler(testParser)
        self.posting_sampler.GetPositive()
        self.posting_sampler.GetNegative()
        self.posting_sampler.SampleData(perc_pos)
    else:
        self.posting_sampler = posting_sampler
    self.min_dur = min_dur
    self._data_all = None
    self._dim = False
    self._channels = 1
    self.keep_full_utt = keep_full_utt
    if mpi.is_root():
        self._data = []
        self._label = []
        self._features = []
        self._utt_id = []
        self._times = []
        self._keyword = []
        skipped = 0
        for i in range(len(self.posting_sampler.negative_data)):
            if utt_reader.map_utt_idx.has_key(
                    self.posting_sampler.negative_data[i]['file']):
                if self.posting_sampler.negative_data[i]['sys_bt'] == '':
                    print 'We found a negative example that was not produced by the system!'
                    exit(0)
                sys_bt = float(self.posting_sampler.negative_data[i]['sys_bt'])
                sys_et = float(self.posting_sampler.negative_data[i]['sys_et'])
                sys_sc = float(self.posting_sampler.negative_data[i]['sys_score'])
                if (sys_et - sys_bt < self.min_dur):
                    skipped += 1
                    continue
                self._data.append(utt_reader.GetKeywordData(
                        self.posting_sampler.negative_data[i]['file'],
                        sys_bt, sys_et,
                        kw=self.posting_sampler.negative_data[i]['termid']))
                self._label.append(0)
                self._features.append(sys_sc)
                self._utt_id.append(self.posting_sampler.negative_data[i]['file'])
                self._times.append((sys_bt, sys_et))
                self._keyword.append(self.posting_sampler.negative_data[i]['termid'])
            else:
                pass
        for i in range(len(self.posting_sampler.positive_data)):
            if utt_reader.map_utt_idx.has_key(
                    self.posting_sampler.positive_data[i]['file']):
                if self.posting_sampler.positive_data[i]['sys_bt'] == '':
                    sys_bt = 0
                    sys_et = None
                    sys_sc = -1.0
                    #print self.posting_sampler.positive_data[i]['alignment']
                    continue  #Should just ignore these?
                else:
                    sys_bt = float(self.posting_sampler.positive_data[i]['sys_bt'])
                    sys_et = float(self.posting_sampler.positive_data[i]['sys_et'])
                    sys_sc = float(self.posting_sampler.positive_data[i]['sys_score'])
                    if (sys_et - sys_bt < self.min_dur):
                        skipped += 1
                        continue
                self._data.append(utt_reader.GetKeywordData(
                        self.posting_sampler.positive_data[i]['file'],
                        sys_bt, sys_et,
                        kw=self.posting_sampler.positive_data[i]['termid']))
                self._label.append(1)
                self._features.append(sys_sc)
                self._utt_id.append(self.posting_sampler.positive_data[i]['file'])
                self._times.append((sys_bt, sys_et))
                self._keyword.append(self.posting_sampler.positive_data[i]['termid'])
            else:
                pass
        print 'I skipped ', skipped, ' entries out of ', (len(self.posting_sampler.negative_data) + len(self.posting_sampler.positive_data))
        self._label = np.array(self._label)
    else:
        self._data = None
        self._label = None
        self._features = None
        self._utt_id = None
        self._times = None
        self._keyword = None
    #populate true kw freq
    self._map_kw_counts = {}
    for i in range(len(self.posting_sampler.positive_data)):
        if utt_reader.map_utt_idx.has_key(
                self.posting_sampler.positive_data[i]['file']):
            kw = self.posting_sampler.positive_data[i]['termid']
            if self._map_kw_counts.has_key(kw):
                self._map_kw_counts[kw] += 1
            else:
                self._map_kw_counts[kw] = 1
    #filter dataset depending on count
    if mpi.is_root():
        ind_keep = []
        kw_zero = 0
        for i in range(len(self._keyword)):
            kw = self._keyword[i]
            kw_count = 0
            if self._map_kw_counts.has_key(kw):
                kw_count = self._map_kw_counts[kw]
            else:
                kw_zero += 1
            if kw_count <= max_count and kw_count >= min_count:
                ind_keep.append(i)
        self._data = [self._data[i] for i in ind_keep]
        self._label = [self._label[i] for i in ind_keep]
        self._features = [self._features[i] for i in ind_keep]
        self._utt_id = [self._utt_id[i] for i in ind_keep]
        self._times = [self._times[i] for i in ind_keep]
        self._keyword = [self._keyword[i] for i in ind_keep]
    self._data = mpi.distribute_list(self._data)
    self._label = mpi.distribute(self._label)
    self._features = mpi.distribute_list(self._features)
    self._utt_id = mpi.distribute_list(self._utt_id)
    self._times = mpi.distribute_list(self._times)
    self._keyword = mpi.distribute_list(self._keyword)
    if self.keep_full_utt == True:
        self.utt_reader = utt_reader
    if kw_feat != None:
        try:
            kw_feat.has_key('length')
            self.CopyKeywordMaps(kw_feat)
        except:
            self.LoadMappingHescii(FLAGS.hescii_file)
            self.ComputeKeywordMaps()

def testIsRoot(self):
    if mpi.RANK == 0:
        self.assertTrue(mpi.is_root())
    else:
        self.assertFalse(mpi.is_root())

std.resize(np.prod(regions_pooled.shape[1:-1]), regions_pooled.shape[-1])
std = std.mean(axis=0)
std_order = np.argsort(std)
# now, compute the within-class std
regions_pooled_view = regions_pooled.reshape(
        regions_pooled.shape[0],
        np.prod(regions_pooled.shape[1:-1]),
        regions_pooled.shape[-1])
within_std_local = regions_pooled_view.var(axis=1)
print within_std_local.shape
within_std = np.sqrt(mathutil.mpi_mean(within_std_local))
within_std_order = np.argsort(within_std)
std_comparison = within_std / (std + 1e-10)
std_comparison_order = np.argsort(std_comparison)
if mpi.is_root():
    pyplot.figure()
    visualize.show_multiple(conv[-2].dictionary[std_order])
    pyplot.savefig("codes_std_ordered.pdf")
    pyplot.figure()
    visualize.show_multiple(conv[-2].dictionary[within_std_order])
    pyplot.savefig("codes_within_std_ordered.pdf")
    pyplot.figure()
    visualize.show_multiple(conv[-2].dictionary[std_comparison_order])
    pyplot.savefig("codes_std_comparison_ordered.pdf")
    pyplot.figure()
    pyplot.plot(std)
    pyplot.show()
mpi.barrier()

def apcluster_k(feature, num_centers, corr=True, tol=0):
    """perform the affinity propagation algorithm for the input codes.
    """
    logging.debug("ap: preparing similarity matrix")
    covmat = mathutil.mpi_cov(feature)
    std = np.diag(covmat)
    # normalize
    std = np.sqrt(std ** 2 + 0.01)
    if corr:
        # compute correlation. If corr is False, we will use the covariance
        # directly
        covmat /= std
        covmat /= std[:, np.newaxis]
    # compute the similarity matrix
    norm = np.diag(covmat) / 2
    covmat -= norm
    covmat -= norm[:, np.newaxis]
    # add a small noise to covmat
    noise = (covmat + np.finfo(np.float64).eps) * \
            np.random.rand(covmat.shape[0], covmat.shape[1])
    mpi.COMM.Bcast(noise)
    covmat += noise
    # The remaining part can just be carried out on root
    if mpi.is_root():
        # set preference
        pmax = covmat.max()
        #af = AffinityPropagation().fit(covmat, pmax)
        #num_max = len(af.cluster_centers_indices_)
        # in fact, num_max would always be covmat.shape[0] so we don't really
        # run ap
        num_max = covmat.shape[0]
        logging.debug("ap: pmax = %s, num = %d" % (pmax, num_max))
        pmin = covmat.min()
        af = AffinityPropagation().fit(covmat, pmin)
        # num_min is the theoretical min, but the python code seem to raise
        # bugs...
        num_min = len(af.cluster_centers_indices_)
        logging.debug("ap: pmin = %s, num = %d" % (pmin, num_min))
        if num_centers < num_min:
            logging.warning("num_centers too small, will return %d centers"
                            % (num_min,))
            return af.cluster_centers_indices_, af.labels_, covmat
        if num_centers > num_max:
            logging.warning("num_centers too large, will return everything.")
            # also return covmat so every code path has the same arity
            return np.arange(covmat.shape[0], dtype=np.int), \
                   np.arange(covmat.shape[0], dtype=np.int), covmat
        logging.debug("ap: start affinity propagation")
        # We will simply use bisection search to find the right number of
        # centroids.
        for i in range(_AP_MAX_ITERATION):
            pref = (pmax + pmin) / 2
            af = AffinityPropagation().fit(covmat, pref)
            num = len(af.cluster_centers_indices_)
            logging.debug("ap try %d: pref = %s, num = %s" % (i + 1, pref, num))
            if num >= num_centers - tol and num <= num_centers + tol:
                break
            elif num < num_centers:
                pmin = pref
                num_min = num
            else:
                pmax = pref
                num_max = num
    else:
        af = None
    mpi.barrier()
    af = mpi.COMM.bcast(af)
    return af.cluster_centers_indices_, af.labels_, covmat

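# Usage sketch (the parameter value is illustrative): pick roughly num_centers
# representative entries from a locally hosted feature matrix; every MPI node
# ends up with the same result because the fitted model is broadcast from the
# root.
def _demo_apcluster_k(features_local):
    center_idx, labels, covmat = apcluster_k(features_local, num_centers=256)
    return center_idx, labels
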
def cifar_demo(): """Performs a demo classification on cifar """ mpi.mkdir(FLAGS.output_dir) logging.info('Loading cifar data...') cifar = visiondata.CifarDataset(FLAGS.root, is_training=True) cifar_test = visiondata.CifarDataset(FLAGS.root, is_training=False) conv = pipeline.ConvLayer([ pipeline.PatchExtractor([6, 6], 1), # extracts patches pipeline.MeanvarNormalizer({'reg': 10}), # normalizes the patches pipeline.LinearEncoder({}, trainer=pipeline.ZcaTrainer( {'reg': 0.1})), # Does whitening pipeline.ThresholdEncoder({ 'alpha': 0.25, 'twoside': True }, trainer=pipeline.OMPTrainer({ 'k': 800, 'max_iter': 100 })), # does encoding pipeline.SpatialPooler({ 'grid': (2, 2), 'method': 'ave' }) # average pool ]) logging.info('Training the pipeline...') conv.train(cifar, 50000) logging.info('Dumping the pipeline...') if mpi.is_root(): with open(os.path.join(FLAGS.output_dir, FLAGS.model_file), 'w') as fid: pickle.dump(conv, fid) fid.close() with open(os.path.join(FLAGS.output_dir, FLAGS.model_file), 'r') as fid: conv = pickle.load(fid) logging.info('Extracting features...') Xtrain = conv.process_dataset(cifar, as_2d=True) mpi.dump_matrix_multi( Xtrain, os.path.join(FLAGS.output_dir, FLAGS.feature_file + '_train')) Ytrain = cifar.labels().astype(np.int) Xtest = conv.process_dataset(cifar_test, as_2d=True) mpi.dump_matrix_multi( Xtest, os.path.join(FLAGS.output_dir, FLAGS.feature_file + '_test')) Ytest = cifar_test.labels().astype(np.int) # normalization m, std = classifier.feature_meanstd(Xtrain) Xtrain -= m Xtrain /= std Xtest -= m Xtest /= std w, b = classifier.l2svm_onevsall(Xtrain, Ytrain, 0.01) if mpi.is_root(): with open(os.path.join(FLAGS.output_dir, FLAGS.svm_file), 'w') as fid: pickle.dump({'m': m, 'std': std, 'w': w, 'b': b}, fid) accu = np.sum(Ytrain == (np.dot(Xtrain,w)+b).argmax(axis=1)) \ / float(len(Ytrain)) accu_test = np.sum(Ytest == (np.dot(Xtest,w)+b).argmax(axis=1)) \ / float(len(Ytest)) logging.info('Training accuracy: %f' % accu) logging.info('Testing accuracy: %f' % accu_test)
    """
    eigval, eigvec = np.linalg.eigh(covmat)
    U = eigvec[:, -(200 * FLAGS.grid * FLAGS.grid):]
    #U = eigvec[:,-400:] * np.sqrt(eigval[-400:])
    Xtrain = np.dot(Xtrain, U)
    Xtest = np.dot(Xtest, U)
    """
    w, b = classifier.l2svm_onevsall(Xtrain, Ytrain, 0.002,
                                     fminargs={'disp': 0, 'maxfun': 1000})
    accu_train = classifier.Evaluator.accuracy(Ytrain, np.dot(Xtrain, w) + b)
    accu_test = classifier.Evaluator.accuracy(Ytest, np.dot(Xtest, w) + b)
    logging.info('Training accuracy: %f' % accu_train)
    logging.info('Testing accuracy: %f' % accu_test)


if __name__ == "__main__":
    gflags.FLAGS(sys.argv)
    if mpi.is_root():
        logging.basicConfig(level=logging.DEBUG)
        if FLAGS.profile_file != "":
            cProfile.run('cifar_demo()', FLAGS.profile_file)
        else:
            cifar_demo()
    else:
        cifar_demo()

def kmeans(X, k, n_init=1, max_iter=300, tol=1e-4):
    """ K-means clustering algorithm.

    Parameters
    ----------
    X: ndarray
        A M by N array of M observations in N dimensions. X in every MPI node
        is the local data points it is responsible for.
    k: int or ndarray
        The number of clusters to form.
    n_init: int, optional, default: 1
        Number of times the k-means algorithm will be run with different
        centroid seeds. The final results will be the best output of n_init
        consecutive runs in terms of inertia.
    max_iter: int, optional, default 300
        Maximum number of iterations of the k-means algorithm to run.
    tol: float, optional
        The relative increment in the results before declaring convergence.

    Returns
    -------
    centroid: ndarray
        A k by N array of centroids found at the last iteration of k-means.
    label: ndarray
        label[i] is the code or index of the centroid the i'th observation
        is closest to.
    inertia: float
        The final value of the inertia criterion
    """
    # do k-means training
    # vdata helps the stop criterion
    vdata = mpi.COMM.allreduce(np.mean(np.var(X, 0))) / mpi.SIZE
    best_inertia = np.infty
    if k <= 0:
        raise ValueError, "The number of centers (%d) should be positive." % k
    if mpi.COMM.allreduce(X.shape[0], op=mpi.MPI.MIN) == 0:
        raise RuntimeError, "Some nodes have zero data."
    logging.debug("Kmeans: A total of %d data points." % \
                  mpi.COMM.allreduce(X.shape[0]))
    # pre-compute squared norms of data points
    x_squared_norms = (X ** 2).sum(axis=1)
    for init_count in range(n_init):
        logging.debug("Kmeans trial %d" % (init_count,))
        # initialization
        centers = X[np.random.randint(X.shape[0], size=k)]
        centers_all = mpi.COMM.gather(centers)
        if mpi.is_root():
            centers_all = np.vstack(centers_all)
            centers[:] = centers_all[
                    np.random.permutation(centers_all.shape[0])[:k]]
        mpi.COMM.Bcast(centers)
        # iterations
        for iter_id in range(max_iter):
            logging.debug("Kmeans iter %d" % (iter_id))
            centers_old = centers.copy()
            labels, inertia = _e_step(X, centers,
                                      x_squared_norms=x_squared_norms)
            inertia = mpi.COMM.allreduce(inertia)
            logging.debug("Inertia %f" % (inertia))
            centers = _m_step(X, labels, k)
            # test convergence
            converged = (np.sum((centers_old - centers) ** 2) < tol * vdata)
            if mpi.agree(converged):
                break
        if inertia < best_inertia:
            best_labels = labels.copy()
            best_centers = centers.copy()
            best_inertia = inertia
    return best_centers, best_labels, best_inertia