Example #1
 def obj(wb,solver):
     '''
     The objective function used by fmin
     '''
     # obtain w and b
     Khidden = solver._Khidden
     dim = solver._dim
     whidden = wb[:Khidden*dim].reshape((dim, Khidden))
     tree = solver._regargs['tree']
     w = mathutil.dot(whidden, tree)
     b = wb[Khidden*dim:]
     # pred is a matrix of size [num_datalocal, K]
     mathutil.dot(solver._X, w, out = solver._pred)
     solver._pred += b
     # compute the loss function
     flocal,gpred = solver.loss(solver._Y, solver._pred, solver._weight,
                                **solver._lossargs)
     mathutil.dot(mathutil.dot(solver._X.T, gpred), tree.T,
             out = solver._glocal[:Khidden*dim].reshape(dim, Khidden))
     solver._glocal[Khidden*dim:] = gpred.sum(axis=0)
     
     # add regularization term, but keep in mind that we have multiple nodes
     freg, greg = solver.reg(whidden, **solver._regargs)
     flocal += solver._num_data * solver._gamma * freg / mpi.SIZE
     solver._glocal[:Khidden*dim] += solver._num_data * solver._gamma \
             * greg.ravel() / mpi.SIZE
     # do mpi reduction
     mpi.barrier()
     f = mpi.COMM.allreduce(flocal)
     mpi.COMM.Allreduce(solver._glocal, solver._g)
     return f, solver._g
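
The obj functions in these examples all follow the same contract: given the flattened parameter vector they return the objective value together with its gradient, which is what a gradient-based optimizer such as scipy.optimize.fmin_l_bfgs_b expects when no separate gradient function is supplied. Below is a minimal, self-contained sketch of that contract on a toy quadratic; the solver argument seen above would presumably be threaded through the optimizer's args parameter.

import numpy as np
from scipy.optimize import fmin_l_bfgs_b

def toy_obj(wb):
    # toy quadratic standing in for the loss + regularizer computed above
    target = np.arange(wb.size, dtype=np.float64)
    diff = wb - target
    f = 0.5 * np.dot(diff, diff)   # objective value
    g = diff                       # gradient, same shape as wb
    return f, g

wb_opt, f_opt, info = fmin_l_bfgs_b(toy_obj, np.zeros(5))
print(wb_opt)   # converges to [0, 1, 2, 3, 4]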
Example #2
    def obj(wb, solver):
        """
        The objective function used by fmin
        """
        # obtain w and b
        K = solver._K
        dim = solver._dim
        w = wb[: K * dim].reshape((dim, K))
        b = wb[K * dim :]
        # pred is a matrix of size [num_datalocal, K]
        pred = mathutil.dot(solver._X, w)
        pred += b
        # compute the loss function
        flocal, gpred = solver.loss(solver._Y, pred, solver._weight, **solver._lossargs)
        glocal = np.empty(wb.shape)
        glocal[: K * dim] = mathutil.dot(solver._X.T, gpred).flat
        glocal[K * dim :] = gpred.sum(axis=0)

        # add regularization term, but keep in mind that we have multiple nodes
        freg, greg = solver.reg(w, **solver._regargs)
        flocal += solver._num_data * solver._gamma * freg / mpi.SIZE
        glocal[: K * dim] += solver._num_data * solver._gamma / mpi.SIZE * greg.ravel()
        # do mpi reduction
        mpi.barrier()
        f = mpi.COMM.allreduce(flocal)
        g = np.empty(glocal.shape, dtype=glocal.dtype)
        mpi.COMM.Allreduce(glocal, g)
        return f, g
Example #3
    def testBarrier(self):
        import time

        # sleep for a while, and resume
        time.sleep(mpi.RANK)
        mpi.barrier()
        self.assertTrue(True)
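
For reference, here is a minimal sketch of the same test written directly against mpi4py, under the assumption that iceberk's mpi.barrier() and mpi.RANK are thin wrappers around MPI.COMM_WORLD: each rank sleeps for a different amount of time, and Barrier() blocks until the slowest rank arrives.

import time
from mpi4py import MPI

comm = MPI.COMM_WORLD
time.sleep(comm.Get_rank())   # ranks fall out of sync on purpose
comm.Barrier()                # everyone waits here for the slowest rank
print('rank %d passed the barrier' % comm.Get_rank())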
Example #4
 def train(self, dataset, num_patches,
           exhaustive = False, ratio_per_image = 0.1):
     """ train the convolutional layer
     
     Note that we do not train the first element (patch extractor),
     and stop when we see the spatial pooler. There might be some post
     processing components after the pooler, but they should not require
     any training (if they do, you may want to move them to the next layer).
     """
     if len(self) == 0:
         return
     logging.debug("Training convolutional layer...")
     if not isinstance(self[0], Extractor):
         raise ValueError, \
               "The first component should be a patch extractor!"
     patches = self[0].sample(dataset, num_patches, self._previous_layer,
                              exhaustive, ratio_per_image)
     if len(self) == 1 or isinstance(self[1], Pooler):
         logging.debug('Nothing to be trained in this layer.')
         return
     # actually train the model
     for i in range(1, len(self)):
         component = self[i]
         mpi.barrier()
         logging.debug("Training %s..." % (component.__class__.__name__))
         component.train(patches)
         if i == len(self) - 1 or isinstance(self[i+1], Pooler):
             # if we've reached a pooler, stop training
             break
         else:
             # prepare the next component's input
             patches = component.process(patches)
     logging.debug("Training convolutional layer done.")
Example #5
 def __init__(self, root_folder, extensions, prefetch = False, 
              target_size = None, max_size = None, min_size = None,
              center_crop = None):
     """ Initialize from a two-layer storage
     Input:
         root_folder: the root that contains the data. Under root_folder
             there should be a list of folders, under which there should be
             a list of files
         extensions: the list of extensions that should be used to filter the
             files. Should be like ['png', 'jpg']. It's case insensitive.
         prefetch: if True, the images are prefetched to avoid disk read. If
             you have a large number of images, prefetch would require a lot
             of memory.
         target_size, max_size, min_size, center_crop: see manipulate() for
             details.
     """
     super(TwoLayerDataset, self).__init__()
     if mpi.agree(not os.path.exists(root_folder)):
         raise OSError, "The specified folder does not exist."
     logging.debug('Loading from %s' % (root_folder,))
     if type(extensions) is str:
         extensions = [extensions]
     extensions = set(extensions)
     if mpi.is_root():
         # get files first
         files = glob.glob(os.path.join(root_folder, '*', '*'))
         # select those that fit the extension
         files = [f for f in files  if any([
                         f.lower().endswith(ext) for ext in extensions])]
         logging.debug("A total of %d images." % (len(files)))
         # get raw labels
         labels = [os.path.split(os.path.split(f)[0])[1] for f in files]
         classnames = list(set(labels))
         # sort so we get a reasonable class order
         classnames.sort()
         name2val = dict(zip(classnames, range(len(classnames))))
         labels = [name2val[label] for label in labels]
     else:
         files = None
         classnames = None
         labels = None
     mpi.barrier()
     self._rawdata = mpi.distribute_list(files)
     self._data = self._rawdata
     self._prefetch = prefetch
     self._target_size = target_size
     self._max_size = max_size
     self._min_size = min_size
     self._center_crop = center_crop
     if target_size != None:
         self._dim = tuple(target_size) + (3,)
     else:
         self._dim = False
     self._channels = 3
     if prefetch:
         self._data = [self._read(idx) for idx in range(len(self._data))]
     self._label = mpi.distribute_list(labels)
     self._classnames = mpi.COMM.bcast(classnames)
Example #6
 def average_precision(Y, pred):
     """Average Precision for binary classification
     """
     # since we need to compute the precision recall curve, we have to
     # compute this on the root node.
     Y = mpi.COMM.gather(Y)
     pred = mpi.COMM.gather(pred)
     if mpi.is_root():
         Y = np.hstack(Y)
         pred = np.hstack(pred)
         precision, recall, _ = metrics.precision_recall_curve(Y == 1, pred)
         ap = metrics.auc(recall, precision)
     else:
         ap = None
     mpi.barrier()
     return mpi.COMM.bcast(ap)
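
average_precision illustrates a recurring pattern in these examples: gather the per-node pieces onto the root, compute a metric that needs the full precision-recall curve there, then broadcast the scalar so every rank returns the same value. Below is a stand-alone sketch of that pattern with plain mpi4py and sklearn; the random labels and scores are placeholders, not library data.

import numpy as np
from mpi4py import MPI
from sklearn import metrics

comm = MPI.COMM_WORLD
y_local = np.random.randint(0, 2, 100)    # this node's binary labels
score_local = np.random.rand(100)         # this node's predictions
y_all = comm.gather(y_local)              # list of arrays on root, None elsewhere
score_all = comm.gather(score_local)
if comm.Get_rank() == 0:
    precision, recall, _ = metrics.precision_recall_curve(
            np.hstack(y_all) == 1, np.hstack(score_all))
    ap = metrics.auc(recall, precision)
else:
    ap = None
ap = comm.bcast(ap)                       # every rank now holds the same value
print(ap)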
Example #7
 def obj(wb,solver):
     '''
     The objective function used by fmin
     '''
     # obtain w and b
     K = solver._K
     dim = solver._dim
     w = wb[:K*dim].reshape((dim, K))
     b = wb[K*dim:]
     # pred is a matrix of size [num_datalocal, K]
     mathutil.dot(solver._X, w, out = solver._pred)
     solver._pred += b
     # compute the loss function
     if solver.gpredcache:
         flocal,gpred = solver.loss(solver._Y, solver._pred, solver._weight,
                                    solver._gpred, solver._gpredcache,
                                    **solver._lossargs)
     else:
         flocal,gpred = solver.loss(solver._Y, solver._pred, solver._weight,
                                    **solver._lossargs)
     mathutil.dot(solver._X.T, gpred,
                  out = solver._glocal[:K*dim].reshape(dim, K))
     solver._glocal[K*dim:] = gpred.sum(axis=0)
     # we should normalize them with the number of data
     flocal /= solver._num_data
     solver._glocal /= solver._num_data
     # add regularization term, but keep in mind that we have multiple nodes
     # so we only carry it out on root to make sure we only added one 
     # regularization term
     if mpi.is_root():
         freg, greg = solver.reg(w, **solver._regargs)
         flocal += solver._gamma * freg
         solver._glocal[:K*dim] += solver._gamma * greg.ravel()
     # do mpi reduction
     mpi.barrier()
     f = mpi.COMM.allreduce(flocal)
     mpi.COMM.Allreduce(solver._glocal, solver._g)
     ######### DEBUG PART ##############
     if np.isnan(f):
         # check all the components to see what went wrong.
         print 'rank %s: isnan X: %d' % (mpi.RANK,np.any(np.isnan(solver._X)))
         print 'rank %s: isnan Y: %d' % (mpi.RANK,np.any(np.isnan(solver._Y)))
         print 'rank %s: isnan flocal: %d' % (mpi.RANK,np.any(np.isnan(flocal)))
         print 'rank %s: isnan pred: %d' % (mpi.RANK,np.any(np.isnan(solver._pred)))
         print 'rank %s: isnan w: %d' % (mpi.RANK,np.any(np.isnan(w)))
         print 'rank %s: isnan b: %d' % (mpi.RANK,np.any(np.isnan(b)))
     return f, solver._g
Example #8
 def average_precision(Y, pred):
     """Average Precision for binary classification
     """
     # since we need to compute the precision recall curve, we have to
     # compute this on the root node.
     Y = mpi.COMM.gather(Y)
     pred = mpi.COMM.gather(pred)
     if mpi.is_root():
         Y = np.hstack(Y)
         pred = np.hstack(pred)
         precision, recall, _ = metrics.precision_recall_curve(
                 Y == 1, pred)
         ap = metrics.auc(recall, precision)
     else:
         ap = None
     mpi.barrier()
     return mpi.COMM.bcast(ap)
Example #9
def demo_kmeans():
    """A simple kmeans demo
    """
    print 'Running kmeans demo'
    data = np.vstack((np.random.randn(500,2)+1,\
                      np.random.randn(500,2)-1))
    centers, labels, inertia = kmeans(data, 8, n_init=1, max_iter=5)
    print 'inertia =', inertia
    print 'centers = \n', centers
    try:
        from matplotlib import pyplot
        if mpi.is_root():
            pyplot.scatter(data[:, 0], data[:, 1], c=labels)
            pyplot.show()
        mpi.barrier()
    except Exception:
        print 'cannot show figure. will simply pass'
        pass
Example #10
def demo_kmeans():
    """A simple kmeans demo
    """
    print 'Running kmeans demo'
    data = np.vstack((np.random.randn(500,2)+1,\
                      np.random.randn(500,2)-1))
    centers, labels, inertia = kmeans(data, 8, 
                                         n_init=1, 
                                         max_iter=5)
    print 'inertia =', inertia
    print 'centers = \n', centers
    try:
        from matplotlib import pyplot
        if mpi.is_root():
            pyplot.scatter(data[:,0],data[:,1],c=labels)
            pyplot.show()
        mpi.barrier()
    except Exception:
        print 'cannot show figure. will simply pass'
        pass
Example #11
def omp1_maximize(X, labels, val, k):
    '''Learn the new OMP dictionary from the given activations

    Input:
        X: the data matrix, each row being a datum. Note that X is the
            local data hosted in each MPI node.
        labels: a vector of size X.shape[0], containing the indices of
            the dictionary entry that is active, one for each datum.
        val: a vector of size X.shape[0], the activation value of the 
            corresponding entry
        k: an int specifying the dictionary size.

    Output:
        centroids: a matrix of size [k, X.shape[1]] containing the new
            dictionary.
    '''
    dim = X.shape[1]
    centroids_local = np.zeros((k, dim))
    centroids_local_nonempty = np.zeros(k, dtype=np.int)
    # loop over the classes
    for q in range(k):
        center_mask = (labels == q)
        if np.any(center_mask):
            centroids_local[q] = np.dot(val[center_mask], X[center_mask])
            centroids_local_nonempty[q] = 1
    centroids_nonempty = np.zeros(k, dtype=np.int)
    mpi.barrier()
    mpi.COMM.Allreduce(centroids_local_nonempty, centroids_nonempty)
    # now, for those empty centroids, we need to randomly restart them
    for q in range(k):
        if centroids_nonempty[q] == 0 and mpi.is_president():
            centroids_local[q] = X[np.random.randint(X.shape[0])]
    # collect all centroids
    centroids = np.zeros((k, dim))
    mpi.COMM.Reduce(centroids_local, centroids)
    centroids /= (np.sqrt(np.sum(centroids**2, axis=1)) \
                  +np.finfo(np.float64).eps \
                 )[:, np.newaxis]
    # broadcast to remove any numerical instability
    mpi.COMM.Bcast(centroids)
    return centroids
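
The maximization step above boils down to a Reduce-then-Bcast pattern: every node accumulates partial per-class sums, the root sums and normalizes them, and the broadcast guarantees all nodes end up with bit-identical centroids. Here is a minimal mpi4py sketch of just that collective pattern, with random partial sums standing in for the per-class accumulators.

import numpy as np
from mpi4py import MPI

comm = MPI.COMM_WORLD
partial = np.random.rand(8, 4)        # this node's per-class accumulators
summed = np.zeros_like(partial)
comm.Reduce(partial, summed)          # element-wise sum onto rank 0
if comm.Get_rank() == 0:
    norms = np.sqrt((summed ** 2).sum(axis=1)) + np.finfo(np.float64).eps
    summed /= norms[:, np.newaxis]    # unit-normalize each row on root
comm.Bcast(summed)                    # all ranks now share the same rows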
Example #12
def omp1_maximize(X, labels, val, k):
    '''Learn the new OMP dictionary from the given activations

    Input:
        X: the data matrix, each row being a datum. Note that X is the
            local data hosted in each MPI node.
        labels: a vector of size X.shape[0], containing the indices of
            the dictionary entry that is active, one for each datum.
        val: a vector of size X.shape[0], the activation value of the 
            corresponding entry
        k: an int specifying the dictionary size.

    Output:
        centroids: a matrix of size [k, X.shape[1]] containing the new
            dictionary.
    '''
    dim = X.shape[1]
    centroids_local = np.zeros((k, dim))
    centroids_local_nonempty = np.zeros(k, dtype = np.int)
    # loop over the classes
    for q in range(k):
        center_mask = (labels == q)
        if np.any(center_mask):
            centroids_local[q] = np.dot(val[center_mask], X[center_mask])
            centroids_local_nonempty[q] = 1
    centroids_nonempty = np.zeros(k, dtype=np.int)
    mpi.barrier()
    mpi.COMM.Allreduce(centroids_local_nonempty, centroids_nonempty)
    # now, for those empty centroids, we need to randomly restart them
    for q in range(k):
        if centroids_nonempty[q] == 0 and mpi.is_president():
            centroids_local[q] = X[np.random.randint(X.shape[0])]
    # collect all centroids
    centroids = np.zeros((k, dim))
    mpi.COMM.Reduce(centroids_local, centroids)
    centroids /= (np.sqrt(np.sum(centroids**2, axis=1)) \
                  +np.finfo(np.float64).eps \
                 )[:, np.newaxis]
    # broadcast to remove any numerical instability
    mpi.COMM.Bcast(centroids)
    return centroids
Example #13
 def train(self, dataset, num_patches):
     """ train the convolutional layer
     
     Note that we do not train the first element (patch extractor),
     and stop when we see the spatial pooler. There might be some post
     processing components after the pooler, but they should not require
     any training (if they do, you may want to move them to the next layer).
     """
     logging.debug("Training convolutional layer...")
     if not isinstance(self[0], Extractor):
         raise ValueError, \
               "The first component should be a patch extractor!"
     patches = self[0].sample(dataset, num_patches, self._previous_layer)
     for component in self[1:]:
         mpi.barrier()
         logging.debug("Training %s..." % (component.__class__.__name__))
         if isinstance(component, Pooler):
             # if we've reached pooler, stop training
             break
         patches = component.train(patches)
     logging.debug("Training convolutional layer done.")
Example #14
 def process_dataset(self, dataset, as_list=False, as_2d=False):
     """Processes a whole dataset and returns an numpy ndarray
     
     Input:
         dataset: the input dataset.
         as_list: if True, return a list. This applies when the output has
             different sizes for each image. Default False.
         as_2d: if True, return a matrix where each image corresponds to a
             row in the matrix. Default False.
     """
     # check if we want to use buffer
     if self._fixed_size:
         convbuffer = [None] * (len(self) + 1)
     else:
         convbuffer = None
     total = dataset.size_total()
     logging.debug("Processing a total of %s images" % (total, ))
     timer = util.Timer()
     if as_list:
         data = [self.process(dataset.image(i), convbuffer = convbuffer) \
                 for i in range(dataset.size())]
     else:
         # we assume that each image leads to the same feature size
         temp = self.process(dataset.image(0), as_vector=as_2d)
         logging.debug("Output feature shape: %s" % (str(temp.shape)))
         data = np.empty((dataset.size(), ) + temp.shape)
         data[0] = temp
         size = dataset.size()
         timer = util.Timer()
         for i in range(1, size):
             data[i] = self.process(dataset.image(i),
                                    as_vector=as_2d,
                                    convbuffer=convbuffer)
             # report local progress
             if (i * 10 / size) != ((i - 1) * 10 / size):
                 logging.debug("rank %d: %d percent. elapsed %s" % \
                         (mpi.RANK, i*100 / size, timer.total()))
     mpi.barrier()
     logging.debug("Feature extration took %s" % timer.total())
     return data
Example #15
def omp1_predict(X, centroids):
    ''' omp1 prediction
    
    This function does one-dimensional orthogonal matching pursuit.
    The returned values are simply going to be the indices and
    inner products.
    '''
    idx = np.empty(X.shape[0], dtype=np.int)
    val = np.empty(X.shape[0])
    # in case we are dealing with a large matrix, buffer the dot products to
    # avoid repeated memory allocations.
    dots = np.empty((min(_MINIBATCH, X.shape[0]), centroids.shape[0]),
                    dtype = X.dtype)
    for start in range(0, X.shape[0], _MINIBATCH):
        end = min(start+_MINIBATCH, X.shape[0])
        batchsize = end-start
        mathutil.dot(X[start:end], centroids.T, out = dots[:batchsize])
        np.abs(dots, out=dots)
        idx[start:end] = np.argmax(dots[:batchsize], axis=1)
        val[start:end] = dots[range(batchsize), idx[start:end]]
    mpi.barrier()
    return idx, val
Example #16
def omp1_predict(X, centroids):
    ''' omp1 prediction
    
    This function does one-dimensional orthogonal matching pursuit.
    The returned values are simply going to be the indices and
    inner products.
    '''
    idx = np.empty(X.shape[0], dtype=np.int)
    val = np.empty(X.shape[0])
    # in case we are dealing with a large matrix, buffer the dot products to
    # avoid repeated memory allocations.
    dots = np.empty((min(_MINIBATCH, X.shape[0]), centroids.shape[0]),
                    dtype=X.dtype)
    for start in range(0, X.shape[0], _MINIBATCH):
        end = min(start + _MINIBATCH, X.shape[0])
        batchsize = end - start
        mathutil.dot(X[start:end], centroids.T, out=dots[:batchsize])
        np.abs(dots, out=dots)
        idx[start:end] = np.argmax(dots[:batchsize], axis=1)
        val[start:end] = dots[range(batchsize), idx[start:end]]
    mpi.barrier()
    return idx, val
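
Putting omp1_predict and omp1_maximize together: the functions above are the two halves of an alternating dictionary-learning loop (assign each datum to its best entry by absolute inner product, then re-estimate and re-normalize the entries). The following is a single-process NumPy sketch of that alternation on made-up data; the MPI reductions and the empty-centroid restart of the real functions are omitted here.

import numpy as np

X = np.random.randn(1000, 16)                         # toy data, one datum per row
k = 32
centroids = np.random.randn(k, X.shape[1])
centroids /= np.sqrt((centroids ** 2).sum(axis=1))[:, np.newaxis]
for _ in range(10):
    # prediction step: absolute inner product picks and weights the entry
    dots = np.abs(np.dot(X, centroids.T))
    idx = np.argmax(dots, axis=1)
    val = dots[np.arange(X.shape[0]), idx]
    # maximization step: weighted sum of assigned data, then renormalize
    centroids = np.zeros_like(centroids)
    for q in range(k):
        mask = (idx == q)
        if np.any(mask):
            centroids[q] = np.dot(val[mask], X[mask])
    centroids /= (np.sqrt((centroids ** 2).sum(axis=1))
                  + np.finfo(np.float64).eps)[:, np.newaxis]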
Example #17
 def process_dataset(self, dataset, as_list = False, as_2d = False):
     """Processes a whole dataset and returns an numpy ndarray
     
     Input:
         dataset: the input dataset.
         as_list: if True, return a list. This applies when the output has
             different sizes for each image. Default False.
         as_2d: if True, return a matrix where each image corresponds to a
             row in the matrix. Default False.
     """
     # check if we want to use buffer
     if self._fixed_size:
         convbuffer = [None] * (len(self) + 1)
     else:
         convbuffer = None
     total = dataset.size_total()
     logging.debug("Processing a total of %s images" % (total,))
     timer = util.Timer()
     if as_list:
         data = [self.process(dataset.image(i), convbuffer = convbuffer) \
                 for i in range(dataset.size())]
     else:
         # we assume that each image leads to the same feature size
         temp = self.process(dataset.image(0), as_vector = as_2d)
         logging.debug("Output feature shape: %s" % (str(temp.shape)))
         data = np.empty((dataset.size(),) + temp.shape)
         data[0] = temp
         size = dataset.size()
         timer = util.Timer()
         for i in range(1,size):
             data[i] = self.process(dataset.image(i), as_vector = as_2d,
                                    convbuffer = convbuffer)
             # report local progress
             if (i * 10 / size) != ((i-1) * 10 / size):
                 logging.debug("rank %d: %d percent. elapsed %s" % \
                         (mpi.RANK, i*100 / size, timer.total()))
     mpi.barrier()
     logging.debug("Feature extration took %s" % timer.total())
     return data
Example #18
def demo_read(root):
    from iceberk import visualize
    vis = visualize.PatchVisualizer()
    print 'Loading training data...'
    traindata = STL10Dataset(root, 'train')
    print 'My training data size:', traindata.size()
    print 'Loading testing data...'
    testdata = STL10Dataset(root, 'test')
    print 'My testing data size:', testdata.size()
    print 'Loading unlabeled data...'
    unlabeleddata = STL10Dataset(root, 'unlabeled')
    print 'My unlabeled data size:', unlabeleddata.size()
    if mpi.is_root():
        vis.pyplot.figure()
        vis.show_multiple(traindata.raw_data()[:25])
        vis.pyplot.title('Sample training images.')
        vis.pyplot.figure()
        vis.show_multiple(testdata.raw_data()[:25])
        vis.pyplot.title('Sample testing images.')
        vis.pyplot.figure()
        vis.show_multiple(unlabeleddata.raw_data()[:25])
        vis.pyplot.title('Sample unlabeled images.')
        vis.pyplot.show()
    mpi.barrier()
Example #19
def demo_read(root):
    from iceberk import visualize
    vis = visualize.PatchVisualizer()
    print 'Loading training data...'
    traindata = STL10Dataset(root, 'train')
    print 'My training data size:', traindata.size()
    print 'Loading testing data...'
    testdata = STL10Dataset(root, 'test')
    print 'My testing data size:', testdata.size()
    print 'Loading unlabeled data...'
    unlabeleddata = STL10Dataset(root, 'unlabeled')
    print 'My unlabeled data size:', unlabeleddata.size()
    if mpi.is_root():
        vis.pyplot.figure()
        vis.show_multiple(traindata.raw_data()[:25])
        vis.pyplot.title('Sample training images.')
        vis.pyplot.figure()
        vis.show_multiple(testdata.raw_data()[:25])
        vis.pyplot.title('Sample testing images.')
        vis.pyplot.figure()
        vis.show_multiple(unlabeleddata.raw_data()[:25])
        vis.pyplot.title('Sample unlabeled images.')
        vis.pyplot.show()
    mpi.barrier()
Example #20
    def obj(param, solver):
        """The objective function used by fmin
        """
        w = param[:-1]
        b = param[-1]
        # prediction is a vector
        pred = np.dot(solver._X, w) + b
        # call the loss
        flocal, gpred = solver.loss(solver._Y, pred, solver._weight, **solver._lossargs)
        # get the gradient for both w and b
        glocal = np.empty(param.shape)
        glocal[:-1] = np.dot(gpred, solver._X)
        glocal[-1] = gpred.sum()
        # do mpi reduction
        # for the regularization term
        freg, greg = solver.reg(w, **solver._regargs)
        flocal += solver._num_data * solver._gamma / mpi.SIZE * freg
        glocal[:-1] += solver._num_data * solver._gamma / mpi.SIZE * greg

        mpi.barrier()
        f = mpi.COMM.allreduce(flocal)
        g = np.empty(glocal.shape)
        mpi.COMM.Allreduce(glocal, g)
        return f, g
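
A note on the solver._num_data * solver._gamma / mpi.SIZE factor that several of the objectives above apply to the regularizer: each of the SIZE nodes adds a 1/SIZE share before the allreduce, so the summed objective carries the penalty exactly once (Example #7 achieves the same thing by adding the full penalty on the root node only). A throwaway single-process check of that arithmetic with made-up numbers:

num_data, gamma, freg, size = 10000, 0.01, 2.5, 4   # hypothetical values
per_node = num_data * gamma * freg / size           # what each rank adds locally
summed = per_node * size                            # what the allreduce produces
assert abs(summed - num_data * gamma * freg) < 1e-9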
Example #21
from iceberk import mpi
import logging
import numpy as np
import time

mpi.root_log_level(logging.INFO)

# just a large matrix
a = np.random.rand(1000, 12800)
a_local = np.random.rand(1000, 12800)
rank = mpi.RANK

logging.info('Testing mpi size %d' % mpi.SIZE)

mpi.barrier()
start = time.time()
mpi.COMM.Allreduce(a_local, a)
logging.info('Allreduce big speed: %f s' % (time.time() - start))

mpi.barrier()
start = time.time()
for i in xrange(a.shape[0]):
    mpi.COMM.Allreduce(a_local[i], a[i])
logging.info('Allreduce small speed: %f s' % (time.time() - start))
Example #22
std.resize(np.prod(regions_pooled.shape[1:-1]), regions_pooled.shape[-1])
std = std.mean(axis=0)
std_order = np.argsort(std)

# now, compute the within-class std
regions_pooled_view = regions_pooled.reshape(regions_pooled.shape[0],
        np.prod(regions_pooled.shape[1:-1]), regions_pooled.shape[-1])
within_std_local = regions_pooled_view.var(axis=1)
print within_std_local.shape
within_std = np.sqrt(mathutil.mpi_mean(within_std_local))
within_std_order = np.argsort(within_std)

std_comparison = within_std / (std + 1e-10)
std_comparison_order = np.argsort(std_comparison)

if mpi.is_root():
    pyplot.figure()
    visualize.show_multiple(conv[-2].dictionary[std_order])
    pyplot.savefig("codes_std_ordered.pdf")
    pyplot.figure()
    visualize.show_multiple(conv[-2].dictionary[within_std_order])
    pyplot.savefig("codes_within_std_ordered.pdf")
    pyplot.figure()
    visualize.show_multiple(conv[-2].dictionary[std_comparison_order])
    pyplot.savefig("codes_std_comparison_ordered.pdf")
    pyplot.figure()
    pyplot.plot(std)
    pyplot.show()
mpi.barrier()

Example #23
def apcluster_k(feature, num_centers, corr=True, tol=0):
    """perform the affinity propagation algorithm for the input codes.
    """
    logging.debug("ap: preparing similarity matrix")
    covmat = mathutil.mpi_cov(feature)
    std = np.diag(covmat)
    # normalize
    std = np.sqrt(std**2 + 0.01)
    if corr:
        # compute correlation. If corr is False, we will use the covariance
        # directly
        covmat /= std
        covmat /= std[:, np.newaxis]
    # compute the similarity matrix
    norm = np.diag(covmat) / 2
    covmat -= norm
    covmat -= norm[:, np.newaxis]
    # add a small noise to covmat
    noise = (covmat + np.finfo(np.float64).eps) * \
            np.random.rand(covmat.shape[0], covmat.shape[1])
    mpi.COMM.Bcast(noise)
    covmat += noise
    # The remaining part can just be carried out on root
    if mpi.is_root():
        # set preference
        pmax = covmat.max()
        #af = AffinityPropagation().fit(covmat, pmax)
        #num_max = len(af.cluster_centers_indices_)
        # in fact, num_max would always be covmat.shape[0] so we don't really
        # run ap
        num_max = covmat.shape[0]
        logging.debug("ap: pmax = %s, num = %d" % (pmax, num_max))
        pmin = covmat.min()
        af = AffinityPropagation().fit(covmat, pmin)
        # num_min is the theoretical minimum, but the python code seems to have bugs...
        num_min = len(af.cluster_centers_indices_)
        logging.debug("ap: pmin = %s, num = %d" % (pmin, num_min))

        if num_centers < num_min:
            logging.warning("num_centers too small, will return %d centers" %
                            (num_min, ))
            return af.cluster_centers_indices_, af.labels_, covmat

        if num_centers > num_max:
            logging.warning("num_centers too large, will return everything.")
            return np.arange(covmat.shape[0], dtype=np.int), \
                   np.arange(covmat.shape[0], dtype=np.int)

        logging.debug("ap: start affinity propagation")

        # We will simply use bisection search to find the right number of centroids.
        for i in range(_AP_MAX_ITERATION):
            pref = (pmax + pmin) / 2
            af = AffinityPropagation().fit(covmat, pref)
            num = len(af.cluster_centers_indices_)
            logging.debug("ap try %d: pref = %s, num = %s" %
                          (i + 1, pref, num))
            if num >= num_centers - tol and num <= num_centers + tol:
                break
            elif num < num_centers:
                pmin = pref
                num_min = num
            else:
                pmax = pref
                num_max = num
    else:
        af = None
    mpi.barrier()
    af = mpi.COMM.bcast(af)
    return af.cluster_centers_indices_, af.labels_, covmat
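
The preference search in apcluster_k is plain bisection: the code relies on the number of affinity-propagation clusters growing with the preference, and halves the [pmin, pmax] interval until the cluster count lands within tol of the requested number. The same control flow on a toy monotone stand-in (hypothetical, just to isolate the loop from sklearn):

def cluster_count(pref):        # hypothetical stand-in for an AffinityPropagation fit
    return int(round(pref))

pmin, pmax = 0.0, 256.0
target, tol = 64, 2
for i in range(30):
    pref = (pmin + pmax) / 2.0
    num = cluster_count(pref)
    if target - tol <= num <= target + tol:
        break
    elif num < target:
        pmin = pref
    else:
        pmax = pref
print('pref = %s, clusters = %d' % (pref, num))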
Example #24
 def testBarrier(self):
     import time
     # sleep for a while, and resume
     time.sleep(mpi.RANK)
     mpi.barrier()
     self.assertTrue(True)
Example #25
 def __init__(self,
              root_folder,
              extensions,
              prefetch=False,
              target_size=None,
              max_size=None,
              min_size=None,
              center_crop=None):
     """ Initialize from a two-layer storage
     Input:
         root_folder: the root that contains the data. Under root_folder
             there should be a list of folders, under which there should be
             a list of files
         extensions: the list of extensions that should be used to filter the
             files. Should be like ['png', 'jpg']. It's case insensitive.
         prefetch: if True, the images are prefetched to avoid disk read. If
             you have a large number of images, prefetch would require a lot
             of memory.
         target_size, max_size, min_size, center_crop: see manipulate() for
             details.
     """
     super(TwoLayerDataset, self).__init__()
     if mpi.agree(not os.path.exists(root_folder)):
         raise OSError, "The specified folder does not exist."
     logging.debug('Loading from %s' % (root_folder, ))
     if type(extensions) is str:
         extensions = [extensions]
     extensions = set(extensions)
     if mpi.is_root():
         # get files first
         files = glob.glob(os.path.join(root_folder, '*', '*'))
         # select those that fit the extension
         files = [
             f for f in files
             if any([f.lower().endswith(ext) for ext in extensions])
         ]
         logging.debug("A total of %d images." % (len(files)))
         # get raw labels
         labels = [os.path.split(os.path.split(f)[0])[1] for f in files]
         classnames = list(set(labels))
         # sort so we get a reasonable class order
         classnames.sort()
         name2val = dict(zip(classnames, range(len(classnames))))
         labels = [name2val[label] for label in labels]
     else:
         files = None
         classnames = None
         labels = None
     mpi.barrier()
     self._rawdata = mpi.distribute_list(files)
     self._data = self._rawdata
     self._prefetch = prefetch
     self._target_size = target_size
     self._max_size = max_size
     self._min_size = min_size
     self._center_crop = center_crop
     if target_size != None:
         self._dim = tuple(target_size) + (3, )
     else:
         self._dim = False
     self._channels = 3
     if prefetch:
         self._data = [self._read(idx) for idx in range(len(self._data))]
     self._label = mpi.distribute_list(labels)
     self._classnames = mpi.COMM.bcast(classnames)
Example #26
def apcluster_k(feature, num_centers, corr = True, tol = 0):
    """perform the affinity propagation algorithm for the input codes.
    """
    logging.debug("ap: preparing similarity matrix")
    covmat = mathutil.mpi_cov(feature)
    std = np.diag(covmat)
    # normalize
    std = np.sqrt(std**2 + 0.01)
    if corr:
        # compute correlation. If corr is False, we will use the covariance
        # directly
        covmat /= std
        covmat /= std[:, np.newaxis]
    # compute the similarity matrix
    norm = np.diag(covmat) / 2
    covmat -= norm
    covmat -= norm[:, np.newaxis]
    # add a small noise to covmat
    noise = (covmat + np.finfo(np.float64).eps) * \
            np.random.rand(covmat.shape[0], covmat.shape[1])
    mpi.COMM.Bcast(noise)
    covmat += noise
    # The remaining part can just be carried out on root
    if mpi.is_root():
        # set preference
        pmax = covmat.max()
        #af = AffinityPropagation().fit(covmat, pmax)
        #num_max = len(af.cluster_centers_indices_)
        # in fact, num_max would always be covmat.shape[0] so we don't really
        # run ap
        num_max = covmat.shape[0]
        logging.debug("ap: pmax = %s, num = %d" % (pmax, num_max))
        pmin = covmat.min()
        af = AffinityPropagation().fit(covmat, pmin)
        # num_min is the theoretical minimum, but the python code seems to have bugs...
        num_min = len(af.cluster_centers_indices_)
        logging.debug("ap: pmin = %s, num = %d" % (pmin, num_min))
        
        if num_centers < num_min:
            logging.warning("num_centers too small, will return %d centers" % (num_min,))
            return af.cluster_centers_indices_, af.labels_, covmat
    
        if num_centers > num_max:
            logging.warning("num_centers too large, will return everything.")
            return np.arange(covmat.shape[0], dtype=np.int), \
                   np.arange(covmat.shape[0], dtype=np.int)
        
        logging.debug("ap: start affinity propagation")
        
        # We will simply use bisection search to find the right number of centroids.
        for i in range(_AP_MAX_ITERATION):
            pref = (pmax + pmin) / 2
            af = AffinityPropagation().fit(covmat, pref)
            num = len(af.cluster_centers_indices_)
            logging.debug("ap try %d: pref = %s, num = %s" % (i + 1, pref, num))
            if num >= num_centers - tol and num <= num_centers + tol:
                break
            elif num < num_centers:
                pmin = pref
                num_min = num
            else:
                pmax = pref
                num_max = num
    else:
        af = None
    mpi.barrier()
    af = mpi.COMM.bcast(af)
    return af.cluster_centers_indices_, af.labels_, covmat